diff --git a/sglang/3rdparty/amd/profiling/PROFILING.md b/sglang/3rdparty/amd/profiling/PROFILING.md
new file mode 100644
index 0000000000000000000000000000000000000000..7e15ec844f2bf7743c641362826e957d6efd912b
--- /dev/null
+++ b/sglang/3rdparty/amd/profiling/PROFILING.md
@@ -0,0 +1,425 @@
+## Profiling SGLang Infer System with AMD GPUs
+This AppNote describes the SGLang profiling technical, code augment and running steps for systems with AMD Instinct GPUs, nevertheless the same procedure may work with Nvidia GPUs too.
+Examples and steps are provided in detail, to facilitate easy reproduce and use to localize performance problem towards optimizations.
+Two primary methods are covered:
+- [RPD](https://github.com/ROCm/rocmProfileData.git)
+- [PyTorch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html)
+
+### Profiling SGLang Infer System with RPD Profiler
+RPD profiler is a low-overhead cross-platform profiler. Therefore, the same RPD code augment not only works for profiling on ROCm/AMD GPUs, but also works for profiling on CUDA/Nvidia GPUs as well. To do RPD profiling on SGLang repository, please use scripts and patch files included in this directory and follow the steps below:
+1. Install RPD with rpd.patch applied during installation using install_rpd.sh, both files are in this directory.
+
+install_rpd.sh
+
+```bash
+# download and install RPD
+apt update && apt install -y sqlite3 libsqlite3-dev libfmt-dev
+
+# install rpd module
+git clone https://github.com/ROCmSoftwarePlatform/rocmProfileData
+cd rocmProfileData
+git checkout 976899e9c6dbc6dd2bccf770818e4e44125590ac
+git apply rpd.patch
+make && make install
+cd rocpd_python && python setup.py install && cd ..
+cd rpd_tracer && make clean;make install && python setup.py install && cd ..
+```
+
+rpd.patch
+
+```bash
+diff --git a/rpd_tracer/Makefile b/rpd_tracer/Makefile
+index e9d9feb..b2e9e1a 100644
+--- a/rpd_tracer/Makefile
++++ b/rpd_tracer/Makefile
+@@ -16,7 +16,7 @@ ifneq (,$(HIP_PATH))
+         $(info Building with roctracer)
+         RPD_LIBS += -L/opt/rocm/lib -lroctracer64 -lroctx64 -lamdhip64 -lrocm_smi64
+         RPD_INCLUDES += -I/opt/rocm/include -I/opt/rocm/include/roctracer -I/opt/rocm/include/hsa
+-        RPD_SRCS += RoctracerDataSource.cpp RocmSmiDataSource.cpp
++        RPD_SRCS += RoctracerDataSource.cpp
+         RPD_INCLUDES += -D__HIP_PLATFORM_AMD__
+ endif
+```
+2. Add loadTracer.sh file included in this directory to /sglang/python/sglang.
+
+loadTracer.sh
+
+```bash
+#!/bin/bash
+################################################################################
+# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+################################################################################
+OUTPUT_FILE="trace.rpd"
+
+if [ "$1" = "-o" ] ; then
+  OUTPUT_FILE=$2
+  shift
+  shift
+fi
+
+if [ -e ${OUTPUT_FILE} ] ; then
+  rm ${OUTPUT_FILE}
+fi
+
+python3 -m rocpd.schema --create ${OUTPUT_FILE}
+if [ $? != 0 ] ; then
+  echo "Error: Could not create rpd file. Please run 'python setup.py install' from the rocpd_python dir"
+  exit
+fi
+
+export RPDT_FILENAME=${OUTPUT_FILE}
+export RPDT_AUTOSTART=0
+LD_PRELOAD=librocm-smi_64:librpd_tracer.so "$@"
+```
+3. Apply patch (provided in this directory) with "git apply rpd_profile_server_enable.patch" if the main profiling purpose is to get info on gpu kernels as well as limited cpu activity info.
+
+#### Common Notes 1
+Please note that although we are doing TP=8 in the example, we purposely only log RPD profiling on 2 ranks in the patch file (i.e.tp_rank=0/1) for profiling/visualization convenience, as even Perfetto streaming mode can only load maximal 8GB json file for visualization. With 2 ranks logged in RPD profiling, we could still check whether there are issues among ranks (e.g. load imbalance issue, nccl issue), and at the same time, we could log relatively longer time duration before the json file generated from RPD file hits 8GB size.
+
+rpd_profile_server_enable.patch
+
+```bash
+diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
+index 62d1ff9..9021c01 100644
+--- a/python/sglang/srt/managers/scheduler.py
++++ b/python/sglang/srt/managers/scheduler.py
+@@ -71,6 +71,8 @@ from sglang.srt.utils import (
+     suppress_other_loggers,
+ )
+ from sglang.utils import get_exception_traceback
++from rpdTracerControl import rpdTracerControl
++rpdTracerControl.skipCreate()
+
+ logger = logging.getLogger(__name__)
+
+@@ -245,6 +247,7 @@ class Scheduler:
+                 ],
+                 with_stack=True,
+             )
++            self.rpd = rpdTracerControl()
+
+     @torch.inference_mode()
+     def event_loop(self):
+@@ -1027,15 +1030,24 @@ class Scheduler:
+     def start_profile(self) -> None:
+         if self.profiler is None:
+             raise RuntimeError("Profiler is not enabled.")
+-        self.profiler.start()
++        #self.profiler.start() #block pytorch profiler for rpd profiler enabling
++        if self.tp_rank == 0 or self.tp_rank == 1:
++            self.rpd.start()
++            self.rpd.rangePush("", "rpd profile range", "")
++            logger.info("rpd is enabled")
+
+     def stop_profile(self) -> None:
+         if self.profiler is None:
+             raise RuntimeError("Profiler is not enabled.")
+-        self.profiler.stop()
+-        self.profiler.export_chrome_trace(
+-            self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
+-        )
++        #self.profiler.stop()
++        #self.profiler.export_chrome_trace(
++        #    self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
++        #)
++        if self.tp_rank ==0 or self.tp_rank ==1:
++            self.rpd.rangePop()
++            self.rpd.stop()
++            self.rpd.flush()
++            logger.info("rpd is done")
+         logger.info("Profiler is done")
+```
+
+#### Advanced Debugging with RPD Profiler
+Sometimes, we want to use rpd profiler to capture more CPU and python activities in order to debug some challenging issues (e.g. root cause of load imbalance across gpu processes, root cause of bubbles, etc). Only in such cases, we need to apply patch "git apply rpd_profile_server_enable_wCPU_activities.patch", where 3 files are modified.
+
+rpd_profile_server_enable_wCPU_activities.patch
+
+```bash
+diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
+index 62d1ff9..2edb427 100644
+--- a/python/sglang/srt/managers/scheduler.py
++++ b/python/sglang/srt/managers/scheduler.py
+@@ -71,6 +71,8 @@ from sglang.srt.utils import (
+     suppress_other_loggers,
+ )
+ from sglang.utils import get_exception_traceback
++from rpdTracerControl import rpdTracerControl
++rpdTracerControl.skipCreate()
+
+ logger = logging.getLogger(__name__)
+
+@@ -245,6 +247,7 @@ class Scheduler:
+                 ],
+                 with_stack=True,
+             )
++            self.rpd = rpdTracerControl()
+
+     @torch.inference_mode()
+     def event_loop(self):
+@@ -1027,15 +1030,26 @@ class Scheduler:
+     def start_profile(self) -> None:
+         if self.profiler is None:
+             raise RuntimeError("Profiler is not enabled.")
+-        self.profiler.start()
++        #self.profiler.start()
++        logger.info("torch profiler is disabled")
++        if self.tp_rank == 0 or self.tp_rank == 1:
++            self.rpd.setPythonTrace(True)
++            self.rpd.start()
++            self.rpd.rangePush("", "scheduler", "")
++        logger.info("rpd is enabled inside scheduler profiling")
+
+     def stop_profile(self) -> None:
+         if self.profiler is None:
+             raise RuntimeError("Profiler is not enabled.")
+-        self.profiler.stop()
+-        self.profiler.export_chrome_trace(
+-            self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
+-        )
++        #self.profiler.stop()
++        #self.profiler.export_chrome_trace(
++        #    self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
++        #)
++        if self.tp_rank ==0 or self.tp_rank ==1:
++            self.rpd.rangePop()
++            self.rpd.stop()
++            self.rpd.flush()
++            logger.info("rpd is done inside scheduler")
+         logger.info("Profiler is done")
+
+
+diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py
+index 2621ccd..181df85 100644
+--- a/python/sglang/srt/managers/tokenizer_manager.py
++++ b/python/sglang/srt/managers/tokenizer_manager.py
+@@ -58,6 +58,10 @@ from sglang.srt.sampling.sampling_params import SamplingParams
+ from sglang.srt.server_args import PortArgs, ServerArgs
+ from sglang.srt.utils import is_generation_model, is_multimodal_model
+
++from rpdTracerControl import rpdTracerControl
++rpdTracerControl.skipCreate()
++
++
+ asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
+
+ logger = logging.getLogger(__name__)
+@@ -514,10 +518,20 @@ class TokenizerManager:
+         self.send_to_scheduler.send_pyobj(req)
+
+     def start_profile(self):
++        rpd = rpdTracerControl()
++        rpd.setPythonTrace(True)
++        rpd.start()
++        rpd.rangePush("", "tokenizer_manager", "")
++        logger.info("tokenizer_manager rpd profiling started!")
+         req = ProfileReq.START_PROFILE
+         self.send_to_scheduler.send_pyobj(req)
+
+     def stop_profile(self):
++        rpd = rpdTracerControl()
++        rpd.rangePop()
++        rpd.stop()
++        rpd.flush()
++        logger.info("rpd profiling is done inside tokenizer_manager!")
+         req = ProfileReq.STOP_PROFILE
+         self.send_to_scheduler.send_pyobj(req)
+
+diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py
+index 7111c93..2bd722c 100644
+--- a/python/sglang/srt/server.py
++++ b/python/sglang/srt/server.py
+@@ -30,6 +30,8 @@ import threading
+ import time
+ from http import HTTPStatus
+ from typing import Dict, List, Optional, Union
++from rpdTracerControl import rpdTracerControl
++rpdTracerControl.skipCreate()
+
+ # Fix a bug of Python threading
+ setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
+@@ -152,6 +154,11 @@ async def flush_cache():
+ @app.post("/start_profile")
+ async def start_profile():
+     """Start profiling."""
++    rpd = rpdTracerControl()
++    rpd.setPythonTrace(True)
++    rpd.start()
++    rpd.rangePush("", "server rpd profile range", "")
++    logger.info("rpd profiling started in server.py!")
+     tokenizer_manager.start_profile()
+     return Response(
+         content="Start profiling.\n",
+@@ -164,6 +171,11 @@ async def start_profile():
+ async def stop_profile():
+     """Stop profiling."""
+     tokenizer_manager.stop_profile()
++    rpd = rpdTracerControl()
++    rpd.rangePop()
++    rpd.stop()
++    rpd.flush()
++    logger.info("rpd profiling is done in server.py!")
+     return Response(
+         content="Stop profiling. This will take some time.\n",
+         status_code=200,
+```
+
+4. As an example for grok1 profiling, we create a dummy_grok1 directory with config.json (see content below) inside this directory and copy this directory to the right path for "--model-path" if you want to use the example server.sh file provided.
+```bash
+cat ../dummy_grok1/config.json
+{
+  "architectures": [
+    "Grok1ModelForCausalLM"
+  ],
+  "embedding_multiplier_scale": 78.38367176906169,
+  "output_multiplier_scale": 0.5773502691896257,
+  "vocab_size": 131072,
+  "hidden_size": 6144,
+  "intermediate_size": 32768,
+  "max_position_embeddings": 8192,
+  "num_experts_per_tok": 2,
+  "num_local_experts": 8,
+  "num_attention_heads": 48,
+  "num_hidden_layers": 64,
+  "num_key_value_heads": 8,
+  "head_dim": 128,
+  "rms_norm_eps": 1e-05,
+  "rope_theta": 10000.0,
+  "model_type": "mixtral",
+  "torch_dtype": "bfloat16"
+}
+```
+5. Launch server with rpd enabled script ./server.sh in one terminal inside the docker container.
+
+#### Common Notes 2
+- Remember to change model-path to the correct path
+- loadTracer.sh is needed to conduct profiling
+- SGLANG_TORCH_PROFILER_DIR is used for default torch profiler
+- Do not use loadTracer.sh if you are using the torch profiler, simply use python3 -m sglang.launch_server.
+
+
+server.sh
+
+```bash
+#!/bin/bash
+
+# export SGLANG_TORCH_PROFILER_DIR=/data/sglang/
+export SGLANG_TORCH_PROFILER_DIR=/sgl-workspace/sglang/profile/
+
+# Get the current timestamp
+TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
+
+# Define the log file with a timestamp
+LOGFILE="sglang_server_log_$TIMESTAMP.json"
+
+# Run the Python command and save the output to the log file
+loadTracer.sh python3 -m sglang.launch_server \
+    --model-path /sgl-workspace/sglang/dummy_grok1 \
+    --tokenizer-path Xenova/grok-1-tokenizer \
+    --load-format dummy \
+    --quantization fp8 \
+    --tp 8 \
+    --port 30000 \
+    --disable-radix-cache 2>&1 | tee "$LOGFILE"
+```
+6. Open another terminal for the same docker container, and run the rpd enabled ./client.sh after you see "The server is fired up and is ready to roll!" message from server side terminal.
+
+#### Common Notes 3
+- Use curl http://localhost:30000/start_profile & curl http://localhost:30000/stop_profile to control the start and end of profiling. Check sglang/python/sglang/srt/managers/scheduler.py for more details.
+- Please don't use RPD profiler together with PyTorch profiler to avoid interference.
+- The rocmProfileData/tools/rpd2tracing.py file is used to generate json file from RPD file.
+
+client.sh
+
+```bash
+#!/bin/bash
+
+# Start profiling via API
+curl http://localhost:30000/start_profile -H "Content-Type: application/json"
+
+# Benchmark serving using sglang with random dataset and tokenizer
+# Define the log file with a timestamp
+TIMESTAMP=$(date +%Y%m%d_%H%M%S)
+LOGFILE="sglang_client_log_$TIMESTAMP.json"
+
+# Run the benchmark with specified parameters and save logs
+python3 -m sglang.bench_serving \
+    --backend sglang \
+    --tokenizer Xenova/grok-1-tokenizer \
+    --dataset-name random \
+    --random-input 1024\
+    --random-output 1024 \
+    --num-prompts 120 \
+    --request-rate 8 \
+    --output-file online.jsonl 2>&1 | tee "$LOGFILE"
+
+# Stop profiling via API
+curl http://localhost:30000/stop_profile -H "Content-Type: application/json"
+
+# Convert tracing file to csv & json
+sqlite3 trace.rpd ".mode csv" ".header on" ".output trace.csv" "select * from top;" ".output stdout"
+python3 ./rocmProfileData/tools/rpd2tracing.py trace.rpd trace.json
+```
+7. Follow [Perfetto docs](https://perfetto.dev/docs/visualization/large-traces) to visualize large json files. Try to adjust parameters so that the trace.json file size is less than 9GB.
+
+### Profiling SGLang Infer System with PyTorch Profiler
+
+Please use the steps as follows:
+
+1. Apply the patch torch_profiler.patch. Note that you can modify "if self.tp_rank == 0" in the patch to allow more ranks be recorded in profiling.
+
+torch_profiler.patch
+```bash
+diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
+index 62d1ff9..6ecd78c 100644
+--- a/python/sglang/srt/managers/scheduler.py
++++ b/python/sglang/srt/managers/scheduler.py
+@@ -240,7 +240,6 @@ class Scheduler:
+             )
+             self.profiler = torch.profiler.profile(
+                 activities=[
+-                    torch.profiler.ProfilerActivity.CPU,
+                     torch.profiler.ProfilerActivity.CUDA,
+                 ],
+                 with_stack=True,
+@@ -1033,9 +1032,11 @@ class Scheduler:
+         if self.profiler is None:
+             raise RuntimeError("Profiler is not enabled.")
+         self.profiler.stop()
+-        self.profiler.export_chrome_trace(
+-            self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
+-        )
++        if self.tp_rank == 0:
++            with open(f"stats_repro_{int(time.time())}.txt", "w") as f:
++                print(self.profiler.key_averages(group_by_input_shape=True).table(sort_by="cuda_time_total", row_limit=-1), file=f)
++                print("Profiling stats done.")
++
+         logger.info("Profiler is done")
+```
+
+2. Create the model path directory and copy it to the right path for "--model-path" if you want to use the server.sh file provided.
+
+3. Modify the included server.sh by removing "loadTracer.sh" before python command and launch script ./server.sh in one terminal inside the docker container.
+
+4. Similar to step 6 in RPD profiling section, but remove the last 2 lines in client.sh, which converted rpd file into csv and json files. Run modified client.sh for PyTorch profiling.
+-------
+- [Torch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html)
diff --git a/sglang/3rdparty/amd/profiling/client.sh b/sglang/3rdparty/amd/profiling/client.sh
new file mode 100644
index 0000000000000000000000000000000000000000..150ea9f193f9fdf217a1448b6c5da2057dcd80d9
--- /dev/null
+++ b/sglang/3rdparty/amd/profiling/client.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+# Start profiling via API
+curl http://localhost:30000/start_profile -H "Content-Type: application/json"
+
+# Benchmark serving using sglang with random dataset and tokenizer
+# Define the log file with a timestamp
+TIMESTAMP=$(date +%Y%m%d_%H%M%S)
+LOGFILE="sglang_client_log_$TIMESTAMP.json"
+
+# Run the benchmark with specified parameters and save logs
+python3 -m sglang.bench_serving \
+    --backend sglang \
+    --tokenizer Xenova/grok-1-tokenizer \
+    --dataset-name random \
+    --random-input 1024\
+    --random-output 1024 \
+    --num-prompts 240 \
+    --request-rate 8 \
+    --output-file online.jsonl 2>&1 | tee "$LOGFILE"
+
+# Stop profiling via API
+curl http://localhost:30000/stop_profile -H "Content-Type: application/json"
+
+# Convert tracing file to csv & json
+sqlite3 trace.rpd ".mode csv" ".header on" ".output trace.csv" "select * from top;" ".output stdout"
+python3 /sgl-workspace/rocmProfileData/tools/rpd2tracing.py trace.rpd trace.json
diff --git a/sglang/3rdparty/amd/profiling/install_rpd.sh b/sglang/3rdparty/amd/profiling/install_rpd.sh
new file mode 100644
index 0000000000000000000000000000000000000000..d1b04b9889767d19d3ce39a6a1dcdb81d93cf478
--- /dev/null
+++ b/sglang/3rdparty/amd/profiling/install_rpd.sh
@@ -0,0 +1,10 @@
+# download and install RPD
+apt update && apt install -y sqlite3 libsqlite3-dev libfmt-dev
+
+# install rpd module
+git clone https://github.com/ROCmSoftwarePlatform/rocmProfileData
+cd rocmProfileData
+git apply rpd.patch
+make && make install
+cd rocpd_python && python setup.py install && cd ..
+cd rpd_tracer && make clean;make install && python setup.py install && cd ..
diff --git a/sglang/3rdparty/amd/profiling/loadTracer.sh b/sglang/3rdparty/amd/profiling/loadTracer.sh
new file mode 100644
index 0000000000000000000000000000000000000000..8a95a335cb0e5ef0502e66f47ef54c368320017c
--- /dev/null
+++ b/sglang/3rdparty/amd/profiling/loadTracer.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+################################################################################
+# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+################################################################################
+OUTPUT_FILE="trace.rpd"
+
+if [ "$1" = "-o" ] ; then
+  OUTPUT_FILE=$2
+  shift
+  shift
+fi
+
+if [ -e ${OUTPUT_FILE} ] ; then
+  rm ${OUTPUT_FILE}
+fi
+
+python3 -m rocpd.schema --create ${OUTPUT_FILE}
+if [ $? != 0 ] ; then
+  echo "Error: Could not create rpd file. Please run 'python setup.py install' from the rocpd_python dir"
+  exit
+fi
+
+export RPDT_FILENAME=${OUTPUT_FILE}
+export RPDT_AUTOSTART=0
+LD_PRELOAD=librocm-smi_64:librpd_tracer.so "$@"
diff --git a/sglang/3rdparty/amd/profiling/rpd.patch b/sglang/3rdparty/amd/profiling/rpd.patch
new file mode 100644
index 0000000000000000000000000000000000000000..87917654ac8451f3d37ae1384cc8e074c8a6e31c
--- /dev/null
+++ b/sglang/3rdparty/amd/profiling/rpd.patch
@@ -0,0 +1,12 @@
+diff --git a/rpd_tracer/Makefile b/rpd_tracer/Makefile
+index e9d9feb..b2e9e1a 100644
+--- a/rpd_tracer/Makefile
++++ b/rpd_tracer/Makefile
+@@ -16,7 +16,7 @@ ifneq (,$(HIP_PATH))
+         $(info Building with roctracer)
+         RPD_LIBS += -L/opt/rocm/lib -lroctracer64 -lroctx64 -lamdhip64 -lrocm_smi64
+         RPD_INCLUDES += -I/opt/rocm/include -I/opt/rocm/include/roctracer -I/opt/rocm/include/hsa
+-        RPD_SRCS += RoctracerDataSource.cpp RocmSmiDataSource.cpp
++        RPD_SRCS += RoctracerDataSource.cpp
+         RPD_INCLUDES += -D__HIP_PLATFORM_AMD__
+ endif
diff --git a/sglang/3rdparty/amd/profiling/rpd_profile_server_enable.patch b/sglang/3rdparty/amd/profiling/rpd_profile_server_enable.patch
new file mode 100644
index 0000000000000000000000000000000000000000..3cd3915334eeb62abb92e2a0e9ef7b881613c882
--- /dev/null
+++ b/sglang/3rdparty/amd/profiling/rpd_profile_server_enable.patch
@@ -0,0 +1,49 @@
+diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
+index 62d1ff9..9021c01 100644
+--- a/python/sglang/srt/managers/scheduler.py
++++ b/python/sglang/srt/managers/scheduler.py
+@@ -71,6 +71,8 @@ from sglang.srt.utils import (
+     suppress_other_loggers,
+ )
+ from sglang.utils import get_exception_traceback
++from rpdTracerControl import rpdTracerControl
++rpdTracerControl.skipCreate()
+
+ logger = logging.getLogger(__name__)
+
+@@ -245,6 +247,7 @@ class Scheduler:
+                 ],
+                 with_stack=True,
+             )
++            self.rpd = rpdTracerControl()
+
+     @torch.inference_mode()
+     def event_loop(self):
+@@ -1027,15 +1030,24 @@ class Scheduler:
+     def start_profile(self) -> None:
+         if self.profiler is None:
+             raise RuntimeError("Profiler is not enabled.")
+-        self.profiler.start()
++        #self.profiler.start() #block pytorch profiler for rpd profiler enabling
++        if self.tp_rank == 0 or self.tp_rank == 1:
++            self.rpd.start()
++            self.rpd.rangePush("", "rpd profile range", "")
++            logger.info("rpd is enabled")
+
+     def stop_profile(self) -> None:
+         if self.profiler is None:
+             raise RuntimeError("Profiler is not enabled.")
+-        self.profiler.stop()
+-        self.profiler.export_chrome_trace(
+-            self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
+-        )
++        #self.profiler.stop()
++        #self.profiler.export_chrome_trace(
++        #    self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
++        #)
++        if self.tp_rank ==0 or self.tp_rank ==1:
++            self.rpd.rangePop()
++            self.rpd.stop()
++            self.rpd.flush()
++            logger.info("rpd is done")
+         logger.info("Profiler is done")
diff --git a/sglang/3rdparty/amd/profiling/rpd_profile_server_enable_wCPU_activities.patch b/sglang/3rdparty/amd/profiling/rpd_profile_server_enable_wCPU_activities.patch
new file mode 100644
index 0000000000000000000000000000000000000000..5416f4d571ae119f3cb24765984aaa75cd5d0e73
--- /dev/null
+++ b/sglang/3rdparty/amd/profiling/rpd_profile_server_enable_wCPU_activities.patch
@@ -0,0 +1,126 @@
+diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
+index 62d1ff9..2edb427 100644
+--- a/python/sglang/srt/managers/scheduler.py
++++ b/python/sglang/srt/managers/scheduler.py
+@@ -71,6 +71,8 @@ from sglang.srt.utils import (
+     suppress_other_loggers,
+ )
+ from sglang.utils import get_exception_traceback
++from rpdTracerControl import rpdTracerControl
++rpdTracerControl.skipCreate()
+
+ logger = logging.getLogger(__name__)
+
+@@ -245,6 +247,7 @@ class Scheduler:
+                 ],
+                 with_stack=True,
+             )
++            self.rpd = rpdTracerControl()
+
+     @torch.inference_mode()
+     def event_loop(self):
+@@ -1027,15 +1030,26 @@ class Scheduler:
+     def start_profile(self) -> None:
+         if self.profiler is None:
+             raise RuntimeError("Profiler is not enabled.")
+-        self.profiler.start()
++        #self.profiler.start()
++        logger.info("torch profiler is disabled")
++        if self.tp_rank == 0 or self.tp_rank == 1:
++            self.rpd.setPythonTrace(True)
++            self.rpd.start()
++            self.rpd.rangePush("", "scheduler", "")
++        logger.info("rpd is enabled inside scheduler profiling")
+
+     def stop_profile(self) -> None:
+         if self.profiler is None:
+             raise RuntimeError("Profiler is not enabled.")
+-        self.profiler.stop()
+-        self.profiler.export_chrome_trace(
+-            self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
+-        )
++        #self.profiler.stop()
++        #self.profiler.export_chrome_trace(
++        #    self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
++        #)
++        if self.tp_rank ==0 or self.tp_rank ==1:
++            self.rpd.rangePop()
++            self.rpd.stop()
++            self.rpd.flush()
++            logger.info("rpd is done inside scheduler")
+         logger.info("Profiler is done")
+
+
+diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py
+index 2621ccd..181df85 100644
+--- a/python/sglang/srt/managers/tokenizer_manager.py
++++ b/python/sglang/srt/managers/tokenizer_manager.py
+@@ -58,6 +58,10 @@ from sglang.srt.sampling.sampling_params import SamplingParams
+ from sglang.srt.server_args import PortArgs, ServerArgs
+ from sglang.srt.utils import is_generation_model, is_multimodal_model
+
++from rpdTracerControl import rpdTracerControl
++rpdTracerControl.skipCreate()
++
++
+ asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
+
+ logger = logging.getLogger(__name__)
+@@ -514,10 +518,20 @@ class TokenizerManager:
+         self.send_to_scheduler.send_pyobj(req)
+
+     def start_profile(self):
++        rpd = rpdTracerControl()
++        rpd.setPythonTrace(True)
++        rpd.start()
++        rpd.rangePush("", "tokenizer_manager", "")
++        logger.info("tokenizer_manager rpd profiling started!")
+         req = ProfileReq.START_PROFILE
+         self.send_to_scheduler.send_pyobj(req)
+
+     def stop_profile(self):
++        rpd = rpdTracerControl()
++        rpd.rangePop()
++        rpd.stop()
++        rpd.flush()
++        logger.info("rpd profiling is done inside tokenizer_manager!")
+         req = ProfileReq.STOP_PROFILE
+         self.send_to_scheduler.send_pyobj(req)
+
+diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py
+index 7111c93..2bd722c 100644
+--- a/python/sglang/srt/server.py
++++ b/python/sglang/srt/server.py
+@@ -30,6 +30,8 @@ import threading
+ import time
+ from http import HTTPStatus
+ from typing import Dict, List, Optional, Union
++from rpdTracerControl import rpdTracerControl
++rpdTracerControl.skipCreate()
+
+ # Fix a bug of Python threading
+ setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
+@@ -152,6 +154,11 @@ async def flush_cache():
+ @app.post("/start_profile")
+ async def start_profile():
+     """Start profiling."""
++    rpd = rpdTracerControl()
++    rpd.setPythonTrace(True)
++    rpd.start()
++    rpd.rangePush("", "server rpd profile range", "")
++    logger.info("rpd profiling started in server.py!")
+     tokenizer_manager.start_profile()
+     return Response(
+         content="Start profiling.\n",
+@@ -164,6 +171,11 @@ async def start_profile():
+ async def stop_profile():
+     """Stop profiling."""
+     tokenizer_manager.stop_profile()
++    rpd = rpdTracerControl()
++    rpd.rangePop()
++    rpd.stop()
++    rpd.flush()
++    logger.info("rpd profiling is done in server.py!")
+     return Response(
+         content="Stop profiling. This will take some time.\n",
+         status_code=200,
diff --git a/sglang/3rdparty/amd/profiling/server.sh b/sglang/3rdparty/amd/profiling/server.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f877e6c7acd40f48e6281f11be8c24c6ec6cf6aa
--- /dev/null
+++ b/sglang/3rdparty/amd/profiling/server.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# export SGLANG_TORCH_PROFILER_DIR=/data/sglang/
+export SGLANG_TORCH_PROFILER_DIR=/sgl-workspace/sglang/profile/
+
+# Get the current timestamp
+TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
+
+# Define the log file with a timestamp
+LOGFILE="sglang_server_log_$TIMESTAMP.json"
+
+# Run the Python command and save the output to the log file
+loadTracer.sh python3 -m sglang.launch_server \
+    --model-path /sgl-workspace/sglang/dummy_grok1 \
+    --tokenizer-path Xenova/grok-1-tokenizer \
+    --load-format dummy \
+    --quantization fp8 \
+    --tp 8 \
+    --port 30000 \
+    --disable-radix-cache 2>&1 | tee "$LOGFILE"
diff --git a/sglang/3rdparty/amd/profiling/torch_profiler.patch b/sglang/3rdparty/amd/profiling/torch_profiler.patch
new file mode 100644
index 0000000000000000000000000000000000000000..40f55740638a954ad4097520aa882c3351c2fbc7
--- /dev/null
+++ b/sglang/3rdparty/amd/profiling/torch_profiler.patch
@@ -0,0 +1,25 @@
+diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
+index 62d1ff9..6ecd78c 100644
+--- a/python/sglang/srt/managers/scheduler.py
++++ b/python/sglang/srt/managers/scheduler.py
+@@ -240,7 +240,6 @@ class Scheduler:
+             )
+             self.profiler = torch.profiler.profile(
+                 activities=[
+-                    torch.profiler.ProfilerActivity.CPU,
+                     torch.profiler.ProfilerActivity.CUDA,
+                 ],
+                 with_stack=True,
+@@ -1033,9 +1032,11 @@ class Scheduler:
+         if self.profiler is None:
+             raise RuntimeError("Profiler is not enabled.")
+         self.profiler.stop()
+-        self.profiler.export_chrome_trace(
+-            self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
+-        )
++        if self.tp_rank == 0:
++            with open(f"stats_repro_{int(time.time())}.txt", "w") as f:
++                print(self.profiler.key_averages(group_by_input_shape=True).table(sort_by="cuda_time_total", row_limit=-1), file=f)
++                print("Profiling stats done.")
++
+         logger.info("Profiler is done")
diff --git a/sglang/3rdparty/amd/sgl-kernel/CMakeLists_rocm.txt b/sglang/3rdparty/amd/sgl-kernel/CMakeLists_rocm.txt
new file mode 100644
index 0000000000000000000000000000000000000000..86ba249c1bb76f7bbb57a6583035890cbd642e1d
--- /dev/null
+++ b/sglang/3rdparty/amd/sgl-kernel/CMakeLists_rocm.txt
@@ -0,0 +1,159 @@
+cmake_minimum_required(VERSION 3.24 FATAL_ERROR)
+project(sgl_kernel LANGUAGES CXX)
+
+# Cmake
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+set(CMAKE_SHARED_LIBRARY_PREFIX "")
+
+set(CMAKE_COLOR_DIAGNOSTICS ON)
+set(CMAKE_VERBOSE_MAKEFILE ON CACHE BOOL "ON")
+
+# Python / Torch
+find_package(Python COMPONENTS Interpreter Development.Module ${SKBUILD_SABI_COMPONENT} REQUIRED)
+
+execute_process(
+  COMMAND ${Python_EXECUTABLE} -c "import torch; print(torch.utils.cmake_prefix_path)"
+  OUTPUT_VARIABLE TORCH_PY_PREFIX
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+set(Torch_DIR "${TORCH_PY_PREFIX}/Torch")
+list(APPEND CMAKE_PREFIX_PATH "${TORCH_PY_PREFIX}/Torch")
+find_package(Torch REQUIRED)
+
+execute_process(
+  COMMAND ${Python_EXECUTABLE} -c "import torch; print(int(torch._C._GLIBCXX_USE_CXX11_ABI))"
+  OUTPUT_VARIABLE TORCH_CXX11_ABI
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+if(TORCH_CXX11_ABI STREQUAL "0")
+  add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=0)
+else()
+  add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=1)
+endif()
+
+# ROCm/HIP
+enable_language(HIP)
+find_package(hip REQUIRED CONFIG)
+
+# Determine AMDGPU target from environment variable or default to gfx942
+set(AMDGPU_TARGET_ENV "$ENV{AMDGPU_TARGET}")
+
+if(AMDGPU_TARGET_ENV)
+  # Use environment variable if specified
+  set(AMDGPU_TARGETS "${AMDGPU_TARGET_ENV}")
+  message(STATUS "Using AMDGPU_TARGET from environment: ${AMDGPU_TARGETS}")
+else()
+  # Default to gfx942 only
+  set(AMDGPU_TARGETS "gfx942")
+  message(STATUS "AMDGPU_TARGET not set, defaulting to gfx942")
+endif()
+
+# Set HIP architectures
+set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
+
+# FP8 macro selection
+# Always define HIP_FP8_TYPE_FNUZ=1 (for gfx942 and host compilation)
+# Additionally define HIP_FP8_TYPE_E4M3=1 when building for gfx950
+# The existing utils.h logic will pick the right one based on architecture
+set(SGL_FP8_MACROS "-DHIP_FP8_TYPE_FNUZ=1")
+
+if(AMDGPU_TARGETS MATCHES "gfx950")
+  list(APPEND SGL_FP8_MACROS "-DHIP_FP8_TYPE_E4M3=1")
+  message(STATUS "Multi-arch build: Enabling both HIP_FP8_TYPE_FNUZ (gfx942) and HIP_FP8_TYPE_E4M3 (gfx950)")
+elseif(AMDGPU_TARGETS MATCHES "gfx942")
+  message(STATUS "Single-arch build: Enabling HIP_FP8_TYPE_FNUZ for gfx942")
+else()
+  message(FATAL_ERROR "Unsupported AMDGPU_TARGET '${AMDGPU_TARGETS}'. Expected 'gfx942' or 'gfx950' or both.")
+endif()
+
+# TopK dynamic smem bytes
+# Dynamic shared-memory budget for the TopK kernels.
+# - gfx942 (MI300/MI325): LDS is typically 64KB per workgroup -> keep dynamic smem <= ~48KB
+#   (leaves room for static shared allocations in the kernel).
+# - gfx95x (MI350): LDS is larger (e.g. 160KB per CU) -> allow the original 128KB dynamic smem.
+if(AMDGPU_TARGET_ONE STREQUAL "gfx942")
+  math(EXPR SGL_TOPK_DYNAMIC_SMEM_BYTES "48 * 1024")
+else()
+  math(EXPR SGL_TOPK_DYNAMIC_SMEM_BYTES "32 * 1024 * 4")
+endif()
+
+set(SGL_TOPK_MACROS "-DSGL_TOPK_DYNAMIC_SMEM_BYTES=${SGL_TOPK_DYNAMIC_SMEM_BYTES}")
+
+# Paths / includes
+set(PROJ_ROOT ${CMAKE_CURRENT_LIST_DIR})
+set(SGL_INCLUDE_DIRS
+  ${PROJ_ROOT}/include
+  ${PROJ_ROOT}/include/impl
+  ${PROJ_ROOT}/csrc
+  ${TORCH_INCLUDE_DIRS}
+)
+
+# Platform-specific library directory
+set(PLAT_LIB_DIR "/usr/lib/x86_64-linux-gnu")
+link_directories(${PLAT_LIB_DIR})
+
+# Sources
+set(SOURCES
+${PROJ_ROOT}/csrc/allreduce/custom_all_reduce.hip
+${PROJ_ROOT}/csrc/allreduce/deterministic_all_reduce.hip
+${PROJ_ROOT}/csrc/allreduce/quick_all_reduce.hip
+${PROJ_ROOT}/csrc/common_extension_rocm.cc
+${PROJ_ROOT}/csrc/elementwise/activation.hip
+${PROJ_ROOT}/csrc/elementwise/pos_enc.hip
+${PROJ_ROOT}/csrc/elementwise/topk.hip
+${PROJ_ROOT}/csrc/grammar/apply_token_bitmask_inplace_hip.hip
+${PROJ_ROOT}/csrc/kvcacheio/transfer.hip
+${PROJ_ROOT}/csrc/moe/moe_align_kernel.hip
+${PROJ_ROOT}/csrc/moe/moe_topk_softmax_kernels.hip
+${PROJ_ROOT}/csrc/moe/moe_topk_sigmoid_kernels.hip
+${PROJ_ROOT}/csrc/speculative/eagle_utils.hip
+)
+set_source_files_properties(
+  ${SOURCES}
+  PROPERTIES
+    LANGUAGE HIP
+)
+
+# Compile / Link flags
+add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-O3>)
+
+set(SGL_HIP_FLAGS
+  -DNDEBUG
+  -DOPERATOR_NAMESPACE=sgl_kernel
+  -O3
+  -std=c++17
+  -DENABLE_BF16
+  -DENABLE_FP8
+  ${SGL_FP8_MACROS}
+  -Wno-pass-failed
+  -Wundefined-internal
+  ${SGL_TOPK_MACROS}
+)
+
+# Python extension
+Python_add_library(common_ops MODULE USE_SABI ${SKBUILD_SABI_VERSION} WITH_SOABI ${SOURCES})
+target_include_directories(common_ops PRIVATE ${SGL_INCLUDE_DIRS})
+
+# Apply per-language flags
+target_compile_options(common_ops PRIVATE
+  $<$<COMPILE_LANGUAGE:HIP>:${SGL_HIP_FLAGS}>
+)
+
+target_link_libraries(common_ops PRIVATE
+  ${TORCH_LIBRARIES}
+  hip::device
+  hip::host
+  hiprtc
+  amdhip64
+)
+
+target_link_options(common_ops PRIVATE
+  "SHELL:-Wl,-rpath,'\$ORIGIN/../../torch/lib'"
+)
+
+install(TARGETS common_ops
+  LIBRARY DESTINATION sgl_kernel
+)
diff --git a/sglang/3rdparty/amd/sgl-kernel/build_rocm.sh b/sglang/3rdparty/amd/sgl-kernel/build_rocm.sh
new file mode 100644
index 0000000000000000000000000000000000000000..1022d8bb50f363fc4a1d5d3ba428f0cb8bbb435d
--- /dev/null
+++ b/sglang/3rdparty/amd/sgl-kernel/build_rocm.sh
@@ -0,0 +1,123 @@
+#!/bin/bash
+set -euo pipefail
+ROCM_VERSION=$1
+
+PYTHON_ROOT_PATH="/opt/venv/bin"
+AMDGPU_TARGET="gfx942;gfx950"
+
+echo "Python root path is: $PYTHON_ROOT_PATH"
+
+# Get version from git tags
+SGLANG_VERSION="v0.5.6"   # Default version, will be overridden if git tags are found
+
+# Fetch tags from origin to ensure we have the latest
+if git fetch --tags origin; then
+  # Get the latest version tag sorted by version number (e.g., v0.5.7)
+  VERSION_FROM_TAG=$(git tag -l 'v[0-9]*' --sort=-v:refname | head -1)
+  if [ -n "$VERSION_FROM_TAG" ]; then
+    SGLANG_VERSION="$VERSION_FROM_TAG"
+    echo "Using SGLang version from git tags: $SGLANG_VERSION"
+  else
+    echo "Warning: No version tags found; using default $SGLANG_VERSION" >&2
+  fi
+else
+  echo "Warning: Failed to fetch tags from origin; using default $SGLANG_VERSION" >&2
+fi
+
+# Default base tags (can be overridden by command line arguments)
+DEFAULT_MI30X_BASE_TAG="${SGLANG_VERSION}-rocm700-mi30x"
+DEFAULT_MI35X_BASE_TAG="${SGLANG_VERSION}-rocm700-mi35x"
+
+# Parse command line arguments
+MI30X_BASE_TAG="${DEFAULT_MI30X_BASE_TAG}"
+MI35X_BASE_TAG="${DEFAULT_MI35X_BASE_TAG}"
+
+# Detect GPU architecture from the Kubernetes runner hostname
+HOSTNAME_VALUE=$(hostname)
+GPU_ARCH="mi30x"   # default
+
+# Host names look like: linux-mi35x-gpu-1-xxxxx-runner-zzzzz
+if [[ "${HOSTNAME_VALUE}" =~ ^linux-(mi[0-9]+[a-z]*)-gpu-[0-9]+ ]]; then
+  GPU_ARCH="${BASH_REMATCH[1]}"
+  echo "Detected GPU architecture from hostname: ${GPU_ARCH}"
+else
+  echo "Warning: could not parse GPU architecture from '${HOSTNAME_VALUE}', defaulting to ${GPU_ARCH}"
+fi
+
+case "${GPU_ARCH}" in
+  mi35x)
+    echo "Runner uses ${GPU_ARCH}; will fetch mi35x image."
+    ;;
+  mi30x|mi300|mi325)
+    echo "Runner uses ${GPU_ARCH}; will fetch mi30x image."
+    GPU_ARCH="mi30x"
+    ;;
+  *)
+    echo "Runner architecture '${GPU_ARCH}' unrecognised; defaulting to mi30x image." >&2
+    GPU_ARCH="mi30x"
+    ;;
+esac
+
+if [[ -f /etc/podinfo/gha-render-devices ]]; then
+  DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
+else
+  DEVICE_FLAG="--device /dev/dri"
+fi
+
+# Find the latest image
+find_latest_image() {
+  local gpu_arch=$1
+  local base_tag days_back image_tag
+
+  case "${gpu_arch}" in
+      mi30x) base_tag="${MI30X_BASE_TAG}" ;;
+      mi35x) base_tag="${MI35X_BASE_TAG}" ;;
+      *)     echo "Error: unsupported GPU architecture '${gpu_arch}'" >&2; return 1 ;;
+  esac
+
+  for days_back in {0..6}; do
+    image_tag="${base_tag}-$(date -d "${days_back} days ago" +%Y%m%d)"
+    echo "Checking for image: rocm/sgl-dev:${image_tag}" >&2
+    if docker manifest inspect "rocm/sgl-dev:${image_tag}" >/dev/null 2>&1; then
+      echo "Found available image: rocm/sgl-dev:${image_tag}" >&2
+      echo "rocm/sgl-dev:${image_tag}"
+      return 0
+    fi
+  done
+
+  echo "Error: no ${gpu_arch} image found in the last 7 days for base ${base_tag}" >&2
+  echo "Using hard-coded fallback…" >&2
+  if [[ "${gpu_arch}" == "mi35x" ]]; then
+    echo "rocm/sgl-dev:v0.5.3-rocm700-mi35x-20251009"
+  else
+    echo "rocm/sgl-dev:v0.5.3-rocm700-mi30x-20251009"
+  fi
+}
+
+# Pull and run the latest image
+IMAGE=$(find_latest_image "${GPU_ARCH}")
+echo "Pulling Docker image: ${IMAGE}"
+docker pull "${IMAGE}"
+
+docker run --rm \
+  -v $(pwd):/sgl-kernel \
+  -e AMDGPU_TARGET="${AMDGPU_TARGET}" \
+  ${IMAGE} \
+  bash -c "
+  # Install CMake (version >= 3.26) - Robust Installation
+  export CMAKE_VERSION_MAJOR=3.31
+  export CMAKE_VERSION_MINOR=1
+  echo \"Downloading CMake from: https://cmake.org/files/v\${CMAKE_VERSION_MAJOR}/cmake-\${CMAKE_VERSION_MAJOR}.\${CMAKE_VERSION_MINOR}-linux-x86_64.tar.gz\"
+  wget https://cmake.org/files/v\${CMAKE_VERSION_MAJOR}/cmake-\${CMAKE_VERSION_MAJOR}.\${CMAKE_VERSION_MINOR}-linux-x86_64.tar.gz
+  tar -xzf cmake-\${CMAKE_VERSION_MAJOR}.\${CMAKE_VERSION_MINOR}-linux-x86_64.tar.gz
+  mv cmake-\${CMAKE_VERSION_MAJOR}.\${CMAKE_VERSION_MINOR}-linux-x86_64 /opt/cmake
+  export PATH=/opt/cmake/bin:\$PATH
+
+  ${PYTHON_ROOT_PATH}/pip install --no-cache-dir ninja setuptools wheel numpy uv scikit-build-core && \
+
+  cd /sgl-kernel && \
+  rm -rf CMakeLists.txt && mv CMakeLists_rocm.txt CMakeLists.txt && \
+  ${PYTHON_ROOT_PATH}/python rocm_hipify.py && \
+  ${PYTHON_ROOT_PATH}/python -m uv build --wheel -Cbuild-dir=build . --color=always --no-build-isolation && \
+  ./rename_wheels_rocm.sh
+"
diff --git a/sglang/3rdparty/amd/sgl-kernel/rename_wheels_rocm.sh b/sglang/3rdparty/amd/sgl-kernel/rename_wheels_rocm.sh
new file mode 100644
index 0000000000000000000000000000000000000000..691407a3e63f9db51ef9e81379bd8b5abe098e7a
--- /dev/null
+++ b/sglang/3rdparty/amd/sgl-kernel/rename_wheels_rocm.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+set -ex
+
+WHEEL_DIR="dist"
+
+wheel_files=($WHEEL_DIR/*.whl)
+for wheel in "${wheel_files[@]}"; do
+    intermediate_wheel="${wheel/linux/manylinux2014}"
+
+    # Extract the current python version from the wheel name
+    if [[ $intermediate_wheel =~ -cp([0-9]+)- ]]; then
+        cp_version="${BASH_REMATCH[1]}"
+    else
+        echo "Could not extract Python version from wheel name: $intermediate_wheel"
+        continue
+    fi
+
+    # Detect ROCm version and add appropriate suffix
+    if ls /opt | grep -q "7.0"; then
+        new_wheel="${intermediate_wheel/-cp${cp_version}/+rocm700-cp${cp_version}}"
+    else
+        new_wheel="$intermediate_wheel"
+    fi
+
+    if [[ "$wheel" != "$new_wheel" ]]; then
+        echo "Renaming $wheel to $new_wheel"
+        mv -- "$wheel" "$new_wheel"
+    fi
+done
+echo "Wheel renaming completed."
diff --git a/sglang/3rdparty/amd/sgl-kernel/rocm_hipify.py b/sglang/3rdparty/amd/sgl-kernel/rocm_hipify.py
new file mode 100644
index 0000000000000000000000000000000000000000..8373f741d1d60b971e9b2b4503e27835673d535c
--- /dev/null
+++ b/sglang/3rdparty/amd/sgl-kernel/rocm_hipify.py
@@ -0,0 +1,40 @@
+from pathlib import Path
+
+import torch
+from torch.utils.cpp_extension import CUDAExtension
+
+root = Path(__file__).parent.resolve()
+
+include_dirs = [
+    root / "include",
+    root / "include" / "impl",
+    root / "csrc",
+]
+
+sources = [
+    "csrc/allreduce/custom_all_reduce.hip",
+    "csrc/allreduce/deterministic_all_reduce.hip",
+    "csrc/allreduce/quick_all_reduce.cu",
+    "csrc/common_extension_rocm.cc",
+    "csrc/elementwise/activation.cu",
+    "csrc/elementwise/pos_enc.cu",
+    "csrc/elementwise/topk.cu",
+    "csrc/grammar/apply_token_bitmask_inplace_cuda.cu",
+    "csrc/kvcacheio/transfer.cu",
+    "csrc/moe/moe_align_kernel.cu",
+    "csrc/moe/moe_topk_softmax_kernels.cu",
+    "csrc/moe/moe_topk_sigmoid_kernels.cu",
+    "csrc/speculative/eagle_utils.cu",
+]
+
+libraries = ["hiprtc", "amdhip64", "c10", "torch", "torch_python"]
+
+ext_modules = [
+    CUDAExtension(
+        name="sgl_kernel.common_ops",
+        sources=sources,
+        include_dirs=include_dirs,
+        libraries=libraries,
+        py_limited_api=False,
+    ),
+]
diff --git a/sglang/3rdparty/amd/tuning/TUNING.md b/sglang/3rdparty/amd/tuning/TUNING.md
new file mode 100644
index 0000000000000000000000000000000000000000..a903bba03eca30efc8076cc878258ca6278290cf
--- /dev/null
+++ b/sglang/3rdparty/amd/tuning/TUNING.md
@@ -0,0 +1,118 @@
+## Tuning SGLang Infer System with AMD GPUs
+This AppNote describes the SGLang performance tuning technical, code harness and running steps for systems with AMD Instinct GPUs.
+Harness code, examples and steps are provided in detail, to facilitate easy reproduce & use to tune performance towards workloads.
+Three primary runtime areas are covered:
+
+## 1. Triton Kernels
+To maximize Triton kernel efficiency, several strategies can be employed:
+
+### Key Environment Variables:
+- **num_stages**: Adjusts the number of pipeline stages to optimize kernel efficiency based on the specific type of operations (e.g., General Matrix Multiplication - GEMM).
+- **waves_per_eu**: Controls the usage of Vector General Purpose Registers (VGPR) to enhance occupancy, thereby improving latency or throughput.
+- **BLOCK_M, BLOCK_N, BLOCK_K**: Tunable tile sizes that assist in balancing memory transfer and computational efficiency.
+- **matrix_instr_nonkdim**: Optimizes the usage of Matrix-Fused Multiply-Add (MFMA) instructions for specific kernel types, such as Flash Attention.
+- **OPTIMIZE_EPILOGUE**: An environment variable that can be set to `1` to enhance performance by eliminating the `convert_layout` operation in the kernel's epilogue.
+```python
+@triton.autotune(configs=[
+        triton.Config({'waves_per_eu': 1}, num_warps=4, num_stages=1),
+        triton.Config({'waves_per_eu': 1}, num_warps=8, num_stages=1),
+        triton.Config({'waves_per_eu': 1}, num_warps=16, num_stages=1),
+        triton.Config({'waves_per_eu': 2}, num_warps=4, num_stages=1),
+        triton.Config({'waves_per_eu': 2}, num_warps=8, num_stages=1),
+        triton.Config({'waves_per_eu': 2}, num_warps=16, num_stages=1),
+        triton.Config({'waves_per_eu': 4}, num_warps=4, num_stages=1),
+        triton.Config({'waves_per_eu': 4}, num_warps=8, num_stages=1),
+        triton.Config({'waves_per_eu': 4}, num_warps=16, num_stages=1),
+    ], key=['BLOCK_N', 'NUM_TOKEN_BLKS'], use_cuda_graph=True)
+@triton.jit
+def _triton_kernel_function():
+    ...
+```
+## 2. Torch Tunable Operations
+**TunableOp** is a feature in PyTorch that allows for the definition and optimization of custom kernels with tunable parameters. This feature is particularly useful for enhancing the performance of kernels by experimenting with different configurations.
+
+### Key Environment Variables:
+1. **PYTORCH_TUNABLEOP_ENABLED**:
+   - Default: `0`
+   - Set to `1` to enable TunableOp.
+
+2. **PYTORCH_TUNABLEOP_TUNING**:
+   - Default: `1`
+   - Set to `0` to disable tuning. If a tuned entry is not found, it will run the tuning step and record the entry when PYTORCH_TUNABLEOP_ENABLED is enabled.
+
+3. **PYTORCH_TUNABLEOP_VERBOSE**:
+   - Default: `0`
+   - Set to `1` to enable verbose output for TunableOp.
+
+### Usage Example:
+To enable TunableOp and tuning, and optionally enable verbose mode, you can run the following command in your terminal:
+
+```bash
+#Tuning
+PYTORCH_TUNABLEOP_ENABLED=1 PYTORCH_TUNABLEOP_TUNING=1 your_script.sh
+
+#Inference with tuning op
+PYTORCH_TUNABLEOP_ENABLED=1 PYTORCH_TUNABLEOP_TUNING=0 your_script.sh
+
+#Print out the log
+PYTORCH_TUNABLEOP_ENABLED=1 PYTORCH_TUNABLEOP_TUNING=0 PYTORCH_TUNABLEOP_VERBOSE=1 your_script.sh
+
+```
+## 3. Torch Compilation
+
+
+The following are suggestions for optimizing matrix multiplication (GEMM) and convolution (conv) operations in PyTorch using Inductor, a part of the PyTorch compilation framework. The goal is to leverage Triton to achieve better performance.
+
+To tune Triton kernels with GEMM and convolution ops (conv), use the `torch.compile` function with the max-autotune mode. This benchmarks a predefined list of Triton configurations and selects the fastest one for each shape.
+
+### Key Configurations:
+1. **Max Autotune**:
+   - Set `torch._inductor.config.max_autotune = True` or `TORCHINDUCTOR_MAX_AUTOTUNE=1`.
+
+2. **Fine-Grained Control**:
+   - Enable GEMM tuning: `torch._inductor.config.max_autotune_gemm = True`.
+   - Enable tuning for pointwise/reduction ops: `torch._inductor.config.max_autotune.pointwise = True`.
+
+3. **Backend Selection**:
+   - Use `torch._inductor.max_autotune_gemm_backends` to limit backends to TRITON for better performance.
+
+4. **Freezing for Inference**:
+   - Use `torch._inductor.config.freezing=True` to enable constant folding optimizations.
+
+5. **Debugging**:
+   - Set `TORCH_COMPILE_DEBUG=1` to extract Triton kernels generated by Inductor.
+
+### Example Code Block:
+```bash
+#Gemm Tuning
+TORCHINDUCTOR_MAX_AUTOTUNE=1 TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1 your_script.sh
+
+#Specify your backend to TRITON for Gemm Tuning
+TORCHINDUCTOR_MAX_AUTOTUNE=1 TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1 TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_BACKENDS=TRITON your_script.sh
+
+#Inference with large improvement on AMD GPU
+TORCHINDUCTOR_FREEZING=1 your_script.sh
+```
+## 4. Fused MOE kernel
+To maximize moe kernel efficiency, need to use below scripts to find out the best launch configuration
+
+### Key parameters:
+- **--model**: what moe model type to do tuning, it will automatically decide the size of d_model, model_intermediate_size, num_layers
+- **--tp-size**: simulate the whole model run configuration to set the dimension size using tp correctly
+- **--batch**: M dimension size of moe kernel, for prefill moe kernel the value is batch*input_len, for decode moe kernel the value is batch
+- **--dtype**: computation type
+
+```bash
+#Tuning
+#for example, we have one case like this "python3 -m sglang.bench_latency --model dummy_grok1/ --load-format dummy --tokenizer-path Xenova/grok-1-tokenizer --tp 8 --batch-size 32 --input 1024 --output 8 --attention-backend triton --sampling-backend pytorch --quantization fp8" to run, it defined batch-size 32 input length 1024 and output length 8, from "--batch" in moe view point, the prefill batch is 32*1024 = 32768, the decode batch is 32*1(only one output token generated in each run).
+#so we can tune decode moe use below command
+python benchmark_moe_rocm.py --model grok1 --tp-size 8 --dtype float8 --batch "32"
+# and use this command to tune prefill moe
+python benchmark_moe_rocm.py --model grok1 --tp-size 8 --dtype float8 --batch "32768"
+```
+
+## Reference
+
+For more detailed information on tuning SGLang performance with AMD GPUs, please refer to the following link:
+
+[ROCm Documentation: Triton Kernel Performance Optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#triton-kernel-performance-optimization)
diff --git a/sglang/3rdparty/amd/tuning/benchmark_moe_rocm.py b/sglang/3rdparty/amd/tuning/benchmark_moe_rocm.py
new file mode 100644
index 0000000000000000000000000000000000000000..131b25270ab874960c3f436d6af93e2addf77bc8
--- /dev/null
+++ b/sglang/3rdparty/amd/tuning/benchmark_moe_rocm.py
@@ -0,0 +1,378 @@
+import argparse
+import json
+import os
+import sys
+
+import torch
+import torch.nn.functional as F
+import triton
+import triton.language as tl
+from tqdm import tqdm
+from transformers import AutoConfig
+
+from sglang.srt.layers.moe.fused_moe_triton.fused_moe import (
+    fused_moe,
+    get_config_file_name,
+)
+
+padding_size = 128 if bool(int(os.getenv("SGLANG_MOE_PADDING", "0"))) else 0
+
+
+def main(model, tp_size, dtype: str, batches):
+    method = fused_moe
+
+    for bs in batches:
+        run_grid(int(bs), model=model, method=method, tp_size=tp_size, dtype=dtype)
+
+
+def prune_configs(M, N, K, configs):
+    pruned_configs = []
+    elemBytes_a = 1  # [DV Note] Hard-coded for float16 (2 bytes)
+    elemBytes_b = 1  # [DV Note] Hard-coded for float16 (2 bytes)
+
+    mfma = 16 if M < 32 or N < 32 else 32
+
+    # TODO (zhanglx): figure out the boundary between large and small gemms
+    large_gemm = False
+    if M >= 2048 and N >= 2048:
+        large_gemm = True
+
+    for config in configs:
+        BLOCK_SIZE_M = config.get("BLOCK_SIZE_M")
+        BLOCK_SIZE_N = config.get("BLOCK_SIZE_N")
+        BLOCK_SIZE_K = config.get("BLOCK_SIZE_K")
+        num_warps = config.get("num_warps")
+        matrix_instr_nonkdim = config.get("matrix_instr_nonkdim")
+        # kpack = config.get("kpack")
+        if matrix_instr_nonkdim > mfma:
+            continue
+        if mfma == 4 and BLOCK_SIZE_K < 64:
+            continue
+        # some layouts could not work properly in case
+        # number elements per thread is less 1
+        if BLOCK_SIZE_M * BLOCK_SIZE_N < 64:
+            continue
+        SPLIT_K = 1  # config.get("SPLIT_K")
+        GROUP_M = config.get("GROUP_SIZE_M")
+        if matrix_instr_nonkdim > BLOCK_SIZE_M or matrix_instr_nonkdim > BLOCK_SIZE_N:
+            continue
+        if matrix_instr_nonkdim >= M and matrix_instr_nonkdim != BLOCK_SIZE_M:
+            continue
+        if matrix_instr_nonkdim >= N and matrix_instr_nonkdim != BLOCK_SIZE_N:
+            continue
+        # Skip BLOCK_SIZE that is too large compare to M/N
+        # unless BLOCK_SIZE is already small enough
+        if M * 2 < BLOCK_SIZE_M and BLOCK_SIZE_M != 16:
+            continue
+        if N * 2 < BLOCK_SIZE_N and BLOCK_SIZE_N != 16:
+            continue
+        # skip large split_k when not necessary
+        if SPLIT_K != 1 and not need_split_k(M, N, K):
+            continue
+        # skip split_k that leads to EVEN_K = false
+        leap = SPLIT_K * BLOCK_SIZE_K
+        modv = K % leap
+        if modv != 0:
+            continue
+        # skip large GROUP_M
+        if GROUP_M * BLOCK_SIZE_M > M and GROUP_M != 1:
+            continue
+        # out of shared memory resource
+        # TODO (zhanglx): This does not consider the LDS usage in the epilogue
+        LDS = (
+            BLOCK_SIZE_K * BLOCK_SIZE_M * elemBytes_a
+            + BLOCK_SIZE_K * BLOCK_SIZE_N * elemBytes_b
+        )
+        if LDS > 65536:
+            continue
+        # Skip small block sizes and num_warps for large gemm
+        # For fp16 and f8, we want to only use BLOCK_SIZE >= 64
+        if large_gemm:
+            if BLOCK_SIZE_M < 64 or BLOCK_SIZE_N < 64:
+                continue
+            if BLOCK_SIZE_K < 64:
+                continue
+            if num_warps < 4:
+                continue
+
+        pruned_configs.append(config)
+
+    return pruned_configs
+
+
+def union_of_list_of_dicts(l1, l2):
+    result = []
+    temp_list = l1.copy()
+    temp_list.extend(l2)
+    for myDict in temp_list:
+        if myDict not in result:
+            result.append(myDict)
+
+    return result
+
+
+def run_grid(bs, model, method, tp_size, dtype: str):
+
+    config = AutoConfig.from_pretrained(model)
+
+    top_k = config.num_experts_per_tok
+    d_model = config.hidden_size
+    model_intermediate_size = config.intermediate_size
+    num_layers = config.num_hidden_layers
+    hidden_states_dtype = config.torch_dtype
+
+    if config.num_experts_per_tok:
+        if config.architectures[0] == "Grok1ModelForCausalLM":
+            num_total_experts = config.num_experts
+        else:
+            num_total_experts = config.num_local_experts
+    else:
+        raise ValueError(f"Unsupported Mixtral model {model}")
+
+    # tp_size = 2
+    num_warmup_calls = 10
+    num_calls = 30
+
+    num_warmup_trials = 1
+    num_trials = 1
+
+    full_configs = []
+
+    block_m_range = [16, 32, 64, 128, 256]
+    block_n_range = [16, 32, 64, 128, 256]
+    block_k_range = [32, 64, 128, 256]  # MUST >= 32
+    num_warps_range = [1, 2, 4, 8]
+    group_m_range = [1, 4, 8, 16, 32]
+    # For now we see better perf with num_stages=0 for all gemm configs we care
+    # But keep this explicit so that we do not forget we may need to set it to
+    # other values in the future
+    num_stage_range = [2]
+    waves_per_eu_range = [0, 1, 2, 4, 8]
+    # Remove 32 because of triton compiling error
+    matrix_instr_nonkdim_range = [16]
+    kpack_range = [1, 2]
+
+    for block_size_m in block_m_range:
+        for block_size_n in block_n_range:
+            for block_size_k in block_k_range:
+                for group_size_m in group_m_range:
+                    for num_warps in num_warps_range:
+                        for num_stages in num_stage_range:
+                            for waves_per_eu in waves_per_eu_range:
+                                for matrix_instr_nonkdim in matrix_instr_nonkdim_range:
+                                    for kpack in kpack_range:
+                                        full_configs.append(
+                                            {
+                                                "BLOCK_SIZE_M": block_size_m,
+                                                "BLOCK_SIZE_N": block_size_n,
+                                                "BLOCK_SIZE_K": block_size_k,
+                                                "GROUP_SIZE_M": group_size_m,
+                                                "num_warps": num_warps,
+                                                "num_stages": num_stages,
+                                                "waves_per_eu": waves_per_eu,
+                                                "matrix_instr_nonkdim": matrix_instr_nonkdim,
+                                                "kpack": kpack,
+                                            }
+                                        )
+
+    M1 = bs * 2
+    N1 = model_intermediate_size * 2 // tp_size
+    K1 = d_model
+    prune_configs_1 = prune_configs(M1, N1, K1, full_configs)
+
+    M2 = bs * 2
+    N2 = d_model
+    K2 = model_intermediate_size // tp_size
+    prune_configs_2 = prune_configs(M2, N2, K2, full_configs)
+
+    configs = union_of_list_of_dicts(prune_configs_1, prune_configs_2)
+
+    print(f"{bs=} || {len(full_configs)=} | {len(prune_configs_1)=} | \
+            {len(prune_configs_2)=} | {len(configs)=}")
+
+    best_config = None
+    best_time_us = 1e20
+
+    print(f"{tp_size=} {bs=}")
+
+    for config in tqdm(configs):
+        # warmup
+        try:
+            print(config)
+            for _ in range(num_warmup_trials):
+                run_timing(
+                    num_calls=num_warmup_calls,
+                    bs=bs,
+                    d_model=d_model,
+                    num_total_experts=num_total_experts,
+                    top_k=top_k,
+                    tp_size=tp_size,
+                    model_intermediate_size=model_intermediate_size,
+                    method=method,
+                    config=config,
+                    dtype=dtype,
+                    hidden_states_dtype=hidden_states_dtype,
+                )
+        except triton.runtime.autotuner.OutOfResources:
+            continue
+
+        # trial
+        for _ in range(num_trials):
+            kernel_dur_ms = run_timing(
+                num_calls=num_calls,
+                bs=bs,
+                d_model=d_model,
+                num_total_experts=num_total_experts,
+                top_k=top_k,
+                tp_size=tp_size,
+                model_intermediate_size=model_intermediate_size,
+                method=method,
+                config=config,
+                dtype=dtype,
+                hidden_states_dtype=hidden_states_dtype,
+            )
+
+            kernel_dur_us = 1000 * kernel_dur_ms
+            model_dur_ms = kernel_dur_ms * num_layers
+
+            if kernel_dur_us < best_time_us:
+                best_config = config
+                best_time_us = kernel_dur_us
+
+                tqdm.write(
+                    f"{kernel_dur_us=:.1f} {model_dur_ms=:.1f}"
+                    f" {bs=} {tp_size=} {top_k=} {num_total_experts=} "
+                    f"{d_model=} {model_intermediate_size=} {num_layers=}"
+                )
+
+    print("best_time_us", best_time_us)
+    print("best_config", best_config)
+
+    # holds Dict[str, Dict[str, int]]
+    filename = get_config_file_name(
+        num_total_experts,
+        model_intermediate_size // tp_size,
+        "float8" if dtype == "float8" else None,
+    )
+    print(f"writing config to file {filename}")
+    existing_content = {}
+    if os.path.exists(filename):
+        with open(filename, "r") as f:
+            existing_content = json.load(f)
+    existing_content[str(bs)] = best_config
+    with open(filename, "w") as f:
+        json.dump(existing_content, f, indent=4)
+        f.write("\n")
+
+
+def run_timing(
+    num_calls: int,
+    bs: int,
+    d_model: int,
+    num_total_experts: int,
+    top_k: int,
+    tp_size: int,
+    model_intermediate_size: int,
+    method,
+    config,
+    dtype: str,
+    hidden_states_dtype,
+) -> float:
+    shard_intermediate_size = model_intermediate_size // tp_size
+
+    hidden_states = torch.rand(
+        (bs, d_model),
+        device="cuda:0",
+        dtype=hidden_states_dtype,
+    )
+
+    w1 = torch.rand(
+        (num_total_experts, 2 * shard_intermediate_size, d_model + padding_size),
+        device=hidden_states.device,
+        dtype=hidden_states.dtype,
+    )
+
+    w2 = torch.rand(
+        (num_total_experts, d_model, shard_intermediate_size + padding_size),
+        device=hidden_states.device,
+        dtype=hidden_states.dtype,
+    )
+
+    w1_scale = None
+    w2_scale = None
+    a1_scale = None
+    a2_scale = None
+
+    if dtype == "float8":
+        w1 = w1.to(torch.float8_e4m3fnuz)
+        w2 = w2.to(torch.float8_e4m3fnuz)
+        w1_scale = torch.ones(
+            num_total_experts, device=hidden_states.device, dtype=torch.float32
+        )
+        w2_scale = torch.ones(
+            num_total_experts, device=hidden_states.device, dtype=torch.float32
+        )
+        a1_scale = torch.ones(1, device=hidden_states.device, dtype=torch.float32)
+        a2_scale = torch.ones(1, device=hidden_states.device, dtype=torch.float32)
+
+    gating_output = F.softmax(
+        torch.rand(
+            (num_calls, bs, num_total_experts),
+            device=hidden_states.device,
+            dtype=torch.float32,
+        ),
+        dim=-1,
+    )
+
+    ##################################
+
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+
+    start_event.record()
+    for i in range(num_calls):
+        hidden_states = method(
+            hidden_states=hidden_states,
+            w1=w1,
+            w2=w2,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a1_scale,
+            a2_scale=a2_scale,
+            gating_output=gating_output[0],
+            topk=top_k,
+            renormalize=True,
+            inplace=True,
+            override_config=config,
+            use_fp8=dtype == "float8",
+        )
+
+    end_event.record()
+    end_event.synchronize()
+
+    dur_ms = start_event.elapsed_time(end_event) / num_calls
+    return dur_ms
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        prog="benchmark_mixtral_moe",
+        description="Benchmark and tune the fused_moe kernel",
+    )
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        default="auto",
+        choices=["float8", "float16", "bfloat16"],
+        help="Data type used for fused_moe kernel computations",
+    )
+    parser.add_argument("--model", type=str, default="hpcai-tech/grok-1")
+
+    parser.add_argument("--tp-size", type=int, default=2, help="Tensor paralleli size")
+    parser.add_argument("-b", "--batches", type=str)
+
+    args = parser.parse_args()
+
+    batches = args.batches.split(",")
+
+    sys.exit(main(args.model, args.tp_size, args.dtype, batches))
diff --git a/sglang/docs/supported_models/extending/modelscope.md b/sglang/docs/supported_models/extending/modelscope.md
new file mode 100644
index 0000000000000000000000000000000000000000..4740c2770f9e9d910b8269e5431b604be9a3f740
--- /dev/null
+++ b/sglang/docs/supported_models/extending/modelscope.md
@@ -0,0 +1,28 @@
+# Use Models From ModelScope
+
+To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable `SGLANG_USE_MODELSCOPE`.
+
+```bash
+export SGLANG_USE_MODELSCOPE=true
+```
+
+We take [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) as an example.
+
+Launch the Server:
+```bash
+python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
+```
+
+Or start it by docker:
+
+```bash
+docker run --gpus all \
+    -p 30000:30000 \
+    -v ~/.cache/modelscope:/root/.cache/modelscope \
+    --env "SGLANG_USE_MODELSCOPE=true" \
+    --ipc=host \
+    lmsysorg/sglang:latest \
+    python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0 --port 30000
+```
+
+Note that modelscope uses a different cache directory than huggingface. You may need to set it manually to avoid running out of disk space.
diff --git a/sglang/docs/supported_models/extending/support_new_models.md b/sglang/docs/supported_models/extending/support_new_models.md
new file mode 100644
index 0000000000000000000000000000000000000000..bc683a636530eca52b9795b9013df686c141bb1b
--- /dev/null
+++ b/sglang/docs/supported_models/extending/support_new_models.md
@@ -0,0 +1,320 @@
+# How to Support New Models
+
+This document explains how to add support for new language models and multimodal large language models (MLLMs) in
+SGLang. It also covers how to test new models and register external implementations.
+
+## How to Support a New Language Model
+
+To support a new model in SGLang, you only need to add a single file under
+the [SGLang Models Directory](https://github.com/sgl-project/sglang/tree/main/python/sglang/srt/models). You can learn
+from existing model implementations and create a new file for your model. For most models, you should be able to find a
+similar model to start with (e.g., starting from Llama). Also refer how
+to [port a Model from vLLM to SGLang](#port-a-model-from-vllm-to-sglang)
+
+## How to Support a New Multimodal Large Language Model
+
+To support a new multimodal large language model (MLLM) in SGLang, there are several key components in addition to the
+standard LLM support:
+
+1. **Register your new model as multimodal**:
+   Extend `is_multimodal_model`
+   in [model_config.py](https://github.com/sgl-project/sglang/blob/0ab3f437aba729b348a683ab32b35b214456efc7/python/sglang/srt/configs/model_config.py#L561)
+   to return `True` for your model.
+
+2. **Register a new chat-template**:
+   Only when your default chat-template is unable to accept images as input: Register a new chat template in [conversation.py](https://github.com/sgl-project/sglang/tree/main/python/sglang/srt/conversation.py) and the corresponding matching function.
+
+3. **Multimodal Data Processor**:
+   Define a new `Processor` class that inherits from `BaseMultimodalProcessor` and register this processor as your
+   model’s dedicated processor.
+   See [multimodal_processor.py](https://github.com/sgl-project/sglang/tree/main/python/sglang/srt/multimodal/processors)
+   for more details.
+
+4. **Handle Multimodal Tokens**:
+   Implement a `pad_input_ids` function for your new model. In this function, multimodal tokens in the prompt should be
+   expanded (if necessary) and padded with multimodal-data-hashes so that SGLang can recognize different multimodal data
+   with `RadixAttention`.
+
+5. **Handle Image Feature Extraction**:
+   Implement a `get_image_feature` function for your new model, which extracts image features from raw image data and converts them into the embeddings used by the language model.
+
+6. **Adapt to Vision Attention**:
+   Adapt the multi-headed `Attention` of ViT with SGLang’s `VisionAttention`.
+
+You can refer to [Qwen2VL](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/models/qwen2_vl.py) or
+other mllm implementations. These models demonstrate how to correctly handle both multimodal and textual inputs.
+
+## Testing and Debugging
+
+Please note all your testing and benchmarking results in PR description.
+
+### Interactive Debugging
+
+For interactive debugging, compare the outputs of Hugging Face/Transformers and SGLang. The following two commands
+should give the same text output and very similar prefill logits:
+
+- Get the reference output:
+  ```bash
+  python3 scripts/playground/reference_hf.py --model-path [new model] --model-type {text,mllm}
+  ```
+- Get the SGLang output:
+  ```bash
+  python3 -m sglang.bench_one_batch --correct --model [new model]
+  ```
+
+### Add the Model to the Test Suite
+
+To ensure the new model is well maintained, add it to the test suite by including it in the `ALL_OTHER_MODELS` list in
+the [test_generation_models.py](https://github.com/sgl-project/sglang/blob/main/test/srt/models/test_generation_models.py)
+file, test the new model on your local machine and report the results on demonstrative benchmarks (GSM8K, MMLU, MMMU,
+MMMU-Pro, etc.) in your PR. \\
+For VLMs, also include a test in `test_vision_openai_server_{x}.py` (e.g. [test_vision_openai_server_a.py](https://github.com/sgl-project/sglang/blob/main/test/srt/test_vision_openai_server_a.py), [test_vision_openai_server_b.py](https://github.com/sgl-project/sglang/blob/main/test/srt/test_vision_openai_server_b.py)).
+
+This is an example command to run to test a new model on your local machine:
+
+```bash
+ONLY_RUN=Qwen/Qwen2-1.5B python3 -m unittest test_generation_models.TestGenerationModels.test_others
+```
+
+### Benchmark
+
+- **(Required) MMMU**: follow MMMU benchmark [README.md](https://github.com/sgl-project/sglang/blob/main/benchmark/mmmu/README.md) to get SGLang vs. HF Transformer accuracy comparison. The accuracy score from SGLang run should not be much lower than that from HF Transformer run. Similarly, follow https://docs.sglang.io/developer_guide/benchmark_and_profiling.html to get performance comparison: TTFT and throughput must meet or exceed baselines (e.g., HF Transformer).
+- **(Optional) Other evals**: If you ran other evals, please note the results in PR description.
+
+## Port a Model from vLLM to SGLang
+
+The [vLLM Models Directory](https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models) is a valuable
+resource, as vLLM covers many models. SGLang reuses vLLM’s interface and some layers, making it easier to port models
+from vLLM to SGLang.
+
+To port a model from vLLM to SGLang:
+
+- Compare these two files for guidance:
+  - [SGLang Llama Implementation](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/models/llama.py)
+  - [vLLM Llama Implementation](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llama.py)
+- The major differences include:
+  - **Replace vLLM’s `Attention` with `RadixAttention`** (ensure you pass `layer_id` to `RadixAttention`).
+  - **Replace vLLM’s `LogitsProcessor` with SGLang’s `LogitsProcessor`.**
+  - **Replace the multi-headed `Attention` of ViT with SGLang’s `VisionAttention`.**
+  - **Replace other vLLM layers** (such as `RMSNorm`, `SiluAndMul`) with SGLang layers.
+  - **Remove `Sample`.**
+  - **Change the `forward()` functions** and add a `forward_batch()` method.
+  - **Add `EntryClass`** at the end.
+  - **Ensure that the new implementation uses only SGLang components** and does not rely on any vLLM components.
+
+Note: make sure you add your new model to the supported models list in the supported models documentation.
+
+## Registering an External Model Implementation
+
+In addition to the methods above, you can register your new model with the `ModelRegistry` before launching the server.
+This allows you to integrate your model without modifying the source code.
+
+For example:
+
+```python
+from sglang.srt.models.registry import ModelRegistry
+from sglang.srt.entrypoints.http_server import launch_server
+
+# For a single model, add it to the registry:
+ModelRegistry.models[model_name] = model_class
+
+# For multiple models, you can imitate the import_model_classes() function:
+from functools import lru_cache
+
+@lru_cache()
+def import_new_model_classes():
+    model_arch_name_to_cls = {}
+    # Populate model_arch_name_to_cls with your new model classes.
+    ...
+    return model_arch_name_to_cls
+
+ModelRegistry.models.update(import_new_model_classes())
+
+# Launch the server with your server arguments:
+launch_server(server_args)
+```
+
+## Example: Implementing and Serving a Llama Wrapper Model
+
+Below is an introductory, step-by-step walkthrough on how to implement a new model end-to-end in SGLang and then run it via the [Offline Engine](https://github.com/sgl-project/sglang/blob/main/docs/basic_usage/offline_engine_api.ipynb).
+
+### Implementing Our Model
+
+To keep things simple, this new model will be a simple wrapper around [Llama 3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct), and our goal will be just to bias the output logits for each `forward` call by taking the square root of each individual logit.
+
+Let's start by defining our model in a file called `llama_wrapper.py`.
+The first step is to import the necessary libraries from SRT, which is SGLang's internal backend.
+
+```python
+# In the file `llama_wrapper.py`
+
+import torch
+from transformers import LlamaConfig
+from typing import Optional
+from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+
+from sglang.srt.models.llama import LlamaForCausalLM
+```
+
+Next, we declare a new `class` for our model and have it inherit from `LlamaForCausalLM`, which allows our model to access `LlamaForCausalLM`'s predefined modules and layers, such as `LlamaAttention` and `LlamaMLP`.
+Note that almost all model implementations take in `config` and `quant_config` as arguments for their `__init__` method; `config` and `quant_config` are passed in via [`model_loader/loader.py`](https://github.com/sgl-project/sglang/blob/bf72b80122fd888bf619d17b96fa3e323ab809fc/python/sglang/srt/model_loader/loader.py#L219).
+Because we have inherited from `LlamaForCausalLM`, we can pass our parameters directly to its constructor, which will set the member variables for us.
+
+```python
+class LlamaWrapper(LlamaForCausalLM):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(config=config, quant_config=quant_config, prefix=prefix)
+```
+
+Now, we want to define the `forward` method, which is what will be called at inference time.
+Note that the signature for `forward` is essentially the same for any model; you can take a look at the other models defined in the [`models` directory](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/models/) for references.
+To see where exactly `forward` is called in the SGLang runtime's internals, take a look at [`forward_decode`](https://github.com/sgl-project/sglang/blob/bf72b80122fd888bf619d17b96fa3e323ab809fc/python/sglang/srt/model_executor/model_runner.py#L1705) and [`forward_extend`](https://github.com/sgl-project/sglang/blob/bf72b80122fd888bf619d17b96fa3e323ab809fc/python/sglang/srt/model_executor/model_runner.py#L1724) in the [`ModelRunner` class](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/model_executor/model_runner.py).
+
+```python
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+        input_embeds: Optional[torch.Tensor] = None,
+        get_embedding: bool = False,
+    ) -> LogitsProcessorOutput:
+```
+
+We now call the `__call__` method for `self.model` (which is a member variable that `LlamaForCausalLM` defines in its `__init__` method), which eventually calls `LlamaForCausalLM`'s `forward` method.
+After that, we feed the `hidden_states` into our model's `LogitsProcessor` (again defined in `LlamaForCausalLM`).
+
+```python
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            input_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+
+        res: LogitsProcessorOutput = self.logits_processor(
+            input_ids,
+            hidden_states,
+            self.lm_head,
+            forward_batch,
+        )
+```
+
+After receiving the logits for the next token, we can finally perform our biasing step.
+
+```python
+        orig_logits = res.next_token_logits
+        res.next_token_logits = torch.where(
+            orig_logits > 0,
+            orig_logits.sqrt(),
+            orig_logits
+        )
+
+        return res
+```
+
+Now, our `LlamaWrapper` model is created and ready to be served!
+
+### Serving Our Model Via SGLang's Offline Engine
+
+The next step of this walkthrough involves hosting our new model offline, so that it can be served locally and without an HTTP server.
+
+First, create a new file called `run.py`.
+Now, we must ensure that SGLang's `ModelRegistry` can find our model.
+To do this, we first download the model's configuration and weights from Huggingface.
+
+```python
+# In the file `run.py`
+
+import asyncio
+from functools import lru_cache
+from huggingface_hub import snapshot_download
+from llama_wrapper import LlamaWrapper # Make sure to import our new model!
+import sglang as sgl
+from sglang.srt.models.registry import ModelRegistry
+
+# Make sure to request access to this model on Huggingface, then export your
+# `HF_TOKEN` to download the model snapshot
+llama_dir = snapshot_download(
+    repo_id="meta-llama/Llama-3.1-8B-Instruct",
+    local_dir="./llama_ckpt",
+)
+```
+
+Now that we have our model on disk, we want to point it to `LlamaWrapper` by changing the `architectures` field in `./llama_ckpt/config.json` to be `LlamaWrapper`.
+That way, when we pass in the path of our model checkpoint to SGLang, it will know that we want to use "LlamaWrapper" instead of "LlamaForCausalLM" as our model.
+
+```python
+{
+  "architectures": [
+   #  "LlamaForCausalLM"
+    "LlamaWrapper"
+  ],
+  ...
+}
+```
+
+However, if we don't link our `LlamaWrapper` class to the "LlamaWrapper" registry keyword, then SGLang won't be able to find our model.
+Thus, to register our `LlamaWrapper`, we want to follow the steps in the above section titled "Registering an External Model Implementation".
+
+```python
+@lru_cache()
+def import_new_model_classes():
+    model_arch_name_to_cls = {"LlamaWrapper": LlamaWrapper}
+    return model_arch_name_to_cls
+
+ModelRegistry.models.update(import_new_model_classes())
+```
+
+Lastly, when we create our `Engine`, we just pass in the path to the local model directory.
+Then, our `LlamaWrapper` is ready to be served; for this walkthrough, we will use SGLang `Engine`'s non-streaming asynchronous generation endpoint.
+
+```python
+def main():
+    llm = sgl.Engine(model_path="./llama_ckpt")
+    sampling_params = {"temperature": 0.2, "top_k": 5}
+    prompts = [
+        "Write a short, neutral self-introduction for a fictional character. Hello, my name is",
+        "Provide a concise factual statement about France’s capital city. The capital of France is",
+        "Explain possible future trends in artificial intelligence. The future of AI is",
+    ]
+
+    asyncio.run(run_llm(llm, sampling_params, prompts))
+
+    llm.shutdown()
+
+async def run_llm(
+    llm,
+    sampling_params,
+    prompts,
+) -> None:
+    outputs = await llm.async_generate(prompts, sampling_params)
+
+    for prompt, output in zip(prompts, outputs):
+        print(f"\nPrompt: {prompt}")
+        print(f"Generated text: {output['text']}")
+
+if __name__ == "__main__":
+    main()
+```
+
+Now, when we call `python run.py`, we will get the outputs of our newly created model!
+
+## Documentation
+
+Add to table of supported models in [generative_models.md](../text_generation/generative_models.md) or [multimodal_language_models.md](../text_generation/multimodal_language_models.md)
+
+---
+
+By following these guidelines, you can add support for new language models and multimodal large language models in
+SGLang and ensure they are thoroughly tested and easily integrated into the system.
diff --git a/sglang/docs/supported_models/retrieval_ranking/classify_models.md b/sglang/docs/supported_models/retrieval_ranking/classify_models.md
new file mode 100644
index 0000000000000000000000000000000000000000..9b3c6a5914f09f13ab9ae73f2b988bb25309096f
--- /dev/null
+++ b/sglang/docs/supported_models/retrieval_ranking/classify_models.md
@@ -0,0 +1,162 @@
+# Classification API
+
+This document describes the `/v1/classify` API endpoint implementation in SGLang, which is compatible with vLLM's classification API format.
+
+## Overview
+
+The classification API allows you to classify text inputs using classification models. This implementation follows the same format as vLLM's 0.7.0 classification API.
+
+## API Endpoint
+
+```
+POST /v1/classify
+```
+
+## Request Format
+
+```json
+{
+  "model": "model_name",
+  "input": "text to classify"
+}
+```
+
+### Parameters
+
+- `model` (string, required): The name of the classification model to use
+- `input` (string, required): The text to classify
+- `user` (string, optional): User identifier for tracking
+- `rid` (string, optional): Request ID for tracking
+- `priority` (integer, optional): Request priority
+
+## Response Format
+
+```json
+{
+  "id": "classify-9bf17f2847b046c7b2d5495f4b4f9682",
+  "object": "list",
+  "created": 1745383213,
+  "model": "jason9693/Qwen2.5-1.5B-apeach",
+  "data": [
+    {
+      "index": 0,
+      "label": "Default",
+      "probs": [0.565970778465271, 0.4340292513370514],
+      "num_classes": 2
+    }
+  ],
+  "usage": {
+    "prompt_tokens": 10,
+    "total_tokens": 10,
+    "completion_tokens": 0,
+    "prompt_tokens_details": null
+  }
+}
+```
+
+### Response Fields
+
+- `id`: Unique identifier for the classification request
+- `object`: Always "list"
+- `created`: Unix timestamp when the request was created
+- `model`: The model used for classification
+- `data`: Array of classification results
+  - `index`: Index of the result
+  - `label`: Predicted class label
+  - `probs`: Array of probabilities for each class
+  - `num_classes`: Total number of classes
+- `usage`: Token usage information
+  - `prompt_tokens`: Number of input tokens
+  - `total_tokens`: Total number of tokens
+  - `completion_tokens`: Number of completion tokens (always 0 for classification)
+  - `prompt_tokens_details`: Additional token details (optional)
+
+## Example Usage
+
+### Using curl
+
+```bash
+curl -v "http://127.0.0.1:8000/v1/classify" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "jason9693/Qwen2.5-1.5B-apeach",
+    "input": "Loved the new café—coffee was great."
+  }'
+```
+
+### Using Python
+
+```python
+import requests
+import json
+
+# Make classification request
+response = requests.post(
+    "http://127.0.0.1:8000/v1/classify",
+    headers={"Content-Type": "application/json"},
+    json={
+        "model": "jason9693/Qwen2.5-1.5B-apeach",
+        "input": "Loved the new café—coffee was great."
+    }
+)
+
+# Parse response
+result = response.json()
+print(json.dumps(result, indent=2))
+```
+
+## Supported Models
+
+The classification API works with any classification model supported by SGLang, including:
+
+### Classification Models (Multi-class)
+- `LlamaForSequenceClassification` - Multi-class classification
+- `Qwen2ForSequenceClassification` - Multi-class classification
+- `Qwen3ForSequenceClassification` - Multi-class classification
+- `BertForSequenceClassification` - Multi-class classification
+- `Gemma2ForSequenceClassification` - Multi-class classification
+
+**Label Mapping**: The API automatically uses the `id2label` mapping from the model's `config.json` file to provide meaningful label names instead of generic class names. If `id2label` is not available, it falls back to `LABEL_0`, `LABEL_1`, etc., or `Class_0`, `Class_1` as a last resort.
+
+### Reward Models (Single score)
+- `InternLM2ForRewardModel` - Single reward score
+- `Qwen2ForRewardModel` - Single reward score
+- `LlamaForSequenceClassificationWithNormal_Weights` - Special reward model
+
+**Note**: The `/classify` endpoint in SGLang was originally designed for reward models but now supports all non-generative models. Our `/v1/classify` endpoint provides a standardized vLLM-compatible interface for classification tasks.
+
+## Error Handling
+
+The API returns appropriate HTTP status codes and error messages:
+
+- `400 Bad Request`: Invalid request format or missing required fields
+- `500 Internal Server Error`: Server-side processing error
+
+Error response format:
+```json
+{
+  "error": "Error message",
+  "type": "error_type",
+  "code": 400
+}
+```
+
+## Implementation Details
+
+The classification API is implemented using:
+
+1. **Rust Model Gateway**: Handles routing and request/response models in `sgl-model-gateway/src/protocols/spec.rs`
+2. **Python HTTP Server**: Implements the actual endpoint in `python/sglang/srt/entrypoints/http_server.py`
+3. **Classification Service**: Handles the classification logic in `python/sglang/srt/entrypoints/openai/serving_classify.py`
+
+## Testing
+
+Use the provided test script to verify the implementation:
+
+```bash
+python test_classify_api.py
+```
+
+## Compatibility
+
+This implementation is compatible with vLLM's classification API format, allowing seamless migration from vLLM to SGLang for classification tasks.
diff --git a/sglang/docs/supported_models/retrieval_ranking/embedding_models.md b/sglang/docs/supported_models/retrieval_ranking/embedding_models.md
new file mode 100644
index 0000000000000000000000000000000000000000..906466ac5e6b00e2dccd9b29e4592f7449e606c3
--- /dev/null
+++ b/sglang/docs/supported_models/retrieval_ranking/embedding_models.md
@@ -0,0 +1,126 @@
+# Embedding Models
+
+SGLang provides robust support for embedding models by integrating efficient serving mechanisms with its flexible programming interface. This integration allows for streamlined handling of embedding tasks, facilitating faster and more accurate retrieval and semantic search operations. SGLang's architecture enables better resource utilization and reduced latency in embedding model deployment.
+
+```{important}
+Embedding models are executed with `--is-embedding` flag and some may require `--trust-remote-code`
+```
+
+## Quick Start
+
+### Launch Server
+
+```shell
+python3 -m sglang.launch_server \
+  --model-path Qwen/Qwen3-Embedding-4B \
+  --is-embedding \
+  --host 0.0.0.0 \
+  --port 30000
+```
+
+### Client Request
+
+```python
+import requests
+
+url = "http://127.0.0.1:30000"
+
+payload = {
+    "model": "Qwen/Qwen3-Embedding-4B",
+    "input": "What is the capital of France?",
+    "encoding_format": "float"
+}
+
+response = requests.post(url + "/v1/embeddings", json=payload).json()
+print("Embedding:", response["data"][0]["embedding"])
+```
+
+
+
+## Multimodal Embedding Example
+
+For multimodal models like GME that support both text and images:
+
+```shell
+python3 -m sglang.launch_server \
+  --model-path Alibaba-NLP/gme-Qwen2-VL-2B-Instruct \
+  --is-embedding \
+  --chat-template gme-qwen2-vl \
+  --host 0.0.0.0 \
+  --port 30000
+```
+
+```python
+import requests
+
+url = "http://127.0.0.1:30000"
+
+text_input = "Represent this image in embedding space."
+image_path = "https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild/resolve/main/images/023.jpg"
+
+payload = {
+    "model": "gme-qwen2-vl",
+    "input": [
+        {
+            "text": text_input
+        },
+        {
+            "image": image_path
+        }
+    ],
+}
+
+response = requests.post(url + "/v1/embeddings", json=payload).json()
+
+print("Embeddings:", [x.get("embedding") for x in response.get("data", [])])
+```
+
+## Matryoshka Embedding Example
+
+[Matryoshka Embeddings](https://sbert.net/examples/sentence_transformer/training/matryoshka/README.html#matryoshka-embeddings) or [Matryoshka Representation Learning (MRL)](https://arxiv.org/abs/2205.13147) is a technique used in training embedding models. It allows user to trade off between performance and cost.
+
+### 1. Launch a Matryoshka‑capable model
+
+If the model config already includes `matryoshka_dimensions` or `is_matryoshka` then no override is needed. Otherwise, you can use `--json-model-override-args` as below:
+
+```shell
+python3 -m sglang.launch_server \
+    --model-path Qwen/Qwen3-Embedding-0.6B \
+    --is-embedding \
+    --host 0.0.0.0 \
+    --port 30000 \
+    --json-model-override-args '{"matryoshka_dimensions": [128, 256, 512, 1024, 1536]}'
+```
+
+1. Setting `"is_matryoshka": true` allows truncating to any dimension. Otherwise, the server will validate that the specified dimension in the request is one of `matryoshka_dimensions`.
+2. Omitting `dimensions` in a request returns the full vector.
+
+### 2. Make requests with different output dimensions
+
+```python
+import requests
+
+url = "http://127.0.0.1:30000"
+
+# Request a truncated (Matryoshka) embedding by specifying a supported dimension.
+payload = {
+    "model": "Qwen/Qwen3-Embedding-0.6B",
+    "input": "Explain diffusion models simply.",
+    "dimensions": 512  # change to 128 / 1024 / omit for full size
+}
+
+response = requests.post(url + "/v1/embeddings", json=payload).json()
+print("Embedding:", response["data"][0]["embedding"])
+```
+
+
+## Supported Models
+
+| Model Family                               | Example Model                          | Chat Template | Description                                                                 |
+| ------------------------------------------ | -------------------------------------- | ------------- | --------------------------------------------------------------------------- |
+| **E5 (Llama/Mistral based)**              | `intfloat/e5-mistral-7b-instruct`     | N/A           | High-quality text embeddings based on Mistral/Llama architectures          |
+| **GTE-Qwen2**                             | `Alibaba-NLP/gte-Qwen2-7B-instruct`   | N/A           | Alibaba's text embedding model with multilingual support                   |
+| **Qwen3-Embedding**                       | `Qwen/Qwen3-Embedding-4B`             | N/A           | Latest Qwen3-based text embedding model for semantic representation        |
+| **BGE**                                    | `BAAI/bge-large-en-v1.5`              | N/A           | BAAI's text embeddings (requires `attention-backend` triton/torch_native)  |
+| **GME (Multimodal)**                      | `Alibaba-NLP/gme-Qwen2-VL-2B-Instruct`| `gme-qwen2-vl`| Multimodal embedding for text and image cross-modal tasks                  |
+| **CLIP**                                   | `openai/clip-vit-large-patch14-336`   | N/A           | OpenAI's CLIP for image and text embeddings                                |
diff --git a/sglang/docs/supported_models/retrieval_ranking/rerank_models.md b/sglang/docs/supported_models/retrieval_ranking/rerank_models.md
new file mode 100644
index 0000000000000000000000000000000000000000..bb989128a8ec271ec128b4cdbaabae7e20c5e6f3
--- /dev/null
+++ b/sglang/docs/supported_models/retrieval_ranking/rerank_models.md
@@ -0,0 +1,313 @@
+# Rerank Models
+
+SGLang offers comprehensive support for rerank models by incorporating optimized serving frameworks with a flexible programming interface. This setup enables efficient processing of cross-encoder reranking tasks, improving the accuracy and relevance of search result ordering. SGLang’s design ensures high throughput and low latency during reranker model deployment, making it ideal for semantic-based result refinement in large-scale retrieval systems.
+
+```{important}
+Rerank models in SGLang fall into two categories:
+
+- **Cross-encoder rerank models**: run with `--is-embedding` (embedding runner).
+- **Decoder-only rerank models**: run **without** `--is-embedding` and use next-token logprob scoring (yes/no).
+  - Text-only (e.g. Qwen3-Reranker)
+  - Multimodal (e.g. Qwen3-VL-Reranker): also supports image/video content
+
+Some models may require `--trust-remote-code`.
+```
+
+## Supported rerank models
+
+| Model Family (Rerank)                          | Example HuggingFace Identifier       | Chat Template | Description                                                                                                                      |
+|------------------------------------------------|--------------------------------------|---------------|----------------------------------------------------------------------------------------------------------------------------------|
+| **BGE-Reranker (BgeRerankModel)**              | `BAAI/bge-reranker-v2-m3`            | N/A           | Currently only support `attention-backend` `triton` and `torch_native`. High-performance cross-encoder reranker model from BAAI. Suitable for reranking search results based on semantic relevance.   |
+| **Qwen3-Reranker (decoder-only yes/no)**       | `Qwen/Qwen3-Reranker-8B`             | `examples/chat_template/qwen3_reranker.jinja` | Decoder-only reranker using next-token logprob scoring for labels (yes/no). Launch **without** `--is-embedding`. |
+| **Qwen3-VL-Reranker (multimodal yes/no)**      | `Qwen/Qwen3-VL-Reranker-2B`          | `examples/chat_template/qwen3_vl_reranker.jinja` | Multimodal decoder-only reranker supporting text, images, and videos. Uses yes/no logprob scoring. Launch **without** `--is-embedding`. |
+
+
+## Cross-Encoder Rerank (embedding runner)
+
+### Launch Command
+
+```shell
+python3 -m sglang.launch_server \
+  --model-path BAAI/bge-reranker-v2-m3 \
+  --host 0.0.0.0 \
+  --disable-radix-cache \
+  --chunked-prefill-size -1 \
+  --attention-backend triton \
+  --is-embedding \
+  --port 30000
+```
+
+### Example Client Request
+
+```python
+import requests
+
+url = "http://127.0.0.1:30000/v1/rerank"
+
+payload = {
+    "model": "BAAI/bge-reranker-v2-m3",
+    "query": "what is panda?",
+    "documents": [
+        "hi",
+        "The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China."
+    ],
+    "top_n": 1,
+    "return_documents": True
+}
+
+response = requests.post(url, json=payload)
+response_json = response.json()
+
+for item in response_json:
+    if item.get("document"):
+        print(f"Score: {item['score']:.2f} - Document: '{item['document']}'")
+    else:
+        print(f"Score: {item['score']:.2f} - Index: {item['index']}")
+```
+
+**Request Parameters:**
+
+- `query` (required): The query text to rank documents against
+- `documents` (required): List of documents to be ranked
+- `model` (required): Model to use for reranking
+- `top_n` (optional): Maximum number of documents to return. Defaults to returning all documents. If specified value is greater than the total number of documents, all documents will be returned.
+- `return_documents` (optional): Whether to return documents in the response. Defaults to `True`.
+
+## Qwen3-Reranker (decoder-only yes/no rerank)
+
+### Launch Command
+
+```shell
+python3 -m sglang.launch_server \
+  --model-path Qwen/Qwen3-Reranker-0.6B \
+  --trust-remote-code \
+  --disable-radix-cache \
+  --host 0.0.0.0 \
+  --port 8001 \
+  --chat-template examples/chat_template/qwen3_reranker.jinja
+```
+
+```{note}
+Qwen3-Reranker uses decoder-only logprob scoring (yes/no). Do NOT launch it with `--is-embedding`.
+```
+
+### Example Client Request (supports optional instruct, top_n, and return_documents)
+
+```shell
+curl -X POST http://127.0.0.1:8001/v1/rerank \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "Qwen3-Reranker-0.6B",
+    "query": "法国首都是哪里？",
+    "documents": [
+      "法国的首都是巴黎。",
+      "德国的首都是柏林。",
+      "香蕉是黄色的水果。"
+    ],
+    "instruct": "Given a web search query, retrieve relevant passages that answer the query.",
+    "top_n": 2,
+    "return_documents": true
+  }'
+```
+
+**Request Parameters:**
+
+- `query` (required): The query text to rank documents against
+- `documents` (required): List of documents to be ranked
+- `model` (required): Model to use for reranking
+- `instruct` (optional): Instruction text for the reranker
+- `top_n` (optional): Maximum number of documents to return. Defaults to returning all documents. If specified value is greater than the total number of documents, all documents will be returned.
+- `return_documents` (optional): Whether to return documents in the response. Defaults to `True`.
+
+### Response Format
+
+`/v1/rerank` returns a list of objects (sorted by descending score):
+
+- `score`: float, higher means more relevant
+- `document`: the original document string (only included when `return_documents` is `true`)
+- `index`: the original index in the input `documents`
+- `meta_info`: optional debug/usage info (may be present for some models)
+
+The number of returned results is controlled by the `top_n` parameter. If `top_n` is not specified or is greater than the total number of documents, all documents are returned.
+
+Example (with `return_documents: true`):
+
+```json
+[
+  {"score": 0.99, "document": "法国的首都是巴黎。", "index": 0},
+  {"score": 0.01, "document": "德国的首都是柏林。", "index": 1},
+  {"score": 0.00, "document": "香蕉是黄色的水果。", "index": 2}
+]
+```
+
+Example (with `return_documents: false`):
+
+```json
+[
+  {"score": 0.99, "index": 0},
+  {"score": 0.01, "index": 1},
+  {"score": 0.00, "index": 2}
+]
+```
+
+Example (with `top_n: 2`):
+
+```json
+[
+  {"score": 0.99, "document": "法国的首都是巴黎。", "index": 0},
+  {"score": 0.01, "document": "德国的首都是柏林。", "index": 1}
+]
+```
+
+### Common Pitfalls
+
+- If you launch Qwen3-Reranker with `--is-embedding`, `/v1/rerank` cannot compute yes/no logprob scores. Relaunch **without** `--is-embedding`.
+- If you see a validation error like "score should be a valid number" and the backend returned a list, upgrade to a version that coerces `embedding[0]` into `score` for rerank responses.
+
+## Qwen3-VL-Reranker (multimodal decoder-only rerank)
+
+Qwen3-VL-Reranker extends the Qwen3-Reranker to support multimodal content, allowing reranking of documents containing text, images, and videos.
+
+### Launch Command
+
+```shell
+python3 -m sglang.launch_server \
+  --model-path Qwen/Qwen3-VL-Reranker-2B \
+  --trust-remote-code \
+  --disable-radix-cache \
+  --host 0.0.0.0 \
+  --port 30000 \
+  --chat-template examples/chat_template/qwen3_vl_reranker.jinja
+```
+
+```{note}
+Qwen3-VL-Reranker uses decoder-only logprob scoring (yes/no) like Qwen3-Reranker. Do NOT launch it with `--is-embedding`.
+```
+
+### Text-Only Reranking (backward compatible)
+
+```python
+import requests
+
+url = "http://127.0.0.1:30000/v1/rerank"
+
+payload = {
+    "model": "Qwen3-VL-Reranker-2B",
+    "query": "What is machine learning?",
+    "documents": [
+        "Machine learning is a branch of artificial intelligence that enables computers to learn from data.",
+        "The weather in Paris is usually mild with occasional rain.",
+        "Deep learning is a subset of machine learning using neural networks with many layers.",
+    ],
+    "instruct": "Retrieve passages that answer the question.",
+    "return_documents": True
+}
+
+response = requests.post(url, json=payload)
+results = response.json()
+
+for item in results:
+    print(f"Score: {item['score']:.4f} - {item['document'][:60]}...")
+```
+
+### Image Reranking (text query, image/mixed documents)
+
+```python
+import requests
+
+url = "http://127.0.0.1:30000/v1/rerank"
+
+payload = {
+    "query": "A woman playing with her dog on a beach at sunset.",
+    "documents": [
+        # Document 1: Text description
+        "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset.",
+        # Document 2: Image URL
+        [
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": "https://example.com/beach_dog.jpeg"
+                }
+            }
+        ],
+        # Document 3: Text + Image (mixed)
+        [
+            {"type": "text", "text": "A joyful scene at the beach:"},
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": "https://example.com/beach_dog.jpeg"
+                }
+            }
+        ]
+    ],
+    "instruct": "Retrieve images or text relevant to the user's query.",
+    "return_documents": False
+}
+
+response = requests.post(url, json=payload)
+results = response.json()
+
+for item in results:
+    print(f"Index: {item['index']}, Score: {item['score']:.4f}")
+```
+
+### Multimodal Query Reranking (query with image)
+
+```python
+import requests
+
+url = "http://127.0.0.1:30000/v1/rerank"
+
+payload = {
+    # Query with text and image
+    "query": [
+        {"type": "text", "text": "Find similar images to this:"},
+        {
+            "type": "image_url",
+            "image_url": {
+                "url": "https://example.com/reference_image.jpeg"
+            }
+        }
+    ],
+    "documents": [
+        "A cat sleeping on a couch.",
+        "A woman and her dog enjoying the sunset at the beach.",
+        "A busy city street with cars and pedestrians.",
+        [
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": "https://example.com/similar_image.jpeg"
+                }
+            }
+        ]
+    ],
+    "instruct": "Find images or descriptions similar to the query image."
+}
+
+response = requests.post(url, json=payload)
+results = response.json()
+
+for item in results:
+    print(f"Index: {item['index']}, Score: {item['score']:.4f}")
+```
+
+### Request Parameters (Multimodal)
+
+- `query` (required): Can be a string (text-only) or a list of content parts:
+  - `{"type": "text", "text": "..."}` for text
+  - `{"type": "image_url", "image_url": {"url": "..."}}` for images
+  - `{"type": "video_url", "video_url": {"url": "..."}}` for videos
+- `documents` (required): List where each document can be a string or list of content parts (same format as query)
+- `instruct` (optional): Instruction text for the reranker
+- `top_n` (optional): Maximum number of documents to return
+- `return_documents` (optional): Whether to return documents in the response (default: `false`)
+
+### Common Pitfalls
+
+- Always use `--chat-template examples/chat_template/qwen3_vl_reranker.jinja` for Qwen3-VL-Reranker.
+- Do NOT launch with `--is-embedding`.
+- For best results, use `--disable-radix-cache` to avoid caching issues with multimodal content.
+- **Note**: Currently only `Qwen3-VL-Reranker-2B` is tested and supported. The 8B model may have different behavior and is not guaranteed to work with this template.
diff --git a/sglang/docs/supported_models/specialized/index.rst b/sglang/docs/supported_models/specialized/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..40d108acb3df4db6e971b0e6a5b9cda9405edabe
--- /dev/null
+++ b/sglang/docs/supported_models/specialized/index.rst
@@ -0,0 +1,9 @@
+Specialized Models
+==================
+
+Models for specialized tasks like reward modeling.
+
+.. toctree::
+   :maxdepth: 1
+
+   reward_models.md
diff --git a/sglang/docs/supported_models/specialized/reward_models.md b/sglang/docs/supported_models/specialized/reward_models.md
new file mode 100644
index 0000000000000000000000000000000000000000..ef4474637fad4ad11c9b1b8f32af02e76943149f
--- /dev/null
+++ b/sglang/docs/supported_models/specialized/reward_models.md
@@ -0,0 +1,28 @@
+# Reward Models
+
+These models output a scalar reward score or classification result, often used in reinforcement learning or content moderation tasks.
+
+```{important}
+They are executed with `--is-embedding` and some may require `--trust-remote-code`.
+```
+
+## Example launch Command
+
+```shell
+python3 -m sglang.launch_server \
+  --model-path Qwen/Qwen2.5-Math-RM-72B \  # example HF/local path
+  --is-embedding \
+  --host 0.0.0.0 \
+  --tp-size=4 \                          # set for tensor parallelism
+  --port 30000 \
+```
+
+## Supported models
+
+| Model Family (Reward)                                                     | Example HuggingFace Identifier                              | Description                                                                     |
+|---------------------------------------------------------------------------|-----------------------------------------------------|---------------------------------------------------------------------------------|
+| **Llama (3.1 Reward / `LlamaForSequenceClassification`)**                   | `Skywork/Skywork-Reward-Llama-3.1-8B-v0.2`            | Reward model (preference classifier) based on Llama 3.1 (8B) for scoring and ranking responses for RLHF.  |
+| **Gemma 2 (27B Reward / `Gemma2ForSequenceClassification`)**                | `Skywork/Skywork-Reward-Gemma-2-27B-v0.2`             | Derived from Gemma‑2 (27B), this model provides human preference scoring for RLHF and multilingual tasks.  |
+| **InternLM 2 (Reward / `InternLM2ForRewardMode`)**                         | `internlm/internlm2-7b-reward`                       | InternLM 2 (7B)–based reward model used in alignment pipelines to guide outputs toward preferred behavior.  |
+| **Qwen2.5 (Reward - Math / `Qwen2ForRewardModel`)**                         | `Qwen/Qwen2.5-Math-RM-72B`                           | A 72B math-specialized RLHF reward model from the Qwen2.5 series, tuned for evaluating and refining responses.  |
+| **Qwen2.5 (Reward - Sequence / `Qwen2ForSequenceClassification`)**          | `jason9693/Qwen2.5-1.5B-apeach`                      | A smaller Qwen2.5 variant used for sequence classification, offering an alternative RLHF scoring mechanism.  |
diff --git a/sglang/docs/supported_models/text_generation/diffusion_language_models.md b/sglang/docs/supported_models/text_generation/diffusion_language_models.md
new file mode 100644
index 0000000000000000000000000000000000000000..7dbb4828b6956ee079349086d87c19bea991423b
--- /dev/null
+++ b/sglang/docs/supported_models/text_generation/diffusion_language_models.md
@@ -0,0 +1,111 @@
+# Diffusion Language Models
+
+Diffusion language models have shown promise for non-autoregressive text generation with parallel decoding capabilities. Unlike auto-regressive language models, different diffusion language models require different decoding strategies.
+
+## Example Launch Command
+
+SGLang supports different DLLM algorithms such as `LowConfidence` and `JointThreshold`.
+
+```shell
+python3 -m sglang.launch_server \
+  --model-path inclusionAI/LLaDA2.0-mini \ # example HF/local path
+  --dllm-algorithm LowConfidence \
+  --dllm-algorithm-config ./config.yaml \ # Optional. Uses the algorithm's default if not set.
+  --host 0.0.0.0 \
+  --port 30000
+```
+
+## Example Configuration File
+
+Depending on the algorithm selected, the configuration parameters vary.
+
+LowConfidence Config:
+
+```yaml
+# Confidence threshold for accepting predicted tokens
+# - Higher values: More conservative, better quality but slower
+# - Lower values: More aggressive, faster but potentially lower quality
+# Range: 0.0 - 1.0
+threshold: 0.95
+
+# Default: 32, for LLaDA2MoeModelLM
+block_size: 32
+```
+
+JointThreshold Config:
+
+```yaml
+# Decoding threshold for Mask-to-Token (M2T) phase
+# - Higher values: More conservative, better quality but slower
+# - Lower values: More aggressive, faster but potentially lower quality
+# Range: 0.0 - 1.0
+threshold: 0.5
+# Decoding threshold for Token-to-Token (T2T) phase
+# Range: 0.0 - 1.0
+# Setting to 0.0 allows full editing (recommended for most cases).
+edit_threshold: 0.0
+# Max extra T2T steps after all masks are removed. Prevents infinite loops.
+max_post_edit_steps: 16
+# 2-gram repetition penalty (default 0).
+# An empirical value of 3 is often sufficient to mitigate most repetitions.
+penalty_lambda: 0
+```
+
+## Example Client Code Snippet
+
+Just like other supported models, diffusion language models can be used via the REST API or Python client.
+
+Python client example for making a generation request to the launched server:
+
+```python
+import sglang as sgl
+
+def main():
+    llm = sgl.Engine(model_path="inclusionAI/LLaDA2.0-mini",
+                     dllm_algorithm="LowConfidence",
+                     max_running_requests=1,
+                     trust_remote_code=True)
+
+    prompts = [
+        "<role>SYSTEM</role>detailed thinking off<|role_end|><role>HUMAN</role> Write a brief introduction of the great wall <|role_end|><role>ASSISTANT</role>"
+    ]
+
+    sampling_params = {
+        "temperature": 0,
+        "max_new_tokens": 1024,
+    }
+
+    outputs = llm.generate(prompts, sampling_params)
+    print(outputs)
+
+if __name__ == '__main__':
+    main()
+```
+
+Curl example for making a generation request to the launched server:
+
+```bash
+curl -X POST "http://127.0.0.1:30000/generate" \
+     -H "Content-Type: application/json" \
+     -d '{
+        "text": [
+            "<role>SYSTEM</role>detailed thinking off<|role_end|><role>HUMAN</role> Write the number from 1 to 128 <|role_end|><role>ASSISTANT</role>",
+            "<role>SYSTEM</role>detailed thinking off<|role_end|><role>HUMAN</role> Write a brief introduction of the great wall <|role_end|><role>ASSISTANT</role>"
+        ],
+        "stream": true,
+        "sampling_params": {
+            "temperature": 0,
+            "max_new_tokens": 1024
+        }
+    }'
+```
+
+## Supported Models
+
+Below the supported models are summarized in a table.
+
+| Model Family               | Example Model                | Description                                                                                          |
+| -------------------------- | ---------------------------- | ---------------------------------------------------------------------------------------------------- |
+| **LLaDA2.0 (mini, flash)** | `inclusionAI/LLaDA2.0-flash` | LLaDA2.0-flash is a diffusion language model featuring a 100B Mixture-of-Experts (MoE) architecture. |
+| **SDAR (JetLM)**           | `JetLM/SDAR-8B-Chat`         | SDAR series diffusion language model (Chat), dense architecture.                                 |
+| **SDAR (JetLM)**           | `JetLM/SDAR-30B-A3B-Chat`    | SDAR series diffusion language model (Chat), MoE architecture.                                   |
diff --git a/sglang/docs/supported_models/text_generation/generative_models.md b/sglang/docs/supported_models/text_generation/generative_models.md
new file mode 100644
index 0000000000000000000000000000000000000000..f73aa200faed4e904f646faf0d43949eadb9167c
--- /dev/null
+++ b/sglang/docs/supported_models/text_generation/generative_models.md
@@ -0,0 +1,72 @@
+# Large Language Models
+
+These models accept text input and produce text output (e.g., chat completions). They are primarily large language models (LLMs), some with mixture-of-experts (MoE) architectures for scaling.
+
+## Example launch Command
+
+```shell
+python3 -m sglang.launch_server \
+  --model-path meta-llama/Llama-3.2-1B-Instruct \  # example HF/local path
+  --host 0.0.0.0 \
+  --port 30000 \
+```
+
+## Supported models
+
+Below the supported models are summarized in a table.
+
+If you are unsure if a specific architecture is implemented, you can search for it via GitHub. For example, to search for `Qwen3ForCausalLM`, use the expression:
+
+```
+repo:sgl-project/sglang path:/^python\/sglang\/srt\/models\// Qwen3ForCausalLM
+```
+
+in the GitHub search bar.
+
+| Model Family (Variants)             | Example HuggingFace Identifier                     | Description                                                                            |
+|-------------------------------------|--------------------------------------------------|----------------------------------------------------------------------------------------|
+| **DeepSeek** (v1, v2, v3/R1)        | `deepseek-ai/DeepSeek-R1`                        | Series of advanced reasoning-optimized models (including a 671B MoE) trained with reinforcement learning; top performance on complex reasoning, math, and code tasks. [SGLang provides Deepseek v3/R1 model-specific optimizations](../basic_usage/deepseek.md) and [Reasoning Parser](../advanced_features/separate_reasoning.ipynb)|
+| **Kimi K2** (Thinking, Instruct)    | `moonshotai/Kimi-K2-Instruct`                    | Moonshot AI's 1 trillion parameter MoE model (32B active) with 128K–256K context; state-of-the-art agentic intelligence with stable long-horizon agency across 200–300 sequential tool calls. Features MLA attention and native INT4 quantization. [See Reasoning Parser docs](../advanced_features/separate_reasoning.ipynb)|
+| **Kimi Linear** (48B-A3B)           | `moonshotai/Kimi-Linear-48B-A3B-Instruct`        | Moonshot AI's hybrid linear attention model (48B total, 3B active) with 1M token context; features Kimi Delta Attention (KDA) for up to 6× faster decoding and 75% KV cache reduction vs full attention. |
+| **GPT-OSS**       | `openai/gpt-oss-20b`, `openai/gpt-oss-120b`       | OpenAI’s latest GPT-OSS series for complex reasoning, agentic tasks, and versatile developer use cases.|
+| **Qwen** (3.5, 3, 3MoE, 3Next, 2.5, 2 series)       | `Qwen/Qwen3.5-397B-A17B`, `Qwen/Qwen3-0.6B`, `Qwen/Qwen3-30B-A3B`      | Alibaba’s latest Qwen3 series for complex reasoning, language understanding, and generation tasks; Support for MoE variants along with previous generation 2.5, 2, etc. [SGLang provides Qwen3 specific reasoning parser](../advanced_features/separate_reasoning.ipynb)|
+| **Llama** (2, 3.x, 4 series)        | `meta-llama/Llama-4-Scout-17B-16E-Instruct`       | Meta's open LLM series, spanning 7B to 400B parameters (Llama 2, 3, and new Llama 4) with well-recognized performance. [SGLang provides Llama-4 model-specific optimizations](../basic_usage/llama4.md)  |
+| **Mistral** (Mixtral, NeMo, Small3) | `mistralai/Mistral-7B-Instruct-v0.2`             | Open 7B LLM by Mistral AI with strong performance; extended into MoE (“Mixtral”) and NeMo Megatron variants for larger scale. |
+| **Gemma** (v1, v2, v3)              | `google/gemma-3-1b-it`                            | Google’s family of efficient multilingual models (1B–27B); Gemma 3 offers a 128K context window, and its larger (4B+) variants support vision input. |
+| **Phi** (Phi-1.5, Phi-2, Phi-3, Phi-4, Phi-MoE series) | `microsoft/Phi-4-multimodal-instruct`, `microsoft/Phi-3.5-MoE-instruct` | Microsoft’s Phi family of small models (1.3B–5.6B); Phi-4-multimodal (5.6B) processes text, images, and speech, Phi-4-mini is a high-accuracy text model and Phi-3.5-MoE is a mixture-of-experts model. |
+| **MiniCPM** (v3, 4B)               | `openbmb/MiniCPM3-4B`                            | OpenBMB’s series of compact LLMs for edge devices; MiniCPM 3 (4B) achieves GPT-3.5-level results in text tasks. |
+| **OLMo** (2, 3) | `allenai/OLMo-3-1125-32B`, `allenai/OLMo-3-32B-Think`, `allenai/OLMo-2-1124-7B-Instruct` | Allen AI’s series of Open Language Models designed to enable the science of language models. |
+| **OLMoE** (Open MoE)               | `allenai/OLMoE-1B-7B-0924`                       | Allen AI’s open Mixture-of-Experts model (7B total, 1B active parameters) delivering state-of-the-art results with sparse expert activation. |
+| **MiniMax-M2** (M2, M2.1, M2.5)               | `MiniMaxAI/MiniMax-M2.5`, `MiniMaxAI/MiniMax-M2.1`, `MiniMaxAI/MiniMax-M2` | MiniMax's SOTA LLM for coding & agentic workflows. |
+| **StableLM** (3B, 7B)               | `stabilityai/stablelm-tuned-alpha-7b`            | StabilityAI’s early open-source LLM (3B & 7B) for general text generation; a demonstration model with basic instruction-following ability. |
+| **Command-(R,A)** (Cohere)              | `CohereLabs/c4ai-command-r-v01`, `CohereLabs/c4ai-command-r7b-12-2024`, `CohereLabs/c4ai-command-a-03-2025`                 | Cohere’s open conversational LLM (Command series) optimized for long context, retrieval-augmented generation, and tool use. |
+| **DBRX** (Databricks)              | `databricks/dbrx-instruct`                       | Databricks’ 132B-parameter MoE model (36B active) trained on 12T tokens; competes with GPT-3.5 quality as a fully open foundation model. |
+| **Grok** (xAI)                     | `xai-org/grok-1`                                | xAI’s grok-1 model known for vast size(314B parameters) and high quality; integrated in SGLang for high-performance inference. |
+| **ChatGLM** (GLM-130B family)       | `THUDM/chatglm2-6b`                              | Zhipu AI’s bilingual chat model (6B) excelling at Chinese-English dialogue; fine-tuned for conversational quality and alignment. |
+| **InternLM 2** (7B, 20B)           | `internlm/internlm2-7b`                          | Next-gen InternLM (7B and 20B) from SenseTime, offering strong reasoning and ultra-long context support (up to 200K tokens). |
+| **ExaONE 3** (Korean-English)      | `LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct`           | LG AI Research’s Korean-English model (7.8B) trained on 8T tokens; provides high-quality bilingual understanding and generation. |
+| **Baichuan 2** (7B, 13B)           | `baichuan-inc/Baichuan2-13B-Chat`                | BaichuanAI’s second-generation Chinese-English LLM (7B/13B) with improved performance and an open commercial license. |
+| **XVERSE** (MoE)                   | `xverse/XVERSE-MoE-A36B`                         | Yuanxiang’s open MoE LLM (XVERSE-MoE-A36B: 255B total, 36B active) supporting ~40 languages; delivers 100B+ dense-level performance via expert routing. |
+| **SmolLM** (135M–1.7B)            | `HuggingFaceTB/SmolLM-1.7B`                      | Hugging Face’s ultra-small LLM series (135M–1.7B params) offering surprisingly strong results, enabling advanced AI on mobile/edge devices. |
+| **GLM-4** (Multilingual 9B)        | `ZhipuAI/glm-4-9b-chat`                          | Zhipu’s GLM-4 series (up to 9B parameters) – open multilingual models with support for 1M-token context and even a 5.6B multimodal variant (Phi-4V). |
+| **MiMo** (7B series)               | `XiaomiMiMo/MiMo-7B-RL`                         | Xiaomi's reasoning-optimized model series, leverages Multiple-Token Prediction for faster inference. |
+| **ERNIE-4.5** (4.5, 4.5MoE series) | `baidu/ERNIE-4.5-21B-A3B-PT`                    | Baidu's ERNIE-4.5 series which consists of MoE with 47B and 3B active parameters, with the largest model having 424B total parameters, as well as a 0.3B dense model. |
+| **Arcee AFM-4.5B**               | `arcee-ai/AFM-4.5B-Base`                         | Arcee's foundational model series for real world reliability and edge deployments. |
+| **Persimmon** (8B)               | `adept/persimmon-8b-chat`                         | Adept’s open 8B model with a 16K context window and fast inference; trained for broad usability and licensed under Apache 2.0. |
+| **Solar** (10.7B)               | `upstage/SOLAR-10.7B-Instruct-v1.0`                         | Upstage's 10.7B parameter model, optimized for instruction-following tasks. This architecture incorporates a depth-up scaling methodology, enhancing model performance. |
+| **Tele FLM** (52B-1T)               | `CofeAI/Tele-FLM`                         | BAAI & TeleAI's multilingual model, available in 52-billion and 1-trillion parameter variants. It is a decoder-only transformer trained on ~2T tokens |
+| **Ling** (16.8B–290B) | `inclusionAI/Ling-lite`, `inclusionAI/Ling-plus` | InclusionAI’s open MoE models. Ling-Lite has 16.8B total / 2.75B active parameters, and Ling-Plus has 290B total / 28.8B active parameters. They are designed for high performance on NLP and complex reasoning tasks. |
+| **Granite 3.0, 3.1** (IBM)               | `ibm-granite/granite-3.1-8b-instruct`                          | IBM's open dense foundation models optimized for reasoning, code, and business AI use cases. Integrated with Red Hat and watsonx systems. |
+| **Granite 3.0 MoE** (IBM)               | `ibm-granite/granite-3.0-3b-a800m-instruct`                          | IBM’s Mixture-of-Experts models offering strong performance with cost-efficiency. MoE expert routing designed for enterprise deployment at scale. |
+| **GPT-J** (6B)                    | `EleutherAI/gpt-j-6b`                             | EleutherAI's GPT-2-like causal language model (6B) trained on the [Pile](https://pile.eleuther.ai/) dataset. |
+| **Orion** (14B)               | `OrionStarAI/Orion-14B-Base`                         | A series of open-source multilingual large language models by OrionStarAI, pretrained on a 2.5T token multilingual corpus including Chinese, English, Japanese, Korean, etc, and it exhibits superior performance in these languages. |
+| **Llama Nemotron Super** (v1, v1.5, NVIDIA) | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, `nvidia/Llama-3_3-Nemotron-Super-49B-v1_5` | The [NVIDIA Nemotron](https://www.nvidia.com/en-us/ai-data-science/foundation-models/nemotron/) family of multimodal models provides state-of-the-art reasoning models specifically designed for enterprise-ready AI agents. |
+| **Llama Nemotron Ultra** (v1, NVIDIA) | `nvidia/Llama-3_1-Nemotron-Ultra-253B-v1` | The [NVIDIA Nemotron](https://www.nvidia.com/en-us/ai-data-science/foundation-models/nemotron/) family of multimodal models provides state-of-the-art reasoning models specifically designed for enterprise-ready AI agents. |
+| **NVIDIA Nemotron Nano 2.0** | `nvidia/NVIDIA-Nemotron-Nano-9B-v2` | The [NVIDIA Nemotron](https://www.nvidia.com/en-us/ai-data-science/foundation-models/nemotron/) family of multimodal models provides state-of-the-art reasoning models specifically designed for enterprise-ready AI agents. `Nemotron-Nano-9B-v2` is a hybrid Mamba-Transformer language model designed to increase throughput for reasoning workloads while achieving state-of-the-art accuracy compared to similarly-sized models. |
+| **StarCoder2** (3B-15B) | `bigcode/starcoder2-7b` | StarCoder2 is a family of open large language models (LLMs) specialized for code generation and understanding. It is the successor to StarCoder, jointly developed by the BigCode project (a collaboration between Hugging Face, ServiceNow Research, and other contributors). |
+| **Jet-Nemotron** | `jet-ai/Jet-Nemotron-2B` | Jet-Nemotron is a new family of hybrid-architecture language models that surpass state-of-the-art open-source full-attention language models, while achieving significant efficiency gains. |
+| **Trinity** (Nano, Mini) | `arcee-ai/Trinity-Mini` | Arcee's foundational MoE Trinity family of models, open weights under Apache 2.0. |
+| **Falcon-H1** (0.5B–34B) | `tiiuae/Falcon-H1-34B-Instruct` | TII's hybrid Mamba-Transformer architecture combining attention and state-space models for efficient long-context inference. |
+| **Hunyuan-Large** (389B, MoE) | `tencent/Tencent-Hunyuan-Large` | Tencent's open-source MoE model with 389B total / 52B active parameters, featuring Cross-Layer Attention (CLA) for improved efficiency. |
+| **IBM Granite 4.0 (Hybrid, Dense)** | `ibm-granite/granite-4.0-h-micro`, `ibm-granite/granite-4.0-micro` | IBM Granite 4.0 micro models: hybrid Mamba–MoE (`h-micro`) and dense (`micro`) variants. Enterprise-focused reasoning models |
+| **Sarvam 2** (30B-A2B, 105B-A10B) | `sarvamai/sarvam-2` | Sarvam's Mixture-of-Experts models. The 105B variant uses MLA (Multi-head Latent Attention) and the 30B variant uses GQA, both with 128 routed experts. |
diff --git a/sglang/docs/supported_models/text_generation/index.rst b/sglang/docs/supported_models/text_generation/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e315f83d1a057dc385de8006427d962f80bc81f6
--- /dev/null
+++ b/sglang/docs/supported_models/text_generation/index.rst
@@ -0,0 +1,11 @@
+Text Generation
+===============
+
+Models for generating text from text or multimodal inputs.
+
+.. toctree::
+   :maxdepth: 1
+
+   generative_models.md
+   multimodal_language_models.md
+   diffusion_language_models.md
diff --git a/sglang/docs/supported_models/text_generation/multimodal_language_models.md b/sglang/docs/supported_models/text_generation/multimodal_language_models.md
new file mode 100644
index 0000000000000000000000000000000000000000..0dab3a28af2538cb00c953d78caa7ac6445f36b6
--- /dev/null
+++ b/sglang/docs/supported_models/text_generation/multimodal_language_models.md
@@ -0,0 +1,136 @@
+# Multimodal Language Models
+
+These models accept multi-modal inputs (e.g., images and text) and generate text output. They augment language models with multimodal encoders.
+
+## Example launch Command
+
+```shell
+python3 -m sglang.launch_server \
+  --model-path meta-llama/Llama-3.2-11B-Vision-Instruct \  # example HF/local path
+  --host 0.0.0.0 \
+  --port 30000 \
+```
+
+> See the [OpenAI APIs section](https://docs.sglang.io/basic_usage/openai_api_vision.html) for how to send multimodal requests.
+
+## Supported models
+
+Below the supported models are summarized in a table.
+
+If you are unsure if a specific architecture is implemented, you can search for it via GitHub. For example, to search for `Qwen2_5_VLForConditionalGeneration`, use the expression:
+
+```
+repo:sgl-project/sglang path:/^python\/sglang\/srt\/models\// Qwen2_5_VLForConditionalGeneration
+```
+
+in the GitHub search bar.
+
+
+| Model Family (Variants)    | Example HuggingFace Identifier             | Description                                                                                                                                                                                                     | Notes |
+|----------------------------|--------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------|
+| **Qwen-VL** | `Qwen/Qwen3-VL-235B-A22B-Instruct`              | Alibaba's vision-language extension of Qwen; for example, Qwen2.5-VL (7B and larger variants) can analyze and converse about image content.                                                                     |  |
+| **DeepSeek-VL2**           | `deepseek-ai/deepseek-vl2`                 | Vision-language variant of DeepSeek (with a dedicated image processor), enabling advanced multimodal reasoning on image and text inputs.                                                                        |  |
+| **DeepSeek-OCR / OCR-2**   | `deepseek-ai/DeepSeek-OCR-2`               | OCR-focused DeepSeek models for document understanding and text extraction.                                                                                                                                    | Use `--trust-remote-code`. |
+| **Janus-Pro** (1B, 7B)     | `deepseek-ai/Janus-Pro-7B`                 | DeepSeek's open-source multimodal model capable of both image understanding and generation. Janus-Pro employs a decoupled architecture for separate visual encoding paths, enhancing performance in both tasks. |  |
+| **MiniCPM-V / MiniCPM-o**  | `openbmb/MiniCPM-V-2_6`                    | MiniCPM-V (2.6, ~8B) supports image inputs, and MiniCPM-o adds audio/video; these multimodal LLMs are optimized for end-side deployment on mobile/edge devices.                                                 |  |
+| **Llama 3.2 Vision** (11B) | `meta-llama/Llama-3.2-11B-Vision-Instruct` | Vision-enabled variant of Llama 3 (11B) that accepts image inputs for visual question answering and other multimodal tasks.                                                                                     |  |
+| **LLaVA** (v1.5 & v1.6)    | *e.g.* `liuhaotian/llava-v1.5-13b`         | Open vision-chat models that add an image encoder to LLaMA/Vicuna (e.g. LLaMA2 13B) for following multimodal instruction prompts.                                                                               |  |
+| **LLaVA-NeXT** (8B, 72B)   | `lmms-lab/llava-next-72b`                  | Improved LLaVA models (with an 8B Llama3 version and a 72B version) offering enhanced visual instruction-following and accuracy on multimodal benchmarks.                                                       |  |
+| **LLaVA-OneVision**        | `lmms-lab/llava-onevision-qwen2-7b-ov`     | Enhanced LLaVA variant integrating Qwen as the backbone; supports multiple images (and even video frames) as inputs via an OpenAI Vision API-compatible format.                                                 |  |
+| **Gemma 3 (Multimodal)**   | `google/gemma-3-4b-it`                     | Gemma 3's larger models (4B, 12B, 27B) accept images (each image encoded as 256 tokens) alongside text in a combined 128K-token context.                                                                        |  |
+| **Kimi-VL** (A3B)          | `moonshotai/Kimi-VL-A3B-Instruct`          | Kimi-VL is a multimodal model that can understand and generate text from images.                                                                                                                                |  |
+| **Mistral-Small-3.1-24B**  | `mistralai/Mistral-Small-3.1-24B-Instruct-2503` | Mistral 3.1 is a multimodal model that can generate text from text or images input. It also supports tool calling and structured output. |  |
+| **Phi-4-multimodal-instruct**  | `microsoft/Phi-4-multimodal-instruct` | Phi-4-multimodal-instruct is the multimodal variant of the Phi-4-mini model, enhanced with LoRA for improved multimodal capabilities. It supports text, vision and audio modalities in SGLang. |  |
+| **MiMo-VL** (7B)           | `XiaomiMiMo/MiMo-VL-7B-RL`                 | Xiaomi's compact yet powerful vision-language model featuring a native resolution ViT encoder for fine-grained visual details, an MLP projector for cross-modal alignment, and the MiMo-7B language model optimized for complex reasoning tasks. |  |
+| **GLM-4.5V** (106B) /  **GLM-4.1V**(9B)           | `zai-org/GLM-4.5V`                   | GLM-4.5V and GLM-4.1V-Thinking: Towards Versatile Multimodal Reasoning with Scalable Reinforcement Learning                                                                                                                                                                                                      | Use `--chat-template glm-4v` |
+| **GLM-OCR**          | `zai-org/GLM-OCR`                   | GLM-OCR: A fast and accurate general OCR model                                                                   |  |
+| **DotsVLM** (General/OCR)  | `rednote-hilab/dots.vlm1.inst`             | RedNote's vision-language model built on a 1.2B vision encoder and DeepSeek V3 LLM, featuring NaViT vision encoder trained from scratch with dynamic resolution support and enhanced OCR capabilities through structured image data training. |  |
+| **DotsVLM-OCR**            | `rednote-hilab/dots.ocr`                   | Specialized OCR variant of DotsVLM optimized for optical character recognition tasks with enhanced text extraction and document understanding capabilities. | Don't use `--trust-remote-code` |
+| **NVILA** (8B, 15B, Lite-2B, Lite-8B, Lite-15B) | `Efficient-Large-Model/NVILA-8B` | `chatml` | NVILA explores the full stack efficiency of multi-modal design, achieving cheaper training, faster deployment and better performance. |
+| **NVIDIA Nemotron Nano 2.0 VL** | `nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16` | NVIDIA Nemotron Nano v2 VL enables multi-image reasoning and video understanding, along with strong document intelligence, visual Q&A and summarization capabilities. It builds on Nemotron Nano V2, a hybrid Mamba-Transformer LLM, in order to achieve higher inference throughput in long document and video scenarios. | Use `--trust-remote-code`. You may need to adjust `--max-mamba-cache-size` [default is 512] to fit memory constraints. |
+| **Ernie4.5-VL** | `baidu/ERNIE-4.5-VL-28B-A3B-PT`              | Baidu's vision-language models(28B,424B). Support image and video comprehension, and also support thinking.                                                                     |  |
+| **JetVLM** |  | JetVLM is an vision-language model designed for high-performance multimodal understanding and generation tasks built upon Jet-Nemotron. | Coming soon |
+| **Step3-VL** (10B) | `stepfun-ai/Step3-VL-10B` | StepFun's lightweight open-source 10B parameter VLM for multimodal intelligence, excelling in visual perception, complex reasoning, and human alignment. |  |
+| **Qwen3-Omni** | `Qwen/Qwen3-Omni-30B-A3B-Instruct` |  Alibaba's omni-modal MoE model. Currently supports the **Thinker** component (multimodal understanding for text, images, audio, and video), while the **Talker** component (audio generation) is not yet supported. |  |
+
+## Video Input Support
+
+SGLang supports video input for Vision-Language Models (VLMs), enabling temporal reasoning tasks such as video question answering, captioning, and holistic scene understanding. Video clips are decoded, key frames are sampled, and the resulting tensors are batched together with the text prompt, allowing multimodal inference to integrate visual and linguistic context.
+
+| Model Family | Example Identifier | Video notes |
+|--------------|--------------------|-------------|
+| **Qwen-VL** (Qwen2-VL, Qwen2.5-VL, Qwen3-VL, Qwen3-Omni) | `Qwen/Qwen3-VL-235B-A22B-Instruct` | The processor gathers `video_data`, runs Qwen's frame sampler, and merges the resulting features with text tokens before inference. |
+| **GLM-4v** (4.5V, 4.1V, MOE) | `zai-org/GLM-4.5V` | Video clips are read with Decord, converted to tensors, and passed to the model alongside metadata for rotary-position handling. |
+| **NVILA** (Full & Lite) | `Efficient-Large-Model/NVILA-8B` | The runtime samples eight frames per clip and attaches them to the multimodal request when `video_data` is present. |
+| **LLaVA video variants** (LLaVA-NeXT-Video, LLaVA-OneVision) | `lmms-lab/LLaVA-NeXT-Video-7B` | The processor routes video prompts to the LlavaVid video-enabled architecture, and the provided example shows how to query it with `sgl.video(...)` clips. |
+| **NVIDIA Nemotron Nano 2.0 VL** | `nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16` | The processor samples at 2 FPS, at a max of 128 frames, as per model training. The model uses [EVS](../../python/sglang/srt/multimodal/evs/README.md), a pruning method that removes redundant tokens from video embeddings. By default `video_pruning_rate=0.7`. Change this by providing: `--json-model-override-args '{"video_pruning_rate": 0.0}'` to disable EVS, for example. |
+| **JetVLM** |  | The runtime samples eight frames per clip and attaches them to the multimodal request when `video_data` is present. |
+
+Use `sgl.video(path, num_frames)` when building prompts to attach clips from your SGLang programs.
+
+Example OpenAI-compatible request that sends a video clip:
+
+```python
+import requests
+
+url = "http://localhost:30000/v1/chat/completions"
+
+data = {
+    "model": "Qwen/Qwen3-VL-30B-A3B-Instruct",
+    "messages": [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "What’s happening in this video?"},
+                {
+                    "type": "video_url",
+                    "video_url": {
+                        "url": "https://github.com/sgl-project/sgl-test-files/raw/refs/heads/main/videos/jobs_presenting_ipod.mp4"
+                    },
+                },
+            ],
+        }
+    ],
+    "max_tokens": 300,
+}
+
+response = requests.post(url, json=data)
+print(response.text)
+```
+
+## Usage Notes
+
+### Performance Optimization
+
+For multimodal models, you can use the `--keep-mm-feature-on-device` flag to optimize for latency at the cost of increased GPU memory usage:
+
+- **Default behavior**: Multimodal feature tensors are moved to CPU after processing to save GPU memory
+- **With `--keep-mm-feature-on-device`**: Feature tensors remain on GPU, reducing device-to-host copy overhead and improving latency, but consuming more GPU memory
+
+Use this flag when you have sufficient GPU memory and want to minimize latency for multimodal inference.
+
+### Multimodal Inputs Limitation
+
+- **Use `--mm-process-config '{"image":{"max_pixels":1048576},"video":{"fps":3,"max_pixels":602112,"max_frames":60}}'`**: To set `image`, `video`, and `audio` input limits.
+
+This can reduce GPU memory usage, improve inference speed, and help to avoid OOM, but may impact model performance, thus set a proper value based on your specific use case. Currently, only `qwen_vl` supports this config. Please refer to [qwen_vl processor](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/multimodal/processors/qwen_vl.py) for understanding the meaning of each parameter.
+
+### Bidirectional Attention in Multimodal Model Serving
+**Note for serving the Gemma-3 multimodal model**:
+
+As mentioned in [Welcome Gemma 3: Google's all new multimodal, multilingual, long context open LLM
+](https://huggingface.co/blog/gemma3#multimodality), Gemma-3 employs bidirectional attention between image tokens during the prefill phase. Currently, SGLang only supports bidirectional attention when using the Triton Attention Backend. Note, however, that SGLang's current bidirectional attention implementation is incompatible with both CUDA Graph and Chunked Prefill.
+
+To enable bidirectional attention, you can use the `TritonAttnBackend` while disabling CUDA Graph and Chunked Prefill. Example launch command:
+```shell
+python -m sglang.launch_server \
+  --model-path google/gemma-3-4b-it \
+  --host 0.0.0.0 --port 30000 \
+  --enable-multimodal \
+  --dtype bfloat16 --triton-attention-reduce-in-fp32 \
+  --attention-backend triton \ # Use Triton attention backend
+  --disable-cuda-graph \ # Disable Cuda Graph
+  --chunked-prefill-size -1 # Disable Chunked Prefill
+```
+
+If higher serving performance is required and a certain degree of accuracy loss is acceptable, you may choose to use other attention backends, and you can also enable features like CUDA Graph and Chunked Prefill for better performance, but note that the model will fall back to using causal attention instead of bidirectional attention.
diff --git a/sglang/python/sglang/srt/__pycache__/constants.cpython-311.pyc b/sglang/python/sglang/srt/__pycache__/constants.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a760597764f0597c0ec6488ed7888857a5b4c9ae
Binary files /dev/null and b/sglang/python/sglang/srt/__pycache__/constants.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/__pycache__/environ.cpython-311.pyc b/sglang/python/sglang/srt/__pycache__/environ.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d3411d2f6324e7eacc1b05a64651d688114caf76
Binary files /dev/null and b/sglang/python/sglang/srt/__pycache__/environ.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/batch_overlap/__pycache__/operations.cpython-311.pyc b/sglang/python/sglang/srt/batch_overlap/__pycache__/operations.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..579d378a41f5735cf4717264f09bd75d1bcfb3e5
Binary files /dev/null and b/sglang/python/sglang/srt/batch_overlap/__pycache__/operations.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/batch_overlap/__pycache__/operations_strategy.cpython-311.pyc b/sglang/python/sglang/srt/batch_overlap/__pycache__/operations_strategy.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..01eddb87237558e906a9f1d9098cca529f4c16b9
Binary files /dev/null and b/sglang/python/sglang/srt/batch_overlap/__pycache__/operations_strategy.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/batch_overlap/__pycache__/single_batch_overlap.cpython-311.pyc b/sglang/python/sglang/srt/batch_overlap/__pycache__/single_batch_overlap.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5d43beff6a24504cfb6af84246d810ea0b99226d
Binary files /dev/null and b/sglang/python/sglang/srt/batch_overlap/__pycache__/single_batch_overlap.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/batch_overlap/__pycache__/two_batch_overlap.cpython-311.pyc b/sglang/python/sglang/srt/batch_overlap/__pycache__/two_batch_overlap.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..efcf37045c9f266728b3aaa91ab0f34ce121eb68
Binary files /dev/null and b/sglang/python/sglang/srt/batch_overlap/__pycache__/two_batch_overlap.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/batch_overlap/operations.py b/sglang/python/sglang/srt/batch_overlap/operations.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d61ac82f500d4d22e01fe34d58c499558dcf010
--- /dev/null
+++ b/sglang/python/sglang/srt/batch_overlap/operations.py
@@ -0,0 +1,213 @@
+from __future__ import annotations
+
+import os
+from contextlib import contextmanager
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Callable, Dict, Generator, List, Sequence, Union
+
+import torch
+
+from sglang.srt.layers.dp_attention import set_dp_buffer_len
+
+if TYPE_CHECKING:
+    from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+
+_ENABLE_PROFILE = bool(int(os.environ.get("SGLANG_OPERATIONS_ENABLE_PROFILE", "0")))
+
+if _ENABLE_PROFILE:
+    import nvtx
+
+
+def execute_operations(inputs, operations):
+    stages = _convert_operations_to_stages(operations)
+    executor = _StageExecutor("primary", stages, inputs=inputs)
+    for _ in range(executor.num_stages):
+        executor.next()
+    assert executor.done
+    return executor.output
+
+
+def execute_overlapped_operations(
+    inputs_arr: Sequence,
+    operations_arr: Sequence,
+    delta_stages: Sequence[int],
+) -> Sequence:
+    # Make it explicit for clarity; if we need multi-batch overlap, this can be generalized
+    inputs_a, inputs_b = inputs_arr
+    operations_a, operations_b = operations_arr
+    delta_stage_a, delta_stage_b = delta_stages
+    assert delta_stage_a == 0
+    delta_stage = delta_stage_b
+
+    stages_a = _convert_operations_to_stages(operations_a)
+    stages_b = _convert_operations_to_stages(operations_b)
+    executor_a = _StageExecutor("a", stages_a, inputs=inputs_a)
+    executor_b = _StageExecutor("b", stages_b, inputs=inputs_b)
+
+    for _ in range(delta_stage):
+        executor_a.next()
+
+    for _ in range(executor_a.num_stages - delta_stage):
+        executor_a.next()
+        executor_b.next()
+
+    for _ in range(delta_stage):
+        executor_b.next()
+
+    assert executor_a.done and executor_b.done
+    return [executor_a.output, executor_b.output]
+
+
+class YieldOperation:
+    pass
+
+
+@dataclass
+class ExecutionOperation:
+    debug_name: str
+    fn: Callable
+
+
+Operation = Union[YieldOperation, ExecutionOperation, Callable]
+Stage = List[ExecutionOperation]
+
+
+class _StageExecutor:
+    def __init__(self, debug_name: str, stages: List[Stage], inputs: dict):
+        self._debug_name = debug_name
+        self._stages = stages
+        self._index = 0
+        self._stage_state = _StateDict()
+        self._stage_output = inputs
+
+        # handling DP attention
+        forward_batch: ForwardBatch = inputs["forward_batch"]
+        self._global_dp_buffer_len = forward_batch.global_dp_buffer_len
+        self._local_dp_buffer_len = forward_batch.tbo_padded_len
+        self._global_num_tokens = forward_batch.global_num_tokens_cpu
+        self._is_dp_max_padding = forward_batch.dp_padding_mode.is_max_len()
+
+    def next(self):
+        assert not self.done
+
+        stage = self._stages[self._index]
+
+        # TODO: We currently always call set_dp_buffer_len here because sub-batches
+        # may have different padded lengths. It can likely be removed after TBO slice &
+        # pad logic is refactored.
+        set_dp_buffer_len(
+            self._global_dp_buffer_len,
+            self._local_dp_buffer_len,
+            self._is_dp_max_padding,
+            self._global_num_tokens,
+        )
+
+        with _annotate_region(debug_name=f"{self._debug_name}{self._index}"):
+            for op in stage:
+                with _annotate_region(debug_name=op.debug_name):
+                    self._stage_output = op.fn(
+                        state=self._stage_state,
+                        **(
+                            self._stage_output if self._stage_output is not None else {}
+                        ),
+                    )
+
+        self._index += 1
+
+    @property
+    def output(self):
+        assert self.done
+        return self._stage_output
+
+    @property
+    def done(self):
+        return self._index >= self.num_stages
+
+    @property
+    def num_stages(self):
+        return len(self._stages)
+
+
+@contextmanager
+def _annotate_region(debug_name):
+    if _ENABLE_PROFILE:
+        with torch.autograd.profiler.record_function(debug_name):
+            with nvtx.annotate(debug_name):
+                yield
+    else:
+        yield
+
+
+class _StateDict:
+    def __init__(self):
+        self._data = {}
+
+    def __setattr__(self, key, value):
+        if key == "_data":
+            super().__setattr__(key, value)
+            return
+        assert (
+            key not in self._data
+        ), f"`{key}` already exist, are you sure you want to override it?"
+        self._data[key] = value
+
+    def __getattr__(self, item):
+        return self._data[item]
+
+    def __delattr__(self, item):
+        del self._data[item]
+
+    def pop(self, item):
+        return self._data.pop(item)
+
+    def update(self, values: Dict[str, Any]):
+        for k, v in values.items():
+            setattr(self, k, v)
+
+    def get(self, item):
+        return self._data.get(item)
+
+    def clear(self, expect_keys: Sequence[str]):
+        if set(self._data.keys()) != set(expect_keys):
+            raise Exception(
+                f"Unexpected keys when clearing. This may indicate you do not release memory early enough but leave it until here. {list(self._data.keys())=} {expect_keys=}"
+            )
+
+        self._data.clear()
+
+
+def _convert_operations_to_stages(operations: List[Operation]) -> List[Stage]:
+    operations = _decorate_operations(operations)
+    operation_chunks = list(
+        _chunk_by_separator(operations, lambda op: isinstance(op, YieldOperation))
+    )
+    assert all(len(chunk) > 0 for chunk in operation_chunks)
+    return operation_chunks
+
+
+def _chunk_by_separator(
+    items: List[Any], is_separator: Callable[[Any], bool]
+) -> Generator[List[Any], None, None]:
+    pending_items = []
+    for item in items:
+        if is_separator(item):
+            yield pending_items
+            pending_items = []
+        else:
+            pending_items.append(item)
+    if len(pending_items) > 0:
+        yield pending_items
+
+
+def _decorate_operations(operations: List[Operation], debug_name_prefix: str = ""):
+    return [_decorate_operation(op, debug_name_prefix) for op in operations]
+
+
+def _decorate_operation(operation: Operation, debug_name_prefix: str):
+    if isinstance(operation, YieldOperation):
+        return operation
+    return ExecutionOperation(
+        debug_name=debug_name_prefix
+        + getattr(operation, "__name__", "unknown").replace("op_", ""),
+        fn=operation,
+    )
diff --git a/sglang/python/sglang/srt/batch_overlap/operations_strategy.py b/sglang/python/sglang/srt/batch_overlap/operations_strategy.py
new file mode 100644
index 0000000000000000000000000000000000000000..d39ad838577d8d89e96d467be1875eff25c5434e
--- /dev/null
+++ b/sglang/python/sglang/srt/batch_overlap/operations_strategy.py
@@ -0,0 +1,302 @@
+from dataclasses import dataclass
+from typing import List, Optional
+
+import torch
+
+from sglang.srt.batch_overlap import operations
+from sglang.srt.batch_overlap.operations import Operation
+from sglang.srt.layers.moe.token_dispatcher import DeepEPConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardMode
+from sglang.srt.utils import is_hip
+
+_is_hip = is_hip()
+
+
+@dataclass
+class OperationsStrategy:
+    operations: List[Operation]
+    deep_gemm_num_sms: Optional[int] = None
+    tbo_delta_stages: Optional[int] = None
+
+    @classmethod
+    def concat(cls, items: List["OperationsStrategy"]) -> "OperationsStrategy":
+        return OperationsStrategy(
+            operations=[x for item in items for x in item.operations],
+            deep_gemm_num_sms=_assert_all_same(
+                [item.deep_gemm_num_sms for item in items]
+            ),
+            tbo_delta_stages=_assert_all_same(
+                [item.tbo_delta_stages for item in items]
+            ),
+        )
+
+    @staticmethod
+    def init_new_tbo(
+        layers: torch.nn.ModuleList,
+        forward_mode: ForwardMode,
+    ) -> "OperationsStrategy":
+        layer_name = layers[0].__class__.__name__
+        if layer_name == "DeepseekV2DecoderLayer":
+            return OperationsStrategy.concat(
+                [
+                    _compute_moe_deepseek_layer_operations_strategy_tbo(
+                        layer, forward_mode
+                    )
+                    for layer in layers
+                ]
+            )
+        elif layer_name == "Qwen3MoeDecoderLayer":
+            return OperationsStrategy.concat(
+                [
+                    _compute_moe_qwen3_layer_operations_strategy_tbo(
+                        layer, forward_mode
+                    )
+                    for layer in layers
+                ]
+            )
+        elif layer_name == "MiMoV2DecoderLayer":
+            return OperationsStrategy.concat(
+                [
+                    _compute_moe_mimov2_layer_operations_strategy_tbo(
+                        layer, forward_mode
+                    )
+                    for layer in layers
+                ]
+            )
+        else:
+            raise NotImplementedError
+
+
+def _assert_all_same(items: List):
+    assert all(item == items[0] for item in items)
+    return items[0]
+
+
+# -------------------------------- Strategy for DeepSeek ---------------------------------------
+
+
+# TODO can refactor to make it more fancy if we have more complex strategies
+def _compute_moe_deepseek_layer_operations_strategy_tbo(
+    layer: torch.nn.Module,
+    forward_mode: ForwardMode,
+) -> OperationsStrategy:
+    assert layer.is_layer_sparse, "dense layer TBO not yet implemented"
+    if forward_mode == ForwardMode.EXTEND:
+        return _compute_moe_deepseek_blog_prefill(layer)
+    elif (
+        forward_mode == ForwardMode.DECODE or forward_mode == ForwardMode.TARGET_VERIFY
+    ):
+        return _compute_moe_deepseek_blog_decode(layer)
+    else:
+        raise NotImplementedError(f"Unsupported {forward_mode=}")
+
+
+def _compute_moe_deepseek_blog_prefill(layer):
+    device_properties = torch.cuda.get_device_properties(device="cuda")
+    total_num_sms = device_properties.multi_processor_count
+    deep_gemm_num_sms = None
+    if not _is_hip:
+        deep_gemm_num_sms = total_num_sms - DeepEPConfig.get_instance().num_sms
+
+    return OperationsStrategy(
+        deep_gemm_num_sms=deep_gemm_num_sms,
+        tbo_delta_stages=0,
+        operations=[
+            layer.op_comm_prepare_attn,
+            layer.self_attn.op_prepare,
+            layer.self_attn.op_core,
+            layer.op_comm_prepare_mlp,
+            layer.mlp.op_gate,
+            layer.mlp.op_select_experts,
+            layer.mlp.op_dispatch_a,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_b,
+            layer.mlp.op_experts,
+            layer.mlp.op_combine_a,
+            operations.YieldOperation(),
+            layer.mlp.op_shared_experts,
+            layer.mlp.op_combine_b,
+            layer.mlp.op_output,
+            layer.op_comm_postprocess_layer,
+        ],
+    )
+
+
+def _compute_moe_deepseek_blog_decode(layer):
+    return OperationsStrategy(
+        deep_gemm_num_sms=None,
+        tbo_delta_stages=2,
+        operations=[
+            layer.op_comm_prepare_attn,
+            layer.self_attn.op_prepare,
+            operations.YieldOperation(),
+            layer.self_attn.op_core,
+            layer.op_comm_prepare_mlp,
+            layer.mlp.op_gate,
+            layer.mlp.op_select_experts,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_a,
+            layer.mlp.op_shared_experts,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_b,
+            layer.mlp.op_experts,
+            layer.mlp.op_combine_a,
+            operations.YieldOperation(),
+            layer.mlp.op_combine_b,
+            operations.YieldOperation(),
+            layer.mlp.op_output,
+            layer.op_comm_postprocess_layer,
+        ],
+    )
+
+
+# -------------------------------- Strategy for Qwen3 ---------------------------------------
+
+
+# TODO: unstable, current strategy is almost the same as DeepSeek, keep redundant code here for
+# convenience to adjust strategy
+def _compute_moe_qwen3_layer_operations_strategy_tbo(
+    layer: torch.nn.Module,
+    forward_mode: ForwardMode,
+) -> OperationsStrategy:
+    assert layer.is_layer_sparse, "qwen3 moe only support sparse layers"
+    if forward_mode == ForwardMode.EXTEND:
+        return _compute_moe_qwen3_prefill(layer)
+    elif (
+        forward_mode == ForwardMode.DECODE or forward_mode == ForwardMode.TARGET_VERIFY
+    ):
+        return _compute_moe_qwen3_decode(layer)
+    else:
+        raise NotImplementedError(f"Unsupported {forward_mode=}")
+
+
+def _compute_moe_qwen3_prefill(layer):
+    device_properties = torch.cuda.get_device_properties(device="cuda")
+    total_num_sms = device_properties.multi_processor_count
+    deep_gemm_num_sms = None
+    if not _is_hip:
+        deep_gemm_num_sms = total_num_sms - DeepEPConfig.get_instance().num_sms
+
+    return OperationsStrategy(
+        deep_gemm_num_sms=deep_gemm_num_sms,
+        tbo_delta_stages=0,
+        operations=[
+            layer.op_comm_prepare_attn,
+            layer.self_attn.op_prepare,
+            layer.self_attn.op_core,
+            layer.op_comm_prepare_mlp,
+            layer.mlp.op_gate,
+            layer.mlp.op_select_experts,
+            layer.mlp.op_dispatch_a,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_b,
+            layer.mlp.op_experts,
+            layer.mlp.op_combine_a,
+            operations.YieldOperation(),
+            layer.mlp.op_combine_b,
+            layer.mlp.op_output,
+            layer.op_comm_postprocess_layer,
+        ],
+    )
+
+
+def _compute_moe_qwen3_decode(layer):
+    return OperationsStrategy(
+        deep_gemm_num_sms=None,
+        tbo_delta_stages=2,
+        operations=[
+            layer.op_comm_prepare_attn,
+            layer.self_attn.op_prepare,
+            operations.YieldOperation(),
+            layer.self_attn.op_core,
+            layer.op_comm_prepare_mlp,
+            layer.mlp.op_gate,
+            layer.mlp.op_select_experts,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_a,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_b,
+            layer.mlp.op_experts,
+            layer.mlp.op_combine_a,
+            operations.YieldOperation(),
+            layer.mlp.op_combine_b,
+            layer.mlp.op_output,
+            layer.op_comm_postprocess_layer,
+            operations.YieldOperation(),
+        ],
+    )
+
+
+# -------------------------------- Strategy for MiMoV2DecoderLayer ---------------------------------------
+
+
+# TODO: unstable; current strategy matches DeepSeek for the common operations (MiMoV2 has no op_shared_experts),
+# so we keep this redundant code here for convenience when adjusting the strategy
+def _compute_moe_mimov2_layer_operations_strategy_tbo(
+    layer: torch.nn.Module,
+    forward_mode: ForwardMode,
+) -> OperationsStrategy:
+    assert layer.is_layer_sparse, "MiMoV2DecoderLayer moe only support sparse layers"
+    if forward_mode == ForwardMode.EXTEND:
+        return _compute_moe_mimov2_prefill(layer)
+    elif (
+        forward_mode == ForwardMode.DECODE or forward_mode == ForwardMode.TARGET_VERIFY
+    ):
+        return _compute_moe_mimov2_decode(layer)
+    else:
+        raise NotImplementedError(f"Unsupported {forward_mode=}")
+
+
+def _compute_moe_mimov2_prefill(layer):
+    device_properties = torch.cuda.get_device_properties(device="cuda")
+    total_num_sms = device_properties.multi_processor_count
+    deep_gemm_num_sms = total_num_sms - DeepEPConfig.get_instance().num_sms
+
+    return OperationsStrategy(
+        deep_gemm_num_sms=deep_gemm_num_sms,
+        tbo_delta_stages=0,
+        operations=[
+            layer.op_comm_prepare_attn,
+            layer.self_attn.op_prepare,
+            layer.self_attn.op_core,
+            layer.op_comm_prepare_mlp,
+            layer.mlp.op_gate,
+            layer.mlp.op_select_experts,
+            layer.mlp.op_dispatch_a,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_b,
+            layer.mlp.op_experts,
+            layer.mlp.op_combine_a,
+            operations.YieldOperation(),
+            layer.mlp.op_combine_b,
+            layer.mlp.op_output,
+            layer.op_comm_postprocess_layer,
+        ],
+    )
+
+
+def _compute_moe_mimov2_decode(layer):
+    return OperationsStrategy(
+        deep_gemm_num_sms=None,
+        tbo_delta_stages=2,
+        operations=[
+            layer.op_comm_prepare_attn,
+            layer.self_attn.op_prepare,
+            operations.YieldOperation(),
+            layer.self_attn.op_core,
+            layer.op_comm_prepare_mlp,
+            layer.mlp.op_gate,
+            layer.mlp.op_select_experts,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_a,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_b,
+            layer.mlp.op_experts,
+            layer.mlp.op_combine_a,
+            operations.YieldOperation(),
+            layer.mlp.op_combine_b,
+            layer.mlp.op_output,
+            layer.op_comm_postprocess_layer,
+            operations.YieldOperation(),
+        ],
+    )
diff --git a/sglang/python/sglang/srt/batch_overlap/single_batch_overlap.py b/sglang/python/sglang/srt/batch_overlap/single_batch_overlap.py
new file mode 100644
index 0000000000000000000000000000000000000000..815ba8715de678aca24457c51c72fb08a7d0b3be
--- /dev/null
+++ b/sglang/python/sglang/srt/batch_overlap/single_batch_overlap.py
@@ -0,0 +1,144 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Optional
+
+import torch
+
+from sglang.srt.environ import envs
+from sglang.srt.layers.moe import get_moe_runner_backend
+from sglang.srt.layers.moe.utils import is_sbo_enabled
+from sglang.srt.utils import is_blackwell
+
+
+class SboFlags:
+    # TODO may have: "enable_dispatch_gateup_gemm_two_stream_overlap", ...
+
+    @classmethod
+    def enable_combine_down_gemm_two_stream_overlap(cls):
+        return (
+            is_sbo_enabled()
+            # currently only cutedsl backend supports it
+            and (
+                get_moe_runner_backend().is_flashinfer_cutedsl()
+                or (get_moe_runner_backend().is_deep_gemm() and not is_blackwell())
+            )
+        )
+
+    @classmethod
+    def enable_combine_shared_two_stream_overlap(cls):
+        return (
+            is_sbo_enabled()
+            and not cls.enable_dispatch_shared_one_stream_overlap()
+            and not envs.SGLANG_BLACKWELL_OVERLAP_SHARED_EXPERTS_OUTSIDE_SBO.get()
+        )
+
+    @classmethod
+    def enable_dispatch_shared_one_stream_overlap(cls):
+        return is_sbo_enabled() and not is_blackwell()
+
+    @classmethod
+    def fuse_shared_experts_inside_sbo(cls):
+        return (
+            cls.enable_combine_shared_two_stream_overlap()
+            or cls.enable_dispatch_shared_one_stream_overlap()
+        )
+
+
+@dataclass
+class CombineOverlapArgs:
+    # this "overlap" flag means overlapping with down gemm, not the general two-stream overlap
+    overlap: bool
+    stream: torch.cuda.Stream
+    wait_event: torch.cuda.Event
+    num_sms: Optional[int] = None
+    signal: Optional[torch.Tensor] = None
+    block_m: Optional[int] = 64
+    threshold: Optional[int] = 0
+
+
+@dataclass
+class DownGemmOverlapArgs:
+    num_sms: int
+    signal: torch.Tensor
+    start_event: torch.cuda.Event
+
+
+def compute_overlap_args(dispatch_output, alt_stream):
+    if not (
+        SboFlags.enable_combine_down_gemm_two_stream_overlap()
+        or SboFlags.enable_combine_shared_two_stream_overlap()
+    ):
+        return None, None, {}
+
+    hidden_states = dispatch_output.hidden_states
+
+    num_local_experts, num_tokens_static, hidden_dim = hidden_states.shape
+
+    total_num_sms = torch.cuda.get_device_properties(
+        device="cuda"
+    ).multi_processor_count
+
+    if envs.SGLANG_DEEPEP_LL_COMBINE_SEND_NUM_SMS.is_set():
+        communicate_num_sms = envs.SGLANG_DEEPEP_LL_COMBINE_SEND_NUM_SMS.get()
+    else:
+        communicate_num_sms = 32 if is_blackwell() else 3
+    compute_num_sms = total_num_sms - communicate_num_sms
+
+    assert alt_stream is not None
+    combine_wait_event = torch.cuda.Event()
+    combine_overlap_args = CombineOverlapArgs(
+        overlap=False,
+        num_sms=communicate_num_sms,
+        stream=alt_stream,
+        wait_event=combine_wait_event,
+    )
+    meta_overlap_args = dict(
+        compute_num_sms=compute_num_sms,
+    )
+    down_gemm_overlap_args = None
+
+    if SboFlags.enable_combine_down_gemm_two_stream_overlap():
+        # TODO use zero_allocator to remove this `torch.zeros` call
+        # NOTE ours v2 use uint32 not int32 currently
+        if is_blackwell():
+            combine_signal = torch.zeros(
+                num_local_experts, dtype=torch.uint32, device=hidden_states.device
+            )
+        else:
+            MIN_BLOCK_M = 64
+            combine_signal_size = num_local_experts * (
+                (num_tokens_static + MIN_BLOCK_M - 1) // MIN_BLOCK_M
+            )
+            combine_signal = torch.zeros(
+                combine_signal_size, dtype=torch.int32, device=hidden_states.device
+            )
+
+        down_gemm_overlap_args = DownGemmOverlapArgs(
+            signal=combine_signal,
+            start_event=combine_wait_event,
+            num_sms=compute_num_sms,
+        )
+        combine_overlap_args.overlap = True
+        combine_overlap_args.signal = combine_signal
+        combine_overlap_args.threshold = compute_num_sms
+    else:
+        meta_overlap_args |= dict(
+            record_event_after_down=combine_wait_event,
+        )
+
+    return combine_overlap_args, down_gemm_overlap_args, meta_overlap_args
diff --git a/sglang/python/sglang/srt/batch_overlap/two_batch_overlap.py b/sglang/python/sglang/srt/batch_overlap/two_batch_overlap.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0d1a4923d93b600af8613cc25e3d0ed65229f5a
--- /dev/null
+++ b/sglang/python/sglang/srt/batch_overlap/two_batch_overlap.py
@@ -0,0 +1,1082 @@
+from __future__ import annotations
+
+import copy
+import dataclasses
+import logging
+from dataclasses import replace
+from typing import TYPE_CHECKING, Dict, List, Optional, Sequence
+
+import torch
+
+from sglang.srt.batch_overlap.operations import (
+    execute_operations,
+    execute_overlapped_operations,
+)
+from sglang.srt.batch_overlap.operations_strategy import OperationsStrategy
+from sglang.srt.layers import deep_gemm_wrapper
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+from sglang.srt.layers.communicator import (
+    CommunicateContext,
+    CommunicateSummableTensorPairFn,
+    ScatterMode,
+)
+from sglang.srt.layers.dp_attention import get_attention_tp_size
+from sglang.srt.layers.moe import (
+    get_deepep_mode,
+    get_moe_a2a_backend,
+    get_tbo_token_distribution_threshold,
+    is_tbo_enabled,
+)
+from sglang.srt.layers.moe.token_dispatcher import (
+    DeepEPDispatcher,
+    MooncakeEPDispatcher,
+    MoriEPDispatcher,
+)
+from sglang.srt.layers.moe.token_dispatcher.base import BaseDispatcher
+from sglang.srt.managers.schedule_batch import ScheduleBatch
+from sglang.srt.model_executor.forward_batch_info import (
+    ForwardBatch,
+    ForwardMode,
+    compute_position,
+)
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.speculative.spec_info import SpecInput
+from sglang.srt.utils import BumpAllocator, empty_context, get_bool_env_var, is_hip
+
+if TYPE_CHECKING:
+    from sglang.srt.batch_overlap.single_batch_overlap import CombineOverlapArgs
+    from sglang.srt.layers.moe.token_dispatcher import DispatchOutput
+    from sglang.srt.speculative.eagle_info import EagleVerifyInput
+
+_is_hip = is_hip()
+
+_tbo_debug = get_bool_env_var("SGLANG_TBO_DEBUG")
+
+logger = logging.getLogger(__name__)
+
+
+# -------------------------------- Compute Basic Info ---------------------------------------
+
+
+def get_token_num_per_seq(
+    forward_mode: ForwardMode,
+    spec_info: Optional[SpecInput] = None,
+):
+    if forward_mode.is_target_verify():
+        return spec_info.draft_token_num
+    elif forward_mode.is_decode():
+        return 1
+    elif forward_mode.is_idle():
+        return 0
+    else:
+        # For extend, we should not use `token_num_per_seq`.
+        return None
+
+
+# TODO: may smartly disable TBO when batch size is too small b/c it will slow down
+def compute_split_seq_index(
+    forward_mode: ForwardMode,
+    num_tokens: int,
+    extend_lens: Optional[Sequence[int]],
+    token_num_per_seq: Optional[int],
+) -> Optional[int]:
+    if forward_mode == ForwardMode.EXTEND:
+        assert extend_lens is not None
+        return _split_extend_seqs(extend_lens)
+    elif forward_mode.is_target_verify() or forward_mode.is_decode():
+        assert token_num_per_seq is not None
+        return (num_tokens // token_num_per_seq) // 2
+    elif forward_mode.is_idle() or forward_mode.is_prebuilt():
+        assert num_tokens == 0
+        return 0
+    else:
+        raise NotImplementedError()
+
+
+def _is_two_chunk_split_enabled(extend_lens: Sequence[int]) -> bool:
+    if extend_lens is None:
+        return False
+
+    vanilla_split_seq_index = _split_array_by_balanced_sum(extend_lens)
+    left_sum = sum(extend_lens[:vanilla_split_seq_index])
+    overall_sum = sum(extend_lens)
+    threshold = get_tbo_token_distribution_threshold()
+    assert threshold <= 0.5, f"{threshold=}"
+    return left_sum < overall_sum * threshold or left_sum > overall_sum * (
+        1 - threshold
+    )
+
+
+def _split_extend_seqs(arr: Sequence[int]) -> int:
+    if _is_two_chunk_split_enabled(arr):
+        return _split_array_by_cum_less_than_half(arr)
+
+    return _split_array_by_balanced_sum(arr)
+
+
+def _split_array_by_cum_less_than_half(arr: Sequence[int]) -> int:
+    left_sum = 0
+    overall_sum = sum(arr)
+    half_sum = overall_sum // 2
+    chosen_index = 0
+
+    for i in range(len(arr)):
+        left_sum += arr[i]
+        if left_sum > half_sum:
+            chosen_index = i
+            break
+
+    return chosen_index
+
+
+def _split_array_by_balanced_sum(arr: Sequence[int]) -> int:
+    overall_sum = sum(arr)
+    left_sum = 0
+    min_diff = float("inf")
+    best_index = 0
+
+    for i in range(1, len(arr)):
+        left_sum += arr[i - 1]
+        right_sum = overall_sum - left_sum
+        diff = abs(left_sum - right_sum)
+        if diff <= min_diff:
+            min_diff = diff
+            best_index = i
+        else:
+            break
+
+    return best_index
+
+
+def _update_device_and_sum_field_from_cpu_field(
+    batch: ForwardBatch, cpu_field: str, device_field: str, sum_field: str = None
+):
+    cpu_value = getattr(batch, cpu_field, None)
+    old_device_value = getattr(batch, device_field, None)
+    if (
+        cpu_value is None
+        or old_device_value is None
+        or not (isinstance(cpu_value, torch.Tensor) or isinstance(cpu_value, list))
+    ):
+        return
+
+    new_device_value = (
+        cpu_value
+        if isinstance(cpu_value, torch.Tensor)
+        else torch.tensor(cpu_value, dtype=old_device_value.dtype)
+    ).to(device=get_global_server_args().device, non_blocking=True)
+    setattr(batch, device_field, new_device_value)
+
+    if sum_field is not None:
+        sum_value = (
+            cpu_value.sum().item()
+            if isinstance(cpu_value, torch.Tensor)
+            else sum(cpu_value)
+        )
+        setattr(batch, sum_field, sum_value)
+
+
+def _compute_mask_offset(seq_index: int, spec_info: Optional[EagleVerifyInput]) -> int:
+    if seq_index == 0:
+        return 0
+
+    offset = 0
+    max_seq_len = min(seq_index, spec_info.seq_lens_cpu.shape[0])
+    for i in range(max_seq_len):
+        offset += (
+            spec_info.seq_lens_cpu[i] + spec_info.draft_token_num
+        ) * spec_info.draft_token_num
+    return offset
+
+
+def split_spec_info(
+    spec_info: Optional[EagleVerifyInput],
+    start_seq_index: int,
+    end_seq_index: int,
+    start_token_index: int,
+    end_token_index: int,
+):
+    if spec_info is None:
+        return None
+    if spec_info.draft_token is not None:
+        draft_token = spec_info.draft_token[start_token_index:end_token_index]
+    else:
+        draft_token = None
+    if spec_info.custom_mask is not None and spec_info.draft_token is not None:
+        custom_mask_start = _compute_mask_offset(start_seq_index, spec_info)
+        if end_seq_index == spec_info.seq_lens_cpu.shape[0]:
+            custom_mask_end = spec_info.custom_mask.shape[0]
+        else:
+            custom_mask_end = _compute_mask_offset(end_seq_index, spec_info)
+
+        if custom_mask_end > custom_mask_start:
+            custom_mask = spec_info.custom_mask[custom_mask_start:custom_mask_end]
+        else:
+            custom_mask = spec_info.custom_mask
+    else:
+        custom_mask = spec_info.custom_mask
+    if spec_info.positions is not None:
+        positions = spec_info.positions[start_token_index:end_token_index]
+    else:
+        positions = None
+    if spec_info.retrive_index is not None:
+        retrive_index = spec_info.retrive_index[start_seq_index:end_seq_index]
+    else:
+        retrive_index = None
+    if spec_info.retrive_next_token is not None:
+        retrive_next_token = spec_info.retrive_next_token[start_seq_index:end_seq_index]
+    else:
+        retrive_next_token = None
+    if spec_info.retrive_next_sibling is not None:
+        retrive_next_sibling = spec_info.retrive_next_sibling[
+            start_seq_index:end_seq_index
+        ]
+    else:
+        retrive_next_sibling = None
+    if spec_info.retrive_cum_len is not None:
+        retrive_cum_len = spec_info.retrive_cum_len[start_seq_index:end_seq_index]
+    else:
+        retrive_cum_len = None
+
+    if spec_info.seq_lens_cpu is not None:
+        seq_lens_cpu = spec_info.seq_lens_cpu[start_seq_index:end_seq_index]
+    else:
+        seq_lens_cpu = None
+    if seq_lens_cpu is not None:
+        seq_lens_sum = seq_lens_cpu.sum()
+    else:
+        seq_lens_sum = None
+    output_spec_info = replace(
+        spec_info,
+        custom_mask=custom_mask,
+        draft_token=draft_token,
+        positions=positions,
+        retrive_index=retrive_index,
+        retrive_next_token=retrive_next_token,
+        retrive_next_sibling=retrive_next_sibling,
+        retrive_cum_len=retrive_cum_len,
+        seq_lens_cpu=seq_lens_cpu,
+        seq_lens_sum=seq_lens_sum,
+    )
+    return output_spec_info
+
+
+def compute_split_token_index(
+    split_seq_index: int,
+    forward_mode: "ForwardMode",
+    extend_seq_lens: Optional[Sequence[int]],
+    token_num_per_seq: Optional[int],
+) -> int:
+    if forward_mode == ForwardMode.EXTEND:
+        assert extend_seq_lens is not None
+        if _is_two_chunk_split_enabled(extend_seq_lens):
+            return sum(extend_seq_lens) // 2
+        return sum(extend_seq_lens[:split_seq_index])
+    elif forward_mode.is_target_verify() or forward_mode.is_decode():
+        assert token_num_per_seq is not None
+        return split_seq_index * token_num_per_seq
+    elif forward_mode.is_idle():
+        assert split_seq_index == 0
+        return 0
+    else:
+        raise NotImplementedError
+
+
+def compute_split_indices_for_cuda_graph_replay(
+    forward_mode: ForwardMode,
+    cuda_graph_num_tokens: int,
+    spec_info: Optional[SpecInput],
+):
+    forward_mode_for_tbo_split = (
+        forward_mode if forward_mode != ForwardMode.IDLE else ForwardMode.DECODE
+    )
+    token_num_per_seq = get_token_num_per_seq(
+        forward_mode=forward_mode, spec_info=spec_info
+    )
+    tbo_split_seq_index = compute_split_seq_index(
+        forward_mode=forward_mode_for_tbo_split,
+        num_tokens=cuda_graph_num_tokens,
+        extend_lens=None,
+        token_num_per_seq=token_num_per_seq,
+    )
+    tbo_split_token_index = compute_split_token_index(
+        split_seq_index=tbo_split_seq_index,
+        forward_mode=forward_mode_for_tbo_split,
+        extend_seq_lens=None,
+        token_num_per_seq=token_num_per_seq,
+    )
+    return tbo_split_seq_index, tbo_split_token_index
+
+
+# -------------------------------- Preparation ---------------------------------------
+
+
+class TboCudaGraphRunnerPlugin:
+    def __init__(self):
+        self._tbo_children_num_token_non_padded = torch.zeros((2,), dtype=torch.int32)
+
+    def capture_one_batch_size(self, batch: ForwardBatch, num_tokens: int):
+        if not is_tbo_enabled():
+            return
+        token_num_per_seq = get_token_num_per_seq(
+            forward_mode=batch.forward_mode, spec_info=batch.spec_info
+        )
+
+        batch.tbo_split_seq_index = compute_split_seq_index(
+            forward_mode=batch.forward_mode,
+            num_tokens=num_tokens,
+            extend_lens=None,
+            token_num_per_seq=token_num_per_seq,
+        )
+        # For simplicity, when two_batch_overlap is enabled, we only capture CUDA Graph for tbo=true
+        assert batch.tbo_split_seq_index is not None, f"{num_tokens=}"
+
+        self._tbo_children_num_token_non_padded[...] = (
+            TboForwardBatchPreparer.compute_tbo_children_num_token_non_padded(batch)
+        )
+
+        TboForwardBatchPreparer.prepare_raw(
+            batch,
+            tbo_children_num_token_non_padded=self._tbo_children_num_token_non_padded,
+        )
+
+    def replay_prepare(
+        self,
+        forward_mode: ForwardMode,
+        bs: int,
+        num_token_non_padded: int,
+        spec_info: Optional[SpecInput],
+    ):
+        token_num_per_seq = get_token_num_per_seq(
+            forward_mode=forward_mode, spec_info=spec_info
+        )
+        tbo_split_seq_index, tbo_split_token_index = (
+            compute_split_indices_for_cuda_graph_replay(
+                forward_mode=forward_mode,
+                cuda_graph_num_tokens=bs * token_num_per_seq,
+                spec_info=spec_info,
+            )
+        )
+
+        self._tbo_children_num_token_non_padded[...] = (
+            TboForwardBatchPreparer.compute_tbo_children_num_token_non_padded_raw(
+                tbo_split_token_index=tbo_split_token_index,
+                num_token_non_padded=num_token_non_padded,
+            )
+        )
+
+
+class TboDPAttentionPreparer:
+    def prepare_all_gather(
+        self,
+        local_batch: ScheduleBatch,
+    ):
+
+        deepep_mode = get_deepep_mode()
+        enable_a2a_moe = not get_moe_a2a_backend().is_none()
+        enable_two_batch_overlap = is_tbo_enabled()
+
+        self.enable_two_batch_overlap = enable_two_batch_overlap
+
+        if local_batch is not None:
+            token_num_per_seq = get_token_num_per_seq(
+                forward_mode=local_batch.forward_mode, spec_info=local_batch.spec_info
+            )
+
+            if (
+                local_batch.forward_mode.is_target_verify()
+                or local_batch.forward_mode.is_decode()
+            ):
+                num_tokens = local_batch.batch_size() * token_num_per_seq
+            elif local_batch.forward_mode.is_prebuilt():
+                num_tokens = 0
+            else:
+                num_tokens = local_batch.extend_num_tokens
+            self.local_tbo_split_seq_index = compute_split_seq_index(
+                forward_mode=local_batch.forward_mode,
+                num_tokens=num_tokens,
+                extend_lens=local_batch.extend_lens,
+                token_num_per_seq=token_num_per_seq,
+            )
+            resolved_deepep_mode = deepep_mode.resolve(local_batch.is_extend_in_batch)
+            local_can_run_tbo = (self.local_tbo_split_seq_index is not None) and not (
+                (
+                    local_batch.forward_mode.is_extend()
+                    and not local_batch.forward_mode.is_target_verify()
+                )
+                and enable_a2a_moe
+                and (resolved_deepep_mode.is_low_latency())
+            )
+        else:
+            self.local_tbo_split_seq_index = 0
+            local_can_run_tbo = True
+
+        local_forward_mode = self._compute_local_forward_mode(local_batch)
+
+        return local_can_run_tbo, local_forward_mode
+
+    def compute_output(self, partial_global_info):
+        # Perform only one Device-to-Host (D2H) memory copy
+        cpu_data = partial_global_info[:, :2].cpu()
+        local_can_run_tbo_aggregated = min(cpu_data[:, 0].tolist())
+        forward_modes = cpu_data[:, 1].tolist()
+
+        global_forward_mode, forward_mode_agree = self._compute_global_forward_mode(
+            forward_modes
+        )
+
+        can_run_tbo = (
+            self.enable_two_batch_overlap
+            and local_can_run_tbo_aggregated
+            and forward_mode_agree
+        )
+
+        tbo_split_seq_index = self.local_tbo_split_seq_index if can_run_tbo else None
+        global_forward_mode = global_forward_mode if can_run_tbo else None
+        return tbo_split_seq_index, global_forward_mode
+
+    @staticmethod
+    def _compute_local_forward_mode(local_batch):
+        return (
+            local_batch.forward_mode if local_batch is not None else ForwardMode.IDLE
+        ).value
+
+    @staticmethod
+    def _compute_global_forward_mode(forward_modes):
+        forward_modes_excluding_idle_and_prebuilt = [
+            x
+            for x in forward_modes
+            if x != ForwardMode.IDLE.value and x != ForwardMode.PREBUILT.value
+        ]
+
+        if not forward_modes_excluding_idle_and_prebuilt:
+            return ForwardMode.IDLE, False
+
+        forward_mode_agree = TboDPAttentionPreparer._is_all_same(
+            forward_modes_excluding_idle_and_prebuilt
+        )
+
+        global_forward_mode = (
+            ForwardMode(forward_modes_excluding_idle_and_prebuilt[0])
+            if forward_mode_agree
+            else None
+        )
+        return global_forward_mode, forward_mode_agree
+
+    @staticmethod
+    def _is_all_same(x):
+        return all(value == x[0] for value in x)
+
+
+class TboForwardBatchPreparer:
+    @classmethod
+    def prepare(cls, batch: ForwardBatch, is_draft_worker: bool = False):
+        if batch.tbo_split_seq_index is None or is_draft_worker:
+            return
+
+        tbo_children_num_token_non_padded = (
+            cls.compute_tbo_children_num_token_non_padded(batch)
+        )
+        cls.prepare_raw(
+            batch, tbo_children_num_token_non_padded=tbo_children_num_token_non_padded
+        )
+
+    @classmethod
+    def prepare_raw(
+        cls, batch: ForwardBatch, tbo_children_num_token_non_padded: torch.Tensor
+    ):
+        from sglang.srt.layers.attention.tbo_backend import TboAttnBackend
+
+        tbo_split_token_index = cls._compute_split_token_index(batch)
+
+        is_enable_two_chunk = (
+            batch.forward_mode == ForwardMode.EXTEND
+            and _is_two_chunk_split_enabled(batch.extend_seq_lens_cpu)
+        )
+
+        if _tbo_debug:
+            logger.info(
+                f"TboForwardBatchPreparer.prepare "
+                f"is_enable_two_chunk={is_enable_two_chunk} "
+                f"tbo_split_seq_index={batch.tbo_split_seq_index} "
+                f"tbo_split_token_index={tbo_split_token_index} "
+                f"extend_seq_lens={batch.extend_seq_lens_cpu} "
+                f"bs={batch.batch_size} "
+                f"forward_mode={batch.forward_mode}"
+            )
+
+        assert isinstance(batch.attn_backend, TboAttnBackend)
+        attn_backend_child_a, attn_backend_child_b = batch.attn_backend.children
+
+        [out_num_token_non_padded_a, out_num_token_non_padded_b] = (
+            tbo_children_num_token_non_padded
+        )
+
+        child_a = cls.filter_batch(
+            batch,
+            start_token_index=0,
+            end_token_index=tbo_split_token_index,
+            start_seq_index=0,
+            end_seq_index=(
+                batch.tbo_split_seq_index + 1
+                if is_enable_two_chunk
+                else batch.tbo_split_seq_index
+            ),
+            output_attn_backend=attn_backend_child_a,
+            out_num_token_non_padded=out_num_token_non_padded_a,
+        )
+        child_b = cls.filter_batch(
+            batch,
+            start_token_index=tbo_split_token_index,
+            end_token_index=batch.input_ids.shape[0],
+            start_seq_index=batch.tbo_split_seq_index,
+            end_seq_index=batch.batch_size,
+            output_attn_backend=attn_backend_child_b,
+            out_num_token_non_padded=out_num_token_non_padded_b,
+        )
+
+        if is_enable_two_chunk:
+            cls.derive_fields_related_to_seq_len_for_two_chunk(
+                batch,
+                child_a=child_a,
+                child_b=child_b,
+                tbo_split_seq_index=batch.tbo_split_seq_index,
+            )
+
+        assert batch.tbo_children is None
+        batch.tbo_children = [child_a, child_b]
+
+    @classmethod
+    def derive_fields_related_to_seq_len_for_two_chunk(
+        cls,
+        batch: ForwardBatch,
+        *,
+        child_a: ForwardBatch,
+        child_b: ForwardBatch,
+        tbo_split_seq_index: int,
+    ):
+        extend_seq_lens_cpu = batch.extend_seq_lens_cpu
+        overall_seq_lens_sum = sum(extend_seq_lens_cpu)
+        half_seq_lens_sum = overall_seq_lens_sum // 2
+        left_last_seq_token_num = half_seq_lens_sum - sum(
+            extend_seq_lens_cpu[:tbo_split_seq_index]
+        )
+        right_first_seq_token_num = (
+            extend_seq_lens_cpu[tbo_split_seq_index] - left_last_seq_token_num
+        )
+
+        # making deepcopy to be extra safe
+        child_a.extend_seq_lens_cpu = copy.deepcopy(child_a.extend_seq_lens_cpu)
+        child_a.extend_seq_lens_cpu[-1] = left_last_seq_token_num
+        child_b.extend_seq_lens_cpu = copy.deepcopy(child_b.extend_seq_lens_cpu)
+        child_b.extend_seq_lens_cpu[0] = right_first_seq_token_num
+        for child in [child_a, child_b]:
+            _update_device_and_sum_field_from_cpu_field(
+                batch=child,
+                cpu_field="extend_seq_lens_cpu",
+                device_field="extend_seq_lens",
+                sum_field="extend_num_tokens",
+            )
+
+        assert (
+            child_a.extend_num_tokens == half_seq_lens_sum
+        ), f"{child_a.extend_num_tokens=}, {half_seq_lens_sum=}"
+
+        child_a.seq_lens_cpu = copy.deepcopy(child_a.seq_lens_cpu)
+        child_a.seq_lens_cpu[-1] = (
+            child_a.extend_seq_lens_cpu[-1] + child_a.extend_prefix_lens_cpu[-1]
+        )
+        _update_device_and_sum_field_from_cpu_field(
+            batch=child_a,
+            cpu_field="seq_lens_cpu",
+            device_field="seq_lens",
+            sum_field="seq_lens_sum",
+        )
+
+        child_b.extend_prefix_lens_cpu = copy.deepcopy(child_b.extend_prefix_lens_cpu)
+        child_b.extend_prefix_lens_cpu[0] += left_last_seq_token_num
+        _update_device_and_sum_field_from_cpu_field(
+            batch=child_b,
+            cpu_field="extend_prefix_lens_cpu",
+            device_field="extend_prefix_lens",
+            sum_field=None,
+        )
+        _, child_b.extend_start_loc = compute_position(
+            get_global_server_args().attention_backend,
+            child_b.extend_prefix_lens,
+            child_b.extend_seq_lens,
+            child_b.extend_num_tokens,
+        )
+
+    @classmethod
+    def filter_batch(
+        cls,
+        batch: ForwardBatch,
+        *,
+        start_token_index: int,
+        end_token_index: int,
+        start_seq_index: int,
+        end_seq_index: int,
+        output_attn_backend: AttentionBackend,
+        out_num_token_non_padded: torch.Tensor,
+    ):
+        assert (
+            end_token_index >= start_token_index
+        ), f"{end_token_index=}, {start_token_index=}, batch={batch}"
+        num_tokens = batch.input_ids.shape[0]
+        num_seqs = batch.batch_size
+
+        output_dict = dict()
+
+        for key in [
+            "input_ids",
+            "positions",
+            "out_cache_loc",
+        ]:
+            old_value = getattr(batch, key)
+            assert (
+                old_value.shape[0] == num_tokens
+            ), f"{key=} {old_value=} {num_tokens=} {batch=}"
+            output_dict[key] = old_value[start_token_index:end_token_index]
+
+        attention_tp_size = get_attention_tp_size()
+        output_dict["tbo_padded_len"] = (
+            (end_token_index - start_token_index - 1) // attention_tp_size + 1
+        ) * attention_tp_size
+
+        for key in [
+            "req_pool_indices",
+            "seq_lens",
+            "seq_lens_cpu",
+            "extend_seq_lens",
+            "extend_prefix_lens",
+            "extend_start_loc",
+            "extend_prefix_lens_cpu",
+            "extend_seq_lens_cpu",
+            "extend_logprob_start_lens_cpu",
+            "lora_ids",
+            "rids",
+        ]:
+            old_value = getattr(batch, key)
+            if old_value is None:
+                continue
+            elif batch.forward_mode.is_target_verify() and (
+                key == "extend_seq_lens"
+                or key == "extend_prefix_lens"
+                or key == "extend_start_loc"
+                or key == "extend_prefix_lens_cpu"
+                or key == "extend_seq_lens_cpu"
+                or key == "extend_logprob_start_lens_cpu"
+            ):
+                output_dict[key] = None
+                continue
+            assert (
+                len(old_value) == num_seqs
+            ), f"{key=} {old_value=} {num_seqs=} {batch=}"
+            output_dict[key] = old_value[start_seq_index:end_seq_index]
+
+        spec_info = getattr(batch, "spec_info")
+        output_spec_info = split_spec_info(
+            spec_info=spec_info,
+            start_token_index=start_token_index,
+            end_token_index=end_token_index,
+            start_seq_index=start_seq_index,
+            end_seq_index=end_seq_index,
+        )
+        output_dict["spec_info"] = output_spec_info
+        for key in [
+            "forward_mode",
+            "is_extend_in_batch",
+            "all_extend_in_batch",
+            "return_logprob",
+            "req_to_token_pool",
+            "token_to_kv_pool",
+            "can_run_dp_cuda_graph",
+            "dp_padding_mode",
+            "global_forward_mode",
+            "is_prefill_only",
+            "spec_algorithm",
+            "capture_hidden_mode",
+            "padded_static_len",
+            "mrope_positions",  # only used by qwen2-vl, thus not care
+            "split_index",  # for split prefill
+            "orig_seq_lens",  # only used by qwen-1m, thus not care
+        ]:
+            output_dict[key] = getattr(batch, key)
+        if not batch.forward_mode.is_target_verify():
+            assert (
+                _compute_extend_num_tokens(batch.input_ids, batch.forward_mode)
+                == batch.extend_num_tokens
+            ), f"{batch=}"
+        extend_num_tokens = _compute_extend_num_tokens(
+            output_dict["input_ids"], output_dict["forward_mode"]
+        )
+
+        # TODO improve, e.g. unify w/ `init_raw`
+        if (
+            get_global_server_args().moe_dense_tp_size == 1
+            and batch.global_dp_buffer_len is not None
+        ):
+            sum_len = end_token_index - start_token_index
+            global_dp_buffer_len = sum_len
+        else:
+            global_dp_buffer_len = None
+
+        output_dict.update(
+            dict(
+                batch_size=end_seq_index - start_seq_index,
+                seq_lens_sum=(
+                    output_dict["seq_lens_cpu"].sum()
+                    if "seq_lens_cpu" in output_dict
+                    else None
+                ),
+                extend_num_tokens=extend_num_tokens,
+                attn_backend=output_attn_backend,
+                num_token_non_padded=out_num_token_non_padded,
+                # TODO: handle it when we need TBO + DeepSeek V3.2
+                num_token_non_padded_cpu=None,
+                tbo_split_seq_index=None,
+                tbo_parent_token_range=(start_token_index, end_token_index),
+                tbo_children=None,
+                original_global_num_tokens_cpu=None,
+                global_num_tokens_gpu=None,
+                global_num_tokens_cpu=None,
+                global_dp_buffer_len=global_dp_buffer_len,
+                global_num_tokens_for_logprob_gpu=None,
+                global_num_tokens_for_logprob_cpu=None,
+                sampling_info=None,
+                # For logits and logprobs post processing, thus we do not care
+                temp_scaled_logprobs=False,
+                temperature=None,
+                top_p_normalized_logprobs=False,
+                top_p=None,
+                mm_inputs=None,
+                top_logprobs_nums=None,
+                token_ids_logprobs=None,
+                next_token_logits_buffer=None,
+                return_hidden_states_before_norm=False,
+            )
+        )
+
+        errors = []
+        for field in dataclasses.fields(ForwardBatch):
+            if getattr(batch, field.name) is not None and field.name not in output_dict:
+                errors.append(
+                    f"Field {field.name} has value, but is not yet supported (value={getattr(batch, field.name)} batch={batch})"
+                )
+        if len(errors) > 0:
+            raise Exception(f"{len(errors)} errors happen:\n" + "\n\n".join(errors))
+
+        return ForwardBatch(**output_dict)
+
+    @classmethod
+    def compute_tbo_children_num_token_non_padded(cls, batch: ForwardBatch):
+        return cls.compute_tbo_children_num_token_non_padded_raw(
+            tbo_split_token_index=cls._compute_split_token_index(batch),
+            num_token_non_padded=len(batch.input_ids),
+        )
+
+    @classmethod
+    def compute_tbo_children_num_token_non_padded_raw(
+        cls, tbo_split_token_index: int, num_token_non_padded: int
+    ):
+        # TODO we may make padding on both sub-batches to make it slightly more balanced
+        value_a = min(tbo_split_token_index, num_token_non_padded)
+        value_b = max(0, num_token_non_padded - tbo_split_token_index)
+        return torch.tensor([value_a, value_b], dtype=torch.int32).to(
+            device=get_global_server_args().device, non_blocking=True
+        )
+
+    @classmethod
+    def _compute_split_token_index(cls, batch: ForwardBatch):
+        token_num_per_seq = get_token_num_per_seq(
+            forward_mode=batch.forward_mode, spec_info=batch.spec_info
+        )
+        return compute_split_token_index(
+            split_seq_index=batch.tbo_split_seq_index,
+            forward_mode=batch.forward_mode,
+            extend_seq_lens=batch.extend_seq_lens_cpu,
+            token_num_per_seq=token_num_per_seq,
+        )
+
+
+def _compute_extend_num_tokens(input_ids, forward_mode: ForwardMode):
+    if (
+        forward_mode.is_decode()
+        or forward_mode.is_idle()
+        or forward_mode.is_target_verify()
+    ):
+        return None
+    elif forward_mode.is_extend():
+        return input_ids.shape[0]
+    raise NotImplementedError
+
+
+# -------------------------------- Execution ---------------------------------------
+
+
+def model_forward_maybe_tbo(
+    layers,
+    enable_tbo: bool,
+    positions: torch.Tensor,
+    forward_batch: ForwardBatch,
+    hidden_states: torch.Tensor,
+    input_data_scatter_mode: ScatterMode,
+    residual: Optional[torch.Tensor],
+    zero_allocator: Optional[BumpAllocator] = None,
+):
+    inputs = dict(
+        positions=positions,
+        hidden_states=hidden_states,
+        forward_batch=forward_batch,
+        residual=residual,
+        zero_allocator=zero_allocator,
+    )
+    layer_input_scatter_mode = layers[0].layer_scatter_modes.layer_input_mode
+    operations_strategy = OperationsStrategy.init_new_tbo(
+        layers, forward_batch.global_forward_mode
+    )
+    if enable_tbo:
+        return _model_forward_tbo(
+            inputs=inputs,
+            operations_strategy=operations_strategy,
+            input_data_scatter_mode=input_data_scatter_mode,
+            layer_input_scatter_mode=layer_input_scatter_mode,
+        )
+    else:
+        return _model_forward_non_tbo(inputs, operations_strategy)
+
+
+def _model_forward_tbo(
+    inputs,
+    operations_strategy: OperationsStrategy,
+    input_data_scatter_mode: ScatterMode,
+    layer_input_scatter_mode: ScatterMode,
+):
+    inputs_arr = _model_forward_tbo_split_inputs(
+        **inputs,
+        input_data_scatter_mode=input_data_scatter_mode,
+        layer_input_scatter_mode=layer_input_scatter_mode,
+    )
+    original_hidden_states_len = inputs["hidden_states"].shape[0]
+    del inputs
+
+    context = (
+        empty_context()
+        if _is_hip
+        else deep_gemm_wrapper.configure_deep_gemm_num_sms(
+            operations_strategy.deep_gemm_num_sms
+        )
+    )
+
+    with context:
+        outputs_arr = execute_overlapped_operations(
+            inputs_arr=inputs_arr,
+            operations_arr=[operations_strategy.operations] * 2,
+            delta_stages=[0, operations_strategy.tbo_delta_stages],
+        )
+
+    return _model_forward_tbo_merge_outputs(*outputs_arr, original_hidden_states_len)
+
+
+def _model_forward_non_tbo(inputs, operations_strategy: OperationsStrategy):
+    outputs = execute_operations(inputs, operations_strategy.operations)
+    return outputs["hidden_states"], outputs["residual"]
+
+
+def _model_forward_tbo_split_inputs(
+    hidden_states: torch.Tensor,
+    residual: torch.Tensor,
+    positions: torch.Tensor,
+    forward_batch: ForwardBatch,
+    zero_allocator: Optional[BumpAllocator],
+    input_data_scatter_mode: ScatterMode,
+    layer_input_scatter_mode: ScatterMode,
+) -> List[Dict]:
+    tbo_splitter_scatter_mode = ScatterMode.TP_ATTN_FULL
+    context = CommunicateContext.init_new()
+
+    hidden_states, residual = CommunicateSummableTensorPairFn.execute(
+        hidden_states_input_mode=input_data_scatter_mode,
+        residual_input_mode=input_data_scatter_mode,
+        output_mode=tbo_splitter_scatter_mode,
+        hidden_states=hidden_states,
+        residual=residual,
+        forward_batch=forward_batch,
+        context=context,
+    )
+
+    inputs_arr = _model_forward_tbo_split_inputs_raw(
+        hidden_states=hidden_states,
+        residual=residual,
+        positions=positions,
+        forward_batch=forward_batch,
+        zero_allocator=zero_allocator,
+    )
+
+    def _post_transform(hidden_states, residual, forward_batch, **kwargs):
+        hidden_states, residual = CommunicateSummableTensorPairFn.execute(
+            hidden_states_input_mode=tbo_splitter_scatter_mode,
+            residual_input_mode=tbo_splitter_scatter_mode,
+            output_mode=layer_input_scatter_mode,
+            hidden_states=hidden_states,
+            residual=residual,
+            forward_batch=forward_batch,
+            context=context,
+        )
+        return dict(
+            hidden_states=hidden_states,
+            residual=residual,
+            forward_batch=forward_batch,
+            **kwargs,
+        )
+
+    return [_post_transform(**inputs) for inputs in inputs_arr]
+
+
+def _model_forward_tbo_split_inputs_raw(
+    hidden_states: torch.Tensor,
+    residual: torch.Tensor,
+    positions: torch.Tensor,
+    forward_batch: ForwardBatch,
+    zero_allocator: Optional[BumpAllocator],
+) -> List[Dict]:
+    return [
+        dict(
+            **_model_forward_filter_inputs(
+                hidden_states=hidden_states,
+                residual=residual,
+                positions=positions,
+                output_forward_batch=output_forward_batch,
+                tbo_subbatch_index=tbo_subbatch_index,
+            ),
+            **(
+                dict(zero_allocator=zero_allocator)
+                if zero_allocator is not None
+                else {}
+            ),
+        )
+        for tbo_subbatch_index, output_forward_batch in enumerate(
+            forward_batch.tbo_children
+        )
+    ]
+
+
+def _model_forward_filter_inputs(
+    hidden_states: torch.Tensor,
+    residual: torch.Tensor,
+    positions: torch.Tensor,
+    output_forward_batch: ForwardBatch,
+    tbo_subbatch_index: int,
+) -> Dict:
+    token_slice = slice(*output_forward_batch.tbo_parent_token_range)
+    hidden_states = hidden_states[token_slice]
+    residual = None if residual is None else residual[token_slice]
+    positions = positions[token_slice]
+
+    assert output_forward_batch.tbo_padded_len is not None
+    padded_len = output_forward_batch.tbo_padded_len
+
+    def _pad(x):
+        nonlocal padded_len
+        if x is None:
+            return None
+        if x.shape[0] == padded_len:
+            return x
+        res = torch.zeros((padded_len, *x.shape[1:]), dtype=x.dtype, device=x.device)
+        res[: x.shape[0]] = x
+        return res
+
+    return dict(
+        hidden_states=_pad(hidden_states),
+        residual=_pad(residual),
+        positions=_pad(positions),
+        forward_batch=output_forward_batch,
+        tbo_subbatch_index=tbo_subbatch_index,
+    )
+
+
+def _model_forward_tbo_merge_outputs(output_a, output_b, original_len):
+    def _handle_key(name):
+        value_a = output_a[name]
+        value_b = output_b[name]
+        assert (value_a is None) == (value_b is None)
+        if value_a is None:
+            return None
+        s0, t0 = output_a["forward_batch"].tbo_parent_token_range
+        s1, t1 = output_b["forward_batch"].tbo_parent_token_range
+        res = torch.zeros(
+            (original_len, *value_a.shape[1:]),
+            dtype=value_a.dtype,
+            device=value_a.device,
+        )
+        res[slice(s0, t0)] = value_a[: t0 - s0]
+        res[slice(s1, t1)] = value_b[: t1 - s1]
+        return res
+
+    return _handle_key("hidden_states"), _handle_key("residual")
+
+
+# -------------------------------- Utilities and wrappers ---------------------------------------
+
+
+class MaybeTboDeepEPDispatcher(BaseDispatcher):
+    def __init__(self, **kwargs):
+        super().__init__()
+        num_inner_dispatchers = 2 if is_tbo_enabled() else 1
+        if get_moe_a2a_backend().is_deepep():
+            self._inners = [
+                DeepEPDispatcher(**kwargs) for _ in range(num_inner_dispatchers)
+            ]
+        elif get_moe_a2a_backend().is_mooncake():
+            self._inners = [
+                MooncakeEPDispatcher(**kwargs) for _ in range(num_inner_dispatchers)
+            ]
+        elif get_moe_a2a_backend().is_mori():
+            self._inners = [
+                MoriEPDispatcher(**kwargs) for _ in range(num_inner_dispatchers)
+            ]
+
+    def _execute(self, name, tbo_subbatch_index: Optional[int] = None, **kwargs):
+        return getattr(self._inners[tbo_subbatch_index or 0], name)(**kwargs)
+
+    def dispatch(self, **kwargs) -> DispatchOutput:
+        return self._execute("dispatch", **kwargs)
+
+    def dispatch_a(self, **kwargs):
+        return self._execute("dispatch_a", **kwargs)
+
+    def dispatch_b(self, **kwargs):
+        return self._execute("dispatch_b", **kwargs)
+
+    def combine(self, **kwargs) -> torch.Tensor:
+        return self._execute("combine", **kwargs)
+
+    def combine_a(self, **kwargs):
+        return self._execute("combine_a", **kwargs)
+
+    def combine_b(self, **kwargs):
+        return self._execute("combine_b", **kwargs)
+
+    def register_deepep_dispatch_hook(self, hook):
+        handle_list = []
+        for inner in self._inners:
+            handle_list.append(inner.register_deepep_dispatch_hook(hook))
+        return handle_list
+
+    def set_quant_config(self, quant_config: dict):
+        super().set_quant_config(quant_config)
+        for inner in self._inners:
+            inner.set_quant_config(quant_config)
+
+    def set_overlap_args(
+        self, combine_overlap_args: CombineOverlapArgs, meta_overlap_args: dict
+    ):
+        super().set_overlap_args(combine_overlap_args, meta_overlap_args)
+        for inner in self._inners:
+            inner.set_overlap_args(combine_overlap_args, meta_overlap_args)
+
+    def clear_overlap_args(self):
+        super().clear_overlap_args()
+        for inner in self._inners:
+            inner.clear_overlap_args()
diff --git a/sglang/python/sglang/srt/checkpoint_engine/__init__.py b/sglang/python/sglang/srt/checkpoint_engine/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8a77f905b1b5f686ab07b1cb8e84ca8441270b6
--- /dev/null
+++ b/sglang/python/sglang/srt/checkpoint_engine/__init__.py
@@ -0,0 +1,9 @@
+"""
+Checkpoint engine module for SGLang.
+
+This module provides functionality for updating model weights via checkpoint engine.
+"""
+
+from sglang.srt.checkpoint_engine.update import main
+
+__all__ = ["main"]
diff --git a/sglang/python/sglang/srt/checkpoint_engine/checkpoint_engine_worker.py b/sglang/python/sglang/srt/checkpoint_engine/checkpoint_engine_worker.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f11c7872540f3dd8235d9e04b61bed48df523f3
--- /dev/null
+++ b/sglang/python/sglang/srt/checkpoint_engine/checkpoint_engine_worker.py
@@ -0,0 +1,143 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+Checkpoint-engine integration for SGLang.
+This module provides weight update functionality via IPC for checkpoint-engine compatibility.
+"""
+
+import logging
+from typing import Callable, Dict, Optional
+
+import torch
+import zmq
+
+try:
+    from checkpoint_engine.worker import update_weights_from_ipc
+except ImportError:
+    raise ImportError(
+        "checkpoint-engine is not installed. "
+        "Please install it with: pip install sglang[checkpoint-engine]"
+    )
+
+logger = logging.getLogger(__name__)
+
+
+class SGLangCheckpointEngineWorkerExtension:
+    """
+    Worker extension for SGLang to support checkpoint-engine IPC weight updates.
+    This class provides the interface needed for checkpoint-engine integration.
+    """
+
+    def __init__(self):
+        self._zmq_ctx: Optional[zmq.Context] = None
+
+    def get_device_uuid(self) -> str:
+        """Get the UUID of current device."""
+        # We need to implement this to get the device UUID
+        # This will be overridden when integrated into SGLang's worker
+        raise NotImplementedError(
+            "This method should be overridden by SGLang integration"
+        )
+
+    def get_device_id(self) -> int:
+        """Get the device ID."""
+        raise NotImplementedError(
+            "This method should be overridden by SGLang integration"
+        )
+
+    def get_model_loader(self) -> Callable:
+        """Get the model weight loader function."""
+        raise NotImplementedError(
+            "This method should be overridden by SGLang integration"
+        )
+
+    def get_post_hook(self) -> Optional[Callable]:
+        """Get the post-processing hook after weight loading."""
+        return None
+
+    def update_weights_from_ipc(self, zmq_handles: Dict[str, str]):
+        """
+        Update weights from IPC communication.
+        Args:
+            zmq_handles: Dict mapping device UUID to ZMQ socket path
+        """
+        if self._zmq_ctx is None:
+            self._zmq_ctx = zmq.Context()
+        device_uuid = self.get_device_uuid()
+        device_id = self.get_device_id()
+        if device_uuid not in zmq_handles:
+            raise ValueError(
+                f"Device UUID {device_uuid} not found in zmq_handles: {list(zmq_handles.keys())}"
+            )
+        update_weights_from_ipc(
+            self._zmq_ctx,
+            zmq_handles[device_uuid],
+            device_id=device_id,
+            run=self.get_model_loader(),
+            post_hook=self.get_post_hook(),
+        )
+
+
+class SGLangCheckpointEngineWorkerExtensionImpl(SGLangCheckpointEngineWorkerExtension):
+    """
+    Implementation of SGLangCheckpointEngineWorkerExtension that integrates with SGLang's model runner.
+    This class provides the concrete implementation for checkpoint-engine IPC weight updates.
+    """
+
+    def __init__(self, model_runner):
+        super().__init__()
+        self.model_runner = model_runner
+
+    def get_device_uuid(self) -> str:
+        """Get the UUID of current device."""
+        # Get device UUID for current device
+        device_id = torch.cuda.current_device()
+        try:
+            return f"GPU-{torch.cuda.get_device_properties(device_id).uuid!s}"
+        except AssertionError as e:
+            raise ValueError(f"Failed to get GPU UUID for device {device_id}") from e
+
+    def get_device_id(self) -> int:
+        """Get the device ID."""
+        return torch.cuda.current_device()
+
+    def get_model_loader(self) -> Callable:
+        """Get the model weight loader function."""
+        return self.model_runner.model.load_weights
+
+    def get_post_hook(self) -> Optional[Callable]:
+        """Get the post-processing hook after weight loading."""
+
+        def post_hook():
+            # Perform post-processing after weight loading similar to DefaultModelLoader
+            try:
+                from sglang.srt.model_loader.loader import device_loading_context
+
+                # Process quantization methods after loading weights
+                for _, module in self.model_runner.model.named_modules():
+                    quant_method = getattr(module, "quant_method", None)
+                    if quant_method is not None:
+                        # Move parameters to device if needed for quantization processing
+                        target_device = torch.device(
+                            "cuda", torch.cuda.current_device()
+                        )
+                        with device_loading_context(module, target_device):
+                            quant_method.process_weights_after_loading(module)
+                # Call model-specific post-loading hook if available
+                if hasattr(self.model_runner.model, "post_load_weights"):
+                    self.model_runner.model.post_load_weights()
+            except Exception as e:
+                logger.warning(f"Post-hook processing failed: {e}")
+
+        return post_hook
diff --git a/sglang/python/sglang/srt/checkpoint_engine/update.py b/sglang/python/sglang/srt/checkpoint_engine/update.py
new file mode 100644
index 0000000000000000000000000000000000000000..93c8b4b6e4c19553ef8357b8e25d55099a4a885a
--- /dev/null
+++ b/sglang/python/sglang/srt/checkpoint_engine/update.py
@@ -0,0 +1,317 @@
+"""
+Usage:
+1) Launch the server with wait-for-initial-weights option in one terminal:
+   python -m sglang.launch_server --model-path /workspace/Qwen/Qwen3-4B/ --tensor-parallel-size 2 --port 19730 --load-format dummy --checkpoint-engine-wait-weights-before-ready --mem-fraction-static 0.7
+
+2) Torchrun this script in another terminal:
+    torchrun --nproc-per-node 2 update.py --update-method broadcast --checkpoint-path /workspace/Qwen/Qwen3-4B/  --inference-parallel-size 2
+
+Or use the integrated entry point:
+    python -m sglang.srt.checkpoint_engine.update --update-method broadcast --checkpoint-path /workspace/Qwen/Qwen3-4B/  --inference-parallel-size 2
+"""
+
+import argparse
+import json
+import os
+import pickle
+import subprocess
+import sys
+import time
+from collections import defaultdict
+from collections.abc import Callable
+from contextlib import contextmanager
+from typing import Literal
+
+import httpx
+import torch
+import torch.distributed as dist
+from safetensors import safe_open
+
+try:
+    from checkpoint_engine.ps import ParameterServer
+    from loguru import logger
+except ImportError:
+    # Fallback for when checkpoint_engine is not available
+    ParameterServer = None
+    import logging
+
+    logger = logging.getLogger(__name__)
+
+
+@contextmanager
+def timer(msg: str):
+    start = time.perf_counter()
+    yield
+    end = time.perf_counter()
+    logger.info(f"{msg} duration: {end - start:.2f} seconds")
+
+
+def check_sglang_ready(
+    endpoint: str, inference_parallel_size: int, uds: str | None = None
+):
+    rank = int(os.getenv("RANK", 0))
+    if rank != rank // inference_parallel_size * inference_parallel_size:
+        return
+    retry_num = 0
+    transport = None
+    if uds is not None:
+        transport = httpx.HTTPTransport(uds=uds)
+    with httpx.Client(transport=transport) as client:
+        while True:
+            try:
+                response = client.get(f"{endpoint}/ping", timeout=10)
+                response.raise_for_status()
+                break
+            except (httpx.ConnectError, httpx.HTTPStatusError) as e:
+                if retry_num % 10 == 0:
+                    logger.warning(
+                        f"fail to check sglang ready, retry {retry_num} times, error: {e}"
+                    )
+                retry_num += 1
+                time.sleep(0.1)
+
+
+def split_checkpoint_files(
+    checkpoint_path: str, rank: int, world_size: int
+) -> list[str]:
+    checkpoint_files = [
+        os.path.join(checkpoint_path, f)
+        for f in filter(
+            lambda x: x.endswith(".safetensors"), os.listdir(checkpoint_path)
+        )
+    ]
+    files_per_rank = (len(checkpoint_files) + world_size - 1) // world_size
+    return checkpoint_files[rank * files_per_rank : (rank + 1) * files_per_rank]
+
+
+def split_tensors(
+    checkpoint_path: str, rank: int, world_size: int
+) -> dict[str, torch.Tensor]:
+    index_fn = os.path.join(checkpoint_path, "model.safetensors.index.json")
+    with open(index_fn) as f:
+        weight_map: dict[str, str] = json.load(f)["weight_map"]
+    weights_per_rank = (len(weight_map) + world_size - 1) // world_size
+    fn_tensors: dict[str, list[str]] = defaultdict(list)
+    weight_keys = list(weight_map.items())
+    for name, file in weight_keys[
+        rank * weights_per_rank : (rank + 1) * weights_per_rank
+    ]:
+        fn_tensors[file].append(name)
+    named_tensors = {}
+    for file, names in fn_tensors.items():
+        with safe_open(os.path.join(checkpoint_path, file), framework="pt") as f:
+            for name in names:
+                named_tensors[name] = f.get_tensor(name)
+    return named_tensors
+
+
+def req_inference(
+    endpoint: str,
+    inference_parallel_size: int,
+    timeout: float = 300.0,
+    uds: str | None = None,
+    weight_version: str | None = None,
+) -> Callable[[list[tuple[str, str]]], None]:
+    rank = int(os.getenv("RANK", 0))
+    src = rank // inference_parallel_size * inference_parallel_size
+
+    def req_func(socket_paths: list[tuple[str, str]]):
+        if rank == src:
+            with httpx.Client(transport=httpx.HTTPTransport(uds=uds)) as client:
+                resp = client.post(
+                    f"{endpoint}/update_weights_from_ipc",
+                    json={
+                        "zmq_handles": dict(
+                            socket_paths[src : src + inference_parallel_size]
+                        ),
+                        "flush_cache": True,
+                        "weight_version": weight_version,
+                    },
+                    timeout=timeout,
+                )
+                resp.raise_for_status()
+
+    return req_func
+
+
+def update_weights(
+    ps,
+    checkpoint_name: str,
+    checkpoint_files: list[str],
+    named_tensors: dict[str, torch.Tensor],
+    req_func: Callable[[list[tuple[str, str]]], None],
+    inference_parallel_size: int,
+    endpoint: str,
+    save_metas_file: str | None = None,
+    update_method: Literal["broadcast", "p2p", "all"] = "broadcast",
+    uds: str | None = None,
+):
+    ps.register_checkpoint(
+        checkpoint_name, files=checkpoint_files, named_tensors=named_tensors
+    )
+    ps.init_process_group()
+    check_sglang_ready(endpoint, inference_parallel_size, uds)
+    dist.barrier()
+    with timer("Gather metas"):
+        ps.gather_metas(checkpoint_name)
+    if save_metas_file and int(os.getenv("RANK")) == 0:
+        with open(save_metas_file, "wb") as f:
+            pickle.dump(ps.get_metas(), f)
+
+    if update_method == "broadcast" or update_method == "all":
+        with timer("Update weights without setting ranks"):
+            ps.update(checkpoint_name, req_func)
+
+    if update_method == "p2p" or update_method == "all":
+        if update_method:
+            # sleep 2s to wait destroy process group
+            time.sleep(2)
+        with timer("Update weights with setting ranks"):
+            ps.update(
+                checkpoint_name, req_func, ranks=list(range(inference_parallel_size))
+            )
+
+
+def join(
+    ps: ParameterServer,
+    checkpoint_name: str,
+    load_metas_file: str,
+    req_func: Callable[[list[tuple[str, str]]], None],
+    inference_parallel_size: int,
+    endpoint: str,
+    uds: str | None = None,
+):
+    assert load_metas_file, "load_metas_file is required"
+    with open(load_metas_file, "rb") as f:
+        metas = pickle.load(f)
+    ps.init_process_group()
+    check_sglang_ready(endpoint, inference_parallel_size, uds)
+    dist.barrier()
+    with timer("Gather metas before join"):
+        ps.gather_metas(checkpoint_name)
+    ps.load_metas(metas)
+    with timer(
+        f"Update weights with setting ranks as range(0, {inference_parallel_size}) by using p2p"
+    ):
+        ps.update(checkpoint_name, req_func, ranks=list(range(inference_parallel_size)))
+
+
+def run_with_torchrun():
+    """Run the update script with torchrun automatically."""
+    # Parse inference_parallel_size from command line arguments to determine nproc-per-node
+    inference_parallel_size = 8  # default
+    args = sys.argv[1:]  # Skip the script name
+
+    # Look for --inference-parallel-size in arguments
+    for i, arg in enumerate(args):
+        if arg == "--inference-parallel-size" and i + 1 < len(args):
+            try:
+                inference_parallel_size = int(args[i + 1])
+            except ValueError:
+                pass
+            break
+        elif arg.startswith("--inference-parallel-size="):
+            try:
+                inference_parallel_size = int(arg.split("=", 1)[1])
+            except ValueError:
+                pass
+            break
+
+    # Build torchrun command
+    cmd = ["torchrun", f"--nproc-per-node={inference_parallel_size}", __file__] + args
+
+    print(f"Running: {' '.join(cmd)}", file=sys.stderr)
+
+    # Execute torchrun with the original script
+    try:
+        result = subprocess.run(cmd, check=False)
+        sys.exit(result.returncode)
+    except FileNotFoundError:
+        print(
+            "Error: torchrun command not found. Please ensure PyTorch is installed.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+    except KeyboardInterrupt:
+        print("\nInterrupted by user", file=sys.stderr)
+        sys.exit(130)
+
+
+def main():
+    # Check if we're running under torchrun or need to invoke it
+    if os.getenv("RANK") is None:
+        # Not running under torchrun, so invoke it
+        run_with_torchrun()
+        return
+
+    # Running under torchrun, proceed with normal execution
+    parser = argparse.ArgumentParser(description="Update weights example")
+    parser.add_argument("--checkpoint-path", type=str, default=None)
+    parser.add_argument("--save-metas-file", type=str, default=None)
+    parser.add_argument("--load-metas-file", type=str, default=None)
+    parser.add_argument("--sleep-time", type=int, default=0)
+    parser.add_argument("--endpoint", type=str, default="http://localhost:19730")
+    parser.add_argument("--inference-parallel-size", type=int, default=8)
+    parser.add_argument("--checkpoint-name", type=str, default="my-checkpoint-iter-0")
+    parser.add_argument("--update-method", type=str, default="broadcast")
+    parser.add_argument("--uds", type=str, default=None)
+    parser.add_argument("--weight-version", type=str, default=None)
+    args = parser.parse_args()
+
+    # Get rank and world_size from environment (set by torchrun)
+    rank = int(os.getenv("RANK", 0))
+    world_size = int(os.getenv("WORLD_SIZE", 1))
+
+    req_func = req_inference(
+        args.endpoint,
+        args.inference_parallel_size,
+        uds=args.uds,
+        weight_version=args.weight_version,
+    )
+
+    if ParameterServer is None:
+        print("Error: checkpoint_engine package not available", file=sys.stderr)
+        sys.exit(1)
+
+    ps = ParameterServer(auto_pg=True)
+    ps._p2p_store = None
+    if args.load_metas_file:
+        join(
+            ps,
+            args.checkpoint_name,
+            args.load_metas_file,
+            req_func,
+            args.inference_parallel_size,
+            args.endpoint,
+            args.uds,
+        )
+    else:
+        if args.checkpoint_path and os.path.exists(
+            os.path.join(args.checkpoint_path, "model.safetensors.index.json")
+        ):
+            named_tensors = split_tensors(args.checkpoint_path, rank, world_size)
+            checkpoint_files = []
+        else:
+            checkpoint_files = (
+                split_checkpoint_files(args.checkpoint_path, rank, world_size)
+                if args.checkpoint_path
+                else []
+            )
+            named_tensors = {}
+        update_weights(
+            ps,
+            args.checkpoint_name,
+            checkpoint_files,
+            named_tensors,
+            req_func,
+            args.inference_parallel_size,
+            args.endpoint,
+            args.save_metas_file,
+            args.update_method,
+            args.uds,
+        )
+    time.sleep(args.sleep_time)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/sglang/python/sglang/srt/compilation/__pycache__/compilation_config.cpython-311.pyc b/sglang/python/sglang/srt/compilation/__pycache__/compilation_config.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..20c7f53376a588252aa6e142a68c1317a04610bf
Binary files /dev/null and b/sglang/python/sglang/srt/compilation/__pycache__/compilation_config.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/compilation/__pycache__/compile.cpython-311.pyc b/sglang/python/sglang/srt/compilation/__pycache__/compile.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ed919433bed365d558c6d9e8bae4126c61840798
Binary files /dev/null and b/sglang/python/sglang/srt/compilation/__pycache__/compile.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/compilation/__pycache__/piecewise_context_manager.cpython-311.pyc b/sglang/python/sglang/srt/compilation/__pycache__/piecewise_context_manager.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..45be632209579f32e88249ae7e34c8c0ab793329
Binary files /dev/null and b/sglang/python/sglang/srt/compilation/__pycache__/piecewise_context_manager.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/compilation/backend.py b/sglang/python/sglang/srt/compilation/backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..8af025707f558fe2834431acdff73b80b38944fb
--- /dev/null
+++ b/sglang/python/sglang/srt/compilation/backend.py
@@ -0,0 +1,472 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.10.0/vllm/compilation/backend.py
+
+
+import ast
+import dataclasses
+import logging
+import os
+import pprint
+import time
+from collections.abc import Sequence
+from contextlib import contextmanager
+from typing import Any, Callable, Optional
+
+import torch
+import torch.fx as fx
+from torch._dispatch.python import enable_python_dispatcher
+
+from sglang.srt.compilation.compilation_config import CompilationConfig
+from sglang.srt.compilation.compilation_counter import compilation_counter
+from sglang.srt.compilation.compiler_interface import EagerAdapter, InductorAdaptor
+from sglang.srt.compilation.cuda_piecewise_backend import CUDAPiecewiseBackend
+from sglang.srt.compilation.npu_piecewise_backend import NPUPiecewiseBackend
+from sglang.srt.compilation.pass_manager import PostGradPassManager
+from sglang.srt.utils.common import is_npu, rank0_log
+
+logger = logging.getLogger(__name__)
+
+
+def make_compiler(config: CompilationConfig):
+    if config.compiler == "eager":
+        return EagerAdapter()
+    elif config.compiler == "inductor":
+        return InductorAdaptor()
+    else:
+        raise ValueError(f"Unknown compiler: {config.compiler}")
+
+
+def make_backend(
+    graph: fx.GraphModule,
+    compile_config: CompilationConfig,
+    inductor_config: dict[str, Any],
+    graph_pool: Any,
+    piecewise_compile_index: int,
+    total_piecewise_compiles: int,
+    sym_shape_indices: list[int],
+    compiled_graph_for_general_shape: Callable,
+    sglang_backend,
+):
+
+    backend_cls = CUDAPiecewiseBackend if not is_npu() else NPUPiecewiseBackend
+    return backend_cls(
+        graph,
+        compile_config,
+        inductor_config,
+        graph_pool,
+        piecewise_compile_index,
+        total_piecewise_compiles,
+        sym_shape_indices,
+        compiled_graph_for_general_shape,
+        sglang_backend,
+    )
+
+
+class CompilerManager:
+    def __init__(
+        self,
+        config: CompilationConfig,
+    ):
+        self.cache = dict()
+        self.is_cache_updated = False
+        self.compiler = make_compiler(config)
+
+    def compute_hash(self):
+        return self.compiler.compute_hash()
+
+    def initialize_cache(
+        self, cache_dir: str, disable_cache: bool = False, prefix: str = ""
+    ):
+        self.disable_cache = disable_cache
+        self.cache_dir = cache_dir
+        self.cache_file_path = os.path.join(cache_dir, "sglang_compile_cache.py")
+
+        if not disable_cache and os.path.exists(self.cache_file_path):
+            with open(self.cache_file_path) as f:
+                self.cache = ast.literal_eval(f.read())
+
+        self.compiler.initialize_cache(
+            cache_dir=cache_dir, disable_cache=disable_cache, prefix=prefix
+        )
+
+    def save_to_file(self):
+        if self.disable_cache or not self.is_cache_updated:
+            return
+        printer = pprint.PrettyPrinter(indent=4)
+        data = printer.pformat(self.cache)
+        with open(self.cache_file_path, "w") as f:
+            f.write(data)
+
+    def load(
+        self,
+        graph: fx.GraphModule,
+        example_inputs: list[Any],
+        graph_index: int,
+        runtime_shape: Optional[int] = None,
+    ) -> Optional[Callable]:
+        handle = self.cache[(runtime_shape, graph_index, self.compiler.name)]
+        compiled_graph = self.compiler.load(
+            handle, graph, example_inputs, graph_index, runtime_shape
+        )
+        if runtime_shape is None:
+            logger.debug(
+                "Directly load the %s-th graph for dynamic shape from %s via "
+                "handle %s",
+                graph_index,
+                self.compiler.name,
+                handle,
+            )
+        else:
+            logger.debug(
+                "Directly load the %s-th graph for shape %s from %s via " "handle %s",
+                graph_index,
+                str(runtime_shape),
+                self.compiler.name,
+                handle,
+            )
+        return compiled_graph
+
+    def compile(
+        self,
+        graph: fx.GraphModule,
+        example_inputs,
+        inductor_config: dict[str, Any],
+        graph_index: int = 0,
+        num_graphs: int = 1,
+        runtime_shape: Optional[int] = None,
+    ) -> Any:
+        if graph_index == 0:
+            # before compiling the first graph, record the start time
+            global compilation_start_time
+            compilation_start_time = time.time()
+
+        compilation_counter.num_backend_compilations += 1
+
+        compiled_graph = None
+
+        # TODO(Yuwei): support cache loading
+
+        # no compiler cached the graph, or the cache is disabled,
+        # we need to compile it
+        if isinstance(self.compiler, InductorAdaptor):
+            maybe_key = None
+        else:
+            maybe_key = f"artifact_shape_{runtime_shape}_subgraph_{graph_index}"
+        compiled_graph, handle = self.compiler.compile(
+            graph, example_inputs, inductor_config, runtime_shape, maybe_key
+        )
+
+        assert compiled_graph is not None, "Failed to compile the graph"
+
+        # store the artifact in the cache
+        if handle is not None:
+            self.cache[(runtime_shape, graph_index, self.compiler.name)] = handle
+            compilation_counter.num_cache_entries_updated += 1
+            self.is_cache_updated = True
+            if graph_index == 0:
+                # adds some info logging for the first graph
+                if runtime_shape is None:
+                    logger.info("Cache the graph for dynamic shape for later use")
+                else:
+                    logger.info(
+                        "Cache the graph of shape %s for later use", str(runtime_shape)
+                    )
+            if runtime_shape is None:
+                logger.debug(
+                    "Store the %s-th graph for dynamic shape from %s via " "handle %s",
+                    graph_index,
+                    self.compiler.name,
+                    handle,
+                )
+            else:
+                logger.debug(
+                    "Store the %s-th graph for shape %s from %s via handle %s",
+                    graph_index,
+                    str(runtime_shape),
+                    self.compiler.name,
+                    handle,
+                )
+
+        # after compiling the last graph, record the end time
+        if graph_index == num_graphs - 1:
+            now = time.time()
+            elapsed = now - compilation_start_time
+            if runtime_shape is None:
+                logger.info("Compiling a graph for dynamic shape takes %.2f s", elapsed)
+            else:
+                logger.info(
+                    "Compiling a graph for shape %s takes %.2f s",
+                    runtime_shape,
+                    elapsed,
+                )
+
+        return compiled_graph
+
+
+@dataclasses.dataclass
+class SplitItem:
+    submod_name: str
+    graph_id: int
+    is_splitting_graph: bool
+    graph: fx.GraphModule
+
+
+def split_graph(
+    graph: fx.GraphModule, ops: list[str]
+) -> tuple[fx.GraphModule, list[SplitItem]]:
+    # split graph by ops
+    subgraph_id = 0
+    node_to_subgraph_id = {}
+    split_op_graphs = []
+    for node in graph.graph.nodes:
+        if node.op in ("output", "placeholder"):
+            continue
+        if node.op == "call_function" and str(node.target) in ops:
+            subgraph_id += 1
+            node_to_subgraph_id[node] = subgraph_id
+            split_op_graphs.append(subgraph_id)
+            subgraph_id += 1
+        else:
+            node_to_subgraph_id[node] = subgraph_id
+
+    # `keep_original_order` is important!
+    # otherwise pytorch might reorder the nodes and
+    # the semantics of the graph will change when we
+    # have mutations in the graph
+    split_gm = torch.fx.passes.split_module.split_module(
+        graph, None, lambda node: node_to_subgraph_id[node], keep_original_order=True
+    )
+
+    outputs = []
+
+    names = [name for (name, module) in split_gm.named_modules()]
+
+    for name in names:
+        if "." in name or name == "":
+            # recursive child module or the root module
+            continue
+
+        module = getattr(split_gm, name)
+
+        graph_id = int(name.replace("submod_", ""))
+        outputs.append(SplitItem(name, graph_id, (graph_id in split_op_graphs), module))
+
+    # sort by intetger graph_id, rather than string name
+    outputs.sort(key=lambda x: x.graph_id)
+
+    return split_gm, outputs
+
+
+# we share the global graph pool among all the backends
+global_graph_pool = None
+
+compilation_start_time = 0.0
+
+
+class PiecewiseCompileInterpreter(torch.fx.Interpreter):
+    def __init__(
+        self,
+        module: torch.fx.GraphModule,
+        compile_submod_names: list[str],
+        inductor_config: dict[str, Any],
+        graph_pool,
+        compile_config: CompilationConfig,
+        sglang_backend: "SGLangBackend",
+    ):
+        super().__init__(module)
+        from torch._guards import detect_fake_mode
+
+        self.fake_mode = detect_fake_mode()
+        self.compile_submod_names = compile_submod_names
+        self.graph_pool = graph_pool
+        self.sglang_backend = sglang_backend
+        # When True, it annoyingly dumps the torch.fx.Graph on errors.
+        self.extra_traceback = False
+        self.inductor_config = inductor_config
+        self.compile_config = compile_config
+
+    def run(self, *args):
+        fake_args = [
+            self.fake_mode.from_tensor(t) if isinstance(t, torch.Tensor) else t
+            for t in args
+        ]
+        with self.fake_mode, enable_python_dispatcher():
+            return super().run(*fake_args)
+
+    def call_module(
+        self,
+        target: torch.fx.node.Target,
+        args: tuple[torch.fx.node.Argument, ...],
+        kwargs: dict[str, Any],
+    ) -> Any:
+        assert isinstance(target, str)
+        output = super().call_module(target, args, kwargs)
+
+        if target in self.compile_submod_names:
+            index = self.compile_submod_names.index(target)
+            submod = self.fetch_attr(target)
+            sym_shape_indices = [
+                i for i, x in enumerate(args) if isinstance(x, torch.SymInt)
+            ]
+            global compilation_start_time
+            compiled_graph_for_dynamic_shape = (
+                self.sglang_backend.compiler_manager.compile(
+                    submod,
+                    args,
+                    self.inductor_config,
+                    graph_index=index,
+                    num_graphs=len(self.compile_submod_names),
+                    runtime_shape=None,
+                )
+            )
+
+            self.module.__dict__[target] = make_backend(
+                submod,
+                self.compile_config,
+                self.inductor_config,
+                self.graph_pool,
+                index,
+                len(self.compile_submod_names),
+                sym_shape_indices,
+                compiled_graph_for_dynamic_shape,
+                self.sglang_backend,
+            )
+
+            compilation_counter.num_piecewise_capturable_graphs_seen += 1
+
+        return output
+
+
+model_tag: str = "backbone"
+
+
+@contextmanager
+def set_model_tag(tag: str):
+    """Context manager to set the model tag."""
+    global model_tag
+    assert (
+        tag != model_tag
+    ), f"Model tag {tag} is the same as the current tag {model_tag}."
+    old_tag = model_tag
+    model_tag = tag
+    try:
+        yield
+    finally:
+        model_tag = old_tag
+
+
+class SGLangBackend:
+
+    graph_pool: Any
+    _called: bool = False
+    # the graph we compiled
+    graph: fx.GraphModule
+    # the stiching graph module for all the piecewise graphs
+    split_gm: fx.GraphModule
+    piecewise_graphs: list[SplitItem]
+    returned_callable: Callable
+    # Inductor passes to run on the graph pre-defunctionalization
+    post_grad_passes: Sequence[Callable]
+    sym_tensor_indices: list[int]
+    input_buffers: list[torch.Tensor]
+    compiler_manager: CompilerManager
+
+    def __init__(
+        self,
+        config: CompilationConfig,
+        graph_pool: Any,
+    ):
+        rank0_log(f"Initializing SGLangBackend")
+        assert graph_pool is not None
+        self.graph_pool = graph_pool
+
+        self.post_grad_pass_manager = PostGradPassManager()
+        self.sym_tensor_indices = []
+        self.input_buffers = []
+
+        self.compiler_manager = CompilerManager(config)
+        self.inductor_config = {
+            "enable_auto_functionalized_v2": False,
+        }
+        self.compile_config = config
+
+    def configure_post_pass(self):
+        self.post_grad_pass_manager.configure()
+        self.inductor_config["post_grad_custom_post_pass"] = self.post_grad_pass_manager
+
+    def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
+        rank0_log(f"SGLangBackend __call__")
+        base_cache_dir = os.path.expanduser(
+            os.getenv("SGLANG_CACHE_DIR", "~/.cache/sglang/")
+        )
+
+        cache_hash = self.compiler_manager.compute_hash()
+        cache_dir = os.path.join(
+            base_cache_dir,
+            "torch_compile_cache",
+            cache_hash,
+        )
+
+        os.makedirs(cache_dir, exist_ok=True)
+        rank = 0
+        dp_rank = 0
+        local_cache_dir = os.path.join(cache_dir, f"rank_{rank}_{dp_rank}", model_tag)
+        os.makedirs(local_cache_dir, exist_ok=True)
+        self.compiler_manager.initialize_cache(
+            local_cache_dir, disable_cache=False, prefix=""
+        )
+        compilation_counter.num_graphs_seen += 1
+
+        assert not self._called, "SGLangBackend can only be called once"
+
+        self.graph = graph
+        self.configure_post_pass()
+
+        self.split_gm, self.piecewise_graphs = split_graph(
+            graph,
+            self.compile_config.split_ops,
+        )
+        from torch._dynamo.utils import lazy_format_graph_code
+
+        # depyf will hook lazy_format_graph_code and dump the graph
+        # for debugging, no need to print the graph here
+        lazy_format_graph_code("before split", self.graph)
+        lazy_format_graph_code("after split", self.split_gm)
+
+        compilation_counter.num_piecewise_graphs_seen += len(self.piecewise_graphs)
+
+        submod_names_to_compile = [
+            item.submod_name
+            for item in self.piecewise_graphs
+            if not item.is_splitting_graph
+        ]
+
+        PiecewiseCompileInterpreter(
+            self.split_gm,
+            submod_names_to_compile,
+            self.inductor_config,
+            self.graph_pool,
+            self.compile_config,
+            self,
+        ).run(*example_inputs)
+
+        rank = torch.distributed.get_rank()
+
+        if rank == 0:
+            graph_path = os.path.join(
+                local_cache_dir, f"computation_graph_{time.time()}.py"
+            )
+            if not os.path.exists(graph_path):
+                # code adapted from https://github.com/thuml/depyf/blob/dab831108a752d1facc00acdd6d4243891845c37/depyf/explain/patched_lazy_format_graph_code.py#L30 # noqa
+                # use `print_readable` because it can include submodules
+                src = (
+                    "from __future__ import annotations\nimport torch\n"
+                    + self.split_gm.print_readable(print_output=False)
+                )
+                src = src.replace("<lambda>", "GraphModule")
+                with open(graph_path, "w") as f:
+                    f.write(src)
+
+                rank0_log(f"Computation graph saved to {graph_path}")
+
+        self._called = True
+        return self.split_gm
diff --git a/sglang/python/sglang/srt/compilation/compilation_config.py b/sglang/python/sglang/srt/compilation/compilation_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..0388bbedac0637a01f1d897a89c184a042a41752
--- /dev/null
+++ b/sglang/python/sglang/srt/compilation/compilation_config.py
@@ -0,0 +1,45 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.10.0/vllm/compilation/compilation_config.py
+
+from typing import Callable, List, Optional
+
+SPLIT_OPS = []
+
+
+def register_split_op(op_name: Optional[str] = None):
+    def decorator(op_func: Callable):
+        name = op_name or op_func.__name__
+        SPLIT_OPS.append(f"sglang.{name}")
+        return op_func
+
+    return decorator
+
+
+# TODO(Yuwei): support better compile config support
+class CompilationConfig:
+    def __init__(
+        self,
+        capture_sizes: List[int],
+        compiler: str = "eager",
+        enable_debug_mode: bool = False,
+    ):
+        self.traced_files = set()
+        self.capture_sizes = capture_sizes
+        self.compiler = compiler
+        self.enable_debug_mode = enable_debug_mode
+        self.split_ops = []
+        self.split_ops.extend(SPLIT_OPS)
+
+    def add_split_op(self, op: str):
+        self.split_ops.append(op)
+
+    def add_traced_file(self, file_path: str):
+        self.traced_files.add(file_path)
+
+    def get_traced_files(self):
+        return self.traced_files
+
+    def get_capture_sizes(self):
+        return self.capture_sizes
+
+    def get_enable_debug_mode(self):
+        return self.enable_debug_mode
diff --git a/sglang/python/sglang/srt/compilation/compilation_counter.py b/sglang/python/sglang/srt/compilation/compilation_counter.py
new file mode 100644
index 0000000000000000000000000000000000000000..e973f8f2fc7d366c89410f93caa8eeaca360482d
--- /dev/null
+++ b/sglang/python/sglang/srt/compilation/compilation_counter.py
@@ -0,0 +1,47 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.10.0/vllm/compilation/compilation_counter.py
+
+import copy
+import dataclasses
+from contextlib import contextmanager
+
+
+@dataclasses.dataclass
+class CompilationCounter:
+    num_models_seen: int = 0
+    num_graphs_seen: int = 0
+    # including the splitting ops
+    num_piecewise_graphs_seen: int = 0
+    # not including the splitting ops
+    num_piecewise_capturable_graphs_seen: int = 0
+    num_backend_compilations: int = 0
+    # Number of gpu_model_runner attempts to trigger CUDAGraphs capture
+    num_gpu_runner_capture_triggers: int = 0
+    # Number of CUDAGraphs captured
+    num_cudagraph_captured: int = 0
+    # InductorAdapter.compile calls
+    num_inductor_compiles: int = 0
+    # EagerAdapter.compile calls
+    num_eager_compiles: int = 0
+    # The number of time vLLM's compiler cache entry was updated
+    num_cache_entries_updated: int = 0
+    # The number of standalone_compile compiled artifacts saved
+    num_compiled_artifacts_saved: int = 0
+    # Number of times a model was loaded with CompilationLevel.DYNAMO_AS_IS
+    dynamo_as_is_count: int = 0
+
+    def clone(self) -> "CompilationCounter":
+        return copy.deepcopy(self)
+
+    @contextmanager
+    def expect(self, **kwargs):
+        old = self.clone()
+        yield
+        for k, v in kwargs.items():
+            assert getattr(self, k) - getattr(old, k) == v, (
+                f"{k} not as expected, before it is {getattr(old, k)}"
+                f", after it is {getattr(self, k)}, "
+                f"expected diff is {v}"
+            )
+
+
+compilation_counter = CompilationCounter()
diff --git a/sglang/python/sglang/srt/compilation/compile.py b/sglang/python/sglang/srt/compilation/compile.py
new file mode 100644
index 0000000000000000000000000000000000000000..46a9240fb2596022bad424cbb9ea013fee515c8a
--- /dev/null
+++ b/sglang/python/sglang/srt/compilation/compile.py
@@ -0,0 +1,203 @@
+import inspect
+import logging
+import os
+import sys
+import types
+from dataclasses import dataclass
+from typing import Any, Callable, Optional, Union
+
+import torch
+
+from sglang.srt.compilation.compilation_config import CompilationConfig
+from sglang.srt.compilation.piecewise_context_manager import is_in_piecewise_cuda_graph
+from sglang.srt.utils.common import rank0_log
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class IntermediateTensors:
+    """For all pipeline stages except the last, we need to return the hidden
+    states and residuals to be sent to the next stage. This data structure
+    contains the hidden states and residuals for a request.
+
+    Each stage also needs to handle its own finished_sending and
+    finished_recving in case of kv transfer.
+    """
+
+    tensors: dict[str, torch.Tensor]
+    # [req_ids]
+    finished_sending: Optional[set[str]] = None
+    finished_recving: Optional[set[str]] = None
+
+    def __init__(self, tensors):
+        # manually define this function, so that
+        # Dynamo knows `IntermediateTensors()` comes from this file.
+        # Otherwise, dataclass will generate this function by evaluating
+        # a string, and we will lose the information about the source file.
+        self.tensors = tensors
+
+    def __getitem__(self, key: Union[str, slice]):
+        if isinstance(key, str):
+            return self.tensors[key]
+        elif isinstance(key, slice):
+            return self.__class__({k: v[key] for k, v in self.tensors.items()})
+
+    def __setitem__(self, key: str, value: torch.Tensor):
+        self.tensors[key] = value
+
+    def items(self):
+        return self.tensors.items()
+
+    def __len__(self):
+        return len(self.tensors)
+
+    def __eq__(self, other: object):
+        return isinstance(other, self.__class__) and self
+
+    def __repr__(self) -> str:
+        return f"IntermediateTensors(tensors={self.tensors})"
+
+
+def _normalize_dims(dims, ndim: int):
+    dims = [dims] if isinstance(dims, int) else list(dims)
+    return [d if d >= 0 else ndim + d for d in dims]
+
+
+class _MaybeIntermediateTensors:
+    """Duck-typed check to support your IntermediateTensors without importing."""
+
+    def __init__(self, obj):
+        self.is_intermediate = hasattr(obj, "tensors") and isinstance(
+            getattr(obj, "tensors"), dict
+        )
+        self.obj = obj
+
+
+def _mark_dynamic_on_value(val, dims):
+    if isinstance(val, torch.Tensor):
+        torch._dynamo.maybe_mark_dynamic(val, _normalize_dims(dims, val.ndim))
+    else:
+        mit = _MaybeIntermediateTensors(val)
+        if mit.is_intermediate:
+            for t in mit.obj.tensors.values():
+                torch._dynamo.maybe_mark_dynamic(t, _normalize_dims(dims, t.ndim))
+        # else: ignore (None or non-tensor)
+
+
+def _infer_dynamic_arg_dims_from_annotations(forward_fn):
+    sig = inspect.signature(forward_fn)
+    dyn = {}
+    for name, p in sig.parameters.items():
+        ann = p.annotation
+        # Accept torch.Tensor / Optional[torch.Tensor] / your IntermediateTensors types by name
+        if (
+            ann is torch.Tensor
+            or getattr(getattr(ann, "__args__", [None])[0], "__name__", "") == "Tensor"
+        ):
+            dyn[name] = 0
+        elif getattr(ann, "__name__", "") in ("IntermediateTensors",) or any(
+            getattr(a, "__name__", "") == "IntermediateTensors"
+            for a in getattr(ann, "__args__", [])
+        ):
+            dyn[name] = 0
+        elif ann == "torch.Tensor" or ann == "Optional[torch.Tensor]":
+            # For future import annotations (e.g. from __future__ import annotations), the annotation is a string
+            dyn[name] = 0
+    if not dyn:
+        raise ValueError("No dynamic dims inferred; pass dynamic_arg_dims explicitly.")
+    return dyn
+
+
+def install_torch_compiled(
+    module: torch.nn.Module,
+    *,
+    dynamic_arg_dims: dict[str, Union[int, list[int]]] | None = None,
+    backend_factory: Optional[Callable[[torch.fx.GraphModule, list], Callable]] = None,
+    compile_config: CompilationConfig = None,
+    fullgraph: bool = True,
+    graph_pool: Any = None,
+):
+    rank0_log(f"install_torch_compiled")
+    unbound_fwd = module.__class__.forward
+    if not callable(unbound_fwd):
+        raise TypeError("module.__class__.forward must be callable")
+    original_code = unbound_fwd.__code__
+
+    dyn_map = dynamic_arg_dims or _infer_dynamic_arg_dims_from_annotations(unbound_fwd)
+
+    if backend_factory is None:
+        from sglang.srt.compilation.backend import SGLangBackend
+
+        backend_factory = lambda gm, ex: SGLangBackend(compile_config, graph_pool)(
+            gm, ex
+        )
+
+    compiled_codes: list[type(original_code)] = []
+    state = {"compiled": False, "compiled_callable": None}
+
+    def bytecode_hook(old_code, new_code):
+        if old_code is not original_code:
+            return
+        frame = sys._getframe()
+        while frame and frame.f_back:
+            frame = frame.f_back
+            if (
+                frame.f_code.co_name == "_compile"
+                and os.path.basename(frame.f_code.co_filename) == "convert_frame.py"
+            ):
+                break
+        try:
+            dynamo_frame = frame.f_locals["frame"]
+        except Exception:
+            return
+        if dynamo_frame.f_code is not old_code:
+            return
+        if dynamo_frame.f_locals.get("self") is not module:
+            return
+        compiled_codes.append(new_code)
+
+    torch._dynamo.convert_frame.register_bytecode_hook(bytecode_hook)
+
+    def _ensure_compiled(self, *args, **kwargs):
+        """Compile on first use (with flag ON)."""
+        if state["compiled"]:
+            return
+        # Mark dynamic dims only when we are about to compile
+        sig = inspect.signature(unbound_fwd)
+        ba = sig.bind(self, *args, **kwargs)
+        ba.apply_defaults()
+        for name, dims in (dyn_map or {}).items():
+            if name in ba.arguments:
+                val = ba.arguments[name]
+                if val is not None:
+                    _mark_dynamic_on_value(val, dims)
+
+        # Avoid cross-instance cache reuse
+        torch._dynamo.eval_frame.remove_from_cache(unbound_fwd.__code__)
+
+        bound = types.MethodType(unbound_fwd, self)
+        compiled_callable = torch.compile(
+            bound, fullgraph=fullgraph, backend=backend_factory
+        )
+
+        # Trigger Dynamo so bytecode hook can capture
+        compiled_callable(*args, **kwargs)
+
+        state["compiled"] = True
+        state["compiled_callable"] = compiled_callable
+
+    def trampoline(self, *args, **kwargs):
+        use_compiled = is_in_piecewise_cuda_graph()
+        if use_compiled:
+            if not state["compiled"]:
+                _ensure_compiled(self, *args, **kwargs)
+
+            compiled_callable = state["compiled_callable"]
+            return compiled_callable(*args, **kwargs)
+        else:
+            # Explicitly run the original uncompiled forward
+            return unbound_fwd(self, *args, **kwargs)
+
+    module.forward = types.MethodType(trampoline, module)
+    return module
diff --git a/sglang/python/sglang/srt/compilation/compiler_interface.py b/sglang/python/sglang/srt/compilation/compiler_interface.py
new file mode 100644
index 0000000000000000000000000000000000000000..df7f28b103744132623582921c3af2d0c3566576
--- /dev/null
+++ b/sglang/python/sglang/srt/compilation/compiler_interface.py
@@ -0,0 +1,504 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.10.0/vllm/compilation/compiler_interface.py
+
+import contextlib
+import copy
+import hashlib
+import os
+from contextlib import ExitStack
+from typing import Any, Callable, Optional
+from unittest.mock import patch
+
+import torch
+import torch._inductor.compile_fx
+import torch.fx as fx
+
+from sglang.srt.compilation.compilation_counter import compilation_counter
+from sglang.srt.compilation.inductor_pass import pass_context
+from sglang.srt.utils.common import torch_release
+
+
+class CompilerInterface:
+    """
+    The interface for a compiler that can be used by vLLM.
+    """
+
+    # The name of the compiler, e.g. inductor.
+    # This is a class-level attribute.
+    name: str
+
+    def initialize_cache(
+        self, cache_dir: str, disable_cache: bool = False, prefix: str = ""
+    ):
+        """
+        when the vLLM process uses `cache_dir` as the cache directory,
+        the compiler should initialize itself with the cache directory,
+        e.g. by re-directing its own cache directory to a sub-directory.
+
+        prefix can be used in combination with cache_dir to figure out the base
+        cache directory, e.g. there're multiple parts of model being compiled,
+        but we want to share the same cache directory for all of them.
+
+        e.g.
+        cache_dir = "/path/to/dir/backbone", prefix = "backbone"
+        cache_dir = "/path/to/dir/eagle_head", prefix = "eagle_head"
+        """
+        pass
+
+    def compute_hash(self) -> str:
+        """
+        Gather all the relevant information from the vLLM config,
+        to compute a hash so that we can cache the compiled model.
+
+        See [`VllmConfig.compute_hash`][vllm.config.VllmConfig.compute_hash]
+        to check what information
+        is already considered by default. This function should only
+        consider the information that is specific to the compiler.
+        """
+        return ""
+
+    def compile(
+        self,
+        graph: fx.GraphModule,
+        example_inputs: list[Any],
+        compiler_config: dict[str, Any],
+        runtime_shape: Optional[int] = None,
+        key: Optional[str] = None,
+    ) -> tuple[Optional[Callable], Optional[Any]]:
+        """
+        Compile the graph with the given example inputs and compiler config,
+        with a runtime shape. If the `runtime_shape` is None, it means
+        the `example_inputs` have a dynamic shape. Otherwise, the
+        `runtime_shape` specifies the shape of the inputs. Right now we only
+        support one variable shape for all inputs, which is the batchsize
+        (number of tokens) during inference.
+
+        Dynamo will make sure `graph(*example_inputs)` is valid.
+
+        The function should return a compiled callable function, as well as
+        a handle that can be used to directly load the compiled function.
+
+        The handle should be a plain Python object, preferably a string or a
+        file path for readability.
+
+        If the compiler doesn't support caching, it should return None for the
+        handle. If the compiler fails to compile the graph, it should return
+        None for the compiled function as well.
+
+        `key` is required for StandaloneInductorAdapter, it specifies where to
+        save the compiled artifact. The compiled artifact gets saved to
+        `cache_dir/key`.
+        """
+        return None, None
+
+    def load(
+        self,
+        handle: Any,
+        graph: fx.GraphModule,
+        example_inputs: list[Any],
+        graph_index: int,
+        runtime_shape: Optional[int] = None,
+    ) -> Callable:
+        """
+        Load the compiled function from the handle.
+        Raises an error if the handle is invalid.
+
+        The handle is the second return value of the `compile` function.
+        """
+        raise NotImplementedError("caching is not supported")
+
+
+def get_inductor_factors() -> list[Any]:
+    factors: list[Any] = []
+    # summarize system state
+    from torch._inductor.codecache import CacheBase
+
+    system_factors = CacheBase.get_system()
+    factors.append(system_factors)
+
+    # summarize pytorch state
+    from torch._inductor.codecache import torch_key
+
+    torch_factors = torch_key()
+    factors.append(torch_factors)
+    return factors
+
+
+class AlwaysHitShapeEnv:
+    """
+    Why do we need this class:
+
+    For normal `torch.compile` usage, every compilation will have
+    one Dynamo bytecode compilation and one Inductor compilation.
+    The Inductor compilation happens under the context of the
+    Dynamo bytecode compilation, and that context is used to
+    determine the dynamic shape information, etc.
+
+    For our use case, we only run Dynamo bytecode compilation once,
+    and run Inductor compilation multiple times with different shapes
+    plus a general shape. The compilation for specific shapes happens
+    outside of the context of the Dynamo bytecode compilation. At that
+    time, we don't have shape environment to provide to Inductor, and
+    it will fail the Inductor code cache lookup.
+
+    By providing a dummy shape environment that always hits, we can
+    make the Inductor code cache lookup always hit, and we can
+    compile the graph for different shapes as needed.
+
+    The following dummy methods are obtained by trial-and-error
+    until it works.
+    """
+
+    def __init__(self) -> None:
+        self.guards: list[Any] = []
+
+    def evaluate_guards_expression(self, *args, **kwargs):
+        return True
+
+    def get_pruned_guards(self, *args, **kwargs):
+        return []
+
+    def produce_guards_expression(self, *args, **kwargs):
+        return ""
+
+
+class InductorAdaptor(CompilerInterface):
+    """
+    The adaptor for the Inductor compiler, version 2.5, 2.6, 2.7.
+    """
+
+    name = "inductor"
+
+    def compute_hash(self) -> str:
+        factors = get_inductor_factors()
+        hash_str = hashlib.md5(
+            str(factors).encode(), usedforsecurity=False
+        ).hexdigest()[:10]
+        return hash_str
+
+    def initialize_cache(
+        self, cache_dir: str, disable_cache: bool = False, prefix: str = ""
+    ):
+        self.cache_dir = cache_dir
+        self.prefix = prefix
+        self.base_cache_dir = cache_dir[: -len(prefix)] if prefix else cache_dir
+        if disable_cache:
+            return
+        # redirect the cache directory to a sub-directory
+        # set flags so that Inductor and Triton store their cache
+        # in the cache_dir, then users only need to copy the cache_dir
+        # to another machine to reuse the cache.
+        inductor_cache = os.path.join(self.base_cache_dir, "inductor_cache")
+        os.makedirs(inductor_cache, exist_ok=True)
+        os.environ["TORCHINDUCTOR_CACHE_DIR"] = inductor_cache
+        triton_cache = os.path.join(self.base_cache_dir, "triton_cache")
+        os.makedirs(triton_cache, exist_ok=True)
+        os.environ["TRITON_CACHE_DIR"] = triton_cache
+
+    def compile(
+        self,
+        graph: fx.GraphModule,
+        example_inputs: list[Any],
+        compiler_config: dict[str, Any],
+        runtime_shape: Optional[int] = None,
+        key: Optional[str] = None,
+    ) -> tuple[Optional[Callable], Optional[Any]]:
+        compilation_counter.num_inductor_compiles += 1
+        from torch._inductor.compile_fx import compile_fx
+
+        current_config = {}
+        if compiler_config is not None:
+            current_config.update(compiler_config)
+
+        # disable remote cache
+        current_config["fx_graph_cache"] = True
+        current_config["fx_graph_remote_cache"] = False
+
+        set_inductor_config(current_config, runtime_shape)
+
+        # inductor can inplace modify the graph, so we need to copy it
+        # see https://github.com/pytorch/pytorch/issues/138980
+        graph = copy.deepcopy(graph)
+
+        # it's the first time we compile this graph
+        # the assumption is that we don't have nested Inductor compilation.
+        # compiled_fx_graph_hash will only be called once, and we can hook
+        # it to get the hash of the compiled graph directly.
+
+        hash_str, file_path = None, None
+        from torch._inductor.codecache import FxGraphCache, compiled_fx_graph_hash
+
+        if torch_release[:2] == (2, 5):
+            original_load = FxGraphCache.load
+            original_load_name = "torch._inductor.codecache.FxGraphCache.load"
+
+            def hijack_load(*args, **kwargs):
+                inductor_compiled_graph = original_load(*args, **kwargs)
+                nonlocal file_path
+                compiled_fn = inductor_compiled_graph.current_callable
+                file_path = compiled_fn.__code__.co_filename  # noqa
+                if not file_path.startswith(self.base_cache_dir):
+                    # hooked in the align_inputs_from_check_idxs function
+                    # in torch/_inductor/utils.py
+                    for cell in compiled_fn.__closure__:
+                        if not callable(cell.cell_contents):
+                            continue
+                        if cell.cell_contents.__code__.co_filename.startswith(
+                            self.base_cache_dir
+                        ):
+                            # this is the real file path compiled from Inductor
+                            file_path = cell.cell_contents.__code__.co_filename
+                            break
+                return inductor_compiled_graph
+
+            hijacked_compile_fx_inner = (
+                torch._inductor.compile_fx.compile_fx_inner
+            )  # noqa
+        elif torch_release >= (2, 6):
+            # function renamed in 2.6
+            original_load_name = None
+
+            def hijacked_compile_fx_inner(*args, **kwargs):
+                output = torch._inductor.compile_fx.compile_fx_inner(*args, **kwargs)
+                nonlocal hash_str
+                inductor_compiled_graph = output
+                if inductor_compiled_graph is not None:
+                    nonlocal file_path
+                    compiled_fn = inductor_compiled_graph.current_callable
+                    file_path = compiled_fn.__code__.co_filename  # noqa
+                    if not file_path.startswith(self.base_cache_dir):
+                        # hooked in the align_inputs_from_check_idxs function
+                        # in torch/_inductor/utils.py
+                        for cell in compiled_fn.__closure__:
+                            if not callable(cell.cell_contents):
+                                continue
+                            code = cell.cell_contents.__code__
+                            if code.co_filename.startswith(self.base_cache_dir):
+                                # this is the real file path
+                                # compiled from Inductor
+                                file_path = code.co_filename
+                                break
+                    hash_str = inductor_compiled_graph._fx_graph_cache_key
+                return output
+
+        def hijack_compiled_fx_graph_hash(*args, **kwargs):
+            out = compiled_fx_graph_hash(*args, **kwargs)
+            nonlocal hash_str
+            hash_str = out[0]
+            return out
+
+        def _check_can_cache(*args, **kwargs):
+            # no error means it can be cached.
+            # Inductor refuses to cache the graph outside of Dynamo
+            # tracing context, and also disables caching for graphs
+            # with high-order ops.
+            # For vLLM, in either case, we want to cache the graph.
+            # see https://github.com/pytorch/pytorch/blob/9f5ebf3fc609105a74eab4ccc24932d6353ff566/torch/_inductor/codecache.py#L1221 # noqa
+            return
+
+        def _get_shape_env() -> AlwaysHitShapeEnv:
+            return AlwaysHitShapeEnv()
+
+        with ExitStack() as stack:
+            # hijack to get the compiled graph itself
+            if original_load_name is not None:
+                stack.enter_context(patch(original_load_name, hijack_load))
+
+            # for hijacking the hash of the compiled graph
+            stack.enter_context(
+                patch(
+                    "torch._inductor.codecache.compiled_fx_graph_hash",
+                    hijack_compiled_fx_graph_hash,
+                )
+            )
+
+            # for providing a dummy shape environment
+            stack.enter_context(
+                patch(
+                    "torch._inductor.codecache.FxGraphCache._get_shape_env",
+                    _get_shape_env,
+                )
+            )
+
+            from torch._functorch._aot_autograd.autograd_cache import AOTAutogradCache
+
+            # torch 2.8+ on main uses _get_shape_env in AOTAutogradCache
+            if hasattr(AOTAutogradCache, "_get_shape_env"):
+                stack.enter_context(
+                    patch(
+                        "torch._functorch._aot_autograd.autograd_cache.AOTAutogradCache._get_shape_env",
+                        _get_shape_env,
+                    )
+                )
+
+            # for forcing the graph to be cached
+            stack.enter_context(
+                patch(
+                    "torch._inductor.codecache.FxGraphCache._check_can_cache",
+                    _check_can_cache,
+                )
+            )
+
+            # Dynamo metrics context, see method for more details.
+            stack.enter_context(self.metrics_context())
+
+            # Disable remote caching. When these are on, on remote cache-hit,
+            # the monkey-patched functions never actually get called.
+            # vLLM today assumes and requires the monkey-patched functions to
+            # get hit.
+            # TODO(zou3519): we're going to replace this all with
+            # standalone_compile sometime.
+
+            stack.enter_context(
+                torch._inductor.config.patch(fx_graph_remote_cache=False)
+            )
+            # InductorAdaptor (unfortunately) requires AOTAutogradCache
+            # to be turned off to run. It will fail to acquire the hash_str
+            # and error if not.
+            # StandaloneInductorAdaptor (PyTorch 2.8+) fixes this problem.
+            stack.enter_context(
+                torch._functorch.config.patch(enable_autograd_cache=False)
+            )
+            stack.enter_context(
+                torch._functorch.config.patch(enable_remote_autograd_cache=False)
+            )
+
+            with pass_context(runtime_shape):
+                compiled_graph = compile_fx(
+                    graph,
+                    example_inputs,
+                    inner_compile=hijacked_compile_fx_inner,
+                    config_patches=current_config,
+                )
+        return compiled_graph, (hash_str, file_path)
+
+    def load(
+        self,
+        handle: Any,
+        graph: fx.GraphModule,
+        example_inputs: list[Any],
+        graph_index: int,
+        runtime_shape: Optional[int] = None,
+    ) -> Callable:
+        assert isinstance(handle, tuple)
+        assert isinstance(handle[0], str)
+        assert isinstance(handle[1], str)
+        hash_str = handle[0]
+
+        from torch._functorch._aot_autograd.autograd_cache import AOTAutogradCache
+        from torch._inductor.codecache import FxGraphCache
+
+        with ExitStack() as exit_stack:
+            exit_stack.enter_context(
+                patch(
+                    "torch._inductor.codecache.FxGraphCache._get_shape_env",
+                    lambda *args, **kwargs: AlwaysHitShapeEnv(),
+                )
+            )
+            # torch 2.8+ on main uses _get_shape_env in AOTAutogradCache
+            if hasattr(AOTAutogradCache, "_get_shape_env"):
+                exit_stack.enter_context(
+                    patch(
+                        "torch._functorch._aot_autograd.autograd_cache.AOTAutogradCache._get_shape_env",
+                        lambda *args, **kwargs: AlwaysHitShapeEnv(),
+                    )
+                )
+
+            # Dynamo metrics context, see method for more details.
+            exit_stack.enter_context(self.metrics_context())
+
+            if torch_release[:2] == (2, 5):
+                inductor_compiled_graph = FxGraphCache._lookup_graph(
+                    hash_str, example_inputs, True, False
+                )
+                assert inductor_compiled_graph is not None, (
+                    "Inductor cache lookup failed. Please remove"
+                    f"the cache directory and try again."  # noqa
+                )
+            elif torch_release >= (2, 6):
+                from torch._inductor.output_code import CompiledFxGraphConstantsWithGm
+
+                constants = CompiledFxGraphConstantsWithGm(graph)
+                inductor_compiled_graph, _ = FxGraphCache._lookup_graph(
+                    hash_str, example_inputs, True, None, constants
+                )
+                assert inductor_compiled_graph is not None, (
+                    "Inductor cache lookup failed. Please remove"
+                    f"the cache directory and try again."  # noqa
+                )
+
+        # Inductor calling convention (function signature):
+        # f(list) -> tuple
+        # Dynamo calling convention (function signature):
+        # f(*args) -> Any
+
+        # need to know if the graph returns a tuple
+        from torch._inductor.compile_fx import graph_returns_tuple
+
+        returns_tuple = graph_returns_tuple(graph)
+
+        # this is the callable we return to Dynamo to run
+        def compiled_graph(*args):
+            # convert args to list
+            list_args = list(args)
+            graph_output = inductor_compiled_graph(list_args)
+            # unpack the tuple if needed
+            if returns_tuple:
+                return graph_output
+            else:
+                return graph_output[0]
+
+        return compiled_graph
+
+    def metrics_context(self) -> contextlib.AbstractContextManager:
+        """
+        This method returns the Dynamo metrics context (if it exists,
+        otherwise a null context). It is used by various compile components.
+        Present in torch>=2.6, it's used inside FxGraphCache in
+        torch==2.6 (but not after). It might also be used in various other
+        torch.compile internal functions.
+
+        Because it is re-entrant, we always set it (even if entering via Dynamo
+        and the context was already entered). We might want to revisit if it
+        should be set at a different level of compilation.
+
+        This is likely a bug in PyTorch: public APIs should not rely on
+        manually setting up internal contexts. But we also rely on non-public
+        APIs which might not provide these guarantees.
+        """
+        import torch._dynamo.utils
+
+        return torch._dynamo.utils.get_metrics_context()
+
+
+def set_inductor_config(config, runtime_shape):
+    if isinstance(runtime_shape, int):
+        # for a specific batchsize, tuning triton kernel parameters
+        # can be beneficial
+        config["max_autotune"] = True
+        config["coordinate_descent_tuning"] = True
+
+
+class EagerAdapter(CompilerInterface):
+    name = "eager"
+
+    def compile(
+        self,
+        graph: fx.GraphModule,
+        example_inputs: list[Any],
+        compiler_config: dict[str, Any],
+        runtime_shape: Optional[int] = None,
+        key: Optional[str] = None,
+        num_graphs: int = 1,
+    ) -> tuple[Optional[Callable], Optional[Any]]:
+        return graph, None
+
+    def load(
+        self,
+        handle: Any,
+        graph: fx.GraphModule,
+        example_inputs: list[Any],
+        graph_index: int,
+        runtime_shape: Optional[int] = None,
+        num_graphs: int = 1,
+    ) -> Callable:
+        raise NotImplementedError("eager compilation is not supported")
diff --git a/sglang/python/sglang/srt/compilation/cuda_piecewise_backend.py b/sglang/python/sglang/srt/compilation/cuda_piecewise_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ca1e6a43cbcd0d3d6a08969a52fd73ee5e34f36
--- /dev/null
+++ b/sglang/python/sglang/srt/compilation/cuda_piecewise_backend.py
@@ -0,0 +1,206 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.10.0/vllm/compilation/cuda_piecewise_backend.py
+
+import dataclasses
+import logging
+from contextlib import ExitStack
+from typing import Any, Callable, Optional
+from unittest.mock import patch
+
+import torch
+import torch.fx as fx
+
+from sglang.srt.compilation.compilation_config import CompilationConfig
+from sglang.srt.compilation.compilation_counter import compilation_counter
+from sglang.srt.compilation.piecewise_context_manager import (
+    get_pcg_capture_stream,
+    is_in_pcg_torch_compile,
+)
+from sglang.srt.compilation.weak_ref_tensor import weak_ref_tensors
+
+logger = logging.getLogger(__name__)
+
+
+@dataclasses.dataclass
+class ConcreteSizeEntry:
+    runtime_shape: int
+    need_to_compile: bool  # the size is in compile_sizes
+    use_cudagraph: bool  # the size is in cudagraph_capture_sizes
+
+    compiled: bool = False
+    runnable: Callable = None  # type: ignore
+    num_finished_warmup: int = 0
+    cudagraph: Optional[torch.cuda.CUDAGraph] = None
+    output: Optional[Any] = None
+
+    # for cudagraph debugging, track the input addresses
+    # during capture, and check if they are the same during replay
+    input_addresses: Optional[list[int]] = None
+
+
+class CUDAPiecewiseBackend:
+
+    def __init__(
+        self,
+        graph: fx.GraphModule,
+        compile_config: CompilationConfig,
+        inductor_config: dict[str, Any],
+        graph_pool: Any,
+        piecewise_compile_index: int,
+        total_piecewise_compiles: int,
+        sym_shape_indices: list[int],
+        compiled_graph_for_general_shape: Callable,
+        sglang_backend,
+    ):
+        """
+        The backend for piecewise compilation.
+        It mainly handles the compilation and cudagraph capturing.
+
+        We will compile `self.graph` once for the general shape,
+        and then compile for different shapes specified in
+        `compilation_config.compile_sizes`.
+
+        Independently, we will capture cudagraph for different shapes.
+
+        If a shape needs both compilation and cudagraph, we will
+        compile it first, and then capture cudagraph.
+        """
+        self.graph = graph
+        self.inductor_config = inductor_config
+        self.graph_pool = graph_pool
+        self.piecewise_compile_index = piecewise_compile_index
+        self.total_piecewise_compiles = total_piecewise_compiles
+        self.sglang_backend = sglang_backend
+
+        self.is_first_graph = piecewise_compile_index == 0
+        self.is_last_graph = piecewise_compile_index == total_piecewise_compiles - 1
+
+        self.compile_sizes: set[int] = set([])
+        self.compile_config = compile_config
+        self.cudagraph_capture_sizes: set[int] = set(compile_config.get_capture_sizes())
+
+        self.first_run_finished = False
+
+        self.compiled_graph_for_general_shape = compiled_graph_for_general_shape  # noqa
+
+        self.sym_shape_indices = sym_shape_indices
+
+        # the entries for different shapes that we need to either
+        # compile or capture cudagraph
+        self.concrete_size_entries: dict[int, ConcreteSizeEntry] = {}
+
+        # to_be_compiled_sizes tracks the remaining sizes to compile,
+        # and updates during the compilation process, so we need to copy it
+        self.to_be_compiled_sizes: set[int] = self.compile_sizes.copy()
+        for shape in self.compile_sizes.union(self.cudagraph_capture_sizes):
+            self.concrete_size_entries[shape] = ConcreteSizeEntry(
+                runtime_shape=shape,
+                need_to_compile=shape in self.compile_sizes,
+                use_cudagraph=shape in self.cudagraph_capture_sizes,
+            )
+
+    def check_for_ending_compilation(self):
+        if self.is_last_graph and not self.to_be_compiled_sizes:
+            # no specific sizes to compile
+            # save the hash of the inductor graph for the next run
+            self.sglang_backend.compiler_manager.save_to_file()
+
+    def __call__(self, *args) -> Any:
+        if not self.first_run_finished:
+            self.first_run_finished = True
+            self.check_for_ending_compilation()
+            return self.compiled_graph_for_general_shape(*args)
+
+        if len(self.sym_shape_indices) == 0:
+            return self.compiled_graph_for_general_shape(*args)
+
+        runtime_shape = args[self.sym_shape_indices[0]]
+        if runtime_shape not in self.concrete_size_entries:
+            # we don't need to do anything for this shape
+            return self.compiled_graph_for_general_shape(*args)
+
+        entry = self.concrete_size_entries[runtime_shape]
+
+        if entry.runnable is None:
+            entry.runnable = self.compiled_graph_for_general_shape
+
+        if entry.need_to_compile and not entry.compiled:
+            entry.compiled = True
+            self.to_be_compiled_sizes.remove(runtime_shape)
+            # args are real arguments
+            entry.runnable = self.sglang_backend.compiler_manager.compile(
+                self.graph,
+                args,
+                self.inductor_config,
+                graph_index=self.piecewise_compile_index,
+                num_graphs=self.total_piecewise_compiles,
+                runtime_shape=runtime_shape,
+            )
+
+            # finished compilations for all required shapes
+            if self.is_last_graph and not self.to_be_compiled_sizes:
+                self.check_for_ending_compilation()
+
+        if is_in_pcg_torch_compile():
+            return entry.runnable(*args)
+
+        if entry.cudagraph is None:
+            if entry.num_finished_warmup < 1:  # noqa
+                entry.num_finished_warmup += 1
+                return entry.runnable(*args)
+
+            if self.compile_config.get_enable_debug_mode():
+                input_addresses = [
+                    x.data_ptr() for x in args if isinstance(x, torch.Tensor)
+                ]
+                entry.input_addresses = input_addresses
+            cudagraph = torch.cuda.CUDAGraph()
+
+            with ExitStack() as stack:
+                if not self.is_first_graph:
+                    # during every model forward, we will capture
+                    # many pieces of cudagraphs (roughly one per layer).
+                    # running gc again and again across layers will
+                    # make the cudagraph capture very slow.
+                    # therefore, we only run gc for the first graph,
+                    # and disable gc for the rest of the graphs.
+                    stack.enter_context(patch("gc.collect", lambda: None))
+                    stack.enter_context(patch("torch.cuda.empty_cache", lambda: None))
+                # mind-exploding: carefully manage the reference and memory.
+                stream = get_pcg_capture_stream()
+                assert (
+                    stream is not None
+                ), "PCG capture stream is not set, please check if runtime recompilation happened"
+                with torch.cuda.graph(cudagraph, pool=self.graph_pool, stream=stream):
+                    # `output` is managed by pytorch's cudagraph pool
+                    output = entry.runnable(*args)
+                    if self.is_last_graph:
+                        # by converting it to weak ref,
+                        # the original `output` will immediately be released
+                        # to save memory. It is only safe to do this for
+                        # the last graph, because the output of the last graph
+                        # will not be used by any other cuda graph.
+                        output = weak_ref_tensors(output)
+
+            # here we always use weak ref for the output
+            # to save memory
+            entry.output = weak_ref_tensors(output)
+            entry.cudagraph = cudagraph
+
+            compilation_counter.num_cudagraph_captured += 1
+
+            # important: we need to return the output, rather than
+            # the weak ref of the output, so that pytorch can correctly
+            # manage the memory during cuda graph capture
+            return output
+
+        if self.compile_config.get_enable_debug_mode():
+            # check if the input addresses are the same
+            new_input_addresses = [
+                x.data_ptr() for x in args if isinstance(x, torch.Tensor)
+            ]
+            assert new_input_addresses == entry.input_addresses, (
+                "Input addresses for cudagraphs are different during replay."
+                f" Expected {entry.input_addresses}, got {new_input_addresses}"
+            )
+        entry.cudagraph.replay()
+        return entry.output
diff --git a/sglang/python/sglang/srt/compilation/fix_functionalization.py b/sglang/python/sglang/srt/compilation/fix_functionalization.py
new file mode 100644
index 0000000000000000000000000000000000000000..8673e3576b00444fda7221b6d6a5a732c7a0a368
--- /dev/null
+++ b/sglang/python/sglang/srt/compilation/fix_functionalization.py
@@ -0,0 +1,134 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.10.0/vllm/compilation/fix_functionalization.py
+
+import logging
+import operator
+from collections.abc import Iterable
+from typing import Optional, Union
+
+import torch
+from torch._higher_order_ops.auto_functionalize import auto_functionalized
+
+from sglang.srt.compilation.fx_utils import is_func
+from sglang.srt.compilation.inductor_pass import SGLangInductorPass
+
+logger = logging.getLogger(__name__)
+
+
+class FixFunctionalizationPass(SGLangInductorPass):
+    """
+    This pass defunctionalizes certain nodes to avoid redundant tensor copies.
+    After this pass, DCE (dead-code elimination) should never be run,
+    as de-functionalized nodes may appear as dead code.
+
+    To add new nodes to defunctionalize, add to the if-elif chain in __call__.
+    """
+
+    def __call__(self, graph: torch.fx.Graph):
+        self.begin()
+        self.dump_graph(graph, "before_fix_functionalization")
+
+        self.nodes_to_remove: list[torch.fx.Node] = []
+        count = 0
+        for node in graph.nodes:
+            if not is_func(node, auto_functionalized):
+                continue  # Avoid deep if-elif nesting
+            count += 1
+
+        self.dump_graph(graph, "before_fix_functionalization_cleanup")
+
+        # Remove the nodes all at once
+        count_removed = len(self.nodes_to_remove)
+        for node in self.nodes_to_remove:
+            graph.erase_node(node)
+
+        logger.debug(
+            "De-functionalized %s nodes, removed %s nodes", count, count_removed
+        )
+        self.dump_graph(graph, "after_fix_functionalization")
+        self.end_and_log()
+
+    def _remove(self, node_or_nodes: Union[torch.fx.Node, Iterable[torch.fx.Node]]):
+        """
+        Stage a node (or nodes) for removal at the end of the pass.
+        """
+        if isinstance(node_or_nodes, torch.fx.Node):
+            self.nodes_to_remove.append(node_or_nodes)
+        else:
+            self.nodes_to_remove.extend(node_or_nodes)
+
+    def defunctionalize(
+        self,
+        graph: torch.fx.Graph,
+        node: torch.fx.Node,
+        mutated_args: dict[int, Union[torch.fx.Node, str]],
+        args: Optional[tuple[Union[torch.fx.Node, str], ...]] = None,
+    ):
+        """
+        De-functionalize a node by replacing it with a call to the original.
+        It also replaces the getitem users with the mutated arguments.
+        See replace_users_with_mutated_args and insert_defunctionalized.
+        """
+        self.replace_users_with_mutated_args(node, mutated_args)
+        self.insert_defunctionalized(graph, node, args=args)
+        self._remove(node)
+
+    def replace_users_with_mutated_args(
+        self, node: torch.fx.Node, mutated_args: dict[int, Union[torch.fx.Node, str]]
+    ):
+        """
+        Replace all getitem users of the auto-functionalized node with the
+        mutated arguments.
+        :param node: The auto-functionalized node
+        :param mutated_args: The mutated arguments, indexed by getitem index.
+        If the value of an arg is a string, `node.kwargs[arg]` is used.
+        """
+        for idx, user in self.getitem_users(node).items():
+            arg = mutated_args[idx]
+            arg = node.kwargs[arg] if isinstance(arg, str) else arg
+            user.replace_all_uses_with(arg)
+            self._remove(user)
+
+    def getitem_users(self, node: torch.fx.Node) -> dict[int, torch.fx.Node]:
+        """
+        Returns the operator.getitem users of the auto-functionalized node,
+        indexed by the index they are getting.
+        """
+        users = {}
+        for user in node.users:
+            if is_func(user, operator.getitem):
+                idx = user.args[1]
+                users[idx] = user
+        return users
+
+    def insert_defunctionalized(
+        self,
+        graph: torch.fx.Graph,
+        node: torch.fx.Node,
+        args: Optional[tuple[Union[torch.fx.Node, str], ...]] = None,
+    ):
+        """
+        Insert a new defunctionalized node into the graph before node.
+        If one of the kwargs is 'out', provide args directly,
+        as node.kwargs cannot be used.
+        See https://github.com/pytorch/pytorch/blob/a00faf440888ffb724bad413f329a49e2b6388e7/torch/_inductor/lowering.py#L351
+
+        :param graph: Graph to insert the defunctionalized node into
+        :param node: The auto-functionalized node to defunctionalize
+        :param args: If we cannot use kwargs, specify args directly.
+        If an arg is a string, `node.kwargs[arg]` is used.
+        """  # noqa: E501
+        assert is_func(
+            node, auto_functionalized
+        ), f"node must be auto-functionalized, is {node} instead"
+
+        # Create a new call to the original function
+        with graph.inserting_before(node):
+            function = node.args[0]
+            if args is None:
+                graph.call_function(function, kwargs=node.kwargs)
+            else:
+                # Args passed as strings refer to items in node.kwargs
+                args = tuple(
+                    node.kwargs[arg] if isinstance(arg, str) else arg for arg in args
+                )
+                graph.call_function(function, args=args)
diff --git a/sglang/python/sglang/srt/compilation/fx_utils.py b/sglang/python/sglang/srt/compilation/fx_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2e863e687181cf0e046fd9e413360c1d867112c
--- /dev/null
+++ b/sglang/python/sglang/srt/compilation/fx_utils.py
@@ -0,0 +1,83 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.10.0/vllm/compilation/fx_utils.py
+
+import operator
+from collections.abc import Iterable, Iterator
+from typing import Optional
+
+from torch import fx
+from torch._higher_order_ops.auto_functionalize import auto_functionalized
+from torch._ops import OpOverload
+
+
+def is_func(node: fx.Node, target) -> bool:
+    return node.op == "call_function" and node.target == target
+
+
+def is_auto_func(node: fx.Node, op: OpOverload) -> bool:
+    return is_func(node, auto_functionalized) and node.args[0] == op
+
+
+# Returns the first specified node with the given op (if it exists)
+def find_specified_fn_maybe(
+    nodes: Iterable[fx.Node], op: OpOverload
+) -> Optional[fx.Node]:
+    for node in nodes:
+        if node.target == op:
+            return node
+    return None
+
+
+# Returns the first specified node with the given op
+def find_specified_fn(nodes: Iterable[fx.Node], op: OpOverload) -> fx.Node:
+    node = find_specified_fn_maybe(nodes, op)
+    assert node is not None, f"Could not find {op} in nodes {nodes}"
+    return node
+
+
+# Returns the first auto_functionalized node with the given op (if it exists)
+def find_auto_fn_maybe(nodes: Iterable[fx.Node], op: OpOverload) -> Optional[fx.Node]:
+    for node in nodes:
+        if is_func(node, auto_functionalized) and node.args[0] == op:  # noqa
+            return node
+    return None
+
+
+# Returns the first auto_functionalized node with the given op
+def find_auto_fn(nodes: Iterable[fx.Node], op: OpOverload) -> fx.Node:
+    node = find_auto_fn_maybe(nodes, op)
+    assert node is not None, f"Could not find {op} in nodes {nodes}"
+    return node
+
+
+# Returns the getitem node that extracts the idx-th element from node
+# (if it exists)
+def find_getitem_maybe(node: fx.Node, idx: int) -> Optional[fx.Node]:
+    for user in node.users:
+        if is_func(user, operator.getitem) and user.args[1] == idx:
+            return user
+    return None
+
+
+# Returns the getitem node that extracts the idx-th element from node
+def find_getitem(node: fx.Node, idx: int) -> fx.Node:
+    ret = find_getitem_maybe(node, idx)
+    assert ret is not None, f"Could not find getitem {idx} in node {node}"
+    return ret
+
+
+# An auto-functionalization-aware utility for finding nodes with a specific op
+def find_op_nodes(op: OpOverload, graph: fx.Graph) -> Iterator[fx.Node]:
+    if not op._schema.is_mutable:
+        yield from graph.find_nodes(op="call_function", target=op)
+
+    for n in graph.find_nodes(op="call_function", target=auto_functionalized):
+        if n.args[0] == op:
+            yield n
+
+
+# Asserts that the node only has one user and returns it
+# Even if a node has only 1 user, it might share storage with another node,
+# which might need to be taken into account.
+def get_only_user(node: fx.Node) -> fx.Node:
+    assert len(node.users) == 1
+    return next(iter(node.users))
diff --git a/sglang/python/sglang/srt/compilation/inductor_pass.py b/sglang/python/sglang/srt/compilation/inductor_pass.py
new file mode 100644
index 0000000000000000000000000000000000000000..acbde65bf8ab383a80c6f1e6eee9a57440a157cf
--- /dev/null
+++ b/sglang/python/sglang/srt/compilation/inductor_pass.py
@@ -0,0 +1,140 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.10.0/vllm/compilation/inductor_pass.py
+
+import hashlib
+import inspect
+import json
+import logging
+import time
+import types
+from contextlib import contextmanager
+from typing import Any, Callable, Optional, Union
+
+import torch
+from torch import fx
+from torch._dynamo.utils import lazy_format_graph_code
+from torch._inductor.custom_graph_pass import CustomGraphPass
+
+logger = logging.getLogger(__name__)
+
+_pass_context = None
+
+
+class PassContext:
+
+    def __init__(self, runtime_shape: Optional[int]):
+        self.runtime_shape = runtime_shape
+
+
+def get_pass_context() -> PassContext:
+    """Get the current pass context."""
+    assert _pass_context is not None
+    return _pass_context
+
+
+@contextmanager
+def pass_context(runtime_shape: Optional[int]):
+    """A context manager that stores the current pass context,
+    usually it is a list of sizes to specialize.
+    """
+    global _pass_context
+    prev_context = _pass_context
+    _pass_context = PassContext(runtime_shape)
+    try:
+        yield
+    finally:
+        _pass_context = prev_context
+
+
+class InductorPass(CustomGraphPass):
+    """
+    A custom graph pass that uses a hash of its source as the UUID.
+    This is defined as a convenience and should work in most cases.
+    """
+
+    def uuid(self) -> Any:
+        """
+        Provide a unique identifier for the pass, used in Inductor code cache.
+        This should depend on the pass implementation, so that changes to the
+        pass result in recompilation.
+        By default, the object source is hashed.
+        """
+        return InductorPass.hash_source(self)
+
+    @staticmethod
+    def hash_source(*srcs: Union[str, Any]):
+        """
+        Utility method to hash the sources of functions or objects.
+        :param srcs: strings or objects to add to the hash.
+        Objects and functions have their source inspected.
+        :return:
+        """
+        hasher = hashlib.sha256()
+        for src in srcs:
+            if isinstance(src, str):
+                src_str = src
+            elif isinstance(src, types.FunctionType):
+                src_str = inspect.getsource(src)
+            else:
+                src_str = inspect.getsource(src.__class__)
+            hasher.update(src_str.encode("utf-8"))
+        return hasher.hexdigest()
+
+    @staticmethod
+    def hash_dict(dict_: dict[Any, Any]):
+        """
+        Utility method to hash a dictionary, can alternatively be used for uuid.
+        :return: A sha256 hash of the json rep of the dictionary.
+        """
+        encoded = json.dumps(dict_, sort_keys=True).encode("utf-8")
+        return hashlib.sha256(encoded).hexdigest()
+
+    def is_applicable_for_shape(self, shape: Optional[int]):
+        return True
+
+
+class CallableInductorPass(InductorPass):
+    """
+    This class is a wrapper for a callable that automatically provides an
+    implementation of the UUID.
+    """
+
+    def __init__(
+        self, callable: Callable[[fx.Graph], None], uuid: Optional[Any] = None
+    ):
+        self.callable = callable
+        self._uuid = self.hash_source(callable) if uuid is None else uuid
+
+    def __call__(self, graph: torch.fx.Graph):
+        self.callable(graph)
+
+    def uuid(self) -> Any:
+        return self._uuid
+
+
+class SGLangInductorPass(InductorPass):
+
+    def __init__(
+        self,
+    ):
+        self.pass_name = self.__class__.__name__
+
+    def dump_graph(self, graph: torch.fx.Graph, stage: str):
+        lazy_format_graph_code(stage, graph.owning_module)
+
+    def begin(self):
+        self._start_time = time.perf_counter_ns()
+
+    def end_and_log(self):
+        self._end_time = time.perf_counter_ns()
+        duration_ms = float(self._end_time - self._start_time) / 1.0e6
+        logger.debug("%s completed in %.1f ms", self.pass_name, duration_ms)
+
+
+class PrinterInductorPass(SGLangInductorPass):
+
+    def __init__(self, name: str):
+        super().__init__()
+        self.name = name
+
+    def __call__(self, graph: torch.fx.Graph):
+        self.dump_graph(graph, self.name)
diff --git a/sglang/python/sglang/srt/compilation/npu_piecewise_backend.py b/sglang/python/sglang/srt/compilation/npu_piecewise_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc97bd5c3f74b28d5fd1c91ed6c7810e757a1137
--- /dev/null
+++ b/sglang/python/sglang/srt/compilation/npu_piecewise_backend.py
@@ -0,0 +1,109 @@
+from contextlib import ExitStack
+from typing import Any, Callable
+from unittest.mock import patch
+
+import torch
+import torch.fx as fx
+
+from sglang.srt.compilation.compilation_config import CompilationConfig
+from sglang.srt.compilation.compilation_counter import compilation_counter
+from sglang.srt.compilation.cuda_piecewise_backend import (
+    CUDAPiecewiseBackend,
+    weak_ref_tensors,
+)
+
+
+class NPUPiecewiseBackend(CUDAPiecewiseBackend):
+    def __init__(
+        self,
+        graph: fx.GraphModule,
+        compile_config: CompilationConfig,
+        inductor_config: dict[str, Any],
+        graph_pool: Any,
+        piecewise_compile_index: int,
+        total_piecewise_compiles: int,
+        sym_shape_indices: list[int],
+        compiled_graph_for_general_shape: Callable,
+        sglang_backend,
+    ):
+        super().__init__(
+            graph,
+            compile_config,
+            inductor_config,
+            graph_pool,
+            piecewise_compile_index,
+            total_piecewise_compiles,
+            sym_shape_indices,
+            compiled_graph_for_general_shape,
+            sglang_backend,
+        )
+
+    def __call__(self, *args):
+        runtime_shape = args[self.sym_shape_indices[0]]
+        if runtime_shape not in self.concrete_size_entries:
+            # we don't need to do anything for this shape
+            return self.compiled_graph_for_general_shape(*args)
+
+        entry = self.concrete_size_entries[runtime_shape]
+
+        if entry.runnable is None:
+            entry.runnable = self.compiled_graph_for_general_shape
+
+        if entry.cudagraph is None:
+            if entry.num_finished_warmup < 1:  # noqa
+                entry.num_finished_warmup += 1
+                return entry.runnable(*args)
+
+            if self.compile_config.get_enable_debug_mode():
+                input_addresses = [
+                    x.data_ptr() for x in args if isinstance(x, torch.Tensor)
+                ]
+                entry.input_addresses = input_addresses
+            npugraph = torch.npu.NPUGraph()
+
+            with ExitStack() as stack:
+                if not self.is_first_graph:
+                    # during every model forward, we will capture
+                    # many pieces of cudagraphs (roughly one per layer).
+                    # running gc again and again across layers will
+                    # make the cudagraph capture very slow.
+                    # therefore, we only run gc for the first graph,
+                    # and disable gc for the rest of the graphs.
+                    stack.enter_context(patch("gc.collect", lambda: None))
+                    stack.enter_context(patch("torch.npu.empty_cache", lambda: None))
+
+                # mind-exploding: carefully manage the reference and memory.
+                with torch.npu.graph(npugraph, pool=self.graph_pool):
+                    # `output` is managed by pytorch's cudagraph pool
+                    output = entry.runnable(*args)
+                    if self.is_last_graph:
+                        # by converting it to weak ref,
+                        # the original `output` will immediately be released
+                        # to save memory. It is only safe to do this for
+                        # the last graph, because the output of the last graph
+                        # will not be used by any other cuda graph.
+                        output = weak_ref_tensors(output)
+
+            # here we always use weak ref for the output
+            # to save memory
+            entry.output = weak_ref_tensors(output)
+            entry.cudagraph = npugraph
+
+            compilation_counter.num_cudagraph_captured += 1
+
+            # important: we need to return the output, rather than
+            # the weak ref of the output, so that pytorch can correctly
+            # manage the memory during cuda graph capture
+            return output
+
+        if self.compile_config.get_enable_debug_mode():
+            # check if the input addresses are the same
+            new_input_addresses = [
+                x.data_ptr() for x in args if isinstance(x, torch.Tensor)
+            ]
+            assert new_input_addresses == entry.input_addresses, (
+                "Input addresses for cudagraphs are different during replay."
+                f" Expected {entry.input_addresses}, got {new_input_addresses}"
+            )
+        entry.cudagraph.replay()
+        return entry.output
diff --git a/sglang/python/sglang/srt/compilation/pass_manager.py b/sglang/python/sglang/srt/compilation/pass_manager.py
new file mode 100644
index 0000000000000000000000000000000000000000..9173976f1878b01035d069844989b50fbc442f8b
--- /dev/null
+++ b/sglang/python/sglang/srt/compilation/pass_manager.py
@@ -0,0 +1,66 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.10.0/vllm/compilation/pass_manager.py
+
+import logging
+
+from torch import fx as fx
+
+from sglang.srt.compilation.fix_functionalization import FixFunctionalizationPass
+from sglang.srt.compilation.inductor_pass import (
+    CustomGraphPass,
+    InductorPass,
+    SGLangInductorPass,
+    get_pass_context,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class PostGradPassManager(CustomGraphPass):
+    """
+    The pass manager for post-grad passes.
+    It handles configuration, adding custom passes, and running passes.
+    It supports uuid for the Inductor code cache. That includes torch<2.6
+    support using pickling (in .inductor_pass.CustomGraphPass).
+
+    The order of the post-grad post-passes is:
+    1. passes (constructor parameter)
+    2. default passes (NoopEliminationPass, FusionPass)
+    3. config["post_grad_custom_post_pass"] (if it exists)
+    4. fix_functionalization
+    This way, all passes operate on a functionalized graph.
+    """
+
+    def __init__(self):
+        self.passes: list[SGLangInductorPass] = []
+
+    def __call__(self, graph: fx.Graph):
+        shape = get_pass_context().runtime_shape
+        for pass_ in self.passes:
+            if pass_.is_applicable_for_shape(shape):
+                pass_(graph)
+
+        # always run fix_functionalization last
+        self.fix_functionalization(graph)
+
+    def configure(
+        self,
+    ):
+        self.pass_config = dict()
+        self.fix_functionalization = FixFunctionalizationPass()
+
+    def add(self, pass_: InductorPass):
+        assert isinstance(pass_, InductorPass)
+        self.passes.append(pass_)
+
+    def uuid(self):
+        """
+        The PostGradPassManager is set as a custom pass in the Inductor and
+        affects compilation caching. Its uuid depends on the UUIDs of all
+        dependent passes and the pass config. See InductorPass for more info.
+        """
+        pass_manager_uuid = "fshdakhsa"
+        state = {"pass_config": pass_manager_uuid, "passes": []}
+        for pass_ in self.passes:
+            state["passes"].append(pass_.uuid())
+        state["passes"].append(self.fix_functionalization.uuid())
+        return InductorPass.hash_dict(state)
diff --git a/sglang/python/sglang/srt/compilation/piecewise_context_manager.py b/sglang/python/sglang/srt/compilation/piecewise_context_manager.py
new file mode 100644
index 0000000000000000000000000000000000000000..20a08a9972b9e5f1e730aa5bea7d8621f72b816b
--- /dev/null
+++ b/sglang/python/sglang/srt/compilation/piecewise_context_manager.py
@@ -0,0 +1,125 @@
+from __future__ import annotations
+
+import logging
+from contextlib import contextmanager
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, List, Optional
+
+import torch
+
+logger = logging.getLogger(__name__)
+
+
+if TYPE_CHECKING:
+    from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+
+_in_piecewise_cuda_graph = False
+_in_pcg_torch_compile = False
+_pcg_capture_stream = None
+
+
+def is_in_piecewise_cuda_graph():
+    return _in_piecewise_cuda_graph
+
+
+def is_in_pcg_torch_compile():
+    return _in_pcg_torch_compile
+
+
+def get_pcg_capture_stream():
+    return _pcg_capture_stream
+
+
+@contextmanager
+def enable_piecewise_cuda_graph_compile():
+    global _in_pcg_torch_compile
+    _in_pcg_torch_compile = True
+    yield
+    _in_pcg_torch_compile = False
+
+
+@contextmanager
+def enable_piecewise_cuda_graph():
+    global _in_piecewise_cuda_graph
+    _in_piecewise_cuda_graph = True
+    try:
+        yield
+    except Exception as e:
+        logger.error(
+            "Piecewise CUDA Graph failed with error: %s\n%s",
+            e,
+            PIECEWISE_CUDA_GRAPH_CAPTURE_FAILED_MSG,
+        )
+        raise
+    finally:
+        _in_piecewise_cuda_graph = False
+
+
+@contextmanager
+def set_pcg_capture_stream(stream: torch.cuda.Stream):
+    global _pcg_capture_stream
+    _pcg_capture_stream = stream
+    yield
+    _pcg_capture_stream = None
+
+
+@dataclass
+class ForwardContext:
+    def __init__(self):
+        self.forward_batch = None
+        self.attention_layers = None
+        self.quant_config = None
+        self.moe_layers = None
+        self.moe_fusions = None
+
+    def set_forward_batch(self, forward_batch: ForwardBatch):
+        self.forward_batch = forward_batch
+
+    def set_attention_layers(self, layers: List[Any]):
+        self.attention_layers = layers
+
+    def set_quant_config(self, quant_config: Any):
+        self.quant_config = quant_config
+
+    def set_moe_layers(self, layers: List[Any]):
+        self.moe_layers = layers
+
+    def set_moe_fusions(self, fusions: List[Any]):
+        self.moe_fusions = fusions
+
+
+_forward_context: Optional[ForwardContext] = None
+
+
+def get_forward_context() -> Optional[ForwardContext]:
+    if _forward_context is None:
+        return None
+    return _forward_context
+
+
+@contextmanager
+def set_forward_context(
+    forward_batch: ForwardBatch,
+    attention_layers: List[Any],
+    quant_config: Any,
+    moe_layers: List[Any],
+    moe_fusions: List[Any],
+):
+    global _forward_context
+    _forward_context = ForwardContext()
+    _forward_context.set_forward_batch(forward_batch)
+    _forward_context.set_attention_layers(attention_layers)
+    _forward_context.set_quant_config(quant_config)
+    _forward_context.set_moe_layers(moe_layers)
+    _forward_context.set_moe_fusions(moe_fusions)
+    try:
+        yield
+    finally:
+        _forward_context = None
+
+
+PIECEWISE_CUDA_GRAPH_CAPTURE_FAILED_MSG = (
+    "Piecewise CUDA Graph is enabled by default as an experimental feature.\n"
+    "To work around this error, add --disable-piecewise-cuda-graph to your launch command.\n"
+    "Please report this issue at https://github.com/sgl-project/sglang/issues/new/choose"
+)
diff --git a/sglang/python/sglang/srt/compilation/weak_ref_tensor.py b/sglang/python/sglang/srt/compilation/weak_ref_tensor.py
new file mode 100644
index 0000000000000000000000000000000000000000..3564849c74075c45cc3fd5c43355fec2c603ab77
--- /dev/null
+++ b/sglang/python/sglang/srt/compilation/weak_ref_tensor.py
@@ -0,0 +1,28 @@
+from typing import Any, Union
+
+import torch
+
+from sglang.srt.utils.common import is_cuda, is_hip, is_npu
+
+if is_cuda() or is_hip():
+    from sgl_kernel import weak_ref_tensor
+elif is_npu():
+    from torch_npu._C import _weak_ref_tensor as weak_ref_tensor
+else:
+    raise NotImplementedError("weak_ref_tensor is implemented only for CUDA and NPU.")
+
+
+def weak_ref_tensors(
+    tensors: Union[torch.Tensor, list[torch.Tensor], tuple[torch.Tensor]],
+) -> Union[torch.Tensor, list[Any], tuple[Any], Any]:
+    """
+    Convenience function to create weak references to tensors,
+    for single tensor, list of tensors or tuple of tensors.
+    """
+    if isinstance(tensors, torch.Tensor):
+        return weak_ref_tensor(tensors)
+    if isinstance(tensors, list):
+        return [weak_ref_tensor(t) for t in tensors]
+    if isinstance(tensors, tuple):
+        return tuple(weak_ref_tensor(t) for t in tensors)
+    raise ValueError("Invalid type for tensors")
diff --git a/sglang/python/sglang/srt/configs/__init__.py b/sglang/python/sglang/srt/configs/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a3b37f54c0cdba156bf2b36a4617dc2462bad6d
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/__init__.py
@@ -0,0 +1,64 @@
+from sglang.srt.configs.afmoe import AfmoeConfig
+from sglang.srt.configs.bailing_hybrid import BailingHybridConfig
+from sglang.srt.configs.chatglm import ChatGLMConfig
+from sglang.srt.configs.dbrx import DbrxConfig
+from sglang.srt.configs.deepseekvl2 import DeepseekVL2Config
+from sglang.srt.configs.dots_ocr import DotsOCRConfig
+from sglang.srt.configs.dots_vlm import DotsVLMConfig
+from sglang.srt.configs.exaone import ExaoneConfig
+from sglang.srt.configs.falcon_h1 import FalconH1Config
+from sglang.srt.configs.granitemoehybrid import GraniteMoeHybridConfig
+from sglang.srt.configs.janus_pro import MultiModalityConfig
+from sglang.srt.configs.jet_nemotron import JetNemotronConfig
+from sglang.srt.configs.jet_vlm import JetVLMConfig
+from sglang.srt.configs.kimi_k25 import KimiK25Config
+from sglang.srt.configs.kimi_linear import KimiLinearConfig
+from sglang.srt.configs.kimi_vl import KimiVLConfig
+from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
+from sglang.srt.configs.lfm2 import Lfm2Config
+from sglang.srt.configs.lfm2_moe import Lfm2MoeConfig
+from sglang.srt.configs.longcat_flash import LongcatFlashConfig
+from sglang.srt.configs.nano_nemotron_vl import NemotronH_Nano_VL_V2_Config
+from sglang.srt.configs.nemotron_h import NemotronHConfig
+from sglang.srt.configs.olmo3 import Olmo3Config
+from sglang.srt.configs.qwen3_5 import Qwen3_5Config, Qwen3_5MoeConfig
+from sglang.srt.configs.qwen3_next import Qwen3NextConfig
+from sglang.srt.configs.step3_vl import (
+    Step3TextConfig,
+    Step3VisionEncoderConfig,
+    Step3VLConfig,
+)
+from sglang.srt.configs.step3p5 import Step3p5Config
+
+__all__ = [
+    "AfmoeConfig",
+    "BailingHybridConfig",
+    "ExaoneConfig",
+    "ChatGLMConfig",
+    "DbrxConfig",
+    "DeepseekVL2Config",
+    "LongcatFlashConfig",
+    "MultiModalityConfig",
+    "KimiVLConfig",
+    "MoonViTConfig",
+    "Step3VLConfig",
+    "Step3TextConfig",
+    "Step3VisionEncoderConfig",
+    "Olmo3Config",
+    "KimiLinearConfig",
+    "KimiK25Config",
+    "Qwen3NextConfig",
+    "Qwen3_5Config",
+    "Qwen3_5MoeConfig",
+    "DotsVLMConfig",
+    "DotsOCRConfig",
+    "FalconH1Config",
+    "GraniteMoeHybridConfig",
+    "Lfm2Config",
+    "Lfm2MoeConfig",
+    "NemotronHConfig",
+    "NemotronH_Nano_VL_V2_Config",
+    "JetNemotronConfig",
+    "JetVLMConfig",
+    "Step3p5Config",
+]
diff --git a/sglang/python/sglang/srt/configs/afmoe.py b/sglang/python/sglang/srt/configs/afmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..eab477339bc04b9f72e46e23e3e19d43df4b0ba2
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/afmoe.py
@@ -0,0 +1,102 @@
+from typing import List, Optional
+
+from transformers import PretrainedConfig
+
+
+class AfmoeConfig(PretrainedConfig):
+    model_type = "afmoe"
+
+    def __init__(
+        self,
+        vocab_size: int = 32000,
+        hidden_size: int = 4096,
+        intermediate_size: int = 11008,
+        moe_intermediate_size: int = 256,
+        num_hidden_layers: int = 32,
+        num_attention_heads: int = 32,
+        num_key_value_heads: Optional[int] = None,
+        head_dim: Optional[int] = None,
+        hidden_act: str = "silu",
+        max_position_embeddings: int = 131072,
+        initializer_range: float = 0.02,
+        rms_norm_eps: float = 1e-5,
+        use_cache: bool = True,
+        pad_token_id: Optional[int] = None,
+        bos_token_id: int = 1,
+        eos_token_id: int = 2,
+        tie_word_embeddings: bool = False,
+        rope_theta: float = 10000.0,
+        rope_scaling: Optional[dict] = None,
+        attention_bias: bool = False,
+        attention_dropout: float = 0.0,
+        # MoE parameters
+        num_experts: Optional[int] = None,
+        num_experts_per_tok: Optional[int] = None,
+        num_shared_experts: int = 0,
+        num_dense_layers: int = 0,
+        # Routing parameters
+        score_func: str = "sigmoid",
+        route_norm: bool = True,
+        route_scale: float = 1.0,
+        n_group: int = 1,
+        topk_group: int = 1,
+        # Attention parameters
+        sliding_window: Optional[int] = None,
+        layer_types: Optional[List[str]] = None,
+        global_attn_every_n_layers: int = 4,
+        # muP scaling
+        mup_enabled: bool = False,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.moe_intermediate_size = moe_intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = (
+            head_dim if head_dim is not None else hidden_size // num_attention_heads
+        )
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+
+        # MoE parameters
+        self.num_experts = num_experts
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_shared_experts = num_shared_experts
+        self.num_dense_layers = num_dense_layers
+
+        # Routing parameters
+        self.score_func = score_func
+        self.route_norm = route_norm
+        self.route_scale = route_scale
+        self.n_group = n_group
+        self.topk_group = topk_group
+
+        # Attention parameters
+        self.sliding_window = sliding_window
+        self.layer_types = layer_types
+        self.global_attn_every_n_layers = global_attn_every_n_layers
+
+        # muP scaling
+        self.mup_enabled = mup_enabled
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
diff --git a/sglang/python/sglang/srt/configs/bailing_hybrid.py b/sglang/python/sglang/srt/configs/bailing_hybrid.py
new file mode 100644
index 0000000000000000000000000000000000000000..40933d90a73cf2de2c16079028dc513ee27edf1e
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/bailing_hybrid.py
@@ -0,0 +1,188 @@
+# coding=utf-8
+# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""BailingHybrid model configuration"""
+
+import enum
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+from sglang.srt.configs.mamba_utils import Mamba2CacheParams, Mamba2StateShape
+
+logger = logging.get_logger(__name__)
+
+
+class HybridLayerType(enum.Enum):
+    full_attention = "attention"
+    linear_attention = "linear_attention"
+
+
+class BailingHybridConfig(PretrainedConfig):
+
+    model_type = "bailing_hybrid"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=157184,
+        hidden_size=2048,
+        intermediate_size=5120,
+        num_hidden_layers=20,
+        num_attention_heads=16,
+        num_key_value_heads=4,
+        hidden_act="silu",
+        use_qkv_bias=False,  # bailing only
+        use_bias=False,  # bailing only
+        rms_norm_eps=1e-06,
+        tie_word_embeddings=False,  # PretrainedConfig key, here change default value.
+        embedding_dropout=0.0,
+        attention_dropout=0.0,
+        output_dropout=0.0,
+        initializer_range=0.02,
+        max_position_embeddings=32768,
+        rope_theta=600000.0,
+        use_cache=True,
+        max_window_layers=20,
+        rope_scaling=None,
+        pad_token_id=156892,
+        eos_token_id=156892,
+        num_experts=256,
+        num_shared_experts=1,
+        num_experts_per_tok=8,
+        n_group=8,
+        topk_group=4,
+        moe_intermediate_size=512,
+        first_k_dense_replace=1,
+        head_dim=128,
+        output_router_logits=False,
+        use_qk_norm=True,
+        num_nextn_predict_layers=0,
+        mtp_loss_scaling_factor=0,
+        moe_router_enable_expert_bias=True,
+        routed_scaling_factor=1.0,
+        layer_group_size=1,
+        group_norm_size=1,
+        linear_silu=False,
+        kv_lora_rank=512,
+        q_lora_rank=None,
+        qk_rope_head_dim=64,
+        v_head_dim=128,
+        qk_nope_head_dim=128,
+        rope_interleave=True,
+        **kwargs,
+    ):
+        self.num_hidden_layers = num_hidden_layers
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.use_qkv_bias = use_qkv_bias
+        self.use_bias = use_bias
+        self.rms_norm_eps = rms_norm_eps
+        self.embedding_dropout = embedding_dropout
+        self.attention_dropout = attention_dropout
+        self.output_dropout = output_dropout
+        self.num_nextn_predict_layers = num_nextn_predict_layers
+        self.mtp_loss_scaling_factor = mtp_loss_scaling_factor
+        self.initializer_range = initializer_range
+        self.max_position_embeddings = max_position_embeddings
+        self.rope_theta = rope_theta
+        self.use_cache = use_cache
+        self.max_window_layers = max_window_layers
+        self.head_dim = head_dim or self.hidden_size // self.num_attention_heads
+        self.rope_scaling = rope_scaling
+        self.use_qk_norm = use_qk_norm
+        self.moe_router_enable_expert_bias = moe_router_enable_expert_bias
+        self.routed_scaling_factor = routed_scaling_factor
+
+        # MoE configs
+        self.num_experts = num_experts
+        self.num_shared_experts = num_shared_experts
+        self.num_experts_per_tok = num_experts_per_tok
+        self.n_group = n_group
+        self.topk_group = topk_group
+        self.moe_intermediate_size = moe_intermediate_size
+        self.first_k_dense_replace = first_k_dense_replace
+        self.output_router_logits = output_router_logits
+
+        # Linear configs
+        self.layer_group_size = layer_group_size
+        self.group_norm_size = group_norm_size
+        self.linear_silu = linear_silu
+        self.num_linear_key_value_heads = num_attention_heads
+        # mla
+        self.kv_lora_rank = kv_lora_rank
+        self.q_lora_rank = q_lora_rank
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
+        self.rope_interleave = rope_interleave
+        self.for_nextn_model = False
+        super().__init__(
+            pad_token_id=pad_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    @property
+    def layers_block_type(self):
+        if self.for_nextn_model:
+            return [HybridLayerType.full_attention.value]
+
+        layer_type_list = []
+
+        for l in range(self.num_hidden_layers):
+            if (l + 1) % self.layer_group_size == 0:
+                layer_type_list.append(HybridLayerType.full_attention.value)
+            else:
+                layer_type_list.append(HybridLayerType.linear_attention.value)
+
+        return layer_type_list
+
+    @property
+    def linear_layer_ids(self):
+        return [
+            i
+            for i, type_value in enumerate(self.layers_block_type)
+            if type_value == HybridLayerType.linear_attention.value
+        ]
+
+    @property
+    def full_attention_layer_ids(self):
+        return [
+            i
+            for i, type_value in enumerate(self.layers_block_type)
+            if type_value == HybridLayerType.full_attention.value
+        ]
+
+    @property
+    def mamba2_cache_params(self) -> Mamba2CacheParams:
+        from sglang.srt.layers.dp_attention import get_attention_tp_size
+
+        shape = Mamba2StateShape.create(
+            tp_world_size=get_attention_tp_size(),
+            intermediate_size=0,
+            n_groups=0,
+            num_heads=self.num_linear_key_value_heads,
+            head_dim=self.head_dim,
+            state_size=self.head_dim,
+            conv_kernel=1,
+        )
+
+        return Mamba2CacheParams(shape=shape, layers=self.linear_layer_ids)
diff --git a/sglang/python/sglang/srt/configs/chatglm.py b/sglang/python/sglang/srt/configs/chatglm.py
new file mode 100644
index 0000000000000000000000000000000000000000..9370c218aab8405944c81b6ddf59f6d61d7c999f
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/chatglm.py
@@ -0,0 +1,78 @@
+# Adapted from
+# https://github.com/THUDM/ChatGLM2-6B
+# https://github.com/vllm-project/vllm/blob/main/vllm/transformers_utils/configs/chatglm.py
+
+# ChatGLM2 and ChatGLM3 share the same config.
+# ChatGLM4 is officially supported by Huggingface
+# transformers >= 4.46.0 is required
+# https://huggingface.co/docs/transformers/en/model_doc/glm
+from transformers import PretrainedConfig
+
+
+class ChatGLMConfig(PretrainedConfig):
+    model_type = "chatglm"
+    attribute_map = {
+        "num_hidden_layers": "num_layers",
+        "n_head_kv": "multi_query_group_num",
+    }
+
+    def __init__(
+        self,
+        num_layers=28,
+        padded_vocab_size=65024,
+        hidden_size=4096,
+        ffn_hidden_size=13696,
+        kv_channels=128,
+        num_attention_heads=32,
+        seq_length=2048,
+        hidden_dropout=0.0,
+        attention_dropout=0.0,
+        layernorm_epsilon=1e-5,
+        rmsnorm=True,
+        apply_residual_connection_post_layernorm=False,
+        post_layer_norm=True,
+        add_bias_linear=False,
+        add_qkv_bias=False,
+        interleaved_qkv=False,
+        bias_dropout_fusion=True,
+        multi_query_attention=False,
+        multi_query_group_num=1,
+        apply_query_key_layer_scaling=True,
+        attention_softmax_in_fp32=True,
+        fp32_residual_connection=False,
+        quantization_bit=0,
+        pre_seq_len=None,
+        prefix_projection=False,
+        **kwargs
+    ):
+        self.num_layers = num_layers
+        self.vocab_size = padded_vocab_size
+        self.padded_vocab_size = padded_vocab_size
+        self.hidden_size = hidden_size
+        self.ffn_hidden_size = ffn_hidden_size
+        self.kv_channels = kv_channels
+        self.num_attention_heads = num_attention_heads
+        self.seq_length = seq_length
+        # It is to be compatible with long lora.
+        self.max_position_embeddings = seq_length
+        self.hidden_dropout = hidden_dropout
+        self.attention_dropout = attention_dropout
+        self.layernorm_epsilon = layernorm_epsilon
+        self.rmsnorm = rmsnorm
+        self.apply_residual_connection_post_layernorm = (
+            apply_residual_connection_post_layernorm
+        )
+        self.post_layer_norm = post_layer_norm
+        self.add_bias_linear = add_bias_linear
+        self.add_qkv_bias = add_qkv_bias
+        self.bias_dropout_fusion = bias_dropout_fusion
+        self.multi_query_attention = multi_query_attention
+        self.multi_query_group_num = multi_query_group_num
+        self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
+        self.attention_softmax_in_fp32 = attention_softmax_in_fp32
+        self.fp32_residual_connection = fp32_residual_connection
+        self.quantization_bit = quantization_bit
+        self.pre_seq_len = pre_seq_len
+        self.prefix_projection = prefix_projection
+        self.interleaved_qkv = interleaved_qkv
+        super().__init__(**kwargs)
diff --git a/sglang/python/sglang/srt/configs/dbrx.py b/sglang/python/sglang/srt/configs/dbrx.py
new file mode 100644
index 0000000000000000000000000000000000000000..75ccbde944ea684689c656aaa75d3aeb1681c188
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/dbrx.py
@@ -0,0 +1,279 @@
+# Adapted from
+# https://huggingface.co/databricks/dbrx-base/blob/main/configuration_dbrx.py
+# https://github.com/vllm-project/vllm/blob/main/vllm/transformers_utils/configs/dbrx.py
+"""Dbrx configuration."""
+
+from typing import Any, Optional
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP = {}  # type: ignore
+
+
+class DbrxAttentionConfig(PretrainedConfig):
+    """Configuration class for Dbrx Attention.
+
+    [`DbrxAttention`] class. It is used to instantiate attention layers
+    according to the specified arguments, defining the layers architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        attn_pdrop (`float`, *optional*, defaults to 0.0):
+            The dropout probability for the attention layers.
+        clip_qkv (`float`, *optional*, defaults to None):
+            If not `None`, clip the queries, keys, and values in the attention layer to this value.
+        kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads.
+        rope_theta (float): The base frequency for rope.
+    """
+
+    def __init__(
+        self,
+        attn_pdrop: float = 0,
+        clip_qkv: Optional[float] = None,
+        kv_n_heads: int = 1,
+        rope_theta: float = 10000.0,
+        **kwargs: Any,
+    ):
+        super().__init__(**kwargs)
+        self.attn_pdrop = attn_pdrop
+        self.clip_qkv = clip_qkv
+        self.kv_n_heads = kv_n_heads
+        self.rope_theta = rope_theta
+
+        for k in ["model_type"]:
+            if k in kwargs:
+                kwargs.pop(k)
+        if len(kwargs) != 0:
+            raise ValueError(f"Found unknown {kwargs=}")
+
+    @classmethod
+    def from_pretrained(
+        cls, pretrained_model_name_or_path: str, **kwargs: Any
+    ) -> "PretrainedConfig":
+        cls._set_token_in_kwargs(kwargs)
+
+        config_dict, kwargs = cls.get_config_dict(
+            pretrained_model_name_or_path, **kwargs
+        )
+
+        if config_dict.get("model_type") == "dbrx":
+            config_dict = config_dict["attn_config"]
+
+        if (
+            "model_type" in config_dict
+            and hasattr(cls, "model_type")
+            and config_dict["model_type"] != cls.model_type
+        ):
+            logger.warning(
+                "You are using a model of type %s to instantiate a model of "
+                "type %s. This is not supported for all configurations of "
+                "models and can yield errors.",
+                config_dict["model_type"],
+                cls.model_type,
+            )
+
+        return cls.from_dict(config_dict, **kwargs)
+
+
+class DbrxFFNConfig(PretrainedConfig):
+    """Configuration class for Dbrx FFN.
+
+    [`DbrxFFN`] class. It is used to instantiate feedforward layers according to
+    the specified arguments, defining the layers architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        ffn_act_fn (dict, optional): A dict specifying activation function for the FFN.
+            The dict should have a key 'name' with the value being the name of
+            the activation function along with any additional keyword arguments.
+        ffn_hidden_size (int, optional): The hidden size of the feedforward network.
+        moe_num_experts (int, optional): The number of experts in the mixture of experts layer.
+        moe_top_k (int, optional): The number of experts to use in the mixture of experts layer.
+        moe_jitter_eps (float, optional): The jitter epsilon for the mixture of experts layer.
+        moe_loss_weight (float, optional): The loss weight for the mixture of experts layer.
+        moe_normalize_expert_weights (float, optional): The normalization factor for the expert weights.
+        uniform_expert_assignment (bool, optional): Whether to use uniform expert assignment.
+            This should only be used for benchmarking purposes.
+    """
+
+    def __init__(
+        self,
+        ffn_act_fn: Optional[dict] = None,
+        ffn_hidden_size: int = 3584,
+        moe_num_experts: int = 4,
+        moe_top_k: int = 1,
+        moe_jitter_eps: Optional[float] = None,
+        moe_loss_weight: float = 0.01,
+        moe_normalize_expert_weights: Optional[float] = 1,
+        uniform_expert_assignment: bool = False,
+        **kwargs: Any,
+    ):
+        super().__init__()
+        if ffn_act_fn is None:
+            ffn_act_fn = {"name": "silu"}
+        self.ffn_act_fn = ffn_act_fn
+        self.ffn_hidden_size = ffn_hidden_size
+        self.moe_num_experts = moe_num_experts
+        self.moe_top_k = moe_top_k
+        self.moe_jitter_eps = moe_jitter_eps
+        self.moe_loss_weight = moe_loss_weight
+        self.moe_normalize_expert_weights = moe_normalize_expert_weights
+        self.uniform_expert_assignment = uniform_expert_assignment
+
+        for k in ["model_type"]:
+            if k in kwargs:
+                kwargs.pop(k)
+        if len(kwargs) != 0:
+            raise ValueError(f"Found unknown {kwargs=}")
+
+    @classmethod
+    def from_pretrained(
+        cls, pretrained_model_name_or_path: str, **kwargs: Any
+    ) -> "PretrainedConfig":
+        cls._set_token_in_kwargs(kwargs)
+
+        config_dict, kwargs = cls.get_config_dict(
+            pretrained_model_name_or_path, **kwargs
+        )
+
+        if config_dict.get("model_type") == "dbrx":
+            config_dict = config_dict["ffn_config"]
+
+        if (
+            "model_type" in config_dict
+            and hasattr(cls, "model_type")
+            and config_dict["model_type"] != cls.model_type
+        ):
+            logger.warning(
+                "You are using a model of type %s to instantiate a model of "
+                "type %s. This is not supported for all "
+                "configurations of models and can yield errors.",
+                config_dict["model_type"],
+                cls.model_type,
+            )
+
+        return cls.from_dict(config_dict, **kwargs)
+
+
+class DbrxConfig(PretrainedConfig):
+    """Configuration class for Dbrx.
+
+    [`DbrxModel`]. It is used to instantiate a Dbrx model according to the
+    specified arguments, defining the model architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        d_model (`int`, *optional*, defaults to 6144):
+            Dimensionality of the embeddings and hidden states.
+        n_heads (`int`, *optional*, defaults to 48):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        n_layers (`int`, *optional*, defaults to 40):
+            Number of hidden layers in the Transformer encoder.
+        max_seq_len (`int`, *optional*, defaults to 32768):
+            The maximum sequence length of the model.
+        vocab_size (`int`, *optional*, defaults to 100352):
+            Vocabulary size of the Dbrx model. Defines the maximum number of different tokens that can be represented by
+            the `inputs_ids` passed when calling [`DbrxModel`].
+        resid_pdrop (`float`, *optional*, defaults to 0.0):
+            The dropout probability applied to the attention output before combining with residual.
+        emb_pdrop (`float`, *optional*, defaults to 0.0):
+            The dropout probability for the embedding layer.
+        attn_config (`dict`, *optional*):
+            A dictionary used to configure the model's attention module.
+        ffn_config (`dict`, *optional*):
+            A dictionary used to configure the model's FFN module.
+        use_cache (`bool`, *optional*, defaults to `False`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        output_router_logits (`bool`, *optional*, defaults to `False`):
+            Whether or not the router logits should be returned by the model. Enabling this will also
+            allow the model to output the auxiliary loss. See [here]() for more details
+        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
+            The aux loss factor for the total loss.
+
+
+    Example:
+    ```python
+    >>> from transformers import DbrxConfig, DbrxModel
+
+    >>> # Initializing a Dbrx configuration
+    >>> configuration = DbrxConfig()
+
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = DbrxModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+
+    model_type = "dbrx"
+    attribute_map = {
+        "num_attention_heads": "n_heads",
+        "hidden_size": "d_model",
+        "num_hidden_layers": "n_layers",
+        "max_position_embeddings": "max_seq_len",
+    }
+
+    def __init__(
+        self,
+        d_model: int = 2048,
+        n_heads: int = 16,
+        n_layers: int = 24,
+        max_seq_len: int = 2048,
+        vocab_size: int = 32000,
+        resid_pdrop: float = 0.0,
+        emb_pdrop: float = 0.0,
+        attn_config: Optional[DbrxAttentionConfig] = None,
+        ffn_config: Optional[DbrxFFNConfig] = None,
+        use_cache: bool = True,
+        initializer_range: float = 0.02,
+        output_router_logits: bool = False,
+        router_aux_loss_coef: float = 0.05,
+        **kwargs: Any,
+    ):
+        if attn_config is None:
+            self.attn_config = DbrxAttentionConfig()
+        elif isinstance(attn_config, dict):
+            self.attn_config = DbrxAttentionConfig(**attn_config)
+        else:
+            self.attn_config = attn_config
+
+        if ffn_config is None:
+            self.ffn_config = DbrxFFNConfig()
+        elif isinstance(ffn_config, dict):
+            self.ffn_config = DbrxFFNConfig(**ffn_config)
+        else:
+            self.ffn_config = ffn_config
+
+        self.d_model = d_model
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.max_seq_len = max_seq_len
+        self.vocab_size = vocab_size
+        self.resid_pdrop = resid_pdrop
+        self.emb_pdrop = emb_pdrop
+        self.use_cache = use_cache
+        self.initializer_range = initializer_range
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef
+
+        tie_word_embeddings = kwargs.pop("tie_word_embeddings", False)
+        if tie_word_embeddings:
+            raise ValueError("tie_word_embeddings is not supported for Dbrx models.")
+
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
diff --git a/sglang/python/sglang/srt/configs/deepseek_ocr.py b/sglang/python/sglang/srt/configs/deepseek_ocr.py
new file mode 100644
index 0000000000000000000000000000000000000000..1677423d1811775221b48d257b65865e3cf4c6c9
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/deepseek_ocr.py
@@ -0,0 +1,817 @@
+import math
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple
+
+import torch
+from PIL import Image, ImageOps
+from transformers import (
+    AutoProcessor,
+    LlamaTokenizerFast,
+    PretrainedConfig,
+    ProcessorMixin,
+)
+
+from sglang.srt.multimodal.customized_mm_processor_utils import (
+    register_customized_processor,
+)
+from sglang.srt.sampling.custom_logit_processor import (
+    DeepseekOCRNoRepeatNGramLogitProcessor,
+)
+
+BASE_SIZE = 1024
+IMAGE_SIZE = 640
+CROP_MODE = True
+MIN_CROPS = 2
+MAX_CROPS = 6  # max:9; If your GPU memory is small, it is recommended to set it to 6.
+MAX_CONCURRENCY = 100  # If you have limited GPU memory, lower the concurrency count.
+NUM_WORKERS = 64  # image pre-process (resize/padding) workers
+PRINT_NUM_VIS_TOKENS = False
+SKIP_REPEAT = True
+MODEL_PATH = "deepseek-ai/DeepSeek-OCR"  # change to your model path
+
+NGRAM_NO_REPEAT_SIZE = 30
+NGRAM_NO_REPEAT_WINDOW = 90
+# Whitelist `<td>` and `</td>` token ids to allow table structures.
+NGRAM_NO_REPEAT_WHITELIST = (128821, 128822)
+
+DEFAULT_CUSTOM_LOGIT_PROCESSOR = DeepseekOCRNoRepeatNGramLogitProcessor.to_str()
+
+
+def get_default_ngram_custom_params() -> Dict[str, Any]:
+    """Return default custom params for the DeepSeek-OCR n-gram no repeat processor."""
+
+    return {
+        "ngram_size": NGRAM_NO_REPEAT_SIZE,
+        "window_size": NGRAM_NO_REPEAT_WINDOW,
+        "whitelist_token_ids": list(NGRAM_NO_REPEAT_WHITELIST),
+    }
+
+
+PROMPT = "<image>\n<|grounding|>Convert the document to markdown."
+
+
+class DictOutput(object):
+    def items(self):
+        return self.__dict__.items()
+
+    def keys(self):
+        return self.__dict__.keys()
+
+    def __getitem__(self, item):
+        return self.__dict__[item]
+
+    def __contains__(self, key):
+        return key in self.__dict__
+
+    def __setitem__(self, key, value):
+        self.__dict__[key] = value
+
+
+@dataclass
+class VLChatProcessorOutput(DictOutput):
+    input_ids: torch.LongTensor
+    target_ids: torch.LongTensor
+    images_crop: torch.LongTensor
+    pixel_values: (
+        torch.Tensor
+    )  # rename from "images" to "pixel_values" for compatibility
+    images_seq_mask: torch.BoolTensor
+    images_spatial_crop: torch.LongTensor
+
+    def __len__(self):
+        return len(self.input_ids)
+
+
+class ImageTransform(object):
+    def __init__(
+        self,
+        mean: Optional[Tuple[float, float, float]] = (0.5, 0.5, 0.5),
+        std: Optional[Tuple[float, float, float]] = (0.5, 0.5, 0.5),
+        normalize: bool = True,
+    ):
+        self.mean = mean
+        self.std = std
+        self.normalize = normalize
+
+        # only load torchvision.transforms when needed
+        try:
+            import torchvision.transforms as T
+
+            # FIXME: add version check for gguf
+        except ImportError as err:
+            raise ImportError(
+                "Please install torchvision via `pip install torchvision` to use Deepseek-VL2."
+            ) from err
+
+        transform_pipelines = [T.ToTensor()]
+
+        if normalize:
+            transform_pipelines.append(T.Normalize(mean, std))
+
+        self.transform = T.Compose(transform_pipelines)
+
+    def __call__(self, pil_img: Image.Image):
+        x = self.transform(pil_img)
+        return x
+
+
+def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
+    best_ratio_diff = float("inf")
+    best_ratio = (1, 1)
+    area = width * height
+    for ratio in target_ratios:
+        target_aspect_ratio = ratio[0] / ratio[1]
+        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
+        if ratio_diff < best_ratio_diff:
+            best_ratio_diff = ratio_diff
+            best_ratio = ratio
+        elif ratio_diff == best_ratio_diff:
+            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
+                best_ratio = ratio
+    return best_ratio
+
+
+def dynamic_preprocess(
+    image, min_num=MIN_CROPS, max_num=MAX_CROPS, image_size=640, use_thumbnail=False
+):
+    orig_width, orig_height = image.size
+    aspect_ratio = orig_width / orig_height
+
+    # calculate the existing image aspect ratio
+    target_ratios = set(
+        (i, j)
+        for n in range(min_num, max_num + 1)
+        for i in range(1, n + 1)
+        for j in range(1, n + 1)
+        if i * j <= max_num and i * j >= min_num
+    )
+    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+    # find the closest aspect ratio to the target
+    target_aspect_ratio = find_closest_aspect_ratio(
+        aspect_ratio, target_ratios, orig_width, orig_height, image_size
+    )
+
+    # calculate the target width and height
+    target_width = image_size * target_aspect_ratio[0]
+    target_height = image_size * target_aspect_ratio[1]
+    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+
+    # resize the image
+    resized_img = image.resize((target_width, target_height))
+    processed_images = []
+    for i in range(blocks):
+        box = (
+            (i % (target_width // image_size)) * image_size,
+            (i // (target_width // image_size)) * image_size,
+            ((i % (target_width // image_size)) + 1) * image_size,
+            ((i // (target_width // image_size)) + 1) * image_size,
+        )
+        # split the image
+        split_img = resized_img.crop(box)
+        processed_images.append(split_img)
+    assert len(processed_images) == blocks
+    if use_thumbnail and len(processed_images) != 1:
+        thumbnail_img = image.resize((image_size, image_size))
+        processed_images.append(thumbnail_img)
+    return processed_images, target_aspect_ratio
+
+
+class DeepseekOCRProcessor(ProcessorMixin):
+    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
+    attributes = ["tokenizer"]
+
+    def __init__(
+        self,
+        tokenizer: LlamaTokenizerFast,
+        candidate_resolutions: Tuple[Tuple[int, int]],
+        patch_size: int,
+        downsample_ratio: int,
+        image_mean: Tuple[float, float, float] = (0.5, 0.5, 0.5),
+        image_std: Tuple[float, float, float] = (0.5, 0.5, 0.5),
+        normalize: bool = True,
+        image_token: str = "<image>",
+        pad_token: str = "<｜▁pad▁｜>",
+        add_special_token: bool = False,
+        sft_format: str = "deepseek",
+        mask_prompt: bool = True,
+        ignore_id: int = -100,
+        ocr2_mode: bool = False,
+        **kwargs,
+    ):
+
+        self.candidate_resolutions = candidate_resolutions
+        self.image_size = candidate_resolutions[0][0]
+        self.patch_size = patch_size
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.normalize = normalize
+        self.downsample_ratio = downsample_ratio
+        self.base_size = BASE_SIZE
+        self.image_transform = ImageTransform(
+            mean=image_mean, std=image_std, normalize=normalize
+        )
+        self.tokenizer = tokenizer
+        # must set this，padding side with make a difference in batch inference
+        self.tokenizer.padding_side = "left"
+
+        # add the pad_token as special token to use 'tokenizer.pad_token' and 'tokenizer.pad_token_id'
+        if tokenizer.pad_token is None:
+            self.tokenizer.add_special_tokens({"pad_token": pad_token})
+
+        # add image token
+        image_token_id = self.tokenizer.vocab.get(image_token)
+        if image_token_id is None:
+            special_tokens = [image_token]
+            special_tokens_dict = {"additional_special_tokens": special_tokens}
+            self.tokenizer.add_special_tokens(special_tokens_dict)
+        self.image_token_id = self.tokenizer.vocab.get(image_token)
+
+        # add five special tokens for grounding-related tasks
+        # <|ref|>, <|/ref|>, <|det|>, <|/det|>, <|grounding|>
+        special_tokens = ["<|ref|>", "<|/ref|>", "<|det|>", "<|/det|>", "<|grounding|>"]
+        special_tokens_dict = {"additional_special_tokens": special_tokens}
+        self.tokenizer.add_special_tokens(special_tokens_dict)
+
+        # add special tokens for SFT data
+        special_tokens = ["<|User|>", "<|Assistant|>"]
+        special_tokens_dict = {"additional_special_tokens": special_tokens}
+        self.tokenizer.add_special_tokens(special_tokens_dict)
+
+        self.image_token = image_token
+        self.pad_token = pad_token
+        self.add_special_token = add_special_token
+        self.sft_format = sft_format
+        self.mask_prompt = mask_prompt
+        self.ignore_id = ignore_id
+        self.ocr2_mode = ocr2_mode
+
+        super().__init__(
+            tokenizer,
+            **kwargs,
+        )
+
+    def format_messages_v2(self, messages: str, pil_images, max_req_input_len=-1):
+        """play the role of format_messages_v2 and get_images_info in the last version"""
+        tokenized_data = []
+        masked_tokenized_data = []  # labels
+        images_list = []
+        images_seq_mask = []
+        images_spatial_crop = []
+
+        image_index = 0
+        image_token_cnt = messages.count(self.image_token)
+        (
+            input_ids,
+            images,
+            images_crop,
+            seq_mask,
+            spatial_crop,
+            num_image_tokens,
+            image_shapes,
+        ) = self.tokenize_with_images(
+            messages,
+            pil_images[image_index : image_index + image_token_cnt],
+            bos=True,
+            eos=True,
+            cropping=len(pil_images) <= 2,
+        )
+
+        image_index = image_token_cnt
+        images_list += images
+        images_seq_mask += seq_mask
+        images_spatial_crop = spatial_crop
+
+        return (
+            input_ids,
+            masked_tokenized_data,
+            images_list,
+            images_seq_mask,
+            images_spatial_crop,
+            images_crop,
+        )
+
+    @property
+    def bos_id(self):
+        return self.tokenizer.bos_token_id
+
+    @property
+    def eos_id(self):
+        return self.tokenizer.eos_token_id
+
+    @property
+    def pad_id(self):
+        return self.tokenizer.pad_token_id
+
+    def encode(self, text: str, bos: bool = True, eos: bool = False):
+        t = self.tokenizer.encode(text, add_special_tokens=False)
+
+        if bos:
+            t = [self.bos_id] + t
+        if eos:
+            t = t + [self.eos_id]
+
+        return t
+
+    def decode(self, t: List[int], **kwargs) -> str:
+        return self.tokenizer.decode(t, **kwargs)
+
+    def process_one(
+        self,
+        prompt: str = None,
+        conversations: List[Dict[str, str]] = None,
+        images: List[Image.Image] = None,
+        apply_sft_format: bool = False,
+        inference_mode: bool = True,
+        system_prompt: str = "",
+        max_req_input_len: int = -1,
+        cropping: bool = True,
+        **kwargs,
+    ):
+        """
+
+        Args:
+            prompt (str): the formatted prompt;
+            conversations (List[Dict]): conversations with a list of messages;
+            images (List[ImageType]): the list of images;
+            apply_sft_format (bool): if prompt is not None, then apply the SFT format to prompt;
+                if conversations is not None, then it will always apply the SFT format to conversations;
+            inference_mode (bool): if True, then remove the last eos token;
+            system_prompt (str): the system prompt;
+            **kwargs:
+
+        Returns:
+            outputs (BaseProcessorOutput): the output of the processor,
+                - input_ids (torch.LongTensor): [N + image tokens]
+                - target_ids (torch.LongTensor): [N + image tokens]
+                - images (torch.FloatTensor): [n_images, 3, H, W]
+                - image_id (int): the id of the image token
+                - num_image_tokens (List[int]): the number of image tokens
+        """
+
+        prompt = conversations or prompt
+        (
+            input_ids,
+            masked_tokenized_str,
+            images_list,
+            images_seq_mask,
+            images_spatial_crop,
+            images_crop,
+        ) = self.format_messages_v2(prompt, images, max_req_input_len)
+
+        target_ids = torch.LongTensor(masked_tokenized_str)
+
+        has_images = len(images_list) > 0
+        has_local_crops = False
+        if len(images_spatial_crop) > 0:
+            has_local_crops = any(
+                crop[0] > 1 or crop[1] > 1 for crop in images_spatial_crop
+            )
+
+        if len(images_list) == 0:
+            images = torch.zeros((1, 3, self.image_size, self.image_size))
+        else:
+            images = torch.stack(images_list, dim=0)
+
+        images_spatial_crop = torch.stack(
+            [images_spatial_crop], dim=0
+        )  # stack the tensor to make it a batch of 1
+
+        prepare = VLChatProcessorOutput(
+            input_ids=input_ids,
+            target_ids=target_ids,
+            images_crop=images_crop,
+            pixel_values=images,
+            images_seq_mask=images_seq_mask,
+            images_spatial_crop=images_spatial_crop,
+        )
+        prepare.has_images = has_images
+        prepare.has_local_crops = has_local_crops
+
+        return prepare
+
+    def __call__(
+        self,
+        *,
+        prompt: str = None,
+        conversations: List[Dict[str, str]] = None,
+        images: List[Image.Image] = None,
+        apply_sft_format: bool = False,
+        inference_mode: bool = True,
+        system_prompt: str = "",
+        max_req_input_len: int = -1,
+        text: list[str] = None,
+        **kwargs,
+    ):
+        assert text is None or isinstance(text, list)
+        if text is not None:
+            text = text[0]
+        prepare = self.process_one(
+            prompt=prompt or text,
+            conversations=conversations,
+            images=images,
+            apply_sft_format=apply_sft_format,
+            inference_mode=inference_mode,
+            system_prompt=system_prompt,
+            max_req_input_len=max_req_input_len,
+        )
+
+        return prepare
+
+    def find_all_indices(self, messages, target_value):
+        indices = []
+        for index, item in enumerate(messages):
+            if item == target_value:
+                indices.append(index)
+        return indices
+
+    def tokenize_with_images(
+        self,
+        conversation: str,
+        images: List[Image.Image],
+        bos: bool = True,
+        eos: bool = True,
+        cropping: bool = True,
+    ):
+        """Tokenize text with <image> tags."""
+
+        conversation = conversation
+        assert conversation.count(self.image_token) == len(images)
+        text_splits = conversation.split(self.image_token)
+        images_list, images_crop_list, images_seq_mask, images_spatial_crop = (
+            [],
+            [],
+            [],
+            [],
+        )
+        image_shapes = []
+        num_image_tokens = []
+        tokenized_str = []
+        for text_sep, image in zip(text_splits, images):
+            """encode text_sep"""
+            tokenized_sep = self.encode(text_sep, bos=False, eos=False)
+
+            tokenized_str += tokenized_sep
+            images_seq_mask += [False] * len(tokenized_sep)
+
+            image_shapes.append(image.size)
+
+            if image.size[0] <= 640 and image.size[1] <= 640:
+                crop_ratio = [1, 1]
+            else:
+                if cropping:
+                    images_crop_raw, crop_ratio = dynamic_preprocess(
+                        image, image_size=IMAGE_SIZE
+                    )
+                else:
+                    crop_ratio = [1, 1]
+
+            """process the global view"""
+            if self.image_size <= 640 and not cropping:
+                image = image.resize((self.image_size, self.image_size))
+
+            global_view = ImageOps.pad(
+                image,
+                (self.base_size, self.base_size),
+                color=tuple(int(x * 255) for x in self.image_transform.mean),
+            )
+            images_list.append(self.image_transform(global_view))
+
+            num_width_tiles, num_height_tiles = crop_ratio
+            images_spatial_crop.append([num_width_tiles, num_height_tiles])
+
+            if num_width_tiles > 1 or num_height_tiles > 1:
+                for i in range(len(images_crop_raw)):
+                    images_crop_list.append(self.image_transform(images_crop_raw[i]))
+
+            """add image tokens"""
+            num_queries = math.ceil(
+                (self.image_size // self.patch_size) / self.downsample_ratio
+            )
+            num_queries_base = math.ceil(
+                (self.base_size // self.patch_size) / self.downsample_ratio
+            )
+
+            if self.ocr2_mode:
+                tokenized_image = []
+                if num_width_tiles > 1 or num_height_tiles > 1:
+                    tokenized_image += [self.image_token_id] * (
+                        num_queries * num_width_tiles * num_queries * num_height_tiles
+                    )
+                tokenized_image += [self.image_token_id] * (
+                    num_queries_base * num_queries_base
+                )
+                # One extra token for the view separator.
+                tokenized_image += [self.image_token_id]
+            else:
+                tokenized_image = (
+                    [self.image_token_id] * num_queries_base + [self.image_token_id]
+                ) * num_queries_base
+                tokenized_image += [self.image_token_id]
+                if num_width_tiles > 1 or num_height_tiles > 1:
+                    tokenized_image += (
+                        [self.image_token_id] * (num_queries * num_width_tiles)
+                        + [self.image_token_id]
+                    ) * (num_queries * num_height_tiles)
+            tokenized_str += tokenized_image
+
+            images_seq_mask += [True] * len(tokenized_image)
+            num_image_tokens.append(len(tokenized_image))
+
+        """process the last text split"""
+        tokenized_sep = self.encode(text_splits[-1], bos=False, eos=False)
+
+        tokenized_str += tokenized_sep
+        images_seq_mask += [False] * len(tokenized_sep)
+
+        """add the bos and eos tokens"""
+        if bos:
+            tokenized_str = [self.bos_id] + tokenized_str
+            images_seq_mask = [False] + images_seq_mask
+        if eos:
+            tokenized_str = tokenized_str + [self.eos_id]
+            images_seq_mask = images_seq_mask + [False]
+
+        assert len(tokenized_str) == len(
+            images_seq_mask
+        ), f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
+
+        masked_tokenized_str = []
+        for token_index in tokenized_str:
+            if token_index != self.image_token_id:
+                masked_tokenized_str.append(token_index)
+            else:
+                masked_tokenized_str.append(self.ignore_id)
+
+        assert (
+            len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str)
+        ), (
+            f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
+            f"imags_seq_mask's length {len(images_seq_mask)}, are not equal"
+        )
+        input_ids = torch.LongTensor(tokenized_str)
+        target_ids = torch.LongTensor(masked_tokenized_str)
+        images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)
+
+        # set input_ids < 0 | input_ids == self.image_token_id as ignore_id
+        target_ids[(input_ids < 0) | (input_ids == self.image_token_id)] = (
+            self.ignore_id
+        )
+        input_ids[input_ids < 0] = self.pad_id
+
+        inference_mode = True
+
+        if inference_mode:
+            # Remove the ending eos token
+            assert input_ids[-1] == self.eos_id
+            input_ids = input_ids[:-1]
+            target_ids = target_ids[:-1]
+            images_seq_mask = images_seq_mask[:-1]
+
+        if len(images_list) == 0:
+            pixel_values = torch.zeros((1, 3, self.base_size, self.base_size))
+            images_spatial_crop = torch.zeros((1, 1), dtype=torch.long)
+            images_crop = torch.zeros(
+                (1, 3, self.image_size, self.image_size)
+            ).unsqueeze(0)
+        else:
+            pixel_values = torch.stack(images_list, dim=0)
+            images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long)
+            if images_crop_list:
+                images_crop = torch.stack(images_crop_list, dim=0).unsqueeze(0)
+            else:
+                images_crop = torch.zeros(
+                    (1, 3, self.image_size, self.image_size)
+                ).unsqueeze(0)
+
+        input_ids = input_ids.unsqueeze(0)
+        return (
+            input_ids,
+            pixel_values,
+            images_crop,
+            images_seq_mask,
+            images_spatial_crop,
+            num_image_tokens,
+            image_shapes,
+        )
+
+
+class VisionEncoderConfig(PretrainedConfig):
+    model_type: str = "vision"
+
+    model_name: str = "vit_so400m_patch14_siglip_384.webli"
+    image_size: int = 384
+    patch_size: int = 16
+    width: int = 1024
+    layers: int = 24
+    heads: int = 16
+    mlp_ratio: int = 4
+    global_pool: str = "map"
+    ignore_head: bool = True
+    class_token: bool = False
+    num_classes: int = 0
+    use_checkpoint: bool = False
+    weight_init: str = "skip"
+    deterministic: bool = False
+    num_recomputing_layers: int = 0
+
+    def __init__(
+        self,
+        model_name: str = "vit_so400m_patch14_siglip_384.webli",
+        image_size: int = 384,
+        patch_size: int = 16,
+        width: int = 1024,
+        layers: int = 24,
+        heads: int = 16,
+        mlp_ratio: int = 4,
+        global_pool: str = "map",
+        ignore_head: bool = True,
+        class_token: bool = False,
+        num_classes: int = 0,
+        use_checkpoint: bool = False,
+        **kwargs,
+    ):
+        self.model_name = model_name
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.width = width
+        self.layers = layers
+        self.heads = heads
+        self.mlp_ratio = mlp_ratio
+        self.global_pool = global_pool
+        self.ignore_head = ignore_head
+        self.class_token = class_token
+        self.num_classes = num_classes
+        self.use_checkpoint = use_checkpoint
+
+        super().__init__(**kwargs)
+
+
+class MlpProjectorConfig(PretrainedConfig):
+    model_type = "mlp_projector"
+    projector_type: str = "downsample_mlp_gelu"
+    input_dim: int = 1152
+    n_embed: int = 2048
+    depth: int = 2
+    mlp_ratio: int = 1
+    downsample_ratio: int = 2
+    token_pooling: bool = False
+
+    def __init__(
+        self,
+        projector_type: str = "downsample_mlp_gelu",
+        input_dim: int = 1152,
+        n_embed: int = 2048,
+        depth: int = 2,
+        mlp_ratio: int = 1,
+        downsample_ratio: int = 2,
+        **kwargs,
+    ):
+        self.projector_type = projector_type
+        self.input_dim = input_dim
+        self.n_embed = n_embed
+        self.depth = depth
+        self.mlp_ratio = mlp_ratio
+        self.downsample_ratio = downsample_ratio
+
+        super().__init__(**kwargs)
+
+
+class DeepseekV2Config(PretrainedConfig):
+    model_type = "deepseek_v2"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=102400,
+        hidden_size=4096,
+        intermediate_size=11008,
+        moe_intermediate_size=1407,
+        num_hidden_layers=30,
+        num_attention_heads=32,
+        num_key_value_heads=32,
+        n_shared_experts=None,
+        n_routed_experts=None,
+        ep_size=1,
+        routed_scaling_factor=1.0,
+        kv_lora_rank=512,
+        q_lora_rank=1536,
+        qk_rope_head_dim=64,
+        v_head_dim=128,
+        qk_nope_head_dim=128,
+        topk_method="gready",
+        n_group=None,
+        topk_group=None,
+        num_experts_per_tok=None,
+        moe_layer_freq=1,
+        first_k_dense_replace=0,
+        norm_topk_prob=False,
+        scoring_func="softmax",
+        aux_loss_alpha=0.001,
+        seq_aux=True,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=100000,
+        eos_token_id=100001,
+        pretraining_tp=1,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        use_mla=True,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.moe_intermediate_size = moe_intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.n_shared_experts = n_shared_experts
+        self.n_routed_experts = n_routed_experts
+        self.ep_size = ep_size
+        self.routed_scaling_factor = routed_scaling_factor
+        self.kv_lora_rank = kv_lora_rank
+        self.q_lora_rank = q_lora_rank
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.topk_method = topk_method
+        self.n_group = n_group
+        self.topk_group = topk_group
+        self.num_experts_per_tok = num_experts_per_tok
+        self.moe_layer_freq = moe_layer_freq
+        self.first_k_dense_replace = first_k_dense_replace
+        self.norm_topk_prob = norm_topk_prob
+        self.scoring_func = scoring_func
+        self.aux_loss_alpha = aux_loss_alpha
+        self.seq_aux = seq_aux
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = float(rms_norm_eps)
+        self.pretraining_tp = pretraining_tp
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.use_mla = use_mla
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+@register_customized_processor(processor_class=DeepseekOCRProcessor)
+class DeepseekVLV2Config(PretrainedConfig):
+    # model_type = "deepseek_vl_v2"
+    model_type = "deepseek-ocr"
+    vision_config: VisionEncoderConfig
+    projector_config: MlpProjectorConfig
+
+    tile_tag: str = "2D"
+    global_view_pos: str = "head"
+    candidate_resolutions: tuple[tuple[int, int]] = ((384, 384),)
+    customized_processor_type: type[Any] = DeepseekOCRProcessor
+
+    def __init__(
+        self,
+        tile_tag: str = "tile_tag",
+        global_view_pos: str = "head",
+        candidate_resolutions: tuple[tuple[int, int]] = ((384, 384),),
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        vision_config = kwargs.get("vision_config", {})
+        self.vision_config = VisionEncoderConfig(**vision_config)
+
+        projector_config = kwargs.get("projector_config", {})
+        self.projector_config = MlpProjectorConfig(**projector_config)
+
+        language_config = kwargs.get("language_config", {})
+        self.text_config = DeepseekV2Config(**language_config)
+
+        self.tile_tag = tile_tag
+        self.global_view_pos = global_view_pos
+        self.candidate_resolutions = candidate_resolutions
+        self.vocab_size = self.text_config.vocab_size
+        self.hidden_size = self.text_config.hidden_size
+
+
+AutoProcessor.register(DeepseekVLV2Config, DeepseekOCRProcessor)
diff --git a/sglang/python/sglang/srt/configs/deepseekvl2.py b/sglang/python/sglang/srt/configs/deepseekvl2.py
new file mode 100644
index 0000000000000000000000000000000000000000..9621f058bf631756940e1e99d9cee39f965d83d7
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/deepseekvl2.py
@@ -0,0 +1,687 @@
+import math
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple
+
+import torch
+from PIL import Image, ImageOps
+from transformers import (
+    AutoProcessor,
+    LlamaTokenizerFast,
+    PretrainedConfig,
+    ProcessorMixin,
+)
+
+
+def select_best_resolution(image_size, candidate_resolutions):
+    # used for cropping
+    original_width, original_height = image_size
+    best_fit = None
+    max_effective_resolution = 0
+    min_wasted_resolution = float("inf")
+
+    for width, height in candidate_resolutions:
+        scale = min(width / original_width, height / original_height)
+        downscaled_width, downscaled_height = int(original_width * scale), int(
+            original_height * scale
+        )
+        effective_resolution = min(
+            downscaled_width * downscaled_height, original_width * original_height
+        )
+        wasted_resolution = (width * height) - effective_resolution
+
+        if effective_resolution > max_effective_resolution or (
+            effective_resolution == max_effective_resolution
+            and wasted_resolution < min_wasted_resolution
+        ):
+            max_effective_resolution = effective_resolution
+            min_wasted_resolution = wasted_resolution
+            best_fit = (width, height)
+
+    return best_fit
+
+
+class DictOutput(object):
+    def items(self):
+        return self.__dict__.items()
+
+    def keys(self):
+        return self.__dict__.keys()
+
+    def __getitem__(self, item):
+        return self.__dict__[item]
+
+    def __contains__(self, key):
+        return key in self.__dict__
+
+    def __setitem__(self, key, value):
+        self.__dict__[key] = value
+
+
+@dataclass
+class VLChatProcessorOutput(DictOutput):
+    input_ids: torch.LongTensor
+    target_ids: torch.LongTensor
+    pixel_values: (
+        torch.Tensor
+    )  # rename from "images" to "pixel_values" for compatibility
+    images_seq_mask: torch.BoolTensor
+    images_spatial_crop: torch.LongTensor
+
+    def __len__(self):
+        return len(self.input_ids)
+
+
+class ImageTransform(object):
+    def __init__(
+        self,
+        mean: Optional[Tuple[float, float, float]] = (0.5, 0.5, 0.5),
+        std: Optional[Tuple[float, float, float]] = (0.5, 0.5, 0.5),
+        normalize: bool = True,
+    ):
+        self.mean = mean
+        self.std = std
+        self.normalize = normalize
+
+        # only load torchvision.transforms when needed
+        try:
+            import torchvision.transforms as T
+
+            # FIXME: add version check for gguf
+        except ImportError as err:
+            raise ImportError(
+                "Please install torchvision via `pip install torchvision` to use Deepseek-VL2."
+            ) from err
+
+        transform_pipelines = [T.ToTensor()]
+
+        if normalize:
+            transform_pipelines.append(T.Normalize(mean, std))
+
+        self.transform = T.Compose(transform_pipelines)
+
+    def __call__(self, pil_img: Image.Image):
+        x = self.transform(pil_img)
+        return x
+
+
+class DeepseekVLV2Processor(ProcessorMixin):
+    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
+    attributes = ["tokenizer"]
+
+    def __init__(
+        self,
+        tokenizer: LlamaTokenizerFast,
+        candidate_resolutions: Tuple[Tuple[int, int]],
+        patch_size: int,
+        downsample_ratio: int,
+        image_mean: Tuple[float, float, float] = (0.5, 0.5, 0.5),
+        image_std: Tuple[float, float, float] = (0.5, 0.5, 0.5),
+        normalize: bool = True,
+        image_token: str = "<image>",
+        pad_token: str = "<｜▁pad▁｜>",
+        add_special_token: bool = False,
+        sft_format: str = "deepseek",
+        mask_prompt: bool = True,
+        ignore_id: int = -100,
+        **kwargs,
+    ):
+
+        self.candidate_resolutions = candidate_resolutions
+        self.image_size = candidate_resolutions[0][0]
+        self.patch_size = patch_size
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.normalize = normalize
+        self.downsample_ratio = downsample_ratio
+
+        self.image_transform = ImageTransform(
+            mean=image_mean, std=image_std, normalize=normalize
+        )
+        self.tokenizer = tokenizer
+        # must set this，padding side with make a difference in batch inference
+        self.tokenizer.padding_side = "left"
+
+        # add the pad_token as special token to use 'tokenizer.pad_token' and 'tokenizer.pad_token_id'
+        if tokenizer.pad_token is None:
+            self.tokenizer.add_special_tokens({"pad_token": pad_token})
+
+        # add image token
+        image_token_id = self.tokenizer.vocab.get(image_token)
+        if image_token_id is None:
+            special_tokens = [image_token]
+            special_tokens_dict = {"additional_special_tokens": special_tokens}
+            self.tokenizer.add_special_tokens(special_tokens_dict)
+        self.image_token_id = self.tokenizer.vocab.get(image_token)
+
+        # add five special tokens for grounding-related tasks
+        # <|ref|>, <|/ref|>, <|det|>, <|/det|>, <|grounding|>
+        special_tokens = ["<|ref|>", "<|/ref|>", "<|det|>", "<|/det|>", "<|grounding|>"]
+        special_tokens_dict = {"additional_special_tokens": special_tokens}
+        self.tokenizer.add_special_tokens(special_tokens_dict)
+
+        # add special tokens for SFT data
+        special_tokens = ["<|User|>", "<|Assistant|>"]
+        special_tokens_dict = {"additional_special_tokens": special_tokens}
+        self.tokenizer.add_special_tokens(special_tokens_dict)
+
+        self.image_token = image_token
+        self.pad_token = pad_token
+        self.add_special_token = add_special_token
+        self.sft_format = sft_format
+        self.mask_prompt = mask_prompt
+        self.ignore_id = ignore_id
+
+        super().__init__(
+            tokenizer,
+            **kwargs,
+        )
+
+    def format_messages_v2(self, messages, pil_images, max_req_input_len=-1):
+        """play the role of format_messages_v2 and get_images_info in the last version"""
+        tokenized_data = []
+        masked_tokenized_data = []  # labels
+        images_list = []
+        images_seq_mask = []
+        images_spatial_crop = []
+
+        image_index = 0
+        image_token_cnt = messages.count(self.image_token)
+        tokenized_str, images, seq_mask, spatial_crop = self.tokenize_with_images(
+            messages,
+            pil_images[image_index : image_index + image_token_cnt],
+            bos=True,
+            eos=True,
+            cropping=len(pil_images) <= 2,
+            max_req_input_len=max_req_input_len,
+        )
+
+        image_index = image_token_cnt
+        tokenized_data += tokenized_str
+        if self.mask_prompt:
+            masked_tokenized_data += [self.ignore_id] * len(tokenized_str)
+        else:
+            masked_tokenized_data += tokenized_str
+        images_list += images
+        images_seq_mask += seq_mask
+        images_spatial_crop += spatial_crop
+
+        assert len(tokenized_data) == len(
+            images_seq_mask
+        ), f"format_messages_v2: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
+
+        return (
+            tokenized_data,
+            masked_tokenized_data,
+            images_list,
+            images_seq_mask,
+            images_spatial_crop,
+        )
+
+    @property
+    def bos_id(self):
+        return self.tokenizer.bos_token_id
+
+    @property
+    def eos_id(self):
+        return self.tokenizer.eos_token_id
+
+    @property
+    def pad_id(self):
+        return self.tokenizer.pad_token_id
+
+    def encode(self, text: str, bos: bool = True, eos: bool = False):
+        t = self.tokenizer.encode(text, add_special_tokens=False)
+
+        if bos:
+            t = [self.bos_id] + t
+        if eos:
+            t = t + [self.eos_id]
+
+        return t
+
+    def decode(self, t: List[int], **kwargs) -> str:
+        return self.tokenizer.decode(t, **kwargs)
+
+    def process_one(
+        self,
+        prompt: str = None,
+        conversations: List[Dict[str, str]] = None,
+        images: List[Image.Image] = None,
+        apply_sft_format: bool = False,
+        inference_mode: bool = True,
+        system_prompt: str = "",
+        max_req_input_len: int = -1,
+        **kwargs,
+    ):
+        """
+
+        Args:
+            prompt (str): the formatted prompt;
+            conversations (List[Dict]): conversations with a list of messages;
+            images (List[ImageType]): the list of images;
+            apply_sft_format (bool): if prompt is not None, then apply the SFT format to prompt;
+                if conversations is not None, then it will always apply the SFT format to conversations;
+            inference_mode (bool): if True, then remove the last eos token;
+            system_prompt (str): the system prompt;
+            **kwargs:
+
+        Returns:
+            outputs (BaseProcessorOutput): the output of the processor,
+                - input_ids (torch.LongTensor): [N + image tokens]
+                - target_ids (torch.LongTensor): [N + image tokens]
+                - images (torch.FloatTensor): [n_images, 3, H, W]
+                - image_id (int): the id of the image token
+                - num_image_tokens (List[int]): the number of image tokens
+        """
+
+        assert (
+            prompt is None or conversations is None
+        ), "prompt and conversations cannot be used at the same time."
+
+        (
+            tokenized_str,
+            masked_tokenized_str,
+            images_list,
+            images_seq_mask,
+            images_spatial_crop,
+        ) = self.format_messages_v2(conversations, images, max_req_input_len)
+
+        assert (
+            len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str)
+        ), (
+            f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
+            f"imags_seq_mask's length {len(images_seq_mask)}, are not equal"
+        )
+
+        input_ids = torch.LongTensor(tokenized_str)
+        target_ids = torch.LongTensor(masked_tokenized_str)
+        images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)
+
+        # set input_ids < 0 | input_ids == self.image_token_id as ignore_id
+        target_ids[(input_ids < 0) | (input_ids == self.image_token_id)] = (
+            self.ignore_id
+        )
+        input_ids[input_ids < 0] = self.pad_id
+
+        if inference_mode:
+            assert input_ids[-1] == self.eos_id
+            input_ids = input_ids[:-1]
+            target_ids = target_ids[:-1]
+            images_seq_mask = images_seq_mask[:-1]
+
+        if len(images_list) == 0:
+            images = torch.zeros((1, 3, self.image_size, self.image_size))
+            images_spatial_crop = torch.zeros((1, 2), dtype=torch.long)
+        else:
+            images = torch.stack(images_list, dim=0)
+            images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long)
+
+        images_spatial_crop = torch.stack(
+            [images_spatial_crop], dim=0
+        )  # stack the tensor to make it a batch of 1
+
+        prepare = VLChatProcessorOutput(
+            input_ids=input_ids,
+            target_ids=target_ids,
+            pixel_values=images,
+            images_seq_mask=images_seq_mask,
+            images_spatial_crop=images_spatial_crop,
+        )
+
+        return prepare
+
+    def __call__(
+        self,
+        *,
+        prompt: str = None,
+        conversations: List[Dict[str, str]] = None,
+        images: List[Image.Image] = None,
+        apply_sft_format: bool = False,
+        inference_mode: bool = True,
+        system_prompt: str = "",
+        max_req_input_len: int = -1,
+        **kwargs,
+    ):
+        prepare = self.process_one(
+            prompt=prompt,
+            conversations=conversations,
+            images=images,
+            apply_sft_format=apply_sft_format,
+            inference_mode=inference_mode,
+            system_prompt=system_prompt,
+            max_req_input_len=max_req_input_len,
+        )
+
+        return prepare
+
+    def find_all_indices(self, messages, target_value):
+        indices = []
+        for index, item in enumerate(messages):
+            if item == target_value:
+                indices.append(index)
+        return indices
+
+    def tokenize_with_images(
+        self,
+        conversation: str,
+        images: List[Image.Image],
+        bos: bool = True,
+        eos: bool = True,
+        cropping: bool = True,
+        max_req_input_len: int = -1,
+    ):
+        """Tokenize text with <image> tags."""
+        images_list, images_seq_mask, images_spatial_crop = [], [], []
+        text_splits = conversation.split(self.image_token)
+        tokenized_str = []
+        for text_sep, image in zip(text_splits, images):
+            """encode text_sep"""
+            tokenized_sep = self.encode(text_sep, bos=False, eos=False)
+            tokenized_str += tokenized_sep
+            images_seq_mask += [False] * len(tokenized_sep)
+
+            """select best resolution for anyres"""
+            if cropping:
+                best_width, best_height = select_best_resolution(
+                    image.size, self.candidate_resolutions
+                )
+            else:
+                best_width, best_height = self.image_size, self.image_size
+            # print(image.size, (best_width, best_height)) # check the select_best_resolutions func
+
+            """process the global view"""
+            global_view = ImageOps.pad(
+                image,
+                (self.image_size, self.image_size),
+                color=tuple(int(x * 255) for x in self.image_transform.mean),
+            )
+            images_list.append(self.image_transform(global_view))
+
+            """process the local views"""
+            local_view = ImageOps.pad(
+                image,
+                (best_width, best_height),
+                color=tuple(int(x * 255) for x in self.image_transform.mean),
+            )
+            for i in range(0, best_height, self.image_size):
+                for j in range(0, best_width, self.image_size):
+                    images_list.append(
+                        self.image_transform(
+                            local_view.crop(
+                                (j, i, j + self.image_size, i + self.image_size)
+                            )
+                        )
+                    )
+
+            """record height / width crop num"""
+            num_width_tiles, num_height_tiles = (
+                best_width // self.image_size,
+                best_height // self.image_size,
+            )
+            images_spatial_crop.append([num_width_tiles, num_height_tiles])
+
+            """add image tokens"""
+            h = w = math.ceil(
+                (self.image_size // self.patch_size) / self.downsample_ratio
+            )
+            # global views tokens h * (w + 1), 1 is for line separator
+            tokenized_image = [self.image_token_id] * h * (w + 1)
+            # add a separator between global and local views
+            tokenized_image += [self.image_token_id]
+            # local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
+            tokenized_image += (
+                [self.image_token_id]
+                * (num_height_tiles * h)
+                * (num_width_tiles * w + 1)
+            )
+
+            tokenized_str += tokenized_image
+            images_seq_mask += [True] * len(tokenized_image)
+            # print(width_crop_num, height_crop_num, len(tokenized_image)) # test the correctness of the number of image-related tokens
+
+        """process the last text split"""
+        tokenized_sep = self.encode(text_splits[-1], bos=False, eos=False)
+        # deal with video, limit with request len
+        if max_req_input_len > -1:
+            if max_req_input_len < len(tokenized_sep) + len(tokenized_str) - 1:
+                rest = max_req_input_len - len(tokenized_sep) - 1 - 1024
+                tokenized_str = tokenized_str[:rest]
+                images_seq_mask = images_seq_mask[:rest]
+        tokenized_str += tokenized_sep
+        images_seq_mask += [False] * len(tokenized_sep)
+
+        """add the bos and eos tokens"""
+        if bos:
+            tokenized_str = [self.bos_id] + tokenized_str
+            images_seq_mask = [False] + images_seq_mask
+        if eos:
+            tokenized_str = tokenized_str + [self.eos_id]
+            images_seq_mask = images_seq_mask + [False]
+
+        assert len(tokenized_str) == len(
+            images_seq_mask
+        ), f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
+
+        return tokenized_str, images_list, images_seq_mask, images_spatial_crop
+
+
+class DeepseekVL2VisionEncoderConfig(PretrainedConfig):
+    model_type: str = "vision"
+
+    model_name: str = "siglip_large_patch16_384"
+    image_size: int = 384
+    patch_size: int = 16
+    width: int = 1024
+    layers: int = 24
+    heads: int = 16
+    mlp_ratio: int = 4
+    global_pool: str = "map"
+    ignore_head: bool = True
+    class_token: bool = False
+    num_classes: int = 0
+    use_checkpoint: bool = False
+    weight_init: str = "skip"
+    deterministic: bool = False
+    num_recomputing_layers: int = 0
+
+    def __init__(
+        self,
+        model_name: str = "siglip_large_patch16_384",
+        image_size: int = 384,
+        patch_size: int = 16,
+        width: int = 1024,
+        layers: int = 24,
+        heads: int = 16,
+        mlp_ratio: int = 4,
+        global_pool: str = "map",
+        ignore_head: bool = True,
+        class_token: bool = False,
+        num_classes: int = 0,
+        use_checkpoint: bool = False,
+        **kwargs,
+    ):
+        self.model_name = model_name
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.width = width
+        self.layers = layers
+        self.heads = heads
+        self.mlp_ratio = mlp_ratio
+        self.global_pool = global_pool
+        self.ignore_head = ignore_head
+        self.class_token = class_token
+        self.num_classes = num_classes
+        self.use_checkpoint = use_checkpoint
+
+        super().__init__(**kwargs)
+
+
+class DeepseekVL2MlpProjectorConfig(PretrainedConfig):
+    model_type = "mlp_projector"
+    projector_type: str = "downsample_mlp_gelu"
+    input_dim: int = 1152
+    n_embed: int = 2048
+    depth: int = 2
+    mlp_ratio: int = 1
+    downsample_ratio: int = 2
+    token_pooling: bool = False
+
+    def __init__(
+        self,
+        projector_type: str = "downsample_mlp_gelu",
+        input_dim: int = 1152,
+        n_embed: int = 2048,
+        depth: int = 2,
+        mlp_ratio: int = 1,
+        downsample_ratio: int = 2,
+        **kwargs,
+    ):
+        self.projector_type = projector_type
+        self.input_dim = input_dim
+        self.n_embed = n_embed
+        self.depth = depth
+        self.mlp_ratio = mlp_ratio
+        self.downsample_ratio = downsample_ratio
+
+        super().__init__(**kwargs)
+
+
+class DeepseekV2Config(PretrainedConfig):
+
+    model_type = "deepseek_v2"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=102400,
+        hidden_size=4096,
+        intermediate_size=11008,
+        moe_intermediate_size=1407,
+        num_hidden_layers=30,
+        num_attention_heads=32,
+        num_key_value_heads=32,
+        n_shared_experts=None,
+        n_routed_experts=None,
+        ep_size=1,
+        routed_scaling_factor=1.0,
+        kv_lora_rank=512,
+        q_lora_rank=1536,
+        qk_rope_head_dim=64,
+        v_head_dim=128,
+        qk_nope_head_dim=128,
+        topk_method="gready",
+        n_group=None,
+        topk_group=None,
+        num_experts_per_tok=None,
+        moe_layer_freq=1,
+        first_k_dense_replace=0,
+        norm_topk_prob=False,
+        scoring_func="softmax",
+        aux_loss_alpha=0.001,
+        seq_aux=True,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=100000,
+        eos_token_id=100001,
+        pretraining_tp=1,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        use_mla=True,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.moe_intermediate_size = moe_intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.n_shared_experts = n_shared_experts
+        self.n_routed_experts = n_routed_experts
+        self.ep_size = ep_size
+        self.routed_scaling_factor = routed_scaling_factor
+        self.kv_lora_rank = kv_lora_rank
+        self.q_lora_rank = q_lora_rank
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.topk_method = topk_method
+        self.n_group = n_group
+        self.topk_group = topk_group
+        self.num_experts_per_tok = num_experts_per_tok
+        self.moe_layer_freq = moe_layer_freq
+        self.first_k_dense_replace = first_k_dense_replace
+        self.norm_topk_prob = norm_topk_prob
+        self.scoring_func = scoring_func
+        self.aux_loss_alpha = aux_loss_alpha
+        self.seq_aux = seq_aux
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = float(rms_norm_eps)
+        self.pretraining_tp = pretraining_tp
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.use_mla = use_mla
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+class DeepseekVL2Config(PretrainedConfig):
+    model_type = "deepseek_vl_v2"
+    vision_config: DeepseekVL2VisionEncoderConfig
+    projector_config: DeepseekVL2MlpProjectorConfig
+    language_config: DeepseekV2Config
+
+    tile_tag: str = "2D"
+    global_view_pos: str = "head"
+    candidate_resolutions: Tuple[Tuple[int, int]] = ((384, 384),)
+
+    def __init__(
+        self,
+        tile_tag: str = "tile_tag",
+        global_view_pos: str = "head",
+        candidate_resolutions: Tuple[Tuple[int, int]] = ((384, 384),),
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        vision_config = kwargs.get("vision_config", {})
+        self.vision_config = DeepseekVL2VisionEncoderConfig(**vision_config)
+
+        projector_config = kwargs.get("projector_config", {})
+        self.projector_config = DeepseekVL2MlpProjectorConfig(**projector_config)
+
+        language_config = kwargs.get("language_config", {})
+        if isinstance(language_config, DeepseekV2Config):
+            self.language_config = language_config
+        else:
+            self.language_config = DeepseekV2Config(**language_config)
+
+        self.tile_tag = tile_tag
+        self.global_view_pos = global_view_pos
+        self.candidate_resolutions = candidate_resolutions
+        self.architectures = ["DeepseekVL2ForCausalLM"]
+
+
+AutoProcessor.register(DeepseekVL2Config, DeepseekVLV2Processor)
diff --git a/sglang/python/sglang/srt/configs/device_config.py b/sglang/python/sglang/srt/configs/device_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ddcfd10816e96785a88b219a390383292617aa8
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/device_config.py
@@ -0,0 +1,19 @@
+import logging
+from typing import Optional
+
+import torch
+
+logger = logging.getLogger(__name__)
+
+
+class DeviceConfig:
+    device: Optional[torch.device]
+    gpu_id: Optional[int]
+
+    def __init__(self, device: str = "cuda", gpu_id: int = -1) -> None:
+        if device in ["cuda", "xpu", "hpu", "cpu", "npu", "musa"]:
+            self.device_type = device
+        else:
+            raise RuntimeError(f"Not supported device type: {device}")
+        self.device = torch.device(self.device_type)
+        self.gpu_id = gpu_id
diff --git a/sglang/python/sglang/srt/configs/dots_ocr.py b/sglang/python/sglang/srt/configs/dots_ocr.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b0693b8e9cc35fca97e2f9b0bf1a384d607ea39
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/dots_ocr.py
@@ -0,0 +1,64 @@
+from typing import Optional
+
+from transformers import AutoProcessor, Qwen2_5_VLProcessor
+from transformers.image_processing_utils import BaseImageProcessor
+from transformers.models.qwen2 import Qwen2Config
+
+from sglang.srt.configs.dots_vlm import DotsVisionConfig
+
+
+class DotsOCRConfig(Qwen2Config):
+    model_type = "dots_ocr"
+
+    def __init__(
+        self,
+        image_token_id=151665,
+        video_token_id=151656,
+        vision_config: Optional[dict] = None,
+        *args,
+        **kwargs
+    ):
+        super().__init__(*args, **kwargs)
+        self.image_token_id = image_token_id
+        self.video_token_id = video_token_id
+        self.vision_config = DotsVisionConfig(**(vision_config or {}))
+
+    def save_pretrained(self, save_directory, **kwargs):
+        self._auto_class = None
+        super().save_pretrained(save_directory, **kwargs)
+
+
+class DummyVideoProcessor(BaseImageProcessor):
+    model_input_names = ["pixel_values"]
+
+    def __call__(self, *args, **kwargs):
+        return None
+
+
+class DotsVLProcessor(Qwen2_5_VLProcessor):
+    def __init__(
+        self,
+        image_processor=None,
+        tokenizer=None,
+        video_processor=None,
+        chat_template=None,
+        **kwargs
+    ):
+        if video_processor is None:
+            video_processor = DummyVideoProcessor()
+        super().__init__(
+            image_processor, tokenizer, video_processor, chat_template=chat_template
+        )
+        self.image_token = (
+            "<|imgpad|>"
+            if not hasattr(tokenizer, "image_token")
+            else tokenizer.image_token
+        )
+        self.image_token_id = (
+            tokenizer.image_token_id
+            if getattr(tokenizer, "image_token_id", None) is not None
+            else tokenizer.convert_tokens_to_ids(self.image_token)
+        )
+
+
+AutoProcessor.register(DotsOCRConfig, DotsVLProcessor)
diff --git a/sglang/python/sglang/srt/configs/dots_vlm.py b/sglang/python/sglang/srt/configs/dots_vlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc921582ccf8f6ece01902780aa5ed733a3cd9a8
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/dots_vlm.py
@@ -0,0 +1,134 @@
+from transformers import AutoProcessor, PretrainedConfig
+from transformers.processing_utils import ProcessingKwargs
+
+try:
+    from transformers import Qwen2_5_VLProcessor
+except ImportError:
+    raise ImportError(
+        "Qwen2_5_VLProcessor can not be found. Please upgrade your transformers version."
+    )
+
+from sglang.srt.configs.deepseekvl2 import DeepseekV2Config
+
+
+class DotsVisionConfig(PretrainedConfig):
+    model_type: str = "dots_vit"
+
+    def __init__(
+        self,
+        embed_dim: int = 1536,  # vision encoder embed size
+        hidden_size: int = 1536,  # after merger hidden size
+        intermediate_size: int = 4224,
+        num_hidden_layers: int = 42,
+        num_attention_heads: int = 12,
+        num_channels: int = 3,
+        patch_size: int = 14,
+        spatial_merge_size: int = 2,
+        temporal_patch_size: int = 1,
+        rms_norm_eps: float = 1e-5,
+        use_bias: bool = False,
+        attn_implementation="flash_attention_2",  # "eager","sdpa","flash_attention_2"
+        initializer_range=0.02,
+        init_merger_std=0.02,
+        is_causal=False,  # ve causal forward
+        post_norm=True,
+        gradient_checkpointing=False,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.embed_dim = embed_dim
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.spatial_merge_size = spatial_merge_size
+        self.temporal_patch_size = temporal_patch_size
+        self.rms_norm_eps = rms_norm_eps
+        self.use_bias = use_bias
+        self.attn_implementation = attn_implementation
+        self.initializer_range = initializer_range
+        self.init_merger_std = init_merger_std
+        self.is_causal = is_causal
+        self.post_norm = post_norm
+        self.gradient_checkpointing = gradient_checkpointing
+
+
+class DotsVLMConfig(PretrainedConfig):
+    model_type = "dots_vlm"
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        vision_config = kwargs.get("vision_config", {})
+        self.im_span_id = kwargs.get("image_token_id", 128815)
+        self.video_span_id = kwargs.get("video_token_id", 128836)
+        self.vision_config = DotsVisionConfig(**vision_config)
+        self.language_config = DeepseekV2Config(**kwargs)
+        self.architectures = ["DotsVLMForCausalLM"]
+
+
+class DotsVLMProcessorKwargs(ProcessingKwargs, total=False):
+    _defaults = {
+        "text_kwargs": {
+            "padding": False,
+        },
+    }
+
+
+class DotsVLMProcessor(Qwen2_5_VLProcessor):
+    r"""
+    Constructs a DotsVLM processor which derives from Qwen2_5_VLProcessor, but overrides the image and video token ids.
+    Besides, its tokenizer is a LlamaTokenizerFast instead of Qwen2TokenizerFast.
+    [`DotsVLMProcessor`] offers all the functionalities of [`DotsVisionConfig`] and [`LlamaTokenizerFast`]. See the
+    [`~DotsVLMProcessor.__call__`] and [`~DotsVLMProcessor.decode`] for more information.
+    Args:
+        image_processor ([`Qwen2VLImageProcessor`], *optional*):
+            The image processor is a required input.
+        tokenizer ([`LlamaTokenizerFast`], *optional*):
+            The tokenizer is a required input.
+        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
+            in a chat into a tokenizable string.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+
+    valid_kwargs = ["chat_template"]
+
+    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
+
+    def __init__(
+        self, image_processor=None, tokenizer=None, chat_template=None, **kwargs
+    ):
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
+        self.image_token = (
+            "<|imgpad|>"
+            if not hasattr(tokenizer, "image_token")
+            else tokenizer.image_token
+        )
+        self.video_token = (
+            "<|video_pad|>"
+            if not hasattr(tokenizer, "video_token")
+            else tokenizer.video_token
+        )
+        self.img_token = (
+            "<|img|>" if not hasattr(tokenizer, "img_token") else tokenizer.img_token
+        )
+        self.endofimg_token = (
+            "<|endofimg|>"
+            if not hasattr(tokenizer, "endofimg_token")
+            else tokenizer.endofimg_token
+        )
+        self.image_token_id = (
+            tokenizer.image_token_id
+            if getattr(tokenizer, "image_token_id", None)
+            else tokenizer.encode(self.image_token)[0]
+        )
+        self.video_token_id = (
+            tokenizer.video_token_id
+            if getattr(tokenizer, "video_token_id", None)
+            else tokenizer.encode(self.video_token)[0]
+        )
+
+
+AutoProcessor.register(DotsVLMConfig, DotsVLMProcessor)
diff --git a/sglang/python/sglang/srt/configs/exaone.py b/sglang/python/sglang/srt/configs/exaone.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5d91c45cf84c733d28528bc8721be6f8ad16bd4
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/exaone.py
@@ -0,0 +1,196 @@
+# coding=utf-8
+# Copyright 2024 The LG AI Research EXAONE Lab. All rights reserved.
+# Copyright 2024 The LG CNS AI Engineering Team.
+# Copyright 2023-2024 SGLang Team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""EXAONE model configuration"""
+
+from typing import Any, Dict
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+EXAONE_PRETRAINED_CONFIG_ARCHIVE_MAP: Dict[str, Any] = {}
+
+
+# ruff: noqa: E501
+class ExaoneConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a :class:`~transformers.ExaoneModel`. It is used to
+    instantiate a EXAONE model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the Exaone
+
+    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
+    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+
+
+    Args:
+        vocab_size (:obj:`int`, `optional`, defaults to 102400):
+            Vocabulary size of the EXAONE model. Defines the number of different tokens that can be represented by the
+            :obj:`inputs_ids` passed when calling :class:`~transformers.ExaoneModel`. Vocabulary size of the model.
+            Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of
+            :class:`~transformers.EXAONEModel`.
+        max_position_embeddings (:obj:`int`, `optional`, defaults to 2048):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        hidden_size (:obj:`int`, `optional`, defaults to 2048):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_layers (:obj:`int`, `optional`, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (:obj:`int`, `optional`, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (:obj:`int`, `optional`):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+            `num_attention_heads`.
+        intermediate_size (:obj:`int`, `optional`, defaults to `hidden_size * 4`):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        rope_theta (:obj:`float`, `optional`, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (:obj:`Dict`, `optional`):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (:obj:`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (:obj:`float`, `optional`):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (:obj:`int`, `optional`):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (:obj:`float`, `optional`):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (:obj:`float`, `optional`):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (:obj:`float`, `optional`):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (:obj:`List[float]`, `optional`):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (:obj:`List[float]`, `optional`):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (:obj:`float`, `optional`):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (:obj:`float`, `optional`):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        embed_dropout (:obj:`float`, `optional`, defaults to 0.0):
+            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if ``configs.is_decoder=True``.
+        bos_token_id (:obj:`int`, `optional`, defaults to 0):
+            Beginning of stream token id.
+        eos_token_id (:obj:`int`, `optional`, defaults to 2):
+            End of stream token id.
+        tie_word_embeddings (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to tie weight embeddings
+        gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            If True, use gradient checkpointing to save memory at the expense of slower backward pass.
+
+        Example::
+
+            >>> from transformers import EXAONEModel, ExaoneConfig
+
+            >>> # Initializing a EXAONE configuration
+            >>> configuration = ExaoneConfig()
+
+            >>> # Initializing a model from configuration
+            >>> model = EXAONEModel(configuration)
+
+            >>> # Accessing the model configuration
+            >>> configuration = model.configs
+    """
+
+    model_type = "exaone"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {"num_hidden_layers": "num_layers"}
+
+    def __init__(
+        self,
+        vocab_size=102400,
+        max_position_embeddings=2048,
+        hidden_size=2048,
+        num_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        intermediate_size=None,
+        activation_function="silu",
+        rope_theta=10000.0,
+        rope_scaling=None,
+        embed_dropout=0.0,
+        attention_dropout=0.0,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        use_cache=True,
+        bos_token_id=0,
+        eos_token_id=2,
+        tie_word_embeddings=True,
+        **kwargs
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_hidden_layers = num_layers
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        if intermediate_size:
+            self.intermediate_size = intermediate_size
+        else:
+            self.intermediate_size = hidden_size * 4
+        self.activation_function = activation_function
+        self.embed_dropout = embed_dropout
+        self.attention_dropout = attention_dropout
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+
+        super().__init__(
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs
+        )
diff --git a/sglang/python/sglang/srt/configs/falcon_h1.py b/sglang/python/sglang/srt/configs/falcon_h1.py
new file mode 100644
index 0000000000000000000000000000000000000000..3aa038fb891da2ca7964c63c7070fc22cdd14b47
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/falcon_h1.py
@@ -0,0 +1,315 @@
+# coding=utf-8
+# Copyright 2024 TII and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Falcon-H1 model configuration"""
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+from sglang.srt.configs.mamba_utils import (
+    Mamba2CacheParams,
+    Mamba2StateShape,
+    mamba2_state_dtype,
+)
+
+logger = logging.get_logger(__name__)
+
+
+class FalconH1Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`FalconH1Model`]. It is used to instantiate a
+    FalconH1Model model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with defaults taken from [ibm-fms/FalconH1-9.8b-2.2T-hf](https://huggingface.co/ibm-fms/FalconH1-9.8b-2.2T-hf).
+    The FalconH1Model is a hybrid [mamba2](https://github.com/state-spaces/mamba) architecture with SwiGLU.
+    The checkpoints are  jointly trained by IBM, Princeton, and UIUC.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 128000):
+            Vocabulary size of the FalconH1 model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`FalconH1Model`]
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the
+            model has a output word embedding layer.
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 14336):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `8`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        num_logits_to_keep (`int` or `None`, *optional*, defaults to 1):
+            Number of prompt logits to calculate during generation. If `None`, all logits will be calculated. If an
+            integer value, only last `num_logits_to_keep` logits will be calculated. Default is 1 because only the
+            logits of the last prompt token are needed for generation. For long sequences, the logits for the entire
+            sequence may use a lot of memory so, setting `num_logits_to_keep=1` will reduce memory footprint
+            significantly.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            The id of the padding token.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            The id of the "beginning-of-sequence" token.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the "end-of-sequence" token.
+        max_position_embeddings (`int`, *optional*, defaults to 8192):
+            Max cached sequence length for the model
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        mamba_d_ssm (`int`, *optional*, defaults to 1024):
+            The dimension of the SSM state space latents.
+        mamba_n_heads (`int`, *optional*, defaults to 128):
+            The number of mamba heads used in the v2 implementation.
+        mamba_d_head (`int`, *optional*, defaults to `"auto"`):
+            Head embedding dimension size
+        mamba_n_groups (`int`, *optional*, defaults to 1):
+            The number of the mamba groups used in the v2 implementation.
+        mamba_d_state (`int`, *optional*, defaults to 256):
+            The dimension the mamba state space latents
+        mamba_d_conv (`int`, *optional*, defaults to 4):
+            The size of the mamba convolution kernel
+        mamba_expand (`int`, *optional*, defaults to 2):
+            Expanding factor (relative to hidden_size) used to determine the mamba intermediate size
+        mamba_chunk_size (`int`, *optional*, defaults to 256):
+            The chunks in which to break the sequence when doing prefill/training
+        mamba_conv_bias (`bool`, *optional*, defaults to `True`):
+            Flag indicating whether or not to use bias in the convolution layer of the mamba mixer block.
+        mamba_proj_bias (`bool`, *optional*, defaults to `False`):
+            Flag indicating whether or not to use bias in the input and output projections (["in_proj", "out_proj"]) of the mamba mixer block
+        mamba_norm_before_gate (`bool`, *optional*, defaults to `True`):
+            Whether to use RMSNorm before the gate in the Mamba block
+        mamba_rms_norm (`bool`, *optional*, defaults to `False`):
+            Whether to use RMSNorm instead of LayerNorm in the Mamba block
+        projectors_bias (`bool`, *optional*, defaults to `False`):
+            Flag indicating whether or not to use bias in the input and output projections (["in_proj", "out_proj"]) of the attention block
+        rope_theta (`float`, *optional*, defaults to 100000.0):
+            The theta value used for the RoPE embeddings.
+        rope_scaling (`float`, *optional*):
+            The scaling value used for the RoPE embeddings. If `None`, no scaling is applied.
+        lm_head_multiplier (`float`, *optional*, defaults to 1.0):
+            The multiplier for the LM head. This is used to scale the output of the LM head.
+        embedding_multiplier (`float`, *optional*, defaults to 1.0):
+            The multiplier for the embedding layer. This is used to scale the output of the embedding layer.
+        mlp_multipliers (`list[float]`, *optional*):
+            The multipliers for the MLP layers. This is used to scale the output of the MLP layers. The first value is
+            the multiplier of gate layer, the second value is the multiplier of the down_proj layer.
+        key_multiplier (`float`, *optional*):
+            The multiplier for the key layer. This is used to scale the output of the key layer.
+        attention_out_multiplier (`float`, *optional*):
+            The multiplier for the attention output layer. This is used to scale the output of the attention output
+        attention_in_multiplier (`float`, *optional*):
+            The multiplier for the attention input layer. This is used to scale the output of the attention input layer.
+        ssm_multipliers (`list[float]`, *optional*):
+            The multipliers for the SSM layers. This is used to scale the output of the SSM layers.
+        ssm_in_multiplier (`float`, *optional*):
+            The multiplier for the SSM input layer. This is used to scale the output of the SSM input layer.
+        ssm_out_multiplier (`float`, *optional*):
+            The multiplier for the SSM output layer. This is used to scale the output of the SSM output layer.
+    """
+
+    model_type = "falcon_h1"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=128000,
+        tie_word_embeddings=False,
+        hidden_size=4096,
+        intermediate_size=14336,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=8,
+        hidden_act="silu",
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        num_logits_to_keep=1,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        max_position_embeddings=8192,
+        attention_dropout=0.0,
+        mamba_d_ssm=1024,
+        mamba_n_heads=128,
+        mamba_d_head="auto",
+        mamba_n_groups=1,
+        mamba_d_state=256,
+        mamba_d_conv=4,
+        mamba_expand=2,
+        mamba_chunk_size=256,
+        mamba_conv_bias=True,
+        mamba_proj_bias=False,
+        mamba_norm_before_gate=True,
+        mamba_rms_norm=False,
+        projectors_bias=False,
+        rope_theta=100000.0,
+        rope_scaling=None,
+        lm_head_multiplier=1.0,
+        embedding_multiplier=1.0,
+        mlp_multipliers=None,
+        key_multiplier=None,
+        attention_out_multiplier=None,
+        attention_in_multiplier=None,
+        ssm_multipliers=None,
+        ssm_in_multiplier=None,
+        ssm_out_multiplier=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.max_position_embeddings = max_position_embeddings
+        self.attention_dropout = attention_dropout
+        self.attention_bias = False
+        self.mlp_bias = False
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+
+        self.use_cache = use_cache
+        self.num_logits_to_keep = num_logits_to_keep
+
+        self.rope_theta = rope_theta
+        self.rope_scaling = None
+        self.rope_scaling = rope_scaling
+        self.projectors_bias = projectors_bias
+        self.mamba_intermediate = mamba_intermediate = (
+            mamba_expand * hidden_size if mamba_d_ssm is None else mamba_d_ssm
+        )
+
+        if mamba_intermediate % mamba_n_heads != 0:
+            raise ValueError("mamba_n_heads must divide mamba_expand * hidden_size")
+
+        # for the mamba_v2, must satisfy the following
+        if mamba_d_head == "auto":
+            mamba_d_head = mamba_intermediate // mamba_n_heads
+
+        if mamba_d_head * mamba_n_heads != mamba_intermediate:
+            raise ValueError(
+                "The dimensions for the Mamba head state do not match the model intermediate_size"
+            )
+
+        self.mamba_d_ssm = mamba_d_ssm
+        self.mamba_n_heads = mamba_n_heads
+        self.mamba_d_head = mamba_d_head
+        self.mamba_n_groups = mamba_n_groups
+        self.mamba_d_state = mamba_d_state
+        self.mamba_d_conv = mamba_d_conv
+        self.mamba_expand = mamba_expand
+        self.mamba_chunk_size = mamba_chunk_size
+        self.mamba_conv_bias = mamba_conv_bias
+        self.mamba_proj_bias = mamba_proj_bias
+
+        self.mamba_norm_before_gate = mamba_norm_before_gate
+        self.mamba_rms_norm = mamba_rms_norm
+
+        self.lm_head_multiplier = lm_head_multiplier
+        self.embedding_multiplier = embedding_multiplier
+
+        if mlp_multipliers is not None:
+            self.mlp_multipliers = mlp_multipliers
+        else:
+            self.mlp_multipliers = [1.0, 1.0]
+
+        if attention_out_multiplier is not None:
+            self.attention_out_multiplier = attention_out_multiplier
+        else:
+            self.attention_out_multiplier = 1.0
+
+        if attention_in_multiplier is not None:
+            self.attention_in_multiplier = attention_in_multiplier
+        else:
+            self.attention_in_multiplier = 1.0
+
+        if key_multiplier is not None:
+            self.key_multiplier = key_multiplier
+        else:
+            self.key_multiplier = 1.0
+
+        if ssm_multipliers is not None:
+            self.ssm_multipliers = ssm_multipliers
+        else:
+            self.ssm_multipliers = [1.0, 1.0, 1.0, 1.0, 1.0]
+
+        if ssm_in_multiplier is not None:
+            self.ssm_in_multiplier = ssm_in_multiplier
+        else:
+            self.ssm_in_multiplier = 1.0
+
+        if ssm_out_multiplier is not None:
+            self.ssm_out_multiplier = ssm_out_multiplier
+        else:
+            self.ssm_out_multiplier = 1.0
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    @property
+    def layers_block_type(self):
+        return ["falcon_h1" for i in range(self.num_hidden_layers)]
+
+    @property
+    def full_attention_layer_ids(self):
+        # For Falcon-H1, we do have attention on all layers
+        return range(self.num_hidden_layers)
+
+    @property
+    def linear_layer_ids(self):
+        # For Falcon-H1, we do have mamba on all layers
+        return range(self.num_hidden_layers)
+
+    @property
+    def mamba2_cache_params(self):
+        from sglang.srt.layers.dp_attention import get_attention_tp_size
+
+        shape = Mamba2StateShape.create(
+            tp_world_size=get_attention_tp_size(),
+            intermediate_size=self.mamba_intermediate,
+            n_groups=self.mamba_n_groups,
+            num_heads=self.mamba_n_heads,
+            head_dim=self.mamba_d_head,
+            state_size=self.mamba_d_state,
+            conv_kernel=self.mamba_d_conv,
+        )
+        return Mamba2CacheParams(
+            shape=shape, layers=self.linear_layer_ids, dtype=mamba2_state_dtype(self)
+        )
diff --git a/sglang/python/sglang/srt/configs/granitemoehybrid.py b/sglang/python/sglang/srt/configs/granitemoehybrid.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d2a7a7d1b024ada4e0e2ff811ada5edc4bb776c
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/granitemoehybrid.py
@@ -0,0 +1,301 @@
+# coding=utf-8
+# Copyright 2025 IBM and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""GraniteMoeHybrid model configuration"""
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+from sglang.srt.configs.mamba_utils import Mamba2CacheParams, Mamba2StateShape
+
+logger = logging.get_logger(__name__)
+
+MAMBA = "mamba"
+ATTENTION = "attention"
+
+
+class GraniteMoeHybridConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GraniteMoeHybridModel`]. It is used to instantiate a
+    GraniteMoeHybrid model according to the specified arguments, defining the model architecture. The GraniteMoeHybrid is a
+    hybrid architecture combining Mamba2 layers with attention layers, developed by IBM.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 100352):
+            Vocabulary size of the GraniteMoeHybrid model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`GraniteMoeHybridModel`]
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the
+            model has a output word embedding layer.
+        hidden_size (`int`, *optional*, defaults to 2048):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 8192):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 40):
+            Number of hidden layers in the model.
+        layer_types (`list[str]`, *optional*):
+            List of layer types for each layer. Each element should be either "mamba" or "attention".
+            If not provided, defaults to alternating pattern based on num_hidden_layers.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        initializer_range (`float`, *optional*, defaults to 0.1):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        normalization_function (`str`, *optional*, defaults to `"rmsnorm"`):
+            The normalization function to use. Currently only "rmsnorm" is supported.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*, defaults to 100256):
+            The id of the padding token.
+        bos_token_id (`int`, *optional*, defaults to 100257):
+            The id of the "beginning-of-sequence" token.
+        eos_token_id (`int`, *optional*, defaults to 100257):
+            The id of the "end-of-sequence" token.
+        max_position_embeddings (`int`, *optional*, defaults to 131072):
+            Max cached sequence length for the model
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in attention layers.
+        position_embedding_type (`str`, *optional*, defaults to `"nope"`):
+            Type of position embedding. Can be "nope" (no position embedding) or "rope".
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The theta value used for the RoPE embeddings.
+        rope_scaling (`dict`, *optional*):
+            The scaling configuration for the RoPE embeddings. If `None`, no scaling is applied.
+        mamba_d_state (`int`, *optional*, defaults to 128):
+            The dimension of the mamba state space latents
+        mamba_d_conv (`int`, *optional*, defaults to 4):
+            The size of the mamba convolution kernel
+        mamba_expand (`int`, *optional*, defaults to 2):
+            Expanding factor (relative to hidden_size) used to determine the mamba intermediate size
+        mamba_d_head (`int`, *optional*, defaults to 64):
+            Head embedding dimension size for Mamba
+        mamba_n_heads (`int`, *optional*, defaults to 64):
+            The number of mamba heads
+        mamba_n_groups (`int`, *optional*, defaults to 1):
+            The number of the mamba groups
+        mamba_chunk_size (`int`, *optional*, defaults to 256):
+            The chunks in which to break the sequence when doing prefill/training
+        mamba_conv_bias (`bool`, *optional*, defaults to `True`):
+            Flag indicating whether or not to use bias in the convolution layer of the mamba mixer block.
+        mamba_proj_bias (`bool`, *optional*, defaults to `False`):
+            Flag indicating whether or not to use bias in the input and output projections of the mamba mixer block
+        embedding_multiplier (`float`, *optional*, defaults to 12.0):
+            The multiplier for the embedding layer. This is used to scale the output of the embedding layer.
+        logits_scaling (`float`, *optional*, defaults to 8.0):
+            The scaling factor for the logits.
+        attention_multiplier (`float`, *optional*, defaults to 0.015625):
+            The multiplier for the attention layers.
+        residual_multiplier (`float`, *optional*, defaults to 0.22):
+            The multiplier for residual connections.
+        num_local_experts (`int`, *optional*, defaults to 0):
+            Number of local experts in MoE layers.
+        num_experts_per_tok (`int`, *optional*, defaults to 0):
+            Number of experts to use per token in MoE layers.
+        shared_intermediate_size (`int`, *optional*, defaults to 8192):
+            Intermediate size for shared experts.
+        output_router_logits (`bool`, *optional*, defaults to `False`):
+            Whether to output router logits.
+        router_aux_loss_coef (`float`, *optional*, defaults to 0.01):
+            Auxiliary loss coefficient for the router.
+    """
+
+    model_type = "granitemoehybrid"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=100352,
+        tie_word_embeddings=True,
+        hidden_size=2048,
+        intermediate_size=8192,
+        num_hidden_layers=40,
+        layer_types=None,
+        num_attention_heads=32,
+        num_key_value_heads=8,
+        hidden_act="silu",
+        initializer_range=0.1,
+        rms_norm_eps=1e-5,
+        normalization_function="rmsnorm",
+        use_cache=True,
+        pad_token_id=100256,
+        bos_token_id=100257,
+        eos_token_id=100257,
+        max_position_embeddings=131072,
+        attention_dropout=0.0,
+        attention_bias=False,
+        position_embedding_type="nope",
+        rope_theta=10000.0,
+        rope_scaling=None,
+        mamba_d_state=128,
+        mamba_d_conv=4,
+        mamba_expand=2,
+        mamba_d_head=64,
+        mamba_n_heads=64,
+        mamba_n_groups=1,
+        mamba_chunk_size=256,
+        mamba_conv_bias=True,
+        mamba_proj_bias=False,
+        embedding_multiplier=12.0,
+        logits_scaling=8.0,
+        attention_multiplier=0.015625,
+        residual_multiplier=0.22,
+        num_local_experts=0,
+        num_experts_per_tok=0,
+        shared_intermediate_size=8192,
+        output_router_logits=False,
+        router_aux_loss_coef=0.01,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+
+        # Set layer types - if not provided, create default pattern
+        if layer_types is None:
+            # Default pattern: mamba layers with attention every 6th layer (roughly)
+            self.layer_types = []
+            for i in range(num_hidden_layers):
+                if (i + 1) % 6 == 0:
+                    self.layer_types.append(ATTENTION)
+                else:
+                    self.layer_types.append(MAMBA)
+        else:
+            self.layer_types = layer_types
+
+        # Validate layer_types
+        if len(self.layer_types) != self.num_hidden_layers:
+            raise ValueError(
+                f"layer_types must have length equal to num_hidden_layers ({num_hidden_layers}), "
+                f"but got {len(self.layer_types)}"
+            )
+
+        for layer_type in self.layer_types:
+            if layer_type not in [MAMBA, ATTENTION]:
+                raise ValueError(
+                    f"Each element in layer_types must be either '{MAMBA}' or '{ATTENTION}', "
+                    f"but got '{layer_type}'"
+                )
+
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.normalization_function = normalization_function
+
+        self.use_cache = use_cache
+        self.max_position_embeddings = max_position_embeddings
+        self.attention_dropout = attention_dropout
+        self.attention_bias = attention_bias
+
+        self.position_embedding_type = position_embedding_type
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+
+        # Mamba configuration
+        self.mamba_d_state = mamba_d_state
+        self.mamba_d_conv = mamba_d_conv
+        self.mamba_expand = mamba_expand
+        self.mamba_d_head = mamba_d_head
+        self.mamba_n_heads = mamba_n_heads
+        self.mamba_n_groups = mamba_n_groups
+        self.mamba_chunk_size = mamba_chunk_size
+        self.mamba_conv_bias = mamba_conv_bias
+        self.mamba_proj_bias = mamba_proj_bias
+
+        # Calculate mamba intermediate size
+        self.mamba_intermediate_size = mamba_expand * hidden_size
+
+        # Validate mamba configuration
+        if self.mamba_intermediate_size % mamba_n_heads != 0:
+            raise ValueError(
+                f"mamba_intermediate_size ({self.mamba_intermediate_size}) must be divisible by "
+                f"mamba_n_heads ({mamba_n_heads})"
+            )
+
+        if mamba_d_head * mamba_n_heads != self.mamba_intermediate_size:
+            raise ValueError(
+                f"mamba_d_head ({mamba_d_head}) * mamba_n_heads ({mamba_n_heads}) must equal "
+                f"mamba_intermediate_size ({self.mamba_intermediate_size})"
+            )
+
+        # Scaling factors
+        self.embedding_multiplier = embedding_multiplier
+        self.logits_scaling = logits_scaling
+        self.attention_multiplier = attention_multiplier
+        self.residual_multiplier = residual_multiplier
+
+        # MoE configuration
+        self.num_local_experts = num_local_experts
+        self.num_experts_per_tok = num_experts_per_tok
+        self.shared_intermediate_size = shared_intermediate_size
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    @property
+    def mamba_layer_ids(self):
+        """Returns the indices of layers that are Mamba layers."""
+        return [
+            i for i in range(self.num_hidden_layers) if self.layer_types[i] == MAMBA
+        ]
+
+    @property
+    def attention_layer_ids(self):
+        """Returns the indices of layers that are attention layers."""
+        return [
+            i for i in range(self.num_hidden_layers) if self.layer_types[i] == ATTENTION
+        ]
+
+    @property
+    def full_attention_layer_ids(self):
+        """Alias for attention_layer_ids for compatibility."""
+        return self.attention_layer_ids
+
+    @property
+    def mamba2_cache_params(self):
+        """Returns the Mamba2 cache parameters for this configuration."""
+        from sglang.srt.layers.dp_attention import get_attention_tp_size
+
+        shape = Mamba2StateShape.create(
+            tp_world_size=get_attention_tp_size(),
+            intermediate_size=self.mamba_intermediate_size,
+            n_groups=self.mamba_n_groups,
+            num_heads=self.mamba_n_heads,
+            head_dim=self.mamba_d_head,
+            state_size=self.mamba_d_state,
+            conv_kernel=self.mamba_d_conv,
+        )
+        return Mamba2CacheParams(shape=shape, layers=self.mamba_layer_ids)
diff --git a/sglang/python/sglang/srt/configs/internvl.py b/sglang/python/sglang/srt/configs/internvl.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ba9c61c10e043bcbebe34b1760790ebfc120c4b
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/internvl.py
@@ -0,0 +1,706 @@
+import copy
+import os
+from shutil import copyfile
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import sentencepiece as spm
+from transformers import (
+    TOKENIZER_MAPPING,
+    GptOssConfig,
+    LlamaConfig,
+    PretrainedConfig,
+    PreTrainedTokenizer,
+    Qwen2Config,
+    Qwen3Config,
+    Qwen3MoeConfig,
+)
+
+from sglang.utils import logger
+
+# Copied from: https://github.com/OpenGVLab/InternVL/blob/34a81000402bf8f716bab8c9b57aff1f6b436bd0/internvl_chat/internvl/model/internvl_chat/configuration_internvl_chat.py#L21
+
+
+VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
+
+PRETRAINED_VOCAB_FILES_MAP = {}
+
+
+# Modified from transformers.model.llama.configuration_llama.LlamaConfig
+class InternLM2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`InternLM2Model`]. It is used to instantiate
+    an InternLM2 model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the InternLM2-7B.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the InternLM2 model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`InternLM2Model`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 11008):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings(`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        Example:
+
+    """
+
+    model_type = "internlm2"
+    _auto_class = "AutoConfig"
+
+    def __init__(  # pylint: disable=W0102
+        self,
+        vocab_size=103168,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        bias=True,
+        rope_theta=10000,
+        rope_scaling=None,
+        attn_implementation="eager",
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.bias = bias
+
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self._rope_scaling_validation()
+
+        self.attn_implementation = attn_implementation
+        if self.attn_implementation is None:
+            self.attn_implementation = "eager"
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    def _rope_scaling_validation(self):
+        """
+        Validate the `rope_scaling` configuration.
+        """
+        if self.rope_scaling is None:
+            return
+
+        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+            raise ValueError(
+                "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, "
+                f"got {self.rope_scaling}"
+            )
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_factor = self.rope_scaling.get("factor", None)
+        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
+            raise ValueError(
+                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+            )
+        if (
+            rope_scaling_factor is None
+            or not isinstance(rope_scaling_factor, (float, int))
+            or rope_scaling_factor < 1.0
+        ):
+            raise ValueError(
+                f"`rope_scaling`'s factor field must be a float|int >= 1, got {rope_scaling_factor=}, {type(rope_scaling_factor)=}"
+            )
+        if isinstance(rope_scaling_factor, int):
+            rope_scaling_factor = float(rope_scaling_factor)
+
+
+class InternVisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`InternVisionModel`]. It is used to
+    instantiate a vision encoder according to the specified arguments, defining the model architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        num_channels (`int`, *optional*, defaults to 3):
+            Number of color channels in the input images (e.g., 3 for RGB).
+        patch_size (`int`, *optional*, defaults to 14):
+            The size (resolution) of each patch.
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        qkv_bias (`bool`, *optional*, defaults to `False`):
+            Whether to add a bias to the queries and values in the self-attention layers.
+        hidden_size (`int`, *optional*, defaults to 3200):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_attention_heads (`int`, *optional*, defaults to 25):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 12800):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        qk_normalization (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the queries and keys in the self-attention layers.
+        num_hidden_layers (`int`, *optional*, defaults to 48):
+            Number of hidden layers in the Transformer encoder.
+        use_flash_attn (`bool`, *optional*, defaults to `True`):
+            Whether to use flash attention mechanism.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-6):
+            The epsilon used by the layer normalization layers.
+        dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        drop_path_rate (`float`, *optional*, defaults to 0.0):
+            Dropout rate for stochastic depth.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        initializer_factor (`float`, *optional*, defaults to 0.1):
+            A factor for layer scale.
+    """
+
+    model_type = "intern_vit_6b"
+
+    def __init__(
+        self,
+        num_channels=3,
+        patch_size=14,
+        image_size=224,
+        qkv_bias=False,
+        hidden_size=3200,
+        num_attention_heads=25,
+        intermediate_size=12800,
+        qk_normalization=True,
+        num_hidden_layers=48,
+        use_flash_attn=True,
+        hidden_act="gelu",
+        layer_norm_eps=1e-6,
+        dropout=0.0,
+        drop_path_rate=0.0,
+        attention_dropout=0.0,
+        initializer_range=0.02,
+        initializer_factor=0.1,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.dropout = dropout
+        self.drop_path_rate = drop_path_rate
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.image_size = image_size
+        self.initializer_range = initializer_range
+        self.initializer_factor = initializer_factor
+        self.attention_dropout = attention_dropout
+        self.layer_norm_eps = layer_norm_eps
+        self.hidden_act = hidden_act
+        self.qkv_bias = qkv_bias
+        self.qk_normalization = qk_normalization
+        self.use_flash_attn = use_flash_attn
+
+    @classmethod
+    def from_pretrained(
+        cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
+    ) -> "PretrainedConfig":
+        config_dict, kwargs = cls.get_config_dict(
+            pretrained_model_name_or_path, **kwargs
+        )
+
+        if "vision_config" in config_dict:
+            config_dict = config_dict["vision_config"]
+
+        if (
+            "model_type" in config_dict
+            and hasattr(cls, "model_type")
+            and config_dict["model_type"] != cls.model_type
+        ):
+            logger.warning(
+                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+            )
+
+        return cls.from_dict(config_dict, **kwargs)
+
+
+class InternVLChatConfig(PretrainedConfig):
+    model_type = "internvl_chat"
+    is_composition = True
+
+    def __init__(
+        self,
+        vision_config=None,
+        llm_config=None,
+        use_backbone_lora=0,
+        use_llm_lora=0,
+        pad2square=False,
+        select_layer=-1,
+        force_image_size=None,
+        downsample_ratio=0.5,
+        template=None,
+        dynamic_image_size=False,
+        use_thumbnail=False,
+        ps_version="v1",
+        min_dynamic_patch=1,
+        max_dynamic_patch=6,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        if vision_config is None:
+            vision_config = {"architectures": ["InternVisionModel"]}
+            logger.info(
+                "vision_config is None. Initializing the InternVisionConfig with default values."
+            )
+
+        if llm_config is None:
+            llm_config = {"architectures": ["InternLM2ForCausalLM"]}
+            logger.info(
+                "llm_config is None. Initializing the LlamaConfig config with default values (`LlamaConfig`)."
+            )
+
+        self.vision_config = InternVisionConfig(**vision_config)
+        if llm_config.get("architectures")[0] == "LlamaForCausalLM":
+            self.llm_config = LlamaConfig(**llm_config)
+        elif llm_config.get("architectures")[0] == "InternLM2ForCausalLM":
+            self.llm_config = InternLM2Config(**llm_config)
+        elif llm_config.get("architectures")[0] == "Qwen2ForCausalLM":
+            self.llm_config = Qwen2Config(**llm_config)
+        elif llm_config.get("architectures")[0] == "Qwen3MoeForCausalLM":
+            self.llm_config = Qwen3MoeConfig(**llm_config)
+        elif llm_config.get("architectures")[0] == "Qwen3ForCausalLM":
+            self.llm_config = Qwen3Config(**llm_config)
+        elif llm_config.get("architectures")[0] == "GptOssForCausalLM":
+            self.llm_config = GptOssConfig(**llm_config)
+        else:
+            raise ValueError(
+                "Unsupported architecture: {}".format(
+                    llm_config.get("architectures")[0]
+                )
+            )
+
+        self.use_backbone_lora = use_backbone_lora
+        self.use_llm_lora = use_llm_lora
+        self.pad2square = pad2square
+        self.select_layer = select_layer
+        self.force_image_size = force_image_size
+        self.downsample_ratio = downsample_ratio
+        self.template = template
+        self.dynamic_image_size = dynamic_image_size
+        self.use_thumbnail = use_thumbnail
+        self.ps_version = ps_version  # pixel shuffle version
+        self.min_dynamic_patch = min_dynamic_patch
+        self.max_dynamic_patch = max_dynamic_patch
+
+        self.hidden_size = self.llm_config.hidden_size
+        # By default, we use tie_word_embeddings=False for models of all sizes.
+        self.tie_word_embeddings = False
+        self.llm_config.tie_word_embeddings = self.tie_word_embeddings
+
+    def to_dict(self):
+        """
+        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
+
+        Returns:
+            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
+        """
+        output = copy.deepcopy(self.__dict__)
+        output["vision_config"] = self.vision_config.to_dict()
+        output["llm_config"] = self.llm_config.to_dict()
+        output["model_type"] = self.__class__.model_type
+        output["use_backbone_lora"] = self.use_backbone_lora
+        output["use_llm_lora"] = self.use_llm_lora
+        output["select_layer"] = self.select_layer
+        output["force_image_size"] = self.force_image_size
+        output["downsample_ratio"] = self.downsample_ratio
+        output["template"] = self.template
+        output["dynamic_image_size"] = self.dynamic_image_size
+        output["use_thumbnail"] = self.use_thumbnail
+        output["ps_version"] = self.ps_version
+        output["min_dynamic_patch"] = self.min_dynamic_patch
+        output["max_dynamic_patch"] = self.max_dynamic_patch
+
+        return output
+
+
+# # Modified from transformers.model.llama.tokenization_llama_fast.LlamaTokenizerFast -> InternLM2TokenizerFast
+# class InternLM2TokenizerFast(PreTrainedTokenizerFast):
+#     vocab_files_names = VOCAB_FILES_NAMES
+#     slow_tokenizer_class = InternLM2Tokenizer
+#     padding_side = 'left'
+#     model_input_names = ['input_ids', 'attention_mask']
+#     _auto_class = 'AutoTokenizer'
+#
+#     def __init__(
+#         self,
+#         vocab_file,
+#         unk_token='<unk>',
+#         bos_token='<s>',
+#         eos_token='</s>',
+#         pad_token='</s>',
+#         sp_model_kwargs: Optional[Dict[str, Any]] = None,
+#         add_bos_token=True,
+#         add_eos_token=False,
+#         decode_with_prefix_space=False,
+#         clean_up_tokenization_spaces=False,
+#         **kwargs,
+#     ):
+#         super().__init__(
+#             vocab_file=vocab_file,
+#             unk_token=unk_token,
+#             bos_token=bos_token,
+#             eos_token=eos_token,
+#             pad_token=pad_token,
+#             sp_model_kwargs=sp_model_kwargs,
+#             add_bos_token=add_bos_token,
+#             add_eos_token=add_eos_token,
+#             decode_with_prefix_space=decode_with_prefix_space,
+#             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+#             **kwargs,
+#         )
+#         self._add_bos_token = add_bos_token
+#         self._add_eos_token = add_eos_token
+#         self.update_post_processor()
+#         self.vocab_file = vocab_file
+#
+#     @property
+#     def can_save_slow_tokenizer(self) -> bool:
+#         return os.path.isfile(self.vocab_file) if self.vocab_file else False
+#
+#     def update_post_processor(self):
+#         """
+#         Updates the underlying post processor with the current `bos_token` and `eos_token`.
+#         """
+#         bos = self.bos_token
+#         bos_token_id = self.bos_token_id
+#         if bos is None and self.add_bos_token:
+#             raise ValueError('add_bos_token = True but bos_token = None')
+#
+#         eos = self.eos_token
+#         eos_token_id = self.eos_token_id
+#         if eos is None and self.add_eos_token:
+#             raise ValueError('add_eos_token = True but eos_token = None')
+#
+#         single = f"{(bos + ':0 ') if self.add_bos_token else ''}$A:0{(' ' + eos + ':0') if self.add_eos_token else ''}"
+#         pair = f"{single}{(' ' + bos + ':1') if self.add_bos_token else ''} $B:1{(' ' + eos + ':1') if self.add_eos_token else ''}"
+#
+#         special_tokens = []
+#         if self.add_bos_token:
+#             special_tokens.append((bos, bos_token_id))
+#         if self.add_eos_token:
+#             special_tokens.append((eos, eos_token_id))
+#         self._tokenizer.post_processor = processors.TemplateProcessing(
+#             single=single, pair=pair, special_tokens=special_tokens
+#         )
+#
+#     @property
+#     def add_eos_token(self):
+#         return self._add_eos_token
+#
+#     @property
+#     def add_bos_token(self):
+#         return self._add_bos_token
+#
+#     @add_eos_token.setter
+#     def add_eos_token(self, value):
+#         self._add_eos_token = value
+#         self.update_post_processor()
+#
+#     @add_bos_token.setter
+#     def add_bos_token(self, value):
+#         self._add_bos_token = value
+#         self.update_post_processor()
+#
+#     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+#         if not self.can_save_slow_tokenizer:
+#             raise ValueError(
+#                 'Your fast tokenizer does not have the necessary information to save the vocabulary for a slow '
+#                 'tokenizer.'
+#             )
+#
+#         if not os.path.isdir(save_directory):
+#             logger.error(f'Vocabulary path ({save_directory}) should be a directory')
+#             return
+#         out_vocab_file = os.path.join(
+#             save_directory, (filename_prefix + '-' if filename_prefix else '') + VOCAB_FILES_NAMES['vocab_file']
+#         )
+#
+#         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+#             copyfile(self.vocab_file, out_vocab_file)
+#
+#         return (out_vocab_file,)
+
+
+# Modified from transformers.model.llama.tokenization_llama.LlamaTokenizer
+class InternLM2Tokenizer(PreTrainedTokenizer):
+    """
+    Construct a InternLM2 tokenizer. Based on byte-level Byte-Pair-Encoding.
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    model_input_names = ["input_ids", "attention_mask"]
+    _auto_class = "AutoTokenizer"
+
+    def __init__(
+        self,
+        vocab_file,
+        unk_token="<unk>",
+        bos_token="<s>",
+        eos_token="</s>",
+        pad_token="</s>",
+        sp_model_kwargs: Optional[Dict[str, Any]] = None,
+        add_bos_token=True,
+        add_eos_token=False,
+        decode_with_prefix_space=False,
+        clean_up_tokenization_spaces=False,
+        **kwargs,
+    ):
+        print("register succeed")
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+        self.vocab_file = vocab_file
+        self.add_bos_token = add_bos_token
+        self.add_eos_token = add_eos_token
+        self.decode_with_prefix_space = decode_with_prefix_space
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(vocab_file)
+        self._no_prefix_space_tokens = None
+        super().__init__(
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs,
+        )
+
+    @property
+    def no_prefix_space_tokens(self):
+        if self._no_prefix_space_tokens is None:
+            vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
+            self._no_prefix_space_tokens = {
+                i for i, tok in enumerate(vocab) if not tok.startswith("▁")
+            }
+        return self._no_prefix_space_tokens
+
+    @property
+    def vocab_size(self):
+        """Returns vocab size"""
+        return self.sp_model.get_piece_size()
+
+    @property
+    def bos_token_id(self) -> Optional[int]:
+        return self.sp_model.bos_id()
+
+    @property
+    def eos_token_id(self) -> Optional[int]:
+        return self.sp_model.eos_id()
+
+    def get_vocab(self):
+        """Returns vocab as a dict"""
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def _tokenize(self, text):
+        """Returns a tokenized string."""
+        return self.sp_model.encode(text, out_type=str)
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.sp_model.piece_to_id(token)
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        token = self.sp_model.IdToPiece(index)
+        return token
+
+    def _maybe_add_prefix_space(self, tokens, decoded):
+        if tokens and tokens[0] not in self.no_prefix_space_tokens:
+            return " " + decoded
+        else:
+            return decoded
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        current_sub_tokens = []
+        out_string = ""
+        prev_is_special = False
+        for token in tokens:
+            # make sure that special tokens are not decoded using sentencepiece model
+            if token in self.all_special_tokens:
+                if not prev_is_special:
+                    out_string += " "
+                out_string += self.sp_model.decode(current_sub_tokens) + token
+                prev_is_special = True
+                current_sub_tokens = []
+            else:
+                current_sub_tokens.append(token)
+                prev_is_special = False
+        out_string += self.sp_model.decode(current_sub_tokens)
+        out_string = self.clean_up_tokenization(out_string)
+        out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string)
+        return out_string[1:]
+
+    def save_vocabulary(
+        self, save_directory, filename_prefix: Optional[str] = None
+    ) -> Tuple[str]:
+        """
+        Save the vocabulary and special tokens file to a directory.
+
+        Args:
+            save_directory (`str`):
+                The directory in which to save the vocabulary.
+
+        Returns:
+            `Tuple(str)`: Paths to the files saved.
+        """
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        out_vocab_file = os.path.join(
+            save_directory,
+            (filename_prefix + "-" if filename_prefix else "")
+            + VOCAB_FILES_NAMES["vocab_file"],
+        )
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(
+            out_vocab_file
+        ) and os.path.isfile(self.vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+        elif not os.path.isfile(self.vocab_file):
+            with open(out_vocab_file, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)
+
+        return (out_vocab_file,)
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        if self.add_bos_token:
+            bos_token_ids = [self.bos_token_id]
+        else:
+            bos_token_ids = []
+
+        output = bos_token_ids + token_ids_0
+
+        if token_ids_1 is not None:
+            output = output + token_ids_1
+
+        if self.add_eos_token:
+            output = output + [self.eos_token_id]
+
+        return output
+
+    def get_special_tokens_mask(
+        self,
+        token_ids_0: List[int],
+        token_ids_1: Optional[List[int]] = None,
+        already_has_special_tokens: bool = False,
+    ) -> List[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0,
+                token_ids_1=token_ids_1,
+                already_has_special_tokens=True,
+            )
+
+        if token_ids_1 is None:
+            return [1] + ([0] * len(token_ids_0)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
+        use of token type ids, therefore a list of zeros is returned.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of zeros.
+        """
+        eos = [self.eos_token_id]
+
+        if token_ids_1 is None:
+            return len(token_ids_0 + eos) * [0]
+        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
+
+
+TOKENIZER_MAPPING.register(
+    InternVLChatConfig, (InternLM2Tokenizer, None), exist_ok=True
+)
diff --git a/sglang/python/sglang/srt/configs/janus_pro.py b/sglang/python/sglang/srt/configs/janus_pro.py
new file mode 100644
index 0000000000000000000000000000000000000000..d574953e95d92d4c68c67af8671f422e2be93f7f
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/janus_pro.py
@@ -0,0 +1,634 @@
+# Adapted from:
+# https://github.com/deepseek-ai/Janus/tree/main/janus/models
+
+from dataclasses import dataclass
+from typing import Dict, List, Tuple, Union
+
+import numpy as np
+import PIL
+import torch
+from PIL.Image import Image
+from transformers import (
+    BaseImageProcessor,
+    BatchFeature,
+    LlamaConfig,
+    LlamaTokenizerFast,
+    PretrainedConfig,
+    ProcessorMixin,
+)
+from transformers.image_utils import to_numpy_array
+
+from sglang.srt.configs.utils import register_image_processor, register_processor
+from sglang.srt.multimodal.mm_utils import expand2square
+
+
+class DictToObject(dict):
+    def __init__(self, dictionary):
+        super(self).__init__(dictionary)
+
+        for key, value in dictionary.items():
+            if isinstance(value, dict):
+                value = DictToObject(value)
+            setattr(self, key, value)
+
+
+class VisionConfig(PretrainedConfig):
+    model_type = "vision"
+    cls: str = ""
+    params = {}
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.cls = kwargs.get("cls", "")
+        if not isinstance(self.cls, str):
+            self.cls = self.cls.__name__
+
+        self.params = kwargs.get("params", {})
+
+
+class GenAlignerConfig(PretrainedConfig):
+    model_type = "gen_aligner"
+    cls: str = ""
+    params = {}
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.cls = kwargs.get("cls", "")
+        if not isinstance(self.cls, str):
+            self.cls = self.cls.__name__
+
+        self.params = kwargs.get("params", {})
+
+
+class GenHeadConfig(PretrainedConfig):
+    model_type = "gen_head"
+    cls: str = ""
+    params = {}
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.cls = kwargs.get("cls", "")
+        if not isinstance(self.cls, str):
+            self.cls = self.cls.__name__
+
+        self.params = kwargs.get("params", {})
+
+
+class AlignerConfig(PretrainedConfig):
+    model_type = "aligner"
+    cls: str = ""
+    params = {}
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.cls = kwargs.get("cls", "")
+        if not isinstance(self.cls, str):
+            self.cls = self.cls.__name__
+
+        self.params = kwargs.get("params", {})
+
+
+class GenVisionConfig(PretrainedConfig):
+    model_type = "gen_vision"
+    cls: str = ""
+    params = {}
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.cls = kwargs.get("cls", "")
+        if not isinstance(self.cls, str):
+            self.cls = self.cls.__name__
+
+        self.params = kwargs.get("params", {})
+
+
+@dataclass
+class SigLIPVisionCfg:
+    width: int = 1152
+    layers: Union[Tuple[int, int, int, int], int] = 27
+    heads: int = 16
+    patch_size: int = 14
+    image_size: Union[Tuple[int, int], int] = 336
+    global_pool: str = "map"
+    mlp_ratio: float = 3.7362
+    class_token: bool = False
+    num_classes: int = 0
+    use_checkpoint: bool = False
+
+
+class MultiModalityConfig(PretrainedConfig):
+    model_type = "multi_modality"
+    vision_config: VisionConfig
+    aligner_config: AlignerConfig
+
+    gen_vision_config: GenVisionConfig
+    gen_aligner_config: GenAlignerConfig
+    gen_head_config: GenHeadConfig
+
+    language_config: LlamaConfig
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        vision_config = kwargs.get("vision_config", {})
+        self.vision_config = VisionConfig(**vision_config)
+
+        aligner_config = kwargs.get("aligner_config", {})
+        self.aligner_config = AlignerConfig(**aligner_config)
+
+        gen_vision_config = kwargs.get("gen_vision_config", {})
+        self.gen_vision_config = GenVisionConfig(**gen_vision_config)
+
+        gen_aligner_config = kwargs.get("gen_aligner_config", {})
+        self.gen_aligner_config = GenAlignerConfig(**gen_aligner_config)
+
+        gen_head_config = kwargs.get("gen_head_config", {})
+        self.gen_head_config = GenHeadConfig(**gen_head_config)
+
+        language_config = kwargs.get("language_config", {})
+        if isinstance(language_config, LlamaConfig):
+            self.language_config = language_config
+        else:
+            self.language_config = LlamaConfig(**language_config)
+
+
+class VLMImageProcessor(BaseImageProcessor):
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        image_size: int,
+        min_size: int = 14,
+        image_mean: Union[Tuple[float, float, float], List[float]] = (
+            0.48145466,
+            0.4578275,
+            0.40821073,
+        ),
+        image_std: Union[Tuple[float, float, float], List[float]] = (
+            0.26862954,
+            0.26130258,
+            0.27577711,
+        ),
+        rescale_factor: float = 1.0 / 255.0,
+        do_normalize: bool = True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.image_size = image_size
+        self.rescale_factor = rescale_factor
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.min_size = min_size
+        self.do_normalize = do_normalize
+
+        if image_mean is None:
+            self.background_color = (127, 127, 127)
+        else:
+            self.background_color = tuple([int(x * 255) for x in image_mean])
+
+    def resize(self, pil_img: Image) -> np.ndarray:
+        """
+
+        Args:
+            pil_img (PIL.Image): [H, W, 3] in PIL.Image in RGB
+
+        Returns:
+            x (np.ndarray): [3, self.image_size, self.image_size]
+        """
+
+        width, height = pil_img.size
+        max_size = max(width, height)
+
+        size = [
+            max(int(height / max_size * self.image_size), self.min_size),
+            max(int(width / max_size * self.image_size), self.min_size),
+        ]
+
+        if width <= 0 or height <= 0 or size[0] <= 0 or size[1] <= 0:
+            # print(f"orig size = {pil_img.size}, new size = {size}")
+            raise ValueError("Invalid size!")
+
+        def resize(
+            pil_img, size, interpolation=PIL.Image.Resampling.BICUBIC, antialias=True
+        ):
+            if isinstance(size, int):
+                w, h = pil_img.size
+                if (w <= h and w == size) or (h <= w and h == size):
+                    return pil_img
+                if w < h:
+                    ow = size
+                    oh = int(size * h / w)
+                else:
+                    oh = size
+                    ow = int(size * w / h)
+                size = (ow, oh)
+            else:
+                size = (size[1], size[0])
+
+            return pil_img.resize(
+                size, resample=interpolation, reducing_gap=None if antialias else 3.0
+            )
+
+        pil_img = resize(
+            pil_img, size, interpolation=PIL.Image.Resampling.BICUBIC, antialias=True
+        )
+
+        pil_img = expand2square(pil_img, self.background_color)
+        x = to_numpy_array(pil_img)
+
+        # [H, W, 3] -> [3, H, W]
+        x = np.transpose(x, (2, 0, 1))
+
+        return x
+
+    def preprocess(self, images, return_tensors: str = "pt", **kwargs) -> BatchFeature:
+        # resize and pad to [self.image_size, self.image_size]
+        # then convert from [H, W, 3] to [3, H, W]
+        if not isinstance(images, list):
+            images = [images]
+        images: List[np.ndarray] = [self.resize(image) for image in images]
+        images = [image[:3, ...] for image in images]
+
+        # rescale from [0, 255] -> [0, 1]
+        images = [
+            self.rescale(
+                image=image,
+                scale=self.rescale_factor,
+                input_data_format="channels_first",
+            )
+            for image in images
+        ]
+
+        # normalize
+        if self.do_normalize:
+            images = [
+                self.normalize(
+                    image=image,
+                    mean=self.image_mean,
+                    std=self.image_std,
+                    input_data_format="channels_first",
+                )
+                for image in images
+            ]
+        data = {"pixel_values": images}
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+    @property
+    def default_shape(self):
+        return [3, self.image_size, self.image_size]
+
+
+class DictOutput(object):
+    def items(self):
+        return self.__dict__.items()
+
+    def keys(self):
+        return self.__dict__.keys()
+
+    def __getitem__(self, item):
+        return self.__dict__[item]
+
+    def __contains__(self, key):
+        return key in self.__dict__
+
+    def __setitem__(self, key, value):
+        self.__dict__[key] = value
+
+
+@dataclass
+class VLChatProcessorOutput(DictOutput):
+    sft_format: str
+    input_ids: torch.Tensor
+    pixel_values: torch.Tensor
+    num_image_tokens: torch.IntTensor
+
+    def __len__(self):
+        return len(self.input_ids)
+
+
+@dataclass
+class BatchedVLChatProcessorOutput(DictOutput):
+    sft_format: List[str]
+    input_ids: torch.Tensor
+    pixel_values: torch.Tensor
+    attention_mask: torch.Tensor
+    images_seq_mask: torch.BoolTensor
+    images_emb_mask: torch.BoolTensor
+
+
+# FIXME: had to place Official Processor here, since image_processor module would not be imported in all threads,
+# hence AutoProcessor registration would not be affective in some cases
+class VLChatProcessor(ProcessorMixin):
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
+
+    attributes = ["image_processor", "tokenizer"]
+
+    def __init__(
+        self,
+        image_processor: VLMImageProcessor,
+        tokenizer: LlamaTokenizerFast,
+        image_tag: str = "<image_placeholder>",
+        image_start_tag: str = "<begin_of_image>",
+        image_end_tag: str = "<end_of_image>",
+        pad_tag: str = "<｜▁pad▁｜>",
+        num_image_tokens: int = 576,
+        add_special_token: bool = False,
+        sft_format: str = "deepseek",
+        mask_prompt: bool = True,
+        ignore_id: int = -100,
+        **kwargs,
+    ):
+        self.image_processor = image_processor
+        self.tokenizer = tokenizer
+
+        image_id = self.tokenizer.vocab.get(image_tag)
+        if image_id is None:
+            special_tokens = [image_tag]
+            special_tokens_dict = {"additional_special_tokens": special_tokens}
+            self.tokenizer.add_special_tokens(special_tokens_dict)
+            # print(f"Add image tag = {image_tag} to the tokenizer")
+
+        self.image_tag = image_tag
+        self.image_start_tag = image_start_tag
+        self.image_end_tag = image_end_tag
+        self.pad_tag = pad_tag
+
+        self.num_image_tokens = num_image_tokens
+        self.add_special_token = add_special_token
+        self.sft_format = sft_format
+        self.ignore_id = ignore_id
+
+        super().__init__(
+            image_processor,
+            tokenizer,
+            **kwargs,
+        )
+
+    @property
+    def image_token(self):
+        return self.image_tag
+
+    @property
+    def image_id(self) -> int:
+        image_id = self.tokenizer.vocab.get(self.image_tag)
+        return image_id
+
+    @property
+    def image_start_id(self):
+        image_start_id = self.tokenizer.vocab.get(self.image_start_tag)
+        return image_start_id
+
+    @property
+    def image_end_id(self):
+        image_end_id = self.tokenizer.vocab.get(self.image_end_tag)
+        return image_end_id
+
+    @property
+    def image_start_token(self):
+        return self.image_start_tag
+
+    @property
+    def image_end_token(self):
+        return self.image_end_tag
+
+    @property
+    def pad_id(self):
+        pad_id = self.tokenizer.vocab.get(self.pad_tag)
+        return pad_id
+
+    def add_image_token(
+        self,
+        image_indices: List[int],
+        input_ids: torch.LongTensor,
+    ):
+        """
+
+        Args:
+            image_indices (List[int]): [index_0, index_1, ..., index_j]
+            input_ids (torch.LongTensor): [N]
+
+        Returns:
+            input_ids (torch.LongTensor): [N + image tokens]
+            num_image_tokens (torch.IntTensor): [n_images]
+        """
+
+        input_slices = []
+
+        start = 0
+        for index in image_indices:
+            if self.add_special_token:
+                end = index + 1
+            else:
+                end = index
+
+            # original text tokens
+            input_slices.append(input_ids[start:end])
+
+            # add boi, image tokens, eoi and set the mask as False
+            input_slices.append(self.image_start_id * torch.ones((1), dtype=torch.long))
+            input_slices.append(
+                self.image_id * torch.ones((self.num_image_tokens,), dtype=torch.long)
+            )
+            input_slices.append(self.image_end_id * torch.ones((1), dtype=torch.long))
+            start = index + 1
+
+        # the left part
+        input_slices.append(input_ids[start:])
+
+        # concat all slices
+        input_ids = torch.cat(input_slices, dim=0)
+        num_image_tokens = torch.IntTensor([self.num_image_tokens] * len(image_indices))
+
+        return input_ids, num_image_tokens
+
+    def process_one(
+        self,
+        prompt: str = None,
+        images: List[Image] = None,
+        **kwargs,
+    ):
+        """
+
+        Args:
+            prompt (str): the formatted prompt;
+            images (List[ImageType]): the list of images;
+            **kwargs:
+
+        Returns:
+            outputs (BaseProcessorOutput): the output of the processor,
+                - input_ids (torch.LongTensor): [N + image tokens]
+                - target_ids (torch.LongTensor): [N + image tokens]
+                - images (torch.FloatTensor): [n_images, 3, H, W]
+                - image_id (int): the id of the image token
+                - num_image_tokens (List[int]): the number of image tokens
+        """
+
+        sft_format = prompt
+        # tokenize
+        input_ids = self.tokenizer.encode(sft_format)
+        input_ids = torch.LongTensor(input_ids)
+
+        # add image tokens to the input_ids
+        image_token_mask: torch.Tensor = (input_ids == self.image_id).to(torch.bool)
+        image_indices = image_token_mask.nonzero()
+        input_ids, num_image_tokens = self.add_image_token(
+            image_indices=image_indices,
+            input_ids=input_ids,
+        )
+
+        # load images
+        images_outputs = self.image_processor(images, return_tensors="pt")
+
+        prepare = VLChatProcessorOutput(
+            sft_format=sft_format,
+            input_ids=input_ids,
+            pixel_values=images_outputs.pixel_values,
+            num_image_tokens=num_image_tokens,
+        )
+
+        return prepare
+
+    def __call__(
+        self,
+        *,
+        prompt: str = None,
+        conversations: List[Dict[str, str]] = None,
+        images: List[Image] = None,
+        force_batchify: bool = True,
+        **kwargs,
+    ):
+        """
+
+        Args:
+            prompt (str): the formatted prompt;
+            conversations (List[Dict]): conversations with a list of messages;
+            images (List[ImageType]): the list of images;
+            force_batchify (bool): force batchify the inputs;
+            **kwargs:
+
+        Returns:
+            outputs (BaseProcessorOutput): the output of the processor,
+                - input_ids (torch.LongTensor): [N + image tokens]
+                - images (torch.FloatTensor): [n_images, 3, H, W]
+                - image_id (int): the id of the image token
+                - num_image_tokens (List[int]): the number of image tokens
+        """
+
+        prepare = self.process_one(
+            prompt=prompt, conversations=conversations, images=images
+        )
+
+        if force_batchify:
+            prepare = self.batchify([prepare])
+
+        return prepare
+
+    def batchify(
+        self, prepare_list: List[VLChatProcessorOutput]
+    ) -> BatchedVLChatProcessorOutput:
+        """
+        Preprocesses the inputs for multimodal inference.
+
+        Args:
+            prepare_list (List[VLChatProcessorOutput]): A list of VLChatProcessorOutput.
+
+        Returns:
+            BatchedVLChatProcessorOutput: A dictionary of the inputs to use for multimodal inference.
+        """
+
+        batch_size = len(prepare_list)
+        sft_format = []
+        n_images = []
+        seq_lens = []
+        for prepare in prepare_list:
+            n_images.append(len(prepare.num_image_tokens))
+            seq_lens.append(len(prepare))
+
+        input_token_max_len = max(seq_lens)
+        max_n_images = max(1, max(n_images))
+
+        batched_input_ids = torch.full(
+            (batch_size, input_token_max_len), self.pad_id
+        ).long()  # FIXME
+        batched_attention_mask = torch.zeros((batch_size, input_token_max_len)).long()
+        batched_pixel_values = torch.zeros(
+            (batch_size, max_n_images, *self.image_processor.default_shape)
+        ).float()
+        batched_images_seq_mask = torch.zeros((batch_size, input_token_max_len)).bool()
+        batched_images_emb_mask = torch.zeros(
+            (batch_size, max_n_images, self.num_image_tokens)
+        ).bool()
+
+        for i, prepare in enumerate(prepare_list):
+            input_ids = prepare.input_ids
+            seq_len = len(prepare)
+            n_image = len(prepare.num_image_tokens)
+            # left-padding
+            batched_attention_mask[i, -seq_len:] = 1
+            batched_input_ids[i, -seq_len:] = torch.LongTensor(input_ids)
+            batched_images_seq_mask[i, -seq_len:] = input_ids == self.image_id
+
+            if n_image > 0:
+                batched_pixel_values[i, :n_image] = prepare.pixel_values
+                for j, n_image_tokens in enumerate(prepare.num_image_tokens):
+                    batched_images_emb_mask[i, j, :n_image_tokens] = True
+
+            sft_format.append(prepare.sft_format)
+
+        batched_prepares = BatchedVLChatProcessorOutput(
+            input_ids=batched_input_ids,
+            attention_mask=batched_attention_mask,
+            pixel_values=batched_pixel_values,
+            images_seq_mask=batched_images_seq_mask,
+            images_emb_mask=batched_images_emb_mask,
+            sft_format=sft_format,
+        )
+
+        return batched_prepares
+
+
+class VLMImageProcessorConfig(PretrainedConfig):
+    model_type = "deepseek_vlm"
+    image_size: int
+    min_size: int
+    image_mean: Union[Tuple[float, float, float], List[float]]
+    image_std: Union[Tuple[float, float, float], List[float]]
+    rescale_factor: float
+    do_normalize: bool
+
+    def __init__(
+        self,
+        image_size: int,
+        min_size: int = 14,
+        image_mean: Union[Tuple[float, float, float], List[float]] = (
+            0.48145466,
+            0.4578275,
+            0.40821073,
+        ),
+        image_std: Union[Tuple[float, float, float], List[float]] = (
+            0.26862954,
+            0.26130258,
+            0.27577711,
+        ),
+        rescale_factor: float = 1.0 / 255.0,
+        do_normalize: bool = True,
+        **kwargs,
+    ):
+        self.image_size = image_size
+        self.min_size = min_size
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+
+        super().__init__(**kwargs)
+
+
+register_processor(MultiModalityConfig, VLChatProcessor)
+register_image_processor(MultiModalityConfig, VLMImageProcessor)
diff --git a/sglang/python/sglang/srt/configs/jet_nemotron.py b/sglang/python/sglang/srt/configs/jet_nemotron.py
new file mode 100644
index 0000000000000000000000000000000000000000..1670da3b67f592df7c81c9ea89923afccf651df5
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/jet_nemotron.py
@@ -0,0 +1,80 @@
+from dataclasses import dataclass
+from typing import Any
+
+from transformers.configuration_utils import PretrainedConfig
+
+from sglang.srt.configs.mamba_utils import (
+    Mamba2CacheParams,
+    Mamba2StateShape,
+    mamba2_state_dtype,
+)
+
+
+@dataclass
+class JetBlockConfig:
+    mode: str
+    expand_v: float
+    num_heads: int
+    head_dim: int
+    norm_eps: str
+    conv_size: int
+    dconv_generator_reduction: int
+    dconv_implementation: str
+
+
+class JetNemotronConfig(PretrainedConfig):
+    model_type: str = "jet_nemotron"
+
+    efficient_attention_config: dict[str, dict[str, Any]]
+    hidden_act: str
+    hidden_size: int
+    initializer_range: float
+    intermediate_size: int
+    layer_types: list[str]
+    max_position_embeddings: int
+    num_attention_heads: int
+    num_key_value_heads: int
+    rms_norm_eps: float
+    rope_scaling: None
+    rope_theta: float
+
+    @property
+    def full_attention_layer_ids(self) -> list[int]:
+        return [
+            idx
+            for idx, layer_type in enumerate(self.layer_types)
+            if layer_type in ("attn", "swa")
+        ]
+
+    @property
+    def linear_layer_ids(self) -> list[int]:
+        return [
+            idx
+            for idx, layer_type in enumerate(self.layer_types)
+            if layer_type == "jet"
+        ]
+
+    @property
+    def mamba2_cache_params(self) -> Mamba2CacheParams:
+        from sglang.srt.layers.dp_attention import get_attention_tp_size
+
+        jet_block_config = JetBlockConfig(**self.efficient_attention_config["jet"])
+
+        num_heads = jet_block_config.num_heads
+        head_k_dim = jet_block_config.head_dim
+        head_v_dim = int(head_k_dim * jet_block_config.expand_v)
+        total_v_dim = num_heads * head_v_dim
+
+        shape = Mamba2StateShape.create(
+            tp_world_size=get_attention_tp_size(),
+            intermediate_size=total_v_dim,
+            n_groups=num_heads,
+            num_heads=num_heads,
+            head_dim=head_v_dim,
+            state_size=head_k_dim,
+            conv_kernel=jet_block_config.conv_size,
+        )
+
+        return Mamba2CacheParams(
+            shape=shape, layers=self.linear_layer_ids, dtype=mamba2_state_dtype(self)
+        )
diff --git a/sglang/python/sglang/srt/configs/jet_vlm.py b/sglang/python/sglang/srt/configs/jet_vlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b8cba6e157bd7bf1a61afba1854724cb75d54d2
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/jet_vlm.py
@@ -0,0 +1,53 @@
+from typing import Any
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.models.siglip import SiglipVisionConfig
+
+from sglang.srt.configs.jet_nemotron import JetNemotronConfig
+from sglang.srt.configs.mamba_utils import Mamba2CacheParams
+
+
+class JetVLMConfig(PretrainedConfig):
+    model_type = "jet_vlm"
+    sub_configs = {
+        "text_config": JetNemotronConfig,
+        "vision_config": SiglipVisionConfig,
+    }
+    _auto_class = "AutoConfig"
+
+    def __init__(
+        self,
+        *,
+        text_config: dict[str, Any] | None = None,
+        vision_config: dict[str, Any] | None = None,
+        image_token_id: int | None = None,
+        video_token_id: int | None = None,
+        **kwargs,
+    ):
+        self.text_config = (
+            JetNemotronConfig(**text_config)
+            if text_config is not None
+            else JetNemotronConfig()
+        )
+        self.vision_config = (
+            SiglipVisionConfig(**vision_config)
+            if vision_config is not None
+            else SiglipVisionConfig()
+        )
+
+        self.image_token_id = image_token_id if image_token_id is not None else -1
+        self.video_token_id = video_token_id if video_token_id is not None else -1
+
+        super().__init__(**kwargs)
+
+    @property
+    def full_attention_layer_ids(self) -> list[int]:
+        return self.text_config.full_attention_layer_ids
+
+    @property
+    def linear_layer_ids(self) -> list[int]:
+        return self.text_config.linear_layer_ids
+
+    @property
+    def mamba2_cache_params(self) -> Mamba2CacheParams:
+        return self.text_config.mamba2_cache_params
diff --git a/sglang/python/sglang/srt/configs/kimi_k25.py b/sglang/python/sglang/srt/configs/kimi_k25.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ea8e7d89eef16721fdb75ce1d55279c34d358ab
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/kimi_k25.py
@@ -0,0 +1,171 @@
+"""
+Kimi K25 Model Configuration.
+"""
+
+from transformers import DeepseekV3Config
+from transformers.configuration_utils import PretrainedConfig
+
+
+class KimiK25VisionConfig(PretrainedConfig):
+    """Vision configuration for K2-VL (vision tower + mm projector).
+
+    Args:
+        Vision Tower Parameters:
+            patch_size: Patch size for vision tower.
+            init_pos_emb_height: Initial position embedding height.
+            init_pos_emb_width: Initial position embedding width.
+            init_pos_emb_time: Initial position embedding time dimension.
+            pos_emb_type: Type of position embedding.
+            num_attention_heads: Number of attention heads in vision tower.
+            num_hidden_layers: Number of hidden layers in vision tower.
+            hidden_size: Hidden size of vision tower.
+            intermediate_size: Intermediate size in vision tower FFN.
+            merge_kernel_size: Kernel size for spatial patch merging.
+            video_attn_type: Type of video attention.
+            merge_type: Type of merge operation.
+
+        MM Projector Parameters:
+            mm_projector_type: Type of multimodal projector.
+            mm_hidden_size: Hidden size for projector (defaults to hidden_size).
+            projector_hidden_act: Activation function for projector.
+            projector_ln_eps: Layer norm epsilon for projector.
+    """
+
+    model_type = "kimi_k25"
+
+    def __init__(
+        self,
+        # Vision Tower
+        patch_size: int = 14,
+        init_pos_emb_height: int = 64,
+        init_pos_emb_width: int = 64,
+        init_pos_emb_time: int = 4,
+        pos_emb_type: str = "divided_fixed",
+        num_attention_heads: int = 16,
+        num_hidden_layers: int = 27,
+        hidden_size: int = 1152,
+        intermediate_size: int = 4304,
+        merge_kernel_size: tuple[int, int] = (2, 2),
+        video_attn_type: str = "spatial_temporal",
+        merge_type: str = "sd2_tpool",
+        # MM Projector
+        mm_projector_type: str = "patchmerger",
+        mm_hidden_size: int | None = None,
+        projector_hidden_act: str = "gelu",
+        projector_ln_eps: float = 1e-5,
+        text_hidden_size: int = 7168,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        # Vision Tower
+        self.patch_size = patch_size
+        self.init_pos_emb_height = init_pos_emb_height
+        self.init_pos_emb_width = init_pos_emb_width
+        self.init_pos_emb_time = init_pos_emb_time
+        self.pos_emb_type = pos_emb_type
+        self.num_attention_heads = num_attention_heads
+        self.num_hidden_layers = num_hidden_layers
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.merge_kernel_size = merge_kernel_size
+        self.video_attn_type = video_attn_type
+        self.merge_type = merge_type
+        # MM Projector
+        self.mm_projector_type = mm_projector_type
+        if mm_hidden_size is not None:
+            self.mm_hidden_size = mm_hidden_size
+        else:
+            self.mm_hidden_size = hidden_size
+        self.projector_hidden_act = projector_hidden_act
+        self.projector_ln_eps = projector_ln_eps
+        self.text_hidden_size = text_hidden_size
+
+
+class KimiK25Config(PretrainedConfig):
+    """K2-VL model configuration.
+
+    K2-VL extends Kimi-VL with video support using video-chunks.
+    A video-chunk consists of multiple consecutive frames (default: 4)
+    that are processed together with temporal pooling.
+
+    Args:
+        text_config: Configuration for the text model (DeepseekV3).
+
+        Vision Tower Parameters:
+            patch_size: Patch size for vision tower.
+            init_pos_emb_height: Initial position embedding height.
+            init_pos_emb_width: Initial position embedding width.
+            init_pos_emb_time: Initial position embedding time dimension.
+            pos_emb_type: Type of position embedding.
+            vt_num_attention_heads: Number of attention heads in vision tower.
+            vt_num_hidden_layers: Number of hidden layers in vision tower.
+            vt_hidden_size: Hidden size of vision tower.
+            vt_intermediate_size: Intermediate size in vision tower FFN.
+            merge_kernel_size: Kernel size for spatial patch merging.
+            video_attn_type: Type of video attention.
+            merge_type: Type of merge operation.
+
+        Video-Chunk Parameters:
+            temporal_merge_kernel_size: Number of frames per video chunk.
+                Default is 4, meaning 4 frames are merged into 1 chunk.
+            sample_fps: Video sampling frame rate.
+            timestamp_mode: Format for chunk timestamps.
+
+        MM Projector Parameters:
+            mm_projector_type: Type of multimodal projector.
+            mm_hidden_size: Hidden size from vision tower.
+            projector_hidden_act: Activation function for projector.
+            projector_ln_eps: Layer norm epsilon for projector.
+
+        Other Parameters:
+            ignore_index: The ignore index for the loss function.
+            media_placeholder_token_id: The token ID for media placeholders.
+            pad_token_id: The token ID for padding.
+    """
+
+    model_type = "kimi_k25"
+
+    def __init__(
+        self,
+        text_config: dict | DeepseekV3Config | None = None,
+        vision_config: dict | KimiK25VisionConfig | None = None,
+        # Other parameters
+        ignore_index: int = -100,
+        media_placeholder_token_id: int = 163605,
+        pad_token_id: int = 0,
+        use_unified_vision_chunk: bool = False,
+        video_placeholder: str = "<|kimi_k25_video_placeholder|>",
+        **kwargs,
+    ):
+        if text_config is None:
+            text_config = DeepseekV3Config()
+        elif isinstance(text_config, dict):
+            text_config = DeepseekV3Config(**text_config)
+
+        if vision_config is None:
+            vision_config = KimiK25VisionConfig()
+        elif isinstance(vision_config, dict):
+            vision_config = KimiK25VisionConfig(**vision_config)
+        self.vision_config = vision_config
+        self.text_config = text_config
+        # Other config
+        self.ignore_index = ignore_index
+        self.media_placeholder_token_id = media_placeholder_token_id
+        self.use_unified_vision_chunk = use_unified_vision_chunk
+        self.video_placeholder = video_placeholder
+
+        # Propagate quantization config from text model
+        if getattr(self.text_config, "quantization_config", None) is not None:
+            self.quantization_config = self.text_config.quantization_config
+
+        super().__init__(pad_token_id=pad_token_id, **kwargs)
+
+    @property
+    def hidden_size(self) -> int:
+        """Get hidden size from text config for compatibility."""
+        return self.text_config.hidden_size
+
+    @property
+    def vocab_size(self) -> int:
+        """Get vocab size from text config for compatibility."""
+        return self.text_config.vocab_size
diff --git a/sglang/python/sglang/srt/configs/kimi_linear.py b/sglang/python/sglang/srt/configs/kimi_linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..e73609044a1d8cd93709ddd8f8596d95dd81ba0b
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/kimi_linear.py
@@ -0,0 +1,161 @@
+# Adapted from: https://github.com/vllm-project/vllm/blob/0384aa7150c4c9778efca041ffd1beb3ad2bd694/vllm/transformers_utils/configs/kimi_linear.py
+from transformers.configuration_utils import PretrainedConfig
+
+from sglang.srt.configs.mamba_utils import KimiLinearCacheParams, KimiLinearStateShape
+
+
+class KimiLinearConfig(PretrainedConfig):
+    model_type = "kimi_linear"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        model_type="kimi_linear",
+        vocab_size=163840,
+        hidden_size=4096,
+        head_dim=None,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        tie_word_embeddings=False,
+        moe_intermediate_size: int | None = None,
+        moe_renormalize: bool = True,
+        moe_router_activation_func: str = "sigmoid",
+        num_experts: int | None = None,
+        num_experts_per_token: int | None = None,
+        num_shared_experts: int = 0,
+        routed_scaling_factor: float = 1.0,
+        first_k_dense_replace: int = 0,
+        moe_layer_freq: int = 1,
+        use_grouped_topk: bool = True,
+        num_expert_group: int = 1,
+        topk_group: int = 1,
+        q_lora_rank: int | None = None,
+        kv_lora_rank: int | None = None,
+        qk_nope_head_dim: int | None = None,
+        qk_rope_head_dim: int | None = None,
+        v_head_dim: int | None = None,
+        mla_use_nope: bool | None = False,
+        num_nextn_predict_layers: int = 0,
+        linear_attn_config: dict | None = None,
+        **kwargs,
+    ):
+        self.model_type = model_type
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.head_dim = (
+            head_dim if head_dim is not None else hidden_size // num_attention_heads
+        )
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+
+        self.q_lora_rank = q_lora_rank
+        self.kv_lora_rank = kv_lora_rank
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.mla_use_nope = mla_use_nope
+        # moe config
+        self.n_routed_experts = self.num_experts = num_experts
+        self.num_experts_per_token = num_experts_per_token
+        self.moe_renormalize = moe_renormalize
+        self.num_shared_experts = num_shared_experts
+        self.routed_scaling_factor = routed_scaling_factor
+        self.moe_router_activation_func = moe_router_activation_func
+        assert self.moe_router_activation_func in ("softmax", "sigmoid")
+        self.moe_intermediate_size = moe_intermediate_size
+        self.first_k_dense_replace = first_k_dense_replace
+        self.moe_layer_freq = moe_layer_freq
+        self.use_grouped_topk = use_grouped_topk
+        self.num_expert_group = num_expert_group
+        self.topk_group = topk_group
+        self.num_nextn_predict_layers = num_nextn_predict_layers
+
+        if linear_attn_config is not None:
+            assert linear_attn_config["kda_layers"] is not None
+            assert linear_attn_config["full_attn_layers"] is not None
+        self.linear_attn_config = linear_attn_config
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    @property
+    def is_mla(self):
+        return (
+            self.q_lora_rank is not None
+            or self.kv_lora_rank is not None
+            or self.qk_nope_head_dim is not None
+            or self.qk_rope_head_dim is not None
+            or self.v_head_dim is not None
+            or self.mla_use_nope is True
+        )
+
+    @property
+    def is_moe(self):
+        return self.num_experts is not None
+
+    @property
+    def is_linear_attn(self) -> bool:
+        return not (
+            self.linear_attn_config is None
+            or (
+                isinstance(self.linear_attn_config, dict)
+                and self.linear_attn_config["kda_layers"] is not None
+                and len(self.linear_attn_config["kda_layers"]) == 0
+            )
+        )
+
+    def is_kda_layer(self, layer_idx: int):
+        return (
+            self.linear_attn_config is not None
+            and (layer_idx + 1) in self.linear_attn_config["kda_layers"]
+        )
+
+    @property
+    def linear_layer_ids(self):
+        return [i for i in range(self.num_hidden_layers) if self.is_kda_layer(i)]
+
+    @property
+    def full_attention_layer_ids(self):
+        return [i for i in range(self.num_hidden_layers) if not self.is_kda_layer(i)]
+
+    @property
+    def mamba2_cache_params(self) -> KimiLinearCacheParams:
+        from sglang.srt.layers.dp_attention import get_attention_tp_size
+
+        shape = KimiLinearStateShape.create(
+            tp_world_size=get_attention_tp_size(),
+            num_heads=self.linear_attn_config["num_heads"],
+            head_dim=self.linear_attn_config["head_dim"],
+            conv_kernel_size=self.linear_attn_config["short_conv_kernel_size"],
+        )
+
+        return KimiLinearCacheParams(shape=shape, layers=self.linear_layer_ids)
diff --git a/sglang/python/sglang/srt/configs/kimi_vl.py b/sglang/python/sglang/srt/configs/kimi_vl.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c7d20f5944da297f5d54311ce592ed3c83eeed5
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/kimi_vl.py
@@ -0,0 +1,38 @@
+# SPDX-License-Identifier: Apache-2.0
+# Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
+from typing import Optional, Union
+
+from transformers.configuration_utils import PretrainedConfig
+
+from sglang.srt.configs.deepseekvl2 import DeepseekV2Config
+from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
+
+
+class KimiVLConfig(PretrainedConfig):
+    model_type = "kimi_vl"
+
+    def __init__(
+        self,
+        vision_config: Optional[Union[dict, MoonViTConfig]] = None,
+        text_config: Optional[Union[dict, DeepseekV2Config]] = None,
+        ignore_index: int = -100,
+        media_placeholder_token_id: int = 163605,
+        pad_token_id: int = 0,
+        **kwargs
+    ):
+        if vision_config is None:
+            vision_config = MoonViTConfig()
+        elif isinstance(vision_config, dict):
+            vision_config = MoonViTConfig(**vision_config)
+        self.vision_config = vision_config
+
+        if text_config is None:
+            text_config = DeepseekV2Config()
+        elif isinstance(text_config, dict):
+            text_config = DeepseekV2Config(**text_config)
+        self.text_config = text_config
+
+        self.ignore_index = ignore_index
+        self.media_placeholder_token_id = media_placeholder_token_id
+
+        super().__init__(pad_token_id=pad_token_id, **kwargs)
diff --git a/sglang/python/sglang/srt/configs/kimi_vl_moonvit.py b/sglang/python/sglang/srt/configs/kimi_vl_moonvit.py
new file mode 100644
index 0000000000000000000000000000000000000000..166809eb6e9861672682f25539197facf99fb0a3
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/kimi_vl_moonvit.py
@@ -0,0 +1,32 @@
+# SPDX-License-Identifier: Apache-2.0
+# Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
+from transformers.configuration_utils import PretrainedConfig
+
+
+class MoonViTConfig(PretrainedConfig):
+    model_type = "moonvit"
+
+    def __init__(
+        self,
+        patch_size: int = 14,
+        init_pos_emb_height: int = 64,
+        init_pos_emb_width: int = 64,
+        num_attention_heads: int = 16,
+        num_hidden_layers: int = 27,
+        hidden_size: int = 1152,
+        intermediate_size: int = 4304,
+        merge_kernel_size: tuple[int, int] = (2, 2),
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.patch_size = patch_size
+        # Positional embedding config
+        self.init_pos_emb_height = init_pos_emb_height
+        self.init_pos_emb_width = init_pos_emb_width
+        # Transformer config
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        # Patch merger config
+        self.merge_kernel_size = merge_kernel_size
diff --git a/sglang/python/sglang/srt/configs/lfm2.py b/sglang/python/sglang/srt/configs/lfm2.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc74b4c23c3c19cf461d9403d1c2d3ffc0bb89d5
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/lfm2.py
@@ -0,0 +1,104 @@
+# coding=utf-8
+# Copyright 2024 Liquid AI and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""LFM2 (Liquid Foundation Model 2) configuration"""
+
+from typing import List, Optional
+
+from transformers import CONFIG_MAPPING
+from transformers import Lfm2Config as HFLfm2Config
+from transformers.utils import logging
+
+from sglang.srt.configs.mamba_utils import (
+    Mamba2CacheParams,
+    Mamba2StateShape,
+    mamba2_state_dtype,
+)
+
+logger = logging.get_logger(__name__)
+
+
+class Lfm2Config(HFLfm2Config):
+    """
+    SGLang configuration for LFM2 models.
+
+    Extends HuggingFace's Lfm2Config with hybrid model properties needed by SGLang.
+    LFM2 uses a hybrid architecture mixing full attention and ShortConv layers.
+    """
+
+    @property
+    def full_attention_layer_ids(self) -> List[int]:
+        """Return indices of attention layers for KV cache."""
+        return [i for i, lt in enumerate(self.layer_types) if lt == "full_attention"]
+
+    @property
+    def linear_layer_ids(self) -> List[int]:
+        """Return indices of conv layers for conv state cache."""
+        return [
+            i for i, lt in enumerate(self.layer_types) if lt in ("conv", "short_conv")
+        ]
+
+    @property
+    def mamba_chunk_size(self) -> int:
+        """Return chunk size for Mamba2 backend. LFM2 doesn't use chunking, return 1."""
+        return 1
+
+    @property
+    def mamba2_cache_params(self) -> Optional[Mamba2CacheParams]:
+        """
+        Get cache params for HybridReqToTokenPool initialization.
+
+        LFM2 uses ShortConv layers with a small fixed-size cache (kernel_size - 1).
+        Unlike full Mamba2 models, LFM2 only uses the conv state, not SSM temporal state.
+        """
+        from sglang.srt.layers.dp_attention import get_attention_tp_size
+
+        conv_layer_ids = self.linear_layer_ids
+        if not conv_layer_ids:
+            return None
+
+        hidden_size = self.hidden_size
+        conv_kernel = int(self.conv_L_cache)
+
+        # get_attention_tp_size() requires initialization, default to 1 if not available
+        try:
+            tp_size = get_attention_tp_size()
+        except (AssertionError, RuntimeError):
+            tp_size = 1
+
+        # For ShortConv layers, we use a simplified Mamba2StateShape
+        # LFM2 doesn't use SSM state (state_size=0), only conv state
+        # We pass num_heads=tp_size so divide(tp_size, tp_size)=1 always works.
+        # Since state_size=0, the temporal state shape has zero elements anyway.
+        shape = Mamba2StateShape.create(
+            tp_world_size=tp_size,
+            intermediate_size=hidden_size,
+            n_groups=1,  # ShortConv doesn't use grouping
+            num_heads=tp_size,  # Ensures divide works; temporal state is empty anyway
+            head_dim=hidden_size,  # Conv operates on full hidden dim
+            state_size=0,  # No SSM temporal state for ShortConv
+            conv_kernel=conv_kernel,
+        )
+
+        return Mamba2CacheParams(
+            shape=shape,
+            layers=conv_layer_ids,
+            dtype=mamba2_state_dtype(self),
+        )
+
+
+# Override HuggingFace's Lfm2Config with our extended version
+# Cannot use .register() because lfm2 is already registered by transformers
+# Directly modify the internal _extra_content dict instead
+CONFIG_MAPPING._extra_content["lfm2"] = Lfm2Config
diff --git a/sglang/python/sglang/srt/configs/lfm2_moe.py b/sglang/python/sglang/srt/configs/lfm2_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..23112ca0891404ec55204e444e00e87af0a08a91
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/lfm2_moe.py
@@ -0,0 +1,192 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""LFM2-MoE (Liquid Foundation Model 2 - Mixture of Experts) configuration
+
+Note: HF transformers has Lfm2MoeConfig in v5.0.0rc2 (unreleased).
+Once released, we could inherit from it like Lfm2Config does with HFLfm2Config.
+For now, we define a standalone config to support the model immediately.
+"""
+
+from typing import List, Optional
+
+from transformers import CONFIG_MAPPING
+from transformers.configuration_utils import PretrainedConfig
+
+from sglang.srt.configs.mamba_utils import Mamba2CacheParams, Mamba2StateShape
+
+
+class Lfm2MoeConfig(PretrainedConfig):
+    """
+    Configuration for LFM2-MoE models (e.g., LiquidAI/LFM2-8B-A1B).
+
+    LFM2-MoE is a hybrid architecture with:
+    - Attention layers and ShortConv layers (like dense LFM2)
+    - MoE (Mixture of Experts) FFN layers with sigmoid routing
+
+    Key MoE specifics:
+    - First `num_dense_layers` use dense MLP, rest use MoE
+    - Sigmoid routing (not softmax) with expert_bias for load balancing
+    - expert_bias is fp32 for numerical stability
+    """
+
+    model_type = "lfm2_moe"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size: int = 65536,
+        hidden_size: int = 2048,
+        intermediate_size: int = 7168,
+        moe_intermediate_size: int = 1792,
+        num_hidden_layers: int = 32,
+        num_attention_heads: int = 32,
+        num_key_value_heads: int = 8,
+        max_position_embeddings: int = 128000,
+        initializer_range: float = 0.02,
+        norm_eps: float = 1e-5,
+        use_cache: bool = True,
+        pad_token_id: int = 0,
+        bos_token_id: int = 1,
+        eos_token_id: int = 2,
+        tie_word_embeddings: bool = True,
+        rope_parameters: Optional[dict] = None,
+        conv_bias: bool = False,
+        conv_L_cache: int = 3,
+        # MoE-specific parameters
+        num_dense_layers: int = 2,
+        num_experts: int = 32,
+        num_experts_per_tok: int = 4,
+        use_expert_bias: bool = True,
+        routed_scaling_factor: float = 1.0,
+        norm_topk_prob: bool = True,
+        # Layer types
+        layer_types: Optional[List[str]] = None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.moe_intermediate_size = moe_intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.norm_eps = norm_eps
+        self.use_cache = use_cache
+
+        # Conv parameters
+        self.conv_bias = conv_bias
+        self.conv_L_cache = conv_L_cache
+
+        # MoE parameters
+        self.num_dense_layers = num_dense_layers
+        self.num_experts = num_experts
+        self.num_experts_per_tok = num_experts_per_tok
+        self.use_expert_bias = use_expert_bias
+        self.routed_scaling_factor = routed_scaling_factor
+        self.norm_topk_prob = norm_topk_prob
+
+        # Layer types (attention vs conv)
+        self.layer_types = layer_types
+
+        # RoPE parameters
+        self.rope_parameters = rope_parameters
+
+        # Validate layer_types length matches num_hidden_layers
+        if layer_types is not None and len(layer_types) != num_hidden_layers:
+            raise ValueError(
+                f"layer_types length ({len(layer_types)}) must match "
+                f"num_hidden_layers ({num_hidden_layers})"
+            )
+
+        # Handle tie_embedding alias from original config
+        tie_word_embeddings = kwargs.pop("tie_embedding", tie_word_embeddings)
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    @property
+    def full_attention_layer_ids(self) -> List[int]:
+        """Return indices of attention layers for KV cache."""
+        if self.layer_types is None:
+            return []
+        return [i for i, lt in enumerate(self.layer_types) if lt == "full_attention"]
+
+    @property
+    def linear_layer_ids(self) -> List[int]:
+        """Return indices of conv layers for conv state cache."""
+        if self.layer_types is None:
+            return []
+        return [
+            i for i, lt in enumerate(self.layer_types) if lt in ("conv", "short_conv")
+        ]
+
+    @property
+    def mamba_chunk_size(self) -> int:
+        """Return chunk size for Mamba2 backend. LFM2 doesn't use chunking."""
+        return 1
+
+    @property
+    def mamba2_cache_params(self) -> Optional[Mamba2CacheParams]:
+        """
+        Get cache params for HybridReqToTokenPool initialization.
+
+        LFM2-MoE uses ShortConv layers with a small fixed-size cache.
+        """
+        from sglang.srt.layers.dp_attention import get_attention_tp_size
+
+        conv_layer_ids = self.linear_layer_ids
+        if not conv_layer_ids:
+            return None
+
+        hidden_size = self.hidden_size
+        # conv_L_cache in config is kernel_size (e.g., 3)
+        conv_kernel = int(self.conv_L_cache)
+        # actual cache size is kernel_size - 1 (e.g., 2 for kernel=3)
+
+        try:
+            tp_size = get_attention_tp_size()
+        except (AssertionError, RuntimeError):
+            tp_size = 1
+
+        shape = Mamba2StateShape.create(
+            tp_world_size=tp_size,
+            intermediate_size=hidden_size,
+            n_groups=1,
+            num_heads=tp_size,  # Ensures divide works; temporal state is empty anyway
+            head_dim=hidden_size,
+            state_size=0,
+            conv_kernel=conv_kernel,
+        )
+
+        # Uses default mamba2_state_dtype() which reads SGLANG_MAMBA_CONV_DTYPE env var
+        # (defaults to bfloat16). Set SGLANG_MAMBA_CONV_DTYPE=float16 for fp16 inference.
+        return Mamba2CacheParams(
+            shape=shape,
+            layers=conv_layer_ids,
+        )
+
+
+# Register with transformers CONFIG_MAPPING so AutoConfig.from_pretrained()
+# can instantiate our config class when loading models with model_type="lfm2_moe"
+try:
+    CONFIG_MAPPING.register("lfm2_moe", Lfm2MoeConfig)
+except Exception:
+    # Already registered or registration failed - use direct assignment
+    CONFIG_MAPPING._extra_content["lfm2_moe"] = Lfm2MoeConfig
diff --git a/sglang/python/sglang/srt/configs/load_config.py b/sglang/python/sglang/srt/configs/load_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..ddf8d2967ce64f3a99b44594672c226e306cc0a0
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/load_config.py
@@ -0,0 +1,136 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
+import enum
+import logging
+from dataclasses import dataclass, field
+from typing import Any, List, Optional, Union
+
+import orjson
+
+from sglang.srt.configs.modelopt_config import ModelOptConfig
+from sglang.srt.utils import is_hip
+
+logger = logging.getLogger(__name__)
+
+
+class LoadFormat(str, enum.Enum):
+    AUTO = "auto"
+    PT = "pt"
+    SAFETENSORS = "safetensors"
+    NPCACHE = "npcache"
+    DUMMY = "dummy"
+    SHARDED_STATE = "sharded_state"
+    GGUF = "gguf"
+    BITSANDBYTES = "bitsandbytes"
+    MISTRAL = "mistral"
+    LAYERED = "layered"
+    FLASH_RL = "flash_rl"  # For RL training with quantized models
+    JAX = "jax"
+    REMOTE = "remote"
+    REMOTE_INSTANCE = "remote_instance"
+    RDMA = "rdma"
+    LOCAL_CACHED = "local_cached"
+    FASTSAFETENSORS = "fastsafetensors"
+    PRIVATE = "private"
+
+
+@dataclass
+class LoadConfig:
+    """
+    download_dir: Directory to download and load the weights, default to the
+        default cache directory of huggingface.
+    load_format: The format of the model weights to load:
+        "auto" will try to load the weights in the safetensors format and
+            fall back to the pytorch bin format if safetensors format is
+            not available.
+        "pt" will load the weights in the pytorch bin format.
+        "safetensors" will load the weights in the safetensors format.
+        "npcache" will load the weights in pytorch format and store
+            a numpy cache to speed up the loading.
+        "dummy" will initialize the weights with random values, which is
+            mainly for profiling.
+        "bitsandbytes" will load nf4 type weights.
+        "flash_rl" will load weights with support for RL training
+            with quantized models, enabling efficient weight reloading.
+    ignore_patterns: The list of patterns to ignore when loading the model.
+        Default to "original/**/*" to avoid repeated loading of llama's
+        checkpoints.
+    decryption_key_file: If set, decrypts the output files with a password read
+        from this file (after PBKDF2).
+    decrypt_max_concurrency: The maximum number of concurrent processes to decrypt the safetensor files. -1 means no limit.
+
+    # ModelOpt-specific loading options
+    modelopt_checkpoint_restore_path: Optional[str] = None
+    modelopt_checkpoint_save_path: Optional[str] = None
+    modelopt_export_path: Optional[str] = None
+    """
+
+    load_format: Union[str, LoadFormat] = LoadFormat.AUTO
+    download_dir: Optional[str] = None
+    model_loader_extra_config: Optional[Union[str, dict]] = field(default_factory=dict)
+    ignore_patterns: Optional[Union[List[str], str]] = None
+    decryption_key_file: Optional[str] = None
+    decrypt_max_concurrency: int = -1
+    tp_rank: Optional[int] = None
+    remote_instance_weight_loader_seed_instance_ip: Optional[str] = None
+    remote_instance_weight_loader_seed_instance_service_port: Optional[int] = None
+    remote_instance_weight_loader_send_weights_group_ports: Optional[List[int]] = None
+    remote_instance_weight_loader_backend: Optional[str] = None
+    remote_instance_weight_loader_transfer_engine: Optional[Any] = None
+
+    # ModelOpt-specific loading options
+    modelopt_checkpoint_restore_path: Optional[str] = None
+    modelopt_checkpoint_save_path: Optional[str] = None
+    modelopt_export_path: Optional[str] = None
+
+    # ModelOpt configuration object
+    modelopt_config: Optional[ModelOptConfig] = None
+
+    # QuantizedRL-specific options (for FlashRL-style quantization)
+    rl_quant_profile: Optional[str] = (
+        None  # Path to rollout quantization profile (e.g., /root/profile.7b.pt)
+    )
+
+    # For multi-layer MTP
+    draft_model_idx: Optional[int] = None
+
+    def __post_init__(self):
+        model_loader_extra_config = self.model_loader_extra_config or {}
+        if isinstance(model_loader_extra_config, str):
+            self.model_loader_extra_config = orjson.loads(model_loader_extra_config)
+        self._verify_load_format()
+
+        if self.ignore_patterns is not None and len(self.ignore_patterns) > 0:
+            logger.info(
+                "Ignoring the following patterns when downloading weights: %s",
+                self.ignore_patterns,
+            )
+        else:
+            self.ignore_patterns = ["original/**/*"]
+
+        # Create ModelOptConfig if not provided
+        if self.modelopt_config is None:
+            self.modelopt_config = ModelOptConfig(
+                checkpoint_restore_path=self.modelopt_checkpoint_restore_path,
+                checkpoint_save_path=self.modelopt_checkpoint_save_path,
+                export_path=self.modelopt_export_path,
+            )
+
+    def _verify_load_format(self) -> None:
+        if not isinstance(self.load_format, str):
+            return
+
+        load_format = self.load_format.lower()
+        self.load_format = LoadFormat(load_format)
+
+        rocm_not_supported_load_format: List[str] = []
+        if is_hip() and load_format in rocm_not_supported_load_format:
+            rocm_supported_load_format = [
+                f
+                for f in LoadFormat.__members__
+                if (f not in rocm_not_supported_load_format)
+            ]
+            raise ValueError(
+                f"load format '{load_format}' is not supported in ROCm. "
+                f"Supported load formats are "
+                f"{rocm_supported_load_format}"
+            )
diff --git a/sglang/python/sglang/srt/configs/longcat_flash.py b/sglang/python/sglang/srt/configs/longcat_flash.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6a2dfb026cac002c0bf95aa07761393a7930ad4
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/longcat_flash.py
@@ -0,0 +1,104 @@
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+FLASH_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+
+
+class LongcatFlashConfig(PretrainedConfig):
+    model_type = "longcat_flash"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=131072,
+        hidden_size=6144,
+        intermediate_size=None,
+        ffn_hidden_size=12288,
+        expert_ffn_hidden_size=2048,
+        num_layers=28,
+        num_hidden_layers=None,
+        num_attention_heads=64,
+        ep_size=1,
+        kv_lora_rank=512,
+        q_lora_rank=1536,
+        qk_rope_head_dim=128,
+        qk_nope_head_dim=128,
+        v_head_dim=128,
+        n_routed_experts=512,
+        moe_topk=12,
+        norm_topk_prob=False,
+        max_position_embeddings=131072,
+        rms_norm_eps=1e-05,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        pretraining_tp=1,
+        tie_word_embeddings=False,
+        rope_theta=10000000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        mla_scale_q_lora=True,
+        mla_scale_kv_lora=True,
+        torch_dtype="bfloat16",
+        params_dtype="bfloat16",
+        rounter_params_dtype="float32",
+        router_bias=False,
+        topk_method=None,
+        routed_scaling_factor=6.0,
+        zero_expert_num=256,
+        zero_expert_type="identity",
+        nextn_use_scmoe=False,
+        num_nextn_predict_layers=1,
+        **kwargs,
+    ):
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            torch_dtype=torch_dtype,
+            params_dtype=params_dtype,
+            rounter_params_dtype=rounter_params_dtype,
+            topk_method=topk_method,
+            router_bias=router_bias,
+            nextn_use_scmoe=nextn_use_scmoe,
+            num_nextn_predict_layers=num_nextn_predict_layers,
+            **kwargs,
+        )
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = (
+            num_hidden_layers if num_hidden_layers is not None else num_layers
+        )
+        self.intermediate_size = (
+            intermediate_size if intermediate_size is not None else ffn_hidden_size
+        )
+        self.moe_intermediate_size = expert_ffn_hidden_size
+        self.num_attention_heads = num_attention_heads
+        self.ep_size = ep_size
+        self.kv_lora_rank = kv_lora_rank
+        self.q_lora_rank = q_lora_rank
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.n_routed_experts = n_routed_experts
+        self.moe_topk = moe_topk
+        self.norm_topk_prob = norm_topk_prob
+        self.rms_norm_eps = rms_norm_eps
+        self.pretraining_tp = pretraining_tp
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.mla_scale_q_lora = mla_scale_q_lora
+        self.mla_scale_kv_lora = mla_scale_kv_lora
+        self.zero_expert_num = zero_expert_num
+        self.zero_expert_type = zero_expert_type
+        self.routed_scaling_factor = routed_scaling_factor
+        self.hidden_act = "silu"
diff --git a/sglang/python/sglang/srt/configs/mamba_utils.py b/sglang/python/sglang/srt/configs/mamba_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..96b2bca68ce8b582bd75027153492062096e2f95
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/mamba_utils.py
@@ -0,0 +1,239 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Common config utils for mamba2 - NemotronH, FalconH1, Qwen3Next, LFM2, etc."""
+
+import logging
+from abc import ABC
+from dataclasses import dataclass, field
+from typing import List, Optional
+
+import numpy as np
+import torch
+
+from sglang.srt.distributed.utils import divide
+from sglang.srt.environ import envs
+
+logger = logging.getLogger(__name__)
+
+
+def extra_groups_for_head_shards(ngroups: int, tp_size: int):
+    """Compute the increase in group numbers to account for
+    replication in order to accompany the head shards."""
+
+    # in the case ngoups % tp_size == 0, this will be zero
+    if ngroups % tp_size == 0:
+        return 0
+
+    # for n_groups == 1, this is exactly tp_size - n_groups
+    return tp_size - ngroups
+
+
+@dataclass(kw_only=True, frozen=True)
+class Mamba2StateDType:
+    conv: torch.dtype
+    temporal: torch.dtype
+
+
+def mamba2_state_dtype(config=None) -> Mamba2StateDType:
+    """
+    Get mamba2 state dtype from config or environment variable.
+
+    Priority (from highest to lowest):
+    1. Environment variable SGLANG_MAMBA_SSM_DTYPE
+    2. Config file (config.mamba_ssm_dtype or config.text_config.mamba_ssm_dtype)
+    3. Default "float32"
+
+    Args:
+        config: Optional config object (PretrainedConfig). If provided, will read
+                mamba_ssm_dtype from it. For VL models, reads from text_config.
+
+    Returns:
+        Mamba2StateDType with conv and temporal dtypes
+    """
+    dtype_map = {
+        "float32": torch.float32,
+        "bfloat16": torch.bfloat16,
+        "float16": torch.float16,
+    }
+    conv_dtype = dtype_map.get(envs.SGLANG_MAMBA_CONV_DTYPE.get(), torch.bfloat16)
+
+    # Get SSM dtype: default -> config -> env var
+    ssm_dtype = torch.float32  # Step 1: Default value
+
+    # Step 2: Try to read from config
+    if config is not None:
+        config_dtype = None
+        if hasattr(config, "text_config") and hasattr(
+            config.text_config, "mamba_ssm_dtype"
+        ):
+            # VL model: read from text_config
+            config_dtype = config.text_config.mamba_ssm_dtype
+        elif hasattr(config, "mamba_ssm_dtype"):
+            # Text model: read from root config
+            config_dtype = config.mamba_ssm_dtype
+
+        if config_dtype is not None:
+            if config_dtype not in dtype_map:
+                logger.warning(
+                    f"Invalid mamba_ssm_dtype '{config_dtype}' in config. "
+                    f"Must be one of {list(dtype_map.keys())}. Using default 'float32'."
+                )
+            else:
+                ssm_dtype = dtype_map[config_dtype]
+
+    # Step 3: Check environment variable, if not None, override
+    env_ssm_dtype = envs.SGLANG_MAMBA_SSM_DTYPE.get()
+    if env_ssm_dtype is not None:
+        if env_ssm_dtype not in dtype_map:
+            logger.warning(
+                f"Invalid mamba_ssm_dtype '{env_ssm_dtype}' from environment variable. "
+                f"Must be one of {list(dtype_map.keys())}. Using default 'float32'."
+            )
+        else:
+            ssm_dtype = dtype_map[env_ssm_dtype]
+
+    logger.debug(f"Mamba2 state dtype: conv_dtype={conv_dtype}, ssm_dtype={ssm_dtype}")
+
+    return Mamba2StateDType(conv=conv_dtype, temporal=ssm_dtype)
+
+
+@dataclass(kw_only=True, frozen=True)
+class BaseLinearStateParams(ABC):
+    dtype: Mamba2StateDType = field(default_factory=lambda: mamba2_state_dtype(None))
+    layers: list[int]
+
+    @property
+    def mamba_cache_per_req(self) -> int:
+        conv_numel = int(
+            np.sum([np.prod(conv_shape) for conv_shape in self.shape.conv])
+        )
+
+        ssm_numel = int(np.prod(self.shape.temporal))
+        return (
+            conv_numel * self.dtype.conv.itemsize
+            + ssm_numel * self.dtype.temporal.itemsize
+        ) * len(self.layers)
+
+
+@dataclass(kw_only=True, frozen=True)
+class Mamba2StateShape:
+    conv: list[tuple[int, int]]
+    temporal: tuple[int, int, int]
+
+    intermediate_size: int
+    conv_dim: int
+    ssm_state_size: int
+    num_heads: int
+    head_dim: int
+    state_size: int
+    conv_kernel: int
+
+    @staticmethod
+    def create(
+        *,
+        tp_world_size: int,
+        intermediate_size: int,
+        n_groups: int,
+        num_heads: int,
+        head_dim: int,
+        state_size: int,
+        conv_kernel: int,
+    ) -> "Mamba2StateShape":
+        # if n_groups is not divisible by world_size, need to extend the shards
+        # to ensure all groups needed by a head is sharded along with it
+        if n_groups % tp_world_size != 0:
+            extra_groups = extra_groups_for_head_shards(n_groups, tp_world_size)
+            n_groups += extra_groups
+        # heads and n_groups are TP-ed
+        conv_dim = intermediate_size + 2 * n_groups * state_size
+
+        # contiguous along 'dim' axis
+        conv_state_shape = divide(conv_dim, tp_world_size), conv_kernel - 1
+
+        # These are not TP-ed as they depend on A, dt_bias, D
+        # - they are typically small
+        #   e.g., QWen3-Next: (32, 128, 128)
+        temporal_state_shape = (divide(num_heads, tp_world_size), head_dim, state_size)
+        return Mamba2StateShape(
+            conv=[conv_state_shape],
+            temporal=temporal_state_shape,
+            intermediate_size=intermediate_size,
+            conv_dim=conv_dim,
+            ssm_state_size=state_size,
+            num_heads=num_heads,
+            head_dim=head_dim,
+            state_size=state_size,
+            conv_kernel=conv_kernel,
+        )
+
+
+@dataclass(kw_only=True, frozen=True)
+class Mamba2CacheParams(BaseLinearStateParams):
+    shape: Mamba2StateShape
+
+
+@dataclass(kw_only=True, frozen=True)
+class KimiLinearStateShape:
+    conv: List[tuple[int, int]]
+    temporal: tuple[int, int, int]
+
+    num_heads: int
+    head_dim: int
+    num_k_heads: int
+    head_k_dim: int
+    conv_kernel: int
+    num_spec: int
+
+    @staticmethod
+    def create(
+        *,
+        tp_world_size: int,
+        num_heads: int,
+        head_dim: int,
+        num_k_heads: Optional[int] = None,
+        head_k_dim: Optional[int] = None,
+        conv_kernel_size: int = 4,
+        num_spec: int = 0,
+    ) -> "KimiLinearStateShape":
+        if num_k_heads is None:
+            num_k_heads = num_heads
+        if head_k_dim is None:
+            head_k_dim = head_dim
+
+        proj_size = num_heads * head_dim
+        proj_k_size = num_k_heads * head_k_dim
+
+        conv_state_shape = (divide(proj_size, tp_world_size), conv_kernel_size - 1)
+        conv_state_k_shape = (divide(proj_k_size, tp_world_size), conv_kernel_size - 1)
+        temporal_state_shape = (divide(num_heads, tp_world_size), head_dim, head_dim)
+
+        conv_state_shape = (
+            conv_state_shape[1],
+            conv_state_shape[0] + conv_state_k_shape[0] * 2,
+        )
+
+        return KimiLinearStateShape(
+            conv=[conv_state_shape],
+            temporal=temporal_state_shape,
+            num_heads=num_heads,
+            head_dim=head_dim,
+            num_k_heads=num_k_heads,
+            head_k_dim=head_k_dim,
+            conv_kernel=conv_kernel_size,
+            num_spec=num_spec,
+        )
+
+
+@dataclass(kw_only=True, frozen=True)
+class KimiLinearCacheParams(BaseLinearStateParams):
+    shape: KimiLinearStateShape
diff --git a/sglang/python/sglang/srt/configs/model_config.py b/sglang/python/sglang/srt/configs/model_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..41ad51ce004f096f17bb6139728aa3fb54d3ff4a
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/model_config.py
@@ -0,0 +1,1462 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import json
+import logging
+import math
+import os
+from enum import Enum, IntEnum, auto
+from pathlib import Path
+from typing import Any, List, Optional, Set, Union
+
+import torch
+from transformers import PretrainedConfig
+
+from sglang.srt.environ import envs
+from sglang.srt.layers.quantization import QUANTIZATION_METHODS
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import is_hip, is_sm100_supported, retry
+from sglang.srt.utils.hf_transformers_utils import (
+    get_config,
+    get_context_length,
+    get_generation_config,
+    get_hf_text_config,
+    get_sparse_attention_config,
+)
+from sglang.utils import is_in_ci
+
+logger = logging.getLogger(__name__)
+
+
+class AttentionArch(IntEnum):
+    MLA = auto()
+    MHA = auto()
+
+
+class ModelImpl(str, Enum):
+    AUTO = "auto"
+    SGLANG = "sglang"
+    TRANSFORMERS = "transformers"
+    MINDSPORE = "mindspore"
+
+
+def is_deepseek_nsa(config: PretrainedConfig) -> bool:
+    return (
+        config.architectures is not None
+        and config.architectures[0]
+        in [
+            "DeepseekV3ForCausalLM",
+            "DeepseekV32ForCausalLM",
+            "DeepseekV3ForCausalLMNextN",
+            "MistralLarge3ForCausalLM",
+            "PixtralForConditionalGeneration",
+            "GlmMoeDsaForCausalLM",
+        ]
+        and getattr(config, "index_topk", None) is not None
+    )
+
+
+def get_nsa_index_head_dim(config: PretrainedConfig) -> int:
+    assert is_deepseek_nsa(config)
+    return config.index_head_dim
+
+
+def get_nsa_index_topk(config: PretrainedConfig) -> int:
+    assert is_deepseek_nsa(config)
+    return config.index_topk
+
+
+def get_nsa_index_n_heads(config: PretrainedConfig) -> int:
+    assert is_deepseek_nsa(config)
+    return config.index_n_heads
+
+
+class ModelConfig:
+    def __init__(
+        self,
+        model_path: str,
+        trust_remote_code: bool = True,
+        revision: Optional[str] = None,
+        context_length: Optional[int] = None,
+        model_override_args: str = "{}",
+        is_embedding: Optional[bool] = None,
+        enable_multimodal: Optional[bool] = None,
+        dtype: str = "auto",
+        quantization: Optional[str] = None,
+        override_config_file: Optional[str] = None,
+        is_draft_model: bool = False,
+        model_impl: Union[str, ModelImpl] = ModelImpl.AUTO,
+        sampling_defaults: str = "openai",
+        quantize_and_serve: bool = False,
+        is_multi_layer_eagle: bool = False,
+        encoder_only: bool = False,
+        language_only: bool = False,
+        disable_hybrid_swa_memory: bool = False,
+    ) -> None:
+        # Parse args
+        self.model_path = model_path
+        self.revision = revision
+        self.quantization = quantization
+        self.is_draft_model = is_draft_model
+        self.model_impl = model_impl
+        self.sampling_defaults = sampling_defaults
+        self.quantize_and_serve = quantize_and_serve
+        self.is_multi_layer_eagle = is_multi_layer_eagle
+        self.disable_hybrid_swa_memory = disable_hybrid_swa_memory
+
+        # Validate quantize_and_serve configuration
+        self._validate_quantize_and_serve_config()
+
+        # Get hf config
+        self._maybe_pull_model_tokenizer_from_remote()
+        self.model_override_args = json.loads(model_override_args)
+        kwargs = {}
+        if override_config_file and override_config_file.strip():
+            kwargs["_configuration_file"] = override_config_file.strip()
+        self.hf_config = get_config(
+            self.model_path,
+            trust_remote_code=trust_remote_code,
+            revision=revision,
+            model_override_args=self.model_override_args,
+            **kwargs,
+        )
+        self.hf_text_config = get_hf_text_config(self.hf_config)
+        self.hf_generation_config = get_generation_config(
+            self.model_path,
+            trust_remote_code=trust_remote_code,
+            revision=revision,
+            **kwargs,
+        )
+
+        # Set enable_multimodal
+        if enable_multimodal is None:
+            mm_disabled_models = [
+                "Gemma3ForConditionalGeneration",
+                "Llama4ForConditionalGeneration",
+                "Step3VLForConditionalGeneration",
+            ]
+            if self.hf_config.architectures[0] in mm_disabled_models:
+                enable_multimodal = False
+                logger.info(
+                    f"Multimodal is disabled for {self.hf_config.model_type}. To enable it, set --enable-multimodal."
+                )
+            else:
+                enable_multimodal = True
+
+        # Config draft model
+        self._config_draft_model()
+
+        # Check model type
+        self.attention_chunk_size = getattr(
+            self.hf_text_config, "attention_chunk_size", None
+        )
+        self.sliding_window_size = self._get_sliding_window_size()
+        self.is_generation = is_generation_model(
+            self.hf_config.architectures, is_embedding
+        )
+        self.is_multimodal = enable_multimodal and is_multimodal_model(
+            self.hf_config.architectures
+        )
+        self.is_multimodal_gen = enable_multimodal and is_multimodal_gen_model(
+            self.hf_config.architectures
+        )
+        self.is_image_gen = enable_multimodal and is_image_gen_model(
+            self.hf_config.architectures
+        )
+        self.is_audio_model = enable_multimodal and is_audio_model(
+            self.hf_config.architectures
+        )
+        # TODO: requires further polishing
+        self.is_image_understandable_model = enable_multimodal and hasattr(
+            self.hf_config, "vision_config"
+        )
+        self.is_audio_understandable_model = enable_multimodal and hasattr(
+            self.hf_config, "audio_config"
+        )
+
+        self.is_multimodal_chunked_prefill_supported = (
+            enable_multimodal
+            and is_multimodal_chunked_prefill_supported(self.hf_config.architectures)
+        )
+        self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
+        self.is_local_attention_model = is_local_attention_model(
+            self.hf_config.architectures
+        )
+        self.is_piecewise_cuda_graph_disabled_model = (
+            is_piecewise_cuda_graph_disabled_model(self.hf_config.architectures)
+            or is_deepseek_nsa(self.hf_text_config)
+        )
+        self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
+
+        # Derive context length and model shapes
+        self._derive_context_length(context_length)
+        self._derive_model_shapes()
+
+        # Update hybrid model
+        self._derive_hybrid_model()
+
+        # Verify quantization
+        self._verify_quantization()
+
+        self._verify_transformers_version()
+
+        # Verify dual-chunk attention config
+        self._verify_dual_chunk_attention_config()
+
+        # Cache attributes
+        self.hf_eos_token_id = self._get_hf_eos_token_id()
+
+        # multimodal
+        self.image_token_id = getattr(
+            self.hf_config, "image_token_id", None
+        ) or getattr(self.hf_config, "image_token_index", None)
+
+        self.hf_config.encoder_only = encoder_only
+        self.hf_config.language_only = language_only
+
+        # matryoshka embeddings
+        self.matryoshka_dimensions = getattr(
+            self.hf_config, "matryoshka_dimensions", None
+        )
+        self.is_matryoshka = self.matryoshka_dimensions or getattr(
+            self.hf_config, "is_matryoshka", False
+        )
+
+    @staticmethod
+    def from_server_args(
+        server_args: ServerArgs,
+        model_path: str = None,
+        model_revision: str = None,
+        is_draft_model: bool = False,
+        **kwargs,
+    ):
+        quantization = (
+            server_args.speculative_draft_model_quantization
+            if is_draft_model
+            else server_args.quantization
+        )
+        override_config_file = (
+            server_args.decrypted_draft_config_file
+            if is_draft_model
+            else server_args.decrypted_config_file
+        )
+        return ModelConfig(
+            model_path=model_path or server_args.model_path,
+            trust_remote_code=server_args.trust_remote_code,
+            revision=model_revision or server_args.revision,
+            context_length=server_args.context_length,
+            model_override_args=server_args.json_model_override_args,
+            is_embedding=server_args.is_embedding,
+            enable_multimodal=server_args.enable_multimodal,
+            dtype=server_args.dtype,
+            quantization=quantization,
+            model_impl=server_args.model_impl,
+            sampling_defaults=server_args.sampling_defaults,
+            quantize_and_serve=server_args.quantize_and_serve,
+            override_config_file=override_config_file,
+            is_multi_layer_eagle=server_args.enable_multi_layer_eagle,
+            language_only=server_args.language_only,
+            encoder_only=server_args.encoder_only,
+            is_draft_model=is_draft_model,
+            disable_hybrid_swa_memory=server_args.disable_hybrid_swa_memory,
+            **kwargs,
+        )
+
+    def _config_draft_model(self):
+        is_draft_model = self.is_draft_model
+
+        if is_draft_model and self.hf_config.architectures[0] in [
+            "DeepseekV3ForCausalLM",
+            "GlmMoeDsaForCausalLM",
+        ]:
+            self.hf_config.architectures[0] = "DeepseekV3ForCausalLMNextN"
+
+        if is_draft_model and self.hf_config.architectures[0] in [
+            "Glm4MoeForCausalLM",
+            "Glm4MoeLiteForCausalLM",
+        ]:
+            self.hf_config.architectures[0] = "Glm4MoeForCausalLMNextN"
+
+        if is_draft_model and self.hf_config.architectures[0] in [
+            "GlmOcrForConditionalGeneration",
+        ]:
+            self.hf_config.architectures[0] = "GlmOcrForConditionalGenerationNextN"
+
+        if (
+            is_draft_model
+            and self.hf_config.architectures[0] == "LongcatFlashForCausalLM"
+        ):
+            self.hf_config.architectures[0] = "LongcatFlashForCausalLMNextN"
+            self.hf_config.num_hidden_layers = self.hf_config.num_nextn_predict_layers
+
+        if is_draft_model and self.hf_config.architectures[0] == "MiMoForCausalLM":
+            self.hf_config.architectures[0] = "MiMoMTP"
+        if (
+            is_draft_model
+            and self.hf_config.architectures[0] == "MiMoV2FlashForCausalLM"
+        ):
+            self.hf_config.architectures[0] = "MiMoV2MTP"
+        if is_draft_model and self.hf_config.architectures[0] == "Step3p5ForCausalLM":
+            self.hf_config.architectures[0] = "Step3p5MTP"
+        if is_draft_model and self.hf_config.architectures[0] in [
+            "BailingMoeV2ForCausalLM",
+            "BailingMoeForCausalLM",
+            "BailingMoeV2_5ForCausalLM",
+        ]:
+            self.hf_config.architectures[0] = "BailingMoeForCausalLMNextN"
+        if (
+            is_draft_model
+            and self.hf_config.architectures[0] == "Ernie4_5_MoeForCausalLM"
+        ):
+            self.hf_config.architectures[0] = "Ernie4_5_MoeForCausalLMMTP"
+
+        if is_draft_model and self.hf_config.architectures[0] == "Qwen3NextForCausalLM":
+            self.hf_config.architectures[0] = "Qwen3NextForCausalLMMTP"
+            self.hf_config.num_nextn_predict_layers = 1
+
+        if is_draft_model and self.hf_config.architectures[0] in [
+            "Qwen3_5ForConditionalGeneration",
+            "Qwen3_5MoeForConditionalGeneration",
+        ]:
+            self.hf_config.architectures[0] = "Qwen3_5ForCausalLMMTP"
+            self.hf_config.num_nextn_predict_layers = 1
+
+        if is_draft_model and self.hf_config.architectures[0] == "ExaoneMoEForCausalLM":
+            self.hf_config.architectures[0] = "ExaoneMoEForCausalLMMTP"
+            self.hf_config.num_nextn_predict_layers = 1
+
+        if is_draft_model and self.hf_config.architectures[0] == "NemotronHForCausalLM":
+            self.hf_config.architectures[0] = "NemotronHForCausalLMMTP"
+            self.hf_config.num_nextn_predict_layers = 1
+
+    def _derive_hybrid_model(self):
+        # Use self.context_len after it has been initialized to prevent using context_len which may be None.
+        self.is_hybrid_swa = (
+            is_hybrid_swa_model(self.hf_config.architectures)
+            and not self.disable_hybrid_swa_memory
+        )
+
+        if self.is_hybrid_swa:
+            self.swa_attention_layer_ids, self.full_attention_layer_ids = (
+                get_hybrid_layer_ids(
+                    self.hf_config.architectures,
+                    self.hf_text_config,
+                )
+            )
+
+        self.is_hybrid_swa_compress = self.hf_config.architectures[0] in [
+            "MiMoV2FlashForCausalLM",
+            "MiMoV2MTP",
+        ]
+
+    def _derive_context_length(self, context_length: int):
+        is_draft_model = self.is_draft_model
+        derived_context_len = get_context_length(self.hf_text_config)
+
+        if context_length is not None:
+            if context_length > derived_context_len:
+                reason = "Target model's" if is_draft_model else "User-specified"
+                msg = (
+                    f"Warning: {reason} context_length ({context_length}) is greater than the derived context_length ({derived_context_len}). "
+                    f"This may lead to incorrect model outputs or CUDA errors. Note that the derived context_length may differ from max_position_embeddings in the model's config."
+                )
+                if (
+                    envs.SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN.get()
+                    or is_in_ci()  # FIXME: fix this special case
+                ):
+                    logger.warning(msg)
+                    self.context_len = context_length
+                    if is_draft_model:
+                        self.hf_text_config.max_position_embeddings = context_length
+                        logger.warning(
+                            f"Overriding the draft model's max_position_embeddings to {context_length}."
+                        )
+                else:
+                    raise ValueError(
+                        f"{msg} To allow overriding this maximum, set the env var SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1"
+                    )
+            else:
+                self.context_len = context_length
+        else:
+            self.context_len = derived_context_len
+
+        # Transfer context_len to HuggingFace config so models can access it
+        self.hf_config.context_len = self.context_len
+
+    def _derive_model_shapes(self):
+        # Unify the config keys for hf_text_config
+        self.head_dim = getattr(
+            self.hf_text_config,
+            "head_dim",
+            self.hf_text_config.hidden_size // self.hf_text_config.num_attention_heads,
+        )
+        self.v_head_dim = getattr(
+            self.hf_text_config,
+            "v_head_dim",
+            self.head_dim,
+        )
+
+        self.swa_head_dim = getattr(
+            self.hf_text_config,
+            "swa_head_dim",
+            self.head_dim,
+        )
+        self.swa_v_head_dim = getattr(
+            self.hf_text_config,
+            "swa_v_head_dim",
+            self.v_head_dim,
+        )
+        # FIXME: temporary special judge for MLA architecture
+        if (
+            "DeepseekV2ForCausalLM" in self.hf_config.architectures
+            or "DeepseekV32ForCausalLM" in self.hf_config.architectures
+            or "DeepseekV3ForCausalLM" in self.hf_config.architectures
+            or "DeepseekV3ForCausalLMNextN" in self.hf_config.architectures
+            or "Glm4MoeLiteForCausalLM" in self.hf_config.architectures
+            or "GlmMoeDsaForCausalLM" in self.hf_config.architectures
+            or "LongcatFlashForCausalLM" in self.hf_config.architectures
+            or "LongcatFlashForCausalLMNextN" in self.hf_config.architectures
+            or "DotsVLMForCausalLM" in self.hf_config.architectures
+            or "MistralLarge3ForCausalLM" in self.hf_config.architectures
+            or "PixtralForConditionalGeneration" in self.hf_config.architectures
+            or "MistralLarge3ForCausalLMEagle" in self.hf_config.architectures
+            or "KimiK25ForConditionalGeneration" in self.hf_config.architectures
+        ):
+            self.head_dim = 256
+            self.attention_arch = AttentionArch.MLA
+            self.kv_lora_rank = self.hf_text_config.kv_lora_rank
+            self.qk_nope_head_dim = self.hf_text_config.qk_nope_head_dim
+            self.qk_rope_head_dim = self.hf_text_config.qk_rope_head_dim
+            self.v_head_dim = self.hf_text_config.v_head_dim
+            self.index_head_dim = (
+                get_nsa_index_head_dim(self.hf_text_config)
+                if is_deepseek_nsa(self.hf_text_config)
+                else None
+            )
+            # Handle rope scaling
+            self.scaling = 1 / math.sqrt(self.qk_nope_head_dim + self.qk_rope_head_dim)
+            # in transformers v5, rope_scaling is just rope_parameters for backward compatibility
+            rope_scaling = self.hf_text_config.rope_scaling
+            if rope_scaling:
+                # v5 uses "rope_type", v4 uses "type"
+                rope_type = (
+                    rope_scaling.get("rope_type")
+                    or rope_scaling.get("type")
+                    or "default"
+                )
+                if rope_type != "default":
+                    mscale_all_dim = rope_scaling.get("mscale_all_dim", False)
+                    scaling_factor = rope_scaling["factor"]
+                    mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
+                    self.scaling = self.scaling * mscale * mscale
+        elif "MiniCPM3ForCausalLM" in self.hf_config.architectures:
+            self.head_dim = 128
+            self.attention_arch = AttentionArch.MLA
+            self.kv_lora_rank = self.hf_config.kv_lora_rank
+            self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
+        elif "DeepseekVL2ForCausalLM" in self.hf_config.architectures and getattr(
+            self.hf_text_config, "use_mla", True
+        ):
+            self.head_dim = 256
+            self.attention_arch = AttentionArch.MLA
+            self.kv_lora_rank = self.hf_text_config.kv_lora_rank
+            self.qk_rope_head_dim = self.hf_text_config.qk_rope_head_dim
+            self.qk_nope_head_dim = self.hf_text_config.qk_nope_head_dim
+        elif "KimiVLForConditionalGeneration" in self.hf_config.architectures:
+            self.head_dim = 256
+            self.attention_arch = AttentionArch.MLA
+            self.kv_lora_rank = self.hf_text_config.kv_lora_rank
+            self.qk_rope_head_dim = self.hf_text_config.qk_rope_head_dim
+            self.v_head_dim = self.hf_text_config.v_head_dim
+            self.qk_nope_head_dim = self.hf_text_config.qk_nope_head_dim
+        elif "KimiLinearForCausalLM" in self.hf_config.architectures:
+            self.head_dim = 72
+            self.attention_arch = AttentionArch.MLA
+            self.kv_lora_rank = self.hf_config.kv_lora_rank
+            self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
+            self.v_head_dim = self.hf_config.v_head_dim
+            self.qk_nope_head_dim = self.hf_config.qk_nope_head_dim
+        elif (
+            "BailingMoeV2_5ForCausalLM" in self.hf_config.architectures
+            or "BailingMoeForCausalLMNextN" in self.hf_config.architectures
+        ):
+            self.head_dim = self.hf_text_config.head_dim
+            self.attention_arch = AttentionArch.MLA
+            self.kv_lora_rank = self.hf_text_config.kv_lora_rank
+            self.qk_nope_head_dim = self.hf_text_config.qk_nope_head_dim
+            self.qk_rope_head_dim = self.hf_text_config.qk_rope_head_dim
+            self.v_head_dim = self.hf_config.v_head_dim
+            # Handle rope scaling with yarn
+            self.scaling = 1 / math.sqrt(self.qk_nope_head_dim + self.qk_rope_head_dim)
+            if self.hf_config.rope_scaling:
+                mscale_all_dim = self.hf_config.rope_scaling.get(
+                    "mscale_all_dim", False
+                )
+                scaling_factor = self.hf_config.rope_scaling["factor"]
+                mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
+                self.scaling = self.scaling * mscale * mscale
+        elif "SarvamMLAForCausalLM" in self.hf_config.architectures:
+            self.head_dim = (
+                self.hf_config.qk_nope_head_dim + self.hf_config.qk_rope_head_dim
+            )
+            self.attention_arch = AttentionArch.MLA
+            self.kv_lora_rank = self.hf_config.kv_lora_rank
+            self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
+            self.qk_nope_head_dim = self.hf_config.qk_nope_head_dim
+            self.v_head_dim = self.hf_config.v_head_dim
+            self.scaling = 1 / math.sqrt(self.qk_nope_head_dim + self.qk_rope_head_dim)
+            if self.hf_config.rope_scaling:
+                mscale_all_dim = self.hf_config.rope_scaling.get(
+                    "mscale_all_dim", False
+                )
+                scaling_factor = self.hf_config.rope_scaling["factor"]
+                mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
+                self.scaling = self.scaling * mscale * mscale
+        else:
+            if (
+                "MistralModel" in self.hf_config.architectures
+                or "MixtralForCausalLM" in self.hf_config.architectures
+                or "MistralForCausalLM" in self.hf_config.architectures
+            ):
+                if getattr(self, "head_dim", None) is None:
+                    self.head_dim = (
+                        self.hf_config.hidden_size // self.hf_config.num_attention_heads
+                    )
+                    # In transformers==4.52.3, the head_dim is null in MistralConfig
+                    if (
+                        not hasattr(self.hf_text_config, "head_dim")
+                        or self.hf_text_config.head_dim is None
+                    ):
+                        setattr(self.hf_text_config, "head_dim", self.head_dim)
+
+            elif "BaichuanForCausalLM" in self.hf_config.architectures:
+                self.use_alibi = self.hf_config.hidden_size != 4096
+
+            self.attention_arch = AttentionArch.MHA
+
+        self.num_attention_heads = self.hf_text_config.num_attention_heads
+        self.num_key_value_heads = getattr(
+            self.hf_text_config, "num_key_value_heads", None
+        )
+
+        # for Dbrx and MPT models
+        if self.hf_config.model_type in ["dbrx", "mpt"]:
+            self.num_key_value_heads = getattr(
+                self.hf_config.attn_config, "kv_n_heads", None
+            )
+
+        if self.num_key_value_heads is None:
+            self.num_key_value_heads = self.num_attention_heads
+        self.hidden_size = self.hf_text_config.hidden_size
+        self.num_hidden_layers = self.hf_text_config.num_hidden_layers
+        self.num_attention_layers = self.num_hidden_layers
+        if "LongcatFlashForCausalLM" in self.hf_config.architectures:
+            self.num_attention_layers = self.num_hidden_layers * 2
+        if "IQuestLoopCoderForCausalLM" in self.hf_config.architectures:
+            loop_num = getattr(self.hf_text_config, "loop_num", 1)
+            self.num_attention_layers = int(self.num_hidden_layers * int(loop_num))
+        if "WhisperForConditionalGeneration" in self.hf_config.architectures:
+            # Whisper has unique layer ID scheme:
+            # - Encoder self-attention: 0 to encoder_layers-1 (no KV cache)
+            # - Decoder self-attention: encoder_layers to encoder_layers+decoder_layers-1 (uses KV cache)
+            # - Decoder cross-attention: encoder_layers+decoder_layers to encoder_layers+2*decoder_layers-1
+            # Even though cross-attention doesn't save KV cache, attention backend needs buffer to exist
+            encoder_layers = getattr(self.hf_text_config, "encoder_layers", 0)
+            decoder_layers = getattr(
+                self.hf_text_config, "decoder_layers", self.num_hidden_layers
+            )
+            self.num_attention_layers = encoder_layers + 2 * decoder_layers
+        self.num_nextn_predict_layers = getattr(
+            self.hf_text_config, "num_nextn_predict_layers", None
+        )
+        self.vocab_size = self.hf_text_config.vocab_size
+
+    def get_total_num_attention_heads(self) -> int:
+        return self.num_attention_heads
+
+    def get_num_attention_heads(self, tensor_parallel_size) -> int:
+        total_num_attention_heads = self.num_attention_heads
+        return max(1, total_num_attention_heads // tensor_parallel_size)
+
+    # adapted from https://github.com/vllm-project/vllm/blob/main/vllm/config.py#L289
+    def get_total_num_kv_heads(self) -> int:
+        """Returns the total number of KV heads."""
+        # For GPTBigCode & Falcon:
+        # NOTE: for falcon, when new_decoder_architecture is True, the
+        # multi_query flag is ignored and we use n_head_kv for the number of
+        # KV heads.
+        falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"]
+        new_decoder_arch_falcon = (
+            self.hf_config.model_type in falcon_model_types
+            and getattr(self.hf_config, "new_decoder_architecture", False)
+        )
+        if not new_decoder_arch_falcon and getattr(
+            self.hf_text_config, "multi_query", False
+        ):
+            # Multi-query attention, only one KV head.
+            # Currently, tensor parallelism is not supported in this case.
+            return 1
+
+        # For DBRX and MPT
+        if self.hf_config.model_type in ["mpt"]:
+            if "kv_n_heads" in self.hf_config.attn_config:
+                return self.hf_config.attn_config["kv_n_heads"]
+            return self.hf_config.num_attention_heads
+        if self.hf_config.model_type in ["dbrx"]:
+            return getattr(
+                self.hf_config.attn_config,
+                "kv_n_heads",
+                self.hf_config.num_attention_heads,
+            )
+        if self.hf_config.model_type in ["nemotron-nas"]:
+            nkvh = {
+                self.hf_config.num_attention_heads // block.attention.n_heads_in_group
+                for block in self.hf_config.block_configs
+                if not block.attention.no_op
+            }
+            if len(nkvh) == 0:
+                raise RuntimeError("Couldn't determine number of kv heads")
+            if len(nkvh) > 1:
+                raise ValueError(
+                    "Variable GQA (VGQA) is not yet supported for nemotron-nas in sglang"
+                )
+            return next(iter(nkvh))
+
+        attributes = [
+            # For Falcon:
+            "n_head_kv",
+            "num_kv_heads",
+            # For LLaMA-2:
+            "num_key_value_heads",
+            # For ChatGLM:
+            "multi_query_group_num",
+            # For Step3
+            "num_attention_groups",
+        ]
+        for attr in attributes:
+            num_kv_heads = getattr(self.hf_text_config, attr, None)
+            if num_kv_heads is not None:
+                return num_kv_heads
+
+        # For non-grouped-query attention models, the number of KV heads is
+        # equal to the number of attention heads.
+        return self.hf_text_config.num_attention_heads
+
+    def get_num_kv_heads(self, tensor_parallel_size) -> int:
+        """Returns the number of KV heads per GPU."""
+        total_num_kv_heads = self.get_total_num_kv_heads()
+        # If tensor parallelism is used, we divide the number of KV heads by
+        # the tensor parallel size. We will replicate the KV heads in the
+        # case where the number of KV heads is smaller than the tensor
+        # parallel size so each GPU has at least one KV head.
+        return max(1, total_num_kv_heads // tensor_parallel_size)
+
+    def get_swa_num_kv_heads(self, tensor_parallel_size) -> int:
+        """Similar to get_num_kv_heads(), but for SWA."""
+        if hasattr(self.hf_text_config, "swa_num_key_value_heads"):
+            total_num_kv_heads = self.hf_text_config.swa_num_key_value_heads
+            return max(1, total_num_kv_heads // tensor_parallel_size)
+        elif hasattr(self.hf_text_config, "attention_other_setting"):  # For step3p5
+            total_num_kv_heads = self.hf_text_config.attention_other_setting.get(
+                "num_attention_groups"
+            )
+            return max(1, total_num_kv_heads // tensor_parallel_size)
+        else:
+            return self.get_num_kv_heads(tensor_parallel_size)
+
+    # adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
+    def _parse_quant_hf_config(self):
+        quant_cfg = getattr(self.hf_config, "quantization_config", None)
+        if quant_cfg is not None and not isinstance(quant_cfg, dict):
+            quant_cfg = quant_cfg.to_dict()
+        if quant_cfg is not None:
+            # Identify modelopt quantization
+            if (
+                "quant_method" not in quant_cfg
+                or quant_cfg["quant_method"] == "modelopt"
+            ):
+                parsed_cfg = self._parse_modelopt_quant_config(
+                    {"quantization": quant_cfg}
+                )
+                if parsed_cfg:
+                    quant_cfg.update(parsed_cfg)
+
+        if quant_cfg is None:
+            # compressed-tensors uses a "compression_config" key
+            quant_cfg = getattr(self.hf_config, "compression_config", None)
+        if quant_cfg is None:
+            # check if is modelopt or mixed-precision model -- Both of them don't have corresponding field
+            # in hf `config.json` but has a standalone `hf_quant_config.json` in the root directory
+            # example: https://huggingface.co/nvidia/Llama-3.1-8B-Instruct-FP8/tree/main
+            # example: https://huggingface.co/Barrrrry/DeepSeek-R1-W4AFP8/tree/main
+            is_local = os.path.exists(self.model_path)
+            if not is_local:
+                # Conditional import based on SGLANG_USE_MODELSCOPE environment variable
+                if envs.SGLANG_USE_MODELSCOPE.get():
+
+                    from modelscope import HubApi, model_file_download
+
+                    hf_api = HubApi()
+                else:
+                    import huggingface_hub
+                    from huggingface_hub import HfApi, hf_hub_download
+
+                    hf_api = HfApi()
+                try:
+                    # In offline mode, skip file_exists check to avoid OfflineModeIsEnabled error
+                    # Instead, directly try to download/read from cache with local_files_only
+                    file_exists = False  # Initialize to avoid UnboundLocalError
+                    if not huggingface_hub.constants.HF_HUB_OFFLINE:
+                        # Online mode: check if file exists before attempting download (optimization)
+                        file_exists = retry(
+                            lambda: hf_api.file_exists(
+                                self.model_path, "hf_quant_config.json"
+                            ),
+                            max_retry=2,
+                            initial_delay=1.0,
+                            max_delay=5.0,
+                        )
+                        if not file_exists:
+                            # File doesn't exist on hub, no need to try downloading
+                            return quant_cfg  # None
+
+                    # Download (online mode) or read from cache (offline mode)
+                    if envs.SGLANG_USE_MODELSCOPE.get():
+                        quant_config_file = model_file_download(
+                            model_id=self.model_path,
+                            file_path="hf_quant_config.json",
+                            revision=self.revision,
+                        )
+                    else:
+                        quant_config_file = hf_hub_download(
+                            repo_id=self.model_path,
+                            filename="hf_quant_config.json",
+                            revision=self.revision,
+                            local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+                        )
+                    with open(quant_config_file) as f:
+                        quant_config_dict = json.load(f)
+                    quant_cfg = self._parse_modelopt_quant_config(quant_config_dict)
+                except huggingface_hub.errors.LocalEntryNotFoundError:
+                    # Offline mode and file not in cache - this is normal for non-quantized models
+                    logger.debug(
+                        f"hf_quant_config.json not found in cache for {self.model_path} "
+                        "(offline mode, normal for non-quantized models)"
+                    )
+                except huggingface_hub.errors.OfflineModeIsEnabled:
+                    # Should not reach here after our changes, but keep for safety
+                    logger.warning(
+                        "Offline mode is enabled, skipping hf_quant_config.json check"
+                    )
+                except Exception as e:
+                    logger.warning(
+                        "Failed to load hf_quant_config.json for model %s: %s",
+                        self.model_path,
+                        e,
+                    )
+            elif os.path.exists(os.path.join(self.model_path, "hf_quant_config.json")):
+                quant_config_file = os.path.join(
+                    self.model_path, "hf_quant_config.json"
+                )
+                with open(quant_config_file) as f:
+                    quant_config_dict = json.load(f)
+                quant_cfg = self._parse_modelopt_quant_config(quant_config_dict)
+        return quant_cfg
+
+    def _find_quant_modelslim_config(self):
+        quant_config_file = Path(self.model_path, "quant_model_description.json")
+        quant_cfg = None
+        if quant_config_file.is_file():
+            with open(quant_config_file) as f:
+                quant_cfg = json.load(f)
+            # This field is required for flagless model loading but is not present in
+            # modelslim model description, so we're adding it here manually.
+            quant_cfg["quant_method"] = "modelslim"
+
+        return quant_cfg
+
+    def _parse_modelopt_quant_config(self, quant_config_dict: dict) -> Optional[dict]:
+        """Parse ModelOpt quantization config and return the appropriate quant_method."""
+        json_quant_configs = quant_config_dict["quantization"]
+        quant_algo = json_quant_configs.get("quant_algo", None)
+
+        if quant_algo == "MIXED_PRECISION":
+            return {"quant_method": "w4afp8", "quant_algo": quant_algo}
+        elif quant_algo and ("FP4" in quant_algo or "NVFP4" in quant_algo):
+            return {"quant_method": "modelopt_fp4", "quant_algo": quant_algo}
+        elif quant_algo and "FP8" in quant_algo:
+            return {"quant_method": "modelopt_fp8", "quant_algo": quant_algo}
+        else:
+            return None
+
+    def get_quantization_config_log_str(self) -> Optional[str]:
+        """
+        Get a concise string representation of the quantization config for logging.
+        Returns something like "quant=fp8, fmt=e4m3" or "quant=gptq, bits=4".
+        """
+        try:
+            quant_cfg = self._parse_quant_hf_config()
+            if not quant_cfg:
+                return None
+
+            quant_method = quant_cfg.get("quant_method", "quantized")
+            log_str = f"quant={quant_method}"
+
+            # Append interesting fields if they exist
+            for field in ["bits", "quant_algo", "fmt"]:
+                if field in quant_cfg:
+                    log_str += f", {field}={quant_cfg[field]}"
+
+            return log_str
+        except Exception:
+            return None
+
+    def _is_already_quantized(self) -> bool:
+        """Check if the model is already quantized based on config files."""
+        # Check for quantization in hf_config (config.json)
+        if getattr(self.hf_config, "quantization_config", None) or getattr(
+            self.hf_config, "compression_config", None
+        ):
+            return True
+
+        # Check for HuggingFace quantization config
+        from sglang.srt.utils import has_hf_quant_config
+
+        return has_hf_quant_config(self.model_path)
+
+    def _get_modelopt_quant_type(self) -> str:
+        """Extract ModelOpt quantization type from unified quantization flag."""
+        if self.quantization == "modelopt_fp8":
+            return "fp8"
+        elif self.quantization == "modelopt_fp4":
+            return "nvfp4"
+        elif self.quantization == "modelopt":
+            # Auto-detect from model config
+            quant_cfg = self._parse_quant_hf_config()
+            if quant_cfg:
+                quant_method = quant_cfg.get("quant_method", "").lower()
+                if "fp4" in quant_method:
+                    return "fp4"
+                elif "fp8" in quant_method:
+                    return "fp8"
+            # Default to fp8 if can't detect
+            return "fp8"
+        else:
+            return "fp8"  # Default fallback
+
+    def _get_sliding_window_size(self) -> Optional[int]:
+        sliding_window_size = getattr(self.hf_text_config, "sliding_window_size", None)
+        if sliding_window_size is None:
+            sliding_window_size = getattr(self.hf_text_config, "sliding_window", None)
+        return sliding_window_size
+
+    def _validate_quantize_and_serve_config(self):
+        """Validate quantize_and_serve configuration."""
+        if not self.quantize_and_serve:
+            return
+
+        # Check if ModelOpt quantization is specified
+        _MODELOPT_QUANTIZATION_METHODS = [
+            "modelopt",
+            "modelopt_fp8",
+            "modelopt_fp4",
+        ]
+        modelopt_quantization_specified = (
+            self.quantization in _MODELOPT_QUANTIZATION_METHODS
+        )
+
+        if not modelopt_quantization_specified:
+            raise ValueError(
+                "quantize_and_serve requires ModelOpt quantization (set with --quantization "
+                f"{{{', '.join(sorted(_MODELOPT_QUANTIZATION_METHODS))}}})"
+            )
+
+        # quantize_and_serve is disabled due to compatibility issues
+        raise NotImplementedError(
+            "quantize_and_serve functionality is currently disabled due to compatibility issues. "
+            "Please use the separate quantize-then-deploy workflow instead. "
+            "Step 1: Quantize and export model. "
+            "Step 2: Deploy the exported model."
+        )
+
+    # adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
+    def _verify_quantization(self) -> None:
+        supported_quantization = [*QUANTIZATION_METHODS]
+        rocm_supported_quantization = [
+            "awq",
+            "gptq",
+            "fp8",
+            "compressed_tensors",
+            "compressed-tensors",
+            "fbgemm_fp8",
+            "w8a8_fp8",
+            "petit_nvfp4",
+            "quark",
+            "mxfp4",
+            "auto-round",
+            "quark_int4fp8_moe",
+        ]
+        optimized_quantization_methods = [
+            "fp8",
+            "marlin",
+            "modelopt_fp8",
+            "modelopt_fp4",
+            "gptq_marlin_24",
+            "gptq_marlin",
+            "awq_marlin",
+            "fbgemm_fp8",
+            "compressed_tensors",
+            "compressed-tensors",
+            "experts_int8",
+            "w8a8_int8",
+            "w8a8_fp8",
+            "moe_wna16",
+            "qoq",
+            "w4afp8",
+            "petit_nvfp4",
+            "quark",
+            "modelslim",
+        ]
+        compatible_quantization_methods = {
+            "modelopt_fp8": ["modelopt"],
+            "modelopt_fp4": ["modelopt"],
+            "petit_nvfp4": ["modelopt"],
+            "w8a8_int8": ["compressed-tensors", "compressed_tensors"],
+            "w8a8_fp8": ["compressed-tensors", "compressed_tensors"],
+        }
+        if self.quantization is not None:
+            self.quantization = self.quantization.lower()
+
+        # Parse quantization method from the HF and ModelSlim model config, if available.
+        # Only one function should return config, other should return None.
+        cfg_list = []
+        hf_config = self._parse_quant_hf_config()
+        modelslim_config = self._find_quant_modelslim_config()
+        quant_config = modelslim_config or hf_config
+        if quant_config is not None:
+            cfg_list.append(quant_config)
+
+        # Filter out None values
+        cfg_list = [item for item in cfg_list if item is not None]
+        if len(cfg_list) > 1:
+            raise ValueError(
+                "Config list contains configs from 2 methods, must be only 1"
+            )
+        quant_cfg = cfg_list[0] if cfg_list else None
+
+        if quant_cfg is not None:
+            quant_method = quant_cfg.get(
+                "quant_method", "" if not self.quantization else self.quantization
+            ).lower()
+
+            # Detect which checkpoint is it
+            for _, method in QUANTIZATION_METHODS.items():
+                quantization_override = method.override_quantization_method(
+                    quant_cfg, self.quantization
+                )
+                if quantization_override:
+                    quant_method = quantization_override
+                    self.quantization = quantization_override
+                    break
+
+            # Verify quantization configurations.
+            if self.quantization is None:
+                self.quantization = quant_method
+            elif self.quantization != quant_method:
+                # Check if the CLI-specified quantization is compatible with HF config's quant_method
+                is_compatible = (
+                    self.quantization in compatible_quantization_methods
+                    and quant_method
+                    in compatible_quantization_methods[self.quantization]
+                )
+                if is_compatible:
+                    # Keep the CLI-specified quantization (e.g., modelopt_fp4) even if
+                    # HF config says "modelopt" - they are compatible
+                    logger.info(
+                        f"Using CLI-specified quantization ({self.quantization}) which is "
+                        f"compatible with HF config quant_method ({quant_method})."
+                    )
+                elif self.is_draft_model:
+                    # Allow auto-detection of quantization from checkpoint for draft model
+                    # only if the CLI quantization is not compatible
+                    logger.info(
+                        f"Draft model quantization ({quant_method}) differs from "
+                        f"main model quantization ({self.quantization}). "
+                        f"Using draft model's detected quantization: {quant_method}"
+                    )
+                    self.quantization = quant_method
+                else:
+                    raise ValueError(
+                        "Quantization method specified in the model config "
+                        f"({quant_method}) does not match the quantization "
+                        f"method specified in the `quantization` argument "
+                        f"({self.quantization})."
+                    )
+
+            # Check if the scale_fmt is ue8m0, and warn user if deepgemm is enabled for non-ue8m0 models on blackwell
+            self.use_scale_ue8m0 = quant_cfg.get("scale_fmt", None) == "ue8m0"
+            from sglang.srt.layers import deep_gemm_wrapper
+
+            if not self.use_scale_ue8m0 and deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0:
+                logger.warning(
+                    "DeepGemm is enabled but the scale_fmt of checkpoint is not ue8m0. This might cause accuracy degradation on Blackwell."
+                )
+
+        if self.quantization is not None:
+            if self.quantization not in supported_quantization:
+                raise ValueError(
+                    f"Unknown quantization method: {self.quantization}. Must "
+                    f"be one of {supported_quantization}."
+                )
+            if is_hip() and self.quantization not in rocm_supported_quantization:
+                raise ValueError(
+                    f"{self.quantization} quantization is currently not "
+                    f"supported in ROCm."
+                )
+            if self.quantization not in optimized_quantization_methods:
+                # Don't warn for MXFP4 on SM100 since it has optimized kernels
+                if not (self.quantization == "mxfp4" and is_sm100_supported()):
+                    logger.warning(
+                        "%s quantization is not fully "
+                        "optimized yet. The speed can be slower than "
+                        "non-quantized models.",
+                        self.quantization,
+                    )
+
+    def _verify_dual_chunk_attention_config(self) -> None:
+        if hasattr(self.hf_config, "dual_chunk_attention_config"):
+            # Try loading the sparse attention config
+            sparse_attn_config = get_sparse_attention_config(self.model_path)
+            if not sparse_attn_config:
+                return
+            self.hf_config.dual_chunk_attention_config["sparse_attention_config"] = (
+                sparse_attn_config
+            )
+            if (
+                "sparse_attention_enabled"
+                not in self.hf_config.dual_chunk_attention_config
+            ):
+                self.hf_config.dual_chunk_attention_config[
+                    "sparse_attention_enabled"
+                ] = True
+
+    def _verify_transformers_version(self):
+        import transformers
+        from packaging import version
+
+        tf_version_str = getattr(transformers, "__version__", None)
+        if tf_version_str is None:
+            return
+
+        vision_config = getattr(self.hf_config, "vision_config", None)
+        is_glm_46vmoe = "glm-4.6v" in self.model_path.lower() or (
+            vision_config is not None
+            and getattr(vision_config, "model_type", None) == "glm4v_moe_vision"
+            # The vision config model type for GLM-4.5v is 'glm4v_moe',
+            # while for GLM-4.6v, it is 'glm4v_moe_vision'.
+        )
+        needs_tf_v5 = is_glm_46vmoe
+
+        tf_version = version.parse(tf_version_str)
+        required_version = version.parse("5.0.0dev0")
+
+        if tf_version < required_version:
+            if needs_tf_v5:
+                raise ValueError(
+                    f"Transformers version {tf_version_str} is not supported for model {self.model_path} "
+                    f"or model type {self.hf_config.model_type}. "
+                    "Please upgrade transformers to >= 5.0.0."
+                )
+        elif not needs_tf_v5:
+            logger.warning(
+                f"Transformers version {tf_version_str} is used for model type {self.hf_config.model_type}. "
+                "If you experience issues related to RoPE parameters, "
+                "they may be due to incompatibilities between Transformers >=5.0.0 and some models. "
+                "You can try downgrading to transformers==4.57.1 as a workaround."
+            )
+
+    def _get_hf_eos_token_id(self) -> Optional[Set[int]]:
+        eos_ids = getattr(self.hf_config, "eos_token_id", None)
+        if eos_ids is not None:
+            # it can be either int or list of int
+            eos_ids = {eos_ids} if isinstance(eos_ids, int) else set(eos_ids)
+        if eos_ids is None:
+            eos_ids = set()
+        if self.hf_generation_config:
+            generation_eos_ids = getattr(
+                self.hf_generation_config, "eos_token_id", None
+            )
+            if generation_eos_ids:
+                generation_eos_ids = (
+                    {generation_eos_ids}
+                    if isinstance(generation_eos_ids, int)
+                    else set(generation_eos_ids)
+                )
+                eos_ids = eos_ids | generation_eos_ids
+        return eos_ids
+
+    def get_default_sampling_params(self) -> dict[str, Any]:
+        """
+        Get default sampling parameters from the model's generation config.
+
+        This method returns non-default sampling parameters from the model's
+        generation_config.json when sampling_defaults is set to "model".
+
+        Returns:
+            A dictionary containing the non-default sampling parameters.
+        """
+        if self.sampling_defaults != "model":
+            return {}
+
+        if self.hf_generation_config is None:
+            return {}
+
+        config = self.hf_generation_config.to_dict()
+
+        available_params = [
+            "repetition_penalty",
+            "temperature",
+            "top_k",
+            "top_p",
+            "min_p",
+        ]
+
+        default_sampling_params = {
+            p: config.get(p) for p in available_params if config.get(p) is not None
+        }
+
+        return default_sampling_params
+
+    def _maybe_pull_model_tokenizer_from_remote(self) -> None:
+        """
+        Pull the model config files to a temporary
+        directory in case of remote.
+
+        Args:
+            model: The model name or path.
+
+        """
+        from sglang.srt.connector import create_remote_connector
+        from sglang.srt.utils import is_remote_url
+
+        if is_remote_url(self.model_path):
+            logger.info("Pulling model configs from remote...")
+            # BaseConnector implements __del__() to clean up the local dir.
+            # Since config files need to exist all the time, so we DO NOT use
+            # with statement to avoid closing the client.
+            client = create_remote_connector(self.model_path)
+            if is_remote_url(self.model_path):
+                client.pull_files(allow_pattern=["*config.json"])
+                self.model_weights = self.model_path
+                self.model_path = client.get_local_dir()
+
+
+# adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
+_STR_DTYPE_TO_TORCH_DTYPE = {
+    "half": torch.float16,
+    "float16": torch.float16,
+    "float": torch.float32,
+    "float32": torch.float32,
+    "bfloat16": torch.bfloat16,
+}
+
+
+# adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
+def _get_and_verify_dtype(
+    config: PretrainedConfig,
+    dtype: Union[str, torch.dtype],
+) -> torch.dtype:
+    # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
+    # because config.torch_dtype can be None.
+    if isinstance(config, dict):
+        config_dtype = config.get("dtype", None) or config.get("torch_dtype", None)
+        model_type = config.get("model_type", "")
+    else:
+        config_dtype = getattr(config, "dtype", None)
+        model_type = getattr(config, "model_type", "")
+    if isinstance(config_dtype, str):
+        config_dtype = _STR_DTYPE_TO_TORCH_DTYPE.get(config_dtype, None)
+    if config_dtype is None:
+        config_dtype = torch.float32
+
+    if isinstance(dtype, str):
+        dtype = dtype.lower()
+        if dtype == "auto":
+            if config_dtype == torch.float32:
+                if model_type.startswith("gemma"):
+                    if model_type == "gemma":
+                        gemma_version = ""
+                    else:
+                        gemma_version = model_type[5]
+                    logger.info(
+                        f"For Gemma {gemma_version}, we downcast float32 to bfloat16 instead "
+                        "of float16 by default. Please specify `dtype` if you "
+                        "want to use float16."
+                    )
+                    torch_dtype = torch.bfloat16
+                else:
+                    # Following the common practice, we use float16 for float32
+                    # models.
+                    torch_dtype = torch.float16
+            else:
+                torch_dtype = config_dtype
+        else:
+            if dtype not in _STR_DTYPE_TO_TORCH_DTYPE:
+                raise ValueError(f"Unknown dtype: {dtype}")
+            torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]
+    elif isinstance(dtype, torch.dtype):
+        torch_dtype = dtype
+    else:
+        raise ValueError(f"Unknown dtype: {dtype}")
+
+    # Verify the dtype.
+    if torch_dtype != config_dtype:
+        if torch_dtype == torch.float32:
+            # Upcasting to float32 is allowed.
+            logger.info("Upcasting %s to %s.", config_dtype, torch_dtype)
+            pass
+        elif config_dtype == torch.float32:
+            # Downcasting from float32 to float16 or bfloat16 is allowed.
+            logger.info("Downcasting %s to %s.", config_dtype, torch_dtype)
+            pass
+        else:
+            # Casting between float16 and bfloat16 is allowed with a warning.
+            logger.warning("Casting %s to %s.", config_dtype, torch_dtype)
+
+    return torch_dtype
+
+
+def is_generation_model(model_architectures: List[str], is_embedding: bool = False):
+    # We have two ways to determine whether a model is a generative model.
+    # 1. Check the model architecture
+    # 2. check the `is_embedding` server args
+
+    if (
+        "LlamaEmbeddingModel" in model_architectures
+        or "MistralModel" in model_architectures
+        or "LlamaForSequenceClassification" in model_architectures
+        or "LlamaForSequenceClassificationWithNormal_Weights" in model_architectures
+        or "InternLM2ForRewardModel" in model_architectures
+        or "Qwen2ForRewardModel" in model_architectures
+        or "Qwen3ForRewardModel" in model_architectures
+        or "Qwen2ForSequenceClassification" in model_architectures
+        or "Qwen3ForSequenceClassification" in model_architectures
+        or "CLIPModel" in model_architectures
+        or "BertModel" in model_architectures
+        or "Contriever" in model_architectures
+        or "BertForSequenceClassification" in model_architectures
+        or "XLMRobertaModel" in model_architectures
+        or "XLMRobertaForSequenceClassification" in model_architectures
+        or "Gemma2ForSequenceClassification" in model_architectures
+    ):
+        return False
+    else:
+        return not is_embedding
+
+
+multimodal_model_archs = [
+    "CLIPModel",
+    "DeepseekVL2ForCausalLM",
+    "Ernie4_5_VLMoeForConditionalGeneration",
+    "Gemma3ForConditionalGeneration",
+    "Gemma3nForConditionalGeneration",
+    "Glm4vForConditionalGeneration",
+    "Glm4vMoeForConditionalGeneration",
+    "GlmOcrForConditionalGeneration",
+    "GlmAsrForConditionalGeneration",
+    "Grok1VForCausalLM",
+    "Grok1AForCausalLM",
+    "LlavaLlamaForCausalLM",
+    "Llama4ForConditionalGeneration",
+    "LlavaMistralForCausalLM",
+    "LlavaQwenForCausalLM",
+    "LlavaForConditionalGeneration",
+    "LlavaVidForCausalLM",
+    "LightOnOCRForConditionalGeneration",
+    "MiniCPMO",
+    "MiniCPMV",
+    "Mistral3ForConditionalGeneration",
+    "MultiModalityCausalLM",
+    "MllamaForConditionalGeneration",
+    "NemotronH_Nano_VL_V2",
+    "PixtralForConditionalGeneration",
+    "Qwen2AudioForConditionalGeneration",
+    "Qwen2VLForConditionalGeneration",
+    "Qwen2_5_VLForConditionalGeneration",
+    "Qwen3VLForConditionalGeneration",
+    "Qwen3VLMoeForConditionalGeneration",
+    "Qwen3_5ForConditionalGeneration",
+    "Qwen3_5MoeForConditionalGeneration",
+    "Qwen3OmniMoeForConditionalGeneration",
+    "KimiVLForConditionalGeneration",
+    "InternVLChatModel",
+    "InternS1ForConditionalGeneration",
+    "InternS1ProForConditionalGeneration",
+    "Phi4MMForCausalLM",
+    "WhisperForConditionalGeneration",
+    "Step3VLForConditionalGeneration",
+    "POINTSV15ChatModel",
+    "DotsVLMForCausalLM",
+    "DotsOCRForCausalLM",
+    "Sarashina2VisionForCausalLM",
+    "NVILAForConditionalGeneration",
+    "NVILALiteForConditionalGeneration",
+    "DeepseekOCRForCausalLM",
+    "JetVLMForConditionalGeneration",
+    "PaddleOCRVLForConditionalGeneration",
+    "MiDashengLMModel",
+    "StepVLForConditionalGeneration",
+    "KimiK25ForConditionalGeneration",
+]
+
+piecewise_cuda_graph_disabled_model_archs = [
+    "DeepseekV32ForCausalLM",
+    "Qwen3NextForCausalLM",
+    "GlmMoeDsaForCausalLM",
+    "BailingMoeV2_5ForCausalLM",
+    "LLaDAModelLM",
+]
+
+if external_mm_model_arch := envs.SGLANG_EXTERNAL_MM_MODEL_ARCH.get():
+    multimodal_model_archs.append(external_mm_model_arch)
+
+
+def is_multimodal_model(model_architectures: List[str]):
+    if any(
+        multi_model_arch in model_architectures
+        for multi_model_arch in multimodal_model_archs
+    ):
+        return True
+    else:
+        return False
+
+
+def is_multimodal_gen_model(model_architectures: List[str]):
+    return False
+
+
+def is_image_gen_model(model_architectures: List[str]):
+    return False
+
+
+def is_audio_model(model_architectures: List[str]):
+    models = [
+        "WhisperForConditionalGeneration",
+    ]
+    return any(model in model_architectures for model in models)
+
+
+def is_encoder_decoder_model(model_architectures: List[str]):
+    models = [
+        "WhisperForConditionalGeneration",
+        "MllamaForConditionalGeneration",
+    ]
+    return any(model in model_architectures for model in models)
+
+
+def is_local_attention_model(model_architectures: List[str]):
+    return "Llama4ForConditionalGeneration" in model_architectures
+
+
+def is_multimodal_chunked_prefill_supported(model_architectures: List[str]):
+    """Check if chunked prefill is supported for a MultiModal model."""
+    unsupported = [
+        "Grok1VForCausalLM",
+        "Grok1AForCausalLM",
+        "LlavaLlamaForCausalLM",
+        "MllamaForConditionalGeneration",
+        "CLIPModel",
+    ]
+    if any(multi_model_arch in unsupported for multi_model_arch in model_architectures):
+        return False
+    else:
+        return True
+
+
+def is_piecewise_cuda_graph_disabled_model(model_architectures: List[str]):
+    return any(
+        arch in piecewise_cuda_graph_disabled_model_archs
+        for arch in model_architectures
+    )
+
+
+def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
+    if scale <= 1:
+        return 1.0
+    return 0.1 * mscale * math.log(scale) + 1.0
+
+
+def is_hybrid_swa_model(model_architectures: List[str]):
+
+    hybrid_swa_archs = {
+        "Llama4ForConditionalGeneration",
+        "GptOssForCausalLM",
+        "MiMoV2FlashForCausalLM",
+        "MiMoV2MTP",
+        "Step3p5ForCausalLM",
+        "Step3p5MTP",
+    }
+    return any(arch in hybrid_swa_archs for arch in model_architectures)
+
+
+def get_hybrid_layer_ids(
+    model_architectures: List[str],
+    hf_text_config: PretrainedConfig,
+):
+    num_hidden_layers = hf_text_config.num_hidden_layers
+    if "Llama4ForConditionalGeneration" in model_architectures:
+        swa_attention_layer_ids = [
+            i for i in range(num_hidden_layers) if (i + 1) % 4 != 0
+        ]
+        full_attention_layer_ids = [
+            i for i in range(num_hidden_layers) if (i + 1) % 4 == 0
+        ]
+    elif "GptOssForCausalLM" in model_architectures:
+        layer_types = getattr(hf_text_config, "layer_types", None)
+        swa_attention_layer_ids = [
+            i for i, x in enumerate(layer_types) if x == "sliding_attention"
+        ]
+        full_attention_layer_ids = [
+            i for i, x in enumerate(layer_types) if x == "full_attention"
+        ]
+    elif "MiMoV2FlashForCausalLM" in model_architectures:
+        hybrid_layer_pattern = getattr(hf_text_config, "hybrid_layer_pattern", None)
+        swa_attention_layer_ids = [
+            i for i in range(num_hidden_layers) if hybrid_layer_pattern[i] == 1
+        ]
+        full_attention_layer_ids = [
+            i for i in range(num_hidden_layers) if hybrid_layer_pattern[i] == 0
+        ]
+    elif "MiMoV2MTP" in model_architectures:
+        swa_attention_layer_ids = [0]
+        full_attention_layer_ids = []
+    elif "Step3p5ForCausalLM" in model_architectures:
+        layer_types = hf_text_config.layer_types
+        swa_attention_layer_ids = [
+            i
+            for i, x in enumerate(layer_types)
+            if x == "sliding_attention" and i < num_hidden_layers
+        ]
+        full_attention_layer_ids = [
+            i
+            for i, x in enumerate(layer_types)
+            if x == "full_attention" and i < num_hidden_layers
+        ]
+    elif "Step3p5MTP" in model_architectures:
+        swa_attention_layer_ids = [0]
+        full_attention_layer_ids = []
+    else:
+        swa_attention_layer_ids = None
+        full_attention_layer_ids = None
+    return swa_attention_layer_ids, full_attention_layer_ids
diff --git a/sglang/python/sglang/srt/configs/modelopt_config.py b/sglang/python/sglang/srt/configs/modelopt_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..911b4ce0cd96272db0266f686965aeda3ee50eac
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/modelopt_config.py
@@ -0,0 +1,30 @@
+# Configuration for NVIDIA ModelOpt quantization integration
+from dataclasses import dataclass
+from typing import Optional
+
+
+@dataclass
+class ModelOptConfig:
+    """Configuration for NVIDIA ModelOpt quantization operations.
+
+    This configuration class holds parameters for ModelOpt quantization,
+    checkpoint management, and model export operations.
+
+    Args:
+        quant: Quantization method/type (e.g., "fp8", "fp4")
+        checkpoint_restore_path: Path to restore ModelOpt checkpoint from
+        checkpoint_save_path: Path to save ModelOpt checkpoint to
+        export_path: Path to export quantized model in HuggingFace format
+        quantize_and_serve: Whether to quantize and serve in one step
+    """
+
+    quant: Optional[str] = None
+    checkpoint_restore_path: Optional[str] = None
+    checkpoint_save_path: Optional[str] = None
+    export_path: Optional[str] = None
+    quantize_and_serve: bool = False
+
+    def __post_init__(self):
+        """Validate configuration after initialization."""
+        # Add any validation logic if needed
+        pass
diff --git a/sglang/python/sglang/srt/configs/nano_nemotron_vl.py b/sglang/python/sglang/srt/configs/nano_nemotron_vl.py
new file mode 100644
index 0000000000000000000000000000000000000000..09ab29abf4652e6b19cd1cfe5e3eef76938c2864
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/nano_nemotron_vl.py
@@ -0,0 +1,114 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# Adapted from https://huggingface.co/nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16/blob/cb5a65ff10232128389d882d805fa609427544f1/configuration.py
+
+from typing import Any
+
+from transformers.configuration_utils import PretrainedConfig
+
+from sglang.srt.configs.nemotron_h import NemotronHConfig
+from sglang.srt.configs.radio import RadioConfig
+from sglang.srt.multimodal.internvl_utils import IMAGENET_MEAN, IMAGENET_STD
+
+
+def float_triplet(seq: Any):
+    a, b, c = tuple(seq)
+    assert (
+        isinstance(a, float) and isinstance(b, float) and isinstance(c, float)
+    ), "expected three floats"
+    return a, b, c
+
+
+class NemotronH_Nano_VL_V2_Config(PretrainedConfig):
+    model_type = "NemotronH_Nano_VL_V2"
+    is_composition = True
+
+    def __init__(
+        self,
+        vision_config=None,
+        llm_config=None,
+        force_image_size: int = 512,
+        patch_size: int = 16,
+        downsample_ratio=0.5,
+        template=None,
+        ps_version="v2",
+        image_tag_type="internvl",
+        projector_hidden_size=4096,
+        vit_hidden_size=1280,
+        video_pruning_rate: float = 0.0,
+        video_context_token: str = "<video>",
+        img_context_token: str = "<image>",
+        img_start_token: str = "<img>",
+        img_end_token: str = "</img>",
+        norm_mean: tuple[float, float, float] | list[float] = IMAGENET_MEAN,
+        norm_std: tuple[float, float, float] | list[float] = IMAGENET_STD,
+        use_thumbnail: bool = True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        # Handle both cases: when loading from JSON (llm_config is dict) and when called internally by transformers (llm_config; vision_config are None)
+        if llm_config is not None:
+            self.llm_config = NemotronHConfig(**llm_config)
+            assert isinstance(vision_config, dict), "vision_config must be a dictionary"
+            self.raw_vision_config = vision_config
+        else:
+            assert vision_config is None
+            self.llm_config = NemotronHConfig()
+            self.raw_vision_config = {}
+
+        # Assign configuration values
+        vision_image_size = self.raw_vision_config.get("image_size", force_image_size)
+        vision_patch_size = self.raw_vision_config.get("patch_size", patch_size)
+        self.image_size = int(
+            vision_image_size[0]
+            if isinstance(vision_image_size, list)
+            else vision_image_size
+        )
+        self.patch_size = int(
+            vision_patch_size[0]
+            if isinstance(vision_patch_size, list)
+            else vision_patch_size
+        )
+
+        self.downsample_ratio = downsample_ratio
+        self.video_context_token = video_context_token
+        self.img_context_token = img_context_token
+        self.template = template  # TODO move out of here and into the tokenizer
+        self.ps_version = ps_version  # Pixel shuffle version
+        self.image_tag_type = image_tag_type  # TODO: into the tokenizer too?
+        self.projector_hidden_size = projector_hidden_size
+        self.vit_hidden_size = vit_hidden_size
+        self.video_pruning_rate = video_pruning_rate
+
+        self.norm_mean = float_triplet(norm_mean)
+        self.norm_std = float_triplet(norm_std)
+        self.use_thumbnail = use_thumbnail
+        self.img_start_token = img_start_token
+        self.img_end_token = img_end_token
+
+    def create_radio_config(self):
+        config = self.raw_vision_config
+        model_name = config["args"]["model"]
+        reg_tokens = config["args"].get("register_multiple")
+        image_size = config.get("preferred_resolution", [224])[0]
+        radio_config = RadioConfig(
+            patch_size=self.patch_size,
+            norm_mean=self.norm_mean,
+            norm_std=self.norm_std,
+            model_name=model_name,
+            reg_tokens=reg_tokens,
+            image_size=image_size,
+        )
+        return radio_config
diff --git a/sglang/python/sglang/srt/configs/nemotron_h.py b/sglang/python/sglang/srt/configs/nemotron_h.py
new file mode 100644
index 0000000000000000000000000000000000000000..833e97d8792890e388abf0f1b403b96f177c93ab
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/nemotron_h.py
@@ -0,0 +1,314 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/transformers_utils/configs/nemotron_h.py
+
+"""NemotronH model configuration"""
+
+import regex as re
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+from sglang.srt.configs.mamba_utils import (
+    Mamba2CacheParams,
+    Mamba2StateShape,
+    mamba2_state_dtype,
+)
+
+logger = logging.get_logger(__name__)
+
+MAMBA = "M"
+ATTENTION = "*"
+MLP = "-"
+MOE = "E"
+
+
+class NemotronHConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a
+    [`NemotronHModel`]. It is used to instantiate a NemotronH model according
+    to the specified arguments, defining the model architecture. Instantiating
+    a configuration with the defaults will yield a similar configuration to
+    that of the NemotronH-v0.1 model.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 131072):
+            Vocabulary size of the NemotronH model. Defines the number of
+            different tokens that can be represented by the `inputs_ids`
+            passed when calling [`NemotronHModel`]
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be
+            tied. Note that this is only relevant if the model has an output
+            word embedding layer.
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 21504):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 52):
+            Number of hidden layers in the Transformer encoder.
+        hybrid_override_pattern (`str`, *optional*, defaults to
+            `"M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-"`):
+            The pattern of the hybrid model. The pattern is a string of
+            characters where each character represents
+            M: Mamba2, *: Attention, -: MLP
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the
+            Transformer encoder.
+        attention_head_dim (`int`, *optional*, defaults to 128):
+            Dimension of each attention head.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to
+            implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use
+            Multi Head Attention (MHA), if `num_key_value_heads=1` the model
+            will use Multi Query Attention (MQA) otherwise GQA is used.
+        mlp_hidden_act (`str`, *optional*, defaults to "relu2"):
+            The non-linear activation function in the MLP layers.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in attention layers.
+        mlp_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in MLP layers.
+        use_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in the model.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for
+            initializing all weight matrices.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        residual_in_fp32 (`bool`, *optional*, defaults to `False`):
+            Whether or not residuals should be in `float32`. If set to `False`
+            residuals will keep the same `dtype` as the rest of the model.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values
+            attentions (not used by all models). Only relevant if
+            `config.is_decoder=True`.
+        num_logits_to_keep (`int` or `None`, *optional*, defaults to 1):
+            Number of prompt logits to calculate during generation. If `None`,
+            all logits will be calculated. If an integer value, only last
+            `num_logits_to_keep` logits will be calculated.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            The id of the padding token.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            The id of the "beginning-of-sequence" token.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the "end-of-sequence" token.
+        sliding_window (`int`, *optional*, defaults to None):
+            Sliding window attention window size.
+        max_position_embeddings (`int`, *optional*, defaults to 4096):
+            The maximum sequence length that this model might ever be used
+            with.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        hidden_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the hidden states.
+        use_mamba_kernels (`bool`, *optional*, defaults to `True`):
+            Flag indicating whether or not to use the fast mamba kernels.
+            These are available only if `mamba-ssm` and `causal-conv1d`
+            are installed, and the mamba modules are running on a CUDA device.
+        ssm_state_size (`int`, *optional*, defaults to 128):
+            The dimension of the mamba state space latents.
+        mamba_num_heads (`int`, *optional*, defaults to 128):
+            Number of heads in Mamba layers.
+        mamba_n_groups (`int`, *optional*, defaults to 8):
+            Number of groups in Mamba layers.
+        mamba_head_dim (`int`, *optional*, defaults to 64):
+            Dimension of each Mamba head.
+        mamba_d_conv (`int`, *optional*, defaults to 4):
+            The size of the mamba convolution kernel.
+        mamba_expand (`int`, *optional*, defaults to 2):
+            Expanding factor used to determine the mamba intermediate size.
+        mamba_hidden_act (`str`, *optional*, defaults to "silu"):
+            The non-linear activation function in the Mamba layers.
+        mamba_dt_min (`float`, *optional*, defaults to 0.001):
+            Minimum value for the time step in Mamba.
+        mamba_dt_max (`float`, *optional*, defaults to 0.1):
+            Maximum value for the time step in Mamba.
+        mamba_dt_limit (`tuple`, *optional*, defaults to (0.0, float("inf"))):
+            Limits for the time step in Mamba.
+        mamba_dt_init_floor (`float`, *optional*, defaults to 1e-4):
+            Floor value for time step initialization in Mamba.
+        mamba_conv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in the convolution layer of the mamba mixer
+            block.
+        mamba_proj_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use bias in the input and output projections of the
+            mamba mixer block.
+        mamba_chunk_size (`int`, *optional*, defaults to 256):
+            Size of chunks for Mamba processing.
+        rescale_prenorm_residual (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the pre-normalization residual connections.
+    """
+
+    model_type = "nemotron_h"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=131072,
+        tie_word_embeddings=False,
+        hidden_size=4096,
+        intermediate_size=21504,
+        num_hidden_layers=52,
+        hybrid_override_pattern="M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M-",
+        num_attention_heads=32,
+        head_dim=128,
+        num_key_value_heads=8,  # nemo: num_query_groups
+        mlp_hidden_act="relu2",
+        attention_bias=False,
+        mlp_bias=False,
+        use_bias=False,
+        initializer_range=0.02,  # nemo: init_method_std
+        layer_norm_epsilon=1e-5,  # nemo: layernorm_epsilon
+        residual_in_fp32=False,  #  Megatron Core default value
+        use_cache=True,
+        num_logits_to_keep=1,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        sliding_window=None,
+        max_position_embeddings=4096,
+        attention_dropout=0.0,
+        hidden_dropout=0.0,  # * ADDED
+        use_mamba_kernels=True,
+        ssm_state_size=128,  # mamba_state_size
+        mamba_num_heads=128,
+        mamba_n_groups=8,  # nemo: mamba_ssm_ngroups = num_heads
+        mamba_head_dim=64,
+        mamba_d_conv=4,
+        mamba_expand=2,
+        mamba_hidden_act="silu",
+        mamba_dt_min=0.001,
+        mamba_dt_max=0.1,
+        mamba_dt_limit=(0.0, float("inf")),
+        mamba_dt_init_floor=1e-4,
+        mamba_conv_bias=True,
+        mamba_proj_bias=False,
+        mamba_chunk_size=256,
+        rescale_prenorm_residual=True,
+        n_routed_experts=8,
+        n_shared_experts=1,
+        moe_intermediate_size=7688,
+        moe_shared_expert_intermediate_size=7688,
+        moe_latent_size=None,
+        num_experts_per_tok=2,
+        routed_scaling_factor=1.0,
+        n_group=1,
+        topk_group=1,
+        norm_topk_prob=True,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.tie_word_embeddings = tie_word_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.hybrid_override_pattern = hybrid_override_pattern
+        self.num_attention_heads = num_attention_heads
+        self.head_dim = head_dim
+        self.sliding_window = sliding_window
+        self.max_position_embeddings = max_position_embeddings
+        self.attention_dropout = attention_dropout
+        self.hidden_dropout = hidden_dropout
+
+        # Validate hybrid_override_pattern
+        # M: Mamba2, *: Attention, -: MLP
+        assert (
+            len(self.hybrid_override_pattern) == self.num_hidden_layers
+        ), "hybrid_override_pattern must have same length as num_hidden_layers"
+        assert re.match(
+            r"^[*\-ME]+$", self.hybrid_override_pattern
+        ), "hybrid_override_pattern must only contain characters 'M', '*', '-' or 'E'"
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.mlp_hidden_act = mlp_hidden_act
+        self.attention_bias = attention_bias
+        self.mlp_bias = mlp_bias
+        self.use_bias = use_bias
+        self.initializer_range = initializer_range
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.residual_in_fp32 = residual_in_fp32
+
+        self.use_cache = use_cache
+        self.num_logits_to_keep = num_logits_to_keep
+
+        self.use_mamba_kernels = use_mamba_kernels
+        self.mamba_n_groups = mamba_n_groups
+        self.mamba_head_dim = mamba_head_dim
+        self.ssm_state_size = ssm_state_size
+        self.mamba_num_heads = mamba_num_heads
+        self.conv_kernel = mamba_d_conv
+        self.expand = mamba_expand
+        self.mamba_hidden_act = mamba_hidden_act
+        self.time_step_min = mamba_dt_min
+        self.time_step_max = mamba_dt_max
+        self.time_step_limit = mamba_dt_limit
+        self.time_step_floor = mamba_dt_init_floor
+        self.use_conv_bias = mamba_conv_bias
+        self.mamba_proj_bias = mamba_proj_bias
+        self.mamba_chunk_size = mamba_chunk_size
+        self.rescale_prenorm_residual = rescale_prenorm_residual
+        self.n_routed_experts = n_routed_experts
+        self.n_shared_experts = n_shared_experts
+        self.moe_intermediate_size = moe_intermediate_size
+        self.moe_shared_expert_intermediate_size = moe_shared_expert_intermediate_size
+        self.moe_latent_size = moe_latent_size
+        self.num_experts_per_tok = num_experts_per_tok
+        self.routed_scaling_factor = routed_scaling_factor
+        self.n_group = n_group
+        self.topk_group = topk_group
+        self.norm_topk_prob = norm_topk_prob
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    @property
+    def mamba_layer_ids(self):
+        return [
+            i
+            for i in range(self.num_hidden_layers)
+            if self.hybrid_override_pattern[i] == MAMBA
+        ]
+
+    @property
+    def full_attention_layer_ids(self):
+        return [
+            i
+            for i in range(self.num_hidden_layers)
+            if self.hybrid_override_pattern[i] == ATTENTION
+        ]
+
+    @property
+    def mamba2_cache_params(self) -> Mamba2CacheParams:
+        from sglang.srt.layers.dp_attention import get_attention_tp_size
+
+        shape = Mamba2StateShape.create(
+            tp_world_size=get_attention_tp_size(),
+            intermediate_size=self.mamba_num_heads * self.mamba_head_dim,
+            n_groups=self.n_groups,
+            num_heads=self.mamba_num_heads,
+            head_dim=self.mamba_head_dim,
+            state_size=self.ssm_state_size,
+            conv_kernel=self.conv_kernel,
+        )
+
+        return Mamba2CacheParams(
+            shape=shape, layers=self.mamba_layer_ids, dtype=mamba2_state_dtype(self)
+        )
diff --git a/sglang/python/sglang/srt/configs/olmo3.py b/sglang/python/sglang/srt/configs/olmo3.py
new file mode 100644
index 0000000000000000000000000000000000000000..8640342196e5fb436227f8fad454e33a8832a5e3
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/olmo3.py
@@ -0,0 +1,103 @@
+# coding=utf-8
+# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Olmo3 model configuration"""
+
+import enum
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+
+class Olmo3LayerType(enum.Enum):
+    full_attention = "full_attention"
+    sliding_attention = "sliding_attention"
+
+
+class Olmo3Config(PretrainedConfig):
+
+    model_type = "olmo3"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=50304,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        use_cache=True,
+        pad_token_id=1,
+        bos_token_id=None,
+        eos_token_id=50279,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        rms_norm_eps=1e-5,
+        sliding_window=4096,
+        layer_types=None,
+        **kwargs,
+    ):
+        # This model uses Olmo3ForCausalLM in transformers but Olmo2ForCausalLM
+        # in sglang.
+        if "architectures" not in kwargs:
+            kwargs["architectures"] = ["Olmo2ForCausalLM"]
+        elif "Olmo3ForCausalLM" in kwargs["architectures"]:
+            kwargs["architectures"].remove("Olmo3ForCausalLM")
+            kwargs["architectures"].append("Olmo2ForCausalLM")
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+
+        self.rms_norm_eps = rms_norm_eps
+
+        self.sliding_window = sliding_window
+        self.layer_types = layer_types
+        if self.layer_types is None:
+            self.layer_types = [
+                "sliding_attention" if (i + 1) % 4 != 0 else "full_attention"
+                for i in range(self.num_hidden_layers)
+            ]
diff --git a/sglang/python/sglang/srt/configs/points_v15_chat.py b/sglang/python/sglang/srt/configs/points_v15_chat.py
new file mode 100644
index 0000000000000000000000000000000000000000..758939b275a879b200bfda11bac6f3b1f3c6aa4f
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/points_v15_chat.py
@@ -0,0 +1,29 @@
+from typing import Optional, Union
+
+from transformers import PretrainedConfig, Qwen2Config
+from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLVisionConfig
+
+
+class POINTSV15ChatConfig(PretrainedConfig):
+    model_type = "pointsv1.5_chat"
+
+    def __init__(
+        self,
+        vision_config: Optional[Union[dict, Qwen2VLVisionConfig]] = None,
+        llm_config: Optional[Union[dict, Qwen2Config]] = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        if vision_config is None:
+            vision_config = Qwen2VLVisionConfig()
+        elif isinstance(vision_config, dict):
+            vision_config = Qwen2VLVisionConfig(**vision_config)
+        self.vision_config = vision_config
+
+        if llm_config is None:
+            llm_config = Qwen2Config()
+        elif isinstance(llm_config, dict):
+            llm_config = Qwen2Config(**llm_config)
+
+        self.llm_config = llm_config
+        self.hidden_size = self.llm_config.hidden_size
diff --git a/sglang/python/sglang/srt/configs/qwen3_5.py b/sglang/python/sglang/srt/configs/qwen3_5.py
new file mode 100644
index 0000000000000000000000000000000000000000..fddf783ab3e58d2042064cb9b5c73406319f45ae
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/qwen3_5.py
@@ -0,0 +1,122 @@
+from transformers import PretrainedConfig
+
+from sglang.srt.configs.qwen3_next import Qwen3NextConfig
+from sglang.srt.configs.qwen3_vl import Qwen3VLVisionConfig
+
+
+class Qwen3_5VisionConfig(Qwen3VLVisionConfig):
+    model_type = "qwen3_5"
+    base_config_key = "vision_config"
+
+
+class Qwen3_5TextConfig(Qwen3NextConfig):
+    model_type = "qwen3_5_text"
+    base_config_key = "text_config"
+
+    def __init__(
+        self,
+        **kwargs,
+    ):
+        # HF Qwen3.5 checkpoints may provide RoPE settings under rope_parameters.
+        # Normalize it before parent init so downstream code sees the expected values.
+        rope_parameters = kwargs.pop("rope_parameters", None)
+        if kwargs.get("rope_scaling") is None and rope_parameters is not None:
+            kwargs["rope_scaling"] = rope_parameters
+
+        super().__init__(**kwargs)
+        if self.rope_scaling is None:
+            self.rope_scaling = rope_parameters or {}
+
+        # Keep both names for compatibility with model code paths that read either.
+        self.rope_parameters = rope_parameters or self.rope_scaling
+
+
+class Qwen3_5Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Qwen3_5Model`]. It is used to instantiate a
+    Qwen3.5 model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of
+    Qwen3.5.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Qwen3_5TextConfig`):
+            The config object or dictionary of the text backbone.
+        vision_config (`Union[PreTrainedConfig, dict]`,  *optional*, defaults to `Qwen3_5VisionConfig`):
+            The config object or dictionary of the vision backbone.
+        image_token_id (`int`, *optional*, defaults to 151655):
+            The image token index to encode the image prompt.
+        video_token_id (`int`, *optional*, defaults to 151656):
+            The video token index to encode the image prompt.
+        vision_start_token_id (`int`, *optional*, defaults to 151652):
+            The start token index to encode the image prompt.
+        vision_end_token_id (`int`, *optional*, defaults to 151653):
+            The end token index to encode the image prompt.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie the word embeddings.
+
+    ```python
+    >>> from transformers import Qwen3_5ForConditionalGeneration, Qwen3_5Config
+
+    >>> # Initializing a Qwen3.5 style configuration
+    >>> configuration = Qwen3_5Config()
+
+    >>> # Initializing a model from the Qwen3.5 style configuration
+    >>> model = Qwen3_5ForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "qwen3_5"
+    sub_configs = {
+        "vision_config": Qwen3_5VisionConfig,
+        "text_config": Qwen3_5TextConfig,
+    }
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        text_config=None,
+        vision_config=None,
+        image_token_id=151655,
+        video_token_id=151656,
+        vision_start_token_id=151652,
+        vision_end_token_id=151653,
+        tie_word_embeddings=False,
+        **kwargs,
+    ):
+        if isinstance(vision_config, dict):
+            self.vision_config = self.sub_configs["vision_config"](**vision_config)
+        elif vision_config is None:
+            self.vision_config = self.sub_configs["vision_config"]()
+
+        if isinstance(text_config, dict):
+            self.text_config = self.sub_configs["text_config"](**text_config)
+        elif text_config is None:
+            self.text_config = self.sub_configs["text_config"]()
+
+        self.image_token_id = image_token_id
+        self.video_token_id = video_token_id
+        self.vision_start_token_id = vision_start_token_id
+        self.vision_end_token_id = vision_end_token_id
+        super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings)
+
+
+class Qwen3_5MoeVisionConfig(Qwen3_5VisionConfig):
+    model_type = "qwen3_5_moe"
+
+
+class Qwen3_5MoeTextConfig(Qwen3_5TextConfig):
+    model_type = "qwen3_5_moe_text"
+
+
+class Qwen3_5MoeConfig(Qwen3_5Config):
+    model_type = "qwen3_5_moe"
+    sub_configs = {
+        "vision_config": Qwen3_5MoeVisionConfig,
+        "text_config": Qwen3_5MoeTextConfig,
+    }
diff --git a/sglang/python/sglang/srt/configs/qwen3_next.py b/sglang/python/sglang/srt/configs/qwen3_next.py
new file mode 100644
index 0000000000000000000000000000000000000000..3bf153a225720273173c75d9e7823222a93a6571
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/qwen3_next.py
@@ -0,0 +1,302 @@
+# coding=utf-8
+# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Qwen3Hybrid model configuration"""
+
+import enum
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+from sglang.srt.configs.mamba_utils import (
+    Mamba2CacheParams,
+    Mamba2StateShape,
+    mamba2_state_dtype,
+)
+from sglang.srt.configs.update_config import adjust_tp_num_heads_if_necessary
+from sglang.srt.utils import is_cpu
+
+logger = logging.get_logger(__name__)
+_is_cpu = is_cpu()
+
+
+class HybridLayerType(enum.Enum):
+    full_attention = "attention"
+    linear_attention = "linear_attention"
+
+
+class Qwen3NextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Qwen3NextModel`]. It is used to instantiate a
+    Qwen3-Next model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of
+    Qwen3-Next-80B-A3B-Instruct [Qwen/Qwen3-Next-80B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Instruct).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 151936):
+            Vocabulary size of the model. Defines the number of different tokens that can be represented by the
+            `inputs_ids`.
+        hidden_size (`int`, *optional*, defaults to 2048):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 5632):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 48):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 2):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
+        hidden_act (`str`, *optional*, defaults to `"silu"`):
+            The non-linear activation function in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 32768):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        partial_rotary_factor (`float`, *optional*, defaults to 0.25):
+            Percentage of the query and keys which will have rotary embedding.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        head_dim (`int`, *optional*, defaults to 256):
+            Projection weights dimension in multi-head attention.
+        linear_conv_kernel_dim (`int`, *optional*, defaults to 4):
+            Kernel size of the convolution used in linear attention layers.
+        linear_key_head_dim (`int`, *optional*, defaults to 128):
+            Dimension of each key head in linear attention.
+        linear_value_head_dim (`int`, *optional*, defaults to 128):
+            Dimension of each value head in linear attention.
+        linear_num_key_heads (`int`, *optional*, defaults to 16):
+            Number of key heads used in linear attention layers.
+        linear_num_value_heads (`int`, *optional*, defaults to 32):
+            Number of value heads used in linear attention layers.
+        decoder_sparse_step (`int`, *optional*, defaults to 1):
+            The frequency of the MoE layer.
+        moe_intermediate_size (`int`, *optional*, defaults to 512):
+            Intermediate size of the routed expert.
+        shared_expert_intermediate_size (`int`, *optional*, defaults to 512):
+            Intermediate size of the shared expert.
+        num_experts_per_tok (`int`, *optional*, defaults to 10):
+            Number of selected experts.
+        num_experts (`int`, *optional*, defaults to 512):
+            Number of routed experts.
+        norm_topk_prob (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the topk probabilities.
+        output_router_logits (`bool`, *optional*, defaults to `False`):
+            Whether or not the router logits should be returned by the model. Enabling this will also
+            allow the model to output the auxiliary loss, including load balancing loss and router z-loss.
+        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
+            The aux loss factor for the total loss.
+        mlp_only_layers (`list[int]`, *optional*, defaults to `[]`):
+            Indicate which layers use Qwen3NextMLP rather than Qwen3NextSparseMoeBlock
+            The list contains layer index, from 0 to num_layers-1 if we have num_layers layers
+            If `mlp_only_layers` is empty, `decoder_sparse_step` is used to determine the sparsity.
+        layer_types (`list[str]`, *optional*, defaults to None):
+            Types of each layer (attention or linear).
+
+    ```python
+    >>> from transformers import Qwen3NextModel, Qwen3NextConfig
+
+    >>> # Initializing a Qwen3Next style configuration
+    >>> configuration =  Qwen3NextConfig()
+
+    >>> # Initializing a model from the Qwen3-Next-80B-A3B style configuration
+    >>> model = Qwen3NextModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+
+    model_type = "qwen3_next"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=151936,
+        hidden_size=2048,
+        intermediate_size=5632,
+        num_hidden_layers=48,
+        num_attention_heads=16,
+        num_key_value_heads=2,
+        hidden_act="silu",
+        max_position_embeddings=32768,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        partial_rotary_factor=0.25,
+        attention_bias=False,
+        attention_dropout=0.0,
+        head_dim=256,
+        linear_conv_kernel_dim=4,
+        linear_key_head_dim=128,
+        linear_value_head_dim=128,
+        linear_num_key_heads=16,
+        linear_num_value_heads=32,
+        decoder_sparse_step=1,
+        moe_intermediate_size=512,
+        shared_expert_intermediate_size=512,
+        num_experts_per_tok=10,
+        num_experts=512,
+        norm_topk_prob=True,
+        output_router_logits=False,
+        router_aux_loss_coef=0.001,
+        mlp_only_layers=[],
+        layer_types=None,
+        **kwargs,
+    ):
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.partial_rotary_factor = partial_rotary_factor
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.head_dim = head_dim
+
+        # linear attention (gdn now part)
+        self.linear_conv_kernel_dim = linear_conv_kernel_dim
+        self.linear_key_head_dim = linear_key_head_dim
+        self.linear_value_head_dim = linear_value_head_dim
+        self.linear_num_key_heads = linear_num_key_heads
+        self.linear_num_value_heads = linear_num_value_heads
+
+        # MoE arguments
+        self.decoder_sparse_step = decoder_sparse_step
+        self.moe_intermediate_size = moe_intermediate_size
+        self.shared_expert_intermediate_size = shared_expert_intermediate_size
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_experts = num_experts
+        self.norm_topk_prob = norm_topk_prob
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef
+        self.mlp_only_layers = mlp_only_layers
+
+    @property
+    def layers_block_type(self):
+        layer_type_list = []
+
+        for l in range(self.num_hidden_layers):
+            if (l + 1) % self.full_attention_interval == 0:
+                layer_type_list.append(HybridLayerType.full_attention.value)
+            else:
+                layer_type_list.append(HybridLayerType.linear_attention.value)
+
+        return layer_type_list
+
+    @property
+    def linear_layer_ids(self):
+        return [
+            i
+            for i, type_value in enumerate(self.layers_block_type)
+            if type_value == HybridLayerType.linear_attention.value
+        ]
+
+    @property
+    def full_attention_layer_ids(self):
+        return [
+            i
+            for i, type_value in enumerate(self.layers_block_type)
+            if type_value == HybridLayerType.full_attention.value
+        ]
+
+    @property
+    def mamba2_cache_params(self) -> Mamba2CacheParams:
+        from sglang.srt.layers.dp_attention import get_attention_tp_size
+
+        if _is_cpu:
+            world_size = get_attention_tp_size()
+            adjust_tp_num_heads_if_necessary(self, world_size, False)
+
+        shape = Mamba2StateShape.create(
+            tp_world_size=get_attention_tp_size(),
+            intermediate_size=self.linear_value_head_dim * self.linear_num_value_heads,
+            n_groups=self.linear_num_key_heads,
+            num_heads=self.linear_num_value_heads,
+            head_dim=self.linear_value_head_dim,
+            state_size=self.linear_key_head_dim,
+            conv_kernel=self.linear_conv_kernel_dim,
+        )
+
+        return Mamba2CacheParams(
+            shape=shape, layers=self.linear_layer_ids, dtype=mamba2_state_dtype(self)
+        )
diff --git a/sglang/python/sglang/srt/configs/qwen3_omni.py b/sglang/python/sglang/srt/configs/qwen3_omni.py
new file mode 100644
index 0000000000000000000000000000000000000000..8baea892335dafbe673163c55f7bf4f3f3b24053
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/qwen3_omni.py
@@ -0,0 +1,609 @@
+from transformers import PretrainedConfig
+from transformers.configuration_utils import layer_type_validation
+
+from sglang.utils import logger
+
+
+class Qwen3OmniMoeAudioEncoderConfig(PretrainedConfig):
+    model_type = "qwen3_omni_moe_audio_encoder"
+
+    def __init__(
+        self,
+        num_mel_bins=128,
+        encoder_layers=32,
+        encoder_attention_heads=20,
+        encoder_ffn_dim=5120,
+        d_model=1280,
+        dropout=0,
+        attention_dropout=0,
+        activation_function="gelu",
+        activation_dropout=0,
+        scale_embedding=False,
+        initializer_range=0.02,
+        max_source_positions=1500,
+        n_window=100,
+        output_dim=3584,
+        n_window_infer=400,
+        conv_chunksize=500,
+        downsample_hidden_size=480,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.num_mel_bins = num_mel_bins
+        self.d_model = d_model
+        self.encoder_layers = encoder_layers
+        self.encoder_attention_heads = encoder_attention_heads
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_function = activation_function
+        self.activation_dropout = activation_dropout
+        self.num_hidden_layers = encoder_layers
+        self.initializer_range = initializer_range
+        self.scale_embedding = (
+            scale_embedding  # scale factor will be sqrt(d_model) if True
+        )
+        self.max_source_positions = max_source_positions
+        self.n_window = n_window
+        self.output_dim = output_dim
+        self.n_window_infer = n_window_infer
+        self.conv_chunksize = conv_chunksize
+        self.downsample_hidden_size = downsample_hidden_size
+
+
+class Qwen3OmniMoeVisionEncoderConfig(PretrainedConfig):
+    model_type = "qwen3_omni_moe_vision_encoder"
+    base_config_key = "vision_config"
+
+    def __init__(
+        self,
+        depth=27,
+        hidden_size=1152,
+        hidden_act="gelu_pytorch_tanh",
+        intermediate_size=4304,
+        num_heads=16,
+        in_channels=3,
+        patch_size=16,
+        spatial_merge_size=2,
+        temporal_patch_size=2,
+        out_hidden_size=3584,
+        num_position_embeddings=2304,
+        deepstack_visual_indexes=[8, 16, 24],
+        initializer_range=0.02,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.depth = depth
+        self.hidden_size = hidden_size
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.num_heads = num_heads
+        self.in_channels = in_channels
+        self.patch_size = patch_size
+        self.spatial_merge_size = spatial_merge_size
+        self.temporal_patch_size = temporal_patch_size
+        self.out_hidden_size = out_hidden_size
+        self.num_position_embeddings = num_position_embeddings
+        self.initializer_range = initializer_range
+        self.deepstack_visual_indexes = deepstack_visual_indexes
+
+
+class Qwen3OmniMoeTextConfig(PretrainedConfig):
+    model_type = "qwen3_omni_moe_text"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    # Default tensor parallel plan for base model `Qwen3OmniMoeText`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.experts.*.gate_proj": "colwise",
+        "layers.*.mlp.experts.*.up_proj": "colwise",
+        "layers.*.mlp.experts.*.down_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=3584,
+        hidden_size=2048,
+        intermediate_size=18944,
+        num_hidden_layers=28,
+        num_attention_heads=28,
+        num_key_value_heads=4,
+        hidden_act="silu",
+        max_position_embeddings=32768,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=1000000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        sliding_window=None,
+        attention_dropout=0,
+        decoder_sparse_step=1,
+        moe_intermediate_size=768,
+        num_experts_per_tok=8,
+        num_experts=128,
+        norm_topk_prob=True,
+        output_router_logits=False,
+        router_aux_loss_coef=0.001,
+        mlp_only_layers=None,
+        **kwargs,
+    ):
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.sliding_window = sliding_window
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, move it to 'rope_type'.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+
+        # MoE arguments
+        self.decoder_sparse_step = decoder_sparse_step
+        self.moe_intermediate_size = moe_intermediate_size
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_experts = num_experts
+        self.norm_topk_prob = norm_topk_prob
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef
+        self.mlp_only_layers = [] if mlp_only_layers is None else mlp_only_layers
+
+
+class Qwen3OmniMoeThinkerConfig(PretrainedConfig):
+    model_type = "qwen3_omni_moe_thinker"
+    attribute_map = {
+        "image_token_id": "image_token_index",
+        "video_token_id": "video_token_index",
+        "audio_token_id": "audio_token_index",
+    }
+    sub_configs = {
+        "audio_config": Qwen3OmniMoeAudioEncoderConfig,
+        "vision_config": Qwen3OmniMoeVisionEncoderConfig,
+        "text_config": Qwen3OmniMoeTextConfig,
+    }
+
+    def __init__(
+        self,
+        audio_config=None,
+        vision_config=None,
+        text_config=None,
+        audio_token_id=151646,
+        image_token_id=151655,
+        video_token_id=151656,
+        position_id_per_seconds=25,
+        audio_start_token_id=151647,
+        user_token_id=872,
+        initializer_range=0.02,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.user_token_id = user_token_id
+        self.position_id_per_seconds = position_id_per_seconds
+        self.audio_start_token_id = audio_start_token_id
+        self.initializer_range = initializer_range
+
+        if isinstance(vision_config, dict):
+            vision_config = Qwen3OmniMoeVisionEncoderConfig(**vision_config)
+        elif vision_config is None:
+            vision_config = Qwen3OmniMoeVisionEncoderConfig()
+        self.vision_config = vision_config
+
+        if isinstance(audio_config, dict):
+            audio_config = Qwen3OmniMoeAudioEncoderConfig(**audio_config)
+        elif audio_config is None:
+            audio_config = Qwen3OmniMoeAudioEncoderConfig()
+        self.audio_config = audio_config
+
+        if isinstance(text_config, dict):
+            text_config = Qwen3OmniMoeTextConfig(**text_config)
+        elif text_config is None:
+            text_config = Qwen3OmniMoeTextConfig()
+        self.text_config = text_config
+        self.audio_token_id = audio_token_id
+        self.image_token_id = image_token_id
+        self.video_token_id = video_token_id
+
+
+class Qwen3OmniMoeTalkerCodePredictorConfig(PretrainedConfig):
+
+    model_type = "qwen3_omni_moe_talker_code_predictor"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    # Default tensor parallel plan for base model `Qwen3OmniMoeTalkerCodePredictor`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=2048,
+        hidden_size=1024,
+        intermediate_size=3072,
+        num_hidden_layers=5,
+        num_attention_heads=16,
+        num_key_value_heads=8,
+        head_dim=128,
+        hidden_act="silu",
+        max_position_embeddings=32768,
+        initializer_range=0.02,
+        rms_norm_eps=0.000001,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=10000,
+        rope_scaling=None,
+        attention_bias=False,
+        sliding_window=None,
+        layer_types=None,
+        attention_dropout=0,
+        num_code_groups=32,
+        **kwargs,
+    ):
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.sliding_window = sliding_window
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, move it to 'rope_type'.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+
+        self.layer_types = layer_types
+        if self.layer_types is None:
+            self.layer_types = [
+                (
+                    "sliding_attention"
+                    if self.sliding_window is not None and i >= self.max_window_layers
+                    else "full_attention"
+                )
+                for i in range(self.num_hidden_layers)
+            ]
+        layer_type_validation(self.layer_types, self.num_hidden_layers)
+        self.num_code_groups = num_code_groups
+
+
+class Qwen3OmniMoeTalkerTextConfig(PretrainedConfig):
+
+    model_type = "qwen3_omni_moe_talker_text"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    # Default tensor parallel plan for base model `Qwen3OmniMoeTalkerText`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.experts.*.gate_proj": "colwise",
+        "layers.*.mlp.experts.*.up_proj": "colwise",
+        "layers.*.mlp.experts.*.down_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=3072,
+        hidden_size=1024,
+        intermediate_size=2048,
+        num_hidden_layers=20,
+        num_attention_heads=16,
+        num_key_value_heads=2,
+        hidden_act="silu",
+        max_position_embeddings=32768,
+        initializer_range=0.02,
+        rms_norm_eps=0.000001,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=10000,
+        rope_scaling=None,
+        attention_bias=False,
+        sliding_window=None,
+        attention_dropout=0,
+        decoder_sparse_step=1,
+        moe_intermediate_size=384,
+        num_experts_per_tok=8,
+        num_experts=128,
+        norm_topk_prob=False,
+        output_router_logits=False,
+        router_aux_loss_coef=0.001,
+        mlp_only_layers=None,
+        **kwargs,
+    ):
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.sliding_window = sliding_window
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, move it to 'rope_type'.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+
+        # MoE arguments
+        self.decoder_sparse_step = decoder_sparse_step
+        self.moe_intermediate_size = moe_intermediate_size
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_experts = num_experts
+        self.norm_topk_prob = norm_topk_prob
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef
+        self.mlp_only_layers = [] if mlp_only_layers is None else mlp_only_layers
+
+
+class Qwen3OmniMoeTalkerConfig(PretrainedConfig):
+
+    sub_configs = {
+        "code_predictor_config": Qwen3OmniMoeTalkerCodePredictorConfig,
+        "text_config": Qwen3OmniMoeTalkerTextConfig,
+    }
+
+    def __init__(
+        self,
+        code_predictor_config=None,
+        text_config=None,
+        num_code_groups=32,
+        thinker_hidden_size=2048,
+        codec_eos_token_id=4198,
+        accept_hidden_layer=18,
+        codec_nothink_id=4203,
+        codec_think_bos_id=4204,
+        codec_think_eos_id=4205,
+        codec_pad_id=4196,
+        codec_bos_id=4197,
+        audio_token_id=151646,
+        image_token_id=151655,
+        video_token_id=151656,
+        vision_start_token_id=151652,
+        position_id_per_seconds=25,
+        audio_start_token_id=151669,
+        speaker_id=None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        if code_predictor_config is None:
+            code_predictor_config = {}
+            self.code_predictor_config = Qwen3OmniMoeTalkerCodePredictorConfig()
+            logger.info(
+                "code_predictor_config is None. Initializing code_predictor_config model with default values"
+            )
+        elif isinstance(code_predictor_config, Qwen3OmniMoeTalkerCodePredictorConfig):
+            self.code_predictor_config = code_predictor_config
+        else:
+            self.code_predictor_config = Qwen3OmniMoeTalkerCodePredictorConfig(
+                **code_predictor_config
+            )
+
+        if text_config is None:
+            text_config = {}
+            self.text_config = Qwen3OmniMoeTalkerTextConfig()
+            logger.info(
+                "talker text_config is None. Initializing talker text model with default values"
+            )
+        elif isinstance(text_config, Qwen3OmniMoeTalkerTextConfig):
+            self.text_config = text_config
+        else:
+            self.text_config = Qwen3OmniMoeTalkerTextConfig(**text_config)
+        self.num_code_groups = num_code_groups
+        self.thinker_hidden_size = thinker_hidden_size
+        self.codec_eos_token_id = codec_eos_token_id
+        self.accept_hidden_layer = accept_hidden_layer
+        self.codec_nothink_id = codec_nothink_id
+        self.codec_think_bos_id = codec_think_bos_id
+        self.codec_think_eos_id = codec_think_eos_id
+        self.codec_pad_id = codec_pad_id
+        self.codec_bos_id = codec_bos_id
+        self.audio_token_id = audio_token_id
+        self.image_token_id = image_token_id
+        self.video_token_id = video_token_id
+        self.position_id_per_seconds = position_id_per_seconds
+        self.audio_start_token_id = audio_start_token_id
+        self.vision_start_token_id = vision_start_token_id
+        self.speaker_id = speaker_id
+
+
+class Qwen3OmniMoeCode2WavConfig(PretrainedConfig):
+
+    def __init__(
+        self,
+        codebook_size=2048,
+        hidden_size=1024,
+        max_position_embeddings=8000,
+        rope_theta=10000,
+        num_attention_heads=16,
+        num_key_value_heads=16,
+        attention_bias=False,
+        sliding_window=72,
+        intermediate_size=3072,
+        hidden_act="silu",
+        layer_scale_initial_scale=0.01,
+        rms_norm_eps=1e-5,
+        num_hidden_layers=8,
+        num_quantizers=16,
+        upsample_rates=(8, 5, 4, 3),
+        upsampling_ratios=(2, 2),
+        decoder_dim=1536,
+        attention_dropout=0.0,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.codebook_size = codebook_size
+        self.hidden_size = hidden_size
+        self.max_position_embeddings = max_position_embeddings
+        self.rope_theta = rope_theta
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.attention_bias = attention_bias
+        self.sliding_window = sliding_window
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.layer_scale_initial_scale = layer_scale_initial_scale
+        self.rms_norm_eps = rms_norm_eps
+        self.num_hidden_layers = num_hidden_layers
+        self.num_quantizers = num_quantizers
+        self.upsample_rates = upsample_rates
+        self.upsampling_ratios = upsampling_ratios
+        self.decoder_dim = decoder_dim
+        self.attention_dropout = attention_dropout
+
+    @property
+    def layer_types(self):
+        """
+        All layer in code2wav should be sliding attention
+        """
+        return ["sliding_attention"] * self.num_hidden_layers
+
+
+class Qwen3OmniMoeConfig(PretrainedConfig):
+
+    model_type = "qwen3_omni_moe"
+    sub_configs = {
+        "thinker_config": Qwen3OmniMoeThinkerConfig,
+        "talker_config": Qwen3OmniMoeTalkerConfig,
+        "code2wav_config": Qwen3OmniMoeCode2WavConfig,
+    }
+
+    def __init__(
+        self,
+        thinker_config=None,
+        talker_config=None,
+        code2wav_config=None,
+        enable_audio_output=True,
+        im_start_token_id=151644,
+        im_end_token_id=151645,
+        tts_pad_token_id=151671,
+        tts_bos_token_id=151672,
+        tts_eos_token_id=151673,
+        system_token_id=8948,
+        user_token_id=872,
+        assistant_token_id=77091,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        if thinker_config is None:
+            thinker_config = {}
+            logger.info(
+                "thinker_config is None. Initializing thinker model with default values"
+            )
+
+        if talker_config is None:
+            talker_config = {}
+            logger.info(
+                "talker_config is None. Initializing talker model with default values"
+            )
+
+        if code2wav_config is None:
+            code2wav_config = {}
+            logger.info(
+                "code2wav_config is None. Initializing code2wav model with default values"
+            )
+
+        self.thinker_config = Qwen3OmniMoeThinkerConfig(**thinker_config)
+        self.talker_config = Qwen3OmniMoeTalkerConfig(**talker_config)
+        self.code2wav_config = Qwen3OmniMoeCode2WavConfig(**code2wav_config)
+        self.enable_audio_output = enable_audio_output
+        self.im_start_token_id = im_start_token_id
+        self.im_end_token_id = im_end_token_id
+        self.tts_pad_token_id = tts_pad_token_id
+        self.tts_bos_token_id = tts_bos_token_id
+        self.tts_eos_token_id = tts_eos_token_id
+        self.system_token_id = system_token_id
+        self.user_token_id = user_token_id
+        self.assistant_token_id = assistant_token_id
+
+    def get_text_config(self, decoder=False) -> "PretrainedConfig":
+        """
+        Returns the config that is meant to be used with text IO. On most models, it is the original config instance
+        itself. On specific composite models, it is under a set of valid names.
+
+        Args:
+            decoder (`Optional[bool]`, *optional*, defaults to `False`):
+                If set to `True`, then only search for decoder config names.
+        """
+        # Overridden for deeply nested config like Qwen2-Omni. We don't have any omni model
+        # except for Qwen yet. This has to be generalized if more deeply nested configs are
+        # added. NOTE: currently method used only by vLLM
+        return self.thinker_config.get_text_config()
diff --git a/sglang/python/sglang/srt/configs/qwen3_vl.py b/sglang/python/sglang/srt/configs/qwen3_vl.py
new file mode 100644
index 0000000000000000000000000000000000000000..85068b5a60020ce350f6b4ff0a4aa54291690e34
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/qwen3_vl.py
@@ -0,0 +1,571 @@
+from transformers import PretrainedConfig
+
+
+class Qwen3VLVisionConfig(PretrainedConfig):
+    model_type = "qwen3_vl"
+    base_config_key = "vision_config"
+
+    def __init__(
+        self,
+        depth=27,
+        hidden_size=1152,
+        hidden_act="gelu_pytorch_tanh",
+        intermediate_size=4304,
+        num_heads=16,
+        in_channels=3,
+        patch_size=16,
+        spatial_merge_size=2,
+        temporal_patch_size=2,
+        out_hidden_size=3584,
+        num_position_embeddings=2304,
+        deepstack_visual_indexes=[8, 16, 24],
+        initializer_range=0.02,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.depth = depth
+        self.hidden_size = hidden_size
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.num_heads = num_heads
+        self.in_channels = in_channels
+        self.patch_size = patch_size
+        self.spatial_merge_size = spatial_merge_size
+        self.temporal_patch_size = temporal_patch_size
+        self.out_hidden_size = out_hidden_size
+        self.num_position_embeddings = num_position_embeddings
+        self.initializer_range = initializer_range
+        self.deepstack_visual_indexes = deepstack_visual_indexes
+
+
+class Qwen3VLTextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Qwen3VLTextModel`]. It is used to instantiate a
+    Qwen3-VL model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of
+    Qwen3-VL-4B-Instruct [Qwen/Qwen3-VL-4B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 151936):
+            Vocabulary size of the Qwen3VL model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`Qwen3VLModel`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 22016):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 32):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `32`.
+        head_dim (`int`, *optional*, defaults to 128):
+            The dimension of the head. If not specified, will default to `hidden_size // num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 128000):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 5000000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+
+    ```python
+    >>> from transformers import Qwen3VLTextModel, Qwen3VLTextConfig
+
+    >>> # Initializing a Qwen3VL style configuration
+    >>> configuration = Qwen3VLTextConfig()
+
+    >>> # Initializing a model from the Qwen3-VL-7B style configuration
+    >>> model = Qwen3VLTextModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "qwen3_vl_text"
+    base_config_key = "text_config"
+
+    def __init__(
+        self,
+        vocab_size=151936,
+        hidden_size=4096,
+        intermediate_size=22016,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=32,
+        head_dim=128,
+        hidden_act="silu",
+        max_position_embeddings=128000,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=5000000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+
+
+class Qwen3VLConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Qwen3VLModel`]. It is used to instantiate a
+    Qwen3-VL model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of
+    Qwen3-VL-4B-Instruct [Qwen/Qwen3-VL-4B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Qwen3VLTextConfig`):
+            The config object or dictionary of the text backbone.
+        vision_config (`Union[PreTrainedConfig, dict]`,  *optional*, defaults to `Qwen3VLVisionConfig`):
+            The config object or dictionary of the vision backbone.
+        image_token_id (`int`, *optional*, defaults to 151655):
+            The image token index to encode the image prompt.
+        video_token_id (`int`, *optional*, defaults to 151656):
+            The video token index to encode the image prompt.
+        vision_start_token_id (`int`, *optional*, defaults to 151652):
+            The start token index to encode the image prompt.
+        vision_end_token_id (`int`, *optional*, defaults to 151653):
+            The end token index to encode the image prompt.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie the word embeddings.
+
+    ```python
+    >>> from transformers import Qwen3VLForConditionalGeneration, Qwen3VLConfig
+
+    >>> # Initializing a Qwen3-VL style configuration
+    >>> configuration = Qwen3VLConfig()
+
+    >>> # Initializing a model from the Qwen3-VL-4B style configuration
+    >>> model = Qwen3VLForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "qwen3_vl"
+    sub_configs = {
+        "vision_config": Qwen3VLVisionConfig,
+        "text_config": Qwen3VLTextConfig,
+    }
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        text_config=None,
+        vision_config=None,
+        image_token_id=151655,
+        video_token_id=151656,
+        vision_start_token_id=151652,
+        vision_end_token_id=151653,
+        tie_word_embeddings=False,
+        **kwargs,
+    ):
+        if isinstance(vision_config, dict):
+            self.vision_config = self.sub_configs["vision_config"](**vision_config)
+        elif vision_config is None:
+            self.vision_config = self.sub_configs["vision_config"]()
+
+        if isinstance(text_config, dict):
+            self.text_config = self.sub_configs["text_config"](**text_config)
+        elif text_config is None:
+            self.text_config = self.sub_configs["text_config"]()
+
+        self.image_token_id = image_token_id
+        self.video_token_id = video_token_id
+        self.vision_start_token_id = vision_start_token_id
+        self.vision_end_token_id = vision_end_token_id
+        super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings)
+
+
+class Qwen3VLMoeTextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Qwen3VLMoeTextModel`]. It is used to instantiate a
+    Qwen3-VL-MOE model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of
+    Qwen3-VL-30B-A3B-Instruct [Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 151936):
+            Vocabulary size of the Qwen2MoE model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`Qwen2MoeModel`]
+        hidden_size (`int`, *optional*, defaults to 2048):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 5632):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 16):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 128000):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 5000000.0):
+            The base period of the RoPE embeddings.
+        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        decoder_sparse_step (`int`, *optional*, defaults to 1):
+            The frequency of the MoE layer.
+        moe_intermediate_size (`int`, *optional*, defaults to 1408):
+            Intermediate size of the routed expert.
+        num_experts_per_tok (`int`, *optional*, defaults to 4):
+            Number of selected experts.
+        num_experts (`int`, *optional*, defaults to 60):
+            Number of routed experts.
+        norm_topk_prob (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the topk probabilities.
+        mlp_only_layers (`List[int]`, *optional*, defaults to `[]`):
+            Indicate which layers use Qwen3VLMoeMLP rather than Qwen3VLMoeSparseMoeBlock
+            The list contains layer index, from 0 to num_layers-1 if we have num_layers layers
+            If `mlp_only_layers` is empty, `decoder_sparse_step` is used to determine the sparsity.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        head_dim (`int`, *optional*):
+            The dimension of the head. If not specified, will default to `hidden_size // num_attention_heads`.
+
+    ```python
+    >>> from transformers import Qwen3VLMoeForConditionalGeneration, Qwen3VLMoeConfig
+
+    >>> # Initializing a Qwen3VLMoe style configuration
+    >>> configuration = Qwen3VLMoeConfig()
+
+    >>> # Initializing a model from the Qwen3-VL-30B-A3B style configuration
+    >>> model = Qwen3VLMoeForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "qwen3_vl_moe_text"
+    base_config_key = "text_config"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Default tensor parallel plan for base model `Qwen3VLMoe`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=151936,
+        hidden_size=2048,
+        intermediate_size=5632,
+        num_hidden_layers=24,
+        num_attention_heads=16,
+        num_key_value_heads=16,
+        hidden_act="silu",
+        max_position_embeddings=128000,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=5000000.0,
+        attention_bias=False,
+        attention_dropout=0.0,
+        decoder_sparse_step=1,
+        moe_intermediate_size=1408,
+        num_experts_per_tok=4,
+        num_experts=60,
+        norm_topk_prob=True,
+        mlp_only_layers=None,
+        rope_scaling=None,
+        head_dim=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.rope_scaling = rope_scaling
+        self.head_dim = head_dim or hidden_size // num_attention_heads
+
+        # MoE arguments
+        self.decoder_sparse_step = decoder_sparse_step
+        self.moe_intermediate_size = moe_intermediate_size
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_experts = num_experts
+        self.norm_topk_prob = norm_topk_prob
+        self.mlp_only_layers = [] if mlp_only_layers is None else mlp_only_layers
+
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+
+
+class Qwen3VLMoeVisionConfig(PretrainedConfig):
+    model_type = "qwen3_vl_moe"
+    base_config_key = "vision_config"
+
+    def __init__(
+        self,
+        depth=27,
+        hidden_size=1152,
+        hidden_act="gelu_pytorch_tanh",
+        intermediate_size=4304,
+        num_heads=16,
+        in_channels=3,
+        patch_size=16,
+        spatial_merge_size=2,
+        temporal_patch_size=2,
+        out_hidden_size=3584,
+        num_position_embeddings=2304,
+        deepstack_visual_indexes=[8, 16, 24],
+        initializer_range=0.02,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.depth = depth
+        self.hidden_size = hidden_size
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.num_heads = num_heads
+        self.in_channels = in_channels
+        self.patch_size = patch_size
+        self.spatial_merge_size = spatial_merge_size
+        self.temporal_patch_size = temporal_patch_size
+        self.out_hidden_size = out_hidden_size
+        self.num_position_embeddings = num_position_embeddings
+        self.initializer_range = initializer_range
+        self.deepstack_visual_indexes = deepstack_visual_indexes
+
+
+class Qwen3VLMoeConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Qwen3VLMoeModel`]. It is used to instantiate a
+    Qwen3-VL-MOE model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of
+    Qwen3-VL-30B-A3B-Instruct [Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Qwen3VLMoeTextConfig`):
+            The config object or dictionary of the text backbone.
+        vision_config (`Union[PreTrainedConfig, dict]`,  *optional*, defaults to `Qwen3VLMoeVisionConfig`):
+            The config object or dictionary of the vision backbone.
+        image_token_id (`int`, *optional*, defaults to 151655):
+            The image token index to encode the image prompt.
+        video_token_id (`int`, *optional*, defaults to 151656):
+            The video token index to encode the image prompt.
+        vision_start_token_id (`int`, *optional*, defaults to 151652):
+            The start token index to encode the image prompt.
+        vision_end_token_id (`int`, *optional*, defaults to 151653):
+            The end token index to encode the image prompt.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie the word embeddings.
+
+    ```python
+    >>> from transformers import Qwen3VLMoeForConditionalGeneration, Qwen3VLMoeConfig
+
+    >>> # Initializing a Qwen3-VL-MOE style configuration
+    >>> configuration = Qwen3VLMoeConfig()
+
+    >>> # Initializing a model from the Qwen3-VL-30B-A3B style configuration
+    >>> model = Qwen3VLMoeForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "qwen3_vl_moe"
+    sub_configs = {
+        "vision_config": Qwen3VLMoeVisionConfig,
+        "text_config": Qwen3VLMoeTextConfig,
+    }
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        text_config=None,
+        vision_config=None,
+        image_token_id=151655,
+        video_token_id=151656,
+        vision_start_token_id=151652,
+        vision_end_token_id=151653,
+        tie_word_embeddings=False,
+        **kwargs,
+    ):
+        if isinstance(vision_config, dict):
+            self.vision_config = self.sub_configs["vision_config"](**vision_config)
+        elif vision_config is None:
+            self.vision_config = self.sub_configs["vision_config"]()
+
+        if isinstance(text_config, dict):
+            self.text_config = self.sub_configs["text_config"](**text_config)
+        elif text_config is None:
+            self.text_config = self.sub_configs["text_config"]()
+
+        self.image_token_id = image_token_id
+        self.video_token_id = video_token_id
+        self.vision_start_token_id = vision_start_token_id
+        self.vision_end_token_id = vision_end_token_id
+        super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings)
diff --git a/sglang/python/sglang/srt/configs/radio.py b/sglang/python/sglang/srt/configs/radio.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc6df58e0ff2eaa7fd24f51bfdf1efcba0d55cb5
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/radio.py
@@ -0,0 +1,106 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/transformers_utils/configs/radio.py
+
+"""Radio vision model configuration"""
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+VIT_TIMM_DIM_BY_NAME: dict[str, tuple[int, int, int, int]] = {
+    "vit_small_patch16_224": (384, 12, 6, 1536),
+    "vit_base_patch16_224": (768, 12, 12, 3072),
+    "vit_large_patch16_224": (1024, 24, 16, 4096),
+    "vit_huge_patch16_224": (1280, 32, 16, 5120),
+}
+
+OPENAI_CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073)
+OPENAI_CLIP_STD = (0.26862954, 0.26130258, 0.27577711)
+
+
+class RadioConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a Radio
+    vision model. It is used to instantiate a Radio model according to the
+    specified arguments, defining the model architecture.
+
+    Args:
+        model_name: Name of the vision transformer model
+            (e.g., "vit_base_patch16_224"). Used to determine architecture
+            dimensions from `VIT_TIMM_DIM_BY_NAME`.
+        image_size: The size (resolution) of each image.
+        patch_size: The size (resolution) of each patch.
+        qkv_bias: Whether to add a bias to the queries, keys and values.
+        qk_normalization: Whether to apply normalization to queries and keys.
+        norm_type: The normalization type to use.
+        layer_norm_eps: The epsilon used by the layer normalization layers.
+        initializer_factor: A factor for initializing all weight matrices.
+        hidden_act: The non-linear activation function in the encoder.
+        max_img_size: Maximum image size for position embeddings.
+        norm_mean: Mean values for image normalization (RGB channels).
+            Defaults to (0.48145466, 0.4578275, 0.40821073)).
+        norm_std: Standard deviation values for image normalization
+            (RGB channels). Defaults to (0.26862954, 0.26130258, 0.27577711)).
+        reg_tokens: Number of register tokens to use.
+    """
+
+    model_type = "radio"
+
+    def __init__(
+        self,
+        model_name: str,
+        image_size: int = 224,
+        patch_size: int = 16,
+        qkv_bias: bool = True,
+        qk_normalization: bool = False,
+        norm_type: str = "layer_norm",
+        layer_norm_eps: float = 1e-6,
+        initializer_factor: float = 1.0,
+        hidden_act: str = "gelu",
+        max_img_size: int = 2048,
+        norm_mean: tuple[float, float, float] | list = OPENAI_CLIP_MEAN,
+        norm_std: tuple[float, float, float] | list = OPENAI_CLIP_STD,
+        reg_tokens: int | None = None,
+        drop_path_rate: float = 0.0,
+        dropout: float = 0.0,
+        **kwargs,
+    ):
+        self.model_name = model_name
+        (
+            self.hidden_size,
+            self.num_hidden_layers,
+            self.num_attention_heads,
+            self.intermediate_size,
+        ) = VIT_TIMM_DIM_BY_NAME[model_name]
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.qkv_bias = qkv_bias
+        self.qk_normalization = qk_normalization
+        self.norm_type = norm_type
+        self.layer_norm_eps = layer_norm_eps
+        self.initializer_factor = initializer_factor
+        self.hidden_act = hidden_act
+        self.max_img_size = max_img_size
+        self.norm_mean = (
+            list(norm_mean) if isinstance(norm_mean, (tuple, list)) else norm_mean
+        )
+        self.norm_std = (
+            list(norm_std) if isinstance(norm_std, (tuple, list)) else norm_std
+        )
+        self.reg_tokens = reg_tokens
+        self.drop_path_rate = drop_path_rate
+        self.dropout = dropout
+        super().__init__(**kwargs)
diff --git a/sglang/python/sglang/srt/configs/step3_vl.py b/sglang/python/sglang/srt/configs/step3_vl.py
new file mode 100644
index 0000000000000000000000000000000000000000..5519605c675507b753f1b20f49f47bbc76f60cfb
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/step3_vl.py
@@ -0,0 +1,172 @@
+from typing import Any, Optional, Union
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+class Step3VisionEncoderConfig(PretrainedConfig):
+    model_type = "step3_vision_encoder"
+
+    def __init__(
+        self,
+        hidden_size=1792,
+        intermediate_size=3072,
+        output_hidden_size=4096,
+        num_hidden_layers=63,
+        num_attention_heads=16,
+        num_channels=3,
+        image_size=728,
+        patch_size=14,
+        hidden_act="quick_gelu",
+        layer_norm_eps=1e-5,
+        **kwargs,
+    ):
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.output_hidden_size = output_hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.image_size = image_size
+        self.layer_norm_eps = layer_norm_eps
+        self.hidden_act = hidden_act
+        super().__init__(**kwargs)
+
+
+class Step3TextConfig(PretrainedConfig):
+    model_type = "step3_text"
+    architectures = ["Step3TextForCausalLM"]
+
+    def __init__(
+        self,
+        hidden_size: int = 7168,
+        intermediate_size: int = 18432,
+        num_attention_heads: int = 64,
+        num_attention_groups: int = 1,
+        num_hidden_layers: int = 61,
+        max_seq_len: int = 65536,
+        vocab_size: int = 128815,
+        rms_norm_eps: float = 1e-5,
+        moe_intermediate_size: int = 5120,
+        moe_num_experts: int = 48,
+        moe_top_k: int = 3,
+        rope_theta: float = 500000,
+        rope_scaling: Optional[dict[str, Any]] = None,
+        max_position_embedding: int = 65536,
+        share_expert_dim: int = 5120,
+        share_q_dim: int = 2048,
+        head_dim: int = 256,
+        norm_expert_weight: bool = False,
+        moe_layers_enum: tuple[int] = (
+            4,
+            5,
+            6,
+            7,
+            8,
+            9,
+            10,
+            11,
+            12,
+            13,
+            14,
+            15,
+            16,
+            17,
+            18,
+            19,
+            20,
+            21,
+            22,
+            23,
+            24,
+            25,
+            26,
+            27,
+            28,
+            29,
+            30,
+            31,
+            32,
+            33,
+            34,
+            35,
+            36,
+            37,
+            38,
+            39,
+            40,
+            41,
+            42,
+            43,
+            44,
+            45,
+            46,
+            47,
+            48,
+            49,
+            50,
+            51,
+            52,
+            53,
+            54,
+            55,
+            56,
+            57,
+            58,
+            59,
+        ),
+        **kwargs,
+    ) -> None:
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_attention_heads = num_attention_heads
+        self.num_attention_groups = num_attention_groups
+        self.num_hidden_layers = num_hidden_layers
+        self.max_seq_len = max_seq_len
+        self.vocab_size = vocab_size
+        self.rms_norm_eps = rms_norm_eps
+        self.moe_intermediate_size = moe_intermediate_size
+        self.moe_num_experts = moe_num_experts
+        self.moe_top_k = moe_top_k
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.max_position_embedding = max_position_embedding
+        self.share_expert_dim = share_expert_dim
+        self.share_q_dim = share_q_dim
+        self.head_dim = head_dim
+        self.norm_expert_weight = norm_expert_weight
+        self.moe_layers_enum = moe_layers_enum
+
+        super().__init__(**kwargs)
+
+
+class Step3VLConfig(PretrainedConfig):
+    model_type = "step3_vl"
+
+    def __init__(
+        self,
+        vision_config: Optional[Union[dict, Step3VisionEncoderConfig]] = None,
+        text_config: Optional[Union[dict, Step3TextConfig]] = None,
+        understand_projector_stride: int = 1,
+        projector_bias: bool = True,
+        image_token_id: int = 128001,
+        **kwargs,
+    ) -> None:
+        if vision_config is None:
+            vision_config = Step3VisionEncoderConfig()
+        elif isinstance(vision_config, dict):
+            vision_config = Step3VisionEncoderConfig(**vision_config)
+        self.vision_config = vision_config
+
+        if text_config is None:
+            text_config = Step3TextConfig()
+        elif isinstance(text_config, dict):
+            text_config = Step3TextConfig(**text_config)
+        self.text_config = text_config
+
+        self.understand_projector_stride = understand_projector_stride
+        self.projector_bias = projector_bias
+        self.hidden_size = text_config.hidden_size
+        self.image_token_id = image_token_id
+
+        super().__init__(**kwargs)
diff --git a/sglang/python/sglang/srt/configs/step3p5.py b/sglang/python/sglang/srt/configs/step3p5.py
new file mode 100644
index 0000000000000000000000000000000000000000..eebf137fbe1f27b0170af892202a72a91a5d8afd
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/step3p5.py
@@ -0,0 +1,97 @@
+from typing import Any, Optional
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+class Step3p5Config(PretrainedConfig):
+    model_type = "step3p5"
+    architectures = ["Step3p5ForCausalLM"]
+
+    def __init__(
+        self,
+        hidden_size: int = 4096,
+        intermediate_size: int = 11264,
+        num_attention_heads: int = 64,
+        num_attention_groups: int = 8,
+        num_hidden_layers: int = 45,
+        max_seq_len: int = 128000,
+        vocab_size: int = 128815,
+        rms_norm_eps: float = 1e-5,
+        moe_intermediate_size: int = 1280,
+        moe_num_experts: int = 288,
+        moe_top_k: int = 8,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[dict[str, Any]] = None,
+        max_position_embeddings: int = 128000,
+        share_expert_dims: int = 1280,
+        head_dim: int = 128,
+        norm_expert_weight: bool = True,
+        layer_types: list[str] = None,
+        sliding_window: Optional[int] = None,
+        moe_layers_enum: tuple[int] = (
+            3,
+            4,
+            5,
+            6,
+            7,
+            8,
+            9,
+            10,
+            11,
+            12,
+            13,
+            14,
+            15,
+            16,
+            17,
+            18,
+            19,
+            20,
+            21,
+            22,
+            23,
+            24,
+            25,
+            26,
+            27,
+            28,
+            29,
+            30,
+            31,
+            32,
+            33,
+            34,
+            35,
+            36,
+            37,
+            38,
+            39,
+            40,
+            41,
+            42,
+            43,
+            44,
+        ),
+        **kwargs,
+    ) -> None:
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_attention_heads = num_attention_heads
+        self.num_attention_groups = num_attention_groups
+        self.num_hidden_layers = num_hidden_layers
+        self.max_seq_len = max_seq_len
+        self.vocab_size = vocab_size
+        self.rms_norm_eps = rms_norm_eps
+        self.moe_intermediate_size = moe_intermediate_size
+        self.moe_num_experts = moe_num_experts
+        self.moe_top_k = moe_top_k
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.max_position_embeddings = max_position_embeddings
+        self.share_expert_dim = share_expert_dims
+        self.head_dim = head_dim
+        self.norm_expert_weight = norm_expert_weight
+        self.moe_layers_enum = moe_layers_enum
+        self.layer_types = layer_types
+        self.sliding_window = sliding_window
+        super().__init__(**kwargs)
diff --git a/sglang/python/sglang/srt/configs/update_config.py b/sglang/python/sglang/srt/configs/update_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..7bb6aab35725ba68ce76838241ade2fabc9a5a90
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/update_config.py
@@ -0,0 +1,211 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+DEFAULT_MOE_PADDING_SIZE = 32
+
+
+if TYPE_CHECKING:
+    from sglang.srt.configs.load_config import LoadConfig
+    from sglang.srt.configs.model_config import ModelConfig
+
+
+def may_get_weight_block_size(model_config, load_config):
+    from sglang.srt.model_loader.loader import _get_quantization_config
+
+    quant_config = _get_quantization_config(model_config, load_config)
+
+    if quant_config is not None and hasattr(quant_config, "weight_block_size"):
+        return getattr(quant_config, "weight_block_size")
+    return None
+
+
+def get_moe_padding_size(weight_block_size):
+    if weight_block_size is not None:
+        # See NOTE(HandH1998): To ensure proper alignment of the block-wise quantization scales, the output_size of the weights for both the gate and up layers must be divisible by block_n.
+        assert (
+            len(weight_block_size) == 2
+        ), "Only len(weight_block_size) == 2 is supported"
+        assert (
+            weight_block_size[0] == weight_block_size[1]
+        ), "Only weight_block_size[0] == weight_block_size[1] is supported"
+
+        return weight_block_size[0]
+
+    return DEFAULT_MOE_PADDING_SIZE
+
+
+def get_num_heads_padding_size(tp_size, weight_block_size, head_dim):
+    pad_size = tp_size
+
+    if weight_block_size is not None and head_dim % weight_block_size[0] != 0:
+        import math
+
+        pad_size = tp_size * (
+            math.lcm(head_dim, weight_block_size[0]) // weight_block_size[0]
+        )
+
+    return pad_size
+
+
+def adjust_tp_num_heads_if_necessary(model_config, tp_size, is_post_update):
+    # is_post_update: whether to update an existing config
+    from sglang.srt.layers.vocab_parallel_embedding import pad_vocab_size
+
+    # Linear attn check logic
+    if hasattr(model_config, "linear_num_key_heads") and hasattr(
+        model_config, "linear_num_value_heads"
+    ):
+        if (
+            model_config.linear_num_key_heads % tp_size != 0
+            or model_config.linear_num_value_heads % tp_size != 0
+        ):
+            pad_size = tp_size
+            linear_num_key_heads_cpu = pad_vocab_size(
+                model_config.linear_num_key_heads, pad_size
+            )
+            linear_num_value_heads_cpu = (
+                linear_num_key_heads_cpu
+                * model_config.linear_num_value_heads
+                // model_config.linear_num_key_heads
+            )
+            if is_post_update:
+                model_config.linear_num_key_heads_cpu = linear_num_key_heads_cpu
+                model_config.linear_num_value_heads_cpu = linear_num_value_heads_cpu
+            else:
+                model_config.linear_num_key_heads = linear_num_key_heads_cpu
+                model_config.linear_num_value_heads = linear_num_value_heads_cpu
+
+        else:
+            if is_post_update:
+                model_config.linear_num_key_heads_cpu = (
+                    model_config.linear_num_key_heads
+                )
+                model_config.linear_num_value_heads_cpu = (
+                    model_config.linear_num_value_heads
+                )
+
+
+def update_intermediate_size(model_config, attr_name, intermediate_padding_size):
+    attr_value = intermediate_padding_size
+    if hasattr(model_config, "hf_config") and hasattr(
+        model_config.hf_config, attr_name
+    ):
+        attr_value = getattr(model_config.hf_config, attr_name)
+    elif hasattr(model_config, attr_name):
+        attr_value = getattr(model_config, attr_name)
+
+    if attr_value % intermediate_padding_size != 0:
+        from sglang.srt.layers.vocab_parallel_embedding import pad_vocab_size
+
+        attr_value = pad_vocab_size(attr_value, intermediate_padding_size)
+        if hasattr(model_config, "hf_config"):
+            setattr(model_config.hf_config, attr_name, attr_value)
+            if hasattr(model_config, "hf_text_config"):
+                setattr(model_config.hf_text_config, attr_name, attr_value)
+        else:
+            setattr(model_config, attr_name, attr_value)
+
+    return model_config
+
+
+def adjust_config_with_unaligned_cpu_tp(
+    model_config: ModelConfig, load_config: LoadConfig, tp_size: int
+) -> ModelConfig:
+    # Support the case where the num_attention_heads is not divisible by the TP size.
+    weight_block_size = may_get_weight_block_size(model_config, load_config)
+
+    model_config.hf_config.original_num_attention_heads = (
+        model_config.num_attention_heads
+    )
+    model_config.hf_text_config.original_num_attention_heads = (
+        model_config.num_attention_heads
+    )
+
+    model_config.hf_config.original_total_num_kv_heads = (
+        model_config.get_total_num_kv_heads()
+    )
+    model_config.hf_text_config.original_total_num_kv_heads = (
+        model_config.get_total_num_kv_heads()
+    )
+
+    if (
+        model_config.num_attention_heads % tp_size != 0
+        or model_config.get_total_num_kv_heads() % tp_size != 0
+    ):
+        # Compute the head_dim using the model_config.num_attention_heads before padding
+        if not hasattr(model_config.hf_config, "head_dim"):
+            model_config.hf_config.head_dim = (
+                model_config.hidden_size // model_config.num_attention_heads
+            )
+        if hasattr(model_config.hf_config, "qk_nope_head_dim") and hasattr(
+            model_config.hf_config, "qk_rope_head_dim"
+        ):
+            model_config.hf_config.qk_head_dim = (
+                model_config.hf_config.qk_nope_head_dim
+                + model_config.hf_config.qk_rope_head_dim
+            )
+
+        query_heads_per_kv = (
+            model_config.num_attention_heads // model_config.get_total_num_kv_heads()
+        )
+        total_kv_heads = model_config.get_total_num_kv_heads()
+        from sglang.srt.layers.vocab_parallel_embedding import pad_vocab_size
+
+        head_dim = (
+            model_config.hf_config.qk_head_dim
+            if hasattr(model_config.hf_config, "qk_head_dim")
+            else model_config.hf_config.head_dim
+        )
+        pad_size = get_num_heads_padding_size(tp_size, weight_block_size, head_dim)
+        num_key_value_heads = pad_vocab_size(total_kv_heads, pad_size)
+
+        model_config.num_key_value_heads = num_key_value_heads
+        model_config.hf_config.num_key_value_heads = num_key_value_heads
+        model_config.hf_text_config.num_key_value_heads = num_key_value_heads
+
+        num_attention_heads = num_key_value_heads * query_heads_per_kv
+        model_config.num_attention_heads = num_attention_heads
+        model_config.hf_config.num_attention_heads = num_attention_heads
+        model_config.hf_text_config.num_attention_heads = num_attention_heads
+
+    adjust_tp_num_heads_if_necessary(model_config.hf_config, tp_size, True)
+
+    intermediate_padding_size = tp_size * get_moe_padding_size(weight_block_size)
+    model_config = update_intermediate_size(
+        model_config, "moe_intermediate_size", intermediate_padding_size
+    )
+    model_config = update_intermediate_size(
+        model_config, "intermediate_size", intermediate_padding_size
+    )
+    model_config = update_intermediate_size(
+        model_config, "intermediate_size_mlp", intermediate_padding_size
+    )
+    model_config = update_intermediate_size(
+        model_config, "shared_expert_intermediate_size", intermediate_padding_size
+    )
+    if (
+        hasattr(model_config.hf_config, "vision_config")
+        and model_config.hf_config.vision_config.model_type == "siglip_vision_model"
+    ):
+        model_config.hf_config.vision_config.original_num_attention_heads = (
+            model_config.num_attention_heads
+        )
+        if model_config.hf_config.vision_config.num_attention_heads % tp_size != 0:
+            model_config.hf_config.vision_config.head_dim = (
+                model_config.hf_config.vision_config.hidden_size
+                // model_config.hf_config.vision_config.num_attention_heads
+            )
+            from sglang.srt.layers.vocab_parallel_embedding import pad_vocab_size
+
+            pad_size = get_num_heads_padding_size(tp_size, weight_block_size)
+            model_config.hf_config.vision_config.num_attention_heads = pad_vocab_size(
+                model_config.hf_config.vision_config.num_attention_heads, pad_size
+            )
+        model_config.hf_config.vision_config = update_intermediate_size(
+            model_config.hf_config.vision_config,
+            "intermediate_size",
+            intermediate_padding_size,
+        )
+
+    return model_config
diff --git a/sglang/python/sglang/srt/configs/utils.py b/sglang/python/sglang/srt/configs/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..16575dfe84adc2ee19b1391add64ba0932e93938
--- /dev/null
+++ b/sglang/python/sglang/srt/configs/utils.py
@@ -0,0 +1,27 @@
+from typing import Type
+
+from transformers import (
+    AutoImageProcessor,
+    AutoProcessor,
+    BaseImageProcessor,
+    PretrainedConfig,
+    ProcessorMixin,
+)
+
+
+def register_image_processor(
+    config: Type[PretrainedConfig], image_processor: Type[BaseImageProcessor]
+):
+    """
+    register customized hf image processor while removing hf impl
+    """
+    AutoImageProcessor.register(
+        config, slow_image_processor_class=image_processor, exist_ok=True
+    )
+
+
+def register_processor(config: Type[PretrainedConfig], processor: Type[ProcessorMixin]):
+    """
+    register customized hf processor while removing hf impl
+    """
+    AutoProcessor.register(config, processor, exist_ok=True)
diff --git a/sglang/python/sglang/srt/connector/__init__.py b/sglang/python/sglang/srt/connector/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..053ab7c49ecaded5e9a83183d4101caaa6dfda51
--- /dev/null
+++ b/sglang/python/sglang/srt/connector/__init__.py
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import enum
+import logging
+
+from sglang.srt.connector.base_connector import (
+    BaseConnector,
+    BaseFileConnector,
+    BaseKVConnector,
+)
+from sglang.srt.connector.redis import RedisConnector
+from sglang.srt.connector.remote_instance import RemoteInstanceConnector
+from sglang.srt.connector.s3 import S3Connector
+from sglang.srt.utils import parse_connector_type
+
+logger = logging.getLogger(__name__)
+
+
+class ConnectorType(str, enum.Enum):
+    FS = "filesystem"
+    KV = "KV"
+    INSTANCE = "instance"
+
+
+def create_remote_connector(url, device=None, **kwargs) -> BaseConnector:
+    connector_type = parse_connector_type(url)
+    if connector_type == "redis":
+        return RedisConnector(url)
+    elif connector_type == "s3":
+        return S3Connector(url)
+    elif connector_type == "instance":
+        return RemoteInstanceConnector(url, device)
+    else:
+        raise ValueError(f"Invalid connector type: {url}")
+
+
+def get_connector_type(client: BaseConnector) -> ConnectorType:
+    if isinstance(client, BaseKVConnector):
+        return ConnectorType.KV
+    if isinstance(client, BaseFileConnector):
+        return ConnectorType.FS
+    if isinstance(client, RemoteInstanceConnector):
+        return ConnectorType.INSTANCE
+
+    raise ValueError(f"Invalid connector type: {client}")
+
+
+__all__ = [
+    "BaseConnector",
+    "BaseFileConnector",
+    "BaseKVConnector",
+    "RedisConnector",
+    "RemoteInstanceConnector",
+    "S3Connector",
+    "ConnectorType",
+    "create_remote_connector",
+    "get_connector_type",
+]
diff --git a/sglang/python/sglang/srt/connector/base_connector.py b/sglang/python/sglang/srt/connector/base_connector.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9a1c36e26317d30ff44be138fa651d56df64b51
--- /dev/null
+++ b/sglang/python/sglang/srt/connector/base_connector.py
@@ -0,0 +1,111 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+import shutil
+import signal
+import tempfile
+from abc import ABC, abstractmethod
+from typing import Generator, List, Optional, Tuple
+
+import torch
+
+
+class BaseConnector(ABC):
+    """
+    For fs connector such as s3:
+    <connector_type>://<path>/<filename>
+
+    For kv connector such as redis:
+    <connector_type>://<host>:<port>/<model_name>/keys/<key>
+    <connector_type://<host>:<port>/<model_name>/files/<filename>
+    """
+
+    def __init__(self, url: str):
+        self.url = url
+        self.closed = False
+        self.local_dir = tempfile.mkdtemp()
+        for sig in (signal.SIGINT, signal.SIGTERM):
+            existing_handler = signal.getsignal(sig)
+            signal.signal(sig, self._close_by_signal(existing_handler))
+
+    def get_local_dir(self):
+        return self.local_dir
+
+    @abstractmethod
+    def weight_iterator(
+        self, rank: int = 0
+    ) -> Generator[Tuple[str, torch.Tensor], None, None]:
+        raise NotImplementedError()
+
+    @abstractmethod
+    def pull_files(
+        self,
+        allow_pattern: Optional[List[str]] = None,
+        ignore_pattern: Optional[List[str]] = None,
+    ) -> None:
+        raise NotImplementedError()
+
+    def close(self):
+        if self.closed:
+            return
+
+        self.closed = True
+        if os.path.exists(self.local_dir):
+            shutil.rmtree(self.local_dir)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.close()
+
+    def __del__(self):
+        self.close()
+
+    def _close_by_signal(self, existing_handler=None):
+
+        def new_handler(signum, frame):
+            self.close()
+            if existing_handler:
+                existing_handler(signum, frame)
+
+        return new_handler
+
+
+class BaseKVConnector(BaseConnector):
+
+    @abstractmethod
+    def get(self, key: str) -> Optional[torch.Tensor]:
+        raise NotImplementedError()
+
+    @abstractmethod
+    def getstr(self, key: str) -> Optional[str]:
+        raise NotImplementedError()
+
+    @abstractmethod
+    def set(self, key: str, obj: torch.Tensor) -> None:
+        raise NotImplementedError()
+
+    @abstractmethod
+    def setstr(self, key: str, obj: str) -> None:
+        raise NotImplementedError()
+
+    @abstractmethod
+    def list(self, prefix: str) -> List[str]:
+        raise NotImplementedError()
+
+
+class BaseFileConnector(BaseConnector):
+    """
+    List full file names from remote fs path and filter by allow pattern.
+
+    Args:
+        allow_pattern: A list of patterns of which files to pull.
+
+    Returns:
+        list[str]: List of full paths allowed by the pattern
+    """
+
+    @abstractmethod
+    def glob(self, allow_pattern: str) -> List[str]:
+        raise NotImplementedError()
diff --git a/sglang/python/sglang/srt/connector/redis.py b/sglang/python/sglang/srt/connector/redis.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb1db3f7cc9966f490fe6d94891dd0bdfc4d1d5d
--- /dev/null
+++ b/sglang/python/sglang/srt/connector/redis.py
@@ -0,0 +1,85 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+from typing import Generator, List, Optional, Tuple
+from urllib.parse import urlparse
+
+import torch
+
+from sglang.srt.connector import BaseKVConnector
+from sglang.srt.connector.serde import create_serde
+from sglang.srt.connector.utils import pull_files_from_db
+
+logger = logging.getLogger(__name__)
+
+
+class RedisConnector(BaseKVConnector):
+
+    def __init__(self, url: str):
+        import redis
+
+        super().__init__(url)
+        parsed_url = urlparse(url)
+        self.connection = redis.Redis(host=parsed_url.hostname, port=parsed_url.port)
+        self.model_name = parsed_url.path.lstrip("/")
+        # TODO: more serde options
+        self.s, self.d = create_serde("safe")
+
+    def get(self, key: str) -> Optional[torch.Tensor]:
+        val = self.connection.get(key)
+
+        if val is None:
+            logger.error("Key %s not found", key)
+            return None
+
+        return self.d.from_bytes(val)
+
+    def getstr(self, key: str) -> Optional[str]:
+        val = self.connection.get(key)
+        if val is None:
+            logger.error("Key %s not found", key)
+            return None
+
+        return val.decode("utf-8")
+
+    def set(self, key: str, tensor: torch.Tensor) -> None:
+        assert tensor is not None
+        self.connection.set(key, self.s.to_bytes(tensor))
+
+    def setstr(self, key: str, obj: str) -> None:
+        self.connection.set(key, obj)
+
+    def list(self, prefix: str) -> List[str]:
+        cursor = 0
+        all_keys: List[bytes] = []
+
+        while True:
+            ret: Tuple[int, List[bytes]] = self.connection.scan(
+                cursor=cursor, match=f"{prefix}*"
+            )  # type: ignore
+            cursor, keys = ret
+            all_keys.extend(keys)
+            if cursor == 0:
+                break
+
+        return [key.decode("utf-8") for key in all_keys]
+
+    def weight_iterator(
+        self, rank: int = 0
+    ) -> Generator[Tuple[str, bytes], None, None]:
+        keys = self.list(f"{self.model_name}/keys/rank_{rank}/")
+        for key in keys:
+            val = self.get(key)
+            key = key.removeprefix(f"{self.model_name}/keys/rank_{rank}/")
+            yield key, val
+
+    def pull_files(
+        self,
+        allow_pattern: Optional[List[str]] = None,
+        ignore_pattern: Optional[List[str]] = None,
+    ) -> None:
+        pull_files_from_db(self, self.model_name, allow_pattern, ignore_pattern)
+
+    def close(self):
+        self.connection.close()
+        super().close()
diff --git a/sglang/python/sglang/srt/connector/remote_instance.py b/sglang/python/sglang/srt/connector/remote_instance.py
new file mode 100644
index 0000000000000000000000000000000000000000..318c362afb33a7da6b3c89a5d2a932f492bd42ad
--- /dev/null
+++ b/sglang/python/sglang/srt/connector/remote_instance.py
@@ -0,0 +1,82 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+from typing import Generator, Optional, Tuple
+from urllib.parse import urlparse
+
+import torch
+import torch.distributed as dist
+
+from sglang.srt.connector import BaseConnector
+from sglang.srt.utils import init_custom_process_group
+
+logger = logging.getLogger(__name__)
+
+
+class RemoteInstanceConnector(BaseConnector):
+
+    def __init__(self, url: str, device: torch.device = "cpu"):
+        assert (
+            device.type == "cuda" or device.type == "npu"
+        ), "RemoteInstanceConnector only supports cuda device."
+        super().__init__(url)
+        self.url = url
+        self.device = device
+
+    def build_group(
+        self,
+        gpu_id: int = -1,
+        tp_rank: int = -1,
+        instance_ip: str = None,
+        group_rank: int = 1,
+        world_size: int = 2,
+    ):
+        assert (
+            self.device.type == "cuda" or self.device.type == "npu"
+        ), "RemoteInstanceConnector only supports cuda device."
+        assert (
+            gpu_id != -1 and tp_rank != -1
+        ), "gpu_id and tp_rank must be specified for RemoteInstanceConnector. "
+
+        self.device_id = torch.device(self.device.type, gpu_id)
+
+        parsed_url = urlparse(self.url)
+        master_address = parsed_url.hostname
+        master_port = parsed_url.port
+        group_name = f"send_weights_{instance_ip}_{master_port}_{tp_rank}"
+        backend = "nccl"
+
+        logger.info(
+            f"init custom process group: master_address={master_address}, master_port={master_port}, "
+            f"rank_offset={group_rank}, world_size={world_size}, group_name={group_name}, backend={backend}"
+        )
+
+        try:
+            self._model_update_group = init_custom_process_group(
+                backend=backend,
+                init_method=f"tcp://{master_address}:{master_port}",
+                world_size=world_size,
+                rank=group_rank,
+                group_name=group_name,
+                device_id=self.device_id,
+            )
+            dist.barrier(group=self._model_update_group)
+            return True, "Succeeded to initialize custom process group."
+        except Exception as e:
+            message = f"Failed to initialize custom process group: {e}."
+            logger.error(message)
+            return False, message
+
+    # Implemented as a no-op to make BaseConnector interface consistent.
+    def pull_files(
+        self,
+        allow_pattern: Optional[list[str]] = None,
+        ignore_pattern: Optional[list[str]] = None,
+    ) -> None:
+        return
+
+    # Implemented as a no-op to make BaseConnector interface consistent.
+    def weight_iterator(
+        self, rank: int = 0
+    ) -> Generator[Tuple[str, torch.Tensor], None, None]:
+        return
diff --git a/sglang/python/sglang/srt/connector/s3.py b/sglang/python/sglang/srt/connector/s3.py
new file mode 100644
index 0000000000000000000000000000000000000000..7bef8f5d5225d0b71ab11a9743855bbc66702e28
--- /dev/null
+++ b/sglang/python/sglang/srt/connector/s3.py
@@ -0,0 +1,122 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import fnmatch
+import os
+from pathlib import Path
+from typing import Generator, Optional, Tuple
+
+import torch
+
+from sglang.srt.connector import BaseFileConnector
+
+
+def _filter_allow(paths: list[str], patterns: list[str]) -> list[str]:
+    return [
+        path
+        for path in paths
+        if any(fnmatch.fnmatch(path, pattern) for pattern in patterns)
+    ]
+
+
+def _filter_ignore(paths: list[str], patterns: list[str]) -> list[str]:
+    return [
+        path
+        for path in paths
+        if not any(fnmatch.fnmatch(path, pattern) for pattern in patterns)
+    ]
+
+
+def list_files(
+    s3,
+    path: str,
+    allow_pattern: Optional[list[str]] = None,
+    ignore_pattern: Optional[list[str]] = None,
+) -> tuple[str, str, list[str]]:
+    """
+    List files from S3 path and filter by pattern.
+
+    Args:
+        s3: S3 client to use.
+        path: The S3 path to list from.
+        allow_pattern: A list of patterns of which files to pull.
+        ignore_pattern: A list of patterns of which files not to pull.
+
+    Returns:
+        tuple[str, str, list[str]]: A tuple where:
+            - The first element is the bucket name
+            - The second element is string represent the bucket
+              and the prefix as a dir like string
+            - The third element is a list of files allowed or
+              disallowed by pattern
+    """
+    parts = path.removeprefix("s3://").split("/")
+    prefix = "/".join(parts[1:])
+    bucket_name = parts[0]
+
+    objects = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
+    paths = [obj["Key"] for obj in objects.get("Contents", [])]
+
+    paths = _filter_ignore(paths, ["*/"])
+    if allow_pattern is not None:
+        paths = _filter_allow(paths, allow_pattern)
+
+    if ignore_pattern is not None:
+        paths = _filter_ignore(paths, ignore_pattern)
+
+    return bucket_name, prefix, paths
+
+
+class S3Connector(BaseFileConnector):
+
+    def __init__(self, url: str) -> None:
+        import boto3
+
+        super().__init__(url)
+        self.client = boto3.client("s3")
+
+    def glob(self, allow_pattern: Optional[list[str]] = None) -> list[str]:
+        bucket_name, _, paths = list_files(
+            self.client, path=self.url, allow_pattern=allow_pattern
+        )
+        return [f"s3://{bucket_name}/{path}" for path in paths]
+
+    def pull_files(
+        self,
+        allow_pattern: Optional[list[str]] = None,
+        ignore_pattern: Optional[list[str]] = None,
+    ) -> None:
+        """
+        Pull files from S3 storage into the temporary directory.
+
+        Args:
+            s3_model_path: The S3 path of the model.
+            allow_pattern: A list of patterns of which files to pull.
+            ignore_pattern: A list of patterns of which files not to pull.
+
+        """
+        bucket_name, base_dir, files = list_files(
+            self.client, self.url, allow_pattern, ignore_pattern
+        )
+        if len(files) == 0:
+            return
+
+        for file in files:
+            destination_file = os.path.join(self.local_dir, file.removeprefix(base_dir))
+            local_dir = Path(destination_file).parent
+            os.makedirs(local_dir, exist_ok=True)
+            self.client.download_file(bucket_name, file, destination_file)
+
+    def weight_iterator(
+        self, rank: int = 0
+    ) -> Generator[Tuple[str, torch.Tensor], None, None]:
+        from sglang.srt.model_loader.weight_utils import (
+            runai_safetensors_weights_iterator,
+        )
+
+        # only support safetensor files now
+        hf_weights_files = self.glob(allow_pattern=["*.safetensors"])
+        return runai_safetensors_weights_iterator(hf_weights_files)
+
+    def close(self):
+        self.client.close()
+        super().close()
diff --git a/sglang/python/sglang/srt/connector/utils.py b/sglang/python/sglang/srt/connector/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..6cd12da3337801258c2dfd38e5f9333799d12bd2
--- /dev/null
+++ b/sglang/python/sglang/srt/connector/utils.py
@@ -0,0 +1,35 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+from pathlib import Path
+from typing import Optional
+from urllib.parse import urlparse
+
+from sglang.srt.connector import BaseConnector
+
+
+def parse_model_name(url: str) -> str:
+    """
+    Parse the model name from the url.
+    Only used for db connector
+    """
+    parsed_url = urlparse(url)
+    return parsed_url.path.lstrip("/")
+
+
+def pull_files_from_db(
+    connector: BaseConnector,
+    model_name: str,
+    allow_pattern: Optional[list[str]] = None,
+    ignore_pattern: Optional[list[str]] = None,
+) -> None:
+    prefix = f"{model_name}/files/"
+    local_dir = connector.get_local_dir()
+    files = connector.list(prefix)
+
+    for file in files:
+        destination_file = os.path.join(local_dir, file.removeprefix(prefix))
+        local_dir = Path(destination_file).parent
+        os.makedirs(local_dir, exist_ok=True)
+        with open(destination_file, "wb") as f:
+            f.write(connector.getstr(file).encode("utf-8"))
diff --git a/sglang/python/sglang/srt/distributed/__init__.py b/sglang/python/sglang/srt/distributed/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..12f802055c504552b3bfd540017c7874956b4078
--- /dev/null
+++ b/sglang/python/sglang/srt/distributed/__init__.py
@@ -0,0 +1,3 @@
+from sglang.srt.distributed.communication_op import *
+from sglang.srt.distributed.parallel_state import *
+from sglang.srt.distributed.utils import *
diff --git a/sglang/python/sglang/srt/distributed/__pycache__/__init__.cpython-311.pyc b/sglang/python/sglang/srt/distributed/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4c7bb60401feeddc666737d370b16ab86eca8b72
Binary files /dev/null and b/sglang/python/sglang/srt/distributed/__pycache__/__init__.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/distributed/__pycache__/communication_op.cpython-311.pyc b/sglang/python/sglang/srt/distributed/__pycache__/communication_op.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..87fbe010793a17d2cbdc846872687ada9ae49b1a
Binary files /dev/null and b/sglang/python/sglang/srt/distributed/__pycache__/communication_op.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/distributed/__pycache__/naive_distributed.cpython-311.pyc b/sglang/python/sglang/srt/distributed/__pycache__/naive_distributed.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b3efa86fc6931cb98705c0b9bfa83e75150abea0
Binary files /dev/null and b/sglang/python/sglang/srt/distributed/__pycache__/naive_distributed.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/distributed/__pycache__/parallel_state.cpython-311.pyc b/sglang/python/sglang/srt/distributed/__pycache__/parallel_state.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7ed78cace16a1c31da7da95c3da6c5231a02888f
Binary files /dev/null and b/sglang/python/sglang/srt/distributed/__pycache__/parallel_state.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/distributed/__pycache__/utils.cpython-311.pyc b/sglang/python/sglang/srt/distributed/__pycache__/utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f1b42e4a09680de4eaa9f185790462738eb9c061
Binary files /dev/null and b/sglang/python/sglang/srt/distributed/__pycache__/utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/distributed/communication_op.py b/sglang/python/sglang/srt/distributed/communication_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5428a50b6df5776ceb2daf0fc93f9a9298fc628
--- /dev/null
+++ b/sglang/python/sglang/srt/distributed/communication_op.py
@@ -0,0 +1,50 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/communication_op.py
+
+from typing import Any, Dict, Optional, Tuple, Union
+
+import torch
+import torch.distributed
+
+from .parallel_state import get_tp_group
+
+
+def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor:
+    """All-reduce the input tensor across model parallel group."""
+    return get_tp_group().all_reduce(input_)
+
+
+def tensor_model_parallel_fused_allreduce_rmsnorm(
+    input_: torch.Tensor,
+    residual_inp_: torch.Tensor,
+    weight_: torch.Tensor,
+    eps: float,
+) -> Optional[Tuple[torch.Tensor, torch.Tensor]]:
+    """Fused TP all-reduce + RMSNorm.
+
+    Policy and backend selection are owned by GroupCoordinator:
+    it may dispatch to communicator-native fused APIs, custom fused kernels,
+    or return None so callers can run generic fallback paths.
+    """
+    return get_tp_group().fused_allreduce_rmsnorm(input_, residual_inp_, weight_, eps)
+
+
+def tensor_model_parallel_all_gather(
+    input_: torch.Tensor, dim: int = -1
+) -> torch.Tensor:
+    """All-gather the input tensor across model parallel group."""
+    return get_tp_group().all_gather(input_, dim)
+
+
+def tensor_model_parallel_gather(
+    input_: torch.Tensor, dst: int = 0, dim: int = -1
+) -> Optional[torch.Tensor]:
+    """Gather the input tensor across model parallel group."""
+    return get_tp_group().gather(input_, dst, dim)
+
+
+def broadcast_tensor_dict(
+    tensor_dict: Optional[Dict[Any, Union[torch.Tensor, Any]]] = None, src: int = 0
+):
+    if not torch.distributed.is_initialized():
+        return tensor_dict
+    return get_tp_group().broadcast_tensor_dict(tensor_dict, src)
diff --git a/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/all_reduce_utils.cpython-311.pyc b/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/all_reduce_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..38f881acc90da2d8ec195ca5a0b7d1e664c0fb1f
Binary files /dev/null and b/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/all_reduce_utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/cuda_wrapper.cpython-311.pyc b/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/cuda_wrapper.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5df56c174e4c5c9bac5c5e0f28e6b730bfb4447f
Binary files /dev/null and b/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/cuda_wrapper.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/custom_all_reduce.cpython-311.pyc b/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/custom_all_reduce.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f0d0a61463b8ed356e9ba1fc3d9c1d4970a05209
Binary files /dev/null and b/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/custom_all_reduce.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/custom_all_reduce_ops.cpython-311.pyc b/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/custom_all_reduce_ops.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..436e8897bf0853a3a07bdebe35efc0ff87074992
Binary files /dev/null and b/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/custom_all_reduce_ops.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/custom_all_reduce_utils.cpython-311.pyc b/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/custom_all_reduce_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e054add7b9f75714713ab89314a003a1d5bf0efc
Binary files /dev/null and b/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/custom_all_reduce_utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/hpu_communicator.cpython-311.pyc b/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/hpu_communicator.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7a354863358e7a573732ed7a1ffb7caad8b56738
Binary files /dev/null and b/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/hpu_communicator.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/npu_communicator.cpython-311.pyc b/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/npu_communicator.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aedb43c3297d7bd6053eee13326d6d838e5e7a0c
Binary files /dev/null and b/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/npu_communicator.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/pymscclpp.cpython-311.pyc b/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/pymscclpp.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..96aee944d09b1b55c59b1a762b83b5674270798c
Binary files /dev/null and b/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/pymscclpp.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/pynccl.cpython-311.pyc b/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/pynccl.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c6c511df0fc05a32885e750237fbfe9138e43c10
Binary files /dev/null and b/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/pynccl.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/pynccl_allocator.cpython-311.pyc b/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/pynccl_allocator.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1248747953441ab743c5d45ec18062055b0e902d
Binary files /dev/null and b/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/pynccl_allocator.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/pynccl_wrapper.cpython-311.pyc b/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/pynccl_wrapper.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..faea4701113460967b36e2e4f1d57f8b66dabd24
Binary files /dev/null and b/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/pynccl_wrapper.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/shm_broadcast.cpython-311.pyc b/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/shm_broadcast.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9432ff5ecd4b67518d41a1bb6a385ea81801cb14
Binary files /dev/null and b/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/shm_broadcast.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/torch_symm_mem.cpython-311.pyc b/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/torch_symm_mem.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..96d7ec7b5c3e107d302c5ef6dc170182c3a498d8
Binary files /dev/null and b/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/torch_symm_mem.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/xpu_communicator.cpython-311.pyc b/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/xpu_communicator.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..898e05b852f62209ab160511211e7f12a5fed14b
Binary files /dev/null and b/sglang/python/sglang/srt/distributed/device_communicators/__pycache__/xpu_communicator.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/distributed/device_communicators/all_reduce_utils.py b/sglang/python/sglang/srt/distributed/device_communicators/all_reduce_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff88b6a1dcfd632f58d2f73837b71fd007e0efca
--- /dev/null
+++ b/sglang/python/sglang/srt/distributed/device_communicators/all_reduce_utils.py
@@ -0,0 +1,16 @@
+MiB = 1024 * 1024
+
+TORCH_SYMM_MEM_ALL_REDUCE_MAX_SIZES = {
+    9: {
+        2: 64 * MiB,  # 64 MB
+        4: 64 * MiB,  # 64 MB
+        6: 128 * MiB,  # 128 MB
+        8: 128 * MiB,  # 128 MB
+    },
+    10: {
+        2: 64 * MiB,  # 64 MB
+        4: 64 * MiB,  # 64 MB
+        6: 128 * MiB,  # 128 MB
+        8: 128 * MiB,  # 128 MB
+    },
+}
diff --git a/sglang/python/sglang/srt/distributed/device_communicators/cuda_wrapper.py b/sglang/python/sglang/srt/distributed/device_communicators/cuda_wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1dc99bd29b3fcc54ac997b7d2c1a914c1661b12
--- /dev/null
+++ b/sglang/python/sglang/srt/distributed/device_communicators/cuda_wrapper.py
@@ -0,0 +1,187 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/device_communicators/cuda_wrapper.py
+
+"""This file is a pure Python wrapper for the cudart library.
+It avoids the need to compile a separate shared library, and is
+convenient for use when we just need to call a few functions.
+"""
+
+import ctypes
+import logging
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional
+
+# this line makes it possible to directly load `libcudart.so` using `ctypes`
+import torch  # noqa
+
+from sglang.srt.utils import is_musa
+
+_is_musa = is_musa()
+
+logger = logging.getLogger(__name__)
+
+# === export types and functions from cudart to Python ===
+# for the original cudart definition, please check
+# https://docs.nvidia.com/cuda/cuda-runtime-api/index.html
+
+cudaError_t = ctypes.c_int
+cudaMemcpyKind = ctypes.c_int
+
+
+class cudaIpcMemHandle_t(ctypes.Structure):
+    _fields_ = [("internal", ctypes.c_byte * 128)]
+
+
+@dataclass
+class Function:
+    name: str
+    restype: Any
+    argtypes: List[Any]
+
+
+def find_loaded_library(lib_name) -> Optional[str]:
+    """
+    According to according to https://man7.org/linux/man-pages/man5/proc_pid_maps.5.html,
+    the file `/proc/self/maps` contains the memory maps of the process, which includes the
+    shared libraries loaded by the process. We can use this file to find the path of the
+    a loaded library.
+    """  # noqa
+    found = False
+    with open("/proc/self/maps") as f:
+        for line in f:
+            if lib_name in line:
+                found = True
+                break
+    if not found:
+        # the library is not loaded in the current process
+        return None
+    # if lib_name is libcudart, we need to match a line with:
+    # address /path/to/libcudart-hash.so.11.0
+    start = line.index("/")
+    path = line[start:].strip()
+    filename = path.split("/")[-1]
+    assert filename.rpartition(".so")[0].startswith(
+        lib_name
+    ), f"Unexpected filename: {filename} for library {lib_name}"
+    return path
+
+
+class CudaRTLibrary:
+    exported_functions = [
+        # ​cudaError_t cudaSetDevice ( int  device )
+        Function("cudaSetDevice", cudaError_t, [ctypes.c_int]),
+        # cudaError_t 	cudaDeviceSynchronize ( void )
+        Function("cudaDeviceSynchronize", cudaError_t, []),
+        # ​cudaError_t cudaDeviceReset ( void )
+        Function("cudaDeviceReset", cudaError_t, []),
+        # const char* 	cudaGetErrorString ( cudaError_t error )
+        Function("cudaGetErrorString", ctypes.c_char_p, [cudaError_t]),
+        # ​cudaError_t 	cudaMalloc ( void** devPtr, size_t size )
+        Function(
+            "cudaMalloc",
+            cudaError_t,
+            [ctypes.POINTER(ctypes.c_void_p), ctypes.c_size_t],
+        ),
+        # ​cudaError_t 	cudaFree ( void* devPtr )
+        Function("cudaFree", cudaError_t, [ctypes.c_void_p]),
+        # ​cudaError_t cudaMemset ( void* devPtr, int  value, size_t count )
+        Function(
+            "cudaMemset", cudaError_t, [ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t]
+        ),
+        # ​cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind ) # noqa
+        Function(
+            "cudaMemcpy",
+            cudaError_t,
+            [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, cudaMemcpyKind],
+        ),
+        # cudaError_t cudaIpcGetMemHandle ( cudaIpcMemHandle_t* handle, void* devPtr ) # noqa
+        Function(
+            "cudaIpcGetMemHandle",
+            cudaError_t,
+            [ctypes.POINTER(cudaIpcMemHandle_t), ctypes.c_void_p],
+        ),
+        # ​cudaError_t cudaIpcOpenMemHandle ( void** devPtr, cudaIpcMemHandle_t handle, unsigned int  flags ) # noqa
+        Function(
+            "cudaIpcOpenMemHandle",
+            cudaError_t,
+            [ctypes.POINTER(ctypes.c_void_p), cudaIpcMemHandle_t, ctypes.c_uint],
+        ),
+    ]
+
+    # class attribute to store the mapping from the path to the library
+    # to avoid loading the same library multiple times
+    path_to_library_cache: Dict[str, Any] = {}
+
+    # class attribute to store the mapping from library path
+    #  to the corresponding dictionary
+    path_to_dict_mapping: Dict[str, Dict[str, Any]] = {}
+
+    def __init__(self, so_file: Optional[str] = None):
+        if so_file is None:
+            so_file = find_loaded_library("libcudart" if not _is_musa else "libmusart")
+            assert so_file is not None, "libcudart is not loaded in the current process"
+        if so_file not in CudaRTLibrary.path_to_library_cache:
+            lib = ctypes.CDLL(so_file)
+            CudaRTLibrary.path_to_library_cache[so_file] = lib
+        self.lib = CudaRTLibrary.path_to_library_cache[so_file]
+
+        if so_file not in CudaRTLibrary.path_to_dict_mapping:
+            _funcs = {}
+            for func in CudaRTLibrary.exported_functions:
+                f = getattr(self.lib, func.name)
+                f.restype = func.restype
+                f.argtypes = func.argtypes
+                _funcs[func.name] = f
+            CudaRTLibrary.path_to_dict_mapping[so_file] = _funcs
+        self.funcs = CudaRTLibrary.path_to_dict_mapping[so_file]
+
+    def CUDART_CHECK(self, result: cudaError_t) -> None:
+        if result != 0:
+            error_str = self.cudaGetErrorString(result)
+            raise RuntimeError(f"CUDART error: {error_str}")
+
+    def cudaGetErrorString(self, error: cudaError_t) -> str:
+        return self.funcs["cudaGetErrorString"](error).decode("utf-8")
+
+    def cudaSetDevice(self, device: int) -> None:
+        self.CUDART_CHECK(self.funcs["cudaSetDevice"](device))
+
+    def cudaDeviceSynchronize(self) -> None:
+        self.CUDART_CHECK(self.funcs["cudaDeviceSynchronize"]())
+
+    def cudaDeviceReset(self) -> None:
+        self.CUDART_CHECK(self.funcs["cudaDeviceReset"]())
+
+    def cudaMalloc(self, size: int) -> ctypes.c_void_p:
+        devPtr = ctypes.c_void_p()
+        self.CUDART_CHECK(self.funcs["cudaMalloc"](ctypes.byref(devPtr), size))
+        return devPtr
+
+    def cudaFree(self, devPtr: ctypes.c_void_p) -> None:
+        self.CUDART_CHECK(self.funcs["cudaFree"](devPtr))
+
+    def cudaMemset(self, devPtr: ctypes.c_void_p, value: int, count: int) -> None:
+        self.CUDART_CHECK(self.funcs["cudaMemset"](devPtr, value, count))
+
+    def cudaMemcpy(
+        self, dst: ctypes.c_void_p, src: ctypes.c_void_p, count: int
+    ) -> None:
+        cudaMemcpyDefault = 4
+        kind = cudaMemcpyDefault
+        self.CUDART_CHECK(self.funcs["cudaMemcpy"](dst, src, count, kind))
+
+    def cudaIpcGetMemHandle(self, devPtr: ctypes.c_void_p) -> cudaIpcMemHandle_t:
+        handle = cudaIpcMemHandle_t()
+        self.CUDART_CHECK(
+            self.funcs["cudaIpcGetMemHandle"](ctypes.byref(handle), devPtr)
+        )
+        return handle
+
+    def cudaIpcOpenMemHandle(self, handle: cudaIpcMemHandle_t) -> ctypes.c_void_p:
+        cudaIpcMemLazyEnablePeerAccess = 1
+        devPtr = ctypes.c_void_p()
+        self.CUDART_CHECK(
+            self.funcs["cudaIpcOpenMemHandle"](
+                ctypes.byref(devPtr), handle, cudaIpcMemLazyEnablePeerAccess
+            )
+        )
+        return devPtr
diff --git a/sglang/python/sglang/srt/distributed/device_communicators/custom_all_reduce.py b/sglang/python/sglang/srt/distributed/device_communicators/custom_all_reduce.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6ca22ca4b798030b249ac3a6f05d37c2c9a11c1
--- /dev/null
+++ b/sglang/python/sglang/srt/distributed/device_communicators/custom_all_reduce.py
@@ -0,0 +1,507 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/device_communicators/custom_all_reduce.py
+
+import ctypes
+import logging
+import os
+from contextlib import contextmanager
+from typing import Any, List, Optional, Union
+
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+import sglang.srt.distributed.device_communicators.custom_all_reduce_ops as ops
+from sglang.srt.compilation.piecewise_context_manager import is_in_piecewise_cuda_graph
+from sglang.srt.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
+from sglang.srt.distributed.device_communicators.custom_all_reduce_utils import (
+    gpu_p2p_access_check,
+    is_full_nvlink,
+    is_weak_contiguous,
+)
+from sglang.srt.distributed.parallel_state import in_the_same_node_as
+from sglang.srt.environ import envs
+from sglang.srt.utils import (
+    get_bool_env_var,
+    is_cuda,
+    is_hip,
+    is_musa,
+    log_info_on_rank0,
+)
+
+_is_cuda = is_cuda()
+_is_hip = is_hip()
+_is_musa = is_musa()
+
+logger = logging.getLogger(__name__)
+
+
+def _can_p2p(rank: int, world_size: int) -> bool:
+    # SGLANG_SKIP_P2P_CHECK can be set to False in sglang
+    SGLANG_SKIP_P2P_CHECK = os.getenv("SGLANG_SKIP_P2P_CHECK", "0") == "1"
+    for i in range(world_size):
+        if i == rank:
+            continue
+        if SGLANG_SKIP_P2P_CHECK:
+            logger.info("Skipping P2P check and trusting the driver's P2P report.")
+            return torch.cuda.can_device_access_peer(rank, i)
+        if not gpu_p2p_access_check(rank, i):
+            return False
+    return True
+
+
+class CustomAllreduce:
+    _SUPPORTED_WORLD_SIZES = [2, 4, 6, 8]
+    _MAX_CAR_SIZE = 8192 * 1024
+    if _is_hip:
+        # crossover is at 16MB buffer size for ROCm
+        _MAX_CAR_SIZE = 2 * 8192 * 1024
+    if _is_musa:
+        # crossover is at 128MB buffer size for MUSA
+        _MAX_CAR_SIZE = 16 * 8196 * 1024
+
+    # max_size: max supported allreduce size
+    def __init__(
+        self,
+        group: ProcessGroup,
+        device: Union[int, str, torch.device],
+        max_size=_MAX_CAR_SIZE,
+    ) -> None:
+        """
+        Args:
+            group: the process group to work on. If None, it will use the
+                default process group.
+            device: the device to bind the CustomAllreduce to. If None,
+                it will be bind to f"cuda:{local_rank}".
+        It is the caller's responsibility to make sure each communicator
+        is bind to a unique device, and all communicators in this group
+        are in the same node.
+        """
+        self._IS_CAPTURING = False
+        self.disabled = True  # This can be modified in-place by context manager in piecewise cuda graph runner
+        self.original_disabled = True  # To store the original state
+
+        if not ops.IS_CUSTOM_AR_AVAILABLE:
+            # disable because of missing custom allreduce library
+            # e.g. in a non-cuda environment
+            return
+
+        self.group = group
+
+        assert (
+            dist.get_backend(group) != dist.Backend.NCCL
+        ), "CustomAllreduce should be attached to a non-NCCL group."
+
+        if not all(in_the_same_node_as(group, source_rank=0)):
+            # No need to initialize custom allreduce for multi-node case.
+            logger.warning(
+                "Custom allreduce is disabled because this process group"
+                " spans across nodes."
+            )
+            return
+
+        rank = dist.get_rank(group=self.group)
+        world_size = dist.get_world_size(group=self.group)
+        if world_size == 1:
+            # No need to initialize custom allreduce for single GPU case.
+            return
+
+        if world_size not in CustomAllreduce._SUPPORTED_WORLD_SIZES:
+            logger.warning(
+                "Custom allreduce is disabled due to an unsupported world"
+                " size: %d. Supported world sizes: %s. To silence this "
+                "warning, specify disable_custom_all_reduce=True explicitly.",
+                world_size,
+                str(CustomAllreduce._SUPPORTED_WORLD_SIZES),
+            )
+            return
+
+        if isinstance(device, int):
+            device = torch.device(f"cuda:{device}")
+        elif isinstance(device, str):
+            device = torch.device(device)
+        # now `device` is a `torch.device` object
+        assert isinstance(device, torch.device)
+        self.device = device
+
+        cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
+        if cuda_visible_devices:
+            device_ids = list(map(int, cuda_visible_devices.split(",")))
+        else:
+            device_ids = list(range(torch.cuda.device_count()))
+
+        physical_device_id = device_ids[device.index]
+        tensor = torch.tensor([physical_device_id], dtype=torch.int, device="cpu")
+        gather_list = [
+            torch.tensor([0], dtype=torch.int, device="cpu") for _ in range(world_size)
+        ]
+        dist.all_gather(gather_list, tensor, group=self.group)
+        physical_device_ids = [t.item() for t in gather_list]
+
+        # test nvlink first, this will filter out most of the cases
+        # where custom allreduce is not supported
+        # this checks hardware and driver support for NVLink
+        if _is_cuda or _is_hip or _is_musa:
+            full_nvlink = is_full_nvlink(physical_device_ids, world_size)
+
+        if world_size > 2 and not full_nvlink:
+            logger.warning(
+                "Custom allreduce is disabled because it's not supported on"
+                " more than two PCIe-only GPUs. To silence this warning, "
+                "specify disable_custom_all_reduce=True explicitly."
+            )
+            return
+        # test P2P capability, this checks software/cudaruntime support
+        # this is expensive to compute at the first time
+        # then we cache the result
+        # On AMD GPU, p2p is always enabled between XGMI connected GPUs
+        if not _is_hip and not _can_p2p(rank, world_size):
+            logger.warning(
+                "Custom allreduce is disabled because your platform lacks "
+                "GPU P2P capability or P2P test failed. To silence this "
+                "warning, specify disable_custom_all_reduce=True explicitly."
+            )
+            return
+
+        self.max_size = max_size
+        self.rank = rank
+        self.world_size = world_size
+        self.full_nvlink = full_nvlink
+
+        if not _is_hip:
+            # Buffers memory are owned by this Python class and passed to C++.
+            # Meta data composes of two parts: meta data for synchronization and a
+            # temporary buffer for storing intermediate allreduce results.
+            self.meta_ptrs = self.create_shared_buffer(
+                ops.meta_size() + max_size, group=group
+            )
+            # This is a pre-registered IPC buffer. In eager mode, input tensors
+            # are first copied into this buffer before allreduce is performed
+            self.buffer_ptrs = self.create_shared_buffer(max_size, group=group)
+            # This is a buffer for storing the tuples of pointers pointing to
+            # IPC buffers from all ranks. Each registered tuple has size of
+            # 8*world_size bytes where world_size is at most 8. Allocating 8MB
+            # is enough for 131072 such tuples. The largest model I've seen only
+            # needs less than 10000 of registered tuples.
+            self.rank_data = torch.empty(
+                max_size, dtype=torch.uint8, device=self.device
+            )
+            self._ptr = ops.init_custom_ar(
+                self.meta_ptrs, self.rank_data, rank, self.full_nvlink
+            )
+            ops.register_buffer(self._ptr, self.buffer_ptrs)
+        else:
+            # meta data buffers need to be "uncached" for signal on MI200
+            self.meta = ops.allocate_meta_buffer(ops.meta_size() + max_size)
+            self.buffer = torch.empty(max_size, dtype=torch.uint8, device=self.device)
+            handle = ops.get_meta_buffer_ipc_handle(self.meta)
+            shard_data = (
+                bytes(handle),  # ipc handle to base ptr
+                0,  # offset of base ptr
+            )
+            handles, offsets = self._gather_ipc_meta(shard_data)
+            self.rank_data = torch.empty(
+                max_size, dtype=torch.uint8, device=self.device
+            )
+            self._ptr = ops.init_custom_ar(
+                self.meta, self.rank_data, handles, offsets, rank, self.full_nvlink
+            )
+            self.register_buffer(self.buffer)
+
+        self.disabled = False
+        self.original_disabled = False  # Ensure original_disabled == disabled
+        self.tms_cudagraph = envs.SGLANG_MEMORY_SAVER_CUDA_GRAPH.get()
+
+    @staticmethod
+    def create_shared_buffer(
+        size_in_bytes: int, group: Optional[ProcessGroup] = None
+    ) -> List[int]:
+        """
+        Creates a shared buffer and returns a list of pointers
+        representing the buffer on all processes in the group.
+        """
+        lib = CudaRTLibrary()
+        pointer = lib.cudaMalloc(size_in_bytes)
+        if _is_musa:
+            lib.cudaMemset(pointer, 0, size_in_bytes)
+        handle = lib.cudaIpcGetMemHandle(pointer)
+        world_size = dist.get_world_size(group=group)
+        rank = dist.get_rank(group=group)
+        handles = [None] * world_size
+        dist.all_gather_object(handles, handle, group=group)
+
+        pointers: List[int] = []
+        for i, h in enumerate(handles):
+            if i == rank:
+                pointers.append(pointer.value)  # type: ignore
+            else:
+                pointers.append(lib.cudaIpcOpenMemHandle(h).value)  # type: ignore
+
+        return pointers
+
+    @staticmethod
+    def free_shared_buffer(
+        pointers: List[int], group: Optional[ProcessGroup] = None
+    ) -> None:
+        rank = dist.get_rank(group=group)
+        lib = CudaRTLibrary()
+        lib.cudaFree(ctypes.c_void_p(pointers[rank]))
+
+    @contextmanager
+    def capture(self):
+        """
+        The main responsibility of this context manager is the
+        `register_graph_buffers` call at the end of the context.
+        It records all the buffer addresses used in the CUDA graph.
+        """
+        try:
+            self._IS_CAPTURING = True
+            yield
+        finally:
+            self._IS_CAPTURING = False
+            if not self.disabled:
+                self.register_graph_buffers()
+
+    def _get_ipc_meta(self, inp: torch.Tensor):
+        # _share_cuda_() doesn't accept meta buffer not allocated from
+        # PyTorch cache allocator, use direct HIP call to get IPC handle
+        handle = ops.get_meta_buffer_ipc_handle(inp)
+        shard_data = (
+            bytes(handle),  # ipc handle to base ptr
+            0,  # offset of base ptr
+        )
+        return self._gather_ipc_meta(shard_data)
+
+    def _gather_ipc_meta(self, shard_data):
+        # Note: don't use `[[None]] * self.world_size` here
+        # because it will create a list of the same reference
+        all_data: List[Optional[Any]] = [[None] for i in range(self.world_size)]
+        all_data[self.rank][0] = shard_data
+
+        ranks = dist.get_process_group_ranks(group=self.group)
+        ranks.sort()
+        for i, rank in enumerate(ranks):
+            dist.broadcast_object_list(
+                all_data[i], src=rank, group=self.group, device="cpu"
+            )
+
+        # we cannot directly use `dist.all_gather_object` here
+        # because it is incompatible with `gloo` backend under inference mode.
+        # see https://github.com/pytorch/pytorch/issues/126032 for details.
+
+        handles = []
+        offsets = []
+        for i in range(len(all_data)):
+            handles.append(all_data[i][0][0])  # type: ignore
+            offsets.append(all_data[i][0][1])  # type: ignore
+        return handles, offsets
+
+    def register_buffer(self, inp: torch.Tensor):
+        handles, offsets = self._get_ipc_meta(inp)
+        ops.register_buffer(self._ptr, inp, handles, offsets)
+
+    def register_graph_buffers(self):
+        if _is_hip:
+            handle, offset = ops.get_graph_buffer_ipc_meta(self._ptr)
+            handles, offsets = self._gather_ipc_meta((bytes(handle), offset))
+            log_info_on_rank0(logger, f"Registering {len(offset)} cuda graph addresses")
+            ops.register_graph_buffers(self._ptr, handles, offsets)
+        else:
+            handle, offset = ops.get_graph_buffer_ipc_meta(self._ptr)
+            log_info_on_rank0(logger, f"Registering {len(offset)} cuda graph addresses")
+            # We cannot directly use `dist.all_gather_object` here
+            # because it is incompatible with `gloo` backend under inference mode.
+            # see https://github.com/pytorch/pytorch/issues/126032 for details.
+            all_data = [
+                [None, None] for _ in range(dist.get_world_size(group=self.group))
+            ]
+            all_data[self.rank] = [handle, offset]
+            ranks = sorted(dist.get_process_group_ranks(group=self.group))
+            for i, rank in enumerate(ranks):
+                dist.broadcast_object_list(
+                    all_data[i], src=rank, group=self.group, device="cpu"
+                )
+            # Unpack list of tuples to tuple of lists.
+            handles = [d[0] for d in all_data]  # type: ignore
+            offsets = [d[1] for d in all_data]  # type: ignore
+            ops.register_graph_buffers(self._ptr, handles, offsets)
+
+    def should_custom_ar(self, inp: torch.Tensor):
+        if self.disabled:
+            return False
+        inp_size = inp.numel() * inp.element_size()
+        # custom allreduce requires input byte size to be multiples of 16
+        if inp_size % 16 != 0:
+            return False
+        if not is_weak_contiguous(inp):
+            return False
+        # for 4 or more non NVLink-capable GPUs, custom allreduce provides
+        # little performance improvement over NCCL.
+        if not _is_hip:
+            if self.world_size == 2 or self.full_nvlink:
+                return inp_size <= self.max_size
+            return False
+
+        if _is_hip:
+            if self.full_nvlink:
+                return inp_size <= self.max_size
+            return False
+
+        return False
+
+    # all reduce, assuming inp tensor is IPC registered with register_buffer,
+    # or, in the context of cuda graphs, register_graph_buffers
+    def all_reduce_reg(self, inp: torch.Tensor, out: torch.Tensor = None):
+        if out is None:
+            out = torch.empty_like(inp)
+        ops.all_reduce_reg(self._ptr, inp, out)
+        return out
+
+    # all reduce, assuming inp tensor is NOT IPC registered
+    def all_reduce_unreg(self, inp: torch.Tensor, out: torch.Tensor = None):
+        if out is None:
+            out = torch.empty_like(inp)
+        ops.all_reduce_unreg(self._ptr, inp, self.buffer, out)
+        return out
+
+    def all_reduce(
+        self,
+        inp: torch.Tensor,
+        *,
+        out: torch.Tensor = None,
+        registered: bool = False,
+    ):
+        """Performs an out-of-place all reduce.
+
+        If registered is True, this assumes inp's pointer is already
+        IPC-registered. Otherwise, inp is first copied into a pre-registered
+        buffer.
+        """
+        if out is None:
+            out = torch.empty_like(inp)
+        if registered:
+            ops.all_reduce(self._ptr, inp, out, 0, 0)
+        else:
+            ops.all_reduce(
+                self._ptr, inp, out, self.buffer_ptrs[self.rank], self.max_size
+            )
+        return out
+
+    def deterministic_all_reduce(
+        self,
+        inp: torch.Tensor,
+        *,
+        out: torch.Tensor = None,
+        registered: bool = False,
+    ):
+        """Deterministic all-reduce using 1-stage kernel with fixed ordering (AMD only)."""
+        if out is None:
+            out = torch.empty_like(inp)
+        if registered:
+            ops.deterministic_all_reduce_reg(self._ptr, inp, out)
+        else:
+            reg_buffer = self.buffer.view(inp.dtype)[: inp.numel()]
+            ops.deterministic_all_reduce_unreg(self._ptr, inp, reg_buffer, out)
+        return out
+
+    def custom_all_reduce(self, input: torch.Tensor) -> Optional[torch.Tensor]:
+        """The main allreduce API that provides support for cuda graph."""
+        # When custom allreduce is disabled, this will be None.
+        if self.disabled or not self.should_custom_ar(input):
+            return None
+        if self._IS_CAPTURING:
+            if torch.cuda.is_current_stream_capturing():
+                if _is_hip:
+                    if self.tms_cudagraph:
+                        return self.all_reduce_unreg(input)
+                    return self.all_reduce_reg(input)
+                else:
+                    return self.all_reduce(input, registered=not self.tms_cudagraph)
+            else:
+                # Could be warmup OR piecewise cuda graph split op execution.
+                # In piecewise cuda graph, split ops run eagerly outside the graph
+                # but _IS_CAPTURING is still True. We need to do real all-reduce.
+                if is_in_piecewise_cuda_graph():
+                    # Split op execution - do real all-reduce
+                    if _is_hip:
+                        return self.all_reduce_unreg(input)
+                    else:
+                        return self.all_reduce(input, registered=False)
+                else:
+                    # True warmup - mimic the allocation pattern since custom
+                    # allreduce is out-of-place.
+                    return torch.zeros_like(input)
+        else:
+            if _is_hip:
+                # note: outside of cuda graph context,
+                # custom allreduce incurs a cost of cudaMemcpy, which should
+                # be small(<=1% of overall latency) compared to the performance
+                # gains of using custom kernels
+                return self.all_reduce_unreg(input)
+            else:
+                return self.all_reduce(input, registered=False)
+
+    def close(self):
+        if not self.disabled and self._ptr:
+            ops.dispose(self._ptr)
+            if _is_cuda:
+                self.free_shared_buffer(self.meta_ptrs)
+                self.free_shared_buffer(self.buffer_ptrs)
+            self._ptr = 0
+
+    def __del__(self):
+        self.close()
+
+
+def dispatch_custom_allreduce():
+    """Return the CustomAllreduce class to use (aiter on ROCm if enabled).
+
+    On AMD with 1-stage AR enabled, use sglang's CustomAllreduce (has deterministic_all_reduce method).
+    Otherwise use AiterCustomAllreduce if available.
+    """
+    if _is_cuda or _is_musa:
+        return CustomAllreduce
+
+    assert _is_hip
+
+    if envs.SGLANG_USE_1STAGE_ALLREDUCE.is_set():
+        if envs.SGLANG_USE_1STAGE_ALLREDUCE.get():
+            logger.debug(
+                "[AR] All-reduce: 1-stage kernel (SGLANG_USE_1STAGE_ALLREDUCE=1)"
+            )
+        else:
+            logger.debug("[AR] All-reduce: default (SGLANG_USE_1STAGE_ALLREDUCE=0)")
+    elif envs.SGLANG_ENABLE_DETERMINISTIC_INFERENCE.get():
+        logger.debug(
+            "[AR] All-reduce: 1-stage kernel (deterministic inference enabled)"
+        )
+    else:
+        logger.debug("[AR] All-reduce: default")
+
+    # Check if 1-stage AR should be used
+    if envs.SGLANG_USE_1STAGE_ALLREDUCE.is_set():
+        use_1stage = envs.SGLANG_USE_1STAGE_ALLREDUCE.get()
+    else:
+        use_1stage = envs.SGLANG_ENABLE_DETERMINISTIC_INFERENCE.get()
+
+    # On AMD with 1-stage AR, use sglang's CustomAllreduce
+    # (AiterCustomAllreduce doesn't have deterministic_all_reduce method)
+    if use_1stage:
+        return CustomAllreduce
+
+    if get_bool_env_var("SGLANG_USE_AITER_AR", default="true"):
+        try:
+            from aiter.dist.device_communicators.custom_all_reduce import (
+                CustomAllreduce as AiterCustomAllreduce,
+            )
+
+            logger.info("[AR] Using AiterCustomAllreduce (AMD default)")
+            return AiterCustomAllreduce
+        except ImportError as e:
+            logger.warning(
+                "[AR] Aiter custom all-reduce not available; "
+                "falling back to sglang CustomAllreduce. Details: %s",
+                e,
+            )
+            return CustomAllreduce
+
+    return CustomAllreduce
diff --git a/sglang/python/sglang/srt/distributed/device_communicators/custom_all_reduce_ops.py b/sglang/python/sglang/srt/distributed/device_communicators/custom_all_reduce_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..55879e55fb617cff782c93cde377427282b43a87
--- /dev/null
+++ b/sglang/python/sglang/srt/distributed/device_communicators/custom_all_reduce_ops.py
@@ -0,0 +1,208 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/_custom_ops.py
+import logging
+from typing import List, Optional, Tuple
+
+import torch
+
+from sglang.srt.utils import is_cuda, is_hip, is_musa
+
+logger = logging.getLogger(__name__)
+
+_is_cuda = is_cuda()
+_is_hip = is_hip()
+_is_musa = is_musa()
+
+IS_CUSTOM_AR_AVAILABLE = _is_cuda or _is_hip or _is_musa
+IS_QUICK_AR_AVAILABLE = _is_hip
+# TODO(zyksir): mscclpp is untested on AMD and therefore disabled.
+IS_MSCCLPP_AR_AVAILABLE = _is_cuda
+
+try:
+    import sgl_kernel.allreduce as _custom_ar
+except ImportError as e:
+    if _is_cuda or _is_hip:
+        logger.warning("Failed to import from custom_ar with %r", e)
+    IS_CUSTOM_AR_AVAILABLE = False
+    IS_QUICK_AR_AVAILABLE = False
+    IS_MSCCLPP_AR_AVAILABLE = False
+
+# region IS_CUSTOM_AR_AVAILABLE
+
+if not IS_CUSTOM_AR_AVAILABLE:
+    pass
+
+elif _is_cuda or _is_musa:
+    # CUDA custom allreduce
+
+    def init_custom_ar(
+        ipc_tensors: List[torch.Tensor],
+        rank_data: torch.Tensor,
+        rank: int,
+        full_nvlink: bool,
+    ) -> int:
+        return _custom_ar.init_custom_ar(ipc_tensors, rank_data, rank, full_nvlink)
+
+    def all_reduce(
+        fa: int,
+        inp: torch.Tensor,
+        out: torch.Tensor,
+        reg_buffer: int,
+        reg_buffer_sz_bytes: int,
+    ) -> None:
+        _custom_ar.all_reduce(fa, inp, out, reg_buffer, reg_buffer_sz_bytes)
+
+    def dispose(fa: int) -> None:
+        _custom_ar.dispose(fa)
+
+    def meta_size() -> int:
+        return _custom_ar.meta_size()
+
+    def register_buffer(fa: int, ipc_tensors: List[int]) -> None:
+        return _custom_ar.register_buffer(fa, ipc_tensors)
+
+    def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[int], List[int]]:
+        return _custom_ar.get_graph_buffer_ipc_meta(fa)
+
+    def register_graph_buffers(
+        fa: int, handles: List[List[int]], offsets: List[List[int]]
+    ) -> None:
+        _custom_ar.register_graph_buffers(fa, handles, offsets)
+
+elif _is_hip:
+    # ROCM custom allreduce
+
+    def init_custom_ar(
+        meta: torch.Tensor,
+        rank_data: torch.Tensor,
+        handles: List[str],
+        offsets: List[int],
+        rank: int,
+        full_nvlink: bool,
+    ) -> int:
+        return _custom_ar.init_custom_ar(
+            meta, rank_data, handles, offsets, rank, full_nvlink
+        )
+
+    def all_reduce_reg(fa: int, inp: torch.Tensor, out: torch.Tensor) -> None:
+        _custom_ar.all_reduce_reg(fa, inp, out)
+
+    def all_reduce_unreg(
+        fa: int, inp: torch.Tensor, reg_buffer: torch.Tensor, out: torch.Tensor
+    ) -> None:
+        _custom_ar.all_reduce_unreg(fa, inp, reg_buffer, out)
+
+    def deterministic_all_reduce_reg(
+        fa: int, inp: torch.Tensor, out: torch.Tensor
+    ) -> None:
+        _custom_ar.deterministic_all_reduce_reg(fa, inp, out)
+
+    def deterministic_all_reduce_unreg(
+        fa: int, inp: torch.Tensor, reg_buffer: torch.Tensor, out: torch.Tensor
+    ) -> None:
+        _custom_ar.deterministic_all_reduce_unreg(fa, inp, reg_buffer, out)
+
+    def dispose(fa: int) -> None:
+        _custom_ar.dispose(fa)
+
+    def meta_size() -> int:
+        return _custom_ar.meta_size()
+
+    def register_buffer(
+        fa: int, t: torch.Tensor, handles: List[str], offsets: List[int]
+    ) -> None:
+        return _custom_ar.register_buffer(fa, t, handles, offsets)
+
+    def get_graph_buffer_ipc_meta(fa: int) -> Tuple[torch.Tensor, List[int]]:
+        return _custom_ar.get_graph_buffer_ipc_meta(fa)
+
+    def register_graph_buffers(
+        fa: int, handles: List[str], offsets: List[List[int]]
+    ) -> None:
+        _custom_ar.register_graph_buffers(fa, handles, offsets)
+
+    def allocate_meta_buffer(size: int) -> torch.Tensor:
+        return _custom_ar.allocate_meta_buffer(size)
+
+    def get_meta_buffer_ipc_handle(inp: torch.Tensor) -> torch.Tensor:
+        return _custom_ar.get_meta_buffer_ipc_handle(inp)
+
+
+# endregion
+
+# region IS_QUICK_AR_AVAILABLE
+
+if not IS_QUICK_AR_AVAILABLE:
+    pass
+
+elif _is_hip:
+    # ROCM custom quick allreduce
+
+    def init_custom_qr(
+        rank: int, world_size: int, qr_max_size: Optional[int] = None
+    ) -> int:
+        return _custom_ar.init_custom_qr(world_size, rank, qr_max_size)
+
+    def qr_get_handle(fa: int) -> torch.Tensor:
+        return _custom_ar.qr_get_handle(fa)
+
+    def qr_open_handles(fa: int, handles: list[torch.Tensor]) -> None:
+        _custom_ar.qr_open_handles(fa, handles)
+
+    def qr_all_reduce(
+        fa: int,
+        inp: torch.Tensor,
+        out: torch.Tensor,
+        quant_level: int,
+        cast_bf2half: bool,
+    ) -> None:
+        _custom_ar.qr_all_reduce(fa, inp, out, quant_level, cast_bf2half)
+
+    def qr_destroy(fa: int) -> None:
+        _custom_ar.qr_destroy(fa)
+
+    def qr_max_size() -> int:
+        return _custom_ar.qr_max_size()
+
+
+# endregion
+
+# region IS_MSCCLPP_AR_AVAILABLE
+
+if not IS_MSCCLPP_AR_AVAILABLE:
+    pass
+
+elif _is_cuda:
+
+    def mscclpp_generate_unique_id() -> bytes:
+        return _custom_ar.mscclpp_generate_unique_id()
+
+    def mscclpp_init_context(
+        unique_id: bytes,
+        rank: int,
+        world_size: int,
+        scratch: torch.Tensor,
+        put_buffer: torch.Tensor,
+        nranks_per_node: int,
+        rank_to_node: List[int],
+        rank_to_ib: List[int],
+        context_selection: int,
+    ) -> int:
+        return _custom_ar.mscclpp_init_context(
+            unique_id,
+            rank,
+            world_size,
+            scratch,
+            put_buffer,
+            nranks_per_node,
+            rank_to_node,
+            rank_to_ib,
+            context_selection,
+        )
+
+    def mscclpp_allreduce(
+        context: int, inp: torch.Tensor, out: torch.Tensor, nthreads: int, nblocks: int
+    ) -> None:
+        return _custom_ar.mscclpp_allreduce(context, inp, out, nthreads, nblocks)
+
+
+# endregion
diff --git a/sglang/python/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py b/sglang/python/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..c74cd58d6b43bef0af0aec1bbe6e16edf17a5191
--- /dev/null
+++ b/sglang/python/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py
@@ -0,0 +1,393 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/device_communicators/custom_all_reduce_utils.py
+
+import ctypes
+import json
+import logging
+import os
+import pickle
+import subprocess
+import sys
+import tempfile
+from functools import wraps
+from itertools import product
+from typing import Callable, Dict, List, Optional, Sequence, TypeVar
+
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+from typing_extensions import ParamSpec
+
+from sglang.srt.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
+from sglang.srt.utils import is_cuda, is_hip, is_musa
+
+logger = logging.getLogger(__name__)
+
+_is_cuda = is_cuda()
+_is_hip = is_hip()
+_is_musa = is_musa()
+
+if _is_cuda:
+    try:
+        import pynvml
+    except ImportError as e:
+        logger.warning("Failed to import pynvml with %r", e)
+
+if _is_musa:
+    try:
+        import pymtml as pynvml
+    except ImportError as e:
+        logger.warning("Failed to import pymtml with %r", e)
+
+if _is_hip:
+    try:
+        from amdsmi import (
+            AmdSmiException,
+            amdsmi_get_processor_handles,
+            amdsmi_init,
+            amdsmi_shut_down,
+            amdsmi_topo_get_link_type,
+        )
+    except ImportError as e:
+        logger.warning("Failed to import amdsmi with %r", e)
+
+_P = ParamSpec("_P")
+_R = TypeVar("_R")
+
+
+def update_environment_variables(envs: Dict[str, str]):
+    for k, v in envs.items():
+        if k in os.environ and os.environ[k] != v:
+            logger.warning(
+                "Overwriting environment variable %s " "from '%s' to '%s'",
+                k,
+                os.environ[k],
+                v,
+            )
+        os.environ[k] = v
+
+
+def producer(
+    batch_src: Sequence[int],
+    producer_queue,
+    consumer_queue,
+    result_queue,
+    cuda_visible_devices: Optional[str] = None,
+):
+    if cuda_visible_devices is not None:
+        update_environment_variables({"CUDA_VISIBLE_DEVICES": cuda_visible_devices})
+
+    lib = CudaRTLibrary()
+    for i in batch_src:
+        lib.cudaSetDevice(i)
+        pointer = lib.cudaMalloc(1024)
+        lib.cudaMemset(pointer, 1, 1024)
+        lib.cudaDeviceSynchronize()
+        handle = lib.cudaIpcGetMemHandle(pointer)
+        producer_queue.put(handle)
+        open_success = consumer_queue.get()
+        if open_success:
+            # use two queues to simulate barrier
+            producer_queue.put(0)
+            consumer_queue.get()
+            # check if the memory is modified
+            host_data = (ctypes.c_char * 1024)()
+            lib.cudaMemcpy(host_data, pointer, 1024)  # type: ignore
+            for i in range(1024):
+                if ord(host_data[i]) != 2:
+                    open_success = False
+                    break
+        result_queue.put(open_success)
+        lib.cudaDeviceReset()
+
+
+def consumer(
+    batch_tgt: Sequence[int],
+    producer_queue,
+    consumer_queue,
+    result_queue,
+    cuda_visible_devices: Optional[str] = None,
+):
+    if cuda_visible_devices is not None:
+        update_environment_variables({"CUDA_VISIBLE_DEVICES": cuda_visible_devices})
+
+    lib = CudaRTLibrary()
+    for j in batch_tgt:
+        lib.cudaSetDevice(j)
+        handle = producer_queue.get()
+        open_success = False
+        try:
+            pointer = lib.cudaIpcOpenMemHandle(handle)  # type: ignore
+            open_success = True
+        except RuntimeError:
+            # cannot error out here, because the producer process
+            # is still waiting for the response.
+            pass
+        consumer_queue.put(open_success)
+        if open_success:
+            # modify the memory
+            lib.cudaMemset(pointer, 2, 1024)
+            lib.cudaDeviceSynchronize()
+            # use two queues to simulate barrier
+            producer_queue.get()
+            consumer_queue.put(0)
+            # check if the memory is modified
+            host_data = (ctypes.c_char * 1024)()
+            lib.cudaMemcpy(host_data, pointer, 1024)  # type: ignore
+            for i in range(1024):
+                if ord(host_data[i]) != 2:
+                    open_success = False
+                    break
+        result_queue.put(open_success)
+        lib.cudaDeviceReset()
+
+
+def can_actually_p2p(
+    batch_src: Sequence[int],
+    batch_tgt: Sequence[int],
+) -> Sequence[bool]:
+    """
+    Usually, checking if P2P access is enabled can be done by
+    `torch.cuda.can_device_access_peer(src, tgt)`. However, sometimes
+    the driver might be broken, and `torch.cuda.can_device_access_peer(src, tgt)`
+    returns `True` even if P2P access is not actually possible.
+    See https://github.com/vllm-project/vllm/issues/2728 and
+    https://forums.developer.nvidia.com/t/direct-gpu-gpu-communication-does-not-seem-to-work-properly/283264/10
+    Therefore, we have to perform a real P2P access to check if it is actually
+    possible.
+
+    Note on p2p and cuda IPC:
+    Usually, one process uses one GPU:
+    GPU src --> cuda context src --> tensor src --> process src
+
+    We need to combine p2p and cuda IPC, so that:
+    GPU src --> cuda context src --> tensor src --> process src
+                                      |shared|
+    GPU tgt --> cuda context tgt --> tensor tgt --> process tgt
+    That is to say, process src creates a tensor in GPU src, passes IPC handle to
+    process tgt, and process tgt accesses the tensor in GPU tgt. Any operation on the
+    tensor in process tgt will be reflected in the tensor in process src, because
+    they are the same memory segment.
+    It is important to note that process tgt accesses the tensor in GPU tgt, not
+    GPU src. That's why we need p2p access.
+
+    The most time-consuming part is the process creation. To avoid creating
+    processes for every pair of GPUs, we use batched testing. We create two
+    processes for testing all pairs of GPUs in batch. The trick is to reset
+    the device after each test (which is not available in PyTorch).
+    """  # noqa
+    cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
+    # pass the CUDA_VISIBLE_DEVICES to the child process
+    # to make sure they see the same set of GPUs
+
+    # make sure the processes are spawned
+    smp = mp.get_context("spawn")
+    producer_queue = smp.Queue()
+    consumer_queue = smp.Queue()
+    result_queue = smp.Queue()
+    p_src = smp.Process(
+        target=producer,
+        args=(
+            batch_src,
+            producer_queue,
+            consumer_queue,
+            result_queue,
+            cuda_visible_devices,
+        ),
+    )
+    p_tgt = smp.Process(
+        target=consumer,
+        args=(
+            batch_tgt,
+            producer_queue,
+            consumer_queue,
+            result_queue,
+            cuda_visible_devices,
+        ),
+    )
+    p_src.start()
+    p_tgt.start()
+    p_src.join()
+    p_tgt.join()
+    assert p_src.exitcode == 0 and p_tgt.exitcode == 0
+    result: List[bool] = []
+    for src, tgt in zip(batch_src, batch_tgt):
+        a = result_queue.get()
+        b = result_queue.get()
+        if a != b:
+            logger.warning(
+                "Two processes do not agree on the P2P access"
+                " status on %d -> %d, treat as disabled.",
+                src,
+                tgt,
+            )
+            result.append(False)
+        else:
+            result.append(a)
+    return result
+
+
+# why do we need this cache?
+# we are testing peer-to-peer (p2p) access between GPUs,across processes.
+# if we test it every time, it will be very slow, because we need to create
+#  N * N * 2 processes, where N is the world size. This is very slow.
+# to reduce the time, we use a cache file to store the p2p access status.
+# the cache file is generated by the master process if it does not exist.
+# then all the processes can read the cache file to check the p2p access status.
+# Note that the cache file is suffixed by the CUDA_VISIBLE_DEVICES, so that we
+#  can have different cache files for different CUDA_VISIBLE_DEVICES settings,
+#  e.g. used by different vllm engines. The device id in the cache file is a
+#  **local** device id, i.e. from 0 to num_dev-1, where num_dev is the number
+#  of visible devices in the vllm engine.
+_gpu_p2p_access_cache: Optional[Dict[str, bool]] = None
+
+
+def gpu_p2p_access_check(src: int, tgt: int) -> bool:
+    """Check if GPU src can access GPU tgt."""
+
+    # if the cache variable is already calculated,
+    # read from the cache instead of checking it again
+    global _gpu_p2p_access_cache
+    if _gpu_p2p_access_cache is not None:
+        return _gpu_p2p_access_cache[f"{src}->{tgt}"]
+
+    is_distributed = dist.is_initialized()
+
+    num_dev = torch.cuda.device_count()
+    cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
+    if cuda_visible_devices is None:
+        cuda_visible_devices = ",".join(str(i) for i in range(num_dev))
+
+    # VLLM_CACHE_ROOT -> SGLANG_CACHE_ROOT
+    # "~/.cache/vllm" -> "~/.cache/sglang"
+    SGLANG_CACHE_ROOT = os.path.expanduser("~/.cache/sglang")
+    path = os.path.join(
+        SGLANG_CACHE_ROOT, f"gpu_p2p_access_cache_for_{cuda_visible_devices}.json"
+    )
+    os.makedirs(os.path.dirname(path), exist_ok=True)
+    from sglang.srt.distributed.parallel_state import get_world_group
+
+    if (not is_distributed or get_world_group().local_rank == 0) and (
+        not os.path.exists(path)
+    ):
+        # only the local master process (with local_rank == 0) can
+        #  enter this block to calculate the cache
+        logger.info("generating GPU P2P access cache in %s", path)
+        cache: Dict[str, bool] = {}
+        ids = list(range(num_dev))
+        # batch of all pairs of GPUs
+        batch_src, batch_tgt = zip(*list(product(ids, ids)))
+        # NOTE: we use `subprocess` rather than `multiprocessing` here
+        # because the caller might not have `if __name__ == "__main__":`,
+        # in that case we cannot use spawn method in multiprocessing.
+        # However, `can_actually_p2p` requires spawn method.
+        # The fix is, we use `subprocess` to call the function,
+        # where we have `if __name__ == "__main__":` in this file.
+
+        # use a temporary file to store the result
+        # we don't use the output of the subprocess directly,
+        # because the subprocess might produce logging output
+        with tempfile.NamedTemporaryFile() as output_file:
+            input_bytes = pickle.dumps((batch_src, batch_tgt, output_file.name))
+            returned = subprocess.run(
+                [sys.executable, __file__], input=input_bytes, capture_output=True
+            )
+            # check if the subprocess is successful
+            try:
+                returned.check_returncode()
+            except Exception as e:
+                # wrap raised exception to provide more information
+                raise RuntimeError(
+                    f"Error happened when batch testing "
+                    f"peer-to-peer access from {batch_src} to {batch_tgt}:\n"
+                    f"{returned.stderr.decode()}"
+                ) from e
+            with open(output_file.name, "rb") as f:
+                result = pickle.load(f)
+        for _i, _j, r in zip(batch_src, batch_tgt, result):
+            cache[f"{_i}->{_j}"] = r
+        with open(path, "w") as f:
+            json.dump(cache, f, indent=4)
+    if is_distributed:
+        get_world_group().barrier()
+    logger.info("reading GPU P2P access cache from %s", path)
+    with open(path) as f:
+        cache = json.load(f)
+    _gpu_p2p_access_cache = cache
+    return _gpu_p2p_access_cache[f"{src}->{tgt}"]
+
+
+def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]:
+    @wraps(fn)
+    def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _R:
+        if _is_hip:
+            try:
+                amdsmi_init()
+                return fn(*args, **kwargs)
+            finally:
+                amdsmi_shut_down()
+        else:
+            pynvml.nvmlInit()
+            try:
+                return fn(*args, **kwargs)
+            finally:
+                pynvml.nvmlShutdown()
+
+    return wrapper
+
+
+@with_nvml_context
+def is_full_nvlink(physical_device_ids: List[int], world_size: int) -> bool:
+    if _is_hip:
+        """
+        query if the set of gpus are fully connected by xgmi (1 hop)
+        """
+        handles = [amdsmi_get_processor_handles()[i] for i in physical_device_ids]
+        for i, handle in enumerate(handles):
+            for j, peer_handle in enumerate(handles):
+                if i < j:
+                    try:
+                        link_type = amdsmi_topo_get_link_type(handle, peer_handle)
+                        # type is 2 for XGMI
+                        if link_type["hops"] != 1 or link_type["type"] != 2:
+                            return False
+                    except AmdSmiException as error:
+                        logger.error("AMD 1 hop XGMI detection failed.", exc_info=error)
+                        return False
+        return True
+    else:
+        """
+        query if the set of gpus are fully connected by nvlink (1 hop)
+        """
+        handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in physical_device_ids]
+        for i, handle in enumerate(handles):
+            for j, peer_handle in enumerate(handles):
+                if i < j:
+                    try:
+                        p2p_status = pynvml.nvmlDeviceGetP2PStatus(
+                            handle, peer_handle, pynvml.NVML_P2P_CAPS_INDEX_NVLINK
+                        )
+                        if p2p_status != pynvml.NVML_P2P_STATUS_OK:
+                            return False
+                    except pynvml.NVMLError:
+                        logger.exception(
+                            "NVLink detection failed. This is normal if your"
+                            " machine has no NVLink equipped."
+                        )
+                        return False
+        return True
+
+
+def is_weak_contiguous(inp: torch.Tensor):
+    return inp.is_contiguous() or (
+        inp.storage().nbytes() - inp.storage_offset() * inp.element_size()
+        == inp.numel() * inp.element_size()
+    )
+
+
+__all__ = ["gpu_p2p_access_check"]
+
+if __name__ == "__main__":
+    batch_src, batch_tgt, output_file = pickle.loads(sys.stdin.buffer.read())
+    result = can_actually_p2p(batch_src, batch_tgt)
+    with open(output_file, "wb") as f:
+        f.write(pickle.dumps(result))
diff --git a/sglang/python/sglang/srt/distributed/device_communicators/hpu_communicator.py b/sglang/python/sglang/srt/distributed/device_communicators/hpu_communicator.py
new file mode 100644
index 0000000000000000000000000000000000000000..722e494cf775cf3a72d73ca511f49de1c6d87059
--- /dev/null
+++ b/sglang/python/sglang/srt/distributed/device_communicators/hpu_communicator.py
@@ -0,0 +1,49 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/device_communicators/hpu_communicator.py
+
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+from sglang.srt.utils import is_hpu
+
+if is_hpu():
+    import habana_frameworks.torch as htorch  # noqa: F401
+
+
+class HpuCommunicator:
+
+    def __init__(self, group: ProcessGroup):
+        if not is_hpu():
+            self.disabled = True
+            return
+        self.disabled = False
+        self.group = group
+        self.world_size = dist.get_world_size(self.group)
+
+    def all_reduce(self, x: torch.Tensor) -> torch.Tensor:
+        # FIXME(kzawora): this is a workaround for a bug in Habana PT bridge
+        # occurring when PT_HPU_ENABLE_LAZY_COLLECTIVES=true env var is used
+        # (which is required for tensor parallel HPUGraph inference)
+        htorch.core.mark_step()
+        dist.all_reduce(x, group=self.group)
+        return x
+
+    def all_gather(self, x: torch.Tensor, dim: int = -1) -> torch.Tensor:
+        world_size = self.world_size
+        if dim < 0:
+            # Convert negative dim to positive.
+            dim += x.dim()
+        input_size = x.size()
+        # Allocate output tensor.
+        output_tensor = torch.empty(
+            (world_size,) + input_size, dtype=x.dtype, device=x.device
+        )
+        # All-gather.
+        htorch.core.mark_step()
+        dist.all_gather_into_tensor(output_tensor, x, group=self.group)
+        # Reshape
+        output_tensor = output_tensor.movedim(0, dim)
+        output_tensor = output_tensor.reshape(
+            input_size[:dim] + (world_size * input_size[dim],) + input_size[dim + 1 :]
+        )
+        return output_tensor
diff --git a/sglang/python/sglang/srt/distributed/device_communicators/mooncake_transfer_engine.py b/sglang/python/sglang/srt/distributed/device_communicators/mooncake_transfer_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..3978bf26df00f78a3bd47071693c1bba126dba24
--- /dev/null
+++ b/sglang/python/sglang/srt/distributed/device_communicators/mooncake_transfer_engine.py
@@ -0,0 +1,286 @@
+import json
+import logging
+import os
+from typing import List, Optional
+
+from sglang.srt.environ import envs
+from sglang.srt.utils import get_free_port, maybe_wrap_ipv6_address
+
+logger = logging.getLogger(__name__)
+
+# Module-level shared engine instance, set by init_mooncake_transfer_engine().
+_mooncake_transfer_engine: Optional["MooncakeTransferEngine"] = None
+
+
+def get_ib_devices_for_gpu(ib_device_str: Optional[str], gpu_id: int) -> Optional[str]:
+    """
+    Parse IB device string and get IB devices for a specific GPU ID.
+
+    Supports all the following formats:
+    1. Old format: "ib0, ib1, ib2"
+    2. New format: {0: "ib0, ib1", 1: "ib2, ib3", 2: "ib4"}
+    3. JSON file: path to a JSON file containing the mapping
+
+    Args:
+        ib_device_str: The original IB device string or path to JSON file
+        gpu_id: The GPU ID to get devices for
+
+    Returns:
+        IB devices string for the GPU, or None if not available
+    """
+    if ib_device_str is None or not ib_device_str.strip():
+        return None
+
+    ib_device_str = ib_device_str.strip()
+
+    # Check if it's a JSON file first and load its content
+    is_json_file = ib_device_str.endswith(".json")
+    if is_json_file:
+        try:
+            if os.path.isfile(ib_device_str):
+                with open(ib_device_str, "r") as f:
+                    ib_device_str = f.read()
+            else:
+                # File doesn't exist, treat as old format
+                raise RuntimeError(f"File {ib_device_str} does not exist.")
+        except (IOError, OSError) as e:
+            # File reading failed, raise exception
+            raise RuntimeError(f"Failed to read JSON file {ib_device_str}: {e}") from e
+
+    # Check if it's JSON format (new format)
+    try:
+        parsed_json = json.loads(ib_device_str)
+        if isinstance(parsed_json, dict):
+            # Validate format - keys should be integers (or string rep), values should be strings
+            gpu_mapping = {}
+            for gpu_key, ib_devices in parsed_json.items():
+                if (
+                    isinstance(gpu_key, str)
+                    and gpu_key.isdigit()
+                    and isinstance(ib_devices, str)
+                ):
+                    gpu_mapping[int(gpu_key)] = ib_devices.strip()
+                elif isinstance(gpu_key, int) and isinstance(ib_devices, str):
+                    gpu_mapping[gpu_key] = ib_devices.strip()
+                else:
+                    raise ValueError(
+                        "Invalid format: keys must be integers (or string "
+                        "representations of integers) and values must be strings"
+                    )
+
+            if not gpu_mapping:
+                raise ValueError("No valid GPU mappings found in JSON")
+
+            # Return devices for specific GPU
+            if gpu_id in gpu_mapping:
+                return gpu_mapping[gpu_id]
+            else:
+                raise ValueError(
+                    f"No IB devices configured for GPU {gpu_id}. "
+                    f"Available GPUs: {list(gpu_mapping.keys())}"
+                )
+
+    except json.JSONDecodeError:
+        if is_json_file:
+            # It was supposed to be a JSON file but failed to parse
+            raise RuntimeError(
+                f"Failed to parse JSON content from file {ib_device_str}"
+            )
+        # Not JSON format, treat as old format - return same devices for all GPUs
+        return ib_device_str
+
+
+class MooncakeTransferEngine:
+    """Shared Mooncake transfer engine for RDMA/transfer operations."""
+
+    def __init__(
+        self,
+        hostname: str,
+        gpu_id: Optional[int] = None,
+        ib_device: Optional[str] = None,
+    ):
+        try:
+            from mooncake.engine import TransferEngine
+        except ImportError as e:
+            raise ImportError(
+                "Please install mooncake by following the instructions at "
+                "https://kvcache-ai.github.io/Mooncake/getting_started/build.html "
+                "to run SGLang with MooncakeTransferEngine."
+            ) from e
+
+        self.engine = TransferEngine()
+        self.hostname = hostname
+        self.gpu_id = gpu_id if gpu_id is not None else 0
+        self.ib_device = get_ib_devices_for_gpu(ib_device, self.gpu_id)
+
+        self.initialize(
+            hostname=self.hostname,
+            device_name=self.ib_device,
+        )
+        self.session_id = (
+            f"{maybe_wrap_ipv6_address(self.hostname)}:{self.engine.get_rpc_port()}"
+        )
+
+    def register(self, ptr, length):
+        try:
+            ret_value = self.engine.register_memory(ptr, length)
+        except Exception:
+            # Mark register as failed
+            ret_value = -1
+
+        if ret_value != 0:
+            logger.debug("Mooncake memory registration %s failed.", ptr)
+
+    def deregister(self, ptr):
+        try:
+            ret_value = self.engine.unregister_memory(ptr)
+        except Exception:
+            # Mark deregister as failed
+            ret_value = -1
+
+        if ret_value != 0:
+            logger.debug("Mooncake memory deregistration %s failed.", ptr)
+
+    def batch_register(self, ptrs: List[int], lengths: List[int]) -> int:
+        """Batch register multiple memory regions."""
+        try:
+            ret_value = self.engine.batch_register_memory(ptrs, lengths)
+        except Exception:
+            # Mark batch register as failed
+            ret_value = -1
+            if not hasattr(self.engine, "batch_register_memory"):
+                raise RuntimeError(
+                    "Mooncake's batch register requires a newer version of "
+                    "mooncake-transfer-engine. Please upgrade Mooncake."
+                )
+
+        if ret_value != 0:
+            logger.debug("Mooncake batch memory registration failed.")
+        return ret_value
+
+    def batch_deregister(self, ptrs: List[int]) -> int:
+        """Batch deregister multiple memory regions."""
+        try:
+            ret_value = self.engine.batch_unregister_memory(ptrs)
+        except Exception:
+            # Mark batch deregister as failed
+            ret_value = -1
+
+        if ret_value != 0:
+            logger.debug("Mooncake batch memory deregistration failed.")
+        return ret_value
+
+    def initialize(
+        self,
+        hostname: str,
+        device_name: Optional[str],
+    ) -> None:
+        """Initialize the mooncake instance."""
+        if envs.ENABLE_ASCEND_TRANSFER_WITH_MOONCAKE.get():
+            npu_phy_id = envs.ASCEND_NPU_PHY_ID.get()
+            if npu_phy_id == -1:
+                hostname += f":{get_free_port()}:npu_{self.gpu_id}"
+            else:
+                hostname += f":{get_free_port()}:npu_{npu_phy_id}"
+            ret_value = self.engine.initialize(
+                hostname,
+                "P2PHANDSHAKE",
+                "ascend",
+                device_name if device_name is not None else "",
+            )
+        else:
+            ret_value = self.engine.initialize(
+                hostname,
+                "P2PHANDSHAKE",
+                "rdma",
+                device_name if device_name is not None else "",
+            )
+        if ret_value != 0:
+            logger.error("Mooncake Transfer Engine initialization failed.")
+            raise RuntimeError("Mooncake Transfer Engine initialization failed.")
+
+    def transfer_sync(
+        self, session_id: str, buffer: int, peer_buffer_address: int, length: int
+    ) -> int:
+        """Synchronously transfer data to the specified address."""
+        try:
+            ret = self.engine.transfer_sync_write(
+                session_id, buffer, peer_buffer_address, length
+            )
+        except Exception:
+            ret = -1
+
+        if ret < 0:
+            logger.debug(
+                "Failed to transfer data from %s to %s - %s.",
+                buffer,
+                session_id,
+                peer_buffer_address,
+            )
+
+        return ret
+
+    def batch_transfer_sync(
+        self,
+        session_id: str,
+        buffers: List[int],
+        peer_buffer_addresses: List[int],
+        lengths: List[int],
+    ) -> int:
+        """Synchronously transfer data to the specified addresses in batches."""
+        try:
+            ret = self.engine.batch_transfer_sync_write(
+                session_id, buffers, peer_buffer_addresses, lengths
+            )
+        except Exception:
+            ret = -1
+            if not hasattr(self.engine, "batch_transfer_sync_write"):
+                raise RuntimeError(
+                    "Mooncake's batch transfer requires mooncake-transfer-engine "
+                    ">= 0.3.4.post2. Please upgrade Mooncake by "
+                    "'pip install mooncake-transfer-engine --upgrade'"
+                )
+
+        if ret < 0:
+            logger.debug(
+                "Failed to batch transfer data. Buffers: %s, Session: %s, "
+                "Peer addresses: %s",
+                buffers,
+                session_id,
+                peer_buffer_addresses,
+            )
+        return ret
+
+    def get_session_id(self):
+        return self.session_id
+
+    def get_engine(self):
+        return self.engine.get_engine()
+
+    def get_ib_device(self):
+        return self.ib_device
+
+
+def init_mooncake_transfer_engine(
+    hostname: str,
+    gpu_id: Optional[int] = None,
+    ib_device: Optional[str] = None,
+) -> MooncakeTransferEngine:
+    """
+    Initialize the shared MooncakeTransferEngine. Note: if already
+    initialized with the same (hostname, gpu_id, ib_device), returns existing
+    instance. Call from parallel_state when model parallel is set up and
+    mooncake transfer is needed.
+    """
+    global _mooncake_transfer_engine
+    if _mooncake_transfer_engine is not None:
+        return _mooncake_transfer_engine
+    _mooncake_transfer_engine = MooncakeTransferEngine(
+        hostname=hostname, gpu_id=gpu_id, ib_device=ib_device
+    )
+    return _mooncake_transfer_engine
+
+
+def get_mooncake_transfer_engine() -> Optional[MooncakeTransferEngine]:
+    """Return the shared MooncakeTransferEngine if initialized, else None."""
+    return _mooncake_transfer_engine
diff --git a/sglang/python/sglang/srt/distributed/device_communicators/npu_communicator.py b/sglang/python/sglang/srt/distributed/device_communicators/npu_communicator.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb6eb88e39be5860d705a57a7e015c1c1a22d2e2
--- /dev/null
+++ b/sglang/python/sglang/srt/distributed/device_communicators/npu_communicator.py
@@ -0,0 +1,39 @@
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+from sglang.srt.utils import is_npu
+
+
+class NpuCommunicator:
+
+    def __init__(self, group: ProcessGroup):
+        if not is_npu():
+            self.disabled = True
+            return
+        self.disabled = False
+        self.group = group
+        self.world_size = dist.get_world_size(self.group)
+
+    def all_reduce(self, x: torch.Tensor) -> torch.Tensor:
+        dist.all_reduce(x, group=self.group)
+        return x
+
+    def all_gather(self, x: torch.Tensor, dim: int = -1) -> torch.Tensor:
+        world_size = self.world_size
+        if dim < 0:
+            # Convert negative dim to positive.
+            dim += x.dim()
+        input_size = x.size()
+        output_size = (input_size[0] * world_size,) + input_size[1:]
+        # Allocate output tensor.
+        output_tensor = torch.empty(output_size, dtype=x.dtype, device=x.device)
+        # All-gather.
+        dist.all_gather_into_tensor(output_tensor, x, group=self.group)
+        # Reshape
+        output_tensor = output_tensor.reshape((world_size,) + input_size)
+        output_tensor = output_tensor.movedim(0, dim)
+        output_tensor = output_tensor.reshape(
+            input_size[:dim] + (world_size * input_size[dim],) + input_size[dim + 1 :]
+        )
+        return output_tensor
diff --git a/sglang/python/sglang/srt/distributed/device_communicators/pymscclpp.py b/sglang/python/sglang/srt/distributed/device_communicators/pymscclpp.py
new file mode 100644
index 0000000000000000000000000000000000000000..e45093c782b25f6abb97850aa13e36caa25d23e4
--- /dev/null
+++ b/sglang/python/sglang/srt/distributed/device_communicators/pymscclpp.py
@@ -0,0 +1,302 @@
+import bisect
+import logging
+import math
+import os
+from contextlib import contextmanager
+from enum import IntEnum
+from typing import Optional, Union
+
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup, ReduceOp
+
+import sglang.srt.distributed.device_communicators.custom_all_reduce_ops as ops
+from sglang.srt.utils import is_hip
+
+logger = logging.getLogger(__name__)
+
+_is_hip = is_hip()
+
+
+class MscclContextSelection(IntEnum):
+    MSCCL1SHOT1NODELL = 1
+    MSCCL1SHOT2NODELL = 2
+
+
+def mscclpp_is_weak_contiguous(inp: torch.Tensor):
+    return inp.is_contiguous() or (
+        inp.storage().nbytes() - inp.storage_offset() * inp.element_size()
+        == inp.numel() * inp.element_size()
+    )
+
+
+def mscclpp_convert_to_bytes(size_str):
+    """
+    Converts a human-readable size string (e.g., "1MB", "2.5kb", "3 GB")
+    into the equivalent number of bytes using binary units.
+
+    Args:
+        size_str (str): A string representing size with unit (KB, MB, GB).
+
+    Returns:
+        int: Number of bytes.
+    """
+    size_str = size_str.strip().lower()
+
+    if not size_str:
+        raise ValueError("Empty input string")
+
+    # Extract numeric part and unit
+    for i in range(len(size_str)):
+        if not size_str[i].isdigit() and size_str[i] != ".":
+            break
+    num_str = size_str[:i]
+    unit = size_str[i:].strip()
+
+    try:
+        num = float(num_str)
+    except ValueError:
+        raise ValueError(f"Invalid numeric value in '{size_str}'")
+
+    # Conversion factors
+    if unit == "b":
+        return int(num)
+    elif unit == "kb":
+        return int(num * 1024)
+    elif unit == "mb":
+        return int(num * 1024 * 1024)
+    elif unit == "gb":
+        return int(num * 1024 * 1024 * 1024)
+    else:
+        raise ValueError(f"Unsupported unit: {unit}, support B, KB, MB, GB only")
+
+
+def mscclpp_bench_time(func, test_niter: int = 10, warmup_niter: int = 2):
+    # warmup
+    for _ in range(warmup_niter):
+        func()
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+    torch.cuda.synchronize()
+    dist.barrier()
+    start_event.record()
+    for _ in range(test_niter):
+        func()
+    end_event.record()
+    end_event.synchronize()
+    func_cost_us = start_event.elapsed_time(end_event) / test_niter * 1000
+    return func_cost_us
+
+
+class PyMscclppCommunicator:
+    _SUPPORTED_WORLD_SIZES = [8, 16]
+    _MAX_BYTES = mscclpp_convert_to_bytes(os.getenv("SGLANG_MSCCLPP_MAX_BYTES", "1MB"))
+    _SUPPORTED_DTYPE = [torch.float, torch.float16, torch.bfloat16]
+
+    # max_bytes: max supported mscclpp allreduce size
+    # in A100 mscclpp is faster than nccl only under condition of msg size smaller than1MB
+    def __init__(
+        self,
+        group: ProcessGroup,
+        device: Union[int, str, torch.device],
+        max_bytes=_MAX_BYTES,
+    ) -> None:
+        """
+        Args:
+            group: the process group to work on. If None, it will use the
+                default process group.
+            device: the device to bind the CustomAllreduce to. If None,
+                it will be bind to f"cuda:{local_rank}".
+        It is the caller's responsibility to make sure each communicator
+        is bind to a unique device, and all communicators in this group
+        are in the same node.
+        """
+        self._IS_CAPTURING = False
+        self.disabled = True
+
+        if not ops.IS_MSCCLPP_AR_AVAILABLE:
+            # disable because of missing mscclpp library
+            # e.g. in a non-cuda environment
+            return
+
+        self.group = group
+
+        assert (
+            dist.get_backend(group) != dist.Backend.NCCL
+        ), "CustomAllreduce should be attached to a non-NCCL group."
+
+        rank = dist.get_rank(group=self.group)
+        world_size = dist.get_world_size(group=self.group)
+        if world_size == 1:
+            # No need to initialize mscclpp for single GPU case.
+            return
+
+        if world_size not in PyMscclppCommunicator._SUPPORTED_WORLD_SIZES:
+            logger.warning(
+                "PyMscclpp is disabled due to an unsupported world"
+                " size: %d. Supported world sizes: %s. To silence this "
+                "warning, specify disable_mscclpp=True explicitly.",
+                world_size,
+                str(PyMscclppCommunicator._SUPPORTED_WORLD_SIZES),
+            )
+            return
+
+        self.ranks = torch.distributed.get_process_group_ranks(group)
+        self.nranks_per_node = torch.cuda.device_count()
+        # for now mscclpp with stride in the communicator is not tested
+        if not (abs(self.ranks[-1] - self.ranks[0]) == world_size - 1):
+            logger.warning(
+                "PyMscclpp is disabled due to an unsupported group %s."
+                "Please ensure all ranks in the group are consecutive."
+                "To silence this warning, specify disable_mscclpp=True explicitly.",
+                str(self.ranks),
+            )
+            return
+
+        if isinstance(device, int):
+            device = torch.device(f"cuda:{device}")
+        elif isinstance(device, str):
+            device = torch.device(device)
+        # now `device` is a `torch.device` object
+        assert isinstance(device, torch.device)
+        self.device = device
+
+        self.max_bytes = max_bytes
+        self.rank = rank
+        self.world_size = world_size
+
+        if dist.get_rank(group) == 0:
+            unique_id = [ops.mscclpp_generate_unique_id()]
+        else:
+            unique_id = [None]
+        dist.broadcast_object_list(unique_id, src=self.ranks[0], group=self.group)
+        self.unique_id = unique_id[0]
+        self.rank_to_node, self.rank_to_ib = list(range(world_size)), list(
+            range(world_size)
+        )
+        for r in range(world_size):
+            self.rank_to_node[r] = r // 8
+            self.rank_to_ib[r] = self.rank % 8
+
+        self._context = None
+        self.context_selection = None
+        self.msg_size_for_finetune = [
+            2**i for i in range(10, math.floor(math.log2(self.max_bytes)) + 1)
+        ]
+        self.msg_size2best_config = {}
+        if world_size == 8:
+            self.context_selection = MscclContextSelection.MSCCL1SHOT1NODELL
+        elif world_size == 16:
+            self.context_selection = MscclContextSelection.MSCCL1SHOT2NODELL
+        if not _is_hip:
+            self.scratch = torch.empty(
+                self.max_bytes * 8,
+                dtype=torch.uint8,
+                device=self.device,
+            )
+            self.put_buffer = torch.empty(
+                self.max_bytes * 8 // self.nranks_per_node,
+                dtype=torch.uint8,
+                device=self.device,
+            )
+            self._context = ops.mscclpp_init_context(
+                self.unique_id,
+                self.rank,
+                self.world_size,
+                self.scratch,
+                self.put_buffer,
+                self.nranks_per_node,
+                self.rank_to_node,
+                self.rank_to_ib,
+                int(self.context_selection),
+            )
+        else:
+            raise NotImplementedError("HIP Mscclpp is not supported yet.")
+
+        self.msg_size2best_config = {}
+        self.pre_tune_config()
+        if dist.get_rank(group) == 0:
+            msg_size2best_config = [self.msg_size2best_config]
+        else:
+            msg_size2best_config = [None]
+        dist.broadcast_object_list(
+            msg_size2best_config, src=self.ranks[0], group=self.group
+        )
+        self.msg_size2best_config = msg_size2best_config[0]
+
+        # PyMscclpp is enabled only in cuda graph
+        self.disabled = True
+
+    def pre_tune_config(self, dtype=torch.bfloat16) -> bool:
+        logger.debug(f"start to pre-tune configs for rank {self.rank}")
+        nthreads_to_try = [256, 512, 1024]
+        nblocks_to_try = [21, 42, 84]
+        inp_randn = torch.ones(
+            self.msg_size_for_finetune[-1] // dtype.itemsize, dtype=dtype, device="cuda"
+        )
+        oup_randn = torch.empty_like(inp_randn)
+        for msg_size in self.msg_size_for_finetune:
+            mock_inp, mock_outp = (
+                inp_randn[: msg_size // dtype.itemsize],
+                oup_randn[: msg_size // dtype.itemsize],
+            )
+            best_config, best_time = None, None
+            for nthreads in nthreads_to_try:
+                for nblocks in nblocks_to_try:
+                    cur_cost = mscclpp_bench_time(
+                        lambda: ops.mscclpp_allreduce(
+                            self._context, mock_inp, mock_outp, nthreads, nblocks
+                        )
+                    )
+                    if best_time is None or cur_cost < best_time:
+                        best_config = (nthreads, nblocks)
+                        best_time = cur_cost
+            self.msg_size2best_config[msg_size] = best_config
+            if self.rank == 0:
+                logger.debug(
+                    f"for msg_size {msg_size}, best_config: {best_config}, best_time: {best_time}us"
+                )
+
+    def should_mscclpp_allreduce(
+        self, inp: torch.Tensor, op: ReduceOp = ReduceOp.SUM
+    ) -> bool:
+        if self.disabled or self._context is None:
+            return False
+        if inp.dtype not in PyMscclppCommunicator._SUPPORTED_DTYPE:
+            return False
+        if not mscclpp_is_weak_contiguous(inp):
+            return False
+        # only support sum op
+        if op != ReduceOp.SUM:
+            return False
+        if inp.numel() * inp.element_size() > self.max_bytes:
+            return False
+        return True
+
+    def all_reduce(self, tensor: torch.Tensor, op: ReduceOp = ReduceOp.SUM):
+        if self._IS_CAPTURING:
+            if torch.cuda.is_current_stream_capturing():
+                self.graph_input_set.add((tensor.dtype, tensor.numel()))
+        msg_size = tensor.numel() * tensor.itemsize
+        index = bisect.bisect_left(self.msg_size_for_finetune, msg_size)
+        msg_size_finetune = self.msg_size_for_finetune[index]
+        nthreads, nblocks = self.msg_size2best_config[msg_size_finetune]
+        result = torch.empty_like(tensor)
+        ops.mscclpp_allreduce(self._context, tensor, result, nthreads, nblocks)
+        return result
+
+    @contextmanager
+    def change_state(
+        self,
+        enable: Optional[bool] = None,
+    ):
+        if enable is None:
+            # guess a default value when not specified
+            enable = self.available
+
+        old_disable = self.disabled
+        self.disabled = not enable
+
+        yield
+
+        self.disabled = old_disable
diff --git a/sglang/python/sglang/srt/distributed/device_communicators/pynccl.py b/sglang/python/sglang/srt/distributed/device_communicators/pynccl.py
new file mode 100644
index 0000000000000000000000000000000000000000..660582ad373004599e2489cdbdd7e3b8ae3ebd2a
--- /dev/null
+++ b/sglang/python/sglang/srt/distributed/device_communicators/pynccl.py
@@ -0,0 +1,411 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/device_communicators/pynccl.py
+
+import logging
+from contextlib import contextmanager
+from typing import Optional, Union
+
+# ===================== import region =====================
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup, ReduceOp
+
+from sglang.srt.distributed.device_communicators.pynccl_wrapper import (
+    NCCLLibrary,
+    buffer_type,
+    cudaStream_t,
+    ncclComm_t,
+    ncclDataTypeEnum,
+    ncclRedOpTypeEnum,
+    ncclUniqueId,
+)
+from sglang.srt.distributed.utils import StatelessProcessGroup
+from sglang.srt.utils.common import get_current_device_stream_fast
+
+logger = logging.getLogger(__name__)
+
+
+class PyNcclCommunicator:
+
+    def __init__(
+        self,
+        group: Union[ProcessGroup, StatelessProcessGroup],
+        device: Union[int, str, torch.device],
+        library_path: Optional[str] = None,
+        use_current_stream: bool = False,
+    ):
+        """
+        Args:
+            group: the process group to work on. If None, it will use the
+                default process group.
+            device: the device to bind the PyNcclCommunicator to. If None,
+                it will be bind to f"cuda:{local_rank}".
+            library_path: the path to the NCCL library. If None, it will
+                use the default library path.
+        It is the caller's responsibility to make sure each communicator
+        is bind to a unique device.
+        """
+        if not isinstance(group, StatelessProcessGroup):
+            assert dist.is_initialized()
+            assert (
+                dist.get_backend(group) != dist.Backend.NCCL
+            ), "PyNcclCommunicator should be attached to a non-NCCL group."
+            # note: this rank is the rank in the group
+            self.rank = dist.get_rank(group)
+            self.world_size = dist.get_world_size(group)
+        else:
+            self.rank = group.rank
+            self.world_size = group.world_size
+
+        self.group = group
+
+        # if world_size == 1, no need to create communicator
+        if self.world_size == 1:
+            self.available = False
+            self.disabled = True
+            self.stream = None
+            return
+        try:
+            self.nccl = NCCLLibrary(library_path)
+        except Exception:
+            # disable because of missing NCCL library
+            # e.g. in a non-GPU environment
+            self.available = False
+            self.disabled = True
+            self.stream = None
+            return
+
+        self.available = True
+        self.disabled = False
+        self.use_current_stream = use_current_stream
+
+        self.nccl_version = self.nccl.ncclGetRawVersion()
+        if self.rank == 0:
+            logger.info("sglang is using nccl==%s", self.nccl.ncclGetVersion())
+
+        if self.rank == 0:
+            # get the unique id from NCCL
+            self.unique_id = self.nccl.ncclGetUniqueId()
+        else:
+            # construct an empty unique id
+            self.unique_id = ncclUniqueId()
+
+        if not isinstance(group, StatelessProcessGroup):
+            tensor = torch.ByteTensor(list(self.unique_id.internal))
+            ranks = dist.get_process_group_ranks(group)
+            # arg `src` in `broadcast` is the global rank
+            dist.broadcast(tensor, src=ranks[0], group=group)
+            byte_list = tensor.tolist()
+            for i, byte in enumerate(byte_list):
+                self.unique_id.internal[i] = byte
+        else:
+            self.unique_id = group.broadcast_obj(self.unique_id, src=0)
+        if isinstance(device, int):
+            device = torch.device(f"cuda:{device}")
+        elif isinstance(device, str):
+            device = torch.device(device)
+        # now `device` is a `torch.device` object
+        assert isinstance(device, torch.device)
+        self.device = device
+        # nccl communicator and stream will use this device
+        # `torch.cuda.device` is a context manager that changes the
+        # current cuda device to the specified one
+        with torch.cuda.device(device):
+            self.comm: ncclComm_t = self.nccl.ncclCommInitRank(
+                self.world_size, self.unique_id, self.rank
+            )
+            self.stream = torch.cuda.Stream()
+
+            # A small all_reduce for warmup.
+            data = torch.zeros(1, device=device)
+            self.all_reduce(data)
+            self.stream.synchronize()
+            del data
+
+        # by default it is disabled, e.g. in profiling models and prefill phase.
+        # to use it, use under `with obj.change_state(enable=True)`, usually
+        # when we are using CUDA graph.
+        self.disabled = True
+
+    def _resolve_stream(self, stream: Optional[torch.cuda.Stream]):
+        """Return the stream to use for NCCL calls.
+
+        Behavior mirrors the previous inline logic:
+        - if an explicit stream is provided, return it
+        - if stream is None and self.use_current_stream is True, return
+          torch.cuda.current_stream()
+        - otherwise return the communicator's default stream (self.stream)
+        """
+        if stream is not None:
+            return stream
+        if self.use_current_stream:
+            return get_current_device_stream_fast()
+        return self.stream
+
+    def all_reduce(
+        self, tensor: torch.Tensor, op: ReduceOp = ReduceOp.SUM, stream=None
+    ):
+        if self.disabled:
+            return
+        # nccl communicator created on a specific device
+        # will only work on tensors on the same device
+        # otherwise it will cause "illegal memory access"
+        assert tensor.device == self.device, (
+            f"this nccl communicator is created to work on {self.device}, "
+            f"but the input tensor is on {tensor.device}"
+        )
+        stream = self._resolve_stream(stream)
+        self.nccl.ncclAllReduce(
+            buffer_type(tensor.data_ptr()),
+            buffer_type(tensor.data_ptr()),
+            tensor.numel(),
+            ncclDataTypeEnum.from_torch(tensor.dtype),
+            ncclRedOpTypeEnum.from_torch(op),
+            self.comm,
+            cudaStream_t(stream.cuda_stream),
+        )
+
+    def outplace_all_reduce(
+        self,
+        in_tensor: torch.Tensor,
+        out_tensor: Optional[torch.Tensor] = None,
+        op: ReduceOp = ReduceOp.SUM,
+        stream=None,
+    ) -> Optional[torch.Tensor]:
+        if self.disabled:
+            return None
+        assert in_tensor.device == self.device, (
+            f"this nccl communicator is created to work on {self.device}, "
+            f"but the input tensor is on {in_tensor.device}"
+        )
+
+        if out_tensor is None:
+            out_tensor = torch.empty_like(in_tensor)
+
+        stream = self._resolve_stream(stream)
+        self.nccl.ncclAllReduce(
+            buffer_type(in_tensor.data_ptr()),  # sendbuff
+            buffer_type(out_tensor.data_ptr()),  # recvbuff - DIFFERENT pointer
+            in_tensor.numel(),
+            ncclDataTypeEnum.from_torch(in_tensor.dtype),
+            ncclRedOpTypeEnum.from_torch(op),
+            self.comm,
+            cudaStream_t(stream.cuda_stream),
+        )
+        return out_tensor
+
+    def all_gather(
+        self,
+        output_tensor: torch.Tensor,
+        input_tensor: torch.Tensor,
+        stream=None,
+        sizes: Optional[list[int]] = None,
+    ):
+        if self.disabled:
+            return
+        # nccl communicator created on a specific device
+        # will only work on tensors on the same device
+        # otherwise it will cause "illegal memory access"
+        assert input_tensor.device == self.device, (
+            f"this nccl communicator is created to work on {self.device}, "
+            f"but the input tensor is on {input_tensor.device}"
+        )
+        stream = self._resolve_stream(stream)
+
+        if sizes is not None:
+            split_offset = 0
+
+            self.nccl.ncclGroupStart()
+            for root, split_size in enumerate(sizes):
+                dst_slice = output_tensor[split_offset : split_offset + split_size]
+                self.nccl.ncclBroadcast(
+                    buffer_type(input_tensor.data_ptr()),
+                    buffer_type(dst_slice.data_ptr()),
+                    dst_slice.numel(),
+                    ncclDataTypeEnum.from_torch(input_tensor.dtype),
+                    root,
+                    self.comm,
+                    cudaStream_t(stream.cuda_stream),
+                )
+                split_offset += split_size
+            self.nccl.ncclGroupEnd()
+        else:
+            self.nccl.ncclAllGather(
+                buffer_type(input_tensor.data_ptr()),
+                buffer_type(output_tensor.data_ptr()),
+                input_tensor.numel(),
+                ncclDataTypeEnum.from_torch(input_tensor.dtype),
+                self.comm,
+                cudaStream_t(stream.cuda_stream),
+            )
+
+    def cp_all_gather_into_tensor(
+        self,
+        output_tensor: torch.Tensor,
+        input_tensor: torch.Tensor,
+        stream=None,
+        sizes: Optional[list[int]] = None,
+    ):
+        """
+        Currently, it is mainly used in context parallelism,
+        primarily leveraging pynccl to implement non-blocking allgather communication.
+        """
+        # nccl communicator created on a specific device
+        # will only work on tensors on the same device
+        # otherwise it will cause "illegal memory access"
+        assert input_tensor.device == self.device, (
+            f"this nccl communicator is created to work on {self.device}, "
+            f"but the input tensor is on {input_tensor.device}"
+        )
+        stream = self._resolve_stream(stream)
+        self.nccl.ncclAllGather(
+            buffer_type(input_tensor.data_ptr()),
+            buffer_type(output_tensor.data_ptr()),
+            input_tensor.numel(),
+            ncclDataTypeEnum.from_torch(input_tensor.dtype),
+            self.comm,
+            cudaStream_t(stream.cuda_stream),
+        )
+
+    def reduce_scatter(
+        self,
+        output_tensor: torch.Tensor,
+        input_tensor: torch.Tensor,
+        op: ReduceOp = ReduceOp.SUM,
+        stream=None,
+        sizes: Optional[list[int]] = None,
+    ):
+        if self.disabled:
+            return
+        # nccl communicator created on a specific device
+        # will only work on tensors on the same device
+        # otherwise it will cause "illegal memory access"
+        assert input_tensor.device == self.device, (
+            f"this nccl communicator is created to work on {self.device}, "
+            f"but the input tensor is on {input_tensor.device}"
+        )
+        stream = self._resolve_stream(stream)
+
+        if sizes is not None:
+            split_offset = 0
+            self.nccl.ncclGroupStart()
+            for root, split_size in enumerate(sizes):
+                chunk = input_tensor[split_offset : split_offset + split_size, ...]
+
+                self.nccl.ncclReduce(
+                    buffer_type(chunk.data_ptr()),
+                    buffer_type(output_tensor.data_ptr()),
+                    chunk.numel(),
+                    ncclDataTypeEnum.from_torch(input_tensor.dtype),
+                    ncclRedOpTypeEnum.from_torch(op),
+                    root,
+                    self.comm,
+                    cudaStream_t(stream.cuda_stream),
+                )
+                split_offset += split_size
+            self.nccl.ncclGroupEnd()
+        else:
+            self.nccl.ncclReduceScatter(
+                buffer_type(input_tensor.data_ptr()),
+                buffer_type(output_tensor.data_ptr()),
+                output_tensor.numel(),
+                ncclDataTypeEnum.from_torch(input_tensor.dtype),
+                ncclRedOpTypeEnum.from_torch(op),
+                self.comm,
+                cudaStream_t(stream.cuda_stream),
+            )
+
+    def send(self, tensor: torch.Tensor, dst: int, stream=None):
+        if self.disabled:
+            return
+        assert tensor.device == self.device, (
+            f"this nccl communicator is created to work on {self.device}, "
+            f"but the input tensor is on {tensor.device}"
+        )
+        stream = self._resolve_stream(stream)
+        self.nccl.ncclSend(
+            buffer_type(tensor.data_ptr()),
+            tensor.numel(),
+            ncclDataTypeEnum.from_torch(tensor.dtype),
+            dst,
+            self.comm,
+            cudaStream_t(stream.cuda_stream),
+        )
+
+    def recv(self, tensor: torch.Tensor, src: int, stream=None):
+        if self.disabled:
+            return
+        assert tensor.device == self.device, (
+            f"this nccl communicator is created to work on {self.device}, "
+            f"but the input tensor is on {tensor.device}"
+        )
+        stream = self._resolve_stream(stream)
+        self.nccl.ncclRecv(
+            buffer_type(tensor.data_ptr()),
+            tensor.numel(),
+            ncclDataTypeEnum.from_torch(tensor.dtype),
+            src,
+            self.comm,
+            cudaStream_t(stream.cuda_stream),
+        )
+
+    def broadcast(self, tensor: torch.Tensor, src: int, stream=None):
+        if self.disabled:
+            return
+        assert tensor.device == self.device, (
+            f"this nccl communicator is created to work on {self.device}, "
+            f"but the input tensor is on {tensor.device}"
+        )
+        stream = self._resolve_stream(stream)
+
+        if src == self.rank:
+            sendbuff = buffer_type(tensor.data_ptr())
+            # NCCL requires the sender also to have a receive buffer
+            recvbuff = buffer_type(tensor.data_ptr())
+        else:
+            sendbuff = buffer_type()
+            recvbuff = buffer_type(tensor.data_ptr())
+        self.nccl.ncclBroadcast(
+            sendbuff,
+            recvbuff,
+            tensor.numel(),
+            ncclDataTypeEnum.from_torch(tensor.dtype),
+            src,
+            self.comm,
+            cudaStream_t(stream.cuda_stream),
+        )
+
+    def register_comm_window_raw(self, ptr: int, size: int):
+        return self.nccl.ncclCommWindowRegister(self.comm, buffer_type(ptr), size, 1)
+
+    def deregister_comm_window(self, window):
+        return self.nccl.ncclCommWindowDeregister(self.comm, window)
+
+    def group_start(self):
+        self.nccl.ncclGroupStart()
+
+    def group_end(self):
+        self.nccl.ncclGroupEnd()
+
+    @contextmanager
+    def change_state(
+        self, enable: Optional[bool] = None, stream: Optional[torch.cuda.Stream] = None
+    ):
+        """
+        A context manager to change the state of the communicator.
+        """
+        if enable is None:
+            # guess a default value when not specified
+            enable = self.available
+
+        if stream is None:
+            stream = self.stream
+
+        old_disable = self.disabled
+        old_stream = self.stream
+
+        self.stream = stream
+        self.disabled = not enable
+        yield
+
+        self.disabled = old_disable
+        self.stream = old_stream
diff --git a/sglang/python/sglang/srt/distributed/device_communicators/pynccl_allocator.py b/sglang/python/sglang/srt/distributed/device_communicators/pynccl_allocator.py
new file mode 100644
index 0000000000000000000000000000000000000000..761b3c59224b4a51a040e1269c1a1e0e5776453c
--- /dev/null
+++ b/sglang/python/sglang/srt/distributed/device_communicators/pynccl_allocator.py
@@ -0,0 +1,213 @@
+import os
+import tempfile
+from contextlib import nullcontext
+
+import torch
+from torch.cuda.memory import CUDAPluggableAllocator
+
+from sglang.srt.distributed.parallel_state import GroupCoordinator
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils.common import torch_release
+
+after_2_8_0 = torch_release >= (2, 8)
+
+nccl_allocator_source = """
+
+#include <cuda_runtime.h>
+
+extern "C" {
+
+// copy from https://github.com/NVIDIA/nccl/blob/master/src/nccl.h.in
+typedef enum { ncclSuccess                 =  0,
+               ncclUnhandledCudaError      =  1,
+               ncclSystemError             =  2,
+               ncclInternalError           =  3,
+               ncclInvalidArgument         =  4,
+               ncclInvalidUsage            =  5,
+               ncclRemoteError             =  6,
+               ncclInProgress              =  7,
+               ncclNumResults              =  8 } ncclResult_t;
+typedef struct ncclComm* ncclComm_t;
+typedef struct ncclWindow_vidmem* ncclWindow_t;
+ncclResult_t  ncclCommWindowRegister(ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags);
+#define NCCL_WIN_COLL_SYMMETRIC 0x01
+
+ncclResult_t  ncclMemAlloc(void** ptr, size_t size);
+ncclResult_t  ncclMemFree(void *ptr);
+const char*  ncclGetErrorString(ncclResult_t result);
+
+#define NCCLCHECK(cmd) do {                                               \
+  ncclResult_t res = cmd;                                                 \
+  if (res != ncclSuccess) {                                               \
+    fprintf(stderr, "ERROR: NCCL symmetric memory allocation failed. Most likely out of device memory. '%s'\\n", \
+           ncclGetErrorString(res));                       \
+    return NULL;                                                        \
+  }                                                                       \
+} while(0)
+
+void* nccl_alloc_plug(size_t size, int device, void* stream) {
+  void* ptr;
+  NCCLCHECK(ncclMemAlloc(&ptr, size));
+
+  const char *str_val = getenv("SGLANG_TMP_NCCL_COMM_VALUE");
+  char *endptr;
+  void* int_val = (void *)strtoull(str_val, &endptr, 0);
+
+  ncclComm_t comm = (ncclComm_t)(int_val);
+  ncclWindow_t win;
+  NCCLCHECK(ncclCommWindowRegister(comm, ptr, size, &win, NCCL_WIN_COLL_SYMMETRIC));
+
+  return ptr;
+}
+
+void nccl_free_plug(void* ptr, size_t size, int device, void* stream) {
+  ncclResult_t err = ncclMemFree(ptr);
+}
+
+}
+"""
+
+_allocator = None
+_mem_pool = None
+_graph_pool_id = None
+_cur_device = None
+_active_symmetric_memory_context = None
+
+
+def is_symmetric_memory_enabled():
+    try:
+        return get_global_server_args().enable_symm_mem
+    except ValueError:
+        return False
+
+
+def set_graph_pool_id(graph_pool_id):
+    global _graph_pool_id
+    _graph_pool_id = graph_pool_id
+
+
+def disable_symmetric_memory_context():
+    if _active_symmetric_memory_context is None:
+        return None
+    saved_context = _active_symmetric_memory_context
+    saved_context.__exit__(None, None, None)
+    return saved_context
+
+
+def restore_symmetric_memory_context(saved_context):
+    if saved_context is not None:
+        saved_context.__enter__()
+
+
+def get_nccl_mem_pool():
+    global _allocator, _mem_pool, _cur_device
+    if _mem_pool is None:
+        import torch.utils.cpp_extension
+
+        out_dir = os.path.join(tempfile.gettempdir(), "symm_allocator")
+        os.makedirs(out_dir, exist_ok=True)
+        # Make sure to clean up leftover pytorch lock files
+        # from previous runs and synchronize across processes
+        # right after
+        try:
+            os.remove(os.path.join(out_dir, "lock"))
+        except FileNotFoundError:
+            pass
+        torch.distributed.barrier()
+
+        nccl_allocator_libname = "nccl_allocator"
+        torch.utils.cpp_extension.load_inline(
+            name=nccl_allocator_libname,
+            cpp_sources=nccl_allocator_source,
+            with_cuda=True,
+            extra_ldflags=["-lnccl"],
+            verbose=True,
+            is_python_module=False,
+            build_directory=out_dir,
+        )
+        _allocator = CUDAPluggableAllocator(
+            f"{out_dir}/{nccl_allocator_libname}.so",
+            "nccl_alloc_plug",
+            "nccl_free_plug",
+        ).allocator()
+        _mem_pool = torch.cuda.MemPool(_allocator)
+        _cur_device = torch.cuda.current_device()
+    return _mem_pool
+
+
+class SymmetricMemoryContext:
+    """
+    Context manager for using symmetric memory with pynccl.
+
+    To Utilize the symmetric memory feature in NCCL, the buffers need to be allocated
+    by `ncclMemAlloc` and registered by `ncclCommWindowRegister`. Due to this, we introduce
+    this context manager. All tensors created under this context will be correctly
+    allocated and registered with a custom allocator.
+    """
+
+    def __init__(
+        self,
+        group_coordinator: GroupCoordinator,
+    ):
+        self.group_coordinator = group_coordinator
+        self._mem_pool_ctx = torch.cuda.use_mem_pool(get_nccl_mem_pool())
+        self.is_graph_capture = torch.cuda.is_current_stream_capturing()
+        self.exited = False
+
+    def __enter__(self):
+        assert (
+            self.group_coordinator.pynccl_comm is not None
+        ), f"Symmetric memory requires pynccl to be enabled in group '{self.group_coordinator.group_name}'"
+
+        if self.is_graph_capture:
+            assert (
+                _graph_pool_id is not None
+            ), "graph_pool_id is not set under graph capture"
+            # Pause graph memory pool to use symmetric memory with cuda graph
+            if after_2_8_0:
+                torch._C._cuda_endAllocateToPool(_cur_device, _graph_pool_id)
+            else:
+                torch._C._cuda_endAllocateCurrentStreamToPool(
+                    _cur_device, _graph_pool_id
+                )
+
+        if self.exited:
+            # mempool ctx (@contextlib.contextmanager) is not re-entrant
+            self._mem_pool_ctx = torch.cuda.use_mem_pool(get_nccl_mem_pool())
+            self.exited = False
+        self._mem_pool_ctx.__enter__()
+
+        # Set the env var to pass this argument to the C functions.
+        os.environ["SGLANG_TMP_NCCL_COMM_VALUE"] = str(
+            self.group_coordinator.pynccl_comm.comm.value
+        )
+
+        global _active_symmetric_memory_context
+        _active_symmetric_memory_context = self
+
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self._mem_pool_ctx.__exit__(exc_type, exc_val, exc_tb)
+
+        if self.is_graph_capture:
+            if after_2_8_0:
+                torch._C._cuda_beginAllocateCurrentThreadToPool(
+                    _cur_device, _graph_pool_id
+                )
+            else:
+                torch._C._cuda_beginAllocateToPool(_cur_device, _graph_pool_id)
+
+        global _active_symmetric_memory_context
+        _active_symmetric_memory_context = None
+
+        self.exited = True
+
+
+def use_symmetric_memory(group_coordinator: GroupCoordinator, disabled: bool = False):
+    disabled = (
+        not is_symmetric_memory_enabled()
+        or disabled
+        or group_coordinator.world_size == 1
+    )
+    return SymmetricMemoryContext(group_coordinator) if not disabled else nullcontext()
diff --git a/sglang/python/sglang/srt/distributed/device_communicators/pynccl_wrapper.py b/sglang/python/sglang/srt/distributed/device_communicators/pynccl_wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..59775ad197da4ba1a7015a63bb8db315ac92b8d8
--- /dev/null
+++ b/sglang/python/sglang/srt/distributed/device_communicators/pynccl_wrapper.py
@@ -0,0 +1,565 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/device_communicators/pynccl.py
+
+# This file is a pure Python wrapper for the NCCL library.
+# The main purpose is to use NCCL combined with CUDA graph.
+# Before writing this script, we tried the following approach:
+# 1. We tried to use `cupy`, it calls NCCL correctly, but `cupy` itself
+#  often gets stuck when initializing the NCCL communicator.
+# 2. We tried to use `torch.distributed`, but `torch.distributed.all_reduce`
+#  contains many other potential cuda APIs, that are not allowed during
+#  capturing the CUDA graph. For further details, please check
+# https://discuss.pytorch.org/t/pytorch-cudagraph-with-nccl-operation-failed/ .
+#
+# Another rejected idea is to write a C/C++ binding for NCCL. It is usually
+# doable, but we often encounter issues related with nccl versions, and need
+# to switch between different versions of NCCL. See
+# https://github.com/NVIDIA/nccl/issues/1234 for more details.
+# A C/C++ binding is not flexible enough to handle this. It requires
+# recompilation of the code every time we want to switch between different
+# versions. This current implementation, with a **pure** Python wrapper, is
+# more flexible. We can easily switch between different versions of NCCL by
+# changing the environment variable `SGLANG_NCCL_SO_PATH`, or the `so_file`
+# variable in the code.
+
+import ctypes
+import logging
+import os
+import platform
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional
+
+import torch
+from torch.distributed import ReduceOp
+
+logger = logging.getLogger(__name__)
+
+
+def find_nccl_library() -> str:
+    """
+    We either use the library file specified by the `SGLANG_NCCL_SO_PATH`
+    environment variable, or we find the library file brought by PyTorch.
+    After importing `torch`, `libnccl.so.2`, `librccl.so.1` or `libmccl.so.2`
+    can be found by `ctypes` automatically.
+    """
+
+    # so_file can be set to None in sglang
+    so_file = os.environ.get("SGLANG_NCCL_SO_PATH", None)
+
+    # manually load the nccl library
+    if so_file:
+        logger.info(
+            "Found nccl from environment variable SGLANG_NCCL_SO_PATH=%s", so_file
+        )
+    else:
+        if torch.version.cuda is not None:
+            so_file = "libnccl.so.2"
+        elif torch.version.hip is not None:
+            so_file = "librccl.so.1"
+        elif hasattr(torch.version, "musa") and torch.version.musa is not None:
+            so_file = "libmccl.so.2"
+        else:
+            raise ValueError("NCCL only supports CUDA, ROCm and MUSA backends.")
+        logger.debug("Found nccl from library %s", so_file)
+    return so_file
+
+
+# === export types and functions from nccl to Python ===
+# for the original nccl definition, please check
+# https://github.com/NVIDIA/nccl/blob/master/src/nccl.h.in
+
+ncclResult_t = ctypes.c_int
+ncclComm_t = ctypes.c_void_p
+ncclWindow_t = ctypes.c_void_p
+
+
+class ncclUniqueId(ctypes.Structure):
+    _fields_ = [("internal", ctypes.c_byte * 128)]
+
+
+cudaStream_t = ctypes.c_void_p
+buffer_type = ctypes.c_void_p
+
+ncclDataType_t = ctypes.c_int
+
+
+class ncclDataTypeEnum:
+    ncclInt8 = 0
+    ncclChar = 0
+    ncclUint8 = 1
+    ncclInt32 = 2
+    ncclInt = 2
+    ncclUint32 = 3
+    ncclInt64 = 4
+    ncclUint64 = 5
+    ncclFloat16 = 6
+    ncclHalf = 6
+    ncclFloat32 = 7
+    ncclFloat = 7
+    ncclFloat64 = 8
+    ncclDouble = 8
+    ncclBfloat16 = 9
+    ncclNumTypes = 10
+
+    @classmethod
+    def from_torch(cls, dtype: torch.dtype) -> int:
+        if dtype == torch.int8:
+            return cls.ncclInt8
+        if dtype == torch.uint8:
+            return cls.ncclUint8
+        if dtype == torch.int32:
+            return cls.ncclInt32
+        if dtype == torch.int64:
+            return cls.ncclInt64
+        if dtype == torch.float16:
+            return cls.ncclFloat16
+        if dtype == torch.float32:
+            return cls.ncclFloat32
+        if dtype == torch.float64:
+            return cls.ncclFloat64
+        if dtype == torch.bfloat16:
+            return cls.ncclBfloat16
+        raise ValueError(f"Unsupported dtype: {dtype}")
+
+
+ncclRedOp_t = ctypes.c_int
+
+
+class ncclRedOpTypeEnum:
+    ncclSum = 0
+    ncclProd = 1
+    ncclMax = 2
+    ncclMin = 3
+    ncclAvg = 4
+    ncclNumOps = 5
+
+    @classmethod
+    def from_torch(cls, op: ReduceOp) -> int:
+        if op == ReduceOp.SUM:
+            return cls.ncclSum
+        if op == ReduceOp.PRODUCT:
+            return cls.ncclProd
+        if op == ReduceOp.MAX:
+            return cls.ncclMax
+        if op == ReduceOp.MIN:
+            return cls.ncclMin
+        if op == ReduceOp.AVG:
+            return cls.ncclAvg
+        raise ValueError(f"Unsupported op: {op}")
+
+
+@dataclass
+class Function:
+    name: str
+    restype: Any
+    argtypes: List[Any]
+
+
+class NCCLLibrary:
+    exported_functions = [
+        # const char* ncclGetErrorString(ncclResult_t result)
+        Function("ncclGetErrorString", ctypes.c_char_p, [ncclResult_t]),
+        # ncclResult_t  ncclGetVersion(int *version);
+        Function("ncclGetVersion", ncclResult_t, [ctypes.POINTER(ctypes.c_int)]),
+        # ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
+        Function("ncclGetUniqueId", ncclResult_t, [ctypes.POINTER(ncclUniqueId)]),
+        # ncclResult_t  ncclCommInitRank(
+        #   ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
+        # note that ncclComm_t is a pointer type, so the first argument
+        # is a pointer to a pointer
+        Function(
+            "ncclCommInitRank",
+            ncclResult_t,
+            [ctypes.POINTER(ncclComm_t), ctypes.c_int, ncclUniqueId, ctypes.c_int],
+        ),
+        # ncclResult_t  ncclAllReduce(
+        #   const void* sendbuff, void* recvbuff, size_t count,
+        #   ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
+        #   cudaStream_t stream);
+        # note that cudaStream_t is a pointer type, so the last argument
+        # is a pointer
+        Function(
+            "ncclAllReduce",
+            ncclResult_t,
+            [
+                buffer_type,
+                buffer_type,
+                ctypes.c_size_t,
+                ncclDataType_t,
+                ncclRedOp_t,
+                ncclComm_t,
+                cudaStream_t,
+            ],
+        ),
+        # ncclResult_t  ncclAllGather(
+        #   const void* sendbuff, void* recvbuff, size_t count,
+        #   ncclDataType_t datatype, ncclComm_t comm,
+        #   cudaStream_t stream);
+        # note that cudaStream_t is a pointer type, so the last argument
+        # is a pointer
+        Function(
+            "ncclAllGather",
+            ncclResult_t,
+            [
+                buffer_type,
+                buffer_type,
+                ctypes.c_size_t,
+                ncclDataType_t,
+                ncclComm_t,
+                cudaStream_t,
+            ],
+        ),
+        # ncclResult_t  ncclReduce(
+        #   const void* sendbuff, void* recvbuff, size_t count,
+        #   ncclDataType_t datatype, ncclRedOp_t op, int root,
+        #   ncclComm_t comm,  cudaStream_t stream);
+        # note that cudaStream_t is a pointer type, so the last argument
+        # is a pointer
+        Function(
+            "ncclReduce",
+            ncclResult_t,
+            [
+                buffer_type,
+                buffer_type,
+                ctypes.c_size_t,
+                ncclDataType_t,
+                ncclRedOp_t,
+                ctypes.c_int,
+                ncclComm_t,
+                cudaStream_t,
+            ],
+        ),
+        # ncclResult_t  ncclReduceScatter(
+        #   const void* sendbuff, void* recvbuff, size_t count,
+        #   ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
+        #   cudaStream_t stream);
+        # note that cudaStream_t is a pointer type, so the last argument
+        # is a pointer
+        Function(
+            "ncclReduceScatter",
+            ncclResult_t,
+            [
+                buffer_type,
+                buffer_type,
+                ctypes.c_size_t,
+                ncclDataType_t,
+                ncclRedOp_t,
+                ncclComm_t,
+                cudaStream_t,
+            ],
+        ),
+        # ncclResult_t  ncclSend(
+        #   const void* sendbuff, size_t count, ncclDataType_t datatype,
+        #   int dest, ncclComm_t comm, cudaStream_t stream);
+        Function(
+            "ncclSend",
+            ncclResult_t,
+            [
+                buffer_type,
+                ctypes.c_size_t,
+                ncclDataType_t,
+                ctypes.c_int,
+                ncclComm_t,
+                cudaStream_t,
+            ],
+        ),
+        # ncclResult_t  ncclRecv(
+        #   void* recvbuff, size_t count, ncclDataType_t datatype,
+        #   int src, ncclComm_t comm, cudaStream_t stream);
+        Function(
+            "ncclRecv",
+            ncclResult_t,
+            [
+                buffer_type,
+                ctypes.c_size_t,
+                ncclDataType_t,
+                ctypes.c_int,
+                ncclComm_t,
+                cudaStream_t,
+            ],
+        ),
+        # ncclResult_t ncclBroadcast(
+        #   const void* sendbuff, void* recvbuff, size_t count,
+        #   ncclDataType_t datatype, int root, ncclComm_t comm,
+        #   cudaStream_t stream);
+        Function(
+            "ncclBroadcast",
+            ncclResult_t,
+            [
+                buffer_type,
+                buffer_type,
+                ctypes.c_size_t,
+                ncclDataType_t,
+                ctypes.c_int,
+                ncclComm_t,
+                cudaStream_t,
+            ],
+        ),
+        # be cautious! this is a collective call, it will block until all
+        # processes in the communicator have called this function.
+        # because Python object destruction can happen in random order,
+        # it is better not to call it at all.
+        # ncclResult_t  ncclCommDestroy(ncclComm_t comm);
+        Function("ncclCommDestroy", ncclResult_t, [ncclComm_t]),
+        # ncclResult_t ncclGroupStart();
+        Function("ncclGroupStart", ncclResult_t, []),
+        # ncclResult_t ncclGroupEnd();
+        Function("ncclGroupEnd", ncclResult_t, []),
+    ]
+
+    exported_functions_symm_mem = [
+        # ncclResult_t ncclCommWindowRegister(ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags);
+        Function(
+            "ncclCommWindowRegister",
+            ncclResult_t,
+            [
+                ncclComm_t,
+                buffer_type,
+                ctypes.c_size_t,
+                ctypes.POINTER(ncclWindow_t),
+                ctypes.c_int,
+            ],
+        ),
+        # ncclResult_t ncclCommWindowDeregister(ncclComm_t comm, ncclWindow_t win);
+        Function("ncclCommWindowDeregister", ncclResult_t, [ncclComm_t, ncclWindow_t]),
+    ]
+
+    # class attribute to store the mapping from the path to the library
+    # to avoid loading the same library multiple times
+    path_to_library_cache: Dict[str, Any] = {}
+
+    # class attribute to store the mapping from library path
+    #  to the corresponding dictionary
+    path_to_dict_mapping: Dict[str, Dict[str, Any]] = {}
+
+    def __init__(self, so_file: Optional[str] = None):
+
+        so_file = so_file or find_nccl_library()
+
+        try:
+            if so_file not in NCCLLibrary.path_to_dict_mapping:
+                lib = ctypes.CDLL(so_file)
+                NCCLLibrary.path_to_library_cache[so_file] = lib
+            self.lib = NCCLLibrary.path_to_library_cache[so_file]
+        except Exception as e:
+            logger.error(
+                "Failed to load NCCL library from %s . "
+                "It is expected if you are not running on NVIDIA/AMD/MTHREADS GPUs. "
+                "Otherwise, the nccl library might not exist, be corrupted "
+                "or it does not support the current platform %s. "
+                "If you already have the library, please set the "
+                "environment variable SGLANG_NCCL_SO_PATH"
+                " to point to the correct nccl library path.",
+                so_file,
+                platform.platform(),
+            )
+            raise e
+
+        if so_file not in NCCLLibrary.path_to_dict_mapping:
+            _funcs: Dict[str, Any] = {}
+            exported_functions = NCCLLibrary.exported_functions
+            if hasattr(self.lib, "ncclCommWindowRegister"):
+                exported_functions.extend(NCCLLibrary.exported_functions_symm_mem)
+            for func in exported_functions:
+                f = getattr(self.lib, func.name)
+                f.restype = func.restype
+                f.argtypes = func.argtypes
+                _funcs[func.name] = f
+            NCCLLibrary.path_to_dict_mapping[so_file] = _funcs
+        self._funcs = NCCLLibrary.path_to_dict_mapping[so_file]
+
+    def ncclGetErrorString(self, result: ncclResult_t) -> str:
+        return self._funcs["ncclGetErrorString"](result).decode("utf-8")
+
+    def NCCL_CHECK(self, result: ncclResult_t) -> None:
+        if result != 0:
+            error_str = self.ncclGetErrorString(result)
+            raise RuntimeError(f"NCCL error: {error_str}")
+
+    def ncclGetRawVersion(self) -> int:
+        version = ctypes.c_int()
+        self.NCCL_CHECK(self._funcs["ncclGetVersion"](ctypes.byref(version)))
+        # something like 21903
+        return version.value
+
+    def ncclGetVersion(self) -> str:
+        version_str = str(self.ncclGetRawVersion())
+        # something like 21903 --> "2.19.3"
+        major = version_str[0].lstrip("0")
+        minor = version_str[1:3].lstrip("0")
+        patch = version_str[3:].lstrip("0")
+        return f"{major}.{minor}.{patch}"
+
+    def ncclGetUniqueId(self) -> ncclUniqueId:
+        unique_id = ncclUniqueId()
+        self.NCCL_CHECK(self._funcs["ncclGetUniqueId"](ctypes.byref(unique_id)))
+        return unique_id
+
+    def ncclCommInitRank(
+        self, world_size: int, unique_id: ncclUniqueId, rank: int
+    ) -> ncclComm_t:
+        comm = ncclComm_t()
+        self.NCCL_CHECK(
+            self._funcs["ncclCommInitRank"](
+                ctypes.byref(comm), world_size, unique_id, rank
+            )
+        )
+        return comm
+
+    def ncclAllReduce(
+        self,
+        sendbuff: buffer_type,
+        recvbuff: buffer_type,
+        count: int,
+        datatype: int,
+        op: int,
+        comm: ncclComm_t,
+        stream: cudaStream_t,
+    ) -> None:
+        # `datatype` actually should be `ncclDataType_t`
+        # and `op` should be `ncclRedOp_t`
+        # both are aliases of `ctypes.c_int`
+        # when we pass int to a function, it will be converted to `ctypes.c_int`
+        # by ctypes automatically
+        self.NCCL_CHECK(
+            self._funcs["ncclAllReduce"](
+                sendbuff, recvbuff, count, datatype, op, comm, stream
+            )
+        )
+
+    def ncclReduce(
+        self,
+        sendbuff: buffer_type,
+        recvbuff: buffer_type,
+        count: int,
+        datatype: int,
+        op: int,
+        root: int,
+        comm: ncclComm_t,
+        stream: cudaStream_t,
+    ) -> None:
+        # `datatype` actually should be `ncclDataType_t`
+        # and `op` should be `ncclRedOp_t`
+        # both are aliases of `ctypes.c_int`
+        # when we pass int to a function, it will be converted to `ctypes.c_int`
+        # by ctypes automatically
+        self.NCCL_CHECK(
+            self._funcs["ncclReduce"](
+                sendbuff, recvbuff, count, datatype, op, root, comm, stream
+            )
+        )
+
+    def ncclReduceScatter(
+        self,
+        sendbuff: buffer_type,
+        recvbuff: buffer_type,
+        count: int,
+        datatype: int,
+        op: int,
+        comm: ncclComm_t,
+        stream: cudaStream_t,
+    ) -> None:
+        # `datatype` actually should be `ncclDataType_t`
+        # and `op` should be `ncclRedOp_t`
+        # both are aliases of `ctypes.c_int`
+        # when we pass int to a function, it will be converted to `ctypes.c_int`
+        # by ctypes automatically
+        self.NCCL_CHECK(
+            self._funcs["ncclReduceScatter"](
+                sendbuff, recvbuff, count, datatype, op, comm, stream
+            )
+        )
+
+    def ncclAllGather(
+        self,
+        sendbuff: buffer_type,
+        recvbuff: buffer_type,
+        count: int,
+        datatype: int,
+        comm: ncclComm_t,
+        stream: cudaStream_t,
+    ) -> None:
+        # `datatype` actually should be `ncclDataType_t`
+        # which is an aliases of `ctypes.c_int`
+        # when we pass int to a function, it will be converted to `ctypes.c_int`
+        # by ctypes automatically
+        self.NCCL_CHECK(
+            self._funcs["ncclAllGather"](
+                sendbuff, recvbuff, count, datatype, comm, stream
+            )
+        )
+
+    def ncclSend(
+        self,
+        sendbuff: buffer_type,
+        count: int,
+        datatype: int,
+        dest: int,
+        comm: ncclComm_t,
+        stream: cudaStream_t,
+    ) -> None:
+        self.NCCL_CHECK(
+            self._funcs["ncclSend"](sendbuff, count, datatype, dest, comm, stream)
+        )
+
+    def ncclRecv(
+        self,
+        recvbuff: buffer_type,
+        count: int,
+        datatype: int,
+        src: int,
+        comm: ncclComm_t,
+        stream: cudaStream_t,
+    ) -> None:
+        self.NCCL_CHECK(
+            self._funcs["ncclRecv"](recvbuff, count, datatype, src, comm, stream)
+        )
+
+    def ncclBroadcast(
+        self,
+        sendbuff: buffer_type,
+        recvbuff: buffer_type,
+        count: int,
+        datatype: int,
+        root: int,
+        comm: ncclComm_t,
+        stream: cudaStream_t,
+    ) -> None:
+        self.NCCL_CHECK(
+            self._funcs["ncclBroadcast"](
+                sendbuff, recvbuff, count, datatype, root, comm, stream
+            )
+        )
+
+    def ncclCommDestroy(self, comm: ncclComm_t) -> None:
+        self.NCCL_CHECK(self._funcs["ncclCommDestroy"](comm))
+
+    def ncclCommWindowRegister(
+        self, comm: ncclComm_t, buff: buffer_type, size: int, win_flags: int
+    ) -> ncclWindow_t:
+        window = ncclWindow_t()
+        self.NCCL_CHECK(
+            self._funcs["ncclCommWindowRegister"](
+                comm, buff, size, ctypes.byref(window), win_flags
+            )
+        )
+        return window
+
+    def ncclCommWindowDeregister(self, comm: ncclComm_t, window: ncclWindow_t) -> None:
+        self.NCCL_CHECK(self._funcs["ncclCommWindowDeregister"](comm, window))
+
+    def ncclGroupStart(self) -> None:
+        self.NCCL_CHECK(self._funcs["ncclGroupStart"]())
+
+    def ncclGroupEnd(self) -> None:
+        self.NCCL_CHECK(self._funcs["ncclGroupEnd"]())
+
+
+__all__ = [
+    "NCCLLibrary",
+    "ncclDataTypeEnum",
+    "ncclRedOpTypeEnum",
+    "ncclUniqueId",
+    "ncclComm_t",
+    "cudaStream_t",
+    "buffer_type",
+]
diff --git a/sglang/python/sglang/srt/distributed/device_communicators/quick_all_reduce.py b/sglang/python/sglang/srt/distributed/device_communicators/quick_all_reduce.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9d51246ed8c7234b73941c49c9d3c35ecc218e5
--- /dev/null
+++ b/sglang/python/sglang/srt/distributed/device_communicators/quick_all_reduce.py
@@ -0,0 +1,267 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+import os
+from enum import Enum
+from functools import cache
+from typing import Union
+
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+import sglang.srt.distributed.device_communicators.custom_all_reduce_ops as ops
+from sglang.srt.distributed.device_communicators.custom_all_reduce_utils import (
+    is_full_nvlink,
+    is_weak_contiguous,
+)
+from sglang.srt.distributed.parallel_state import in_the_same_node_as
+from sglang.srt.utils import is_cuda, is_hip
+
+logger = logging.getLogger(__name__)
+
+_is_cuda = is_cuda()
+_is_hip = is_hip()
+
+
+@cache
+def qr_rocm_arch_available():
+    if not _is_hip:
+        return False
+    try:
+        props = torch.cuda.get_device_properties(0)
+        gcn_arch = getattr(props, "gcnArchName", "")
+        supported_archs = ["gfx94", "gfx95"]
+        return any(gfx in gcn_arch for gfx in supported_archs)
+    except Exception as e:
+        logger.warning("Failed to determine ROCm for quick allreduce: %s", e)
+        return False
+
+
+class QuickReduceRegime(Enum):
+    FP = 0
+    INT8 = 1
+    INT6 = 2
+    INT4 = 3
+    NONE = 4
+
+
+MB = 1024 * 1024
+
+
+class QuickAllReduce:
+
+    _SUPPORTED_WORLD_SIZES = [2, 4, 8]
+    _SUPPORTED_DTYPES = [torch.float16, torch.bfloat16]
+    # The following data is based on kernel tests.
+    # In this order [FP, INT8, INT6, INT4].
+    _QR_MIN_SIZE = {
+        (torch.float16, 2): [1 * MB, 2 * MB, 2 * MB, 1 * MB],
+        (torch.float16, 4): [1 * MB, 16 * MB, 4 * MB, 2 * MB],
+        (torch.float16, 8): [16 * MB, 4 * MB, 4 * MB, 2 * MB],
+        (torch.bfloat16, 2): [2 * MB, 8 * MB, 8 * MB, 8 * MB],
+        (torch.bfloat16, 4): [8 * MB, 64 * MB, 64 * MB, 16 * MB],
+        (torch.bfloat16, 8): [16 * MB, 2048 * MB, 2048 * MB, 2048 * MB],
+    }
+
+    def __init__(
+        self, group: ProcessGroup, device: Union[int, str, torch.device]
+    ) -> None:
+        """
+        Custom allreduce provides non-destructive acceleration and is
+        available for CUDA and ROCm MI300 series.
+        Custom quick allreduce leverages quantization for further
+        acceleration on ROCm. It currently supports Q8, Q6, and Q4
+        quantization formats and FP(float16, bfloat16).
+        Quick allreduce is designed as a complement to custom allreduce.
+        Its initialization requires even stricter conditions.
+        Only the ROCm MI300 series is supported for quick allreduce at
+        this time.
+        Args:
+            group: the process group to work on. If None, it will use the
+                default process group.
+            device: the device to bind the CustomAllreduce to. If None,
+                it will be bind to f"cuda:{local_rank}".
+        It is the caller's responsibility to make sure each communicator
+        is bind to a unique device, and all communicators in this group
+        are in the same node.
+        """
+        self.disabled = True
+        if not qr_rocm_arch_available():
+            logger.debug(
+                "Custom quick allreduce is only supported on ROCm MI300 series."
+            )
+            return
+
+        if not ops.IS_QUICK_AR_AVAILABLE:
+            # disable because of missing quick reduce library
+            # e.g. in a cuda environment
+            logger.info(
+                "Custom quick allreduce is disabled because "
+                "of missing custom quick allreduce library"
+            )
+            return
+
+        self.group = group
+        assert (
+            dist.get_backend(group) != dist.Backend.NCCL
+        ), "Custom quick allreduce should be attached to a non-NCCL group."
+        if not all(in_the_same_node_as(group, source_rank=0)):
+            # No need to initialize custom quick allreduce for
+            # multi-node case.
+            logger.warning(
+                "Custom quick allreduce is disabled because this "
+                "process group spans across nodes."
+            )
+            return
+        rank = dist.get_rank(group=self.group)
+        world_size = dist.get_world_size(group=self.group)
+        self.rank = rank
+        self.world_size = world_size
+        if world_size == 1:
+            # No need to initialize QuickReduce for single GPU case.
+            return
+
+        if world_size not in QuickAllReduce._SUPPORTED_WORLD_SIZES:
+            logger.warning(
+                "Custom quick allreduce is disabled due to an "
+                "unsupported world size: %d. Supported world sizes: %s.",
+                world_size,
+                str(QuickAllReduce._SUPPORTED_WORLD_SIZES),
+            )
+            return
+
+        if isinstance(device, int):
+            device = torch.device(f"cuda:{device}")
+        elif isinstance(device, str):
+            device = torch.device(device)
+        assert isinstance(device, torch.device)
+        self.device = device
+
+        cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
+        if cuda_visible_devices:
+            device_ids = list(map(int, cuda_visible_devices.split(",")))
+        else:
+            device_ids = list(range(torch.cuda.device_count()))
+        physical_device_id = device_ids[device.index]
+        tensor = torch.tensor([physical_device_id], dtype=torch.int, device="cpu")
+        gather_list = [
+            torch.tensor([0], dtype=torch.int, device="cpu")
+            for _ in range(self.world_size)
+        ]
+        dist.all_gather(gather_list, tensor, group=self.group)
+        physical_device_ids = [t.item() for t in gather_list]
+
+        # test nvlink first, this will filter out most of the cases
+        # where custom quick allreduce is not supported
+        # this checks hardware and driver support for NVLink
+        if _is_cuda or _is_hip:
+            self.fully_connected = is_full_nvlink(physical_device_ids, self.world_size)
+        if self.world_size > 2 and not self.fully_connected:
+            logger.debug(
+                "Custom quick allreduce is disabled because it's not supported "
+                "on more than two PCIe-only GPUs. "
+            )
+            return
+
+        self.init_quick_all_reduce()
+
+    def init_quick_all_reduce(self):
+        # On RocM, bfloat16 kernels are slower than fp16
+        # due to slower match operations
+        # If environment variable is set to 1, we convert input to fp16
+        self.use_fp16_kernels = int(
+            os.environ.get("ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16", 1)
+        )
+        regime_str = os.environ.get("ROCM_QUICK_REDUCE_QUANTIZATION", "NONE")
+        if regime_str not in QuickReduceRegime.__members__:
+            logger.warning(
+                "Custom quick allreduce:",
+                f"Invalid quantization level: {regime_str}. "
+                "Supported levels: "
+                f"{list(QuickReduceRegime.__members__.keys())}",
+            )
+            return
+
+        if regime_str == "NONE":
+            logger.debug(
+                "Custom quick allreduce is disabled based "
+                "on env variable "
+                "ROCM_QUICK_REDUCE_QUANTIZATION='NONE'"
+            )
+            return
+        self.qr_quant_level = QuickReduceRegime[regime_str]
+
+        # TODO: If the dtype is not bfloat16 or then float16,
+        # quickallreduce should not be created.
+
+        # ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB is specified in MB
+        qr_max_size = int(os.environ.get("ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB", 0))
+        if qr_max_size > 0:
+            if qr_max_size < 1:
+                logger.info(
+                    "You should not set a max_size smaller than 1MB, which can "
+                    "lead to error or degradation to custom allreduce or rccl."
+                )
+            qr_max_size = qr_max_size * MB
+        # If qr_max_size is None, then 2GB is used by default.
+        self._ptr = ops.init_custom_qr(self.rank, self.world_size, qr_max_size)
+        self.qr_max_size = qr_max_size if qr_max_size > 0 else ops.qr_max_size()
+        self.create_shared_buffer()
+        self.disabled = False
+
+    def create_shared_buffer(self):
+        """
+        Creates a shared buffer for quickreduce.
+        Has to be called after init_custom_qr
+        """
+        handle = ops.qr_get_handle(self._ptr)
+        world_size = dist.get_world_size(group=self.group)
+        handles = [None] * world_size
+        dist.all_gather_object(handles, handle, group=self.group)
+        ops.qr_open_handles(self._ptr, handles)
+
+    def should_quick_allreduce(self, inp: torch.Tensor):
+        """
+        Check if quickreduce is available
+        """
+        if self.disabled:
+            return False
+        if inp.dtype not in self._SUPPORTED_DTYPES:
+            return False
+        inp_size = inp.numel() * inp.element_size()
+        # custom quick allreduce requires input byte size to be
+        # multiples of 16
+        if inp_size % 16 != 0:
+            return False
+        if not is_weak_contiguous(inp):
+            return False
+        dtype = inp.dtype
+        if self.use_fp16_kernels:
+            dtype = torch.float16
+        return (
+            inp_size <= self.qr_max_size
+            and inp_size
+            >= self._QR_MIN_SIZE[(dtype, self.world_size)][self.qr_quant_level.value]
+        )
+
+    def quick_all_reduce(self, inp: torch.Tensor, *, out: torch.Tensor = None):
+        """Performs an out-of-place custom quick all reduce."""
+        # quick allreduce doesn't require a separate graph mode,
+        # as QR uses static IPC buffer.
+        if out is None:
+            out = torch.empty_like(inp)
+        ops.qr_all_reduce(
+            self._ptr, inp, out, self.qr_quant_level.value, self.use_fp16_kernels
+        )
+        return out
+
+    def close(self):
+        if not self.disabled and getattr(self, "_ptr", None):
+            if ops is not None:
+                ops.qr_destroy(self._ptr)
+            self._ptr = 0
+            self.disabled = True
+
+    def __del__(self):
+        self.close()
diff --git a/sglang/python/sglang/srt/distributed/device_communicators/shm_broadcast.py b/sglang/python/sglang/srt/distributed/device_communicators/shm_broadcast.py
new file mode 100644
index 0000000000000000000000000000000000000000..39788fe303698d8522dbbb40a835fe6e20c43e87
--- /dev/null
+++ b/sglang/python/sglang/srt/distributed/device_communicators/shm_broadcast.py
@@ -0,0 +1,510 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/device_communicators/shm_broadcast.py
+
+import logging
+import os
+import pickle
+import time
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from multiprocessing import shared_memory
+from typing import List, Optional
+from unittest.mock import patch
+
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+from zmq import IPV6  # type: ignore
+from zmq import SUB, SUBSCRIBE, XPUB, XPUB_VERBOSE, Context  # type: ignore
+
+from sglang.srt.utils import (
+    format_tcp_address,
+    get_local_ip_auto,
+    get_open_port,
+    is_valid_ipv6_address,
+)
+
+# SGLANG_RINGBUFFER_WARNING_INTERVAL can be set to 60
+SGLANG_RINGBUFFER_WARNING_INTERVAL = int(
+    os.environ.get("SGLANG_RINGBUFFER_WARNING_INTERVAL", "60")
+)
+
+logger = logging.getLogger(__name__)
+
+
+class ShmRingBuffer:
+
+    def __init__(
+        self,
+        n_reader: int,
+        max_chunk_bytes: int,
+        max_chunks: int,
+        name: Optional[str] = None,
+    ):
+        """
+        A shared memory ring buffer implementation for broadcast communication.
+        Essentially, it is a queue where only one will `enqueue` and multiple
+        will `dequeue`. The max size of each item, together with the max number
+        of items that can be stored in the buffer are known in advance.
+        In this case, we don't need to synchronize the access to
+         the buffer.
+
+        Buffer memory layout:
+                  data                                 metadata
+                    |                                      |
+                    | (current_idx)                        | (current_idx)
+                    v                                      v
+        +-------------------------------+----------------------------------------+
+        | chunk0 | chunk1 | ... | chunk | metadata0 | metadata1 | ... | metadata |
+        +-------------------------------+----------------------------------------+
+        | max_chunks x max_chunk_bytes  | max_chunks x (1 + n_reader) bytes      |
+
+        metadata memory layout: each byte is a flag, the first byte is the written
+        flag, and the rest are reader flags. The flags are set to 0 by default.
+        +--------------+--------------+--------------+-----+--------------+
+        | written_flag | reader0_flag | reader1_flag | ... | readerN_flag |
+        +--------------+--------------+--------------+-----+--------------+
+
+        The state of metadata is as follows:
+
+        (case 1) 0???...???: the block is not written yet, cannot read, can write
+        (case 2) 1000...000: the block is just written, can read, cannot write
+        (case 3) 1???...???: the block is written and read by some readers, can read if not read, cannot write
+        (case 4) 1111...111: the block is written and read by all readers, cannot read, can write
+
+        State transition for readers:
+
+        When a reader finds a block that it can read (case 2 or 3), it can yield the block for caller to read.
+        Only after the caller finishes reading the block, the reader can mark the block as read.
+        Readers only mark the block as read (from 0 to 1), the writer marks the block as ready to read (from 1 to 0).
+
+        State transition for writer:
+
+        When the writer writes to a block (case 1 or 4), it first resets the written flag to 0, converting either case
+        to case 1. Then it can yield the block for caller to write. After the caller finishes writing the block, the writer
+        can reset the reader flags to 0, and mark the block as written (from 0 to 1).
+        NOTE: the order is important here, first reset the reader flags (so that we are still in case 1), then mark the block as written. The state transition is atomic. If we do it in the reverse order, it will go through case 3 and then back to case 2, and readers might read the intermediate case 3, which is not correct.
+
+        During creation, `name` is None and the buffer is created. We can pass the
+        created object to other processes by pickling it. The other processes will
+        get the name of the shared memory and open it, so that they can access the
+        same shared memory buffer.
+        """  # noqa
+        self.n_reader = n_reader
+        self.metadata_size = 1 + n_reader
+        self.max_chunk_bytes = max_chunk_bytes
+        self.max_chunks = max_chunks
+        self.total_bytes_of_buffer = (
+            self.max_chunk_bytes + self.metadata_size
+        ) * self.max_chunks
+        self.data_offset = 0
+        self.metadata_offset = self.max_chunk_bytes * self.max_chunks
+
+        if name is None:
+            # we are creating a buffer
+            self.is_creator = True
+            self.shared_memory = shared_memory.SharedMemory(
+                create=True, size=self.total_bytes_of_buffer
+            )
+            # initialize the metadata section to 0
+            with memoryview(
+                self.shared_memory.buf[self.metadata_offset :]
+            ) as metadata_buffer:
+                torch.frombuffer(metadata_buffer, dtype=torch.uint8).fill_(0)
+        else:
+            # we are opening an existing buffer
+            self.is_creator = False
+            # fix to https://stackoverflow.com/q/62748654/9191338
+            # Python incorrectly tracks shared memory even if it is not
+            # created by the process. The following patch is a workaround.
+            with patch(
+                "multiprocessing.resource_tracker.register",
+                lambda *args, **kwargs: None,
+            ):
+                try:
+                    self.shared_memory = shared_memory.SharedMemory(name=name)
+                    assert self.shared_memory.size == self.total_bytes_of_buffer
+                except FileNotFoundError:
+                    # we might deserialize the object in a different node
+                    # in this case, this object is not used,
+                    # and we should suppress the error
+                    pass
+
+    def __reduce__(self):
+        return (
+            self.__class__,
+            (
+                self.n_reader,
+                self.max_chunk_bytes,
+                self.max_chunks,
+                self.shared_memory.name,
+            ),
+        )
+
+    def __del__(self):
+        if hasattr(self, "shared_memory"):
+            self.shared_memory.close()
+            if self.is_creator:
+                self.shared_memory.unlink()
+
+    @contextmanager
+    def get_data(self, current_idx: int):
+        start = self.data_offset + current_idx * self.max_chunk_bytes
+        end = start + self.max_chunk_bytes
+        with memoryview(self.shared_memory.buf[start:end]) as buf:
+            yield buf
+
+    @contextmanager
+    def get_metadata(self, current_idx: int):
+        start = self.metadata_offset + current_idx * self.metadata_size
+        end = start + self.metadata_size
+        with memoryview(self.shared_memory.buf[start:end]) as buf:
+            yield buf
+
+
+@dataclass
+class Handle:
+    connect_ip: str
+    local_reader_ranks: List[int] = field(default_factory=list)
+
+    buffer: Optional[ShmRingBuffer] = None
+    local_subscribe_port: Optional[int] = None
+    remote_subscribe_port: Optional[int] = None
+
+
+class MessageQueue:
+
+    def __init__(
+        self,
+        n_reader,  # number of all readers
+        n_local_reader,  # number of local readers through shared memory
+        local_reader_ranks: Optional[List[int]] = None,
+        max_chunk_bytes: int = 1024 * 1024 * 10,
+        max_chunks: int = 10,
+        connect_ip: Optional[str] = None,
+    ):
+        if local_reader_ranks is None:
+            local_reader_ranks = list(range(n_local_reader))
+        else:
+            assert len(local_reader_ranks) == n_local_reader
+        self.n_local_reader = n_local_reader
+        n_remote_reader = n_reader - n_local_reader
+        self.n_remote_reader = n_remote_reader
+
+        if connect_ip is None:
+            connect_ip = (
+                get_local_ip_auto("0.0.0.0") if n_remote_reader > 0 else "127.0.0.1"
+            )
+
+        context = Context()
+
+        if n_local_reader > 0:
+            # for local readers, we will:
+            # 1. create a shared memory ring buffer to communicate small data
+            # 2. create a publish-subscribe socket to communicate large data
+            self.buffer = ShmRingBuffer(n_local_reader, max_chunk_bytes, max_chunks)
+
+            # XPUB is very similar to PUB,
+            # except that it can receive subscription messages
+            # to confirm the number of subscribers
+            self.local_socket = context.socket(XPUB)
+            # set the verbose option so that we can receive every subscription
+            # message. otherwise, we will only receive the first subscription
+            # see http://api.zeromq.org/3-3:zmq-setsockopt for more details
+            self.local_socket.setsockopt(XPUB_VERBOSE, True)
+            local_subscribe_port = get_open_port()
+            socket_addr = f"tcp://127.0.0.1:{local_subscribe_port}"
+            logger.debug("Binding to %s", socket_addr)
+            self.local_socket.bind(socket_addr)
+            self.current_idx = 0
+
+        else:
+            self.buffer = None  # type: ignore
+            local_subscribe_port = None
+            self.local_socket = None
+            self.current_idx = -1
+
+        if n_remote_reader > 0:
+            # for remote readers, we will:
+            # create a publish-subscribe socket to communicate large data
+            self.remote_socket = context.socket(XPUB)
+            self.remote_socket.setsockopt(XPUB_VERBOSE, True)
+            remote_subscribe_port = get_open_port()
+            if is_valid_ipv6_address(connect_ip):
+                self.remote_socket.setsockopt(IPV6, 1)
+            address = format_tcp_address(connect_ip, remote_subscribe_port)
+            logger.debug(f"class MessageQueue: Binding remote socket to {address=}")
+            self.remote_socket.bind(address)
+
+        else:
+            remote_subscribe_port = None
+            self.remote_socket = None
+
+        self._is_writer = True
+        self._is_local_reader = False
+        self.local_reader_rank = -1
+        # rank does not matter for remote readers
+        self._is_remote_reader = False
+
+        self.handle = Handle(
+            connect_ip=connect_ip,
+            local_reader_ranks=local_reader_ranks,
+            buffer=self.buffer,
+            local_subscribe_port=local_subscribe_port,
+            remote_subscribe_port=remote_subscribe_port,
+        )
+
+        logger.debug("Message queue communication handle: %s", self.handle)
+
+    def export_handle(self) -> Handle:
+        return self.handle
+
+    @staticmethod
+    def create_from_handle(handle: Handle, rank) -> "MessageQueue":
+        self = MessageQueue.__new__(MessageQueue)
+        self.handle = handle
+        self._is_writer = False
+
+        context = Context()
+
+        if rank in handle.local_reader_ranks:
+            assert handle.buffer is not None
+            self.buffer = handle.buffer
+            self.current_idx = 0
+            self.local_reader_rank = handle.local_reader_ranks.index(rank)
+            self._is_local_reader = True
+            self._is_remote_reader = False
+
+            self.local_socket = context.socket(SUB)
+            self.local_socket.setsockopt_string(SUBSCRIBE, "")
+            socket_addr = f"tcp://127.0.0.1:{handle.local_subscribe_port}"
+            logger.debug("Connecting to %s", socket_addr)
+            self.local_socket.connect(socket_addr)
+
+            self.remote_socket = None
+        else:
+            self.buffer = None  # type: ignore
+            self.current_idx = -1
+            self.local_reader_rank = -1
+            self._is_local_reader = False
+            self._is_remote_reader = True
+
+            self.local_socket = None
+
+            self.remote_socket = context.socket(SUB)
+            self.remote_socket.setsockopt_string(SUBSCRIBE, "")
+            if is_valid_ipv6_address(handle.connect_ip):
+                self.remote_socket.setsockopt(IPV6, 1)
+            socket_addr = format_tcp_address(
+                handle.connect_ip, handle.remote_subscribe_port
+            )
+            logger.debug("Connecting to %s", socket_addr)
+            self.remote_socket.connect(socket_addr)
+
+        return self
+
+    def wait_until_ready(self):
+        """This is a collective operation. All processes (including the
+        readers and the writer) should call this function.
+        """
+        if self._is_writer:
+            # wait for all readers to connect
+
+            # local readers
+            for i in range(self.n_local_reader):
+                # wait for subscription messages from all local readers
+                self.local_socket.recv()
+            if self.n_local_reader > 0:
+                # send a message to all local readers
+                # to make sure the publish channel is working
+                self.local_socket.send(b"READY")
+
+            # remote readers
+            for i in range(self.n_remote_reader):
+                # wait for subscription messages from all remote readers
+                self.remote_socket.recv()
+            if self.n_remote_reader > 0:
+                # send a message to all remote readers
+                # to make sure the publish channel is working
+                self.remote_socket.send(b"READY")
+        elif self._is_local_reader:
+            # wait for the writer to send a message
+            recv = self.local_socket.recv()
+            assert recv == b"READY"
+        elif self._is_remote_reader:
+            # wait for the writer to send a message
+            recv = self.remote_socket.recv()
+            assert recv == b"READY"
+
+    @contextmanager
+    def acquire_write(self):
+        assert self._is_writer, "Only writers can acquire write"
+        start_time = time.monotonic()
+        n_warning = 1
+        while True:
+            with self.buffer.get_metadata(self.current_idx) as metadata_buffer:
+                read_count = sum(metadata_buffer[1:])
+                written_flag = metadata_buffer[0]
+                if written_flag and read_count != self.buffer.n_reader:
+                    # this block is written and not read by all readers
+                    # for writers, `self.current_idx` is the next block to write
+                    # if this block is not ready to write,
+                    # we need to wait until it is read by all readers
+
+                    # Release the processor to other threads
+                    os.sched_yield()
+
+                    # if we wait for a long time, we should warn the user
+                    if (
+                        time.monotonic() - start_time
+                        > SGLANG_RINGBUFFER_WARNING_INTERVAL * n_warning
+                    ):
+                        logger.warning(
+                            "No available block found in %s second. ",
+                            SGLANG_RINGBUFFER_WARNING_INTERVAL,
+                        )
+                        n_warning += 1
+
+                    continue
+                # found a block that is either
+                # (1) not written
+                # (2) read by all readers
+
+                # mark the block as not written
+                metadata_buffer[0] = 0
+                # let caller write to the buffer
+                with self.buffer.get_data(self.current_idx) as buf:
+                    yield buf
+
+                # caller has written to the buffer
+                # NOTE: order is important here
+                # first set the read flags to 0
+                # then set the written flag to 1
+                # otherwise, the readers may think they already read the block
+                for i in range(1, self.buffer.n_reader + 1):
+                    # set read flag to 0, meaning it is not read yet
+                    metadata_buffer[i] = 0
+                # mark the block as written
+                metadata_buffer[0] = 1
+                self.current_idx = (self.current_idx + 1) % self.buffer.max_chunks
+                break
+
+    @contextmanager
+    def acquire_read(self):
+        assert self._is_local_reader, "Only readers can acquire read"
+        start_time = time.monotonic()
+        n_warning = 1
+        while True:
+            with self.buffer.get_metadata(self.current_idx) as metadata_buffer:
+                read_flag = metadata_buffer[self.local_reader_rank + 1]
+                written_flag = metadata_buffer[0]
+                if not written_flag or read_flag:
+                    # this block is either
+                    # (1) not written
+                    # (2) already read by this reader
+
+                    # for readers, `self.current_idx` is the next block to read
+                    # if this block is not ready,
+                    # we need to wait until it is written
+
+                    # Release the processor to other threads
+                    os.sched_yield()
+
+                    # if we wait for a long time, we should warn the user
+                    if (
+                        time.monotonic() - start_time
+                        > SGLANG_RINGBUFFER_WARNING_INTERVAL * n_warning
+                    ):
+                        logger.warning(
+                            "No available block found in %s second. ",
+                            SGLANG_RINGBUFFER_WARNING_INTERVAL,
+                        )
+                        n_warning += 1
+
+                    continue
+                # found a block that is not read by this reader
+                # let caller read from the buffer
+                with self.buffer.get_data(self.current_idx) as buf:
+                    yield buf
+
+                # caller has read from the buffer
+                # set the read flag
+                metadata_buffer[self.local_reader_rank + 1] = 1
+                self.current_idx = (self.current_idx + 1) % self.buffer.max_chunks
+                break
+
+    def enqueue(self, obj):
+        assert self._is_writer, "Only writers can enqueue"
+        serialized_obj = pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL)
+        if self.n_local_reader > 0:
+            if len(serialized_obj) >= self.buffer.max_chunk_bytes:
+                with self.acquire_write() as buf:
+                    buf[0] = 1  # overflow
+                self.local_socket.send(serialized_obj)
+            else:
+                with self.acquire_write() as buf:
+                    buf[0] = 0  # not overflow
+                    buf[1 : len(serialized_obj) + 1] = serialized_obj
+        if self.n_remote_reader > 0:
+            self.remote_socket.send(serialized_obj)
+
+    def dequeue(self):
+        if self._is_local_reader:
+            with self.acquire_read() as buf:
+                overflow = buf[0] == 1
+                if not overflow:
+                    # no need to know the size of serialized object
+                    # pickle format contains the size information internally
+                    # see https://docs.python.org/3/library/pickle.html
+                    obj = pickle.loads(buf[1:])
+            if overflow:
+                recv = self.local_socket.recv()
+                obj = pickle.loads(recv)
+        elif self._is_remote_reader:
+            recv = self.remote_socket.recv()
+            obj = pickle.loads(recv)
+        else:
+            raise RuntimeError("Only readers can dequeue")
+        return obj
+
+    def broadcast_object(self, obj=None):
+        if self._is_writer:
+            self.enqueue(obj)
+            return obj
+        else:
+            return self.dequeue()
+
+    @staticmethod
+    def create_from_process_group(
+        pg: ProcessGroup, max_chunk_bytes, max_chunks, writer_rank=0
+    ) -> "MessageQueue":
+        group_rank = dist.get_rank(pg)
+        group_world_size = dist.get_world_size(pg)
+        global_ranks = dist.get_process_group_ranks(pg)
+
+        from sglang.srt.distributed.parallel_state import in_the_same_node_as
+
+        status = in_the_same_node_as(pg, source_rank=writer_rank)
+        same_node_ranks = [i for i, s in enumerate(status) if s]
+        n_reader = group_world_size - 1
+        n_local_reader = len(same_node_ranks) - 1
+        local_reader_ranks = [i for i in same_node_ranks if i != writer_rank]
+        buffer_io: MessageQueue
+        if group_rank == writer_rank:
+            buffer_io = MessageQueue(
+                n_reader=n_reader,
+                n_local_reader=n_local_reader,
+                local_reader_ranks=local_reader_ranks,
+                max_chunk_bytes=max_chunk_bytes,
+                max_chunks=max_chunks,
+            )
+            handle = buffer_io.export_handle()
+            dist.broadcast_object_list(
+                [handle], src=global_ranks[writer_rank], group=pg
+            )
+        else:
+            recv = [None]
+            dist.broadcast_object_list(recv, src=global_ranks[writer_rank], group=pg)
+            handle = recv[0]  # type: ignore
+            buffer_io = MessageQueue.create_from_handle(handle, group_rank)
+        buffer_io.wait_until_ready()
+        return buffer_io
diff --git a/sglang/python/sglang/srt/distributed/device_communicators/torch_symm_mem.py b/sglang/python/sglang/srt/distributed/device_communicators/torch_symm_mem.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb1e1715ae5deced421afa7a2ed3c4a98f9e004e
--- /dev/null
+++ b/sglang/python/sglang/srt/distributed/device_communicators/torch_symm_mem.py
@@ -0,0 +1,163 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/bf214ca22625e311a2c4c0dfbf7af19128f4919c/vllm/distributed/device_communicators/symm_mem.py
+import logging
+from typing import Optional, Union
+
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+from sglang.srt.distributed.device_communicators.all_reduce_utils import (
+    TORCH_SYMM_MEM_ALL_REDUCE_MAX_SIZES,
+)
+from sglang.srt.utils import is_cuda, is_hip
+
+try:
+    import torch.distributed._symmetric_memory as torch_symm_mem
+
+    _is_cuda = is_cuda()
+    _is_hip = is_hip()
+
+    torch_symm_mem_available = False
+    if _is_cuda:
+        torch_symm_mem_available = True
+except ImportError:
+    torch_symm_mem_available = False
+
+
+logger = logging.getLogger(__name__)
+
+
+class TorchSymmMemCommunicator:
+    """
+    Thin wrapper around torch-symmetric-memory collectives.
+
+    This communicator:
+      - Validates device capability and world size.
+      - Allocates a shared symmetric buffer.
+      - Chooses between 'multimem' and 'two-shot' all-reduce kernels.
+      - Exposes a fast-path all_reduce() compatible with bfloat16 inputs.
+
+    If any prerequisite is not met, the instance remains disabled and will
+    decline to perform symmetric-memory all-reduce.
+    """
+
+    # Mapping: compute capability major -> supported world sizes for multimem
+    # If the current (cc_major, world_size) is not listed, we fall back
+    # to the two-shot path.
+    _WORLD_SIZES_MULTIMEM = {
+        9: [4, 6, 8],
+        10: [6, 8],
+    }
+
+    def __init__(self, group: ProcessGroup, device: Union[int, str, torch.device]):
+        """
+        Args:
+            group: Torch process group used for rendezvous and naming.
+            device: Target CUDA device (index, 'cuda:X', or torch.device).
+        """
+
+        self.disabled = True
+
+        if not torch_symm_mem_available:
+            return
+
+        if isinstance(device, int):
+            device = torch.device(f"cuda:{device}")
+        elif isinstance(device, str):
+            device = torch.device(device)
+        torch.cuda.set_device(device)
+        self.dtype = torch.bfloat16
+        self.device = device
+        self.group = group
+        self.world_size = dist.get_world_size(self.group)
+        self.device_capability = torch.cuda.get_device_capability(device)[0]
+        if self.device_capability < 9:
+            logger.warning(
+                "TorchSymmMemCommunicator: Device capability %s not supported, "
+                "communicator is not available.",
+                self.device_capability,
+            )
+            return
+        if (
+            self.world_size
+            not in TORCH_SYMM_MEM_ALL_REDUCE_MAX_SIZES[self.device_capability]
+        ):
+            logger.warning(
+                "TorchSymmMemCommunicator: World size %d not supported, "
+                "communicator is not available.",
+                self.world_size,
+            )
+            return
+        self.max_size = TORCH_SYMM_MEM_ALL_REDUCE_MAX_SIZES[self.device_capability][
+            self.world_size
+        ]
+        self.buffer = torch_symm_mem.empty(
+            self.max_size // self.dtype.itemsize,
+            device=self.device,
+            dtype=self.dtype,
+        )
+        handle = torch_symm_mem.rendezvous(self.buffer, self.group.group_name)
+        if handle.multicast_ptr == 0:
+            logger.warning(
+                "TorchSymmMemCommunicator: torch symmetric memory "
+                "multicast operations are not supported."
+            )
+            self.buffer = None
+            self.disabled = True
+            return
+        self.disabled = False
+
+    def should_torch_symm_mem_allreduce(self, inp: torch.Tensor):
+        """
+        Fast-path eligibility check for a given tensor.
+
+        Conditions:
+          - Communicator must be enabled.
+          - dtype must be bfloat16 (matches kernel + buffer dtype).
+          - Total byte size must be 4-byte aligned (hardware requirement).
+          - Payload must be smaller than the symmetric-memory max size.
+
+        Returns:
+            True if the symmetric-memory path can handle this tensor.
+        """
+        if self.disabled:
+            return False
+        if inp.dtype != self.dtype:
+            return False
+        inp_size = inp.numel() * inp.element_size()
+        # enforce 4-byte alignment
+        if inp_size % 4 != 0:
+            return False
+        return inp_size < self.max_size
+
+    def all_reduce(
+        self, inp: torch.Tensor, *, out: Optional[torch.Tensor] = None
+    ) -> Optional[torch.Tensor]:
+        """
+        Perform an in-place sum all-reduce via torch symmetric memory.
+
+        Args:
+            inp: Input tensor on the target CUDA device (bfloat16).
+            out: Optional output tensor; if omitted, a new tensor is allocated.
+
+        Returns:
+            The reduced tensor (same shape as inp), or None if disabled.
+
+        Implementation details:
+            - Stages 'inp' into the symmetric buffer.
+            - Selects 'multimem' or 'two_shot' kernel based on topology.
+            - Writes the result into 'out' and returns it.
+        """
+        if out is None:
+            out = torch.empty_like(inp)
+        self.buffer[: inp.numel()].copy_(inp.view(-1))
+        if self.world_size in self._WORLD_SIZES_MULTIMEM[self.device_capability]:
+            torch.ops.symm_mem.multimem_all_reduce_(
+                self.buffer[: inp.numel()], "sum", self.group.group_name
+            )
+        else:
+            torch.ops.symm_mem.two_shot_all_reduce_(
+                self.buffer[: inp.numel()], "sum", self.group.group_name
+            )
+        out.copy_(self.buffer[: inp.numel()].view(out.shape))
+        return out
diff --git a/sglang/python/sglang/srt/distributed/device_communicators/xpu_communicator.py b/sglang/python/sglang/srt/distributed/device_communicators/xpu_communicator.py
new file mode 100644
index 0000000000000000000000000000000000000000..532279f70c358a48a43862ccfb600e1cda2fb3b2
--- /dev/null
+++ b/sglang/python/sglang/srt/distributed/device_communicators/xpu_communicator.py
@@ -0,0 +1,48 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/device_communicators/xpu_communicator.py
+
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+from sglang.srt.utils import is_xpu
+
+
+class XpuCommunicator:
+
+    def __init__(self, group: ProcessGroup):
+        if not is_xpu():
+            self.disabled = True
+            return
+        self.disabled = False
+        self.group = group
+        self.world_size = dist.get_world_size(self.group)
+
+    def all_reduce(self, x: torch.Tensor) -> torch.Tensor:
+        dist.all_reduce(x, group=self.group)
+        return x
+
+    def gather(
+        self, input_: torch.Tensor, rank_in_group: int, dst: int = 0, dim: int = -1
+    ):
+        # For xpu path, gather doesn't work properly together with ray
+        # cluster so we use all_gather instead for now.
+        input_size = input_.size()
+        # Allocate output tensor.
+        output_tensor = torch.empty(
+            (self.world_size,) + input_size, dtype=input_.dtype, device=input_.device
+        )
+        # All-gather.
+        torch.distributed.all_gather_into_tensor(
+            output_tensor, input_, group=self.group
+        )
+        if rank_in_group == dst:
+            # Reshape
+            output_tensor = output_tensor.movedim(0, dim)
+            output_tensor = output_tensor.reshape(
+                input_size[:dim]
+                + (self.world_size * input_size[dim],)
+                + input_size[dim + 1 :]
+            )
+        else:
+            output_tensor = None
+        return output_tensor
diff --git a/sglang/python/sglang/srt/distributed/naive_distributed.py b/sglang/python/sglang/srt/distributed/naive_distributed.py
new file mode 100644
index 0000000000000000000000000000000000000000..b59380d07b57750fd309ede92358763a46980882
--- /dev/null
+++ b/sglang/python/sglang/srt/distributed/naive_distributed.py
@@ -0,0 +1,113 @@
+import pickle
+import time
+from pathlib import Path
+from typing import Any, List, Optional
+
+import pybase64
+import torch
+
+from sglang.srt.utils import MultiprocessingSerializer
+
+
+class NaiveDistributed:
+    def __init__(self, rank: int, world_size: int, rendezvous: str):
+        self._rank = rank
+        self._world_size = world_size
+        self._operation_index = 0
+        self._directory = Path(rendezvous)
+        self._directory.mkdir(parents=True, exist_ok=True)
+        assert 0 <= rank < world_size
+
+        # both barrier to be safe, and as a sanity check
+        self.barrier()
+
+    def get_rank(self):
+        return self._rank
+
+    def get_world_size(self):
+        return self._world_size
+
+    def scatter(
+        self, tensor: torch.Tensor, scatter_list: List[torch.Tensor], src: int = 0
+    ):
+        if self._rank == src:
+            assert len(scatter_list) == self._world_size
+        else:
+            assert scatter_list is None
+
+        gathered_objects = self.all_gather_object(
+            dict(
+                serialized_scatter_list=[
+                    (
+                        None
+                        if item_rank == src
+                        else MultiprocessingSerializer.serialize(item)
+                    )
+                    for item_rank, item in enumerate(scatter_list)
+                ]
+            )
+            if self._rank == src
+            else dict()
+        )
+
+        remote_serialized_tensor = gathered_objects[src]["serialized_scatter_list"][
+            self._rank
+        ]
+        if self._rank == src:
+            assert remote_serialized_tensor is None
+            remote_tensor = scatter_list[self._rank]
+        else:
+            remote_tensor = MultiprocessingSerializer.deserialize(
+                remote_serialized_tensor
+            )
+        tensor.copy_(remote_tensor)
+
+        # avoid src tensor be deleted too early
+        self.barrier()
+
+    def all_gather_object(self, obj: Any) -> List[Any]:
+        self._operation_index += 1
+
+        text_postfix = "\n"
+
+        def _get_path(interesting_rank: int):
+            return (
+                self._directory
+                / f"rank{interesting_rank}_op{self._operation_index}.txt"
+            )
+
+        _get_path(self._rank).write_text(
+            pybase64.b64encode(pickle.dumps(obj)).decode("utf-8") + text_postfix
+        )
+
+        def _read_one(interesting_rank: int):
+            p = _get_path(interesting_rank)
+            while True:
+                if p.exists() and (text := p.read_text()).endswith(text_postfix):
+                    return pickle.loads(
+                        pybase64.b64decode(text[: -len(text_postfix)], validate=True)
+                    )
+                time.sleep(0.001)
+
+        return [
+            _read_one(interesting_rank) for interesting_rank in range(self._world_size)
+        ]
+
+    def barrier(self):
+        actual_objs = self.all_gather_object(self._rank)
+        assert actual_objs == list(range(self._world_size)), f"{actual_objs=}"
+
+
+# Can have multi instances if needed
+_instance: Optional[NaiveDistributed] = None
+
+
+def get_naive_distributed():
+    assert _instance is not None
+    return _instance
+
+
+def set_naive_distributed(instance: NaiveDistributed):
+    global _instance
+    assert _instance is None
+    _instance = instance
diff --git a/sglang/python/sglang/srt/distributed/parallel_state.py b/sglang/python/sglang/srt/distributed/parallel_state.py
new file mode 100644
index 0000000000000000000000000000000000000000..9df3668d0bd54989a19bd696bc726344c931f5cc
--- /dev/null
+++ b/sglang/python/sglang/srt/distributed/parallel_state.py
@@ -0,0 +1,2326 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/parallel_state.py
+
+# Copyright 2023 The vLLM team.
+# Adapted from
+# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+"""Distributed state.
+It takes over the control of the distributed environment from PyTorch.
+The typical workflow is:
+
+- call `init_distributed_environment` to initialize the distributed environment.
+- call `initialize_model_parallel` or `ensure_model_parallel_initialized` to
+ initialize the model parallel groups.
+
+- any code dealing with the distributed stuff
+
+- call `destroy_model_parallel` to destroy the model parallel groups.
+- call `destroy_distributed_environment` to destroy the distributed environment.
+
+If you only need to use the distributed environment without model/pipeline
+ parallelism, you can skip the model parallel initialization and destruction
+ steps.
+"""
+
+import contextlib
+import gc
+import logging
+import os
+import pickle
+import weakref
+from collections import namedtuple
+from contextlib import contextmanager, nullcontext
+from dataclasses import dataclass
+from datetime import timedelta
+from multiprocessing import shared_memory
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from unittest.mock import patch
+
+import torch
+import torch.distributed
+from torch.distributed import Backend, ProcessGroup
+
+from sglang.srt.compilation.compilation_config import register_split_op
+from sglang.srt.compilation.piecewise_context_manager import is_in_piecewise_cuda_graph
+from sglang.srt.environ import envs
+from sglang.srt.utils import (
+    get_bool_env_var,
+    get_current_device_stream_fast,
+    get_int_env_var,
+    get_local_ip_auto,
+    is_cpu,
+    is_cuda_alike,
+    is_hip,
+    is_musa,
+    is_npu,
+    is_shm_available,
+    is_xpu,
+)
+from sglang.srt.utils.custom_op import register_custom_op
+
+_is_npu = is_npu()
+_is_cpu = is_cpu()
+_is_xpu = is_xpu()
+_is_musa = is_musa()
+
+TensorMetadata = namedtuple("TensorMetadata", ["device", "dtype", "size"])
+
+# use int value instead of ReduceOp.SUM to support torch compile
+REDUCE_OP_SUM = int(torch.distributed.ReduceOp.SUM)
+
+
+def get_torch_distributed_pg_options(group_name=None):
+    if not _is_npu:
+        return None
+
+    # Only create HCCL options for default group or MoE-related groups
+    if group_name is not None and "moe" not in group_name:
+        return None
+
+    import torch_npu
+
+    options = torch_npu._C._distributed_c10d.ProcessGroupHCCL.Options()
+    hccl_buffer_size = int(
+        os.environ.get("DEEPEP_HCCL_BUFFSIZE") or os.environ.get("HCCL_BUFFSIZE") or 200
+    )
+    options.hccl_config = {"hccl_buffer_size": hccl_buffer_size}
+    return options
+
+
+@dataclass
+class GraphCaptureContext:
+    stream: torch.get_device_module().Stream
+
+
+@dataclass
+class P2PWork:
+    work: Optional[torch.distributed.Work]
+    payload: Optional[torch.Tensor]
+
+
+def _split_tensor_dict(
+    tensor_dict: Dict[str, Union[torch.Tensor, Any]],
+) -> Tuple[List[Tuple[str, Any]], List[torch.Tensor]]:
+    """Split the tensor dictionary into two parts:
+    1. A list of (key, value) pairs. If the value is a tensor, it is replaced
+         by its metadata.
+    2. A list of tensors.
+    """
+    metadata_list: List[Tuple[str, Any]] = []
+    tensor_list: List[torch.Tensor] = []
+    for key, value in tensor_dict.items():
+        if isinstance(value, torch.Tensor):
+            # Note: we cannot use `value.device` here,
+            # because it contains not only the device type but also the device
+            # index (e.g. "cuda:0"). We only need the device type.
+            # receiving side will set the device index.
+            device = value.device.type
+            metadata_list.append(
+                (key, TensorMetadata(device, value.dtype, value.size()))
+            )
+            tensor_list.append(value)
+        else:
+            metadata_list.append((key, value))
+    return metadata_list, tensor_list
+
+
+_group_name_counter: Dict[str, int] = {}
+
+
+def _get_unique_name(name: str) -> str:
+    """Get a unique name for the group.
+    Example:
+    _get_unique_name("tp") -> "tp:0"
+    _get_unique_name("tp") -> "tp:1"
+    """
+    if name not in _group_name_counter:
+        _group_name_counter[name] = 0
+    newname = f"{name}:{_group_name_counter[name]}"
+    _group_name_counter[name] += 1
+    return newname
+
+
+_groups: Dict[str, Callable[[], Optional["GroupCoordinator"]]] = {}
+
+
+def _register_group(group: "GroupCoordinator") -> None:
+    _groups[group.unique_name] = weakref.ref(group)
+
+
+@register_custom_op(mutates_args=["tensor"])
+@register_split_op()
+def inplace_all_reduce(tensor: torch.Tensor, group_name: str) -> None:
+    assert group_name in _groups, f"Group {group_name} is not found."
+    group = _groups[group_name]()
+    if group is None:
+        raise ValueError(f"Group {group_name} is destroyed.")
+    group._all_reduce_in_place(tensor)
+
+
+@register_custom_op(out_shape="tensor")
+def outplace_all_reduce(
+    tensor: torch.Tensor, group_name: str, outplace_all_reduce_method: str
+) -> torch.Tensor:
+    assert group_name in _groups, f"Group {group_name} is not found."
+    group = _groups[group_name]()
+    if group is None:
+        raise ValueError(f"Group {group_name} is destroyed.")
+    return group._all_reduce_out_place(tensor, outplace_all_reduce_method)
+
+
+@register_custom_op(mutates_args=["output"])
+def reg_all_gather_into_tensor(
+    output: torch.Tensor, input: torch.Tensor, group_name: str
+) -> None:
+    assert group_name in _groups, f"Group {group_name} is not found."
+    group = _groups[group_name]()
+    if group is None:
+        raise ValueError(f"Group {group_name} is destroyed.")
+    group._all_gather_into_tensor(output, input)
+
+
+@register_custom_op(mutates_args=["output"])
+def reg_reduce_scatter_tensor(
+    output: torch.Tensor, input: torch.Tensor, group_name: str
+) -> None:
+    assert group_name in _groups, f"Group {group_name} is not found."
+    group = _groups[group_name]()
+    if group is None:
+        raise ValueError(f"Group {group_name} is destroyed.")
+    group._reduce_scatter_tensor(output, input)
+
+
+class GroupCoordinator:
+    """
+    PyTorch ProcessGroup wrapper for a group of processes.
+    PyTorch ProcessGroup is bound to one specific communication backend,
+        e.g. NCCL, Gloo, MPI, etc.
+    GroupCoordinator takes charge of all the communication operations among
+        the processes in the group. It can route the communication to
+        a specific implementation (e.g. switch allreduce implementation
+        based on the tensor size and cuda graph mode).
+    """
+
+    # available attributes:
+    rank: int  # global rank
+    ranks: List[int]  # global ranks in the group
+    world_size: int  # size of the group
+    # difference between `local_rank` and `rank_in_group`:
+    # if we have a group of size 4 across two nodes:
+    # Process | Node | Rank | Local Rank | Rank in Group
+    #   0     |   0  |  0   |     0      |       0
+    #   1     |   0  |  1   |     1      |       1
+    #   2     |   1  |  2   |     0      |       2
+    #   3     |   1  |  3   |     1      |       3
+    local_rank: int  # local rank used to assign devices
+    rank_in_group: int  # rank inside the group
+    cpu_group: ProcessGroup  # group for CPU communication
+    device_group: ProcessGroup  # group for device communication
+    use_pynccl: bool  # a hint of whether to use PyNccl
+    use_pymscclpp: bool  # a hint of whether to use PyMsccl
+    use_custom_allreduce: bool  # a hint of whether to use CustomAllreduce
+    use_torch_symm_mem_all_reduce: (
+        bool  # a hint of whether to use TorchSymmMemAllReduce
+    )
+    use_message_queue_broadcaster: (
+        bool  # a hint of whether to use message queue broadcaster
+    )
+    # communicators are only created for world size > 1
+    pynccl_comm: Optional[Any]  # PyNccl communicator
+    ca_comm: Optional[Any]  # Custom allreduce communicator
+    torch_symm_mem_comm: Optional[Any]  # Torch symm mem communicator
+    mq_broadcaster: Optional[Any]  # shared memory broadcaster
+
+    def __init__(
+        self,
+        group_ranks: List[List[int]],
+        local_rank: int,
+        torch_distributed_backend: Union[str, Backend],
+        use_pynccl: bool,
+        use_pymscclpp: bool,
+        use_custom_allreduce: bool,
+        use_torch_symm_mem_all_reduce: bool,
+        use_hpu_communicator: bool,
+        use_xpu_communicator: bool,
+        use_npu_communicator: bool,
+        use_message_queue_broadcaster: bool = False,
+        group_name: Optional[str] = None,
+        pynccl_use_current_stream: bool = False,
+        gloo_timeout: timedelta = timedelta(seconds=120 * 60),
+    ):
+        # Set group info
+        group_name = group_name or "anonymous"
+        self.unique_name = _get_unique_name(group_name)
+        _register_group(self)
+
+        # Set rank info
+        self.rank = torch.distributed.get_rank()
+        self.local_rank = local_rank
+        self.device_group = None
+        self.cpu_group = None
+        self.local_size = get_int_env_var("LOCAL_SIZE", 0)
+
+        if is_cuda_alike():
+            device_id = (
+                0 if envs.SGLANG_ONE_VISIBLE_DEVICE_PER_PROCESS.get() else local_rank
+            )
+            self.device = torch.device(f"cuda:{device_id}")
+        elif _is_npu:
+            self.device = torch.device(f"npu:{local_rank}")
+        elif _is_xpu:
+            self.device = torch.device(f"xpu:{local_rank}")
+        elif _is_musa:
+            self.device = torch.device(f"musa:{local_rank}")
+        else:
+            self.device = torch.device("cpu")
+        self.device_module = torch.get_device_module(self.device)
+
+        for ranks in group_ranks:
+            active_ranks = torch.ones(len(ranks), dtype=torch.int32, device=self.device)
+            active_ranks_cpu = torch.ones(len(ranks), dtype=torch.int32)
+            if "mooncake" in torch_distributed_backend:
+                from mooncake.ep import MooncakeBackendOptions
+
+                device_group = torch.distributed.new_group(
+                    ranks,
+                    backend="mooncake",
+                    pg_options=MooncakeBackendOptions(active_ranks),
+                )
+                cpu_group = torch.distributed.new_group(
+                    ranks,
+                    backend="mooncake-cpu",
+                    pg_options=MooncakeBackendOptions(active_ranks_cpu),
+                )
+            else:
+                pg_options = get_torch_distributed_pg_options(group_name)
+                device_group = torch.distributed.new_group(
+                    ranks, backend=torch_distributed_backend, pg_options=pg_options
+                )
+                # a group with `gloo` backend, to allow direct coordination
+                # between processes through the CPU.
+                cpu_group = torch.distributed.new_group(
+                    ranks, backend="gloo", timeout=gloo_timeout
+                )
+            if self.rank in ranks:
+                self.ranks = ranks
+                self.world_size = len(ranks)
+                self.rank_in_group = ranks.index(self.rank)
+                self.device_group = device_group
+                self.cpu_group = cpu_group
+                self.active_ranks = active_ranks
+                self.active_ranks_cpu = active_ranks_cpu
+
+        assert self.cpu_group is not None
+        assert self.device_group is not None
+
+        # Import communicators
+        self.use_pynccl = use_pynccl
+        self.pynccl_use_current_stream = pynccl_use_current_stream
+        self.use_pymscclpp = use_pymscclpp
+        self.use_custom_allreduce = use_custom_allreduce
+        self.use_torch_symm_mem_all_reduce = use_torch_symm_mem_all_reduce
+        self.use_hpu_communicator = use_hpu_communicator
+        self.use_xpu_communicator = use_xpu_communicator
+        self.use_npu_communicator = use_npu_communicator
+        self.use_message_queue_broadcaster = use_message_queue_broadcaster
+
+        # Lazy import to avoid documentation build error
+        from sglang.srt.distributed.device_communicators.custom_all_reduce import (
+            dispatch_custom_allreduce,
+        )
+        from sglang.srt.distributed.device_communicators.pymscclpp import (
+            PyMscclppCommunicator,
+        )
+        from sglang.srt.distributed.device_communicators.pynccl import (
+            PyNcclCommunicator,
+        )
+        from sglang.srt.distributed.device_communicators.pynccl_allocator import (
+            is_symmetric_memory_enabled,
+            use_symmetric_memory,
+        )
+        from sglang.srt.distributed.device_communicators.torch_symm_mem import (
+            TorchSymmMemCommunicator,
+        )
+        from sglang.srt.layers.dp_attention import is_allocation_symmetric
+
+        self.is_symmetric_memory_enabled = is_symmetric_memory_enabled
+        self.use_symmetric_memory = use_symmetric_memory
+        self.is_allocation_symmetric = is_allocation_symmetric
+        if is_hip():
+            from sglang.srt.distributed.device_communicators.quick_all_reduce import (
+                QuickAllReduce,
+                qr_rocm_arch_available,
+            )
+
+        self.pynccl_comm: Optional[PyNcclCommunicator] = None
+        if use_pynccl and self.world_size > 1:
+            self.pynccl_comm = PyNcclCommunicator(
+                group=self.cpu_group,
+                device=self.device,
+                use_current_stream=pynccl_use_current_stream,
+            )
+
+        self.pymscclpp_comm: Optional[PyMscclppCommunicator] = None
+        if use_pymscclpp and self.world_size > 1:
+            self.pymscclpp_comm = PyMscclppCommunicator(
+                group=self.cpu_group,
+                device=self.device,
+            )
+
+        self.ca_comm: Optional[Any] = None
+        self.qr_comm: Optional[QuickAllReduce] = None
+        if use_custom_allreduce and self.world_size > 1:
+            # Initialize a custom fast all-reduce implementation.
+            try:
+                CAClass = dispatch_custom_allreduce()
+                self.ca_comm = CAClass(
+                    group=self.cpu_group,
+                    device=self.device,
+                )
+            except Exception as e:
+                logger.warning(
+                    f"Setup Custom allreduce failed with {e}. To silence this "
+                    "warning, specify --disable-custom-all-reduce explicitly."
+                )
+
+            if is_hip():
+                try:
+                    # Initialize a custom quick all-reduce implementation for AMD
+                    # when rocm >= gfx942. Quick reduce is designed as a
+                    # complement to custom allreduce.
+                    # Based on quickreduce (https://github.com/mk1-project/quickreduce).
+                    if qr_rocm_arch_available():
+                        self.qr_comm = QuickAllReduce(
+                            group=self.cpu_group, device=self.device
+                        )
+                except Exception as e:
+                    logger.warning(f"Failed to initialize QuickAllReduce: {e}")
+        elif self.world_size > 1 and is_hip():
+            logger.info("[AR] All-reduce call path: NCCL (custom AR disabled)")
+
+        self.torch_symm_mem_comm: Optional[TorchSymmMemCommunicator] = None
+        if self.use_torch_symm_mem_all_reduce and self.world_size > 1:
+            self.torch_symm_mem_comm = TorchSymmMemCommunicator(
+                group=self.cpu_group,
+                device=self.device,
+            )
+
+        # Create communicator for other hardware backends
+        from sglang.srt.distributed.device_communicators.hpu_communicator import (
+            HpuCommunicator,
+        )
+        from sglang.srt.distributed.device_communicators.npu_communicator import (
+            NpuCommunicator,
+        )
+        from sglang.srt.distributed.device_communicators.xpu_communicator import (
+            XpuCommunicator,
+        )
+
+        self.hpu_communicator: Optional[HpuCommunicator] = None
+        if use_hpu_communicator and self.world_size > 1:
+            self.hpu_communicator = HpuCommunicator(group=self.device_group)
+
+        self.xpu_communicator: Optional[XpuCommunicator] = None
+        if use_xpu_communicator and self.world_size > 1:
+            self.xpu_communicator = XpuCommunicator(group=self.device_group)
+
+        self.npu_communicator: Optional[NpuCommunicator] = None
+        if use_npu_communicator and self.world_size > 1:
+            self.npu_communicator = NpuCommunicator(group=self.device_group)
+
+        # Create message queue
+        from sglang.srt.distributed.device_communicators.shm_broadcast import (
+            MessageQueue,
+        )
+
+        self.mq_broadcaster: Optional[MessageQueue] = None
+        if use_message_queue_broadcaster and self.world_size > 1:
+            self.mq_broadcaster = MessageQueue.create_from_process_group(
+                self.cpu_group, 1 << 22, 6
+            )
+
+    def __repr__(self):
+        return (
+            f"ranks={self.ranks} rank={self.rank} local_rank={self.local_rank} use_pynccl={self.use_pynccl} "
+            f"device_group={self.device_group} cpu_group={self.cpu_group} unique_name={self.unique_name} "
+            f"world_size={self.world_size} rank_in_group={self.rank_in_group}"
+        )
+
+    @property
+    def first_rank(self):
+        """Return the global rank of the first process in the group"""
+        return self.ranks[0]
+
+    @property
+    def last_rank(self):
+        """Return the global rank of the last process in the group"""
+        return self.ranks[-1]
+
+    @property
+    def is_first_rank(self):
+        """Return whether the caller is the first process in the group"""
+        return self.rank == self.first_rank
+
+    @property
+    def is_last_rank(self):
+        """Return whether the caller is the last process in the group"""
+        return self.rank == self.last_rank
+
+    @property
+    def next_rank(self):
+        """Return the global rank of the process that follows the caller"""
+        rank_in_group = self.rank_in_group
+        world_size = self.world_size
+        return self.ranks[(rank_in_group + 1) % world_size]
+
+    @property
+    def prev_rank(self):
+        """Return the global rank of the process that precedes the caller"""
+        rank_in_group = self.rank_in_group
+        world_size = self.world_size
+        return self.ranks[(rank_in_group - 1) % world_size]
+
+    @contextmanager
+    def graph_capture(
+        self,
+        graph_capture_context: Optional[GraphCaptureContext] = None,
+        stream: Optional[torch.cuda.Stream] = None,
+    ):
+        if graph_capture_context is None:
+            if stream is None:
+                stream = self.device_module.Stream()
+            graph_capture_context = GraphCaptureContext(stream)
+        else:
+            stream = graph_capture_context.stream
+        # We don't need the context of custom quick allreduce because the ipc access
+        # is already collected in init() and we can capture the quick allreduce directly.
+        ca_comm = self.ca_comm
+        maybe_ca_context = nullcontext() if ca_comm is None else ca_comm.capture()
+
+        # ensure all initialization operations complete before attempting to
+        # capture the graph on another stream
+        curr_stream = get_current_device_stream_fast()
+        if curr_stream != stream:
+            stream.wait_stream(curr_stream)
+
+        with self.device_module.stream(stream), maybe_ca_context:
+            # In graph mode, we have to be very careful about the collective
+            # operations. The current status is:
+            #     allreduce \ Mode   |  Eager  |  Graph  |
+            # --------------------------------------------
+            # quick allreduce        | enabled | enabled |
+            # custom allreduce       | enabled | enabled |
+            # PyNccl                 | disabled| enabled |
+            # PyMscclpp              | disabled| enabled |
+            # TorchSymmMem           | disabled| enabled |
+            # torch.distributed      | enabled | disabled|
+            #
+            # Note: When custom quick allreduce is enabled, a runtime check
+            #  will be performed. If the tensor size is too small, it will
+            #  automatically fall back to the next available option.
+            # Note that custom allreduce will have a runtime check, if the
+            #  tensor size is too large, it will fallback to the next
+            #  available option.
+            # Note that the PyMsccl needs to register the tensor in ahead,
+            #  which will introduce large overhead in the eager case,
+            #  therefore it is only supported in the graph case.
+            # In summary: We select the appropriate allreduce method for
+            #  each mode based on the algorithm order in the table and
+            #  their usage conditions.
+            pynccl_comm = self.pynccl_comm
+            maybe_pynccl_context: Any
+            if not pynccl_comm:
+                maybe_pynccl_context = nullcontext()
+            else:
+                maybe_pynccl_context = pynccl_comm.change_state(
+                    enable=True, stream=get_current_device_stream_fast()
+                )
+
+            pymscclpp_comm = self.pymscclpp_comm
+            maybe_pymscclpp_context: Any
+            if not pymscclpp_comm:
+                maybe_pymscclpp_context = nullcontext()
+            else:
+                maybe_pymscclpp_context = pymscclpp_comm.change_state(enable=True)
+            with maybe_pynccl_context, maybe_pymscclpp_context:
+                yield graph_capture_context
+
+    def all_reduce(self, input_: torch.Tensor) -> torch.Tensor:
+        """
+        User-facing all-reduce function before we actually call the
+        all-reduce operation.
+
+        We need this because Dynamo does not support passing an arbitrary
+        object (`self` in this case) to a custom op. We need to pass the
+         group name as a string, and then look up the group coordinator from
+         the group name, dispatch the all-reduce operation to the group
+         coordinator.
+
+        In addition, PyTorch custom ops do not support mutation or returning
+        a new tensor in the same op. So we need to figure out if the op is
+        in-place or out-of-place ahead of time.
+        """
+        # Bypass the function if we are using only 1 GPU.
+        if self.world_size == 1:
+            return input_
+
+        # On AMD, use the deterministic 1-stage kernel when:
+        # - SGLANG_USE_1STAGE_ALLREDUCE=1 (explicitly enabled), OR
+        # - SGLANG_USE_1STAGE_ALLREDUCE not set AND --enable-deterministic-inference is on
+        if envs.SGLANG_USE_1STAGE_ALLREDUCE.is_set():
+            use_1stage_ar = envs.SGLANG_USE_1STAGE_ALLREDUCE.get()
+        else:
+            use_1stage_ar = envs.SGLANG_ENABLE_DETERMINISTIC_INFERENCE.get()
+        use_deterministic_ar = is_hip() and use_1stage_ar
+        if use_deterministic_ar:
+            if not input_.is_cpu and self.ca_comm is not None:
+                inp_size = input_.numel() * input_.element_size()
+                # Try unregistered mode first (faster for smaller tensors)
+                if inp_size < self.ca_comm.max_size:
+                    return self.ca_comm.deterministic_all_reduce(
+                        input_, registered=False
+                    )
+                # Use registered mode for larger tensors
+                self.ca_comm.register_buffer(input_)
+                return self.ca_comm.deterministic_all_reduce(input_, registered=True)
+
+        if input_.is_cpu:
+            if is_shm_available(input_.dtype, self.world_size, self.local_size):
+                torch.ops.sgl_kernel.shm_allreduce(input_, REDUCE_OP_SUM)
+            else:
+                torch.distributed.all_reduce(input_, group=self.device_group)
+            return input_
+
+        if self.hpu_communicator is not None and not self.hpu_communicator.disabled:
+            return self.hpu_communicator.all_reduce(input_)
+
+        if self.xpu_communicator is not None and not self.xpu_communicator.disabled:
+            return self.xpu_communicator.all_reduce(input_)
+
+        if self.npu_communicator is not None and not self.npu_communicator.disabled:
+            return self.npu_communicator.all_reduce(input_)
+
+        if self.pynccl_comm is not None and self.is_symmetric_memory_enabled():
+            with self.pynccl_comm.change_state(
+                enable=True, stream=get_current_device_stream_fast()
+            ):
+                self.pynccl_comm.all_reduce(input_)
+                return input_
+
+        outplace_all_reduce_method = None
+        if (
+            self.ca_comm is not None
+            and not self.ca_comm.disabled
+            and self.ca_comm.should_custom_ar(input_)
+        ):
+            outplace_all_reduce_method = "ca"
+        elif (
+            self.qr_comm is not None
+            and not self.qr_comm.disabled
+            and self.qr_comm.should_quick_allreduce(input_)
+        ):
+            outplace_all_reduce_method = "qr"
+        elif (
+            self.pymscclpp_comm is not None
+            and not self.pymscclpp_comm.disabled
+            and self.pymscclpp_comm.should_mscclpp_allreduce(input_)
+        ):
+            outplace_all_reduce_method = "pymscclpp"
+        elif (
+            self.torch_symm_mem_comm is not None
+            and not self.torch_symm_mem_comm.disabled
+            and self.torch_symm_mem_comm.should_torch_symm_mem_allreduce(input_)
+        ):
+            outplace_all_reduce_method = "torch_symm_mem"
+        elif is_in_piecewise_cuda_graph():
+            # For piecewise cuda graph, we use pynccl outplace allreduce
+            outplace_all_reduce_method = "pynccl"
+        if outplace_all_reduce_method is not None:
+            return outplace_all_reduce(
+                input_,
+                group_name=self.unique_name,
+                outplace_all_reduce_method=outplace_all_reduce_method,
+            )
+        else:
+            inplace_all_reduce(input_, group_name=self.unique_name)
+            return input_
+
+    def fused_allreduce_rmsnorm(
+        self,
+        input_: torch.Tensor,
+        residual_inp_: torch.Tensor,
+        weight_: torch.Tensor,
+        eps: float,
+    ) -> Optional[Tuple[torch.Tensor, torch.Tensor]]:
+        """Attempt fused all-reduce + RMSNorm via custom all-reduce communicator."""
+        ca_comm = self.ca_comm
+        if ca_comm is None or getattr(ca_comm, "disabled", True):
+            return None
+
+        # Prefer communicator-native fused API when provided.
+        if hasattr(ca_comm, "fused_allreduce_rmsnorm"):
+            try:
+                return ca_comm.fused_allreduce_rmsnorm(
+                    input_, residual_inp_, weight_, eps
+                )
+            except Exception:
+                # Fall back to custom_fused_ar_rms path below.
+                pass
+
+        if not hasattr(ca_comm, "custom_fused_ar_rms"):
+            return None
+
+        # 1-stage policy for fused AR+RMSNorm:
+        # 1) Explicit env override wins.
+        # 2) Deterministic inference forces 1-stage for reproducibility.
+        # 3) Otherwise follow AITER's heuristic (small payloads only).
+        if envs.SGLANG_USE_1STAGE_ALLREDUCE.is_set():
+            use_1stage_ar = envs.SGLANG_USE_1STAGE_ALLREDUCE.get()
+        elif envs.SGLANG_ENABLE_DETERMINISTIC_INFERENCE.get():
+            use_1stage_ar = True
+        else:
+            total_bytes = input_.numel() * input_.element_size()
+            hidden_dim = input_.shape[-1]
+            use_1stage_ar = total_bytes <= 128 * 1024 and hidden_dim in {
+                512,
+                1024,
+                2048,
+                4096,
+            }
+
+        fused_outputs = ca_comm.custom_fused_ar_rms(
+            input_,
+            residual_inp_,
+            weight_,
+            eps,
+            use_1stage_ar,
+        )
+        return fused_outputs
+
+    def _all_reduce_out_place(
+        self, input_: torch.Tensor, outplace_all_reduce_method: str
+    ) -> torch.Tensor:
+        ca_comm = self.ca_comm
+        qr_comm = self.qr_comm
+        pymscclpp_comm = self.pymscclpp_comm
+        torch_symm_mem_comm = self.torch_symm_mem_comm
+        pynccl_comm = self.pynccl_comm
+        assert any([qr_comm, ca_comm, pymscclpp_comm, torch_symm_mem_comm, pynccl_comm])
+        if outplace_all_reduce_method == "ca":
+            assert not ca_comm.disabled
+            out = ca_comm.custom_all_reduce(input_)
+        elif outplace_all_reduce_method == "qr":
+            assert not qr_comm.disabled
+            out = qr_comm.quick_all_reduce(input_)
+        elif outplace_all_reduce_method == "torch_symm_mem":
+            assert not torch_symm_mem_comm.disabled
+            out = torch_symm_mem_comm.all_reduce(input_)
+        elif outplace_all_reduce_method == "pymscclpp":
+            assert not pymscclpp_comm.disabled
+            out = pymscclpp_comm.all_reduce(input_)
+        elif outplace_all_reduce_method == "pynccl":
+            with pynccl_comm.change_state(
+                enable=True, stream=get_current_device_stream_fast()
+            ):
+                out = pynccl_comm.outplace_all_reduce(input_)
+        assert out is not None
+        return out
+
+    def _all_reduce_in_place(self, input_: torch.Tensor) -> None:
+        pynccl_comm = self.pynccl_comm
+        torch_symm_mem_comm = self.torch_symm_mem_comm
+        if pynccl_comm is not None and not pynccl_comm.disabled:
+            pynccl_comm.all_reduce(input_)
+        elif torch_symm_mem_comm is not None and not torch_symm_mem_comm.disabled:
+            torch_symm_mem_comm.all_reduce(input_)
+        else:
+            torch.distributed.all_reduce(input_, group=self.device_group)
+
+    def _reduce_scatter_tensor(
+        self,
+        output: torch.Tensor,
+        input: torch.Tensor,
+    ) -> torch.Tensor:
+        pynccl_comm = self.pynccl_comm
+        if pynccl_comm is not None and (
+            not pynccl_comm.disabled or self.is_symmetric_memory_enabled()
+        ):
+            with pynccl_comm.change_state(
+                enable=True, stream=get_current_device_stream_fast()
+            ):
+                pynccl_comm.reduce_scatter(output, input)
+        else:
+            torch.distributed.reduce_scatter_tensor(
+                output, input, group=self.device_group
+            )
+        return output
+
+    def reduce_scatter_tensor(self, output: torch.Tensor, input: torch.Tensor):
+        if _is_npu:
+            self._reduce_scatter_tensor(output, input)
+        else:
+            reg_reduce_scatter_tensor(output, input, group_name=self.unique_name)
+
+    def reduce_scatter(
+        self,
+        output: torch.Tensor,
+        input_list: List[torch.Tensor],
+    ) -> None:
+        # TODO(ch-wan): support other backends
+        torch.distributed.reduce_scatter(output, input_list, group=self.device_group)
+        return output
+
+    def reduce_scatterv(
+        self,
+        input_: torch.Tensor,
+        output: Optional[torch.Tensor] = None,
+        sizes: Optional[List[int]] = None,
+    ) -> torch.Tensor:
+        world_size = self.world_size
+        pynccl_comm = self.pynccl_comm
+
+        with pynccl_comm.change_state(
+            enable=True, stream=get_current_device_stream_fast()
+        ):
+            assert (
+                pynccl_comm is not None and not pynccl_comm.disabled
+            ), "pynccl is required for reduce_scatterv"
+
+            if sizes is not None:
+                assert len(sizes) == world_size
+                assert input_.shape[0] == sum(sizes)
+                chunk_size = sizes[self.rank_in_group]
+            else:
+                assert input_.shape[0] % world_size == 0
+                chunk_size = input_.shape[0] // world_size
+            output_shape = (chunk_size,) + input_.shape[1:]
+
+            if output is None:
+                output = torch.empty(
+                    output_shape, dtype=input_.dtype, device=input_.device
+                )
+            else:
+                assert output.shape == output_shape
+
+            pynccl_comm.reduce_scatter(output, input_, sizes=sizes)
+            return output
+
+    def _all_gather_into_tensor(self, output: torch.Tensor, input: torch.Tensor):
+        pynccl_comm = self.pynccl_comm
+        if pynccl_comm is not None and (
+            not pynccl_comm.disabled or self.is_symmetric_memory_enabled()
+        ):
+            with pynccl_comm.change_state(
+                enable=True, stream=get_current_device_stream_fast()
+            ):
+                pynccl_comm.all_gather(output, input)
+        else:
+            torch.distributed.all_gather_into_tensor(
+                output, input, group=self.device_group
+            )
+
+    def all_gather_into_tensor(self, output: torch.Tensor, input: torch.Tensor):
+        if _is_npu or _is_xpu:
+            self._all_gather_into_tensor(output, input)
+        else:
+            reg_all_gather_into_tensor(output, input, group_name=self.unique_name)
+
+    def cp_all_gather_into_tensor_async(
+        self, output: torch.Tensor, input: torch.Tensor, stream=None
+    ):
+        """
+        Implement an asynchronous `allgather` operation on a specified stream.
+        (the default `torch.distributed.all_gather_into_tensor` will trigger event synchronization),
+        eliminating the CPU-side launch-kernel blocking issue caused by synchronization problems.
+        The specific implementation uses the interface provided by pynccl to remove the synchronization logic of events.
+        """
+        assert (
+            stream is not None
+        ), f"Invalid params stream ({stream}, Please specify the stream to use when calling cp_all_gather_into_tensor_async.)"
+        pynccl_comm = self.pynccl_comm
+        if pynccl_comm is None or pynccl_comm.disabled:
+            self.all_gather_into_tensor(output, input)
+        else:
+            pynccl_comm.cp_all_gather_into_tensor(output, input, stream=stream)
+
+    def all_gather(
+        self,
+        input_: torch.Tensor,
+        dim: int = -1,
+        output_tensor_list: Optional[List[torch.Tensor]] = None,
+    ) -> torch.Tensor:
+        world_size = self.world_size
+        # Bypass the function if we are using only 1 GPU.
+        if world_size == 1:
+            if output_tensor_list is not None:
+                logger.warning(
+                    "Performing in-place all-gather with a group size of 1. "
+                    "This may be unnecessary; consider bypassing it for better efficiency."
+                )
+                output_tensor_list[0].copy_(input_)
+                return None
+            else:
+                return input_
+
+        if output_tensor_list is not None:
+            # TODO(ch-wan): support other backends
+            return torch.distributed.all_gather(
+                output_tensor_list, input_, group=self.device_group
+            )
+
+        assert (
+            -input_.dim() <= dim < input_.dim()
+        ), f"Invalid dim ({dim}) for input tensor with shape {input_.size()}"
+
+        # For HPUs, use HPU communicator.
+        hpu_comm = self.hpu_communicator
+        if hpu_comm is not None and not hpu_comm.disabled:
+            return hpu_comm.all_gather(input_, dim)
+
+        # For NPUs, use NPU communicator.
+        npu_comm = self.npu_communicator
+        if npu_comm is not None and not npu_comm.disabled:
+            return npu_comm.all_gather(input_, dim)
+
+        if dim < 0:
+            # Convert negative dim to positive.
+            dim += input_.dim()
+        input_size = input_.size()
+        # NOTE: we have to use concat-style all-gather here,
+        # stack-style all-gather has compatibility issues with
+        # torch.compile . see https://github.com/pytorch/pytorch/issues/138795
+        output_size = (input_size[0] * world_size,) + input_size[1:]
+        # Allocate output tensor.
+        with self.use_symmetric_memory(
+            self, disabled=not self.is_allocation_symmetric()
+        ):
+            output_tensor = torch.empty(
+                output_size, dtype=input_.dtype, device=input_.device
+            )
+
+        # All-gather.
+        if input_.is_cpu:
+            if is_shm_available(input_.dtype, self.world_size, self.local_size):
+                return torch.ops.sgl_kernel.shm_allgather(input_, dim)
+            else:
+                torch.distributed.all_gather_into_tensor(
+                    output_tensor, input_, group=self.device_group
+                )
+        else:
+            self.all_gather_into_tensor(output_tensor, input_)
+
+        # Reshape
+        output_tensor = output_tensor.reshape((world_size,) + input_size)
+        output_tensor = output_tensor.movedim(0, dim)
+        output_tensor = output_tensor.reshape(
+            input_size[:dim] + (world_size * input_size[dim],) + input_size[dim + 1 :]
+        )
+        return output_tensor
+
+    def all_gatherv(
+        self,
+        input_: Union[torch.Tensor, List[torch.Tensor]],
+        sizes: Optional[List[int]] = None,
+    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+        """
+        Supports varying sizes per rank and input tensor list.
+        `sizes`: a list of len(world_size) with the number of items per rank to gather.
+        """
+        world_size = self.world_size
+        pynccl_comm = self.pynccl_comm
+
+        with pynccl_comm.change_state(
+            enable=True, stream=get_current_device_stream_fast()
+        ):
+            assert (
+                pynccl_comm is not None and not pynccl_comm.disabled
+            ), "pynccl is required for all_gatherv"
+
+            def _all_gather_allocate_output(
+                input_: torch.Tensor, sizes: Optional[List[int]] = None
+            ):
+                input_size = input_.size()
+                if sizes is not None:
+                    assert len(sizes) == world_size
+                    assert input_.shape[0] == sizes[self.rank_in_group]
+                    output_size = (sum(sizes),) + input_size[1:]
+                    # 'sizes' is not needed if all inputs in the same group have the same shape
+                    if all(s == sizes[0] for s in sizes):
+                        sizes = None
+                else:
+                    output_size = (input_size[0] * world_size,) + input_size[1:]
+                # Allocate output tensor.
+                with self.use_symmetric_memory(self, disabled=sizes is not None):
+                    output_tensor = torch.empty(
+                        output_size, dtype=input_.dtype, device=input_.device
+                    )
+                return output_tensor, sizes
+
+            if isinstance(input_, torch.Tensor):
+                input_ = [input_]
+
+            output_list = []
+            size_list = []
+            for inp in input_:
+                output_tensor, s = _all_gather_allocate_output(inp, sizes=sizes)
+                output_list.append(output_tensor)
+                size_list.append(s)
+
+            pynccl_comm.group_start()
+            for i, inp in enumerate(input_):
+                pynccl_comm.all_gather(output_list[i], inp, sizes=size_list[i])
+            pynccl_comm.group_end()
+
+            return output_list
+
+    def gather(
+        self, input_: torch.Tensor, dst: int = 0, dim: int = -1
+    ) -> Optional[torch.Tensor]:
+        """
+        NOTE: We assume that the input tensor is on the same device across
+        all the ranks.
+        NOTE: `dst` is the local rank of the destination rank.
+        """
+        world_size = self.world_size
+        # Bypass the function if we are using only 1 GPU.
+        if world_size == 1:
+            return input_
+        assert (
+            -input_.dim() <= dim < input_.dim()
+        ), f"Invalid dim ({dim}) for input tensor with shape {input_.size()}"
+        if dim < 0:
+            # Convert negative dim to positive.
+            dim += input_.dim()
+        if self.xpu_communicator is not None and not self.xpu_communicator.disabled:
+            return self.xpu_communicator.gather(input_, self.rank_in_group, dst, dim)
+        # Allocate output tensor.
+        if self.rank_in_group == dst:
+            gather_list = [torch.empty_like(input_) for _ in range(world_size)]
+        else:
+            gather_list = None
+        # Gather.
+        torch.distributed.gather(
+            input_, gather_list, dst=self.ranks[dst], group=self.device_group
+        )
+        if self.rank_in_group == dst:
+            output_tensor = torch.cat(gather_list, dim=dim)
+        else:
+            output_tensor = None
+        return output_tensor
+
+    def broadcast(self, input_: torch.Tensor, src: int = 0):
+        """Broadcast the input tensor.
+        NOTE: `src` is the local rank of the source rank.
+        """
+        assert src < self.world_size, f"Invalid src rank ({src})"
+
+        # Bypass the function if we are using only 1 GPU.
+        if self.world_size == 1:
+            return input_
+        # Broadcast.
+        torch.distributed.broadcast(
+            input_, src=self.ranks[src], group=self.device_group
+        )
+        return input_
+
+    def broadcast_object(self, obj: Optional[Any] = None, src: int = 0):
+        """Broadcast the input object.
+        NOTE: `src` is the local rank of the source rank.
+        """
+        assert src < self.world_size, f"Invalid src rank ({src})"
+
+        # Bypass the function if we are using only 1 GPU.
+        if self.world_size == 1:
+            return obj
+        if self.mq_broadcaster is not None:
+            assert src == 0, "Message queue broadcaster only supports src=0"
+            return self.mq_broadcaster.broadcast_object(obj)
+        if self.rank_in_group == src:
+            torch.distributed.broadcast_object_list(
+                [obj], src=self.ranks[src], group=self.cpu_group
+            )
+            return obj
+        else:
+            recv = [None]
+            torch.distributed.broadcast_object_list(
+                recv, src=self.ranks[src], group=self.cpu_group
+            )
+            return recv[0]
+
+    def broadcast_object_list(
+        self, obj_list: List[Any], src: int = 0, group: Optional[ProcessGroup] = None
+    ):
+        """Broadcast the input object list.
+        NOTE: `src` is the local rank of the source rank.
+        """
+        assert src < self.world_size, f"Invalid src rank ({src})"
+
+        # Bypass the function if we are using only 1 GPU.
+        if self.world_size == 1:
+            return obj_list
+        # Broadcast.
+        torch.distributed.broadcast_object_list(
+            obj_list, src=self.ranks[src], group=self.device_group
+        )
+        return obj_list
+
+    def all_gather_object(self, obj: Any) -> List[Any]:
+        objs = [None] * self.world_size
+        torch.distributed.all_gather_object(objs, obj, group=self.cpu_group)
+        return objs
+
+    def send_object(
+        self,
+        obj: Any,
+        dst: int,
+        async_send: bool = False,
+    ) -> List[P2PWork]:
+        """
+        Send the input object list to the destination rank.
+        This function uses the CPU group for all communications.
+
+        TODO: If you want to use GPU communication, please add a new argument (e.g., data_group, group),
+        use other functions (e.g., send), or implement a new function (e.g., send_object_device).
+
+        NOTE: `dst` is the local rank of the destination rank.
+        """
+
+        assert dst < self.world_size, f"Invalid dst rank ({dst})"
+        assert dst != self.rank_in_group, (
+            "Invalid destination rank. Destination rank is the same "
+            "as the current rank."
+        )
+        send_func = torch.distributed.isend if async_send else torch.distributed.send
+
+        # Serialize object to tensor and get the size as well
+        object_tensor = torch.frombuffer(pickle.dumps(obj), dtype=torch.uint8)
+        size_tensor = torch.tensor(
+            [object_tensor.numel()], dtype=torch.long, device="cpu"
+        )
+
+        # Send object size
+        p2p_work = []
+        size_work = send_func(
+            size_tensor,
+            self.ranks[dst],
+            group=self.cpu_group,
+        )
+        if async_send:
+            p2p_work.append(P2PWork(size_work, size_tensor))
+
+        object_work = send_func(
+            object_tensor,
+            self.ranks[dst],
+            group=self.cpu_group,
+        )
+        if async_send:
+            p2p_work.append(P2PWork(object_work, object_tensor))
+
+        return p2p_work
+
+    def recv_object(
+        self,
+        src: int,
+    ) -> Any:
+        """Receive the input object list from the source rank."""
+        """NOTE: `src` is the local rank of the source rank."""
+
+        assert src < self.world_size, f"Invalid src rank ({src})"
+        assert (
+            src != self.rank_in_group
+        ), "Invalid source rank. Source rank is the same as the current rank."
+
+        size_tensor = torch.empty(1, dtype=torch.long, device="cpu")
+
+        # Receive object size
+        # We have to use irecv here to make it work for both isend and send.
+        work = torch.distributed.irecv(
+            size_tensor, src=self.ranks[src], group=self.cpu_group
+        )
+        work.wait()
+
+        # Tensor to receive serialized objects into.
+        object_tensor: Any = torch.empty(  # type: ignore[call-overload]
+            size_tensor.item(),  # type: ignore[arg-type]
+            dtype=torch.uint8,
+            device="cpu",
+        )
+
+        work = torch.distributed.irecv(
+            object_tensor, src=self.ranks[src], group=self.cpu_group
+        )
+        work.wait()
+
+        obj = pickle.loads(object_tensor.numpy())
+        return obj
+
+    def broadcast_tensor_dict(
+        self,
+        tensor_dict: Optional[Dict[str, Union[torch.Tensor, Any]]] = None,
+        src: int = 0,
+        group: Optional[ProcessGroup] = None,
+        metadata_group: Optional[ProcessGroup] = None,
+    ) -> Optional[Dict[str, Union[torch.Tensor, Any]]]:
+        """Broadcast the input tensor dictionary.
+        NOTE: `src` is the local rank of the source rank.
+        """
+        # Bypass the function if we are using only 1 GPU.
+        if not torch.distributed.is_initialized() or self.world_size == 1:
+            return tensor_dict
+
+        group = self.device_group
+        metadata_group = self.cpu_group
+        assert src < self.world_size, f"Invalid src rank ({src})"
+
+        rank_in_group = self.rank_in_group
+        if rank_in_group == src:
+            metadata_list: List[Tuple[Any, Any]] = []
+            assert isinstance(
+                tensor_dict, dict
+            ), f"Expecting a dictionary, got {type(tensor_dict)}"
+            metadata_list, tensor_list = _split_tensor_dict(tensor_dict)
+            # `metadata_list` lives in CPU memory.
+            # `broadcast_object_list` has serialization & deserialization,
+            # all happening on CPU. Therefore, we can use the CPU group.
+            self.broadcast_object(metadata_list, src=src)
+            async_handles = []
+            for tensor in tensor_list:
+                if tensor.numel() == 0:
+                    # Skip broadcasting empty tensors.
+                    continue
+                if tensor.is_cpu:
+                    # use metadata_group for CPU tensors
+                    handle = torch.distributed.broadcast(
+                        tensor, src=self.ranks[src], group=metadata_group, async_op=True
+                    )
+                else:
+                    # use group for GPU tensors
+                    handle = torch.distributed.broadcast(
+                        tensor, src=self.ranks[src], group=group, async_op=True
+                    )
+                async_handles.append(handle)
+            for async_handle in async_handles:
+                async_handle.wait()
+
+        else:
+            metadata_list = self.broadcast_object(None, src=src)
+            tensor_dict = {}
+            async_handles = []
+            for key, value in metadata_list:
+                if isinstance(value, TensorMetadata):
+                    tensor = torch.empty(
+                        value.size, dtype=value.dtype, device=value.device
+                    )
+                    if tensor.numel() == 0:
+                        # Skip broadcasting empty tensors.
+                        tensor_dict[key] = tensor
+                        continue
+                    if tensor.is_cpu:
+                        # use metadata_group for CPU tensors
+                        handle = torch.distributed.broadcast(
+                            tensor,
+                            src=self.ranks[src],
+                            group=metadata_group,
+                            async_op=True,
+                        )
+                    else:
+                        # use group for GPU tensors
+                        handle = torch.distributed.broadcast(
+                            tensor, src=self.ranks[src], group=group, async_op=True
+                        )
+                    async_handles.append(handle)
+                    tensor_dict[key] = tensor
+                else:
+                    tensor_dict[key] = value
+            for async_handle in async_handles:
+                async_handle.wait()
+        return tensor_dict
+
+    def send_tensor_dict(
+        self,
+        tensor_dict: Dict[str, Union[torch.Tensor, Any]],
+        dst: Optional[int] = None,
+        all_gather_group: Optional["GroupCoordinator"] = None,
+        async_send: bool = False,
+    ) -> Optional[List[P2PWork]]:
+        """Send the input tensor dictionary.
+        NOTE: `dst` is the local rank of the source rank.
+        """
+        # Bypass the function if we are using only 1 GPU.
+        if self.world_size == 1:
+            return tensor_dict
+
+        all_gather_size = 1 if all_gather_group is None else all_gather_group.world_size
+        all_gather_rank = (
+            0 if all_gather_group is None else all_gather_group.rank_in_group
+        )
+
+        group = self.device_group
+        metadata_group = self.cpu_group
+
+        if dst is None:
+            dst = (self.rank_in_group + 1) % self.world_size
+        assert dst < self.world_size, f"Invalid dst rank ({dst})"
+
+        assert isinstance(
+            tensor_dict, dict
+        ), f"Expecting a dictionary, got {type(tensor_dict)}"
+        metadata_list, tensor_list = _split_tensor_dict(tensor_dict)
+        # Note: While switching to Device-to-Device (D2D) would introduce an extra
+        # Device-to-Host (D2H) memory copy overhead for serialization, our benchmarks
+        # show better overall transmission performance with D2D due to:
+        # 1. Superior D2D transfer bandwidth
+        # 2. Ability to overlap send and recv operations
+        # Thus the net performance gain justifies this approach.
+
+        send_func = torch.distributed.isend if async_send else torch.distributed.send
+        p2p_works = self.send_object(metadata_list, dst=dst, async_send=async_send)
+
+        for tensor in tensor_list:
+            if tensor.numel() == 0:
+                # Skip sending empty tensors.
+                continue
+
+            # send-allgather: send only a slice, then do allgather.
+            if all_gather_group is not None and tensor.numel() % all_gather_size == 0:
+                tensor = tensor.reshape(all_gather_size, -1)[all_gather_rank]
+
+            comm_group = metadata_group if tensor.is_cpu else group
+            work = send_func(tensor, self.ranks[dst], group=comm_group)
+            if async_send:
+                p2p_works.append(P2PWork(work, tensor))
+        return p2p_works
+
+    def recv_tensor_dict(
+        self,
+        src: Optional[int] = None,
+        all_gather_group: Optional["GroupCoordinator"] = None,
+    ) -> Optional[Dict[str, Union[torch.Tensor, Any]]]:
+        """Recv the input tensor dictionary.
+        NOTE: `src` is the local rank of the source rank.
+        """
+        # Bypass the function if we are using only 1 GPU.
+        if not torch.distributed.is_initialized() or self.world_size == 1:
+            return None
+
+        all_gather_size = 1 if all_gather_group is None else all_gather_group.world_size
+        all_gather_rank = (
+            0 if all_gather_group is None else all_gather_group.rank_in_group
+        )
+
+        group = self.device_group
+        metadata_group = self.cpu_group
+
+        if src is None:
+            src = (self.rank_in_group - 1) % self.world_size
+        assert src < self.world_size, f"Invalid src rank ({src})"
+
+        recv_metadata_list = self.recv_object(src=src)
+        tensor_dict: Dict[str, Any] = {}
+        for key, value in recv_metadata_list:
+            if isinstance(value, TensorMetadata):
+                tensor = torch.empty(value.size, dtype=value.dtype, device=value.device)
+                if tensor.numel() == 0:
+                    # Skip broadcasting empty tensors.
+                    tensor_dict[key] = tensor
+                    continue
+
+                # send-allgather: send only a slice, then do allgather.
+                use_all_gather = (
+                    all_gather_group is not None
+                    and tensor.numel() % all_gather_size == 0
+                )
+
+                if use_all_gather:
+                    orig_shape = tensor.shape
+                    tensor = tensor.reshape(all_gather_size, -1)[all_gather_rank]
+
+                # We have to use irecv here to make it work for both isend and send.
+                comm_group = metadata_group if tensor.is_cpu else group
+                work = torch.distributed.irecv(
+                    tensor, src=self.ranks[src], group=comm_group
+                )
+                work.wait()
+
+                if use_all_gather:
+                    tensor = all_gather_group.all_gather(tensor, dim=0)
+                    tensor = tensor.reshape(orig_shape)
+
+                tensor_dict[key] = tensor
+            else:
+                tensor_dict[key] = value
+        return tensor_dict
+
+    def barrier(self):
+        """Barrier synchronization among the group.
+        NOTE: don't use `device_group` here! `barrier` in NCCL is
+        terrible because it is internally a broadcast operation with
+        secretly created GPU tensors. It is easy to mess up the current
+        device. Use the CPU group instead.
+        """
+        torch.distributed.barrier(group=self.cpu_group)
+
+    def send(self, tensor: torch.Tensor, dst: Optional[int] = None) -> None:
+        """Sends a tensor to the destination rank in a non-blocking way"""
+        """NOTE: `dst` is the local rank of the destination rank."""
+        if dst is None:
+            dst = (self.rank_in_group + 1) % self.world_size
+
+        pynccl_comm = self.pynccl_comm
+        if pynccl_comm is not None and not pynccl_comm.disabled:
+            pynccl_comm.send(tensor, dst)
+        else:
+            torch.distributed.send(tensor, self.ranks[dst], self.device_group)
+
+    def recv(
+        self, size: torch.Size, dtype: torch.dtype, src: Optional[int] = None
+    ) -> torch.Tensor:
+        """Receives a tensor from the source rank."""
+        """NOTE: `src` is the local rank of the source rank."""
+        if src is None:
+            src = (self.rank_in_group - 1) % self.world_size
+
+        tensor = torch.empty(size, dtype=dtype, device=self.device)
+        pynccl_comm = self.pynccl_comm
+        if pynccl_comm is not None and not pynccl_comm.disabled:
+            pynccl_comm.recv(tensor, src)
+        else:
+            torch.distributed.recv(tensor, self.ranks[src], self.device_group)
+        return tensor
+
+    def destroy(self):
+        if self.device_group is not None:
+            torch.distributed.destroy_process_group(self.device_group)
+            self.device_group = None
+        if self.cpu_group is not None:
+            torch.distributed.destroy_process_group(self.cpu_group)
+            self.cpu_group = None
+        if self.pynccl_comm is not None:
+            self.pynccl_comm = None
+        if self.ca_comm is not None:
+            self.ca_comm = None
+        if self.mq_broadcaster is not None:
+            self.mq_broadcaster = None
+
+
+_WORLD: Optional[GroupCoordinator] = None
+
+
+def get_world_group() -> GroupCoordinator:
+    assert _WORLD is not None, "world group is not initialized"
+    return _WORLD
+
+
+def init_world_group(
+    ranks: List[int], local_rank: int, backend: str
+) -> GroupCoordinator:
+    return GroupCoordinator(
+        group_ranks=[ranks],
+        local_rank=local_rank,
+        torch_distributed_backend=backend,
+        use_pynccl=False,
+        use_pymscclpp=False,
+        use_custom_allreduce=False,
+        use_torch_symm_mem_all_reduce=False,
+        use_hpu_communicator=False,
+        use_xpu_communicator=False,
+        use_npu_communicator=False,
+        group_name="world",
+    )
+
+
+def init_model_parallel_group(
+    group_ranks: List[List[int]],
+    local_rank: int,
+    backend: str,
+    use_pynccl: Optional[bool] = None,
+    use_custom_allreduce: Optional[bool] = None,
+    use_message_queue_broadcaster: bool = False,
+    group_name: Optional[str] = None,
+    use_mscclpp_allreduce: Optional[bool] = None,
+    pynccl_use_current_stream: bool = True,
+    use_torch_symm_mem_allreduce: Optional[bool] = None,
+) -> GroupCoordinator:
+    if use_custom_allreduce is None:
+        use_custom_allreduce = _ENABLE_CUSTOM_ALL_REDUCE
+    if use_mscclpp_allreduce is None:
+        use_mscclpp_allreduce = _ENABLE_MSCCLPP_ALL_REDUCE
+    if use_torch_symm_mem_allreduce is None:
+        use_torch_symm_mem_allreduce = _ENABLE_TORCH_SYMM_MEM_ALL_REDUCE
+    return GroupCoordinator(
+        group_ranks=group_ranks,
+        local_rank=local_rank,
+        torch_distributed_backend=backend,
+        use_pynccl=(
+            not (_is_npu or _is_xpu or backend == "mooncake")
+            if use_pynccl is None
+            else use_pynccl
+        ),
+        use_pymscclpp=use_mscclpp_allreduce,
+        use_custom_allreduce=use_custom_allreduce,
+        use_torch_symm_mem_all_reduce=use_torch_symm_mem_allreduce,
+        use_hpu_communicator=True,
+        use_xpu_communicator=True,
+        use_npu_communicator=True,
+        use_message_queue_broadcaster=use_message_queue_broadcaster,
+        group_name=group_name,
+        pynccl_use_current_stream=pynccl_use_current_stream,
+    )
+
+
+_TP: Optional[GroupCoordinator] = None
+_ATTN_TP: Optional[GroupCoordinator] = None
+_ATTN_CP: Optional[GroupCoordinator] = None
+
+# duplicate GroupCoordinator for prefill in PD-Multiplexing
+_PDMUX_PREFILL_TP_GROUP: Optional[GroupCoordinator] = None
+
+_ENABLE_PDMUX_P_TP: bool = False
+
+
+def set_pdmux_status(enable_prefill_multiplexing: bool):
+    global _ENABLE_PDMUX_P_TP
+    _ENABLE_PDMUX_P_TP = enable_prefill_multiplexing
+
+
+def get_tp_group() -> GroupCoordinator:
+    if _ENABLE_PDMUX_P_TP:
+        assert (
+            _PDMUX_PREFILL_TP_GROUP is not None
+        ), "tensor model parallel group for PD-Multiplexing Prefill is not initialized"
+        return _PDMUX_PREFILL_TP_GROUP
+    assert _TP is not None, "tensor model parallel group is not initialized"
+    return _TP
+
+
+def get_attn_tp_group() -> GroupCoordinator:
+    assert (
+        _ATTN_TP is not None
+    ), "attention tensor model parallel group is not initialized"
+    return _ATTN_TP
+
+
+def get_attn_cp_group() -> GroupCoordinator:
+    assert (
+        _ATTN_CP is not None
+    ), "attention context model parallel group is not initialized"
+    return _ATTN_CP
+
+
+_MOE_DP: Optional[GroupCoordinator] = None
+_MOE_EP: Optional[GroupCoordinator] = None
+_MOE_TP: Optional[GroupCoordinator] = None
+
+
+def get_moe_dp_group() -> GroupCoordinator:
+    assert _MOE_DP is not None, "moe data parallel group is not initialized"
+    return _MOE_DP
+
+
+def get_moe_ep_group() -> GroupCoordinator:
+    assert _MOE_EP is not None, "expert model parallel group is not initialized"
+    return _MOE_EP
+
+
+def get_moe_tp_group() -> GroupCoordinator:
+    assert _MOE_TP is not None, "expert model parallel group is not initialized"
+    return _MOE_TP
+
+
+# kept for backward compatibility
+get_tensor_model_parallel_group = get_tp_group
+
+_PP: Optional[GroupCoordinator] = None
+
+
+def get_pp_group() -> GroupCoordinator:
+    assert _PP is not None, "pipeline model parallel group is not initialized"
+    return _PP
+
+
+# kept for backward compatibility
+get_pipeline_model_parallel_group = get_pp_group
+
+
+def get_mooncake_transfer_engine():
+    """
+    Return the shared MooncakeTransferEngine if initialized in device_communicators,
+    else None. Used by disaggregation mooncake backend and mem_cache mooncake_store.
+    """
+    from sglang.srt.distributed.device_communicators.mooncake_transfer_engine import (
+        get_mooncake_transfer_engine as _get_engine,
+    )
+
+    return _get_engine()
+
+
+@contextmanager
+def graph_capture(stream: Optional[torch.cuda.Stream] = None):
+    """
+    `graph_capture` is a context manager which should surround the code that
+    is capturing the CUDA graph. Its main purpose is to ensure that the
+    some operations will be run after the graph is captured, before the graph
+    is replayed. It returns a `GraphCaptureContext` object which contains the
+    necessary data for the graph capture. Currently, it only contains the
+    stream that the graph capture is running on. This stream is set to the
+    current CUDA stream when the context manager is entered and reset to the
+    default stream when the context manager is exited. This is to ensure that
+    the graph capture is running on a separate stream from the default stream,
+    in order to explicitly distinguish the kernels to capture
+    from other kernels possibly launched on background in the default stream.
+    """
+    with get_tp_group().graph_capture(
+        stream=stream
+    ) as context, get_pp_group().graph_capture(context):
+        yield context
+
+
+logger = logging.getLogger(__name__)
+
+_ENABLE_CUSTOM_ALL_REDUCE = True
+_ENABLE_MSCCLPP_ALL_REDUCE = False
+_ENABLE_TORCH_SYMM_MEM_ALL_REDUCE = False
+
+
+def set_custom_all_reduce(enable: bool):
+    global _ENABLE_CUSTOM_ALL_REDUCE
+    _ENABLE_CUSTOM_ALL_REDUCE = enable
+
+
+def set_mscclpp_all_reduce(enable: bool):
+    global _ENABLE_MSCCLPP_ALL_REDUCE
+    _ENABLE_MSCCLPP_ALL_REDUCE = enable
+
+
+def set_torch_symm_mem_all_reduce(enable: bool):
+    global _ENABLE_TORCH_SYMM_MEM_ALL_REDUCE
+    _ENABLE_TORCH_SYMM_MEM_ALL_REDUCE = enable
+
+
+_DEVICE_TO_DISTRIBUTED_BACKEND = {
+    "cuda": "nccl",
+    "xpu": "xccl",
+    "hpu": "hccl",
+    "cpu": "gloo",
+    "npu": "hccl",
+    "musa": "mccl",
+}
+
+
+def get_default_distributed_backend(device: str) -> str:
+    return _DEVICE_TO_DISTRIBUTED_BACKEND.get(device, "gloo")
+
+
+def init_distributed_environment(
+    world_size: int = -1,
+    rank: int = -1,
+    distributed_init_method: str = "env://",
+    local_rank: int = -1,
+    backend: str = "nccl",
+    timeout: Optional[int] = None,
+):
+    logger.debug(
+        "world_size=%d rank=%d local_rank=%d " "distributed_init_method=%s backend=%s",
+        world_size,
+        rank,
+        local_rank,
+        distributed_init_method,
+        backend,
+    )
+    if "mooncake" in backend:
+        try:
+            from mooncake import ep as mooncake_ep
+        except ImportError as e:
+            raise ImportError(
+                "Please install mooncake by following the instructions at "
+                "https://github.com/kvcache-ai/Mooncake/blob/main/doc/en/build.md "  # noqa: E501
+                "to run SGLang with Mooncake Backend."
+            ) from e
+        mooncake_ep.set_host_ip(get_local_ip_auto())
+
+    if not torch.distributed.is_initialized():
+        assert distributed_init_method is not None, (
+            "distributed_init_method must be provided when initializing "
+            "distributed environment"
+        )
+        if timeout is not None:
+            assert isinstance(timeout, (int)), "timeout must be a number"
+            assert timeout > 0, "timeout must be positive"
+            timeout = timedelta(seconds=timeout)
+
+        pg_options = get_torch_distributed_pg_options()
+
+        # this backend is used for WORLD
+        torch.distributed.init_process_group(
+            backend=backend,
+            init_method=distributed_init_method,
+            world_size=world_size,
+            rank=rank,
+            timeout=timeout,
+            pg_options=pg_options,
+        )
+
+    # set the local rank
+    # local_rank is not available in torch ProcessGroup,
+    # see https://github.com/pytorch/pytorch/issues/122816
+    if local_rank == -1:
+        # local rank not set, this usually happens in single-node
+        # setting, where we can use rank as local rank
+        if distributed_init_method == "env://":
+            local_rank = int(os.environ.get("LOCAL_RANK", "0"))
+        else:
+            local_rank = rank
+    global _WORLD
+    if _WORLD is None:
+        ranks = list(range(torch.distributed.get_world_size()))
+        _WORLD = init_world_group(ranks, local_rank, backend)
+    else:
+        assert (
+            _WORLD.world_size == torch.distributed.get_world_size()
+        ), "world group already initialized with a different world size"
+
+
+def initialize_model_parallel(
+    tensor_model_parallel_size: int = 1,
+    expert_model_parallel_size: int = 1,
+    pipeline_model_parallel_size: int = 1,
+    attention_data_parallel_size: int = 1,
+    attention_context_model_parallel_size: int = 1,
+    moe_data_model_parallel_size: int = 1,
+    backend: Optional[str] = None,
+    duplicate_tp_group: bool = False,
+) -> None:
+    """
+    Initialize model parallel groups.
+
+    Arguments:
+        tensor_model_parallel_size: number of GPUs used for tensor model
+            parallelism.
+        expert_model_parallel_size: number of GPUs used for expert model
+            parallelism.
+        pipeline_model_parallel_size: number of GPUs used for pipeline model
+            parallelism.
+        attention_data_parallel_size: number of GPUs used for attention data
+            parallelism.
+        attention_context_model_parallel_size: number of GPUs used for attention context
+            parallelism.
+        moe_data_model_parallel_size: number of GPUs used for moe data
+            parallelism.
+
+    Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we
+    use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize
+    the model pipeline. The present function will
+    create 4 tensor model-parallel groups and 2 pipeline model-parallel groups:
+        4 tensor model-parallel groups:
+            [g0, g1], [g2, g3], [g4, g5], [g6, g7]
+        2 pipeline model-parallel groups:
+            [g0, g2, g4, g6], [g1, g3, g5, g7]
+
+    Let's say we use 2 GPUs for attention context parallelism (attn_cp_size=2) and 4 GPUs for
+    attention tensor parallelism (attn_tp_size=4). As for MoE part, we use 2 GPUs for moe data
+    parallelism (moe_dp_size=2) and 4 GPUs for moe expert parallelism (moe_ep_size=4). The present
+    function will create the following groups:
+        2 tensor model-parallel groups:
+            [g0, g1, g2, g3], [g4, g5, g6, g7]
+        4 attention context-parallel groups:
+            [g0, g4], [g1, g5], [g2, g6], [g3, g7]
+        2 moe expert-parallel groups:
+            [g0, g1, g2, g3], [g4, g5, g6, g7]
+        4 moe data-parallel groups:
+            [g0, g4], [g1, g5], [g2, g6], [g3, g7]
+
+    Note that for efficiency, the caller should make sure adjacent ranks
+    are on the same DGX box. For example if we are using 2 DGX-1 boxes
+    with a total of 16 GPUs, rank 0 to 7 belong to the first box and
+    ranks 8 to 15 belong to the second box.
+    """
+    # Get world size and rank. Ensure some consistencies.
+    assert torch.distributed.is_initialized()
+    world_size: int = torch.distributed.get_world_size()
+    backend = backend or torch.distributed.get_backend(get_world_group().device_group)
+
+    if world_size != tensor_model_parallel_size * pipeline_model_parallel_size:
+        raise RuntimeError(
+            f"world_size ({world_size}) is not equal to "
+            f"tensor_model_parallel_size ({tensor_model_parallel_size}) x "
+            f"pipeline_model_parallel_size ({pipeline_model_parallel_size})"
+        )
+
+    # Build the tensor model-parallel groups.
+    num_tensor_model_parallel_groups: int = world_size // tensor_model_parallel_size
+    global _TP
+    assert _TP is None, "tensor model parallel group is already initialized"
+    group_ranks = []
+    for tp_group_idx in range(num_tensor_model_parallel_groups):
+        ranks = list(
+            range(
+                tp_group_idx * tensor_model_parallel_size,
+                (tp_group_idx + 1) * tensor_model_parallel_size,
+            )
+        )
+        group_ranks.append(ranks)
+
+    # message queue broadcaster is only used in tensor model parallel group
+    _TP = init_model_parallel_group(
+        group_ranks,
+        get_world_group().local_rank,
+        backend,
+        use_message_queue_broadcaster=get_bool_env_var(
+            "SGLANG_USE_MESSAGE_QUEUE_BROADCASTER", "true"
+        ),
+        group_name="tp",
+        pynccl_use_current_stream=duplicate_tp_group,
+    )
+
+    if duplicate_tp_group:
+        global _PDMUX_PREFILL_TP_GROUP
+        assert (
+            _PDMUX_PREFILL_TP_GROUP is None
+        ), "tensor model parallel group for PD-Multiplexing Prefill is already initialized"
+        _PDMUX_PREFILL_TP_GROUP = init_model_parallel_group(
+            group_ranks,
+            get_world_group().local_rank,
+            backend,
+            use_message_queue_broadcaster=get_bool_env_var(
+                "SGLANG_USE_MESSAGE_QUEUE_BROADCASTER", "true"
+            ),
+            group_name="pdmux_prefill_tp",
+            pynccl_use_current_stream=True,
+        )
+        if _TP.pynccl_comm:
+            _TP.pynccl_comm.disabled = False
+            _PDMUX_PREFILL_TP_GROUP.pynccl_comm.disabled = False
+
+    attn_dp_size = attention_data_parallel_size
+    attn_cp_size = attention_context_model_parallel_size
+    attn_tp_size = tensor_model_parallel_size // attn_cp_size // attn_dp_size
+
+    global _ATTN_CP
+    assert (
+        _ATTN_CP is None
+    ), "attention context model parallel group is already initialized"
+    if attn_cp_size == tensor_model_parallel_size:
+        _ATTN_CP = _TP
+    else:
+        group_ranks = []
+        for tp_group_idx in range(num_tensor_model_parallel_groups):
+            for dp_idx in range(attn_dp_size):
+                for attn_tp_idx in range(attn_tp_size):
+                    st = (
+                        tp_group_idx * tensor_model_parallel_size
+                        + dp_idx * attn_tp_size * attn_cp_size
+                        + attn_tp_idx
+                    )
+                    en = (
+                        tp_group_idx * tensor_model_parallel_size
+                        + (dp_idx + 1) * attn_tp_size * attn_cp_size
+                        + attn_tp_idx
+                    )
+                    ranks = list(range(st, en, attn_tp_size))
+                    group_ranks.append(ranks)
+        _ATTN_CP = init_model_parallel_group(
+            group_ranks,
+            get_world_group().local_rank,
+            backend,
+            group_name="attn_cp",
+        )
+
+    from sglang.srt.layers.sampler import SYNC_TOKEN_IDS_ACROSS_TP
+
+    global _ATTN_TP
+    assert (
+        _ATTN_TP is None
+    ), "attention tensor model parallel group is already initialized"
+    if attn_tp_size == tensor_model_parallel_size:
+        _ATTN_TP = _TP
+    else:
+        group_ranks = []
+        for tp_group_idx in range(num_tensor_model_parallel_groups):
+            for cp_dp_combined_idx in range(attn_cp_size * attn_dp_size):
+                st = (
+                    tp_group_idx * tensor_model_parallel_size
+                    + cp_dp_combined_idx * attn_tp_size
+                )
+                en = (
+                    tp_group_idx * tensor_model_parallel_size
+                    + (cp_dp_combined_idx + 1) * attn_tp_size
+                )
+                ranks = list(range(st, en))
+                group_ranks.append(ranks)
+        _ATTN_TP = init_model_parallel_group(
+            group_ranks,
+            get_world_group().local_rank,
+            backend,
+            use_pynccl=SYNC_TOKEN_IDS_ACROSS_TP,
+            use_mscclpp_allreduce=False,
+            use_custom_allreduce=False,
+            use_torch_symm_mem_allreduce=False,
+            group_name="attention_tp",
+        )
+
+    moe_ep_size = expert_model_parallel_size
+    moe_dp_size = moe_data_model_parallel_size
+    moe_tp_size = tensor_model_parallel_size // moe_ep_size // moe_dp_size
+
+    global _MOE_DP
+    assert _MOE_DP is None, "moe data parallel group is already initialized"
+    # gpus_per_pp_stage = tensor_model_parallel_size * attention_context_model_parallel_size
+    if moe_dp_size == tensor_model_parallel_size:
+        _MOE_DP = _TP
+    else:
+        group_ranks = []
+        for tp_group_idx in range(num_tensor_model_parallel_groups):
+            for tp_ep_combined_idx in range(moe_tp_size * moe_ep_size):
+                st = tp_group_idx * tensor_model_parallel_size + tp_ep_combined_idx
+                en = (
+                    tp_group_idx + 1
+                ) * tensor_model_parallel_size + tp_ep_combined_idx
+                ranks = list(range(st, en, moe_tp_size * moe_ep_size))
+                group_ranks.append(ranks)
+        _MOE_DP = init_model_parallel_group(
+            group_ranks,
+            get_world_group().local_rank,
+            backend,
+            group_name="moe_dp",
+        )
+
+    global _MOE_EP
+    assert _MOE_EP is None, "expert model parallel group is already initialized"
+    if moe_ep_size == tensor_model_parallel_size:
+        _MOE_EP = _TP
+    else:
+        # TODO(ch-wan): use split_group to save memory
+        group_ranks = []
+        for tp_group_idx in range(num_tensor_model_parallel_groups):
+            for moe_dp_idx in range(moe_dp_size):
+                for moe_tp_idx in range(moe_tp_size):
+                    st = (
+                        tp_group_idx * tensor_model_parallel_size
+                        + moe_dp_idx * moe_ep_size * moe_tp_size
+                        + moe_tp_idx
+                    )
+                    en = st + moe_ep_size * moe_tp_size
+                    ranks = list(range(st, en, moe_tp_size))
+                    group_ranks.append(ranks)
+        _MOE_EP = init_model_parallel_group(
+            group_ranks,
+            get_world_group().local_rank,
+            backend,
+            group_name="moe_ep",
+        )
+
+    global _MOE_TP
+    assert _MOE_TP is None, "expert model parallel group is already initialized"
+    if moe_tp_size == tensor_model_parallel_size:
+        _MOE_TP = _TP
+    else:
+        # TODO(ch-wan): use split_group to save memory
+        group_ranks = []
+        for tp_group_idx in range(num_tensor_model_parallel_groups):
+            for ep_dp_combined_idx in range(moe_ep_size * moe_dp_size):
+                st = (
+                    tp_group_idx * tensor_model_parallel_size
+                    + ep_dp_combined_idx * moe_tp_size
+                )
+                en = (
+                    tp_group_idx * tensor_model_parallel_size
+                    + (ep_dp_combined_idx + 1) * moe_tp_size
+                )
+                ranks = list(range(st, en))
+                group_ranks.append(ranks)
+        _MOE_TP = init_model_parallel_group(
+            group_ranks,
+            get_world_group().local_rank,
+            backend,
+            group_name="moe_tp",
+        )
+
+    # Build the pipeline model-parallel groups.
+    num_pipeline_model_parallel_groups: int = world_size // pipeline_model_parallel_size
+    global _PP
+    assert _PP is None, "pipeline model parallel group is already initialized"
+    group_ranks = []
+    for pp_group_idx in range(num_pipeline_model_parallel_groups):
+        ranks = list(
+            range(pp_group_idx, world_size, num_pipeline_model_parallel_groups)
+        )
+        group_ranks.append(ranks)
+    # pipeline parallel does not need custom allreduce
+    _PP = init_model_parallel_group(
+        group_ranks,
+        get_world_group().local_rank,
+        backend,
+        use_custom_allreduce=False,
+        group_name="pp",
+    )
+
+
+def create_custom_parallel_group(
+    group_ranks: List[int], backend: str = "gloo"
+) -> Optional[torch.distributed.ProcessGroup]:
+    """
+    Create a custom parallel group based on the provided ranks.
+
+    Args:
+        group_ranks: The list of ranks that the CURRENT process wants to join.
+                     (e.g., Rank 0 passes [0...7], Rank 8 passes [8...15])
+        backend: The communication backend (default: "gloo").
+
+    Returns:
+        The ProcessGroup if the current rank is in group_ranks, else None.
+    """
+    assert torch.distributed.is_initialized()
+
+    world_size = torch.distributed.get_world_size()
+    rank = torch.distributed.get_rank()
+
+    local_config = sorted(list(set(group_ranks)))
+    gathered_configs = [None for _ in range(world_size)]
+
+    torch.distributed.all_gather_object(gathered_configs, local_config)
+
+    unique_groups = []
+    seen_signatures = set()
+
+    for config in gathered_configs:
+        config_tuple = tuple(config)
+        if config_tuple not in seen_signatures:
+            seen_signatures.add(config_tuple)
+            unique_groups.append(list(config_tuple))
+
+    unique_groups.sort(key=lambda x: x[0])
+
+    my_new_group = None
+
+    for g_ranks in unique_groups:
+        group = torch.distributed.new_group(ranks=g_ranks, backend=backend)
+
+        if set(g_ranks) == set(local_config):
+            my_new_group = group
+            logger.debug(
+                f"Rank {rank} successfully created/joined custom group: {g_ranks}"
+            )
+
+    return my_new_group
+
+
+def ensure_model_parallel_initialized(
+    tensor_model_parallel_size: int,
+    expert_model_parallel_size: int,
+    pipeline_model_parallel_size: int,
+    backend: Optional[str] = None,
+) -> None:
+    """Helper to initialize model parallel groups if they are not initialized,
+    or ensure tensor-parallel and pipeline-parallel sizes are equal to expected
+    values if the model parallel groups are initialized.
+    """
+    backend = backend or torch.distributed.get_backend(get_world_group().device_group)
+    if not model_parallel_is_initialized():
+        initialize_model_parallel(
+            tensor_model_parallel_size,
+            expert_model_parallel_size,
+            pipeline_model_parallel_size,
+            backend,
+        )
+        return
+
+    assert get_tensor_model_parallel_world_size() == tensor_model_parallel_size, (
+        "tensor parallel group already initialized, but of unexpected size: "
+        f"{get_tensor_model_parallel_world_size()=} vs. "
+        f"{tensor_model_parallel_size=}"
+    )
+    pp_world_size = get_pp_group().world_size
+    assert pp_world_size == pipeline_model_parallel_size, (
+        "pipeline parallel group already initialized, but of unexpected size: "
+        f"{pp_world_size=} vs. "
+        f"{pipeline_model_parallel_size=}"
+    )
+
+
+def model_parallel_is_initialized():
+    """Check if tensor and pipeline parallel groups are initialized."""
+    return _TP is not None and _PP is not None
+
+
+_TP_STATE_PATCHED = False
+
+
+@contextmanager
+def patch_tensor_parallel_group(tp_group: GroupCoordinator):
+    """Patch the tp group temporarily until this function ends.
+
+    This method is for draft workers of speculative decoding to run draft model
+    with different tp degree from that of target model workers.
+
+    Args:
+        tp_group (GroupCoordinator): the tp group coordinator
+    """
+    global _TP_STATE_PATCHED
+    assert not _TP_STATE_PATCHED, "Should not call when it's already patched"
+
+    _TP_STATE_PATCHED = True
+    old_tp_group = get_tp_group()
+    global _TP
+    _TP = tp_group
+    try:
+        yield
+    finally:
+        # restore the original state
+        _TP_STATE_PATCHED = False
+        _TP = old_tp_group
+
+
+def get_world_size():
+    """Return world size for the world group."""
+    return get_world_group().world_size
+
+
+def get_world_rank():
+    """Return my rank for the world group."""
+    return get_world_group().rank_in_group
+
+
+def get_tensor_model_parallel_world_size():
+    """Return world size for the tensor model parallel group."""
+    return get_tp_group().world_size
+
+
+def get_tensor_model_parallel_rank():
+    """Return my rank for the tensor model parallel group."""
+    return get_tp_group().rank_in_group
+
+
+# ATTN_TP
+def get_attn_tensor_model_parallel_world_size():
+    """Return world size for the attention tensor model parallel group."""
+    return get_attn_tp_group().world_size
+
+
+def get_attn_tensor_model_parallel_rank():
+    """Return my rank for the attention tensor model parallel group."""
+    return get_attn_tp_group().rank_in_group
+
+
+# ATTN_CP
+def get_attn_context_model_parallel_world_size():
+    """Return world size for the attention context model parallel group."""
+    return get_attn_cp_group().world_size
+
+
+def get_attn_context_model_parallel_rank():
+    """Return my rank for the attention context model parallel group."""
+    return get_attn_cp_group().rank_in_group
+
+
+def get_pipeline_model_parallel_world_size():
+    """Return world size for the pipeline model parallel group."""
+    return get_pp_group().world_size
+
+
+def get_pipeline_model_parallel_rank():
+    """Return my rank for the pipeline model parallel group."""
+    return get_pp_group().rank_in_group
+
+
+# MOE_DP
+def get_moe_data_parallel_world_size():
+    """Return world size for the moe data parallel group."""
+    return get_moe_dp_group().world_size
+
+
+def get_moe_data_parallel_rank():
+    """Return my rank for the moe data parallel group."""
+    return get_moe_dp_group().rank_in_group
+
+
+# MOE_EP
+def get_moe_expert_parallel_world_size():
+    """Return world size for the moe expert parallel group."""
+    return get_moe_ep_group().world_size
+
+
+def get_moe_expert_parallel_rank():
+    """Return my rank for the moe expert parallel group."""
+    return get_moe_ep_group().rank_in_group
+
+
+# MOE_TP
+def get_moe_tensor_parallel_world_size():
+    """Return world size for the moe tensor parallel group."""
+    return get_moe_tp_group().world_size
+
+
+def get_moe_tensor_parallel_rank():
+    """Return my rank for the moe tensor parallel group."""
+    return get_moe_tp_group().rank_in_group
+
+
+def destroy_model_parallel():
+    """Set the groups to none and destroy them."""
+    global _TP
+    if _TP:
+        _TP.destroy()
+    _TP = None
+
+    global _PP
+    if _PP:
+        _PP.destroy()
+    _PP = None
+
+    global _MOE_EP
+    if _MOE_EP:
+        _MOE_EP.destroy()
+    _MOE_EP = None
+
+    global _MOE_TP
+    if _MOE_TP:
+        _MOE_TP.destroy()
+    _MOE_TP = None
+
+    global _ATTN_CP
+    if _ATTN_CP:
+        _ATTN_CP.destroy()
+    _ATTN_CP = None
+
+    global _ATTN_TP
+    if _ATTN_TP:
+        _ATTN_TP.destroy()
+    _ATTN_TP = None
+
+    global _MOE_DP
+    if _MOE_DP:
+        _MOE_DP.destroy()
+    _MOE_DP = None
+
+    global _PDMUX_PREFILL_TP_GROUP
+    if _PDMUX_PREFILL_TP_GROUP:  # type: ignore[union-attr]
+        _PDMUX_PREFILL_TP_GROUP.destroy()
+    _PDMUX_PREFILL_TP_GROUP = None
+
+
+def destroy_distributed_environment():
+    global _WORLD
+    if _WORLD:
+        _WORLD.destroy()
+    _WORLD = None
+    if torch.distributed.is_initialized():
+        torch.distributed.destroy_process_group()
+
+
+def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
+    destroy_model_parallel()
+    destroy_distributed_environment()
+    with contextlib.suppress(AssertionError):
+        torch.distributed.destroy_process_group()
+    if shutdown_ray:
+        import ray  # Lazy import Ray
+
+        ray.shutdown()
+    gc.collect()
+    if not _is_cpu:
+        if hasattr(torch, "cuda") and torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            if hasattr(torch._C, "_host_emptyCache"):
+                torch._C._host_emptyCache()
+            else:
+                logger.warning(
+                    "torch._C._host_emptyCache() only available in Pytorch >=2.5"
+                )
+        elif hasattr(torch, "xpu") and torch.xpu.is_available():
+            torch.xpu.empty_cache()
+        elif hasattr(torch, "npu") and torch.npu.is_available():
+            torch.npu.empty_cache()
+        elif hasattr(torch, "musa") and torch.musa.is_available():
+            torch.musa.empty_cache()
+
+
+def in_the_same_node_as(pg: ProcessGroup, source_rank: int = 0) -> List[bool]:
+    """
+    This is a collective operation that returns if each rank is in the same node
+    as the source rank. It tests if processes are attached to the same
+    memory system (shared access to shared memory).
+    """
+    assert (
+        torch.distributed.get_backend(pg) != torch.distributed.Backend.NCCL
+    ), "in_the_same_node_as should be tested with a non-NCCL group."
+    # local rank inside the group
+    rank = torch.distributed.get_rank(group=pg)
+    world_size = torch.distributed.get_world_size(group=pg)
+
+    # local tensor in each process to store the result
+    is_in_the_same_node = torch.tensor([0] * world_size, dtype=torch.int32)
+
+    # global ranks of the processes in the group
+    ranks = torch.distributed.get_process_group_ranks(pg)
+
+    magic_message = b"magic_message"
+    shm = None
+
+    try:
+        with contextlib.suppress(OSError):
+            if rank == source_rank:
+                # create a shared memory segment
+                shm = shared_memory.SharedMemory(create=True, size=128)
+                shm.buf[: len(magic_message)] = magic_message
+                torch.distributed.broadcast_object_list(
+                    [shm.name], src=ranks[source_rank], group=pg
+                )
+                is_in_the_same_node[rank] = 1
+            else:
+                # try to open the shared memory segment
+                recv = [None]
+                torch.distributed.broadcast_object_list(
+                    recv, src=ranks[source_rank], group=pg
+                )
+                name = recv[0]
+                # fix to https://stackoverflow.com/q/62748654/9191338
+                # Python incorrectly tracks shared memory even if it is not
+                # created by the process. The following patch is a workaround.
+                with patch(
+                    "multiprocessing.resource_tracker.register",
+                    lambda *args, **kwargs: None,
+                ):
+                    shm = shared_memory.SharedMemory(name=name)
+                if shm.buf[: len(magic_message)] == magic_message:
+                    is_in_the_same_node[rank] = 1
+    except Exception as e:
+        logger.error("Error ignored in is_in_the_same_node: %s", e)
+    finally:
+        if shm:
+            shm.close()
+
+    torch.distributed.barrier(group=pg)
+
+    # clean up the shared memory segment
+    with contextlib.suppress(OSError):
+        if rank == source_rank and shm:
+            shm.unlink()
+    torch.distributed.all_reduce(is_in_the_same_node, group=pg)
+
+    return [x == 1 for x in is_in_the_same_node.tolist()]
+
+
+vllm_get_pp_group = None
+vllm_get_tp_group = None
+vllm_get_world_group = None
+
+
+def monkey_patch_vllm_parallel_state(reverse: bool = False):
+    try:
+        import vllm.distributed.parallel_state as vllm_parrlel_state
+    except ImportError:
+        return
+
+    global vllm_get_pp_group, vllm_get_tp_group, vllm_get_world_group
+    if vllm_get_pp_group is None:
+        vllm_get_pp_group = vllm_parrlel_state.get_pp_group
+        vllm_get_tp_group = vllm_parrlel_state.get_tp_group
+        vllm_get_world_group = vllm_parrlel_state.get_world_group
+    if reverse:
+        setattr(vllm_parrlel_state, "get_pp_group", vllm_get_pp_group)
+        setattr(vllm_parrlel_state, "get_tp_group", vllm_get_tp_group)
+        setattr(vllm_parrlel_state, "get_world_group", vllm_get_world_group)
+    else:
+        setattr(vllm_parrlel_state, "get_pp_group", get_pp_group)
+        setattr(vllm_parrlel_state, "get_tp_group", get_tp_group)
+        setattr(vllm_parrlel_state, "get_world_group", get_world_group)
diff --git a/sglang/python/sglang/srt/distributed/utils.py b/sglang/python/sglang/srt/distributed/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..2fde4fc924570d4141ea92dd9015801b1d4fa8f0
--- /dev/null
+++ b/sglang/python/sglang/srt/distributed/utils.py
@@ -0,0 +1,230 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/distributed/utils.py
+
+# Copyright 2023 The vLLM team.
+# Adapted from
+# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+import dataclasses
+import logging
+import os
+import pickle
+import time
+from collections import deque
+from typing import Any, Deque, Dict, Optional, Sequence, Tuple
+
+import torch
+from torch.distributed import TCPStore
+
+logger = logging.getLogger(__name__)
+
+
+def ensure_divisibility(numerator, denominator):
+    """Ensure that numerator is divisible by the denominator."""
+    assert numerator % denominator == 0, "{} is not divisible by {}".format(
+        numerator, denominator
+    )
+
+
+def divide(numerator, denominator):
+    """Ensure that numerator is divisible by the denominator and return
+    the division value."""
+    ensure_divisibility(numerator, denominator)
+    return numerator // denominator
+
+
+def split_tensor_along_last_dim(
+    tensor: torch.Tensor,
+    num_partitions: int,
+    contiguous_split_chunks: bool = False,
+) -> Sequence[torch.Tensor]:
+    """Split a tensor along its last dimension.
+
+    Arguments:
+        tensor: input tensor.
+        num_partitions: number of partitions to split the tensor
+        contiguous_split_chunks: If True, make each chunk contiguous
+                                 in memory.
+
+    Returns:
+        A list of Tensors
+    """
+    # Get the size and dimension.
+    last_dim = tensor.dim() - 1
+    last_dim_size = divide(tensor.size()[last_dim], num_partitions)
+    # Split.
+    tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
+    # NOTE: torch.split does not create contiguous tensors by default.
+    if contiguous_split_chunks:
+        return tuple(chunk.contiguous() for chunk in tensor_list)
+
+    return tensor_list
+
+
+def get_pp_indices(
+    num_hidden_layers: int, pp_rank: int, pp_size: int
+) -> Tuple[int, int]:
+    """Try to evenly distribute layers across partitions.
+    If the number of layers is not divisible by the number of partitions,
+    the last N partitions will have one extra layer, where N = remainder.
+    """
+    # partition_list_str can be set to None in sglang
+    partition_list_str = os.getenv("SGLANG_PP_LAYER_PARTITION", None)
+    if partition_list_str is not None:
+        try:
+            partitions = [int(layer) for layer in partition_list_str.split(",")]
+        except ValueError as err:
+            raise ValueError(
+                "Invalid partition string: {}".format(partition_list_str)
+            ) from err
+        if len(partitions) != pp_size:
+            raise ValueError(f"{len(partitions)=} does not match {pp_size=}.")
+        if sum(partitions) != num_hidden_layers:
+            raise ValueError(f"{sum(partitions)=} does not match {num_hidden_layers=}.")
+        start_layer = sum(partitions[:pp_rank])
+        end_layer = start_layer + partitions[pp_rank]
+    else:
+        base_layers = num_hidden_layers // pp_size
+        remainder = num_hidden_layers % pp_size
+        # Distribute the extra layers to the last 'remainder' partitions
+        if pp_rank >= pp_size - remainder:
+            partitions_without_extra_layer = pp_size - remainder
+            # This partition gets one extra layer
+            start_layer = pp_rank * (base_layers + 1) - partitions_without_extra_layer
+            end_layer = start_layer + (base_layers + 1)
+        else:
+            # This partition gets only base layers
+            start_layer = pp_rank * base_layers
+            end_layer = start_layer + base_layers
+
+    return (start_layer, end_layer)
+
+
+@dataclasses.dataclass
+class StatelessProcessGroup:
+    """A dataclass to hold a metadata store, and the rank, world_size of the
+    group. Only use it to communicate metadata between processes.
+    For data-plane communication, create NCCL-related objects.
+    """
+
+    rank: int
+    world_size: int
+    store: torch._C._distributed_c10d.Store
+    data_expiration_seconds: int = 3600  # 1 hour
+
+    # dst rank -> counter
+    send_dst_counter: Dict[int, int] = dataclasses.field(default_factory=dict)
+    # src rank -> counter
+    recv_src_counter: Dict[int, int] = dataclasses.field(default_factory=dict)
+    broadcast_send_counter: int = 0
+    broadcast_recv_src_counter: Dict[int, int] = dataclasses.field(default_factory=dict)
+
+    # A deque to store the data entries, with key and timestamp.
+    entries: Deque[Tuple[str, float]] = dataclasses.field(default_factory=deque)
+
+    def __post_init__(self):
+        assert self.rank < self.world_size
+        self.send_dst_counter = {i: 0 for i in range(self.world_size)}
+        self.recv_src_counter = {i: 0 for i in range(self.world_size)}
+        self.broadcast_recv_src_counter = {i: 0 for i in range(self.world_size)}
+
+    def send_obj(self, obj: Any, dst: int):
+        """Send an object to a destination rank."""
+        self.expire_data()
+        key = f"send_to/{dst}/{self.send_dst_counter[dst]}"
+        self.store.set(key, pickle.dumps(obj))
+        self.send_dst_counter[dst] += 1
+        self.entries.append((key, time.perf_counter()))
+
+    def expire_data(self):
+        """Expire data that is older than `data_expiration_seconds` seconds."""
+        while self.entries:
+            # check the oldest entry
+            key, timestamp = self.entries[0]
+            if time.perf_counter() - timestamp > self.data_expiration_seconds:
+                self.store.delete_key(key)
+                self.entries.popleft()
+            else:
+                break
+
+    def recv_obj(self, src: int) -> Any:
+        """Receive an object from a source rank."""
+        obj = pickle.loads(
+            self.store.get(f"send_to/{self.rank}/{self.recv_src_counter[src]}")
+        )
+        self.recv_src_counter[src] += 1
+        return obj
+
+    def broadcast_obj(self, obj: Optional[Any], src: int) -> Any:
+        """Broadcast an object from a source rank to all other ranks.
+        It does not clean up after all ranks have received the object.
+        Use it for limited times, e.g., for initialization.
+        """
+        if self.rank == src:
+            self.expire_data()
+            key = f"broadcast_from/{src}/" f"{self.broadcast_send_counter}"
+            self.store.set(key, pickle.dumps(obj))
+            self.broadcast_send_counter += 1
+            self.entries.append((key, time.perf_counter()))
+            return obj
+        else:
+            key = f"broadcast_from/{src}/" f"{self.broadcast_recv_src_counter[src]}"
+            recv_obj = pickle.loads(self.store.get(key))
+            self.broadcast_recv_src_counter[src] += 1
+            return recv_obj
+
+    def all_gather_obj(self, obj: Any) -> list[Any]:
+        """All gather an object from all ranks."""
+        gathered_objs = []
+        for i in range(self.world_size):
+            if i == self.rank:
+                gathered_objs.append(obj)
+                self.broadcast_obj(obj, src=self.rank)
+            else:
+                recv_obj = self.broadcast_obj(None, src=i)
+                gathered_objs.append(recv_obj)
+        return gathered_objs
+
+    def barrier(self):
+        """A barrier to synchronize all ranks."""
+        for i in range(self.world_size):
+            if i == self.rank:
+                self.broadcast_obj(None, src=self.rank)
+            else:
+                self.broadcast_obj(None, src=i)
+
+    @staticmethod
+    def create(
+        host: str,
+        port: int,
+        rank: int,
+        world_size: int,
+        data_expiration_seconds: int = 3600,
+    ) -> "StatelessProcessGroup":
+        """A replacement for `torch.distributed.init_process_group` that does not
+        pollute the global state.
+
+        If we have process A and process B called `torch.distributed.init_process_group`
+        to form a group, and then we want to form another group with process A, B, C,
+        D, it is not possible in PyTorch, because process A and process B have already
+        formed a group, and process C and process D cannot join that group. This
+        function is a workaround for this issue.
+
+        `torch.distributed.init_process_group` is a global call, while this function
+        is a stateless call. It will return a `StatelessProcessGroup` object that can be
+        used for exchanging metadata. With this function, process A and process B
+        can call `StatelessProcessGroup.create` to form a group, and then process A, B,
+        C, and D can call `StatelessProcessGroup.create` to form another group.
+        """  # noqa
+        store = TCPStore(
+            host_name=host,
+            port=port,
+            world_size=world_size,
+            is_master=(rank == 0),
+        )
+
+        return StatelessProcessGroup(
+            rank=rank,
+            world_size=world_size,
+            store=store,
+            data_expiration_seconds=data_expiration_seconds,
+        )
diff --git a/sglang/python/sglang/srt/entrypoints/EngineBase.py b/sglang/python/sglang/srt/entrypoints/EngineBase.py
new file mode 100644
index 0000000000000000000000000000000000000000..7bcac278d5ad8d0174d8a90bfc2c692b64624ec8
--- /dev/null
+++ b/sglang/python/sglang/srt/entrypoints/EngineBase.py
@@ -0,0 +1,76 @@
+from abc import ABC, abstractmethod
+from typing import Dict, Iterator, List, Optional, Tuple, Union
+
+import torch
+
+
+class EngineBase(ABC):
+    """
+    Abstract base class for engine interfaces that support generation, weight updating, and memory control.
+    This base class provides a unified API for both HTTP-based engines and engines.
+    """
+
+    @abstractmethod
+    def generate(
+        self,
+        prompt: Optional[Union[List[str], str]] = None,
+        sampling_params: Optional[Union[List[Dict], Dict]] = None,
+        input_ids: Optional[Union[List[List[int]], List[int]]] = None,
+        image_data: Optional[Union[List[str], str]] = None,
+        return_logprob: Optional[Union[List[bool], bool]] = False,
+        logprob_start_len: Optional[Union[List[int], int]] = None,
+        top_logprobs_num: Optional[Union[List[int], int]] = None,
+        token_ids_logprob: Optional[Union[List[List[int]], List[int]]] = None,
+        lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None,
+        custom_logit_processor: Optional[Union[List[str], str]] = None,
+        return_hidden_states: Optional[bool] = None,
+        stream: Optional[bool] = None,
+        bootstrap_host: Optional[Union[List[str], str]] = None,
+        bootstrap_port: Optional[Union[List[int], int]] = None,
+        bootstrap_room: Optional[Union[List[int], int]] = None,
+        routed_dp_rank: Optional[int] = None,
+        disagg_prefill_dp_rank: Optional[int] = None,
+        data_parallel_rank: Optional[int] = None,
+        rid: Optional[Union[List[str], str]] = None,
+        priority: Optional[int] = None,
+    ) -> Union[Dict, Iterator[Dict]]:
+        """Generate outputs based on given inputs."""
+        pass
+
+    @abstractmethod
+    def flush_cache(self):
+        """Flush the cache of the engine."""
+        pass
+
+    @abstractmethod
+    def update_weights_from_tensor(
+        self,
+        named_tensors: List[Tuple[str, torch.Tensor]],
+        load_format: Optional[str] = None,
+        flush_cache: bool = True,
+    ):
+        """Update model weights with in-memory tensor data."""
+        pass
+
+    def load_lora_adapter(self, lora_name: str, lora_path: str):
+        """Load a new LoRA adapter without re-launching the engine."""
+        pass
+
+    def unload_lora_adapter(self, lora_name: str):
+        """Unload a LoRA adapter without re-launching the engine."""
+        pass
+
+    @abstractmethod
+    def release_memory_occupation(self):
+        """Release GPU memory occupation temporarily."""
+        pass
+
+    @abstractmethod
+    def resume_memory_occupation(self):
+        """Resume GPU memory occupation which is previously released."""
+        pass
+
+    @abstractmethod
+    def shutdown(self):
+        """Shutdown the engine and clean up resources."""
+        pass
diff --git a/sglang/python/sglang/srt/entrypoints/__pycache__/EngineBase.cpython-311.pyc b/sglang/python/sglang/srt/entrypoints/__pycache__/EngineBase.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b56ed33022dead601795d7cb1217dd11a4535d77
Binary files /dev/null and b/sglang/python/sglang/srt/entrypoints/__pycache__/EngineBase.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/entrypoints/__pycache__/engine.cpython-311.pyc b/sglang/python/sglang/srt/entrypoints/__pycache__/engine.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3ff5002ef63b97c7e025b1b260c2d39030844bcc
Binary files /dev/null and b/sglang/python/sglang/srt/entrypoints/__pycache__/engine.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/entrypoints/__pycache__/http_server.cpython-311.pyc b/sglang/python/sglang/srt/entrypoints/__pycache__/http_server.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8ec58172fee02aadfd2e01b11cbea9388ef1e485
Binary files /dev/null and b/sglang/python/sglang/srt/entrypoints/__pycache__/http_server.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/entrypoints/__pycache__/v1_loads.cpython-311.pyc b/sglang/python/sglang/srt/entrypoints/__pycache__/v1_loads.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ee7709332855ca311a88bbb3f97aa33c7bffe4ec
Binary files /dev/null and b/sglang/python/sglang/srt/entrypoints/__pycache__/v1_loads.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/entrypoints/__pycache__/warmup.cpython-311.pyc b/sglang/python/sglang/srt/entrypoints/__pycache__/warmup.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1774abfa2e812635ce35069102627cdde2640ab2
Binary files /dev/null and b/sglang/python/sglang/srt/entrypoints/__pycache__/warmup.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/entrypoints/anthropic/__init__.py b/sglang/python/sglang/srt/entrypoints/anthropic/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/sglang/python/sglang/srt/entrypoints/anthropic/__pycache__/__init__.cpython-311.pyc b/sglang/python/sglang/srt/entrypoints/anthropic/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f2d29e192006ad33a0425ac68fba1404963a0e7a
Binary files /dev/null and b/sglang/python/sglang/srt/entrypoints/anthropic/__pycache__/__init__.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/entrypoints/anthropic/__pycache__/protocol.cpython-311.pyc b/sglang/python/sglang/srt/entrypoints/anthropic/__pycache__/protocol.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4eac5ef97817b0845f57d70f14a7cb0a7b2e0193
Binary files /dev/null and b/sglang/python/sglang/srt/entrypoints/anthropic/__pycache__/protocol.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/entrypoints/anthropic/__pycache__/serving.cpython-311.pyc b/sglang/python/sglang/srt/entrypoints/anthropic/__pycache__/serving.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..48a5ee333508cdac86a6e97f1acde4700c748057
Binary files /dev/null and b/sglang/python/sglang/srt/entrypoints/anthropic/__pycache__/serving.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/entrypoints/anthropic/protocol.py b/sglang/python/sglang/srt/entrypoints/anthropic/protocol.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e4b2d7d727849e6d63a530270eb5e7edc1c58aa
--- /dev/null
+++ b/sglang/python/sglang/srt/entrypoints/anthropic/protocol.py
@@ -0,0 +1,178 @@
+"""Pydantic models for Anthropic Messages API protocol"""
+
+import uuid
+from typing import Any, Literal, Optional
+
+from pydantic import BaseModel, Field, field_validator
+
+
+class AnthropicError(BaseModel):
+    """Error structure for Anthropic API"""
+
+    type: str
+    message: str
+
+
+class AnthropicErrorResponse(BaseModel):
+    """Error response structure for Anthropic API"""
+
+    type: Literal["error"] = "error"
+    error: AnthropicError
+
+
+class AnthropicUsage(BaseModel):
+    """Token usage information"""
+
+    input_tokens: int
+    output_tokens: int
+    cache_creation_input_tokens: Optional[int] = None
+    cache_read_input_tokens: Optional[int] = None
+
+
+class AnthropicContentBlock(BaseModel):
+    """Content block in message"""
+
+    type: Literal[
+        "text", "image", "tool_use", "tool_result", "thinking", "redacted_thinking"
+    ]
+    text: Optional[str] = None
+    # For image content
+    source: Optional[dict[str, Any]] = None
+    # For tool use/result
+    id: Optional[str] = None
+    tool_use_id: Optional[str] = None
+    name: Optional[str] = None
+    input: Optional[dict[str, Any]] = None
+    content: Optional[str | list[dict[str, Any]]] = None
+    is_error: Optional[bool] = None
+    # For thinking content
+    thinking: Optional[str] = None
+    signature: Optional[str] = None
+
+
+class AnthropicMessage(BaseModel):
+    """Message structure"""
+
+    role: Literal["user", "assistant"]
+    content: str | list[AnthropicContentBlock]
+
+
+class AnthropicTool(BaseModel):
+    """Tool definition"""
+
+    name: str
+    description: Optional[str] = None
+    input_schema: dict[str, Any]
+
+    @field_validator("input_schema")
+    @classmethod
+    def validate_input_schema(cls, v):
+        if not isinstance(v, dict):
+            raise ValueError("input_schema must be a dictionary")
+        if "type" not in v:
+            v["type"] = "object"
+        return v
+
+
+class AnthropicToolChoice(BaseModel):
+    """Tool Choice definition"""
+
+    type: Literal["auto", "any", "tool", "none"]
+    name: Optional[str] = None
+
+
+class AnthropicCountTokensRequest(BaseModel):
+    """Anthropic Count Tokens API request"""
+
+    model: str
+    messages: list[AnthropicMessage]
+    system: Optional[str | list[AnthropicContentBlock]] = None
+    tool_choice: Optional[AnthropicToolChoice] = None
+    tools: Optional[list[AnthropicTool]] = None
+
+
+class AnthropicCountTokensResponse(BaseModel):
+    """Anthropic Count Tokens API response"""
+
+    input_tokens: int
+
+
+class AnthropicMessagesRequest(BaseModel):
+    """Anthropic Messages API request"""
+
+    model: str
+    messages: list[AnthropicMessage]
+    max_tokens: int
+    metadata: Optional[dict[str, Any]] = None
+    stop_sequences: Optional[list[str]] = None
+    stream: Optional[bool] = False
+    system: Optional[str | list[AnthropicContentBlock]] = None
+    temperature: Optional[float] = None
+    tool_choice: Optional[AnthropicToolChoice] = None
+    tools: Optional[list[AnthropicTool]] = None
+    top_k: Optional[int] = None
+    top_p: Optional[float] = None
+
+    @field_validator("model")
+    @classmethod
+    def validate_model(cls, v):
+        if not v:
+            raise ValueError("Model is required")
+        return v
+
+    @field_validator("max_tokens")
+    @classmethod
+    def validate_max_tokens(cls, v):
+        if v <= 0:
+            raise ValueError("max_tokens must be positive")
+        return v
+
+
+class AnthropicDelta(BaseModel):
+    """Delta for streaming responses"""
+
+    type: Optional[Literal["text_delta", "input_json_delta"]] = None
+    text: Optional[str] = None
+    partial_json: Optional[str] = None
+
+    # Message delta fields
+    stop_reason: Optional[
+        Literal["end_turn", "max_tokens", "stop_sequence", "tool_use"]
+    ] = None
+    stop_sequence: Optional[str] = None
+
+
+class AnthropicStreamEvent(BaseModel):
+    """Streaming event"""
+
+    type: Literal[
+        "message_start",
+        "message_delta",
+        "message_stop",
+        "content_block_start",
+        "content_block_delta",
+        "content_block_stop",
+        "ping",
+        "error",
+    ]
+    message: Optional["AnthropicMessagesResponse"] = None
+    delta: Optional[AnthropicDelta] = None
+    content_block: Optional[AnthropicContentBlock] = None
+    index: Optional[int] = None
+    error: Optional[AnthropicError] = None
+    usage: Optional[AnthropicUsage] = None
+
+
+class AnthropicMessagesResponse(BaseModel):
+    """Anthropic Messages API response"""
+
+    id: str = Field(default_factory=lambda: f"msg_{uuid.uuid4().hex}")
+    type: Literal["message"] = "message"
+    role: Literal["assistant"] = "assistant"
+    content: list[AnthropicContentBlock]
+    model: str
+    stop_reason: Optional[
+        Literal["end_turn", "max_tokens", "stop_sequence", "tool_use"]
+    ] = None
+    stop_sequence: Optional[str] = None
+    usage: Optional[AnthropicUsage] = None
diff --git a/sglang/python/sglang/srt/entrypoints/anthropic/serving.py b/sglang/python/sglang/srt/entrypoints/anthropic/serving.py
new file mode 100644
index 0000000000000000000000000000000000000000..88725d41c4fb6b83790fbeef9576f8704748e01f
--- /dev/null
+++ b/sglang/python/sglang/srt/entrypoints/anthropic/serving.py
@@ -0,0 +1,762 @@
+"""Handler for Anthropic Messages API requests.
+
+Converts Anthropic requests to OpenAI ChatCompletion format, delegates to
+OpenAIServingChat for processing, and converts responses back to Anthropic format.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import time
+import uuid
+from typing import TYPE_CHECKING, AsyncGenerator, Optional, Union
+
+from fastapi import Request
+from fastapi.responses import JSONResponse, StreamingResponse
+
+from sglang.srt.entrypoints.anthropic.protocol import (
+    AnthropicContentBlock,
+    AnthropicCountTokensRequest,
+    AnthropicCountTokensResponse,
+    AnthropicDelta,
+    AnthropicError,
+    AnthropicErrorResponse,
+    AnthropicMessagesRequest,
+    AnthropicMessagesResponse,
+    AnthropicStreamEvent,
+    AnthropicUsage,
+)
+from sglang.srt.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    ChatCompletionResponse,
+    ChatCompletionStreamResponse,
+    StreamOptions,
+    Tool,
+    ToolChoice,
+    ToolChoiceFuncName,
+)
+
+if TYPE_CHECKING:
+    from sglang.srt.entrypoints.openai.serving_chat import OpenAIServingChat
+
+logger = logging.getLogger(__name__)
+
+# Map OpenAI finish reasons to Anthropic stop reasons
+STOP_REASON_MAP = {
+    "stop": "end_turn",
+    "length": "max_tokens",
+    "tool_calls": "tool_use",
+}
+
+
+def _wrap_sse_event(data: str, event_type: str) -> str:
+    """Format an Anthropic SSE event with event type and data lines."""
+    return f"event: {event_type}\ndata: {data}\n\n"
+
+
+class AnthropicServing:
+    """Handler for Anthropic Messages API requests.
+
+    Acts as a translation layer between Anthropic's Messages API and SGLang's
+    OpenAI-compatible chat completion infrastructure.
+    """
+
+    def __init__(self, openai_serving_chat: OpenAIServingChat):
+        self.openai_serving_chat = openai_serving_chat
+
+    async def handle_messages(
+        self,
+        request: AnthropicMessagesRequest,
+        raw_request: Request,
+    ) -> Union[JSONResponse, StreamingResponse]:
+        """Main entry point for /v1/messages endpoint."""
+        try:
+            chat_request = self._convert_to_chat_completion_request(request)
+        except Exception as e:
+            logger.exception("Error converting Anthropic request: %s", e)
+            return self._error_response(
+                status_code=400,
+                error_type="invalid_request_error",
+                message=str(e),
+            )
+
+        if request.stream:
+            return await self._handle_streaming(chat_request, request, raw_request)
+        else:
+            return await self._handle_non_streaming(chat_request, request, raw_request)
+
+    def _convert_to_chat_completion_request(
+        self, anthropic_request: AnthropicMessagesRequest
+    ) -> ChatCompletionRequest:
+        """Convert an Anthropic Messages request to an OpenAI ChatCompletion request."""
+        openai_messages = []
+
+        def _convert_anthropic_image_source_to_openai_part(
+            source: Optional[dict],
+        ) -> Optional[dict]:
+            if not isinstance(source, dict):
+                return None
+
+            source_type = source.get("type")
+            if source_type == "base64":
+                media_type = source.get("media_type", "image/png")
+                data = source.get("data", "")
+                if not data:
+                    return None
+                return {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:{media_type};base64,{data}",
+                    },
+                }
+
+            url = source.get("url")
+            if url:
+                return {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": url,
+                    },
+                }
+
+            return None
+
+        def _convert_tool_result_content(
+            content: Optional[str | list[dict]],
+        ) -> tuple[str | list[dict], str]:
+            if isinstance(content, list):
+                tool_content_parts = []
+                tool_text_parts = []
+
+                for item in content:
+                    if not isinstance(item, dict):
+                        continue
+
+                    item_type = item.get("type")
+                    if item_type == "text":
+                        text = item.get("text", "")
+                        if text:
+                            tool_text_parts.append(text)
+                            tool_content_parts.append({"type": "text", "text": text})
+                    elif item_type == "image":
+                        image_part = _convert_anthropic_image_source_to_openai_part(
+                            item.get("source")
+                        )
+                        if image_part is not None:
+                            tool_content_parts.append(image_part)
+
+                tool_text = "\n".join(tool_text_parts)
+                if (
+                    len(tool_content_parts) == 1
+                    and tool_content_parts[0]["type"] == "text"
+                ):
+                    return tool_content_parts[0]["text"], tool_text
+                if tool_content_parts:
+                    return tool_content_parts, tool_text
+                return "", tool_text
+
+            tool_text = str(content) if content else ""
+            return tool_text, tool_text
+
+        # Add system message if provided
+        if anthropic_request.system:
+            if isinstance(anthropic_request.system, str):
+                openai_messages.append(
+                    {"role": "system", "content": anthropic_request.system}
+                )
+            else:
+                system_parts = []
+                for block in anthropic_request.system:
+                    if block.type == "text" and block.text:
+                        system_parts.append(block.text)
+                system_text = "\n".join(system_parts)
+                openai_messages.append({"role": "system", "content": system_text})
+
+        # Convert messages
+        for msg in anthropic_request.messages:
+            if isinstance(msg.content, str):
+                openai_messages.append({"role": msg.role, "content": msg.content})
+                continue
+
+            # Complex content with blocks
+            openai_msg = {"role": msg.role}
+            content_parts = []
+            tool_calls = []
+
+            for block in msg.content:
+                if block.type == "text" and block.text:
+                    content_parts.append({"type": "text", "text": block.text})
+
+                elif block.type == "image" and block.source:
+                    image_part = _convert_anthropic_image_source_to_openai_part(
+                        block.source
+                    )
+                    if image_part is not None:
+                        content_parts.append(image_part)
+
+                elif block.type == "tool_use":
+                    tool_call = {
+                        "id": block.id or f"call_{uuid.uuid4().hex}",
+                        "type": "function",
+                        "function": {
+                            "name": block.name or "",
+                            "arguments": json.dumps(block.input or {}),
+                        },
+                    }
+                    tool_calls.append(tool_call)
+
+                elif block.type == "tool_result":
+                    tool_content, tool_text = _convert_tool_result_content(
+                        block.content
+                    )
+
+                    # Use tool_use_id (per spec) with fallback to id
+                    tool_call_id = block.tool_use_id or block.id or ""
+
+                    # Tool results from user become separate tool messages
+                    if msg.role == "user":
+                        openai_messages.append(
+                            {
+                                "role": "tool",
+                                "tool_call_id": tool_call_id,
+                                "content": tool_content,
+                            }
+                        )
+                    else:
+                        content_parts.append(
+                            {
+                                "type": "text",
+                                "text": f"Tool result: {tool_text}",
+                            }
+                        )
+
+            # Attach tool calls to assistant messages
+            if tool_calls:
+                openai_msg["tool_calls"] = tool_calls
+
+            # Attach content
+            if content_parts:
+                if len(content_parts) == 1 and content_parts[0]["type"] == "text":
+                    openai_msg["content"] = content_parts[0]["text"]
+                else:
+                    openai_msg["content"] = content_parts
+            elif not tool_calls:
+                continue
+
+            openai_messages.append(openai_msg)
+
+        # Build ChatCompletionRequest
+        request_data = {
+            "messages": openai_messages,
+            "model": anthropic_request.model,
+            "max_tokens": anthropic_request.max_tokens,
+            "stream": anthropic_request.stream or False,
+        }
+
+        if anthropic_request.temperature is not None:
+            request_data["temperature"] = anthropic_request.temperature
+        if anthropic_request.top_p is not None:
+            request_data["top_p"] = anthropic_request.top_p
+        if anthropic_request.top_k is not None:
+            request_data["top_k"] = anthropic_request.top_k
+        if anthropic_request.stop_sequences is not None:
+            request_data["stop"] = anthropic_request.stop_sequences
+
+        # Enable usage in stream so we can report it
+        if anthropic_request.stream:
+            request_data["stream_options"] = StreamOptions(include_usage=True)
+
+        chat_request = ChatCompletionRequest(**request_data)
+
+        # Convert tools
+        if anthropic_request.tools:
+            tools = []
+            for tool in anthropic_request.tools:
+                tools.append(
+                    Tool(
+                        type="function",
+                        function={
+                            "name": tool.name,
+                            "description": tool.description or "",
+                            "parameters": tool.input_schema,
+                        },
+                    )
+                )
+            chat_request.tools = tools
+
+        # Convert tool choice
+        if anthropic_request.tool_choice is not None:
+            if anthropic_request.tool_choice.type == "none":
+                chat_request.tool_choice = "none"
+            elif anthropic_request.tool_choice.type == "auto":
+                chat_request.tool_choice = "auto"
+            elif anthropic_request.tool_choice.type == "any":
+                chat_request.tool_choice = "required"
+            elif anthropic_request.tool_choice.type == "tool":
+                chat_request.tool_choice = ToolChoice(
+                    type="function",
+                    function=ToolChoiceFuncName(
+                        name=anthropic_request.tool_choice.name
+                    ),
+                )
+        elif anthropic_request.tools:
+            # Default to auto when tools are provided
+            chat_request.tool_choice = "auto"
+
+        return chat_request
+
+    async def _handle_non_streaming(
+        self,
+        chat_request: ChatCompletionRequest,
+        anthropic_request: AnthropicMessagesRequest,
+        raw_request: Request,
+    ) -> JSONResponse:
+        """Handle non-streaming Anthropic request by delegating to OpenAI handler."""
+        received_time = time.time()
+        received_time_perf = time.perf_counter()
+
+        # Validate
+        error_msg = self.openai_serving_chat._validate_request(chat_request)
+        if error_msg:
+            return self._error_response(
+                status_code=400,
+                error_type="invalid_request_error",
+                message=error_msg,
+            )
+
+        try:
+            # Convert to internal request
+            validation_time = time.perf_counter() - received_time_perf
+            adapted_request, processed_request = (
+                self.openai_serving_chat._convert_to_internal_request(
+                    chat_request, raw_request
+                )
+            )
+            adapted_request.validation_time = validation_time
+            adapted_request.received_time = received_time
+            adapted_request.received_time_perf = received_time_perf
+
+            # Get response from OpenAI handler
+            response = await self.openai_serving_chat._handle_non_streaming_request(
+                adapted_request, processed_request, raw_request
+            )
+        except Exception as e:
+            logger.exception("Error processing Anthropic request: %s", e)
+            return self._error_response(
+                status_code=500,
+                error_type="internal_error",
+                message="Internal server error",
+            )
+
+        # Check for error responses from OpenAI handler
+        if not isinstance(response, ChatCompletionResponse):
+            # It's an error response (ORJSONResponse)
+            return self._error_response(
+                status_code=500,
+                error_type="internal_error",
+                message="Internal processing error",
+            )
+
+        # Convert to Anthropic response
+        anthropic_response = self._convert_response(response)
+        return JSONResponse(content=anthropic_response.model_dump(exclude_none=True))
+
+    async def _handle_streaming(
+        self,
+        chat_request: ChatCompletionRequest,
+        anthropic_request: AnthropicMessagesRequest,
+        raw_request: Request,
+    ) -> Union[StreamingResponse, JSONResponse]:
+        """Handle streaming Anthropic request."""
+        received_time = time.time()
+        received_time_perf = time.perf_counter()
+
+        # Validate
+        error_msg = self.openai_serving_chat._validate_request(chat_request)
+        if error_msg:
+            return self._error_response(
+                status_code=400,
+                error_type="invalid_request_error",
+                message=error_msg,
+            )
+
+        try:
+            validation_time = time.perf_counter() - received_time_perf
+            adapted_request, processed_request = (
+                self.openai_serving_chat._convert_to_internal_request(
+                    chat_request, raw_request
+                )
+            )
+            adapted_request.validation_time = validation_time
+            adapted_request.received_time = received_time
+            adapted_request.received_time_perf = received_time_perf
+        except Exception as e:
+            logger.exception("Error converting streaming request: %s", e)
+            return self._error_response(
+                status_code=500,
+                error_type="internal_error",
+                message="Internal server error",
+            )
+
+        return StreamingResponse(
+            self._generate_anthropic_stream(
+                adapted_request,
+                processed_request,
+                anthropic_request,
+                raw_request,
+            ),
+            media_type="text/event-stream",
+            background=self.openai_serving_chat.tokenizer_manager.create_abort_task(
+                adapted_request
+            ),
+        )
+
+    async def _generate_anthropic_stream(
+        self,
+        adapted_request,
+        processed_request: ChatCompletionRequest,
+        anthropic_request: AnthropicMessagesRequest,
+        raw_request: Request,
+    ) -> AsyncGenerator[str, None]:
+        """Convert OpenAI chat stream to Anthropic event stream."""
+        openai_stream = self.openai_serving_chat._generate_chat_stream(
+            adapted_request, processed_request, raw_request
+        )
+
+        # State tracking
+        first_chunk = True
+        content_block_index = 0
+        content_block_open = False
+        finish_reason: Optional[str] = None
+        usage_info: Optional[dict] = None
+        message_id = f"msg_{uuid.uuid4().hex}"
+        model = anthropic_request.model
+
+        async for sse_line in openai_stream:
+            if not sse_line.startswith("data: "):
+                continue
+
+            data_str = sse_line[6:].strip()
+
+            if data_str == "[DONE]":
+                # Close any open content block
+                if content_block_open:
+                    stop_event = AnthropicStreamEvent(
+                        type="content_block_stop",
+                        index=content_block_index,
+                    )
+                    yield _wrap_sse_event(
+                        stop_event.model_dump_json(exclude_none=True),
+                        "content_block_stop",
+                    )
+
+                # Emit message_delta with stop_reason and usage
+                stop_reason = STOP_REASON_MAP.get(finish_reason or "stop", "end_turn")
+                delta_event = AnthropicStreamEvent(
+                    type="message_delta",
+                    delta=AnthropicDelta(stop_reason=stop_reason),
+                    usage=AnthropicUsage(
+                        input_tokens=(
+                            usage_info.get("input_tokens", 0) if usage_info else 0
+                        ),
+                        output_tokens=(
+                            usage_info.get("output_tokens", 0) if usage_info else 0
+                        ),
+                    ),
+                )
+                yield _wrap_sse_event(
+                    delta_event.model_dump_json(exclude_none=True),
+                    "message_delta",
+                )
+
+                # Emit message_stop
+                stop_msg = AnthropicStreamEvent(type="message_stop")
+                yield _wrap_sse_event(
+                    stop_msg.model_dump_json(exclude_none=True),
+                    "message_stop",
+                )
+                continue
+
+            # Parse the OpenAI chunk
+            try:
+                chunk = ChatCompletionStreamResponse.model_validate_json(data_str)
+            except Exception:
+                logger.debug("Failed to parse stream chunk: %s", data_str)
+                error_event = AnthropicStreamEvent(
+                    type="error",
+                    error=AnthropicError(
+                        type="api_error", message="Stream processing error"
+                    ),
+                )
+                yield _wrap_sse_event(
+                    error_event.model_dump_json(exclude_none=True), "error"
+                )
+                continue
+
+            # First chunk: emit message_start
+            if first_chunk:
+                first_chunk = False
+
+                start_event = AnthropicStreamEvent(
+                    type="message_start",
+                    message=AnthropicMessagesResponse(
+                        id=message_id,
+                        content=[],
+                        model=model,
+                        usage=AnthropicUsage(
+                            input_tokens=(
+                                chunk.usage.prompt_tokens if chunk.usage else 0
+                            ),
+                            output_tokens=0,
+                        ),
+                    ),
+                )
+                yield _wrap_sse_event(
+                    start_event.model_dump_json(exclude_none=True),
+                    "message_start",
+                )
+                # Skip if this was just the role chunk with empty content
+                if chunk.choices and chunk.choices[0].delta.content == "":
+                    continue
+
+            # Usage-only chunk (empty choices with usage info)
+            if not chunk.choices and chunk.usage:
+                usage_info = {
+                    "input_tokens": chunk.usage.prompt_tokens,
+                    "output_tokens": chunk.usage.completion_tokens or 0,
+                }
+                continue
+
+            if not chunk.choices:
+                continue
+
+            choice = chunk.choices[0]
+
+            # Capture finish reason
+            if choice.finish_reason is not None:
+                finish_reason = choice.finish_reason
+                continue
+
+            delta = choice.delta
+
+            # Handle tool call deltas
+            if delta.tool_calls:
+                for tc in delta.tool_calls:
+                    tc_id = tc.id
+                    tc_func = tc.function
+
+                    # New tool call: close previous block, start new one
+                    if tc_func and tc_func.name:
+                        # Close previous content block if open
+                        if content_block_open:
+                            stop_event = AnthropicStreamEvent(
+                                type="content_block_stop",
+                                index=content_block_index,
+                            )
+                            yield _wrap_sse_event(
+                                stop_event.model_dump_json(exclude_none=True),
+                                "content_block_stop",
+                            )
+                            content_block_index += 1
+
+                        # Start tool_use content block
+                        start_event = AnthropicStreamEvent(
+                            type="content_block_start",
+                            index=content_block_index,
+                            content_block=AnthropicContentBlock(
+                                type="tool_use",
+                                id=tc_id or f"toolu_{uuid.uuid4().hex}",
+                                name=tc_func.name,
+                                input={},
+                            ),
+                        )
+                        yield _wrap_sse_event(
+                            start_event.model_dump_json(exclude_none=True),
+                            "content_block_start",
+                        )
+                        content_block_open = True
+
+                        # Stream initial arguments if present
+                        if tc_func.arguments:
+                            delta_event = AnthropicStreamEvent(
+                                type="content_block_delta",
+                                index=content_block_index,
+                                delta=AnthropicDelta(
+                                    type="input_json_delta",
+                                    partial_json=tc_func.arguments,
+                                ),
+                            )
+                            yield _wrap_sse_event(
+                                delta_event.model_dump_json(exclude_none=True),
+                                "content_block_delta",
+                            )
+
+                    elif tc_func and tc_func.arguments:
+                        # Continuing arguments for current tool call
+                        delta_event = AnthropicStreamEvent(
+                            type="content_block_delta",
+                            index=content_block_index,
+                            delta=AnthropicDelta(
+                                type="input_json_delta",
+                                partial_json=tc_func.arguments,
+                            ),
+                        )
+                        yield _wrap_sse_event(
+                            delta_event.model_dump_json(exclude_none=True),
+                            "content_block_delta",
+                        )
+                continue
+
+            # Handle text content deltas
+            if delta.content is not None and delta.content != "":
+                # Start a text content block if needed
+                if not content_block_open:
+                    start_event = AnthropicStreamEvent(
+                        type="content_block_start",
+                        index=content_block_index,
+                        content_block=AnthropicContentBlock(type="text", text=""),
+                    )
+                    yield _wrap_sse_event(
+                        start_event.model_dump_json(exclude_none=True),
+                        "content_block_start",
+                    )
+                    content_block_open = True
+
+                # Emit text delta
+                delta_event = AnthropicStreamEvent(
+                    type="content_block_delta",
+                    index=content_block_index,
+                    delta=AnthropicDelta(
+                        type="text_delta",
+                        text=delta.content,
+                    ),
+                )
+                yield _wrap_sse_event(
+                    delta_event.model_dump_json(exclude_none=True),
+                    "content_block_delta",
+                )
+
+    def _convert_response(
+        self, response: ChatCompletionResponse
+    ) -> AnthropicMessagesResponse:
+        """Convert an OpenAI ChatCompletionResponse to an Anthropic Messages response."""
+        if not response.choices:
+            return AnthropicMessagesResponse(
+                content=[AnthropicContentBlock(type="text", text="")],
+                model=response.model,
+                stop_reason="end_turn",
+                usage=AnthropicUsage(input_tokens=0, output_tokens=0),
+            )
+
+        choice = response.choices[0]
+        content: list[AnthropicContentBlock] = []
+
+        # Add text content
+        if choice.message.content:
+            content.append(
+                AnthropicContentBlock(type="text", text=choice.message.content)
+            )
+
+        # Add tool calls
+        if choice.message.tool_calls:
+            for tool_call in choice.message.tool_calls:
+                try:
+                    tool_input = json.loads(tool_call.function.arguments)
+                except (json.JSONDecodeError, TypeError):
+                    tool_input = {}
+
+                content.append(
+                    AnthropicContentBlock(
+                        type="tool_use",
+                        id=tool_call.id,
+                        name=tool_call.function.name,
+                        input=tool_input,
+                    )
+                )
+
+        # Map stop reason
+        stop_reason = STOP_REASON_MAP.get(choice.finish_reason or "stop", "end_turn")
+
+        return AnthropicMessagesResponse(
+            id=f"msg_{uuid.uuid4().hex}",
+            content=content,
+            model=response.model,
+            stop_reason=stop_reason,
+            usage=AnthropicUsage(
+                input_tokens=response.usage.prompt_tokens if response.usage else 0,
+                output_tokens=response.usage.completion_tokens if response.usage else 0,
+            ),
+        )
+
+    def _error_response(
+        self,
+        status_code: int,
+        error_type: str,
+        message: str,
+    ) -> JSONResponse:
+        """Create an Anthropic-format error response."""
+        error_resp = AnthropicErrorResponse(
+            error=AnthropicError(type=error_type, message=message)
+        )
+        return JSONResponse(
+            status_code=status_code,
+            content=error_resp.model_dump(),
+        )
+
+    async def handle_count_tokens(
+        self,
+        request: AnthropicCountTokensRequest,
+        raw_request: Request,
+    ) -> JSONResponse:
+        """Handle /v1/messages/count_tokens endpoint.
+
+        Converts the request to a ChatCompletionRequest, applies the chat
+        template via the OpenAI handler to tokenize, and returns the count.
+        """
+        try:
+            # Build a minimal AnthropicMessagesRequest so we can reuse conversion
+            messages_request = AnthropicMessagesRequest(
+                model=request.model,
+                messages=request.messages,
+                max_tokens=1,  # dummy, not used for counting
+                system=request.system,
+                tools=request.tools,
+                tool_choice=request.tool_choice,
+            )
+            chat_request = self._convert_to_chat_completion_request(messages_request)
+        except Exception as e:
+            logger.exception("Error converting count_tokens request: %s", e)
+            return self._error_response(
+                status_code=400,
+                error_type="invalid_request_error",
+                message=str(e),
+            )
+
+        try:
+            is_multimodal = (
+                self.openai_serving_chat.tokenizer_manager.model_config.is_multimodal
+            )
+            processed = self.openai_serving_chat._process_messages(
+                chat_request, is_multimodal
+            )
+
+            if isinstance(processed.prompt_ids, list):
+                input_tokens = len(processed.prompt_ids)
+            else:
+                # prompt_ids is a string (multimodal case) — tokenize it
+                tokenizer = self.openai_serving_chat.tokenizer_manager.tokenizer
+                input_tokens = len(tokenizer.encode(processed.prompt_ids))
+
+            return JSONResponse(
+                content=AnthropicCountTokensResponse(
+                    input_tokens=input_tokens
+                ).model_dump()
+            )
+        except Exception as e:
+            logger.exception("Error counting tokens: %s", e)
+            return self._error_response(
+                status_code=500,
+                error_type="internal_error",
+                message="Internal server error",
+            )
diff --git a/sglang/python/sglang/srt/entrypoints/context.py b/sglang/python/sglang/srt/entrypoints/context.py
new file mode 100644
index 0000000000000000000000000000000000000000..083e75f17ebf6366439e8ef80ef4ec7370da53c4
--- /dev/null
+++ b/sglang/python/sglang/srt/entrypoints/context.py
@@ -0,0 +1,249 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copied from vLLM
+import logging
+from abc import ABC, abstractmethod
+from typing import Union
+
+import orjson
+
+logger = logging.getLogger(__name__)
+
+try:
+    from mcp import ClientSession
+except ImportError as e:
+    mcp = e
+
+from openai_harmony import Author, Message, Role, StreamState, TextContent
+
+from sglang.srt.entrypoints.harmony_utils import (
+    get_encoding,
+    get_streamable_parser_for_assistant,
+    render_for_completion,
+)
+from sglang.srt.entrypoints.tool import Tool
+
+
+class ConversationContext(ABC):
+
+    @abstractmethod
+    def append_output(self, output) -> None:
+        pass
+
+    @abstractmethod
+    async def call_tool(self) -> list[Message]:
+        pass
+
+    @abstractmethod
+    def need_builtin_tool_call(self) -> bool:
+        pass
+
+    @abstractmethod
+    def render_for_completion(self) -> list[int]:
+        pass
+
+
+class SimpleContext(ConversationContext):
+
+    def __init__(self):
+        self.last_output = None
+
+    def append_output(self, output) -> None:
+        self.last_output = output
+
+    def need_builtin_tool_call(self) -> bool:
+        return False
+
+    async def call_tool(self) -> list[Message]:
+        raise NotImplementedError("Should not be called.")
+
+    def render_for_completion(self) -> list[int]:
+        raise NotImplementedError("Should not be called.")
+
+
+class HarmonyContext(ConversationContext):
+
+    def __init__(
+        self,
+        messages: list,
+        tool_sessions: dict[str, Union["ClientSession", Tool]],
+    ):
+        # TODO: Remove the hack of Union[ClientSession, Tool] by using MCP
+        # when demo.
+        self._messages = messages
+        self.tool_sessions = tool_sessions
+
+        self.parser = get_streamable_parser_for_assistant()
+        self.num_init_messages = len(messages)
+        # TODO
+        self.num_prompt_tokens = 0
+        self.num_cached_tokens = 0
+        self.num_output_tokens = 0
+        self.num_reasoning_tokens = 0
+
+    def append_output(self, output) -> None:
+        if isinstance(output, dict) and "output_ids" in output:
+            output_token_ids = output["output_ids"]
+
+            for token_id in output_token_ids:
+                self.parser.process(token_id)
+            output_msgs = self.parser.messages
+
+            meta_info = output["meta_info"]
+
+            if isinstance(meta_info, dict):
+                if "prompt_token_ids" in meta_info:
+                    self.num_prompt_tokens = meta_info["prompt_tokens"]
+                if "cached_tokens" in meta_info:
+                    self.num_cached_tokens = meta_info["cached_tokens"]
+                if "completion_tokens" in meta_info:
+                    self.num_output_tokens += meta_info["completion_tokens"]
+
+        else:
+            output_msgs = output
+
+        self._messages.extend(output_msgs)
+
+    @property
+    def messages(self) -> list:
+        return self._messages
+
+    def need_builtin_tool_call(self) -> bool:
+        if not self.messages:
+            return False
+        last_msg = self.messages[-1]
+        recipient = last_msg.recipient
+        return recipient is not None and (
+            recipient.startswith("browser.") or recipient.startswith("python")
+        )
+
+    async def call_tool(self) -> list[Message]:
+        if not self.messages:
+            return []
+        last_msg = self.messages[-1]
+        recipient = last_msg.recipient
+        if recipient is not None:
+            if recipient.startswith("browser."):
+                return await self.call_search_tool(
+                    self.tool_sessions["browser"], last_msg
+                )
+            elif recipient.startswith("python"):
+                return await self.call_python_tool(
+                    self.tool_sessions["python"], last_msg
+                )
+        raise ValueError("No tool call found")
+
+    def render_for_completion(self) -> list[int]:
+        return render_for_completion(self.messages)
+
+    async def call_search_tool(
+        self, tool_session: Union["ClientSession", Tool], last_msg: Message
+    ) -> list[Message]:
+        if isinstance(tool_session, Tool):
+            return await tool_session.get_result(self)
+        tool_name = last_msg.recipient.split(".")[1]
+        args = orjson.loads(last_msg.content[0].text)
+        result = await tool_session.call_tool(tool_name, args)
+        result_str = result.content[0].text
+        content = TextContent(text=result_str)
+        author = Author(role=Role.TOOL, name=last_msg.recipient)
+        return [Message(author=author, content=[content], recipient=Role.ASSISTANT)]
+
+    async def call_python_tool(
+        self, tool_session: Union["ClientSession", Tool], last_msg: Message
+    ) -> list[Message]:
+        if isinstance(tool_session, Tool):
+            return await tool_session.get_result(self)
+        param = {
+            "code": last_msg.content[0].text,
+        }
+        result = await tool_session.call_tool("python", param)
+        result_str = result.content[0].text
+
+        content = TextContent(text=result_str)
+        author = Author(role=Role.TOOL, name="python")
+
+        return [
+            Message(
+                author=author,
+                content=[content],
+                channel=last_msg.channel,
+                recipient=Role.ASSISTANT,
+            )
+        ]
+
+
+class StreamingHarmonyContext(HarmonyContext):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.last_output = None
+
+        self.parser = get_streamable_parser_for_assistant()
+        self.encoding = get_encoding()
+        self.last_tok = None
+        self.num_processed_tokens = 0
+
+    @property
+    def messages(self) -> list:
+        return self.parser.messages
+
+    def append_output(self, output) -> None:
+        if isinstance(output, dict) and "output_ids" in output:
+            # RequestOutput from SGLang with outputs
+            output_token_ids = output["output_ids"]
+
+            # Check if we need to handle cumulative tokens
+            meta_info = output.get("meta_info", {})
+            completion_tokens = meta_info.get("completion_tokens")
+            if (
+                completion_tokens is not None
+                and len(output_token_ids) == completion_tokens
+            ):
+                # Case 1: When --stream-output is not set.
+                # The output_ids contains all tokens generated so far.
+                # We only need to process the new tokens.
+                new_token_ids = output_token_ids[self.num_processed_tokens :]
+                self.num_processed_tokens = len(output_token_ids)
+            else:
+                # Case 2: When --stream-output is set.
+                # The output_ids contains only the new tokens.
+                new_token_ids = output_token_ids
+                self.num_processed_tokens += len(output_token_ids)
+
+            for token_id in new_token_ids:
+                self.parser.process(token_id)
+
+        else:
+            # Handle the case of tool output in direct message format
+            assert len(output) == 1, "Tool output should be a single message"
+            msg = output[0]
+            # Sometimes the recipient is not set for tool messages,
+            # so we set it to "assistant"
+            if msg.author.role == Role.TOOL and msg.recipient is None:
+                msg.recipient = "assistant"
+            toks = self.encoding.render(msg)
+            for tok in toks:
+                self.parser.process(tok)
+            self.last_tok = toks[-1]
+
+    def is_expecting_start(self) -> bool:
+        return self.parser.state == StreamState.EXPECT_START
+
+    def is_assistant_action_turn(self) -> bool:
+        return self.last_tok in self.encoding.stop_tokens_for_assistant_actions()
+
+    def render_for_completion(self) -> list[int]:
+        # now this list of tokens as next turn's starting tokens
+        # `<|start|>assistant``,
+        # we need to process them in parser.
+        rendered_tokens = super().render_for_completion()
+
+        last_n = -1
+        to_process = []
+        while rendered_tokens[last_n] != self.last_tok:
+            to_process.append(rendered_tokens[last_n])
+            last_n -= 1
+        for tok in reversed(to_process):
+            self.parser.process(tok)
+
+        return rendered_tokens
diff --git a/sglang/python/sglang/srt/entrypoints/engine.py b/sglang/python/sglang/srt/entrypoints/engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..87c8c0a19f89bc82226d92cdb9e90322c5ae04a1
--- /dev/null
+++ b/sglang/python/sglang/srt/entrypoints/engine.py
@@ -0,0 +1,1231 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+The entry point of inference server. (SRT = SGLang Runtime)
+
+This file implements python APIs for the inference engine.
+"""
+
+import asyncio
+import atexit
+import dataclasses
+import logging
+import multiprocessing as mp
+import os
+import random
+import signal
+import threading
+import time
+from typing import (
+    Any,
+    AsyncIterator,
+    Callable,
+    Dict,
+    Iterator,
+    List,
+    Optional,
+    Tuple,
+    Union,
+)
+
+# Fix a bug of Python threading
+setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
+
+import torch
+import uvloop
+import zmq
+
+from sglang.srt.elastic_ep.expert_backup_manager import run_expert_backup_manager
+from sglang.srt.entrypoints.EngineBase import EngineBase
+from sglang.srt.managers.data_parallel_controller import (
+    run_data_parallel_controller_process,
+)
+from sglang.srt.managers.detokenizer_manager import run_detokenizer_process
+from sglang.srt.managers.io_struct import (
+    CloseSessionReqInput,
+    DestroyWeightsUpdateGroupReqInput,
+    EmbeddingReqInput,
+    GenerateReqInput,
+    GetWeightsByNameReqInput,
+    InitWeightsUpdateGroupReqInput,
+    LoadLoRAAdapterFromTensorsReqInput,
+    LoadLoRAAdapterReqInput,
+    MultimodalDataInputFormat,
+    OpenSessionReqInput,
+    ReleaseMemoryOccupationReqInput,
+    ResumeMemoryOccupationReqInput,
+    RpcReqInput,
+    RpcReqOutput,
+    UnloadLoRAAdapterReqInput,
+    UpdateWeightFromDiskReqInput,
+    UpdateWeightsFromDistributedReqInput,
+    UpdateWeightsFromIPCReqInput,
+    UpdateWeightsFromTensorReqInput,
+)
+from sglang.srt.managers.multi_tokenizer_mixin import MultiTokenizerRouter
+from sglang.srt.managers.scheduler import run_scheduler_process
+from sglang.srt.managers.template_manager import TemplateManager
+from sglang.srt.managers.tokenizer_manager import TokenizerManager
+from sglang.srt.managers.tokenizer_manager_multiitem_mixin import ScoreResult
+from sglang.srt.model_loader.remote_instance_weight_loader_utils import (
+    parse_remote_instance_transfer_engine_info_from_scheduler_infos,
+)
+from sglang.srt.observability.trace import process_tracing_init, trace_set_thread_info
+from sglang.srt.server_args import PortArgs, ServerArgs
+from sglang.srt.utils import (
+    MultiprocessingSerializer,
+    assert_pkg_version,
+    configure_logger,
+    get_bool_env_var,
+    get_zmq_socket,
+    is_cuda,
+    kill_process_tree,
+    launch_dummy_health_check_server,
+    maybe_reindex_device_id,
+    numa_utils,
+    set_prometheus_multiproc_dir,
+    set_ulimit,
+)
+from sglang.srt.utils.torch_memory_saver_adapter import TorchMemorySaverAdapter
+from sglang.version import __version__
+
+logger = logging.getLogger(__name__)
+asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
+
+_is_cuda = is_cuda()
+
+
+@dataclasses.dataclass
+class SchedulerInitResult:
+    """Result from launching schedulers."""
+
+    scheduler_infos: List[Dict[str, Any]]
+    wait_for_ready: Callable[[], None] = lambda: None
+    wait_for_completion: Callable[[], None] = lambda: None
+
+
+def init_tokenizer_manager(
+    server_args: ServerArgs,
+    port_args: PortArgs,
+    TokenizerManagerClass: Optional[TokenizerManager] = None,
+) -> Tuple[TokenizerManager, TemplateManager]:
+    # Launch tokenizer process
+    TokenizerManagerClass = TokenizerManagerClass or TokenizerManager
+    tokenizer_manager = TokenizerManagerClass(server_args, port_args)
+
+    # Initialize templates
+    template_manager = TemplateManager()
+    template_manager.initialize_templates(
+        tokenizer_manager=tokenizer_manager,
+        model_path=server_args.model_path,
+        chat_template=server_args.chat_template,
+        completion_template=server_args.completion_template,
+    )
+
+    return tokenizer_manager, template_manager
+
+
+class Engine(EngineBase):
+    """
+    The entry point to the inference engine.
+
+    - The engine consists of three components:
+        1. TokenizerManager: Tokenizes the requests and sends them to the scheduler.
+        2. Scheduler (subprocess): Receives requests from the Tokenizer Manager, schedules batches, forwards them, and sends the output tokens to the Detokenizer Manager.
+        3. DetokenizerManager (subprocess): Detokenizes the output tokens and sends the result back to the Tokenizer Manager.
+
+    Note:
+    1. The HTTP server, Engine, and TokenizerManager all run in the main process.
+    2. Inter-process communication is done through IPC (each process uses a different port) via the ZMQ library.
+    """
+
+    # Some fields to allow people to override the server args
+    # and launch processes for their private forks.
+    server_args_class: ServerArgs = ServerArgs
+    init_tokenizer_manager_func: Callable = staticmethod(init_tokenizer_manager)
+    run_scheduler_process_func: Callable = staticmethod(run_scheduler_process)
+    run_detokenizer_process_func: Callable = staticmethod(run_detokenizer_process)
+
+    def __init__(self, **kwargs):
+        """
+        The arguments of this function is the same as `sglang/srt/server_args.py::ServerArgs`.
+        Please refer to `ServerArgs` for the documentation.
+        """
+
+        # Parse server_args
+        if "server_args" in kwargs:
+            # Directly load server_args
+            server_args = kwargs["server_args"]
+        else:
+            # Construct server_args from kwargs
+            if "log_level" not in kwargs:
+                # Do not print logs by default
+                kwargs["log_level"] = "error"
+            server_args = self.server_args_class(**kwargs)
+        self.server_args = server_args
+        logger.info(f"{server_args=}")
+
+        # Shutdown the subprocesses automatically when the program exits
+        atexit.register(self.shutdown)
+
+        # Launch subprocesses
+        (
+            tokenizer_manager,
+            template_manager,
+            port_args,
+            scheduler_init_result,
+        ) = self._launch_subprocesses(
+            server_args=server_args,
+            init_tokenizer_manager_func=self.init_tokenizer_manager_func,
+            run_scheduler_process_func=self.run_scheduler_process_func,
+            run_detokenizer_process_func=self.run_detokenizer_process_func,
+        )
+        self.tokenizer_manager = tokenizer_manager
+        self.template_manager = template_manager
+        self._scheduler_init_result = scheduler_init_result
+        self.port_args = port_args
+        self.remote_instance_transfer_engine_info = (
+            parse_remote_instance_transfer_engine_info_from_scheduler_infos(
+                scheduler_init_result.scheduler_infos
+            )
+        )
+
+        # Initialize ZMQ sockets
+        context = zmq.Context(2)
+        if self.server_args.node_rank == 0:
+            self.send_to_rpc = get_zmq_socket(
+                context, zmq.DEALER, self.port_args.rpc_ipc_name, True
+            )
+        else:
+            self.send_to_rpc = None
+
+        # Enable tracing
+        if server_args.enable_trace:
+            process_tracing_init(server_args.otlp_traces_endpoint, "sglang")
+            thread_label = "Tokenizer"
+            if server_args.disaggregation_mode == "prefill":
+                thread_label = "Prefill Tokenizer"
+            elif server_args.disaggregation_mode == "decode":
+                thread_label = "Decode Tokenizer"
+            trace_set_thread_info(thread_label)
+
+        try:
+            self.loop = asyncio.get_running_loop()
+        except RuntimeError:
+            self.loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(self.loop)
+
+    def _resolve_routed_dp_rank(
+        self,
+        routed_dp_rank: Optional[int],
+        data_parallel_rank: Optional[int],
+    ) -> Optional[int]:
+        if data_parallel_rank is not None:
+            import warnings
+
+            warnings.warn(
+                "'data_parallel_rank' is deprecated, use 'routed_dp_rank' instead.",
+                DeprecationWarning,
+                stacklevel=3,
+            )
+            if routed_dp_rank is None:
+                routed_dp_rank = data_parallel_rank
+
+        if routed_dp_rank is not None:
+            dp_size = self.server_args.dp_size
+            if dp_size <= 1 and routed_dp_rank == 0:
+                logger.warning(
+                    f"routed_dp_rank={routed_dp_rank} is ignored because dp_size={dp_size}"
+                )
+                return None
+            if routed_dp_rank < 0 or routed_dp_rank >= dp_size:
+                raise ValueError(
+                    f"routed_dp_rank={routed_dp_rank} out of range [0, {dp_size})"
+                )
+
+        logger.debug(f"routed_dp_rank: {routed_dp_rank}")
+        return routed_dp_rank
+
+    def generate(
+        self,
+        # The input prompt. It can be a single prompt or a batch of prompts.
+        prompt: Optional[Union[List[str], str]] = None,
+        sampling_params: Optional[Union[List[Dict], Dict]] = None,
+        # The token ids for text; one can either specify text or input_ids.
+        input_ids: Optional[Union[List[List[int]], List[int]]] = None,
+        # The image input. It can be an image instance, file name, URL, or base64 encoded string.
+        # Can be formatted as:
+        # - Single image for a single request
+        # - List of images (one per request in a batch)
+        # - List of lists of images (multiple images per request)
+        # - List of preprocessed outputs from a Huggingface processor, each as a dict containing `format`: 'processor_output' and other data
+        # - List of precomputed image embeddings, each as a dict containing field `format`: 'precomputed_embedding' and `feature`: the precomputed embedding
+        # See also python/sglang/srt/utils.py:load_image for more details.
+        image_data: Optional[MultimodalDataInputFormat] = None,
+        audio_data: Optional[MultimodalDataInputFormat] = None,
+        video_data: Optional[MultimodalDataInputFormat] = None,
+        return_logprob: Optional[Union[List[bool], bool]] = False,
+        logprob_start_len: Optional[Union[List[int], int]] = None,
+        top_logprobs_num: Optional[Union[List[int], int]] = None,
+        token_ids_logprob: Optional[Union[List[List[int]], List[int]]] = None,
+        lora_path: Optional[List[Optional[str]]] = None,
+        custom_logit_processor: Optional[Union[List[str], str]] = None,
+        return_hidden_states: bool = False,
+        return_routed_experts: bool = False,
+        stream: bool = False,
+        bootstrap_host: Optional[Union[List[str], str]] = None,
+        bootstrap_port: Optional[Union[List[int], int]] = None,
+        bootstrap_room: Optional[Union[List[int], int]] = None,
+        routed_dp_rank: Optional[int] = None,
+        disagg_prefill_dp_rank: Optional[int] = None,
+        # Deprecated: use routed_dp_rank instead
+        data_parallel_rank: Optional[int] = None,
+        external_trace_header: Optional[Dict] = None,
+        rid: Optional[Union[List[str], str]] = None,
+        session_params: Optional[Dict] = None,
+        priority: Optional[int] = None,
+    ) -> Union[Dict, Iterator[Dict]]:
+        """
+        The arguments of this function is the same as `sglang/srt/managers/io_struct.py::GenerateReqInput`.
+        Please refer to `GenerateReqInput` for the documentation.
+        """
+        routed_dp_rank = self._resolve_routed_dp_rank(
+            routed_dp_rank, data_parallel_rank
+        )
+
+        obj = GenerateReqInput(
+            text=prompt,
+            input_ids=input_ids,
+            sampling_params=sampling_params,
+            image_data=image_data,
+            audio_data=audio_data,
+            video_data=video_data,
+            return_logprob=return_logprob,
+            logprob_start_len=logprob_start_len,
+            top_logprobs_num=top_logprobs_num,
+            token_ids_logprob=token_ids_logprob,
+            lora_path=lora_path,
+            custom_logit_processor=custom_logit_processor,
+            return_hidden_states=return_hidden_states,
+            return_routed_experts=return_routed_experts,
+            stream=stream,
+            bootstrap_host=bootstrap_host,
+            bootstrap_port=bootstrap_port,
+            bootstrap_room=bootstrap_room,
+            routed_dp_rank=routed_dp_rank,
+            disagg_prefill_dp_rank=disagg_prefill_dp_rank,
+            external_trace_header=external_trace_header,
+            rid=rid,
+            session_params=session_params,
+            priority=priority,
+        )
+        generator = self.tokenizer_manager.generate_request(obj, None)
+
+        if stream:
+
+            def generator_wrapper():
+                while True:
+                    try:
+                        chunk = self.loop.run_until_complete(generator.__anext__())
+                        yield chunk
+                    except StopAsyncIteration:
+                        break
+
+            return generator_wrapper()
+        else:
+            ret = self.loop.run_until_complete(generator.__anext__())
+            return ret
+
+    async def async_generate(
+        self,
+        # The input prompt. It can be a single prompt or a batch of prompts.
+        prompt: Optional[Union[List[str], str]] = None,
+        sampling_params: Optional[Union[List[Dict], Dict]] = None,
+        # The token ids for text; one can either specify text or input_ids.
+        input_ids: Optional[Union[List[List[int]], List[int]]] = None,
+        # The image input. It can be an image instance, file name, URL, or base64 encoded string.
+        # Can be formatted as:
+        # - Single image for a single request
+        # - List of images (one per request in a batch)
+        # - List of lists of images (multiple images per request)
+        # - List of preprocessed outputs from a Huggingface processor, each as a dict containing `format`: 'processor_output' and other data
+        # - List of precomputed image embeddings, each as a dict containing field `format`: 'precomputed_embedding' and `feature`: the precomputed embedding
+        # See also python/sglang/srt/utils.py:load_image for more details.
+        image_data: Optional[MultimodalDataInputFormat] = None,
+        audio_data: Optional[MultimodalDataInputFormat] = None,
+        video_data: Optional[MultimodalDataInputFormat] = None,
+        return_logprob: Optional[Union[List[bool], bool]] = False,
+        logprob_start_len: Optional[Union[List[int], int]] = None,
+        top_logprobs_num: Optional[Union[List[int], int]] = None,
+        token_ids_logprob: Optional[Union[List[List[int]], List[int]]] = None,
+        lora_path: Optional[List[Optional[str]]] = None,
+        custom_logit_processor: Optional[Union[List[str], str]] = None,
+        return_hidden_states: bool = False,
+        return_routed_experts: bool = False,
+        stream: bool = False,
+        bootstrap_host: Optional[Union[List[str], str]] = None,
+        bootstrap_port: Optional[Union[List[int], int]] = None,
+        bootstrap_room: Optional[Union[List[int], int]] = None,
+        routed_dp_rank: Optional[int] = None,
+        disagg_prefill_dp_rank: Optional[int] = None,
+        # Deprecated: use routed_dp_rank instead
+        data_parallel_rank: Optional[int] = None,
+        external_trace_header: Optional[Dict] = None,
+        rid: Optional[Union[List[str], str]] = None,
+        session_params: Optional[Dict] = None,
+        priority: Optional[int] = None,
+    ) -> Union[Dict, AsyncIterator[Dict]]:
+        """
+        The arguments of this function is the same as `sglang/srt/managers/io_struct.py::GenerateReqInput`.
+        Please refer to `GenerateReqInput` for the documentation.
+        """
+        routed_dp_rank = self._resolve_routed_dp_rank(
+            routed_dp_rank, data_parallel_rank
+        )
+
+        obj = GenerateReqInput(
+            text=prompt,
+            input_ids=input_ids,
+            sampling_params=sampling_params,
+            image_data=image_data,
+            audio_data=audio_data,
+            video_data=video_data,
+            return_logprob=return_logprob,
+            logprob_start_len=logprob_start_len,
+            top_logprobs_num=top_logprobs_num,
+            token_ids_logprob=token_ids_logprob,
+            lora_path=lora_path,
+            return_hidden_states=return_hidden_states,
+            return_routed_experts=return_routed_experts,
+            stream=stream,
+            custom_logit_processor=custom_logit_processor,
+            bootstrap_host=bootstrap_host,
+            bootstrap_port=bootstrap_port,
+            bootstrap_room=bootstrap_room,
+            routed_dp_rank=routed_dp_rank,
+            disagg_prefill_dp_rank=disagg_prefill_dp_rank,
+            external_trace_header=external_trace_header,
+            rid=rid,
+            session_params=session_params,
+            priority=priority,
+        )
+        generator = self.tokenizer_manager.generate_request(obj, None)
+
+        if stream is True:
+            return generator
+        else:
+            return await generator.__anext__()
+
+    def encode(
+        self,
+        prompt: Union[str, List[str], List[Dict], List[List[Dict]]],
+        image_data: Optional[MultimodalDataInputFormat] = None,
+        audio_data: Optional[MultimodalDataInputFormat] = None,
+        video_data: Optional[MultimodalDataInputFormat] = None,
+        dimensions: Optional[int] = None,
+        lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None,
+        external_trace_header: Optional[Dict] = None,
+        rid: Optional[Union[List[str], str]] = None,
+    ) -> Dict:
+        """
+        The arguments of this function is the same as `sglang/srt/managers/io_struct.py::EmbeddingReqInput`.
+        Please refer to `EmbeddingReqInput` for the documentation.
+        """
+        obj = EmbeddingReqInput(
+            text=prompt,
+            image_data=image_data,
+            audio_data=audio_data,
+            video_data=video_data,
+            dimensions=dimensions,
+            lora_path=lora_path,
+            external_trace_header=external_trace_header,
+            rid=rid,
+        )
+        generator = self.tokenizer_manager.generate_request(obj, None)
+        ret = self.loop.run_until_complete(generator.__anext__())
+        return ret
+
+    async def async_encode(
+        self,
+        prompt: Union[str, List[str], List[Dict], List[List[Dict]]],
+        image_data: Optional[MultimodalDataInputFormat] = None,
+        audio_data: Optional[MultimodalDataInputFormat] = None,
+        video_data: Optional[MultimodalDataInputFormat] = None,
+        dimensions: Optional[int] = None,
+        lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None,
+        external_trace_header: Optional[Dict] = None,
+        rid: Optional[Union[List[str], str]] = None,
+    ) -> Dict:
+        """
+        Asynchronous version of encode method.
+
+        The arguments of this function is the same as `sglang/srt/managers/io_struct.py::EmbeddingReqInput`.
+        Please refer to `EmbeddingReqInput` for the documentation.
+        """
+        obj = EmbeddingReqInput(
+            text=prompt,
+            image_data=image_data,
+            audio_data=audio_data,
+            video_data=video_data,
+            dimensions=dimensions,
+            lora_path=lora_path,
+            external_trace_header=external_trace_header,
+            rid=rid,
+        )
+        generator = self.tokenizer_manager.generate_request(obj, None)
+        return await generator.__anext__()
+
+    def rerank(
+        self,
+        prompt: Union[List[List[str]]],
+    ) -> Dict:
+        """
+        The arguments of this function is the same as `sglang/srt/managers/io_struct.py::EmbeddingReqInput`.
+        Please refer to `EmbeddingReqInput` for the documentation.
+        """
+        obj = EmbeddingReqInput(text=prompt, is_cross_encoder_request=True)
+        generator = self.tokenizer_manager.generate_request(obj, None)
+        ret = self.loop.run_until_complete(generator.__anext__())
+        return ret
+
+    @classmethod
+    def _launch_scheduler_processes(
+        cls,
+        server_args: ServerArgs,
+        port_args: PortArgs,
+        run_scheduler_process_func: Callable,
+    ) -> SchedulerInitResult:
+        """Launch scheduler processes using multiprocessing.
+        Override in subclasses for different backends (e.g. Ray).
+        """
+        scheduler_procs = []
+
+        if server_args.dp_size == 1:
+            # Launch tensor parallel scheduler processes
+            memory_saver_adapter = TorchMemorySaverAdapter.create(
+                enable=server_args.enable_memory_saver
+            )
+            scheduler_pipe_readers = []
+
+            pp_rank_range, tp_rank_range, pp_size_per_node, tp_size_per_node = (
+                _calculate_rank_ranges(
+                    server_args.nnodes,
+                    server_args.pp_size,
+                    server_args.tp_size,
+                    server_args.node_rank,
+                )
+            )
+
+            for pp_rank in pp_rank_range:
+                for tp_rank in tp_rank_range:
+                    reader, writer = mp.Pipe(duplex=False)
+                    gpu_id = (
+                        server_args.base_gpu_id
+                        + ((pp_rank % pp_size_per_node) * tp_size_per_node)
+                        + (tp_rank % tp_size_per_node) * server_args.gpu_id_step
+                    )
+                    attn_cp_rank, moe_dp_rank, moe_ep_rank = _compute_parallelism_ranks(
+                        server_args, tp_rank
+                    )
+
+                    with maybe_reindex_device_id(gpu_id) as gpu_id:
+                        proc = mp.Process(
+                            target=run_scheduler_process_func,
+                            args=(
+                                server_args,
+                                port_args,
+                                gpu_id,
+                                tp_rank,
+                                attn_cp_rank,
+                                moe_dp_rank,
+                                moe_ep_rank,
+                                pp_rank,
+                                None,
+                                writer,
+                            ),
+                        )
+                        with memory_saver_adapter.configure_subprocess(), numa_utils.configure_subprocess(
+                            server_args, gpu_id
+                        ):
+                            proc.start()
+
+                    scheduler_procs.append(proc)
+                    scheduler_pipe_readers.append(reader)
+        else:
+            # Launch the data parallel controller
+            reader, writer = mp.Pipe(duplex=False)
+            scheduler_pipe_readers = [reader]
+            proc = mp.Process(
+                target=run_data_parallel_controller_process,
+                kwargs=dict(
+                    server_args=server_args,
+                    port_args=port_args,
+                    pipe_writer=writer,
+                    run_scheduler_process_func=run_scheduler_process_func,
+                ),
+            )
+            proc.start()
+            scheduler_procs.append(proc)
+
+        scheduler_infos = []
+
+        def wait_for_ready():
+            infos = _wait_for_scheduler_ready(scheduler_pipe_readers, scheduler_procs)
+            scheduler_infos.extend(infos)
+
+        def wait_for_completion():
+            for proc in scheduler_procs:
+                proc.join()
+                logger.error(
+                    f"Scheduler or DataParallelController {proc.pid} "
+                    f"terminated with {proc.exitcode}"
+                )
+
+        return SchedulerInitResult(
+            scheduler_infos=scheduler_infos,
+            wait_for_ready=wait_for_ready,
+            wait_for_completion=wait_for_completion,
+        )
+
+    @classmethod
+    def _launch_subprocesses(
+        cls,
+        server_args: ServerArgs,
+        init_tokenizer_manager_func: Callable,
+        run_scheduler_process_func: Callable,
+        run_detokenizer_process_func: Callable,
+        port_args: Optional[PortArgs] = None,
+    ) -> Tuple[TokenizerManager, TemplateManager, PortArgs, SchedulerInitResult]:
+        """Launch the TokenizerManager in the main process, the Scheduler in a subprocess, and the DetokenizerManager in another subprocess.
+
+        Returns:
+            Tuple of (tokenizer_manager, template_manager, port_args, scheduler_init_result).
+        """
+        # Configure global environment
+        configure_logger(server_args)
+        _set_envs_and_config(server_args)
+        server_args.check_server_args()
+
+        # Allocate ports for inter-process communications
+        if port_args is None:
+            port_args = PortArgs.init_new(server_args)
+        logger.info(f"{server_args=}")
+
+        # Launch scheduler processes
+        scheduler_init_result = cls._launch_scheduler_processes(
+            server_args, port_args, run_scheduler_process_func
+        )
+
+        if (
+            server_args.enable_elastic_expert_backup
+            and server_args.elastic_ep_backend is not None
+        ):
+            run_expert_backup_manager(server_args, port_args)
+
+        if server_args.node_rank >= 1:
+            # In multi-node cases, non-zero rank nodes do not need to run tokenizer or detokenizer,
+            # so they can just wait here.
+            scheduler_init_result.wait_for_ready()
+
+            if os.getenv("SGLANG_BLOCK_NONZERO_RANK_CHILDREN") == "0":
+                # When using `Engine` as a Python API, we don't want to block here.
+                return (
+                    None,
+                    None,
+                    port_args,
+                    scheduler_init_result,
+                )
+
+            launch_dummy_health_check_server(
+                server_args.host, server_args.port, server_args.enable_metrics
+            )
+
+            scheduler_init_result.wait_for_completion()
+            return (
+                None,
+                None,
+                port_args,
+                scheduler_init_result,
+            )
+
+        # Launch detokenizer process
+        detoken_proc = mp.Process(
+            target=run_detokenizer_process_func,
+            args=(
+                server_args,
+                port_args,
+            ),
+        )
+        detoken_proc.start()
+
+        # Init tokenizer manager first, as the bootstrap server is initialized here
+        if server_args.tokenizer_worker_num == 1:
+            tokenizer_manager, template_manager = init_tokenizer_manager_func(
+                server_args, port_args
+            )
+        else:
+            # Launch multi-tokenizer router
+            tokenizer_manager = MultiTokenizerRouter(server_args, port_args)
+            template_manager = None
+
+        # Wait for the model to finish loading
+        scheduler_init_result.wait_for_ready()
+
+        # Get back some info from scheduler to tokenizer_manager
+        tokenizer_manager.max_req_input_len = scheduler_init_result.scheduler_infos[0][
+            "max_req_input_len"
+        ]
+
+        return (
+            tokenizer_manager,
+            template_manager,
+            port_args,
+            scheduler_init_result,
+        )
+
+    def shutdown(self):
+        """Shutdown the engine"""
+        kill_process_tree(os.getpid(), include_parent=False)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.shutdown()
+        return False
+
+    def flush_cache(self):
+        return self.loop.run_until_complete(self.tokenizer_manager.flush_cache())
+
+    def open_session(
+        self,
+        capacity_of_str_len: int,
+        session_id: Optional[str] = None,
+        streaming: bool = False,
+        timeout: Optional[float] = None,
+    ) -> str:
+        """Open a session for multi-turn conversation with shared context.
+
+        Args:
+            capacity_of_str_len: Maximum string length capacity for the session.
+            session_id: Optional session ID. If not provided, a UUID will be generated.
+            streaming: Use low-overhead path for realtime streaming (append-only mode).
+            timeout: If set, the session is automatically closed after being inactive
+                for this many seconds. Inactivity is measured from session open or the
+                most recent request submission.
+
+        Returns:
+            The session ID (either the provided one or a newly generated UUID).
+        """
+        obj = OpenSessionReqInput(
+            capacity_of_str_len=capacity_of_str_len,
+            session_id=session_id,
+            streaming=streaming,
+            timeout=timeout,
+        )
+        return self.loop.run_until_complete(
+            self.tokenizer_manager.open_session(obj, None)
+        )
+
+    def close_session(self, session_id: str) -> None:
+        """Close a session and release its resources.
+
+        Args:
+            session_id: The session ID to close.
+        """
+        obj = CloseSessionReqInput(session_id=session_id)
+        self.loop.run_until_complete(self.tokenizer_manager.close_session(obj, None))
+
+    def start_profile(self, **kwargs):
+        self.loop.run_until_complete(self.tokenizer_manager.start_profile(**kwargs))
+
+    def stop_profile(self):
+        self.loop.run_until_complete(self.tokenizer_manager.stop_profile())
+
+    def start_expert_distribution_record(self):
+        self.loop.run_until_complete(
+            self.tokenizer_manager.start_expert_distribution_record()
+        )
+
+    def stop_expert_distribution_record(self):
+        self.loop.run_until_complete(
+            self.tokenizer_manager.stop_expert_distribution_record()
+        )
+
+    def dump_expert_distribution_record(self):
+        self.loop.run_until_complete(
+            self.tokenizer_manager.dump_expert_distribution_record()
+        )
+
+    def get_server_info(self):
+        internal_states = self.loop.run_until_complete(
+            self.tokenizer_manager.get_internal_state()
+        )
+        return {
+            **dataclasses.asdict(self.tokenizer_manager.server_args),
+            **self._scheduler_init_result.scheduler_infos[0],
+            "internal_states": internal_states,
+            "version": __version__,
+        }
+
+    def init_weights_update_group(
+        self,
+        master_address: str,
+        master_port: int,
+        rank_offset: int,
+        world_size: int,
+        group_name: str,
+        backend: str = "nccl",
+    ):
+        """Initialize parameter update group."""
+        obj = InitWeightsUpdateGroupReqInput(
+            master_address=master_address,
+            master_port=master_port,
+            rank_offset=rank_offset,
+            world_size=world_size,
+            group_name=group_name,
+            backend=backend,
+        )
+        return self.loop.run_until_complete(
+            self.tokenizer_manager.init_weights_update_group(obj, None)
+        )
+
+    def destroy_weights_update_group(
+        self,
+        group_name: str,
+    ):
+        """Destroy parameter update group."""
+        obj = DestroyWeightsUpdateGroupReqInput(
+            group_name=group_name,
+        )
+        return self.loop.run_until_complete(
+            self.tokenizer_manager.destroy_weights_update_group(obj, None)
+        )
+
+    def update_weights_from_distributed(
+        self,
+        names: list[str],
+        dtypes: list[str],
+        shapes: list[list[int]],
+        group_name: str = "weight_update_group",
+        flush_cache: bool = True,
+        load_format: Optional[str] = None,
+    ):
+        """Update weights from distributed source."""
+        obj = UpdateWeightsFromDistributedReqInput(
+            names=names,
+            dtypes=dtypes,
+            shapes=shapes,
+            group_name=group_name,
+            flush_cache=flush_cache,
+            load_format=load_format,
+        )
+        return self.loop.run_until_complete(
+            self.tokenizer_manager.update_weights_from_distributed(obj, None)
+        )
+
+    def update_weights_from_tensor(
+        self,
+        named_tensors: List[Tuple[str, torch.Tensor]],
+        load_format: Optional[str] = None,
+        flush_cache: bool = True,
+    ):
+        """Update weights from distributed source. If there are going to be more updates, set `flush_cache` to be false
+        to avoid duplicated cache cleaning operation."""
+        if load_format == "flattened_bucket":
+            serialized_named_tensors = named_tensors
+        else:
+            serialized_named_tensors = [
+                MultiprocessingSerializer.serialize(named_tensors)
+                for _ in range(self.server_args.tp_size)
+            ]
+        obj = UpdateWeightsFromTensorReqInput(
+            serialized_named_tensors=serialized_named_tensors,
+            load_format=load_format,
+            flush_cache=flush_cache,
+        )
+        return self.loop.run_until_complete(
+            self.tokenizer_manager.update_weights_from_tensor(obj, None)
+        )
+
+    def update_weights_from_disk(
+        self,
+        model_path: str,
+        load_format: Optional[str] = None,
+    ):
+        """Update the weights from disk inplace without re-launching the engine.
+
+        This method allows updating the model weights from disk without restarting
+        the engine. It can be used to load a different model or update weights with
+        new training.
+        """
+        obj = UpdateWeightFromDiskReqInput(
+            model_path=model_path,
+            load_format=load_format,
+        )
+
+        return self.loop.run_until_complete(
+            self.tokenizer_manager.update_weights_from_disk(obj, None)
+        )
+
+    def update_weights_from_ipc(
+        self,
+        zmq_handles: Dict[str, str],
+        flush_cache: bool = True,
+    ):
+        """Update weights from IPC for checkpoint-engine integration."""
+        obj = UpdateWeightsFromIPCReqInput(
+            zmq_handles=zmq_handles,
+            flush_cache=flush_cache,
+        )
+        return self.loop.run_until_complete(
+            self.tokenizer_manager.update_weights_from_ipc(obj, None)
+        )
+
+    def get_weights_by_name(self, name: str, truncate_size: int = 100):
+        """Get weights by parameter name."""
+        obj = GetWeightsByNameReqInput(name=name, truncate_size=truncate_size)
+        return self.loop.run_until_complete(
+            self.tokenizer_manager.get_weights_by_name(obj, None)
+        )
+
+    def load_lora_adapter_from_tensors(
+        self,
+        lora_name: str,
+        tensors,
+        config_dict: Dict,
+        load_format: Optional[str] = None,
+    ):
+        if load_format == "flattened_bucket":
+            serialized_tensors = tensors
+        else:
+            serialized_tensors = MultiprocessingSerializer.serialize(
+                tensors, output_str=True
+            )
+        lora_req = LoadLoRAAdapterFromTensorsReqInput(
+            lora_name=lora_name,
+            config_dict=config_dict,
+            serialized_tensors=serialized_tensors,
+            load_format=load_format,
+        )
+        return self.loop.run_until_complete(
+            self.tokenizer_manager.load_lora_adapter_from_tensors(lora_req, None)
+        )
+
+    def load_lora_adapter(self, lora_name: str, lora_path: str, pinned: bool = False):
+        """Load a new LoRA adapter without re-launching the engine."""
+
+        obj = LoadLoRAAdapterReqInput(
+            lora_name=lora_name,
+            lora_path=lora_path,
+            pinned=pinned,
+        )
+
+        return self.loop.run_until_complete(
+            self.tokenizer_manager.load_lora_adapter(obj, None)
+        )
+
+    def unload_lora_adapter(self, lora_name: str):
+        """Unload a LoRA adapter without re-launching the engine."""
+
+        obj = UnloadLoRAAdapterReqInput(lora_name=lora_name)
+
+        return self.loop.run_until_complete(
+            self.tokenizer_manager.unload_lora_adapter(obj, None)
+        )
+
+    def release_memory_occupation(self, tags: Optional[List[str]] = None):
+        obj = ReleaseMemoryOccupationReqInput(tags=tags)
+        return self.loop.run_until_complete(
+            self.tokenizer_manager.release_memory_occupation(obj, None)
+        )
+
+    def resume_memory_occupation(self, tags: Optional[List[str]] = None):
+        obj = ResumeMemoryOccupationReqInput(tags=tags)
+        return self.loop.run_until_complete(
+            self.tokenizer_manager.resume_memory_occupation(obj, None)
+        )
+
+    def freeze_gc(self):
+        """
+        To maintain a high performance server with low latency, we want to reduce the
+        stalls caused by the garbage collector scanning through a large number of objects.
+
+        It is usually helpful to start the server and warm it up with real requests to
+        initialize many of the long-lived objects that do not need to be garbage collected.
+
+        After sufficient warmup, we can call this function to freeze the garbage collector
+        so that all objects created before this point are considered out of scope for garbage
+        collection.
+        """
+
+        self.loop.run_until_complete(self.tokenizer_manager.freeze_gc())
+
+    """
+    Execute an RPC call on all scheduler processes.
+    """
+
+    def collective_rpc(self, method: str, **kwargs):
+        obj = RpcReqInput(method=method, parameters=kwargs)
+        self.send_to_rpc.send_pyobj(obj)
+        recv_req = self.send_to_rpc.recv_pyobj(zmq.BLOCKY)
+        assert isinstance(recv_req, RpcReqOutput)
+        assert recv_req.success, recv_req.message
+
+    def save_remote_model(self, **kwargs):
+        self.collective_rpc("save_remote_model", **kwargs)
+
+    def save_sharded_model(self, **kwargs):
+        self.collective_rpc("save_sharded_model", **kwargs)
+
+    def score(
+        self,
+        query: Optional[Union[str, List[int]]] = None,
+        items: Optional[Union[str, List[str], List[List[int]]]] = None,
+        label_token_ids: Optional[List[int]] = None,
+        apply_softmax: bool = False,
+        item_first: bool = False,
+    ) -> ScoreResult:
+        """
+        Score the probability of specified token IDs appearing after the given (query + item) pair. For example:
+        query = "<|user|>Is the following city the capital of France? "
+        items = ["Paris <|assistant|>", "London <|assistant|>", "Berlin <|assistant|>"]
+        label_token_ids = [2332, 1223] # Token IDs for "Yes" and "No"
+        item_first = False
+
+        This would pass the following prompts to the model:
+        "<|user|>Is the following city the capital of France? Paris <|assistant|>"
+        "<|user|>Is the following city the capital of France? London <|assistant|>"
+        "<|user|>Is the following city the capital of France? Berlin <|assistant|>"
+        The api would then return the probabilities of the model producing "Yes" and "No" as the next token.
+        The output would look like:
+        [[0.9, 0.1], [0.2, 0.8], [0.1, 0.9]]
+
+
+        Args:
+            query: The query text or pre-tokenized query token IDs. Must be provided.
+            items: The item text(s) or pre-tokenized item token IDs. Must be provided.
+            label_token_ids: List of token IDs to compute probabilities for. If None, no token probabilities will be computed.
+            apply_softmax: Whether to normalize probabilities using softmax.
+            item_first: If True, prepend items to query. Otherwise append items to query.
+
+        Returns:
+            ScoreResult with:
+                scores: List of lists containing probabilities for each item and each label token
+                prompt_tokens: The number of prompt tokens processed.
+
+        Raises:
+            ValueError: If query is not provided, or if items is not provided,
+                      or if token IDs are out of vocabulary, or if logprobs are not available for the specified tokens.
+        """
+        return self.loop.run_until_complete(
+            self.tokenizer_manager.score_request(
+                query=query,
+                items=items,
+                label_token_ids=label_token_ids,
+                apply_softmax=apply_softmax,
+                item_first=item_first,
+                request=None,
+            )
+        )
+
+    async def async_score(
+        self,
+        query: Optional[Union[str, List[int]]] = None,
+        items: Optional[Union[str, List[str], List[List[int]]]] = None,
+        label_token_ids: Optional[List[int]] = None,
+        apply_softmax: bool = False,
+        item_first: bool = False,
+    ) -> ScoreResult:
+        """
+        Asynchronous version of score method.
+
+        See score() for detailed documentation.
+        """
+        return await self.tokenizer_manager.score_request(
+            query=query,
+            items=items,
+            label_token_ids=label_token_ids,
+            apply_softmax=apply_softmax,
+            item_first=item_first,
+            request=None,
+        )
+
+
+def _set_envs_and_config(server_args: ServerArgs):
+    # Set global environments
+    if "NCCL_CUMEM_ENABLE" not in os.environ or server_args.enable_symm_mem:
+        os.environ["NCCL_CUMEM_ENABLE"] = str(int(server_args.enable_symm_mem))
+    if (
+        "NCCL_NVLS_ENABLE" not in os.environ
+        or server_args.enable_nccl_nvls
+        or server_args.enable_symm_mem
+    ):
+        os.environ["NCCL_NVLS_ENABLE"] = str(
+            int(server_args.enable_nccl_nvls or server_args.enable_symm_mem)
+        )
+    os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "8"
+    os.environ["CUDA_MODULE_LOADING"] = "AUTO"
+
+    if os.environ.get("TRTLLM_ENABLE_PDL", "1") != "0":
+        # flashinfer uses this environment variable for various kernels from MoE to quant kernels
+        os.environ["TRTLLM_ENABLE_PDL"] = "1"
+
+    if os.environ.get("CUTE_DSL_LOG_LEVEL") is None:
+        # Default to warning level, to avoid too many logs
+        os.environ["CUTE_DSL_LOG_LEVEL"] = "30"
+
+    if os.environ.get("CUTE_DSL_LOG_TO_CONSOLE") is None:
+        # Need to set log to console, otherwise the log level won't take effect
+        os.environ["CUTE_DSL_LOG_TO_CONSOLE"] = "1"
+
+    # Can also be passed as argument
+    os.environ["SGLANG_RUN_ID"] = (
+        f"sglang-run-{time.time()}-{random.randint(0, 100000000)}"
+    )
+
+    # Set prometheus env vars
+    if server_args.enable_metrics:
+        set_prometheus_multiproc_dir()
+
+    # Set ulimit
+    set_ulimit()
+
+    # Check flashinfer version
+    if not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"):
+        if server_args.attention_backend == "flashinfer":
+            assert_pkg_version(
+                "flashinfer_python",
+                "0.6.4",
+                "Please uninstall the old version and "
+                "reinstall the latest version by following the instructions "
+                "at https://docs.flashinfer.ai/installation.html.",
+            )
+        if _is_cuda:
+            assert_pkg_version(
+                "sgl-kernel",
+                "0.3.21",
+                "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
+            )
+
+    # Signal handlers can only be registered from the main thread.
+    if threading.current_thread() is threading.main_thread():
+        if server_args.custom_sigquit_handler is None:
+            # Register the signal handler.
+            # The child processes will send SIGQUIT to this process when any error happens
+            # This process then clean up the whole process tree
+            # Note: This sigquit handler is used in the launch phase, and may be replaced by
+            # the running_phase_sigquit_handler in the tokenizer manager after the grpc server is launched.
+            def launch_phase_sigquit_handler(signum, frame):
+                logger.error(
+                    "Received sigquit from a child process. It usually means the child failed."
+                )
+                kill_process_tree(os.getpid())
+
+            signal.signal(signal.SIGQUIT, launch_phase_sigquit_handler)
+        else:
+            # Allow users to register a custom SIGQUIT handler for things like crash dump
+            logger.error(
+                f"Using custom SIGQUIT handler: {server_args.custom_sigquit_handler}"
+            )
+            signal.signal(signal.SIGQUIT, server_args.custom_sigquit_handler)
+    else:
+        logger.warning(
+            "Signal handler is not added because the engine is not in the "
+            "main thread. This disables the SIGQUIT handler for cleaning up "
+            "the process tree when a child process fails."
+        )
+
+    # Set mp start method
+    mp.set_start_method("spawn", force=True)
+
+
+def _wait_for_scheduler_ready(
+    scheduler_pipe_readers: List,
+    scheduler_procs: List,
+) -> List[Dict]:
+    """Wait for the model to finish loading and return scheduler infos."""
+    scheduler_infos = []
+    for i in range(len(scheduler_pipe_readers)):
+        try:
+            data = scheduler_pipe_readers[i].recv()
+        except EOFError:
+            logger.error(
+                f"Rank {i} scheduler is dead. Please check if there are relevant logs."
+            )
+            scheduler_procs[i].join()
+            logger.error(f"Exit code: {scheduler_procs[i].exitcode}")
+            raise
+
+        if data["status"] != "ready":
+            raise RuntimeError(
+                "Initialization failed. Please see the error messages above."
+            )
+        scheduler_infos.append(data)
+    return scheduler_infos
+
+
+def _calculate_rank_ranges(
+    nnodes: int, pp_size: int, tp_size: int, node_rank: int
+) -> Tuple[range, range, int, int]:
+    """Calculate pp_rank_range and tp_rank_range for a given node.
+
+    Args:
+        nnodes: Total number of nodes.
+        pp_size: Pipeline parallel size.
+        tp_size: Tensor parallel size.
+        node_rank: The rank of the node to compute ranges for.
+
+    Returns:
+        A tuple of (pp_rank_range, tp_rank_range, pp_size_per_node, tp_size_per_node):
+        - pp_rank_range: range of pipeline-parallel ranks assigned to this node.
+        - tp_rank_range: range of tensor-parallel ranks assigned to this node.
+        - pp_size_per_node: number of PP ranks per node.
+        - tp_size_per_node: number of TP ranks per node.
+    """
+    pp_size_per_node = max(pp_size // nnodes, 1)
+    nnodes_per_pp_rank = max(nnodes // pp_size, 1)
+    pp_rank_range = range(
+        pp_size_per_node * (node_rank // nnodes_per_pp_rank),
+        pp_size_per_node * (node_rank // nnodes_per_pp_rank + 1),
+    )
+
+    nnodes_per_tp_group = nnodes_per_pp_rank
+    tp_size_per_node = tp_size // nnodes_per_tp_group
+    tp_rank_range = range(
+        tp_size_per_node * (node_rank % nnodes_per_tp_group),
+        tp_size_per_node * (node_rank % nnodes_per_tp_group + 1),
+    )
+
+    return pp_rank_range, tp_rank_range, pp_size_per_node, tp_size_per_node
+
+
+def _compute_parallelism_ranks(
+    server_args: ServerArgs, tp_rank: int
+) -> Tuple[int, int, int]:
+    """Compute attention-CP, MoE-DP, and MoE-EP ranks for a TP rank."""
+    attn_dp_size = server_args.dp_size if server_args.enable_dp_attention else 1
+
+    # Parallelism hierarchy (outermost to innermost):
+    # - Attention: Global(TP) -> DP -> ATTN_CP -> ATTN_TP (innermost)
+    # - MoE: Global(TP) -> MOE_DP -> EP -> MOE_TP (innermost)
+    attn_tp_size = server_args.tp_size // attn_dp_size // server_args.attn_cp_size
+    attn_cp_rank = (tp_rank // attn_tp_size) % server_args.attn_cp_size
+    moe_dp_rank = tp_rank // (server_args.tp_size // server_args.moe_dp_size)
+    moe_ep_rank = (
+        tp_rank
+        % (server_args.tp_size // server_args.moe_dp_size)
+        // (server_args.tp_size // server_args.moe_dp_size // server_args.ep_size)
+    )
+    return attn_cp_rank, moe_dp_rank, moe_ep_rank
diff --git a/sglang/python/sglang/srt/entrypoints/grpc_server.py b/sglang/python/sglang/srt/entrypoints/grpc_server.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8ff92a83f85eb2b12b713e9c0ca9f1deb68d19b
--- /dev/null
+++ b/sglang/python/sglang/srt/entrypoints/grpc_server.py
@@ -0,0 +1,1378 @@
+"""
+Standalone gRPC Server for SGLang - Fully separated from HTTP server.
+Uses GrpcRequestManager for orchestration without tokenization.
+"""
+
+import asyncio
+import dataclasses
+import json
+import logging
+import os
+import signal
+import threading
+import time
+from concurrent import futures
+from datetime import datetime, timezone
+from typing import AsyncIterator, Dict, Optional
+
+import grpc
+import numpy as np
+import torch
+from google.protobuf.json_format import MessageToDict
+from google.protobuf.struct_pb2 import Struct
+from google.protobuf.timestamp_pb2 import Timestamp
+from grpc_health.v1 import health_pb2_grpc
+from grpc_reflection.v1alpha import reflection
+from smg_grpc_proto import sglang_scheduler_pb2, sglang_scheduler_pb2_grpc
+
+import sglang
+from sglang.srt.configs.model_config import ModelConfig
+from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST, DisaggregationMode
+from sglang.srt.grpc.grpc_request_manager import GrpcRequestManager
+from sglang.srt.grpc.health_servicer import SGLangHealthServicer
+from sglang.srt.grpc.scheduler_launcher import launch_scheduler_process_only
+from sglang.srt.grpc.utils import abort_code_from_output
+from sglang.srt.managers.disagg_service import start_disagg_service
+from sglang.srt.managers.io_struct import (
+    GetLoadsReqOutput,
+    TokenizedEmbeddingReqInput,
+    TokenizedGenerateReqInput,
+)
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.sampling.sampling_params import SamplingParams as SGLSamplingParams
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import kill_process_tree
+from sglang.utils import get_exception_traceback
+
+logger = logging.getLogger(__name__)
+HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))
+
+
+def _convert_loads_to_protobuf(
+    result: GetLoadsReqOutput,
+) -> sglang_scheduler_pb2.SchedulerLoad:
+    """Convert GetLoadsReqOutput dataclass to protobuf SchedulerLoad message."""
+    scheduler_load = sglang_scheduler_pb2.SchedulerLoad(
+        dp_rank=result.dp_rank,
+        num_running_reqs=result.num_running_reqs,
+        num_waiting_reqs=result.num_waiting_reqs,
+        num_total_reqs=result.num_running_reqs + result.num_waiting_reqs,
+        num_used_tokens=result.num_used_tokens,
+        max_total_num_tokens=result.max_total_num_tokens,
+        token_usage=result.token_usage,
+        gen_throughput=result.gen_throughput,
+        cache_hit_rate=result.cache_hit_rate,
+        utilization=result.utilization,
+        max_running_requests=result.max_running_requests,
+    )
+
+    # Add optional sections using CopyFrom for proper protobuf assignment
+    if result.memory:
+        scheduler_load.memory.CopyFrom(
+            sglang_scheduler_pb2.MemoryMetrics(
+                weight_gb=result.memory.weight_gb,
+                kv_cache_gb=result.memory.kv_cache_gb,
+                graph_gb=result.memory.graph_gb,
+                token_capacity=result.memory.token_capacity,
+            )
+        )
+
+    if result.speculative:
+        scheduler_load.speculative.CopyFrom(
+            sglang_scheduler_pb2.SpeculativeMetrics(
+                accept_length=result.speculative.accept_length,
+                accept_rate=result.speculative.accept_rate,
+            )
+        )
+
+    if result.lora:
+        scheduler_load.lora.CopyFrom(
+            sglang_scheduler_pb2.LoRAMetrics(
+                slots_used=result.lora.slots_used,
+                slots_total=result.lora.slots_total,
+                utilization=result.lora.utilization,
+            )
+        )
+
+    if result.disaggregation:
+        scheduler_load.disaggregation.CopyFrom(
+            sglang_scheduler_pb2.DisaggregationMetrics(
+                mode=result.disaggregation.mode,
+                prefill_prealloc_queue_reqs=result.disaggregation.prefill_prealloc_queue_reqs,
+                prefill_inflight_queue_reqs=result.disaggregation.prefill_inflight_queue_reqs,
+                decode_prealloc_queue_reqs=result.disaggregation.decode_prealloc_queue_reqs,
+                decode_transfer_queue_reqs=result.disaggregation.decode_transfer_queue_reqs,
+                decode_retracted_queue_reqs=result.disaggregation.decode_retracted_queue_reqs,
+                kv_transfer_speed_gb_s=result.disaggregation.kv_transfer_speed_gb_s,
+                kv_transfer_latency_ms=result.disaggregation.kv_transfer_latency_ms,
+            )
+        )
+
+    if result.queues:
+        scheduler_load.queues.CopyFrom(
+            sglang_scheduler_pb2.QueueMetrics(
+                waiting=result.queues.waiting,
+                grammar=result.queues.grammar,
+                paused=result.queues.paused,
+                retracted=result.queues.retracted,
+            )
+        )
+
+    return scheduler_load
+
+
+def _compute_aggregate_protobuf(
+    loads: list,
+) -> sglang_scheduler_pb2.AggregateMetrics:
+    """Compute aggregate metrics from list of SchedulerLoad protobuf messages."""
+    if not loads:
+        return sglang_scheduler_pb2.AggregateMetrics()
+
+    n = len(loads)
+    total_running = sum(load.num_running_reqs for load in loads)
+    total_waiting = sum(load.num_waiting_reqs for load in loads)
+
+    return sglang_scheduler_pb2.AggregateMetrics(
+        total_running_reqs=total_running,
+        total_waiting_reqs=total_waiting,
+        total_reqs=total_running + total_waiting,
+        avg_token_usage=round(sum(load.token_usage for load in loads) / n, 4),
+        avg_throughput=round(sum(load.gen_throughput for load in loads) / n, 2),
+        avg_utilization=round(sum(load.utilization for load in loads) / n, 4),
+    )
+
+
+class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer):
+    """
+    Standalone gRPC service implementation using GrpcRequestManager.
+    Fully separated from HTTP server with its own process and no shared globals.
+    """
+
+    def __init__(
+        self,
+        request_manager: GrpcRequestManager,
+        server_args: ServerArgs,
+        model_info: Dict,
+        scheduler_info: Dict,
+        health_servicer: Optional[SGLangHealthServicer] = None,
+    ):
+        """Initialize the standalone gRPC service."""
+        self.request_manager = request_manager
+        self.server_args = server_args
+        self.model_info = model_info
+        self.scheduler_info = scheduler_info
+        self.start_time = time.time()
+        self.health_servicer = health_servicer
+        self.mm_receiver = None
+        if (
+            self.server_args.language_only
+            and self.server_args.encoder_transfer_backend == "zmq_to_scheduler"
+        ):
+            from sglang.srt.disaggregation import encode_receiver as mm_receiver
+
+            self.mm_receiver = mm_receiver.create_mm_receiver(self.server_args)
+
+        # Start the request manager's event loop using auto_create_handle_loop
+        self.request_manager.auto_create_handle_loop()
+
+        logger.info("gRPC scheduler servicer initialized")
+
+    async def Generate(
+        self,
+        request: sglang_scheduler_pb2.GenerateRequest,
+        context: grpc.aio.ServicerContext,
+    ) -> AsyncIterator[sglang_scheduler_pb2.GenerateResponse]:
+        """Handle generation requests with streaming responses."""
+        logger.info(f"Receive generation request: {request.request_id}")
+
+        try:
+            # Convert gRPC request to internal format
+            tokenized_req = self._convert_generate_request(request)
+            self._handle_epd_disaggregation_encode_request(request, tokenized_req)
+
+            # Submit to request manager (automatically handles n>1)
+            response_generator = self.request_manager.generate_request(
+                obj=tokenized_req,
+                request_id=request.request_id,
+                grpc_context=context,
+            )
+
+            async for output in response_generator:
+                # Handle batch responses (for n>1 non-streaming)
+                if isinstance(output, list):
+                    for batch_output in output:
+                        if "error" in batch_output:
+                            await context.abort(
+                                abort_code_from_output(batch_output),
+                                batch_output["error"],
+                            )
+                        else:
+                            # All non-error batch outputs are final responses
+                            yield self._create_completion_response(
+                                request.request_id, batch_output
+                            )
+                else:
+                    # Handle single response (for streaming or n=1 non-streaming)
+                    if "error" in output:
+                        await context.abort(
+                            abort_code_from_output(output),
+                            output["error"],
+                        )
+                    elif request.stream:
+                        yield self._create_chunk_response(request.request_id, output)
+                        if output.get("finished", False):
+                            yield self._create_completion_response(
+                                request.request_id, output
+                            )
+                    else:
+                        # Non-streaming n=1: single completion response
+                        yield self._create_completion_response(
+                            request.request_id, output
+                        )
+
+        except grpc.aio.AbortError:
+            raise
+        except ValueError as e:
+            logger.warning(f"Generate invalid request {request.request_id}: {e}")
+            await context.abort(grpc.StatusCode.INVALID_ARGUMENT, str(e))
+        except Exception as e:
+            logger.error(
+                f"Generate failed for request {request.request_id}: {e}\n"
+                f"{get_exception_traceback()}"
+            )
+            await context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+    async def Embed(
+        self,
+        request: sglang_scheduler_pb2.EmbedRequest,
+        context: grpc.aio.ServicerContext,
+    ) -> sglang_scheduler_pb2.EmbedResponse:
+        """Handle embedding requests."""
+        logger.info(f"Receive embedding request: {request.request_id}")
+
+        try:
+            tokenized_req = self._convert_embed_request(request)
+
+            future = await self.request_manager.embedding_request(
+                obj=tokenized_req,
+                request_id=request.request_id,
+            )
+
+            result = await future
+
+            return sglang_scheduler_pb2.EmbedResponse(
+                request_id=request.request_id,
+                complete=sglang_scheduler_pb2.EmbedComplete(
+                    embedding=result["embedding"],
+                    prompt_tokens=result.get("prompt_tokens", 0),
+                    cached_tokens=0,
+                    embedding_dim=len(result["embedding"]),
+                ),
+            )
+
+        except grpc.aio.AbortError:
+            raise
+        except ValueError as e:
+            logger.warning(f"Embed invalid request {request.request_id}: {e}")
+            await context.abort(grpc.StatusCode.INVALID_ARGUMENT, str(e))
+        except Exception as e:
+            logger.error(
+                f"Embed failed for request {request.request_id}: {e}\n"
+                f"{get_exception_traceback()}"
+            )
+            await context.abort(grpc.StatusCode.INTERNAL, str(e))
+
+    async def HealthCheck(
+        self,
+        request: sglang_scheduler_pb2.HealthCheckRequest,
+        context: grpc.aio.ServicerContext,
+    ) -> sglang_scheduler_pb2.HealthCheckResponse:
+        """
+        Check the health of the inference server by sending a special request to generate one token.
+        Similar to HTTP server's /health endpoint.
+        """
+        rid = f"HEALTH_CHECK_{time.time()}"
+        logger.info(f"Receive health check request: {rid}")
+
+        if self.request_manager.gracefully_exit:
+            logger.info(
+                "Health check request received during shutdown. Returning unhealthy."
+            )
+            return sglang_scheduler_pb2.HealthCheckResponse(
+                healthy=False, message="Server is shutting down"
+            )
+
+        # Create a special health check request
+        sampling_params = SGLSamplingParams(max_new_tokens=1, temperature=0.0)
+        sampling_params.normalize(tokenizer=None)
+
+        # Create health check request
+        is_generation = self.scheduler_info.get("is_generation")
+        if is_generation is None:
+            is_generation = not self.server_args.is_embedding
+
+        if is_generation:
+            health_req = TokenizedGenerateReqInput(
+                rid=rid,
+                input_text="",
+                input_ids=[0],
+                sampling_params=sampling_params,
+                return_logprob=False,
+                logprob_start_len=-1,
+                top_logprobs_num=0,
+                stream=False,
+                mm_inputs=None,
+                token_ids_logprob=None,
+            )
+            # Set disaggregation params if needed
+            if self.server_args.disaggregation_mode != DisaggregationMode.NULL.value:
+                health_req.bootstrap_host = FAKE_BOOTSTRAP_HOST
+                health_req.bootstrap_room = 0
+        else:
+            sampling_params.max_new_tokens = 0
+            health_req = TokenizedEmbeddingReqInput(
+                rid=rid,
+                input_text="",
+                input_ids=[0],
+                image_inputs={"mm_items": []},
+                token_type_ids=[0],
+                sampling_params=sampling_params,
+            )
+
+        # Submit health check request
+        async def run_health_check():
+            try:
+                async for _ in self.request_manager.generate_request(
+                    obj=health_req,
+                    request_id=rid,
+                ):
+                    # Got at least one response, server is healthy
+                    return True
+            except Exception as e:
+                logger.warning(f"Health check failed: {e}")
+                return False
+            return False
+
+        task = asyncio.create_task(run_health_check())
+
+        # Wait for response with timeout
+        tic = time.time()
+        while time.time() < tic + HEALTH_CHECK_TIMEOUT:
+            await asyncio.sleep(1)
+            # Check if we got a response from scheduler
+            if self.request_manager.last_receive_tstamp > tic:
+                task.cancel()
+                # Clean up health check state
+                self.request_manager._cleanup_request_state(rid)
+                return sglang_scheduler_pb2.HealthCheckResponse(
+                    healthy=True, message="Health check passed"
+                )
+
+        # Timeout - server not responding
+        task.cancel()
+        self.request_manager._cleanup_request_state(rid)
+        logger.warning(f"Health check timeout after {HEALTH_CHECK_TIMEOUT}s")
+        return sglang_scheduler_pb2.HealthCheckResponse(
+            healthy=False, message=f"Health check timeout after {HEALTH_CHECK_TIMEOUT}s"
+        )
+
+    async def Abort(
+        self,
+        request: sglang_scheduler_pb2.AbortRequest,
+        _context: grpc.aio.ServicerContext,
+    ) -> sglang_scheduler_pb2.AbortResponse:
+        """Abort an ongoing request."""
+        logger.info(f"Receive abort request: {request.request_id}")
+
+        try:
+            success = await self.request_manager.abort_request(request.request_id)
+
+            return sglang_scheduler_pb2.AbortResponse(
+                success=success,
+                message=f"Request {request.request_id} {'aborted' if success else 'not found'}",
+            )
+        except Exception as e:
+            logger.error(
+                f"Abort failed for request {request.request_id}: {e}\n"
+                f"{get_exception_traceback()}"
+            )
+            return sglang_scheduler_pb2.AbortResponse(
+                success=False,
+                message=str(e),
+            )
+
+    async def GetModelInfo(
+        self,
+        _request: sglang_scheduler_pb2.GetModelInfoRequest,
+        _context: grpc.aio.ServicerContext,
+    ) -> sglang_scheduler_pb2.GetModelInfoResponse:
+        """Get model information."""
+        logger.debug("Receive model info request")
+
+        is_generation = self.scheduler_info.get("is_generation")
+        if is_generation is None:
+            is_generation = not self.server_args.is_embedding
+
+        return sglang_scheduler_pb2.GetModelInfoResponse(
+            model_path=self.server_args.model_path,
+            tokenizer_path=self.server_args.tokenizer_path or "",
+            is_generation=is_generation,
+            preferred_sampling_params=(
+                self.server_args.preferred_sampling_params or ""
+            ),
+            weight_version=self.server_args.weight_version or "",
+            served_model_name=self.server_args.served_model_name,
+            max_context_length=self.model_info["max_context_length"],
+            vocab_size=self.model_info["vocab_size"],
+            supports_vision=self.model_info["supports_vision"],
+            model_type=self.model_info.get("model_type") or "",
+            architectures=self.model_info.get("architectures") or [],
+            eos_token_ids=self.model_info["eos_token_ids"],
+            pad_token_id=self.model_info["pad_token_id"],
+            bos_token_id=self.model_info["bos_token_id"],
+            max_req_input_len=self.model_info["max_req_input_len"],
+            # Classification model support
+            id2label_json=self.model_info.get("id2label_json") or "",
+            num_labels=self.model_info.get("num_labels") or 0,
+        )
+
+    async def GetServerInfo(
+        self,
+        _request: sglang_scheduler_pb2.GetServerInfoRequest,
+        _context: grpc.aio.ServicerContext,
+    ) -> sglang_scheduler_pb2.GetServerInfoResponse:
+        """Get server information."""
+        logger.debug("Receive server info request")
+
+        server_args_dict = dataclasses.asdict(self.server_args)
+        server_args_struct = Struct()
+
+        def make_serializable(obj):
+            if obj is None:
+                return None
+            elif isinstance(obj, (str, int, float, bool)):
+                return obj
+            elif isinstance(obj, (list, tuple, set)):
+                return [make_serializable(item) for item in obj]
+            elif isinstance(obj, dict):
+                return {k: make_serializable(v) for k, v in obj.items()}
+            else:
+                return str(obj)
+
+        serializable_args = make_serializable(server_args_dict)
+        server_args_struct.update(serializable_args)
+
+        # Convert scheduler_info to Struct
+        scheduler_info_struct = Struct()
+        scheduler_info_struct.update(self.scheduler_info)
+
+        # Get runtime state from request manager
+        manager_state = self.request_manager.get_server_info()
+
+        # Calculate uptime
+        uptime = time.time() - self.start_time
+
+        # Create timestamp
+        start_timestamp = Timestamp()
+        start_timestamp.FromSeconds(int(self.start_time))
+
+        return sglang_scheduler_pb2.GetServerInfoResponse(
+            server_args=server_args_struct,
+            scheduler_info=scheduler_info_struct,
+            active_requests=manager_state["active_requests"],
+            is_paused=manager_state["paused"],
+            last_receive_timestamp=manager_state["last_receive_time"],
+            uptime_seconds=uptime,
+            sglang_version=sglang.__version__,
+            server_type="grpc",
+            start_time=start_timestamp,
+        )
+
+    async def GetLoads(
+        self,
+        request: sglang_scheduler_pb2.GetLoadsRequest,
+        context: grpc.aio.ServicerContext,
+    ) -> sglang_scheduler_pb2.GetLoadsResponse:
+        """
+        Get comprehensive load metrics for all DP ranks.
+
+        Uses the communicator pattern to fetch real-time metrics,
+        providing full parity with the HTTP /v1/loads endpoint.
+        """
+        logger.debug("Receive get loads request")
+
+        include = list(request.include) if request.include else ["all"]
+        dp_rank = request.dp_rank if request.HasField("dp_rank") else None
+
+        try:
+            results = await self.request_manager.get_loads(
+                include=include, dp_rank=dp_rank
+            )
+        except ValueError as e:
+            # Validation error (e.g., invalid include sections)
+            context.set_code(grpc.StatusCode.INVALID_ARGUMENT)
+            context.set_details(str(e))
+            return sglang_scheduler_pb2.GetLoadsResponse()
+        except asyncio.TimeoutError:
+            context.set_code(grpc.StatusCode.DEADLINE_EXCEEDED)
+            context.set_details("Timeout waiting for scheduler response")
+            return sglang_scheduler_pb2.GetLoadsResponse()
+        except Exception as e:
+            logger.error(f"GetLoads failed: {e}\n{get_exception_traceback()}")
+            context.set_code(grpc.StatusCode.INTERNAL)
+            context.set_details(f"Failed to get load metrics: {e}")
+            return sglang_scheduler_pb2.GetLoadsResponse()
+
+        loads = [_convert_loads_to_protobuf(r) for r in results]
+
+        return sglang_scheduler_pb2.GetLoadsResponse(
+            timestamp=datetime.now(timezone.utc).isoformat(),
+            version=sglang.__version__,
+            dp_rank_count=len(loads),
+            loads=loads,
+            aggregate=_compute_aggregate_protobuf(loads),
+        )
+
+    def _handle_epd_disaggregation_encode_request(
+        self,
+        grpc_req: sglang_scheduler_pb2.GenerateRequest,
+        tokenized_req: TokenizedGenerateReqInput,
+    ) -> None:
+        if not self.mm_receiver:
+            return
+
+        image_urls = list(grpc_req.mm_inputs.image_urls)
+        if not image_urls:
+            return
+
+        encode_req = self.mm_receiver.build_and_send_encode_request(
+            image_urls=image_urls,
+            rid=grpc_req.request_id,
+        )
+        tokenized_req.need_wait_for_image = bool(encode_req.need_wait_for_image)
+        tokenized_req.num_items_assigned = encode_req.num_items_assigned
+
+    # Helper methods for request/response conversion
+
+    def _convert_generate_request(
+        self, grpc_req: sglang_scheduler_pb2.GenerateRequest
+    ) -> TokenizedGenerateReqInput:
+        """Convert gRPC GenerateRequest to internal format."""
+
+        # Extract tokenized input
+        if not grpc_req.HasField("tokenized"):
+            raise ValueError("Tokenized input must be provided")
+
+        input_text = grpc_req.tokenized.original_text
+        input_ids = list(grpc_req.tokenized.input_ids)
+
+        # Convert sampling params
+        sampling_params = self._convert_sampling_params(grpc_req.sampling_params)
+        sampling_params.normalize(tokenizer=None)
+
+        # Extract disaggregated params if present
+        bootstrap_host = None
+        bootstrap_port = None
+        bootstrap_room = None
+        if grpc_req.HasField("disaggregated_params"):
+            # Don't use 'or None' as it treats 0 as falsy
+            bootstrap_host = (
+                grpc_req.disaggregated_params.bootstrap_host
+                if grpc_req.disaggregated_params.bootstrap_host
+                else None
+            )
+            bootstrap_port = (
+                grpc_req.disaggregated_params.bootstrap_port
+                if grpc_req.disaggregated_params.bootstrap_port
+                else None
+            )
+            bootstrap_room = (
+                grpc_req.disaggregated_params.bootstrap_room
+            )  # Can be 0, don't use 'or None'
+
+        # Parse multimodal inputs if present
+        mm_inputs = None
+        if grpc_req.HasField("mm_inputs") and grpc_req.mm_inputs.HasField(
+            "pixel_values"
+        ):
+            mm_inputs = self._parse_mm_inputs(grpc_req.mm_inputs)
+
+        # Create request
+        return TokenizedGenerateReqInput(
+            rid=grpc_req.request_id,
+            input_text=input_text,
+            input_ids=input_ids,
+            mm_inputs=mm_inputs,
+            sampling_params=sampling_params,
+            return_logprob=grpc_req.return_logprob,
+            logprob_start_len=(
+                grpc_req.logprob_start_len
+                if grpc_req.logprob_start_len is not None
+                else -1
+            ),
+            top_logprobs_num=grpc_req.top_logprobs_num or 0,
+            stream=grpc_req.stream or False,
+            lora_id=grpc_req.lora_id if grpc_req.lora_id else None,
+            token_ids_logprob=(
+                list(grpc_req.token_ids_logprob) if grpc_req.token_ids_logprob else None
+            ),
+            bootstrap_host=bootstrap_host,
+            bootstrap_port=bootstrap_port,
+            bootstrap_room=bootstrap_room,
+        )
+
+    @staticmethod
+    def _decode_tensor_data(tensor_data):
+        """Decode a proto TensorData message into a torch.Tensor."""
+        dtype_map = {"float32": np.float32, "int64": np.int64}
+        np_dtype = dtype_map.get(tensor_data.dtype, np.float32)
+        shape = list(tensor_data.shape)
+        arr = np.frombuffer(tensor_data.data, dtype=np_dtype).reshape(shape)
+        return torch.from_numpy(arr)
+
+    def _parse_mm_inputs(self, mm_proto) -> dict:
+        """Parse proto MultimodalInputs into the mm_inputs dict expected by scheduler."""
+        # Decode pixel_values from typed TensorData field
+        pixel_values = self._decode_tensor_data(mm_proto.pixel_values)
+
+        # Decode model-specific tensors
+        model_specific_data = {}
+        for key, tensor_data in mm_proto.model_specific_tensors.items():
+            model_specific_data[key] = self._decode_tensor_data(tensor_data)
+
+        # Convert placeholder ranges to offsets: list of (start, end_inclusive)
+        offsets = [
+            (p.offset, p.offset + p.length - 1) for p in mm_proto.mm_placeholders
+        ]
+        if not offsets:
+            logger.warning(
+                "No mm_placeholders from Rust gateway — token expansion may have "
+                "failed to find the placeholder token in input_ids. "
+                "Check that placeholder_token_id matches the tokenized image token."
+            )
+            offsets = None
+
+        mm_item = MultimodalDataItem(
+            modality=Modality.IMAGE,
+            feature=pixel_values,
+            model_specific_data=model_specific_data,
+            offsets=offsets,
+        )
+
+        result = {"mm_items": [mm_item]}
+
+        if mm_proto.HasField("im_token_id"):
+            result["im_token_id"] = mm_proto.im_token_id
+
+        return result
+
+    def _convert_embed_request(
+        self, grpc_req: sglang_scheduler_pb2.EmbedRequest
+    ) -> TokenizedEmbeddingReqInput:
+        """Convert gRPC EmbedRequest to internal format."""
+
+        # Extract tokenized input
+        if not grpc_req.HasField("tokenized"):
+            raise ValueError("Tokenized input must be provided")
+
+        input_text = grpc_req.tokenized.original_text
+        input_ids = list(grpc_req.tokenized.input_ids)
+
+        # Convert sampling params
+        sampling_params = self._convert_sampling_params(grpc_req.sampling_params)
+
+        # For embedding requests, max_new_tokens should be 0.
+        # The scheduler logic expects an integer, not None.
+        sampling_params.max_new_tokens = 0
+
+        sampling_params.normalize(tokenizer=None)
+
+        return TokenizedEmbeddingReqInput(
+            rid=grpc_req.request_id,
+            input_text=input_text,
+            input_ids=input_ids,
+            image_inputs={"mm_items": []},
+            token_type_ids=list(grpc_req.token_type_ids),
+            sampling_params=sampling_params,
+        )
+
+    def _convert_sampling_params(
+        self, grpc_params: sglang_scheduler_pb2.SamplingParams
+    ) -> SGLSamplingParams:
+        """Convert gRPC SamplingParams to internal format."""
+
+        # Handle constraint types
+        regex = None
+        json_schema = None
+        ebnf_grammar = None
+        structural_tag = None
+
+        if grpc_params.HasField("regex"):
+            regex = grpc_params.regex
+        elif grpc_params.HasField("json_schema"):
+            json_schema = grpc_params.json_schema
+        elif grpc_params.HasField("ebnf_grammar"):
+            ebnf_grammar = grpc_params.ebnf_grammar
+        elif grpc_params.HasField("structural_tag"):
+            structural_tag = grpc_params.structural_tag
+
+        # Handle optional parameters conversion
+        custom_params = (
+            MessageToDict(grpc_params.custom_params)
+            if grpc_params.HasField("custom_params")
+            else None
+        )
+        max_new_tokens = (
+            grpc_params.max_new_tokens
+            if grpc_params.HasField("max_new_tokens")
+            else None
+        )
+        stream_interval = (
+            grpc_params.stream_interval
+            if grpc_params.HasField("stream_interval")
+            else None
+        )
+        logit_bias = dict(grpc_params.logit_bias) if grpc_params.logit_bias else None
+        stop = list(grpc_params.stop) if grpc_params.stop else None
+        stop_token_ids = (
+            list(grpc_params.stop_token_ids) if grpc_params.stop_token_ids else None
+        )
+
+        return SGLSamplingParams(
+            temperature=grpc_params.temperature,
+            top_p=grpc_params.top_p,
+            top_k=grpc_params.top_k,
+            min_p=grpc_params.min_p,
+            frequency_penalty=grpc_params.frequency_penalty,
+            presence_penalty=grpc_params.presence_penalty,
+            repetition_penalty=grpc_params.repetition_penalty,
+            max_new_tokens=max_new_tokens,
+            min_new_tokens=grpc_params.min_new_tokens,
+            stop=stop,
+            stop_token_ids=stop_token_ids,
+            skip_special_tokens=grpc_params.skip_special_tokens,
+            spaces_between_special_tokens=grpc_params.spaces_between_special_tokens,
+            no_stop_trim=grpc_params.no_stop_trim,
+            regex=regex,
+            json_schema=json_schema,
+            ebnf=ebnf_grammar,
+            structural_tag=structural_tag,
+            n=grpc_params.n,
+            ignore_eos=grpc_params.ignore_eos,
+            stream_interval=stream_interval,
+            logit_bias=logit_bias,
+            custom_params=custom_params,
+        )
+
+    def _convert_output_logprobs_to_proto(
+        self, logprobs_data: Dict
+    ) -> Optional[sglang_scheduler_pb2.OutputLogProbs]:
+        """Convert output logprobs dict to proto (no None values, plain floats)."""
+        if not logprobs_data:
+            return None
+
+        token_logprobs_val = logprobs_data.get("token_logprobs_val", [])
+        token_logprobs_idx = logprobs_data.get("token_logprobs_idx", [])
+        top_logprobs_val = logprobs_data.get("top_logprobs_val", [])
+        top_logprobs_idx = logprobs_data.get("top_logprobs_idx", [])
+
+        # Build TopLogProbs entries
+        top_logprobs_proto = []
+        if top_logprobs_val and top_logprobs_idx:
+            for val_list, idx_list in zip(top_logprobs_val, top_logprobs_idx):
+                top_logprobs_proto.append(
+                    sglang_scheduler_pb2.TopLogProbs(
+                        values=val_list,
+                        token_ids=idx_list,
+                    )
+                )
+
+        return sglang_scheduler_pb2.OutputLogProbs(
+            token_logprobs=token_logprobs_val,  # Plain float array
+            token_ids=token_logprobs_idx,
+            top_logprobs=top_logprobs_proto,
+        )
+
+    def _convert_input_logprobs_to_proto(
+        self, logprobs_data: Dict
+    ) -> Optional[sglang_scheduler_pb2.InputLogProbs]:
+        """Convert input logprobs dict to proto (first token is None, wrapped in InputTokenLogProb)."""
+        if not logprobs_data:
+            return None
+
+        token_logprobs_val = logprobs_data.get("token_logprobs_val", [])
+        token_logprobs_idx = logprobs_data.get("token_logprobs_idx", [])
+        top_logprobs_val = logprobs_data.get("top_logprobs_val", [])
+        top_logprobs_idx = logprobs_data.get("top_logprobs_idx", [])
+
+        # Wrap values in InputTokenLogProb (None for first token, value for others)
+        token_logprobs_wrapped = [
+            (
+                sglang_scheduler_pb2.InputTokenLogProb()
+                if x is None
+                else sglang_scheduler_pb2.InputTokenLogProb(value=x)
+            )
+            for x in token_logprobs_val
+        ]
+
+        # Build TopLogProbs entries
+        top_logprobs_proto = []
+        if top_logprobs_val and top_logprobs_idx:
+            for val_list, idx_list in zip(top_logprobs_val, top_logprobs_idx):
+                top_logprobs_proto.append(
+                    sglang_scheduler_pb2.TopLogProbs(
+                        values=val_list,
+                        token_ids=idx_list,
+                    )
+                )
+
+        return sglang_scheduler_pb2.InputLogProbs(
+            token_logprobs=token_logprobs_wrapped,
+            token_ids=token_logprobs_idx,
+            top_logprobs=top_logprobs_proto,
+        )
+
+    def _create_chunk_response(
+        self, request_id: str, output: Dict
+    ) -> sglang_scheduler_pb2.GenerateResponse:
+        """Create a streaming chunk response."""
+        meta_info = output.get("meta_info", {})
+
+        # Convert output logprobs if present
+        output_logprobs_proto = self._convert_output_logprobs_to_proto(
+            output.get("output_logprobs")
+        )
+
+        # Convert input logprobs if present (only in first chunk)
+        input_logprobs_proto = self._convert_input_logprobs_to_proto(
+            output.get("input_logprobs")
+        )
+
+        return sglang_scheduler_pb2.GenerateResponse(
+            request_id=request_id,
+            chunk=sglang_scheduler_pb2.GenerateStreamChunk(
+                token_ids=output.get("token_ids", []),
+                prompt_tokens=meta_info.get("prompt_tokens", 0),
+                completion_tokens=meta_info.get("completion_tokens", 0),
+                cached_tokens=meta_info.get("cached_tokens", 0),
+                output_logprobs=output_logprobs_proto,
+                input_logprobs=input_logprobs_proto,
+                index=output.get("index", 0),
+            ),
+        )
+
+    def _create_completion_response(
+        self, request_id: str, output: Dict
+    ) -> sglang_scheduler_pb2.GenerateResponse:
+        """Create a completion response."""
+
+        # Extract meta info and finish reason details
+        meta_info = output.get("meta_info", {})
+        finish_reason_data = meta_info.get("finish_reason")
+
+        # Determine finish reason, default is stop
+        finish_reason = "stop"
+        if finish_reason_data:
+            if isinstance(finish_reason_data, dict):
+                finish_reason_type = finish_reason_data.get("type")
+            else:
+                # Handle legacy string format
+                finish_reason_type = finish_reason_data
+
+            if finish_reason_type == "length":
+                finish_reason = "length"
+            elif finish_reason_type == "abort":
+                finish_reason = "abort"
+
+        # Extract matched_stop information
+        matched_stop_kwargs = {}
+        if isinstance(finish_reason_data, dict) and "matched" in finish_reason_data:
+            matched = finish_reason_data["matched"]
+            if isinstance(matched, int):
+                matched_stop_kwargs["matched_token_id"] = matched
+            elif isinstance(matched, str):
+                matched_stop_kwargs["matched_stop_str"] = matched
+
+        # Convert output logprobs if present
+        output_logprobs_proto = self._convert_output_logprobs_to_proto(
+            output.get("output_logprobs")
+        )
+
+        # Convert input logprobs if present
+        input_logprobs_proto = self._convert_input_logprobs_to_proto(
+            output.get("input_logprobs")
+        )
+
+        return sglang_scheduler_pb2.GenerateResponse(
+            request_id=request_id,
+            complete=sglang_scheduler_pb2.GenerateComplete(
+                output_ids=output.get("token_ids", []),
+                finish_reason=finish_reason,
+                prompt_tokens=meta_info.get("prompt_tokens", 0),
+                completion_tokens=meta_info.get(
+                    "completion_tokens", len(output.get("token_ids", []))
+                ),
+                cached_tokens=meta_info.get("cached_tokens", 0),
+                output_logprobs=output_logprobs_proto,
+                input_logprobs=input_logprobs_proto,
+                index=output.get("index", 0),
+                **matched_stop_kwargs,
+            ),
+        )
+
+    async def shutdown(self):
+        """Shutdown the service."""
+        logger.info("Shutting down gRPC service")
+
+        # Mark health service as NOT_SERVING before shutdown
+        if self.health_servicer:
+            self.health_servicer.set_not_serving()
+
+        # Shutdown request manager (handles its own tasks)
+        await self.request_manager.shutdown()
+
+
+async def serve_grpc(
+    server_args: ServerArgs,
+    model_info: Optional[Dict] = None,
+):
+    """Start the standalone gRPC server with integrated scheduler."""
+
+    # Start bootstrap server BEFORE launching scheduler processes (only in PREFILL mode)
+    # This ensures the bootstrap server is ready when prefill schedulers try to register
+    bootstrap_server = None
+    if server_args.disaggregation_mode == "prefill":
+        bootstrap_server = start_disagg_service(server_args)
+        if bootstrap_server:
+            logger.info(
+                f"Bootstrap server started for disaggregation mode on {server_args.host}:{server_args.disaggregation_bootstrap_port}"
+            )
+
+    # Launch only the scheduler process(es) (no tokenizer/detokenizer needed for gRPC)
+    logger.info("Launching scheduler process(es)...")
+    scheduler_info, port_args, scheduler_procs = launch_scheduler_process_only(
+        server_args=server_args,
+    )
+
+    # Load model config to get HF config info (same as TokenizerManager does)
+    model_config = ModelConfig.from_server_args(server_args)
+
+    # Update model info from scheduler info and model config
+    if model_info is None:
+        # Extract classification labels from HuggingFace config (if available)
+        # Match logic in serving_classify.py::_get_id2label_mapping
+        hf_config = model_config.hf_config
+        id2label = getattr(hf_config, "id2label", None)
+        num_labels = getattr(hf_config, "num_labels", 0) or 0
+
+        # If no id2label but num_labels exists, create default mapping
+        if not id2label and num_labels:
+            id2label = {i: f"LABEL_{i}" for i in range(num_labels)}
+        elif id2label and not num_labels:
+            num_labels = len(id2label)
+
+        # Convert to JSON string for proto transport
+        # id2label is a dict like {0: "negative", 1: "positive"}
+        id2label_json = json.dumps(id2label) if id2label else ""
+
+        model_info = {
+            "model_name": server_args.model_path,
+            "max_context_length": scheduler_info.get(
+                "max_total_num_tokens", server_args.context_length or 8192
+            ),
+            "vocab_size": scheduler_info.get("vocab_size", 128256),
+            "supports_vision": scheduler_info.get("supports_vision", False),
+            "model_type": getattr(hf_config, "model_type", None),
+            "architectures": getattr(hf_config, "architectures", None),
+            "max_req_input_len": scheduler_info.get("max_req_input_len", 8192),
+            "eos_token_ids": scheduler_info.get("eos_token_ids", []),
+            "pad_token_id": scheduler_info.get("pad_token_id", 0),
+            "bos_token_id": scheduler_info.get("bos_token_id", 1),
+            # Classification model support
+            "id2label_json": id2label_json,
+            "num_labels": num_labels or 0,
+        }
+
+    # Create request manager with the correct port args
+    # Note: We pass None for bootstrap_server since it's already started above
+    request_manager = GrpcRequestManager(
+        server_args=server_args,
+        port_args=port_args,
+        bootstrap_server=bootstrap_server,
+    )
+
+    # Create gRPC server
+    server = grpc.aio.server(
+        futures.ThreadPoolExecutor(max_workers=10),
+        options=[
+            ("grpc.max_send_message_length", 1024 * 1024 * 256),
+            ("grpc.max_receive_message_length", 1024 * 1024 * 256),
+            # Allow client HTTP/2 keepalive pings every 10s+.
+            # Without this, the gRPC C-core default (300s minimum) causes
+            # GOAWAY when clients send pings more frequently during long
+            # requests (e.g. prefill) where no DATA frames flow.
+            ("grpc.http2.min_recv_ping_interval_without_data_ms", 10000),
+            ("grpc.keepalive_permit_without_calls", True),
+        ],
+    )
+
+    # Create standard health service (for Kubernetes probes)
+    health_servicer = SGLangHealthServicer(
+        request_manager=request_manager,
+        scheduler_info=scheduler_info,
+    )
+    health_pb2_grpc.add_HealthServicer_to_server(health_servicer, server)
+
+    # Add SGLang service
+    servicer = SGLangSchedulerServicer(
+        request_manager=request_manager,
+        server_args=server_args,
+        model_info=model_info,
+        scheduler_info=scheduler_info,
+        health_servicer=health_servicer,
+    )
+    sglang_scheduler_pb2_grpc.add_SglangSchedulerServicer_to_server(servicer, server)
+
+    # Enable reflection
+    SERVICE_NAMES = (
+        sglang_scheduler_pb2.DESCRIPTOR.services_by_name["SglangScheduler"].full_name,
+        "grpc.health.v1.Health",
+        reflection.SERVICE_NAME,
+    )
+    reflection.enable_server_reflection(SERVICE_NAMES, server)
+
+    # Start server
+    listen_addr = f"{server_args.host}:{server_args.port}"
+    if server_args.ssl_certfile and server_args.ssl_keyfile:
+        if server_args.ssl_keyfile_password:
+            raise ValueError(
+                "gRPC mode does not support encrypted SSL key files "
+                "(--ssl-keyfile-password). Please provide an unencrypted key "
+                "file when using --grpc-mode."
+            )
+
+        def _read_ssl_file(filepath: str, description: str) -> bytes:
+            try:
+                with open(filepath, "rb") as f:
+                    return f.read()
+            except OSError as e:
+                raise ValueError(
+                    f"Failed to read {description} '{filepath}': {e}"
+                ) from e
+
+        private_key = _read_ssl_file(server_args.ssl_keyfile, "SSL key file")
+        certificate_chain = _read_ssl_file(
+            server_args.ssl_certfile, "SSL certificate file"
+        )
+        root_certificates = None
+        if server_args.ssl_ca_certs:
+            root_certificates = _read_ssl_file(
+                server_args.ssl_ca_certs, "SSL CA certificates file"
+            )
+
+        if server_args.enable_ssl_refresh:
+            # Use dynamic credentials so gRPC re-reads certs on each
+            # new connection via the fetcher callback.
+            _cert_mtime = os.path.getmtime(server_args.ssl_certfile)
+            _key_mtime = os.path.getmtime(server_args.ssl_keyfile)
+            _ca_mtime = (
+                os.path.getmtime(server_args.ssl_ca_certs)
+                if server_args.ssl_ca_certs
+                else None
+            )
+
+            def _cert_config_fetcher():
+                nonlocal _cert_mtime, _key_mtime, _ca_mtime
+                try:
+                    new_cert_mt = os.path.getmtime(server_args.ssl_certfile)
+                    new_key_mt = os.path.getmtime(server_args.ssl_keyfile)
+                    new_ca_mt = (
+                        os.path.getmtime(server_args.ssl_ca_certs)
+                        if server_args.ssl_ca_certs
+                        else None
+                    )
+
+                    if (
+                        new_cert_mt == _cert_mtime
+                        and new_key_mt == _key_mtime
+                        and new_ca_mt == _ca_mtime
+                    ):
+                        return None  # No change
+
+                    new_key = _read_ssl_file(server_args.ssl_keyfile, "SSL key file")
+                    new_cert = _read_ssl_file(
+                        server_args.ssl_certfile, "SSL certificate file"
+                    )
+                    new_root = None
+                    if server_args.ssl_ca_certs:
+                        new_root = _read_ssl_file(
+                            server_args.ssl_ca_certs,
+                            "SSL CA certificates file",
+                        )
+
+                    logger.info("gRPC SSL certificate change detected, reloading.")
+                    config = grpc.ssl_server_certificate_configuration(
+                        [(new_key, new_cert)],
+                        root_certificates=new_root,
+                    )
+
+                    # Update mtimes only after successful reload
+                    _cert_mtime = new_cert_mt
+                    _key_mtime = new_key_mt
+                    _ca_mtime = new_ca_mt
+
+                    return config
+                except Exception:
+                    logger.exception(
+                        "Failed to reload gRPC SSL certificates — "
+                        "continuing with previous certificates."
+                    )
+                    return None
+
+            try:
+                initial_config = grpc.ssl_server_certificate_configuration(
+                    [(private_key, certificate_chain)],
+                    root_certificates=root_certificates,
+                )
+                credentials = grpc.dynamic_ssl_server_credentials(
+                    initial_config,
+                    _cert_config_fetcher,
+                )
+            except Exception as e:
+                raise ValueError(
+                    f"Failed to create gRPC dynamic SSL credentials. "
+                    f"Verify that --ssl-keyfile and --ssl-certfile contain "
+                    f"valid, matching PEM data. Underlying error: {e}"
+                ) from e
+            logger.info("gRPC SSL certificate auto-refresh enabled.")
+        else:
+            try:
+                credentials = grpc.ssl_server_credentials(
+                    [(private_key, certificate_chain)],  # pairs: (key, cert)
+                    root_certificates=root_certificates,
+                )
+            except Exception as e:
+                raise ValueError(
+                    f"Failed to create gRPC SSL credentials. Verify that "
+                    f"--ssl-keyfile and --ssl-certfile contain valid, matching "
+                    f"PEM data. Underlying error: {e}"
+                ) from e
+        bound_port = server.add_secure_port(listen_addr, credentials)
+        if bound_port == 0:
+            raise RuntimeError(
+                f"Failed to bind gRPC TLS server to {listen_addr}. "
+                f"Check that the port is available and SSL credentials are valid."
+            )
+        logger.info(f"gRPC server (TLS) listening on {listen_addr}")
+    else:
+        server.add_insecure_port(listen_addr)
+        logger.info(f"gRPC server listening on {listen_addr}")
+
+    await server.start()
+
+    # Start warmup in a separate thread
+    warmup_thread = threading.Thread(
+        target=_wait_and_warmup_grpc,
+        args=(server_args, health_servicer),
+    )
+    warmup_thread.start()
+
+    # Handle shutdown signals
+    loop = asyncio.get_running_loop()
+    stop_event = asyncio.Event()
+
+    def signal_handler():
+        logger.info("Received shutdown signal")
+        stop_event.set()
+
+    for sig in (signal.SIGTERM, signal.SIGINT):
+        loop.add_signal_handler(sig, signal_handler)
+
+    try:
+        await stop_event.wait()
+    finally:
+        logger.info("Shutting down gRPC server")
+
+        # Shutdown request manager first - this closes ZMQ sockets and stops background tasks
+        await servicer.shutdown()
+
+        # Stop the gRPC server
+        await server.stop(5.0)
+
+        # Wait for warmup thread to finish
+        if warmup_thread.is_alive():
+            logger.info("Waiting for warmup thread to finish...")
+            warmup_thread.join(timeout=5.0)
+
+        # Terminate scheduler processes before exiting to avoid atexit hang
+        # The scheduler processes have SIGINT ignored, so they won't get KeyboardInterrupt
+        for i, proc in enumerate(scheduler_procs):
+            if proc.is_alive():
+                logger.info(f"Terminating scheduler process {i}...")
+                proc.terminate()
+                proc.join(timeout=2.0)
+                if proc.is_alive():
+                    logger.warning(
+                        f"Scheduler process {i} did not terminate, killing..."
+                    )
+                    proc.kill()
+                    proc.join(timeout=1.0)
+
+        logger.info("All scheduler processes terminated")
+
+
+def _execute_grpc_server_warmup(server_args: ServerArgs):
+    """Execute warmup for gRPC server by checking health and sending test request."""
+    try:
+        # Connect to the gRPC server
+        grpc_url = f"{server_args.host}:{server_args.port}"
+        channel = grpc.insecure_channel(
+            grpc_url,
+            options=[
+                ("grpc.max_send_message_length", 1024 * 1024 * 256),
+                ("grpc.max_receive_message_length", 1024 * 1024 * 256),
+            ],
+        )
+        stub = sglang_scheduler_pb2_grpc.SglangSchedulerStub(channel)
+
+        # Wait until the server is launched (poll GetModelInfo)
+        success = False
+        last_error = None
+        for _ in range(120):
+            time.sleep(1)
+            try:
+                request = sglang_scheduler_pb2.GetModelInfoRequest()
+                response = stub.GetModelInfo(request, timeout=5)
+                success = True
+                break
+            except Exception as e:
+                last_error = str(e)
+                pass
+
+        if not success:
+            error_msg = f"gRPC server warmup failed: Could not connect to server after 120 seconds. Last error: {last_error}"
+            logger.error(error_msg)
+            channel.close()
+            kill_process_tree(os.getpid())
+            return False
+
+        # Get model info to determine if it's generation or embedding
+        is_generation = response.is_generation
+
+        # Send a warmup request
+        logger.info("Sending warmup request to gRPC server...")
+        max_new_tokens = 8 if is_generation else 1
+
+        if is_generation:
+            warmup_request_kwargs = {
+                "request_id": f"WARMUP_{time.time()}",
+                "tokenized": sglang_scheduler_pb2.TokenizedInput(
+                    input_ids=[
+                        123,
+                        456,
+                        789,
+                        234,
+                        567,
+                        890,
+                        345,
+                    ],  # Random-looking but safe token IDs
+                    original_text="warmup request",
+                ),
+                "sampling_params": sglang_scheduler_pb2.SamplingParams(
+                    temperature=0.0,
+                    max_new_tokens=max_new_tokens,
+                ),
+                "stream": False,
+            }
+
+            # Set disaggregation params if needed
+            if server_args.disaggregation_mode != DisaggregationMode.NULL.value:
+                warmup_request_kwargs["disaggregated_params"] = (
+                    sglang_scheduler_pb2.DisaggregatedParams(
+                        bootstrap_host=FAKE_BOOTSTRAP_HOST,
+                        bootstrap_room=0,
+                    )
+                )
+
+            warmup_request = sglang_scheduler_pb2.GenerateRequest(
+                **warmup_request_kwargs
+            )
+
+            # Send the warmup request
+            try:
+                responses = list(stub.Generate(warmup_request, timeout=600))
+                # Check if we got a valid response
+                if responses and not responses[-1].HasField("error"):
+                    logger.info("gRPC warmup request completed successfully")
+                    success = True
+                else:
+                    error_msg = (
+                        responses[-1].error.message if responses else "No response"
+                    )
+                    logger.warning(f"gRPC warmup request returned error: {error_msg}")
+                    success = False
+            except Exception as e:
+                error_msg = f"gRPC warmup request failed: {e}"
+                logger.error(error_msg)
+                channel.close()
+                kill_process_tree(os.getpid())
+                return False
+        else:
+            # For embedding models
+            warmup_request = sglang_scheduler_pb2.EmbedRequest(
+                request_id=f"WARMUP_{time.time()}",
+                tokenized=sglang_scheduler_pb2.TokenizedInput(
+                    input_ids=[10, 11, 12],
+                    original_text="test embedding",
+                ),
+            )
+
+            try:
+                response = stub.Embed(warmup_request, timeout=600)
+                if not response.HasField("error"):
+                    logger.info("gRPC warmup request completed successfully")
+                    success = True
+                else:
+                    logger.warning(
+                        f"gRPC warmup request returned error: {response.error.message}"
+                    )
+                    success = False
+            except Exception as e:
+                error_msg = f"gRPC warmup request failed: {e}"
+                logger.error(error_msg)
+                channel.close()
+                kill_process_tree(os.getpid())
+                return False
+
+        channel.close()
+        return success
+
+    except Exception as e:
+        error_msg = (
+            f"gRPC warmup failed with exception: {e}\n{get_exception_traceback()}"
+        )
+        logger.error(error_msg)
+        try:
+            channel.close()
+        except Exception:
+            pass
+        kill_process_tree(os.getpid())
+        return False
+
+
+def _wait_and_warmup_grpc(
+    server_args: ServerArgs,
+    health_servicer: Optional[SGLangHealthServicer] = None,
+):
+    """Wait for gRPC server to be ready and execute warmup."""
+    if not server_args.skip_server_warmup:
+        if not _execute_grpc_server_warmup(server_args):
+            return
+    else:
+        logger.info("Skipping gRPC server warmup (skip_server_warmup=True)")
+
+    # Mark health service as SERVING after warmup completes
+    if health_servicer:
+        health_servicer.set_serving()
+
+    logger.info("The server is fired up and ready to roll!")
diff --git a/sglang/python/sglang/srt/entrypoints/harmony_utils.py b/sglang/python/sglang/srt/entrypoints/harmony_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..86d4356dc8e54aa39b34f35a70aeed584bf78f21
--- /dev/null
+++ b/sglang/python/sglang/srt/entrypoints/harmony_utils.py
@@ -0,0 +1,368 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adapted from vLLM: https://github.com/vllm-project/vllm/blob/1b9902806915040ac9b3029f2ab7522ec505afc3/vllm/entrypoints/harmony_utils.py
+# Slight differences in processing chat messages
+import datetime
+from collections.abc import Iterable
+from typing import Literal, Optional, Union
+
+import orjson
+from openai.types.responses import (
+    ResponseOutputItem,
+    ResponseOutputMessage,
+    ResponseOutputText,
+    ResponseReasoningItem,
+)
+from openai.types.responses.response_function_tool_call import ResponseFunctionToolCall
+from openai.types.responses.response_function_web_search import (
+    ActionFind,
+    ActionOpenPage,
+    ActionSearch,
+    ResponseFunctionWebSearch,
+)
+from openai.types.responses.response_reasoning_item import (
+    Content as ResponseReasoningTextContent,
+)
+from openai.types.responses.tool import Tool
+from openai_harmony import (
+    Author,
+    Conversation,
+    DeveloperContent,
+    HarmonyEncodingName,
+    Message,
+    ReasoningEffort,
+    Role,
+    StreamableParser,
+    SystemContent,
+    TextContent,
+    ToolDescription,
+    load_harmony_encoding,
+)
+
+from sglang.srt.entrypoints.openai.protocol import ResponseInputOutputItem
+from sglang.srt.utils import random_uuid
+
+REASONING_EFFORT = {
+    "high": ReasoningEffort.HIGH,
+    "medium": ReasoningEffort.MEDIUM,
+    "low": ReasoningEffort.LOW,
+}
+
+_harmony_encoding = None
+
+
+def get_encoding():
+    global _harmony_encoding
+    if _harmony_encoding is None:
+        _harmony_encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
+    return _harmony_encoding
+
+
+def get_system_message(
+    model_identity: Optional[str] = None,
+    reasoning_effort: Optional[Literal["high", "medium", "low"]] = None,
+    start_date: Optional[str] = None,
+    browser_description: Optional[str] = None,
+    python_description: Optional[str] = None,
+) -> Message:
+    sys_msg_content = SystemContent.new()
+    if model_identity is not None:
+        sys_msg_content = sys_msg_content.with_model_identity(model_identity)
+    if reasoning_effort is not None:
+        sys_msg_content = sys_msg_content.with_reasoning_effort(
+            REASONING_EFFORT[reasoning_effort]
+        )
+    if start_date is None:
+        start_date = datetime.datetime.now().strftime("%Y-%m-%d")
+    sys_msg_content = sys_msg_content.with_conversation_start_date(start_date)
+    if browser_description is not None:
+        sys_msg_content = sys_msg_content.with_tools(browser_description)
+    if python_description is not None:
+        sys_msg_content = sys_msg_content.with_tools(python_description)
+    sys_msg = Message.from_role_and_content(Role.SYSTEM, sys_msg_content)
+    return sys_msg
+
+
+def get_developer_message(
+    instructions: Optional[str] = None, tools: Optional[list[Tool]] = None
+) -> Message:
+    dev_msg_content = DeveloperContent.new()
+    if instructions is not None:
+        dev_msg_content = dev_msg_content.with_instructions(instructions)
+    if tools is not None:
+        function_tools = []
+        for tool in tools:
+            if tool.type in ("web_search_preview", "code_interpreter"):
+                # These are built-in tools that are added to the system message.
+                pass
+            elif tool.type == "function":
+                function_tools.append(tool)
+            else:
+                raise ValueError(f"tool type {tool.type} not supported")
+        if function_tools:
+            function_tool_descriptions = [
+                ToolDescription.new(
+                    name=tool.name,
+                    description=tool.description,
+                    parameters=tool.parameters,
+                )
+                for tool in function_tools
+            ]
+            dev_msg_content = dev_msg_content.with_function_tools(
+                function_tool_descriptions
+            )
+    dev_msg = Message.from_role_and_content(Role.DEVELOPER, dev_msg_content)
+    return dev_msg
+
+
+def get_user_message(content: str) -> Message:
+    return Message.from_role_and_content(Role.USER, content)
+
+
+def parse_response_input(
+    response_msg: ResponseInputOutputItem,
+    prev_responses: list[Union[ResponseOutputItem, ResponseReasoningItem]],
+) -> Message:
+    if not isinstance(response_msg, dict):
+        response_msg = response_msg.model_dump()
+    if "type" not in response_msg or response_msg["type"] == "message":
+        role = response_msg["role"]
+        content = response_msg["content"]
+        if role == "system":
+            # User is trying to set a system message. Change it to:
+            # <|start|>developer<|message|># Instructions
+            # {instructions}<|end|>
+            role = "developer"
+            text_prefix = "Instructions:\n"
+        else:
+            text_prefix = ""
+        if isinstance(content, str):
+            msg = Message.from_role_and_content(role, text_prefix + content)
+        else:
+            contents = [TextContent(text=text_prefix + c["text"]) for c in content]
+            msg = Message.from_role_and_contents(role, contents)
+    elif response_msg["type"] == "function_call_output":
+        call_id = response_msg["call_id"]
+        call_response: Optional[ResponseFunctionToolCall] = None
+        for prev_response in reversed(prev_responses):
+            if (
+                isinstance(prev_response, ResponseFunctionToolCall)
+                and prev_response.call_id == call_id
+            ):
+                call_response = prev_response
+                break
+        if call_response is None:
+            raise ValueError(f"No call message found for {call_id}")
+        msg = Message.from_author_and_content(
+            Author.new(Role.TOOL, f"functions.{call_response.name}"),
+            response_msg["output"],
+        )
+    elif response_msg["type"] == "reasoning":
+        content = response_msg["content"]
+        assert len(content) == 1
+        msg = Message.from_role_and_content(Role.ASSISTANT, content[0]["text"])
+    elif response_msg["type"] == "function_call":
+        msg = Message.from_role_and_content(Role.ASSISTANT, response_msg["arguments"])
+        msg = msg.with_channel("commentary")
+        msg = msg.with_recipient(f"functions.{response_msg['name']}")
+        msg = msg.with_content_type("json")
+    else:
+        raise ValueError(f"Unknown input type: {response_msg['type']}")
+    return msg
+
+
+def parse_response_output(output: ResponseOutputItem) -> Message:
+    if isinstance(output, ResponseOutputMessage):
+        role = output.role
+        contents = [TextContent(text=c.text) for c in output.content]
+        msg = Message.from_role_and_contents(role, contents)
+        return msg
+    elif isinstance(output, ResponseFunctionToolCall):
+        msg = Message.from_role_and_content(Role.ASSISTANT, output.arguments)
+        msg = msg.with_channel("commentary")
+        msg = msg.with_recipient(output.name)
+        msg = msg.with_content_type("json")
+        return msg
+    else:
+        raise ValueError(f"Unknown output type: {type(output)}")
+
+
+def parse_chat_input(chat_msg) -> Message:
+    role = chat_msg.role
+    content = chat_msg.content
+    if isinstance(content, str):
+        contents = [TextContent(text=content)]
+    else:
+        # TODO: Support refusal.
+        contents = [TextContent(text=c.text) for c in content]
+    msg = Message.from_role_and_contents(role, contents)
+    return msg
+
+
+def render_for_completion(messages: list[Message]) -> list[int]:
+    conversation = Conversation.from_messages(messages)
+    token_ids = get_encoding().render_conversation_for_completion(
+        conversation, Role.ASSISTANT
+    )
+    return token_ids
+
+
+def get_stop_tokens_for_assistant_actions() -> list[int]:
+    return get_encoding().stop_tokens_for_assistant_actions()
+
+
+def get_streamable_parser_for_assistant() -> StreamableParser:
+    return StreamableParser(get_encoding(), role=Role.ASSISTANT)
+
+
+def parse_output_message(message: Message):
+    if message.author.role != "assistant":
+        # This is a message from a tool to the assistant (e.g., search result).
+        # Don't include it in the final output for now. This aligns with
+        # OpenAI's behavior on models like o4-mini.
+        return []
+
+    output_items = []
+    recipient = message.recipient
+    if recipient is not None and recipient.startswith("browser."):
+        if len(message.content) != 1:
+            raise ValueError("Invalid number of contents in browser message")
+        content = message.content[0]
+        browser_call = orjson.loads(content.text)
+        # TODO: translate to url properly!
+        if recipient == "browser.search":
+            action = ActionSearch(
+                query=f"cursor:{browser_call.get('query', '')}", type="search"
+            )
+        elif recipient == "browser.open":
+            action = ActionOpenPage(
+                url=f"cursor:{browser_call.get('url', '')}", type="open_page"
+            )
+        elif recipient == "browser.find":
+            action = ActionFind(
+                pattern=browser_call["pattern"],
+                url=f"cursor:{browser_call.get('url', '')}",
+                type="find",
+            )
+        else:
+            raise ValueError(f"Unknown browser action: {recipient}")
+        web_search_item = ResponseFunctionWebSearch(
+            id=f"ws_{random_uuid()}",
+            action=action,
+            status="completed",
+            type="web_search_call",
+        )
+        output_items.append(web_search_item)
+    elif message.channel == "analysis":
+        for content in message.content:
+            reasoning_item = ResponseReasoningItem(
+                id=f"rs_{random_uuid()}",
+                type="reasoning",
+                summary=[],
+                content=[
+                    ResponseReasoningTextContent(
+                        text=content.text, type="reasoning_text"
+                    )
+                ],
+                status=None,
+            )
+            output_items.append(reasoning_item)
+    elif message.channel == "commentary":
+        if message.recipient.startswith("functions."):
+            function_name = message.recipient.split(".")[-1]
+            for content in message.content:
+                random_id = random_uuid()
+                response_item = ResponseFunctionToolCall(
+                    arguments=content.text,
+                    call_id=f"call_{random_id}",
+                    type="function_call",
+                    name=function_name,
+                    id=f"ft_{random_id}",
+                )
+                output_items.append(response_item)
+        elif message.recipient.startswith("python") or message.recipient.startswith(
+            "browser"
+        ):
+            for content in message.content:
+                reasoning_item = ResponseReasoningItem(
+                    id=f"rs_{random_uuid()}",
+                    type="reasoning",
+                    summary=[],
+                    content=[
+                        ResponseReasoningTextContent(
+                            text=content.text, type="reasoning_text"
+                        )
+                    ],
+                    status=None,
+                )
+                output_items.append(reasoning_item)
+        else:
+            raise ValueError(f"Unknown recipient: {message.recipient}")
+    elif message.channel == "final":
+        contents = []
+        for content in message.content:
+            output_text = ResponseOutputText(
+                text=content.text,
+                annotations=[],  # TODO
+                type="output_text",
+                logprobs=None,  # TODO
+            )
+            contents.append(output_text)
+        text_item = ResponseOutputMessage(
+            id=f"msg_{random_uuid()}",
+            content=contents,
+            role=message.author.role,
+            status="completed",
+            type="message",
+        )
+        output_items.append(text_item)
+    else:
+        raise ValueError(f"Unknown channel: {message.channel}")
+    return output_items
+
+
+def parse_remaining_state(parser: StreamableParser):
+    if not parser.current_content:
+        return []
+    if parser.current_role != Role.ASSISTANT:
+        return []
+    current_recipient = parser.current_recipient
+    if current_recipient is not None and current_recipient.startswith("browser."):
+        return []
+
+    if parser.current_channel == "analysis":
+        reasoning_item = ResponseReasoningItem(
+            id=f"rs_{random_uuid()}",
+            type="reasoning",
+            summary=[],
+            content=[
+                ResponseReasoningTextContent(
+                    text=parser.current_content, type="reasoning_text"
+                )
+            ],
+            status=None,
+        )
+        return [reasoning_item]
+    elif parser.current_channel == "final":
+        output_text = ResponseOutputText(
+            text=parser.current_content,
+            annotations=[],  # TODO
+            type="output_text",
+            logprobs=None,  # TODO
+        )
+        text_item = ResponseOutputMessage(
+            id=f"msg_{random_uuid()}",
+            content=[output_text],
+            role="assistant",
+            status="completed",
+            type="message",
+        )
+        return [text_item]
+    return []
+
+
+def parse_output_into_messages(token_ids: Iterable[int]):
+    parser = get_streamable_parser_for_assistant()
+    for token_id in token_ids:
+        parser.process(token_id)
+    return parser
diff --git a/sglang/python/sglang/srt/entrypoints/http_server.py b/sglang/python/sglang/srt/entrypoints/http_server.py
new file mode 100644
index 0000000000000000000000000000000000000000..d57108c410ac0cb8f9f5800caee89ab3f1526a00
--- /dev/null
+++ b/sglang/python/sglang/srt/entrypoints/http_server.py
@@ -0,0 +1,2188 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+The entry point of inference server. (SRT = SGLang Runtime)
+
+This file implements HTTP APIs for the inference engine via fastapi.
+"""
+
+import asyncio
+import dataclasses
+import logging
+import os
+import tempfile
+import threading
+import time
+from contextlib import asynccontextmanager
+from http import HTTPStatus
+from typing import (
+    Any,
+    AsyncGenerator,
+    AsyncIterator,
+    Callable,
+    Dict,
+    List,
+    Optional,
+    Union,
+)
+
+# Fix a bug of Python threading
+setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
+
+
+import numpy as np
+import orjson
+import requests
+import uvicorn
+import uvloop
+from fastapi import (
+    Depends,
+    FastAPI,
+    File,
+    Form,
+    HTTPException,
+    Query,
+    Request,
+    UploadFile,
+)
+from fastapi.exceptions import RequestValidationError
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import ORJSONResponse, Response, StreamingResponse
+
+from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST, DisaggregationMode
+from sglang.srt.entrypoints.anthropic.protocol import (
+    AnthropicCountTokensRequest,
+    AnthropicMessagesRequest,
+)
+from sglang.srt.entrypoints.anthropic.serving import AnthropicServing
+from sglang.srt.entrypoints.engine import (
+    Engine,
+    init_tokenizer_manager,
+    run_detokenizer_process,
+    run_scheduler_process,
+)
+from sglang.srt.entrypoints.ollama.protocol import (
+    OllamaChatRequest,
+    OllamaGenerateRequest,
+    OllamaShowRequest,
+)
+from sglang.srt.entrypoints.ollama.serving import OllamaServing
+from sglang.srt.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    ClassifyRequest,
+    CompletionRequest,
+    DetokenizeRequest,
+    EmbeddingRequest,
+    ErrorResponse,
+    ModelCard,
+    ModelList,
+    ResponsesRequest,
+    ScoringRequest,
+    TokenizeRequest,
+    V1RerankReqInput,
+)
+from sglang.srt.entrypoints.openai.serving_chat import OpenAIServingChat
+from sglang.srt.entrypoints.openai.serving_classify import OpenAIServingClassify
+from sglang.srt.entrypoints.openai.serving_completions import OpenAIServingCompletion
+from sglang.srt.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
+from sglang.srt.entrypoints.openai.serving_rerank import OpenAIServingRerank
+from sglang.srt.entrypoints.openai.serving_score import OpenAIServingScore
+from sglang.srt.entrypoints.openai.serving_tokenize import (
+    OpenAIServingDetokenize,
+    OpenAIServingTokenize,
+)
+from sglang.srt.entrypoints.openai.serving_transcription import (
+    OpenAIServingTranscription,
+)
+from sglang.srt.entrypoints.warmup import execute_warmups
+from sglang.srt.environ import envs
+from sglang.srt.function_call.function_call_parser import FunctionCallParser
+from sglang.srt.managers.io_struct import (
+    AbortReq,
+    AttachHiCacheStorageReqInput,
+    CheckWeightsReqInput,
+    CloseSessionReqInput,
+    ConfigureLoggingReq,
+    ContinueGenerationReqInput,
+    DestroyWeightsUpdateGroupReqInput,
+    DumperControlReqInput,
+    EmbeddingReqInput,
+    GenerateReqInput,
+    GetWeightsByNameReqInput,
+    InitWeightsSendGroupForRemoteInstanceReqInput,
+    InitWeightsUpdateGroupReqInput,
+    LoadLoRAAdapterFromTensorsReqInput,
+    LoadLoRAAdapterReqInput,
+    OpenSessionReqInput,
+    ParseFunctionCallReq,
+    PauseGenerationReqInput,
+    PinPrefixReqInput,
+    ProfileReqInput,
+    ReleaseMemoryOccupationReqInput,
+    ResumeMemoryOccupationReqInput,
+    SendWeightsToRemoteInstanceReqInput,
+    SeparateReasoningReqInput,
+    SetInternalStateReq,
+    SlowDownReqInput,
+    UnloadLoRAAdapterReqInput,
+    UpdateWeightFromDiskReqInput,
+    UpdateWeightsFromDistributedReqInput,
+    UpdateWeightsFromIPCReqInput,
+    UpdateWeightsFromTensorReqInput,
+    UpdateWeightVersionReqInput,
+    VertexGenerateReqInput,
+)
+from sglang.srt.managers.multi_tokenizer_mixin import (
+    MultiTokenizerRouter,
+    TokenizerWorker,
+    get_main_process_id,
+    monkey_patch_uvicorn_multiprocessing,
+    read_from_shared_memory,
+    write_data_for_multi_tokenizer,
+)
+from sglang.srt.managers.template_manager import TemplateManager
+from sglang.srt.managers.tokenizer_manager import ServerStatus, TokenizerManager
+from sglang.srt.model_loader.remote_instance_weight_loader_utils import (
+    parse_remote_instance_transfer_engine_info_from_scheduler_infos,
+)
+from sglang.srt.observability.func_timer import enable_func_timer
+from sglang.srt.observability.trace import (
+    process_tracing_init,
+    set_global_trace_level,
+    trace_set_thread_info,
+)
+from sglang.srt.parser.reasoning_parser import ReasoningParser
+from sglang.srt.server_args import PortArgs, ServerArgs
+from sglang.srt.utils import (
+    add_prometheus_middleware,
+    add_prometheus_track_response_middleware,
+    delete_directory,
+    get_bool_env_var,
+    kill_process_tree,
+    set_uvicorn_logging_configs,
+)
+from sglang.srt.utils.auth import AuthLevel, app_has_admin_force_endpoints, auth_level
+from sglang.utils import _prebind_listening_socket, get_exception_traceback
+from sglang.version import __version__
+
+logger = logging.getLogger(__name__)
+asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
+
+# Global constants
+HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))
+WAIT_WEIGHTS_READY_TIMEOUT = int(os.getenv("SGLANG_WAIT_WEIGHTS_READY_TIMEOUT", 120))
+
+
+# Store global states
+@dataclasses.dataclass
+class _GlobalState:
+    tokenizer_manager: Union[TokenizerManager, MultiTokenizerRouter, TokenizerWorker]
+    template_manager: TemplateManager
+    scheduler_info: Dict
+    # Dict{
+    #   rank: Tuple(
+    #           session_id,
+    #           Dict{
+    #               name: Tuple (d_ptr, numel, element_size)
+    #           }
+    #         )
+    # }
+    remote_instance_transfer_engine_info: Optional[Dict] = None
+
+
+_global_state: Optional[_GlobalState] = None
+
+
+def set_global_state(global_state: _GlobalState):
+    global _global_state
+    _global_state = global_state
+
+
+def get_global_state() -> _GlobalState:
+    return _global_state
+
+
+async def init_multi_tokenizer() -> ServerArgs:
+    """
+    Initialization function for multi-process tokenizer mode.
+    It read args information from shm and inits tokenizer manager for current process.
+    """
+
+    # Read configuration from shared memory
+    main_pid = get_main_process_id()
+    port_args, server_args, scheduler_info = read_from_shared_memory(
+        f"multi_tokenizer_args_{main_pid}"
+    )
+    server_args: ServerArgs
+    port_args: PortArgs
+
+    # API key authentication is not supported in multi-tokenizer mode
+    assert (
+        server_args.api_key is None
+    ), "API key is not supported in multi-tokenizer mode"
+
+    # Create a new ipc name for the current process
+    port_args.tokenizer_ipc_name = (
+        f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
+    )
+    logger.info(
+        f"Start multi-tokenizer worker process {os.getpid()}, "
+        f"ipc_name={port_args.tokenizer_ipc_name}"
+    )
+
+    # Launch multi-tokenizer manager process
+    tokenizer_manager = TokenizerWorker(server_args, port_args)
+    template_manager = TemplateManager()
+    template_manager.initialize_templates(
+        tokenizer_manager=tokenizer_manager,
+        model_path=server_args.model_path,
+        chat_template=server_args.chat_template,
+        completion_template=server_args.completion_template,
+    )
+
+    tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"]
+
+    set_global_state(
+        _GlobalState(
+            tokenizer_manager=tokenizer_manager,
+            template_manager=template_manager,
+            scheduler_info=scheduler_info,
+        )
+    )
+
+    return server_args
+
+
+@asynccontextmanager
+async def lifespan(fast_api_app: FastAPI):
+    if getattr(fast_api_app, "is_single_tokenizer_mode", False):
+        server_args = fast_api_app.server_args
+        warmup_thread_kwargs = fast_api_app.warmup_thread_kwargs
+        thread_label = "Tokenizer"
+    else:
+        # Initialize multi-tokenizer support for worker processes
+        server_args = await init_multi_tokenizer()
+        warmup_thread_kwargs = dict(server_args=server_args)
+        thread_label = f"MultiTokenizer-{_global_state.tokenizer_manager.worker_id}"
+
+    # Add prometheus middleware
+    if server_args.enable_metrics:
+        add_prometheus_middleware(app)
+        enable_func_timer()
+
+    # Init tracing
+    if server_args.enable_trace:
+        process_tracing_init(server_args.otlp_traces_endpoint, "sglang")
+        if server_args.disaggregation_mode == "prefill":
+            thread_label = "Prefill" + thread_label
+        elif server_args.disaggregation_mode == "decode":
+            thread_label = "Decode" + thread_label
+        trace_set_thread_info(thread_label)
+
+    # Initialize OpenAI serving handlers
+    fast_api_app.state.openai_serving_completion = OpenAIServingCompletion(
+        _global_state.tokenizer_manager, _global_state.template_manager
+    )
+    fast_api_app.state.openai_serving_chat = OpenAIServingChat(
+        _global_state.tokenizer_manager, _global_state.template_manager
+    )
+    fast_api_app.state.openai_serving_embedding = OpenAIServingEmbedding(
+        _global_state.tokenizer_manager, _global_state.template_manager
+    )
+    fast_api_app.state.openai_serving_classify = OpenAIServingClassify(
+        _global_state.tokenizer_manager, _global_state.template_manager
+    )
+    fast_api_app.state.openai_serving_score = OpenAIServingScore(
+        _global_state.tokenizer_manager
+    )
+    fast_api_app.state.openai_serving_rerank = OpenAIServingRerank(
+        _global_state.tokenizer_manager, _global_state.template_manager
+    )
+    fast_api_app.state.openai_serving_tokenize = OpenAIServingTokenize(
+        _global_state.tokenizer_manager
+    )
+    fast_api_app.state.openai_serving_detokenize = OpenAIServingDetokenize(
+        _global_state.tokenizer_manager
+    )
+    fast_api_app.state.openai_serving_transcription = OpenAIServingTranscription(
+        _global_state.tokenizer_manager
+    )
+
+    # Initialize Ollama-compatible serving handler
+    fast_api_app.state.ollama_serving = OllamaServing(_global_state.tokenizer_manager)
+
+    # Initialize Anthropic-compatible serving handler
+    fast_api_app.state.anthropic_serving = AnthropicServing(
+        fast_api_app.state.openai_serving_chat
+    )
+
+    # Launch tool server
+    tool_server = None
+    if server_args.tool_server == "demo":
+        from sglang.srt.entrypoints.openai.tool_server import DemoToolServer
+
+        tool_server = DemoToolServer()
+    elif server_args.tool_server:
+        from sglang.srt.entrypoints.openai.tool_server import MCPToolServer
+
+        tool_server = MCPToolServer()
+        await tool_server.add_tool_server(server_args.tool_server)
+
+    try:
+        from sglang.srt.entrypoints.openai.serving_responses import (
+            OpenAIServingResponses,
+        )
+
+        fast_api_app.state.openai_serving_responses = OpenAIServingResponses(
+            _global_state.tokenizer_manager,
+            _global_state.template_manager,
+            enable_prompt_tokens_details=True,
+            enable_force_include_usage=True,
+            tool_server=tool_server,
+        )
+    except Exception:
+        traceback = get_exception_traceback()
+        logger.warning(f"Can not initialize OpenAIServingResponses, error: {traceback}")
+
+    # Execute custom warmups
+    if server_args.warmups is not None:
+        await execute_warmups(
+            server_args.disaggregation_mode,
+            server_args.warmups.split(","),
+            _global_state.tokenizer_manager,
+        )
+        logger.info("Warmup ended")
+
+    # Execute the general warmup
+    warmup_thread = threading.Thread(
+        target=_wait_and_warmup,
+        kwargs=warmup_thread_kwargs,
+    )
+    warmup_thread.start()
+
+    # Start the HTTP server
+    try:
+        yield
+    finally:
+        warmup_thread.join()
+
+
+# Fast API
+app = FastAPI(
+    lifespan=lifespan,
+    openapi_url=None if get_bool_env_var("DISABLE_OPENAPI_DOC") else "/openapi.json",
+)
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# Include routers
+from sglang.srt.entrypoints.v1_loads import router as v1_loads_router
+
+app.include_router(v1_loads_router)
+
+
+@app.exception_handler(HTTPException)
+async def validation_exception_handler(request: Request, exc: HTTPException):
+    """Enrich HTTP exception with status code and other details.
+
+    For /v1/responses, emit OpenAI-style nested error envelope:
+    {"error": {"message": "...", "type": "...", "param": null, "code": <status>}}
+    """
+    # adjust fmt for responses api
+    if request.url.path.startswith("/v1/responses"):
+        nested_error = {
+            "message": exc.detail,
+            "type": HTTPStatus(exc.status_code).phrase,
+            "param": None,
+            "code": exc.status_code,
+        }
+        return ORJSONResponse(
+            content={"error": nested_error}, status_code=exc.status_code
+        )
+
+    error = ErrorResponse(
+        object="error",
+        message=exc.detail,
+        type=str(exc.status_code),
+        code=exc.status_code,
+    )
+    return ORJSONResponse(content=error.model_dump(), status_code=exc.status_code)
+
+
+# Custom exception handlers to change validation error status codes
+@app.exception_handler(RequestValidationError)
+async def validation_exception_handler(request: Request, exc: RequestValidationError):
+    """Override FastAPI's default 422 validation error with 400.
+
+    For /v1/responses, emit OpenAI-style nested error envelope; for other endpoints keep legacy format.
+    """
+    exc_str = str(exc)
+    errors_str = str(exc.errors())
+
+    if errors_str and errors_str != exc_str:
+        message = f"{exc_str} {errors_str}"
+    else:
+        message = exc_str
+
+    if request.url.path.startswith("/v1/responses"):
+        # adapt specially, for v1/responses API only (notice the error key is different)
+        nested_error = {
+            "message": message,
+            "type": HTTPStatus.BAD_REQUEST.phrase,
+            "param": None,
+            "code": HTTPStatus.BAD_REQUEST.value,
+        }
+        return ORJSONResponse(status_code=400, content={"error": nested_error})
+
+    err = ErrorResponse(
+        message=message,
+        type=HTTPStatus.BAD_REQUEST.phrase,
+        code=HTTPStatus.BAD_REQUEST.value,
+    )
+
+    return ORJSONResponse(
+        status_code=400,
+        content=err.model_dump(),
+    )
+
+
+async def validate_json_request(raw_request: Request):
+    """Validate that the request content-type is application/json."""
+    content_type = raw_request.headers.get("content-type", "").lower()
+    media_type = content_type.split(";", maxsplit=1)[0]
+    if media_type != "application/json":
+        raise RequestValidationError(
+            errors=[
+                {
+                    "loc": ["header", "content-type"],
+                    "msg": "Unsupported Media Type: Only 'application/json' is allowed",
+                    "type": "value_error",
+                }
+            ]
+        )
+
+
+##### Native API endpoints #####
+
+
+@app.get("/health")
+@app.get("/health_generate")
+async def health_generate(request: Request) -> Response:
+    """
+    Check the health of the inference server by sending a special request to generate one token.
+
+    If the server is running something, this request will be ignored, so it creates zero overhead.
+    If the server is not running anything, this request will be run, so we know whether the server is healthy.
+    """
+
+    if _global_state.tokenizer_manager.gracefully_exit:
+        logger.info("Health check request received during shutdown. Returning 503.")
+        return Response(status_code=503)
+
+    if _global_state.tokenizer_manager.server_status == ServerStatus.Starting:
+        return Response(status_code=503)
+
+    if (
+        not envs.SGLANG_ENABLE_HEALTH_ENDPOINT_GENERATION.get()
+        and request.url.path == "/health"
+    ):
+        return Response(status_code=200)
+
+    sampling_params = {"max_new_tokens": 1, "temperature": 0.0}
+    rid = f"HEALTH_CHECK_{time.time()}"
+
+    if _global_state.tokenizer_manager.is_image_gen:
+        gri = _global_state.tokenizer_manager.get_image_gen_health_check_request(
+            rid, sampling_params
+        )
+    elif _global_state.tokenizer_manager.is_generation:
+        gri = GenerateReqInput(
+            rid=rid,
+            input_ids=[0],
+            sampling_params=sampling_params,
+            log_metrics=False,
+        )
+        if (
+            _global_state.tokenizer_manager.server_args.disaggregation_mode
+            != DisaggregationMode.NULL.value
+        ):
+            gri.bootstrap_host = FAKE_BOOTSTRAP_HOST
+            gri.bootstrap_room = 0
+    else:
+        gri = EmbeddingReqInput(
+            rid=rid, input_ids=[0], sampling_params=sampling_params, log_metrics=False
+        )
+
+    async def gen():
+        async for _ in _global_state.tokenizer_manager.generate_request(gri, request):
+            break
+
+    task = asyncio.create_task(gen())
+
+    # As long as we receive any response from the detokenizer/scheduler, we consider the server is healthy.
+    tic = time.time()
+    while time.time() < tic + HEALTH_CHECK_TIMEOUT:
+        await asyncio.sleep(1)
+        if _global_state.tokenizer_manager.last_receive_tstamp > tic:
+            task.cancel()
+            _global_state.tokenizer_manager.rid_to_state.pop(rid, None)
+            _global_state.tokenizer_manager.server_status = ServerStatus.Up
+            return Response(status_code=200)
+
+    task.cancel()
+    tic_time = time.strftime("%H:%M:%S", time.localtime(tic))
+    last_receive_time = time.strftime(
+        "%H:%M:%S", time.localtime(_global_state.tokenizer_manager.last_receive_tstamp)
+    )
+    logger.error(
+        f"Health check failed. Server couldn't get a response from detokenizer for last "
+        f"{HEALTH_CHECK_TIMEOUT} seconds. tic start time: {tic_time}. "
+        f"last_heartbeat time: {last_receive_time}"
+    )
+    _global_state.tokenizer_manager.rid_to_state.pop(rid, None)
+    _global_state.tokenizer_manager.server_status = ServerStatus.UnHealthy
+    return Response(status_code=503)
+
+
+@app.get("/get_model_info")
+async def get_model_info():
+    """Get the model information (deprecated - use /model_info instead)."""
+    logger.warning(
+        "Endpoint '/get_model_info' is deprecated and will be removed in a future version. "
+        "Please use '/model_info' instead."
+    )
+    return await model_info()
+
+
+@app.get("/model_info")
+async def model_info():
+    """Get the model information."""
+    model_config = _global_state.tokenizer_manager.model_config
+    result = {
+        "model_path": _global_state.tokenizer_manager.model_path,
+        "tokenizer_path": _global_state.tokenizer_manager.server_args.tokenizer_path,
+        "is_generation": _global_state.tokenizer_manager.is_generation,
+        "preferred_sampling_params": _global_state.tokenizer_manager.server_args.preferred_sampling_params,
+        "weight_version": _global_state.tokenizer_manager.server_args.weight_version,
+        "has_image_understanding": model_config.is_image_understandable_model,
+        "has_audio_understanding": model_config.is_audio_understandable_model,
+        "model_type": getattr(model_config.hf_config, "model_type", None),
+        "architectures": getattr(model_config.hf_config, "architectures", None),
+        "weight_version": _global_state.tokenizer_manager.server_args.weight_version,
+        # "hf_config": model_config.hf_config.to_dict(),
+    }
+    return result
+
+
+@app.get("/get_weight_version")
+@app.get("/weight_version")
+async def weight_version():
+    """Get the current weight version."""
+    raise HTTPException(
+        status_code=404,
+        detail="Endpoint '/get_weight_version' or '/weight_version' is deprecated. Please use '/model_info' instead.",
+    )
+
+
+@app.get("/get_server_info")
+async def get_server_info():
+    """Get the server information (deprecated - use /server_info instead)."""
+    logger.warning(
+        "Endpoint '/get_server_info' is deprecated and will be removed in a future version. "
+        "Please use '/server_info' instead."
+    )
+    return await server_info()
+
+
+@app.get("/server_info")
+async def server_info():
+    """Get the server information."""
+    # Returns internal states per DP.
+    internal_states: List[Dict[Any, Any]] = (
+        await _global_state.tokenizer_manager.get_internal_state()
+    )
+
+    # This field is not serializable.
+    if hasattr(_global_state.tokenizer_manager.server_args, "model_config"):
+        del _global_state.tokenizer_manager.server_args.model_config
+
+    return {
+        **dataclasses.asdict(_global_state.tokenizer_manager.server_args),
+        **_global_state.scheduler_info,
+        "internal_states": internal_states,
+        "version": __version__,
+    }
+
+
+@app.get("/get_load")
+async def get_load():
+    """Get load metrics (deprecated - use /v1/loads instead)."""
+    logger.warning(
+        "Endpoint '/get_load' is deprecated and will be removed in a future version. "
+        "Please use '/v1/loads' instead."
+    )
+    return await _global_state.tokenizer_manager.get_load()
+
+
+# example usage:
+# curl -s -X POST http://localhost:30000/set_internal_state -H "Content-Type: application/json" -d '{"server_args": {"pp_max_micro_batch_size": 8}}'
+@app.api_route("/set_internal_state", methods=["POST", "PUT"])
+@auth_level(AuthLevel.ADMIN_OPTIONAL)
+async def set_internal_state(obj: SetInternalStateReq, request: Request):
+    res = await _global_state.tokenizer_manager.set_internal_state(obj)
+    return res
+
+
+# Do not import `dumper.py` to avoid dependency
+if os.environ.get("DUMPER_SERVER_PORT") == "reuse":
+
+    @app.api_route("/dumper/{method}", methods=["POST"])
+    @auth_level(AuthLevel.ADMIN_OPTIONAL)
+    async def _dumper_control_handler(method: str, request: Request):
+        body_bytes = await request.body()
+        body = await request.json() if body_bytes else {}
+        obj = DumperControlReqInput(method=method, body=body)
+        results = await _global_state.tokenizer_manager.dumper_control(obj)
+        if any(not r.success for r in results):
+            errors = [r.error for r in results if not r.success]
+            return ORJSONResponse(status_code=400, content={"error": errors})
+        return [x for result in results for x in result.response]
+
+
+# fastapi implicitly converts json in the request to obj (dataclass)
+@app.api_route("/generate", methods=["POST", "PUT"])
+async def generate_request(obj: GenerateReqInput, request: Request):
+    """Handle a generate request."""
+    if obj.stream:
+
+        async def stream_results() -> AsyncIterator[bytes]:
+            try:
+                async for out in _global_state.tokenizer_manager.generate_request(
+                    obj, request
+                ):
+                    yield b"data: " + orjson.dumps(
+                        out, option=orjson.OPT_NON_STR_KEYS | orjson.OPT_SERIALIZE_NUMPY
+                    ) + b"\n\n"
+            except ValueError as e:
+                out = {"error": {"message": str(e)}}
+                logger.error(f"[http_server] Error: {e}")
+                yield b"data: " + orjson.dumps(
+                    out, option=orjson.OPT_NON_STR_KEYS | orjson.OPT_SERIALIZE_NUMPY
+                ) + b"\n\n"
+            yield b"data: [DONE]\n\n"
+
+        return StreamingResponse(
+            stream_results(),
+            media_type="text/event-stream",
+            background=_global_state.tokenizer_manager.create_abort_task(obj),
+        )
+    else:
+        try:
+            ret = await _global_state.tokenizer_manager.generate_request(
+                obj, request
+            ).__anext__()
+            return Response(
+                content=orjson.dumps(
+                    ret, option=orjson.OPT_NON_STR_KEYS | orjson.OPT_SERIALIZE_NUMPY
+                ),
+                media_type="application/json",
+            )
+        except ValueError as e:
+            logger.error(f"[http_server] Error: {e}")
+            return _create_error_response(e)
+
+
+@app.api_route("/encode", methods=["POST", "PUT"])
+async def encode_request(obj: EmbeddingReqInput, request: Request):
+    """Handle an embedding request."""
+    try:
+        ret = await _global_state.tokenizer_manager.generate_request(
+            obj, request
+        ).__anext__()
+        return ret
+    except ValueError as e:
+        return _create_error_response(e)
+
+
+@app.api_route("/classify", methods=["POST", "PUT"])
+async def classify_request(obj: EmbeddingReqInput, request: Request):
+    """Handle a reward model request. Now the arguments and return values are the same as embedding models."""
+    try:
+        ret = await _global_state.tokenizer_manager.generate_request(
+            obj, request
+        ).__anext__()
+        return ret
+    except ValueError as e:
+        return _create_error_response(e)
+
+
+@app.api_route("/flush_cache", methods=["GET", "POST"])
+@auth_level(AuthLevel.ADMIN_OPTIONAL)
+async def flush_cache():
+    """Flush the radix cache."""
+    ret = await _global_state.tokenizer_manager.flush_cache()
+    return Response(
+        content="Cache flushed.\nPlease check backend logs for more details. "
+        "(When there are running or waiting requests, the operation will not be performed.)\n",
+        status_code=200 if ret.success else HTTPStatus.BAD_REQUEST,
+    )
+
+
+@app.api_route("/clear_hicache_storage_backend", methods=["GET", "POST"])
+@auth_level(AuthLevel.ADMIN_OPTIONAL)
+async def clear_hicache_storage_backend_deprecated():
+    """Deprecated: use POST /hicache/storage-backend/clear."""
+    ret = await _global_state.tokenizer_manager.clear_hicache_storage()
+    return Response(
+        content=(
+            "Deprecated endpoint. Use POST /hicache/storage-backend/clear.\n"
+            "Hierarchical cache storage backend cleared.\n"
+        ),
+        status_code=200 if ret.success else HTTPStatus.BAD_REQUEST,
+    )
+
+
+# example usage:
+# curl -s -X POST http://127.0.0.1:30000/clear_hicache_storage_backend
+@app.api_route("/hicache/storage-backend/clear", methods=["POST"])
+@auth_level(AuthLevel.ADMIN_OPTIONAL)
+async def clear_hicache_storage_backend():
+    """Clear the hierarchical cache storage backend."""
+    ret = await _global_state.tokenizer_manager.clear_hicache_storage()
+    return Response(
+        content="Hierarchical cache storage backend cleared.\n",
+        status_code=200 if ret.success else HTTPStatus.BAD_REQUEST,
+    )
+
+
+# example usage:
+# curl -s -X PUT http://127.0.0.1:30000/hicache/storage-backend \
+#  -H 'Content-Type: application/json' \
+#   -d '{
+#     "hicache_storage_backend": "file",
+#     "hicache_storage_backend_extra_config_json": "{}",
+#     "hicache_storage_prefetch_policy": "timeout",
+#     "hicache_write_policy": "write_through"
+#   }'
+@app.api_route("/hicache/storage-backend", methods=["PUT"])
+@auth_level(AuthLevel.ADMIN_OPTIONAL)
+async def attach_hicache_storage_backend(obj: AttachHiCacheStorageReqInput):
+    """Attach (enable) HiCache storage backend at runtime.
+
+    Only allowed when there are NO running / queued requests.
+    """
+    if not _global_state.tokenizer_manager.server_args.admin_api_key:
+        return _admin_api_key_missing_response()
+
+    ret = await _global_state.tokenizer_manager.attach_hicache_storage(
+        hicache_storage_backend=obj.hicache_storage_backend,
+        hicache_storage_backend_extra_config_json=obj.hicache_storage_backend_extra_config_json,
+        hicache_storage_prefetch_policy=obj.hicache_storage_prefetch_policy,
+        hicache_write_policy=obj.hicache_write_policy,
+    )
+    msg = getattr(ret, "message", "")
+    return Response(
+        content=(
+            (
+                "HiCache storage backend attached.\n"
+                if ret.success
+                else "Failed to attach HiCache storage backend.\n"
+            )
+            + (msg + "\n" if msg else "")
+        ),
+        status_code=200 if ret.success else HTTPStatus.BAD_REQUEST,
+    )
+
+
+# example usage:
+# curl -s -X DELETE http://127.0.0.1:30000/hicache/storage-backend
+@app.api_route("/hicache/storage-backend", methods=["DELETE"])
+@auth_level(AuthLevel.ADMIN_OPTIONAL)
+async def detach_hicache_storage_backend():
+    """Detach (disable) HiCache storage backend at runtime.
+
+    Only allowed when there are NO running / queued requests.
+    """
+    if not _global_state.tokenizer_manager.server_args.admin_api_key:
+        return _admin_api_key_missing_response()
+
+    ret = await _global_state.tokenizer_manager.detach_hicache_storage()
+    msg = getattr(ret, "message", "")
+    return Response(
+        content=(
+            (
+                "HiCache storage backend detached.\n"
+                if ret.success
+                else "Failed to detach HiCache storage backend.\n"
+            )
+            + (msg + "\n" if msg else "")
+        ),
+        status_code=200 if ret.success else HTTPStatus.BAD_REQUEST,
+    )
+
+
+# example usage:
+# curl -s http://127.0.0.1:30000/hicache/storage-backend
+@app.get("/hicache/storage-backend")
+@auth_level(AuthLevel.ADMIN_OPTIONAL)
+async def hicache_storage_backend_status():
+    """Get current HiCache storage backend status (tokenizer-side view)."""
+    if not _global_state.tokenizer_manager.server_args.admin_api_key:
+        return _admin_api_key_missing_response()
+
+    return {
+        "hicache_storage_backend": _global_state.tokenizer_manager.server_args.hicache_storage_backend,
+        "hicache_storage_backend_extra_config": _global_state.tokenizer_manager.server_args.hicache_storage_backend_extra_config,
+        "hicache_storage_prefetch_policy": _global_state.tokenizer_manager.server_args.hicache_storage_prefetch_policy,
+        "hicache_write_policy": _global_state.tokenizer_manager.server_args.hicache_write_policy,
+    }
+
+
+@app.api_route("/hicache/pin_prefix", methods=["POST"])
+@auth_level(AuthLevel.ADMIN_OPTIONAL)
+async def pin_prefix(obj: PinPrefixReqInput):
+    """Pin a prefix by token_ids to resist eviction."""
+    if not _global_state.tokenizer_manager.server_args.admin_api_key:
+        return _admin_api_key_missing_response()
+    ret = await _global_state.tokenizer_manager.pin_prefix(
+        obj.token_ids, obj.ttl_seconds
+    )
+    return ORJSONResponse(
+        content={
+            "status": "ok" if ret.success else "error",
+            "nodes_pinned": ret.nodes_pinned,
+            "message": ret.message,
+        },
+        status_code=200 if ret.success else HTTPStatus.BAD_REQUEST,
+    )
+
+
+@app.api_route("/start_profile", methods=["GET", "POST"])
+@auth_level(AuthLevel.ADMIN_OPTIONAL)
+async def start_profile_async(obj: Optional[ProfileReqInput] = None):
+    """Start profiling."""
+    if obj is None:
+        obj = ProfileReqInput()
+
+    await _global_state.tokenizer_manager.start_profile(
+        output_dir=obj.output_dir,
+        start_step=obj.start_step,
+        num_steps=obj.num_steps,
+        activities=obj.activities,
+        with_stack=obj.with_stack,
+        record_shapes=obj.record_shapes,
+        profile_by_stage=obj.profile_by_stage,
+        merge_profiles=obj.merge_profiles,
+        profile_prefix=obj.profile_prefix,
+        profile_stages=obj.profile_stages,
+    )
+    return Response(
+        content="Start profiling.\n",
+        status_code=200,
+    )
+
+
+@app.api_route("/stop_profile", methods=["GET", "POST"])
+@auth_level(AuthLevel.ADMIN_OPTIONAL)
+async def stop_profile_async():
+    """Stop profiling."""
+    await _global_state.tokenizer_manager.stop_profile()
+    return Response(
+        content="Stop profiling. This will take some time.\n",
+        status_code=200,
+    )
+
+
+@app.api_route("/set_trace_level", methods=["GET", "POST"])
+def set_trace_level(level: int = Query(..., ge=0)):
+    set_global_trace_level(level)
+
+    return Response(
+        content="success",
+        status_code=200,
+    )
+
+
+@app.api_route("/freeze_gc", methods=["GET", "POST"])
+@auth_level(AuthLevel.ADMIN_OPTIONAL)
+async def freeze_gc_async():
+    """
+    See engine.freeze_gc for more details.
+    """
+    await _global_state.tokenizer_manager.freeze_gc()
+    return Response(
+        content="Garbage collection frozen.\n",
+        status_code=200,
+    )
+
+
+@app.api_route("/start_expert_distribution_record", methods=["GET", "POST"])
+@auth_level(AuthLevel.ADMIN_OPTIONAL)
+async def start_expert_distribution_record_async():
+    """Start recording the expert distribution. Clear the previous record if any."""
+    await _global_state.tokenizer_manager.start_expert_distribution_record()
+    return Response(
+        content="Start recording the expert distribution.\n",
+        status_code=200,
+    )
+
+
+@app.api_route("/stop_expert_distribution_record", methods=["GET", "POST"])
+@auth_level(AuthLevel.ADMIN_OPTIONAL)
+async def stop_expert_distribution_record_async():
+    """Stop recording the expert distribution."""
+    await _global_state.tokenizer_manager.stop_expert_distribution_record()
+    return Response(
+        content="Stop recording the expert distribution.\n",
+        status_code=200,
+    )
+
+
+@app.api_route("/dump_expert_distribution_record", methods=["GET", "POST"])
+@auth_level(AuthLevel.ADMIN_OPTIONAL)
+async def dump_expert_distribution_record_async():
+    """Dump expert distribution record."""
+    await _global_state.tokenizer_manager.dump_expert_distribution_record()
+    return Response(
+        content="Dump expert distribution record.\n",
+        status_code=200,
+    )
+
+
+@app.post("/update_weights_from_disk")
+@auth_level(AuthLevel.ADMIN_OPTIONAL)
+async def update_weights_from_disk(obj: UpdateWeightFromDiskReqInput, request: Request):
+    """Update the weights from disk inplace without re-launching the server."""
+    success, message, num_paused_requests = (
+        await _global_state.tokenizer_manager.update_weights_from_disk(obj, request)
+    )
+
+    content = {
+        "success": success,
+        "message": message,
+        "num_paused_requests": num_paused_requests,
+    }
+    if success:
+        return ORJSONResponse(
+            content,
+            status_code=HTTPStatus.OK,
+        )
+    else:
+        return ORJSONResponse(
+            content,
+            status_code=HTTPStatus.BAD_REQUEST,
+        )
+
+
+@app.post("/init_weights_send_group_for_remote_instance")
+@auth_level(AuthLevel.ADMIN_OPTIONAL)
+async def init_weights_send_group_for_remote_instance(
+    obj: InitWeightsSendGroupForRemoteInstanceReqInput, request: Request
+):
+    success, message = (
+        await _global_state.tokenizer_manager.init_weights_send_group_for_remote_instance(
+            obj, request
+        )
+    )
+    content = {"success": success, "message": message}
+    if success:
+        return ORJSONResponse(content, status_code=200)
+    else:
+        return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
+
+
+@app.post("/send_weights_to_remote_instance")
+@auth_level(AuthLevel.ADMIN_OPTIONAL)
+async def send_weights_to_remote_instance(
+    obj: SendWeightsToRemoteInstanceReqInput, request: Request
+):
+    success, message = (
+        await _global_state.tokenizer_manager.send_weights_to_remote_instance(
+            obj, request
+        )
+    )
+    content = {"success": success, "message": message}
+    if success:
+        return ORJSONResponse(content, status_code=200)
+    else:
+        return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
+
+
+@app.get("/get_remote_instance_transfer_engine_info")
+@auth_level(AuthLevel.ADMIN_OPTIONAL)
+async def get_remote_instance_transfer_engine_info(rank: int = None):
+    if rank is None or rank < 0:
+        return Response(status_code=HTTPStatus.BAD_REQUEST)
+
+    if (
+        _global_state.remote_instance_transfer_engine_info is None
+        or len(_global_state.remote_instance_transfer_engine_info) == 0
+    ):
+        return Response(status_code=HTTPStatus.BAD_REQUEST)
+
+    try:
+        result = {
+            "rank": rank,
+            "remote_instance_transfer_engine_info": _global_state.remote_instance_transfer_engine_info[
+                rank
+            ],
+        }
+        return result
+    except Exception as e:
+        logger.error(f"Exception: {e}")
+        return Response(status_code=HTTPStatus.BAD_REQUEST)
+
+
+@app.post("/init_weights_update_group")
+@auth_level(AuthLevel.ADMIN_OPTIONAL)
+async def init_weights_update_group(
+    obj: InitWeightsUpdateGroupReqInput, request: Request
+):
+    """Initialize the parameter update group."""
+    success, message = await _global_state.tokenizer_manager.init_weights_update_group(
+        obj, request
+    )
+    content = {"success": success, "message": message}
+    if success:
+        return ORJSONResponse(content, status_code=200)
+    else:
+        return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
+
+
+@app.post("/destroy_weights_update_group")
+@auth_level(AuthLevel.ADMIN_OPTIONAL)
+async def destroy_weights_update_group(
+    obj: DestroyWeightsUpdateGroupReqInput, request: Request
+):
+    """Destroy the parameter update group."""
+    success, message = (
+        await _global_state.tokenizer_manager.destroy_weights_update_group(obj, request)
+    )
+    content = {"success": success, "message": message}
+    return ORJSONResponse(
+        content, status_code=200 if success else HTTPStatus.BAD_REQUEST
+    )
+
+
+@app.post("/update_weights_from_tensor")
+@auth_level(AuthLevel.ADMIN_OPTIONAL)
+async def update_weights_from_tensor(
+    obj: UpdateWeightsFromTensorReqInput, request: Request
+):
+    """Update the weights from tensor inplace without re-launching the server.
+    Notes:
+    1. Ensure that the model is on the correct device (e.g., GPU) before calling this endpoint. If the model is moved to the CPU unexpectedly, it may cause performance issues or runtime errors.
+    2. HTTP will transmit only the metadata of the tensor, while the tensor itself will be directly copied to the model.
+    3. Any binary data in the named tensors should be base64 encoded.
+    """
+
+    success, message = await _global_state.tokenizer_manager.update_weights_from_tensor(
+        obj, request
+    )
+
+    content = {"success": success, "message": message}
+    return ORJSONResponse(
+        content, status_code=200 if success else HTTPStatus.BAD_REQUEST
+    )
+
+
+@app.post("/update_weights_from_distributed")
+@auth_level(AuthLevel.ADMIN_OPTIONAL)
+async def update_weights_from_distributed(
+    obj: UpdateWeightsFromDistributedReqInput, request: Request
+):
+    """Update model parameter from distributed online."""
+    success, message = (
+        await _global_state.tokenizer_manager.update_weights_from_distributed(
+            obj, request
+        )
+    )
+
+    content = {"success": success, "message": message}
+    if success:
+        return ORJSONResponse(content, status_code=200)
+    else:
+        return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
+
+
+@app.post("/update_weights_from_ipc")
+@auth_level(AuthLevel.ADMIN_OPTIONAL)
+async def update_weights_from_ipc(obj: UpdateWeightsFromIPCReqInput, request: Request):
+    """Update the weights from IPC (Inter-Process Communication) for checkpoint-engine integration."""
+    success, message = await _global_state.tokenizer_manager.update_weights_from_ipc(
+        obj, request
+    )
+
+    content = {"success": success, "message": message}
+    if success:
+        if _global_state.tokenizer_manager.initial_weights_loaded is False:
+            _global_state.tokenizer_manager.initial_weights_loaded = True
+        return ORJSONResponse(content)
+    else:
+        return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
+
+
+@app.post("/update_weight_version")
+@auth_level(AuthLevel.ADMIN_OPTIONAL)
+async def update_weight_version(obj: UpdateWeightVersionReqInput, request: Request):
+    """Update the weight version. This operation requires no active requests."""
+    if obj.abort_all_requests:
+        _global_state.tokenizer_manager.abort_request(abort_all=True)
+
+    # Use a simple approach without the complex lock mechanism for now
+    # since weight_version update is a simple operation that doesn't affect model weights
+    try:
+        # Update the weight version in server args (the single source of truth)
+        _global_state.tokenizer_manager.server_args.weight_version = obj.new_version
+
+        return ORJSONResponse(
+            {
+                "success": True,
+                "message": f"Weight version updated to {obj.new_version}",
+                "new_version": obj.new_version,
+            },
+            status_code=HTTPStatus.OK,
+        )
+    except Exception as e:
+        return ORJSONResponse(
+            {
+                "success": False,
+                "message": f"Failed to update weight version: {str(e)}",
+            },
+            status_code=HTTPStatus.BAD_REQUEST,
+        )
+
+
+@app.api_route("/get_weights_by_name", methods=["GET", "POST"])
+@auth_level(AuthLevel.ADMIN_OPTIONAL)
+async def get_weights_by_name(obj: GetWeightsByNameReqInput, request: Request):
+    """Get model parameter by name."""
+    try:
+        ret = await _global_state.tokenizer_manager.get_weights_by_name(obj, request)
+        if ret is None:
+            return _create_error_response("Get parameter by name failed")
+        else:
+            return ORJSONResponse(ret, status_code=200)
+    except Exception as e:
+        return _create_error_response(e)
+
+
+@app.api_route("/release_memory_occupation", methods=["GET", "POST"])
+@auth_level(AuthLevel.ADMIN_OPTIONAL)
+async def release_memory_occupation(
+    obj: ReleaseMemoryOccupationReqInput, request: Request
+):
+    """Release GPU memory occupation temporarily."""
+    try:
+        await _global_state.tokenizer_manager.release_memory_occupation(obj, request)
+    except Exception as e:
+        return _create_error_response(e)
+
+
+@app.api_route("/resume_memory_occupation", methods=["GET", "POST"])
+@auth_level(AuthLevel.ADMIN_OPTIONAL)
+async def resume_memory_occupation(
+    obj: ResumeMemoryOccupationReqInput, request: Request
+):
+    """Resume GPU memory occupation."""
+    try:
+        await _global_state.tokenizer_manager.resume_memory_occupation(obj, request)
+    except Exception as e:
+        return _create_error_response(e)
+
+
+@app.post("/weights_checker")
+@auth_level(AuthLevel.ADMIN_OPTIONAL)
+async def check_weights(obj: CheckWeightsReqInput, request: Request):
+    success, message = await _global_state.tokenizer_manager.check_weights(obj, request)
+    return ORJSONResponse(
+        {"success": success, "message": message},
+        status_code=200 if success else HTTPStatus.BAD_REQUEST,
+    )
+
+
+@app.api_route("/slow_down", methods=["GET", "POST"])
+@auth_level(AuthLevel.ADMIN_OPTIONAL)
+async def slow_down(obj: SlowDownReqInput, request: Request):
+    """Slow down the system deliberately. Only for testing. Example scenario:
+    when we want to test performance of D in large-scale PD disaggregation and have no enough nodes for P,
+    we can use this to slow down D to let it have enough running sequences, and then disable slowdown
+    to let it run in full batch size.
+    """
+    try:
+        await _global_state.tokenizer_manager.slow_down(obj, request)
+    except Exception as e:
+        return _create_error_response(e)
+
+
+@app.api_route("/load_lora_adapter", methods=["POST"])
+@auth_level(AuthLevel.ADMIN_OPTIONAL)
+async def load_lora_adapter(obj: LoadLoRAAdapterReqInput, request: Request):
+    """Load a new LoRA adapter without re-launching the server."""
+    result = await _global_state.tokenizer_manager.load_lora_adapter(obj, request)
+
+    if result.success:
+        return ORJSONResponse(
+            result,
+            status_code=HTTPStatus.OK,
+        )
+    else:
+        return ORJSONResponse(
+            result,
+            status_code=HTTPStatus.BAD_REQUEST,
+        )
+
+
+@app.api_route("/load_lora_adapter_from_tensors", methods=["POST"])
+async def load_lora_adapter_from_tensors(
+    obj: LoadLoRAAdapterFromTensorsReqInput, request: Request
+):
+    """Load a new LoRA adapter from tensors without re-launching the server."""
+    result = await _global_state.tokenizer_manager.load_lora_adapter_from_tensors(
+        obj, request
+    )
+
+    if result.success:
+        return ORJSONResponse(result, status_code=HTTPStatus.OK)
+    else:
+        return ORJSONResponse(result, status_code=HTTPStatus.BAD_REQUEST)
+
+
+@app.api_route("/unload_lora_adapter", methods=["POST"])
+@auth_level(AuthLevel.ADMIN_OPTIONAL)
+async def unload_lora_adapter(obj: UnloadLoRAAdapterReqInput, request: Request):
+    """Load a new LoRA adapter without re-launching the server."""
+    result = await _global_state.tokenizer_manager.unload_lora_adapter(obj, request)
+
+    if result.success:
+        return ORJSONResponse(
+            result,
+            status_code=HTTPStatus.OK,
+        )
+    else:
+        return ORJSONResponse(
+            result,
+            status_code=HTTPStatus.BAD_REQUEST,
+        )
+
+
+@app.api_route("/open_session", methods=["GET", "POST"])
+async def open_session(obj: OpenSessionReqInput, request: Request):
+    """Open a session, and return its unique session id."""
+    try:
+        session_id = await _global_state.tokenizer_manager.open_session(obj, request)
+        if session_id is None:
+            raise Exception(
+                "Failed to open the session. Check if a session with the same id is still open."
+            )
+        return session_id
+    except Exception as e:
+        return _create_error_response(e)
+
+
+@app.api_route("/close_session", methods=["GET", "POST"])
+async def close_session(obj: CloseSessionReqInput, request: Request):
+    """Close the session."""
+    try:
+        await _global_state.tokenizer_manager.close_session(obj, request)
+        return Response(status_code=200)
+    except Exception as e:
+        return _create_error_response(e)
+
+
+@app.api_route("/configure_logging", methods=["GET", "POST"])
+@auth_level(AuthLevel.ADMIN_OPTIONAL)
+async def configure_logging(obj: ConfigureLoggingReq, request: Request):
+    """Configure the request logging options."""
+    _global_state.tokenizer_manager.configure_logging(obj)
+    return Response(status_code=200)
+
+
+@app.post("/abort_request")
+@auth_level(AuthLevel.ADMIN_OPTIONAL)
+async def abort_request(obj: AbortReq, request: Request):
+    """Abort a request."""
+    try:
+        _global_state.tokenizer_manager.abort_request(
+            rid=obj.rid, abort_all=obj.abort_all
+        )
+        return Response(status_code=200)
+    except Exception as e:
+        return _create_error_response(e)
+
+
+@app.post("/parse_function_call")
+async def parse_function_call_request(obj: ParseFunctionCallReq, request: Request):
+    """
+    A native API endpoint to parse function calls from a text.
+    """
+    # 1) Initialize the parser based on the request body
+    parser = FunctionCallParser(tools=obj.tools, tool_call_parser=obj.tool_call_parser)
+
+    # 2) Call the non-stream parsing method (non-stream)
+    normal_text, calls = parser.parse_non_stream(obj.text)
+
+    # 3) Organize the response content
+    response_data = {
+        "normal_text": normal_text,
+        "calls": [
+            call.model_dump() for call in calls
+        ],  # Convert pydantic objects to dictionaries
+    }
+
+    return ORJSONResponse(content=response_data, status_code=200)
+
+
+@app.post("/separate_reasoning")
+async def separate_reasoning_request(obj: SeparateReasoningReqInput, request: Request):
+    """
+    A native API endpoint to separate reasoning from a text.
+    """
+    # 1) Initialize the parser based on the request body
+    parser = ReasoningParser(model_type=obj.reasoning_parser, request=request)
+
+    # 2) Call the non-stream parsing method (non-stream)
+    reasoning_text, normal_text = parser.parse_non_stream(obj.text)
+
+    # 3) Organize the response content
+    response_data = {
+        "reasoning_text": reasoning_text,
+        "text": normal_text,
+    }
+
+    return ORJSONResponse(content=response_data, status_code=200)
+
+
+@app.post("/pause_generation")
+@auth_level(AuthLevel.ADMIN_OPTIONAL)
+async def pause_generation(obj: PauseGenerationReqInput, request: Request):
+    """Pause generation."""
+    await _global_state.tokenizer_manager.pause_generation(obj)
+    return ORJSONResponse(
+        content={"message": "Generation paused successfully.", "status": "ok"},
+        status_code=200,
+    )
+
+
+@app.post("/continue_generation")
+@auth_level(AuthLevel.ADMIN_OPTIONAL)
+async def continue_generation(obj: ContinueGenerationReqInput, request: Request):
+    """Continue generation."""
+    await _global_state.tokenizer_manager.continue_generation(obj)
+    return ORJSONResponse(
+        content={"message": "Generation continued successfully.", "status": "ok"},
+        status_code=200,
+    )
+
+
+##### OpenAI-compatible API endpoints #####
+
+
+@app.post("/v1/completions", dependencies=[Depends(validate_json_request)])
+async def openai_v1_completions(request: CompletionRequest, raw_request: Request):
+    """OpenAI-compatible text completion endpoint."""
+    return await raw_request.app.state.openai_serving_completion.handle_request(
+        request, raw_request
+    )
+
+
+@app.post("/v1/chat/completions", dependencies=[Depends(validate_json_request)])
+async def openai_v1_chat_completions(
+    request: ChatCompletionRequest, raw_request: Request
+):
+    """OpenAI-compatible chat completion endpoint."""
+    return await raw_request.app.state.openai_serving_chat.handle_request(
+        request, raw_request
+    )
+
+
+@app.post(
+    "/v1/embeddings",
+    response_class=ORJSONResponse,
+    dependencies=[Depends(validate_json_request)],
+)
+async def openai_v1_embeddings(request: EmbeddingRequest, raw_request: Request):
+    """OpenAI-compatible embeddings endpoint."""
+    return await raw_request.app.state.openai_serving_embedding.handle_request(
+        request, raw_request
+    )
+
+
+@app.post(
+    "/v1/classify",
+    response_class=ORJSONResponse,
+    dependencies=[Depends(validate_json_request)],
+)
+async def openai_v1_classify(request: ClassifyRequest, raw_request: Request):
+    """OpenAI-compatible classification endpoint."""
+    return await raw_request.app.state.openai_serving_classify.handle_request(
+        request, raw_request
+    )
+
+
+@app.post(
+    "/v1/tokenize",
+    response_class=ORJSONResponse,
+    dependencies=[Depends(validate_json_request)],
+)
+@app.post(
+    "/tokenize",
+    response_class=ORJSONResponse,
+    dependencies=[Depends(validate_json_request)],
+    include_in_schema=False,
+)
+async def openai_v1_tokenize(request: TokenizeRequest, raw_request: Request):
+    """OpenAI-compatible tokenization endpoint."""
+    return await raw_request.app.state.openai_serving_tokenize.handle_request(
+        request, raw_request
+    )
+
+
+@app.post(
+    "/v1/detokenize",
+    response_class=ORJSONResponse,
+    dependencies=[Depends(validate_json_request)],
+)
+@app.post(
+    "/detokenize",
+    response_class=ORJSONResponse,
+    dependencies=[Depends(validate_json_request)],
+    include_in_schema=False,
+)
+async def openai_v1_detokenize(request: DetokenizeRequest, raw_request: Request):
+    """OpenAI-compatible detokenization endpoint."""
+    return await raw_request.app.state.openai_serving_detokenize.handle_request(
+        request, raw_request
+    )
+
+
+@app.post("/v1/audio/transcriptions")
+async def openai_v1_audio_transcriptions(
+    raw_request: Request,
+    file: UploadFile = File(...),
+    model: str = Form(default="default"),
+    language: Optional[str] = Form(default=None),
+    response_format: str = Form(default="json"),
+    temperature: float = Form(default=0.0),
+    stream: bool = Form(default=False),
+):
+    """OpenAI-compatible audio transcription endpoint."""
+    if response_format not in ["json", "text"]:
+        return ORJSONResponse(
+            content={"error": {"message": "Only 'json' and 'text' formats supported"}},
+            status_code=400,
+        )
+
+    audio_data = await file.read()
+
+    return (
+        await raw_request.app.state.openai_serving_transcription.create_transcription(
+            audio_data=audio_data,
+            model=model,
+            language=language,
+            response_format=response_format,
+            temperature=temperature,
+            stream=stream,
+            raw_request=raw_request,
+        )
+    )
+
+
+@app.get("/v1/models", response_class=ORJSONResponse)
+async def available_models():
+    """Show available models. OpenAI-compatible endpoint."""
+    served_model_names = [_global_state.tokenizer_manager.served_model_name]
+    model_cards = []
+
+    # Add base model
+    for served_model_name in served_model_names:
+        model_cards.append(
+            ModelCard(
+                id=served_model_name,
+                root=served_model_name,
+                max_model_len=_global_state.tokenizer_manager.model_config.context_len,
+            )
+        )
+
+    # Add loaded LoRA adapters
+    if _global_state.tokenizer_manager.server_args.enable_lora:
+        lora_registry = _global_state.tokenizer_manager.lora_registry
+        for _, lora_ref in lora_registry.get_all_adapters().items():
+            model_cards.append(
+                ModelCard(
+                    id=lora_ref.lora_name,
+                    root=lora_ref.lora_path,
+                    parent=served_model_names[0],
+                    max_model_len=None,
+                )
+            )
+
+    return ModelList(data=model_cards)
+
+
+@app.get("/v1/models/{model:path}", response_class=ORJSONResponse)
+async def retrieve_model(model: str):
+    """Retrieves a model instance, providing basic information about the model."""
+    served_model_names = [_global_state.tokenizer_manager.served_model_name]
+
+    if model not in served_model_names:
+        return ORJSONResponse(
+            status_code=404,
+            content={
+                "error": {
+                    "message": f"The model '{model}' does not exist",
+                    "type": "invalid_request_error",
+                    "param": "model",
+                    "code": "model_not_found",
+                }
+            },
+        )
+
+    return ModelCard(
+        id=model,
+        root=model,
+        max_model_len=_global_state.tokenizer_manager.model_config.context_len,
+    )
+
+
+@app.post("/v1/score", dependencies=[Depends(validate_json_request)])
+async def v1_score_request(request: ScoringRequest, raw_request: Request):
+    """Endpoint for the decoder-only scoring API. See Engine.score() for detailed documentation."""
+    return await raw_request.app.state.openai_serving_score.handle_request(
+        request, raw_request
+    )
+
+
+@app.post("/v1/responses", dependencies=[Depends(validate_json_request)])
+async def v1_responses_request(request: dict, raw_request: Request):
+    """Endpoint for the responses API with reasoning support."""
+
+    request_obj = ResponsesRequest(**request)
+    result = await raw_request.app.state.openai_serving_responses.create_responses(
+        request_obj, raw_request
+    )
+
+    # Handle streaming responses
+    if isinstance(result, AsyncGenerator):
+        return StreamingResponse(
+            result,
+            media_type="text/event-stream",
+            headers={"Cache-Control": "no-cache", "Connection": "keep-alive"},
+        )
+
+    return result
+
+
+@app.get("/v1/responses/{response_id}")
+async def v1_retrieve_responses(response_id: str, raw_request: Request):
+    """Retrieve a response by ID."""
+    return await raw_request.app.state.openai_serving_responses.retrieve_responses(
+        response_id
+    )
+
+
+@app.post("/v1/responses/{response_id}/cancel")
+async def v1_cancel_responses(response_id: str, raw_request: Request):
+    """Cancel a background response."""
+    return await raw_request.app.state.openai_serving_responses.cancel_responses(
+        response_id
+    )
+
+
+@app.api_route(
+    "/v1/rerank", methods=["POST", "PUT"], dependencies=[Depends(validate_json_request)]
+)
+async def v1_rerank_request(request: V1RerankReqInput, raw_request: Request):
+    """Endpoint for reranking documents based on query relevance."""
+    return await raw_request.app.state.openai_serving_rerank.handle_request(
+        request, raw_request
+    )
+
+
+##### Ollama-compatible API endpoints #####
+
+
+@app.get(os.environ.get("SGLANG_OLLAMA_ROOT_ROUTE", "/"))
+@app.head(os.environ.get("SGLANG_OLLAMA_ROOT_ROUTE", "/"))
+async def ollama_root():
+    """Ollama-compatible root endpoint for health check."""
+    return "Ollama is running"
+
+
+@app.post(os.environ.get("SGLANG_OLLAMA_CHAT_ROUTE", "/api/chat"))
+async def ollama_chat(request: OllamaChatRequest, raw_request: Request):
+    """Ollama-compatible chat endpoint."""
+    return await raw_request.app.state.ollama_serving.handle_chat(request, raw_request)
+
+
+@app.post(os.environ.get("SGLANG_OLLAMA_GENERATE_ROUTE", "/api/generate"))
+async def ollama_generate(request: OllamaGenerateRequest, raw_request: Request):
+    """Ollama-compatible generate endpoint."""
+    return await raw_request.app.state.ollama_serving.handle_generate(
+        request, raw_request
+    )
+
+
+@app.get(os.environ.get("SGLANG_OLLAMA_TAGS_ROUTE", "/api/tags"))
+async def ollama_tags(raw_request: Request):
+    """Ollama-compatible list models endpoint."""
+    return raw_request.app.state.ollama_serving.get_tags()
+
+
+@app.post(os.environ.get("SGLANG_OLLAMA_SHOW_ROUTE", "/api/show"))
+async def ollama_show(request: OllamaShowRequest, raw_request: Request):
+    """Ollama-compatible show model info endpoint."""
+    return raw_request.app.state.ollama_serving.get_show(request.model)
+
+
+##### Anthropic-compatible API endpoints #####
+
+
+@app.post("/v1/messages", dependencies=[Depends(validate_json_request)])
+async def anthropic_v1_messages(
+    request: AnthropicMessagesRequest, raw_request: Request
+):
+    """Anthropic-compatible Messages API endpoint."""
+    return await raw_request.app.state.anthropic_serving.handle_messages(
+        request, raw_request
+    )
+
+
+@app.post("/v1/messages/count_tokens", dependencies=[Depends(validate_json_request)])
+async def anthropic_v1_count_tokens(
+    request: AnthropicCountTokensRequest, raw_request: Request
+):
+    """Anthropic-compatible token counting endpoint."""
+    return await raw_request.app.state.anthropic_serving.handle_count_tokens(
+        request, raw_request
+    )
+
+
+## SageMaker API
+@app.get("/ping")
+async def sagemaker_health() -> Response:
+    """Check the health of the http server."""
+    return Response(status_code=200)
+
+
+@app.post("/invocations")
+async def sagemaker_chat_completions(
+    request: ChatCompletionRequest, raw_request: Request
+):
+    """OpenAI-compatible chat completion endpoint."""
+    return await raw_request.app.state.openai_serving_chat.handle_request(
+        request, raw_request
+    )
+
+
+## Vertex AI API
+@app.post(os.environ.get("AIP_PREDICT_ROUTE", "/vertex_generate"))
+async def vertex_generate(vertex_req: VertexGenerateReqInput, raw_request: Request):
+    if not vertex_req.instances:
+        return []
+    inputs = {}
+    for input_key in ("text", "input_ids", "input_embeds"):
+        if vertex_req.instances[0].get(input_key):
+            inputs[input_key] = [
+                instance.get(input_key) for instance in vertex_req.instances
+            ]
+            break
+    image_data = [
+        instance.get("image_data")
+        for instance in vertex_req.instances
+        if instance.get("image_data") is not None
+    ] or None
+    req = GenerateReqInput(
+        **inputs,
+        image_data=image_data,
+        **(vertex_req.parameters or {}),
+    )
+    ret = await generate_request(req, raw_request)
+    if isinstance(ret, Response):
+        return ret
+    return ORJSONResponse({"predictions": ret})
+
+
+def _create_error_response(e):
+    return ORJSONResponse(
+        {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
+    )
+
+
+# FIXME: In theory we should configure ADMIN_FORCE for some entrypoints, but doing so
+# would currently cause all endpoints to go through add_api_key_middleware
+# (even when neither api-key nor admin-api-key is configured).
+#
+# For now, we simulate ADMIN_FORCE by explicitly checking the admin API key parameter.
+# Once the auth wiring is refactored so ADMIN_FORCE only affects the intended
+# admin endpoints, we should switch this logic to use ADMIN_FORCE directly.
+def _admin_api_key_missing_response(
+    status_code: HTTPStatus = HTTPStatus.BAD_REQUEST,
+) -> ORJSONResponse:
+    return ORJSONResponse(
+        content={
+            "error": (
+                "This endpoint requires admin API key, but this server was started "
+                "without one (admin-api-key). Restart with --admin-api-key to enable."
+            )
+        },
+        status_code=status_code,
+    )
+
+
+# Minimal 32x32 black PNG (base64, GLM4v requires at least 32x32 sized image)
+MINIMUM_PNG_PICTURE_BASE64 = "iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAACXBIWXMAAA7EAAAOxAGVKw4bAAAAbUlEQVRYhe3VsQ2AMAxE0Y/lIgNQULD/OqyCMgCihCKSG4yRuKuiNH6JLsoEbMACOGBcua9HOR7Y6w6swBwMy0qLTpkeI77qdEBpBFAHBBDAGH8WrwJKI4AAegUCfAKgEgpQDvh3CR3oQCuav58qlAw73kKCSgAAAABJRU5ErkJggg=="
+
+
+def _execute_server_warmup(server_args: ServerArgs):
+    headers = {}
+    url = server_args.url()
+    if server_args.api_key:
+        headers["Authorization"] = f"Bearer {server_args.api_key}"
+
+    ssl_verify = server_args.ssl_verify()
+
+    # Wait until the server is launched
+    success = False
+    for _ in range(120):
+        time.sleep(1)
+        try:
+            res = requests.get(
+                url + "/model_info", timeout=5, headers=headers, verify=ssl_verify
+            )
+            assert res.status_code == 200, f"{res=}, {res.text=}"
+            success = True
+            break
+        except (AssertionError, requests.exceptions.RequestException):
+            last_traceback = get_exception_traceback()
+            pass
+
+    if not success:
+        logger.error(f"Initialization failed. warmup error: {last_traceback}")
+        kill_process_tree(os.getpid())
+        return success
+
+    model_info = res.json()
+
+    # Construct a warmup request
+    is_vlm = bool(model_info.get("has_image_understanding", False))
+    if model_info["is_generation"]:
+        if is_vlm and not server_args.skip_tokenizer_init:
+            request_name = "/v1/chat/completions"
+        else:
+            request_name = "/generate"
+    else:
+        request_name = "/encode"
+    max_new_tokens = 8 if model_info["is_generation"] else 1
+    json_data = {
+        "sampling_params": {
+            "temperature": 0,
+            "max_new_tokens": max_new_tokens,
+        },
+    }
+    if server_args.skip_tokenizer_init:
+        json_data["input_ids"] = [[10, 11, 12] for _ in range(server_args.dp_size)]
+        # TODO Workaround the bug that embedding errors for list of size 1
+        if server_args.dp_size == 1:
+            json_data["input_ids"] = json_data["input_ids"][0]
+    elif (
+        is_vlm
+        and server_args.disaggregation_mode == "null"
+        and model_info["is_generation"]
+    ):
+        # TODO: ChatCompletionRequest does not have bootstrap info required by disaggregation mode, disable image-warmup for now
+        # Only use chat completions format for generation models, not embedding models
+        json_data = {
+            "model": _global_state.tokenizer_manager.served_model_name,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/png;base64,{MINIMUM_PNG_PICTURE_BASE64}"
+                            },
+                        },
+                        {
+                            "type": "text",
+                            "text": "Describe the image.",
+                        },
+                    ],
+                }
+            ],
+            "max_tokens": max_new_tokens,
+            "stream": False,
+            "temperature": 0.0,
+        }
+    else:
+        json_data["text"] = ["The capital city of France is"] * server_args.dp_size
+        # TODO Workaround the bug that embedding errors for list of size 1
+        if server_args.dp_size == 1:
+            json_data["text"] = json_data["text"][0]
+
+    # Config debug dumping
+    if server_args.debug_tensor_dump_input_file:
+        json_data.pop("text", None)
+        json_data["input_ids"] = np.load(
+            server_args.debug_tensor_dump_input_file
+        ).tolist()
+        json_data["sampling_params"]["max_new_tokens"] = 0
+
+    # Send a warmup request
+    warmup_timeout = envs.SGLANG_WARMUP_TIMEOUT.get()
+    try:
+        if server_args.disaggregation_mode == "null":
+            res = requests.post(
+                url + request_name,
+                json=json_data,
+                headers=headers,
+                timeout=warmup_timeout if warmup_timeout > 0 else 600,
+                verify=ssl_verify,
+            )
+            assert res.status_code == 200, f"{res.text}"
+            _global_state.tokenizer_manager.server_status = ServerStatus.Up
+
+        else:
+            logger.info(f"Start of pd disaggregation warmup ...")
+            json_data = {
+                "sampling_params": {
+                    "temperature": 0.0,
+                    "max_new_tokens": 8,
+                    "ignore_eos": True,
+                },
+                "bootstrap_host": [FAKE_BOOTSTRAP_HOST] * server_args.dp_size,
+                # This is a hack to ensure fake transfer is enabled during prefill warmup
+                # ensure each dp rank has a unique bootstrap_room during prefill warmup
+                "bootstrap_room": [
+                    i * (2**63 // server_args.dp_size) + (i % server_args.tp_size)
+                    for i in range(server_args.dp_size)
+                ],
+                "input_ids": [[10, 11, 12, 13]] * server_args.dp_size,
+            }
+            res = requests.post(
+                url + request_name,
+                json=json_data,
+                headers=headers,
+                timeout=(
+                    warmup_timeout if warmup_timeout > 0 else 1800
+                ),  # because of deep gemm precache is very long if not precache.
+                verify=ssl_verify,
+            )
+            if res.status_code == 200:
+                logger.info(
+                    f"End of prefill disaggregation mode warmup with status {res.status_code}, resp: {res.json()}"
+                )
+                _global_state.tokenizer_manager.server_status = ServerStatus.Up
+            else:
+                logger.info(
+                    "Prefill disaggregation mode warm Up Failed, status code: {}".format(
+                        res.status_code
+                    )
+                )
+                _global_state.tokenizer_manager.server_status = ServerStatus.UnHealthy
+
+    except Exception:
+        last_traceback = get_exception_traceback()
+        logger.error(f"Initialization failed. warmup error: {last_traceback}")
+        kill_process_tree(os.getpid())
+        return False
+
+    # Debug print
+    # logger.info(f"warmup request returns: {res.json()=}")
+    return success
+
+
+def _wait_and_warmup(
+    server_args: ServerArgs,
+    launch_callback: Optional[Callable[[], None]] = None,
+    execute_warmup_func: Callable = _execute_server_warmup,
+):
+    if server_args.checkpoint_engine_wait_weights_before_ready:
+        _wait_weights_ready()
+
+    # Send a warmup request
+    if not server_args.skip_server_warmup:
+        if not execute_warmup_func(server_args):
+            return
+    else:
+        _global_state.tokenizer_manager.server_status = ServerStatus.Up
+
+    # The server is ready for requests
+    logger.info("The server is fired up and ready to roll!")
+
+    if server_args.delete_ckpt_after_loading:
+        delete_directory(server_args.model_path)
+
+    if server_args.debug_tensor_dump_input_file:
+        kill_process_tree(os.getpid())
+
+    if launch_callback is not None:
+        launch_callback()
+
+
+def _wait_weights_ready():
+    """Wait for weights to be ready within the specified timeout."""
+    timeout = WAIT_WEIGHTS_READY_TIMEOUT
+    start_time = time.time()
+
+    for _ in range(timeout):
+        if _global_state.tokenizer_manager.initial_weights_loaded:
+            logger.info(
+                f"Weights are ready after {time.time() - start_time:.2f} seconds"
+            )
+            return
+        time.sleep(1)
+
+    # Timeout reached without weights being ready
+    logger.error(
+        f"Weights are not ready after waiting {timeout} seconds. "
+        f"Consider increasing SGLANG_WAIT_WEIGHTS_READY_TIMEOUT environment variable. "
+        f"Current status: initial_weights_loaded={_global_state.tokenizer_manager.initial_weights_loaded}"
+    )
+
+
+def _setup_and_run_http_server(
+    server_args: ServerArgs,
+    tokenizer_manager,
+    template_manager,
+    port_args: PortArgs,
+    scheduler_infos: List[Dict],
+    execute_warmup_func: Callable = _execute_server_warmup,
+    launch_callback: Optional[Callable[[], None]] = None,
+):
+    """Set up global state, configure middleware, and run uvicorn.
+
+    Called by launch_server after subprocesses have been launched.
+    """
+    # Reserve the HTTP port before launching subprocesses to fail fast if port is unavailable.
+    # This prevents wasting time loading models only to discover port conflicts later.
+    reserved_socket = _prebind_listening_socket(server_args.host, server_args.port)
+    multi_tokenizer_args_shm = None
+
+    try:
+        # Parse info got from the schedulers
+        remote_instance_transfer_engine_info = (
+            parse_remote_instance_transfer_engine_info_from_scheduler_infos(
+                scheduler_infos
+            )
+        )
+
+        # Set global states
+        set_global_state(
+            _GlobalState(
+                tokenizer_manager=tokenizer_manager,
+                template_manager=template_manager,
+                scheduler_info=scheduler_infos[0],
+                remote_instance_transfer_engine_info=remote_instance_transfer_engine_info,
+            )
+        )
+
+        if server_args.enable_metrics:
+            add_prometheus_track_response_middleware(app)
+
+        # Pass additional arguments to the lifespan function.
+        # They will be used for additional initialization setups.
+        if server_args.tokenizer_worker_num == 1:
+            # If it is single tokenizer mode, we can pass the arguments by attributes of the app object.
+            app.is_single_tokenizer_mode = True
+            app.server_args = server_args
+            app.warmup_thread_kwargs = dict(
+                server_args=server_args,
+                launch_callback=launch_callback,
+                execute_warmup_func=execute_warmup_func,
+            )
+
+            # Add api key authorization
+            # This is only supported in single tokenizer mode.
+            #
+            # Backward compatibility:
+            # - api_key only: behavior matches legacy (all endpoints require api_key)
+            # - no keys: legacy had no restriction; ADMIN_FORCE endpoints must still be rejected when
+            #   admin_api_key is not configured.
+            if (
+                server_args.api_key
+                or server_args.admin_api_key
+                or app_has_admin_force_endpoints(app)
+            ):
+                from sglang.srt.utils.auth import add_api_key_middleware
+
+                add_api_key_middleware(
+                    app,
+                    api_key=server_args.api_key,
+                    admin_api_key=server_args.admin_api_key,
+                )
+        else:
+            # If it is multi-tokenizer mode, we need to write the arguments to shared memory
+            # for other worker processes to read.
+            app.is_single_tokenizer_mode = False
+            multi_tokenizer_args_shm = write_data_for_multi_tokenizer(
+                port_args, server_args, scheduler_infos[0]
+            )
+
+        # Update logging configs
+        set_uvicorn_logging_configs(server_args)
+
+        # Delay listen() until uvicorn startup to avoid accepting probe traffic
+        # while model/subprocess initialization is still in progress.
+        reserved_socket.listen(128)
+
+        if server_args.ssl_certfile:
+            logger.info(
+                f"SSL enabled: certfile={server_args.ssl_certfile}, "
+                f"keyfile={server_args.ssl_keyfile}"
+            )
+
+        # Listen for HTTP requests
+        if server_args.tokenizer_worker_num == 1:
+            if server_args.enable_ssl_refresh:
+                # Use Config/Server API for access to the SSLContext.
+                config = uvicorn.Config(
+                    app,
+                    fd=reserved_socket.fileno(),
+                    root_path=server_args.fastapi_root_path,
+                    log_level=server_args.log_level_http or server_args.log_level,
+                    timeout_keep_alive=envs.SGLANG_TIMEOUT_KEEP_ALIVE.get(),
+                    loop="uvloop",
+                    ssl_keyfile=server_args.ssl_keyfile,
+                    ssl_certfile=server_args.ssl_certfile,
+                    ssl_ca_certs=server_args.ssl_ca_certs,
+                    ssl_keyfile_password=server_args.ssl_keyfile_password,
+                )
+                config.load()  # Creates the SSLContext
+
+                from sglang.srt.entrypoints.ssl_utils import SSLCertRefresher
+
+                server = uvicorn.Server(config)
+
+                async def _run_with_ssl_refresh():
+                    refresher = SSLCertRefresher(
+                        config.ssl,
+                        server_args.ssl_keyfile,
+                        server_args.ssl_certfile,
+                        server_args.ssl_ca_certs,
+                    )
+                    logger.info("SSL certificate auto-refresh enabled.")
+                    try:
+                        await server.serve()
+                    finally:
+                        refresher.stop()
+
+                import asyncio
+
+                asyncio.run(_run_with_ssl_refresh())
+            else:
+                # Default case, one tokenizer process
+                uvicorn.run(
+                    app,
+                    fd=reserved_socket.fileno(),
+                    root_path=server_args.fastapi_root_path,
+                    log_level=server_args.log_level_http or server_args.log_level,
+                    timeout_keep_alive=envs.SGLANG_TIMEOUT_KEEP_ALIVE.get(),
+                    loop="uvloop",
+                    ssl_keyfile=server_args.ssl_keyfile,
+                    ssl_certfile=server_args.ssl_certfile,
+                    ssl_ca_certs=server_args.ssl_ca_certs,
+                    ssl_keyfile_password=server_args.ssl_keyfile_password,
+                )
+        else:
+            # Multiple tokenizer and http processes
+            from uvicorn.config import LOGGING_CONFIG
+
+            LOGGING_CONFIG["loggers"]["sglang.srt.entrypoints.http_server"] = {
+                "handlers": ["default"],
+                "level": "INFO",
+                "propagate": False,
+            }
+            monkey_patch_uvicorn_multiprocessing()
+
+            if server_args.enable_ssl_refresh:
+                logger.warning(
+                    "--enable-ssl-refresh is not supported with multiple "
+                    "tokenizer workers (--tokenizer-worker-num > 1). "
+                    "SSL refresh will be disabled."
+                )
+
+            uvicorn.run(
+                "sglang.srt.entrypoints.http_server:app",
+                fd=reserved_socket.fileno(),
+                root_path=server_args.fastapi_root_path,
+                log_level=server_args.log_level_http or server_args.log_level,
+                timeout_keep_alive=envs.SGLANG_TIMEOUT_KEEP_ALIVE.get(),
+                loop="uvloop",
+                workers=server_args.tokenizer_worker_num,
+                ssl_keyfile=server_args.ssl_keyfile,
+                ssl_certfile=server_args.ssl_certfile,
+                ssl_ca_certs=server_args.ssl_ca_certs,
+                ssl_keyfile_password=server_args.ssl_keyfile_password,
+            )
+    finally:
+        # Close the reserved socket after uvicorn exits or on any error
+        # This ensures the port is released even if initialization fails
+        reserved_socket.close()
+
+        if server_args.tokenizer_worker_num > 1:
+            if multi_tokenizer_args_shm is not None:
+                multi_tokenizer_args_shm.unlink()
+            if _global_state is not None:
+                _global_state.tokenizer_manager.socket_mapping.clear_all_sockets()
+
+
+def launch_server(
+    server_args: ServerArgs,
+    init_tokenizer_manager_func: Callable = init_tokenizer_manager,
+    run_scheduler_process_func: Callable = run_scheduler_process,
+    run_detokenizer_process_func: Callable = run_detokenizer_process,
+    execute_warmup_func: Callable = _execute_server_warmup,
+    launch_callback: Optional[Callable[[], None]] = None,
+):
+    """
+    Launch SRT (SGLang Runtime) Server.
+
+    The SRT server consists of an HTTP server and an SRT engine.
+
+    - HTTP server: A FastAPI server that routes requests to the engine.
+    - The engine consists of three components:
+        1. TokenizerManager: Tokenizes the requests and sends them to the scheduler.
+        2. Scheduler (subprocess): Receives requests from the Tokenizer Manager, schedules batches, forwards them, and sends the output tokens to the Detokenizer Manager.
+        3. DetokenizerManager (subprocess): Detokenizes the output tokens and sends the result back to the Tokenizer Manager.
+
+    Note:
+    1. The HTTP server, Engine, and TokenizerManager all run in the main process.
+    2. Inter-process communication is done through IPC (each process uses a different port) via the ZMQ library.
+    """
+    # Launch subprocesses
+    tokenizer_manager, template_manager, port_args, scheduler_init_result = (
+        Engine._launch_subprocesses(
+            server_args=server_args,
+            init_tokenizer_manager_func=init_tokenizer_manager_func,
+            run_scheduler_process_func=run_scheduler_process_func,
+            run_detokenizer_process_func=run_detokenizer_process_func,
+        )
+    )
+
+    _setup_and_run_http_server(
+        server_args,
+        tokenizer_manager,
+        template_manager,
+        port_args,
+        scheduler_init_result.scheduler_infos,
+        execute_warmup_func=execute_warmup_func,
+        launch_callback=launch_callback,
+    )
diff --git a/sglang/python/sglang/srt/entrypoints/http_server_engine.py b/sglang/python/sglang/srt/entrypoints/http_server_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb61598364458b0f357628b5f282fc6819fe5c4a
--- /dev/null
+++ b/sglang/python/sglang/srt/entrypoints/http_server_engine.py
@@ -0,0 +1,144 @@
+import multiprocessing
+import time
+from typing import List, Optional, Tuple
+
+import requests
+import torch
+
+from sglang.srt.entrypoints.EngineBase import EngineBase
+from sglang.srt.entrypoints.http_server import launch_server
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import MultiprocessingSerializer, kill_process_tree
+
+
+def launch_server_process(server_args: ServerArgs) -> multiprocessing.Process:
+
+    p = multiprocessing.Process(target=launch_server, args=(server_args,))
+    p.start()
+
+    base_url = server_args.url()
+    timeout = 300.0  # Increased timeout to 5 minutes for downloading large models
+    start_time = time.perf_counter()
+
+    ssl_verify = server_args.ssl_verify()
+
+    with requests.Session() as session:
+        while time.perf_counter() - start_time < timeout:
+            try:
+                headers = {
+                    "Content-Type": "application/json; charset=utf-8",
+                    "Authorization": f"Bearer {server_args.api_key}",
+                }
+                response = session.get(
+                    f"{base_url}/health_generate", headers=headers, verify=ssl_verify
+                )
+                if response.status_code == 200:
+                    return p
+            except requests.RequestException:
+                pass
+
+            if not p.is_alive():
+                raise Exception("Server process terminated unexpectedly.")
+
+            time.sleep(2)
+
+    p.terminate()
+    raise TimeoutError("Server failed to start within the timeout period.")
+
+
+class HttpServerEngineAdapter(EngineBase):
+    """
+    You can use this class to launch a server from a VerlEngine instance.
+    We recommend using this class only you need to use http server.
+    Otherwise, you can use Engine directly.
+    """
+
+    def __init__(self, **kwargs):
+        self.server_args = ServerArgs(**kwargs)
+        print(
+            f"Launch HttpServerEngineAdapter at: {self.server_args.host}:{self.server_args.port}"
+        )
+        self.process = launch_server_process(self.server_args)
+
+    def _make_request(self, endpoint: str, payload: Optional[dict] = None):
+        """Make a POST request to the specified endpoint with the given payload.
+        Args:
+            endpoint: The API endpoint to call
+            payload: The JSON payload to send (default: empty dict)
+        Returns:
+            The JSON response from the server
+        """
+        url = f"{self.server_args.url()}/{endpoint}"
+        response = requests.post(
+            url, json=payload or {}, verify=self.server_args.ssl_verify()
+        )
+        response.raise_for_status()
+        return response.json()
+
+    def update_weights_from_tensor(
+        self,
+        named_tensors: List[Tuple[str, torch.Tensor]],
+        load_format: Optional[str] = None,
+        flush_cache: bool = False,
+    ):
+        """
+        Update model weights from tensor data. The HTTP server will only post meta data, and the real weights will be copied directly from GPUs.
+        Note: The model should be on GPUs rather than CPU for this functionality to work properly.
+        If you encounter issues, ensure your model is loaded on GPU devices rather than CPU.
+        """
+
+        return self._make_request(
+            "update_weights_from_tensor",
+            {
+                "serialized_named_tensors": [
+                    MultiprocessingSerializer.serialize(named_tensors, output_str=True)
+                    for _ in range(self.server_args.tp_size)
+                ],
+                "load_format": load_format,
+                "flush_cache": flush_cache,
+            },
+        )
+
+    def shutdown(self):
+        kill_process_tree(self.process.pid)
+
+    def generate(
+        self,
+        prompt=None,
+        sampling_params=None,
+        input_ids=None,
+        image_data=None,
+        return_logprob=False,
+        logprob_start_len=None,
+        top_logprobs_num=None,
+        token_ids_logprob=None,
+        lora_path=None,
+        custom_logit_processor=None,
+        priority=None,
+    ):
+        payload = {
+            "text": prompt,
+            "sampling_params": sampling_params,
+            "input_ids": input_ids,
+            "image_data": image_data,
+            "return_logprob": return_logprob,
+            "logprob_start_len": logprob_start_len,
+            "top_logprobs_num": top_logprobs_num,
+            "token_ids_logprob": token_ids_logprob,
+            "lora_path": lora_path,
+            "custom_logit_processor": custom_logit_processor,
+            "priority": priority,
+        }
+        # Filter out None values
+        payload = {k: v for k, v in payload.items() if v is not None}
+
+        return self._make_request("generate", payload)
+
+    def release_memory_occupation(self):
+        return self._make_request("release_memory_occupation")
+
+    def resume_memory_occupation(self):
+        return self._make_request("resume_memory_occupation")
+
+    def flush_cache(self):
+        return self._make_request("flush_cache")
diff --git a/sglang/python/sglang/srt/entrypoints/ollama/README.md b/sglang/python/sglang/srt/entrypoints/ollama/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..a3f87460af2daccc27d63c03249ac63d88012c27
--- /dev/null
+++ b/sglang/python/sglang/srt/entrypoints/ollama/README.md
@@ -0,0 +1,112 @@
+# SGLang Ollama Integration
+
+Ollama API compatibility for SGLang, plus a Smart Router for intelligent routing between local and remote models.
+
+## Features
+
+1. **Ollama-compatible API** - Use Ollama CLI/library with SGLang backend
+2. **Smart Router** - LLM-based routing between local and remote models
+
+## Ollama API
+
+For basic Ollama API usage with SGLang (CLI and Python examples), see the [Ollama API documentation](https://sgl-project.github.io/basic_usage/ollama_api.html).
+
+## Smart Router
+
+### Prerequisites
+
+```bash
+pip install ollama
+```
+
+Intelligently routes requests between local Ollama and remote SGLang using an LLM judge.
+
+### How It Works
+
+```
+User Request
+     │
+     ▼
+┌─────────────────────┐
+│     LLM Judge       │  Classifies as SIMPLE or COMPLEX
+│  (local model)      │
+└─────────────────────┘
+     │
+     ▼
+┌─────────────────────┐
+│  SIMPLE → Local     │  Fast response from local Ollama
+│  COMPLEX → Remote   │  Powerful response from SGLang
+└─────────────────────┘
+```
+
+The LLM judge (running on local Ollama) analyzes each request and decides:
+- **SIMPLE**: Quick responses, greetings, factual questions, definitions, basic Q&A
+- **COMPLEX**: Deep reasoning, multi-step analysis, long explanations, creative writing
+
+### Setup
+
+**Terminal 1: Local Ollama**
+```bash
+ollama pull <LOCAL_MODEL>  # e.g., llama3.2, mistral, phi3
+ollama serve  # This will block the terminal
+```
+
+**Terminal 2: Remote SGLang (GPU)**
+```bash
+ssh user@gpu-server
+python -m sglang.launch_server --model <REMOTE_MODEL> --port 30001 --host 0.0.0.0
+```
+
+**Terminal 3: Smart Router**
+```bash
+ssh -L 30001:localhost:30001 user@gpu-server -N &
+python python/sglang/srt/entrypoints/ollama/smart_router.py
+```
+
+### Configuration
+
+```python
+from sglang.srt.entrypoints.ollama.smart_router import SmartRouter
+
+router = SmartRouter(
+    # Local Ollama
+    local_host="http://localhost:11434",
+    local_model="llama3.2",  # or any Ollama model
+
+    # Remote SGLang
+    remote_host="http://localhost:30001",
+    remote_model="Qwen/Qwen2.5-1.5B-Instruct",  # or any HuggingFace model
+
+    # LLM Judge (optional, defaults to local_model)
+    judge_model="llama3.2",
+)
+```
+
+### Usage
+
+```python
+# Auto-routing via LLM judge
+response = router.chat("Hello!", verbose=True)
+# [Router] LLM Judge: SIMPLE
+# [Router] -> Local Ollama | Model: llama3.2
+
+response = router.chat("Explain quantum computing in detail", verbose=True)
+# [Router] LLM Judge: COMPLEX
+# [Router] -> Remote SGLang | Model: Qwen/Qwen2.5-1.5B-Instruct
+
+# Force routing (skip LLM judge)
+response = router.chat("question", force_local=True)
+response = router.chat("question", force_remote=True)
+
+# Streaming
+for chunk in router.chat_stream("Tell me a story"):
+    print(chunk['message']['content'], end='')
+```
+
+---
+
+## Value
+
+- **Ollama**: Simple CLI/API developers already know
+- **SGLang**: High-performance inference
+- **Smart Router**: Intelligent routing - fast local for simple tasks, powerful remote for complex tasks
diff --git a/sglang/python/sglang/srt/entrypoints/ollama/__init__.py b/sglang/python/sglang/srt/entrypoints/ollama/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf572ac2f122a5d4a37f106a4e032a221074db23
--- /dev/null
+++ b/sglang/python/sglang/srt/entrypoints/ollama/__init__.py
@@ -0,0 +1 @@
+# Ollama-compatible API for SGLang
diff --git a/sglang/python/sglang/srt/entrypoints/ollama/__pycache__/__init__.cpython-311.pyc b/sglang/python/sglang/srt/entrypoints/ollama/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f46ece17a8c0f252ae6469411b03e07a966d2c69
Binary files /dev/null and b/sglang/python/sglang/srt/entrypoints/ollama/__pycache__/__init__.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/entrypoints/ollama/__pycache__/protocol.cpython-311.pyc b/sglang/python/sglang/srt/entrypoints/ollama/__pycache__/protocol.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dd8a767364a8df4ac3d93121f3560a7dd95063fb
Binary files /dev/null and b/sglang/python/sglang/srt/entrypoints/ollama/__pycache__/protocol.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/entrypoints/ollama/__pycache__/serving.cpython-311.pyc b/sglang/python/sglang/srt/entrypoints/ollama/__pycache__/serving.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..275b2494b0d64ed9a6aab13861542216aeb84e17
Binary files /dev/null and b/sglang/python/sglang/srt/entrypoints/ollama/__pycache__/serving.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/entrypoints/ollama/protocol.py b/sglang/python/sglang/srt/entrypoints/ollama/protocol.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7e18e2eb5d3feb4e036d899a15929576730425e
--- /dev/null
+++ b/sglang/python/sglang/srt/entrypoints/ollama/protocol.py
@@ -0,0 +1,137 @@
+"""
+Ollama-compatible API protocol definitions.
+
+These models match the Ollama API format:
+https://github.com/ollama/ollama/blob/main/docs/api.md
+"""
+
+from typing import Any, Dict, List, Literal, Optional, Union
+
+from pydantic import BaseModel, Field
+
+
+class OllamaMessage(BaseModel):
+    """Ollama message format."""
+
+    role: str
+    content: str
+    images: Optional[List[str]] = None
+
+
+class OllamaChatRequest(BaseModel):
+    """Ollama /api/chat request format."""
+
+    model: str
+    messages: List[OllamaMessage]
+    stream: bool = True
+    format: Optional[Union[Literal["json"], Dict[str, Any]]] = None
+    options: Optional[Dict[str, Any]] = None
+    keep_alive: Optional[Union[float, str]] = None
+    think: Optional[Union[bool, Literal["low", "medium", "high"]]] = None
+
+
+class OllamaChatResponse(BaseModel):
+    """Ollama /api/chat response format (non-streaming)."""
+
+    model: str
+    created_at: str
+    message: OllamaMessage
+    done: bool = True
+    done_reason: Optional[str] = "stop"
+    total_duration: Optional[int] = None
+    load_duration: Optional[int] = None
+    prompt_eval_count: Optional[int] = None
+    prompt_eval_duration: Optional[int] = None
+    eval_count: Optional[int] = None
+    eval_duration: Optional[int] = None
+
+
+class OllamaChatStreamResponse(BaseModel):
+    """Ollama /api/chat streaming response chunk."""
+
+    model: str
+    created_at: str
+    message: OllamaMessage
+    done: bool = False
+    done_reason: Optional[str] = None
+
+
+class OllamaGenerateRequest(BaseModel):
+    """Ollama /api/generate request format."""
+
+    model: str
+    prompt: str
+    suffix: Optional[str] = None
+    system: Optional[str] = None
+    template: Optional[str] = None
+    context: Optional[List[int]] = None
+    stream: bool = True
+    raw: bool = False
+    format: Optional[Union[Literal["json"], Dict[str, Any]]] = None
+    options: Optional[Dict[str, Any]] = None
+    keep_alive: Optional[Union[float, str]] = None
+    images: Optional[List[str]] = None
+    think: Optional[bool] = None
+
+
+class OllamaGenerateResponse(BaseModel):
+    """Ollama /api/generate response format (non-streaming)."""
+
+    model: str
+    created_at: str
+    response: str
+    done: bool = True
+    done_reason: Optional[str] = "stop"
+    context: Optional[List[int]] = None
+    total_duration: Optional[int] = None
+    load_duration: Optional[int] = None
+    prompt_eval_count: Optional[int] = None
+    prompt_eval_duration: Optional[int] = None
+    eval_count: Optional[int] = None
+    eval_duration: Optional[int] = None
+
+
+class OllamaGenerateStreamResponse(BaseModel):
+    """Ollama /api/generate streaming response chunk."""
+
+    model: str
+    created_at: str
+    response: str
+    done: bool = False
+    done_reason: Optional[str] = None
+
+
+class OllamaModelInfo(BaseModel):
+    """Model information for /api/tags response."""
+
+    name: str
+    model: str
+    modified_at: str
+    size: int
+    digest: str
+    details: Optional[Dict[str, Any]] = None
+
+
+class OllamaTagsResponse(BaseModel):
+    """Ollama /api/tags response format."""
+
+    models: List[OllamaModelInfo]
+
+
+class OllamaShowRequest(BaseModel):
+    """Ollama /api/show request format."""
+
+    model: str
+
+
+class OllamaShowResponse(BaseModel):
+    """Ollama /api/show response format."""
+
+    license: str = ""
+    modelfile: str = ""
+    parameters: str = ""
+    template: str = ""
+    modified_at: str = ""
+    details: Dict[str, Any] = Field(default_factory=dict)
+    model_info: Dict[str, Any] = Field(default_factory=dict)
+    capabilities: List[str] = Field(default_factory=list)
diff --git a/sglang/python/sglang/srt/entrypoints/ollama/serving.py b/sglang/python/sglang/srt/entrypoints/ollama/serving.py
new file mode 100644
index 0000000000000000000000000000000000000000..786ff43ebf4d587c5bdca6cc7186fdcc520017ff
--- /dev/null
+++ b/sglang/python/sglang/srt/entrypoints/ollama/serving.py
@@ -0,0 +1,349 @@
+"""
+Ollama-compatible API serving handlers.
+
+This module provides handlers that convert Ollama API requests to SGLang's
+internal format and return Ollama-compatible responses.
+"""
+
+import time
+from datetime import datetime, timezone
+from typing import AsyncIterator, Union
+
+import orjson
+from fastapi import Request
+from fastapi.responses import StreamingResponse
+
+from sglang.srt.entrypoints.ollama.protocol import (
+    OllamaChatRequest,
+    OllamaChatResponse,
+    OllamaChatStreamResponse,
+    OllamaGenerateRequest,
+    OllamaGenerateResponse,
+    OllamaGenerateStreamResponse,
+    OllamaMessage,
+    OllamaModelInfo,
+    OllamaShowResponse,
+    OllamaTagsResponse,
+)
+from sglang.srt.managers.io_struct import GenerateReqInput
+
+
+class OllamaServing:
+    """Handler for Ollama-compatible API endpoints."""
+
+    def __init__(self, tokenizer_manager):
+        self.tokenizer_manager = tokenizer_manager
+
+    def _get_timestamp(self) -> str:
+        """Get current timestamp in Ollama format."""
+        return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z"
+
+    def _convert_options_to_sampling_params(self, options: dict = None) -> dict:
+        """Convert Ollama options to SGLang sampling params."""
+        sampling_params = {}
+
+        if options:
+            # Map Ollama options to SGLang params
+            param_mapping = {
+                "temperature": "temperature",
+                "top_p": "top_p",
+                "top_k": "top_k",
+                "num_predict": "max_new_tokens",
+                "stop": "stop",
+                "presence_penalty": "presence_penalty",
+                "frequency_penalty": "frequency_penalty",
+                "seed": "seed",
+            }
+            for ollama_param, sglang_param in param_mapping.items():
+                if ollama_param in options:
+                    sampling_params[sglang_param] = options[ollama_param]
+
+        # Set a reasonable default for max_new_tokens if not specified
+        # Ollama users typically expect longer responses than SGLang's default (128)
+        if "max_new_tokens" not in sampling_params:
+            sampling_params["max_new_tokens"] = 2048
+
+        return sampling_params
+
+    async def handle_chat(
+        self, request: OllamaChatRequest, raw_request: Request
+    ) -> Union[OllamaChatResponse, StreamingResponse]:
+        """Handle /api/chat endpoint."""
+        model_name = self.tokenizer_manager.served_model_name
+
+        # Convert messages to SGLang format
+        messages = [
+            {"role": msg.role, "content": msg.content} for msg in request.messages
+        ]
+
+        # Apply chat template using tokenizer
+        prompt_ids = self.tokenizer_manager.tokenizer.apply_chat_template(
+            messages,
+            tokenize=True,
+            add_generation_prompt=True,
+        )
+
+        # Convert options to sampling params
+        sampling_params = self._convert_options_to_sampling_params(request.options)
+
+        # Create SGLang request with input_ids
+        gen_request = GenerateReqInput(
+            input_ids=prompt_ids,
+            sampling_params=sampling_params,
+            stream=request.stream,
+        )
+
+        if request.stream:
+            return await self._stream_chat_response(
+                gen_request, raw_request, model_name
+            )
+        else:
+            return await self._generate_chat_response(
+                gen_request, raw_request, model_name
+            )
+
+    async def _generate_chat_response(
+        self, gen_request: GenerateReqInput, raw_request: Request, model_name: str
+    ) -> OllamaChatResponse:
+        """Generate non-streaming chat response."""
+        start_time = time.time_ns()
+
+        # Get response from tokenizer manager
+        response = await self.tokenizer_manager.generate_request(
+            gen_request, raw_request
+        ).__anext__()
+
+        end_time = time.time_ns()
+        total_duration = end_time - start_time
+
+        output_text = response.get("text", "")
+
+        return OllamaChatResponse(
+            model=model_name,
+            created_at=self._get_timestamp(),
+            message=OllamaMessage(role="assistant", content=output_text),
+            done=True,
+            done_reason="stop",
+            total_duration=total_duration,
+            prompt_eval_count=response.get("meta_info", {}).get("prompt_tokens", None),
+            eval_count=response.get("meta_info", {}).get("completion_tokens", None),
+        )
+
+    async def _stream_chat_response(
+        self, gen_request: GenerateReqInput, raw_request: Request, model_name: str
+    ) -> StreamingResponse:
+        """Generate streaming chat response."""
+
+        async def generate_stream() -> AsyncIterator[bytes]:
+            previous_text = ""
+            async for chunk in self.tokenizer_manager.generate_request(
+                gen_request, raw_request
+            ):
+                text = chunk.get("text", "")
+                is_done = chunk.get("meta_info", {}).get("finish_reason") is not None
+
+                # Calculate delta (new text since last chunk)
+                delta = text[len(previous_text) :]
+                previous_text = text
+
+                if is_done:
+                    # Final chunk
+                    response = OllamaChatStreamResponse(
+                        model=model_name,
+                        created_at=self._get_timestamp(),
+                        message=OllamaMessage(role="assistant", content=""),
+                        done=True,
+                        done_reason="stop",
+                    )
+                else:
+                    response = OllamaChatStreamResponse(
+                        model=model_name,
+                        created_at=self._get_timestamp(),
+                        message=OllamaMessage(role="assistant", content=delta),
+                        done=False,
+                    )
+
+                yield orjson.dumps(response.model_dump()) + b"\n"
+
+        return StreamingResponse(
+            generate_stream(),
+            media_type="application/x-ndjson",
+        )
+
+    async def handle_generate(
+        self, request: OllamaGenerateRequest, raw_request: Request
+    ) -> Union[OllamaGenerateResponse, StreamingResponse]:
+        """Handle /api/generate endpoint."""
+        model_name = self.tokenizer_manager.served_model_name
+
+        # Build prompt
+        prompt = request.prompt
+        if request.system:
+            prompt = f"{request.system}\n\n{prompt}"
+
+        # Handle empty prompt - Ollama CLI sends empty requests on initialization
+        if not prompt or not prompt.strip():
+            empty_response = OllamaGenerateResponse(
+                model=model_name,
+                created_at=self._get_timestamp(),
+                response="",
+                done=True,
+                done_reason="stop",
+            )
+            if request.stream:
+                # Return streaming response with done=True
+                async def empty_stream() -> AsyncIterator[bytes]:
+                    yield orjson.dumps(empty_response.model_dump()) + b"\n"
+
+                return StreamingResponse(
+                    empty_stream(),
+                    media_type="application/x-ndjson",
+                )
+            return empty_response
+
+        # Convert options to sampling params
+        sampling_params = self._convert_options_to_sampling_params(request.options)
+
+        # Create SGLang request
+        gen_request = GenerateReqInput(
+            text=prompt,
+            sampling_params=sampling_params,
+            stream=request.stream,
+        )
+
+        if request.stream:
+            return await self._stream_generate_response(
+                gen_request, raw_request, model_name
+            )
+        else:
+            return await self._generate_generate_response(
+                gen_request, raw_request, model_name
+            )
+
+    async def _generate_generate_response(
+        self, gen_request: GenerateReqInput, raw_request: Request, model_name: str
+    ) -> OllamaGenerateResponse:
+        """Generate non-streaming generate response."""
+        start_time = time.time_ns()
+
+        response = await self.tokenizer_manager.generate_request(
+            gen_request, raw_request
+        ).__anext__()
+
+        end_time = time.time_ns()
+        total_duration = end_time - start_time
+
+        output_text = response.get("text", "")
+
+        return OllamaGenerateResponse(
+            model=model_name,
+            created_at=self._get_timestamp(),
+            response=output_text,
+            done=True,
+            done_reason="stop",
+            total_duration=total_duration,
+            prompt_eval_count=response.get("meta_info", {}).get("prompt_tokens", None),
+            eval_count=response.get("meta_info", {}).get("completion_tokens", None),
+        )
+
+    async def _stream_generate_response(
+        self, gen_request: GenerateReqInput, raw_request: Request, model_name: str
+    ) -> StreamingResponse:
+        """Generate streaming generate response."""
+
+        async def generate_stream() -> AsyncIterator[bytes]:
+            previous_text = ""
+            async for chunk in self.tokenizer_manager.generate_request(
+                gen_request, raw_request
+            ):
+                text = chunk.get("text", "")
+                is_done = chunk.get("meta_info", {}).get("finish_reason") is not None
+
+                # Calculate delta (new text since last chunk)
+                delta = text[len(previous_text) :]
+                previous_text = text
+
+                if is_done:
+                    response = OllamaGenerateStreamResponse(
+                        model=model_name,
+                        created_at=self._get_timestamp(),
+                        response="",
+                        done=True,
+                        done_reason="stop",
+                    )
+                else:
+                    response = OllamaGenerateStreamResponse(
+                        model=model_name,
+                        created_at=self._get_timestamp(),
+                        response=delta,
+                        done=False,
+                    )
+
+                yield orjson.dumps(response.model_dump()) + b"\n"
+
+        return StreamingResponse(
+            generate_stream(),
+            media_type="application/x-ndjson",
+        )
+
+    def get_tags(self) -> OllamaTagsResponse:
+        """Handle /api/tags endpoint - list available models."""
+        model_name = self.tokenizer_manager.served_model_name
+
+        model_info = OllamaModelInfo(
+            name=model_name,
+            model=model_name,
+            modified_at=self._get_timestamp(),
+            size=0,  # We don't track model size
+            digest="sha256:sglang0000000000000000000000000000000000000000000000000000000000",
+            details={
+                "format": "sglang",
+                "family": (
+                    model_name.split("/")[-1] if "/" in model_name else model_name
+                ),
+                "parameter_size": "unknown",
+            },
+        )
+
+        return OllamaTagsResponse(models=[model_info])
+
+    def get_show(self, model: str) -> OllamaShowResponse:
+        """Handle /api/show endpoint - show model information."""
+        model_config = self.tokenizer_manager.model_config
+
+        # Extract model family from model name
+        model_family = model.split("/")[-1] if "/" in model else model
+        # Remove common suffixes to get base family
+        for suffix in ["-Instruct", "-Chat", "-Base"]:
+            if model_family.endswith(suffix):
+                model_family = model_family[: -len(suffix)]
+                break
+
+        # Build context length info
+        context_len = model_config.context_len if model_config else 4096
+
+        return OllamaShowResponse(
+            license="",  # License info not available from SGLang
+            modelfile=f"FROM {model}\nPARAMETER num_ctx {context_len}\n",
+            parameters=f"num_ctx {context_len}",
+            template="",  # Template info not easily accessible
+            modified_at=self._get_timestamp(),
+            details={
+                "parent_model": "",
+                "format": "sglang",
+                "family": model_family,
+                "families": [model_family],
+                "parameter_size": "unknown",
+                "quantization_level": "",
+            },
+            model_info={
+                "general.architecture": model_family,
+                "general.name": model,
+                "general.parameter_count": 0,
+                f"{model_family}.context_length": context_len,
+                f"{model_family}.block_count": 0,
+                f"{model_family}.embedding_length": 0,
+                f"{model_family}.attention.head_count": 0,
+            },
+            capabilities=["completion"],
+        )
diff --git a/sglang/python/sglang/srt/entrypoints/ollama/smart_router.py b/sglang/python/sglang/srt/entrypoints/ollama/smart_router.py
new file mode 100644
index 0000000000000000000000000000000000000000..de6fdddeeb579e80a3f8c9ff5b46f3b9d9d48704
--- /dev/null
+++ b/sglang/python/sglang/srt/entrypoints/ollama/smart_router.py
@@ -0,0 +1,296 @@
+"""
+Smart Router: Automatically routes requests between local Ollama and remote SGLang.
+
+Uses an LLM judge to classify tasks as simple or complex, then routes accordingly:
+- Simple tasks → Local Ollama (fast response)
+- Complex tasks → Remote SGLang (powerful model)
+
+Usage:
+    from sglang.srt.entrypoints.ollama.smart_router import SmartRouter
+
+    router = SmartRouter(
+        local_host="http://localhost:11434",
+        remote_host="http://sglang-server:30001",
+    )
+    response = router.chat("Hello!")
+"""
+
+from typing import Optional
+
+import ollama
+
+
+class SmartRouter:
+    """Routes requests between local Ollama and remote SGLang using LLM-based classification."""
+
+    # Classification prompt for LLM judge
+    CLASSIFICATION_PROMPT = """You are a task classifier. Classify the following user request into one of two categories.
+
+Categories:
+- SIMPLE: Quick responses, greetings, factual questions, definitions, translations, basic Q&A
+- COMPLEX: Tasks requiring deep reasoning, multi-step analysis, long explanations, creative writing, detailed research
+
+Reply with ONLY one word: either SIMPLE or COMPLEX.
+
+User request: "{prompt}"
+
+Category:"""
+
+    def __init__(
+        self,
+        local_host: str = "http://localhost:11434",
+        remote_host: str = "http://localhost:30001",
+        local_model: str = "llama3.2",
+        remote_model: str = "Qwen/Qwen2.5-1.5B-Instruct",
+        judge_model: Optional[str] = None,
+        judge_host: Optional[str] = None,
+    ):
+        """
+        Initialize the smart router.
+
+        Args:
+            local_host: URL of local Ollama server
+            remote_host: URL of remote SGLang server
+            local_model: Model name for local Ollama
+            remote_model: Model name for remote SGLang
+            judge_model: Model for LLM-based classification (default: same as local_model)
+            judge_host: Host for judge model (default: same as local_host)
+        """
+        self.local_client = ollama.Client(host=local_host)
+        self.remote_client = ollama.Client(host=remote_host)
+        self.local_model = local_model
+        self.remote_model = remote_model
+
+        # Judge model configuration
+        self.judge_model = judge_model or local_model
+        self.judge_host = judge_host or local_host
+        self.judge_client = ollama.Client(host=self.judge_host)
+
+    def _classify_with_llm(
+        self, prompt: str, verbose: bool = False
+    ) -> tuple[bool, str]:
+        """
+        Use LLM to classify the prompt.
+
+        Returns:
+            Tuple of (use_remote, reason)
+        """
+        try:
+            classification_prompt = self.CLASSIFICATION_PROMPT.format(
+                prompt=prompt[:500]  # Limit prompt length for classification
+            )
+
+            response = self.judge_client.chat(
+                model=self.judge_model,
+                messages=[{"role": "user", "content": classification_prompt}],
+                options={"temperature": 0, "num_predict": 10},
+            )
+
+            result = response["message"]["content"].strip().upper()
+
+            if verbose:
+                print(f"[Router] LLM Judge: {result}")
+
+            if "COMPLEX" in result:
+                return True, "Complex task"
+            else:
+                return False, "Simple task"
+
+        except Exception as e:
+            if verbose:
+                print(f"[Router] LLM Judge failed: {e}, defaulting to local")
+            return False, "Judge failed, defaulting to local"
+
+    def should_use_remote(self, prompt: str, verbose: bool = False) -> tuple[bool, str]:
+        """
+        Determine if the prompt should be routed to remote SGLang.
+
+        Args:
+            prompt: User's input prompt
+            verbose: Print debug information
+
+        Returns:
+            Tuple of (should_use_remote, reason)
+        """
+        return self._classify_with_llm(prompt, verbose)
+
+    def chat(
+        self,
+        prompt: str,
+        messages: Optional[list] = None,
+        verbose: bool = False,
+        force_local: bool = False,
+        force_remote: bool = False,
+    ) -> dict:
+        """
+        Route the request and get response.
+
+        Args:
+            prompt: User's input (used if messages is None)
+            messages: Full message history (overrides prompt if provided)
+            verbose: Print routing decision
+            force_local: Force use of local model
+            force_remote: Force use of remote model
+
+        Returns:
+            Response dict with 'content', 'model', 'location', 'reason' keys
+        """
+        # Build messages
+        if messages is None:
+            messages = [{"role": "user", "content": prompt}]
+            check_prompt = prompt
+        else:
+            # Use the last user message for routing decision
+            check_prompt = ""
+            for msg in reversed(messages):
+                if msg.get("role") == "user":
+                    check_prompt = msg.get("content", "")
+                    break
+
+        # Determine routing
+        if force_remote:
+            use_remote, reason = True, "Forced remote"
+        elif force_local:
+            use_remote, reason = False, "Forced local"
+        else:
+            use_remote, reason = self.should_use_remote(check_prompt, verbose)
+
+        if use_remote:
+            client = self.remote_client
+            model = self.remote_model
+            location = "Remote SGLang"
+        else:
+            client = self.local_client
+            model = self.local_model
+            location = "Local Ollama"
+
+        if verbose:
+            print(f"[Router] -> {location} | Model: {model}")
+
+        try:
+            response = client.chat(model=model, messages=messages)
+            return {
+                "content": response["message"]["content"],
+                "model": model,
+                "location": location,
+                "reason": reason,
+            }
+        except Exception as e:
+            # Fallback to the other option
+            if verbose:
+                print(f"[Router] {location} failed: {e}, falling back...")
+
+            fallback_client = (
+                self.remote_client if not use_remote else self.local_client
+            )
+            fallback_model = self.remote_model if not use_remote else self.local_model
+            fallback_location = "Remote SGLang" if not use_remote else "Local Ollama"
+
+            response = fallback_client.chat(model=fallback_model, messages=messages)
+            return {
+                "content": response["message"]["content"],
+                "model": fallback_model,
+                "location": fallback_location,
+                "reason": f"Fallback from {location}",
+            }
+
+    def chat_stream(
+        self,
+        prompt: str,
+        messages: Optional[list] = None,
+        verbose: bool = False,
+        force_local: bool = False,
+        force_remote: bool = False,
+    ):
+        """
+        Route the request and stream response.
+
+        Yields:
+            Response chunks
+        """
+        if messages is None:
+            messages = [{"role": "user", "content": prompt}]
+            check_prompt = prompt
+        else:
+            check_prompt = ""
+            for msg in reversed(messages):
+                if msg.get("role") == "user":
+                    check_prompt = msg.get("content", "")
+                    break
+
+        if force_remote:
+            use_remote, reason = True, "Forced remote"
+        elif force_local:
+            use_remote, reason = False, "Forced local"
+        else:
+            use_remote, reason = self.should_use_remote(check_prompt, verbose)
+
+        if use_remote:
+            client = self.remote_client
+            model = self.remote_model
+            location = "Remote SGLang"
+        else:
+            client = self.local_client
+            model = self.local_model
+            location = "Local Ollama"
+
+        if verbose:
+            print(f"[Router] -> {location} | Model: {model}")
+
+        for chunk in client.chat(model=model, messages=messages, stream=True):
+            yield chunk
+
+
+def main():
+    """Interactive demo of the smart router."""
+    print("=" * 60)
+    print("Smart Router: Local Ollama <-> Remote SGLang")
+    print("=" * 60)
+    print("\nRouting strategy:")
+    print("  LLM Judge classifies each request as SIMPLE or COMPLEX")
+    print("  - SIMPLE tasks -> Local Ollama (fast)")
+    print("  - COMPLEX tasks -> Remote SGLang (powerful)")
+    print("\nType 'quit' to exit\n")
+
+    router = SmartRouter(
+        local_host="http://localhost:11434",
+        remote_host="http://localhost:30001",
+        local_model="llama3.2",
+        remote_model="Qwen/Qwen2.5-1.5B-Instruct",
+    )
+
+    messages = []
+    while True:
+        try:
+            user_input = input("You: ").strip()
+            if user_input.lower() in ["quit", "exit", "q"]:
+                print("Goodbye!")
+                break
+            if not user_input:
+                continue
+
+            messages.append({"role": "user", "content": user_input})
+
+            # Use streaming for real-time output
+            print("\nAssistant: ", end="", flush=True)
+            full_response = ""
+            for chunk in router.chat_stream(
+                prompt=user_input, messages=messages, verbose=True
+            ):
+                content = chunk.get("message", {}).get("content", "")
+                if content:
+                    print(content, end="", flush=True)
+                    full_response += content
+            print("\n")
+
+            messages.append({"role": "assistant", "content": full_response})
+
+        except KeyboardInterrupt:
+            print("\nGoodbye!")
+            break
+        except Exception as e:
+            print(f"Error: {e}\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/sglang/python/sglang/srt/entrypoints/openai/__init__.py b/sglang/python/sglang/srt/entrypoints/openai/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/sglang/python/sglang/srt/entrypoints/openai/__pycache__/__init__.cpython-311.pyc b/sglang/python/sglang/srt/entrypoints/openai/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5c44e5e250eb93993b88ec46cfbc0f1cc387838d
Binary files /dev/null and b/sglang/python/sglang/srt/entrypoints/openai/__pycache__/__init__.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/entrypoints/openai/__pycache__/encoding_dsv32.cpython-311.pyc b/sglang/python/sglang/srt/entrypoints/openai/__pycache__/encoding_dsv32.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1b649f5adb926e8df98accd35b88467292bd66af
Binary files /dev/null and b/sglang/python/sglang/srt/entrypoints/openai/__pycache__/encoding_dsv32.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/entrypoints/openai/__pycache__/protocol.cpython-311.pyc b/sglang/python/sglang/srt/entrypoints/openai/__pycache__/protocol.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..76e2e8049e5b5c858addbfd6d9dfd2aa8e05e5eb
Binary files /dev/null and b/sglang/python/sglang/srt/entrypoints/openai/__pycache__/protocol.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/entrypoints/openai/__pycache__/serving_base.cpython-311.pyc b/sglang/python/sglang/srt/entrypoints/openai/__pycache__/serving_base.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fc80d1181307451a4463cc95a550eec6798a813c
Binary files /dev/null and b/sglang/python/sglang/srt/entrypoints/openai/__pycache__/serving_base.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/entrypoints/openai/__pycache__/serving_chat.cpython-311.pyc b/sglang/python/sglang/srt/entrypoints/openai/__pycache__/serving_chat.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..434f32d6a2a101ec17bcd3fb4b36b7b88ce10e6b
Binary files /dev/null and b/sglang/python/sglang/srt/entrypoints/openai/__pycache__/serving_chat.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/entrypoints/openai/__pycache__/serving_classify.cpython-311.pyc b/sglang/python/sglang/srt/entrypoints/openai/__pycache__/serving_classify.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..22e8d75cbb5b7b7dfbc148321d8cca6160a8c850
Binary files /dev/null and b/sglang/python/sglang/srt/entrypoints/openai/__pycache__/serving_classify.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/entrypoints/openai/__pycache__/serving_completions.cpython-311.pyc b/sglang/python/sglang/srt/entrypoints/openai/__pycache__/serving_completions.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b188509b9c757d8b7896ec2e1997719f10df6bba
Binary files /dev/null and b/sglang/python/sglang/srt/entrypoints/openai/__pycache__/serving_completions.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/entrypoints/openai/__pycache__/serving_embedding.cpython-311.pyc b/sglang/python/sglang/srt/entrypoints/openai/__pycache__/serving_embedding.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1ade8267964669f422ecda36b58b81ecf04cb75d
Binary files /dev/null and b/sglang/python/sglang/srt/entrypoints/openai/__pycache__/serving_embedding.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/entrypoints/openai/__pycache__/serving_rerank.cpython-311.pyc b/sglang/python/sglang/srt/entrypoints/openai/__pycache__/serving_rerank.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9fa3497f07a2206a219954e132c05a7b877b59a6
Binary files /dev/null and b/sglang/python/sglang/srt/entrypoints/openai/__pycache__/serving_rerank.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/entrypoints/openai/__pycache__/serving_responses.cpython-311.pyc b/sglang/python/sglang/srt/entrypoints/openai/__pycache__/serving_responses.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..652f4de9227d137592d2a1c62f78a6efff7b81bd
Binary files /dev/null and b/sglang/python/sglang/srt/entrypoints/openai/__pycache__/serving_responses.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/entrypoints/openai/__pycache__/serving_score.cpython-311.pyc b/sglang/python/sglang/srt/entrypoints/openai/__pycache__/serving_score.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ab2a5f7e896ac6b5b30047d1c484e78dbe2b96b5
Binary files /dev/null and b/sglang/python/sglang/srt/entrypoints/openai/__pycache__/serving_score.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/entrypoints/openai/__pycache__/serving_tokenize.cpython-311.pyc b/sglang/python/sglang/srt/entrypoints/openai/__pycache__/serving_tokenize.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8edbc8faa2815061436d82f2516ce5215a823734
Binary files /dev/null and b/sglang/python/sglang/srt/entrypoints/openai/__pycache__/serving_tokenize.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/entrypoints/openai/__pycache__/serving_transcription.cpython-311.pyc b/sglang/python/sglang/srt/entrypoints/openai/__pycache__/serving_transcription.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b23c7b3a7510bebc884c0330e52ae49b45d5a6ef
Binary files /dev/null and b/sglang/python/sglang/srt/entrypoints/openai/__pycache__/serving_transcription.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/entrypoints/openai/__pycache__/usage_processor.cpython-311.pyc b/sglang/python/sglang/srt/entrypoints/openai/__pycache__/usage_processor.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1b6f56eefcb4f9c3e5b6ab1af3ecc4f829b56c77
Binary files /dev/null and b/sglang/python/sglang/srt/entrypoints/openai/__pycache__/usage_processor.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/entrypoints/openai/__pycache__/utils.cpython-311.pyc b/sglang/python/sglang/srt/entrypoints/openai/__pycache__/utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0a02456ab690bce85567570dd872c83dd949d9a6
Binary files /dev/null and b/sglang/python/sglang/srt/entrypoints/openai/__pycache__/utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/entrypoints/openai/encoding_dsv32.py b/sglang/python/sglang/srt/entrypoints/openai/encoding_dsv32.py
new file mode 100644
index 0000000000000000000000000000000000000000..8840173cb86e2d267ba24fa9cd3e8e65e18ea07b
--- /dev/null
+++ b/sglang/python/sglang/srt/entrypoints/openai/encoding_dsv32.py
@@ -0,0 +1,464 @@
+# Adapted from https://huggingface.co/deepseek-ai/DeepSeek-V3.2/blob/main/encoding/encoding_dsv32.py
+import copy
+import json
+import re
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+
+class DS32EncodingError(Exception):
+    pass
+
+
+TOOLS_SYSTEM_TEMPLATE = """## Tools
+You have access to a set of tools you can use to answer the user's question.
+You can invoke functions by writing a "<{dsml_token}function_calls>" block like the following as part of your reply to the user:
+<{dsml_token}function_calls>
+<{dsml_token}invoke name="$FUNCTION_NAME">
+<{dsml_token}parameter name="$PARAMETER_NAME" string="true|false">$PARAMETER_VALUE</{dsml_token}parameter>
+...
+</{dsml_token}invoke>
+<{dsml_token}invoke name="$FUNCTION_NAME2">
+...
+</{dsml_token}invoke>
+</{dsml_token}function_calls>
+String and scalar parameters should be specified as is without any escaping or quotes, while lists and objects should use JSON format. The "string" attribute should be set to "true" for string type parameters and "false" for other types (numbers, booleans, arrays, objects).
+If the thinking_mode is enabled, then after function results you should strongly consider outputting a thinking block. Here is an example:
+<{dsml_token}function_calls>
+...
+</{dsml_token}function_calls>
+<function_results>
+...
+</function_results>
+{thinking_start_token}...thinking about results{thinking_end_token}
+Here are the functions available in JSONSchema format:
+<functions>
+{tool_schemas}
+</functions>
+"""
+
+bos_token: str = "<｜begin▁of▁sentence｜>"
+eos_token: str = "<｜end▁of▁sentence｜>"
+thinking_start_token: str = "<think>"
+thinking_end_token: str = "</think>"
+dsml_token: str = "｜DSML｜"
+system_msg_template: str = "{content}"
+user_msg_template: str = "<｜User｜>{content}<｜Assistant｜>"
+assistant_msg_template: str = "{reasoning}{content}{tool_calls}<｜end▁of▁sentence｜>"
+thinking_template = "{reasoning_content}"
+
+response_format_template: str = (
+    "## Response Format:\n\nYou MUST strictly adhere to the following schema to reply:\n{schema}"
+)
+tool_call_template: str = (
+    '<{dsml_token}invoke name="{name}">\n{arguments}\n</{dsml_token}invoke>'
+)
+tool_calls_template = (
+    "<{dsml_token}function_calls>\n{tool_calls}\n</{dsml_token}function_calls>"
+)
+
+tool_output_template: str = "\n<result>{content}</result>"
+
+
+def to_json(value: Any) -> str:
+    try:
+        return json.dumps(value, ensure_ascii=False)
+    except:
+        return json.dumps(value, ensure_ascii=True)
+
+
+def tools_from_openai_format(tools):
+    return [tool["function"] for tool in tools]
+
+
+def tool_calls_from_openai_format(tool_calls):
+    return [
+        {
+            "name": tool_call["function"]["name"],
+            "arguments": tool_call["function"]["arguments"],
+        }
+        for tool_call in tool_calls
+    ]
+
+
+def tool_calls_to_openai_format(tool_calls):
+    return [
+        {
+            "type": "function",
+            "function": {
+                "name": tool_call["name"],
+                "arguments": tool_call["arguments"],
+            },
+        }
+        for tool_call in tool_calls
+    ]
+
+
+def encode_arguments_to_dsml(tool_call: Dict[str, str]) -> str:
+    p_dsml_template = """<{dsml_token}parameter name="{key}" string="{is_str}">{value}</{dsml_token}parameter>"""
+    P_dsml_strs = []
+
+    arguments = json.loads(tool_call["arguments"])
+
+    for k, v in arguments.items():
+        p_dsml_str = p_dsml_template.format(
+            dsml_token=dsml_token,
+            key=k,
+            is_str="true" if isinstance(v, str) else "false",
+            value=v if isinstance(v, str) else to_json(v),
+        )
+
+        P_dsml_strs.append(p_dsml_str)
+
+    return "\n".join(P_dsml_strs)
+
+
+def decode_dsml_to_arguments(
+    tool_name: str, tool_args: Dict[str, Tuple[str, str]]
+) -> Dict[str, str]:
+    def _decode_value(key: str, value: str, string: str):
+        if string == "true":
+            value = to_json(value)
+        return f"{to_json(key)}: {value}"
+
+    tool_args_json = (
+        "{"
+        + ", ".join(
+            [_decode_value(k, v, string=is_str) for k, (v, is_str) in tool_args.items()]
+        )
+        + "}"
+    )
+    return dict(name=tool_name, arguments=tool_args_json)
+
+
+def render_tools(tools: List[Dict[str, Union[str, Dict[str, Any]]]]) -> str:
+    tools_json = [to_json(t) for t in tools]
+
+    return TOOLS_SYSTEM_TEMPLATE.format(
+        tool_schemas="\n".join(tools_json),
+        dsml_token=dsml_token,
+        thinking_start_token=thinking_start_token,
+        thinking_end_token=thinking_end_token,
+    )
+
+
+def find_last_user_index(messages: List[Dict[str, Any]]) -> int:
+    last_user_index = -1
+    for idx in range(len(messages) - 1, -1, -1):
+        if messages[idx].get("role") in ["user", "developer"]:
+            last_user_index = idx
+            break
+    return last_user_index
+
+
+def render_message(
+    index: int, messages: List[Dict[str, Any]], thinking_mode: str
+) -> str:
+    if not (0 <= index < len(messages)):
+        raise DS32EncodingError(
+            f"Index {index} out of range for messages list of length {len(messages)}"
+        )
+    if thinking_mode not in ["chat", "thinking"]:
+        raise DS32EncodingError(f"Invalid thinking_mode `{thinking_mode}`")
+
+    prompt = ""
+    msg = messages[index]
+    last_user_idx = find_last_user_index(messages)
+
+    role = msg.get("role")
+    content = msg.get("content")
+    tools = msg.get("tools")
+    response_format = msg.get("response_format")
+    tool_calls = msg.get("tool_calls")
+    reasoning_content = msg.get("reasoning_content")
+
+    if tools:
+        tools = tools_from_openai_format(tools)
+    if tool_calls:
+        tool_calls = tool_calls_from_openai_format(tool_calls)
+
+    if role == "system":
+        prompt += system_msg_template.format(content=content or "")
+        if tools:
+            prompt += "\n\n" + render_tools(tools)
+
+        if response_format:
+            prompt += "\n\n" + response_format_template.format(
+                schema=to_json(response_format)
+            )
+
+    elif role == "developer":
+        if not content:
+            raise DS32EncodingError(f"Invalid message for role `{role}`: {msg}")
+        content_developer = ""
+        if tools:
+            content_developer += "\n\n" + render_tools(tools)
+
+        if response_format:
+            content_developer += "\n\n" + response_format_template.format(
+                schema=to_json(response_format)
+            )
+
+        content_developer += "\n\n# The user's message is: {}".format(content)
+
+        prompt += user_msg_template.format(content=content_developer)
+        if index == last_user_idx and thinking_mode == "thinking":
+            prompt += thinking_start_token
+        else:
+            prompt += thinking_end_token
+
+    elif role == "user":
+        prompt += user_msg_template.format(content=content)
+
+        if index == last_user_idx and thinking_mode == "thinking":
+            prompt += thinking_start_token
+        else:
+            prompt += thinking_end_token
+
+    elif role == "tool":
+        prev_assistant_idx = index - 1
+        assistant_msg = messages[prev_assistant_idx]
+        while prev_assistant_idx >= 0 and assistant_msg.get("role") == "tool":
+            prev_assistant_idx -= 1
+            assistant_msg = messages[prev_assistant_idx]
+
+        if not (
+            index == 0
+            or (prev_assistant_idx >= 0 and assistant_msg.get("role") == "assistant")
+        ):
+            raise DS32EncodingError(f"Invalid messages at {index}:\n{assistant_msg}")
+
+        tool_call_order = index - prev_assistant_idx
+        assistant_tool_calls = assistant_msg.get("tool_calls")
+        if not (assistant_tool_calls and len(assistant_tool_calls) >= tool_call_order):
+            raise DS32EncodingError("No tool calls but found tool output")
+
+        if tool_call_order == 1:
+            prompt += "\n\n<function_results>"
+
+        prompt += tool_output_template.format(content=content)
+
+        if tool_call_order == len(assistant_tool_calls):
+            prompt += "\n</function_results>"
+
+            if index >= last_user_idx and thinking_mode == "thinking":
+                prompt += "\n\n" + thinking_start_token
+            else:
+                prompt += "\n\n" + thinking_end_token
+
+    elif role == "assistant":
+        prev_assistant_idx = index
+        thinking_part = ""
+
+        tool_calls_content = ""
+        if tool_calls:
+            tool_calls = [
+                tool_call_template.format(
+                    dsml_token=dsml_token,
+                    name=tool_call.get("name"),
+                    arguments=encode_arguments_to_dsml(tool_call),
+                )
+                for tool_call in tool_calls
+            ]
+            tool_calls_content += "\n\n" + tool_calls_template.format(
+                dsml_token=dsml_token, tool_calls="\n".join(tool_calls)
+            )
+
+        summary_content = content or ""
+
+        if thinking_mode == "thinking" and index > last_user_idx:
+            if not (reasoning_content or tool_calls):
+                raise DS32EncodingError(
+                    f"ThinkingMode: {thinking_mode}, invalid message without reasoning_content/tool_calls `{msg}` after last user message"
+                )
+            thinking_part = (
+                thinking_template.format(reasoning_content=reasoning_content or "")
+                + thinking_end_token
+            )
+
+        prompt += assistant_msg_template.format(
+            reasoning=thinking_part,
+            content=summary_content,
+            tool_calls=tool_calls_content,
+        )
+    else:
+        raise NotImplementedError(f"Unknown role: {role}")
+
+    return prompt
+
+
+def drop_thinking_messages(
+    messages: List[Dict[str, Any]], last_user_idx: Optional[int] = None
+) -> List[Dict[str, Any]]:
+    messages_wo_thinking: List[Dict[str, Any]] = []
+    last_user_idx = (
+        find_last_user_index(messages) if last_user_idx is None else last_user_idx
+    )
+    for idx, msg in enumerate(messages):
+        role = msg.get("role")
+        if role in ["user", "system", "tool"] or idx >= last_user_idx:
+            messages_wo_thinking.append(msg)
+            continue
+
+        elif role == "assistant":
+            msg_wo_thinking = copy.copy(msg)
+            msg_wo_thinking.pop("reasoning_content", None)
+            messages_wo_thinking.append(msg_wo_thinking)
+
+    return messages_wo_thinking
+
+
+def encode_messages(
+    messages: List[Dict[str, Any]],
+    thinking_mode: str,
+    context: Optional[List[Dict[str, Any]]] = None,
+    drop_thinking: bool = True,
+    add_default_bos_token: bool = True,
+) -> str:
+    context = context if context else []
+    full_messages = context + messages
+
+    prompt = bos_token if add_default_bos_token and len(context) == 0 else ""
+
+    if thinking_mode == "thinking" and drop_thinking:
+        full_messages = drop_thinking_messages(full_messages)
+
+    for idx in range(len(messages)):
+        prompt += render_message(
+            idx + len(context), full_messages, thinking_mode=thinking_mode
+        )
+
+    return prompt
+
+
+def _read_until_stop(
+    index: int, text: str, stop: List[str]
+) -> Tuple[int, str, Optional[str]]:
+    min_pos = len(text)
+    matched_stop = None
+
+    for s in stop:
+        pos = text.find(s, index)
+        if pos != -1 and pos < min_pos:
+            min_pos = pos
+            matched_stop = s
+
+    if matched_stop:
+        content = text[index:min_pos]
+        return min_pos + len(matched_stop), content, matched_stop
+    else:
+        content = text[index:]
+        return len(text), content, None
+
+
+def parse_tool_calls(index: int, text: str):
+    tool_calls: List[Dict[str, Any]] = []
+    stop_token = None
+    tool_calls_end_token = f"</{dsml_token}function_calls>"
+
+    while index < len(text):
+        index, _, stop_token = _read_until_stop(
+            index, text, [f"<{dsml_token}invoke", tool_calls_end_token]
+        )
+        if _ != ">\n":
+            raise DS32EncodingError("Tool call format error")
+
+        if stop_token == tool_calls_end_token:
+            break
+
+        if stop_token is None:
+            raise DS32EncodingError("Missing special token")
+
+        index, tool_name_content, stop_token = _read_until_stop(
+            index, text, [f"<{dsml_token}parameter", f"</{dsml_token}invoke"]
+        )
+
+        p_tool_name = re.findall(
+            r'^\s*name="(.*?)">\n$', tool_name_content, flags=re.DOTALL
+        )
+        if len(p_tool_name) != 1:
+            raise DS32EncodingError("Tool name format error")
+        tool_name = p_tool_name[0]
+
+        tool_args: Dict[str, Tuple[str, str]] = {}
+        while stop_token == f"<{dsml_token}parameter":
+            index, param_content, stop_token = _read_until_stop(
+                index, text, [f"/{dsml_token}parameter"]
+            )
+
+            param_kv = re.findall(
+                r'^ name="(.*?)" string="(true|false)">(.*?)<$',
+                param_content,
+                flags=re.DOTALL,
+            )
+            if len(param_kv) != 1:
+                raise DS32EncodingError("Parameter format error")
+            param_name, string, param_value = param_kv[0]
+
+            if param_name in tool_args:
+                raise DS32EncodingError("Duplicate parameter name")
+            tool_args[param_name] = (param_value, string)
+
+            index, content, stop_token = _read_until_stop(
+                index, text, [f"<{dsml_token}parameter", f"</{dsml_token}invoke"]
+            )
+            if content != ">\n":
+                raise DS32EncodingError("Parameter format error")
+
+        tool_call = decode_dsml_to_arguments(tool_name=tool_name, tool_args=tool_args)
+        tool_calls.append(tool_call)
+
+    return index, stop_token, tool_calls
+
+
+# NOTE: This function is designed to parse only correctly formatted string and will not attempt to correct malformed output that may be generated by the model.
+def parse_message_from_completion_text(text: str, thinking_mode: str):
+    summary_content, reasoning_content, tool_calls = "", "", []
+    index, stop_token = 0, None
+    tool_calls_start_token = f"\n\n<{dsml_token}function_calls"
+
+    is_thinking, is_tool_calling = thinking_mode == "thinking", False
+
+    if is_thinking:
+        index, content_delta, stop_token = _read_until_stop(
+            index, text, [thinking_end_token, tool_calls_start_token]
+        )
+        reasoning_content = content_delta
+        if stop_token != thinking_end_token:
+            raise DS32EncodingError("Invalid thinking format")
+
+    index, content_delta, stop_token = _read_until_stop(
+        index, text, [eos_token, tool_calls_start_token]
+    )
+    summary_content = content_delta
+    if stop_token == tool_calls_start_token:
+        is_tool_calling = True
+    else:
+        if stop_token != eos_token:
+            raise DS32EncodingError("Invalid summary format")
+
+    if is_tool_calling:
+        index, stop_token, tool_calls = parse_tool_calls(index, text)
+
+        index, tool_ends_text, stop_token = _read_until_stop(index, text, [eos_token])
+        if tool_ends_text:
+            raise DS32EncodingError("Unexpected content after tool calls")
+
+    if not (len(text) == index and stop_token in [eos_token, None]):
+        raise DS32EncodingError("Unexpected content at end")
+
+    for sp_token in [
+        bos_token,
+        eos_token,
+        thinking_start_token,
+        thinking_end_token,
+        dsml_token,
+    ]:
+        if sp_token in summary_content or sp_token in reasoning_content:
+            raise DS32EncodingError("Unexpected special token in content")
+
+    return {
+        "role": "assistant",
+        "content": summary_content,
+        "reasoning_content": reasoning_content,
+        "tool_calls": tool_calls_to_openai_format(tool_calls),
+    }
diff --git a/sglang/python/sglang/srt/entrypoints/openai/protocol.py b/sglang/python/sglang/srt/entrypoints/openai/protocol.py
new file mode 100644
index 0000000000000000000000000000000000000000..1550c707a3118755b02d7c2b05f73feeeafd8168
--- /dev/null
+++ b/sglang/python/sglang/srt/entrypoints/openai/protocol.py
@@ -0,0 +1,1471 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Pydantic models for OpenAI API protocol"""
+
+import logging
+import time
+import uuid
+from dataclasses import dataclass
+from typing import Any, Dict, List, NamedTuple, Optional, Tuple, TypeAlias, Union
+
+from openai.types.responses import (
+    ResponseFunctionToolCall,
+    ResponseInputItemParam,
+    ResponseOutputItem,
+    ResponseOutputMessage,
+    ResponseOutputText,
+    ResponseReasoningItem,
+)
+from openai.types.responses.response import ToolChoice
+from openai.types.responses.tool import Tool
+from pydantic import (
+    BaseModel,
+    Field,
+    field_validator,
+    model_serializer,
+    model_validator,
+)
+from typing_extensions import Literal
+
+try:
+    from xgrammar import StructuralTag
+except:
+    StructuralTag = Any
+
+from sglang.utils import convert_json_schema_to_str
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_MODEL_NAME = "default"
+
+
+class ModelCard(BaseModel):
+    """Model cards."""
+
+    id: str
+    object: str = "model"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    owned_by: str = "sglang"
+    root: Optional[str] = None
+    parent: Optional[str] = None
+    max_model_len: Optional[int] = None
+
+
+class ModelList(BaseModel):
+    """Model list consists of model cards."""
+
+    object: str = "list"
+    data: List[ModelCard] = Field(default_factory=list)
+
+
+class ErrorResponse(BaseModel):
+    object: str = "error"
+    message: str
+    type: str
+    param: Optional[str] = None
+    code: int
+
+
+class LogProbs(BaseModel):
+    text_offset: List[int] = Field(default_factory=list)
+    token_logprobs: List[Optional[float]] = Field(default_factory=list)
+    tokens: List[str] = Field(default_factory=list)
+    top_logprobs: List[Optional[Dict[str, float]]] = Field(default_factory=list)
+
+
+class TopLogprob(BaseModel):
+    token: str
+    bytes: List[int]
+    logprob: float
+
+
+class ChatCompletionTokenLogprob(BaseModel):
+    token: str
+    bytes: List[int]
+    logprob: float
+    top_logprobs: List[TopLogprob]
+
+
+class ChoiceLogprobs(BaseModel):
+    # build for v1/chat/completions response
+    content: List[ChatCompletionTokenLogprob]
+
+
+class CachedTokensDetails(BaseModel):
+    """Detailed breakdown of cached tokens by cache source."""
+
+    device: int = 0  # Tokens from device cache (GPU)
+    host: int = 0  # Tokens from host cache (CPU memory)
+    # L3 storage fields are only present when storage backend is enabled
+    storage: Optional[int] = None  # Tokens from L3 storage backend
+    storage_backend: Optional[str] = None  # Type of storage backend used
+
+    @model_serializer(mode="wrap")
+    def _serialize(self, handler):
+        data = handler(self)
+        # Remove None fields so they don't appear in response when L3 is disabled
+        if self.storage is None:
+            data.pop("storage", None)
+        if self.storage_backend is None:
+            data.pop("storage_backend", None)
+        return data
+
+
+class PromptTokensDetails(BaseModel):
+    """Details about prompt tokens."""
+
+    cached_tokens: int = 0
+
+
+class UsageInfo(BaseModel):
+    prompt_tokens: int = 0
+    total_tokens: int = 0
+    completion_tokens: Optional[int] = 0
+    # Used to return cached tokens info when --enable-cache-report is set
+    prompt_tokens_details: Optional[PromptTokensDetails] = None
+    reasoning_tokens: Optional[int] = 0
+
+
+class StreamOptions(BaseModel):
+    include_usage: Optional[bool] = False
+    continuous_usage_stats: Optional[bool] = False
+
+
+class JsonSchemaResponseFormat(BaseModel):
+    name: str
+    description: Optional[str] = None
+    # use alias to workaround pydantic conflict
+    schema_: Optional[Dict[str, object]] = Field(alias="schema", default=None)
+    strict: Optional[bool] = False
+
+
+class ResponseFormat(BaseModel):
+    type: Literal["text", "json_object", "json_schema"]
+    json_schema: Optional[JsonSchemaResponseFormat] = None
+
+
+class StructuresResponseFormat(BaseModel):
+    begin: str
+    schema_: Optional[Dict[str, object]] = Field(alias="schema", default=None)
+    end: str
+
+
+# NOTE(dark): keep this for backward compatibility
+class LegacyStructuralTagResponseFormat(BaseModel):
+    type: Literal["structural_tag"]
+    structures: List[StructuresResponseFormat]
+    triggers: List[str]
+
+
+StructuralTagResponseFormat: TypeAlias = Union[
+    LegacyStructuralTagResponseFormat, StructuralTag
+]
+
+ToolCallConstraint: TypeAlias = Union[
+    Tuple[Literal["structural_tag"], StructuralTagResponseFormat],
+    Tuple[Literal["json_schema"], Any],  # json_schema can be dict/str/None
+]
+
+
+class FileRequest(BaseModel):
+    # https://platform.openai.com/docs/api-reference/files/create
+    file: bytes  # The File object (not file name) to be uploaded
+    purpose: str = (
+        "batch"  # The intended purpose of the uploaded file, default is "batch"
+    )
+
+
+class FileResponse(BaseModel):
+    id: str
+    object: str = "file"
+    bytes: int
+    created_at: int
+    filename: str
+    purpose: str
+
+
+class FileDeleteResponse(BaseModel):
+    id: str
+    object: str = "file"
+    deleted: bool
+
+
+class BatchRequest(BaseModel):
+    input_file_id: (
+        str  # The ID of an uploaded file that contains requests for the new batch
+    )
+    endpoint: str  # The endpoint to be used for all requests in the batch
+    completion_window: str  # The time frame within which the batch should be processed
+    metadata: Optional[dict] = None  # Optional custom metadata for the batch
+
+
+class BatchResponse(BaseModel):
+    id: str
+    object: str = "batch"
+    endpoint: str
+    errors: Optional[dict] = None
+    input_file_id: str
+    completion_window: str
+    status: str = "validating"
+    output_file_id: Optional[str] = None
+    error_file_id: Optional[str] = None
+    created_at: int
+    in_progress_at: Optional[int] = None
+    expires_at: Optional[int] = None
+    finalizing_at: Optional[int] = None
+    completed_at: Optional[int] = None
+    failed_at: Optional[int] = None
+    expired_at: Optional[int] = None
+    cancelling_at: Optional[int] = None
+    cancelled_at: Optional[int] = None
+    request_counts: Optional[dict] = None
+    metadata: Optional[dict] = None
+
+
+def _migrate_deprecated_dp_rank(values: dict) -> dict:
+    if isinstance(values, dict) and values.get("data_parallel_rank") is not None:
+        import warnings
+
+        warnings.warn(
+            "'data_parallel_rank' is deprecated, use 'routed_dp_rank' instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        if values.get("routed_dp_rank") is None:
+            values["routed_dp_rank"] = values["data_parallel_rank"]
+    return values
+
+
+class CompletionRequest(BaseModel):
+    # Ordered by official OpenAI API documentation
+    # https://platform.openai.com/docs/api-reference/completions/create
+    model: str = Field(
+        default=DEFAULT_MODEL_NAME,
+        description="Model name. Supports LoRA adapters via 'base-model:adapter-name' syntax.",
+    )
+    prompt: Union[List[int], List[List[int]], str, List[str]]
+    best_of: Optional[int] = None
+    echo: bool = False
+    frequency_penalty: float = 0.0
+    logit_bias: Optional[Dict[str, float]] = None
+    logprobs: Optional[int] = None
+    max_tokens: int = 16
+    n: int = 1
+    presence_penalty: float = 0.0
+    seed: Optional[int] = None
+    stop: Optional[Union[str, List[str]]] = None
+    stream: bool = False
+    stream_options: Optional[StreamOptions] = None
+    suffix: Optional[str] = None
+    temperature: float = 1.0
+    top_p: float = 1.0
+    user: Optional[str] = None
+    return_hidden_states: bool = False
+    return_routed_experts: bool = False
+    return_cached_tokens_details: bool = False
+
+    # Extra parameters for SRT backend only and will be ignored by OpenAI models.
+    top_k: int = -1
+    min_p: float = 0.0
+    min_tokens: int = 0
+    json_schema: Optional[str] = None
+    regex: Optional[str] = None
+    ebnf: Optional[str] = None
+    repetition_penalty: float = 1.0
+    stop_token_ids: Optional[List[int]] = None
+    stop_regex: Optional[Union[str, List[str]]] = None
+    no_stop_trim: bool = False
+    ignore_eos: bool = False
+    skip_special_tokens: bool = True
+    lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
+    session_params: Optional[Dict] = None
+    response_format: Optional[Union[ResponseFormat, StructuralTagResponseFormat]] = None
+    custom_params: Optional[Dict] = None
+    custom_logit_processor: Optional[str] = None
+
+    # For PD disaggregation
+    bootstrap_host: Optional[Union[List[str], str]] = None
+    bootstrap_port: Optional[Union[List[Optional[int]], int]] = None
+    bootstrap_room: Optional[Union[List[int], int]] = None
+
+    # For DP routing — external router assigns a specific DP worker
+    routed_dp_rank: Optional[int] = None
+    # For PD disagg — hint telling decode which prefill DP worker has the KV cache
+    disagg_prefill_dp_rank: Optional[int] = None
+    # Deprecated: use routed_dp_rank instead
+    data_parallel_rank: Optional[int] = None
+
+    # For request id
+    rid: Optional[Union[List[str], str]] = None
+    # Extra key for classifying the request (e.g. cache_salt)
+    extra_key: Optional[Union[List[str], str]] = None
+    # Cache salt for request caching
+    cache_salt: Optional[Union[List[str], str]] = None
+    # Priority for the request
+    priority: Optional[int] = None
+
+    # For custom metric labels
+    custom_labels: Optional[Dict[str, str]] = None
+
+    @model_validator(mode="before")
+    @classmethod
+    def _handle_deprecated_dp_rank(cls, values):
+        return _migrate_deprecated_dp_rank(values)
+
+    @field_validator("max_tokens")
+    @classmethod
+    def validate_max_tokens_positive(cls, v):
+        if v is not None and v <= 0:
+            raise ValueError("max_tokens must be positive")
+        return v
+
+
+class SglExt(BaseModel):
+    """SGLang extension fields for OpenAI-compatible responses.
+
+    Future SGLang-specific extensions to OpenAI-compatible response objects
+    should be added as fields here rather than directly on the choice object.
+    """
+
+    routed_experts: Optional[str] = None
+    cached_tokens_details: Optional[CachedTokensDetails] = None
+
+    @model_serializer(mode="wrap")
+    def _serialize(self, handler):
+        data = handler(self)
+        # Remove None fields to keep response clean
+        return {k: v for k, v in data.items() if v is not None}
+
+
+class CompletionResponseChoice(BaseModel):
+    index: int
+    text: str
+    logprobs: Optional[LogProbs] = None
+    finish_reason: Optional[Literal["stop", "length", "content_filter", "abort"]] = None
+    matched_stop: Union[None, int, str] = None
+    hidden_states: Optional[object] = None
+
+    @model_serializer(mode="wrap")
+    def _serialize(self, handler):
+        data = handler(self)
+        if self.hidden_states is None:
+            data.pop("hidden_states", None)
+        return data
+
+
+class CompletionResponse(BaseModel):
+    id: str
+    object: str = "text_completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: List[CompletionResponseChoice]
+    usage: UsageInfo
+    metadata: Optional[Dict[str, Any]] = None
+    sglext: Optional[SglExt] = None
+
+    @model_serializer(mode="wrap")
+    def _serialize(self, handler):
+        data = handler(self)
+        if self.sglext is None:
+            data.pop("sglext", None)
+        return data
+
+
+class CompletionResponseStreamChoice(BaseModel):
+    index: int
+    text: str
+    logprobs: Optional[LogProbs] = None
+    finish_reason: Optional[Literal["stop", "length", "content_filter", "abort"]] = None
+    matched_stop: Union[None, int, str] = None
+    hidden_states: Optional[object] = None
+
+    @model_serializer(mode="wrap")
+    def _serialize(self, handler):
+        data = handler(self)
+        if self.hidden_states is None:
+            data.pop("hidden_states", None)
+        return data
+
+
+class CompletionStreamResponse(BaseModel):
+    id: str
+    object: str = "text_completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: List[CompletionResponseStreamChoice]
+    usage: Optional[UsageInfo] = None
+    sglext: Optional[SglExt] = None
+
+    @model_serializer(mode="wrap")
+    def _serialize(self, handler):
+        data = handler(self)
+        if self.sglext is None:
+            data.pop("sglext", None)
+        return data
+
+
+class ChatCompletionMessageContentTextPart(BaseModel):
+    type: Literal["text"]
+    text: str
+
+
+class ChatCompletionMessageContentImageURL(BaseModel):
+    url: str
+    detail: Optional[Literal["auto", "low", "high"]] = "auto"
+    max_dynamic_patch: Optional[int] = None
+    min_dynamic_patch: Optional[int] = None
+
+
+class ChatCompletionMessageContentVideoURL(BaseModel):
+    url: str
+    max_dynamic_patch: Optional[int] = None
+    min_dynamic_patch: Optional[int] = None
+
+
+class ChatCompletionMessageContentAudioURL(BaseModel):
+    url: str
+
+
+class ChatCompletionMessageContentImagePart(BaseModel):
+    type: Literal["image_url"]
+    image_url: ChatCompletionMessageContentImageURL
+    modalities: Optional[Literal["image", "multi-images", "video"]] = "image"
+
+
+class ChatCompletionMessageContentVideoPart(BaseModel):
+    type: Literal["video_url"]
+    video_url: ChatCompletionMessageContentVideoURL
+
+
+class ChatCompletionMessageContentAudioPart(BaseModel):
+    type: Literal["audio_url"]
+    audio_url: ChatCompletionMessageContentAudioURL
+
+
+ChatCompletionMessageContentPart = Union[
+    ChatCompletionMessageContentTextPart,
+    ChatCompletionMessageContentImagePart,
+    ChatCompletionMessageContentVideoPart,
+    ChatCompletionMessageContentAudioPart,
+]
+
+# Rerank content types for multimodal reranking (e.g., Qwen3-VL-Reranker)
+# Can be a simple string (text-only) or a list of multimodal content parts
+RerankContentPart = Union[
+    ChatCompletionMessageContentTextPart,
+    ChatCompletionMessageContentImagePart,
+    ChatCompletionMessageContentVideoPart,
+]
+RerankContent = Union[str, List[RerankContentPart]]
+
+
+class FunctionResponse(BaseModel):
+    """Function response."""
+
+    name: Optional[str] = None
+    arguments: Optional[str | Dict[str, Any]] = None
+
+
+class ToolCall(BaseModel):
+    """Tool call response."""
+
+    id: Optional[str] = None
+    index: Optional[int] = None
+    type: Literal["function"] = "function"
+    function: FunctionResponse
+
+
+class ChatCompletionMessageGenericParam(BaseModel):
+    role: Literal["system", "assistant", "tool", "function", "developer"]
+    content: Union[str, List[ChatCompletionMessageContentPart], None] = Field(
+        default=None
+    )
+    tool_call_id: Optional[str] = None
+    name: Optional[str] = None
+    reasoning_content: Optional[str] = None
+    tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])
+    tools: Optional[List[Tool]] = Field(default=None, examples=[None])
+
+    @field_validator("role", mode="before")
+    @classmethod
+    def _normalize_role(cls, v):
+        if isinstance(v, str):
+            v_lower = v.lower()
+            if v_lower not in {"system", "assistant", "tool", "function", "developer"}:
+                raise ValueError(
+                    "'role' must be one of 'system', 'developer', 'assistant', 'tool', or 'function' (case-insensitive)."
+                )
+            return v_lower
+        raise ValueError("'role' must be a string")
+
+
+class ChatCompletionMessageUserParam(BaseModel):
+    role: Literal["user"]
+    content: Union[str, List[ChatCompletionMessageContentPart]]
+
+
+ChatCompletionMessageParam = Union[
+    ChatCompletionMessageGenericParam, ChatCompletionMessageUserParam
+]
+
+
+class Function(BaseModel):
+    """Function descriptions."""
+
+    description: Optional[str] = Field(default=None, examples=[None])
+    name: str
+    parameters: Optional[object] = None
+    strict: bool = False
+
+
+class Tool(BaseModel):
+    """Function wrapper."""
+
+    type: str = Field(default="function", examples=["function"])
+    function: Function
+
+
+class ToolChoiceFuncName(BaseModel):
+    """The name of tool choice function."""
+
+    name: Optional[str] = None
+
+
+class ToolChoice(BaseModel):
+    """The tool choice definition."""
+
+    function: ToolChoiceFuncName
+    type: Literal["function"] = Field(default="function", examples=["function"])
+
+
+class ChatCompletionRequest(BaseModel):
+    # Ordered by official OpenAI API documentation
+    # https://platform.openai.com/docs/api-reference/chat/create
+    messages: List[ChatCompletionMessageParam]
+    model: str = Field(
+        default=DEFAULT_MODEL_NAME,
+        description="Model name. Supports LoRA adapters via 'base-model:adapter-name' syntax.",
+    )
+    frequency_penalty: float = 0.0
+    logit_bias: Optional[Dict[str, float]] = None
+    logprobs: bool = False
+    top_logprobs: Optional[int] = None
+    max_tokens: Optional[int] = Field(
+        default=None,
+        deprecated="max_tokens is deprecated in favor of the max_completion_tokens field",
+        description="The maximum number of tokens that can be generated in the chat completion. ",
+    )
+    max_completion_tokens: Optional[int] = Field(
+        default=None,
+        description="The maximum number of completion tokens for a chat completion request, "
+        "including visible output tokens and reasoning tokens. Input tokens are not included. ",
+    )
+    n: int = 1
+    presence_penalty: float = 0.0
+    response_format: Optional[Union[ResponseFormat, StructuralTagResponseFormat]] = None
+    seed: Optional[int] = None
+    stop: Optional[Union[str, List[str]]] = None
+    stream: bool = False
+    stream_options: Optional[StreamOptions] = None
+    temperature: Optional[float] = None
+    top_p: Optional[float] = None
+    user: Optional[str] = None
+    tools: Optional[List[Tool]] = Field(default=None, examples=[None])
+    tool_choice: Union[ToolChoice, Literal["auto", "required", "none"]] = Field(
+        default="auto", examples=["none"]
+    )  # noqa
+    return_hidden_states: bool = False
+    return_routed_experts: bool = False
+    return_cached_tokens_details: bool = False
+    reasoning_effort: Optional[Literal["low", "medium", "high"]] = Field(
+        default="medium",
+        description="Constrains effort on reasoning for reasoning models. "
+        "'low' is the least effort, 'high' is the most effort. Reducing reasoning effort can "
+        "result in faster responses and fewer tokens used on reasoning in a response. "
+        "Currently only supported for OpenAI models in the harmony path, i.e GPT-OSS models.",
+    )
+
+    # Extra parameters for SRT backend only and will be ignored by OpenAI models.
+    top_k: Optional[int] = None
+    min_p: Optional[float] = None
+    min_tokens: int = 0
+    regex: Optional[str] = None
+    ebnf: Optional[str] = None
+    repetition_penalty: Optional[float] = None
+    stop_token_ids: Optional[List[int]] = None
+    stop_regex: Optional[Union[str, List[str]]] = None
+    no_stop_trim: bool = False
+    ignore_eos: bool = False
+    continue_final_message: bool = False
+    skip_special_tokens: bool = True
+    lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
+    session_params: Optional[Dict] = None
+    separate_reasoning: bool = True
+    stream_reasoning: bool = True
+    chat_template_kwargs: Optional[Dict] = None
+
+    # SGLang multimodal tiling controls (extensions)
+    max_dynamic_patch: Optional[int] = None
+    min_dynamic_patch: Optional[int] = None
+
+    # Custom logit processor for advanced sampling control
+    custom_logit_processor: Optional[Union[List[Optional[str]], str]] = None
+    custom_params: Optional[Dict] = None
+
+    # For request id
+    rid: Optional[Union[List[str], str]] = None
+    # Extra key for classifying the request (e.g. cache_salt)
+    extra_key: Optional[Union[List[str], str]] = None
+    # Cache salt for request caching
+    cache_salt: Optional[Union[List[str], str]] = None
+    # Priority for the request
+    priority: Optional[int] = None
+
+    # For PD disaggregation
+    bootstrap_host: Optional[Union[List[str], str]] = None
+    bootstrap_port: Optional[Union[List[Optional[int]], int]] = None
+    bootstrap_room: Optional[Union[List[int], int]] = None
+
+    # For DP routing — external router assigns a specific DP worker
+    routed_dp_rank: Optional[int] = None
+    # For PD disagg — hint telling decode which prefill DP worker has the KV cache
+    disagg_prefill_dp_rank: Optional[int] = None
+    # Deprecated: use routed_dp_rank instead
+    data_parallel_rank: Optional[int] = None
+
+    # OpenAI/SGLang default sampling parameters
+    _DEFAULT_SAMPLING_PARAMS = {
+        "temperature": 1.0,
+        "top_p": 1.0,
+        "top_k": -1,
+        "min_p": 0.0,
+        "repetition_penalty": 1.0,
+    }
+
+    @model_validator(mode="before")
+    @classmethod
+    def _handle_deprecated_dp_rank(cls, values):
+        return _migrate_deprecated_dp_rank(values)
+
+    @model_validator(mode="before")
+    @classmethod
+    def set_tool_choice_default(cls, values):
+        if values.get("tool_choice") is None:
+            if values.get("tools") is None:
+                values["tool_choice"] = "none"
+            else:
+                values["tool_choice"] = "auto"
+        return values
+
+    @model_validator(mode="before")
+    @classmethod
+    def normalize_reasoning_inputs(cls, values: Dict):
+        r = values.get("reasoning")
+        if r is None:
+            return values
+
+        if isinstance(r, dict):
+            effort = r.get("effort") or r.get("reasoning_effort")
+            if effort in {"low", "medium", "high"}:
+                values["reasoning_effort"] = effort
+
+            enabled = (
+                r.get("enabled")
+                if r.get("enabled") is not None
+                else r.get("enable", False)
+            )
+            if isinstance(enabled, str):
+                enabled = enabled.strip().lower() in {"1", "true", "yes", "y", "on"}
+            if enabled:
+                ctk = values.get("chat_template_kwargs")
+                if not isinstance(ctk, dict):
+                    ctk = {}
+                ctk.setdefault("thinking", True)
+                values["chat_template_kwargs"] = ctk
+
+        return values
+
+    @model_validator(mode="before")
+    @classmethod
+    def set_json_schema(cls, values):
+        response_format = values.get("response_format")
+        if not response_format:
+            return values
+
+        if response_format.get("type") != "json_schema":
+            return values
+
+        schema = response_format.pop("schema", None)
+        json_schema = response_format.get("json_schema")
+
+        if json_schema:
+            return values
+
+        if schema:
+            name_ = schema.get("title", "Schema")
+            strict_ = False
+            if "properties" in schema and "strict" in schema["properties"]:
+                item = schema["properties"].pop("strict", None)
+                if item and item.get("default", False):
+                    strict_ = True
+
+            response_format["json_schema"] = {
+                "name": name_,
+                "schema": schema,
+                "strict": strict_,
+            }
+
+        return values
+
+    def to_sampling_params(
+        self,
+        stop: List[str],
+        model_generation_config: Dict[str, Any],
+        tool_call_constraint: Optional[ToolCallConstraint] = None,
+    ) -> Dict[str, Any]:
+        """
+        Convert request to sampling parameters.
+        Priority: user value > model generation_config > OpenAI defaults
+        """
+
+        def get_param(param_name: str):
+            value = getattr(self, param_name)
+            if value is None:
+                return model_generation_config.get(
+                    param_name, self._DEFAULT_SAMPLING_PARAMS[param_name]
+                )
+            return value
+
+        # add per user request
+        spaces_between_special_tokens = (
+            True
+            if self.chat_template_kwargs is None
+            else self.chat_template_kwargs.get("spaces_between_special_tokens", True)
+        )
+
+        sampling_params = {
+            "temperature": get_param("temperature"),
+            "max_new_tokens": self.max_completion_tokens or self.max_tokens,
+            "min_new_tokens": self.min_tokens,
+            "stop": stop,
+            "stop_token_ids": self.stop_token_ids,
+            "stop_regex": self.stop_regex,
+            "top_p": get_param("top_p"),
+            "top_k": get_param("top_k"),
+            "min_p": get_param("min_p"),
+            "presence_penalty": self.presence_penalty,
+            "frequency_penalty": self.frequency_penalty,
+            "repetition_penalty": get_param("repetition_penalty"),
+            "regex": self.regex,
+            "ebnf": self.ebnf,
+            "n": self.n,
+            "no_stop_trim": self.no_stop_trim,
+            "ignore_eos": self.ignore_eos,
+            "skip_special_tokens": self.skip_special_tokens,
+            "logit_bias": self.logit_bias,
+            "custom_params": self.custom_params,
+            "sampling_seed": self.seed,
+            "spaces_between_special_tokens": spaces_between_special_tokens,
+        }
+
+        if self.response_format and self.response_format.type == "json_schema":
+            sampling_params["json_schema"] = convert_json_schema_to_str(
+                self.response_format.json_schema.schema_
+            )
+        elif self.response_format and self.response_format.type == "json_object":
+            sampling_params["json_schema"] = '{"type": "object"}'
+        elif self.response_format and self.response_format.type == "structural_tag":
+            sampling_params["structural_tag"] = convert_json_schema_to_str(
+                self.response_format.model_dump(by_alias=True)
+            )
+
+        # Check if there are already existing output constraints
+        has_existing_constraints = (
+            sampling_params.get("regex")
+            or sampling_params.get("ebnf")
+            or sampling_params.get("structural_tag")
+            or sampling_params.get("json_schema")
+        )
+
+        if tool_call_constraint and has_existing_constraints:
+            logger.warning("Constrained decoding is not compatible with tool calls.")
+        elif tool_call_constraint:
+            constraint_type, constraint_value = tool_call_constraint
+            if constraint_type == "structural_tag":
+                sampling_params[constraint_type] = convert_json_schema_to_str(
+                    constraint_value.model_dump(by_alias=True)
+                )
+            elif constraint_type == "json_schema":
+                sampling_params[constraint_type] = convert_json_schema_to_str(
+                    constraint_value  # type: ignore
+                )
+            else:
+                sampling_params[constraint_type] = constraint_value
+
+        return sampling_params
+
+
+class ChatMessage(BaseModel):
+    role: Optional[str] = None
+    content: Optional[str] = None
+    reasoning_content: Optional[str] = None
+    tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])
+
+
+class ChatCompletionResponseChoice(BaseModel):
+    index: int
+    message: ChatMessage
+    logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
+    finish_reason: Optional[
+        Literal[
+            "stop", "length", "tool_calls", "content_filter", "function_call", "abort"
+        ]
+    ] = None
+    matched_stop: Union[None, int, str] = None
+    hidden_states: Optional[object] = None
+
+    @model_serializer(mode="wrap")
+    def _serialize(self, handler):
+        data = handler(self)
+        if self.hidden_states is None:
+            data.pop("hidden_states", None)
+        return data
+
+
+class ChatCompletionResponse(BaseModel):
+    id: str
+    object: str = "chat.completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: List[ChatCompletionResponseChoice]
+    usage: UsageInfo
+    metadata: Optional[Dict[str, Any]] = None
+    sglext: Optional[SglExt] = None
+
+    @model_serializer(mode="wrap")
+    def _serialize(self, handler):
+        data = handler(self)
+        if self.sglext is None:
+            data.pop("sglext", None)
+        return data
+
+
+class DeltaMessage(BaseModel):
+    role: Optional[str] = None
+    content: Optional[str] = None
+    reasoning_content: Optional[str] = None
+    tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])
+    hidden_states: Optional[object] = None
+
+    @model_serializer(mode="wrap")
+    def _serialize(self, handler):
+        data = handler(self)
+        if self.hidden_states is None:
+            data.pop("hidden_states", None)
+        return data
+
+
+class ChatCompletionResponseStreamChoice(BaseModel):
+    index: int
+    delta: DeltaMessage
+    logprobs: Optional[Union[LogProbs, ChoiceLogprobs]] = None
+    finish_reason: Optional[
+        Literal[
+            "stop", "length", "tool_calls", "content_filter", "function_call", "abort"
+        ]
+    ] = None
+    matched_stop: Union[None, int, str] = None
+
+
+class ChatCompletionStreamResponse(BaseModel):
+    id: str
+    object: str = "chat.completion.chunk"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: List[ChatCompletionResponseStreamChoice]
+    usage: Optional[UsageInfo] = None
+    sglext: Optional[SglExt] = None
+
+    @model_serializer(mode="wrap")
+    def _serialize(self, handler):
+        data = handler(self)
+        if self.sglext is None:
+            data.pop("sglext", None)
+        return data
+
+
+class MultimodalEmbeddingInput(BaseModel):
+    text: Optional[str] = None
+    image: Optional[str] = None
+    video: Optional[str] = None
+
+
+EmbeddingInput = Union[
+    List[int], List[List[int]], str, List[str], List[MultimodalEmbeddingInput]
+]
+
+
+class EmbeddingRequest(BaseModel):
+    # Ordered by official OpenAI API documentation
+    # https://platform.openai.com/docs/api-reference/embeddings/create
+    input: EmbeddingInput
+    model: str = DEFAULT_MODEL_NAME
+    encoding_format: str = "float"
+    dimensions: Optional[int] = None
+    user: Optional[str] = None
+
+    # The request id.
+    rid: Optional[Union[List[str], str]] = None
+    # Priority for the request
+    priority: Optional[int] = None
+    # LoRA adapter path(s)
+    lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
+
+
+class EmbeddingObject(BaseModel):
+    embedding: List[float]
+    index: int
+    object: str = "embedding"
+
+
+ClassifyInput = Union[str, List[str], List[int]]
+
+
+class ClassifyRequest(BaseModel):
+    # OpenAI-compatible classification request
+    model: str = DEFAULT_MODEL_NAME
+    input: ClassifyInput
+    user: Optional[str] = None
+
+    # The request id.
+    rid: Optional[Union[List[str], str]] = None
+    # Priority for the request
+    priority: Optional[int] = None
+
+
+class ClassifyData(BaseModel):
+    index: int
+    label: str
+    probs: List[float]
+    num_classes: int
+
+
+class ClassifyResponse(BaseModel):
+    id: str
+    object: str = "list"
+    created: int
+    model: str
+    data: List[ClassifyData]
+    usage: UsageInfo
+
+
+class EmbeddingResponse(BaseModel):
+    data: List[EmbeddingObject]
+    model: str
+    object: str = "list"
+    usage: Optional[UsageInfo] = None
+
+
+class ScoringRequest(BaseModel):
+    query: Optional[Union[str, List[int]]] = (
+        None  # Query text or pre-tokenized token IDs
+    )
+    items: Optional[Union[str, List[str], List[List[int]]]] = (
+        None  # Item text(s) or pre-tokenized token IDs
+    )
+    label_token_ids: Optional[List[int]] = (
+        None  # Token IDs to compute probabilities for
+    )
+    apply_softmax: bool = False
+    item_first: bool = False
+    model: str = DEFAULT_MODEL_NAME
+
+
+class ScoringResponse(BaseModel):
+    scores: List[
+        List[float]
+    ]  # List of lists of probabilities, each in the order of label_token_ids
+    model: str
+    usage: Optional[UsageInfo] = None
+    object: str = "scoring"
+
+
+class V1RerankReqInput(BaseModel):
+    query: RerankContent = Field(
+        ...,
+        description="The query to match against documents. Can be a string (text-only) "
+        "or a list of content parts for multimodal queries (text, image_url, video_url).",
+    )
+    documents: List[RerankContent] = Field(
+        ...,
+        description="List of documents to rank. Each document can be a string (text-only) "
+        "or a list of content parts for multimodal documents (text, image_url, video_url).",
+    )
+    instruct: Optional[str] = Field(
+        default=None,
+        description="The instruct to the reranker model.",
+    )
+    top_n: Optional[int] = Field(
+        default=None,
+        description="Maximum number of documents to return. Defaults to returning all documents. "
+        "If specified value is greater than the total number of documents, all documents will be returned.",
+    )
+    return_documents: bool = Field(
+        default=True,
+        description="Whether to return documents in the response. Only included when set to true.",
+    )
+
+    @field_validator("top_n")
+    @classmethod
+    def validate_top_n(cls, v):
+        if v is not None and v < 1:
+            raise ValueError("Value error, parameter top_n should be larger than 0.")
+        return v
+
+    def is_multimodal(self) -> bool:
+        """Check if the request contains any multimodal content."""
+        if isinstance(self.query, list):
+            return True
+        for doc in self.documents:
+            if isinstance(doc, list):
+                return True
+        return False
+
+
+class RerankResponse(BaseModel):
+    score: float
+    document: Optional[str] = None
+    index: int
+    meta_info: Optional[dict] = None
+
+    @model_serializer(mode="wrap")
+    def _serialize(self, handler):
+        data = handler(self)
+        # Exclude document field if it's None
+        if self.document is None:
+            data.pop("document", None)
+        return data
+
+
+class TokenizeRequest(BaseModel):
+    """Request schema for the /tokenize endpoint."""
+
+    model: str = DEFAULT_MODEL_NAME
+    prompt: Union[str, List[str]]
+    add_special_tokens: bool = Field(
+        default=True,
+        description="whether to add model-specific special tokens (e.g. BOS/EOS) during encoding.",
+    )
+
+
+class TokenizeResponse(BaseModel):
+    """Response schema for the /tokenize endpoint."""
+
+    tokens: Union[List[int], List[List[int]]]
+    count: Union[int, List[int]]
+    max_model_len: int
+
+
+class DetokenizeRequest(BaseModel):
+    """Request schema for the /detokenize endpoint."""
+
+    model: str = DEFAULT_MODEL_NAME
+    tokens: Union[List[int], List[List[int]]]
+    skip_special_tokens: bool = Field(
+        default=True,
+        description="whether to exclude special tokens (e.g. padding or EOS) during decoding.",
+    )
+
+
+class DetokenizeResponse(BaseModel):
+    """Response schema for the /detokenize endpoint."""
+
+    text: Union[str, List[str]]
+
+
+OpenAIServingRequest = Union[
+    ChatCompletionRequest,
+    CompletionRequest,
+    EmbeddingRequest,
+    ClassifyRequest,
+    ScoringRequest,
+    V1RerankReqInput,
+    TokenizeRequest,
+    DetokenizeRequest,
+]
+
+
+# Response API protocol definitions
+class ResponseReasoningParam(BaseModel):
+    """Reasoning parameters for responses."""
+
+    effort: Optional[Literal["low", "medium", "high"]] = Field(
+        default="medium",
+        description="Constrains effort on reasoning for reasoning models.",
+    )
+
+
+class ResponseTool(BaseModel):
+    """Tool definition for responses."""
+
+    type: Literal["web_search_preview", "code_interpreter"] = Field(
+        description="Type of tool to enable"
+    )
+
+
+ResponseInputOutputItem: TypeAlias = Union[
+    ResponseInputItemParam,
+    "ResponseReasoningItem",
+    ResponseFunctionToolCall,
+]
+
+
+class ResponsesRequest(BaseModel):
+    """Request body for v1/responses endpoint."""
+
+    # Core OpenAI API fields (ordered by official documentation)
+    background: Optional[bool] = False
+    include: Optional[
+        List[
+            Literal[
+                "code_interpreter_call.outputs",
+                "computer_call_output.output.image_url",
+                "file_search_call.results",
+                "message.input_image.image_url",
+                "message.output_text.logprobs",
+                "reasoning.encrypted_content",
+            ]
+        ]
+    ] = None
+    input: Union[str, List[ResponseInputOutputItem]]
+    instructions: Optional[str] = None
+    max_output_tokens: Optional[int] = None
+    max_tool_calls: Optional[int] = None
+    metadata: Optional[Dict[str, Any]] = None
+    model: Optional[str] = None  # Made optional to match vLLM
+    parallel_tool_calls: Optional[bool] = True
+    previous_response_id: Optional[str] = None
+    reasoning: Optional[ResponseReasoningParam] = None
+    service_tier: Literal["auto", "default", "flex", "scale", "priority"] = "auto"
+    store: Optional[bool] = True
+    stream: Optional[bool] = False
+    temperature: Optional[float] = None
+    tool_choice: Literal["auto", "required", "none"] = "auto"
+    tools: List[ResponseTool] = Field(default_factory=list)
+    top_logprobs: Optional[int] = 0
+    top_p: Optional[float] = None
+    truncation: Optional[Literal["auto", "disabled"]] = "disabled"
+    user: Optional[str] = None
+
+    # Extra SGLang parameters
+    request_id: str = Field(
+        default_factory=lambda: f"resp_{uuid.uuid4().hex}",
+        description="The request_id related to this request. If the caller does not set it, a random uuid will be generated.",
+    )
+    priority: int = Field(default=0, description="Request priority")
+    extra_key: Optional[str] = Field(
+        default=None,
+        description="Extra key for classifying the request (e.g. cache_salt)",
+    )
+    cache_salt: Optional[str] = Field(
+        default=None, description="Cache salt for request caching"
+    )
+
+    # SGLang-specific sampling parameters
+    frequency_penalty: float = 0.0
+    presence_penalty: float = 0.0
+    stop: Optional[Union[str, List[str]]] = None
+    top_k: int = -1
+    min_p: float = 0.0
+    repetition_penalty: float = 1.0
+
+    # Default sampling parameters
+    _DEFAULT_SAMPLING_PARAMS = {
+        "temperature": 0.7,
+        "top_p": 1.0,
+        "top_k": -1,
+        "min_p": 0.0,
+        "repetition_penalty": 1.0,
+    }
+
+    def to_sampling_params(
+        self, default_max_tokens: int, default_params: Optional[Dict] = None
+    ) -> Dict[str, Any]:
+        """Convert to sampling parameters for generation."""
+        if default_params is None:
+            default_params = {}
+
+        # Use max_output_tokens if available, otherwise use max_tokens for backwards compatibility
+        if self.max_output_tokens is not None:
+            max_tokens = min(self.max_output_tokens, default_max_tokens)
+        else:
+            max_tokens = default_max_tokens
+
+        # Avoid exceed the context length by minus 2 token
+        max_tokens -= 2
+
+        # Get parameters with defaults
+        temperature = self.temperature
+        if temperature is None:
+            temperature = default_params.get(
+                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+            )
+
+        top_p = self.top_p
+        if top_p is None:
+            top_p = default_params.get("top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
+
+        params = {
+            "max_new_tokens": max_tokens,
+            "temperature": temperature,
+            "top_p": top_p,
+            "frequency_penalty": self.frequency_penalty,
+            "presence_penalty": self.presence_penalty,
+            "stop": self.stop,
+            "top_k": self.top_k,
+            "min_p": self.min_p,
+            "repetition_penalty": self.repetition_penalty,
+        }
+
+        # Apply any additional default parameters
+        for key, value in default_params.items():
+            if key not in params or params[key] is None:
+                params[key] = value
+
+        return params
+
+
+class PromptTokenUsageInfo(BaseModel):
+    """Prompt token usage details."""
+
+    cached_tokens: int = 0
+
+
+class ResponsesResponse(BaseModel):
+    """Response body for v1/responses endpoint."""
+
+    id: str = Field(default_factory=lambda: f"resp_{time.time()}")
+    object: Literal["response"] = "response"
+    created_at: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+
+    output: List[
+        Union[ResponseOutputItem, ResponseReasoningItem, ResponseFunctionToolCall]
+    ] = Field(default_factory=list)
+    status: Literal["queued", "in_progress", "completed", "failed", "cancelled"]
+    usage: Optional[UsageInfo] = None
+    parallel_tool_calls: bool = True
+    tool_choice: str = "auto"
+    tools: List[ResponseTool] = Field(default_factory=list)
+
+    # OpenAI compatibility fields. not all are used at the moment.
+    # Recommend checking https://platform.openai.com/docs/api-reference/responses
+    error: Optional[dict] = None
+    incomplete_details: Optional[dict] = None  # TODO(v) support this input
+    instructions: Optional[str] = None
+    max_output_tokens: Optional[int] = None
+    previous_response_id: Optional[str] = None
+    reasoning: Optional[dict] = (
+        # Unused. No model supports this. For GPT-oss, system prompt sets
+        # the field, not server args.
+        None  # {"effort": Optional[str], "summary": Optional[str]}
+    )
+    store: Optional[bool] = None
+    temperature: Optional[float] = None
+    text: Optional[dict] = None  # e.g. {"format": {"type": "text"}}
+    top_p: Optional[float] = None
+    truncation: Optional[str] = None
+    user: Optional[str] = None
+    metadata: Optional[Dict[str, Any]] = None
+
+    @classmethod
+    def from_request(
+        cls,
+        request: ResponsesRequest,
+        sampling_params: Any,
+        model_name: str,
+        created_time: int,
+        output: List[
+            Union[ResponseOutputItem, ResponseReasoningItem, ResponseFunctionToolCall]
+        ],
+        status: str,
+        usage: Optional[UsageInfo],
+    ) -> "ResponsesResponse":
+        """Create a response from a request."""
+
+        # Determine if the output is plain text only to set text.format
+        def _is_text_only(
+            items: List[
+                Union[
+                    ResponseOutputItem, ResponseReasoningItem, ResponseFunctionToolCall
+                ]
+            ],
+        ) -> bool:
+            if not items:
+                return False
+            for it in items:
+                # tool call -> not pure text.
+                if isinstance(it, ResponseReasoningItem) or isinstance(
+                    it, ResponseFunctionToolCall
+                ):
+                    return False
+                try:
+                    if isinstance(it, ResponseOutputText):
+                        continue
+                    elif isinstance(it, ResponseOutputMessage):
+                        if not it.content:
+                            continue
+                        for c in it.content:
+                            if not isinstance(c, ResponseOutputText):
+                                return False
+                    else:
+                        # Unknown type, not considered text-only
+                        return False
+                except AttributeError:
+                    return False
+            return True
+
+        text_format = {"format": {"type": "text"}} if _is_text_only(output) else None
+
+        return cls(
+            id=request.request_id,
+            created_at=created_time,
+            model=model_name,
+            output=output,
+            status=status,
+            usage=usage,
+            parallel_tool_calls=request.parallel_tool_calls or True,
+            tool_choice=request.tool_choice,
+            tools=request.tools,
+            # fields for parity with v1/responses
+            error=None,
+            incomplete_details=None,
+            instructions=request.instructions,
+            max_output_tokens=request.max_output_tokens,
+            previous_response_id=request.previous_response_id,  # TODO(v): ensure this is propagated if retrieved from store
+            reasoning={
+                "effort": request.reasoning.effort if request.reasoning else None,
+                "summary": None,  # unused
+            },
+            store=request.store,
+            temperature=request.temperature,
+            text=text_format,  # TODO(v): Expand coverage per https://platform.openai.com/docs/api-reference/responses/list
+            top_p=request.top_p,
+            truncation=request.truncation,
+            user=request.user,
+            metadata=request.metadata or {},
+        )
+
+
+class RequestResponseMetadata(BaseModel):
+    """Metadata for request/response tracking."""
+
+    request_id: str
+    final_usage_info: Optional[UsageInfo] = None
+
+
+@dataclass
+class MessageProcessingResult:
+    """Result of processing chat messages and applying templates.
+
+    This dataclass encapsulates all the outputs from message processing including
+    prompt generation, multimodal data extraction, and constraint preparation.
+    Used internally by OpenAIServingChat to pass processed data between methods.
+
+    Args:
+        prompt: The final text prompt after applying chat template
+        prompt_ids: Either the text prompt (str) or tokenized IDs (List[int])
+        image_data: Extracted image data from messages, if any
+        audio_data: Extracted audio data from messages, if any
+        modalities: List of modality types present in the messages
+        stop: Combined stop strings from template and request
+        tool_call_constraint: Optional constraint for structured tool calls
+    """
+
+    prompt: str
+    prompt_ids: Union[str, List[int]]
+    image_data: Optional[Any]
+    audio_data: Optional[Any]
+    video_data: Optional[Any]
+    modalities: List[str]
+    stop: List[str]
+    tool_call_constraint: Optional[ToolCallConstraint] = None
+
+
+class ToolCallProcessingResult(NamedTuple):
+    """Result of processing tool calls in a response."""
+
+    tool_calls: Optional[
+        List[Any]
+    ]  # List of ToolCall objects or None if parsing failed
+    remaining_text: str  # Text remaining after parsing tool calls
+    finish_reason: Dict[str, Any]  # Updated finish reason dictionary
+
+
+class ResponseReasoningTextContent(BaseModel):
+    text: str
+    type: Literal["reasoning_text"] = "reasoning_text"
+
+
+ResponseInputOutputItem: TypeAlias = Union[
+    ResponseInputItemParam, "ResponseReasoningItem", ResponseFunctionToolCall
+]
+
+
+# ================== Transcription API Protocol Definitions ==================
+
+
+class TranscriptionRequest(BaseModel):
+    """Request model for audio transcription (OpenAI-compatible)."""
+
+    model: str = DEFAULT_MODEL_NAME
+    language: Optional[str] = None
+    response_format: str = "json"
+    temperature: float = 0.0
+    stream: bool = False
+    # Internal fields (not from API)
+    audio_data: Optional[bytes] = None
+    audio_duration_s: float = 0.0
+
+
+class TranscriptionUsage(BaseModel):
+    """Usage info for transcription response (duration-based)."""
+
+    type: Literal["duration"] = "duration"
+    seconds: int  # Audio duration in seconds (rounded up)
+
+
+class TranscriptionResponse(BaseModel):
+    """Non-streaming transcription response (OpenAI-compatible)."""
+
+    text: str
+    usage: Optional[TranscriptionUsage] = None
+
+
+class TranscriptionStreamChoice(BaseModel):
+    """Delta content for streaming transcription."""
+
+    delta: DeltaMessage
+    finish_reason: Optional[str] = None
+
+
+class TranscriptionStreamResponse(BaseModel):
+    """Streaming transcription chunk (OpenAI-compatible)."""
+
+    id: str = Field(default_factory=lambda: f"trsc-{uuid.uuid4().hex}")
+    object: Literal["transcription.chunk"] = "transcription.chunk"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: List[TranscriptionStreamChoice]
+    usage: Optional[UsageInfo] = None
diff --git a/sglang/python/sglang/srt/entrypoints/openai/serving_base.py b/sglang/python/sglang/srt/entrypoints/openai/serving_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c1d3c2866fb283477fbb84269c62d6a44d24649
--- /dev/null
+++ b/sglang/python/sglang/srt/entrypoints/openai/serving_base.py
@@ -0,0 +1,270 @@
+from __future__ import annotations
+
+import json
+import logging
+import uuid
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union
+
+import orjson
+from fastapi import HTTPException, Request
+from fastapi.responses import ORJSONResponse, StreamingResponse
+
+from sglang.srt.entrypoints.openai.encoding_dsv32 import DS32EncodingError
+from sglang.srt.entrypoints.openai.protocol import ErrorResponse, OpenAIServingRequest
+from sglang.srt.managers.io_struct import EmbeddingReqInput, GenerateReqInput
+from sglang.srt.observability.req_time_stats import monotonic_time
+from sglang.srt.server_args import ServerArgs
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
+logger = logging.getLogger(__name__)
+
+
+# Base class for specific endpoint handlers
+class OpenAIServingBase(ABC):
+    """Abstract base class for OpenAI endpoint handlers"""
+
+    def __init__(self, tokenizer_manager: TokenizerManager):
+        self.tokenizer_manager = tokenizer_manager
+        self.allowed_custom_labels = (
+            set(
+                self.tokenizer_manager.server_args.tokenizer_metrics_allowed_custom_labels
+            )
+            if isinstance(self.tokenizer_manager.server_args, ServerArgs)
+            and self.tokenizer_manager.server_args.tokenizer_metrics_allowed_custom_labels
+            else None
+        )
+
+    def _parse_model_parameter(self, model: str) -> Tuple[str, Optional[str]]:
+        """Parse 'base-model:adapter-name' syntax to extract LoRA adapter.
+
+        Returns (base_model, adapter_name) or (model, None) if no colon present.
+        """
+        if ":" not in model:
+            return model, None
+
+        # Split on first colon only to handle model paths with multiple colons
+        parts = model.split(":", 1)
+        base_model = parts[0].strip()
+        adapter_name = parts[1].strip() or None
+
+        return base_model, adapter_name
+
+    def _resolve_lora_path(
+        self,
+        request_model: str,
+        explicit_lora_path: Optional[Union[str, List[Optional[str]]]],
+    ) -> Optional[Union[str, List[Optional[str]]]]:
+        """Resolve LoRA adapter with priority: model parameter > explicit lora_path.
+
+        Returns adapter name or None. Supports both single values and lists (batches).
+        """
+        _, adapter_from_model = self._parse_model_parameter(request_model)
+
+        # Model parameter adapter takes precedence
+        if adapter_from_model is not None:
+            return adapter_from_model
+
+        # Fall back to explicit lora_path
+        return explicit_lora_path
+
+    async def handle_request(
+        self, request: OpenAIServingRequest, raw_request: Request
+    ) -> Union[Any, StreamingResponse, ErrorResponse]:
+        """Handle the specific request type with common pattern
+        If you want to override this method, you should be careful to record the validation time.
+        """
+        received_time = monotonic_time()
+
+        try:
+            # Validate request
+            error_msg = self._validate_request(request)
+            if error_msg:
+                return self.create_error_response(error_msg)
+
+            # Convert to internal format
+            adapted_request, processed_request = self._convert_to_internal_request(
+                request, raw_request
+            )
+
+            if isinstance(adapted_request, (GenerateReqInput, EmbeddingReqInput)):
+                # Only set timing fields if adapted_request supports them
+                adapted_request.received_time = received_time
+
+            # Note(Xinyuan): raw_request below is only used for detecting the connection of the client
+            if hasattr(request, "stream") and request.stream:
+                return await self._handle_streaming_request(
+                    adapted_request, processed_request, raw_request
+                )
+            else:
+                return await self._handle_non_streaming_request(
+                    adapted_request, processed_request, raw_request
+                )
+        except HTTPException as e:
+            return self.create_error_response(
+                message=e.detail, err_type=str(e.status_code), status_code=e.status_code
+            )
+        except ValueError as e:
+            return self.create_error_response(
+                message=str(e),
+                err_type="BadRequest",
+                status_code=400,
+            )
+        except DS32EncodingError as e:
+            logger.info(f"DS32EncodingError: {e}")
+            return self.create_error_response(
+                message=str(e),
+                err_type="BadRequest",
+                status_code=400,
+            )
+        except Exception as e:
+            logger.exception(f"Error in request: {e}")
+            return self.create_error_response(
+                message=f"Internal server error: {str(e)}",
+                err_type="InternalServerError",
+                status_code=500,
+            )
+
+    @abstractmethod
+    def _request_id_prefix(self) -> str:
+        """Generate request ID based on request type"""
+        pass
+
+    def _generate_request_id_base(self, request: OpenAIServingRequest) -> Optional[str]:
+        """Generate request ID based on request type"""
+        return None
+
+        # TODO(chang): the rid is used in io_strcut check and often violates `The rid should be a list` AssertionError
+        # Temporarily return None in this function until the rid logic is clear.
+        if rid := getattr(request, "rid", None):
+            return rid
+
+        return f"{self._request_id_prefix()}{uuid.uuid4().hex}"
+
+    def _compute_extra_key(self, request: OpenAIServingRequest) -> Optional[str]:
+        """Compute the final extra_key by concatenating cache_salt and extra_key if both are provided."""
+        parts = []
+        for key in ["cache_salt", "extra_key"]:
+            value = getattr(request, key, None)
+            if value:
+                if not isinstance(value, str):
+                    raise TypeError(
+                        f"Value of {key} must be a string, but got {type(value).__name__}"
+                    )
+                parts.append(value)
+        return "".join(parts) if parts else None
+
+    @abstractmethod
+    def _convert_to_internal_request(
+        self,
+        request: OpenAIServingRequest,
+        raw_request: Request = None,
+    ) -> tuple[GenerateReqInput, OpenAIServingRequest]:
+        """Convert OpenAI request to internal format"""
+        pass
+
+    async def _handle_streaming_request(
+        self,
+        adapted_request: GenerateReqInput,
+        request: OpenAIServingRequest,
+        raw_request: Request,
+    ) -> Union[StreamingResponse, ErrorResponse, ORJSONResponse]:
+        """Handle streaming request
+
+        Override this method in child classes that support streaming requests.
+        """
+        return self.create_error_response(
+            message=f"{self.__class__.__name__} does not support streaming requests",
+            err_type="NotImplementedError",
+            status_code=501,
+        )
+
+    async def _handle_non_streaming_request(
+        self,
+        adapted_request: GenerateReqInput,
+        request: OpenAIServingRequest,
+        raw_request: Request,
+    ) -> Union[Any, ErrorResponse, ORJSONResponse]:
+        """Handle non-streaming request
+
+        Override this method in child classes that support non-streaming requests.
+        """
+        return self.create_error_response(
+            message=f"{self.__class__.__name__} does not support non-streaming requests",
+            err_type="NotImplementedError",
+            status_code=501,
+        )
+
+    def _validate_request(self, _: OpenAIServingRequest) -> Optional[str]:
+        """Validate request"""
+        pass
+
+    def create_error_response(
+        self,
+        message: str,
+        err_type: str = "BadRequestError",
+        status_code: int = 400,
+        param: Optional[str] = None,
+    ) -> ORJSONResponse:
+        """Create an error response"""
+        # TODO: remove fastapi dependency in openai and move response handling to the entrypoint
+        error = ErrorResponse(
+            object="error",
+            message=message,
+            type=err_type,
+            param=param,
+            code=status_code,
+        )
+        return ORJSONResponse(content=error.model_dump(), status_code=status_code)
+
+    def create_streaming_error_response(
+        self,
+        message: str,
+        err_type: str = "BadRequestError",
+        status_code: int = 400,
+    ) -> str:
+        """Create a streaming error response"""
+        error = ErrorResponse(
+            object="error",
+            message=message,
+            type=err_type,
+            param=None,
+            code=status_code,
+        )
+        return json.dumps({"error": error.model_dump()})
+
+    def extract_custom_labels(self, raw_request):
+        if (
+            not self.allowed_custom_labels
+            or not self.tokenizer_manager.server_args.tokenizer_metrics_custom_labels_header
+        ):
+            return None
+
+        custom_labels = None
+        header = (
+            self.tokenizer_manager.server_args.tokenizer_metrics_custom_labels_header
+        )
+        try:
+            raw_labels = (
+                orjson.loads(raw_request.headers.get(header))
+                if raw_request and raw_request.headers.get(header)
+                else None
+            )
+        except json.JSONDecodeError as e:
+            logger.exception(f"Error in request: {e}")
+            raw_labels = None
+
+        if isinstance(raw_labels, dict):
+            custom_labels = {
+                label: value
+                for label, value in raw_labels.items()
+                if label in self.allowed_custom_labels
+            }
+        return custom_labels
+
+    def extract_routing_key(self, raw_request):
+        if raw_request is None:
+            return None
+        return raw_request.headers.get("x-smg-routing-key")
diff --git a/sglang/python/sglang/srt/entrypoints/openai/serving_chat.py b/sglang/python/sglang/srt/entrypoints/openai/serving_chat.py
new file mode 100644
index 0000000000000000000000000000000000000000..7913af172c3b868fa596dc2221ee756c5770d4db
--- /dev/null
+++ b/sglang/python/sglang/srt/entrypoints/openai/serving_chat.py
@@ -0,0 +1,1428 @@
+from __future__ import annotations
+
+import copy
+import json
+import logging
+import time
+import uuid
+from http import HTTPStatus
+from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Optional, Union
+
+import jinja2
+import orjson
+from fastapi import Request
+from fastapi.responses import ORJSONResponse, StreamingResponse
+from jsonschema import Draft202012Validator, SchemaError
+
+from sglang.srt.entrypoints.openai.encoding_dsv32 import encode_messages
+from sglang.srt.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    ChatCompletionResponse,
+    ChatCompletionResponseChoice,
+    ChatCompletionResponseStreamChoice,
+    ChatCompletionStreamResponse,
+    ChatCompletionTokenLogprob,
+    ChatMessage,
+    ChoiceLogprobs,
+    DeltaMessage,
+    ErrorResponse,
+    FunctionResponse,
+    LogProbs,
+    MessageProcessingResult,
+    SglExt,
+    ToolCall,
+    ToolCallProcessingResult,
+    ToolChoice,
+    TopLogprob,
+)
+from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
+from sglang.srt.entrypoints.openai.usage_processor import UsageProcessor
+from sglang.srt.entrypoints.openai.utils import (
+    process_cached_tokens_details_from_ret,
+    process_hidden_states_from_ret,
+    process_routed_experts_from_ret,
+    to_openai_style_logprobs,
+)
+from sglang.srt.function_call.core_types import ToolCallItem
+from sglang.srt.function_call.function_call_parser import FunctionCallParser
+from sglang.srt.function_call.json_array_parser import JsonArrayParser
+from sglang.srt.function_call.utils import get_json_schema_constraint
+from sglang.srt.managers.io_struct import GenerateReqInput
+from sglang.srt.parser.conversation import generate_chat_conv
+from sglang.srt.parser.jinja_template_utils import process_content_for_template_format
+from sglang.srt.parser.reasoning_parser import ReasoningParser
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
+logger = logging.getLogger(__name__)
+
+
+def _extract_max_dynamic_patch(request: ChatCompletionRequest):
+    img_vals = []
+    vid_vals = []
+    for msg in request.messages or []:
+        content = getattr(msg, "content", None)
+        if not isinstance(content, list):
+            continue
+        for part in content:
+            # pydantic object or dict type
+            if getattr(part, "type", None) == "image_url":
+                iu = getattr(part, "image_url", None)
+                mdp = getattr(iu, "max_dynamic_patch", None) if iu else None
+                if mdp is not None:
+                    img_vals.append(int(mdp))
+            elif getattr(part, "type", None) == "video_url":
+                vu = getattr(part, "video_url", None)
+                mdp = getattr(vu, "max_dynamic_patch", None) if vu else None
+                if mdp is not None:
+                    vid_vals.append(int(mdp))
+
+    # TODO(yuan-luo): per-item max_dynamic_patch for both image and video
+    img_max_dynamic_patch = min(img_vals) if img_vals else None
+    vid_max_dynamic_patch = min(vid_vals) if vid_vals else None
+    return img_max_dynamic_patch, vid_max_dynamic_patch
+
+
+class OpenAIServingChat(OpenAIServingBase):
+    """Handler for /v1/chat/completions requests"""
+
+    _default_sampling_params_logged = False
+
+    def __init__(
+        self,
+        tokenizer_manager: TokenizerManager,
+        template_manager: TemplateManager,
+    ):
+        super().__init__(tokenizer_manager)
+        self.template_manager = template_manager
+        self.tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
+        self.reasoning_parser = self.tokenizer_manager.server_args.reasoning_parser
+
+        # Get default sampling parameters from model's generation config
+        self.default_sampling_params = (
+            self.tokenizer_manager.model_config.get_default_sampling_params()
+        )
+        if (
+            self.default_sampling_params
+            and not OpenAIServingChat._default_sampling_params_logged
+        ):
+            logger.info(
+                f"Using default chat sampling params from model generation config: {self.default_sampling_params}",
+            )
+            OpenAIServingChat._default_sampling_params_logged = True
+
+        # Check if the model is a GPT-OSS model
+        self.is_gpt_oss = (
+            hasattr(self.tokenizer_manager.model_config, "hf_config")
+            and hasattr(self.tokenizer_manager.model_config.hf_config, "model_type")
+            and self.tokenizer_manager.model_config.hf_config.model_type == "gpt_oss"
+        )
+
+        self.use_dpsk_v32_encoding = self._use_dpsk_v32_encoding()
+
+    def _handle_last_assistant_message(
+        self,
+        messages: List[Dict[str, Any]],
+        request: ChatCompletionRequest,
+    ) -> tuple[List[Dict[str, Any]], Optional[str]]:
+        """
+        Handle continue_final_message feature: separate final assistant message.
+
+        If continue_final_message is enabled and the last message is from assistant,
+        extract its content and remove it from the message list.
+        If continue_final_message is False and the last message is from assistant,
+        convert it to a user message to ensure the last message is always from user.
+
+        Only processes text-based content (strings), ignoring multimodal content (lists).
+
+        Args:
+            messages: List of message dictionaries
+            request: ChatCompletionRequest with continue_final_message flag
+
+        Returns:
+            Tuple of (processed_messages, assistant_prefix)
+            - processed_messages: Messages with last assistant message handled appropriately
+            - assistant_prefix: Content of the last assistant message (string only), or None
+        """
+        assistant_prefix = None
+        if messages and messages[-1].get("role") == "assistant":
+            last_content = messages[-1].get("content")
+            # Only process string content, ignore multimodal content (lists)
+            if isinstance(last_content, str):
+                if request.continue_final_message:
+                    # Extract content and remove the assistant message
+                    assistant_prefix = last_content
+                    messages = messages[:-1]
+                else:
+                    # Convert the last assistant message to user message
+                    messages[-1] = {"role": "user", "content": last_content}
+        return messages, assistant_prefix
+
+    def _append_assistant_prefix_to_prompt_ids(
+        self, prompt_ids: List[int], assistant_prefix: str
+    ) -> List[int]:
+        """
+        Append assistant prefix to prompt_ids.
+
+        Args:
+            prompt_ids: Current prompt token IDs
+            assistant_prefix: Assistant message content to append
+
+        Returns:
+            Updated prompt_ids with assistant prefix appended
+        """
+        encoded = self.tokenizer_manager.tokenizer.encode(assistant_prefix)
+        if encoded and encoded[0] == self.tokenizer_manager.tokenizer.bos_token_id:
+            encoded = encoded[1:]
+        return prompt_ids + encoded
+
+    def _use_dpsk_v32_encoding(self) -> bool:
+        has_chat_template = (
+            self.tokenizer_manager.tokenizer is not None
+            and self.tokenizer_manager.tokenizer.chat_template is not None
+        )
+        architectures = self.tokenizer_manager.model_config.hf_config.architectures
+        is_dpsk_v32 = "DeepseekV3" in architectures[0] if architectures else False
+        return not has_chat_template and is_dpsk_v32
+
+    def _request_id_prefix(self) -> str:
+        return "chatcmpl-"
+
+    def _validate_request(self, request: ChatCompletionRequest) -> Optional[str]:
+        """Validate that the input is valid."""
+        if not request.messages:
+            return "Messages cannot be empty."
+
+        if (
+            isinstance(request.tool_choice, str)
+            and request.tool_choice.lower() == "required"
+            and not request.tools
+        ):
+            return "Tools cannot be empty if tool choice is set to required."
+
+        if request.tool_choice is not None and not isinstance(request.tool_choice, str):
+            if not request.tools:
+                return "Tools cannot be empty if tool choice is set to a specific tool."
+            tool_name = request.tool_choice.function.name
+            tool_exists = any(tool.function.name == tool_name for tool in request.tools)
+            if not tool_exists:
+                return f"Tool '{tool_name}' not found in tools list."
+
+        # Validate tool definitions
+        for i, tool in enumerate(request.tools or []):
+            if tool.function.parameters is None:
+                continue
+            try:
+                Draft202012Validator.check_schema(tool.function.parameters)
+            except SchemaError as e:
+                return f"Tool {i} function has invalid 'parameters' schema: {str(e)}"
+
+        max_output_tokens = request.max_completion_tokens or request.max_tokens
+        server_context_length = self.tokenizer_manager.server_args.context_length
+        if (
+            max_output_tokens
+            and server_context_length
+            and max_output_tokens > server_context_length
+        ) and not self.tokenizer_manager.server_args.allow_auto_truncate:
+            return (
+                f"max_completion_tokens is too large: {max_output_tokens}."
+                f"This model supports at most {server_context_length} completion tokens."
+            )
+
+        if request.response_format and request.response_format.type == "json_schema":
+            schema = getattr(request.response_format.json_schema, "schema_", None)
+            if schema is None:
+                return "schema_ is required for json_schema response format request."
+
+        return None
+
+    def _convert_to_internal_request(
+        self,
+        request: ChatCompletionRequest,
+        raw_request: Request = None,
+    ) -> tuple[GenerateReqInput, ChatCompletionRequest]:
+        reasoning_effort = (
+            request.chat_template_kwargs.pop("reasoning_effort", None)
+            if request.chat_template_kwargs
+            else None
+        )
+        if reasoning_effort is not None:
+            request.reasoning_effort = reasoning_effort
+
+        """Convert OpenAI chat completion request to internal format"""
+        is_multimodal = self.tokenizer_manager.model_config.is_multimodal
+
+        # Process messages and apply chat template
+        processed_messages = self._process_messages(request, is_multimodal)
+
+        # Build sampling parameters
+        sampling_params = request.to_sampling_params(
+            stop=processed_messages.stop,
+            model_generation_config=self.default_sampling_params,
+            tool_call_constraint=processed_messages.tool_call_constraint,
+        )
+
+        # Handle single vs multiple requests
+        if is_multimodal:
+            prompt_kwargs = {"text": processed_messages.prompt}
+        else:
+            if isinstance(processed_messages.prompt_ids, str):
+                prompt_kwargs = {"text": processed_messages.prompt_ids}
+            else:
+                prompt_kwargs = {"input_ids": processed_messages.prompt_ids}
+
+        # Extract custom labels from raw request headers
+        custom_labels = self.extract_custom_labels(raw_request)
+
+        # Resolve LoRA adapter from model parameter or explicit lora_path
+        lora_path = self._resolve_lora_path(request.model, request.lora_path)
+        img_max_dynamic_patch, vid_max_dynamic_patch = _extract_max_dynamic_patch(
+            request
+        )
+        adapted_request = GenerateReqInput(
+            **prompt_kwargs,
+            image_data=processed_messages.image_data,
+            video_data=processed_messages.video_data,
+            audio_data=processed_messages.audio_data,
+            sampling_params=sampling_params,
+            return_logprob=request.logprobs,
+            logprob_start_len=-1,
+            top_logprobs_num=request.top_logprobs or 0,
+            stream=request.stream,
+            return_text_in_logprobs=True,
+            modalities=processed_messages.modalities,
+            lora_path=lora_path,
+            bootstrap_host=request.bootstrap_host,
+            bootstrap_port=request.bootstrap_port,
+            bootstrap_room=request.bootstrap_room,
+            routed_dp_rank=request.routed_dp_rank,
+            disagg_prefill_dp_rank=request.disagg_prefill_dp_rank,
+            return_hidden_states=request.return_hidden_states,
+            return_routed_experts=request.return_routed_experts,
+            rid=request.rid,
+            extra_key=self._compute_extra_key(request),
+            require_reasoning=self._get_reasoning_from_request(request),
+            priority=request.priority,
+            routing_key=self.extract_routing_key(raw_request),
+            custom_labels=custom_labels,
+            custom_logit_processor=request.custom_logit_processor,
+            image_max_dynamic_patch=img_max_dynamic_patch,
+            video_max_dynamic_patch=vid_max_dynamic_patch,
+            max_dynamic_patch=getattr(request, "max_dynamic_patch", None),
+        )
+
+        return adapted_request, request
+
+    def _process_messages(
+        self, request: ChatCompletionRequest, is_multimodal: bool
+    ) -> MessageProcessingResult:
+        """Process chat messages and apply chat template"""
+        # GptOss model needs to keep special tokens for harmony parsing
+        if self.is_gpt_oss:
+            request.skip_special_tokens = False
+
+        tool_call_constraint = None
+
+        # Apply chat template and its stop strings
+        tools = None
+        if request.tools and request.tool_choice != "none":
+            request.skip_special_tokens = False
+            if not isinstance(request.tool_choice, str):
+                tools = [
+                    item.model_dump()
+                    for item in request.tools
+                    if item.function.name == request.tool_choice.function.name
+                ]
+            else:
+                tools = [item.model_dump() for item in request.tools]
+            if self.tool_call_parser:
+                parser = FunctionCallParser(request.tools, self.tool_call_parser)
+                tool_call_constraint = parser.get_structure_constraint(
+                    request.tool_choice
+                )
+            # Handle JSON schema constraint directly for required or named tool choice
+            if request.tool_choice == "required" or isinstance(
+                request.tool_choice, ToolChoice
+            ):
+                json_schema = get_json_schema_constraint(
+                    request.tools, request.tool_choice
+                )
+                tool_call_constraint = ("json_schema", json_schema)
+
+        # Use chat template
+        if self.template_manager.chat_template_name is None:
+            result = self._apply_jinja_template(request, tools, is_multimodal)
+        else:
+            result = self._apply_conversation_template(request, is_multimodal)
+
+        result.tool_call_constraint = tool_call_constraint
+        return result
+
+    def _apply_jinja_template(
+        self,
+        request: ChatCompletionRequest,
+        tools: Optional[List[Dict]],
+        is_multimodal: bool,
+    ) -> MessageProcessingResult:
+        """Apply Jinja chat template"""
+        prompt = ""
+        prompt_ids = []
+        openai_compatible_messages = []
+        image_data = []
+        video_data = []
+        audio_data = []
+        modalities = []
+
+        template_content_format = self.template_manager.jinja_template_content_format
+
+        if self.use_dpsk_v32_encoding:
+            thinking_mode = (
+                "thinking"
+                if (request.chat_template_kwargs or {}).get("thinking")
+                else "chat"
+            )
+            messages = request.messages
+            messages = [msg.model_dump() for msg in messages]
+
+            for msg in messages:
+                if msg.get("content") is None:
+                    msg["content"] = ""
+                processed_msg = process_content_for_template_format(
+                    msg,
+                    template_content_format,
+                    image_data,
+                    video_data,
+                    audio_data,
+                    modalities,
+                    use_dpsk_v32_encoding=self.use_dpsk_v32_encoding,
+                )
+                msg.update(processed_msg)
+
+            # Handle continue_final_message: separate final assistant message
+            messages, assistant_prefix = self._handle_last_assistant_message(
+                messages, request
+            )
+
+            if messages[0]["role"] != "system":
+                # insert an empty system prompt to help render tool system prompt
+                messages.insert(0, {"role": "system", "content": ""})
+            if request.tools:
+                messages[0]["tools"] = [tool.model_dump() for tool in request.tools]
+            real_input = encode_messages(messages, thinking_mode=thinking_mode)
+            prompt_ids = self.tokenizer_manager.tokenizer.encode(real_input)
+
+            # Append assistant prefix if continue_final_message is enabled
+            if assistant_prefix:
+                prompt_ids = self._append_assistant_prefix_to_prompt_ids(
+                    prompt_ids, assistant_prefix
+                )
+        else:
+            for message in request.messages:
+                if message.content is None:
+                    message.content = ""
+                msg_dict = message.model_dump()
+
+                # Process content based on detected template format
+                processed_msg = process_content_for_template_format(
+                    msg_dict,
+                    template_content_format,
+                    image_data,
+                    video_data,
+                    audio_data,
+                    modalities,
+                )
+
+                # per the Transformers docs & maintainers, tool call arguments in
+                # assistant-role messages with tool_calls need to be dicts not JSON str -
+                # this is how tool-use chat templates will expect them moving forwards
+                # so, for messages that have tool_calls, parse the string (which we get
+                # from openAI format) to dict
+                if (
+                    processed_msg["role"] == "assistant"
+                    and "tool_calls" in processed_msg
+                    and isinstance(processed_msg["tool_calls"], list)
+                ):
+                    for item in processed_msg["tool_calls"]:
+                        if "arguments" in item["function"] and isinstance(
+                            item["function"]["arguments"], str
+                        ):
+                            item["function"]["arguments"] = orjson.loads(
+                                item["function"]["arguments"]
+                            )
+
+                openai_compatible_messages.append(processed_msg)
+
+            # Handle continue_final_message: separate final assistant message
+            openai_compatible_messages, assistant_prefix = (
+                self._handle_last_assistant_message(openai_compatible_messages, request)
+            )
+
+            try:
+                prompt_ids = self.tokenizer_manager.tokenizer.apply_chat_template(
+                    openai_compatible_messages,
+                    tokenize=True,
+                    add_generation_prompt=True,
+                    tools=tools,
+                    reasoning_effort=request.reasoning_effort,
+                    **(
+                        request.chat_template_kwargs
+                        if request.chat_template_kwargs
+                        else {}
+                    ),
+                    return_dict=False,
+                )
+            except Exception as e:
+                # If the first attempt fails, try with flat function-only format.
+                # Some templates (e.g. Mistral) expect tools without the OpenAI wrapper.
+                tools = (
+                    [t["function"] if "function" in t else t for t in tools]
+                    if tools
+                    else None
+                )
+                try:
+                    prompt_ids = self.tokenizer_manager.tokenizer.apply_chat_template(
+                        openai_compatible_messages,
+                        tokenize=True,
+                        add_generation_prompt=True,
+                        tools=tools,
+                        reasoning_effort=request.reasoning_effort,
+                        **(
+                            request.chat_template_kwargs
+                            if request.chat_template_kwargs
+                            else {}
+                        ),
+                        return_dict=False,
+                    )
+                except jinja2.TemplateError as template_error:
+                    # Template errors (e.g., from raise_exception in Jinja templates)
+                    # should be treated as client errors (400 BadRequest)
+                    raise ValueError(str(template_error)) from template_error
+
+            # Append assistant prefix if continue_final_message is enabled
+            if assistant_prefix:
+                prompt_ids = self._append_assistant_prefix_to_prompt_ids(
+                    prompt_ids, assistant_prefix
+                )
+
+            if is_multimodal:
+                prompt = self.tokenizer_manager.tokenizer.decode(prompt_ids)
+
+        stop = request.stop
+        image_data = image_data if image_data else None
+        audio_data = audio_data if audio_data else None
+        video_data = video_data if video_data else None
+        modalities = modalities if modalities else []
+        return MessageProcessingResult(
+            prompt=prompt,
+            prompt_ids=prompt_ids,
+            image_data=image_data,
+            video_data=video_data,
+            audio_data=audio_data,
+            modalities=modalities,
+            stop=stop,
+        )
+
+    def _apply_conversation_template(
+        self,
+        request: ChatCompletionRequest,
+        is_multimodal: bool,
+    ) -> MessageProcessingResult:
+        """Apply conversation template"""
+        prompt = ""
+        prompt_ids = []
+        conv = generate_chat_conv(request, self.template_manager.chat_template_name)
+
+        # If we should continue the final assistant message, adjust the conversation.
+        if (
+            request.continue_final_message
+            and request.messages
+            and request.messages[-1].role == "assistant"
+        ):
+            # Remove the auto-added blank assistant turn, if present.
+            if conv.messages and conv.messages[-1][1] is None:
+                conv.messages.pop()
+            # Rebuild the prompt from the conversation.
+            prompt = conv.get_prompt()
+            # Strip trailing stop tokens or separators that indicate end-of-assistant.
+            if isinstance(conv.stop_str, list):
+                for stop_token in conv.stop_str:
+                    if prompt.endswith(stop_token):
+                        prompt = prompt[: -len(stop_token)]
+            elif isinstance(conv.stop_str, str) and prompt.endswith(conv.stop_str):
+                prompt = prompt[: -len(conv.stop_str)]
+            if conv.sep and prompt.endswith(conv.sep):
+                prompt = prompt[: -len(conv.sep)]
+            if getattr(conv, "sep2", None) and prompt.endswith(conv.sep2):
+                prompt = prompt[: -len(conv.sep2)]
+        else:
+            prompt = conv.get_prompt()
+            if self._get_reasoning_from_request(
+                request
+            ) and self.reasoning_parser not in ["qwen3", "qwen3-thinking", "glm4"]:
+                # qwen3 and glm4 think internally without a leading <think> token
+                prompt += "<think>"  # Note(Xinyuan): hard code thinking token
+
+        image_data = conv.image_data if conv.image_data else None
+        video_data = conv.video_data if conv.video_data else None
+        audio_data = conv.audio_data if conv.audio_data else None
+        modalities = conv.modalities if conv.modalities else []
+        stop = copy.copy(conv.stop_str or [] if not request.ignore_eos else [])
+
+        if request.stop:
+            if isinstance(request.stop, str):
+                stop.append(request.stop)
+            else:
+                stop.extend(request.stop)
+
+        if not is_multimodal:
+            prompt_ids = self.tokenizer_manager.tokenizer.encode(prompt)
+
+        return MessageProcessingResult(
+            prompt=prompt,
+            prompt_ids=prompt_ids,
+            image_data=image_data,
+            video_data=video_data,
+            audio_data=audio_data,
+            modalities=modalities,
+            stop=stop,
+        )
+
+    async def _handle_streaming_request(
+        self,
+        adapted_request: GenerateReqInput,
+        request: ChatCompletionRequest,
+        raw_request: Request,
+    ) -> StreamingResponse:
+        """Handle streaming chat completion request"""
+        return StreamingResponse(
+            self._generate_chat_stream(adapted_request, request, raw_request),
+            media_type="text/event-stream",
+            background=self.tokenizer_manager.create_abort_task(adapted_request),
+        )
+
+    async def _generate_chat_stream(
+        self,
+        adapted_request: GenerateReqInput,
+        request: ChatCompletionRequest,
+        raw_request: Request,
+    ) -> AsyncGenerator[str, None]:
+        """Generate streaming chat completion response"""
+        # Parsers for tool calls and reasoning
+        parser_dict = {}
+        reasoning_parser_dict = {}
+
+        # State tracking for streaming
+        is_firsts = {}
+        stream_buffers = {}
+        n_prev_tokens = {}
+        has_tool_calls = {}
+        finish_reasons = {}
+
+        # Usage tracking
+        prompt_tokens = {}
+        completion_tokens = {}
+        cached_tokens = {}
+        hidden_states = {}
+        routed_experts = {}
+
+        try:
+            async for content in self.tokenizer_manager.generate_request(
+                adapted_request, raw_request
+            ):
+                index = content.get("index", 0)
+
+                prompt_tokens[index] = content["meta_info"].get("prompt_tokens", 0)
+                completion_tokens[index] = content["meta_info"].get(
+                    "completion_tokens", 0
+                )
+                cached_tokens[index] = content["meta_info"].get("cached_tokens", 0)
+                hidden_states[index] = content["meta_info"].get("hidden_states", None)
+                routed_experts[index] = content["meta_info"].get("routed_experts", None)
+
+                # Handle logprobs
+                finish_reason = content["meta_info"].get("finish_reason", None)
+                choice_logprobs = None
+                if request.logprobs:
+                    n_prev_token = n_prev_tokens.get(index, 0)
+                    total_output_logprobs = len(
+                        content["meta_info"]["output_token_logprobs"]
+                    )
+                    # When finish_reason is set and all logprobs have been sent,
+                    # any remaining text is just buffered text being flushed by the
+                    # detokenizer (it holds back text at word boundaries). Return None
+                    # for logprobs since no new tokens were generated for this text.
+                    if n_prev_token < total_output_logprobs or finish_reason is None:
+                        choice_logprobs = self._process_streaming_logprobs(
+                            content, n_prev_token
+                        )
+                    n_prev_tokens[index] = total_output_logprobs
+                finish_reason_type = finish_reason["type"] if finish_reason else None
+
+                # Track finish_reason for each index
+                if finish_reason_type:
+                    # If the abort is from scheduler.
+                    if finish_reason_type == "abort":
+                        code = finish_reason.get(
+                            "status_code", HTTPStatus.INTERNAL_SERVER_ERROR
+                        )
+                        error = self.create_streaming_error_response(
+                            finish_reason.get("message", "Generation aborted."),
+                            code.name,
+                            code.value,
+                        )
+                        yield f"data: {error}\n\n"
+                        break
+                    else:
+                        finish_reasons[index] = finish_reason
+
+                # First chunk with role
+                if is_firsts.get(index, True):
+                    is_firsts[index] = False
+                    delta = DeltaMessage(role="assistant", content="")
+                    choice_data = ChatCompletionResponseStreamChoice(
+                        index=index,
+                        delta=delta,
+                        finish_reason=None,
+                        logprobs=None,
+                    )
+                    chunk = ChatCompletionStreamResponse(
+                        id=content["meta_info"]["id"],
+                        created=int(time.time()),
+                        choices=[choice_data],
+                        model=request.model,
+                    )
+                    yield f"data: {chunk.model_dump_json()}\n\n"
+
+                stream_buffer = stream_buffers.get(index, "")
+                delta = content["text"][len(stream_buffer) :]
+                stream_buffers[index] = stream_buffer + delta
+
+                # Handle reasoning content
+                if self.reasoning_parser and request.separate_reasoning:
+                    reasoning_text, delta = self._process_reasoning_stream(
+                        index, delta, reasoning_parser_dict, content, request
+                    )
+                    if reasoning_text:
+                        choice_data = ChatCompletionResponseStreamChoice(
+                            index=index,
+                            delta=DeltaMessage(reasoning_content=reasoning_text),
+                            finish_reason=None,
+                        )
+                        chunk = ChatCompletionStreamResponse(
+                            id=content["meta_info"]["id"],
+                            created=int(time.time()),
+                            choices=[choice_data],
+                            model=request.model,
+                        )
+
+                        # Add usage stats if continuous_usage_stats is enabled
+                        if (
+                            request.stream_options
+                            and request.stream_options.continuous_usage_stats
+                        ):
+                            chunk.usage = UsageProcessor.calculate_token_usage(
+                                prompt_tokens=prompt_tokens.get(index, 0),
+                                completion_tokens=completion_tokens.get(index, 0),
+                            )
+
+                        yield f"data: {chunk.model_dump_json()}\n\n"
+
+                # Handle tool calls
+                if (
+                    request.tool_choice != "none"
+                    and request.tools
+                    and self.tool_call_parser
+                ):
+                    async for chunk in self._process_tool_call_stream(
+                        index,
+                        delta,
+                        parser_dict,
+                        content,
+                        request,
+                        has_tool_calls,
+                    ):
+                        if chunk:
+                            yield chunk
+
+                    # Send any remaining tool call arguments when generation finishes
+                    if finish_reason_type is not None and index in parser_dict:
+                        parser = parser_dict[index]
+                        remaining_chunk = self._check_for_unstreamed_tool_args(
+                            parser, content, request, index
+                        )
+                        if remaining_chunk:
+                            yield remaining_chunk
+
+                else:
+                    # Regular content
+                    if delta:
+                        choice_data = ChatCompletionResponseStreamChoice(
+                            index=index,
+                            delta=DeltaMessage(content=delta),
+                            finish_reason=None,
+                            matched_stop=None,
+                            logprobs=choice_logprobs,
+                        )
+                        chunk = ChatCompletionStreamResponse(
+                            id=content["meta_info"]["id"],
+                            created=int(time.time()),
+                            choices=[choice_data],
+                            model=request.model,
+                        )
+
+                        # Add usage stats if continuous_usage_stats is enabled
+                        if (
+                            request.stream_options
+                            and request.stream_options.continuous_usage_stats
+                        ):
+                            chunk.usage = UsageProcessor.calculate_token_usage(
+                                prompt_tokens=prompt_tokens.get(index, 0),
+                                completion_tokens=completion_tokens.get(index, 0),
+                            )
+
+                        yield f"data: {chunk.model_dump_json()}\n\n"
+
+            # Send finish_reason chunks for each index that completed
+            for idx, finish_reason_data in finish_reasons.items():
+                finish_reason_type = finish_reason_data["type"]
+
+                # Change finish_reason to "tool_calls" if we had tool calls and stopped naturally
+                final_finish_reason = finish_reason_type
+                if has_tool_calls.get(idx, False) and finish_reason_type == "stop":
+                    final_finish_reason = "tool_calls"
+
+                finish_reason_chunk = ChatCompletionStreamResponse(
+                    id=content["meta_info"][
+                        "id"
+                    ],  # NOTE: openai uses the same chatcmpl-id for all indices
+                    created=int(time.time()),
+                    choices=[
+                        ChatCompletionResponseStreamChoice(
+                            index=idx,
+                            delta=DeltaMessage(),
+                            finish_reason=final_finish_reason,
+                            matched_stop=(
+                                finish_reason_data["matched"]
+                                if "matched" in finish_reason_data
+                                else None
+                            ),
+                        )
+                    ],
+                    model=request.model,
+                    usage=None,
+                )
+                yield f"data: {finish_reason_chunk.model_dump_json()}\n\n"
+
+            # Send hidden states if requested
+            if request.return_hidden_states and hidden_states:
+                for index, choice_hidden_states in hidden_states.items():
+                    if choice_hidden_states:
+                        last_token_hidden_states = (
+                            choice_hidden_states[-1]
+                            if len(choice_hidden_states) > 1
+                            else []
+                        )
+                        hidden_states_chunk = ChatCompletionStreamResponse(
+                            id=content["meta_info"]["id"],
+                            created=int(time.time()),
+                            choices=[
+                                ChatCompletionResponseStreamChoice(
+                                    index=index,
+                                    delta=DeltaMessage(
+                                        hidden_states=last_token_hidden_states
+                                    ),
+                                    finish_reason=None,  # Hidden states don't need finish_reason
+                                )
+                            ],
+                            model=request.model,
+                        )
+                        yield f"data: {hidden_states_chunk.model_dump_json()}\n\n"
+
+            if request.return_routed_experts and routed_experts:
+                # Get first non-None routed_experts value
+                first_routed_experts = next(
+                    (v for v in routed_experts.values() if v is not None), None
+                )
+                if first_routed_experts is not None:
+                    routed_experts_chunk = ChatCompletionStreamResponse(
+                        id=content["meta_info"]["id"],
+                        created=int(time.time()),
+                        choices=[],  # sglext is at response level
+                        model=request.model,
+                        sglext=SglExt(routed_experts=first_routed_experts),
+                    )
+                    yield f"data: {routed_experts_chunk.model_dump_json()}\n\n"
+
+            # Additional usage chunk
+            if request.stream_options and request.stream_options.include_usage:
+                usage = UsageProcessor.calculate_streaming_usage(
+                    prompt_tokens,
+                    completion_tokens,
+                    cached_tokens,
+                    n_choices=request.n,
+                    enable_cache_report=self.tokenizer_manager.server_args.enable_cache_report,
+                )
+                usage_chunk = ChatCompletionStreamResponse(
+                    id=content["meta_info"]["id"],
+                    created=int(time.time()),
+                    choices=[],  # Empty choices array as per OpenAI spec
+                    model=request.model,
+                    usage=usage,
+                )
+                yield f"data: {usage_chunk.model_dump_json()}\n\n"
+
+        except ValueError as e:
+            error = self.create_streaming_error_response(str(e))
+            yield f"data: {error}\n\n"
+
+        yield "data: [DONE]\n\n"
+
+    async def _handle_non_streaming_request(
+        self,
+        adapted_request: GenerateReqInput,
+        request: ChatCompletionRequest,
+        raw_request: Request,
+    ) -> Union[ChatCompletionResponse, ErrorResponse, ORJSONResponse]:
+        """Handle non-streaming chat completion request"""
+        try:
+            ret = await self.tokenizer_manager.generate_request(
+                adapted_request, raw_request
+            ).__anext__()
+        except ValueError as e:
+            return self.create_error_response(str(e))
+
+        if not isinstance(ret, list):
+            ret = [ret]
+
+        response = self._build_chat_response(
+            request,
+            ret,
+            int(time.time()),
+        )
+
+        return response
+
+    def _build_chat_response(
+        self,
+        request: ChatCompletionRequest,
+        ret: List[Dict[str, Any]],
+        created: int,
+    ) -> Union[ChatCompletionResponse, ORJSONResponse]:
+        """Build chat completion response from generation results"""
+        choices = []
+
+        # Build sglext at response level (from first ret_item, as these are per-request)
+        first_ret = ret[0]
+        routed_experts = process_routed_experts_from_ret(first_ret, request)
+        cached_tokens_details = process_cached_tokens_details_from_ret(
+            first_ret, request
+        )
+        response_sglext = None
+        if routed_experts or cached_tokens_details:
+            response_sglext = SglExt(
+                routed_experts=routed_experts,
+                cached_tokens_details=cached_tokens_details,
+            )
+
+        for idx, ret_item in enumerate(ret):
+            # Process logprobs
+            choice_logprobs = None
+            if request.logprobs:
+                choice_logprobs = self._process_response_logprobs(ret_item)
+
+            # Handle hidden states
+            hidden_states = process_hidden_states_from_ret(ret_item, request)
+
+            finish_reason = ret_item["meta_info"]["finish_reason"]
+            text = ret_item["text"]
+
+            # Handle reasoning content
+            reasoning_text = None
+            reasoning_parser = self.reasoning_parser
+            if reasoning_parser and request.separate_reasoning:
+                is_force_reasoning = (
+                    self.template_manager.force_reasoning
+                    or self._get_reasoning_from_request(request)
+                )
+                try:
+                    parser = ReasoningParser(
+                        model_type=reasoning_parser,
+                        stream_reasoning=False,
+                        force_reasoning=is_force_reasoning,
+                        request=request,
+                    )
+                    reasoning_text, text = parser.parse_non_stream(text)
+                except Exception as e:
+                    logger.error(f"Reasoning parsing error: {e}")
+                    return self.create_error_response(
+                        "Failed to parse reasoning content",
+                        err_type="InternalServerError",
+                        status_code=500,
+                    )
+
+            # Handle tool calls
+            tool_calls = None
+            if (
+                request.tool_choice != "none"
+                and request.tools
+                and self.tool_call_parser
+            ):
+                history_tool_calls_cnt = self._get_history_tool_calls_cnt(request)
+                tool_calls, text, finish_reason = self._process_tool_calls(
+                    text,
+                    request.tools,
+                    finish_reason,
+                    request.tool_choice,
+                    history_tool_calls_cnt,
+                )
+
+            choice_data = ChatCompletionResponseChoice(
+                index=idx,
+                message=ChatMessage(
+                    role="assistant",
+                    content=text if text else None,
+                    tool_calls=tool_calls,
+                    reasoning_content=reasoning_text if reasoning_text else None,
+                ),
+                logprobs=choice_logprobs,
+                finish_reason=finish_reason["type"] if finish_reason else None,
+                matched_stop=(
+                    finish_reason["matched"]
+                    if finish_reason and "matched" in finish_reason
+                    else None
+                ),
+                hidden_states=hidden_states,
+            )
+            choices.append(choice_data)
+
+        # Calculate usage
+        usage = UsageProcessor.calculate_response_usage(
+            ret,
+            n_choices=request.n,
+            enable_cache_report=self.tokenizer_manager.server_args.enable_cache_report,
+        )
+
+        return ChatCompletionResponse(
+            id=ret[0]["meta_info"]["id"],
+            created=created,
+            model=request.model,
+            choices=choices,
+            usage=usage,
+            metadata={"weight_version": ret[0]["meta_info"]["weight_version"]},
+            sglext=response_sglext,
+        )
+
+    def _process_logprobs_tokens(
+        self, logprobs: LogProbs, use_token_index: bool = False
+    ) -> List[ChatCompletionTokenLogprob]:
+        """Common helper to process logprobs tokens for both streaming and non-streaming
+
+        Args:
+            logprobs: LogProbs data from model
+            use_token_index: True for non-streaming (use token_idx), False for streaming (use index 0)
+        """
+        token_logprobs = []
+
+        for token_idx, (token, logprob) in enumerate(
+            zip(logprobs.tokens, logprobs.token_logprobs)
+        ):
+            token_bytes = list(token.encode("utf-8"))
+            top_logprobs = []
+            if logprobs.top_logprobs:
+                # - Non-streaming (use_token_index=True): uses token_idx for full data
+                # - Streaming (use_token_index=False): uses index 0 for pre-sliced data
+                top_logprobs_idx = token_idx if use_token_index else 0
+                for top_token, top_logprob in logprobs.top_logprobs[
+                    top_logprobs_idx
+                ].items():
+                    top_token_bytes = list(top_token.encode("utf-8"))
+                    top_logprobs.append(
+                        TopLogprob(
+                            token=top_token,
+                            bytes=top_token_bytes,
+                            logprob=top_logprob,
+                        )
+                    )
+            token_logprobs.append(
+                ChatCompletionTokenLogprob(
+                    token=token,
+                    bytes=token_bytes,
+                    logprob=logprob,
+                    top_logprobs=top_logprobs,
+                )
+            )
+
+        return token_logprobs
+
+    def _process_response_logprobs(self, ret_item: Dict[str, Any]) -> ChoiceLogprobs:
+        """Process logprobs for non-streaming response"""
+        logprobs = to_openai_style_logprobs(
+            output_token_logprobs=ret_item["meta_info"]["output_token_logprobs"],
+            output_top_logprobs=ret_item["meta_info"].get("output_top_logprobs", None),
+        )
+
+        token_logprobs = self._process_logprobs_tokens(logprobs, use_token_index=True)
+        return ChoiceLogprobs(content=token_logprobs)
+
+    def _process_tool_call_id(
+        self,
+        call_item: ToolCallItem,
+        history_tool_calls_cnt: int,
+    ) -> str:
+        """Process for generating a new and unique `tool_call_id`"""
+        if self.tool_call_parser != "kimi_k2":
+            # A simple uuid is sufficient for all models except for Kimi-K2.
+            tool_call_id = f"call_{uuid.uuid4().hex[:24]}"
+            return tool_call_id
+        else:
+            # Align with Kimi-K2 format: functions.{name}:{index}
+            # Kimi-K2 allows multiple tool_calls in one message; SGLang sets call_item.tool_index to the *local* position inside that message.
+            # Therefore, the index must be corrected by using `history_tool_calls_cnt + call_item.tool_index` to ensure globally unique and properly ordered.
+            tool_call_id = f"functions.{call_item.name}:{history_tool_calls_cnt+call_item.tool_index}"
+            logger.debug(
+                f"Process tool call idx, parser: {self.tool_call_parser}, tool_call_id: {tool_call_id}, history_cnt: {history_tool_calls_cnt}"
+            )
+            return tool_call_id
+
+    def _process_tool_calls(
+        self,
+        text: str,
+        tools: List[Any],
+        finish_reason: Dict[str, Any],
+        tool_choice: Optional[Union[str, ToolChoice]] = None,
+        history_tool_calls_cnt: int = 0,
+    ) -> ToolCallProcessingResult:
+        """Process tool calls in the response"""
+
+        # Handle required or named tool choice
+        if tool_choice == "required" or (
+            isinstance(tool_choice, ToolChoice) and tool_choice.type == "function"
+        ):
+            # Set finish reason to tool_calls since we're processing tool calls
+            if finish_reason["type"] == "stop":
+                finish_reason["type"] = "tool_calls"
+                finish_reason["matched"] = None
+            try:
+                # For required tool choice, we expect a JSON array of tool calls
+                tool_call_data = orjson.loads(text)
+                tool_calls = []
+                for i, tool in enumerate(tool_call_data):
+                    # Create a ToolCallItem from the JSON data
+                    call_info = ToolCallItem(
+                        tool_index=i,  # Use the loop index as tool_index
+                        name=tool["name"],
+                        parameters=json.dumps(tool["parameters"], ensure_ascii=False),
+                    )
+                    tool_id = self._process_tool_call_id(
+                        call_info, history_tool_calls_cnt
+                    )
+                    tool_calls.append(
+                        ToolCall(
+                            id=tool_id,
+                            index=i,
+                            function=FunctionResponse(
+                                name=tool["name"],
+                                arguments=json.dumps(
+                                    tool["parameters"], ensure_ascii=False
+                                ),
+                            ),
+                        )
+                    )
+                return ToolCallProcessingResult(tool_calls, "", finish_reason)
+            except json.JSONDecodeError as e:
+                logger.error(f"Tool call parsing error: {e}")
+                return ToolCallProcessingResult(None, text, finish_reason)
+
+        # Use parser since output is not constrained by JSON schema
+        parser = FunctionCallParser(tools, self.tool_call_parser)
+        if parser.has_tool_call(text):
+            if finish_reason["type"] == "stop":
+                finish_reason["type"] = "tool_calls"
+                finish_reason["matched"] = None
+            try:
+                text, call_info_list = parser.parse_non_stream(text)
+                tool_calls = []
+                for call_info in call_info_list:
+                    tool_id = self._process_tool_call_id(
+                        call_info, history_tool_calls_cnt
+                    )
+                    tool_calls.append(
+                        ToolCall(
+                            id=tool_id,
+                            index=getattr(call_info, "tool_index", None),
+                            function=FunctionResponse(
+                                name=call_info.name, arguments=call_info.parameters
+                            ),
+                        )
+                    )
+                return ToolCallProcessingResult(tool_calls, text, finish_reason)
+            except Exception as e:
+                logger.error(f"Tool call parsing error: {e}")
+                # Return error but don't fail the whole request
+                return ToolCallProcessingResult(None, text, finish_reason)
+
+        return ToolCallProcessingResult(None, text, finish_reason)
+
+    def _process_streaming_logprobs(
+        self, content: Dict[str, Any], n_prev_token: int
+    ) -> ChoiceLogprobs:
+        """Process logprobs for streaming response"""
+        logprobs = to_openai_style_logprobs(
+            output_token_logprobs=content["meta_info"]["output_token_logprobs"][
+                n_prev_token:
+            ],
+            output_top_logprobs=content["meta_info"].get("output_top_logprobs", [])[
+                n_prev_token:
+            ],
+        )
+
+        token_logprobs = self._process_logprobs_tokens(logprobs, use_token_index=False)
+        return ChoiceLogprobs(content=token_logprobs)
+
+    def _process_reasoning_stream(
+        self,
+        index: int,
+        delta: str,
+        reasoning_parser_dict: Dict[int, ReasoningParser],
+        content: Dict[str, Any],
+        request: ChatCompletionRequest,
+    ) -> tuple[Optional[str], str]:
+        """Process reasoning content in streaming response"""
+        if index not in reasoning_parser_dict:
+            is_force_reasoning = (
+                self.template_manager.force_reasoning
+                or self._get_reasoning_from_request(request)
+            )
+            reasoning_parser_dict[index] = ReasoningParser(
+                self.reasoning_parser,
+                request.stream_reasoning,
+                is_force_reasoning,
+                request,
+            )
+        reasoning_parser = reasoning_parser_dict[index]
+        return reasoning_parser.parse_stream_chunk(delta)
+
+    def _get_history_tool_calls_cnt(self, request: ChatCompletionRequest) -> int:
+        """Counts the number of tool calls in the request's message history.
+
+        NOTE: This method is only useful for models that include self-increasing
+        history tool call idx in tool calls id, such as kimi-k2
+
+        Args:
+            request: The chat completion request object.
+
+        Returns:
+            The total number of tool calls in the history, or 0 if not applicable.
+        """
+        messages = getattr(request, "messages", [])
+        idx = 0
+        for msg in messages:
+            if msg.role == "assistant":
+                tool_calls = getattr(msg, "tool_calls", None)
+                idx += len(list(tool_calls)) if tool_calls is not None else 0  # noqa
+        return idx
+
+    def _get_reasoning_from_request(self, request: ChatCompletionRequest) -> bool:
+        """Judge whether the request needs reasoning"""
+        if not self.reasoning_parser:
+            return False
+        if self.reasoning_parser in ["deepseek-v3"]:
+            # Models that require explicit enable thinking (thinking=True)
+            return (
+                request.chat_template_kwargs is not None
+                and request.chat_template_kwargs.get("thinking") is True
+            )
+        if self.reasoning_parser in ["kimi_k2"]:
+            # Models that thinking by default, and can be disabled by setting thinking=False
+            return (
+                not request.chat_template_kwargs
+                or request.chat_template_kwargs.get("thinking") is not False
+            )
+        if self.reasoning_parser in ["qwen3", "glm45", "nemotron_3", "interns1"]:
+            # Models that thinking by default, and can be disabled by setting enable_thinking=False
+            return (
+                not request.chat_template_kwargs
+                or request.chat_template_kwargs.get("enable_thinking") is not False
+            )
+        return True  # default
+
+    async def _process_tool_call_stream(
+        self,
+        index: int,
+        delta: str,
+        parser_dict: Dict[int, FunctionCallParser],
+        content: Dict[str, Any],
+        request: ChatCompletionRequest,
+        has_tool_calls: Dict[int, bool],
+    ):
+        """Process tool calls in streaming response"""
+        if index not in parser_dict:
+            # Use JSON detector directly for required or named tool choice
+            if request.tool_choice == "required" or isinstance(
+                request.tool_choice, ToolChoice
+            ):
+                parser_dict[index] = JsonArrayParser()
+            else:
+                parser_dict[index] = FunctionCallParser(
+                    tools=request.tools,
+                    tool_call_parser=self.tool_call_parser,
+                )
+
+        parser = parser_dict[index]
+
+        # Handle both FunctionCallParser and JsonArrayParser
+        if isinstance(parser, JsonArrayParser):
+            result = parser.parse_streaming_increment(delta, request.tools)
+            normal_text, calls = result.normal_text, result.calls
+        else:
+            normal_text, calls = parser.parse_stream_chunk(delta)
+
+        # Yield normal text
+        if normal_text:
+            choice_data = ChatCompletionResponseStreamChoice(
+                index=index,
+                delta=DeltaMessage(content=normal_text),
+                finish_reason=None,
+            )
+            chunk = ChatCompletionStreamResponse(
+                id=content["meta_info"]["id"],
+                created=int(time.time()),
+                choices=[choice_data],
+                model=request.model,
+            )
+
+            # Add usage stats if continuous_usage_stats is enabled
+            if request.stream_options and request.stream_options.continuous_usage_stats:
+                prompt_tokens = content["meta_info"].get("prompt_tokens", 0)
+                completion_tokens = content["meta_info"].get("completion_tokens", 0)
+                chunk.usage = UsageProcessor.calculate_token_usage(
+                    prompt_tokens=prompt_tokens,
+                    completion_tokens=completion_tokens,
+                )
+
+            yield f"data: {chunk.model_dump_json()}\n\n"
+
+        # Yield tool calls
+        history_tool_calls_cnt = self._get_history_tool_calls_cnt(request)
+        for call_item in calls:
+            # Mark that this choice has tool calls
+            has_tool_calls[index] = True
+
+            # Tool call ID should be generated only once per tool call
+            if call_item.name:
+                # First chunk: include ID and function name
+                tool_call_id = self._process_tool_call_id(
+                    call_item, history_tool_calls_cnt
+                )
+                function_name = call_item.name
+            else:
+                # Subsequent chunks: null ID and name for argument deltas
+                tool_call_id = None
+                function_name = None
+
+            tool_call = ToolCall(
+                id=tool_call_id,
+                index=call_item.tool_index,
+                function=FunctionResponse(
+                    name=function_name,
+                    arguments=call_item.parameters,
+                ),
+            )
+
+            choice_data = ChatCompletionResponseStreamChoice(
+                index=index,
+                delta=DeltaMessage(tool_calls=[tool_call]),
+                finish_reason=None,
+            )
+            chunk = ChatCompletionStreamResponse(
+                id=content["meta_info"]["id"],
+                created=int(time.time()),
+                choices=[choice_data],
+                model=request.model,
+            )
+
+            # Add usage stats if continuous_usage_stats is enabled
+            if request.stream_options and request.stream_options.continuous_usage_stats:
+                prompt_tokens = content["meta_info"].get("prompt_tokens", 0)
+                completion_tokens = content["meta_info"].get("completion_tokens", 0)
+                chunk.usage = UsageProcessor.calculate_token_usage(
+                    prompt_tokens=prompt_tokens,
+                    completion_tokens=completion_tokens,
+                )
+
+            yield f"data: {chunk.model_dump_json()}\n\n"
+
+    def _check_for_unstreamed_tool_args(
+        self,
+        parser: Union[FunctionCallParser, JsonArrayParser],
+        content: Dict[str, Any],
+        request: ChatCompletionRequest,
+        index: int,
+    ) -> Optional[str]:
+        """
+        Check for any remaining tool call arguments that need to be streamed
+        when generation finishes. This ensures tool calls are properly completed
+        even if the model generates the final arguments in the last chunk.
+        """
+        # Get the detector - either from FunctionCallParser or directly if json detector
+        detector = parser.detector if hasattr(parser, "detector") else parser
+
+        # Only check if we have tool calls and the detector has tracked data
+        if (
+            not hasattr(detector, "prev_tool_call_arr")
+            or not detector.prev_tool_call_arr
+        ):
+            return None
+
+        if (
+            not hasattr(detector, "streamed_args_for_tool")
+            or not detector.streamed_args_for_tool
+        ):
+            return None
+
+        # Get the last tool call that was being processed
+        tool_index = len(detector.prev_tool_call_arr) - 1
+        if tool_index < 0 or tool_index >= len(detector.streamed_args_for_tool):
+            return None
+
+        # Get expected vs actual arguments
+        expected_args = detector.prev_tool_call_arr[tool_index].get("arguments", {})
+        expected_call = json.dumps(expected_args, ensure_ascii=False)
+        actual_call = detector.streamed_args_for_tool[tool_index]
+
+        # Check if there are remaining arguments to send
+        remaining_call = (
+            expected_call.replace(actual_call, "", 1)
+            if actual_call in expected_call
+            else ""
+        )
+
+        if remaining_call:
+            # Create tool call chunk with remaining arguments
+            tool_call = ToolCall(
+                id=None,  # No ID for argument deltas
+                index=tool_index,
+                function=FunctionResponse(
+                    name=None,  # No name for argument deltas
+                    arguments=remaining_call,
+                ),
+            )
+
+            choice_data = ChatCompletionResponseStreamChoice(
+                index=index,
+                delta=DeltaMessage(tool_calls=[tool_call]),
+                finish_reason=None,  # Don't send finish_reason with this chunk
+            )
+
+            chunk = ChatCompletionStreamResponse(
+                id=content["meta_info"]["id"],
+                created=int(time.time()),
+                choices=[choice_data],
+                model=request.model,
+            )
+
+            return f"data: {chunk.model_dump_json()}\n\n"
+
+        return None
diff --git a/sglang/python/sglang/srt/entrypoints/openai/serving_classify.py b/sglang/python/sglang/srt/entrypoints/openai/serving_classify.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b2a64abb24bb2fc0011aef25e65a4f5c64d7de1
--- /dev/null
+++ b/sglang/python/sglang/srt/entrypoints/openai/serving_classify.py
@@ -0,0 +1,204 @@
+from __future__ import annotations
+
+import logging
+import time
+import uuid
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+
+import torch
+import torch.nn.functional as F
+from fastapi import Request
+from fastapi.responses import ORJSONResponse
+
+from sglang.srt.entrypoints.openai.protocol import (
+    ClassifyRequest,
+    ClassifyResponse,
+    ErrorResponse,
+)
+from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
+from sglang.srt.managers.io_struct import EmbeddingReqInput
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
+logger = logging.getLogger(__name__)
+
+
+class OpenAIServingClassify(OpenAIServingBase):
+    """Handler for v1/classify requests"""
+
+    def __init__(
+        self,
+        tokenizer_manager: TokenizerManager,
+        template_manager: TemplateManager,
+    ):
+        super().__init__(tokenizer_manager)
+        self.template_manager = template_manager
+        self.id2label = self._get_id2label_mapping()
+        self.model_name = (
+            self.tokenizer_manager.served_model_name
+            if self.tokenizer_manager.served_model_name
+            else self.tokenizer_manager.server_args.model_path
+        )
+        if not self.id2label:
+            raise ValueError("id2label mapping is missing")
+
+    def _request_id_prefix(self) -> str:
+        return "classify-"
+
+    def _convert_to_internal_request(
+        self,
+        request: ClassifyRequest,
+        raw_request: Request = None,
+    ) -> tuple[EmbeddingReqInput, ClassifyRequest]:
+        """Convert OpenAI embedding request to internal format"""
+        prompt = request.input
+
+        if isinstance(prompt, str):
+            # Single string input
+            prompt_kwargs = {"text": prompt}
+        elif isinstance(prompt, list):
+            if len(prompt) > 0 and isinstance(prompt[0], str):
+                prompt_kwargs = {"text": prompt}
+            else:
+                # List of integers (token IDs) or empty list
+                prompt_kwargs = {"input_ids": prompt}
+        else:
+            # Other types (should not happen but handle gracefully)
+            prompt_kwargs = {"input_ids": prompt}
+
+        adapted_request = EmbeddingReqInput(
+            **prompt_kwargs,
+            rid=request.rid,
+            priority=request.priority,
+        )
+
+        return adapted_request, request
+
+    def _validate_request(self, request: ClassifyRequest) -> Optional[str]:
+        """Validate that the input is not empty or whitespace only."""
+        if not (input := request.input):
+            return "Input cannot be empty"
+
+        # Handle single string
+        if isinstance(input, str):
+            if not input.strip():
+                return "Input cannot be empty or whitespace only"
+            return None
+
+        # Handle list inputs
+        if isinstance(input, list):
+            # Check first element to determine type
+            first_item = input[0]
+
+            if isinstance(first_item, str):
+                # List of strings
+                for i, item in enumerate(input):
+                    if not isinstance(item, str):
+                        return f"All items in input list must be strings"
+                    if not item.strip():
+                        return f"Input at index {i} cannot be empty or whitespace only"
+            elif isinstance(first_item, int):
+                # List of integers (token IDs)
+                for i, item in enumerate(input):
+                    if not isinstance(item, int):
+                        return f"All items in input list must be integers"
+                    if item < 0:
+                        return f"Token ID at index {i} must be non-negative"
+        return None
+
+    def _get_id2label_mapping(self) -> Optional[Dict[int, str]]:
+        """Get id2label mapping from model config."""
+        try:
+            hf_config = self.tokenizer_manager.model_config.hf_config
+            # Check for id2label in hf_config
+            if hf_config.id2label:
+                return hf_config.id2label
+            # Check for num_labels and create default mapping if needed
+            if hasattr(hf_config, "num_labels") and hf_config.num_labels:
+                num_labels = hf_config.num_labels
+                # Create default mapping: {0: "LABEL_0", 1: "LABEL_1", ...}
+                return {i: f"LABEL_{i}" for i in range(num_labels)}
+
+        except Exception as e:
+            logger.warning(f"Failed to get id2label mapping: {e}")
+
+        return None
+
+    async def _handle_non_streaming_request(
+        self,
+        adapted_request: EmbeddingReqInput,
+        request: ClassifyRequest,
+        raw_request: Request,
+    ) -> Union[ClassifyResponse, ErrorResponse, ORJSONResponse]:
+        """Handle non-streaming classification request."""
+        # Generate request ID
+
+        try:
+            ret = await self.tokenizer_manager.generate_request(
+                adapted_request, raw_request
+            ).__anext__()
+        except ValueError as e:
+            return self.create_error_response(str(e))
+
+        if not isinstance(ret, list):
+            ret = [ret]
+
+        response = self._build_classify_response(ret)
+        return response
+
+    def _build_classify_response(self, ret: List[Dict[str, Any]]) -> ClassifyResponse:
+        request_id = f"{self._request_id_prefix()}{uuid.uuid4().hex}"
+        created_time = int(time.time())
+        classify_objects = []
+        prompt_tokens = 0
+        total_latency = 0.0
+
+        for i, item in enumerate(ret):
+            embedding = item.get("embedding", [])
+            meta_info = item.get("meta_info", {})
+
+            prompt_tokens += meta_info.get("prompt_tokens", 0)
+            total_latency += meta_info.get("e2e_latency", 0.0)
+
+            if embedding:
+                try:
+                    embedding_tensor = torch.tensor(embedding, dtype=torch.float32)
+                    probs = F.softmax(embedding_tensor, dim=0).tolist()
+
+                    predicted_class = torch.argmax(embedding_tensor).item()
+
+                    label = self.id2label[predicted_class]
+
+                except Exception as e:
+                    logger.error(f"Error processing embedding for item {i}: {e}")
+                    probs = [1.0]
+                    label = "Default"
+            else:
+                probs = [1.0]
+                label = "Default"
+
+            classify_obj = {
+                "index": i,
+                "label": label,
+                "probs": probs,
+                "num_classes": len(probs),
+            }
+            classify_objects.append(classify_obj)
+
+        response = {
+            "id": request_id,
+            "object": "list",
+            "created": created_time,
+            "model": self.model_name,
+            "data": classify_objects,
+            "usage": {
+                "prompt_tokens": prompt_tokens,
+                "total_tokens": prompt_tokens,
+                "completion_tokens": 0,
+                "prompt_tokens_details": None,
+            },
+        }
+
+        return ClassifyResponse(**response)
diff --git a/sglang/python/sglang/srt/entrypoints/openai/serving_completions.py b/sglang/python/sglang/srt/entrypoints/openai/serving_completions.py
new file mode 100644
index 0000000000000000000000000000000000000000..63b11eca4202f38bf53d927f91acb8f3e626efc4
--- /dev/null
+++ b/sglang/python/sglang/srt/entrypoints/openai/serving_completions.py
@@ -0,0 +1,552 @@
+from __future__ import annotations
+
+import logging
+import time
+from http import HTTPStatus
+from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Optional, Union
+
+from fastapi import Request
+from fastapi.responses import ORJSONResponse, StreamingResponse
+
+from sglang.srt.entrypoints.openai.protocol import (
+    CompletionRequest,
+    CompletionResponse,
+    CompletionResponseChoice,
+    CompletionResponseStreamChoice,
+    CompletionStreamResponse,
+    ErrorResponse,
+    SglExt,
+)
+from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
+from sglang.srt.entrypoints.openai.usage_processor import UsageProcessor
+from sglang.srt.entrypoints.openai.utils import (
+    process_cached_tokens_details_from_ret,
+    process_hidden_states_from_ret,
+    process_routed_experts_from_ret,
+    to_openai_style_logprobs,
+)
+from sglang.srt.managers.io_struct import GenerateReqInput
+from sglang.srt.parser.code_completion_parser import (
+    generate_completion_prompt_from_request,
+)
+from sglang.utils import convert_json_schema_to_str
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
+logger = logging.getLogger(__name__)
+
+
+class OpenAIServingCompletion(OpenAIServingBase):
+    """Handler for /v1/completion requests"""
+
+    def __init__(
+        self,
+        tokenizer_manager: TokenizerManager,
+        template_manager: TemplateManager,
+    ):
+        super().__init__(tokenizer_manager)
+        self.template_manager = template_manager
+
+    def _request_id_prefix(self) -> str:
+        return "cmpl-"
+
+    def _validate_request(self, request: CompletionRequest) -> Optional[str]:
+        """Validate that the input is valid."""
+        prompt = request.prompt
+        if not prompt or (isinstance(prompt, list) and all(not p for p in prompt)):
+            return "Prompt cannot be empty"
+
+        return None
+
+    def _convert_to_internal_request(
+        self,
+        request: CompletionRequest,
+        raw_request: Request = None,
+    ) -> tuple[GenerateReqInput, CompletionRequest]:
+        """Convert OpenAI completion request to internal format"""
+        # NOTE: with openai API, the prompt's logprobs are always not computed
+        if request.echo and request.logprobs:
+            logger.warning(
+                "Echo is not compatible with logprobs. "
+                "To compute logprobs of input prompt, please use the native /generate API."
+            )
+        # Process prompt
+        prompt = request.prompt
+        if self.template_manager.completion_template_name is not None:
+            prompt = generate_completion_prompt_from_request(request)
+
+        # Set logprob start length based on echo and logprobs
+        if request.echo and request.logprobs:
+            logprob_start_len = 0
+        else:
+            logprob_start_len = -1
+
+        # Build sampling parameters
+        sampling_params = self._build_sampling_params(request)
+
+        # Determine prompt format
+        if isinstance(prompt, str) or (
+            isinstance(prompt, list) and isinstance(prompt[0], str)
+        ):
+            prompt_kwargs = {"text": prompt}
+        else:
+            prompt_kwargs = {"input_ids": prompt}
+
+        # Extract custom labels from raw request headers
+        custom_labels = self.extract_custom_labels(raw_request)
+
+        # Resolve LoRA adapter from model parameter or explicit lora_path
+        lora_path = self._resolve_lora_path(request.model, request.lora_path)
+
+        adapted_request = GenerateReqInput(
+            **prompt_kwargs,
+            sampling_params=sampling_params,
+            return_logprob=request.logprobs is not None,
+            top_logprobs_num=request.logprobs if request.logprobs is not None else 0,
+            logprob_start_len=logprob_start_len,
+            return_text_in_logprobs=True,
+            stream=request.stream,
+            lora_path=lora_path,
+            bootstrap_host=request.bootstrap_host,
+            bootstrap_port=request.bootstrap_port,
+            bootstrap_room=request.bootstrap_room,
+            routed_dp_rank=request.routed_dp_rank,
+            disagg_prefill_dp_rank=request.disagg_prefill_dp_rank,
+            return_hidden_states=request.return_hidden_states,
+            return_routed_experts=request.return_routed_experts,
+            rid=request.rid,
+            extra_key=self._compute_extra_key(request),
+            priority=request.priority,
+            routing_key=self.extract_routing_key(raw_request),
+            custom_labels=custom_labels,
+            custom_logit_processor=request.custom_logit_processor,
+        )
+
+        return adapted_request, request
+
+    def _build_sampling_params(self, request: CompletionRequest) -> Dict[str, Any]:
+        """Build sampling parameters for the request"""
+        # Start with common parameters
+        sampling_params = {
+            "temperature": request.temperature,
+            "max_new_tokens": request.max_tokens,
+            "min_new_tokens": request.min_tokens,
+            "stop": request.stop,
+            "stop_token_ids": request.stop_token_ids,
+            "stop_regex": request.stop_regex,
+            "top_p": request.top_p,
+            "top_k": request.top_k,
+            "min_p": request.min_p,
+            "presence_penalty": request.presence_penalty,
+            "frequency_penalty": request.frequency_penalty,
+            "repetition_penalty": request.repetition_penalty,
+            "regex": request.regex,
+            "json_schema": request.json_schema,
+            "ebnf": request.ebnf,
+            "n": request.n,
+            "no_stop_trim": request.no_stop_trim,
+            "ignore_eos": request.ignore_eos,
+            "skip_special_tokens": request.skip_special_tokens,
+            "logit_bias": request.logit_bias,
+            "custom_params": request.custom_params,
+            "sampling_seed": request.seed,
+        }
+
+        # Handle response_format constraints
+        if request.response_format and request.response_format.type == "json_schema":
+            sampling_params["json_schema"] = convert_json_schema_to_str(
+                request.response_format.json_schema.schema_
+            )
+        elif request.response_format and request.response_format.type == "json_object":
+            sampling_params["json_schema"] = '{"type": "object"}'
+        elif (
+            request.response_format and request.response_format.type == "structural_tag"
+        ):
+            sampling_params["structural_tag"] = convert_json_schema_to_str(
+                request.response_format.model_dump(by_alias=True)
+            )
+
+        return sampling_params
+
+    async def _handle_streaming_request(
+        self,
+        adapted_request: GenerateReqInput,
+        request: CompletionRequest,
+        raw_request: Request,
+    ) -> StreamingResponse:
+        """Handle streaming completion request"""
+        return StreamingResponse(
+            self._generate_completion_stream(adapted_request, request, raw_request),
+            media_type="text/event-stream",
+            background=self.tokenizer_manager.create_abort_task(adapted_request),
+        )
+
+    async def _generate_completion_stream(
+        self,
+        adapted_request: GenerateReqInput,
+        request: CompletionRequest,
+        raw_request: Request,
+    ) -> AsyncGenerator[str, None]:
+        """Generate streaming completion response"""
+        created = int(time.time())
+
+        # State tracking for streaming
+        stream_buffers = {}
+        n_prev_tokens = {}
+
+        # Usage tracking
+        prompt_tokens = {}
+        completion_tokens = {}
+        cached_tokens = {}
+        hidden_states = {}
+        routed_experts = {}
+
+        try:
+            async for content in self.tokenizer_manager.generate_request(
+                adapted_request, raw_request
+            ):
+                index = content.get("index", 0)
+
+                text = content["text"]
+                prompt_tokens[index] = content["meta_info"].get("prompt_tokens", 0)
+                completion_tokens[index] = content["meta_info"].get(
+                    "completion_tokens", 0
+                )
+                cached_tokens[index] = content["meta_info"].get("cached_tokens", 0)
+                hidden_states[index] = content["meta_info"].get("hidden_states", None)
+                routed_experts[index] = content["meta_info"].get("routed_experts", None)
+
+                stream_buffer = stream_buffers.get(index, "")
+                # Handle echo for first chunk
+                if not stream_buffer:  # The first chunk
+                    if request.echo:
+                        echo_text = self._get_echo_text(request, index)
+                        text = echo_text + text
+
+                # Handle logprobs
+                logprobs = None
+                if request.logprobs is not None:
+                    # The first chunk and echo is enabled.
+                    if not stream_buffer and request.echo:
+                        input_token_logprobs = content["meta_info"][
+                            "input_token_logprobs"
+                        ]
+                        input_top_logprobs = content["meta_info"]["input_top_logprobs"]
+                    else:
+                        input_token_logprobs = None
+                        input_top_logprobs = None
+
+                    n_prev_token = n_prev_tokens.get(index, 0)
+                    total_output_logprobs = len(
+                        content["meta_info"]["output_token_logprobs"]
+                    )
+                    output_logprobs_slice = content["meta_info"][
+                        "output_token_logprobs"
+                    ][n_prev_token:]
+                    finish_reason_for_logprobs = content["meta_info"]["finish_reason"]
+
+                    # When finish_reason is set and all logprobs have been sent,
+                    # any remaining text is just buffered text being flushed by the
+                    # detokenizer (it holds back text at word boundaries). Return None
+                    # for logprobs since no new tokens were generated for this text.
+                    if (
+                        len(output_logprobs_slice) == 0
+                        and finish_reason_for_logprobs is not None
+                        and input_token_logprobs is None
+                    ):
+                        logprobs = None
+                    else:
+                        logprobs = to_openai_style_logprobs(
+                            input_token_logprobs=input_token_logprobs,
+                            input_top_logprobs=input_top_logprobs,
+                            output_token_logprobs=output_logprobs_slice,
+                            output_top_logprobs=content["meta_info"].get(
+                                "output_top_logprobs", []
+                            )[n_prev_token:],
+                        )
+                    n_prev_tokens[index] = total_output_logprobs
+
+                # Generate delta
+                delta = text[len(stream_buffer) :]
+                stream_buffers[index] = stream_buffer + delta
+                finish_reason = content["meta_info"].get("finish_reason", None)
+                finish_reason_type = finish_reason["type"] if finish_reason else None
+
+                # If the abort is from scheduler.
+                if finish_reason_type == "abort":
+                    code = finish_reason.get(
+                        "status_code", HTTPStatus.INTERNAL_SERVER_ERROR
+                    )
+                    error = self.create_streaming_error_response(
+                        finish_reason.get("message", "Generation aborted."),
+                        code.name,
+                        code.value,
+                    )
+                    yield f"data: {error}\n\n"
+                    break
+
+                choice_data = CompletionResponseStreamChoice(
+                    index=index,
+                    text=delta,
+                    logprobs=logprobs,
+                    finish_reason=finish_reason_type,
+                    matched_stop=(
+                        finish_reason["matched"]
+                        if finish_reason and "matched" in finish_reason
+                        else None
+                    ),
+                )
+                chunk = CompletionStreamResponse(
+                    id=content["meta_info"]["id"],
+                    created=created,
+                    object="text_completion",
+                    choices=[choice_data],
+                    model=request.model,
+                )
+
+                # Add usage stats if continuous_usage_stats is enabled
+                if (
+                    request.stream_options
+                    and request.stream_options.continuous_usage_stats
+                ):
+                    chunk.usage = UsageProcessor.calculate_token_usage(
+                        prompt_tokens=prompt_tokens.get(index, 0),
+                        completion_tokens=completion_tokens.get(index, 0),
+                    )
+
+                yield f"data: {chunk.model_dump_json()}\n\n"
+
+            if request.return_hidden_states and hidden_states:
+                for index, choice_hidden_states in hidden_states.items():
+                    if choice_hidden_states:
+                        last_token_hidden_states = (
+                            choice_hidden_states[-1]
+                            if len(choice_hidden_states) > 1
+                            else []
+                        )
+                        hidden_states_chunk = CompletionStreamResponse(
+                            id=content["meta_info"]["id"],
+                            created=created,
+                            object="text_completion",
+                            choices=[
+                                CompletionResponseStreamChoice(
+                                    index=index,
+                                    text="",
+                                    hidden_states=last_token_hidden_states,
+                                    finish_reason=None,
+                                )
+                            ],
+                            model=request.model,
+                        )
+                        yield f"data: {hidden_states_chunk.model_dump_json()}\n\n"
+
+            if request.return_routed_experts and routed_experts:
+                # Get first non-None routed_experts value
+                first_routed_experts = next(
+                    (v for v in routed_experts.values() if v is not None), None
+                )
+                if first_routed_experts is not None:
+                    routed_experts_chunk = CompletionStreamResponse(
+                        id=content["meta_info"]["id"],
+                        created=created,
+                        object="text_completion",
+                        choices=[],  # sglext is at response level
+                        model=request.model,
+                        sglext=SglExt(routed_experts=first_routed_experts),
+                    )
+                    yield f"data: {routed_experts_chunk.model_dump_json()}\n\n"
+
+            # Handle final usage chunk
+            if request.stream_options and request.stream_options.include_usage:
+                usage = UsageProcessor.calculate_streaming_usage(
+                    prompt_tokens,
+                    completion_tokens,
+                    cached_tokens,
+                    n_choices=request.n,
+                    enable_cache_report=self.tokenizer_manager.server_args.enable_cache_report,
+                )
+                final_usage_chunk = CompletionStreamResponse(
+                    id=content["meta_info"]["id"],
+                    created=created,
+                    choices=[],
+                    model=request.model,
+                    usage=usage,
+                )
+                final_usage_data = final_usage_chunk.model_dump_json(exclude_none=True)
+                yield f"data: {final_usage_data}\n\n"
+
+        except Exception as e:
+            error = self.create_streaming_error_response(str(e))
+            yield f"data: {error}\n\n"
+
+        yield "data: [DONE]\n\n"
+
+    async def _handle_non_streaming_request(
+        self,
+        adapted_request: GenerateReqInput,
+        request: CompletionRequest,
+        raw_request: Request,
+    ) -> Union[CompletionResponse, ErrorResponse, ORJSONResponse]:
+        """Handle non-streaming completion request"""
+        try:
+            generator = self.tokenizer_manager.generate_request(
+                adapted_request, raw_request
+            )
+            ret = await generator.__anext__()
+        except ValueError as e:
+            return self.create_error_response(str(e))
+
+        if not isinstance(ret, list):
+            ret = [ret]
+
+        response = self._build_completion_response(
+            request,
+            ret,
+            int(time.time()),
+        )
+
+        return response
+
+    def _build_completion_response(
+        self,
+        request: CompletionRequest,
+        ret: List[Dict[str, Any]],
+        created: int,
+    ) -> CompletionResponse:
+        """Build completion response from generation results"""
+        choices = []
+        echo = False
+
+        # Prepare echo prompts if needed
+        echo_prompts = []
+        if request.echo:
+            echo_prompts = self._prepare_echo_prompts(request)
+            echo = True
+
+        # Build sglext at response level (from first ret_item, as these are per-request)
+        first_ret = ret[0]
+        routed_experts = process_routed_experts_from_ret(first_ret, request)
+        cached_tokens_details = process_cached_tokens_details_from_ret(
+            first_ret, request
+        )
+        response_sglext = None
+        if routed_experts or cached_tokens_details:
+            response_sglext = SglExt(
+                routed_experts=routed_experts,
+                cached_tokens_details=cached_tokens_details,
+            )
+
+        for idx, ret_item in enumerate(ret):
+            text = ret_item["text"]
+
+            # Handle echo
+            if echo:
+                prompt_index = idx // request.n
+                text = echo_prompts[prompt_index] + text
+
+            # Handle logprobs
+            logprobs = None
+            if request.logprobs is not None:
+                if echo:
+                    input_token_logprobs = ret_item["meta_info"]["input_token_logprobs"]
+                    input_top_logprobs = ret_item["meta_info"]["input_top_logprobs"]
+                else:
+                    input_token_logprobs = None
+                    input_top_logprobs = None
+
+                logprobs = to_openai_style_logprobs(
+                    input_token_logprobs=input_token_logprobs,
+                    input_top_logprobs=input_top_logprobs,
+                    output_token_logprobs=ret_item["meta_info"].get(
+                        "output_token_logprobs", []
+                    ),
+                    output_top_logprobs=ret_item["meta_info"].get(
+                        "output_top_logprobs", []
+                    ),
+                )
+
+            # Handle hidden states
+            hidden_states = process_hidden_states_from_ret(ret_item, request)
+
+            finish_reason = ret_item["meta_info"]["finish_reason"]
+
+            choice_data = CompletionResponseChoice(
+                index=idx,
+                text=text,
+                logprobs=logprobs,
+                finish_reason=finish_reason["type"] if finish_reason else None,
+                matched_stop=(
+                    finish_reason["matched"]
+                    if finish_reason and "matched" in finish_reason
+                    else None
+                ),
+                hidden_states=hidden_states,
+            )
+            choices.append(choice_data)
+
+        # Calculate usage
+        cache_report = self.tokenizer_manager.server_args.enable_cache_report
+        usage = UsageProcessor.calculate_response_usage(
+            ret, n_choices=request.n, enable_cache_report=cache_report
+        )
+
+        return CompletionResponse(
+            id=ret[0]["meta_info"]["id"],
+            model=request.model,
+            created=created,
+            choices=choices,
+            usage=usage,
+            metadata={"weight_version": ret[0]["meta_info"]["weight_version"]},
+            sglext=response_sglext,
+        )
+
+    def _get_echo_text(self, request: CompletionRequest, index: int) -> str:
+        """Get echo text for streaming response"""
+        if isinstance(request.prompt, str):
+            # for the case of single str prompts
+            return request.prompt
+        elif isinstance(request.prompt, list):
+            if isinstance(request.prompt[0], str):
+                # for the case of multiple str prompts
+                return request.prompt[index // request.n]
+            elif isinstance(request.prompt[0], int):
+                # for the case of single token ids prompt
+                return self.tokenizer_manager.tokenizer.decode(
+                    request.prompt, skip_special_tokens=True
+                )
+            elif isinstance(request.prompt[0], list) and isinstance(
+                request.prompt[0][0], int
+            ):
+                # for the case of multiple token ids prompts
+                return self.tokenizer_manager.tokenizer.decode(
+                    request.prompt[index // request.n],
+                    skip_special_tokens=True,
+                )
+        return ""
+
+    def _prepare_echo_prompts(self, request: CompletionRequest) -> List[str]:
+        """Prepare echo prompts for non-streaming response"""
+        # TODO: handle the case prompt is token ids
+        if isinstance(request.prompt, list) and isinstance(request.prompt[0], str):
+            # for the case of multiple str prompts
+            return request.prompt
+        elif isinstance(request.prompt, list) and isinstance(request.prompt[0], list):
+            # for the case of multiple token ids prompts
+            return [
+                self.tokenizer_manager.tokenizer.decode(
+                    prompt, skip_special_tokens=True
+                )
+                for prompt in request.prompt
+            ]
+        elif isinstance(request.prompt, list) and isinstance(request.prompt[0], int):
+            # for the case of single token ids prompt
+            return [
+                self.tokenizer_manager.tokenizer.decode(
+                    request.prompt, skip_special_tokens=True
+                )
+            ]
+        else:
+            # for the case of single str prompt
+            return [request.prompt]
diff --git a/sglang/python/sglang/srt/entrypoints/openai/serving_embedding.py b/sglang/python/sglang/srt/entrypoints/openai/serving_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..7555cc476ebddca74165ac25b9d802fa077cf557
--- /dev/null
+++ b/sglang/python/sglang/srt/entrypoints/openai/serving_embedding.py
@@ -0,0 +1,186 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+
+from fastapi import Request
+from fastapi.responses import ORJSONResponse
+
+from sglang.srt.entrypoints.openai.protocol import (
+    EmbeddingObject,
+    EmbeddingRequest,
+    EmbeddingResponse,
+    ErrorResponse,
+    MultimodalEmbeddingInput,
+    UsageInfo,
+)
+from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
+from sglang.srt.managers.io_struct import EmbeddingReqInput
+from sglang.srt.parser.conversation import generate_embedding_convs
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
+
+class OpenAIServingEmbedding(OpenAIServingBase):
+    """Handler for v1/embeddings requests"""
+
+    def __init__(
+        self,
+        tokenizer_manager: TokenizerManager,
+        template_manager: TemplateManager,
+    ):
+        super().__init__(tokenizer_manager)
+        self.template_manager = template_manager
+
+    def _request_id_prefix(self) -> str:
+        return "embd-"
+
+    def _validate_request(self, request: EmbeddingRequest) -> Optional[str]:
+        """Validate that the input is not empty or whitespace only."""
+        if not (input := request.input):
+            return "Input cannot be empty"
+
+        # Handle single string
+        if isinstance(input, str):
+            if not input.strip():
+                return "Input cannot be empty or whitespace only"
+            return None
+
+        # Handle list inputs
+        if isinstance(input, list):
+            if len(input) == 0:
+                return "Input cannot be empty"
+
+            # Check first element to determine type
+            first_item = input[0]
+
+            if isinstance(first_item, str):
+                # List of strings
+                for i, item in enumerate(input):
+                    if not isinstance(item, str):
+                        return f"All items in input list must be strings"
+                    if not item.strip():
+                        return f"Input at index {i} cannot be empty or whitespace only"
+            elif isinstance(first_item, int):
+                # List of integers (token IDs)
+                for i, item in enumerate(input):
+                    if not isinstance(item, int):
+                        return f"All items in input list must be integers"
+                    if item < 0:
+                        return f"Token ID at index {i} must be non-negative"
+        return None
+
+    def _convert_to_internal_request(
+        self,
+        request: EmbeddingRequest,
+        raw_request: Request = None,
+    ) -> tuple[EmbeddingReqInput, EmbeddingRequest]:
+        """Convert OpenAI embedding request to internal format"""
+        prompt = request.input
+
+        if isinstance(prompt, str):
+            # Single string input
+            prompt_kwargs = {"text": prompt}
+        elif isinstance(prompt, list):
+            if len(prompt) > 0 and isinstance(prompt[0], str):
+                prompt_kwargs = {"text": prompt}
+            elif len(prompt) > 0 and isinstance(prompt[0], MultimodalEmbeddingInput):
+                # Handle multimodal embedding inputs
+                texts = []
+                images = []
+                videos = []
+                for item in prompt:
+                    # Use padding for text if None - this could be improved
+                    texts.append(item.text if item.text is not None else "padding")
+                    images.append(item.image if item.image is not None else None)
+                    videos.append(item.video if item.video is not None else None)
+
+                generate_prompts = []
+                # Check if we have a chat template for multimodal embeddings
+                if self.template_manager.chat_template_name is not None:
+                    convs = generate_embedding_convs(
+                        texts, images, videos, self.template_manager.chat_template_name
+                    )
+                    for conv in convs:
+                        generate_prompts.append(conv.get_prompt())
+                else:
+                    generate_prompts = texts
+
+                if len(generate_prompts) == 1:
+                    prompt_kwargs = {
+                        "text": generate_prompts[0],
+                        "image_data": images[0],
+                        "video_data": videos[0],
+                    }
+                else:
+                    prompt_kwargs = {
+                        "text": generate_prompts,
+                        "image_data": images,
+                        "video_data": videos,
+                    }
+            else:
+                # List of integers (token IDs) or empty list
+                prompt_kwargs = {"input_ids": prompt}
+        else:
+            # Other types (should not happen but handle gracefully)
+            prompt_kwargs = {"input_ids": prompt}
+
+        # Resolve LoRA adapter from model parameter or explicit lora_path
+        lora_path = self._resolve_lora_path(request.model, request.lora_path)
+
+        adapted_request = EmbeddingReqInput(
+            **prompt_kwargs,
+            rid=request.rid,
+            priority=request.priority,
+            routing_key=self.extract_routing_key(raw_request),
+            dimensions=request.dimensions,
+            lora_path=lora_path,
+        )
+
+        return adapted_request, request
+
+    async def _handle_non_streaming_request(
+        self,
+        adapted_request: EmbeddingReqInput,
+        request: EmbeddingRequest,
+        raw_request: Request,
+    ) -> Union[EmbeddingResponse, ErrorResponse, ORJSONResponse]:
+        """Handle the embedding request"""
+        try:
+            ret = await self.tokenizer_manager.generate_request(
+                adapted_request, raw_request
+            ).__anext__()
+        except ValueError as e:
+            return self.create_error_response(str(e))
+
+        if not isinstance(ret, list):
+            ret = [ret]
+
+        response = self._build_embedding_response(ret)
+        return response
+
+    def _build_embedding_response(self, ret: List[Dict[str, Any]]) -> EmbeddingResponse:
+        """Build the embedding response"""
+        embedding_objects = []
+        prompt_tokens = 0
+
+        for idx, ret_item in enumerate(ret):
+            embedding_objects.append(
+                EmbeddingObject(
+                    embedding=ret_item["embedding"],
+                    index=idx,
+                )
+            )
+            # Handle missing prompt_tokens gracefully
+            meta_info = ret_item.get("meta_info", {})
+            prompt_tokens += meta_info.get("prompt_tokens", 0)
+
+        return EmbeddingResponse(
+            data=embedding_objects,
+            model=self.tokenizer_manager.model_path,
+            usage=UsageInfo(
+                prompt_tokens=prompt_tokens,
+                total_tokens=prompt_tokens,
+            ),
+        )
diff --git a/sglang/python/sglang/srt/entrypoints/openai/serving_rerank.py b/sglang/python/sglang/srt/entrypoints/openai/serving_rerank.py
new file mode 100644
index 0000000000000000000000000000000000000000..853d44da069369deca309ae62db91f5d6ad8322d
--- /dev/null
+++ b/sglang/python/sglang/srt/entrypoints/openai/serving_rerank.py
@@ -0,0 +1,602 @@
+import logging
+from typing import Any, Dict, List, Optional, Union
+
+from fastapi import Request
+from fastapi.responses import ORJSONResponse
+
+from sglang.srt.entrypoints.openai.protocol import (
+    ChatCompletionMessageContentImagePart,
+    ChatCompletionMessageContentTextPart,
+    ChatCompletionMessageContentVideoPart,
+    ErrorResponse,
+    RerankContent,
+    RerankResponse,
+    V1RerankReqInput,
+)
+from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
+from sglang.srt.managers.io_struct import EmbeddingReqInput, GenerateReqInput
+
+logger = logging.getLogger(__name__)
+
+
+def _get_yes_no_token_ids(tokenizer) -> tuple[int, int]:
+    """Get token IDs for 'yes' and 'no' from the tokenizer.
+
+    Different model sizes may have different token IDs, so we look them up dynamically.
+    """
+    # Try to encode 'yes' and 'no' to get their token IDs
+    # The tokenizer should return a single token for these common words
+    try:
+        yes_tokens = tokenizer.encode("yes", add_special_tokens=False)
+        no_tokens = tokenizer.encode("no", add_special_tokens=False)
+
+        if len(yes_tokens) == 1 and len(no_tokens) == 1:
+            return yes_tokens[0], no_tokens[0]
+
+        # Fallback: try convert_tokens_to_ids
+        yes_id = tokenizer.convert_tokens_to_ids("yes")
+        no_id = tokenizer.convert_tokens_to_ids("no")
+        if yes_id is not None and no_id is not None:
+            return yes_id, no_id
+
+    except Exception as e:
+        logger.warning(f"Failed to get yes/no token IDs dynamically: {e}")
+
+    # Fallback to known Qwen3 token IDs (may not work for all model sizes)
+    logger.warning("Using fallback token IDs for yes/no (9693/2152)")
+    return 9693, 2152
+
+
+def _is_qwen3_reranker_template(chat_template: str) -> bool:
+    """Detect if the chat template is for Qwen3 text-only reranker."""
+    if not chat_template:
+        return False
+    t = chat_template.lower()
+    return ('answer can only be "yes" or "no"' in t) or (
+        "answer can only be" in t and '"yes"' in t and '"no"' in t
+    )
+
+
+def _is_qwen3_vl_reranker_template(chat_template: str) -> bool:
+    """Detect if the chat template is for Qwen3-VL multimodal reranker.
+
+    VL reranker templates use `query` and `document` as jinja variables
+    and include vision token placeholders for image/video support.
+    """
+    if not chat_template:
+        return False
+    t = chat_template.lower()
+    # Check for reranker phrase (yes/no judgment)
+    has_reranker_phrase = ('answer can only be "yes" or "no"' in t) or (
+        "answer can only be" in t and '"yes"' in t and '"no"' in t
+    )
+    # Check for vision token placeholders (unique to VL templates)
+    has_vision_tokens = "<|vision_start|>" in t or "<|image_pad|>" in t
+    return has_reranker_phrase and has_vision_tokens
+
+
+def _is_qwen3_vl_model(model_path: str) -> bool:
+    """Check if the model is a Qwen3-VL model based on model path."""
+    if not model_path:
+        return False
+    model_lower = model_path.lower()
+    return "qwen3-vl" in model_lower or "qwen3vl" in model_lower
+
+
+def _detect_rerank_backend(
+    *,
+    request: V1RerankReqInput,
+    chat_template: Optional[str],
+    model_path: str,
+) -> str:
+    """
+    Unify rerank routing decisions used by both `_convert_to_internal_request` and
+    `_handle_non_streaming_request`.
+
+    Returns:
+        "vl_decoder" | "text_decoder" | "cross_encoder"
+    """
+    is_multimodal = request.is_multimodal()
+    is_vl_model = _is_qwen3_vl_model(model_path)
+    is_vl_template = _is_qwen3_vl_reranker_template(chat_template)
+    is_text_template = _is_qwen3_reranker_template(chat_template)
+
+    # Prefer VL when template/model indicates VL, or request is multimodal with reranker template.
+    if is_vl_template or is_vl_model or (is_multimodal and is_text_template):
+        return "vl_decoder"
+    if is_text_template:
+        return "text_decoder"
+    return "cross_encoder"
+
+
+def _qwen3_rerank_score(p_yes: float, p_no: float) -> float:
+    denom = p_yes + p_no
+    if denom <= 0.0:
+        return 0.0
+    return p_yes / denom
+
+
+def _get_jinja_env():
+    try:
+        import jinja2  # Lazy import: server env should provide this dependency.
+    except ModuleNotFoundError as e:
+        raise ValueError(
+            "Rendering Qwen3 reranker prompts requires `jinja2`. "
+            "Please install it in your runtime environment (e.g., `pip install jinja2`)."
+        ) from e
+
+    return jinja2.Environment(
+        loader=jinja2.BaseLoader(),
+        autoescape=False,
+        undefined=jinja2.Undefined,
+    )
+
+
+def _render_jinja_chat_template(
+    chat_template: str,
+    *,
+    query: RerankContent,
+    document: RerankContent,
+    instruct: Optional[str],
+) -> str:
+    """Render a loaded Jinja chat template for Qwen3 reranker prompts (text-only)."""
+    env = _get_jinja_env()
+    template = env.from_string(chat_template)
+
+    # For text-only template, extract text content
+    query_text = query if isinstance(query, str) else _extract_text_from_content(query)
+    doc_text = (
+        document if isinstance(document, str) else _extract_text_from_content(document)
+    )
+
+    render_kwargs = {
+        "messages": [
+            {"role": "user", "content": query_text},
+            {"role": "user", "content": doc_text},
+        ]
+    }
+    # Only pass instruct when explicitly provided; template uses `default(...)`
+    # which works only when the variable is undefined (not None).
+    if instruct:
+        render_kwargs["instruct"] = instruct
+    return template.render(**render_kwargs)
+
+
+def _render_vl_jinja_template(
+    chat_template: str,
+    *,
+    query: List[Dict[str, Any]],
+    document: List[Dict[str, Any]],
+    instruct: Optional[str],
+) -> str:
+    """Render a loaded Jinja chat template for Qwen3-VL reranker prompts (multimodal).
+
+    The template expects `query` and `document` as lists of content parts,
+    where each part has a `type` field (text, image, video) and corresponding data.
+    """
+    env = _get_jinja_env()
+    template = env.from_string(chat_template)
+
+    render_kwargs = {
+        "query": query,
+        "document": document,
+    }
+    if instruct:
+        render_kwargs["instruct"] = instruct
+    return template.render(**render_kwargs)
+
+
+def _extract_text_from_content(content: RerankContent) -> str:
+    """Extract text from multimodal content."""
+    if isinstance(content, str):
+        return content
+    texts = []
+    for part in content:
+        if isinstance(part, ChatCompletionMessageContentTextPart):
+            texts.append(part.text)
+        elif isinstance(part, dict) and part.get("type") == "text":
+            texts.append(part.get("text", ""))
+    return " ".join(texts)
+
+
+class OpenAIServingRerank(OpenAIServingBase):
+    """Handler for /v1/rerank requests"""
+
+    def __init__(self, tokenizer_manager, template_manager=None):
+        super().__init__(tokenizer_manager)
+        # TemplateManager is optional; rerank uses tokenizer.chat_template today.
+        # Keeping this explicit makes the dependency clear and supports future extensions.
+        self.template_manager = template_manager
+
+        # Cache yes/no token IDs for Qwen3 reranker scoring
+        self._yes_token_id, self._no_token_id = _get_yes_no_token_ids(
+            tokenizer_manager.tokenizer
+        )
+
+    # NOTE: /v1/rerank is not an official OpenAI endpoint. This module may be moved
+    # to another module in the future.
+
+    def _request_id_prefix(self) -> str:
+        return "rerank-"
+
+    def _validate_request(self, request: V1RerankReqInput) -> Optional[str]:
+        """Validate rerank request format and content"""
+        if not request.query:
+            return "Query cannot be empty"
+
+        if isinstance(request.query, str):
+            if not request.query.strip():
+                return "Query cannot be empty or whitespace only"
+
+        if not request.documents:
+            return "Documents cannot be empty"
+
+        for doc in request.documents:
+            if not doc:
+                return "Each document must be a non-empty string"
+            if isinstance(doc, str) and not doc.strip():
+                return "Each document cannot be empty or whitespace only"
+
+        return None
+
+    def _convert_to_internal_request(
+        self,
+        request: V1RerankReqInput,
+        raw_request: Request = None,
+    ) -> tuple[Union[EmbeddingReqInput, V1RerankReqInput], V1RerankReqInput]:
+        """
+        Convert OpenAI rerank request to internal format.
+
+        - For Qwen3-VL reranker (multimodal decoder-only): keep the request.
+        - For Qwen3 reranker (text-only decoder-only): keep the request and score via
+          `tokenizer_manager.score_prompts(...)` in the handler.
+        - For cross-encoder rerank models: adapt into `EmbeddingReqInput` pairs.
+        """
+        chat_template = self.tokenizer_manager.tokenizer.chat_template
+        model_path = getattr(self.tokenizer_manager.model_config, "model_path", "")
+        backend = _detect_rerank_backend(
+            request=request,
+            chat_template=chat_template if isinstance(chat_template, str) else None,
+            model_path=model_path,
+        )
+        if backend in ("vl_decoder", "text_decoder"):
+            return request, request
+
+        # Cross-encoder rerank: Create pairs of [query, document] for each document.
+        # Note: Cross-encoder only supports text-only content
+        if request.is_multimodal():
+            # Extract text for cross-encoder (multimodal not supported)
+            query_text = _extract_text_from_content(request.query)
+            doc_texts = [_extract_text_from_content(doc) for doc in request.documents]
+            pairs = [[query_text, doc] for doc in doc_texts]
+        else:
+            pairs = [[request.query, doc] for doc in request.documents]
+
+        adapted_request = EmbeddingReqInput(text=pairs, is_cross_encoder_request=True)
+        return adapted_request, request
+
+    async def _handle_non_streaming_request(
+        self,
+        adapted_request: Union[EmbeddingReqInput, V1RerankReqInput],
+        request: V1RerankReqInput,
+        raw_request: Request,
+    ) -> Union[List[RerankResponse], ErrorResponse, ORJSONResponse]:
+        """Handle the rerank request"""
+        chat_template = getattr(self.tokenizer_manager.tokenizer, "chat_template", None)
+        model_path = getattr(self.tokenizer_manager.model_config, "model_path", "")
+        rerank_ret = await self._handle_rerank_paths(
+            request=request,
+            raw_request=raw_request,
+            chat_template=chat_template,
+            model_path=model_path,
+        )
+        if rerank_ret is not None:
+            return rerank_ret
+
+        # Default cross-encoder rerank path (existing behavior).
+        try:
+            if not isinstance(adapted_request, EmbeddingReqInput):
+                raise ValueError(
+                    "Invalid rerank request adaptation. "
+                    "If you are serving a decoder-only reranker (e.g., Qwen3-Reranker), "
+                    "please provide the corresponding --chat-template and launch without --is-embedding."
+                )
+            ret = await self.tokenizer_manager.generate_request(
+                adapted_request, raw_request
+            ).__anext__()
+        except ValueError as e:
+            return self.create_error_response(str(e))
+
+        if not isinstance(ret, list):
+            ret = [ret]
+
+        responses = self._build_rerank_response(ret, request)
+        return responses
+
+    async def _handle_rerank_paths(
+        self,
+        *,
+        request: V1RerankReqInput,
+        raw_request: Request,
+        chat_template: Optional[str],
+        model_path: str,
+    ) -> Optional[Union[List[RerankResponse], ErrorResponse, ORJSONResponse]]:
+        """
+        Handle decoder-only rerank paths (VL/text) and return a response if matched.
+
+        Returns None if the request should fall back to cross-encoder rerank.
+        """
+        backend = _detect_rerank_backend(
+            request=request,
+            chat_template=chat_template,
+            model_path=model_path,
+        )
+
+        # Qwen3-VL reranker path (decoder-only scoring with query/document template format)
+        if backend == "vl_decoder":
+            return await self._handle_vl_reranker_request(
+                request, raw_request, chat_template or ""
+            )
+
+        # Qwen3 text-only reranker path (decoder-only scoring).
+        if backend == "text_decoder":
+            return await self._handle_text_reranker_request(
+                request=request,
+                raw_request=raw_request,
+                chat_template=chat_template or "",
+            )
+
+        return None
+
+    async def _handle_text_reranker_request(
+        self,
+        *,
+        request: V1RerankReqInput,
+        raw_request: Request,
+        chat_template: str,
+    ) -> Union[List[RerankResponse], ErrorResponse]:
+        """Handle text-only decoder reranker request via score_prompts()."""
+        # Qwen3 reranker relies on decoder-only logprobs. If the server is launched
+        # with --is-embedding, model_config.is_generation is typically False and
+        # logprob scoring is not supported.
+        if not self.tokenizer_manager.model_config.is_generation:
+            return self.create_error_response(
+                "Detected Qwen3 reranker chat template, but the server is not in generation mode. "
+                "Please relaunch without --is-embedding for Qwen3-Reranker models."
+            )
+
+        try:
+            prompts = [
+                _render_jinja_chat_template(
+                    chat_template,
+                    query=request.query,
+                    document=doc,
+                    instruct=getattr(request, "instruct", None),
+                )
+                for doc in request.documents
+            ]
+
+            result = await self.tokenizer_manager.score_prompts(
+                prompts,
+                label_token_ids=[self._yes_token_id, self._no_token_id],
+                apply_softmax=False,
+                request=raw_request,
+            )
+            scores = [_qwen3_rerank_score(s[0], s[1]) for s in result.scores]
+        except ValueError as e:
+            return self.create_error_response(str(e))
+        except Exception as e:
+            # Includes template rendering errors from jinja2.
+            return self.create_error_response(str(e))
+
+        responses = self._build_rerank_response(scores, request)
+        return responses
+
+    async def _handle_vl_reranker_request(
+        self,
+        request: V1RerankReqInput,
+        raw_request: Request,
+        _chat_template: str,
+    ) -> Union[List[RerankResponse], ErrorResponse]:
+        """Handle multimodal VL reranker request using chat completion with logprobs."""
+        if not self.tokenizer_manager.model_config.is_generation:
+            return self.create_error_response(
+                "Detected Qwen3-VL reranker, but the server is not in generation mode. "
+                "Please relaunch without --is-embedding for Qwen3-VL-Reranker models."
+            )
+
+        try:
+            scores = []
+            instruct = getattr(request, "instruct", None)
+
+            for doc in request.documents:
+                # Build multimodal content lists and render prompt using jinja template
+                query_content, doc_content, image_data, video_data = (
+                    self._build_vl_reranker_content(
+                        query=request.query,
+                        document=doc,
+                    )
+                )
+
+                # Render the chat template directly with query/document variables
+                prompt = _render_vl_jinja_template(
+                    chat_template=_chat_template,
+                    query=query_content,
+                    document=doc_content,
+                    instruct=instruct,
+                )
+
+                # Create generate request with logprobs
+                gen_request = GenerateReqInput(
+                    text=prompt,
+                    image_data=image_data if image_data else None,
+                    video_data=video_data if video_data else None,
+                    sampling_params={
+                        "max_new_tokens": 1,
+                        "temperature": 0,
+                    },
+                    return_logprob=True,
+                    top_logprobs_num=50,  # Get enough logprobs to find yes/no tokens
+                    logprob_start_len=0,
+                )
+
+                # Execute generation request
+                ret = await self.tokenizer_manager.generate_request(
+                    gen_request, raw_request
+                ).__anext__()
+
+                # Extract yes/no probabilities from logprobs
+                score = self._extract_score_from_logprobs(ret)
+                scores.append(score)
+
+            responses = self._build_rerank_response(scores, request)
+            return responses
+
+        except ValueError as e:
+            return self.create_error_response(str(e))
+        except Exception as e:
+            logger.exception("Error handling VL reranker request")
+            return self.create_error_response(str(e))
+
+    def _build_vl_reranker_content(
+        self,
+        query: RerankContent,
+        document: RerankContent,
+    ) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[str], List[str]]:
+        """Build content lists for VL reranker request.
+
+        Returns:
+            Tuple of (query_content, document_content, image_data, video_data)
+            where query_content and document_content are lists suitable for jinja template.
+        """
+        image_data = []
+        video_data = []
+
+        # Build query content list
+        query_content = self._content_to_template_list(query, image_data, video_data)
+
+        # Build document content list
+        doc_content = self._content_to_template_list(document, image_data, video_data)
+
+        return query_content, doc_content, image_data, video_data
+
+    def _content_to_template_list(
+        self,
+        content: RerankContent,
+        image_data: List[str],
+        video_data: List[str],
+    ) -> List[Dict[str, Any]]:
+        """Convert RerankContent to a list format suitable for jinja template."""
+        result = []
+
+        if isinstance(content, str):
+            result.append({"type": "text", "text": content})
+            return result
+
+        for part in content:
+            if isinstance(part, ChatCompletionMessageContentTextPart):
+                result.append({"type": "text", "text": part.text})
+            elif isinstance(part, ChatCompletionMessageContentImagePart):
+                if part.image_url:
+                    image_data.append(part.image_url.url)
+                    result.append({"type": "image"})
+            elif isinstance(part, ChatCompletionMessageContentVideoPart):
+                if part.video_url:
+                    video_data.append(part.video_url.url)
+                    result.append({"type": "video"})
+            elif isinstance(part, dict):
+                part_type = part.get("type")
+                if part_type == "text":
+                    result.append({"type": "text", "text": part.get("text", "")})
+                elif part_type == "image_url":
+                    image_url = part.get("image_url", {})
+                    if isinstance(image_url, dict):
+                        url = image_url.get("url")
+                    else:
+                        url = image_url
+                    if url:
+                        image_data.append(url)
+                        result.append({"type": "image"})
+                elif part_type == "video_url":
+                    video_url = part.get("video_url", {})
+                    if isinstance(video_url, dict):
+                        url = video_url.get("url")
+                    else:
+                        url = video_url
+                    if url:
+                        video_data.append(url)
+                        result.append({"type": "video"})
+
+        return result
+
+    def _extract_score_from_logprobs(self, ret: Dict[str, Any]) -> float:
+        """Extract reranking score from generation response with logprobs."""
+        import math
+
+        # Get logprobs from the response
+        meta_info = ret.get("meta_info", {})
+        output_top_logprobs = meta_info.get("output_top_logprobs", [])
+
+        # Use output_top_logprobs[0] - the model's prediction for the first generated token
+        top_logprobs = output_top_logprobs[0] if output_top_logprobs else []
+
+        # Find yes and no token probabilities
+        # Format: list of tuples (logprob, token_id, token_text)
+        p_yes = 0.0
+        p_no = 0.0
+
+        for item in top_logprobs:
+            logprob, token_id = item[0], item[1]
+            if token_id == self._yes_token_id:
+                p_yes = math.exp(logprob)
+            elif token_id == self._no_token_id:
+                p_no = math.exp(logprob)
+
+        return _qwen3_rerank_score(p_yes, p_no)
+
+    def _build_rerank_response(
+        self, ret: Union[List[Dict[str, Any]], List[float]], request: V1RerankReqInput
+    ) -> List[RerankResponse]:
+        """Build the rerank response from generation results"""
+        responses = []
+        for idx, item in enumerate(ret):
+            if isinstance(item, dict):
+                score_val = item.get("embedding")
+                # Some rerank/reward models return scalar score as embedding[0].
+                if isinstance(score_val, list):
+                    if len(score_val) == 0 or not isinstance(
+                        score_val[0], (int, float)
+                    ):
+                        raise ValueError(
+                            f"Invalid embedding score for rerank at index {idx}: {score_val!r}"
+                        )
+                    score_val = float(score_val[0])
+                responses.append(
+                    RerankResponse(
+                        score=float(score_val),
+                        document=(
+                            request.documents[idx] if request.return_documents else None
+                        ),
+                        index=idx,
+                        meta_info=item.get("meta_info"),
+                    )
+                )
+            else:
+                responses.append(
+                    RerankResponse(
+                        score=float(item),
+                        document=(
+                            request.documents[idx] if request.return_documents else None
+                        ),
+                        index=idx,
+                    )
+                )
+
+        # Sort by score in descending order (highest relevance first)
+        responses.sort(key=lambda x: x.score, reverse=True)
+
+        # Apply top_n limit if specified
+        if request.top_n is not None and request.top_n > 0:
+            responses = responses[: request.top_n]
+
+        return responses
diff --git a/sglang/python/sglang/srt/entrypoints/openai/serving_responses.py b/sglang/python/sglang/srt/entrypoints/openai/serving_responses.py
new file mode 100644
index 0000000000000000000000000000000000000000..edd4e52cf2c42406b9221a460ee5705d0302fbb1
--- /dev/null
+++ b/sglang/python/sglang/srt/entrypoints/openai/serving_responses.py
@@ -0,0 +1,1324 @@
+# SPDX-License-Identifier: Apache-2.0
+# Adapted from vLLM's OpenAIServingResponses
+"""Handler for /v1/responses requests"""
+
+from __future__ import annotations
+
+import asyncio
+import copy
+import json
+import logging
+import time
+from contextlib import AsyncExitStack
+from http import HTTPStatus
+from typing import TYPE_CHECKING, Any, AsyncGenerator, AsyncIterator, Optional, Union
+
+import jinja2
+import openai.types.responses as openai_responses_types
+import orjson
+from fastapi import Request
+from fastapi.responses import ORJSONResponse
+from openai.types.responses import (
+    ResponseOutputMessage,
+    ResponseOutputText,
+    ResponseReasoningItem,
+)
+from openai.types.responses.response_function_tool_call import ResponseFunctionToolCall
+from openai.types.responses.response_reasoning_item import (
+    Content as ResponseReasoningTextContent,
+)
+from openai_harmony import Message as OpenAIMessage
+
+from sglang.srt.entrypoints.context import (
+    ConversationContext,
+    HarmonyContext,
+    SimpleContext,
+    StreamingHarmonyContext,
+)
+from sglang.srt.entrypoints.harmony_utils import (
+    get_developer_message,
+    get_stop_tokens_for_assistant_actions,
+    get_system_message,
+    get_user_message,
+    parse_output_message,
+    parse_remaining_state,
+    parse_response_input,
+    render_for_completion,
+)
+from sglang.srt.entrypoints.openai.protocol import (
+    ChatCompletionMessageParam,
+    ChatCompletionRequest,
+    PromptTokenUsageInfo,
+    RequestResponseMetadata,
+    ResponsesRequest,
+    ResponsesResponse,
+    UsageInfo,
+)
+from sglang.srt.entrypoints.openai.serving_chat import OpenAIServingChat
+from sglang.srt.entrypoints.openai.tool_server import MCPToolServer, ToolServer
+from sglang.srt.managers.io_struct import GenerateReqInput
+from sglang.srt.parser.reasoning_parser import ReasoningParser
+from sglang.srt.utils import random_uuid
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
+logger = logging.getLogger(__name__)
+
+
+class OpenAIServingResponses(OpenAIServingChat):
+    """Handler for /v1/responses requests"""
+
+    def __init__(
+        self,
+        tokenizer_manager: TokenizerManager,
+        template_manager: TemplateManager,
+        *,
+        enable_prompt_tokens_details: bool = False,
+        enable_force_include_usage: bool = False,
+        tool_server: Optional[ToolServer] = None,
+    ) -> None:
+        super().__init__(tokenizer_manager, template_manager)
+
+        # template_manager is already set by parent class
+        self.reasoning_parser = self.tokenizer_manager.server_args.reasoning_parser
+        self.enable_prompt_tokens_details = enable_prompt_tokens_details
+        self.enable_force_include_usage = enable_force_include_usage
+
+        # Get default sampling params from model config if available
+        self.default_sampling_params = {}
+
+        self.supports_browsing = (
+            tool_server.has_tool("browser") if tool_server else False
+        )
+        self.supports_code_interpreter = (
+            tool_server.has_tool("python") if tool_server else False
+        )
+        self.tool_server = tool_server
+        # Get from model config
+        self.use_harmony = (
+            self.tokenizer_manager.model_config.hf_config.model_type == "gpt_oss"
+        )
+
+        if self.use_harmony:
+            # OpenAI models have two EOS-like tokens: <|return|> and <|call|>.
+            # We need to add them to the stop token ids.
+            if "stop_token_ids" not in self.default_sampling_params:
+                self.default_sampling_params["stop_token_ids"] = []
+            self.default_sampling_params["stop_token_ids"].extend(
+                get_stop_tokens_for_assistant_actions()
+            )
+
+        # Response storage for background and retrieval operations
+        # Note: In production, this should use a proper storage backend (Redis, database)
+        # with TTL/expiration to prevent memory leaks
+        self.response_store: dict[str, ResponsesResponse] = {}
+        self.response_store_lock = asyncio.Lock()
+
+        # Message storage for conversation continuity
+        # Note: In production, this should use a proper storage backend (Redis, database)
+        # with TTL/expiration to prevent memory leaks
+        self.msg_store: dict[
+            str, Union[list[ChatCompletionMessageParam], list["OpenAIMessage"]]
+        ] = {}
+
+        self.background_tasks: dict[str, asyncio.Task] = {}
+
+    # error helpers dedicated for v1/responses
+    def create_error_response(
+        self,
+        message: str,
+        err_type: str = "invalid_request_error",
+        status_code: int = 400,
+        param: Optional[str] = None,
+    ) -> ORJSONResponse:
+        nested_error = {
+            "message": message,
+            "type": err_type,
+            "param": param,
+            "code": status_code,
+        }
+        return ORJSONResponse(content={"error": nested_error}, status_code=status_code)
+
+    def create_streaming_error_response(
+        self,
+        message: str,
+        err_type: str = "BadRequestError",
+        status_code: int = 400,
+    ) -> str:
+        return json.dumps(
+            {
+                "error": {
+                    "message": message,
+                    "type": err_type,
+                    "param": None,
+                    "code": status_code,
+                }
+            }
+        )
+
+    def _request_id_prefix(self) -> str:
+        return "resp_"
+
+    async def create_responses(
+        self,
+        request: ResponsesRequest,
+        raw_request: Optional[Request] = None,
+    ) -> Union[AsyncGenerator[str, None], ResponsesResponse, ORJSONResponse]:
+        # Validate model
+        if not self.tokenizer_manager:
+            return self.create_error_response("Model not loaded")
+
+        # FIXME: If the engine is dead, raise an error
+        # This is required for the streaming case
+
+        # Handle the previous response ID
+        prev_response_id = request.previous_response_id
+        if prev_response_id is not None:
+            if not prev_response_id.startswith("resp_"):
+                return self._make_invalid_id_error(prev_response_id)
+            async with self.response_store_lock:
+                prev_response = self.response_store.get(prev_response_id)
+            if prev_response is None:
+                return self._make_not_found_error(prev_response_id)
+        else:
+            prev_response = None
+
+        try:
+            model_name = request.model
+            tokenizer = self.tokenizer_manager.tokenizer
+
+            if self.use_harmony:
+                messages, request_prompts, engine_prompts = (
+                    self._make_request_with_harmony(request, prev_response)
+                )
+            else:
+                messages, request_prompts, engine_prompts = await self._make_request(
+                    request, prev_response, tokenizer
+                )
+
+        except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e:
+            logger.exception("Error in preprocessing prompt inputs")
+            return self.create_error_response(f"{e} {e.__cause__}")
+
+        request_metadata = RequestResponseMetadata(request_id=request.request_id)
+        if raw_request:
+            raw_request.state.request_metadata = request_metadata
+
+        if (
+            self.tool_server is not None
+            and isinstance(self.tool_server, MCPToolServer)
+            and (request.background or request.stream)
+            and request.tools
+            and any(
+                tool.type in ["web_search_preview", "code_interpreter"]
+                for tool in request.tools
+            )
+        ):
+            return self.create_error_response(
+                "MCP tool server is not supported in background mode and "
+                "streaming mode"
+            )
+
+        # Schedule the request and get the result generator
+        generators: list[AsyncGenerator[Any, None]] = []
+        tool_list = []
+        if self.use_harmony:
+            if self.supports_browsing:
+                tool_list.append("browser")
+            if self.supports_code_interpreter:
+                tool_list.append("python")
+        async with AsyncExitStack() as exit_stack:
+            try:
+                if self.tool_server is not None:
+                    tool_session_ctxs: dict[str, Any] = {
+                        tool_name: exit_stack.enter_async_context(
+                            self.tool_server.get_tool_session(tool_name)
+                        )
+                        for tool_name in tool_list
+                    }
+                    tool_sessions = {}
+                    for tool_name in tool_list:
+                        tool_sessions[tool_name] = await tool_session_ctxs[tool_name]
+                else:
+                    assert len(tool_list) == 0
+                    tool_sessions = {}
+                for i, engine_prompt in enumerate(engine_prompts):
+                    # Calculate default max tokens from context length minus prompt length
+                    if hasattr(engine_prompt, "__len__"):
+                        prompt_length = len(engine_prompt)
+                    elif isinstance(engine_prompt, list):
+                        prompt_length = len(engine_prompt)
+                    else:
+                        prompt_length = 0
+
+                    context_len = (
+                        self.tokenizer_manager.model_config.context_len
+                        if hasattr(self.tokenizer_manager.model_config, "context_len")
+                        else 4096
+                    )
+                    default_max_tokens = max(
+                        context_len - prompt_length, 512
+                    )  # Ensure minimum 512 tokens
+                    sampling_params = request.to_sampling_params(
+                        default_max_tokens, self.default_sampling_params
+                    )
+
+                    context: ConversationContext
+                    if self.use_harmony:
+                        if request.stream:
+                            context = StreamingHarmonyContext(messages, tool_sessions)
+                        else:
+                            context = HarmonyContext(messages, tool_sessions)
+                    else:
+                        context = SimpleContext()
+
+                    # Create GenerateReqInput for SGLang
+                    adapted_request = GenerateReqInput(
+                        input_ids=engine_prompt,
+                        sampling_params=sampling_params,
+                        stream=request.stream,
+                        rid=request.request_id,
+                        extra_key=self._compute_extra_key(request),
+                        background=request.background,
+                    )
+
+                    generator = self._generate_with_builtin_tools(
+                        request.request_id,
+                        request_prompts[i],
+                        adapted_request,
+                        sampling_params,
+                        context,
+                        raw_request=raw_request,
+                        priority=request.priority,
+                    )
+                    generators.append(generator)
+            except ValueError as e:
+                return self.create_error_response(str(e))
+
+            assert len(generators) == 1
+            (result_generator,) = generators
+
+            # Store the input messages
+            if request.store:
+                self.msg_store[request.request_id] = messages
+
+            if request.background:
+                created_time = int(time.time())
+                response = ResponsesResponse.from_request(
+                    request,
+                    sampling_params,
+                    model_name=model_name,
+                    created_time=created_time,
+                    output=[],
+                    status="queued",
+                    usage=None,
+                )
+                async with self.response_store_lock:
+                    self.response_store[response.id] = response
+
+                # Run the request in the background
+                task = asyncio.create_task(
+                    self._run_background_request(
+                        request,
+                        sampling_params,
+                        result_generator,
+                        context,
+                        model_name,
+                        tokenizer,
+                        request_metadata,
+                        created_time,
+                    ),
+                    name=f"create_{response.id}",
+                )
+
+                # For cleanup
+                self.background_tasks[response.id] = task
+                task.add_done_callback(
+                    lambda _: self.background_tasks.pop(response.id, None)
+                )
+                return response
+
+            if request.stream:
+                return self.responses_stream_generator(
+                    request,
+                    sampling_params,
+                    result_generator,
+                    context,
+                    model_name,
+                    tokenizer,
+                    request_metadata,
+                )
+            try:
+                result: Union[ORJSONResponse, ResponsesResponse] = (
+                    await self.responses_full_generator(
+                        request,
+                        sampling_params,
+                        result_generator,
+                        context,
+                        model_name,
+                        tokenizer,
+                        request_metadata,
+                    )
+                )
+                return result
+            except Exception as e:
+                return self.create_error_response(str(e))
+        return self.create_error_response("Unknown error")
+
+    async def _make_request(
+        self,
+        request: ResponsesRequest,
+        prev_response: Optional[ResponsesResponse],
+        tokenizer: Any,
+    ):
+        # Construct the input messages
+        messages = self._construct_input_messages(request, prev_response)
+
+        # Follow SGLang's pattern: create a ChatCompletionRequest and process messages
+        try:
+            # Convert ResponsesRequest to ChatCompletionRequest for processing
+            chat_request = ChatCompletionRequest(
+                model=request.model,
+                messages=messages,
+                stream=request.stream,
+            )
+
+            # Follow SGLang's _process_messages pattern
+            is_multimodal = self.tokenizer_manager.model_config.is_multimodal
+            processed_messages = self._process_messages(chat_request, is_multimodal)
+
+            # Extract the results
+            if is_multimodal:
+                request_prompts = [processed_messages.prompt]
+                engine_prompts = [processed_messages.prompt]
+            else:
+                request_prompts = [processed_messages.prompt_ids]
+                engine_prompts = [processed_messages.prompt_ids]
+
+        except Exception as e:
+            logger.warning(f"Chat processing failed, using fallback: {e}")
+            # Fallback to simple encoding
+            prompt_text = ""
+            for msg in messages:
+                role = msg.get("role", "user")
+                content = msg.get("content", "")
+                prompt_text += f"{role}: {content}\n"
+            prompt_ids = tokenizer.encode(prompt_text)
+            request_prompts = [prompt_ids]
+            engine_prompts = [prompt_ids]
+
+        return messages, request_prompts, engine_prompts
+
+    def _make_request_with_harmony(
+        self,
+        request: ResponsesRequest,
+        prev_response: Optional[ResponsesResponse],
+    ):
+        if request.tool_choice != "auto":
+            raise NotImplementedError(
+                "Only 'auto' tool_choice is supported in " "response API"
+            )
+        messages = self._construct_input_messages_with_harmony(request, prev_response)
+        prompt_token_ids = render_for_completion(messages)
+        engine_prompt = prompt_token_ids
+        return messages, [prompt_token_ids], [engine_prompt]
+
+    async def responses_full_generator(
+        self,
+        request: ResponsesRequest,
+        sampling_params: Any,
+        result_generator: AsyncIterator[Any],
+        context: ConversationContext,
+        model_name: str,
+        tokenizer: Any,
+        request_metadata: RequestResponseMetadata,
+        created_time: Optional[int] = None,
+    ) -> Union[ResponsesResponse, ORJSONResponse]:
+        if created_time is None:
+            created_time = int(time.time())
+
+        try:
+            async for _ in result_generator:
+                pass
+        except asyncio.CancelledError:
+            return self.create_error_response("Client disconnected")
+        except ValueError as e:
+            return self.create_error_response(str(e))
+
+        if self.use_harmony:
+            assert isinstance(context, HarmonyContext)
+            output = self._make_response_output_items_with_harmony(context)
+            # TODO: these are all 0 for now!
+            num_prompt_tokens = context.num_prompt_tokens
+            num_generated_tokens = context.num_output_tokens
+            num_cached_tokens = context.num_cached_tokens
+            num_reasoning_tokens = context.num_reasoning_tokens
+        else:
+            assert isinstance(context, SimpleContext)
+            final_res = context.last_output
+            assert final_res is not None
+
+            output = self._make_response_output_items(
+                request, final_res["text"], tokenizer
+            )
+
+            # Calculate usage from actual output
+            if hasattr(final_res, "meta_info"):
+                num_prompt_tokens = final_res.meta_info.get("prompt_tokens", 0)
+                num_generated_tokens = final_res.meta_info.get("completion_tokens", 0)
+                num_cached_tokens = final_res.meta_info.get("cached_tokens", 0)
+            elif hasattr(final_res, "prompt_token_ids") and hasattr(
+                final_res, "outputs"
+            ):
+                # Fallback calculation if meta_info not available
+                num_prompt_tokens = (
+                    len(final_res.prompt_token_ids) if final_res.prompt_token_ids else 0
+                )
+                num_generated_tokens = (
+                    len(final_res.outputs[0].token_ids)
+                    if final_res.outputs and final_res.outputs[0].token_ids
+                    else 0
+                )
+                num_cached_tokens = getattr(final_res, "num_cached_tokens", 0)
+                num_reasoning_tokens = 0
+            else:
+                # Final fallback
+                num_prompt_tokens = 0
+                num_generated_tokens = 0
+                num_cached_tokens = 0
+                num_reasoning_tokens = 0
+
+        usage = UsageInfo(
+            prompt_tokens=num_prompt_tokens,
+            completion_tokens=num_generated_tokens,
+            total_tokens=num_prompt_tokens + num_generated_tokens,
+            reasoning_tokens=num_reasoning_tokens,
+        )
+        if self.enable_prompt_tokens_details and num_cached_tokens:
+            usage.prompt_tokens_details = PromptTokenUsageInfo(
+                cached_tokens=num_cached_tokens
+            )
+        request_metadata.final_usage_info = usage
+
+        response = ResponsesResponse.from_request(
+            request,
+            sampling_params,
+            model_name=model_name,
+            created_time=created_time,
+            output=output,
+            status="completed",
+            usage=usage,
+        )
+
+        if request.store:
+            async with self.response_store_lock:
+                stored_response = self.response_store.get(response.id)
+                # If the response is already cancelled, don't update it
+                if stored_response is None or stored_response.status != "cancelled":
+                    self.response_store[response.id] = response
+
+        return response
+
+    def _make_response_output_items(
+        self,
+        request: ResponsesRequest,
+        final_output: Any,
+        tokenizer: Any,
+    ):
+        # Handle reasoning parsing if enabled
+        if self.reasoning_parser:
+            # Use standard reasoning parser (openai maps to T4Detector internally)
+            reasoning_parser = ReasoningParser(
+                model_type=self.reasoning_parser,
+                stream_reasoning=False,
+                request=request,
+            )
+            reasoning_content, content = reasoning_parser.parse_non_stream(final_output)
+        else:
+            reasoning_content = None
+            content = final_output
+
+        output_items = []
+        if reasoning_content:
+            reasoning_item = ResponseReasoningItem(
+                id=f"rs_{random_uuid()}",
+                type="reasoning",
+                summary=[],
+                content=[
+                    ResponseReasoningTextContent(
+                        type="reasoning_text", text=reasoning_content
+                    ),
+                ],
+                status=None,
+            )
+            output_items.append(reasoning_item)
+        if content:
+            output_text = ResponseOutputText(
+                text=content,
+                annotations=[],  # TODO
+                type="output_text",
+                logprobs=None,  # TODO
+            )
+            message = ResponseOutputMessage(
+                id=f"msg_{random_uuid()}",
+                content=[output_text],
+                role="assistant",
+                status="completed",
+                type="message",
+            )
+            output_items.append(message)
+        return output_items
+
+    def _make_response_output_items_with_harmony(
+        self,
+        context: HarmonyContext,
+    ):
+        output_items = []
+        num_init_messages = context.num_init_messages
+        for msg in context.messages[num_init_messages:]:
+            output_items.extend(parse_output_message(msg))
+        # Handle the generation stopped in the middle (if any).
+        last_items = parse_remaining_state(context.parser)
+        if last_items:
+            output_items.extend(last_items)
+        return output_items
+
+    def _construct_input_messages(
+        self,
+        request: ResponsesRequest,
+        prev_response: Optional[ResponsesResponse] = None,
+    ) -> list[ChatCompletionMessageParam]:
+        messages: list[ChatCompletionMessageParam] = []
+        if request.instructions:
+            messages.append(
+                {
+                    "role": "system",
+                    "content": request.instructions,
+                }
+            )
+
+        # Prepend the conversation history
+        if prev_response is not None:
+            # Add the previous messages
+            prev_msg = self.msg_store[prev_response.id]
+            messages.extend(prev_msg)
+
+            # Add the previous output
+            for output_item in prev_response.output:
+                # NOTE: We skip the reasoning output of the previous response
+                if isinstance(output_item, ResponseReasoningItem):
+                    continue
+                for content in output_item.content:
+                    messages.append(
+                        {
+                            "role": "system",
+                            "content": request.instructions,
+                        }
+                    )
+
+        # Append the new input
+        # Responses API supports simple text inputs without chat format
+        if isinstance(request.input, str):
+            messages.append({"role": "user", "content": request.input})
+        else:
+            messages.extend(request.input)  # type: ignore
+        return messages
+
+    def _construct_input_messages_with_harmony(
+        self,
+        request: ResponsesRequest,
+        prev_response: Optional[ResponsesResponse],
+    ) -> list["OpenAIMessage"]:
+        messages: list["OpenAIMessage"] = []
+        if prev_response is None:
+            # New conversation.
+            reasoning_effort = request.reasoning.effort if request.reasoning else None
+            tool_types = [tool.type for tool in request.tools]
+            enable_browser = (
+                "web_search_preview" in tool_types and self.tool_server is not None
+            )
+            enable_code_interpreter = (
+                "code_interpreter" in tool_types and self.tool_server is not None
+            )
+            sys_msg = get_system_message(
+                reasoning_effort=reasoning_effort,
+                browser_description=(
+                    self.tool_server.get_tool_description("browser")
+                    if self.tool_server and enable_browser
+                    else None
+                ),
+                python_description=(
+                    self.tool_server.get_tool_description("python")
+                    if self.tool_server and enable_code_interpreter
+                    else None
+                ),
+            )
+            messages.append(sys_msg)
+            dev_msg = get_developer_message(request.instructions, request.tools)
+            messages.append(dev_msg)
+        else:
+            # Continue the previous conversation.
+            # FIXME: Currently, request params like reasoning and
+            # instructions are ignored.
+            prev_msgs = self.msg_store[prev_response.id]
+            # Remove the previous chain-of-thoughts if there is a new "final"
+            # message.
+            if (
+                len(prev_msgs) > 0
+                and hasattr(prev_msgs[-1], "channel")
+                and prev_msgs[-1].channel == "final"
+            ):  # type: ignore[union-attr]
+                prev_final_msg_idx = -1
+                for i in range(len(prev_msgs) - 2, -1, -1):
+                    if (
+                        hasattr(prev_msgs[i], "channel")
+                        and prev_msgs[i].channel == "final"
+                    ):  # type: ignore[union-attr]
+                        prev_final_msg_idx = i
+                        break
+                recent_turn_msgs = prev_msgs[prev_final_msg_idx + 1 :]
+                del prev_msgs[prev_final_msg_idx + 1 :]
+                for msg in recent_turn_msgs:
+                    if (
+                        hasattr(msg, "channel") and msg.channel != "analysis"
+                    ):  # type: ignore[union-attr]
+                        prev_msgs.append(msg)
+            messages.extend(prev_msgs)
+        # Append the new input.
+        # Responses API supports simple text inputs without chat format.
+        if isinstance(request.input, str):
+            messages.append(get_user_message(request.input))
+        else:
+            if prev_response is not None:
+                prev_outputs = copy(prev_response.output)
+            else:
+                prev_outputs = []
+            for response_msg in request.input:
+                messages.append(parse_response_input(response_msg, prev_outputs))
+                if isinstance(response_msg, ResponseFunctionToolCall):
+                    prev_outputs.append(response_msg)
+        return messages
+
+    async def _run_background_request(
+        self,
+        request: ResponsesRequest,
+        sampling_params: Any,
+        result_generator: AsyncIterator[Any],
+        context: ConversationContext,
+        model_name: str,
+        tokenizer: Any,
+        request_metadata: RequestResponseMetadata,
+        created_time: Optional[int] = None,
+        *args,
+        **kwargs,
+    ):
+        try:
+            # Update the status to "in_progress"
+            async with self.response_store_lock:
+                stored_response = self.response_store.get(request.request_id)
+                assert stored_response is not None
+                stored_response.status = "in_progress"
+
+            response = await self.responses_full_generator(
+                request,
+                sampling_params,
+                result_generator,
+                context,
+                model_name,
+                tokenizer,
+                request_metadata,
+                created_time,
+                *args,
+                **kwargs,
+            )
+        except Exception as e:
+            logger.exception("Background request failed for %s", request.request_id)
+            response = self.create_error_response(str(e))
+
+        if isinstance(response, ORJSONResponse):
+            # If the request has failed, update the status to "failed"
+            response_id = request.request_id
+            async with self.response_store_lock:
+                stored_response = self.response_store.get(response_id)
+                assert stored_response is not None
+                if stored_response.status not in ("completed", "cancelled"):
+                    stored_response.status = "failed"
+
+    async def retrieve_responses(
+        self,
+        response_id: str,
+    ) -> Union[ResponsesResponse, ORJSONResponse]:
+        if not response_id.startswith("resp_"):
+            return self._make_invalid_id_error(response_id)
+
+        async with self.response_store_lock:
+            response = self.response_store.get(response_id)
+
+        if response is None:
+            return self._make_not_found_error(response_id)
+        return response
+
+    async def cancel_responses(
+        self,
+        response_id: str,
+    ) -> Union[ResponsesResponse, ORJSONResponse]:
+        if not response_id.startswith("resp_"):
+            return self._make_invalid_id_error(response_id)
+
+        async with self.response_store_lock:
+            response = self.response_store.get(response_id)
+            if response is None:
+                return self._make_not_found_error(response_id)
+
+            prev_status = response.status
+            if prev_status not in ("queued", "in_progress"):
+                return self.create_error_response(
+                    err_type="invalid_request_error",
+                    message="Cannot cancel a synchronous response.",
+                )
+
+            # Update the status to "cancelled"
+            response.status = "cancelled"
+
+        # The response_id is the same as the rid used when submitting the request
+        self.tokenizer_manager.abort_request(rid=response_id)
+
+        if task := self.background_tasks.get(response_id):
+            task.cancel()
+            try:
+                await task
+            except asyncio.CancelledError:
+                logger.exception("Background task for %s was cancelled", response_id)
+        return response
+
+    def _make_invalid_id_error(self, response_id: str):
+        return self.create_error_response(
+            message=(
+                f"Invalid 'response_id': '{response_id}'. "
+                "Expected an ID that begins with 'resp'."
+            ),
+            err_type="invalid_request_error",
+            param="response_id",
+        )
+
+    def _make_not_found_error(self, response_id: str):
+        return self.create_error_response(
+            message=f"Response with id '{response_id}' not found.",
+            err_type="invalid_request_error",
+            status_code=HTTPStatus.NOT_FOUND,
+            param="response_id",
+        )
+
+    async def responses_stream_generator(
+        self,
+        request: ResponsesRequest,
+        sampling_params: Any,
+        result_generator: AsyncIterator[StreamingHarmonyContext],
+        context: StreamingHarmonyContext,
+        model_name: str,
+        tokenizer: Any,
+        request_metadata: RequestResponseMetadata,
+        created_time: Optional[int] = None,
+    ) -> AsyncGenerator[str, None]:
+        # TODO:
+        # 1. Handle disconnect
+
+        created_time = created_time or int(time.time())
+
+        sequence_number = 0
+
+        def _send_event(event):
+            nonlocal sequence_number
+            # Set sequence_number if the event has this attribute
+            if hasattr(event, "sequence_number"):
+                event.sequence_number = sequence_number
+            sequence_number += 1
+            # Get event type from the event's type field if it exists
+            event_type = getattr(event, "type", "unknown")
+            return (
+                f"event: {event_type}\n"
+                f"data: {event.model_dump_json(indent=None)}\n\n"
+            )
+
+        current_content_index = 0
+        current_output_index = 0
+        current_item_id = f"item_{random_uuid()}"
+        sent_output_item_added = False
+
+        initial_response = ResponsesResponse.from_request(
+            request,
+            sampling_params,
+            model_name=model_name,
+            created_time=created_time,
+            output=[],
+            status="in_progress",
+            usage=None,
+        ).model_dump()
+        yield _send_event(
+            openai_responses_types.ResponseCreatedEvent(
+                type="response.created",
+                sequence_number=-1,
+                response=initial_response,
+            )
+        )
+        yield _send_event(
+            openai_responses_types.ResponseInProgressEvent(
+                type="response.in_progress",
+                sequence_number=-1,
+                response=initial_response,
+            )
+        )
+
+        async for ctx in result_generator:
+
+            # Only process context objects that implement the `is_expecting_start()` method,
+            # which indicates they support per-turn streaming (e.g., StreamingHarmonyContext).
+            # Contexts without this method are skipped, as they do not represent a new turn
+            # or are not compatible with per-turn handling in the /v1/responses endpoint.
+            if not hasattr(ctx, "is_expecting_start"):
+                continue
+
+            if ctx.is_expecting_start():
+                current_output_index += 1
+                sent_output_item_added = False
+
+                if len(ctx.parser.messages) > 0:
+                    previous_item = ctx.parser.messages[-1]
+                    if previous_item.recipient is not None:
+                        # Deal with tool call here
+                        pass
+                    elif previous_item.channel == "analysis":
+                        reasoning_item = ResponseReasoningItem(
+                            id=f"rs_{random_uuid()}",
+                            type="reasoning",
+                            summary=[],
+                            content=[
+                                ResponseReasoningTextContent(
+                                    text=previous_item.content[0].text,
+                                    type="reasoning_text",
+                                ),
+                            ],
+                            status="completed",
+                        )
+                        yield _send_event(
+                            openai_responses_types.ResponseReasoningTextDoneEvent(
+                                type="response.reasoning_text.done",
+                                item_id=current_item_id,
+                                sequence_number=-1,
+                                output_index=current_output_index,
+                                content_index=current_content_index,
+                                text=previous_item.content[0].text,
+                            )
+                        )
+                        yield _send_event(
+                            openai_responses_types.ResponseOutputItemDoneEvent(
+                                type="response.output_item.done",
+                                sequence_number=-1,
+                                output_index=current_output_index,
+                                item=reasoning_item,
+                            )
+                        )
+                    elif previous_item.channel == "final":
+                        text_content = openai_responses_types.ResponseOutputText(
+                            type="output_text",
+                            text=previous_item.content[0].text,
+                            annotations=[],
+                        )
+                        yield _send_event(
+                            openai_responses_types.ResponseTextDoneEvent(
+                                type="response.output_text.done",
+                                sequence_number=-1,
+                                output_index=current_output_index,
+                                content_index=current_content_index,
+                                text=previous_item.content[0].text,
+                                logprobs=[],
+                                item_id=current_item_id,
+                            )
+                        )
+                        yield _send_event(
+                            openai_responses_types.ResponseContentPartDoneEvent(
+                                type="response.content_part.done",
+                                sequence_number=-1,
+                                item_id=current_item_id,
+                                output_index=current_output_index,
+                                content_index=current_content_index,
+                                part=text_content,
+                            )
+                        )
+                        yield _send_event(
+                            openai_responses_types.ResponseOutputItemDoneEvent(
+                                type="response.output_item.done",
+                                sequence_number=-1,
+                                output_index=current_output_index,
+                                item=openai_responses_types.ResponseOutputMessage(
+                                    id=current_item_id,
+                                    type="message",
+                                    role="assistant",
+                                    content=[text_content],
+                                    status="completed",
+                                ),
+                            )
+                        )
+
+            if ctx.parser.last_content_delta:
+                if (
+                    ctx.parser.current_channel == "final"
+                    and ctx.parser.current_recipient is None
+                ):
+                    if not sent_output_item_added:
+                        sent_output_item_added = True
+                        yield _send_event(
+                            openai_responses_types.ResponseOutputItemAddedEvent(
+                                type="response.output_item.added",
+                                sequence_number=-1,
+                                output_index=current_output_index,
+                                item=openai_responses_types.ResponseOutputMessage(
+                                    id=current_item_id,
+                                    type="message",
+                                    role="assistant",
+                                    content=[],
+                                    status="in_progress",
+                                ),
+                            )
+                        )
+                        yield _send_event(
+                            openai_responses_types.ResponseContentPartAddedEvent(
+                                type="response.content_part.added",
+                                sequence_number=-1,
+                                output_index=current_output_index,
+                                item_id=current_item_id,
+                                content_index=current_content_index,
+                                part=openai_responses_types.ResponseOutputText(
+                                    type="output_text",
+                                    text="",
+                                    annotations=[],
+                                    logprobs=None,
+                                ),
+                            )
+                        )
+                    yield _send_event(
+                        openai_responses_types.ResponseTextDeltaEvent(
+                            type="response.output_text.delta",
+                            sequence_number=-1,
+                            content_index=current_content_index,
+                            output_index=current_output_index,
+                            item_id=current_item_id,
+                            delta=ctx.parser.last_content_delta,
+                            # TODO, use logprobs from ctx.last_request_output
+                            logprobs=[],
+                        )
+                    )
+                elif (
+                    ctx.parser.current_channel == "analysis"
+                    and ctx.parser.current_recipient is None
+                ):
+                    if not sent_output_item_added:
+                        sent_output_item_added = True
+                        yield _send_event(
+                            openai_responses_types.ResponseOutputItemAddedEvent(
+                                type="response.output_item.added",
+                                sequence_number=-1,
+                                output_index=current_output_index,
+                                item=openai_responses_types.ResponseReasoningItem(
+                                    type="reasoning",
+                                    id=current_item_id,
+                                    summary=[],
+                                    status="in_progress",
+                                ),
+                            )
+                        )
+                        yield _send_event(
+                            openai_responses_types.ResponseContentPartAddedEvent(
+                                type="response.content_part.added",
+                                sequence_number=-1,
+                                output_index=current_output_index,
+                                item_id=current_item_id,
+                                content_index=current_content_index,
+                                # TODO: migrate this to
+                                # ResponseReasoningTextContent for now
+                                part=openai_responses_types.ResponseOutputText(
+                                    type="output_text",
+                                    text="",
+                                    annotations=[],
+                                    logprobs=None,
+                                ),
+                            )
+                        )
+                    # TODO: migrate to OpenAI types once updated.
+                    yield _send_event(
+                        openai_responses_types.ResponseReasoningTextDeltaEvent(
+                            type="response.reasoning_text.delta",
+                            item_id=current_item_id,
+                            output_index=current_output_index,
+                            content_index=current_content_index,
+                            delta=ctx.parser.last_content_delta,
+                            sequence_number=-1,
+                        )
+                    )
+
+            if ctx.is_assistant_action_turn() and len(ctx.parser.messages) > 0:
+                previous_item = ctx.parser.messages[-1]
+                if (
+                    self.supports_browsing
+                    and previous_item.recipient is not None
+                    and previous_item.recipient.startswith("browser.")
+                ):
+                    function_name = previous_item.recipient[len("browser.") :]
+                    action = None
+                    parsed_args = orjson.loads(previous_item.content[0].text)
+                    if function_name == "search":
+                        action = openai_responses_types.response_function_web_search.ActionSearch(
+                            type="search",
+                            query=parsed_args["query"],
+                        )
+                    elif function_name == "open":
+                        action = openai_responses_types.response_function_web_search.ActionOpenPage(
+                            type="open_page",
+                            # TODO: translate to url
+                            url=f"cursor:{parsed_args.get('cursor', '')}",
+                        )
+                    elif function_name == "find":
+                        action = openai_responses_types.response_function_web_search.ActionFind(
+                            type="find",
+                            pattern=parsed_args["pattern"],
+                            # TODO: translate to url
+                            url=f"cursor:{parsed_args.get('cursor', '')}",
+                        )
+                    else:
+                        raise ValueError(f"Unknown function name: {function_name}")
+
+                    yield _send_event(
+                        openai_responses_types.ResponseOutputItemAddedEvent(
+                            type="response.output_item.added",
+                            sequence_number=-1,
+                            output_index=current_output_index,
+                            item=openai_responses_types.response_function_web_search.ResponseFunctionWebSearch(
+                                # TODO: generate a unique id for web search call
+                                type="web_search_call",
+                                id=current_item_id,
+                                action=action,
+                                status="in_progress",
+                            ),
+                        )
+                    )
+                    yield _send_event(
+                        openai_responses_types.ResponseWebSearchCallInProgressEvent(
+                            type="response.web_search_call.in_progress",
+                            sequence_number=-1,
+                            output_index=current_output_index,
+                            item_id=current_item_id,
+                        )
+                    )
+                    yield _send_event(
+                        openai_responses_types.ResponseWebSearchCallSearchingEvent(
+                            type="response.web_search_call.searching",
+                            sequence_number=-1,
+                            output_index=current_output_index,
+                            item_id=current_item_id,
+                        )
+                    )
+
+                    # enqueue
+                    yield _send_event(
+                        openai_responses_types.ResponseWebSearchCallCompletedEvent(
+                            type="response.web_search_call.completed",
+                            sequence_number=-1,
+                            output_index=current_output_index,
+                            item_id=current_item_id,
+                        )
+                    )
+                    yield _send_event(
+                        openai_responses_types.ResponseOutputItemDoneEvent(
+                            type="response.output_item.done",
+                            sequence_number=-1,
+                            output_index=current_output_index,
+                            item=openai_responses_types.ResponseFunctionWebSearch(
+                                type="web_search_call",
+                                id=current_item_id,
+                                action=action,
+                                status="completed",
+                            ),
+                        )
+                    )
+
+                if (
+                    self.supports_code_interpreter
+                    and previous_item.recipient is not None
+                    and previous_item.recipient.startswith("python")
+                ):
+                    yield _send_event(
+                        openai_responses_types.ResponseOutputItemAddedEvent(
+                            type="response.output_item.added",
+                            sequence_number=-1,
+                            output_index=current_output_index,
+                            item=openai_responses_types.ResponseCodeInterpreterToolCallParam(
+                                type="code_interpreter_call",
+                                id=current_item_id,
+                                code="",
+                                container_id="auto",
+                                outputs=[],
+                                status="in_progress",
+                            ),
+                        )
+                    )
+                    yield _send_event(
+                        openai_responses_types.ResponseCodeInterpreterCallInProgressEvent(
+                            type="response.code_interpreter_call.in_progress",
+                            sequence_number=-1,
+                            output_index=current_output_index,
+                            item_id=current_item_id,
+                        )
+                    )
+                    # TODO: do we need to add delta event here?
+                    yield _send_event(
+                        openai_responses_types.ResponseCodeInterpreterCallCodeDoneEvent(
+                            type="response.code_interpreter_call_code.done",
+                            sequence_number=-1,
+                            output_index=current_output_index,
+                            item_id=current_item_id,
+                            code=previous_item.content[0].text,
+                        )
+                    )
+                    yield _send_event(
+                        openai_responses_types.ResponseCodeInterpreterCallInterpretingEvent(
+                            type="response.code_interpreter_call.interpreting",
+                            sequence_number=-1,
+                            output_index=current_output_index,
+                            item_id=current_item_id,
+                        )
+                    )
+                    yield _send_event(
+                        openai_responses_types.ResponseCodeInterpreterCallCompletedEvent(
+                            type="response.code_interpreter_call.completed",
+                            sequence_number=-1,
+                            output_index=current_output_index,
+                            item_id=current_item_id,
+                        )
+                    )
+                    yield _send_event(
+                        openai_responses_types.ResponseOutputItemDoneEvent(
+                            type="response.output_item.done",
+                            sequence_number=-1,
+                            output_index=current_output_index,
+                            item=openai_responses_types.ResponseCodeInterpreterToolCallParam(
+                                type="code_interpreter_call",
+                                id=current_item_id,
+                                code=previous_item.content[0].text,
+                                container_id="auto",
+                                # TODO: add outputs here
+                                outputs=[],
+                                status="completed",
+                            ),
+                        )
+                    )
+
+        async def empty_async_generator():
+            if False:
+                yield
+
+        final_response = await self.responses_full_generator(
+            request,
+            sampling_params,
+            empty_async_generator(),
+            context,
+            model_name,
+            tokenizer,
+            request_metadata,
+            created_time=created_time,
+        )
+        # Convert final_response to the format expected by ResponseCompletedEvent
+        response_dict = final_response.model_dump()
+
+        # Convert UsageInfo to ResponseUsage format
+        if response_dict.get("usage"):
+            usage_info = response_dict["usage"]
+            response_dict["usage"] = {
+                "input_tokens": usage_info.get("prompt_tokens", 0),
+                "input_tokens_details": {
+                    "cached_tokens": usage_info.get("cached_tokens", 0)
+                },
+                "output_tokens": usage_info.get("completion_tokens", 0),
+                "output_tokens_details": {
+                    "reasoning_tokens": usage_info.get("reasoning_tokens", 0)
+                },
+                "total_tokens": usage_info.get("total_tokens", 0),
+            }
+
+        yield _send_event(
+            openai_responses_types.ResponseCompletedEvent(
+                type="response.completed",
+                sequence_number=-1,
+                response=response_dict,
+            )
+        )
+
+    async def _generate_with_builtin_tools(
+        self,
+        request_id: str,
+        request_prompt: Any,
+        adapted_request: GenerateReqInput,
+        sampling_params: Any,
+        context: ConversationContext,
+        raw_request: Optional[Request] = None,
+        priority: Optional[int] = None,
+        **kwargs,
+    ) -> AsyncGenerator[Any, None]:
+        """Generate with builtin tool support for harmony-based models."""
+        orig_priority = priority or 0
+
+        while True:
+            # Generate using SGLang's tokenizer manager
+            generator = self.tokenizer_manager.generate_request(
+                adapted_request, raw_request
+            )
+
+            async for res in generator:
+                context.append_output(res)
+                # NOTE(woosuk): The stop condition is handled by the engine.
+                yield context
+
+            if not context.need_builtin_tool_call():
+                # The model did not ask for a tool call, so we're done.
+                break
+
+            # Call the tool and update the context with the result.
+            tool_output = await context.call_tool()
+            context.append_output(tool_output)
+
+            # Prepare for the next generation turn
+            # Render the updated conversation for the next completion
+            prompt_token_ids = context.render_for_completion()
+
+            # Update the adapted request with new prompt
+            adapted_request = GenerateReqInput(
+                input_ids=prompt_token_ids,
+                sampling_params=sampling_params,
+                stream=adapted_request.stream,
+                rid=request_id,
+                extra_key=adapted_request.extra_key,
+                return_logprob=adapted_request.return_logprob,
+                logprob_start_len=adapted_request.logprob_start_len,
+                top_logprobs_num=adapted_request.top_logprobs_num,
+                return_text_in_logprobs=adapted_request.return_text_in_logprobs,
+                return_hidden_states=adapted_request.return_hidden_states,
+                background=adapted_request.background,
+            )
+
+            # Update sampling params with reduced max_tokens
+            if hasattr(sampling_params, "max_new_tokens") or isinstance(
+                sampling_params, dict
+            ):
+                context_len = getattr(
+                    self.tokenizer_manager.model_config, "context_len", 4096
+                )
+                remaining_tokens = context_len - len(prompt_token_ids) - 1
+
+                if isinstance(sampling_params, dict):
+                    sampling_params["max_new_tokens"] = max(remaining_tokens, 1)
+                else:
+                    sampling_params.max_new_tokens = max(remaining_tokens, 1)
+
+            # Slightly reduce priority for subsequent tool calls
+            priority = orig_priority - 1
diff --git a/sglang/python/sglang/srt/entrypoints/openai/serving_score.py b/sglang/python/sglang/srt/entrypoints/openai/serving_score.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9fb5f8c0005aa8dc7abc318b3c69190042c7bf3
--- /dev/null
+++ b/sglang/python/sglang/srt/entrypoints/openai/serving_score.py
@@ -0,0 +1,66 @@
+import logging
+from typing import Union
+
+from fastapi import Request
+
+from sglang.srt.entrypoints.openai.protocol import (
+    ErrorResponse,
+    ScoringRequest,
+    ScoringResponse,
+    UsageInfo,
+)
+from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
+
+logger = logging.getLogger(__name__)
+
+
+class OpenAIServingScore(OpenAIServingBase):
+    """Handler for /v1/score requests"""
+
+    # NOTE: /v1/rerank is not an official OpenAI endpoint. This module may be moved
+    # to another module in the future.
+
+    def _request_id_prefix(self) -> str:
+        return "score-"
+
+    def _convert_to_internal_request(
+        self,
+        request: ScoringRequest,
+        raw_request: Request = None,
+    ) -> tuple[ScoringRequest, ScoringRequest]:
+        """Convert OpenAI scoring request to internal format"""
+        # For scoring, we pass the request directly as the tokenizer_manager
+        # has a specialized score_request method that doesn't use GenerateReqInput
+
+        return request, request
+
+    async def _handle_non_streaming_request(
+        self,
+        adapted_request: ScoringRequest,
+        request: ScoringRequest,
+        raw_request: Request,
+    ) -> Union[ScoringResponse, ErrorResponse]:
+        """Handle the scoring request"""
+        try:
+            # Use tokenizer_manager's score_request method directly
+            result = await self.tokenizer_manager.score_request(
+                query=request.query,
+                items=request.items,
+                label_token_ids=request.label_token_ids,
+                apply_softmax=request.apply_softmax,
+                item_first=request.item_first,
+                request=raw_request,
+            )
+
+            response = ScoringResponse(
+                scores=result.scores,
+                model=request.model,
+                usage=UsageInfo(
+                    prompt_tokens=result.prompt_tokens,
+                    total_tokens=result.prompt_tokens,
+                ),
+            )
+            return response
+
+        except ValueError as e:
+            return self.create_error_response(str(e))
diff --git a/sglang/python/sglang/srt/entrypoints/openai/serving_tokenize.py b/sglang/python/sglang/srt/entrypoints/openai/serving_tokenize.py
new file mode 100644
index 0000000000000000000000000000000000000000..1bf6de97acd74588763d5a19817ad7761692c6cd
--- /dev/null
+++ b/sglang/python/sglang/srt/entrypoints/openai/serving_tokenize.py
@@ -0,0 +1,144 @@
+import logging
+from http import HTTPStatus
+from typing import List, Union
+
+from fastapi import Request
+
+from sglang.srt.entrypoints.openai.protocol import (
+    DetokenizeRequest,
+    DetokenizeResponse,
+    ErrorResponse,
+    TokenizeRequest,
+    TokenizeResponse,
+)
+from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
+
+logger = logging.getLogger(__name__)
+
+
+class OpenAIServingTokenize(OpenAIServingBase):
+    """Handler for /v1/tokenize requests"""
+
+    def _request_id_prefix(self) -> str:
+        return "tok-"
+
+    def _convert_to_internal_request(
+        self, request: TokenizeRequest, raw_request: Request
+    ) -> tuple[TokenizeRequest, TokenizeRequest]:
+        return request, request
+
+    async def _handle_non_streaming_request(
+        self,
+        adapted_request: TokenizeRequest,
+        request: TokenizeRequest,
+        raw_request: Request,
+    ) -> Union[TokenizeResponse, ErrorResponse]:
+        try:
+            tokenizer = self.tokenizer_manager.tokenizer
+            max_model_len = getattr(tokenizer, "model_max_length", -1)
+
+            if isinstance(request.prompt, str):
+                token_ids = tokenizer.encode(
+                    request.prompt,
+                    add_special_tokens=request.add_special_tokens,
+                )
+                tokens = token_ids
+                count = len(token_ids)
+            elif isinstance(request.prompt, list):
+                token_ids_list = [
+                    tokenizer.encode(
+                        text, add_special_tokens=request.add_special_tokens
+                    )
+                    for text in request.prompt
+                ]
+                tokens = token_ids_list
+                count = [len(ids) for ids in token_ids_list]
+            else:
+                return self.create_error_response(
+                    f"Invalid prompt type: {type(request.prompt)}. Expected str or List[str]."
+                )
+
+            return TokenizeResponse(
+                tokens=tokens, count=count, max_model_len=max_model_len
+            )
+        except Exception as e:
+            logger.error("Error during tokenization", exc_info=True)
+            return self.create_error_response(
+                f"Internal server error during tokenization: {e}",
+                err_type="InternalServerError",
+                status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
+            )
+
+
+class OpenAIServingDetokenize(OpenAIServingBase):
+    """Handler for /v1/detokenize requests"""
+
+    def _request_id_prefix(self) -> str:
+        return "detok-"
+
+    def _convert_to_internal_request(
+        self, request: DetokenizeRequest, raw_request: Request
+    ) -> tuple[DetokenizeRequest, DetokenizeRequest]:
+        return request, request
+
+    async def _handle_non_streaming_request(
+        self,
+        adapted_request: DetokenizeRequest,
+        request: DetokenizeRequest,
+        raw_request: Request,
+    ) -> Union[DetokenizeResponse, ErrorResponse]:
+        try:
+            tokenizer = self.tokenizer_manager.tokenizer
+
+            if (
+                isinstance(request.tokens, list)
+                and request.tokens
+                and isinstance(request.tokens[0], int)
+            ):
+                if not all(isinstance(t, int) for t in request.tokens):
+                    return self.create_error_response(
+                        "Invalid input: 'tokens' must be a list of integers."
+                    )
+                tokens_to_decode = [int(t) for t in request.tokens]
+                text = tokenizer.decode(
+                    tokens_to_decode, skip_special_tokens=request.skip_special_tokens
+                )
+                text_out: Union[str, List[str]] = text
+            elif (
+                isinstance(request.tokens, list)
+                and request.tokens
+                and isinstance(request.tokens[0], list)
+            ):
+                texts: List[str] = []
+                for token_list in request.tokens:
+                    if not all(isinstance(t, int) for t in token_list):
+                        return self.create_error_response(
+                            f"Invalid input: Sublist in 'tokens' must contain only integers. Found: {token_list}"
+                        )
+                    decoded_text = tokenizer.decode(
+                        [int(t) for t in token_list],
+                        skip_special_tokens=request.skip_special_tokens,
+                    )
+                    texts.append(decoded_text)
+                text_out = texts
+            elif isinstance(request.tokens, list) and not request.tokens:
+                text_out = ""
+            else:
+                return self.create_error_response(
+                    f"Invalid tokens type: {type(request.tokens)}. Expected List[int] or List[List[int]]."
+                )
+
+            return DetokenizeResponse(text=text_out)
+        except Exception as e:
+            logger.error("Error during detokenization", exc_info=True)
+            if "decode" in str(e).lower():
+                return self.create_error_response(
+                    f"Error decoding tokens: {e}. Input tokens might be invalid for the model.",
+                    err_type="DecodeError",
+                    status_code=HTTPStatus.BAD_REQUEST,
+                )
+            return self.create_error_response(
+                f"Internal server error during detokenization: {e}",
+                err_type="InternalServerError",
+                status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
+            )
diff --git a/sglang/python/sglang/srt/entrypoints/openai/serving_transcription.py b/sglang/python/sglang/srt/entrypoints/openai/serving_transcription.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b5661f4967a02c898f6eede2367b5318128f376
--- /dev/null
+++ b/sglang/python/sglang/srt/entrypoints/openai/serving_transcription.py
@@ -0,0 +1,221 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+OpenAI-compatible transcription endpoint handler for Whisper models.
+"""
+
+from __future__ import annotations
+
+import io
+import logging
+import math
+import time
+import uuid
+from typing import TYPE_CHECKING, AsyncGenerator, Optional, Union
+
+from fastapi import Request
+from fastapi.responses import ORJSONResponse, Response, StreamingResponse
+
+from sglang.srt.entrypoints.openai.protocol import (
+    DeltaMessage,
+    ErrorResponse,
+    TranscriptionRequest,
+    TranscriptionResponse,
+    TranscriptionStreamChoice,
+    TranscriptionStreamResponse,
+    TranscriptionUsage,
+)
+from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase
+from sglang.srt.managers.io_struct import GenerateReqInput
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
+logger = logging.getLogger(__name__)
+
+
+class OpenAIServingTranscription(OpenAIServingBase):
+    """Handler for /v1/audio/transcriptions requests"""
+
+    def __init__(self, tokenizer_manager: TokenizerManager):
+        super().__init__(tokenizer_manager)
+
+    def _request_id_prefix(self) -> str:
+        return "trsc-"
+
+    def _validate_request(self, request: TranscriptionRequest) -> Optional[str]:
+        """Validate transcription request."""
+        # Validation is done in the route handler for form data
+        return None
+
+    def _convert_to_internal_request(
+        self,
+        request: TranscriptionRequest,
+        raw_request: Request = None,
+    ) -> tuple[GenerateReqInput, TranscriptionRequest]:
+        """Convert transcription request to internal format."""
+        # Build sampling params - include language for WhisperProcessor
+        sampling_params = {
+            "temperature": request.temperature,
+            "max_new_tokens": 448,  # Whisper default max tokens
+            "language": request.language,  # Pass to WhisperProcessor for language-specific decoding
+        }
+
+        # For Whisper, we pass audio_data and let the processor handle it
+        adapted_request = GenerateReqInput(
+            text="",  # Empty text - Whisper processor will set proper decoder tokens
+            audio_data=request.audio_data,
+            sampling_params=sampling_params,
+            stream=request.stream,
+            modalities=["audio"],
+            routing_key=self.extract_routing_key(raw_request),
+        )
+
+        return adapted_request, request
+
+    def _get_audio_duration(self, audio_data: bytes) -> float:
+        """Calculate audio duration in seconds."""
+        try:
+            import soundfile as sf
+
+            audio_array, sr = sf.read(io.BytesIO(audio_data))
+            duration = len(audio_array) / sr
+            return duration
+        except Exception as e:
+            logger.warning(f"Could not calculate audio duration: {e}")
+            return 0.0
+
+    async def create_transcription(
+        self,
+        audio_data: bytes,
+        model: str,
+        language: Optional[str],
+        response_format: str,
+        temperature: float,
+        stream: bool,
+        raw_request: Request,
+    ) -> Union[TranscriptionResponse, StreamingResponse, Response, ORJSONResponse]:
+        """Main entry point for transcription requests."""
+        # Calculate audio duration for usage reporting
+        audio_duration_s = self._get_audio_duration(audio_data)
+
+        # Build request
+        request = TranscriptionRequest(
+            audio_data=audio_data,
+            model=model,
+            language=language,
+            response_format=response_format,
+            temperature=temperature,
+            stream=stream,
+            audio_duration_s=audio_duration_s,
+        )
+
+        # Use the base class handle_request pattern
+        return await self.handle_request(request, raw_request)
+
+    async def _handle_non_streaming_request(
+        self,
+        adapted_request: GenerateReqInput,
+        request: TranscriptionRequest,
+        raw_request: Request,
+    ) -> Union[TranscriptionResponse, ErrorResponse, ORJSONResponse, Response]:
+        """Handle non-streaming transcription request."""
+        try:
+            ret = await self.tokenizer_manager.generate_request(
+                adapted_request, raw_request
+            ).__anext__()
+        except ValueError as e:
+            return self.create_error_response(str(e))
+
+        text = ret.get("text", "")
+
+        # Build response based on format
+        if request.response_format == "text":
+            return Response(content=text, media_type="text/plain")
+
+        # JSON format
+        usage = TranscriptionUsage(seconds=int(math.ceil(request.audio_duration_s)))
+
+        return TranscriptionResponse(text=text, usage=usage)
+
+    async def _handle_streaming_request(
+        self,
+        adapted_request: GenerateReqInput,
+        request: TranscriptionRequest,
+        raw_request: Request,
+    ) -> StreamingResponse:
+        """Handle streaming transcription request."""
+        return StreamingResponse(
+            self._generate_transcription_stream(adapted_request, request, raw_request),
+            media_type="text/event-stream",
+            background=self.tokenizer_manager.create_abort_task(adapted_request),
+        )
+
+    async def _generate_transcription_stream(
+        self,
+        adapted_request: GenerateReqInput,
+        request: TranscriptionRequest,
+        raw_request: Request,
+    ) -> AsyncGenerator[str, None]:
+        """Generate streaming transcription response."""
+        created_time = int(time.time())
+        request_id = f"{self._request_id_prefix()}{uuid.uuid4().hex}"
+        model = request.model
+        stream_buffer = ""
+
+        try:
+            async for content in self.tokenizer_manager.generate_request(
+                adapted_request, raw_request
+            ):
+                finish_reason = content["meta_info"]["finish_reason"]
+                finish_reason_type = finish_reason["type"] if finish_reason else None
+
+                # Calculate delta (new text since last chunk)
+                current_text = content.get("text", "")
+                delta = current_text[len(stream_buffer) :]
+                stream_buffer = current_text
+
+                # Send content delta if there's new text
+                if delta:
+                    choice_data = TranscriptionStreamChoice(
+                        delta=DeltaMessage(content=delta),
+                        finish_reason=None,
+                    )
+                    chunk = TranscriptionStreamResponse(
+                        id=request_id,
+                        created=created_time,
+                        model=model,
+                        choices=[choice_data],
+                    )
+                    yield f"data: {chunk.model_dump_json()}\n\n"
+
+                # Send finish reason when done
+                if finish_reason_type:
+                    choice_data = TranscriptionStreamChoice(
+                        delta=DeltaMessage(),
+                        finish_reason=finish_reason_type,
+                    )
+                    chunk = TranscriptionStreamResponse(
+                        id=request_id,
+                        created=created_time,
+                        model=model,
+                        choices=[choice_data],
+                    )
+                    yield f"data: {chunk.model_dump_json()}\n\n"
+
+        except ValueError as e:
+            error = self.create_streaming_error_response(str(e))
+            yield f"data: {error}\n\n"
+
+        yield "data: [DONE]\n\n"
diff --git a/sglang/python/sglang/srt/entrypoints/openai/tool_server.py b/sglang/python/sglang/srt/entrypoints/openai/tool_server.py
new file mode 100644
index 0000000000000000000000000000000000000000..269d9e99eae491d1c310536ac095cae596a5163c
--- /dev/null
+++ b/sglang/python/sglang/srt/entrypoints/openai/tool_server.py
@@ -0,0 +1,175 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import logging
+from abc import ABC, abstractmethod
+from contextlib import AbstractAsyncContextManager, asynccontextmanager
+from typing import Any
+
+try:
+    from mcp import ClientSession
+    from mcp.client.sse import sse_client
+    from mcp.types import ListToolsResult
+except ImportError as e:
+    ClientSession = sse_client = ListToolsResult = e
+
+from openai_harmony import ToolDescription, ToolNamespaceConfig
+
+logger = logging.getLogger(__name__)
+
+
+async def list_server_and_tools(server_url: str):
+
+    async with sse_client(url=server_url) as streams, ClientSession(
+        *streams
+    ) as session:
+        initialize_response = await session.initialize()
+        list_tools_response = await session.list_tools()
+        return initialize_response, list_tools_response
+
+
+def trim_schema(schema: dict) -> dict:
+    # Turn JSON Schema from MCP generated into Harmony's variant.
+    if "title" in schema:
+        del schema["title"]
+    if "default" in schema and schema["default"] is None:
+        del schema["default"]
+    if "anyOf" in schema:
+        # Turn "anyOf": [{"type": "type-1"}, {"type": "type-2"}]
+        # into "type": ["type-1", "type-2"]
+        # if there's more than 1 types, also remove "null" type as Harmony will
+        # just ignore it
+        types = [
+            type_dict["type"]
+            for type_dict in schema["anyOf"]
+            if type_dict["type"] != "null"
+        ]
+        schema["type"] = types
+        del schema["anyOf"]
+    if "properties" in schema:
+        schema["properties"] = {
+            k: trim_schema(v) for k, v in schema["properties"].items()
+        }
+    return schema
+
+
+def post_process_tools_description(
+    list_tools_result: "ListToolsResult",
+) -> "ListToolsResult":
+    # Adapt the MCP tool result for Harmony
+    for tool in list_tools_result.tools:
+        tool.inputSchema = trim_schema(tool.inputSchema)
+
+    # Some tools schema don't need to be part of the prompt (e.g. simple text
+    # in text out for Python)
+    list_tools_result.tools = [
+        tool
+        for tool in list_tools_result.tools
+        if getattr(tool.annotations, "include_in_prompt", True)
+    ]
+
+    return list_tools_result
+
+
+class ToolServer(ABC):
+
+    @abstractmethod
+    def has_tool(self, tool_name: str):
+        pass
+
+    @abstractmethod
+    def get_tool_description(self, tool_name: str):
+        pass
+
+    @abstractmethod
+    def get_tool_session(self, tool_name: str) -> AbstractAsyncContextManager[Any]: ...
+
+
+class MCPToolServer(ToolServer):
+
+    def __init__(self):
+        self.harmony_tool_descriptions = {}
+
+    async def add_tool_server(self, server_url: str):
+        tool_urls = server_url.split(",")
+        self.harmony_tool_descriptions = {}
+        self.urls: dict[str, str] = {}
+        for url in tool_urls:
+            url = f"http://{url}/sse"
+            initialize_response, list_tools_response = await list_server_and_tools(url)
+
+            list_tools_response = post_process_tools_description(list_tools_response)
+
+            tool_from_mcp = ToolNamespaceConfig(
+                name=initialize_response.serverInfo.name,
+                description=initialize_response.instructions,
+                tools=[
+                    ToolDescription.new(
+                        name=tool.name,
+                        description=tool.description,
+                        parameters=tool.inputSchema,
+                    )
+                    for tool in list_tools_response.tools
+                ],
+            )
+            self.harmony_tool_descriptions[tool_from_mcp.name] = tool_from_mcp
+            if tool_from_mcp.name not in self.urls:
+                self.urls[tool_from_mcp.name] = url
+            else:
+                logger.warning(
+                    "Tool %s already exists. Ignoring duplicate tool server %s",
+                    tool_from_mcp.name,
+                    url,
+                )
+
+    def has_tool(self, tool_name: str):
+        return tool_name in self.harmony_tool_descriptions
+
+    def get_tool_description(self, tool_name: str):
+        return self.harmony_tool_descriptions.get(tool_name)
+
+    @asynccontextmanager
+    async def get_tool_session(self, tool_name: str):
+        url = self.urls.get(tool_name)
+        if url:
+            async with sse_client(url=url) as streams, ClientSession(
+                *streams
+            ) as session:
+                await session.initialize()
+                yield session
+        else:
+            logger.warning("Tool %s not found", tool_name)
+
+
+class DemoToolServer(ToolServer):
+
+    def __init__(self):
+        from sglang.srt.entrypoints.tool import (
+            HarmonyBrowserTool,
+            HarmonyPythonTool,
+            Tool,
+        )
+
+        self.tools: dict[str, Tool] = {}
+        browser_tool = HarmonyBrowserTool()
+        if browser_tool.enabled:
+            self.tools["browser"] = browser_tool
+        python_tool = HarmonyPythonTool()
+        if python_tool.enabled:
+            self.tools["python"] = python_tool
+
+    def has_tool(self, tool_name: str):
+        return tool_name in self.tools
+
+    def get_tool_description(self, tool_name: str):
+        if tool_name not in self.tools:
+            return None
+        if tool_name == "browser":
+            return ToolNamespaceConfig.browser()
+        elif tool_name == "python":
+            return ToolNamespaceConfig.python()
+        else:
+            raise ValueError(f"Unknown tool {tool_name}")
+
+    @asynccontextmanager
+    async def get_tool_session(self, tool_name: str):
+        yield self.tools[tool_name]
diff --git a/sglang/python/sglang/srt/entrypoints/openai/usage_processor.py b/sglang/python/sglang/srt/entrypoints/openai/usage_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..de88e9eb001caeec33ba58ddfe6cc91839b6e434
--- /dev/null
+++ b/sglang/python/sglang/srt/entrypoints/openai/usage_processor.py
@@ -0,0 +1,86 @@
+from __future__ import annotations
+
+from typing import Any, Dict, List, Mapping, Optional, final
+
+from sglang.srt.entrypoints.openai.protocol import PromptTokensDetails, UsageInfo
+
+
+@final
+class UsageProcessor:
+    """Stateless helpers that turn raw token counts into a UsageInfo."""
+
+    @staticmethod
+    def _details_if_cached(count: int) -> Optional[PromptTokensDetails]:
+        """Return PromptTokensDetails only when count > 0 (keeps JSON slim)."""
+        return PromptTokensDetails(cached_tokens=count) if count > 0 else None
+
+    @staticmethod
+    def calculate_response_usage(
+        responses: List[Dict[str, Any]],
+        n_choices: int = 1,
+        enable_cache_report: bool = False,
+    ) -> UsageInfo:
+        completion_tokens = sum(
+            r["meta_info"].get("completion_tokens", 0) for r in responses
+        )
+
+        prompt_tokens = sum(
+            responses[i]["meta_info"].get("prompt_tokens", 0)
+            for i in range(0, len(responses), n_choices)
+        )
+
+        cached_details = None
+        if enable_cache_report:
+            cached_total = sum(
+                responses[i]["meta_info"].get("cached_tokens", 0)
+                for i in range(0, len(responses), n_choices)
+            )
+            cached_details = UsageProcessor._details_if_cached(cached_total)
+
+        return UsageProcessor.calculate_token_usage(
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            cached_tokens=cached_details,
+        )
+
+    @staticmethod
+    def calculate_streaming_usage(
+        prompt_tokens: Mapping[int, int],
+        completion_tokens: Mapping[int, int],
+        cached_tokens: Mapping[int, int],
+        n_choices: int,
+        enable_cache_report: bool = False,
+    ) -> UsageInfo:
+        # index % n_choices == 0 marks the first choice of a prompt
+        total_prompt_tokens = sum(
+            tok for idx, tok in prompt_tokens.items() if idx % n_choices == 0
+        )
+        total_completion_tokens = sum(completion_tokens.values())
+
+        cached_details = (
+            UsageProcessor._details_if_cached(
+                sum(tok for idx, tok in cached_tokens.items() if idx % n_choices == 0)
+            )
+            if enable_cache_report
+            else None
+        )
+
+        return UsageProcessor.calculate_token_usage(
+            prompt_tokens=total_prompt_tokens,
+            completion_tokens=total_completion_tokens,
+            cached_tokens=cached_details,
+        )
+
+    @staticmethod
+    def calculate_token_usage(
+        prompt_tokens: int,
+        completion_tokens: int,
+        cached_tokens: Optional[PromptTokensDetails] = None,
+    ) -> UsageInfo:
+        """Calculate token usage information"""
+        return UsageInfo(
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+            total_tokens=prompt_tokens + completion_tokens,
+            prompt_tokens_details=cached_tokens,
+        )
diff --git a/sglang/python/sglang/srt/entrypoints/openai/utils.py b/sglang/python/sglang/srt/entrypoints/openai/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..796f8f59b357d8d97f051860b6a8a3d308a2c70b
--- /dev/null
+++ b/sglang/python/sglang/srt/entrypoints/openai/utils.py
@@ -0,0 +1,116 @@
+import logging
+from typing import Any, Dict, List, Optional, Union
+
+from sglang.srt.entrypoints.openai.protocol import (
+    CachedTokensDetails,
+    ChatCompletionRequest,
+    CompletionRequest,
+    LogProbs,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def to_openai_style_logprobs(
+    input_token_logprobs=None,
+    output_token_logprobs=None,
+    input_top_logprobs=None,
+    output_top_logprobs=None,
+):
+    ret_logprobs = LogProbs()
+
+    def append_token_logprobs(token_logprobs):
+        for logprob, _, token_text in token_logprobs:
+            ret_logprobs.tokens.append(token_text)
+            ret_logprobs.token_logprobs.append(logprob)
+
+            # Not supported yet
+            ret_logprobs.text_offset.append(-1)
+
+    def append_top_logprobs(top_logprobs):
+        for tokens in top_logprobs:
+            if tokens is not None:
+                ret_logprobs.top_logprobs.append(
+                    {token[2]: token[0] for token in tokens}
+                )
+            else:
+                ret_logprobs.top_logprobs.append(None)
+
+    if input_token_logprobs is not None:
+        append_token_logprobs(input_token_logprobs)
+    if output_token_logprobs is not None:
+        append_token_logprobs(output_token_logprobs)
+    if input_top_logprobs is not None:
+        append_top_logprobs(input_top_logprobs)
+    if output_top_logprobs is not None:
+        append_top_logprobs(output_top_logprobs)
+
+    return ret_logprobs
+
+
+def process_hidden_states_from_ret(
+    ret_item: Dict[str, Any],
+    request: Union[
+        ChatCompletionRequest,
+        CompletionRequest,
+    ],
+) -> Optional[List]:
+    """Process hidden states from a ret item in non-streaming response.
+
+    Args:
+        ret_item: Response item containing meta_info
+        request: The original request object
+
+    Returns:
+        Processed hidden states for the last token, or None
+    """
+    if not request.return_hidden_states:
+        return None
+
+    hidden_states = ret_item["meta_info"].get("hidden_states", None)
+    if hidden_states is not None:
+        hidden_states = hidden_states[-1] if len(hidden_states) > 1 else []
+    return hidden_states
+
+
+def process_routed_experts_from_ret(
+    ret_item: Dict[str, Any],
+    request: Union[
+        ChatCompletionRequest,
+        CompletionRequest,
+    ],
+) -> Optional[str]:
+    """Process routed experts from a ret item in non-streaming response."""
+    if not getattr(request, "return_routed_experts", False):
+        return None
+    return ret_item["meta_info"].get("routed_experts", None)
+
+
+def process_cached_tokens_details_from_ret(
+    ret_item: Dict[str, Any],
+    request: Union[
+        ChatCompletionRequest,
+        CompletionRequest,
+    ],
+) -> Optional[CachedTokensDetails]:
+    """Process cached tokens details from a ret item in non-streaming response."""
+    if not getattr(request, "return_cached_tokens_details", False):
+        return None
+
+    details = ret_item["meta_info"].get("cached_tokens_details", None)
+    if details is None:
+        return None
+
+    # Check if L3 storage fields are present
+    if "storage" in details:
+        return CachedTokensDetails(
+            device=details.get("device", 0),
+            host=details.get("host", 0),
+            storage=details.get("storage", 0),
+            storage_backend=details.get("storage_backend"),
+        )
+    else:
+        return CachedTokensDetails(
+            device=details.get("device", 0),
+            host=details.get("host", 0),
+        )
diff --git a/sglang/python/sglang/srt/entrypoints/ssl_utils.py b/sglang/python/sglang/srt/entrypoints/ssl_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..87e2b16f2806b19a9dc1eda9529094e3c3815643
--- /dev/null
+++ b/sglang/python/sglang/srt/entrypoints/ssl_utils.py
@@ -0,0 +1,88 @@
+"""Utilities for SSL certificate hot-reloading."""
+
+import asyncio
+import logging
+import ssl
+from typing import Optional
+
+from watchfiles import awatch
+
+logger = logging.getLogger(__name__)
+
+
+class SSLCertRefresher:
+    """Monitors SSL certificate files and reloads them when changed.
+
+    Uses ``watchfiles.awatch()`` for efficient inotify/kqueue-based
+    file monitoring.  On change the referenced :class:`ssl.SSLContext`
+    is updated in-place so that new TLS connections automatically pick
+    up the fresh certificates.
+    """
+
+    def __init__(
+        self,
+        ssl_context: ssl.SSLContext,
+        key_path: str,
+        cert_path: str,
+        ca_path: Optional[str] = None,
+    ) -> None:
+        self._ssl_context = ssl_context
+        self._key_path = key_path
+        self._cert_path = cert_path
+        self._ca_path = ca_path
+        self._tasks: list[asyncio.Task] = []
+
+        loop = asyncio.get_running_loop()
+        self._tasks.append(
+            loop.create_task(self._watch_cert_key(), name="ssl-cert-key-watcher")
+        )
+        if self._ca_path:
+            self._tasks.append(
+                loop.create_task(self._watch_ca(), name="ssl-ca-watcher")
+            )
+
+    async def _watch_cert_key(self) -> None:
+        """Watch cert and key files and reload on change."""
+        try:
+            async for _changes in awatch(self._cert_path, self._key_path):
+                logger.info(
+                    "SSL cert/key file change detected, reloading: " "cert=%s key=%s",
+                    self._cert_path,
+                    self._key_path,
+                )
+                try:
+                    self._ssl_context.load_cert_chain(self._cert_path, self._key_path)
+                    logger.info("SSL cert/key reloaded successfully.")
+                except Exception:
+                    logger.exception(
+                        "Failed to reload SSL cert/key — continuing with "
+                        "previous certificates."
+                    )
+        except asyncio.CancelledError:
+            return
+
+    async def _watch_ca(self) -> None:
+        """Watch CA file and reload on change."""
+        assert self._ca_path is not None
+        try:
+            async for _changes in awatch(self._ca_path):
+                logger.info(
+                    "SSL CA file change detected, reloading: ca=%s",
+                    self._ca_path,
+                )
+                try:
+                    self._ssl_context.load_verify_locations(self._ca_path)
+                    logger.info("SSL CA certificates reloaded successfully.")
+                except Exception:
+                    logger.exception(
+                        "Failed to reload SSL CA certificates — continuing "
+                        "with previous CA bundle."
+                    )
+        except asyncio.CancelledError:
+            return
+
+    def stop(self) -> None:
+        """Cancel all watching tasks."""
+        for task in self._tasks:
+            task.cancel()
+        self._tasks.clear()
diff --git a/sglang/python/sglang/srt/entrypoints/tool.py b/sglang/python/sglang/srt/entrypoints/tool.py
new file mode 100644
index 0000000000000000000000000000000000000000..45b87ac3aca821731f6a56dec36df3885b08455b
--- /dev/null
+++ b/sglang/python/sglang/srt/entrypoints/tool.py
@@ -0,0 +1,87 @@
+# SPDX-License-Identifier: Apache-2.0
+import logging
+import os
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, Any
+
+from sglang.srt.utils import print_info_once, print_warning_once
+
+if TYPE_CHECKING:
+    # Avoid circular import.
+    from sglang.srt.entrypoints.context import ConversationContext
+
+logger = logging.getLogger(__name__)
+
+
+class Tool(ABC):
+
+    @abstractmethod
+    async def get_result(self, context: "ConversationContext") -> Any:
+        pass
+
+
+class HarmonyBrowserTool(Tool):
+
+    def __init__(self):
+        self.enabled = True
+        exa_api_key = os.getenv("EXA_API_KEY")
+        if not exa_api_key:
+            self.enabled = False
+            print_warning_once("EXA_API_KEY is not set, browsing is disabled")
+            return
+
+        try:
+            from gpt_oss.tools.simple_browser import SimpleBrowserTool
+            from gpt_oss.tools.simple_browser.backend import ExaBackend
+        except ImportError:
+            self.enabled = False
+            print_warning_once("gpt_oss is not installed, browsing is disabled")
+            return
+
+        browser_backend = ExaBackend(source="web", api_key=exa_api_key)
+        self.browser_tool = SimpleBrowserTool(backend=browser_backend)
+        print_info_once("Browser tool initialized")
+
+    async def get_result(self, context: "ConversationContext") -> Any:
+        from sglang.srt.entrypoints.context import HarmonyContext
+
+        assert isinstance(context, HarmonyContext)
+        last_msg = context.messages[-1]
+        tool_output_msgs = []
+        async for msg in self.browser_tool.process(last_msg):
+            tool_output_msgs.append(msg)
+        return tool_output_msgs
+
+    @property
+    def tool_config(self) -> Any:
+        return self.browser_tool.tool_config
+
+
+class HarmonyPythonTool(Tool):
+
+    def __init__(self):
+        self.enabled = True
+
+        try:
+            from gpt_oss.tools.python_docker.docker_tool import PythonTool
+        except ImportError:
+            self.enabled = False
+            print_warning_once("gpt_oss is not installed, code interpreter is disabled")
+            return
+
+        self.python_tool = PythonTool()
+        print_info_once("Code interpreter tool initialized")
+
+    async def get_result(self, context: "ConversationContext") -> Any:
+        from sglang.srt.entrypoints.context import HarmonyContext
+
+        assert isinstance(context, HarmonyContext)
+        last_msg = context.messages[-1]
+        tool_output_msgs = []
+        async for msg in self.python_tool.process(last_msg):
+            tool_output_msgs.append(msg)
+        return tool_output_msgs
+
+    @property
+    def tool_config(self) -> Any:
+        return self.python_tool.tool_config
diff --git a/sglang/python/sglang/srt/entrypoints/v1_loads.py b/sglang/python/sglang/srt/entrypoints/v1_loads.py
new file mode 100644
index 0000000000000000000000000000000000000000..7844170102ac99862afbdb3176f0c9119efd78ec
--- /dev/null
+++ b/sglang/python/sglang/srt/entrypoints/v1_loads.py
@@ -0,0 +1,175 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+/v1/loads API endpoint for comprehensive load metrics.
+
+This module provides the /v1/loads endpoint which returns detailed scheduler
+metrics for load balancing, monitoring, and capacity planning.
+"""
+
+import dataclasses
+from datetime import datetime, timezone
+from typing import Optional
+
+from fastapi import APIRouter, Depends, HTTPException
+from fastapi.responses import Response
+
+from sglang.srt.managers.io_struct import (
+    DisaggregationMetrics,
+    GetLoadsReqOutput,
+    LoRAMetrics,
+    MemoryMetrics,
+    QueueMetrics,
+    SpeculativeMetrics,
+)
+from sglang.version import __version__
+
+router = APIRouter()
+
+_OPTIONAL_METRIC_SECTIONS = {
+    "memory": ("memory", MemoryMetrics),
+    "speculative": ("spec", SpeculativeMetrics),
+    "lora": ("lora", LoRAMetrics),
+    "disaggregation": ("disagg", DisaggregationMetrics),
+    "queues": ("queues", QueueMetrics),
+}
+
+
+def _get_tokenizer_manager():
+    """Dependency to get tokenizer_manager from global state."""
+    from sglang.srt.entrypoints.http_server import get_global_state
+
+    return get_global_state().tokenizer_manager
+
+
+def _loads_dict_factory(items):
+    """Factory for dataclasses.asdict() that excludes None values and timestamp."""
+    return {k: v for k, v in items if v is not None and k != "timestamp"}
+
+
+def _compute_aggregate(load_dicts: list) -> dict:
+    """Compute aggregate metrics from load dicts."""
+    if not load_dicts:
+        return {
+            "total_running_reqs": 0,
+            "total_waiting_reqs": 0,
+            "total_reqs": 0,
+            "avg_token_usage": 0.0,
+            "avg_throughput": 0.0,
+            "avg_utilization": 0.0,
+        }
+
+    n = len(load_dicts)
+    return {
+        "total_running_reqs": sum(d["num_running_reqs"] for d in load_dicts),
+        "total_waiting_reqs": sum(d["num_waiting_reqs"] for d in load_dicts),
+        "total_reqs": sum(
+            d["num_running_reqs"] + d["num_waiting_reqs"] for d in load_dicts
+        ),
+        "avg_token_usage": round(sum(d["token_usage"] for d in load_dicts) / n, 4),
+        "avg_throughput": round(sum(d["gen_throughput"] for d in load_dicts) / n, 2),
+        "avg_utilization": round(sum(d["utilization"] for d in load_dicts) / n, 4),
+    }
+
+
+def _format_loads_prometheus(load_results) -> Response:
+    """Format load metrics in Prometheus text exposition format.
+
+    Metrics are derived from dataclass field metadata, providing a single source of truth.
+    """
+    lines = []
+
+    for f in dataclasses.fields(GetLoadsReqOutput):
+        if "metric" not in f.metadata:
+            continue
+        metric_type, description = f.metadata["metric"]
+        metric_name = f"sglang_{f.name}"
+        lines.append(f"# HELP {metric_name} {description}")
+        lines.append(f"# TYPE {metric_name} {metric_type}")
+        for load in load_results:
+            value = getattr(load, f.name, None)
+            if value is not None:
+                lines.append(f'{metric_name}{{dp_rank="{load.dp_rank}"}} {value}')
+
+    for attr_name, (prefix, dataclass_type) in _OPTIONAL_METRIC_SECTIONS.items():
+        if not any(getattr(load, attr_name, None) for load in load_results):
+            continue
+        for f in dataclasses.fields(dataclass_type):
+            if "metric" not in f.metadata:
+                continue
+            metric_type, description = f.metadata["metric"]
+            metric_name = f"sglang_{prefix}_{f.name}"
+            lines.append(f"# HELP {metric_name} {description}")
+            lines.append(f"# TYPE {metric_name} {metric_type}")
+            for load in load_results:
+                section = getattr(load, attr_name, None)
+                if section:
+                    value = getattr(section, f.name, None)
+                    if value is not None:
+                        lines.append(
+                            f'{metric_name}{{dp_rank="{load.dp_rank}"}} {value}'
+                        )
+
+    return Response(
+        content="\n".join(lines) + "\n",
+        media_type="text/plain; version=0.0.4; charset=utf-8",
+    )
+
+
+@router.get("/v1/loads")
+async def get_loads(
+    dp_rank: Optional[int] = None,
+    include: Optional[str] = None,
+    format: Optional[str] = None,
+    tokenizer_manager=Depends(_get_tokenizer_manager),
+):
+    """
+    Get comprehensive load metrics for all DP ranks.
+
+    Query Parameters:
+        dp_rank: Filter to specific DP rank (optional)
+        include: Comma-separated sections to include (optional)
+                 Options: core, memory, spec, lora, disagg, queues, all
+                 Default: all
+        format: Response format - 'json' (default) or 'prometheus'
+
+    Returns:
+        JSON response with timestamp, version, dp_rank_count, per-DP-rank loads, and aggregates
+    """
+    include_list = [s.strip() for s in include.split(",")] if include else None
+
+    try:
+        load_results = await tokenizer_manager.get_loads(
+            include=include_list,
+            dp_rank=dp_rank,
+        )
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=str(e))
+
+    if format == "prometheus":
+        return _format_loads_prometheus(load_results)
+
+    loads = []
+    for load in load_results:
+        d = dataclasses.asdict(load, dict_factory=_loads_dict_factory)
+        d["num_total_reqs"] = d["num_running_reqs"] + d["num_waiting_reqs"]
+        loads.append(d)
+
+    return {
+        "timestamp": datetime.now(timezone.utc).isoformat(),
+        "version": __version__,
+        "dp_rank_count": len(loads),
+        "loads": loads,
+        "aggregate": _compute_aggregate(loads),
+    }
diff --git a/sglang/python/sglang/srt/entrypoints/warmup.py b/sglang/python/sglang/srt/entrypoints/warmup.py
new file mode 100644
index 0000000000000000000000000000000000000000..afba03006a56e98a21fbcac5757c358d4e7187e0
--- /dev/null
+++ b/sglang/python/sglang/srt/entrypoints/warmup.py
@@ -0,0 +1,60 @@
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, List
+
+import numpy as np
+import tqdm
+
+from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST
+from sglang.srt.managers.io_struct import GenerateReqInput
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
+logger = logging.getLogger(__file__)
+
+_warmup_registry = {}
+
+
+def warmup(name: str):
+    def decorator(fn):
+        _warmup_registry[name] = fn
+        return fn
+
+    return decorator
+
+
+async def execute_warmups(
+    disaggregation_mode: str,
+    warmup_names: List[str],
+    tokenizer_manager: TokenizerManager,
+):
+    for warmup_name in warmup_names:
+        if warmup_name not in _warmup_registry:
+            logger.warning(f"Could not find custom warmup {warmup_name}")
+            continue
+        logger.info(f"Running warmup {warmup_name}")
+        await _warmup_registry[warmup_name](disaggregation_mode, tokenizer_manager)
+
+
+@warmup("voice_chat")
+async def voice_chat(disaggregation_mode: str, tokenizer_manager: TokenizerManager):
+    # this warms up the fused_moe triton kernels and caches them
+    # if we don't do this we break real time inference for voice chat
+    for i in tqdm.trange(1, 512):
+        size = i * 4
+        generate_req_input = GenerateReqInput(
+            input_ids=(np.random.randint(2**16, size=[size])).tolist(),
+            sampling_params={
+                "max_new_tokens": 30,
+                "temperature": 0.8,
+                "stop_token_ids": [1],
+                "min_p": 0.0,
+            },
+        )
+        if disaggregation_mode != "null":
+            generate_req_input.bootstrap_room = 0
+            generate_req_input.bootstrap_host = FAKE_BOOTSTRAP_HOST
+
+        await tokenizer_manager.generate_request(generate_req_input, None).__anext__()
diff --git a/sglang/python/sglang/srt/eplb/__init__.py b/sglang/python/sglang/srt/eplb/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/sglang/python/sglang/srt/eplb/eplb_manager.py b/sglang/python/sglang/srt/eplb/eplb_manager.py
new file mode 100644
index 0000000000000000000000000000000000000000..e88a3d28e0f3960db410aa9f798d50d9b51de585
--- /dev/null
+++ b/sglang/python/sglang/srt/eplb/eplb_manager.py
@@ -0,0 +1,118 @@
+import logging
+import time
+from typing import TYPE_CHECKING, List
+
+import torch.cuda
+
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.eplb.expert_location import ExpertLocationMetadata
+
+if TYPE_CHECKING:
+    from sglang.srt.model_executor.model_runner import ModelRunner
+
+logger = logging.getLogger(__name__)
+
+
+class EPLBManager:
+    def __init__(self, model_runner: "ModelRunner"):
+        super().__init__()
+        self._model_runner = model_runner
+        self._server_args = model_runner.server_args
+        self._rebalance_layers_per_chunk = (
+            self._server_args.eplb_rebalance_layers_per_chunk
+        )
+        self._rebalance_num_iterations = self._server_args.eplb_rebalance_num_iterations
+
+        # Otherwise, the circular buffer will contain stale data. If the case is needed, it can be implemented.
+        assert (
+            self._server_args.eplb_rebalance_num_iterations
+            >= self._server_args.expert_distribution_recorder_buffer_size
+        ), "eplb_rebalance_num_iterations must be greater than expert_distribution_recorder_buffer_size"
+
+        if not get_global_expert_distribution_recorder().recording:
+            get_global_expert_distribution_recorder().start_record()
+
+        logger.info(
+            f"[EPLBManager] system started, will rebalance per {self._rebalance_num_iterations} iterations."
+        )
+
+        self._main_generator = self._entrypoint()
+
+    def on_forward_pass_end(self):
+        next(self._main_generator)
+
+    # can be more complex if needed
+    def _entrypoint(self):
+        while True:
+            for _ in range(self._rebalance_num_iterations):
+                yield
+
+            yield from self.rebalance()
+
+    def rebalance(self):
+        logger.info("[EPLBManager] rebalance start")
+
+        enable_timing = self._rebalance_layers_per_chunk is None
+
+        if enable_timing:
+            torch.get_device_module().synchronize()
+            time_start = time.time()
+
+        dump_record_output = get_global_expert_distribution_recorder().dump_record(
+            output_mode="object"
+        )
+        logical_count = dump_record_output["logical_count"]
+        average_utilization_rate_over_window = dump_record_output[
+            "average_utilization_rate_over_window"
+        ]
+
+        # Check whether rebalancing is needed
+        if not self._check_rebalance_needed(average_utilization_rate_over_window):
+            return
+
+        expert_location_metadata = ExpertLocationMetadata.init_by_eplb(
+            self._server_args, self._model_runner.model_config, logical_count
+        )
+
+        update_layer_ids_chunks = self._compute_update_layer_ids_chunks()
+        for chunk_index, update_layer_ids in enumerate(update_layer_ids_chunks):
+            if len(update_layer_ids_chunks) > 1:
+                yield
+            self._model_runner.update_expert_location(
+                expert_location_metadata,
+                update_layer_ids=update_layer_ids,
+            )
+
+        msg = f"[EPLBManager] rebalance end"
+        if enable_timing:
+            torch.get_device_module().synchronize()
+            time_end = time.time()
+            msg += f" time={time_end - time_start:.3f}s"
+        logger.info(msg)
+
+    def _check_rebalance_needed(self, average_utilization_rate_over_window):
+        if average_utilization_rate_over_window is None:
+            return True
+
+        if (
+            average_utilization_rate_over_window
+            > self._server_args.eplb_min_rebalancing_utilization_threshold
+        ):
+            logger.info(
+                f"[EPLBManager] Skipped ep rebalancing: current GPU utilization {average_utilization_rate_over_window:.2f} > minimum rebalance threshold {self._server_args.eplb_min_rebalancing_utilization_threshold:.2f}"
+            )
+            return False
+
+        return True
+
+    def _compute_update_layer_ids_chunks(self) -> List[List[int]]:
+        all_layer_ids = sorted(
+            list(self._model_runner.model.routed_experts_weights_of_layer.keys())
+        )
+        chunk_size = self._rebalance_layers_per_chunk or 1000000
+        return list(_chunk_list(all_layer_ids, chunk_size=chunk_size))
+
+
+def _chunk_list(items: List, chunk_size):
+    for start_index in range(0, len(items), chunk_size):
+        yield items[start_index : start_index + chunk_size]
diff --git a/sglang/python/sglang/srt/eplb/expert_distribution.py b/sglang/python/sglang/srt/eplb/expert_distribution.py
new file mode 100644
index 0000000000000000000000000000000000000000..1859852af377a1cfcc2dd5649462daf7d089146a
--- /dev/null
+++ b/sglang/python/sglang/srt/eplb/expert_distribution.py
@@ -0,0 +1,1048 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import annotations
+
+import logging
+import math
+import time
+from abc import ABC
+from collections import deque
+from contextlib import contextmanager
+from dataclasses import dataclass
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Type
+
+import einops
+import torch
+import torch.distributed
+
+from sglang.srt.environ import envs
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.observability.metrics_collector import ExpertDispatchCollector
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import Withable, get_int_env_var
+
+if TYPE_CHECKING:
+    from sglang.srt.eplb.expert_location import ExpertLocationMetadata
+
+logger = logging.getLogger(__name__)
+
+# --------------------------------------- Entrypoint -----------------------------------------
+
+_OutputMode = Literal["file", "object"]
+
+
+@dataclass
+class ExpertDistributionMetrics:
+    eplb_balancedness: torch.Tensor
+
+    def copy_to_cpu(self):
+        self.eplb_balancedness = self.eplb_balancedness.to("cpu", non_blocking=True)
+
+
+class ExpertDistributionRecorder(ABC):
+    """Global expert distribution recording"""
+
+    @staticmethod
+    def init_new(
+        server_args: ServerArgs,
+        expert_location_metadata: ExpertLocationMetadata,
+        rank: int,
+    ):
+        if server_args.expert_distribution_recorder_mode is not None:
+            assert (
+                expert_location_metadata is not None
+            ), "ExpertLocationMetadata is required for expert distribution recording. One possible"
+            "reason is that you are using a model that does not support expert distribution"
+            "recording. Try setting `get_model_config_for_expert_location` in your model."
+            return _ExpertDistributionRecorderReal(
+                server_args, expert_location_metadata, rank
+            )
+        else:
+            return _ExpertDistributionRecorderNoop()
+
+    @contextmanager
+    def with_current_layer(self, layer_idx):
+        yield
+
+    @contextmanager
+    def with_debug_name(self, debug_name):
+        yield
+
+    @contextmanager
+    def disable_this_region(self):
+        yield
+
+    @contextmanager
+    def with_forward_pass(self, forward_pass_id: int, forward_batch: ForwardBatch):
+        yield {}
+
+    def on_select_experts(self, topk_ids: torch.Tensor):
+        pass
+
+    def on_deepep_dispatch_normal(
+        self,
+        local_physical_count_of_layer: List[int],
+        num_tokens_per_rank,
+        num_tokens_per_rdma_rank,
+        num_tokens_per_expert,
+    ):
+        pass
+
+    def on_deepep_dispatch_low_latency(
+        self, local_physical_count_of_layer: torch.Tensor
+    ):
+        pass
+
+    def start_record(self):
+        self._on_not_implemented()
+
+    def stop_record(self):
+        self._on_not_implemented()
+
+    def dump_record(self, output_mode: _OutputMode = "file"):
+        self._on_not_implemented()
+
+    @property
+    def recording(self):
+        return False
+
+    def _on_not_implemented(self):
+        raise Exception(
+            "Please set ServerArgs.expert_distribution_recorder_mode to use ExpertDistributionRecorder."
+        )
+
+
+class _ExpertDistributionRecorderNoop(ExpertDistributionRecorder):
+    pass
+
+
+class _ExpertDistributionRecorderReal(ExpertDistributionRecorder):
+    def __init__(
+        self,
+        server_args: ServerArgs,
+        expert_location_metadata: ExpertLocationMetadata,
+        rank: int,
+    ):
+        self._server_args = server_args
+        self._expert_location_metadata = expert_location_metadata
+
+        self._recording = False
+        self._disable_all = False
+        self._current_forward_pass_id = Withable()
+        self._current_layer_idx = Withable()
+        self._current_debug_name = Withable()
+        self._accumulator = _Accumulator.init_new(
+            server_args, expert_location_metadata, rank
+        )
+        self._single_pass_gatherers = {
+            k: _SinglePassGatherer.init_new(server_args, expert_location_metadata, rank)
+            for k in self._accumulator.get_single_pass_gatherer_keys()
+        }
+
+        if server_args.enable_expert_distribution_metrics:
+            logger.info(
+                "ExpertDistributionRecorder auto start record since enable_expert_distribution_metrics"
+            )
+            self.start_record()
+
+    def with_current_layer(self, layer_idx):
+        return self._current_layer_idx.with_value(layer_idx)
+
+    def with_debug_name(self, debug_name):
+        return self._current_debug_name.with_value(debug_name)
+
+    @contextmanager
+    def with_forward_pass(self, forward_pass_id: int, forward_batch: ForwardBatch):
+        outputs = {}
+        with self._current_forward_pass_id.with_value(forward_pass_id):
+            self._on_forward_pass_start(forward_batch)
+            try:
+                yield outputs
+            finally:
+                self._on_forward_pass_end(forward_pass_id, outputs)
+
+    @contextmanager
+    def disable_this_region(self):
+        """Context manager to temporarily disable recording."""
+        previous_disable_all = self._disable_all
+        self._disable_all = True
+        try:
+            yield
+        finally:
+            self._disable_all = previous_disable_all
+
+    def _on_forward_pass_start(self, forward_batch: ForwardBatch):
+        if not self._recording:
+            return
+        for gatherer_key, gatherer in self._single_pass_gatherers.items():
+            gatherer.reset()
+            gatherer.on_forward_pass_start(forward_batch)
+
+    def _on_forward_pass_end(self, forward_pass_id: int, outputs: Dict[str, Any]):
+        if not self._recording:
+            return
+        for gatherer_key, gatherer in self._single_pass_gatherers.items():
+            single_pass_data = gatherer.collect()
+            self._accumulator.append(
+                forward_pass_id, gatherer_key, single_pass_data, outputs
+            )
+
+    def on_select_experts(self, topk_ids: torch.Tensor):
+        self._on_hook("on_select_experts", topk_ids=topk_ids)
+
+    def on_deepep_dispatch_normal(
+        self,
+        local_physical_count_of_layer: List[int],
+        num_tokens_per_rank,
+        num_tokens_per_rdma_rank,
+        num_tokens_per_expert,
+    ):
+        self._on_hook(
+            "on_deepep_dispatch_normal",
+            local_physical_count_of_layer=local_physical_count_of_layer,
+            num_tokens_per_rank=num_tokens_per_rank,
+            num_tokens_per_rdma_rank=num_tokens_per_rdma_rank,
+            num_tokens_per_expert=num_tokens_per_expert,
+        )
+
+    def on_deepep_dispatch_low_latency(
+        self, local_physical_count_of_layer: torch.Tensor
+    ):
+        self._on_hook(
+            "on_deepep_dispatch_low_latency",
+            local_physical_count_of_layer=local_physical_count_of_layer,
+        )
+
+    def _on_hook(self, hook_name: str, **kwargs):
+        if self._disable_all:
+            return
+        if not (
+            self._recording or torch.get_device_module().is_current_stream_capturing()
+        ):
+            return
+        gatherer = self._single_pass_gatherers[
+            self._accumulator.get_single_pass_gatherer_key(
+                self._current_debug_name.value
+            )
+        ]
+        getattr(gatherer, hook_name)(layer_idx=self._current_layer_idx.value, **kwargs)
+
+    def _reset(self):
+        """Reset the expert distribution recorder."""
+        logger.info("Resetting ExpertDistributionRecorder...")
+        assert (
+            self._current_layer_idx.value is None
+        ), f"{self._current_layer_idx.value=}"
+        for gatherer in self._single_pass_gatherers.values():
+            gatherer.reset()
+        self._accumulator.reset()
+
+    def start_record(self):
+        """Start recording the expert distribution."""
+        if self._recording:
+            logger.warning(
+                "SGLang server is already recording expert ids. Did you forget to dump the expert ids recorded so far by sending requests to the `/stop_expert_distribution_record` and `/dump_expert_distribution_record` endpoints?"
+            )
+        self._reset()
+        self._recording = True
+
+    def stop_record(self):
+        """Stop recording the expert distribution."""
+        if not self._recording:
+            logger.warning(
+                "SGLang server has not been recording expert ids. Did you forget to start recording by sending request to the `/start_expert_distribution_record` endpoint?"
+            )
+        self._recording = False
+
+    def dump_record(self, output_mode: _OutputMode = "file"):
+        """Dump the expert distribution record and reset the recorder after dumping."""
+        output = self._accumulator.dump(output_mode=output_mode)
+        self._reset()
+        return output
+
+    @property
+    def recording(self):
+        return self._recording
+
+
+_global_expert_distribution_recorder: Optional[ExpertDistributionRecorder] = (
+    _ExpertDistributionRecorderNoop()
+)
+
+
+def get_global_expert_distribution_recorder():
+    return _global_expert_distribution_recorder
+
+
+def set_global_expert_distribution_recorder(value):
+    global _global_expert_distribution_recorder
+    _global_expert_distribution_recorder = value
+
+
+# --------------------------------------- SinglePassGatherer -----------------------------------------
+
+
+class _SinglePassGatherer(ABC):
+    @staticmethod
+    def init_new(
+        server_args: ServerArgs,
+        expert_location_metadata: ExpertLocationMetadata,
+        rank: int,
+    ) -> "_SinglePassGatherer":
+        if server_args.expert_distribution_recorder_mode == "per_token":
+            return _DetailSinglePassGatherer(
+                server_args, expert_location_metadata, rank
+            )
+
+        if server_args.expert_distribution_recorder_mode == "stat_approx":
+            if server_args.moe_a2a_backend != "none" and (
+                server_args.deepep_mode == "normal"
+            ):
+                return _DeepepNormalSinglePassGatherer(expert_location_metadata, rank)
+            else:
+                raise NotImplementedError
+
+        if server_args.moe_a2a_backend != "none":
+            if server_args.deepep_mode == "normal":
+                return _SelectExpertsSinglePassGatherer(expert_location_metadata, rank)
+            elif server_args.deepep_mode == "low_latency":
+                return _DeepepLowLatencySinglePassGatherer(
+                    expert_location_metadata, rank
+                )
+            else:
+                raise NotImplementedError
+
+        return _SelectExpertsSinglePassGatherer(expert_location_metadata, rank)
+
+    def __init__(self, expert_location_metadata: ExpertLocationMetadata, rank: int):
+        self._expert_location_metadata = expert_location_metadata
+        self._rank = rank
+
+    def on_forward_pass_start(self, forward_batch: ForwardBatch):
+        pass
+
+    def on_select_experts(self, layer_idx: int, topk_ids: torch.Tensor):
+        pass
+
+    def on_deepep_dispatch_normal(
+        self,
+        layer_idx: int,
+        local_physical_count_of_layer: List[int],
+        num_tokens_per_rank,
+        num_tokens_per_rdma_rank,
+        num_tokens_per_expert,
+    ):
+        pass
+
+    def on_deepep_dispatch_low_latency(
+        self, layer_idx: int, local_physical_count_of_layer: torch.Tensor
+    ):
+        pass
+
+    def reset(self):
+        raise NotImplementedError
+
+    def collect(self) -> Dict:
+        raise NotImplementedError
+
+
+class _DetailSinglePassGatherer(_SinglePassGatherer):
+    # DeepSeek V3 has this value; should generalize later
+    _TOP_K_NUM = 8
+
+    def __init__(
+        self,
+        server_args: ServerArgs,
+        expert_location_metadata: ExpertLocationMetadata,
+        rank: int,
+    ):
+        super().__init__(expert_location_metadata, rank)
+        self._metadata: Optional[Dict[str, Any]] = None
+        self._topk_ids_of_layer = torch.zeros(
+            (
+                expert_location_metadata.num_layers,
+                # TODO determine the max number
+                server_args.chunked_prefill_size * 8,
+                self._TOP_K_NUM,
+            ),
+            dtype=torch.int32,
+            device=server_args.device,
+        )
+        self._misc_objects: List[Dict[str, Any]] = []
+        assert (
+            not server_args.enable_two_batch_overlap
+        ), "DetailSinglePassGatherer does not support TBO yet"
+        # TODO assert shared experts fusion is disabled, o/w data is wrong
+
+    def on_forward_pass_start(self, forward_batch: ForwardBatch):
+        assert self._metadata is None
+        self._metadata = dict(
+            # TODO pr-chain
+            # rids=forward_batch.rids,
+            input_ids=forward_batch.input_ids.cpu().tolist(),
+            positions=forward_batch.positions.cpu().tolist(),
+            extend_seq_lens=forward_batch.extend_seq_lens_cpu,
+            forward_mode=forward_batch.forward_mode.value,
+        )
+
+    def on_select_experts(self, layer_idx: int, topk_ids: torch.Tensor):
+        self._topk_ids_of_layer[layer_idx, : topk_ids.shape[0], : topk_ids.shape[1]] = (
+            topk_ids
+        )
+
+    def on_deepep_dispatch_normal(
+        self,
+        layer_idx: int,
+        local_physical_count_of_layer: List[int],
+        num_tokens_per_rank,
+        num_tokens_per_rdma_rank,
+        num_tokens_per_expert,
+    ):
+        self._misc_objects.append(
+            dict(
+                layer_id=layer_idx,
+                num_tokens_per_rank=num_tokens_per_rank.cpu().tolist(),
+                num_tokens_per_rdma_rank=num_tokens_per_rdma_rank.cpu().tolist(),
+                num_tokens_per_expert=num_tokens_per_expert.cpu().tolist(),
+            )
+        )
+
+    def reset(self):
+        self._topk_ids_of_layer[...] = -1
+        self._misc_objects.clear()
+        self._metadata = None
+
+    def collect(self) -> Dict:
+        num_tokens = len(self._metadata["input_ids"])
+
+        global_physical_count = _convert_per_token_to_global_physical_count(
+            num_tokens,
+            num_layers=self._expert_location_metadata.num_layers,
+            num_physical_experts=self._expert_location_metadata.num_physical_experts,
+            _topk_ids_of_layer=self._topk_ids_of_layer,
+        )
+
+        return dict(
+            **self._metadata,
+            topk_ids_of_layer=self._topk_ids_of_layer[:, :num_tokens, :].clone().cpu(),
+            misc_objects=self._misc_objects,
+            global_physical_count=global_physical_count,
+        )
+
+
+class _LayerBasedCpuSinglePassGatherer(_SinglePassGatherer):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._objects_of_layer = {}
+
+    def _on_layer_data(self, layer_idx: int, objects: List[int]):
+        assert 0 <= layer_idx < self._expert_location_metadata.num_layers
+        if layer_idx in self._objects_of_layer:
+            self._objects_of_layer[layer_idx] = _list_sum(
+                self._objects_of_layer[layer_idx], objects
+            )
+        else:
+            self._objects_of_layer[layer_idx] = objects
+
+    def reset(self):
+        self._objects_of_layer.clear()
+
+    def _collect_objects(self, pad_len: int) -> torch.Tensor:
+        data = [
+            self._objects_of_layer.get(layer_index) or ([0] * pad_len)
+            for layer_index in range(self._expert_location_metadata.num_layers)
+        ]
+        return torch.tensor(data)
+
+
+def _list_sum(a: List, b: List) -> List:
+    return [x + y for x, y in zip(a, b, strict=True)]
+
+
+class _LayerBasedGpuSinglePassGatherer(_SinglePassGatherer):
+    def __init__(self, *args, enable_global_physical_experts: bool, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._enable_global_physical_experts = enable_global_physical_experts
+        self._data = torch.zeros(
+            (
+                self._expert_location_metadata.num_layers,
+                (
+                    self._expert_location_metadata.num_physical_experts
+                    if enable_global_physical_experts
+                    else self._expert_location_metadata.num_local_physical_experts
+                ),
+            ),
+            dtype=torch.int,
+            device="cuda",
+        )
+
+    def reset(self):
+        self._data[...] = 0
+
+    def collect(self) -> Dict:
+        if self._enable_global_physical_experts:
+            global_physical_count = self._data
+        else:
+            # Can optimize if bottleneck
+            global_physical_count = _convert_local_to_global_physical_count(
+                self._data,
+                rank=self._rank,
+                num_local_physical_experts=self._expert_location_metadata.num_local_physical_experts,
+                num_physical_experts=self._expert_location_metadata.num_physical_experts,
+            )
+
+        return dict(global_physical_count=global_physical_count)
+
+
+class _SelectExpertsSinglePassGatherer(_LayerBasedGpuSinglePassGatherer):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs, enable_global_physical_experts=True)
+
+    # can optimize (e.g. fuse / compile)
+    def on_select_experts(self, layer_idx: int, topk_ids: torch.Tensor):
+        topk_ids = topk_ids.flatten()
+        mask = topk_ids != -1
+        self._data[layer_idx, :].scatter_add_(
+            dim=0, index=topk_ids.masked_fill(~mask, 0).long(), src=mask.int()
+        )
+
+
+class _DeepepNormalSinglePassGatherer(_LayerBasedCpuSinglePassGatherer):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if torch.distributed.get_rank() == 0:
+            logger.info(
+                "DeepepNormalSinglePassGatherer gathers approximate statistics. "
+                "If used with small batch size, consider using expert_distribution_recorder_mode=stat."
+            )
+
+    def on_deepep_dispatch_normal(
+        self,
+        layer_idx: int,
+        local_physical_count_of_layer: List[int],
+        num_tokens_per_rank,
+        num_tokens_per_rdma_rank,
+        num_tokens_per_expert,
+    ):
+        assert isinstance(local_physical_count_of_layer, list)
+        self._on_layer_data(layer_idx, local_physical_count_of_layer)
+
+    def collect(self) -> Dict:
+        local_physical_count = super()._collect_objects(
+            pad_len=self._expert_location_metadata.num_local_physical_experts
+        )
+        global_physical_count = _convert_local_to_global_physical_count(
+            local_physical_count,
+            rank=self._rank,
+            num_local_physical_experts=self._expert_location_metadata.num_local_physical_experts,
+            num_physical_experts=self._expert_location_metadata.num_physical_experts,
+        )
+        return dict(global_physical_count=global_physical_count)
+
+
+class _DeepepLowLatencySinglePassGatherer(_LayerBasedGpuSinglePassGatherer):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs, enable_global_physical_experts=False)
+
+    def on_deepep_dispatch_low_latency(
+        self, layer_idx: int, local_physical_count_of_layer: torch.Tensor
+    ):
+        # Most naive implementation, can optimize later
+        self._data[layer_idx, :] += local_physical_count_of_layer
+
+
+def _convert_per_token_to_global_physical_count(
+    num_tokens: int,
+    num_layers: int,
+    num_physical_experts: int,
+    _topk_ids_of_layer: torch.Tensor,
+) -> torch.Tensor:
+    topk_ids_layer_major = _topk_ids_of_layer[:, :num_tokens, :].reshape(num_layers, -1)
+    mask = topk_ids_layer_major != -1
+
+    index = topk_ids_layer_major.masked_fill(~mask, 0).long()
+    src = mask.int()
+
+    ans = torch.zeros(
+        (num_layers, num_physical_experts),
+        dtype=_topk_ids_of_layer.dtype,
+        device=_topk_ids_of_layer.device,
+    )
+    ans.scatter_add_(dim=1, index=index, src=src)
+    return ans
+
+
+def _convert_local_to_global_physical_count(
+    local_physical_count: torch.Tensor,
+    rank: int,
+    num_local_physical_experts: int,
+    num_physical_experts: int,
+) -> torch.Tensor:
+    dtype = local_physical_count.dtype
+    device = local_physical_count.device
+    num_layers, _ = local_physical_count.shape
+
+    ans = torch.zeros((num_layers, num_physical_experts), dtype=dtype, device=device)
+    ans[
+        :, num_local_physical_experts * rank : num_local_physical_experts * (rank + 1)
+    ] = local_physical_count
+    return ans
+
+
+# --------------------------------------- Accumulator -----------------------------------------
+
+_SINGLE_PASS_GATHERER_KEY_PRIMARY = "primary"
+
+
+class _Accumulator(ABC):
+    @staticmethod
+    def init_new(
+        server_args: ServerArgs,
+        expert_location_metadata: ExpertLocationMetadata,
+        rank: int,
+    ) -> "_Accumulator":
+        return _Accumulator.get_class(server_args)(
+            server_args, expert_location_metadata, rank
+        )
+
+    @staticmethod
+    def get_class(server_args: ServerArgs) -> Type["_Accumulator"]:
+        return {
+            "stat": _StatAccumulator,
+            "stat_approx": _StatAccumulator,
+            "per_pass": _DetailAccumulator,
+            "per_token": _DetailAccumulator,
+        }[server_args.expert_distribution_recorder_mode]
+
+    def __init__(
+        self,
+        server_args: ServerArgs,
+        expert_location_metadata: ExpertLocationMetadata,
+        rank: int,
+    ):
+        self._server_args = server_args
+        self._expert_location_metadata = expert_location_metadata
+        self._rank = rank
+
+    def get_single_pass_gatherer_keys(self):
+        return [_SINGLE_PASS_GATHERER_KEY_PRIMARY]
+
+    def get_single_pass_gatherer_key(self, debug_name: Optional[str]):
+        return _SINGLE_PASS_GATHERER_KEY_PRIMARY
+
+    def append(
+        self,
+        forward_pass_id: int,
+        gatherer_key: str,
+        single_pass_data: Dict,
+        outputs: Dict[str, Any],
+    ):
+        pass
+
+    def reset(self):
+        pass
+
+    def dump(self, output_mode: _OutputMode):
+        pass
+
+
+class _UtilizationRateAccumulatorMixin(_Accumulator):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self._enable = self._server_args.enable_expert_distribution_metrics
+
+        if self._enable:
+            self.window_sizes = [10, 100, 1000]
+            self._history = _DequeCollection(maxlens=self.window_sizes)
+            self._rank = torch.distributed.get_rank()
+            self._expert_dispatch_collector = ExpertDispatchCollector(
+                self._expert_location_metadata.ep_size
+            )
+            self._metric_heatmap_collection_counter = 0
+
+    def append(
+        self,
+        forward_pass_id: int,
+        gatherer_key: str,
+        single_pass_data: Dict,
+        outputs: Dict[str, Any],
+    ):
+        super().append(forward_pass_id, gatherer_key, single_pass_data, outputs)
+        if self._enable:
+            return self._append_utilization_rate(
+                forward_pass_id, single_pass_data["global_physical_count"], outputs
+            )
+
+    def reset(self):
+        super().reset()
+        if self._enable:
+            self._history.clear()
+
+    def _append_utilization_rate(
+        self,
+        forward_pass_id: int,
+        single_pass_global_physical_count: torch.Tensor,
+        outputs: Dict[str, Any],
+    ):
+        gpu_physical_count = compute_gpu_physical_count(
+            single_pass_global_physical_count,
+            num_gpu=self._expert_location_metadata.ep_size,
+        )
+        gpu_physical_count = gpu_physical_count.to(self._server_args.device)
+        torch.distributed.reduce(
+            gpu_physical_count, dst=0, op=torch.distributed.ReduceOp.SUM
+        )
+
+        if self._rank == 0:
+            self._handle_metric_eplb_heatmap(gpu_physical_count)
+
+            utilization_rate_gpu = torch.mean(
+                compute_utilization_rate(gpu_physical_count)
+            )
+            if envs.SGLANG_ENABLE_EPLB_BALANCEDNESS_METRIC.get():
+                print(f"hi {self._rank=} {utilization_rate_gpu=}")
+                outputs["metrics"] = ExpertDistributionMetrics(
+                    eplb_balancedness=utilization_rate_gpu,
+                )
+            else:
+                # TODO maybe refactor this part to also avoid a `.item()` gpu->cpu sync
+                utilization_rate_cpu = utilization_rate_gpu.item()
+                self._history.append(utilization_rate_cpu)
+
+                gpu_physical_count_sum = gpu_physical_count.sum().item()
+
+                logger.info(
+                    f"[Expert Balancedness] "
+                    f"forward_pass_id={forward_pass_id} "
+                    f"current_pass_balancedness={utilization_rate_cpu:.03f} "
+                    f"{''.join(f'last_{size}_average_balancedness={value:.03f} ' for size, value in self._history.mean().items())} "
+                    f"gpu_physical_count_sum={gpu_physical_count_sum}"
+                    # f"current_pass_per_layer={[round(x, 2) for x in utilization_rate_tensor.cpu().tolist()]}"
+                )
+
+    # TODO refactor
+    def _handle_metric_eplb_heatmap(self, gpu_physical_count: torch.Tensor):
+        # sglang:eplb_gpu_physical_count metric is disabled if SGLANG_EPLB_HEATMAP_COLLECTION_INTERVAL <= 0
+        interval = get_int_env_var("SGLANG_EPLB_HEATMAP_COLLECTION_INTERVAL", 0)
+        if interval > 0 and self._metric_heatmap_collection_counter % interval == 0:
+            for layer_idx in range(self._expert_location_metadata.num_layers):
+                count_of_layer = (
+                    self._expert_dispatch_collector.eplb_gpu_physical_count.labels(
+                        layer=str(layer_idx)
+                    )
+                )
+                # Exclude the +Inf bucket.
+                assert (
+                    self._expert_location_metadata.ep_size
+                    == len(count_of_layer._buckets) - 1
+                ), f"{self._expert_location_metadata.ep_size=}, {len(count_of_layer._buckets)=}"
+                for gpu_rank in range(self._expert_location_metadata.ep_size):
+                    count = gpu_physical_count[layer_idx, gpu_rank]
+                    if count > 0:
+                        count_of_layer._sum.inc(count * gpu_rank)
+                        count_of_layer._buckets[gpu_rank].inc(count)
+        self._metric_heatmap_collection_counter += 1
+
+
+class _DequeCollection:
+    def __init__(self, maxlens: List[int]):
+        self._dequeues = [deque(maxlen=maxlen) for maxlen in maxlens]
+
+    def append(self, value):
+        for d in self._dequeues:
+            d.append(value)
+
+    def clear(self):
+        for d in self._dequeues:
+            d.clear()
+
+    def mean(self) -> Dict[int, float]:
+        return {d.maxlen: sum(d) / len(d) for d in self._dequeues}
+
+
+class _DetailAccumulator(_UtilizationRateAccumulatorMixin):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._records = []
+
+    def get_single_pass_gatherer_keys(self):
+        if False:  # TODO `server_args.enable_two_batch_overlap`
+            return [_SINGLE_PASS_GATHERER_KEY_PRIMARY, "child_a", "child_b"]
+        return super().get_single_pass_gatherer_keys()
+
+    def get_single_pass_gatherer_key(self, debug_name: Optional[str]):
+        if False:  # TODO `server_args.enable_two_batch_overlap`
+            return debug_name or _SINGLE_PASS_GATHERER_KEY_PRIMARY
+        return super().get_single_pass_gatherer_key(debug_name)
+
+    def append(
+        self,
+        forward_pass_id: int,
+        gatherer_key: str,
+        single_pass_data: Dict,
+        outputs: Dict[str, Any],
+    ):
+        super().append(forward_pass_id, gatherer_key, single_pass_data, outputs)
+
+        def _process_object(obj):
+            if isinstance(obj, torch.Tensor):
+                return obj.cpu().clone()
+            return obj
+
+        single_pass_data_processed = {
+            k: _process_object(v) for k, v in single_pass_data.items()
+        }
+
+        self._records.append(
+            dict(
+                forward_pass_id=forward_pass_id,
+                rank=self._rank,
+                gatherer_key=gatherer_key,
+                **single_pass_data_processed,
+            )
+        )
+
+    def reset(self):
+        super().reset()
+        self._records.clear()
+
+    def dump(self, output_mode: _OutputMode):
+        assert output_mode == "file"
+        output = dict(
+            records=self._records,
+            # NOTE: This may change during recording, so here we say it is the "last" one
+            last_physical_to_logical_map=self._expert_location_metadata.physical_to_logical_map,
+        )
+        _dump_to_file(
+            f"expert_distribution_recorder_{time.time()}_{self._rank}.pt", output
+        )
+
+
+class _StatAccumulator(_UtilizationRateAccumulatorMixin):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._global_physical_count_of_buffered_step = _Buffer.init_new(
+            item_shape=(
+                self._expert_location_metadata.num_layers,
+                # Cannot use local_physical_count to support select_experts
+                self._expert_location_metadata.num_physical_experts,
+            ),
+            buffer_size=self._server_args.expert_distribution_recorder_buffer_size,
+            dtype=torch.int32,
+            device=self._server_args.device,
+        )
+        self._first_dump = True
+
+    def append(
+        self,
+        forward_pass_id: int,
+        gatherer_key: str,
+        single_pass_data: Dict,
+        outputs: Dict[str, Any],
+    ):
+        super().append(forward_pass_id, gatherer_key, single_pass_data, outputs)
+        # Can optimize if overhead here is large
+        self._global_physical_count_of_buffered_step.append(
+            single_pass_data["global_physical_count"]
+        )
+
+    def reset(self):
+        super().reset()
+        self._global_physical_count_of_buffered_step.reset()
+
+    def dump(self, output_mode: _OutputMode):
+        logical_count_of_buffered_step = _convert_global_physical_count_to_logical_count(
+            self._global_physical_count_of_buffered_step.get_all(),
+            num_layers=self._expert_location_metadata.num_layers,
+            num_logical_experts=self._expert_location_metadata.num_logical_experts,
+            physical_to_logical_map=self._expert_location_metadata.physical_to_logical_map,
+        )
+
+        if self._first_dump:
+            self._first_dump = False
+            torch.get_device_module().empty_cache()
+
+        torch.distributed.all_reduce(
+            logical_count_of_buffered_step, op=torch.distributed.ReduceOp.SUM
+        )
+
+        output = dict(
+            rank=self._rank,
+            logical_count=logical_count_of_buffered_step,
+            average_utilization_rate_over_window=self._get_global_average_utilization_rate(),
+        )
+
+        if output_mode == "file":
+            if self._rank == 0:
+                _dump_to_file(f"expert_distribution_recorder_{time.time()}.pt", output)
+        elif output_mode == "object":
+            return output
+        else:
+            raise NotImplementedError
+
+    def _get_global_average_utilization_rate(self):
+        if not self._enable or math.isclose(
+            self._server_args.eplb_min_rebalancing_utilization_threshold, 1.0
+        ):
+            return None
+
+        if self._rank == 0:
+            utilization_mean_rates = self._history.mean()
+            window_index = self.window_sizes[-1]
+            average_utilization_rate_over_window = (
+                utilization_mean_rates[window_index]
+                if window_index in utilization_mean_rates
+                else 0
+            )
+
+            avg_rate_tensor = torch.tensor(
+                [average_utilization_rate_over_window],
+                dtype=torch.float32,
+                device="cuda",
+            )
+        else:
+            avg_rate_tensor = torch.empty(1, dtype=torch.float32, device="cuda")
+        torch.distributed.broadcast(avg_rate_tensor, src=0)
+        return avg_rate_tensor.item()
+
+
+def _dump_to_file(name, data):
+    save_dir = Path(envs.SGLANG_EXPERT_DISTRIBUTION_RECORDER_DIR.get())
+    path_output = save_dir / name
+    logger.info(f"Write expert distribution to {path_output}")
+    if not save_dir.exists():
+        save_dir.mkdir(parents=True, exist_ok=True)
+    torch.save(data, str(path_output))
+
+
+class _Buffer:
+    @staticmethod
+    def init_new(item_shape: Tuple, buffer_size: int, dtype, device):
+        if buffer_size < 0:
+            return _InfiniteBuffer(item_shape, dtype=dtype, device=device)
+        else:
+            return _CircularBuffer(item_shape, buffer_size, dtype=dtype, device=device)
+
+    def append(self, value: torch.Tensor):
+        raise NotImplementedError
+
+    def get_all(self) -> torch.Tensor:
+        raise NotImplementedError
+
+    def reset(self):
+        raise NotImplementedError
+
+
+class _CircularBuffer(_Buffer):
+    def __init__(self, item_shape: Tuple, buffer_size: int, dtype, device):
+        self._buffer = torch.zeros(
+            (buffer_size, *item_shape), dtype=dtype, device=device
+        )
+        self._curr_index = 0
+
+    def append(self, value: torch.Tensor):
+        self._buffer[self._curr_index] = value
+        self._curr_index = (self._curr_index + 1) % len(self._buffer)
+
+    def get_all(self) -> torch.Tensor:
+        return self._buffer
+
+    def reset(self):
+        self._buffer[...] = 0
+
+
+class _InfiniteBuffer(_Buffer):
+    def __init__(self, item_shape: Tuple, dtype, device):
+        self._item_shape = item_shape
+        self._buffer = torch.zeros((128, *item_shape), dtype=dtype, device=device)
+        self._size = 0
+
+    def append(self, value: torch.Tensor):
+        curr_buffer_size = len(self._buffer)
+        dtype = self._buffer.dtype
+        device = self._buffer.device
+
+        if self._size == curr_buffer_size:
+            new_buffer = torch.zeros(
+                (2 * curr_buffer_size, *self._item_shape), dtype=dtype, device=device
+            )
+            new_buffer[:curr_buffer_size] = self._buffer
+            self._buffer = new_buffer
+
+        self._buffer[self._size] = value
+        self._size += 1
+
+    def get_all(self) -> torch.Tensor:
+        return self._buffer[: self._size]
+
+    def reset(self):
+        self._buffer[...] = 0
+        self._size = 0
+
+
+def _convert_global_physical_count_to_logical_count(
+    # (whatever, num_layers, num_physical_experts)
+    global_physical_count: torch.Tensor,
+    num_layers: int,
+    num_logical_experts: int,
+    physical_to_logical_map: torch.Tensor,
+):
+    dim_extra, _, _ = global_physical_count.shape
+    dtype = global_physical_count.dtype
+    device = global_physical_count.device
+    logical_count = torch.zeros(
+        (dim_extra, num_layers, num_logical_experts), dtype=dtype, device=device
+    )
+    logical_count.scatter_add_(
+        dim=2,
+        index=physical_to_logical_map.unsqueeze(0)
+        .expand(dim_extra, -1, -1)
+        .to(torch.int64),
+        src=global_physical_count,
+    )
+    return logical_count
+
+
+def compute_gpu_physical_count(
+    physical_count_of_whatever: torch.Tensor,  # (..., num_layer, num_physical_expert)
+    num_gpu: int,
+):
+    """output: gpu_physical_count_of_batch (..., num_layer, num_gpu)"""
+    return einops.reduce(
+        physical_count_of_whatever,
+        "... num_layer (num_gpu num_expert_per_gpu) -> ... num_layer num_gpu",
+        "sum",
+        num_gpu=num_gpu,
+    )
+
+
+def compute_utilization_rate(
+    gpu_physical_count_of_batch: torch.Tensor,  # (..., num_layer, num_gpu)
+):
+    """output: utilization_rate (..., num_layer)"""
+    gpu_physical_count_of_batch = gpu_physical_count_of_batch.float()
+    max_gpu_physical_count = einops.reduce(
+        gpu_physical_count_of_batch,
+        "... num_layer num_gpu -> ... num_layer",
+        "max",
+    )
+    avg_gpu_physical_count = einops.reduce(
+        gpu_physical_count_of_batch,
+        "... num_layer num_gpu -> ... num_layer",
+        "mean",
+    )
+    return (avg_gpu_physical_count + 1e-5) / (max_gpu_physical_count + 1e-5)
diff --git a/sglang/python/sglang/srt/eplb/expert_location.py b/sglang/python/sglang/srt/eplb/expert_location.py
new file mode 100644
index 0000000000000000000000000000000000000000..7bd0254baa5a28dfec7edff2c75dce012b188303
--- /dev/null
+++ b/sglang/python/sglang/srt/eplb/expert_location.py
@@ -0,0 +1,573 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import annotations
+
+import json
+import logging
+import random
+from dataclasses import dataclass
+from pathlib import Path
+from typing import TYPE_CHECKING, List, Optional
+
+import torch
+import torch.distributed
+import torch.nn.functional as F
+
+from sglang.srt.eplb import eplb_algorithms
+from sglang.srt.model_loader import get_model_architecture
+
+if TYPE_CHECKING:
+    from sglang.srt.configs.model_config import ModelConfig
+    from sglang.srt.server_args import ServerArgs
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ExpertLocationMetadata:
+    physical_to_logical_map: torch.Tensor  # (layers, num_physical_experts)
+    physical_to_logical_map_cpu: torch.Tensor
+    logical_to_all_physical_map: torch.Tensor  # (layers, num_logical_experts, X)
+    logical_to_all_physical_map_cpu: torch.Tensor  # CPU copy for performance
+    logical_to_all_physical_map_num_valid: torch.Tensor  # (layers, num_logical_experts)
+    # (layers, num_logical_experts)
+    logical_to_rank_dispatch_physical_map: Optional[torch.Tensor]
+
+    # -------------------------------- properties ------------------------------------
+
+    @property
+    def num_layers(self) -> int:
+        return self.physical_to_logical_map.shape[0]
+
+    @property
+    def num_physical_experts(self) -> int:
+        return self.physical_to_logical_map.shape[1]
+
+    @property
+    def num_local_physical_experts(self) -> int:
+        ans, remainder = divmod(self.num_physical_experts, self.ep_size)
+        assert remainder == 0
+        return ans
+
+    @property
+    def num_logical_experts(self) -> int:
+        return self.logical_to_all_physical_map.shape[1]
+
+    @property
+    def ep_size(self):
+        # TODO change when EP size != world size
+        return torch.distributed.get_world_size()
+
+    def __post_init__(self):
+        num_layers_0, num_physical_experts_0 = self.physical_to_logical_map.shape
+        num_layers_1, num_logical_experts_0, num_physical_experts_1 = (
+            self.logical_to_all_physical_map.shape
+        )
+        num_layers_2, num_logical_experts_1 = (
+            self.logical_to_all_physical_map_num_valid.shape
+        )
+        assert num_layers_0 == num_layers_1 == num_layers_2
+        assert num_logical_experts_0 == num_logical_experts_1
+        assert num_physical_experts_0 == num_physical_experts_1
+
+    # -------------------------------- construction ------------------------------------
+
+    @staticmethod
+    def init_trivial(
+        server_args: ServerArgs, model_config: ModelConfig, moe_ep_rank: int
+    ):
+        """Trivial location - logical expert i corresponds to physical expert i"""
+        common = ExpertLocationMetadata._init_common(server_args, model_config)
+
+        if common is None:
+            return None
+
+        num_physical_experts = common["num_physical_experts"]
+        model_config_for_expert_location = common["model_config_for_expert_location"]
+        num_layers = model_config_for_expert_location.num_layers
+        num_logical_experts = model_config_for_expert_location.num_logical_experts
+
+        physical_to_logical_map = (
+            torch.arange(0, num_physical_experts).repeat(num_layers, 1)
+            % num_logical_experts
+        )
+
+        return ExpertLocationMetadata.init_by_mapping(
+            server_args,
+            model_config,
+            physical_to_logical_map=physical_to_logical_map,
+            moe_ep_rank=moe_ep_rank,
+        )
+
+    @staticmethod
+    def init_by_mapping(
+        server_args: ServerArgs,
+        model_config: ModelConfig,
+        physical_to_logical_map,
+        moe_ep_rank: int = None,
+    ):
+        if not isinstance(physical_to_logical_map, torch.Tensor):
+            physical_to_logical_map = torch.tensor(physical_to_logical_map)
+        physical_to_logical_map = physical_to_logical_map.to(server_args.device)
+
+        common = ExpertLocationMetadata._init_common(server_args, model_config)
+
+        if common is None:
+            return None
+
+        model_config_for_expert_location = common["model_config_for_expert_location"]
+        logical_to_all_physical_map = _compute_logical_to_all_physical_map(
+            server_args=server_args,
+            physical_to_logical_map=physical_to_logical_map,
+            num_logical_experts=model_config_for_expert_location.num_logical_experts,
+            ep_size=common["ep_size"],
+            moe_ep_rank=moe_ep_rank,
+        )
+
+        return ExpertLocationMetadata._init_raw(
+            server_args=server_args,
+            ep_size=common["ep_size"],
+            physical_to_logical_map=physical_to_logical_map,
+            logical_to_all_physical_map=logical_to_all_physical_map,
+        )
+
+    @staticmethod
+    def init_by_eplb(
+        server_args: ServerArgs, model_config: ModelConfig, logical_count: torch.Tensor
+    ):
+        if not isinstance(logical_count, torch.Tensor):
+            logical_count = torch.tensor(logical_count)
+        if len(logical_count.shape) == 2:
+            logical_count = logical_count.unsqueeze(0)
+        logical_count = logical_count.to(server_args.device)
+
+        common = ExpertLocationMetadata._init_common(server_args, model_config)
+
+        if common is None:
+            return None
+
+        model_config_for_expert_location = common["model_config_for_expert_location"]
+        num_physical_experts = common["num_physical_experts"]
+        num_groups = model_config_for_expert_location.num_groups
+        num_nodes = server_args.nnodes
+
+        physical_to_logical_map, logical_to_all_physical_map, expert_count = (
+            eplb_algorithms.rebalance_experts(
+                tokens_per_expert=logical_count,
+                num_physical_experts=num_physical_experts,
+                num_local_physical_experts=num_physical_experts // common["ep_size"],
+                num_groups=num_groups,
+                num_nodes=num_nodes,
+                algorithm=eplb_algorithms.compute_algorithm(
+                    raw_algorithm=server_args.eplb_algorithm,
+                    num_groups=num_groups,
+                    num_nodes=num_nodes,
+                ),
+            )
+        )
+
+        return ExpertLocationMetadata._init_raw(
+            server_args=server_args,
+            ep_size=common["ep_size"],
+            physical_to_logical_map=physical_to_logical_map.to(server_args.device),
+            logical_to_all_physical_map=logical_to_all_physical_map.to(
+                server_args.device
+            ),
+        )
+
+    @staticmethod
+    def _init_common(server_args: ServerArgs, model_config: ModelConfig):
+        model_config_for_expert_location = (
+            ModelConfigForExpertLocation.from_model_config(model_config)
+        )
+
+        if model_config_for_expert_location is None:
+            return None
+
+        num_physical_experts = (
+            model_config_for_expert_location.num_logical_experts
+            + server_args.ep_num_redundant_experts
+        )
+        ep_size = server_args.ep_size
+        assert num_physical_experts % ep_size == 0
+        num_local_physical_experts = num_physical_experts // ep_size
+
+        return dict(
+            model_config_for_expert_location=model_config_for_expert_location,
+            num_physical_experts=num_physical_experts,
+            num_local_physical_experts=num_local_physical_experts,
+            ep_size=ep_size,
+        )
+
+    @staticmethod
+    def _init_raw(
+        server_args: ServerArgs,
+        ep_size: int,
+        physical_to_logical_map: torch.Tensor,
+        logical_to_all_physical_map: torch.Tensor,
+    ):
+        _, num_physical_experts = physical_to_logical_map.shape
+
+        logical_to_all_physical_map_padded = F.pad(
+            logical_to_all_physical_map,
+            (0, num_physical_experts - logical_to_all_physical_map.shape[-1]),
+            value=-1,
+        )
+
+        logical_to_all_physical_map_num_valid = torch.count_nonzero(
+            logical_to_all_physical_map != -1, dim=-1
+        )
+
+        return ExpertLocationMetadata(
+            physical_to_logical_map=physical_to_logical_map,
+            physical_to_logical_map_cpu=physical_to_logical_map.cpu(),
+            logical_to_all_physical_map=logical_to_all_physical_map_padded,
+            logical_to_all_physical_map_cpu=logical_to_all_physical_map_padded.cpu(),
+            logical_to_all_physical_map_num_valid=logical_to_all_physical_map_num_valid,
+            logical_to_rank_dispatch_physical_map=(
+                compute_logical_to_rank_dispatch_physical_map(
+                    server_args=server_args,
+                    logical_to_all_physical_map=logical_to_all_physical_map,
+                    ep_size=ep_size,
+                    num_physical_experts=num_physical_experts,
+                    # TODO improve when we have real EP rank
+                    ep_rank=torch.distributed.get_rank() % ep_size,
+                )
+                if server_args.ep_dispatch_algorithm == "static"
+                else None
+            ),
+        )
+
+    # -------------------------------- mutation ------------------------------------
+
+    def update(
+        self,
+        other: "ExpertLocationMetadata",
+        update_layer_ids: List[int],
+    ):
+        for field in [
+            "ep_size",
+        ]:
+            assert getattr(self, field) == getattr(other, field)
+
+        for field in [
+            "physical_to_logical_map",
+            "physical_to_logical_map_cpu",
+            "logical_to_all_physical_map",
+            "logical_to_all_physical_map_cpu",
+            "logical_to_all_physical_map_num_valid",
+            "logical_to_rank_dispatch_physical_map",
+        ]:
+            other_field = getattr(other, field)
+            self_field = getattr(self, field)
+            assert (other_field is not None) == (self_field is not None)
+            if self_field is not None:
+                mask_update = torch.tensor(
+                    [i in update_layer_ids for i in range(self.num_layers)]
+                )
+                mask_update = mask_update.view(*([-1] + [1] * (self_field.dim() - 1)))
+                mask_update = mask_update.to(self_field.device, non_blocking=True)
+                self_field[...] = torch.where(mask_update, other_field, self_field)
+
+    # -------------------------------- usage ------------------------------------
+
+    def logical_to_all_physical(
+        self,
+        layer_id: int,
+        logical_expert_id: int,
+        require_global_experts: bool = False,
+    ) -> List[int]:
+        # Use CPU copy to avoid GPU→CPU sync on every call, which is expensive in update weights scenario
+        if require_global_experts:
+            num_physical_experts = self.logical_to_all_physical_map_cpu[layer_id].shape[
+                -1
+            ]
+            return list(
+                range(logical_expert_id, num_physical_experts, self.num_logical_experts)
+            )
+        return [
+            physical_expert_id
+            for physical_expert_id in self.logical_to_all_physical_map_cpu[
+                layer_id, logical_expert_id
+            ].tolist()
+            if physical_expert_id != -1
+        ]
+
+
+_global_expert_location_metadata: Optional[ExpertLocationMetadata] = None
+
+
+def get_global_expert_location_metadata():
+    return _global_expert_location_metadata
+
+
+def set_global_expert_location_metadata(value):
+    global _global_expert_location_metadata
+    assert _global_expert_location_metadata is None
+    _global_expert_location_metadata = value
+
+
+def _compute_logical_to_all_physical_map(
+    server_args: ServerArgs,
+    physical_to_logical_map: torch.Tensor,
+    num_logical_experts: int,
+    ep_size: int,
+    moe_ep_rank: int,
+):
+    # This is rarely called, so we use for loops for maximum clarity
+
+    num_layers, num_physical_experts = physical_to_logical_map.shape
+
+    logical_to_all_physical_map = [
+        [[] for _ in range(num_logical_experts)] for _ in range(num_layers)
+    ]
+
+    # Find out the candidate physical experts for each logical expert on each layer
+    for layer_id in range(num_layers):
+        for physical_expert_id in range(num_physical_experts):
+            logical_expert_id = physical_to_logical_map[
+                layer_id, physical_expert_id
+            ].item()
+            logical_to_all_physical_map[layer_id][logical_expert_id].append(
+                physical_expert_id
+            )
+
+    # Replace by the physical expert on local GPU or node if possible
+    if moe_ep_rank is not None:
+        num_gpus_per_node = server_args.ep_size // server_args.nnodes
+        num_local_gpu_physical_experts = num_physical_experts // ep_size
+        num_local_node_physical_experts = (
+            num_local_gpu_physical_experts * num_gpus_per_node
+        )
+        for layer_id in range(num_layers):
+            for logical_expert_id in range(num_logical_experts):
+                # Try to find the nearest physical expert
+                nearest_expert = _find_nearest_expert(
+                    candidate_physical_expert_ids=logical_to_all_physical_map[layer_id][
+                        logical_expert_id
+                    ],
+                    num_local_gpu_physical_experts=num_local_gpu_physical_experts,
+                    moe_ep_rank=moe_ep_rank,
+                    num_gpus_per_node=num_gpus_per_node,
+                    num_local_node_physical_experts=num_local_node_physical_experts,
+                )
+
+                # Replace by the nearest physical expert
+                if nearest_expert != -1:
+                    logical_to_all_physical_map[layer_id][logical_expert_id] = [
+                        nearest_expert
+                    ]
+
+    logical_to_all_physical_map = _pad_nested_array(
+        logical_to_all_physical_map, pad_value=-1
+    )
+
+    return torch.tensor(
+        logical_to_all_physical_map, device=physical_to_logical_map.device
+    )
+
+
+def _pad_nested_array(arr, pad_value):
+    max_len = max(len(inner) for outer in arr for inner in outer)
+    padded = [
+        [inner + [pad_value] * (max_len - len(inner)) for inner in outer]
+        for outer in arr
+    ]
+    return padded
+
+
+# TODO optimize performance (rewrite and/or run in separate process with overlap)
+def compute_logical_to_rank_dispatch_physical_map(
+    server_args: ServerArgs,
+    logical_to_all_physical_map: torch.Tensor,
+    ep_size: int,
+    num_physical_experts: int,
+    ep_rank: int,
+    seed: int = 42,
+):
+    r = random.Random(seed)
+
+    num_local_gpu_physical_experts = num_physical_experts // ep_size
+    num_gpus_per_node = server_args.ep_size // server_args.nnodes
+    num_local_node_physical_experts = num_local_gpu_physical_experts * num_gpus_per_node
+    num_layers, num_logical_experts, _ = logical_to_all_physical_map.shape
+    dtype = logical_to_all_physical_map.dtype
+
+    logical_to_rank_dispatch_physical_map = torch.full(
+        size=(ep_size, num_layers, num_logical_experts),
+        fill_value=-1,
+        dtype=dtype,
+    )
+
+    for layer_id in range(num_layers):
+        for logical_expert_id in range(num_logical_experts):
+            candidate_physical_expert_ids = _logical_to_all_physical_raw(
+                logical_to_all_physical_map, layer_id, logical_expert_id
+            )
+            output_partial = logical_to_rank_dispatch_physical_map[
+                :, layer_id, logical_expert_id
+            ]
+
+            for moe_ep_rank in range(ep_size):
+                # Fill with the nearest physical expert
+                output_partial[moe_ep_rank] = _find_nearest_expert(
+                    candidate_physical_expert_ids=candidate_physical_expert_ids,
+                    num_local_gpu_physical_experts=num_local_gpu_physical_experts,
+                    moe_ep_rank=moe_ep_rank,
+                    num_gpus_per_node=num_gpus_per_node,
+                    num_local_node_physical_experts=num_local_node_physical_experts,
+                )
+
+            # Fill remaining slots with fair random choices
+            num_remain = torch.sum(output_partial == -1).item()
+            output_partial[output_partial == -1] = torch.tensor(
+                _fair_choices(candidate_physical_expert_ids, k=num_remain, r=r),
+                dtype=dtype,
+            )
+
+    assert torch.all(logical_to_rank_dispatch_physical_map != -1)
+
+    device = logical_to_all_physical_map.device
+    return logical_to_rank_dispatch_physical_map[ep_rank, :, :].to(device)
+
+
+def _logical_to_all_physical_raw(
+    logical_to_all_physical_map, layer_id: int, logical_expert_id: int
+) -> List[int]:
+    return [
+        physical_expert_id
+        for physical_expert_id in logical_to_all_physical_map[
+            layer_id, logical_expert_id
+        ].tolist()
+        if physical_expert_id != -1
+    ]
+
+
+def _compute_gpu_id_of_physical_expert(
+    physical_expert_id: int, num_local_gpu_physical_experts: int
+) -> int:
+    return physical_expert_id // num_local_gpu_physical_experts
+
+
+def _compute_node_id_of_physical_expert(
+    physical_expert_id: int, num_local_host_physical_experts: int
+) -> int:
+    return physical_expert_id // num_local_host_physical_experts
+
+
+def _find_nearest_expert(
+    candidate_physical_expert_ids: List[int],
+    num_local_gpu_physical_experts: int,
+    moe_ep_rank: int,
+    num_gpus_per_node: int,
+    num_local_node_physical_experts: int,
+) -> int:
+    # 1. If only one candidate, return it directly
+    if len(candidate_physical_expert_ids) == 1:
+        return candidate_physical_expert_ids[0]
+
+    # 2. Prefer same-GPU experts
+    same_gpu_physical_expert_ids = [
+        physical_expert_id
+        for physical_expert_id in candidate_physical_expert_ids
+        if _compute_gpu_id_of_physical_expert(
+            physical_expert_id, num_local_gpu_physical_experts
+        )
+        == moe_ep_rank
+    ]
+    if len(same_gpu_physical_expert_ids) > 0:
+        return same_gpu_physical_expert_ids[0]
+
+    # 3. Otherwise, prefer same-node experts
+    node_rank = moe_ep_rank // num_gpus_per_node
+    same_node_physical_expert_ids = [
+        physical_expert_id
+        for physical_expert_id in candidate_physical_expert_ids
+        if _compute_node_id_of_physical_expert(
+            physical_expert_id, num_local_node_physical_experts
+        )
+        == node_rank
+    ]
+    if len(same_node_physical_expert_ids) > 0:
+        return same_node_physical_expert_ids[0]
+
+    # 4. At last, leave it as -1 to indicate not found.
+    return -1
+
+
+def _fair_choices(arr: List, k: int, r: random.Random) -> List:
+    quotient, remainder = divmod(k, len(arr))
+    ans = arr * quotient + r.sample(arr, k=remainder)
+    r.shuffle(ans)
+    return ans
+
+
+@dataclass
+class ModelConfigForExpertLocation:
+    num_layers: int
+    num_logical_experts: int
+    num_groups: Optional[int] = None
+
+    @staticmethod
+    def from_model_config(model_config: ModelConfig):
+        model_class, _ = get_model_architecture(model_config)
+        if hasattr(model_class, "get_model_config_for_expert_location"):
+            return model_class.get_model_config_for_expert_location(
+                model_config.hf_config
+            )
+        else:
+            return None
+
+
+def compute_initial_expert_location_metadata(
+    server_args: ServerArgs,
+    model_config: ModelConfig,
+    moe_ep_rank: int,
+) -> Optional[ExpertLocationMetadata]:
+    data = server_args.init_expert_location
+    if data == "trivial":
+        return ExpertLocationMetadata.init_trivial(
+            server_args, model_config, moe_ep_rank
+        )
+
+    # TODO unify with the utils function
+    if data.endswith(".pt"):
+        data_dict = torch.load(data, weights_only=True)
+    elif data.endswith(".json"):
+        data_dict = json.loads(Path(data).read_text())
+    else:
+        data_dict = json.loads(data)
+
+    if "physical_to_logical_map" in data_dict:
+        logger.info(
+            "init_expert_location from init_by_mapping using ServerArgs.init_expert_location"
+        )
+        return ExpertLocationMetadata.init_by_mapping(
+            server_args,
+            model_config,
+            **data_dict,
+            moe_ep_rank=moe_ep_rank,
+        )
+    elif "logical_count" in data_dict:
+        logger.info(
+            "init_expert_location from init_by_eplb using ServerArgs.init_expert_location"
+        )
+        return ExpertLocationMetadata.init_by_eplb(
+            server_args, model_config, logical_count=data_dict["logical_count"]
+        )
+    else:
+        raise NotImplementedError(
+            f"Unknown init_expert_location format ({list(data_dict.keys())=})"
+        )
diff --git a/sglang/python/sglang/srt/eplb/expert_location_dispatch.py b/sglang/python/sglang/srt/eplb/expert_location_dispatch.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ac03390aa3f985a2cf2bbba4a2652cc2591fc9c
--- /dev/null
+++ b/sglang/python/sglang/srt/eplb/expert_location_dispatch.py
@@ -0,0 +1,109 @@
+# Copyright 2023-2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from dataclasses import dataclass
+from typing import Literal, Optional
+
+import torch
+
+from sglang.srt.eplb.expert_location import get_global_expert_location_metadata
+from sglang.srt.server_args import get_global_server_args
+
+
+@dataclass
+class ExpertLocationDispatchInfo:
+    ep_dispatch_algorithm: Literal["static", "random"]
+    # (num_logical_experts,)
+    partial_logical_to_rank_dispatch_physical_map: Optional[torch.Tensor]
+    # (num_logical_experts, X)
+    partial_logical_to_all_physical_map: torch.Tensor
+    # (num_logical_experts,)
+    partial_logical_to_all_physical_map_num_valid: torch.Tensor
+    num_physical_experts: int
+
+    @classmethod
+    def init_new(cls, layer_id: int):
+        ep_dispatch_algorithm = get_global_server_args().ep_dispatch_algorithm
+        expert_location_metadata = get_global_expert_location_metadata()
+        assert expert_location_metadata is not None
+
+        if ep_dispatch_algorithm is None:
+            return None
+
+        return cls(
+            ep_dispatch_algorithm=ep_dispatch_algorithm,
+            partial_logical_to_rank_dispatch_physical_map=(
+                expert_location_metadata.logical_to_rank_dispatch_physical_map[
+                    layer_id, :
+                ]
+                if expert_location_metadata.logical_to_rank_dispatch_physical_map
+                is not None
+                else None
+            ),
+            partial_logical_to_all_physical_map=expert_location_metadata.logical_to_all_physical_map[
+                layer_id, :
+            ],
+            partial_logical_to_all_physical_map_num_valid=expert_location_metadata.logical_to_all_physical_map_num_valid[
+                layer_id, :
+            ],
+            num_physical_experts=expert_location_metadata.num_physical_experts,
+        )
+
+
+def transform_select_experts_inputs(
+    router_logits: torch.Tensor,
+    correction_bias: Optional[torch.Tensor],
+    info: Optional[ExpertLocationDispatchInfo],
+):
+    if (info is not None) and (info.ep_dispatch_algorithm == "fake"):
+        router_logits.uniform_(5, 10)
+        if correction_bias is not None:
+            correction_bias = torch.zeros_like(correction_bias)
+    return router_logits, correction_bias
+
+
+def topk_ids_logical_to_physical(
+    topk_ids: torch.Tensor, info: Optional[ExpertLocationDispatchInfo]
+) -> torch.Tensor:
+    if info is None:
+        return topk_ids
+
+    if info.ep_dispatch_algorithm == "static":
+        return _topk_ids_logical_to_physical_static(topk_ids, info)
+    if info.ep_dispatch_algorithm in ["dynamic", "fake"]:
+        return _topk_ids_logical_to_physical_dynamic(topk_ids, info)
+    raise NotImplementedError(f"Unknown algorithm {info.ep_dispatch_algorithm}")
+
+
+def _topk_ids_logical_to_physical_static(
+    topk_ids: torch.Tensor, info: Optional[ExpertLocationDispatchInfo]
+) -> torch.Tensor:
+    return info.partial_logical_to_rank_dispatch_physical_map[topk_ids]
+
+
+def _topk_ids_logical_to_physical_dynamic(
+    topk_ids: torch.Tensor, info: Optional[ExpertLocationDispatchInfo]
+) -> torch.Tensor:
+    topk_ids_original_shape = topk_ids.shape
+    device = topk_ids.device
+    topk_ids = topk_ids.flatten()
+
+    chosen_dispatch_index = (
+        torch.randint(0, 65536, topk_ids.shape, dtype=torch.int32, device=device)
+        % info.partial_logical_to_all_physical_map_num_valid[topk_ids]
+    )
+    topk_ids = info.partial_logical_to_all_physical_map[topk_ids, chosen_dispatch_index]
+
+    topk_ids = topk_ids.view(topk_ids_original_shape)
+    return topk_ids
diff --git a/sglang/python/sglang/srt/eplb/expert_location_updater.py b/sglang/python/sglang/srt/eplb/expert_location_updater.py
new file mode 100644
index 0000000000000000000000000000000000000000..286f1d0e3c7ae8212e481a665a118561cf8a7eb5
--- /dev/null
+++ b/sglang/python/sglang/srt/eplb/expert_location_updater.py
@@ -0,0 +1,575 @@
+# Copyright 2023-2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import logging
+from collections import defaultdict
+from typing import Dict, List, Optional, Tuple
+
+import einops
+import torch
+import torch.distributed
+from torch.distributed import P2POp
+
+from sglang.srt.eplb.expert_location import (
+    ExpertLocationMetadata,
+    get_global_expert_location_metadata,
+)
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import get_bool_env_var
+
+logger = logging.getLogger(__name__)
+
+
+_LOG_INPUT = get_bool_env_var("SGLANG_EXPERT_LOCATION_UPDATER_LOG_INPUT")
+
+
+class ExpertLocationUpdater:
+    def __init__(self):
+        self._first_execution = True
+
+    def update(
+        self,
+        routed_experts_weights_of_layer: Dict[int, List[torch.Tensor]],
+        new_expert_location_metadata: ExpertLocationMetadata,
+        update_layer_ids: List[int],
+        nnodes: int,
+        rank: int,
+    ):
+        if self._first_execution:
+            self._first_execution = False
+            torch.get_device_module().empty_cache()
+
+        old_expert_location_metadata = get_global_expert_location_metadata()
+        assert old_expert_location_metadata is not None
+
+        _update_expert_weights(
+            routed_experts_weights_of_layer=routed_experts_weights_of_layer,
+            old_expert_location_metadata=old_expert_location_metadata,
+            new_expert_location_metadata=new_expert_location_metadata,
+            update_layer_ids=update_layer_ids,
+            nnodes=nnodes,
+            rank=rank,
+        )
+        old_expert_location_metadata.update(
+            new_expert_location_metadata,
+            update_layer_ids=update_layer_ids,
+        )
+
+
+def _update_expert_weights(**kwargs):
+    if get_bool_env_var("SGLANG_EXPERT_LOCATION_UPDATER_CANARY"):
+        return _update_expert_weights_with_canary(**kwargs)
+    else:
+        return _update_expert_weights_raw(**kwargs)
+
+
+# can add watchdog as well
+def _update_expert_weights_with_canary(
+    routed_experts_weights_of_layer: Dict[int, List[torch.Tensor]],
+    old_expert_location_metadata: ExpertLocationMetadata,
+    new_expert_location_metadata: ExpertLocationMetadata,
+    update_layer_ids: List[int],
+    nnodes: int,
+    rank: int,
+):
+    num_local_physical_experts = old_expert_location_metadata.num_local_physical_experts
+
+    def _get_canary_value(meta: ExpertLocationMetadata, layer_id: int):
+        return meta.physical_to_logical_map_cpu[
+            layer_id,
+            num_local_physical_experts * rank : num_local_physical_experts * (rank + 1),
+        ]
+
+    routed_experts_weights_of_layer = {
+        k: [x for x in v] for k, v in routed_experts_weights_of_layer.items()
+    }
+    for layer_id in update_layer_ids:
+        canary_tensor = (
+            _get_canary_value(old_expert_location_metadata, layer_id)
+            .clone()
+            .to(device=get_global_server_args().device, non_blocking=True)
+        )
+        routed_experts_weights_of_layer[layer_id].append(canary_tensor)
+
+    _update_expert_weights_raw(
+        routed_experts_weights_of_layer=routed_experts_weights_of_layer,
+        old_expert_location_metadata=old_expert_location_metadata,
+        new_expert_location_metadata=new_expert_location_metadata,
+        update_layer_ids=update_layer_ids,
+        nnodes=nnodes,
+        rank=rank,
+    )
+
+    for layer_id in update_layer_ids:
+        # can optimize speed if needed
+        expect_value = _get_canary_value(new_expert_location_metadata, layer_id)
+        actual_value = routed_experts_weights_of_layer[layer_id][-1].cpu()
+        assert torch.all(expect_value == actual_value), (
+            f"{expect_value=} {actual_value=} {layer_id=} "
+            f"{old_expert_location_metadata.physical_to_logical_map_cpu.tolist()=} "
+            f"{new_expert_location_metadata.physical_to_logical_map_cpu.tolist()=} "
+        )
+
+
+def _update_expert_weights_raw(
+    routed_experts_weights_of_layer: Dict[int, List[torch.Tensor]],
+    old_expert_location_metadata: ExpertLocationMetadata,
+    new_expert_location_metadata: ExpertLocationMetadata,
+    update_layer_ids: List[int],
+    nnodes: int,
+    rank: int,
+):
+    log_metrics = get_bool_env_var("SGLANG_EXPERT_LOCATION_UPDATER_LOG_METRICS")
+
+    temp_buffers = create_temp_buffers(
+        routed_experts_weights_of_layer[update_layer_ids[0]]
+    )
+
+    world_size = torch.distributed.get_world_size()
+    num_local_physical_experts = old_expert_location_metadata.num_local_physical_experts
+    num_gpu_per_node = world_size // nnodes
+
+    for layer_id in update_layer_ids:
+        update_expert_weights_single_layer(
+            routed_experts_weights=routed_experts_weights_of_layer[layer_id],
+            temp_buffers=temp_buffers,
+            old_physical_to_logical_map=old_expert_location_metadata.physical_to_logical_map_cpu[
+                layer_id
+            ].tolist(),
+            new_physical_to_logical_map=new_expert_location_metadata.physical_to_logical_map_cpu[
+                layer_id
+            ].tolist(),
+            num_local_physical_experts=num_local_physical_experts,
+            num_gpu_per_node=num_gpu_per_node,
+            rank=rank,
+            world_size=world_size,
+            log_metrics=log_metrics,
+        )
+
+
+def create_temp_buffers(sample_tensors):
+    return [torch.empty_like(tensor) for tensor in sample_tensors]
+
+
+def update_expert_weights_single_layer(
+    routed_experts_weights: List[torch.Tensor],
+    temp_buffers: List[torch.Tensor],
+    old_physical_to_logical_map: List[int],  # (num_physical_Experts,)
+    new_physical_to_logical_map: List[int],  # (num_physical_Experts,)
+    num_local_physical_experts: int,
+    num_gpu_per_node: int,
+    rank: int,
+    world_size: Optional[int] = None,
+    debug: bool = False,
+    log_metrics: bool = False,
+):
+    assert all(
+        tensor.shape[0] == num_local_physical_experts
+        for tensor in routed_experts_weights
+    ), f"{num_local_physical_experts=} {[x.shape for x in routed_experts_weights]=}"
+    assert isinstance(old_physical_to_logical_map, list)
+    assert isinstance(new_physical_to_logical_map, list)
+
+    if _LOG_INPUT:
+        logger.info(
+            "update_expert_weights_single_layer "
+            f"{[x.shape for x in routed_experts_weights]=} "
+            f"{[x.shape for x in temp_buffers]=} "
+            f"{old_physical_to_logical_map=} "
+            f"{new_physical_to_logical_map=} "
+            f"{num_local_physical_experts=} "
+            f"{num_gpu_per_node=} "
+            f"{rank=} "
+            f"{world_size=} "
+        )
+
+    output_logs = [] if debug else None
+
+    num_physical_experts = len(old_physical_to_logical_map)
+    num_tensors = len(routed_experts_weights)
+
+    self_node_id = rank // num_gpu_per_node
+
+    local_expert_location_range = (
+        rank * num_local_physical_experts,
+        (rank + 1) * num_local_physical_experts,
+    )
+
+    def _entrypoint():
+        # List[Tuple[logical_expert_id, List[P2POp]]]
+        p2p_op_infos: List[Tuple[int, List[P2POp]]] = []
+        # List[Tuple[temp_buffers_expert_location, routed_experts_weights_expert_location]]
+        buffer2weight_copy_infos: List[Tuple[int, int]] = []
+
+        _handle_recv(buffer2weight_copy_infos, p2p_op_infos)
+        _create_isend_ops(p2p_op_infos)
+        _execute_p2p_ops(p2p_op_infos)
+        _execute_buffer2weight_copies(buffer2weight_copy_infos)
+
+        if log_metrics:
+            _log_p2p_op_metrics(
+                p2p_op_infos,
+                world_size=world_size,
+                num_gpu_per_node=num_gpu_per_node,
+                self_node_id=self_node_id,
+            )
+
+        if debug:
+            output_logs.append(f"{p2p_op_infos=}")
+            output_logs.append(f"{buffer2weight_copy_infos=}")
+
+    def _handle_recv(buffer2weight_copy_infos, p2p_op_infos):
+        for dst_expert_location in range(*local_expert_location_range):
+            _handle_recv_of_dst_expert_location(
+                dst_expert_location, buffer2weight_copy_infos, p2p_op_infos
+            )
+
+    def _handle_recv_of_dst_expert_location(
+        dst_expert_location: int, buffer2weight_copy_infos, p2p_op_infos
+    ):
+        logical_expert_id = new_physical_to_logical_map[dst_expert_location]
+
+        # case 1: unchanged
+        if old_physical_to_logical_map[dst_expert_location] == logical_expert_id:
+            if debug:
+                output_logs.append(
+                    f"handle_recv_of_dst_expert_location {dst_expert_location=} case=unchanged"
+                )
+            return
+
+        # case 2: same-gpu
+        for src_expert_location in range(*local_expert_location_range):
+            if old_physical_to_logical_map[src_expert_location] == logical_expert_id:
+                for i in range(num_tensors):
+                    _get_tensor(temp_buffers, i, dst_expert_location).copy_(
+                        _get_tensor(routed_experts_weights, i, src_expert_location)
+                    )
+                buffer2weight_copy_infos.append(
+                    (dst_expert_location, dst_expert_location)
+                )
+                if debug:
+                    output_logs.append(
+                        f"handle_recv_of_dst_expert_location {dst_expert_location=} case=same-gpu {src_expert_location=}"
+                    )
+                return
+
+        # case 3: free-rider
+        for src_expert_location in range(
+            rank * num_local_physical_experts, dst_expert_location
+        ):
+            if new_physical_to_logical_map[src_expert_location] == logical_expert_id:
+                buffer2weight_copy_infos.append(
+                    (src_expert_location, dst_expert_location)
+                )
+                if debug:
+                    output_logs.append(
+                        f"handle_recv_of_dst_expert_location {dst_expert_location=} case=free-rider {src_expert_location=}"
+                    )
+                return
+
+        same_node_mapping, cross_node_mapping, need_comm_self_node_dst_ranks = (
+            _compute_comm_info(logical_expert_id=logical_expert_id)
+        )
+
+        # case 4: same-node
+        if rank in need_comm_self_node_dst_ranks:
+            chosen_src_rank = same_node_mapping.chunk_value_from_element_value(
+                element_value=rank
+            )
+            _create_p2p_recv_and_buffer2weight_copy(
+                buffer2weight_copy_infos,
+                p2p_op_infos,
+                src_rank=chosen_src_rank,
+                logical_expert_id=logical_expert_id,
+                dst_expert_location=dst_expert_location,
+            )
+            if debug:
+                output_logs.append(
+                    f"handle_recv_of_dst_expert_location {dst_expert_location=} case=same-node {chosen_src_rank=}"
+                )
+            return
+
+        # case 5: cross-node
+        # Future work: can optimize when there are multiple ranks in the same dst node that uses the same logical expert
+        chosen_src_rank = cross_node_mapping.chunk_value_from_element_value(
+            element_value=rank
+        )
+        _create_p2p_recv_and_buffer2weight_copy(
+            buffer2weight_copy_infos,
+            p2p_op_infos,
+            src_rank=chosen_src_rank,
+            logical_expert_id=logical_expert_id,
+            dst_expert_location=dst_expert_location,
+        )
+        if debug:
+            output_logs.append(
+                f"handle_recv_of_dst_expert_location {dst_expert_location=} case=cross-node {chosen_src_rank=}"
+            )
+        return
+
+    def _create_p2p_recv_and_buffer2weight_copy(
+        buffer2weight_copy_infos,
+        p2p_op_infos,
+        *,
+        logical_expert_id: int,
+        src_rank: int,
+        dst_expert_location: int,
+    ):
+        p2p_op_infos.append(
+            (
+                logical_expert_id,
+                [
+                    P2POp(
+                        op=torch.distributed.irecv,
+                        tensor=_get_tensor(temp_buffers, i, dst_expert_location),
+                        peer=src_rank,
+                    )
+                    for i in range(num_tensors)
+                ],
+            )
+        )
+        buffer2weight_copy_infos.append((dst_expert_location, dst_expert_location))
+
+    def _create_isend_ops(p2p_op_infos):
+        handled_logical_expert_ids = set()
+        for src_expert_location in range(*local_expert_location_range):
+            logical_expert_id = old_physical_to_logical_map[src_expert_location]
+
+            if logical_expert_id in handled_logical_expert_ids:
+                continue
+            handled_logical_expert_ids.add(logical_expert_id)
+
+            _create_isend_ops_of_logical_expert_id(
+                logical_expert_id, src_expert_location, p2p_op_infos
+            )
+
+    def _create_isend_ops_of_logical_expert_id(
+        logical_expert_id, src_expert_location, p2p_op_infos
+    ):
+        same_node_mapping, cross_node_mapping, need_comm_self_node_dst_ranks = (
+            _compute_comm_info(logical_expert_id=logical_expert_id)
+        )
+
+        same_node_dst_ranks = same_node_mapping.element_values_from_chunk_value(
+            chunk_value=rank
+        )
+        cross_node_dst_ranks = cross_node_mapping.element_values_from_chunk_value(
+            chunk_value=rank
+        )
+        all_dst_ranks = same_node_dst_ranks + cross_node_dst_ranks
+
+        if debug:
+            output_logs.append(
+                f"create_isend_ops_of_logical_expert_id {logical_expert_id=} {src_expert_location=} {same_node_dst_ranks=} {cross_node_dst_ranks=}"
+            )
+
+        p2p_op_infos.append(
+            (
+                logical_expert_id,
+                [
+                    P2POp(
+                        op=torch.distributed.isend,
+                        tensor=_get_tensor(
+                            routed_experts_weights, i, src_expert_location
+                        ),
+                        peer=dst_rank,
+                    )
+                    for dst_rank in all_dst_ranks
+                    for i in range(num_tensors)
+                ],
+            )
+        )
+
+    def _compute_comm_info(logical_expert_id: int):
+        all_src_ranks = _deduplicate_ordered(
+            [
+                x // num_local_physical_experts
+                for x in range(num_physical_experts)
+                if old_physical_to_logical_map[x] == logical_expert_id
+            ]
+        )
+        all_src_nodes = [x // num_gpu_per_node for x in all_src_ranks]
+        self_node_src_ranks = [
+            x for x in all_src_ranks if x // num_gpu_per_node == self_node_id
+        ]
+
+        need_comm_dst_ranks = _deduplicate_ordered(
+            [
+                x // num_local_physical_experts
+                for x in range(num_physical_experts)
+                if new_physical_to_logical_map[x] == logical_expert_id
+                and x // num_local_physical_experts not in all_src_ranks
+            ]
+        )
+        need_comm_self_node_dst_ranks = (
+            [x for x in need_comm_dst_ranks if x // num_gpu_per_node == self_node_id]
+            if len(self_node_src_ranks) > 0
+            else []
+        )
+        need_comm_cross_node_dst_ranks = [
+            x
+            for x in need_comm_dst_ranks
+            if (x // num_gpu_per_node) not in all_src_nodes
+        ]
+
+        same_node_mapping = _ChunkUtils(
+            chunk_values=self_node_src_ranks,
+            element_values=need_comm_self_node_dst_ranks,
+        )
+
+        cross_node_mapping = _ChunkUtils(
+            chunk_values=all_src_ranks,
+            element_values=need_comm_cross_node_dst_ranks,
+        )
+
+        return same_node_mapping, cross_node_mapping, need_comm_self_node_dst_ranks
+
+    def _execute_p2p_ops(p2p_op_infos):
+        sorted_infos = sorted(p2p_op_infos, key=lambda info: info[0])
+        p2p_ops = [op for _, ops in sorted_infos for op in ops]
+        if len(p2p_ops) == 0:
+            return
+
+        reqs = torch.distributed.batch_isend_irecv(p2p_ops)
+        for req in reqs:
+            req.wait()
+
+    def _execute_buffer2weight_copies(buffer2weight_copy_infos):
+        for (
+            temp_buffers_expert_location,
+            routed_experts_weights_expert_location,
+        ) in buffer2weight_copy_infos:
+            for i in range(num_tensors):
+                _get_tensor(
+                    routed_experts_weights, i, routed_experts_weights_expert_location
+                ).copy_(_get_tensor(temp_buffers, i, temp_buffers_expert_location))
+
+    def _get_tensor(tensors, tensor_index: int, expert_location: int) -> torch.Tensor:
+        return tensors[tensor_index][_get_local_expert_location(expert_location)]
+
+    def _get_local_expert_location(expert_location: int) -> int:
+        assert (
+            local_expert_location_range[0]
+            <= expert_location
+            < local_expert_location_range[1]
+        )
+        return expert_location % num_local_physical_experts
+
+    _entrypoint()
+
+    return output_logs
+
+
+class _ChunkUtils:
+    def __init__(self, *, chunk_values: List, element_values: List):
+        self.chunk_values = chunk_values
+        self.element_values = element_values
+
+    def chunk_value_from_element_value(self, element_value):
+        chunk_index = self._chunk_index_from_element_index(
+            num_elements=len(self.element_values),
+            num_chunks=len(self.chunk_values),
+            element_index=self.element_values.index(element_value),
+        )
+        return self.chunk_values[chunk_index]
+
+    def element_values_from_chunk_value(self, chunk_value) -> List:
+        if len(self.element_values) == 0:
+            return []
+        element_slice = self._element_slice_from_chunk_index(
+            num_elements=len(self.element_values),
+            num_chunks=len(self.chunk_values),
+            chunk_index=self.chunk_values.index(chunk_value),
+        )
+        return self.element_values[element_slice]
+
+    @staticmethod
+    def _chunk_index_from_element_index(
+        num_elements: int, num_chunks: int, element_index: int
+    ) -> int:
+        short_chunk_size, num_long_chunks = divmod(num_elements, num_chunks)
+        num_elements_for_long_chunks = num_long_chunks * (short_chunk_size + 1)
+        if element_index < num_elements_for_long_chunks:
+            return element_index // (short_chunk_size + 1)
+        else:
+            return (
+                num_long_chunks
+                + (element_index - num_elements_for_long_chunks) // short_chunk_size
+            )
+
+    @staticmethod
+    def _element_slice_from_chunk_index(
+        num_elements: int, num_chunks: int, chunk_index: int
+    ) -> slice:
+        short_chunk_size, num_long_chunks = divmod(num_elements, num_chunks)
+        start = chunk_index * short_chunk_size + min(chunk_index, num_long_chunks)
+        end = start + short_chunk_size + int(chunk_index < num_long_chunks)
+        return slice(start, end)
+
+
+def _deduplicate_ordered(arr: List[int]):
+    output = []
+    for item in arr:
+        if len(output) == 0 or item != output[-1]:
+            output.append(item)
+    return output
+
+
+def _log_p2p_op_metrics(
+    p2p_op_infos: List[Tuple[int, List[P2POp]]],
+    num_gpu_per_node: int,
+    world_size: int,
+    self_node_id: int,
+):
+    text = ""
+    all_ops = [op for _, ops in p2p_op_infos for op in ops]
+
+    for direction, ops in _group_by(all_ops, _get_direction_from_op).items():
+        nbytes_of_gpu = [0] * world_size
+        for op in ops:
+            nbytes_of_gpu[op.peer] += op.tensor.nbytes
+        nbytes_of_gpu = torch.tensor(nbytes_of_gpu, dtype=torch.int64)
+
+        nbytes_of_node = einops.reduce(
+            nbytes_of_gpu,
+            "(num_nodes num_gpu_per_node) -> num_nodes",
+            num_gpu_per_node=num_gpu_per_node,
+            reduction="sum",
+        )
+
+        nbytes_curr_node = nbytes_of_node[self_node_id]
+        nbytes_cross_node = torch.sum(nbytes_of_node) - nbytes_curr_node
+
+        text += (
+            f"{direction}_nbytes_of_gpu={nbytes_of_gpu.tolist()} "
+            f"{direction}_nbytes_of_node={nbytes_of_node.tolist()} "
+            f"{direction}_nbytes_curr_node={nbytes_curr_node.item()} "
+            f"{direction}_nbytes_cross_node={nbytes_cross_node.item()} "
+        )
+
+    logger.info(f"[ExpertLocationUpdater] {text}")
+
+
+def _get_direction_from_op(op: P2POp):
+    if op.op == torch.distributed.isend:
+        return "isend"
+    if op.op == torch.distributed.irecv:
+        return "irecv"
+    raise NotImplementedError
+
+
+def _group_by(items, keyfunc):
+    ans = defaultdict(list)
+    for item in items:
+        ans[keyfunc(item)].append(item)
+    return dict(ans)
diff --git a/sglang/python/sglang/srt/function_call/__pycache__/base_format_detector.cpython-311.pyc b/sglang/python/sglang/srt/function_call/__pycache__/base_format_detector.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..98da322d275397dce595af98a1a83fabf47ffb47
Binary files /dev/null and b/sglang/python/sglang/srt/function_call/__pycache__/base_format_detector.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/function_call/__pycache__/core_types.cpython-311.pyc b/sglang/python/sglang/srt/function_call/__pycache__/core_types.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2986d197363005485ffe7ec67b407d10b7370825
Binary files /dev/null and b/sglang/python/sglang/srt/function_call/__pycache__/core_types.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/function_call/__pycache__/deepseekv31_detector.cpython-311.pyc b/sglang/python/sglang/srt/function_call/__pycache__/deepseekv31_detector.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..17577adec05b3fdf8073ee953dc7697d4d524d00
Binary files /dev/null and b/sglang/python/sglang/srt/function_call/__pycache__/deepseekv31_detector.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/function_call/__pycache__/deepseekv32_detector.cpython-311.pyc b/sglang/python/sglang/srt/function_call/__pycache__/deepseekv32_detector.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5f5edac264abdc7ea97c73fee60daedf3a7ecacd
Binary files /dev/null and b/sglang/python/sglang/srt/function_call/__pycache__/deepseekv32_detector.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/function_call/__pycache__/deepseekv3_detector.cpython-311.pyc b/sglang/python/sglang/srt/function_call/__pycache__/deepseekv3_detector.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..981ae37ee6c7d918ae644f01693acf0df48bcda3
Binary files /dev/null and b/sglang/python/sglang/srt/function_call/__pycache__/deepseekv3_detector.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/function_call/__pycache__/function_call_parser.cpython-311.pyc b/sglang/python/sglang/srt/function_call/__pycache__/function_call_parser.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4765dc5c9c8eb8ad22aeee54ce20a49516985f52
Binary files /dev/null and b/sglang/python/sglang/srt/function_call/__pycache__/function_call_parser.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/function_call/__pycache__/gigachat3_detector.cpython-311.pyc b/sglang/python/sglang/srt/function_call/__pycache__/gigachat3_detector.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f712100d8200c77bc0bb4281082dc0f8e8408998
Binary files /dev/null and b/sglang/python/sglang/srt/function_call/__pycache__/gigachat3_detector.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/function_call/__pycache__/glm47_moe_detector.cpython-311.pyc b/sglang/python/sglang/srt/function_call/__pycache__/glm47_moe_detector.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..faaf50978b0a6c8c5677819af50f757a660a2850
Binary files /dev/null and b/sglang/python/sglang/srt/function_call/__pycache__/glm47_moe_detector.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/function_call/__pycache__/glm4_moe_detector.cpython-311.pyc b/sglang/python/sglang/srt/function_call/__pycache__/glm4_moe_detector.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b8f4c505c37d0f8af986a1fa7871abc250c5beea
Binary files /dev/null and b/sglang/python/sglang/srt/function_call/__pycache__/glm4_moe_detector.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/function_call/__pycache__/gpt_oss_detector.cpython-311.pyc b/sglang/python/sglang/srt/function_call/__pycache__/gpt_oss_detector.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..acc4b579cbd1c251a6c72525066f461af464a4f6
Binary files /dev/null and b/sglang/python/sglang/srt/function_call/__pycache__/gpt_oss_detector.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/function_call/__pycache__/hermes_detector.cpython-311.pyc b/sglang/python/sglang/srt/function_call/__pycache__/hermes_detector.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8fcdfb1127eb07beeeb1f26089f85d5bcf62b60e
Binary files /dev/null and b/sglang/python/sglang/srt/function_call/__pycache__/hermes_detector.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/function_call/__pycache__/internlm_detector.cpython-311.pyc b/sglang/python/sglang/srt/function_call/__pycache__/internlm_detector.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..12a8139f7ea708a2fe88e6a6977908a5083d7914
Binary files /dev/null and b/sglang/python/sglang/srt/function_call/__pycache__/internlm_detector.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/function_call/__pycache__/json_array_parser.cpython-311.pyc b/sglang/python/sglang/srt/function_call/__pycache__/json_array_parser.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..304738951ac2b1a77ba5baa2567c693a019f195e
Binary files /dev/null and b/sglang/python/sglang/srt/function_call/__pycache__/json_array_parser.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/function_call/__pycache__/kimik2_detector.cpython-311.pyc b/sglang/python/sglang/srt/function_call/__pycache__/kimik2_detector.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f9e2919a14c4064a1337255889d9d9c59ebad245
Binary files /dev/null and b/sglang/python/sglang/srt/function_call/__pycache__/kimik2_detector.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/function_call/__pycache__/lfm2_detector.cpython-311.pyc b/sglang/python/sglang/srt/function_call/__pycache__/lfm2_detector.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..861adff0d72c7b5db53838d4a0c866c32e21adc1
Binary files /dev/null and b/sglang/python/sglang/srt/function_call/__pycache__/lfm2_detector.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/function_call/__pycache__/llama32_detector.cpython-311.pyc b/sglang/python/sglang/srt/function_call/__pycache__/llama32_detector.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..31e50691ba42cbad28345050ff4a713aabdcb8d4
Binary files /dev/null and b/sglang/python/sglang/srt/function_call/__pycache__/llama32_detector.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/function_call/__pycache__/mimo_detector.cpython-311.pyc b/sglang/python/sglang/srt/function_call/__pycache__/mimo_detector.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..354a6f669979e16a042dc2f6a2ad62bcc938bdf5
Binary files /dev/null and b/sglang/python/sglang/srt/function_call/__pycache__/mimo_detector.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/function_call/__pycache__/minimax_m2.cpython-311.pyc b/sglang/python/sglang/srt/function_call/__pycache__/minimax_m2.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..edba0a6f6450b5c367dffdf7f41d0233a925801a
Binary files /dev/null and b/sglang/python/sglang/srt/function_call/__pycache__/minimax_m2.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/function_call/__pycache__/mistral_detector.cpython-311.pyc b/sglang/python/sglang/srt/function_call/__pycache__/mistral_detector.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0f1e8edbbbe6df35c7d637a676ea4e4255f3894e
Binary files /dev/null and b/sglang/python/sglang/srt/function_call/__pycache__/mistral_detector.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/function_call/__pycache__/pythonic_detector.cpython-311.pyc b/sglang/python/sglang/srt/function_call/__pycache__/pythonic_detector.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e691beea08bb8c2cc628a7e5f12c3f909106dd59
Binary files /dev/null and b/sglang/python/sglang/srt/function_call/__pycache__/pythonic_detector.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/function_call/__pycache__/qwen25_detector.cpython-311.pyc b/sglang/python/sglang/srt/function_call/__pycache__/qwen25_detector.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4819471212cc41b1465932797b73574720a533ed
Binary files /dev/null and b/sglang/python/sglang/srt/function_call/__pycache__/qwen25_detector.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/function_call/__pycache__/qwen3_coder_detector.cpython-311.pyc b/sglang/python/sglang/srt/function_call/__pycache__/qwen3_coder_detector.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f5f78e7851c55b2c20f023b18286ea785ab4da31
Binary files /dev/null and b/sglang/python/sglang/srt/function_call/__pycache__/qwen3_coder_detector.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/function_call/__pycache__/step3_detector.cpython-311.pyc b/sglang/python/sglang/srt/function_call/__pycache__/step3_detector.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e78d1c613f88226c5abae36609861ff1b090225e
Binary files /dev/null and b/sglang/python/sglang/srt/function_call/__pycache__/step3_detector.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/function_call/__pycache__/trinity_detector.cpython-311.pyc b/sglang/python/sglang/srt/function_call/__pycache__/trinity_detector.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..017b09fe6f9bd62f961d70aa424dd355c25aef9e
Binary files /dev/null and b/sglang/python/sglang/srt/function_call/__pycache__/trinity_detector.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/function_call/__pycache__/utils.cpython-311.pyc b/sglang/python/sglang/srt/function_call/__pycache__/utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0d88c2bab002aabf33c0b1a652afbfd03fa88b47
Binary files /dev/null and b/sglang/python/sglang/srt/function_call/__pycache__/utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/function_call/base_format_detector.py b/sglang/python/sglang/srt/function_call/base_format_detector.py
new file mode 100644
index 0000000000000000000000000000000000000000..8022dbe076e2bef0e3df5ae26622d0150552cde0
--- /dev/null
+++ b/sglang/python/sglang/srt/function_call/base_format_detector.py
@@ -0,0 +1,346 @@
+import json
+import logging
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List
+
+import orjson
+from partial_json_parser.core.exceptions import MalformedJSON
+from partial_json_parser.core.options import Allow
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.environ import envs
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    ToolCallItem,
+    _GetInfoFunc,
+)
+from sglang.srt.function_call.utils import (
+    _find_common_prefix,
+    _is_complete_json,
+    _partial_json_loads,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class BaseFormatDetector(ABC):
+    """Base class providing two sets of interfaces: one-time and streaming incremental."""
+
+    def __init__(self):
+        # Streaming state management
+        # Buffer for accumulating incomplete patterns that arrive across multiple streaming chunks
+        self._buffer = ""
+        # Stores complete tool call info (name and arguments) for each tool being parsed.
+        # Used by serving layer for completion handling when streaming ends.
+        # Format: [{"name": str, "arguments": dict}, ...]
+        self.prev_tool_call_arr: List[Dict] = []
+        # Index of currently streaming tool call. Starts at -1 (no active tool),
+        # increments as each tool completes. Tracks which tool's arguments are streaming.
+        self.current_tool_id: int = -1
+        # Flag for whether current tool's name has been sent to client.
+        # Tool names sent first with empty parameters, then arguments stream incrementally.
+        self.current_tool_name_sent: bool = False
+        # Tracks raw JSON string content streamed to client for each tool's arguments.
+        # Critical for serving layer to calculate remaining content when streaming ends.
+        # Each index corresponds to a tool_id. Example: ['{"location": "San Francisco"', '{"temp": 72']
+        self.streamed_args_for_tool: List[str] = []
+
+        # Token configuration (override in subclasses)
+        self.bot_token = ""
+        self.eot_token = ""
+        self.tool_call_separator = ", "
+
+    def _get_tool_indices(self, tools: List[Tool]) -> Dict[str, int]:
+        """
+        Get a mapping of tool names to their indices in the tools list.
+
+        This utility method creates a dictionary mapping function names to their
+        indices in the tools list, which is commonly needed for tool validation
+        and ToolCallItem creation.
+
+        Args:
+            tools: List of available tools
+
+        Returns:
+            Dictionary mapping tool names to their indices
+        """
+        return {
+            tool.function.name: i for i, tool in enumerate(tools) if tool.function.name
+        }
+
+    def parse_base_json(self, action: Any, tools: List[Tool]) -> List[ToolCallItem]:
+        tool_indices = self._get_tool_indices(tools)
+        if not isinstance(action, list):
+            action = [action]
+
+        results = []
+        for act in action:
+            name = act.get("name")
+            if not (name and name in tool_indices):
+                logger.warning(f"Model attempted to call undefined function: {name}")
+                if not envs.SGLANG_FORWARD_UNKNOWN_TOOLS.get():
+                    continue  # Skip unknown tools (default legacy behavior)
+
+            results.append(
+                ToolCallItem(
+                    tool_index=tool_indices.get(name, -1),
+                    name=name,
+                    parameters=json.dumps(
+                        act.get("parameters") or act.get("arguments", {}),
+                        ensure_ascii=False,
+                    ),
+                )
+            )
+
+        return results
+
+    @abstractmethod
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """
+        Parses the text in one go. Returns success=True if the format matches, otherwise False.
+        Note that leftover_text here represents "content that this parser will not consume further".
+        """
+        action = orjson.loads(text)
+        return StreamingParseResult(calls=self.parse_base_json(action, tools))
+
+    def _ends_with_partial_token(self, buffer: str, bot_token: str) -> int:
+        """
+        Check if buffer ends with a partial bot_token.
+        Return the length of the partial bot_token.
+
+        For some format, the bot_token is not a token in model's vocabulary, such as
+        `[TOOL_CALLS] [` in Mistral.
+        """
+        for i in range(1, min(len(buffer) + 1, len(bot_token))):
+            if bot_token.startswith(buffer[-i:]):
+                return i
+        return 0
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming incremental parsing with tool validation.
+
+        This base implementation works best with formats where:
+        1. bot_token is followed immediately by JSON (e.g., bot_token + JSON_array)
+        2. JSON can be parsed incrementally using partial_json_loads
+        3. Multiple tool calls are separated by "; " or ", "
+
+        Examples of incompatible formats (need custom implementation, may reuse some logic from this class):
+        - Each tool call is wrapped in a separate block: See Qwen25Detector
+        - Multiple separate blocks: [TOOL_CALLS] [...] \n [TOOL_CALLS] [...]
+        - Tool call is Pythonic style
+
+        For incompatible formats, detectors should override this method with custom logic.
+        """
+        # Append new text to buffer
+        self._buffer += new_text
+        current_text = self._buffer
+
+        # The current_text has tool_call if it is the start of a new tool call sequence
+        # or it is the start of a new tool call after a tool call separator, when there is a previous tool call
+        if not (
+            self.has_tool_call(current_text)
+            or (
+                self.current_tool_id > 0
+                and current_text.startswith(self.tool_call_separator)
+            )
+        ):
+            # Only clear buffer if we're sure no tool call is starting
+            if not self._ends_with_partial_token(self._buffer, self.bot_token):
+                normal_text = self._buffer
+                self._buffer = ""
+                if self.eot_token in normal_text:
+                    normal_text = normal_text.replace(self.eot_token, "")
+                return StreamingParseResult(normal_text=normal_text)
+            else:
+                # Might be partial bot_token, keep buffering
+                return StreamingParseResult()
+
+        # Build tool indices if not already built
+        if not hasattr(self, "_tool_indices"):
+            self._tool_indices = self._get_tool_indices(tools)
+
+        flags = Allow.ALL if self.current_tool_name_sent else Allow.ALL & ~Allow.STR
+
+        try:
+            try:
+                # Priority check: if we're processing a subsequent tool (current_tool_id > 0),
+                # first check if text starts with the tool separator. This is critical for
+                # parallel tool calls because the bot_token (e.g., '[') can also
+                # appear inside array parameters of the current tool, and we must not
+                # mistakenly identify that as the start of a new tool.
+                if self.current_tool_id > 0 and current_text.startswith(
+                    self.tool_call_separator
+                ):
+                    start_idx = len(self.tool_call_separator)
+                else:
+                    # Only search for bot_token if not processing subsequent tool
+                    tool_call_pos = current_text.find(self.bot_token)
+                    if tool_call_pos != -1:
+                        start_idx = tool_call_pos + len(self.bot_token)
+                    else:
+                        start_idx = 0
+
+                if start_idx >= len(current_text):
+                    return StreamingParseResult()
+
+                obj, end_idx = _partial_json_loads(current_text[start_idx:], flags)
+
+                is_current_complete = _is_complete_json(
+                    current_text[start_idx : start_idx + end_idx]
+                )
+
+                # Validate tool name if present
+                if "name" in obj and obj["name"] not in self._tool_indices:
+                    # Invalid tool name - reset state
+                    self._buffer = ""
+                    self.current_tool_id = -1
+                    self.current_tool_name_sent = False
+                    if self.streamed_args_for_tool:
+                        self.streamed_args_for_tool.pop()
+                    return StreamingParseResult()
+
+                # Handle parameters/arguments consistency
+                # NOTE: we assume here that the obj is always partial of a single tool call
+                if "parameters" in obj:
+                    assert (
+                        "arguments" not in obj
+                    ), "model generated both parameters and arguments"
+                    obj["arguments"] = obj["parameters"]
+
+                current_tool_call = obj
+
+            except MalformedJSON:
+                return StreamingParseResult()
+
+            if not current_tool_call:
+                return StreamingParseResult()
+
+            # Case 1: Handle tool name streaming
+            # This happens when we encounter a tool but haven't sent its name yet
+            if not self.current_tool_name_sent:
+                function_name = current_tool_call.get("name")
+
+                if function_name and function_name in self._tool_indices:
+                    # If this is a new tool (current_tool_id was -1), initialize it
+                    if self.current_tool_id == -1:
+                        self.current_tool_id = 0
+                        self.streamed_args_for_tool.append("")
+                    # If this is a subsequent tool, ensure streamed_args_for_tool is large enough
+                    elif self.current_tool_id >= len(self.streamed_args_for_tool):
+                        while len(self.streamed_args_for_tool) <= self.current_tool_id:
+                            self.streamed_args_for_tool.append("")
+
+                    # Send the tool name with empty parameters
+                    res = StreamingParseResult(
+                        calls=[
+                            ToolCallItem(
+                                tool_index=self.current_tool_id,
+                                name=function_name,
+                                parameters="",
+                            )
+                        ],
+                    )
+                    self.current_tool_name_sent = True
+                else:
+                    res = StreamingParseResult()
+
+            # Case 2: Handle streaming arguments
+            # This happens when we've already sent the tool name and now need to stream arguments incrementally
+            else:
+                cur_arguments = current_tool_call.get("arguments")
+                res = StreamingParseResult()
+
+                if cur_arguments:
+                    # Calculate how much of the arguments we've already streamed
+                    sent = len(self.streamed_args_for_tool[self.current_tool_id])
+                    cur_args_json = json.dumps(cur_arguments, ensure_ascii=False)
+                    prev_arguments = None
+                    if self.current_tool_id < len(self.prev_tool_call_arr):
+                        prev_arguments = self.prev_tool_call_arr[
+                            self.current_tool_id
+                        ].get("arguments")
+
+                    argument_diff = None
+
+                    # If the current tool's JSON is complete, send all remaining arguments
+                    if is_current_complete:
+                        argument_diff = cur_args_json[sent:]
+                        completing_tool_id = (
+                            self.current_tool_id
+                        )  # Save the ID of the tool that's completing
+
+                        # Only remove the processed portion, keep unprocessed content
+                        self._buffer = current_text[start_idx + end_idx :]
+
+                    # If the tool is still being parsed, send incremental changes
+                    elif prev_arguments:
+                        prev_args_json = json.dumps(prev_arguments, ensure_ascii=False)
+                        if cur_args_json != prev_args_json:
+                            prefix = _find_common_prefix(prev_args_json, cur_args_json)
+                            argument_diff = prefix[sent:]
+
+                    # Update prev_tool_call_arr with current state
+                    if self.current_tool_id >= 0:
+                        # Ensure prev_tool_call_arr is large enough
+                        while len(self.prev_tool_call_arr) <= self.current_tool_id:
+                            self.prev_tool_call_arr.append({})
+                        self.prev_tool_call_arr[self.current_tool_id] = (
+                            current_tool_call
+                        )
+
+                    # Advance to next tool if complete
+                    if is_current_complete:
+                        self.current_tool_name_sent = False
+                        self.current_tool_id += 1
+
+                    # Send the argument diff if there's something new
+                    if argument_diff is not None:
+                        # Use the correct tool_index: completing_tool_id for completed tools, current_tool_id for ongoing
+                        tool_index_to_use = (
+                            completing_tool_id
+                            if is_current_complete
+                            else self.current_tool_id
+                        )
+                        res = StreamingParseResult(
+                            calls=[
+                                ToolCallItem(
+                                    tool_index=tool_index_to_use,
+                                    parameters=argument_diff,
+                                )
+                            ],
+                        )
+                        self.streamed_args_for_tool[tool_index_to_use] += argument_diff
+
+            return res
+
+        except Exception as e:
+            logger.error(f"Error in parse_streaming_increment: {e}")
+            return StreamingParseResult()
+
+    @abstractmethod
+    def has_tool_call(self, text: str) -> bool:
+        """
+        Check if the given text contains function call markers specific to this format.
+        """
+        raise NotImplementedError()
+
+    def supports_structural_tag(self) -> bool:
+        """Return True if this detector supports structural tag format."""
+        return True
+
+    @abstractmethod
+    def structure_info(self) -> _GetInfoFunc:
+        """
+        Return a function that creates StructureInfo for constrained generation.
+
+        The returned function takes a tool name and returns a StructureInfo object
+        containing the begin/end patterns and trigger tokens needed for constrained
+        generation of function calls in this format.
+
+        Returns:
+            A function that takes a tool name (str) and returns StructureInfo
+        """
+        raise NotImplementedError()
diff --git a/sglang/python/sglang/srt/function_call/core_types.py b/sglang/python/sglang/srt/function_call/core_types.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ea87df798c80932e0be46b6ef49d1037faef083
--- /dev/null
+++ b/sglang/python/sglang/srt/function_call/core_types.py
@@ -0,0 +1,34 @@
+from dataclasses import dataclass
+from typing import Callable, List, Optional
+
+from pydantic import BaseModel
+
+
+class ToolCallItem(BaseModel):
+    """Simple encapsulation of the parsed ToolCall result for easier usage in streaming contexts."""
+
+    tool_index: int
+    name: Optional[str] = None
+    parameters: str  # JSON string
+
+
+class StreamingParseResult(BaseModel):
+    """Result of streaming incremental parsing."""
+
+    normal_text: str = ""
+    calls: List[ToolCallItem] = []
+
+
+@dataclass
+class StructureInfo:
+    begin: str
+    end: str
+    trigger: str
+
+
+"""
+Helper alias of function
+Usually it is a function that takes a name string and returns a StructureInfo object,
+which can be used to construct a structural_tag object
+"""
+_GetInfoFunc = Callable[[str], StructureInfo]
diff --git a/sglang/python/sglang/srt/function_call/deepseekv31_detector.py b/sglang/python/sglang/srt/function_call/deepseekv31_detector.py
new file mode 100644
index 0000000000000000000000000000000000000000..e430cb86126baefe611b2c8e42db0dae17efdeb9
--- /dev/null
+++ b/sglang/python/sglang/srt/function_call/deepseekv31_detector.py
@@ -0,0 +1,206 @@
+import json
+import logging
+import re
+from typing import List
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    StructureInfo,
+    ToolCallItem,
+    _GetInfoFunc,
+)
+from sglang.srt.function_call.utils import _is_complete_json
+
+logger = logging.getLogger(__name__)
+
+
+class DeepSeekV31Detector(BaseFormatDetector):
+    """
+    Detector for DeepSeek V3 model function call format.
+
+    The DeepSeek V3 format uses special Unicode tokens to delimit function calls
+    with JSON code blocks for arguments.
+
+    Format Structure:
+    ```
+    <｜tool▁calls▁begin｜><｜tool▁call▁begin｜>{function_name}<｜tool▁sep｜>{json_arguments}<｜tool▁calls▁end｜><｜end▁of▁sentence｜>
+    ```
+    Examples:
+    ```
+    <｜tool▁calls▁begin｜><｜tool▁call▁begin｜>get_current_weather<｜tool▁sep｜>{"location": "Tokyo"}<｜tool▁call▁end｜><｜tool▁call▁begin｜>get_current_weather<｜tool▁sep｜>{"location": "Paris"}<｜tool▁call▁end｜><｜tool▁calls▁end｜><｜end▁of▁sentence｜>
+    ```
+
+    Key Components:
+    - Tool Calls Section: Wrapped between `<｜tool▁calls▁begin｜>` and `<｜tool▁calls▁end｜>`
+    - Individual Tool Call: Wrapped between `<｜tool▁call▁begin｜>` and `<｜tool▁call▁end｜>`
+    - Function Declaration: `<｜tool▁call▁begin｜>{function_name}<｜tool▁sep｜>`
+    - Arguments: JSON code block between `<｜tool▁sep｜>` and `<｜tool▁call▁end｜>`
+    - Supports multiple tool calls
+
+    Reference: https://www.modelscope.cn/models/deepseek-ai/DeepSeek-V3.1
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.bot_token = "<｜tool▁calls▁begin｜>"
+        self.eot_token = "<｜tool▁calls▁end｜>"
+        self.func_call_regex = r"<｜tool▁call▁begin｜>.*?<｜tool▁call▁end｜>"
+        self.func_detail_regex = (
+            r"<｜tool▁call▁begin｜>(.*)<｜tool▁sep｜>(.*)<｜tool▁call▁end｜>"
+        )
+        self._last_arguments = ""
+        self.current_tool_id = -1
+
+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a deepseek format tool call."""
+        return self.bot_token in text
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """
+        One-time parsing: Detects and parses tool calls in the provided text.
+
+        :param text: The complete text to parse.
+        :param tools: List of available tools.
+        :return: ParseResult indicating success or failure, consumed text, leftover text, and parsed calls.
+        """
+        idx = text.find(self.bot_token)
+        normal_text = text[:idx].strip() if idx != -1 else text
+        if self.bot_token not in text:
+            return StreamingParseResult(normal_text=normal_text, calls=[])
+        match_result_list = re.findall(self.func_call_regex, text, re.DOTALL)
+        calls = []
+        try:
+            for match_result in match_result_list:
+                # Get function name
+                func_detail = re.search(self.func_detail_regex, match_result, re.DOTALL)
+                func_name = func_detail.group(1)
+                func_args = func_detail.group(2)
+                func_args = json.loads(func_args)
+                # construct match_result for parse_base_json
+                match_result = {"name": func_name, "parameters": func_args}
+                calls.extend(self.parse_base_json(match_result, tools))
+            return StreamingParseResult(normal_text=normal_text, calls=calls)
+        except Exception as e:
+            logger.error(f"Error in detect_and_parse: {e}")
+            # return the normal text if parsing fails
+            return StreamingParseResult(normal_text=text)
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming incremental parsing tool calls for DeepSeekV3 format.
+        """
+        self._buffer += new_text
+        current_text = self._buffer
+
+        # Check if we have a tool call (either the start token or individual tool call)
+        has_tool_call = (
+            self.bot_token in current_text or "<｜tool▁call▁begin｜>" in current_text
+        )
+
+        if not has_tool_call:
+            self._buffer = ""
+            for e_token in [self.eot_token, "<｜tool▁call▁end｜>"]:
+                if e_token in new_text:
+                    new_text = new_text.replace(e_token, "")
+            return StreamingParseResult(normal_text=new_text)
+
+        if not hasattr(self, "_tool_indices"):
+            self._tool_indices = self._get_tool_indices(tools)
+
+        calls: list[ToolCallItem] = []
+        try:
+            partial_match = re.search(
+                pattern=r"<｜tool▁call▁begin｜>(.*)<｜tool▁sep｜>(.*?)(<｜tool▁call▁end｜>|$)",
+                string=current_text,
+                flags=re.DOTALL,
+            )
+            if partial_match:
+                func_name = partial_match.group(1).strip()
+                func_args_raw = partial_match.group(2).strip()
+                is_tool_end = partial_match.group(3)
+
+                # Initialize state if this is the first tool call
+                if self.current_tool_id == -1:
+                    self.current_tool_id = 0
+                    self.prev_tool_call_arr = []
+                    self.streamed_args_for_tool = [""]
+
+                # Ensure we have enough entries in our tracking arrays
+                while len(self.prev_tool_call_arr) <= self.current_tool_id:
+                    self.prev_tool_call_arr.append({})
+                while len(self.streamed_args_for_tool) <= self.current_tool_id:
+                    self.streamed_args_for_tool.append("")
+
+                if not self.current_tool_name_sent:
+                    calls.append(
+                        ToolCallItem(
+                            tool_index=self.current_tool_id,
+                            name=func_name,
+                            parameters="",
+                        )
+                    )
+                    self.current_tool_name_sent = True
+                    # Store the tool call info for serving layer completions endpoint
+                    self.prev_tool_call_arr[self.current_tool_id] = {
+                        "name": func_name,
+                        "arguments": {},
+                    }
+                else:
+                    argument_diff = (
+                        func_args_raw[len(self._last_arguments) :]
+                        if func_args_raw.startswith(self._last_arguments)
+                        else func_args_raw
+                    )
+
+                    if argument_diff:
+                        calls.append(
+                            ToolCallItem(
+                                tool_index=self.current_tool_id,
+                                name=None,
+                                parameters=argument_diff,
+                            )
+                        )
+                        self._last_arguments += argument_diff
+                        self.streamed_args_for_tool[
+                            self.current_tool_id
+                        ] += argument_diff
+
+                    if _is_complete_json(func_args_raw):
+                        # Update the stored arguments
+                        try:
+                            parsed_args = json.loads(func_args_raw)
+                            self.prev_tool_call_arr[self.current_tool_id][
+                                "arguments"
+                            ] = parsed_args
+                        except json.JSONDecodeError:
+                            pass
+
+                        # Find the end of the current tool call and remove only that part from buffer
+                        if is_tool_end:
+                            # Remove the completed tool call from buffer, keep any remaining content
+                            self._buffer = current_text[partial_match.end(3) :]
+                        else:
+                            self._buffer = ""
+
+                        result = StreamingParseResult(normal_text="", calls=calls)
+                        self.current_tool_id += 1
+                        self._last_arguments = ""
+                        self.current_tool_name_sent = False
+                        return result
+
+            return StreamingParseResult(normal_text="", calls=calls)
+
+        except Exception as e:
+            logger.error(f"Error in parse_streaming_increment: {e}")
+            return StreamingParseResult(normal_text=current_text)
+
+    def structure_info(self) -> _GetInfoFunc:
+        return lambda name: StructureInfo(
+            begin="<｜tool▁call▁begin｜>" + name + "<｜tool▁sep｜>",
+            end="<｜tool▁call▁end｜>",
+            trigger="<｜tool▁call▁begin｜>",
+        )
diff --git a/sglang/python/sglang/srt/function_call/deepseekv32_detector.py b/sglang/python/sglang/srt/function_call/deepseekv32_detector.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9228743b6dd6be2f403ab97e043c5b6cc95894d
--- /dev/null
+++ b/sglang/python/sglang/srt/function_call/deepseekv32_detector.py
@@ -0,0 +1,353 @@
+import json
+import logging
+import re
+
+from partial_json_parser.core.options import Allow
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    StructureInfo,
+    ToolCallItem,
+    _GetInfoFunc,
+)
+from sglang.srt.function_call.utils import _find_common_prefix, _partial_json_loads
+
+logger = logging.getLogger(__name__)
+
+
+class DeepSeekV32Detector(BaseFormatDetector):
+    """
+    Detector for DeepSeek V3.2 model function call format.
+
+    The DeepSeek V3.2 format uses XML-like DSML tags to delimit function calls.
+    Supports two parameter formats:
+
+    Format 1 - XML Parameter Tags:
+    ```
+    <｜DSML｜function_calls>
+        <｜DSML｜invoke name="function_name">
+        <｜DSML｜parameter name="param_name" string="true">value</｜DSML｜parameter>
+        ...
+    </｜DSML｜invoke>
+    </｜DSML｜function_calls>
+    ```
+
+    Format 2 - Direct JSON:
+    ```
+    <｜DSML｜function_calls>
+        <｜DSML｜invoke name="function_name">
+        {
+            "param_name": "value"
+        }
+    </｜DSML｜invoke>
+    </｜DSML｜function_calls>
+    ```
+
+    Examples:
+    ```
+    <｜DSML｜function_calls>
+        <｜DSML｜invoke name="get_favorite_tourist_spot">
+        <｜DSML｜parameter name="city" string="true">San Francisco</｜DSML｜parameter>
+    </｜DSML｜invoke>
+    </｜DSML｜function_calls>
+
+    <｜DSML｜function_calls>
+        <｜DSML｜invoke name="get_favorite_tourist_spot">
+        { "city": "San Francisco" }
+    </｜DSML｜invoke>
+    </｜DSML｜function_calls>
+    ```
+
+    Key Components:
+    - Tool Calls Section: Wrapped between `<｜DSML｜function_calls>` and `</｜DSML｜function_calls>`
+    - Individual Tool Call: Wrapped between `<｜DSML｜invoke name="...">` and `</｜DSML｜invoke>`
+    - Parameters: Either XML tags or direct JSON format
+    - Supports multiple tool calls
+
+    Reference: DeepSeek V3.2 format specification
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.bot_token = "<｜DSML｜function_calls>"
+        self.eot_token = "</｜DSML｜function_calls>"
+        self.invoke_end_token = "</｜DSML｜invoke>"
+        self.parameter_regex = r'<｜DSML｜parameter\s+name="([^"]+)"\s+string="([^"]+)"\s*>(.*?)</｜DSML｜parameter>'
+        self.partial_parameter_regex = (
+            r'<｜DSML｜parameter\s+name="([^"]+)"\s+string="([^"]+)"\s*>(.*)$'
+        )
+        self.function_calls_regex = (
+            r"<｜DSML｜function_calls>(.*?)</｜DSML｜function_calls>"
+        )
+        self.invoke_regex = (
+            r'<｜DSML｜invoke\s+name="([^"]+)"\s*>(.*?)(</｜DSML｜invoke>|$)'
+        )
+        self.prefix_parameter_end_call = ["</", "｜DSML｜", "parameter"]
+        self.prefix_invoke_end_call = ["</", "｜DSML｜", "inv", "oke"]
+        self.current_tool_id = -1
+
+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a deepseek v32 format tool call."""
+        return self.bot_token in text or "<｜DSML｜invoke" in text
+
+    def _parse_parameters_from_xml(
+        self, invoke_content: str, allow_partial: bool = False
+    ) -> str:
+        """
+        Parse parameters from either XML-like format or JSON format to str.
+
+        Supports two formats:
+        1. XML parameter tags: <｜DSML｜parameter name="..." string="...">value</｜DSML｜parameter>
+        2. Direct JSON: { "key": "value" }
+        """
+        # First, try to parse as direct JSON (new format)
+        invoke_content_stripped = invoke_content.strip()
+        if invoke_content_stripped.startswith("{"):
+            if allow_partial:
+                # Remove incomplete invoke end call prefix in case they are captured by param
+                for token in reversed(self.prefix_invoke_end_call):
+                    invoke_content_stripped = invoke_content_stripped.rstrip(token)
+                return invoke_content_stripped
+            elif invoke_content_stripped.endswith("}"):
+                return invoke_content_stripped
+
+        # Fall back to XML parameter tag parsing (original format)
+        parameters = {}
+        # Find all complete parameter matches
+        param_matches = list(
+            re.finditer(self.parameter_regex, invoke_content, re.DOTALL)
+        )
+
+        last_match_end = 0
+        for match in param_matches:
+            param_name = match.group(1)
+            param_type = match.group(2)
+            param_value = match.group(3)
+            last_match_end = match.end()
+
+            # Convert value based on type
+            if param_type == "true":  # string type
+                parameters[param_name] = param_value.strip()
+            else:
+                # Try to parse as JSON for other types
+                try:
+                    parameters[param_name] = json.loads(param_value.strip())
+                except (json.JSONDecodeError, ValueError):
+                    parameters[param_name] = param_value.strip()
+
+        # If allowed, try to parse a partial parameter at the end
+        if allow_partial:
+            remaining_content = invoke_content[last_match_end:]
+
+            # Remove incomplete parameter_end_call prefix in case they are captured by param
+            for token in reversed(self.prefix_parameter_end_call):
+                remaining_content = remaining_content.rstrip(token)
+
+            # Match start of a parameter tag + value (potentially incomplete)
+            # Regex: <tag name="..." string="...">VALUE... (no end tag)
+            partial_match = re.search(
+                self.partial_parameter_regex, remaining_content, re.DOTALL
+            )
+
+            if partial_match and (param_value := partial_match.group(3)):
+                param_name = partial_match.group(1)
+                if partial_match.group(2) == "true":
+                    parameters[param_name] = param_value.strip()
+                else:
+                    try:
+                        parameters[param_name] = _partial_json_loads(
+                            param_value, Allow.ALL
+                        )[0]
+                    except json.JSONDecodeError:
+                        parameters[param_name] = param_value.strip()
+
+        return json.dumps(parameters, ensure_ascii=False)
+
+    def detect_and_parse(self, text: str, tools: list[Tool]) -> StreamingParseResult:
+        """
+        One-time parsing: Detects and parses tool calls in the provided text.
+
+        :param text: The complete text to parse.
+        :param tools: List of available tools.
+        :return: ParseResult indicating success or failure, consumed text, leftover text, and parsed calls.
+        """
+        idx = text.find(self.bot_token)
+        normal_text = text[:idx].strip() if idx != -1 else text
+        if self.bot_token not in text:
+            return StreamingParseResult(normal_text=normal_text, calls=[])
+
+        calls = []
+        try:
+            # Extract content between function_calls tags
+            function_calls_match = re.search(
+                self.function_calls_regex,
+                text,
+                re.DOTALL,
+            )
+            if not function_calls_match:
+                return StreamingParseResult(normal_text=normal_text, calls=[])
+
+            function_calls_content = function_calls_match.group(1)
+
+            # Find all invoke blocks
+            invoke_matches = re.findall(
+                self.invoke_regex, function_calls_content, re.DOTALL
+            )
+
+            for func_name, invoke_content, _ in invoke_matches:
+                # Parse parameters from XML format
+                func_args = self._parse_parameters_from_xml(invoke_content)
+                # construct match_result for parse_base_json
+                match_result = {"name": func_name, "parameters": json.loads(func_args)}
+                calls.extend(self.parse_base_json(match_result, tools))
+
+            return StreamingParseResult(normal_text=normal_text, calls=calls)
+        except Exception as e:
+            logger.error(f"Error in detect_and_parse: {e}")
+            # return the normal text if parsing fails
+            return StreamingParseResult(normal_text=text)
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: list[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming incremental parsing tool calls for DeepSeekV32 format.
+        Supports multiple consecutive invoke blocks and argument streaming.
+        """
+        self._buffer += new_text
+        current_text = self._buffer
+
+        # Check if buffer contains any DSML markers or ends with potential tag prefix
+        # This handles partial/streaming DSML content
+        dsml_markers = ["｜DSML｜", "<｜", "</｜"]
+        potentially_dsml = any(marker in current_text for marker in dsml_markers)
+
+        # Also check if text ends with start of a tag (to handle "<" arriving separately)
+        dsml_prefixes = ["<", "<｜", "</", "</｜"]
+        ends_with_prefix = any(
+            current_text.rstrip().endswith(prefix) for prefix in dsml_prefixes
+        )
+
+        if (
+            not self.has_tool_call(current_text)
+            and not potentially_dsml
+            and not ends_with_prefix
+        ):
+            self._buffer = ""
+            for e_token in [self.eot_token, self.invoke_end_token]:
+                if e_token in current_text:
+                    current_text = current_text.replace(e_token, "")
+            return StreamingParseResult(normal_text=current_text)
+
+        all_calls: list[ToolCallItem] = []
+        try:
+            # Loop to handle multiple consecutive invoke blocks
+            while True:
+                # Try to match an invoke block (may be partial)
+                invoke_match = re.search(
+                    pattern=self.invoke_regex,
+                    string=current_text,
+                    flags=re.DOTALL,
+                )
+                if not invoke_match:
+                    break
+
+                func_name = invoke_match.group(1).strip()
+                invoke_content = invoke_match.group(2)
+                # group(3) is either "</｜DSML｜invoke>" (complete) or "" (incomplete, matched with $)
+                is_tool_end = bool(invoke_match.group(3))
+
+                # Initialize state if this is the first tool call
+                if self.current_tool_id == -1:
+                    self.current_tool_id = 0
+                    self.prev_tool_call_arr = []
+                    self.streamed_args_for_tool = [""]
+
+                # Ensure arrays are large enough for current tool
+                while len(self.prev_tool_call_arr) <= self.current_tool_id:
+                    self.prev_tool_call_arr.append({})
+                while len(self.streamed_args_for_tool) <= self.current_tool_id:
+                    self.streamed_args_for_tool.append("")
+
+                # 1. Send tool name if not sent yet
+                if not self.current_tool_name_sent:
+                    all_calls.append(
+                        ToolCallItem(
+                            tool_index=self.current_tool_id,
+                            name=func_name,
+                            parameters="",
+                        )
+                    )
+                    self.current_tool_name_sent = True
+
+                # 2. Parse current parameters (partial or complete)
+                current_params = self._parse_parameters_from_xml(
+                    invoke_content, allow_partial=not is_tool_end
+                )
+
+                # 3. Calculate and send incremental arguments
+                sent_len = len(self.streamed_args_for_tool[self.current_tool_id])
+                prev_params = self.prev_tool_call_arr[self.current_tool_id].get(
+                    "arguments"
+                )
+
+                argument_diff = None
+
+                if is_tool_end:
+                    # If complete, send everything remaining
+                    argument_diff = current_params[sent_len:]
+                elif prev_params is not None:
+                    # If partial, send stable prefix diff
+                    if current_params != prev_params:
+                        prefix = _find_common_prefix(current_params, prev_params)
+                        if len(prefix) > sent_len:
+                            argument_diff = prefix[sent_len:]
+
+                if argument_diff:
+                    all_calls.append(
+                        ToolCallItem(
+                            tool_index=self.current_tool_id,
+                            name=None,
+                            parameters=argument_diff,
+                        )
+                    )
+                    self.streamed_args_for_tool[self.current_tool_id] += argument_diff
+
+                # Update the stored arguments
+                self.prev_tool_call_arr[self.current_tool_id] = {
+                    "name": func_name,
+                    "arguments": current_params,
+                }
+
+                # Check if tool call is complete (has closing tag)
+                if is_tool_end:
+                    # Remove the completed tool call from buffer
+                    self._buffer = current_text[invoke_match.end() :]
+                    current_text = self._buffer  # Update for next iteration
+
+                    # Move to next tool call
+                    self.current_tool_id += 1
+                    self.current_tool_name_sent = False
+
+                    # Continue loop to check for more invoke blocks
+                    continue
+                else:
+                    # Tool call not complete yet, don't return anything
+                    # Wait for more chunks until we see </｜DSML｜invoke>
+                    break
+
+            # No more invoke blocks found
+            return StreamingParseResult(normal_text="", calls=all_calls)
+
+        except Exception as e:
+            logger.error(f"Error in parse_streaming_increment: {e}")
+            return StreamingParseResult(normal_text=current_text)
+
+    def structure_info(self) -> _GetInfoFunc:
+        return lambda name: StructureInfo(
+            begin=f'<｜DSML｜invoke name="{name}">',
+            end="</｜DSML｜invoke>",
+            trigger="<｜DSML｜invoke",
+        )
diff --git a/sglang/python/sglang/srt/function_call/deepseekv3_detector.py b/sglang/python/sglang/srt/function_call/deepseekv3_detector.py
new file mode 100644
index 0000000000000000000000000000000000000000..8dcc2da4317eef0754e0598aec36800a0a9626c0
--- /dev/null
+++ b/sglang/python/sglang/srt/function_call/deepseekv3_detector.py
@@ -0,0 +1,209 @@
+import json
+import logging
+import re
+from typing import List
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    StructureInfo,
+    ToolCallItem,
+    _GetInfoFunc,
+)
+from sglang.srt.function_call.utils import _is_complete_json
+
+logger = logging.getLogger(__name__)
+
+
+class DeepSeekV3Detector(BaseFormatDetector):
+    """
+    Detector for DeepSeek V3 model function call format.
+
+    The DeepSeek V3 format uses special Unicode tokens to delimit function calls
+    with JSON code blocks for arguments.
+
+    Format Structure:
+    ```
+    <｜tool▁calls▁begin｜><｜tool▁call▁begin｜>function<｜tool▁sep｜>{function_name}\n```json\n{json_arguments}\n```<｜tool▁calls▁end｜><｜end▁of▁sentence｜>
+    ```
+    Examples:
+    ```
+    <｜tool▁calls▁begin｜><｜tool▁call▁begin｜>function<｜tool▁sep｜>get_current_weather\n```json\n{"location": "Tokyo"}\n```<｜tool▁call▁end｜>\n<｜tool▁call▁begin｜>function<｜tool▁sep｜>get_current_weather\n```json\n{"location": "Paris"}\n```<｜tool▁call▁end｜><｜tool▁calls▁end｜><｜end▁of▁sentence｜>
+    ```
+
+    Key Components:
+    - Tool Calls Section: Wrapped between `<｜tool▁calls▁begin｜>` and `<｜tool▁calls▁end｜>`
+    - Individual Tool Call: Wrapped between `<｜tool▁call▁begin｜>` and `<｜tool▁call▁end｜>`
+    - Function Declaration: `function<｜tool▁sep｜>{function_name}`
+    - Arguments: JSON code block between ````json` and ````
+    - Supports multiple tool calls
+
+    Reference: https://huggingface.co/deepseek-ai/DeepSeek-V3-0324?chat_template=default
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.bot_token = "<｜tool▁calls▁begin｜>"
+        self.eot_token = "<｜tool▁calls▁end｜>"
+        self.func_call_regex = r"<｜tool▁call▁begin｜>.*?<｜tool▁call▁end｜>"
+        self.func_detail_regex = r"<｜tool▁call▁begin｜>(.*)<｜tool▁sep｜>(.*)\n```json\n(.*)\n```<｜tool▁call▁end｜>"
+        self._last_arguments = ""
+        self.current_tool_id = -1
+
+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a deepseek format tool call."""
+        return self.bot_token in text
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """
+        One-time parsing: Detects and parses tool calls in the provided text.
+
+        :param text: The complete text to parse.
+        :param tools: List of available tools.
+        :return: ParseResult indicating success or failure, consumed text, leftover text, and parsed calls.
+        """
+        idx = text.find(self.bot_token)
+        normal_text = text[:idx].strip() if idx != -1 else text
+        if self.bot_token not in text:
+            return StreamingParseResult(normal_text=normal_text, calls=[])
+        match_result_list = re.findall(self.func_call_regex, text, re.DOTALL)
+        calls = []
+        try:
+            for match_result in match_result_list:
+                # Get function name
+                func_detail = re.search(self.func_detail_regex, match_result, re.DOTALL)
+                func_name = func_detail.group(2)
+                func_args = func_detail.group(3)
+                func_args = json.loads(func_args)
+                # construct match_result for parse_base_json
+                match_result = {"name": func_name, "parameters": func_args}
+                calls.extend(self.parse_base_json(match_result, tools))
+            return StreamingParseResult(normal_text=normal_text, calls=calls)
+        except Exception as e:
+            logger.error(f"Error in detect_and_parse: {e}")
+            # return the normal text if parsing fails
+            return StreamingParseResult(normal_text=text)
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming incremental parsing tool calls for DeepSeekV3 format.
+        """
+        self._buffer += new_text
+        current_text = self._buffer
+
+        # Check if we have a tool call (either the start token or individual tool call)
+        has_tool_call = (
+            self.bot_token in current_text or "<｜tool▁call▁begin｜>" in current_text
+        )
+
+        if not has_tool_call:
+            self._buffer = ""
+            for e_token in [self.eot_token, "```", "<｜tool▁call▁end｜>"]:
+                if e_token in new_text:
+                    new_text = new_text.replace(e_token, "")
+            return StreamingParseResult(normal_text=new_text)
+
+        if not hasattr(self, "_tool_indices"):
+            self._tool_indices = self._get_tool_indices(tools)
+
+        calls: list[ToolCallItem] = []
+        try:
+            partial_match = re.search(
+                pattern=r"<｜tool▁call▁begin｜>(.*)<｜tool▁sep｜>(.*)\n```json\n(.*)\n```.*",
+                string=current_text,
+                flags=re.DOTALL,
+            )
+            if partial_match:
+                func_name = partial_match.group(2).strip()
+                func_args_raw = partial_match.group(3).strip()
+
+                # Initialize state if this is the first tool call
+                if self.current_tool_id == -1:
+                    self.current_tool_id = 0
+                    self.prev_tool_call_arr = []
+                    self.streamed_args_for_tool = [""]
+
+                # Ensure we have enough entries in our tracking arrays
+                while len(self.prev_tool_call_arr) <= self.current_tool_id:
+                    self.prev_tool_call_arr.append({})
+                while len(self.streamed_args_for_tool) <= self.current_tool_id:
+                    self.streamed_args_for_tool.append("")
+
+                if not self.current_tool_name_sent:
+                    calls.append(
+                        ToolCallItem(
+                            tool_index=self.current_tool_id,
+                            name=func_name,
+                            parameters="",
+                        )
+                    )
+                    self.current_tool_name_sent = True
+                    # Store the tool call info for serving layer completions endpoint
+                    self.prev_tool_call_arr[self.current_tool_id] = {
+                        "name": func_name,
+                        "arguments": {},
+                    }
+                else:
+                    argument_diff = (
+                        func_args_raw[len(self._last_arguments) :]
+                        if func_args_raw.startswith(self._last_arguments)
+                        else func_args_raw
+                    )
+
+                    if argument_diff:
+                        calls.append(
+                            ToolCallItem(
+                                tool_index=self.current_tool_id,
+                                name=None,
+                                parameters=argument_diff,
+                            )
+                        )
+                        self._last_arguments += argument_diff
+                        self.streamed_args_for_tool[
+                            self.current_tool_id
+                        ] += argument_diff
+
+                    if _is_complete_json(func_args_raw):
+                        # Update the stored arguments
+                        try:
+                            parsed_args = json.loads(func_args_raw)
+                            self.prev_tool_call_arr[self.current_tool_id][
+                                "arguments"
+                            ] = parsed_args
+                        except json.JSONDecodeError:
+                            pass
+
+                        # Find the end of the current tool call and remove only that part from buffer
+                        tool_call_end_pattern = (
+                            r"<｜tool▁call▁begin｜>.*?<｜tool▁call▁end｜>"
+                        )
+                        match = re.search(
+                            tool_call_end_pattern, current_text, re.DOTALL
+                        )
+                        if match:
+                            # Remove the completed tool call from buffer, keep any remaining content
+                            self._buffer = current_text[match.end() :]
+                        else:
+                            self._buffer = ""
+
+                        result = StreamingParseResult(normal_text="", calls=calls)
+                        self.current_tool_id += 1
+                        self._last_arguments = ""
+                        self.current_tool_name_sent = False
+                        return result
+
+            return StreamingParseResult(normal_text="", calls=calls)
+
+        except Exception as e:
+            logger.error(f"Error in parse_streaming_increment: {e}")
+            return StreamingParseResult(normal_text=current_text)
+
+    def structure_info(self) -> _GetInfoFunc:
+        return lambda name: StructureInfo(
+            begin=">" + name + "\n```json\n",
+            end="\n```<",
+            trigger=">" + name + "\n```json\n",
+        )
diff --git a/sglang/python/sglang/srt/function_call/function_call_parser.py b/sglang/python/sglang/srt/function_call/function_call_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f562192e219ce316dd1086975a0466f547b6d37
--- /dev/null
+++ b/sglang/python/sglang/srt/function_call/function_call_parser.py
@@ -0,0 +1,214 @@
+import logging
+from typing import Dict, List, Literal, Optional, Set, Tuple, Type, Union
+
+from sglang.srt.entrypoints.openai.protocol import (
+    LegacyStructuralTagResponseFormat,
+    StructuresResponseFormat,
+    Tool,
+    ToolCallConstraint,
+    ToolChoice,
+)
+from sglang.srt.environ import ToolStrictLevel, envs
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import ToolCallItem
+from sglang.srt.function_call.deepseekv3_detector import DeepSeekV3Detector
+from sglang.srt.function_call.deepseekv31_detector import DeepSeekV31Detector
+from sglang.srt.function_call.deepseekv32_detector import DeepSeekV32Detector
+from sglang.srt.function_call.gigachat3_detector import GigaChat3Detector
+from sglang.srt.function_call.glm4_moe_detector import Glm4MoeDetector
+from sglang.srt.function_call.glm47_moe_detector import Glm47MoeDetector
+from sglang.srt.function_call.gpt_oss_detector import GptOssDetector
+from sglang.srt.function_call.hermes_detector import HermesDetector
+from sglang.srt.function_call.internlm_detector import InternlmDetector
+from sglang.srt.function_call.kimik2_detector import KimiK2Detector
+from sglang.srt.function_call.lfm2_detector import Lfm2Detector
+from sglang.srt.function_call.llama32_detector import Llama32Detector
+from sglang.srt.function_call.mimo_detector import MiMoDetector
+from sglang.srt.function_call.minimax_m2 import MinimaxM2Detector
+from sglang.srt.function_call.mistral_detector import MistralDetector
+from sglang.srt.function_call.pythonic_detector import PythonicDetector
+from sglang.srt.function_call.qwen3_coder_detector import Qwen3CoderDetector
+from sglang.srt.function_call.qwen25_detector import Qwen25Detector
+from sglang.srt.function_call.step3_detector import Step3Detector
+from sglang.srt.function_call.trinity_detector import TrinityDetector
+from sglang.srt.function_call.utils import get_json_schema_constraint
+
+logger = logging.getLogger(__name__)
+
+
+class FunctionCallParser:
+    """
+    Parser for function/tool calls in model outputs.
+
+    This class handles both streaming and non-streaming parsing of function calls using a detector.
+    In streaming scenarios, each time new_text is received, it calls detector.parse_streaming_increment
+    and returns the resulting normal_text and calls to the upper layer (or SSE).
+    """
+
+    ToolCallParserEnum: Dict[str, Type[BaseFormatDetector]] = {
+        "deepseekv3": DeepSeekV3Detector,
+        "deepseekv31": DeepSeekV31Detector,
+        "deepseekv32": DeepSeekV32Detector,
+        "glm": Glm4MoeDetector,
+        "glm45": Glm4MoeDetector,
+        "glm47": Glm47MoeDetector,
+        "gpt-oss": GptOssDetector,
+        "kimi_k2": KimiK2Detector,
+        "lfm2": Lfm2Detector,
+        "llama3": Llama32Detector,
+        "mimo": MiMoDetector,
+        "mistral": MistralDetector,
+        "pythonic": PythonicDetector,
+        "qwen": Qwen25Detector,
+        "qwen25": Qwen25Detector,
+        "qwen3_coder": Qwen3CoderDetector,
+        "step3": Step3Detector,
+        "step3p5": Qwen3CoderDetector,
+        "minimax-m2": MinimaxM2Detector,
+        "trinity": TrinityDetector,
+        "interns1": InternlmDetector,
+        "hermes": HermesDetector,
+        "gigachat3": GigaChat3Detector,
+    }
+
+    def __init__(self, tools: List[Tool], tool_call_parser: str):
+        detector_class = self.ToolCallParserEnum.get(tool_call_parser)
+        if detector_class:
+            detector = detector_class()
+        else:
+            raise ValueError(f"Unsupported tool_call_parser: {tool_call_parser}")
+
+        self.detector = detector
+        self.tools = tools
+        self.tool_strict_level = envs.SGLANG_TOOL_STRICT_LEVEL.get()
+
+    def has_tool_call(self, text: str) -> bool:
+        """
+        Check if the given text contains a tool call in the format supported by this parser.
+        This delegates to the detector's implementation.
+
+        Args:
+            text: The text to check for tool calls
+
+        Returns:
+            True if the text contains a tool call, False otherwise
+        """
+        if not self.tools:
+            return False
+        return self.detector.has_tool_call(text)
+
+    def parse_non_stream(self, full_text: str) -> Tuple[str, list[ToolCallItem]]:
+        """
+        One-time parsing of the full text to extract tool calls.
+
+        Args:
+            full_text: The complete text to parse
+
+        Returns:
+            A tuple containing:
+            - The remaining text after parsing that was not consumed by the detector (can be treated as normal text)
+            - A list of tool calls parsed from the text
+        """
+        if not self.tools:
+            return full_text, []
+        parsed_result = self.detector.detect_and_parse(full_text, self.tools)
+        tool_call_list = parsed_result.calls
+        if tool_call_list:
+            return parsed_result.normal_text, tool_call_list
+        else:
+            return full_text, []
+
+    def parse_stream_chunk(self, chunk_text: str) -> Tuple[str, list[ToolCallItem]]:
+        """
+        Streaming incremental parsing of chunks of text as they arrive.
+
+        Args:
+            chunk_text: The new chunk of text to parse
+
+        Returns:
+            A tuple containing:
+            - The normal text that should be displayed to the user
+            - A list of tool calls parsed from the chunk
+        """
+        if not self.tools:
+            return chunk_text, []
+        final_normal_text = ""
+        final_calls = []
+
+        sp_result = self.detector.parse_streaming_increment(chunk_text, self.tools)
+        if sp_result.normal_text:
+            final_normal_text = sp_result.normal_text
+        if sp_result.calls:
+            final_calls.extend(sp_result.calls)
+            final_normal_text = sp_result.normal_text
+
+        return final_normal_text, final_calls
+
+    def get_structure_tag(self) -> LegacyStructuralTagResponseFormat:
+        """
+        Generate a structural tag response format for all available tools.
+
+        This creates the necessary structural tags that guide the model's output format.
+        """
+        tool_structures: List[StructuresResponseFormat] = list()
+        tool_trigger_set: Set[str] = set()
+
+        get_structure_info = self.detector.structure_info()
+        for tool in self.tools:
+            function = tool.function
+            name = function.name
+            assert name is not None
+            info = get_structure_info(name)
+
+            # accept all if not strict, otherwise only accept the schema
+            is_strict = (
+                function.strict or self.tool_strict_level >= ToolStrictLevel.PARAMETER
+            )
+            schema = function.parameters if is_strict else {}
+
+            tool_structures.append(
+                StructuresResponseFormat(
+                    begin=info.begin,
+                    schema=schema or {},  # type: ignore
+                    end=info.end,
+                )
+            )
+            tool_trigger_set.add(info.trigger)
+
+        # TODO(dark): move this into new structural tag format
+        # This requires all grammar backend support the new format
+        return LegacyStructuralTagResponseFormat(
+            type="structural_tag",
+            structures=tool_structures,
+            triggers=list(tool_trigger_set),
+        )
+
+    def get_structure_constraint(
+        self, tool_choice: Union[ToolChoice, Literal["auto", "required"]]
+    ) -> Optional[ToolCallConstraint]:
+        """
+        Returns the appropriate structure constraint for tool calls based on the tool_choice.
+        The constraint is used to guide the model's output format.
+
+        Args:
+            tool_choice: The tool choice setting from the request
+
+        Returns:
+            A tuple of (constraint_type, constraint_value) to be added to sampling parameters,
+            or None if no constraint applies.
+        """
+        # NOTE: structural_tag only supports JSON-compatible content between the begin and end.
+        # It cannot parse or validate function call Pythonic or XML-ish syntax.
+        if (
+            self.detector.supports_structural_tag()
+            and tool_choice == "auto"
+            and (
+                any(tool.function.strict for tool in self.tools)
+                or self.tool_strict_level >= ToolStrictLevel.FUNCTION
+            )
+        ):
+            tag = self.get_structure_tag()
+            return ("structural_tag", tag)
+        elif tool_choice == "required" or isinstance(tool_choice, ToolChoice):
+            json_schema = get_json_schema_constraint(self.tools, tool_choice)
+            return ("json_schema", json_schema)
diff --git a/sglang/python/sglang/srt/function_call/gigachat3_detector.py b/sglang/python/sglang/srt/function_call/gigachat3_detector.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef94bdf9a0ce6a7b34d482a5edeac6a0d49e9a8b
--- /dev/null
+++ b/sglang/python/sglang/srt/function_call/gigachat3_detector.py
@@ -0,0 +1,209 @@
+import json
+import logging
+import re
+from typing import List
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    ToolCallItem,
+    _GetInfoFunc,
+)
+
+logger = logging.getLogger(__name__)
+
+REGEX_FUNCTION_CALL = re.compile(
+    r"function call<\|role_sep\|>\n(.*)",
+    re.DOTALL,
+)
+
+REGEX_CONTENT_PATTERN = re.compile(
+    r"^(.*?)<\|message_sep\|>",
+    re.DOTALL,
+)
+
+NAME_REGEX = re.compile(
+    r'"name"\s*:\s*"([^"]*)"',
+    re.DOTALL,
+)
+
+ARGS_REGEX = re.compile(
+    r'"arguments"\s*:\s*(.*)',
+    re.DOTALL,
+)
+
+
+class GigaChat3Detector(BaseFormatDetector):
+    def __init__(self) -> None:
+        super().__init__()
+        self.tool_started: bool = False
+        self.tool_name_sent: bool = False
+        self.end_content: bool = False
+        self._buffer: str = ""
+        self.prev_tool_call_arr: list[dict] = []
+
+    def has_tool_call(self, text: str) -> bool:
+        """Check if text contains a tool call marker"""
+        return "function call<|role_sep|>\n" in text
+
+    def detect_and_parse(
+        self,
+        text: str,
+        tools: List[Tool],
+    ) -> StreamingParseResult:
+        """
+        Non-streaming parsing of complete model output.
+        Extracts tool calls and content from the full text.
+        """
+        logger.debug(f"[GigaChat3] detect_and_parse: {text}")
+        model_output = text
+        function_call = None
+        content = None
+        if model_output.rstrip().endswith("</s>"):
+            model_output = model_output[: model_output.rfind("</s>")]
+        m_func = REGEX_FUNCTION_CALL.search(model_output)
+        if m_func:
+            try:
+                function_call = json.loads(m_func.group(1), strict=False)
+                if not (
+                    isinstance(function_call, dict)
+                    and "name" in function_call
+                    and "arguments" in function_call
+                ):
+                    function_call = None
+                elif not isinstance(function_call["arguments"], dict):
+                    function_call = None
+            except json.JSONDecodeError as e:
+                logger.warning(f"[GigaChat3] JSON decode error: {e}")
+                return StreamingParseResult(
+                    normal_text=model_output,
+                    calls=[],
+                )
+        m_content = REGEX_CONTENT_PATTERN.search(model_output)
+        if m_content:
+            content = m_content.group(1)
+        else:
+            if "<|message_sep|>" in model_output:
+                content = model_output.split("<|message_sep|>")[0]
+            else:
+                content = model_output
+        if not function_call:
+            return StreamingParseResult(normal_text=content, calls=[])
+        name = function_call["name"]
+        args = function_call["arguments"]
+        match_result = {"name": name, "arguments": args}
+        calls = self.parse_base_json(match_result, tools)
+        return StreamingParseResult(normal_text=content, calls=calls)
+
+    def parse_streaming_increment(
+        self,
+        new_text: str,
+        tools: List[Tool],
+    ) -> StreamingParseResult:
+        """
+        Streaming parser for incremental text chunks.
+        Maintains state across calls to build complete tool calls.
+        """
+        if not new_text:
+            return StreamingParseResult()
+        logger.debug(f"[GigaChat3] parse_streaming_increment: '{new_text}'")
+        self._buffer += new_text
+        current_text = self._buffer
+        delta_text = new_text
+        content = None
+        func_name = None
+        cur_args = None
+        m_func = REGEX_FUNCTION_CALL.search(current_text)
+        if not self.tool_started:
+            m_content = REGEX_CONTENT_PATTERN.search(delta_text)
+            if m_content:
+                content = m_content.group(1)
+                self.end_content = True
+            else:
+                if "<|message_sep|>" in delta_text:
+                    content = delta_text.split("<|message_sep|>")[0]
+                    self.end_content = True
+                else:
+                    if not self.end_content:
+                        content = delta_text
+            if m_func:
+                self.tool_started = True
+                logger.debug("[GigaChat3] Tool call started")
+            if content:
+                return StreamingParseResult(normal_text=content)
+        if not m_func:
+            return StreamingParseResult()
+        json_tail = m_func.group(1).strip()
+        name_match = NAME_REGEX.search(json_tail)
+        if name_match:
+            func_name = name_match.group(1)
+        args_match = ARGS_REGEX.search(json_tail)
+        if args_match:
+            cur_args = args_match.group(1).strip()
+            if cur_args.endswith("</s>"):
+                cur_args = cur_args[: -len("</s>")]
+            if cur_args.endswith("}"):
+                try:
+                    candidate = cur_args[:-1].strip()
+                    json.loads(candidate, strict=False)
+                    cur_args = candidate
+                except json.JSONDecodeError:
+                    pass
+        calls: List[ToolCallItem] = []
+        if not self.prev_tool_call_arr:
+            self.prev_tool_call_arr.append({})
+        if not self.tool_name_sent:
+            if not func_name:
+                return StreamingParseResult()
+            self.tool_name_sent = True
+            self.prev_tool_call_arr[0]["name"] = func_name
+            logger.debug(f"[GigaChat3] Sending tool name: {func_name}")
+            calls.append(
+                ToolCallItem(
+                    tool_index=0,
+                    name=func_name,
+                    parameters="",
+                )
+            )
+            return StreamingParseResult(calls=calls)
+        if cur_args is None:
+            return StreamingParseResult()
+        prev_args = self.prev_tool_call_arr[0].get("arguments_str", "")
+        if not prev_args:
+            delta_args = cur_args
+        elif cur_args.startswith(prev_args):
+            delta_args = cur_args[len(prev_args) :]
+        else:
+            logger.warning(
+                f"[GigaChat3] Arguments overlap mismatch. "
+                f"prev='{prev_args[:50]}...' cur='{cur_args[:50]}...'"
+            )
+            return StreamingParseResult()
+        if not delta_args:
+            return StreamingParseResult()
+        self.prev_tool_call_arr[0]["arguments_str"] = cur_args
+        try:
+            args_dict = json.loads(cur_args, strict=False)
+            self.prev_tool_call_arr[0]["arguments"] = args_dict
+        except json.JSONDecodeError:
+            self.prev_tool_call_arr[0]["arguments"] = {}
+        logger.debug(f"[GigaChat3] Sending args delta: '{delta_args[:100]}...'")
+        calls.append(
+            ToolCallItem(
+                tool_index=0,
+                name=None,
+                parameters=delta_args,
+            )
+        )
+        return StreamingParseResult(calls=calls)
+
+    def supports_structural_tag(self) -> bool:
+        """GigaChat3 does not use structural tags"""
+        return False
+
+    def structure_info(self) -> _GetInfoFunc:
+        """Not applicable for GigaChat3"""
+        raise NotImplementedError(
+            "GigaChat3Detector does not support structural_tag format."
+        )
diff --git a/sglang/python/sglang/srt/function_call/glm47_moe_detector.py b/sglang/python/sglang/srt/function_call/glm47_moe_detector.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b25b11eb3791e3eb7ddd147a19d0eb5159cf603
--- /dev/null
+++ b/sglang/python/sglang/srt/function_call/glm47_moe_detector.py
@@ -0,0 +1,788 @@
+import ast
+import json
+import logging
+import re
+from enum import Enum
+from typing import Any, Dict, List, Optional, Tuple
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    ToolCallItem,
+    _GetInfoFunc,
+)
+from sglang.srt.function_call.utils import infer_type_from_json_schema
+
+logger = logging.getLogger(__name__)
+
+
+class StreamState(str, Enum):
+    """State machine states for XML to JSON streaming conversion."""
+
+    INIT = "INIT"
+    BETWEEN = "BETWEEN"
+    IN_KEY = "IN_KEY"
+    WAITING_VALUE = "WAITING_VALUE"
+    IN_VALUE = "IN_VALUE"
+
+
+def get_argument_type(
+    func_name: str, arg_key: str, defined_tools: List[Tool]
+) -> Optional[str]:
+    """Get the expected type of a function argument from tool definitions.
+
+    Supports complex JSON Schema definitions including:
+    - Direct type field (including type arrays)
+    - anyOf/oneOf: parameter can be any of multiple types
+    - enum: parameter must be one of enum values
+    - allOf: parameter must satisfy all type definitions
+    - properties: inferred as object type
+    - items: inferred as array type
+
+    Args:
+        func_name: Name of the function/tool
+        arg_key: Name of the argument
+        defined_tools: List of available tools
+
+    Returns:
+        The type string (e.g., 'string', 'number', 'object') or None if not found
+    """
+    name2tool = {tool.function.name: tool for tool in defined_tools}
+
+    # Check if function exists
+    tool = name2tool.get(func_name)
+    if not tool:
+        return None
+
+    # Get parameters safely using getattr
+    params = getattr(tool.function, "parameters", None)
+    if not isinstance(params, dict):
+        return None
+
+    # Navigate to the type using dict.get() for safe access
+    properties = params.get("properties")
+    if not isinstance(properties, dict):
+        return None
+
+    arg_spec = properties.get(arg_key)
+    if isinstance(arg_spec, dict):
+        # Use the new type inference function for complex JSON Schema support
+        return infer_type_from_json_schema(arg_spec)
+
+    return None
+
+
+def _convert_to_number(value: str) -> Any:
+    """Convert string to appropriate number type (int or float).
+
+    Args:
+        value: String value to convert
+
+    Returns:
+        Converted number or original string if conversion fails
+    """
+    try:
+        if "." in value or "e" in value.lower():
+            return float(value)
+        else:
+            return int(value)
+    except (ValueError, AttributeError):
+        return value
+
+
+def parse_arguments(
+    json_value: str, arg_type: Optional[str] = None
+) -> Tuple[Any, bool]:
+    """Parse argument value with multiple fallback strategies.
+
+    Args:
+        json_value: Raw string value to parse
+        arg_type: Expected type hint ('string', 'number', 'object', etc.)
+
+    Returns:
+        Tuple of (parsed_value, is_valid_json)
+    """
+    # Strategy 1: Direct JSON parsing
+    try:
+        parsed_value = json.loads(json_value)
+
+        # Type coercion for number type
+        if arg_type == "number" and isinstance(parsed_value, str):
+            parsed_value = _convert_to_number(parsed_value)
+
+        return parsed_value, True
+    except (json.JSONDecodeError, ValueError):
+        pass
+
+    # Strategy 2: Unescape and parse
+    try:
+        wrapped = json.loads('{"tmp": "' + json_value + '"}')
+        parsed_value = json.loads(wrapped["tmp"])
+
+        if arg_type == "number" and isinstance(parsed_value, str):
+            parsed_value = _convert_to_number(parsed_value)
+
+        return parsed_value, True
+    except (json.JSONDecodeError, ValueError, KeyError):
+        pass
+
+    # Strategy 3: ast.literal_eval
+    try:
+        parsed_value = ast.literal_eval(json_value)
+        return parsed_value, True
+    except (ValueError, SyntaxError):
+        pass
+
+    # Strategy 4: Treat as string
+    try:
+        quoted_value = json.dumps(str(json_value))
+        return json.loads(quoted_value), True
+    except (json.JSONDecodeError, ValueError):
+        return json_value, False
+
+
+class Glm47MoeDetector(BaseFormatDetector):
+    """
+    Detector for GLM-4.7 and GLM-5 models.
+    Assumes function call format:
+      <tool_call>get_weather<arg_key>city</arg_key><arg_value>北京</arg_value><arg_key>date</arg_key><arg_value>2024-06-27</arg_value></tool_call><tool_call>get_weather<arg_key>city</arg_key><arg_value>上海</arg_value><arg_key>date</arg_key><arg_value>2024-06-27</arg_value></tool_call>
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.bot_token = "<tool_call>"
+        self.eot_token = "</tool_call>"
+        self.func_call_regex = r"<tool_call>.*?</tool_call>"
+        self.func_detail_regex = re.compile(
+            r"<tool_call>(.*?)(<arg_key>.*?)?</tool_call>", re.DOTALL
+        )
+        self.func_arg_regex = re.compile(
+            r"<arg_key>(.*?)</arg_key>(?:\\n|\s)*<arg_value>(.*?)</arg_value>",
+            re.DOTALL,
+        )
+        self._last_arguments = ""
+        self.current_tool_id = -1
+        self.current_tool_name_sent = False
+        self._streamed_raw_length = 0
+        self._tool_call_completed = False  # Track if tool call has been completed
+        self._sent_empty_object = (
+            False  # Track if empty object has been sent for no-arg functions
+        )
+        self._reset_streaming_state()
+
+    def _reset_streaming_state(self) -> None:
+        """Reset the streaming state machine for a new tool call."""
+        self._stream_state = StreamState.INIT
+        self._current_key = ""
+        self._current_value = ""
+        self._xml_tag_buffer = ""
+        self._is_first_param = True
+        self._value_started = False
+        self._cached_value_type: Optional[str] = (
+            None  # Cache the value type for consistency
+        )
+        self._tool_call_completed = False  # Reset tool call completion status
+        self._sent_empty_object = False  # Reset empty object sent status
+
+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a glm-4.5 / glm-4.6 format tool call."""
+        return self.bot_token in text
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """
+        One-time parsing: Detects and parses tool calls in the provided text.
+
+        :param text: The complete text to parse.
+        :param tools: List of available tools.
+        :return: ParseResult indicating success or failure, consumed text, leftover text, and parsed calls.
+        """
+        if self.bot_token not in text:
+            return StreamingParseResult(normal_text=text, calls=[])
+
+        # Extract all normal text (before, between, and after tool calls)
+        normal_text_parts = []
+        last_end = 0
+
+        # Find all tool call matches
+        for match in re.finditer(self.func_call_regex, text, re.DOTALL):
+            # Add text before this tool call
+            if match.start() > last_end:
+                normal_text_parts.append(text[last_end : match.start()])
+            last_end = match.end()
+
+        # Add any remaining text after the last tool call
+        if last_end < len(text):
+            normal_text_parts.append(text[last_end:])
+
+        # Combine all normal text parts
+        normal_text = "".join(normal_text_parts).strip()
+
+        # Parse tool calls
+        match_result_list = re.findall(self.func_call_regex, text, re.DOTALL)
+        calls = []
+        try:
+            for match_result in match_result_list:
+                # Get function name
+                func_detail = self.func_detail_regex.search(match_result)
+                if func_detail is None:
+                    continue
+                func_name = func_detail.group(1) if func_detail.group(1) else ""
+                func_args = func_detail.group(2) if func_detail.group(2) else ""
+                arguments = {}
+                if func_args:
+                    pairs = self.func_arg_regex.findall(func_args)
+                    # Parse arguments using shared method
+                    arguments = self._parse_argument_pairs(pairs, func_name, tools)
+
+                # construct match_result for parse_base_json
+                match_result = {"name": func_name, "parameters": arguments}
+                calls.extend(self.parse_base_json(match_result, tools))
+            return StreamingParseResult(normal_text=normal_text, calls=calls)
+        except Exception as e:
+            logger.error(f"Error in detect_and_parse: {e}", exc_info=True)
+            # return the normal text if parsing fails
+            return StreamingParseResult(normal_text=text)
+
+    def _get_value_type(self, func_name: str, key: str, tools: List[Tool]) -> str:
+        """Get parameter type from tool definition, with fallback to auto-detection.
+
+        Args:
+            func_name: Name of the function
+            key: Parameter name
+            tools: List of available tools
+
+        Returns:
+            Type string: 'string', 'number', 'object', 'array', or 'boolean'
+        """
+        arg_type = get_argument_type(func_name, key, tools)
+        if arg_type:
+            return arg_type
+
+        # Improved auto-detection type from value (best effort)
+        value_content = self._current_value.strip() if self._current_value else ""
+
+        if not value_content:
+            return "string"
+
+        # Try to parse as valid JSON first
+        try:
+            parsed = json.loads(value_content)
+            if isinstance(parsed, dict):
+                return "object"
+            elif isinstance(parsed, list):
+                return "array"
+            elif isinstance(parsed, bool):
+                return "boolean"
+            elif isinstance(parsed, (int, float)):
+                return "number"
+            # For string values, check if they look like numbers
+            elif isinstance(parsed, str):
+                if parsed.isdigit() or (
+                    parsed.startswith("-") and parsed[1:].isdigit()
+                ):
+                    return "number"
+                return "string"
+        except json.JSONDecodeError:
+            # Not valid JSON, try heuristic detection
+            first_char = value_content[0] if value_content else ""
+
+            if first_char.isdigit() or first_char in ["-", "."]:
+                return "number"
+            elif first_char in ["{", "["]:
+                return "object"
+            elif first_char in ['"', "'"]:
+                return "string"
+
+        # Default to string (safest fallback)
+        return "string"
+
+    def _format_value_complete(self, value: str, value_type: str) -> str:
+        """Format complete value based on type.
+
+        Args:
+            value: Raw value string
+            value_type: Expected type ('string', 'number', 'object')
+
+        Returns:
+            Properly formatted JSON value string
+        """
+        if value_type == "string":
+            # Ensure proper JSON string formatting with quotes
+            return json.dumps(value, ensure_ascii=False)
+        elif value_type == "number":
+            try:
+                num = _convert_to_number(value.strip() if value else "")
+                return str(num)
+            except (ValueError, AttributeError):
+                # Fallback to string if not a valid number
+                logger.warning(
+                    f"Failed to parse '{value}' as number, treating as string"
+                )
+                return json.dumps(str(value) if value else "", ensure_ascii=False)
+        else:
+            # For object/array types, return as-is (should already be valid JSON)
+            return value
+
+    def _process_xml_to_json_streaming(
+        self, raw_increment: str, func_name: str, tools: List[Tool]
+    ) -> str:
+        """Convert XML increment to JSON streaming output using state machine.
+
+        This method processes XML fragments character by character and converts them
+        to JSON format incrementally. It maintains state across calls to handle
+        partial XML tags and values.
+
+        Args:
+            raw_increment: New XML content to process
+            func_name: Name of the function being called
+            tools: List of available tools for type inference
+
+        Returns:
+            JSON string increment to append to the output
+        """
+        json_output = ""
+
+        for char in raw_increment:
+            self._xml_tag_buffer += char
+
+            if self._stream_state in [StreamState.INIT, StreamState.BETWEEN]:
+                if self._xml_tag_buffer.endswith("<arg_key>"):
+                    self._stream_state = StreamState.IN_KEY
+                    self._current_key = ""
+                    self._xml_tag_buffer = ""
+                    json_output += "{" if self._is_first_param else ", "
+                    self._is_first_param = False
+
+            elif self._stream_state == StreamState.IN_KEY:
+                if self._xml_tag_buffer.endswith("</arg_key>"):
+                    self._current_key = self._xml_tag_buffer[:-10].strip()
+                    self._xml_tag_buffer = ""
+                    self._stream_state = StreamState.WAITING_VALUE
+                    json_output += (
+                        json.dumps(self._current_key, ensure_ascii=False) + ": "
+                    )
+
+            elif self._stream_state == StreamState.WAITING_VALUE:
+                if self._xml_tag_buffer.endswith("<arg_value>"):
+                    self._stream_state = StreamState.IN_VALUE
+                    self._current_value = ""
+                    self._xml_tag_buffer = ""
+                    self._value_started = False
+                    # Determine and cache the value type at the start
+                    self._cached_value_type = self._get_value_type(
+                        func_name, self._current_key, tools
+                    )
+
+            elif self._stream_state == StreamState.IN_VALUE:
+                if self._xml_tag_buffer.endswith("</arg_value>"):
+                    final_value = self._xml_tag_buffer[:-12]
+                    self._current_value += final_value
+
+                    # Use cached value type for consistency
+                    value_type = self._cached_value_type or "string"
+
+                    if self._value_started:
+                        # Output any remaining content
+                        if final_value:
+                            if value_type == "string":
+                                json_output += json.dumps(
+                                    final_value, ensure_ascii=False
+                                )[1:-1]
+                            else:
+                                json_output += final_value
+                        # Always output closing quote for string type when value was started
+                        if value_type == "string":
+                            json_output += '"'
+                    else:
+                        # Value was never started (empty or complete in one chunk)
+                        json_output += self._format_value_complete(
+                            self._current_value, value_type
+                        )
+
+                    self._xml_tag_buffer = ""
+                    self._stream_state = StreamState.BETWEEN
+                    self._current_value = ""
+                    self._value_started = False
+                    self._cached_value_type = None  # Reset cached type
+                else:
+                    closing_tag = "</arg_value>"
+                    is_potential_closing = len(self._xml_tag_buffer) <= len(
+                        closing_tag
+                    ) and closing_tag.startswith(self._xml_tag_buffer)
+
+                    if not is_potential_closing:
+                        content = self._xml_tag_buffer
+                        # Use cached value type for consistency
+                        value_type = self._cached_value_type or "string"
+
+                        if value_type == "string":
+                            if not self._value_started:
+                                json_output += '"'
+                                self._value_started = True
+                            if content:
+                                json_output += json.dumps(content, ensure_ascii=False)[
+                                    1:-1
+                                ]
+                                self._current_value += content
+                                self._xml_tag_buffer = ""
+                        elif value_type == "number":
+                            if content:
+                                if not self._value_started:
+                                    self._value_started = True
+                                json_output += content
+                                self._current_value += content
+                                self._xml_tag_buffer = ""
+                        else:
+                            # For object/array types, output as-is
+                            if content:
+                                if not self._value_started:
+                                    self._value_started = True
+                                json_output += content
+                                self._current_value += content
+                                self._xml_tag_buffer = ""
+
+        return json_output
+
+    def _extract_match_groups(self, match: re.Match) -> tuple[str, str, str]:
+        """Extract function name, arguments and end marker from regex match.
+
+        Args:
+            match: Regex match object
+
+        Returns:
+            (func_name, func_args_raw, is_tool_end)
+        """
+        func_name = match.group(1).strip()
+        func_args_raw = match.group(2).strip() if match.group(2) else ""
+        is_tool_end = match.group(3) or ""
+        return func_name, func_args_raw, is_tool_end
+
+    def _send_tool_name_if_needed(
+        self, func_name: str, has_arg_key: bool, is_tool_end: str
+    ) -> Optional[ToolCallItem]:
+        """Send tool name if needed.
+
+        Args:
+            func_name: Function name
+            has_arg_key: Whether current text contains <arg_key
+            is_tool_end: Whether end marker is encountered
+
+        Returns:
+            Tool call item or None
+        """
+        if self.current_tool_name_sent:
+            return None
+
+        # Function name completeness check
+        is_func_name_complete = has_arg_key or is_tool_end == self.eot_token
+
+        if not is_func_name_complete:
+            return None
+
+        if not func_name:
+            logger.warning("Empty function name detected, skipping tool call")
+            return None
+
+        # Send tool name
+        self.current_tool_name_sent = True
+        self._streamed_raw_length = 0
+        self._reset_streaming_state()
+
+        # Record tool info
+        self.prev_tool_call_arr[self.current_tool_id] = {
+            "name": func_name,
+            "arguments": {},
+        }
+
+        return ToolCallItem(
+            tool_index=self.current_tool_id,
+            name=func_name,
+            parameters="",
+        )
+
+    def _process_arguments_streaming(
+        self, func_name: str, func_args_raw: str, tools: List[Tool]
+    ) -> Optional[ToolCallItem]:
+        """Process streaming arguments.
+
+        Args:
+            func_name: Function name
+            func_args_raw: Raw argument string
+            tools: List of available tools
+
+        Returns:
+            Tool call item with parameter updates or None
+        """
+        current_raw_length = len(func_args_raw)
+
+        if current_raw_length <= self._streamed_raw_length:
+            return None
+
+        # Get new raw XML content
+        raw_increment = func_args_raw[self._streamed_raw_length :]
+
+        # Convert XML to JSON using state machine
+        json_increment = self._process_xml_to_json_streaming(
+            raw_increment, func_name, tools
+        )
+
+        # CRITICAL: Update streamed length BEFORE early return
+        # Even if json_increment is empty, the input has been consumed by the state machine
+        self._streamed_raw_length = current_raw_length
+
+        if not json_increment:
+            return None
+
+        # Update state
+        self._last_arguments += json_increment
+        self.streamed_args_for_tool[self.current_tool_id] += json_increment
+
+        return ToolCallItem(
+            tool_index=self.current_tool_id,
+            name=None,
+            parameters=json_increment,
+        )
+
+    def _finalize_tool_call(
+        self,
+        func_name: str,
+        func_args_raw: str,
+        tools: List[Tool],
+        match_end_pos: int,
+        current_text: str,
+    ) -> List[ToolCallItem]:
+        """Complete tool call processing.
+
+        Args:
+            func_name: Function name
+            func_args_raw: Raw argument string
+            tools: List of available tools
+            match_end_pos: Match end position
+            current_text: Current text
+
+        Returns:
+            List of tool call items to add
+        """
+        calls = []
+
+        # Handle no-arg function or need to close braces
+        if self._is_first_param and not self._sent_empty_object:
+            # No-arg function
+            calls.append(
+                ToolCallItem(
+                    tool_index=self.current_tool_id,
+                    name=None,
+                    parameters="{}",
+                )
+            )
+            self._last_arguments += "{}"
+            self.streamed_args_for_tool[self.current_tool_id] += "{}"
+            self._sent_empty_object = True
+        elif not self._last_arguments.endswith("}") and not self._sent_empty_object:
+            # Need to close brace
+            calls.append(
+                ToolCallItem(
+                    tool_index=self.current_tool_id,
+                    name=None,
+                    parameters="}",
+                )
+            )
+            self._last_arguments += "}"
+            self.streamed_args_for_tool[self.current_tool_id] += "}"
+            self._sent_empty_object = True
+
+        # Parse final arguments
+        if func_args_raw:
+            try:
+                pairs = self.func_arg_regex.findall(func_args_raw)
+                if pairs:
+                    arguments = self._parse_argument_pairs(pairs, func_name, tools)
+                    self.prev_tool_call_arr[self.current_tool_id][
+                        "arguments"
+                    ] = arguments
+            except Exception as e:
+                logger.debug(f"Failed to parse arguments: {e}", exc_info=True)
+
+        # Clean buffer
+        self._buffer = current_text[match_end_pos:]
+
+        # Reset state for next tool call
+        self._tool_call_completed = True
+        self.current_tool_id += 1
+        self._last_arguments = ""
+        self.current_tool_name_sent = False
+        self._streamed_raw_length = 0
+        self._reset_streaming_state()
+
+        return calls
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming incremental parsing tool calls for GLM-4.5 and GLM-4.6 format.
+        Uses a state machine to convert XML to JSON incrementally for true character-by-character streaming.
+        Outputs JSON increments immediately as XML data arrives.
+        """
+        self._buffer += new_text
+        current_text = self._buffer
+
+        # Check if we have a tool call
+        has_tool_call = self.bot_token in current_text
+
+        if not has_tool_call:
+            # Check if buffer could be the start of a tool call
+            # Keep buffer if it could be a partial match of bot_token
+            is_potential_start = any(
+                self.bot_token.startswith(current_text[-i:])
+                for i in range(1, min(len(current_text), len(self.bot_token)) + 1)
+            )
+
+            if not is_potential_start:
+                # Not a potential tool call, return as normal text
+                # Must return the entire buffer (current_text), not just new_text,
+                # because buffer may contain previously accumulated characters like '<'
+                # that turned out not to be part of a tool call
+                output_text = current_text
+                self._buffer = ""
+                if self.eot_token in output_text:
+                    output_text = output_text.replace(self.eot_token, "")
+                return StreamingParseResult(normal_text=output_text)
+            else:
+                # Could be start of tool call, keep buffering
+                return StreamingParseResult(normal_text="", calls=[])
+
+        # Extract any text before the first bot_token and return it as normal_text
+        normal_text = ""
+        first_bot_token_idx = current_text.find(self.bot_token)
+        if first_bot_token_idx > 0:
+            normal_text = current_text[:first_bot_token_idx]
+            current_text = current_text[first_bot_token_idx:]
+            # Update buffer to only include from the bot token onwards
+            self._buffer = current_text
+
+        if not hasattr(self, "_tool_indices"):
+            self._tool_indices = self._get_tool_indices(tools)
+
+        calls: list[ToolCallItem] = []
+        try:
+            # Try to match a partial or complete tool call
+            # Use a single flexible regex pattern that handles all cases
+            partial_match = re.search(
+                r"<tool_call>(.*?)(?:(<arg_key.*?))?(?:(</tool_call>)|$)",
+                current_text,
+                re.DOTALL,
+            )
+
+            if not partial_match:
+                return StreamingParseResult(normal_text=normal_text, calls=[])
+
+            # Extract match groups using helper method
+            func_name, func_args_raw, is_tool_end = self._extract_match_groups(
+                partial_match
+            )
+
+            # Initialize tool call state if needed (keeping existing logic)
+            if self.current_tool_id == -1:
+                self.current_tool_id = 0
+                self.prev_tool_call_arr = []
+                self.streamed_args_for_tool = [""]
+                self._streamed_raw_length = 0
+                self.current_tool_name_sent = False  # Reset for new tool call
+                self._reset_streaming_state()
+            # Check if this is a continuation of an existing tool call or a new one
+            elif not self.current_tool_name_sent:
+                # Only increment tool_id if we're truly starting a NEW tool call
+                # Don't increment if this is just the first time we're processing
+                # a tool call that was received in the buffer
+                # The key insight: only increment when we've COMPLETED a previous tool call
+                # and now see another bot_token in new_text
+                pass  # Remove the problematic auto-increment logic
+
+            # Ensure tracking arrays are large enough (keeping existing logic)
+            while len(self.prev_tool_call_arr) <= self.current_tool_id:
+                self.prev_tool_call_arr.append({})
+            while len(self.streamed_args_for_tool) <= self.current_tool_id:
+                self.streamed_args_for_tool.append("")
+
+            # Determine if function name is complete by checking for <arg_key> in the full text
+            # This is important for streaming scenarios where args come in later chunks
+            has_arg_key = "<arg_key" in current_text
+
+            # Send tool name if needed
+            tool_name_item = self._send_tool_name_if_needed(
+                func_name, has_arg_key, is_tool_end
+            )
+            if tool_name_item:
+                calls.append(tool_name_item)
+
+            # Process streaming arguments if tool name has been sent
+            if self.current_tool_name_sent:
+                arg_item = self._process_arguments_streaming(
+                    func_name, func_args_raw, tools
+                )
+                if arg_item:
+                    calls.append(arg_item)
+
+                # Finalize tool call if end token is encountered
+                if is_tool_end == self.eot_token and not self._tool_call_completed:
+                    finalize_calls = self._finalize_tool_call(
+                        func_name,
+                        func_args_raw,
+                        tools,
+                        partial_match.end(),
+                        current_text,
+                    )
+                    calls.extend(finalize_calls)
+                    return StreamingParseResult(normal_text=normal_text, calls=calls)
+
+        except Exception as e:
+            logger.error(f"Error in parse_streaming_increment: {e}", exc_info=True)
+            return StreamingParseResult(normal_text=current_text)
+
+        return StreamingParseResult(normal_text=normal_text, calls=calls)
+
+    def _parse_argument_pairs(
+        self, pairs: List[Tuple[str, str]], func_name: str, tools: List[Tool]
+    ) -> Dict[str, Any]:
+        """Parse argument key-value pairs with type coercion.
+
+        Args:
+            pairs: List of (key, value) tuples from regex matching
+            func_name: Name of the function
+            tools: List of available tools
+
+        Returns:
+            Dictionary of parsed arguments
+        """
+        arguments = {}
+        for arg_key, arg_value in pairs:
+            arg_key = arg_key.strip()
+            arg_value = arg_value.strip()
+            arg_type = get_argument_type(func_name, arg_key, tools)
+            parsed_value, is_good_json = parse_arguments(arg_value, arg_type)
+
+            if arg_type == "string":
+                # Only convert to string if explicitly defined as string type
+                if isinstance(parsed_value, str):
+                    arguments[arg_key] = parsed_value
+                elif isinstance(parsed_value, (dict, list)):
+                    # If parsed as dict/list but schema says string, convert to JSON string
+                    arguments[arg_key] = json.dumps(parsed_value, ensure_ascii=False)
+                else:
+                    arguments[arg_key] = str(parsed_value)
+            elif arg_type is None:
+                # If type is not defined, keep the parsed value as-is
+                arguments[arg_key] = parsed_value if is_good_json else arg_value
+            else:
+                # For other types (number, object, array, etc.), use parsed value
+                arguments[arg_key] = parsed_value if is_good_json else arg_value
+
+        return arguments
+
+    def supports_structural_tag(self) -> bool:
+        return False
+
+    def structure_info(self) -> _GetInfoFunc:
+        raise NotImplementedError()
diff --git a/sglang/python/sglang/srt/function_call/glm4_moe_detector.py b/sglang/python/sglang/srt/function_call/glm4_moe_detector.py
new file mode 100644
index 0000000000000000000000000000000000000000..0761e24e7cba8778ab0bfafbd83fe77c8f346ef6
--- /dev/null
+++ b/sglang/python/sglang/srt/function_call/glm4_moe_detector.py
@@ -0,0 +1,642 @@
+import ast
+import json
+import logging
+import re
+from enum import Enum
+from typing import Any, Dict, List, Optional, Tuple
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    ToolCallItem,
+    _GetInfoFunc,
+)
+from sglang.srt.function_call.utils import infer_type_from_json_schema
+
+logger = logging.getLogger(__name__)
+
+
+class StreamState(str, Enum):
+    """State machine states for XML to JSON streaming conversion."""
+
+    INIT = "INIT"
+    BETWEEN = "BETWEEN"
+    IN_KEY = "IN_KEY"
+    WAITING_VALUE = "WAITING_VALUE"
+    IN_VALUE = "IN_VALUE"
+
+
+def get_argument_type(
+    func_name: str, arg_key: str, defined_tools: List[Tool]
+) -> Optional[str]:
+    """Get the expected type of a function argument from tool definitions.
+
+    Supports complex JSON Schema definitions including:
+    - Direct type field (including type arrays)
+    - anyOf/oneOf: parameter can be any of multiple types
+    - enum: parameter must be one of enum values
+    - allOf: parameter must satisfy all type definitions
+    - properties: inferred as object type
+    - items: inferred as array type
+
+    Args:
+        func_name: Name of the function/tool
+        arg_key: Name of the argument
+        defined_tools: List of available tools
+
+    Returns:
+        The type string (e.g., 'string', 'number', 'object') or None if not found
+    """
+    name2tool = {tool.function.name: tool for tool in defined_tools}
+    if func_name not in name2tool:
+        return None
+    tool = name2tool[func_name]
+    properties = (tool.function.parameters or {}).get("properties", {})
+    if not isinstance(properties, dict):
+        properties = {}
+    if arg_key not in properties:
+        return None
+
+    # Use new type inference function for complex JSON Schema support
+    return infer_type_from_json_schema(properties[arg_key])
+
+
+def _convert_to_number(value: str) -> Any:
+    """Convert string to appropriate number type (int or float).
+
+    Args:
+        value: String value to convert
+
+    Returns:
+        Converted number or original string if conversion fails
+    """
+    try:
+        if "." in value or "e" in value.lower():
+            return float(value)
+        else:
+            return int(value)
+    except (ValueError, AttributeError):
+        return value
+
+
+def parse_arguments(
+    json_value: str, arg_type: Optional[str] = None
+) -> Tuple[Any, bool]:
+    """Parse argument value with multiple fallback strategies.
+
+    Args:
+        json_value: Raw string value to parse
+        arg_type: Expected type hint ('string', 'number', 'object', etc.)
+
+    Returns:
+        Tuple of (parsed_value, is_valid_json)
+    """
+    # Strategy 1: Direct JSON parsing
+    try:
+        parsed_value = json.loads(json_value)
+
+        # Type coercion for number type
+        if arg_type == "number" and isinstance(parsed_value, str):
+            parsed_value = _convert_to_number(parsed_value)
+
+        return parsed_value, True
+    except (json.JSONDecodeError, ValueError):
+        pass
+
+    # Strategy 2: Unescape and parse
+    try:
+        wrapped = json.loads('{"tmp": "' + json_value + '"}')
+        parsed_value = json.loads(wrapped["tmp"])
+
+        if arg_type == "number" and isinstance(parsed_value, str):
+            parsed_value = _convert_to_number(parsed_value)
+
+        return parsed_value, True
+    except (json.JSONDecodeError, ValueError, KeyError):
+        pass
+
+    # Strategy 3: ast.literal_eval
+    try:
+        parsed_value = ast.literal_eval(json_value)
+        return parsed_value, True
+    except (ValueError, SyntaxError):
+        pass
+
+    # Strategy 4: Treat as string
+    try:
+        quoted_value = json.dumps(str(json_value))
+        return json.loads(quoted_value), True
+    except (json.JSONDecodeError, ValueError):
+        return json_value, False
+
+
+class Glm4MoeDetector(BaseFormatDetector):
+    """
+    Detector for GLM-4.5 and GLM-4.6 models.
+    Assumes function call format (with actual newlines):
+      <tool_call>get_weather
+      <arg_key>city</arg_key>
+      <arg_value>北京</arg_value>
+      <arg_key>date</arg_key>
+      <arg_value>2024-06-27</arg_value>
+      </tool_call>
+
+    Or with literal \n characters (escaped as \\n in the output):
+      <tool_call>get_weather\n<arg_key>city</arg_key>\n<arg_value>北京</arg_value>\n</tool_call>
+
+    Uses a streaming state machine to convert XML to JSON incrementally for maximum speed.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.bot_token = "<tool_call>"
+        self.eot_token = "</tool_call>"
+        self.func_call_regex = r"<tool_call>.*?</tool_call>"
+        self.func_detail_regex = re.compile(
+            r"<tool_call>(.*?)(?:\\n|\n)(.*)</tool_call>", re.DOTALL
+        )
+        self.func_arg_regex = re.compile(
+            r"<arg_key>(.*?)</arg_key>(?:\\n|\s)*<arg_value>(.*?)</arg_value>",
+            re.DOTALL,
+        )
+        self._last_arguments = ""
+        self.current_tool_id = -1
+        self.current_tool_name_sent = False
+        self._streamed_raw_length = 0
+        self._reset_streaming_state()
+
+    def _reset_streaming_state(self) -> None:
+        """Reset the streaming state machine for a new tool call."""
+        self._stream_state = StreamState.INIT
+        self._current_key = ""
+        self._current_value = ""
+        self._xml_tag_buffer = ""
+        self._is_first_param = True
+        self._value_started = False
+        self._cached_value_type: Optional[str] = (
+            None  # Cache the value type for consistency
+        )
+
+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a glm-4.5 / glm-4.6 format tool call."""
+        return self.bot_token in text
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """
+        One-time parsing: Detects and parses tool calls in the provided text.
+
+        :param text: The complete text to parse.
+        :param tools: List of available tools.
+        :return: ParseResult indicating success or failure, consumed text, leftover text, and parsed calls.
+        """
+        idx = text.find(self.bot_token)
+        normal_text = text[:idx].strip() if idx != -1 else text
+        if self.bot_token not in text:
+            return StreamingParseResult(normal_text=normal_text, calls=[])
+        match_result_list = re.findall(self.func_call_regex, text, re.DOTALL)
+        calls = []
+        try:
+            for match_result in match_result_list:
+                # Get function name
+                func_detail = self.func_detail_regex.search(match_result)
+                if func_detail is None:
+                    continue
+                func_name = func_detail.group(1) if func_detail.group(1) else ""
+                func_args = func_detail.group(2) if func_detail.group(2) else ""
+                pairs = self.func_arg_regex.findall(func_args)
+
+                # Parse arguments using shared method
+                arguments = self._parse_argument_pairs(pairs, func_name, tools)
+
+                # construct match_result for parse_base_json
+                match_result = {"name": func_name, "parameters": arguments}
+                calls.extend(self.parse_base_json(match_result, tools))
+            return StreamingParseResult(normal_text=normal_text, calls=calls)
+        except Exception as e:
+            logger.error(f"Error in detect_and_parse: {e}", exc_info=True)
+            # return the normal text if parsing fails
+            return StreamingParseResult(normal_text=text)
+
+    def _get_value_type(self, func_name: str, key: str, tools: List[Tool]) -> str:
+        """Get parameter type from tool definition, with fallback to auto-detection.
+
+        Args:
+            func_name: Name of the function
+            key: Parameter name
+            tools: List of available tools
+
+        Returns:
+            Type string: 'string', 'number', 'object', 'array', or 'boolean'
+        """
+        arg_type = get_argument_type(func_name, key, tools)
+        if arg_type:
+            return arg_type
+
+        # Improved auto-detection type from value (best effort)
+        value_content = self._current_value.strip() if self._current_value else ""
+
+        if not value_content:
+            return "string"
+
+        # Try to parse as valid JSON first
+        try:
+            parsed = json.loads(value_content)
+            if isinstance(parsed, dict):
+                return "object"
+            elif isinstance(parsed, list):
+                return "array"
+            elif isinstance(parsed, bool):
+                return "boolean"
+            elif isinstance(parsed, (int, float)):
+                return "number"
+            # For string values, check if they look like numbers
+            elif isinstance(parsed, str):
+                if parsed.isdigit() or (
+                    parsed.startswith("-") and parsed[1:].isdigit()
+                ):
+                    return "number"
+                return "string"
+        except json.JSONDecodeError:
+            # Not valid JSON, try heuristic detection
+            first_char = value_content[0] if value_content else ""
+
+            if first_char.isdigit() or first_char in ["-", "."]:
+                return "number"
+            elif first_char in ["{", "["]:
+                return "object"
+            elif first_char in ['"', "'"]:
+                return "string"
+
+        # Default to string (safest fallback)
+        return "string"
+
+    def _format_value_complete(self, value: str, value_type: str) -> str:
+        """Format complete value based on type.
+
+        Args:
+            value: Raw value string
+            value_type: Expected type ('string', 'number', 'object')
+
+        Returns:
+            Properly formatted JSON value string
+        """
+        if value_type == "string":
+            # Ensure proper JSON string formatting with quotes
+            return json.dumps(value, ensure_ascii=False)
+        elif value_type == "number":
+            try:
+                num = _convert_to_number(value.strip())
+                return str(num)
+            except (ValueError, AttributeError):
+                # Fallback to string if not a valid number
+                logger.warning(
+                    f"Failed to parse '{value}' as number, treating as string"
+                )
+                return json.dumps(str(value), ensure_ascii=False)
+        else:
+            # For object/array types, return as-is (should already be valid JSON)
+            return value
+
+    def _process_xml_to_json_streaming(
+        self, raw_increment: str, func_name: str, tools: List[Tool]
+    ) -> str:
+        """Convert XML increment to JSON streaming output using state machine.
+
+        This method processes XML fragments character by character and converts them
+        to JSON format incrementally. It maintains state across calls to handle
+        partial XML tags and values.
+
+        Args:
+            raw_increment: New XML content to process
+            func_name: Name of the function being called
+            tools: List of available tools for type inference
+
+        Returns:
+            JSON string increment to append to the output
+        """
+        json_output = ""
+
+        for char in raw_increment:
+            self._xml_tag_buffer += char
+
+            if self._stream_state in [StreamState.INIT, StreamState.BETWEEN]:
+                if self._xml_tag_buffer.endswith("<arg_key>"):
+                    self._stream_state = StreamState.IN_KEY
+                    self._current_key = ""
+                    self._xml_tag_buffer = ""
+                    json_output += "{" if self._is_first_param else ", "
+                    self._is_first_param = False
+
+            elif self._stream_state == StreamState.IN_KEY:
+                if self._xml_tag_buffer.endswith("</arg_key>"):
+                    self._current_key = self._xml_tag_buffer[:-10].strip()
+                    self._xml_tag_buffer = ""
+                    self._stream_state = StreamState.WAITING_VALUE
+                    json_output += (
+                        json.dumps(self._current_key, ensure_ascii=False) + ": "
+                    )
+
+            elif self._stream_state == StreamState.WAITING_VALUE:
+                if self._xml_tag_buffer.endswith("<arg_value>"):
+                    self._stream_state = StreamState.IN_VALUE
+                    self._current_value = ""
+                    self._xml_tag_buffer = ""
+                    self._value_started = False
+                    # Determine and cache the value type at the start
+                    self._cached_value_type = self._get_value_type(
+                        func_name, self._current_key, tools
+                    )
+
+            elif self._stream_state == StreamState.IN_VALUE:
+                if self._xml_tag_buffer.endswith("</arg_value>"):
+                    final_value = self._xml_tag_buffer[:-12]
+                    self._current_value += final_value
+
+                    # Use cached value type for consistency
+                    value_type = self._cached_value_type or "string"
+
+                    if self._value_started:
+                        # Output any remaining content
+                        if final_value:
+                            if value_type == "string":
+                                json_output += json.dumps(
+                                    final_value, ensure_ascii=False
+                                )[1:-1]
+                            else:
+                                json_output += final_value
+                        # Always output closing quote for string type when value was started
+                        if value_type == "string":
+                            json_output += '"'
+                    else:
+                        # Value was never started (empty or complete in one chunk)
+                        json_output += self._format_value_complete(
+                            self._current_value, value_type
+                        )
+
+                    self._xml_tag_buffer = ""
+                    self._stream_state = StreamState.BETWEEN
+                    self._current_value = ""
+                    self._value_started = False
+                    self._cached_value_type = None  # Reset cached type
+                else:
+                    closing_tag = "</arg_value>"
+                    is_potential_closing = len(self._xml_tag_buffer) <= len(
+                        closing_tag
+                    ) and closing_tag.startswith(self._xml_tag_buffer)
+
+                    if not is_potential_closing:
+                        content = self._xml_tag_buffer
+                        # Use cached value type for consistency
+                        value_type = self._cached_value_type or "string"
+
+                        if value_type == "string":
+                            if not self._value_started:
+                                json_output += '"'
+                                self._value_started = True
+                            if content:
+                                json_output += json.dumps(content, ensure_ascii=False)[
+                                    1:-1
+                                ]
+                                self._current_value += content
+                                self._xml_tag_buffer = ""
+                        elif value_type == "number":
+                            if content:
+                                if not self._value_started:
+                                    self._value_started = True
+                                json_output += content
+                                self._current_value += content
+                                self._xml_tag_buffer = ""
+                        else:
+                            # For object/array types, output as-is
+                            if content:
+                                if not self._value_started:
+                                    self._value_started = True
+                                json_output += content
+                                self._current_value += content
+                                self._xml_tag_buffer = ""
+
+        return json_output
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming incremental parsing tool calls for GLM-4.5 and GLM-4.6 format.
+        Uses a state machine to convert XML to JSON incrementally for true character-by-character streaming.
+        Outputs JSON increments immediately as XML data arrives.
+        """
+        self._buffer += new_text
+        current_text = self._buffer
+
+        # Check if we have a tool call
+        has_tool_call = self.bot_token in current_text
+
+        if not has_tool_call:
+            # Check if buffer could be the start of a tool call
+            # Keep buffer if it could be a partial match of bot_token
+            is_potential_start = any(
+                self.bot_token.startswith(current_text[-i:])
+                for i in range(1, min(len(current_text), len(self.bot_token)) + 1)
+            )
+
+            if not is_potential_start:
+                # Not a potential tool call, return as normal text
+                # Must return the entire buffer (current_text), not just new_text,
+                # because buffer may contain previously accumulated characters like '<'
+                # that turned out not to be part of a tool call
+                output_text = current_text
+                self._buffer = ""
+                if self.eot_token in output_text:
+                    output_text = output_text.replace(self.eot_token, "")
+                return StreamingParseResult(normal_text=output_text)
+            else:
+                # Could be start of tool call, keep buffering
+                return StreamingParseResult(normal_text="", calls=[])
+
+        if not hasattr(self, "_tool_indices"):
+            self._tool_indices = self._get_tool_indices(tools)
+
+        calls: list[ToolCallItem] = []
+        try:
+            # Try to match a partial or complete tool call
+            partial_match = re.search(
+                pattern=r"<tool_call>(.*?)(?:\\n|\n)(.*?)(</tool_call>|$)",
+                string=current_text,
+                flags=re.DOTALL,
+            )
+            if partial_match:
+                func_name_raw = partial_match.group(1)
+                func_args_raw = partial_match.group(2)
+                is_tool_end = partial_match.group(3)
+
+                # Only proceed if we have a non-empty function name
+                if func_name_raw is None or not func_name_raw.strip():
+                    # If we only have the start token without a function name,
+                    # continue buffering until we get more content
+                    return StreamingParseResult(normal_text="", calls=[])
+
+                func_name = func_name_raw.strip()
+                func_args_raw = func_args_raw.strip() if func_args_raw else ""
+
+                # Initialize state if this is the first tool call
+                if self.current_tool_id == -1:
+                    self.current_tool_id = 0
+                    self.prev_tool_call_arr = []
+                    self.streamed_args_for_tool = [""]
+                    self._streamed_raw_length = 0
+                    self.current_tool_name_sent = False
+                    self._reset_streaming_state()
+
+                # Ensure we have enough entries in our tracking arrays
+                while len(self.prev_tool_call_arr) <= self.current_tool_id:
+                    self.prev_tool_call_arr.append({})
+                while len(self.streamed_args_for_tool) <= self.current_tool_id:
+                    self.streamed_args_for_tool.append("")
+
+                # Send tool name first if not sent yet
+                if not self.current_tool_name_sent:
+                    calls.append(
+                        ToolCallItem(
+                            tool_index=self.current_tool_id,
+                            name=func_name,
+                            parameters="",
+                        )
+                    )
+                    self.current_tool_name_sent = True
+                    self._streamed_raw_length = 0
+                    self._reset_streaming_state()
+                    # Store the tool call info
+                    self.prev_tool_call_arr[self.current_tool_id] = {
+                        "name": func_name,
+                        "arguments": {},
+                    }
+                else:
+                    # Process XML to JSON streaming
+                    current_raw_length = len(func_args_raw)
+
+                    if current_raw_length > self._streamed_raw_length:
+                        # Get the new raw XML content
+                        raw_increment = func_args_raw[self._streamed_raw_length :]
+
+                        # Convert XML increment to JSON increment using state machine
+                        json_increment = self._process_xml_to_json_streaming(
+                            raw_increment, func_name, tools
+                        )
+
+                        # CRITICAL: Update streamed length BEFORE checking json_increment
+                        # Even if json_increment is empty, the input has been consumed by the state machine
+                        self._streamed_raw_length = current_raw_length
+
+                        if json_increment:
+                            calls.append(
+                                ToolCallItem(
+                                    tool_index=self.current_tool_id,
+                                    name=None,
+                                    parameters=json_increment,
+                                )
+                            )
+                            self._last_arguments += json_increment
+                            self.streamed_args_for_tool[
+                                self.current_tool_id
+                            ] += json_increment
+
+                    if is_tool_end == self.eot_token:
+                        if self._is_first_param:
+                            empty_object = "{}"
+                            calls.append(
+                                ToolCallItem(
+                                    tool_index=self.current_tool_id,
+                                    name=None,
+                                    parameters=empty_object,
+                                )
+                            )
+                            self._last_arguments += empty_object
+                        elif not self._last_arguments.endswith("}"):
+                            closing_brace = "}"
+                            calls.append(
+                                ToolCallItem(
+                                    tool_index=self.current_tool_id,
+                                    name=None,
+                                    parameters=closing_brace,
+                                )
+                            )
+                            self._last_arguments += closing_brace
+                            self.streamed_args_for_tool[
+                                self.current_tool_id
+                            ] += closing_brace
+
+                        try:
+                            pairs = self.func_arg_regex.findall(func_args_raw)
+                            if pairs:
+                                arguments = self._parse_argument_pairs(
+                                    pairs, func_name, tools
+                                )
+                                self.prev_tool_call_arr[self.current_tool_id][
+                                    "arguments"
+                                ] = arguments
+                        except Exception as e:
+                            logger.debug(
+                                f"Failed to parse arguments: {e}", exc_info=True
+                            )
+
+                        # Remove the completed tool call from buffer
+                        self._buffer = current_text[partial_match.end(3) :]
+
+                        result = StreamingParseResult(normal_text="", calls=calls)
+                        self.current_tool_id += 1
+                        self._last_arguments = ""
+                        self.current_tool_name_sent = False
+                        self._streamed_raw_length = 0
+                        self._reset_streaming_state()
+                        return result
+
+            return StreamingParseResult(normal_text="", calls=calls)
+
+        except Exception as e:
+            logger.error(f"Error in parse_streaming_increment: {e}", exc_info=True)
+            return StreamingParseResult(normal_text=current_text)
+
+    def _parse_argument_pairs(
+        self, pairs: List[Tuple[str, str]], func_name: str, tools: List[Tool]
+    ) -> Dict[str, Any]:
+        """Parse argument key-value pairs with type coercion.
+
+        Args:
+            pairs: List of (key, value) tuples from regex matching
+            func_name: Name of the function
+            tools: List of available tools
+
+        Returns:
+            Dictionary of parsed arguments
+        """
+        arguments = {}
+        for arg_key, arg_value in pairs:
+            arg_key = arg_key.strip()
+            arg_value = arg_value.strip()
+            arg_type = get_argument_type(func_name, arg_key, tools)
+            parsed_value, is_good_json = parse_arguments(arg_value, arg_type)
+
+            if arg_type == "string":
+                # Only convert to string if explicitly defined as string type
+                if isinstance(parsed_value, str):
+                    arguments[arg_key] = parsed_value
+                elif isinstance(parsed_value, (dict, list)):
+                    # If parsed as dict/list but schema says string, convert to JSON string
+                    arguments[arg_key] = json.dumps(parsed_value, ensure_ascii=False)
+                else:
+                    arguments[arg_key] = str(parsed_value)
+            elif arg_type is None:
+                # If type is not defined, keep the parsed value as-is
+                arguments[arg_key] = parsed_value if is_good_json else arg_value
+            else:
+                # For other types (number, object, array, etc.), use parsed value
+                arguments[arg_key] = parsed_value if is_good_json else arg_value
+
+        return arguments
+
+    def supports_structural_tag(self) -> bool:
+        return False
+
+    def structure_info(self) -> _GetInfoFunc:
+        raise NotImplementedError()
diff --git a/sglang/python/sglang/srt/function_call/gpt_oss_detector.py b/sglang/python/sglang/srt/function_call/gpt_oss_detector.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3fd5ac61e896728c8d4449b538cc9873917deb2
--- /dev/null
+++ b/sglang/python/sglang/srt/function_call/gpt_oss_detector.py
@@ -0,0 +1,241 @@
+import json
+import logging
+import re
+from typing import List, Optional
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.environ import envs
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    ToolCallItem,
+    _GetInfoFunc,
+)
+from sglang.srt.parser.harmony_parser import HarmonyParser
+
+logger = logging.getLogger(__name__)
+
+
+class GptOssDetector(BaseFormatDetector):
+    """
+    Detector for T4-style function calls using HarmonyParser.
+
+    Handles tool calls in the format:
+    <|channel|>commentary to={namespace.function}<|constrain|>json<|message|>{args}<|call|>
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.harmony_parser = HarmonyParser()
+        self.bot_token = "<|start|>assistant<|channel|>commentary"
+        self.eot_token = "<|call|>"
+
+        # Pattern to extract function name and JSON from tool_call event content
+        self.tool_extract_pattern = re.compile(
+            r"to=([a-zA-Z_][a-zA-Z0-9_.-]*)\s*<\|constrain\|>json<\|message\|>(.*?)(?:<\|call\|>|$)",
+            re.DOTALL,
+        )
+
+    def has_tool_call(self, text: str) -> bool:
+        """Check if text contains TypeScript-style function call markers."""
+        return self.bot_token in text
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """Parse TypeScript-style function calls from complete text."""
+        if not self.has_tool_call(text):
+            return StreamingParseResult(normal_text=text, calls=[])
+
+        # Parse with HarmonyParser
+        events = self.harmony_parser.parse(text)
+        # Flush buffer for complete parsing
+        events += self.harmony_parser.parse("")
+
+        tool_indices = self._get_tool_indices(tools)
+        calls = []
+        normal_parts = []
+        tool_index = 0
+
+        for event in events:
+            if event.event_type == "tool_call":
+                # Extract tool call from event content
+                tool_call = self._extract_tool_call_from_event(
+                    event.raw_text if event.raw_text else event.content,
+                    tool_indices,
+                    tool_index,
+                )
+                if tool_call:
+                    calls.append(tool_call)
+                    tool_index += 1
+            elif event.event_type == "normal":
+                normal_parts.append(event.content)
+            # Ignore reasoning events in function call context
+
+        normal_text = " ".join(normal_parts).strip()
+        return StreamingParseResult(normal_text=normal_text, calls=calls)
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """Parse incremental streaming text for TypeScript-style function calls."""
+        self._buffer += new_text
+
+        # Always use HarmonyParser for parsing to ensure proper filtering
+        events = self.harmony_parser.parse(new_text)
+
+        # If there are no parsed events and the chunk contains no Harmony structural
+        # markers, treat it as plain text and pass it through. This fixes a bug where
+        # normal content was held in the buffer when tools were provided but not used.
+        if not events:
+            has_harmony_markers = any(
+                marker in self._buffer
+                for marker in (
+                    "<|start|>",
+                    "<|channel|>",
+                    "<|message|>",
+                    "<|constrain|>",
+                    "<|end|>",
+                    "<|call|>",
+                    "<|return|>",
+                    "assistantfinal",
+                )
+            )
+            if not has_harmony_markers:
+                # Plain text with no tool markers — emit as normal content
+                out = self._buffer
+                self._buffer = ""
+                return StreamingParseResult(normal_text=out, calls=[])
+
+        # Quick check if we might have tool calls
+        if (
+            "<|channel|>commentary to=" not in self._buffer
+            and not self.current_tool_name_sent
+        ):
+            # No tool calls detected, check for final content
+            if (
+                "<|channel|>final" in self._buffer
+                or "assistantfinal" in self._buffer.lower()
+            ):
+                # Extract normal text from events
+                normal_text = "".join(
+                    [e.content for e in events if e.event_type == "normal"]
+                )
+                if normal_text:
+                    self._buffer = ""
+                    return StreamingParseResult(normal_text=normal_text, calls=[])
+
+            # For other content, extract normal text from events (with filtering applied)
+            normal_text = "".join(
+                [e.content for e in events if e.event_type == "normal"]
+            )
+            if normal_text or events:
+                self._buffer = ""
+                return StreamingParseResult(normal_text=normal_text, calls=[])
+            else:
+                # No events processed, continue buffering
+                return StreamingParseResult(normal_text="", calls=[])
+
+        if not events:
+            # No complete events yet
+            return StreamingParseResult(normal_text="", calls=[])
+
+        # Initialize state if needed
+        if not hasattr(self, "_tool_indices"):
+            self._tool_indices = self._get_tool_indices(tools)
+
+        calls = []
+        normal_text = ""
+
+        for event in events:
+            if event.event_type == "tool_call":
+                # We got a complete tool call from HarmonyParser
+                tool_call_info = self._extract_tool_call_from_event(
+                    event.raw_text if event.raw_text else event.content,
+                    self._tool_indices,
+                    self.current_tool_id if self.current_tool_id >= 0 else 0,
+                )
+
+                if tool_call_info:
+                    # Initialize state if first tool
+                    if self.current_tool_id == -1:
+                        self.current_tool_id = 0
+                        self.prev_tool_call_arr = []
+                        self.streamed_args_for_tool = [""]
+
+                    # Ensure arrays are large enough
+                    while len(self.prev_tool_call_arr) <= self.current_tool_id:
+                        self.prev_tool_call_arr.append({})
+                    while len(self.streamed_args_for_tool) <= self.current_tool_id:
+                        self.streamed_args_for_tool.append("")
+
+                    # Store tool call info
+                    self.prev_tool_call_arr[self.current_tool_id] = {
+                        "name": tool_call_info.name,
+                        "arguments": json.loads(tool_call_info.parameters),
+                    }
+
+                    # Emit the complete tool call at once
+                    # (Could be modified to emit name first, then args, if needed)
+                    calls.append(tool_call_info)
+
+                    # Mark as streamed
+                    self.streamed_args_for_tool[self.current_tool_id] = (
+                        tool_call_info.parameters
+                    )
+
+                    # Move to next tool
+                    self.current_tool_id += 1
+                    self.current_tool_name_sent = False
+
+            elif event.event_type == "normal":
+                normal_text += event.content
+
+        # Clear buffer since HarmonyParser handles buffering
+        self._buffer = ""
+
+        return StreamingParseResult(normal_text=normal_text, calls=calls)
+
+    def _extract_tool_call_from_event(
+        self, content: str, tool_indices: dict, tool_index: int
+    ) -> Optional[ToolCallItem]:
+        """
+        Extract tool call information from HarmonyParser event content.
+
+        Content format: "commentary to=functions.get_weather<|constrain|>json<|message|>{...}"
+        """
+        match = self.tool_extract_pattern.search(content)
+
+        if not match:
+            logger.debug(f"Could not extract tool call from: {content[:100]}")
+            return None
+
+        full_function_name = match.group(1)
+        json_content = match.group(2)
+
+        # Extract function name (last part after .)
+        function_name = (
+            full_function_name.split(".")[-1]
+            if "." in full_function_name
+            else full_function_name
+        )
+
+        # Check if tool exists
+        if function_name not in tool_indices:
+            logger.debug(f"Function {function_name} not in available tools")
+            if not envs.SGLANG_FORWARD_UNKNOWN_TOOLS.get():
+                return None  # Skip unknown tools (default legacy behavior)
+
+        # Parse JSON arguments
+        try:
+            arguments = json.loads(json_content) if json_content.strip() else {}
+        except json.JSONDecodeError as e:
+            logger.debug(f"Failed to parse JSON arguments: {e}")
+            return None
+
+        return ToolCallItem(
+            tool_index=tool_index,
+            name=function_name,
+            parameters=json.dumps(arguments, ensure_ascii=False),
+        )
+
+    def structure_info(self) -> _GetInfoFunc:
+        raise NotImplementedError("structure_info not used with HarmonyParser")
diff --git a/sglang/python/sglang/srt/function_call/hermes_detector.py b/sglang/python/sglang/srt/function_call/hermes_detector.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff55461d8f6d56702ce3d18a8bd3c3ed58af85db
--- /dev/null
+++ b/sglang/python/sglang/srt/function_call/hermes_detector.py
@@ -0,0 +1,120 @@
+import json
+import logging
+import re
+from typing import List
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    StructureInfo,
+    _GetInfoFunc,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class HermesDetector(BaseFormatDetector):
+    """
+    Detector for Hermes tool call format.
+
+    Format:
+        <tool_call>{"name": "...", "arguments": {...}}</tool_call>
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.bot_token = "<tool_call>"
+        self.eot_token = "</tool_call>"
+        self.tool_call_regex = re.compile(
+            r"<tool_call>(.*?)</tool_call>|<tool_call>(.*)", re.DOTALL
+        )
+        self._normal_text_buffer = ""
+
+    def has_tool_call(self, text: str) -> bool:
+        return self.bot_token in text
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """
+        One-time parsing: Detects and parses tool calls in the provided text.
+        """
+        idx = text.find(self.bot_token)
+        normal_text = text[:idx].strip() if idx != -1 else text
+        if self.bot_token not in text:
+            return StreamingParseResult(normal_text=normal_text, calls=[])
+
+        calls = []
+        try:
+            for match in self.tool_call_regex.findall(text):
+                raw = match[0] or match[1]
+                if not raw:
+                    continue
+                parsed = json.loads(raw.strip())
+                if isinstance(parsed, list):
+                    calls.extend(self.parse_base_json(parsed, tools))
+                else:
+                    calls.extend(self.parse_base_json(parsed, tools))
+            return StreamingParseResult(normal_text=normal_text, calls=calls)
+        except Exception as e:
+            logger.error(f"Error in detect_and_parse: {e}")
+            return StreamingParseResult(normal_text=text)
+
+    def _clean_normal_text(self, text: str) -> str:
+        if not text:
+            return text
+
+        self._normal_text_buffer += text
+
+        if self.eot_token in self._normal_text_buffer:
+            cleaned = self._normal_text_buffer.replace(self.eot_token, "")
+            self._normal_text_buffer = ""
+            return cleaned
+
+        partial_len = self._ends_with_partial_token(
+            self._normal_text_buffer, self.eot_token
+        )
+        if partial_len:
+            safe_text = self._normal_text_buffer[:-partial_len]
+            self._normal_text_buffer = self._normal_text_buffer[-partial_len:]
+            return safe_text
+
+        cleaned = self._normal_text_buffer
+        self._normal_text_buffer = ""
+        return cleaned
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming parsing: handle normal text, partial tags, and tool calls.
+        """
+        self._buffer += new_text
+        current_text = self._buffer
+
+        if self.bot_token not in current_text:
+            partial_len = self._ends_with_partial_token(current_text, self.bot_token)
+            if partial_len:
+                safe_text = current_text[:-partial_len]
+                self._buffer = current_text[-partial_len:]
+            else:
+                safe_text = current_text
+                self._buffer = ""
+            return StreamingParseResult(normal_text=self._clean_normal_text(safe_text))
+
+        bot_pos = current_text.find(self.bot_token)
+        if bot_pos > 0:
+            normal_text = current_text[:bot_pos]
+            self._buffer = current_text[bot_pos:]
+            return StreamingParseResult(normal_text=normal_text)
+
+        result = super().parse_streaming_increment(new_text="", tools=tools)
+        if result.normal_text:
+            result.normal_text = self._clean_normal_text(result.normal_text)
+        return result
+
+    def structure_info(self) -> _GetInfoFunc:
+        return lambda name: StructureInfo(
+            begin='<tool_call>{"name":"' + name + '", "arguments":',
+            end="}</tool_call>",
+            trigger="<tool_call>",
+        )
diff --git a/sglang/python/sglang/srt/function_call/internlm_detector.py b/sglang/python/sglang/srt/function_call/internlm_detector.py
new file mode 100644
index 0000000000000000000000000000000000000000..1781dbc5e34e8b5e97c9280490855800b23a43d8
--- /dev/null
+++ b/sglang/python/sglang/srt/function_call/internlm_detector.py
@@ -0,0 +1,248 @@
+# modified from https://github.com/InternLM/lmdeploy/blob/main/lmdeploy/serve/openai/tool_parser/internlm2_parser.py
+
+import json
+import logging
+import re
+from typing import List
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.environ import envs
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    StructureInfo,
+    ToolCallItem,
+    _GetInfoFunc,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class InternlmDetector(BaseFormatDetector):
+    """
+    Detector for InternLM2/Intern-S1 model function call format.
+
+    The InternLM format uses special tokens to delimit function calls
+    with JSON for arguments.
+
+    Format Structure:
+    ```
+    text<|action_start|> <|plugin|>
+    {json}<|action_end|>
+    ```
+
+    Examples:
+    ```
+    What's the weather like?<|action_start|> <|plugin|>
+    {"name": "get_weather", "parameters": {"location": "Tokyo"}}<|action_end|>
+    ```
+
+    Key Components:
+    - Tool Call Start: `<|action_start|> <|plugin|>`
+    - Tool Call End: `<|action_end|>`
+    - Arguments: JSON object with `name` and `parameters`/`arguments`
+    - Supports multiple sequential tool calls in both streaming and non-streaming modes
+
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.bot_token = "<|action_start|> <|plugin|>"
+        self.eot_token = "<|action_end|>"
+        self.position = 0
+
+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains an InternLM format tool call."""
+        has_call = self.bot_token in text
+        return has_call
+
+    def get_arguments(self, obj):
+        """Extract arguments from object, supporting both 'parameters' and 'arguments' keys."""
+        if "parameters" in obj:
+            return obj.get("parameters")
+        elif "arguments" in obj:
+            return obj.get("arguments")
+        return None
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """
+        One-time parsing: Detects and parses tool calls in the provided text.
+        Supports multiple tool calls in the format:
+        <|action_start|> <|plugin|>\n{JSON}<|action_end|>
+
+        :param text: The complete text to parse.
+        :param tools: List of available tools.
+        :return: StreamingParseResult with normal text and parsed tool calls.
+        """
+
+        # Find the first occurrence of tool call marker to extract normal text
+        idx = text.find(self.bot_token)
+        normal_text = text[:idx].strip() if idx != -1 else text
+
+        if self.bot_token not in text:
+            logger.warning("[InternLM Tool Call] No tool call markers found in text")
+            return StreamingParseResult(normal_text=normal_text, calls=[])
+
+        # Use regex to find all tool call blocks
+        # Pattern matches: {self.bot_token}{...}{self.eot_token}
+        tool_call_pattern = (
+            rf"{re.escape(self.bot_token)}\s*(.*?){re.escape(self.eot_token)}"
+        )
+        matches = re.findall(tool_call_pattern, text, re.DOTALL)
+
+        if not matches:
+            logger.warning("[InternLM Tool Call] No complete tool call blocks found")
+            return StreamingParseResult(normal_text=text, calls=[])
+
+        logger.info(f"[InternLM Tool Call] Found {len(matches)} tool call(s)")
+
+        calls = []
+        tool_indices = self._get_tool_indices(tools)
+
+        try:
+            for idx, action_json in enumerate(matches):
+                action_json = action_json.strip()
+
+                try:
+                    # Parse the JSON
+                    action_dict = json.loads(action_json)
+                    name = action_dict.get("name")
+                    parameters = self.get_arguments(action_dict)
+
+                    if not parameters:
+                        parameters = {}
+
+                    logger.info(
+                        f"[InternLM Tool Call] Parsed tool call #{idx+1}: name={name}, "
+                        f"parameters={json.dumps(parameters, ensure_ascii=False)}"
+                    )
+
+                    # Validate tool name
+                    if not (name and name in tool_indices):
+                        logger.warning(
+                            f"[InternLM Tool Call] Model attempted to call undefined function: {name}, "
+                            f"available_tools={list(tool_indices.keys())}"
+                        )
+                        if not envs.SGLANG_FORWARD_UNKNOWN_TOOLS.get():
+                            continue  # Skip this tool call
+
+                    # Create tool call item and add to list
+                    tool_call = ToolCallItem(
+                        tool_index=tool_indices[name],
+                        name=name,
+                        parameters=json.dumps(parameters, ensure_ascii=False),
+                    )
+                    calls.append(tool_call)
+
+                except json.JSONDecodeError as e:
+                    logger.error(
+                        f"[InternLM Tool Call] Failed to parse JSON for tool call #{idx+1}: {e}"
+                    )
+                    continue
+
+            logger.info(
+                f"[InternLM Tool Call] Successfully parsed {len(calls)} tool call(s), "
+                f"normal_text_length={len(normal_text)}"
+            )
+            return StreamingParseResult(normal_text=normal_text, calls=calls)
+
+        except Exception as e:
+            logger.error(
+                f"[InternLM Tool Call] Error in detect_and_parse: {e}", exc_info=True
+            )
+            return StreamingParseResult(normal_text=text, calls=[])
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming incremental parsing for InternLM format.
+
+        Supports a single tool call in streaming mode.
+        """
+        self._buffer += new_text
+        current_text = self._buffer
+
+        # Check if we don't have a tool call start marker
+        start = current_text.find(self.bot_token)
+        if start == -1:
+            # No tool call marker found
+            # If we've already processed tool calls, don't return text again
+            if self.current_tool_id > 0:
+                self._buffer = ""
+                return StreamingParseResult(normal_text="")
+
+            # Check if buffer could be partial start of bot_token
+            if not self._ends_with_partial_token(current_text, self.bot_token):
+                # Not a partial match, return as normal text
+                normal_text = current_text
+                self._buffer = ""
+                # Clean up any stray end tokens
+                if self.eot_token in normal_text:
+                    normal_text = normal_text.replace(self.eot_token, "")
+                return StreamingParseResult(normal_text=normal_text)
+            else:
+                # Might be partial start token, keep buffering
+                return StreamingParseResult()
+
+        # Check if we have a complete tool call (with end marker)
+        end = current_text.find(self.eot_token)
+        if end != -1:
+            # We have a complete tool call
+            # Initialize state if this is the first tool call
+            if self.current_tool_id == -1:
+                self.current_tool_id = 0
+                self.prev_tool_call_arr = []
+                self.streamed_args_for_tool = [""]
+
+            # Ensure we have enough entries in our tracking arrays
+            while len(self.prev_tool_call_arr) <= self.current_tool_id:
+                self.prev_tool_call_arr.append({})
+            while len(self.streamed_args_for_tool) <= self.current_tool_id:
+                self.streamed_args_for_tool.append("")
+
+            # Use detect_and_parse on the complete tool call
+            complete_section = current_text[: end + len(self.eot_token)]
+            result = self.detect_and_parse(complete_section, tools=tools)
+
+            if result.calls:
+                # Update the tool call index
+                result.calls[0].tool_index = self.current_tool_id
+                # Store the parsed tool call for reference
+                self.prev_tool_call_arr[self.current_tool_id] = {
+                    "name": result.calls[0].name,
+                    "arguments": json.loads(result.calls[0].parameters),
+                }
+                self.streamed_args_for_tool[self.current_tool_id] = result.calls[
+                    0
+                ].parameters
+                # Increment tool ID for next tool call
+                self.current_tool_id += 1
+
+            # Remove the completed tool call from buffer
+            self._buffer = current_text[end + len(self.eot_token) :]
+            return result
+
+        # We have bot_token but no eot_token yet - handle partial tool call streaming
+        # Extract normal text before the tool call
+        normal_text = current_text[:start]
+        # Keep the tool call part in buffer
+        self._buffer = current_text[start:]
+        return StreamingParseResult(normal_text=normal_text)
+
+    def structure_info(self) -> _GetInfoFunc:
+        """
+        Return structure information for constrained generation.
+
+        For InternLM format, the structure is:
+        - begin: <|action_start|> <|plugin|>\n
+        - end: <|action_end|>
+        - trigger: the begin token
+        """
+        return lambda name: StructureInfo(
+            begin='<|action_start|> <|plugin|>\n{"name": "'
+            + name
+            + '", "parameters": ',
+            end="}<|action_end|>",
+            trigger="<|action_start|> <|plugin|>",
+        )
diff --git a/sglang/python/sglang/srt/function_call/json_array_parser.py b/sglang/python/sglang/srt/function_call/json_array_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..e38e3b1b05632f2f5a9280230224488da8e71c64
--- /dev/null
+++ b/sglang/python/sglang/srt/function_call/json_array_parser.py
@@ -0,0 +1,51 @@
+from typing import List
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import StreamingParseResult
+
+
+class JsonArrayParser(BaseFormatDetector):
+    """
+    Parser for JSON array tool calls when JSON schema constraints are active.
+
+    This parser is used when tool_choice="required" or a specific tool is named,
+    bypassing model-specific parsers in favor of direct JSON array parsing.
+    """
+
+    def __init__(self):
+        super().__init__()
+        # Configure for JSON array parsing
+        self.bot_token = "["
+        self.eot_token = "]"
+        self.tool_call_separator = ","
+
+    def has_tool_call(self, text: str) -> bool:
+        """
+        Check if the given text contains a JSON tool call (array or single object).
+        """
+        return "[" in text or "{" in text
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """
+        Parse JSON tool calls using the base class implementation.
+        """
+        raise NotImplementedError(
+            "Detect and parse not supported for JSON schema constraints."
+        )
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming incremental parsing with tool validation.
+        """
+        return super().parse_streaming_increment(new_text, tools)
+
+    def structure_info(self) -> callable:
+        """
+        Return a function that creates StructureInfo for constrained generation.
+        This is not used for JSON schema constraints as they are handled
+        by the constraint backends directly.
+        """
+        raise NotImplementedError("structure_info not used for JSON schema constraints")
diff --git a/sglang/python/sglang/srt/function_call/kimik2_detector.py b/sglang/python/sglang/srt/function_call/kimik2_detector.py
new file mode 100644
index 0000000000000000000000000000000000000000..2480fe80ad76334feef01f03ef48acd839de167f
--- /dev/null
+++ b/sglang/python/sglang/srt/function_call/kimik2_detector.py
@@ -0,0 +1,241 @@
+import json
+import logging
+import re
+from typing import List
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    StructureInfo,
+    ToolCallItem,
+    _GetInfoFunc,
+)
+from sglang.srt.function_call.utils import _is_complete_json
+
+logger = logging.getLogger(__name__)
+
+
+class KimiK2Detector(BaseFormatDetector):
+    """
+    Detector for Kimi K2 model function call format.
+
+    Format Structure:
+    ```
+    <|tool_calls_section_begin|>
+    <|tool_call_begin|>functions.{func_name}:{index}<|tool_call_argument_begin|>{json_args}<|tool_call_end|>
+    <|tool_calls_section_end|>
+    ```
+
+    Reference: https://huggingface.co/moonshotai/Kimi-K2-Instruct/blob/main/docs/tool_call_guidance.md
+    """
+
+    def __init__(self):
+        super().__init__()
+
+        self.bot_token: str = "<|tool_calls_section_begin|>"
+        self.eot_token: str = "<|tool_calls_section_end|>"
+
+        self.tool_call_start_token: str = "<|tool_call_begin|>"
+        self.tool_call_end_token: str = "<|tool_call_end|>"
+
+        self.tool_call_regex = re.compile(
+            r"<\|tool_call_begin\|>\s*(?P<tool_call_id>[\w\.]+:\d+)\s*<\|tool_call_argument_begin\|>\s*(?P<function_arguments>\{.*?\})\s*<\|tool_call_end\|>",
+            re.DOTALL,
+        )
+
+        self.stream_tool_call_portion_regex = re.compile(
+            r"<\|tool_call_begin\|>\s*(?P<tool_call_id>[\w\.]+:\d+)\s*<\|tool_call_argument_begin\|>\s*(?P<function_arguments>\{.*)",
+            re.DOTALL,
+        )
+
+        self._last_arguments = ""
+
+        # Robust parser for ids like "functions.search:0" or fallback "search:0"
+        self.tool_call_id_regex = re.compile(
+            r"^(?:functions\.)?(?P<name>[\w\.]+):(?P<index>\d+)$"
+        )
+
+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a KimiK2 format tool call."""
+        return self.bot_token in text
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """
+        One-time parsing: Detects and parses tool calls in the provided text.
+
+        :param text: The complete text to parse.
+        :param tools: List of available tools.
+        :return: ParseResult indicating success or failure, consumed text, leftover text, and parsed calls.
+        """
+        if self.bot_token not in text:
+            return StreamingParseResult(normal_text=text, calls=[])
+        try:
+            # there are two possible captures - between tags, or between a
+            # tag and end-of-string so the result of
+            # findall is an array of tuples where one is a function call and
+            # the other is None
+            function_call_tuples = self.tool_call_regex.findall(text)
+
+            logger.debug("function_call_tuples: %s", function_call_tuples)
+
+            tool_calls = []
+            for match in function_call_tuples:
+                function_id, function_args = match
+                m = self.tool_call_id_regex.match(function_id)
+                if not m:
+                    logger.warning("Unexpected tool_call_id format: %s", function_id)
+                    continue
+                function_name = m.group("name")
+                function_idx = int(m.group("index"))
+
+                logger.debug(f"function_name {function_name}")
+
+                tool_calls.append(
+                    ToolCallItem(
+                        tool_index=function_idx,
+                        name=function_name,
+                        parameters=function_args,
+                    )
+                )
+
+            content = text[: text.find(self.bot_token)]
+            return StreamingParseResult(normal_text=content, calls=tool_calls)
+
+        except Exception as e:
+            logger.error(f"Error in detect_and_parse: {e}")
+            # return the normal text if parsing fails
+            return StreamingParseResult(normal_text=text)
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming incremental parsing tool calls for KimiK2 format.
+        """
+        self._buffer += new_text
+        current_text = self._buffer
+
+        # Check if we have a tool call (either the start token or individual tool call)
+        has_tool_call = (
+            self.bot_token in current_text or self.tool_call_start_token in current_text
+        )
+
+        if not has_tool_call:
+            self._buffer = ""
+            for e_token in [self.eot_token, self.tool_call_end_token]:
+                if e_token in new_text:
+                    new_text = new_text.replace(e_token, "")
+            return StreamingParseResult(normal_text=new_text)
+
+        if not hasattr(self, "_tool_indices"):
+            self._tool_indices = self._get_tool_indices(tools)
+
+        calls: list[ToolCallItem] = []
+        try:
+            match = self.stream_tool_call_portion_regex.search(current_text)
+            if match:
+                function_id = match.group("tool_call_id")
+                function_args = match.group("function_arguments")
+
+                m = self.tool_call_id_regex.match(function_id)
+                if not m:
+                    logger.warning("Unexpected tool_call_id format: %s", function_id)
+                    return StreamingParseResult(normal_text="", calls=calls)
+                function_name = m.group("name")
+
+                # Initialize state if this is the first tool call
+                if self.current_tool_id == -1:
+                    self.current_tool_id = 0
+                    self.prev_tool_call_arr = []
+                    self.streamed_args_for_tool = [""]
+
+                # Ensure we have enough entries in our tracking arrays
+                while len(self.prev_tool_call_arr) <= self.current_tool_id:
+                    self.prev_tool_call_arr.append({})
+                while len(self.streamed_args_for_tool) <= self.current_tool_id:
+                    self.streamed_args_for_tool.append("")
+
+                if not self.current_tool_name_sent:
+                    calls.append(
+                        ToolCallItem(
+                            tool_index=self.current_tool_id,
+                            name=function_name,
+                            parameters="",
+                        )
+                    )
+                    self.current_tool_name_sent = True
+                    # Store the tool call info for serving layer completions endpoint
+                    self.prev_tool_call_arr[self.current_tool_id] = {
+                        "name": function_name,
+                        "arguments": {},
+                    }
+                else:
+                    argument_diff = (
+                        function_args[len(self._last_arguments) :]
+                        if function_args.startswith(self._last_arguments)
+                        else function_args
+                    )
+
+                    parsed_args_diff = argument_diff.split("<|tool_call_end|>", 1)[0]
+
+                    if parsed_args_diff:
+
+                        calls.append(
+                            ToolCallItem(
+                                tool_index=self.current_tool_id,
+                                name=None,
+                                parameters=parsed_args_diff,
+                            )
+                        )
+                        self._last_arguments += argument_diff
+                        self.streamed_args_for_tool[
+                            self.current_tool_id
+                        ] += parsed_args_diff
+
+                    parsed_args = function_args.split("<|tool_call_end|>", 1)[0]
+                    if _is_complete_json(parsed_args):
+                        try:
+                            parsed_args = json.loads(parsed_args)
+                            self.prev_tool_call_arr[self.current_tool_id][
+                                "arguments"
+                            ] = parsed_args
+                        except json.JSONDecodeError:
+                            pass
+
+                        # Find the end of the current tool call and remove only that part from buffer
+                        tool_call_end_pattern = (
+                            r"<\|tool_call_begin\|>.*?<\|tool_call_end\|>"
+                        )
+                        match = re.search(
+                            tool_call_end_pattern, current_text, re.DOTALL
+                        )
+                        if match:
+                            # Remove the completed tool call from buffer, keep any remaining content
+                            self._buffer = current_text[match.end() :]
+                        else:
+                            self._buffer = ""
+
+                        result = StreamingParseResult(normal_text="", calls=calls)
+                        self.current_tool_id += 1
+                        self._last_arguments = ""
+                        self.current_tool_name_sent = False
+                        return result
+
+            return StreamingParseResult(normal_text="", calls=calls)
+
+        except Exception as e:
+            logger.error(f"Error in parse_streaming_increment: {e}")
+            return StreamingParseResult(normal_text=current_text)
+
+    def structure_info(self) -> _GetInfoFunc:
+        """Return function that creates StructureInfo for guided generation."""
+
+        def get_info(name: str) -> StructureInfo:
+            return StructureInfo(
+                begin=f"<|tool_calls_section_begin|><|tool_call_begin|>functions.{name}:0<|tool_call_argument_begin|>",
+                end="<|tool_call_end|><|tool_calls_section_end|>",
+                trigger="<|tool_calls_section_begin|>",
+            )
+
+        return get_info
diff --git a/sglang/python/sglang/srt/function_call/lfm2_detector.py b/sglang/python/sglang/srt/function_call/lfm2_detector.py
new file mode 100644
index 0000000000000000000000000000000000000000..80ef9c45233379a66cebe4b84cad4d26b211e8cf
--- /dev/null
+++ b/sglang/python/sglang/srt/function_call/lfm2_detector.py
@@ -0,0 +1,387 @@
+"""
+Detector for LFM2 (Liquid Foundation Model 2) function call format.
+
+Format Structure (Pythonic style):
+```
+<|tool_call_start|>[function_name(arg1="value1", arg2="value2")]<|tool_call_end|>
+```
+
+Multiple tool calls:
+```
+<|tool_call_start|>[func1(arg="val"), func2(arg="val")]<|tool_call_end|>
+```
+
+Also supports JSON format:
+```
+<|tool_call_start|>[{"name": "func_name", "arguments": {...}}]<|tool_call_end|>
+```
+"""
+
+import ast
+import json
+import logging
+import re
+from typing import Any, Dict, List, Optional, Tuple
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.environ import envs
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    StructureInfo,
+    ToolCallItem,
+    _GetInfoFunc,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class Lfm2Detector(BaseFormatDetector):
+    """
+    Detector for LFM2 (Liquid Foundation Model 2) function call format.
+
+    Supports both Pythonic and JSON formats:
+
+    Pythonic:
+    ```
+    <|tool_call_start|>[calculator(expression="5 * 7")]<|tool_call_end|>
+    ```
+
+    JSON:
+    ```
+    <|tool_call_start|>[{"name": "calculator", "arguments": {"expression": "5 * 7"}}]<|tool_call_end|>
+    ```
+    """
+
+    def __init__(self):
+        """
+        Initializes the detector with necessary state variables.
+        """
+        super().__init__()
+        self.bot_token = "<|tool_call_start|>"
+        self.eot_token = "<|tool_call_end|>"
+        self.tool_call_separator = ""
+
+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains an LFM2 format tool call."""
+        return self.bot_token in text
+
+    def _get_parameter_value(self, val: ast.AST) -> Any:
+        """
+        Extract Python literal value from AST node.
+
+        Handles constants, dicts, and lists recursively.
+        Reuses pattern from PythonicDetector.
+        """
+        if isinstance(val, ast.Constant):
+            return val.value
+        elif isinstance(val, ast.Dict):
+            return {
+                self._get_parameter_value(k): self._get_parameter_value(v)
+                for k, v in zip(val.keys, val.values)
+                if k is not None  # Handle {**kwargs} case where key is None
+            }
+        elif isinstance(val, ast.List):
+            return [self._get_parameter_value(v) for v in val.elts]
+        elif isinstance(val, ast.Tuple):
+            return tuple(self._get_parameter_value(v) for v in val.elts)
+        elif isinstance(val, ast.Name):
+            # Handle True, False, None as names in older Python
+            if val.id == "True":
+                return True
+            elif val.id == "False":
+                return False
+            elif val.id == "None":
+                return None
+            else:
+                raise ValueError(f"Unsupported name reference: {val.id}")
+        elif isinstance(val, ast.UnaryOp) and isinstance(val.op, ast.USub):
+            # Handle negative numbers like -5
+            inner = self._get_parameter_value(val.operand)
+            if isinstance(inner, (int, float)):
+                return -inner
+            raise ValueError(f"Cannot negate non-numeric value: {inner}")
+        else:
+            raise ValueError(
+                f"Tool call arguments must be literals, got: {type(val).__name__}"
+            )
+
+    def _parse_pythonic_call(
+        self, call: ast.Call, call_index: int, tool_indices: Dict[str, int]
+    ) -> Optional[ToolCallItem]:
+        """
+        Parse a single AST Call node into a ToolCallItem.
+
+        Args:
+            call: AST Call node representing a function call
+            call_index: Index of this call in the list of calls
+            tool_indices: Mapping of tool names to their indices
+
+        Returns:
+            ToolCallItem if successful, None if the call should be skipped
+        """
+        if not isinstance(call.func, ast.Name):
+            logger.warning(
+                f"Tool call function must be a simple name, got: {type(call.func).__name__}"
+            )
+            return None
+
+        function_name = call.func.id
+
+        # Validate that the function exists in the tools
+        if function_name not in tool_indices:
+            logger.warning(
+                f"Model attempted to call undefined function: {function_name}"
+            )
+            if not envs.SGLANG_FORWARD_UNKNOWN_TOOLS.get():
+                return None  # Skip unknown tools (default legacy behavior)
+
+        # Parse arguments
+        arguments = {}
+        for keyword in call.keywords:
+            if keyword.arg is None:
+                # **kwargs unpacking - skip for now
+                logger.warning("Tool call with **kwargs unpacking is not supported")
+                continue
+            try:
+                arguments[keyword.arg] = self._get_parameter_value(keyword.value)
+            except ValueError as e:
+                logger.warning(f"Failed to parse argument {keyword.arg}: {e}")
+                return None
+
+        return ToolCallItem(
+            tool_index=call_index,  # Use the call index in the response, not tool position
+            name=function_name,
+            parameters=json.dumps(arguments, ensure_ascii=False),
+        )
+
+    def _parse_pythonic_content(
+        self, content: str, tools: List[Tool]
+    ) -> Tuple[List[ToolCallItem], str]:
+        """
+        Parse Pythonic format tool calls using AST.
+
+        Args:
+            content: The content between tool call tags (without the tags)
+            tools: List of available tools
+
+        Returns:
+            Tuple of (list of parsed calls, error message if any)
+        """
+        content = content.strip()
+        tool_indices = self._get_tool_indices(tools)
+
+        try:
+            module = ast.parse(content)
+            parsed = getattr(module.body[0], "value", None) if module.body else None
+
+            if parsed is None:
+                return [], "Empty or invalid Python expression"
+
+            # Handle both single call and list of calls
+            if isinstance(parsed, ast.List):
+                call_nodes = parsed.elts
+            elif isinstance(parsed, ast.Call):
+                call_nodes = [parsed]
+            else:
+                return (
+                    [],
+                    f"Expected function call or list, got: {type(parsed).__name__}",
+                )
+
+            # Validate all elements are calls
+            if not all(isinstance(e, ast.Call) for e in call_nodes):
+                return [], "Not all elements in list are function calls"
+
+            calls = []
+            for call_index, call in enumerate(call_nodes):
+                item = self._parse_pythonic_call(call, call_index, tool_indices)
+                if item is not None:
+                    calls.append(item)
+
+            return calls, ""
+
+        except SyntaxError as e:
+            return [], f"Python syntax error: {e}"
+        except Exception as e:
+            logger.exception("Unexpected error in pythonic tool call parsing")
+            return [], f"Unexpected error: {e}"
+
+    def _parse_json_content(
+        self, content: str, tools: List[Tool]
+    ) -> Tuple[List[ToolCallItem], str]:
+        """
+        Parse JSON format tool calls.
+
+        Uses parse_base_json from BaseFormatDetector for consistent handling
+        of SGLANG_FORWARD_UNKNOWN_TOOLS and tool validation.
+
+        Args:
+            content: The content between tool call tags (without the tags)
+            tools: List of available tools
+
+        Returns:
+            Tuple of (list of parsed calls, error message if any)
+        """
+        content = content.strip()
+
+        try:
+            parsed = json.loads(content)
+            # parse_base_json handles list/dict normalization, tool validation,
+            # and SGLANG_FORWARD_UNKNOWN_TOOLS consistently with other detectors
+            calls = self.parse_base_json(parsed, tools)
+            return calls, ""
+
+        except json.JSONDecodeError as e:
+            return [], f"JSON parse error: {e}"
+
+    def _parse_tool_calls_content(
+        self, content: str, tools: List[Tool]
+    ) -> List[ToolCallItem]:
+        """
+        Parse the content between tool call tags.
+        Handles both JSON and Pythonic formats.
+        """
+        content = content.strip()
+
+        # First, try JSON format (faster check)
+        if content.startswith("[{") or content.startswith("{"):
+            calls, error = self._parse_json_content(content, tools)
+            if calls:
+                return calls
+            # If JSON parsing failed but it looked like JSON, log the error
+            if error:
+                logger.debug(f"JSON parsing failed: {error}, trying Pythonic format")
+
+        # Try Pythonic format
+        calls, error = self._parse_pythonic_content(content, tools)
+        if calls:
+            return calls
+
+        if error:
+            logger.warning(f"Failed to parse tool calls: {error}")
+
+        return []
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """
+        One-time parsing: Detects and parses tool calls in the provided text.
+        """
+        idx = text.find(self.bot_token)
+        normal_text = text[:idx].strip() if idx != -1 else text
+
+        if self.bot_token not in text:
+            return StreamingParseResult(normal_text=normal_text, calls=[])
+
+        # Find all <|tool_call_start|>...<|tool_call_end|> blocks
+        pattern = rf"{re.escape(self.bot_token)}(.*?){re.escape(self.eot_token)}"
+        match_result_list = re.findall(pattern, text, re.DOTALL)
+
+        calls = []
+        for match_result in match_result_list:
+            parsed_calls = self._parse_tool_calls_content(match_result, tools)
+            calls.extend(parsed_calls)
+
+        return StreamingParseResult(normal_text=normal_text, calls=calls)
+
+    def _strip_special_tokens(self, text: str) -> str:
+        """Remove special tokens from text."""
+        return text.replace(self.bot_token, "").replace(self.eot_token, "")
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming incremental parsing for LFM2 tool calls.
+
+        This implementation properly handles Pythonic format by:
+        1. Buffering until we see complete <|tool_call_start|>[...]<|tool_call_end|>
+        2. Emitting normal text before tool calls immediately
+        3. Parsing complete tool call blocks using detect_and_parse
+
+        Based on PythonicDetector streaming logic.
+        """
+        self._buffer += new_text
+
+        # Check for partial bot_token at the end
+        partial_bot = self._ends_with_partial_token(self._buffer, self.bot_token)
+        partial_eot = self._ends_with_partial_token(self._buffer, self.eot_token)
+
+        # Find bot_token position
+        bot_pos = self._buffer.find(self.bot_token)
+
+        if bot_pos == -1:
+            # No tool call start found
+            if partial_bot:
+                # Might be partial bot_token, hold back that part
+                safe_text = self._buffer[:-partial_bot]
+                self._buffer = self._buffer[-partial_bot:]
+                return StreamingParseResult(normal_text=safe_text)
+            else:
+                # No tool call, emit all as normal text
+                normal_text = self._strip_special_tokens(self._buffer)
+                self._buffer = ""
+                return StreamingParseResult(normal_text=normal_text)
+
+        # We have bot_token - extract any normal text before it
+        normal_text_before = self._buffer[:bot_pos] if bot_pos > 0 else ""
+
+        # Look for the end token
+        eot_pos = self._buffer.find(self.eot_token, bot_pos + len(self.bot_token))
+
+        if eot_pos == -1:
+            # No end token yet - check if we might have a partial one
+            if partial_eot:
+                # Hold back the partial token, but we need to keep buffering
+                # Just emit any normal text before the tool call
+                if normal_text_before:
+                    self._buffer = self._buffer[bot_pos:]
+                    return StreamingParseResult(normal_text=normal_text_before)
+                # Keep buffering
+                return StreamingParseResult(normal_text="")
+
+            # No end token and no partial - keep buffering but emit normal text
+            if normal_text_before:
+                self._buffer = self._buffer[bot_pos:]
+                return StreamingParseResult(normal_text=normal_text_before)
+
+            # Just keep buffering
+            return StreamingParseResult(normal_text="")
+
+        # We have a complete tool call block
+        tool_call_block = self._buffer[bot_pos : eot_pos + len(self.eot_token)]
+        remaining = self._buffer[eot_pos + len(self.eot_token) :]
+
+        # Parse the complete block
+        result = self.detect_and_parse(tool_call_block, tools)
+
+        # Update buffer with remaining text
+        self._buffer = remaining
+
+        # Add any normal text before the tool call
+        if normal_text_before:
+            result.normal_text = normal_text_before + (result.normal_text or "")
+
+        return result
+
+    def supports_structural_tag(self) -> bool:
+        """
+        Return False because LFM2 uses Pythonic format which is not JSON-compatible.
+
+        structural_tag only supports JSON-compatible content between begin and end,
+        so it cannot parse Pythonic function call syntax like `func(arg="val")`.
+        """
+        return False
+
+    def structure_info(self) -> _GetInfoFunc:
+        """
+        Return structure info for constrained generation.
+
+        Note: This is provided for completeness but won't be used since
+        supports_structural_tag() returns False.
+        """
+        return lambda name: StructureInfo(
+            begin="<|tool_call_start|>[" + name + "(",
+            end=")]<|tool_call_end|>",
+            trigger="<|tool_call_start|>",
+        )
diff --git a/sglang/python/sglang/srt/function_call/llama32_detector.py b/sglang/python/sglang/srt/function_call/llama32_detector.py
new file mode 100644
index 0000000000000000000000000000000000000000..381bf6aff5d9c5cacfd41fa79c4752eb0de6cb5d
--- /dev/null
+++ b/sglang/python/sglang/srt/function_call/llama32_detector.py
@@ -0,0 +1,144 @@
+import ast
+import json
+import logging
+import re
+from typing import List
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    StructureInfo,
+    _GetInfoFunc,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class Llama32Detector(BaseFormatDetector):
+    """
+    Detector for Llama 3.2 models with json tool call format.
+
+    Format Structure:
+    ```
+    <python_tag>{"name":"xxx", "arguments":{...}}
+    ```
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.bot_token = "<|python_tag|>"
+        # NOTE: technically Llama3.2 doesn't support well with parallel tool calls
+        # They need specific prompt engineering to support parallel tool calls
+        # Here we use ';' as the separator, which might have compatibility issues
+        # if users define to use a different separator in their prompt
+        self.tool_call_separator = ";"
+
+    def _convert_python_dict_to_json(self, text: str) -> str:
+        """Convert Python dict strings to JSON format."""
+        try:
+            parsed = ast.literal_eval(text.strip())
+            if isinstance(parsed, dict):
+                return json.dumps(parsed, ensure_ascii=False)
+        except:
+            pass
+        return text
+
+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a Llama 3.2 format tool call."""
+        # depending on the prompt format the Llama model may or may not
+        # prefix the output with the <|python_tag|> token
+        return "<|python_tag|>" in text or text.startswith("{")
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """Parse function calls from text, handling multiple JSON objects."""
+        if "<|python_tag|>" not in text and not text.startswith("{"):
+            return StreamingParseResult(normal_text=text, calls=[])
+
+        if "<|python_tag|>" in text:
+            normal_text, action_text = text.split("<|python_tag|>", maxsplit=1)
+        else:
+            normal_text, action_text = "", text
+
+        decoder = json.JSONDecoder()
+        idx = 0
+        safe_idx = idx  # the index of the last valid JSON object
+        all_actions = []
+        action_text_len = len(action_text)
+        while idx < action_text_len:
+            try:
+                obj, end = decoder.raw_decode(action_text[idx:])
+                all_actions.append(obj)
+                idx += end + len(self.tool_call_separator)
+                safe_idx = idx
+            except json.JSONDecodeError:
+                # Try Python dict conversion as fallback
+                try:
+                    dict_end = idx
+                    brace_count = 0
+                    for i in range(idx, action_text_len):
+                        if action_text[i] == "{":
+                            brace_count += 1
+                        elif action_text[i] == "}":
+                            brace_count -= 1
+                            if brace_count == 0:
+                                dict_end = i + 1
+                                break
+
+                    if dict_end > idx:
+                        potential_dict = action_text[idx:dict_end]
+                        json_version = self._convert_python_dict_to_json(potential_dict)
+                        if json_version != potential_dict:
+                            obj, _ = decoder.raw_decode(json_version)
+                            all_actions.append(obj)
+                            idx = dict_end + len(self.tool_call_separator)
+                            safe_idx = idx
+                            continue
+                except:
+                    pass
+
+                next_obj_start = action_text.find('{"name":', idx + 1)
+                if next_obj_start == -1:
+                    break
+                idx = next_obj_start
+
+        # Only process if we found valid JSON objects
+        calls = self.parse_base_json(all_actions, tools) if all_actions else []
+        # Use safe_idx to avoid idx containing the last part of an invalid JSON object
+        trailing_text = (
+            action_text[safe_idx:].strip() if safe_idx < action_text_len else ""
+        )
+        return StreamingParseResult(
+            normal_text=normal_text + trailing_text, calls=calls
+        )
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """Override to handle Python dict format in streaming."""
+        # First try with converted Python dict
+        self._buffer += new_text
+        converted_buffer = self._buffer
+
+        # Convert Python dict syntax to JSON
+        converted_buffer = re.sub(r"'([^']*)':", r'"\1":', converted_buffer)
+        converted_buffer = re.sub(r":\s*'([^']*)'", r': "\1"', converted_buffer)
+
+        # Temporarily replace buffer for parsing
+        original_buffer = self._buffer
+        self._buffer = converted_buffer
+
+        try:
+            result = super().parse_streaming_increment("", tools)
+            return result
+        except:
+            # Fall back to original buffer
+            self._buffer = original_buffer
+            return super().parse_streaming_increment(new_text, tools)
+
+    def structure_info(self) -> _GetInfoFunc:
+        return lambda name: StructureInfo(
+            begin='<|python_tag|>{"name":"' + name + '", "arguments":',
+            end="}",
+            trigger="<|python_tag|>",
+        )
diff --git a/sglang/python/sglang/srt/function_call/mimo_detector.py b/sglang/python/sglang/srt/function_call/mimo_detector.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9cef1c8912f075f7ac9af7aebc48bb8be9fa30c
--- /dev/null
+++ b/sglang/python/sglang/srt/function_call/mimo_detector.py
@@ -0,0 +1,281 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import ast
+import html
+import json
+import logging
+import re
+from typing import Any, Dict, List
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.environ import envs
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import StreamingParseResult, _GetInfoFunc
+
+logger = logging.getLogger(__name__)
+
+
+def _get_param_type(func_name: str, param_name: str, tools: List[Tool]) -> str:
+    """Get parameter type from tool schema."""
+    for tool in tools:
+        if tool.function.name == func_name:
+            props = tool.function.parameters.get("properties", {})
+            if param_name in props:
+                return props[param_name].get("type", "string")
+    return "string"
+
+
+def _convert_param_value(
+    param_value: str, param_name: str, func_name: str, tools: List[Tool]
+) -> Any:
+    """
+    Convert parameter value based on its type in the schema.
+    Adapted from vllm-project/vllm (vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py)
+    """
+    param_value = html.unescape(param_value)
+
+    # Handle null value for any type
+    if param_value.lower() == "null":
+        return None
+
+    param_type = _get_param_type(func_name, param_name, tools)
+
+    if param_type in ["string", "str", "text", "varchar", "char", "enum"]:
+        return param_value
+    elif (
+        param_type.startswith("int")
+        or param_type.startswith("integer")
+        or param_type.startswith("uint")
+        or param_type.startswith("long")
+        or param_type.startswith("short")
+        or param_type.startswith("unsigned")
+    ):
+        try:
+            return int(param_value)
+        except (ValueError, TypeError):
+            logger.warning(
+                "Parsed value '%s' of parameter '%s' is not an "
+                "integer in tool '%s', degenerating to string.",
+                param_value,
+                param_name,
+                func_name,
+            )
+            return param_value
+    elif param_type.startswith("num") or param_type.startswith("float"):
+        try:
+            float_param_value = float(param_value)
+            return (
+                float_param_value
+                if float_param_value - int(float_param_value) != 0
+                else int(float_param_value)
+            )
+        except (ValueError, TypeError):
+            logger.warning(
+                "Parsed value '%s' of parameter '%s' is not a float "
+                "in tool '%s', degenerating to string.",
+                param_value,
+                param_name,
+                func_name,
+            )
+            return param_value
+    elif param_type in ["boolean", "bool", "binary"]:
+        param_value = param_value.lower()
+        if param_value not in ["true", "false"]:
+            logger.warning(
+                "Parsed value '%s' of parameter '%s' is not a boolean "
+                "(`true` or `false`) in tool '%s', degenerating to "
+                "false.",
+                param_value,
+                param_name,
+                func_name,
+            )
+        return param_value == "true"
+    else:
+        if (
+            param_type in ["object", "array", "arr"]
+            or param_type.startswith("dict")
+            or param_type.startswith("list")
+        ):
+            try:
+                param_value = json.loads(param_value)
+                return param_value
+            except (json.JSONDecodeError, TypeError, ValueError):
+                logger.warning(
+                    "Parsed value '%s' of parameter '%s' cannot be "
+                    "parsed with json.loads in tool '%s', will try "
+                    "other methods to parse it.",
+                    param_value,
+                    param_name,
+                    func_name,
+                )
+        try:
+            param_value = ast.literal_eval(param_value)  # safer
+        except (ValueError, SyntaxError, TypeError):
+            logger.warning(
+                "Parsed value '%s' of parameter '%s' cannot be "
+                "converted via Python `ast.literal_eval()` in tool "
+                "'%s', degenerating to string.",
+                param_value,
+                param_name,
+                func_name,
+            )
+        return param_value
+
+
+class MiMoDetector(BaseFormatDetector):
+    """
+    Detector for MiMo function call format.
+
+    Format:
+        <tool_call>
+        <function=execute_bash>
+        <parameter=command>pwd && ls</parameter>
+        </function>
+        </tool_call>
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.bot_token = "<tool_call>"
+        self.eot_token = "</tool_call>"
+        self.tool_call_regex = re.compile(r"<tool_call>(.*?)</tool_call>", re.DOTALL)
+        self.func_regex = re.compile(r"<function=([^>]+)>(.*?)</function>", re.DOTALL)
+        self.param_regex = re.compile(
+            r"<parameter=([^>]+)>(.*?)</parameter>", re.DOTALL
+        )
+
+    def has_tool_call(self, text: str) -> bool:
+        return self.bot_token in text
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """Parse complete text for tool calls."""
+        idx = text.find(self.bot_token)
+        if idx == -1:
+            return StreamingParseResult(normal_text=text, calls=[])
+
+        normal_text = text[:idx]
+        tool_indices = self._get_tool_indices(tools)
+
+        calls = []
+        last_end = idx
+
+        for match in self.tool_call_regex.finditer(text):
+            tool_call_body = match.group(1)
+
+            parsed = self._parse_tool_call(tool_call_body, tools)
+
+            if parsed:
+                func_name = parsed.get("name")
+                if func_name not in tool_indices:
+                    # Unknown function
+                    logger.warning(f"Unknown function: {func_name}")
+                    if not envs.SGLANG_FORWARD_UNKNOWN_TOOLS.get():
+                        # Return tool call block as normal text
+                        normal_text += text[last_end : match.end()]
+                        last_end = match.end()
+                        continue
+                calls.extend(self.parse_base_json(parsed, tools))
+
+            last_end = match.end()
+
+        return StreamingParseResult(normal_text=normal_text, calls=calls)
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming parsing: buffer until complete tool call block.
+        """
+        self._buffer += new_text
+        current_text = self._buffer
+
+        start = current_text.find(self.bot_token)
+        if start == -1:
+            if self.current_tool_id > 0:
+                # Already processing tool calls, keep buffering
+                # (more tool calls might come, don't discard text yet)
+                return StreamingParseResult(normal_text="")
+            else:
+                # No tool calls seen yet, return as normal text
+                self._buffer = ""
+                return StreamingParseResult(normal_text=current_text)
+
+        # Find end token AFTER the start token
+        end = current_text.find(self.eot_token, start)
+        if end == -1:
+            # Incomplete tool call, return text before start and keep buffering
+            normal_text = current_text[:start]
+            self._buffer = current_text[start:]
+            return StreamingParseResult(normal_text=normal_text)
+
+        # Parse the complete tool call block
+        result = self.detect_and_parse(current_text[: end + len(self.eot_token)], tools)
+
+        if result.calls:
+            # Valid tool call - initialize tracking if first one
+            if self.current_tool_id == -1:
+                self.current_tool_id = 0
+                self.prev_tool_call_arr = []
+                self.streamed_args_for_tool = [""]
+
+            while len(self.prev_tool_call_arr) <= self.current_tool_id:
+                self.prev_tool_call_arr.append({})
+            while len(self.streamed_args_for_tool) <= self.current_tool_id:
+                self.streamed_args_for_tool.append("")
+
+            call = result.calls[0]
+            self.prev_tool_call_arr[self.current_tool_id] = {
+                "name": call.name,
+                "arguments": json.loads(call.parameters) if call.parameters else {},
+            }
+            self.streamed_args_for_tool[self.current_tool_id] = call.parameters
+            call.tool_index = self.current_tool_id
+            self.current_tool_id += 1
+
+        self._buffer = current_text[end + len(self.eot_token) :]
+        return result
+
+    def _parse_tool_call(
+        self, tool_call_body: str, tools: List[Tool]
+    ) -> Dict[str, Any]:
+        """
+        Parse content inside <tool_call>...</tool_call>.
+
+        Structure:
+            tool_call_body contains: <function=name>...params...</function>
+        """
+        # Match complete <function=name>body</function> block
+        func_match = self.func_regex.search(tool_call_body)
+        if not func_match:
+            return None
+
+        func_name = func_match.group(1).strip()
+        func_body = func_match.group(2)
+
+        params = {}
+        for param_match in self.param_regex.finditer(func_body):
+            param_name = param_match.group(1).strip()
+            param_value = param_match.group(2)
+            params[param_name] = _convert_param_value(
+                param_value, param_name, func_name, tools
+            )
+
+        return {"name": func_name, "parameters": params}
+
+    def supports_structural_tag(self) -> bool:
+        return False
+
+    def structure_info(self) -> _GetInfoFunc:
+        raise NotImplementedError
diff --git a/sglang/python/sglang/srt/function_call/minimax_m2.py b/sglang/python/sglang/srt/function_call/minimax_m2.py
new file mode 100644
index 0000000000000000000000000000000000000000..56078aa0d2dd335f7eb0fd0d4229a9197a2de168
--- /dev/null
+++ b/sglang/python/sglang/srt/function_call/minimax_m2.py
@@ -0,0 +1,522 @@
+import json
+import logging
+import re
+from typing import Any, Dict, List, Tuple
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    ToolCallItem,
+    _GetInfoFunc,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class MinimaxM2Detector(BaseFormatDetector):
+    """
+    Detector for MiniMax M2 models.
+    Assumes function call format:
+        <minimax:tool_call>
+        <invoke name="func1">
+        <parameter name="param1">value1</parameter>
+        <parameter name="param2">value2</parameter>
+        </invoke>
+        </minimax:tool_call>
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.tool_call_start_token: str = "<minimax:tool_call>"
+        self.tool_call_end_token: str = "</minimax:tool_call>"
+        self.tool_call_prefix: str = '<invoke name="'
+        self.tool_call_function_end_token: str = "</invoke>"
+        self.tool_call_regex = re.compile(
+            r"<minimax:tool_call>(.*?)</minimax:tool_call>|<minimax:tool_call>(.*?)$",
+            re.DOTALL,
+        )
+        self.tool_call_function_regex = re.compile(
+            r"<invoke name=\"(.*?)</invoke>|<invoke name=\"(.*)$", re.DOTALL
+        )
+        self.tool_call_parameter_regex = re.compile(
+            r"<parameter name=\"(.*?)</parameter>|<parameter name=\"(.*?)$", re.DOTALL
+        )
+        self._buf: str = ""
+
+        # Streaming state variables
+        self._current_function_name: str = ""
+        self._current_parameters: Dict[str, Any] = {}
+        self._streamed_parameters: Dict[str, str] = (
+            {}
+        )  # Track what parameter content we've streamed
+        self._in_tool_call: bool = False
+        self._function_name_sent: bool = False
+
+    def has_tool_call(self, text: str) -> bool:
+        return self.tool_call_start_token in text
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        normal, calls = self._extract(text, tools)
+        return StreamingParseResult(normal_text=normal, calls=calls)
+
+    def _convert_param_value(self, value: str, param_type: str) -> Any:
+        """Convert parameter value to the correct type (legacy single-type version)."""
+        return self._convert_param_value_with_types(value, [param_type])
+
+    def _extract_types_from_schema(self, schema: Any) -> list[str]:
+        """
+        Extract all possible types from a JSON schema definition.
+        Handles anyOf, oneOf, allOf, type arrays, and enum fields.
+
+        Args:
+            schema: The JSON schema definition for a parameter
+
+        Returns:
+            List of type strings (e.g., ["string", "integer", "null"])
+        """
+        if schema is None:
+            return ["string"]
+
+        if not isinstance(schema, dict):
+            return ["string"]
+
+        types: set[str] = set()
+
+        # Handle direct "type" field
+        if "type" in schema:
+            type_value = schema["type"]
+            if isinstance(type_value, str):
+                types.add(type_value)
+            elif isinstance(type_value, list):
+                for t in type_value:
+                    if isinstance(t, str):
+                        types.add(t)
+
+        # Handle enum - infer types from enum values
+        if "enum" in schema and isinstance(schema["enum"], list) and schema["enum"]:
+            for value in schema["enum"]:
+                if value is None:
+                    types.add("null")
+                elif isinstance(value, bool):
+                    types.add("boolean")
+                elif isinstance(value, int):
+                    types.add("integer")
+                elif isinstance(value, float):
+                    types.add("number")
+                elif isinstance(value, str):
+                    types.add("string")
+                elif isinstance(value, list):
+                    types.add("array")
+                elif isinstance(value, dict):
+                    types.add("object")
+
+        # Handle anyOf, oneOf, allOf - recursively extract types
+        for choice_field in ("anyOf", "oneOf", "allOf"):
+            if choice_field in schema and isinstance(schema[choice_field], list):
+                for choice in schema[choice_field]:
+                    extracted = self._extract_types_from_schema(choice)
+                    types.update(extracted)
+
+        # If no types found, default to string
+        if not types:
+            return ["string"]
+
+        return list(types)
+
+    def _convert_param_value_with_types(
+        self, value: str, param_types: list[str]
+    ) -> Any:
+        """
+        Convert parameter value to the correct type based on a list of possible types.
+        Tries each type in order until one succeeds.
+
+        Args:
+            value: The string value to convert
+            param_types: List of possible type strings
+
+        Returns:
+            The converted value
+        """
+        if value.lower() == "null":
+            return None
+
+        # Normalize types
+        normalized_types = [t.lower() for t in param_types]
+
+        # Try null first if it's in the list
+        if "null" in normalized_types or value.lower() in ("null", "none", "nil"):
+            return None
+
+        # Try each type in order of preference (most specific first, string as fallback)
+        # Priority: integer > number > boolean > object > array > string
+        type_priority = [
+            "integer",
+            "int",
+            "number",
+            "float",
+            "boolean",
+            "bool",
+            "object",
+            "array",
+            "string",
+            "str",
+            "text",
+        ]
+
+        for param_type in type_priority:
+            if param_type not in normalized_types:
+                continue
+
+            if param_type in ["string", "str", "text"]:
+                return value
+            elif param_type in ["integer", "int"]:
+                try:
+                    return int(value)
+                except (ValueError, TypeError):
+                    continue
+            elif param_type in ["number", "float"]:
+                try:
+                    val = float(value)
+                    return val if val != int(val) else int(val)
+                except (ValueError, TypeError):
+                    continue
+            elif param_type in ["boolean", "bool"]:
+                lower_val = value.lower().strip()
+                if lower_val in ["true", "1", "yes", "on"]:
+                    return True
+                elif lower_val in ["false", "0", "no", "off"]:
+                    return False
+                continue
+            elif param_type in ["object", "array"]:
+                try:
+                    return json.loads(value)
+                except json.JSONDecodeError:
+                    continue
+
+        # Fallback: try JSON parse, then return as string
+        try:
+            return json.loads(value)
+        except json.JSONDecodeError:
+            return value
+
+    def _get_param_types_from_config(
+        self, param_name: str, param_config: dict
+    ) -> list[str]:
+        """
+        Get parameter types from parameter configuration.
+        Handles anyOf, oneOf, allOf, and direct type definitions.
+
+        Args:
+            param_name: The name of the parameter
+            param_config: The properties dict from the tool schema
+
+        Returns:
+            List of type strings
+        """
+        if param_name not in param_config:
+            return ["string"]
+
+        param_schema = param_config[param_name]
+        if not isinstance(param_schema, dict):
+            return ["string"]
+
+        return self._extract_types_from_schema(param_schema)
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        self._buf += new_text
+        normal = ""
+        calls: List[ToolCallItem] = []
+
+        # Build tool indices for validation
+        if not hasattr(self, "_tool_indices"):
+            self._tool_indices = self._get_tool_indices(tools)
+
+        while True:
+            # If we're not in a tool call and don't see a start token, return normal text
+            if not self._in_tool_call and self.tool_call_start_token not in self._buf:
+                normal += self._buf
+                self._buf = ""
+                break
+
+            # Look for tool call start
+            if not self._in_tool_call:
+                s = self._buf.find(self.tool_call_start_token)
+                if s == -1:
+                    normal += self._buf
+                    self._buf = ""
+                    break
+
+                normal += self._buf[:s]
+                self._buf = self._buf[s:]
+
+                self._in_tool_call = True
+                self._function_name_sent = False
+                self._current_function_name = ""
+                self._current_parameters = {}
+                self._streamed_parameters = {}
+
+                # Remove the start token
+                self._buf = self._buf[len(self.tool_call_start_token) :]
+                continue
+
+            # We're in a tool call, try to parse function name if not sent yet
+            if not self._function_name_sent:
+                # Look for function name pattern: <invoke name=name>
+                function_match = re.search(r"<invoke name=\"([^>]+)\">", self._buf)
+                if function_match:
+                    function_name = function_match.group(1).strip()
+
+                    # Validate function name
+                    if function_name in self._tool_indices:
+                        self._current_function_name = function_name
+                        self._function_name_sent = True
+
+                        # Initialize tool call tracking
+                        if self.current_tool_id == -1:
+                            self.current_tool_id = 0
+
+                        # Ensure tracking arrays are large enough
+                        while len(self.prev_tool_call_arr) <= self.current_tool_id:
+                            self.prev_tool_call_arr.append({})
+                        while len(self.streamed_args_for_tool) <= self.current_tool_id:
+                            self.streamed_args_for_tool.append("")
+
+                        # Store tool call info
+                        self.prev_tool_call_arr[self.current_tool_id] = {
+                            "name": function_name,
+                            "arguments": {},
+                        }
+
+                        # Send tool name with empty parameters
+                        calls.append(
+                            ToolCallItem(
+                                tool_index=self.current_tool_id,
+                                name=function_name,
+                                parameters="",
+                            )
+                        )
+
+                        # Remove the processed function declaration
+                        self._buf = self._buf[function_match.end() :]
+                        continue
+                    else:
+                        # Invalid function name, reset state
+                        logger.warning(f"Invalid function name: {function_name}")
+                        self._reset_streaming_state()
+                        normal += self._buf
+                        self._buf = ""
+                        break
+                else:
+                    # Function name not complete yet, wait for more text
+                    break
+
+            # Parse parameters incrementally
+            if self._function_name_sent:
+                # Process parameters and get any calls to emit
+                parameter_calls = self._parse_and_stream_parameters(self._buf, tools)
+                calls.extend(parameter_calls)
+
+                # Check if tool call is complete
+                if self.tool_call_function_end_token in self._buf:
+                    end_pos = self._buf.find(self.tool_call_function_end_token)
+
+                    # Add closing brace to complete the JSON object
+                    current_streamed = self.streamed_args_for_tool[self.current_tool_id]
+                    if current_streamed:
+                        # Count opening and closing braces to check if JSON is complete
+                        open_braces = current_streamed.count("{")
+                        close_braces = current_streamed.count("}")
+                        if open_braces > close_braces:
+                            calls.append(
+                                ToolCallItem(
+                                    tool_index=self.current_tool_id,
+                                    name=None,
+                                    parameters="}",
+                                )
+                            )
+                            self.streamed_args_for_tool[self.current_tool_id] = (
+                                current_streamed + "}"
+                            )
+
+                    # Complete the tool call
+                    self._buf = self._buf[
+                        end_pos + len(self.tool_call_function_end_token) :
+                    ]
+                    self._reset_streaming_state(True)
+                    self.current_tool_id += 1
+                    continue
+                else:
+                    # Tool call not complete yet, wait for more text
+                    break
+
+        return StreamingParseResult(normal_text=normal, calls=calls)
+
+    def _parse_and_stream_parameters(
+        self, text_to_parse: str, tools: List[Tool]
+    ) -> List[ToolCallItem]:
+        """
+        Parse complete parameter blocks from text and return any tool call items to emit.
+
+        This method:
+        1. Finds all complete <parameter> blocks
+        2. Parses them into a dictionary
+        3. Compares with current parameters and generates diff if needed
+        4. Updates internal state
+
+        Args:
+            text_to_parse: The text to search for parameter blocks
+
+        Returns:
+            List of ToolCallItem objects to emit (may be empty)
+        """
+        calls: List[ToolCallItem] = []
+
+        # Find all complete parameter patterns
+        param_matches = list(
+            re.finditer(
+                r"<parameter name=\"([^>]+)\">(.*?)</parameter>",
+                text_to_parse,
+                re.DOTALL,
+            )
+        )
+
+        # Build new parameters dictionary
+        new_params = {}
+        for match in param_matches:
+            param_name = match.group(1).strip()
+            param_value = match.group(2)
+            new_params[param_name] = self._parse_parameter(
+                self._current_function_name, param_name, param_value, tools
+            )
+
+        # Calculate parameter diff to stream with proper incremental JSON building
+        if new_params != self._current_parameters:
+            previous_args_json = self.streamed_args_for_tool[self.current_tool_id]
+
+            # Build incremental JSON properly
+            if not self._current_parameters:
+                # First parameter(s) - start JSON object but don't close it yet
+                items = []
+                for key, value in new_params.items():
+                    items.append(
+                        f"{json.dumps(key, ensure_ascii=False)}: {json.dumps(value, ensure_ascii=False)}"
+                    )
+                json_fragment = "{" + ", ".join(items)
+
+                calls.append(
+                    ToolCallItem(
+                        tool_index=self.current_tool_id,
+                        name=None,
+                        parameters=json_fragment,
+                    )
+                )
+                self.streamed_args_for_tool[self.current_tool_id] = json_fragment
+
+            else:
+                # Additional parameters - add them incrementally
+                new_keys = set(new_params.keys()) - set(self._current_parameters.keys())
+                if new_keys:
+                    # Build the continuation part (no closing brace yet)
+                    continuation_parts = []
+                    for key in new_keys:
+                        value = new_params[key]
+                        continuation_parts.append(
+                            f"{json.dumps(key, ensure_ascii=False)}: {json.dumps(value, ensure_ascii=False)}"
+                        )
+
+                    json_fragment = ", " + ", ".join(continuation_parts)
+
+                    calls.append(
+                        ToolCallItem(
+                            tool_index=self.current_tool_id,
+                            name=None,
+                            parameters=json_fragment,
+                        )
+                    )
+                    self.streamed_args_for_tool[self.current_tool_id] = (
+                        previous_args_json + json_fragment
+                    )
+
+            # Update current state
+            self._current_parameters = new_params
+            self.prev_tool_call_arr[self.current_tool_id]["arguments"] = new_params
+
+        return calls
+
+    def _reset_streaming_state(self, still_in_tool_call: bool = False):
+        """Reset streaming state for the next tool call"""
+        self._in_tool_call = still_in_tool_call
+        self._function_name_sent = False
+        self._current_function_name = ""
+        self._current_parameters = {}
+        self._streamed_parameters = {}
+        self.current_tool_name_sent = False
+
+    def _extract(self, text: str, tools: List[Tool]) -> Tuple[str, List[ToolCallItem]]:
+        normal_parts: List[str] = []
+        calls: List[ToolCallItem] = []
+        cursor = 0
+        while True:
+            s = text.find(self.tool_call_start_token, cursor)
+            if s == -1:
+                normal_parts.append(text[cursor:])
+                break
+            normal_parts.append(text[cursor:s])
+            e = text.find(self.tool_call_end_token, s)
+            if e == -1:
+                normal_parts.append(text[s:])
+                break
+            block = text[s : e + len(self.tool_call_end_token)]
+            cursor = e + len(self.tool_call_end_token)
+            calls.extend(self._parse_block(block, tools))
+        return "".join(normal_parts), calls
+
+    def _parse_block(self, block: str, tools: List[Tool]) -> List[ToolCallItem]:
+        res: List[ToolCallItem] = []
+        for m in self.tool_call_function_regex.findall(block):
+            txt = m[0] if m[0] else m[1]
+            if '">' not in txt:
+                continue
+            idx = txt.index('">')
+            fname = txt[:idx].strip()
+            body = txt[idx + 2 :]
+            params: Dict[str, Any] = {}
+            for pm in self.tool_call_parameter_regex.findall(body):
+                ptxt = pm[0] if pm[0] else pm[1]
+                if '">' not in ptxt:
+                    continue
+                pidx = ptxt.index('">')
+                pname = ptxt[:pidx].strip()
+                pval = ptxt[pidx + 2 :].lstrip("\n").rstrip("\n")
+                params[pname] = self._parse_parameter(fname, pname, pval, tools)
+            raw = {"name": fname, "arguments": params}
+            try:
+                # TODO: fix idx in function call, the index for a function
+                # call will always be -1 in parse_base_json
+                res.extend(self.parse_base_json(raw, tools))
+            except Exception:
+                logger.warning("invalid tool call for %s dropped", fname)
+        return res
+
+    def _parse_parameter(
+        self, fname: str, pname: str, pval: str, tools: List[Tool]
+    ) -> Any:
+        param_config = {}
+        for tool in tools:
+            if tool.function.name == fname and tool.function.parameters is not None:
+                parameters = tool.function.parameters
+                if isinstance(parameters, dict) and "properties" in parameters:
+                    param_config = parameters["properties"]
+                    break
+
+        param_type = self._get_param_types_from_config(pname, param_config)
+        return self._convert_param_value_with_types(pval, param_type)
+
+    def supports_structural_tag(self) -> bool:
+        return False
+
+    def structure_info(self) -> _GetInfoFunc:
+        raise NotImplementedError
diff --git a/sglang/python/sglang/srt/function_call/mistral_detector.py b/sglang/python/sglang/srt/function_call/mistral_detector.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1268b90fa0aa30c6c0b7bf4d813a267152e696d
--- /dev/null
+++ b/sglang/python/sglang/srt/function_call/mistral_detector.py
@@ -0,0 +1,335 @@
+import json
+import logging
+from typing import Any, List, Optional, Tuple
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    StructureInfo,
+    ToolCallItem,
+    _GetInfoFunc,
+)
+from sglang.srt.function_call.utils import _is_complete_json
+
+logger = logging.getLogger(__name__)
+
+
+class MistralDetector(BaseFormatDetector):
+    """
+    Detector for Mistral tool/function call formats.
+
+    Supported formats:
+
+    1) JSON-array format:
+       `[TOOL_CALLS] [{"name": "...", "arguments": {...}}, ...]`
+
+    2) Compact format (common in newer templates/models, especially in streaming):
+       `[TOOL_CALLS]tool_name[ARGS]{...}`
+       (also tolerates missing delimiters like `]` after `[TOOL_CALLS` and/or `[ARGS]` while streaming)
+
+    Reference: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3?chat_template=default
+    """
+
+    def __init__(self):
+        """Initialize tokens and streaming state."""
+        super().__init__()
+        # Canonical Mistral prefix for JSON-array tool calls.
+        self.bot_token = "[TOOL_CALLS] ["
+        # Common marker shared by both JSON-array and compact formats.
+        self._tool_calls_marker = "[TOOL_CALLS"
+        self.eot_token = "]"
+        self.tool_call_separator = ", "
+
+    def has_tool_call(self, text: str) -> bool:
+        """Return True if the text contains either supported tool-call marker."""
+        return self._tool_calls_marker in text
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """
+        One-time parsing: Detects and parses tool calls in the provided text.
+
+        :param text: The complete text to parse.
+        :param tools: List of available tools.
+        :return: ParseResult indicating success or failure, consumed text, leftover text, and parsed calls.
+        """
+        marker_idx = text.find(self._tool_calls_marker)
+        if marker_idx == -1:
+            return StreamingParseResult(normal_text=text, calls=[])
+
+        normal_text = text[:marker_idx].strip()
+        tool_part = text[marker_idx:]
+
+        # Canonical: `[TOOL_CALLS] [{...}, ...]`
+        if self.bot_token in tool_part:
+            json_array_str = self._extract_json_array(tool_part)
+            if not json_array_str:
+                return StreamingParseResult(normal_text=normal_text, calls=[])
+
+            calls: list = []
+            try:
+                function_call_arr = json.loads(json_array_str)
+                if not isinstance(function_call_arr, list):
+                    function_call_arr = [function_call_arr]
+                calls = self.parse_base_json(function_call_arr, tools)
+            except json.JSONDecodeError as e:
+                logger.warning(
+                    f"Failed to parse JSON part: {json_array_str}, JSON parse error: {str(e)}"
+                )
+            json_pos = tool_part.find(json_array_str) if json_array_str else -1
+            trailing_text = (
+                tool_part[json_pos + len(json_array_str) :].strip()
+                if json_pos != -1
+                else ""
+            )
+            combined_normal = (
+                (normal_text + " " + trailing_text).strip()
+                if trailing_text
+                else normal_text
+            )
+            return StreamingParseResult(normal_text=combined_normal, calls=calls)
+
+        # Compact: `[TOOL_CALLS]tool_name[ARGS]{...}`
+        parsed = self._try_parse_compact_args_format(tool_part)
+        if not parsed:
+            return StreamingParseResult(normal_text=normal_text, calls=[])
+        func_name, args_obj, consumed = parsed
+
+        calls = self.parse_base_json({"name": func_name, "arguments": args_obj}, tools)
+        trailing_text = tool_part[consumed:].strip()
+        combined_normal = (
+            (normal_text + " " + trailing_text).strip()
+            if trailing_text
+            else normal_text
+        )
+        return StreamingParseResult(normal_text=combined_normal, calls=calls)
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming parsing for both JSON-array and compact formats.
+
+        For the compact format, this buffers until the JSON arguments payload is complete,
+        then emits two items: tool name (with empty parameters) and a full arguments JSON
+        chunk (OpenAI streaming semantics).
+        """
+        self._buffer += new_text
+        current_text = self._buffer
+
+        # No marker: either flush as normal text or keep buffering a partial marker.
+        if self._tool_calls_marker not in current_text:
+            if not self._ends_with_partial_token(self._buffer, self._tool_calls_marker):
+                normal_text = self._buffer
+                self._buffer = ""
+                if self.eot_token in normal_text:
+                    normal_text = normal_text.replace(self.eot_token, "")
+                return StreamingParseResult(normal_text=normal_text)
+            return StreamingParseResult()
+
+        # If there's leading normal text before the marker, stream it out first.
+        marker_pos = current_text.find(self._tool_calls_marker)
+        if marker_pos > 0:
+            normal_text = current_text[:marker_pos]
+            self._buffer = current_text[marker_pos:]
+            return StreamingParseResult(normal_text=normal_text)
+
+        # Build tool indices if not already built.
+        if not hasattr(self, "_tool_indices"):
+            self._tool_indices = self._get_tool_indices(tools)
+
+        # Try compact first; JSON-array requires `] [` and often arrives later in streaming.
+        compact = self._try_parse_compact_args_format(current_text)
+        if compact:
+            func_name, args_obj, consumed = compact
+            if func_name not in self._tool_indices:
+                # Unknown tool: treat as normal text and reset state.
+                normal_text = self._buffer
+                self._buffer = ""
+                return StreamingParseResult(normal_text=normal_text)
+
+            # Initialize state if this is the first tool call.
+            if self.current_tool_id == -1:
+                self.current_tool_id = 0
+                self.prev_tool_call_arr = []
+                self.streamed_args_for_tool = []
+
+            args_json = json.dumps(args_obj, ensure_ascii=False)
+            tool_id = self.current_tool_id
+
+            # Ensure arrays are large enough.
+            while len(self.prev_tool_call_arr) <= tool_id:
+                self.prev_tool_call_arr.append({})
+            while len(self.streamed_args_for_tool) <= tool_id:
+                self.streamed_args_for_tool.append("")
+
+            self.prev_tool_call_arr[tool_id] = {
+                "name": func_name,
+                "arguments": args_obj,
+            }
+            self.streamed_args_for_tool[tool_id] = args_json
+
+            calls: List[ToolCallItem] = [
+                ToolCallItem(tool_index=tool_id, name=func_name, parameters=""),
+                ToolCallItem(tool_index=tool_id, name=None, parameters=args_json),
+            ]
+
+            # Consume parsed content from buffer.
+            self._buffer = current_text[consumed:]
+            self.current_tool_id += 1
+            self.current_tool_name_sent = False
+            return StreamingParseResult(normal_text="", calls=calls)
+
+        # Canonical format delegates to the BaseFormatDetector JSON streaming logic.
+        if self.bot_token in current_text:
+            return super().parse_streaming_increment(new_text="", tools=tools)
+
+        # Otherwise, keep buffering.
+        return StreamingParseResult()
+
+    def _try_parse_compact_args_format(
+        self, text: str
+    ) -> Optional[Tuple[str, Any, int]]:
+        """
+        Parse the compact tool call format:
+            `[TOOL_CALLS]tool_name[ARGS]{...}`
+
+        Tolerates common streaming variants where delimiters are missing:
+            `[TOOL_CALLStool_name[ARGS{...}`
+
+        Returns:
+            (tool_name, arguments_obj, consumed_end_index) if a complete JSON arguments
+            payload is present; otherwise None.
+        """
+        start = text.find(self._tool_calls_marker)
+        if start == -1:
+            return None
+
+        i = start + len(self._tool_calls_marker)  # position after "[TOOL_CALLS"
+        if i < len(text) and text[i] == "]":
+            i += 1
+        while i < len(text) and text[i].isspace():
+            i += 1
+
+        args_marker = "[ARGS"
+        args_pos = text.find(args_marker, i)
+        if args_pos == -1:
+            return None
+
+        func_name = text[i:args_pos].strip()
+        if not func_name:
+            return None
+
+        j = args_pos + len(args_marker)
+        if j < len(text) and text[j] == "]":
+            j += 1
+        while j < len(text) and text[j].isspace():
+            j += 1
+
+        if j >= len(text) or text[j] not in "{[":
+            return None
+
+        json_str, end_idx = self._extract_json_value(text, j)
+        if not json_str:
+            return None
+        if not _is_complete_json(json_str):
+            return None
+
+        try:
+            args_obj = json.loads(json_str)
+        except json.JSONDecodeError:
+            return None
+
+        return func_name, args_obj, end_idx
+
+    def _extract_json_value(
+        self, text: str, json_start: int
+    ) -> Tuple[Optional[str], int]:
+        """
+        Extract a JSON value (object or array) starting at json_start using bracket counting,
+        robust to nested braces/brackets inside strings.
+
+        Returns:
+            (json_str_or_None, end_index_exclusive)
+        """
+        if json_start >= len(text) or text[json_start] not in "{[":
+            return None, json_start
+
+        opening = text[json_start]
+        closing = "}" if opening == "{" else "]"
+        depth = 0
+        in_string = False
+        escape_next = False
+
+        for k in range(json_start, len(text)):
+            ch = text[k]
+            if escape_next:
+                escape_next = False
+                continue
+            if ch == "\\":
+                escape_next = True
+                continue
+            if ch == '"' and not escape_next:
+                in_string = not in_string
+                continue
+            if in_string:
+                continue
+            if ch == opening:
+                depth += 1
+            elif ch == closing:
+                depth -= 1
+                if depth == 0:
+                    return text[json_start : k + 1], k + 1
+
+        return None, json_start
+
+    def _extract_json_array(self, text: str) -> str:
+        """
+        Extract the JSON array part using bracket counting to handle nested brackets.
+
+        :param text: The complete text containing [TOOL_CALLS] [...]
+        :return: The JSON array string or None if not found
+        """
+        start_idx = text.find(self.bot_token)
+        if start_idx == -1:
+            return None
+
+        # Start from the opening bracket after [TOOL_CALLS]
+        json_start = (
+            start_idx + len(self.bot_token) - 1
+        )  # -1 to include the opening bracket
+        bracket_count = 0
+        in_string = False
+        escape_next = False
+
+        for i in range(json_start, len(text)):
+            char = text[i]
+
+            if escape_next:
+                escape_next = False
+                continue
+
+            if char == "\\":
+                escape_next = True
+                continue
+
+            if char == '"' and not escape_next:
+                in_string = not in_string
+                continue
+
+            if not in_string:
+                if char == "[":
+                    bracket_count += 1
+                elif char == "]":
+                    bracket_count -= 1
+                    if bracket_count == 0:
+                        return text[json_start : i + 1]
+
+        return None
+
+    def structure_info(self) -> _GetInfoFunc:
+        return lambda name: StructureInfo(
+            begin='[TOOL_CALLS] [{"name":"' + name + '", "arguments":',
+            end="}]",
+            trigger="[TOOL_CALLS]",
+        )
diff --git a/sglang/python/sglang/srt/function_call/pythonic_detector.py b/sglang/python/sglang/srt/function_call/pythonic_detector.py
new file mode 100644
index 0000000000000000000000000000000000000000..928a766ec53a89771132461ea3db4d9660e97345
--- /dev/null
+++ b/sglang/python/sglang/srt/function_call/pythonic_detector.py
@@ -0,0 +1,224 @@
+import ast
+import json
+import logging
+import re
+from typing import List
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.environ import envs
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    ToolCallItem,
+    _GetInfoFunc,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class PythonicDetector(BaseFormatDetector):
+    """
+    Detector for Llama-4 models with Pythonic tool call format.
+
+    The Pythonic format uses Python function call syntax within square brackets,
+    with arguments as Python literals rather than JSON.
+
+    Format Structure:
+    ```
+    [tool1(arg1=val1, arg2=val2), tool2(arg1=val3)]
+    ```
+
+    Reference: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct?chat_template=default
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.tool_call_regex = re.compile(
+            r"\[([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s)?\),\s*)*([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s*)?\)\s*)+\]",
+            re.DOTALL,
+        )
+
+    @staticmethod
+    def _text_strip(text: str) -> str:
+        # Llama 4 model sometime will output <|python_start|> and <|python_end|> tokens
+        # remove those tokens
+        text = text.replace("<|python_start|>", "")
+        text = text.replace("<|python_end|>", "")
+        return text
+
+    def has_tool_call(self, text: str) -> bool:
+        return bool(self.tool_call_regex.search(self._text_strip(text.strip())))
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        # Try parsing the text as a Python list of function calls
+        text = text.strip()
+
+        # Remove unexpected <|python_start|> and <|python_end|> for llama4
+        text = self._text_strip(text)
+
+        match = self.tool_call_regex.search(text)
+        if match is None:
+            return StreamingParseResult(normal_text=text, calls=[])
+
+        # Extract the tool call part and any text before/after it
+        tool_call_start = match.start()
+        tool_call_end = match.end()
+
+        normal_text_before = text[:tool_call_start] if tool_call_start > 0 else ""
+        tool_call_text = text[tool_call_start:tool_call_end]
+        normal_text_after = text[tool_call_end:] if tool_call_end < len(text) else ""
+
+        # Combine normal text
+        normal_text = normal_text_before + normal_text_after
+
+        try:
+            module = ast.parse(tool_call_text)
+            parsed = getattr(module.body[0], "value", None)
+            if not (
+                isinstance(parsed, ast.List)
+                and all(isinstance(e, ast.Call) for e in parsed.elts)
+            ):
+                return StreamingParseResult(normal_text=normal_text, calls=[])
+
+            calls = []
+            tool_indices = self._get_tool_indices(tools)
+            for call_index, call in enumerate(parsed.elts):
+                if not isinstance(call.func, ast.Name):
+                    continue
+                function_name = call.func.id
+                # Validate that the function exists in the tools
+                if function_name not in tool_indices:
+                    logger.warning(
+                        f"Model attempted to call undefined function: {function_name}"
+                    )
+                    if not envs.SGLANG_FORWARD_UNKNOWN_TOOLS.get():
+                        continue  # Skip unknown tools (default legacy behavior)
+
+                arguments = {}
+                for keyword in call.keywords:
+                    arguments[keyword.arg] = self._get_parameter_value(keyword.value)
+                calls.append(
+                    ToolCallItem(
+                        tool_index=call_index,  # Use the call index in the response, not tool position
+                        name=function_name,
+                        parameters=json.dumps(arguments, ensure_ascii=False),
+                    )
+                )
+
+            return StreamingParseResult(normal_text=normal_text, calls=calls)
+        except Exception:
+            logger.exception("Error in pythonic tool call parsing.")
+            return StreamingParseResult(normal_text=normal_text, calls=[])
+
+    def _find_matching_bracket(self, buffer: str, start: int) -> int:
+        """
+        Find the matching closing bracket for the opening bracket at start position.
+        Properly handles nested brackets.
+
+        Args:
+            buffer: The text buffer to search in
+            start: Position of the opening bracket '['
+
+        Returns:
+            Position of the matching closing bracket ']', or -1 if not found
+        """
+        bracket_count = 0
+        for i in range(start, len(buffer)):
+            if buffer[i] == "[":
+                bracket_count += 1
+            elif buffer[i] == "]":
+                bracket_count -= 1
+                if bracket_count == 0:
+                    return i
+        return -1  # No matching bracket found
+
+    def _strip_and_split_buffer(self, buffer: str) -> tuple[str, str]:
+        """
+        Strip special tokens from buffer and split into safe_text and held_back_text.
+
+        Returns:
+            tuple of (safe_text_to_output, text_to_hold_in_buffer)
+        """
+        # Check if original buffer ends with a partial token at the end
+        special_tokens = ["<|python_start|>", "<|python_end|>"]
+
+        for token in special_tokens:
+            partial_length = self._ends_with_partial_token(buffer, token)
+            if partial_length > 0:
+                # Split buffer: safe part + held back partial token
+                safe_text = buffer[:-partial_length]
+                held_back = buffer[-partial_length:]
+                # Strip complete special tokens from safe part only
+                safe_text = self._text_strip(safe_text)
+                return safe_text, held_back
+
+        # No partial tokens found, strip complete tokens from entire buffer
+        safe_text = self._text_strip(buffer)
+        return safe_text, ""
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming incremental parsing for pythonic tool calls.
+        Buffers input until a complete pythonic tool call (from [ to ]) is found,
+        then parses and emits any detected calls.
+        """
+        self._buffer += new_text
+
+        # Strip special tokens from entire buffer and handle partial tokens
+        stripped_buffer, held_back = self._strip_and_split_buffer(self._buffer)
+
+        start = stripped_buffer.find("[")
+
+        if start == -1:
+            # No tool call bracket found
+            self._buffer = held_back
+            return StreamingParseResult(normal_text=stripped_buffer)
+
+        normal_text = stripped_buffer[:start] if start > 0 else ""
+
+        end = self._find_matching_bracket(stripped_buffer, start)
+        if end != -1:
+            # Found complete tool call
+            call_text = stripped_buffer[start : end + 1]
+            result = self.detect_and_parse(call_text, tools)
+
+            # Update buffer with remaining text after tool call plus any held back text
+            remaining_text = stripped_buffer[end + 1 :] + held_back
+            self._buffer = remaining_text
+
+            # If we had normal text before the tool call, add it to the result
+            if normal_text:
+                result.normal_text = normal_text + (result.normal_text or "")
+
+            return result
+
+        # We have an opening bracket but no closing bracket yet
+        # Put back everything from the bracket onwards plus held back text
+        self._buffer = stripped_buffer[start:] + held_back
+
+        if normal_text:
+            return StreamingParseResult(normal_text=normal_text)
+
+        # Otherwise, we're still accumulating a potential tool call
+        return StreamingParseResult(normal_text="")
+
+    def _get_parameter_value(self, val):
+        if isinstance(val, ast.Constant):
+            return val.value
+        elif isinstance(val, ast.Dict):
+            return {
+                k.value: self._get_parameter_value(v)
+                for k, v in zip(val.keys, val.values)
+            }
+        elif isinstance(val, ast.List):
+            return [self._get_parameter_value(v) for v in val.elts]
+        else:
+            raise ValueError("Tool call arguments must be literals")
+
+    def supports_structural_tag(self) -> bool:
+        return False
+
+    def structure_info(self) -> _GetInfoFunc:
+        raise NotImplementedError
diff --git a/sglang/python/sglang/srt/function_call/qwen25_detector.py b/sglang/python/sglang/srt/function_call/qwen25_detector.py
new file mode 100644
index 0000000000000000000000000000000000000000..33ae13cddb8d7c17926666aa5f14512ae8f1dae3
--- /dev/null
+++ b/sglang/python/sglang/srt/function_call/qwen25_detector.py
@@ -0,0 +1,120 @@
+import json
+import logging
+import re
+from typing import List
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    StructureInfo,
+    _GetInfoFunc,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class Qwen25Detector(BaseFormatDetector):
+    """
+    Detector for Qwen 2.5 and Qwen 3 model function call format.
+
+    Format Structure:
+    ```
+    <tool_call>\n{"name":"func1", "arguments":{...}}\n</tool_call>\n<tool_call>\n{"name":"func2", "arguments":{...}}\n</tool_call>
+    ```
+
+    Key Components:
+    - Tool Call Tags: `<tool_call>` and `</tool_call>` wrap each individual call
+    - Function Call Object: JSON object with "name" and "arguments" fields
+
+    Reference: https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct?chat_template=default
+    """
+
+    def __init__(self):
+        """
+        Initializes the detector with necessary state variables.
+        """
+        super().__init__()
+        self.bot_token = "<tool_call>\n"
+        self.eot_token = "\n</tool_call>"
+        self.tool_call_separator = "\n"
+        self._normal_text_buffer = ""  # Buffer for handling partial end tokens
+
+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a Qwen 2.5 format tool call."""
+        return self.bot_token in text
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """
+        One-time parsing: Detects and parses tool calls in the provided text.
+
+        :param text: The complete text to parse.
+        :param tools: List of available tools.
+        :return: ParseResult indicating success or failure, consumed text, leftover text, and parsed calls.
+        """
+        idx = text.find(self.bot_token)
+        normal_text = text[:idx].strip() if idx != -1 else text
+        if self.bot_token not in text:
+            return StreamingParseResult(normal_text=normal_text, calls=[])
+
+        # Find all <tool_call>\n...\n</tool_call> blocks
+        pattern = rf"{re.escape(self.bot_token)}(.*?){re.escape(self.eot_token)}"
+        match_result_list = re.findall(pattern, text, re.DOTALL)
+        calls = []
+        for match_result in match_result_list:
+            try:
+                parsed_call = json.loads(match_result.strip())
+                calls.extend(self.parse_base_json(parsed_call, tools))
+            except json.JSONDecodeError as e:
+                logger.warning(
+                    f"Failed to parse JSON part: {match_result}, JSON parse error: {str(e)}"
+                )
+                continue
+        return StreamingParseResult(normal_text=normal_text, calls=calls)
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming incremental parsing for Qwen 2.5 tool calls.
+        Uses base class implementation with buffering to handle partial end tokens.
+        """
+        result = super().parse_streaming_increment(new_text, tools)
+
+        # Handle partial end tokens that are streamed character by character
+        if result.normal_text:
+            self._normal_text_buffer += result.normal_text
+
+            # Check if buffer contains complete end token (without leading newline)
+            end_token_without_newline = self.eot_token[1:]  # "</tool_call>"
+            if end_token_without_newline in self._normal_text_buffer:
+                cleaned_text = self._normal_text_buffer.replace(
+                    end_token_without_newline, ""
+                )
+                self._normal_text_buffer = ""
+                result.normal_text = cleaned_text
+            else:
+                # Check if buffer might contain partial end token at the end
+                partial_match_len = self._ends_with_partial_token(
+                    self._normal_text_buffer, end_token_without_newline
+                )
+
+                if partial_match_len:
+                    # Keep potential partial match in buffer, return the rest
+                    result.normal_text = self._normal_text_buffer[:-partial_match_len]
+                    self._normal_text_buffer = self._normal_text_buffer[
+                        -partial_match_len:
+                    ]
+                else:
+                    # No partial match, return all buffered text
+                    result.normal_text = self._normal_text_buffer
+                    self._normal_text_buffer = ""
+
+        return result
+
+    def structure_info(self) -> _GetInfoFunc:
+        return lambda name: StructureInfo(
+            begin='<tool_call>\n{"name":"' + name + '", "arguments":',
+            end="}\n</tool_call>",
+            trigger="<tool_call>",
+        )
diff --git a/sglang/python/sglang/srt/function_call/qwen3_coder_detector.py b/sglang/python/sglang/srt/function_call/qwen3_coder_detector.py
new file mode 100644
index 0000000000000000000000000000000000000000..9dd77903d429ba0e6a32cd6df5f916b793cacec6
--- /dev/null
+++ b/sglang/python/sglang/srt/function_call/qwen3_coder_detector.py
@@ -0,0 +1,474 @@
+import ast
+import json
+import logging
+import re
+from typing import Any, List, Optional
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    ToolCallItem,
+    _GetInfoFunc,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class Qwen3CoderDetector(BaseFormatDetector):
+    def __init__(self):
+        super().__init__()
+
+        # Sentinel tokens
+        self.tool_call_start_token: str = "<tool_call>"
+        self.tool_call_end_token: str = "</tool_call>"
+        self.tool_call_prefix: str = "<function="
+        self.function_end_token: str = "</function>"
+        self.parameter_prefix: str = "<parameter="
+        self.parameter_end_token: str = "</parameter>"
+
+        # Regex for non-streaming fallback
+        self.tool_call_regex = re.compile(r"<tool_call>(.*?)</tool_call>", re.DOTALL)
+        self.tool_call_function_regex = re.compile(
+            r"<function=(.*?)</function>|<function=(.*)$", re.DOTALL
+        )
+        self.tool_call_parameter_regex = re.compile(
+            r"<parameter=(.*?)(?:</parameter>|(?=<parameter=)|(?=</function>)|$)",
+            re.DOTALL,
+        )
+
+        # Streaming State
+        # Base class already initializes _buffer, we just use it directly
+        # No need to check with hasattr - we control the lifecycle through inheritance
+
+        # Index pointing to the next character to be processed in buffer
+        self.parsed_pos: int = 0
+        # Parameter count inside the current tool being processed, used to determine whether to add comma
+        self.current_tool_param_count: int = 0
+        # Flag indicating whether current tool has already sent '{'
+        self.json_started: bool = False
+
+        # [FIX] New state flag: mark whether inside tool_call structure block
+        self.is_inside_tool_call: bool = False
+
+        # Initialize attributes that were missing in the original PR
+        self.current_func_name: Optional[str] = None
+
+    def has_tool_call(self, text: str) -> bool:
+        return self.tool_call_start_token in text
+
+    def _get_arguments_config(
+        self, func_name: str, tools: Optional[list[Tool]]
+    ) -> dict:
+        """Extract argument configuration for a function."""
+        if tools is None:
+            return {}
+        for config in tools:
+            try:
+                config_type = config.type
+                config_function = config.function
+                config_function_name = config_function.name
+            except AttributeError:
+                continue
+
+            if config_type == "function" and config_function_name == func_name:
+                try:
+                    params = config_function.parameters
+                except AttributeError:
+                    return {}
+
+                if isinstance(params, dict) and "properties" in params:
+                    return params["properties"]
+                elif isinstance(params, dict):
+                    return params
+                else:
+                    return {}
+        logger.warning(f"Tool '{func_name}' is not defined in the tools list.")
+        return {}
+
+    def _convert_param_value(
+        self, param_value: str, param_name: str, param_config: dict, func_name: str
+    ) -> Any:
+        """Convert parameter value based on its type in the schema."""
+        # Handle null value for any type
+        if param_value.lower() == "null":
+            return None
+
+        if param_name not in param_config:
+            if param_config != {}:
+                logger.warning(
+                    f"Parsed parameter '{param_name}' is not defined in the tool "
+                    f"parameters for tool '{func_name}', directly returning the string value."
+                )
+            return param_value
+
+        if (
+            isinstance(param_config[param_name], dict)
+            and "type" in param_config[param_name]
+        ):
+            param_type = str(param_config[param_name]["type"]).strip().lower()
+        else:
+            param_type = "string"
+        if param_type in ["string", "str", "text", "varchar", "char", "enum"]:
+            return param_value
+        elif (
+            param_type.startswith("int")
+            or param_type.startswith("uint")
+            or param_type.startswith("long")
+            or param_type.startswith("short")
+            or param_type.startswith("unsigned")
+        ):
+            try:
+                param_value = int(param_value)
+            except Exception:
+                logger.warning(
+                    f"Parsed value '{param_value}' of parameter '{param_name}' is not an integer in tool "
+                    f"'{func_name}', degenerating to string."
+                )
+            return param_value
+        elif param_type.startswith("num") or param_type.startswith("float"):
+            try:
+                maybe_convert = (
+                    False if "." in param_value or "e" in param_value.lower() else True
+                )
+                param_value: float = float(param_value)
+                if maybe_convert and param_value.is_integer():
+                    param_value = int(param_value)
+            except Exception:
+                logger.warning(
+                    f"Parsed value '{param_value}' of parameter '{param_name}' is not a float in tool "
+                    f"'{func_name}', degenerating to string."
+                )
+            return param_value
+        elif param_type in ["boolean", "bool", "binary"]:
+            param_value = param_value.lower()
+            if param_value not in ["true", "false"]:
+                logger.warning(
+                    f"Parsed value '{param_value}' of parameter '{param_name}' is not a boolean (`true` of `false`) in tool '{func_name}', degenerating to false."
+                )
+            return param_value == "true"
+        else:
+            if (
+                param_type in ["object", "array", "arr"]
+                or param_type.startswith("dict")
+                or param_type.startswith("list")
+            ):
+                try:
+                    param_value = json.loads(param_value)
+                    return param_value
+                except Exception:
+                    logger.warning(
+                        f"Parsed value '{param_value}' of parameter '{param_name}' cannot be parsed with json.loads in tool "
+                        f"'{func_name}', will try other methods to parse it."
+                    )
+            try:
+                param_value = ast.literal_eval(param_value)  # safer
+            except Exception:
+                logger.warning(
+                    f"Parsed value '{param_value}' of parameter '{param_name}' cannot be converted via Python `ast.literal_eval()` in tool '{func_name}', degenerating to string."
+                )
+            return param_value
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """One-shot parsing for non-streaming scenarios."""
+        if self.tool_call_start_token not in text:
+            return StreamingParseResult(normal_text=text)
+
+        calls = []
+        try:
+            # Simple cleanup of the text to find tool calls
+            # Note: This is a simplified regex approach consistent with vLLM
+            raw_tool_calls = self.tool_call_regex.findall(text)
+            if not raw_tool_calls:
+                # Fallback: maybe the whole text is inside the tag or tags are stripped
+                if self.tool_call_prefix in text:
+                    raw_tool_calls = [text]
+
+            tool_idx = 0
+            for tool_content in raw_tool_calls:
+                # Find function calls
+                funcs = self.tool_call_function_regex.findall(tool_content)
+                for func_match in funcs:
+                    func_body = func_match[0] or func_match[1]
+                    if ">" not in func_body:
+                        continue
+
+                    name_end = func_body.index(">")
+                    func_name = func_body[:name_end]
+                    params_str = func_body[name_end + 1 :]
+
+                    param_config = self._get_arguments_config(func_name, tools)
+                    parsed_params = {}
+
+                    for p_match in self.tool_call_parameter_regex.findall(params_str):
+                        if ">" not in p_match:
+                            continue
+                        p_idx = p_match.index(">")
+                        p_name = p_match[:p_idx]
+                        p_val = p_match[p_idx + 1 :]
+                        # Remove prefixing and trailing \n
+                        if p_val.startswith("\n"):
+                            p_val = p_val[1:]
+                        if p_val.endswith("\n"):
+                            p_val = p_val[:-1]
+
+                        parsed_params[p_name] = self._convert_param_value(
+                            p_val, p_name, param_config, func_name
+                        )
+
+                    calls.append(
+                        ToolCallItem(
+                            tool_index=tool_idx,
+                            name=func_name,
+                            parameters=json.dumps(parsed_params, ensure_ascii=False),
+                        )
+                    )
+                    tool_idx += 1
+
+            # Determine normal text (text before the first tool call)
+            start_idx = text.find(self.tool_call_start_token)
+            if start_idx == -1:
+                start_idx = text.find(self.tool_call_prefix)
+            normal_text = text[:start_idx] if start_idx > 0 else ""
+
+            return StreamingParseResult(normal_text=normal_text, calls=calls)
+
+        except Exception as e:
+            logger.error(f"Error in detect_and_parse: {e}")
+            return StreamingParseResult(normal_text=text)
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Robust cursor-based streaming parser.
+        """
+        self._buffer += new_text
+
+        # Guard against empty buffer
+        if not self._buffer:
+            return StreamingParseResult()
+
+        calls = []
+        normal_text_chunks = []
+
+        while True:
+            # Working text slice
+            current_slice = self._buffer[self.parsed_pos :]
+
+            # Optimization: If almost empty, wait for more
+            if not current_slice:
+                break
+
+            # -------------------------------------------------------
+            # 1. Priority detection: check if it's the start of Tool Call
+            # -------------------------------------------------------
+            if current_slice.startswith(self.tool_call_start_token):
+                self.parsed_pos += len(self.tool_call_start_token)
+                self.is_inside_tool_call = True
+                continue
+
+            # -------------------------------------------------------
+            # 2. Function Name: <function=name>
+            # -------------------------------------------------------
+            if current_slice.startswith(self.tool_call_prefix):
+                end_angle = current_slice.find(">")
+                if end_angle != -1:
+                    func_name = current_slice[len(self.tool_call_prefix) : end_angle]
+
+                    self.current_tool_id += 1
+                    self.current_tool_name_sent = True
+                    self.current_tool_param_count = 0
+                    self.json_started = False
+                    self.current_func_name = func_name
+
+                    calls.append(
+                        ToolCallItem(
+                            tool_index=self.current_tool_id,
+                            name=func_name,
+                            parameters="",
+                        )
+                    )
+
+                    self.parsed_pos += end_angle + 1
+                    continue
+                else:
+                    # Incomplete tag
+                    break
+
+            # -------------------------------------------------------
+            # 3. Parameter: <parameter=name>value...
+            # -------------------------------------------------------
+            if current_slice.startswith(self.parameter_prefix):
+                name_end = current_slice.find(">")
+                if name_end != -1:
+                    value_start_idx = name_end + 1
+                    rest_of_slice = current_slice[value_start_idx:]
+
+                    # A parameter can end in multiple ways:
+                    # 1. [Normal] Encounter </parameter>
+                    # 2. [Abnormal] Encounter next <parameter=
+                    # 3. [Abnormal] Encounter </function>
+                    # So we need to find the smallest one as the parameter end position.
+                    cand_end_param = rest_of_slice.find(self.parameter_end_token)
+                    cand_next_param = rest_of_slice.find(self.parameter_prefix)
+                    cand_end_func = rest_of_slice.find(self.function_end_token)
+
+                    candidates = []
+                    if cand_end_param != -1:
+                        candidates.append(
+                            (cand_end_param, len(self.parameter_end_token))
+                        )
+                    if cand_next_param != -1:
+                        candidates.append((cand_next_param, 0))
+                    if cand_end_func != -1:
+                        candidates.append((cand_end_func, 0))
+
+                    if candidates:
+                        best_cand = min(candidates, key=lambda x: x[0])
+                        end_pos = best_cand[0]
+                        end_token_len = best_cand[1]
+
+                        param_name = current_slice[
+                            len(self.parameter_prefix) : name_end
+                        ]
+                        raw_value = rest_of_slice[:end_pos]
+
+                        # Cleanup value
+                        if raw_value.startswith("\n"):
+                            raw_value = raw_value[1:]
+                        if raw_value.endswith("\n"):
+                            raw_value = raw_value[:-1]
+
+                        # JSON Construction
+                        if not self.json_started:
+                            calls.append(
+                                ToolCallItem(
+                                    tool_index=self.current_tool_id, parameters="{"
+                                )
+                            )
+                            self.json_started = True
+
+                        param_config = self._get_arguments_config(
+                            self.current_func_name, tools
+                        )
+                        converted_val = self._convert_param_value(
+                            raw_value, param_name, param_config, self.current_func_name
+                        )
+
+                        # Construct JSON fragment: "key": value
+                        # Note: We must be careful with json.dumps to ensure valid JSON streaming
+                        json_key_val = f"{json.dumps(param_name)}: {json.dumps(converted_val, ensure_ascii=False)}"
+
+                        if self.current_tool_param_count > 0:
+                            fragment = f", {json_key_val}"
+                        else:
+                            fragment = json_key_val
+
+                        calls.append(
+                            ToolCallItem(
+                                tool_index=self.current_tool_id, parameters=fragment
+                            )
+                        )
+                        self.current_tool_param_count += 1
+
+                        # Advance cursor
+                        total_len = (name_end + 1) + end_pos + end_token_len
+                        self.parsed_pos += total_len
+                        continue
+
+                # Incomplete parameter tag or value
+                break
+
+            # -------------------------------------------------------
+            # 4. Function End: </function>
+            # -------------------------------------------------------
+            if current_slice.startswith(self.function_end_token):
+                if not self.json_started:
+                    calls.append(
+                        ToolCallItem(tool_index=self.current_tool_id, parameters="{")
+                    )
+                    self.json_started = True
+
+                calls.append(
+                    ToolCallItem(tool_index=self.current_tool_id, parameters="}")
+                )
+                self.parsed_pos += len(self.function_end_token)
+                self.current_func_name = None
+                continue
+
+            # -------------------------------------------------------
+            # 5. Tool Call End: </tool_call>
+            # -------------------------------------------------------
+            if current_slice.startswith(self.tool_call_end_token):
+                self.parsed_pos += len(self.tool_call_end_token)
+                self.is_inside_tool_call = False  # [FIX] Exit tool call region
+                continue
+
+            # -------------------------------------------------------
+            # 6. Handling content / whitespace / normal text
+            # -------------------------------------------------------
+            # If current position is not the start of a tag (i.e., doesn't start with <), it might be plain text,
+            # or a newline between two tags.
+            # But we need to be careful not to output truncated tags like "<fun" as text.
+
+            next_open_angle = current_slice.find("<")
+
+            if next_open_angle == -1:
+                # This entire segment is plain text
+                if not self.is_inside_tool_call:
+                    normal_text_chunks.append(current_slice)
+                # [FIX] If inside tool call, discard this text (usually \n), don't append
+                self.parsed_pos += len(current_slice)
+                continue
+
+            elif next_open_angle == 0:
+                # Looks like a Tag, but doesn't match any known Tag above
+
+                possible_tags = [
+                    self.tool_call_start_token,
+                    self.tool_call_end_token,
+                    self.tool_call_prefix,
+                    self.function_end_token,
+                    self.parameter_prefix,
+                    self.parameter_end_token,
+                ]
+
+                is_potential_tag = False
+                for tag in possible_tags:
+                    if tag.startswith(current_slice):
+                        is_potential_tag = True
+                        break
+
+                if is_potential_tag:
+                    break  # Wait for more
+                else:
+                    # Just a plain '<' symbol
+                    if not self.is_inside_tool_call:
+                        normal_text_chunks.append("<")
+                    self.parsed_pos += 1
+                    continue
+
+            else:
+                # '<' is in the middle
+                text_segment = current_slice[:next_open_angle]
+                if not self.is_inside_tool_call:
+                    normal_text_chunks.append(text_segment)
+                # [FIX] If inside tool call, discard whitespace/text before Tag
+                self.parsed_pos += next_open_angle
+                continue
+
+        # Memory Cleanup: Slice the buffer
+        # Keep unparsed part, discard parsed part
+        if self.parsed_pos > 0:
+            self._buffer = self._buffer[self.parsed_pos :]
+            self.parsed_pos = 0
+
+        normal_text = "".join(normal_text_chunks) if normal_text_chunks else ""
+        return StreamingParseResult(calls=calls, normal_text=normal_text)
+
+    def supports_structural_tag(self) -> bool:
+        return False
+
+    def structure_info(self) -> _GetInfoFunc:
+        raise NotImplementedError
diff --git a/sglang/python/sglang/srt/function_call/step3_detector.py b/sglang/python/sglang/srt/function_call/step3_detector.py
new file mode 100644
index 0000000000000000000000000000000000000000..3fba98774e4c585eeb9312dcf0c1f8aff87461bd
--- /dev/null
+++ b/sglang/python/sglang/srt/function_call/step3_detector.py
@@ -0,0 +1,407 @@
+import ast
+import json
+import logging
+import re
+from typing import Any, Dict, List
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.base_format_detector import BaseFormatDetector
+from sglang.srt.function_call.core_types import (
+    StreamingParseResult,
+    ToolCallItem,
+    _GetInfoFunc,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def get_argument_type(func_name: str, arg_key: str, defined_tools: List[Tool]) -> str:
+    """Get the expected type for a function argument from tool schema."""
+    name2tool = {tool.function.name: tool for tool in defined_tools}
+    if func_name not in name2tool:
+        return None
+    tool = name2tool[func_name]
+    parameters = tool.function.parameters or {}
+    properties = parameters.get("properties", {})
+    if arg_key not in properties:
+        return None
+    return properties[arg_key].get("type", None)
+
+
+def parse_arguments(value: str) -> tuple[Any, bool]:
+    """Parse a string value to appropriate type. Returns (parsed_value, success)."""
+    try:
+        try:
+            parsed_value = json.loads(value)
+        except:
+            parsed_value = ast.literal_eval(value)
+        return parsed_value, True
+    except:
+        return value, False
+
+
+class Step3Detector(BaseFormatDetector):
+    """
+    Detector for Step3 model function call format.
+
+    The Step3 format uses special Unicode tokens to delimit function calls
+    with steptml XML format for invocations.
+
+    Format Structure:
+    ```
+    <｜tool_calls_begin｜>
+    <｜tool_call_begin｜>function<｜tool_sep｜><steptml:invoke name="function_name">
+    <steptml:parameter name="param1">value1</steptml:parameter>
+    <steptml:parameter name="param2">value2</steptml:parameter>
+    </steptml:invoke><｜tool_call_end｜>
+    <｜tool_calls_end｜>
+    ```
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.bot_token = "<｜tool_calls_begin｜>"
+        self.eot_token = "<｜tool_calls_end｜>"
+        self.tool_call_begin = "<｜tool_call_begin｜>"
+        self.tool_call_end = "<｜tool_call_end｜>"
+        self.tool_sep = "<｜tool_sep｜>"
+
+        # Regex for parsing steptml invocations
+        self.invoke_regex = re.compile(
+            r'<steptml:invoke name="([^"]+)">(.+?)</steptml:invoke>', re.DOTALL
+        )
+        self.param_regex = re.compile(
+            r'<steptml:parameter name="([^"]+)">([^<]*)</steptml:parameter>', re.DOTALL
+        )
+
+        # Streaming state variables
+        self._in_tool_block: bool = False
+        self._tool_block_finished: bool = False
+        self._current_function_name: str = ""
+        self._current_parameters: Dict[str, Any] = {}
+        self._in_tool_call: bool = False
+        self._function_name_sent: bool = False
+
+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a Step3 format tool call."""
+        return self.bot_token in text
+
+    def _parse_steptml_invoke(
+        self, text: str, tools: List[Tool] = None
+    ) -> tuple[str, dict]:
+        """Parse steptml invoke format to extract function name and parameters."""
+        invoke_match = self.invoke_regex.search(text)
+        if not invoke_match:
+            return None, {}
+
+        func_name = invoke_match.group(1)
+        params_text = invoke_match.group(2)
+
+        params = {}
+        for param_match in self.param_regex.finditer(params_text):
+            param_name = param_match.group(1)
+            param_value = param_match.group(2).strip()
+
+            # If tools provided, use schema-aware parsing
+            if tools:
+                arg_type = get_argument_type(func_name, param_name, tools)
+                if arg_type and arg_type != "string":
+                    parsed_value, _ = parse_arguments(param_value)
+                    params[param_name] = parsed_value
+                else:
+                    params[param_name] = param_value
+            else:
+                # Fallback to generic parsing if no tools provided
+                parsed_value, _ = parse_arguments(param_value)
+                params[param_name] = parsed_value
+
+        return func_name, params
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """
+        One-time parsing: Detects and parses tool calls in the provided text.
+        """
+        if self.bot_token not in text:
+            return StreamingParseResult(normal_text=text, calls=[])
+
+        try:
+            pre_text, rest = text.split(self.bot_token, 1)
+
+            # If no end token, return everything as normal text
+            if self.eot_token not in rest:
+                return StreamingParseResult(normal_text=text, calls=[])
+
+            tool_section, post_text = rest.split(self.eot_token, 1)
+
+            # Find all individual tool calls using regex
+            calls = []
+            tool_call_pattern = (
+                f"{re.escape(self.tool_call_begin)}(.*?){re.escape(self.tool_call_end)}"
+            )
+
+            for match in re.finditer(tool_call_pattern, tool_section, re.DOTALL):
+                call_content = match.group(1)
+
+                # Check if it's a function call
+                if self.tool_sep not in call_content:
+                    continue
+
+                type_part, invoke_part = call_content.split(self.tool_sep, 1)
+                if type_part.strip() != "function":
+                    continue
+
+                func_name, params = self._parse_steptml_invoke(invoke_part, tools)
+                if func_name:
+                    # Use parse_base_json to create the ToolCallItem
+                    action = {"name": func_name, "arguments": params}
+                    calls.extend(self.parse_base_json(action, tools))
+
+            # Combine pre and post text
+            normal_text = pre_text + post_text
+
+            return StreamingParseResult(normal_text=normal_text, calls=calls)
+
+        except Exception as e:
+            logger.error(f"Error in detect_and_parse: {e}")
+            # Return the original text if parsing fails
+            return StreamingParseResult(normal_text=text)
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming incremental parsing for Step3 format.
+        """
+        self._buffer += new_text
+
+        # Build tool indices for validation
+        if not hasattr(self, "_tool_indices"):
+            self._tool_indices = self._get_tool_indices(tools)
+
+        # If we've finished the tool block, everything is normal text
+        if self._tool_block_finished:
+            normal_text = self._buffer
+            self._buffer = ""
+            return StreamingParseResult(normal_text=normal_text)
+
+        # Check if tool block hasn't started yet
+        if not self._in_tool_block:
+            if self.bot_token in self._buffer:
+                idx = self._buffer.find(self.bot_token)
+                normal_text = self._buffer[:idx]
+                self._buffer = self._buffer[idx + len(self.bot_token) :]
+                self._in_tool_block = True
+                return StreamingParseResult(normal_text=normal_text)
+            else:
+                # Check if we might have a partial bot_token
+                partial_len = self._ends_with_partial_token(
+                    self._buffer, self.bot_token
+                )
+                if partial_len:
+                    return StreamingParseResult()  # Wait for more text
+                else:
+                    normal_text = self._buffer
+                    self._buffer = ""
+                    return StreamingParseResult(normal_text=normal_text)
+
+        # We're inside the tool block
+        calls: List[ToolCallItem] = []
+
+        # Check if tool block is ending
+        if self.eot_token in self._buffer:
+            idx = self._buffer.find(self.eot_token)
+
+            # If we're in the middle of a tool call, we need to handle it
+            if self._in_tool_call:
+                # The buffer before eot_token might contain the end of the current tool call
+                before_eot = self._buffer[:idx]
+                if self.tool_call_end in before_eot:
+                    # Parse this final tool call
+                    result = self._parse_partial_tool_call(tools)
+                    calls.extend(result.calls)
+                else:
+                    # Incomplete tool call - log warning
+                    logger.warning("Tool block ended with incomplete tool call")
+
+            remaining = self._buffer[idx + len(self.eot_token) :]
+            self._buffer = ""
+            self._tool_block_finished = True
+
+            # Reset any partial tool call state
+            self._reset_streaming_state()
+
+            return StreamingParseResult(normal_text=remaining, calls=calls)
+
+        # Check if we're in a tool call or need to start one
+        if not self._in_tool_call:
+            if self.tool_call_begin in self._buffer:
+                idx = self._buffer.find(self.tool_call_begin)
+                # Remove any content before tool call begin (shouldn't happen but be safe)
+                self._buffer = self._buffer[idx + len(self.tool_call_begin) :]
+                self._in_tool_call = True
+                self._function_name_sent = False
+                self._current_function_name = ""
+                self._current_parameters = {}
+                # Fall through to parse the partial tool call
+            else:
+                # Wait for tool call to begin
+                return StreamingParseResult()
+
+        # Parse partial tool call
+        if self._in_tool_call:
+            return self._parse_partial_tool_call(tools)
+
+        return StreamingParseResult()
+
+    def _parse_partial_tool_call(self, tools: List[Tool]) -> StreamingParseResult:
+        """Parse partial tool call for streaming scenarios."""
+        calls = []
+
+        # Check if we have tool_sep (means we're past the type declaration)
+        if self.tool_sep not in self._buffer:
+            return StreamingParseResult(calls=calls)  # Wait for more text
+
+        type_part, invoke_part = self._buffer.split(self.tool_sep, 1)
+        if type_part.strip() != "function":
+            # Invalid tool type, skip this tool call
+            self._reset_streaming_state()
+            return StreamingParseResult(calls=calls)
+
+        # Try to extract function name if not sent yet
+        if not self._function_name_sent:
+            name_match = re.search(r'<steptml:invoke name="([^"]+)">', invoke_part)
+            if name_match:
+                func_name = name_match.group(1)
+
+                # Validate function name
+                if func_name in self._tool_indices:
+                    self._current_function_name = func_name
+                    self._function_name_sent = True
+
+                    # Initialize tool tracking
+                    if self.current_tool_id == -1:
+                        self.current_tool_id = 0
+
+                    # Ensure tracking arrays are large enough
+                    while len(self.prev_tool_call_arr) <= self.current_tool_id:
+                        self.prev_tool_call_arr.append({})
+                    while len(self.streamed_args_for_tool) <= self.current_tool_id:
+                        self.streamed_args_for_tool.append("")
+
+                    # Store tool call info
+                    self.prev_tool_call_arr[self.current_tool_id] = {
+                        "name": func_name,
+                        "arguments": {},
+                    }
+
+                    # Send tool name with empty parameters
+                    calls.append(
+                        ToolCallItem(
+                            tool_index=self.current_tool_id,
+                            name=func_name,
+                            parameters="",
+                        )
+                    )
+                else:
+                    # Invalid function name
+                    logger.warning(f"Invalid function name: {func_name}")
+                    self._reset_streaming_state()
+                    return StreamingParseResult(calls=calls)
+            else:
+                # Function name not complete yet
+                return StreamingParseResult(calls=calls)
+
+        # Parse parameters incrementally
+        if self._function_name_sent:
+            # Extract all complete parameters
+            new_params = {}
+            for param_match in self.param_regex.finditer(invoke_part):
+                param_name = param_match.group(1)
+                param_value = param_match.group(2).strip()
+
+                # Use schema-aware parsing
+                arg_type = get_argument_type(
+                    self._current_function_name, param_name, tools
+                )
+                if arg_type and arg_type != "string":
+                    parsed_value, _ = parse_arguments(param_value)
+                    new_params[param_name] = parsed_value
+                else:
+                    new_params[param_name] = param_value
+
+            # Check if we have new parameters to stream
+            if new_params != self._current_parameters:
+                # Build the JSON content without the closing brace for streaming
+                if not self._current_parameters:
+                    # First parameters - send opening brace and content
+                    params_content = json.dumps(new_params, ensure_ascii=False)
+                    if len(params_content) > 2:  # More than just "{}"
+                        # Send everything except the closing brace
+                        diff = params_content[:-1]
+                    else:
+                        diff = "{"
+                else:
+                    # Subsequent parameters - calculate the incremental diff
+                    old_json = json.dumps(self._current_parameters, ensure_ascii=False)
+                    new_json = json.dumps(new_params, ensure_ascii=False)
+
+                    # Remove closing braces for comparison
+                    old_without_brace = old_json[:-1]
+                    new_without_brace = new_json[:-1]
+
+                    # The new content should extend the old content
+                    if new_without_brace.startswith(old_without_brace):
+                        diff = new_without_brace[len(old_without_brace) :]
+                    else:
+                        # Parameters changed in unexpected way - shouldn't happen in normal streaming
+                        diff = ""
+
+                if diff:
+                    calls.append(
+                        ToolCallItem(
+                            tool_index=self.current_tool_id,
+                            parameters=diff,
+                        )
+                    )
+                    self.streamed_args_for_tool[self.current_tool_id] += diff
+
+                # Update current state
+                self._current_parameters = new_params
+                self.prev_tool_call_arr[self.current_tool_id]["arguments"] = new_params
+
+            # Check if tool call is complete
+            if self.tool_call_end in self._buffer:
+                # Send closing brace if we've sent any parameters
+                if self.streamed_args_for_tool[self.current_tool_id]:
+                    calls.append(
+                        ToolCallItem(
+                            tool_index=self.current_tool_id,
+                            parameters="}",
+                        )
+                    )
+                    self.streamed_args_for_tool[self.current_tool_id] += "}"
+
+                # Find the end position
+                end_idx = self._buffer.find(self.tool_call_end)
+                # Remove the processed tool call from buffer
+                self._buffer = self._buffer[end_idx + len(self.tool_call_end) :]
+
+                # Reset state for next tool call
+                self._reset_streaming_state()
+                self.current_tool_id += 1
+
+        return StreamingParseResult(calls=calls)
+
+    def _reset_streaming_state(self):
+        """Reset streaming state for the next tool call"""
+        self._in_tool_call = False
+        self._function_name_sent = False
+        self._current_function_name = ""
+        self._current_parameters = {}
+
+    def supports_structural_tag(self) -> bool:
+        """Return True if this detector supports structural tag format."""
+        return False
+
+    def structure_info(self) -> _GetInfoFunc:
+        raise NotImplementedError()
diff --git a/sglang/python/sglang/srt/function_call/trinity_detector.py b/sglang/python/sglang/srt/function_call/trinity_detector.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6288d3b31b9cb94d697317935d10880ff60f678
--- /dev/null
+++ b/sglang/python/sglang/srt/function_call/trinity_detector.py
@@ -0,0 +1,43 @@
+import logging
+from typing import List
+
+from sglang.srt.entrypoints.openai.protocol import Tool
+from sglang.srt.function_call.core_types import StreamingParseResult
+from sglang.srt.function_call.qwen25_detector import Qwen25Detector
+
+logger = logging.getLogger(__name__)
+
+
+class TrinityDetector(Qwen25Detector):
+    """
+    Detector for Trinity models using Qwen-style function call format.
+
+    This detector extends Qwen25Detector to handle tool calls that may appear
+    inside <think> sections by stripping the think tags before parsing.
+
+    Reference: https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct?chat_template=default
+    """
+
+    def _strip_think_tags(self, text: str) -> str:
+        """Remove <think> and </think> tags, keeping the content inside."""
+        return text.replace("<think>", "").replace("</think>", "")
+
+    def has_tool_call(self, text: str) -> bool:
+        """Check if the text contains a tool call."""
+        return super().has_tool_call(self._strip_think_tags(text))
+
+    def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult:
+        """
+        One-time parsing: Detects and parses tool calls in the provided text.
+        """
+        return super().detect_and_parse(self._strip_think_tags(text), tools)
+
+    def parse_streaming_increment(
+        self, new_text: str, tools: List[Tool]
+    ) -> StreamingParseResult:
+        """
+        Streaming incremental parsing for tool calls.
+        """
+        return super().parse_streaming_increment(
+            self._strip_think_tags(new_text), tools
+        )
diff --git a/sglang/python/sglang/srt/function_call/utils.py b/sglang/python/sglang/srt/function_call/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..567ca583bb8fa067204c38286e33b680eb5adef3
--- /dev/null
+++ b/sglang/python/sglang/srt/function_call/utils.py
@@ -0,0 +1,246 @@
+from json import JSONDecodeError, JSONDecoder
+from json.decoder import WHITESPACE
+from typing import Any, Dict, List, Literal, Optional, Tuple, Union
+
+import orjson
+import partial_json_parser
+from partial_json_parser.core.options import Allow
+
+from sglang.srt.entrypoints.openai.protocol import Tool, ToolChoice
+
+
+def _find_common_prefix(s1: str, s2: str) -> str:
+    prefix = ""
+    min_length = min(len(s1), len(s2))
+    for i in range(0, min_length):
+        if s1[i] == s2[i]:
+            prefix += s1[i]
+        else:
+            break
+    return prefix
+
+
+def _partial_json_loads(input_str: str, flags: Allow) -> Tuple[Any, int]:
+    """
+    Parse incomplete or partial JSON strings commonly encountered during streaming.
+
+    Args:
+        input_str (str): The potentially incomplete JSON string to parse.
+        flags (Allow): Bitwise flags controlling what types of partial data are allowed.
+            Common flags include:
+            - Allow.STR: Allow partial strings (e.g., '"hello wo' -> 'hello wo')
+            - Allow.OBJ: Allow partial objects (e.g., '{"key":' -> {'key': None})
+            - Allow.ARR: Allow partial arrays (e.g., '[1, 2,' -> [1, 2])
+            - Allow.ALL: Allow all types of partial data
+
+    Returns:
+        Tuple[Any, int]: A tuple containing:
+            - parsed_object: The Python object parsed from the JSON
+            - consumed_length: Number of characters consumed from input_str
+    """
+    try:
+        return (partial_json_parser.loads(input_str, flags), len(input_str))
+    except (JSONDecodeError, IndexError) as e:
+        msg = getattr(e, "msg", str(e))
+        if "Extra data" in msg or "pop from empty list" in msg:
+            start = WHITESPACE.match(input_str, 0).end()
+            obj, end = JSONDecoder().raw_decode(input_str, start)
+            return obj, end
+        raise
+
+
+def _is_complete_json(input_str: str) -> bool:
+    try:
+        orjson.loads(input_str)
+        return True
+    except JSONDecodeError:
+        return False
+
+
+def _get_tool_schema_defs(tools: List[Tool]) -> dict:
+    """
+    Get consolidated $defs from all tools, validating for conflicts.
+
+    Args:
+        tools: List of tools to process
+
+    Returns:
+        Dictionary of consolidated $defs from all tools
+
+    Raises:
+        ValueError: If conflicting $defs are found
+    """
+    all_defs = {}
+    for tool in tools:
+        if tool.function.parameters is None:
+            continue
+        defs = tool.function.parameters.get("$defs", {})
+        for def_name, def_schema in defs.items():
+            if def_name in all_defs and all_defs[def_name] != def_schema:
+                raise ValueError(
+                    f"Tool definition '{def_name}' has "
+                    "multiple schemas, which is not "
+                    "supported."
+                )
+            else:
+                all_defs[def_name] = def_schema
+    return all_defs
+
+
+def _get_tool_schema(tool: Tool) -> dict:
+    return {
+        "properties": {
+            "name": {"type": "string", "enum": [tool.function.name]},
+            "parameters": (
+                tool.function.parameters
+                if tool.function.parameters
+                else {"type": "object", "properties": {}}
+            ),
+        },
+        "required": ["name", "parameters"],
+    }
+
+
+def infer_type_from_json_schema(schema: Dict[str, Any]) -> Optional[str]:
+    """
+    Infer the primary type of a parameter from JSON Schema.
+
+    Supports complex JSON Schema structures including:
+    - Direct type field (including type arrays)
+    - anyOf/oneOf: parameter can be any of multiple types
+    - enum: parameter must be one of enum values
+    - allOf: parameter must satisfy all type definitions
+    - properties: inferred as object type
+    - items: inferred as array type
+
+    Args:
+        schema: JSON Schema definition
+
+    Returns:
+        Inferred type ('string', 'number', 'object', 'array', etc.) or None
+    """
+    if not isinstance(schema, dict):
+        return None
+
+    # Priority 1: Direct type field (including type arrays)
+    if "type" in schema:
+        type_value = schema["type"]
+        if isinstance(type_value, str):
+            return type_value
+        elif isinstance(type_value, list) and type_value:
+            # Handle type arrays: return first non-null type
+            non_null_types = [t for t in type_value if t != "null"]
+            if non_null_types:
+                return non_null_types[0]
+            return "string"  # If only null, default to string
+
+    # Priority 2: Handle anyOf/oneOf
+    if "anyOf" in schema or "oneOf" in schema:
+        schemas = schema.get("anyOf") or schema.get("oneOf")
+        types = []
+
+        if isinstance(schemas, list):
+            for sub_schema in schemas:
+                inferred_type = infer_type_from_json_schema(sub_schema)
+                if inferred_type:
+                    types.append(inferred_type)
+
+            if types:
+                # If all types are the same, return unified type
+                if len(set(types)) == 1:
+                    return types[0]
+                # When types differ, prioritize string (safest)
+                if "string" in types:
+                    return "string"
+                # Otherwise return first type
+                return types[0]
+
+    # Priority 3: Handle enum (infer type from enum values)
+    if "enum" in schema and isinstance(schema["enum"], list):
+        if not schema["enum"]:
+            return "string"
+
+        # Infer type from enum values
+        enum_types = set()
+        for value in schema["enum"]:
+            if value is None:
+                enum_types.add("null")
+            elif isinstance(value, bool):
+                enum_types.add("boolean")
+            elif isinstance(value, int):
+                enum_types.add("integer")
+            elif isinstance(value, float):
+                enum_types.add("number")
+            elif isinstance(value, str):
+                enum_types.add("string")
+            elif isinstance(value, list):
+                enum_types.add("array")
+            elif isinstance(value, dict):
+                enum_types.add("object")
+
+        # If type is uniform, return that type
+        if len(enum_types) == 1:
+            return enum_types.pop()
+        # Mixed types, prioritize string
+        return "string"
+
+    # Priority 4: Handle allOf (must satisfy all types)
+    if "allOf" in schema and isinstance(schema["allOf"], list):
+        schemas = schema["allOf"]
+        for sub_schema in schemas:
+            inferred_type = infer_type_from_json_schema(sub_schema)
+            if inferred_type and inferred_type != "string":
+                return inferred_type
+        return "string"
+
+    # Priority 5: Infer object type
+    if "properties" in schema:
+        return "object"
+
+    # Priority 6: Infer array type
+    if "items" in schema:
+        return "array"
+
+    return None
+
+
+def get_json_schema_constraint(
+    tools: List[Tool], tool_choice: Union[ToolChoice, Literal["required"]]
+) -> Optional[dict]:
+    """
+    Get the JSON schema constraint for the specified tool choice.
+
+    Args:
+        tool_choice: The tool choice specification
+
+    Returns:
+        JSON schema dict, or None if no valid tools found
+    """
+
+    if isinstance(tool_choice, ToolChoice):
+        # For specific function choice, return the user's parameters schema directly
+        fn_name = tool_choice.function.name
+        for tool in tools:
+            if tool.function.name == fn_name:
+                return {
+                    "type": "array",
+                    "minItems": 1,
+                    "maxItems": 1,
+                    "items": _get_tool_schema(tool),
+                }
+        return None
+    elif tool_choice == "required":
+        json_schema = {
+            "type": "array",
+            "minItems": 1,
+            "items": {
+                "type": "object",
+                "anyOf": [_get_tool_schema(tool) for tool in tools],
+            },
+        }
+        json_schema_defs = _get_tool_schema_defs(tools)
+        if json_schema_defs:
+            json_schema["$defs"] = json_schema_defs
+        return json_schema
+
+    return None
diff --git a/sglang/python/sglang/srt/layers/__pycache__/activation.cpython-311.pyc b/sglang/python/sglang/srt/layers/__pycache__/activation.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..67d6cee26588d6cda1a9f4a15f668645a2b2fa79
Binary files /dev/null and b/sglang/python/sglang/srt/layers/__pycache__/activation.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/__pycache__/amx_utils.cpython-311.pyc b/sglang/python/sglang/srt/layers/__pycache__/amx_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0aa2cbd12760ba59d69ddf301112b9a0e90b0318
Binary files /dev/null and b/sglang/python/sglang/srt/layers/__pycache__/amx_utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/__pycache__/communicator.cpython-311.pyc b/sglang/python/sglang/srt/layers/__pycache__/communicator.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..986d92bd7cde13bf344a80c100da84ef1ce21a60
Binary files /dev/null and b/sglang/python/sglang/srt/layers/__pycache__/communicator.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/__pycache__/communicator_nsa_cp.cpython-311.pyc b/sglang/python/sglang/srt/layers/__pycache__/communicator_nsa_cp.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6a86335cfde21218091d1f61bab74a770977e790
Binary files /dev/null and b/sglang/python/sglang/srt/layers/__pycache__/communicator_nsa_cp.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/__pycache__/dp_attention.cpython-311.pyc b/sglang/python/sglang/srt/layers/__pycache__/dp_attention.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d0664562f8c9adbbb8da729e50928d9e6d75264f
Binary files /dev/null and b/sglang/python/sglang/srt/layers/__pycache__/dp_attention.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/__pycache__/elementwise.cpython-311.pyc b/sglang/python/sglang/srt/layers/__pycache__/elementwise.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..21c6ea9bd141d80bda26ba8479e11b7f99b216d8
Binary files /dev/null and b/sglang/python/sglang/srt/layers/__pycache__/elementwise.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/__pycache__/int4fp8_utils.cpython-311.pyc b/sglang/python/sglang/srt/layers/__pycache__/int4fp8_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fa48947fa5c4da11d1215ac177a26956074adb2b
Binary files /dev/null and b/sglang/python/sglang/srt/layers/__pycache__/int4fp8_utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/__pycache__/layernorm.cpython-311.pyc b/sglang/python/sglang/srt/layers/__pycache__/layernorm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..67ec7b92365eb20efa9e7caf59405d73ea7fdda4
Binary files /dev/null and b/sglang/python/sglang/srt/layers/__pycache__/layernorm.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/__pycache__/linear.cpython-311.pyc b/sglang/python/sglang/srt/layers/__pycache__/linear.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2346aa589ccc58562f5451d19db99872cfc90603
Binary files /dev/null and b/sglang/python/sglang/srt/layers/__pycache__/linear.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/__pycache__/logits_processor.cpython-311.pyc b/sglang/python/sglang/srt/layers/__pycache__/logits_processor.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4a6e568efc56b4690b27d8e23fd4120601bf15b3
Binary files /dev/null and b/sglang/python/sglang/srt/layers/__pycache__/logits_processor.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/__pycache__/modelopt_utils.cpython-311.pyc b/sglang/python/sglang/srt/layers/__pycache__/modelopt_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ab16379b4ca798d7ec70e113054c3898f5460228
Binary files /dev/null and b/sglang/python/sglang/srt/layers/__pycache__/modelopt_utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/__pycache__/multimodal.cpython-311.pyc b/sglang/python/sglang/srt/layers/__pycache__/multimodal.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5be4ca619c1828af4f1f991c62dae75fcbebefc0
Binary files /dev/null and b/sglang/python/sglang/srt/layers/__pycache__/multimodal.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/__pycache__/parameter.cpython-311.pyc b/sglang/python/sglang/srt/layers/__pycache__/parameter.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f45c740814629039540657e6053c1e8a717095f3
Binary files /dev/null and b/sglang/python/sglang/srt/layers/__pycache__/parameter.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/__pycache__/pooler.cpython-311.pyc b/sglang/python/sglang/srt/layers/__pycache__/pooler.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d6eaf50017d9d9269e8f7b3025348c7fbf8b5d0e
Binary files /dev/null and b/sglang/python/sglang/srt/layers/__pycache__/pooler.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/__pycache__/radix_attention.cpython-311.pyc b/sglang/python/sglang/srt/layers/__pycache__/radix_attention.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2b3a4aa2489f3f74364fe67d7336496bd2342470
Binary files /dev/null and b/sglang/python/sglang/srt/layers/__pycache__/radix_attention.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/__pycache__/radix_linear_attention.cpython-311.pyc b/sglang/python/sglang/srt/layers/__pycache__/radix_linear_attention.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..77f878008641e98b50e7a4ff29f5416472f59ab0
Binary files /dev/null and b/sglang/python/sglang/srt/layers/__pycache__/radix_linear_attention.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/__pycache__/sampler.cpython-311.pyc b/sglang/python/sglang/srt/layers/__pycache__/sampler.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2da775774f63af8e05e422e98ea84970604a1508
Binary files /dev/null and b/sglang/python/sglang/srt/layers/__pycache__/sampler.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/__pycache__/sparse_pooler.cpython-311.pyc b/sglang/python/sglang/srt/layers/__pycache__/sparse_pooler.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c9d67f702d89427479bbf9bce84e99ab96d1fbd4
Binary files /dev/null and b/sglang/python/sglang/srt/layers/__pycache__/sparse_pooler.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/__pycache__/torchao_utils.cpython-311.pyc b/sglang/python/sglang/srt/layers/__pycache__/torchao_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dfafc7e36d0d864d949d91497711ac45217fb9c8
Binary files /dev/null and b/sglang/python/sglang/srt/layers/__pycache__/torchao_utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/__pycache__/vocab_parallel_embedding.cpython-311.pyc b/sglang/python/sglang/srt/layers/__pycache__/vocab_parallel_embedding.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9cbb4be5e15569f7d2c3b80ea628244c4238b853
Binary files /dev/null and b/sglang/python/sglang/srt/layers/__pycache__/vocab_parallel_embedding.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/activation.py b/sglang/python/sglang/srt/layers/activation.py
new file mode 100644
index 0000000000000000000000000000000000000000..a642fea950df10434b04d71caf3f1f910c360aba
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/activation.py
@@ -0,0 +1,383 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Fused operators for activation layers."""
+
+import logging
+import math
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import (
+    divide,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.environ import envs
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.utils import MultiPlatformOp
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import (
+    cpu_has_amx_support,
+    is_cpu,
+    is_cuda,
+    is_hip,
+    is_npu,
+    is_xpu,
+    set_weight_attrs,
+)
+from sglang.utils import resolve_obj_by_qualname
+
+_is_cuda = is_cuda()
+_is_npu = is_npu()
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
+_is_hip = is_hip()
+_is_xpu = is_xpu()
+
+if _is_cuda or _is_xpu:
+    from sgl_kernel import gelu_and_mul, gelu_tanh_and_mul, silu_and_mul
+elif _is_hip:
+    from sgl_kernel import gelu_and_mul, gelu_quick, gelu_tanh_and_mul, silu_and_mul
+
+if is_npu():
+    import torch_npu
+
+logger = logging.getLogger(__name__)
+
+
+class SiluAndMul(MultiPlatformOp):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if get_global_server_args().rl_on_policy_target is not None:
+            self._forward_method = self.forward_native
+
+    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
+        d = x.shape[-1] // 2
+        return F.silu(x[..., :d]) * x[..., d:]
+
+    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        silu_and_mul(x, out)
+        return out
+
+    def forward_cpu(self, x: torch.Tensor) -> torch.Tensor:
+        if _is_cpu_amx_available:
+            out = torch.ops.sgl_kernel.silu_and_mul_cpu(x)
+            return out
+        else:
+            return self.forward_native(x)
+
+    def forward_npu(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch_npu.npu_swiglu(x)
+        return out
+
+    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        silu_and_mul(x, out)
+        return out
+
+
+class GeluAndMul(MultiPlatformOp):
+    def __init__(self, approximate="tanh"):
+        super().__init__()
+        self.approximate = approximate
+
+    def _forward_impl(self, x: torch.Tensor) -> torch.Tensor:
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        if self.approximate == "tanh":
+            gelu_tanh_and_mul(x, out)
+        elif self.approximate == "none":
+            gelu_and_mul(x, out)
+        else:
+            raise RuntimeError("GeluAndMul only support tanh or none")
+        return out
+
+    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
+        d = x.shape[-1] // 2
+        return F.gelu(x[..., :d], approximate=self.approximate) * x[..., d:]
+
+    def forward_cpu(self, x: torch.Tensor) -> torch.Tensor:
+        if _is_cpu_amx_available and self.approximate == "tanh":
+            return torch.ops.sgl_kernel.gelu_tanh_and_mul_cpu(x)
+        elif _is_cpu_amx_available and self.approximate == "none":
+            return torch.ops.sgl_kernel.gelu_and_mul_cpu(x)
+        else:
+            return self.forward_native(x)
+
+    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
+        return self._forward_impl(x)
+
+    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
+        return self._forward_impl(x)
+
+    def forward_npu(self, x: torch.Tensor) -> torch.Tensor:
+        if envs.SGLANG_NPU_FORWARD_NATIVE_GELUTANH.get():
+            return self.forward_native(x)
+        y_npu, gelu_npu = torch_npu.npu_geglu(
+            x,
+            dim=-1,
+            approximate=1 if self.approximate == "tanh" else 0,
+            activate_left=True,
+        )
+        return y_npu
+
+
+class NewGELU(MultiPlatformOp):
+    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
+        c = math.sqrt(2.0 / math.pi)
+        return 0.5 * x * (1.0 + torch.tanh(c * (x + 0.044715 * torch.pow(x, 3.0))))
+
+    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
+        # TODO: Implement the CUDA kernel for NewGELU in sgl-kernel
+        return self.forward_native(x)
+
+
+class ReLU2(nn.Module):
+    """
+    Applies the squared Rectified Linear Unit function.
+    y = max(0, x)^2
+    """
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = F.relu(x)
+        return x * x
+
+
+class QuickGELU(MultiPlatformOp):
+    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
+        return x * torch.sigmoid(1.702 * x)
+
+    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
+        return self.forward_native(x)
+
+    def forward_hip(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch.empty(x.shape, dtype=x.dtype, device=x.device)
+        gelu_quick(x, out)
+        return out
+
+    def forward_npu(self, x: torch.Tensor) -> torch.Tensor:
+        return torch_npu.npu_fast_gelu(x)
+
+
+class XIELU(MultiPlatformOp):
+    """
+    Applies the xIELU activation function introduced in https://arxiv.org/abs/2411.13010
+    If the user has installed the nickjbrowning/XIELU, we import xIELU CUDA
+    Otherwise, we emit a single warning and use xIELU Python
+    """
+
+    def __init__(
+        self,
+        alpha_p_init: float = 0.8,
+        alpha_n_init: float = 0.8,
+        beta: float = 0.5,
+        eps: float = -1e-6,
+        dtype: torch.dtype = torch.bfloat16,
+        with_vector_loads: bool = False,
+    ):
+        super().__init__()
+        self.alpha_p = nn.Parameter(
+            torch.log(torch.exp(torch.tensor(alpha_p_init, dtype=dtype)) - 1).unsqueeze(
+                0
+            )
+        )
+        self.alpha_n = nn.Parameter(
+            torch.log(
+                torch.exp(torch.tensor(alpha_n_init - beta, dtype=dtype)) - 1
+            ).unsqueeze(0)
+        )
+        self.register_buffer("beta", torch.tensor(beta, dtype=dtype))
+        self.register_buffer("eps", torch.tensor(eps, dtype=dtype))
+        self.with_vector_loads = with_vector_loads
+        # Temporary until xIELU CUDA fully implemented
+        self._beta_scalar = float(self.beta.detach().cpu().float().item())
+        self._eps_scalar = float(self.eps.detach().cpu().float().item())
+
+        self._xielu_cuda_obj = None
+        try:
+            import xielu.ops  # noqa: F401
+
+            self._xielu_cuda_obj = torch.classes.xielu.XIELU()
+            msg = "Using experimental xIELU CUDA."
+            try:
+                from torch._dynamo import allow_in_graph
+
+                self._xielu_cuda_fn = allow_in_graph(self._xielu_cuda)
+                msg += " Enabled torch._dynamo for xIELU CUDA."
+            except Exception as err:
+                msg += (
+                    f" Could not enable torch._dynamo for xIELU ({err}) - "
+                    "this may result in slower performance."
+                )
+                self._xielu_cuda_fn = self._xielu_cuda
+            logger.warning_once(msg)
+        except Exception as err:
+            pass
+            # logger.warning_once(
+            #     "CUDA-fused xIELU not available (%s) –"
+            #     " falling back to a Python version.\n"
+            #     "For CUDA xIELU (experimental), `pip install git+https://github.com/nickjbrowning/XIELU`",
+            #     str(err),
+            # )
+
+    def _xielu_python(self, x: torch.Tensor) -> torch.Tensor:
+        alpha_p = nn.functional.softplus(self.alpha_p)
+        alpha_n = self.beta + nn.functional.softplus(self.alpha_n)
+        return torch.where(
+            x > 0,
+            alpha_p * x * x + self.beta * x,
+            (torch.expm1(torch.min(x, self.eps)) - x) * alpha_n + self.beta * x,
+        )
+
+    def _xielu_cuda(self, x: torch.Tensor) -> torch.Tensor:
+        """Firewall function to prevent torch.compile from seeing .item()"""
+        assert self._xielu_cuda_obj is not None, "XIELU CUDA object must not be None"
+        original_shape = x.shape
+        # CUDA kernel expects 3D tensors, reshape if needed
+        while x.dim() < 3:
+            x = x.unsqueeze(0)
+        if x.dim() > 3:
+            x = x.view(-1, 1, x.size(-1))
+        if original_shape != x.shape:
+            logger.warning_once(
+                "Warning: xIELU input tensor expects 3 dimensions"
+                " but got (shape: %s). Reshaping to (shape: %s).\n"
+                "Note: For SGLang this may be expected if sending"
+                "[B*S,D] instead of [B,S,D].",
+                original_shape,
+                x.shape,
+            )
+        result = self._xielu_cuda_obj.forward(
+            x,
+            self.alpha_p,
+            self.alpha_n,
+            # Temporary until xIELU CUDA fully implemented -> self.{beta,eps}.item()
+            self._beta_scalar,
+            self._eps_scalar,
+            self.with_vector_loads,
+        )
+        return result.view(original_shape)
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        if self._xielu_cuda_obj is not None and input.is_cuda:
+            if not torch._dynamo.is_compiling():
+                return self._xielu_cuda_fn(input)
+            else:
+                logger.warning_once(
+                    "torch._dynamo is compiling, using Python version of xIELU."
+                )
+        return self._xielu_python(input)
+
+
+class ScaledActivation(nn.Module):
+    """An activation function with post-scale parameters.
+
+    This is used for some quantization methods like AWQ.
+    """
+
+    def __init__(
+        self,
+        act_module: nn.Module,
+        intermediate_size: int,
+        input_is_parallel: bool = True,
+        params_dtype: Optional[torch.dtype] = None,
+    ):
+        super().__init__()
+        self.act = act_module
+        self.input_is_parallel = input_is_parallel
+        if input_is_parallel:
+            tp_size = get_tensor_model_parallel_world_size()
+            intermediate_size_per_partition = divide(intermediate_size, tp_size)
+        else:
+            intermediate_size_per_partition = intermediate_size
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+        self.scales = nn.Parameter(
+            torch.empty(intermediate_size_per_partition, dtype=params_dtype)
+        )
+        set_weight_attrs(self.scales, {"weight_loader": self.weight_loader})
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.act(x) / self.scales
+
+    def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
+        param_data = param.data
+        if self.input_is_parallel:
+            tp_rank = get_tensor_model_parallel_rank()
+            shard_size = param_data.shape[0]
+            start_idx = tp_rank * shard_size
+            loaded_weight = loaded_weight.narrow(0, start_idx, shard_size)
+        assert param_data.shape == loaded_weight.shape
+        param_data.copy_(loaded_weight)
+
+
+_ACTIVATION_REGISTRY = {
+    "gelu": nn.GELU(),
+    "gelu_pytorch_tanh": nn.GELU(approximate="tanh"),
+    "gelu_new": NewGELU(),
+    "relu2": ReLU2(),
+    "xielu": XIELU(),
+}
+
+
+def get_act_fn(
+    act_fn_name: str,
+    quant_config: Optional[QuantizationConfig] = None,
+    intermediate_size: Optional[int] = None,
+    input_is_parallel: bool = True,
+    params_dtype: Optional[torch.dtype] = None,
+) -> nn.Module:
+    """Get an activation function by name."""
+    act_fn_name = act_fn_name.lower()
+    if act_fn_name not in _ACTIVATION_REGISTRY:
+        raise ValueError(f"Activation function {act_fn_name!r} is not supported.")
+
+    act_fn = _ACTIVATION_REGISTRY[act_fn_name]
+    if quant_config is not None and act_fn_name in quant_config.get_scaled_act_names():
+        if intermediate_size is None:
+            raise ValueError(
+                "intermediate_size must be specified for scaled "
+                "activation functions."
+            )
+        return ScaledActivation(
+            act_fn, intermediate_size, input_is_parallel, params_dtype
+        )
+    return act_fn
+
+
+def get_cross_encoder_activation_function(config: PretrainedConfig):
+    if (
+        hasattr(config, "sbert_ce_default_activation_function")
+        and config.sbert_ce_default_activation_function is not None
+    ):
+
+        function_name = config.sbert_ce_default_activation_function
+        assert function_name.startswith("torch.nn.modules."), (
+            "Loading of activation functions is restricted to "
+            "torch.nn.modules for security reasons"
+        )
+        return resolve_obj_by_qualname(function_name)()
+    else:
+        # adapt bge-reranker
+        return nn.Identity()
diff --git a/sglang/python/sglang/srt/layers/amx_utils.py b/sglang/python/sglang/srt/layers/amx_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..81a455976637974f5ee10f6fa2186bc15f2a29c2
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/amx_utils.py
@@ -0,0 +1,139 @@
+import logging
+
+import torch
+
+from sglang.srt.utils import cpu_has_amx_support
+
+logger = logging.getLogger(__name__)
+
+from enum import IntEnum
+
+
+class CPUQuantMethod(IntEnum):
+    UNQUANT = 0
+    INT8_W8A8 = 1
+    FP8_W8A16 = 2
+    INT4_W4A8 = 3
+
+
+def amx_process_weight_after_loading(weight, is_conv=False):
+    if weight.device != torch.device("cpu"):
+        return weight
+    if not cpu_has_amx_support():
+        return weight
+    if is_conv:
+        return torch.ops.sgl_kernel.causal_conv1d_weight_pack(
+            weight.view(-1, weight.size(-1))
+        )
+    else:
+        return torch.ops.sgl_kernel.convert_weight_packed(weight)
+
+
+# TODO: currently gemm kernel has the below requirements:
+# OC: OC % TILE_N == 0 or OC < TILE_N, where TILE_N = 16
+# IC: IC % TILE_K == 0, where TILE_K = 32
+def dim_is_supported(weight):
+    TILE_N = 16
+    TILE_K = 32
+    ndim = weight.ndim
+    OC = weight.size(1) if ndim == 3 else weight.size(0)
+    IC = weight.size(2) if ndim == 3 else weight.size(1)
+    is_oc_support = OC < TILE_N or OC % TILE_N == 0
+    is_ic_support = IC % TILE_K == 0
+    return is_oc_support and is_ic_support
+
+
+def dtype_is_supported(weight):
+    return weight.dtype in [
+        torch.float16,
+        torch.bfloat16,
+        torch.int8,
+        torch.float8_e4m3fn,
+    ]
+
+
+def is_dim_conv_weight(weight):
+    return weight.dim() == 3 and weight.size(1) == 1
+
+
+def _init_amx_conv_state(conv_state):
+    # CPU AMX layout for conv_state kernel optimization
+    conv_state_cpu = []
+    for conv_shape_t in conv_state:
+        conv_shape_new = conv_shape_t.as_strided_(
+            conv_shape_t.size(),
+            (
+                conv_shape_t.stride(0),
+                conv_shape_t.stride(1),
+                1,
+                conv_shape_t.size(2),
+            ),
+        )
+        conv_state_cpu.append(conv_shape_new)
+    return conv_state_cpu
+
+
+def _amx_process_weight_after_loading(
+    module, weight_names, transpose_dims=None
+) -> None:
+    # Pack weight for get better performance on CPU
+    devices = {getattr(module, weight_name).device for weight_name in weight_names}
+    assert len(devices) == 1, f"Expects all weights to be on the same device"
+    device = devices.pop()
+
+    if transpose_dims:
+        assert len(weight_names) == len(
+            transpose_dims
+        ), "len(weight_names) should be equal to len(transpose_dims)"
+
+    for i, weight_name in enumerate(weight_names):
+        weight_tensor = getattr(module, weight_name)
+
+        if transpose_dims and transpose_dims[i]:
+            weight_tensor = weight_tensor.transpose(*transpose_dims[i])
+        is_conv_weight = is_dim_conv_weight(weight_tensor)
+        # We don't pack weight or use intel amx backend if any weight of this module has unsupported dim.
+        if (
+            (not dim_is_supported(weight_tensor))
+            or not dtype_is_supported(weight_tensor)
+        ) and (not is_conv_weight):
+            logger.warning(
+                f"Unsupported dimension or dtype for prepacking for weight '{weight_name}' with shape {weight_tensor.shape} and dtype {weight_tensor.dtype} in {module}. "
+                f"The derived (OC, IC) dimensions must be divisible by (16, 32). "
+            )
+            module.use_intel_amx_backend = False
+            return
+
+        packed_weight = torch.nn.Parameter(
+            amx_process_weight_after_loading(weight_tensor, is_conv_weight),
+            requires_grad=False,
+        )
+        packed_weight.__dict__ = weight_tensor.__dict__
+        setattr(module, weight_name, packed_weight)
+        if is_conv_weight:
+            # need to use inplace copy for conv weight amx packing,
+            # as its usage in radix_linear_attention will use the original conv weight.
+            weight_tensor = weight_tensor.view(-1, weight_tensor.size(-1))
+            weight_tensor.copy_(packed_weight)
+
+    module.use_intel_amx_backend = (
+        device == torch.device("cpu") and cpu_has_amx_support()
+    )
+
+    if (
+        module.use_intel_amx_backend
+        and hasattr(module, "bias")
+        and module.bias is not None
+    ):
+        module.bias = torch.nn.Parameter(module.bias.data.float(), requires_grad=False)
+
+
+class PackWeightMethod:
+    def __init__(self, weight_names, transpose_dims=None):
+        self.weight_names = weight_names
+        self.transpose_dims = transpose_dims
+
+    def process_weights_after_loading(self, module) -> None:
+        _amx_process_weight_after_loading(
+            module, self.weight_names, self.transpose_dims
+        )
diff --git a/sglang/python/sglang/srt/layers/attention/__pycache__/attention_registry.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/__pycache__/attention_registry.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..932526c0e62fe1b6bae2830e819c87a791ed20b3
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/__pycache__/attention_registry.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/__pycache__/base_attn_backend.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/__pycache__/base_attn_backend.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d961ca8cc72f2f934955a5d1279eb07c7b4d56a4
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/__pycache__/base_attn_backend.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/__pycache__/flashattention_backend.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/__pycache__/flashattention_backend.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..664522a64348071efa3b3caf4f040765b14f8602
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/__pycache__/flashattention_backend.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/__pycache__/hybrid_linear_attn_backend.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/__pycache__/hybrid_linear_attn_backend.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..717b6b20b796974a4441460856e970bf31bb094d
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/__pycache__/hybrid_linear_attn_backend.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/__pycache__/tbo_backend.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/__pycache__/tbo_backend.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a85db9b39c5ff595a504b0bfe72277dffa53300c
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/__pycache__/tbo_backend.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/__pycache__/triton_backend.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/__pycache__/triton_backend.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d58d3268e646fc857fb56c9c28ffc8fd6e87438c
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/__pycache__/triton_backend.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/__pycache__/utils.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/__pycache__/utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..af95a2af88a0ddff91db90249ada972862012160
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/__pycache__/utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/__pycache__/vision.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/__pycache__/vision.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e7c5264dc9fe9cdeaf12150b0245bd63d69a8953
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/__pycache__/vision.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/__pycache__/vision_utils.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/__pycache__/vision_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e1c47561e72a8072c852f77069d28ae333eee13b
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/__pycache__/vision_utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/aiter_backend.py b/sglang/python/sglang/srt/layers/attention/aiter_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a2fca52216ba753f48f5ec2192ae74b2e019adf
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/aiter_backend.py
@@ -0,0 +1,2236 @@
+from __future__ import annotations
+
+"""
+end to end attention solution with aiter kernels
+"""
+
+import logging
+from dataclasses import dataclass
+from enum import Enum, auto
+from typing import TYPE_CHECKING, Optional
+
+import torch
+import triton
+
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+from sglang.srt.utils import is_gfx95_supported
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.radix_attention import RadixAttention
+    from sglang.srt.model_executor.model_runner import ModelRunner
+    from sglang.srt.speculative.spec_info import SpecInput
+
+try:
+    from aiter import (
+        flash_attn_varlen_func,
+        get_mla_metadata_info_v1,
+        get_mla_metadata_v1,
+        get_ps_metadata_info_v1,
+        get_ps_metadata_v1,
+        mha_batch_prefill_func,
+        mla_prefill_ps_asm_fwd,
+        mla_reduce_v1,
+        paged_attention_ragged,
+    )
+    from aiter.mla import mla_decode_fwd, mla_prefill_fwd
+except ImportError:
+    print(
+        "aiter is AMD specific kernel library. Please make sure aiter is installed on your AMD device."
+    )
+
+from sglang.srt.configs.model_config import AttentionArch
+from sglang.srt.layers.attention.utils import pad_sequence_with_mask
+from sglang.srt.layers.quantization.fp8_kernel import fp8_dtype
+from sglang.srt.utils import get_bool_env_var
+
+logger = logging.getLogger(__name__)
+
+# Use aiter mla persist design for fp8-kv cache
+_use_mla_ps_kernel = get_bool_env_var("SGLANG_AITER_MLA_PERSIST", "True")
+
+# Use fp8 prefill only on gfx95
+_use_fp8_prefill_attn = (
+    get_bool_env_var("SGLANG_AITER_FP8_PREFILL_ATTN", "True") and is_gfx95_supported()
+)
+
+# Persist
+# fast_mode=True if _use_mla_ps_kernel else False
+# intra_batch_mode=False if _use_mla_ps_kernel else True
+
+# fake non-ps, intra_batch_mode needs to be True for non-ps-mode
+fast_mode = False
+intra_batch_mode = True if _use_mla_ps_kernel else False
+
+
+class WrapperDispatch(Enum):
+    SLIDING_WINDOW = auto()
+    CROSS_ATTENTION = auto()
+
+
+@dataclass
+class ForwardMetadata:
+    kv_indptr: torch.Tensor
+    kv_indices: torch.Tensor
+    qo_indptr: torch.Tensor
+    kv_last_page_len: torch.Tensor
+    max_q_len: int
+    max_kv_len: Optional[int]
+    work_metadata: Optional[torch.Tensor] = None
+    work_info_set: Optional[torch.Tensor] = None
+    work_indptr: Optional[torch.Tensor] = None
+    reduce_indptr: Optional[torch.Tensor] = None
+    reduce_final_map: Optional[torch.Tensor] = None
+    reduce_partial_map: Optional[torch.Tensor] = None
+    num_kv_splits: Optional[int] = None
+    run_graph: Optional[bool] = True
+    custom_mask: Optional[torch.Tensor] = None
+    mask_indptr: Optional[torch.Tensor] = None
+    max_extend_len: Optional[int] = None
+    fp8_prefill_kv_indices: Optional[torch.Tensor] = None
+
+
+global_workspace_buffer = None
+
+_AITER_PARTITION_SIZE_ROCM = 256
+
+
+class AiterAttnBackend(AttentionBackend):
+    def __init__(
+        self,
+        model_runner: ModelRunner,
+        skip_prefill: bool = False,
+        kv_indptr_buf: Optional[torch.Tensor] = None,
+    ):
+        super().__init__()
+        # Lazy import to avoid the initialization of cuda context
+        from sglang.srt.layers.attention.triton_ops.extend_attention import (
+            extend_attention_fwd,
+        )
+
+        self.input_dtype = model_runner.model_config.dtype
+
+        self.page_size = model_runner.server_args.page_size
+
+        self.extend_attention_fwd = torch.compiler.disable(extend_attention_fwd)
+
+        self.device = model_runner.device
+        self.is_multimodal = model_runner.model_config.is_multimodal
+        self.num_draft_tokens = model_runner.server_args.speculative_num_draft_tokens
+        self.speculative_num_steps = model_runner.server_args.speculative_num_steps
+        self.num_head = (
+            model_runner.model_config.num_attention_heads // get_attention_tp_size()
+        )
+        self.head_dim = model_runner.model_config.head_dim
+        self.num_kv_head = model_runner.model_config.get_num_kv_heads(
+            get_attention_tp_size()
+        )
+        self.kv_cache_dtype = model_runner.kv_cache_dtype
+
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+
+        self.use_mla = model_runner.model_config.attention_arch == AttentionArch.MLA
+
+        # Get v_head_dim based on model type
+        if self.use_mla:
+            # For MLA models, get v_head_dim from model config
+            self.v_head_dim = model_runner.model_config.v_head_dim
+        elif hasattr(model_runner.token_to_kv_pool, "get_v_head_dim"):
+            # For hybrid models (Mamba+attention, GDN, Kimi linear),
+            # layer_id=0 may not be a full attention layer
+            self.v_head_dim = model_runner.token_to_kv_pool.get_v_head_dim()
+        else:
+            self.v_head_dim = model_runner.token_to_kv_pool.get_value_buffer(0).shape[
+                -1
+            ]
+
+        # Parse constants
+        self.max_context_len = model_runner.model_config.context_len
+        self.skip_prefill = skip_prefill
+
+        max_bs = model_runner.req_to_token_pool.size
+
+        if kv_indptr_buf is None:
+            self.kv_indptr = torch.zeros(
+                (max_bs + 1,), dtype=torch.int32, device=model_runner.device
+            )
+        else:
+            self.kv_indptr = kv_indptr_buf
+
+        self.kv_last_page_len = torch.ones(
+            (max_bs,), dtype=torch.int32, device=model_runner.device
+        )
+        self.qo_indptr = torch.zeros(
+            (max_bs + 1,), dtype=torch.int32, device=model_runner.device
+        )
+        self.mask_indptr = torch.zeros(
+            (max_bs + 1,), dtype=torch.int64, device=model_runner.device
+        )
+
+        # Create prefill indices updater
+        if not skip_prefill:
+            self.indices_updater_prefill = AiterIndicesUpdaterPrefill(
+                model_runner, self
+            )
+            if self.use_mla:
+                self.mla_indices_updater_prefill = AiterMlaIndicesUpdaterPrefill(
+                    model_runner, self
+                )
+
+        # aiter kernel related initialization
+        self.max_num_partitions = (
+            self.max_context_len + _AITER_PARTITION_SIZE_ROCM - 1
+        ) // _AITER_PARTITION_SIZE_ROCM
+
+        nbyes_per_qo_elem = torch.finfo(torch.float32).bits // 8
+
+        if not self.use_mla:
+            self.workspace_buffer = torch.empty(
+                (max_bs * self.num_head * self.max_num_partitions * self.head_dim)
+                * nbyes_per_qo_elem
+                + 2 * (max_bs * self.num_head * self.max_num_partitions) * 4,
+                dtype=torch.uint8,
+                device=self.device,
+            )
+
+        self.scale = float(1.0 / (self.head_dim**0.5))
+        self.k_scale = self.v_scale = torch.tensor([1.0], dtype=torch.float32).to(
+            self.device
+        )
+
+        self.logits_soft_cap = 0.0
+
+        self.forward_metadata: ForwardMetadata = None
+
+        if self.use_mla:
+            self.enable_dp_attention = is_dp_attention_enabled()
+            self.qo_indptr_ = torch.zeros(
+                (max_bs + 1,), dtype=torch.int32, device=model_runner.device
+            )
+            global _use_mla_ps_kernel, fast_mode, intra_batch_mode
+
+            if self.num_head == 32:
+                fast_mode = True
+                intra_batch_mode = False
+
+            # current persist a16w16 mla_decode kernel does not support head_num = 128
+            # need to fall back to non-persist
+            # only use mla_ps_kernel when fp8 kv_cache
+            # for non-fp8 kv_cache on tp8, use non-persist kernel to avoid performance degradation
+            # head_num=16 (tp8 perf issue), head_num=128 (unsupported, like tp1 or --enable-dp-attention with tp8-dp8)
+            if (
+                self.num_head == 16 or self.num_head == 128
+            ) and self.kv_cache_dtype is not fp8_dtype:
+                _use_mla_ps_kernel = False
+                fast_mode = False
+                intra_batch_mode = False
+
+            self.max_split_per_batch = 32 if _use_mla_ps_kernel else None
+
+            if self.num_draft_tokens is None and _use_mla_ps_kernel:
+                self.max_split_per_batch = 64
+
+            self.fix_max_split_per_batch = self.max_split_per_batch
+
+    def make_mla_decode_meta_data_buffer(self, max_seqlen_qo, batch_size):
+        nhead = self.num_head
+        dtype = self.kv_cache_dtype
+
+        if self.enable_dp_attention:
+            gpu = torch.cuda.current_device()
+            device_properties = torch.cuda.get_device_properties(gpu)
+            cu_num = device_properties.multi_processor_count
+            self.max_split_per_batch = min(
+                (cu_num + batch_size - 1) // batch_size, self.fix_max_split_per_batch
+            )
+
+        (
+            (work_meta_data_size, work_meta_data_type),
+            (work_indptr_size, work_indptr_type),
+            (work_info_set_size, work_info_set_type),
+            (reduce_indptr_size, reduce_indptr_type),
+            (reduce_final_map_size, reduce_final_map_type),
+            (reduce_partial_map_size, reduce_partial_map_type),
+        ) = get_mla_metadata_info_v1(
+            batch_size,
+            max_seqlen_qo,
+            nhead,
+            dtype,
+            dtype,
+            is_sparse=False,
+            fast_mode=fast_mode,
+            num_kv_splits=self.max_split_per_batch,
+            intra_batch_mode=intra_batch_mode,
+        )
+
+        # aiter implementation
+        # the tensor's meaning please refer aiter/ops/attention.py
+        work_metadata = torch.empty(
+            work_meta_data_size, dtype=work_meta_data_type, device="cuda"
+        )
+        work_indptr = torch.empty(
+            work_indptr_size, dtype=work_indptr_type, device="cuda"
+        )
+        work_info_set = torch.empty(
+            work_info_set_size,
+            dtype=work_info_set_type,
+            device="cuda",
+        )
+        reduce_indptr = torch.empty(
+            reduce_indptr_size, dtype=reduce_indptr_type, device="cuda"
+        )
+        reduce_final_map = torch.empty(
+            reduce_final_map_size, dtype=reduce_final_map_type, device="cuda"
+        )
+        reduce_partial_map = torch.empty(
+            reduce_partial_map_size, dtype=reduce_partial_map_type, device="cuda"
+        )
+
+        return (
+            work_metadata,
+            work_indptr,
+            work_info_set,
+            reduce_indptr,
+            reduce_final_map,
+            reduce_partial_map,
+        )
+
+    def make_mla_meta_data(
+        self,
+        qo_indptr,
+        kv_indptr,
+        kv_last_page_len,
+        work_metadata,
+        work_info_set,
+        work_indptr,
+        reduce_indptr,
+        reduce_final_map,
+        reduce_partial_map,
+        max_q_len,
+        fast_mode,
+        max_split_per_batch,
+        intra_batch_mode,
+    ):
+
+        nhead_kv = 1
+        page_size = self.page_size
+        dtype = self.kv_cache_dtype
+
+        meta = get_mla_metadata_v1(
+            qo_indptr,
+            kv_indptr,
+            kv_last_page_len,
+            self.num_head // nhead_kv,
+            nhead_kv,
+            False,
+            work_metadata,
+            work_info_set,
+            work_indptr,
+            reduce_indptr,
+            reduce_final_map,
+            reduce_partial_map,
+            kv_granularity=max(page_size, 16),
+            max_seqlen_qo=max_q_len,
+            uni_seqlen_qo=max_q_len,
+            fast_mode=fast_mode,
+            max_split_per_batch=max_split_per_batch,
+            intra_batch_mode=intra_batch_mode,
+            dtype_q=dtype,
+            dtype_kv=dtype,
+        )
+
+    def make_mla_prefill_ps_meta_data_buffer(
+        self, batch_size: int, max_qlen: int, qlen_granularity: int
+    ):
+        (
+            (work_meta_data_size, work_meta_data_type),
+            (work_indptr_size, work_indptr_type),
+            (work_info_size, work_info_type),
+            (reduce_indptr_size, reduce_indptr_type),
+            (reduce_final_map_size, reduce_final_map_type),
+            (reduce_partial_map_size, reduce_partial_map_type),
+        ) = get_ps_metadata_info_v1(
+            batch_size=batch_size,
+            num_head_k=self.num_kv_head,
+            max_qlen=max_qlen,
+            qlen_granularity=qlen_granularity,
+        )
+
+        device = self.device
+        work_metadata_ptrs = torch.empty(
+            work_meta_data_size, dtype=work_meta_data_type, device=device
+        )
+        work_indptr = torch.empty(
+            work_indptr_size, dtype=work_indptr_type, device=device
+        )
+        work_info = torch.empty(work_info_size, dtype=work_info_type, device=device)
+        reduce_indptr = torch.empty(
+            reduce_indptr_size, dtype=reduce_indptr_type, device=device
+        )
+        reduce_final_map = torch.empty(
+            reduce_final_map_size, dtype=reduce_final_map_type, device=device
+        )
+        reduce_partial_map = torch.empty(
+            reduce_partial_map_size, dtype=reduce_partial_map_type, device=device
+        )
+
+        return (
+            work_metadata_ptrs,
+            work_indptr,
+            work_info,
+            reduce_indptr,
+            reduce_final_map,
+            reduce_partial_map,
+        )
+
+    def make_mla_prefill_ps_meta_data(
+        self,
+        qo_indptr: torch.Tensor,
+        kv_indptr: torch.Tensor,
+        seq_lens: torch.Tensor,
+        work_metadata: torch.Tensor,
+        work_indptr: torch.Tensor,
+        work_info: torch.Tensor,
+        reduce_indptr: torch.Tensor,
+        reduce_final_map: torch.Tensor,
+        reduce_partial_map: torch.Tensor,
+        is_causal: bool = True,
+    ):
+        gqa_ratio = self.num_head // self.num_kv_head
+        num_heads_k = self.num_kv_head
+        tile_q = 256
+        qhead_granularity = gqa_ratio
+        qlen_granularity = tile_q // qhead_granularity
+        kvlen_granularity = max(128, self.page_size)
+        block_size = self.page_size
+
+        qo_indptr_cpu = qo_indptr.to("cpu", dtype=torch.int32)
+        kv_indptr_cpu = kv_indptr.to("cpu", dtype=torch.int32)
+        seq_lens_cpu = seq_lens.to("cpu", dtype=torch.int32)
+
+        get_ps_metadata_v1(
+            qo_indptr_cpu,
+            kv_indptr_cpu,
+            seq_lens_cpu,
+            gqa_ratio,
+            num_heads_k,
+            work_metadata,
+            work_indptr,
+            work_info,
+            reduce_indptr,
+            reduce_final_map,
+            reduce_partial_map,
+            qhead_granularity=qhead_granularity,
+            qlen_granularity=qlen_granularity,
+            kvlen_granularity=kvlen_granularity,
+            block_size=block_size,
+            is_causal=is_causal,
+        )
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        """Init auxiliary variables for triton attention backend."""
+
+        bs = forward_batch.batch_size
+        kv_indptr = self.kv_indptr
+        spec_info = forward_batch.spec_info
+        qo_indptr = None
+        kv_last_page_len = None
+        max_q_len = None
+
+        work_metadata = None
+        work_indptr = None
+        work_info_set = None
+        reduce_indptr = None
+        reduce_final_map = None
+        reduce_partial_map = None
+
+        num_kv_splits = None
+        # num_kv_splits_indptr = None
+
+        if forward_batch.forward_mode.is_decode_or_idle():
+            if spec_info is None or forward_batch.forward_mode.is_idle():
+                kv_indptr[1 : bs + 1] = torch.cumsum(forward_batch.seq_lens, dim=0)
+                kv_indptr = kv_indptr[: bs + 1]
+                kv_indices = torch.empty(
+                    forward_batch.seq_lens_sum, dtype=torch.int32, device=self.device
+                )
+                create_flashinfer_kv_indices_triton[(bs,)](
+                    self.req_to_token,
+                    forward_batch.req_pool_indices,
+                    forward_batch.seq_lens,
+                    kv_indptr,
+                    None,
+                    kv_indices,
+                    self.req_to_token.stride(0),
+                )
+            else:
+                kv_indptr, kv_indices = spec_info.kv_indptr, spec_info.kv_indices
+                bs = kv_indptr.shape[0] - 1
+
+            if self.use_mla:
+                qo_indptr = self.qo_indptr_[: bs + 1]
+                qo_indptr[1 : bs + 1] = torch.cumsum(self.kv_last_page_len[:bs], dim=0)
+                kv_last_page_len = self.kv_last_page_len[:bs]
+                max_q_len = 1
+
+                if _use_mla_ps_kernel:
+                    (
+                        work_metadata,
+                        work_indptr,
+                        work_info_set,
+                        reduce_indptr,
+                        reduce_final_map,
+                        reduce_partial_map,
+                    ) = self.make_mla_decode_meta_data_buffer(max_q_len, bs)
+
+                    num_kv_splits = self.max_split_per_batch
+
+                    self.make_mla_meta_data(
+                        qo_indptr,
+                        kv_indptr,
+                        kv_last_page_len,
+                        work_metadata,
+                        work_info_set,
+                        work_indptr,
+                        reduce_indptr,
+                        reduce_final_map,
+                        reduce_partial_map,
+                        max_q_len,
+                        fast_mode=fast_mode,
+                        max_split_per_batch=num_kv_splits,
+                        intra_batch_mode=intra_batch_mode,
+                    )
+
+            self.forward_metadata = ForwardMetadata(
+                kv_indptr,
+                kv_indices,
+                qo_indptr,
+                kv_last_page_len,
+                max_q_len,
+                None,
+                work_metadata=work_metadata,
+                work_info_set=work_info_set,
+                work_indptr=work_indptr,
+                reduce_indptr=reduce_indptr,
+                reduce_final_map=reduce_final_map,
+                reduce_partial_map=reduce_partial_map,
+                num_kv_splits=num_kv_splits,
+                run_graph=False,
+            )
+
+        elif forward_batch.forward_mode.is_draft_extend():
+            if self.use_mla:
+                kv_indices, kv_indptr, qo_indptr, custom_mask = (
+                    spec_info.generate_attn_arg_prefill(
+                        forward_batch.req_pool_indices,
+                        forward_batch.seq_lens,
+                        forward_batch.seq_lens_sum,
+                        self.req_to_token,
+                    )
+                )
+
+                if _use_mla_ps_kernel:
+                    max_seqlen_qo = max(forward_batch.extend_seq_lens_cpu)
+                    (
+                        work_metadata,
+                        work_indptr,
+                        work_info_set,
+                        reduce_indptr,
+                        reduce_final_map,
+                        reduce_partial_map,
+                    ) = self.make_mla_decode_meta_data_buffer(max_seqlen_qo, bs)
+
+                    num_kv_splits = self.max_split_per_batch
+
+                    self.make_mla_meta_data(
+                        qo_indptr,
+                        kv_indptr,
+                        self.kv_last_page_len[:bs],
+                        work_metadata,
+                        work_info_set,
+                        work_indptr,
+                        reduce_indptr,
+                        reduce_final_map,
+                        reduce_partial_map,
+                        max_seqlen_qo,
+                        fast_mode=fast_mode,
+                        max_split_per_batch=num_kv_splits,
+                        intra_batch_mode=intra_batch_mode,
+                    )
+
+                self.forward_metadata = ForwardMetadata(
+                    kv_indptr,
+                    kv_indices,
+                    qo_indptr,
+                    # self.mla_indices_updater_prefill.kv_last_page_len,
+                    self.kv_last_page_len[:bs],
+                    max(forward_batch.extend_seq_lens_cpu),
+                    forward_batch.seq_lens_cpu.max().item(),
+                    work_metadata=work_metadata,
+                    work_info_set=work_info_set,
+                    work_indptr=work_indptr,
+                    reduce_indptr=reduce_indptr,
+                    reduce_final_map=reduce_final_map,
+                    reduce_partial_map=reduce_partial_map,
+                    num_kv_splits=num_kv_splits,
+                    run_graph=False,
+                )
+            else:
+                # Non-MLA draft_extend: use triton extend kernel with causal masking
+                kv_indices, kv_indptr, qo_indptr, custom_mask = (
+                    spec_info.generate_attn_arg_prefill(
+                        forward_batch.req_pool_indices,
+                        forward_batch.seq_lens,
+                        forward_batch.seq_lens_sum,
+                        self.req_to_token,
+                    )
+                )
+                kv_indices = kv_indices.to(torch.int64)
+                draft_max_extend_len = torch.max(spec_info.accept_length).item()
+
+                self.forward_metadata = ForwardMetadata(
+                    kv_indptr,
+                    kv_indices,
+                    qo_indptr,
+                    None,
+                    draft_max_extend_len,
+                    None,
+                    custom_mask=custom_mask,
+                    mask_indptr=None,
+                    max_extend_len=draft_max_extend_len,
+                )
+        elif forward_batch.forward_mode.is_target_verify():
+            if self.use_mla:
+                draft_num = spec_info.draft_token_num
+                kv_lens = forward_batch.seq_lens + draft_num
+                kv_lens_sum = forward_batch.seq_lens_sum + draft_num * bs
+                device = forward_batch.seq_lens.device
+
+                qo_indptr = torch.arange(
+                    0,
+                    (1 + bs) * draft_num,
+                    step=draft_num,
+                    dtype=torch.int32,
+                    device=device,
+                )
+                kv_indptr = self.kv_indptr
+                kv_indptr[1 : bs + 1] = torch.cumsum(kv_lens, dim=0)
+                kv_indptr = kv_indptr[: bs + 1]
+                kv_indices = torch.empty(
+                    kv_lens_sum,
+                    dtype=torch.int32,
+                    device=device,
+                )
+                create_flashinfer_kv_indices_triton[(bs,)](
+                    self.req_to_token,
+                    forward_batch.req_pool_indices,
+                    kv_lens,
+                    kv_indptr,
+                    None,
+                    kv_indices,
+                    self.req_to_token.stride(0),
+                )
+
+                # if self.kv_cache_dtype == fp8_dtype:
+                if _use_mla_ps_kernel:
+                    max_seqlen_qo = draft_num
+                    (
+                        work_metadata,
+                        work_indptr,
+                        work_info_set,
+                        reduce_indptr,
+                        reduce_final_map,
+                        reduce_partial_map,
+                    ) = self.make_mla_decode_meta_data_buffer(max_seqlen_qo, bs)
+
+                    num_kv_splits = self.max_split_per_batch
+
+                    self.make_mla_meta_data(
+                        qo_indptr,
+                        kv_indptr,
+                        self.kv_last_page_len[:bs],
+                        work_metadata,
+                        work_info_set,
+                        work_indptr,
+                        reduce_indptr,
+                        reduce_final_map,
+                        reduce_partial_map,
+                        max_seqlen_qo,
+                        fast_mode=fast_mode,
+                        max_split_per_batch=num_kv_splits,
+                        intra_batch_mode=intra_batch_mode,
+                    )
+
+                self.forward_metadata = ForwardMetadata(
+                    kv_indptr,
+                    kv_indices,
+                    qo_indptr,
+                    # self.mla_indices_updater_prefill.kv_last_page_len,
+                    self.kv_last_page_len[:bs],
+                    draft_num,
+                    None,
+                    work_metadata=work_metadata,
+                    work_info_set=work_info_set,
+                    work_indptr=work_indptr,
+                    reduce_indptr=reduce_indptr,
+                    reduce_final_map=reduce_final_map,
+                    reduce_partial_map=reduce_partial_map,
+                    num_kv_splits=num_kv_splits,
+                    run_graph=False,
+                )
+            else:
+                # Non-MLA target_verify: use triton extend kernel with custom mask
+                bs = len(forward_batch.req_pool_indices)
+                draft_num = spec_info.draft_token_num
+
+                qo_indptr = torch.arange(
+                    0,
+                    (1 + bs) * draft_num,
+                    step=draft_num,
+                    dtype=torch.int32,
+                    device=self.device,
+                )
+
+                kv_indptr[1 : bs + 1] = torch.cumsum(forward_batch.seq_lens, dim=0)
+                kv_indptr = kv_indptr[: bs + 1]
+
+                kv_indices = torch.empty(
+                    kv_indptr[-1], dtype=torch.int64, device=self.device
+                )
+                create_flashinfer_kv_indices_triton[(bs,)](
+                    self.req_to_token,
+                    forward_batch.req_pool_indices,
+                    forward_batch.seq_lens,
+                    kv_indptr,
+                    None,
+                    kv_indices,
+                    self.req_to_token.stride(0),
+                )
+
+                custom_mask = spec_info.custom_mask
+                seq_mask_len = draft_num * (forward_batch.seq_lens + draft_num)
+                mask_indptr = self.mask_indptr
+                mask_indptr[1 : bs + 1] = torch.cumsum(seq_mask_len[:bs], dim=0)
+                mask_indptr = mask_indptr[: bs + 1]
+
+                self.forward_metadata = ForwardMetadata(
+                    kv_indptr,
+                    kv_indices,
+                    qo_indptr,
+                    None,
+                    draft_num,
+                    None,
+                    custom_mask=custom_mask,
+                    mask_indptr=mask_indptr,
+                    max_extend_len=draft_num,
+                )
+        else:
+            prefix_lens = forward_batch.extend_prefix_lens
+
+            if self.is_multimodal:
+                extend_no_prefix = False
+            else:
+                extend_no_prefix = not any(forward_batch.extend_prefix_lens_cpu)
+            if self.use_mla:
+                self.mla_indices_updater_prefill.update(
+                    forward_batch.req_pool_indices,
+                    forward_batch.seq_lens,
+                    forward_batch.seq_lens_sum,
+                    forward_batch.extend_seq_lens,
+                    forward_batch.extend_seq_lens.max().item(),
+                    forward_batch.seq_lens.max().item(),
+                    spec_info=None,
+                )
+
+                max_q_len = self.mla_indices_updater_prefill.max_q_len
+                qo_indptr = self.mla_indices_updater_prefill.qo_indptr
+
+                work_metadata = None
+                work_indptr = None
+                work_info_set = None
+                reduce_indptr = None
+                reduce_final_map = None
+                reduce_partial_map = None
+                fp8_prefill_kv_indices = None
+
+                if _use_fp8_prefill_attn:
+                    tile_q = 256
+                    qlen_granularity = tile_q // (self.num_head // self.num_kv_head)
+                    (
+                        work_metadata,
+                        work_indptr,
+                        work_info_set,
+                        reduce_indptr,
+                        reduce_final_map,
+                        reduce_partial_map,
+                    ) = self.make_mla_prefill_ps_meta_data_buffer(
+                        bs, max_q_len, qlen_granularity
+                    )
+
+                    self.make_mla_prefill_ps_meta_data(
+                        qo_indptr,
+                        qo_indptr,
+                        forward_batch.seq_lens,
+                        work_metadata,
+                        work_indptr,
+                        work_info_set,
+                        reduce_indptr,
+                        reduce_final_map,
+                        reduce_partial_map,
+                        is_causal=True,
+                    )
+
+                    total_s = int(forward_batch.extend_seq_lens.sum())
+                    fp8_prefill_kv_indices = torch.arange(
+                        total_s, device=self.device, dtype=torch.int32
+                    )
+
+                self.forward_metadata = ForwardMetadata(
+                    self.mla_indices_updater_prefill.kv_indptr,
+                    self.mla_indices_updater_prefill.kv_indices,
+                    qo_indptr,
+                    self.kv_last_page_len[:bs],
+                    max_q_len,
+                    self.mla_indices_updater_prefill.max_kv_len,
+                    work_metadata=work_metadata,
+                    work_info_set=work_info_set,
+                    work_indptr=work_indptr,
+                    reduce_indptr=reduce_indptr,
+                    reduce_final_map=reduce_final_map,
+                    reduce_partial_map=reduce_partial_map,
+                    fp8_prefill_kv_indices=fp8_prefill_kv_indices,
+                )
+            else:
+                self.indices_updater_prefill.update(
+                    forward_batch.req_pool_indices,
+                    forward_batch.seq_lens,
+                    forward_batch.seq_lens_sum,
+                    prefix_lens,
+                    encoder_lens=forward_batch.encoder_lens,
+                    spec_info=None,
+                )
+                self.forward_metadata = ForwardMetadata(
+                    self.indices_updater_prefill.kv_indptr,
+                    self.indices_updater_prefill.kv_indices,
+                    None,
+                    None,
+                    self.indices_updater_prefill.max_q_len,
+                    self.indices_updater_prefill.max_kv_len,
+                )
+
+    def init_cuda_graph_state(
+        self,
+        max_bs: int,
+        max_num_tokens: int,
+        kv_indices_buf: Optional[torch.Tensor] = None,
+    ):
+        self.cuda_graph_kv_last_page_len = torch.ones(max_bs, dtype=torch.int)
+        if kv_indices_buf is None:
+            self.cuda_graph_kv_indices = torch.zeros(
+                (max_bs * self.max_context_len),
+                dtype=torch.int32,
+                device=self.device,
+            )
+        else:
+            self.cuda_graph_kv_indices = kv_indices_buf
+
+        if not self.skip_prefill:
+            self.cuda_graph_custom_mask = torch.zeros(
+                (max_num_tokens * self.max_context_len),
+                dtype=torch.uint8,
+                device=self.device,
+            )
+
+        # if self.use_mla and (_use_mla_ps_kernel or self.kv_cache_dtype == fp8_dtype):
+        if self.use_mla and _use_mla_ps_kernel:
+            # for persistent mla_decode_fwd
+            max_seqlen_qo = (
+                1 if self.num_draft_tokens is None else self.num_draft_tokens
+            )
+
+            (
+                self.work_metadata,
+                self.work_indptr,
+                self.work_info_set,
+                self.reduce_indptr,
+                self.reduce_final_map,
+                self.reduce_partial_map,
+            ) = self.make_mla_decode_meta_data_buffer(max_seqlen_qo, max_bs)
+
+        else:
+            self.work_metadata = None
+            self.work_indptr = None
+            self.work_info_set = None
+
+            self.reduce_indptr = None
+            self.reduce_final_map = None
+            self.reduce_partial_map = None
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInput],
+    ):
+
+        num_kv_splits = None
+        # num_kv_splits_indptr = None
+
+        work_metadata = None
+        work_info_set = None
+        work_indptr = None
+
+        reduce_indptr = None
+        reduce_final_map = None
+        reduce_partial_map = None
+
+        if forward_mode.is_decode_or_idle():
+            qo_indptr = None
+            kv_last_page_len = None
+            max_q_len = None
+
+            if spec_info is None:
+                kv_indptr = self.kv_indptr
+                kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens, dim=0)
+                kv_indptr = kv_indptr[: bs + 1]
+                kv_indices = self.cuda_graph_kv_indices
+                create_flashinfer_kv_indices_triton[(bs,)](
+                    self.req_to_token,
+                    req_pool_indices,
+                    seq_lens,
+                    kv_indptr,
+                    None,
+                    kv_indices,
+                    self.req_to_token.stride(0),
+                )
+            else:
+                kv_indptr, kv_indices = spec_info.kv_indptr, spec_info.kv_indices
+
+            if self.use_mla:
+                qo_indptr = self.qo_indptr_[: bs + 1]
+                qo_indptr[1 : bs + 1] = torch.cumsum(
+                    self.cuda_graph_kv_last_page_len[:bs], dim=0
+                )
+                kv_last_page_len = self.cuda_graph_kv_last_page_len[:bs]
+                max_q_len = 1
+
+                if _use_mla_ps_kernel:
+                    num_kv_splits = self.max_split_per_batch
+
+                    self.make_mla_meta_data(
+                        qo_indptr,
+                        kv_indptr,
+                        kv_last_page_len,
+                        self.work_metadata,
+                        self.work_info_set,
+                        self.work_indptr,
+                        self.reduce_indptr,
+                        self.reduce_final_map,
+                        self.reduce_partial_map,
+                        max_q_len,
+                        fast_mode=fast_mode,
+                        max_split_per_batch=num_kv_splits,
+                        intra_batch_mode=intra_batch_mode,
+                    )
+
+                    work_metadata = self.work_metadata
+                    work_info_set = self.work_info_set
+                    work_indptr = self.work_indptr
+
+                    reduce_indptr = self.reduce_indptr
+                    reduce_final_map = self.reduce_final_map
+                    reduce_partial_map = self.reduce_partial_map
+
+            self.forward_metadata = ForwardMetadata(
+                kv_indptr,
+                kv_indices,
+                qo_indptr,
+                kv_last_page_len,
+                max_q_len,
+                kv_indptr[-1].item(),
+                work_metadata=work_metadata,
+                work_info_set=work_info_set,
+                work_indptr=work_indptr,
+                reduce_indptr=reduce_indptr,
+                reduce_final_map=reduce_final_map,
+                reduce_partial_map=reduce_partial_map,
+                num_kv_splits=num_kv_splits,
+                # num_kv_splits_indptr=num_kv_splits_indptr,
+            )
+
+        elif forward_mode.is_target_verify():
+            qo_indptr = self.qo_indptr[: bs + 1]
+            qo_indptr[: bs + 1] = torch.arange(
+                0,
+                (1 + bs) * self.num_draft_tokens,
+                step=self.num_draft_tokens,
+                dtype=torch.int32,
+                device=self.device,
+            )
+            if self.use_mla:
+                kv_lens = seq_lens + self.num_draft_tokens
+            else:
+                kv_lens = seq_lens
+            kv_indptr = self.kv_indptr[: bs + 1]
+            kv_indptr[1 : bs + 1] = torch.cumsum(kv_lens, dim=0)
+            kv_indices = self.cuda_graph_kv_indices
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                kv_lens,
+                kv_indptr,
+                None,
+                kv_indices,
+                self.req_to_token.stride(0),
+            )
+            kv_last_page_len = self.cuda_graph_kv_last_page_len[:bs]
+            max_q_len = self.num_draft_tokens
+
+            if self.use_mla:
+                if _use_mla_ps_kernel:
+
+                    num_kv_splits = self.max_split_per_batch
+
+                    self.make_mla_meta_data(
+                        qo_indptr,
+                        kv_indptr,
+                        kv_last_page_len,
+                        self.work_metadata,
+                        self.work_info_set,
+                        self.work_indptr,
+                        self.reduce_indptr,
+                        self.reduce_final_map,
+                        self.reduce_partial_map,
+                        max_q_len,
+                        fast_mode=fast_mode,
+                        max_split_per_batch=num_kv_splits,
+                        intra_batch_mode=intra_batch_mode,
+                    )
+
+                    work_metadata = self.work_metadata
+                    work_info_set = self.work_info_set
+                    work_indptr = self.work_indptr
+
+                    reduce_indptr = self.reduce_indptr
+                    reduce_final_map = self.reduce_final_map
+                    reduce_partial_map = self.reduce_partial_map
+
+                self.forward_metadata = ForwardMetadata(
+                    kv_indptr,
+                    kv_indices,
+                    qo_indptr,
+                    kv_last_page_len,
+                    max_q_len,
+                    kv_indptr[-1].item(),
+                    work_metadata=work_metadata,
+                    work_info_set=work_info_set,
+                    work_indptr=work_indptr,
+                    reduce_indptr=reduce_indptr,
+                    reduce_final_map=reduce_final_map,
+                    reduce_partial_map=reduce_partial_map,
+                    num_kv_splits=num_kv_splits,
+                )
+            else:
+                custom_mask = self.cuda_graph_custom_mask
+                custom_mask[: spec_info.custom_mask.shape[0]] = spec_info.custom_mask
+                seq_mask_len = max_q_len * (seq_lens + max_q_len)
+                mask_indptr = self.mask_indptr
+                mask_indptr[1 : bs + 1] = torch.cumsum(seq_mask_len[:bs], dim=0)
+                mask_indptr = mask_indptr[: bs + 1]
+
+                self.forward_metadata = ForwardMetadata(
+                    kv_indptr,
+                    kv_indices,
+                    qo_indptr,
+                    kv_last_page_len,
+                    max_q_len,
+                    kv_indptr[-1].item(),
+                    custom_mask=custom_mask,
+                    mask_indptr=mask_indptr,
+                    max_extend_len=max_q_len,
+                )
+        elif forward_mode.is_draft_extend():
+            num_tokens_per_bs = self.speculative_num_steps + 1
+            qo_indptr = self.qo_indptr[: bs + 1]
+            qo_indptr[: bs + 1] = torch.arange(
+                0,
+                bs * num_tokens_per_bs + 1,
+                step=num_tokens_per_bs,
+                dtype=torch.int32,
+                device=self.device,
+            )
+            kv_indptr = self.kv_indptr[: bs + 1]
+            kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens, dim=0)
+            kv_indices = self.cuda_graph_kv_indices
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                seq_lens,
+                kv_indptr,
+                None,
+                kv_indices,
+                self.req_to_token.stride(0),
+            )
+
+            if self.use_mla:
+                kv_last_page_len = self.cuda_graph_kv_last_page_len[:bs]
+                max_q_len = num_tokens_per_bs
+
+                if _use_mla_ps_kernel:
+
+                    num_kv_splits = self.max_split_per_batch
+
+                    self.make_mla_meta_data(
+                        qo_indptr,
+                        kv_indptr,
+                        kv_last_page_len,
+                        self.work_metadata,
+                        self.work_info_set,
+                        self.work_indptr,
+                        self.reduce_indptr,
+                        self.reduce_final_map,
+                        self.reduce_partial_map,
+                        max_q_len,
+                        fast_mode=fast_mode,
+                        max_split_per_batch=num_kv_splits,
+                        intra_batch_mode=intra_batch_mode,
+                    )
+
+                    work_metadata = self.work_metadata
+                    work_info_set = self.work_info_set
+                    work_indptr = self.work_indptr
+
+                    reduce_indptr = self.reduce_indptr
+                    reduce_final_map = self.reduce_final_map
+                    reduce_partial_map = self.reduce_partial_map
+
+                self.forward_metadata = ForwardMetadata(
+                    kv_indptr,
+                    kv_indices,
+                    qo_indptr,
+                    kv_last_page_len,
+                    max_q_len,
+                    kv_indptr[-1].item(),
+                    work_metadata=work_metadata,
+                    work_info_set=work_info_set,
+                    work_indptr=work_indptr,
+                    reduce_indptr=reduce_indptr,
+                    reduce_final_map=reduce_final_map,
+                    reduce_partial_map=reduce_partial_map,
+                    num_kv_splits=num_kv_splits,
+                )
+            else:
+                # Non-MLA draft_extend cuda graph: use triton extend kernel
+                self.forward_metadata = ForwardMetadata(
+                    kv_indptr,
+                    kv_indices,
+                    qo_indptr,
+                    None,
+                    num_tokens_per_bs,
+                    None,
+                    custom_mask=None,
+                    mask_indptr=None,
+                    max_extend_len=num_tokens_per_bs,
+                )
+        else:
+            raise ValueError(f"Invalid mode: {forward_mode=}")
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInput],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+
+        num_kv_splits = None
+        # num_kv_splits_indptr = None
+
+        work_metadata = None
+        work_info_set = None
+        work_indptr = None
+
+        reduce_indptr = None
+        reduce_final_map = None
+        reduce_partial_map = None
+
+        if forward_mode.is_decode_or_idle():
+            qo_indptr = None
+            kv_last_page_len = None
+            max_q_len = None
+
+            if spec_info is None:
+                kv_indptr = self.kv_indptr
+                kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens, dim=0)
+                kv_indptr = kv_indptr[: bs + 1]
+                kv_indices = self.cuda_graph_kv_indices
+                create_flashinfer_kv_indices_triton[(bs,)](
+                    self.req_to_token,
+                    req_pool_indices,
+                    seq_lens,
+                    kv_indptr,
+                    None,
+                    kv_indices,
+                    self.req_to_token.stride(0),
+                )
+            else:
+                kv_indptr, kv_indices = spec_info.kv_indptr, spec_info.kv_indices
+
+            if self.use_mla:
+                qo_indptr = self.qo_indptr_[: bs + 1]
+                qo_indptr[1 : bs + 1] = torch.cumsum(
+                    self.cuda_graph_kv_last_page_len[:bs], dim=0
+                )
+                kv_last_page_len = self.cuda_graph_kv_last_page_len[:bs]
+                max_q_len = 1
+
+                if _use_mla_ps_kernel:
+                    num_kv_splits = self.max_split_per_batch
+
+                    self.make_mla_meta_data(
+                        qo_indptr,
+                        kv_indptr,
+                        kv_last_page_len,
+                        self.work_metadata,
+                        self.work_info_set,
+                        self.work_indptr,
+                        self.reduce_indptr,
+                        self.reduce_final_map,
+                        self.reduce_partial_map,
+                        max_q_len,
+                        fast_mode=fast_mode,
+                        max_split_per_batch=num_kv_splits,
+                        intra_batch_mode=intra_batch_mode,
+                    )
+
+                    work_metadata = self.work_metadata
+                    work_info_set = self.work_info_set
+                    work_indptr = self.work_indptr
+
+                    reduce_indptr = self.reduce_indptr
+                    reduce_final_map = self.reduce_final_map
+                    reduce_partial_map = self.reduce_partial_map
+
+                self.forward_metadata = ForwardMetadata(
+                    kv_indptr,
+                    kv_indices,
+                    qo_indptr,
+                    kv_last_page_len,
+                    max_q_len,
+                    kv_indptr[-1].item(),
+                    work_metadata=work_metadata,
+                    work_info_set=work_info_set,
+                    work_indptr=work_indptr,
+                    reduce_indptr=reduce_indptr,
+                    reduce_final_map=reduce_final_map,
+                    reduce_partial_map=reduce_partial_map,
+                    num_kv_splits=num_kv_splits,
+                    # num_kv_splits_indptr=num_kv_splits_indptr,
+                )
+
+        elif forward_mode.is_target_verify():
+            bs = len(req_pool_indices)
+            qo_indptr = self.qo_indptr[: bs + 1]
+            qo_indptr[: bs + 1] = torch.arange(
+                0,
+                (1 + bs) * self.num_draft_tokens,
+                step=self.num_draft_tokens,
+                dtype=torch.int32,
+                device=self.device,
+            )
+            if self.use_mla:
+                kv_lens = seq_lens + self.num_draft_tokens
+            else:
+                kv_lens = seq_lens
+            kv_indptr = self.kv_indptr[: bs + 1]
+            kv_indptr[1 : bs + 1] = torch.cumsum(kv_lens, dim=0)
+            kv_indices = self.cuda_graph_kv_indices
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                kv_lens,
+                kv_indptr,
+                None,
+                kv_indices,
+                self.req_to_token.stride(0),
+            )
+            kv_last_page_len = self.cuda_graph_kv_last_page_len[:bs]
+            max_q_len = self.num_draft_tokens
+
+            if self.use_mla:
+                if _use_mla_ps_kernel:
+
+                    num_kv_splits = self.max_split_per_batch
+
+                    self.make_mla_meta_data(
+                        qo_indptr,
+                        kv_indptr,
+                        kv_last_page_len,
+                        self.work_metadata,
+                        self.work_info_set,
+                        self.work_indptr,
+                        self.reduce_indptr,
+                        self.reduce_final_map,
+                        self.reduce_partial_map,
+                        max_q_len,
+                        fast_mode=fast_mode,
+                        max_split_per_batch=num_kv_splits,
+                        intra_batch_mode=intra_batch_mode,
+                    )
+
+                    work_metadata = self.work_metadata
+                    work_info_set = self.work_info_set
+                    work_indptr = self.work_indptr
+
+                    reduce_indptr = self.reduce_indptr
+                    reduce_final_map = self.reduce_final_map
+                    reduce_partial_map = self.reduce_partial_map
+
+                self.forward_metadata = ForwardMetadata(
+                    kv_indptr,
+                    kv_indices,
+                    qo_indptr,
+                    kv_last_page_len,
+                    max_q_len,
+                    kv_indptr[-1].item(),
+                    work_metadata=work_metadata,
+                    work_info_set=work_info_set,
+                    work_indptr=work_indptr,
+                    reduce_indptr=reduce_indptr,
+                    reduce_final_map=reduce_final_map,
+                    reduce_partial_map=reduce_partial_map,
+                    num_kv_splits=num_kv_splits,
+                )
+            else:
+                custom_mask = self.cuda_graph_custom_mask
+                custom_mask[: spec_info.custom_mask.shape[0]] = spec_info.custom_mask
+                seq_mask_len = max_q_len * (seq_lens + max_q_len)
+                mask_indptr = self.mask_indptr[: bs + 1]
+                mask_indptr[1 : bs + 1] = torch.cumsum(seq_mask_len, dim=0)
+
+                self.forward_metadata = ForwardMetadata(
+                    kv_indptr,
+                    kv_indices,
+                    qo_indptr,
+                    kv_last_page_len,
+                    max_q_len,
+                    kv_indptr[-1].item(),
+                    custom_mask=custom_mask,
+                    mask_indptr=mask_indptr,
+                    max_extend_len=max_q_len,
+                )
+
+        elif forward_mode.is_draft_extend():
+            num_tokens_per_bs = self.speculative_num_steps + 1
+            seq_lens = seq_lens[:bs]
+            accept_lens = spec_info.accept_length[:bs]
+            qo_indptr = self.qo_indptr[: bs + 1]
+            qo_indptr[1 : bs + 1] = torch.cumsum(accept_lens, dim=0)
+            kv_indptr = self.kv_indptr[: bs + 1]
+            kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens, dim=0)
+            kv_indices = self.cuda_graph_kv_indices
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                seq_lens,
+                kv_indptr,
+                None,
+                kv_indices,
+                self.req_to_token.stride(0),
+            )
+
+            kv_last_page_len = self.cuda_graph_kv_last_page_len[:bs]
+            max_q_len = num_tokens_per_bs
+
+            if self.use_mla and _use_mla_ps_kernel:
+
+                num_kv_splits = self.max_split_per_batch
+
+                self.make_mla_meta_data(
+                    qo_indptr,
+                    kv_indptr,
+                    kv_last_page_len,
+                    self.work_metadata,
+                    self.work_info_set,
+                    self.work_indptr,
+                    self.reduce_indptr,
+                    self.reduce_final_map,
+                    self.reduce_partial_map,
+                    max_q_len,
+                    fast_mode=fast_mode,
+                    max_split_per_batch=num_kv_splits,
+                    intra_batch_mode=intra_batch_mode,
+                )
+
+                work_metadata = self.work_metadata
+                work_info_set = self.work_info_set
+                work_indptr = self.work_indptr
+
+                reduce_indptr = self.reduce_indptr
+                reduce_final_map = self.reduce_final_map
+                reduce_partial_map = self.reduce_partial_map
+
+            self.forward_metadata = ForwardMetadata(
+                kv_indptr,
+                kv_indices,
+                qo_indptr,
+                kv_last_page_len,
+                max_q_len,
+                kv_indptr[-1].item(),
+                work_metadata=work_metadata,
+                work_info_set=work_info_set,
+                work_indptr=work_indptr,
+                reduce_indptr=reduce_indptr,
+                reduce_final_map=reduce_final_map,
+                reduce_partial_map=reduce_partial_map,
+                num_kv_splits=num_kv_splits,
+            )
+
+        else:
+            raise ValueError("Invalid forward mode")
+
+    def get_cuda_graph_seq_len_fill_value(self):
+        return 1
+
+    def forward_extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+        sinks=None,
+    ):
+        cache_loc = (
+            forward_batch.out_cache_loc
+            if not layer.is_cross_attention
+            else forward_batch.encoder_out_cache_loc
+        )
+
+        self.logits_soft_cap = layer.logit_cap
+
+        if k is not None:
+            assert v is not None
+            if save_kv_cache:
+                if self.use_mla:
+                    forward_batch.token_to_kv_pool.set_kv_buffer(layer, cache_loc, k, v)
+                else:
+                    forward_batch.token_to_kv_pool.set_kv_buffer(
+                        layer, cache_loc, k, v, layer.k_scale, layer.v_scale
+                    )
+
+        if self.use_mla:
+            max_q_len = self.forward_metadata.max_q_len
+            max_kv_len = self.forward_metadata.max_kv_len
+            kv_indptr = self.forward_metadata.kv_indptr
+            kv_indices = self.forward_metadata.kv_indices
+            qo_indptr = self.forward_metadata.qo_indptr
+            K_Buffer = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
+            V_Buffer = forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id)
+            kv_lora_rank = V_Buffer.shape[-1]
+            qk_rope_head_dim = K_Buffer.shape[-1] - kv_lora_rank
+            qk_nope_head_dim = k.shape[-1] - qk_rope_head_dim
+            assert len(q.shape) == 3
+            assert len(k.shape) == 3
+            assert len(v.shape) == 3
+
+            if (
+                forward_batch.forward_mode.is_extend()
+                and not forward_batch.forward_mode.is_target_verify()
+                and not forward_batch.forward_mode.is_draft_extend()
+            ):
+                extend_no_prefix = not any(forward_batch.extend_prefix_lens_cpu)
+                if kv_indices.shape[0] == 0 or extend_no_prefix:
+                    if _use_fp8_prefill_attn:
+                        total_s = q.shape[0]
+                        nhead = layer.tp_q_head_num
+                        v_head_dim = layer.v_head_dim
+
+                        # q is cast here (after RoPE).
+                        # k/v are already FP8 for MXFP4 main model (fused kv_b_proj),
+                        # but need casting for FP8/BF16 weights (e.g. MTP draft model).
+                        if q.dtype != fp8_dtype:
+                            q = q.to(fp8_dtype)
+                        if k.dtype != fp8_dtype:
+                            k = k.to(fp8_dtype)
+                        if v.dtype != fp8_dtype:
+                            v = v.to(fp8_dtype)
+                        one_scale = torch.ones((), dtype=torch.float32, device=q.device)
+
+                        kv_indptr_asm = qo_indptr
+                        kv_indices_asm = self.forward_metadata.fp8_prefill_kv_indices
+
+                        tile_q = 256
+                        reduce_indptr = self.forward_metadata.reduce_indptr
+                        reduce_final_map = self.forward_metadata.reduce_final_map
+                        reduce_partial_map = self.forward_metadata.reduce_partial_map
+
+                        logits = torch.empty(
+                            (reduce_partial_map.size(0) * tile_q, nhead, v_head_dim),
+                            dtype=torch.float32,
+                            device=q.device,
+                        )
+                        attn_lse = torch.empty(
+                            (reduce_partial_map.size(0) * tile_q, nhead),
+                            dtype=torch.float32,
+                            device=q.device,
+                        )
+                        final_lse = torch.empty(
+                            (total_s, nhead),
+                            dtype=torch.float32,
+                            device=q.device,
+                        )
+                        output = q.new_empty(
+                            (total_s, nhead, v_head_dim),
+                            dtype=self.input_dtype,
+                        )
+
+                        mla_prefill_ps_asm_fwd(
+                            q,
+                            k,
+                            v,
+                            qo_indptr,
+                            kv_indptr_asm,
+                            kv_indices_asm,
+                            self.forward_metadata.work_indptr,
+                            self.forward_metadata.work_info_set,
+                            max_q_len,
+                            layer.scaling,
+                            True,
+                            logits,
+                            attn_lse,
+                            output,
+                            one_scale,
+                            one_scale,
+                            one_scale,
+                        )
+                        mla_reduce_v1(
+                            logits,
+                            attn_lse,
+                            reduce_indptr,
+                            reduce_final_map,
+                            reduce_partial_map,
+                            tile_q,
+                            output,
+                            final_lse,
+                        )
+                    else:
+                        output = flash_attn_varlen_func(
+                            q,
+                            k,
+                            v,
+                            qo_indptr,
+                            qo_indptr,
+                            max_q_len,
+                            max_q_len,
+                            softmax_scale=layer.scaling,
+                            causal=True,
+                        )
+                    return output
+                elif layer.qk_head_dim != (kv_lora_rank + qk_rope_head_dim):
+                    K_Buffer = torch.index_select(K_Buffer, 0, kv_indices)
+                    kvc, k_pe = torch.split(
+                        K_Buffer, [kv_lora_rank, qk_rope_head_dim], dim=-1
+                    )
+
+                    if self.kv_cache_dtype == fp8_dtype:
+                        dtype = q.dtype
+
+                        kvc = kvc.to(dtype)
+                        k_pe = k_pe.to(dtype)
+
+                    kvprefix = layer.kv_b_proj(kvc.contiguous())[0]
+
+                    kvprefix = kvprefix.view(
+                        -1, layer.tp_k_head_num, qk_nope_head_dim + layer.v_head_dim
+                    )
+                    k_prefix, v_prefix = torch.split(
+                        kvprefix, [qk_nope_head_dim, layer.v_head_dim], dim=-1
+                    )
+                    k_prefix = torch.cat(
+                        [
+                            k_prefix,
+                            torch.broadcast_to(
+                                k_pe,
+                                (k_pe.shape[0], layer.tp_k_head_num, k_pe.shape[2]),
+                            ),
+                        ],
+                        dim=-1,
+                    )
+                    assert (
+                        forward_batch.extend_prefix_lens.shape
+                        == forward_batch.extend_seq_lens.shape
+                    )
+
+                    k = k_prefix
+                    v = v_prefix
+
+                    o = flash_attn_varlen_func(
+                        q,
+                        k,
+                        v,
+                        qo_indptr,
+                        kv_indptr,
+                        max_q_len,
+                        max_kv_len,
+                        softmax_scale=layer.scaling,
+                        causal=True,
+                    )
+                    return o
+
+                else:
+                    if layer.qk_head_dim != layer.v_head_dim:
+                        o = q.new_empty(
+                            (q.shape[0], layer.tp_q_head_num * layer.v_head_dim)
+                        )
+                    else:
+                        o = torch.empty_like(q)
+
+                    mla_prefill_fwd(
+                        q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
+                        K_Buffer.view(-1, 1, 1, layer.qk_head_dim),
+                        o.view(-1, layer.tp_q_head_num, layer.v_head_dim),
+                        qo_indptr,
+                        kv_indptr,
+                        kv_indices,
+                        self.forward_metadata.kv_last_page_len,
+                        self.forward_metadata.max_q_len,
+                        layer.scaling,
+                        layer.logit_cap,
+                    )
+                    K_Buffer = K_Buffer.view(-1, layer.tp_k_head_num, layer.qk_head_dim)
+                    return o
+            elif forward_batch.forward_mode.is_target_verify():
+                o = q.new_empty(
+                    (q.shape[0], layer.tp_q_head_num, layer.v_head_dim),
+                    dtype=self.input_dtype,
+                )
+
+                work_metadata = self.forward_metadata.work_metadata
+                work_indptr = self.forward_metadata.work_indptr
+                work_info_set = self.forward_metadata.work_info_set
+
+                reduce_indptr = self.forward_metadata.reduce_indptr
+                reduce_final_map = self.forward_metadata.reduce_final_map
+                reduce_partial_map = self.forward_metadata.reduce_partial_map
+
+                num_kv_splits = self.forward_metadata.num_kv_splits
+
+                mla_decode_fwd(
+                    q,
+                    K_Buffer.view(-1, 1, 1, layer.qk_head_dim),
+                    o,
+                    self.forward_metadata.qo_indptr,
+                    self.forward_metadata.kv_indptr,
+                    self.forward_metadata.kv_indices,
+                    self.forward_metadata.kv_last_page_len,
+                    self.forward_metadata.max_q_len,
+                    sm_scale=layer.scaling,
+                    logit_cap=layer.logit_cap,
+                    work_meta_data=work_metadata,
+                    work_indptr=work_indptr,
+                    work_info_set=work_info_set,
+                    reduce_indptr=reduce_indptr,
+                    reduce_final_map=reduce_final_map,
+                    reduce_partial_map=reduce_partial_map,
+                    q_scale=layer.k_scale,
+                    kv_scale=layer.k_scale,
+                    intra_batch_mode=intra_batch_mode,
+                    num_kv_splits=num_kv_splits,
+                )
+                return o
+            elif forward_batch.forward_mode.is_draft_extend():
+
+                work_metadata = self.forward_metadata.work_metadata
+                work_indptr = self.forward_metadata.work_indptr
+                work_info_set = self.forward_metadata.work_info_set
+
+                reduce_indptr = self.forward_metadata.reduce_indptr
+                reduce_final_map = self.forward_metadata.reduce_final_map
+                reduce_partial_map = self.forward_metadata.reduce_partial_map
+
+                num_kv_splits = self.forward_metadata.num_kv_splits
+
+                if self.forward_metadata.run_graph is not True:
+
+                    bs, q_pad, q_mask = pad_sequence_with_mask(
+                        q.view(q.shape[0], -1),
+                        qo_indptr[:-1],
+                        forward_batch.extend_seq_lens,
+                        self.forward_metadata.max_q_len,
+                    )
+                    o = q.new_empty(
+                        (
+                            bs * self.forward_metadata.max_q_len,
+                            layer.tp_q_head_num,
+                            layer.v_head_dim,
+                        ),
+                        dtype=self.input_dtype,
+                    )
+                    mla_decode_fwd(
+                        q_pad.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
+                        K_Buffer.view(-1, 1, 1, layer.qk_head_dim),
+                        o,
+                        self.forward_metadata.qo_indptr,
+                        self.forward_metadata.kv_indptr,
+                        self.forward_metadata.kv_indices,
+                        self.forward_metadata.kv_last_page_len,
+                        self.forward_metadata.max_q_len,
+                        sm_scale=layer.scaling,
+                        logit_cap=layer.logit_cap,
+                        work_meta_data=work_metadata,
+                        work_indptr=work_indptr,
+                        work_info_set=work_info_set,
+                        reduce_indptr=reduce_indptr,
+                        reduce_final_map=reduce_final_map,
+                        reduce_partial_map=reduce_partial_map,
+                        q_scale=layer.k_scale,
+                        kv_scale=layer.k_scale,
+                        intra_batch_mode=intra_batch_mode,
+                        num_kv_splits=num_kv_splits,
+                    )
+
+                    total_valid_q = int(qo_indptr[-1].item())
+                    return o[:total_valid_q]
+                else:
+                    o = q.new_empty(
+                        (q.shape[0], layer.tp_q_head_num, layer.v_head_dim),
+                        dtype=self.input_dtype,
+                    )
+
+                    mla_decode_fwd(
+                        q,
+                        K_Buffer.view(-1, 1, 1, layer.qk_head_dim),
+                        o,
+                        self.forward_metadata.qo_indptr,
+                        self.forward_metadata.kv_indptr,
+                        self.forward_metadata.kv_indices,
+                        self.forward_metadata.kv_last_page_len,
+                        self.forward_metadata.max_q_len,
+                        sm_scale=layer.scaling,
+                        logit_cap=layer.logit_cap,
+                        work_meta_data=work_metadata,
+                        work_indptr=work_indptr,
+                        work_info_set=work_info_set,
+                        reduce_indptr=reduce_indptr,
+                        reduce_final_map=reduce_final_map,
+                        reduce_partial_map=reduce_partial_map,
+                        q_scale=layer.k_scale,
+                        kv_scale=layer.k_scale,
+                        intra_batch_mode=intra_batch_mode,
+                        num_kv_splits=num_kv_splits,
+                    )
+                    return o
+            else:
+                raise ValueError(
+                    f"Invalid forward mode for MLA prefill: {forward_batch.forward_mode=}"
+                )
+        else:
+            if (
+                forward_batch.forward_mode.is_target_verify()
+                or forward_batch.forward_mode.is_draft_extend()
+            ):
+                # Use triton extend kernel which supports custom masks and causal masking
+                if layer.qk_head_dim != layer.v_head_dim:
+                    o = q.new_empty(
+                        (q.shape[0], layer.tp_q_head_num * layer.v_head_dim)
+                    )
+                else:
+                    o = torch.empty_like(q)
+
+                self.extend_attention_fwd(
+                    q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
+                    k.contiguous(),
+                    v.contiguous(),
+                    o.view(-1, layer.tp_q_head_num, layer.v_head_dim),
+                    forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id),
+                    forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id),
+                    self.forward_metadata.qo_indptr,
+                    self.forward_metadata.kv_indptr,
+                    self.forward_metadata.kv_indices,
+                    self.forward_metadata.custom_mask,
+                    True,  # causal
+                    self.forward_metadata.mask_indptr,
+                    self.forward_metadata.max_extend_len,
+                    1.0,  # k_scale
+                    1.0,  # v_scale
+                    layer.scaling,
+                    logit_cap=layer.logit_cap,
+                )
+                return o.view(-1, layer.tp_q_head_num * layer.v_head_dim)
+
+            k_cache, v_cache = forward_batch.token_to_kv_pool.get_kv_buffer(
+                layer.layer_id
+            )
+
+            bs0 = forward_batch.batch_size + 1
+
+            # TODO kkhuang-amd need to remove it when mha_batch_prefill_func support fp8-kv
+            if self.kv_cache_dtype == fp8_dtype:
+                dtype = q.dtype
+                k_cache = k_cache.to(dtype)
+                v_cache = v_cache.to(dtype)
+
+            window_size = (-1, -1)
+            if layer.sliding_window_size is not None and layer.sliding_window_size > -1:
+                window_size = (layer.sliding_window_size, -1)
+
+            o = mha_batch_prefill_func(
+                q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim),
+                k_cache,
+                v_cache,
+                self.qo_indptr[:bs0],
+                self.forward_metadata.kv_indptr[:bs0],
+                self.forward_metadata.kv_indices,
+                self.forward_metadata.max_q_len,
+                self.forward_metadata.max_kv_len,
+                causal=True,
+                logits_soft_cap=self.logits_soft_cap,
+                alibi_slopes=None,
+                return_lse=False,
+                return_attn_probs=False,
+                window_size=window_size,
+                sink_ptr=sinks,
+            )
+
+            return o.view(-1, layer.tp_q_head_num * layer.head_dim)
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+    ):
+
+        q = q.reshape(-1, layer.tp_q_head_num * layer.qk_head_dim)
+
+        if layer.qk_head_dim != layer.v_head_dim:
+            o = q.new_empty(
+                (q.shape[0], layer.tp_q_head_num * layer.v_head_dim),
+                dtype=self.input_dtype,
+            )
+        else:
+            o = torch.empty_like(q, dtype=self.input_dtype)
+
+        if save_kv_cache:
+
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                layer, forward_batch.out_cache_loc, k, v
+            )
+
+        if self.use_mla:
+            k_buffer = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
+
+            work_metadata = self.forward_metadata.work_metadata
+            work_indptr = self.forward_metadata.work_indptr
+            work_info_set = self.forward_metadata.work_info_set
+
+            reduce_indptr = self.forward_metadata.reduce_indptr
+            reduce_final_map = self.forward_metadata.reduce_final_map
+            reduce_partial_map = self.forward_metadata.reduce_partial_map
+
+            num_kv_splits = self.forward_metadata.num_kv_splits
+
+            mla_decode_fwd(
+                q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
+                k_buffer.view(-1, 1, 1, layer.qk_head_dim),
+                o.view(-1, layer.tp_q_head_num, layer.v_head_dim),
+                self.forward_metadata.qo_indptr,
+                self.forward_metadata.kv_indptr,
+                self.forward_metadata.kv_indices,
+                self.forward_metadata.kv_last_page_len,
+                self.forward_metadata.max_q_len,
+                sm_scale=layer.scaling,
+                logit_cap=layer.logit_cap,
+                work_meta_data=work_metadata,
+                work_indptr=work_indptr,
+                work_info_set=work_info_set,
+                reduce_indptr=reduce_indptr,
+                reduce_final_map=reduce_final_map,
+                reduce_partial_map=reduce_partial_map,
+                q_scale=layer.k_scale,
+                kv_scale=layer.k_scale,
+                intra_batch_mode=intra_batch_mode,
+                num_kv_splits=num_kv_splits,
+            )
+        else:
+            self.logits_soft_cap = layer.logit_cap
+
+            k_cache, v_cache = forward_batch.token_to_kv_pool.get_kv_buffer(
+                layer.layer_id
+            )
+
+            # TODO kkhuang-amd need to remove it when paged_attention_ragged support fp8-kv
+            if self.kv_cache_dtype == fp8_dtype:
+                dtype = q.dtype
+
+                k_cache = k_cache.to(dtype)
+                v_cache = v_cache.to(dtype)
+
+            paged_attention_ragged(
+                o.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
+                self.workspace_buffer,
+                q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
+                k_cache.view(-1, 1, layer.tp_k_head_num, layer.qk_head_dim),
+                v_cache.view(-1, 1, layer.tp_v_head_num, layer.v_head_dim),
+                self.scale,
+                self.forward_metadata.kv_indptr,
+                self.forward_metadata.kv_indices,
+                self.kv_last_page_len,
+                1,
+                self.max_num_partitions,
+                None,
+                "auto",
+                "NHD",
+                self.logits_soft_cap,
+                self.k_scale,
+                self.v_scale,
+                None,
+                _AITER_PARTITION_SIZE_ROCM,
+            )
+
+        return o
+
+
+class AiterIndicesUpdaterPrefill:
+    def __init__(self, model_runner: ModelRunner, attn_backend: AttentionBackend):
+        # Parse Constants
+        self.num_qo_heads = (
+            model_runner.model_config.num_attention_heads // get_attention_tp_size()
+        )
+        self.num_kv_heads = model_runner.model_config.get_num_kv_heads(
+            get_attention_tp_size()
+        )
+        self.head_dim = model_runner.model_config.head_dim
+        self.data_type = model_runner.kv_cache_dtype
+        self.q_data_type = model_runner.dtype
+        self.sliding_window_size = model_runner.sliding_window_size
+        self.attn_backend = attn_backend
+
+        # Buffers and wrappers
+        self.kv_indptr = attn_backend.kv_indptr
+        self.kv_last_page_len = attn_backend.kv_last_page_len
+        self.qo_indptr = attn_backend.qo_indptr
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+        self.update = self.update_single_wrapper
+
+        self.kv_indices = None
+        self.max_q_len = 0
+        self.max_kv_len = 0
+
+    def update(
+        self,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        prefix_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        spec_info: Optional[SpecInput],
+    ):
+        # Keep the signature for type checking. It will be assigned during runtime.
+        raise NotImplementedError()
+
+    def update_single_wrapper(
+        self,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        prefix_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        spec_info: Optional[SpecInput],
+    ):
+
+        kv_start_idx = None
+        kv_indptr = self.kv_indptr
+        qo_indptr = self.qo_indptr
+        paged_kernel_lens = seq_lens
+        paged_kernel_lens_sum = seq_lens_sum
+
+        bs = len(req_pool_indices)
+        if spec_info is None:
+            # Normal extend
+            kv_indptr[1 : bs + 1] = torch.cumsum(paged_kernel_lens, dim=0)
+            kv_indptr = kv_indptr[: bs + 1]
+
+            # (TODO: Kk) WA - CI test_moe_eval_accuracy_large.py
+            # mha_batch_prefill reads 128 data to do computatoin
+            # if real data is not long enough then original padding value 0 is used
+            # but the 0 location will be made nan (noqa) in cuda graph capture mode
+            # this will cause the output tensor value becomes nan
+            # WA is to assure that last index of pool not changed
+            kv_indices = torch.empty(
+                paged_kernel_lens_sum + 256,
+                dtype=torch.int32,
+                device=req_pool_indices.device,
+            )
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                paged_kernel_lens,
+                kv_indptr,
+                kv_start_idx,
+                kv_indices,
+                self.req_to_token.shape[1],
+            )
+
+            token_num = kv_indptr[-1]
+            kv_indices[token_num:] = kv_indices[0]
+
+            self.max_kv_len = torch.max(paged_kernel_lens).item()
+
+            extend_lens = seq_lens - prefix_lens
+            self.max_q_len = torch.max(extend_lens).item()
+
+            qo_indptr[1 : bs + 1] = torch.cumsum(extend_lens, dim=0)
+            qo_indptr = qo_indptr[: bs + 1]
+            custom_mask = None
+        else:
+            kv_indices, kv_indptr, qo_indptr, custom_mask = (
+                spec_info.generate_attn_arg_prefill(
+                    req_pool_indices,
+                    paged_kernel_lens,
+                    paged_kernel_lens_sum,
+                    self.req_to_token,
+                )
+            )
+
+        self.kv_indices = kv_indices
+
+
+class AiterMlaIndicesUpdaterPrefill:
+    def __init__(self, model_runner: ModelRunner, attn_backend: AttentionBackend):
+        # Parse Constants
+        self.attn_backend = attn_backend
+
+        # Buffers and wrappers
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+        self.update = self.update_single_wrapper
+
+        self.kv_indptr = None
+        self.kv_indices = None
+        self.qo_indptr = None
+        self.kv_last_page_len = None
+        self.max_q_len = 0
+        self.max_kv_len = 0
+
+    def update(
+        self,
+        req_pool_indices: torch.Tensor,
+        kv_lens: torch.Tensor,
+        kv_lens_sum: int,
+        extend_lens: torch.Tensor,
+        max_q_len: int,
+        max_kv_len: int,
+        spec_info: Optional[SpecInput],
+    ):
+        # Keep the signature for type checking. It will be assigned during runtime.
+        raise NotImplementedError()
+
+    def update_single_wrapper(
+        self,
+        req_pool_indices: torch.Tensor,
+        kv_lens: torch.Tensor,
+        kv_lens_sum: int,
+        extend_lens: torch.Tensor,
+        max_q_len: int,
+        max_kv_len: int,
+        spec_info: Optional[SpecInput],
+    ):
+        bs = len(req_pool_indices)
+
+        kv_indptr = self.attn_backend.kv_indptr
+
+        if spec_info is None:
+            # Normal extend
+            kv_indptr[1 : bs + 1] = torch.cumsum(kv_lens, dim=0)
+            kv_indptr = kv_indptr[: bs + 1]
+            kv_indices = torch.empty(
+                kv_lens_sum,
+                dtype=torch.int32,
+                device=req_pool_indices.device,
+            )
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                kv_lens,
+                kv_indptr,
+                None,
+                kv_indices,
+                self.req_to_token.stride(0),
+            )
+
+            qo_indptr = self.attn_backend.qo_indptr
+            qo_indptr[1 : bs + 1] = torch.cumsum(extend_lens, dim=0)
+            qo_indptr = qo_indptr[: bs + 1]
+        else:
+            kv_indices, kv_indptr, qo_indptr, custom_mask = (
+                spec_info.generate_attn_arg_prefill(
+                    req_pool_indices,
+                    kv_lens,
+                    kv_lens_sum,
+                    self.req_to_token,
+                )
+            )
+
+        self.kv_indptr = kv_indptr
+        self.kv_indices = kv_indices
+        self.qo_indptr = qo_indptr
+        self.max_q_len = max_q_len
+        self.max_kv_len = max_kv_len
+
+
+class AiterMultiStepDraftBackend:
+    """
+    Wrap multiple triton attention backends as one for multiple consecutive
+    draft decoding steps.
+    """
+
+    def __init__(
+        self,
+        model_runner: ModelRunner,
+        topk: int,
+        speculative_num_steps: int,
+    ):
+        from sglang.srt.speculative.spec_utils import generate_draft_decode_kv_indices
+
+        self.topk = topk
+        self.speculative_num_steps = speculative_num_steps
+        self.generate_draft_decode_kv_indices = generate_draft_decode_kv_indices
+        max_bs = model_runner.req_to_token_pool.size * self.topk
+        self.kv_indptr = torch.zeros(
+            (
+                self.speculative_num_steps,
+                max_bs + 1,
+            ),
+            dtype=torch.int32,
+            device=model_runner.device,
+        )
+        self.attn_backends = []
+        for i in range(self.speculative_num_steps - 1):
+            self.attn_backends.append(
+                AiterAttnBackend(
+                    model_runner,
+                    skip_prefill=True,
+                    kv_indptr_buf=self.kv_indptr[i],
+                )
+            )
+        self.max_context_len = self.attn_backends[0].max_context_len
+        self.num_head = (
+            model_runner.model_config.num_attention_heads // get_attention_tp_size()
+        )
+        self.device = model_runner.device
+        # Cached variables for generate_draft_decode_kv_indices
+        self.pool_len = model_runner.req_to_token_pool.req_to_token.shape[1]
+        self.page_size = model_runner.server_args.page_size
+
+    def common_template(
+        self, forward_batch: ForwardBatch, kv_indices_buffer: torch.Tensor, call_fn: int
+    ):
+        num_seqs = forward_batch.batch_size
+        bs = self.topk * num_seqs
+        seq_lens_sum = forward_batch.seq_lens_sum
+
+        self.generate_draft_decode_kv_indices[
+            (self.speculative_num_steps, num_seqs, self.topk)
+        ](
+            forward_batch.req_pool_indices,
+            forward_batch.req_to_token_pool.req_to_token,
+            forward_batch.seq_lens,
+            kv_indices_buffer,
+            self.kv_indptr,
+            forward_batch.positions,
+            self.pool_len,
+            kv_indices_buffer.shape[1],
+            self.kv_indptr.shape[1],
+            triton.next_power_of_2(num_seqs),
+            triton.next_power_of_2(self.speculative_num_steps),
+            triton.next_power_of_2(bs),
+            self.page_size,
+        )
+
+        for i in range(self.speculative_num_steps - 1):
+            forward_batch.spec_info.kv_indptr = self.kv_indptr[i, : bs + 1]
+            forward_batch.spec_info.kv_indices = kv_indices_buffer[i][
+                : seq_lens_sum * self.topk + bs * (i + 1)
+            ]
+            call_fn(i, forward_batch)
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        kv_indices = torch.empty(
+            (
+                self.speculative_num_steps,
+                forward_batch.batch_size * self.topk * self.max_context_len,
+            ),
+            dtype=torch.int32,
+            device=self.device,
+        )
+
+        def call_fn(i, forward_batch):
+            forward_batch.spec_info.kv_indptr = (
+                forward_batch.spec_info.kv_indptr.clone()
+            )
+            forward_batch.spec_info.kv_indices = (
+                forward_batch.spec_info.kv_indices.clone()
+            )
+            self.attn_backends[i].init_forward_metadata(forward_batch)
+
+        self.common_template(forward_batch, kv_indices, call_fn)
+
+    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
+        self.cuda_graph_kv_indices = torch.zeros(
+            (self.speculative_num_steps, max_num_tokens * self.max_context_len),
+            dtype=torch.int32,
+            device=self.device,
+        )
+        for i in range(self.speculative_num_steps - 1):
+            self.attn_backends[i].init_cuda_graph_state(
+                max_bs, max_num_tokens, kv_indices_buf=self.cuda_graph_kv_indices[i]
+            )
+
+    def init_forward_metadata_capture_cuda_graph(self, forward_batch: ForwardBatch):
+        def call_fn(i, forward_batch):
+            self.attn_backends[i].init_forward_metadata_capture_cuda_graph(
+                forward_batch.batch_size,
+                forward_batch.batch_size * self.topk,
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                encoder_lens=None,
+                forward_mode=ForwardMode.DECODE,
+                spec_info=forward_batch.spec_info,
+            )
+
+        self.common_template(forward_batch, self.cuda_graph_kv_indices, call_fn)
+
+    def init_forward_metadata_replay_cuda_graph(
+        self, forward_batch: ForwardBatch, bs: int
+    ):
+        def call_fn(i, forward_batch):
+            self.attn_backends[i].init_forward_metadata_replay_cuda_graph(
+                bs,
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                seq_lens_sum=-1,
+                encoder_lens=None,
+                forward_mode=ForwardMode.DECODE,
+                spec_info=forward_batch.spec_info,
+                seq_lens_cpu=None,
+            )
+
+        self.common_template(forward_batch, self.cuda_graph_kv_indices, call_fn)
diff --git a/sglang/python/sglang/srt/layers/attention/attention_registry.py b/sglang/python/sglang/srt/layers/attention/attention_registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..9020e5260af9e256bfae3b551f595de82bd2042e
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/attention_registry.py
@@ -0,0 +1,242 @@
+import logging
+from typing import TYPE_CHECKING
+
+logger = logging.getLogger(__name__)
+
+
+if TYPE_CHECKING:
+    # evade circular imports
+    from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+    from sglang.srt.model_executor.model_runner import ModelRunner
+
+ATTENTION_BACKENDS = {}
+
+
+def register_attention_backend(name):
+    def decorator(fn):
+        ATTENTION_BACKENDS[name] = fn
+        return fn
+
+    return decorator
+
+
+@register_attention_backend("flashinfer")
+def create_flashinfer_backend(runner):
+    import torch
+
+    if not runner.use_mla_backend:
+        from sglang.srt.layers.attention.flashinfer_backend import FlashInferAttnBackend
+
+        # Init streams
+        if runner.server_args.speculative_algorithm == "EAGLE":
+            if (
+                not hasattr(runner, "plan_stream_for_flashinfer")
+                or not runner.plan_stream_for_flashinfer
+            ):
+                runner.plan_stream_for_flashinfer = torch.cuda.Stream()
+        return FlashInferAttnBackend(
+            runner, init_new_workspace=runner.init_new_workspace
+        )
+    else:
+        from sglang.srt.layers.attention.flashinfer_mla_backend import (
+            FlashInferMLAAttnBackend,
+        )
+
+        return FlashInferMLAAttnBackend(runner)
+
+
+@register_attention_backend("trtllm_mla")
+def create_trtllm_mla_backend(runner):
+    if not runner.use_mla_backend:
+        raise ValueError("trtllm_mla backend can only be used with MLA models.")
+    from sglang.srt.layers.attention.trtllm_mla_backend import TRTLLMMLABackend
+
+    return TRTLLMMLABackend(runner)
+
+
+@register_attention_backend("aiter")
+def create_aiter_backend(runner):
+    from sglang.srt.layers.attention.aiter_backend import AiterAttnBackend
+
+    return AiterAttnBackend(runner)
+
+
+@register_attention_backend("wave")
+def create_wave_backend(runner):
+    from sglang.srt.layers.attention.wave_backend import WaveAttnBackend
+
+    return WaveAttnBackend(runner)
+
+
+@register_attention_backend("ascend")
+def create_ascend_backend(runner):
+    from sglang.srt.hardware_backend.npu.attention.ascend_backend import (
+        AscendAttnBackend,
+    )
+
+    return AscendAttnBackend(runner)
+
+
+@register_attention_backend("nsa")
+def create_nsa_backend(runner):
+    from sglang.srt.layers.attention.nsa_backend import NativeSparseAttnBackend
+
+    return NativeSparseAttnBackend(runner)
+
+
+@register_attention_backend("triton")
+def create_triton_backend(runner):
+    assert not runner.model_config.is_encoder_decoder, (
+        "Cross attention is not supported in the triton attention backend. "
+        "Please use `--attention-backend flashinfer`."
+    )
+    if runner.server_args.enable_double_sparsity:
+        from sglang.srt.layers.attention.double_sparsity_backend import (
+            DoubleSparseAttnBackend,
+        )
+
+        return DoubleSparseAttnBackend(runner)
+    else:
+        from sglang.srt.layers.attention.triton_backend import TritonAttnBackend
+
+        return TritonAttnBackend(runner)
+
+
+@register_attention_backend("torch_native")
+def create_torch_native_backend(runner):
+    from sglang.srt.layers.attention.torch_native_backend import TorchNativeAttnBackend
+
+    return TorchNativeAttnBackend(runner)
+
+
+@register_attention_backend("flex_attention")
+def create_flex_attention_backend(runner):
+    from sglang.srt.layers.attention.torch_flex_backend import TorchFlexAttnBackend
+
+    return TorchFlexAttnBackend(runner)
+
+
+@register_attention_backend("flashmla")
+def create_flashmla_backend(runner):
+    from sglang.srt.layers.attention.flashmla_backend import FlashMLABackend
+
+    return FlashMLABackend(runner)
+
+
+@register_attention_backend("fa3")
+def create_flashattention_v3_backend(runner):
+    import torch
+
+    assert (
+        torch.cuda.get_device_capability()[0] == 8 and not runner.use_mla_backend
+    ) or torch.cuda.get_device_capability()[0] == 9, (
+        "FlashAttention v3 Backend requires SM>=80 and SM<=90. "
+        "Please use `--attention-backend flashinfer`."
+    )
+    from sglang.srt.layers.attention.flashattention_backend import FlashAttentionBackend
+
+    return FlashAttentionBackend(runner)
+
+
+@register_attention_backend("fa4")
+def create_flashattention_v4_backend(runner):
+    from sglang.srt.layers.attention.flashattention_backend import FlashAttentionBackend
+
+    return FlashAttentionBackend(runner, fa_impl_ver=4)
+
+
+@register_attention_backend("cutlass_mla")
+def create_cutlass_mla_backend(runner):
+    from sglang.srt.layers.attention.cutlass_mla_backend import CutlassMLABackend
+
+    return CutlassMLABackend(runner)
+
+
+@register_attention_backend("trtllm_mha")
+def create_trtllm_mha_backend(runner):
+    if runner.use_mla_backend:
+        raise ValueError("trtllm_mha backend can only be used with non-MLA models.")
+    from sglang.srt.layers.attention.trtllm_mha_backend import TRTLLMHAAttnBackend
+
+    return TRTLLMHAAttnBackend(runner)
+
+
+@register_attention_backend("intel_amx")
+def create_intel_amx_backend(runner):
+    from sglang.srt.layers.attention.intel_amx_backend import IntelAMXAttnBackend
+
+    return IntelAMXAttnBackend(runner)
+
+
+@register_attention_backend("dual_chunk_flash_attn")
+def create_dual_chunk_flash_attn_backend(runner):
+    from sglang.srt.layers.attention.dual_chunk_flashattention_backend import (
+        DualChunkFlashAttentionBackend,
+    )
+
+    return DualChunkFlashAttentionBackend(runner)
+
+
+def attn_backend_wrapper(runner: "ModelRunner", full_attn_backend: "AttentionBackend"):
+    """
+    Wrapper for special models like hybrid GDN, so we don't
+    need to change the code of the original attention backend.
+    """
+    assert not (
+        runner.hybrid_gdn_config is not None and runner.use_mla_backend
+    ), "hybrid_gdn can only be used with non-MLA models."
+
+    if cfg := runner.mambaish_config:
+        from sglang.srt.layers.attention.fla.utils import check_environments
+        from sglang.srt.layers.attention.hybrid_linear_attn_backend import (
+            HybridLinearAttnBackend,
+            Mamba2AttnBackend,
+        )
+        from sglang.srt.layers.attention.linear.gdn_backend import GDNAttnBackend
+        from sglang.srt.layers.attention.linear.kda_backend import KDAAttnBackend
+        from sglang.srt.layers.attention.linear.lightning_backend import (
+            LightningAttentionBackend,
+        )
+        from sglang.srt.layers.attention.linear.utils import (
+            initialize_linear_attn_config,
+        )
+        from sglang.srt.utils import is_blackwell, is_npu
+
+        check_environments()
+        initialize_linear_attn_config(runner.server_args)
+        if runner.hybrid_gdn_config is not None:
+            if is_blackwell():
+                assert (
+                    runner.server_args.attention_backend == "triton"
+                    or runner.server_args.attention_backend == "trtllm_mha"
+                    or runner.server_args.attention_backend == "fa4"
+                ), "triton or trtllm_mha or fa4 backend are the only supported backends on Blackwell GPUs for hybrid GDN models, use --attention-backend triton or --attention-backend trtllm_mha to specify the backend."
+            if is_npu():
+                assert (
+                    runner.server_args.attention_backend == "ascend"
+                ), "ascend backend is the only supported backend on NPU for hybrid GDN models, use --attention-backend ascend to specify the backend."
+            logger.info(f"Using hybrid linear attention backend for hybrid GDN models.")
+            linear_attn_backend = GDNAttnBackend(runner)
+        elif runner.mamba2_config is not None:
+            linear_attn_backend = Mamba2AttnBackend(runner)
+        elif runner.kimi_linear_config is not None:
+            linear_attn_backend = KDAAttnBackend(runner)
+        elif runner.hybrid_lightning_config is not None:
+            linear_attn_backend = LightningAttentionBackend(runner)
+        else:
+            raise ValueError(
+                "Expected hybrid GDN or NemotronH models, but got unknown model."
+            )
+        full_attn_layers = cfg.full_attention_layer_ids
+        return HybridLinearAttnBackend(
+            full_attn_backend, linear_attn_backend, full_attn_layers
+        )
+
+    return full_attn_backend
+
+
+@register_attention_backend("intel_xpu")
+def create_intel_xpu_backend(runner):
+    from sglang.srt.layers.attention.xpu_backend import XPUAttentionBackend
+
+    return XPUAttentionBackend(runner)
diff --git a/sglang/python/sglang/srt/layers/attention/base_attn_backend.py b/sglang/python/sglang/srt/layers/attention/base_attn_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d14e32a916bc9016905cc90f61c7ccf2b642c29
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/base_attn_backend.py
@@ -0,0 +1,169 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, Optional
+
+import torch
+
+from sglang.srt.utils.common import is_npu
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.attention.nsa.nsa_indexer import BaseIndexerMetadata
+    from sglang.srt.layers.radix_attention import RadixAttention
+    from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+    from sglang.srt.speculative.spec_info import SpecInput
+
+
+class AttentionBackend(ABC):
+    """The base class of attention backends"""
+
+    @abstractmethod
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        """Init the metadata for a forward pass."""
+        raise NotImplementedError()
+
+    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
+        """Init the global shared states for cuda graph."""
+        raise NotImplementedError()
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInput],
+    ):
+        """Init the metadata for a forward pass for capturing a cuda graph."""
+        raise NotImplementedError()
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInput],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+        """Init the metadata for a forward pass for replaying a cuda graph."""
+        raise NotImplementedError()
+
+    def get_cuda_graph_seq_len_fill_value(self):
+        """Get the fill value for padded seq lens. Typically, it is 0 or 1."""
+        raise NotImplementedError()
+
+    def get_verify_buffers_to_fill_after_draft(self):
+        """
+        Return buffers of verify attention kernels that needs to be filled after draft.
+
+        Typically, these are tree mask and position buffers.
+        """
+        return [None, None]
+
+    def update_verify_buffers_to_fill_after_draft(
+        self, spec_info: SpecInput, cuda_graph_bs: Optional[int]
+    ):
+        """
+        Update the buffers returned by get_verify_fill_after_draft_buffers if needed.
+
+        Here, we need to redo the computation of all metadata of the attention backend
+        that depends on tree mask and position buffers.
+        """
+        raise NotImplementedError()
+
+    def forward(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+        **kwargs,
+    ):
+        """Run forward on an attention layer."""
+        if forward_batch.forward_mode.is_idle():
+            return q.new_empty(q.shape[0], layer.tp_q_head_num * layer.v_head_dim)
+        elif forward_batch.forward_mode.is_decode():
+            return self.forward_decode(
+                q,
+                k,
+                v,
+                layer,
+                forward_batch,
+                save_kv_cache=save_kv_cache,
+                **kwargs,
+            )
+        elif forward_batch.forward_mode.is_mixed() and is_npu():
+            return self.forward_mixed(
+                q,
+                k,
+                v,
+                layer,
+                forward_batch,
+                save_kv_cache=save_kv_cache,
+                **kwargs,
+            )
+        else:
+            return self.forward_extend(
+                q,
+                k,
+                v,
+                layer,
+                forward_batch,
+                save_kv_cache=save_kv_cache,
+                **kwargs,
+            )
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+    ):
+        """Run a forward for decode."""
+        raise NotImplementedError()
+
+    def forward_extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+    ):
+        """Run a forward for extend."""
+        raise NotImplementedError()
+
+    def forward_mixed(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+    ):
+        """Run a forward for mix."""
+        raise NotImplementedError()
+
+    def support_triton(self):
+        """Check if the current backend supports triton."""
+        return True
+
+    def get_indexer_metadata(
+        self,
+        layer_id: int,
+        forward_batch: ForwardBatch,
+    ) -> Optional[BaseIndexerMetadata]:
+        """Get the indexer metadata. None means don't support indexer."""
+        return None
diff --git a/sglang/python/sglang/srt/layers/attention/cutlass_mla_backend.py b/sglang/python/sglang/srt/layers/attention/cutlass_mla_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..e81e761bcefdf6b405eb66f1718aa0827dee6ff8
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/cutlass_mla_backend.py
@@ -0,0 +1,285 @@
+from __future__ import annotations
+
+"""
+Support attention backend for Cutlass MLA.
+
+"""
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Optional, Union
+
+import torch
+import triton
+
+from sglang.srt.layers.attention.flashinfer_mla_backend import FlashInferMLAAttnBackend
+from sglang.srt.layers.attention.utils import create_flashmla_kv_indices_triton
+from sglang.srt.layers.dp_attention import get_attention_tp_size
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+from sglang.srt.utils import is_cuda
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.radix_attention import RadixAttention
+    from sglang.srt.model_executor.model_runner import ModelRunner
+    from sglang.srt.speculative.spec_info import SpecInput
+
+_is_cuda = is_cuda()
+if _is_cuda:
+    from sgl_kernel import cutlass_mla_decode, cutlass_mla_get_workspace_size
+
+
+# Cutlass MLA only supports pagesize=128
+PAGE_SIZE = 128
+
+
+@dataclass
+class CutlassMLADecodeMetadata:
+    workspace: Optional[torch.Tensor] = None
+    block_kv_indices: Optional[torch.Tensor] = None
+
+    def __init__(
+        self,
+        workspace: Optional[torch.Tensor] = None,
+        block_kv_indices: Optional[torch.Tensor] = None,
+    ):
+        self.workspace = workspace
+        self.block_kv_indices = block_kv_indices
+
+
+class CutlassMLABackend(FlashInferMLAAttnBackend):
+    """Cutlass attention kernels."""
+
+    def __init__(
+        self,
+        model_runner: ModelRunner,
+        skip_prefill: bool = False,
+        kv_indptr_buf: Optional[torch.Tensor] = None,
+        kv_last_page_len_buf: Optional[torch.Tensor] = None,
+    ):
+        super().__init__(
+            model_runner, skip_prefill, kv_indptr_buf, kv_last_page_len_buf
+        )
+
+        self.num_q_heads = (
+            model_runner.model_config.num_attention_heads // get_attention_tp_size()
+        )
+        self.num_kv_heads = model_runner.model_config.get_num_kv_heads(
+            get_attention_tp_size()
+        )
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+        self.num_local_heads = (
+            model_runner.model_config.num_attention_heads // get_attention_tp_size()
+        )
+        self.forward_metadata: Union[CutlassMLADecodeMetadata] = None
+        self.kv_lora_rank = model_runner.model_config.kv_lora_rank
+        self.qk_nope_head_dim = model_runner.model_config.qk_nope_head_dim
+        self.qk_rope_head_dim = model_runner.model_config.qk_rope_head_dim
+        self.v_head_dim = model_runner.model_config.v_head_dim
+        self.scaling = model_runner.model_config.scaling
+        self.data_type = model_runner.kv_cache_dtype
+        self.q_data_type = model_runner.dtype
+        self.kv_cache_dim = self.kv_lora_rank + self.qk_rope_head_dim
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+
+        bs = forward_batch.batch_size
+        spec_info = forward_batch.spec_info
+        if forward_batch.forward_mode.is_decode_or_idle():
+            if spec_info is None:
+                max_seqlen_pad = triton.cdiv(
+                    forward_batch.seq_lens_cpu.max().item(), PAGE_SIZE
+                )
+                block_kv_indices = torch.full(
+                    (bs, max_seqlen_pad),
+                    -1,
+                    dtype=torch.int32,
+                    device=forward_batch.seq_lens.device,
+                )
+                create_flashmla_kv_indices_triton[(bs,)](
+                    self.req_to_token,
+                    forward_batch.req_pool_indices,
+                    forward_batch.seq_lens,
+                    None,
+                    block_kv_indices,
+                    self.req_to_token.stride(0),
+                    max_seqlen_pad,
+                    PAGED_SIZE=PAGE_SIZE,
+                )
+                workspace_size = cutlass_mla_get_workspace_size(
+                    max_seqlen_pad * PAGE_SIZE, bs, num_kv_splits=1
+                )
+                workspace = torch.empty(
+                    workspace_size, device="cuda", dtype=torch.uint8
+                )
+                self.forward_metadata = CutlassMLADecodeMetadata(
+                    workspace,
+                    block_kv_indices,
+                )
+            else:
+                super().init_forward_metadata(forward_batch)
+        else:
+            super().init_forward_metadata(forward_batch)
+
+    def init_cuda_graph_state(
+        self,
+        max_bs: int,
+        max_num_tokens: int,
+        block_kv_indices: Optional[torch.Tensor] = None,
+    ):
+        if block_kv_indices is None:
+            cuda_graph_kv_indices = torch.full(
+                (max_bs, (self.max_context_len + PAGE_SIZE) // PAGE_SIZE),
+                1,
+                dtype=torch.int32,
+                device="cuda",
+            )
+        else:
+            cuda_graph_kv_indices = block_kv_indices
+
+        workspace_size = cutlass_mla_get_workspace_size(
+            cuda_graph_kv_indices.shape[1] * PAGE_SIZE, max_bs, num_kv_splits=1
+        )
+        self.cuda_graph_mla_workspace = torch.empty(
+            workspace_size, device="cuda", dtype=torch.uint8
+        )
+        self.cuda_graph_kv_indices = cuda_graph_kv_indices
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInput],
+    ):
+        if forward_mode.is_decode_or_idle():
+            if spec_info is None:
+                max_seqlen_pad = self.cuda_graph_kv_indices.shape[1]
+
+                create_flashmla_kv_indices_triton[(bs,)](
+                    self.req_to_token,
+                    req_pool_indices,
+                    seq_lens,
+                    None,
+                    self.cuda_graph_kv_indices,
+                    self.req_to_token.stride(0),
+                    self.cuda_graph_kv_indices.stride(0),
+                    PAGED_SIZE=PAGE_SIZE,
+                )
+                self.forward_metadata = CutlassMLADecodeMetadata(
+                    self.cuda_graph_mla_workspace,
+                    self.cuda_graph_kv_indices[:bs, :max_seqlen_pad],
+                )
+        else:
+            super().init_forward_metadata_capture_cuda_graph(
+                bs,
+                num_tokens,
+                req_pool_indices,
+                seq_lens,
+                encoder_lens,
+                forward_mode,
+                spec_info,
+            )
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInput],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+
+        if forward_mode.is_decode_or_idle():
+            assert seq_lens_cpu is not None
+            seq_lens = seq_lens[:bs]
+
+            create_flashmla_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices[:bs],
+                seq_lens,
+                None,
+                self.cuda_graph_kv_indices,
+                self.req_to_token.stride(0),
+                self.cuda_graph_kv_indices.stride(0),
+                PAGED_SIZE=PAGE_SIZE,
+            )
+        else:
+            super().init_forward_metadata_replay_cuda_graph(
+                bs,
+                req_pool_indices,
+                seq_lens,
+                seq_lens_sum,
+                encoder_lens,
+                forward_mode,
+                spec_info,
+                seq_lens_cpu,
+            )
+
+    def get_cuda_graph_seq_len_fill_value(self):
+        return 1
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+        # For multi-head latent attention
+        q_rope: Optional[torch.Tensor] = None,
+        k_rope: Optional[torch.Tensor] = None,
+    ):
+        cache_loc = forward_batch.out_cache_loc
+
+        if k is not None:
+            assert v is not None
+            if save_kv_cache:
+                if k_rope is not None:
+                    forward_batch.token_to_kv_pool.set_mla_kv_buffer(
+                        layer,
+                        cache_loc,
+                        k,
+                        k_rope,
+                    )
+                else:
+                    forward_batch.token_to_kv_pool.set_kv_buffer(
+                        layer,
+                        cache_loc,
+                        k,
+                        v,
+                    )
+
+        # Reshape inputs
+        if q_rope is not None:
+            q_nope = q.view(-1, layer.tp_q_head_num, layer.v_head_dim)
+            q_rope = q_rope.view(
+                -1, layer.tp_q_head_num, layer.head_dim - layer.v_head_dim
+            )
+        else:
+            reshaped_q = q.view(-1, layer.tp_q_head_num, layer.head_dim)
+            q_nope = reshaped_q[:, :, : layer.v_head_dim]
+            q_rope = reshaped_q[:, :, layer.v_head_dim :]
+
+        q_nope = q_nope.to(self.q_data_type)
+        q_rope = q_rope.to(self.q_data_type)
+
+        k_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
+
+        o = cutlass_mla_decode(
+            q_nope=q_nope,
+            q_pe=q_rope,
+            kv_c_and_k_pe_cache=k_cache.view(-1, PAGE_SIZE, self.kv_cache_dim),
+            seq_lens=forward_batch.seq_lens.to(torch.int32),
+            page_table=self.forward_metadata.block_kv_indices,
+            workspace=self.forward_metadata.workspace,
+            sm_scale=layer.scaling,
+            num_kv_splits=1,
+        )
+
+        return o.view(-1, layer.tp_q_head_num * layer.v_head_dim)
diff --git a/sglang/python/sglang/srt/layers/attention/double_sparsity_backend.py b/sglang/python/sglang/srt/layers/attention/double_sparsity_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8ee1538b1fc8e2dfba9040e6dad4bba5a1dad06
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/double_sparsity_backend.py
@@ -0,0 +1,257 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import torch
+
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.server_args import get_global_server_args
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.radix_attention import RadixAttention
+    from sglang.srt.model_executor.model_runner import ModelRunner
+
+
+class DoubleSparseAttnBackend(AttentionBackend):
+    def __init__(self, model_runner: ModelRunner):
+        # Lazy import to avoid the initialization of cuda context
+        from sglang.srt.layers.attention.triton_ops.double_sparsity_attention import (
+            extend_attention_fwd,
+            flash_decode_attention_fwd,
+            flash_decode_sparse_attention_fwd,
+        )
+
+        super().__init__()
+
+        self.decode_attention_fwd = flash_decode_attention_fwd
+        self.decode_sparse_attention_fwd = flash_decode_sparse_attention_fwd
+        self.extend_attention_fwd = extend_attention_fwd
+        self.num_head = model_runner.model_config.num_attention_heads
+        self.head_dim = model_runner.model_config.hidden_size // self.num_head
+        self.heavy_token_num = model_runner.server_args.ds_heavy_token_num
+
+        self.sorted_channels = model_runner.sorted_channels
+        self.sparse_decode_threshold = (
+            model_runner.server_args.ds_sparse_decode_threshold
+        )
+        self.att_out_approx: torch.Tensor = None
+        self.mid_out: torch.Tensor = None
+        self.mid_o_logexpsum: torch.Tensor = None
+
+        # TODO: Change the hard-coded block_seq_num
+        self.BLOCK_SEQ = 128
+
+        if get_global_server_args().triton_attention_reduce_in_fp32:
+            self.reduce_dtype = torch.float32
+        else:
+            self.reduce_dtype = torch.float16
+
+        self.forward_metadata = None
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        """Init auxiliary variables for triton attention backend."""
+
+        if forward_batch.forward_mode.is_decode():
+            start_loc = torch.zeros_like(forward_batch.seq_lens, dtype=torch.int32)
+            start_loc[1:] = torch.cumsum(forward_batch.seq_lens[:-1], dim=0)
+
+            total_num_tokens = torch.sum(forward_batch.seq_lens).item()
+            attn_logits = torch.empty(
+                (self.num_head, total_num_tokens),
+                dtype=self.reduce_dtype,
+                device="cuda",
+            )
+
+            max_seq_len = torch.max(forward_batch.seq_lens).item()
+            min_seq_len = torch.min(forward_batch.seq_lens).item()
+            max_extend_len = None
+            # NOTE: Align sequence order with req_to_token order
+            ds_req_to_token = forward_batch.req_to_token_pool.req_to_token[
+                forward_batch.req_pool_indices
+            ]
+
+            bsz = forward_batch.seq_lens.shape[0]
+
+            att_out_approx = torch.empty(
+                [self.num_head, bsz, max_seq_len],
+                dtype=self.reduce_dtype,
+                device="cuda",
+            )
+
+            block_seq_num = (
+                self.heavy_token_num + self.BLOCK_SEQ - 1
+            ) // self.BLOCK_SEQ
+
+            mid_out = torch.empty(
+                [bsz, self.num_head, block_seq_num, self.head_dim],
+                dtype=torch.float32,
+                device="cuda",
+            )
+            mid_o_logexpsum = torch.empty(
+                [bsz, self.num_head, block_seq_num], dtype=torch.float32, device="cuda"
+            )
+            self.att_out_approx = att_out_approx
+            self.mid_out = mid_out
+            self.mid_o_logexpsum = mid_o_logexpsum
+
+        else:
+            start_loc = attn_logits = max_seq_len = min_seq_len = None
+            prefix_lens = forward_batch.extend_prefix_lens
+            max_extend_len = torch.max(forward_batch.seq_lens - prefix_lens).item()
+            ds_req_to_token = None
+
+        self.forward_metadata = (
+            start_loc,
+            attn_logits,
+            max_seq_len,
+            min_seq_len,
+            max_extend_len,
+            ds_req_to_token,
+        )
+
+    def forward_extend(
+        self,
+        q,
+        k,
+        v,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+    ):
+        # TODO: reuse the buffer across layers
+        if layer.qk_head_dim != layer.v_head_dim:
+            o = q.new_empty((q.shape[0], layer.tp_q_head_num * layer.v_head_dim))
+        else:
+            o = torch.empty_like(q)
+
+        k_label = torch.gather(
+            k,
+            2,
+            self.sorted_channels[layer.layer_id]
+            .unsqueeze(0)
+            .expand(k.shape[0], -1, -1),
+        )
+
+        if save_kv_cache:
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                layer, forward_batch.out_cache_loc, k, v, k_label
+            )
+
+        (
+            start_loc,
+            attn_logits,
+            max_seq_len,
+            min_seq_len,
+            max_extend_len,
+            ds_req_to_token,
+        ) = self.forward_metadata
+        self.extend_attention_fwd(
+            q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
+            k.contiguous(),
+            v.contiguous(),
+            o.view(-1, layer.tp_q_head_num, layer.v_head_dim),
+            forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id),
+            forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id),
+            forward_batch.req_to_token_pool.req_to_token,
+            forward_batch.req_pool_indices,
+            forward_batch.seq_lens,
+            forward_batch.extend_seq_lens,
+            forward_batch.extend_start_loc,
+            max_extend_len,
+            layer.scaling,
+            layer.logit_cap,
+        )
+        return o
+
+    def forward_decode(
+        self,
+        q,
+        k,
+        v,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+    ):
+        # During torch.compile, there is a bug in rotary_emb that causes the
+        # output value to have a 3D tensor shape. This reshapes the output correctly.
+        q = q.reshape(-1, layer.tp_q_head_num * layer.qk_head_dim)
+
+        # TODO: reuse the buffer across layers
+        if layer.qk_head_dim != layer.v_head_dim:
+            o = q.new_empty((q.shape[0], layer.tp_q_head_num * layer.v_head_dim))
+        else:
+            o = torch.empty_like(q)
+
+        # TODO: Add min seqlen
+        (
+            start_loc,
+            attn_logits,
+            max_seq_len,
+            min_seq_len,
+            max_extend_len,
+            ds_req_to_token,
+        ) = self.forward_metadata
+
+        k_label = torch.gather(
+            k,
+            2,
+            self.sorted_channels[layer.layer_id]
+            .unsqueeze(0)
+            .expand(k.shape[0], -1, -1),
+        )
+
+        if save_kv_cache:
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                layer, forward_batch.out_cache_loc, k, v, k_label
+            )
+
+        # NOTE(Andy) shouldn't be used when max_len_in_batch < heavy_token_num
+        #            and set a minimum value for sparse_decode
+        if (
+            min_seq_len < self.heavy_token_num
+            or max_seq_len < self.sparse_decode_threshold
+        ):
+            self.decode_attention_fwd(
+                q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
+                forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id),
+                forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id),
+                o.view(-1, layer.tp_q_head_num, layer.v_head_dim),
+                forward_batch.req_to_token_pool.req_to_token,
+                forward_batch.req_pool_indices,
+                start_loc,
+                forward_batch.seq_lens,
+                attn_logits,
+                max_seq_len,
+                layer.scaling,
+                layer.logit_cap,
+            )
+        else:
+            # TODO(Andy): indexing with torch.gather or torch.index_select or customized kernel
+            q_label = torch.gather(
+                q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
+                2,
+                self.sorted_channels[layer.layer_id]
+                .unsqueeze(0)
+                .expand(q.shape[0], -1, -1),
+            )
+            self.decode_sparse_attention_fwd(
+                q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
+                forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id),
+                forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id),
+                o.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
+                q_label,
+                forward_batch.token_to_kv_pool.get_label_buffer(layer.layer_id),
+                ds_req_to_token,
+                forward_batch.seq_lens,
+                max_seq_len,
+                layer.scaling,
+                layer.logit_cap,
+                self.heavy_token_num,
+                self.att_out_approx,
+                self.mid_out,
+                self.mid_o_logexpsum,
+                self.BLOCK_SEQ,
+            )
+
+        return o
diff --git a/sglang/python/sglang/srt/layers/attention/dual_chunk_flashattention_backend.py b/sglang/python/sglang/srt/layers/attention/dual_chunk_flashattention_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..e522fbe4a934a248daa5d1e4d5b6ba3d3b5ace80
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/dual_chunk_flashattention_backend.py
@@ -0,0 +1,1700 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Attention layer with Dual chunk flash attention and sparse attention."""
+
+import functools
+import logging
+import math
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from sgl_kernel.flash_attn import flash_attn_varlen_func, flash_attn_with_kvcache
+from sgl_kernel.sparse_flash_attn import (
+    convert_vertical_slash_indexes,
+    convert_vertical_slash_indexes_mergehead,
+    sparse_attn_func,
+)
+
+from sglang.srt.distributed.parallel_state import get_tensor_model_parallel_rank
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+from sglang.srt.layers.attention.flashattention_backend import FlashAttentionMetadata
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.radix_attention import RadixAttention
+    from sglang.srt.model_executor.model_runner import ModelRunner
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class DualChunkFlashAttentionMetadata:
+    """Metadata for FlashAttentionBackend.
+
+    NOTE: Any python object stored here is not updated when it is
+    cuda-graph replayed. If you have values that need to be changed
+    dynamically, it should be stored in tensor. The tensor has to be
+    updated from `CUDAGraphRunner.forward` API.
+    """
+
+    # (batch_size,). The sequence length per sequence. Sequence length means
+    # the computed tokens + new tokens None if it is a decoding.
+    seq_lens: Optional[List[int]] = None
+    # seq_lens stored as a tensor.
+    seq_lens_tensor: Optional[torch.Tensor] = None
+    # Maximum sequence length among prefill batch. 0 if there are decoding
+    # requests only.
+    max_seq_len: int = None
+
+    # (batch_size,). The orig sequence length per sequence.
+    orig_seq_lens: Optional[List[int]] = None
+
+    # orig_seq_lens stored as a tensor.
+    orig_seq_lens_tensor: Optional[torch.Tensor] = None
+
+    # Block addresses per sequence. (Seq id -> list of physical block)
+    # E.g., [0, 1, 2] means tokens are stored in 0th, 1st, and 2nd blocks
+    # in the kv cache. Each block can contain up to block_size tokens.
+    # 2nd dimensions are padded up to max_blocks_per_seq if it is cuda-graph
+    # captured.
+    block_tables: Optional[torch.Tensor] = None
+
+    # (batch_size + 1,). The cumulative subquery lengths of the sequences in
+    # the batch, used to index into subquery. E.g., if the subquery length
+    # is [4, 6], it is [0, 4, 10].
+    query_start_loc: Optional[torch.Tensor] = None
+    # (batch_size + 1,). The cumulative sequence lengths of the sequences in
+    # the batch, used to index into sequence. E.g., if the sequence length is
+    # [4, 6], it is [0, 4, 10].
+    seq_start_loc: Optional[torch.Tensor] = None
+
+    # Length scaling factor
+    scaling_factor: Optional[torch.Tensor] = None
+
+    # (batch_size,). Sequence lengths for intra attention.
+    seq_lens_intra: Optional[torch.Tensor] = None
+
+    # Max sequence length for intra attention.
+    max_seq_len_intra: Optional[int] = None
+
+    # (batch_size, num_blocks). Block table for intra attention.
+    block_tables_intra: Optional[torch.Tensor] = None
+
+    # (batch_size,). Sequence lengths for succ attention.
+    seq_lens_succ: Optional[torch.Tensor] = None
+
+    # Max sequence length for succ attention.
+    max_seq_len_succ: Optional[int] = None
+
+    # (batch_size, num_blocks). Block table for succ attention.
+    block_tables_succ: Optional[torch.Tensor] = None
+
+    # (batch_size,). Sequence lengths for inter attention.
+    seq_lens_inter: Optional[torch.Tensor] = None
+
+    # Max sequence length for inter attention.
+    max_seq_len_inter: Optional[int] = None
+
+
+class DualChunkFlashAttentionBackend(AttentionBackend):
+    def __init__(
+        self,
+        model_runner: "ModelRunner",
+    ) -> None:
+        self.forward_metadata: FlashAttentionMetadata = None
+        self.device = model_runner.device
+        self.max_context_len = model_runner.model_config.context_len
+        self.num_heads = model_runner.model_config.get_num_attention_heads(
+            model_runner.server_args.tp_size
+        )
+        self.num_kv_heads = model_runner.model_config.get_num_kv_heads(
+            model_runner.server_args.tp_size
+        )
+        self.head_size = model_runner.model_config.head_dim
+
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+        self.kv_cache_dtype = model_runner.kv_cache_dtype
+        self.kv_cache_dtype_str = model_runner.server_args.kv_cache_dtype
+        self.page_size = model_runner.page_size
+
+        assert self.num_heads % self.num_kv_heads == 0
+        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
+
+        dual_chunk_attention_config = getattr(
+            model_runner.model_config.hf_config, "dual_chunk_attention_config", None
+        )
+        assert dual_chunk_attention_config is not None
+        self.chunk_size = dual_chunk_attention_config.get("chunk_size", 8192)
+        self.local_size = dual_chunk_attention_config.get("local_size", 1024)
+        self.original_max_position_embeddings = dual_chunk_attention_config.get(
+            "original_max_position_embeddings", 0
+        )
+        self.sparse_attention_config = dual_chunk_attention_config.get(
+            "sparse_attention_config", None
+        )
+        if not self.sparse_attention_config:
+            logger.warning_once(
+                "Sparse attention will not be enabled as "
+                "sparse attention config is not provided."
+            )
+        self.sparse_attention_enabled = dual_chunk_attention_config.get(
+            "sparse_attention_enabled", self.sparse_attention_config is not None
+        )
+        self.sparse_attention_threshold = dual_chunk_attention_config.get(
+            "sparse_attention_threshold", 32768
+        )
+        self.sparse_attention_last_q = dual_chunk_attention_config.get(
+            "sparse_attention_last_q", 64
+        )
+        self.dual_chunk_attention_config = dual_chunk_attention_config
+
+        if self.sparse_attention_enabled:
+            self.arange = torch.arange(self.sparse_attention_last_q, device="cuda")
+            self.last_q_mask = (
+                self.arange[None, None, :, None] >= self.arange[None, None, None, :]
+            )
+
+    @functools.lru_cache()
+    def get_sparse_attention_config(self, layer_idx) -> List[Dict[str, Any]]:
+        layer_sparse_attention_config = {
+            int(i): j for i, j in self.sparse_attention_config[layer_idx].items()
+        }
+        start_head = self.num_heads * get_tensor_model_parallel_rank()
+        end_head = start_head + self.num_heads
+        return [layer_sparse_attention_config[i] for i in range(start_head, end_head)]
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        """Initialize forward metadata hence all layers in the forward pass can reuse it."""
+
+        forward_mode: ForwardMode = forward_batch.forward_mode
+        assert forward_mode.is_prefill() or forward_mode.is_decode()
+        batch_size = forward_batch.batch_size
+
+        metadata = DualChunkFlashAttentionMetadata()
+        metadata.seq_lens_tensor = forward_batch.seq_lens.to(torch.int32)
+        metadata.seq_lens = forward_batch.seq_lens.tolist()
+        metadata.max_seq_len = forward_batch.seq_lens.max().item()
+
+        metadata.orig_seq_lens_tensor = forward_batch.orig_seq_lens
+        metadata.orig_seq_lens = forward_batch.orig_seq_lens.tolist()
+
+        metadata.block_tables = forward_batch.req_to_token_pool.req_to_token[
+            forward_batch.req_pool_indices, : metadata.max_seq_len
+        ]
+        # Convert the block table to a strided format.
+        if self.page_size > 1:
+            strided_indices = torch.arange(
+                0, metadata.block_tables.shape[1], self.page_size, device=self.device
+            )
+            metadata.block_tables = (
+                metadata.block_tables[:, strided_indices] // self.page_size
+            )
+
+        metadata.query_start_loc = torch.zeros(
+            batch_size + 1, dtype=torch.int32, device=metadata.seq_lens_tensor.device
+        )
+        if forward_mode.is_prefill():
+            metadata.query_start_loc[1:] = torch.cumsum(
+                forward_batch.extend_seq_lens.to(torch.int32), dim=0, dtype=torch.int32
+            )
+        else:
+            metadata.query_start_loc[1:] = torch.cumsum(
+                torch.arange(
+                    batch_size,
+                    dtype=metadata.query_start_loc.dtype,
+                    device=metadata.query_start_loc.device,
+                ),
+                dim=0,
+                dtype=torch.int32,
+            )
+        metadata.seq_start_loc = torch.zeros(
+            batch_size + 1, dtype=torch.int32, device=metadata.seq_lens_tensor.device
+        )
+        metadata.seq_start_loc[1:] = torch.cumsum(
+            metadata.seq_lens_tensor, dim=0, dtype=torch.int32
+        )
+
+        if self.original_max_position_embeddings > 0:
+            if forward_mode.is_prefill():
+                metadata.scaling_factor = (
+                    0.1
+                    * torch.log(
+                        metadata.orig_seq_lens_tensor
+                        / self.original_max_position_embeddings
+                    )
+                    + 1.0
+                ).clip(min=1)
+            else:
+                metadata.scaling_factor = (
+                    0.1
+                    * torch.log(
+                        metadata.orig_seq_lens_tensor
+                        / self.original_max_position_embeddings
+                    )
+                    + 1.0
+                ).clip(min=1)
+
+        if forward_mode.is_decode():
+            cache_seq_lens = metadata.orig_seq_lens_tensor
+
+            chunk_len = self.chunk_size - self.local_size
+            chunk_num_curr = (cache_seq_lens - 1) // chunk_len
+
+            seq_lens_intra = cache_seq_lens - chunk_num_curr * chunk_len
+            max_seq_len_intra = seq_lens_intra.max().item()
+            metadata.seq_lens_intra = seq_lens_intra
+            metadata.max_seq_len_intra = max_seq_len_intra
+
+            block_tables_intra = torch.zeros(
+                batch_size,
+                (max_seq_len_intra - 1) // self.page_size + 1,
+                dtype=metadata.block_tables.dtype,
+                device=metadata.block_tables.device,
+            )
+            for i in range(batch_size):
+                st = chunk_num_curr[i] * chunk_len // self.page_size
+                ed = min(
+                    st + (max_seq_len_intra - 1) // self.page_size + 1,
+                    (cache_seq_lens[i] - 1) // self.page_size + 1,
+                )
+                block_tables_intra[i, : ed - st] = metadata.block_tables[i, st:ed]
+            metadata.block_tables_intra = block_tables_intra
+
+            metadata.seq_lens_succ = (
+                chunk_num_curr - (chunk_num_curr - 1).clip(min=0)
+            ) * chunk_len
+            metadata.max_seq_len_succ = metadata.seq_lens_succ.max().item()
+            if metadata.max_seq_len_succ:
+                block_tables_succ = torch.zeros(
+                    batch_size,
+                    (metadata.max_seq_len_succ - 1) // self.page_size + 1,
+                    dtype=metadata.block_tables.dtype,
+                    device=metadata.block_tables.device,
+                )
+                for i in range(batch_size):
+                    start = (
+                        (chunk_num_curr[i] - 1).clip(min=0)
+                        * chunk_len
+                        // self.page_size
+                    )
+                    end = min(
+                        start + (metadata.max_seq_len_succ - 1) // self.page_size + 1,
+                        (cache_seq_lens[i] - 1) // self.page_size + 1,
+                    )
+                    block_tables_succ[i, : end - start] = metadata.block_tables[
+                        i, start:end
+                    ]
+                metadata.block_tables_succ = block_tables_succ
+
+            metadata.seq_lens_inter = (chunk_num_curr - 1).clip(min=0) * chunk_len
+            metadata.max_seq_len_inter = metadata.seq_lens_inter.max().item()
+
+        self.forward_metadata = metadata
+
+    def forward_extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: "RadixAttention",
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+    ):
+        # Use precomputed metadata across all layers
+        metadata = self.forward_metadata
+
+        (
+            query,
+            query_succ,
+            query_inter,
+            query_succ_critical,
+            query_inter_critical,
+        ) = torch.split(q, q.shape[-1] // 5, dim=-1)
+
+        # Reshape the query, key, and value tensors.
+        query = query.view(-1, self.num_heads, self.head_size)
+        query_succ = query_succ.view(-1, self.num_heads, self.head_size)
+        query_inter = query_inter.view(-1, self.num_heads, self.head_size)
+        query_succ_critical = query_succ_critical.view(
+            -1, self.num_heads, self.head_size
+        )
+        query_inter_critical = query_inter_critical.view(
+            -1, self.num_heads, self.head_size
+        )
+        key = k.view(-1, self.num_kv_heads, self.head_size)
+        value = v.view(-1, self.num_kv_heads, self.head_size)
+
+        # apply DCA scaling
+        if self.original_max_position_embeddings > 0:
+            assert metadata.scaling_factor is not None
+            assert metadata.query_start_loc is not None
+            assert metadata.orig_seq_lens is not None
+            current_start = 0
+            query_start_loc_cpu = metadata.query_start_loc.cpu()
+            for i in range(len(metadata.orig_seq_lens)):
+                current_end = (
+                    current_start
+                    + (query_start_loc_cpu[i + 1] - query_start_loc_cpu[i]).item()
+                )
+                key[current_start:current_end].mul_(metadata.scaling_factor[i])
+                current_start = current_end
+            assert current_end <= self.max_context_len
+
+        # Do multi-head attention
+        key_cache, value_cache = forward_batch.token_to_kv_pool.get_kv_buffer(
+            layer.layer_id
+        )
+        key_cache = key_cache.view(
+            -1, self.page_size, layer.tp_k_head_num, layer.head_dim
+        )
+        value_cache = value_cache.view(
+            -1, self.page_size, layer.tp_v_head_num, layer.head_dim
+        )
+
+        if key is not None and value is not None:
+            if save_kv_cache:
+                forward_batch.token_to_kv_pool.set_kv_buffer(
+                    layer,
+                    forward_batch.out_cache_loc,
+                    key,
+                    value,
+                    layer.k_scale,
+                    layer.v_scale,
+                )
+
+        if not save_kv_cache:
+            # profile run
+            o = flash_attn_varlen_func(
+                q=query,
+                k=key,
+                v=value,
+                cu_seqlens_q=metadata.seq_start_loc,
+                cu_seqlens_k=metadata.seq_start_loc,
+                max_seqlen_q=metadata.max_seq_len,
+                max_seqlen_k=metadata.max_seq_len,
+                softmax_scale=layer.scaling,
+                causal=True,
+            )
+        else:
+            # prefill/chunked-prefill
+            # get per layer sparse attention config
+            if self.sparse_attention_enabled:
+                self.layer_sparse_attention_config = self.get_sparse_attention_config(
+                    layer.layer_id
+                )
+            assert metadata.orig_seq_lens is not None
+            o = self._dual_chunk_flash_attn_prefill(
+                q=query,
+                q_succ=query_succ,
+                q_inter=query_inter,
+                q_succ_critical=query_succ_critical,
+                q_inter_critical=query_inter_critical,
+                k=key_cache,
+                v=value_cache,
+                cu_seqlens_q=metadata.query_start_loc,
+                cu_seqlens_k=metadata.seq_start_loc,
+                orig_seq_lens=metadata.orig_seq_lens,
+                scaling_factor=metadata.scaling_factor,
+                softmax_scale=layer.scaling,
+                causal=True,
+                window_size=(-1, -1),
+                block_table=metadata.block_tables,
+                chunk_size=self.chunk_size,
+                local_size=self.local_size,
+            )
+        return o.view(-1, layer.tp_q_head_num * layer.v_head_dim)
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: "RadixAttention",
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+    ) -> torch.Tensor:
+        # Use precomputed metadata across all layers
+        metadata = self.forward_metadata
+
+        (
+            query,
+            query_succ,
+            query_inter,
+            query_succ_critical,
+            query_inter_critical,
+        ) = torch.split(q, q.shape[-1] // 5, dim=-1)
+
+        # Reshape the query, key, and value tensors.
+        query = query.view(-1, self.num_heads, self.head_size)
+        query_succ = query_succ.view(-1, self.num_heads, self.head_size)
+        query_inter = query_inter.view(-1, self.num_heads, self.head_size)
+        query_succ_critical = query_succ_critical.view(
+            -1, self.num_heads, self.head_size
+        )
+        query_inter_critical = query_inter_critical.view(
+            -1, self.num_heads, self.head_size
+        )
+        key = k.view(-1, self.num_kv_heads, self.head_size)
+        value = v.view(-1, self.num_kv_heads, self.head_size)
+
+        key_cache, value_cache = forward_batch.token_to_kv_pool.get_kv_buffer(
+            layer.layer_id
+        )
+        key_cache = key_cache.view(
+            -1, self.page_size, layer.tp_k_head_num, layer.head_dim
+        )
+        value_cache = value_cache.view(
+            -1, self.page_size, layer.tp_v_head_num, layer.head_dim
+        )
+
+        if key is not None and value is not None:
+            if save_kv_cache:
+                forward_batch.token_to_kv_pool.set_kv_buffer(
+                    layer,
+                    forward_batch.out_cache_loc,
+                    key,
+                    value,
+                    layer.k_scale,
+                    layer.v_scale,
+                )
+
+        # apply DCA scaling
+        if self.original_max_position_embeddings > 0:
+            assert metadata.scaling_factor is not None
+            scaling_factor = metadata.scaling_factor
+            key.mul_(scaling_factor.unsqueeze(-1).unsqueeze(-1))
+
+        o = self._dual_chunk_flash_attn_decoding(
+            query.unsqueeze(1),
+            query_succ.unsqueeze(1),
+            query_inter.unsqueeze(1),
+            key_cache,
+            value_cache,
+            block_table=metadata.block_tables,
+            cache_seqlens=metadata.seq_lens_tensor,
+            softmax_scale=layer.scaling,
+            causal=True,
+            chunk_size=self.chunk_size,
+            local_size=self.local_size,
+            original_max_position_embeddings=self.original_max_position_embeddings,
+            decode_meta=metadata,
+        ).squeeze(1)
+        return o.view(-1, layer.tp_q_head_num * layer.v_head_dim)
+
+    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
+        """Initialize CUDA graph state for the attention backend.
+
+        Args:
+            max_bs (int): Maximum batch size to support in CUDA graphs
+
+        This creates fixed-size tensors that will be reused during CUDA graph replay
+        to avoid memory allocations.
+        """
+        self.decode_metadata = {
+            "seq_lens_tensor": torch.zeros(
+                max_bs, dtype=torch.int32, device=self.device
+            ),
+            "orig_seq_lens_tensor": torch.zeros(
+                max_bs, dtype=torch.int32, device=self.device
+            ),
+            "scaling_factor": torch.zeros(
+                max_bs, dtype=torch.float32, device=self.device
+            ),
+            "block_tables": torch.zeros(
+                max_bs,
+                (self.max_context_len - 1) // self.page_size + 1,
+                dtype=torch.int32,
+                device=self.device,
+            ),
+            "block_tables_intra": torch.zeros(
+                max_bs,
+                (self.max_context_len - 1) // self.page_size + 1,
+                dtype=torch.int32,
+                device=self.device,
+            ),
+            "seq_lens_intra": torch.zeros(
+                max_bs, dtype=torch.int32, device=self.device
+            ),
+            "block_tables_succ": torch.zeros(
+                max_bs,
+                (self.max_context_len - 1) // self.page_size + 1,
+                dtype=torch.int32,
+                device=self.device,
+            ),
+            "seq_lens_succ": torch.zeros(max_bs, dtype=torch.int32, device=self.device),
+            "seq_lens_inter": torch.zeros(
+                max_bs, dtype=torch.int32, device=self.device
+            ),
+        }
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[None],
+    ):
+        metadata = DualChunkFlashAttentionMetadata()
+
+        if forward_mode.is_decode_or_idle():
+            if self.original_max_position_embeddings > 0:
+                metadata.scaling_factor = self.decode_metadata["scaling_factor"][:bs]
+
+            metadata.seq_lens_tensor = self.decode_metadata["seq_lens_tensor"][:bs]
+            metadata.orig_seq_lens_tensor = self.decode_metadata[
+                "orig_seq_lens_tensor"
+            ][:bs]
+            metadata.max_seq_len = self.max_context_len
+            metadata.block_tables = self.decode_metadata["block_tables"][
+                req_pool_indices, :
+            ]
+
+            # intra
+            metadata.max_seq_len_intra = self.max_context_len
+            metadata.seq_lens_intra = self.decode_metadata["seq_lens_intra"][:bs]
+
+            metadata.block_tables_intra = self.decode_metadata["block_tables_intra"][
+                :bs, :
+            ]
+
+            # succ
+            metadata.seq_lens_succ = self.decode_metadata["seq_lens_succ"][:bs]
+            metadata.max_seq_len_succ = self.max_context_len
+
+            metadata.block_tables_succ = self.decode_metadata["block_tables_succ"][
+                :bs, :
+            ]
+
+            metadata.seq_lens_inter = self.decode_metadata["seq_lens_inter"][:bs]
+            metadata.max_seq_len_inter = self.max_context_len
+
+            self.decode_metadata[bs] = metadata
+
+        self.forward_metadata = metadata
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[None],
+        seq_lens_cpu: Optional[torch.Tensor],
+        out_cache_loc: torch.Tensor = None,
+    ):
+        """Initialize forward metadata for replaying CUDA graph."""
+        assert forward_mode.is_decode()
+        seq_lens = seq_lens[:bs]
+        req_pool_indices = req_pool_indices[:bs]
+        metadata = self.decode_metadata[bs]
+
+        metadata.seq_lens_tensor.copy_(seq_lens.to(torch.int32))
+        metadata.seq_lens = seq_lens.tolist()
+        metadata.max_seq_len = seq_lens.max().item()
+
+        metadata.orig_seq_lens_tensor.copy_(seq_lens)
+        metadata.orig_seq_lens = seq_lens.tolist()
+
+        block_tables = self.req_to_token[req_pool_indices, : metadata.max_seq_len]
+        # Convert the block table to a strided format.
+        if self.page_size > 1:
+            strided_indices = torch.arange(
+                0, block_tables.shape[1], self.page_size, device=self.device
+            )
+            block_tables = block_tables[:, strided_indices] // self.page_size
+        metadata.block_tables.fill_(0)
+        metadata.block_tables[: block_tables.shape[0], : block_tables.shape[1]].copy_(
+            block_tables
+        )
+
+        if self.original_max_position_embeddings > 0:
+            scaling_factor = (
+                0.1
+                * torch.log(
+                    metadata.orig_seq_lens_tensor
+                    / self.original_max_position_embeddings
+                )
+                + 1.0
+            ).clip(min=1)
+            metadata.scaling_factor.copy_(scaling_factor)
+
+        cache_seq_lens = metadata.orig_seq_lens_tensor
+
+        chunk_len = self.chunk_size - self.local_size
+        chunk_num_curr = (cache_seq_lens - 1) // chunk_len
+
+        seq_lens_intra = cache_seq_lens - chunk_num_curr * chunk_len
+        max_seq_len_intra = seq_lens_intra.max().item()
+        metadata.seq_lens_intra.copy_(seq_lens_intra)
+        metadata.max_seq_len_intra = max_seq_len_intra
+
+        metadata.block_tables_intra.fill_(0)
+        for i in range(bs):
+            st = chunk_num_curr[i] * chunk_len // self.page_size
+            ed = min(
+                st + (max_seq_len_intra - 1) // self.page_size + 1,
+                (cache_seq_lens[i] - 1) // self.page_size + 1,
+            )
+            metadata.block_tables_intra[i, : ed - st] = metadata.block_tables[i, st:ed]
+
+        seq_lens_succ = (chunk_num_curr - (chunk_num_curr - 1).clip(min=0)) * chunk_len
+        metadata.seq_lens_succ.copy_(seq_lens_succ)
+        metadata.max_seq_len_succ = metadata.seq_lens_succ.max().item()
+        if metadata.max_seq_len_succ:
+            metadata.block_tables_succ.fill_(0)
+            for i in range(bs):
+                start = (
+                    (chunk_num_curr[i] - 1).clip(min=0) * chunk_len // self.page_size
+                )
+                end = min(
+                    start + (metadata.max_seq_len_succ - 1) // self.page_size + 1,
+                    (cache_seq_lens[i] - 1) // self.page_size + 1,
+                )
+                metadata.block_tables_succ[i, : end - start] = metadata.block_tables[
+                    i, start:end
+                ]
+
+        seq_lens_inter = (chunk_num_curr - 1).clip(min=0) * chunk_len
+        metadata.seq_lens_inter.copy_(seq_lens_inter)
+        metadata.max_seq_len_inter = metadata.seq_lens_inter.max().item()
+
+        self.forward_metadata = metadata
+
+    def get_cuda_graph_seq_len_fill_value(self):
+        """Get the fill value for sequence length in CUDA graph."""
+        return 1
+
+    def _dual_chunk_flash_attn_prefill(
+        self,
+        q,
+        q_succ,
+        q_inter,
+        q_succ_critical,
+        q_inter_critical,
+        k,
+        v,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        orig_seq_lens: List[int],
+        scaling_factor: torch.Tensor,
+        softmax_scale: float,
+        causal: Optional[bool] = True,
+        window_size: Tuple[int, int] = (-1, -1),
+        block_table: Optional[torch.Tensor] = None,
+        chunk_size: int = 8192,
+        local_size: int = 1024,
+    ):
+        if not causal:
+            raise ValueError("Dual Chunk Attention does not support causal=False")
+        if window_size != (-1, -1):
+            raise ValueError("Dual Chunk Attention does not support window_size")
+
+        cu_seqlens_q_cpu = cu_seqlens_q.cpu().tolist()
+        cu_seqlens_k_cpu = cu_seqlens_k.cpu().tolist()
+        all_outputs = []
+
+        for i in range(0, len(cu_seqlens_q_cpu) - 1):
+            qs = cu_seqlens_q_cpu[i]
+            qe = cu_seqlens_q_cpu[i : i + 2][-1]
+            ks = cu_seqlens_k_cpu[i]
+            ke = cu_seqlens_k_cpu[i : i + 2][-1]
+
+            current_q = q[qs:qe]
+            current_q_succ = q_succ[qs:qe]
+            current_q_inter = q_inter[qs:qe]
+            current_q_succ_critical = q_succ_critical[qs:qe]
+            current_q_inter_critical = q_inter_critical[qs:qe]
+
+            if block_table is None:
+                current_k = k[ks:ke]
+                current_v = v[ks:ke]
+                current_block_table = None
+                current_orig_seq_len = orig_seq_lens[i]
+            else:
+                current_block_table = block_table[i]
+                current_orig_seq_len = orig_seq_lens[i]
+                current_k = k
+                current_v = v
+            sparse_attn_enabled = (
+                self.sparse_attention_enabled
+                and current_orig_seq_len > self.sparse_attention_threshold
+            )
+
+            if current_q.shape[0] == 0:
+                continue
+
+            if current_k.shape[0] == 0:
+                all_outputs.append(
+                    torch.zeros(
+                        (current_q.shape[0], current_q.shape[1], v.shape[2]),
+                        device=q.device,
+                        dtype=q.dtype,
+                    )
+                )
+                continue
+
+            current_output = torch.empty_like(current_q)
+            group_size = int(current_q.size(-2) / current_k.size(-2))
+
+            if sparse_attn_enabled:
+                num_device_q_heads = current_q.size(-2)
+                heads_vertical_size = torch.empty(
+                    size=(num_device_q_heads,), dtype=torch.int32
+                )
+                heads_slash_size = torch.empty(
+                    size=(num_device_q_heads,), dtype=torch.int32
+                )
+                for head_id in range(current_q.size(-2)):
+                    (
+                        ty,
+                        vertical_size,
+                        slash_size,
+                        _,
+                    ) = self.layer_sparse_attention_config[head_id]
+                    assert ty == "vertical_and_slash", "only support slash mode"
+
+                    if vertical_size == 30:
+                        vertical_size += 100
+                    heads_vertical_size[head_id] = vertical_size
+                    heads_slash_size[head_id] = slash_size
+
+                current_output = self._dual_chunk_flash_attn_prefill_func(
+                    current_q,  # allheads
+                    current_q_succ,
+                    current_q_inter,
+                    current_q_succ_critical,
+                    current_q_inter_critical,
+                    current_k,
+                    current_v,
+                    current_block_table,
+                    softmax_scale,
+                    chunk_size,
+                    local_size,
+                    scaling_factor[i].item(),
+                    ke - ks,
+                    sparse_attn_enabled=sparse_attn_enabled,
+                    heads_vertical_size=heads_vertical_size,
+                    heads_slash_size=heads_slash_size,
+                    group_size=group_size,
+                )
+            else:
+                for head_id in range(current_q.size(-2)):
+                    # (seq_len, num_heads, head_size)
+                    current_q_head = current_q[:, head_id, :].unsqueeze(1)
+                    current_q_succ_head = current_q_succ[:, head_id, :].unsqueeze(1)
+                    current_q_inter_head = current_q_inter[:, head_id, :].unsqueeze(1)
+                    current_q_succ_head_critical = current_q_succ_critical[
+                        :, head_id, :
+                    ].unsqueeze(1)
+                    current_q_inter_head_critical = current_q_inter_critical[
+                        :, head_id, :
+                    ].unsqueeze(1)
+                    if block_table is not None:
+                        current_k_head = current_k[
+                            ..., head_id // group_size, :
+                        ].unsqueeze(2)
+                        current_v_head = current_v[
+                            ..., head_id // group_size, :
+                        ].unsqueeze(2)
+
+                    else:
+                        current_k_head = current_k[:, head_id, :].unsqueeze(1)
+                        current_v_head = current_v[:, head_id, :].unsqueeze(1)
+
+                    current_out = self._dual_chunk_flash_attn_prefill_func(
+                        current_q_head,
+                        current_q_succ_head,
+                        current_q_inter_head,
+                        current_q_succ_head_critical,
+                        current_q_inter_head_critical,
+                        current_k_head,
+                        current_v_head,
+                        current_block_table,
+                        softmax_scale,
+                        chunk_size,
+                        local_size,
+                        scaling_factor[i].item(),
+                        ke - ks,
+                        sparse_attn_enabled=sparse_attn_enabled,
+                    )
+                    current_output[:, head_id : head_id + 1, :] = current_out
+            all_outputs.append(current_output)
+        return torch.cat(all_outputs, dim=0)
+
+    def _dual_chunk_flash_attn_prefill_func(
+        self,
+        q,
+        q_succ,
+        q_inter,
+        q_succ_critical,
+        q_inter_critical,
+        k,
+        v,
+        block_table,
+        softmax_scale: float,
+        chunk_size: int,
+        local_size: int,
+        scaling_factor: float,
+        k_length: int,
+        sparse_attn_enabled: Optional[bool] = True,
+        heads_vertical_size=None,
+        heads_slash_size=None,
+        group_size=None,
+    ):
+        flash_results = []
+        chunk_len = chunk_size - local_size
+
+        if block_table is not None:
+            block_size = v.shape[1]
+            if chunk_len % block_size != 0:
+                raise ValueError("chunk_len must be divisible by block_size.")
+        else:
+            block_size = 1
+
+        if self.original_max_position_embeddings > 0:
+            softmax_scale = softmax_scale * scaling_factor
+
+        begin = k_length - q.shape[0]
+        while begin < k_length:
+            flash_per_chunk = []
+
+            prev_chunk_end_pos = (begin // chunk_len) * chunk_len
+            next_chunk_end_pos = prev_chunk_end_pos + chunk_len
+            end = min(next_chunk_end_pos, k_length)
+            qbegin = begin - (k_length - q.shape[0])
+            qend = end - (k_length - q.shape[0])
+
+            qk_chunks = []
+            q_states_intra = q[qbegin:qend]
+            # choose critical token
+            if block_table is not None:
+                block_tables_intra = _get_block(
+                    block_table, block_size, prev_chunk_end_pos, end
+                )
+                k_states_intra = k[block_tables_intra].view(-1, *k.shape[-2:])[
+                    : (end - prev_chunk_end_pos)
+                ]
+                v_states_intra = v[block_tables_intra].view(-1, *v.shape[-2:])[
+                    : (end - prev_chunk_end_pos)
+                ]
+            else:
+                block_tables_intra = None
+                k_states_intra = k[prev_chunk_end_pos:end]
+                v_states_intra = v[prev_chunk_end_pos:end]
+
+            if sparse_attn_enabled:
+                last_q_size = min(qend - qbegin, self.sparse_attention_last_q)
+                _, num_device_k_heads, head_dim = k_states_intra.shape
+                k_states_intra = (
+                    k_states_intra.unsqueeze(2)
+                    .repeat(1, 1, group_size, 1)
+                    .reshape(-1, num_device_k_heads * group_size, head_dim)
+                )
+                v_states_intra = (
+                    v_states_intra.unsqueeze(2)
+                    .repeat(1, 1, group_size, 1)
+                    .reshape(-1, num_device_k_heads * group_size, head_dim)
+                )
+                qk_chunks.append(
+                    (q_states_intra.transpose(0, 1)[:, -last_q_size:] * softmax_scale)
+                    @ k_states_intra.permute(1, 2, 0)
+                )
+
+            if prev_chunk_end_pos - chunk_len >= 0:
+                q_states_succ = q_succ[qbegin:qend]
+                q_states_succ_critical = q_succ_critical[qbegin:qend]
+                if block_table is not None:
+                    block_tables_succ = _get_block(
+                        block_table,
+                        block_size,
+                        prev_chunk_end_pos - chunk_len,
+                        prev_chunk_end_pos,
+                    )
+                    k_states_succ = k[block_tables_succ].view(-1, *k.shape[-2:])[
+                        :chunk_len
+                    ]
+                    v_states_succ = v[block_tables_succ].view(-1, *v.shape[-2:])[
+                        :chunk_len
+                    ]
+                else:
+                    k_states_succ = k[
+                        prev_chunk_end_pos - chunk_len : prev_chunk_end_pos
+                    ]
+                    v_states_succ = v[
+                        prev_chunk_end_pos - chunk_len : prev_chunk_end_pos
+                    ]
+
+                if sparse_attn_enabled:
+                    k_states_succ = (
+                        k_states_succ.unsqueeze(2)
+                        .repeat(1, 1, group_size, 1)
+                        .reshape(-1, num_device_k_heads * group_size, head_dim)
+                    )
+                    v_states_succ = (
+                        v_states_succ.unsqueeze(2)
+                        .repeat(1, 1, group_size, 1)
+                        .reshape(-1, num_device_k_heads * group_size, head_dim)
+                    )
+                    qk_chunks.append(
+                        (
+                            q_states_succ_critical.transpose(0, 1)[:, -last_q_size:]
+                            * softmax_scale
+                        )
+                        @ k_states_succ.permute(1, 2, 0)
+                    )
+
+            if prev_chunk_end_pos - chunk_len * 2 >= 0:
+                q_states_inter = q_inter[qbegin:qend]
+                q_states_inter_critical = q_inter_critical[qbegin:qend]
+                if block_table is not None:
+                    block_tables_inter = _get_block(
+                        block_table, block_size, 0, prev_chunk_end_pos - chunk_len
+                    )
+                    k_states_inter = k[block_tables_inter].view(-1, *k.shape[-2:])[
+                        : (prev_chunk_end_pos - chunk_len)
+                    ]
+                    v_states_inter = v[block_tables_inter].view(-1, *v.shape[-2:])[
+                        : (prev_chunk_end_pos - chunk_len)
+                    ]
+                else:
+                    k_states_inter = k[: prev_chunk_end_pos - chunk_len]
+                    v_states_inter = v[: prev_chunk_end_pos - chunk_len]
+
+                if sparse_attn_enabled:
+                    k_states_inter = (
+                        k_states_inter.unsqueeze(2)
+                        .repeat(1, 1, group_size, 1)
+                        .reshape(-1, num_device_k_heads * group_size, head_dim)
+                    )
+                    v_states_inter = (
+                        v_states_inter.unsqueeze(2)
+                        .repeat(1, 1, group_size, 1)
+                        .reshape(-1, num_device_k_heads * group_size, head_dim)
+                    )
+                    qk_chunks.append(
+                        (
+                            q_states_inter_critical.transpose(0, 1)[:, -last_q_size:]
+                            * softmax_scale
+                        )
+                        @ k_states_inter.permute(1, 2, 0)
+                    )
+
+            if sparse_attn_enabled:
+                reversed_qk = qk_chunks[::-1]
+                qk = torch.cat(reversed_qk, dim=-1)
+
+                qk[:, :, -last_q_size:] = torch.where(
+                    self.last_q_mask[..., -last_q_size:, -last_q_size:].to(qk.device),
+                    qk[:, :, -last_q_size:],
+                    -torch.inf,
+                )
+                qk = F.softmax(qk, dim=-1, dtype=torch.float32)
+
+                vertical = qk.sum(-2, keepdim=True)
+                vertical[..., :30] = torch.inf
+
+                # Avoid sorting by using the min/max ints to fill the indexer
+                # buffers.
+                int32_max = torch.iinfo(torch.int32).max
+                int32_min = torch.iinfo(torch.int32).min
+                n_heads = qk.size()[0]
+                max_slash_topk = torch.max(heads_slash_size).item()
+                max_vertical_topk = torch.max(heads_vertical_size).item()
+                # store each head's slash topk, vertical topk
+                vertical = vertical.reshape((n_heads, -1))
+                # prevent out of range when prompt size < max_vertical_topk
+                max_vertical_topk = min(vertical.shape[-1], max_vertical_topk)
+                vertical_topk_buffer = torch.topk(
+                    vertical, max_vertical_topk, -1
+                ).indices
+                slash_topk_buffer = torch.empty(
+                    size=(n_heads, max_slash_topk), dtype=torch.int64, device=qk.device
+                )
+                for head_i in range(n_heads):
+                    #  (nqheads=1, lastq, k_len)
+                    head_score = qk[head_i : head_i + 1, :, :]
+                    slash_scores = _sum_all_diagonal_matrix(head_score)
+                    if head_score.size(1) != 1:
+                        # drop right up corner
+                        slash_scores = slash_scores[..., : -last_q_size + 1]
+                    slash_scores[..., -100:] = torch.inf
+
+                    head_slash_size = heads_slash_size[head_i]
+                    head_slash_size = min(head_slash_size, vertical.size(-1))
+                    slash_topk = torch.topk(slash_scores, head_slash_size, -1).indices
+                    # （nheads, max_topk）
+                    slash_topk_buffer[head_i, :head_slash_size] = slash_topk
+
+                    # reset heads topk
+                    heads_slash_size[head_i] = head_slash_size
+                    heads_vertical_size[head_i] = min(
+                        heads_vertical_size[head_i], max_vertical_topk
+                    )
+
+                # store
+                vertical_buffer = torch.full(
+                    (n_heads, max_vertical_topk),
+                    int32_max,
+                    dtype=torch.int64,
+                    device=q.device,
+                )
+                slash_buffer = torch.full(
+                    (n_heads, max_slash_topk),
+                    int32_min,
+                    dtype=torch.int64,
+                    device=q.device,
+                )
+                succ_vertical_buffer = torch.full(
+                    (n_heads, max_vertical_topk),
+                    int32_max,
+                    dtype=torch.int64,
+                    device=q.device,
+                )
+                succ_slash_buffer = torch.full(
+                    (n_heads, max_slash_topk),
+                    int32_min,
+                    dtype=torch.int64,
+                    device=q.device,
+                )
+                inter_vertical_buffer = torch.full(
+                    (n_heads, max_vertical_topk),
+                    int32_max,
+                    dtype=torch.int64,
+                    device=q.device,
+                )
+                inter_slash_buffer = torch.full(
+                    (n_heads, max_slash_topk),
+                    int32_min,
+                    dtype=torch.int64,
+                    device=q.device,
+                )
+
+                vertical_size_buffer = torch.empty(
+                    size=(n_heads,), dtype=torch.int32, device=q.device
+                )
+                slash_sizes_buffer = torch.empty(
+                    size=(n_heads,), dtype=torch.int32, device=q.device
+                )
+                succ_vertical_size_buffer = torch.empty(
+                    size=(n_heads,), dtype=torch.int32, device=q.device
+                )
+                succ_slash_sizes_buffer = torch.empty(
+                    size=(n_heads,), dtype=torch.int32, device=q.device
+                )
+                inter_vertical_size_buffer = torch.empty(
+                    size=(n_heads,), dtype=torch.int32, device=q.device
+                )
+                inter_slash_sizes_buffer = torch.empty(
+                    size=(n_heads,), dtype=torch.int32, device=q.device
+                )
+
+                for head_i in range(n_heads):
+                    vertical_topk = vertical_topk_buffer[
+                        head_i, : heads_vertical_size[head_i]
+                    ]
+                    # intra
+                    intra_vertical_indices = (
+                        vertical_topk[vertical_topk >= prev_chunk_end_pos]
+                        - prev_chunk_end_pos
+                    )
+                    if intra_vertical_indices.nelement() == 0:
+                        intra_vertical_indices = torch.cat(
+                            [
+                                intra_vertical_indices,
+                                torch.arange(
+                                    0,
+                                    k_states_intra.size(0),
+                                    max(1, k_states_intra.size(0) / 5),
+                                    dtype=torch.int32,
+                                    device=intra_vertical_indices.device,
+                                ),
+                            ]
+                        )
+                    slash_topk = slash_topk_buffer[head_i, : heads_slash_size[head_i]]
+                    intra_slash_indices = (qk.size(-1) - 1) - slash_topk[
+                        slash_topk >= prev_chunk_end_pos
+                    ]
+                    # fill buffer
+                    v_count = intra_vertical_indices.nelement()
+                    s_count = intra_slash_indices.nelement()
+                    vertical_size_buffer[head_i] = v_count
+                    slash_sizes_buffer[head_i] = s_count
+                    vertical_buffer[head_i, :v_count].copy_(intra_vertical_indices)
+                    slash_buffer[head_i, :s_count].copy_(intra_slash_indices)
+                    # succ
+                    if prev_chunk_end_pos - chunk_len >= 0:
+                        succ_vertical_indices = vertical_topk[
+                            (vertical_topk < prev_chunk_end_pos)
+                            & (vertical_topk >= prev_chunk_end_pos - chunk_len)
+                        ] - (prev_chunk_end_pos - chunk_len)
+                        # TODO: support no vertical
+                        if succ_vertical_indices.nelement() == 0:
+                            succ_vertical_indices = torch.cat(
+                                [
+                                    succ_vertical_indices,
+                                    torch.arange(
+                                        0,
+                                        k_states_succ.size(0),
+                                        max(1, k_states_succ.size(0) / 5),
+                                        dtype=torch.int32,
+                                        device=intra_vertical_indices.device,
+                                    ),
+                                ]
+                            )
+                        succ_slash_indices = (
+                            prev_chunk_end_pos + (qend - qbegin) - 1
+                        ) - slash_topk[
+                            (
+                                (slash_topk >= (prev_chunk_end_pos - chunk_len))
+                                & (slash_topk < (prev_chunk_end_pos + (qend - qbegin)))
+                            )
+                        ]
+                        if succ_slash_indices.nelement() == 0:
+                            succ_slash_indices = torch.cat(
+                                [
+                                    succ_slash_indices,
+                                    torch.arange(
+                                        0,
+                                        k_states_succ.size(0),
+                                        max(1, k_states_succ.size(0) / 5),
+                                        dtype=torch.int32,
+                                        device=intra_vertical_indices.device,
+                                    ),
+                                ]
+                            )
+                        # fill buffer
+                        v_count = succ_vertical_indices.nelement()
+                        s_count = succ_slash_indices.nelement()
+                        succ_vertical_size_buffer[head_i] = v_count
+                        succ_slash_sizes_buffer[head_i] = s_count
+                        succ_vertical_buffer[head_i, :v_count].copy_(
+                            succ_vertical_indices
+                        )
+                        succ_slash_buffer[head_i, :s_count].copy_(succ_slash_indices)
+
+                    if prev_chunk_end_pos - 2 * chunk_len >= 0:
+                        inter_vertical_indices = vertical_topk[
+                            vertical_topk < prev_chunk_end_pos - chunk_len
+                        ]
+
+                        if inter_vertical_indices.nelement() == 0:
+                            inter_vertical_indices = torch.cat(
+                                [
+                                    inter_vertical_indices,
+                                    torch.arange(
+                                        0,
+                                        k_states_inter.size(0),
+                                        max(1, k_states_inter.size(0) / 5),
+                                        dtype=torch.int32,
+                                        device=intra_vertical_indices.device,
+                                    ),
+                                ]
+                            )
+                        inter_slash_indices = (
+                            prev_chunk_end_pos - chunk_len + (qend - qbegin) - 1
+                        ) - slash_topk[
+                            slash_topk
+                            < (prev_chunk_end_pos - chunk_len + (qend - qbegin))
+                        ]
+                        if inter_slash_indices.nelement() == 0:
+                            inter_slash_indices = torch.cat(
+                                [
+                                    inter_slash_indices,
+                                    torch.arange(
+                                        0,
+                                        k_states_inter.size(0),
+                                        max(1, k_states_inter.size(0) / 5),
+                                        dtype=torch.int32,
+                                        device=intra_vertical_indices.device,
+                                    ),
+                                ]
+                            )
+                        # fill buffer
+                        v_count = inter_vertical_indices.nelement()
+                        s_count = inter_slash_indices.nelement()
+                        inter_vertical_size_buffer[head_i] = v_count
+                        inter_slash_sizes_buffer[head_i] = s_count
+                        inter_vertical_buffer[head_i, :v_count].copy_(
+                            inter_vertical_indices
+                        )
+                        inter_slash_buffer[head_i, :s_count].copy_(inter_slash_indices)
+            else:
+                intra_vertical_indices, intra_slash_indices = None, None
+                succ_vertical_indices, succ_slash_indices = None, None
+                inter_vertical_indices, inter_slash_indices = None, None
+
+            if sparse_attn_enabled:
+                flash_result = self._do_flash_attn(
+                    q_states_intra,
+                    k_states_intra,
+                    v_states_intra,
+                    softmax_scale=softmax_scale,
+                    causal=True,
+                    stage="intra",
+                    vertical_indices=vertical_buffer,
+                    slash_indices=slash_buffer,
+                    vertical_indices_count=vertical_size_buffer,
+                    slash_indices_count=slash_sizes_buffer,
+                    mergehead_softmax_scale=softmax_scale,
+                    sparse_attn_enabled=sparse_attn_enabled,
+                )
+            else:
+                flash_result = self._do_flash_attn(
+                    q_states_intra,
+                    k_states_intra,
+                    v_states_intra,
+                    softmax_scale=softmax_scale,
+                    causal=True,
+                    stage="intra",
+                    vertical_indices=intra_vertical_indices,
+                    slash_indices=intra_slash_indices,
+                    sparse_attn_enabled=sparse_attn_enabled,
+                )
+            flash_per_chunk.append(flash_result)
+
+            if prev_chunk_end_pos - chunk_len >= 0:
+                if sparse_attn_enabled:
+                    flash_result = self._do_flash_attn(
+                        q_states_succ,
+                        k_states_succ,
+                        v_states_succ,
+                        softmax_scale=softmax_scale,
+                        causal=False,
+                        stage="succ",
+                        vertical_indices=succ_vertical_buffer,
+                        slash_indices=succ_slash_buffer,
+                        vertical_indices_count=succ_vertical_size_buffer,
+                        slash_indices_count=succ_slash_sizes_buffer,
+                        mergehead_softmax_scale=softmax_scale,
+                        sparse_attn_enabled=sparse_attn_enabled,
+                    )
+                else:
+                    flash_result = self._do_flash_attn(
+                        q_states_succ,
+                        k_states_succ,
+                        v_states_succ,
+                        softmax_scale=softmax_scale,
+                        causal=False,
+                        stage="succ",
+                        vertical_indices=succ_vertical_indices,
+                        slash_indices=succ_slash_indices,
+                        sparse_attn_enabled=sparse_attn_enabled,
+                    )
+                flash_per_chunk.append(flash_result)
+
+            if prev_chunk_end_pos - chunk_len * 2 >= 0:
+                if sparse_attn_enabled:
+                    flash_result = self._do_flash_attn(
+                        q_states_inter,
+                        k_states_inter,
+                        v_states_inter,
+                        softmax_scale=softmax_scale,
+                        causal=False,
+                        stage="inter",
+                        vertical_indices=inter_vertical_buffer,
+                        slash_indices=inter_slash_buffer,
+                        vertical_indices_count=inter_vertical_size_buffer,
+                        slash_indices_count=inter_slash_sizes_buffer,
+                        mergehead_softmax_scale=softmax_scale,
+                        sparse_attn_enabled=sparse_attn_enabled,
+                    )
+                else:
+                    flash_result = self._do_flash_attn(
+                        q_states_inter,
+                        k_states_inter,
+                        v_states_inter,
+                        softmax_scale=softmax_scale,
+                        causal=False,
+                        stage="inter",
+                        vertical_indices=inter_vertical_indices,
+                        slash_indices=inter_slash_indices,
+                        sparse_attn_enabled=sparse_attn_enabled,
+                    )
+                flash_per_chunk.append(flash_result)
+
+            flash_results.append(flash_per_chunk)
+            begin = end
+
+        attn_output = self._merge_attn_outputs(flash_results)
+        del flash_results
+        return attn_output
+
+    def _do_flash_attn(
+        self,
+        query_states: torch.Tensor,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        softmax_scale: float,
+        causal: bool = True,
+        max_seqlen_k: Optional[int] = None,
+        stage: str = "intra",
+        vertical_indices: Optional[torch.Tensor] = None,
+        slash_indices: Optional[torch.Tensor] = None,
+        vertical_indices_count: Optional[torch.Tensor] = None,
+        slash_indices_count: Optional[torch.Tensor] = None,
+        mergehead_softmax_scale: Optional[float] = None,
+        sparse_attn_enabled: Optional[bool] = False,
+    ):
+        if max_seqlen_k is None:
+            max_seqlen_k = key_states.shape[0]
+
+        q_len = query_states.shape[0]
+        q_heads = query_states.shape[1]
+        h_dim = query_states.shape[-1]
+
+        if sparse_attn_enabled:
+            assert slash_indices is not None
+            if stage == "intra":
+                assert causal
+            else:
+                assert not causal
+
+            query_states = query_states.unsqueeze(0).transpose(1, 2)
+            key_states = key_states.unsqueeze(0).transpose(1, 2)
+            value_states = value_states.unsqueeze(0).transpose(1, 2)
+
+            q = query_states
+            k = key_states
+            v = value_states
+
+            if vertical_indices_count is not None and slash_indices_count is not None:
+                assert mergehead_softmax_scale is not None
+
+                res, s_lse = _vertical_slash_sparse_attention(
+                    q,
+                    k,
+                    v,
+                    vertical_indices,
+                    slash_indices,
+                    mergehead_softmax_scale,
+                    causal=causal,
+                    stage=stage,
+                    vertical_indices_count=vertical_indices_count,
+                    slash_indices_count=slash_indices_count,
+                )
+                res = res.view(q_heads, q_len, h_dim).transpose(
+                    0, 1
+                )  # (qlen,nhead,h_dim)
+                s_lse = (
+                    s_lse.view(q_heads, q_len, 1).squeeze(-1).unsqueeze(0).float()
+                )  # (1, nhead,qlen)
+            else:
+                res, s_lse = _vertical_slash_sparse_attention(
+                    q,
+                    k,
+                    v,
+                    vertical_indices,
+                    slash_indices,
+                    softmax_scale,
+                    causal=causal,
+                    stage=stage,
+                )
+                res = res.view(q_len, q_heads, h_dim)
+                s_lse = s_lse.view(q_len, q_heads, 1).transpose(0, 2).float()
+            return res, s_lse
+
+        output, softmax_lse, *rest = flash_attn_varlen_func(
+            q=query_states,
+            k=key_states,
+            v=value_states,
+            softmax_scale=softmax_scale,
+            cu_seqlens_q=torch.tensor(
+                [0, query_states.shape[0]],
+                dtype=torch.int32,
+                device=query_states.device,
+            ),
+            max_seqlen_q=query_states.shape[0],
+            cu_seqlens_k=torch.tensor(
+                [0, max_seqlen_k], dtype=torch.int32, device=query_states.device
+            ),
+            max_seqlen_k=max_seqlen_k,
+            causal=causal,
+            return_softmax_lse=True,
+        )
+        softmax_lse = softmax_lse.view(q_len, q_heads, 1).transpose(0, 2).float()
+        return output, softmax_lse
+
+    def _merge_attn_outputs(
+        self,
+        flash_results: List[List[Tuple[torch.Tensor, torch.Tensor]]],
+        return_lse: Optional[bool] = False,
+    ) -> torch.Tensor:
+        attn_outputs_all = []
+        logits_all = []
+
+        for flash_per_chunk in flash_results:
+            if len(flash_per_chunk) == 1:
+                attn_outputs_all.append(flash_per_chunk[0][0])
+                if return_lse:
+                    logits_all.append(flash_per_chunk[0][1])
+                continue
+
+            attn_outputs = torch.stack(
+                [flash_attn_output[0] for flash_attn_output in flash_per_chunk]
+            )
+            logits = torch.stack(
+                [flash_attn_output[1] for flash_attn_output in flash_per_chunk]
+            )
+            logits = logits.to(torch.float32)
+
+            if return_lse:
+                max_val = torch.max(logits, dim=0).values
+                diff = torch.abs(logits[0] - logits[1])
+                log_sum_exp = max_val + torch.log1p(torch.exp(-diff))
+                logits_all.append(log_sum_exp)
+
+            max_logits = torch.max(logits, dim=0).values
+            stable_logits = logits - max_logits.unsqueeze(0)
+            lse_s = torch.exp(stable_logits).detach()
+            lse_sum = torch.sum(lse_s, dim=0)
+            lse_s /= lse_sum
+            attn_outputs *= lse_s.unsqueeze(-1).transpose(2, 3).squeeze(1)
+            attn_outputs_all.append(attn_outputs.sum(dim=0))
+
+        if return_lse:
+            return (torch.cat(attn_outputs_all, dim=0), torch.cat(logits_all, dim=-1))
+        else:
+            return torch.cat(attn_outputs_all, dim=0)
+
+    def _dual_chunk_flash_attn_decoding(
+        self,
+        query: torch.Tensor,
+        query_succ: torch.Tensor,
+        query_inter: torch.Tensor,
+        key_cache: torch.Tensor,
+        value_cache: torch.Tensor,
+        block_table: torch.Tensor,
+        cache_seqlens: torch.Tensor,
+        softmax_scale: float,
+        causal: bool,
+        chunk_size: int,
+        local_size: int,
+        original_max_position_embeddings: int,
+        decode_meta: DualChunkFlashAttentionMetadata,
+    ):
+        if not causal:
+            raise ValueError("Dual Chunk Attention does not support causal=False")
+
+        block_size = value_cache.shape[1]
+        chunk_len = chunk_size - local_size
+        if chunk_len % block_size != 0:
+            raise ValueError("chunk_len must be divisible by block_size.")
+        if original_max_position_embeddings > 0:
+            assert decode_meta.scaling_factor is not None
+            scaling_factor = decode_meta.scaling_factor
+            query = (query * scaling_factor.view(-1, 1, 1, 1)).to(
+                query.dtype
+            )  # possible for numerical issue, need to fused in the kernel
+            query_succ = (query_succ * scaling_factor.view(-1, 1, 1, 1)).to(query.dtype)
+            query_inter = (query_inter * scaling_factor.view(-1, 1, 1, 1)).to(
+                query.dtype
+            )
+        outputs_list = []
+        softmax_lses_list = []
+
+        # intra-attention
+        intra_output, intra_softmax_lse = (
+            self._dual_chunk_flash_attn_decoding_with_exp_sums(
+                query,
+                key_cache,
+                value_cache,
+                decode_meta.block_tables_intra,
+                decode_meta.seq_lens_intra,
+                softmax_scale,
+                causal=False,
+            )
+        )
+        outputs_list.append(intra_output)
+        softmax_lses_list.append(intra_softmax_lse)
+
+        # succ-attention
+        if decode_meta.max_seq_len_succ:
+            succ_output, succ_softmax_lse = (
+                self._dual_chunk_flash_attn_decoding_with_exp_sums(
+                    query_succ,
+                    key_cache,
+                    value_cache,
+                    decode_meta.block_tables_succ,
+                    decode_meta.seq_lens_succ,
+                    softmax_scale,
+                    causal=False,
+                )
+            )
+            outputs_list.append(succ_output)
+            softmax_lses_list.append(succ_softmax_lse)
+
+        # inter-attention
+        if decode_meta.max_seq_len_inter:
+            inter_output, inter_softmax_lse = (
+                self._dual_chunk_flash_attn_decoding_with_exp_sums(
+                    query_inter,
+                    key_cache,
+                    value_cache,
+                    block_table,
+                    decode_meta.seq_lens_inter,
+                    softmax_scale,
+                    causal=False,
+                )
+            )
+            outputs_list.append(inter_output)
+            softmax_lses_list.append(inter_softmax_lse)
+        outputs = torch.stack(outputs_list, dim=0)
+        del outputs_list
+        softmax_lses = torch.stack(softmax_lses_list, dim=0).to(torch.float32)
+        del softmax_lses_list
+        max_logits = torch.max(softmax_lses, dim=0).values
+        stable_logits = softmax_lses - max_logits.unsqueeze(0)
+        lse_s = torch.exp(stable_logits).detach()
+        lse_sum = torch.sum(lse_s, dim=0)
+        lse_s /= lse_sum
+        outputs *= lse_s.unsqueeze(-1).transpose(2, 3)
+        return outputs.sum(0)
+
+    def _dual_chunk_flash_attn_decoding_with_exp_sums(
+        self,
+        query: torch.Tensor,
+        key_cache: torch.Tensor,
+        value_cache: torch.Tensor,
+        block_table: torch.Tensor,
+        cache_seqlens: torch.Tensor,
+        softmax_scale: float,
+        causal: bool,
+    ):
+        out, softmax_lse, *rest_expand = flash_attn_with_kvcache(
+            q=query,
+            k_cache=key_cache,
+            v_cache=value_cache,
+            page_table=block_table,
+            cache_seqlens=cache_seqlens,
+            softmax_scale=softmax_scale,
+            causal=causal,
+            return_softmax_lse=True,
+        )
+        mask = cache_seqlens == 0
+        out[mask] = 0
+        softmax_lse[mask] = -float("inf")
+        return out, softmax_lse
+
+
+def _vertical_slash_sparse_attention(
+    query: torch.Tensor,  # [BATCH, N_HEADS, N_CTX, D_HEAD]
+    key: torch.Tensor,  # [BATCH, N_HEADS, N_KV_CTX, D_HEAD]
+    value: torch.Tensor,  # [BATCH, N_HEADS, N_KV_CTX, D_HEAD]
+    v_idx: torch.Tensor,  # [BATCH, N_HEADS, NNZ_V]
+    s_idx: torch.Tensor,  # [BATCH, N_HEADS, NNZ_S]
+    softmax_scale: float,
+    causal: bool = True,
+    stage: str = "intra",
+    block_size_M: int = 64,
+    block_size_N: int = 64,
+    vertical_indices_count: torch.Tensor = None,  # [N_HEADS,]
+    slash_indices_count: torch.Tensor = None,
+):
+    if stage == "intra":
+        assert causal
+    else:
+        assert not causal
+
+    batch_size, num_heads, context_size, head_dim = query.shape
+    _, _, kv_seq_len, _ = key.shape
+
+    if head_dim not in [16, 32, 64, 128, 256, 512]:
+        target_dim = 2 ** math.ceil(math.log2(head_dim)) - head_dim
+        query = F.pad(query, [0, target_dim, 0, 0, 0, 0, 0, 0])
+        key = F.pad(key, [0, target_dim, 0, 0, 0, 0, 0, 0])
+        value = F.pad(value, [0, target_dim, 0, 0, 0, 0, 0, 0])
+
+    v_idx = (
+        v_idx.to(torch.int32)
+        .reshape((batch_size, num_heads, -1))
+        .sort(dim=-1, descending=False)[0]
+    )
+    s_idx = (
+        s_idx.to(torch.int32)
+        .reshape((batch_size, num_heads, -1))
+        .sort(dim=-1, descending=True)[0]
+    )
+    q_seqlens = torch.tensor([context_size], dtype=torch.int32, device=query.device)
+    kv_seqlens = torch.tensor([kv_seq_len], dtype=torch.int32, device=query.device)
+
+    if vertical_indices_count is not None and slash_indices_count is not None:
+        (
+            block_count,
+            block_offset,
+            column_count,
+            column_index,
+        ) = convert_vertical_slash_indexes_mergehead(
+            q_seqlens,
+            kv_seqlens,
+            v_idx,
+            s_idx,
+            vertical_indices_count,
+            slash_indices_count,
+            context_size,
+            block_size_M,
+            block_size_N,
+            causal,
+        )
+    else:
+        (
+            block_count,
+            block_offset,
+            column_count,
+            column_index,
+        ) = convert_vertical_slash_indexes(
+            q_seqlens,
+            kv_seqlens,
+            v_idx,
+            s_idx,
+            context_size,
+            block_size_M,
+            block_size_N,
+            causal,
+        )
+
+    q = query.transpose(1, 2).contiguous()
+    k = key.transpose(1, 2).contiguous()
+    v = value.transpose(1, 2).contiguous()
+    out, lse = sparse_attn_func(
+        q,
+        k,
+        v,
+        block_count,
+        block_offset,
+        column_count,
+        column_index,
+        causal=causal,
+        softmax_scale=softmax_scale,
+        return_softmax_lse=True,
+    )
+    out = out.transpose(1, 2).contiguous()
+    softmax_lse = lse.reshape(*lse.shape, 1)
+    return (out[..., :context_size, :head_dim], softmax_lse[..., :context_size, :])
+
+
+def _sum_all_diagonal_matrix(mat: torch.tensor):
+    h, n, m = mat.shape
+    # Zero matrix used for padding
+    zero_mat = torch.zeros((h, n, n), device=mat.device)
+    # pads the matrix on left and right
+    mat_padded = torch.cat((zero_mat, mat, zero_mat), -1)
+    # Change the strides
+    mat_strided = mat_padded.as_strided(
+        (1, n, n + m), (n * (2 * n + m), 2 * n + m + 1, 1)
+    )
+    # Sums the resulting matrix's columns
+    sum_diags = torch.sum(mat_strided, 1)
+    return sum_diags[:, 1:]  # drop left bottom corner
+
+
+def _get_block(block_table: torch.Tensor, block_size: int, begin: int, end: int):
+    begin_block = begin // block_size
+    end_block = (end - 1) // block_size + 1
+    return block_table[begin_block:end_block]
diff --git a/sglang/python/sglang/srt/layers/attention/fla/__pycache__/chunk_delta_h.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/fla/__pycache__/chunk_delta_h.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c992afa88184fb746ef9bc69fc118eabc0789c43
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/fla/__pycache__/chunk_delta_h.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/fla/__pycache__/cumsum.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/fla/__pycache__/cumsum.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0c1650f08366601f57dcd3463e2bf0f91f1f935c
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/fla/__pycache__/cumsum.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/fla/__pycache__/fused_norm_gate.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/fla/__pycache__/fused_norm_gate.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bcf7c871d3f9812dbf06b26aee1a2c875dbaab17
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/fla/__pycache__/fused_norm_gate.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/fla/__pycache__/fused_recurrent.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/fla/__pycache__/fused_recurrent.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f76952005bc0d6cfa35789b17bd7c10ad0b97013
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/fla/__pycache__/fused_recurrent.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/fla/__pycache__/index.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/fla/__pycache__/index.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..48e9887ccc5d3347ec4b3f2ff9119a9942258535
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/fla/__pycache__/index.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/fla/__pycache__/kda.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/fla/__pycache__/kda.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..896852a029c88e968dae82149c5442f320f51680
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/fla/__pycache__/kda.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/fla/__pycache__/l2norm.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/fla/__pycache__/l2norm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d6215841dad8c8f636fb08d18d0f3137fbfe4ed7
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/fla/__pycache__/l2norm.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/fla/__pycache__/layernorm_gated.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/fla/__pycache__/layernorm_gated.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3c7fe0e0cb74dcb018f6ce90ed904462c2710d1b
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/fla/__pycache__/layernorm_gated.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/fla/__pycache__/op.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/fla/__pycache__/op.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7490d5d316d053f9b8d11254c3f2166e11beeac0
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/fla/__pycache__/op.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/fla/__pycache__/solve_tril.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/fla/__pycache__/solve_tril.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f6d3da4643257df295c05f6a5ddf420d1373dbe9
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/fla/__pycache__/solve_tril.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/fla/__pycache__/utils.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/fla/__pycache__/utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5f9f4521f24eedc9aca9d149e7621167f2f018aa
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/fla/__pycache__/utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/fla/chunk.py b/sglang/python/sglang/srt/layers/attention/fla/chunk.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7430f4f913ea66863bcc7ecbf289740dc82657a
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/fla/chunk.py
@@ -0,0 +1,244 @@
+# Adapted from https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/gated_delta_rule/chunk.py
+# -*- coding: utf-8 -*-
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+from typing import Optional
+
+import torch
+from einops import rearrange
+
+from sglang.srt.layers.attention.fla.chunk_delta_h import chunk_gated_delta_rule_fwd_h
+from sglang.srt.layers.attention.fla.chunk_o import chunk_fwd_o
+from sglang.srt.layers.attention.fla.chunk_scaled_dot_kkt import (
+    chunk_scaled_dot_kkt_fwd,
+)
+from sglang.srt.layers.attention.fla.cumsum import chunk_local_cumsum
+from sglang.srt.layers.attention.fla.l2norm import l2norm_fwd
+from sglang.srt.layers.attention.fla.solve_tril import solve_tril
+from sglang.srt.layers.attention.fla.utils import (
+    SUPPRESS_LEVEL,
+    autocast_custom_fwd,
+    input_guard,
+)
+from sglang.srt.layers.attention.fla.wy_fast import recompute_w_u_fwd
+
+
+def chunk_gated_delta_rule_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    beta: torch.Tensor,
+    scale: float,
+    initial_state: torch.Tensor,
+    initial_state_indices: torch.Tensor,
+    cu_seqlens: Optional[torch.LongTensor] = None,
+):
+    g = chunk_local_cumsum(g, chunk_size=64, cu_seqlens=cu_seqlens)
+    # obtain WY representation. u is actually the new v.
+    A = chunk_scaled_dot_kkt_fwd(
+        k=k, beta=beta, g_cumsum=g, cu_seqlens=cu_seqlens, output_dtype=torch.float32
+    )
+    A = solve_tril(A=A, cu_seqlens=cu_seqlens, output_dtype=k.dtype)
+    w, u = recompute_w_u_fwd(
+        k=k,
+        v=v,
+        beta=beta,
+        A=A,
+        g_cumsum=g,
+        cu_seqlens=cu_seqlens,
+    )
+    h, v_new = chunk_gated_delta_rule_fwd_h(
+        k=k,
+        w=w,
+        u=u,
+        g=g,
+        initial_state=initial_state,
+        initial_state_indices=initial_state_indices,
+        cu_seqlens=cu_seqlens,
+    )
+    o = chunk_fwd_o(
+        q=q,
+        k=k,
+        v=v_new,
+        h=h,
+        g=g,
+        scale=scale,
+        cu_seqlens=cu_seqlens,
+    )
+    if SUPPRESS_LEVEL < 3:
+        return g, o, A, None, h, None
+    elif SUPPRESS_LEVEL >= 3:
+        return g, o, A, w, h, v_new
+
+
+class ChunkGatedDeltaRuleFunction(torch.autograd.Function):
+
+    @staticmethod
+    @input_guard
+    @autocast_custom_fwd
+    def forward(
+        ctx,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        g: torch.Tensor,
+        beta: torch.Tensor,
+        scale: float,
+        initial_state: torch.Tensor,
+        initial_state_indices: torch.Tensor,
+        cu_seqlens: Optional[torch.LongTensor] = None,
+        use_qk_l2norm_in_kernel: bool = False,
+    ):
+        q_orig = q
+        k_orig = k
+
+        if use_qk_l2norm_in_kernel:
+            q = l2norm_fwd(q)
+            k = l2norm_fwd(k)
+
+        g, o, A, w, h, v_new = chunk_gated_delta_rule_fwd(
+            q=q,
+            k=k,
+            v=v,
+            g=g,
+            beta=beta,
+            scale=scale,
+            initial_state=initial_state,
+            initial_state_indices=initial_state_indices,
+            cu_seqlens=cu_seqlens,
+        )
+        return o.to(q.dtype), h
+
+
+@torch.compiler.disable
+def chunk_gated_delta_rule(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    beta: torch.Tensor,
+    scale: float = None,
+    initial_state: torch.Tensor = None,
+    initial_state_indices: torch.Tensor = None,
+    cu_seqlens: Optional[torch.LongTensor] = None,
+    head_first: bool = False,
+    use_qk_l2norm_in_kernel: bool = False,
+):
+    r"""
+    Args:
+        q (torch.Tensor):
+            queries of shape `[B, T, H, K]` if `head_first=False` else `[B, H, T, K]`.
+        k (torch.Tensor):
+            keys of shape `[B, T, H, K]` if `head_first=False` else `[B, H, T, K]`.
+        v (torch.Tensor):
+            values of shape `[B, T, H, V]` if `head_first=False` else `[B, H, T, V]`.
+        g (torch.Tensor):
+            (forget) gating tensor (in log space!) of shape `[B, T, H]` if `head_first=False` else `[B, H, T]`.
+        beta (torch.Tensor):
+            betas of shape `[B, T, H]` if `head_first=False` else `[B, H, T]`.
+        scale (Optional[int]):
+            Scale factor for the RetNet attention scores.
+            If not provided, it will default to `1 / sqrt(K)`. Default: `None`.
+        initial_state (Optional[torch.Tensor]):
+            Initial state of shape `[N, H, K, V]` for `N` input sequences.
+            For equal-length input sequences, `N` equals the batch size `B`.
+            Default: `None`.
+        output_final_state (Optional[bool]):
+            Whether to output the final state of shape `[N, H, K, V]`. Default: `False`.
+        cu_seqlens (torch.LongTensor):
+            Cumulative sequence lengths of shape `[N+1]` used for variable-length training,
+            consistent with the FlashAttention API.
+        head_first (Optional[bool]):
+            Whether the inputs are in the head-first format, which is not supported for variable-length inputs.
+            Default: `False`.
+
+    Returns:
+        o (torch.Tensor):
+            Outputs of shape `[B, T, H, V]` if `head_first=False` else `[B, H, T, V]`.
+        final_state (torch.Tensor):
+            Final state of shape `[N, H, K, V]` if `output_final_state=True` else `None`.
+
+    Examples::
+        >>> import torch
+        >>> import torch.nn.functional as F
+        >>> from einops import rearrange
+        >>> from fla.ops.gated_delta_rule import chunk_gated_delta_rule
+        # inputs with equal lengths
+        >>> B, T, H, K, V = 4, 2048, 4, 512, 512
+        >>> q = torch.randn(B, T, H, K, dtype=torch.bfloat16, device='cuda')
+        >>> k = F.normalize(torch.randn(B, T, H, K, dtype=torch.bfloat16, device='cuda'), p=2, dim=-1)
+        >>> v = torch.randn(B, T, H, V, dtype=torch.bfloat16, device='cuda')
+        >>> beta = torch.rand(B, T, H, dtype=torch.bfloat16, device='cuda').sigmoid()
+        >>> g = F.logsigmoid(torch.rand(B, T, H, dtype=torch.bfloat16, device='cuda'))
+        >>> h0 = torch.randn(B, H, K, V, dtype=torch.bfloat16, device='cuda')
+        >>> o, ht = chunk_gated_delta_rule(
+            q, k, v, g, beta,
+            initial_state=h0,
+            output_final_state=True
+        )
+        # for variable-length inputs, the batch size `B` is expected to be 1 and `cu_seqlens` is required
+        >>> q, k, v, beta, g = map(lambda x: rearrange(x, 'b t ... -> 1 (b t) ...'), (q, k, v, beta, g))
+        # for a batch with 4 sequences, `cu_seqlens` with 5 start/end positions are expected
+        >>> cu_seqlens = q.new_tensor([0, 2048, 4096, 6144, 8192], dtype=torch.long)
+        >>> o_var, ht_var = chunk_gated_delta_rule(
+            q, k, v, g, beta,
+            initial_state=h0,
+            output_final_state=True,
+            cu_seqlens=cu_seqlens
+        )
+    """
+    assert q.dtype == k.dtype == v.dtype
+    assert (
+        q.dtype != torch.float32
+    ), "ChunkGatedDeltaRuleFunction does not support float32. Please use bfloat16."
+    assert (
+        len(beta.shape) == 3
+    ), "beta must be of shape [B, T, H] if head_first=False, or [B, H, T] otherwise."
+
+    if head_first:
+        raise DeprecationWarning(
+            "head_first is deprecated and will be removed in a future version. "
+            "Please use head_first=False for now instead."
+        )
+        q, k, v, beta, g = map(
+            lambda x: rearrange(x, "b h t ... -> b t h ..."), (q, k, v, beta, g)
+        )
+    # if not head_first and q.shape[1] < q.shape[2]:
+    #     warnings.warn(
+    #         f"Input tensor shape suggests potential format mismatch: seq_len ({q.shape[1]}) < num_heads ({q.shape[2]}). "
+    #         "This may indicate the inputs were passed in head-first format [B, H, T, ...] "
+    #         "when head_first=False was specified. "
+    #         "Please verify your input tensor format matches the expected shape [B, T, H, ...]."
+    #     )
+    if cu_seqlens is not None:
+        if q.shape[0] != 1:
+            raise ValueError(
+                f"The batch size is expected to be 1 rather than {q.shape[0]} when using `cu_seqlens`."
+                f"Please flatten variable-length inputs before processing."
+            )
+        if (
+            initial_state_indices is not None
+            and initial_state_indices.shape[0] != len(cu_seqlens) - 1
+        ):
+            raise ValueError(
+                f"The number of initial states is expected to be equal to the number of input sequences, "
+                f"i.e., {len(cu_seqlens) - 1} rather than {initial_state_indices.shape[0]}."
+            )
+    if scale is None:
+        scale = k.shape[-1] ** -0.5
+    o, h = ChunkGatedDeltaRuleFunction.apply(
+        q,
+        k,
+        v,
+        g,
+        beta,
+        scale,
+        initial_state,
+        initial_state_indices,
+        cu_seqlens,
+        use_qk_l2norm_in_kernel,
+    )
+    if head_first:
+        o = rearrange(o, "b t h ... -> b h t ...")
+    return o, None, h
diff --git a/sglang/python/sglang/srt/layers/attention/fla/chunk_delta_h.py b/sglang/python/sglang/srt/layers/attention/fla/chunk_delta_h.py
new file mode 100644
index 0000000000000000000000000000000000000000..38a7c8f297e3a555cf56040b90044ecb1ef067c3
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/fla/chunk_delta_h.py
@@ -0,0 +1,340 @@
+# Adapted from https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/common/chunk_delta_h.py
+# -*- coding: utf-8 -*-
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+from typing import Optional, Tuple
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.layers.attention.fla.index import (
+    prepare_chunk_indices,
+    prepare_chunk_offsets,
+)
+from sglang.srt.layers.attention.fla.op import exp, safe_exp
+from sglang.srt.layers.attention.fla.utils import is_nvidia_hopper
+
+NUM_WARPS = [2, 4] if is_nvidia_hopper else [2, 4, 8, 16]
+CHUNK_SIZE = 64
+
+
+# @triton.autotune(
+#     configs=[
+#         triton.Config({"BV": BV}, num_warps=num_warps, num_stages=num_stages)
+#         for num_warps in [2, 4]
+#         for num_stages in [2, 3, 4]
+#         for BV in [32, 64]
+#     ],
+#     key=["H", "K", "V", "BT", "USE_G"],
+#     use_cuda_graph=use_cuda_graph,
+# )
+@triton.jit(do_not_specialize=["T"])
+def chunk_gated_delta_rule_fwd_kernel_h_blockdim64(
+    k,
+    v,
+    w,
+    v_new,
+    g,
+    gk,
+    h,
+    initial_state,
+    initial_state_indices,
+    cu_seqlens,
+    chunk_offsets,
+    T,
+    H: tl.constexpr,
+    Hg: tl.constexpr,
+    K: tl.constexpr,
+    V: tl.constexpr,
+    BT: tl.constexpr,
+    BV: tl.constexpr,
+    USE_G: tl.constexpr,
+    USE_GK: tl.constexpr,
+    USE_INITIAL_STATE: tl.constexpr,
+    INPLACE_UPDATE: tl.constexpr,
+    SAVE_NEW_VALUE: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+):
+    i_v, i_nh = tl.program_id(0), tl.program_id(1)
+    i_n, i_h = i_nh // H, i_nh % H
+    if IS_VARLEN:
+        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(
+            cu_seqlens + i_n + 1
+        ).to(tl.int32)
+        T = eos - bos
+        NT = tl.cdiv(T, BT)
+        boh = tl.load(chunk_offsets + i_n).to(tl.int32)
+    else:
+        bos, eos = i_n * T, i_n * T + T
+        NT = tl.cdiv(T, BT)
+        boh = i_n * NT
+
+    # [BK, BV]
+    b_h1 = tl.zeros([64, BV], dtype=tl.float32)
+    if K > 64:
+        b_h2 = tl.zeros([64, BV], dtype=tl.float32)
+    if K > 128:
+        b_h3 = tl.zeros([64, BV], dtype=tl.float32)
+    if K > 192:
+        b_h4 = tl.zeros([64, BV], dtype=tl.float32)
+
+    # calculate offset
+    h += ((boh * H + i_h) * K * V).to(tl.int64)
+    v += ((bos * H + i_h) * V).to(tl.int64)
+    k += ((bos * Hg + i_h // (H // Hg)) * K).to(tl.int64)
+    w += ((bos * H + i_h) * K).to(tl.int64)
+    if SAVE_NEW_VALUE:
+        v_new += ((bos * H + i_h) * V).to(tl.int64)
+    stride_v = H * V
+    stride_h = H * K * V
+    stride_k = Hg * K
+    stride_w = H * K
+
+    index = tl.load(initial_state_indices + i_n).to(tl.int32)
+    h0 = initial_state + index * stride_h
+    ht = initial_state + index * stride_h
+    if USE_INITIAL_STATE:
+        h0 = h0 + i_h * K * V
+    if INPLACE_UPDATE:
+        ht = ht + i_h * K * V
+
+    # load initial state
+    if USE_INITIAL_STATE:
+        p_h0_1 = tl.make_block_ptr(h0, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0))
+        b_h1 += tl.load(p_h0_1, boundary_check=(0, 1)).to(tl.float32)
+        if K > 64:
+            p_h0_2 = tl.make_block_ptr(
+                h0, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0)
+            )
+            b_h2 += tl.load(p_h0_2, boundary_check=(0, 1)).to(tl.float32)
+        if K > 128:
+            p_h0_3 = tl.make_block_ptr(
+                h0, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0)
+            )
+            b_h3 += tl.load(p_h0_3, boundary_check=(0, 1)).to(tl.float32)
+        if K > 192:
+            p_h0_4 = tl.make_block_ptr(
+                h0, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0)
+            )
+            b_h4 += tl.load(p_h0_4, boundary_check=(0, 1)).to(tl.float32)
+
+    # main recurrence
+    for i_t in range(NT):
+        p_h1 = tl.make_block_ptr(
+            h + i_t * stride_h, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0)
+        )
+        tl.store(p_h1, b_h1.to(p_h1.dtype.element_ty), boundary_check=(0, 1))
+        if K > 64:
+            p_h2 = tl.make_block_ptr(
+                h + i_t * stride_h, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0)
+            )
+            tl.store(p_h2, b_h2.to(p_h2.dtype.element_ty), boundary_check=(0, 1))
+        if K > 128:
+            p_h3 = tl.make_block_ptr(
+                h + i_t * stride_h, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0)
+            )
+            tl.store(p_h3, b_h3.to(p_h3.dtype.element_ty), boundary_check=(0, 1))
+        if K > 192:
+            p_h4 = tl.make_block_ptr(
+                h + i_t * stride_h, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0)
+            )
+            tl.store(p_h4, b_h4.to(p_h4.dtype.element_ty), boundary_check=(0, 1))
+
+        p_w = tl.make_block_ptr(
+            w, (T, K), (stride_w, 1), (i_t * BT, 0), (BT, 64), (1, 0)
+        )
+        b_w = tl.load(p_w, boundary_check=(0, 1))
+        b_v = tl.dot(b_w, b_h1.to(b_w.dtype))
+        if K > 64:
+            p_w = tl.make_block_ptr(
+                w, (T, K), (stride_w, 1), (i_t * BT, 64), (BT, 64), (1, 0)
+            )
+            b_w = tl.load(p_w, boundary_check=(0, 1))
+            b_v += tl.dot(b_w, b_h2.to(b_w.dtype))
+        if K > 128:
+            p_w = tl.make_block_ptr(
+                w, (T, K), (stride_w, 1), (i_t * BT, 128), (BT, 64), (1, 0)
+            )
+            b_w = tl.load(p_w, boundary_check=(0, 1))
+            b_v += tl.dot(b_w, b_h3.to(b_w.dtype))
+        if K > 192:
+            p_w = tl.make_block_ptr(
+                w, (T, K), (stride_w, 1), (i_t * BT, 192), (BT, 64), (1, 0)
+            )
+            b_w = tl.load(p_w, boundary_check=(0, 1))
+            b_v += tl.dot(b_w, b_h4.to(b_w.dtype))
+        p_v = tl.make_block_ptr(
+            v, (T, V), (stride_v, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)
+        )
+        b_v = tl.load(p_v, boundary_check=(0, 1)) - b_v
+
+        if SAVE_NEW_VALUE:
+            p_v = tl.make_block_ptr(
+                v_new, (T, V), (stride_v, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)
+            )
+            tl.store(p_v, b_v.to(p_v.dtype.element_ty), boundary_check=(0, 1))
+
+        last_idx = min((i_t + 1) * BT, T) - 1
+        if USE_G:
+            b_g_last = tl.load(g + bos * H + last_idx * H + i_h)
+            p_g = tl.make_block_ptr(
+                g + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,)
+            )
+            b_g = tl.load(p_g, boundary_check=(0,))
+            b_v = b_v * safe_exp(b_g_last - b_g)[:, None]
+            b_g_last = exp(b_g_last)
+            b_h1 = b_h1 * b_g_last
+            if K > 64:
+                b_h2 = b_h2 * b_g_last
+            if K > 128:
+                b_h3 = b_h3 * b_g_last
+            if K > 192:
+                b_h4 = b_h4 * b_g_last
+
+        if USE_GK:
+            o_k1 = tl.arange(0, 64)
+            b_gk_last1 = tl.load(
+                gk + (bos + last_idx) * H * K + i_h * K + o_k1,
+                mask=(o_k1 < K),
+                other=0.0,
+            )
+            b_h1 *= exp(b_gk_last1)[:, None]
+            if K > 64:
+                o_k2 = 64 + o_k1
+                b_gk_last2 = tl.load(
+                    gk + (bos + last_idx) * H * K + i_h * K + o_k2,
+                    mask=(o_k2 < K),
+                    other=0.0,
+                )
+                b_h2 *= exp(b_gk_last2)[:, None]
+            if K > 128:
+                o_k3 = 128 + o_k1
+                b_gk_last3 = tl.load(
+                    gk + (bos + last_idx) * H * K + i_h * K + o_k3,
+                    mask=(o_k3 < K),
+                    other=0.0,
+                )
+                b_h3 *= exp(b_gk_last3)[:, None]
+            if K > 192:
+                o_k4 = 192 + o_k1
+                b_gk_last4 = tl.load(
+                    gk + (bos + last_idx) * H * K + i_h * K + o_k4,
+                    mask=(o_k4 < K),
+                    other=0.0,
+                )
+                b_h4 *= exp(b_gk_last4)[:, None]
+        b_v = b_v.to(k.dtype.element_ty)
+
+        p_k = tl.make_block_ptr(
+            k, (K, T), (1, stride_k), (0, i_t * BT), (64, BT), (0, 1)
+        )
+        b_k = tl.load(p_k, boundary_check=(0, 1))
+        b_h1 += tl.dot(b_k, b_v)
+        if K > 64:
+            p_k = tl.make_block_ptr(
+                k, (K, T), (1, stride_k), (64, i_t * BT), (64, BT), (0, 1)
+            )
+            b_k = tl.load(p_k, boundary_check=(0, 1))
+            b_h2 += tl.dot(b_k, b_v)
+        if K > 128:
+            p_k = tl.make_block_ptr(
+                k, (K, T), (1, stride_k), (128, i_t * BT), (64, BT), (0, 1)
+            )
+            b_k = tl.load(p_k, boundary_check=(0, 1))
+            b_h3 += tl.dot(b_k, b_v)
+        if K > 192:
+            p_k = tl.make_block_ptr(
+                k, (K, T), (1, stride_k), (192, i_t * BT), (64, BT), (0, 1)
+            )
+            b_k = tl.load(p_k, boundary_check=(0, 1))
+            b_h4 += tl.dot(b_k, b_v)
+
+    # epilogue
+    if INPLACE_UPDATE:
+        p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0))
+        tl.store(p_ht, b_h1.to(p_ht.dtype.element_ty), boundary_check=(0, 1))
+        if K > 64:
+            p_ht = tl.make_block_ptr(
+                ht, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0)
+            )
+            tl.store(p_ht, b_h2.to(p_ht.dtype.element_ty), boundary_check=(0, 1))
+        if K > 128:
+            p_ht = tl.make_block_ptr(
+                ht, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0)
+            )
+            tl.store(p_ht, b_h3.to(p_ht.dtype.element_ty), boundary_check=(0, 1))
+        if K > 192:
+            p_ht = tl.make_block_ptr(
+                ht, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0)
+            )
+            tl.store(p_ht, b_h4.to(p_ht.dtype.element_ty), boundary_check=(0, 1))
+
+
+def chunk_gated_delta_rule_fwd_h(
+    k: torch.Tensor,
+    w: torch.Tensor,
+    u: torch.Tensor,
+    g: Optional[torch.Tensor] = None,
+    gk: Optional[torch.Tensor] = None,
+    initial_state: Optional[torch.Tensor] = None,
+    initial_state_indices: Optional[torch.Tensor] = None,
+    save_new_value: bool = True,
+    cu_seqlens: Optional[torch.LongTensor] = None,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    B, T, Hg, K, V = *k.shape, u.shape[-1]
+    H = u.shape[-2]
+    BT = CHUNK_SIZE
+
+    chunk_indices = (
+        prepare_chunk_indices(cu_seqlens, CHUNK_SIZE)
+        if cu_seqlens is not None
+        else None
+    )
+    # N: the actual number of sequences in the batch with either equal or variable lengths
+    if cu_seqlens is None:
+        N, NT, chunk_offsets = B, triton.cdiv(T, BT), None
+    else:
+        N, NT, chunk_offsets = (
+            len(cu_seqlens) - 1,
+            len(chunk_indices),
+            prepare_chunk_offsets(cu_seqlens, BT),
+        )
+    assert K <= 256, "current kernel does not support head dimension larger than 256."
+
+    h = k.new_empty(B, NT, H, K, V)
+
+    v_new = torch.empty_like(u) if save_new_value else None
+
+    def grid(meta):
+        return (triton.cdiv(V, meta["BV"]), N * H)
+
+    chunk_gated_delta_rule_fwd_kernel_h_blockdim64[grid](
+        k=k,
+        v=u,
+        w=w,
+        v_new=v_new,
+        g=g,
+        gk=gk,
+        h=h,
+        initial_state=initial_state,
+        initial_state_indices=initial_state_indices,
+        cu_seqlens=cu_seqlens,
+        chunk_offsets=chunk_offsets,
+        T=T,
+        H=H,
+        Hg=Hg,
+        K=K,
+        V=V,
+        BT=BT,
+        BV=32,
+        USE_G=g is not None,
+        USE_GK=gk is not None,
+        USE_INITIAL_STATE=initial_state is not None,
+        INPLACE_UPDATE=True,
+        SAVE_NEW_VALUE=v_new is not None,
+        IS_VARLEN=cu_seqlens is not None,
+        num_warps=4,
+        num_stages=2,
+    )
+    return h, v_new
diff --git a/sglang/python/sglang/srt/layers/attention/fla/chunk_o.py b/sglang/python/sglang/srt/layers/attention/fla/chunk_o.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb89421eb87290ac295a95c91332c517327e6778
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/fla/chunk_o.py
@@ -0,0 +1,174 @@
+# Adapted from https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/common/chunk_o.py
+# -*- coding: utf-8 -*-
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+from typing import Optional
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.layers.attention.fla.index import prepare_chunk_indices
+from sglang.srt.layers.attention.fla.op import exp, safe_exp
+from sglang.srt.layers.attention.fla.utils import check_shared_mem, is_nvidia_hopper
+
+BKV_LIST = [64, 128] if check_shared_mem() else [32, 64]
+NUM_WARPS = [2, 4] if is_nvidia_hopper else [2, 4, 8]
+
+
+# @triton.autotune(
+#     configs=[
+#         triton.Config({"BK": BK, "BV": BV}, num_warps=num_warps, num_stages=num_stages)
+#         for BK in BKV_LIST
+#         for BV in BKV_LIST
+#         for num_warps in NUM_WARPS
+#         for num_stages in [2, 3, 4]
+#     ],
+#     key=["H", "K", "V", "BT"],
+# )
+@triton.jit(do_not_specialize=["T"])
+def chunk_fwd_kernel_o(
+    q,
+    k,
+    v,
+    h,
+    g,
+    o,
+    cu_seqlens,
+    chunk_indices,
+    scale,
+    T,
+    H: tl.constexpr,
+    Hg: tl.constexpr,
+    K: tl.constexpr,
+    V: tl.constexpr,
+    BT: tl.constexpr,
+    BK: tl.constexpr,
+    BV: tl.constexpr,
+    USE_G: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+):
+    i_v, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+    i_b, i_h = i_bh // H, i_bh % H
+
+    if IS_VARLEN:
+        i_tg = i_t
+        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(
+            chunk_indices + i_t * 2 + 1
+        ).to(tl.int32)
+        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(
+            cu_seqlens + i_n + 1
+        ).to(tl.int32)
+        T = eos - bos
+        NT = tl.cdiv(T, BT)
+    else:
+        NT = tl.cdiv(T, BT)
+        i_tg = i_b * NT + i_t
+        bos, eos = i_b * T, i_b * T + T
+
+    # offset calculation
+    q += (bos * Hg + i_h // (H // Hg)) * K
+    k += (bos * Hg + i_h // (H // Hg)) * K
+    v += (bos * H + i_h) * V
+    o += (bos * H + i_h) * V
+    h += (i_tg * H + i_h).to(tl.int64) * K * V
+
+    b_o = tl.zeros([BT, BV], dtype=tl.float32)
+    b_A = tl.zeros([BT, BT], dtype=tl.float32)
+
+    for i_k in range(tl.cdiv(K, BK)):
+        p_q = tl.make_block_ptr(
+            q, (T, K), (Hg * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)
+        )
+        p_k = tl.make_block_ptr(
+            k, (K, T), (1, Hg * K), (i_k * BK, i_t * BT), (BK, BT), (0, 1)
+        )
+        p_h = tl.make_block_ptr(
+            h, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0)
+        )
+        # [BT, BK]
+        b_q = tl.load(p_q, boundary_check=(0, 1))
+        # [BK, BT]
+        b_k = tl.load(p_k, boundary_check=(0, 1))
+        # [BK, BV]
+        b_h = tl.load(p_h, boundary_check=(0, 1))
+
+        # [BT, BK] @ [BK, BV] -> [BT, BV]
+        b_o += tl.dot(b_q, b_h)
+        # [BT, BK] @ [BK, BT] -> [BT, BT]
+        b_A += tl.dot(b_q, b_k)
+
+    if USE_G:
+        g += bos * H + i_h
+        p_g = tl.make_block_ptr(g, (T,), (H,), (i_t * BT,), (BT,), (0,))
+        b_g = tl.load(p_g, boundary_check=(0,))
+        b_o = b_o * exp(b_g)[:, None]
+        b_A = b_A * safe_exp(b_g[:, None] - b_g[None, :])
+
+    o_i = tl.arange(0, BT)
+    m_A = o_i[:, None] >= o_i[None, :]
+    b_A = tl.where(m_A, b_A, 0)
+
+    p_v = tl.make_block_ptr(
+        v, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)
+    )
+    p_o = tl.make_block_ptr(
+        o, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)
+    )
+    b_v = tl.load(p_v, boundary_check=(0, 1))
+
+    # to fix mma -> mma layout conversion
+    # already solved by triton v3.2 or higher
+    b_o = b_o * scale + tl.dot(b_A.to(b_v.dtype), b_v) * scale
+    tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1))
+
+
+def chunk_fwd_o(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    h: torch.Tensor,
+    g: Optional[torch.Tensor] = None,  # cumsum of log decay
+    scale: Optional[float] = None,
+    cu_seqlens: Optional[torch.LongTensor] = None,
+    chunk_size: int = 64,
+) -> torch.Tensor:
+    B, T, Hg, K, V = *q.shape, v.shape[-1]
+    H = v.shape[-2]
+    BT = min(chunk_size, max(16, triton.next_power_of_2(T)))
+    chunk_indices = (
+        prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
+    )
+    NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
+    if scale is None:
+        scale = k.shape[-1] ** -0.5
+
+    o = torch.zeros_like(v)
+
+    def grid(meta):
+        return (triton.cdiv(V, meta["BV"]), NT, B * H)
+
+    chunk_fwd_kernel_o[grid](
+        q,
+        k,
+        v,
+        h,
+        g,
+        o,
+        cu_seqlens,
+        chunk_indices,
+        scale,
+        T=T,
+        H=H,
+        Hg=Hg,
+        K=K,
+        V=V,
+        BT=BT,
+        BK=128,
+        BV=64,
+        USE_G=g is not None,
+        IS_VARLEN=cu_seqlens is not None,
+        num_warps=4,
+        num_stages=2,
+    )
+    return o
diff --git a/sglang/python/sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py b/sglang/python/sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py
new file mode 100644
index 0000000000000000000000000000000000000000..e1059ac297e569160d604e65732c79ea887dc420
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py
@@ -0,0 +1,147 @@
+# Adapted from https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/common/chunk_scaled_dot_kkt.py
+# -*- coding: utf-8 -*-
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+from typing import Optional
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.layers.attention.fla.index import prepare_chunk_indices
+from sglang.srt.layers.attention.fla.op import safe_exp
+
+
+# @triton.autotune(
+#     configs=[
+#         triton.Config({"BK": BK}, num_warps=num_warps, num_stages=num_stages)
+#         for BK in [32, 64, 128]
+#         for num_warps in [2, 4, 8]
+#         for num_stages in [2, 3, 4]
+#     ],
+#     key=["H", "K", "BT", "IS_VARLEN"],
+# )
+@triton.jit(do_not_specialize=["T"])
+def chunk_scaled_dot_kkt_fwd_kernel(
+    k,
+    beta,
+    g_cumsum,
+    A,
+    cu_seqlens,
+    chunk_indices,
+    T,
+    H: tl.constexpr,
+    Hg: tl.constexpr,
+    K: tl.constexpr,
+    BT: tl.constexpr,
+    BK: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+    USE_G: tl.constexpr,
+):
+    i_t, i_bh = tl.program_id(0), tl.program_id(1)
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(
+            chunk_indices + i_t * 2 + 1
+        ).to(tl.int32)
+        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(
+            cu_seqlens + i_n + 1
+        ).to(tl.int32)
+        T = eos - bos
+    else:
+        bos, eos = i_b * T, i_b * T + T
+    o_t = tl.arange(0, BT)
+
+    p_beta = tl.make_block_ptr(
+        beta + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,)
+    )
+    b_beta = tl.load(p_beta, boundary_check=(0,))
+
+    b_A = tl.zeros([BT, BT], dtype=tl.float32)
+    for i_k in range(tl.cdiv(K, BK)):
+        p_k = tl.make_block_ptr(
+            k + (bos * Hg + i_h // (H // Hg)) * K,
+            (T, K),
+            (Hg * K, 1),
+            (i_t * BT, i_k * BK),
+            (BT, BK),
+            (1, 0),
+        )
+        b_k = tl.load(p_k, boundary_check=(0, 1))
+        b_A += tl.dot(b_k, tl.trans(b_k))
+
+    if USE_G:
+        p_g = tl.make_block_ptr(
+            g_cumsum + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,)
+        )
+        b_g = tl.load(p_g, boundary_check=(0,))
+        b_g_diff = b_g[:, None] - b_g[None, :]
+        b_A = b_A * safe_exp(b_g_diff)
+
+    b_A *= b_beta[:, None]
+    b_A = tl.where(o_t[:, None] > o_t[None, :], b_A, 0)
+    p_A = tl.make_block_ptr(
+        A + (bos * H + i_h) * BT, (T, BT), (BT * H, 1), (i_t * BT, 0), (BT, BT), (1, 0)
+    )
+    tl.store(p_A, b_A.to(p_A.dtype.element_ty), boundary_check=(0, 1))
+
+
+def chunk_scaled_dot_kkt_fwd(
+    k: torch.Tensor,
+    beta: torch.Tensor,
+    g_cumsum: Optional[torch.Tensor] = None,
+    cu_seqlens: Optional[torch.LongTensor] = None,
+    chunk_size: int = 64,
+    output_dtype: torch.dtype = torch.float32,
+) -> torch.Tensor:
+    r"""
+    Compute beta * K * K^T.
+
+    Args:
+        k (torch.Tensor):
+            The key tensor of shape `[B, T, H, K]`.
+        beta (torch.Tensor):
+            The beta tensor of shape `[B, T, H]`.
+        g_cumsum (torch.Tensor):
+            The cumulative sum of the gate tensor of shape `[B, T, H]`.
+            Default: None
+        cu_seqlens (torch.LongTensor):
+            The cumulative sequence lengths of the input tensor.
+            Default: None
+        chunk_size (int):
+            The chunk size. Default: 64.
+        output_dtype (torch.dtype):
+            The dtype of the output tensor. Default: `torch.float32`
+
+    Returns:
+        beta * K * K^T of shape `[B, T, H, BT]` where `BT` is the chunk size.
+    """
+
+    B, T, Hg, K = k.shape
+
+    H = beta.shape[-1]
+    BT = chunk_size
+    chunk_indices = (
+        prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
+    )
+    NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
+    A = torch.empty(B, T, H, BT, device=k.device, dtype=output_dtype)
+    chunk_scaled_dot_kkt_fwd_kernel[(NT, B * H)](
+        k=k,
+        beta=beta,
+        g_cumsum=g_cumsum,
+        A=A,
+        cu_seqlens=cu_seqlens,
+        chunk_indices=chunk_indices,
+        T=T,
+        H=H,
+        Hg=Hg,
+        K=K,
+        BT=BT,
+        BK=64,
+        IS_VARLEN=cu_seqlens is not None,
+        USE_G=g_cumsum is not None,
+        num_warps=8,
+        num_stages=3,
+    )
+    return A
diff --git a/sglang/python/sglang/srt/layers/attention/fla/cumsum.py b/sglang/python/sglang/srt/layers/attention/fla/cumsum.py
new file mode 100644
index 0000000000000000000000000000000000000000..39d2f472277821c3215eb06c94e628fd88d39d0f
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/fla/cumsum.py
@@ -0,0 +1,292 @@
+# Adapt from https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/utils/cumsum.py
+# -*- coding: utf-8 -*-
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+from typing import Optional
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.layers.attention.fla.index import prepare_chunk_indices
+from sglang.srt.layers.attention.fla.utils import check_shared_mem, input_guard
+
+BS_LIST = [32, 64] if check_shared_mem() else [16, 32]
+
+
+# @triton.autotune(
+#     configs=[triton.Config({}, num_warps=num_warps) for num_warps in [1, 2, 4, 8]],
+#     key=["B", "H", "BT", "IS_VARLEN", "REVERSE"],
+# )
+@triton.jit(do_not_specialize=["T"])
+def chunk_local_cumsum_scalar_kernel(
+    s,
+    o,
+    scale,
+    cu_seqlens,
+    chunk_indices,
+    T,
+    B: tl.constexpr,
+    H: tl.constexpr,
+    BT: tl.constexpr,
+    REVERSE: tl.constexpr,
+    HAS_SCALE: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+    HEAD_FIRST: tl.constexpr,
+):
+    i_t, i_bh = tl.program_id(0), tl.program_id(1)
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(
+            chunk_indices + i_t * 2 + 1
+        ).to(tl.int32)
+        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(
+            cu_seqlens + i_n + 1
+        ).to(tl.int32)
+        T = eos - bos
+    else:
+        bos, eos = i_b * T, i_b * T + T
+
+    if HEAD_FIRST:
+        p_s = tl.make_block_ptr(
+            s + bos * H + i_h * T, (T,), (1,), (i_t * BT,), (BT,), (0,)
+        )
+        p_o = tl.make_block_ptr(
+            o + bos * H + i_h * T, (T,), (1,), (i_t * BT,), (BT,), (0,)
+        )
+    else:
+        p_s = tl.make_block_ptr(s + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,))
+        p_o = tl.make_block_ptr(o + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,))
+    # [BT]
+    b_s = tl.load(p_s, boundary_check=(0,)).to(tl.float32)
+    b_o = tl.cumsum(b_s, axis=0)
+    if REVERSE:
+        b_z = tl.sum(b_s, axis=0)
+        b_o = -b_o + b_z[None] + b_s
+    if HAS_SCALE:
+        b_o *= scale
+    tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0,))
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BS": BS}, num_warps=num_warps)
+        for BS in BS_LIST
+        for num_warps in [2, 4, 8]
+    ],
+    key=["B", "H", "S", "BT", "IS_VARLEN", "REVERSE", "HAS_SCALE"],
+)
+@triton.jit(do_not_specialize=["T"])
+def chunk_local_cumsum_vector_kernel(
+    s,
+    o,
+    scale,
+    cu_seqlens,
+    chunk_indices,
+    T,
+    B: tl.constexpr,
+    H: tl.constexpr,
+    S: tl.constexpr,
+    BT: tl.constexpr,
+    BS: tl.constexpr,
+    REVERSE: tl.constexpr,
+    HAS_SCALE: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+    HEAD_FIRST: tl.constexpr,
+):
+    i_s, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(
+            chunk_indices + i_t * 2 + 1
+        ).to(tl.int32)
+        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(
+            cu_seqlens + i_n + 1
+        ).to(tl.int32)
+        T = eos - bos
+    else:
+        bos, eos = i_b * T, i_b * T + T
+
+    o_i = tl.arange(0, BT)
+    if REVERSE:
+        m_s = tl.where(o_i[:, None] <= o_i[None, :], 1.0, 0.0)
+    else:
+        m_s = tl.where(o_i[:, None] >= o_i[None, :], 1.0, 0.0)
+
+    if HEAD_FIRST:
+        p_s = tl.make_block_ptr(
+            s + (bos * H + i_h * T) * S,
+            (T, S),
+            (S, 1),
+            (i_t * BT, i_s * BS),
+            (BT, BS),
+            (1, 0),
+        )
+        p_o = tl.make_block_ptr(
+            o + (bos * H + i_h * T) * S,
+            (T, S),
+            (S, 1),
+            (i_t * BT, i_s * BS),
+            (BT, BS),
+            (1, 0),
+        )
+    else:
+        p_s = tl.make_block_ptr(
+            s + (bos * H + i_h) * S,
+            (T, S),
+            (H * S, 1),
+            (i_t * BT, i_s * BS),
+            (BT, BS),
+            (1, 0),
+        )
+        p_o = tl.make_block_ptr(
+            o + (bos * H + i_h) * S,
+            (T, S),
+            (H * S, 1),
+            (i_t * BT, i_s * BS),
+            (BT, BS),
+            (1, 0),
+        )
+    # [BT, BS]
+    b_s = tl.load(p_s, boundary_check=(0, 1)).to(tl.float32)
+    b_o = tl.dot(m_s, b_s, allow_tf32=False)
+    if HAS_SCALE:
+        b_o *= scale
+    tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1))
+
+
+def chunk_local_cumsum_scalar(
+    g: torch.Tensor,
+    chunk_size: int,
+    reverse: bool = False,
+    scale: float = None,
+    cu_seqlens: Optional[torch.Tensor] = None,
+    head_first: bool = False,
+    output_dtype: Optional[torch.dtype] = torch.float,
+) -> torch.Tensor:
+    if head_first:
+        B, H, T = g.shape
+    else:
+        B, T, H = g.shape
+    assert chunk_size == 2 ** (
+        chunk_size.bit_length() - 1
+    ), "chunk_size must be a power of 2"
+    BT = chunk_size
+    chunk_indices = (
+        prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
+    )
+    NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
+    g_org, g = g, torch.empty_like(g, dtype=output_dtype or g.dtype)
+    grid = (NT, B * H)
+    chunk_local_cumsum_scalar_kernel[grid](
+        s=g_org,
+        o=g,
+        scale=scale,
+        cu_seqlens=cu_seqlens,
+        chunk_indices=chunk_indices,
+        T=T,
+        B=B,
+        H=H,
+        BT=BT,
+        HEAD_FIRST=head_first,
+        REVERSE=reverse,
+        HAS_SCALE=scale is not None,
+        IS_VARLEN=cu_seqlens is not None,
+        num_warps=8,
+        num_stages=3,
+    )
+    return g
+
+
+def chunk_local_cumsum_vector(
+    g: torch.Tensor,
+    chunk_size: int,
+    reverse: bool = False,
+    scale: float = None,
+    cu_seqlens: Optional[torch.Tensor] = None,
+    head_first: bool = False,
+    output_dtype: Optional[torch.dtype] = torch.float,
+) -> torch.Tensor:
+    if head_first:
+        B, H, T, S = g.shape
+    else:
+        B, T, H, S = g.shape
+    BT = chunk_size
+    chunk_indices = (
+        prepare_chunk_indices(cu_seqlens, chunk_size)
+        if cu_seqlens is not None
+        else None
+    )
+    NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
+    assert chunk_size == 2 ** (
+        chunk_size.bit_length() - 1
+    ), "chunk_size must be a power of 2"
+
+    g_org, g = g, torch.empty_like(g, dtype=output_dtype or g.dtype)
+
+    def grid(meta):
+        return (triton.cdiv(meta["S"], meta["BS"]), NT, B * H)
+
+    # keep cumulative normalizer in fp32
+    # this kernel is equivalent to
+    # g = g.view(B, H, NT, BT, -1).cumsum(-2).view(B, H, T, -1)
+    chunk_local_cumsum_vector_kernel[grid](
+        s=g_org,
+        o=g,
+        scale=scale,
+        cu_seqlens=cu_seqlens,
+        chunk_indices=chunk_indices,
+        T=T,
+        B=B,
+        H=H,
+        S=S,
+        BT=BT,
+        HEAD_FIRST=head_first,
+        REVERSE=reverse,
+        HAS_SCALE=scale is not None,
+        IS_VARLEN=cu_seqlens is not None,
+    )
+    return g
+
+
+@input_guard
+def chunk_local_cumsum(
+    g: torch.Tensor,
+    chunk_size: int,
+    reverse: bool = False,
+    scale: float = None,
+    cu_seqlens: Optional[torch.Tensor] = None,
+    head_first: bool = False,
+    output_dtype: Optional[torch.dtype] = torch.float,
+    **kwargs,
+) -> torch.Tensor:
+    if cu_seqlens is not None:
+        assert (
+            g.shape[0] == 1
+        ), "Only batch size 1 is supported when cu_seqlens are provided"
+    if len(g.shape) == 3:
+        return chunk_local_cumsum_scalar(
+            g=g,
+            chunk_size=chunk_size,
+            reverse=reverse,
+            scale=scale,
+            cu_seqlens=cu_seqlens,
+            head_first=head_first,
+            output_dtype=output_dtype,
+        )
+    elif len(g.shape) == 4:
+        return chunk_local_cumsum_vector(
+            g=g,
+            chunk_size=chunk_size,
+            reverse=reverse,
+            scale=scale,
+            cu_seqlens=cu_seqlens,
+            head_first=head_first,
+            output_dtype=output_dtype,
+        )
+    else:
+        raise ValueError(
+            f"Unsupported input shape {g.shape}, "
+            f"which should be (B, T, H, D) if `head_first=False` "
+            f"or (B, H, T, D) otherwise"
+        )
diff --git a/sglang/python/sglang/srt/layers/attention/fla/fused_gdn_gating.py b/sglang/python/sglang/srt/layers/attention/fla/fused_gdn_gating.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e92208ec130e61bba9ab8d57a7bfa7a09a46e59
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/fla/fused_gdn_gating.py
@@ -0,0 +1,69 @@
+from typing import Tuple
+
+import torch
+import triton
+import triton.language as tl
+
+
+# g = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias)
+# beta_output = b.sigmoid()
+@triton.jit
+def fused_gdn_gating_kernel(
+    g,
+    beta_output,
+    A_log,
+    a,
+    b,
+    dt_bias,
+    seq_len,
+    NUM_HEADS: tl.constexpr,
+    beta: tl.constexpr,
+    threshold: tl.constexpr,
+    BLK_HEADS: tl.constexpr,
+):
+    i_b, i_s, i_d = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+    head_off = i_d * BLK_HEADS + tl.arange(0, BLK_HEADS)
+    off = i_b * seq_len * NUM_HEADS + i_s * NUM_HEADS + head_off
+    mask = head_off < NUM_HEADS
+    blk_A_log = tl.load(A_log + head_off, mask=mask)
+    blk_a = tl.load(a + off, mask=mask)
+    blk_b = tl.load(b + off, mask=mask)
+    blk_bias = tl.load(dt_bias + head_off, mask=mask)
+    x = blk_a.to(tl.float32) + blk_bias.to(tl.float32)
+    softplus_x = tl.where(
+        beta * x <= threshold, (1 / beta) * tl.log(1 + tl.exp(beta * x)), x
+    )
+    blk_g = -tl.exp(blk_A_log.to(tl.float32)) * softplus_x
+    tl.store(g + off, blk_g.to(g.dtype.element_ty), mask=mask)
+    blk_beta_output = tl.sigmoid(blk_b.to(tl.float32))
+    tl.store(beta_output + off, blk_beta_output.to(b.dtype.element_ty), mask=mask)
+
+
+def fused_gdn_gating(
+    A_log: torch.Tensor,
+    a: torch.Tensor,
+    b: torch.Tensor,
+    dt_bias: torch.Tensor,
+    beta: float = 1.0,
+    threshold: float = 20.0,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    batch, num_heads = a.shape
+    seq_len = 1
+    grid = (batch, seq_len, triton.cdiv(num_heads, 8))
+    g = torch.empty(1, batch, num_heads, dtype=torch.float32, device=a.device)
+    beta_output = torch.empty(1, batch, num_heads, dtype=torch.float32, device=b.device)
+    fused_gdn_gating_kernel[grid](
+        g,
+        beta_output,
+        A_log,
+        a,
+        b,
+        dt_bias,
+        seq_len,
+        num_heads,
+        beta,
+        threshold,
+        8,
+        num_warps=1,
+    )
+    return g, beta_output
diff --git a/sglang/python/sglang/srt/layers/attention/fla/fused_norm_gate.py b/sglang/python/sglang/srt/layers/attention/fla/fused_norm_gate.py
new file mode 100644
index 0000000000000000000000000000000000000000..68fcbf91d729e5bca1105a2929fdd3bb5722d12a
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/fla/fused_norm_gate.py
@@ -0,0 +1,388 @@
+# Adapt from https://github.com/fla-org/flash-linear-attention/blob/main/fla/modules/fused_norm_gate.py
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+from sglang.srt.utils import (
+    cdiv,
+    cpu_has_amx_support,
+    is_cpu,
+    is_npu,
+    next_power_of_2,
+)
+
+_is_npu = is_npu()
+_use_cpu = is_cpu() and cpu_has_amx_support()
+
+# Maximum rows per Triton block for layernorm gated kernel
+MAX_ROWS_PER_BLOCK = 4
+
+
+@triton.jit
+def layer_norm_gated_fwd_kernel(
+    x,  # pointer to the input
+    g,  # pointer to the gate
+    y,  # pointer to the output
+    w,  # pointer to the weights
+    b,  # pointer to the biases
+    residual,  # pointer to the residual
+    residual_out,  # pointer to the residual
+    mean,  # pointer to the mean
+    rstd,  # pointer to the 1/std
+    eps,  # epsilon to avoid division by zero
+    T,  # number of rows in x
+    D: tl.constexpr,  # number of columns in x
+    BT: tl.constexpr,
+    BD: tl.constexpr,
+    ACTIVATION: tl.constexpr,
+    IS_RMS_NORM: tl.constexpr,
+    STORE_RESIDUAL_OUT: tl.constexpr,
+    HAS_RESIDUAL: tl.constexpr,
+    HAS_WEIGHT: tl.constexpr,
+    HAS_BIAS: tl.constexpr,
+):
+    i_t = tl.program_id(0)
+
+    o_d = tl.arange(0, BD)
+    m_d = o_d < D
+
+    p_x = tl.make_block_ptr(x, (T, D), (D, 1), (i_t * BT, 0), (BT, BD), (1, 0))
+    b_x = tl.load(p_x, boundary_check=(0, 1)).to(tl.float32)
+    if HAS_RESIDUAL:
+        p_res = tl.make_block_ptr(
+            residual, (T, D), (D, 1), (i_t * BT, 0), (BT, BD), (1, 0)
+        )
+        b_x += tl.load(p_res, boundary_check=(0, 1)).to(tl.float32)
+    if STORE_RESIDUAL_OUT:
+        p_res_out = tl.make_block_ptr(
+            residual_out, (T, D), (D, 1), (i_t * BT, 0), (BT, BD), (1, 0)
+        )
+        tl.store(p_res_out, b_x.to(p_res_out.dtype.element_ty), boundary_check=(0, 1))
+    if not IS_RMS_NORM:
+        b_mean = tl.sum(b_x, axis=1) / D
+        p_mean = tl.make_block_ptr(mean, (T,), (1,), (i_t * BT,), (BT,), (0,))
+        tl.store(p_mean, b_mean.to(p_mean.dtype.element_ty), boundary_check=(0,))
+        b_xbar = tl.where(m_d[None, :], b_x - b_mean[:, None], 0.0)
+        b_var = tl.sum(b_xbar * b_xbar, axis=1) / D
+    else:
+        b_xbar = tl.where(m_d[None, :], b_x, 0.0)
+        b_var = tl.sum(b_xbar * b_xbar, axis=1) / D
+    b_rstd = 1 / tl.sqrt(b_var + eps)
+
+    p_rstd = tl.make_block_ptr(rstd, (T,), (1,), (i_t * BT,), (BT,), (0,))
+    tl.store(p_rstd, b_rstd.to(p_rstd.dtype.element_ty), boundary_check=(0,))
+
+    if HAS_WEIGHT:
+        b_w = tl.load(w + o_d, mask=m_d).to(tl.float32)
+    if HAS_BIAS:
+        b_b = tl.load(b + o_d, mask=m_d).to(tl.float32)
+    b_x_hat = (
+        (b_x - b_mean[:, None]) * b_rstd[:, None]
+        if not IS_RMS_NORM
+        else b_x * b_rstd[:, None]
+    )
+    b_y = b_x_hat * b_w[None, :] if HAS_WEIGHT else b_x_hat
+    if HAS_BIAS:
+        b_y = b_y + b_b[None, :]
+
+    # swish/sigmoid output gate
+    p_g = tl.make_block_ptr(g, (T, D), (D, 1), (i_t * BT, 0), (BT, BD), (1, 0))
+    b_g = tl.load(p_g, boundary_check=(0, 1)).to(tl.float32)
+    if ACTIVATION == "swish" or ACTIVATION == "silu":
+        b_y = b_y * b_g * tl.sigmoid(b_g)
+    elif ACTIVATION == "sigmoid":
+        b_y = b_y * tl.sigmoid(b_g)
+
+    # Write output
+    p_y = tl.make_block_ptr(y, (T, D), (D, 1), (i_t * BT, 0), (BT, BD), (1, 0))
+    tl.store(p_y, b_y.to(p_y.dtype.element_ty), boundary_check=(0, 1))
+
+
+@triton.jit
+def layer_norm_gated_fwd_kernel1(
+    x,  # pointer to the input
+    g,  # pointer to the gate
+    y,  # pointer to the output
+    w,  # pointer to the weights
+    b,  # pointer to the biases
+    residual,  # pointer to the residual
+    residual_out,  # pointer to the residual
+    mean,  # pointer to the mean
+    rstd,  # pointer to the 1/std
+    eps,  # epsilon to avoid division by zero
+    D: tl.constexpr,  # number of columns in x
+    BD: tl.constexpr,
+    ACTIVATION: tl.constexpr,
+    IS_RMS_NORM: tl.constexpr,
+    STORE_RESIDUAL_OUT: tl.constexpr,
+    HAS_RESIDUAL: tl.constexpr,
+    HAS_WEIGHT: tl.constexpr,
+    HAS_BIAS: tl.constexpr,
+):
+    i_t = tl.program_id(0)
+    x += i_t * D
+    y += i_t * D
+    g += i_t * D
+    if HAS_RESIDUAL:
+        residual += i_t * D
+    if STORE_RESIDUAL_OUT:
+        residual_out += i_t * D
+
+    o_d = tl.arange(0, BD)
+    m_d = o_d < D
+    b_x = tl.load(x + o_d, mask=m_d, other=0.0).to(tl.float32)
+    if HAS_RESIDUAL:
+        b_x += tl.load(residual + o_d, mask=m_d, other=0.0).to(tl.float32)
+    if STORE_RESIDUAL_OUT:
+        tl.store(residual_out + o_d, b_x, mask=m_d)
+    if not IS_RMS_NORM:
+        b_mean = tl.sum(b_x, axis=0) / D
+        tl.store(mean + i_t, b_mean)
+        b_xbar = tl.where(m_d, b_x - b_mean, 0.0)
+        b_var = tl.sum(b_xbar * b_xbar, axis=0) / D
+    else:
+        b_xbar = tl.where(m_d, b_x, 0.0)
+        b_var = tl.sum(b_xbar * b_xbar, axis=0) / D
+    b_rstd = 1 / tl.sqrt(b_var + eps)
+    tl.store(rstd + i_t, b_rstd)
+
+    if HAS_WEIGHT:
+        b_w = tl.load(w + o_d, mask=m_d).to(tl.float32)
+    if HAS_BIAS:
+        b_b = tl.load(b + o_d, mask=m_d).to(tl.float32)
+    b_x_hat = (b_x - b_mean) * b_rstd if not IS_RMS_NORM else b_x * b_rstd
+    b_y = b_x_hat * b_w if HAS_WEIGHT else b_x_hat
+    if HAS_BIAS:
+        b_y = b_y + b_b
+
+    # swish/sigmoid output gate
+    b_g = tl.load(g + o_d, mask=m_d, other=0.0).to(tl.float32)
+    if ACTIVATION == "swish" or ACTIVATION == "silu":
+        b_y = b_y * b_g * tl.sigmoid(b_g)
+    elif ACTIVATION == "sigmoid":
+        b_y = b_y * tl.sigmoid(b_g)
+
+    # Write output
+    tl.store(y + o_d, b_y, mask=m_d)
+
+
+def layer_norm_gated_fwd(
+    x: torch.Tensor,
+    g: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    activation: str = "swish",
+    eps: float = 1e-5,
+    residual: torch.Tensor = None,
+    out_dtype: torch.dtype = None,
+    residual_dtype: torch.dtype = None,
+    is_rms_norm: bool = False,
+):
+    if residual is not None:
+        residual_dtype = residual.dtype
+    T, D = x.shape
+    if residual is not None:
+        assert residual.shape == (T, D)
+    if weight is not None:
+        assert weight.shape == (D,)
+    if bias is not None:
+        assert bias.shape == (D,)
+    # allocate output
+    y = x if out_dtype is None else torch.empty_like(x, dtype=out_dtype)
+    if residual is not None or (
+        residual_dtype is not None and residual_dtype != x.dtype
+    ):
+        residual_out = torch.empty(T, D, device=x.device, dtype=residual_dtype)
+    else:
+        residual_out = None
+    mean = (
+        torch.empty((T,), dtype=torch.float, device=x.device)
+        if not is_rms_norm
+        else None
+    )
+    rstd = torch.empty((T,), dtype=torch.float, device=x.device)
+    # Less than 64KB per feature: enqueue fused kernel
+    MAX_FUSED_SIZE = 65536 // x.element_size()
+    BD = min(MAX_FUSED_SIZE, next_power_of_2(D))
+    if D > BD:
+        raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
+    # heuristics for number of warps
+
+    if D <= 512:
+        BT = 32
+        layer_norm_gated_fwd_kernel[(cdiv(T, BT),)](
+            x=x,
+            g=g,
+            y=y,
+            w=weight,
+            b=bias,
+            residual=residual,
+            residual_out=residual_out,
+            mean=mean,
+            rstd=rstd,
+            eps=eps,
+            T=T,
+            D=D,
+            BD=BD,
+            BT=BT,
+            ACTIVATION=activation,
+            IS_RMS_NORM=is_rms_norm,
+            STORE_RESIDUAL_OUT=residual_out is not None,
+            HAS_RESIDUAL=residual is not None,
+            HAS_WEIGHT=weight is not None,
+            HAS_BIAS=bias is not None,
+            num_warps=4,
+        )
+    else:
+        layer_norm_gated_fwd_kernel1[(T,)](
+            x=x,
+            g=g,
+            y=y,
+            w=weight,
+            b=bias,
+            residual=residual,
+            residual_out=residual_out,
+            mean=mean,
+            rstd=rstd,
+            eps=eps,
+            D=D,
+            BD=BD,
+            ACTIVATION=activation,
+            IS_RMS_NORM=is_rms_norm,
+            STORE_RESIDUAL_OUT=residual_out is not None,
+            HAS_RESIDUAL=residual is not None,
+            HAS_WEIGHT=weight is not None,
+            HAS_BIAS=bias is not None,
+            num_warps=4,
+        )
+    # residual_out is None if residual is None and residual_dtype == input_dtype
+    return y, mean, rstd, residual_out if residual_out is not None else x
+
+
+class LayerNormGatedFunction(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx,
+        x: torch.Tensor,
+        g: torch.Tensor,
+        weight: torch.Tensor,
+        bias: torch.Tensor,
+        activation: str,
+        residual: torch.Tensor | None = None,
+        eps: float = 1e-6,
+        prenorm: bool = False,
+        residual_in_fp32: bool = False,
+        is_rms_norm: bool = False,
+    ):
+        x_shape_og = x.shape
+        g_shape_og = g.shape
+        # reshape input data into 2D tensor
+        x = x.reshape(-1, x.shape[-1])
+        g = g.reshape(-1, g.shape[-1])
+        if residual is not None:
+            assert residual.shape == x_shape_og
+            residual = residual.reshape(-1, residual.shape[-1])
+        residual_dtype = (
+            residual.dtype
+            if residual is not None
+            else (torch.float if residual_in_fp32 else None)
+        )
+        y, mean, rstd, residual_out = layer_norm_gated_fwd(
+            x=x,
+            g=g,
+            weight=weight,
+            bias=bias,
+            activation=activation,
+            eps=eps,
+            residual=residual,
+            residual_dtype=residual_dtype,
+            is_rms_norm=is_rms_norm,
+        )
+        ctx.save_for_backward(residual_out, g, weight, bias, mean, rstd)
+        ctx.x_shape_og = x_shape_og
+        ctx.g_shape_og = g_shape_og
+        ctx.activation = activation
+        ctx.eps = eps
+        ctx.is_rms_norm = is_rms_norm
+        ctx.has_residual = residual is not None
+        ctx.prenorm = prenorm
+        ctx.x_dtype = x.dtype
+        y = y.reshape(x_shape_og)
+        return y if not prenorm else (y, residual_out.reshape(x_shape_og))
+
+
+def rms_norm_gated(
+    x: torch.Tensor,
+    g: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    activation: str = "swish",
+    residual: torch.Tensor | None = None,
+    prenorm: bool = False,
+    residual_in_fp32: bool = False,
+    eps: float = 1e-6,
+):
+    return LayerNormGatedFunction.apply(
+        x,
+        g,
+        weight,
+        bias,
+        activation,
+        residual,
+        eps,
+        prenorm,
+        residual_in_fp32,
+        True,
+    )
+
+
+class FusedRMSNormGated(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        elementwise_affine: bool = True,
+        eps: float = 1e-5,
+        activation: str = "swish",
+        device: torch.device | None = None,
+        dtype: torch.dtype | None = None,
+    ) -> None:
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+
+        self.hidden_size = hidden_size
+        self.elementwise_affine = elementwise_affine
+        self.eps = eps
+        self.activation = activation
+
+        if self.activation not in ["swish", "silu", "sigmoid"]:
+            raise ValueError(f"Unsupported activation: {self.activation}")
+
+        if elementwise_affine:
+            self.weight = nn.Parameter(torch.empty(hidden_size, **factory_kwargs))
+        else:
+            self.register_parameter("weight", None)
+        self.register_parameter("bias", None)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        g: torch.Tensor,
+        residual: torch.Tensor | None = None,
+        prenorm: bool = False,
+        residual_in_fp32: bool = False,
+    ) -> torch.Tensor:
+        return rms_norm_gated(
+            x,
+            g,
+            self.weight,
+            self.bias,
+            self.activation,
+            residual=residual,
+            eps=self.eps,
+            prenorm=prenorm,
+            residual_in_fp32=residual_in_fp32,
+        )
diff --git a/sglang/python/sglang/srt/layers/attention/fla/fused_recurrent.py b/sglang/python/sglang/srt/layers/attention/fla/fused_recurrent.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f4130c10ebd1a25b5cc81153714dff1a8253117
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/fla/fused_recurrent.py
@@ -0,0 +1,721 @@
+# Adapt from https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/gated_delta_rule/fused_recurrent.py
+# -*- coding: utf-8 -*-
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+from typing import Optional, Tuple
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.layers.attention.fla.op import exp
+from sglang.srt.layers.attention.fla.utils import input_guard
+
+
+@triton.jit(do_not_specialize=["T"])
+def fused_recurrent_gated_delta_rule_fwd_kernel(
+    q,
+    k,
+    v,
+    g,
+    beta,
+    o,
+    h0,
+    ht,
+    cu_seqlens,
+    scale,
+    T,
+    B: tl.constexpr,
+    H: tl.constexpr,
+    HV: tl.constexpr,
+    K: tl.constexpr,
+    V: tl.constexpr,
+    BK: tl.constexpr,
+    BV: tl.constexpr,
+    USE_INITIAL_STATE: tl.constexpr,  # whether to use initial state
+    STORE_FINAL_STATE: tl.constexpr,  # whether to store final state
+    IS_BETA_HEADWISE: tl.constexpr,  # whether beta is headwise vector or scalar,
+    USE_QK_L2NORM_IN_KERNEL: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+    IS_KDA: tl.constexpr,
+):
+    i_k, i_v, i_nh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+    i_n, i_hv = i_nh // HV, i_nh % HV
+    i_h = i_hv // (HV // H)
+    if IS_VARLEN:
+        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int64), tl.load(
+            cu_seqlens + i_n + 1
+        ).to(tl.int64)
+        all = T
+        T = eos - bos
+    else:
+        bos, eos = i_n * T, i_n * T + T
+        all = B * T
+    o_k = i_k * BK + tl.arange(0, BK)
+    o_v = i_v * BV + tl.arange(0, BV)
+
+    p_q = q + (bos * H + i_h) * K + o_k
+    p_k = k + (bos * H + i_h) * K + o_k
+    p_v = v + (bos * HV + i_hv) * V + o_v
+    if IS_BETA_HEADWISE:
+        p_beta = beta + (bos * HV + i_hv) * V + o_v
+    else:
+        p_beta = beta + bos * HV + i_hv
+    if not IS_KDA:
+        p_g = g + bos * HV + i_hv
+    else:
+        p_gk = g + (bos * HV + i_hv) * K + o_k
+
+    p_o = o + ((i_k * all + bos) * HV + i_hv) * V + o_v
+
+    mask_k = o_k < K
+    mask_v = o_v < V
+    mask_h = mask_k[:, None] & mask_v[None, :]
+
+    b_h = tl.zeros([BK, BV], dtype=tl.float32)
+    if USE_INITIAL_STATE:
+        p_h0 = h0 + i_nh * K * V + o_k[:, None] * V + o_v[None, :]
+        b_h += tl.load(p_h0, mask=mask_h, other=0).to(tl.float32)
+
+    for _ in range(0, T):
+        b_q = tl.load(p_q, mask=mask_k, other=0).to(tl.float32)
+        b_k = tl.load(p_k, mask=mask_k, other=0).to(tl.float32)
+        b_v = tl.load(p_v, mask=mask_v, other=0).to(tl.float32)
+
+        if USE_QK_L2NORM_IN_KERNEL:
+            b_q = b_q / (tl.sqrt(tl.sum(b_q * b_q) + 1e-6))
+            b_k = b_k / (tl.sqrt(tl.sum(b_k * b_k) + 1e-6))
+        b_q = b_q * scale
+        # [BK, BV]
+        if not IS_KDA:
+            b_g = tl.load(p_g).to(tl.float32)
+            b_h *= exp(b_g)
+        else:
+            b_gk = tl.load(p_gk).to(tl.float32)
+            b_h *= exp(b_gk[:, None])
+        # [BV]
+        b_v -= tl.sum(b_h * b_k[:, None], 0)
+        if IS_BETA_HEADWISE:
+            b_beta = tl.load(p_beta, mask=mask_v, other=0).to(tl.float32)
+        else:
+            b_beta = tl.load(p_beta).to(tl.float32)
+        b_v *= b_beta
+        # [BK, BV]
+        b_h += b_k[:, None] * b_v[None, :]
+        # [BV]
+        b_o = tl.sum(b_h * b_q[:, None], 0)
+        tl.store(p_o, b_o.to(p_o.dtype.element_ty), mask=mask_v)
+
+        p_q += H * K
+        p_k += H * K
+        p_o += HV * V
+        p_v += HV * V
+        if not IS_KDA:
+            p_g += HV
+        else:
+            p_gk += HV * K
+        p_beta += HV * (V if IS_BETA_HEADWISE else 1)
+
+    if STORE_FINAL_STATE:
+        p_ht = ht + i_nh * K * V + o_k[:, None] * V + o_v[None, :]
+        tl.store(p_ht, b_h.to(p_ht.dtype.element_ty), mask=mask_h)
+
+
+def fused_recurrent_gated_delta_rule_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    beta: torch.Tensor,
+    scale: float,
+    initial_state: torch.Tensor,
+    output_final_state: bool,
+    use_qk_l2norm_in_kernel: bool = False,
+    cu_seqlens: Optional[torch.LongTensor] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    B, T, H, K, V = *k.shape, v.shape[-1]
+    HV = v.shape[2]
+    N = B if cu_seqlens is None else len(cu_seqlens) - 1
+    BK, BV = triton.next_power_of_2(K), min(triton.next_power_of_2(V), 32)
+    NK, NV = triton.cdiv(K, BK), triton.cdiv(V, BV)
+    assert NK == 1, "NK > 1 is not supported yet"
+    num_stages = 3
+    num_warps = 1
+
+    o = q.new_empty(NK, *v.shape)
+    if output_final_state:
+        final_state = q.new_empty(N, HV, K, V, dtype=torch.float32)
+    else:
+        final_state = None
+
+    grid = (NK, NV, N * HV)
+    fused_recurrent_gated_delta_rule_fwd_kernel[grid](
+        q=q,
+        k=k,
+        v=v,
+        g=g,
+        beta=beta,
+        o=o,
+        h0=initial_state,
+        ht=final_state,
+        cu_seqlens=cu_seqlens,
+        scale=scale,
+        T=T,
+        B=B,
+        H=H,
+        HV=HV,
+        K=K,
+        V=V,
+        BK=BK,
+        BV=BV,
+        USE_INITIAL_STATE=initial_state is not None,
+        STORE_FINAL_STATE=final_state is not None,
+        IS_BETA_HEADWISE=beta.ndim == v.ndim,
+        USE_QK_L2NORM_IN_KERNEL=use_qk_l2norm_in_kernel,
+        IS_VARLEN=cu_seqlens is not None,
+        IS_KDA=False,
+        num_warps=num_warps,
+        num_stages=num_stages,
+    )
+    o = o.squeeze(0)
+    return o, final_state
+
+
+class FusedRecurrentFunction(torch.autograd.Function):
+
+    @staticmethod
+    @input_guard
+    def forward(
+        ctx,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        g: torch.Tensor,
+        beta: torch.Tensor,
+        scale: float,
+        initial_state: torch.Tensor,
+        output_final_state: bool,
+        cu_seqlens: Optional[torch.LongTensor] = None,
+        use_qk_l2norm_in_kernel: bool = False,
+    ):
+        o, final_state = fused_recurrent_gated_delta_rule_fwd(
+            q=q,
+            k=k,
+            v=v,
+            g=g,
+            beta=beta,
+            scale=scale,
+            initial_state=initial_state,
+            output_final_state=output_final_state,
+            use_qk_l2norm_in_kernel=use_qk_l2norm_in_kernel,
+            cu_seqlens=cu_seqlens,
+        )
+
+        return o, final_state
+
+    @staticmethod
+    @input_guard
+    def backward(ctx, do, dht):
+        raise NotImplementedError(
+            "Backward pass is not implemented yet and we do not have plans to implement it "
+            "because we haven't figured out how to compute dg without materializing the full "
+            "hidden states for all time steps."
+        )
+
+
+def fused_recurrent_gated_delta_rule(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    beta: torch.Tensor = None,
+    scale: float = None,
+    initial_state: torch.Tensor = None,
+    output_final_state: bool = False,
+    cu_seqlens: Optional[torch.LongTensor] = None,
+    use_qk_l2norm_in_kernel: bool = False,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    r"""
+    Args:
+        q (torch.Tensor):
+            queries of shape `[B, T, H, K]`.
+        k (torch.Tensor):
+            keys of shape `[B, T, H, K]`.
+        v (torch.Tensor):
+            values of shape `[B, T, HV, V]`.
+            GVA is applied if `HV > H`.
+        g (torch.Tensor):
+            g (decays) of shape `[B, T, HV]`.
+        beta (torch.Tensor):
+            betas of shape `[B, T, HV]`.
+        scale (Optional[int]):
+            Scale factor for the RetNet attention scores.
+            If not provided, it will default to `1 / sqrt(K)`. Default: `None`.
+        initial_state (Optional[torch.Tensor]):
+            Initial state of shape `[N, HV, K, V]` for `N` input sequences.
+            For equal-length input sequences, `N` equals the batch size `B`.
+            Default: `None`.
+        output_final_state (Optional[bool]):
+            Whether to output the final state of shape `[N, HV, K, V]`. Default: `False`.
+        cu_seqlens (torch.LongTensor):
+            Cumulative sequence lengths of shape `[N+1]` used for variable-length training,
+            consistent with the FlashAttention API.
+    Returns:
+        o (torch.Tensor):
+            Outputs of shape `[B, T, HV, V]`.
+        final_state (torch.Tensor):
+            Final state of shape `[N, HV, K, V]` if `output_final_state=True` else `None`.
+    Examples::
+        >>> import torch
+        >>> import torch.nn.functional as F
+        >>> from einops import rearrange
+        >>> from fla.ops.gated_delta_rule import fused_recurrent_gated_delta_rule
+        # inputs with equal lengths
+        >>> B, T, H, HV, K, V = 4, 2048, 4, 8, 512, 512
+        >>> q = torch.randn(B, T, H, K, device='cuda')
+        >>> k = F.normalize(torch.randn(B, T, H, K, device='cuda'), p=2, dim=-1)
+        >>> v = torch.randn(B, T, HV, V, device='cuda')
+        >>> g = F.logsigmoid(torch.rand(B, T, HV, device='cuda'))
+        >>> beta = torch.rand(B, T, HV, device='cuda').sigmoid()
+        >>> h0 = torch.randn(B, HV, K, V, device='cuda')
+        >>> o, ht = fused_gated_recurrent_delta_rule(
+            q, k, v, g, beta,
+            initial_state=h0,
+            output_final_state=True
+        )
+        # for variable-length inputs, the batch size `B` is expected to be 1 and `cu_seqlens` is required
+        >>> q, k, v, g, beta = map(lambda x: rearrange(x, 'b t ... -> 1 (b t) ...'), (q, k, v, g, beta))
+        # for a batch with 4 sequences, `cu_seqlens` with 5 start/end positions are expected
+        >>> cu_seqlens = q.new_tensor([0, 2048, 4096, 6144, 8192], dtype=torch.long)
+        >>> o_var, ht_var = fused_gated_recurrent_delta_rule(
+            q, k, v, g, beta,
+            initial_state=h0,
+            output_final_state=True,
+            cu_seqlens=cu_seqlens
+        )
+    """
+    if cu_seqlens is not None:
+        if q.shape[0] != 1:
+            raise ValueError(
+                f"The batch size is expected to be 1 rather than {q.shape[0]} when using `cu_seqlens`."
+                f"Please flatten variable-length inputs before processing."
+            )
+        if initial_state is not None and initial_state.shape[0] != len(cu_seqlens) - 1:
+            raise ValueError(
+                f"The number of initial states is expected to be equal to the number of input sequences, "
+                f"i.e., {len(cu_seqlens) - 1} rather than {initial_state.shape[0]}."
+            )
+    if scale is None:
+        scale = k.shape[-1] ** -0.5
+    else:
+        assert scale > 0, "scale must be positive"
+    if beta is None:
+        beta = torch.ones_like(q[..., 0])
+    o, final_state = FusedRecurrentFunction.apply(
+        q,
+        k,
+        v,
+        g,
+        beta,
+        scale,
+        initial_state,
+        output_final_state,
+        cu_seqlens,
+        use_qk_l2norm_in_kernel,
+    )
+    return o, final_state
+
+
+# HAS_EAGLE_TREE_CUSTOM_ATTN_MASK is added to support eagle tree attention mask
+# retrieve_parent_token_ptr: [N, NP2_T], retrieve_next_sibling_ptr: [N, NP2_T]
+# e.g. for a sequence of length 4, the eagle tree attention structure is:
+# retrieve_next_token=[1, 3, -1, -1] -> retrieve_next_token[i]: the 1st child token of token i
+# retrieve_next_sibling=[-1, 2, -1, -1] -> retrieve_next_sibling[i]: the 1st tree sibling token of token i
+# retrieve_parent_token=[n/a, 0, 0, 1] -> retrieve_parent_token[i]: the parent token of token i
+# Tree:
+#    0
+#   / \
+#  1   2
+# /
+# 3
+# When calculating token 3's attention, it should attend to token 1 (parent) and token 0 (grand-parent)
+# When calculating token 2's attention, it should attend to token 0 (parent)
+@triton.jit(do_not_specialize=["T"])
+def fused_recurrent_gated_delta_rule_update_fwd_kernel(
+    q,
+    k,
+    v,
+    g,
+    beta,
+    o,
+    h0_source,
+    h0_indices,
+    cu_seqlens,
+    scale,
+    intermediate_states_buffer,
+    intermediate_state_indices,
+    cache_steps,
+    retrieve_parent_token_ptr,
+    stride_retrieve_parent_token_seq: tl.constexpr,
+    stride_retrieve_parent_token_token: tl.constexpr,
+    T,
+    NP2_T: tl.constexpr,
+    B: tl.constexpr,
+    H: tl.constexpr,
+    HV: tl.constexpr,
+    K: tl.constexpr,
+    V: tl.constexpr,
+    BK: tl.constexpr,
+    BV: tl.constexpr,
+    USE_INITIAL_STATE: tl.constexpr,  # whether to use initial state
+    IS_BETA_HEADWISE: tl.constexpr,  # whether beta is headwise vector or scalar,
+    USE_QK_L2NORM_IN_KERNEL: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+    DISABLE_STATE_UPDATE: tl.constexpr,  # whether to disable final state update
+    DISABLE_OUTPUT_CALCULATION: tl.constexpr,  # whether to disable output calculation
+    CACHE_INTERMEDIATE_STATES: tl.constexpr,
+    HAS_EAGLE_TREE_CUSTOM_ATTN_MASK: tl.constexpr,
+):
+    i_k, i_v, i_nh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+    i_n, i_hv = i_nh // HV, i_nh % HV
+    i_h = i_hv // (HV // H)
+    if IS_VARLEN:
+        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int64), tl.load(
+            cu_seqlens + i_n + 1
+        ).to(tl.int64)
+        all = T
+        T = eos - bos
+    else:
+        bos, eos = i_n * T, i_n * T + T
+        all = B * T
+    o_k = i_k * BK + tl.arange(0, BK)
+    o_v = i_v * BV + tl.arange(0, BV)
+
+    p_q = q + (bos * H + i_h) * K + o_k
+    p_k = k + (bos * H + i_h) * K + o_k
+    p_v = v + (bos * HV + i_hv) * V + o_v
+    if IS_BETA_HEADWISE:
+        p_beta = beta + (bos * HV + i_hv) * V + o_v
+    else:
+        p_beta = beta + bos * HV + i_hv
+    p_g = g + bos * HV + i_hv
+    p_o = o + ((i_k * all + bos) * HV + i_hv) * V + o_v
+
+    if HAS_EAGLE_TREE_CUSTOM_ATTN_MASK:
+        token_indices = tl.arange(0, NP2_T)
+        mask_retrieve = token_indices < T
+        retrieve_parent_token_base = (
+            retrieve_parent_token_ptr
+            + (i_n * stride_retrieve_parent_token_seq)
+            + token_indices * stride_retrieve_parent_token_token
+        )
+        parent_idx_tokens = tl.load(retrieve_parent_token_base, mask_retrieve)
+
+    mask_k = o_k < K
+    mask_v = o_v < V
+    mask_h = mask_k[:, None] & mask_v[None, :]
+
+    b_h = tl.zeros([BK, BV], dtype=tl.float32)
+    if USE_INITIAL_STATE:
+        idx = tl.load(h0_indices + i_n)
+        # Add bounds checking for idx
+        if idx >= 0:  # Assuming negative indices are invalid
+            p_h0 = (
+                h0_source
+                + idx * HV * K * V
+                + i_hv * K * V
+                + o_k[:, None] * V
+                + o_v[None, :]
+            )
+            b_h += tl.load(p_h0, mask=mask_h, other=0).to(tl.float32)
+
+    # Prepare intermediate state cache variables if enabled
+    cache_idx = -1
+    if CACHE_INTERMEDIATE_STATES:
+        cache_idx = tl.load(intermediate_state_indices + i_n)
+
+    step_idx = 0
+    for _ in range(0, T):
+        if HAS_EAGLE_TREE_CUSTOM_ATTN_MASK:
+            # step_idx = 0 should use the b_h from USE_INITIAL_STATE
+            if step_idx != 0 and cache_idx >= 0:
+                # when calculating current step's attention, load the state from the parent token
+                parent_step_idx = tl.sum(
+                    tl.where(token_indices == step_idx, parent_idx_tokens, 0)
+                )
+                step_offset = parent_step_idx * HV * K * V
+                cache_ptr = (
+                    intermediate_states_buffer
+                    + cache_idx * cache_steps * HV * K * V
+                    + step_offset
+                    + i_hv * K * V
+                    + o_k[:, None] * V
+                    + o_v[None, :]
+                )
+                b_h = tl.load(cache_ptr, mask=mask_h, other=0).to(tl.float32)
+
+        b_q = tl.load(p_q, mask=mask_k, other=0).to(tl.float32)
+        b_k = tl.load(p_k, mask=mask_k, other=0).to(tl.float32)
+        b_v = tl.load(p_v, mask=mask_v, other=0).to(tl.float32)
+        b_g = tl.load(p_g).to(tl.float32)
+
+        if USE_QK_L2NORM_IN_KERNEL:
+            b_q = b_q / (tl.sqrt(tl.sum(b_q * b_q) + 1e-6))
+            b_k = b_k / (tl.sqrt(tl.sum(b_k * b_k) + 1e-6))
+        b_q = b_q * scale
+        # [BK, BV]
+        b_h *= exp(b_g)
+        # [BV]
+        b_v -= tl.sum(b_h * b_k[:, None], 0)
+        if IS_BETA_HEADWISE:
+            b_beta = tl.load(p_beta, mask=mask_v, other=0).to(tl.float32)
+        else:
+            b_beta = tl.load(p_beta).to(tl.float32)
+        b_v *= b_beta
+        # [BK, BV]
+        b_h += b_k[:, None] * b_v[None, :]
+        # [BV]
+        if not DISABLE_OUTPUT_CALCULATION:
+            b_o = tl.sum(b_h * b_q[:, None], 0)
+            # core attn output
+            tl.store(p_o, b_o.to(p_o.dtype.element_ty), mask=mask_v)
+
+        # store intermediate states if enabled
+        if CACHE_INTERMEDIATE_STATES:
+            if cache_idx >= 0:
+                # Compute cache pointer for this step
+                step_offset = step_idx * HV * K * V
+                cache_ptr = (
+                    intermediate_states_buffer
+                    + cache_idx * cache_steps * HV * K * V
+                    + step_offset
+                    + i_hv * K * V
+                    + o_k[:, None] * V
+                    + o_v[None, :]
+                )
+                tl.store(cache_ptr, b_h.to(cache_ptr.dtype.element_ty), mask=mask_h)
+
+        step_idx += 1
+
+        p_q += H * K
+        p_k += H * K
+        p_o += HV * V
+        p_v += HV * V
+        p_g += HV
+        p_beta += HV * (V if IS_BETA_HEADWISE else 1)
+
+    # Store final state back to h0_source with bounds checking
+    # ssm states
+    if not DISABLE_STATE_UPDATE:
+        idx = tl.load(h0_indices + i_n)
+        if idx >= 0:  # Add bounds checking
+            p_h0 = (
+                h0_source
+                + idx * HV * K * V
+                + i_hv * K * V
+                + o_k[:, None] * V
+                + o_v[None, :]
+            )
+            tl.store(p_h0, b_h.to(p_h0.dtype.element_ty), mask=mask_h)
+
+
+def fused_recurrent_gated_delta_rule_update_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    beta: torch.Tensor,
+    scale: float,
+    initial_state_source: torch.Tensor,
+    initial_state_indices: torch.Tensor,
+    use_qk_l2norm_in_kernel: bool = False,
+    cu_seqlens: Optional[torch.LongTensor] = None,
+    disable_state_update: bool = False,
+    disable_output_calculation: bool = False,
+    intermediate_states_buffer: Optional[torch.Tensor] = None,
+    intermediate_state_indices: Optional[torch.Tensor] = None,
+    cache_steps: Optional[int] = None,
+    retrieve_parent_token: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    B, T, H, K, V = *k.shape, v.shape[-1]
+    HV = v.shape[2]
+    N = B if cu_seqlens is None else len(cu_seqlens) - 1
+    BK, BV = triton.next_power_of_2(K), min(triton.next_power_of_2(V), 8)
+    NK, NV = triton.cdiv(K, BK), triton.cdiv(V, BV)
+    assert NK == 1, "NK > 1 is not supported yet"
+    num_stages = 3
+    num_warps = 1
+
+    if disable_output_calculation:
+        # When output calculation is disabled, allocate minimal tensor
+        o = q.new_empty(NK, 1, 1, 1, 1)  # minimal allocation
+    else:
+        o = q.new_empty(NK, *v.shape)
+
+    grid = (NK, NV, N * HV)
+
+    # prepare retrieve next token buffer strides if provided
+    if retrieve_parent_token is not None:
+        stride_retrieve_parent_token_seq, stride_retrieve_parent_token_token = (
+            retrieve_parent_token.stride(0),
+            retrieve_parent_token.stride(1),
+        )
+    else:
+        stride_retrieve_parent_token_seq = stride_retrieve_parent_token_token = 0
+
+    NP2_T = triton.next_power_of_2(T)
+    fused_recurrent_gated_delta_rule_update_fwd_kernel[grid](
+        q=q,
+        k=k,
+        v=v,
+        g=g,
+        beta=beta,
+        o=o,
+        h0_source=initial_state_source,
+        h0_indices=initial_state_indices,
+        cu_seqlens=cu_seqlens,
+        scale=scale,
+        intermediate_states_buffer=intermediate_states_buffer,
+        intermediate_state_indices=intermediate_state_indices,
+        cache_steps=0 if cache_steps is None else cache_steps,
+        retrieve_parent_token_ptr=retrieve_parent_token,
+        stride_retrieve_parent_token_seq=stride_retrieve_parent_token_seq,
+        stride_retrieve_parent_token_token=stride_retrieve_parent_token_token,
+        T=T,
+        NP2_T=NP2_T,
+        B=B,
+        H=H,
+        HV=HV,
+        K=K,
+        V=V,
+        BK=BK,
+        BV=BV,
+        USE_INITIAL_STATE=initial_state_source is not None,
+        IS_VARLEN=cu_seqlens is not None,
+        CACHE_INTERMEDIATE_STATES=intermediate_states_buffer is not None,
+        HAS_EAGLE_TREE_CUSTOM_ATTN_MASK=retrieve_parent_token is not None,
+        IS_BETA_HEADWISE=beta.ndim == v.ndim,
+        USE_QK_L2NORM_IN_KERNEL=use_qk_l2norm_in_kernel,
+        DISABLE_STATE_UPDATE=disable_state_update,
+        DISABLE_OUTPUT_CALCULATION=disable_output_calculation,
+        num_warps=num_warps,
+        num_stages=num_stages,
+    )
+    o = o.squeeze(0)
+    return o
+
+
+class FusedRecurrentUpdateFunction(torch.autograd.Function):
+
+    @staticmethod
+    @input_guard
+    def forward(
+        ctx,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        g: torch.Tensor,
+        beta: torch.Tensor,
+        scale: float,
+        initial_state_source: torch.Tensor,
+        initial_state_indices: torch.Tensor,
+        cu_seqlens: Optional[torch.LongTensor] = None,
+        use_qk_l2norm_in_kernel: bool = False,
+        disable_state_update: bool = False,
+        disable_output_calculation: bool = False,
+        intermediate_states_buffer: Optional[torch.Tensor] = None,
+        intermediate_state_indices: Optional[torch.Tensor] = None,
+        cache_steps: Optional[int] = None,
+        retrieve_parent_token: Optional[torch.Tensor] = None,
+    ):
+        o = fused_recurrent_gated_delta_rule_update_fwd(
+            q=q,
+            k=k,
+            v=v,
+            g=g,
+            beta=beta,
+            scale=scale,
+            initial_state_source=initial_state_source,
+            initial_state_indices=initial_state_indices,
+            use_qk_l2norm_in_kernel=use_qk_l2norm_in_kernel,
+            cu_seqlens=cu_seqlens,
+            disable_state_update=disable_state_update,
+            disable_output_calculation=disable_output_calculation,
+            intermediate_states_buffer=intermediate_states_buffer,
+            intermediate_state_indices=intermediate_state_indices,
+            cache_steps=cache_steps,
+            retrieve_parent_token=retrieve_parent_token,
+        )
+
+        return o
+
+    @staticmethod
+    @input_guard
+    def backward(ctx, do, dht):
+        raise NotImplementedError(
+            "Backward pass is not implemented yet and we do not have plans to implement it "
+            "because we haven't figured out how to compute dg without materializing the full "
+            "hidden states for all time steps."
+        )
+
+
+def fused_recurrent_gated_delta_rule_update(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    beta: torch.Tensor = None,
+    scale: float = None,
+    initial_state_source: torch.Tensor = None,
+    initial_state_indices: torch.Tensor = None,
+    cu_seqlens: Optional[torch.LongTensor] = None,
+    use_qk_l2norm_in_kernel: bool = False,
+    disable_state_update: bool = False,
+    disable_output_calculation: bool = False,
+    intermediate_states_buffer: Optional[torch.Tensor] = None,
+    intermediate_state_indices: Optional[torch.Tensor] = None,
+    cache_steps: Optional[int] = None,
+    retrieve_parent_token: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    if cu_seqlens is not None:
+        if q.shape[0] != 1:
+            raise ValueError(
+                f"The batch size is expected to be 1 rather than {q.shape[0]} when using `cu_seqlens`."
+                f"Please flatten variable-length inputs before processing."
+            )
+        if initial_state_source is not None:
+            if initial_state_indices.shape[0] != len(cu_seqlens) - 1:
+                raise ValueError(
+                    f"The number of initial states is expected to be equal to the number of input sequences, "
+                    f"i.e., {len(cu_seqlens) - 1} rather than {initial_state_indices.shape[0]}."
+                )
+            if initial_state_indices.shape[0] != intermediate_state_indices.shape[0]:
+                raise ValueError(
+                    f"The number of intermediate state indices is expected to be equal to the number of input sequences, "
+                    f"i.e., {initial_state_indices.shape[0]} != {intermediate_state_indices.shape[0]}."
+                )
+    if scale is None:
+        scale = k.shape[-1] ** -0.5
+    else:
+        assert scale > 0, "scale must be positive"
+    if beta is None:
+        beta = torch.ones_like(q[..., 0])
+    o = FusedRecurrentUpdateFunction.apply(
+        q,
+        k,
+        v,
+        g,
+        beta,
+        scale,
+        initial_state_source,
+        initial_state_indices,
+        cu_seqlens,
+        use_qk_l2norm_in_kernel,
+        disable_state_update,
+        disable_output_calculation,
+        intermediate_states_buffer,
+        intermediate_state_indices,
+        cache_steps,
+        retrieve_parent_token,
+    )
+    return o
diff --git a/sglang/python/sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py b/sglang/python/sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py
new file mode 100644
index 0000000000000000000000000000000000000000..f140ccae437a7a17c3bcdc6f39fd0b5e46a6ad8d
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py
@@ -0,0 +1,353 @@
+from typing import Optional
+
+import torch
+import triton
+import triton.language as tl
+
+
+@triton.jit(do_not_specialize=["T"])
+def fused_sigmoid_gating_delta_rule_update_kernel(
+    A_log,
+    a,
+    dt_bias,
+    softplus_beta,
+    softplus_threshold,
+    q,
+    k,
+    v,
+    b,
+    o,
+    h0_source,
+    h0_indices,
+    cu_seqlens,
+    # Parameters for target_verify support (unused for decode)
+    intermediate_states_buffer,
+    intermediate_state_indices,
+    cache_steps,
+    retrieve_parent_token_ptr,
+    stride_retrieve_parent_token_seq: tl.constexpr,
+    stride_retrieve_parent_token_token: tl.constexpr,
+    # ================================================
+    scale,
+    T,
+    stride_q,
+    stride_k,
+    stride_v,
+    stride_b,
+    NP2_T: tl.constexpr,
+    B: tl.constexpr,
+    H: tl.constexpr,
+    HV: tl.constexpr,
+    K: tl.constexpr,
+    V: tl.constexpr,
+    BK: tl.constexpr,
+    BV: tl.constexpr,
+    USE_INITIAL_STATE: tl.constexpr,
+    USE_QK_L2NORM_IN_KERNEL: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+    IS_KDA: tl.constexpr,
+    # Optional flags for target_verify support (default False for decode)
+    DISABLE_STATE_UPDATE: tl.constexpr = False,
+    CACHE_INTERMEDIATE_STATES: tl.constexpr = False,
+    HAS_EAGLE_TREE_CUSTOM_ATTN_MASK: tl.constexpr = False,
+):
+    """
+    Fused kernel that combines sigmoid gating computation with recurrent delta rule update.
+    """
+    i_k, i_v, i_nh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+    i_n, i_hv = i_nh // HV, i_nh % HV
+    i_h = i_hv // (HV // H)
+
+    if IS_VARLEN:
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int64),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int64),
+        )
+        all = T
+        T = eos - bos
+    else:
+        bos, eos = i_n * T, i_n * T + T
+        all = B * T
+
+    o_k = i_k * BK + tl.arange(0, BK)
+    o_v = i_v * BV + tl.arange(0, BV)
+
+    p_q = q + bos * stride_q + i_h * K + o_k
+    p_k = k + bos * stride_k + i_h * K + o_k
+    p_v = v + bos * stride_v + i_hv * V + o_v
+    p_b = b + bos * stride_b + i_hv
+    p_o = o + ((i_k * all + bos) * HV + i_hv) * V + o_v
+
+    # Gating computation pointers
+    p_A_log = A_log + i_hv
+    if IS_KDA:
+        p_a = a + (bos * HV + i_hv) * K + o_k
+        p_dt_bias = dt_bias + i_hv * K + o_k
+    else:
+        p_a = a + bos * HV + i_hv
+        p_dt_bias = dt_bias + i_hv
+
+    mask_k = o_k < K
+    mask_v = o_v < V
+    mask_h = mask_k[:, None] & mask_v[None, :]
+
+    b_h = tl.zeros([BK, BV], dtype=tl.float32)
+    if USE_INITIAL_STATE:
+        idx = tl.load(h0_indices + i_n)
+        if idx >= 0:
+            p_h0 = (
+                h0_source
+                + idx * HV * K * V
+                + i_hv * K * V
+                + o_k[:, None] * V
+                + o_v[None, :]
+            )
+            b_h += tl.load(p_h0, mask=mask_h, other=0).to(tl.float32)
+
+    # Preload tree attention data if needed
+    if HAS_EAGLE_TREE_CUSTOM_ATTN_MASK:
+        token_indices = tl.arange(0, NP2_T)
+        mask_retrieve = token_indices < T
+        retrieve_parent_token_base = (
+            retrieve_parent_token_ptr
+            + (i_n * stride_retrieve_parent_token_seq)
+            + token_indices * stride_retrieve_parent_token_token
+        )
+        parent_idx_tokens = tl.load(
+            retrieve_parent_token_base, mask=mask_retrieve, other=0
+        )
+
+    # Prepare intermediate state cache index if enabled
+    cache_idx = -1
+    if CACHE_INTERMEDIATE_STATES:
+        cache_idx = tl.load(intermediate_state_indices + i_n)
+
+    step_idx = 0
+    for _ in range(0, T):
+        # Tree attention: load parent's cached state
+        if HAS_EAGLE_TREE_CUSTOM_ATTN_MASK:
+            # step_idx == 0 uses b_h from USE_INITIAL_STATE
+            if step_idx != 0 and cache_idx >= 0:
+                parent_step_idx = tl.sum(
+                    tl.where(token_indices == step_idx, parent_idx_tokens, 0)
+                )
+                step_offset = parent_step_idx * HV * K * V
+                cache_ptr = (
+                    intermediate_states_buffer
+                    + cache_idx * cache_steps * HV * K * V
+                    + step_offset
+                    + i_hv * K * V
+                    + o_k[:, None] * V
+                    + o_v[None, :]
+                )
+                b_h = tl.load(cache_ptr, mask=mask_h, other=0).to(tl.float32)
+
+        # Load inputs
+        b_q = tl.load(p_q, mask=mask_k, other=0).to(tl.float32)
+        b_k = tl.load(p_k, mask=mask_k, other=0).to(tl.float32)
+        b_v = tl.load(p_v, mask=mask_v, other=0).to(tl.float32)
+        b_b = tl.load(p_b).to(tl.float32)
+
+        # Compute sigmoid gating
+        # Load gating parameters
+        b_A_log = tl.load(p_A_log).to(tl.float32)
+        if IS_KDA:
+            b_a = tl.load(p_a, mask=mask_k, other=0).to(tl.float32)
+            b_dt_bias = tl.load(p_dt_bias, mask=mask_k, other=0).to(tl.float32)
+        else:
+            b_a = tl.load(p_a).to(tl.float32)
+            b_dt_bias = tl.load(p_dt_bias).to(tl.float32)
+
+        # Compute g = -exp(A_log) * softplus(a + dt_bias)
+        x = b_a + b_dt_bias
+        beta_x = softplus_beta * x
+        # Apply softplus with numerical stability
+        softplus_x = tl.where(
+            beta_x <= softplus_threshold,
+            (1.0 / softplus_beta) * tl.log(1.0 + tl.exp(beta_x)),
+            x,
+        )
+        b_g = -tl.exp(b_A_log) * softplus_x
+
+        # Compute beta = sigmoid(b)
+        b_beta = 1.0 / (1.0 + tl.exp(-b_b))
+
+        # Apply L2 normalization if enabled
+        if USE_QK_L2NORM_IN_KERNEL:
+            b_q = b_q / (tl.sqrt(tl.sum(b_q * b_q) + 1e-6))
+            b_k = b_k / (tl.sqrt(tl.sum(b_k * b_k) + 1e-6))
+
+        b_q = b_q * scale
+
+        # Apply gating to hidden state: h *= exp(g)
+        if IS_KDA:
+            b_h *= tl.exp(b_g[:, None])
+        else:
+            b_h *= tl.exp(b_g)
+
+        # Delta rule: v -= sum(h * k, dim=0)
+        b_v -= tl.sum(b_h * b_k[:, None], 0)
+
+        # Apply beta gating: v *= beta
+        b_v *= b_beta
+
+        # Update hidden state: h += k[:, None] * v[None, :]
+        b_h += b_k[:, None] * b_v[None, :]
+
+        # Compute output: o = sum(h * q, dim=0)
+        b_o = tl.sum(b_h * b_q[:, None], 0)
+        tl.store(p_o, b_o.to(p_o.dtype.element_ty), mask=mask_v)
+
+        # Cache intermediate states if enabled
+        if CACHE_INTERMEDIATE_STATES:
+            if cache_idx >= 0:
+                step_offset = step_idx * HV * K * V
+                cache_ptr = (
+                    intermediate_states_buffer
+                    + cache_idx * cache_steps * HV * K * V
+                    + step_offset
+                    + i_hv * K * V
+                    + o_k[:, None] * V
+                    + o_v[None, :]
+                )
+                tl.store(cache_ptr, b_h.to(cache_ptr.dtype.element_ty), mask=mask_h)
+
+        step_idx += 1
+
+        # Update pointers for next timestep
+        p_q += stride_q
+        p_k += stride_k
+        p_v += stride_v
+        p_b += stride_b
+        p_o += HV * V
+        if IS_KDA:
+            p_a += HV * K
+        else:
+            p_a += HV
+
+    # Store final state back to h0_source with bounds checking
+    if not DISABLE_STATE_UPDATE:
+        if USE_INITIAL_STATE:
+            idx = tl.load(h0_indices + i_n)
+            if idx >= 0:
+                p_h0 = (
+                    h0_source
+                    + idx * HV * K * V
+                    + i_hv * K * V
+                    + o_k[:, None] * V
+                    + o_v[None, :]
+                )
+                tl.store(p_h0, b_h.to(p_h0.dtype.element_ty), mask=mask_h)
+
+
+def fused_sigmoid_gating_delta_rule_update(
+    A_log: torch.Tensor,
+    a: torch.Tensor,
+    dt_bias: torch.Tensor,
+    softplus_beta: float,
+    softplus_threshold: float,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    b: torch.Tensor,
+    initial_state_source: torch.Tensor,
+    initial_state_indices: torch.Tensor,
+    scale: Optional[float] = None,
+    use_qk_l2norm_in_kernel: bool = False,
+    cu_seqlens: Optional[torch.Tensor] = None,
+    is_kda: bool = False,
+    # Optional parameters for target_verify support
+    disable_state_update: bool = False,
+    intermediate_states_buffer: Optional[torch.Tensor] = None,
+    intermediate_state_indices: Optional[torch.Tensor] = None,
+    cache_steps: Optional[int] = None,
+    retrieve_parent_token: Optional[torch.Tensor] = None,
+):
+    """
+    Fused triton implementation of sigmoid gating delta rule update.
+    This function uses a single fused kernel that combines both sigmoid gating computation
+    and the recurrent delta rule update for better performance.
+
+    Supports both decode and target_verify modes:
+    - decode: standard single-step update with state write-back
+    - target_verify: multi-step with intermediate state caching, optional tree attention,
+                     and optional state update disable
+    """
+    B, T, H, K, V = *k.shape, v.shape[-1]
+    stride_q = q.stride()[1]
+    stride_k = k.stride()[1]
+    stride_v = v.stride()[1]
+    stride_b = b.stride()[-2]
+    HV = v.shape[2]
+    N = B if cu_seqlens is None else len(cu_seqlens) - 1
+    BK, BV = triton.next_power_of_2(K), min(triton.next_power_of_2(V), 32)
+    NK, NV = triton.cdiv(K, BK), triton.cdiv(V, BV)
+    assert NK == 1, "NK > 1 is not supported yet"
+    num_stages = 3
+    num_warps = 1
+
+    if scale is None:
+        scale = k.shape[-1] ** -0.5
+    else:
+        assert scale > 0, "scale must be positive"
+
+    o = q.new_empty(NK, *v.shape)
+
+    # Prepare retrieve_parent_token strides
+    if retrieve_parent_token is not None:
+        stride_retrieve_parent_token_seq = retrieve_parent_token.stride(0)
+        stride_retrieve_parent_token_token = retrieve_parent_token.stride(1)
+    else:
+        stride_retrieve_parent_token_seq = 0
+        stride_retrieve_parent_token_token = 0
+
+    NP2_T = triton.next_power_of_2(T)
+
+    grid = (NK, NV, N * HV)
+
+    fused_sigmoid_gating_delta_rule_update_kernel[grid](
+        A_log=A_log,
+        a=a,
+        dt_bias=dt_bias,
+        softplus_beta=softplus_beta,
+        softplus_threshold=softplus_threshold,
+        q=q,
+        k=k,
+        v=v,
+        b=b,
+        o=o,
+        h0_source=initial_state_source,
+        h0_indices=initial_state_indices,
+        cu_seqlens=cu_seqlens,
+        intermediate_states_buffer=intermediate_states_buffer,
+        intermediate_state_indices=intermediate_state_indices,
+        cache_steps=0 if cache_steps is None else cache_steps,
+        retrieve_parent_token_ptr=retrieve_parent_token,
+        stride_retrieve_parent_token_seq=stride_retrieve_parent_token_seq,
+        stride_retrieve_parent_token_token=stride_retrieve_parent_token_token,
+        scale=scale,
+        T=T,
+        stride_q=stride_q,
+        stride_k=stride_k,
+        stride_v=stride_v,
+        stride_b=stride_b,
+        NP2_T=NP2_T,
+        B=B,
+        H=H,
+        HV=HV,
+        K=K,
+        V=V,
+        BK=BK,
+        BV=BV,
+        USE_INITIAL_STATE=initial_state_source is not None,
+        USE_QK_L2NORM_IN_KERNEL=use_qk_l2norm_in_kernel,
+        IS_VARLEN=cu_seqlens is not None,
+        IS_KDA=is_kda,
+        DISABLE_STATE_UPDATE=disable_state_update,
+        CACHE_INTERMEDIATE_STATES=intermediate_states_buffer is not None,
+        HAS_EAGLE_TREE_CUSTOM_ATTN_MASK=retrieve_parent_token is not None,
+        num_warps=num_warps,
+        num_stages=num_stages,
+    )
+    o = o.squeeze(0)
+    return o
diff --git a/sglang/python/sglang/srt/layers/attention/fla/index.py b/sglang/python/sglang/srt/layers/attention/fla/index.py
new file mode 100644
index 0000000000000000000000000000000000000000..31b2e524e2a465cf50c9f968021a8483fd526dd6
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/fla/index.py
@@ -0,0 +1,35 @@
+# Adapt from https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/utils/index.py
+# -*- coding: utf-8 -*-
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+import torch
+import triton
+
+from sglang.srt.layers.attention.fla.utils import tensor_cache
+
+
+@tensor_cache
+def prepare_lens(cu_seqlens: torch.LongTensor) -> torch.LongTensor:
+    return cu_seqlens[1:] - cu_seqlens[:-1]
+
+
+@tensor_cache
+def prepare_chunk_indices(
+    cu_seqlens: torch.LongTensor, chunk_size: int
+) -> torch.LongTensor:
+    indices = torch.cat(
+        [
+            torch.arange(n)
+            for n in triton.cdiv(prepare_lens(cu_seqlens), chunk_size).tolist()
+        ]
+    )
+    return torch.stack([indices.eq(0).cumsum(0) - 1, indices], 1).to(cu_seqlens)
+
+
+@tensor_cache
+def prepare_chunk_offsets(
+    cu_seqlens: torch.LongTensor, chunk_size: int
+) -> torch.LongTensor:
+    return torch.cat(
+        [cu_seqlens.new_tensor([0]), triton.cdiv(prepare_lens(cu_seqlens), chunk_size)]
+    ).cumsum(-1)
diff --git a/sglang/python/sglang/srt/layers/attention/fla/kda.py b/sglang/python/sglang/srt/layers/attention/fla/kda.py
new file mode 100644
index 0000000000000000000000000000000000000000..286746332955c65a6e8d79ea7139ad7fbea4a519
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/fla/kda.py
@@ -0,0 +1,1061 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/0384aa7150c4c9778efca041ffd1beb3ad2bd694/vllm/model_executor/layers/fla/ops/kda.py
+# This file contains code copied from the flash-linear-attention project.
+# The original source code was licensed under the MIT license and included
+# the following copyright notice:
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.layers.attention.fla.chunk_delta_h import chunk_gated_delta_rule_fwd_h
+from sglang.srt.layers.attention.fla.cumsum import chunk_local_cumsum
+from sglang.srt.layers.attention.fla.fused_norm_gate import layer_norm_gated_fwd
+from sglang.srt.layers.attention.fla.fused_recurrent import (
+    fused_recurrent_gated_delta_rule_fwd_kernel,
+)
+from sglang.srt.layers.attention.fla.index import prepare_chunk_indices
+from sglang.srt.layers.attention.fla.l2norm import l2norm_fwd
+from sglang.srt.layers.attention.fla.op import exp, log
+from sglang.srt.layers.attention.fla.solve_tril import solve_tril
+from sglang.srt.layers.attention.fla.utils import is_amd
+
+BT_LIST_AUTOTUNE = [32, 64, 128]
+NUM_WARPS_AUTOTUNE = [2, 4, 8, 16] if is_amd else [4, 8, 16, 32]
+
+
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+def next_power_of_2(n: int) -> int:
+    """The next power of 2 (inclusive)"""
+    if n < 1:
+        return 1
+    return 1 << (n - 1).bit_length()
+
+
+def fused_recurrent_kda_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    beta: torch.Tensor,
+    scale: float,
+    initial_state: torch.Tensor,
+    inplace_final_state: bool = True,
+    cu_seqlens: torch.LongTensor | None = None,
+    # ssm_state_indices: torch.Tensor | None = None,
+    num_accepted_tokens: torch.Tensor | None = None,
+    use_qk_l2norm_in_kernel: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    B, T, H, K, V = *k.shape, v.shape[-1]
+    HV = v.shape[2]
+    N = B if cu_seqlens is None else len(cu_seqlens) - 1
+    BK, BV = next_power_of_2(K), min(next_power_of_2(V), 8)
+    NK, NV = cdiv(K, BK), cdiv(V, BV)
+    assert NK == 1, "NK > 1 is not supported yet"
+    num_stages = 3
+    num_warps = 1
+
+    o = torch.empty_like(k)
+    if inplace_final_state:
+        final_state = initial_state
+    else:
+        final_state = q.new_empty(T, HV, K, V, dtype=initial_state.dtype)
+
+    stride_init_state_token = initial_state.stride(0)
+    stride_final_state_token = final_state.stride(0)
+
+    # if ssm_state_indices is None:
+    #     stride_indices_seq, stride_indices_tok = 1, 1
+    # elif ssm_state_indices.ndim == 1:
+    #     stride_indices_seq, stride_indices_tok = ssm_state_indices.stride(0), 1
+    # else:
+    #     stride_indices_seq, stride_indices_tok = ssm_state_indices.stride()
+
+    grid = (NK, NV, N * HV)
+    fused_recurrent_gated_delta_rule_fwd_kernel[grid](
+        q=q,
+        k=k,
+        v=v,
+        g=g,
+        beta=beta,
+        o=o,
+        h0=initial_state,
+        ht=final_state,
+        cu_seqlens=cu_seqlens,
+        # ssm_state_indices=ssm_state_indices,
+        # num_accepted_tokens=num_accepted_tokens,
+        scale=scale,
+        # N=N,
+        T=T,
+        B=B,
+        H=H,
+        HV=HV,
+        K=K,
+        V=V,
+        BK=BK,
+        BV=BV,
+        # stride_init_state_token=stride_init_state_token,
+        # stride_final_state_token=stride_final_state_token,
+        # stride_indices_seq=stride_indices_seq,
+        # stride_indices_tok=stride_indices_tok,
+        USE_INITIAL_STATE=initial_state is not None,
+        STORE_FINAL_STATE=final_state is not None,
+        IS_BETA_HEADWISE=beta.ndim == v.ndim,
+        USE_QK_L2NORM_IN_KERNEL=use_qk_l2norm_in_kernel,
+        IS_VARLEN=cu_seqlens is not None,
+        # INPLACE_FINAL_STATE=inplace_final_state,
+        IS_KDA=True,
+        num_warps=num_warps,
+        num_stages=num_stages,
+    )
+
+    return o, final_state
+
+
+def fused_recurrent_kda(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    beta: torch.Tensor = None,
+    scale: float = None,
+    initial_state: torch.Tensor = None,
+    inplace_final_state: bool = True,
+    use_qk_l2norm_in_kernel: bool = True,
+    cu_seqlens: torch.LongTensor | None = None,
+    # ssm_state_indices: torch.LongTensor | None = None,
+    **kwargs,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    if cu_seqlens is not None and q.shape[0] != 1:
+        raise ValueError(
+            f"The batch size is expected to be 1 rather than {q.shape[0]} when using `cu_seqlens`."
+            f"Please flatten variable-length inputs before processing."
+        )
+    if scale is None:
+        scale = k.shape[-1] ** -0.5
+
+    o, final_state = fused_recurrent_kda_fwd(
+        q=q.contiguous(),
+        k=k.contiguous(),
+        v=v.contiguous(),
+        g=g.contiguous(),
+        beta=beta.contiguous(),
+        scale=scale,
+        initial_state=initial_state,
+        inplace_final_state=inplace_final_state,
+        cu_seqlens=cu_seqlens,
+        # ssm_state_indices=ssm_state_indices,
+        num_accepted_tokens=None,
+        use_qk_l2norm_in_kernel=use_qk_l2norm_in_kernel,
+    )
+    return o, final_state
+
+
+def rms_norm_gated(
+    x: torch.Tensor,
+    g: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    activation: str = "swish",
+    residual: torch.Tensor | None = None,
+    prenorm: bool = False,
+    residual_in_fp32: bool = False,
+    eps: float = 1e-6,
+):
+    x_shape_og = x.shape
+    # reshape input data into 2D tensor
+    x = x.contiguous().reshape(-1, x.shape[-1])
+    g = g.contiguous().reshape(-1, g.shape[-1])
+    if residual is not None:
+        assert residual.shape == x_shape_og
+        residual = residual.contiguous().reshape(-1, residual.shape[-1])
+    residual_dtype = (
+        residual.dtype
+        if residual is not None
+        else (torch.float if residual_in_fp32 else None)
+    )
+    y, _, _, residual_out = layer_norm_gated_fwd(
+        x=x,
+        g=g,
+        weight=weight,
+        bias=bias,
+        activation=activation,
+        eps=eps,
+        residual=residual,
+        residual_dtype=residual_dtype,
+        is_rms_norm=True,
+    )
+    y = y.reshape(x_shape_og)
+    return y if not prenorm else (y, residual_out.reshape(x_shape_og))
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BK": BK}, num_warps=num_warps, num_stages=num_stages)
+        for BK in [32, 64]
+        for num_warps in [1, 2, 4, 8]
+        for num_stages in [2, 3, 4]
+    ],
+    key=["BC", "IS_VARLEN"],
+)
+@triton.jit(do_not_specialize=["T"])
+def chunk_kda_scaled_dot_kkt_fwd_kernel_intra_sub_inter(
+    q,
+    k,
+    g,
+    beta,
+    A,
+    Aqk,
+    scale,
+    cu_seqlens,
+    chunk_indices,
+    T,
+    H: tl.constexpr,
+    K: tl.constexpr,
+    BT: tl.constexpr,
+    BC: tl.constexpr,
+    BK: tl.constexpr,
+    NC: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+):
+    i_t, i_c, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+    i_b, i_h = i_bh // H, i_bh % H
+    i_i, i_j = i_c // NC, i_c % NC
+    if IS_VARLEN:
+        i_n, i_t = (
+            tl.load(chunk_indices + i_t * 2).to(tl.int32),
+            tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32),
+        )
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int32),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int32),
+        )
+        T = eos - bos
+    else:
+        bos, eos = i_b * T, i_b * T + T
+
+    if i_t * BT + i_i * BC >= T:
+        return
+    if i_i <= i_j:
+        return
+
+    q += (bos * H + i_h) * K
+    k += (bos * H + i_h) * K
+    g += (bos * H + i_h) * K
+    A += (bos * H + i_h) * BT
+    Aqk += (bos * H + i_h) * BT
+
+    p_b = tl.make_block_ptr(
+        beta + bos * H + i_h, (T,), (H,), (i_t * BT + i_i * BC,), (BC,), (0,)
+    )
+    b_b = tl.load(p_b, boundary_check=(0,))
+
+    b_A = tl.zeros([BC, BC], dtype=tl.float32)
+    b_Aqk = tl.zeros([BC, BC], dtype=tl.float32)
+    for i_k in range(tl.cdiv(K, BK)):
+        p_q = tl.make_block_ptr(
+            q, (T, K), (H * K, 1), (i_t * BT + i_i * BC, i_k * BK), (BC, BK), (1, 0)
+        )
+        p_k = tl.make_block_ptr(
+            k, (T, K), (H * K, 1), (i_t * BT + i_i * BC, i_k * BK), (BC, BK), (1, 0)
+        )
+        p_g = tl.make_block_ptr(
+            g, (T, K), (H * K, 1), (i_t * BT + i_i * BC, i_k * BK), (BC, BK), (1, 0)
+        )
+        b_kt = tl.make_block_ptr(
+            k, (K, T), (1, H * K), (i_k * BK, i_t * BT + i_j * BC), (BK, BC), (0, 1)
+        )
+        p_gk = tl.make_block_ptr(
+            g, (K, T), (1, H * K), (i_k * BK, i_t * BT + i_j * BC), (BK, BC), (0, 1)
+        )
+
+        o_k = i_k * BK + tl.arange(0, BK)
+        m_k = o_k < K
+        # [BK,]
+        b_gn = tl.load(g + (i_t * BT + i_i * BC) * H * K + o_k, mask=m_k, other=0)
+        # [BC, BK]
+        b_g = tl.load(p_g, boundary_check=(0, 1))
+        b_k = tl.load(p_k, boundary_check=(0, 1)) * exp(b_g - b_gn[None, :])
+        # [BK, BC]
+        b_gk = tl.load(p_gk, boundary_check=(0, 1))
+        b_kt = tl.load(b_kt, boundary_check=(0, 1))
+        # [BC, BC]
+        b_ktg = b_kt * exp(b_gn[:, None] - b_gk)
+        b_A += tl.dot(b_k, b_ktg)
+
+        b_q = tl.load(p_q, boundary_check=(0, 1))
+        b_qg = b_q * exp(b_g - b_gn[None, :]) * scale
+        b_Aqk += tl.dot(b_qg, b_ktg)
+
+    b_A *= b_b[:, None]
+
+    p_A = tl.make_block_ptr(
+        A, (T, BT), (H * BT, 1), (i_t * BT + i_i * BC, i_j * BC), (BC, BC), (1, 0)
+    )
+    tl.store(p_A, b_A.to(A.dtype.element_ty), boundary_check=(0, 1))
+    p_Aqk = tl.make_block_ptr(
+        Aqk, (T, BT), (H * BT, 1), (i_t * BT + i_i * BC, i_j * BC), (BC, BC), (1, 0)
+    )
+    tl.store(p_Aqk, b_Aqk.to(Aqk.dtype.element_ty), boundary_check=(0, 1))
+
+
+@triton.autotune(
+    configs=[triton.Config({}, num_warps=num_warps) for num_warps in [1, 2, 4, 8]],
+    key=["BK", "BT", "IS_VARLEN"],
+)
+@triton.jit(do_not_specialize=["T"])
+def chunk_kda_scaled_dot_kkt_fwd_kernel_intra_sub_intra(
+    q,
+    k,
+    g,
+    beta,
+    A,
+    Aqk,
+    scale,
+    cu_seqlens,
+    chunk_indices,
+    T,
+    H: tl.constexpr,
+    K: tl.constexpr,
+    BT: tl.constexpr,
+    BC: tl.constexpr,
+    BK: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+):
+    i_t, i_i, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_n, i_t = (
+            tl.load(chunk_indices + i_t * 2).to(tl.int32),
+            tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32),
+        )
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int32),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int32),
+        )
+        T = eos - bos
+    else:
+        bos, eos = i_b * T, i_b * T + T
+
+    if i_t * BT + i_i * BC >= T:
+        return
+
+    o_i = tl.arange(0, BC)
+    o_k = tl.arange(0, BK)
+    m_k = o_k < K
+    m_A = (i_t * BT + i_i * BC + o_i) < T
+    o_A = (bos + i_t * BT + i_i * BC + o_i) * H * BT + i_h * BT + i_i * BC
+
+    p_q = tl.make_block_ptr(
+        q + (bos * H + i_h) * K,
+        (T, K),
+        (H * K, 1),
+        (i_t * BT + i_i * BC, 0),
+        (BC, BK),
+        (1, 0),
+    )
+    p_k = tl.make_block_ptr(
+        k + (bos * H + i_h) * K,
+        (T, K),
+        (H * K, 1),
+        (i_t * BT + i_i * BC, 0),
+        (BC, BK),
+        (1, 0),
+    )
+    p_g = tl.make_block_ptr(
+        g + (bos * H + i_h) * K,
+        (T, K),
+        (H * K, 1),
+        (i_t * BT + i_i * BC, 0),
+        (BC, BK),
+        (1, 0),
+    )
+    b_q = tl.load(p_q, boundary_check=(0, 1))
+    b_k = tl.load(p_k, boundary_check=(0, 1))
+    b_g = tl.load(p_g, boundary_check=(0, 1))
+
+    p_b = beta + (bos + i_t * BT + i_i * BC + o_i) * H + i_h
+    b_k = b_k * tl.load(p_b, mask=m_A, other=0)[:, None]
+
+    p_kt = k + (bos + i_t * BT + i_i * BC) * H * K + i_h * K + o_k
+    p_gk = g + (bos + i_t * BT + i_i * BC) * H * K + i_h * K + o_k
+
+    for j in range(0, min(BC, T - i_t * BT - i_i * BC)):
+        b_kt = tl.load(p_kt, mask=m_k, other=0).to(tl.float32)
+        b_gk = tl.load(p_gk, mask=m_k, other=0).to(tl.float32)
+        b_ktg = b_kt[None, :] * exp(b_g - b_gk[None, :])
+        b_A = tl.sum(b_k * b_ktg, 1)
+        b_A = tl.where(o_i > j, b_A, 0.0)
+        b_Aqk = tl.sum(b_q * b_ktg, 1)
+        b_Aqk = tl.where(o_i >= j, b_Aqk * scale, 0.0)
+        tl.store(A + o_A + j, b_A, mask=m_A)
+        tl.store(Aqk + o_A + j, b_Aqk, mask=m_A)
+        p_kt += H * K
+        p_gk += H * K
+
+
+def chunk_kda_scaled_dot_kkt_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    gk: torch.Tensor | None = None,
+    beta: torch.Tensor | None = None,
+    scale: float | None = None,
+    cu_seqlens: torch.LongTensor | None = None,
+    chunk_size: int = 64,
+    output_dtype: torch.dtype = torch.float32,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    r"""
+    Compute beta * K * K^T.
+
+    Args:
+        k (torch.Tensor):
+            The key tensor of shape `[B, T, H, K]`.
+        beta (torch.Tensor):
+            The beta tensor of shape `[B, T, H]`.
+        gk (torch.Tensor):
+            The cumulative sum of the gate tensor of shape `[B, T, H, K]` applied to the key tensor. Default: `None`.
+        cu_seqlens (torch.LongTensor):
+            The cumulative sequence lengths of the input tensor.
+            Default: None
+        chunk_size (int):
+            The chunk size. Default: 64.
+        output_dtype (torch.dtype):
+            The dtype of the output tensor. Default: `torch.float32`
+
+    Returns:
+        beta * K * K^T of shape `[B, T, H, BT]` where `BT` is the chunk size.
+    """
+    B, T, H, K = k.shape
+    assert K <= 256
+    BT = chunk_size
+    chunk_indices = (
+        prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
+    )
+    NT = cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
+
+    BC = min(16, BT)
+    NC = cdiv(BT, BC)
+    BK = max(next_power_of_2(K), 16)
+    A = torch.zeros(B, T, H, BT, device=k.device, dtype=output_dtype)
+    Aqk = torch.zeros(B, T, H, BT, device=k.device, dtype=output_dtype)
+    grid = (NT, NC * NC, B * H)
+    chunk_kda_scaled_dot_kkt_fwd_kernel_intra_sub_inter[grid](
+        q=q,
+        k=k,
+        g=gk,
+        beta=beta,
+        A=A,
+        Aqk=Aqk,
+        scale=scale,
+        cu_seqlens=cu_seqlens,
+        chunk_indices=chunk_indices,
+        T=T,
+        H=H,
+        K=K,
+        BT=BT,
+        BC=BC,
+        NC=NC,
+        IS_VARLEN=cu_seqlens is not None,
+    )
+
+    grid = (NT, NC, B * H)
+    chunk_kda_scaled_dot_kkt_fwd_kernel_intra_sub_intra[grid](
+        q=q,
+        k=k,
+        g=gk,
+        beta=beta,
+        A=A,
+        Aqk=Aqk,
+        scale=scale,
+        cu_seqlens=cu_seqlens,
+        chunk_indices=chunk_indices,
+        T=T,
+        H=H,
+        K=K,
+        BT=BT,
+        BC=BC,
+        BK=BK,
+        IS_VARLEN=cu_seqlens is not None,
+    )
+    return A, Aqk
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({}, num_warps=num_warps, num_stages=num_stages)
+        for num_warps in [2, 4, 8]
+        for num_stages in [2, 3, 4]
+    ],
+    key=["H", "K", "V", "BT", "BK", "BV", "IS_VARLEN"],
+)
+@triton.jit(do_not_specialize=["T"])
+def recompute_w_u_fwd_kernel(
+    q,
+    k,
+    qg,
+    kg,
+    v,
+    beta,
+    w,
+    u,
+    A,
+    gk,
+    cu_seqlens,
+    chunk_indices,
+    T,
+    H: tl.constexpr,
+    K: tl.constexpr,
+    V: tl.constexpr,
+    BT: tl.constexpr,
+    BK: tl.constexpr,
+    BV: tl.constexpr,
+    STORE_QG: tl.constexpr,
+    STORE_KG: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+    DOT_PRECISION: tl.constexpr,
+):
+    i_t, i_bh = tl.program_id(0), tl.program_id(1)
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_n, i_t = (
+            tl.load(chunk_indices + i_t * 2).to(tl.int32),
+            tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32),
+        )
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int32),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int32),
+        )
+        T = eos - bos
+    else:
+        bos, eos = i_b * T, i_b * T + T
+    p_b = tl.make_block_ptr(beta + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,))
+    b_b = tl.load(p_b, boundary_check=(0,))
+
+    p_A = tl.make_block_ptr(
+        A + (bos * H + i_h) * BT, (T, BT), (H * BT, 1), (i_t * BT, 0), (BT, BT), (1, 0)
+    )
+    b_A = tl.load(p_A, boundary_check=(0, 1))
+
+    for i_v in range(tl.cdiv(V, BV)):
+        p_v = tl.make_block_ptr(
+            v + (bos * H + i_h) * V,
+            (T, V),
+            (H * V, 1),
+            (i_t * BT, i_v * BV),
+            (BT, BV),
+            (1, 0),
+        )
+        p_u = tl.make_block_ptr(
+            u + (bos * H + i_h) * V,
+            (T, V),
+            (H * V, 1),
+            (i_t * BT, i_v * BV),
+            (BT, BV),
+            (1, 0),
+        )
+        b_v = tl.load(p_v, boundary_check=(0, 1))
+        b_vb = (b_v * b_b[:, None]).to(b_v.dtype)
+        b_u = tl.dot(b_A, b_vb, input_precision=DOT_PRECISION)
+        tl.store(p_u, b_u.to(p_u.dtype.element_ty), boundary_check=(0, 1))
+
+    for i_k in range(tl.cdiv(K, BK)):
+        p_w = tl.make_block_ptr(
+            w + (bos * H + i_h) * K,
+            (T, K),
+            (H * K, 1),
+            (i_t * BT, i_k * BK),
+            (BT, BK),
+            (1, 0),
+        )
+        p_k = tl.make_block_ptr(
+            k + (bos * H + i_h) * K,
+            (T, K),
+            (H * K, 1),
+            (i_t * BT, i_k * BK),
+            (BT, BK),
+            (1, 0),
+        )
+        b_k = tl.load(p_k, boundary_check=(0, 1))
+        b_kb = b_k * b_b[:, None]
+
+        p_gk = tl.make_block_ptr(
+            gk + (bos * H + i_h) * K,
+            (T, K),
+            (H * K, 1),
+            (i_t * BT, i_k * BK),
+            (BT, BK),
+            (1, 0),
+        )
+        b_gk = tl.load(p_gk, boundary_check=(0, 1))
+        b_kb *= exp(b_gk)
+        if STORE_QG:
+            p_q = tl.make_block_ptr(
+                q + (bos * H + i_h) * K,
+                (T, K),
+                (H * K, 1),
+                (i_t * BT, i_k * BK),
+                (BT, BK),
+                (1, 0),
+            )
+            p_qg = tl.make_block_ptr(
+                qg + (bos * H + i_h) * K,
+                (T, K),
+                (H * K, 1),
+                (i_t * BT, i_k * BK),
+                (BT, BK),
+                (1, 0),
+            )
+            b_q = tl.load(p_q, boundary_check=(0, 1))
+            b_qg = b_q * exp(b_gk)
+            tl.store(p_qg, b_qg.to(p_qg.dtype.element_ty), boundary_check=(0, 1))
+        if STORE_KG:
+            last_idx = min(i_t * BT + BT, T) - 1
+
+            o_k = i_k * BK + tl.arange(0, BK)
+            m_k = o_k < K
+            b_gn = tl.load(
+                gk + ((bos + last_idx) * H + i_h) * K + o_k, mask=m_k, other=0.0
+            )
+            b_kg = b_k * exp(b_gn - b_gk)
+
+            p_kg = tl.make_block_ptr(
+                kg + (bos * H + i_h) * K,
+                (T, K),
+                (H * K, 1),
+                (i_t * BT, i_k * BK),
+                (BT, BK),
+                (1, 0),
+            )
+            tl.store(p_kg, b_kg.to(p_kg.dtype.element_ty), boundary_check=(0, 1))
+
+        b_w = tl.dot(b_A, b_kb.to(b_k.dtype))
+        tl.store(p_w, b_w.to(p_w.dtype.element_ty), boundary_check=(0, 1))
+
+
+def recompute_w_u_fwd(
+    k: torch.Tensor,
+    v: torch.Tensor,
+    beta: torch.Tensor,
+    A: torch.Tensor,
+    q: torch.Tensor | None = None,
+    gk: torch.Tensor | None = None,
+    cu_seqlens: torch.LongTensor | None = None,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    B, T, H, K, V = *k.shape, v.shape[-1]
+    BT = A.shape[-1]
+    BK = 64
+    BV = 64
+
+    chunk_indices = (
+        prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
+    )
+    NT = cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
+
+    w = torch.empty_like(k)
+    u = torch.empty_like(v)
+    kg = torch.empty_like(k) if gk is not None else None
+    recompute_w_u_fwd_kernel[(NT, B * H)](
+        q=q,
+        k=k,
+        qg=None,
+        kg=kg,
+        v=v,
+        beta=beta,
+        w=w,
+        u=u,
+        A=A,
+        gk=gk,
+        cu_seqlens=cu_seqlens,
+        chunk_indices=chunk_indices,
+        T=T,
+        H=H,
+        K=K,
+        V=V,
+        BT=BT,
+        BK=BK,
+        BV=BV,
+        STORE_QG=False,
+        STORE_KG=kg is not None,
+        IS_VARLEN=cu_seqlens is not None,
+        DOT_PRECISION="ieee",
+    )
+    return w, u, None, kg
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BK": BK, "BV": BV}, num_warps=num_warps, num_stages=num_stages)
+        for BK in [32, 64]
+        for BV in [64, 128]
+        for num_warps in [2, 4, 8]
+        for num_stages in [2, 3, 4]
+    ],
+    key=["BT", "IS_VARLEN"],
+)
+@triton.jit(do_not_specialize=["T"])
+def chunk_gla_fwd_kernel_o(
+    q,
+    v,
+    g,
+    h,
+    o,
+    A,
+    cu_seqlens,
+    chunk_indices,
+    scale,
+    T,
+    H: tl.constexpr,
+    K: tl.constexpr,
+    V: tl.constexpr,
+    BT: tl.constexpr,
+    BK: tl.constexpr,
+    BV: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+):
+    i_v, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2)
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_tg = i_t
+        i_n, i_t = (
+            tl.load(chunk_indices + i_t * 2).to(tl.int32),
+            tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32),
+        )
+        bos, eos = (
+            tl.load(cu_seqlens + i_n).to(tl.int32),
+            tl.load(cu_seqlens + i_n + 1).to(tl.int32),
+        )
+        T = eos - bos
+        NT = tl.cdiv(T, BT)
+    else:
+        NT = tl.cdiv(T, BT)
+        i_tg = i_b * NT + i_t
+        bos, eos = i_b * T, i_b * T + T
+
+    m_s = tl.arange(0, BT)[:, None] >= tl.arange(0, BT)[None, :]
+
+    b_o = tl.zeros([BT, BV], dtype=tl.float32)
+    for i_k in range(tl.cdiv(K, BK)):
+        p_q = tl.make_block_ptr(
+            q + (bos * H + i_h) * K,
+            (T, K),
+            (H * K, 1),
+            (i_t * BT, i_k * BK),
+            (BT, BK),
+            (1, 0),
+        )
+        p_g = tl.make_block_ptr(
+            g + (bos * H + i_h) * K,
+            (T, K),
+            (H * K, 1),
+            (i_t * BT, i_k * BK),
+            (BT, BK),
+            (1, 0),
+        )
+        p_h = tl.make_block_ptr(
+            h + (i_tg * H + i_h) * K * V,
+            (K, V),
+            (V, 1),
+            (i_k * BK, i_v * BV),
+            (BK, BV),
+            (1, 0),
+        )
+
+        # [BT, BK]
+        b_q = tl.load(p_q, boundary_check=(0, 1))
+        b_q = (b_q * scale).to(b_q.dtype)
+        # [BT, BK]
+        b_g = tl.load(p_g, boundary_check=(0, 1))
+        # [BT, BK]
+        b_qg = (b_q * exp(b_g)).to(b_q.dtype)
+        # [BK, BV]
+        b_h = tl.load(p_h, boundary_check=(0, 1))
+        # works but dkw, owing to divine benevolence
+        # [BT, BV]
+        if i_k >= 0:
+            b_o += tl.dot(b_qg, b_h.to(b_qg.dtype))
+    p_v = tl.make_block_ptr(
+        v + (bos * H + i_h) * V,
+        (T, V),
+        (H * V, 1),
+        (i_t * BT, i_v * BV),
+        (BT, BV),
+        (1, 0),
+    )
+    p_o = tl.make_block_ptr(
+        o + (bos * H + i_h) * V,
+        (T, V),
+        (H * V, 1),
+        (i_t * BT, i_v * BV),
+        (BT, BV),
+        (1, 0),
+    )
+    p_A = tl.make_block_ptr(
+        A + (bos * H + i_h) * BT, (T, BT), (H * BT, 1), (i_t * BT, 0), (BT, BT), (1, 0)
+    )
+    # [BT, BV]
+    b_v = tl.load(p_v, boundary_check=(0, 1))
+    # [BT, BT]
+    b_A = tl.load(p_A, boundary_check=(0, 1))
+    b_A = tl.where(m_s, b_A, 0.0).to(b_v.dtype)
+    b_o += tl.dot(b_A, b_v, allow_tf32=False)
+    tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1))
+
+
+def chunk_gla_fwd_o_gk(
+    q: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    A: torch.Tensor,
+    h: torch.Tensor,
+    o: torch.Tensor,
+    scale: float,
+    cu_seqlens: torch.LongTensor | None = None,
+    chunk_size: int = 64,
+):
+    B, T, H, K, V = *q.shape, v.shape[-1]
+    BT = chunk_size
+
+    chunk_indices = (
+        prepare_chunk_indices(cu_seqlens, chunk_size)
+        if cu_seqlens is not None
+        else None
+    )
+    NT = cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
+
+    def grid(meta):
+        return (cdiv(V, meta["BV"]), NT, B * H)
+
+    chunk_gla_fwd_kernel_o[grid](
+        q=q,
+        v=v,
+        g=g,
+        h=h,
+        o=o,
+        A=A,
+        cu_seqlens=cu_seqlens,
+        chunk_indices=chunk_indices,
+        scale=scale,
+        T=T,
+        H=H,
+        K=K,
+        V=V,
+        BT=BT,
+        IS_VARLEN=cu_seqlens is not None,
+    )
+    return o
+
+
+def chunk_kda_fwd(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    beta: torch.Tensor,
+    scale: float,
+    initial_state: torch.Tensor,
+    initial_state_indices: torch.Tensor,
+    cu_seqlens: torch.LongTensor | None = None,
+):
+    chunk_size = 64
+    g = chunk_local_cumsum(g, chunk_size=chunk_size, cu_seqlens=cu_seqlens)
+    # the intra Aqk is kept in fp32
+    # the computation has very marginal effect on the entire throughput
+    A, Aqk = chunk_kda_scaled_dot_kkt_fwd(
+        q=q,
+        k=k,
+        gk=g,
+        beta=beta,
+        scale=scale,
+        cu_seqlens=cu_seqlens,
+        output_dtype=torch.float32,
+    )
+    A = solve_tril(A=A, cu_seqlens=cu_seqlens, output_dtype=k.dtype)
+    w, u, _, kg = recompute_w_u_fwd(
+        k=k,
+        v=v,
+        beta=beta,
+        A=A,
+        gk=g,
+        cu_seqlens=cu_seqlens,
+    )
+    del A
+    h, v_new = chunk_gated_delta_rule_fwd_h(
+        k=kg,
+        w=w,
+        u=u,
+        gk=g,
+        initial_state=initial_state,
+        initial_state_indices=initial_state_indices,
+        cu_seqlens=cu_seqlens,
+    )
+    del w, u, kg
+    o = chunk_gla_fwd_o_gk(
+        q=q,
+        v=v_new,
+        g=g,
+        A=Aqk,
+        h=h,
+        o=v,
+        scale=scale,
+        cu_seqlens=cu_seqlens,
+        chunk_size=chunk_size,
+    )
+    del Aqk, v_new, h
+    return o
+
+
+def chunk_kda(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    g: torch.Tensor,
+    beta: torch.Tensor,
+    scale: float = None,
+    initial_state: torch.Tensor = None,
+    initial_state_indices: torch.Tensor = None,
+    use_qk_l2norm_in_kernel: bool = False,
+    cu_seqlens: torch.LongTensor | None = None,
+    **kwargs,
+):
+    if scale is None:
+        scale = k.shape[-1] ** -0.5
+
+    if use_qk_l2norm_in_kernel:
+        q = l2norm_fwd(q.contiguous())
+        k = l2norm_fwd(k.contiguous())
+
+    o = chunk_kda_fwd(
+        q=q,
+        k=k,
+        v=v.contiguous(),
+        g=g.contiguous(),
+        beta=beta.contiguous(),
+        scale=scale,
+        initial_state=initial_state,
+        initial_state_indices=initial_state_indices,
+        cu_seqlens=cu_seqlens,
+    )
+    return o
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BT": bt}, num_warps=nw, num_stages=ns)
+        for bt in BT_LIST_AUTOTUNE
+        for nw in NUM_WARPS_AUTOTUNE
+        for ns in [2, 3]
+    ],
+    key=["H", "D"],
+)
+@triton.jit
+def kda_gate_fwd_kernel(
+    g,
+    A,
+    y,
+    g_bias,
+    beta: tl.constexpr,
+    threshold: tl.constexpr,
+    T,
+    H,
+    D: tl.constexpr,
+    BT: tl.constexpr,
+    BD: tl.constexpr,
+    HAS_BIAS: tl.constexpr,
+):
+    i_t, i_h = tl.program_id(0), tl.program_id(1)
+    n_t = i_t * BT
+
+    b_a = tl.load(A + i_h).to(tl.float32)
+    b_a = -tl.exp(b_a)
+
+    stride_row = H * D
+    stride_col = 1
+
+    g_ptr = tl.make_block_ptr(
+        base=g + i_h * D,
+        shape=(T, D),
+        strides=(stride_row, stride_col),
+        offsets=(n_t, 0),
+        block_shape=(BT, BD),
+        order=(1, 0),
+    )
+
+    y_ptr = tl.make_block_ptr(
+        base=y + i_h * D,
+        shape=(T, D),
+        strides=(stride_row, stride_col),
+        offsets=(n_t, 0),
+        block_shape=(BT, BD),
+        order=(1, 0),
+    )
+
+    b_g = tl.load(g_ptr, boundary_check=(0, 1)).to(tl.float32)
+
+    if HAS_BIAS:
+        n_d = tl.arange(0, BD)
+        bias_mask = n_d < D
+        b_bias = tl.load(g_bias + i_h * D + n_d, mask=bias_mask, other=0.0).to(
+            tl.float32
+        )
+        b_g = b_g + b_bias[None, :]
+
+    # softplus(x, beta) = (1/beta) * log(1 + exp(beta * x))
+    # When beta * x > threshold, use linear approximation x
+    # Use threshold to switch to linear when beta*x > threshold
+    g_scaled = b_g * beta
+    use_linear = g_scaled > threshold
+    sp = tl.where(use_linear, b_g, (1.0 / beta) * log(1.0 + tl.exp(g_scaled)))
+    b_y = b_a * sp
+
+    tl.store(y_ptr, b_y.to(y.dtype.element_ty), boundary_check=(0, 1))
+
+
+def fused_kda_gate(
+    g: torch.Tensor,
+    A: torch.Tensor,
+    head_k_dim: int,
+    g_bias: torch.Tensor | None = None,
+    beta: float = 1.0,
+    threshold: float = 20.0,
+) -> torch.Tensor:
+    """
+    Forward pass for KDA gate:
+      input g: [..., H*D]
+      param A: [H] or [1, 1, H, 1]
+      beta: softplus beta parameter
+      threshold: softplus threshold parameter
+      return  : [..., H, D]
+    """
+    orig_shape = g.shape[:-1]
+
+    g = g.view(-1, g.shape[-1])
+    T = g.shape[0]
+    HD = g.shape[1]
+    H = A.numel()
+    assert H * head_k_dim == HD
+
+    y = torch.empty_like(g, dtype=torch.float32)
+
+    def grid(meta):
+        return (cdiv(T, meta["BT"]), H)
+
+    kda_gate_fwd_kernel[grid](
+        g,
+        A,
+        y,
+        g_bias,
+        beta,
+        threshold,
+        T,
+        H,
+        head_k_dim,
+        BD=next_power_of_2(head_k_dim),
+        HAS_BIAS=g_bias is not None,
+    )
+
+    y = y.view(*orig_shape, H, head_k_dim)
+    return y
diff --git a/sglang/python/sglang/srt/layers/attention/fla/l2norm.py b/sglang/python/sglang/srt/layers/attention/fla/l2norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6b6ae7f7d283cd6f26b5304821c9c108903c1d0
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/fla/l2norm.py
@@ -0,0 +1,150 @@
+# Adapt from https://github.com/fla-org/flash-linear-attention/blob/main/fla/modules/l2norm.py
+# -*- coding: utf-8 -*-
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import triton
+import triton.language as tl
+
+from sglang.srt.layers.attention.fla.utils import input_guard
+
+BT_LIST = [8, 16, 32, 64, 128]
+
+
+# @triton.autotune(
+#     configs=[
+#         triton.Config({}, num_warps=num_warps) for num_warps in [1, 2, 4, 8, 16, 32]
+#     ],
+#     key=["D"],
+# )
+@triton.jit
+def l2norm_fwd_kernel1(
+    x,
+    y,
+    D,
+    BD: tl.constexpr,
+    eps,
+):
+    i_t = tl.program_id(0)
+    x += i_t * D
+    y += i_t * D
+    # Compute mean and variance
+    cols = tl.arange(0, BD)
+    mask = cols < D
+    b_x = tl.load(x + cols, mask=mask, other=0.0).to(tl.float32)
+    b_var = tl.sum(b_x * b_x, axis=0)
+    b_rstd = 1 / tl.sqrt(b_var + eps)
+    # tl.store(Rstd + i_t, rstd)
+    # Normalize and apply linear transformation
+    b_y = b_x * b_rstd
+    tl.store(y + cols, b_y, mask=mask)
+
+
+# @triton.autotune(
+#     configs=[
+#         triton.Config({"BT": BT}, num_warps=num_warps)
+#         for num_warps in [1, 2, 4, 8, 16]
+#         for BT in BT_LIST
+#     ],
+#     key=["D", "NB"],
+# )
+@triton.jit
+def l2norm_fwd_kernel(
+    x,
+    y,
+    eps,
+    NB: tl.constexpr,
+    T: tl.constexpr,
+    D: tl.constexpr,
+    BT: tl.constexpr,
+    BD: tl.constexpr,
+):
+    i_t = tl.program_id(0)
+    p_x = tl.make_block_ptr(x, (T, D), (D, 1), (i_t * BT, 0), (BT, BD), (1, 0))
+    b_x = tl.load(p_x, boundary_check=(0, 1)).to(tl.float32)
+    b_var = tl.sum(b_x * b_x, axis=1)
+    b_y = b_x / tl.sqrt(b_var + eps)[:, None]
+    p_y = tl.make_block_ptr(y, (T, D), (D, 1), (i_t * BT, 0), (BT, BD), (1, 0))
+    tl.store(p_y, b_y.to(p_y.dtype.element_ty), boundary_check=(0, 1))
+
+
+def l2norm_fwd(
+    x: torch.Tensor, eps: float = 1e-6, output_dtype: Optional[torch.dtype] = None
+):
+    x_shape_og = x.shape
+    x = x.view(-1, x.shape[-1])
+    # allocate output
+    if output_dtype is None:
+        y = torch.empty_like(x)
+    else:
+        y = torch.empty_like(x, dtype=output_dtype)
+    assert y.stride(-1) == 1
+    T, D = x.shape[0], x.shape[-1]
+    # rstd = torch.empty((T,), dtype=torch.float32, device=x.device)
+    # Less than 64KB per feature: enqueue fused kernel
+    MAX_FUSED_SIZE = 65536 // x.element_size()
+    BD = min(MAX_FUSED_SIZE, triton.next_power_of_2(D))
+    if D > BD:
+        raise RuntimeError("This layer doesn't support feature dim >= 64KB.")
+
+    if D <= 512:
+        NB = triton.cdiv(T, 2048)
+
+        def grid(meta):
+            return (triton.cdiv(T, meta["BT"]),)
+
+        l2norm_fwd_kernel[grid](
+            x,
+            y,
+            eps,
+            NB=NB,
+            T=T,
+            D=D,
+            BD=BD,
+            BT=16,
+            num_warps=8,
+            num_stages=3,
+        )
+    else:
+        l2norm_fwd_kernel1[(T,)](
+            x,
+            y,
+            eps=eps,
+            D=D,
+            BD=BD,
+            num_warps=8,
+            num_stages=3,
+        )
+
+    return y.view(x_shape_og)
+
+
+class L2NormFunction(torch.autograd.Function):
+
+    @staticmethod
+    @input_guard
+    def forward(ctx, x, eps=1e-6, output_dtype=None):
+        return l2norm_fwd(x, eps, output_dtype)
+
+
+def l2norm(
+    x: torch.Tensor, eps: float = 1e-6, output_dtype: Optional[torch.dtype] = None
+) -> torch.Tensor:
+    return L2NormFunction.apply(x, eps, output_dtype)
+
+
+l2_norm = l2norm
+
+
+class L2Norm(nn.Module):
+
+    def __init__(self, eps: float = 1e-6, output_dtype: Optional[torch.dtype] = None):
+        super().__init__()
+        self.eps = eps
+        self.output_dtype = output_dtype
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return l2norm(x, self.eps, self.output_dtype)
diff --git a/sglang/python/sglang/srt/layers/attention/fla/layernorm_gated.py b/sglang/python/sglang/srt/layers/attention/fla/layernorm_gated.py
new file mode 100644
index 0000000000000000000000000000000000000000..b30cdacdb3732347e2e5edaeebf6f61efbf9c03d
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/fla/layernorm_gated.py
@@ -0,0 +1,453 @@
+# Adapt from https://github.com/fla-org/flash-linear-attention/blob/main/fla/modules/layernorm_gated.py
+# Copyright (c) 2024, Tri Dao.
+# Based on the Triton LayerNorm tutorial: https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html
+# For the backward pass, we keep weight_grad and bias_grad in registers and accumulate.
+# This backward pass is faster for dimensions up to 8k, but after that it's much slower due to register spilling.
+# The models we train have hidden dim up to 8k anyway (e.g. Llama 70B), so this is fine.
+
+
+from functools import lru_cache
+
+import torch
+import torch.nn.functional as F
+import triton
+import triton.language as tl
+from einops import rearrange
+
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import (
+    cdiv,
+    cpu_has_amx_support,
+    device_context,
+    is_cpu,
+    is_npu,
+    next_power_of_2,
+)
+
+_is_npu = is_npu()
+_use_cpu = is_cpu() and cpu_has_amx_support()
+
+# Maximum rows per Triton block for layernorm gated kernel
+MAX_ROWS_PER_BLOCK = 4
+
+
+def rms_norm_ref(
+    x,
+    weight,
+    bias,
+    z=None,
+    eps=1e-6,
+    group_size=None,
+    norm_before_gate=True,
+    upcast=True,
+):
+    dtype = x.dtype
+    N = x.shape[-1]
+    weight = weight.float()
+    bias = bias.float() if bias is not None else None
+    if upcast:
+        x = x.float()
+        z = z.float() if z is not None else z
+    if z is not None and not norm_before_gate:
+        x = x * F.silu(z)
+    if group_size is None:
+        rstd = 1 / torch.sqrt((x.square()).mean(dim=-1, keepdim=True) + eps)
+        out = (x * rstd * weight) + bias if bias is not None else (x * rstd * weight)
+    else:
+        x_group = rearrange(x, "... (g d) -> ... g d", d=group_size)
+        rstd = 1 / torch.sqrt((x_group.square()).mean(dim=-1, keepdim=True) + eps)
+        out = rearrange(x_group * rstd, "... g d -> ... (g d)") * weight
+        if bias is not None:
+            out = out + bias
+    if z is not None and norm_before_gate:
+        out *= F.silu(z)
+    return out.to(dtype)
+
+
+@triton.jit
+def _layer_norm_fwd_1pass_kernel(
+    X,  # pointer to the input
+    Y,  # pointer to the output
+    W,  # pointer to the weights
+    B,  # pointer to the biases
+    Z,  # pointer to the other branch
+    Mean,  # pointer to the mean
+    Rstd,  # pointer to the 1/std
+    stride_x_row,  # how much to increase the pointer when moving by 1 row
+    stride_y_row,
+    stride_z_row,
+    M,  # number of rows in X
+    N: tl.constexpr,  # number of columns in X
+    eps,  # epsilon to avoid division by zero
+    BLOCK_N: tl.constexpr,
+    ROWS_PER_BLOCK: tl.constexpr,
+    HAS_BIAS: tl.constexpr,
+    HAS_Z: tl.constexpr,
+    NORM_BEFORE_GATE: tl.constexpr,
+    IS_RMS_NORM: tl.constexpr,
+    ACTIVATION: tl.constexpr,
+):
+    # Map the program id to the starting row of X and Y it should compute.
+    row_start = tl.program_id(0) * ROWS_PER_BLOCK
+    group = tl.program_id(1)
+
+    # Create 2D tile: [ROWS_PER_BLOCK, BLOCK_N]
+    rows = row_start + tl.arange(0, ROWS_PER_BLOCK)
+    cols = tl.arange(0, BLOCK_N)
+
+    # Compute offsets for 2D tile
+    row_offsets = rows[:, None] * stride_x_row
+    col_offsets = cols[None, :] + group * N
+
+    # Base pointers
+    X_base = X + row_offsets + col_offsets
+    Y_base = Y + rows[:, None] * stride_y_row + col_offsets
+
+    # Create mask for valid rows and columns
+    row_mask = rows[:, None] < M
+    col_mask = cols[None, :] < N
+    mask = row_mask & col_mask
+
+    # Load input data with 2D tile
+    x = tl.load(X_base, mask=mask, other=0.0).to(tl.float32)
+
+    if HAS_Z and not NORM_BEFORE_GATE:
+        Z_base = Z + rows[:, None] * stride_z_row + col_offsets
+        z = tl.load(Z_base, mask=mask, other=0.0).to(tl.float32)
+        if ACTIVATION == "swish" or ACTIVATION == "silu":
+            x *= z * tl.sigmoid(z)
+        elif ACTIVATION == "sigmoid":
+            x *= tl.sigmoid(z)
+
+    # Compute mean and variance per row (reduce along axis 1)
+    if not IS_RMS_NORM:
+        mean = tl.sum(x, axis=1) / N  # Shape: [ROWS_PER_BLOCK]
+        # Store mean for each row
+        mean_offsets = group * M + rows
+        mean_mask = rows < M
+        tl.store(Mean + mean_offsets, mean, mask=mean_mask)
+        # Broadcast mean back to 2D for subtraction
+        xbar = tl.where(mask, x - mean[:, None], 0.0)
+        var = tl.sum(xbar * xbar, axis=1) / N  # Shape: [ROWS_PER_BLOCK]
+    else:
+        xbar = tl.where(mask, x, 0.0)
+        var = tl.sum(xbar * xbar, axis=1) / N  # Shape: [ROWS_PER_BLOCK]
+        mean = 0.0  # Placeholder for RMS norm
+
+    rstd = tl.rsqrt(var + eps)  # Shape: [ROWS_PER_BLOCK]
+
+    # Store rstd for each row
+    rstd_offsets = group * M + rows
+    rstd_mask = rows < M
+    tl.store(Rstd + rstd_offsets, rstd, mask=rstd_mask)
+
+    # Load weights and biases (broadcast across rows)
+    w_offsets = cols + group * N
+    w_mask = cols < N
+    w = tl.load(W + w_offsets, mask=w_mask, other=0.0).to(tl.float32)
+
+    if HAS_BIAS:
+        b = tl.load(B + w_offsets, mask=w_mask, other=0.0).to(tl.float32)
+
+    # Normalize and apply linear transformation
+    if not IS_RMS_NORM:
+        x_hat = (x - mean[:, None]) * rstd[:, None]
+    else:
+        x_hat = x * rstd[:, None]
+
+    y = x_hat * w[None, :] + b[None, :] if HAS_BIAS else x_hat * w[None, :]
+
+    if HAS_Z and NORM_BEFORE_GATE:
+        Z_base = Z + rows[:, None] * stride_z_row + col_offsets
+        z = tl.load(Z_base, mask=mask, other=0.0).to(tl.float32)
+        if ACTIVATION == "swish" or ACTIVATION == "silu":
+            y *= z * tl.sigmoid(z)
+        elif ACTIVATION == "sigmoid":
+            y *= tl.sigmoid(z)
+
+    # Write output
+    tl.store(Y_base, y, mask=mask)
+
+
+@lru_cache
+def _get_sm_count(device: torch.device) -> int:
+    """Get and cache the SM count for a given device."""
+    props = torch.cuda.get_device_properties(device)
+    return props.multi_processor_count
+
+
+def calc_rows_per_block(M: int, device: torch.device) -> int:
+    # When piecewise cuda graph is enabled, use a constant value to avoid
+    # torch.compile creating guards on the dynamic batch dimension.
+    try:
+        if not get_global_server_args().disable_piecewise_cuda_graph:
+            return MAX_ROWS_PER_BLOCK
+    except ValueError:
+        # Global server args not initialized (e.g., in unit tests)
+        pass
+    sm_count = _get_sm_count(device)
+    rows_per_block = next_power_of_2(cdiv(M, 2 * sm_count))
+    rows_per_block = min(rows_per_block, MAX_ROWS_PER_BLOCK)
+    return rows_per_block
+
+
+def _layer_norm_fwd(
+    x,
+    weight,
+    bias,
+    eps,
+    z=None,
+    out=None,
+    group_size=None,
+    norm_before_gate=True,
+    is_rms_norm=False,
+    activation: str = "swish",
+):
+    M, N = x.shape
+    if group_size is None:
+        group_size = N
+    assert N % group_size == 0
+    ngroups = N // group_size
+    assert x.stride(-1) == 1
+    if z is not None:
+        assert z.stride(-1) == 1
+        assert z.shape == (M, N)
+    assert weight.shape == (N,)
+    assert weight.stride(-1) == 1
+    if bias is not None:
+        assert bias.stride(-1) == 1
+        assert bias.shape == (N,)
+    # allocate output
+    if out is not None:
+        assert out.shape == x.shape
+    else:
+        out = torch.empty_like(x)
+    assert out.stride(-1) == 1
+    mean = (
+        torch.empty((ngroups * M,), dtype=torch.float32, device=x.device)
+        if not is_rms_norm
+        else None
+    )
+    rstd = torch.empty((ngroups * M,), dtype=torch.float32, device=x.device)
+    # Less than 64KB per feature: enqueue fused kernel
+    MAX_FUSED_SIZE = 65536 // x.element_size()
+    BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(group_size))
+    if group_size > BLOCK_N:
+        raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
+    # heuristics for number of warps
+    num_warps = min(max(BLOCK_N // 256, 1), 8)
+    # Calculate rows per block based on SM count
+    rows_per_block = calc_rows_per_block(M, x.device)
+    # Update grid to use rows_per_block
+    grid = (cdiv(M, rows_per_block), ngroups)
+    with device_context(x.device):
+        _layer_norm_fwd_1pass_kernel[grid](
+            x,
+            out,
+            weight,
+            bias,
+            z,
+            mean,
+            rstd,
+            x.stride(0),
+            out.stride(0),
+            z.stride(0) if z is not None else 0,
+            M,
+            group_size,
+            eps,
+            BLOCK_N=BLOCK_N,
+            ROWS_PER_BLOCK=rows_per_block,
+            HAS_BIAS=bias is not None,
+            HAS_Z=z is not None,
+            NORM_BEFORE_GATE=norm_before_gate,
+            IS_RMS_NORM=is_rms_norm,
+            num_warps=num_warps,
+            ACTIVATION=activation,
+        )
+    return out, mean, rstd
+
+
+if _is_npu:
+    from sgl_kernel_npu.fla.layernorm_gated import layer_norm_fwd_npu as _layer_norm_fwd
+
+
+def rms_norm_gated(
+    *,
+    x,
+    weight,
+    bias,
+    z=None,
+    eps=1e-6,
+    group_size=None,
+    norm_before_gate=True,
+    is_rms_norm=False,
+    activation: str = "swish",
+):
+    """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z))"""
+
+    x_shape_og = x.shape
+    # reshape input data into 2D tensor
+    x = x.reshape(-1, x.shape[-1])
+    if x.stride(-1) != 1:
+        x = x.contiguous()
+    if z is not None:
+        assert z.shape == x_shape_og
+        z = z.reshape(-1, z.shape[-1])
+        if z.stride(-1) != 1:
+            z = z.contiguous()
+    weight = weight.contiguous()
+    if bias is not None:
+        bias = bias.contiguous()
+    if _is_npu:
+        assert activation == "swish", "NPU only supports swish activation"
+    y, mean, rstd = _layer_norm_fwd(
+        x,
+        weight,
+        bias,
+        eps,
+        z=z,
+        group_size=group_size,
+        norm_before_gate=norm_before_gate,
+        is_rms_norm=is_rms_norm,
+        activation=activation,
+    )
+    return y.reshape(x_shape_og)
+
+
+class LayerNormFn(torch.autograd.Function):
+
+    @staticmethod
+    def forward(
+        ctx,
+        x,
+        weight,
+        bias,
+        z=None,
+        eps=1e-6,
+        group_size=None,
+        norm_before_gate=True,
+        is_rms_norm=False,
+        activation: str = "swish",
+    ):
+        return rms_norm_gated(
+            x=x,
+            weight=weight,
+            bias=bias,
+            eps=eps,
+            z=z,
+            group_size=group_size,
+            norm_before_gate=norm_before_gate,
+            is_rms_norm=is_rms_norm,
+            activation=activation,
+        )
+
+
+def layernorm_fn(
+    x,
+    weight,
+    bias,
+    z=None,
+    eps=1e-6,
+    group_size=None,
+    norm_before_gate=True,
+    is_rms_norm=False,
+    activation: str = "swish",
+):
+    return LayerNormFn.apply(
+        x, weight, bias, z, eps, group_size, norm_before_gate, is_rms_norm, activation
+    )
+
+
+class LayerNorm(torch.nn.Module):
+
+    def __init__(
+        self,
+        hidden_size,
+        eps=1e-5,
+        group_size=None,
+        norm_before_gate=True,
+        device=None,
+        dtype=None,
+    ):
+        """If group_size is not None, we do GroupNorm with each group having group_size elements.
+        group_size=None is equivalent to group_size=hidden_size (i.e. there's only 1 group).
+        """
+
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.eps = eps
+        self.weight = torch.nn.Parameter(torch.empty(hidden_size, **factory_kwargs))
+        self.bias = torch.nn.Parameter(torch.empty(hidden_size, **factory_kwargs))
+        self.group_size = group_size
+        self.norm_before_gate = norm_before_gate
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        torch.nn.init.ones_(self.weight)
+        torch.nn.init.zeros_(self.bias)
+
+    def forward(self, x, z=None):
+        """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z))"""
+        return layernorm_fn(
+            x,
+            self.weight,
+            self.bias,
+            z=z,
+            group_size=self.group_size,
+            eps=self.eps,
+            norm_before_gate=self.norm_before_gate,
+            is_rms_norm=False,
+        )
+
+
+class RMSNorm(torch.nn.Module):
+
+    def __init__(
+        self,
+        hidden_size,
+        eps=1e-5,
+        group_size=None,
+        norm_before_gate=True,
+        device=None,
+        dtype=None,
+        activation: str = "swish",
+    ):
+        """If group_size is not None, we do GroupNorm with each group having group_size elements.
+        group_size=None is equivalent to group_size=hidden_size (i.e. there's only 1 group).
+        """
+        factory_kwargs = {"device": device, "dtype": dtype}
+        super().__init__()
+        self.eps = eps
+        self.activation = activation
+        self.weight = torch.nn.Parameter(torch.empty(hidden_size, **factory_kwargs))
+        self.register_parameter("bias", None)
+        self.group_size = group_size
+        self.norm_before_gate = norm_before_gate
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        torch.nn.init.ones_(self.weight)
+
+    def forward(self, x, z=None):
+        """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z))"""
+        if _use_cpu:
+            assert (
+                self.norm_before_gate
+                and self.group_size is None
+                and self.activation == "swish"
+            ), "CPU rmsnorm_gated currently only supports norm before gate without group size or activation other than swish"
+            return torch.ops.sgl_kernel.fused_rmsnorm_gated_cpu(
+                x, self.weight, z, self.eps
+            )
+        else:
+            return layernorm_fn(
+                x,
+                self.weight,
+                self.bias,
+                z=z,
+                eps=self.eps,
+                group_size=self.group_size,
+                norm_before_gate=self.norm_before_gate,
+                is_rms_norm=True,
+                activation=self.activation,
+            )
diff --git a/sglang/python/sglang/srt/layers/attention/fla/op.py b/sglang/python/sglang/srt/layers/attention/fla/op.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b3191075b75cd686a99f5944f5a55e1c1f6bbe3
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/fla/op.py
@@ -0,0 +1,66 @@
+# Adapt from https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/utils/op.py
+# -*- coding: utf-8 -*-
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+import os
+
+import triton
+import triton.language as tl
+import triton.language.extra.libdevice as tldevice
+
+from sglang.srt.layers.attention.fla.utils import is_gather_supported
+
+if os.environ.get("FLA_USE_FAST_OPS", "0") == "1":
+    exp = tldevice.fast_expf
+    exp2 = tldevice.exp2
+    log = tldevice.fast_logf
+    log2 = tldevice.fast_log2f
+else:
+    exp = tl.exp
+    exp2 = tl.math.exp2
+    log = tl.log
+    log2 = tl.log2
+
+
+@triton.jit
+def safe_exp(x):
+    return exp(tl.where(x <= 0, x, float("-inf")))
+
+
+if not is_gather_supported:
+
+    @triton.jit
+    def gather(src, index, axis, _builder=None):
+        """
+        Gather operation that works when tl.gather is not supported.
+        This is a fallback implementation that returns None.
+        Just to make triton compiler happy.
+        """
+        return None
+
+else:
+    gather = tl.gather
+
+
+if hasattr(triton.language, "_experimental_make_tensor_descriptor"):
+    # For Triton 3.3.x
+    make_tensor_descriptor = triton.language._experimental_make_tensor_descriptor
+elif hasattr(triton.language, "make_tensor_descriptor"):
+    # For Triton 3.4.x and later
+    make_tensor_descriptor = triton.language.make_tensor_descriptor
+else:
+    """
+    Fallback implementation when TMA is not supported.
+    Returns None to indicate TMA descriptors are unavailable.
+    Just make triton compiler happy.
+    """
+
+    @triton.jit
+    def make_tensor_descriptor(
+        base,
+        shape,
+        strides,
+        block_shape,
+        _builder=None,
+    ):
+        return None
diff --git a/sglang/python/sglang/srt/layers/attention/fla/solve_tril.py b/sglang/python/sglang/srt/layers/attention/fla/solve_tril.py
new file mode 100644
index 0000000000000000000000000000000000000000..4aedad34254b0ab85d320a5627c4be30b4f991e7
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/fla/solve_tril.py
@@ -0,0 +1,464 @@
+# Adapt from https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/utils/solve_tril.py
+# -*- coding: utf-8 -*-
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+from typing import Optional
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.layers.attention.fla.index import prepare_chunk_indices
+from sglang.srt.layers.attention.fla.utils import input_guard
+
+
+# @triton.autotune(
+#     configs=[
+#         triton.Config({}, num_warps=num_warps, num_stages=num_stages)
+#         for num_warps in [1, 2, 4, 8]
+#         for num_stages in [2, 3, 4, 5]
+#     ],
+#     key=["BT"],
+# )
+@triton.jit(do_not_specialize=["T"])
+def solve_tril_16x16_kernel(
+    A,
+    Ad,
+    cu_seqlens,
+    chunk_indices,
+    T,
+    H: tl.constexpr,
+    BT: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+):
+    i_t, i_bh = tl.program_id(0), tl.program_id(1)
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(
+            chunk_indices + i_t * 2 + 1
+        ).to(tl.int32)
+        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(
+            cu_seqlens + i_n + 1
+        ).to(tl.int32)
+        T = eos - bos
+    else:
+        bos, eos = i_b * T, i_b * T + T
+
+    A = A + (bos * H + i_h) * BT
+    Ad = Ad + (bos * H + i_h) * 16
+
+    offset = (i_t * 16) % BT
+    p_A = tl.make_block_ptr(
+        A, (T, BT), (H * BT, 1), (i_t * 16, offset), (16, 16), (1, 0)
+    )
+    p_Ai = tl.make_block_ptr(Ad, (T, 16), (H * 16, 1), (i_t * 16, 0), (16, 16), (1, 0))
+    b_A = tl.load(p_A, boundary_check=(0, 1)).to(tl.float32)
+    b_A = -tl.where(tl.arange(0, 16)[:, None] > tl.arange(0, 16)[None, :], b_A, 0)
+
+    o_i = tl.arange(0, 16)
+    for i in range(1, min(16, T - i_t * 16)):
+        b_a = -tl.load(A + (i_t * 16 + i) * H * BT + o_i + offset)
+        b_a = b_a + tl.sum(b_a[:, None] * b_A, 0)
+        mask = o_i == i
+        b_A = tl.where(mask[:, None], b_a, b_A)
+    b_A += o_i[:, None] == o_i[None, :]
+    tl.store(
+        p_Ai,
+        b_A.to(p_Ai.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+
+
+# @triton.autotune(
+#     configs=[
+#         triton.Config({}, num_warps=num_warps, num_stages=num_stages)
+#         for num_warps in [1, 2, 4, 8]
+#         for num_stages in [2, 3, 4, 5]
+#     ],
+#     key=["H", "BT", "IS_VARLEN"],
+# )
+@triton.jit(do_not_specialize=["T"])
+def merge_16x16_to_32x32_inverse_kernel(
+    A,
+    Ad,
+    Ai,
+    cu_seqlens,
+    chunk_indices,
+    T,
+    H: tl.constexpr,
+    BT: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+):
+    i_t, i_bh = tl.program_id(0), tl.program_id(1)
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(
+            chunk_indices + i_t * 2 + 1
+        ).to(tl.int32)
+        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(
+            cu_seqlens + i_n + 1
+        ).to(tl.int32)
+        T = eos - bos
+    else:
+        bos, eos = i_b * T, i_b * T + T
+
+    A += (bos * H + i_h) * 32
+    Ad += (bos * H + i_h) * 16
+    Ai += (bos * H + i_h) * 32
+
+    p_A_21 = tl.make_block_ptr(
+        A, (T, 32), (H * 32, 1), (i_t * 32 + 16, 0), (16, 16), (1, 0)
+    )
+    p_Ad_11 = tl.make_block_ptr(
+        Ad, (T, 16), (H * 16, 1), (i_t * 32, 0), (16, 16), (1, 0)
+    )
+    p_Ad_22 = tl.make_block_ptr(
+        Ad, (T, 16), (H * 16, 1), (i_t * 32 + 16, 0), (16, 16), (1, 0)
+    )
+    p_Ai_11 = tl.make_block_ptr(
+        Ai, (T, 32), (H * 32, 1), (i_t * 32, 0), (16, 16), (1, 0)
+    )
+    p_Ai_22 = tl.make_block_ptr(
+        Ai, (T, 32), (H * 32, 1), (i_t * 32 + 16, 16), (16, 16), (1, 0)
+    )
+    p_Ai_21 = tl.make_block_ptr(
+        Ai, (T, 32), (H * 32, 1), (i_t * 32 + 16, 0), (16, 16), (1, 0)
+    )
+
+    A_21 = tl.load(p_A_21, boundary_check=(0, 1)).to(tl.float32)
+    Ai_11 = tl.load(p_Ad_11, boundary_check=(0, 1)).to(tl.float32)
+    Ai_22 = tl.load(p_Ad_22, boundary_check=(0, 1)).to(tl.float32)
+    Ai_21 = -tl.dot(
+        tl.dot(Ai_22, A_21, input_precision="ieee"), Ai_11, input_precision="ieee"
+    )
+    tl.store(
+        p_Ai_11,
+        Ai_11.to(p_Ai_11.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_Ai_22,
+        Ai_22.to(p_Ai_22.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_Ai_21,
+        Ai_21.to(p_Ai_21.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+
+
+# @triton.autotune(
+#     configs=[
+#         triton.Config({}, num_warps=num_warps, num_stages=num_stages)
+#         for num_warps in [2, 4, 8]
+#         for num_stages in [2, 3, 4, 5]
+#     ],
+#     key=["H", "BT", "IS_VARLEN"],
+# )
+@triton.jit(do_not_specialize=["T"])
+def merge_16x16_to_64x64_inverse_kernel(
+    A,
+    Ad,
+    Ai,
+    cu_seqlens,
+    chunk_indices,
+    T,
+    H: tl.constexpr,
+    BT: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+):
+    i_t, i_bh = tl.program_id(0), tl.program_id(1)
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(
+            chunk_indices + i_t * 2 + 1
+        ).to(tl.int32)
+        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(
+            cu_seqlens + i_n + 1
+        ).to(tl.int32)
+        T = eos - bos
+    else:
+        bos, eos = i_b * T, i_b * T + T
+
+    A += (bos * H + i_h) * 64
+    Ad += (bos * H + i_h) * 16
+    Ai += (bos * H + i_h) * 64
+
+    p_A_21 = tl.make_block_ptr(
+        A, (T, 64), (H * 64, 1), (i_t * 64 + 16, 0), (16, 16), (1, 0)
+    )
+    p_A_32 = tl.make_block_ptr(
+        A, (T, 64), (H * 64, 1), (i_t * 64 + 32, 16), (16, 16), (1, 0)
+    )
+    p_A_31 = tl.make_block_ptr(
+        A, (T, 64), (H * 64, 1), (i_t * 64 + 32, 0), (16, 16), (1, 0)
+    )
+    p_A_43 = tl.make_block_ptr(
+        A, (T, 64), (H * 64, 1), (i_t * 64 + 48, 32), (16, 16), (1, 0)
+    )
+    p_A_42 = tl.make_block_ptr(
+        A, (T, 64), (H * 64, 1), (i_t * 64 + 48, 16), (16, 16), (1, 0)
+    )
+    p_A_41 = tl.make_block_ptr(
+        A, (T, 64), (H * 64, 1), (i_t * 64 + 48, 0), (16, 16), (1, 0)
+    )
+    p_Ad_11 = tl.make_block_ptr(
+        Ad, (T, 16), (H * 16, 1), (i_t * 64, 0), (16, 16), (1, 0)
+    )
+    p_Ad_22 = tl.make_block_ptr(
+        Ad, (T, 16), (H * 16, 1), (i_t * 64 + 16, 0), (16, 16), (1, 0)
+    )
+    p_Ad_33 = tl.make_block_ptr(
+        Ad, (T, 16), (H * 16, 1), (i_t * 64 + 32, 0), (16, 16), (1, 0)
+    )
+    p_Ad_44 = tl.make_block_ptr(
+        Ad, (T, 16), (H * 16, 1), (i_t * 64 + 48, 0), (16, 16), (1, 0)
+    )
+
+    A_21 = tl.load(p_A_21, boundary_check=(0, 1)).to(tl.float32)
+    A_32 = tl.load(p_A_32, boundary_check=(0, 1)).to(tl.float32)
+    A_31 = tl.load(p_A_31, boundary_check=(0, 1)).to(tl.float32)
+    A_43 = tl.load(p_A_43, boundary_check=(0, 1)).to(tl.float32)
+    A_42 = tl.load(p_A_42, boundary_check=(0, 1)).to(tl.float32)
+    A_41 = tl.load(p_A_41, boundary_check=(0, 1)).to(tl.float32)
+
+    Ai_11 = tl.load(p_Ad_11, boundary_check=(0, 1)).to(tl.float32)
+    Ai_22 = tl.load(p_Ad_22, boundary_check=(0, 1)).to(tl.float32)
+    Ai_33 = tl.load(p_Ad_33, boundary_check=(0, 1)).to(tl.float32)
+    Ai_44 = tl.load(p_Ad_44, boundary_check=(0, 1)).to(tl.float32)
+
+    Ai_21 = -tl.dot(
+        tl.dot(Ai_22, A_21, input_precision="ieee"), Ai_11, input_precision="ieee"
+    )
+    Ai_32 = -tl.dot(
+        tl.dot(Ai_33, A_32, input_precision="ieee"), Ai_22, input_precision="ieee"
+    )
+    Ai_43 = -tl.dot(
+        tl.dot(Ai_44, A_43, input_precision="ieee"), Ai_33, input_precision="ieee"
+    )
+
+    Ai_31 = -tl.dot(
+        Ai_33,
+        tl.dot(A_31, Ai_11, input_precision="ieee")
+        + tl.dot(A_32, Ai_21, input_precision="ieee"),
+        input_precision="ieee",
+    )
+    Ai_42 = -tl.dot(
+        Ai_44,
+        tl.dot(A_42, Ai_22, input_precision="ieee")
+        + tl.dot(A_43, Ai_32, input_precision="ieee"),
+        input_precision="ieee",
+    )
+    Ai_41 = -tl.dot(
+        Ai_44,
+        tl.dot(A_41, Ai_11, input_precision="ieee")
+        + tl.dot(A_42, Ai_21, input_precision="ieee")
+        + tl.dot(A_43, Ai_31, input_precision="ieee"),
+        input_precision="ieee",
+    )
+
+    p_Ai_11 = tl.make_block_ptr(
+        Ai, (T, 64), (H * 64, 1), (i_t * 64, 0), (16, 16), (1, 0)
+    )
+    p_Ai_22 = tl.make_block_ptr(
+        Ai, (T, 64), (H * 64, 1), (i_t * 64 + 16, 16), (16, 16), (1, 0)
+    )
+    p_Ai_33 = tl.make_block_ptr(
+        Ai, (T, 64), (H * 64, 1), (i_t * 64 + 32, 32), (16, 16), (1, 0)
+    )
+    p_Ai_44 = tl.make_block_ptr(
+        Ai, (T, 64), (H * 64, 1), (i_t * 64 + 48, 48), (16, 16), (1, 0)
+    )
+    p_Ai_21 = tl.make_block_ptr(
+        Ai, (T, 64), (H * 64, 1), (i_t * 64 + 16, 0), (16, 16), (1, 0)
+    )
+    p_Ai_31 = tl.make_block_ptr(
+        Ai, (T, 64), (H * 64, 1), (i_t * 64 + 32, 0), (16, 16), (1, 0)
+    )
+    p_Ai_32 = tl.make_block_ptr(
+        Ai, (T, 64), (H * 64, 1), (i_t * 64 + 32, 16), (16, 16), (1, 0)
+    )
+    p_Ai_41 = tl.make_block_ptr(
+        Ai, (T, 64), (H * 64, 1), (i_t * 64 + 48, 0), (16, 16), (1, 0)
+    )
+    p_Ai_42 = tl.make_block_ptr(
+        Ai, (T, 64), (H * 64, 1), (i_t * 64 + 48, 16), (16, 16), (1, 0)
+    )
+    p_Ai_43 = tl.make_block_ptr(
+        Ai, (T, 64), (H * 64, 1), (i_t * 64 + 48, 32), (16, 16), (1, 0)
+    )
+    tl.store(
+        p_Ai_11,
+        Ai_11.to(p_Ai_11.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_Ai_22,
+        Ai_22.to(p_Ai_22.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_Ai_33,
+        Ai_33.to(p_Ai_33.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_Ai_44,
+        Ai_44.to(p_Ai_44.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_Ai_21,
+        Ai_21.to(p_Ai_21.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_Ai_31,
+        Ai_31.to(p_Ai_31.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_Ai_32,
+        Ai_32.to(p_Ai_32.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_Ai_41,
+        Ai_41.to(p_Ai_41.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_Ai_42,
+        Ai_42.to(p_Ai_42.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_Ai_43,
+        Ai_43.to(p_Ai_43.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+
+    fill_zeros = tl.zeros((16, 16), dtype=tl.float32)
+    p_Ai_12 = tl.make_block_ptr(
+        Ai, (T, 64), (H * 64, 1), (i_t * 64, 16), (16, 16), (1, 0)
+    )
+    p_Ai_13 = tl.make_block_ptr(
+        Ai, (T, 64), (H * 64, 1), (i_t * 64, 32), (16, 16), (1, 0)
+    )
+    p_Ai_14 = tl.make_block_ptr(
+        Ai, (T, 64), (H * 64, 1), (i_t * 64, 48), (16, 16), (1, 0)
+    )
+    p_Ai_23 = tl.make_block_ptr(
+        Ai, (T, 64), (H * 64, 1), (i_t * 64 + 16, 32), (16, 16), (1, 0)
+    )
+    p_Ai_24 = tl.make_block_ptr(
+        Ai, (T, 64), (H * 64, 1), (i_t * 64 + 16, 48), (16, 16), (1, 0)
+    )
+    p_Ai_34 = tl.make_block_ptr(
+        Ai, (T, 64), (H * 64, 1), (i_t * 64 + 32, 48), (16, 16), (1, 0)
+    )
+    tl.store(
+        p_Ai_12,
+        fill_zeros.to(p_Ai_12.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_Ai_13,
+        fill_zeros.to(p_Ai_13.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_Ai_14,
+        fill_zeros.to(p_Ai_14.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_Ai_23,
+        fill_zeros.to(p_Ai_23.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_Ai_24,
+        fill_zeros.to(p_Ai_24.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+    tl.store(
+        p_Ai_34,
+        fill_zeros.to(p_Ai_34.dtype.element_ty, fp_downcast_rounding="rtne"),
+        boundary_check=(0, 1),
+    )
+
+
+@input_guard
+def solve_tril(
+    A: torch.Tensor,
+    cu_seqlens: Optional[torch.Tensor] = None,
+    output_dtype: torch.dtype = torch.float,
+) -> torch.Tensor:
+    """
+    Compute the inverse of the lower triangular matrix
+    A should be strictly lower triangular, i.e., A.triu() == 0.
+
+    Args:
+        A (torch.Tensor):
+            [B, T, H, K]
+        cu_seqlens (torch.Tensor):
+            The cumulative sequence lengths of the input tensor.
+            Default: None.
+        output_dtype (torch.dtype):
+            The dtype of the output tensor. Default: `torch.float`
+
+    Returns:
+        (I + A)^-1 with the same shape as A
+    """
+    assert A.shape[-1] in [16, 32, 64]
+
+    B, T, H, BT = A.shape
+    Ad = torch.empty(
+        B, T, H, 16, device=A.device, dtype=torch.float if BT != 16 else output_dtype
+    )
+
+    chunk_indices = (
+        prepare_chunk_indices(cu_seqlens, 16) if cu_seqlens is not None else None
+    )
+    NT = len(chunk_indices) if cu_seqlens is not None else triton.cdiv(T, 16)
+    solve_tril_16x16_kernel[NT, B * H](
+        A=A,
+        Ad=Ad,
+        cu_seqlens=cu_seqlens,
+        chunk_indices=chunk_indices,
+        T=T,
+        H=H,
+        BT=BT,
+        IS_VARLEN=cu_seqlens is not None,
+        num_warps=1,
+        num_stages=4,
+    )
+    if BT == 16:
+        return Ad
+
+    Ai = torch.empty(B, T, H, BT, device=A.device, dtype=output_dtype)
+    merge_fn = (
+        merge_16x16_to_32x32_inverse_kernel
+        if BT == 32
+        else merge_16x16_to_64x64_inverse_kernel
+    )
+    chunk_indices = (
+        prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
+    )
+    NT = len(chunk_indices) if cu_seqlens is not None else triton.cdiv(T, BT)
+    merge_fn[NT, B * H](
+        A=A,
+        Ad=Ad,
+        Ai=Ai,
+        cu_seqlens=cu_seqlens,
+        chunk_indices=chunk_indices,
+        T=T,
+        H=H,
+        BT=BT,
+        IS_VARLEN=cu_seqlens is not None,
+        num_warps=4,
+        num_stages=3,
+    )
+    return Ai
diff --git a/sglang/python/sglang/srt/layers/attention/fla/utils.py b/sglang/python/sglang/srt/layers/attention/fla/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..af6ca3d6e572278723d92624ebcb095a59732b90
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/fla/utils.py
@@ -0,0 +1,325 @@
+# Adapt from https://github.com/fla-org/flash-linear-attention/blob/main/fla/utils.py
+# -*- coding: utf-8 -*-
+
+import contextlib
+import functools
+import logging
+import os
+import sys
+from enum import Enum
+from functools import lru_cache
+from typing import Any, Callable, Dict, Literal, Optional, Tuple
+
+import torch
+import triton
+from packaging import version
+
+from sglang.srt.utils.common import torch_release
+
+logger = logging.getLogger(__name__)
+
+COMPILER_MODE = os.getenv("FLA_COMPILER_MODE") == "1"
+FLA_CI_ENV = os.getenv("FLA_CI_ENV") == "1"
+
+
+@lru_cache(maxsize=1)
+def check_environments():
+    """
+    Checks the current operating system, Triton version, and Python version,
+    issuing warnings if they don't meet recommendations.
+    This function's body only runs once due to lru_cache.
+    """
+    # Check Operating System
+    if sys.platform == "win32":
+        logger.warning(
+            "Detected Windows operating system. Triton does not have an official Windows release, "
+            "thus FLA will not be adapted for Windows, and any potential errors will not be fixed. "
+            "Please consider using a Linux environment for compatibility."
+        )
+
+    triton_version = version.parse(triton.__version__)
+    required_triton_version = version.parse("3.2.0")
+
+    if triton_version < required_triton_version:
+        logger.warning(
+            f"Current Triton version {triton_version} is below the recommended 3.2.0 version. "
+            "Errors may occur and these issues will not be fixed. "
+            "Please consider upgrading Triton."
+        )
+
+    # Check Python version
+    py_version = version.parse(f"{sys.version_info.major}.{sys.version_info.minor}")
+    required_py_version = version.parse("3.11")
+
+    if py_version < required_py_version:
+        logger.warning(
+            f"Current Python version {py_version} is below the recommended 3.11 version. "
+            "It is recommended to upgrade to Python 3.11 or higher for the best experience."
+        )
+
+    return None
+
+
+def get_abs_err(x, y):
+    return (x.detach() - y.detach()).flatten().abs().max().item()
+
+
+def get_err_ratio(x, y):
+    err = (x.detach() - y.detach()).flatten().square().mean().sqrt().item()
+    base = (x.detach()).flatten().square().mean().sqrt().item()
+    return err / (base + 1e-8)
+
+
+def assert_close(prefix, ref, tri, ratio, warning=False, err_atol=1e-6):
+    abs_atol = get_abs_err(ref, tri)
+    msg = f"{prefix} diff: {abs_atol:.6f} ratio: {get_err_ratio(ref, tri):.6f}"
+    logger.info(msg)
+    error_rate = get_err_ratio(ref, tri)
+    if abs_atol <= err_atol:
+        return
+    if warning or (FLA_CI_ENV and (error_rate < 0.01 or abs_atol <= 0.3)):
+        if error_rate > ratio:
+            import warnings
+
+            warnings.warn(msg)
+    else:
+        assert error_rate < ratio, msg
+
+
+SUPPRESS_LEVEL = int(os.getenv("GDN_RECOMPUTE_SUPPRESS_LEVEL", "0"))
+
+
+def tensor_cache(fn: Callable[..., torch.Tensor]) -> Callable[..., torch.Tensor]:
+    """
+    A decorator that caches the most recent results of a function with tensor inputs.
+    This decorator will store the output of the decorated function for the most recent set of input tensors.
+    The cache is limited to a fixed size (default is 4). When the cache is full, the oldest entry will be removed.
+    Args:
+        fn (Callable[..., torch.Tensor]):
+            The function to be decorated. It should take tensor inputs and return tensor outputs.
+    Returns:
+        Callable[..., torch.Tensor]:
+            A wrapped version of the input function with single-entry caching.
+    """
+
+    cache_entries: Tuple[Optional[Tuple], Optional[Dict], Any] = []
+    cache_size = 4
+
+    @functools.wraps(fn)
+    def wrapper(*args: Any, **kwargs: Any) -> Any:
+        nonlocal cache_entries, cache_size
+        for i, entry in enumerate(cache_entries):
+            last_args, last_kwargs, last_result = entry
+            if len(args) == len(last_args) and len(kwargs) == len(last_kwargs):
+                if all(a is b for a, b in zip(args, last_args)) and all(
+                    k in last_kwargs and v is last_kwargs[k] for k, v in kwargs.items()
+                ):
+                    cache_entries = (
+                        cache_entries[:i]
+                        + cache_entries[i + 1 :]
+                        + [(args, kwargs, last_result)]
+                    )
+                    return last_result
+
+        result = fn(*args, **kwargs)
+
+        if len(cache_entries) >= cache_size:
+            cache_entries = cache_entries[1:]
+        cache_entries.append((args, kwargs, result))
+        return result
+
+    return wrapper
+
+
+def input_guard(fn: Callable[..., torch.Tensor]) -> Callable[..., torch.Tensor]:
+    """
+    A decorator to make sure all input tensors are contiguous and set the device based on input tensors.
+    """
+
+    @functools.wraps(fn)
+    def wrapper(*args, **kwargs):
+        contiguous_args = (
+            i if not isinstance(i, torch.Tensor) else i.contiguous() for i in args
+        )
+        contiguous_kwargs = {
+            k: (v if not isinstance(v, torch.Tensor) else v.contiguous())
+            for k, v in kwargs.items()
+        }
+
+        tensor = None
+        for arg in args:
+            if isinstance(arg, torch.Tensor):
+                tensor = arg
+                break
+        if tensor is None:
+            for value in kwargs.values():
+                if isinstance(value, torch.Tensor):
+                    tensor = value
+                    break
+
+        if tensor is not None:
+            ctx = custom_device_ctx(tensor.device.index)
+        else:
+            ctx = contextlib.nullcontext()
+
+        with ctx:
+            return fn(*contiguous_args, **contiguous_kwargs)
+
+    return wrapper
+
+
+contiguous = input_guard
+
+
+def require_version(version, hint):
+    """
+    Perform a runtime check of the dependency versions, using the exact same syntax used by pip.
+    """
+
+    def decorator(fn):
+        @functools.wraps(fn)
+        def wrapper(ctx, *args, **kwargs):
+            from transformers.utils.versions import require_version
+
+            require_version(version, hint)
+            return fn(
+                ctx,
+                *(
+                    i if not isinstance(i, torch.Tensor) else i.contiguous()
+                    for i in args
+                ),
+                **{
+                    k: (v if not isinstance(v, torch.Tensor) else v.contiguous())
+                    for k, v in kwargs.items()
+                },
+            )
+
+        return wrapper
+
+    return decorator
+
+
+def checkpoint(fn):
+    def wrapper(*args, **kwargs):
+        return torch.utils.checkpoint.checkpoint(fn, *args, **kwargs)
+
+    return wrapper
+
+
+def _cpu_device_warning():
+    import warnings
+
+    warnings.warn(
+        ("Triton is not supported on current platform, roll back to CPU."), stacklevel=1
+    )
+
+
+@lru_cache(maxsize=None)
+def get_multiprocessor_count(tensor_idx: int = 0) -> int:
+    try:
+        return triton.runtime.driver.active.utils.get_device_properties(tensor_idx)[
+            "multiprocessor_count"
+        ]
+    except BaseException:
+        _cpu_device_warning()
+        return -1
+
+
+@lru_cache(maxsize=None)
+def get_available_device() -> str:
+    try:
+        return triton.runtime.driver.active.get_current_target().backend
+    except BaseException:
+        _cpu_device_warning()
+        return "cpu"
+
+
+@lru_cache(maxsize=None)
+def _check_platform() -> Literal["nvidia", "amd", "intel", "musa"]:
+    device = get_available_device()
+    if device == "cuda":
+        return "nvidia"
+    elif device == "hip":
+        return "amd"
+    elif device == "xpu":
+        return "intel"
+    else:
+        return device
+
+
+# For AMD GPUs, the triton backend is 'hip', while for Nvidia GPUs, the triton backend is 'cuda'.
+# However, the torch backend is 'cuda' for both Nvidia and AMD GPUs.
+# Therefore, we need to check the triton backend to determine the actual GPU vendor.
+device = get_available_device() if get_available_device() != "hip" else "cuda"
+device_torch_lib = getattr(torch, device)
+device_platform = _check_platform()
+
+is_amd = device_platform == "amd"
+is_intel = device_platform == "intel"
+is_nvidia = device_platform == "nvidia"
+is_intel_alchemist = is_intel and "Intel(R) Arc(TM) A" in torch.xpu.get_device_name(0)
+is_nvidia_hopper = is_nvidia and (
+    "NVIDIA H" in torch.cuda.get_device_name(0)
+    or torch.cuda.get_device_capability()[0] >= 9
+)
+use_cuda_graph = is_nvidia and os.environ.get("FLA_USE_CUDA_GRAPH", "0") == "1"
+
+# Nvidia Ampere or newer, haven't check AMD and intel yet.
+is_tf32_supported = is_nvidia and torch.cuda.get_device_capability(0)[0] >= 8
+is_gather_supported = hasattr(triton.language, "gather")
+
+
+def get_all_max_shared_mem():
+    try:
+        return [
+            triton.runtime.driver.active.utils.get_device_properties(i)[
+                "max_shared_mem"
+            ]
+            for i in range(device_torch_lib.device_count())
+        ]
+    except BaseException:
+        _cpu_device_warning()
+        return [-1]
+
+
+class Backend(Enum):
+    ADA = 101376  # RTX 4090
+    AMPERE = 166912  # A100
+    HOPPER = 232448  # H100
+    DEFAULT = 102400  # Default
+
+    @classmethod
+    def get_shared_memory(cls, arch: str) -> int:
+        try:
+            return cls[arch.upper()].value
+        except KeyError:
+            return cls.DEFAULT.value
+
+
+@lru_cache(maxsize=None)
+def check_shared_mem(arch: str = "none", tensor_idx: int = 0) -> bool:
+    try:
+        device_shared_mem_list = get_all_max_shared_mem()
+        max_shared_memory = device_shared_mem_list[tensor_idx]
+        return max_shared_memory >= Backend.get_shared_memory(arch)
+    except Exception:
+        return False
+
+
+if torch_release >= (2, 4):
+    device = "cuda" if device == "cpu" else device
+    autocast_custom_fwd = functools.partial(torch.amp.custom_fwd, device_type=device)
+    autocast_custom_bwd = functools.partial(torch.amp.custom_bwd, device_type=device)
+
+    def custom_device_ctx(index: int):
+        return device_torch_lib.device(index)
+
+else:
+    assert (
+        device == "cuda"
+    ), "Only cuda device is supported for PyTorch version < 2.4.0."
+    autocast_custom_fwd = device_torch_lib.amp.custom_fwd
+    autocast_custom_bwd = device_torch_lib.amp.custom_bwd
+
+    def custom_device_ctx(index: int):
+        return torch.cuda.device(index)
diff --git a/sglang/python/sglang/srt/layers/attention/fla/wy_fast.py b/sglang/python/sglang/srt/layers/attention/fla/wy_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..757e5621087baf03e787120d2d123834f4b2a6e4
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/fla/wy_fast.py
@@ -0,0 +1,156 @@
+# Adapt from https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/gated_delta_rule/wy_fast.py
+# -*- coding: utf-8 -*-
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+
+from typing import Optional, Tuple
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.layers.attention.fla.index import prepare_chunk_indices
+
+
+# @triton.autotune(
+#     configs=[
+#         triton.Config({}, num_warps=num_warps, num_stages=num_stages)
+#         for num_warps in [2, 4, 8]
+#         for num_stages in [2, 3, 4]
+#     ],
+#     key=["H", "K", "V", "BT", "BK", "BV", "IS_VARLEN"],
+# )
+@triton.jit(do_not_specialize=["T"])
+def recompute_w_u_fwd_kernel(
+    k,
+    v,
+    beta,
+    w,
+    u,
+    A,
+    g,
+    cu_seqlens,
+    chunk_indices,
+    T,
+    H: tl.constexpr,
+    Hg: tl.constexpr,
+    K: tl.constexpr,
+    V: tl.constexpr,
+    BT: tl.constexpr,
+    BK: tl.constexpr,
+    BV: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+):
+    i_t, i_bh = tl.program_id(0), tl.program_id(1)
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(
+            chunk_indices + i_t * 2 + 1
+        ).to(tl.int32)
+        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(
+            cu_seqlens + i_n + 1
+        ).to(tl.int32)
+        T = eos - bos
+    else:
+        bos, eos = i_b * T, i_b * T + T
+    p_beta = tl.make_block_ptr(
+        beta + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,)
+    )
+    p_g = tl.make_block_ptr(g + (bos * H + i_h), (T,), (H,), (i_t * BT,), (BT,), (0,))
+    p_A = tl.make_block_ptr(
+        A + (bos * H + i_h) * BT, (T, BT), (H * BT, 1), (i_t * BT, 0), (BT, BT), (1, 0)
+    )
+    b_beta = tl.load(p_beta, boundary_check=(0,))
+    b_A = tl.load(p_A, boundary_check=(0, 1))
+    b_g = tl.exp(tl.load(p_g, boundary_check=(0,)))
+
+    for i_v in range(tl.cdiv(V, BV)):
+        p_v = tl.make_block_ptr(
+            v + (bos * H + i_h) * V,
+            (T, V),
+            (H * V, 1),
+            (i_t * BT, i_v * BV),
+            (BT, BV),
+            (1, 0),
+        )
+        p_u = tl.make_block_ptr(
+            u + (bos * H + i_h) * V,
+            (T, V),
+            (H * V, 1),
+            (i_t * BT, i_v * BV),
+            (BT, BV),
+            (1, 0),
+        )
+        b_v = tl.load(p_v, boundary_check=(0, 1))
+        b_vb = (b_v * b_beta[:, None]).to(b_v.dtype)
+        b_u = tl.dot(b_A, b_vb, allow_tf32=False)
+        tl.store(p_u, b_u.to(p_u.dtype.element_ty), boundary_check=(0, 1))
+
+    for i_k in range(tl.cdiv(K, BK)):
+        p_k = tl.make_block_ptr(
+            k + (bos * Hg + i_h // (H // Hg)) * K,
+            (T, K),
+            (Hg * K, 1),
+            (i_t * BT, i_k * BK),
+            (BT, BK),
+            (1, 0),
+        )
+        p_w = tl.make_block_ptr(
+            w + (bos * H + i_h) * K,
+            (T, K),
+            (H * K, 1),
+            (i_t * BT, i_k * BK),
+            (BT, BK),
+            (1, 0),
+        )
+        b_k = tl.load(p_k, boundary_check=(0, 1))
+        b_kb = (b_k * b_beta[:, None] * b_g[:, None]).to(b_k.dtype)
+        b_w = tl.dot(b_A, b_kb)
+        tl.store(p_w, b_w.to(p_w.dtype.element_ty), boundary_check=(0, 1))
+
+
+def recompute_w_u_fwd(
+    k: torch.Tensor,
+    v: torch.Tensor,
+    beta: torch.Tensor,
+    g_cumsum: torch.Tensor,
+    A: torch.Tensor,
+    cu_seqlens: Optional[torch.LongTensor],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    B, T, Hg, K, V = *k.shape, v.shape[-1]
+    H = v.shape[-2]
+    BT = A.shape[-1]
+
+    chunk_indices = (
+        prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None
+    )
+    NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
+    BK = 64
+    BV = 64
+    u = torch.empty_like(v)
+    w = k.new_empty(B, T, H, K)
+    recompute_w_u_fwd_kernel[(NT, B * H)](
+        k=k,
+        v=v,
+        beta=beta,
+        w=w,
+        u=u,
+        A=A,
+        g=g_cumsum,
+        cu_seqlens=cu_seqlens,
+        chunk_indices=chunk_indices,
+        T=T,
+        H=H,
+        Hg=Hg,
+        K=K,
+        V=V,
+        BT=BT,
+        BK=BK,
+        BV=BV,
+        IS_VARLEN=cu_seqlens is not None,
+        num_warps=4,
+        num_stages=3,
+    )
+    return w, u
+
+
+fwd_recompute_w_u = recompute_w_u_fwd
diff --git a/sglang/python/sglang/srt/layers/attention/flashattention_backend.py b/sglang/python/sglang/srt/layers/attention/flashattention_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c5d67220f1f1ef87b03009886b06316917c7245
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/flashattention_backend.py
@@ -0,0 +1,2657 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Optional
+
+import numpy as np
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.configs.model_config import AttentionArch
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+from sglang.srt.layers.radix_attention import AttentionType
+from sglang.srt.mem_cache.swa_memory_pool import SWAKVPool
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.speculative.spec_info import SpecInput
+from sglang.srt.utils import get_compiler_backend
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.radix_attention import RadixAttention
+    from sglang.srt.model_executor.model_runner import ModelRunner
+
+from sgl_kernel import merge_state_v2
+from sgl_kernel.flash_attn import flash_attn_varlen_func as flash_attn_varlen_func_fa3
+from sgl_kernel.flash_attn import flash_attn_with_kvcache as flash_attn_with_kvcache_fa3
+
+flash_attn_varlen_func = flash_attn_varlen_func_fa3
+flash_attn_with_kvcache = flash_attn_with_kvcache_fa3
+
+from sglang.jit_kernel.flash_attention_v4 import (
+    flash_attn_varlen_func as flash_attn_varlen_func_fa4,
+)
+from sglang.jit_kernel.flash_attention_v4 import (
+    flash_attn_with_kvcache as flash_attn_with_kvcache_fa4,
+)
+
+
+@dataclass
+class FlashAttentionMetadata:
+    """Metadata to be init once in the model forward pass,
+    each layer's forward pass can reuse the metadata.
+
+    For each init metadata function, we will try set up them in below order
+    """
+
+    # Sequence lengths for the forward batch
+    cache_seqlens_int32: torch.Tensor = None
+    # Maximum sequence length for query
+    max_seq_len_q: int = 1
+    # Maximum sequence length for key
+    max_seq_len_k: int = 0
+    # Cumulative sequence lengths for query
+    cu_seqlens_q: torch.Tensor = None
+    # Cumulative sequence lengths for key
+    cu_seqlens_k: torch.Tensor = None
+    # Window size (typically used by Gemma)
+    window_size: tuple = (-1, -1)
+    # Page table, the index of KV Cache Tables/Blocks
+    page_table: torch.Tensor = None
+    # Page table for Sliding Window Attention
+    swa_page_table: torch.Tensor = None
+
+    # Encoder metadata
+    # Cumulative sequence lengths for encoder key
+    encoder_cu_seqlens_k: torch.Tensor = None
+    # Maximum sequence length for encoder key
+    encoder_max_seq_len_k: int = 0
+    # Sequence lengths for the forward batch
+    encoder_lens_int32: torch.Tensor = None
+    # Page table for the encoder
+    encoder_page_table: torch.Tensor = None
+
+    @dataclass
+    class LocalAttentionMetadata:
+        local_query_start_loc: torch.Tensor = None  # cu_seqlens_q for local attention
+        local_seqused_k: torch.Tensor = None  # sequence lengths for local attention
+        local_block_table: torch.Tensor = None  # block table for local attention
+        local_max_query_len: int = 0  # max query length for local attention
+        local_max_seq_len: int = 0  # max sequence length for local attention
+
+    local_attn_metadata: Optional[LocalAttentionMetadata] = None
+
+    # For sliding window attention topk>1 spec decoding
+    swa_spec_metadata: Optional[FlashAttentionMetadata] = None
+
+
+# Copied from:
+# https://github.com/houseroad/vllm/blob/4e45bfcaf928bdb9bd952b4ac922a3c205589ae8/vllm/v1/attention/backends/flash_attn.py
+#
+# Take in `query_start_loc_np` and `seq_lens_np` and break the sequences into
+# local attention blocks, where each block is passed to the attention kernel
+# as an independent local ("virtual") batch item.
+#
+# For example, if are performing a chunked prefill a batch of 3 sequences:
+#   q_seqlens  = [4, 10, 5]
+#   kv_seqlens = [6, 17, 9]
+# Then normally for regular attention we would compute with an attention mask
+#  for batch idx 0 (q_seqlens = 4, kv_seqlens = 6) like:
+#   batch idx: 0 (q_seqlens = 4, kv_seqlens = 6)
+#        k_toks >   0 1 2 3 4 5
+#        q_toks v  _____________
+#               0 | 1 1 1
+#               1 | 1 1 1 1
+#               2 | 1 1 1 1 1
+#               3 | 1 1 1 1 1 1
+#
+# for local attention (with attn_chunk_size = 4) we would compute with an
+#  attention mask like:
+#   batch idx: 0  (q_seqlens = 4, kv_seqlens = 6, attn_chunk_size = 4)
+#        k_toks >   0 1 2 3 4 5
+#        q_toks v  _____________
+#               0 | 1 1 1
+#               1 | 1 1 1 1
+#               2 |         1
+#               3 |         1 1
+#
+# We can simulate this mask using standard flash-attention by breaking the
+#  sequences into local ("virtual") batches, where each local batch item is a
+#  local attention block, so in this case batch idx 0 would be broken up into:
+#
+#   local-batch idx: 0 (q_seqlens = 2, kv_seqlens = 4)  (batch 0)
+#        k_toks >   0 1 2 3
+#        q_toks v  _____________
+#               0 | 1 1 1
+#               1 | 1 1 1 1
+#   local-batch idx: 1 (q_seqlens = 2, kv_seqlens = 2) (batch 0)
+#        k_toks >   4 5
+#        q_toks v  _____________
+#               2 | 1
+#               3 | 1 1
+#
+# e.g. if we have:
+#   attn_chunk_size = 4
+#   query_start_loc_np = [0, 4, 14, 19] (q_seqlens = [4, 10, 5])
+# Then this function would return:
+#                           __b0__  ______b1______  __b2__ < orig batch indices
+#   q_seqlens_local    = [   2,  2,  1,  4,  4,  1,  4,  1]
+#   cu_seqlens_q_local = [0, 4,  6, 10, 14, 18, 19, 23, 24]
+#   seqlens_k_local    = [   4,  2,  4,  4,  4,  1,  4,  1]
+#   block_table_local  : shape[local_virtual_batches, pages_per_local_batch]
+def make_local_attention_virtual_batches(
+    attn_chunk_size: int,
+    query_start_loc_np: np.ndarray,
+    seq_lens_np: np.ndarray,
+    block_table: torch.Tensor,
+    page_size: int = 0,
+) -> tuple[np.ndarray, np.ndarray, np.ndarray, torch.Tensor]:
+    """
+    Take in `query_start_loc_np` and `seq_lens_np` and break the sequences into
+    local attention blocks, where each block is passed to the attention kernel
+    as an independent local ("virtual") batch item.
+
+    Args:
+        attn_chunk_size: Size of local attention chunks
+        query_start_loc_np: Cumulative sum of query lengths (numpy array)
+        seq_lens_np: Sequence lengths (numpy array)
+        block_table: Block table for KV cache
+        page_size: Size of each page in the KV cache
+
+    Returns:
+        seqlens_q_local: Query sequence lengths for local attention
+        cu_seqlens_q_local: Cumulative sum of query sequence lengths for local attention
+        seqlens_k_local: Key sequence lengths for local attention
+        block_table_local: Block table for local attention
+    """
+    # Adjust attention_chunk_size based on the actual sequence length
+    # to avoid index out of bounds errors
+    max_seq_len = seq_lens_np.max()
+    effective_chunk_size = min(attn_chunk_size, max_seq_len)
+    # Make sure effective_chunk_size is divisible by page_size
+    effective_chunk_size = (effective_chunk_size // page_size) * page_size
+    if effective_chunk_size < page_size:
+        effective_chunk_size = page_size
+    attn_chunk_size = effective_chunk_size
+
+    q_seqlens = query_start_loc_np[1:] - query_start_loc_np[:-1]
+    actual_batch_size = seq_lens_np.shape[0]
+
+    # Handle if we are starting in the middle of a local attention block,
+    #  we assume q_seqlens > 0 (for all elements), for each batch idx we compute
+    #  the number of tokens that are not in the first local attention block and
+    #  then we can simply use a cdiv for the rest.
+    # For example if we have:
+    #   attn_chunk_size = 4
+    #   q_seqlens = [4, 10, 5]
+    #   k_seqlens = [6, 17, 9]
+    # Then we would get:
+    #   new_tokens_in_first_block = [2, 1, 4]
+    #   local_blocks = [2, 4, 2]
+    q_tokens_in_first_block = np.minimum(
+        attn_chunk_size - ((seq_lens_np - q_seqlens) % attn_chunk_size), q_seqlens
+    ).astype(np.int32)
+    tokens_in_last_block = attn_chunk_size + (seq_lens_np % -attn_chunk_size)
+    local_blocks = 1 + cdiv(q_seqlens - q_tokens_in_first_block, attn_chunk_size)
+
+    # Once we know the number of local blocks we can compute the request spans
+    #  for each batch idx, we can figure out the number of "virtual" requests we
+    #  have to make,
+    # For the above example we would get:
+    #   seqlens_q_local = [2, 2, 1, 4, 4, 1, 4, 1]
+    #
+    # First Get batched arange. (E.g., [2, 4, 2] -> [0, 1, 0, 1, 2, 3, 0, 1])
+    #   (TODO: max a utility to share this code with _prepare_inputs)
+    # arange step 1. [2, 4, 2] -> [2, 6, 8]
+    cu_num_blocks = np.cumsum(local_blocks)
+    virtual_batches = cu_num_blocks[-1]
+    # arange step 2. [2, 6, 8] -> [0, 0, 2, 2, 2, 2, 6, 6]
+    block_offsets = np.repeat(cu_num_blocks - local_blocks, local_blocks)
+    # arange step 3. [0, 1, 0, 1, 2, 3, 0, 1]
+    arange = np.arange(virtual_batches, dtype=np.int32) - block_offsets
+    # also compute reverse arange (i.e. [1, 0, 3, 2, 1, 0, 1, 0])
+    rarange = np.repeat(local_blocks, local_blocks) - arange - 1
+    # Then we can compute the seqlens_q_local, handling the fact that the
+    #  first and last blocks could be partial
+    seqlens_q_local = np.repeat(q_seqlens - q_tokens_in_first_block, local_blocks)
+    # set the first block since this may be a partial block
+    seqlens_q_local[arange == 0] = q_tokens_in_first_block
+    # set the remaining blocks
+    seqlens_q_local[arange > 0] = np.minimum(
+        seqlens_q_local - attn_chunk_size * (arange - 1), attn_chunk_size
+    )[arange > 0]
+
+    # convert from q_seqlens to cu_seqlens_q
+    cu_seqlens_q_local = np.pad(np.cumsum(seqlens_q_local), (1, 0)).astype(np.int32)
+
+    # compute the seqlens_k_local,
+    #  basically a full local attention block for all but the last block in each
+    #  batch
+    # For our example this will be:
+    #   seqlens_k_local = [4, 2, 4, 4, 4, 1, 4, 1]
+    seqlens_k_local = np.full(cu_num_blocks[-1], attn_chunk_size, dtype=np.int32)
+    seqlens_k_local[cu_num_blocks - 1] = tokens_in_last_block
+
+    k_seqstarts_absolute = np.repeat(seq_lens_np, local_blocks) - (
+        rarange * attn_chunk_size + np.repeat(tokens_in_last_block, local_blocks)
+    )
+    # For the example the local attention blocks start at:
+    #                           _b0_  _____b1_____  _b2_
+    #   k_seqstarts_absolute = [0, 4, 4, 8, 12, 16, 4, 8]
+    block_starts = k_seqstarts_absolute // page_size
+
+    assert attn_chunk_size % page_size == 0, (
+        f"attn_chunk_size {attn_chunk_size} is not "
+        f"divisible by page_size {page_size}"
+    )
+    pages_per_local_batch = attn_chunk_size // page_size
+
+    # Create a block_table for the local attention blocks
+    # For out example if we have a block-table like (assuming page_size=2):
+    #   block_table = [
+    #     [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],  < batch 0
+    #     [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],  < batch 1
+    #     [20, 21, 22, 23, 24, 25, 26, 27, 28, 29],  < batch 2
+    #   ]
+    # Then for the local batches we would want a block-table like
+    #   block_table_local = [
+    #     [  0,  1 ], < local-batch 0, (batch 0, starting from k[0])
+    #     [  2,  3 ], < local-batch 1, (batch 0, starting from k[4])
+    #     [ 12, 13 ], < local-batch 2, (batch 1, starting from k[4])
+    #     [ 14, 15 ], < local-batch 3, (batch 1, starting from k[8])
+    #     [ 16, 17 ], < local-batch 4, (batch 1, starting from k[12])
+    #     [ 18, 19 ], < local-batch 5, (batch 1, starting from k[16])
+    #     [ 22, 23 ], < local-batch 6, (batch 2, starting from k[4])
+    #     [ 24, 25 ], < local-batch 7, (batch 2, starting from k[8])
+    #   ]
+    block_indices = np.broadcast_to(
+        np.arange(pages_per_local_batch, dtype=np.int32),
+        (virtual_batches, pages_per_local_batch),
+    ) + np.expand_dims(block_starts, axis=1)
+    # Ensure block_indices doesn't exceed block_table dimensions
+    # This is a critical safety check that prevents index out of bounds errors
+    # when dealing with large sequences (>8192 tokens) or when the block_table
+    # dimensions are smaller than what would be needed for the full attention chunk size.
+    block_indices = block_indices.flatten().clip(max=block_table.shape[1] - 1)
+    batch_indices = np.repeat(
+        np.arange(actual_batch_size, dtype=np.int32),
+        local_blocks * pages_per_local_batch,
+    )
+
+    # NOTE: https://github.com/pytorch/pytorch/pull/160256 causes performance
+    # regression when using numpy arrays (batch and block indices) to index into
+    # torch tensor (block_table). As a workaround, convert numpy arrays to torch
+    # tensor first, which recovers perf.
+    batch_indices_torch = torch.from_numpy(batch_indices)
+    block_indices_torch = torch.from_numpy(block_indices)
+    block_table_local = block_table[batch_indices_torch, block_indices_torch].view(
+        virtual_batches, -1
+    )
+
+    return seqlens_q_local, cu_seqlens_q_local, seqlens_k_local, block_table_local
+
+
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+# TODO(hebiao064): remove this once we have a better way to handle the merge_state_v2 torch.compile issue
+@torch._dynamo.disable()
+def merge_state_v2_wrapper(o, s_a, o_exp, s_b):
+    return merge_state_v2(o, s_a, o_exp, s_b)
+
+
+class FlashAttentionBackend(AttentionBackend):
+    """FlashAttention backend implementation.
+
+    Note about the init:
+    - If no spec decoding
+        - FlashAttentionBackend will be init once when the server starts.
+    - If spec decoding
+        - FlashAttentionBackend will be init once for the target worker
+        - FlashAttentionMultiStepBackend will be once for the draft worker
+            - It will spawn num_steps FlashAttentionBackend for the draft worker
+
+    Note about CUDA Graph:
+    - We only support CUDA Graph for Decode (Normal Decode and Draft Decode) and Target Verify.
+    - We don't support CUDA Graph for Extend and Draft Extend.
+    - When server init, init_cuda_graph_state will be called first and then init_cuda_graph_capture will be called.
+    - For each forward batch, init_replay_cuda_graph will be called first and then replay the graph.
+    """
+
+    def __init__(
+        self,
+        model_runner: ModelRunner,
+        skip_prefill: bool = False,
+        speculative_step_id=0,
+        topk=0,
+        speculative_num_steps=0,
+        fa_impl_ver=3,
+    ):
+        super().__init__()
+
+        assert not (
+            model_runner.sliding_window_size is not None
+            and model_runner.model_config.is_encoder_decoder
+        ), "Sliding window and cross attention are not supported together"
+
+        self.is_encoder_decoder = model_runner.model_config.is_encoder_decoder
+        self.forward_metadata: FlashAttentionMetadata = None
+        # extra metadata for handling speculative decoding topk > 1, extended draft decode and verify
+        self.forward_metadata_spec_decode_expand: FlashAttentionMetadata = None
+        self.max_context_len = model_runner.model_config.context_len
+        self.device = model_runner.device
+        self.decode_cuda_graph_metadata = {}
+        self.target_verify_metadata = {}
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+        self.kv_cache_dtype = model_runner.kv_cache_dtype
+        self.kv_cache_dtype_str = model_runner.server_args.kv_cache_dtype
+        self.page_size = model_runner.page_size
+        self.use_mla = model_runner.model_config.attention_arch == AttentionArch.MLA
+        self.skip_prefill = skip_prefill
+
+        self.use_sliding_window_kv_pool = (
+            isinstance(model_runner.token_to_kv_pool, SWAKVPool)
+            and model_runner.token_to_kv_pool.swa_layer_nums > 0
+        )
+
+        if self.use_sliding_window_kv_pool:
+            self.token_to_kv_pool = model_runner.token_to_kv_pool
+
+        self.topk = model_runner.server_args.speculative_eagle_topk or 0
+        self.speculative_num_steps = speculative_num_steps
+        self.speculative_num_draft_tokens = (
+            model_runner.server_args.speculative_num_draft_tokens
+        )
+        self.speculative_step_id = speculative_step_id
+
+        self.fa_impl_ver = fa_impl_ver
+
+        # Local attention settings
+        self.has_local_attention = model_runner.model_config.is_local_attention_model
+        if self.has_local_attention:
+            assert (
+                model_runner.attention_chunk_size is not None
+            ), "Attention chunk size is required for local attention"
+            self.attention_chunk_size = model_runner.attention_chunk_size
+
+        # For each layer, the sliding_window_size can be different. This is only used for preparing SWA metadata.
+        # We use `layer.sliding_window_size` to decide whether to use SWA for each layer.
+        self.sliding_window_size = model_runner.sliding_window_size
+        self.has_swa = (
+            self.sliding_window_size is not None and self.sliding_window_size > -1
+        )
+
+        # If num_splits == 0, we use a heuristic to automatically determine the number of splits.
+        # We set nums splits to 1 if deterministic inference is enabled.
+        # See https://thinkingmachines.ai/blog/defeating-nondeterminism-in-llm-inference/ for more details.
+        # Furthermore, FA4 does not support num_splits=0 with CUDA Graph, so we set num_splits to 1 if CUDA Graph is enabled.
+        self.num_splits = (
+            1
+            if model_runner.server_args.enable_deterministic_inference
+            or (
+                self.fa_impl_ver == 4
+                and not model_runner.server_args.disable_cuda_graph
+            )
+            else 0
+        )
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        """Initialize forward metadata hence all layers in the forward pass can reuse it."""
+        metadata = FlashAttentionMetadata()
+        seqlens_in_batch = forward_batch.seq_lens
+        batch_size = forward_batch.batch_size
+        device = seqlens_in_batch.device
+
+        if forward_batch.forward_mode.is_decode_or_idle():
+            # Draft Decode
+            if forward_batch.spec_info is not None:
+                if self.topk <= 1:
+                    metadata.cache_seqlens_int32 = (
+                        seqlens_in_batch + (self.speculative_step_id + 1)
+                    ).to(torch.int32)
+                    metadata.max_seq_len_k = forward_batch.seq_lens_cpu.max().item() + (
+                        self.speculative_step_id + 1
+                    )
+                    metadata.cu_seqlens_q = torch.arange(
+                        0, batch_size + 1, dtype=torch.int32, device=device
+                    )
+                    metadata.cu_seqlens_k = torch.nn.functional.pad(
+                        torch.cumsum(
+                            metadata.cache_seqlens_int32, dim=0, dtype=torch.int32
+                        ),
+                        (1, 0),
+                    )
+                    metadata.page_table = forward_batch.req_to_token_pool.req_to_token[
+                        forward_batch.req_pool_indices, : metadata.max_seq_len_k
+                    ]
+                else:
+                    metadata.cache_seqlens_int32 = (seqlens_in_batch).to(torch.int32)
+                    metadata.max_seq_len_q = self.topk
+                    metadata.max_seq_len_k = forward_batch.seq_lens_cpu.max().item()
+                    metadata.cu_seqlens_q = torch.arange(
+                        0,
+                        batch_size * self.topk + 1,
+                        step=self.topk,
+                        dtype=torch.int32,
+                        device=device,
+                    )
+                    metadata.cu_seqlens_k = torch.nn.functional.pad(
+                        torch.cumsum(
+                            metadata.cache_seqlens_int32, dim=0, dtype=torch.int32
+                        ),
+                        (1, 0),
+                    )
+                    metadata.page_table = forward_batch.req_to_token_pool.req_to_token[
+                        forward_batch.req_pool_indices, : metadata.max_seq_len_k
+                    ]
+                    metadata_expand = FlashAttentionMetadata()
+                    decode_length = self.speculative_step_id + 1
+                    metadata_expand.cache_seqlens_int32 = torch.full(
+                        (seqlens_in_batch.numel() * self.topk,),
+                        decode_length,
+                        device=device,
+                        dtype=torch.int32,
+                    )
+                    metadata_expand.max_seq_len_q = 1
+                    metadata_expand.cu_seqlens_q = torch.arange(
+                        0,
+                        metadata_expand.cache_seqlens_int32.numel() + 1,
+                        dtype=torch.int32,
+                        device=device,
+                    )
+                    metadata_expand.cu_seqlens_k = torch.arange(
+                        0,
+                        metadata_expand.cache_seqlens_int32.numel() * decode_length + 1,
+                        step=decode_length,
+                        dtype=torch.int32,
+                        device=device,
+                    )
+                    # shape: [bs, num_steps, topk] -> [bs x topk, num_steps]
+                    cache_loc = forward_batch.out_cache_loc.view(
+                        -1, self.speculative_num_steps
+                    )
+                    metadata_expand.page_table = (
+                        cache_loc[:, :decode_length].contiguous().to(torch.int32)
+                    )
+                    self.forward_metadata_spec_decode_expand = metadata_expand
+            else:
+                # Normal Decode
+                metadata.cache_seqlens_int32 = seqlens_in_batch.to(torch.int32)
+                metadata.max_seq_len_k = forward_batch.seq_lens_cpu.max().item()
+                metadata.cu_seqlens_q = torch.arange(
+                    0, batch_size + 1, dtype=torch.int32, device=device
+                )
+                metadata.cu_seqlens_k = torch.nn.functional.pad(
+                    torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)
+                )
+                metadata.page_table = forward_batch.req_to_token_pool.req_to_token[
+                    forward_batch.req_pool_indices, : metadata.max_seq_len_k
+                ]
+            # TODO: we need to test this part for llama 4 eagle case
+            self._maybe_init_local_attn_metadata(forward_batch, metadata, device)
+        elif forward_batch.forward_mode.is_target_verify():
+            if self.topk <= 1:
+                metadata.cache_seqlens_int32 = (
+                    forward_batch.seq_lens + self.speculative_num_draft_tokens
+                ).to(torch.int32)
+                metadata.max_seq_len_q = self.speculative_num_draft_tokens
+                metadata.max_seq_len_k = (
+                    forward_batch.seq_lens_cpu.max().item()
+                    + self.speculative_num_draft_tokens
+                )
+                metadata.cu_seqlens_q = torch.arange(
+                    0,
+                    batch_size * self.speculative_num_draft_tokens + 1,
+                    self.speculative_num_draft_tokens,
+                    dtype=torch.int32,
+                    device=device,
+                )
+                metadata.cu_seqlens_k = torch.nn.functional.pad(
+                    torch.cumsum(
+                        metadata.cache_seqlens_int32, dim=0, dtype=torch.int32
+                    ),
+                    (1, 0),
+                )
+                metadata.page_table = forward_batch.req_to_token_pool.req_to_token[
+                    forward_batch.req_pool_indices, : metadata.max_seq_len_k
+                ]
+
+                self._maybe_init_local_attn_metadata(forward_batch, metadata, device)
+            else:
+                metadata.cache_seqlens_int32 = forward_batch.seq_lens.to(torch.int32)
+                metadata.max_seq_len_q = self.speculative_num_draft_tokens
+                metadata.max_seq_len_k = forward_batch.seq_lens_cpu.max().item()
+                metadata.cu_seqlens_q = torch.arange(
+                    0,
+                    batch_size * self.speculative_num_draft_tokens + 1,
+                    step=self.speculative_num_draft_tokens,
+                    dtype=torch.int32,
+                    device=device,
+                )
+                metadata.cu_seqlens_k = torch.nn.functional.pad(
+                    torch.cumsum(
+                        metadata.cache_seqlens_int32, dim=0, dtype=torch.int32
+                    ),
+                    (1, 0),
+                )
+                metadata.page_table = forward_batch.req_to_token_pool.req_to_token[
+                    forward_batch.req_pool_indices, : metadata.max_seq_len_k
+                ]
+
+                metadata_expand = FlashAttentionMetadata()
+
+                metadata_expand.max_seq_len_q = 1
+                metadata_expand.cu_seqlens_q = torch.arange(
+                    0,
+                    forward_batch.seq_lens.numel() * self.speculative_num_draft_tokens
+                    + 1,
+                    dtype=torch.int32,
+                    device=device,
+                )
+
+                # create expand page table
+                offsets = torch.arange(
+                    self.speculative_num_draft_tokens, device=device
+                ).unsqueeze(
+                    0
+                )  # shape: (1, self.speculative_num_draft_tokens)
+                cols = offsets.expand(
+                    forward_batch.seq_lens.numel(), -1
+                ) + forward_batch.seq_lens.unsqueeze(1)
+                cum_len = torch.nn.functional.pad(
+                    torch.cumsum(
+                        (
+                            forward_batch.seq_lens + self.speculative_num_draft_tokens
+                        ).repeat_interleave(self.speculative_num_draft_tokens),
+                        dim=0,
+                    ),
+                    (1, 0),
+                )[:-1]
+                mask_extraction_indices = (
+                    cols.repeat_interleave(self.speculative_num_draft_tokens, dim=0)
+                    + cum_len[:, None]
+                ).view(1, -1)
+                mask = forward_batch.spec_info.custom_mask[
+                    mask_extraction_indices
+                ].view(
+                    -1, self.speculative_num_draft_tokens
+                )  # (bsz * draft_num, draft_num)
+
+                # shift table indices to avoid padding
+                # non_masked_page_table [[8, 9, 10],   mask (display with int format) [[1, 0, 0],
+                #                        [8, 9, 10],                                   [1, 1, 0],
+                #                        [8, 9, 10]]                                   [1, 0, 1]]
+                # if masked with padding [[8, 0, 0],   our mask without padding       [[8, 9, 10],
+                #                        [8, 9, 0],                                    [8, 9, 10],
+                #                        [8, 0, 10]]                                   [8, 10, 9]]
+                # note here cache_seqlens_int32 is [1, 2, 2] so extra page indices will be ignored in each row
+                col_indices = offsets.expand(
+                    mask.shape[0], self.speculative_num_draft_tokens
+                )
+                # Build keys: if an entry is valid (mask==True), keep its original index;
+                # if not, add self.speculative_num_draft_tokens so that it sorts after all valid entries.
+                keys = torch.where(
+                    mask, col_indices, col_indices + self.speculative_num_draft_tokens
+                )
+                _, sort_order = torch.sort(keys, dim=1)
+                non_masked_page_table = (
+                    forward_batch.req_to_token_pool.req_to_token[
+                        forward_batch.req_pool_indices, :
+                    ]
+                    .gather(1, cols)
+                    .repeat_interleave(self.speculative_num_draft_tokens, dim=0)
+                )  # (bsz, draft_num)
+                metadata_expand.page_table = non_masked_page_table.gather(1, sort_order)
+                metadata_expand.cache_seqlens_int32 = mask.sum(dim=1).to(torch.int32)
+                metadata_expand.cu_seqlens_k = torch.nn.functional.pad(
+                    torch.cumsum(
+                        metadata_expand.cache_seqlens_int32, dim=0, dtype=torch.int32
+                    ),
+                    (1, 0),
+                )
+                self.forward_metadata_spec_decode_expand = metadata_expand
+
+                if self.has_swa:
+                    self._init_sliding_window_attn_spec_metadata(
+                        metadata, metadata_expand
+                    )
+
+        elif forward_batch.forward_mode.is_extend_or_draft_extend_or_mixed(
+            include_draft_extend_v2=True
+        ):
+            metadata.cache_seqlens_int32 = seqlens_in_batch.to(torch.int32)
+            metadata.max_seq_len_k = forward_batch.seq_lens_cpu.max().item()
+            metadata.cu_seqlens_k = torch.nn.functional.pad(
+                torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)
+            )
+            metadata.page_table = forward_batch.req_to_token_pool.req_to_token[
+                forward_batch.req_pool_indices, : metadata.max_seq_len_k
+            ]
+
+            if any(
+                forward_batch.extend_prefix_lens_cpu
+            ) or forward_batch.forward_mode.is_draft_extend(include_v2=True):
+                extend_seq_lens = forward_batch.extend_seq_lens
+                metadata.max_seq_len_q = max(forward_batch.extend_seq_lens_cpu)
+                metadata.cu_seqlens_q = torch.nn.functional.pad(
+                    torch.cumsum(extend_seq_lens, dim=0, dtype=torch.int32), (1, 0)
+                )
+            else:
+                metadata.max_seq_len_q = metadata.max_seq_len_k
+                metadata.cu_seqlens_q = metadata.cu_seqlens_k
+
+            # Setup local attention if enabled
+            if forward_batch.forward_mode == ForwardMode.EXTEND:
+                self._maybe_init_local_attn_metadata(forward_batch, metadata, device)
+
+        # Encoder metadata for cross attention
+        if forward_batch.encoder_lens is not None:
+            assert (
+                forward_batch.encoder_lens.numel() == 1
+            ), "Only encoder size 1 is supported for now"
+
+            metadata.encoder_lens_int32 = forward_batch.encoder_lens.to(torch.int32)
+            metadata.encoder_cu_seqlens_k = torch.nn.functional.pad(
+                torch.cumsum(metadata.encoder_lens_int32, dim=0, dtype=torch.int32),
+                (1, 0),
+            )
+            metadata.encoder_max_seq_len_k = metadata.encoder_lens_int32.max().item()
+            metadata.encoder_page_table = forward_batch.req_to_token_pool.req_to_token[
+                forward_batch.req_pool_indices, : metadata.encoder_max_seq_len_k
+            ]
+
+            # Currently only support forward_batch.encoder_lens.numel() == 1
+            metadata.page_table = forward_batch.req_to_token_pool.req_to_token[
+                forward_batch.req_pool_indices,
+                metadata.encoder_max_seq_len_k : (
+                    metadata.encoder_max_seq_len_k + metadata.max_seq_len_k
+                ),
+            ]
+
+        if self.use_sliding_window_kv_pool:
+            metadata.swa_page_table = (
+                self.token_to_kv_pool.translate_loc_from_full_to_swa(
+                    metadata.page_table
+                )
+            )
+
+        # Convert the page table to a strided format which is needed by FA3 API
+        if self.page_size > 1:
+            self.strided_indices = torch.arange(
+                0, metadata.page_table.shape[1], self.page_size, device=self.device
+            )
+
+            if self.use_sliding_window_kv_pool:
+                metadata.swa_page_table = (
+                    metadata.swa_page_table[:, self.strided_indices] // self.page_size
+                )
+
+            metadata.page_table = (
+                metadata.page_table[:, self.strided_indices] // self.page_size
+            )
+
+            if (
+                self.topk > 1
+                and forward_batch.forward_mode.is_decode_or_idle()
+                and forward_batch.spec_info is not None
+            ):
+                # Modifies cache_seqlens_int32 and page_table(B, speculative_num_steps).
+                last_page_lens = forward_batch.seq_lens % self.page_size
+                # First attention handles prefix - last_page_len part.
+                metadata.cache_seqlens_int32 -= last_page_lens  # Both (B, )
+
+                # Second attention handles last_page_len + decode part.
+                expanded_last_page_lens = last_page_lens.repeat_interleave(self.topk)
+                self.forward_metadata_spec_decode_expand.cache_seqlens_int32 += (
+                    expanded_last_page_lens
+                )
+                # NOTE: the max decode length is speculative_num_steps - 1 (one token always generated by draft extend)
+                # and we leave one extra for last_page_len, which -> speculative_num_steps for the page table
+                expand_page_table = torch.zeros(
+                    forward_batch.batch_size * self.topk,
+                    self.speculative_num_steps,
+                    dtype=torch.int32,
+                    device=self.device,
+                )
+                # shape: [bs, num_steps, topk] -> [bs x topk, num_steps]
+                cache_loc = forward_batch.out_cache_loc.view(
+                    -1, self.speculative_num_steps
+                )
+                draft_decode_set_expand_metadata(
+                    cache_seqlens_int32=self.forward_metadata_spec_decode_expand.cache_seqlens_int32,
+                    page_table=expand_page_table,
+                    last_page_lens=last_page_lens,
+                    decode_length=decode_length,
+                    cache_loc=cache_loc,
+                    topk=self.topk,
+                    page_size=self.page_size,
+                )
+                self.forward_metadata_spec_decode_expand.page_table = expand_page_table
+
+        self.forward_metadata = metadata
+
+    def forward_extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+        # For multi-head latent attention
+        q_rope: Optional[torch.Tensor] = None,
+        k_rope: Optional[torch.Tensor] = None,
+        sinks: Optional[torch.Tensor] = None,
+    ):
+        if k is not None:
+            assert v is not None
+            if save_kv_cache:
+                cache_loc = (
+                    forward_batch.out_cache_loc
+                    if not layer.is_cross_attention
+                    else forward_batch.encoder_out_cache_loc
+                )
+                if not self.use_mla:
+                    forward_batch.token_to_kv_pool.set_kv_buffer(
+                        layer, cache_loc, k, v, layer.k_scale, layer.v_scale
+                    )
+                else:
+                    forward_batch.token_to_kv_pool.set_mla_kv_buffer(
+                        layer,
+                        cache_loc,
+                        k,
+                        k_rope,
+                    )
+
+        # Use precomputed metadata across all layers
+        metadata = self.forward_metadata
+
+        # Calculate window size (can be moved to metadata if layer properties don't change)
+        # we don't do layer.sliding_window_size - 1 since in model.get_attention_sliding_window_size() we already - 1
+        # here is two side inclusive
+        is_swa_layer = (
+            layer.sliding_window_size is not None and layer.sliding_window_size > -1
+        )
+        window_size = (layer.sliding_window_size, 0) if is_swa_layer else (-1, -1)
+        k_descale, v_descale = None, None
+        # only use kv scaling if: 1) fp8 kv is explicitly enabled, 2) RadixAttention
+        # has corresponding quantization method so that layer.k_scale is not None,
+        # 3) layer.head_dim <= 256 since fa3 kernel require fp16 and bf16 data type in this case,
+        # 4) fa_impl_ver != 4 since fa4 does not currently support fp8 queries and keys.
+        if (
+            self.kv_cache_dtype_str != "auto"
+            and layer.head_dim <= 256
+            and self.fa_impl_ver != 4
+        ):
+            if layer.k_scale is not None:
+                descale_shape = (forward_batch.batch_size, layer.tp_k_head_num)
+                k_descale = layer.k_scale.expand(descale_shape)
+                v_descale = layer.v_scale.expand(descale_shape)
+            q = q.to(self.kv_cache_dtype)
+            q_rope = q_rope.to(self.kv_cache_dtype) if q_rope is not None else None
+            k_rope = k_rope.to(self.kv_cache_dtype) if k_rope is not None else None
+        causal = True
+        if layer.is_cross_attention or layer.attn_type == AttentionType.ENCODER_ONLY:
+            causal = False
+
+        # Check if we should use local attention
+        use_local_attn = (
+            self.has_local_attention
+            and self.attention_chunk_size is not None
+            and metadata.local_attn_metadata is not None
+            and (hasattr(layer, "use_irope") and layer.use_irope)
+        )
+
+        # We do cascade attention for Target Verify with topk > 1
+        # We don't use cascade attention for Sliding Window Attention:
+        # - Different window sizes should be passed in for each q in the first stage of cascade attention, but FA3 interface doesn't support pass in a list of window sizes.
+        # - The overhead of duplicated computation of the common prefix part is small for sliding window layers (seq_len <= window_size), so we can just expand it.
+        use_cascade_attn = (
+            forward_batch.forward_mode.is_target_verify()
+            and self.topk > 1
+            and not is_swa_layer
+        )
+
+        flash_attn_varlen_func_base = flash_attn_varlen_func_fa3
+        flash_attn_with_kvcache_base = flash_attn_with_kvcache_fa3
+
+        flash_attn_varlen_func = (
+            flash_attn_varlen_func_fa4
+            if self.fa_impl_ver == 4
+            else flash_attn_varlen_func_base
+        )
+        flash_attn_with_kvcache = (
+            flash_attn_with_kvcache_fa4
+            if self.fa_impl_ver == 4
+            else flash_attn_with_kvcache_base
+        )
+
+        kwargs = {}
+        if sinks is not None:
+            kwargs["sinks"] = sinks
+
+        # Get the appropriate page table based on whether we're using local attention
+        if use_local_attn:
+            local_metadata = metadata.local_attn_metadata
+            page_table = local_metadata.local_block_table
+            cu_seqlens_q = local_metadata.local_query_start_loc
+            cache_seqlens = local_metadata.local_seqused_k
+            max_seqlen_q = local_metadata.local_max_query_len
+        elif is_swa_layer and metadata.swa_spec_metadata is not None:
+            swa_spec_metadata = metadata.swa_spec_metadata
+            page_table = swa_spec_metadata.page_table
+            cu_seqlens_q = swa_spec_metadata.cu_seqlens_q
+            cache_seqlens = swa_spec_metadata.cache_seqlens_int32
+            max_seqlen_q = swa_spec_metadata.max_seq_len_q
+            cu_seqlens_k = swa_spec_metadata.cu_seqlens_k
+        else:
+            page_table = metadata.page_table
+            if is_swa_layer and self.use_sliding_window_kv_pool:
+                if metadata.swa_page_table is not None:
+                    page_table = metadata.swa_page_table
+                else:
+                    page_table = self.token_to_kv_pool.translate_loc_from_full_to_swa(
+                        metadata.page_table
+                    )
+            cu_seqlens_q = metadata.cu_seqlens_q
+            cache_seqlens = metadata.cache_seqlens_int32
+            max_seqlen_q = metadata.max_seq_len_q
+            cu_seqlens_k = metadata.cu_seqlens_k
+
+        # Use Flash Attention for prefill
+        if not self.use_mla:
+            # Do multi-head attention
+            key_cache, value_cache = forward_batch.token_to_kv_pool.get_kv_buffer(
+                layer.layer_id
+            )
+            key_cache = key_cache.view(
+                -1, self.page_size, layer.tp_k_head_num, layer.head_dim
+            )
+            value_cache = value_cache.view(
+                -1, self.page_size, layer.tp_v_head_num, layer.v_head_dim
+            )
+            if layer.is_cross_attention:
+                page_table = metadata.encoder_page_table
+                cache_seqlens = metadata.encoder_lens_int32
+                cu_seqlens_k = metadata.encoder_cu_seqlens_k
+                window_size = (-1, -1)
+
+            result = flash_attn_with_kvcache(
+                q=q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim),
+                k_cache=key_cache,
+                v_cache=value_cache,
+                page_table=page_table,
+                cache_seqlens=cache_seqlens,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_k_new=cu_seqlens_k if not use_local_attn else None,
+                max_seqlen_q=max_seqlen_q,
+                softmax_scale=layer.scaling,
+                causal=False if use_cascade_attn else causal,
+                window_size=window_size,
+                softcap=layer.logit_cap,
+                k_descale=k_descale,
+                v_descale=v_descale,
+                return_softmax_lse=use_cascade_attn,
+                num_splits=self.num_splits,
+                **kwargs,
+            )
+
+            if use_cascade_attn:
+                o, softmax_lse, *rest = result
+                o_expand, softmax_lse_expand, *rest_expand = flash_attn_with_kvcache(
+                    q=q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim),
+                    # Here metadata_expand.page_table is not divided with page_size.
+                    # This is because we loose the fine control of  what token to attend,
+                    # but has to attend to some block completely.
+                    k_cache=key_cache.view(-1, 1, layer.tp_k_head_num, layer.head_dim),
+                    v_cache=value_cache.view(
+                        -1, 1, layer.tp_v_head_num, layer.head_dim
+                    ),
+                    page_table=self.forward_metadata_spec_decode_expand.page_table,
+                    cache_seqlens=self.forward_metadata_spec_decode_expand.cache_seqlens_int32,
+                    cu_seqlens_q=self.forward_metadata_spec_decode_expand.cu_seqlens_q,
+                    cu_seqlens_k_new=self.forward_metadata_spec_decode_expand.cu_seqlens_k,
+                    max_seqlen_q=self.forward_metadata_spec_decode_expand.max_seq_len_q,
+                    softmax_scale=layer.scaling,
+                    causal=False,
+                    window_size=window_size,
+                    softcap=layer.logit_cap,
+                    k_descale=k_descale,
+                    v_descale=v_descale,
+                    return_softmax_lse=True,
+                    num_splits=self.num_splits,
+                    **kwargs,
+                )
+                o, _ = merge_state_v2_wrapper(
+                    o,
+                    softmax_lse.T.contiguous(),
+                    o_expand,
+                    softmax_lse_expand.T.contiguous(),
+                )
+            else:
+                o = result
+        else:
+            if (
+                forward_batch.attn_attend_prefix_cache is not None
+                and not forward_batch.forward_mode.is_target_verify()
+                and not forward_batch.forward_mode.is_draft_extend(include_v2=True)
+            ):
+                # Do multi-head attention with chunked prefix cache
+                if forward_batch.attn_attend_prefix_cache:
+                    assert not get_global_server_args().disable_chunked_prefix_cache
+                    # MHA for chunked prefix kv cache when running model with MLA
+                    assert forward_batch.prefix_chunk_idx is not None
+                    assert forward_batch.prefix_chunk_cu_seq_lens is not None
+                    assert forward_batch.prefix_chunk_max_seq_lens is not None
+
+                    chunk_idx = forward_batch.prefix_chunk_idx
+                    assert chunk_idx >= 0
+
+                    assert forward_batch.mha_return_lse
+                    output = flash_attn_varlen_func(
+                        q=q.view(-1, layer.tp_q_head_num, layer.head_dim),
+                        k=k.view(-1, layer.tp_k_head_num, layer.head_dim).to(q.dtype),
+                        v=v.view(-1, layer.tp_k_head_num, layer.v_head_dim).to(q.dtype),
+                        cu_seqlens_q=metadata.cu_seqlens_q,
+                        cu_seqlens_k=forward_batch.prefix_chunk_cu_seq_lens[chunk_idx],
+                        max_seqlen_q=metadata.max_seq_len_q,
+                        max_seqlen_k=forward_batch.prefix_chunk_max_seq_lens[chunk_idx],
+                        softmax_scale=layer.scaling,
+                        causal=False,
+                        return_softmax_lse=True,
+                        **kwargs,
+                    )
+                else:
+                    # MHA for extend part of sequence without attending prefix kv cache
+                    cu_seqlens_k = (
+                        metadata.cu_seqlens_q
+                        if not forward_batch.mha_one_shot
+                        else metadata.cu_seqlens_k
+                    )
+                    max_seqlen_k = (
+                        metadata.max_seq_len_q
+                        if not forward_batch.mha_one_shot
+                        else metadata.max_seq_len_k
+                    )
+                    output = flash_attn_varlen_func(
+                        q=q.view(-1, layer.tp_q_head_num, layer.head_dim),
+                        k=k.view(-1, layer.tp_k_head_num, layer.head_dim).to(q.dtype),
+                        v=v.view(-1, layer.tp_k_head_num, layer.v_head_dim).to(q.dtype),
+                        cu_seqlens_q=metadata.cu_seqlens_q,
+                        cu_seqlens_k=cu_seqlens_k,
+                        max_seqlen_q=metadata.max_seq_len_q,
+                        max_seqlen_k=max_seqlen_k,
+                        softmax_scale=layer.scaling,
+                        causal=True,
+                        return_softmax_lse=forward_batch.mha_return_lse,
+                        **kwargs,
+                    )
+                if forward_batch.mha_return_lse:
+                    output, lse, *rest = output
+                    lse = torch.transpose(lse, 0, 1).contiguous()
+                    return output, lse
+                return output
+            else:
+                assert self.fa_impl_ver in [3], "Only FA3 support here"
+                # Do absorbed multi-latent attention
+                kv_cache = forward_batch.token_to_kv_pool.get_key_buffer(
+                    layer.layer_id
+                ).to(q.dtype)
+                k_rope = kv_cache[:, :, layer.v_head_dim :]
+                c_kv = kv_cache[:, :, : layer.v_head_dim]
+                k_rope_cache = k_rope.view(
+                    -1,
+                    self.page_size,
+                    layer.tp_k_head_num,
+                    layer.head_dim - layer.v_head_dim,
+                )
+                c_kv_cache = c_kv.view(
+                    -1, self.page_size, layer.tp_v_head_num, layer.v_head_dim
+                )
+                if q_rope is not None:
+                    q_nope = q.view(-1, layer.tp_q_head_num, layer.v_head_dim)
+                    q_rope = q_rope.view(
+                        -1, layer.tp_q_head_num, layer.head_dim - layer.v_head_dim
+                    )
+                else:
+                    q_all = q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim)
+                    q_nope = q_all[:, :, : layer.v_head_dim]
+                    q_rope = q_all[:, :, layer.v_head_dim :]
+
+                result = flash_attn_with_kvcache(
+                    q=q_rope,
+                    k_cache=k_rope_cache,
+                    v_cache=c_kv_cache,
+                    qv=q_nope,
+                    page_table=page_table,
+                    cache_seqlens=cache_seqlens,
+                    cu_seqlens_q=cu_seqlens_q,
+                    cu_seqlens_k_new=cu_seqlens_k if not use_local_attn else None,
+                    max_seqlen_q=max_seqlen_q,
+                    softmax_scale=layer.scaling,
+                    causal=False if use_cascade_attn else causal,
+                    softcap=layer.logit_cap,
+                    k_descale=k_descale,
+                    v_descale=v_descale,
+                    return_softmax_lse=use_cascade_attn,
+                    num_splits=self.num_splits,
+                )
+                if use_cascade_attn:
+                    o, softmax_lse, *rest = result
+                    o_expand, softmax_lse_expand, *rest_expand = (
+                        flash_attn_with_kvcache(
+                            q=q_rope,
+                            k_cache=k_rope_cache,
+                            v_cache=c_kv_cache,
+                            qv=q_nope,
+                            page_table=self.forward_metadata_spec_decode_expand.page_table,
+                            cache_seqlens=self.forward_metadata_spec_decode_expand.cache_seqlens_int32,
+                            cu_seqlens_q=self.forward_metadata_spec_decode_expand.cu_seqlens_q,
+                            cu_seqlens_k_new=self.forward_metadata_spec_decode_expand.cu_seqlens_k,
+                            max_seqlen_q=self.forward_metadata_spec_decode_expand.max_seq_len_q,
+                            softmax_scale=layer.scaling,
+                            causal=False,
+                            window_size=window_size,
+                            softcap=layer.logit_cap,
+                            k_descale=k_descale,
+                            v_descale=v_descale,
+                            return_softmax_lse=True,
+                            num_splits=self.num_splits,
+                        )
+                    )
+                    o, _ = merge_state_v2_wrapper(
+                        o,
+                        softmax_lse.T.contiguous(),
+                        o_expand,
+                        softmax_lse_expand.T.contiguous(),
+                    )
+                else:
+                    o = result
+
+        return o.view(-1, layer.tp_q_head_num * layer.v_head_dim)
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+        # For multi-head latent attention
+        q_rope: Optional[torch.Tensor] = None,
+        k_rope: Optional[torch.Tensor] = None,
+        sinks: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if k is not None:
+            assert v is not None
+            if save_kv_cache:
+                cache_loc = (
+                    forward_batch.out_cache_loc
+                    if not layer.is_cross_attention
+                    else forward_batch.encoder_out_cache_loc
+                )
+                if not self.use_mla:
+                    forward_batch.token_to_kv_pool.set_kv_buffer(
+                        layer, cache_loc, k, v, layer.k_scale, layer.v_scale
+                    )
+                else:
+                    forward_batch.token_to_kv_pool.set_mla_kv_buffer(
+                        layer,
+                        cache_loc,
+                        k,
+                        k_rope,
+                    )
+
+        # Use precomputed metadata across all layers
+        metadata = self.forward_metadata
+        local_attn_metadata = getattr(metadata, "local_attn_metadata", None)
+        use_local_attn = (
+            self.has_local_attention
+            and self.attention_chunk_size is not None
+            and local_attn_metadata is not None
+            and (hasattr(layer, "use_irope") and layer.use_irope)
+        )
+
+        # When Spec Decode enabled, forward_decode would be called with two mode:
+        # 1. DRAFT_DECODE: we enable cascade attention when top_k > 1
+        # 2. IDLE: we don’t need cascade attention, spec_info will be none in this case
+        use_cascade_attn = forward_batch.spec_info is not None and self.topk > 1
+
+        # Calculate window size (can be moved to metadata if layer properties don't change)
+        # we don't do layer.sliding_window_size - 1 since in model.get_attention_sliding_window_size() we already - 1
+        # here is two side inclusive
+        is_swa_layer = (
+            layer.sliding_window_size is not None and layer.sliding_window_size > -1
+        )
+        window_size = (layer.sliding_window_size, 0) if is_swa_layer else (-1, -1)
+
+        causal = True
+        if layer.is_cross_attention or layer.attn_type == AttentionType.ENCODER_ONLY:
+            causal = False
+
+        kwargs = {}
+        if sinks is not None:
+            kwargs["sinks"] = sinks
+
+        flash_attn_with_kvcache_base = flash_attn_with_kvcache_fa3
+
+        flash_attn_with_kvcache = (
+            flash_attn_with_kvcache_fa4
+            if self.fa_impl_ver == 4
+            else flash_attn_with_kvcache_base
+        )
+
+        k_descale, v_descale = None, None
+        # only use kv scaling if: 1) fp8 kv is explicitly enabled, 2) RadixAttention
+        # has corresponding quantization method so that layer.k_scale is not None,
+        # 3) layer.head_dim <= 256 since fa3 kernel require fp16 and bf16 data type in this case.
+        if self.kv_cache_dtype_str != "auto" and layer.head_dim <= 256:
+            if layer.k_scale is not None:
+                descale_shape = (forward_batch.batch_size, layer.tp_k_head_num)
+                k_descale = layer.k_scale.expand(descale_shape)
+                v_descale = layer.v_scale.expand(descale_shape)
+            q = q.to(self.kv_cache_dtype)
+            q_rope = q_rope.to(self.kv_cache_dtype) if q_rope is not None else None
+            k_rope = k_rope.to(self.kv_cache_dtype) if k_rope is not None else None
+        if not self.use_mla:
+            # Do multi-head attention
+
+            key_cache, value_cache = forward_batch.token_to_kv_pool.get_kv_buffer(
+                layer.layer_id
+            )
+            key_cache = key_cache.view(
+                -1, self.page_size, layer.tp_k_head_num, layer.head_dim
+            )
+            value_cache = value_cache.view(
+                -1, self.page_size, layer.tp_v_head_num, layer.v_head_dim
+            )
+
+            if layer.is_cross_attention:
+                # Always use non-chunked logic for cross-attention
+                o = flash_attn_with_kvcache(
+                    q=q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim),
+                    k_cache=key_cache,
+                    v_cache=value_cache,
+                    page_table=metadata.encoder_page_table,
+                    cache_seqlens=metadata.encoder_lens_int32,
+                    cu_seqlens_q=metadata.cu_seqlens_q,
+                    cu_seqlens_k_new=metadata.encoder_cu_seqlens_k,
+                    max_seqlen_q=1,
+                    softmax_scale=layer.scaling,
+                    causal=False,
+                    window_size=(-1, -1),
+                    softcap=layer.logit_cap,
+                    k_descale=k_descale,
+                    v_descale=v_descale,
+                    num_splits=self.num_splits,
+                    **kwargs,
+                )
+            elif use_local_attn:
+                # Use chunked (local) attention batching for self-attention
+                o = flash_attn_with_kvcache(
+                    q=q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim),
+                    k_cache=key_cache,
+                    v_cache=value_cache,
+                    page_table=local_attn_metadata.local_block_table,
+                    cache_seqlens=local_attn_metadata.local_seqused_k,
+                    cu_seqlens_q=local_attn_metadata.local_query_start_loc,
+                    cu_seqlens_k_new=None,
+                    max_seqlen_q=local_attn_metadata.local_max_query_len,
+                    softmax_scale=layer.scaling,
+                    causal=True,
+                    window_size=(-1, -1),
+                    softcap=layer.logit_cap,
+                    k_descale=k_descale,
+                    v_descale=v_descale,
+                    num_splits=self.num_splits,
+                    **kwargs,
+                )
+            else:
+                page_table = metadata.page_table
+                if is_swa_layer and self.use_sliding_window_kv_pool:
+                    if metadata.swa_page_table is not None:
+                        page_table = metadata.swa_page_table
+                    else:
+                        page_table = (
+                            self.token_to_kv_pool.translate_loc_from_full_to_swa(
+                                metadata.page_table
+                            )
+                        )
+                cache_seqlens = metadata.cache_seqlens_int32
+                cu_seqlens_k = metadata.cu_seqlens_k
+                max_seqlen_q = metadata.max_seq_len_q
+                q_reshaped = q.contiguous().view(
+                    -1, layer.tp_q_head_num, layer.head_dim
+                )
+
+                # Default: single-token self-attention
+                result = flash_attn_with_kvcache(
+                    q=q_reshaped,
+                    k_cache=key_cache,
+                    v_cache=value_cache,
+                    page_table=page_table,
+                    cache_seqlens=cache_seqlens,
+                    cu_seqlens_q=metadata.cu_seqlens_q,
+                    max_seqlen_q=max_seqlen_q,
+                    softmax_scale=layer.scaling,
+                    causal=False if use_cascade_attn else causal,
+                    window_size=window_size,
+                    softcap=layer.logit_cap,
+                    k_descale=k_descale,
+                    v_descale=v_descale,
+                    return_softmax_lse=use_cascade_attn,
+                    num_splits=self.num_splits,
+                    **kwargs,
+                )
+                if use_cascade_attn:
+                    o, softmax_lse, *rest = result
+                    o_expand, softmax_lse_expand, *rest_expand = (
+                        flash_attn_with_kvcache(
+                            q=q_reshaped,
+                            k_cache=key_cache,
+                            v_cache=value_cache,
+                            page_table=self.forward_metadata_spec_decode_expand.page_table,
+                            cache_seqlens=self.forward_metadata_spec_decode_expand.cache_seqlens_int32,
+                            cu_seqlens_q=self.forward_metadata_spec_decode_expand.cu_seqlens_q,
+                            cu_seqlens_k_new=self.forward_metadata_spec_decode_expand.cu_seqlens_k,
+                            max_seqlen_q=self.forward_metadata_spec_decode_expand.max_seq_len_q,
+                            softmax_scale=layer.scaling,
+                            causal=False,
+                            window_size=window_size,
+                            softcap=layer.logit_cap,
+                            k_descale=k_descale,
+                            v_descale=v_descale,
+                            return_softmax_lse=True,
+                            num_splits=self.num_splits,
+                            **kwargs,
+                        )
+                    )
+                    o, _ = merge_state_v2(
+                        o,
+                        softmax_lse.T.contiguous(),
+                        o_expand,
+                        softmax_lse_expand.T.contiguous(),
+                    )
+                else:
+                    o = result
+        else:
+            # Do absorbed multi-latent attention
+            kv_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id).to(
+                q.dtype
+            )
+            k_rope = kv_cache[:, :, layer.v_head_dim :]
+            c_kv = kv_cache[:, :, : layer.v_head_dim]
+            k_rope_cache = k_rope.view(
+                -1,
+                self.page_size,
+                layer.tp_k_head_num,
+                layer.head_dim - layer.v_head_dim,
+            )
+            c_kv_cache = c_kv.view(
+                -1, self.page_size, layer.tp_v_head_num, layer.v_head_dim
+            )
+
+            if q_rope is not None:
+                q_nope = q.view(-1, layer.tp_q_head_num, layer.v_head_dim)
+                q_rope = q_rope.view(
+                    -1, layer.tp_q_head_num, layer.head_dim - layer.v_head_dim
+                )
+            else:
+                q_all = q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim)
+                q_nope = q_all[:, :, : layer.v_head_dim]
+                q_rope = q_all[:, :, layer.v_head_dim :]
+            max_seqlen_q = metadata.max_seq_len_q
+
+            result = flash_attn_with_kvcache(
+                q=q_rope,
+                k_cache=k_rope_cache,
+                v_cache=c_kv_cache,
+                qv=q_nope,
+                page_table=metadata.page_table,
+                cache_seqlens=metadata.cache_seqlens_int32,
+                cu_seqlens_q=metadata.cu_seqlens_q,
+                cu_seqlens_k_new=metadata.cu_seqlens_k,
+                max_seqlen_q=max_seqlen_q,
+                softmax_scale=layer.scaling,
+                causal=False if use_cascade_attn else causal,
+                softcap=layer.logit_cap,
+                k_descale=k_descale,
+                v_descale=v_descale,
+                return_softmax_lse=use_cascade_attn,  # softmax_lse is needed for merge states
+                num_splits=self.num_splits,
+            )
+            if use_cascade_attn:
+                o, softmax_lse, *rest = result
+                o_expand, softmax_lse_expand, *rest_expand = flash_attn_with_kvcache(
+                    q=q_rope,
+                    k_cache=k_rope_cache,
+                    v_cache=c_kv_cache,
+                    qv=q_nope,
+                    page_table=self.forward_metadata_spec_decode_expand.page_table,
+                    cache_seqlens=self.forward_metadata_spec_decode_expand.cache_seqlens_int32,
+                    cu_seqlens_q=self.forward_metadata_spec_decode_expand.cu_seqlens_q,
+                    cu_seqlens_k_new=self.forward_metadata_spec_decode_expand.cu_seqlens_k,
+                    max_seqlen_q=self.forward_metadata_spec_decode_expand.max_seq_len_q,
+                    softmax_scale=layer.scaling,
+                    causal=False,
+                    window_size=window_size,
+                    softcap=layer.logit_cap,
+                    k_descale=k_descale,
+                    v_descale=v_descale,
+                    return_softmax_lse=True,
+                    num_splits=self.num_splits,
+                )
+                o, _ = merge_state_v2(
+                    o,
+                    softmax_lse.T.contiguous(),
+                    o_expand,
+                    softmax_lse_expand.T.contiguous(),
+                )
+            else:
+                o = result
+
+        return o.view(-1, layer.tp_q_head_num * layer.v_head_dim)
+
+    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
+        """Initialize CUDA graph state for the attention backend.
+
+        Args:
+            max_bs (int): Maximum batch size to support in CUDA graphs
+
+        This creates fixed-size tensors that will be reused during CUDA graph replay
+        to avoid memory allocations.
+        """
+        max_num_pages = (self.max_context_len + self.page_size - 1) // self.page_size
+
+        # This is being used by normal decode and draft decode when topk == 1
+        self.decode_cuda_graph_metadata = {
+            "cache_seqlens": torch.zeros(max_bs, dtype=torch.int32, device=self.device),
+            "cu_seqlens_q": torch.arange(
+                0, max_bs + 1, dtype=torch.int32, device=self.device
+            ),
+            "cu_seqlens_k": torch.zeros(
+                max_bs + 1, dtype=torch.int32, device=self.device
+            ),
+            "page_table": torch.zeros(
+                max_bs,
+                max_num_pages,
+                dtype=torch.int32,
+                device=self.device,
+            ),
+            "strided_indices": torch.arange(
+                0, self.max_context_len, self.page_size, device=self.device
+            ),
+        }
+        # Only allocate local attention buffers if local attention is enabled
+        # This prevents OOM errors when local attention is not being used
+        if self.has_local_attention:
+            # Estimate maximum sizes for local attention metadata
+            max_seq_len = self.max_context_len
+            page_size = self.page_size or 1
+            attn_chunk_size = self.attention_chunk_size
+            max_virtual_batches = max_bs * (
+                (max_seq_len + attn_chunk_size - 1) // attn_chunk_size
+            )
+            max_pages_per_block = (attn_chunk_size + page_size - 1) // page_size
+
+            self.decode_cuda_graph_local_attn_metadata = {
+                "local_query_start_loc": torch.zeros(
+                    max_virtual_batches + 1, dtype=torch.int32, device=self.device
+                ),
+                "local_seqused_k": torch.zeros(
+                    max_virtual_batches, dtype=torch.int32, device=self.device
+                ),
+                "local_block_table": torch.zeros(
+                    max_virtual_batches,
+                    max_pages_per_block,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+            }
+
+        if self.use_sliding_window_kv_pool:
+            self.decode_cuda_graph_metadata["swa_page_table"] = torch.zeros(
+                max_bs,
+                max_num_pages,
+                dtype=torch.int32,
+                device=self.device,
+            )
+
+        # This is used by draft decode's first half of metadata when topk > 1
+        if self.topk > 1:
+            self.draft_decode_metadata_topk_normal = {
+                "cache_seqlens": torch.zeros(
+                    max_bs, dtype=torch.int32, device=self.device
+                ),
+                "cu_seqlens_q": torch.arange(
+                    0,
+                    max_bs * self.topk + 1,
+                    step=self.topk,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+                "cu_seqlens_k": torch.zeros(
+                    max_bs + 1, dtype=torch.int32, device=self.device
+                ),
+                "page_table": torch.zeros(
+                    max_bs,
+                    self.max_context_len,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+            }
+
+            # This is used by draft decode's second half of metadata when topk > 1
+            decode_length = self.speculative_step_id + 1
+            self.draft_decode_metadata_topk_expand = {
+                "cache_seqlens": torch.full(
+                    (max_bs * self.topk,),
+                    decode_length,
+                    device=self.device,
+                    dtype=torch.int32,
+                ),
+                "cu_seqlens_q": torch.arange(
+                    0,
+                    max_bs * self.topk + 1,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+                "cu_seqlens_k": torch.arange(
+                    0,
+                    max_bs * self.topk * decode_length + 1,
+                    step=decode_length,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+                "page_table": torch.zeros(
+                    max_bs * self.topk,
+                    decode_length + 1,  # Additional page for last partial page
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+            }
+
+        if (
+            self.speculative_num_draft_tokens is not None
+            and self.speculative_num_draft_tokens > 0
+        ):
+            # "page_table_draft_decode" will be set only when spec decoding enabled to save memory
+            self.decode_cuda_graph_metadata["page_table_draft_decode"] = torch.zeros(
+                max_bs,
+                max_num_pages,
+                dtype=torch.int32,
+                device=self.device,
+            )
+
+            self.target_verify_metadata = {
+                "cache_seqlens": torch.zeros(
+                    max_bs, dtype=torch.int32, device=self.device
+                ),
+                "cu_seqlens_q": torch.arange(
+                    0,
+                    max_bs * self.speculative_num_draft_tokens + 1,
+                    step=self.speculative_num_draft_tokens,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+                "cu_seqlens_k": torch.zeros(
+                    max_bs + 1, dtype=torch.int32, device=self.device
+                ),
+                "page_table": torch.zeros(
+                    max_bs,
+                    max_num_pages,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+                "strided_indices": torch.arange(
+                    0, self.max_context_len, self.page_size, device=self.device
+                ),
+            }
+
+            self.draft_extend_metadata = {
+                "cache_seqlens": torch.zeros(
+                    max_bs, dtype=torch.int32, device=self.device
+                ),
+                "cu_seqlens_q": torch.zeros(
+                    max_bs + 1,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+                "cu_seqlens_k": torch.zeros(
+                    max_bs + 1, dtype=torch.int32, device=self.device
+                ),
+                "page_table": torch.zeros(
+                    max_bs,
+                    max_num_pages,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+                "strided_indices": torch.arange(
+                    0, self.max_context_len, self.page_size, device=self.device
+                ),
+            }
+
+        if self.topk > 1:
+            self.target_verify_metadata_topk_normal = {
+                "cache_seqlens": torch.zeros(
+                    max_bs, dtype=torch.int32, device=self.device
+                ),
+                "cu_seqlens_q": torch.arange(
+                    0,
+                    max_bs * self.speculative_num_draft_tokens + 1,
+                    step=self.speculative_num_draft_tokens,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+                "cu_seqlens_k": torch.zeros(
+                    max_bs + 1, dtype=torch.int32, device=self.device
+                ),
+                "page_table": torch.zeros(
+                    max_bs,
+                    self.max_context_len,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+            }
+
+            self.target_verify_metadata_topk_expand = {
+                "cache_seqlens": torch.zeros(
+                    max_bs * self.speculative_num_draft_tokens,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+                "cu_seqlens_k": torch.zeros(
+                    max_bs * self.speculative_num_draft_tokens + 1,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+                "cu_seqlens_q": torch.arange(
+                    0,
+                    max_bs * self.speculative_num_draft_tokens + 1,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+                "page_table": torch.zeros(
+                    max_bs * self.speculative_num_draft_tokens,
+                    self.speculative_num_draft_tokens,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+            }
+
+            if self.has_swa:
+                self.target_verify_metadata_topk_swa = {
+                    "cache_seqlens": torch.zeros(
+                        max_bs * self.speculative_num_draft_tokens,
+                        dtype=torch.int32,
+                        device=self.device,
+                    ),
+                    "cu_seqlens_k": torch.zeros(
+                        max_bs * self.speculative_num_draft_tokens + 1,
+                        dtype=torch.int32,
+                        device=self.device,
+                    ),
+                    "cu_seqlens_q": torch.arange(
+                        0,
+                        max_bs * self.speculative_num_draft_tokens + 1,
+                        dtype=torch.int32,
+                        device=self.device,
+                    ),
+                    "page_table": torch.zeros(
+                        max_bs * self.speculative_num_draft_tokens,
+                        self.max_context_len,
+                        dtype=torch.int32,
+                        device=self.device,
+                    ),
+                }
+
+        # Only allocate encoder metadata for encoder-decoder models
+        if self.is_encoder_decoder:
+            self.encoder_metadata = {
+                "encoder_page_table": torch.zeros(
+                    max_bs,
+                    self.max_context_len,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+                "encoder_lens_int32": torch.zeros(
+                    max_bs, dtype=torch.int32, device=self.device
+                ),
+                "encoder_cu_seqlens_k": torch.zeros(
+                    max_bs + 1, dtype=torch.int32, device=self.device
+                ),
+            }
+        else:
+            # For decoder-only models, skip encoder_metadata allocation
+            self.encoder_metadata = {}
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInput],
+    ):
+        """Initialize forward metadata for capturing CUDA graph."""
+        metadata = FlashAttentionMetadata()
+
+        # metadata_expand is needed for Spec Decoding when top k > 1
+        metadata_expand = FlashAttentionMetadata()
+
+        device = seq_lens.device
+        if forward_mode.is_decode_or_idle():
+            if spec_info is not None:
+                # Draft Decode
+                if self.topk <= 1:
+                    # When topk = 1, we use the normal decode metadata
+                    metadata.cache_seqlens_int32 = self.decode_cuda_graph_metadata[
+                        "cache_seqlens"
+                    ][:bs]
+                    metadata.max_seq_len_k = seq_lens.max().item() + (
+                        self.speculative_step_id + 1
+                    )
+                    metadata.cu_seqlens_q = self.decode_cuda_graph_metadata[
+                        "cu_seqlens_q"
+                    ][: bs + 1]
+                    metadata.cu_seqlens_k = torch.nn.functional.pad(
+                        torch.cumsum(
+                            metadata.cache_seqlens_int32, dim=0, dtype=torch.int32
+                        ),
+                        (1, 0),
+                    )
+                    metadata.page_table = self.decode_cuda_graph_metadata[
+                        "page_table_draft_decode"
+                    ][:bs, :]
+                    self.decode_cuda_graph_metadata[bs] = metadata
+                else:
+                    # When top k > 1, we need two specific draft decode metadata, and then merge states
+                    # 1. The first half of metadata for prefix tokens
+                    metadata.cache_seqlens_int32 = (
+                        self.draft_decode_metadata_topk_normal["cache_seqlens"][:bs]
+                    )
+                    metadata.max_seq_len_q = self.topk
+                    metadata.max_seq_len_k = seq_lens.max().item()
+                    metadata.cu_seqlens_q = self.draft_decode_metadata_topk_normal[
+                        "cu_seqlens_q"
+                    ][: bs + 1]
+                    metadata.cu_seqlens_k = self.draft_decode_metadata_topk_normal[
+                        "cu_seqlens_k"
+                    ][: bs + 1]
+                    metadata.page_table = self.draft_decode_metadata_topk_normal[
+                        "page_table"
+                    ][:bs, :]
+
+                    # 2. The second half of metadata for draft tokens (per_batch_num_tokens = topk)
+                    metadata_expand.cache_seqlens_int32 = (
+                        self.draft_decode_metadata_topk_expand["cache_seqlens"][
+                            : bs * self.topk
+                        ]
+                    )
+                    metadata_expand.max_seq_len_q = 1
+                    metadata_expand.cu_seqlens_q = (
+                        self.draft_decode_metadata_topk_expand["cu_seqlens_q"][
+                            : bs * self.topk + 1
+                        ]
+                    )
+                    metadata_expand.cu_seqlens_k = (
+                        self.draft_decode_metadata_topk_expand["cu_seqlens_k"][
+                            : bs * self.topk + 1
+                        ]
+                    )
+                    metadata_expand.page_table = self.draft_decode_metadata_topk_expand[
+                        "page_table"
+                    ][: bs * self.topk]
+                    self.draft_decode_metadata_topk_normal[bs] = metadata
+                    self.draft_decode_metadata_topk_expand[bs] = metadata_expand
+            else:
+                # Normal Decode
+                # Get sequence information
+                metadata.cache_seqlens_int32 = seq_lens.to(torch.int32)
+                batch_size = len(seq_lens)
+                device = seq_lens.device
+                metadata.cu_seqlens_k = torch.nn.functional.pad(
+                    torch.cumsum(seq_lens, dim=0, dtype=torch.int32), (1, 0)
+                )
+                # Precompute maximum sequence length
+                metadata.max_seq_len_k = seq_lens.max().item()
+                # Precompute page table
+                metadata.page_table = self.decode_cuda_graph_metadata["page_table"][
+                    :bs, :
+                ]
+                if self.use_sliding_window_kv_pool:
+                    metadata.swa_page_table = self.decode_cuda_graph_metadata[
+                        "swa_page_table"
+                    ][:bs, :]
+                # Precompute cumulative sequence lengths
+                metadata.cu_seqlens_q = torch.arange(
+                    0, batch_size + 1, dtype=torch.int32, device=device
+                )
+                self.decode_cuda_graph_metadata[bs] = metadata
+
+                self._maybe_update_local_attn_metadata_for_capture(metadata, batch_size)
+
+        elif forward_mode.is_target_verify():
+            if self.topk <= 1:
+                metadata.cache_seqlens_int32 = self.target_verify_metadata[
+                    "cache_seqlens"
+                ][:bs]
+                metadata.cache_seqlens_int32.copy_(
+                    (seq_lens + self.speculative_num_draft_tokens)
+                )
+
+                metadata.max_seq_len_q = self.speculative_num_draft_tokens
+                metadata.max_seq_len_k = (
+                    seq_lens.max().item() + self.speculative_num_draft_tokens
+                )
+
+                metadata.cu_seqlens_q = torch.arange(
+                    0,
+                    bs * self.speculative_num_draft_tokens + 1,
+                    self.speculative_num_draft_tokens,
+                    dtype=torch.int32,
+                    device=device,
+                )
+
+                metadata.cu_seqlens_k = self.target_verify_metadata["cu_seqlens_k"][
+                    : (bs + 1)
+                ]
+
+                metadata.page_table = self.target_verify_metadata["page_table"][:bs, :]
+
+                self.target_verify_metadata[bs] = metadata
+            else:
+                # When topk > 1, we need two specific target verify metadata, and then merge states
+                # 1. The first half of metadata for prefix tokens
+                metadata.cache_seqlens_int32 = self.target_verify_metadata_topk_normal[
+                    "cache_seqlens"
+                ][:bs]
+                metadata.max_seq_len_q = self.speculative_num_draft_tokens
+                # metadata.max_seq_len_k = forward_batch.seq_lens_cpu.max().item(), do this in replay
+                metadata.cu_seqlens_q = self.target_verify_metadata_topk_normal[
+                    "cu_seqlens_q"
+                ][: bs + 1]
+                metadata.cu_seqlens_k = self.target_verify_metadata_topk_normal[
+                    "cu_seqlens_k"
+                ][: bs + 1]
+                metadata.page_table = self.target_verify_metadata_topk_normal[
+                    "page_table"
+                ][:bs, :]
+
+                # 2. The second half of metadata for draft tokens (per_batch_num_tokens = topk)
+                metadata_expand.cache_seqlens_int32 = (
+                    self.target_verify_metadata_topk_expand["cache_seqlens"][
+                        : bs * self.speculative_num_draft_tokens
+                    ]
+                )
+                metadata_expand.max_seq_len_q = 1
+                metadata_expand.cu_seqlens_q = self.target_verify_metadata_topk_expand[
+                    "cu_seqlens_q"
+                ][: bs * self.speculative_num_draft_tokens + 1]
+                metadata_expand.cu_seqlens_k = self.target_verify_metadata_topk_expand[
+                    "cu_seqlens_k"
+                ][: bs * self.speculative_num_draft_tokens + 1]
+
+                metadata_expand.page_table = self.target_verify_metadata_topk_expand[
+                    "page_table"
+                ][: bs * self.speculative_num_draft_tokens]
+
+                self.target_verify_metadata_topk_normal[bs] = metadata
+                self.target_verify_metadata_topk_expand[bs] = metadata_expand
+
+                if self.has_swa:
+                    metadata_swa = FlashAttentionMetadata()
+                    metadata_swa.cache_seqlens_int32 = (
+                        self.target_verify_metadata_topk_swa["cache_seqlens"][
+                            : bs * self.speculative_num_draft_tokens
+                        ]
+                    )
+                    metadata_swa.max_seq_len_q = 1
+                    metadata_swa.cu_seqlens_q = self.target_verify_metadata_topk_swa[
+                        "cu_seqlens_q"
+                    ][: bs * self.speculative_num_draft_tokens + 1]
+                    metadata_swa.cu_seqlens_k = self.target_verify_metadata_topk_swa[
+                        "cu_seqlens_k"
+                    ][: bs * self.speculative_num_draft_tokens + 1]
+
+                    metadata_swa.page_table = self.target_verify_metadata_topk_swa[
+                        "page_table"
+                    ][: bs * self.speculative_num_draft_tokens]
+                    self.target_verify_metadata_topk_swa[bs] = metadata_swa
+                    metadata.swa_spec_metadata = metadata_swa
+
+        elif forward_mode.is_draft_extend(include_v2=True):
+            metadata.cache_seqlens_int32 = self.draft_extend_metadata["cache_seqlens"][
+                :bs
+            ]
+            metadata.cache_seqlens_int32.copy_(seq_lens)
+
+            num_tokens_per_bs = num_tokens // bs
+            metadata.max_seq_len_q = num_tokens_per_bs
+            metadata.max_seq_len_k = seq_lens.max().item()
+
+            metadata.cu_seqlens_q = torch.arange(
+                0,
+                bs * num_tokens_per_bs + 1,
+                num_tokens_per_bs,
+                dtype=torch.int32,
+                device=device,
+            )
+
+            metadata.cu_seqlens_k = self.draft_extend_metadata["cu_seqlens_k"][
+                : (bs + 1)
+            ]
+            metadata.page_table = self.draft_extend_metadata["page_table"][:bs, :]
+
+            self.draft_extend_metadata[bs] = metadata
+
+        if encoder_lens is not None:
+            encoder_bs = encoder_lens.numel()
+            metadata.encoder_lens_int32 = self.encoder_metadata["encoder_lens_int32"][
+                :encoder_bs
+            ]
+            metadata.encoder_cu_seqlens_k = self.encoder_metadata[
+                "encoder_cu_seqlens_k"
+            ][: (encoder_bs + 1)]
+
+            metadata.encoder_page_table = self.encoder_metadata["encoder_page_table"][
+                :bs, :
+            ]
+
+        self.forward_metadata = metadata
+        self.forward_metadata_spec_decode_expand = metadata_expand
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInput],
+        seq_lens_cpu: Optional[torch.Tensor],
+        out_cache_loc: Optional[torch.Tensor] = None,
+    ):
+        """Initialize forward metadata for replaying CUDA graph."""
+        seq_lens = seq_lens[:bs]
+        seq_lens_cpu = seq_lens_cpu[:bs]
+        req_pool_indices = req_pool_indices[:bs]
+        device = seq_lens.device
+        metadata = None
+        metadata_expand = None
+
+        if forward_mode.is_decode_or_idle():
+
+            if spec_info is not None:
+                # Draft Decode
+                if self.topk <= 1:
+                    # When topk = 1, we use the normal decode metadata
+                    metadata = self.decode_cuda_graph_metadata[bs]
+                    max_len = seq_lens_cpu.max().item()
+                    metadata.max_seq_len_k = max_len + self.speculative_step_id + 1
+                    max_seq_pages = (
+                        metadata.max_seq_len_k + self.page_size - 1
+                    ) // self.page_size
+
+                    normal_decode_set_metadata(
+                        metadata.cache_seqlens_int32,
+                        metadata.cu_seqlens_k,
+                        metadata.page_table,
+                        self.req_to_token,
+                        req_pool_indices,
+                        self.decode_cuda_graph_metadata["strided_indices"],
+                        max_seq_pages,
+                        seq_lens,
+                        self.speculative_step_id + 1,
+                        self.page_size,
+                    )
+
+                else:
+                    # When top k > 1, we need two specific draft decode metadata, and then merge states
+                    # 1. The first half of metadata for prefix tokens
+                    metadata = self.draft_decode_metadata_topk_normal[bs]
+                    if self.page_size > 1:
+                        # First attention handles seq_lens - last_page_lens if page size > 1.
+                        last_page_lens = seq_lens % self.page_size
+                        seq_lens = seq_lens - last_page_lens
+                    metadata.cache_seqlens_int32.copy_(seq_lens)
+                    # metadata.max_seq_len_q = self.topk, already set in capture
+                    # metadata.cu_seqlens_q already set in capture
+                    # metadata.cu_seqlens_k is not needed
+
+                    metadata.max_seq_len_k = seq_lens_cpu.max().item()
+                    max_seq_pages = (
+                        metadata.max_seq_len_k + self.page_size - 1
+                    ) // self.page_size
+                    strided_indices = self.decode_cuda_graph_metadata["strided_indices"]
+                    strided_indices = strided_indices[:max_seq_pages]
+                    page_table = (
+                        self.req_to_token[
+                            req_pool_indices[:, None],  # shape [bs, 1]
+                            strided_indices[None, :],  # shape [1, max_seq_pages]
+                        ]
+                        // self.page_size
+                    )
+                    metadata.page_table[:, :max_seq_pages].copy_(page_table)
+                    # 2. The second half of metadata for draft tokens (per_batch_num_tokens = topk)
+                    metadata_expand = self.draft_decode_metadata_topk_expand[bs]
+                    decode_length = self.speculative_step_id + 1
+                    # shape: [bs, num_steps, topk] -> [bs x topk, num_steps]
+                    cache_loc = out_cache_loc.view(-1, self.speculative_num_steps)
+                    if self.page_size > 1:
+                        draft_decode_set_expand_metadata(
+                            cache_seqlens_int32=metadata_expand.cache_seqlens_int32,
+                            page_table=metadata_expand.page_table,
+                            last_page_lens=last_page_lens,
+                            decode_length=decode_length,
+                            cache_loc=cache_loc,
+                            topk=self.topk,
+                            page_size=self.page_size,
+                        )
+                    else:
+                        num_seqs = cache_loc.shape[0]
+                        metadata_expand.page_table[:num_seqs, :decode_length].copy_(
+                            cache_loc[:, :decode_length]
+                        )
+                # TODO: Handle local attention metadata for draft decode when llama4 eagle is supported
+            else:
+                # Normal Decode
+                metadata = self.decode_cuda_graph_metadata[bs]
+                max_len = seq_lens_cpu.max().item()
+                max_seq_pages = (max_len + self.page_size - 1) // self.page_size
+                metadata.max_seq_len_k = max_len
+
+                normal_decode_set_metadata(
+                    metadata.cache_seqlens_int32,
+                    metadata.cu_seqlens_k,
+                    metadata.page_table,
+                    self.req_to_token,
+                    req_pool_indices,
+                    self.decode_cuda_graph_metadata["strided_indices"],
+                    max_seq_pages,
+                    seq_lens,
+                    0,
+                    self.page_size,
+                    metadata.swa_page_table,
+                    self.token_to_kv_pool if self.use_sliding_window_kv_pool else None,
+                )
+
+                self._maybe_update_local_attn_metadata_for_replay(
+                    metadata,
+                    bs,
+                )
+        elif forward_mode.is_target_verify():
+            if self.topk <= 1:
+                metadata = self.target_verify_metadata[bs]
+                metadata.cache_seqlens_int32.copy_(
+                    (seq_lens + self.speculative_num_draft_tokens)
+                )
+
+                metadata.max_seq_len_k = (
+                    seq_lens_cpu.max().item() + self.speculative_num_draft_tokens
+                )
+                metadata.cu_seqlens_k[1:].copy_(
+                    torch.cumsum(metadata.cache_seqlens_int32, dim=0, dtype=torch.int32)
+                )
+                max_seq_pages = (
+                    metadata.max_seq_len_k + self.page_size - 1
+                ) // self.page_size
+                page_indices = self.req_to_token[
+                    req_pool_indices[:, None],
+                    self.decode_cuda_graph_metadata["strided_indices"][:max_seq_pages],
+                ]
+                page_indices //= self.page_size
+                metadata.page_table[:, :max_seq_pages].copy_(page_indices)
+            else:
+                # When topk > 1, we need two specific target verify metadata, and then merge states
+                # 1. The first half of metadata for prefix tokens
+                metadata = self.target_verify_metadata_topk_normal[bs]
+                metadata.cache_seqlens_int32.copy_(seq_lens)
+                # metadata.max_seq_len_q = self.speculative_num_draft_tokens, already set in capture
+                metadata.max_seq_len_k = seq_lens_cpu.max().item()
+                # metadata.cu_seqlens_q already set in capture
+                metadata.cu_seqlens_k[1:].copy_(
+                    torch.cumsum(metadata.cache_seqlens_int32, dim=0, dtype=torch.int32)
+                )
+                max_seq_pages = (
+                    metadata.max_seq_len_k + self.page_size - 1
+                ) // self.page_size
+                page_indices = self.req_to_token[
+                    req_pool_indices[:, None],
+                    self.decode_cuda_graph_metadata["strided_indices"][:max_seq_pages],
+                ]
+                page_indices //= self.page_size
+                metadata.page_table[:, :max_seq_pages].copy_(page_indices)
+
+                # 2. The second half of metadata for draft tokens (per_batch_num_tokens = topk)
+                metadata_expand = self.target_verify_metadata_topk_expand[bs]
+
+                # metadata_expand.max_seq_len_q = 1, already set in capture
+                # metadata_expand.cu_seqlens_q already set in capture
+                offsets = torch.arange(
+                    self.speculative_num_draft_tokens, device=device
+                ).unsqueeze(
+                    0
+                )  # shape: (1, self.speculative_num_draft_tokens)
+
+                cols = offsets.expand(seq_lens.numel(), -1) + seq_lens.unsqueeze(1)
+                cum_len = torch.nn.functional.pad(
+                    torch.cumsum(
+                        (
+                            seq_lens + self.speculative_num_draft_tokens
+                        ).repeat_interleave(self.speculative_num_draft_tokens),
+                        dim=0,
+                    ),
+                    (1, 0),
+                )[:-1]
+                mask_extraction_indices = (
+                    cols.repeat_interleave(self.speculative_num_draft_tokens, dim=0)
+                    + cum_len[:, None]
+                ).view(1, -1)
+                # avoid extracting padded seq indices which will be out of boundary
+                mask_extraction_indices[
+                    :,
+                    spec_info.positions.numel() * self.speculative_num_draft_tokens :,
+                ].fill_(0)
+                mask = spec_info.custom_mask[mask_extraction_indices].view(
+                    -1, self.speculative_num_draft_tokens
+                )  # (bsz * draft_num, draft_num)
+
+                col_indices = offsets.expand(
+                    mask.shape[0], self.speculative_num_draft_tokens
+                )
+                keys = torch.where(
+                    mask,
+                    col_indices,
+                    col_indices + self.speculative_num_draft_tokens,
+                )
+                _, sort_order = torch.sort(keys, dim=1)
+
+                non_masked_page_table = (
+                    self.req_to_token[req_pool_indices, :]
+                    .gather(1, cols)
+                    .repeat_interleave(self.speculative_num_draft_tokens, dim=0)
+                )  # (bsz, draft_num)
+
+                metadata_expand.page_table.copy_(
+                    non_masked_page_table.gather(1, sort_order)
+                )
+                metadata_expand.cache_seqlens_int32.copy_(mask.sum(dim=1))
+                metadata_expand.cu_seqlens_k[1:].copy_(
+                    torch.cumsum(
+                        metadata_expand.cache_seqlens_int32,
+                        dim=0,
+                        dtype=torch.int32,
+                    )
+                )
+                if self.has_swa:
+                    metadata_swa = self.target_verify_metadata_topk_swa[bs]
+                    self._init_sliding_window_attn_spec_metadata(
+                        metadata, metadata_expand, metadata_swa
+                    )
+
+        elif forward_mode.is_draft_extend():
+            metadata = self.draft_extend_metadata[bs]
+            metadata.cache_seqlens_int32.copy_(seq_lens)
+
+            metadata.max_seq_len_k = seq_lens_cpu.max().item()
+            metadata.cu_seqlens_k[1:].copy_(
+                torch.cumsum(metadata.cache_seqlens_int32, dim=0, dtype=torch.int32)
+            )
+            accept_length = spec_info.accept_length[:bs]
+            if spec_info.accept_length_cpu:
+                metadata.max_seq_len_q = max(spec_info.accept_length_cpu) + 1
+            else:
+                metadata.max_seq_len_q = 1
+
+            metadata.cu_seqlens_q[1:].copy_(
+                torch.cumsum(accept_length, dim=0, dtype=torch.int32)
+            )
+
+            max_seq_pages = (
+                metadata.max_seq_len_k + self.page_size - 1
+            ) // self.page_size
+            page_indices = self.req_to_token[
+                req_pool_indices[:, None],
+                self.draft_extend_metadata["strided_indices"][:max_seq_pages],
+            ]
+            metadata.page_table[:, :max_seq_pages].copy_(page_indices // self.page_size)
+
+        elif forward_mode.is_draft_extend_v2():
+            metadata = self.draft_extend_metadata[bs]
+            metadata.cache_seqlens_int32.copy_(seq_lens)
+
+            metadata.max_seq_len_k = seq_lens_cpu.max().item()
+            metadata.cu_seqlens_k[1:].copy_(
+                torch.cumsum(metadata.cache_seqlens_int32, dim=0, dtype=torch.int32)
+            )
+
+            extend_seq_lens_tensor = getattr(spec_info, "extend_seq_lens_tensor", None)
+            extend_seq_lens_cpu = getattr(spec_info, "extend_seq_lens_cpu", None)
+            if extend_seq_lens_tensor is not None:
+                extend_seq_lens = extend_seq_lens_tensor.to(torch.int32)
+            elif extend_seq_lens_cpu is not None:
+                extend_seq_lens = torch.as_tensor(
+                    extend_seq_lens_cpu,
+                    dtype=torch.int32,
+                    device=device,
+                )
+            else:
+                default_extend = getattr(
+                    spec_info, "num_tokens_per_req", self.speculative_num_steps + 1
+                )
+                extend_seq_lens = torch.full(
+                    (bs,), default_extend, dtype=torch.int32, device=device
+                )
+                extend_seq_lens_cpu = [default_extend] * bs
+
+            if extend_seq_lens_cpu:
+                metadata.max_seq_len_q = int(max(extend_seq_lens_cpu))
+            else:
+                metadata.max_seq_len_q = getattr(
+                    spec_info, "num_tokens_per_req", self.speculative_num_steps + 1
+                )
+
+            metadata.cu_seqlens_q[1:].copy_(
+                torch.cumsum(extend_seq_lens, dim=0, dtype=torch.int32)
+            )
+
+            max_seq_pages = (
+                metadata.max_seq_len_k + self.page_size - 1
+            ) // self.page_size
+            page_indices = self.req_to_token[
+                req_pool_indices[:, None],
+                self.draft_extend_metadata["strided_indices"][:max_seq_pages],
+            ]
+            metadata.page_table[:, :max_seq_pages].copy_(page_indices // self.page_size)
+
+        if encoder_lens is not None:
+            # Only support encoder size 1 for now
+            metadata.encoder_max_seq_len_k = encoder_lens[0]
+            metadata.encoder_lens_int32.copy_(encoder_lens[:1])
+            metadata.encoder_cu_seqlens_k[1:].copy_(
+                torch.cumsum(metadata.encoder_lens_int32, dim=0, dtype=torch.int32)
+            )
+
+            metadata.encoder_page_table[:, : metadata.encoder_max_seq_len_k].copy_(
+                self.req_to_token[req_pool_indices, : metadata.encoder_max_seq_len_k]
+            )
+
+            # Update the regular page table
+            page_table = self.req_to_token[
+                req_pool_indices,
+                metadata.encoder_max_seq_len_k : (
+                    metadata.encoder_max_seq_len_k + metadata.max_seq_len_k
+                ),
+            ]
+            metadata.page_table[:, : metadata.max_seq_len_k].copy_(page_table)
+
+        self.forward_metadata = metadata
+        self.forward_metadata_spec_decode_expand = metadata_expand
+
+    def get_cuda_graph_seq_len_fill_value(self):
+        """Get the fill value for sequence length in CUDA graph."""
+        return 1
+
+    def _maybe_init_local_attn_metadata(
+        self, forwardbatch: ForwardBatch, metadata: FlashAttentionMetadata, device
+    ):
+        """Centralized utility to initialize local_attn_metadata if chunked attention is enabled."""
+        if not self.has_local_attention:
+            metadata.local_attn_metadata = None
+            return
+
+        cu_seqlens_q = metadata.cu_seqlens_q
+        cache_seqlens_int32 = metadata.cache_seqlens_int32
+        if self.use_sliding_window_kv_pool:
+            page_table = self.token_to_kv_pool.translate_loc_from_full_to_swa(
+                metadata.page_table
+            )
+        else:
+            page_table = metadata.page_table
+        if cu_seqlens_q is None or cache_seqlens_int32 is None or page_table is None:
+            metadata.local_attn_metadata = None
+            return
+
+        cu_seqlens_q_np = cu_seqlens_q.cpu().numpy()
+        seq_lens_np = cache_seqlens_int32.cpu().numpy()
+        (
+            seqlens_q_local_np,
+            cu_seqlens_q_local_np,
+            seqlens_k_local_np,
+            block_table_local,
+        ) = make_local_attention_virtual_batches(
+            self.attention_chunk_size,
+            cu_seqlens_q_np,
+            seq_lens_np,
+            page_table,
+            self.page_size,
+        )
+
+        local_metadata = FlashAttentionMetadata.LocalAttentionMetadata(
+            local_query_start_loc=torch.from_numpy(cu_seqlens_q_local_np).to(device),
+            local_seqused_k=torch.from_numpy(seqlens_k_local_np).to(device),
+            local_block_table=block_table_local.to(device),
+            local_max_query_len=int(seqlens_q_local_np.max()),
+            local_max_seq_len=int(seqlens_k_local_np.max()),
+        )
+        metadata.local_attn_metadata = local_metadata
+
+    def _maybe_update_local_attn_metadata_for_capture(
+        self, metadata: FlashAttentionMetadata, bs: int
+    ):
+        """Update local attention metadata during CUDA graph capture phase.
+
+        This method calculates the exact buffer sizes needed for local attention metadata
+        during the CUDA graph capture phase, optimizing memory usage by creating views of
+        pre-allocated buffers with exactly the sizes needed.
+        """
+        if not self.has_local_attention:
+            return
+
+        seq_lens_capture = metadata.cache_seqlens_int32
+        max_seq_len = int(seq_lens_capture.max().item())
+        page_table_capture = metadata.page_table
+
+        cu_seqlens_q_np = metadata.cu_seqlens_q.cpu().numpy()
+        seqlens_np = seq_lens_capture.cpu().numpy()
+        (
+            seqlens_q_local_np,
+            cu_seqlens_q_local_np,
+            seqlens_k_local_np,
+            block_table_local_np,
+        ) = make_local_attention_virtual_batches(
+            self.attention_chunk_size,
+            cu_seqlens_q_np,
+            seqlens_np,
+            page_table_capture,
+            self.page_size,
+        )
+
+        # Get exact dimensions from the calculation
+        q_len = len(cu_seqlens_q_local_np)
+        k_len = len(seqlens_k_local_np)
+        b0 = block_table_local_np.shape[0] if block_table_local_np.shape[0] > 0 else bs
+        b1 = block_table_local_np.shape[1] if block_table_local_np.shape[1] > 0 else 1
+
+        # Create views of the pre-allocated buffers with exactly these sizes
+        # This is the key optimization - we only use the memory we actually need
+        local_query_start_loc = self.decode_cuda_graph_local_attn_metadata[
+            "local_query_start_loc"
+        ][:q_len]
+
+        local_seqused_k = self.decode_cuda_graph_local_attn_metadata["local_seqused_k"][
+            :k_len
+        ]
+
+        local_block_table = self.decode_cuda_graph_local_attn_metadata[
+            "local_block_table"
+        ][:b0, :b1]
+
+        metadata.local_attn_metadata = FlashAttentionMetadata.LocalAttentionMetadata(
+            local_query_start_loc=local_query_start_loc,
+            local_seqused_k=local_seqused_k,
+            local_block_table=local_block_table,
+            local_max_query_len=1,
+            local_max_seq_len=max_seq_len,
+        )
+
+    def _maybe_update_local_attn_metadata_for_replay(
+        self,
+        metadata: FlashAttentionMetadata,
+        bs: int,
+    ):
+        """Update preallocated local attention metadata in-place before CUDA graph replay."""
+        if not self.has_local_attention:
+            return
+
+        # Access preallocated buffers
+        local_q_buf = self.decode_cuda_graph_local_attn_metadata[
+            "local_query_start_loc"
+        ]
+        local_k_buf = self.decode_cuda_graph_local_attn_metadata["local_seqused_k"]
+        local_block_buf = self.decode_cuda_graph_local_attn_metadata[
+            "local_block_table"
+        ]
+        cu_seqlens_q = self.decode_cuda_graph_metadata["cu_seqlens_q"]
+
+        # Create a modified version for local attention that only processes the last token
+        # This mimics the normal decode pattern
+        cu_seqlens_q = torch.arange(
+            bs + 1, device=cu_seqlens_q.device, dtype=cu_seqlens_q.dtype
+        )
+        seqlens = metadata.cache_seqlens_int32[:bs]
+        # Slice the page_table to match the batch size and actual sequence length
+        # This serves three important purposes:
+        # 1. Ensures we only process the actual batch size (bs) and not the maximum batch size
+        # 2. Limits the sequence length to prevent processing padding tokens or garbage values
+        # 3. Prevents zeros in the block table which can cause garbage output during replay
+        #
+        # Without this slicing, the pre-allocated page_table may contain zeros or invalid indices
+        # beyond the actual sequence length, leading to incorrect attention calculations
+        max_seq_len = int(seqlens.max().item())
+        if self.use_sliding_window_kv_pool:
+            sliced_page_table = self.token_to_kv_pool.translate_loc_from_full_to_swa(
+                metadata.page_table[:bs, :max_seq_len]
+            )
+        else:
+            sliced_page_table = metadata.page_table[:bs, :max_seq_len]
+
+        cu_seqlens_q_np = cu_seqlens_q.cpu().numpy()
+        seqlens_np = seqlens.cpu().numpy()
+        (
+            seqlens_q_local_np,
+            cu_seqlens_q_local_np,
+            seqlens_k_local_np,
+            block_table_local,
+        ) = make_local_attention_virtual_batches(
+            self.attention_chunk_size,
+            cu_seqlens_q_np,
+            seqlens_np,
+            sliced_page_table,
+            self.page_size,
+        )
+
+        # Convert back to tensors
+        device = local_q_buf.device
+        cu_seqlens_q_local = torch.from_numpy(cu_seqlens_q_local_np).to(device)
+        seqlens_k_local = torch.from_numpy(seqlens_k_local_np).to(device)
+        block_table_local = block_table_local.to(device)
+        # Get sizes
+        q_len = cu_seqlens_q_local.shape[0]
+        k_len = seqlens_k_local.shape[0]
+        b0, b1 = block_table_local.shape
+
+        # In-place updates into preallocated tensors and zero out the unused space
+        local_q_buf[:q_len].copy_(cu_seqlens_q_local)
+        local_q_buf[q_len:].fill_(0)
+        local_k_buf[:k_len].copy_(seqlens_k_local)
+        local_k_buf[k_len:].fill_(0)
+        local_block_buf[:b0, :b1].copy_(block_table_local)
+        local_block_buf[b0:, :].fill_(0)
+        local_block_buf[:b0, b1:].fill_(0)
+
+        if metadata.local_attn_metadata is not None:
+            lam = metadata.local_attn_metadata
+            lam.local_max_query_len = int(seqlens_q_local_np.max())
+            lam.local_max_seq_len = int(seqlens_k_local_np.max())
+
+    def _init_sliding_window_attn_spec_metadata(
+        self,
+        metadata: FlashAttentionMetadata,
+        metadata_expand: FlashAttentionMetadata,
+        metadata_swa: Optional[FlashAttentionMetadata] = None,
+    ):
+        # TODO: support page_size > 1 for swa spec
+        assert (
+            self.page_size == 1
+        ), "FlashAttention backend doesn't support topk > 1 speculative decoding with page size > 1 sliding window attention"
+
+        cache_seqlens_int32 = (
+            metadata.cache_seqlens_int32.repeat_interleave(
+                self.speculative_num_draft_tokens
+            )
+            + metadata_expand.cache_seqlens_int32
+        )
+        cu_seqlens_k = torch.nn.functional.pad(
+            torch.cumsum(cache_seqlens_int32, dim=0, dtype=torch.int32), (1, 0)
+        )
+        bs = cache_seqlens_int32.shape[0]
+        page_table = (
+            metadata.page_table.new_zeros(
+                (bs, metadata.max_seq_len_k + metadata_expand.page_table.shape[1])
+            )
+            if metadata_swa is None
+            else metadata_swa.page_table
+        )
+
+        prepare_swa_spec_page_table_triton(
+            page_table,
+            metadata.page_table,
+            metadata_expand.page_table,
+            metadata.cache_seqlens_int32,
+            metadata_expand.cache_seqlens_int32,
+            self.speculative_num_draft_tokens,
+        )
+
+        if metadata_swa is None:
+            metadata_swa = FlashAttentionMetadata()
+            metadata_swa.max_seq_len_q = 1
+            metadata_swa.cu_seqlens_q = metadata_expand.cu_seqlens_q
+            metadata_swa.cache_seqlens_int32 = cache_seqlens_int32
+            metadata_swa.cu_seqlens_k = cu_seqlens_k
+            metadata_swa.page_table = page_table
+        else:
+            metadata_swa.cache_seqlens_int32.copy_(cache_seqlens_int32)
+            metadata_swa.cu_seqlens_k.copy_(cu_seqlens_k)
+
+        metadata.swa_spec_metadata = metadata_swa
+
+
+@triton.jit
+def _prepare_swa_spec_page_table_kernel(
+    dst_ptr,
+    src_a_ptr,
+    src_b_ptr,
+    seq_len_a_ptr,
+    seq_len_b_ptr,
+    dst_stride_m,
+    dst_stride_n,
+    a_stride_m,
+    a_stride_n,
+    b_stride_m,
+    b_stride_n,
+    LEN_A: tl.constexpr,
+    LEN_B: tl.constexpr,
+    REPEAT_STEP: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    pid_m = tl.program_id(0)
+    pid_n = tl.program_id(1)
+
+    idx_a = pid_m // REPEAT_STEP
+    idx_b = pid_m
+    seq_len_a = tl.load(seq_len_a_ptr + idx_a)
+    seq_len_b = tl.load(seq_len_b_ptr + idx_b)
+
+    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+    total_len = seq_len_a + seq_len_b
+
+    if pid_n * BLOCK_N >= total_len:
+        return
+
+    mask = offs_n < total_len
+    dst = dst_ptr + pid_m * dst_stride_m + offs_n * dst_stride_n
+
+    if (pid_n + 1) * BLOCK_N < seq_len_a:
+        a_ptr = src_a_ptr + idx_a * a_stride_m + offs_n * a_stride_n
+        a_mask = mask & (offs_n < LEN_A)
+        val = tl.load(a_ptr, mask=a_mask, other=0)
+        tl.store(dst, val, mask=mask)
+    elif pid_n * BLOCK_N >= seq_len_a:
+        offs_b = offs_n - seq_len_a
+        b_ptr = src_b_ptr + idx_b * b_stride_m + offs_b * b_stride_n
+        b_mask = mask & (offs_b < LEN_B)
+        val = tl.load(b_ptr, mask=b_mask, other=0)
+        tl.store(dst, val, mask=mask)
+    else:
+        # mixed part
+        a_offs = offs_n
+        a_mask = (a_offs < seq_len_a) & (a_offs < LEN_A)
+        a_ptr = src_a_ptr + idx_a * a_stride_m + a_offs * a_stride_n
+        a_val = tl.load(a_ptr, mask=a_mask, other=0)
+
+        b_offs = offs_n - seq_len_a
+        b_mask = (b_offs >= 0) & (b_offs < seq_len_b) & (b_offs < LEN_B)
+        b_ptr = src_b_ptr + idx_b * b_stride_m + b_offs * b_stride_n
+        b_val = tl.load(b_ptr, mask=b_mask, other=0)
+
+        result = tl.where(offs_n < seq_len_a, a_val, b_val)
+        tl.store(dst, result, mask=mask)
+
+
+def prepare_swa_spec_page_table_triton(
+    page_table_dst: torch.Tensor,
+    page_table_a: torch.Tensor,
+    page_table_b: torch.Tensor,  # expand page table
+    seq_len_a: torch.Tensor,
+    seq_len_b: torch.Tensor,  # expand seq lens
+    speculative_num_draft_tokens: int,
+):
+    # concat page_table and expand page_table by kv seq length
+    bs = seq_len_a.numel()
+    bs_expand = seq_len_b.numel()
+    assert bs_expand == bs * speculative_num_draft_tokens
+
+    LEN_A = page_table_a.shape[1]
+    LEN_B = page_table_b.shape[1]
+    LEN_OUT = LEN_A + LEN_B
+    REPEAT_STEP = speculative_num_draft_tokens
+    BLOCK_N = 256
+
+    grid = (bs_expand, triton.cdiv(LEN_OUT, BLOCK_N))
+    _prepare_swa_spec_page_table_kernel[grid](
+        page_table_dst,
+        page_table_a,
+        page_table_b,
+        seq_len_a,
+        seq_len_b,
+        page_table_dst.stride(0),
+        page_table_dst.stride(1),
+        page_table_a.stride(0),
+        page_table_a.stride(1),
+        page_table_b.stride(0),
+        page_table_b.stride(1),
+        LEN_A=LEN_A,
+        LEN_B=LEN_B,
+        REPEAT_STEP=REPEAT_STEP,
+        BLOCK_N=BLOCK_N,
+        num_warps=4,
+    )
+
+
+class FlashAttentionMultiStepBackend:
+
+    def __init__(
+        self, model_runner: ModelRunner, topk: int, speculative_num_steps: int
+    ):
+        self.model_runner = model_runner
+        self.topk = topk
+        self.speculative_num_steps = speculative_num_steps
+        self.attn_backends = []
+        for i in range(self.speculative_num_steps - 1):
+            self.attn_backends.append(
+                FlashAttentionBackend(
+                    model_runner,
+                    speculative_step_id=i,
+                    topk=self.topk,
+                    speculative_num_steps=self.speculative_num_steps,
+                )
+            )
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        for i in range(self.speculative_num_steps - 1):
+            self.attn_backends[i].init_forward_metadata(forward_batch)
+
+    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
+        for i in range(self.speculative_num_steps - 1):
+            self.attn_backends[i].init_cuda_graph_state(max_bs, max_num_tokens)
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        forward_batch: ForwardBatch,
+    ):
+        assert forward_batch.spec_info is not None
+        assert forward_batch.spec_info.is_draft_input()
+
+        for i in range(self.speculative_num_steps - 1):
+            self.attn_backends[i].init_forward_metadata_capture_cuda_graph(
+                forward_batch.batch_size,
+                forward_batch.batch_size * self.topk,
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                encoder_lens=forward_batch.encoder_lens,
+                forward_mode=ForwardMode.DECODE,
+                spec_info=forward_batch.spec_info,
+            )
+
+    def init_forward_metadata_replay_cuda_graph(
+        self, forward_batch: ForwardBatch, bs: int
+    ):
+        assert forward_batch.spec_info is not None
+        assert forward_batch.spec_info.is_draft_input()
+
+        for i in range(self.speculative_num_steps - 1):
+            # TODO: incrementally update the metadata for the later steps,
+            # so that they do not need to recompute everything from scratch.
+            self.attn_backends[i].init_forward_metadata_replay_cuda_graph(
+                bs,
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                forward_batch.seq_lens_sum,
+                encoder_lens=forward_batch.encoder_lens,
+                forward_mode=ForwardMode.DECODE,
+                spec_info=forward_batch.spec_info,
+                seq_lens_cpu=forward_batch.seq_lens_cpu,
+                out_cache_loc=forward_batch.out_cache_loc,
+            )
+
+
+# @torch.compile(dynamic=True, backend=get_compiler_backend())
+# TODO: fuse these kernels
+# NOTE: torch.compile makes it slower in speculative decoding
+def normal_decode_set_metadata(
+    cache_seqlens_int32: torch.Tensor,
+    cu_seqlens_k: torch.Tensor,
+    page_table: torch.Tensor,
+    req_to_token: torch.Tensor,
+    req_pool_indices: torch.Tensor,
+    strided_indices: torch.Tensor,
+    max_seq_pages: torch.Tensor,
+    seq_lens: torch.Tensor,
+    seq_len_delta: int,
+    page_size: int,
+    swa_page_table: Optional[torch.Tensor] = None,
+    token_to_kv_pool: Optional[SWAKVPool] = None,
+):
+    cache_seqlens_int32.copy_(seq_lens + seq_len_delta)
+    cu_seqlens_k[1:].copy_(torch.cumsum(cache_seqlens_int32, dim=0, dtype=torch.int32))
+    page_indices = req_to_token[
+        req_pool_indices[:, None],
+        strided_indices[:max_seq_pages][None, :],
+    ]
+    page_table[:, :max_seq_pages].copy_(page_indices // page_size)
+
+    if swa_page_table is not None and token_to_kv_pool is not None:
+        assert isinstance(token_to_kv_pool, SWAKVPool)
+        swa_page_indices = token_to_kv_pool.translate_loc_from_full_to_swa(page_indices)
+        swa_page_table[:, :max_seq_pages].copy_(swa_page_indices // page_size)
+
+
+@torch.compile(dynamic=True, backend=get_compiler_backend())
+def draft_decode_set_expand_metadata(
+    cache_seqlens_int32: torch.Tensor,  # Modifies
+    page_table: torch.Tensor,  # Modifies
+    last_page_lens: torch.Tensor,
+    decode_length: int,
+    cache_loc: torch.Tensor,
+    topk: int,
+    page_size: int,
+):
+    expanded_last_page_lens = last_page_lens.repeat_interleave(topk)
+    cache_seqlens_int32.copy_(decode_length + expanded_last_page_lens)
+    cache_loc = (cache_loc // page_size).to(torch.int32)
+    if cache_loc.dim() == 1:
+        cache_loc = cache_loc.unsqueeze(0)
+    # Vectorized torch.unique_consecutive: track value change points then scatter
+    mask = torch.ones_like(cache_loc, dtype=torch.bool)
+    mask[:, 1:] = cache_loc[:, 1:] != cache_loc[:, :-1]
+    positions = mask.cumsum(dim=1) - 1
+    num_seqs = cache_loc.shape[0]
+    page_table[:num_seqs, :].scatter_(1, positions, cache_loc)
diff --git a/sglang/python/sglang/srt/layers/attention/flashinfer_backend.py b/sglang/python/sglang/srt/layers/attention/flashinfer_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..c511272544f9270d5a12ee7c181d9fbec00adaba
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/flashinfer_backend.py
@@ -0,0 +1,1666 @@
+from __future__ import annotations
+
+"""
+Support different attention backends.
+Now there are two backends: FlashInfer and Triton.
+FlashInfer is faster and Triton is easier to customize.
+Each backend supports two operators: extend (i.e. prefill with cached prefix) and decode.
+"""
+
+import logging
+import os
+from dataclasses import dataclass
+from enum import Enum, auto
+from functools import partial
+from typing import TYPE_CHECKING, Callable, List, Optional, Union
+
+import torch
+
+from sglang.srt.compilation.piecewise_context_manager import is_in_piecewise_cuda_graph
+from sglang.srt.dllm.config import DllmConfig
+from sglang.srt.environ import envs
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
+from sglang.srt.layers.dp_attention import get_attention_tp_size
+from sglang.srt.layers.radix_attention import AttentionType
+from sglang.srt.mem_cache.swa_memory_pool import SWATokenToKVPoolAllocator
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+from sglang.srt.speculative.spec_info import SpecInput
+from sglang.srt.utils import (
+    get_int_env_var,
+    is_flashinfer_available,
+    is_sm100_supported,
+    next_power_of_2,
+)
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.radix_attention import RadixAttention
+    from sglang.srt.model_executor.model_runner import ModelRunner
+
+logger = logging.getLogger(__name__)
+
+if envs.SGLANG_ENABLE_TORCH_COMPILE.get():
+    torch._logging.set_logs(dynamo=logging.ERROR)
+    torch._dynamo.config.suppress_errors = True
+
+
+if is_flashinfer_available():
+    from flashinfer import (
+        BatchDecodeWithPagedKVCacheWrapper,
+        BatchPrefillWithPagedKVCacheWrapper,
+        BatchPrefillWithRaggedKVCacheWrapper,
+        fast_decode_plan,
+    )
+    from flashinfer.cascade import merge_state
+
+
+class WrapperDispatch(Enum):
+    SLIDING_WINDOW = auto()
+    CROSS_ATTENTION = auto()
+
+
+@dataclass
+class MultiItemScoringParams:
+    """Parameters for multi-item scoring in attention computation.
+
+    Used when processing sequences with multiple items separated by delimiters,
+    where each item needs specific attention patterns that respect item boundaries.
+
+    Attributes:
+        prefix_len_ptr: A uint32 1D tensor indicating the prefix length of each prompt.
+                       The tensor size is equal to the batch size.
+        token_pos_in_items_ptr: A uint16 1D tensor indicating the token position of each item
+                               starting from 0 (delimiter) for each item. For batch size > 1,
+                               sequences are concatenated with zero padding to ensure same length.
+        token_pos_in_items_len: Zero padding length for token_pos_in_items_ptr to handle
+                               batch_size > 1 case. Defines the padded length for each sequence.
+        max_item_len_ptr: A uint16 tensor containing the max token length of all items
+                         for each prompt in the batch.
+
+    """
+
+    prefix_len_ptr: Optional[torch.Tensor] = None
+    token_pos_in_items_ptr: Optional[torch.Tensor] = None
+    token_pos_in_items_len: int = 0
+    max_item_len_ptr: Optional[torch.Tensor] = None
+
+    def is_enabled(self) -> bool:
+        """Check if multi-item scoring is enabled."""
+        return self.prefix_len_ptr is not None
+
+
+@dataclass
+class DecodeMetadata:
+    decode_wrappers: List[BatchDecodeWithPagedKVCacheWrapper]
+
+
+@dataclass
+class PrefillMetadata:
+    prefill_wrappers: List[BatchPrefillWithPagedKVCacheWrapper]
+    use_ragged: bool
+    extend_no_prefix: bool
+    multi_item_params: Optional[MultiItemScoringParams] = None
+
+
+# Reuse this workspace buffer across all flashinfer wrappers
+global_workspace_buffer = None
+
+# Use as a fast path to override the indptr in flashinfer's plan function
+# This is used to remove some host-to-device copy overhead.
+global_override_indptr_cpu = None
+
+
+class FlashInferAttnBackend(AttentionBackend):
+    """Flashinfer attention kernels."""
+
+    def __init__(
+        self,
+        model_runner: ModelRunner,
+        skip_prefill: bool = False,
+        kv_indptr_buf: Optional[torch.Tensor] = None,
+        kv_last_page_len_buf: Optional[torch.Tensor] = None,
+        init_new_workspace: bool = False,
+    ):
+        super().__init__()
+        self.prefill_backend = "fa2"
+        self.decode_backend = "fa2"
+
+        # Store multi-item scoring delimiter for efficient access
+        self.multi_item_scoring_delimiter = (
+            model_runner.server_args.multi_item_scoring_delimiter
+        )
+
+        # FIXME: remove dllm workarounds from flashinfer
+        self.dllm_config = DllmConfig.from_server_args(model_runner.server_args)
+        self.is_dllm_model = self.dllm_config is not None
+
+        # Parse constants
+        self.decode_use_tensor_cores = should_use_tensor_core(
+            kv_cache_dtype=model_runner.kv_cache_dtype,
+            num_attention_heads=model_runner.model_config.num_attention_heads
+            // get_attention_tp_size(),
+            num_kv_heads=model_runner.model_config.get_num_kv_heads(
+                get_attention_tp_size()
+            ),
+        )
+        self.max_context_len = model_runner.model_config.context_len
+        self.skip_prefill = skip_prefill
+        self.is_multimodal = model_runner.model_config.is_multimodal
+
+        assert not (
+            model_runner.sliding_window_size is not None
+            and model_runner.model_config.is_encoder_decoder
+        ), "Sliding window and cross attention are not supported together"
+
+        if model_runner.sliding_window_size is not None:
+            self.num_wrappers = 2
+            self.dispatch_reason = WrapperDispatch.SLIDING_WINDOW
+        elif model_runner.model_config.is_encoder_decoder:
+            self.num_wrappers = 2
+            self.dispatch_reason = WrapperDispatch.CROSS_ATTENTION
+        else:
+            self.num_wrappers = 1
+            self.dispatch_reason = None
+
+        # Qwen2/Qwen3 models require higher flashinfer workspace size
+        if (
+            "Qwen2ForCausalLM" in model_runner.model_config.hf_config.architectures
+            or "Qwen3ForCausalLM" in model_runner.model_config.hf_config.architectures
+            or "MiMoForCausalLM" in model_runner.model_config.hf_config.architectures
+            or "Qwen3VLForConditionalGeneration"
+            in model_runner.model_config.hf_config.architectures
+            or "Qwen3VLMoeForConditionalGeneration"
+            in model_runner.model_config.hf_config.architectures
+        ):
+            envs.SGLANG_FLASHINFER_WORKSPACE_SIZE.set(512 * 1024 * 1024)
+
+        # When deterministic inference is enabled, tensor cores should be used for decode
+        # Also set split tile sizes for prefill and decode from environment variables, and disable kv split for cuda graph
+        # More information can be found here: https://github.com/flashinfer-ai/flashinfer/pull/1675
+        self.enable_deterministic = (
+            model_runner.server_args.enable_deterministic_inference
+        )
+        self.prefill_split_tile_size = None
+        self.decode_split_tile_size = None
+        self.disable_cuda_graph_kv_split = False
+        if self.enable_deterministic:
+            self.decode_use_tensor_cores = True
+            self.prefill_split_tile_size = get_int_env_var(
+                "SGLANG_FLASHINFER_PREFILL_SPLIT_TILE_SIZE", 4096
+            )
+            self.decode_split_tile_size = get_int_env_var(
+                "SGLANG_FLASHINFER_DECODE_SPLIT_TILE_SIZE", 2048
+            )
+            self.disable_cuda_graph_kv_split = True
+            envs.SGLANG_FLASHINFER_WORKSPACE_SIZE.set(2048 * 1024 * 1024)
+
+        # Allocate buffers
+        global global_workspace_buffer
+        if global_workspace_buffer is None:
+            # different from flashinfer zero_init_global_workspace_buffer
+            global_workspace_size = envs.SGLANG_FLASHINFER_WORKSPACE_SIZE.get()
+            global_workspace_buffer = torch.empty(
+                global_workspace_size,
+                dtype=torch.uint8,
+                device=model_runner.device,
+            )
+        if init_new_workspace:
+            self.workspace_buffer = torch.empty(
+                envs.SGLANG_FLASHINFER_WORKSPACE_SIZE.get(),
+                dtype=torch.uint8,
+                device=model_runner.device,
+            )
+        else:
+            self.workspace_buffer = global_workspace_buffer
+        max_bs = model_runner.req_to_token_pool.size
+        if kv_indptr_buf is None:
+            self.kv_indptr = [
+                torch.zeros(
+                    (max_bs + 1,), dtype=torch.int32, device=model_runner.device
+                )
+                for _ in range(self.num_wrappers)
+            ]
+        else:
+            assert self.num_wrappers == 1
+            self.kv_indptr = [kv_indptr_buf]
+
+        if kv_last_page_len_buf is None:
+            self.kv_last_page_len = torch.ones(
+                (max_bs,), dtype=torch.int32, device=model_runner.device
+            )
+        else:
+            assert self.num_wrappers == 1
+            self.kv_last_page_len = kv_last_page_len_buf
+
+        if not self.skip_prefill:
+            self.qo_indptr = [
+                torch.zeros(
+                    (max_bs + 1,), dtype=torch.int32, device=model_runner.device
+                )
+                for _ in range(self.num_wrappers)
+            ]
+
+        fmha_backend = "auto"
+        if is_sm100_supported():
+            # Disable CUTLASS backend when piecewise cuda graph is enabled
+            # due to TMA descriptor initialization issues on B200
+            if not model_runner.server_args.disable_piecewise_cuda_graph:
+                logger.warning(
+                    "CUTLASS backend is disabled when piecewise cuda graph is enabled "
+                    "due to TMA descriptor initialization issues on B200. "
+                    "Using auto backend instead for stability."
+                )
+            else:
+                fmha_backend = "cutlass"
+        self.prefill_wrapper_ragged = BatchPrefillWithRaggedKVCacheWrapper(
+            self.workspace_buffer, "NHD", backend=fmha_backend
+        )
+
+        # Two wrappers: one for sliding window attention and one for full attention.
+        # Using two wrappers is unnecessary in the current PR, but are prepared for future PRs
+        self.prefill_wrappers_paged = []
+        self.prefill_wrappers_verify = []
+        self.decode_wrappers = []
+        for _ in range(self.num_wrappers):
+            if not skip_prefill:
+                self.prefill_wrappers_paged.append(
+                    BatchPrefillWithPagedKVCacheWrapper(
+                        self.workspace_buffer,
+                        "NHD",
+                        backend=self.prefill_backend,
+                    )
+                )
+                self.prefill_wrappers_verify.append(
+                    BatchPrefillWithPagedKVCacheWrapper(
+                        self.workspace_buffer,
+                        "NHD",
+                        backend=self.prefill_backend,
+                    )
+                )
+            self.decode_wrappers.append(
+                BatchDecodeWithPagedKVCacheWrapper(
+                    self.workspace_buffer,
+                    "NHD",
+                    backend=self.decode_backend,
+                    use_tensor_cores=self.decode_use_tensor_cores,
+                )
+            )
+
+        # Create indices updater
+        if not skip_prefill:
+            self.indices_updater_prefill = FlashInferIndicesUpdaterPrefill(
+                model_runner, self
+            )  # for verify
+        self.indices_updater_decode = FlashInferIndicesUpdaterDecode(model_runner, self)
+
+        # Other metadata
+        self.forward_metadata: Union[PrefillMetadata, DecodeMetadata] = None
+
+        self.decode_cuda_graph_metadata = {}
+        self.prefill_cuda_graph_metadata = {}  # For verify
+        self.draft_extend_cuda_graph_metadata = {}  # For draft extend
+
+    def _process_multi_item_scoring(
+        self, forward_batch: ForwardBatch
+    ) -> MultiItemScoringParams:
+        """Process multi-item scoring tensors for FlashInfer attention.
+
+        This method handles sequences containing multiple "items" separated by delimiter tokens,
+        where each item needs specific attention patterns that respect item boundaries.
+
+        The method produces four key tensors for FlashInfer:
+        - prefix_len_ptr: uint32 tensor with prefix length for each prompt in batch
+        - token_pos_in_items_ptr: uint16 tensor with token positions starting from 0 at delimiters
+        - token_pos_in_items_len: padding length for batch processing
+        - max_item_len_ptr: uint16 tensor with max item length for each prompt
+
+        Args:
+            forward_batch: The forward batch containing input sequences and delimiter info
+
+        Returns:
+            MultiItemScoringParams: The processed multi-item scoring parameters
+
+        Examples:
+            Following FlashInfer definition: for 3 items of length 3, 2, 4 respectively:
+            token_pos_in_items_ptr = [0, 1, 2, 3, 0, 1, 2, 0, 1, 2, 3, 4, 0]
+
+            Case 1: Single sequence
+            Text: "What is the capital of France? <delim> London <delim> Paris <delim> Berlin <delim>"
+            Tokens: [What, is, the, capital, of, France, ?, <delim>, London, <delim>, Paris, <delim>, Berlin, <delim>]
+            Indices: [ 0,   1,  2,   3,      4,  5,     6,   7,     8,      9,     10,    11,    12,     13]
+            - prefix_len_ptr: [7] (query length before first delimiter)
+            - token_pos_in_items_ptr: [0, 1, 0, 1, 0, 1, 0] (delim=0, London=1, delim=0, Paris=1, delim=0, Berlin=1, delim=0)
+            - token_pos_in_items_len: 7 (actual length)
+            - max_item_len_ptr: [1] (max item length is 1 token - all options are single tokens)
+
+            Case 2: Batch processing (batch_size=2)
+            Sequence 1: 2 items of length 2, 1 → [0, 1, 2, 0, 1, 0] (6 elements)
+            Sequence 2: 3 items of length 1, 3, 2 → [0, 1, 0, 1, 2, 3, 0, 1, 2, 0] (10 elements)
+            After padding both to length 10:
+            - token_pos_in_items_ptr: [0, 1, 2, 0, 1, 0, 0, 0, 0, 0,    0, 1, 0, 1, 2, 3, 0, 1, 2, 0]
+            - token_pos_in_items_len: 10 (padded length for batch processing)
+            - max_item_len_ptr: [2, 3] (max lengths per sequence)
+        """
+
+        delimiter = self.multi_item_scoring_delimiter
+        if delimiter is None or forward_batch.forward_mode == ForwardMode.DECODE:
+            return MultiItemScoringParams()
+
+        delimiter_mask = forward_batch.input_ids == delimiter
+        prefix_cache_lens = getattr(forward_batch, "extend_prefix_lens", None)
+        extend_seq_lens = getattr(forward_batch, "extend_seq_lens", None)
+        prefix_len_ptr, token_pos_in_items_ptr = [], []
+        token_pos_in_items_len = 0
+
+        # If no extend_seq_lens, treat whole batch as one sequence
+        if extend_seq_lens is None or len(extend_seq_lens) <= 1:
+            extend_seq_lens = [forward_batch.input_ids.size(0)]
+
+        seq_start = 0
+        for i, seq_len in enumerate(extend_seq_lens):
+            seq_end = seq_start + seq_len
+            mask = delimiter_mask[seq_start:seq_end]
+            pos = forward_batch.positions[seq_start:seq_end]
+            delimiter_indices = torch.nonzero(mask, as_tuple=True)[0]
+
+            if len(delimiter_indices) > 0:
+                first_delim = delimiter_indices[0]
+                # Prefix length: store as scalar
+                prefix_len = first_delim + (
+                    prefix_cache_lens[i] if prefix_cache_lens is not None else 0
+                )
+                prefix_len_ptr.append(
+                    prefix_len.item() if torch.is_tensor(prefix_len) else prefix_len
+                )
+
+                # Compute relative positions within items after delimiters
+                diff = pos[first_delim:] - torch.cummax(mask[first_delim:], 0)[1]
+                token_pos = (diff - pos[first_delim]).to(torch.uint16)
+                token_pos_in_items_ptr.append(token_pos)
+
+                # Update forward_batch positions in-place
+                pos[first_delim:] = diff - 1
+                forward_batch.positions[seq_start:seq_end] = pos
+
+            seq_start = seq_end
+
+        # Pad token_pos_in_items_ptr for batch processing
+        if token_pos_in_items_ptr:
+            token_pos_in_items_len = max(t.numel() for t in token_pos_in_items_ptr)
+            device = forward_batch.input_ids.device
+            token_pos_in_items_ptr = [
+                torch.cat(
+                    [
+                        t,
+                        torch.zeros(
+                            token_pos_in_items_len - t.numel(),
+                            dtype=torch.uint16,
+                            device=device,
+                        ),
+                    ]
+                )
+                for t in token_pos_in_items_ptr
+            ]
+
+        if not prefix_len_ptr or not token_pos_in_items_ptr:
+            return MultiItemScoringParams()
+
+        # Build final params
+        device = forward_batch.input_ids.device
+        return MultiItemScoringParams(
+            prefix_len_ptr=torch.tensor(
+                prefix_len_ptr, dtype=torch.uint32, device=device
+            ),
+            token_pos_in_items_ptr=torch.cat(token_pos_in_items_ptr, dim=0),
+            token_pos_in_items_len=token_pos_in_items_len & 0xFFFFFFFF,
+            max_item_len_ptr=torch.stack(
+                [
+                    t.to(torch.int32).max().to(torch.uint16)
+                    for t in token_pos_in_items_ptr
+                ],
+                dim=0,
+            ),
+        )
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        if forward_batch.forward_mode.is_decode_or_idle():
+            self.indices_updater_decode.update(
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                forward_batch.seq_lens_cpu,
+                forward_batch.seq_lens_sum,
+                decode_wrappers=self.decode_wrappers,
+                encoder_lens=forward_batch.encoder_lens,
+                spec_info=forward_batch.spec_info,
+                fixed_split_size=self.decode_split_tile_size,
+                disable_split_kv=False,
+            )
+            self.forward_metadata = DecodeMetadata(self.decode_wrappers)
+        elif forward_batch.forward_mode.is_draft_extend():
+            self.indices_updater_prefill.update(
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                forward_batch.seq_lens_cpu,
+                forward_batch.seq_lens_sum,
+                prefix_lens=None,
+                prefill_wrappers=self.prefill_wrappers_paged,
+                use_ragged=False,
+                encoder_lens=forward_batch.encoder_lens,
+                spec_info=forward_batch.spec_info,
+            )
+            self.forward_metadata = PrefillMetadata(
+                self.prefill_wrappers_paged, False, False
+            )
+        elif forward_batch.forward_mode.is_target_verify():
+            self.indices_updater_prefill.update(
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                forward_batch.seq_lens_cpu,
+                forward_batch.seq_lens_sum,
+                prefix_lens=None,
+                prefill_wrappers=self.prefill_wrappers_verify,
+                use_ragged=False,
+                encoder_lens=forward_batch.encoder_lens,
+                spec_info=forward_batch.spec_info,
+            )
+            self.forward_metadata = PrefillMetadata(
+                self.prefill_wrappers_verify, False, False
+            )
+        else:
+            prefix_lens = forward_batch.extend_prefix_lens
+
+            # Disable ragged wrapper and ensure prefix handling for multimodal and multi-item scoring
+            if self.is_multimodal or self.multi_item_scoring_delimiter is not None:
+                # use_ragged = False: Multi-item scoring requires the paged wrapper because:
+                # 1. Ragged wrapper doesn't support the specialized multi-item parameters
+                #    (prefix_len_ptr, token_pos_in_items_ptr, etc.)
+                # 2. Paged wrapper provides better control over attention masking needed
+                #    for respecting item boundaries in multi-item sequences
+                # 3. Custom masking logic conflicts with ragged wrapper's assumptions
+                use_ragged = False
+                extend_no_prefix = False
+            else:
+                use_ragged = (
+                    not self.enable_deterministic and not is_in_piecewise_cuda_graph()
+                )
+                extend_no_prefix = not any(forward_batch.extend_prefix_lens_cpu)
+
+            # Process multi-item scoring in attention backend instead of ForwardBatch
+            multi_item_params = MultiItemScoringParams()
+            if self.multi_item_scoring_delimiter is not None:
+                # Use new backend-specific implementation
+                multi_item_params = self._process_multi_item_scoring(forward_batch)
+
+            self.indices_updater_prefill.update(
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                forward_batch.seq_lens_cpu,
+                forward_batch.seq_lens_sum,
+                prefix_lens,
+                prefill_wrappers=self.prefill_wrappers_paged,
+                use_ragged=use_ragged,
+                encoder_lens=forward_batch.encoder_lens,
+                spec_info=None,
+                fixed_split_size=self.prefill_split_tile_size,
+                multi_item_params=multi_item_params,
+            )
+            self.forward_metadata = PrefillMetadata(
+                self.prefill_wrappers_paged,
+                use_ragged,
+                extend_no_prefix,
+                multi_item_params,
+            )
+
+    def init_cuda_graph_state(
+        self,
+        max_bs: int,
+        max_num_tokens: int,
+        kv_indices_buf: Optional[torch.Tensor] = None,
+    ):
+        if kv_indices_buf is None:
+            cuda_graph_kv_indices = torch.zeros(
+                (max_num_tokens * self.max_context_len,),
+                dtype=torch.int32,
+                device="cuda",
+            )
+        else:
+            cuda_graph_kv_indices = kv_indices_buf
+
+        self.cuda_graph_kv_indices = [cuda_graph_kv_indices] + [
+            cuda_graph_kv_indices.clone() for _ in range(self.num_wrappers - 1)
+        ]
+
+        # Ensure tensors are properly allocated
+        for i in range(self.num_wrappers):
+            # Force allocation by performing a small operation
+            if len(self.cuda_graph_kv_indices[i]) > 0:
+                self.cuda_graph_kv_indices[i][0] = 0
+
+        if not self.skip_prefill:
+            self.cuda_graph_custom_mask = torch.zeros(
+                (max_num_tokens * self.max_context_len),
+                dtype=torch.uint8,
+                device="cuda",
+            )
+            self.cuda_graph_qk_indptr = [x.clone() for x in self.kv_indptr]
+            self.cuda_graph_qo_indptr = [x.clone() for x in self.kv_indptr]
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInput],
+    ):
+        if forward_mode.is_decode_or_idle():
+            decode_wrappers = []
+            for i in range(self.num_wrappers):
+                decode_wrappers.append(
+                    BatchDecodeWithPagedKVCacheWrapper(
+                        self.workspace_buffer,
+                        "NHD",
+                        backend=self.decode_backend,
+                        use_cuda_graph=True,
+                        use_tensor_cores=self.decode_use_tensor_cores,
+                        paged_kv_indptr_buffer=self.kv_indptr[i][: num_tokens + 1],
+                        paged_kv_indices_buffer=self.cuda_graph_kv_indices[i],
+                        paged_kv_last_page_len_buffer=self.kv_last_page_len[
+                            :num_tokens
+                        ],
+                    )
+                )
+            seq_lens_sum = seq_lens.sum().item()
+            self.indices_updater_decode.update(
+                req_pool_indices,
+                seq_lens,
+                seq_lens.cpu(),  # may add a little overhead in capture stage
+                seq_lens_sum,
+                decode_wrappers=decode_wrappers,
+                encoder_lens=encoder_lens,
+                spec_info=spec_info,
+                fixed_split_size=None,
+                disable_split_kv=self.disable_cuda_graph_kv_split,
+            )
+            self.decode_cuda_graph_metadata[bs] = decode_wrappers
+            self.forward_metadata = DecodeMetadata(decode_wrappers)
+            for i in range(self.num_wrappers):
+                decode_wrappers[i].begin_forward = partial(
+                    fast_decode_plan, decode_wrappers[i]
+                )
+        elif forward_mode.is_target_verify():
+            prefill_wrappers = []
+            for i in range(self.num_wrappers):
+                prefill_wrappers.append(
+                    BatchPrefillWithPagedKVCacheWrapper(
+                        self.workspace_buffer,
+                        "NHD",
+                        use_cuda_graph=True,
+                        backend=self.prefill_backend,
+                        qo_indptr_buf=self.cuda_graph_qo_indptr[i][: bs + 1],
+                        paged_kv_indptr_buf=self.kv_indptr[i][: bs + 1],
+                        paged_kv_indices_buf=self.cuda_graph_kv_indices[i],
+                        paged_kv_last_page_len_buf=self.kv_last_page_len[:bs],
+                        custom_mask_buf=self.cuda_graph_custom_mask,
+                        mask_indptr_buf=self.cuda_graph_qk_indptr[i][: bs + 1],
+                    )
+                )
+            seq_lens_sum = seq_lens.sum().item()
+            self.indices_updater_prefill.update(
+                req_pool_indices,
+                seq_lens,
+                seq_lens.cpu(),  # may add a little overhead in capture stage
+                seq_lens_sum,
+                prefix_lens=None,
+                prefill_wrappers=prefill_wrappers,
+                use_ragged=False,
+                encoder_lens=encoder_lens,
+                spec_info=spec_info,
+            )
+            self.prefill_cuda_graph_metadata[bs] = prefill_wrappers
+            self.forward_metadata = PrefillMetadata(prefill_wrappers, False, False)
+        elif forward_mode.is_draft_extend():
+            prefill_wrappers = []
+            for i in range(self.num_wrappers):
+                prefill_wrappers.append(
+                    BatchPrefillWithPagedKVCacheWrapper(
+                        self.workspace_buffer,
+                        "NHD",
+                        backend=self.prefill_backend,
+                        use_cuda_graph=True,
+                        qo_indptr_buf=self.cuda_graph_qo_indptr[i][: bs + 1],
+                        paged_kv_indptr_buf=self.kv_indptr[i][: bs + 1],
+                        paged_kv_indices_buf=self.cuda_graph_kv_indices[i],
+                        paged_kv_last_page_len_buf=self.kv_last_page_len[:bs],
+                    )
+                )
+
+            seq_lens_sum = seq_lens.sum().item()
+            self.indices_updater_prefill.update(
+                req_pool_indices,
+                seq_lens,
+                seq_lens.cpu(),  # may add a little overhead in capture stage
+                seq_lens_sum,
+                prefix_lens=None,
+                prefill_wrappers=prefill_wrappers,
+                use_ragged=False,
+                encoder_lens=encoder_lens,
+                spec_info=spec_info,
+            )
+            self.prefill_cuda_graph_metadata[bs] = prefill_wrappers
+            self.forward_metadata = PrefillMetadata(prefill_wrappers, False, False)
+        elif forward_mode.is_dllm_extend():
+            prefill_wrappers = []
+            for i in range(self.num_wrappers):
+                prefill_wrappers.append(
+                    BatchPrefillWithPagedKVCacheWrapper(
+                        self.workspace_buffer,
+                        "NHD",
+                        backend=self.prefill_backend,
+                        use_cuda_graph=True,
+                        qo_indptr_buf=self.cuda_graph_qo_indptr[i][: bs + 1],
+                        paged_kv_indptr_buf=self.kv_indptr[i][: bs + 1],
+                        paged_kv_indices_buf=self.cuda_graph_kv_indices[i],
+                        paged_kv_last_page_len_buf=self.kv_last_page_len[:bs],
+                    )
+                )
+            seq_lens_sum = seq_lens.sum().item()
+            self.indices_updater_prefill.update(
+                req_pool_indices,
+                seq_lens,
+                seq_lens.cpu(),  # may add a little overhead in capture stage
+                seq_lens_sum,
+                prefix_lens=seq_lens - self.dllm_config.block_size,
+                prefill_wrappers=prefill_wrappers,
+                use_ragged=True,
+                encoder_lens=encoder_lens,
+                spec_info=None,
+            )
+            self.prefill_cuda_graph_metadata[bs] = prefill_wrappers
+            self.forward_metadata = PrefillMetadata(prefill_wrappers, True, False)
+        else:
+            raise ValueError(f"Invalid mode: {forward_mode=}")
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInput],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+        if forward_mode.is_decode_or_idle():
+            self.indices_updater_decode.update(
+                req_pool_indices[:bs],
+                seq_lens[:bs],
+                seq_lens_cpu[:bs] if seq_lens_cpu is not None else None,
+                seq_lens_sum,
+                decode_wrappers=self.decode_cuda_graph_metadata[bs],
+                encoder_lens=encoder_lens[:bs] if encoder_lens is not None else None,
+                spec_info=spec_info,
+                fixed_split_size=None,
+                disable_split_kv=self.disable_cuda_graph_kv_split,
+            )
+        elif forward_mode.is_target_verify():
+            self.indices_updater_prefill.update(
+                req_pool_indices[:bs],
+                seq_lens[:bs],
+                seq_lens_cpu[:bs] if seq_lens_cpu is not None else None,
+                seq_lens_sum,
+                prefix_lens=None,
+                prefill_wrappers=self.prefill_cuda_graph_metadata[bs],
+                use_ragged=False,
+                encoder_lens=encoder_lens[:bs] if encoder_lens is not None else None,
+                spec_info=spec_info,
+            )
+        elif forward_mode.is_draft_extend():
+            self.indices_updater_prefill.update(
+                req_pool_indices[:bs],
+                seq_lens[:bs],
+                seq_lens_cpu[:bs] if seq_lens_cpu is not None else None,
+                seq_lens_sum,
+                prefix_lens=None,
+                prefill_wrappers=self.prefill_cuda_graph_metadata[bs],
+                use_ragged=False,
+                encoder_lens=encoder_lens[:bs] if encoder_lens is not None else None,
+                spec_info=spec_info,
+            )
+        elif forward_mode.is_dllm_extend():
+            self.indices_updater_prefill.update(
+                req_pool_indices[:bs],
+                seq_lens[:bs],
+                seq_lens_cpu[:bs] if seq_lens_cpu is not None else None,
+                seq_lens_sum,
+                prefix_lens=seq_lens - self.dllm_config.block_size,
+                prefill_wrappers=self.prefill_cuda_graph_metadata[bs],
+                use_ragged=True,
+                encoder_lens=encoder_lens[:bs] if encoder_lens is not None else None,
+                spec_info=None,
+            )
+        else:
+            raise ValueError("Invalid forward mode")
+
+    def get_cuda_graph_seq_len_fill_value(self):
+        return 1
+
+    def forward_extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+    ):
+        prefill_wrapper_paged = self.forward_metadata.prefill_wrappers[
+            self._get_wrapper_idx(layer)
+        ]
+        cache_loc = (
+            forward_batch.out_cache_loc
+            if not layer.is_cross_attention
+            else forward_batch.encoder_out_cache_loc
+        )
+
+        logits_soft_cap = layer.logit_cap
+
+        q = q.contiguous()
+        if not self.forward_metadata.use_ragged:
+            if k is not None:
+                assert v is not None
+                if save_kv_cache:
+                    forward_batch.token_to_kv_pool.set_kv_buffer(
+                        layer, cache_loc, k, v, layer.k_scale, layer.v_scale
+                    )
+
+            o = prefill_wrapper_paged.forward(
+                q.view(-1, layer.tp_q_head_num, layer.head_dim),
+                forward_batch.token_to_kv_pool.get_kv_buffer(layer.layer_id),
+                causal=not layer.is_cross_attention,
+                sm_scale=layer.scaling,
+                # Disable sliding window attention for multi-item scoring:
+                # - Sliding window could cut across item boundaries, breaking semantic coherence
+                # - Multi-item sequences need full attention to properly handle delimiter tokens
+                # - Specialized multi-item parameters (prefix_len_ptr, token_pos_in_items_ptr)
+                #   provide more precise attention control than simple sliding windows
+                # - Item-aware masking takes precedence over window-based masking
+                window_left=(
+                    layer.sliding_window_size
+                    if not (
+                        self.forward_metadata.multi_item_params
+                        and self.forward_metadata.multi_item_params.is_enabled()
+                    )
+                    else -1
+                ),
+                logits_soft_cap=logits_soft_cap,
+                # Must use _float to avoid device-to-host copy that breaks cuda graph capture.
+                k_scale=layer.k_scale_float,
+                v_scale=layer.v_scale_float,
+            )
+        else:
+            # If `k`/`v` are not explicitly provided, fall back to the KV cache stored in
+            # `forward_batch.token_to_kv_pool` for this layer. This enables attention over
+            # previously cached context without re-materializing KV tensors (e.g., the
+            # IQuestLoopCoder path uses token_to_kv_pool as the KV source).
+            if k is None and v is None:
+                k = forward_batch.token_to_kv_pool.get_kv_buffer(layer.layer_id)[0]
+                v = forward_batch.token_to_kv_pool.get_kv_buffer(layer.layer_id)[1]
+            causal = True
+            if (
+                layer.is_cross_attention
+                or layer.attn_type == AttentionType.ENCODER_ONLY
+            ):
+                causal = False
+            if not self.is_dllm_model and layer.attn_type == AttentionType.ENCODER_ONLY:
+                save_kv_cache = False
+
+            if self.forward_metadata.extend_no_prefix:
+                # NOTE: FlashInfer currently has limitations with head_dim = 32 or other dimensions
+                # The FlashInfer head_dim limitation itself is tracked here:
+                # https://github.com/flashinfer-ai/flashinfer/issues/1048
+                o = self.prefill_wrapper_ragged.forward(
+                    q.view(-1, layer.tp_q_head_num, layer.head_dim),
+                    k.view(-1, layer.tp_k_head_num, layer.head_dim),
+                    v.view(-1, layer.tp_v_head_num, layer.head_dim),
+                    causal=causal,
+                    sm_scale=layer.scaling,
+                    logits_soft_cap=logits_soft_cap,
+                )
+
+            else:
+                if not self.is_dllm_model:
+                    # TODO: design a better interface
+                    # For other models, use causal attention for the ragged part as previously
+                    causal = True
+
+                o1, s1 = self.prefill_wrapper_ragged.forward_return_lse(
+                    q.view(-1, layer.tp_q_head_num, layer.head_dim),
+                    k.view(-1, layer.tp_k_head_num, layer.head_dim),
+                    v.view(-1, layer.tp_v_head_num, layer.head_dim),
+                    causal=causal,
+                    sm_scale=layer.scaling,
+                    logits_soft_cap=logits_soft_cap,
+                )
+                o2, s2 = prefill_wrapper_paged.forward_return_lse(
+                    q.view(-1, layer.tp_q_head_num, layer.head_dim),
+                    forward_batch.token_to_kv_pool.get_kv_buffer(layer.layer_id),
+                    causal=False,
+                    sm_scale=layer.scaling,
+                    logits_soft_cap=logits_soft_cap,
+                )
+
+                o, _ = merge_state(o1, s1, o2, s2)
+
+            if save_kv_cache:
+                forward_batch.token_to_kv_pool.set_kv_buffer(
+                    layer, cache_loc, k, v, layer.k_scale, layer.v_scale
+                )
+
+        return o.view(-1, layer.tp_q_head_num * layer.head_dim)
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+    ):
+        decode_wrapper = self.forward_metadata.decode_wrappers[
+            self._get_wrapper_idx(layer)
+        ]
+        cache_loc = (
+            forward_batch.out_cache_loc
+            if not layer.is_cross_attention
+            else forward_batch.encoder_out_cache_loc
+        )
+
+        if k is not None:
+            assert v is not None
+            if save_kv_cache:
+                forward_batch.token_to_kv_pool.set_kv_buffer(
+                    layer, cache_loc, k, v, layer.k_scale, layer.v_scale
+                )
+
+        # Call the wrapped function
+        o = decode_wrapper.forward(
+            q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim),
+            forward_batch.token_to_kv_pool.get_kv_buffer(layer.layer_id),
+            sm_scale=layer.scaling,
+            logits_soft_cap=layer.logit_cap,
+            # Must use _float to avoid device-to-host copy that breaks cuda graph capture.
+            k_scale=layer.k_scale_float,
+            v_scale=layer.v_scale_float,
+        )
+
+        return o.view(-1, layer.tp_q_head_num * layer.head_dim)
+
+    def _get_wrapper_idx(self, layer: RadixAttention):
+        if self.num_wrappers == 1:
+            return 0
+
+        if self.dispatch_reason == WrapperDispatch.SLIDING_WINDOW:
+            return layer.sliding_window_size == -1
+        if self.dispatch_reason == WrapperDispatch.CROSS_ATTENTION:
+            return layer.is_cross_attention
+
+        raise ValueError(f"Unknown dispatch reason: {self.dispatch_reason}")
+
+
+class FlashInferIndicesUpdaterDecode:
+    def __init__(self, model_runner: ModelRunner, attn_backend: FlashInferAttnBackend):
+        # Parse Constants
+        self.num_qo_heads = (
+            model_runner.model_config.num_attention_heads // get_attention_tp_size()
+        )
+        self.num_kv_heads = model_runner.model_config.get_num_kv_heads(
+            get_attention_tp_size()
+        )
+        self.head_dim = model_runner.model_config.head_dim
+        self.data_type = model_runner.kv_cache_dtype
+        self.q_data_type = model_runner.dtype
+        self.sliding_window_size = model_runner.sliding_window_size
+        self.attn_backend = attn_backend
+
+        # Buffers and wrappers
+        self.kv_indptr = attn_backend.kv_indptr
+        self.kv_last_page_len = attn_backend.kv_last_page_len
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+        self.token_to_kv_pool_allocator = model_runner.token_to_kv_pool_allocator
+
+        # Dispatch the update function
+        if self.attn_backend.dispatch_reason == WrapperDispatch.SLIDING_WINDOW:
+            self.update = self.update_sliding_window
+        elif self.attn_backend.dispatch_reason == WrapperDispatch.CROSS_ATTENTION:
+            self.update = self.update_cross_attention
+        else:
+            assert self.attn_backend.num_wrappers == 1
+            self.update = self.update_single_wrapper
+
+    def update(
+        self,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_cpu: Optional[torch.Tensor],
+        seq_lens_sum: int,
+        decode_wrappers: List[BatchDecodeWithPagedKVCacheWrapper],
+        encoder_lens: Optional[torch.Tensor],
+        spec_info: Optional[SpecInput],
+        fixed_split_size: Optional[int] = None,
+        disable_split_kv: Optional[bool] = None,
+    ):
+        # Keep the signature for type checking. It will be assigned during runtime.
+        raise NotImplementedError()
+
+    def update_single_wrapper(
+        self,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_cpu: Optional[torch.Tensor],
+        seq_lens_sum: int,
+        decode_wrappers: List[BatchDecodeWithPagedKVCacheWrapper],
+        encoder_lens: Optional[torch.Tensor],
+        spec_info: Optional[SpecInput],
+        fixed_split_size: Optional[int] = None,
+        disable_split_kv: Optional[bool] = None,
+    ):
+        decode_wrappers = decode_wrappers or self.decode_wrappers
+        self.call_begin_forward(
+            decode_wrappers[0],
+            req_pool_indices,
+            seq_lens,
+            seq_lens_sum,
+            self.kv_indptr[0],
+            None,
+            spec_info,
+            seq_lens_cpu,
+            fixed_split_size=fixed_split_size,
+            disable_split_kv=disable_split_kv,
+        )
+
+    def update_sliding_window(
+        self,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_cpu: Optional[torch.Tensor],
+        seq_lens_sum: int,
+        decode_wrappers: List[BatchDecodeWithPagedKVCacheWrapper],
+        encoder_lens: Optional[torch.Tensor],
+        spec_info: Optional[SpecInput],
+        fixed_split_size: Optional[int] = None,
+        disable_split_kv: Optional[bool] = None,
+    ):
+        assert self.sliding_window_size is not None
+        for wrapper_id in range(2):
+            if wrapper_id == 0:
+                # Sliding window attention
+                paged_kernel_lens_tmp = torch.clamp(
+                    seq_lens, max=self.sliding_window_size + 1
+                )
+                if seq_lens_cpu is not None:
+                    seq_lens_cpu_tmp = torch.clamp(
+                        seq_lens_cpu, max=self.sliding_window_size + 1
+                    )
+                    paged_kernel_lens_sum_tmp = seq_lens_cpu_tmp.sum().item()
+                else:
+                    paged_kernel_lens_sum_tmp = paged_kernel_lens_tmp.sum().item()
+                kv_start_idx_tmp = seq_lens - paged_kernel_lens_tmp
+            else:
+                # Full attention
+                paged_kernel_lens_tmp = seq_lens
+                paged_kernel_lens_sum_tmp = seq_lens_sum
+                seq_lens_cpu_tmp = seq_lens_cpu
+                kv_start_idx_tmp = None
+
+            use_sliding_window_kv_pool = wrapper_id == 0 and isinstance(
+                self.token_to_kv_pool_allocator, SWATokenToKVPoolAllocator
+            )
+
+            self.call_begin_forward(
+                decode_wrappers[wrapper_id],
+                req_pool_indices,
+                paged_kernel_lens_tmp,
+                paged_kernel_lens_sum_tmp,
+                self.kv_indptr[wrapper_id],
+                kv_start_idx_tmp,
+                spec_info,
+                seq_lens_cpu=seq_lens_cpu_tmp,
+                use_sliding_window_kv_pool=use_sliding_window_kv_pool,
+            )
+
+    def update_cross_attention(
+        self,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_cpu: Optional[torch.Tensor],
+        seq_lens_sum: int,
+        decode_wrappers: List[BatchDecodeWithPagedKVCacheWrapper],
+        encoder_lens: Optional[torch.Tensor],
+        spec_info: Optional[SpecInput],
+        fixed_split_size: Optional[int] = None,
+        disable_split_kv: Optional[bool] = None,
+    ):
+        for wrapper_id in range(2):
+            if wrapper_id == 0:
+                # Normal attention
+                paged_kernel_lens = seq_lens
+                kv_start_idx = encoder_lens
+            else:
+                # Cross attention
+                paged_kernel_lens = encoder_lens
+                kv_start_idx = torch.zeros_like(encoder_lens)
+                seq_lens_sum = encoder_lens.sum().item()
+
+            self.call_begin_forward(
+                decode_wrappers[wrapper_id],
+                req_pool_indices,
+                paged_kernel_lens,
+                seq_lens_sum,
+                self.kv_indptr[wrapper_id],
+                kv_start_idx,
+                spec_info,
+                seq_lens_cpu=seq_lens_cpu,
+            )
+
+    def call_begin_forward(
+        self,
+        wrapper: BatchDecodeWithPagedKVCacheWrapper,
+        req_pool_indices: torch.Tensor,
+        paged_kernel_lens: torch.Tensor,
+        paged_kernel_lens_sum: int,
+        kv_indptr: torch.Tensor,
+        kv_start_idx: torch.Tensor,
+        spec_info: Optional[SpecInput],
+        seq_lens_cpu: Optional[torch.Tensor],
+        use_sliding_window_kv_pool: bool = False,
+        fixed_split_size: Optional[int] = None,
+        disable_split_kv: Optional[bool] = None,
+    ):
+        if spec_info is None:
+            bs = len(req_pool_indices)
+            kv_indptr[1 : bs + 1] = torch.cumsum(paged_kernel_lens, dim=0)
+            kv_indptr = kv_indptr[: bs + 1]
+
+            if wrapper.is_cuda_graph_enabled:
+                # Directly write to the cuda graph input buffer
+                kv_indices = wrapper._paged_kv_indices_buf
+            else:
+                kv_indices = torch.empty(
+                    paged_kernel_lens_sum, dtype=torch.int32, device="cuda"
+                )
+
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                paged_kernel_lens,
+                kv_indptr,
+                kv_start_idx,
+                kv_indices,
+                self.req_to_token.shape[1],
+            )
+        else:
+            kv_indptr, kv_indices = spec_info.kv_indptr, spec_info.kv_indices
+            bs = kv_indptr.shape[0] - 1
+
+        if use_sliding_window_kv_pool:
+            kv_last_index = kv_indptr[-1]
+            kv_indices[:kv_last_index] = (
+                self.token_to_kv_pool_allocator.translate_loc_from_full_to_swa(
+                    kv_indices[:kv_last_index]
+                )
+            )
+
+        global global_override_indptr_cpu
+        locally_override = False
+        if seq_lens_cpu is not None and global_override_indptr_cpu is None:
+            locally_override = True
+            global_override_indptr_cpu = torch.empty_like(kv_indptr, device="cpu")
+            global_override_indptr_cpu[0] = 0
+            global_override_indptr_cpu[1 : bs + 1] = torch.cumsum(seq_lens_cpu, dim=0)
+
+        # Check if this specific wrapper's begin_forward has been replaced with fast_decode_plan
+        # by checking if it's a partial function with fast_decode_plan as the func
+        wrapper_uses_fast_decode_plan = (
+            hasattr(wrapper.begin_forward, "func")
+            and wrapper.begin_forward.func == fast_decode_plan
+        )
+
+        if wrapper_uses_fast_decode_plan:
+            # When begin_forward is replaced with fast_decode_plan, pass global_override_indptr_cpu
+            wrapper.begin_forward(
+                kv_indptr,
+                kv_indices,
+                self.kv_last_page_len[:bs],
+                self.num_qo_heads,
+                self.num_kv_heads,
+                self.head_dim,
+                1,
+                data_type=self.data_type,
+                q_data_type=self.q_data_type,
+                non_blocking=True,
+                fixed_split_size=fixed_split_size,
+                disable_split_kv=(
+                    disable_split_kv if disable_split_kv is not None else False
+                ),
+                global_override_indptr_cpu=global_override_indptr_cpu,
+            )
+        else:
+            # When using original begin_forward, don't pass global_override_indptr_cpu
+            wrapper.begin_forward(
+                kv_indptr,
+                kv_indices,
+                self.kv_last_page_len[:bs],
+                self.num_qo_heads,
+                self.num_kv_heads,
+                self.head_dim,
+                1,
+                data_type=self.data_type,
+                q_data_type=self.q_data_type,
+                non_blocking=True,
+                fixed_split_size=fixed_split_size,
+                disable_split_kv=(
+                    disable_split_kv if disable_split_kv is not None else False
+                ),
+            )
+
+        if locally_override:
+            global_override_indptr_cpu = None
+
+
+class FlashInferIndicesUpdaterPrefill:
+    def __init__(self, model_runner: ModelRunner, attn_backend: FlashInferAttnBackend):
+        # Parse Constants
+        self.num_qo_heads = (
+            model_runner.model_config.num_attention_heads // get_attention_tp_size()
+        )
+        self.num_kv_heads = model_runner.model_config.get_num_kv_heads(
+            get_attention_tp_size()
+        )
+        self.head_dim = model_runner.model_config.head_dim
+        self.data_type = model_runner.kv_cache_dtype
+        self.q_data_type = model_runner.dtype
+        self.sliding_window_size = model_runner.sliding_window_size
+        self.attn_backend = attn_backend
+
+        # Buffers and wrappers
+        self.kv_indptr = attn_backend.kv_indptr
+        self.kv_last_page_len = attn_backend.kv_last_page_len
+        self.qo_indptr = attn_backend.qo_indptr
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+        self.token_to_kv_pool_allocator = model_runner.token_to_kv_pool_allocator
+        self.prefill_wrapper_ragged = attn_backend.prefill_wrapper_ragged
+
+        # Dispatch the update function
+        if self.attn_backend.dispatch_reason == WrapperDispatch.SLIDING_WINDOW:
+            self.update = self.update_sliding_window
+        elif self.attn_backend.dispatch_reason == WrapperDispatch.CROSS_ATTENTION:
+            self.update = self.update_cross_attention
+        else:
+            assert self.attn_backend.num_wrappers == 1
+            self.update = self.update_single_wrapper
+
+    def update(
+        self,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_cpu: Optional[torch.Tensor],
+        seq_lens_sum: int,
+        prefix_lens: torch.Tensor,
+        prefill_wrappers: List[BatchPrefillWithPagedKVCacheWrapper],
+        use_ragged: bool,
+        encoder_lens: Optional[torch.Tensor],
+        spec_info: Optional[SpecInput],
+        fixed_split_size: Optional[int] = None,
+    ):
+        # Keep the signature for type checking. It will be assigned during runtime.
+        raise NotImplementedError()
+
+    def update_single_wrapper(
+        self,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_cpu: Optional[torch.Tensor],
+        seq_lens_sum: int,
+        prefix_lens: torch.Tensor,
+        prefill_wrappers: List[BatchPrefillWithPagedKVCacheWrapper],
+        use_ragged: bool,
+        encoder_lens: Optional[torch.Tensor],
+        spec_info: Optional[SpecInput],
+        fixed_split_size: Optional[int] = None,
+        multi_item_params: Optional[MultiItemScoringParams] = None,
+    ):
+        if use_ragged:
+            # TODO: remove this device sync, we can use forward_batch.extend_prefix_lens_cpu
+            # and forward_batch.extend_seq_lens_cpu
+            paged_kernel_lens = prefix_lens
+            paged_kernel_lens_sum = paged_kernel_lens.sum().item()
+        else:
+            paged_kernel_lens = seq_lens
+            paged_kernel_lens_sum = seq_lens_sum
+
+        self.call_begin_forward(
+            self.prefill_wrapper_ragged,
+            prefill_wrappers[0],
+            req_pool_indices,
+            paged_kernel_lens,
+            paged_kernel_lens_sum,
+            seq_lens,
+            prefix_lens,
+            None,
+            self.kv_indptr[0],
+            self.qo_indptr[0],
+            use_ragged,
+            spec_info,
+            fixed_split_size=fixed_split_size,
+            multi_item_params=multi_item_params,
+        )
+
+    def update_sliding_window(
+        self,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_cpu: Optional[torch.Tensor],
+        seq_lens_sum: int,
+        prefix_lens: torch.Tensor,
+        prefill_wrappers: List[BatchPrefillWithPagedKVCacheWrapper],
+        use_ragged: bool,
+        encoder_lens: Optional[torch.Tensor],
+        spec_info: Optional[SpecInput],
+        fixed_split_size: Optional[int] = None,
+        multi_item_params: Optional[MultiItemScoringParams] = None,
+    ):
+        for wrapper_id in range(2):
+            if wrapper_id == 0:
+                # window attention use paged only
+                paged_kernel_lens = torch.minimum(
+                    seq_lens,
+                    torch.tensor(self.sliding_window_size) + seq_lens - prefix_lens,
+                )
+                paged_kernel_lens_sum = paged_kernel_lens.sum().item()
+            else:
+                # full attention
+                paged_kernel_lens = seq_lens
+                paged_kernel_lens_sum = seq_lens_sum
+
+            kv_start_idx = seq_lens - paged_kernel_lens
+            use_sliding_window_kv_pool = wrapper_id == 0 and isinstance(
+                self.token_to_kv_pool_allocator, SWATokenToKVPoolAllocator
+            )
+
+            self.call_begin_forward(
+                self.prefill_wrapper_ragged,
+                prefill_wrappers[wrapper_id],
+                req_pool_indices,
+                paged_kernel_lens,
+                paged_kernel_lens_sum,
+                seq_lens,
+                prefix_lens,
+                kv_start_idx,
+                self.kv_indptr[wrapper_id],
+                self.qo_indptr[wrapper_id],
+                use_ragged,
+                spec_info,
+                use_sliding_window_kv_pool=use_sliding_window_kv_pool,
+                multi_item_params=multi_item_params,
+            )
+
+    def update_cross_attention(
+        self,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_cpu: Optional[torch.Tensor],
+        seq_lens_sum: int,
+        prefix_lens: torch.Tensor,
+        prefill_wrappers: List[BatchPrefillWithPagedKVCacheWrapper],
+        use_ragged: bool,
+        encoder_lens: Optional[torch.Tensor],
+        spec_info: Optional[SpecInput],
+        fixed_split_size: Optional[int] = None,
+        multi_item_params: Optional[MultiItemScoringParams] = None,
+    ):
+        for wrapper_id in range(2):
+            if wrapper_id == 0:
+                # normal attention
+                paged_kernel_lens = seq_lens
+                kv_start_idx = encoder_lens
+                paged_kernel_lens_sum = seq_lens_sum
+            else:
+                # cross attention
+                paged_kernel_lens = encoder_lens
+                kv_start_idx = torch.zeros_like(encoder_lens)
+                paged_kernel_lens_sum = paged_kernel_lens.sum().item()
+
+            self.call_begin_forward(
+                self.prefill_wrapper_ragged,
+                prefill_wrappers[wrapper_id],
+                req_pool_indices,
+                paged_kernel_lens,
+                paged_kernel_lens_sum,
+                seq_lens,
+                prefix_lens,
+                kv_start_idx,
+                self.kv_indptr[wrapper_id],
+                self.qo_indptr[wrapper_id],
+                use_ragged,
+                spec_info,
+                multi_item_params=multi_item_params,
+            )
+
+    def call_begin_forward(
+        self,
+        wrapper_ragged: BatchPrefillWithRaggedKVCacheWrapper,
+        wrapper_paged: BatchPrefillWithPagedKVCacheWrapper,
+        req_pool_indices: torch.Tensor,
+        paged_kernel_lens: torch.Tensor,
+        paged_kernel_lens_sum: int,
+        seq_lens: torch.Tensor,
+        prefix_lens: torch.Tensor,
+        kv_start_idx: torch.Tensor,
+        kv_indptr: torch.Tensor,
+        qo_indptr: torch.Tensor,
+        use_ragged: bool,
+        spec_info: Optional[SpecInput],
+        use_sliding_window_kv_pool: bool = False,
+        fixed_split_size: Optional[int] = None,
+        multi_item_params: Optional[MultiItemScoringParams] = None,
+    ):
+        bs = len(seq_lens)
+        if spec_info is None:
+            assert len(seq_lens) == len(req_pool_indices)
+            # Normal extend
+            kv_indptr[1 : bs + 1] = torch.cumsum(paged_kernel_lens, dim=0)
+            kv_indptr = kv_indptr[: bs + 1]
+            kv_indices = torch.empty(
+                paged_kernel_lens_sum + 256,
+                dtype=torch.int32,
+                device=req_pool_indices.device,
+            )
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                paged_kernel_lens,
+                kv_indptr,
+                kv_start_idx,
+                kv_indices,
+                self.req_to_token.shape[1],
+            )
+            qo_indptr[1 : bs + 1] = torch.cumsum(seq_lens - prefix_lens, dim=0)
+            qo_indptr = qo_indptr[: bs + 1]
+            custom_mask = None
+        else:
+            assert isinstance(spec_info, SpecInput)
+            kv_indices, kv_indptr, qo_indptr, custom_mask = (
+                spec_info.generate_attn_arg_prefill(
+                    req_pool_indices,
+                    paged_kernel_lens,
+                    paged_kernel_lens_sum,
+                    self.req_to_token,
+                )
+            )
+
+        # extend part
+        if use_ragged:
+            wrapper_ragged.begin_forward(
+                qo_indptr,
+                qo_indptr,
+                self.num_qo_heads,
+                self.num_kv_heads,
+                self.head_dim,
+                q_data_type=self.q_data_type,
+            )
+
+        if use_sliding_window_kv_pool:
+            kv_last_index = kv_indptr[-1]
+            kv_indices[:kv_last_index] = (
+                self.token_to_kv_pool_allocator.translate_loc_from_full_to_swa(
+                    kv_indices[:kv_last_index]
+                )
+            )
+
+        # cached part
+        # Conditionally set multi-item parameters
+        if multi_item_params is not None and multi_item_params.is_enabled():
+            # Multi-item scoring is active - use specialized parameters and disable generic custom_mask
+            use_custom_mask = None
+            prefix_len_ptr = multi_item_params.prefix_len_ptr
+            token_pos_in_items_ptr = multi_item_params.token_pos_in_items_ptr
+            token_pos_in_items_len = multi_item_params.token_pos_in_items_len
+            max_item_len_ptr = multi_item_params.max_item_len_ptr
+        else:
+            # No multi-item scoring - use standard parameters
+            use_custom_mask = custom_mask
+            prefix_len_ptr = None
+            token_pos_in_items_ptr = None
+            token_pos_in_items_len = 0
+            max_item_len_ptr = None
+
+        wrapper_paged.begin_forward(
+            qo_indptr,
+            kv_indptr,
+            kv_indices,
+            self.kv_last_page_len[:bs],
+            self.num_qo_heads,
+            self.num_kv_heads,
+            self.head_dim,
+            1,
+            q_data_type=self.q_data_type,
+            kv_data_type=self.data_type,
+            custom_mask=use_custom_mask,
+            non_blocking=True,
+            fixed_split_size=fixed_split_size,
+            prefix_len_ptr=prefix_len_ptr,
+            token_pos_in_items_ptr=token_pos_in_items_ptr,
+            token_pos_in_items_len=token_pos_in_items_len,
+            max_item_len_ptr=max_item_len_ptr,
+        )
+
+
+class FlashInferMultiStepDraftBackend:
+    """
+    Wrap multiple flashinfer attention backends as one for multiple consecutive
+    draft decoding steps.
+    """
+
+    def __init__(
+        self,
+        model_runner: ModelRunner,
+        topk: int,
+        speculative_num_steps: int,
+    ):
+        from sglang.srt.speculative.spec_utils import generate_draft_decode_kv_indices
+
+        self.topk = topk
+        self.speculative_num_steps = speculative_num_steps
+        self.generate_draft_decode_kv_indices = generate_draft_decode_kv_indices
+        self.page_size = model_runner.page_size
+
+        max_bs = model_runner.req_to_token_pool.size * self.topk
+        self.kv_indptr = torch.zeros(
+            (
+                self.speculative_num_steps,
+                max_bs + 1,
+            ),
+            dtype=torch.int32,
+            device=model_runner.device,
+        )
+        self.kv_last_page_len = torch.ones(
+            (max_bs,), dtype=torch.int32, device=model_runner.device
+        )
+        self.attn_backends: List[FlashInferAttnBackend] = []
+        for i in range(self.speculative_num_steps - 1):
+            self.attn_backends.append(
+                FlashInferAttnBackend(
+                    model_runner,
+                    skip_prefill=True,
+                    kv_indptr_buf=self.kv_indptr[i],
+                    kv_last_page_len_buf=self.kv_last_page_len,
+                )
+            )
+
+        self.max_context_len = self.attn_backends[0].max_context_len
+
+        # Cached variables for generate_draft_decode_kv_indices
+        self.pool_len = model_runner.req_to_token_pool.req_to_token.shape[1]
+
+    def common_template(
+        self,
+        forward_batch: ForwardBatch,
+        kv_indices_buffer: torch.Tensor,
+        call_fn: Callable,
+    ):
+        num_seqs = forward_batch.batch_size
+        bs = self.topk * num_seqs
+        seq_lens_sum = forward_batch.seq_lens_sum
+
+        self.generate_draft_decode_kv_indices[
+            (self.speculative_num_steps, num_seqs, self.topk)
+        ](
+            forward_batch.req_pool_indices,
+            forward_batch.req_to_token_pool.req_to_token,
+            forward_batch.seq_lens,
+            kv_indices_buffer,
+            self.kv_indptr,
+            forward_batch.positions,
+            self.pool_len,
+            kv_indices_buffer.shape[1],
+            self.kv_indptr.shape[1],
+            next_power_of_2(num_seqs),
+            next_power_of_2(self.speculative_num_steps),
+            next_power_of_2(bs),
+            self.page_size,
+        )
+
+        assert forward_batch.spec_info is not None
+        assert forward_batch.spec_info.is_draft_input()
+
+        # Copy the kv_indptr once to avoid multiple device-to-host copies in flashinfer's plan.
+        indptr_cpu_whole = self.kv_indptr[:, : bs + 1].cpu()
+        global global_override_indptr_cpu
+
+        for i in range(self.speculative_num_steps - 1):
+            forward_batch.spec_info.kv_indptr = self.kv_indptr[i, : bs + 1]
+            forward_batch.spec_info.kv_indices = kv_indices_buffer[i][
+                : seq_lens_sum * self.topk + bs * (i + 1)
+            ]
+            global_override_indptr_cpu = indptr_cpu_whole[i]
+            call_fn(i, forward_batch)
+
+        global_override_indptr_cpu = None
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        kv_indices = torch.empty(
+            (
+                self.speculative_num_steps,
+                forward_batch.batch_size * self.topk * self.max_context_len,
+            ),
+            dtype=torch.int32,
+            device="cuda",
+        )
+
+        def call_fn(i, forward_batch):
+            forward_batch.spec_info.kv_indptr = (
+                forward_batch.spec_info.kv_indptr.clone()
+            )
+            forward_batch.spec_info.kv_indices = (
+                forward_batch.spec_info.kv_indices.clone()
+            )
+            self.attn_backends[i].init_forward_metadata(forward_batch)
+
+        self.common_template(forward_batch, kv_indices, call_fn)
+
+    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
+        self.cuda_graph_kv_indices = torch.zeros(
+            (self.speculative_num_steps, max_bs * self.max_context_len),
+            dtype=torch.int32,
+            device="cuda",
+        )
+
+        for i in range(self.speculative_num_steps - 1):
+            self.attn_backends[i].init_cuda_graph_state(
+                max_bs, max_num_tokens, kv_indices_buf=self.cuda_graph_kv_indices[i]
+            )
+
+    def init_forward_metadata_capture_cuda_graph(self, forward_batch: ForwardBatch):
+        def call_fn(i, forward_batch):
+            self.attn_backends[i].init_forward_metadata_capture_cuda_graph(
+                forward_batch.batch_size,
+                forward_batch.batch_size * self.topk,
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                encoder_lens=None,
+                forward_mode=ForwardMode.DECODE,
+                spec_info=forward_batch.spec_info,
+            )
+
+        self.common_template(forward_batch, self.cuda_graph_kv_indices, call_fn)
+
+    def init_forward_metadata_replay_cuda_graph(
+        self, forward_batch: ForwardBatch, bs: int
+    ):
+        def call_fn(i, forward_batch):
+            self.attn_backends[i].init_forward_metadata_replay_cuda_graph(
+                bs,
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                seq_lens_sum=-1,
+                encoder_lens=None,
+                forward_mode=ForwardMode.DECODE,
+                spec_info=forward_batch.spec_info,
+                seq_lens_cpu=forward_batch.seq_lens_cpu,
+            )
+
+        self.common_template(forward_batch, self.cuda_graph_kv_indices, call_fn)
+
+
+def should_use_tensor_core(
+    kv_cache_dtype: torch.dtype,
+    num_attention_heads: int,
+    num_kv_heads: int,
+) -> bool:
+    """
+    Determine whether to use tensor cores for attention computation.
+
+    Args:
+        kv_cache_dtype: Data type of the KV cache
+        num_attention_heads: Number of attention heads
+        num_kv_heads: Number of key/value heads
+
+    Returns:
+        bool: Whether to use tensor cores
+    """
+    # Try to use environment variable first
+    env_override = os.environ.get("SGLANG_FLASHINFER_USE_TENSOR_CORE")
+    if env_override is not None:
+        return env_override.lower() == "true"
+
+    # Try to use _grouped_size_compiled_for_decode_kernels if available
+    # This is for flashinfer <=0.1.6. Otherwise, there is an accuracy bug
+    try:
+        from flashinfer.decode import _grouped_size_compiled_for_decode_kernels
+
+        if not _grouped_size_compiled_for_decode_kernels(
+            num_attention_heads,
+            num_kv_heads,
+        ):
+            return True
+        else:
+            return False
+    except (ImportError, AttributeError):
+        pass
+
+    # Calculate GQA group size
+    gqa_group_size = num_attention_heads // num_kv_heads
+
+    # For Flashinfer, a GQA group size of at least 4 is needed to efficiently
+    # use Tensor Cores, as it fuses the head group with the token dimension in MMA.
+    if kv_cache_dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
+        return True
+    elif kv_cache_dtype in (torch.float16, torch.half, torch.bfloat16):
+        return gqa_group_size >= 4
+    else:
+        return False
diff --git a/sglang/python/sglang/srt/layers/attention/flashinfer_mla_backend.py b/sglang/python/sglang/srt/layers/attention/flashinfer_mla_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..601a80cea52b1cd47389f5c23e547bab3d6b798b
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/flashinfer_mla_backend.py
@@ -0,0 +1,1090 @@
+from __future__ import annotations
+
+"""
+Support attention backend for flashinfer MLA.
+The flashinfer_mla_disable_ragged flag controls whether to use ragged prefill wrapper and defaults to be false.
+When it's set to false, all wrappers are BatchMLAPaged wrapper.
+When it's set to true, the backend uses BatchRagged and BatchMLAPaged wrapper for prefilling,
+and uses BatchMLAPaged wrapper for decoding.
+More details can be found in https://docs.flashinfer.ai/api/mla.html
+"""
+
+from dataclasses import dataclass
+from functools import partial
+from typing import TYPE_CHECKING, Callable, Optional, Union
+
+import torch
+
+from sglang.srt.compilation.piecewise_context_manager import is_in_piecewise_cuda_graph
+from sglang.srt.environ import envs
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+from sglang.srt.layers.attention.flashinfer_backend import (
+    create_flashinfer_kv_indices_triton,
+)
+from sglang.srt.layers.dp_attention import get_attention_tp_size
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.speculative.spec_info import SpecInput
+from sglang.srt.utils import (
+    is_flashinfer_available,
+    is_sm100_supported,
+    next_power_of_2,
+)
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.attention.flashinfer_mla_backend import (
+        FlashInferMlaAttnBackend,
+    )
+    from sglang.srt.layers.radix_attention import RadixAttention
+    from sglang.srt.model_executor.model_runner import ModelRunner
+    from sglang.srt.speculative.spec_info import SpecInput
+
+if envs.SGLANG_ENABLE_TORCH_COMPILE.get():
+    import logging
+
+    torch._logging.set_logs(dynamo=logging.ERROR)
+    torch._dynamo.config.suppress_errors = True
+
+if is_flashinfer_available():
+    from flashinfer import (
+        BatchMLAPagedAttentionWrapper,
+        BatchPrefillWithRaggedKVCacheWrapper,
+    )
+
+
+@dataclass
+class DecodeMetadata:
+    decode_wrapper: BatchMLAPagedAttentionWrapper
+
+
+@dataclass
+class PrefillMetadata:
+    prefill_wrapper: BatchMLAPagedAttentionWrapper
+    use_ragged: bool
+
+
+# Reuse this workspace buffer across all flashinfer wrappers
+global_workspace_buffer = None
+
+
+class FlashInferMhaChunkKVRunner:
+    def __init__(
+        self, model_runner: ModelRunner, attn_backend: FlashInferMlaAttnBackend
+    ):
+        # Parse Constants
+        self.num_local_heads = (
+            model_runner.model_config.num_attention_heads // get_attention_tp_size()
+        )
+        self.qk_nope_head_dim = model_runner.model_config.qk_nope_head_dim
+        self.qk_rope_head_dim = model_runner.model_config.qk_rope_head_dim
+        self.v_head_dim = model_runner.model_config.v_head_dim
+        self.data_type = model_runner.dtype
+        self.q_data_type = model_runner.dtype
+
+        # Buffers and wrappers
+        self.qo_indptr = attn_backend.qo_indptr
+        self.kv_indptr = attn_backend.kv_indptr
+        self.workspace_buffer = attn_backend.workspace_buffer
+        self.fmha_backend = attn_backend.fmha_backend
+
+        self.chunk_ragged_wrappers = []
+        self.ragged_wrapper = attn_backend.prefill_wrapper_ragged
+
+    def update_prefix_chunks(self, num_prefix_chunks: int):
+        while num_prefix_chunks > len(self.chunk_ragged_wrappers):
+            ragged_wrapper = BatchPrefillWithRaggedKVCacheWrapper(
+                self.workspace_buffer, "NHD", backend=self.fmha_backend
+            )
+            self.chunk_ragged_wrappers.append(ragged_wrapper)
+
+    def update_wrapper(
+        self,
+        forward_batch: ForwardBatch,
+        disable_flashinfer_ragged: bool = False,
+    ):
+        assert forward_batch.num_prefix_chunks is not None
+        num_prefix_chunks = forward_batch.num_prefix_chunks
+        self.update_prefix_chunks(num_prefix_chunks)
+
+        prefix_lens = forward_batch.extend_prefix_lens
+        seq_lens = forward_batch.seq_lens
+
+        bs = len(seq_lens)
+        qo_indptr = self.qo_indptr
+        qo_indptr[1 : bs + 1] = torch.cumsum(seq_lens - prefix_lens, dim=0)
+        qo_indptr = qo_indptr[: bs + 1]
+
+        for chunk_idx in range(forward_batch.num_prefix_chunks):
+            # MHA for chunked prefix kv cache when running model with MLA
+            assert forward_batch.prefix_chunk_idx is not None
+            assert forward_batch.prefix_chunk_cu_seq_lens is not None
+            assert forward_batch.prefix_chunk_max_seq_lens is not None
+
+            kv_indptr = forward_batch.prefix_chunk_cu_seq_lens[chunk_idx]
+            wrapper = self.chunk_ragged_wrappers[chunk_idx]
+            wrapper.begin_forward(
+                qo_indptr=qo_indptr,
+                kv_indptr=kv_indptr,
+                num_qo_heads=self.num_local_heads,
+                num_kv_heads=self.num_local_heads,
+                head_dim_qk=self.qk_nope_head_dim + self.qk_rope_head_dim,
+                head_dim_vo=self.v_head_dim,
+                q_data_type=self.q_data_type,
+                causal=False,
+            )
+        # ragged prefill
+        if not disable_flashinfer_ragged:
+            kv_indptr = (
+                qo_indptr
+                if not forward_batch.mha_one_shot
+                else self.kv_indptr[: bs + 1]
+            )
+            self.ragged_wrapper.begin_forward(
+                qo_indptr=qo_indptr,
+                kv_indptr=kv_indptr,
+                num_qo_heads=self.num_local_heads,
+                num_kv_heads=self.num_local_heads,
+                head_dim_qk=self.qk_nope_head_dim + self.qk_rope_head_dim,
+                head_dim_vo=self.v_head_dim,
+                q_data_type=self.q_data_type,
+                causal=True,
+            )
+
+    def forward(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+    ):
+        logits_soft_cap = layer.logit_cap
+        if forward_batch.attn_attend_prefix_cache:
+            chunk_idx = forward_batch.prefix_chunk_idx
+            assert chunk_idx >= 0
+            wrapper = self.chunk_ragged_wrappers[chunk_idx]
+            o = wrapper.forward_return_lse(
+                q.view(-1, layer.tp_q_head_num, layer.head_dim),
+                k.view(-1, layer.tp_k_head_num, layer.head_dim).to(q.dtype),
+                v.view(-1, layer.tp_v_head_num, layer.v_head_dim).to(q.dtype),
+                causal=False,
+                sm_scale=layer.scaling,
+                logits_soft_cap=logits_soft_cap,
+            )
+        else:
+            forward = (
+                self.ragged_wrapper.forward_return_lse
+                if forward_batch.mha_return_lse
+                else self.ragged_wrapper.forward
+            )
+            o = forward(
+                q.view(-1, layer.tp_q_head_num, layer.head_dim),
+                k.view(-1, layer.tp_k_head_num, layer.head_dim).to(q.dtype),
+                v.view(-1, layer.tp_v_head_num, layer.v_head_dim).to(q.dtype),
+                causal=True,
+                sm_scale=layer.scaling,
+                logits_soft_cap=logits_soft_cap,
+            )
+        return o
+
+
+class FlashInferMLAAttnBackend(AttentionBackend):
+    """Flashinfer attention kernels."""
+
+    def __init__(
+        self,
+        model_runner: ModelRunner,
+        skip_prefill: bool = False,
+        kv_indptr_buf: Optional[torch.Tensor] = None,
+        q_indptr_decode_buf: Optional[torch.Tensor] = None,
+    ):
+        super().__init__()
+
+        # Parse constants
+        self.max_context_len = model_runner.model_config.context_len
+        self.device = model_runner.device
+        self.skip_prefill = skip_prefill
+        self.enable_chunk_kv = (
+            not skip_prefill
+            and get_global_server_args().disaggregation_mode != "decode"
+            and not get_global_server_args().disable_chunked_prefix_cache
+            and not get_global_server_args().flashinfer_mla_disable_ragged
+        )
+        self.page_size = model_runner.page_size
+
+        # Allocate buffers
+        global global_workspace_buffer
+        if global_workspace_buffer is None:
+            # different from flashinfer zero_init_global_workspace_buffer
+            global_workspace_buffer = torch.empty(
+                envs.SGLANG_FLASHINFER_WORKSPACE_SIZE.get(),
+                dtype=torch.uint8,
+                device=model_runner.device,
+            )
+        self.workspace_buffer = global_workspace_buffer
+
+        max_bs = model_runner.req_to_token_pool.size
+        if kv_indptr_buf is None:
+            self.kv_indptr = torch.zeros(
+                (max_bs + 1,), dtype=torch.int32, device=model_runner.device
+            )
+        else:
+            self.kv_indptr = kv_indptr_buf
+
+        if not self.skip_prefill:
+            self.qo_indptr = torch.zeros(
+                (max_bs + 1,), dtype=torch.int32, device=model_runner.device
+            )
+
+        if q_indptr_decode_buf is None:
+            self.q_indptr_decode = torch.arange(
+                0, max_bs + 1, dtype=torch.int32, device=model_runner.device
+            )
+        else:
+            self.q_indptr_decode = q_indptr_decode_buf
+
+        if is_sm100_supported():
+            self.fmha_backend = "cutlass"
+        else:
+            self.fmha_backend = "auto"
+
+        self.prefill_wrapper_ragged = BatchPrefillWithRaggedKVCacheWrapper(
+            self.workspace_buffer, "NHD", backend=self.fmha_backend
+        )
+
+        if not self.skip_prefill:
+            self.prefill_wrapper_paged = BatchMLAPagedAttentionWrapper(
+                self.workspace_buffer,
+                backend="auto",
+            )
+
+            # FlashinferMLA backend uses mla wrapper for target verify
+            self.prefill_wrapper_verify = BatchMLAPagedAttentionWrapper(
+                self.workspace_buffer,
+                backend="auto",
+            )
+
+        self.decode_wrapper = BatchMLAPagedAttentionWrapper(
+            self.workspace_buffer, backend="auto"
+        )
+
+        # Create indices updater
+        if not skip_prefill:
+            self.indices_updater_prefill = FlashInferMLAIndicesUpdaterPrefill(
+                model_runner, self
+            )
+            if self.enable_chunk_kv:
+                self.mha_chunk_kv_cache = FlashInferMhaChunkKVRunner(model_runner, self)
+
+        self.indices_updater_decode = FlashInferMLAIndicesUpdaterDecode(
+            model_runner, self
+        )
+
+        # Other metadata
+        self.forward_metadata: Union[PrefillMetadata, DecodeMetadata] = None
+        self.decode_cuda_graph_metadata = {}
+        self.prefill_cuda_graph_metadata = {}  # For verify
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        if forward_batch.forward_mode.is_decode_or_idle():
+            self.indices_updater_decode.update(
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                forward_batch.seq_lens_sum,
+                decode_wrapper=self.decode_wrapper,
+                init_metadata_replay=False,
+            )
+            self.forward_metadata = DecodeMetadata(self.decode_wrapper)
+        elif forward_batch.forward_mode.is_draft_extend():
+            self.indices_updater_prefill.update(
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                forward_batch.seq_lens_sum,
+                prefix_lens=None,
+                prefill_wrapper_paged=self.prefill_wrapper_paged,
+                use_ragged=False,
+                spec_info=forward_batch.spec_info,
+            )
+            self.forward_metadata = PrefillMetadata(self.prefill_wrapper_paged, False)
+        elif forward_batch.forward_mode.is_target_verify():
+            self.indices_updater_prefill.update(
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                forward_batch.seq_lens_sum,
+                prefix_lens=None,
+                prefill_wrapper_paged=self.prefill_wrapper_verify,
+                use_ragged=False,
+                spec_info=forward_batch.spec_info,
+            )
+            self.forward_metadata = PrefillMetadata(self.prefill_wrapper_verify, False)
+        else:
+            prefix_lens = forward_batch.extend_prefix_lens
+            extend_no_prefix = not any(forward_batch.extend_prefix_lens_cpu)
+            use_ragged = (
+                not get_global_server_args().flashinfer_mla_disable_ragged
+                and extend_no_prefix
+                # Piecewise cuda graph should use paged prefill to be compatible with prefix cache
+                and not is_in_piecewise_cuda_graph()
+            )
+
+            self.indices_updater_prefill.update(
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                forward_batch.seq_lens_sum,
+                prefix_lens,
+                prefill_wrapper_paged=self.prefill_wrapper_paged,
+                use_ragged=use_ragged,
+            )
+            self.forward_metadata = PrefillMetadata(
+                self.prefill_wrapper_paged, use_ragged
+            )
+
+    def init_cuda_graph_state(
+        self,
+        max_bs: int,
+        max_num_tokens: int,
+        kv_indices_buf: Optional[torch.Tensor] = None,
+    ):
+        if kv_indices_buf is None:
+            cuda_graph_kv_indices = torch.zeros(
+                (max_bs * self.max_context_len,),
+                dtype=torch.int32,
+                device="cuda",
+            )
+        else:
+            cuda_graph_kv_indices = kv_indices_buf
+
+        self.cuda_graph_kv_indices = cuda_graph_kv_indices
+        self.cuda_graph_qo_indptr = self.q_indptr_decode.clone()
+        self.cuda_graph_kv_indptr = self.kv_indptr.clone()
+        self.cuda_graph_kv_lens = torch.ones(
+            (max_bs,), dtype=torch.int32, device=self.device
+        )
+
+        # For fast decode plan in graph replaying
+        self.cuda_graph_qo_indptr_cpu = self.cuda_graph_qo_indptr.to("cpu")
+        self.cuda_graph_kv_indptr_cpu = self.cuda_graph_kv_indptr.to("cpu")
+        self.fast_decode_kwargs = {
+            "qo_indptr_cpu": self.cuda_graph_qo_indptr_cpu,
+            "kv_indptr_cpu": self.cuda_graph_kv_indptr_cpu,
+            "kv_indices": self.cuda_graph_kv_indices,
+        }
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInput],
+    ):
+        if forward_mode.is_decode_or_idle():
+            decode_wrapper = BatchMLAPagedAttentionWrapper(
+                self.workspace_buffer,
+                use_cuda_graph=True,
+                qo_indptr=self.cuda_graph_qo_indptr[: num_tokens + 1],
+                kv_indptr=self.cuda_graph_kv_indptr[: num_tokens + 1],
+                kv_indices=self.cuda_graph_kv_indices,
+                kv_len_arr=self.cuda_graph_kv_lens[:num_tokens],
+                backend="auto",
+            )
+
+            seq_lens_sum = seq_lens.sum().item()
+            self.indices_updater_decode.update(
+                req_pool_indices,
+                seq_lens,
+                seq_lens_sum,
+                decode_wrapper=decode_wrapper,
+                init_metadata_replay=False,
+                spec_info=spec_info,
+            )
+            self.decode_cuda_graph_metadata[bs] = decode_wrapper
+            self.forward_metadata = DecodeMetadata(decode_wrapper)
+            decode_wrapper.plan = partial(fast_mla_decode_plan, decode_wrapper)
+        elif forward_mode.is_target_verify():
+            verify_wrapper = BatchMLAPagedAttentionWrapper(
+                self.workspace_buffer,
+                use_cuda_graph=True,
+                qo_indptr=self.cuda_graph_qo_indptr[: bs + 1],
+                kv_indptr=self.cuda_graph_kv_indptr[: bs + 1],
+                kv_indices=self.cuda_graph_kv_indices,
+                kv_len_arr=self.cuda_graph_kv_lens[:bs],
+                backend="auto",
+            )
+            seq_lens_sum = seq_lens.sum().item()
+            self.indices_updater_prefill.update(
+                req_pool_indices,
+                seq_lens,
+                seq_lens_sum,
+                prefix_lens=None,
+                prefill_wrapper_paged=verify_wrapper,
+                use_ragged=False,
+                spec_info=spec_info,
+            )
+            self.prefill_cuda_graph_metadata[bs] = verify_wrapper
+            self.forward_metadata = PrefillMetadata(verify_wrapper, False)
+        elif forward_mode.is_draft_extend():
+            draft_extend_wrapper = BatchMLAPagedAttentionWrapper(
+                self.workspace_buffer,
+                use_cuda_graph=True,
+                qo_indptr=self.cuda_graph_qo_indptr[: bs + 1],
+                kv_indptr=self.cuda_graph_kv_indptr[: bs + 1],
+                kv_indices=self.cuda_graph_kv_indices,
+                kv_len_arr=self.cuda_graph_kv_lens[:bs],
+                backend="auto",
+            )
+            seq_lens_sum = seq_lens.sum().item()
+            self.indices_updater_prefill.update(
+                req_pool_indices,
+                seq_lens,
+                seq_lens_sum,
+                prefix_lens=None,
+                prefill_wrapper_paged=draft_extend_wrapper,
+                use_ragged=False,
+                spec_info=spec_info,
+            )
+            self.prefill_cuda_graph_metadata[bs] = draft_extend_wrapper
+            self.forward_metadata = PrefillMetadata(draft_extend_wrapper, False)
+        else:
+            raise ValueError(f"Invalid mode: {forward_mode=}")
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInput],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+        if forward_mode.is_decode_or_idle():
+            assert seq_lens_cpu is not None
+            kv_len_arr_cpu = seq_lens_cpu[:bs]
+            self.cuda_graph_kv_indptr_cpu[1 : bs + 1] = torch.cumsum(
+                kv_len_arr_cpu, dim=0
+            )
+            self.fast_decode_kwargs.update(
+                {
+                    "qo_indptr_cpu": self.cuda_graph_qo_indptr_cpu[: bs + 1],
+                    "kv_indptr_cpu": self.cuda_graph_kv_indptr_cpu[: bs + 1],
+                    "kv_len_arr_cpu": kv_len_arr_cpu,
+                }
+            )
+
+            self.indices_updater_decode.update(
+                req_pool_indices[:bs],
+                seq_lens[:bs],
+                seq_lens_sum,
+                decode_wrapper=self.decode_cuda_graph_metadata[bs],
+                init_metadata_replay=True,
+                spec_info=spec_info,
+                **self.fast_decode_kwargs,
+            )
+        elif forward_mode.is_target_verify():
+            self.indices_updater_prefill.update(
+                req_pool_indices[:bs],
+                seq_lens[:bs],
+                seq_lens_sum,
+                prefix_lens=None,
+                prefill_wrapper_paged=self.prefill_cuda_graph_metadata[bs],
+                use_ragged=False,
+                spec_info=spec_info,
+            )
+        elif forward_mode.is_draft_extend():
+            self.indices_updater_prefill.update(
+                req_pool_indices[:bs],
+                seq_lens[:bs],
+                seq_lens_sum,
+                prefix_lens=None,
+                prefill_wrapper_paged=self.prefill_cuda_graph_metadata[bs],
+                use_ragged=False,
+                spec_info=spec_info,
+            )
+        else:
+            raise ValueError(f"Invalid forward mode: {forward_mode=}")
+
+    def get_cuda_graph_seq_len_fill_value(self):
+        return 1
+
+    def init_mha_chunk_metadata(
+        self, forward_batch: ForwardBatch, disable_flashinfer_ragged: bool = False
+    ):
+        """Init the metadata for a forward pass."""
+        self.mha_chunk_kv_cache.update_wrapper(forward_batch, disable_flashinfer_ragged)
+
+    def forward_extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+        q_rope: Optional[torch.Tensor] = None,
+        k_rope: Optional[torch.Tensor] = None,
+    ):
+        if forward_batch.attn_attend_prefix_cache is not None and any(
+            forward_batch.extend_prefix_lens_cpu
+        ):  # MHA Chunk
+            assert self.enable_chunk_kv
+            assert q_rope is None
+            assert k_rope is None
+            return self.mha_chunk_kv_cache.forward(q, k, v, layer, forward_batch)
+
+        cache_loc = forward_batch.out_cache_loc
+        logits_soft_cap = layer.logit_cap
+        prefill_wrapper_paged = self.forward_metadata.prefill_wrapper
+
+        # Save kv cache
+        if save_kv_cache and k is not None:
+            assert v is not None
+            if save_kv_cache:
+                if k_rope is not None:
+                    forward_batch.token_to_kv_pool.set_mla_kv_buffer(
+                        layer, cache_loc, k, k_rope
+                    )
+                else:
+                    forward_batch.token_to_kv_pool.set_kv_buffer(layer, cache_loc, k, v)
+        if q_rope is not None:
+            q = q.view(-1, layer.tp_q_head_num, layer.v_head_dim)
+            q_rope = q_rope.view(
+                -1, layer.tp_q_head_num, layer.head_dim - layer.v_head_dim
+            )
+
+        if self.forward_metadata.use_ragged:
+            # ragged prefill
+            if q_rope is not None:
+                q = torch.cat([q, q_rope], dim=-1)
+            qall = q.view(-1, layer.tp_q_head_num, layer.head_dim)
+            if k_rope is not None:
+                k = torch.cat([k, k_rope], dim=-1)
+            o = self.prefill_wrapper_ragged.forward(
+                qall,
+                k.view(-1, layer.tp_k_head_num, layer.head_dim).to(q.dtype),
+                v.view(-1, layer.tp_k_head_num, layer.v_head_dim).to(q.dtype),
+                causal=True,
+                sm_scale=layer.scaling,
+                logits_soft_cap=logits_soft_cap,
+            )
+        else:
+            # mla paged prefill
+            k_buf = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id).to(
+                q.dtype
+            )
+            if q_rope is None:
+                qall = q.view(-1, layer.tp_q_head_num, layer.head_dim)
+                q, q_rope = (
+                    qall[:, :, : layer.v_head_dim],
+                    qall[:, :, layer.v_head_dim :],
+                )
+            o = q.new_empty(q.shape)
+            o = prefill_wrapper_paged.run(
+                q,
+                q_rope,
+                k_buf[:, :, : layer.v_head_dim],
+                k_buf[:, :, layer.v_head_dim :],
+                out=o,
+            )
+
+        return o.view(-1, layer.tp_q_head_num * layer.v_head_dim)
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+        # For multi-head latent attention
+        q_rope: Optional[torch.Tensor] = None,
+        k_rope: Optional[torch.Tensor] = None,
+    ):
+        decode_wrapper = self.forward_metadata.decode_wrapper
+        cache_loc = forward_batch.out_cache_loc
+
+        if k is not None:
+            assert v is not None
+            if save_kv_cache:
+                if k_rope is not None:
+                    forward_batch.token_to_kv_pool.set_mla_kv_buffer(
+                        layer,
+                        cache_loc,
+                        k,
+                        k_rope,
+                    )
+                else:
+                    forward_batch.token_to_kv_pool.set_kv_buffer(
+                        layer,
+                        cache_loc,
+                        k,
+                        v,
+                    )
+
+        # Reshape inputs
+        if q_rope is not None:
+            q_nope = q.view(-1, layer.tp_q_head_num, layer.v_head_dim)
+            q_rope = q_rope.view(
+                -1, layer.tp_q_head_num, layer.head_dim - layer.v_head_dim
+            )
+        else:
+            reshaped_q = q.view(-1, layer.tp_q_head_num, layer.head_dim)
+            q_nope = reshaped_q[:, :, : layer.v_head_dim]
+            q_rope = reshaped_q[:, :, layer.v_head_dim :]
+
+        k_buffer = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id).to(
+            q.dtype
+        )
+
+        o = q_nope.new_empty(q_nope.shape)
+        # Direct call to run without the wrapper
+        o = decode_wrapper.run(
+            q_nope,
+            q_rope,
+            k_buffer[:, :, : layer.v_head_dim],
+            k_buffer[:, :, layer.v_head_dim :],
+            out=o,
+        )
+
+        return o.view(-1, layer.tp_q_head_num * layer.v_head_dim)
+
+
+class FlashInferMLAIndicesUpdaterDecode:
+    def __init__(self, model_runner: ModelRunner, attn_backend: AttentionBackend):
+        # Parse Constants
+        self.num_local_heads = (
+            model_runner.model_config.num_attention_heads // get_attention_tp_size()
+        )
+        self.kv_lora_rank = model_runner.model_config.kv_lora_rank
+        self.qk_nope_head_dim = model_runner.model_config.qk_nope_head_dim
+        self.qk_rope_head_dim = model_runner.model_config.qk_rope_head_dim
+        self.scaling = model_runner.model_config.scaling
+        self.data_type = model_runner.dtype
+        self.attn_backend = attn_backend
+
+        # Buffers and wrappers
+        self.kv_indptr = attn_backend.kv_indptr
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+        self.q_indptr = attn_backend.q_indptr_decode
+
+    def update(
+        self,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        decode_wrapper: BatchMLAPagedAttentionWrapper,
+        init_metadata_replay: bool = False,
+        spec_info: Optional[SpecInput] = None,
+        **fast_decode_kwargs,
+    ):
+        decode_wrapper = decode_wrapper or self.decode_wrapper
+        self.call_begin_forward(
+            decode_wrapper,
+            req_pool_indices,
+            seq_lens,
+            seq_lens_sum,
+            self.q_indptr,
+            self.kv_indptr,
+            init_metadata_replay,
+            spec_info,
+            **fast_decode_kwargs,
+        )
+
+    def call_begin_forward(
+        self,
+        wrapper: BatchMLAPagedAttentionWrapper,
+        req_pool_indices: torch.Tensor,
+        paged_kernel_lens: torch.Tensor,
+        paged_kernel_lens_sum: int,
+        q_indptr: torch.Tensor,
+        kv_indptr: torch.Tensor,
+        init_metadata_replay: bool = False,
+        spec_info: Optional[SpecInput] = None,
+        **fast_decode_kwargs,
+    ):
+        bs = len(req_pool_indices)
+        q_indptr = q_indptr[: bs + 1]
+        kv_lens = paged_kernel_lens.to(torch.int32)
+        sm_scale = self.scaling
+        if spec_info is None:
+            kv_indptr[1 : bs + 1] = torch.cumsum(paged_kernel_lens, dim=0)
+            kv_indptr = kv_indptr[: bs + 1]
+            kv_indices = (
+                torch.empty(paged_kernel_lens_sum, dtype=torch.int32, device="cuda")
+                if not init_metadata_replay
+                else fast_decode_kwargs["kv_indices"]
+            )
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                paged_kernel_lens,
+                kv_indptr,
+                None,
+                kv_indices,
+                self.req_to_token.shape[1],
+            )
+        else:
+            kv_indptr, kv_indices = spec_info.kv_indptr, spec_info.kv_indices
+
+        if not init_metadata_replay:
+            wrapper.plan(
+                q_indptr,
+                kv_indptr,
+                kv_indices,
+                kv_lens,
+                self.num_local_heads,
+                self.kv_lora_rank,
+                self.qk_rope_head_dim,
+                1,
+                False,
+                sm_scale,
+                self.data_type,
+                self.data_type,
+            )
+        else:
+            wrapper.plan(
+                fast_decode_kwargs["qo_indptr_cpu"],
+                fast_decode_kwargs["kv_indptr_cpu"],
+                kv_indices,
+                fast_decode_kwargs["kv_len_arr_cpu"],
+                self.num_local_heads,
+                self.kv_lora_rank,
+                self.qk_rope_head_dim,
+                1,
+                False,
+                sm_scale,
+                self.data_type,
+                self.data_type,
+            )
+
+
+class FlashInferMLAIndicesUpdaterPrefill:
+    def __init__(self, model_runner: ModelRunner, attn_backend: AttentionBackend):
+        # Parse Constants
+        self.num_local_heads = (
+            model_runner.model_config.num_attention_heads // get_attention_tp_size()
+        )
+        self.kv_lora_rank = model_runner.model_config.kv_lora_rank
+        self.qk_nope_head_dim = model_runner.model_config.qk_nope_head_dim
+        self.qk_rope_head_dim = model_runner.model_config.qk_rope_head_dim
+        self.v_head_dim = model_runner.model_config.v_head_dim
+        self.scaling = model_runner.model_config.scaling
+        self.data_type = model_runner.dtype
+        self.q_data_type = model_runner.dtype
+        self.attn_backend = attn_backend
+
+        # Buffers and wrappers
+        self.kv_indptr = attn_backend.kv_indptr
+        self.qo_indptr = attn_backend.qo_indptr
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+        self.prefill_wrapper_ragged = attn_backend.prefill_wrapper_ragged
+
+    def update(
+        self,
+        req_pool_indices: torch.Tnesor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        prefix_lens: torch.Tensor,
+        prefill_wrapper_paged: BatchMLAPagedAttentionWrapper,
+        use_ragged: bool,
+        spec_info: Optional[SpecInput] = None,
+    ):
+        if use_ragged:
+            paged_kernel_lens = prefix_lens
+            paged_kernel_lens_sum = paged_kernel_lens.sum().item()
+        else:
+            paged_kernel_lens = seq_lens
+            paged_kernel_lens_sum = seq_lens_sum
+
+        self.call_begin_forward(
+            self.prefill_wrapper_ragged,
+            prefill_wrapper_paged,
+            req_pool_indices,
+            paged_kernel_lens,
+            paged_kernel_lens_sum,
+            seq_lens,
+            prefix_lens,
+            self.kv_indptr,
+            self.qo_indptr,
+            use_ragged,
+            spec_info,
+        )
+
+    def call_begin_forward(
+        self,
+        wrapper_ragged: BatchPrefillWithRaggedKVCacheWrapper,
+        wrapper_paged: BatchMLAPagedAttentionWrapper,
+        req_pool_indices: torch.Tensor,
+        paged_kernel_lens: torch.Tensor,
+        paged_kernel_lens_sum: int,
+        seq_lens: torch.Tensor,
+        prefix_lens: torch.Tensor,
+        kv_indptr: torch.Tensor,
+        qo_indptr: torch.Tensor,
+        use_ragged: bool,
+        spec_info: Optional[SpecInput] = None,
+    ):
+        bs = len(seq_lens)
+        sm_scale = self.scaling
+
+        if spec_info is None:
+            assert len(seq_lens) == len(req_pool_indices)
+            kv_indptr[1 : bs + 1] = torch.cumsum(paged_kernel_lens, dim=0)
+            kv_indptr = kv_indptr[: bs + 1]
+            kv_indices = torch.empty(
+                paged_kernel_lens_sum,
+                dtype=torch.int32,
+                device=req_pool_indices.device,
+            )
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                paged_kernel_lens,
+                kv_indptr,
+                None,
+                kv_indices,
+                self.req_to_token.shape[1],
+            )
+            qo_indptr[1 : bs + 1] = torch.cumsum(seq_lens - prefix_lens, dim=0)
+            qo_indptr = qo_indptr[: bs + 1]
+            custom_mask = None
+        else:
+            assert isinstance(spec_info, SpecInput)
+            # TODO: Support topk > 1 with custom mask
+            kv_indices, kv_indptr, qo_indptr, custom_mask = (
+                spec_info.generate_attn_arg_prefill(
+                    req_pool_indices,
+                    paged_kernel_lens,
+                    paged_kernel_lens_sum,
+                    self.req_to_token,
+                )
+            )
+
+        if use_ragged:
+            # ragged prefill
+            wrapper_ragged.begin_forward(
+                qo_indptr=qo_indptr,
+                kv_indptr=qo_indptr,
+                num_qo_heads=self.num_local_heads,
+                num_kv_heads=self.num_local_heads,
+                head_dim_qk=self.qk_nope_head_dim + self.qk_rope_head_dim,
+                head_dim_vo=self.v_head_dim,
+                q_data_type=self.q_data_type,
+                causal=True,
+            )
+        else:
+            # mla paged prefill
+            kv_len_arr = kv_indptr[1:] - kv_indptr[:-1]
+            wrapper_paged.plan(
+                qo_indptr,
+                kv_indptr,
+                kv_indices,
+                kv_len_arr,
+                self.num_local_heads,
+                self.kv_lora_rank,
+                self.qk_rope_head_dim,
+                1,
+                True,
+                sm_scale,
+                self.q_data_type,
+                self.data_type,
+            )
+
+
+class FlashInferMLAMultiStepDraftBackend:
+    """
+    Wrap multiple flashinfer mla attention backends as one for multiple consecutive
+    draft decoding steps.
+    """
+
+    def __init__(
+        self,
+        model_runner: ModelRunner,
+        topk: int,
+        speculative_num_steps: int,
+    ):
+        from sglang.srt.speculative.spec_utils import generate_draft_decode_kv_indices
+
+        if topk > 1:
+            raise ValueError(
+                "Currently Flashinfer MLA only supports topk=1 for speculative decoding"
+            )
+        self.topk = topk
+        self.speculative_num_steps = speculative_num_steps
+        self.generate_draft_decode_kv_indices = generate_draft_decode_kv_indices
+
+        max_bs = model_runner.req_to_token_pool.size * self.topk
+        self.kv_indptr = torch.zeros(
+            (
+                self.speculative_num_steps,
+                max_bs + 1,
+            ),
+            dtype=torch.int32,
+            device=model_runner.device,
+        )
+        self.q_indptr_decode = torch.arange(
+            0, max_bs + 1, dtype=torch.int32, device=model_runner.device
+        )
+
+        self.attn_backends = []
+        for i in range(self.speculative_num_steps - 1):
+            self.attn_backends.append(
+                FlashInferMLAAttnBackend(
+                    model_runner,
+                    skip_prefill=True,
+                    kv_indptr_buf=self.kv_indptr[i],
+                    q_indptr_decode_buf=self.q_indptr_decode,
+                )
+            )
+
+        self.max_context_len = self.attn_backends[0].max_context_len
+
+        # Cached variables for generate_draft_decode_kv_indices
+        self.pool_len = model_runner.req_to_token_pool.req_to_token.shape[1]
+        self.page_size = model_runner.server_args.page_size
+
+    def common_template(
+        self,
+        forward_batch: ForwardBatch,
+        kv_indices_buffer: torch.Tensor,
+        call_fn: Callable,
+    ):
+        num_seqs = forward_batch.batch_size
+        bs = self.topk * num_seqs
+        seq_lens_sum = forward_batch.seq_lens_sum
+
+        self.generate_draft_decode_kv_indices[
+            (self.speculative_num_steps, num_seqs, self.topk)
+        ](
+            forward_batch.req_pool_indices,
+            forward_batch.req_to_token_pool.req_to_token,
+            forward_batch.seq_lens,
+            kv_indices_buffer,
+            self.kv_indptr,
+            forward_batch.positions,
+            self.pool_len,
+            kv_indices_buffer.shape[1],
+            self.kv_indptr.shape[1],
+            next_power_of_2(num_seqs),
+            next_power_of_2(self.speculative_num_steps),
+            next_power_of_2(bs),
+            self.page_size,
+        )
+
+        assert forward_batch.spec_info is not None
+        assert forward_batch.spec_info.is_draft_input()
+
+        for i in range(self.speculative_num_steps - 1):
+            forward_batch.spec_info.kv_indptr = self.kv_indptr[i, : bs + 1]
+            forward_batch.spec_info.kv_indices = kv_indices_buffer[i][
+                : seq_lens_sum * self.topk + bs * (i + 1)
+            ]
+            call_fn(i, forward_batch)
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        kv_indices = torch.zeros(
+            (
+                self.speculative_num_steps,
+                forward_batch.batch_size * self.topk * self.max_context_len,
+            ),
+            dtype=torch.int32,
+            device="cuda",
+        )
+
+        def call_fn(i, forward_batch):
+            forward_batch.spec_info.kv_indptr = (
+                forward_batch.spec_info.kv_indptr.clone()
+            )
+            forward_batch.spec_info.kv_indices = (
+                forward_batch.spec_info.kv_indices.clone()
+            )
+            self.attn_backends[i].init_forward_metadata(forward_batch)
+
+        self.common_template(forward_batch, kv_indices, call_fn)
+
+    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
+        self.cuda_graph_kv_indices = torch.zeros(
+            (self.speculative_num_steps, max_bs * self.max_context_len),
+            dtype=torch.int32,
+            device="cuda",
+        )
+
+        for i in range(self.speculative_num_steps - 1):
+            self.attn_backends[i].init_cuda_graph_state(
+                max_bs, max_num_tokens, kv_indices_buf=self.cuda_graph_kv_indices[i]
+            )
+
+    def init_forward_metadata_capture_cuda_graph(self, forward_batch: ForwardBatch):
+        def call_fn(i, forward_batch):
+            self.attn_backends[i].init_forward_metadata_capture_cuda_graph(
+                forward_batch.batch_size,
+                forward_batch.batch_size * self.topk,
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                encoder_lens=None,
+                forward_mode=ForwardMode.DECODE,
+                spec_info=forward_batch.spec_info,
+            )
+
+        self.common_template(forward_batch, self.cuda_graph_kv_indices, call_fn)
+
+    def init_forward_metadata_replay_cuda_graph(
+        self, forward_batch: ForwardBatch, bs: int
+    ):
+        def call_fn(i, forward_batch):
+            self.attn_backends[i].init_forward_metadata_replay_cuda_graph(
+                bs,
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                seq_lens_sum=-1,
+                encoder_lens=None,
+                forward_mode=ForwardMode.DECODE,
+                spec_info=forward_batch.spec_info,
+                seq_lens_cpu=forward_batch.seq_lens_cpu,
+            )
+
+        self.common_template(forward_batch, self.cuda_graph_kv_indices, call_fn)
+
+
+def fast_mla_decode_plan(
+    self,
+    qo_indptr_cpu: torch.Tensor,
+    kv_indptr_cpu: torch.Tensor,
+    kv_indices: torch.Tensor,
+    kv_len_arr_cpu: torch.Tensor,
+    num_heads: int,
+    head_dim_ckv: int,
+    head_dim_kpe: int,
+    page_size: int,
+    causal: bool,
+    sm_scale: float,
+    q_data_type: torch.dtype,
+    kv_data_type: torch.dtype,
+) -> None:
+    """A faster version of BatchMLAPagedAttentionWrapper::plan,
+    for skipping the stream synchronization in original plan function during
+    cuda graph replaying.
+    """
+    self._causal = causal
+    self._page_size = page_size
+    self._sm_scale = sm_scale
+
+    try:
+        # Standard version with just the required arguments (no use_profiler)
+        self._cached_module.plan(
+            self._float_workspace_buffer,
+            self._int_workspace_buffer,
+            self._pin_memory_int_workspace_buffer,
+            qo_indptr_cpu,
+            kv_indptr_cpu,
+            kv_len_arr_cpu,
+            num_heads,
+            head_dim_ckv,
+            causal,
+        )
+    except Exception as e:
+        raise RuntimeError(f"Error in alternate MLA plan: {e}")
diff --git a/sglang/python/sglang/srt/layers/attention/flashmla_backend.py b/sglang/python/sglang/srt/layers/attention/flashmla_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3d017583d598c8756f723d32d93505a835a188d
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/flashmla_backend.py
@@ -0,0 +1,596 @@
+from __future__ import annotations
+
+"""
+Support attention backend for FlashMLA.
+"""
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Callable, Optional, Tuple, Union
+
+import torch
+import triton
+from sgl_kernel.flash_mla import flash_mla_with_kvcache, get_mla_metadata
+
+from sglang.srt.layers.attention.flashinfer_mla_backend import FlashInferMLAAttnBackend
+from sglang.srt.layers.attention.utils import create_flashmla_kv_indices_triton
+from sglang.srt.layers.dp_attention import get_attention_tp_size
+from sglang.srt.layers.quantization.fp8_kernel import scaled_fp8_quant
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.radix_attention import RadixAttention
+    from sglang.srt.model_executor.model_runner import ModelRunner
+    from sglang.srt.speculative.spec_info import SpecInput
+
+
+# FlashMLA only supports pagesize=64
+PAGE_SIZE = 64
+
+# FlashMLA FP8 issue: https://github.com/deepseek-ai/FlashMLA/issues/56
+
+
+@dataclass
+class FlashMLADecodeMetadata:
+    flashmla_metadata: Optional[Tuple[torch.Tensor, torch.Tensor]] = None
+    num_splits: Optional[torch.Tensor] = None
+    block_kv_indices: Optional[torch.Tensor] = None
+
+    def __init__(
+        self,
+        flashmla_metadata: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        num_splits: Optional[torch.Tensor] = None,
+        block_kv_indices: Optional[torch.Tensor] = None,
+    ):
+        self.flashmla_metadata = flashmla_metadata
+        self.num_splits = num_splits
+        self.block_kv_indices = block_kv_indices
+
+
+class FlashMLABackend(FlashInferMLAAttnBackend):
+    """Flashmla attention kernels."""
+
+    def __init__(
+        self,
+        model_runner: ModelRunner,
+        skip_prefill: bool = False,
+        kv_indptr_buf: Optional[torch.Tensor] = None,
+        kv_last_page_len_buf: Optional[torch.Tensor] = None,
+    ):
+        super().__init__(
+            model_runner, skip_prefill, kv_indptr_buf, kv_last_page_len_buf
+        )
+
+        self.num_q_heads = (
+            model_runner.model_config.num_attention_heads // get_attention_tp_size()
+        )
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+        self.num_local_heads = (
+            model_runner.model_config.num_attention_heads // get_attention_tp_size()
+        )
+        self.forward_metadata: Union[FlashMLADecodeMetadata] = None
+        self.kv_lora_rank = model_runner.model_config.kv_lora_rank
+        self.qk_nope_head_dim = model_runner.model_config.qk_nope_head_dim
+        self.qk_rope_head_dim = model_runner.model_config.qk_rope_head_dim
+        self.v_head_dim = model_runner.model_config.v_head_dim
+        self.scaling = model_runner.model_config.scaling
+        self.data_type = model_runner.kv_cache_dtype
+        self.q_data_type = model_runner.dtype
+        self.kv_cache_dim = self.kv_lora_rank + self.qk_rope_head_dim
+        # Check if KV cache is FP8 (supports both e4m3 and e5m2)
+        self.is_fp8_kvcache = self.data_type in {
+            torch.float8_e4m3fn,
+            torch.float8_e5m2,
+        }
+
+        self.num_draft_tokens = model_runner.server_args.speculative_num_draft_tokens
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+
+        bs = forward_batch.batch_size
+        if forward_batch.forward_mode.is_decode_or_idle():
+            max_seqlen_pad = triton.cdiv(
+                forward_batch.seq_lens_cpu.max().item(), PAGE_SIZE
+            )
+            block_kv_indices = torch.full(
+                (bs, max_seqlen_pad),
+                -1,
+                dtype=torch.int32,
+                device=forward_batch.seq_lens.device,
+            )
+            create_flashmla_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                None,
+                block_kv_indices,
+                self.req_to_token.stride(0),
+                max_seqlen_pad,
+            )
+            mla_metadata, num_splits = get_mla_metadata(
+                forward_batch.seq_lens.to(torch.int32),
+                self.num_q_heads,
+                1,
+                is_fp8_kvcache=self.is_fp8_kvcache,
+            )
+            self.forward_metadata = FlashMLADecodeMetadata(
+                mla_metadata,
+                num_splits,
+                block_kv_indices,
+            )
+        elif forward_batch.forward_mode.is_target_verify():
+            seq_lens_cpu = forward_batch.seq_lens_cpu + self.num_draft_tokens
+            seq_lens = forward_batch.seq_lens + self.num_draft_tokens
+
+            max_seqlen_pad = triton.cdiv(seq_lens_cpu.max().item(), PAGE_SIZE)
+            block_kv_indices = torch.full(
+                (bs, max_seqlen_pad),
+                -1,
+                dtype=torch.int32,
+                device=seq_lens.device,
+            )
+            create_flashmla_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                forward_batch.req_pool_indices,
+                seq_lens,
+                None,
+                block_kv_indices,
+                self.req_to_token.stride(0),
+                max_seqlen_pad,
+            )
+            mla_metadata, num_splits = get_mla_metadata(
+                seq_lens.to(torch.int32),
+                self.num_draft_tokens * self.num_q_heads,
+                1,
+                is_fp8_kvcache=self.is_fp8_kvcache,
+            )
+
+            # Use FlashMLADecodeMetadata which has the attributes forward_extend expects
+            self.forward_metadata = FlashMLADecodeMetadata(
+                mla_metadata,
+                num_splits,
+                block_kv_indices,
+            )
+        else:
+            super().init_forward_metadata(forward_batch)
+
+    def init_cuda_graph_state(
+        self,
+        max_bs: int,
+        max_num_tokens: int,
+        block_kv_indices: Optional[torch.Tensor] = None,
+    ):
+        if block_kv_indices is None:
+            cuda_graph_kv_indices = torch.full(
+                (max_bs, (self.max_context_len + PAGE_SIZE) // PAGE_SIZE),
+                1,
+                dtype=torch.int32,
+                device="cuda",
+            )
+        else:
+            cuda_graph_kv_indices = block_kv_indices
+
+        if self.num_draft_tokens:
+            self.cuda_graph_mla_metadata, self.cuda_graph_num_splits = get_mla_metadata(
+                torch.ones(
+                    max_bs, dtype=torch.int32, device=cuda_graph_kv_indices.device
+                ),
+                self.num_draft_tokens * self.num_q_heads,
+                1,
+                is_fp8_kvcache=self.is_fp8_kvcache,
+            )
+        else:
+            self.cuda_graph_mla_metadata, self.cuda_graph_num_splits = get_mla_metadata(
+                torch.ones(
+                    max_bs, dtype=torch.int32, device=cuda_graph_kv_indices.device
+                ),
+                self.num_q_heads,
+                1,
+                is_fp8_kvcache=self.is_fp8_kvcache,
+            )
+        self.cuda_graph_kv_indices = cuda_graph_kv_indices
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInput],
+    ):
+        if forward_mode.is_decode_or_idle():
+            max_seqlen_pad = triton.cdiv(seq_lens.max().item(), PAGE_SIZE)
+
+            create_flashmla_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                seq_lens,
+                None,
+                self.cuda_graph_kv_indices,
+                self.req_to_token.stride(0),
+                self.cuda_graph_kv_indices.stride(0),
+            )
+            num_q_heads = self.num_q_heads * (self.num_draft_tokens or 1)
+            mla_metadata, num_splits = get_mla_metadata(
+                seq_lens.to(torch.int32),
+                num_q_heads,
+                1,
+                is_fp8_kvcache=self.is_fp8_kvcache,
+            )
+            self.cuda_graph_mla_metadata.copy_(mla_metadata)
+            self.cuda_graph_num_splits[: bs + 1].copy_(num_splits)
+            self.forward_metadata = FlashMLADecodeMetadata(
+                self.cuda_graph_mla_metadata,
+                self.cuda_graph_num_splits[: bs + 1],
+                self.cuda_graph_kv_indices[:bs, :max_seqlen_pad],
+            )
+        elif forward_mode.is_target_verify():
+            seq_lens = seq_lens + self.num_draft_tokens
+            max_seqlen_pad = triton.cdiv(seq_lens.max().item(), PAGE_SIZE)
+
+            create_flashmla_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                seq_lens,
+                None,
+                self.cuda_graph_kv_indices,
+                self.req_to_token.stride(0),
+                self.cuda_graph_kv_indices.stride(0),
+            )
+            mla_metadata, num_splits = get_mla_metadata(
+                seq_lens.to(torch.int32),
+                self.num_draft_tokens * self.num_q_heads,
+                1,
+                is_fp8_kvcache=self.is_fp8_kvcache,
+            )
+            self.cuda_graph_mla_metadata.copy_(mla_metadata)
+            self.cuda_graph_num_splits[: bs + 1].copy_(num_splits)
+            self.forward_metadata = FlashMLADecodeMetadata(
+                self.cuda_graph_mla_metadata,
+                self.cuda_graph_num_splits[: bs + 1],
+                self.cuda_graph_kv_indices[:bs, :max_seqlen_pad],
+            )
+        else:
+            super().init_forward_metadata_capture_cuda_graph(
+                bs,
+                num_tokens,
+                req_pool_indices,
+                seq_lens,
+                encoder_lens,
+                forward_mode,
+                spec_info,
+            )
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInput],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+
+        if forward_mode.is_decode_or_idle():
+            assert seq_lens_cpu is not None
+            seq_lens = seq_lens[:bs]
+            seq_lens_cpu = seq_lens_cpu[:bs]
+            max_seqlen_pad = triton.cdiv(seq_lens_cpu.max().item(), PAGE_SIZE)
+            create_flashmla_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices[:bs],
+                seq_lens,
+                None,
+                self.cuda_graph_kv_indices,
+                self.req_to_token.stride(0),
+                self.cuda_graph_kv_indices.stride(0),
+            )
+            num_q_heads = self.num_q_heads * (self.num_draft_tokens or 1)
+            mla_metadata, num_splits = get_mla_metadata(
+                seq_lens.to(torch.int32),
+                num_q_heads,
+                1,
+                is_fp8_kvcache=self.is_fp8_kvcache,
+            )
+            self.cuda_graph_mla_metadata.copy_(mla_metadata)
+            self.cuda_graph_num_splits[: bs + 1].copy_(num_splits)
+            self.forward_metadata.mla_metadata = self.cuda_graph_mla_metadata
+            self.forward_metadata.num_splits = self.cuda_graph_num_splits[: bs + 1]
+            self.forward_metadata.block_kv_indices = self.cuda_graph_kv_indices[
+                :bs, :max_seqlen_pad
+            ]
+        elif forward_mode.is_target_verify():
+            seq_lens = seq_lens[:bs] + self.num_draft_tokens
+            seq_lens_cpu = seq_lens_cpu[:bs] + self.num_draft_tokens
+            max_seqlen_pad = triton.cdiv(seq_lens_cpu.max().item(), PAGE_SIZE)
+            create_flashmla_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices[:bs],
+                seq_lens,
+                None,
+                self.cuda_graph_kv_indices,
+                self.req_to_token.stride(0),
+                self.cuda_graph_kv_indices.stride(0),
+            )
+            mla_metadata, num_splits = get_mla_metadata(
+                seq_lens.to(torch.int32),
+                self.num_draft_tokens * self.num_q_heads,
+                1,
+                is_fp8_kvcache=self.is_fp8_kvcache,
+            )
+            self.cuda_graph_mla_metadata.copy_(mla_metadata)
+            self.cuda_graph_num_splits[: bs + 1].copy_(num_splits)
+            self.forward_metadata.mla_metadata = self.cuda_graph_mla_metadata
+            self.forward_metadata.num_splits = self.cuda_graph_num_splits[: bs + 1]
+            self.forward_metadata.block_kv_indices = self.cuda_graph_kv_indices[
+                :bs, :max_seqlen_pad
+            ]
+        else:
+            super().init_forward_metadata_replay_cuda_graph(
+                bs,
+                req_pool_indices,
+                seq_lens,
+                seq_lens_sum,
+                encoder_lens,
+                forward_mode,
+                spec_info,
+                seq_lens_cpu,
+            )
+
+    def get_cuda_graph_seq_len_fill_value(self):
+        return 1
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+    ):
+        cache_loc = forward_batch.out_cache_loc
+
+        if k is not None:
+            assert v is not None
+            if save_kv_cache:
+                forward_batch.token_to_kv_pool.set_kv_buffer(
+                    layer,
+                    cache_loc,
+                    k,
+                    v,
+                )
+        bs = forward_batch.batch_size
+        k_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
+
+        reshape_q = q.view(bs, -1, layer.tp_q_head_num, layer.head_dim)
+        if self.is_fp8_kvcache:
+            # For FP8 KV cache, Q needs to be converted to FP8 for FlashMLA kernel
+            # In SGLang, we use layer.k_scale for both q and k scales
+            if layer.k_scale is not None:
+                q_scale = layer.k_scale
+                descale_q = layer.k_scale.reshape(1)
+                descale_k = layer.k_scale.reshape(1)
+            else:
+                # Fallback to 1.0 if k_scale is not initialized
+                q_scale = torch.ones((1,), dtype=torch.float32, device=reshape_q.device)
+                descale_q = torch.ones(
+                    (1,), dtype=torch.float32, device=reshape_q.device
+                )
+                descale_k = torch.ones(
+                    (1,), dtype=torch.float32, device=reshape_q.device
+                )
+
+            # Reshape to 2D for scaled_fp8_quant (which requires 2D input)
+            q_shape = reshape_q.shape
+            reshape_q_2d = reshape_q.reshape(-1, q_shape[-1])
+            reshape_q_fp8_2d, _ = scaled_fp8_quant(reshape_q_2d, q_scale)
+            reshape_q_fp8 = reshape_q_fp8_2d.reshape(q_shape)
+            o, _ = flash_mla_with_kvcache(
+                q=reshape_q_fp8,
+                k_cache=k_cache.view(-1, PAGE_SIZE, 1, self.kv_cache_dim),
+                block_table=self.forward_metadata.block_kv_indices[:bs],
+                cache_seqlens=forward_batch.seq_lens.to(torch.int32),
+                head_dim_v=self.kv_lora_rank,  # TODO Retrieve from config.
+                tile_scheduler_metadata=self.forward_metadata.flashmla_metadata,
+                num_splits=self.forward_metadata.num_splits,
+                softmax_scale=layer.scaling,
+                causal=True,
+                descale_q=descale_q,
+                descale_k=descale_k,
+            )
+
+            return o.view(-1, layer.tp_q_head_num * layer.v_head_dim)
+        else:
+            # todo: need check all causal True or False?
+            o, _ = flash_mla_with_kvcache(
+                q=reshape_q,
+                k_cache=k_cache.view(-1, PAGE_SIZE, 1, self.kv_cache_dim),
+                block_table=self.forward_metadata.block_kv_indices[:bs],
+                cache_seqlens=forward_batch.seq_lens.to(torch.int32),
+                head_dim_v=self.kv_lora_rank,  # TODO Retrieve from config.
+                tile_scheduler_metadata=self.forward_metadata.flashmla_metadata,
+                num_splits=self.forward_metadata.num_splits,
+                softmax_scale=layer.scaling,
+                causal=True,
+            )
+
+            return o.view(-1, layer.tp_q_head_num * layer.v_head_dim)
+
+    def forward_extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+    ):
+        if (
+            forward_batch.forward_mode == ForwardMode.EXTEND
+            or forward_batch.forward_mode == ForwardMode.DRAFT_EXTEND
+        ):
+            return super().forward_extend(q, k, v, layer, forward_batch, save_kv_cache)
+        else:
+            cache_loc = forward_batch.out_cache_loc
+
+            if k is not None:
+                assert v is not None
+                if save_kv_cache:
+                    forward_batch.token_to_kv_pool.set_kv_buffer(layer, cache_loc, k, v)
+
+            bs = forward_batch.batch_size
+            k_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
+
+            reshape_q = q.view(bs, -1, layer.tp_q_head_num, layer.head_dim)
+            if self.is_fp8_kvcache:
+                # For FP8 KV cache, Q needs to be converted to FP8 for FlashMLA kernel
+                # In SGLang, we use layer.k_scale for both q and k scales
+                if layer.k_scale is not None:
+                    q_scale = layer.k_scale
+                    descale_q = layer.k_scale.reshape(1)
+                    descale_k = layer.k_scale.reshape(1)
+                else:
+                    # Fallback to 1.0 if k_scale is not initialized
+                    q_scale = torch.ones(
+                        (1,), dtype=torch.float32, device=reshape_q.device
+                    )
+                    descale_q = torch.ones(
+                        (1,), dtype=torch.float32, device=reshape_q.device
+                    )
+                    descale_k = torch.ones(
+                        (1,), dtype=torch.float32, device=reshape_q.device
+                    )
+
+                # Quantize Q using scaled_fp8_quant (matching vLLM's approach)
+                # Reshape to 2D for scaled_fp8_quant (which requires 2D input)
+                q_shape = reshape_q.shape
+                reshape_q_2d = reshape_q.reshape(-1, q_shape[-1])
+                reshape_q_fp8_2d, _ = scaled_fp8_quant(reshape_q_2d, q_scale)
+                reshape_q_fp8 = reshape_q_fp8_2d.reshape(q_shape)
+                o, _ = flash_mla_with_kvcache(
+                    q=reshape_q_fp8,
+                    k_cache=k_cache.view(-1, PAGE_SIZE, 1, self.kv_cache_dim),
+                    block_table=self.forward_metadata.block_kv_indices[:bs],
+                    cache_seqlens=forward_batch.seq_lens.to(torch.int32)
+                    + self.num_draft_tokens,
+                    head_dim_v=self.kv_lora_rank,
+                    tile_scheduler_metadata=self.forward_metadata.flashmla_metadata,
+                    num_splits=self.forward_metadata.num_splits,
+                    softmax_scale=layer.scaling,
+                    causal=True,
+                    descale_q=descale_q,
+                    descale_k=descale_k,
+                )
+            else:
+                o, _ = flash_mla_with_kvcache(
+                    q=reshape_q,
+                    k_cache=k_cache.view(-1, PAGE_SIZE, 1, self.kv_cache_dim),
+                    block_table=self.forward_metadata.block_kv_indices[:bs],
+                    cache_seqlens=forward_batch.seq_lens.to(torch.int32)
+                    + self.num_draft_tokens,
+                    head_dim_v=self.kv_lora_rank,
+                    tile_scheduler_metadata=self.forward_metadata.flashmla_metadata,
+                    num_splits=self.forward_metadata.num_splits,
+                    softmax_scale=layer.scaling,
+                    causal=True,
+                )
+            return o.view(-1, layer.tp_q_head_num * layer.v_head_dim)
+
+
+# TODO: multi step kv indices optimization
+class FlashMLAMultiStepDraftBackend:
+    """
+    Wrap multiple flashmla attention backends as one for multiple consecutive
+    draft decoding steps.
+    """
+
+    def __init__(
+        self,
+        model_runner: ModelRunner,
+        topk: int,
+        speculative_num_steps: int,
+    ):
+        if topk > 1:
+            raise ValueError(
+                "Currently FlashMLA only supports topk=1 for speculative decoding"
+            )
+        self.topk = topk
+        self.speculative_num_steps = speculative_num_steps
+        max_bs = model_runner.req_to_token_pool.size * self.topk
+        self.kv_indptr = torch.zeros(
+            (
+                self.speculative_num_steps,
+                max_bs + 1,
+            ),
+            dtype=torch.int32,
+            device=model_runner.device,
+        )
+
+        self.attn_backends = []
+        for i in range(self.speculative_num_steps - 1):
+            self.attn_backends.append(
+                FlashMLABackend(
+                    model_runner,
+                    skip_prefill=True,
+                    kv_indptr_buf=self.kv_indptr[i],
+                    kv_last_page_len_buf=None,
+                )
+            )
+
+    def common_template(
+        self,
+        forward_batch: ForwardBatch,
+        call_fn: Callable,
+    ):
+        assert forward_batch.spec_info is not None
+
+        for i in range(self.speculative_num_steps - 1):
+            call_fn(i, forward_batch)
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        def call_fn(i, forward_batch):
+            assert forward_batch.spec_info is not None
+            self.attn_backends[i].init_forward_metadata(forward_batch)
+
+        self.common_template(forward_batch, call_fn)
+
+    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
+        for i in range(self.speculative_num_steps - 1):
+            self.attn_backends[i].init_cuda_graph_state(
+                max_bs, max_num_tokens, block_kv_indices=None
+            )
+
+    def init_forward_metadata_capture_cuda_graph(self, forward_batch: ForwardBatch):
+        def call_fn(i, forward_batch):
+            self.attn_backends[i].init_forward_metadata_capture_cuda_graph(
+                forward_batch.batch_size,
+                forward_batch.batch_size * self.topk,
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                encoder_lens=None,
+                forward_mode=ForwardMode.DECODE,
+                spec_info=forward_batch.spec_info,
+            )
+
+        self.common_template(forward_batch, call_fn)
+
+    def init_forward_metadata_replay_cuda_graph(
+        self, forward_batch: ForwardBatch, bs: int
+    ):
+        def call_fn(i, forward_batch):
+            self.attn_backends[i].init_forward_metadata_replay_cuda_graph(
+                bs,
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                seq_lens_sum=-1,
+                encoder_lens=None,
+                forward_mode=ForwardMode.DECODE,
+                spec_info=forward_batch.spec_info,
+                seq_lens_cpu=forward_batch.seq_lens_cpu,
+            )
+
+        self.common_template(forward_batch, call_fn)
diff --git a/sglang/python/sglang/srt/layers/attention/hybrid_attn_backend.py b/sglang/python/sglang/srt/layers/attention/hybrid_attn_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..57e10daa60576997d5f7b91ffb4bebd46638c7ce
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/hybrid_attn_backend.py
@@ -0,0 +1,197 @@
+from typing import Optional
+
+import torch
+
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+from sglang.srt.layers.attention.nsa.nsa_indexer import BaseIndexerMetadata
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+from sglang.srt.model_executor.model_runner import ModelRunner
+from sglang.srt.speculative.spec_info import SpecInput
+
+
+class HybridAttnBackend(AttentionBackend):
+    """Support different backends for prefill and decode."""
+
+    def __init__(
+        self,
+        model_runner: ModelRunner,
+        prefill_backend: AttentionBackend,
+        decode_backend: AttentionBackend,
+    ):
+        self.model_runner = model_runner
+        self.prefill_backend = prefill_backend
+        self.decode_backend = decode_backend
+        self.data_type = model_runner.kv_cache_dtype
+
+    def _select_backend(self, forward_mode: ForwardMode) -> AttentionBackend:
+        """
+        Select the appropriate attention backend based on the forward mode.
+
+        Args:
+            forward_mode: The current forward mode indicating the operation type
+
+        Returns:
+            The selected attention backend (prefill or decode)
+
+        Note:
+            - decode_or_idle: Always uses decode backend
+            - target_verify or draft_extend: Uses decode backend if speculative_attention_mode is "decode", otherwise prefill backend
+            - prefill: Always uses prefill backend
+        """
+        if forward_mode.is_decode_or_idle():
+            return self.decode_backend
+        elif forward_mode.is_target_verify() or forward_mode.is_draft_extend():
+            return (
+                self.decode_backend
+                if self.model_runner.server_args.speculative_attention_mode == "decode"
+                else self.prefill_backend
+            )
+        else:
+            return self.prefill_backend
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        backend = self._select_backend(forward_batch.forward_mode)
+        backend.init_forward_metadata(forward_batch)
+
+    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
+        self.decode_backend.init_cuda_graph_state(max_bs, max_num_tokens)
+        if (
+            self.model_runner.server_args.speculative_algorithm is not None
+            and self.model_runner.server_args.speculative_attention_mode == "prefill"
+        ):
+            # When speculative decoding is enabled, we need to initialize the backend
+            # that will be used for target_verify.
+            self.prefill_backend.init_cuda_graph_state(max_bs, max_num_tokens)
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInput],
+    ):
+        backend = self._select_backend(forward_mode)
+        backend.init_forward_metadata_capture_cuda_graph(
+            bs,
+            num_tokens,
+            req_pool_indices,
+            seq_lens,
+            encoder_lens,
+            forward_mode,
+            spec_info,
+        )
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInput],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+        backend = self._select_backend(forward_mode)
+        backend.init_forward_metadata_replay_cuda_graph(
+            bs,
+            req_pool_indices,
+            seq_lens,
+            seq_lens_sum,
+            encoder_lens,
+            forward_mode,
+            spec_info,
+            seq_lens_cpu,
+        )
+
+    def get_cuda_graph_seq_len_fill_value(self):
+        return self.decode_backend.get_cuda_graph_seq_len_fill_value()
+
+    def forward(
+        self,
+        q: Optional[torch.Tensor] = None,  # For full attention
+        k: Optional[torch.Tensor] = None,  # For full attention
+        v: Optional[torch.Tensor] = None,  # For full attention
+        layer: Optional[RadixAttention] = None,
+        forward_batch: Optional[ForwardBatch] = None,
+        save_kv_cache: bool = True,
+        *,
+        mixed_qkv: Optional[torch.Tensor] = None,  # For linear attention
+        a: Optional[torch.Tensor] = None,  # For linear attention
+        b: Optional[torch.Tensor] = None,  # For linear attention
+        **kwargs,
+    ):
+        """Forward method that supports both regular attention (q, k, v) and linear attention (mixed_qkv, a, b)."""
+        backend = self._select_backend(forward_batch.forward_mode)
+        if mixed_qkv is not None:
+            return backend.forward(
+                layer=layer,
+                forward_batch=forward_batch,
+                save_kv_cache=save_kv_cache,
+                mixed_qkv=mixed_qkv,
+                a=a,
+                b=b,
+                **kwargs,
+            )
+        return backend.forward(q, k, v, layer, forward_batch, save_kv_cache, **kwargs)
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+        **kwargs,
+    ):
+        return self.decode_backend.forward_decode(
+            q, k, v, layer, forward_batch, save_kv_cache, **kwargs
+        )
+
+    def forward_extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+        **kwargs,
+    ):
+        backend = self._select_backend(forward_batch.forward_mode)
+        return backend.forward_extend(
+            q, k, v, layer, forward_batch, save_kv_cache, **kwargs
+        )
+
+    def get_indexer_metadata(
+        self, layer_id: int, forward_batch: ForwardBatch
+    ) -> Optional[BaseIndexerMetadata]:
+        backend = self._select_backend(forward_batch.forward_mode)
+        return backend.get_indexer_metadata(layer_id, forward_batch)
+
+    def forward(
+        self,
+        q: torch.Tensor = None,
+        k: torch.Tensor = None,
+        v: torch.Tensor = None,
+        layer: RadixAttention = None,
+        forward_batch: ForwardBatch = None,
+        save_kv_cache: bool = True,
+        **kwargs,
+    ):
+        """Delegate forward to the appropriate backend based on forward mode."""
+        backend = self._select_backend(forward_batch.forward_mode)
+        return backend.forward(
+            q=q,
+            k=k,
+            v=v,
+            layer=layer,
+            forward_batch=forward_batch,
+            save_kv_cache=save_kv_cache,
+            **kwargs,
+        )
diff --git a/sglang/python/sglang/srt/layers/attention/hybrid_linear_attn_backend.py b/sglang/python/sglang/srt/layers/attention/hybrid_linear_attn_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..91194c4943967e78754ee27f741752b9e73601e9
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/hybrid_linear_attn_backend.py
@@ -0,0 +1,994 @@
+import logging
+from typing import Optional, Union
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+from sglang.srt.layers.attention.mamba.causal_conv1d_triton import PAD_SLOT_ID
+from sglang.srt.layers.attention.mamba.mamba import MambaMixer2
+from sglang.srt.layers.attention.mamba.mamba2_metadata import (
+    ForwardMetadata,
+    Mamba2Metadata,
+)
+from sglang.srt.layers.attention.mamba.mamba_state_scatter_triton import (
+    fused_mamba_state_scatter_with_mask,
+)
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.mem_cache.memory_pool import HybridReqToTokenPool
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+from sglang.srt.model_executor.model_runner import ModelRunner
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.speculative.eagle_info import EagleDraftInput, EagleVerifyInput
+from sglang.srt.speculative.spec_info import SpecInput
+from sglang.srt.utils import is_cpu
+
+if not is_cpu():
+    from sglang.srt.layers.attention.fla.chunk_delta_h import (
+        CHUNK_SIZE as FLA_CHUNK_SIZE,
+    )
+
+logger = logging.getLogger(__name__)
+
+
+# Kernel to track mamba states if needed based on track mask
+@triton.jit
+def track_mamba_state_if_needed_kernel(
+    conv_states_ptr,
+    ssm_states_ptr,
+    cache_indices_ptr,
+    mamba_track_mask_ptr,
+    mamba_track_indices_ptr,
+    conv_state_stride_0,  # stride for first dimension (batch/pool index)
+    ssm_state_stride_0,  # stride for first dimension (batch/pool index)
+    conv_state_numel_per_row: tl.constexpr,  # total elements per row
+    ssm_state_numel_per_row: tl.constexpr,  # total elements per row
+    BLOCK_SIZE: tl.constexpr,
+):
+    """
+    Track conv_states and ssm_states rows based on track mask.
+
+    This kernel replaces a Python loop that copies state tensors for mamba attention.
+    For each batch element, if the track mask is True, it copies the entire row from
+    the source index (cache_indices[i]) to the destination index (mamba_track_indices[i]).
+
+    Grid: (batch_size,)
+    Each block handles one batch element, using multiple threads to copy data in parallel.
+    """
+    batch_idx = tl.program_id(0)
+
+    # Load the copy mask for this batch element
+    track_mask = tl.load(mamba_track_mask_ptr + batch_idx)
+
+    # Early exit if we don't need to track
+    if not track_mask:
+        return
+
+    # Load source and destination indices
+    src_idx = tl.load(cache_indices_ptr + batch_idx)
+    dst_idx = tl.load(mamba_track_indices_ptr + batch_idx)
+
+    # Copy conv_states
+    # Each thread handles BLOCK_SIZE elements
+    for offset in range(0, conv_state_numel_per_row, BLOCK_SIZE):
+        element_indices = offset + tl.arange(0, BLOCK_SIZE)
+        mask = element_indices < conv_state_numel_per_row
+
+        src_ptr = conv_states_ptr + src_idx * conv_state_stride_0 + element_indices
+        dst_ptr = conv_states_ptr + dst_idx * conv_state_stride_0 + element_indices
+
+        data = tl.load(src_ptr, mask=mask, other=0.0)
+        tl.store(dst_ptr, data, mask=mask)
+
+    # Copy ssm_states
+    for offset in range(0, ssm_state_numel_per_row, BLOCK_SIZE):
+        element_indices = offset + tl.arange(0, BLOCK_SIZE)
+        mask = element_indices < ssm_state_numel_per_row
+
+        src_ptr = ssm_states_ptr + src_idx * ssm_state_stride_0 + element_indices
+        dst_ptr = ssm_states_ptr + dst_idx * ssm_state_stride_0 + element_indices
+
+        data = tl.load(src_ptr, mask=mask, other=0.0)
+        tl.store(dst_ptr, data, mask=mask)
+
+
+def track_mamba_states_if_needed(
+    conv_states: torch.Tensor,
+    ssm_states: torch.Tensor,
+    cache_indices: torch.Tensor,
+    mamba_track_mask: torch.Tensor,
+    mamba_track_indices: torch.Tensor,
+    batch_size: int,
+):
+    """
+    Track mamba states using Triton kernel for better performance.
+
+    Args:
+        conv_states: Convolution states tensor [pool_size, ...]
+        ssm_states: SSM states tensor [pool_size, ...]
+        cache_indices: Source indices for each batch element [batch_size]
+        mamba_track_mask: Boolean mask indicating which elements to track [batch_size]
+        mamba_track_indices: Indices to track for each batch element [batch_size]
+        batch_size: Number of batch elements
+    """
+    conv_state_numel_per_row = conv_states[0].numel()
+    ssm_state_numel_per_row = ssm_states[0].numel()
+
+    # Choose BLOCK_SIZE based on the size of the data
+    BLOCK_SIZE = 1024
+
+    # Launch kernel with batch_size blocks
+    grid = (batch_size,)
+    track_mamba_state_if_needed_kernel[grid](
+        conv_states,
+        ssm_states,
+        cache_indices,
+        mamba_track_mask,
+        mamba_track_indices,
+        conv_states.stride(0),
+        ssm_states.stride(0),
+        conv_state_numel_per_row,
+        ssm_state_numel_per_row,
+        BLOCK_SIZE,
+    )
+
+
+class MambaAttnBackendBase(AttentionBackend):
+    def __init__(self, model_runner: ModelRunner):
+        super().__init__()
+        self.pad_slot_id = PAD_SLOT_ID
+        self.device = model_runner.device
+        self.req_to_token_pool: HybridReqToTokenPool = model_runner.req_to_token_pool
+        self.forward_metadata: ForwardMetadata = None
+        self.state_indices_list = []
+        self.query_start_loc_list = []
+        self.retrieve_next_token_list = []
+        self.retrieve_next_sibling_list = []
+        self.retrieve_parent_token_list = []
+        self.cached_cuda_graph_decode_query_start_loc: torch.Tensor = None
+        self.cached_cuda_graph_verify_query_start_loc: torch.Tensor = None
+        self.conv_states_shape: tuple[int, int] = None
+
+    def _forward_metadata(self, forward_batch: ForwardBatch):
+        bs = forward_batch.batch_size
+
+        retrieve_next_token = None
+        retrieve_next_sibling = None
+        retrieve_parent_token = None
+        track_conv_indices = None
+        track_ssm_h_src = None
+        track_ssm_h_dst = None
+        track_ssm_final_src = None
+        track_ssm_final_dst = None
+
+        mamba_cache_indices = self.req_to_token_pool.get_mamba_indices(
+            forward_batch.req_pool_indices
+        )
+
+        if forward_batch.forward_mode.is_decode_or_idle():
+            query_start_loc = torch.arange(
+                0, bs + 1, dtype=torch.int32, device=self.device
+            )
+        elif forward_batch.forward_mode.is_extend(include_draft_extend_v2=True):
+            if forward_batch.forward_mode.is_draft_extend_v2():
+                # HybridLinearAttnBackend.init_forward_metadata calls all sub-backends
+                # unconditionally, but DRAFT_EXTEND_V2 only runs full-attn layers in
+                # the draft model, so mamba metadata can be skipped.
+                query_start_loc = None
+            elif forward_batch.forward_mode.is_target_verify():
+                query_start_loc = torch.arange(
+                    0,
+                    forward_batch.input_ids.shape[0] + 1,
+                    step=forward_batch.spec_info.draft_token_num,
+                    dtype=torch.int32,
+                    device=forward_batch.input_ids.device,
+                )
+
+                if forward_batch.spec_info.topk > 1:
+                    retrieve_next_token = forward_batch.spec_info.retrive_next_token
+                    retrieve_next_sibling = forward_batch.spec_info.retrive_next_sibling
+                    # retrieve_next_token is None during dummy run so skip tensor creation
+                    if retrieve_next_token is not None:
+                        retrieve_parent_token = torch.empty_like(retrieve_next_token)
+            else:
+                query_start_loc = torch.empty(
+                    (bs + 1,), dtype=torch.int32, device=self.device
+                )
+                query_start_loc[:bs] = forward_batch.extend_start_loc
+                query_start_loc[bs] = (
+                    forward_batch.extend_start_loc[-1]
+                    + forward_batch.extend_seq_lens[-1]
+                )
+                if (
+                    forward_batch.mamba_track_mask is not None
+                    and forward_batch.mamba_track_mask.any()
+                ):
+                    track_conv_indices = self._init_track_conv_indices(
+                        query_start_loc, forward_batch
+                    )
+
+                    (
+                        track_ssm_h_src,
+                        track_ssm_h_dst,
+                        track_ssm_final_src,
+                        track_ssm_final_dst,
+                    ) = self._init_track_ssm_indices(mamba_cache_indices, forward_batch)
+        else:
+            raise ValueError(f"Invalid forward mode: {forward_batch.forward_mode=}")
+
+        return ForwardMetadata(
+            query_start_loc=query_start_loc,
+            mamba_cache_indices=mamba_cache_indices,
+            retrieve_next_token=retrieve_next_token,
+            retrieve_next_sibling=retrieve_next_sibling,
+            retrieve_parent_token=retrieve_parent_token,
+            track_conv_indices=track_conv_indices,
+            track_ssm_h_src=track_ssm_h_src,
+            track_ssm_h_dst=track_ssm_h_dst,
+            track_ssm_final_src=track_ssm_final_src,
+            track_ssm_final_dst=track_ssm_final_dst,
+        )
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        self.forward_metadata = self._forward_metadata(forward_batch)
+
+    def _init_track_conv_indices(
+        self, query_start_loc: torch.Tensor, forward_batch: ForwardBatch
+    ):
+        """
+        Compute indices for extracting conv states from the input sequence during extend.
+
+        In Mamba models, the conv layer maintains a sliding window of recent inputs.
+        After processing a prefill chunk, we need to save the last `conv_state_len` tokens
+        of the processed region for prefix caching.
+
+        The key insight is that FLA (Flash Linear Attention) processes sequences in chunks
+        of FLA_CHUNK_SIZE. We only track the conv state up to the last complete chunk boundary
+        (aligned_len).
+
+        start_indices is the starting token index of the conv state to track in this extend batch.
+        indices include all pos to track in this extend batch, conv_state_len for each req that
+        needs to be tracked (i.e. mamba_track_mask is True)
+
+        Returns:
+            indices: Tensor of shape [num_tracked_requests, conv_state_len] containing
+                     flattened positions into the packed input tensor.
+        """
+        conv_state_len = self.conv_states_shape[-1]
+
+        # Calculate the end position of the last aligned chunk
+        lens_to_track = (
+            forward_batch.mamba_track_seqlens - forward_batch.extend_prefix_lens
+        )
+        mamba_cache_chunk_size = get_global_server_args().mamba_cache_chunk_size
+        aligned_len = (lens_to_track // mamba_cache_chunk_size) * mamba_cache_chunk_size
+        start_indices = query_start_loc[:-1] + aligned_len - conv_state_len
+        start_indices = start_indices[forward_batch.mamba_track_mask]
+
+        # Create indices: [batch_size, conv_state_len]
+        indices = start_indices.unsqueeze(-1) + torch.arange(
+            conv_state_len,
+            device=self.device,
+            dtype=start_indices.dtype,
+        )
+
+        return indices.clamp(0, query_start_loc[-1] - 1)
+
+    def _init_track_ssm_indices(
+        self, mamba_cache_indices: torch.Tensor, forward_batch: ForwardBatch
+    ):
+        """
+        Compute source and destination indices for tracking SSM states for prefix caching.
+
+        After processing a prefill, we need to save the SSM recurrent state for prefix caching.
+        The FLA kernel outputs intermediate hidden states `h` at each chunk boundary,
+        plus a `last_recurrent_state` at the end of the chunked prefill size.
+
+        The challenge is that sequences may or may not end on a chunk boundary:
+          - Aligned case (len % FLA_CHUNK_SIZE == 0): In this case, FLA will store the to-cache
+            state in the last_recurrent_state.
+          - Unaligned case (len % FLA_CHUNK_SIZE != 0): The last_recurrent_state includes the
+            unaligned position, but we only want state up to the last chunk boundary.
+            We must extract from the intermediate `h` tensor at the appropriate chunk index.
+
+        We compute the src and dst indices for all requests that need to be cached
+        (i.e. mamba_track_mask is True) based on the rule above.
+
+        For example:
+        1. If chunked prefill length is < 64, then only final state has value. In this case we
+           cache `final` state.
+        2. if chunked prefill length == 64, then only final state has value. In this case we
+           cache pos 64, from `final` state
+        3. if chunked prefill length >64 and < 128, then both h and final state have value.
+           We cache pos 64 from `h` state
+        4. if chunked prefill length ==128, then both h and final state have value. We cache
+           pos 128 from `final` state. Note `h` doesn't include the pos 128.
+
+        Returns:
+            track_ssm_h_src: Source indices into the packed `h` tensor (for unaligned seqs)
+            track_ssm_h_dst: Destination cache slot indices (for unaligned seqs)
+            track_ssm_final_src: Source indices into last_recurrent_state buffer (for aligned seqs)
+            track_ssm_final_dst: Destination cache slot indices (for aligned seqs)
+        """
+        # Move to CPU to avoid kernel launches for masking operations
+        mamba_track_mask = forward_batch.mamba_track_mask.cpu()
+        extend_seq_lens = forward_batch.extend_seq_lens.cpu()
+        mamba_track_indices = forward_batch.mamba_track_indices.cpu()
+        mamba_cache_indices = mamba_cache_indices.cpu()
+        mamba_track_seqlens = forward_batch.mamba_track_seqlens.cpu()
+        prefix_lens = forward_batch.extend_prefix_lens.cpu()
+
+        # Calculate the number of hidden states per request
+        num_h_states = (extend_seq_lens - 1) // FLA_CHUNK_SIZE + 1
+
+        # Calculate the starting offset for each sequence in the packed batch
+        track_ssm_src_offset = torch.zeros_like(num_h_states)
+        track_ssm_src_offset[1:] = torch.cumsum(num_h_states[:-1], dim=0)
+
+        # Filter variables by track mask
+        lens_to_track = mamba_track_seqlens - prefix_lens
+        lens_masked = lens_to_track[mamba_track_mask]
+        offset_masked = track_ssm_src_offset[mamba_track_mask]
+        dst_masked = mamba_track_indices[mamba_track_mask]
+
+        # Determine if the sequence ends at a chunk boundary
+        is_aligned = (lens_masked % FLA_CHUNK_SIZE) == 0
+
+        # Case 1: Aligned. Use last_recurrent_state from ssm_states.
+        track_ssm_final_src = mamba_cache_indices[mamba_track_mask][is_aligned]
+        track_ssm_final_dst = dst_masked[is_aligned]
+
+        # Case 2: Unaligned. Use intermediate state from h.
+        # TODO: if support FLA_CHUNK_SIZE % page size != 0, then need to modify this
+        not_aligned = ~is_aligned
+        track_ssm_h_src = offset_masked[not_aligned] + (
+            lens_masked[not_aligned] // FLA_CHUNK_SIZE
+        )
+        track_ssm_h_dst = dst_masked[not_aligned]
+
+        # Move back to GPU
+        return (
+            track_ssm_h_src.to(self.device, non_blocking=True),
+            track_ssm_h_dst.to(self.device, non_blocking=True),
+            track_ssm_final_src.to(self.device, non_blocking=True),
+            track_ssm_final_dst.to(self.device, non_blocking=True),
+        )
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+    ):
+        self.forward_metadata = self._capture_metadata(
+            bs, req_pool_indices, forward_mode, spec_info
+        )
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+        self.forward_metadata = self._replay_metadata(
+            bs, req_pool_indices, forward_mode, spec_info, seq_lens_cpu
+        )
+
+    def init_forward_metadata_capture_cpu_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+    ):
+        self.forward_metadata = self._capture_metadata(
+            bs, req_pool_indices, forward_mode, spec_info
+        )
+
+    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
+        assert (
+            max_num_tokens % max_bs == 0
+        ), f"max_num_tokens={max_num_tokens} must be divisible by max_bs={max_bs}"
+        draft_token_num = max_num_tokens // max_bs
+        for i in range(max_bs):
+            self.state_indices_list.append(
+                torch.full(
+                    (i + 1,), self.pad_slot_id, dtype=torch.int32, device=self.device
+                )
+            )
+            self.query_start_loc_list.append(
+                torch.zeros((i + 2,), dtype=torch.int32, device=self.device)
+            )
+            self.retrieve_next_token_list.append(
+                torch.zeros(
+                    (i + 1, draft_token_num), dtype=torch.int32, device=self.device
+                )
+            )
+            self.retrieve_next_sibling_list.append(
+                torch.zeros(
+                    (i + 1, draft_token_num), dtype=torch.int32, device=self.device
+                )
+            )
+            self.retrieve_parent_token_list.append(
+                torch.zeros(
+                    (i + 1, draft_token_num), dtype=torch.int32, device=self.device
+                )
+            )
+        self.cached_cuda_graph_decode_query_start_loc = torch.arange(
+            0, max_bs + 1, dtype=torch.int32, device=self.device
+        )
+        self.cached_cuda_graph_verify_query_start_loc = torch.arange(
+            0,
+            max_bs * draft_token_num + 1,
+            step=draft_token_num,
+            dtype=torch.int32,
+            device=self.device,
+        )
+
+    def init_cpu_graph_state(self, max_bs: int, max_num_tokens: int):
+        assert (
+            max_num_tokens % max_bs == 0
+        ), f"max_num_tokens={max_num_tokens} must be divisible by max_bs={max_bs}"
+        for i in range(max_bs):
+            self.state_indices_list.append(
+                torch.full(
+                    (i + 1,), self.pad_slot_id, dtype=torch.int32, device=self.device
+                )
+            )
+            self.query_start_loc_list.append(
+                torch.empty((i + 2,), dtype=torch.int32, device=self.device)
+            )
+        self.cached_cuda_graph_decode_query_start_loc = torch.arange(
+            0, max_bs + 1, dtype=torch.int32, device=self.device
+        )
+
+    def _capture_metadata(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        forward_mode: ForwardMode,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+    ):
+        if forward_mode.is_decode_or_idle():
+            self.query_start_loc_list[bs - 1].copy_(
+                self.cached_cuda_graph_decode_query_start_loc[: bs + 1]
+            )
+        elif forward_mode.is_target_verify():
+            self.query_start_loc_list[bs - 1].copy_(
+                self.cached_cuda_graph_verify_query_start_loc[: bs + 1]
+            )
+        else:
+            raise ValueError(f"Invalid forward mode: {forward_mode=}")
+        mamba_indices = self.req_to_token_pool.get_mamba_indices(req_pool_indices)
+        self.state_indices_list[bs - 1][: len(mamba_indices)].copy_(mamba_indices)
+
+        # If topk > 1, we need to use retrieve_next_token and retrieve_next_sibling to handle the eagle tree custom attention mask
+        if forward_mode.is_target_verify() and spec_info.topk > 1:
+            # They are None during cuda graph capture so skip the copy_...
+            # self.retrieve_next_token_list[bs - 1].copy_(spec_info.retrive_next_token)
+            # self.retrieve_next_sibling_list[bs - 1].copy_(spec_info.retrive_next_sibling)
+            return ForwardMetadata(
+                query_start_loc=self.query_start_loc_list[bs - 1],
+                mamba_cache_indices=self.state_indices_list[bs - 1],
+                retrieve_next_token=self.retrieve_next_token_list[bs - 1],
+                retrieve_next_sibling=self.retrieve_next_sibling_list[bs - 1],
+                retrieve_parent_token=self.retrieve_parent_token_list[bs - 1],
+            )
+        else:
+            return ForwardMetadata(
+                query_start_loc=self.query_start_loc_list[bs - 1],
+                mamba_cache_indices=self.state_indices_list[bs - 1],
+            )
+
+    def _replay_metadata(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInput],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+        num_padding = torch.count_nonzero(
+            seq_lens_cpu == self.get_cuda_graph_seq_len_fill_value()
+        )
+        # Make sure forward metadata is correctly handled for padding reqs
+        req_pool_indices[bs - num_padding :] = 0
+        mamba_indices = self.req_to_token_pool.get_mamba_indices(req_pool_indices)
+        mamba_indices[bs - num_padding :] = -1
+        self.state_indices_list[bs - 1][: len(mamba_indices)].copy_(mamba_indices)
+        if forward_mode.is_decode_or_idle():
+            if num_padding == 0:
+                self.query_start_loc_list[bs - 1].copy_(
+                    self.cached_cuda_graph_decode_query_start_loc[: bs + 1]
+                )
+            else:
+                self.query_start_loc_list[bs - 1][: bs - num_padding].copy_(
+                    self.cached_cuda_graph_decode_query_start_loc[: bs - num_padding]
+                )
+                self.query_start_loc_list[bs - 1][bs - num_padding :].fill_(
+                    bs - num_padding
+                )
+        elif forward_mode.is_target_verify():
+            if num_padding == 0:
+                self.query_start_loc_list[bs - 1].copy_(
+                    self.cached_cuda_graph_verify_query_start_loc[: bs + 1]
+                )
+            else:
+                self.query_start_loc_list[bs - 1][: bs - num_padding].copy_(
+                    self.cached_cuda_graph_verify_query_start_loc[: bs - num_padding]
+                )
+                self.query_start_loc_list[bs - 1][bs - num_padding :].fill_(
+                    (bs - num_padding) * spec_info.draft_token_num
+                )
+        else:
+            raise ValueError(f"Invalid forward mode: {forward_mode=}")
+
+        # If topk > 1, we need to use retrieve_next_token and retrieve_next_sibling to handle the eagle tree custom attention mask
+        if forward_mode.is_target_verify() and spec_info.topk > 1:
+            bs_without_pad = spec_info.retrive_next_token.shape[0]
+            self.retrieve_next_token_list[bs - 1][:bs_without_pad].copy_(
+                spec_info.retrive_next_token
+            )
+            self.retrieve_next_sibling_list[bs - 1][:bs_without_pad].copy_(
+                spec_info.retrive_next_sibling
+            )
+            return ForwardMetadata(
+                query_start_loc=self.query_start_loc_list[bs - 1],
+                mamba_cache_indices=self.state_indices_list[bs - 1],
+                retrieve_next_token=self.retrieve_next_token_list[bs - 1],
+                retrieve_next_sibling=self.retrieve_next_sibling_list[bs - 1],
+                retrieve_parent_token=self.retrieve_parent_token_list[bs - 1],
+            )
+        else:
+            return ForwardMetadata(
+                query_start_loc=self.query_start_loc_list[bs - 1],
+                mamba_cache_indices=self.state_indices_list[bs - 1],
+            )
+
+    def get_cuda_graph_seq_len_fill_value(self):
+        return 1  # Mamba attn does not use seq lens to index kv cache
+
+    def get_cpu_graph_seq_len_fill_value(self):
+        return 1
+
+    def _track_mamba_state_decode(
+        self,
+        forward_batch: ForwardBatch,
+        conv_states: torch.Tensor,
+        ssm_states: torch.Tensor,
+        cache_indices: torch.Tensor,
+    ):
+        """
+        Track and copy Mamba conv/SSM states during decode for prefix caching.
+
+        During decode, each token update modifies conv_states and ssm_states in-place
+        at positions indexed by cache_indices (the working slots). For prefix caching,
+        we need to copy these updated states to persistent cache slots (mamba_track_indices)
+        so they can be prefix cached.
+
+        This delegates to `track_mamba_states_if_needed`, which performs:
+            conv_states[mamba_track_indices[i]] = conv_states[cache_indices[i]]
+            ssm_states[mamba_track_indices[i]] = ssm_states[cache_indices[i]]
+        for all requests where mamba_track_mask[i] is True.
+        """
+        if forward_batch.mamba_track_mask is not None:
+            track_mamba_states_if_needed(
+                conv_states,
+                ssm_states,
+                cache_indices,
+                forward_batch.mamba_track_mask,
+                forward_batch.mamba_track_indices,
+                forward_batch.batch_size,
+            )
+
+    def _track_mamba_state_extend(
+        self,
+        forward_batch: ForwardBatch,
+        h: torch.Tensor,
+        ssm_states: torch.Tensor,
+        forward_metadata: ForwardMetadata,
+    ):
+        """
+        Track and copy SSM states during extend for prefix caching.
+
+        After the FLA chunked prefill kernel runs, we need to save the SSM recurrent
+        state at the last chunk boundary so it can be reused for prefix caching.
+        The source of the state depends on whether the sequence length is aligned
+        to FLA_CHUNK_SIZE. See `_init_track_ssm_indices` for more details on how
+        the source and destination indices are computed.
+
+        Note: Conv state tracking for extend is handled separately via gather operations
+        using indices computed by `_init_track_conv_indices`.
+        """
+        if (
+            forward_batch.mamba_track_mask is not None
+            and forward_batch.mamba_track_mask.any()
+        ):
+            h = h.squeeze(0)
+
+            if forward_metadata.track_ssm_h_src.numel() > 0:
+                ssm_states[forward_metadata.track_ssm_h_dst] = h[
+                    forward_metadata.track_ssm_h_src
+                ].to(ssm_states.dtype, copy=False)
+            if forward_metadata.track_ssm_final_src.numel() > 0:
+                ssm_states[forward_metadata.track_ssm_final_dst] = ssm_states[
+                    forward_metadata.track_ssm_final_src
+                ]
+
+
+class Mamba2AttnBackend(MambaAttnBackendBase):
+    """Attention backend wrapper for Mamba2Mixer kernels."""
+
+    def __init__(self, model_runner: ModelRunner):
+        super().__init__(model_runner)
+        config = model_runner.mamba2_config
+        assert config is not None
+        self.mamba_chunk_size = config.mamba_chunk_size
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        metadata = self._forward_metadata(forward_batch)
+        self.forward_metadata = Mamba2Metadata.prepare_mixed(
+            metadata,
+            self.mamba_chunk_size,
+            forward_batch,
+        )
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+    ):
+        metadata = self._capture_metadata(bs, req_pool_indices, forward_mode, spec_info)
+        draft_token_num = spec_info.draft_token_num if spec_info is not None else 1
+        self.forward_metadata = Mamba2Metadata.prepare_decode(
+            metadata,
+            seq_lens,
+            is_target_verify=forward_mode.is_target_verify(),
+            draft_token_num=draft_token_num,
+        )
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+        metadata = self._replay_metadata(
+            bs, req_pool_indices, forward_mode, spec_info, seq_lens_cpu
+        )
+        draft_token_num = spec_info.draft_token_num if spec_info is not None else 1
+        self.forward_metadata = Mamba2Metadata.prepare_decode(
+            metadata,
+            seq_lens,
+            is_target_verify=forward_mode.is_target_verify(),
+            draft_token_num=draft_token_num,
+        )
+
+    def forward(
+        self,
+        mixer: MambaMixer2,
+        hidden_states: torch.Tensor,
+        output: torch.Tensor,
+        layer_id: int,
+        mup_vector: Optional[torch.Tensor] = None,
+        use_triton_causal_conv: bool = False,
+    ):
+        assert isinstance(self.forward_metadata, Mamba2Metadata)
+        layer_cache = self.req_to_token_pool.mamba2_layer_cache(layer_id)
+        return mixer.forward(
+            hidden_states=hidden_states,
+            output=output,
+            layer_cache=layer_cache,
+            metadata=self.forward_metadata,
+            mup_vector=mup_vector,
+            use_triton_causal_conv=use_triton_causal_conv,
+        )
+
+    def forward_decode(self, *args, **kwargs):
+        raise NotImplementedError(
+            "Mamba2AttnBackend's forward is called directly instead of through HybridLinearAttnBackend, as it supports mixed prefill and decode"
+        )
+
+    def forward_extend(self, *args, **kwargs):
+        raise NotImplementedError(
+            "Mamba2AttnBackend's forward is called directly instead of through HybridLinearAttnBackend, as it supports mixed prefill and decode"
+        )
+
+
+class HybridLinearAttnBackend(AttentionBackend):
+    """Manages a full and linear attention backend"""
+
+    def __init__(
+        self,
+        full_attn_backend: AttentionBackend,
+        linear_attn_backend: MambaAttnBackendBase,
+        full_attn_layers: list[int],
+    ):
+        self.full_attn_layers = full_attn_layers
+        self.full_attn_backend = full_attn_backend
+        self.linear_attn_backend = linear_attn_backend
+        self.attn_backend_list = [full_attn_backend, linear_attn_backend]
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        for attn_backend in self.attn_backend_list:
+            attn_backend.init_forward_metadata(forward_batch)
+
+    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
+        for attn_backend in self.attn_backend_list:
+            attn_backend.init_cuda_graph_state(max_bs, max_num_tokens)
+
+    def init_cpu_graph_state(self, max_bs: int, max_num_tokens: int):
+        for attn_backend in self.attn_backend_list:
+            attn_backend.init_cpu_graph_state(max_bs, max_num_tokens)
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInput],
+    ):
+        for attn_backend in self.attn_backend_list:
+            attn_backend.init_forward_metadata_capture_cuda_graph(
+                bs,
+                num_tokens,
+                req_pool_indices,
+                seq_lens,
+                encoder_lens,
+                forward_mode,
+                spec_info,
+            )
+
+    def init_forward_metadata_capture_cpu_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+    ):
+        for attn_backend in self.attn_backend_list:
+            attn_backend.init_forward_metadata_capture_cpu_graph(
+                bs,
+                num_tokens,
+                req_pool_indices,
+                seq_lens,
+                encoder_lens,
+                forward_mode,
+                spec_info,
+            )
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInput],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+        for attn_backend in self.attn_backend_list:
+            attn_backend.init_forward_metadata_replay_cuda_graph(
+                bs,
+                req_pool_indices,
+                seq_lens,
+                seq_lens_sum,
+                encoder_lens,
+                forward_mode,
+                spec_info,
+                seq_lens_cpu,
+            )
+
+    def get_cuda_graph_seq_len_fill_value(self):
+        return self.full_attn_backend.get_cuda_graph_seq_len_fill_value()
+
+    def get_cpu_graph_seq_len_fill_value(self):
+        return self.full_attn_backend.get_cpu_graph_seq_len_fill_value()
+
+    def forward_decode(
+        self,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+        q: Optional[torch.Tensor] = None,  # For full attention
+        k: Optional[torch.Tensor] = None,  # For full attention
+        v: Optional[torch.Tensor] = None,  # For full attention
+        mixed_qkv: Optional[torch.Tensor] = None,  # For linear attention
+        a: Optional[torch.Tensor] = None,  # For GDN linear attention
+        b: Optional[torch.Tensor] = None,  # For GDN linear attention
+        **kwargs,
+    ):
+        layer_id = layer.layer_id if layer else kwargs["layer_id"]
+        if layer_id in self.full_attn_layers:
+            return self.full_attn_backend.forward_decode(
+                q, k, v, layer, forward_batch, save_kv_cache, **kwargs
+            )
+        # Linear attention backend
+        return self.linear_attn_backend.forward_decode(
+            q=q,
+            k=k,
+            v=v,
+            layer=layer,
+            forward_batch=forward_batch,
+            save_kv_cache=save_kv_cache,
+            mixed_qkv=mixed_qkv,
+            a=a,
+            b=b,
+            **kwargs,
+        )
+
+    def forward_extend(
+        self,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+        q: Optional[torch.Tensor] = None,  # For full attention
+        k: Optional[torch.Tensor] = None,  # For full attention
+        v: Optional[torch.Tensor] = None,  # For full attention
+        mixed_qkv: Optional[torch.Tensor] = None,  # For linear attention
+        a: Optional[torch.Tensor] = None,  # For GDN linear attention
+        b: Optional[torch.Tensor] = None,  # For GDN linear attention
+        **kwargs,
+    ):
+        layer_id = layer.layer_id if layer else kwargs["layer_id"]
+        if layer_id in self.full_attn_layers:
+            return self.full_attn_backend.forward_extend(
+                q, k, v, layer, forward_batch, save_kv_cache, **kwargs
+            )
+        # Linear attention backend
+        return self.linear_attn_backend.forward_extend(
+            q=q,
+            k=k,
+            v=v,
+            layer=layer,
+            forward_batch=forward_batch,
+            save_kv_cache=save_kv_cache,
+            mixed_qkv=mixed_qkv,
+            a=a,
+            b=b,
+            **kwargs,
+        )
+
+    def forward(
+        self,
+        q: Optional[torch.Tensor] = None,  # For full attention
+        k: Optional[torch.Tensor] = None,  # For full attention
+        v: Optional[torch.Tensor] = None,  # For full attention
+        layer: RadixAttention = None,
+        forward_batch: ForwardBatch = None,
+        save_kv_cache: bool = True,
+        mixed_qkv: Optional[torch.Tensor] = None,  # For linear attention
+        a: Optional[torch.Tensor] = None,  # For linear attention
+        b: Optional[torch.Tensor] = None,  # For linear attention
+        **kwargs,
+    ):
+        layer_id = layer.layer_id if layer else kwargs["layer_id"]
+        is_linear_attn = layer_id not in self.full_attn_layers
+
+        if forward_batch.forward_mode.is_idle():
+            if is_linear_attn:
+                return mixed_qkv.new_empty(
+                    mixed_qkv.shape[0], layer.num_v_heads, layer.head_v_dim
+                )
+            return q.new_empty(q.shape[0], layer.tp_q_head_num * layer.v_head_dim)
+        elif forward_batch.forward_mode.is_decode():
+            return self.forward_decode(
+                layer,
+                forward_batch,
+                save_kv_cache,
+                q,
+                k,
+                v,
+                mixed_qkv,
+                a,
+                b,
+                **kwargs,
+            )
+        else:
+            return self.forward_extend(
+                layer,
+                forward_batch,
+                save_kv_cache,
+                q,
+                k,
+                v,
+                mixed_qkv,
+                a,
+                b,
+                **kwargs,
+            )
+
+    def update_mamba_state_after_mtp_verify(
+        self,
+        accepted_steps: torch.Tensor,
+        mamba_track_indices: Optional[torch.Tensor],
+        mamba_steps_to_track: Optional[torch.Tensor],
+        model,
+    ):
+        """
+        Update mamba states after MTP verify using fully fused Triton kernel.
+
+        This replaces the original advanced indexing operations with a single fused
+        gather-scatter kernel that also handles masking internally, avoiding:
+        - index_elementwise_kernel from tensor[bool_mask]
+        - index_select kernel launches
+        - nonzero kernel launches
+        """
+        request_number = accepted_steps.shape[0]
+
+        state_indices_tensor = (
+            self.linear_attn_backend.forward_metadata.mamba_cache_indices[
+                :request_number
+            ]
+        )
+
+        mamba_caches = (
+            self.linear_attn_backend.req_to_token_pool.get_speculative_mamba2_params_all_layers()
+        )
+
+        conv_states = mamba_caches.conv[0]
+        ssm_states = mamba_caches.temporal
+        intermediate_state_cache = mamba_caches.intermediate_ssm
+        intermediate_conv_window_cache = mamba_caches.intermediate_conv_window[0]
+
+        # Use fully fused kernel that handles masking internally
+        # This avoids separate nonzero() and index_select() calls
+        fused_mamba_state_scatter_with_mask(
+            ssm_states,
+            intermediate_state_cache,
+            state_indices_tensor,
+            accepted_steps,
+        )
+        fused_mamba_state_scatter_with_mask(
+            conv_states,
+            intermediate_conv_window_cache,
+            state_indices_tensor,
+            accepted_steps,
+        )
+
+        # Track indices used for tracking mamba states for prefix cache
+        if mamba_track_indices is not None:
+            assert mamba_steps_to_track is not None
+            # Use fully fused kernel for track scatter operations
+            fused_mamba_state_scatter_with_mask(
+                ssm_states,
+                intermediate_state_cache,
+                mamba_track_indices,
+                mamba_steps_to_track,
+            )
+            fused_mamba_state_scatter_with_mask(
+                conv_states,
+                intermediate_conv_window_cache,
+                mamba_track_indices,
+                mamba_steps_to_track,
+            )
diff --git a/sglang/python/sglang/srt/layers/attention/intel_amx_backend.py b/sglang/python/sglang/srt/layers/attention/intel_amx_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..e64bb45c00d9ed12658152c3da2e4b2903eb9c14
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/intel_amx_backend.py
@@ -0,0 +1,165 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import torch
+
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.radix_attention import RadixAttention
+    from sglang.srt.model_executor.model_runner import ModelRunner
+
+
+class IntelAMXAttnBackend(AttentionBackend):
+    def __init__(self, model_runner: ModelRunner):
+        import sgl_kernel  # noqa: F401
+
+        super().__init__()
+        self.forward_metadata = None
+        self.device = model_runner.device
+
+        self.num_head = (
+            model_runner.model_config.num_attention_heads // model_runner.tp_size
+        )
+
+        # [NB]: `layer_id` set to 0 for qwen3-next models, as not all attn layers require kv pool
+        # using "full_attention_layer_id_mapping" to map which layer needs kv pool
+        layer_id = 0
+        if hasattr(model_runner.token_to_kv_pool, "full_attention_layer_id_mapping"):
+            layer_id = [*model_runner.token_to_kv_pool.full_attention_layer_id_mapping][
+                0
+            ]
+        self.v_head_dim = model_runner.token_to_kv_pool.get_value_buffer(
+            layer_id
+        ).shape[-1]
+        self.decode_attention_fwd = torch.ops.sgl_kernel.decode_attention_cpu
+        self.extend_attention_fwd = torch.ops.sgl_kernel.extend_attention_cpu
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        """Init the metadata for a forward pass."""
+
+        bs = forward_batch.batch_size
+        attn_logits = torch.zeros(
+            (
+                bs,
+                self.num_head,
+                8,  # self.num_kv_splits,
+                self.v_head_dim + 1,
+            ),
+            dtype=torch.float32,
+            device=self.device,
+        )
+        if forward_batch.forward_mode.is_decode_or_idle():
+            max_extend_len = None
+        else:
+            max_extend_len = torch.max(forward_batch.extend_seq_lens).item()
+        self.forward_metadata = (attn_logits, max_extend_len)
+
+    def get_cpu_graph_seq_len_fill_value(self):
+        return 1
+
+    def init_forward_metadata_capture_cpu_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens,
+        forward_mode,
+        spec_info,
+    ):
+        attn_logits = torch.zeros(
+            (
+                bs,
+                self.num_head,
+                8,  # self.num_kv_splits,
+                self.v_head_dim + 1,
+            ),
+            dtype=torch.float32,
+            device=self.device,
+        )
+        max_extend_len = None
+        self.forward_metadata = (attn_logits, max_extend_len)
+
+    def init_cpu_graph_state(self, max_bs: int, max_num_tokens: int):
+        pass
+
+    def forward_extend(
+        self,
+        q,
+        k,
+        v,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+    ):
+        if layer.qk_head_dim != layer.v_head_dim:
+            o = q.new_empty((q.shape[0], layer.tp_q_head_num * layer.v_head_dim))
+        else:
+            o = torch.empty_like(q)
+
+        if save_kv_cache:
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                layer, forward_batch.out_cache_loc, k, v
+            )
+
+        _, max_extend_len = self.forward_metadata
+
+        self.extend_attention_fwd(
+            q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
+            k,
+            v,
+            o.view(-1, layer.tp_q_head_num, layer.v_head_dim),
+            forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id),
+            forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id),
+            forward_batch.req_to_token_pool.req_to_token,
+            forward_batch.req_pool_indices,
+            forward_batch.seq_lens,
+            forward_batch.extend_seq_lens,
+            forward_batch.extend_start_loc,
+            max_extend_len,
+            layer.scaling,
+            layer.logit_cap,
+        )
+        return o
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+    ):
+        attn_logits, _ = self.forward_metadata
+
+        q = q.reshape(-1, layer.tp_q_head_num * layer.qk_head_dim)
+
+        if layer.qk_head_dim != layer.v_head_dim:
+            o = q.new_empty((q.shape[0], layer.tp_q_head_num * layer.v_head_dim))
+        else:
+            o = torch.empty_like(q)
+
+        self.decode_attention_fwd(
+            q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
+            forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id),
+            forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id),
+            o.view(-1, layer.tp_q_head_num, layer.v_head_dim),
+            k,
+            v,
+            forward_batch.out_cache_loc,
+            attn_logits,
+            forward_batch.req_to_token_pool.req_to_token,
+            forward_batch.req_pool_indices,
+            forward_batch.seq_lens,
+            layer.scaling,
+            layer.logit_cap,
+        )
+
+        return o
+
+    def support_triton(self):
+        return False
diff --git a/sglang/python/sglang/srt/layers/attention/linear/__init__.py b/sglang/python/sglang/srt/layers/attention/linear/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/sglang/python/sglang/srt/layers/attention/linear/gdn_backend.py b/sglang/python/sglang/srt/layers/attention/linear/gdn_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b7851dfa741aa0aee10d3aaec18df1d766fce1c
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/linear/gdn_backend.py
@@ -0,0 +1,411 @@
+from typing import Tuple, Union
+
+import torch
+
+from sglang.srt.layers.attention.fla.fused_gdn_gating import fused_gdn_gating
+from sglang.srt.layers.attention.hybrid_linear_attn_backend import MambaAttnBackendBase
+from sglang.srt.layers.attention.linear.kernels.gdn_triton import TritonGDNKernel
+from sglang.srt.layers.attention.linear.utils import (
+    LinearAttnKernelBackend,
+    get_linear_attn_decode_backend,
+    get_linear_attn_prefill_backend,
+)
+from sglang.srt.layers.attention.mamba.causal_conv1d_triton import (
+    causal_conv1d_fn,
+    causal_conv1d_update,
+)
+from sglang.srt.layers.radix_linear_attention import RadixLinearAttention
+from sglang.srt.mem_cache.memory_pool import MambaPool
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_executor.model_runner import ModelRunner
+from sglang.srt.utils import is_cpu, is_cuda, is_npu
+from sglang.srt.utils.common import rank0_log
+
+if not is_cpu():
+    from sglang.srt.layers.attention.fla.chunk_delta_h import (
+        CHUNK_SIZE as FLA_CHUNK_SIZE,
+    )
+
+if is_cuda():
+    from sglang.srt.layers.attention.mamba.causal_conv1d import (
+        causal_conv1d_fn as causal_conv1d_fn_cuda,
+    )
+
+    causal_conv1d_fn = causal_conv1d_fn_cuda
+elif is_npu():
+    from sgl_kernel_npu.mamba.causal_conv1d import (
+        causal_conv1d_fn_npu,
+        causal_conv1d_update_npu,
+    )
+
+    causal_conv1d_fn = causal_conv1d_fn_npu
+    causal_conv1d_update = causal_conv1d_update_npu
+elif is_cpu():
+    from sgl_kernel.mamba import causal_conv1d_fn_cpu, causal_conv1d_update_cpu
+
+    causal_conv1d_fn = causal_conv1d_fn_cpu
+    causal_conv1d_update = causal_conv1d_update_cpu
+    fused_gdn_gating = torch.ops.sgl_kernel.fused_gdn_gating_cpu
+
+
+class GDNKernelDispatcher:
+    """Dispatches GDN kernel calls to the appropriate backend per mode."""
+
+    def __init__(
+        self,
+        decode_backend: LinearAttnKernelBackend,
+        prefill_backend: LinearAttnKernelBackend,
+    ):
+        triton_kernel = TritonGDNKernel()
+
+        if decode_backend.is_triton():
+            self.decode_kernel = triton_kernel
+        elif decode_backend.is_cutedsl():
+            if not is_cuda():
+                raise ValueError("CuTe DSL backend requires CUDA")
+            from sglang.srt.layers.attention.linear.kernels.gdn_cutedsl import (
+                CuteDSLGDNKernel,
+            )
+
+            self.decode_kernel = CuteDSLGDNKernel()
+        elif decode_backend.is_flashinfer():
+            if not is_cuda():
+                raise ValueError("FlashInfer backend requires CUDA")
+            from sglang.srt.layers.attention.linear.kernels.gdn_flashinfer import (
+                FlashInferGDNKernel,
+            )
+
+            flashinfer_kernel = FlashInferGDNKernel()
+            self.decode_kernel = flashinfer_kernel
+        else:
+            raise ValueError(f"Unsupported GDN decode backend: {decode_backend}")
+
+        if prefill_backend.is_triton():
+            self.extend_kernel = triton_kernel
+        elif prefill_backend.is_cutedsl():
+            raise ValueError(
+                "CuTe DSL backend only supports decode, not prefill. "
+                "Use --linear-attn-prefill-backend triton instead."
+            )
+        elif prefill_backend.is_flashinfer():
+            if not is_cuda():
+                raise ValueError("FlashInfer backend requires CUDA")
+            # Reuse the FlashInfer kernel if already created for decode
+            if decode_backend.is_flashinfer():
+                self.extend_kernel = flashinfer_kernel
+            else:
+                from sglang.srt.layers.attention.linear.kernels.gdn_flashinfer import (
+                    FlashInferGDNKernel,
+                )
+
+                flashinfer_kernel = FlashInferGDNKernel()
+                self.extend_kernel = flashinfer_kernel
+        else:
+            raise ValueError(f"Unsupported GDN prefill backend: {prefill_backend}")
+
+        # Verify kernel: use FlashInfer if either decode or prefill selected it
+        if decode_backend.is_flashinfer() or prefill_backend.is_flashinfer():
+            self.verify_kernel = flashinfer_kernel
+        else:
+            self.verify_kernel = triton_kernel
+
+        rank0_log(
+            f"GDN kernel dispatcher: decode={self.decode_kernel.__class__.__name__}, "
+            f"extend={self.extend_kernel.__class__.__name__}, "
+            f"verify={self.verify_kernel.__class__.__name__}"
+        )
+
+    def decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        a: torch.Tensor,
+        b: torch.Tensor,
+        *,
+        A_log: torch.Tensor,
+        dt_bias: torch.Tensor,
+        ssm_states: torch.Tensor,
+        cache_indices: torch.Tensor,
+        query_start_loc: torch.Tensor,
+        **kwargs,
+    ) -> torch.Tensor:
+        return self.decode_kernel.decode(
+            q,
+            k,
+            v,
+            a,
+            b,
+            A_log=A_log,
+            dt_bias=dt_bias,
+            ssm_states=ssm_states,
+            cache_indices=cache_indices,
+            query_start_loc=query_start_loc,
+            **kwargs,
+        )
+
+    def extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        g: torch.Tensor,
+        beta: torch.Tensor,
+        *,
+        ssm_states: torch.Tensor,
+        cache_indices: torch.Tensor,
+        query_start_loc: torch.Tensor,
+        **kwargs,
+    ) -> tuple:
+        return self.extend_kernel.extend(
+            q,
+            k,
+            v,
+            g,
+            beta,
+            ssm_states=ssm_states,
+            cache_indices=cache_indices,
+            query_start_loc=query_start_loc,
+            **kwargs,
+        )
+
+    def target_verify(
+        self,
+        A_log: torch.Tensor,
+        dt_bias: torch.Tensor,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        a: torch.Tensor,
+        b: torch.Tensor,
+        *,
+        ssm_states: torch.Tensor,
+        cache_indices: torch.Tensor,
+        query_start_loc: torch.Tensor,
+        **kwargs,
+    ) -> torch.Tensor:
+        return self.verify_kernel.target_verify(
+            A_log=A_log,
+            dt_bias=dt_bias,
+            q=q,
+            k=k,
+            v=v,
+            a=a,
+            b=b,
+            ssm_states=ssm_states,
+            cache_indices=cache_indices,
+            query_start_loc=query_start_loc,
+            **kwargs,
+        )
+
+
+class GDNAttnBackend(MambaAttnBackendBase):
+    """Attention backend for GDN (Gated Delta Network) linear attention."""
+
+    def __init__(self, model_runner: ModelRunner):
+        super().__init__(model_runner)
+        self.conv_states_shape = (
+            model_runner.req_to_token_pool.mamba_pool.mamba_cache.conv[0].shape
+        )
+        if not is_cpu() and not is_npu():
+            assert (
+                self.conv_states_shape[-1] < FLA_CHUNK_SIZE
+            ), f"{self.conv_states_shape[-1]=} should be less than {FLA_CHUNK_SIZE}"
+
+        decode_backend = get_linear_attn_decode_backend()
+        prefill_backend = get_linear_attn_prefill_backend()
+        self.kernel_dispatcher = GDNKernelDispatcher(decode_backend, prefill_backend)
+
+    def forward_decode(
+        self,
+        layer: RadixLinearAttention,
+        forward_batch: ForwardBatch,
+        mixed_qkv: Union[torch.Tensor, Tuple[torch.Tensor, ...]],
+        a: torch.Tensor,
+        b: torch.Tensor,
+        **kwargs,
+    ):
+        layer_cache = self.req_to_token_pool.mamba2_layer_cache(layer.layer_id)
+        conv_states = layer_cache.conv[0]
+        ssm_states = layer_cache.temporal
+        query_start_loc = self.forward_metadata.query_start_loc
+        cache_indices = self.forward_metadata.mamba_cache_indices
+
+        assert isinstance(mixed_qkv, torch.Tensor)
+        mixed_qkv = causal_conv1d_update(
+            mixed_qkv,
+            conv_states,
+            layer.conv_weights,
+            layer.bias,
+            layer.activation,
+            conv_state_indices=cache_indices,
+        )
+
+        query, key, value = torch.split(
+            mixed_qkv,
+            [layer.q_dim, layer.k_dim, layer.v_dim],
+            dim=-1,
+        )
+        # Reshape from [bs, h*d] to [1, bs, h, d]
+        bs = forward_batch.batch_size
+        query = query.view(1, bs, layer.num_q_heads, layer.head_q_dim)
+        key = key.view(1, bs, layer.num_k_heads, layer.head_k_dim)
+        value = value.view(1, bs, layer.num_v_heads, layer.head_v_dim)
+
+        core_attn_out = self.kernel_dispatcher.decode(
+            q=query,
+            k=key,
+            v=value,
+            a=a,
+            b=b,
+            A_log=layer.A_log,
+            dt_bias=layer.dt_bias,
+            ssm_states=ssm_states,
+            cache_indices=cache_indices,
+            query_start_loc=query_start_loc,
+        )
+
+        self._track_mamba_state_decode(
+            forward_batch, conv_states, ssm_states, cache_indices
+        )
+
+        return core_attn_out
+
+    def forward_extend(
+        self,
+        layer: RadixLinearAttention,
+        forward_batch: ForwardBatch,
+        mixed_qkv: Union[torch.Tensor, Tuple[torch.Tensor, ...]],
+        a: torch.Tensor,
+        b: torch.Tensor,
+        **kwargs,
+    ):
+        assert isinstance(mixed_qkv, torch.Tensor)
+        seq_len = mixed_qkv.shape[0]
+
+        is_target_verify = forward_batch.forward_mode.is_target_verify()
+        forward_metadata = self.forward_metadata
+
+        query_start_loc = forward_metadata.query_start_loc
+        cache_indices = forward_metadata.mamba_cache_indices
+        retrieve_next_token = forward_metadata.retrieve_next_token
+        retrieve_next_sibling = forward_metadata.retrieve_next_sibling
+        retrieve_parent_token = forward_metadata.retrieve_parent_token
+
+        mamba_cache_params = self.req_to_token_pool.mamba2_layer_cache(layer.layer_id)
+        conv_states = mamba_cache_params.conv[0]
+        ssm_states = mamba_cache_params.temporal
+        if is_target_verify:
+            assert isinstance(mamba_cache_params, MambaPool.SpeculativeState)
+            intermediate_state_cache = mamba_cache_params.intermediate_ssm
+            intermediate_conv_window_cache = (
+                mamba_cache_params.intermediate_conv_window[0]
+            )
+            has_initial_states = torch.ones(
+                seq_len // forward_batch.spec_info.draft_token_num,
+                dtype=torch.bool,
+                device=forward_batch.input_ids.device,
+            )
+            intermediate_state_indices = torch.arange(
+                cache_indices.shape[0], dtype=torch.int32, device=cache_indices.device
+            )
+        else:
+            has_initial_states = forward_batch.extend_prefix_lens > 0
+
+        if is_target_verify:
+            batch_size = seq_len // forward_batch.spec_info.draft_token_num
+            draft_token_num = forward_batch.spec_info.draft_token_num
+            mixed_qkv_reshaped = mixed_qkv.view(
+                batch_size, draft_token_num, -1
+            ).transpose(1, 2)
+            mixed_qkv_processed = causal_conv1d_update(
+                mixed_qkv_reshaped,
+                conv_states,
+                layer.conv_weights,
+                layer.bias,
+                layer.activation,
+                conv_state_indices=cache_indices[:batch_size],
+                intermediate_conv_window=intermediate_conv_window_cache,
+                intermediate_state_indices=intermediate_state_indices[:batch_size],
+                retrieve_next_token=retrieve_next_token,
+                retrieve_next_sibling=retrieve_next_sibling,
+                retrieve_parent_token=retrieve_parent_token,
+            )
+            mixed_qkv = mixed_qkv_processed.transpose(1, 2).view(seq_len, -1)
+        else:
+            mixed_qkv = mixed_qkv.transpose(0, 1)
+            if (
+                forward_batch.mamba_track_mask is not None
+                and forward_batch.mamba_track_mask.any()
+            ):
+                conv_dst = forward_batch.mamba_track_indices
+                mixed_qkv_to_track = mixed_qkv[
+                    :, forward_metadata.track_conv_indices
+                ].transpose(0, 1)
+                mask_indices = forward_batch.mamba_track_mask.nonzero(as_tuple=True)[0]
+                conv_states[conv_dst[mask_indices]] = mixed_qkv_to_track
+
+            mixed_qkv = causal_conv1d_fn(
+                mixed_qkv,
+                layer.conv_weights,
+                layer.bias,
+                activation=layer.activation,
+                conv_states=conv_states,
+                has_initial_state=has_initial_states,
+                cache_indices=cache_indices,
+                query_start_loc=query_start_loc,
+                seq_lens_cpu=forward_batch.extend_seq_lens_cpu,
+            ).transpose(0, 1)[:seq_len]
+
+        query, key, value = torch.split(
+            mixed_qkv,
+            [layer.q_dim, layer.k_dim, layer.v_dim],
+            dim=-1,
+        )
+
+        actual_seq_len = query.shape[0]
+        query = query.view(1, actual_seq_len, layer.num_q_heads, layer.head_q_dim)
+        key = key.view(1, actual_seq_len, layer.num_k_heads, layer.head_k_dim)
+        value = value.view(1, actual_seq_len, layer.num_v_heads, layer.head_v_dim)
+
+        if is_target_verify:
+            core_attn_out = self.kernel_dispatcher.target_verify(
+                A_log=layer.A_log,
+                dt_bias=layer.dt_bias,
+                q=query,
+                k=key,
+                v=value,
+                a=a,
+                b=b,
+                ssm_states=ssm_states,
+                cache_indices=cache_indices,
+                query_start_loc=query_start_loc,
+                intermediate_states_buffer=intermediate_state_cache,
+                intermediate_state_indices=intermediate_state_indices,
+                cache_steps=forward_batch.spec_info.draft_token_num,
+                retrieve_parent_token=retrieve_parent_token,
+            )
+        else:
+            g, beta = fused_gdn_gating(layer.A_log, a, b, layer.dt_bias)
+            core_attn_out, last_recurrent_state, h = self.kernel_dispatcher.extend(
+                q=query,
+                k=key,
+                v=value,
+                g=g,
+                beta=beta,
+                ssm_states=ssm_states,
+                cache_indices=cache_indices,
+                query_start_loc=query_start_loc,
+            )
+            if (is_npu() or is_cpu()) and last_recurrent_state is not None:
+                last_recurrent_state = last_recurrent_state.to(
+                    ssm_states.dtype, copy=False
+                )
+                ssm_states[cache_indices] = last_recurrent_state
+
+            if h is not None:
+                self._track_mamba_state_extend(
+                    forward_batch, h, ssm_states, forward_metadata
+                )
+
+        return core_attn_out
diff --git a/sglang/python/sglang/srt/layers/attention/linear/kda_backend.py b/sglang/python/sglang/srt/layers/attention/linear/kda_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e6b16056535c43b1223843e0ed4625086a39163
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/linear/kda_backend.py
@@ -0,0 +1,250 @@
+from typing import Tuple, Union
+
+import torch
+from einops import rearrange
+
+from sglang.srt.layers.attention.hybrid_linear_attn_backend import MambaAttnBackendBase
+from sglang.srt.layers.attention.linear.kernels.kda_triton import TritonKDAKernel
+from sglang.srt.layers.attention.linear.utils import (
+    LinearAttnKernelBackend,
+    get_linear_attn_decode_backend,
+    get_linear_attn_prefill_backend,
+)
+from sglang.srt.layers.attention.mamba.causal_conv1d_triton import (
+    causal_conv1d_fn,
+    causal_conv1d_update,
+)
+from sglang.srt.layers.radix_linear_attention import RadixLinearAttention
+from sglang.srt.utils import is_cpu, is_npu
+from sglang.srt.utils.common import rank0_log
+
+# KDA always uses the triton causal_conv1d_fn (no CUDA override).
+# Only causal_conv1d_update needs platform-specific overrides for decode.
+if is_npu():
+    from sgl_kernel_npu.mamba.causal_conv1d import causal_conv1d_update_npu
+
+    causal_conv1d_update = causal_conv1d_update_npu
+elif is_cpu():
+    from sgl_kernel.mamba import causal_conv1d_update_cpu
+
+    causal_conv1d_update = causal_conv1d_update_cpu
+
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_executor.model_runner import ModelRunner
+
+
+class KDAKernelDispatcher:
+    """Dispatches KDA kernel calls to the appropriate backend per mode."""
+
+    def __init__(
+        self,
+        decode_backend: LinearAttnKernelBackend,
+        prefill_backend: LinearAttnKernelBackend,
+    ):
+        triton_kernel = TritonKDAKernel()
+
+        if decode_backend.is_triton():
+            self.decode_kernel = triton_kernel
+        else:
+            raise ValueError(
+                f"Unsupported KDA decode backend: {decode_backend}. "
+                "KDA currently only supports 'triton'."
+            )
+
+        if prefill_backend.is_triton():
+            self.extend_kernel = triton_kernel
+        else:
+            raise ValueError(
+                f"Unsupported KDA prefill backend: {prefill_backend}. "
+                "KDA currently only supports 'triton'."
+            )
+
+        rank0_log(
+            f"KDA kernel dispatcher: decode={self.decode_kernel.__class__.__name__}, "
+            f"extend={self.extend_kernel.__class__.__name__}"
+        )
+
+    def decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        a: torch.Tensor,
+        b: torch.Tensor,
+        *,
+        A_log: torch.Tensor,
+        dt_bias: torch.Tensor,
+        ssm_states: torch.Tensor,
+        cache_indices: torch.Tensor,
+        query_start_loc: torch.Tensor,
+        **kwargs,
+    ) -> torch.Tensor:
+        return self.decode_kernel.decode(
+            q,
+            k,
+            v,
+            a,
+            b,
+            A_log=A_log,
+            dt_bias=dt_bias,
+            ssm_states=ssm_states,
+            cache_indices=cache_indices,
+            query_start_loc=query_start_loc,
+            **kwargs,
+        )
+
+    def extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        g: torch.Tensor,
+        beta: torch.Tensor,
+        *,
+        ssm_states: torch.Tensor,
+        cache_indices: torch.Tensor,
+        query_start_loc: torch.Tensor,
+        **kwargs,
+    ) -> torch.Tensor:
+        return self.extend_kernel.extend(
+            q,
+            k,
+            v,
+            g,
+            beta,
+            ssm_states=ssm_states,
+            cache_indices=cache_indices,
+            query_start_loc=query_start_loc,
+            **kwargs,
+        )
+
+
+class KDAAttnBackend(MambaAttnBackendBase):
+    """Attention backend for KDA (Kimi Delta Attention) linear attention."""
+
+    def __init__(self, model_runner: ModelRunner):
+        super().__init__(model_runner)
+        decode_backend = get_linear_attn_decode_backend()
+        prefill_backend = get_linear_attn_prefill_backend()
+        self.kernel_dispatcher = KDAKernelDispatcher(decode_backend, prefill_backend)
+
+    def forward_decode(
+        self,
+        layer: RadixLinearAttention,
+        mixed_qkv: Union[torch.Tensor, Tuple[torch.Tensor, ...]],
+        a: torch.Tensor,
+        b: torch.Tensor,
+        **kwargs,
+    ):
+        layer_cache = self.req_to_token_pool.mamba2_layer_cache(layer.layer_id)
+        conv_states = layer_cache.conv[0]
+        ssm_states = layer_cache.temporal
+        query_start_loc = self.forward_metadata.query_start_loc
+        cache_indices = self.forward_metadata.mamba_cache_indices
+
+        qkv = causal_conv1d_update(
+            mixed_qkv,
+            conv_states.transpose(-1, -2),
+            layer.conv_weights,
+            layer.bias,
+            activation="silu",
+            conv_state_indices=cache_indices,
+        )
+        q, k, v = qkv.split([layer.q_dim, layer.k_dim, layer.v_dim], dim=-1)
+        q = rearrange(q, "n (h d) -> 1 n h d", d=layer.head_q_dim)
+        k = rearrange(k, "n (h d) -> 1 n h d", d=layer.head_k_dim)
+        v = rearrange(v, "n (h d) -> 1 n h d", d=layer.head_v_dim)
+
+        return self.kernel_dispatcher.decode(
+            q=q,
+            k=k,
+            v=v,
+            a=a,
+            b=b,
+            A_log=layer.A_log,
+            dt_bias=layer.dt_bias,
+            ssm_states=ssm_states,
+            cache_indices=cache_indices,
+            query_start_loc=query_start_loc,
+        )
+
+    def forward_extend(
+        self,
+        layer: RadixLinearAttention,
+        forward_batch: ForwardBatch,
+        mixed_qkv: Union[torch.Tensor, Tuple[torch.Tensor, ...]],
+        a: torch.Tensor,
+        b: torch.Tensor,
+        **kwargs,
+    ):
+        query_start_loc = self.forward_metadata.query_start_loc
+        cache_indices = self.forward_metadata.mamba_cache_indices
+
+        mamba_cache_params = self.req_to_token_pool.mamba2_layer_cache(layer.layer_id)
+        conv_states = mamba_cache_params.conv[0].transpose(-1, -2)
+
+        ssm_states = mamba_cache_params.temporal
+
+        has_initial_state = forward_batch.extend_prefix_lens > 0
+
+        splits = [layer.q_dim, layer.k_dim, layer.v_dim]
+        q, k, v = mixed_qkv.transpose(0, 1).split(splits, dim=0)
+        q_conv_weight, k_conv_weight, v_conv_weight = layer.conv_weights.split(
+            splits, dim=0
+        )
+        q_conv_state, k_conv_state, v_conv_state = conv_states.split(splits, dim=-2)
+        if layer.bias is not None:
+            q_bias, k_bias, v_bias = layer.bias.split(splits, dim=0)
+        else:
+            q_bias, k_bias, v_bias = None, None, None
+
+        q = causal_conv1d_fn(
+            q,
+            q_conv_weight,
+            q_bias,
+            activation="silu",
+            conv_states=q_conv_state,
+            has_initial_state=has_initial_state,
+            cache_indices=cache_indices,
+            query_start_loc=query_start_loc,
+            seq_lens_cpu=forward_batch.extend_seq_lens_cpu,
+        ).transpose(0, 1)
+        k = causal_conv1d_fn(
+            k,
+            k_conv_weight,
+            k_bias,
+            activation="silu",
+            conv_states=k_conv_state,
+            has_initial_state=has_initial_state,
+            cache_indices=cache_indices,
+            query_start_loc=query_start_loc,
+            seq_lens_cpu=forward_batch.extend_seq_lens_cpu,
+        ).transpose(0, 1)
+        v = causal_conv1d_fn(
+            v,
+            v_conv_weight,
+            v_bias,
+            activation="silu",
+            conv_states=v_conv_state,
+            has_initial_state=has_initial_state,
+            cache_indices=cache_indices,
+            query_start_loc=query_start_loc,
+            seq_lens_cpu=forward_batch.extend_seq_lens_cpu,
+        ).transpose(0, 1)
+
+        q = rearrange(q, "n (h d) -> 1 n h d", d=layer.head_q_dim)
+        k = rearrange(k, "n (h d) -> 1 n h d", d=layer.head_k_dim)
+        v = rearrange(v, "n (h d) -> 1 n h d", d=layer.head_v_dim)
+
+        core_attn_out = self.kernel_dispatcher.extend(
+            q=q,
+            k=k,
+            v=v,
+            g=a,
+            beta=b,
+            ssm_states=ssm_states,
+            cache_indices=cache_indices,
+            query_start_loc=query_start_loc,
+        )
+
+        return core_attn_out
diff --git a/sglang/python/sglang/srt/layers/attention/linear/kernels/__init__.py b/sglang/python/sglang/srt/layers/attention/linear/kernels/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/sglang/python/sglang/srt/layers/attention/linear/kernels/gdn_cutedsl.py b/sglang/python/sglang/srt/layers/attention/linear/kernels/gdn_cutedsl.py
new file mode 100644
index 0000000000000000000000000000000000000000..fff4ef9015d6e4185c25e84ca60fc1295543137e
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/linear/kernels/gdn_cutedsl.py
@@ -0,0 +1,47 @@
+import torch
+
+from sglang.jit_kernel.cutedsl_gdn import cutedsl_fused_sigmoid_gating_delta_rule_update
+from sglang.srt.layers.attention.linear.kernels.kernel_backend import (
+    LinearAttnKernelBase,
+)
+
+
+class CuteDSLGDNKernel(LinearAttnKernelBase):
+    """CuTe DSL kernel for GDN decode (CUDA only)."""
+
+    def decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        a: torch.Tensor,
+        b: torch.Tensor,
+        *,
+        A_log: torch.Tensor,
+        dt_bias: torch.Tensor,
+        ssm_states: torch.Tensor,
+        cache_indices: torch.Tensor,
+        query_start_loc: torch.Tensor,
+        **kwargs,
+    ) -> torch.Tensor:
+        return cutedsl_fused_sigmoid_gating_delta_rule_update(
+            A_log=A_log,
+            dt_bias=dt_bias,
+            q=q,
+            k=k,
+            v=v,
+            a=a,
+            b=b,
+            initial_state_source=ssm_states,
+            initial_state_indices=cache_indices,
+            cu_seqlens=query_start_loc,
+            use_qk_l2norm_in_kernel=True,
+            softplus_beta=1.0,
+            softplus_threshold=20.0,
+        )
+
+    def extend(self, *args, **kwargs):
+        raise NotImplementedError("CuteDSLGDNKernel only supports decode")
+
+    def target_verify(self, *args, **kwargs):
+        raise NotImplementedError("CuteDSLGDNKernel only supports decode")
diff --git a/sglang/python/sglang/srt/layers/attention/linear/kernels/gdn_flashinfer.py b/sglang/python/sglang/srt/layers/attention/linear/kernels/gdn_flashinfer.py
new file mode 100644
index 0000000000000000000000000000000000000000..32ffd96649dbf74258b275bfd1ca70792324468a
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/linear/kernels/gdn_flashinfer.py
@@ -0,0 +1,325 @@
+"""FlashInfer-based kernels for GDN (Gated Delta Network) linear attention.
+
+Provides K-last SSM layout support using FlashInfer CUTLASS kernels (SM90+).
+The K-last layout stores SSM states as [pool, HV, V, K] instead of V-last
+[pool, HV, K, V], enabling more efficient memory access patterns for decode.
+
+Requires ``flashinfer`` with GDN kernel support to be installed, e.g.
+``pip install -e ".[cutlass]"`` from the FlashInfer repo.
+
+NOTE: FlashInfer >= 0.6.4 includes a fix (PR#2509) that caches
+cudaGetDeviceProperties in the GDN prefill JIT launcher, eliminating ~80ms
+of CPU overhead per prefill. Upgrading from 0.6.3 to 0.6.4 recovers ~50%
+prefill throughput regression observed with stock FlashInfer.
+"""
+
+import logging
+import os
+from typing import Optional
+
+import torch
+
+from sglang.srt.layers.attention.linear.kernels.kernel_backend import (
+    LinearAttnKernelBase,
+)
+
+logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Lazy import for FlashInfer GDN kernels
+# ---------------------------------------------------------------------------
+_flashinfer_gdn_available: Optional[bool] = None
+_flashinfer_chunk_gated_delta_rule = None
+_flashinfer_gated_delta_rule_mtp = None
+_flashinfer_gated_delta_rule_decode = None
+
+
+def _get_flashinfer_gdn_kernels():
+    """Lazy import for FlashInfer GDN prefill, decode and verify (MTP) kernels.
+
+    Returns (available, prefill_fn, mtp_fn, decode_fn).
+    """
+    global _flashinfer_gdn_available, _flashinfer_chunk_gated_delta_rule, _flashinfer_gated_delta_rule_mtp, _flashinfer_gated_delta_rule_decode
+    if _flashinfer_gdn_available is None:
+        try:
+            os.environ.setdefault("FLASHINFER_DISABLE_VERSION_CHECK", "1")
+
+            from flashinfer.gdn_decode import (
+                gated_delta_rule_decode_pretranspose,
+                gated_delta_rule_mtp,
+            )
+            from flashinfer.gdn_prefill import chunk_gated_delta_rule
+
+            _flashinfer_chunk_gated_delta_rule = chunk_gated_delta_rule
+            _flashinfer_gated_delta_rule_mtp = gated_delta_rule_mtp
+            # Use pretranspose (K-last / V-major) decode kernel to match
+            # the K-last pool layout [pool, HV, V, K]
+            _flashinfer_gated_delta_rule_decode = gated_delta_rule_decode_pretranspose
+            # SM90+ required for FlashInfer GDN CUTLASS kernels
+            _flashinfer_gdn_available = (
+                torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 9
+            )
+            if _flashinfer_gdn_available:
+                logger.info(
+                    "FlashInfer GDN kernels (prefill + decode + MTP) loaded successfully"
+                )
+        except (ImportError, RuntimeError) as e:
+            logger.warning(f"FlashInfer GDN kernels not available: {e}")
+            _flashinfer_gdn_available = False
+            _flashinfer_gated_delta_rule_decode = None
+    return (
+        _flashinfer_gdn_available,
+        _flashinfer_chunk_gated_delta_rule,
+        _flashinfer_gated_delta_rule_mtp,
+        _flashinfer_gated_delta_rule_decode,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Kernel implementation
+# ---------------------------------------------------------------------------
+
+
+class FlashInferGDNKernel(LinearAttnKernelBase):
+    """FlashInfer CUTLASS kernel for GDN with K-last SSM state layout.
+
+    Supports decode (pooled pretranspose), extend (chunked prefill) and
+    target_verify (MTP).  Requires SM90+ and FlashInfer with GDN support.
+    """
+
+    def __init__(self):
+        (
+            available,
+            self._prefill_fn,
+            self._mtp_fn,
+            self._decode_fn,
+        ) = _get_flashinfer_gdn_kernels()
+
+        if not available:
+            raise RuntimeError(
+                "FlashInfer GDN kernels are not available. "
+                "Requires SM90+ and FlashInfer with GDN kernel support."
+            )
+        if self._prefill_fn is None:
+            raise RuntimeError("FlashInfer GDN prefill kernel is unavailable.")
+        if self._mtp_fn is None:
+            raise RuntimeError("FlashInfer GDN MTP (verify) kernel is unavailable.")
+        if self._decode_fn is None:
+            raise RuntimeError("FlashInfer GDN decode kernel is unavailable.")
+
+        logger.info(
+            "K-last mode: Using FlashInfer GDN prefill, decode and MTP (verify) kernels"
+        )
+
+    # ---- decode ----
+
+    def decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        a: torch.Tensor,
+        b: torch.Tensor,
+        *,
+        A_log: torch.Tensor,
+        dt_bias: torch.Tensor,
+        ssm_states: torch.Tensor,
+        cache_indices: torch.Tensor,
+        query_start_loc: torch.Tensor,
+        **kwargs,
+    ) -> torch.Tensor:
+        """K-last decode using FlashInfer pretranspose kernel (stock, no pool indexing).
+
+        TODO: Once FlashInfer PR#2521 is merged and released, switch back
+        to pool-indexed decode (passing state_indices directly) to avoid
+        the gather/scatter overhead (~7-9% decode regression).
+        https://github.com/flashinfer-ai/flashinfer/pull/2521
+        """
+        batch_size = cache_indices.shape[0]
+        num_heads = q.shape[2]
+        head_k_dim = q.shape[3]
+        num_v_heads = v.shape[2]
+        head_v_dim = v.shape[3]
+
+        query_fi = q.view(batch_size, 1, num_heads, head_k_dim)
+        key_fi = k.view(batch_size, 1, num_heads, head_k_dim)
+        value_fi = v.view(batch_size, 1, num_v_heads, head_v_dim)
+        a_fi = a.view(batch_size, 1, num_v_heads)
+        b_fi = b.view(batch_size, 1, num_v_heads)
+
+        # Gather states from pool
+        state_batch = ssm_states[cache_indices]
+
+        output_fi, new_state = self._decode_fn(
+            q=query_fi,
+            k=key_fi,
+            v=value_fi,
+            state=state_batch,
+            A_log=A_log.detach(),
+            a=a_fi,
+            dt_bias=dt_bias.detach(),
+            b=b_fi,
+            scale=None,
+            output=None,
+            use_qk_l2norm=True,
+        )
+
+        # Scatter updated states back to pool
+        ssm_states[cache_indices] = new_state
+
+        # [bs, 1, HV, V] -> [1, bs, HV, V]
+        return output_fi.view(1, batch_size, num_v_heads, head_v_dim)
+
+    # ---- extend (prefill) ----
+
+    def extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        g: torch.Tensor,
+        beta: torch.Tensor,
+        *,
+        ssm_states: torch.Tensor,
+        cache_indices: torch.Tensor,
+        query_start_loc: torch.Tensor,
+        **kwargs,
+    ) -> tuple:
+        """K-last chunked prefill using FlashInfer GDN prefill kernel.
+
+        The FlashInfer kernel natively supports K-last state layout [N, H, V, K].
+        q and k are L2-normalized before calling the kernel (the kernel is called
+        with ``use_qk_l2norm_in_kernel=False``).
+        """
+        from sglang.srt.layers.attention.fla.l2norm import l2norm_fwd
+
+        # q, k: [1, seq, H, K] -> [seq, H, K]
+        # v:    [1, seq, HV, V] -> [seq, HV, V]
+        total_seq_len = q.shape[1]
+        num_v_heads = v.shape[2]
+        head_v_dim = v.shape[3]
+
+        q_fi = l2norm_fwd(q[0].contiguous())
+        k_fi = l2norm_fwd(k[0].contiguous())
+        v_fi = v[0].contiguous()
+
+        # g (alpha) and beta: [1, seq, HV] -> [seq, HV], float32 for FlashInfer
+        alpha_fi = torch.exp(g[0].to(torch.float32))
+        beta_fi = beta[0].to(torch.float32)
+
+        cu_seqlens_fi = query_start_loc.to(torch.int64)
+
+        # Remap negative padding indices to sentinel slot
+        ssm_cache_indices = torch.where(
+            cache_indices >= 0,
+            cache_indices,
+            ssm_states.shape[0] - 1,
+        ).to(torch.int64)
+
+        # FlashInfer requires float32 initial state, K-last layout [B, HV, V, K]
+        initial_state_fi = ssm_states[ssm_cache_indices].to(torch.float32)
+
+        output_fi, output_state_fi = self._prefill_fn(
+            q=q_fi,
+            k=k_fi,
+            v=v_fi,
+            g=alpha_fi,
+            beta=beta_fi,
+            scale=None,
+            initial_state=initial_state_fi,
+            output_final_state=True,
+            cu_seqlens=cu_seqlens_fi,
+            use_qk_l2norm_in_kernel=False,
+        )
+
+        # Write back state to pool
+        ssm_states.index_copy_(
+            0,
+            ssm_cache_indices,
+            output_state_fi.to(ssm_states.dtype),
+        )
+
+        # Output: [seq, HV, V] -> [1, seq, HV, V]
+        core_attn_out = output_fi.view(1, total_seq_len, num_v_heads, head_v_dim)
+
+        # Return (output, last_recurrent_state, h) to match Triton kernel interface.
+        # h=None since FlashInfer doesn't provide intermediate states
+        # (prefix caching for K-last prefill is not supported).
+        return core_attn_out, None, None
+
+    # ---- target_verify (MTP) ----
+
+    def target_verify(
+        self,
+        A_log: torch.Tensor,
+        dt_bias: torch.Tensor,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        a: torch.Tensor,
+        b: torch.Tensor,
+        *,
+        ssm_states: torch.Tensor,
+        cache_indices: torch.Tensor,
+        query_start_loc: torch.Tensor,
+        intermediate_states_buffer: torch.Tensor,
+        intermediate_state_indices: torch.Tensor,
+        cache_steps: int,
+        retrieve_parent_token: torch.Tensor,
+        **kwargs,
+    ) -> torch.Tensor:
+        """K-last MTP verify using FlashInfer GDN MTP kernel.
+
+        Only supports topk=1 (retrieve_parent_token must be None).
+        """
+        if retrieve_parent_token is not None:
+            raise RuntimeError(
+                "FlashInfer GDN verify kernel only supports topk=1 "
+                "(retrieve_parent_token must be None)."
+            )
+
+        # Recover batch_size and draft_token_num from g shape
+        # g: [1, seq_len, HV] where seq_len = batch_size * draft_token_num
+        seq_len = q.shape[1]
+        batch_size = query_start_loc.shape[0] - 1
+        draft_token_num = seq_len // batch_size
+
+        num_heads = q.shape[2]
+        head_k_dim = q.shape[3]
+        num_v_heads = v.shape[2]
+        head_v_dim = v.shape[3]
+
+        # Reshape [1, seq, H, D] -> [B, T, H, D] for FlashInfer MTP
+        query_mtp = q.view(batch_size, draft_token_num, num_heads, head_k_dim)
+        key_mtp = k.view(batch_size, draft_token_num, num_heads, head_k_dim)
+        value_mtp = v.view(batch_size, draft_token_num, num_v_heads, head_v_dim)
+
+        # a, b from g/beta: [1, seq, HV] -> [B, T, HV]
+        if a is None or b is None or A_log is None or dt_bias is None:
+            raise RuntimeError(
+                "FlashInfer GDN MTP kernel requires a_raw, b_raw, A_log, "
+                "dt_bias to be passed via kwargs."
+            )
+
+        a_mtp = a.view(batch_size, draft_token_num, num_v_heads)
+        b_mtp = b.view(batch_size, draft_token_num, num_v_heads)
+
+        output_fi, _ = self._mtp_fn(
+            q=query_mtp,
+            k=key_mtp,
+            v=value_mtp,
+            initial_state=ssm_states,
+            initial_state_indices=cache_indices,
+            A_log=A_log.detach(),
+            a=a_mtp,
+            dt_bias=dt_bias.detach(),
+            b=b_mtp,
+            scale=None,
+            output=None,
+            intermediate_states_buffer=intermediate_states_buffer,
+            disable_state_update=True,
+            use_qk_l2norm=True,
+        )
+
+        # [B, T, HV, V] -> [1, seq_len, HV, V]
+        return output_fi.view(1, seq_len, num_v_heads, head_v_dim)
diff --git a/sglang/python/sglang/srt/layers/attention/linear/kernels/gdn_triton.py b/sglang/python/sglang/srt/layers/attention/linear/kernels/gdn_triton.py
new file mode 100644
index 0000000000000000000000000000000000000000..106113cda3fb19aeb7dbd641bc6492ab7c3db529
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/linear/kernels/gdn_triton.py
@@ -0,0 +1,136 @@
+import torch
+
+from sglang.srt.layers.attention.linear.kernels.kernel_backend import (
+    LinearAttnKernelBase,
+)
+from sglang.srt.utils import is_cpu, is_npu
+
+if not is_cpu():
+    from sglang.srt.layers.attention.fla.chunk import chunk_gated_delta_rule
+    from sglang.srt.layers.attention.fla.fused_sigmoid_gating_recurrent import (
+        fused_sigmoid_gating_delta_rule_update,
+    )
+
+if is_npu():
+    from sgl_kernel_npu.fla.chunk import chunk_gated_delta_rule_npu
+    from sgl_kernel_npu.fla.fused_sigmoid_gating_recurrent import (
+        fused_sigmoid_gating_delta_rule_update_npu,
+    )
+
+    chunk_gated_delta_rule = chunk_gated_delta_rule_npu
+    fused_sigmoid_gating_delta_rule_update = fused_sigmoid_gating_delta_rule_update_npu
+elif is_cpu():
+    from sgl_kernel.mamba import chunk_gated_delta_rule_cpu
+
+    chunk_gated_delta_rule = chunk_gated_delta_rule_cpu
+    fused_sigmoid_gating_delta_rule_update = (
+        torch.ops.sgl_kernel.fused_sigmoid_gating_delta_rule_update_cpu
+    )
+
+
+class TritonGDNKernel(LinearAttnKernelBase):
+    """Triton-based kernel for GDN (Gated Delta Network) linear attention."""
+
+    def decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        a: torch.Tensor,
+        b: torch.Tensor,
+        *,
+        A_log: torch.Tensor,
+        dt_bias: torch.Tensor,
+        ssm_states: torch.Tensor,
+        cache_indices: torch.Tensor,
+        query_start_loc: torch.Tensor,
+        **kwargs,
+    ) -> torch.Tensor:
+        return fused_sigmoid_gating_delta_rule_update(
+            A_log=A_log,
+            dt_bias=dt_bias,
+            q=q,
+            k=k,
+            v=v,
+            a=a,
+            b=b,
+            initial_state_source=ssm_states,
+            initial_state_indices=cache_indices,
+            cu_seqlens=query_start_loc,
+            use_qk_l2norm_in_kernel=True,
+            softplus_beta=1.0,
+            softplus_threshold=20.0,
+        )
+
+    def extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        g: torch.Tensor,
+        beta: torch.Tensor,
+        *,
+        ssm_states: torch.Tensor,
+        cache_indices: torch.Tensor,
+        query_start_loc: torch.Tensor,
+        **kwargs,
+    ) -> tuple:
+        recurrent_state = ssm_states
+        recurrent_state_indices_args = {"initial_state_indices": cache_indices}
+        if is_npu() or is_cpu():
+            recurrent_state = ssm_states[cache_indices]
+            recurrent_state_indices_args = {}
+        return chunk_gated_delta_rule(
+            q=q,
+            k=k,
+            v=v,
+            g=g,
+            beta=beta,
+            initial_state=recurrent_state,
+            cu_seqlens=query_start_loc,
+            head_first=False,
+            use_qk_l2norm_in_kernel=True,
+            **recurrent_state_indices_args,
+        )
+
+    def target_verify(
+        self,
+        A_log: torch.Tensor,
+        dt_bias: torch.Tensor,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        a: torch.Tensor,
+        b: torch.Tensor,
+        *,
+        ssm_states: torch.Tensor,
+        cache_indices: torch.Tensor,
+        query_start_loc: torch.Tensor,
+        intermediate_states_buffer: torch.Tensor,
+        intermediate_state_indices: torch.Tensor,
+        cache_steps: int,
+        retrieve_parent_token: torch.Tensor,
+        **kwargs,
+    ) -> torch.Tensor:
+        return fused_sigmoid_gating_delta_rule_update(
+            A_log=A_log,
+            dt_bias=dt_bias,
+            q=q,
+            k=k,
+            v=v,
+            a=a,
+            b=b,
+            initial_state_source=ssm_states,
+            initial_state_indices=cache_indices,
+            cu_seqlens=query_start_loc,
+            use_qk_l2norm_in_kernel=True,
+            softplus_beta=1.0,
+            softplus_threshold=20.0,
+            is_kda=False,
+            # target_verify specific parameters
+            disable_state_update=True,
+            intermediate_states_buffer=intermediate_states_buffer,
+            intermediate_state_indices=intermediate_state_indices,
+            cache_steps=cache_steps,
+            retrieve_parent_token=retrieve_parent_token,
+        )
diff --git a/sglang/python/sglang/srt/layers/attention/linear/kernels/kda_triton.py b/sglang/python/sglang/srt/layers/attention/linear/kernels/kda_triton.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ab8a5e0bae6e3dd0a1c72bb700c67a16cb6bc6f
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/linear/kernels/kda_triton.py
@@ -0,0 +1,73 @@
+import torch
+
+from sglang.srt.layers.attention.linear.kernels.kernel_backend import (
+    LinearAttnKernelBase,
+)
+from sglang.srt.utils import is_cpu
+
+if not is_cpu():
+    from sglang.srt.layers.attention.fla.fused_sigmoid_gating_recurrent import (
+        fused_sigmoid_gating_delta_rule_update,
+    )
+    from sglang.srt.layers.attention.fla.kda import chunk_kda
+
+
+class TritonKDAKernel(LinearAttnKernelBase):
+    """Triton-based kernel for KDA (Kimi Delta Attention) linear attention."""
+
+    def decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        a: torch.Tensor,
+        b: torch.Tensor,
+        *,
+        A_log: torch.Tensor,
+        dt_bias: torch.Tensor,
+        ssm_states: torch.Tensor,
+        cache_indices: torch.Tensor,
+        query_start_loc: torch.Tensor,
+        **kwargs,
+    ) -> torch.Tensor:
+        return fused_sigmoid_gating_delta_rule_update(
+            A_log=A_log,
+            dt_bias=dt_bias,
+            q=q,
+            k=k,
+            v=v,
+            a=a,
+            b=b,
+            initial_state_source=ssm_states,
+            initial_state_indices=cache_indices,
+            cu_seqlens=query_start_loc,
+            use_qk_l2norm_in_kernel=True,
+            softplus_beta=1.0,
+            softplus_threshold=20.0,
+            is_kda=True,
+        )
+
+    def extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        g: torch.Tensor,
+        beta: torch.Tensor,
+        *,
+        ssm_states: torch.Tensor,
+        cache_indices: torch.Tensor,
+        query_start_loc: torch.Tensor,
+        **kwargs,
+    ) -> torch.Tensor:
+        return chunk_kda(
+            q=q,
+            k=k,
+            v=v,
+            g=g,
+            beta=beta,
+            initial_state=ssm_states,
+            initial_state_indices=cache_indices,
+            use_qk_l2norm_in_kernel=True,
+            cu_seqlens=query_start_loc,
+        )
diff --git a/sglang/python/sglang/srt/layers/attention/linear/kernels/kernel_backend.py b/sglang/python/sglang/srt/layers/attention/linear/kernels/kernel_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e8018699e1adc6ebfd72bf50166bdc889153c4d
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/linear/kernels/kernel_backend.py
@@ -0,0 +1,62 @@
+from abc import ABC, abstractmethod
+
+import torch
+
+
+class LinearAttnKernelBase(ABC):
+    """Abstract base class for linear attention kernel implementations.
+
+    Each concrete implementation wraps a specific kernel (Triton, CuTe DSL, etc.)
+    and provides decode/extend/target_verify methods with a unified interface.
+    """
+
+    @abstractmethod
+    def decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        a: torch.Tensor,
+        b: torch.Tensor,
+        *,
+        A_log: torch.Tensor,
+        dt_bias: torch.Tensor,
+        ssm_states: torch.Tensor,
+        cache_indices: torch.Tensor,
+        query_start_loc: torch.Tensor,
+        **kwargs,
+    ) -> torch.Tensor: ...
+
+    @abstractmethod
+    def extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        g: torch.Tensor,
+        beta: torch.Tensor,
+        *,
+        ssm_states: torch.Tensor,
+        cache_indices: torch.Tensor,
+        query_start_loc: torch.Tensor,
+        **kwargs,
+    ) -> tuple: ...
+
+    def target_verify(
+        self,
+        A_log: torch.Tensor,
+        dt_bias: torch.Tensor,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        a: torch.Tensor,
+        b: torch.Tensor,
+        *,
+        ssm_states: torch.Tensor,
+        cache_indices: torch.Tensor,
+        query_start_loc: torch.Tensor,
+        **kwargs,
+    ) -> torch.Tensor:
+        raise NotImplementedError(
+            f"{self.__class__.__name__} does not support target_verify"
+        )
diff --git a/sglang/python/sglang/srt/layers/attention/linear/lightning_attn.py b/sglang/python/sglang/srt/layers/attention/linear/lightning_attn.py
new file mode 100644
index 0000000000000000000000000000000000000000..d415eefc90fa2f8e82eb4a062fd6fe07d38cc71f
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/linear/lightning_attn.py
@@ -0,0 +1,767 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/mamba/linear_attn.py
+
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+import triton
+import triton.language as tl
+from einops import rearrange
+
+
+@triton.jit
+def _fwd_diag_kernel(
+    Q,
+    K,
+    V,
+    Out,
+    S,
+    b: tl.constexpr,
+    h: tl.constexpr,
+    n,
+    d: tl.constexpr,
+    e: tl.constexpr,
+    BLOCK: tl.constexpr,
+    NUM_BLOCK,
+    CBLOCK: tl.constexpr,
+):
+    # This kernel computes the diagonal blocks of the attention matrix
+    # Each diagonal block represents attention
+    # where queries attend to keys in the same block
+    off = tl.program_id(0)
+    off_bh = off // NUM_BLOCK  # batch-head index
+    off_block = off % NUM_BLOCK  # block index within the sequence
+    off_cblock = tl.program_id(1)  # sub-block index within a block
+
+    off_h = off_bh % h  # head index
+
+    # Calculate base offsets for the current batch and head
+    qk_offset = off_bh * n * d
+    v_offset = off_bh * n * e
+    o_offset = off_bh * n * e
+
+    # Calculate offsets for the current block
+    block_offset = off_block * BLOCK
+    qk_block_offset = block_offset * d
+    v_block_offset = block_offset * e
+    o_block_offset = block_offset * e
+
+    # Calculate offsets for the current sub-block
+    cblock_offset = off_cblock * CBLOCK
+    q_cblock_offset = cblock_offset * d
+    o_cblock_offset = cblock_offset * e
+
+    # Calculate pointers to the query, key, value, and output tensors
+    Q_block_ptr = (
+        Q
+        + qk_offset
+        + qk_block_offset
+        + q_cblock_offset
+        + tl.arange(0, CBLOCK)[:, None] * d
+        + tl.arange(0, d)[None, :]
+    )
+    K_trans_block_ptr = (
+        K
+        + qk_offset
+        + qk_block_offset
+        + tl.arange(0, CBLOCK)[None, :] * d
+        + tl.arange(0, d)[:, None]
+    )
+    V_block_ptr = (
+        V
+        + v_offset
+        + v_block_offset
+        + tl.arange(0, CBLOCK)[:, None] * e
+        + tl.arange(0, e)[None, :]
+    )
+    O_block_ptr = (
+        Out
+        + o_offset
+        + o_block_offset
+        + o_cblock_offset
+        + tl.arange(0, CBLOCK)[:, None] * e
+        + tl.arange(0, e)[None, :]
+    )
+
+    # Load the decay rate for the current head
+    S_block_ptr = S + off_h
+    s = tl.load(S_block_ptr)
+
+    i = off_cblock
+    q_index = tl.arange(0, CBLOCK) + i * CBLOCK
+
+    # Load query values
+    q = tl.load(Q_block_ptr, mask=block_offset + q_index[:, None] < n, other=0.0).to(
+        tl.float32
+    )
+
+    # Initialize output accumulator
+    qkv = tl.zeros([CBLOCK, e], dtype=tl.float32)
+
+    # Process all sub-blocks up to and
+    # including the current one (causal attention)
+    for j in range(i + 1):
+        kv_index = tl.arange(0, CBLOCK) + j * CBLOCK
+        diff = q_index[:, None] - kv_index[None, :]
+        s_index = s * diff
+        # Apply causal mask: only attend to positions before the current one
+        s_index = tl.where(diff >= 0, -s_index, float("-inf"))
+        decay = tl.exp(s_index)
+
+        # Load key and value
+        k_trans = tl.load(
+            K_trans_block_ptr,
+            mask=block_offset + kv_index[None, :] < n,
+            other=0.0,
+        ).to(tl.float32)
+        v = tl.load(
+            V_block_ptr,
+            mask=block_offset + kv_index[:, None] < n,
+            other=0.0,
+        ).to(tl.float32)
+
+        # Compute attention scores and apply decay
+        qk = tl.dot(q, k_trans) * decay
+
+        # Compute weighted values and accumulate
+        qkv += tl.dot(qk, v)
+
+        # Move to the next sub-block
+        K_trans_block_ptr += CBLOCK * d
+        V_block_ptr += CBLOCK * e
+
+    # Store the result
+    tl.store(
+        O_block_ptr,
+        qkv.to(O_block_ptr.dtype.element_ty),
+        mask=block_offset + q_index[:, None] < n,
+    )
+
+
+@triton.jit
+def _fwd_kv_parallel(
+    K,
+    V,
+    K_decay,
+    KV,
+    b: tl.constexpr,
+    h: tl.constexpr,
+    n,
+    d: tl.constexpr,
+    e: tl.constexpr,
+    BLOCK: tl.constexpr,
+    NUM_BLOCK,
+    D_FBLOCK: tl.constexpr,
+    E_FBLOCK: tl.constexpr,
+    NUM_FBLOCK: tl.constexpr,
+    CBLOCK: tl.constexpr,
+    NUM_CBLOCK: tl.constexpr,
+):
+    # This kernel computes the key-value outer
+    # products for each block in parallel
+    off_bh = tl.program_id(0)  # batch-head index
+    off_block = tl.program_id(1)  # block index
+
+    off_h = off_bh % h  # head index
+
+    block_offset = off_block * BLOCK
+
+    # Calculate offsets for the current block
+    k_block_offset = block_offset * d
+    v_block_offset = block_offset * e
+    kv_block_offset = off_block * d * e
+
+    # Calculate base offsets for the current batch and head
+    k_offset = off_bh * n * d
+    v_offset = off_bh * n * e
+    kv_offset = off_bh * NUM_BLOCK * d * e
+
+    # Calculate pointers to the key, value, and key-value tensors
+    K_trans_block_ptr = (
+        K
+        + k_offset
+        + k_block_offset
+        + tl.arange(0, CBLOCK)[None, :] * d
+        + tl.arange(0, D_FBLOCK)[:, None]
+    )
+    V_block_ptr = (
+        V
+        + v_offset
+        + v_block_offset
+        + tl.arange(0, CBLOCK)[:, None] * e
+        + tl.arange(0, E_FBLOCK)[None, :]
+    )
+    KV_block_ptr = (
+        KV
+        + kv_offset
+        + kv_block_offset
+        + tl.arange(0, D_FBLOCK)[:, None] * e
+        + tl.arange(0, E_FBLOCK)[None, :]
+    )
+
+    # Load the decay factors for the current head and block
+    k_decay_ptr = K_decay + off_h * BLOCK + tl.arange(0, CBLOCK)[None, :]
+
+    kv_index = tl.arange(0, CBLOCK)
+
+    # Initialize the key-value outer product accumulator
+    kv = tl.zeros([D_FBLOCK, E_FBLOCK], dtype=tl.float32)
+
+    # Handle the last block which might be smaller than BLOCK
+    if off_block == NUM_BLOCK - 1:
+        split_n = n - (NUM_BLOCK - 1) * BLOCK
+    else:
+        split_n = BLOCK
+    left_shift = tl.cdiv(split_n, CBLOCK) * CBLOCK - split_n
+    num_blocks = min(tl.cdiv(split_n, CBLOCK), NUM_CBLOCK)
+    k_decay_ptr += (NUM_CBLOCK - num_blocks) * CBLOCK
+
+    # Process all sub-blocks in the current block
+    for j in range(num_blocks):
+        left_bound = (1 - j) * left_shift
+        # Load key and value, handling boundary conditions
+        k_trans = tl.load(
+            K_trans_block_ptr - left_shift * d,
+            mask=kv_index[None, :] >= left_bound,
+            other=0.0,
+        )
+        v = tl.load(
+            V_block_ptr - left_shift * e,
+            mask=kv_index[:, None] >= left_bound,
+            other=0.0,
+        )
+
+        # Load decay factor and compute weighted key-value outer product
+        k_decay = tl.load(k_decay_ptr)
+        kv += tl.dot(k_trans * k_decay, v)
+
+        # Move to the next sub-block
+        K_trans_block_ptr += CBLOCK * d
+        V_block_ptr += CBLOCK * e
+        k_decay_ptr += CBLOCK
+
+    # Store the result
+    tl.store(KV_block_ptr, kv.to(KV_block_ptr.dtype.element_ty))
+
+
+@triton.jit
+def _fwd_kv_reduce(
+    S,
+    KV,
+    KV_HISTORY,
+    b: tl.constexpr,
+    h: tl.constexpr,
+    n,
+    d: tl.constexpr,
+    e: tl.constexpr,
+    BLOCK: tl.constexpr,
+    NUM_BLOCK,
+    D_FBLOCK: tl.constexpr,
+    E_FBLOCK: tl.constexpr,
+):
+    # This kernel reduces the key-value outer products
+    # across blocks and updates the KV history
+    off_bh = tl.program_id(0)  # batch-head index
+    off_h = off_bh % h  # head index
+
+    kv_offset = off_bh * NUM_BLOCK * d * e
+
+    # Calculate pointer to the key-value tensor
+    KV_block_ptr = (
+        KV
+        + kv_offset
+        + tl.arange(0, D_FBLOCK)[:, None] * e
+        + tl.arange(0, E_FBLOCK)[None, :]
+    )
+
+    # Load the decay rate for the current head
+    s_ptrs = S + off_h
+    s = tl.load(s_ptrs)
+
+    # Calculate pointer to the key-value history tensor
+    kv_history_offset = off_bh * d * e
+    KV_HISTORY_block_ptr = (
+        KV_HISTORY
+        + kv_history_offset
+        + tl.arange(0, D_FBLOCK)[:, None] * e
+        + tl.arange(0, E_FBLOCK)[None, :]
+    )
+
+    # Load the previous key-value history
+    kv_pre = tl.load(KV_HISTORY_block_ptr).to(tl.float32)
+
+    # Process all blocks in reverse order to compute the prefix sum
+    for i in range(NUM_BLOCK):
+        block_size = min(n - i * BLOCK, BLOCK)
+        # Compute decay factor for the current block
+        block_decay = tl.exp(-s.to(tl.float32) * block_size)
+
+        # Load the current key-value outer product
+        kv_cur = tl.load(KV_block_ptr).to(tl.float32)
+        # Store the previous key-value history to the current block
+        tl.store(KV_block_ptr, kv_pre.to(KV_block_ptr.dtype.element_ty))
+
+        # Update the key-value history with the current block
+        kv_pre = block_decay * kv_pre + kv_cur
+        KV_block_ptr += d * e
+
+    # Store the updated key-value history
+    tl.store(KV_HISTORY_block_ptr, kv_pre)
+
+
+@triton.jit
+def _fwd_none_diag_kernel(
+    Q,
+    Out,
+    S,
+    KV,
+    b: tl.constexpr,
+    h: tl.constexpr,
+    n,
+    d: tl.constexpr,
+    e: tl.constexpr,
+    BLOCK: tl.constexpr,
+    NUM_BLOCK,
+    E_FBLOCK: tl.constexpr,
+    CBLOCK: tl.constexpr,
+    NUM_CBLOCK: tl.constexpr,
+):
+    # This kernel computes the non-diagonal blocks of the attention matrix
+    # Each non-diagonal block represents attention
+    # where queries attend to keys in different blocks
+    off_bh = tl.program_id(0)  # batch-head index
+    off_h = off_bh % h  # head index
+
+    off_nc = tl.program_id(1)
+    off_n = off_nc // NUM_CBLOCK  # block index
+    off_c = off_nc % NUM_CBLOCK  # sub-block index
+    off_e = tl.program_id(2)  # output feature block index
+
+    n_offset = off_n * BLOCK
+    c_offset = off_c * CBLOCK
+    e_offset = off_e * E_FBLOCK
+    block_offset = n_offset + c_offset
+
+    # Calculate offsets for the current batch, head, and block
+    q_offset = off_bh * n * d + (n_offset + c_offset) * d
+    o_offset = off_bh * n * e + (n_offset + c_offset) * e + e_offset
+    kv_offset = off_bh * NUM_BLOCK * d * e + off_n * d * e + e_offset
+
+    # Calculate pointers to the query, output, and key-value tensors
+    Q_block_ptr = (
+        Q + q_offset + tl.arange(0, CBLOCK)[:, None] * d + tl.arange(0, d)[None, :]
+    )
+    O_block_ptr = (
+        Out
+        + o_offset
+        + tl.arange(0, CBLOCK)[:, None] * e
+        + tl.arange(0, E_FBLOCK)[None, :]
+    )
+    KV_block_ptr = (
+        KV + kv_offset + tl.arange(0, d)[:, None] * e + tl.arange(0, E_FBLOCK)[None, :]
+    )
+
+    # Load the decay rate for the current head
+    S_block_ptr = S + off_h
+    s = tl.load(S_block_ptr)
+
+    c_array = tl.arange(0, CBLOCK)
+
+    # Load the key-value outer product for the current block
+    kv = tl.load(KV_block_ptr).to(tl.float32)
+    q_index = block_offset + tl.arange(0, CBLOCK)
+
+    # Load query values
+    q = tl.load(Q_block_ptr, mask=q_index[:, None] < n, other=0.0).to(tl.float32)
+
+    # Compute decay factors for the current sub-block
+    q_decay = tl.exp(-s.to(tl.float32) * (off_c * CBLOCK + c_array[:, None]))
+
+    # Compute non-diagonal attention output
+    qkv_none_diag = tl.dot(q, kv) * q_decay
+
+    # Load diagonal attention output (computed by _fwd_diag_kernel)
+    qkv_diag = tl.load(O_block_ptr, mask=q_index[:, None] < n, other=0.0).to(tl.float32)
+
+    # Combine diagonal and non-diagonal attention outputs
+    qkv = qkv_diag + qkv_none_diag
+
+    # Store the result
+    tl.store(
+        O_block_ptr, qkv.to(O_block_ptr.dtype.element_ty), mask=q_index[:, None] < n
+    )
+
+
+class _attention(torch.autograd.Function):
+
+    @staticmethod
+    def forward(ctx, q, k, v, s, kv_history):
+        # Forward pass of the lightning attention algorithm
+        q = q.contiguous()
+        k = k.contiguous()
+        v = v.contiguous()
+        s = s.contiguous()
+
+        # Check CUDA compute capability
+        capability = torch.cuda.get_device_capability()
+        if capability[0] < 8:
+            raise RuntimeError(
+                "Flash attention currently only supported",
+                "for compute capability >= 80",
+            )
+
+        # Get input dimensions
+        b, h, n, d = q.shape
+        e = v.shape[-1]
+
+        # Initialize output tensor
+        o = torch.empty((b, h, n, e), dtype=q.dtype, device=q.device)
+
+        # Set block sizes
+        BLOCK = 256
+        NUM_BLOCK = triton.cdiv(n, BLOCK)
+
+        CBLOCK = 32
+        NUM_CBLOCK = BLOCK // CBLOCK
+        assert BLOCK % CBLOCK == 0, "BLOCK must be a multiple of CBLOCK"
+
+        # Compute decay factors for keys
+        array = torch.arange(0, BLOCK, device=q.device) + 1
+        k_decay = torch.exp(-s * (BLOCK - array.reshape(1, -1)))
+
+        # Step 1: Compute diagonal blocks of attention
+        grid = (b * h * NUM_BLOCK, NUM_CBLOCK)
+        _fwd_diag_kernel[grid](
+            q,
+            k,
+            v,
+            o,
+            s,
+            b,
+            h,
+            n,
+            d,
+            e,
+            BLOCK=BLOCK,
+            NUM_BLOCK=NUM_BLOCK,
+            CBLOCK=CBLOCK,
+        )
+
+        # Set feature block sizes
+        NUM_FBLOCK = 1
+        D_FBLOCK = d // NUM_FBLOCK
+        assert d % NUM_FBLOCK == 0
+        E_FBLOCK = e // NUM_FBLOCK
+        assert e % NUM_FBLOCK == 0
+
+        CBLOCK = 64
+        NUM_CBLOCK = BLOCK // CBLOCK
+        assert BLOCK % CBLOCK == 0, "BLOCK must be a multiple of CBLOCK"
+
+        # Step 2: Compute key-value outer products for each block in parallel
+        kv = torch.empty((b, h, NUM_BLOCK, d, e), dtype=torch.float32, device=q.device)
+        grid = (b * h, NUM_BLOCK)
+        _fwd_kv_parallel[grid](
+            k,
+            v,
+            k_decay,
+            kv,
+            b,
+            h,
+            n,
+            d,
+            e,
+            BLOCK=BLOCK,
+            NUM_BLOCK=NUM_BLOCK,
+            D_FBLOCK=D_FBLOCK,
+            E_FBLOCK=E_FBLOCK,
+            NUM_FBLOCK=NUM_FBLOCK,
+            CBLOCK=CBLOCK,
+            NUM_CBLOCK=NUM_CBLOCK,
+        )
+
+        # Step 3: Reduce key-value outer products
+        # across blocks and update KV history
+        grid = (b * h, NUM_FBLOCK)
+        _fwd_kv_reduce[grid](
+            s,
+            kv,
+            kv_history,
+            b,
+            h,
+            n,
+            d,
+            e,
+            BLOCK=BLOCK,
+            NUM_BLOCK=NUM_BLOCK,
+            D_FBLOCK=D_FBLOCK,
+            E_FBLOCK=E_FBLOCK,
+        )
+
+        # Step 4: Compute non-diagonal blocks of attention
+        grid = (b * h, NUM_BLOCK * NUM_CBLOCK)
+        _fwd_none_diag_kernel[grid](
+            q,
+            o,
+            s,
+            kv,
+            b,
+            h,
+            n,
+            d,
+            e,
+            BLOCK=BLOCK,
+            NUM_BLOCK=NUM_BLOCK,
+            E_FBLOCK=E_FBLOCK,
+            CBLOCK=CBLOCK,
+            NUM_CBLOCK=NUM_CBLOCK,
+        )
+
+        # Save tensors for backward pass
+        ctx.save_for_backward(q, k, v, s, kv)
+        ctx.BLOCK = BLOCK
+
+        return o, torch.cat([kv, kv_history.unsqueeze(2)], dim=2)
+
+
+# Apply the lightning attention function
+lightning_attention_ = _attention.apply
+
+
+def lightning_attention(q, k, v, ed, block_size=256, kv_history=None):
+    """
+    Apply lightning attention algorithm
+    to compute attention efficiently.
+
+    Args:
+        q: Query tensor of shape [batch, heads, seq_len, dim]
+        k: Key tensor of shape [batch, heads, seq_len, dim]
+        v: Value tensor of shape [batch, heads, seq_len, dim_v]
+        ed: Decay rate tensor of shape [heads]
+        block_size: Size of blocks for block-sparse attention
+        kv_history: Optional key-value history from previous computations
+
+    Returns:
+        output: Attention output
+        kv: Updated key-value history
+    """
+    d = q.shape[-1]
+    e = v.shape[-1]
+
+    if ed.dim() == 1:
+        ed = ed.view(1, -1, 1, 1)
+
+    # Split the computation into chunks for better parallelism
+    m = 128 if d >= 128 else 64
+    assert d % m == 0, f"Dimension d ({d}) must be divisible by m ({m})"
+    arr = [m * i for i in range(d // m + 1)]
+    if arr[-1] != d:
+        arr.append(d)
+    n = len(arr)
+    output = 0
+
+    # Initialize or clone key-value history
+    if kv_history is None:
+        kv_history = torch.zeros(
+            (q.shape[0], q.shape[1], d, e), dtype=torch.float32, device=q.device
+        )
+    else:
+        kv_history = kv_history.clone().contiguous()
+
+    # Process each chunk and accumulate results
+    for i in range(n - 1):
+        s = arr[i]
+        e = arr[i + 1]
+        q1 = q[..., s:e]
+        k1 = k[..., s:e]
+        o, kv = lightning_attention_(q1, k1, v, ed, kv_history)
+        output = output + o
+    return output, kv
+
+
+@triton.jit
+def _linear_attn_decode_kernel(
+    q_ptr,
+    k_ptr,
+    v_ptr,
+    kv_cache_ptr,
+    slope_rate,
+    slot_idx,
+    output_ptr,
+    D: tl.constexpr,
+    qkv_b_stride,
+    qkv_h_stride,
+    cache_b_stride,
+    cache_h_stride,
+    cache_d0_stride,
+    cache_d1_stride,
+    BLOCK_SIZE: tl.constexpr,
+):
+    """
+    Kernel for linear attention decoding with KV cache.
+
+    This kernel computes attention for a single token using the KV cache.
+    """
+    pid_b = tl.program_id(0)  # batch index
+    pid_h = tl.program_id(1)  # head index
+    pid_d = tl.program_id(2)  # dimension block index
+
+    # Load slot index for the current batch
+    slot_id = tl.load(slot_idx + pid_b)
+
+    # Skip if slot_id is -1 (padding)
+    if slot_id == -1:
+        return
+
+    batch_id = pid_b
+    head_id = pid_h
+
+    # Load decay rate for the current head
+    ratio = tl.load(slope_rate + pid_h)
+
+    # Calculate offsets for dimensions
+    qk_d_offsets = tl.arange(0, D)
+    v_d_offsets = tl.arange(0, BLOCK_SIZE) + pid_d * BLOCK_SIZE
+    cache_d_offsets = (
+        qk_d_offsets[:, None] * cache_d0_stride + v_d_offsets[None, :] * cache_d1_stride
+    )
+
+    # Calculate offsets for the current batch and head
+    q_offset = batch_id * qkv_b_stride + head_id * qkv_h_stride
+    k_offset = batch_id * qkv_b_stride + head_id * qkv_h_stride
+    v_offset = batch_id * qkv_b_stride + head_id * qkv_h_stride
+
+    cache_offset = slot_id * cache_b_stride + head_id * cache_h_stride
+
+    # Create masks for loading tensors
+    qk_mask = qk_d_offsets < D
+    v_mask = v_d_offsets < D
+
+    # Load query, key, and value tensors
+    q = tl.load(q_ptr + q_offset + qk_d_offsets, mask=qk_mask, other=0.0)
+    k = tl.load(k_ptr + k_offset + qk_d_offsets, mask=qk_mask, other=0.0)
+    v = tl.load(v_ptr + v_offset + v_d_offsets, mask=v_mask, other=0.0)
+
+    # Compute key-value outer product
+    kv_outer = k[:, None] * v[None, :]
+    kv_mask = qk_mask[:, None] & v_mask[None, :]
+
+    # Apply decay to previous KV cache
+    ratio = tl.exp(-ratio)
+    kv_ptr = kv_cache_ptr + cache_offset + cache_d_offsets
+    kv_cache_old = tl.load(kv_ptr, mask=kv_mask, other=0.0)
+    kv_outer = kv_outer + ratio * kv_cache_old
+
+    # Compute attention output
+    output = q[:, None].to(tl.float32) * kv_outer
+    output = tl.sum(output, axis=0)
+
+    # Update KV cache and store output
+    tl.store(kv_ptr, kv_outer, mask=kv_mask)
+    tl.store(output_ptr + q_offset + v_d_offsets, output, mask=v_mask)
+
+
+def linear_decode_forward_triton(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    kv_caches: torch.Tensor,
+    slope_rate: torch.Tensor,
+    slot_idx: torch.Tensor,
+    BLOCK_SIZE: int = 32,
+) -> torch.Tensor:
+    """
+    Perform linear attention decoding using Triton kernels.
+
+    Args:
+        q: Query tensor of shape [B, H, 1, D]
+        k: Key tensor of shape [B, H, 1, D]
+        v: Value tensor of shape [B, H, 1, D]
+        kv_caches: Key-value cache tensor
+        slope_rate: Decay rate tensor
+        slot_idx: Slot indices for batches
+        BLOCK_SIZE: Size of blocks for processing
+
+    Returns:
+        output: Attention output tensor
+    """
+    B, H, _, D = q.shape
+    assert k.shape == (B, H, 1, D)
+    assert v.shape == (B, H, 1, D)
+
+    # Initialize output tensor
+    output = torch.empty_like(q)
+
+    # Set grid dimensions for the kernel
+    grid = (B, H, D // BLOCK_SIZE)
+
+    # Calculate strides for tensors
+    qkv_b_stride = q.stride(0)
+    qkv_h_stride = q.stride(1)
+
+    cache_b_stride = kv_caches.stride(0)
+    cache_h_stride = kv_caches.stride(1)
+    cache_d0_stride = kv_caches.stride(2)
+    cache_d1_stride = kv_caches.stride(3)
+
+    # Launch the kernel
+    _linear_attn_decode_kernel[grid](
+        q,
+        k,
+        v,
+        kv_caches,
+        slope_rate,
+        slot_idx,
+        output,
+        D,
+        qkv_b_stride,
+        qkv_h_stride,
+        cache_b_stride,
+        cache_h_stride,
+        cache_d0_stride,
+        cache_d1_stride,
+        BLOCK_SIZE=BLOCK_SIZE,
+    )
+
+    # Reshape output and return
+    output = rearrange(output, "b h n d -> b n (h d)")
+    return output.squeeze(1).contiguous()
+
+
+class BailingLinearKernel:
+    """
+    Linear attention kernel implementation for Bailing models.
+
+    This class is adapted from MiniMaxText01LinearKernel in vllm:
+    https://github.com/vllm-project/vllm/blob/a9138e85b14047e06300685b48e3485b995425fb/vllm/model_executor/models/minimax_text_01.py#L289
+
+    The implementation maintains the same functionality while being renamed to
+    match our Bailing model naming convention.
+    """
+
+    @staticmethod
+    def jit_linear_forward_prefix(
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        kv_caches: torch.Tensor,
+        slope_rate: torch.Tensor,
+        block_size: int,
+        layer_idx: int = None,
+        **kwargs,
+    ) -> torch.Tensor:
+
+        slope_rate = slope_rate.to(torch.float32)
+        should_pad_dim = q.dim() == 3
+        if should_pad_dim:
+            q = q.unsqueeze(0)
+            k = k.unsqueeze(0)
+            v = v.unsqueeze(0)
+        b, h, n, d = q.shape
+        e = d
+        kv_history = kv_caches.reshape(1, h, d, e).contiguous()
+        output, kv_history = lightning_attention(
+            q, k, v, slope_rate, block_size=block_size, kv_history=kv_history
+        )
+        kv_caches.copy_(kv_history[:, :, -1, :, :].reshape(h, d, e))
+        assert output.shape[0] == 1, "batch size must be 1"
+        return output.squeeze(0).transpose(0, 1).reshape([n, h * d]).contiguous()
diff --git a/sglang/python/sglang/srt/layers/attention/linear/lightning_backend.py b/sglang/python/sglang/srt/layers/attention/linear/lightning_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..b34fefbfd230e10b4398a8c7a5a89e4799a3aeda
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/linear/lightning_backend.py
@@ -0,0 +1,372 @@
+import logging
+import math
+from typing import Optional, Union
+
+import torch
+
+from sglang.srt.layers.attention.hybrid_linear_attn_backend import MambaAttnBackendBase
+from sglang.srt.layers.attention.linear.lightning_attn import (
+    BailingLinearKernel,
+    linear_decode_forward_triton,
+)
+from sglang.srt.layers.attention.linear.linear_metadata import BailingLinearMetadata
+from sglang.srt.layers.attention.linear.seg_la import SegLaMeta, seg_la_fwd
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+from sglang.srt.model_executor.model_runner import ModelRunner
+from sglang.srt.speculative.eagle_info import EagleDraftInput, EagleVerifyInput
+
+logger = logging.getLogger(__name__)
+
+
+class LightningAttentionBackend(MambaAttnBackendBase):
+    """
+    Note about the init:
+    - If no spec decoding
+        - FlashAttentionBackend will be init once when the server starts.
+    - If spec decoding
+        - FlashAttentionBackend will be init once for the target worker
+        - FlashAttentionMultiStepBackend will be once for the draft worker
+            - It will spawn num_steps FlashAttentionBackend for the draft worker
+
+    Note about CUDA Graph:
+    - We only support CUDA Graph for Decode (Normal Decode and Draft Decode) and Target Verify.
+    - We don't support CUDA Graph for Extend and Draft Extend.
+    - When server init, init_cuda_graph_state will be called first and then init_cuda_graph_capture will be called.
+    - For each forward batch, init_replay_cuda_graph will be called first and then replay the graph.
+    """
+
+    def __init__(self, model_runner: ModelRunner):
+        super().__init__(model_runner)
+
+        assert not (
+            model_runner.sliding_window_size is not None
+            and model_runner.model_config.is_encoder_decoder
+        ), "Sliding window and cross attention are not supported together"
+
+        # extra metadata for handling speculative decoding topk > 1, extended draft decode and verify
+        self.max_context_len = model_runner.model_config.context_len
+        self.device = model_runner.device
+        self.decode_cuda_graph_metadata = {}
+        self.kv_cache_dtype = model_runner.kv_cache_dtype
+        self.kv_cache_dtype_str = model_runner.server_args.kv_cache_dtype
+        self.BLOCK = (
+            model_runner.model_config.block
+            if hasattr(model_runner.model_config, "block")
+            else 256
+        )
+        total_num_heads = model_runner.model_config.hf_config.num_attention_heads
+        num_hidden_layers = model_runner.model_config.hf_config.num_hidden_layers
+        self.tp_slope = LightningAttentionBackend._build_slope_tensor(
+            total_num_heads, num_hidden_layers, self.device
+        )
+        self.linear_backend = getattr(
+            model_runner.model_config.hf_config, "linear_backend", "seg_la"
+        )
+        logger.info(
+            f"linear_backend for linear attention in hybrid_linear_backend: {self.linear_backend}"
+        )
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        metadata = self._forward_metadata(forward_batch)
+        self.forward_metadata = BailingLinearMetadata.prepare_mixed(
+            metadata.query_start_loc,
+            metadata.mamba_cache_indices,
+            forward_batch,
+        )
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+    ):
+        metadata = self._capture_metadata(bs, req_pool_indices, forward_mode, spec_info)
+        self.forward_metadata = BailingLinearMetadata.prepare_decode(
+            metadata.query_start_loc, metadata.mamba_cache_indices, bs, seq_lens
+        )
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[Union[EagleDraftInput, EagleVerifyInput]],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+        metadata = self._replay_metadata(
+            bs, req_pool_indices, forward_mode, spec_info, seq_lens_cpu
+        )
+        self.forward_metadata = BailingLinearMetadata.prepare_decode(
+            metadata.query_start_loc, metadata.mamba_cache_indices, bs, seq_lens
+        )
+
+    @staticmethod
+    def _build_slope_tensor(
+        n_attention_heads: int, num_hidden_layers: int, device="cuda"
+    ):
+        def get_slopes(n):
+            def get_slopes_power_of_2(n):
+                start = 2 ** (-(2 ** -(math.log2(n) - 3)))
+                ratio = start
+                return [start * ratio**i for i in range(n)]
+
+            if math.log2(n).is_integer():
+                return get_slopes_power_of_2(n)
+            else:
+                closest_power_of_2 = 2 ** math.floor(math.log2(n))
+                return (
+                    get_slopes_power_of_2(closest_power_of_2)
+                    + get_slopes(2 * closest_power_of_2)[0::2][: n - closest_power_of_2]
+                )
+
+        slopes = torch.tensor(
+            get_slopes(n_attention_heads), dtype=torch.float32
+        ).reshape(n_attention_heads, 1, 1)
+        from sglang.srt.layers.dp_attention import (
+            get_attention_tp_rank,
+            get_attention_tp_size,
+        )
+
+        tp_heads = n_attention_heads // get_attention_tp_size()
+        tp_rank = get_attention_tp_rank()
+        if num_hidden_layers <= 1:
+            slope_rate_list = [slopes * (1 + 1e-5)]
+        else:
+            slope_rate_list = [
+                slopes * (1 - layer_id / (num_hidden_layers - 1) + 1e-5)
+                for layer_id in range(num_hidden_layers)
+            ]
+
+        tp_slope = [
+            slope_rate_list[layer_id][tp_rank * tp_heads : (tp_rank + 1) * tp_heads]
+            .contiguous()
+            .to(device)
+            for layer_id in range(num_hidden_layers)
+        ]
+
+        return tp_slope
+
+    def _prefill_and_mix_infer(
+        self,
+        q,
+        k,
+        v,
+        kv_cache,
+        state_indices_tensor,
+        forward_batch,
+        layer,
+        metadata,
+    ):
+        hidden = []
+        for _prefill_idx in range(metadata.num_prefills):
+            if _prefill_idx >= forward_batch.extend_start_loc.shape[0]:
+                break
+            if _prefill_idx >= state_indices_tensor.shape[0]:
+                break
+
+            _start = forward_batch.extend_start_loc[_prefill_idx]
+
+            if _prefill_idx + 1 < forward_batch.extend_start_loc.shape[0]:
+                _end = forward_batch.extend_start_loc[_prefill_idx + 1]
+            else:
+                if (
+                    forward_batch.extend_seq_lens is not None
+                    and _prefill_idx < forward_batch.extend_seq_lens.shape[0]
+                    and metadata.num_decodes > 0
+                ):
+                    seq_len = forward_batch.extend_seq_lens[_prefill_idx]
+                    _end = _start + seq_len
+                else:
+                    _end = q.shape[0]
+
+            slot_id = state_indices_tensor[_prefill_idx]
+            qs = q[_start:_end].transpose(0, 1).contiguous()
+            ks = k[_start:_end].transpose(0, 1).contiguous()
+            vs = v[_start:_end].transpose(0, 1).contiguous()
+            slice_layer_cache = kv_cache[slot_id, ...]
+            out_slice = BailingLinearKernel.jit_linear_forward_prefix(
+                qs,
+                ks,
+                vs,
+                slice_layer_cache,
+                self.tp_slope[layer.layer_id],
+                self.BLOCK,
+                layer_idx=layer.layer_id,
+            )
+            hidden.append(out_slice.contiguous())
+        if metadata.num_decodes > 0:
+            hidden.append(
+                self._decode_infer(
+                    q, k, v, kv_cache, state_indices_tensor, metadata, layer
+                )
+            )
+
+        if not hidden:
+            return torch.empty((0, q.size(-1)), device=q.device, dtype=q.dtype)
+
+        hidden = torch.concat(hidden, dim=0).contiguous()
+        return hidden
+
+    def _decode_infer(self, q, k, v, kv_cache, state_indices_tensor, metadata, layer):
+        num_prefill_tokens = metadata.num_prefill_tokens
+        num_prefills = metadata.num_prefills
+        q = q[num_prefill_tokens:].unsqueeze(2).contiguous()
+        k = k[num_prefill_tokens:].unsqueeze(2).contiguous()
+        v = v[num_prefill_tokens:].unsqueeze(2).contiguous()
+        slot_id = state_indices_tensor[num_prefills:]
+
+        assert slot_id.shape[0] == q.shape[0], (
+            f"slot_id length {slot_id.shape[0]} does not match decode batch size {q.shape[0]}. "
+            "This indicates a bug in the upstream logic that should be investigated."
+        )
+        hidden = linear_decode_forward_triton(
+            q, k, v, kv_cache, self.tp_slope[layer.layer_id], slot_id, 32
+        )
+        return hidden
+
+    def _linear_attention_entry(
+        self,
+        q,
+        k,
+        v,
+        kv_cache,
+        state_indices_tensor,
+        metadata,
+        layer,
+        mask=None,
+        temp_cache=None,
+        intermediate_state_indices=None,
+    ):
+        q_offsets = metadata.query_start_loc
+
+        seg_meta = SegLaMeta(
+            batch_size=metadata.batch_size,
+            q_offsets=metadata.query_start_loc,
+            s_offsets=state_indices_tensor,
+            q_lengths=q_offsets.diff(),
+            s_scales=metadata.has_initial_states,
+            max_q_length=None,
+            mask=mask,
+        )
+        hidden = seg_la_fwd(
+            q=q,
+            k=k,
+            v=v,
+            s=kv_cache,
+            decay_scales=self.tp_slope[layer.layer_id],
+            meta=seg_meta,
+            caches=temp_cache,
+            cache_indices=intermediate_state_indices,
+            decouple=True,
+        )
+        return hidden
+
+    def forward_extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+        **kwargs,
+    ):
+        q_rope = kwargs["q_rope"] if "q_rope" in kwargs else None
+        k_rope = kwargs["k_rope"] if "k_rope" in kwargs else None
+        layer_id = layer.layer_id if layer else kwargs["layer_id"]
+
+        metadata = self.forward_metadata
+
+        if self.kv_cache_dtype_str != "auto" and layer.k_scale is not None:
+            q = q.to(self.kv_cache_dtype)
+
+        query_start_loc = self.forward_metadata.query_start_loc
+        cache_indices = self.forward_metadata.mamba_cache_indices
+        mamba_cache_params = self.req_to_token_pool.mamba2_layer_cache(layer_id)
+        ssm_states = mamba_cache_params.temporal
+        if self.linear_backend == "minimax":
+            o = self._prefill_and_mix_infer(
+                q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim),
+                k,
+                v,
+                ssm_states,
+                cache_indices,
+                forward_batch,
+                layer,
+                metadata,
+            )
+        elif self.linear_backend == "seg_la":
+            intermediate_state_indices = (
+                torch.arange(
+                    cache_indices.shape[0],
+                    dtype=torch.int32,
+                    device=cache_indices.device,
+                )
+                if forward_batch.forward_mode.is_target_verify()
+                else None
+            )
+            o = self._linear_attention_entry(
+                q,
+                k,
+                v,
+                ssm_states,
+                cache_indices,
+                metadata,
+                layer,
+                temp_cache=(
+                    mamba_cache_params.intermediate_ssm
+                    if forward_batch.forward_mode.is_target_verify()
+                    else None
+                ),
+                intermediate_state_indices=intermediate_state_indices,
+            )
+        else:
+            raise ValueError(
+                f"linear backend: {self.linear_backend} is not support for now"
+            )
+        return o.view(-1, layer.tp_q_head_num * layer.v_head_dim)
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+        **kwargs,
+    ) -> torch.Tensor:
+        q_rope = kwargs["q_rope"] if "q_rope" in kwargs else None
+        k_rope = kwargs["k_rope"] if "k_rope" in kwargs else None
+        layer_id = layer.layer_id if layer else kwargs["layer_id"]
+
+        # Use precomputed metadata across all layers
+        metadata = self.forward_metadata
+
+        if self.kv_cache_dtype_str != "auto":
+            q = q.to(self.kv_cache_dtype)
+
+        # Do linear attention
+        query_start_loc = self.forward_metadata.query_start_loc
+        cache_indices = self.forward_metadata.mamba_cache_indices
+        mamba_cache_params = self.req_to_token_pool.mamba2_layer_cache(layer_id)
+        ssm_states = mamba_cache_params.temporal
+        if self.linear_backend == "minimax":
+            o = self._decode_infer(q, k, v, ssm_states, cache_indices, metadata, layer)
+        elif self.linear_backend == "seg_la":
+            o = self._linear_attention_entry(
+                q, k, v, ssm_states, cache_indices, metadata, layer
+            )
+        else:
+            raise ValueError(
+                f"linear backend: {self.linear_backend} is not support for now"
+            )
+        return o.view(-1, layer.tp_q_head_num * layer.v_head_dim)
diff --git a/sglang/python/sglang/srt/layers/attention/linear/linear_metadata.py b/sglang/python/sglang/srt/layers/attention/linear/linear_metadata.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed2da0409617ea25d3fa33564418814154bab1af
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/linear/linear_metadata.py
@@ -0,0 +1,70 @@
+from dataclasses import dataclass
+
+import torch
+
+from sglang.srt.layers.attention.mamba.mamba2_metadata import ForwardMetadata
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+
+
+@dataclass(kw_only=True)
+class BailingLinearMetadata(ForwardMetadata):
+    num_prefills: int
+    num_prefill_tokens: int
+    num_decodes: int
+    batch_size: int
+    has_initial_states: torch.Tensor
+    q_lengths: torch.Tensor
+
+    @staticmethod
+    def prepare_decode(
+        query_start_loc: torch.Tensor,
+        mamba_cache_indices: torch.Tensor,
+        bs: int,
+        seq_lens: torch.Tensor,
+    ) -> "BailingLinearMetadata":
+        """This path is run during CUDA graph capture, i.e. decode only, so `num_prefills` is 0"""
+        return BailingLinearMetadata(
+            batch_size=bs,
+            query_start_loc=query_start_loc,
+            mamba_cache_indices=mamba_cache_indices,
+            num_decodes=seq_lens.shape[0],
+            num_prefills=0,
+            num_prefill_tokens=0,
+            has_initial_states=torch.ones_like(seq_lens),
+            q_lengths=query_start_loc.diff(),
+        )
+
+    @classmethod
+    def prepare_mixed(
+        cls,
+        query_start_loc: torch.Tensor,
+        mamba_cache_indices: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> "BailingLinearMetadata":
+        """This path cannot run with CUDA graph, as it contains extend requests."""
+        if forward_batch.extend_num_tokens is None:
+            return cls.prepare_decode(
+                query_start_loc=query_start_loc,
+                mamba_cache_indices=mamba_cache_indices,
+                bs=forward_batch.batch_size,
+                seq_lens=forward_batch.seq_lens,
+            )
+        num_prefills = len(forward_batch.extend_seq_lens)
+        num_prefill_tokens = forward_batch.extend_num_tokens
+        num_decodes = len(forward_batch.seq_lens) - num_prefills
+        context_lens_tensor = forward_batch.extend_prefix_lens
+        assert context_lens_tensor is not None
+        has_initial_states = context_lens_tensor > 0
+
+        query_start_loc = query_start_loc[: num_prefills + 1]
+
+        return BailingLinearMetadata(
+            batch_size=forward_batch.batch_size,
+            query_start_loc=query_start_loc,
+            mamba_cache_indices=mamba_cache_indices,
+            num_prefills=num_prefills,
+            num_prefill_tokens=num_prefill_tokens,
+            num_decodes=num_decodes,
+            has_initial_states=has_initial_states,
+            q_lengths=query_start_loc.diff(),
+        )
diff --git a/sglang/python/sglang/srt/layers/attention/linear/seg_la.py b/sglang/python/sglang/srt/layers/attention/linear/seg_la.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b4e68d8d4e71ca1d8c1064bc185586b5fa4dc6c
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/linear/seg_la.py
@@ -0,0 +1,910 @@
+# -*- coding: utf-8 -*-
+"""
+Copyright (c) Ant Financial Service Group and its affiliates.
+"""
+
+# Copied from https://code.alipay.com/pia/PainlessInferenceAcceleration/blob/v0.0.6/flood/flood/ops/seg_la.py
+
+from dataclasses import dataclass
+from typing import Optional
+
+import torch
+import triton
+import triton.language as tl
+
+
+# arg `meta` of `seg_la_fwd` is SegLaMeta
+@dataclass
+class SegLaMeta:
+    batch_size: int  # batch size, num of requests
+    max_q_length: int  # max(seq_lens)
+    q_offsets: torch.Tensor  # [bs+1], query_start_locations,
+    s_offsets: torch.Tensor  # [bs], slot_ids
+    q_lengths: torch.Tensor  # [bs], query length
+    s_scales: torch.Tensor  # [bs], prefill = 0, decode = 1
+    s_offsets_stride: int = 0
+    q_offsets_stride: int = 0
+    s_scales_stride: int = 0
+    decay_scales_stride: int = 0
+    mask: Optional[torch.Tensor] = None  # Currently not supported
+
+
+# fused
+@triton.jit
+def seg_la_kernel(
+    Q,
+    K,
+    V,
+    S,
+    Out,
+    softmax_scale,
+    stride_q,
+    stride_k,
+    stride_v,
+    stride_s,
+    stride_o,
+    s_offsets,
+    q_offsets,
+    q_lengths,
+    s_scales,
+    decay_scales,
+    HEAD_DIM: tl.constexpr,
+    SPLIT_DIM: tl.constexpr,
+    BLOCK: tl.constexpr,
+    EVEN: tl.constexpr,
+    DECOUPLE: tl.constexpr,
+):
+    bid = tl.program_id(0)
+    hid = tl.program_id(1)
+    sid = tl.program_id(2)
+
+    # s_scale is 0 (prefill) or 1 (decode)
+    s_scale = tl.load(s_scales + bid)
+    q_length = tl.load(q_lengths + bid)
+    q_offset = tl.load(q_offsets + bid)
+    s_offset = tl.load(s_offsets + bid)
+    decay_scale = -tl.load(decay_scales + hid)
+
+    offs_b = tl.arange(0, BLOCK)
+    offs_d = tl.arange(0, HEAD_DIM)
+    offs_s = tl.arange(0, SPLIT_DIM)
+
+    if s_offset == -1:
+        return
+
+    q_ptrs = (
+        Q
+        + q_offset * stride_q
+        + hid * HEAD_DIM
+        + (offs_b[:, None] * stride_q + offs_d[None, :])
+    )
+    k_ptrs = (
+        K
+        + q_offset * stride_k
+        + hid * HEAD_DIM
+        + (offs_b[:, None] * stride_k + offs_d[None, :])
+    )
+    v_ptrs = (
+        V
+        + q_offset * stride_v
+        + hid * HEAD_DIM
+        + sid * SPLIT_DIM
+        + (offs_b[:, None] * stride_v + offs_s[None, :])
+    )
+    out_ptrs = (
+        Out
+        + q_offset * stride_o
+        + hid * HEAD_DIM
+        + sid * SPLIT_DIM
+        + (offs_b[:, None] * stride_o + offs_s[None, :])
+    )
+    s_ptrs = (
+        S
+        + s_offset * stride_s
+        + hid * HEAD_DIM * HEAD_DIM
+        + sid * SPLIT_DIM
+        + (offs_d[:, None] * HEAD_DIM + offs_s[None, :])
+    )
+    state = tl.load(s_ptrs, mask=s_scale > 0).to(tl.float32)
+
+    if BLOCK > 1:
+        for n in range(0, q_length, BLOCK):
+            n = tl.multiple_of(n, BLOCK)
+
+            if EVEN:
+                q = tl.load(q_ptrs + n * stride_q).to(tl.float32)
+                k = tl.trans(tl.load(k_ptrs + n * stride_k)).to(tl.float32)
+                v = tl.load(v_ptrs + n * stride_k).to(tl.float32)
+            else:
+                q = tl.load(
+                    q_ptrs + n * stride_q,
+                    mask=(n + offs_b)[:, None] < q_length,
+                    other=0.0,
+                ).to(tl.float32)
+                k = tl.trans(
+                    tl.load(
+                        k_ptrs + n * stride_k,
+                        mask=(n + offs_b)[:, None] < q_length,
+                        other=0.0,
+                    )
+                ).to(tl.float32)
+                v = tl.load(
+                    v_ptrs + n * stride_k,
+                    mask=(n + offs_b)[:, None] < q_length,
+                    other=0.0,
+                ).to(tl.float32)
+
+            if DECOUPLE:
+                # only work with small scales
+                if EVEN:
+                    b = BLOCK
+                else:
+                    b = min(BLOCK, q_length - n)
+                b_offs = b - 1 - offs_b
+
+                edb = tl.exp(decay_scale * b_offs)
+                decays = tl.where(b_offs >= 0, edb, 0)
+                inv_decays = tl.where(b_offs >= 0, 1 / edb, 0)
+
+                q = q * inv_decays[:, None]
+                k = k * decays[None, :]
+                qk = tl.dot(q, k) * softmax_scale
+                qk = tl.where(offs_b[None, :] <= offs_b[:, None], qk, 0.0)
+                o = tl.dot(qk, v)
+
+                block_decay = tl.exp(decay_scale * b)
+                block_decay_plus = block_decay * softmax_scale
+                o = tl.dot(q, state) * block_decay_plus + o
+
+                state = state * block_decay + tl.dot(k, v)
+            else:
+
+                qk = tl.dot(q, k) * softmax_scale
+                decays = tl.exp(decay_scale * (offs_b[:, None] - offs_b[None, :]))
+                decays = tl.where(offs_b[None, :] <= offs_b[:, None], decays, 0.0)
+                qk *= decays
+                o = tl.dot(qk, v)
+
+                decay_arr = tl.exp(decay_scale * (offs_b[:, None] + 1)) * softmax_scale
+                o = tl.dot(q * decay_arr, state, acc=o)
+
+                if EVEN:
+                    b = BLOCK
+                else:
+                    b = min(BLOCK, q_length - n)
+                b_offs = b - 1 - offs_b
+                b_offs = tl.where(b_offs >= 0, b_offs, 10000)
+                decays = tl.exp(decay_scale * b_offs)
+                block_decay = tl.exp(decay_scale * b)
+                state = state * block_decay + tl.dot(k * decays[None, :], v)
+
+            if EVEN:
+                tl.store(out_ptrs + n * stride_o, o.to(Out.dtype.element_ty))
+            else:
+                tl.store(
+                    out_ptrs + n * stride_o,
+                    o.to(Out.dtype.element_ty),
+                    mask=(n + offs_b)[:, None] < q_length,
+                )
+
+        tl.store(s_ptrs, state.to(S.dtype.element_ty))
+
+    else:
+        q = tl.trans(tl.load(q_ptrs)).to(tl.float32) * softmax_scale
+        k = tl.trans(tl.load(k_ptrs)).to(tl.float32)
+        v = tl.load(v_ptrs).to(tl.float32)
+        state = state * tl.exp(decay_scale) + k * v
+
+        o = tl.sum(q * state, axis=0, keep_dims=True)
+
+        tl.store(out_ptrs, o.to(Out.dtype.element_ty))
+
+        tl.store(s_ptrs, state.to(S.dtype.element_ty))
+
+
+# used for prefilling
+@triton.jit
+def seg_la_p_kernel(
+    Q,
+    K,
+    V,
+    S,
+    Out,
+    softmax_scale,
+    stride_q,
+    stride_k,
+    stride_v,
+    stride_s,
+    stride_o,
+    s_offsets,
+    q_offsets,
+    q_lengths,
+    s_scales,
+    decay_scales,
+    HEAD_DIM: tl.constexpr,
+    K_SPLIT_DIM: tl.constexpr,
+    V_SPLIT_DIM: tl.constexpr,
+    BLOCK: tl.constexpr,
+    EVEN: tl.constexpr,
+):
+    bid = tl.program_id(0)
+    hid = tl.program_id(1)
+    kvid = tl.program_id(2)
+    N = HEAD_DIM // V_SPLIT_DIM
+    kid = kvid // N
+    vid = kvid % N
+    H = tl.num_programs(1)
+
+    # s_scale is 0 (first prefill chunk) or 1 (next prefill chunk)
+    s_scale = tl.load(s_scales + bid)
+    q_length = tl.load(q_lengths + bid)
+    q_offset = tl.load(q_offsets + bid)
+    s_offset = tl.load(s_offsets + bid)
+    decay_scale = -tl.load(decay_scales + hid)
+
+    offs_b = tl.arange(0, BLOCK)
+    offs_k = tl.arange(0, K_SPLIT_DIM)
+    offs_v = tl.arange(0, V_SPLIT_DIM)
+
+    if s_offset == -1:
+        return
+
+    q_ptrs = (
+        Q
+        + q_offset * stride_q
+        + hid * HEAD_DIM
+        + kid * K_SPLIT_DIM
+        + (offs_b[:, None] * stride_q + offs_k[None, :])
+    )
+    k_ptrs = (
+        K
+        + q_offset * stride_k
+        + hid * HEAD_DIM
+        + kid * K_SPLIT_DIM
+        + (offs_b[:, None] * stride_k + offs_k[None, :])
+    )
+    v_ptrs = (
+        V
+        + q_offset * stride_v
+        + hid * HEAD_DIM
+        + vid * V_SPLIT_DIM
+        + (offs_b[:, None] * stride_v + offs_v[None, :])
+    )
+    # (num_dim_block, length, qo_heads, d)
+    out_ptrs = (
+        Out
+        + kid * stride_o
+        + q_offset * HEAD_DIM * H
+        + hid * HEAD_DIM
+        + vid * V_SPLIT_DIM
+        + (offs_b[:, None] * H * HEAD_DIM + offs_v[None, :])
+    )
+    s_ptrs = (
+        S
+        + s_offset * stride_s
+        + hid * HEAD_DIM * HEAD_DIM
+        + kid * HEAD_DIM * K_SPLIT_DIM
+        + vid * V_SPLIT_DIM
+        + (offs_k[:, None] * HEAD_DIM + offs_v[None, :])
+    )
+    state = tl.load(s_ptrs, mask=s_scale > 0).to(tl.float32)
+
+    for n in range(0, q_length, BLOCK):
+        n = tl.multiple_of(n, BLOCK)
+
+        if EVEN:
+            q = tl.load(q_ptrs + n * stride_q).to(tl.float32)
+            k = tl.trans(tl.load(k_ptrs + n * stride_k)).to(tl.float32)
+            v = tl.load(v_ptrs + n * stride_v).to(tl.float32)
+            b = BLOCK
+            b_offs = b - 1 - offs_b
+            decays = tl.exp(decay_scale * b_offs)
+            inv_decays = 1 / decays
+        else:
+            q = tl.load(
+                q_ptrs + n * stride_q, mask=(n + offs_b)[:, None] < q_length, other=0.0
+            ).to(tl.float32)
+            k = tl.trans(
+                tl.load(
+                    k_ptrs + n * stride_k,
+                    mask=(n + offs_b)[:, None] < q_length,
+                    other=0.0,
+                )
+            ).to(tl.float32)
+            v = tl.load(
+                v_ptrs + n * stride_v, mask=(n + offs_b)[:, None] < q_length, other=0.0
+            ).to(tl.float32)
+            b = min(BLOCK, q_length - n)
+            b_offs = b - 1 - offs_b
+            block_decays = tl.exp(decay_scale * b_offs)
+            decays = tl.where(b_offs >= 0, block_decays, 0)
+            inv_decays = tl.where(b_offs >= 0, 1 / block_decays, 0)
+
+        q = q * inv_decays[:, None]
+        k = k * decays[None, :]
+        qk = tl.dot(q, k) * softmax_scale
+        qk = tl.where(offs_b[None, :] <= offs_b[:, None], qk, 0.0)
+        o = tl.dot(qk, v)
+
+        block_decay = tl.exp(decay_scale * b)
+        o = tl.dot(q, state) * block_decay * softmax_scale + o
+
+        state = state * block_decay + tl.dot(k, v)
+
+        if EVEN:
+            tl.store(out_ptrs + n * H * HEAD_DIM, o.to(Out.dtype.element_ty))
+        else:
+            tl.store(
+                out_ptrs + n * H * HEAD_DIM,
+                o.to(Out.dtype.element_ty),
+                mask=(n + offs_b)[:, None] < q_length,
+            )
+
+    tl.store(s_ptrs, state.to(S.dtype.element_ty))
+
+
+# used for speculative
+@triton.jit
+def seg_la_s_kernel(
+    Q,
+    K,
+    V,
+    S,
+    Out,
+    Mask,
+    softmax_scale,
+    stride_q,
+    stride_k,
+    stride_v,
+    stride_s,
+    stride_o,
+    s_offsets,
+    q_offsets,
+    q_lengths,
+    s_scales,
+    decay_scales,
+    HEAD_DIM: tl.constexpr,
+    K_SPLIT_DIM: tl.constexpr,
+    V_SPLIT_DIM: tl.constexpr,
+    BLOCK: tl.constexpr,
+    EVEN: tl.constexpr,
+):
+    bid = tl.program_id(0)
+    hid = tl.program_id(1)
+    kvid = tl.program_id(2)
+    N = HEAD_DIM // V_SPLIT_DIM
+    kid = kvid // N
+    vid = kvid % N
+    H = tl.num_programs(1)
+
+    # s_scale is 0 (first prefill chunk) or 1 (next prefill chunk)
+    s_scale = tl.load(s_scales + bid)
+    q_length = tl.load(q_lengths + bid)
+    q_offset = tl.load(q_offsets + bid)
+    s_offset = tl.load(s_offsets + bid)
+    decay_scale = -tl.load(decay_scales + hid)
+
+    offs_b = tl.arange(0, BLOCK)
+    offs_k = tl.arange(0, K_SPLIT_DIM)
+    offs_v = tl.arange(0, V_SPLIT_DIM)
+
+    if s_offset == -1:
+        return
+
+    q_ptrs = (
+        Q
+        + q_offset * stride_q
+        + hid * HEAD_DIM
+        + kid * K_SPLIT_DIM
+        + (offs_b[:, None] * stride_q + offs_k[None, :])
+    )
+    k_ptrs = (
+        K
+        + q_offset * stride_k
+        + hid * HEAD_DIM
+        + kid * K_SPLIT_DIM
+        + (offs_b[:, None] * stride_k + offs_k[None, :])
+    )
+    v_ptrs = (
+        V
+        + q_offset * stride_v
+        + hid * HEAD_DIM
+        + vid * V_SPLIT_DIM
+        + (offs_b[:, None] * stride_v + offs_v[None, :])
+    )
+    # (num_dim_block, length, qo_heads, d)
+    out_ptrs = (
+        Out
+        + kid * stride_o
+        + q_offset * HEAD_DIM * H
+        + hid * HEAD_DIM
+        + vid * V_SPLIT_DIM
+        + (offs_b[:, None] * H * HEAD_DIM + offs_v[None, :])
+    )
+    s_ptrs = (
+        S
+        + s_offset * stride_s
+        + hid * HEAD_DIM * HEAD_DIM
+        + kid * HEAD_DIM * K_SPLIT_DIM
+        + vid * V_SPLIT_DIM
+        + (offs_k[:, None] * HEAD_DIM + offs_v[None, :])
+    )
+    state = tl.load(s_ptrs, mask=s_scale > 0).to(tl.float32)
+
+    if EVEN:
+        q = tl.load(q_ptrs).to(tl.float32)
+        k = tl.trans(tl.load(k_ptrs)).to(tl.float32)
+        v = tl.load(v_ptrs).to(tl.float32)
+        mask = tl.load(
+            Mask
+            + bid * BLOCK * BLOCK
+            + tl.arange(0, BLOCK)[:, None] * BLOCK
+            + tl.arange(0, BLOCK)[None, :]
+        ).to(tl.int32)
+        positions = tl.sum(mask, 1) - 1
+        max_pos = tl.max(positions)
+        b_offs = max_pos - positions
+    else:
+        q = tl.load(q_ptrs, mask=offs_b[:, None] < q_length).to(tl.float32)
+        k = tl.trans(tl.load(k_ptrs, mask=offs_b[:, None] < q_length)).to(tl.float32)
+        v = tl.load(v_ptrs, mask=offs_b[:, None] < q_length).to(tl.float32)
+        mask = tl.load(
+            Mask
+            + bid * q_length * q_length
+            + tl.arange(0, BLOCK)[:, None] * q_length
+            + tl.arange(0, BLOCK)[None, :],
+            mask=(tl.arange(0, BLOCK)[:, None] < q_length)
+            & (tl.arange(0, BLOCK)[None, :] < q_length),
+        ).to(tl.int32)
+        positions = tl.sum(mask, 1) - 1
+        max_pos = tl.max(positions)
+        b_offs = max_pos - positions
+
+    decays = tl.exp(decay_scale * b_offs)
+    inv_decays = 1 / decays
+
+    q = q * inv_decays[:, None]
+    k = k * decays[None, :]
+    qk = tl.dot(q, k) * softmax_scale
+    qk = qk * mask.to(tl.float32)
+    o = tl.dot(qk, v)
+
+    block_decay = tl.exp(decay_scale * (max_pos + 1))
+    o = tl.dot(q, state) * block_decay * softmax_scale + o
+
+    if EVEN:
+        tl.store(out_ptrs, o.to(Out.dtype.element_ty))
+    else:
+        tl.store(out_ptrs, o.to(Out.dtype.element_ty), mask=offs_b[:, None] < q_length)
+
+
+# used for decode
+@triton.jit
+def seg_la_d_kernel(
+    Q,
+    K,
+    V,
+    S,
+    Out,
+    softmax_scale,
+    stride_q,
+    stride_k,
+    stride_v,
+    stride_s,
+    stride_o,
+    s_offsets,
+    decay_scales,
+    HEAD_DIM: tl.constexpr,
+    K_SPLIT_DIM: tl.constexpr,
+    V_SPLIT_DIM: tl.constexpr,
+):
+    bid = tl.program_id(0)
+    hid = tl.program_id(1)
+    kvid = tl.program_id(2)
+    N = HEAD_DIM // V_SPLIT_DIM
+    kid = kvid // N
+    vid = kvid % N
+    H = tl.num_programs(1)
+
+    # s_scale is 0 (first prefill chunk) or 1 (next prefill chunk)
+    s_offset = tl.load(s_offsets + bid)
+    if s_offset == -1:
+        return
+
+    decay_scale = -tl.load(decay_scales + hid)
+
+    offs_k = tl.arange(0, K_SPLIT_DIM)
+    offs_v = tl.arange(0, V_SPLIT_DIM)
+
+    q_ptrs = Q + bid * stride_q + hid * HEAD_DIM + kid * K_SPLIT_DIM + (offs_k)
+    k_ptrs = K + bid * stride_k + hid * HEAD_DIM + kid * K_SPLIT_DIM + (offs_k)
+    v_ptrs = V + bid * stride_v + hid * HEAD_DIM + vid * V_SPLIT_DIM + (offs_v)
+    # (num_dim_block, length, qo_heads, d)
+    out_ptrs = (
+        Out
+        + kid * stride_o
+        + bid * H * HEAD_DIM
+        + hid * HEAD_DIM
+        + vid * V_SPLIT_DIM
+        + (offs_v)
+    )
+    s_ptrs = (
+        S
+        + s_offset * stride_s
+        + hid * HEAD_DIM * HEAD_DIM
+        + kid * HEAD_DIM * K_SPLIT_DIM
+        + vid * V_SPLIT_DIM
+        + (offs_k[:, None] * HEAD_DIM + offs_v[None, :])
+    )
+    state = tl.load(s_ptrs).to(tl.float32)
+
+    k = tl.load(k_ptrs).to(tl.float32)
+    v = tl.load(v_ptrs).to(tl.float32)
+    q = tl.load(q_ptrs).to(tl.float32) * softmax_scale
+
+    state = state * tl.exp(decay_scale) + k[:, None] * v
+    o = tl.sum(q[:, None] * state, axis=0)
+
+    tl.store(out_ptrs, o.to(Out.dtype.element_ty))
+    tl.store(s_ptrs, state.to(S.dtype.element_ty))
+
+
+# used for MTP with only spec-topk=1.
+@triton.jit
+def seg_la_mtp_kernel(
+    Q,
+    K,
+    V,
+    S,
+    CACHES,
+    Out,
+    softmax_scale,
+    stride_q,
+    stride_k,
+    stride_v,
+    stride_s,
+    stride_c,
+    stride_o,
+    s_offsets,
+    cache_indices,
+    decay_scales,
+    step,
+    HEAD_DIM: tl.constexpr,
+    K_SPLIT_DIM: tl.constexpr,
+    V_SPLIT_DIM: tl.constexpr,
+):
+    bid = tl.program_id(0)
+    hid = tl.program_id(1)
+    kvid = tl.program_id(2)
+    N = HEAD_DIM // V_SPLIT_DIM
+    kid = kvid // N
+    vid = kvid % N
+    H = tl.num_programs(1)
+
+    s_offset = tl.load(s_offsets + bid)
+    if s_offset == -1:
+        return
+
+    decay_scale = tl.exp(-tl.load(decay_scales + hid))
+
+    offs_k = tl.arange(0, K_SPLIT_DIM)
+    offs_v = tl.arange(0, V_SPLIT_DIM)
+
+    # (length, qo_heads, d)
+    q_ptrs = Q + bid * step * stride_q + hid * HEAD_DIM + kid * K_SPLIT_DIM + (offs_k)
+    k_ptrs = K + bid * step * stride_k + hid * HEAD_DIM + kid * K_SPLIT_DIM + (offs_k)
+    v_ptrs = V + bid * step * stride_v + hid * HEAD_DIM + vid * V_SPLIT_DIM + (offs_v)
+    # (num_dim_block, length, qo_heads, d)
+    out_ptrs = (
+        Out
+        + kid * stride_o
+        + bid * step * H * HEAD_DIM
+        + hid * HEAD_DIM
+        + vid * V_SPLIT_DIM
+        + (offs_v)
+    )
+    # (bs, qo_heads, d, d)
+    s_ptrs = (
+        S
+        + s_offset * stride_s
+        + hid * HEAD_DIM * HEAD_DIM
+        + kid * HEAD_DIM * K_SPLIT_DIM
+        + vid * V_SPLIT_DIM
+        + (offs_k[:, None] * HEAD_DIM + offs_v[None, :])
+    )
+    state = tl.load(s_ptrs).to(tl.float32)
+    # (bs, step, kv_heads, d, d)
+    cache_indices = tl.load(cache_indices + bid)
+    c_ptrs = (
+        CACHES
+        + cache_indices * stride_c
+        + hid * HEAD_DIM * HEAD_DIM
+        + kid * HEAD_DIM * K_SPLIT_DIM
+        + vid * V_SPLIT_DIM
+        + (offs_k[:, None] * HEAD_DIM + offs_v[None, :])
+    )
+
+    for i in range(step):
+        q = tl.load(q_ptrs).to(tl.float32) * softmax_scale
+        k = tl.load(k_ptrs).to(tl.float32)
+        v = tl.load(v_ptrs).to(tl.float32)
+
+        state = state * decay_scale + k[:, None] * v
+        o = tl.sum(q[:, None] * state, axis=0)
+
+        tl.store(out_ptrs, o.to(Out.dtype.element_ty))
+        tl.store(c_ptrs, state.to(CACHES.dtype.element_ty))
+        q_ptrs += stride_q
+        k_ptrs += stride_k
+        v_ptrs += stride_v
+        out_ptrs += H * HEAD_DIM
+        c_ptrs += H * HEAD_DIM * HEAD_DIM
+
+
+# (k_dim_block, length, qo_heads, d)
+@triton.jit
+def seg_la_sum_kernel(T, O, DIM: tl.constexpr, NUM_BLOCK: tl.constexpr):
+    pid = tl.program_id(0)
+    length = tl.num_programs(0)
+    x = tl.zeros((DIM,), dtype=tl.float32)
+    for i in range(NUM_BLOCK):
+        x += tl.load(T + i * length * DIM + pid * DIM + tl.arange(0, DIM)).to(
+            tl.float32
+        )
+    tl.store(O + pid * DIM + tl.arange(0, DIM), x)
+
+
+def seg_la_fwd(
+    q,
+    k,
+    v,
+    s,
+    decay_scales,
+    meta,
+    caches=None,
+    cache_indices=None,
+    softmax_scale=None,
+    decouple=False,
+):
+    length, qo_heads, HEAD_DIM = q.shape
+    _, kv_heads, _ = k.shape
+    bs = meta.batch_size
+    if softmax_scale is None:
+        softmax_scale = HEAD_DIM ** (-0.5)
+
+    # MAX_LENGTH = meta.max_q_length
+    MAX_LENGTH = triton.cdiv(length, bs)
+
+    assert qo_heads == kv_heads, "seg_la does NOT support GQA currently"
+
+    if MAX_LENGTH > 1:
+        # prefill with partitioning q/k/v
+        # BLOCK should <= 64 with decouple
+        K_SPLIT_DIM = 32
+        V_SPLIT_DIM = 32 if bs <= 2 else 64
+
+        num_warps = 2  # 2
+        num_stages = 3  # 3
+
+        k_dim_block = HEAD_DIM // K_SPLIT_DIM
+        v_dim_block = HEAD_DIM // V_SPLIT_DIM
+        tmp = torch.empty(
+            (k_dim_block, length, qo_heads, HEAD_DIM), device=q.device, dtype=q.dtype
+        )
+        grid = (bs, kv_heads, k_dim_block * v_dim_block)
+
+        if caches is not None:
+            # mtp
+            EVEN = False
+            BLOCK = 32
+            step = length // bs
+
+            seg_la_mtp_kernel[grid](
+                q,
+                k,
+                v,
+                s,
+                caches,
+                tmp,
+                softmax_scale,
+                q.stride(0),
+                k.stride(0),
+                v.stride(0),
+                s.stride(0),
+                caches.stride(0),
+                tmp.stride(0),
+                meta.s_offsets,
+                cache_indices,
+                decay_scales,
+                step,
+                HEAD_DIM=HEAD_DIM,
+                K_SPLIT_DIM=K_SPLIT_DIM,
+                V_SPLIT_DIM=V_SPLIT_DIM,
+                num_warps=num_warps,
+                num_stages=num_stages,
+            )
+
+        elif meta.mask is not None:
+            # spec
+            ms = meta.mask.size(-1)
+            BLOCK = (ms + 15) // 16 * 16
+            EVEN = BLOCK == ms
+
+            seg_la_s_kernel[grid](
+                q,
+                k,
+                v,
+                s,
+                tmp,
+                meta.mask,
+                softmax_scale,
+                q.stride(0),
+                k.stride(0),
+                v.stride(0),
+                s.stride(0),
+                tmp.stride(0),
+                meta.s_offsets,
+                meta.q_offsets,
+                meta.q_lengths,
+                meta.s_scales,
+                decay_scales,
+                HEAD_DIM=HEAD_DIM,
+                K_SPLIT_DIM=K_SPLIT_DIM,
+                V_SPLIT_DIM=V_SPLIT_DIM,
+                BLOCK=BLOCK,
+                EVEN=EVEN,
+                num_warps=num_warps,
+                num_stages=num_stages,
+            )
+
+        else:
+            # prefill
+            BLOCK = 32
+            EVEN = MAX_LENGTH % BLOCK == 0 if bs == 1 else False
+
+            seg_la_p_kernel[grid](
+                q,
+                k,
+                v,
+                s,
+                tmp,
+                softmax_scale,
+                q.stride(0),
+                k.stride(0),
+                v.stride(0),
+                s.stride(0),
+                tmp.stride(0),
+                meta.s_offsets,
+                meta.q_offsets,
+                meta.q_lengths,
+                meta.s_scales,
+                decay_scales,
+                HEAD_DIM=HEAD_DIM,
+                K_SPLIT_DIM=K_SPLIT_DIM,
+                V_SPLIT_DIM=V_SPLIT_DIM,
+                BLOCK=BLOCK,
+                EVEN=EVEN,
+                num_warps=num_warps,
+                num_stages=num_stages,
+            )
+
+        if k_dim_block > 1:
+            if length < 2048:
+                o = tmp.sum(0)
+            else:
+                o = torch.empty(
+                    (length, qo_heads, HEAD_DIM), device=q.device, dtype=q.dtype
+                )
+                seg_la_sum_kernel[(length,)](
+                    tmp,
+                    o,
+                    DIM=qo_heads * HEAD_DIM,
+                    NUM_BLOCK=k_dim_block,
+                    num_warps=2,
+                    num_stages=3,
+                )
+        else:
+            o = tmp[0]
+
+    else:
+        # decode with partitioning q/k/v
+        if bs <= 128:
+            K_SPLIT_DIM = 128  # 128
+            V_SPLIT_DIM = 32  # 32
+            num_warps = 2  # 2
+            num_stages = 2  # 3
+        else:
+            K_SPLIT_DIM = 128  # 128
+            V_SPLIT_DIM = 64  # 32
+            num_warps = 2  # 2
+            num_stages = 3  # 3
+        k_dim_block = HEAD_DIM // K_SPLIT_DIM
+        v_dim_block = HEAD_DIM // V_SPLIT_DIM
+        tmp = torch.empty(
+            (k_dim_block, length, qo_heads, HEAD_DIM), device=q.device, dtype=q.dtype
+        )
+        grid = (bs, kv_heads, k_dim_block * v_dim_block)
+
+        seg_la_d_kernel[grid](
+            q,
+            k,
+            v,
+            s,
+            tmp,
+            softmax_scale,
+            q.stride(0),
+            k.stride(0),
+            v.stride(0),
+            s.stride(0),
+            tmp.stride(0),
+            meta.s_offsets,
+            decay_scales,
+            HEAD_DIM=HEAD_DIM,
+            K_SPLIT_DIM=K_SPLIT_DIM,
+            V_SPLIT_DIM=V_SPLIT_DIM,
+            num_warps=num_warps,
+            num_stages=num_stages,
+        )
+        if k_dim_block > 1:
+            o = tmp.sum(0)
+        else:
+            o = tmp[0]
+
+    # if fallback:
+    #     # prefill/decode with partitioning v only
+    #     o = torch.empty(q.shape, device=q.device, dtype=q.dtype)
+    #     if MAX_LENGTH == 1:
+    #         # decode
+    #         BLOCK = 1
+    #         EVEN = False
+    #         SPLIT_DIM = 32
+    #         num_warps = 8
+    #         num_stages = 2
+    #         num_dim_block = HEAD_DIM // SPLIT_DIM
+    #         grid = (batch, kv_heads, num_dim_block)
+    #     else:
+    #         # prefill
+    #         if decouple:
+    #             BLOCK = 64
+    #             SPLIT_DIM = 16
+    #         else:
+    #             BLOCK = HEAD_DIM
+    #             SPLIT_DIM = 32
+    #         # EVEN = all([x % BLOCK == 0 for x in meta.qls])
+    #         EVEN = False
+    #         num_warps = 8
+    #         num_stages = 2
+    #         # prop = torch.cuda.get_device_properties(q.device.index)
+    #         # arch = prop.major * 10 + prop.minor
+    #         # if arch not in (80, 90):
+    #         #     num_stages = 1
+
+    #         num_dim_block = HEAD_DIM // SPLIT_DIM
+    #         grid = (batch, kv_heads, num_dim_block)
+
+    #     seg_la_kernel[grid](
+    #         q,
+    #         k,
+    #         v,
+    #         s,
+    #         o,
+    #         softmax_scale,
+    #         q.stride(0),
+    #         k.stride(0),
+    #         v.stride(0),
+    #         s.stride(0),
+    #         o.stride(0),
+    #         meta.s_offsets,
+    #         meta.q_offsets,
+    #         meta.q_lengths,
+    #         meta.s_scales,
+    #         decay_scales,
+    #         HEAD_DIM=HEAD_DIM,
+    #         SPLIT_DIM=SPLIT_DIM,
+    #         BLOCK=BLOCK,
+    #         EVEN=EVEN,
+    #         DECOUPLE=decouple,
+    #         num_warps=num_warps,
+    #         num_stages=num_stages
+    #     )
+    return o
diff --git a/sglang/python/sglang/srt/layers/attention/linear/utils.py b/sglang/python/sglang/srt/layers/attention/linear/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..63e7c06cbcc1bfbef668b39f978e64285af03572
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/linear/utils.py
@@ -0,0 +1,68 @@
+from __future__ import annotations
+
+import logging
+from enum import Enum
+from typing import TYPE_CHECKING, Optional
+
+from sglang.srt.utils.common import rank0_log
+
+if TYPE_CHECKING:
+    from sglang.srt.server_args import ServerArgs
+
+logger = logging.getLogger(__name__)
+
+
+class LinearAttnKernelBackend(Enum):
+    TRITON = "triton"
+    CUTEDSL = "cutedsl"
+    FLASHINFER = "flashinfer"
+
+    def is_triton(self):
+        return self == LinearAttnKernelBackend.TRITON
+
+    def is_cutedsl(self):
+        return self == LinearAttnKernelBackend.CUTEDSL
+
+    def is_flashinfer(self):
+        return self == LinearAttnKernelBackend.FLASHINFER
+
+
+LINEAR_ATTN_DECODE_BACKEND: Optional[LinearAttnKernelBackend] = None
+LINEAR_ATTN_PREFILL_BACKEND: Optional[LinearAttnKernelBackend] = None
+
+
+def initialize_linear_attn_config(server_args: ServerArgs):
+    global LINEAR_ATTN_DECODE_BACKEND
+    global LINEAR_ATTN_PREFILL_BACKEND
+
+    base = server_args.linear_attn_backend
+    decode = server_args.linear_attn_decode_backend or base
+    prefill = server_args.linear_attn_prefill_backend or base
+
+    LINEAR_ATTN_DECODE_BACKEND = LinearAttnKernelBackend(decode)
+    LINEAR_ATTN_PREFILL_BACKEND = LinearAttnKernelBackend(prefill)
+    rank0_log(
+        f"Linear attention kernel backend: "
+        f"decode={LINEAR_ATTN_DECODE_BACKEND.value}, "
+        f"prefill={LINEAR_ATTN_PREFILL_BACKEND.value}"
+    )
+
+
+def get_linear_attn_decode_backend() -> LinearAttnKernelBackend:
+    global LINEAR_ATTN_DECODE_BACKEND
+    if LINEAR_ATTN_DECODE_BACKEND is None:
+        logger.warning(
+            "LINEAR_ATTN_DECODE_BACKEND is not initialized, using triton backend"
+        )
+        LINEAR_ATTN_DECODE_BACKEND = LinearAttnKernelBackend.TRITON
+    return LINEAR_ATTN_DECODE_BACKEND
+
+
+def get_linear_attn_prefill_backend() -> LinearAttnKernelBackend:
+    global LINEAR_ATTN_PREFILL_BACKEND
+    if LINEAR_ATTN_PREFILL_BACKEND is None:
+        logger.warning(
+            "LINEAR_ATTN_PREFILL_BACKEND is not initialized, using triton backend"
+        )
+        LINEAR_ATTN_PREFILL_BACKEND = LinearAttnKernelBackend.TRITON
+    return LINEAR_ATTN_PREFILL_BACKEND
diff --git a/sglang/python/sglang/srt/layers/attention/mamba/__pycache__/causal_conv1d.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/mamba/__pycache__/causal_conv1d.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e0efbbb20eca4850f7b966fc9b22e5f584ffab8f
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/mamba/__pycache__/causal_conv1d.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/mamba/__pycache__/causal_conv1d_triton.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/mamba/__pycache__/causal_conv1d_triton.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..df59d64a7d77fc1aa23e06dd2db1197541926703
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/mamba/__pycache__/causal_conv1d_triton.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/mamba/__pycache__/mamba.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/mamba/__pycache__/mamba.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..415c85d26026002fa4b147b6df1de49b442451c5
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/mamba/__pycache__/mamba.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/mamba/__pycache__/mamba2_metadata.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/mamba/__pycache__/mamba2_metadata.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d8779dbc67a6ca856ec34e68da407b71437fa67f
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/mamba/__pycache__/mamba2_metadata.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/mamba/__pycache__/mamba_state_scatter_triton.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/mamba/__pycache__/mamba_state_scatter_triton.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d62e3cfd3f4e4284b39e71efc41b11051aed144e
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/mamba/__pycache__/mamba_state_scatter_triton.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/mamba/__pycache__/mixer2_rms_norm_gated.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/mamba/__pycache__/mixer2_rms_norm_gated.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b78b3aa300dbedba86b3c0f207d7e7c61db46a1d
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/mamba/__pycache__/mixer2_rms_norm_gated.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/mamba/causal_conv1d.py b/sglang/python/sglang/srt/layers/attention/mamba/causal_conv1d.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7beb75b0051570801ce79522655c35195636d62
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/mamba/causal_conv1d.py
@@ -0,0 +1,171 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
+# SPDX-License-Identifier: Apache-2.0
+
+# Copyright (c) 2024, Tri Dao.
+# Adapted from https://github.com/Dao-AILab/causal-conv1d/blob/main/causal_conv1d/causal_conv1d_interface.py
+
+from typing import Optional
+
+import torch
+
+from .causal_conv1d_triton import PAD_SLOT_ID
+
+try:
+    from sgl_kernel import causal_conv1d_fwd
+    from sgl_kernel import causal_conv1d_update as causal_conv1d_update_kernel
+
+    torch.ops.sgl_kernel.causal_conv1d_update
+    _USE_TRITON = False
+except (ImportError, AttributeError):
+    from .causal_conv1d_triton import causal_conv1d_fn as _causal_conv1d_fn_triton
+    from .causal_conv1d_triton import (
+        causal_conv1d_update as _causal_conv1d_update_triton,
+    )
+
+    _USE_TRITON = True
+
+
+def causal_conv1d_fn(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: Optional[torch.Tensor] = None,
+    query_start_loc: Optional[torch.Tensor] = None,
+    cache_indices: Optional[torch.Tensor] = None,
+    has_initial_state: Optional[torch.Tensor] = None,
+    conv_states: Optional[torch.Tensor] = None,
+    activation: Optional[str] = "silu",
+    pad_slot_id: int = PAD_SLOT_ID,
+    **kwargs,
+):
+    """
+    x: (batch, dim, seqlen) or (dim,cu_seq_len) for varlen
+        sequences are concatenated from left to right for varlen
+    weight: (dim, width)
+    bias: (dim,)
+    query_start_loc: (batch + 1) int32
+        The cumulative sequence lengths of the sequences in
+        the batch, used to index into sequence. prepended by 0.
+        for example: query_start_loc = torch.Tensor([0,10,16,17]),
+        x.shape=(dim,17)
+    cache_indices: (batch)  int32
+        indicates the corresponding state index,
+        like so: conv_state = conv_states[cache_indices[batch_id]]
+    has_initial_state: (batch) bool
+        indicates whether should the kernel take the current state as initial
+        state for the calculations
+    conv_states: (...,dim,width - 1) itype
+        updated inplace if provided
+    activation: either None or "silu" or "swish"
+    pad_slot_id: int
+            if cache_indices is passed, lets the kernel identify padded
+            entries that will not be processed,
+            for example: cache_indices = [pad_slot_id, 1, 20, pad_slot_id]
+            in this case, the kernel will not process entries at
+            indices 0 and 3
+
+
+    out: (batch, dim, seqlen)
+    """
+    if _USE_TRITON:
+        seq_lens_cpu = (
+            (query_start_loc[1:] - query_start_loc[:-1]).cpu().tolist()
+            if query_start_loc is not None
+            else [x.shape[-1]]
+        )
+        return _causal_conv1d_fn_triton(
+            x,
+            weight,
+            bias,
+            conv_states=conv_states,
+            query_start_loc=query_start_loc,
+            seq_lens_cpu=seq_lens_cpu,
+            cache_indices=cache_indices,
+            has_initial_state=has_initial_state,
+            activation=activation,
+            pad_slot_id=pad_slot_id,
+            **kwargs,
+        )
+    if activation not in [None, "silu", "swish"]:
+        raise NotImplementedError("activation must be None, silu, or swish")
+    if x.stride(-1) != 1:
+        x = x.contiguous()
+    bias = bias.contiguous() if bias is not None else None
+
+    causal_conv1d_fwd(
+        x,
+        weight,
+        bias,
+        conv_states,
+        query_start_loc,
+        cache_indices,
+        has_initial_state,
+        activation in ["silu", "swish"],
+        pad_slot_id,
+    )
+    return x
+
+
+def causal_conv1d_update(
+    x: torch.Tensor,
+    conv_state: torch.Tensor,
+    weight: torch.Tensor,
+    bias: Optional[torch.Tensor] = None,
+    activation: Optional[str] = None,
+    cache_seqlens: Optional[torch.Tensor] = None,
+    conv_state_indices: Optional[torch.Tensor] = None,
+    pad_slot_id: int = PAD_SLOT_ID,
+):
+    """
+    x: (batch, dim) or (batch, dim, seqlen)
+    conv_state: (batch, dim, state_len), where state_len >= width - 1
+    weight: (dim, width)
+    bias: (dim,)
+    cache_seqlens: (batch,), dtype int32.
+        If not None, the conv_state is treated as a circular buffer.
+        The conv_state will be updated by copying x to the conv_state
+        starting at the index
+        @cache_seqlens % state_len.
+    conv_state_indices: (batch,), dtype int32
+        If not None, the conv_state is a larger tensor along the batch dim,
+        and we are selecting the batch coords specified by conv_state_indices.
+        Useful for a continuous batching scenario.
+    pad_slot_id: int
+            if cache_indices is passed, lets the kernel identify padded
+            entries that will not be processed,
+            for example: cache_indices = [pad_slot_id, 1 ,20 ,pad_slot_id]
+            in this case, the kernel will not process entries at
+            indices 0 and 3
+    out: (batch, dim) or (batch, dim, seqlen)
+    """
+    if _USE_TRITON:
+        return _causal_conv1d_update_triton(
+            x,
+            conv_state,
+            weight,
+            bias=bias,
+            activation=activation,
+            cache_seqlens=cache_seqlens,
+            conv_state_indices=conv_state_indices,
+            pad_slot_id=pad_slot_id,
+        )
+    if activation not in [None, "silu", "swish"]:
+        raise NotImplementedError(
+            f"activation must be None, silu, or swish, actual: {activation}"
+        )
+    activation_val = activation in ["silu", "swish"]
+    unsqueeze = x.dim() == 2
+    if unsqueeze:
+        x = x.unsqueeze(-1)
+    causal_conv1d_update_kernel(
+        x,
+        conv_state,
+        weight,
+        bias,
+        activation_val,
+        cache_seqlens,
+        conv_state_indices,
+        pad_slot_id,
+    )
+    if unsqueeze:
+        x = x.squeeze(-1)
+    return x
diff --git a/sglang/python/sglang/srt/layers/attention/mamba/causal_conv1d_triton.py b/sglang/python/sglang/srt/layers/attention/mamba/causal_conv1d_triton.py
new file mode 100644
index 0000000000000000000000000000000000000000..c82f4d730fa124fa78c24e1d09d38d3a3cce1762
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/mamba/causal_conv1d_triton.py
@@ -0,0 +1,1187 @@
+# Copyright (c) 2024, Tri Dao.
+# Adapted from https://github.com/Dao-AILab/causal-conv1d/blob/main/causal_conv1d/causal_conv1d_interface.py
+# and https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/mamba/ops/causal_conv1d.py
+
+from typing import List, Optional, Union
+
+import torch
+import triton
+import triton.language as tl
+
+PAD_SLOT_ID = -1
+
+
+@triton.jit()
+def _causal_conv1d_fwd_kernel(  # continuous batching
+    # Pointers to matrices
+    x_ptr,  # (dim, cu_seqlen) holding `batch` of actual sequences + padded sequences
+    w_ptr,  # (dim, width)
+    bias_ptr,
+    initial_states_ptr,  # conv_states_ptr
+    cache_indices_ptr,  # conv_state_indices_ptr
+    has_initial_states_ptr,
+    query_start_loc_ptr,
+    o_ptr,  # (dim, seqlen) - actually pointing to x_ptr
+    # Matrix dimensions
+    dim: tl.constexpr,
+    seqlen: tl.int32,  # cu_seqlen
+    num_cache_lines: tl.constexpr,  # added to support vLLM larger cache lines
+    # Strides
+    stride_x_seq: tl.constexpr,  # stride to get to next sequence,
+    stride_x_dim: tl.constexpr,  # stride to get to next feature-value,
+    stride_x_token: tl.constexpr,  # stride to get to next token (same feature-index, same sequence-index)
+    stride_w_dim: tl.constexpr,  # stride to get to next dim-axis value
+    stride_w_width: tl.constexpr,  # stride to get to next width-axis value
+    stride_istate_seq: tl.constexpr,
+    stride_istate_dim: tl.constexpr,
+    stride_istate_token: tl.constexpr,
+    stride_o_seq: tl.constexpr,
+    stride_o_dim: tl.constexpr,
+    stride_o_token: tl.constexpr,
+    # others
+    pad_slot_id: tl.constexpr,
+    # Meta-parameters
+    HAS_BIAS: tl.constexpr,
+    KERNEL_WIDTH: tl.constexpr,
+    SILU_ACTIVATION: tl.constexpr,
+    HAS_INITIAL_STATES: tl.constexpr,
+    HAS_CACHE: tl.constexpr,
+    IS_CONTINUOUS_BATCHING: tl.constexpr,
+    USE_PAD_SLOT: tl.constexpr,
+    NP2_STATELEN: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    conv_states_ptr = initial_states_ptr
+    conv_state_indices_ptr = cache_indices_ptr
+    stride_conv_state_seq = stride_istate_seq
+    stride_conv_state_dim = stride_istate_dim
+    stride_conv_state_tok = stride_istate_token
+    state_len = (
+        KERNEL_WIDTH - 1
+    )  # can be passed via argument if it's not the same as this value
+
+    # one program handles one chunk in a single sequence
+    # rather than mixing sequences - to make updating initial_states across sequences efficiently
+
+    # single-sequence id
+    idx_seq = tl.program_id(0)
+    chunk_offset = tl.program_id(1)
+
+    # BLOCK_N elements along the feature-dimension (channel)
+    idx_feats = tl.program_id(2) * BLOCK_N + tl.arange(0, BLOCK_N)
+
+    if idx_seq == pad_slot_id:
+        return
+
+    sequence_start_index = tl.load(query_start_loc_ptr + idx_seq)
+    sequence_end_index = tl.load(query_start_loc_ptr + idx_seq + 1)
+    # find the actual sequence length
+    seqlen = sequence_end_index - sequence_start_index
+
+    token_offset = BLOCK_M * chunk_offset
+    segment_len = min(BLOCK_M, seqlen - token_offset)
+
+    if segment_len <= 0:
+        return
+
+    # base of the sequence
+    x_base = (
+        x_ptr + sequence_start_index * stride_x_token + idx_feats * stride_x_dim
+    )  # [BLOCK_N,]
+
+    if IS_CONTINUOUS_BATCHING:
+        # cache_idx
+        conv_state_batch_coord = tl.load(conv_state_indices_ptr + idx_seq).to(tl.int64)
+    else:
+        # cache_idx
+        conv_state_batch_coord = idx_seq
+    if USE_PAD_SLOT:  # noqa
+        if conv_state_batch_coord == pad_slot_id:
+            # not processing as this is not the actual sequence
+            return
+    conv_states_base = (
+        conv_states_ptr
+        + (conv_state_batch_coord * stride_conv_state_seq)
+        + (idx_feats * stride_conv_state_dim)
+    )  # [BLOCK_N,]
+
+    w_base = w_ptr + (idx_feats * stride_w_dim)  # [BLOCK_N,]
+
+    # Does 2 things:
+    # 1. READ prior-block init-state data - [done by every Triton programs]
+    # 2. update conv_state with new data [only by the Triton program handles chunk_offset=0]
+    if chunk_offset == 0:
+        # read from conv_states
+        load_init_state = False
+        if HAS_INITIAL_STATES:  # the new HAS_INITIAL_STATES
+            load_init_state = tl.load(has_initial_states_ptr + idx_seq).to(tl.int1)
+        if load_init_state:
+            # load from conv_states
+            prior_tokens = conv_states_base + (state_len - 1) * stride_conv_state_tok
+            mask_w = idx_feats < dim
+            if KERNEL_WIDTH == 2:
+                conv_states_ptrs = prior_tokens  # [BLOCK_N]
+                col0 = tl.load(conv_states_ptrs, mask_w, 0.0)
+            if KERNEL_WIDTH == 3:
+                conv_states_ptrs = prior_tokens  # [BLOCK_N]
+                col1 = tl.load(conv_states_ptrs, mask_w, 0.0)
+                conv_states_ptrs = prior_tokens - 1 * stride_conv_state_tok  # [BLOCK_N]
+                col0 = tl.load(conv_states_ptrs, mask_w, 0.0)
+            if KERNEL_WIDTH == 4:
+                conv_states_ptrs = prior_tokens  # [BLOCK_N]
+                col2 = tl.load(conv_states_ptrs, mask_w, 0.0)
+                conv_states_ptrs = prior_tokens - 1 * stride_conv_state_tok  # [BLOCK_N]
+                col1 = tl.load(conv_states_ptrs, mask_w, 0.0)
+                conv_states_ptrs = prior_tokens - 2 * stride_conv_state_tok  # [BLOCK_N]
+                col0 = tl.load(conv_states_ptrs, mask_w, 0.0)
+            if KERNEL_WIDTH == 5:
+                conv_states_ptrs = prior_tokens  # [BLOCK_N]
+                col3 = tl.load(conv_states_ptrs, mask_w, 0.0)
+                conv_states_ptrs = prior_tokens - 1 * stride_conv_state_tok  # [BLOCK_N]
+                col2 = tl.load(conv_states_ptrs, mask_w, 0.0)
+                conv_states_ptrs = prior_tokens - 2 * stride_conv_state_tok  # [BLOCK_N]
+                col1 = tl.load(conv_states_ptrs, mask_w, 0.0)
+                conv_states_ptrs = prior_tokens - 3 * stride_conv_state_tok  # [BLOCK_N]
+                col0 = tl.load(conv_states_ptrs, mask_w, 0.0)
+        else:
+            # prior-tokens are zeros
+            if KERNEL_WIDTH >= 2:  # STRATEGY1
+                # first chunk and does not have prior-token, so just set to 0
+                col0 = tl.zeros((BLOCK_N,), dtype=x_ptr.dtype.element_ty)
+            if KERNEL_WIDTH >= 3:  # STRATEGY1
+                col1 = tl.zeros((BLOCK_N,), dtype=x_ptr.dtype.element_ty)
+            if KERNEL_WIDTH >= 4:  # STRATEGY1
+                col2 = tl.zeros((BLOCK_N,), dtype=x_ptr.dtype.element_ty)
+            if KERNEL_WIDTH >= 5:  # STRATEGY1
+                col3 = tl.zeros((BLOCK_N,), dtype=x_ptr.dtype.element_ty)
+
+        # STEP 2:
+        # here prepare data for updating conv_state
+        if (
+            state_len <= seqlen
+        ):  # SMALL_CACHE=True (only move part of 'x' into conv_state cache)
+            # just read from 'x'
+            # copy 'x' data to conv_state
+            # load only 'x' data (and set 0 before 'x' if seqlen < state_len)
+            idx_tokens_last = (seqlen - state_len) + tl.arange(
+                0, NP2_STATELEN
+            )  # [BLOCK_M]
+            x_ptrs = (
+                x_ptr
+                + ((sequence_start_index + idx_tokens_last) * stride_x_token)[:, None]
+                + (idx_feats * stride_x_dim)[None, :]
+            )  # [BLOCK_M,BLOCK_N,]
+            mask_x = (
+                (idx_tokens_last >= 0)[:, None]
+                & (idx_tokens_last < seqlen)[:, None]
+                & (idx_feats < dim)[None, :]
+            )  # token-index  # token-index  # feature-index
+            loaded_x = tl.load(x_ptrs, mask_x, 0.0)
+            new_conv_state = tl.load(x_ptrs, mask_x, 0.0)
+            idx_tokens_conv = tl.arange(0, NP2_STATELEN)  # [BLOCK_M]
+            conv_states_ptrs_target = (
+                conv_states_base[None, :]
+                + (idx_tokens_conv * stride_conv_state_tok)[:, None]
+            )
+
+            mask = (idx_tokens_conv < state_len)[:, None] & (idx_feats < dim)[None, :]
+            tl.debug_barrier()  #  NOTE: use this due to bug in Triton compiler
+            tl.store(conv_states_ptrs_target, new_conv_state, mask)
+
+        else:
+            if load_init_state:
+                # update conv_state by shifting left, i.e. take last few cols from conv_state + cols from 'x'
+                idx_tokens_conv = tl.arange(0, NP2_STATELEN)  # [BLOCK_M]
+
+                conv_states_ptrs_source = (
+                    conv_states_ptr
+                    + (conv_state_batch_coord * stride_conv_state_seq)
+                    + (idx_feats * stride_conv_state_dim)[None, :]
+                    + ((idx_tokens_conv + seqlen) * stride_conv_state_tok)[:, None]
+                )  # [BLOCK_M, BLOCK_N]
+                mask = (
+                    (conv_state_batch_coord < num_cache_lines)
+                    & ((idx_tokens_conv + seqlen) < state_len)[:, None]
+                    & (idx_feats < dim)[None, :]
+                )
+                conv_state = tl.load(conv_states_ptrs_source, mask, other=0.0)
+
+                VAL = state_len - seqlen
+
+                x_ptrs = (
+                    x_base[None, :]
+                    + ((idx_tokens_conv - VAL) * stride_x_token)[:, None]
+                )  # [BLOCK_M, BLOCK_N]
+
+                mask_x = (
+                    (idx_tokens_conv - VAL >= 0)[:, None]
+                    & (idx_tokens_conv - VAL < seqlen)[:, None]
+                    & (idx_feats < dim)[None, :]
+                )  # token-index  # token-index  # feature-index
+                loaded_x = tl.load(x_ptrs, mask_x, 0.0)
+
+                tl.debug_barrier()  # need this due to the bug in tl.where not enforcing this when data is the result of another tl.load
+                new_conv_state = tl.where(
+                    mask, conv_state, loaded_x
+                )  # BUG in 'tl.where'  which requires a barrier before this
+                conv_states_ptrs_target = (
+                    conv_states_base
+                    + (idx_tokens_conv * stride_conv_state_tok)[:, None]
+                )  # [BLOCK_M, BLOCK_N]
+                mask = (idx_tokens_conv < state_len)[:, None] & (idx_feats < dim)[
+                    None, :
+                ]
+                tl.store(conv_states_ptrs_target, new_conv_state, mask)
+            else:  # load_init_state == False
+                # update conv_state by shifting left, BUT
+                # set cols prior to 'x' as zeros + cols from 'x'
+                idx_tokens_conv = tl.arange(0, NP2_STATELEN)  # [BLOCK_M]
+
+                VAL = state_len - seqlen
+
+                x_ptrs = (
+                    x_base[None, :]
+                    + ((idx_tokens_conv - VAL) * stride_x_token)[:, None]
+                )  # [BLOCK_M, BLOCK_N]
+
+                mask_x = (
+                    (idx_tokens_conv - VAL >= 0)[:, None]
+                    & (idx_tokens_conv - VAL < seqlen)[:, None]
+                    & (idx_feats < dim)[None, :]
+                )  # token-index  # token-index  # feature-index
+                new_conv_state = tl.load(x_ptrs, mask_x, 0.0)
+
+                conv_states_ptrs_target = (
+                    conv_states_base
+                    + (idx_tokens_conv * stride_conv_state_tok)[:, None]
+                )  # [BLOCK_M, BLOCK_N]
+                mask = (idx_tokens_conv < state_len)[:, None] & (idx_feats < dim)[
+                    None, :
+                ]
+                tl.store(conv_states_ptrs_target, new_conv_state, mask)
+
+    else:  # chunk_offset > 0
+        # read prior-token data from `x`
+        load_init_state = True
+        prior_tokens = x_base + (token_offset - 1) * stride_x_token
+        mask_w = idx_feats < dim
+        if KERNEL_WIDTH == 2:
+            conv_states_ptrs = prior_tokens  # [BLOCK_N]
+            col0 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
+        if KERNEL_WIDTH == 3:
+            conv_states_ptrs = prior_tokens  # [BLOCK_N]
+            col1 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
+            conv_states_ptrs = prior_tokens - 1 * stride_x_token  # [BLOCK_N]
+            col0 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
+        if KERNEL_WIDTH == 4:
+            conv_states_ptrs = prior_tokens  # [BLOCK_N]
+            col2 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
+            conv_states_ptrs = prior_tokens - 1 * stride_x_token  # [BLOCK_N]
+            col1 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
+            conv_states_ptrs = prior_tokens - 2 * stride_x_token  # [BLOCK_N]
+            col0 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
+        if KERNEL_WIDTH == 5:
+            # ruff: noqa: F841
+            conv_states_ptrs = prior_tokens  # [BLOCK_N]
+            col3 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
+            conv_states_ptrs = prior_tokens - 1 * stride_x_token  # [BLOCK_N]
+            col2 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
+            conv_states_ptrs = prior_tokens - 2 * stride_x_token  # [BLOCK_N]
+            col1 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
+            conv_states_ptrs = prior_tokens - 3 * stride_x_token  # [BLOCK_N]
+            col0 = tl.load(conv_states_ptrs, mask_w, 0.0, cache_modifier=".ca")
+
+    if HAS_BIAS:
+        bias = bias_ptr + idx_feats
+        mask_bias = idx_feats < dim
+        acc_preload = tl.load(bias, mask=mask_bias, other=0.0).to(
+            tl.float32
+        )  # [BLOCK_N]
+    else:
+        acc_preload = tl.zeros((BLOCK_N,), dtype=tl.float32)
+
+    x_base_1d = x_base + token_offset * stride_x_token  # starting of chunk
+
+    # PRE-LOAD WEIGHTS
+    mask_w = idx_feats < dim
+    if KERNEL_WIDTH >= 2:
+        w_ptrs = w_base + (0 * stride_w_width)  # [BLOCK_N] tensor
+        w_col0 = tl.load(w_ptrs, mask_w, other=0.0)
+        w_ptrs = w_base + (1 * stride_w_width)  # [BLOCK_N] tensor
+        w_col1 = tl.load(w_ptrs, mask_w, other=0.0)
+    if KERNEL_WIDTH >= 3:
+        w_ptrs = w_base + (2 * stride_w_width)  # [BLOCK_N] tensor
+        w_col2 = tl.load(w_ptrs, mask_w, other=0.0)
+    if KERNEL_WIDTH >= 4:
+        w_ptrs = w_base + (3 * stride_w_width)  # [BLOCK_N] tensor
+        w_col3 = tl.load(w_ptrs, mask_w, other=0.0)
+    mask_x_1d = idx_feats < dim
+    for idx_token in range(segment_len):
+        acc = acc_preload
+
+        matrix_w = w_col0
+        matrix_x = col0
+        for j in tl.static_range(KERNEL_WIDTH):
+
+            if KERNEL_WIDTH == 2:
+                if j == 1:  # KERNEL_WIDTH-1:
+                    matrix_w = w_col1
+                    x_ptrs_1d = x_base_1d + idx_token * stride_x_token  # [BLOCK_N]
+                    matrix_x = tl.load(x_ptrs_1d, mask=mask_x_1d)
+            elif KERNEL_WIDTH == 3:
+                if j == 1:
+                    matrix_w = w_col1
+                    matrix_x = col1
+                elif j == 2:
+                    matrix_w = w_col2
+                    x_ptrs_1d = x_base_1d + idx_token * stride_x_token  # [BLOCK_N]
+                    matrix_x = tl.load(x_ptrs_1d, mask=mask_x_1d)
+            elif KERNEL_WIDTH == 4:
+                if j == 1:
+                    matrix_w = w_col1
+                    matrix_x = col1
+                elif j == 2:
+                    matrix_w = w_col2
+                    matrix_x = col2
+                elif j == 3:
+                    matrix_w = w_col3
+                    x_ptrs_1d = x_base_1d + idx_token * stride_x_token  # [BLOCK_N]
+                    matrix_x = tl.load(x_ptrs_1d, mask=mask_x_1d)
+
+            acc += matrix_x * matrix_w  # [BLOCK_N]
+
+        if KERNEL_WIDTH == 2:
+            col0 = matrix_x
+        elif KERNEL_WIDTH == 3:
+            col0 = col1
+            col1 = matrix_x
+        elif KERNEL_WIDTH == 4:
+            col0 = col1
+            col1 = col2
+            col2 = matrix_x
+
+        if SILU_ACTIVATION:
+            acc = acc / (1 + tl.exp(-acc))
+        mask_1d = (idx_token < segment_len) & (
+            idx_feats < dim
+        )  # token-index  # feature-index
+        o_ptrs = (
+            o_ptr
+            + (sequence_start_index + token_offset + idx_token) * stride_o_token
+            + (idx_feats * stride_o_dim)
+        )
+
+        tl.store(o_ptrs, acc, mask=mask_1d)
+
+
+def causal_conv1d_fn(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: Union[torch.Tensor, None],
+    conv_states: torch.Tensor,
+    query_start_loc: torch.Tensor,
+    seq_lens_cpu: List[int],
+    cache_indices: Optional[torch.Tensor] = None,
+    has_initial_state: Optional[torch.Tensor] = None,
+    activation: Optional[str] = "silu",
+    pad_slot_id: int = PAD_SLOT_ID,
+    validate_data=False,
+    **kwargs,
+):
+    """support varlen + continuous batching when x is 2D tensor
+
+    x: (dim,cu_seq_len)
+        cu_seq_len = total tokens of all seqs in that batch
+        sequences are concatenated from left to right for varlen
+    weight: (dim, width)
+    conv_states: (...,dim,width - 1) itype
+        updated inplace if provided
+        [it use `cache_indices` to get the index to the cache of conv_state for that sequence
+
+        conv_state[cache_indices[i]] for seq-i - to be used as initial_state when has_initial_state[i] = True
+             and after that conv_state[cache_indices[i]] need to be shift-left and updated with values from 'x'
+        ]
+    query_start_loc: (batch + 1) int32
+        The cumulative sequence lengths of the sequences in
+        the batch, used to index into sequence. prepended by 0.
+        if
+        x = [5, 1, 1, 1] <- continuous batching (batch=4)
+        then
+        query_start_loc = [0, 5, 6, 7, 8] <- the starting index of the next sequence; while the last value is
+           the ending index of the last sequence
+        [length(query_start_loc)-1 == batch]
+        for example: query_start_loc = torch.Tensor([0,10,16,17]),
+        x.shape=(dim,17)
+    seq_lens_cpu: (batch) int32
+        The sequence lengths of the sequences in the batch
+    cache_indices: (batch)  int32
+        indicates the corresponding state index,
+        like so: conv_state = conv_states[cache_indices[batch_id]]
+    has_initial_state: (batch) bool
+        indicates whether should the kernel take the current state as initial
+        state for the calculations
+        [single boolean for each sequence in the batch: True or False]
+    bias: (dim,)
+    activation: either None or "silu" or "swish" or True
+    pad_slot_id: int
+        if cache_indices is passed, lets the kernel identify padded
+        entries that will not be processed,
+        for example: cache_indices = [pad_slot_id, 1, 20, pad_slot_id]
+        in this case, the kernel will not process entries at
+        indices 0 and 3
+
+    out: same shape as `x`
+    """
+    if isinstance(activation, bool) and activation:
+        activation = "silu"
+
+    out = torch.empty_like(x)
+
+    is_channel_last = (x.stride(0) == 1) & (x.stride(1) > 1)
+    dim, cu_seqlen = x.shape
+    _, width = weight.shape
+    state_len = width - 1
+    np2_statelen = triton.next_power_of_2(state_len)
+
+    stride_x_seq = 0
+    stride_x_dim = x.stride(0)
+    stride_x_token = x.stride(1)
+    stride_w_dim = weight.stride(0)
+    stride_w_width = weight.stride(1)
+    stride_istate_seq = 0
+    stride_istate_dim = 0
+    stride_istate_token = 0
+    num_cache_lines = 0
+    if conv_states is not None:
+        # extensions to support vLLM:
+        # 1. conv_states is used to replaced initial_states
+        # 2. conv_states serve as a cache with num cache lines can be larger than batch size
+        # 3. mapping from sequence x[idx] to a cache line at index as specified via cache_indices[idx]
+        # 4. computation can be skipped if cache_indices[idx] == pad_slot_id
+        num_cache_lines = conv_states.size(0)
+        assert (
+            num_cache_lines == conv_states.shape[0]
+            and dim == conv_states.shape[1]
+            and width - 1 <= conv_states.shape[2]
+        )
+        stride_istate_seq = conv_states.stride(0)
+        stride_istate_dim = conv_states.stride(1)
+        stride_istate_token = conv_states.stride(2)
+        # assert stride_istate_dim == 1
+    if out.dim() == 2:
+        stride_o_seq = 0
+        stride_o_dim = out.stride(0)
+        stride_o_token = out.stride(1)
+    else:
+        stride_o_seq = out.stride(0)
+        stride_o_dim = out.stride(1)
+        stride_o_token = out.stride(2)
+
+    if validate_data:
+        assert x.dim() == 2
+        assert query_start_loc is not None
+        assert query_start_loc.dim() == 1
+        assert x.stride(0) == 1 or x.stride(1) == 1
+        padded_batch = query_start_loc.size(0) - 1
+        if bias is not None:
+            assert bias.dim() == 1
+            assert dim == bias.size(0)
+        if cache_indices is not None:
+            assert cache_indices.dim() == 1
+            assert padded_batch == cache_indices.size(0)
+        if has_initial_state is not None:
+            assert has_initial_state.size() == (padded_batch,)
+            assert (
+                conv_states is not None
+            ), "ERROR: `has_initial_state` is used, which needs also `conv_states`"
+        assert weight.stride(1) == 1
+        assert (dim, width) == weight.shape
+        assert is_channel_last, "Need to run in channel-last layout"
+
+    def grid(META):
+        max_seq_len = max(seq_lens_cpu)
+        return (
+            len(seq_lens_cpu),  # batch_size
+            (max_seq_len + META["BLOCK_M"] - 1) // META["BLOCK_M"],
+            triton.cdiv(dim, META["BLOCK_N"]),
+        )
+
+    _causal_conv1d_fwd_kernel[grid](
+        # Pointers to matrices
+        x,
+        weight,
+        bias,
+        conv_states,
+        cache_indices,
+        has_initial_state,
+        query_start_loc,
+        out,
+        # Matrix dimensions
+        dim,
+        cu_seqlen,
+        num_cache_lines,
+        # stride
+        stride_x_seq,
+        stride_x_dim,
+        stride_x_token,
+        stride_w_dim,
+        stride_w_width,
+        stride_istate_seq,
+        stride_istate_dim,
+        stride_istate_token,
+        stride_o_seq,
+        stride_o_dim,
+        stride_o_token,
+        # others
+        pad_slot_id,
+        # META
+        HAS_BIAS=bias is not None,
+        KERNEL_WIDTH=width,
+        SILU_ACTIVATION=activation in ["silu", "swish"],
+        HAS_INITIAL_STATES=has_initial_state is not None,
+        HAS_CACHE=conv_states is not None,
+        IS_CONTINUOUS_BATCHING=cache_indices is not None,
+        USE_PAD_SLOT=pad_slot_id is not None,
+        NP2_STATELEN=np2_statelen,
+        # launch_cooperative_grid=True
+        BLOCK_M=8,
+        BLOCK_N=256,
+        num_stages=2,
+    )
+    return out
+
+
+# HAS_EAGLE_TREE_CUSTOM_ATTN_MASK is added to support eagle tree attention mask
+# retrieve_next_token_ptr: [N, NP2_T], retrieve_next_sibling_ptr: [N, NP2_T]
+# e.g. for a sequence of length 4, the eagle tree attention structure is:
+# retrieve_next_token=[1, 3, -1, -1] -> retrieve_next_token[i]: the 1st child token of token i
+# retrieve_next_sibling=[-1, 2, -1, -1] -> retrieve_next_sibling[i]: the 1st tree sibling token of token i
+# retrieve_parent_token=[n/a, 0, 0, 1] -> retrieve_parent_token[i]: the parent token of token i
+# Tree:
+#    0
+#   / \
+#  1   2
+# /
+# 3
+# When calculating token 3's convolution, it should conv to token 1 (parent) and token 0 (grand-parent)
+# When calculating token 2's convolution, it should conv to token 0 (parent)
+# This kernel is a fused kernel which will also produce retrieve_parent_token based on retrieve_next_token & retrieve_next_sibling
+@triton.jit()
+def _causal_conv1d_update_kernel(
+    # Pointers to matrices
+    x_ptr,  # (batch, dim, seqlen)
+    w_ptr,  # (dim, width)
+    bias_ptr,
+    conv_state_ptr,
+    cache_seqlens_ptr,  # circular buffer
+    conv_state_indices_ptr,
+    num_accepted_tokens_ptr,
+    intermediate_conv_window_ptr,
+    intermediate_state_indices_ptr,
+    retrieve_next_token_ptr,
+    retrieve_next_sibling_ptr,
+    retrieve_parent_token_ptr,
+    o_ptr,  # (batch, dim, seqlen)
+    # Matrix dimensions
+    batch: int,
+    dim: tl.constexpr,
+    seqlen: tl.constexpr,
+    state_len: tl.constexpr,
+    num_cache_lines: tl.constexpr,  # added to support vLLM larger cache lines
+    # Strides
+    stride_x_seq: tl.constexpr,
+    stride_x_dim: tl.constexpr,
+    stride_x_token: tl.constexpr,
+    stride_w_dim: tl.constexpr,
+    stride_w_width: tl.constexpr,
+    stride_conv_state_seq: tl.constexpr,
+    stride_conv_state_dim: tl.constexpr,
+    stride_conv_state_tok: tl.constexpr,
+    stride_state_indices: tl.constexpr,
+    stride_inter_seq: tl.constexpr,
+    stride_inter_step: tl.constexpr,
+    stride_inter_dim: tl.constexpr,
+    stride_inter_win: tl.constexpr,
+    stride_intermediate_state_indices: tl.constexpr,
+    stride_retrieve_next_token_seq: tl.constexpr,
+    stride_retrieve_next_token_token: tl.constexpr,
+    stride_retrieve_next_sibling_seq: tl.constexpr,
+    stride_retrieve_next_sibling_token: tl.constexpr,
+    stride_retrieve_parent_token_seq: tl.constexpr,
+    stride_retrieve_parent_token_token: tl.constexpr,
+    stride_o_seq: tl.constexpr,
+    stride_o_dim: tl.constexpr,
+    stride_o_token: tl.constexpr,
+    # others
+    pad_slot_id: tl.constexpr,
+    # Meta-parameters
+    HAS_BIAS: tl.constexpr,
+    KERNEL_WIDTH: tl.constexpr,
+    SILU_ACTIVATION: tl.constexpr,
+    IS_CONTINUOUS_BATCHING: tl.constexpr,
+    IS_SPEC_DECODING: tl.constexpr,
+    NP2_STATELEN: tl.constexpr,
+    NP2_SEQLEN: tl.constexpr,
+    USE_PAD_SLOT: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    SAVE_INTERMEDIATE: tl.constexpr,
+    HAS_EAGLE_TREE_CUSTOM_ATTN_MASK: tl.constexpr,
+):
+    # ruff: noqa: E501
+    idx_seq = tl.program_id(0)
+    if idx_seq >= batch:
+        return
+
+    # [BLOCK_N,] elements along the feature-dimension (channel)
+    idx_feats = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N)
+
+    if IS_CONTINUOUS_BATCHING:
+        # mask = idx_seq < batch
+        conv_state_batch_coord = tl.load(
+            conv_state_indices_ptr + idx_seq * stride_state_indices
+        ).to(tl.int64)
+        if SAVE_INTERMEDIATE:
+            intermediate_state_batch_coord = tl.load(
+                intermediate_state_indices_ptr
+                + idx_seq * stride_intermediate_state_indices
+            ).to(tl.int64)
+    else:
+        conv_state_batch_coord = idx_seq
+    if USE_PAD_SLOT:  # noqa
+        if conv_state_batch_coord == pad_slot_id:
+            # not processing as this is not the actual sequence
+            return
+
+    if IS_SPEC_DECODING:
+        # The rolling of conv state:
+        #
+        # Before forward, the conv_state is:
+        # [history1, history2, ..., historyM].
+        #
+        # After forward, the conv_state becomes:
+        # [history2, ..., historyM, draft1, draft2, ..., draftN].
+        #
+        # After acceptance, it becomes:
+        #
+        # - accept 1 tokens: [history2, ..., historyM, draft1]
+        # - accept 2 tokens: [history3, ..., historyM, draft1, draft2]
+        # - and so on.
+        conv_state_token_offset = tl.load(num_accepted_tokens_ptr + idx_seq) - 1
+    else:
+        conv_state_token_offset = 0
+
+    # STEP 1: READ init_state data
+    conv_states_base = (
+        conv_state_ptr
+        + (conv_state_batch_coord * stride_conv_state_seq)
+        + (idx_feats * stride_conv_state_dim)
+    )
+    mask_w = idx_feats < dim
+
+    prior_tokens = conv_states_base + conv_state_token_offset * stride_conv_state_tok
+    if KERNEL_WIDTH >= 2:
+        conv_states_ptrs = prior_tokens  # [BLOCK_N]
+        col0 = tl.load(conv_states_ptrs, mask_w, 0.0)
+    if KERNEL_WIDTH >= 3:
+        conv_states_ptrs = prior_tokens + 1 * stride_conv_state_tok  # [BLOCK_N]
+        col1 = tl.load(conv_states_ptrs, mask_w, 0.0)
+    if KERNEL_WIDTH >= 4:
+        conv_states_ptrs = prior_tokens + 2 * stride_conv_state_tok  # [BLOCK_N]
+        col2 = tl.load(conv_states_ptrs, mask_w, 0.0)
+    if KERNEL_WIDTH == 5:
+        conv_states_ptrs = prior_tokens + 3 * stride_conv_state_tok  # [BLOCK_N]
+        col3 = tl.load(conv_states_ptrs, mask_w, 0.0)
+
+    # STEP 2: assume state_len > seqlen
+    idx_tokens = tl.arange(0, NP2_STATELEN)  # [BLOCK_M]
+
+    # The conv_state updates works in a sliding window manner,
+    # at each forward pass, the tokens are shift by 1, so we
+    # load since idx_tokens + 1.
+    conv_state_ptrs_source = (
+        conv_state_ptr
+        + (conv_state_batch_coord * stride_conv_state_seq)
+        + conv_state_token_offset * stride_conv_state_tok
+        + (idx_feats * stride_conv_state_dim)[None, :]
+        + ((idx_tokens + (1 if IS_SPEC_DECODING else seqlen)) * stride_conv_state_tok)[
+            :, None
+        ]
+    )  # [BLOCK_M, BLOCK_N]
+    mask = (
+        (conv_state_batch_coord < num_cache_lines)
+        & ((idx_tokens + seqlen) < state_len)[:, None]
+        & (idx_feats < dim)[None, :]
+    )
+    conv_state = tl.load(conv_state_ptrs_source, mask, other=0.0)
+
+    VAL = state_len - seqlen
+    x_base = x_ptr + (idx_seq * stride_x_seq) + (idx_feats * stride_x_dim)  # [BLOCK_N]
+
+    x_ptrs = (
+        x_base[None, :] + ((idx_tokens - VAL) * stride_x_token)[:, None]
+    )  # [BLOCK_M, BLOCK_N]
+
+    mask_x = (
+        (idx_tokens - VAL >= 0)[:, None]
+        & (idx_tokens - VAL < seqlen)[:, None]
+        & (idx_feats < dim)[None, :]
+    )  # token-index  # token-index  # feature-index
+    loaded_x = tl.load(x_ptrs, mask_x, 0.0)
+    tl.debug_barrier()
+
+    new_conv_state = tl.where(mask, conv_state, loaded_x)
+
+    conv_state_base = (
+        conv_state_ptr
+        + (conv_state_batch_coord * stride_conv_state_seq)
+        + (idx_feats * stride_conv_state_dim)
+    )  # [BLOCK_N,]
+    conv_state_ptrs_target = (
+        conv_state_base + (idx_tokens * stride_conv_state_tok)[:, None]
+    )  # [BLOCK_M, BLOCK_N]
+    mask = (idx_tokens < state_len)[:, None] & (idx_feats < dim)[None, :]
+    tl.store(conv_state_ptrs_target, new_conv_state, mask)
+
+    # STEP 3: init accumulator
+    if HAS_BIAS:
+        bias = bias_ptr + idx_feats
+        mask_bias = idx_feats < dim
+        acc_preload = tl.load(bias, mask=mask_bias, other=0.0).to(
+            tl.float32
+        )  # [BLOCK_N]
+    else:
+        acc_preload = tl.zeros((BLOCK_N,), dtype=tl.float32)
+
+    # STEP 4:
+    # PRE-LOAD WEIGHTS
+    # first kernel column, configured for weights to handle BLOCK_N features in range
+    if HAS_EAGLE_TREE_CUSTOM_ATTN_MASK:
+        idx_tokens = tl.arange(0, NP2_SEQLEN)  # [BLOCK_M]
+        # Update parent mapping for all tokens at once using vectorized operations
+        mask_retrieve = idx_tokens < seqlen
+        retrieve_next_token_base = (
+            retrieve_next_token_ptr
+            + (idx_seq * stride_retrieve_next_token_seq)
+            + idx_tokens * stride_retrieve_next_token_token
+        )
+        retrieve_next_tokens = tl.load(retrieve_next_token_base, mask_retrieve)
+        retrieve_next_sibling_base = (
+            retrieve_next_sibling_ptr
+            + (idx_seq * stride_retrieve_next_sibling_seq)
+            + idx_tokens * stride_retrieve_next_sibling_token
+        )
+        retrieve_next_siblings = tl.load(retrieve_next_sibling_base, mask_retrieve)
+        parent_idx_tokens = tl.zeros((NP2_SEQLEN,), dtype=tl.int32)
+
+    w_base = w_ptr + (idx_feats * stride_w_dim)  # [BLOCK_N,]
+    mask_w = idx_feats < dim
+    if KERNEL_WIDTH >= 2:
+        w_ptrs = w_base + (0 * stride_w_width)  # [BLOCK_N] tensor
+        w_col0 = tl.load(w_ptrs, mask_w, other=0.0)
+        w_ptrs = w_base + (1 * stride_w_width)  # [BLOCK_N] tensor
+        w_col1 = tl.load(w_ptrs, mask_w, other=0.0)
+    if KERNEL_WIDTH >= 3:
+        w_ptrs = w_base + (2 * stride_w_width)  # [BLOCK_N] tensor
+        w_col2 = tl.load(w_ptrs, mask_w, other=0.0)
+    if KERNEL_WIDTH >= 4:
+        w_ptrs = w_base + (3 * stride_w_width)  # [BLOCK_N] tensor
+        w_col3 = tl.load(w_ptrs, mask_w, other=0.0)
+
+    x_base_1d = x_base  # starting of chunk [BLOCK_N]
+    mask_x_1d = idx_feats < dim
+
+    # STEP 5: compute each token
+    for idx_token in tl.static_range(seqlen):
+        acc = acc_preload
+
+        if HAS_EAGLE_TREE_CUSTOM_ATTN_MASK:
+            # set the parent index of the next token in the eagle tree
+            # next token's parent is the current token
+            retrieve_next_token_idx = tl.sum(
+                tl.where(idx_tokens == idx_token, retrieve_next_tokens, 0)
+            )
+            if retrieve_next_token_idx != -1:  # pad slot id
+                parent_idx_tokens = tl.where(
+                    idx_tokens == retrieve_next_token_idx,
+                    idx_token,
+                    parent_idx_tokens,
+                )
+            # next token's parent is the parent of the current token
+            retrieve_sibling_token_idx = tl.sum(
+                tl.where(idx_tokens == idx_token, retrieve_next_siblings, 0)
+            )
+            if retrieve_sibling_token_idx != -1:  # pad slot id
+                parent_idx_token = tl.sum(
+                    tl.where(idx_tokens == idx_token, parent_idx_tokens, 0)
+                )
+                parent_idx_tokens = tl.where(
+                    idx_tokens == retrieve_sibling_token_idx,
+                    parent_idx_token,
+                    parent_idx_tokens,
+                )
+            # tl.device_print("am", parent_idx_tokens)
+
+            _idx_token = idx_token
+            x_ptrs_1d = x_base_1d + _idx_token * stride_x_token  # [BLOCK_N]
+            matrix_x = tl.load(x_ptrs_1d, mask=mask_x_1d)
+            # convolution operation: itself * wcol[-1] + parent * wcol[-2] + grand-parent * wcol[-3] + ...
+            for j in tl.static_range(KERNEL_WIDTH):
+                if KERNEL_WIDTH == 2:
+                    if j == 0:
+                        matrix_w = w_col1
+                    else:
+                        matrix_w = w_col0
+                elif KERNEL_WIDTH == 3:
+                    if j == 0:
+                        matrix_w = w_col2
+                    elif j == 1:
+                        matrix_w = w_col1
+                    else:
+                        matrix_w = w_col0
+                elif KERNEL_WIDTH == 4:
+                    if j == 0:
+                        matrix_w = w_col3
+                    elif j == 1:
+                        matrix_w = w_col2
+                    elif j == 2:
+                        matrix_w = w_col1
+                    else:
+                        matrix_w = w_col0
+
+                if SAVE_INTERMEDIATE:
+                    # Save the window state after consuming this token
+                    # Layout: [seq(cache line), step, dim, win(K-1)]
+                    base_ptr = (
+                        intermediate_conv_window_ptr
+                        + intermediate_state_batch_coord * stride_inter_seq
+                        + idx_token * stride_inter_step
+                        + idx_feats * stride_inter_dim
+                    )
+
+                    # store itself in KERNEL_WIDTH-2 slot, parent in KERNEL_WIDTH-3 slot, grand-parent in KERNEL_WIDTH-4 slot, ...
+                    if KERNEL_WIDTH - j - 2 >= 0:
+                        tl.store(
+                            base_ptr + (KERNEL_WIDTH - j - 2) * stride_inter_win,
+                            matrix_x,
+                            mask=mask_w,
+                        )
+
+                acc += matrix_x * matrix_w
+
+                # move to parent for next iteration
+                if _idx_token > 0:
+                    _idx_token = tl.sum(
+                        tl.where(idx_tokens == _idx_token, parent_idx_tokens, 0)
+                    )
+                    x_ptrs_1d = x_base_1d + _idx_token * stride_x_token  # [BLOCK_N]
+                    matrix_x = tl.load(x_ptrs_1d, mask=mask_x_1d)
+                else:
+                    # no parent within the current chunk, load from prev conv state: col[-1] (idx 0's parent), col[-2] (idx 0's grand parent), ...
+                    if KERNEL_WIDTH == 2:
+                        if _idx_token == 0:
+                            matrix_x = col0
+                    elif KERNEL_WIDTH == 3:
+                        if _idx_token == 0:
+                            matrix_x = col1
+                        else:
+                            matrix_x = col0
+                    elif KERNEL_WIDTH == 4:
+                        if _idx_token == 0:
+                            matrix_x = col2
+                        elif _idx_token == -1:
+                            matrix_x = col1
+                        else:
+                            matrix_x = col0
+                    _idx_token = _idx_token - 1
+        else:
+            matrix_w = w_col0
+            matrix_x = col0
+
+            for j in tl.static_range(KERNEL_WIDTH):
+                if KERNEL_WIDTH == 2:
+                    if j == 1:  # KERNEL_WIDTH-1:
+                        matrix_w = w_col1
+                        x_ptrs_1d = x_base_1d + idx_token * stride_x_token  # [BLOCK_N]
+                        matrix_x = tl.load(x_ptrs_1d, mask=mask_x_1d)
+                elif KERNEL_WIDTH == 3:
+                    if j == 1:
+                        matrix_w = w_col1
+                        matrix_x = col1
+                    elif j == 2:
+                        matrix_w = w_col2
+                        x_ptrs_1d = x_base_1d + idx_token * stride_x_token  # [BLOCK_N]
+                        matrix_x = tl.load(x_ptrs_1d, mask=mask_x_1d)
+                elif KERNEL_WIDTH == 4:
+                    if j == 1:
+                        matrix_w = w_col1
+                        matrix_x = col1
+                    elif j == 2:
+                        matrix_w = w_col2
+                        matrix_x = col2
+                    elif j == 3:
+                        matrix_w = w_col3
+                        x_ptrs_1d = x_base_1d + idx_token * stride_x_token  # [BLOCK_N]
+                        matrix_x = tl.load(x_ptrs_1d, mask=mask_x_1d)
+
+                acc += matrix_x * matrix_w  # [BLOCK_N]
+
+            if KERNEL_WIDTH == 2:
+                col0 = matrix_x
+            elif KERNEL_WIDTH == 3:
+                col0 = col1
+                col1 = matrix_x
+            elif KERNEL_WIDTH == 4:
+                col0 = col1
+                col1 = col2
+                col2 = matrix_x
+
+            if SAVE_INTERMEDIATE:
+                # Save the window state after consuming this token
+                # Layout: [seq(cache line), step, dim, win(K-1)]
+                base_ptr = (
+                    intermediate_conv_window_ptr
+                    + intermediate_state_batch_coord * stride_inter_seq
+                    + idx_token * stride_inter_step
+                    + idx_feats * stride_inter_dim
+                )
+                if KERNEL_WIDTH >= 2:
+                    tl.store(base_ptr + 0 * stride_inter_win, col0, mask=mask_w)
+                if KERNEL_WIDTH >= 3:
+                    tl.store(base_ptr + 1 * stride_inter_win, col1, mask=mask_w)
+                if KERNEL_WIDTH >= 4:
+                    tl.store(base_ptr + 2 * stride_inter_win, col2, mask=mask_w)
+
+        if SILU_ACTIVATION:
+            acc = acc / (1 + tl.exp(-acc))
+        mask_1d = (idx_token < seqlen) & (
+            idx_feats < dim
+        )  # token-index  # feature-index
+        o_ptrs = (
+            o_ptr
+            + (idx_seq) * stride_o_seq
+            + idx_token * stride_o_token
+            + (idx_feats * stride_o_dim)
+        )
+
+        tl.store(o_ptrs, acc, mask=mask_1d)
+
+        # fuse: store calculated retrieve_parent_token to tensor
+        if HAS_EAGLE_TREE_CUSTOM_ATTN_MASK:
+            tl.store(
+                retrieve_parent_token_ptr
+                + idx_seq * stride_retrieve_parent_token_seq
+                + idx_tokens * stride_retrieve_parent_token_token,
+                parent_idx_tokens,
+                mask=mask_retrieve,
+            )
+
+
+def causal_conv1d_update(
+    x: torch.Tensor,
+    conv_state: torch.Tensor,
+    weight: torch.Tensor,
+    bias: Optional[torch.Tensor] = None,
+    activation: Union[bool, str, None] = None,
+    cache_seqlens: Optional[torch.Tensor] = None,
+    conv_state_indices: Optional[torch.Tensor] = None,
+    num_accepted_tokens: Optional[torch.Tensor] = None,
+    intermediate_conv_window: Optional[torch.Tensor] = None,
+    intermediate_state_indices: Optional[torch.Tensor] = None,
+    retrieve_next_token: Optional[torch.Tensor] = None,
+    retrieve_next_sibling: Optional[torch.Tensor] = None,
+    retrieve_parent_token: Optional[torch.Tensor] = None,
+    pad_slot_id: int = PAD_SLOT_ID,
+    metadata=None,
+    validate_data=False,
+):
+    """
+    x: (batch, dim) or (batch, dim, seqlen)
+        [shape=2: single token prediction]
+        [shape=3: single or multiple tokens prediction]
+    conv_state: (..., dim, state_len), where state_len >= width - 1
+    weight: (dim, width)
+    bias: (dim,)
+    cache_seqlens: (batch,), dtype int32.
+        If not None, the conv_state is treated as a circular buffer.
+        The conv_state will be updated by copying x to the conv_state
+        starting at the index
+        @cache_seqlens % state_len.
+    conv_state_indices: (batch,), dtype int32
+        If not None, the conv_state is a larger tensor along the batch dim,
+        and we are selecting the batch coords specified by conv_state_indices.
+        Useful for a continuous batching scenario.
+    pad_slot_id: int
+            if cache_indices is passed, lets the kernel identify padded
+            entries that will not be processed,
+            for example: cache_indices = [pad_slot_id, 1 ,20 ,pad_slot_id]
+            in this case, the kernel will not process entries at
+            indices 0 and 3
+    out: (batch, dim) or (batch, dim, seqlen)
+    """
+    if validate_data:
+        assert cache_seqlens is None  # not implemented yet - ok for vLLM
+        assert pad_slot_id is not None
+        assert x.stride(1) == 1
+    if isinstance(activation, bool):
+        activation = "silu" if activation is True else None
+    elif activation is not None:
+        assert activation in ["silu", "swish"]
+    unsqueeze = x.dim() == 2
+    if unsqueeze:
+        # make it (batch, dim, seqlen) with seqlen == 1
+        x = x.unsqueeze(-1)
+    batch, dim, seqlen = x.shape
+    _, width = weight.shape
+    # conv_state: (..., dim, state_len), where state_len >= width - 1
+    num_cache_lines, _, state_len = conv_state.size()
+
+    if validate_data:
+        assert dim == weight.size(0)
+        assert (
+            conv_state.stride(-2) == 1
+        ), f"ERROR: expect contiguous along feat-dim of conv_state (currently stride={conv_state.stride()})"
+        assert state_len >= width - 1
+        # when above happens, we don't shift-left to keep any records in conv_state
+        assert dim == conv_state.size(1)
+        if conv_state_indices is None:
+            assert conv_state.size(0) >= batch
+        else:
+            assert (batch,) == conv_state_indices.shape
+            assert intermediate_state_indices is not None
+            assert (batch,) == intermediate_state_indices.shape
+
+        assert num_cache_lines >= batch
+        assert weight.stride(1) == 1  # Need this
+        assert cache_seqlens is None  # not needed for vLLM - circular buffer
+
+    # adopt the strategy in vLLM that overwrite on 'x' directly, rather than creating a new tensor 'o'
+    out = torch.empty_like(x)
+    stride_w_dim, stride_w_width = weight.stride()
+
+    stride_x_seq, stride_x_dim, stride_x_token = x.stride()  # X (batch, dim, seqlen)
+
+    stride_o_seq, stride_o_dim, stride_o_token = out.stride()
+    stride_istate_seq, stride_istate_dim, stride_istate_token = conv_state.stride()
+    stride_state_indices = (
+        conv_state_indices.stride(0) if conv_state_indices is not None else 0
+    )
+    stride_intermediate_state_indices = (
+        intermediate_state_indices.stride(0)
+        if intermediate_state_indices is not None
+        else 0
+    )
+    if num_accepted_tokens is not None:
+        state_len = width - 1 + (seqlen - 1)  # effective state_len needed
+    else:
+        state_len = width - 1
+    np2_statelen = triton.next_power_of_2(state_len)
+    np2_seqlen = triton.next_power_of_2(seqlen)
+
+    def grid(META):
+        return (
+            batch,
+            triton.cdiv(dim, META["BLOCK_N"]),
+        )
+
+    # prepare intermediate buffer strides if provided
+    if intermediate_conv_window is not None:
+        stride_inter_seq, stride_inter_step, stride_inter_dim, stride_inter_win = (
+            intermediate_conv_window.stride(0),
+            intermediate_conv_window.stride(1),
+            intermediate_conv_window.stride(2),
+            intermediate_conv_window.stride(3),
+        )
+    else:
+        stride_inter_seq = stride_inter_step = stride_inter_dim = stride_inter_win = 0
+
+    # prepare retrieve next token buffer strides if provided
+    if retrieve_next_token is not None:
+        stride_retrieve_next_token_seq, stride_retrieve_next_token_token = (
+            retrieve_next_token.stride(0),
+            retrieve_next_token.stride(1),
+        )
+    else:
+        stride_retrieve_next_token_seq = stride_retrieve_next_token_token = 0
+
+    # prepare retrieve next sibling buffer strides if provided
+    if retrieve_next_sibling is not None:
+        stride_retrieve_next_sibling_seq, stride_retrieve_next_sibling_token = (
+            retrieve_next_sibling.stride(0),
+            retrieve_next_sibling.stride(1),
+        )
+    else:
+        stride_retrieve_next_sibling_seq = stride_retrieve_next_sibling_token = 0
+
+    # prepare retrieve parent token buffer strides if provided
+    if retrieve_parent_token is not None:
+        stride_retrieve_parent_token_seq, stride_retrieve_parent_token_token = (
+            retrieve_parent_token.stride(0),
+            retrieve_parent_token.stride(1),
+        )
+    else:
+        stride_retrieve_parent_token_seq = stride_retrieve_parent_token_token = 0
+
+    _causal_conv1d_update_kernel[grid](
+        # Pointers to matrices
+        x,
+        weight,
+        bias,
+        conv_state,
+        cache_seqlens,
+        conv_state_indices,
+        num_accepted_tokens,
+        intermediate_conv_window if intermediate_conv_window is not None else x,
+        intermediate_state_indices,
+        retrieve_next_token,
+        retrieve_next_sibling,
+        retrieve_parent_token,
+        out,
+        # Matrix dimensions
+        batch,
+        dim,
+        seqlen,
+        state_len,
+        num_cache_lines,
+        # stride
+        stride_x_seq,
+        stride_x_dim,
+        stride_x_token,
+        stride_w_dim,
+        stride_w_width,
+        stride_istate_seq,
+        stride_istate_dim,
+        stride_istate_token,
+        stride_state_indices,
+        stride_inter_seq,
+        stride_inter_step,
+        stride_inter_dim,
+        stride_inter_win,
+        stride_intermediate_state_indices,
+        stride_retrieve_next_token_seq,
+        stride_retrieve_next_token_token,
+        stride_retrieve_next_sibling_seq,
+        stride_retrieve_next_sibling_token,
+        stride_retrieve_parent_token_seq,
+        stride_retrieve_parent_token_token,
+        stride_o_seq,
+        stride_o_dim,
+        stride_o_token,
+        # others
+        pad_slot_id,
+        # META
+        HAS_BIAS=bias is not None,
+        KERNEL_WIDTH=width,
+        SILU_ACTIVATION=activation in ["silu", "swish"],
+        IS_CONTINUOUS_BATCHING=conv_state_indices is not None,
+        IS_SPEC_DECODING=num_accepted_tokens is not None,
+        NP2_STATELEN=np2_statelen,
+        NP2_SEQLEN=np2_seqlen,
+        USE_PAD_SLOT=pad_slot_id is not None,
+        BLOCK_N=256,
+        SAVE_INTERMEDIATE=intermediate_conv_window is not None,
+        HAS_EAGLE_TREE_CUSTOM_ATTN_MASK=retrieve_next_token is not None,
+    )
+    if unsqueeze:
+        out = out.squeeze(-1)
+    return out
diff --git a/sglang/python/sglang/srt/layers/attention/mamba/mamba.py b/sglang/python/sglang/srt/layers/attention/mamba/mamba.py
new file mode 100644
index 0000000000000000000000000000000000000000..46d0d5b3f951e138432efa0d0dde17c04e2b9d5d
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/mamba/mamba.py
@@ -0,0 +1,697 @@
+from typing import Callable, List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+
+from sglang.srt.configs.mamba_utils import (
+    Mamba2CacheParams,
+    extra_groups_for_head_shards,
+)
+from sglang.srt.distributed import (
+    divide,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.layers.attention.mamba.mamba2_metadata import Mamba2Metadata
+from sglang.srt.layers.attention.mamba.mixer2_rms_norm_gated import Mixer2RMSNormGated
+from sglang.srt.layers.attention.mamba.ops import (
+    mamba_chunk_scan_combined,
+    selective_state_update,
+)
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.mem_cache.memory_pool import MambaPool
+from sglang.srt.model_loader.weight_utils import (
+    composed_weight_loader,
+    sharded_weight_loader,
+)
+from sglang.srt.utils import is_cpu, is_cuda, is_npu, set_weight_attrs
+
+if is_cuda():
+    from sglang.srt.layers.attention.mamba.causal_conv1d import (
+        causal_conv1d_fn,
+        causal_conv1d_update,
+    )
+    from sglang.srt.layers.attention.mamba.causal_conv1d_triton import (
+        causal_conv1d_fn as causal_conv1d_fn_triton,
+    )
+    from sglang.srt.layers.attention.mamba.causal_conv1d_triton import (
+        causal_conv1d_update as causal_conv1d_update_triton,
+    )
+elif is_npu():
+    from sgl_kernel_npu.mamba.causal_conv1d import (
+        causal_conv1d_fn_npu as causal_conv1d_fn,
+    )
+    from sgl_kernel_npu.mamba.causal_conv1d import (
+        causal_conv1d_update_npu as causal_conv1d_update,
+    )
+
+LoaderFunction = Callable[[torch.Tensor, torch.Tensor], None]
+
+
+def mamba_v2_sharded_weight_loader(
+    shard_spec: List[Tuple[int, int, float]],
+    tp_size: int,
+    tp_rank: int,
+) -> LoaderFunction:
+    """Create a weight loader for mamba v2. This ensures that the projections
+    are correctly sharded so that they can be split into x, B, C. It also
+    ensures the the all the groups corresponding to a head shard is placed
+    together with it.
+    """
+
+    def loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
+
+        # - track boundary of (sharded) param, and loaded_weight, respectively
+        boundary, loaded_boundary = 0, 0
+
+        # Calculate padding size for CPU when TP odd size
+        if is_cpu():
+            full_dim_sum = 0
+            full_dim_list = []
+            weight_full_dim_list = []
+            for full_dim, _, _ in shard_spec:
+                full_dim_sum = full_dim_sum + full_dim
+                full_dim_list.append(full_dim)
+            for full_dim in full_dim_list:
+                weight_full_dim_list.append(
+                    int(full_dim / full_dim_sum * loaded_weight.size(0))
+                )
+
+        # - iterate over the shard specs
+        for full_dim, extra, duplicate_groups in shard_spec:
+            # - full dim is the model dim (before TP).
+            # - extra > 0, means there is expected overall increase
+            #   of dimensions. This is so because of replication.
+            # - ratio is used map the tp_rank to the actual shard
+            #   rank. This is useful when there is replication of
+            #   groups to accompany head shards.
+
+            # - size of the loaded shard
+            shard_size = full_dim // tp_size
+
+            # - compute the rank into the loaded shard.
+            # - if there is replication, different TP shards will
+            #   take from the same rank.
+            # NOTE: currently we only support duplication
+            # in the case where num_groups == 1
+            rank = 0 if duplicate_groups else tp_rank
+
+            # - leftmost boundary index into loaded weight.
+            loaded_skip = rank * shard_size
+            loaded_start_idx = loaded_boundary + loaded_skip
+
+            # - take these many dims from the loaded weight.
+            take = min(shard_size, full_dim - extra - loaded_skip)
+
+            # CPU logic of padding size for qwen3-next
+            # TODO : make this common for all mamba.
+            if is_cpu() and loaded_weight.size(0) % tp_size != 0:
+                import copy
+
+                loaded_weight_ = copy.deepcopy(loaded_weight)
+                q, k, v = torch.split(
+                    loaded_weight_,
+                    weight_full_dim_list,
+                    dim=0,
+                )
+                pad_qk = torch.zeros(
+                    full_dim_list[0] - weight_full_dim_list[0],
+                    loaded_weight.size(1),
+                    loaded_weight.size(2),
+                ).to(loaded_weight.dtype)
+                pad_v = torch.zeros(
+                    full_dim_list[2] - weight_full_dim_list[2],
+                    loaded_weight.size(1),
+                    loaded_weight.size(2),
+                ).to(loaded_weight.dtype)
+                q = torch.cat((q, pad_qk), dim=0)
+                k = torch.cat((k, pad_qk), dim=0)
+                v = torch.cat((v, pad_v), dim=0)
+                loaded_weight_qk = torch.cat((q, k), dim=0)
+                loaded_weight = torch.cat((loaded_weight_qk, v), dim=0)
+
+            # - always shard on dim 0
+            # - the ignore is for a mundane mypy error as it does not
+            #   seem to handle slices well.
+            # https://github.com/python/mypy/issues/2410
+            param.data[
+                boundary : (boundary + take), ...  # type: ignore[misc]
+            ] = loaded_weight[
+                loaded_start_idx : (loaded_start_idx + take)  # type: ignore[misc]
+            ]  # type: ignore[misc]
+
+            # move indexing boundaries
+            boundary += shard_size
+            loaded_boundary += full_dim - extra
+
+    return loader
+
+
+class MambaMixer2(torch.nn.Module):
+    """
+    Compute ∆, A, B, C, and D the state space parameters and compute
+    the `contextualized_states`. A, D are input independent
+    (see Mamba paper [1] Section 3.5.2 "Interpretation of A"
+    for why A isn't selective) ∆, B, C are input-dependent
+    (this is a key difference between Mamba and the linear time
+    invariant S4, and is why Mamba is called
+    **selective** state spaces)
+    """
+
+    def __init__(
+        self,
+        cache_params: Mamba2CacheParams,
+        hidden_size: int,
+        use_conv_bias: bool,
+        use_bias: bool,
+        n_groups: int = 1,
+        rms_norm_eps: float = 1e-5,
+        activation: str = "silu",
+        use_rms_norm: bool = True,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        # For TP, the sharding plan is as follows:
+        # - for the conv modules, since
+        #   conv_dim = intermediate_size * 2 * n_groups * ssm_state_size,
+        #   we shard intermediate_size and n_groups
+        # - since intermediate_size = n_heads * head_dim, sharding on
+        #   intermediate_size is achieved by sharding on n_heads.
+        # - IF, world_size divides groups, then sharding
+        #   (n_groups / world_size, n_heads / world_size)
+        #   also maintains the invariant n_heads % n_groups == 0
+        # - HOWEVER IF, world_size DOES NOT divide groups, then we need
+        #   to allocate extra space in the shard, such that groups
+        #   may be replicated to follow the head shard.
+        # - NOTE: currently for the world size DOES NOT divide groups
+        #   case, we only support the case when n_groups == 1
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank = get_tensor_model_parallel_rank()
+
+        self.num_heads = num_heads = cache_params.shape.num_heads
+        self.head_dim = cache_params.shape.head_dim
+
+        assert (
+            num_heads % self.tp_size == 0
+        ), "Tensor parallel world size must divide num heads."
+
+        assert (n_groups % self.tp_size) == 0 or n_groups == 1, (
+            "If tensor parallel world size does not divide num_groups, "
+            "then num_groups must equal 1."
+        )
+
+        assert (
+            (n_groups % self.tp_size == 0) or self.tp_size == 1 or quant_config is None
+        ), (
+            "Tensor parallel currently supported for quantized models only "
+            "if tensor parallel world size divides num groups."
+        )
+
+        self.ssm_state_size = cache_params.shape.ssm_state_size
+        self.activation = activation
+
+        conv_kernel_size = cache_params.shape.conv_kernel
+        self.intermediate_size = intermediate_size = (
+            cache_params.shape.intermediate_size
+        )
+        self.n_groups = n_groups
+        if n_groups % self.tp_size != 0:
+            # - for TP we shard conv_dim by sharding on n_groups,
+            # - but if n_groups cannot divide tp_size, we need to
+            #   extend some extra groups
+            groups = extra_groups_for_head_shards(n_groups, self.tp_size)
+            self.n_groups = n_groups + groups
+        self.groups_ssm_state_size = self.n_groups * self.ssm_state_size
+        self.conv_dim = cache_params.shape.conv_dim
+
+        if n_groups % self.tp_size == 0:
+            self.conv1d = MergedColumnParallelLinear(
+                input_size=conv_kernel_size,
+                output_sizes=[
+                    intermediate_size,
+                    self.groups_ssm_state_size,
+                    self.groups_ssm_state_size,
+                ],
+                bias=use_conv_bias,
+                quant_config=None,
+                prefix=f"{prefix}.conv1d",
+            )
+
+            self.in_proj = MergedColumnParallelLinear(
+                input_size=hidden_size,
+                output_sizes=[
+                    intermediate_size,
+                    intermediate_size,
+                    self.groups_ssm_state_size,
+                    self.groups_ssm_state_size,
+                    self.num_heads,
+                ],
+                bias=use_bias,
+                quant_config=quant_config,
+                prefix=f"{prefix}.in_proj",
+            )
+        else:
+            # This is the n_groups == 1 case,
+            # where we need to duplicate groups if TP>1.
+
+            self.conv1d = ColumnParallelLinear(
+                input_size=conv_kernel_size,
+                output_size=self.conv_dim,
+                bias=use_conv_bias,
+                quant_config=None,
+                prefix=f"{prefix}.conv1d",
+            )
+
+            self.in_proj = ColumnParallelLinear(
+                input_size=hidden_size,
+                output_size=intermediate_size + self.conv_dim + self.num_heads,
+                bias=use_bias,
+                quant_config=quant_config,
+                prefix=f"{prefix}.in_proj",
+            )
+
+            # - because in_proj is a concatenation of 3 weights, we
+            #   need to interleave them before sharding
+            # - use the custom weight loader mamba_v2_sharded_weight_loader
+            #   for conv1d.bias, covn1d.weight and in_proj.weight
+            # - need to set these settings, to assign the groups
+            #   to the head shards
+            group_shard_settings = (
+                self.groups_ssm_state_size,  # expected model size
+                (self.n_groups - n_groups) * self.ssm_state_size,  # extra dims assigned
+                n_groups == 1,  # if there was only one group
+            )
+            intermediate_settings = (intermediate_size, 0, False)
+            head_settings = (self.num_heads, 0, False)
+
+            # - the weight already has a "weight_loader" attribute
+            #   which set_weight_attrs will raise if we do not
+            #   delete before trying to override it
+            # - ditto for the other two weights below
+            delattr(self.conv1d.bias, "weight_loader")
+            set_weight_attrs(
+                self.conv1d.bias,
+                {
+                    "weight_loader": mamba_v2_sharded_weight_loader(
+                        [
+                            intermediate_settings,
+                            group_shard_settings,
+                            group_shard_settings,
+                        ],
+                        self.tp_size,
+                        self.tp_rank,
+                    )
+                },
+            )
+
+            delattr(self.conv1d.weight, "weight_loader")
+            set_weight_attrs(
+                self.conv1d.weight,
+                {
+                    "weight_loader": mamba_v2_sharded_weight_loader(
+                        [
+                            intermediate_settings,
+                            group_shard_settings,
+                            group_shard_settings,
+                        ],
+                        self.tp_size,
+                        self.tp_rank,
+                    )
+                },
+            )
+
+            if quant_config is None:
+                # - quant layers do not have a weight loader
+                delattr(self.in_proj.weight, "weight_loader")
+                set_weight_attrs(
+                    self.in_proj.weight,
+                    {
+                        "weight_loader": mamba_v2_sharded_weight_loader(
+                            [
+                                intermediate_settings,  # for gate
+                                intermediate_settings,
+                                group_shard_settings,
+                                group_shard_settings,
+                                head_settings,  # for dt
+                            ],
+                            self.tp_size,
+                            self.tp_rank,
+                        )
+                    },
+                )
+
+        # unsqueeze to fit conv1d weights shape into the linear weights shape.
+        # Can't do this in `weight_loader` since it already exists in
+        # `ColumnParallelLinear` and `MergedColumnParallelLinear`,
+        # and `set_weight_attrs` doesn't allow to override it
+        self.conv1d.weight.data = self.conv1d.weight.data.unsqueeze(1)
+
+        # - these are TPed by heads to reduce the size of the
+        #   temporal shape
+        self.A = nn.Parameter(
+            torch.empty(
+                divide(num_heads, self.tp_size),
+                dtype=torch.float32,
+            )
+        )
+        self.D = nn.Parameter(torch.ones(num_heads // self.tp_size))
+        self.dt_bias = nn.Parameter(torch.ones(num_heads // self.tp_size))
+        self.use_rms_norm = use_rms_norm
+
+        set_weight_attrs(self.D, {"weight_loader": sharded_weight_loader(0)})
+        a_weight_loader = composed_weight_loader(
+            sharded_weight_loader(0), lambda x: -torch.exp(x.float())
+        )
+        set_weight_attrs(self.A, {"weight_loader": a_weight_loader})
+        set_weight_attrs(self.dt_bias, {"weight_loader": sharded_weight_loader(0)})
+
+        self.out_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=use_bias,
+            input_is_parallel=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.out_proj",
+        )
+
+        self.norm = Mixer2RMSNormGated(
+            intermediate_size, n_groups, self.use_rms_norm, eps=rms_norm_eps
+        )
+
+        self.prefix = prefix
+
+    def forward(
+        self,
+        *,
+        hidden_states: torch.Tensor,
+        output: torch.Tensor,
+        layer_cache: MambaPool.State,
+        metadata: Mamba2Metadata,
+        mup_vector: Optional[torch.Tensor] = None,
+        use_triton_causal_conv: bool = False,
+    ):
+        # metadata contains metadata necessary for the mamba2 triton
+        # kernels to operate in continuous batching and in chunked prefill
+        # modes; they are computed at top-level model forward since they
+        # stay the same and reused for all mamba layers in the same iteration
+        state_indices_tensor = metadata.mamba_cache_indices
+        conv_state = layer_cache.conv[0]
+        ssm_state = layer_cache.temporal
+
+        query_start_loc = metadata.query_start_loc
+
+        # 1. Gated MLP's linear projection
+        projected_states, _ = self.in_proj(hidden_states)
+
+        if mup_vector is not None:
+            projected_states = projected_states * mup_vector
+
+        gate, hidden_states_B_C, dt = torch.split(
+            projected_states,
+            [
+                self.intermediate_size // self.tp_size,
+                self.conv_dim // self.tp_size,
+                self.num_heads // self.tp_size,
+            ],
+            dim=-1,
+        )
+        conv_weights = self.conv1d.weight.view(
+            self.conv1d.weight.size(0), self.conv1d.weight.size(2)
+        )
+
+        # - get hidden_states, B and C after depthwise convolution.
+        split_hidden_states_B_C_fn = lambda hidden_states_B_C: torch.split(
+            hidden_states_B_C,
+            [
+                self.intermediate_size // self.tp_size,
+                self.groups_ssm_state_size // self.tp_size,
+                self.groups_ssm_state_size // self.tp_size,
+            ],
+            dim=-1,
+        )
+
+        num_prefills = metadata.num_prefills  # request count
+        num_decodes = metadata.num_decodes  # token count (=request)
+        num_decode_tokens = (
+            num_decodes * metadata.draft_token_num
+            if metadata.is_target_verify
+            else num_decodes
+        )
+        num_prefill_tokens = metadata.num_prefill_tokens  # token count
+        has_prefill = num_prefills > 0
+        has_decode = num_decodes > 0
+        num_actual_tokens = num_prefill_tokens + num_decode_tokens
+        assert num_actual_tokens == projected_states.shape[0]
+
+        # NOTE: V0 put prefill before decode
+        # Separate prefill and decode by splitting varlen input
+        # Split along token dimension
+        hidden_states_B_C_p, hidden_states_B_C_d = torch.split(
+            hidden_states_B_C,
+            [num_prefill_tokens, num_decode_tokens],
+            dim=0,
+        )
+        dt_p, dt_d = torch.split(
+            dt,
+            [num_prefill_tokens, num_decode_tokens],
+            dim=0,
+        )
+        # Split along batch dimension
+        state_indices_tensor_p, state_indices_tensor_d = torch.split(
+            state_indices_tensor,
+            [num_prefills, num_decodes],
+            dim=0,
+        )
+        query_start_loc_p = query_start_loc[: num_prefills + 1] if has_prefill else None
+
+        # Preallocate output tensor to avoid memcpy cost for merging prefill
+        # and decode outputs
+
+        preallocated_ssm_out = torch.empty(
+            [
+                projected_states.shape[0],
+                (self.num_heads * self.head_dim) // self.tp_size,
+            ],
+            dtype=hidden_states.dtype,
+            device=hidden_states.device,
+        )
+        preallocated_ssm_out_p, preallocated_ssm_out_d = torch.split(
+            preallocated_ssm_out,
+            [num_prefill_tokens, num_decode_tokens],
+            dim=0,
+        )
+
+        # Process prefill requests
+        if has_prefill:
+            mixed_metadata = metadata.mixed_metadata
+            assert mixed_metadata is not None
+            # 2. Convolution sequence transformation
+            # - "cache_indices" updates the conv_state cache in positions
+            #   pointed to by "state_indices_tensor"
+            has_initial_states_p = mixed_metadata.has_initial_states
+            prep_initial_states = mixed_metadata.prep_initial_states
+            cache_indices = state_indices_tensor_p
+            x = hidden_states_B_C_p.transpose(
+                0, 1
+            )  # this is the form that causal-conv see
+            ccfn = (
+                causal_conv1d_fn
+                if not use_triton_causal_conv
+                else causal_conv1d_fn_triton
+            )
+            hidden_states_B_C_p = ccfn(
+                x,
+                conv_weights,
+                self.conv1d.bias,
+                activation=self.activation,
+                conv_states=conv_state,
+                has_initial_state=has_initial_states_p,
+                cache_indices=cache_indices,
+                query_start_loc=query_start_loc_p,
+                seq_lens_cpu=mixed_metadata.extend_seq_lens_cpu,
+            ).transpose(0, 1)[:num_prefill_tokens]
+
+            hidden_states_p, B_p, C_p = split_hidden_states_B_C_fn(hidden_states_B_C_p)
+
+            # 3. State Space Model sequence transformation
+            initial_states = None
+            if has_initial_states_p is not None and prep_initial_states:
+                initial_states = torch.where(
+                    has_initial_states_p[:, None, None, None],
+                    ssm_state[state_indices_tensor_p],
+                    0,
+                )
+
+            # NOTE: final output is an in-place update of out tensor
+            varlen_state = mamba_chunk_scan_combined(
+                hidden_states_p.view(
+                    1, num_prefill_tokens, self.num_heads // self.tp_size, self.head_dim
+                ),
+                dt_p.unsqueeze(0),
+                self.A,
+                B_p.view(1, num_prefill_tokens, self.n_groups // self.tp_size, -1),
+                C_p.view(1, num_prefill_tokens, self.n_groups // self.tp_size, -1),
+                chunk_size=mixed_metadata.chunk_size,
+                D=self.D,
+                z=None,
+                dt_bias=self.dt_bias,
+                seq_idx=mixed_metadata.seq_idx,
+                chunk_indices=mixed_metadata.chunk_indices,
+                chunk_offsets=mixed_metadata.chunk_offsets,
+                cu_seqlens=query_start_loc_p,
+                initial_states=initial_states,
+                return_varlen_states=True,
+                return_final_states=False,
+                dt_softplus=True,
+                dt_limit=(0.0, float("inf")),
+                out=preallocated_ssm_out_p.view(
+                    1, num_prefill_tokens, -1, self.head_dim
+                ),
+                state_dtype=ssm_state.dtype,
+            )
+
+            # update ssm states
+            # - varlen state is a (num_prefills, nheads, headdim, dstate) tensor
+            ssm_state[state_indices_tensor_p] = varlen_state
+
+        # Process decode requests
+        if has_decode:
+            is_target_verify = metadata.is_target_verify
+
+            # 2. Convolution sequence transformation
+            if is_target_verify:
+                assert (
+                    use_triton_causal_conv
+                ), "Speculative decoding requires use_triton_causal_conv=True for intermediate state support"
+                assert isinstance(
+                    layer_cache, MambaPool.SpeculativeState
+                ), "layer_cache must be SpeculativeState for speculative decoding"
+                draft_token_num = metadata.draft_token_num
+                self.intermediate_state_indices = torch.arange(
+                    num_decodes, dtype=torch.int32, device=state_indices_tensor_d.device
+                )
+
+                # Reshape for batch processing
+                hidden_states_B_C_d_reshaped = hidden_states_B_C_d.view(
+                    num_decodes, draft_token_num, -1
+                ).transpose(1, 2)
+
+                hidden_states_B_C_d_processed = causal_conv1d_update_triton(
+                    hidden_states_B_C_d_reshaped,
+                    conv_state,
+                    conv_weights,
+                    self.conv1d.bias,
+                    self.activation,
+                    conv_state_indices=state_indices_tensor_d[:num_decodes],
+                    intermediate_conv_window=layer_cache.intermediate_conv_window[0],
+                    intermediate_state_indices=self.intermediate_state_indices,
+                    retrieve_next_token=metadata.retrieve_next_token,
+                    retrieve_next_sibling=metadata.retrieve_next_sibling,
+                    retrieve_parent_token=metadata.retrieve_parent_token,
+                )
+                hidden_states_B_C_d = hidden_states_B_C_d_processed.transpose(
+                    1, 2
+                ).view(num_decode_tokens, -1)
+            else:
+                ccu = (
+                    causal_conv1d_update
+                    if not use_triton_causal_conv
+                    else causal_conv1d_update_triton
+                )
+                hidden_states_B_C_d = ccu(
+                    hidden_states_B_C_d,
+                    conv_state,
+                    conv_weights,
+                    self.conv1d.bias,
+                    self.activation,
+                    conv_state_indices=state_indices_tensor_d,
+                )
+
+            hidden_states_d, B_d, C_d = split_hidden_states_B_C_fn(hidden_states_B_C_d)
+
+            # 3. State Space Model sequence transformation
+            n_groups = self.n_groups // self.tp_size
+            A_d = (
+                self.A[:, None, ...][:, :, None]
+                .expand(-1, self.head_dim, self.ssm_state_size)
+                .to(dtype=torch.float32)
+            )
+            dt_d = dt_d[:, :, None].expand(-1, -1, self.head_dim)
+            dt_bias = self.dt_bias[:, None, ...].expand(-1, self.head_dim)
+            D_d = self.D[:, None, ...].expand(-1, self.head_dim)
+            B_d = B_d.view(-1, n_groups, B_d.shape[1] // n_groups)
+            C_d = C_d.view(-1, n_groups, C_d.shape[1] // n_groups)
+            hidden_states_d = hidden_states_d.view(
+                -1, self.num_heads // self.tp_size, self.head_dim
+            )
+
+            if is_target_verify:
+                selective_state_update(
+                    ssm_state,
+                    hidden_states_d.view(
+                        num_decodes,
+                        draft_token_num,
+                        self.num_heads // self.tp_size,
+                        self.head_dim,
+                    ),
+                    dt_d.view(
+                        num_decodes,
+                        draft_token_num,
+                        self.num_heads // self.tp_size,
+                        self.head_dim,
+                    ),
+                    A_d,
+                    B_d.view(num_decodes, draft_token_num, n_groups, -1),
+                    C_d.view(num_decodes, draft_token_num, n_groups, -1),
+                    D_d,
+                    z=None,
+                    dt_bias=dt_bias,
+                    dt_softplus=True,
+                    state_batch_indices=state_indices_tensor_d[:num_decodes],
+                    out=preallocated_ssm_out_d.view(
+                        num_decodes,
+                        draft_token_num,
+                        self.num_heads // self.tp_size,
+                        self.head_dim,
+                    ),
+                    disable_state_update=True,
+                    intermediate_states_buffer=layer_cache.intermediate_ssm,
+                    cache_steps=draft_token_num,
+                    retrieve_parent_token=metadata.retrieve_parent_token,
+                    intermediate_state_indices=self.intermediate_state_indices,
+                )
+            else:
+                selective_state_update(
+                    ssm_state,
+                    hidden_states_d,
+                    dt_d,
+                    A_d,
+                    B_d,
+                    C_d,
+                    D_d,
+                    z=None,
+                    dt_bias=dt_bias,
+                    dt_softplus=True,
+                    state_batch_indices=state_indices_tensor_d,
+                    out=preallocated_ssm_out_d.view(num_decodes, -1, self.head_dim),
+                )
+
+        # 4. gated MLP
+        # GatedRMSNorm internally applying SiLU to the gate
+        # SiLU is applied internally before normalization, unlike standard
+        # norm usage
+        hidden_states = self.norm(preallocated_ssm_out, gate[:num_actual_tokens])
+
+        # 5. Final linear projection
+        output[:num_actual_tokens], _ = self.out_proj(hidden_states)
+
+    @property
+    def mamba_type(self) -> str:
+        return "mamba2"
diff --git a/sglang/python/sglang/srt/layers/attention/mamba/mamba2_metadata.py b/sglang/python/sglang/srt/layers/attention/mamba/mamba2_metadata.py
new file mode 100644
index 0000000000000000000000000000000000000000..5eeb2b65e307f51da87b9aa1b9f5652680289332
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/mamba/mamba2_metadata.py
@@ -0,0 +1,249 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# Adapted from https://github.com/vllm-project/vllm/blob/2c58742dff8613a3bd7496f2008ce927e18d38d1/vllm/model_executor/layers/mamba/mamba2_metadata.py
+
+
+import math
+from dataclasses import dataclass
+from typing import Optional
+
+import torch
+
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+
+
+@dataclass(kw_only=True)
+class ForwardMetadata:
+    query_start_loc: torch.Tensor
+    mamba_cache_indices: torch.Tensor
+    # For topk > 1 eagle
+    retrieve_next_token: Optional[torch.Tensor] = None
+    retrieve_next_sibling: Optional[torch.Tensor] = None
+    retrieve_parent_token: Optional[torch.Tensor] = None
+    # For prefill radix cache
+    track_conv_indices: Optional[torch.Tensor] = None
+    track_ssm_h_src: Optional[torch.Tensor] = None
+    track_ssm_h_dst: Optional[torch.Tensor] = None
+    track_ssm_final_src: Optional[torch.Tensor] = None
+    track_ssm_final_dst: Optional[torch.Tensor] = None
+
+    is_target_verify: bool = False
+    draft_token_num: int = 1
+
+
+@dataclass(kw_only=True)
+class Mamba2Metadata(ForwardMetadata):
+    """stable metadata across all mamba2 layers in the forward pass"""
+
+    num_prefills: int
+    num_prefill_tokens: int
+    num_decodes: int
+
+    @dataclass(kw_only=True, frozen=True)
+    class MixedMetadata:
+        has_initial_states: torch.Tensor
+        prep_initial_states: bool
+
+        chunk_size: int
+        seq_idx: torch.Tensor
+        chunk_indices: torch.Tensor
+        chunk_offsets: torch.Tensor
+
+        extend_seq_lens_cpu: list[int]
+
+    mixed_metadata: MixedMetadata | None = None
+    """`mixed_metadata` is used for extend/mixed requests"""
+
+    @staticmethod
+    def _query_start_loc_to_chunk_indices_offsets(
+        query_start_loc: torch.Tensor, chunk_size: int, total_seqlens: int
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Args:
+            query_start_loc (torch.Tensor): 1D tensor of cumulative sequence
+                lengths, shape (num_seqs + 1,).
+                The first element should be 0. Each entry represents the starting
+                index of a sequence in the flattened token array.
+            chunk_size (int): The size of each physical mamba chunk
+                (number of tokens per chunk).
+            total_seqlens (int): The total number of tokens in the batch.
+
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
+                - chunk_indices (torch.Tensor): 1D tensor of indices
+                    indicating the physical chunk for each logical chunk.
+                - chunk_offsets (torch.Tensor): 1D tensor of offsets
+                    indicating the starting index of each logical chunk within
+                    its physical chunk.
+
+        This function computes the chunk indices and offsets for the given
+        query_start_loc and chunk_size. Both are tensors of integers with length N,
+        where N is the number of logical (pseudo) chunks.
+        A logical chunk is a sequence of tokens that are all part of the same
+        sequence and are all in the same physical mamba chunk.
+        In other words, a logical chunk changes every time we cross a sequence
+        boundary or a physical mamba chunk boundary.
+        Logical chunks are needed to handle batched requests with initial states
+        (see _state_passing_fwd and _chunk_scan_fwd).
+        The chunk_indices tensor contains the index of the physical chunk for each
+        logical chunk.
+        The chunk_offsets tensor contains the offset (AKA starting index) of the
+        logical chunk in the physical chunk.
+
+        Example:
+        query_start_loc = [0, 5, 10]
+        chunk_size = 8
+        total_seqlens = 10
+        -> chunk_indices = [0, 0, 1]
+        -> chunk_offsets = [0, 5, 0]
+
+        In this example, we have 2 sequences, each with 5 tokens. The physical
+        chunk size is 8 tokens.
+        We have three logical chunks:
+        - the first logical chunk starts at token 0 in the first physical chunk
+            and contains all 5 tokens from the first sequence
+        - the second logical chunk starts at token 5 in the first physical chunk
+            and contains first 3 tokens from the second sequence
+        - the third logical chunk starts at token 0 in the second physical chunk
+            and contains the remaining 2 tokens from the second sequence
+        """
+
+        cu_seqlens = query_start_loc[1:]  # remove prepended 0
+
+        # outputs will have length expansion of chunks that do not divide
+        # chunk_size
+        N = (
+            math.ceil(total_seqlens / chunk_size)
+            + (cu_seqlens[:-1] % chunk_size > 0).sum()
+        )
+        chunk_indices = torch.arange(N, dtype=torch.int, device=query_start_loc.device)
+        chunk_offsets = torch.zeros(
+            (N,), dtype=torch.int, device=query_start_loc.device
+        )
+
+        p = 0  # num of insertions
+        for s, e in zip(cu_seqlens[:-1], cu_seqlens[1:]):
+
+            # if does not divide chunk_size, then there is one chunk insertion
+            p += s % chunk_size > 0
+
+            # get the dimensions
+            # - the + 1 for _e is to shift the boundary by one chunk
+            # - this shifting is not needed if chunk_size divides e
+            _s, _e = s // chunk_size + p, e // chunk_size + p + (e % chunk_size > 0)
+
+            # adjust indices and offsets
+            chunk_indices[_s:_e] -= p
+            chunk_offsets[_s] = s % chunk_size
+
+        return chunk_indices, chunk_offsets
+
+    @staticmethod
+    def prepare_decode(
+        forward_metadata: ForwardMetadata,
+        seq_lens: torch.Tensor,
+        *,
+        is_target_verify: bool,
+        draft_token_num: int,
+    ) -> "Mamba2Metadata":
+        """This path is run during CUDA graph capture, i.e. decode only, so `num_prefills` is 0"""
+        return Mamba2Metadata(
+            query_start_loc=forward_metadata.query_start_loc,
+            mamba_cache_indices=forward_metadata.mamba_cache_indices,
+            retrieve_next_token=forward_metadata.retrieve_next_token,
+            retrieve_next_sibling=forward_metadata.retrieve_next_sibling,
+            retrieve_parent_token=forward_metadata.retrieve_parent_token,
+            num_decodes=len(seq_lens),
+            num_prefills=0,
+            num_prefill_tokens=0,
+            is_target_verify=is_target_verify,
+            draft_token_num=draft_token_num,
+        )
+
+    @classmethod
+    def prepare_mixed(
+        cls,
+        forward_metadata: ForwardMetadata,
+        chunk_size: int,
+        forward_batch: ForwardBatch,
+    ) -> "Mamba2Metadata":
+        """This path cannot run with CUDA graph, as it contains extend requests."""
+        if forward_batch.extend_num_tokens is None:
+            draft_token_num = (
+                forward_batch.spec_info.draft_token_num
+                if forward_batch.spec_info is not None
+                else 1
+            )
+            return cls.prepare_decode(
+                forward_metadata,
+                forward_batch.seq_lens,
+                is_target_verify=forward_batch.forward_mode.is_target_verify(),
+                draft_token_num=draft_token_num,
+            )
+        num_prefills = len(forward_batch.extend_seq_lens)
+        num_prefill_tokens = forward_batch.extend_num_tokens
+        num_decodes = len(forward_batch.seq_lens) - num_prefills
+        context_lens_tensor = forward_batch.extend_prefix_lens
+        assert context_lens_tensor is not None
+        # precompute flag to avoid device syncs later
+        has_initial_states = context_lens_tensor > 0
+        prep_initial_states = torch.any(has_initial_states[:num_prefills]).item()
+
+        query_start_loc = forward_metadata.query_start_loc[: num_prefills + 1]
+        seq_idx = torch.repeat_interleave(
+            torch.arange(
+                num_prefills, dtype=torch.int32, device=query_start_loc.device
+            ),
+            query_start_loc.diff(),
+            output_size=num_prefill_tokens,
+        )
+        seq_idx.unsqueeze_(0)
+
+        # We compute metadata for chunked prefill once at the top level model
+        # forward and reuse them in mamba layers. If not needed, they will be
+        # ignored inside mamba kernels.
+        chunk_offsets, chunk_indices = None, None
+        if prep_initial_states:
+            chunk_indices, chunk_offsets = (
+                cls._query_start_loc_to_chunk_indices_offsets(
+                    query_start_loc, chunk_size, num_prefill_tokens
+                )
+            )
+
+        draft_token_num = (
+            getattr(forward_batch.spec_info, "draft_token_num", 1)
+            if forward_batch.spec_info is not None
+            else 1
+        )
+        return Mamba2Metadata(
+            query_start_loc=query_start_loc,
+            mamba_cache_indices=forward_metadata.mamba_cache_indices,
+            retrieve_next_token=forward_metadata.retrieve_next_token,
+            retrieve_next_sibling=forward_metadata.retrieve_next_sibling,
+            retrieve_parent_token=forward_metadata.retrieve_parent_token,
+            num_prefills=num_prefills,
+            num_prefill_tokens=num_prefill_tokens,
+            num_decodes=num_decodes,
+            is_target_verify=forward_batch.forward_mode.is_target_verify(),
+            draft_token_num=draft_token_num,
+            mixed_metadata=cls.MixedMetadata(
+                has_initial_states=has_initial_states,
+                prep_initial_states=prep_initial_states,
+                chunk_size=chunk_size,
+                seq_idx=seq_idx,
+                chunk_indices=chunk_indices,
+                chunk_offsets=chunk_offsets,
+                extend_seq_lens_cpu=forward_batch.extend_seq_lens_cpu,
+            ),
+        )
diff --git a/sglang/python/sglang/srt/layers/attention/mamba/mamba_state_scatter_triton.py b/sglang/python/sglang/srt/layers/attention/mamba/mamba_state_scatter_triton.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c1b7efac2f5035700094e8ea7111729329fea9d
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/mamba/mamba_state_scatter_triton.py
@@ -0,0 +1,190 @@
+"""
+Fused Triton kernel for Mamba state scatter operations.
+
+This kernel replaces the expensive advanced indexing operations in
+`update_mamba_state_after_mtp_verify` with a single fused gather-scatter kernel,
+avoiding multiple `index_elementwise_kernel` launches.
+"""
+
+import torch
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def _fused_mamba_state_scatter_with_mask_kernel(
+    src_ptr,
+    dst_ptr,
+    # Raw index arrays (before index_select)
+    dst_indices_raw_ptr,  # [total_requests] - state_indices_tensor
+    step_indices_raw_ptr,  # [total_requests] - accepted_steps or mamba_steps_to_track
+    # Total number of requests
+    total_requests,
+    elem_per_entry: tl.constexpr,
+    src_layer_stride,
+    src_req_stride,
+    src_step_stride,
+    dst_layer_stride,
+    dst_req_stride,
+    src_req_size,
+    src_step_size,
+    dst_req_size,
+    BLOCK_SIZE: tl.constexpr,
+):
+    """
+    Fused gather-scatter kernel with built-in masking.
+
+    This kernel fuses the index_select operations by:
+    1. Iterating over all requests (pid_req from 0 to total_requests-1)
+    2. Checking if step_indices_raw[pid_req] >= 0 (valid mask)
+    3. If valid, performing the scatter:
+       dst[l, dst_indices_raw[pid_req], :] = src[l, pid_req, step_indices_raw[pid_req], :]
+
+    Grid: (total_requests, num_layers, ceil(elem_per_entry / BLOCK_SIZE))
+    """
+    pid_req = tl.program_id(0)
+    pid_layer = tl.program_id(1).to(tl.int64)
+    pid_block = tl.program_id(2).to(tl.int64)
+
+    # Load step index to check validity (step >= 0 means valid)
+    step_idx = tl.load(step_indices_raw_ptr + pid_req).to(tl.int64)
+
+    # Early exit if this request is not valid (step < 0)
+    if step_idx < 0:
+        return
+
+    # Load destination index
+    dst_idx = tl.load(dst_indices_raw_ptr + pid_req).to(tl.int64)
+
+    # Source index is just the request index itself
+    src_idx = pid_req
+
+    # Bounds check to avoid illegal memory access
+    if not (
+        (dst_idx >= 0)
+        & (dst_idx < dst_req_size)
+        & (src_idx >= 0)
+        & (src_idx < src_req_size)
+        & (step_idx < src_step_size)
+    ):
+        return
+
+    # Compute base offsets
+    src_offset = (
+        pid_layer * src_layer_stride
+        + src_idx * src_req_stride
+        + step_idx * src_step_stride
+    )
+    dst_offset = pid_layer * dst_layer_stride + dst_idx * dst_req_stride
+
+    # Compute element range for this block
+    start = pid_block * BLOCK_SIZE
+    offsets = start + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < elem_per_entry
+
+    # Load from source and store to destination
+    data = tl.load(src_ptr + src_offset + offsets, mask=mask)
+    tl.store(dst_ptr + dst_offset + offsets, data, mask=mask)
+
+
+def fused_mamba_state_scatter_with_mask(
+    dst: torch.Tensor,  # [num_layers, cache_size, *state_shape]
+    src: torch.Tensor,  # [num_layers, spec_size, draft_tokens, *state_shape]
+    dst_indices_raw: torch.Tensor,  # [total_requests] - raw indices (e.g., state_indices_tensor)
+    step_indices_raw: torch.Tensor,  # [total_requests] - raw step indices (step >= 0 means valid)
+):
+    """
+    Fully fused gather-scatter with built-in masking for mamba state updates.
+
+    This function fuses the following operations into a single kernel:
+    1. valid_mask = step_indices_raw >= 0
+    2. valid_indices = valid_mask.nonzero()
+    3. dst_indices = dst_indices_raw[valid_indices]  (index_select)
+    4. step_indices = step_indices_raw[valid_indices]  (index_select)
+    5. for each valid i: dst[:, dst_indices[i], :] = src[:, i, step_indices[i], :]
+
+    Args:
+        dst: Destination tensor [num_layers, cache_size, *state_shape]
+        src: Source tensor [num_layers, spec_size, draft_tokens, *state_shape]
+        dst_indices_raw: Raw destination indices for all requests [total_requests]
+        step_indices_raw: Raw step indices; entry >= 0 means valid [total_requests]
+    """
+    total_requests = step_indices_raw.shape[0]
+    if total_requests == 0:
+        return
+
+    if dst.device != src.device:
+        raise ValueError(
+            f"dst and src must be on the same device. {dst.device=} {src.device=}"
+        )
+    if not dst.is_cuda or not src.is_cuda:
+        raise ValueError(
+            "fused_mamba_state_scatter_with_mask only supports CUDA tensors."
+        )
+    if dst.ndim < 2 or src.ndim < 3:
+        raise ValueError(f"Unexpected tensor ranks: {dst.ndim=} {src.ndim=}")
+    if dst.shape[0] != src.shape[0]:
+        raise ValueError(
+            f"Layer dimension mismatch: {dst.shape[0]=} vs {src.shape[0]=}"
+        )
+    if dst.shape[2:] != src.shape[3:]:
+        raise ValueError(
+            f"Trailing dims mismatch: {dst.shape[2:]=} vs {src.shape[3:]=}"
+        )
+    if dst_indices_raw.ndim != 1 or step_indices_raw.ndim != 1:
+        raise ValueError(
+            f"indices must be 1D: {dst_indices_raw.shape=} {step_indices_raw.shape=}"
+        )
+    if dst_indices_raw.shape[0] != step_indices_raw.shape[0]:
+        raise ValueError(
+            f"indices length mismatch: {dst_indices_raw.shape[0]=} vs {step_indices_raw.shape[0]=}"
+        )
+
+    num_layers = dst.shape[0]
+    src_req_size = src.shape[1]
+    src_step_size = src.shape[2]
+    dst_req_size = dst.shape[1]
+
+    # Flatten trailing dimensions: number of elements per (layer, cache_line) entry.
+    elem_per_entry = dst.numel() // (dst.shape[0] * dst.shape[1])
+
+    # Get strides (in elements, not bytes)
+    src_layer_stride = src.stride(0)
+    src_req_stride = src.stride(1)
+    src_step_stride = src.stride(2)
+    dst_layer_stride = dst.stride(0)
+    dst_req_stride = dst.stride(1)
+
+    # Ensure indices are int32 and contiguous
+    dst_indices_raw = dst_indices_raw.to(torch.int32).contiguous()
+    step_indices_raw = step_indices_raw.to(torch.int32).contiguous()
+
+    # Ensure tensors are contiguous
+    if not dst.is_contiguous():
+        raise ValueError("dst tensor must be contiguous")
+    if not src.is_contiguous():
+        raise ValueError("src tensor must be contiguous")
+
+    # Block size for copying elements
+    BLOCK_SIZE = 1024
+
+    # Grid over all requests - invalid ones will early-exit in the kernel
+    grid = (total_requests, num_layers, triton.cdiv(elem_per_entry, BLOCK_SIZE))
+
+    _fused_mamba_state_scatter_with_mask_kernel[grid](
+        src,
+        dst,
+        dst_indices_raw,
+        step_indices_raw,
+        total_requests,
+        elem_per_entry,
+        src_layer_stride,
+        src_req_stride,
+        src_step_stride,
+        dst_layer_stride,
+        dst_req_stride,
+        src_req_size,
+        src_step_size,
+        dst_req_size,
+        BLOCK_SIZE=BLOCK_SIZE,
+    )
diff --git a/sglang/python/sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py b/sglang/python/sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py
new file mode 100644
index 0000000000000000000000000000000000000000..547b4aa9950d5d6497003cd5b5caa7cea3efbf4b
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py
@@ -0,0 +1,120 @@
+from typing import Union
+
+import torch
+
+from sglang.srt.distributed.communication_op import (
+    tensor_model_parallel_all_gather,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.distributed.parallel_state import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.layers.attention.fla.layernorm_gated import rms_norm_gated
+from sglang.srt.layers.utils import MultiPlatformOp
+from sglang.srt.model_loader.weight_utils import sharded_weight_loader
+from sglang.srt.utils.common import set_weight_attrs
+
+
+class Mixer2RMSNormGated(MultiPlatformOp):
+    def __init__(
+        self,
+        full_hidden_size: int,
+        full_n_groups: int,
+        use_rms_norm: bool = True,
+        eps: float = 1e-6,
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank = get_tensor_model_parallel_rank()
+        self.full_hidden_size = full_hidden_size
+        self.group_size = full_hidden_size // full_n_groups
+        self.per_rank_hidden_size = full_hidden_size // self.tp_size
+        self.n_groups = full_hidden_size // self.group_size
+
+        self.variance_epsilon = eps
+        self.use_rms_norm = use_rms_norm
+        if self.use_rms_norm:
+            # Register norm weight only if we're actually applying RMSNorm
+            self.weight = torch.nn.Parameter(torch.ones(self.per_rank_hidden_size))
+            set_weight_attrs(self.weight, {"weight_loader": sharded_weight_loader(0)})
+        else:
+            # Avoid checkpoint mismatch by skipping unused parameter
+            self.register_parameter("weight", None)
+        assert (
+            self.full_hidden_size % self.tp_size == 0
+        ), "Tensor parallel world size must divide hidden size."
+
+    def forward_native(
+        self,
+        x: torch.Tensor,
+        gate: torch.Tensor,
+    ):
+        # Three tensor-parallel cases:
+        #   1. n_groups is 1
+        #      In this case we parallelize along the reduction dim.
+        #      Each rank computes a local sum of squares followed by AllReduce
+        #   2. tp_size divides n_groups
+        #      Each rank only reduces within its local group(s).
+        #      No collective ops necessary.
+        #   3. The general case can be pretty complicated so we AllGather
+        #      the input and then redundantly compute the RMSNorm.
+        input_dtype = x.dtype
+        x = x * torch.nn.functional.silu(gate.to(torch.float32))
+        if not self.use_rms_norm:
+            return x.to(input_dtype)
+
+        if self.n_groups == 1:
+            if self.tp_size > 1:
+                # Compute local sum and then reduce to obtain global sum
+                local_sums = x.pow(2).sum(dim=-1, keepdim=True)
+                global_sums = tensor_model_parallel_all_reduce(local_sums)
+                # Calculate the variance
+                count = self.tp_size * x.shape[-1]
+                variance = global_sums / count
+
+            else:
+                variance = x.pow(2).mean(-1, keepdim=True)
+            x = x * torch.rsqrt(variance + self.variance_epsilon)
+        else:
+            redundant_tp: bool = self.n_groups % self.tp_size != 0
+            if redundant_tp:
+                # To handle the general case, redundantly apply the variance
+                x = tensor_model_parallel_all_gather(x, -1)
+
+            *prefix_dims, hidden_dim = x.shape
+            group_count = hidden_dim // self.group_size
+            x_grouped = x.view(*prefix_dims, group_count, self.group_size)
+            variance = x_grouped.pow(2).mean(-1, keepdim=True)
+            x_grouped = x_grouped * torch.rsqrt(variance + self.variance_epsilon)
+            x = x_grouped.view(*prefix_dims, hidden_dim)
+
+            if redundant_tp:
+                start = self.per_rank_hidden_size * self.tp_rank
+                end = start + self.per_rank_hidden_size
+                x = x[..., start:end]
+
+        return self.weight * x.to(input_dtype)
+
+    def forward_cuda(
+        self,
+        x: torch.Tensor,
+        gate: torch.Tensor,
+    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        input_dtype = x.dtype
+        if not self.use_rms_norm:
+            # Keep gate in float32 for numerical stability during silu
+            return x * torch.nn.functional.silu(gate.to(torch.float32)).to(input_dtype)
+
+        if ((self.n_groups % self.tp_size) != 0) or self.n_groups != 1:
+            return self.forward_native(x, gate)
+
+        return rms_norm_gated(
+            x=x,
+            weight=self.weight.data,
+            bias=None,
+            z=gate,
+            eps=self.variance_epsilon,
+            norm_before_gate=False,
+            is_rms_norm=True,
+        )
diff --git a/sglang/python/sglang/srt/layers/attention/mamba/ops/__init__.py b/sglang/python/sglang/srt/layers/attention/mamba/ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6496105aed97edc180979bb337c84d9d6a839578
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/mamba/ops/__init__.py
@@ -0,0 +1,13 @@
+from .mamba_ssm import PAD_SLOT_ID
+from .ssd_combined import mamba_chunk_scan_combined
+from .ssu_dispatch import (
+    initialize_mamba_selective_state_update_backend,
+    selective_state_update,
+)
+
+__all__ = [
+    "PAD_SLOT_ID",
+    "selective_state_update",
+    "mamba_chunk_scan_combined",
+    "initialize_mamba_selective_state_update_backend",
+]
diff --git a/sglang/python/sglang/srt/layers/attention/mamba/ops/__pycache__/__init__.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/mamba/ops/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ea0bb9370835e2c747165bda4acec50617c69679
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/mamba/ops/__pycache__/__init__.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/mamba/ops/__pycache__/mamba_ssm.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/mamba/ops/__pycache__/mamba_ssm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fed390b6e80d29edae7664a15319ba147435faf8
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/mamba/ops/__pycache__/mamba_ssm.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/mamba/ops/__pycache__/ssd_bmm.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/mamba/ops/__pycache__/ssd_bmm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6640b6ecff51d0f89c10670e1ef345eb5cd18b80
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/mamba/ops/__pycache__/ssd_bmm.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/mamba/ops/__pycache__/ssd_chunk_scan.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/mamba/ops/__pycache__/ssd_chunk_scan.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..62cd6cb843b679edecd7668614f0500302e2d248
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/mamba/ops/__pycache__/ssd_chunk_scan.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/mamba/ops/__pycache__/ssd_chunk_state.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/mamba/ops/__pycache__/ssd_chunk_state.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..42f6bbf9b39b99749213709f11dc97367e3f0ceb
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/mamba/ops/__pycache__/ssd_chunk_state.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/mamba/ops/__pycache__/ssd_combined.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/mamba/ops/__pycache__/ssd_combined.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..760f9cd81f6d3265336c7ecbeda7133a9dac71fd
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/mamba/ops/__pycache__/ssd_combined.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/mamba/ops/__pycache__/ssd_state_passing.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/mamba/ops/__pycache__/ssd_state_passing.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..53e5729a3762f58a2831c1de743f0dbb0bcd9091
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/mamba/ops/__pycache__/ssd_state_passing.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/mamba/ops/__pycache__/ssu_dispatch.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/mamba/ops/__pycache__/ssu_dispatch.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..33d7643d35f75081ba193df9fd09d3e234f21b14
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/mamba/ops/__pycache__/ssu_dispatch.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/mamba/ops/layernorm_gated.py b/sglang/python/sglang/srt/layers/attention/mamba/ops/layernorm_gated.py
new file mode 100644
index 0000000000000000000000000000000000000000..88b27eb5d3ce4d748f5043b6218bdf21abca8e1d
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/mamba/ops/layernorm_gated.py
@@ -0,0 +1,172 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Copyright (c) 2024, Tri Dao.
+# Adapted from https://github.com/state-spaces/mamba/blob/60dadf2e0ee730ac337035d5533de10bc26e4847/mamba_ssm/ops/triton/layernorm_gated.py
+
+import torch
+import triton
+import triton.language as tl
+
+
+@triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
+@triton.heuristics({"HAS_Z": lambda args: args["Z"] is not None})
+@triton.jit
+def _layer_norm_fwd_1pass_kernel(
+    X,  # pointer to the input
+    Y,  # pointer to the output
+    W,  # pointer to the weights
+    B,  # pointer to the biases
+    Z,  # pointer to the other branch
+    Mean,  # pointer to the mean
+    Rstd,  # pointer to the 1/std
+    stride_x_row: tl.int64,
+    stride_y_row: tl.int64,
+    stride_z_row: tl.int64,
+    M: tl.int64,  # number of rows in X
+    N: tl.int64,  # number of columns in X
+    eps,  # epsilon to avoid division by zero
+    BLOCK_N: tl.constexpr,
+    HAS_BIAS: tl.constexpr,
+    HAS_Z: tl.constexpr,
+    NORM_BEFORE_GATE: tl.constexpr,
+    IS_RMS_NORM: tl.constexpr,
+):
+    # Map the program id to the row of X and Y it should compute.
+    row = tl.program_id(0)
+    group = tl.program_id(1)
+    X += row * stride_x_row + group * N
+    Y += row * stride_y_row + group * N
+    if HAS_Z:
+        Z += row * stride_z_row + group * N
+    if not IS_RMS_NORM:
+        Mean += group * M
+    Rstd += group * M
+    W += group * N
+    if HAS_BIAS:
+        B += group * N
+    # Compute mean and variance
+    cols = tl.arange(0, BLOCK_N)
+    x = tl.load(X + cols, mask=cols < N, other=0.0).to(tl.float32)
+    if HAS_Z and not NORM_BEFORE_GATE:
+        z = tl.load(Z + cols, mask=cols < N).to(tl.float32)
+        x *= z * tl.sigmoid(z)
+    if not IS_RMS_NORM:
+        mean = tl.sum(x, axis=0) / N
+        tl.store(Mean + row, mean)
+        xbar = tl.where(cols < N, x - mean, 0.0)
+        var = tl.sum(xbar * xbar, axis=0) / N
+    else:
+        xbar = tl.where(cols < N, x, 0.0)
+        var = tl.sum(xbar * xbar, axis=0) / N
+    rstd = 1 / tl.sqrt(var + eps)
+    tl.store(Rstd + row, rstd)
+    # Normalize and apply linear transformation
+    mask = cols < N
+    w = tl.load(W + cols, mask=mask).to(tl.float32)
+    if HAS_BIAS:
+        b = tl.load(B + cols, mask=mask).to(tl.float32)
+    x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
+    y = x_hat * w + b if HAS_BIAS else x_hat * w
+    if HAS_Z and NORM_BEFORE_GATE:
+        z = tl.load(Z + cols, mask=mask).to(tl.float32)
+        y *= z * tl.sigmoid(z)
+    # Write output
+    tl.store(Y + cols, y, mask=mask)
+
+
+def _layer_norm_fwd(
+    x,
+    weight,
+    bias,
+    eps,
+    z=None,
+    out=None,
+    group_size=None,
+    norm_before_gate=True,
+    is_rms_norm=False,
+):
+    M, N = x.shape
+    if group_size is None:
+        group_size = N
+    assert N % group_size == 0
+    ngroups = N // group_size
+    assert x.stride(-1) == 1
+    if z is not None:
+        assert z.stride(-1) == 1
+        assert z.shape == (M, N)
+    assert weight.shape == (N,)
+    assert weight.stride(-1) == 1
+    if bias is not None:
+        assert bias.stride(-1) == 1
+        assert bias.shape == (N,)
+    # allocate output
+    if out is not None:
+        assert out.shape == x.shape
+    else:
+        out = torch.empty_like(x)
+    assert out.stride(-1) == 1
+    mean = (
+        torch.empty((ngroups * M,), dtype=torch.float32, device=x.device)
+        if not is_rms_norm
+        else None
+    )
+    rstd = torch.empty((ngroups * M,), dtype=torch.float32, device=x.device)
+    # Less than 64KB per feature: enqueue fused kernel
+    MAX_FUSED_SIZE = 65536 // x.element_size()
+    BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(group_size))
+    if group_size > BLOCK_N:
+        raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
+    # heuristics for number of warps
+    num_warps = min(max(BLOCK_N // 256, 1), 8)
+    grid = (M, ngroups)
+    with torch.cuda.device(x.device.index):
+        _layer_norm_fwd_1pass_kernel[grid](
+            x,
+            out,
+            weight,
+            bias,
+            z,
+            mean,
+            rstd,
+            x.stride(0),
+            out.stride(0),
+            z.stride(0) if z is not None else 0,
+            M,
+            group_size,
+            eps,
+            BLOCK_N=BLOCK_N,
+            NORM_BEFORE_GATE=norm_before_gate,
+            IS_RMS_NORM=is_rms_norm,
+            num_warps=num_warps,
+        )
+    return out, mean, rstd
+
+
+def rms_norm_gated(
+    x, weight, bias, z=None, eps=1e-6, group_size=None, norm_before_gate=True
+):
+    x_shape_og = x.shape
+    # reshape input data into 2D tensor
+    x = x.reshape(-1, x.shape[-1])
+    if x.stride(-1) != 1:
+        x = x.contiguous()
+    if z is not None:
+        assert z.shape == x_shape_og
+        z = z.reshape(-1, z.shape[-1])
+        if z.stride(-1) != 1:
+            z = z.contiguous()
+    weight = weight.contiguous()
+    if bias is not None:
+        bias = bias.contiguous()
+    y, _, _ = _layer_norm_fwd(
+        x,
+        weight,
+        bias,
+        eps,
+        z=z,
+        group_size=group_size,
+        norm_before_gate=norm_before_gate,
+        is_rms_norm=True,
+    )
+
+    return y.reshape(x_shape_og)
diff --git a/sglang/python/sglang/srt/layers/attention/mamba/ops/mamba_ssm.py b/sglang/python/sglang/srt/layers/attention/mamba/ops/mamba_ssm.py
new file mode 100644
index 0000000000000000000000000000000000000000..f238d51b47ecbdd14cde5ef63a926203234fee8b
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/mamba/ops/mamba_ssm.py
@@ -0,0 +1,494 @@
+# Adapted from: https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/mamba/ops/mamba_ssm.py
+
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Copyright (c) 2024, Tri Dao, Albert Gu.
+# Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/selective_state_update.py
+
+import torch
+import triton
+import triton.language as tl
+from packaging import version
+
+PAD_SLOT_ID = -1
+
+TRITON3 = version.parse(triton.__version__) >= version.parse("3.0.0")
+
+if TRITON3:
+
+    @triton.jit
+    def softplus(dt):
+        dt = tl.where(dt <= 20.0, tl.math.log(tl.math.exp(dt) + 1), dt)
+        return dt
+
+else:
+
+    @triton.jit
+    def softplus(dt):
+        dt = tl.where(dt <= 20.0, tl.math.log1p(tl.exp(dt)), dt)
+        return dt
+
+
+@triton.heuristics({"HAS_DT_BIAS": lambda args: args["dt_bias_ptr"] is not None})
+@triton.heuristics({"HAS_D": lambda args: args["D_ptr"] is not None})
+@triton.heuristics({"HAS_Z": lambda args: args["z_ptr"] is not None})
+@triton.heuristics(
+    {
+        "HAS_STATE_BATCH_INDICES": lambda args: args["state_batch_indices_ptr"]
+        is not None
+    }
+)
+@triton.heuristics(
+    {"BLOCK_SIZE_DSTATE": lambda args: triton.next_power_of_2(args["dstate"])}
+)
+@triton.heuristics(
+    {
+        "CACHE_INTERMEDIATE_STATES": lambda args: args["intermediate_states_buffer"]
+        is not None
+    }
+)
+@triton.heuristics(
+    {
+        "HAS_EAGLE_TREE_CUSTOM_ATTN_MASK": lambda args: args[
+            "retrieve_parent_token_ptr"
+        ]
+        is not None
+    }
+)
+@triton.heuristics(
+    {
+        "HAS_INTERMEDIATE_STATE_INDICES": lambda args: args[
+            "intermediate_state_indices_ptr"
+        ]
+        is not None
+    }
+)
+@triton.jit(do_not_specialize=["T"])
+def _selective_scan_update_kernel(
+    # Pointers to matrices
+    state_ptr,
+    x_ptr,
+    dt_ptr,
+    dt_bias_ptr,
+    A_ptr,
+    B_ptr,
+    C_ptr,
+    D_ptr,
+    z_ptr,
+    out_ptr,
+    state_batch_indices_ptr,
+    pad_slot_id,
+    intermediate_states_buffer,
+    cache_steps,
+    retrieve_parent_token_ptr,
+    intermediate_state_indices_ptr,
+    # Matrix dimensions
+    batch,
+    T,
+    nheads,
+    dim,
+    dstate,
+    nheads_ngroups_ratio,
+    # Strides
+    stride_state_batch,
+    stride_state_head,
+    stride_state_dim,
+    stride_state_dstate,
+    stride_x_batch,
+    stride_x_T,
+    stride_x_head,
+    stride_x_dim,
+    stride_dt_batch,
+    stride_dt_T,
+    stride_dt_head,
+    stride_dt_dim,
+    stride_dt_bias_head,
+    stride_dt_bias_dim,
+    stride_A_head,
+    stride_A_dim,
+    stride_A_dstate,
+    stride_B_batch,
+    stride_B_T,
+    stride_B_group,
+    stride_B_dstate,
+    stride_C_batch,
+    stride_C_T,
+    stride_C_group,
+    stride_C_dstate,
+    stride_D_head,
+    stride_D_dim,
+    stride_z_batch,
+    stride_z_T,
+    stride_z_head,
+    stride_z_dim,
+    stride_out_batch,
+    stride_out_T,
+    stride_out_head,
+    stride_out_dim,
+    stride_retrieve_parent_token_batch,
+    stride_retrieve_parent_token_T,
+    # Meta-parameters
+    DT_SOFTPLUS: tl.constexpr,
+    TIE_HDIM: tl.constexpr,
+    BLOCK_SIZE_M: tl.constexpr,
+    HAS_DT_BIAS: tl.constexpr,
+    HAS_D: tl.constexpr,
+    HAS_Z: tl.constexpr,
+    HAS_STATE_BATCH_INDICES: tl.constexpr,
+    DISABLE_STATE_UPDATE: tl.constexpr,
+    CACHE_INTERMEDIATE_STATES: tl.constexpr,
+    HAS_EAGLE_TREE_CUSTOM_ATTN_MASK: tl.constexpr,
+    HAS_INTERMEDIATE_STATE_INDICES: tl.constexpr,
+    BLOCK_SIZE_DSTATE: tl.constexpr,
+):
+    pid_m = tl.program_id(axis=0)
+    pid_b = tl.program_id(axis=1)
+    pid_h = tl.program_id(axis=2)
+
+    # If HAS_STATE_BATCH_INDICES is true, then the ssm state's batch coordinate
+    # is taken from the state_batch_indices_ptr Otherwise, the state coordinate
+    # is the same as the batch id.
+    if HAS_STATE_BATCH_INDICES:
+        state_batch_indices_ptr += pid_b
+        state_batch_idx = tl.load(state_batch_indices_ptr).to(tl.int64)
+        state_ptr += state_batch_idx * stride_state_batch + pid_h * stride_state_head
+    else:
+        state_ptr += pid_b * stride_state_batch + pid_h * stride_state_head
+
+    x_ptr += pid_b * stride_x_batch + pid_h * stride_x_head
+    dt_ptr += pid_b * stride_dt_batch + pid_h * stride_dt_head
+    if HAS_DT_BIAS:
+        dt_bias_ptr += pid_h * stride_dt_bias_head
+    A_ptr += pid_h * stride_A_head
+    B_ptr += pid_b * stride_B_batch + (pid_h // nheads_ngroups_ratio) * stride_B_group
+    C_ptr += pid_b * stride_C_batch + (pid_h // nheads_ngroups_ratio) * stride_C_group
+    if HAS_Z:
+        z_ptr += pid_b * stride_z_batch + pid_h * stride_z_head
+    out_ptr += pid_b * stride_out_batch + pid_h * stride_out_head
+
+    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_n = tl.arange(0, BLOCK_SIZE_DSTATE)
+    state_ptrs = state_ptr + (
+        offs_m[:, None] * stride_state_dim + offs_n[None, :] * stride_state_dstate
+    )
+
+    mask = (offs_m[:, None] < dim) & (offs_n[None, :] < dstate)
+    if HAS_STATE_BATCH_INDICES:
+        mask &= state_batch_idx != pad_slot_id
+    state = tl.load(state_ptrs, mask=mask, other=0.0).to(tl.float32)
+
+    if HAS_DT_BIAS:
+        dt_bias_ptrs = dt_bias_ptr + offs_m * stride_dt_bias_dim
+    if HAS_D:
+        D_ptr += pid_h * stride_D_head
+        D_ptrs = D_ptr + offs_m * stride_D_dim
+    A_ptrs = A_ptr + offs_m[:, None] * stride_A_dim + offs_n[None, :] * stride_A_dstate
+
+    cache_idx = -1
+    if CACHE_INTERMEDIATE_STATES:
+        if HAS_INTERMEDIATE_STATE_INDICES:
+            intermediate_state_idx = tl.load(intermediate_state_indices_ptr + pid_b).to(
+                tl.int64
+            )
+            cache_idx = intermediate_state_idx
+        elif HAS_STATE_BATCH_INDICES:
+            cache_idx = state_batch_idx
+        else:
+            cache_idx = pid_b
+
+    current_step_idx = 0
+    for _ in range(T):
+        if HAS_EAGLE_TREE_CUSTOM_ATTN_MASK:
+            if current_step_idx != 0 and cache_idx >= 0:
+                parent_ptr = (
+                    retrieve_parent_token_ptr
+                    + pid_b * stride_retrieve_parent_token_batch
+                    + current_step_idx * stride_retrieve_parent_token_T
+                )
+                parent_step_idx = tl.load(parent_ptr).to(tl.int32)
+
+                if parent_step_idx >= 0 and parent_step_idx < T:
+                    step_offset = parent_step_idx * nheads * dim * dstate
+                    cache_ptr = (
+                        intermediate_states_buffer
+                        + cache_idx * cache_steps * nheads * dim * dstate
+                        + step_offset
+                        + pid_h * dim * dstate
+                        + offs_m[:, None] * dstate
+                        + offs_n[None, :]
+                    )
+                    state = tl.load(cache_ptr, mask=mask, other=0.0).to(tl.float32)
+
+        x_ptrs = x_ptr + offs_m * stride_x_dim
+        dt_ptrs = dt_ptr + offs_m * stride_dt_dim
+        B_ptrs = B_ptr + offs_n * stride_B_dstate
+        C_ptrs = C_ptr + offs_n * stride_C_dstate
+        if HAS_Z:
+            z_ptrs = z_ptr + offs_m * stride_z_dim
+        out_ptrs = out_ptr + offs_m * stride_out_dim
+
+        x = tl.load(x_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
+        if not TIE_HDIM:
+            dt = tl.load(dt_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
+            if HAS_DT_BIAS:
+                dt += tl.load(dt_bias_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
+            if DT_SOFTPLUS:
+                dt = softplus(dt)
+            A = tl.load(
+                A_ptrs,
+                mask=(offs_m[:, None] < dim) & (offs_n[None, :] < dstate),
+                other=0.0,
+            ).to(tl.float32)
+            dA = tl.exp(A * dt[:, None])
+        else:
+            dt = tl.load(dt_ptr).to(tl.float32)
+            if HAS_DT_BIAS:
+                dt += tl.load(dt_bias_ptr).to(tl.float32)
+            if DT_SOFTPLUS:
+                dt = softplus(dt)
+            A = tl.load(A_ptr).to(tl.float32)
+            dA = tl.exp(A * dt)  # scalar, not a matrix
+
+        B = tl.load(B_ptrs, mask=offs_n < dstate, other=0.0).to(tl.float32)
+        C = tl.load(C_ptrs, mask=offs_n < dstate, other=0.0).to(tl.float32)
+        if HAS_D:
+            D = tl.load(D_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
+        if HAS_Z:
+            z = tl.load(z_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
+
+        dB = B[None, :] * dt[:, None] if not TIE_HDIM else B * dt
+        state = state * dA + dB * x[:, None]
+
+        if CACHE_INTERMEDIATE_STATES:
+            if HAS_STATE_BATCH_INDICES:
+                if state_batch_idx != pad_slot_id:
+                    cache_ptr_base = (
+                        intermediate_states_buffer
+                        + cache_idx * cache_steps * nheads * dim * dstate
+                        + current_step_idx * nheads * dim * dstate
+                        + pid_h * dim * dstate
+                    )
+                    cache_ptrs = cache_ptr_base + (
+                        offs_m[:, None] * dstate + offs_n[None, :]
+                    )
+                    tl.store(
+                        cache_ptrs, state.to(cache_ptrs.dtype.element_ty), mask=mask
+                    )
+
+        out = tl.sum(state * C[None, :], axis=1)
+        if HAS_D:
+            out += x * D
+        if HAS_Z:
+            out *= z * tl.sigmoid(z)
+        tl.store(out_ptrs, out, mask=offs_m < dim)
+
+        current_step_idx += 1
+
+        x_ptr += stride_x_T
+        dt_ptr += stride_dt_T
+        B_ptr += stride_B_T
+        C_ptr += stride_C_T
+        out_ptr += stride_out_T
+        if HAS_Z:
+            z_ptr += stride_z_T
+
+    if not DISABLE_STATE_UPDATE:
+        tl.store(state_ptrs, state.to(state_ptrs.dtype.element_ty), mask=mask)
+
+
+def selective_state_update(
+    state,
+    x,
+    dt,
+    A,
+    B,
+    C,
+    D=None,
+    z=None,
+    dt_bias=None,
+    dt_softplus=False,
+    state_batch_indices=None,
+    pad_slot_id=PAD_SLOT_ID,
+    out=None,
+    disable_state_update=False,
+    intermediate_states_buffer=None,
+    cache_steps=None,
+    retrieve_parent_token=None,
+    intermediate_state_indices=None,
+):
+    """
+    Argument:
+        state: (batch, dim, dstate) or (batch, nheads, dim, dstate)
+        x: (batch, dim) or (batch, nheads, dim) for single-token or (batch, T, nheads, dim) for multi-token
+        dt: (batch, dim) or (batch, nheads, dim)
+        A: (dim, dstate) or (nheads, dim, dstate)
+        B: (batch, dstate) or (batch, ngroups, dstate) for single-token or (batch, T, ngroups, dstate) for multi-token
+        C: (batch, dstate) or (batch, ngroups, dstate)
+        D: (dim,) or (nheads, dim)
+        z: (batch, dim) or (batch, nheads, dim)
+        dt_bias: (dim,) or (nheads, dim)
+        pad_slot_id: int
+            if cache_indices is passed, lets the kernel identify padded
+            entries that will not be processed,
+            for example: cache_indices = [pad_slot_id, 1, 20, pad_slot_id]
+            in this case, the kernel will not process entries at
+            indices 0 and 3
+        out: Preallocated ssm output tensor. Assume same shape as x.
+             In-place updated.
+        disable_state_update: If True, don't write back to state (for speculative verify)
+        intermediate_states_buffer: Buffer to cache intermediate states
+        cache_steps: Total number of steps in the buffer
+        retrieve_parent_token: (batch, T) tensor of parent token indices for EAGLE tree attention
+        intermediate_state_indices: (batch,) tensor of indices for intermediate_states_buffer operations.
+            If provided, uses these indices instead of state_batch_indices for the buffer.
+    """
+    if state.dim() == 3:
+        state = state.unsqueeze(1)
+    if x.dim() == 2:
+        x = x.unsqueeze(1)
+    if x.dim() == 3:
+        x = x.unsqueeze(1)
+    if dt.dim() == 2:
+        dt = dt.unsqueeze(1)
+    if dt.dim() == 3:
+        dt = dt.unsqueeze(1)
+    if A.dim() == 2:
+        A = A.unsqueeze(0)
+    if B.dim() == 2:
+        B = B.unsqueeze(1)
+    if B.dim() == 3:
+        B = B.unsqueeze(1)
+    if C.dim() == 2:
+        C = C.unsqueeze(1)
+    if C.dim() == 3:
+        C = C.unsqueeze(1)
+    if D is not None and D.dim() == 1:
+        D = D.unsqueeze(0)
+    if z is not None:
+        if z.dim() == 2:
+            z = z.unsqueeze(1)
+        if z.dim() == 3:
+            z = z.unsqueeze(1)
+    if dt_bias is not None and dt_bias.dim() == 1:
+        dt_bias = dt_bias.unsqueeze(0)
+    if out.dim() == 2:
+        out = out.unsqueeze(1)
+    if out.dim() == 3:
+        out = out.unsqueeze(1)
+
+    _, nheads, dim, dstate = state.shape
+    batch, T, _, _ = x.shape
+
+    assert x.shape == (batch, T, nheads, dim)
+    assert dt.shape == x.shape
+    assert A.shape == (nheads, dim, dstate)
+    ngroups = B.shape[2]
+    assert nheads % ngroups == 0, "nheads must be divisible by ngroups"
+    assert B.shape == (batch, T, ngroups, dstate)
+    assert C.shape == B.shape
+    if D is not None:
+        assert D.shape == (nheads, dim)
+    if z is not None:
+        assert z.shape == x.shape
+    if dt_bias is not None:
+        assert dt_bias.shape == (nheads, dim)
+    if state_batch_indices is not None:
+        assert state_batch_indices.shape == (batch,)
+    assert out.shape == x.shape
+
+    grid = lambda META: (triton.cdiv(dim, META["BLOCK_SIZE_M"]), batch, nheads)
+    z_strides = (
+        (z.stride(0), z.stride(1), z.stride(2), z.stride(3))
+        if z is not None
+        else (0, 0, 0, 0)
+    )
+    # We don't want autotune since it will overwrite the state
+    # We instead tune by hand.
+    BLOCK_SIZE_M, num_warps = (
+        (32, 4)
+        if dstate <= 16
+        else (
+            (16, 4)
+            if dstate <= 32
+            else ((8, 4) if dstate <= 64 else ((4, 4) if dstate <= 128 else ((4, 8))))
+        )
+    )
+    tie_hdim = (
+        A.stride(-1) == 0
+        and A.stride(-2) == 0
+        and dt.stride(-1) == 0
+        and dt_bias.stride(-1) == 0
+    )
+
+    retrieve_parent_token_strides = (
+        (retrieve_parent_token.stride(0), retrieve_parent_token.stride(1))
+        if retrieve_parent_token is not None
+        else (0, 0)
+    )
+
+    with torch.cuda.device(x.device.index):
+        _selective_scan_update_kernel[grid](
+            state,
+            x,
+            dt,
+            dt_bias,
+            A,
+            B,
+            C,
+            D,
+            z,
+            out,
+            state_batch_indices,
+            pad_slot_id,
+            intermediate_states_buffer,
+            cache_steps if cache_steps is not None else 0,
+            retrieve_parent_token,
+            intermediate_state_indices,
+            batch,
+            T,
+            nheads,
+            dim,
+            dstate,
+            nheads // ngroups,
+            state.stride(0),
+            state.stride(1),
+            state.stride(2),
+            state.stride(3),
+            x.stride(0),
+            x.stride(1),
+            x.stride(2),
+            x.stride(3),
+            dt.stride(0),
+            dt.stride(1),
+            dt.stride(2),
+            dt.stride(3),
+            *(dt_bias.stride(0), dt_bias.stride(1)) if dt_bias is not None else 0,
+            A.stride(0),
+            A.stride(1),
+            A.stride(2),
+            B.stride(0),
+            B.stride(1),
+            B.stride(2),
+            B.stride(3),
+            C.stride(0),
+            C.stride(1),
+            C.stride(2),
+            C.stride(3),
+            *(D.stride(0), D.stride(1)) if D is not None else 0,
+            z_strides[0],
+            z_strides[1],
+            z_strides[2],
+            z_strides[3],
+            out.stride(0),
+            out.stride(1),
+            out.stride(2),
+            out.stride(3),
+            retrieve_parent_token_strides[0],
+            retrieve_parent_token_strides[1],
+            dt_softplus,
+            tie_hdim,
+            BLOCK_SIZE_M,
+            DISABLE_STATE_UPDATE=disable_state_update,
+            num_warps=num_warps,
+        )
diff --git a/sglang/python/sglang/srt/layers/attention/mamba/ops/ssd_bmm.py b/sglang/python/sglang/srt/layers/attention/mamba/ops/ssd_bmm.py
new file mode 100644
index 0000000000000000000000000000000000000000..667d34afa6fd2932e6e5d12081051b30a9e5587d
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/mamba/ops/ssd_bmm.py
@@ -0,0 +1,214 @@
+# Adapted from: https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/mamba/ops/ssd_bmm.py
+
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Copyright (c) 2024, Tri Dao, Albert Gu.
+# Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/ssd_bmm.py
+
+# ruff: noqa: E501,SIM102
+
+import math
+
+import torch
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def _bmm_chunk_fwd_kernel(
+    # Pointers to matrices
+    a_ptr,
+    b_ptr,
+    out_ptr,
+    seq_idx_ptr,
+    # Matrix dimensions
+    seqlen,
+    chunk_size,
+    K,
+    ngroups,
+    stride_a_batch,
+    stride_a_seqlen,
+    stride_a_head,
+    stride_ak,
+    stride_b_batch,
+    stride_b_seqlen,
+    stride_b_head,
+    stride_bk,
+    stride_out_batch,
+    stride_out_chunk,
+    stride_out_head,
+    stride_outm,
+    stride_outn,
+    stride_seq_idx_batch,
+    stride_seq_idx_seqlen,
+    # Meta-parameters
+    IS_CAUSAL: tl.constexpr,
+    dot_dtype: tl.constexpr,
+    HAS_SEQ_IDX: tl.constexpr,
+    BLOCK_SIZE_M: tl.constexpr = 16,
+    BLOCK_SIZE_N: tl.constexpr = 16,
+    BLOCK_SIZE_K: tl.constexpr = 16,
+):
+    pid_b = tl.program_id(axis=1)
+    pid_ch = tl.program_id(axis=2).to(tl.int64)
+    pid_c = pid_ch // ngroups
+    pid_h = pid_ch - pid_c * ngroups
+    num_pid_n = tl.cdiv(chunk_size, BLOCK_SIZE_N)
+    pid_m = tl.program_id(axis=0) // num_pid_n
+    pid_n = tl.program_id(axis=0) % num_pid_n
+    if IS_CAUSAL:
+        if pid_n * BLOCK_SIZE_N >= (pid_m + 1) * BLOCK_SIZE_M:
+            return
+    a_ptr += (
+        pid_b * stride_a_batch
+        + pid_c * chunk_size * stride_a_seqlen
+        + pid_h * stride_a_head
+    )
+    b_ptr += (
+        pid_b * stride_b_batch
+        + pid_c * chunk_size * stride_b_seqlen
+        + pid_h * stride_b_head
+    )
+    if HAS_SEQ_IDX:
+        seq_idx_ptr += (
+            pid_b * stride_seq_idx_batch + pid_c * chunk_size * stride_seq_idx_seqlen
+        )
+
+    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+    a_ptrs = a_ptr + (offs_m[:, None] * stride_a_seqlen + offs_k[None, :] * stride_ak)
+    b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_n[None, :] * stride_b_seqlen)
+    chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
+
+    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
+        a = tl.load(
+            a_ptrs,
+            mask=(offs_m[:, None] < chunk_size_limit)
+            & (offs_k[None, :] < K - k * BLOCK_SIZE_K),
+            other=0.0,
+        ).to(dot_dtype)
+        b = tl.load(
+            b_ptrs,
+            mask=(offs_k[:, None] < K - k * BLOCK_SIZE_K)
+            & (offs_n[None, :] < chunk_size_limit),
+            other=0.0,
+        ).to(dot_dtype)
+        acc += tl.dot(a, b)
+        a_ptrs += BLOCK_SIZE_K * stride_ak
+        b_ptrs += BLOCK_SIZE_K * stride_bk
+
+    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    if HAS_SEQ_IDX:
+        chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
+        seq_idx_m = tl.load(
+            seq_idx_ptr + offs_m * stride_seq_idx_seqlen,
+            mask=offs_m < chunk_size_limit,
+            other=-1,
+        )
+        seq_idx_n = tl.load(
+            seq_idx_ptr + offs_n * stride_seq_idx_seqlen,
+            mask=offs_n < chunk_size_limit,
+            other=-2,
+        )
+        acc = tl.where(seq_idx_m[:, None] == seq_idx_n[None, :], acc, 0.0)
+    out = acc.to(out_ptr.dtype.element_ty)
+
+    out_ptr += (
+        pid_b * stride_out_batch + pid_c * stride_out_chunk + pid_h * stride_out_head
+    )
+    out_ptrs = out_ptr + (stride_outm * offs_m[:, None] + offs_n[None, :] * stride_outn)
+    tl.store(
+        out_ptrs,
+        out,
+        mask=(offs_m[:, None] < chunk_size) & (offs_n[None, :] < chunk_size),
+    )
+
+
+def _bmm_chunk_fwd(a, b, chunk_size, seq_idx=None, causal=False, output_dtype=None):
+    """
+    Argument:
+        a: (batch, seqlen, k) or (batch, seqlen, ngroups, k)
+        b: (batch, seqlen, k) or (batch, seqlen, ngroups, k)
+        seq_idx: (batch, seqlen) or None. out[i, j] for seq_idx[i] != seq_idx[j] will be zeroed out.
+        causal: if True, then out[i, j] for i > j will be arbitrary, only out[i, j] for i <= j are
+            guaranteed to be correct.
+    Return:
+        out: (batch, nchunks, chunk_size, chunk_size) or (batch, nchunks, ngroups, chunk_size, chunk_size)
+    """
+    # Check constraints.
+    has_groups = a.dim() == 4
+    if not has_groups:
+        batch, seqlen, k = a.shape
+    else:
+        batch, seqlen, ngroups, k = a.shape
+    assert b.shape == a.shape
+    if seq_idx is not None:
+        assert seq_idx.shape == (batch, seqlen)
+    if a.stride(-1) != 1 and a.stride(1) != 1:
+        a = a.contiguous()
+    if b.stride(-1) != 1 and b.stride(1) != 1:
+        b = b.contiguous()
+    nchunks = math.ceil(seqlen / chunk_size)
+    # Allocates output.
+    out_dtype = a.dtype if output_dtype is None else output_dtype
+    out = torch.empty(
+        (
+            (batch, nchunks, chunk_size, chunk_size)
+            if not has_groups
+            else (batch, nchunks, ngroups, chunk_size, chunk_size)
+        ),
+        device=a.device,
+        dtype=out_dtype,
+    )
+    dot_dtype = (
+        tl.bfloat16
+        if a.dtype == torch.bfloat16 or b.dtype == torch.bfloat16
+        else (
+            tl.float16
+            if a.dtype == torch.float16 or b.dtype == torch.float16
+            else tl.float32
+        )
+    )
+    grid = lambda META: (
+        triton.cdiv(chunk_size, META["BLOCK_SIZE_M"])
+        * triton.cdiv(chunk_size, META["BLOCK_SIZE_N"]),
+        batch,
+        nchunks if not has_groups else nchunks * ngroups,
+    )
+    with torch.cuda.device(a.device.index):
+        _bmm_chunk_fwd_kernel[grid](
+            a,
+            b,
+            out,
+            seq_idx,
+            seqlen,
+            chunk_size,
+            k,
+            ngroups if has_groups else 1,
+            a.stride(0),
+            a.stride(1),
+            0 if not has_groups else a.stride(2),
+            a.stride(-1),
+            b.stride(0),
+            b.stride(1),
+            0 if not has_groups else b.stride(2),
+            b.stride(-1),
+            out.stride(0),
+            out.stride(1),
+            0 if not has_groups else out.stride(2),
+            out.stride(-2),
+            out.stride(-1),
+            *(
+                (seq_idx.stride(0), seq_idx.stride(1))
+                if seq_idx is not None
+                else (0, 0)
+            ),
+            causal,
+            dot_dtype,
+            HAS_SEQ_IDX=seq_idx is not None,
+        )
+    return out
diff --git a/sglang/python/sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py b/sglang/python/sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py
new file mode 100644
index 0000000000000000000000000000000000000000..52b197139200836b58481ebc1f841f5ae65cda38
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py
@@ -0,0 +1,562 @@
+# Adapted from: https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py
+
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Copyright (c) 2024, Tri Dao, Albert Gu.
+# Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/ssd_chunk_scan.py
+
+# ruff: noqa: E501,SIM102
+
+import torch
+import triton
+import triton.language as tl
+from packaging import version
+
+TRITON_22 = version.parse(triton.__version__) >= version.parse("2.2.0")
+
+
+@triton.jit
+def _chunk_scan_fwd_kernel(
+    # Pointers to matrices
+    cb_ptr,
+    x_ptr,
+    z_ptr,
+    out_ptr,
+    out_x_ptr,
+    dt_ptr,
+    dA_cumsum_ptr,
+    seq_idx_ptr,
+    C_ptr,
+    states_ptr,
+    D_ptr,
+    initstates_ptr,
+    chunk_indices_ptr,
+    chunk_offsets_ptr,
+    chunk_meta_num,
+    # Matrix dimensions
+    chunk_size,
+    hdim,
+    dstate,
+    batch,
+    seqlen,
+    nheads_ngroups_ratio,
+    # Strides
+    stride_cb_batch,
+    stride_cb_chunk,
+    stride_cb_head,
+    stride_cb_csize_m,
+    stride_cb_csize_k,
+    stride_x_batch,
+    stride_x_seqlen,
+    stride_x_head,
+    stride_x_hdim,
+    stride_z_batch,
+    stride_z_seqlen,
+    stride_z_head,
+    stride_z_hdim,
+    stride_out_batch,
+    stride_out_seqlen,
+    stride_out_head,
+    stride_out_hdim,
+    stride_dt_batch,
+    stride_dt_chunk,
+    stride_dt_head,
+    stride_dt_csize,
+    stride_dA_cs_batch,
+    stride_dA_cs_chunk,
+    stride_dA_cs_head,
+    stride_dA_cs_csize,
+    stride_seq_idx_batch,
+    stride_seq_idx_seqlen,
+    stride_C_batch,
+    stride_C_seqlen,
+    stride_C_head,
+    stride_C_dstate,
+    stride_states_batch,
+    stride_states_chunk,
+    stride_states_head,
+    stride_states_hdim,
+    stride_states_dstate,
+    stride_init_states_batch,
+    stride_init_states_head,
+    stride_init_states_hdim,
+    stride_init_states_dstate,
+    stride_D_head,
+    # Meta-parameters
+    IS_CAUSAL: tl.constexpr,
+    HAS_D: tl.constexpr,
+    D_HAS_HDIM: tl.constexpr,
+    HAS_Z: tl.constexpr,
+    HAS_SEQ_IDX: tl.constexpr,
+    BLOCK_SIZE_DSTATE: tl.constexpr,
+    IS_TRITON_22: tl.constexpr,
+    HAS_INITSTATES: tl.constexpr,
+    BLOCK_SIZE_M: tl.constexpr = 16,
+    BLOCK_SIZE_N: tl.constexpr = 16,
+    BLOCK_SIZE_K: tl.constexpr = 16,
+):
+    pid_bc = tl.program_id(axis=1).to(tl.int64)
+    pid_c = pid_bc // batch
+    pid_b = pid_bc - pid_c * batch
+    if not HAS_INITSTATES:
+        c_idx = pid_c
+        c_off = 0
+    else:
+        c_idx = tl.load(chunk_indices_ptr + pid_c, mask=pid_c > -1, other=0)
+        c_off = tl.load(chunk_offsets_ptr + pid_c, mask=pid_c > -1, other=0)
+
+    pid_h = tl.program_id(axis=2)
+    num_pid_n = tl.cdiv(hdim, BLOCK_SIZE_N)
+    pid_m = tl.program_id(axis=0) // num_pid_n
+    pid_n = tl.program_id(axis=0) % num_pid_n
+    cb_ptr += (
+        pid_b * stride_cb_batch
+        + c_idx * stride_cb_chunk
+        + (pid_h // nheads_ngroups_ratio) * stride_cb_head
+    )
+    x_ptr += (
+        pid_b * stride_x_batch
+        + c_idx * chunk_size * stride_x_seqlen
+        + pid_h * stride_x_head
+    )
+    dt_ptr += pid_b * stride_dt_batch + c_idx * stride_dt_chunk + pid_h * stride_dt_head
+    dA_cumsum_ptr += (
+        pid_b * stride_dA_cs_batch
+        + c_idx * stride_dA_cs_chunk
+        + pid_h * stride_dA_cs_head
+    )
+    C_ptr += (
+        pid_b * stride_C_batch
+        + c_idx * chunk_size * stride_C_seqlen
+        + (pid_h // nheads_ngroups_ratio) * stride_C_head
+    )
+
+    # M-block offsets and prev states
+    #  - logic in next block may override these if there is an active offset
+    offs_m = pid_m * BLOCK_SIZE_M + c_off + tl.arange(0, BLOCK_SIZE_M)
+    prev_states_ptr = (
+        states_ptr
+        + pid_b * stride_states_batch
+        + c_idx * stride_states_chunk
+        + pid_h * stride_states_head
+    )
+    prev_states_hdim = stride_states_hdim
+    prev_states_dstate = stride_states_dstate
+
+    chunk_size_limit = min(chunk_size, seqlen - c_idx * chunk_size)
+    if HAS_SEQ_IDX:
+        seq_idx_ptr += (
+            pid_b * stride_seq_idx_batch + c_idx * chunk_size * stride_seq_idx_seqlen
+        )
+
+        # - we only need seq_idx_prev to be aligned to chunk boundary
+        seq_idx_prev = tl.load(
+            seq_idx_ptr - stride_seq_idx_seqlen, mask=c_idx >= 1, other=0
+        )
+
+        if HAS_INITSTATES:
+            # if there are init states, we only need seq_idx_m to point
+            # what is the current seq_idx
+
+            # get current seq idx
+            if (pid_m * BLOCK_SIZE_M + c_off) < chunk_size_limit:
+                seq_idx_m = tl.load(
+                    seq_idx_ptr
+                    + (pid_m * BLOCK_SIZE_M + c_off) * stride_seq_idx_seqlen,
+                )
+
+                # - recall that in ssd_state_passing, for the case c_off == 0
+                # i.e., the very first sequence, we made states_ptr hold its initial state
+                # so this edge case is taken care of
+                if (
+                    (c_off == 0)
+                    and (
+                        seq_idx_prev != seq_idx_m
+                    )  # if a seq is changed exactly on boundary
+                    or (c_off > 0)  # implies a new example (pseudo chunk)
+                ):
+
+                    # - replace prev_states_ptr with init_states
+                    prev_states_ptr = (
+                        initstates_ptr
+                        + seq_idx_m * stride_init_states_batch
+                        + pid_h * stride_init_states_head
+                    )
+                    prev_states_hdim = stride_init_states_hdim  # override strides
+                    prev_states_dstate = stride_init_states_dstate
+
+    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    dA_cs_m = tl.load(
+        dA_cumsum_ptr + offs_m * stride_dA_cs_csize, mask=offs_m < chunk_size, other=0.0
+    ).to(tl.float32)
+
+    # - handle chunk state limit
+    if HAS_INITSTATES:
+
+        # have to split this if otherwise compilation will have problems
+        dA_cs_m_boundary = 0.0
+
+        # get the c_idx for the next (logica) chunk
+        c_idx_n = tl.load(
+            chunk_indices_ptr + (pid_c + 1),
+            mask=pid_c > -1 and (pid_c + 1) < chunk_meta_num,
+            other=-1,  # to trigger different chunk
+        )
+
+        # - there are things to consider
+        # A. if c_off > 0 then we need to move the dA_cs boundary to ensure correct
+        #    contribution of past states
+        # B. if c_off_n < chunk_size_limit, then we need to adjust this so as not to
+        #    encroach into the next sequence, where c_off_n is the offset of the next
+        #    (logical) chunk.
+        # An equivalent check for B is c_idx == c_idx_n, where there is repetition in
+        # (logical) chunk indices.
+
+        if (c_idx == c_idx_n) or c_off > 0:
+
+            # get the next offset
+            c_off_n = tl.load(
+                chunk_offsets_ptr + (pid_c + 1),
+                mask=pid_c > -1 and (pid_c + 1) < chunk_meta_num,
+                other=chunk_size,
+            )
+
+            # in this case, adjust down the chunk_size_limit
+            if c_idx == c_idx_n:
+                chunk_size_limit = min(c_off_n, chunk_size_limit)
+
+            # get the cs at the offset boundary
+            # - c_off == 0 is a passthrough
+            # - We need dA_cs at the boundary, defined by c_off - no need
+            #   to increase pointer by pid_m (it is a constant offset,
+            #   i.e. the same for all blocks)
+            dA_cs_m_boundary = tl.load(
+                dA_cumsum_ptr + (c_off - 1) * stride_dA_cs_csize,
+                mask=(((c_off - 1) > -1) and ((c_off) < chunk_size)),
+                other=0.0,
+            ).to(tl.float32)
+
+    if HAS_SEQ_IDX:
+        # - handle seq idx when HAS_INITSTATES==False
+        if not HAS_INITSTATES:
+            seq_idx_m = tl.load(
+                seq_idx_ptr + offs_m * stride_seq_idx_seqlen,
+                mask=offs_m < chunk_size_limit,
+                other=-1,
+            )
+
+    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+
+    # Without the if (pid_c > -1), with Triton 2.1.0, I get
+    # Assertion `!(srcMmaLayout && dstMmaLayout) && "Unexpected mma -> mm a layout conversion"' failed.
+    # With Triton 2.2.0, this works
+    if IS_TRITON_22 or c_idx > -1:
+        # Faster to just do 1 iteration with larger BLOCK_SIZE_K, up to block size 128
+        offs_k_dstate = tl.arange(
+            0, BLOCK_SIZE_DSTATE if BLOCK_SIZE_DSTATE <= 128 else BLOCK_SIZE_K
+        )
+        C_ptrs = C_ptr + (
+            offs_m[:, None] * stride_C_seqlen + offs_k_dstate[None, :] * stride_C_dstate
+        )
+
+        prev_states_ptrs = prev_states_ptr + (
+            offs_n[None, :] * prev_states_hdim
+            + offs_k_dstate[:, None] * prev_states_dstate
+        )
+        if HAS_SEQ_IDX:
+
+            if not HAS_INITSTATES:
+                # - this is for continuous batching where there is no init states
+                scale_m = tl.where(seq_idx_m == seq_idx_prev, tl.exp(dA_cs_m), 0.0)
+            else:
+                # - if there is initstates, we will rely on prev_states, no zeroing
+                #   required.
+                scale_m = tl.exp(dA_cs_m - dA_cs_m_boundary)
+        else:
+            scale_m = tl.exp(dA_cs_m)
+        if BLOCK_SIZE_DSTATE <= 128:
+            C = tl.load(
+                C_ptrs,
+                mask=(offs_m[:, None] < chunk_size_limit)
+                & (offs_k_dstate[None, :] < dstate),
+                other=0.0,
+            )
+
+            prev_states = tl.load(
+                prev_states_ptrs,
+                mask=(offs_k_dstate[:, None] < dstate) & (offs_n[None, :] < hdim),
+                other=0.0,
+            )
+            prev_states = prev_states.to(C_ptr.dtype.element_ty)
+            acc = tl.dot(C, prev_states) * scale_m[:, None]
+        else:
+            for k in range(0, dstate, BLOCK_SIZE_K):
+                C = tl.load(
+                    C_ptrs,
+                    mask=(offs_m[:, None] < chunk_size_limit)
+                    & (offs_k_dstate[None, :] < dstate - k),
+                    other=0.0,
+                )
+                # C = (C * scale_m[:, None]).to(C_ptr.dtype.element_ty)
+                prev_states = tl.load(
+                    prev_states_ptrs,
+                    mask=(offs_k_dstate[:, None] < dstate - k)
+                    & (offs_n[None, :] < hdim),
+                    other=0.0,
+                )
+                prev_states = prev_states.to(C_ptr.dtype.element_ty)
+                acc += tl.dot(C, prev_states)
+                C_ptrs += BLOCK_SIZE_K
+                prev_states_ptrs += BLOCK_SIZE_K
+            acc *= scale_m[:, None]
+
+    offs_k = tl.arange(0, BLOCK_SIZE_K) + c_off
+    cb_ptrs = cb_ptr + (
+        offs_m[:, None] * stride_cb_csize_m + offs_k[None, :] * stride_cb_csize_k
+    )
+    x_ptrs = x_ptr + (
+        offs_k[:, None] * stride_x_seqlen + offs_n[None, :] * stride_x_hdim
+    )
+    dt_ptrs = dt_ptr + offs_k * stride_dt_csize
+    dA_cumsum_ptrs = dA_cumsum_ptr + offs_k * stride_dA_cs_csize
+    K_MAX = (
+        chunk_size_limit
+        if not IS_CAUSAL
+        else min((pid_m + 1) * BLOCK_SIZE_M, chunk_size_limit)
+    )
+    for k in range(0, K_MAX, BLOCK_SIZE_K):
+        cb = tl.load(
+            cb_ptrs,
+            mask=(offs_m[:, None] < chunk_size) & (offs_k[None, :] < chunk_size - k),
+            other=0.0,
+        ).to(tl.float32)
+        dA_cs_k = tl.load(dA_cumsum_ptrs, mask=offs_k < chunk_size - k, other=0.0).to(
+            tl.float32
+        )
+        # If there's seq_idx, we already set cb[i, j] = 0 for seq_idx[i] != seq_idx[j].
+        # So we don't need masking wrt seq_idx here.
+        cb *= tl.exp(dA_cs_m[:, None] - dA_cs_k[None, :])
+        dt_k = tl.load(dt_ptrs, mask=offs_k < chunk_size - k, other=0.0).to(tl.float32)
+        cb *= dt_k
+        if IS_CAUSAL:
+            mask = offs_m[:, None] >= k + offs_k[None, :]
+            cb = tl.where(mask, cb, 0.0)
+        cb = cb.to(x_ptr.dtype.element_ty)
+        x = tl.load(
+            x_ptrs,
+            mask=(offs_k[:, None] < chunk_size_limit - k) & (offs_n[None, :] < hdim),
+            other=0.0,
+        )
+        acc += tl.dot(cb, x)
+        cb_ptrs += BLOCK_SIZE_K * stride_cb_csize_k
+        x_ptrs += BLOCK_SIZE_K * stride_x_seqlen
+        dt_ptrs += BLOCK_SIZE_K * stride_dt_csize
+        dA_cumsum_ptrs += BLOCK_SIZE_K * stride_dA_cs_csize
+
+    offs_out_m = pid_m * BLOCK_SIZE_M + c_off + tl.arange(0, BLOCK_SIZE_M)
+    offs_out_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+
+    if HAS_D:
+        if D_HAS_HDIM:
+            D = tl.load(
+                D_ptr + pid_h * stride_D_head + offs_n, mask=offs_n < hdim, other=0.0
+            ).to(tl.float32)
+        else:
+            D = tl.load(D_ptr + pid_h * stride_D_head).to(tl.float32)
+        x_residual = tl.load(
+            x_ptr
+            + (offs_m[:, None] * stride_x_seqlen + offs_n[None, :] * stride_x_hdim),
+            mask=(offs_m[:, None] < chunk_size_limit) & (offs_n[None, :] < hdim),
+            other=0.0,
+        ).to(tl.float32)
+        acc += x_residual * D
+
+    if HAS_Z:
+        out_x_ptr += (
+            pid_b * stride_out_batch
+            + c_idx * chunk_size * stride_out_seqlen
+            + pid_h * stride_out_head
+        )
+        out_x_ptrs = out_x_ptr + (
+            stride_out_seqlen * offs_out_m[:, None] + offs_out_n[None, :]
+        )
+        tl.store(
+            out_x_ptrs,
+            acc,
+            mask=(offs_out_m[:, None] < chunk_size_limit)
+            & (offs_out_n[None, :] < hdim),
+        )
+
+        z_ptr += (
+            pid_b * stride_z_batch
+            + c_idx * chunk_size * stride_z_seqlen
+            + pid_h * stride_z_head
+        )
+        z_ptrs = z_ptr + (
+            stride_z_seqlen * offs_out_m[:, None] + stride_z_hdim * offs_out_n[None, :]
+        )
+        z = tl.load(
+            z_ptrs,
+            mask=(offs_out_m[:, None] < chunk_size_limit)
+            & (offs_out_n[None, :] < hdim),
+            other=0.0,
+        ).to(tl.float32)
+        acc *= z * tl.sigmoid(z)
+
+    out_ptr += (
+        pid_b * stride_out_batch
+        + c_idx * chunk_size * stride_out_seqlen
+        + pid_h * stride_out_head
+    )
+    out_ptrs = out_ptr + (
+        stride_out_seqlen * offs_out_m[:, None] + offs_out_n[None, :] * stride_out_hdim
+    )
+    tl.store(
+        out_ptrs,
+        acc,
+        mask=(offs_out_m[:, None] < chunk_size_limit) & (offs_out_n[None, :] < hdim),
+    )
+
+
+def _chunk_scan_fwd(
+    cb,
+    x,
+    dt,
+    dA_cumsum,
+    C,
+    states,
+    D=None,
+    z=None,
+    seq_idx=None,
+    chunk_indices=None,
+    chunk_offsets=None,
+    initial_states=None,
+    out=None,
+):
+    batch, seqlen, nheads, headdim = x.shape
+    _, _, nchunks, chunk_size = dt.shape
+    _, _, ngroups, dstate = C.shape
+    assert nheads % ngroups == 0
+    assert C.shape == (batch, seqlen, ngroups, dstate)
+    assert cb.shape == (batch, nchunks, ngroups, chunk_size, chunk_size)
+    if z is not None:
+        assert z.shape == x.shape
+    if D is not None:
+        assert D.shape == (nheads, headdim) or D.shape == (nheads,)
+    assert dt.shape == (batch, nheads, nchunks, chunk_size)
+    assert dA_cumsum.shape == (batch, nheads, nchunks, chunk_size)
+    assert states.shape == (batch, nchunks, nheads, headdim, dstate)
+
+    if seq_idx is not None:
+        assert seq_idx.shape == (batch, seqlen)
+
+        if initial_states is not None:
+            # with initial states, we need to take care of how
+            # seq_idx crosses the boundaries
+            assert batch == 1, "chunk scan only supports initial states with batch 1"
+            assert (
+                chunk_indices is not None and chunk_offsets is not None
+            ), "chunk_indices and chunk_offsets should have been set"
+        else:
+            chunk_indices, chunk_offsets = None, None
+    else:
+        chunk_indices, chunk_offsets = None, None
+
+    assert out.shape == x.shape
+
+    if z is not None:
+        out_x = torch.empty_like(x)
+        assert out_x.stride() == out.stride()
+    else:
+        out_x = None
+
+    grid = lambda META: (
+        triton.cdiv(chunk_size, META["BLOCK_SIZE_M"])
+        * triton.cdiv(headdim, META["BLOCK_SIZE_N"]),
+        batch * nchunks if chunk_offsets is None else len(chunk_offsets),
+        nheads,
+    )
+    z_strides = (
+        (z.stride(0), z.stride(1), z.stride(2), z.stride(3))
+        if z is not None
+        else (0, 0, 0, 0)
+    )
+    _chunk_scan_fwd_kernel[grid](
+        cb,
+        x,
+        z,
+        out,
+        out_x,
+        dt,
+        dA_cumsum,
+        seq_idx,
+        C,
+        states,
+        D,
+        initial_states,
+        chunk_indices,
+        chunk_offsets,
+        len(chunk_indices) if chunk_indices is not None else 0,
+        chunk_size,
+        headdim,
+        dstate,
+        batch,
+        seqlen,
+        nheads // ngroups,
+        cb.stride(0),
+        cb.stride(1),
+        cb.stride(2),
+        cb.stride(3),
+        cb.stride(4),
+        x.stride(0),
+        x.stride(1),
+        x.stride(2),
+        x.stride(3),
+        z_strides[0],
+        z_strides[1],
+        z_strides[2],
+        z_strides[3],
+        out.stride(0),
+        out.stride(1),
+        out.stride(2),
+        out.stride(3),
+        dt.stride(0),
+        dt.stride(2),
+        dt.stride(1),
+        dt.stride(3),
+        dA_cumsum.stride(0),
+        dA_cumsum.stride(2),
+        dA_cumsum.stride(1),
+        dA_cumsum.stride(3),
+        *((seq_idx.stride(0), seq_idx.stride(1)) if seq_idx is not None else (0, 0)),
+        C.stride(0),
+        C.stride(1),
+        C.stride(2),
+        C.stride(3),
+        states.stride(0),
+        states.stride(1),
+        states.stride(2),
+        states.stride(3),
+        states.stride(4),
+        *(
+            (
+                initial_states.stride(0),
+                initial_states.stride(1),
+                initial_states.stride(2),
+                initial_states.stride(3),
+            )
+            if initial_states is not None
+            else (0, 0, 0, 0)
+        ),
+        D.stride(0) if D is not None else 0,
+        True,
+        D is not None,
+        D.dim() == 2 if D is not None else True,
+        BLOCK_SIZE_DSTATE=max(triton.next_power_of_2(dstate), 16),
+        HAS_Z=z is not None,
+        HAS_SEQ_IDX=seq_idx is not None,
+        IS_TRITON_22=TRITON_22,
+        HAS_INITSTATES=initial_states is not None,
+    )
+    return out_x
diff --git a/sglang/python/sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py b/sglang/python/sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py
new file mode 100644
index 0000000000000000000000000000000000000000..2dd58380027f1fa08a18640311aed4673808eece
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py
@@ -0,0 +1,646 @@
+# Adapted from: https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py
+
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Copyright (c) 2024, Tri Dao, Albert Gu.
+# Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/ssd_chunk_state.py
+
+# ruff: noqa: E501
+
+import math
+
+import torch
+import triton
+import triton.language as tl
+
+from .mamba_ssm import softplus
+
+
+@triton.jit
+def _chunk_cumsum_fwd_kernel(
+    # Pointers to matrices
+    dt_ptr,
+    A_ptr,
+    dt_bias_ptr,
+    dt_out_ptr,
+    dA_cumsum_ptr,
+    # Matrix dimension
+    batch,
+    seqlen,
+    nheads,
+    chunk_size,
+    dt_min,
+    dt_max,
+    # Strides
+    stride_dt_batch,
+    stride_dt_seqlen,
+    stride_dt_head,
+    stride_A_head,
+    stride_dt_bias_head,
+    stride_dt_out_batch,
+    stride_dt_out_chunk,
+    stride_dt_out_head,
+    stride_dt_out_csize,
+    stride_dA_cs_batch,
+    stride_dA_cs_chunk,
+    stride_dA_cs_head,
+    stride_dA_cs_csize,
+    # Meta-parameters
+    DT_SOFTPLUS: tl.constexpr,
+    HAS_DT_BIAS: tl.constexpr,
+    BLOCK_SIZE_CHUNK: tl.constexpr,
+    BLOCK_SIZE_H: tl.constexpr = 16,
+):
+    pid_b = tl.program_id(axis=0)
+
+    # if dt is long, may cause problems, so use 64 bit
+    # https://github.com/triton-lang/triton/issues/1058
+    pid_c = tl.program_id(axis=1).to(tl.int64)
+    pid_h = tl.program_id(axis=2)
+    dt_ptr += pid_b * stride_dt_batch + pid_c * chunk_size * stride_dt_seqlen
+    dt_out_ptr += pid_b * stride_dt_out_batch + pid_c * stride_dt_out_chunk
+    dA_cumsum_ptr += pid_b * stride_dA_cs_batch + pid_c * stride_dA_cs_chunk
+
+    offs_h = pid_h * BLOCK_SIZE_H + tl.arange(0, BLOCK_SIZE_H)
+    offs_c = tl.arange(0, BLOCK_SIZE_CHUNK)
+    dt_ptrs = dt_ptr + (
+        offs_h[:, None] * stride_dt_head + offs_c[None, :] * stride_dt_seqlen
+    )
+    A_ptrs = A_ptr + offs_h * stride_A_head
+    dt_out_ptrs = dt_out_ptr + (
+        offs_h[:, None] * stride_dt_out_head + offs_c[None, :] * stride_dt_out_csize
+    )
+    dA_cs_ptrs = dA_cumsum_ptr + (
+        offs_h[:, None] * stride_dA_cs_head + offs_c[None, :] * stride_dA_cs_csize
+    )
+    chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
+
+    dt = tl.load(
+        dt_ptrs,
+        mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit),
+        other=0.0,
+    ).to(tl.float32)
+    if HAS_DT_BIAS:
+        dt_bias = tl.load(
+            dt_bias_ptr + offs_h * stride_dt_bias_head, mask=offs_h < nheads, other=0.0
+        ).to(tl.float32)
+        dt += dt_bias[:, None]
+    if DT_SOFTPLUS:
+        dt = tl.where(dt <= 20.0, softplus(dt), dt)
+    # As of Triton 2.2.0, tl.clamp is not available yet
+    # dt = tl.clamp(dt, dt_min, dt_max)
+    dt = tl.minimum(tl.maximum(dt, dt_min), dt_max)
+    dt = tl.where(
+        (offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit), dt, 0.0
+    )
+    tl.store(
+        dt_out_ptrs,
+        dt,
+        mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size),
+    )
+    A = tl.load(A_ptrs, mask=offs_h < nheads, other=0.0).to(tl.float32)
+    dA = dt * A[:, None]
+    dA_cs = tl.cumsum(dA, axis=1)
+    tl.store(
+        dA_cs_ptrs,
+        dA_cs,
+        mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size),
+    )
+
+
+@triton.jit
+def _chunk_state_fwd_kernel(
+    # Pointers to matrices
+    x_ptr,
+    b_ptr,
+    states_ptr,
+    dt_ptr,
+    dA_cumsum_ptr,
+    seq_idx_ptr,
+    # Matrix dimensions
+    hdim,
+    dstate,
+    chunk_size,
+    batch,
+    seqlen,
+    nheads_ngroups_ratio,
+    # Strides
+    stride_x_batch,
+    stride_x_seqlen,
+    stride_x_head,
+    stride_x_hdim,
+    stride_b_batch,
+    stride_b_seqlen,
+    stride_b_head,
+    stride_b_dstate,
+    stride_states_batch,
+    stride_states_chunk,
+    stride_states_head,
+    stride_states_hdim,
+    stride_states_dstate,
+    stride_dt_batch,
+    stride_dt_chunk,
+    stride_dt_head,
+    stride_dt_csize,
+    stride_dA_cs_batch,
+    stride_dA_cs_chunk,
+    stride_dA_cs_head,
+    stride_dA_cs_csize,
+    stride_seq_idx_batch,
+    stride_seq_idx_seqlen,
+    # Meta-parameters
+    HAS_SEQ_IDX: tl.constexpr,
+    BLOCK_SIZE_M: tl.constexpr = 16,
+    BLOCK_SIZE_N: tl.constexpr = 16,
+    BLOCK_SIZE_K: tl.constexpr = 16,
+):
+    pid_bc = tl.program_id(axis=1).to(tl.int64)
+    pid_c = pid_bc // batch
+    pid_b = pid_bc - pid_c * batch
+    pid_h = tl.program_id(axis=2)
+    num_pid_n = tl.cdiv(dstate, BLOCK_SIZE_N)
+    pid_m = tl.program_id(axis=0) // num_pid_n
+    pid_n = tl.program_id(axis=0) % num_pid_n
+    b_ptr += (
+        pid_b * stride_b_batch
+        + pid_c * chunk_size * stride_b_seqlen
+        + (pid_h // nheads_ngroups_ratio) * stride_b_head
+    )
+    x_ptr += (
+        pid_b * stride_x_batch
+        + pid_c * chunk_size * stride_x_seqlen
+        + pid_h * stride_x_head
+    )
+    dt_ptr += pid_b * stride_dt_batch + pid_c * stride_dt_chunk + pid_h * stride_dt_head
+    dA_cumsum_ptr += (
+        pid_b * stride_dA_cs_batch
+        + pid_c * stride_dA_cs_chunk
+        + pid_h * stride_dA_cs_head
+    )
+    if HAS_SEQ_IDX:
+        seq_idx_ptr += (
+            pid_b * stride_seq_idx_batch + pid_c * chunk_size * stride_seq_idx_seqlen
+        )
+
+    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+    x_ptrs = x_ptr + (
+        offs_m[:, None] * stride_x_hdim + offs_k[None, :] * stride_x_seqlen
+    )
+    b_ptrs = b_ptr + (
+        offs_n[None, :] * stride_b_dstate + offs_k[:, None] * stride_b_seqlen
+    )
+    dt_ptrs = dt_ptr + offs_k * stride_dt_csize
+    dA_cs_last = tl.load(dA_cumsum_ptr + (chunk_size - 1) * stride_dA_cs_csize).to(
+        tl.float32
+    )
+    dA_cumsum_ptrs = dA_cumsum_ptr + offs_k * stride_dA_cs_csize
+    if HAS_SEQ_IDX:
+        seq_idx_ptrs = seq_idx_ptr + offs_k * stride_seq_idx_seqlen
+
+    chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size)
+    if HAS_SEQ_IDX:
+        seq_idx_last = tl.load(
+            seq_idx_ptr + (chunk_size_limit - 1) * stride_seq_idx_seqlen
+        )
+
+    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+    for k in range(0, chunk_size_limit, BLOCK_SIZE_K):
+        x = tl.load(
+            x_ptrs,
+            mask=(offs_m[:, None] < hdim) & (offs_k[None, :] < chunk_size_limit - k),
+            other=0.0,
+        )
+        b = tl.load(
+            b_ptrs,
+            mask=(offs_k[:, None] < chunk_size_limit - k) & (offs_n[None, :] < dstate),
+            other=0.0,
+        ).to(tl.float32)
+        dA_cs_k = tl.load(
+            dA_cumsum_ptrs, mask=offs_k < chunk_size_limit - k, other=0.0
+        ).to(tl.float32)
+        if HAS_SEQ_IDX:
+            seq_idx_k = tl.load(
+                seq_idx_ptrs, mask=offs_k < chunk_size_limit - k, other=-1
+            )
+        dt_k = tl.load(dt_ptrs, mask=offs_k < chunk_size_limit - k, other=0.0).to(
+            tl.float32
+        )
+        if not HAS_SEQ_IDX:
+            scale = tl.exp(dA_cs_last - dA_cs_k) * dt_k
+        else:
+            scale = tl.where(
+                seq_idx_k == seq_idx_last, tl.exp(dA_cs_last - dA_cs_k) * dt_k, 0.0
+            )
+        b *= scale[:, None]
+        b = b.to(x_ptr.dtype.element_ty)
+        acc += tl.dot(x, b)
+        x_ptrs += BLOCK_SIZE_K * stride_x_seqlen
+        b_ptrs += BLOCK_SIZE_K * stride_b_seqlen
+        dt_ptrs += BLOCK_SIZE_K * stride_dt_csize
+        dA_cumsum_ptrs += BLOCK_SIZE_K * stride_dA_cs_csize
+        if HAS_SEQ_IDX:
+            seq_idx_ptrs += BLOCK_SIZE_K * stride_seq_idx_seqlen
+    states = acc.to(states_ptr.dtype.element_ty)
+
+    states_ptr += (
+        pid_b * stride_states_batch
+        + pid_c * stride_states_chunk
+        + pid_h * stride_states_head
+    )
+    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    states_ptrs = states_ptr + (
+        offs_m[:, None] * stride_states_hdim + offs_n[None, :] * stride_states_dstate
+    )
+    c_mask = (offs_m[:, None] < hdim) & (offs_n[None, :] < dstate)
+    tl.store(states_ptrs, states, mask=c_mask)
+
+
+@triton.jit
+def _chunk_state_varlen_kernel(
+    # Pointers to matrices
+    x_ptr,
+    b_ptr,
+    dt_ptr,
+    dA_cumsum_ptr,
+    chunk_states_ptr,
+    cu_seqlens_ptr,
+    states_ptr,
+    initstates_ptr,
+    # Matrix dimensions
+    hdim,
+    dstate,
+    chunk_size,
+    seqlen,
+    nheads_ngroups_ratio,
+    # Strides
+    stride_x_seqlen,
+    stride_x_head,
+    stride_x_hdim,
+    stride_b_seqlen,
+    stride_b_head,
+    stride_b_dstate,
+    stride_dt_chunk,
+    stride_dt_head,
+    stride_dt_csize,
+    stride_dA_cs_chunk,
+    stride_dA_cs_head,
+    stride_dA_cs_csize,
+    stride_chunk_states_chunk,
+    stride_chunk_states_head,
+    stride_chunk_states_hdim,
+    stride_chunk_states_dstate,
+    stride_states_batch,
+    stride_states_head,
+    stride_states_hdim,
+    stride_states_dstate,
+    stride_init_states_batch,
+    stride_init_states_head,
+    stride_init_states_hdim,
+    stride_init_states_dstate,
+    # Meta-parameters
+    HAS_INITSTATES: tl.constexpr,
+    BLOCK_SIZE_M: tl.constexpr = 16,
+    BLOCK_SIZE_N: tl.constexpr = 16,
+    BLOCK_SIZE_K: tl.constexpr = 16,
+):
+    pid_b = tl.program_id(axis=1)
+    pid_h = tl.program_id(axis=2)
+    num_pid_n = tl.cdiv(dstate, BLOCK_SIZE_N)
+    pid_m = tl.program_id(axis=0) // num_pid_n
+    pid_n = tl.program_id(axis=0) % num_pid_n
+    end_idx = tl.load(cu_seqlens_ptr + pid_b + 1)
+    pid_c = (end_idx - 1) // chunk_size
+    b_ptr += (
+        pid_c * chunk_size * stride_b_seqlen
+        + (pid_h // nheads_ngroups_ratio) * stride_b_head
+    )
+    x_ptr += pid_c * chunk_size * stride_x_seqlen + pid_h * stride_x_head
+    dt_ptr += pid_c * stride_dt_chunk + pid_h * stride_dt_head
+    dA_cumsum_ptr += pid_c * stride_dA_cs_chunk + pid_h * stride_dA_cs_head
+    chunk_states_ptr += (
+        pid_c * stride_chunk_states_chunk + pid_h * stride_chunk_states_head
+    )
+
+    if HAS_INITSTATES:
+        # if there are init states provided, we differentiate between states (which
+        # are boundary conditions at a chunk boundary) and initstates (which are boundary
+        # conditions when a new example in a cont batch starts)
+        initstates_ptr += pid_h * stride_init_states_head
+
+    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+    x_ptrs = x_ptr + (
+        offs_m[:, None] * stride_x_hdim + offs_k[None, :] * stride_x_seqlen
+    )
+    b_ptrs = b_ptr + (
+        offs_n[None, :] * stride_b_dstate + offs_k[:, None] * stride_b_seqlen
+    )
+    dt_ptrs = dt_ptr + offs_k * stride_dt_csize
+    dA_cs_last = tl.load(
+        dA_cumsum_ptr + (end_idx - pid_c * chunk_size - 1) * stride_dA_cs_csize
+    ).to(tl.float32)
+    dA_cumsum_ptrs = dA_cumsum_ptr + offs_k * stride_dA_cs_csize
+
+    chunk_size_limit = end_idx - pid_c * chunk_size
+    start_idx = tl.load(cu_seqlens_ptr + pid_b)
+    start_idx_cur = tl.maximum(start_idx - pid_c * chunk_size, 0)
+
+    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+    for k in range(0, chunk_size_limit, BLOCK_SIZE_K):
+        x = tl.load(
+            x_ptrs,
+            mask=(offs_m[:, None] < hdim)
+            & (offs_k[None, :] < chunk_size_limit - k)
+            & (offs_k[None, :] >= start_idx_cur - k),
+            other=0.0,
+        )
+        b = tl.load(
+            b_ptrs,
+            mask=(offs_k[:, None] < chunk_size_limit - k)
+            & (offs_n[None, :] < dstate)
+            & (offs_k[:, None] >= start_idx_cur - k),
+            other=0.0,
+        ).to(tl.float32)
+        dA_cs_k = tl.load(
+            dA_cumsum_ptrs, mask=offs_k < chunk_size_limit - k, other=0.0
+        ).to(tl.float32)
+        dt_k = tl.load(dt_ptrs, mask=offs_k < chunk_size_limit - k, other=0.0).to(
+            tl.float32
+        )
+        scale = tl.where(
+            (offs_k >= start_idx_cur - k) & (offs_k < chunk_size_limit - k),
+            tl.exp(dA_cs_last - dA_cs_k) * dt_k,
+            0.0,
+        )
+        b *= scale[:, None]
+        b = b.to(x_ptr.dtype.element_ty)
+        acc += tl.dot(x, b)
+        x_ptrs += BLOCK_SIZE_K * stride_x_seqlen
+        b_ptrs += BLOCK_SIZE_K * stride_b_seqlen
+        dt_ptrs += BLOCK_SIZE_K * stride_dt_csize
+        dA_cumsum_ptrs += BLOCK_SIZE_K * stride_dA_cs_csize
+
+    # If the sequence starts after the last chunk idx, we don't need to add the contribution from the last chunk
+    # If HAS_INITSTATES==True need to consider two possibilities
+    # - if start_idx < pid_c * chunk_size, then we need to take the past_states_ptrs
+    # - if state_idx >= pid * chunk_size, then we need to insert initstates
+    if (start_idx < pid_c * chunk_size) or (HAS_INITSTATES):  # first chunk
+
+        dA_cs_boundary = 0.0  # default
+
+        if not HAS_INITSTATES:
+            past_states_ptrs = chunk_states_ptr + (
+                offs_m[:, None] * stride_chunk_states_hdim
+                + offs_n[None, :] * stride_chunk_states_dstate
+            )
+        else:
+
+            # - this seems repetitive, buts its to help the compiler
+            if start_idx < pid_c * chunk_size:
+                past_states_ptrs = chunk_states_ptr + (
+                    offs_m[:, None] * stride_chunk_states_hdim
+                    + offs_n[None, :] * stride_chunk_states_dstate
+                )
+            else:
+                past_states_ptrs = initstates_ptr + (
+                    pid_b * stride_init_states_batch
+                    + offs_m[:, None] * stride_init_states_hdim
+                    + offs_n[None, :] * stride_init_states_dstate
+                )
+
+                # need to adjust the boundary
+                if start_idx > pid_c * chunk_size:
+                    dA_cs_boundary = tl.load(
+                        dA_cumsum_ptr
+                        + (start_idx - pid_c * chunk_size - 1) * stride_dA_cs_csize
+                    ).to(tl.float32)
+
+        past_states = tl.load(
+            past_states_ptrs,
+            mask=(offs_m[:, None] < hdim) & (offs_n[None, :] < dstate),
+            other=0.0,
+        ).to(tl.float32)
+
+        scale = tl.exp(dA_cs_last - dA_cs_boundary)
+        acc += past_states * scale
+
+    states = acc.to(states_ptr.dtype.element_ty)
+
+    states_ptr += pid_b * stride_states_batch + pid_h * stride_states_head
+    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    states_ptrs = states_ptr + (
+        offs_m[:, None] * stride_states_hdim + offs_n[None, :] * stride_states_dstate
+    )
+    c_mask = (offs_m[:, None] < hdim) & (offs_n[None, :] < dstate)
+    tl.store(states_ptrs, states, mask=c_mask)
+
+
+def _chunk_cumsum_fwd(
+    dt, A, chunk_size, dt_bias=None, dt_softplus=False, dt_limit=(0.0, float("inf"))
+):
+    batch, seqlen, nheads = dt.shape
+    assert A.shape == (nheads,)
+    if dt_bias is not None:
+        assert dt_bias.shape == (nheads,)
+    nchunks = math.ceil(seqlen / chunk_size)
+    dt_out = torch.empty(
+        batch, nheads, nchunks, chunk_size, device=dt.device, dtype=torch.float32
+    )
+    dA_cumsum = torch.empty(
+        batch, nheads, nchunks, chunk_size, device=dt.device, dtype=torch.float32
+    )
+    grid_chunk_cs = lambda META: (
+        batch,
+        nchunks,
+        triton.cdiv(nheads, META["BLOCK_SIZE_H"]),
+    )
+    with torch.cuda.device(dt.device.index):
+        _chunk_cumsum_fwd_kernel[grid_chunk_cs](
+            dt,
+            A,
+            dt_bias,
+            dt_out,
+            dA_cumsum,
+            batch,
+            seqlen,
+            nheads,
+            chunk_size,
+            dt_limit[0],
+            dt_limit[1],
+            dt.stride(0),
+            dt.stride(1),
+            dt.stride(2),
+            A.stride(0),
+            dt_bias.stride(0) if dt_bias is not None else 0,
+            dt_out.stride(0),
+            dt_out.stride(2),
+            dt_out.stride(1),
+            dt_out.stride(3),
+            dA_cumsum.stride(0),
+            dA_cumsum.stride(2),
+            dA_cumsum.stride(1),
+            dA_cumsum.stride(3),
+            dt_softplus,
+            HAS_DT_BIAS=dt_bias is not None,
+            BLOCK_SIZE_CHUNK=triton.next_power_of_2(chunk_size),
+        )
+    return dA_cumsum, dt_out
+
+
+def _chunk_state_fwd(
+    B, x, dt, dA_cumsum, seq_idx=None, states=None, states_in_fp32=True
+):
+    batch, seqlen, nheads, headdim = x.shape
+    _, _, nchunks, chunk_size = dt.shape
+    _, _, ngroups, dstate = B.shape
+    assert nheads % ngroups == 0
+    assert B.shape == (batch, seqlen, ngroups, dstate)
+    assert dt.shape == (batch, nheads, nchunks, chunk_size)
+    assert dA_cumsum.shape == dt.shape
+    if seq_idx is not None:
+        assert seq_idx.shape == (batch, seqlen)
+    if states is not None:
+        assert states.shape == (batch, nchunks, nheads, headdim, dstate)
+    else:
+        states_dtype = torch.float32 if states_in_fp32 else B.dtype
+        states = torch.empty(
+            (batch, nchunks, nheads, headdim, dstate),
+            device=x.device,
+            dtype=states_dtype,
+        )
+    grid = lambda META: (
+        triton.cdiv(headdim, META["BLOCK_SIZE_M"])
+        * triton.cdiv(dstate, META["BLOCK_SIZE_N"]),
+        batch * nchunks,
+        nheads,
+    )
+    with torch.cuda.device(x.device.index):
+        _chunk_state_fwd_kernel[grid](
+            x,
+            B,
+            states,
+            dt,
+            dA_cumsum,
+            seq_idx,
+            headdim,
+            dstate,
+            chunk_size,
+            batch,
+            seqlen,
+            nheads // ngroups,
+            x.stride(0),
+            x.stride(1),
+            x.stride(2),
+            x.stride(3),
+            B.stride(0),
+            B.stride(1),
+            B.stride(2),
+            B.stride(-1),
+            states.stride(0),
+            states.stride(1),
+            states.stride(2),
+            states.stride(3),
+            states.stride(4),
+            dt.stride(0),
+            dt.stride(2),
+            dt.stride(1),
+            dt.stride(3),
+            dA_cumsum.stride(0),
+            dA_cumsum.stride(2),
+            dA_cumsum.stride(1),
+            dA_cumsum.stride(3),
+            *(
+                (seq_idx.stride(0), seq_idx.stride(1))
+                if seq_idx is not None
+                else (0, 0)
+            ),
+            HAS_SEQ_IDX=seq_idx is not None,
+        )
+    return states
+
+
+def chunk_state_varlen(
+    B, x, dt, dA_cumsum, cu_seqlens, chunk_states, initial_states=None
+):
+    total_seqlen, nheads, headdim = x.shape
+    _, nchunks, chunk_size = dt.shape
+    _, ngroups, dstate = B.shape
+    batch = cu_seqlens.shape[0] - 1
+    cu_seqlens = cu_seqlens.contiguous()
+    assert nheads % ngroups == 0
+    assert B.shape == (total_seqlen, ngroups, dstate)
+    assert dt.shape == (nheads, nchunks, chunk_size)
+    assert dA_cumsum.shape == dt.shape
+    assert chunk_states.shape == (nchunks, nheads, headdim, dstate)
+
+    if initial_states is not None:
+        assert initial_states.shape == (batch, nheads, headdim, dstate)
+
+    states = torch.empty(
+        batch,
+        nheads,
+        headdim,
+        dstate,
+        dtype=chunk_states.dtype,
+        device=chunk_states.device,
+    )
+    grid = lambda META: (
+        triton.cdiv(headdim, META["BLOCK_SIZE_M"])
+        * triton.cdiv(dstate, META["BLOCK_SIZE_N"]),
+        batch,
+        nheads,
+    )
+    with torch.cuda.device(x.device.index):
+        _chunk_state_varlen_kernel[grid](
+            x,
+            B,
+            dt,
+            dA_cumsum,
+            chunk_states,
+            cu_seqlens,
+            states,
+            initial_states,
+            headdim,
+            dstate,
+            chunk_size,
+            total_seqlen,
+            nheads // ngroups,
+            x.stride(0),
+            x.stride(1),
+            x.stride(2),
+            B.stride(0),
+            B.stride(1),
+            B.stride(2),
+            dt.stride(1),
+            dt.stride(0),
+            dt.stride(2),
+            dA_cumsum.stride(1),
+            dA_cumsum.stride(0),
+            dA_cumsum.stride(2),
+            chunk_states.stride(0),
+            chunk_states.stride(1),
+            chunk_states.stride(2),
+            chunk_states.stride(3),
+            states.stride(0),
+            states.stride(1),
+            states.stride(2),
+            states.stride(3),
+            *(
+                (
+                    initial_states.stride(0),
+                    initial_states.stride(1),
+                    initial_states.stride(2),
+                    initial_states.stride(3),
+                )
+                if initial_states is not None
+                else (0, 0, 0, 0)
+            ),
+            HAS_INITSTATES=initial_states is not None,
+        )
+    return states
diff --git a/sglang/python/sglang/srt/layers/attention/mamba/ops/ssd_combined.py b/sglang/python/sglang/srt/layers/attention/mamba/ops/ssd_combined.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e2e74752bab0091ce6e49b583d1e7e17fc34e7d
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/mamba/ops/ssd_combined.py
@@ -0,0 +1,261 @@
+# Adapted from: https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/mamba/ops/ssd_combined.py
+
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Copyright (c) 2024, Tri Dao, Albert Gu.
+# Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/ssd_combined.py
+
+# ruff: noqa: E501
+
+import torch
+import triton
+from einops import rearrange
+from packaging import version
+
+from .ssd_bmm import _bmm_chunk_fwd
+from .ssd_chunk_scan import _chunk_scan_fwd
+from .ssd_chunk_state import _chunk_cumsum_fwd, _chunk_state_fwd, chunk_state_varlen
+from .ssd_state_passing import _state_passing_fwd
+
+TRITON_22 = version.parse(triton.__version__) >= version.parse("2.2.0")
+
+
+def is_int_pow_2(n):
+    return isinstance(n, int) and n > 0 and (n & (n - 1)) == 0
+
+
+def _mamba_chunk_scan_combined_fwd(
+    x,
+    dt,
+    A,
+    B,
+    C,
+    chunk_size,
+    D=None,
+    z=None,
+    dt_bias=None,
+    initial_states=None,
+    seq_idx=None,
+    chunk_indices=None,
+    chunk_offsets=None,
+    cu_seqlens=None,
+    dt_softplus=False,
+    dt_limit=(0.0, float("inf")),
+    state_dtype=None,
+    out=None,
+):
+    assert is_int_pow_2(chunk_size), "chunk_size must be integer power of 2"
+    batch, seqlen, nheads, headdim = x.shape
+    _, _, ngroups, dstate = B.shape
+    assert nheads % ngroups == 0
+    assert B.shape == (batch, seqlen, ngroups, dstate)
+    assert dt.shape == (batch, seqlen, nheads)
+    assert A.shape == (nheads,)
+    assert C.shape == B.shape
+    if z is not None:
+        assert z.shape == x.shape
+    if D is not None:
+        assert D.shape == (nheads, headdim) or D.shape == (nheads,)
+    if seq_idx is not None:
+        assert seq_idx.shape == (batch, seqlen)
+    if B.stride(-1) != 1:
+        B = B.contiguous()
+    if C.stride(-1) != 1:
+        C = C.contiguous()
+    if (
+        x.stride(-1) != 1 and x.stride(1) != 1
+    ):  # Either M or K dimension should be contiguous
+        x = x.contiguous()
+    if (
+        z is not None and z.stride(-1) != 1 and z.stride(1) != 1
+    ):  # Either M or K dimension should be contiguous
+        z = z.contiguous()
+    if D is not None and D.stride(-1) != 1:
+        D = D.contiguous()
+    if initial_states is not None:
+        if cu_seqlens is None:
+            assert initial_states.shape == (batch, nheads, headdim, dstate)
+        else:
+            assert initial_states.shape == (
+                len(cu_seqlens) - 1,
+                nheads,
+                headdim,
+                dstate,
+            )
+
+    # This function executes 5 sub-functions for computing mamba
+    # - a good resource is the blog https://goombalab.github.io/blog/2024/mamba2-part3-algorithm/
+    #   which has a minimal implementation to understand the below operations
+    # - as explained by the blog, mamba is a special case of causal attention
+    # - the idea is to chunk the attention matrix and compute each
+    #   submatrix separately using different optimizations.
+    # - see the blog and paper for a visualization of the submatrices
+    #   which we refer to in the comments below
+
+    # 1. Compute chunked cumsum of A * dt
+    # - here dt may go through a softplus activation
+    dA_cumsum, dt = _chunk_cumsum_fwd(
+        dt, A, chunk_size, dt_bias=dt_bias, dt_softplus=dt_softplus, dt_limit=dt_limit
+    )
+
+    # 2. Compute the state for each intra-chunk
+    # (right term of low-rank factorization of off-diagonal blocks; B terms)
+    states = _chunk_state_fwd(B, x, dt, dA_cumsum, seq_idx=seq_idx, states_in_fp32=True)
+
+    # 3. Compute the inter-chunk SSM recurrence; produces correct SSM states at chunk boundaries
+    # (middle term of factorization of off-diag blocks; A terms)
+    # - for handling chunked prefill, this requires i) initial_states
+    #   ii) seq_idx iii) is_cont_batched and (iv) chunk_offsets to be all specified.
+    # - When a new seq_idx is detected, we will stop passing the prev_state
+    #   and switch accordingly to the init_state corresponding to the new seq_idx.
+    # - We will also make sure that the dA_cumsum is taken only from the start of the
+    #   sequence (hence we need the full dA_cumsum tensor and not just the values at chunk boundaries)
+    # - this will ensure that states will be updated with the rightmost flushed seq_idx
+    #   of the previous chunk. This implies that the first chunk of states is either 0
+    #   or equal to init_states of the first example.
+    states, final_states = _state_passing_fwd(
+        rearrange(states, "... p n -> ... (p n)"),
+        dA_cumsum,
+        initial_states=(
+            rearrange(initial_states, "... p n -> ... (p n)")
+            if initial_states is not None
+            else None
+        ),
+        seq_idx=seq_idx,
+        chunk_size=chunk_size,
+        out_dtype=state_dtype if state_dtype is not None else C.dtype,
+        is_cont_batched=cu_seqlens is not None,
+        chunk_offsets=chunk_offsets,
+    )
+    states, final_states = (
+        rearrange(t, "... (p n) -> ... p n", n=dstate) for t in [states, final_states]
+    )
+
+    # 4. Compute batched matrix multiply for C_j^T B_i terms
+    CB = _bmm_chunk_fwd(C, B, chunk_size, seq_idx=seq_idx, output_dtype=torch.float32)
+
+    # 5. Scan and compute the diagonal blocks, taking into
+    #    account past causal states.
+    # - if initial states are provided, then states information will be
+    #   augmented with initial_states.
+    # - to do this properly, we need to account for example changes in
+    #   the continuous batch, therefore we introduce pseudo chunks, which is
+    #   a chunk that is split up each time an example changes.
+    # - in each (pseudo) chunk, we detect if the previous (pseudo) chunk had
+    #   a seq_idx change, in which case we take states information from
+    #   init_states.
+    out_x = _chunk_scan_fwd(
+        CB,
+        x,
+        dt,
+        dA_cumsum,
+        C,
+        states,
+        D=D,
+        z=z,
+        seq_idx=seq_idx,
+        chunk_indices=chunk_indices,
+        chunk_offsets=chunk_offsets,
+        initial_states=initial_states,
+        out=out,
+    )
+    if cu_seqlens is None:
+        return out_x, dt, dA_cumsum, states, final_states
+    else:
+        assert (
+            batch == 1
+        ), "passing cu_seqlens to get the varlen states is only supported if batch dimension is 1"
+        varlen_states = chunk_state_varlen(
+            B.squeeze(0),
+            x.squeeze(0),
+            dt.squeeze(0),
+            dA_cumsum.squeeze(0),
+            cu_seqlens,
+            states.squeeze(0),
+            initial_states=initial_states,
+        )
+        return out_x, dt, dA_cumsum, states, final_states, varlen_states
+
+
+def mamba_chunk_scan_combined(
+    x,
+    dt,
+    A,
+    B,
+    C,
+    chunk_size,
+    D=None,
+    z=None,
+    dt_bias=None,
+    initial_states=None,
+    seq_idx=None,
+    chunk_indices=None,
+    chunk_offsets=None,
+    cu_seqlens=None,
+    dt_softplus=False,
+    dt_limit=(0.0, float("inf")),
+    out=None,
+    return_final_states=False,
+    return_varlen_states=False,
+    state_dtype=None,
+):
+    """
+    Argument:
+        x: (batch, seqlen, nheads, headdim)
+        dt: (batch, seqlen, nheads)
+        A: (nheads)
+        B: (batch, seqlen, ngroups, dstate)
+        C: (batch, seqlen, ngroups, dstate)
+        chunk_size: int
+        D: (nheads, headdim) or (nheads,)
+        z: (batch, seqlen, nheads, headdim)
+        dt_bias: (nheads,)
+        initial_states: (batch, nheads, headdim, dstate)
+        seq_idx: (batch, seqlen)
+        cu_seqlens: (num_sequences + 1) or None, only used if return_varlen_states is True
+        dt_softplus: Whether to apply softplus to dt
+        out: Preallocated output tensor
+        state_dtype: The data type of the ssm state
+    """
+
+    if not return_varlen_states:
+        cu_seqlens = None
+    else:
+        assert (
+            cu_seqlens is not None
+        ), "cu_seqlens must be provided if return_varlen_states is True"
+    out_x, dt_out, dA_cumsum, states, final_states, *rest = (
+        _mamba_chunk_scan_combined_fwd(
+            x,
+            dt,
+            A,
+            B,
+            C,
+            chunk_size,
+            D=D,
+            z=z,
+            dt_bias=dt_bias,
+            initial_states=initial_states,
+            seq_idx=seq_idx,
+            chunk_indices=chunk_indices,
+            chunk_offsets=chunk_offsets,
+            cu_seqlens=cu_seqlens,
+            dt_softplus=dt_softplus,
+            dt_limit=dt_limit,
+            out=out,
+            state_dtype=state_dtype,
+        )
+    )
+    if not return_varlen_states:
+        if not return_final_states:
+            return
+        else:
+            return final_states
+    else:
+        varlen_states = rest[0]
+        return (
+            (varlen_states)
+            if not return_final_states
+            else (final_states, varlen_states)
+        )
diff --git a/sglang/python/sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py b/sglang/python/sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e8c32385ae2f489cc4f5c0cb8373cef2610e8a2
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py
@@ -0,0 +1,264 @@
+# Adapted from: https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/mamba/ops/ssd_state_passing.py
+
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Copyright (c) 2024, Tri Dao, Albert Gu.
+# Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/ssd_state_passing.py
+
+# ruff: noqa: E501
+
+import torch
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def _state_passing_fwd_kernel(
+    # Pointers to matrices
+    states_ptr,
+    out_ptr,
+    final_states_ptr,
+    dA_cs_ptr,
+    initstates_ptr,
+    seq_idx_ptr,
+    chunk_offsets_ptr,
+    chunk_meta_num,
+    # Matrix dimensions
+    dim,
+    nchunks,
+    seqlen,
+    chunk_size,
+    # Strides
+    stride_states_batch,
+    stride_states_chunk,
+    stride_states_head,
+    stride_states_dim,
+    stride_out_batch,
+    stride_out_chunk,
+    stride_out_head,
+    stride_out_dim,
+    stride_final_states_batch,
+    stride_final_states_head,
+    stride_final_states_dim,
+    stride_dA_cs_batch,
+    stride_dA_cs_chunk,
+    stride_dA_cs_head,
+    stride_dA_cs_csize,
+    stride_initstates_batch,
+    stride_initstates_head,
+    stride_initstates_dim,
+    stride_seq_idx_batch,
+    stride_seq_idx_seqlen,
+    # Meta-parameters
+    HAS_INITSTATES: tl.constexpr,
+    HAS_SEQ_IDX: tl.constexpr,
+    IS_CONT_BATCHED: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr = 16,
+):
+    pid_b = tl.program_id(axis=1)
+    pid_h = tl.program_id(axis=2)
+    pid_m = tl.program_id(axis=0)
+    states_ptr += pid_b * stride_states_batch + pid_h * stride_states_head
+    dA_cs_ptr += (
+        pid_b * stride_dA_cs_batch
+        + pid_h * stride_dA_cs_head
+        + (chunk_size - 1) * stride_dA_cs_csize
+    )
+    out_ptr += pid_b * stride_out_batch + pid_h * stride_out_head
+    final_states_ptr += (
+        pid_b * stride_final_states_batch + pid_h * stride_final_states_head
+    )
+    if HAS_INITSTATES:
+        initstates_ptr += pid_h * stride_initstates_head
+        if not IS_CONT_BATCHED:
+            initstates_ptr += pid_b * stride_initstates_batch
+
+    if HAS_SEQ_IDX:
+        seq_idx_ptr += pid_b * stride_seq_idx_batch
+
+    offs_m = pid_m * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    states_ptrs = states_ptr + offs_m * stride_states_dim
+    out_ptrs = out_ptr + offs_m * stride_out_dim
+    final_states_ptrs = final_states_ptr + offs_m * stride_final_states_dim
+
+    # - states will be the past state of the sequence that continues on the current check
+    if not HAS_INITSTATES:
+        states = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)
+    else:
+        initstates_ptr += offs_m * stride_initstates_dim
+        initstates_ptrs = initstates_ptr
+        # - for cont batches, for the first chunk mean it will be the first batch's
+        #   init state
+        states = tl.load(initstates_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
+
+    tl.store(out_ptrs, states, mask=offs_m < dim)
+    out_ptrs += stride_out_chunk
+    prev_seq_idx_chunk_end = 0
+    logical_chunk_idx = 0
+    for c in range(nchunks):
+        new_states = tl.load(states_ptrs, mask=offs_m < dim, other=0.0).to(tl.float32)
+        dA_cs = tl.load(dA_cs_ptr).to(tl.float32)
+        scale_mask = True
+        if HAS_SEQ_IDX:
+            # - the seq to pass forward is the one that is flushed to the right
+            #   boundary.
+            # - that is given by seq_idx_chunk_end below: the sequence index at the end of the chunk.
+            seq_idx_chunk_end = tl.load(
+                seq_idx_ptr
+                + (min((c + 1) * chunk_size, seqlen) - 1) * stride_seq_idx_seqlen
+            )
+            if HAS_INITSTATES:
+                if IS_CONT_BATCHED and prev_seq_idx_chunk_end != seq_idx_chunk_end:
+                    # this means in the current chunk the rightmost flushed seq
+                    # has changed.
+                    # - so we do not propagate the state from previous chunk
+                    # - but rather we load that sequence's init state
+                    initstates_ptrs = (
+                        initstates_ptr + seq_idx_chunk_end * stride_initstates_batch
+                    )
+
+                    # - update state with seq_idx_new's init state
+                    states = tl.load(initstates_ptrs, mask=offs_m < dim, other=0.0).to(
+                        tl.float32
+                    )
+
+                    # - we need to consider the cumsum only of the last sequence in the chunk
+                    # - find its starting position (given by c_off of the logical chunk index)
+                    # - and subtract the cumsum just before that position from the total cumsum
+                    # - first, update the logical chunk index (add the number of sequences in the current physical chunk):
+                    # sequence index at the start of the current chunk
+                    seq_idx_chunk_start = tl.load(
+                        seq_idx_ptr
+                        + min(c * chunk_size, seqlen) * stride_seq_idx_seqlen
+                    )
+                    logical_chunk_idx += seq_idx_chunk_end - seq_idx_chunk_start
+                    # - load the chunk offset:
+                    c_off = tl.load(
+                        chunk_offsets_ptr + logical_chunk_idx,
+                        mask=logical_chunk_idx < chunk_meta_num,
+                        other=0,
+                    )
+                    # - if offset is 0, then the sequence starts at the beginning of the chunk, and we don't need to subtract anything
+                    if c_off > 0:
+                        # - dA_cs_ptr currently points to the cumsum at the end of the chunk - subtract the chunk size and add the offset
+                        dA_cs_boundary = tl.load(
+                            dA_cs_ptr
+                            - (chunk_size - 1) * stride_dA_cs_csize
+                            + (c_off - 1) * stride_dA_cs_csize,
+                            mask=(c_off - 1) > -1 and c_off < chunk_size,
+                            other=0.0,
+                        )
+                        dA_cs -= dA_cs_boundary
+
+                # - increment logical chunk index for every physical chunk
+                logical_chunk_idx += 1
+            else:
+                scale_mask = seq_idx_chunk_end == prev_seq_idx_chunk_end
+            prev_seq_idx_chunk_end = seq_idx_chunk_end
+
+        scale = tl.where(scale_mask, tl.exp(dA_cs), 0.0)
+        states = scale * states + new_states
+        if c < nchunks - 1:
+            tl.store(out_ptrs, states, mask=offs_m < dim)
+        else:
+            tl.store(final_states_ptrs, states, mask=offs_m < dim)
+        states_ptrs += stride_states_chunk
+        dA_cs_ptr += stride_dA_cs_chunk
+        out_ptrs += stride_out_chunk
+
+
+def _state_passing_fwd(
+    states,
+    dA_cumsum,
+    initial_states=None,
+    seq_idx=None,
+    chunk_size=None,
+    out_dtype=None,
+    is_cont_batched=False,
+    chunk_offsets=None,
+):
+    batch, nchunks, nheads, dim = states.shape
+    if chunk_size is None:
+        chunk_size = dA_cumsum.shape[-1]
+    else:
+        assert chunk_size == dA_cumsum.shape[-1]
+    assert dA_cumsum.shape == (batch, nheads, nchunks, chunk_size)
+    if initial_states is not None:
+        if is_cont_batched:
+            # - if cu_seqlens is provided, then the initial states
+            #   are used for continuous batching. In which case we
+            #   require seq_idx to be provided
+            assert (
+                seq_idx is not None
+            ), "seq_idx must be provided for continuous batching"
+            # - we also need chunk_offsets to be provided, to account
+            #   for computation of dA_cumsum from the start of the
+            #   sequence
+            assert (
+                chunk_offsets is not None
+            ), "chunk_offsets must be provided for continuous batching"
+        else:
+            # - this is the regular batching case, where initial
+            #   states are used are for each example of the batch.
+            assert initial_states.shape == (batch, nheads, dim)
+
+    if seq_idx is not None:
+        seqlen = seq_idx.shape[-1]
+        assert seq_idx.shape == (batch, seqlen)
+    out_dtype = states.dtype if out_dtype is None else out_dtype
+    out = torch.empty(
+        (batch, nchunks, nheads, dim), device=states.device, dtype=out_dtype
+    )
+    final_states = torch.empty(
+        (batch, nheads, dim), device=states.device, dtype=torch.float32
+    )
+    grid = lambda META: (triton.cdiv(dim, META["BLOCK_SIZE"]), batch, nheads)
+    with torch.cuda.device(states.device.index):
+        _state_passing_fwd_kernel[grid](
+            states,
+            out,
+            final_states,
+            dA_cumsum,
+            initial_states,
+            seq_idx,
+            chunk_offsets,
+            len(chunk_offsets) if chunk_offsets is not None else 0,
+            dim,
+            nchunks,
+            seqlen if seq_idx is not None else 0,
+            chunk_size,
+            states.stride(0),
+            states.stride(1),
+            states.stride(2),
+            states.stride(3),
+            out.stride(0),
+            out.stride(1),
+            out.stride(2),
+            out.stride(3),
+            final_states.stride(0),
+            final_states.stride(1),
+            final_states.stride(2),
+            dA_cumsum.stride(0),
+            dA_cumsum.stride(2),
+            dA_cumsum.stride(1),
+            dA_cumsum.stride(3),
+            *(
+                (
+                    initial_states.stride(0),
+                    initial_states.stride(1),
+                    initial_states.stride(2),
+                )
+                if initial_states is not None
+                else (0, 0, 0)
+            ),
+            *(
+                (seq_idx.stride(0), seq_idx.stride(1))
+                if seq_idx is not None
+                else (0, 0)
+            ),
+            HAS_INITSTATES=initial_states is not None,
+            HAS_SEQ_IDX=seq_idx is not None,
+            IS_CONT_BATCHED=is_cont_batched,
+        )
+    return out, final_states
diff --git a/sglang/python/sglang/srt/layers/attention/mamba/ops/ssu_dispatch.py b/sglang/python/sglang/srt/layers/attention/mamba/ops/ssu_dispatch.py
new file mode 100644
index 0000000000000000000000000000000000000000..77d586eee3aeb5a4e69b570c955776d8560b17ad
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/mamba/ops/ssu_dispatch.py
@@ -0,0 +1,277 @@
+from __future__ import annotations
+
+import logging
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING
+
+import torch
+
+if TYPE_CHECKING:
+    from sglang.srt.server_args import ServerArgs
+
+logger = logging.getLogger(__name__)
+
+
+class MambaSSUBackend(ABC):
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        """Human-readable name used for logging."""
+
+    @abstractmethod
+    def __call__(
+        self,
+        state: torch.Tensor,
+        x: torch.Tensor,
+        dt: torch.Tensor,
+        A: torch.Tensor,
+        B: torch.Tensor,
+        C: torch.Tensor,
+        D: torch.Tensor | None = None,
+        z: torch.Tensor | None = None,
+        dt_bias: torch.Tensor | None = None,
+        dt_softplus: bool = False,
+        state_batch_indices: torch.Tensor | None = None,
+        pad_slot_id: int = -1,
+        out: torch.Tensor | None = None,
+        disable_state_update: bool = False,
+        intermediate_states_buffer: torch.Tensor | None = None,
+        cache_steps: int | None = None,
+        retrieve_parent_token: torch.Tensor | None = None,
+        intermediate_state_indices: torch.Tensor | None = None,
+    ) -> None: ...
+
+
+class TritonSSUBackend(MambaSSUBackend):
+    """Triton-based selective-state-update backend."""
+
+    def __init__(self) -> None:
+        from sglang.srt.layers.attention.mamba.ops.mamba_ssm import (
+            selective_state_update,
+        )
+
+        self._kernel = selective_state_update
+
+    @property
+    def name(self) -> str:
+        return "triton"
+
+    def __call__(
+        self,
+        state: torch.Tensor,
+        x: torch.Tensor,
+        dt: torch.Tensor,
+        A: torch.Tensor,
+        B: torch.Tensor,
+        C: torch.Tensor,
+        D: torch.Tensor | None = None,
+        z: torch.Tensor | None = None,
+        dt_bias: torch.Tensor | None = None,
+        dt_softplus: bool = False,
+        state_batch_indices: torch.Tensor | None = None,
+        pad_slot_id: int = -1,
+        out: torch.Tensor | None = None,
+        disable_state_update: bool = False,
+        intermediate_states_buffer: torch.Tensor | None = None,
+        cache_steps: int | None = None,
+        retrieve_parent_token: torch.Tensor | None = None,
+        intermediate_state_indices: torch.Tensor | None = None,
+    ) -> None:
+        self._kernel(
+            state,
+            x,
+            dt,
+            A,
+            B,
+            C,
+            D=D,
+            z=z,
+            dt_bias=dt_bias,
+            dt_softplus=dt_softplus,
+            state_batch_indices=state_batch_indices,
+            pad_slot_id=pad_slot_id,
+            out=out,
+            disable_state_update=disable_state_update,
+            intermediate_states_buffer=intermediate_states_buffer,
+            cache_steps=cache_steps,
+            retrieve_parent_token=retrieve_parent_token,
+            intermediate_state_indices=intermediate_state_indices,
+        )
+
+
+class FlashInferSSUBackend(MambaSSUBackend):
+    """FlashInfer-based selective-state-update backend."""
+
+    def __init__(self) -> None:
+        from flashinfer.mamba import selective_state_update
+
+        self._kernel = selective_state_update
+
+    @property
+    def name(self) -> str:
+        return "flashinfer"
+
+    def __call__(
+        self,
+        state: torch.Tensor,
+        x: torch.Tensor,
+        dt: torch.Tensor,
+        A: torch.Tensor,
+        B: torch.Tensor,
+        C: torch.Tensor,
+        D: torch.Tensor | None = None,
+        z: torch.Tensor | None = None,
+        dt_bias: torch.Tensor | None = None,
+        dt_softplus: bool = False,
+        state_batch_indices: torch.Tensor | None = None,
+        pad_slot_id: int = -1,
+        out: torch.Tensor | None = None,
+        disable_state_update: bool = False,
+        intermediate_states_buffer: torch.Tensor | None = None,
+        cache_steps: int | None = None,
+        retrieve_parent_token: torch.Tensor | None = None,
+        intermediate_state_indices: torch.Tensor | None = None,
+    ) -> None:
+        if retrieve_parent_token is not None:
+            raise ValueError(
+                "FlashInfer backend does not support retrieve_parent_token. "
+                "Use --mamba-backend triton for EAGLE tree attention."
+            )
+        # FlashInfer expects cache_steps as an int (0 when unused).
+        self._kernel(
+            state,
+            x,
+            dt,
+            A,
+            B,
+            C,
+            D=D,
+            z=z,
+            dt_bias=dt_bias,
+            dt_softplus=dt_softplus,
+            state_batch_indices=state_batch_indices,
+            pad_slot_id=pad_slot_id,
+            out=out,
+            disable_state_update=disable_state_update,
+            intermediate_states_buffer=intermediate_states_buffer,
+            cache_steps=0 if cache_steps is None else cache_steps,
+            intermediate_state_indices=intermediate_state_indices,
+        )
+
+
+_BACKEND_REGISTRY: dict[str, type[MambaSSUBackend]] = {
+    "triton": TritonSSUBackend,
+    "flashinfer": FlashInferSSUBackend,
+}
+
+_mamba_ssu_backend: MambaSSUBackend | None = None
+
+
+def initialize_mamba_selective_state_update_backend(server_args: ServerArgs) -> None:
+    """Instantiate the selective-state-update backend from server config.
+
+    This should be called once during scheduler initialization.
+
+    Args:
+        server_args: Server arguments containing ``mamba_backend`` setting.
+
+    Raises:
+        ValueError: If the requested backend is unavailable or cannot be imported.
+    """
+    global _mamba_ssu_backend
+
+    requested = server_args.mamba_backend or "triton"
+
+    backend_cls = _BACKEND_REGISTRY.get(requested)
+    if backend_cls is None:
+        raise ValueError(
+            f"Unknown mamba backend '{requested}'. "
+            f"Available backends: {list(_BACKEND_REGISTRY.keys())}"
+        )
+
+    try:
+        _mamba_ssu_backend = backend_cls()
+    except ImportError:
+        raise ValueError(
+            f"Mamba backend '{requested}' requested but its dependencies are not "
+            f"available. Install the required package or use a different "
+            f"--mamba-backend value."
+        )
+
+    logger.info(
+        "Mamba selective_state_update backend initialized: %s",
+        _mamba_ssu_backend.name,
+    )
+
+
+def selective_state_update(
+    state: torch.Tensor,
+    x: torch.Tensor,
+    dt: torch.Tensor,
+    A: torch.Tensor,
+    B: torch.Tensor,
+    C: torch.Tensor,
+    D: torch.Tensor | None = None,
+    z: torch.Tensor | None = None,
+    dt_bias: torch.Tensor | None = None,
+    dt_softplus: bool = False,
+    state_batch_indices: torch.Tensor | None = None,
+    pad_slot_id: int = -1,
+    out: torch.Tensor | None = None,
+    disable_state_update: bool = False,
+    intermediate_states_buffer: torch.Tensor | None = None,
+    cache_steps: int | None = None,
+    retrieve_parent_token: torch.Tensor | None = None,
+    intermediate_state_indices: torch.Tensor | None = None,
+) -> None:
+    """Dispatch selective-state-update to the configured backend.
+
+    This function provides a unified interface regardless of the underlying
+    backend. Backend-specific argument adaptation is handled inside each
+    :class:`MambaSSUBackend` subclass.
+
+    Args:
+        state: SSM state tensor (batch, nheads, dim, dstate)
+        x: Input tensor
+        dt: Delta time tensor
+        A: A matrix
+        B: B matrix
+        C: C matrix
+        D: Optional D vector
+        z: Optional z tensor for gating
+        dt_bias: Optional dt bias
+        dt_softplus: Whether to apply softplus to dt
+        state_batch_indices: Optional batch indices for state
+        out: Preallocated output tensor (in-place updated)
+        disable_state_update: If True, don't write back to state (for speculative verify)
+        intermediate_states_buffer: Buffer to cache intermediate states
+        cache_steps: Total number of steps in the buffer
+        retrieve_parent_token: (batch, T) tensor of parent token indices for EAGLE tree attention
+        intermediate_state_indices: (batch,) tensor of indices for intermediate_states_buffer operations.
+            If provided, uses these indices instead of state_batch_indices for the buffer.
+    """
+    assert _mamba_ssu_backend is not None, (
+        "Mamba selective_state_update backend not initialized. "
+        "Call initialize_mamba_selective_state_update_backend() first."
+    )
+
+    _mamba_ssu_backend(
+        state,
+        x,
+        dt,
+        A,
+        B,
+        C,
+        D=D,
+        z=z,
+        dt_bias=dt_bias,
+        dt_softplus=dt_softplus,
+        state_batch_indices=state_batch_indices,
+        pad_slot_id=pad_slot_id,
+        out=out,
+        disable_state_update=disable_state_update,
+        intermediate_states_buffer=intermediate_states_buffer,
+        cache_steps=cache_steps,
+        retrieve_parent_token=retrieve_parent_token,
+        intermediate_state_indices=intermediate_state_indices,
+    )
diff --git a/sglang/python/sglang/srt/layers/attention/merge_state.py b/sglang/python/sglang/srt/layers/attention/merge_state.py
new file mode 100644
index 0000000000000000000000000000000000000000..245418a9dfdf18312924ccf68fac942102eaccfa
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/merge_state.py
@@ -0,0 +1,46 @@
+from typing import Optional, Tuple
+
+import torch
+from sgl_kernel import merge_state_v2
+
+from sglang.srt.layers.attention.triton_ops.merge_state import merge_state_triton
+from sglang.srt.utils import is_cuda
+
+_is_cuda = is_cuda()
+
+
+# Automatically fallback to the Triton kernel in some cases
+# (e.g., for AMD GPUs, when the head dimension is not a multiple
+# of 4 or 8, and in FP8 precision)
+def _supported_dtypes(o: torch.Tensor) -> bool:
+    return o.dtype in [torch.float32, torch.half, torch.bfloat16]
+
+
+def _supported_headdim(o: torch.Tensor) -> bool:
+    headdim = o.shape[2]  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE]
+    if o.dtype == torch.float32:
+        return headdim % 4 == 0
+    return headdim % 8 == 0
+
+
+def merge_state(
+    prefix_output: torch.Tensor,
+    prefix_lse: torch.Tensor,
+    suffix_output: torch.Tensor,
+    suffix_lse: torch.Tensor,
+    output: Optional[torch.Tensor] = None,
+    output_lse: Optional[torch.Tensor] = None,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+    if (
+        _is_cuda
+        and _supported_dtypes(prefix_output)
+        and _supported_headdim(prefix_output)
+    ):
+        return merge_state_v2(
+            prefix_output, prefix_lse, suffix_output, suffix_lse, output, output_lse
+        )
+    else:
+        # Fallback to Triton kernel
+        return merge_state_triton(
+            prefix_output, prefix_lse, suffix_output, suffix_lse, output, output_lse
+        )
diff --git a/sglang/python/sglang/srt/layers/attention/nsa/__pycache__/dequant_k_cache.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/nsa/__pycache__/dequant_k_cache.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8af86cfd997efc0487d1f89c7ae252faf3b468a9
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/nsa/__pycache__/dequant_k_cache.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/nsa/__pycache__/index_buf_accessor.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/nsa/__pycache__/index_buf_accessor.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..25bb62f4d4dc4d4947924b9c4dd2ef57667920cc
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/nsa/__pycache__/index_buf_accessor.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/nsa/__pycache__/nsa_indexer.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/nsa/__pycache__/nsa_indexer.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..93d4861d025a75a6dc7775ad80029056572a9c5b
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/nsa/__pycache__/nsa_indexer.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/nsa/__pycache__/quant_k_cache.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/nsa/__pycache__/quant_k_cache.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fdc29f5c60b15425196ea8e3cbce44fa3330bf1c
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/nsa/__pycache__/quant_k_cache.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/nsa/__pycache__/utils.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/nsa/__pycache__/utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0f9853a7472812513f385aa10e27315404493f9d
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/nsa/__pycache__/utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/nsa/dequant_k_cache.py b/sglang/python/sglang/srt/layers/attention/nsa/dequant_k_cache.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c69d135bb2426c9e2ce11063684251a50aafcf0
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/nsa/dequant_k_cache.py
@@ -0,0 +1,289 @@
+import torch
+import triton
+import triton.language as tl
+
+
+def dequantize_k_cache(quant_k_cache):
+    return _dequantize_k_cache_fast_wrapped(quant_k_cache)
+
+
+def _dequantize_k_cache_ref(
+    quant_k_cache: torch.Tensor,  # (num_blocks, block_size, 1, bytes_per_token)
+    dv: int = 512,
+    tile_size: int = 128,
+    d: int = 576,
+) -> torch.Tensor:
+    """
+    De-quantize the k-cache
+    """
+    assert dv % tile_size == 0
+    original_ndim = quant_k_cache.ndim
+    if original_ndim == 3:
+        # set block_size = 1
+        quant_k_cache = quant_k_cache.unsqueeze(1)
+    num_tiles = dv // tile_size
+    num_blocks, block_size, h_k, _ = quant_k_cache.shape
+    assert h_k == 1
+    result = torch.empty(
+        (num_blocks, block_size, d), dtype=torch.bfloat16, device=quant_k_cache.device
+    )
+
+    quant_k_cache = quant_k_cache.view(num_blocks, block_size, -1)
+
+    input_nope = quant_k_cache[..., :dv]
+    input_scale = quant_k_cache[..., dv : dv + num_tiles * 4].view(torch.float32)
+    input_rope = quant_k_cache[..., dv + num_tiles * 4 :].view(torch.bfloat16)
+    result[..., dv:] = input_rope
+
+    for tile_idx in range(0, num_tiles):
+        cur_nope = input_nope[
+            ..., tile_idx * tile_size : (tile_idx + 1) * tile_size
+        ].to(torch.float32)
+        cur_scales = input_scale[..., tile_idx].unsqueeze(-1)
+        result[..., tile_idx * tile_size : (tile_idx + 1) * tile_size] = (
+            cur_nope * cur_scales
+        )
+
+    if original_ndim == 3:
+        return result.view(num_blocks, 1, -1)
+    else:
+        return result.view(num_blocks, block_size, 1, -1)
+
+
+def _dequantize_k_cache_fast_wrapped(
+    quant_k_cache: torch.Tensor,
+    dv: int = 512,
+    tile_size: int = 128,
+) -> torch.Tensor:
+    original_ndim = quant_k_cache.ndim
+    if original_ndim == 3:
+        # set block_size = 1
+        quant_k_cache = quant_k_cache.unsqueeze(1)
+    num_blocks, block_size, _, dim_quant = quant_k_cache.shape
+    assert dv == 512
+    assert dim_quant == 656
+    assert tile_size == 128
+    quant_k_cache = quant_k_cache.view((-1, dim_quant))
+
+    output = _dequantize_k_cache_fast(quant_k_cache)
+
+    if original_ndim == 3:
+        return output.view(num_blocks, 1, -1)
+    else:
+        return output.view(num_blocks, block_size, 1, -1)
+
+
+def _dequantize_k_cache_fast(quant_k_cache, group_size: int = 128):
+    num_tokens, dim_quant = quant_k_cache.shape
+
+    assert quant_k_cache.dtype == torch.float8_e4m3fn
+    dim_nope = 512
+    dim_rope = 64
+    num_tiles = dim_nope // group_size
+    assert dim_quant == 656
+
+    output = torch.empty(
+        (num_tokens, dim_nope + dim_rope),
+        dtype=torch.bfloat16,
+        device=quant_k_cache.device,
+    )
+
+    num_blocks_per_token = triton.cdiv(dim_nope + dim_rope, group_size)
+    assert num_blocks_per_token == 5
+
+    assert dim_nope % group_size == 0
+
+    input_nope_q = quant_k_cache[:, :dim_nope]
+    input_nope_s = quant_k_cache[:, dim_nope : dim_nope + num_tiles * 4].view(
+        torch.float32
+    )
+    input_rope = quant_k_cache[:, dim_nope + num_tiles * 4 :].view(torch.bfloat16)
+
+    _dequantize_k_cache_fast_kernel[(num_tokens, num_blocks_per_token)](
+        output,
+        input_nope_q,
+        input_nope_s,
+        input_rope,
+        output.stride(0),
+        input_nope_q.stride(0),
+        input_nope_s.stride(0),
+        input_rope.stride(0),
+        NUM_NOPE_BLOCKS=num_tiles,
+        GROUP_SIZE=group_size,
+        DIM_NOPE=dim_nope,
+        DIM_ROPE=dim_rope,
+    )
+
+    return output
+
+
+@triton.jit
+def _dequantize_k_cache_fast_kernel(
+    output_ptr,
+    input_nope_q_ptr,
+    input_nope_s_ptr,
+    input_rope_ptr,
+    output_stride_0: int,
+    input_nope_q_stride_0: int,
+    input_nope_s_stride_0: int,
+    input_rope_stride_0: int,
+    NUM_NOPE_BLOCKS: tl.constexpr,
+    GROUP_SIZE: tl.constexpr,
+    DIM_NOPE: tl.constexpr,
+    DIM_ROPE: tl.constexpr,
+):
+    token_id = tl.program_id(0)
+    raw_block_id = tl.program_id(1)
+
+    if raw_block_id < NUM_NOPE_BLOCKS:
+        # a. dequant nope
+        effective_block_id = raw_block_id
+
+        offs_q = effective_block_id * GROUP_SIZE + tl.arange(0, GROUP_SIZE)
+        mask = offs_q < DIM_NOPE
+        ptr_q = input_nope_q_ptr + token_id * input_nope_q_stride_0 + offs_q
+        ptr_s = input_nope_s_ptr + token_id * input_nope_s_stride_0 + effective_block_id
+
+        y_q = tl.load(ptr_q, mask=mask, other=0.0).to(tl.float32)
+        y_s = tl.load(ptr_s)
+
+        y = (y_q * y_s).to(output_ptr.dtype.element_ty)
+
+        dst_ptr = output_ptr + token_id * output_stride_0 + offs_q
+        tl.store(dst_ptr, y, mask=mask)
+    else:
+        # b. copy rope
+        effective_block_id = raw_block_id - NUM_NOPE_BLOCKS
+
+        offs = effective_block_id * GROUP_SIZE + tl.arange(0, GROUP_SIZE)
+        mask = offs < DIM_ROPE
+
+        src_ptr = input_rope_ptr + token_id * input_rope_stride_0 + offs
+        dst_ptr = output_ptr + token_id * output_stride_0 + DIM_NOPE + offs
+
+        data = tl.load(src_ptr, mask=mask).to(tl.bfloat16)
+        tl.store(dst_ptr, data, mask=mask)
+
+
+def dequantize_k_cache_paged(
+    quant_k_cache: torch.Tensor,
+    page_table_1_flattened: torch.Tensor,
+    group_size: int = 128,
+) -> torch.Tensor:
+    """
+    De-quantize the k-cache with paged layout
+    Args:
+        quant_k_cache: [total_num_tokens, 1, dim_quant] or [num_blocks, block_size, 1, dim_quant], the quantized k-cache in paged layout
+        page_table_1_flattened: [num_tokens], the flattened page_table_1 with the page indices in each requests concatenated together
+    Returns:
+        output: [num_tokens, 1, dim_nope + dim_rope], the de-quantized k-cache
+    """
+    dim_quant = quant_k_cache.shape[-1]
+    assert (
+        dim_quant == 656
+    ), f"dim_quant: {dim_quant} != 656 detected in dequantize_k_cache_paged"
+    quant_k_cache = quant_k_cache.view((-1, dim_quant))
+
+    # num_tokens can exceed kv_cache_size due to prefix sharing (multiple seqs share same KV slots)
+    # Index bounds validated in nsa_backend.init_forward_metadata
+    num_tokens = page_table_1_flattened.shape[0]
+    assert quant_k_cache.dtype == torch.float8_e4m3fn
+    dim_nope = 512
+    dim_rope = 64
+    num_tiles = dim_nope // group_size  # 512 // 128 = 4
+
+    output = torch.empty(
+        (num_tokens, 1, dim_nope + dim_rope),
+        dtype=torch.bfloat16,
+        device=quant_k_cache.device,
+    )
+
+    # cdiv(512 + 64, 128) = 5
+    num_blocks_per_token = triton.cdiv(dim_nope + dim_rope, group_size)
+    assert num_blocks_per_token == 5
+
+    assert dim_nope % group_size == 0
+
+    input_nope_q = quant_k_cache[:, :dim_nope]
+    # [:, 512:512+4*4] = [:, 512:528]
+    input_nope_s = quant_k_cache[:, dim_nope : dim_nope + num_tiles * 4].view(
+        torch.float32
+    )
+    # [:, 528:]
+    input_rope = quant_k_cache[:, dim_nope + num_tiles * 4 :].view(torch.bfloat16)
+
+    _dequantize_k_cache_paged_kernel[(num_tokens, num_blocks_per_token)](
+        output,
+        input_nope_q,
+        input_nope_s,
+        input_rope,
+        page_table_1_flattened,
+        output.stride(0),
+        input_nope_q.stride(0),
+        input_nope_s.stride(0),
+        input_rope.stride(0),
+        NUM_NOPE_BLOCKS=num_tiles,
+        GROUP_SIZE=group_size,
+        DIM_NOPE=dim_nope,
+        DIM_ROPE=dim_rope,
+    )
+
+    return output
+
+
+@triton.jit
+def _dequantize_k_cache_paged_kernel(
+    output_ptr,
+    input_nope_q_ptr,
+    input_nope_s_ptr,
+    input_rope_ptr,
+    page_table_1_ptr,
+    output_stride_0: int,
+    input_nope_q_stride_0: int,
+    input_nope_s_stride_0: int,
+    input_rope_stride_0: int,
+    NUM_NOPE_BLOCKS: tl.constexpr,
+    GROUP_SIZE: tl.constexpr,
+    DIM_NOPE: tl.constexpr,
+    DIM_ROPE: tl.constexpr,
+):
+    token_id = tl.program_id(0)
+    token_id_paged = tl.load(page_table_1_ptr + token_id).to(tl.int32)
+    raw_block_id = tl.program_id(1)
+
+    if raw_block_id < NUM_NOPE_BLOCKS:
+        # a. dequant nope
+        effective_block_id = raw_block_id
+
+        offs_q = effective_block_id * GROUP_SIZE + tl.arange(0, GROUP_SIZE)
+        mask = offs_q < DIM_NOPE
+        ptr_q = input_nope_q_ptr + token_id_paged * input_nope_q_stride_0 + offs_q
+        ptr_s = (
+            input_nope_s_ptr
+            + token_id_paged * input_nope_s_stride_0
+            + effective_block_id
+        )
+
+        y_q = tl.load(ptr_q, mask=mask, other=0.0).to(tl.float32)
+        y_s = tl.load(ptr_s)
+
+        y = (y_q * y_s).to(output_ptr.dtype.element_ty)
+
+        dst_ptr = output_ptr + token_id * output_stride_0 + offs_q
+        tl.store(dst_ptr, y, mask=mask)
+    else:
+        # b. copy rope
+        effective_block_id = raw_block_id - NUM_NOPE_BLOCKS
+
+        offs = effective_block_id * GROUP_SIZE + tl.arange(0, GROUP_SIZE)
+        mask = offs < DIM_ROPE
+
+        src_ptr = input_rope_ptr + token_id_paged * input_rope_stride_0 + offs
+        dst_ptr = output_ptr + token_id * output_stride_0 + DIM_NOPE + offs
+
+        data = tl.load(src_ptr, mask=mask).to(tl.bfloat16)
+        tl.store(dst_ptr, data, mask=mask)
+
+
+if __name__ == "__main__":
+    raise Exception("UT is in quant_k_cache.py")
diff --git a/sglang/python/sglang/srt/layers/attention/nsa/index_buf_accessor.py b/sglang/python/sglang/srt/layers/attention/nsa/index_buf_accessor.py
new file mode 100644
index 0000000000000000000000000000000000000000..1cdf65b91c295770a2d7384f3c7419cf0d2fa26f
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/nsa/index_buf_accessor.py
@@ -0,0 +1,698 @@
+from typing import TYPE_CHECKING
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
+from sglang.srt.utils import is_hip
+
+_is_hip = is_hip()
+_is_fp8_fnuz = is_fp8_fnuz()
+
+if TYPE_CHECKING:
+    from sglang.srt.mem_cache.memory_pool import NSATokenToKVPool
+
+"""
+k: data, 128 item per token, fp8
+s: scale, 1 item per token, fp32
+"""
+
+
+class GetK:
+    @classmethod
+    def execute(cls, *args, **kwargs):
+        return cls.triton(*args, **kwargs)
+
+    @classmethod
+    def slow(
+        cls, pool: "NSATokenToKVPool", buf, seq_len: int, page_indices: torch.Tensor
+    ):
+        num_pages = (seq_len + pool.page_size - 1) // pool.page_size
+        seq_len_ = num_pages * pool.page_size
+        index_k_fp8 = torch.empty(
+            (seq_len_, pool.index_head_dim),
+            dtype=torch.uint8,
+            device=pool.device,
+        )
+        for i in range(num_pages):
+            page_index = page_indices[i]
+            index_k_fp8[i * pool.page_size : (i + 1) * pool.page_size] = buf[
+                page_index
+            ][: pool.page_size * pool.index_head_dim].view(-1, pool.index_head_dim)
+
+        return index_k_fp8[:seq_len]
+
+    @classmethod
+    def torch_fast(
+        cls, pool: "NSATokenToKVPool", buf, seq_len: int, page_indices: torch.Tensor
+    ):
+        """
+        :param page_indices: (num_pages,), int32
+        :return: (seq_len, index_head_dim), uint8
+        """
+
+        # can handle per 128B instead of per element
+
+        # page_indices: (num_pages,), element := a page index
+        buf_numel_per_page = buf.shape[1]
+
+        num_k_bytes_per_page = pool.page_size * pool.index_head_dim
+        num_k_bytes_per_token = pool.index_head_dim
+
+        # buf: (num_pages, page_size 64 * head_dim 128 + page_size 64 * fp32_nbytes 4), uint8
+        # flat_buf: (whatever,), uint8
+        flat_buf = buf.flatten()
+
+        # flat_indices: (num_pages, num_k_bytes_per_page), int32, element := an index into flat_buf that we want to access
+        flat_indices = (page_indices * buf_numel_per_page)[:, None] + torch.arange(
+            num_k_bytes_per_page, dtype=torch.int32, device="cuda"
+        )[None, :]
+        flat_indices = flat_indices.flatten()[: seq_len * num_k_bytes_per_token]
+
+        out = flat_buf[flat_indices]
+        return out.view(-1, 128)
+
+    @classmethod
+    def triton(
+        cls, pool: "NSATokenToKVPool", buf, seq_len: int, page_indices: torch.Tensor
+    ):
+        """
+        Triton implementation for gathering K data from paged buffer.
+        :param page_indices: (num_pages,), int32/int64
+        :return: (seq_len, index_head_dim), uint8
+        """
+        return _get_k_triton(
+            buf=buf,
+            page_indices=page_indices,
+            seq_len=seq_len,
+            page_size=pool.page_size,
+            index_head_dim=pool.index_head_dim,
+        )
+
+
+class GetS:
+    @classmethod
+    def execute(cls, *args, **kwargs):
+        return cls.triton(*args, **kwargs)
+
+    @classmethod
+    def slow(
+        cls, pool: "NSATokenToKVPool", buf, seq_len: int, page_indices: torch.Tensor
+    ):
+        num_pages = (seq_len + pool.page_size - 1) // pool.page_size
+        seq_len_ = num_pages * pool.page_size
+        assert pool.index_head_dim // pool.quant_block_size == 1
+        index_k_scale_fp8 = torch.empty(
+            (seq_len_, 4),
+            dtype=torch.uint8,
+            device=pool.device,
+        )
+        for i in range(num_pages):
+            page_index = page_indices[i]
+            index_k_scale_fp8[i * pool.page_size : (i + 1) * pool.page_size] = buf[
+                page_index
+            ][pool.page_size * pool.index_head_dim :].view(-1, 4)
+        return index_k_scale_fp8[:seq_len]
+
+    @classmethod
+    def torch_fast(
+        cls, pool: "NSATokenToKVPool", buf, seq_len: int, page_indices: torch.Tensor
+    ):
+        """
+        :param page_indices: (num_pages,), int32
+        :return: (seq_len, index_head_dim // quant_block_size), uint8
+        """
+        buf_numel_per_page = buf.shape[1]
+
+        num_s_bytes_per_page = buf.shape[1] - pool.page_size * pool.index_head_dim
+        num_s_bytes_per_token = pool.index_head_dim // pool.quant_block_size * 4
+        s_offset_in_page = pool.page_size * pool.index_head_dim
+
+        flat_buf = buf.flatten()
+        flat_indices = (
+            (page_indices * buf_numel_per_page)[:, None]
+            + torch.arange(num_s_bytes_per_page, dtype=torch.int32, device="cuda")[
+                None, :
+            ]
+            + s_offset_in_page
+        )
+        flat_indices = flat_indices.flatten()[: seq_len * num_s_bytes_per_token]
+
+        out = flat_buf[flat_indices]
+        return out.view(-1, 4)
+
+    @classmethod
+    def triton(
+        cls, pool: "NSATokenToKVPool", buf, seq_len: int, page_indices: torch.Tensor
+    ):
+        """
+        Triton implementation for gathering S (scale) data from paged buffer.
+        :param page_indices: (num_pages,), int32/int64
+        :return: (seq_len, 4), uint8
+        """
+        return _get_s_triton(
+            buf=buf,
+            page_indices=page_indices,
+            seq_len=seq_len,
+            page_size=pool.page_size,
+            index_head_dim=pool.index_head_dim,
+        )
+
+
+class GetKAndS:
+    @classmethod
+    def execute(cls, *args, **kwargs):
+        return cls.triton(*args, **kwargs)
+
+    @classmethod
+    def triton(
+        cls, pool: "NSATokenToKVPool", buf, seq_len: int, page_indices: torch.Tensor
+    ):
+        """
+        Triton implementation for gathering both K and S data from paged buffer in a single call.
+        :param page_indices: (num_pages,), int32/int64
+        :return: tuple of (k_fp8, k_scale) where
+                 k_fp8: (seq_len, index_head_dim), uint8
+                 k_scale: (seq_len, 4), uint8
+        """
+        return _get_k_and_s_triton(
+            buf=buf,
+            page_indices=page_indices,
+            seq_len=seq_len,
+            page_size=pool.page_size,
+            index_head_dim=pool.index_head_dim,
+        )
+
+
+class SetK:
+    @classmethod
+    def execute(cls, *args, buf, **kwargs):
+        return cls.torch_fast(*args, **kwargs, buf=buf)
+
+    @classmethod
+    def slow(
+        cls,
+        pool: "NSATokenToKVPool",
+        buf: torch.Tensor,
+        loc: torch.Tensor,
+        index_k: torch.Tensor,
+    ):
+        for i in range(len(loc)):
+            page_index = loc[i] // pool.page_size
+            offset = loc[i] % pool.page_size
+            buf[
+                page_index,
+                offset * pool.index_head_dim : (offset + 1) * pool.index_head_dim,
+            ] = index_k[i].view(torch.uint8)
+
+    @classmethod
+    def torch_fast(
+        cls,
+        pool: "NSATokenToKVPool",
+        buf: torch.Tensor,
+        loc: torch.Tensor,
+        index_k: torch.Tensor,
+    ):
+        (num_tokens_to_write,) = loc.shape
+        buf_numel_per_page = buf.shape[1]
+        num_k_bytes_per_token = pool.index_head_dim
+
+        # loc: (num_tokens_to_write,), int32, element := the token index to write to
+        loc_page_index = loc // pool.page_size
+        loc_token_offset_in_page = loc % pool.page_size
+
+        flat_buf = buf.flatten()
+        flat_indices = (
+            (loc_page_index * buf_numel_per_page)[:, None]
+            + (loc_token_offset_in_page * num_k_bytes_per_token)[:, None]
+            + torch.arange(num_k_bytes_per_token, dtype=torch.int32, device="cuda")[
+                None, :
+            ]
+        )
+        num_k_bytes_total = num_tokens_to_write * num_k_bytes_per_token
+        flat_indices = flat_indices.flatten()[:num_k_bytes_total]
+        flat_buf[flat_indices] = index_k.view(torch.uint8).flatten()
+
+
+class SetS:
+    @classmethod
+    def execute(cls, *args, buf, **kwargs):
+        return cls.torch_fast(*args, **kwargs, buf=buf)
+
+    @classmethod
+    def slow(
+        cls,
+        pool: "NSATokenToKVPool",
+        buf: torch.Tensor,
+        loc: torch.Tensor,
+        index_k_scale: torch.Tensor,
+    ):
+        for i in range(len(loc)):
+            page_index = loc[i] // pool.page_size
+            offset = loc[i] % pool.page_size
+            start = pool.page_size * pool.index_head_dim
+            buf[page_index, start + offset * 4 : start + (offset + 1) * 4] = (
+                index_k_scale[i].view(torch.uint8)
+            )
+
+    @classmethod
+    def torch_fast(
+        cls,
+        pool: "NSATokenToKVPool",
+        buf: torch.Tensor,
+        loc: torch.Tensor,
+        index_k_scale: torch.Tensor,
+    ):
+        (num_tokens_to_write,) = loc.shape
+        buf_numel_per_page = buf.shape[1]
+        num_s_bytes_per_token = 4
+        s_offset_in_page = pool.page_size * pool.index_head_dim
+
+        # loc: (num_tokens_to_write,), int32, element := the token index to write to
+        loc_page_index = loc // pool.page_size
+        loc_token_offset_in_page = loc % pool.page_size
+
+        flat_buf = buf.flatten()
+        flat_indices = (
+            (loc_page_index * buf_numel_per_page)[:, None]
+            + s_offset_in_page
+            + (loc_token_offset_in_page * num_s_bytes_per_token)[:, None]
+            + torch.arange(num_s_bytes_per_token, dtype=torch.int32, device="cuda")[
+                None, :
+            ]
+        )
+        number_s_bytes_total = num_tokens_to_write * num_s_bytes_per_token
+        flat_indices = flat_indices.flatten()[:number_s_bytes_total]
+        flat_buf[flat_indices] = index_k_scale.view(torch.uint8).flatten()
+
+
+class SetKAndS:
+    @classmethod
+    def execute(cls, *args, buf, **kwargs):
+        if 0:
+            # print("SetK, SetS comparison test")
+            buf_cloned = buf.clone()
+            cls.vanilla(*args, **kwargs, buf=buf)
+            cls.triton(*args, **kwargs, buf=buf_cloned)
+
+            def _clear_token_0(target):
+                target[0, :128] = target[0, 64 * 128 : 64 * 128 + 4] = 0
+
+            _clear_token_0(buf)
+            _clear_token_0(buf_cloned)
+
+            assert torch.all(
+                buf == buf_cloned
+            ), f"{buf=} {buf_cloned=} {kwargs['loc'].to_list()=}"
+            return
+
+        cls.triton(*args, **kwargs, buf=buf)
+
+    @classmethod
+    def vanilla(cls, pool, buf, loc, index_k, index_k_scale):
+        SetK.execute(pool=pool, buf=buf, loc=loc, index_k=index_k)
+        SetS.execute(pool=pool, buf=buf, loc=loc, index_k_scale=index_k_scale)
+
+    @classmethod
+    def triton(cls, pool, buf, loc, index_k, index_k_scale):
+        _set_k_and_s_triton(
+            buf=buf,
+            loc=loc,
+            index_k=index_k,
+            index_k_scale=index_k_scale,
+            page_size=pool.page_size,
+        )
+
+
+def _set_k_and_s_triton(
+    buf: torch.Tensor,
+    loc: torch.Tensor,
+    index_k: torch.Tensor,
+    index_k_scale: torch.Tensor,
+    page_size: int,
+):
+    """
+    :param buf: (num_pages, page_size 64 * (128B data + 4B scale)), uint8
+    :param loc: (num_tokens_to_write,), int, element := the token index to write to
+    :param index_k: (num_tokens_to_write, 128 elem), fp8
+    :param index_k_scale: (num_tokens_to_write, 1 elem), fp32
+    :return:
+    """
+    num_pages, buf_numel_per_page = buf.shape
+    (num_tokens_to_write,) = loc.shape
+    num_tokens_to_write_, index_head_dim = index_k.shape
+
+    # Handle both 1D (num_tokens,) and 2D (num_tokens, 1) shapes for index_k_scale
+    if index_k_scale.ndim == 1:
+        num_tokens_to_write__ = index_k_scale.shape[0]
+        scale_dim = 1
+    elif index_k_scale.ndim == 2:
+        num_tokens_to_write__, scale_dim = index_k_scale.shape
+    else:
+        raise ValueError(
+            f"index_k_scale must be 1D or 2D, got shape {index_k_scale.shape}"
+        )
+    if _is_hip:
+        assert buf_numel_per_page == 1 * (128 + 4)
+    else:
+        assert buf_numel_per_page == 64 * (128 + 4)
+    assert num_tokens_to_write == num_tokens_to_write_ == num_tokens_to_write__
+    assert index_head_dim == 128
+    assert scale_dim == 1
+    if _is_hip:
+        assert page_size == 1
+    else:
+        assert page_size == 64
+
+    assert buf.dtype == torch.uint8
+    assert loc.dtype == torch.int64, f"{loc.dtype=}"  # can be int32
+    if _is_fp8_fnuz:
+        assert index_k.dtype == torch.float8_e4m3fnuz
+    else:
+        assert index_k.dtype == torch.float8_e4m3fn
+    assert index_k_scale.dtype == torch.float32
+
+    assert buf.is_contiguous()
+    assert loc.is_contiguous()
+    assert index_k.is_contiguous()
+    assert index_k_scale.is_contiguous()
+
+    if _is_fp8_fnuz:
+        buf_fp8 = buf.view(torch.float8_e4m3fnuz)
+    else:
+        buf_fp8 = buf.view(torch.float8_e4m3fn)
+    buf_fp32 = buf.view(torch.float32)
+
+    _set_k_and_s_triton_kernel[(num_tokens_to_write,)](
+        buf_fp8,
+        buf_fp32,
+        loc,
+        index_k,
+        index_k_scale,
+        index_k.stride(0),
+        PAGE_SIZE=page_size,
+        BUF_NUMEL_PER_PAGE=buf_numel_per_page,
+        NUM_K_ELEMS_PER_TOKEN=index_head_dim,
+        S_OFFSET_NBYTES_IN_PAGE=page_size * index_head_dim,
+    )
+
+
+@triton.jit
+def _set_k_and_s_triton_kernel(
+    buf_fp8_ptr,
+    buf_fp32_ptr,
+    loc_ptr,
+    index_k_ptr,
+    index_k_scale_ptr,
+    index_k_ptr_stride_0,
+    PAGE_SIZE: tl.constexpr,
+    BUF_NUMEL_PER_PAGE: tl.constexpr,
+    NUM_K_ELEMS_PER_TOKEN: tl.constexpr,
+    S_OFFSET_NBYTES_IN_PAGE: tl.constexpr,
+):
+    token_id = tl.program_id(0)
+
+    loc = tl.load(loc_ptr + token_id)
+
+    in_k_offsets = token_id * index_k_ptr_stride_0 + tl.arange(0, NUM_K_ELEMS_PER_TOKEN)
+
+    # no need for `mask`, since we read 128B for k and 4B for scale, both pow of 2
+    k = tl.load(index_k_ptr + in_k_offsets)
+    k_scale = tl.load(index_k_scale_ptr + token_id)
+
+    loc_page_index = loc // PAGE_SIZE
+    loc_token_offset_in_page = loc % PAGE_SIZE
+
+    out_k_offsets = (
+        loc_page_index * BUF_NUMEL_PER_PAGE
+        + loc_token_offset_in_page * NUM_K_ELEMS_PER_TOKEN
+        + tl.arange(0, NUM_K_ELEMS_PER_TOKEN)
+    )
+
+    # "//4" b/c it is fp32 instead of uint8
+    out_s_offset = (
+        loc_page_index * BUF_NUMEL_PER_PAGE // 4
+        + S_OFFSET_NBYTES_IN_PAGE // 4
+        + loc_token_offset_in_page
+    )
+
+    tl.store(buf_fp8_ptr + out_k_offsets, k)
+    tl.store(buf_fp32_ptr + out_s_offset, k_scale)
+
+
+def _get_k_triton(
+    buf: torch.Tensor,
+    page_indices: torch.Tensor,
+    seq_len: int,
+    page_size: int,
+    index_head_dim: int,
+):
+    """
+    Gather K (key) data from paged buffer using Triton.
+
+    :param buf: (num_pages, page_size * 128 + page_size * 4), uint8
+    :param page_indices: (num_pages,), int32/int64
+    :param seq_len: int, number of tokens to gather
+    :param page_size: int, typically 64
+    :param index_head_dim: int, typically 128
+    :return: (seq_len, index_head_dim), uint8
+    """
+    num_pages, buf_numel_per_page = buf.shape
+
+    # Allocate output
+    out = torch.empty((seq_len, index_head_dim), dtype=torch.uint8, device=buf.device)
+
+    # Launch kernel with one thread per token
+    grid = (seq_len,)
+    _get_k_triton_kernel[grid](
+        buf,
+        page_indices,
+        out,
+        seq_len,
+        page_size,
+        buf_numel_per_page,
+        index_head_dim,
+        BLOCK_SIZE=128,
+    )
+
+    return out
+
+
+@triton.jit
+def _get_k_triton_kernel(
+    buf_ptr,
+    page_indices_ptr,
+    out_ptr,
+    seq_len: tl.constexpr,
+    page_size: tl.constexpr,
+    buf_numel_per_page: tl.constexpr,
+    index_head_dim: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    """
+    Each program handles one token (seq_len tokens total).
+    Loads 128 bytes from the appropriate page.
+    """
+    token_id = tl.program_id(0)
+
+    # Calculate which page and offset within page
+    page_idx = token_id // page_size
+    token_offset_in_page = token_id % page_size
+
+    # Load the page index from page_indices
+    page_index = tl.load(page_indices_ptr + page_idx)
+
+    # Calculate source offset in buf
+    # buf[page_index, token_offset_in_page * index_head_dim : ...]
+    src_base_offset = (
+        page_index * buf_numel_per_page + token_offset_in_page * index_head_dim
+    )
+
+    # Load 128 bytes (index_head_dim elements)
+    offsets = tl.arange(0, BLOCK_SIZE)
+    mask = offsets < index_head_dim
+    data = tl.load(buf_ptr + src_base_offset + offsets, mask=mask)
+
+    # Store to output
+    dst_offset = token_id * index_head_dim
+    tl.store(out_ptr + dst_offset + offsets, data, mask=mask)
+
+
+def _get_s_triton(
+    buf: torch.Tensor,
+    page_indices: torch.Tensor,
+    seq_len: int,
+    page_size: int,
+    index_head_dim: int,
+):
+    """
+    Gather S (scale) data from paged buffer using Triton.
+
+    :param buf: (num_pages, page_size * 128 + page_size * 4), uint8
+    :param page_indices: (num_pages,), int32/int64
+    :param seq_len: int, number of tokens to gather
+    :param page_size: int, typically 64
+    :param index_head_dim: int, typically 128
+    :return: (seq_len, 4), uint8 (representing fp32 scale)
+    """
+    num_pages, buf_numel_per_page = buf.shape
+    s_offset_in_page = page_size * index_head_dim  # Scales start after K data
+
+    # Allocate output
+    out = torch.empty((seq_len, 4), dtype=torch.uint8, device=buf.device)
+
+    # Launch kernel with one thread per token
+    grid = (seq_len,)
+    _get_s_triton_kernel[grid](
+        buf,
+        page_indices,
+        out,
+        seq_len,
+        page_size,
+        buf_numel_per_page,
+        s_offset_in_page,
+    )
+
+    return out
+
+
+@triton.jit
+def _get_s_triton_kernel(
+    buf_ptr,
+    page_indices_ptr,
+    out_ptr,
+    seq_len: tl.constexpr,
+    page_size: tl.constexpr,
+    buf_numel_per_page: tl.constexpr,
+    s_offset_in_page: tl.constexpr,
+):
+    """
+    Each program handles one token (seq_len tokens total).
+    Loads 4 bytes (fp32 scale) from the appropriate page.
+    """
+    token_id = tl.program_id(0)
+
+    # Calculate which page and offset within page
+    page_idx = token_id // page_size
+    token_offset_in_page = token_id % page_size
+
+    # Load the page index from page_indices
+    page_index = tl.load(page_indices_ptr + page_idx)
+
+    # Calculate source offset in buf
+    # Scales are stored after K data: page_size * index_head_dim offset
+    # buf[page_index, s_offset_in_page + token_offset_in_page * 4 : ...]
+    src_base_offset = (
+        page_index * buf_numel_per_page + s_offset_in_page + token_offset_in_page * 4
+    )
+
+    # Load 4 bytes (fp32 scale)
+    offsets = tl.arange(0, 4)
+    data = tl.load(buf_ptr + src_base_offset + offsets)
+
+    # Store to output
+    dst_offset = token_id * 4
+    tl.store(out_ptr + dst_offset + offsets, data)
+
+
+def _get_k_and_s_triton(
+    buf: torch.Tensor,
+    page_indices: torch.Tensor,
+    seq_len: int,
+    page_size: int,
+    index_head_dim: int,
+):
+    """
+    Fused gather of both K (key) and S (scale) data from paged buffer using Triton.
+    This is more efficient than calling GetK and GetS separately.
+
+    :param buf: (num_pages, page_size * 128 + page_size * 4), uint8
+    :param page_indices: (num_pages,), int32/int64
+    :param seq_len: int, number of tokens to gather
+    :param page_size: int, typically 64
+    :param index_head_dim: int, typically 128
+    :return: tuple of (k_out, s_out) where
+             k_out: (seq_len, index_head_dim), uint8
+             s_out: (seq_len, 4), uint8
+    """
+    num_pages, buf_numel_per_page = buf.shape
+    s_offset_in_page = page_size * index_head_dim  # Scales start after K data
+
+    # Allocate outputs
+    k_out = torch.empty((seq_len, index_head_dim), dtype=torch.uint8, device=buf.device)
+    s_out = torch.empty((seq_len, 4), dtype=torch.uint8, device=buf.device)
+
+    # Launch kernel with one thread per token
+    grid = (seq_len,)
+    _get_k_and_s_triton_kernel[grid](
+        buf,
+        page_indices,
+        k_out,
+        s_out,
+        seq_len,
+        page_size,
+        buf_numel_per_page,
+        index_head_dim,
+        s_offset_in_page,
+        BLOCK_SIZE_K=128,
+    )
+
+    return k_out, s_out
+
+
+@triton.jit
+def _get_k_and_s_triton_kernel(
+    buf_ptr,
+    page_indices_ptr,
+    k_out_ptr,
+    s_out_ptr,
+    seq_len: tl.constexpr,
+    page_size: tl.constexpr,
+    buf_numel_per_page: tl.constexpr,
+    index_head_dim: tl.constexpr,
+    s_offset_in_page: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+):
+    """
+    Fused kernel that gathers both K and S data in a single pass.
+    Each program handles one token (seq_len tokens total).
+    Loads 128 bytes (K) + 4 bytes (S) from the appropriate page.
+    """
+    token_id = tl.program_id(0)
+
+    # Calculate which page and offset within page
+    page_idx = token_id // page_size
+    token_offset_in_page = token_id % page_size
+
+    # Load the page index from page_indices
+    page_index = tl.load(page_indices_ptr + page_idx)
+
+    # ===== Load K data (128 bytes) =====
+    # Calculate source offset for K in buf
+    k_src_base_offset = (
+        page_index * buf_numel_per_page + token_offset_in_page * index_head_dim
+    )
+
+    # Load 128 bytes (index_head_dim elements)
+    k_offsets = tl.arange(0, BLOCK_SIZE_K)
+    k_mask = k_offsets < index_head_dim
+    k_data = tl.load(buf_ptr + k_src_base_offset + k_offsets, mask=k_mask)
+
+    # Store K to output
+    k_dst_offset = token_id * index_head_dim
+    tl.store(k_out_ptr + k_dst_offset + k_offsets, k_data, mask=k_mask)
+
+    # ===== Load S data (4 bytes) =====
+    # Calculate source offset for S in buf
+    s_src_base_offset = (
+        page_index * buf_numel_per_page + s_offset_in_page + token_offset_in_page * 4
+    )
+
+    # Load 4 bytes (fp32 scale)
+    s_offsets = tl.arange(0, 4)
+    s_data = tl.load(buf_ptr + s_src_base_offset + s_offsets)
+
+    # Store S to output
+    s_dst_offset = token_id * 4
+    tl.store(s_out_ptr + s_dst_offset + s_offsets, s_data)
diff --git a/sglang/python/sglang/srt/layers/attention/nsa/nsa_backend_mtp_precompute.py b/sglang/python/sglang/srt/layers/attention/nsa/nsa_backend_mtp_precompute.py
new file mode 100644
index 0000000000000000000000000000000000000000..846d276e46bf0a26a871e72fea60ceebb8d29f10
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/nsa/nsa_backend_mtp_precompute.py
@@ -0,0 +1,324 @@
+"""Multi-step precompute utilities for Native Sparse Attention backend.
+
+This module provides optimization utilities for multi-step speculative decoding
+by precomputing shared metadata once and copying it to multiple backend instances.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Optional
+
+import torch
+
+from sglang.srt.layers.attention.nsa.utils import compute_nsa_seqlens
+
+if TYPE_CHECKING:
+    from sglang.srt.model_executor.forward_batch_info import ForwardMode
+    from sglang.srt.speculative.spec_info import SpecInput
+
+
+@dataclass
+class PrecomputedMetadata:
+    """Precomputed metadata shared across multiple backend instances.
+
+    Used for multi-step speculative decoding where multiple backends
+    need identical metadata. Precomputing once and copying N times
+    is much faster than computing N times.
+
+    """
+
+    # Basic seqlens
+    cache_seqlens: torch.Tensor  # int32, [bs]
+    cu_seqlens_k: torch.Tensor  # int32, [bs+1]
+
+    # Page table
+    page_indices: torch.Tensor  # int32, [bs, max_len] or [expanded_bs, max_len]
+    real_page_table: Optional[torch.Tensor]  # int32, transformed version
+
+    # NSA seqlens
+    seqlens_expanded: torch.Tensor  # int32, [expanded_size]
+    nsa_cache_seqlens: torch.Tensor  # int32, [expanded_size]
+    nsa_cu_seqlens_k: torch.Tensor  # int32, [expanded_size+1]
+    seqlens_expanded_size: int
+
+    # Dimensions
+    max_len: int  # for decode/draft_extend
+    max_seqlen_k: int  # for target_verify
+
+    # FlashMLA (optional)
+    flashmla_metadata: Optional[torch.Tensor] = None
+
+
+def compute_cu_seqlens(seqlens: torch.Tensor) -> torch.Tensor:
+    """Compute cumulative sequence lengths with padding."""
+    assert seqlens.dtype == torch.int32
+    return torch.nn.functional.pad(
+        torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0)
+    )
+
+
+class NativeSparseAttnBackendMTPPrecomputeMixin:
+    """Mixin class providing metadata precomputation for multi-step speculative decoding.
+
+    This mixin provides the _precompute_replay_metadata method and its helpers,
+    which are used to optimize CUDA graph replay in multi-step scenarios.
+    """
+
+    def _precompute_replay_metadata(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_cpu: torch.Tensor,
+        forward_mode: "ForwardMode",
+        spec_info: Optional["SpecInput"],
+    ) -> PrecomputedMetadata:
+        """Precompute all shared metadata for multi-step backends.
+
+        This function extracts and computes all operations that are
+        identical across different backend instances in multi-step
+        speculative decoding.
+
+        Args:
+            bs: Batch size
+            req_pool_indices: Request pool indices [bs]
+            seq_lens: Sequence lengths [bs]
+            seq_lens_cpu: Sequence lengths on CPU [bs]
+            forward_mode: Forward mode (decode/target_verify/draft_extend)
+            spec_info: Speculative decoding info (for draft_extend mode)
+
+        Returns:
+            PrecomputedMetadata containing all shared intermediate results
+        """
+        # Slice inputs to batch size
+        seq_lens = seq_lens[:bs]
+        seq_lens_cpu = seq_lens_cpu[:bs]
+        req_pool_indices = req_pool_indices[:bs]
+
+        # Dispatch to mode-specific precomputation
+        if forward_mode.is_decode_or_idle():
+            return self._precompute_decode_mode(
+                bs, req_pool_indices, seq_lens, seq_lens_cpu
+            )
+        elif forward_mode.is_target_verify():
+            return self._precompute_target_verify_mode(
+                bs, req_pool_indices, seq_lens, seq_lens_cpu
+            )
+        elif forward_mode.is_draft_extend():
+            return self._precompute_draft_extend_mode(
+                bs, req_pool_indices, seq_lens, seq_lens_cpu, spec_info
+            )
+        else:
+            raise ValueError(f"Unsupported forward mode: {forward_mode}")
+
+    def _precompute_decode_mode(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_cpu: torch.Tensor,
+    ) -> PrecomputedMetadata:
+        """Precompute metadata for normal decode mode."""
+        max_len = int(seq_lens_cpu.max().item())
+
+        # Convert to int32 and compute cumsum
+        cache_seqlens = seq_lens.to(torch.int32)
+        cu_seqlens_k = compute_cu_seqlens(cache_seqlens)
+
+        # Get page indices from cache
+        page_indices = self.req_to_token[req_pool_indices, :max_len].contiguous()
+
+        # Compute NSA seqlens
+        nsa_cache_seqlens = compute_nsa_seqlens(
+            cache_seqlens, nsa_index_topk=self.nsa_index_topk
+        )
+        seqlens_expanded = cache_seqlens
+        seqlens_expanded_size = seqlens_expanded.shape[0]
+
+        # Compute NSA cumsum
+        nsa_cu_seqlens_k = compute_cu_seqlens(nsa_cache_seqlens)
+
+        # Transform page table if needed
+        if self.real_page_size > 1:
+            real_page_table = self._transform_table_1_to_real(page_indices)
+        else:
+            real_page_table = None  # Will use page_indices directly
+
+        # Compute FlashMLA metadata if needed
+        flashmla_metadata = None
+        if self.nsa_decode_impl == "flashmla_kv":
+            flashmla_metadata = self._compute_flashmla_metadata(
+                cache_seqlens=nsa_cache_seqlens,
+                seq_len_q=1,
+            )
+
+        return PrecomputedMetadata(
+            cache_seqlens=cache_seqlens,
+            cu_seqlens_k=cu_seqlens_k,
+            page_indices=page_indices,
+            real_page_table=real_page_table,
+            seqlens_expanded=seqlens_expanded,
+            nsa_cache_seqlens=nsa_cache_seqlens,
+            nsa_cu_seqlens_k=nsa_cu_seqlens_k,
+            seqlens_expanded_size=seqlens_expanded_size,
+            max_len=max_len,
+            max_seqlen_k=max_len,
+            flashmla_metadata=flashmla_metadata,
+        )
+
+    def _precompute_target_verify_mode(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_cpu: torch.Tensor,
+    ) -> PrecomputedMetadata:
+        """Precompute metadata for target verify mode."""
+        max_seqlen_k = int(
+            seq_lens_cpu.max().item() + self.speculative_num_draft_tokens
+        )
+
+        # Cache seqlens with draft tokens
+        cache_seqlens = (seq_lens + self.speculative_num_draft_tokens).to(torch.int32)
+        cu_seqlens_k = compute_cu_seqlens(cache_seqlens)
+
+        # Page indices (repeated for each draft token)
+        page_indices = self.req_to_token[req_pool_indices, :max_seqlen_k]
+        page_indices = torch.repeat_interleave(
+            page_indices, repeats=self.speculative_num_draft_tokens, dim=0
+        ).contiguous()
+
+        # Generate expanded seqlens
+        extend_seq_lens_cpu = [self.speculative_num_draft_tokens] * bs
+        seqlens_int32_cpu = [
+            self.speculative_num_draft_tokens + kv_len
+            for kv_len in seq_lens_cpu.tolist()
+        ]
+        seqlens_expanded = torch.cat(
+            [
+                torch.arange(
+                    kv_len - qo_len + 1,
+                    kv_len + 1,
+                    dtype=torch.int32,
+                    device=self.device,
+                )
+                for qo_len, kv_len in zip(
+                    extend_seq_lens_cpu,
+                    seqlens_int32_cpu,
+                    strict=True,
+                )
+            ]
+        )
+
+        # Compute NSA seqlens
+        nsa_cache_seqlens = compute_nsa_seqlens(seqlens_expanded, self.nsa_index_topk)
+        seqlens_expanded_size = seqlens_expanded.shape[0]
+
+        # NSA cumsum
+        nsa_cu_seqlens_k = compute_cu_seqlens(nsa_cache_seqlens)
+
+        # Transform page table
+        if self.real_page_size > 1:
+            real_page_table = self._transform_table_1_to_real(page_indices)
+        else:
+            real_page_table = None
+
+        # FlashMLA metadata
+        flashmla_metadata = None
+        if self.nsa_decode_impl == "flashmla_kv":
+            flashmla_metadata = self._compute_flashmla_metadata(
+                cache_seqlens=nsa_cache_seqlens,
+                seq_len_q=1,
+            )
+
+        return PrecomputedMetadata(
+            cache_seqlens=cache_seqlens,
+            cu_seqlens_k=cu_seqlens_k,
+            page_indices=page_indices,
+            real_page_table=real_page_table,
+            seqlens_expanded=seqlens_expanded,
+            nsa_cache_seqlens=nsa_cache_seqlens,
+            nsa_cu_seqlens_k=nsa_cu_seqlens_k,
+            seqlens_expanded_size=seqlens_expanded_size,
+            max_len=-1,  # Not used in this mode
+            max_seqlen_k=max_seqlen_k,
+            flashmla_metadata=flashmla_metadata,
+        )
+
+    def _precompute_draft_extend_mode(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_cpu: torch.Tensor,
+        spec_info: "SpecInput",
+    ) -> PrecomputedMetadata:
+        """Precompute metadata for draft extend mode."""
+        max_seqlen_k = int(seq_lens_cpu.max().item())
+
+        # Cache seqlens
+        cache_seqlens = seq_lens.to(torch.int32)
+        cu_seqlens_k = compute_cu_seqlens(cache_seqlens)
+
+        # Extend seqlens from spec_info
+        extend_seq_lens = spec_info.accept_length[:bs]
+        extend_seq_lens_cpu = extend_seq_lens.tolist()
+
+        # Page indices (repeated per accept length)
+        page_indices = self.req_to_token[req_pool_indices, :max_seqlen_k]
+        page_indices = torch.repeat_interleave(
+            page_indices, repeats=extend_seq_lens, dim=0
+        ).contiguous()
+
+        # Generate expanded seqlens
+        seqlens_expanded = torch.cat(
+            [
+                torch.arange(
+                    kv_len - qo_len + 1,
+                    kv_len + 1,
+                    dtype=torch.int32,
+                    device=self.device,
+                )
+                for qo_len, kv_len in zip(
+                    extend_seq_lens_cpu,
+                    seq_lens_cpu.tolist(),
+                    strict=True,
+                )
+            ]
+        )
+
+        # Compute NSA seqlens
+        nsa_cache_seqlens = compute_nsa_seqlens(seqlens_expanded, self.nsa_index_topk)
+        seqlens_expanded_size = seqlens_expanded.shape[0]
+
+        # NSA cumsum
+        nsa_cu_seqlens_k = compute_cu_seqlens(nsa_cache_seqlens)
+
+        # Transform page table
+        if self.real_page_size > 1:
+            real_page_table = self._transform_table_1_to_real(page_indices)
+        else:
+            real_page_table = None
+
+        # FlashMLA metadata
+        flashmla_metadata = None
+        if self.nsa_decode_impl == "flashmla_kv":
+            flashmla_metadata = self._compute_flashmla_metadata(
+                cache_seqlens=nsa_cache_seqlens,
+                seq_len_q=1,
+            )
+
+        return PrecomputedMetadata(
+            cache_seqlens=cache_seqlens,
+            cu_seqlens_k=cu_seqlens_k,
+            page_indices=page_indices,
+            real_page_table=real_page_table,
+            seqlens_expanded=seqlens_expanded,
+            nsa_cache_seqlens=nsa_cache_seqlens,
+            nsa_cu_seqlens_k=nsa_cu_seqlens_k,
+            seqlens_expanded_size=seqlens_expanded_size,
+            max_len=max_seqlen_k,
+            max_seqlen_k=max_seqlen_k,
+            flashmla_metadata=flashmla_metadata,
+        )
diff --git a/sglang/python/sglang/srt/layers/attention/nsa/nsa_indexer.py b/sglang/python/sglang/srt/layers/attention/nsa/nsa_indexer.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2e6593328270876c39a7062ea28d443c6ab6195
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/nsa/nsa_indexer.py
@@ -0,0 +1,1483 @@
+from __future__ import annotations
+
+import contextlib
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
+
+import torch
+from einops import rearrange
+
+from sglang.jit_kernel.fused_store_index_cache import (
+    can_use_nsa_fused_store,
+    fused_store_index_k_cache,
+)
+from sglang.srt.environ import envs
+from sglang.srt.layers.dp_attention import attn_tp_all_gather_into_tensor
+from sglang.srt.layers.layernorm import LayerNorm
+from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
+from sglang.srt.layers.utils import MultiPlatformOp
+from sglang.srt.utils import add_prefix, ceil_align, is_cuda, is_hip, is_npu
+
+global _use_multi_stream
+_is_cuda = is_cuda()
+_is_hip = is_hip()
+_is_npu = is_npu()
+_is_fp8_fnuz = is_fp8_fnuz()
+if _is_cuda:
+    try:
+        import deep_gemm
+    except ImportError as e:
+        deep_gemm = e
+
+if is_npu():
+    import torch_npu
+    from sglang.srt.hardware_backend.npu.utils import get_indexer_weight_stream
+
+from sglang.srt.distributed import (
+    get_attn_context_model_parallel_rank,
+    get_attn_context_model_parallel_world_size,
+)
+from sglang.srt.distributed.parallel_state import get_pp_group
+from sglang.srt.layers import deep_gemm_wrapper
+from sglang.srt.layers.attention.nsa.utils import (
+    cp_all_gather_rerange_output,
+    is_nsa_enable_prefill_cp,
+    is_nsa_prefill_cp_in_seq_split,
+)
+from sglang.srt.layers.communicator import ScatterMode
+from sglang.srt.layers.linear import ReplicatedLinear
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.rotary_embedding import get_rope_wrapper
+from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.server_args import get_global_server_args
+
+_use_ag_after_qlora = envs.SGLANG_USE_AG_AFTER_QLORA.get()
+if TYPE_CHECKING:
+    from sglang.srt.mem_cache.memory_pool import NSATokenToKVPool
+
+
+DUAL_STREAM_TOKEN_THRESHOLD = 1024 if _is_cuda else 0
+
+
+class BaseIndexerMetadata(ABC):
+    @abstractmethod
+    def get_seqlens_int32(self) -> torch.Tensor:
+        """
+        Return: (batch_size,) int32 tensor
+        """
+
+    @abstractmethod
+    def get_page_table_64(self) -> torch.Tensor:
+        """
+        Return: (batch_size, num_blocks) int32, page table.
+                The page size of the table is 64.
+        """
+
+    @abstractmethod
+    def get_page_table_1(self) -> torch.Tensor:
+        """
+        Return: (batch_size, num_blocks) int32, page table.
+                The page size of the table is 1.
+        """
+
+    @abstractmethod
+    def get_seqlens_expanded(self) -> torch.Tensor:
+        """
+        Return: (sum_extend_seq_len,) int32 tensor
+        """
+
+    def get_indexer_kvcache_range(self) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Return: (tokens, ), (tokens, ) int32, k_start and k_end in kv cache(token,xxx) for each token.
+        """
+
+    def get_indexer_seq_len_cpu(self) -> torch.Tensor:
+        """
+        Return: seq lens for each batch.
+        """
+
+    def get_nsa_extend_len_cpu(self) -> List[int]:
+        """
+        Return: extend seq lens for each batch.
+        """
+
+    def get_token_to_batch_idx(self) -> torch.Tensor:
+        """
+        Return: batch idx for each token.
+        """
+
+    @abstractmethod
+    def topk_transform(
+        self,
+        logits: torch.Tensor,
+        topk: int,
+    ) -> torch.Tensor:
+        """
+        Perform topk selection on the logits and possibly transform the result.
+
+        NOTE that attention backend may override this function to do some
+        transformation, which means the result of this topk_transform may not
+        be the topk indices of the input logits.
+
+        Return: Anything, since it will be passed to the attention backend
+                for further processing on sparse attention computation.
+                Don't assume it is the topk indices of the input logits.
+        """
+
+
+def rotate_activation(x: torch.Tensor) -> torch.Tensor:
+    assert x.dtype == torch.bfloat16
+    # from sgl_kernel import hadamard_transform
+    if _is_hip:
+        from fast_hadamard_transform import hadamard_transform
+    else:
+        from sglang.jit_kernel.hadamard import hadamard_transform
+
+    hidden_size = x.size(-1)
+    assert (
+        hidden_size & (hidden_size - 1)
+    ) == 0, "Hidden size must be a power of 2 for Hadamard transform."
+    return hadamard_transform(x, scale=hidden_size**-0.5)
+
+
+class Indexer(MultiPlatformOp):
+    def __init__(
+        self,
+        hidden_size: int,
+        index_n_heads: int,
+        index_head_dim: int,
+        rope_head_dim: int,
+        index_topk: int,
+        q_lora_rank: int,
+        max_position_embeddings: int,
+        rope_theta: float,
+        layer_id: int,
+        scale_fmt: Optional[str],
+        block_size: int = 128,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        is_neox_style: bool = True,
+        prefix: str = "",
+        quant_config: Optional[QuantizationConfig] = None,
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.n_heads = index_n_heads
+        self.head_dim = index_head_dim
+        self.rope_head_dim = rope_head_dim
+        self.index_topk = index_topk
+        self.q_lora_rank = q_lora_rank
+        self.layer_id = layer_id
+        self.alt_stream = alt_stream
+        self.nsa_enable_prefill_cp = is_nsa_enable_prefill_cp()
+        if self.nsa_enable_prefill_cp:
+            self.cp_size = get_attn_context_model_parallel_world_size()
+            self.cp_rank = get_attn_context_model_parallel_rank()
+        else:
+            self.cp_size = None
+            self.cp_rank = None
+        if _is_cuda:
+            self.sm_count = deep_gemm.get_num_sms()
+            self.half_device_sm_count = ceil_align(self.sm_count // 2, 8)
+            pp_size = get_global_server_args().pp_size
+            self.logits_with_pp_recv = pp_size > 1 and not get_pp_group().is_last_rank
+        else:
+            self.logits_with_pp_recv = False
+
+        self.wq_b = ReplicatedLinear(
+            self.q_lora_rank,
+            self.n_heads * self.head_dim,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("wq_b", prefix),
+        )
+
+        self.wk = ReplicatedLinear(
+            self.hidden_size,
+            self.head_dim,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("wk", prefix),
+        )
+        self.weights_proj = ReplicatedLinear(
+            self.hidden_size,
+            self.n_heads,
+            bias=False,
+            params_dtype=torch.bfloat16 if _is_cuda else torch.float32,
+            prefix=add_prefix("weights_proj", prefix),
+        )
+        self.k_norm = LayerNorm(self.head_dim, dtype=torch.float32)
+        self.rotary_emb = get_rope_wrapper(
+            rope_head_dim,
+            rotary_dim=rope_head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,  # type: ignore
+            rope_scaling=rope_scaling,
+            is_neox_style=is_neox_style,
+            device=get_global_server_args().device,
+        )
+        self.block_size = block_size
+        self.scale_fmt = scale_fmt
+        self.softmax_scale = self.head_dim**-0.5
+
+    @contextlib.contextmanager
+    def _with_real_sm_count(self):
+        # When pipeline parallelism is enabled, each PP rank initiates a recv operation after the _pp_launch_batch
+        # request to receive the PP proxy tensor or output from the previous stage, occupying one SM resource.
+        # Model execution runs in parallel with the recv operation, so the SMs available to the indexer must be reduced
+        # by 1. Currently, the last rank starts the send result + recv request only after waiting for execution results.
+        if self.logits_with_pp_recv:
+            pp_recv_sm_count = 1
+            with deep_gemm_wrapper.configure_deep_gemm_num_sms(
+                self.sm_count - pp_recv_sm_count
+            ):
+                yield
+        else:
+            yield
+
+    def _weights_proj_bf16_in_fp32_out(self, x: torch.Tensor) -> torch.Tensor:
+        if deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM:
+            weight = self.weights_proj.weight
+            out = torch.empty(
+                (x.shape[0], weight.shape[0]),
+                dtype=torch.float32,
+                device=x.device,
+            )
+            deep_gemm_wrapper.gemm_nt_bf16bf16f32(x, weight, out)
+            return out
+
+        if _is_hip:
+            x = x.to(self.weights_proj.weight.dtype)
+        weights, _ = self.weights_proj(x)
+        return weights.float()
+
+    @torch.compile(dynamic=True) if not _is_hip else lambda f: f
+    def _project_and_scale_head_gates(self, x: torch.Tensor):
+        weights = self._weights_proj_bf16_in_fp32_out(x)
+        weights = weights * self.n_heads**-0.5
+        return weights
+
+    @torch.compile(dynamic=True) if not _is_hip else lambda f: f
+    def _get_logits_head_gate(self, x: torch.Tensor, q_scale: torch.Tensor):
+        weights = self._weights_proj_bf16_in_fp32_out(x)
+        weights = weights * self.n_heads**-0.5
+        weights = weights.unsqueeze(-1) * q_scale * self.softmax_scale
+        return weights
+
+    def _get_q_k_bf16(
+        self,
+        q_lora: torch.Tensor,
+        x: torch.Tensor,
+        positions: torch.Tensor,
+        enable_dual_stream: bool,
+        forward_batch: ForwardBatch,
+    ):
+        if enable_dual_stream:
+            current_stream = torch.cuda.current_stream()
+            self.alt_stream.wait_stream(current_stream)
+
+            with deep_gemm_wrapper.configure_deep_gemm_num_sms(
+                self.half_device_sm_count
+            ):
+                query, _ = self.wq_b(q_lora)
+                query = rearrange(query, "l (h d) -> l h d", d=self.head_dim)
+                q_rope, _ = torch.split(
+                    query,
+                    [self.rope_head_dim, self.head_dim - self.rope_head_dim],
+                    dim=-1,
+                )
+            with torch.cuda.stream(self.alt_stream):
+                # TODO we should also put DeepGEMM half SM here?
+                key, _ = self.wk(x)
+                key = self.k_norm(key)
+
+                k_rope, _ = torch.split(
+                    key,
+                    [self.rope_head_dim, self.head_dim - self.rope_head_dim],
+                    dim=-1,
+                )
+
+            current_stream.wait_stream(self.alt_stream)
+        else:
+            query, _ = self.wq_b(q_lora)
+            query = rearrange(query, "l (h d) -> l h d", d=self.head_dim)
+            q_rope, _ = torch.split(
+                query, [self.rope_head_dim, self.head_dim - self.rope_head_dim], dim=-1
+            )
+            key, _ = self.wk(x)
+            key = self.k_norm(key)
+            k_rope, _ = torch.split(
+                key, [self.rope_head_dim, self.head_dim - self.rope_head_dim], dim=-1
+            )
+
+        q_rope, k_rope = self.rotary_emb(positions, q_rope, k_rope)
+
+        query[..., : self.rope_head_dim] = q_rope
+        key[..., : self.rope_head_dim] = k_rope
+
+        if enable_dual_stream:
+            current_stream = torch.cuda.current_stream()
+            self.alt_stream.wait_stream(current_stream)
+            query = rotate_activation(query)
+
+            with torch.cuda.stream(self.alt_stream):
+                key = rotate_activation(key)
+            current_stream.wait_stream(self.alt_stream)
+        else:
+            query = rotate_activation(query)
+            key = rotate_activation(key)
+
+        # allgather+rerrange
+        if forward_batch.nsa_cp_metadata is not None and self.nsa_enable_prefill_cp:
+            key = cp_all_gather_rerange_output(
+                key.contiguous(),
+                self.cp_size,
+                forward_batch,
+                torch.cuda.current_stream(),
+            )
+        return query, key
+
+    def _get_k_bf16(
+        self,
+        x: torch.Tensor,
+        positions: torch.Tensor,
+        enable_dual_stream: bool,
+    ):
+        # Compute only key, skip query
+        key, _ = self.wk(x)
+        key = self.k_norm(key)
+        k_rope, _ = torch.split(
+            key, [self.rope_head_dim, self.head_dim - self.rope_head_dim], dim=-1
+        )
+
+        _, k_rope = self.rotary_emb(positions, k_rope, k_rope)
+        key[..., : self.rope_head_dim] = k_rope
+        key = rotate_activation(key)
+
+        return key
+
+    def _get_topk_paged(
+        self,
+        forward_batch: ForwardBatch,
+        layer_id: int,
+        q_fp8: torch.Tensor,
+        weights: torch.Tensor,
+        metadata: BaseIndexerMetadata,
+    ) -> torch.Tensor:
+        if TYPE_CHECKING:
+            assert isinstance(forward_batch.token_to_kv_pool, NSATokenToKVPool)
+
+        page_size = forward_batch.token_to_kv_pool.page_size
+        # NOTE(dark): blocksize = 64 is hardcoded in deep_gemm
+        if _is_hip:
+            assert page_size == 1, "only support page size 1"
+            block_tables = metadata.get_page_table_1()
+        else:
+            assert page_size == 64, "only support page size 64"
+            # NOTE(dark): this support extend/decode/decode+graph
+            block_tables = metadata.get_page_table_64()
+
+        max_seq_len = block_tables.shape[1] * page_size
+        kv_cache_fp8 = forward_batch.token_to_kv_pool.get_index_k_with_scale_buffer(
+            layer_id=layer_id
+        )
+
+        blocksize = page_size
+        if (
+            forward_batch.forward_mode.is_target_verify()
+            or forward_batch.forward_mode.is_draft_extend(include_v2=True)
+        ):
+            seqlens_32 = metadata.get_seqlens_expanded()
+        else:
+            seqlens_32 = metadata.get_seqlens_int32()
+        # Reuse pre-computed schedule metadata if available (from init_forward_metadata),
+        # otherwise fall back to computing it here.
+        schedule_metadata = getattr(metadata, "paged_mqa_schedule_metadata", None)
+        if _is_cuda:
+            if schedule_metadata is None:
+                schedule_metadata = deep_gemm.get_paged_mqa_logits_metadata(
+                    seqlens_32, blocksize, self.sm_count
+                )
+
+        assert len(q_fp8.shape) == 3
+        q_fp8 = q_fp8.unsqueeze(1)  # the next_n dim is 1 now
+        assert len(kv_cache_fp8.shape) == 2
+        block_kv = 1 if _is_hip else 64
+        num_heads_kv = 1
+        head_dim_with_sf = 132
+        if _is_hip:
+            kv_cache_fp8 = kv_cache_fp8.view(
+                -1, block_kv, num_heads_kv, head_dim_with_sf
+            )
+        else:
+            kv_cache_fp8 = kv_cache_fp8.view(
+                kv_cache_fp8.shape[0], block_kv, num_heads_kv, head_dim_with_sf
+            )
+        assert len(weights.shape) == 3
+        weights = weights.squeeze(2)
+
+        # When attn_tp_size > 1 or in the MAX_LEN padding mode, padding may exist in the hidden states,
+        # and it is necessary to extract the actual q length.
+        q_offset = sum(metadata.get_nsa_extend_len_cpu())
+        if _is_hip:
+            from aiter.ops.triton.pa_mqa_logits import deepgemm_fp8_paged_mqa_logits
+
+            batch_size, next_n, heads, _ = q_fp8.shape
+            logits = torch.full(
+                (batch_size * next_n, max_seq_len),
+                float("-inf"),
+                device=q_fp8.device,
+                dtype=torch.float32,
+            )
+            deepgemm_fp8_paged_mqa_logits(
+                q_fp8,
+                kv_cache_fp8,
+                weights,
+                logits,
+                seqlens_32,
+                block_tables,
+                max_seq_len,
+                Preshuffle=False,
+                KVBlockSize=block_kv,
+                ChunkK=128,
+                TotalCuCount=256,
+                WavePerEU=5,
+            )
+        else:
+            logits = deep_gemm.fp8_paged_mqa_logits(
+                q_fp8[:q_offset],
+                kv_cache_fp8,
+                weights[:q_offset],
+                seqlens_32,
+                block_tables,
+                schedule_metadata,
+                max_seq_len,
+                clean_logits=False,
+            )
+
+        # NOTE(dark): logits should be cleaned in topk_transform
+        topk_result = metadata.topk_transform(logits, self.index_topk)
+        # Restore possible padding exist in the hidden states.
+        if not _is_hip and q_offset < q_fp8.shape[0]:
+            pad_len = q_fp8.shape[0] - q_offset
+            padding = torch.full(
+                (pad_len, topk_result.shape[1]),
+                -1,
+                dtype=topk_result.dtype,
+                device=topk_result.device,
+            )
+            topk_result = torch.cat([topk_result, padding], dim=0)
+        return topk_result
+
+    def _should_chunk_mqa_logits(
+        self, num_q: int, num_k: int, device: torch.device
+    ) -> Tuple[bool, int]:
+        """
+        Detect whether we need to chunk the MQA logits computation to avoid OOM
+        Return: (need_chunk, free_mem)
+        """
+        # Quick static check for normal batches
+        if num_q * num_k < 8_000_000:  # 8M elements ≈ 32MB logits
+            return False, 0
+
+        free_mem, total_mem = torch.cuda.mem_get_info(device)
+        bytes_per_elem = 4  # float32
+        logits_bytes = num_q * num_k * bytes_per_elem
+
+        # Logits should not exceed 50% of free memory or 30% of total memory
+        need_chunk = (logits_bytes * 2 > free_mem) or (logits_bytes > total_mem * 0.3)
+        return need_chunk, free_mem
+
+    def _get_topk_ragged(
+        self,
+        forward_batch: ForwardBatch,
+        layer_id: int,
+        q_fp8: torch.Tensor,
+        weights: torch.Tensor,
+        metadata: BaseIndexerMetadata,
+    ) -> torch.Tensor:
+        if TYPE_CHECKING:
+            assert isinstance(forward_batch.token_to_kv_pool, NSATokenToKVPool)
+
+        assert forward_batch.forward_mode.is_extend_without_speculative()
+
+        page_size = forward_batch.token_to_kv_pool.page_size
+        if _is_hip:
+            assert page_size == 1, "only support page size 1"
+        else:
+            assert page_size == 64, "only support page size 64"
+
+        assert len(weights.shape) == 3
+        weights = weights.squeeze(-1)
+        k_fp8_list = []
+        k_scale_list = []
+
+        if _is_hip:
+            block_tables = metadata.get_page_table_1()
+        else:
+            block_tables = metadata.get_page_table_64()
+
+        assert (
+            forward_batch.seq_lens_cpu is not None
+            and forward_batch.extend_seq_lens_cpu is not None
+        )
+
+        batch_size = len(block_tables)
+        token_nums, _, _ = q_fp8.shape
+        device = q_fp8.device
+        topk_result = torch.full(
+            (token_nums, self.index_topk), -1, device=device, dtype=torch.int32
+        )
+        if batch_size == 0:
+            return topk_result
+
+        indexer_seq_lens_cpu = metadata.get_indexer_seq_len_cpu()
+        assert len(indexer_seq_lens_cpu) == batch_size
+        for i in range(batch_size):
+            seq_len = indexer_seq_lens_cpu[i].item()
+            assert isinstance(seq_len, int)
+            # Use fused Triton kernel to get both K and scale in a single call
+            k_fp8, k_scale = forward_batch.token_to_kv_pool.get_index_k_scale_buffer(
+                layer_id,
+                seq_len,
+                block_tables[i],
+            )
+            k_fp8_list.append(k_fp8)
+            k_scale_list.append(k_scale)
+        if _is_fp8_fnuz:
+            k_fp8 = torch.cat(k_fp8_list, dim=0).view(torch.float8_e4m3fnuz)
+        else:
+            k_fp8 = torch.cat(k_fp8_list, dim=0).view(torch.float8_e4m3fn)
+        k_scale = torch.cat(k_scale_list, dim=0).view(torch.float32).squeeze(-1)
+        kv_fp8 = (k_fp8, k_scale)
+        ks, ke = metadata.get_indexer_kvcache_range()
+        seq_lens_expanded = metadata.get_seqlens_expanded()
+        token_to_batch_idx = metadata.get_token_to_batch_idx()
+        q_offset = ks.shape[0]
+        k_offset = k_fp8.shape[0]
+
+        # Check if we need to chunk to avoid OOM
+        need_chunk, free_mem = self._should_chunk_mqa_logits(q_offset, k_offset, device)
+
+        if not need_chunk:
+            assert q_fp8[:q_offset].shape[0] != 0
+            with self._with_real_sm_count():
+                if _is_hip:
+                    from aiter.ops.triton.fp8_mqa_logits import fp8_mqa_logits
+
+                    kv, scale = kv_fp8
+                    logits = fp8_mqa_logits(
+                        q_fp8[:q_offset], kv, scale, weights[:q_offset], ks, ke
+                    )
+                else:
+                    logits = deep_gemm.fp8_mqa_logits(
+                        q_fp8[:q_offset],
+                        kv_fp8,
+                        weights[:q_offset],
+                        ks,
+                        ke,
+                        clean_logits=False,
+                    )
+            assert logits.shape[0] == len(seq_lens_expanded)
+            assert logits.shape[1] == k_offset
+
+            raw_topk_result = metadata.topk_transform(logits, self.index_topk, ks=ks)
+            topk_result[:q_offset] = raw_topk_result
+            return topk_result
+
+        # Chunk path
+        bytes_per_elem = 4  # float32
+        bytes_per_row = k_offset * bytes_per_elem
+        # Reserve 50% of free memory for logits
+        max_rows = max(1, int((free_mem * 0.5) // max(bytes_per_row, 1)))
+        max_rows = min(max_rows, q_offset)
+
+        global_topk_offset = metadata.attn_metadata.topk_indices_offset
+
+        assert (
+            seq_lens_expanded.shape[0] == q_offset
+        ), f"seq_lens_expanded length mismatch: {seq_lens_expanded.shape[0]} != {q_offset}"
+        if global_topk_offset is not None:
+            assert (
+                global_topk_offset.shape[0] >= q_offset
+            ), f"topk_indices_offset too short: {global_topk_offset.shape[0]} < {q_offset}"
+
+        start = 0
+        while start < q_offset:
+            end = min(start + max_rows, q_offset)
+
+            with self._with_real_sm_count():
+                if _is_hip:
+                    from aiter.ops.triton.fp8_mqa_logits import fp8_mqa_logits
+
+                    kv, scale = kv_fp8
+                    logits_chunk = fp8_mqa_logits(
+                        q_fp8[start:end],
+                        kv,
+                        scale,
+                        weights[start:end],
+                        ks[start:end],
+                        ke[start:end],
+                    )
+                else:
+                    logits_chunk = deep_gemm.fp8_mqa_logits(
+                        q_fp8[start:end],
+                        kv_fp8,
+                        weights[start:end],
+                        ks[start:end],
+                        ke[start:end],
+                        clean_logits=False,
+                    )
+
+            lengths_chunk = seq_lens_expanded[start:end]
+
+            # RAGGED: use global offset; PAGED: construct local cu_seqlens_q per chunk
+            if global_topk_offset is not None:
+                # RAGGED path
+                topk_offset_chunk = global_topk_offset[start:end]
+                cu_seqlens_q_chunk = None
+                batch_idx_chunk = None
+            else:
+                # PAGED path: treat each token as a length-1 sequence
+                topk_offset_chunk = None
+                B_chunk = logits_chunk.shape[0]
+                cu_seqlens_q_chunk = torch.ones(
+                    B_chunk, dtype=torch.int32, device=device
+                )
+                batch_idx_chunk = token_to_batch_idx[start:end]
+
+            raw_topk_chunk = metadata.topk_transform(
+                logits_chunk,
+                self.index_topk,
+                ks=ks[start:end],
+                cu_seqlens_q=cu_seqlens_q_chunk,
+                ke_offset=lengths_chunk,
+                batch_idx_list=batch_idx_chunk,
+                topk_indices_offset_override=topk_offset_chunk,
+            )
+            topk_result[start:end] = raw_topk_chunk
+            start = end
+
+        return topk_result
+
+    def _forward_cuda_k_only(
+        self,
+        x: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        layer_id: int,
+        act_quant,
+        enable_dual_stream: bool,
+        metadata: BaseIndexerMetadata,
+        return_indices: bool = True,
+    ) -> Optional[torch.Tensor]:
+        assert forward_batch.forward_mode.is_extend_without_speculative()
+        x_meta = x[0] if isinstance(x, tuple) else x
+
+        # Fast path: only compute and store k cache, skip all q and weights ops
+        key = self._get_k_bf16(x, positions, enable_dual_stream)
+
+        if not forward_batch.out_cache_loc.is_contiguous():
+            forward_batch.out_cache_loc = forward_batch.out_cache_loc.contiguous()
+
+        self._store_index_k_cache(
+            forward_batch=forward_batch,
+            layer_id=layer_id,
+            key=key,
+            act_quant=act_quant,
+        )
+
+        # MHA doesn't need topk_indices
+        if not return_indices:
+            return None
+
+        # MLA: use dummy logits with topk kernel's fast path to generate indices
+        # When length <= 2048, naive_topk_cuda directly generates [0,1,...,length-1,-1,...]
+        seq_lens_expanded = metadata.get_seqlens_expanded()
+        dummy_logits = torch.zeros(
+            seq_lens_expanded.shape[0],
+            self.index_topk,
+            dtype=torch.float32,
+            device=x_meta.device,
+        )
+        return metadata.topk_transform(dummy_logits, self.index_topk)
+
+    def _get_topk_ragged_with_cp(
+        self,
+        forward_batch: ForwardBatch,
+        layer_id: int,
+        q_fp8: torch.Tensor,
+        weights: torch.Tensor,
+        metadata: BaseIndexerMetadata,
+        kv_len: int,
+        actual_seq_q: int,
+        cp_index: List[Tuple[int, int, int]] = None,
+    ) -> torch.Tensor:
+        if TYPE_CHECKING:
+            assert isinstance(forward_batch.token_to_kv_pool, NSATokenToKVPool)
+
+        page_size = forward_batch.token_to_kv_pool.page_size
+        assert page_size == 64, "only support page size 64"
+        assert len(weights.shape) == 3
+        weights = weights.squeeze(-1)
+        k_fp8_list = []
+        k_scale_list = []
+        ks_list = []
+        ke_offset_list = []
+        offset = 0
+        actual_seq_q_list = []
+        batch_idx_list = []
+
+        block_tables = metadata.get_page_table_64()
+
+        assert (
+            forward_batch.seq_lens_cpu is not None
+            and forward_batch.extend_seq_lens_cpu is not None
+        )
+        if cp_index is not None:
+            # TODO Multi-batch support has accuracy issues
+            for batch_idx, start_seq_position, end_seq_position in cp_index:
+                pre_chunk_offset = (
+                    forward_batch.seq_lens_cpu[batch_idx].item()
+                    - forward_batch.extend_seq_lens_cpu[batch_idx]
+                )
+                start_seq_position += pre_chunk_offset
+                end_seq_position += pre_chunk_offset
+                if offset == 0 and batch_idx != 0:
+                    offset += forward_batch.extend_seq_lens_cpu[batch_idx - 1]
+                k_fp8 = forward_batch.token_to_kv_pool.get_index_k_continuous(
+                    layer_id,
+                    end_seq_position,
+                    block_tables[batch_idx],
+                )
+                k_scale = forward_batch.token_to_kv_pool.get_index_k_scale_continuous(
+                    layer_id,
+                    end_seq_position,
+                    block_tables[batch_idx],
+                )
+
+                extend_seq_len = end_seq_position - start_seq_position
+                ks = torch.full(
+                    (extend_seq_len,), offset, dtype=torch.int32, device="cuda"
+                )
+                k_fp8_list.append(k_fp8)
+                k_scale_list.append(k_scale)
+                ks_list.append(ks)
+                ke_offset = torch.arange(
+                    start_seq_position + 1,
+                    end_seq_position + 1,
+                    dtype=torch.int32,
+                    device="cuda",
+                )
+                ke_offset_list.append(ke_offset)
+                actual_seq_q = torch.tensor(
+                    [extend_seq_len], dtype=torch.int32, device="cuda"
+                )
+                actual_seq_q_list.append(actual_seq_q)
+                batch_idx_list.append(batch_idx)
+
+            k_fp8 = torch.cat(k_fp8_list, dim=0).view(torch.float8_e4m3fn)
+            k_scale = torch.cat(k_scale_list, dim=0).view(torch.float32).squeeze(-1)
+            kv_fp8 = (k_fp8, k_scale)
+            ks = torch.cat(ks_list, dim=0)
+            ke_offset = torch.cat(ke_offset_list, dim=0)
+            ke = ks + ke_offset
+            actual_seq_q = torch.cat(actual_seq_q_list, dim=0)
+            with self._with_real_sm_count():
+                logits = deep_gemm.fp8_mqa_logits(
+                    q_fp8,
+                    kv_fp8,
+                    weights,
+                    ks,
+                    ke,
+                    clean_logits=False,
+                )
+            topk_result = metadata.topk_transform(
+                logits,
+                self.index_topk,
+                ks=ks,
+                cu_seqlens_q=actual_seq_q,
+                ke_offset=ke_offset,
+                batch_idx_list=batch_idx_list,
+            )
+        else:
+            kv_len = (
+                forward_batch.seq_lens_cpu[0].item()
+                - forward_batch.extend_seq_lens_cpu[0]
+                + kv_len
+            )
+            k_fp8 = forward_batch.token_to_kv_pool.get_index_k_continuous(
+                layer_id,
+                kv_len,
+                block_tables[0],
+            )
+            k_scale = forward_batch.token_to_kv_pool.get_index_k_scale_continuous(
+                layer_id,
+                kv_len,
+                block_tables[0],
+            )
+
+            k_fp8 = k_fp8.view(torch.float8_e4m3fn)
+            k_scale = k_scale.view(torch.float32).squeeze(-1)
+            kv_fp8 = (k_fp8, k_scale)
+            ks = torch.full((actual_seq_q,), offset, dtype=torch.int32, device="cuda")
+            ke_offset = torch.arange(
+                (kv_len - actual_seq_q) + 1,
+                kv_len + 1,
+                dtype=torch.int32,
+                device="cuda",
+            )
+            ke = ks + ke_offset
+
+            with self._with_real_sm_count():
+                logits = deep_gemm.fp8_mqa_logits(
+                    q_fp8,
+                    kv_fp8,
+                    weights,
+                    ks,
+                    ke,
+                    clean_logits=False,
+                )
+            actual_seq_q = torch.tensor([actual_seq_q], dtype=torch.int32).to(
+                device="cuda", non_blocking=True
+            )
+            topk_result = metadata.topk_transform(
+                logits,
+                self.index_topk,
+                ks=ks,
+                cu_seqlens_q=actual_seq_q,
+                ke_offset=ke_offset,
+            )
+
+        return topk_result
+
+    def forward_indexer(
+        self,
+        q_fp8: torch.Tensor,
+        weights: torch.Tensor,
+        forward_batch: ForwardBatch,
+        topk: int,
+        layer_id: int,
+    ) -> Optional[torch.Tensor]:
+        if not _is_npu:
+            from sglang.srt.layers.attention.nsa.tilelang_kernel import fp8_index
+
+        page_size = forward_batch.token_to_kv_pool.page_size
+        assert page_size == 64, "only support page size 64"
+
+        assert len(weights.shape) == 3
+        weights = weights.squeeze(-1)
+
+        # logits = deep_gemm.fp8_mqa_logits(q_fp8, kv_fp8, weights, ks, ke)
+        k_fp8_list = []
+        k_scale_list = []
+
+        topk_indices_list = []
+
+        block_tables = forward_batch.req_to_token_pool.req_to_token[
+            forward_batch.req_pool_indices, :
+        ]
+        strided_indices = torch.arange(
+            0, block_tables.shape[-1], page_size, device="cuda"
+        )
+        block_tables = block_tables[:, strided_indices] // page_size
+
+        q_len_start = 0
+
+        for i in range(forward_batch.batch_size):
+            seq_len = forward_batch.seq_lens[i].item()
+            q_len = (
+                forward_batch.extend_seq_lens_cpu[i]
+                if forward_batch.forward_mode.is_extend()
+                else 1
+            )
+            q_len_end = q_len_start + q_len
+
+            q_fp8_partial = q_fp8[q_len_start:q_len_end]
+            q_fp8_partial = q_fp8_partial.unsqueeze(0).contiguous()
+
+            weights_partial = weights[q_len_start:q_len_end]
+            weights_partial = weights_partial.squeeze(-1).unsqueeze(0).contiguous()
+
+            k_fp8 = forward_batch.token_to_kv_pool.get_index_k_continuous(
+                layer_id,
+                seq_len,
+                block_tables[i],
+            )
+            k_scale = forward_batch.token_to_kv_pool.get_index_k_scale_continuous(
+                layer_id,
+                seq_len,
+                block_tables[i],
+            )
+
+            k_fp8 = k_fp8.view(torch.float8_e4m3fn).unsqueeze(0).contiguous()
+            k_scale = k_scale.view(torch.float32).squeeze(-1).unsqueeze(0).contiguous()
+
+            index_score = fp8_index(
+                q_fp8_partial,
+                weights_partial,
+                k_fp8,
+                k_scale,
+            )
+            end_pos = seq_len
+            topk_indices = index_score.topk(min(topk, end_pos), dim=-1)[1].squeeze(0)
+
+            pad_len = ceil_align(topk_indices.shape[-1], 2048) - topk_indices.shape[-1]
+            topk_indices = torch.nn.functional.pad(
+                topk_indices, (0, pad_len), "constant", -1
+            )
+
+            topk_indices_list.append(topk_indices)
+
+            q_len_start = q_len_end
+
+        topk_indices = torch.cat(topk_indices_list, dim=0)
+        return topk_indices
+
+    def _store_index_k_cache(
+        self,
+        forward_batch: ForwardBatch,
+        layer_id: int,
+        key: torch.Tensor,
+        *,
+        act_quant=None,  # fallback only
+    ) -> None:
+        """
+        Store NSA indexer K cache for current step.
+
+        Preferred: fused_store_index_k_cache(key, cache, out_cache_loc, page_size)
+        Fallback : act_quant(key) + token_to_kv_pool.set_index_k_scale_buffer(...)
+        """
+
+        # Fast path: JIT fused store (CUDA, page_size=64, non-fnuz)
+        if (
+            _is_cuda
+            and (not _is_fp8_fnuz)
+            and can_use_nsa_fused_store(
+                key.dtype,
+                forward_batch.out_cache_loc.dtype,
+                forward_batch.token_to_kv_pool.page_size,
+            )
+        ):
+            # NOTE: wrapper already normalizes shape/contiguity and asserts dtypes.
+            buf = forward_batch.token_to_kv_pool.get_index_k_with_scale_buffer(
+                layer_id=layer_id
+            )
+            fused_store_index_k_cache(
+                key,
+                buf,
+                forward_batch.out_cache_loc,
+                forward_batch.token_to_kv_pool.page_size,
+            )
+            return
+
+        # Fallback: original path
+        assert act_quant is not None
+        k_fp8, k_scale = act_quant(key, self.block_size, self.scale_fmt)
+
+        out_loc = forward_batch.out_cache_loc
+        if not out_loc.is_contiguous():
+            out_loc = out_loc.contiguous()
+
+        forward_batch.token_to_kv_pool.set_index_k_scale_buffer(
+            layer_id=layer_id,
+            loc=out_loc,
+            index_k=k_fp8,
+            index_k_scale=k_scale,
+        )
+
+    def forward_cuda(
+        self,
+        x: torch.Tensor,
+        q_lora: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        layer_id: int,
+        return_indices: bool = True,
+    ) -> Optional[torch.Tensor]:
+        if _is_hip:
+            from sglang.srt.layers.attention.nsa.tilelang_kernel import act_quant
+        elif not _is_npu:
+            from sglang.srt.layers.attention.nsa.triton_kernel import act_quant
+
+        if TYPE_CHECKING:
+            assert isinstance(forward_batch.token_to_kv_pool, NSATokenToKVPool)
+
+        # When upstream uses fused FP8 RMSNorm+quant, activations may be passed as
+        # a tuple like (x_fp8, x_scale[, y]). Use `x_meta` for shape/device queries.
+        x_meta = x[0] if isinstance(x, tuple) else x
+
+        metadata = forward_batch.attn_backend.get_indexer_metadata(
+            layer_id, forward_batch
+        )
+
+        enable_dual_stream = (
+            self.alt_stream is not None
+            and get_is_capture_mode()
+            and q_lora.shape[0] > 0
+            and q_lora.shape[0] <= DUAL_STREAM_TOKEN_THRESHOLD
+        )
+
+        # skip NSA if attention backend choose to skip this batch
+        if metadata is None:
+            return None
+
+        # Determine if should skip topk based on sequence length
+        # We can only skip the logits computation if cuda graph is not involved
+        skip_logits_computation = False
+        if forward_batch.forward_mode.is_extend_without_speculative():
+            if forward_batch.seq_lens_cpu is not None:
+                max_kv_len = forward_batch.seq_lens_cpu.max().item()
+                skip_logits_computation = max_kv_len <= self.index_topk
+
+        # Optimization: fast path when skipping topk computation
+        if skip_logits_computation and (not self.nsa_enable_prefill_cp):
+            return self._forward_cuda_k_only(
+                x,
+                positions,
+                forward_batch,
+                layer_id,
+                act_quant,
+                enable_dual_stream,
+                metadata,
+                return_indices,
+            )
+
+        if enable_dual_stream and forward_batch.forward_mode.is_decode_or_idle():
+            current_stream = torch.cuda.current_stream()
+            self.alt_stream.wait_stream(current_stream)
+            weights = self._project_and_scale_head_gates(x)
+            query, key = self._get_q_k_bf16(
+                q_lora, x, positions, enable_dual_stream, forward_batch=forward_batch
+            )
+            q_fp8, q_scale = act_quant(query, self.block_size, self.scale_fmt)
+            with torch.cuda.stream(self.alt_stream):
+                self._store_index_k_cache(
+                    forward_batch=forward_batch,
+                    layer_id=layer_id,
+                    key=key,
+                    act_quant=act_quant,
+                )
+            current_stream.wait_stream(self.alt_stream)
+            weights = weights.unsqueeze(-1) * q_scale * self.softmax_scale
+        else:
+            query, key = self._get_q_k_bf16(
+                q_lora, x, positions, enable_dual_stream, forward_batch=forward_batch
+            )
+
+            if enable_dual_stream:
+                current_stream = torch.cuda.current_stream()
+                self.alt_stream.wait_stream(current_stream)
+
+                q_fp8, q_scale = act_quant(query, self.block_size, self.scale_fmt)
+                with torch.cuda.stream(self.alt_stream):
+                    self._store_index_k_cache(
+                        forward_batch=forward_batch,
+                        layer_id=layer_id,
+                        key=key,
+                        act_quant=act_quant,
+                    )
+                current_stream.wait_stream(self.alt_stream)
+            else:
+                q_fp8, q_scale = act_quant(query, self.block_size, self.scale_fmt)
+                self._store_index_k_cache(
+                    forward_batch=forward_batch,
+                    layer_id=layer_id,
+                    key=key,
+                    act_quant=act_quant,
+                )
+
+            # `_get_logits_head_gate` expects a Tensor. For tuple activations, dequantize
+            # to a float tensor here (callsite), keeping `_get_logits_head_gate` backend-agnostic.
+            if isinstance(x, tuple):
+                assert len(x) in (
+                    2,
+                    3,
+                ), "For tuple input, only (x, x_s) or (x, x_s, y) formats are accepted"
+                x_q, x_s = x[0], x[1]
+                if (
+                    x_s is not None
+                    and x_q.dim() == 2
+                    and x_s.dim() == 2
+                    and x_q.shape[0] == x_s.shape[0]
+                ):
+                    m, n = x_q.shape
+                    ng = x_s.shape[1]
+                    if ng > 0 and n % ng == 0:
+                        group = n // ng
+                        x_for_gate = (
+                            x_q.to(torch.float32)
+                            .view(m, ng, group)
+                            .mul_(x_s.to(torch.float32).unsqueeze(-1))
+                            .view(m, n)
+                            .to(torch.bfloat16)
+                        )
+                    else:
+                        x_for_gate = x_q.to(torch.bfloat16)
+                else:
+                    x_for_gate = x_q.to(torch.bfloat16)
+            else:
+                x_for_gate = x
+
+            weights = self._get_logits_head_gate(x_for_gate, q_scale)
+
+        if _is_cuda or _is_hip:
+            assert forward_batch.seq_lens_cpu is not None
+            if len(forward_batch.seq_lens_cpu) == 0:
+                # this seems b/c max-pad, no worries?
+                # if x.shape[0] != 0:
+                #     print(
+                #         "HACK: seq_lens empty but x not empty, hackily return all-invalid topk_result"
+                #     )
+                return torch.full(
+                    (x_meta.shape[0], self.index_topk),
+                    -1,
+                    dtype=torch.int,
+                    device=x_meta.device,
+                )
+
+            if (
+                forward_batch.forward_mode.is_decode_or_idle()
+                or forward_batch.forward_mode.is_target_verify()
+                or forward_batch.forward_mode.is_draft_extend(include_v2=True)
+            ):
+                topk_result = self._get_topk_paged(
+                    forward_batch, layer_id, q_fp8, weights, metadata
+                )
+            else:
+                if (
+                    forward_batch.nsa_cp_metadata is not None
+                    and is_nsa_prefill_cp_in_seq_split()
+                ):
+                    kv_len_prev = forward_batch.nsa_cp_metadata.kv_len_prev
+                    kv_len_next = forward_batch.nsa_cp_metadata.kv_len_next
+                    actual_seq_q_prev = forward_batch.nsa_cp_metadata.actual_seq_q_prev
+                    actual_seq_q_next = forward_batch.nsa_cp_metadata.actual_seq_q_next
+
+                    # TODO support mutil-batch
+                    # cp_batch_seq_index_prev = forward_batch.nsa_cp_metadata["cp_batch_seq_index_prev"]
+                    # cp_batch_seq_index_next = forward_batch.nsa_cp_metadata["cp_batch_seq_index_next"]
+                    # TODO prev, next, combined into a single call
+                    q_fp8_prev, q_fp8_next = torch.split(
+                        q_fp8, (q_fp8.shape[0] + 1) // 2, dim=0
+                    )
+                    weights_prev, weights_next = torch.split(
+                        weights, (weights.shape[0] + 1) // 2, dim=0
+                    )
+                    topk_result_prev = self._get_topk_ragged_with_cp(
+                        forward_batch,
+                        layer_id,
+                        q_fp8_prev,
+                        weights_prev,
+                        metadata,
+                        kv_len_prev,
+                        actual_seq_q_prev,
+                    )
+
+                    topk_result_next = self._get_topk_ragged_with_cp(
+                        forward_batch,
+                        layer_id,
+                        q_fp8_next,
+                        weights_next,
+                        metadata,
+                        kv_len_next,
+                        actual_seq_q_next,
+                    )
+                    return torch.cat([topk_result_prev, topk_result_next], dim=0)
+                else:
+                    topk_result = self._get_topk_ragged(
+                        forward_batch, layer_id, q_fp8, weights, metadata
+                    )
+        else:
+            topk_result = self.forward_indexer(
+                q_fp8.contiguous(),
+                weights,
+                forward_batch,
+                topk=self.index_topk,
+                layer_id=layer_id,
+            )
+        return topk_result
+
+    def forward_npu(
+        self,
+        x: torch.Tensor,
+        q_lora: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        layer_id: int,
+        layer_scatter_modes=None,
+        dynamic_scale: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if forward_batch.attn_backend.forward_metadata.seq_lens_cpu_int is None:
+            actual_seq_lengths_kv = forward_batch.attn_backend.forward_metadata.seq_lens
+        else:
+            actual_seq_lengths_kv = (
+                forward_batch.attn_backend.forward_metadata.seq_lens_cpu_int
+            )
+        is_prefill = (
+            forward_batch.forward_mode.is_extend()
+            and not forward_batch.forward_mode.is_draft_extend_v2()
+            and not forward_batch.forward_mode.is_target_verify()
+            and not forward_batch.forward_mode.is_draft_extend()
+        )
+
+        cos_sin = self.rotary_emb.cos_sin_cache[positions]
+        cos, sin = cos_sin.chunk(2, dim=-1)
+        cos = cos.repeat(1, 2).view(-1, 1, 1, self.rope_head_dim)
+        sin = sin.repeat(1, 2).view(-1, 1, 1, self.rope_head_dim)
+
+        bs = q_lora.shape[0]
+        if self.alt_stream is not None:
+            self.alt_stream.wait_stream(torch.npu.current_stream())
+            with torch.npu.stream(self.alt_stream):
+                q_lora = (
+                    (q_lora, dynamic_scale) if dynamic_scale is not None else q_lora
+                )
+                q = self.wq_b(q_lora)[
+                    0
+                ]  # [bs, 1536] @ [1536, 64 * 128] = [bs, 64 * 128]
+                wq_b_event = self.alt_stream.record_event()
+                q = q.view(bs, self.n_heads, self.head_dim)  # [bs, 64, 128]
+                q_pe, q_nope = torch.split(
+                    q,
+                    [self.rope_head_dim, self.head_dim - self.rope_head_dim],
+                    dim=-1,
+                )  # [bs, 64, 64 + 64]
+                q_pe = q_pe.view(bs, self.n_heads, 1, self.rope_head_dim)
+                q_pe = torch_npu.npu_rotary_mul(q_pe, cos, sin).view(
+                    bs, self.n_heads, self.rope_head_dim
+                )  # [bs, n, d]
+                q = torch.cat([q_pe, q_nope], dim=-1)
+                q.record_stream(self.alt_stream)
+                q_rope_event = self.alt_stream.record_event()
+        else:
+            q_lora = (q_lora, dynamic_scale) if dynamic_scale is not None else q_lora
+            q = self.wq_b(q_lora)[0]  # [bs, 1536] @ [1536, 64 * 128] = [bs, 64 * 128]
+            q = q.view(bs, self.n_heads, self.head_dim)  # [bs, 64, 128]
+            q_pe, q_nope = torch.split(
+                q,
+                [self.rope_head_dim, self.head_dim - self.rope_head_dim],
+                dim=-1,
+            )  # [bs, 64, 64 + 64]
+            q_pe = q_pe.view(bs, self.n_heads, 1, self.rope_head_dim)
+            q_pe = torch_npu.npu_rotary_mul(q_pe, cos, sin).view(
+                bs, self.n_heads, self.rope_head_dim
+            )  # [bs, n, d]
+            q = torch.cat([q_pe, q_nope], dim=-1)
+
+        if envs.SGLANG_NPU_USE_MULTI_STREAM.get():
+            indexer_weight_stream = get_indexer_weight_stream()
+            indexer_weight_stream.wait_stream(torch.npu.current_stream())
+            with torch.npu.stream(indexer_weight_stream):
+                x = x.view(-1, self.hidden_size)
+                weights = self.weights_proj(x.float())[0].to(torch.bfloat16)
+                weights.record_stream(indexer_weight_stream)
+                weights_event = indexer_weight_stream.record_event()
+        else:
+            x = x.view(-1, self.hidden_size)
+            weights = self.weights_proj(x.float())[0].to(torch.bfloat16)
+
+        k_proj = self.wk(x)[0]  # [b, s, 7168] @ [7168, 128] = [b, s, 128]
+        k = self.k_norm(k_proj)
+        if (
+            _use_ag_after_qlora
+            and layer_scatter_modes.layer_input_mode == ScatterMode.SCATTERED
+            and layer_scatter_modes.attn_mode == ScatterMode.TP_ATTN_FULL
+        ):
+            k = scattered_to_tp_attn_full(k, forward_batch)
+        k_pe, k_nope = torch.split(
+            k,
+            [self.rope_head_dim, self.head_dim - self.rope_head_dim],
+            dim=-1,
+        )  # [bs, 64 + 64]
+
+        k_pe = k_pe.view(-1, 1, 1, self.rope_head_dim)
+        k_pe = torch.ops.npu.npu_rotary_mul(k_pe, cos, sin).view(
+            bs, 1, self.rope_head_dim
+        )  # [bs, 1, d]
+        k = torch.cat([k_pe, k_nope.unsqueeze(1)], dim=-1)  # [bs, 1, 128]
+
+        if (
+            is_prefill
+            and self.nsa_enable_prefill_cp
+            and forward_batch.nsa_cp_metadata is not None
+        ):
+            k = cp_all_gather_rerange_output(
+                k.contiguous().view(-1, self.head_dim),
+                self.cp_size,
+                forward_batch,
+                torch.npu.current_stream(),
+            )
+
+        forward_batch.token_to_kv_pool.set_index_k_buffer(
+            layer_id, forward_batch.out_cache_loc, k
+        )
+        if is_prefill:
+            if self.nsa_enable_prefill_cp and forward_batch.nsa_cp_metadata is not None:
+                forward_batch.attn_backend.forward_metadata.actual_seq_lengths_q = (
+                    forward_batch.nsa_cp_metadata.actual_seq_q_prev_tensor,
+                    forward_batch.nsa_cp_metadata.actual_seq_q_next_tensor,
+                )
+                forward_batch.attn_backend.forward_metadata.actual_seq_lengths_kv = (
+                    forward_batch.nsa_cp_metadata.kv_len_prev_tensor,
+                    forward_batch.nsa_cp_metadata.kv_len_next_tensor,
+                )
+                actual_seq_lengths_q = (
+                    forward_batch.attn_backend.forward_metadata.actual_seq_lengths_q
+                )
+                actual_seq_lengths_kv = (
+                    forward_batch.attn_backend.forward_metadata.actual_seq_lengths_kv
+                )
+            else:
+                actual_seq_lengths_kv = forward_batch.seq_lens
+                actual_seq_lengths_q = forward_batch.extend_seq_lens.cumsum(dim=0)
+        else:
+            if forward_batch.attn_backend.forward_metadata.actual_seq_lengths_q is None:
+                if (
+                    forward_batch.forward_mode.is_draft_extend_v2()
+                    or forward_batch.forward_mode.is_target_verify()
+                    or forward_batch.forward_mode.is_draft_extend()
+                ):
+                    num_draft_tokens = (
+                        forward_batch.attn_backend.speculative_num_draft_tokens
+                    )
+                    actual_seq_lengths_q = torch.arange(
+                        num_draft_tokens,
+                        num_draft_tokens + bs,
+                        num_draft_tokens,
+                        dtype=torch.int32,
+                        device=k.device,
+                    )
+                else:
+                    actual_seq_lengths_q = torch.tensor(
+                        [1 + i * 1 for i in range(bs)],
+                        dtype=torch.int32,
+                        device=k.device,
+                    )
+            else:
+                actual_seq_lengths_q = (
+                    forward_batch.attn_backend.forward_metadata.actual_seq_lengths_q
+                )
+
+        past_key_states = forward_batch.token_to_kv_pool.get_index_k_buffer(layer_id)
+
+        if self.alt_stream is not None:
+            torch.npu.current_stream().wait_event(q_rope_event)
+        if envs.SGLANG_NPU_USE_MULTI_STREAM.get():
+            torch.npu.current_stream().wait_event(weights_event)
+        if (
+            _use_ag_after_qlora
+            and layer_scatter_modes.layer_input_mode == ScatterMode.SCATTERED
+            and layer_scatter_modes.attn_mode == ScatterMode.TP_ATTN_FULL
+        ):
+            weights = scattered_to_tp_attn_full(weights, forward_batch)
+        block_table = forward_batch.attn_backend.forward_metadata.block_tables
+        if (
+            is_prefill
+            and self.nsa_enable_prefill_cp
+            and forward_batch.nsa_cp_metadata is not None
+        ):
+            block_table = block_table[: actual_seq_lengths_q[0].numel()]
+            topk_indices = self.do_npu_cp_balance_indexer(
+                q.view(-1, self.n_heads, self.head_dim),
+                past_key_states,
+                weights,
+                actual_seq_lengths_q,
+                actual_seq_lengths_kv,
+                block_table,
+            )
+            return topk_indices
+        else:
+            block_table = (
+                block_table[: actual_seq_lengths_q.size()[0]]
+                if is_prefill
+                else block_table
+            )
+
+            topk_indices = torch_npu.npu_lightning_indexer(
+                query=q.view(-1, self.n_heads, self.head_dim),
+                key=past_key_states,
+                weights=weights,
+                actual_seq_lengths_query=actual_seq_lengths_q.to(torch.int32),
+                actual_seq_lengths_key=actual_seq_lengths_kv.to(k.device).to(
+                    torch.int32
+                ),
+                block_table=block_table,
+                layout_query="TND",
+                layout_key="PA_BSND",
+                sparse_count=self.index_topk,
+                sparse_mode=3,
+            )
+            return topk_indices[0]
+
+    def do_npu_cp_balance_indexer(
+        self,
+        q,
+        past_key_states,
+        indexer_weights,
+        actual_seq_lengths_q,
+        actual_seq_lengths_kv,
+        block_table,
+    ):
+        q_prev, q_next = torch.split(q, (q.size(0) + 1) // 2, dim=0)
+        weights_prev, weights_next = None, None
+        if indexer_weights is not None:
+            weights_prev, weights_next = torch.split(
+                indexer_weights, (indexer_weights.size(0) + 1) // 2, dim=0
+            )
+            weights_prev = weights_prev.contiguous().view(-1, weights_prev.shape[-1])
+            weights_next = weights_next.contiguous().view(-1, weights_next.shape[-1])
+
+        actual_seq_lengths_q_prev, actual_seq_lengths_q_next = actual_seq_lengths_q
+        actual_seq_lengths_kv_prev, actual_seq_lengths_kv_next = actual_seq_lengths_kv
+
+        topk_indices_prev = torch_npu.npu_lightning_indexer(
+            query=q_prev,
+            key=past_key_states,
+            weights=weights_prev,
+            actual_seq_lengths_query=actual_seq_lengths_q_prev.to(
+                device=q.device, dtype=torch.int32
+            ),
+            actual_seq_lengths_key=actual_seq_lengths_kv_prev.to(
+                device=q.device, dtype=torch.int32
+            ),
+            block_table=block_table,
+            layout_query="TND",
+            layout_key="PA_BSND",
+            sparse_count=self.index_topk,
+            sparse_mode=3,
+        )
+        topk_indices_next = torch_npu.npu_lightning_indexer(
+            query=q_next,
+            key=past_key_states,
+            weights=weights_next,
+            actual_seq_lengths_query=actual_seq_lengths_q_next.to(
+                device=q.device, dtype=torch.int32
+            ),
+            actual_seq_lengths_key=actual_seq_lengths_kv_next.to(
+                device=q.device, dtype=torch.int32
+            ),
+            block_table=block_table,
+            layout_query="TND",
+            layout_key="PA_BSND",
+            sparse_count=self.index_topk,
+            sparse_mode=3,
+        )
+        return topk_indices_prev[0], topk_indices_next[0]
+
+
+def scattered_to_tp_attn_full(
+    hidden_states: torch.Tensor,
+    forward_batch,
+) -> torch.Tensor:
+    hidden_states, local_hidden_states = (
+        torch.empty(
+            (forward_batch.input_ids.shape[0], hidden_states.shape[1]),
+            dtype=hidden_states.dtype,
+            device=hidden_states.device,
+        ),
+        hidden_states,
+    )
+    attn_tp_all_gather_into_tensor(hidden_states, local_hidden_states.contiguous())
+    return hidden_states
diff --git a/sglang/python/sglang/srt/layers/attention/nsa/nsa_mtp_verification.py b/sglang/python/sglang/srt/layers/attention/nsa/nsa_mtp_verification.py
new file mode 100644
index 0000000000000000000000000000000000000000..b957d4ba8a987c5914cd4941a6c3c45141070cd4
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/nsa/nsa_mtp_verification.py
@@ -0,0 +1,407 @@
+"""
+Verification utilities for NSA backend fused metadata copy operations.
+
+This module contains verification code to ensure that fused metadata copy kernels
+produce the same results as individual copy operations.
+"""
+
+import torch
+
+
+def verify_single_backend_fused_metadata_copy(
+    metadata,
+    precomputed,
+    forward_mode,
+    bs,
+    flashmla_num_splits_src=None,
+    flashmla_metadata_src=None,
+    flashmla_num_splits_dst=None,
+    flashmla_metadata_dst=None,
+):
+    """
+    Verify that the fused metadata copy kernel produces the same results as individual copies.
+
+    Args:
+        metadata: The NSA metadata object containing destination tensors
+        precomputed: The precomputed metadata containing source tensors
+        forward_mode: The forward mode (decode, target_verify, or draft_extend)
+        bs: Batch size
+        flashmla_num_splits_src: Source FlashMLA num_splits tensor (optional)
+        flashmla_metadata_src: Source FlashMLA metadata tensor (optional)
+        flashmla_num_splits_dst: Destination FlashMLA num_splits tensor (optional)
+        flashmla_metadata_dst: Destination FlashMLA metadata tensor (optional)
+
+    Raises:
+        RuntimeError: If verification fails (tensors don't match)
+    """
+    # Clone destination tensors to preserve fused kernel results
+    fused_cache_seqlens = metadata.cache_seqlens_int32.clone()
+    fused_cu_seqlens_k = metadata.cu_seqlens_k.clone()
+    fused_page_table_1 = metadata.page_table_1.clone()
+    fused_nsa_cache_seqlens = metadata.nsa_cache_seqlens_int32.clone()
+    fused_nsa_seqlens_expanded = metadata.nsa_seqlens_expanded.clone()
+    fused_nsa_cu_seqlens_k = metadata.nsa_cu_seqlens_k.clone()
+    fused_real_page_table = (
+        metadata.real_page_table.clone()
+        if precomputed.real_page_table is not None
+        else None
+    )
+    fused_flashmla_num_splits = None
+    fused_flashmla_metadata = None
+    if precomputed.flashmla_metadata is not None:
+        fused_flashmla_num_splits = flashmla_num_splits_dst.clone()
+        fused_flashmla_metadata = flashmla_metadata_dst.clone()
+
+    # Create reference tensors (zeroed out)
+    ref_cache_seqlens = torch.zeros_like(metadata.cache_seqlens_int32)
+    ref_cu_seqlens_k = torch.zeros_like(metadata.cu_seqlens_k)
+    ref_page_table_1 = torch.zeros_like(metadata.page_table_1)
+    ref_nsa_cache_seqlens = torch.zeros_like(metadata.nsa_cache_seqlens_int32)
+    ref_nsa_seqlens_expanded = torch.zeros_like(metadata.nsa_seqlens_expanded)
+    ref_nsa_cu_seqlens_k = torch.zeros_like(metadata.nsa_cu_seqlens_k)
+    ref_real_page_table = (
+        torch.zeros_like(metadata.real_page_table)
+        if precomputed.real_page_table is not None
+        else None
+    )
+    ref_flashmla_num_splits = None
+    ref_flashmla_metadata = None
+    if precomputed.flashmla_metadata is not None:
+        ref_flashmla_num_splits = torch.zeros_like(flashmla_num_splits_dst)
+        ref_flashmla_metadata = torch.zeros_like(flashmla_metadata_dst)
+
+    # Run individual copy operations (reference implementation)
+    ref_cache_seqlens.copy_(precomputed.cache_seqlens)
+    ref_cu_seqlens_k[1:].copy_(precomputed.cu_seqlens_k[1:])
+
+    if forward_mode.is_decode_or_idle():
+        # Decode mode
+        ref_page_table_1[:, : precomputed.max_len].copy_(precomputed.page_indices)
+        ref_nsa_cache_seqlens.copy_(precomputed.nsa_cache_seqlens)
+    elif forward_mode.is_target_verify():
+        # Target verify mode
+        ref_page_table_1[:, : precomputed.max_seqlen_k].copy_(precomputed.page_indices)
+        ref_nsa_seqlens_expanded.copy_(precomputed.seqlens_expanded)
+        ref_nsa_cache_seqlens.copy_(precomputed.nsa_cache_seqlens)
+    elif forward_mode.is_draft_extend():
+        # Draft extend mode
+        rows = precomputed.page_indices.shape[0]
+        cols = precomputed.max_seqlen_k
+        ref_page_table_1[:rows, :cols].copy_(precomputed.page_indices)
+        size = precomputed.seqlens_expanded_size
+        ref_nsa_seqlens_expanded[:size].copy_(precomputed.seqlens_expanded)
+        ref_nsa_cache_seqlens[:size].copy_(precomputed.nsa_cache_seqlens)
+
+    # Copy NSA cu_seqlens
+    size = precomputed.seqlens_expanded_size
+    ref_nsa_cu_seqlens_k[1 : 1 + size].copy_(precomputed.nsa_cu_seqlens_k[1 : 1 + size])
+
+    # Copy real page table
+    if precomputed.real_page_table is not None:
+        rows, cols = precomputed.real_page_table.shape
+        ref_real_page_table[:rows, :cols].copy_(precomputed.real_page_table)
+
+    # Copy FlashMLA metadata
+    if precomputed.flashmla_metadata is not None:
+        size = precomputed.seqlens_expanded_size
+        ref_flashmla_num_splits[: size + 1].copy_(flashmla_num_splits_src[: size + 1])
+        ref_flashmla_metadata.copy_(flashmla_metadata_src)
+
+    # Compare results and crash if inconsistent
+    def check_tensor_equal(name, fused, ref):
+        if not torch.equal(fused, ref):
+            max_diff = (fused.float() - ref.float()).abs().max().item()
+            mismatched_elements = (fused != ref).sum().item()
+            total_elements = fused.numel()
+            raise RuntimeError(
+                f"FUSED METADATA COPY VERIFICATION FAILED!\n"
+                f"Tensor: {name}\n"
+                f"Max difference: {max_diff}\n"
+                f"Mismatched elements: {mismatched_elements}/{total_elements}\n"
+                f"Fused shape: {fused.shape}, Ref shape: {ref.shape}\n"
+                f"Forward mode: {forward_mode}, bs={bs}\n"
+                f"The fused kernel produces different results than individual copies.\n"
+                f"This indicates a bug in the fused metadata copy kernel."
+            )
+
+    # Verify all tensors (only compare the slices that were actually updated)
+    check_tensor_equal("cache_seqlens", fused_cache_seqlens, ref_cache_seqlens)
+    check_tensor_equal("cu_seqlens_k", fused_cu_seqlens_k, ref_cu_seqlens_k)
+
+    # Compare page_table_1 only for the region that was updated
+    if forward_mode.is_decode_or_idle():
+        check_tensor_equal(
+            "page_table_1",
+            fused_page_table_1[:, : precomputed.max_len],
+            ref_page_table_1[:, : precomputed.max_len],
+        )
+    elif forward_mode.is_target_verify():
+        check_tensor_equal(
+            "page_table_1",
+            fused_page_table_1[:, : precomputed.max_seqlen_k],
+            ref_page_table_1[:, : precomputed.max_seqlen_k],
+        )
+    elif forward_mode.is_draft_extend():
+        rows = precomputed.page_indices.shape[0]
+        cols = precomputed.max_seqlen_k
+        check_tensor_equal(
+            "page_table_1",
+            fused_page_table_1[:rows, :cols],
+            ref_page_table_1[:rows, :cols],
+        )
+
+    # Compare nsa_cache_seqlens only for the region that was updated
+    if forward_mode.is_decode_or_idle():
+        check_tensor_equal(
+            "nsa_cache_seqlens",
+            fused_nsa_cache_seqlens,
+            ref_nsa_cache_seqlens,
+        )
+    else:  # TARGET_VERIFY or DRAFT_EXTEND
+        size = precomputed.seqlens_expanded_size
+        check_tensor_equal(
+            "nsa_cache_seqlens",
+            fused_nsa_cache_seqlens[:size],
+            ref_nsa_cache_seqlens[:size],
+        )
+
+    # Compare nsa_seqlens_expanded only for TARGET_VERIFY and DRAFT_EXTEND
+    if forward_mode.is_target_verify() or forward_mode.is_draft_extend():
+        size = precomputed.seqlens_expanded_size
+        check_tensor_equal(
+            "nsa_seqlens_expanded",
+            fused_nsa_seqlens_expanded[:size],
+            ref_nsa_seqlens_expanded[:size],
+        )
+
+    # Compare nsa_cu_seqlens_k only for the region that was updated
+    size = precomputed.seqlens_expanded_size
+    check_tensor_equal(
+        "nsa_cu_seqlens_k",
+        fused_nsa_cu_seqlens_k[: 1 + size],
+        ref_nsa_cu_seqlens_k[: 1 + size],
+    )
+
+    if precomputed.real_page_table is not None:
+        rows, cols = precomputed.real_page_table.shape
+        check_tensor_equal(
+            "real_page_table",
+            fused_real_page_table[:rows, :cols],
+            ref_real_page_table[:rows, :cols],
+        )
+
+    if precomputed.flashmla_metadata is not None:
+        size = precomputed.seqlens_expanded_size
+        check_tensor_equal(
+            "flashmla_num_splits",
+            fused_flashmla_num_splits[: size + 1],
+            ref_flashmla_num_splits[: size + 1],
+        )
+        check_tensor_equal(
+            "flashmla_metadata",
+            fused_flashmla_metadata,
+            ref_flashmla_metadata,
+        )
+
+
+def verify_multi_backend_fused_metadata_copy(
+    metadata0,
+    metadata1,
+    metadata2,
+    precomputed,
+    bs,
+    flashmla_num_splits_src=None,
+    flashmla_metadata_src=None,
+):
+    """
+    Verify that the multi-backend fused metadata copy kernel produces the same results
+    as individual copies for all three backends.
+
+    Args:
+        metadata0: The NSA metadata object for backend 0
+        metadata1: The NSA metadata object for backend 1
+        metadata2: The NSA metadata object for backend 2
+        precomputed: The precomputed metadata containing source tensors
+        bs: Batch size
+        flashmla_num_splits_src: Source FlashMLA num_splits tensor (optional)
+        flashmla_metadata_src: Source FlashMLA metadata tensor (optional)
+
+    Raises:
+        RuntimeError: If verification fails (tensors don't match)
+    """
+    # Clone destination tensors to preserve fused kernel results
+    fused_results = []
+    for idx, metadata in enumerate([metadata0, metadata1, metadata2]):
+        fused_cache_seqlens = metadata.cache_seqlens_int32.clone()
+        fused_cu_seqlens_k = metadata.cu_seqlens_k.clone()
+        fused_page_table_1 = metadata.page_table_1.clone()
+        fused_nsa_cache_seqlens = metadata.nsa_cache_seqlens_int32.clone()
+        fused_nsa_cu_seqlens_k = metadata.nsa_cu_seqlens_k.clone()
+        fused_real_page_table = (
+            metadata.real_page_table.clone()
+            if precomputed.real_page_table is not None
+            else None
+        )
+        fused_flashmla_num_splits = None
+        fused_flashmla_metadata = None
+        if precomputed.flashmla_metadata is not None:
+            fused_flashmla_num_splits = metadata.flashmla_metadata.num_splits.clone()
+            fused_flashmla_metadata = (
+                metadata.flashmla_metadata.flashmla_metadata.clone()
+            )
+
+        fused_results.append(
+            {
+                "cache_seqlens": fused_cache_seqlens,
+                "cu_seqlens_k": fused_cu_seqlens_k,
+                "page_table_1": fused_page_table_1,
+                "nsa_cache_seqlens": fused_nsa_cache_seqlens,
+                "nsa_cu_seqlens_k": fused_nsa_cu_seqlens_k,
+                "real_page_table": fused_real_page_table,
+                "flashmla_num_splits": fused_flashmla_num_splits,
+                "flashmla_metadata": fused_flashmla_metadata,
+            }
+        )
+
+    # Run individual copy operations for each backend (reference implementation)
+    ref_results = []
+    for idx in range(3):
+        metadata = [metadata0, metadata1, metadata2][idx]
+
+        # Create reference tensors (zeroed out)
+        ref_cache_seqlens = torch.zeros_like(metadata.cache_seqlens_int32)
+        ref_cu_seqlens_k = torch.zeros_like(metadata.cu_seqlens_k)
+        ref_page_table_1 = torch.zeros_like(metadata.page_table_1)
+        ref_nsa_cache_seqlens = torch.zeros_like(metadata.nsa_cache_seqlens_int32)
+        ref_nsa_cu_seqlens_k = torch.zeros_like(metadata.nsa_cu_seqlens_k)
+        ref_real_page_table = (
+            torch.zeros_like(metadata.real_page_table)
+            if precomputed.real_page_table is not None
+            else None
+        )
+        ref_flashmla_num_splits = None
+        ref_flashmla_metadata = None
+        if precomputed.flashmla_metadata is not None:
+            ref_flashmla_num_splits = torch.zeros_like(
+                metadata.flashmla_metadata.num_splits
+            )
+            ref_flashmla_metadata = torch.zeros_like(
+                metadata.flashmla_metadata.flashmla_metadata
+            )
+
+        # Copy operations (decode mode)
+        ref_cache_seqlens.copy_(precomputed.cache_seqlens)
+        ref_cu_seqlens_k[1:].copy_(precomputed.cu_seqlens_k[1:])
+        ref_page_table_1[:, : precomputed.max_len].copy_(precomputed.page_indices)
+        ref_nsa_cache_seqlens.copy_(precomputed.nsa_cache_seqlens)
+
+        # Copy NSA cu_seqlens
+        size = precomputed.seqlens_expanded_size
+        ref_nsa_cu_seqlens_k[1 : 1 + size].copy_(
+            precomputed.nsa_cu_seqlens_k[1 : 1 + size]
+        )
+
+        # Copy real page table
+        if precomputed.real_page_table is not None:
+            rows, cols = precomputed.real_page_table.shape
+            ref_real_page_table[:rows, :cols].copy_(precomputed.real_page_table)
+
+        # Copy FlashMLA metadata
+        if precomputed.flashmla_metadata is not None:
+            ref_flashmla_num_splits[: size + 1].copy_(
+                flashmla_num_splits_src[: size + 1]
+            )
+            ref_flashmla_metadata.copy_(flashmla_metadata_src)
+
+        ref_results.append(
+            {
+                "cache_seqlens": ref_cache_seqlens,
+                "cu_seqlens_k": ref_cu_seqlens_k,
+                "page_table_1": ref_page_table_1,
+                "nsa_cache_seqlens": ref_nsa_cache_seqlens,
+                "nsa_cu_seqlens_k": ref_nsa_cu_seqlens_k,
+                "real_page_table": ref_real_page_table,
+                "flashmla_num_splits": ref_flashmla_num_splits,
+                "flashmla_metadata": ref_flashmla_metadata,
+            }
+        )
+
+    # Compare results for all 3 backends
+    def check_tensor_equal(backend_idx, name, fused, ref):
+        if not torch.equal(fused, ref):
+            max_diff = (fused.float() - ref.float()).abs().max().item()
+            mismatched_elements = (fused != ref).sum().item()
+            total_elements = fused.numel()
+            raise RuntimeError(
+                f"MULTI-BACKEND FUSED METADATA COPY VERIFICATION FAILED!\n"
+                f"Backend: {backend_idx}\n"
+                f"Tensor: {name}\n"
+                f"Max difference: {max_diff}\n"
+                f"Mismatched elements: {mismatched_elements}/{total_elements}\n"
+                f"Fused shape: {fused.shape}, Ref shape: {ref.shape}\n"
+                f"Batch size: {bs}\n"
+                f"The multi-backend fused kernel produces different results than individual copies.\n"
+                f"This indicates a bug in the fused metadata copy kernel."
+            )
+
+    # Verify all tensors for all 3 backends (multi-backend is DECODE mode only)
+    for idx in range(3):
+        fused = fused_results[idx]
+        ref = ref_results[idx]
+
+        check_tensor_equal(
+            idx,
+            "cache_seqlens",
+            fused["cache_seqlens"],
+            ref["cache_seqlens"],
+        )
+        check_tensor_equal(
+            idx,
+            "cu_seqlens_k",
+            fused["cu_seqlens_k"],
+            ref["cu_seqlens_k"],
+        )
+        # Multi-backend is DECODE mode only, so compare only [:, :max_len]
+        check_tensor_equal(
+            idx,
+            "page_table_1",
+            fused["page_table_1"][:, : precomputed.max_len],
+            ref["page_table_1"][:, : precomputed.max_len],
+        )
+        check_tensor_equal(
+            idx,
+            "nsa_cache_seqlens",
+            fused["nsa_cache_seqlens"],
+            ref["nsa_cache_seqlens"],
+        )
+        # DECODE mode uses bs for nsa_cu_seqlens_k size
+        check_tensor_equal(
+            idx,
+            "nsa_cu_seqlens_k",
+            fused["nsa_cu_seqlens_k"][: bs + 1],
+            ref["nsa_cu_seqlens_k"][: bs + 1],
+        )
+
+        if precomputed.real_page_table is not None:
+            rows, cols = precomputed.real_page_table.shape
+            check_tensor_equal(
+                idx,
+                "real_page_table",
+                fused["real_page_table"][:rows, :cols],
+                ref["real_page_table"][:rows, :cols],
+            )
+
+        if precomputed.flashmla_metadata is not None:
+            # DECODE mode uses bs + 1 for flashmla_num_splits
+            check_tensor_equal(
+                idx,
+                "flashmla_num_splits",
+                fused["flashmla_num_splits"][: bs + 1],
+                ref["flashmla_num_splits"][: bs + 1],
+            )
+            check_tensor_equal(
+                idx,
+                "flashmla_metadata",
+                fused["flashmla_metadata"],
+                ref["flashmla_metadata"],
+            )
diff --git a/sglang/python/sglang/srt/layers/attention/nsa/quant_k_cache.py b/sglang/python/sglang/srt/layers/attention/nsa/quant_k_cache.py
new file mode 100644
index 0000000000000000000000000000000000000000..5454071b897d329f66ab6aab76d2791954fde422
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/nsa/quant_k_cache.py
@@ -0,0 +1,449 @@
+import torch
+import triton
+import triton.language as tl
+
+
+def quantize_k_cache(cache_k):
+    return _quantize_k_cache_fast_wrapped(cache_k)
+
+
+def quantize_k_cache_separate(
+    k_nope: torch.Tensor,
+    k_rope: torch.Tensor,
+    tile_size: int = 128,
+):
+    """
+    Quantize k_nope and k_rope separately without concat, returns two tensors.
+
+    This avoids the concat operation and enables direct reuse of set_mla_kv_buffer_triton
+    by returning two separate byte tensors for the nope and rope parts.
+
+    Args:
+        k_nope: (num_tokens, dim_nope) or (num_tokens, 1, dim_nope)
+                Must have dim_nope=512 for FP8 MLA quantization
+        k_rope: (num_tokens, dim_rope) or (num_tokens, 1, dim_rope)
+                Must have dim_rope=64 for FP8 MLA quantization
+        tile_size: quantization tile size (default 128)
+
+    Returns:
+        Tuple of (nope_part, rope_part) where:
+        - nope_part: (num_tokens, 1, 528) as uint8 view, contains [nope_fp8(512) | scales(16)]
+        - rope_part: (num_tokens, 1, 128) as uint8 view, contains [rope_bf16_bytes(128)]
+
+        These two tensors can be directly passed to set_mla_kv_buffer_triton(kv_buffer, loc, nope_part, rope_part)
+    """
+    # Squeeze middle dimension if present
+    k_nope_2d = k_nope.squeeze(1) if k_nope.ndim == 3 else k_nope
+    k_rope_2d = k_rope.squeeze(1) if k_rope.ndim == 3 else k_rope
+
+    num_tokens = k_nope_2d.shape[0]
+    dim_nope = k_nope_2d.shape[1]
+    dim_rope = k_rope_2d.shape[1]
+
+    # Validate dimensions for FP8 MLA
+    if dim_nope != 512:
+        raise ValueError(f"Expected dim_nope=512 for FP8 MLA, got {dim_nope}")
+    if dim_rope != 64:
+        raise ValueError(f"Expected dim_rope=64 for FP8 MLA, got {dim_rope}")
+    if k_rope_2d.shape[0] != num_tokens:
+        raise ValueError(
+            f"k_nope and k_rope must have same num_tokens, got {num_tokens} vs {k_rope_2d.shape[0]}"
+        )
+
+    return _quantize_k_cache_fast_separate(
+        k_nope=k_nope_2d, k_rope=k_rope_2d, group_size=tile_size
+    )
+
+
+# Copied from original
+def _quantize_k_cache_ref(
+    input_k_cache: torch.Tensor,  # (num_blocks, block_size, h_k, d)
+    dv: int = 512,
+    tile_size: int = 128,
+) -> torch.Tensor:
+    """
+    Quantize the k-cache
+    Return a tensor with shape (num_blocks, block_size, h_k, dv + 4(dv/tile_size) + t(d-dv)) of dtype uint8_t, where t = input_k_cache.element_size()
+    For more detail about the layout of K/V, please refer to comments in flash_mla_interface.py or README.md
+    """
+    assert dv % tile_size == 0
+    num_tiles = dv // tile_size
+    num_blocks, block_size, h_k, d = input_k_cache.shape
+    assert h_k == 1
+    input_k_cache = input_k_cache.squeeze(2)  # [num_blocks, block_size, d]
+    input_elem_size = input_k_cache.element_size()
+
+    result = torch.empty(
+        (num_blocks, block_size, dv + num_tiles * 4 + input_elem_size * (d - dv)),
+        dtype=torch.float8_e4m3fn,
+        device=input_k_cache.device,
+    )
+    result_k_nope_part = result[..., :dv]
+    result_k_scale_factor = result[..., dv : dv + num_tiles * 4].view(torch.float32)
+    result_k_rope_part = result[..., dv + num_tiles * 4 :].view(input_k_cache.dtype)
+    result_k_rope_part[:] = input_k_cache[..., dv:]
+
+    for tile_idx in range(0, num_tiles):
+        cur_scale_factors_inv = (
+            torch.abs(
+                input_k_cache[..., tile_idx * tile_size : (tile_idx + 1) * tile_size]
+            )
+            .max(dim=-1)
+            .values
+            / 448.0
+        )  # [num_blocks, block_size]
+        result_k_scale_factor[:, :, tile_idx] = cur_scale_factors_inv
+
+        cur_scale_factors_inv.unsqueeze_(-1)  # [num_blocks, block_size, 1]
+        cur_quantized_nope = (
+            input_k_cache[
+                ..., tile_idx * tile_size : (tile_idx + 1) * tile_size
+            ].float()
+            / cur_scale_factors_inv.float()
+        ).to(torch.float8_e4m3fn)
+        result_k_nope_part[..., tile_idx * tile_size : (tile_idx + 1) * tile_size] = (
+            cur_quantized_nope
+        )
+
+    result = result.view(num_blocks, block_size, 1, -1)
+    return result
+
+
+def _quantize_k_cache_fast_wrapped(
+    input_k_cache: torch.Tensor,
+    dv: int = 512,
+    tile_size: int = 128,
+) -> torch.Tensor:
+    # TODO the final API may be 2D instead of 4D, thus we convert them here
+    num_blocks, block_size, _, dim_nope_and_rope = input_k_cache.shape
+    assert dv == 512
+    assert dim_nope_and_rope == 512 + 64
+    assert tile_size == 128
+    input_k_cache = input_k_cache.view((-1, dim_nope_and_rope))
+
+    # TODO deliberately split into two tensors, then upstream can provide the two tensors instead of concat into one
+    k_nope = input_k_cache[:, :dv]
+    k_rope = input_k_cache[:, dv:]
+
+    output = _quantize_k_cache_fast(k_nope=k_nope, k_rope=k_rope)
+
+    return output.view(num_blocks, block_size, 1, -1)
+
+
+def _quantize_k_cache_fast(k_nope, k_rope, group_size: int = 128):
+    """
+    :param k_nope: (num_tokens, dim_nope 512)
+    :param k_rope: (num_tokens, dim_rope 64)
+    """
+
+    assert k_nope.dtype == torch.bfloat16
+    assert k_rope.dtype == torch.bfloat16
+
+    num_tokens, dim_nope = k_nope.shape
+    num_tokens_, dim_rope = k_rope.shape
+    assert num_tokens == num_tokens_
+    assert dim_nope == 512
+    assert dim_rope == 64
+    assert k_nope.dtype == k_rope.dtype
+    num_tiles = dim_nope // group_size
+
+    assert k_nope.stride(1) == 1
+    assert k_rope.stride(1) == 1
+
+    output = torch.empty(
+        (num_tokens, dim_nope + num_tiles * 4 + k_rope.element_size() * dim_rope),
+        dtype=torch.float8_e4m3fn,
+        device=k_nope.device,
+    )
+    output_nope_q = output[..., :dim_nope]
+    output_nope_s = output[..., dim_nope : dim_nope + num_tiles * 4].view(torch.float32)
+    output_rope = output[..., dim_nope + num_tiles * 4 :].view(torch.bfloat16)
+
+    num_blocks_per_token = triton.cdiv(dim_nope + dim_rope, group_size)
+    assert num_blocks_per_token == 5
+
+    assert dim_nope % group_size == 0
+    NUM_NOPE_BLOCKS = dim_nope // group_size
+
+    _quantize_k_cache_fast_kernel[(num_tokens, num_blocks_per_token)](
+        output_nope_q,
+        output_nope_s,
+        output_rope,
+        k_nope,
+        k_rope,
+        output_nope_q.stride(0),
+        output_nope_s.stride(0),
+        output_rope.stride(0),
+        k_nope.stride(0),
+        k_rope.stride(0),
+        NUM_NOPE_BLOCKS=NUM_NOPE_BLOCKS,
+        GROUP_SIZE=group_size,
+        DIM_NOPE=dim_nope,
+        DIM_ROPE=dim_rope,
+        FP8_MIN=torch.finfo(torch.float8_e4m3fn).min,
+        FP8_MAX=torch.finfo(torch.float8_e4m3fn).max,
+    )
+
+    return output
+
+
+def _quantize_k_cache_fast_separate(k_nope, k_rope, group_size: int = 128):
+    """
+    Quantize k_nope and k_rope in a single Triton kernel, directly outputting two separate tensors.
+
+    This avoids packing/unpacking and enables direct use with set_mla_kv_buffer_triton.
+
+    :param k_nope: (num_tokens, dim_nope 512) bfloat16
+    :param k_rope: (num_tokens, dim_rope 64) bfloat16
+    :param group_size: quantization tile size (default 128, kernel is tuned for this value)
+    :return: Tuple of (nope_part_u8, rope_part_u8)
+        - nope_part_u8: (num_tokens, 1, nope_part_bytes) uint8, layout [nope_fp8(dim_nope) | scales(num_tiles*4)]
+        - rope_part_u8: (num_tokens, 1, rope_part_bytes) uint8, layout [rope_bf16_bytes(dim_rope*2)]
+    """
+    num_tokens, dim_nope = k_nope.shape
+    num_tokens_, dim_rope = k_rope.shape
+
+    assert num_tokens == num_tokens_, f"k_nope and k_rope must have same num_tokens"
+
+    # Ensure contiguous tensors for kernel
+    k_nope = k_nope.contiguous()
+    k_rope = k_rope.contiguous()
+
+    num_tiles = dim_nope // group_size
+
+    # Calculate byte sizes based on validated dimensions
+    # nope_part: [FP8 quantized data (dim_nope bytes)] + [FP32 scales (num_tiles * 4 bytes)]
+    # rope_part: [BF16 raw data (dim_rope * 2 bytes)]
+    nope_part_bytes = (
+        dim_nope + num_tiles * 4
+    )  # e.g., 512 + 4*4 = 528 for dim_nope=512, group_size=128
+    rope_part_bytes = (
+        dim_rope * k_rope.element_size()
+    )  # e.g., 64 * 2 = 128 for dim_rope=64, BF16
+
+    # Allocate two separate output buffers (as uint8 for direct byte-level access)
+    nope_part_u8 = torch.empty(
+        (num_tokens, nope_part_bytes), dtype=torch.uint8, device=k_nope.device
+    )
+    rope_part_u8 = torch.empty(
+        (num_tokens, rope_part_bytes), dtype=torch.uint8, device=k_rope.device
+    )
+
+    # Create typed views for the kernel to write into
+    # Fixed byte layout for nope_part: [nope_fp8 (dim_nope bytes) | scales_fp32 (num_tiles*4 bytes)]
+    # Fixed byte layout for rope_part: [rope_bf16 (dim_rope*2 bytes)]
+    nope_q_view = nope_part_u8[:, :dim_nope].view(torch.float8_e4m3fn)
+    nope_s_view = nope_part_u8[:, dim_nope:].view(torch.float32)
+    rope_view = rope_part_u8.view(torch.bfloat16)
+
+    # Kernel launch parameters
+    num_blocks_per_token = triton.cdiv(dim_nope + dim_rope, group_size)
+    NUM_NOPE_BLOCKS = dim_nope // group_size
+
+    # Use the same kernel as _quantize_k_cache_fast (reuse existing implementation)
+    _quantize_k_cache_fast_kernel[(num_tokens, num_blocks_per_token)](
+        nope_q_view,
+        nope_s_view,
+        rope_view,
+        k_nope,
+        k_rope,
+        nope_q_view.stride(0),
+        nope_s_view.stride(0),
+        rope_view.stride(0),
+        k_nope.stride(0),
+        k_rope.stride(0),
+        NUM_NOPE_BLOCKS=NUM_NOPE_BLOCKS,
+        GROUP_SIZE=group_size,
+        DIM_NOPE=dim_nope,
+        DIM_ROPE=dim_rope,
+        FP8_MIN=torch.finfo(torch.float8_e4m3fn).min,
+        FP8_MAX=torch.finfo(torch.float8_e4m3fn).max,
+    )
+
+    # Add middle dimension for compatibility with set_mla_kv_buffer_triton
+    return nope_part_u8.unsqueeze(1), rope_part_u8.unsqueeze(1)
+
+
+@triton.jit
+def _quantize_k_cache_fast_kernel(
+    output_nope_q_ptr,
+    output_nope_s_ptr,
+    output_rope_ptr,
+    k_nope_ptr,
+    k_rope_ptr,
+    output_nope_q_stride_0: int,
+    output_nope_s_stride_0: int,
+    output_rope_stride_0: int,
+    k_nope_stride_0: int,
+    k_rope_stride_0: int,
+    NUM_NOPE_BLOCKS: tl.constexpr,
+    GROUP_SIZE: tl.constexpr,
+    DIM_NOPE: tl.constexpr,
+    DIM_ROPE: tl.constexpr,
+    FP8_MIN: tl.constexpr,
+    FP8_MAX: tl.constexpr,
+):
+    token_id = tl.program_id(0)
+    raw_block_id = tl.program_id(1)
+
+    if raw_block_id < NUM_NOPE_BLOCKS:
+        # a. quant nope
+        effective_block_id = raw_block_id
+
+        offs = effective_block_id * GROUP_SIZE + tl.arange(0, GROUP_SIZE)
+        mask = offs < DIM_NOPE
+        ptr = k_nope_ptr + token_id * k_nope_stride_0 + offs
+
+        y = tl.load(ptr, mask=mask, other=0.0).to(tl.float32)
+
+        # the ref impl do not have a `tl.maximum(... eps)`, so we remove it here
+        y_s = tl.max(tl.abs(y)) / FP8_MAX
+        y_s_inv = 1.0 / y_s
+        y_q = tl.clamp(y * y_s_inv, FP8_MIN, FP8_MAX).to(
+            output_nope_q_ptr.dtype.element_ty
+        )
+
+        dst_q_ptr = output_nope_q_ptr + token_id * output_nope_q_stride_0 + offs
+        dst_s_ptr = (
+            output_nope_s_ptr + token_id * output_nope_s_stride_0 + effective_block_id
+        )
+
+        tl.store(dst_q_ptr, y_q, mask=mask)
+        tl.store(dst_s_ptr, y_s)
+    else:
+        # b. copy rope
+        effective_block_id = raw_block_id - NUM_NOPE_BLOCKS
+
+        offs = effective_block_id * GROUP_SIZE + tl.arange(0, GROUP_SIZE)
+        mask = offs < DIM_ROPE
+
+        src_ptr = k_rope_ptr + token_id * k_rope_stride_0 + offs
+        dst_ptr = output_rope_ptr + token_id * output_rope_stride_0 + offs
+
+        data = tl.load(src_ptr, mask=mask)
+        tl.store(dst_ptr, data, mask=mask)
+
+
+if __name__ == "__main__":
+    import dequant_k_cache
+
+    for num_blocks, block_size in [
+        (1, 1),
+        (10, 64),
+    ]:
+        dim_nope_and_rope = 512 + 64
+
+        input_k_cache = torch.randn(
+            (num_blocks, block_size, 1, dim_nope_and_rope),
+            dtype=torch.bfloat16,
+            device="cuda",
+        )
+
+        ref_quant = _quantize_k_cache_ref(input_k_cache)
+        actual_quant = _quantize_k_cache_fast_wrapped(input_k_cache)
+
+        ref_ref_dequant = dequant_k_cache._dequantize_k_cache_slow(ref_quant)
+        ref_actual_dequant = dequant_k_cache._dequantize_k_cache_fast_wrapped(ref_quant)
+        actual_actual_dequant = dequant_k_cache._dequantize_k_cache_fast_wrapped(
+            actual_quant
+        )
+
+        print(f"{ref_ref_dequant=}")
+        print(f"{actual_actual_dequant=}")
+        print(f"{actual_actual_dequant - ref_ref_dequant=}")
+        print(f"{torch.mean(ref_ref_dequant - actual_actual_dequant)=}")
+
+        # TODO too different?
+        torch.testing.assert_close(
+            ref_ref_dequant, ref_actual_dequant, atol=0.2, rtol=0.2
+        )
+        torch.testing.assert_close(
+            ref_ref_dequant, actual_actual_dequant, atol=0.2, rtol=0.2
+        )
+
+        # test dequant_k_cache_paged
+        page_table_1 = torch.arange(
+            num_blocks * block_size, dtype=torch.int32, device="cuda"
+        )
+        actual_dequant_paged = dequant_k_cache.dequantize_k_cache_paged(
+            actual_quant, page_table_1
+        ).reshape(actual_actual_dequant.shape)
+        print(f"{torch.mean(actual_actual_dequant - actual_dequant_paged)=}")
+        torch.testing.assert_close(
+            ref_ref_dequant, actual_dequant_paged, atol=0.2, rtol=0.2
+        )
+
+    print("Passed")
+
+    # Test quantize_k_cache_separate: verify output matches concat path
+    print("\nTesting quantize_k_cache_separate...")
+    for num_tokens in [64, 100]:
+        dim_nope = 512
+        dim_rope = 64
+
+        k_nope = torch.randn(
+            num_tokens, 1, dim_nope, dtype=torch.bfloat16, device="cuda"
+        )
+        k_rope = torch.randn(
+            num_tokens, 1, dim_rope, dtype=torch.bfloat16, device="cuda"
+        )
+
+        # Old path: concat then quantize
+        k_concat = torch.cat([k_nope, k_rope], dim=-1).squeeze(1)  # (num_tokens, 576)
+        old_output = quantize_k_cache(k_concat.unsqueeze(1).unsqueeze(1))  # 4D input
+        old_output = old_output.squeeze(1).squeeze(1)  # Back to (num_tokens, 656)
+
+        # New path: quantize separately
+        nope_part, rope_part = quantize_k_cache_separate(k_nope, k_rope)
+        new_bytes = torch.cat([nope_part.squeeze(1), rope_part.squeeze(1)], dim=-1)
+
+        # Compare byte-level equality
+        old_bytes = old_output.view(torch.uint8)
+
+        if old_bytes.shape != new_bytes.shape:
+            raise RuntimeError(
+                f"Shape mismatch: {old_bytes.shape} vs {new_bytes.shape}"
+            )
+
+        diff_bytes = (old_bytes != new_bytes).sum().item()
+        if diff_bytes > 0:
+            max_diff = (old_bytes.float() - new_bytes.float()).abs().max().item()
+            raise RuntimeError(
+                f"quantize_k_cache_separate output doesn't match concat path: "
+                f"{diff_bytes} differing bytes, max_diff={max_diff}"
+            )
+
+        print(f"  num_tokens={num_tokens}: PASSED (outputs match byte-wise)")
+
+    print("quantize_k_cache_separate tests passed!")
+
+    print("\nDo benchmark...")
+
+    for num_blocks, block_size in [
+        (1, 64),
+        (64, 64),
+        (128, 64),
+        (256, 64),
+        (512, 64),
+        (1024, 64),
+        (2048, 64),
+    ]:
+        dim_nope_and_rope = 512 + 64
+
+        input_k_cache = torch.randn(
+            (num_blocks, block_size, 1, dim_nope_and_rope),
+            dtype=torch.bfloat16,
+            device="cuda",
+        )
+
+        actual_quant = _quantize_k_cache_fast_wrapped(input_k_cache)
+
+        page_table_1 = torch.arange(
+            num_blocks * block_size, dtype=torch.int32, device="cuda"
+        )
+
+        def run_ans():
+            return dequant_k_cache.dequantize_k_cache_paged(actual_quant, page_table_1)
+
+        ans_time: float = triton.testing.do_bench(run_ans, warmup=10, rep=20) / 1000  # type: ignore
+        print(f"seq_kv: {num_blocks * block_size}, time: {ans_time * 1e6: 4.0f} us")
diff --git a/sglang/python/sglang/srt/layers/attention/nsa/tilelang_kernel.py b/sglang/python/sglang/srt/layers/attention/nsa/tilelang_kernel.py
new file mode 100644
index 0000000000000000000000000000000000000000..244e9b46e772751998e010c6847d4b76046e2259
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/nsa/tilelang_kernel.py
@@ -0,0 +1,1066 @@
+from typing import Optional, Tuple
+
+import tilelang
+import tilelang.language as T
+import torch
+
+from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
+from sglang.srt.utils import is_gfx95_supported, is_hip
+
+tilelang.set_log_level("WARNING")
+
+pass_configs = {
+    tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
+    tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
+}
+# TL_DISABLE_FAST_MATH has deprecated in v0.1.7.post1 tilelang
+if hasattr(tilelang.PassConfigKey, "TL_DISABLE_FAST_MATH"):
+    pass_configs[tilelang.PassConfigKey.TL_DISABLE_FAST_MATH] = True
+elif hasattr(tilelang.PassConfigKey, "TL_ENABLE_FAST_MATH"):
+    pass_configs[tilelang.PassConfigKey.TL_ENABLE_FAST_MATH] = False
+
+_is_hip = is_hip()
+_is_gfx95_supported = is_gfx95_supported()
+_is_fp8_fnuz = is_fp8_fnuz()
+
+BF16 = "bfloat16"
+FP8 = "float8_e4m3fnuz" if _is_fp8_fnuz else "float8_e4m3"
+FP32 = "float32"
+
+
+def fast_log2_ceil(x):
+    bits_x = T.reinterpret("uint32", x)
+    exp_x = (bits_x >> 23) & 0xFF
+    man_bits = bits_x & ((1 << 23) - 1)
+    return T.Cast("int32", exp_x - 127 + T.if_then_else(man_bits != 0, 1, 0))
+
+
+def fast_pow2(x):
+    bits_x = (x + 127) << 23
+    return T.reinterpret("float32", bits_x)
+
+
+def fast_round_scale(amax, fp8_max_inv):
+    return fast_pow2(fast_log2_ceil(amax * fp8_max_inv))
+
+
+@tilelang.jit(pass_configs=pass_configs)
+def act_quant_kernel(
+    N, in_dtype=BF16, out_dtype=FP8, scale_dtype=FP32, round_scale=False
+):
+    M = T.symbolic("M")
+    fp8_min = -224.0 if _is_fp8_fnuz else -448.0
+    fp8_max = 224.0 if _is_fp8_fnuz else 448.0
+    fp8_max_inv = 1 / fp8_max
+    num_stages = 0 if round_scale else 2
+    blk_m = 32
+    group_size = 128
+
+    @T.prim_func
+    def act_quant_kernel_(
+        X: T.Tensor[(M, N), in_dtype],
+        Y: T.Tensor[(M, N), out_dtype],
+        S: T.Tensor[(M, T.ceildiv(N, group_size)), scale_dtype],
+    ):
+        with T.Kernel(T.ceildiv(M, blk_m), T.ceildiv(N, group_size), threads=128) as (
+            pid_m,
+            pid_n,
+        ):
+            x_shared = T.alloc_shared((blk_m, group_size), in_dtype)
+            x_local = T.alloc_fragment((blk_m, group_size), in_dtype)
+            amax_local = T.alloc_fragment((blk_m,), scale_dtype)
+            s_local = T.alloc_fragment((blk_m,), scale_dtype)
+            y_local = T.alloc_fragment((blk_m, group_size), out_dtype)
+            y_shared = T.alloc_shared((blk_m, group_size), out_dtype)
+
+            for _ in T.Pipelined(1, num_stages=num_stages):
+                T.copy(X[pid_m * blk_m, pid_n * group_size], x_shared)
+                T.copy(x_shared, x_local)
+                T.reduce_absmax(x_local, amax_local, dim=1)
+                for i in T.Parallel(blk_m):
+                    amax_local[i] = T.max(amax_local[i], 1e-4)
+                    if round_scale:
+                        s_local[i] = fast_round_scale(amax_local[i], fp8_max_inv)
+                    else:
+                        s_local[i] = amax_local[i] * fp8_max_inv
+                for i, j in T.Parallel(blk_m, group_size):
+                    y_local[i, j] = T.clamp(
+                        x_local[i, j] / s_local[i], fp8_min, fp8_max
+                    )
+                for i in T.Parallel(blk_m):
+                    S[pid_m * blk_m + i, pid_n] = s_local[i]
+                T.copy(y_local, y_shared)
+                T.copy(y_shared, Y[pid_m * blk_m, pid_n * group_size])
+
+    return act_quant_kernel_
+
+
+def act_quant(
+    x: torch.Tensor, block_size: int = 128, scale_fmt: Optional[str] = None
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Quantizes the input tensor `x` using block-wise quantization.
+
+    Args:
+        x (torch.Tensor): The input tensor to be quantized. Must be contiguous and its last dimension size must be divisible by `block_size`.
+        block_size (int, optional): The size of the blocks to be used for quantization. Default is 128.
+        scale_fmt (Optional[str], optional): The format of the scale. Default is None.
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
+            - The quantized tensor with dtype `torch.float8_e4m3fn`.
+            - A tensor of scaling factors with dtype `torch.float32`.
+    """
+    assert x.is_contiguous(), "Input tensor must be contiguous"
+    assert (
+        x.size(-1) % block_size == 0
+    ), f"Last dimension size must be divisible by block_size (block_size={block_size})"
+    N = x.size(-1)
+    if _is_fp8_fnuz:
+        y = torch.empty_like(x, dtype=torch.float8_e4m3fnuz)
+    else:
+        y = torch.empty_like(x, dtype=torch.float8_e4m3fn)
+    s = x.new_empty(*x.size()[:-1], N // block_size, dtype=torch.float32)
+    kernel = act_quant_kernel(N, round_scale=scale_fmt is not None)
+    kernel(x.view(-1, N), y.view(-1, N), s.view(-1, N // block_size))
+    return y, s
+
+
+@tilelang.jit(out_idx=[4], pass_configs=pass_configs)
+def fp8_index_kernel(h: int, d: int, clear_accum=True):
+    b = T.symbolic("b")
+    m = T.symbolic("m")
+    n = T.symbolic("n")
+
+    blk_n1 = 512
+    blk_n2 = 128
+
+    @T.prim_func
+    def fp8_index_kernel_(
+        q: T.Tensor[(b, m, h, d), FP8],
+        q_s: T.Tensor[(b, m, h), FP32],
+        k: T.Tensor[(b, n, d), FP8],
+        k_s: T.Tensor[(b, n), FP32],
+        o: T.Tensor[(b, m, n), FP32],
+    ) -> None:
+        with T.Kernel(b, m, T.ceildiv(n, blk_n1)) as (i_b, i_m, i1_n):
+            q_smem = T.alloc_shared((h, d), FP8)
+            T.copy(q[i_b, i_m, 0, 0], q_smem)
+
+            q_s_frag = T.alloc_fragment(h, FP32)
+            T.copy(q_s[i_b, i_m, 0], q_s_frag)
+
+            for i2_n in T.Pipelined(blk_n1 // blk_n2, num_stages=2):
+                k_smem = T.alloc_shared((blk_n2, d), FP8)
+                T.copy(k[i_b, i1_n * blk_n1 + i2_n * blk_n2, 0], k_smem)
+
+                k_s_frag = T.alloc_fragment(blk_n2, FP32)
+                T.copy(k_s[i_b, i1_n * blk_n1 + i2_n * blk_n2], k_s_frag)
+
+                logits = T.alloc_fragment((blk_n2, h), FP32)
+                if not clear_accum:
+                    T.fill(logits, 0)
+                T.gemm(
+                    k_smem,
+                    q_smem,
+                    logits,
+                    transpose_A=False,
+                    transpose_B=True,
+                    clear_accum=clear_accum,
+                )
+
+                for i_h, i3_n in T.Parallel(h, blk_n2):
+                    logits[i3_n, i_h] = T.max(logits[i3_n, i_h], 0) * q_s_frag[i_h]
+
+                logits_sum = T.alloc_fragment(blk_n2, FP32)
+                T.reduce_sum(logits, logits_sum, dim=1)
+
+                for i3_n in T.Parallel(blk_n2):
+                    logits_sum[i3_n] *= k_s_frag[i3_n]
+
+                T.copy(logits_sum, o[i_b, i_m, i1_n * blk_n1 + i2_n * blk_n2])
+
+    return fp8_index_kernel_
+
+
+def fp8_index(
+    q: torch.Tensor,
+    q_s: torch.Tensor,
+    k: torch.Tensor,
+    k_s: torch.Tensor,
+) -> torch.Tensor:
+    """
+    Perform index score using FP8 precision.
+
+    Args:
+        q (torch.Tensor): The Q tensor, must be contiguous.
+        q_s (torch.Tensor): The scaling factor for Q (float), must be contiguous.
+        k (torch.Tensor): The K tensor, must be contiguous.
+        k_s (torch.Tensor): The scaling factor for K (e8m0 here), must be contiguous.
+
+        fp8 q @ fp8 k -> fp32 logits
+        relu(fp32 logits) * q_s (weights) -> fp32 logits
+        fp32 logits -> fp32 logits_sum
+        fp32 logits_sum * k_s (e8m0) -> fp32 index_score
+    """
+    if _is_hip:
+        return fp8_index_kernel(q.shape[2], q.shape[3], False)(q, q_s, k, k_s)
+    else:
+        return fp8_index_kernel(q.shape[2], q.shape[3])(q, q_s, k, k_s)
+
+
+@tilelang.jit(
+    out_idx=[-1],
+    pass_configs={
+        tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
+        tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
+    },
+)
+def sparse_attention_fwd_kernel_v1(
+    num_heads,
+    dim,
+    tail_dim,
+    topk,
+    *,
+    kv_group=1,
+    sm_scale=None,
+    is_causal=True,
+    block_I=64,
+    num_stages=2,
+    threads=256,
+):
+    assert dim == tilelang.math.next_power_of_2(
+        dim
+    ), f"haven't check padding correctness yet, dim={dim}"
+    assert tail_dim == tilelang.math.next_power_of_2(
+        tail_dim
+    ), f"haven't check padding correctness yet, dim={tail_dim}"
+    assert is_causal == True, "non-casual is not supported"
+    assert (
+        topk % block_I == 0
+    ), "otherwise will load some index=0 thus causing wrong kv to be loaded"
+    if sm_scale is None:
+        sm_scale = (1.0 / (dim + tail_dim)) ** 0.5 * 1.44269504  # log2(e)
+    else:
+        sm_scale = sm_scale * 1.44269504  # log2(e)
+
+    batch = T.symbolic("batch")
+    seq_len = T.symbolic("seq_len")
+    seq_len_kv = T.symbolic("seq_len_kv")
+
+    head_kv = num_heads // kv_group
+    q_shape = [batch, seq_len, num_heads, dim + tail_dim]
+    kv_shape = [batch, seq_len_kv, kv_group, dim + tail_dim]
+    o_shape = [batch, seq_len, num_heads, dim]
+    indices_shape = [batch, seq_len, kv_group, topk]
+    indices_dtype = "int32"
+    dtype = "bfloat16"
+    accum_dtype = "float"
+
+    H = head_kv
+    padded_H = max(tilelang.math.next_power_of_2(head_kv), 16)
+    if padded_H != H:
+        assert kv_group == 1
+    BI = block_I
+    NI = tilelang.cdiv(topk, block_I)
+    D = dim
+    D_tail = tail_dim
+
+    if head_kv > 64:
+        assert head_kv % 64 == 0, "head_kv should be a multiple of 64"
+        REPLICATE_H = head_kv // 64
+    else:
+        REPLICATE_H = 1
+
+    H_per_block = padded_H if REPLICATE_H == 1 else 64
+
+    @T.prim_func
+    def main(
+        Q: T.Tensor(q_shape, dtype),  # type: ignore
+        KV: T.Tensor(kv_shape, dtype),  # type: ignore
+        Indices: T.Tensor(indices_shape, indices_dtype),  # type: ignore
+        Output: T.Tensor(o_shape, dtype),  # type: ignore
+    ):
+        with T.Kernel(seq_len * REPLICATE_H, batch, kv_group, threads=threads) as (
+            bx,
+            by,
+            bz,
+        ):
+            Q_shared = T.alloc_shared([H_per_block, D], dtype)
+            Q_tail_shared = T.alloc_shared([H_per_block, D_tail], dtype)
+            KV_shared = T.alloc_shared([BI, D], dtype)
+            K_tail_shared = T.alloc_shared([BI, D_tail], dtype)
+            O_shared = T.alloc_shared([H_per_block, D], dtype)
+            mask = T.alloc_fragment([BI], "bool")
+
+            acc_o = T.alloc_fragment([H_per_block, D], accum_dtype)
+            acc_s = T.alloc_fragment([H_per_block, BI], accum_dtype)
+            S_shared = T.alloc_shared([H_per_block, BI], dtype)
+            sumexp = T.alloc_fragment([H_per_block], accum_dtype)
+            sumexp_i = T.alloc_fragment([H_per_block], accum_dtype)
+            alpha = T.alloc_fragment([H_per_block], accum_dtype)
+            m_i = T.alloc_fragment([H_per_block], accum_dtype)
+            m_i_prev = T.alloc_fragment([H_per_block], accum_dtype)
+
+            T.fill(acc_o, 0)
+            T.fill(sumexp, 0)
+            T.fill(m_i, -(2**30))  # avoid -inf - inf to cause nan
+
+            b_i, g_i = by, bz
+            s_i = bx if REPLICATE_H == 1 else (bx // REPLICATE_H)
+            q_i = s_i
+            max_kv_i = q_i
+
+            H0 = g_i * padded_H + (0 if REPLICATE_H == 1 else (bx % REPLICATE_H) * 64)
+            H1 = H0 + H_per_block
+
+            T.copy(Q[b_i, s_i, H0:H1, :D], Q_shared)
+            T.copy(Q[b_i, s_i, H0:H1, D:], Q_tail_shared)
+
+            for i_i in T.Pipelined(NI, num_stages=num_stages):
+
+                for bi_i in T.Parallel(BI):
+                    mask[bi_i] = Indices[b_i, s_i, g_i, i_i * BI + bi_i] >= 0
+
+                for bi_i, d_i in T.Parallel(BI, D):
+                    KV_shared[bi_i, d_i] = KV[
+                        b_i, Indices[b_i, s_i, g_i, i_i * BI + bi_i], g_i, d_i
+                    ]
+                for bi_i, d_i in T.Parallel(BI, D_tail):
+                    K_tail_shared[bi_i, d_i] = KV[
+                        b_i, Indices[b_i, s_i, g_i, i_i * BI + bi_i], g_i, D + d_i
+                    ]
+
+                for h_i, bi_i in T.Parallel(H_per_block, BI):
+                    acc_s[h_i, bi_i] = T.if_then_else(
+                        mask[bi_i], 0, -T.infinity(acc_s.dtype)
+                    )
+                T.gemm(
+                    Q_shared,
+                    KV_shared,
+                    acc_s,
+                    transpose_B=True,
+                    policy=T.GemmWarpPolicy.FullCol,
+                )
+                T.gemm(
+                    Q_tail_shared,
+                    K_tail_shared,
+                    acc_s,
+                    transpose_B=True,
+                    policy=T.GemmWarpPolicy.FullCol,
+                )
+                T.copy(m_i, m_i_prev)
+                T.reduce_max(acc_s, m_i, dim=1, clear=False)
+                for h_i in T.Parallel(H_per_block):
+                    alpha[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale)
+                for h_i, bi_i in T.Parallel(H_per_block, BI):
+                    acc_s[h_i, bi_i] = T.exp2(
+                        acc_s[h_i, bi_i] * sm_scale - m_i[h_i] * sm_scale
+                    )
+                T.reduce_sum(acc_s, sumexp_i, dim=1)  # is this a accumulate operator?
+                for h_i in T.Parallel(H_per_block):
+                    sumexp[h_i] = sumexp[h_i] * alpha[h_i] + sumexp_i[h_i]
+                for h_i, d_i in T.Parallel(H_per_block, D):
+                    acc_o[h_i, d_i] = acc_o[h_i, d_i] * alpha[h_i]
+
+                T.copy(acc_s, S_shared)
+                T.gemm(S_shared, KV_shared, acc_o, policy=T.GemmWarpPolicy.FullCol)
+
+            # Rescale
+            for h_i, d_i in T.Parallel(H_per_block, D):
+                acc_o[h_i, d_i] /= sumexp[h_i]
+            for h_i in T.Parallel(H_per_block):
+                sumexp[h_i] = T.log2(sumexp[h_i]) + m_i[h_i] * sm_scale
+
+            T.copy(acc_o, O_shared)
+            T.copy(acc_o, Output[b_i, s_i, H0:H1, :])
+
+    return main
+
+
+@tilelang.jit(
+    out_idx=[-1],
+    compile_flags=[
+        "-O3",
+        "-Wno-deprecated-declarations",
+        "-U__CUDA_NO_HALF_OPERATORS__",
+        "-U__CUDA_NO_HALF_CONVERSIONS__",
+        "-U__CUDA_NO_HALF2_OPERATORS__",
+        "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
+        "--expt-relaxed-constexpr",
+        "--expt-extended-lambda",
+        "--ptxas-options=-v,--register-usage-level=10",
+        "-DNDEBUG",
+    ],
+)  # type: ignore
+def sparse_attention_fwd_kernel_v2(
+    num_heads: int,
+    dim: int,
+    tail_dim: int,
+    topk: int,
+    *,
+    kv_group: int = 1,
+    sm_scale: Optional[float] = None,
+    block_I: int = 64,
+):
+    assert dim == tilelang.math.next_power_of_2(
+        dim
+    ), f"haven't check padding correctness yet, dim={dim}"
+    assert tail_dim == tilelang.math.next_power_of_2(
+        tail_dim
+    ), f"haven't check padding correctness yet, dim={tail_dim}"
+    assert (
+        topk % block_I == 0
+    ), "otherwise will load some index=0 thus causing wrong kv to be loaded"
+    if sm_scale is None:
+        sm_scale = (1.0 / (dim + tail_dim)) ** 0.5 * 1.44269504  # log2(e)
+    else:
+        sm_scale = sm_scale * 1.44269504  # log2(e)
+    threads = 384
+
+    batch = T.symbolic("batch")
+    qo_len = T.symbolic("seq_len")
+    num_pages = T.symbolic("num_pages")
+
+    q_shape = [batch, qo_len, num_heads, dim + tail_dim]
+    kv_shape = [batch, num_pages, kv_group, dim + tail_dim]
+    o_shape = [batch, qo_len, num_heads, dim]
+    indices_shape = [batch, qo_len, kv_group, topk]
+
+    indices_dtype = "int32"
+    dtype = "bfloat16"
+    accum_dtype = "float"
+
+    H = num_heads
+    padded_H = max(tilelang.math.next_power_of_2(num_heads), 16)
+    if padded_H != H:
+        assert kv_group == 1
+    BI = block_I
+    NI = tilelang.cdiv(topk, block_I)
+    assert NI % 2 == 0, "NI should be a multiple of 2"
+    D = dim
+    D_tail = tail_dim
+    if num_heads > 64:
+        assert num_heads % 64 == 0, "head_kv should be a multiple of 64"
+        REPLICATE_H = num_heads // 64
+    else:
+        REPLICATE_H = 1
+
+    H_per_block = padded_H if REPLICATE_H == 1 else 64
+
+    @T.prim_func
+    def main(
+        Q: T.Tensor(q_shape, dtype),  # type: ignore
+        KV: T.Tensor(kv_shape, dtype),  # type: ignore
+        Indices: T.Tensor(indices_shape, indices_dtype),  # type: ignore
+        Output: T.Tensor(o_shape, dtype),  # type: ignore
+    ):
+        """
+        Q: [b, qo_len, H, D + D_tail] (bfloat16)
+        KV: [b, num_pages, kv_group, D + D_tail] (bfloat16)
+        Indices: [b, qo_len, kv_group, topk] (int32)
+        """
+
+        with T.Kernel(qo_len * REPLICATE_H, batch, 1, threads=threads) as (bx, by, bz):  # type: ignore
+            Q_shared_l = T.alloc_shared([H_per_block, D // 2], dtype)
+            Q_shared_r = T.alloc_shared([H_per_block, D // 2], dtype)
+            Q_tail_shared = T.alloc_shared([H_per_block, D_tail], dtype)
+            KV_shared_0_l = T.alloc_shared([BI, D // 2], dtype)
+            KV_shared_0_r = T.alloc_shared([BI, D // 2], dtype)
+            KV_shared_1_l = T.alloc_shared([BI, D // 2], dtype)
+            KV_shared_1_r = T.alloc_shared([BI, D // 2], dtype)
+            K_tail_shared_0 = T.alloc_shared([BI, D_tail], dtype)
+            K_tail_shared_1 = T.alloc_shared([BI, D_tail], dtype)
+            O_shared_l = Q_shared_l
+            O_shared_r = Q_shared_r
+            is_kv_valid_0 = T.alloc_shared([BI], "bool", scope="shared")
+            is_kv_valid_1 = T.alloc_shared([BI], "bool", scope="shared")
+
+            acc_o_l = T.alloc_fragment([H_per_block, D // 2], accum_dtype)
+            acc_o_r = T.alloc_fragment([H_per_block, D // 2], accum_dtype)
+            acc_s = T.alloc_fragment([H_per_block, BI], accum_dtype)
+            S_shared = T.alloc_shared([H_per_block, BI], dtype)
+            sumexp = T.alloc_fragment([H_per_block], accum_dtype)
+            sum_exp_shared = T.alloc_shared([H_per_block], accum_dtype)
+            sumexp_i = T.alloc_fragment([H_per_block], accum_dtype)
+            alpha_shared = T.alloc_shared([H_per_block], accum_dtype, scope="shared")
+            alpha_local = T.alloc_fragment([H_per_block], accum_dtype)
+            m_i = T.alloc_fragment([H_per_block], accum_dtype)
+            m_i_prev = T.alloc_fragment([H_per_block], accum_dtype)
+            indices_local = T.alloc_local([1], indices_dtype)
+            indices_tmp = T.alloc_local([1], indices_dtype)
+
+            bar_q = T.alloc_barrier(arrive_count=384)
+            bar_k_0_ready = T.alloc_barrier(arrive_count=128)
+            bar_k_1_ready = T.alloc_barrier(arrive_count=128)
+            bar_k_0_free = T.alloc_barrier(arrive_count=256)
+            bar_k_1_free = T.alloc_barrier(arrive_count=256)
+            bar_sScale_and_sS_ready = T.alloc_barrier(arrive_count=256)
+            bar_sScale_and_sS_free = T.alloc_barrier(arrive_count=256)
+
+            bar_0_128 = T.alloc_barrier(arrive_count=128)
+            bar_1_128 = T.alloc_barrier(arrive_count=128)
+            bar_2_128 = T.alloc_barrier(arrive_count=128)
+            bar_final = T.alloc_barrier(arrive_count=128)
+
+            b_i, g_i = by, bz
+            s_i = bx if REPLICATE_H == 1 else bx // REPLICATE_H
+
+            H0 = g_i * padded_H + (0 if REPLICATE_H == 1 else (bx % REPLICATE_H) * 64)
+            H1 = H0 + H_per_block
+
+            tx = T.get_thread_binding()
+
+            T.copy(Q[b_i, s_i, H0:H1, 0 : D // 2], Q_shared_l)
+            T.copy(Q[b_i, s_i, H0:H1, D // 2 : D], Q_shared_r)
+            T.copy(Q[b_i, s_i, H0:H1, D:], Q_tail_shared)
+            T.barrier_arrive(bar_q)
+
+            if tx < 128:
+                T.set_max_nreg(240, 1)
+                T.fill(sumexp, 0)
+                T.fill(m_i, -(2**30))  # avoid -inf - inf to cause nan
+                T.fill(acc_o_l, 0)
+                T.barrier_wait(bar_q, 0)
+
+                for i_i in T.serial(T.ceildiv(NI, 2)):
+                    # Buffer 0
+                    # with sync_at(bar_0_128, 0):
+                    T.barrier_wait(bar_k_0_ready[0], (i_i & 1))
+                    T.barrier_arrive(bar_0_128)
+                    T.barrier_wait(bar_0_128, 0)
+
+                    for h_i, bi_i in T.Parallel(H_per_block, BI):
+                        acc_s[h_i, bi_i] = T.if_then_else(
+                            is_kv_valid_0[bi_i], 0, -T.infinity(acc_s.dtype)
+                        )
+                    T.gemm(
+                        Q_shared_l, KV_shared_0_l, acc_s, transpose_B=True, wg_wait=-1
+                    )
+                    T.gemm(
+                        Q_shared_r, KV_shared_0_r, acc_s, transpose_B=True, wg_wait=-1
+                    )
+                    T.gemm(
+                        Q_tail_shared,
+                        K_tail_shared_0,
+                        acc_s,
+                        transpose_B=True,
+                        wg_wait=-1,
+                    )
+
+                    T.wait_wgmma(0)
+
+                    if i_i != 0:
+                        T.barrier_arrive(bar_sScale_and_sS_free)
+                        T.barrier_wait(bar_sScale_and_sS_free, ((i_i * 2) & 1) ^ 1)
+
+                    T.copy(m_i, m_i_prev)
+                    T.reduce_max(acc_s, m_i, dim=1, clear=False)
+                    for h_i in T.Parallel(H_per_block):
+                        alpha_local[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale)
+                    for h_i, bi_i in T.Parallel(H_per_block, BI):
+                        acc_s[h_i, bi_i] = T.exp2(
+                            acc_s[h_i, bi_i] * sm_scale - m_i[h_i] * sm_scale
+                        )
+                    T.reduce_sum(
+                        acc_s, sumexp_i, dim=1
+                    )  # is this a accumulate operator?
+                    for h_i in T.Parallel(H_per_block):
+                        sumexp[h_i] = sumexp[h_i] * alpha_local[h_i] + sumexp_i[h_i]
+                    for h_i, d_i in T.Parallel(H_per_block, D // 2):
+                        acc_o_l[h_i, d_i] *= alpha_local[h_i]
+                    T.copy(alpha_local, alpha_shared)
+
+                    T.copy(acc_s, S_shared)
+                    T.gemm(S_shared, KV_shared_0_l, acc_o_l)
+
+                    T.barrier_arrive(bar_sScale_and_sS_ready)
+                    T.barrier_arrive(bar_k_0_free[0])
+
+                    # Buffer 1
+                    T.barrier_wait(bar_k_1_ready[0], (i_i & 1))
+                    T.barrier_arrive(bar_0_128)
+                    T.barrier_wait(bar_0_128, 1)
+
+                    for h_i, bi_i in T.Parallel(H_per_block, BI):
+                        acc_s[h_i, bi_i] = T.if_then_else(
+                            is_kv_valid_1[bi_i], 0, -T.infinity(acc_s.dtype)
+                        )
+                    T.gemm(
+                        Q_shared_l, KV_shared_1_l, acc_s, transpose_B=True, wg_wait=-1
+                    )
+                    T.gemm(
+                        Q_shared_r, KV_shared_1_r, acc_s, transpose_B=True, wg_wait=-1
+                    )
+                    T.gemm(
+                        Q_tail_shared,
+                        K_tail_shared_1,
+                        acc_s,
+                        transpose_B=True,
+                        wg_wait=-1,
+                    )
+
+                    T.wait_wgmma(0)
+
+                    T.barrier_arrive(bar_sScale_and_sS_free)
+                    T.barrier_wait(bar_sScale_and_sS_free, ((i_i * 2 + 1) & 1) ^ 1)
+
+                    T.copy(m_i, m_i_prev)
+                    T.reduce_max(acc_s, m_i, dim=1, clear=False)
+                    for h_i in T.Parallel(H_per_block):
+                        alpha_local[h_i] = T.exp2((m_i_prev[h_i] - m_i[h_i]) * sm_scale)
+                    for h_i, bi_i in T.Parallel(H_per_block, BI):
+                        acc_s[h_i, bi_i] = T.exp2(
+                            acc_s[h_i, bi_i] * sm_scale - m_i[h_i] * sm_scale
+                        )
+                    T.reduce_sum(
+                        acc_s, sumexp_i, dim=1
+                    )  # is this a accumulate operator?
+                    for h_i in T.Parallel(H_per_block):
+                        sumexp[h_i] = sumexp[h_i] * alpha_local[h_i] + sumexp_i[h_i]
+                    for h_i, d_i in T.Parallel(H_per_block, D // 2):
+                        acc_o_l[h_i, d_i] *= alpha_local[h_i]
+                    T.copy(alpha_local, alpha_shared)
+
+                    T.copy(acc_s, S_shared)
+                    T.gemm(S_shared, KV_shared_1_l, acc_o_l)
+
+                    T.barrier_arrive(bar_sScale_and_sS_ready)
+                    T.barrier_arrive(bar_k_1_free[0])
+
+                # Rescale
+                for h_i in T.Parallel(H_per_block):
+                    sum_exp_shared[h_i] = sumexp[h_i]
+                T.barrier_arrive(bar_final)
+                for h_i, d_i in T.Parallel(H_per_block, D // 2):
+                    acc_o_l[h_i, d_i] /= sumexp[h_i]
+                for h_i in T.Parallel(H_per_block):
+                    sumexp[h_i] = T.log2(sumexp[h_i]) + m_i[h_i] * sm_scale
+                T.copy(acc_o_l, O_shared_l)
+                T.copy(O_shared_l, Output[b_i, s_i, H0:H1, 0 : D // 2])
+            elif tx >= 128 and tx < 256:
+                # T.set_max_nreg(168, 1)
+                T.fill(acc_o_r, 0)
+                for i_i in T.serial(T.ceildiv(NI, 2)):
+                    # Buffer 0
+                    T.barrier_arrive(bar_sScale_and_sS_ready)
+                    T.barrier_wait(bar_sScale_and_sS_ready, ((i_i * 2) & 1))
+                    T.barrier_arrive(bar_1_128)
+                    T.barrier_wait(bar_1_128, 0)
+                    for h_i, d_i in T.Parallel(H_per_block, D // 2):
+                        acc_o_r[h_i, d_i] *= alpha_shared[h_i]
+                    T.gemm(S_shared, KV_shared_0_r, acc_o_r)
+                    T.barrier_arrive(bar_k_0_free[0])
+                    T.barrier_arrive(bar_sScale_and_sS_free)
+
+                    # Buffer 1
+                    T.barrier_arrive(bar_sScale_and_sS_ready)
+                    T.barrier_wait(bar_sScale_and_sS_ready, ((i_i * 2 + 1) & 1))
+                    T.barrier_arrive(bar_1_128)
+                    T.barrier_wait(bar_1_128, 1)
+                    for h_i, d_i in T.Parallel(H_per_block, D // 2):
+                        acc_o_r[h_i, d_i] *= alpha_shared[h_i]
+                    T.gemm(S_shared, KV_shared_1_r, acc_o_r)
+                    T.barrier_arrive(bar_k_1_free[0])
+                    if i_i != T.ceildiv(NI, 2) - 1:
+                        T.barrier_arrive(bar_sScale_and_sS_free)
+
+                # Rescale
+                T.barrier_wait(bar_final, 0)
+                for h_i, d_i in T.Parallel(H_per_block, D // 2):
+                    acc_o_r[h_i, d_i] /= sum_exp_shared[h_i]
+
+                T.copy(acc_o_r, O_shared_r)
+                T.copy(O_shared_r, Output[b_i, s_i, H0:H1, D // 2 : D])
+            elif tx >= 256:
+                # producer
+                T.set_max_nreg(80, 0)
+                indices_local[0] = 0
+                for i_i in T.serial(T.ceildiv(NI, 2)):
+                    # Buffer 0
+                    T.barrier_wait(bar_k_0_free[0], ((i_i & 1) ^ 1))
+                    T.barrier_arrive(bar_2_128)
+                    T.barrier_wait(bar_2_128, 0)
+
+                    for r in T.serial(4):
+                        indices_tmp[0] = Indices[
+                            b_i, s_i, g_i, (i_i * 2) * BI + r * 16 + (tx - 256) // 8
+                        ]
+                        is_kv_valid_0[r * 16 + (tx - 256) // 8] = indices_tmp[0] >= 0
+                        if is_kv_valid_0[r * 16 + (tx - 256) // 8]:
+                            indices_local[0] = indices_tmp[0]
+
+                        with T.attr("default", "async_scope", 1):  # type: ignore
+                            for u in T.serial(4):
+                                for v in T.vectorized(8):
+                                    KV_shared_0_l[
+                                        r * 16 + (tx - 256) // 8,
+                                        64 * u + (tx - 256) % 8 * 8 + v,
+                                    ] = KV[
+                                        b_i,
+                                        indices_local[0],
+                                        g_i,
+                                        64 * u + (tx - 256) % 8 * 8 + v,
+                                    ]
+                                    KV_shared_0_r[
+                                        r * 16 + (tx - 256) // 8,
+                                        64 * u + (tx - 256) % 8 * 8 + v,
+                                    ] = KV[
+                                        b_i,
+                                        indices_local[0],
+                                        g_i,
+                                        D // 2 + 64 * u + (tx - 256) % 8 * 8 + v,
+                                    ]
+                        with T.attr("default", "async_scope", 1):  # type: ignore
+                            for v in T.vectorized(8):
+                                K_tail_shared_0[
+                                    r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + v
+                                ] = KV[
+                                    b_i,
+                                    indices_local[0],
+                                    g_i,
+                                    D + (tx - 256) % 8 * 8 + v,
+                                ]
+
+                    T.cp_async_barrier_noinc(bar_k_0_ready[0])
+
+                    # Buffer 1
+                    T.barrier_wait(bar_k_1_free[0], ((i_i & 1) ^ 1))
+                    T.barrier_arrive(bar_2_128)
+                    T.barrier_wait(bar_2_128, 1)
+
+                    for r in T.serial(4):
+                        indices_tmp[0] = Indices[
+                            b_i, s_i, g_i, (i_i * 2 + 1) * BI + r * 16 + (tx - 256) // 8
+                        ]
+                        is_kv_valid_1[r * 16 + (tx - 256) // 8] = indices_tmp[0] >= 0
+                        if is_kv_valid_1[r * 16 + (tx - 256) // 8]:
+                            indices_local[0] = indices_tmp[0]
+
+                        with T.attr("default", "async_scope", 1):  # type: ignore
+                            for u in T.serial(4):
+                                for v in T.vectorized(8):
+                                    KV_shared_1_l[
+                                        r * 16 + (tx - 256) // 8,
+                                        64 * u + (tx - 256) % 8 * 8 + v,
+                                    ] = KV[
+                                        b_i,
+                                        indices_local[0],
+                                        g_i,
+                                        64 * u + (tx - 256) % 8 * 8 + v,
+                                    ]
+                                    KV_shared_1_r[
+                                        r * 16 + (tx - 256) // 8,
+                                        64 * u + (tx - 256) % 8 * 8 + v,
+                                    ] = KV[
+                                        b_i,
+                                        indices_local[0],
+                                        g_i,
+                                        D // 2 + 64 * u + (tx - 256) % 8 * 8 + v,
+                                    ]
+                        with T.attr("default", "async_scope", 1):  # type: ignore
+                            for v in T.vectorized(8):
+                                K_tail_shared_1[
+                                    r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + v
+                                ] = KV[
+                                    b_i,
+                                    indices_local[0],
+                                    g_i,
+                                    D + (tx - 256) % 8 * 8 + v,
+                                ]
+
+                    T.cp_async_barrier_noinc(bar_k_1_ready[0])
+
+    return main
+
+
+@tilelang.jit(
+    out_idx=[-2, -1],
+    pass_configs={
+        tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
+        tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
+    },
+)
+def sparse_mla_fwd_decode_partial(
+    heads,
+    dim,
+    tail_dim,
+    topk,
+    *,
+    kv_group=1,
+    sm_scale=None,
+    is_causal=True,
+    block_I=64,
+    threads=256,
+):
+    """
+    grid: (seq_len * REPLICATE_H, top_k_blocks).
+    Each block does one topk block, writes partial_o, partial_lse.
+    """
+
+    assert is_causal == True, "non-causal is not supported"
+    assert kv_group == 1
+    assert topk % block_I == 0
+
+    # log2(e) = 1.44269504
+    if sm_scale is None:
+        sm_scale = (1.0 / (dim + tail_dim)) ** 0.5 * 1.44269504
+    else:
+        sm_scale = sm_scale * 1.44269504
+
+    batch = 1
+    seq_len = T.dynamic("seq_len")
+    seq_len_kv = T.dynamic("seq_len_kv")
+
+    head_kv = heads // kv_group
+    padded_H = max(tilelang.math.next_power_of_2(head_kv), 16)
+    REPLICATE_H = (head_kv // 64) if head_kv > 64 else 1
+    H_per_block = padded_H if REPLICATE_H == 1 else 64
+    BI = block_I
+    NI = topk // block_I
+    D = dim
+    D_tail = tail_dim
+
+    q_shape = [batch, seq_len, heads, dim + tail_dim]
+    kv_shape = [batch, seq_len_kv, kv_group, dim + tail_dim]
+    indices_shape = [batch, seq_len, kv_group, topk]
+    partial_o_shape = [batch, seq_len, NI, heads, dim]
+    partial_lse_shape = [batch, seq_len, NI, heads]
+    indices_dtype = T.int32
+    dtype = T.bfloat16
+    accum_dtype = T.float32
+
+    @T.prim_func
+    def main(
+        Q: T.Tensor(q_shape, dtype),
+        KV: T.Tensor(kv_shape, dtype),
+        Indices: T.Tensor(indices_shape, indices_dtype),
+        Partial_O: T.Tensor(partial_o_shape, dtype),
+        Partial_Lse: T.Tensor(partial_lse_shape, accum_dtype),
+    ):
+        with T.Kernel(seq_len * REPLICATE_H, NI, threads=threads) as (bx, by):
+            Q_shared = T.alloc_shared([H_per_block, D], dtype)
+            Q_tail_shared = T.alloc_shared([H_per_block, D_tail], dtype)
+            KV_shared = T.alloc_shared([BI, D], dtype)
+            K_tail_shared = T.alloc_shared([BI, D_tail], dtype)
+            mask = T.alloc_fragment([BI], T.bool)
+
+            acc_o = T.alloc_fragment([H_per_block, D], accum_dtype)
+            acc_s = T.alloc_fragment([H_per_block, BI], accum_dtype)
+            S_shared = T.alloc_shared([H_per_block, BI], dtype)
+            sumexp_i = T.alloc_fragment([H_per_block], accum_dtype)
+            m_i = T.alloc_fragment([H_per_block], accum_dtype)
+
+            T.fill(acc_o, 0)
+
+            b_i, g_i = 0, 0
+            s_i = bx if REPLICATE_H == 1 else (bx // REPLICATE_H)
+            topk_block_i = by
+            q_i = s_i
+
+            H0 = 0 if REPLICATE_H == 1 else (bx % REPLICATE_H) * 64
+            H1 = H0 + H_per_block
+
+            T.copy(Q[b_i, s_i, H0:H1, :D], Q_shared)
+            T.copy(Q[b_i, s_i, H0:H1, D:], Q_tail_shared)
+
+            for bi_i in T.Parallel(BI):
+                mask[bi_i] = Indices[b_i, s_i, g_i, topk_block_i * BI + bi_i] >= 0
+            for bi_i, d_i in T.Parallel(BI, D):
+                KV_shared[bi_i, d_i] = KV[
+                    b_i, Indices[b_i, s_i, g_i, topk_block_i * BI + bi_i], g_i, d_i
+                ]
+            for bi_i, d_i in T.Parallel(BI, D_tail):
+                K_tail_shared[bi_i, d_i] = KV[
+                    b_i, Indices[b_i, s_i, g_i, topk_block_i * BI + bi_i], g_i, D + d_i
+                ]
+            for h_i, bi_i in T.Parallel(H_per_block, BI):
+                acc_s[h_i, bi_i] = T.if_then_else(
+                    mask[bi_i], 0, -T.infinity(acc_s.dtype)
+                )
+            T.gemm(
+                Q_shared,
+                KV_shared,
+                acc_s,
+                transpose_B=True,
+                policy=T.GemmWarpPolicy.FullCol,
+            )
+            T.gemm(
+                Q_tail_shared,
+                K_tail_shared,
+                acc_s,
+                transpose_B=True,
+                policy=T.GemmWarpPolicy.FullCol,
+            )
+
+            T.reduce_max(acc_s, m_i, dim=1, clear=True)
+            for h_i in T.Parallel(H_per_block):
+                m_i[h_i] = T.max(m_i[h_i], -(2**30))
+            for h_i, bi_i in T.Parallel(H_per_block, BI):
+                acc_s[h_i, bi_i] = T.exp2(
+                    acc_s[h_i, bi_i] * sm_scale - m_i[h_i] * sm_scale
+                )
+
+            T.reduce_sum(acc_s, sumexp_i, dim=1)
+            T.copy(acc_s, S_shared)
+            T.gemm(S_shared, KV_shared, acc_o, policy=T.GemmWarpPolicy.FullCol)
+
+            # sumexp_i==0 (all masked), divide by 1 to get 0 and avoid nan
+            for h_i, d_i in T.Parallel(H_per_block, D):
+                acc_o[h_i, d_i] = acc_o[h_i, d_i] / T.if_then_else(
+                    sumexp_i[h_i] == 0.0, 1.0, sumexp_i[h_i]
+                )
+            # sumexp_i==0 (all masked), use large negative so combine ignores this split
+            for h_i in T.Parallel(H_per_block):
+                sumexp_i[h_i] = T.if_then_else(
+                    sumexp_i[h_i] == 0.0,
+                    -(2**30),
+                    T.log2(sumexp_i[h_i]) + m_i[h_i] * sm_scale,
+                )
+
+            T.copy(acc_o, Partial_O[b_i, s_i, topk_block_i, H0:H1, :])
+            T.copy(sumexp_i, Partial_Lse[b_i, s_i, topk_block_i, H0:H1])
+
+    return main
+
+
+@tilelang.jit(
+    out_idx=[-1],
+    pass_configs={
+        tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
+        tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
+    },
+)
+def sparse_mla_fwd_decode_combine(
+    heads,
+    dim,
+    topk,
+    head_per_block,
+    *,
+    block_I=64,
+    threads=256,
+):
+    """
+    grid: (seq_len * REPLICATE_H). batch=1, kv_group=1.
+    Each block does one tile of heads (e.g. 4 or 8 for decode).
+    """
+
+    assert heads % head_per_block == 0, f"head_per_block must divide heads"
+
+    batch = 1
+    seq_len = T.dynamic("seq_len")
+
+    NI = topk // block_I
+    H_per_block = head_per_block
+    REPLICATE_H = heads // H_per_block
+
+    partial_o_shape = [batch, seq_len, NI, heads, dim]
+    partial_lse_shape = [batch, seq_len, NI, heads]
+    o_shape = [batch, seq_len, heads, dim]
+    dtype = T.bfloat16
+    accum_dtype = T.float32
+
+    @T.prim_func
+    def main(
+        Partial_O: T.Tensor(partial_o_shape, dtype),
+        Partial_Lse: T.Tensor(partial_lse_shape, accum_dtype),
+        Output: T.Tensor(o_shape, dtype),
+    ):
+        with T.Kernel(seq_len * REPLICATE_H, threads=threads) as (bx,):
+            shared_lse = T.alloc_shared([NI, H_per_block], accum_dtype)
+
+            lse_max = T.alloc_fragment([H_per_block], accum_dtype)
+            lse_sum = T.alloc_fragment([H_per_block], accum_dtype)
+            scale = T.alloc_fragment([H_per_block, NI], accum_dtype)
+            acc_o = T.alloc_fragment([H_per_block, dim], accum_dtype)
+
+            b_i = 0
+            s_i = bx if REPLICATE_H == 1 else (bx // REPLICATE_H)
+            H0 = 0 if REPLICATE_H == 1 else (bx % REPLICATE_H) * H_per_block
+            H1 = H0 + H_per_block
+
+            for k in T.serial(NI):
+                T.copy(Partial_Lse[b_i, s_i, k, H0:H1], shared_lse[k, :])
+
+            T.fill(lse_max, -(2**30))
+            for k in T.serial(NI):
+                for h_i in T.Parallel(H_per_block):
+                    lse_max[h_i] = T.max(lse_max[h_i], shared_lse[k, h_i])
+            T.fill(lse_sum, 0)
+            for k in T.serial(NI):
+                for h_i in T.Parallel(H_per_block):
+                    lse_sum[h_i] = lse_sum[h_i] + T.exp2(
+                        shared_lse[k, h_i] - lse_max[h_i]
+                    )
+            for k in T.serial(NI):
+                for h_i in T.Parallel(H_per_block):
+                    scale[h_i, k] = T.exp2(
+                        shared_lse[k, h_i] - lse_max[h_i] - T.log2(lse_sum[h_i])
+                    )
+
+            T.fill(acc_o, 0)
+            for k in T.serial(NI):
+                for h_i, d_i in T.Parallel(H_per_block, dim):
+                    acc_o[h_i, d_i] = acc_o[h_i, d_i] + scale[h_i, k] * Partial_O[
+                        b_i, s_i, k, H0 + h_i, d_i
+                    ].astype(accum_dtype)
+
+            T.copy(acc_o, Output[b_i, s_i, H0:H1, :])
+
+    return main
+
+
+def tilelang_sparse_fwd(
+    q: torch.Tensor,
+    kv: torch.Tensor,
+    indices: torch.Tensor,
+    sm_scale: float,
+    d_v: int = 512,
+) -> torch.Tensor:
+    assert q.dim() == 3 and kv.dim() == 3 and indices.dim() == 3
+    num_heads = q.shape[1]
+    dim = q.shape[2]
+    tail_dim = dim - d_v
+    topk = indices.shape[-1]
+    assert topk == 2048
+    if _is_hip:
+        if _is_gfx95_supported:
+            # decode kernel
+            if q.shape[0] <= 64:
+                kernel_partial = sparse_mla_fwd_decode_partial(
+                    num_heads,
+                    d_v,
+                    tail_dim,
+                    topk,
+                    sm_scale=sm_scale,
+                    block_I=64,
+                    threads=256,
+                )
+                kernel_combine = sparse_mla_fwd_decode_combine(
+                    num_heads, d_v, topk, head_per_block=4, block_I=64, threads=256
+                )
+                partial_o, partial_lse = kernel_partial(
+                    q.unsqueeze(0), kv.unsqueeze(0), indices.unsqueeze(0)
+                )
+                out = kernel_combine(partial_o, partial_lse)
+                return out
+
+            # prefill kernel
+            kernel = sparse_attention_fwd_kernel_v1(
+                num_heads, d_v, tail_dim, topk, sm_scale=sm_scale, num_stages=1
+            )
+        else:  # reduce LDS usage on gfx942 target
+            kernel = sparse_attention_fwd_kernel_v1(
+                num_heads,
+                d_v,
+                tail_dim,
+                topk,
+                sm_scale=sm_scale,
+                block_I=32,
+                num_stages=1,
+                threads=128,
+            )
+    else:
+        kernel = sparse_attention_fwd_kernel_v2(
+            num_heads, d_v, tail_dim, topk, sm_scale=sm_scale
+        )
+    return kernel(q.unsqueeze(0), kv.unsqueeze(0), indices.unsqueeze(0))  # type: ignore
diff --git a/sglang/python/sglang/srt/layers/attention/nsa/transform_index.py b/sglang/python/sglang/srt/layers/attention/nsa/transform_index.py
new file mode 100644
index 0000000000000000000000000000000000000000..10b1068f524106e8b43c867551e9838c954a9915
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/nsa/transform_index.py
@@ -0,0 +1,144 @@
+from typing import List, Optional
+
+import torch
+import triton
+import triton.language as tl
+
+
+def transform_index_page_table_prefill(**kwargs):
+    return transform_index_page_table_prefill_ref(**kwargs)
+
+
+def transform_index_page_table_decode(**kwargs):
+    return transform_index_page_table_decode_ref(**kwargs)
+
+
+@triton.jit
+def transform_index_page_table_decode_kernel(
+    page_table_ptr: torch.Tensor,
+    topk_indices_ptr: torch.Tensor,
+    result_ptr: torch.Tensor,
+    page_size: tl.constexpr,
+    max_seqlen_k: tl.constexpr,
+):
+    TOPK: tl.constexpr = 2048
+    req_id = tl.program_id(0)
+    page_table_ptr = page_table_ptr + req_id * max_seqlen_k
+    topk_indices_ptr = topk_indices_ptr + req_id * TOPK
+    result_ptr = result_ptr + req_id * TOPK
+
+    offset = tl.arange(0, TOPK)  # topk should be 2048
+    loaded_topk_indices = tl.load(topk_indices_ptr + offset)
+    mask = loaded_topk_indices >= 0
+    loaded_kv_indices = tl.load(page_table_ptr + loaded_topk_indices, mask=mask)
+    tl.store(result_ptr + offset, loaded_kv_indices, mask=mask)
+    tl.store(result_ptr + offset, -1, mask=~mask)
+
+
+def transform_index_page_table_decode_fast(
+    page_table: torch.Tensor,
+    topk_indices: torch.Tensor,
+    result: Optional[torch.Tensor] = None,
+    page_size: int = 1,
+) -> torch.Tensor:
+    """
+    Transform the page table according to topk indices for sparse topk attention.
+    Args:
+        page_table: [qo_len, max_seqlen_k], the original page table
+        topk_indices: [qo_len, topk], the topk indices for each query position
+    Returns:
+        transformed_page_table: [qo_len, topk], the transformed page table
+        For out-of-bound indices in topk_indices, this should be filled with -1.
+    """
+    assert page_size == 1
+    assert page_table.shape[0] == topk_indices.shape[0]
+    assert topk_indices.shape[1] == 2048
+    qo_len = topk_indices.shape[0]
+    max_seqlen_k = page_table.shape[1]
+    if result is None:
+        result = torch.empty_like(topk_indices, dtype=torch.int32)
+    # Launch triton kernel
+    grid = (qo_len,)
+    transform_index_page_table_decode_kernel[grid](
+        page_table,
+        topk_indices,
+        result,
+        page_size,
+        max_seqlen_k=max_seqlen_k,
+    )
+    return result
+
+
+def transform_index_page_table_prefill_fast(
+    page_table: torch.Tensor,
+    topk_indices: torch.Tensor,
+    extend_lens_cpu: List[int],
+    page_size: int = 1,
+) -> torch.Tensor:
+    # TODO(baizhou): can be implemented with another triton kernel
+    assert page_size == 1
+    result = torch.empty_like(topk_indices, dtype=torch.int32)
+    assert len(extend_lens_cpu) == page_table.shape[0]
+    offset = 0
+    for i, l in enumerate(extend_lens_cpu):
+        transform_index_page_table_decode_fast(
+            page_table[i].unsqueeze(0).expand(l, -1),
+            topk_indices[offset : offset + l],
+            result=result[offset : offset + l],
+        )
+        offset += l
+    assert offset == topk_indices.shape[0]
+    return result
+
+
+def transform_index_page_table_decode_ref(
+    page_table: torch.Tensor,
+    topk_indices: torch.Tensor,
+    result: Optional[torch.Tensor] = None,
+    page_size: int = 1,
+) -> torch.Tensor:
+    assert page_size == 1
+    assert page_table.shape[0] == topk_indices.shape[0]
+    if result is None:
+        result = torch.empty_like(topk_indices, dtype=torch.int32)
+    assert result.shape == topk_indices.shape
+    torch.gather(
+        page_table.to(result.dtype),
+        dim=1,
+        index=topk_indices.clamp(min=0),
+        out=result,
+    )
+    result[topk_indices < 0] = -1
+    return result
+
+
+def transform_index_page_table_prefill_ref(
+    page_table: torch.Tensor,
+    topk_indices: torch.Tensor,
+    extend_lens_cpu: List[int],
+    page_size: int = 1,
+) -> torch.Tensor:
+    assert page_size == 1
+    result = torch.empty_like(topk_indices, dtype=torch.int32)
+    assert len(extend_lens_cpu) == page_table.shape[0]
+    offset = 0
+    for i, l in enumerate(extend_lens_cpu):
+        transform_index_page_table_decode_ref(
+            page_table[i].unsqueeze(0).expand(l, -1),
+            topk_indices[offset : offset + l],
+            result=result[offset : offset + l],
+        )
+        offset += l
+    assert offset == topk_indices.shape[0]
+    return result
+
+
+if __name__ == "__main__":
+    bs, topk, max_seqlen = 10, 2048, 3000
+    page_table = torch.randint(0, 100, (bs, max_seqlen), device="cuda")
+    topk_indices = torch.full((bs, topk), -1, device="cuda")
+    topk_indices[:, :1600] = torch.arange(1600).unsqueeze(0).repeat(bs, 1)
+    ref_result = transform_index_page_table_decode_ref(page_table, topk_indices)
+    result = transform_index_page_table_decode_fast(page_table, topk_indices)
+    assert torch.all(result == ref_result)
+    print("Passed")
diff --git a/sglang/python/sglang/srt/layers/attention/nsa/triton_kernel.py b/sglang/python/sglang/srt/layers/attention/nsa/triton_kernel.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d29698040722b7484f4a15579be968e30578378
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/nsa/triton_kernel.py
@@ -0,0 +1,196 @@
+from typing import Optional, Tuple
+
+import torch
+import triton
+import triton.language as tl
+
+
+# Triton implementation
+@triton.jit
+def _act_quant_kernel(
+    X_ptr,
+    Y_ptr,
+    S_ptr,
+    M,
+    N,
+    group_size: tl.constexpr,
+    round_scale: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    """
+    Triton kernel for activation quantization.
+
+    Each block processes BLOCK_M rows and group_size columns.
+    """
+    # Get block IDs
+    pid_m = tl.program_id(0)
+    pid_n = tl.program_id(1)
+
+    # FP8 constants
+    fp8_min = -448.0
+    fp8_max = 448.0
+    fp8_max_inv = 1.0 / fp8_max
+
+    # Calculate row and column offsets
+    row_start = pid_m * BLOCK_M
+    col_start = pid_n * group_size
+
+    # Create offset arrays
+    rows = row_start + tl.arange(0, BLOCK_M)
+    cols = col_start + tl.arange(0, BLOCK_N)
+
+    # Mask for valid rows and columns
+    row_mask = rows < M
+    col_mask = cols < N
+    mask = row_mask[:, None] & col_mask[None, :]
+
+    # Load input data
+    x_ptrs = X_ptr + rows[:, None] * N + cols[None, :]
+    x = tl.load(x_ptrs, mask=mask, other=0.0).to(tl.float32)
+
+    # Compute absolute max along columns (group_size dimension) for each row
+    x_abs = tl.abs(x)
+    amax = tl.max(x_abs, axis=1)  # Shape: (BLOCK_M,)
+
+    # Clamp amax to avoid division by zero
+    amax = tl.maximum(amax, 1e-4)
+
+    # Compute scale
+    if round_scale:
+        # Fast round scale using bit manipulation approximation
+        # This is a simplified version - the exact bit manipulation is harder in Triton
+        # Using log2 + ceil + pow2 as approximation
+        log_val = tl.log2(amax * fp8_max_inv)
+        log_ceil = tl.ceil(log_val)
+        scale = tl.exp2(log_ceil)
+    else:
+        scale = amax * fp8_max_inv
+
+    # Quantize: y = clamp(x / scale, fp8_min, fp8_max)
+    scale_broadcast = scale[:, None]
+    y = x / scale_broadcast
+    y = tl.minimum(tl.maximum(y, fp8_min), fp8_max)
+
+    # Store quantized output
+    y_ptrs = Y_ptr + rows[:, None] * N + cols[None, :]
+    tl.store(y_ptrs, y, mask=mask)
+
+    # Store scales
+    s_cols = pid_n
+    s_ptrs = S_ptr + rows * (N // group_size) + s_cols
+    s_mask = row_mask
+    tl.store(s_ptrs, scale, mask=s_mask)
+
+
+def act_quant(
+    x: torch.Tensor, block_size: int = 128, scale_fmt: Optional[str] = None
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Quantizes the input tensor `x` using block-wise quantization with Triton.
+
+    Args:
+        x (torch.Tensor): The input tensor to be quantized. Must be contiguous and its last dimension size must be divisible by `block_size`.
+        block_size (int, optional): The size of the blocks to be used for quantization. Default is 128.
+        scale_fmt (Optional[str], optional): The format of the scale. Default is None.
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
+            - The quantized tensor with dtype `torch.float8_e4m3fn`.
+            - A tensor of scaling factors with dtype `torch.float32`.
+    """
+    assert x.is_contiguous(), "Input tensor must be contiguous"
+    assert (
+        x.size(-1) % block_size == 0
+    ), f"Last dimension size must be divisible by block_size (block_size={block_size})"
+
+    # Flatten all dims except last
+    N = x.size(-1)
+    x_flat = x.view(-1, N)
+    M = x_flat.size(0)
+
+    # Allocate output tensors
+    y = torch.empty_like(x, dtype=torch.float8_e4m3fn)
+    y_flat = y.view(-1, N)
+    s = x.new_empty(*x.size()[:-1], N // block_size, dtype=torch.float32)
+    s_flat = s.view(-1, N // block_size)
+
+    # Launch kernel
+    BLOCK_M = 32
+    BLOCK_N = block_size
+    grid = (triton.cdiv(M, BLOCK_M), triton.cdiv(N, block_size))
+    round_scale = scale_fmt is not None
+
+    _act_quant_kernel[grid](
+        x_flat,
+        y_flat,
+        s_flat,
+        M,
+        N,
+        group_size=block_size,
+        round_scale=round_scale,
+        BLOCK_M=BLOCK_M,
+        BLOCK_N=BLOCK_N,
+        num_stages=0 if round_scale else 2,
+    )
+
+    return y, s
+
+
+@triton.jit
+def _get_valid_kv_indices_kernel(
+    page_table_ptr,  # [bs, topk]
+    kv_indptr_ptr,  # [bs + 1]
+    kv_indices_ptr,  # [bs * topk] output buffer
+    bs: tl.constexpr,
+    topk: tl.constexpr,
+):
+    """
+    Extract valid indices (non -1) from page_table into kv_indices.
+    Each program handles one batch.
+    """
+    batch_id = tl.program_id(0)
+
+    # Get the start position for this batch in kv_indices
+    dst_start = tl.load(kv_indptr_ptr + batch_id)
+
+    # Load all topk indices for this batch
+    src_offset = batch_id * topk
+    offsets = tl.arange(0, topk)
+    indices = tl.load(page_table_ptr + src_offset + offsets)
+
+    # Count valid indices and compact them
+    mask = indices != -1
+
+    # Use prefix sum to compute destination positions for valid elements
+    # For each position, count how many valid elements are before it
+    prefix_sum = tl.cumsum(mask.to(tl.int32), axis=0) - 1
+
+    # Store valid indices to their compacted positions
+    dst_positions = dst_start + prefix_sum
+    tl.store(kv_indices_ptr + dst_positions, indices, mask=mask)
+
+
+def get_valid_kv_indices(
+    page_table_1: torch.Tensor,
+    kv_indptr: torch.Tensor,
+    kv_indices: torch.Tensor,
+    bs: int,
+):
+    """
+    Extract valid indices from page_table_1 into kv_indices buffer.
+
+    Args:
+        page_table_1: [bs, topk] page table with -1 as invalid
+        kv_indptr: [bs + 1] cumulative count of valid indices per batch
+        kv_indices: [bs * topk] pre-allocated output buffer
+        bs: batch size
+    """
+    topk = page_table_1.shape[1]
+    grid = (bs,)
+    _get_valid_kv_indices_kernel[grid](
+        page_table_1,
+        kv_indptr,
+        kv_indices,
+        bs,
+        topk,
+    )
diff --git a/sglang/python/sglang/srt/layers/attention/nsa/utils.py b/sglang/python/sglang/srt/layers/attention/nsa/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e097983dee7f74c8f01d814825a80f702d90718
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/nsa/utils.py
@@ -0,0 +1,579 @@
+# temp NSA debugging environ
+from dataclasses import dataclass
+from itertools import accumulate
+from typing import TYPE_CHECKING, List, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+import triton
+import triton.language as tl
+
+from sglang.srt.distributed.device_communicators.pynccl_allocator import (
+    use_symmetric_memory,
+)
+from sglang.srt.layers.dp_attention import (
+    DpPaddingMode,
+    attn_cp_all_gather_into_tensor,
+    get_attention_cp_group,
+    get_attention_cp_rank,
+    get_attention_cp_size,
+    get_attention_dp_rank,
+    is_allocation_symmetric,
+)
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils.common import ceil_align, ceil_div
+
+if TYPE_CHECKING:
+    from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+
+
+def compute_nsa_seqlens(original_seq_lens, nsa_index_topk: int):
+    return original_seq_lens.clamp(max=nsa_index_topk)
+
+
+def is_nsa_enable_prefill_cp():
+    return get_global_server_args().enable_nsa_prefill_context_parallel
+
+
+def is_nsa_prefill_cp_in_seq_split():
+    return (
+        is_nsa_enable_prefill_cp()
+        and get_global_server_args().nsa_prefill_cp_mode == "in-seq-split"
+    )
+
+
+def is_nsa_prefill_cp_round_robin_split():
+    return (
+        is_nsa_enable_prefill_cp()
+        and get_global_server_args().nsa_prefill_cp_mode == "round-robin-split"
+    )
+
+
+def can_nsa_prefill_cp_round_robin_split(forward_batch: "ForwardBatch"):
+    if not forward_batch.forward_mode.is_context_parallel_extend():
+        return False
+    cp_size = get_attention_cp_size()
+    seq_len = sum(forward_batch.extend_seq_lens_cpu)
+    return (
+        is_nsa_prefill_cp_round_robin_split()
+        and seq_len > 0
+        and seq_len >= cp_size
+        and cp_size > 1
+    )
+
+
+def nsa_cp_round_robin_split_data(input_: Union[torch.Tensor, List]):
+    """
+    # for round-robin-split, split the tokens evenly according to the rule of token_idx % cp_size.
+    |   +-----------before split------------+|
+    | token0, token1, token2, token3, token4, token5, token6, token7, ...
+    |
+    |   +--------------result-------------------+
+    | dp_atten_tp0: token0, token4, token8, token12, token16, ... |
+    | dp_atten_tp1: token1, token5, token9, token13, token17, ... |
+    | dp_atten_tp2: token2, token6, token10, token14, token18, ... |
+    | dp_atten_tp3: token3, token7, token11, token15, token19, ... |
+    |   +-------------------------+
+    """
+    cp_size = get_attention_cp_size()
+    cp_rank = get_attention_cp_rank()
+    if isinstance(input_, (tuple, list)):
+        indices = range(cp_rank, len(input_), cp_size)
+        return input_[indices]
+
+    tokens = len(input_)
+    if tokens % cp_size != 0:
+        cur_len = tokens // cp_size + (tokens % cp_size > cp_rank)
+        if cur_len == 0:
+            return input_.new_empty(0, *input_.shape[1:])
+        indices = torch.arange(cp_rank, tokens, cp_size, device=input_.device)
+        return input_[indices]
+
+    # for torch device tensor
+    return input_.view(-1, cp_size, *input_.shape[1:])[:, cp_rank].contiguous()
+
+
+def cal_padded_tokens(forward_batch: "ForwardBatch"):
+    # Consistent with the padding calculation logic in ForwardBatch.prepare_mlp_sync_batch,
+    # calculate the actual token length after padding when attn_tp_size > 1 or in the MAX_LEN padding mode.
+    global_num_tokens = forward_batch.global_num_tokens_cpu.copy()
+    sync_group_size = len(global_num_tokens)
+    attn_cp_size = get_attention_cp_size()
+    for i in range(sync_group_size):
+        global_num_tokens[i] = ceil_align(global_num_tokens[i], attn_cp_size)
+    dp_padding_mode = DpPaddingMode.get_dp_padding_mode(
+        forward_batch.is_extend_in_batch, global_num_tokens
+    )
+    if dp_padding_mode.is_max_len():
+        tokens = max(global_num_tokens)
+    elif len(global_num_tokens) > 1:
+        tokens = global_num_tokens[get_attention_dp_rank()]
+    else:
+        tokens = global_num_tokens[0]
+    if can_nsa_prefill_cp_round_robin_split(forward_batch):
+        tokens = ceil_div(tokens, attn_cp_size)
+    return tokens
+
+
+def pad_nsa_cache_seqlens(forward_batch: "ForwardBatch", nsa_cache_seqlens):
+    attn_cp_size = get_attention_cp_size()
+    needs_cp_pad = attn_cp_size > 1 and can_nsa_prefill_cp_round_robin_split(
+        forward_batch
+    )
+    needs_dp_pad = forward_batch.global_num_tokens_cpu is not None
+    if not needs_cp_pad and not needs_dp_pad:
+        return nsa_cache_seqlens
+    tokens = cal_padded_tokens(forward_batch)
+    pad_len = tokens - nsa_cache_seqlens.shape[0]
+    if pad_len > 0:
+        nsa_cache_seqlens = torch.cat(
+            [
+                nsa_cache_seqlens,
+                nsa_cache_seqlens.new_zeros(pad_len, *nsa_cache_seqlens.shape[1:]),
+            ]
+        )
+    return nsa_cache_seqlens
+
+
+@dataclass
+class NSAContextParallelMetadata:
+
+    split_list: List[int] = None
+    max_rank_len: List[int] = None
+    zigzag_index: List[int] = None
+    per_rank_actual_token: List[int] = None
+    reverse_split_len: List[int] = None
+    cp_reverse_index: List[int] = None
+    kv_len_prev: int = -1
+    kv_len_next: int = -1
+    actual_seq_q_prev: int = -1
+    actual_seq_q_next: int = -1
+    kv_len_prev_tensor: torch.Tensor = None
+    kv_len_next_tensor: torch.Tensor = None
+    actual_seq_q_prev_tensor: torch.Tensor = None
+    actual_seq_q_next_tensor: torch.Tensor = None
+    total_seq_lens: torch.Tensor = None
+
+
+def can_cp_split(seq_len: int, cp_size: int, use_nsa: bool, forward_batch):
+    if is_nsa_prefill_cp_round_robin_split():
+        cur_cp_seq_len = seq_len // cp_size
+        assert (
+            seq_len % cp_size == 0
+        ), f"seq_len {seq_len} is not divisible by cp_size {cp_size} when nsa_prefill_cp_mode is round-robin-split"
+    else:
+        # TODO current just support prefill batch=1 and len(input_ids) > self.cp_size * 2
+        # Note: (self.cp_size * 2) To achieve load balancing for seq computation,
+        # the seq data needs to be divided and recombined at twice the size of cp_size.
+        cur_cp_seq_len = seq_len // (cp_size * 2)
+    if (
+        cur_cp_seq_len != 0
+        and cp_size > 1
+        and use_nsa
+        and forward_batch.forward_mode.is_context_parallel_extend()
+        and is_nsa_enable_prefill_cp()
+        and sum(forward_batch.extend_seq_lens_cpu) >= cp_size
+    ):
+        return True
+    else:
+        return False
+
+
+def cp_split_and_rebuild_data(forward_batch, input_: torch.Tensor):
+    if is_nsa_prefill_cp_round_robin_split():
+        cp_size = get_attention_cp_size()
+        assert (
+            input_.shape[0] % cp_size == 0
+        ), f"Expect input shape 0 can divided by cp size, but got input shape {input_.shape}, cp size {cp_size}"
+        return nsa_cp_round_robin_split_data(input_)
+
+    input_list = list(
+        torch.split(input_, forward_batch.nsa_cp_metadata.split_list, dim=0)
+    )
+    result = torch.cat(
+        [input_list[i] for i in forward_batch.nsa_cp_metadata.zigzag_index], dim=0
+    ).view(-1, input_.shape[-1])
+    return result
+
+
+def cp_split_and_rebuild_position(forward_batch, positions: torch.Tensor):
+    if is_nsa_prefill_cp_round_robin_split():
+        cp_size = get_attention_cp_size()
+        assert positions.shape[0] % cp_size == 0, (
+            f"Expect positions shape 0 can divided by cp size, but got positions shape {positions.shape}, "
+            f"cp size {cp_size}"
+        )
+        return nsa_cp_round_robin_split_data(positions)
+
+    position_id_list = list(
+        torch.split(positions, forward_batch.nsa_cp_metadata.split_list, dim=-1)
+    )
+    positions = torch.cat(
+        [position_id_list[i] for i in forward_batch.nsa_cp_metadata.zigzag_index],
+        dim=-1,
+    )
+    return positions
+
+
+@triton.jit
+def nsa_cp_round_robin_split_q_seqs_kernel(
+    in_seqs_ptr,
+    out_seqs_ptr,
+    bs_idx_ptr,
+    tokens: tl.constexpr,
+    cp_size: tl.constexpr,
+    cp_rank: tl.constexpr,
+):
+    extra_seq = 0
+    bs_idx = 0
+    for bs in range(tokens):
+        cur_len = tl.load(in_seqs_ptr + bs)
+        cur_len += extra_seq
+        cur_seq = cur_len // cp_size + (cur_len % cp_size > cp_rank)
+        if cur_seq > 0:
+            tl.store(bs_idx_ptr + bs_idx, bs)
+            tl.store(out_seqs_ptr + bs_idx, cur_seq)
+            bs_idx += 1
+        extra_seq = cur_len - cur_seq * cp_size
+
+
+def nsa_cp_round_robin_split_q_seqs_cpu(extend_seqs):
+    cp_size = get_attention_cp_size()
+    cp_rank = get_attention_cp_rank()
+    extra_seq = 0
+    q_seqs = []
+    for bs, cur_len in enumerate(extend_seqs):
+        cur_len += extra_seq
+        cur_seq = cur_len // cp_size + int(cur_len % cp_size > cp_rank)
+        q_seqs.append(cur_seq)
+        extra_seq = cur_len - cur_seq * cp_size
+    bs_idx = list([i for i, x in enumerate(q_seqs) if x > 0])
+    q_seqs = [q_len for q_len in q_seqs if q_len > 0]
+    return q_seqs, bs_idx
+
+
+def nsa_cp_round_robin_split_q_seqs(
+    extend_seqs_cpu, extend_seqs
+) -> Tuple[List, torch.Tensor, List, torch.Tensor]:
+    """
+    round-robin-split distributes tokens across ranks based on token_idx % cp_size.
+
+    Return:
+    ret_q_lens_cpu(List) and ret_q_lens(torch.Tensor): the partitioned length (excluding zeros) on the current cp rank
+        for each sequence after distribution across cp ranks.
+    bs_idx_cpu(List) and bs_idx(torch.Tensor): marks which sequences are ultimately selected,
+        i.e., those with a partitioned length greater than zero.
+    """
+    cp_size = get_attention_cp_size()
+    cp_rank = get_attention_cp_rank()
+    # len(ret_q_lens_cpu) == len(bs_idx_cpu)
+    ret_q_lens_cpu, bs_idx_cpu = nsa_cp_round_robin_split_q_seqs_cpu(extend_seqs_cpu)
+    ret_q_lens = torch.empty(
+        (len(bs_idx_cpu),), device=extend_seqs.device, dtype=extend_seqs.dtype
+    )
+    bs_idx = torch.empty(
+        (len(bs_idx_cpu),), device=extend_seqs.device, dtype=torch.int32
+    )
+    grid = (1,)
+    nsa_cp_round_robin_split_q_seqs_kernel[grid](
+        extend_seqs, ret_q_lens, bs_idx, len(extend_seqs), cp_size, cp_rank
+    )
+    return ret_q_lens_cpu, ret_q_lens, bs_idx_cpu, bs_idx
+
+
+def nsa_use_prefill_cp(forward_batch, nsa_enable_prefill_cp=None):
+    if nsa_enable_prefill_cp is None:
+        nsa_enable_prefill_cp = is_nsa_enable_prefill_cp()
+    if (
+        forward_batch.nsa_cp_metadata is not None
+        and nsa_enable_prefill_cp
+        and forward_batch.forward_mode.is_context_parallel_extend()
+    ):
+        return True
+    else:
+        return False
+
+
+def cp_attn_tp_all_gather_reorganazied_into_tensor(
+    input_: torch.Tensor, total_len, attn_tp_size, forward_batch, stream_op
+):
+    """
+    Allgather communication for context_parallel(kv_cache, index_k, hidden_states).
+    This implementation mainly consists of three parts:
+    Step 1, padding the input shape to unify the shape for allgather communication (the shape must be the same).
+    Step 2, allgather communication(async).
+    Step 3, removing the padding and reassembling the data according to the actual tokens.
+    """
+    # step1
+    max_len = (total_len + attn_tp_size - 1) // attn_tp_size
+    pad_size = max_len - input_.shape[0]
+    if pad_size > 0:
+        input_ = F.pad(input_, (0, 0, 0, pad_size), mode="constant", value=0)
+    with use_symmetric_memory(
+        get_attention_cp_group(), disabled=not is_allocation_symmetric()
+    ):
+        input_tensor_all = torch.empty(
+            max_len * attn_tp_size,
+            input_.shape[1],
+            device=input_.device,
+            dtype=input_.dtype,
+        )
+    # step2
+    get_attention_cp_group().cp_all_gather_into_tensor_async(
+        input_tensor_all, input_, stream_op
+    )
+    # step3
+    outputs_list_max = list(
+        torch.split(input_tensor_all, forward_batch.nsa_cp_metadata.max_rank_len, dim=0)
+    )
+    outputs = torch.cat(
+        [
+            outputs_list_max[index][:per_rank_len]
+            for index, per_rank_len in enumerate(
+                forward_batch.nsa_cp_metadata.per_rank_actual_token
+            )
+        ],
+        dim=0,
+    )
+    return outputs
+
+
+def cp_all_gather_rerange_output(input_tensor, cp_size, forward_batch, stream):
+    """
+    # for in-seq-split
+    |   +-----------before allgather------------+|
+    |   | dp_atten_tp0: block0, block7 |
+    |   | dp_atten_tp1: block1, block6 |
+    |   | dp_atten_tp2: block2, block5 |
+    |   | dp_atten_tp3: block3, block4 |
+    |
+    |   +----------before rerange---------------+|
+    | block0 | block7 | block1 | block6 | block2 | block5 | block3 | block4 |
+    |
+    |   +--------------result-------------------+
+    | block0 | block1 | block2 | block3 | block4 | block5 | block6 | block7 |
+    |   +-------------------------+
+
+    # for round-robin-split
+    |   +-----------before allgather------------+|
+    | dp_atten_tp0: token0, token4, token8, token12, token16, ... |
+    | dp_atten_tp1: token1, token5, token9, token13, token17, ... |
+    | dp_atten_tp2: token2, token6, token10, token14, token18, ... |
+    | dp_atten_tp3: token3, token7, token11, token15, token19, ... |
+    |
+    |   +--------------result-------------------+
+    | token0, token1, token2, token3, token4, token5, token6, token7, ...
+    |   +-------------------------+
+    """
+    if is_nsa_prefill_cp_round_robin_split():
+        with use_symmetric_memory(
+            get_attention_cp_group(), disabled=not is_allocation_symmetric()
+        ):
+            output_tensor = input_tensor.new_empty(
+                (input_tensor.shape[0] * cp_size, *input_tensor.shape[1:]),
+            )
+        attn_cp_all_gather_into_tensor(
+            output_tensor,
+            input_tensor,
+        )
+        out_shape = output_tensor.shape
+        output_tensor = (
+            output_tensor.view(cp_size, -1, *out_shape[1:])
+            .transpose(0, 1)
+            .reshape(out_shape)
+        )
+        return output_tensor
+
+    bs_seq_len, hidden_size = input_tensor.shape
+    output_tensor = cp_attn_tp_all_gather_reorganazied_into_tensor(
+        input_tensor,
+        forward_batch.nsa_cp_metadata.total_seq_lens,
+        cp_size,
+        forward_batch,
+        stream,
+    )
+    outputs_list = list(
+        torch.split(
+            output_tensor, forward_batch.nsa_cp_metadata.reverse_split_len, dim=0
+        )
+    )
+    output_tensor = torch.cat(
+        [outputs_list[i] for i in forward_batch.nsa_cp_metadata.cp_reverse_index], dim=0
+    )
+    output_tensor = output_tensor.view(-1, hidden_size)
+    return output_tensor
+
+
+def calculate_cp_seq_idx(cp_chunks_len, seqs_len):
+    """Used to obtain the index of the seq corresponding
+    to each cp block in the forwardbatch, and the starting
+    and ending positions of the corresponding seq in the cp block"""
+    j = 0
+    tuple_len = []  # Only keep this result list
+    cumulative = {}  # Used to track cumulative values for each index
+
+    for i in range(len(cp_chunks_len)):
+        current_dict = {}
+        current_tuples = []
+        c_val = cp_chunks_len[i]
+
+        while j < len(seqs_len):
+            s_val = seqs_len[j]
+            if s_val == c_val:
+                idx = j
+                current_dict[idx] = s_val
+                # Update cumulative value for this index
+                cumulative[idx] = cumulative.get(idx, 0) + s_val
+                j += 1
+                break
+            elif s_val > c_val:
+                idx = j
+                current_dict[idx] = c_val
+                # Update cumulative value for this index
+                cumulative[idx] = cumulative.get(idx, 0) + c_val
+                seqs_len[j] = s_val - c_val
+                break
+            else:  # s_val < c_val
+                idx = j
+                current_dict[idx] = s_val
+                # Update cumulative value for this index
+                cumulative[idx] = cumulative.get(idx, 0) + s_val
+                c_val -= s_val
+                j += 1
+
+        # Build tuple: (index, historical cumulative, historical+current)
+        for idx, val in current_dict.items():
+            # Subtract current value to get historical cumulative
+            prev_cum = cumulative.get(idx, 0) - val
+            current_cum = prev_cum + val
+            current_tuples.append((idx, prev_cum, current_cum))
+
+        tuple_len.append(current_tuples)
+    return tuple_len
+
+
+def prepare_input_dp_with_cp_dsa(
+    kv_len,
+    cp_rank,
+    cp_size,
+    seqs_len,
+):
+    if is_nsa_prefill_cp_round_robin_split():
+        return True
+    """prepare_input_dp_with_cp_dsa-zigzag index
+    Example (DP_ATTENT_TP == CP_SIZE == 4):
+    Description:
+    1. Start with a full-length request.
+    2. Split the request into multiple blocks (block0 to block7).
+    3. Rearrange these blocks to balance computational
+        load across different DP ranks.
+    4. Assign the rearranged blocks to different DP attention
+        time points (dp_atten_tp0 to dp_atten_tp3).
+    +---------------------------------+
+    |        cp_split_tokens         |
+    +---------------------------------+
+    |                                 |
+    |   request_with_full_length     |
+    |             | split (cp_size * 2) |
+    |   +-------------------------+  |
+    |   | block0 | block1 | block2 | block3 | block4 | block5 | block6 | block7 |
+    |   +-------------------------+  |
+    |             | rerange          |
+    |   +---------------------------------+
+    |   | block0 | block7 | block1 | block6 | block2 | block5 | block3 | block4 |
+    |   +---------------------------------+
+    |             |
+    |   +-------------------------+
+    |   | dp_atten_tp0: block0, block7 |
+    |   | dp_atten_tp1: block1, block6 |
+    |   | dp_atten_tp2: block2, block5 |
+    |   | dp_atten_tp3: block3, block4 |
+    |   +-------------------------+
+
+    Why zigzag rearrange?
+    - Attention calculations must follow causal attention principles.
+    - Simply slicing by rank order can lead to computational load imbalance:
+        * First rank may focus on fewer historical key-value tokens (less computation)
+        * Last rank may focus on more tokens (more computation)
+    - To mitigate uneven load, the input hissenstate needs to be sliced by cp_size*2 and rearranged.
+    """
+    # just support batch = 1
+    kv_len = torch.tensor(kv_len)
+    bs_per_cp_group = 1
+    kv_len_origin = kv_len
+    # get zigzag index
+    cp_segment_num = cp_size * 2
+    seq_per_batch = kv_len // cp_segment_num  # seq_len for each batch and segment
+    split_list = seq_per_batch.repeat_interleave(cp_segment_num).int().tolist()
+    remainder = kv_len % (cp_segment_num)
+    if remainder > 0:
+        split_list[:remainder] = [x + 1 for x in split_list[:remainder]]
+
+    seq_max_rank_len = (kv_len + cp_size - 1) // cp_size
+    max_rank_len = seq_max_rank_len.repeat_interleave(cp_size).int().tolist()
+    zigzag_index = list(
+        range(cp_rank, cp_rank + bs_per_cp_group * cp_segment_num, cp_segment_num)
+    ) + list(
+        range(
+            cp_segment_num - cp_rank - 1,
+            bs_per_cp_group * cp_segment_num,
+            cp_segment_num,
+        )
+    )
+
+    per_rank_actual_token = list(
+        split_list[i] + split_list[cp_size * 2 - i - 1] for i in range(cp_size)
+    )
+    reverse_split_len = [
+        element
+        for i in range(cp_size)
+        for element in (split_list[i], split_list[cp_size * 2 - i - 1])
+    ]
+    # get zigzag reverse index
+    cp_reverse_index = []
+    for batch_id in range(bs_per_cp_group):
+        cp_reverse_index.extend(
+            list(range(batch_id, cp_segment_num * bs_per_cp_group, 2 * bs_per_cp_group))
+            + list(
+                range(
+                    (cp_segment_num - 1) * bs_per_cp_group + batch_id,
+                    0,
+                    -2 * bs_per_cp_group,
+                )
+            )
+        )
+    prefix_sum_list = list(accumulate(split_list))
+
+    # TODO Support multi-batch-cp-split, multi-batch-cp support has accuracy issues
+    # cp_seq_index = calculate_cp_seq_idx(split_list[:], seqs_len[:])
+    kv_len_prev = prefix_sum_list[cp_rank]
+    kv_len_next = prefix_sum_list[cp_size * 2 - cp_rank - 1]
+    actual_seq_q_prev = split_list[cp_rank]
+    actual_seq_q_next = split_list[cp_size * 2 - cp_rank - 1]
+    kv_len_prev_tensor = torch.tensor(kv_len_prev).to(device="cuda", dtype=torch.int32)
+    kv_len_next_tensor = torch.tensor(kv_len_next).to(device="cuda", dtype=torch.int32)
+    actual_seq_q_prev_tensor = torch.tensor(actual_seq_q_prev).to(
+        device="cuda", dtype=torch.int32
+    )
+    actual_seq_q_next_tensor = torch.tensor(actual_seq_q_next).to(
+        device="cuda", dtype=torch.int32
+    )
+
+    nsa_cp_metadata = NSAContextParallelMetadata(
+        split_list=split_list,
+        max_rank_len=max_rank_len,
+        zigzag_index=zigzag_index,
+        per_rank_actual_token=per_rank_actual_token,
+        reverse_split_len=reverse_split_len,
+        cp_reverse_index=cp_reverse_index,
+        kv_len_prev=kv_len_prev,
+        kv_len_next=kv_len_next,
+        actual_seq_q_prev=actual_seq_q_prev,
+        actual_seq_q_next=actual_seq_q_next,
+        kv_len_prev_tensor=kv_len_prev_tensor,
+        kv_len_next_tensor=kv_len_next_tensor,
+        actual_seq_q_prev_tensor=actual_seq_q_prev_tensor,
+        actual_seq_q_next_tensor=actual_seq_q_next_tensor,
+        total_seq_lens=kv_len_origin,
+    )
+    return nsa_cp_metadata
diff --git a/sglang/python/sglang/srt/layers/attention/nsa_backend.py b/sglang/python/sglang/srt/layers/attention/nsa_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e76cf608ed65e579e8107983f265fc500ee3e71
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/nsa_backend.py
@@ -0,0 +1,2371 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from enum import IntEnum, auto
+from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, TypeAlias
+
+import torch
+
+from sglang.srt.configs.model_config import get_nsa_index_topk, is_deepseek_nsa
+from sglang.srt.environ import envs
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+from sglang.srt.layers.attention.nsa.dequant_k_cache import dequantize_k_cache_paged
+from sglang.srt.layers.attention.nsa.nsa_backend_mtp_precompute import (
+    NativeSparseAttnBackendMTPPrecomputeMixin,
+    PrecomputedMetadata,
+    compute_cu_seqlens,
+)
+from sglang.srt.layers.attention.nsa.nsa_indexer import BaseIndexerMetadata
+from sglang.srt.layers.attention.nsa.nsa_mtp_verification import (
+    verify_multi_backend_fused_metadata_copy,
+    verify_single_backend_fused_metadata_copy,
+)
+from sglang.srt.layers.attention.nsa.quant_k_cache import quantize_k_cache
+from sglang.srt.layers.attention.nsa.transform_index import (
+    transform_index_page_table_decode,
+    transform_index_page_table_prefill,
+)
+from sglang.srt.layers.attention.nsa.utils import (
+    can_nsa_prefill_cp_round_robin_split,
+    compute_nsa_seqlens,
+    is_nsa_enable_prefill_cp,
+    nsa_cp_round_robin_split_data,
+    nsa_cp_round_robin_split_q_seqs,
+    pad_nsa_cache_seqlens,
+)
+from sglang.srt.layers.attention.utils import (
+    concat_mla_absorb_q_general,
+    mla_quantize_and_rope_for_fp8,
+    seqlens_expand_triton,
+)
+from sglang.srt.layers.dp_attention import get_attention_tp_size
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+from sglang.srt.utils import is_cuda, is_hip
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.radix_attention import RadixAttention
+    from sglang.srt.model_executor.model_runner import ModelRunner
+    from sglang.srt.speculative.spec_info import SpecInput
+
+
+_is_hip = is_hip()
+
+if _is_hip:
+    from sglang.srt.layers.attention.nsa.triton_kernel import get_valid_kv_indices
+
+    try:
+        from aiter import (  # noqa: F401
+            flash_attn_varlen_func,
+            mha_batch_prefill_func,
+            paged_attention_ragged,
+        )
+        from aiter.mla import mla_decode_fwd, mla_prefill_fwd  # noqa: F401
+    except ImportError:
+        print(
+            "aiter is AMD specific kernel library. Please make sure aiter is installed on your AMD device."
+        )
+else:
+    from sgl_kernel.flash_attn import flash_attn_varlen_func, flash_attn_with_kvcache
+
+
+# Reuse this workspace buffer across all NSA backend instances
+global_workspace_buffer = None
+
+# Control whether to use fused metadata copy kernel (default: enabled)
+# Set SGLANG_USE_FUSED_METADATA_COPY=0 or false to disable
+_USE_FUSED_METADATA_COPY = envs.SGLANG_USE_FUSED_METADATA_COPY.get() and not _is_hip
+
+# Control whether to verify fused metadata copy against individual copies (default: disabled)
+# Set SGLANG_VERIFY_FUSED_METADATA_COPY=1 or true to enable verification
+# This will crash with detailed error message if any inconsistency is detected
+_VERIFY_FUSED_METADATA_COPY = envs.SGLANG_VERIFY_FUSED_METADATA_COPY.get()
+
+
+@dataclass(frozen=True)
+class NSAFlashMLAMetadata:
+    """Metadata only needed by FlashMLA"""
+
+    flashmla_metadata: torch.Tensor
+    num_splits: torch.Tensor
+
+    def slice(self, sli):
+        return NSAFlashMLAMetadata(
+            flashmla_metadata=self.flashmla_metadata,
+            num_splits=self.num_splits[sli],
+        )
+
+    def copy_(self, other: "NSAFlashMLAMetadata"):
+        self.flashmla_metadata.copy_(other.flashmla_metadata)
+        self.num_splits.copy_(other.num_splits)
+
+
+@dataclass(frozen=True)
+class NSAMetadata:
+    page_size: int
+
+    # Sequence lengths for the forward batch
+    cache_seqlens_int32: torch.Tensor
+    # Maximum sequence length for query
+    max_seq_len_q: int
+    # Maximum sequence length for key
+    max_seq_len_k: int
+    # Cumulative sequence lengths for query
+    cu_seqlens_q: torch.Tensor
+    # Cumulative sequence lengths for key
+    cu_seqlens_k: torch.Tensor
+    # Page table, the index of KV Cache Tables/Blocks
+    # this table is always with page_size = 1
+    page_table_1: torch.Tensor
+
+    # NOTE(dark): This will property be used in:
+    # 1. dense decode/prefill, we use paged flash attention, need real_page_table
+    # 2. sparse decode/prefill, indexer need real_page_table to compute the score
+    real_page_table: torch.Tensor
+
+    # NSA metadata (nsa prefill are expanded)
+    nsa_cache_seqlens_int32: torch.Tensor  # this seqlens is clipped to `topk`
+    nsa_cu_seqlens_q: torch.Tensor  # must be arange(0, len(nsa_cu_seqlens_k))
+    nsa_cu_seqlens_k: torch.Tensor  # cumsum of `nsa_cache_seqlens_int32`
+    nsa_extend_seq_lens_list: List[int]
+    nsa_seqlens_expanded: torch.Tensor  # expanded, unclipped `seqlens`
+    nsa_max_seqlen_q: Literal[1] = 1  # always 1 for decode, variable for extend
+
+    flashmla_metadata: Optional[NSAFlashMLAMetadata] = None
+    # DeepGEMM schedule metadata for paged MQA logits (decode/target_verify/draft_extend only).
+    # Precomputed once per forward batch and reused across layers.
+    paged_mqa_schedule_metadata: Optional[torch.Tensor] = None
+    # The sum of sequence lengths for key, prefill only
+    seq_lens_sum: Optional[int] = None
+    # The flattened 1D page table with shape (seq_lens_sum,), prefill only
+    # this table is always with page_size = 1
+    page_table_1_flattened: Optional[torch.Tensor] = None
+    # The offset of topk indices in ragged kv, prefill only
+    # shape: (seq_lens_sum,)
+    topk_indices_offset: Optional[torch.Tensor] = None
+
+    # k_start and k_end in kv cache for each token.
+    indexer_k_start_end: Optional[Tuple[torch.Tensor, torch.Tensor]] = None
+    # seq lens for each batch.
+    indexer_seq_lens_cpu: Optional[torch.Tensor] = None
+    # batch index for each token.
+    token_to_batch_idx: Optional[torch.Tensor] = None
+
+
+class TopkTransformMethod(IntEnum):
+    # Transform topk indices to indices to the page table (page_size = 1)
+    PAGED = auto()
+    # Transform topk indices to indices to ragged kv (non-paged)
+    RAGGED = auto()
+
+
+@torch.compile
+def _compiled_cat(tensors: list[torch.Tensor], dim: int = -1) -> torch.Tensor:
+    return torch.cat(tensors, dim=dim)
+
+
+def _cat(tensors: list[torch.Tensor], dim: int = -1) -> torch.Tensor:
+    """
+    Concatenate two tensors along the last dimension.
+    Use this function to concatenate q_nope and q_rope or k_nope and k_rope.
+    """
+    assert len(tensors) == 2
+
+    qk_nope, qk_rope = tensors
+    assert qk_nope.ndim == 3 and qk_rope.ndim == 3
+
+    torch._dynamo.mark_dynamic(qk_nope, 0)
+    torch._dynamo.mark_dynamic(qk_rope, 0)
+
+    return _compiled_cat([qk_nope, qk_rope], dim=dim)
+
+
+@dataclass(frozen=True)
+class NSAIndexerMetadata(BaseIndexerMetadata):
+    attn_metadata: NSAMetadata
+    topk_transform_method: TopkTransformMethod
+    paged_mqa_schedule_metadata: Optional[torch.Tensor] = None
+
+    def get_seqlens_int32(self) -> torch.Tensor:
+        return self.attn_metadata.cache_seqlens_int32
+
+    def get_page_table_64(self) -> torch.Tensor:
+        return self.attn_metadata.real_page_table
+
+    def get_page_table_1(self) -> torch.Tensor:
+        return self.attn_metadata.page_table_1
+
+    def get_seqlens_expanded(self) -> torch.Tensor:
+        return self.attn_metadata.nsa_seqlens_expanded
+
+    def get_cu_seqlens_k(self) -> torch.Tensor:
+        return self.attn_metadata.cu_seqlens_k
+
+    def get_indexer_kvcache_range(self) -> Tuple[torch.Tensor, torch.Tensor]:
+        return self.attn_metadata.indexer_k_start_end
+
+    def get_indexer_seq_len_cpu(self) -> torch.Tensor:
+        return self.attn_metadata.indexer_seq_lens_cpu
+
+    def get_nsa_extend_len_cpu(self) -> List[int]:
+        return self.attn_metadata.nsa_extend_seq_lens_list
+
+    def get_token_to_batch_idx(self) -> torch.Tensor:
+        return self.attn_metadata.token_to_batch_idx
+
+    def topk_transform(
+        self,
+        logits: torch.Tensor,
+        topk: int,
+        ks: Optional[torch.Tensor] = None,
+        cu_seqlens_q: torch.Tensor = None,
+        ke_offset: torch.Tensor = None,
+        batch_idx_list: List[int] = None,
+        topk_indices_offset_override: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        from sgl_kernel import (
+            fast_topk_transform_fused,
+            fast_topk_transform_ragged_fused,
+            fast_topk_v2,
+        )
+
+        if topk_indices_offset_override is not None:
+            cu_topk_indices_offset = topk_indices_offset_override
+            cu_seqlens_q_topk = None
+        elif cu_seqlens_q is not None:
+            cu_seqlens_q = cu_seqlens_q.to(torch.int32)
+            cu_seqlens_q_topk = compute_cu_seqlens(cu_seqlens_q)
+            cu_topk_indices_offset = torch.repeat_interleave(
+                cu_seqlens_q_topk[:-1],
+                cu_seqlens_q,
+            )
+        else:
+            cu_seqlens_q_topk = self.attn_metadata.cu_seqlens_q
+            cu_topk_indices_offset = self.attn_metadata.topk_indices_offset
+        if ke_offset is not None:
+            seq_lens_topk = ke_offset
+        else:
+            seq_lens_topk = self.get_seqlens_expanded()
+        if batch_idx_list is not None:
+            page_table_size_1 = self.attn_metadata.page_table_1[batch_idx_list]
+        else:
+            page_table_size_1 = self.attn_metadata.page_table_1
+
+        if not envs.SGLANG_NSA_FUSE_TOPK.get():
+            return fast_topk_v2(logits, seq_lens_topk, topk, row_starts=ks)
+        elif self.topk_transform_method == TopkTransformMethod.PAGED:
+            # NOTE(dark): if fused, we return a transformed page table directly
+            return fast_topk_transform_fused(
+                score=logits,
+                lengths=seq_lens_topk,
+                page_table_size_1=page_table_size_1,
+                cu_seqlens_q=cu_seqlens_q_topk,
+                topk=topk,
+                row_starts=ks,
+            )
+        elif self.topk_transform_method == TopkTransformMethod.RAGGED:
+            return fast_topk_transform_ragged_fused(
+                score=logits,
+                lengths=seq_lens_topk,
+                topk_indices_offset=cu_topk_indices_offset,
+                topk=topk,
+                row_starts=ks,
+            )
+        else:
+            assert False, f"Unsupported {self.topk_transform_method = }"
+
+
+_NSA_IMPL_T: TypeAlias = Literal[
+    "flashmla_sparse", "flashmla_kv", "fa3", "tilelang", "trtllm"
+]
+
+
+class NativeSparseAttnBackend(
+    NativeSparseAttnBackendMTPPrecomputeMixin, AttentionBackend
+):
+    def __init__(
+        self,
+        model_runner: ModelRunner,
+        skip_prefill: bool = False,
+        speculative_step_id=0,
+        topk=0,
+        speculative_num_steps=0,
+    ):
+        super().__init__()
+        self.forward_metadata: NSAMetadata
+        self.device = model_runner.device
+        assert isinstance(model_runner.page_size, int)
+        self.real_page_size = model_runner.page_size
+        self.num_splits = (
+            1 if model_runner.server_args.enable_deterministic_inference else 0
+        )
+        self.use_nsa = is_deepseek_nsa(model_runner.model_config.hf_config)
+        assert self.use_nsa, "NSA backend only supports DeepSeek NSA"
+        self.nsa_kv_cache_store_fp8 = (
+            model_runner.token_to_kv_pool.nsa_kv_cache_store_fp8
+        )
+        self.nsa_index_topk = get_nsa_index_topk(model_runner.model_config.hf_config)
+        self.max_context_len = model_runner.model_config.context_len
+        self.num_q_heads = (
+            model_runner.model_config.num_attention_heads // get_attention_tp_size()
+        )
+        self.kv_cache_dim = model_runner.token_to_kv_pool.kv_cache_dim
+        self.qk_nope_head_dim = model_runner.model_config.qk_nope_head_dim
+        self.kv_lora_rank = model_runner.model_config.kv_lora_rank
+        self.qk_rope_head_dim = model_runner.model_config.qk_rope_head_dim
+
+        assert model_runner.req_to_token_pool is not None
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+
+        self.use_mha: bool = False
+        # Force NSA prefill to use MLA (i.e. disable MHA_ONE_SHOT), controlled by env var.
+        self._force_attn_forward_mla: bool = envs.SGLANG_NSA_FORCE_MLA.get()
+        self.nsa_prefill_impl: _NSA_IMPL_T = (
+            model_runner.server_args.nsa_prefill_backend
+        )
+        self.nsa_decode_impl: _NSA_IMPL_T = model_runner.server_args.nsa_decode_backend
+        self.enable_auto_select_prefill_impl = self.nsa_prefill_impl == "flashmla_auto"
+
+        self._arange_buf = torch.arange(16384, device=self.device, dtype=torch.int32)
+
+        if _is_hip:
+            max_bs = model_runner.req_to_token_pool.size
+
+            self.kv_indptr = torch.zeros(
+                (max_bs + 1,), dtype=torch.int32, device=model_runner.device
+            )
+
+            self.kv_indices = torch.zeros(
+                max_bs * self.nsa_index_topk,
+                dtype=torch.int32,
+                device=self.device,
+            )
+
+        # Speculative decoding
+        self.topk = model_runner.server_args.speculative_eagle_topk or 0
+        self.speculative_num_steps = speculative_num_steps
+        self.speculative_num_draft_tokens = (
+            model_runner.server_args.speculative_num_draft_tokens
+        )
+        self.speculative_step_id = speculative_step_id
+
+        self.device_capability = torch.cuda.get_device_capability()
+        self.device_sm_major = self.device_capability[0]
+        self.kv_cache_dtype = model_runner.kv_cache_dtype
+
+        # Allocate global workspace buffer for TRT-LLM kernels (ragged attention on SM100/B200, or trtllm decode)
+        if self.device_sm_major >= 10 or self.nsa_decode_impl == "trtllm":
+            global global_workspace_buffer
+            if global_workspace_buffer is None:
+                global_workspace_buffer = torch.empty(
+                    envs.SGLANG_FLASHINFER_WORKSPACE_SIZE.get(),
+                    dtype=torch.uint8,
+                    device=model_runner.device,
+                )
+            self.workspace_buffer = global_workspace_buffer
+        else:
+            self.workspace_buffer = None
+
+    def get_device_int32_arange(self, l: int) -> torch.Tensor:
+        if l > len(self._arange_buf):
+            next_pow_of_2 = 1 << (l - 1).bit_length()
+            self._arange_buf = torch.arange(
+                next_pow_of_2, device=self.device, dtype=torch.int32
+            )
+        return self._arange_buf[:l]
+
+    def _transform_table_1_to_real(self, page_table: torch.Tensor) -> torch.Tensor:
+        page_size = self.real_page_size
+        if page_size == 1:
+            return page_table
+        max_seqlen_k = page_table.shape[1]
+        strided_indices = torch.arange(
+            0, max_seqlen_k, page_size, device=page_table.device, dtype=torch.int32
+        )
+        return page_table[:, strided_indices] // page_size
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        """Init the metadata for a forward pass."""
+        batch_size = forward_batch.batch_size
+        device = forward_batch.seq_lens.device
+
+        if forward_batch.forward_mode.is_target_verify():
+            draft_token_num = self.speculative_num_draft_tokens
+        else:
+            draft_token_num = 0
+
+        cache_seqlens_int32 = (forward_batch.seq_lens + draft_token_num).to(torch.int32)
+        cu_seqlens_k = compute_cu_seqlens(cache_seqlens_int32)
+        assert forward_batch.seq_lens_cpu is not None
+        max_seqlen_k = int(forward_batch.seq_lens_cpu.max().item() + draft_token_num)
+        # [b, max_seqlen_k]
+        page_table = forward_batch.req_to_token_pool.req_to_token[
+            forward_batch.req_pool_indices, :max_seqlen_k
+        ]
+
+        page_table_1_flattened = None
+        topk_indices_offset = None
+
+        # Centralized dispatch: decide all strategies for this batch
+        self.set_nsa_prefill_impl(forward_batch)
+        topk_transform_method = self.get_topk_transform_method()
+        # Batch indices selected when cp enabled: After splitting multiple sequences,
+        # a certain cp rank may not have some of these sequences.
+        # We use bs_idx_cpu to mark which sequences are finally selected by the current cp rank,
+        # a default value of None indicates that all sequences are selected.
+        bs_idx_cpu = None
+        # seq_len_cpu of selected sequences
+        indexer_seq_lens_cpu = forward_batch.seq_lens_cpu
+
+        if forward_batch.forward_mode.is_decode_or_idle():
+            extend_seq_lens_cpu = [1] * batch_size
+            max_seqlen_q = 1
+            cu_seqlens_q = self.get_device_int32_arange(batch_size + 1)
+            seqlens_expanded = cache_seqlens_int32
+        elif forward_batch.forward_mode.is_target_verify():
+            max_seqlen_q = 1
+            cu_seqlens_q = torch.arange(
+                0,
+                batch_size * self.speculative_num_draft_tokens + 1,
+                1,
+                dtype=torch.int32,
+                device=device,
+            )
+            extend_seq_lens_cpu = [self.speculative_num_draft_tokens] * batch_size
+            forward_batch.extend_seq_lens_cpu = extend_seq_lens_cpu
+
+            seqlens_expanded = seqlens_expand_triton(
+                torch.tensor(extend_seq_lens_cpu, dtype=torch.int32, device=device),
+                cache_seqlens_int32,
+                self.speculative_num_draft_tokens * batch_size,
+                self.speculative_num_draft_tokens,
+            )
+            page_table = torch.repeat_interleave(
+                page_table, repeats=self.speculative_num_draft_tokens, dim=0
+            )
+        elif forward_batch.forward_mode.is_draft_extend(include_v2=True):
+            assert (
+                forward_batch.extend_seq_lens_cpu is not None
+                and forward_batch.extend_seq_lens is not None
+                and forward_batch.extend_prefix_lens_cpu is not None
+            ), "All of them must not be None"
+
+            extend_seq_lens_cpu = forward_batch.extend_seq_lens_cpu
+            assert forward_batch.extend_seq_lens is not None
+
+            max_seqlen_q = 1
+            cu_seqlens_q = torch.arange(
+                0,
+                forward_batch.extend_num_tokens + 1,
+                1,
+                dtype=torch.int32,
+                device=device,
+            )
+
+            seqlens_expanded = seqlens_expand_triton(
+                forward_batch.extend_seq_lens,
+                cache_seqlens_int32,
+                sum(extend_seq_lens_cpu),
+                self.speculative_num_draft_tokens,
+            )
+            if forward_batch.forward_mode.is_draft_extend_v2():
+                # DRAFT_EXTEND_V2: V2 worker pre-fills draft KV cache with ALL speculated
+                # tokens upfront. All requests extend by the same fixed
+                # (speculative_num_draft_tokens). Use scalar to avoid GPU sync.
+                page_table = torch.repeat_interleave(
+                    page_table, repeats=self.speculative_num_draft_tokens, dim=0
+                )
+            else:
+                # DRAFT_EXTEND (v1): V1 worker extends by (accept_length + 1) per request
+                # after verification. Lengths vary per request based on how many tokens
+                # were accepted.
+                page_table = torch.repeat_interleave(
+                    page_table, repeats=forward_batch.extend_seq_lens, dim=0
+                )
+        elif forward_batch.forward_mode.is_extend():
+            assert (
+                forward_batch.extend_seq_lens_cpu is not None
+                and forward_batch.extend_seq_lens is not None
+                and forward_batch.extend_prefix_lens_cpu is not None
+            ), "All of them must not be None"
+            extend_seq_lens_cpu = forward_batch.extend_seq_lens_cpu
+            assert forward_batch.extend_seq_lens is not None
+            extend_seq_lens = forward_batch.extend_seq_lens
+
+            seqlens_expanded = torch.cat(
+                [
+                    torch.arange(
+                        kv_len - qo_len + 1,
+                        kv_len + 1,
+                        dtype=torch.int32,
+                        device=device,
+                    )
+                    for qo_len, kv_len in zip(
+                        forward_batch.extend_seq_lens_cpu,
+                        forward_batch.seq_lens_cpu.tolist(),
+                        strict=True,
+                    )
+                ]
+            )
+
+            if can_nsa_prefill_cp_round_robin_split(forward_batch):
+                seqlens_expanded = nsa_cp_round_robin_split_data(seqlens_expanded)
+                extend_seq_lens_cpu, extend_seq_lens, bs_idx_cpu, bs_idx = (
+                    nsa_cp_round_robin_split_q_seqs(
+                        extend_seq_lens_cpu, extend_seq_lens
+                    )
+                )
+                indexer_seq_lens_cpu = indexer_seq_lens_cpu[bs_idx_cpu]
+                cache_seqlens_int32 = cache_seqlens_int32[bs_idx]
+                cu_seqlens_k = compute_cu_seqlens(cache_seqlens_int32)
+                max_seqlen_k = (
+                    int(indexer_seq_lens_cpu.max().item() + draft_token_num)
+                    if len(indexer_seq_lens_cpu) != 0
+                    else 0
+                )
+                page_table = page_table[bs_idx, :max_seqlen_k]
+
+            if (
+                any(forward_batch.extend_prefix_lens_cpu)
+                or forward_batch.forward_mode == ForwardMode.DRAFT_EXTEND
+                or bs_idx_cpu is not None
+            ):
+                max_seqlen_q = (
+                    max(extend_seq_lens_cpu) if len(extend_seq_lens_cpu) != 0 else 1
+                )
+                cu_seqlens_q = compute_cu_seqlens(extend_seq_lens.to(torch.int32))
+            else:
+                max_seqlen_q = max_seqlen_k
+                cu_seqlens_q = cu_seqlens_k
+
+            # Check if MHA FP8 dequantization is needed
+            mha_dequantize_needed = (
+                self.use_mha
+                and forward_batch.token_to_kv_pool.dtype == torch.float8_e4m3fn
+            )
+            forward_batch.using_mha_one_shot_fp8_dequant = mha_dequantize_needed
+
+            # page_table_1_flattened is only used when prefix sharing is enabled:
+            has_prefix_sharing = any(forward_batch.extend_prefix_lens_cpu)
+            if has_prefix_sharing and (
+                topk_transform_method == TopkTransformMethod.RAGGED
+                or mha_dequantize_needed
+            ):
+                page_table_1_flattened = torch.cat(
+                    [
+                        page_table[i, :kv_len]
+                        for i, kv_len in enumerate(
+                            indexer_seq_lens_cpu.tolist(),
+                        )
+                    ]
+                )
+                assert page_table_1_flattened.shape[0] == sum(
+                    indexer_seq_lens_cpu
+                ), f"{page_table_1_flattened.shape[0] = } must be the same as {sum(indexer_seq_lens_cpu) = }"
+
+                # Validate indices when logical tokens exceed physical capacity
+                # This is likely to be triggered by PP with high kv reuse & parallelism
+                kv_cache_capacity = (
+                    forward_batch.token_to_kv_pool.size
+                    + forward_batch.token_to_kv_pool.page_size
+                )
+                if forward_batch.seq_lens_sum > kv_cache_capacity:
+                    max_idx = page_table_1_flattened.max().item()
+                    assert max_idx < kv_cache_capacity, (
+                        f"Invalid page table index: max={max_idx}, "
+                        f"kv_cache_capacity={kv_cache_capacity}"
+                    )
+
+            if topk_transform_method == TopkTransformMethod.RAGGED:
+                topk_indices_offset = torch.repeat_interleave(
+                    cu_seqlens_k[:-1],
+                    extend_seq_lens,
+                )
+        else:
+            assert False, f"Unsupported {forward_batch.forward_mode = }"
+
+        indexer_k_start_end, token_to_batch_idx = self._cal_indexer_k_start_end(
+            forward_batch, bs_idx_cpu
+        )
+        # 1D, expanded seqlens (1D means cheap to compute, so always compute it)
+        nsa_cache_seqlens_int32 = compute_nsa_seqlens(
+            original_seq_lens=seqlens_expanded,
+            nsa_index_topk=self.nsa_index_topk,
+        )
+        nsa_cache_seqlens_int32 = pad_nsa_cache_seqlens(
+            forward_batch, nsa_cache_seqlens_int32
+        )
+        nsa_cu_seqlens_k = compute_cu_seqlens(nsa_cache_seqlens_int32)
+        nsa_cu_seqlens_q = self.get_device_int32_arange(len(nsa_cu_seqlens_k))
+
+        paged_mqa_schedule_metadata = None
+        # DeepGEMM paged MQA logits path needs a schedule metadata tensor.
+        # Compute it once per forward batch and reuse it across layers.
+        if is_cuda() and (
+            forward_batch.forward_mode.is_decode_or_idle()
+            or forward_batch.forward_mode.is_target_verify()
+            or forward_batch.forward_mode.is_draft_extend()
+        ):
+            try:
+                import deep_gemm
+
+                # NOTE: DeepGEMM paged path uses block_size=64.
+                seqlens_32 = (
+                    seqlens_expanded
+                    if (
+                        forward_batch.forward_mode.is_target_verify()
+                        or forward_batch.forward_mode.is_draft_extend()
+                    )
+                    else cache_seqlens_int32
+                )
+                paged_mqa_schedule_metadata = deep_gemm.get_paged_mqa_logits_metadata(
+                    seqlens_32, 64, deep_gemm.get_num_sms()
+                )
+            except (ImportError, ModuleNotFoundError):
+                paged_mqa_schedule_metadata = None
+
+        metadata = NSAMetadata(
+            page_size=self.real_page_size,
+            cache_seqlens_int32=cache_seqlens_int32,
+            max_seq_len_q=max_seqlen_q,
+            max_seq_len_k=max_seqlen_k,
+            cu_seqlens_q=cu_seqlens_q,
+            cu_seqlens_k=cu_seqlens_k,
+            seq_lens_sum=forward_batch.seq_lens_sum,
+            page_table_1=page_table,
+            page_table_1_flattened=page_table_1_flattened,
+            flashmla_metadata=(
+                self._compute_flashmla_metadata(
+                    cache_seqlens=nsa_cache_seqlens_int32,
+                    seq_len_q=1,
+                )
+                if self.nsa_decode_impl == "flashmla_kv"
+                else None
+            ),
+            paged_mqa_schedule_metadata=paged_mqa_schedule_metadata,
+            nsa_cache_seqlens_int32=nsa_cache_seqlens_int32,
+            nsa_cu_seqlens_q=nsa_cu_seqlens_q,
+            nsa_cu_seqlens_k=nsa_cu_seqlens_k,
+            nsa_seqlens_expanded=seqlens_expanded,
+            nsa_extend_seq_lens_list=extend_seq_lens_cpu,
+            real_page_table=self._transform_table_1_to_real(page_table),
+            nsa_max_seqlen_q=1,
+            topk_indices_offset=topk_indices_offset,
+            indexer_k_start_end=indexer_k_start_end,
+            indexer_seq_lens_cpu=indexer_seq_lens_cpu,
+            token_to_batch_idx=token_to_batch_idx,
+        )
+        self.forward_metadata = metadata
+
+    def _cal_indexer_k_start_end(
+        self,
+        forward_batch: ForwardBatch,
+        bs_idx: Optional[List[int]] = None,
+    ):
+        if not forward_batch.forward_mode.is_extend_without_speculative():
+            return None, None
+        if forward_batch.batch_size == 0 or (bs_idx is not None and len(bs_idx) == 0):
+            empty_t = torch.empty(0, dtype=torch.int32, device=self.device)
+            return (empty_t, empty_t), empty_t
+
+        # Suppose there are two requests, with extend_seq_len = [3, 2]
+        # and seq_lens = [10, 4]
+        # The logits matrix looks like this, with * representing the valid logits
+        # and - representing the invalid logits:
+        #
+        #  ********--|----
+        #  *********-|----
+        #  **********|----
+        #  ----------|***-
+        #  ----------|****
+        #
+        # ks = [0, 0, 0, 10, 10]
+        # ke = [8, 9, 10, 13, 14]
+        ks_list = []
+        ke_list = []
+        token_to_batch_idx = []
+
+        q_offset = 0
+        k_offset = 0
+
+        assert (
+            forward_batch.seq_lens_cpu is not None
+            and forward_batch.extend_seq_lens_cpu is not None
+        )
+        for i in range(forward_batch.batch_size):
+            seq_len = forward_batch.seq_lens_cpu[i].item()
+            assert isinstance(seq_len, int)
+            extend_seq_len = forward_batch.extend_seq_lens_cpu[i]
+            ks = torch.full(
+                (extend_seq_len,), k_offset, dtype=torch.int32, device=self.device
+            )
+            kv_len = seq_len
+            if forward_batch.forward_mode.is_target_verify():
+                kv_len += self.speculative_num_draft_tokens
+            seq_lens_expanded = torch.arange(
+                kv_len - extend_seq_len + 1,
+                kv_len + 1,
+                dtype=torch.int32,
+                device=self.device,
+            )
+            ke = ks + seq_lens_expanded
+            ks_list.append(ks)
+            ke_list.append(ke)
+
+            # bi: The index within the selected batch bs_idx. Entries that were not selected are ignored.
+            bi = bs_idx.index(i) if (bs_idx is not None and i in bs_idx) else i
+            tb = torch.full(
+                (extend_seq_len,), bi, dtype=torch.int32, device=self.device
+            )
+            token_to_batch_idx.append(tb)
+
+            if bs_idx is None or i in bs_idx:  # skip batch not included in bs_idx
+                q_offset += extend_seq_len
+                k_offset += seq_len
+
+        ks = torch.cat(ks_list, dim=0)
+        ke = torch.cat(ke_list, dim=0)
+        token_to_batch_idx = torch.cat(token_to_batch_idx, dim=0)
+        if bs_idx is not None:
+            assert can_nsa_prefill_cp_round_robin_split(forward_batch)
+            ks = nsa_cp_round_robin_split_data(ks)
+            ke = nsa_cp_round_robin_split_data(ke)
+            token_to_batch_idx = nsa_cp_round_robin_split_data(token_to_batch_idx)
+        return (ks, ke), token_to_batch_idx
+
+    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
+        """Initialize CUDA graph state for the attention backend.
+
+        Args:
+            max_bs (int): Maximum batch size to support in CUDA graphs
+
+        This creates fixed-size tensors that will be reused during CUDA graph replay
+        to avoid memory allocations.
+        """
+        self.decode_cuda_graph_metadata: Dict = {
+            "cache_seqlens": torch.ones(
+                max_num_tokens, dtype=torch.int32, device=self.device
+            ),
+            "cu_seqlens_q": torch.arange(
+                0, max_bs + 1, dtype=torch.int32, device=self.device
+            ),
+            "cu_seqlens_k": torch.zeros(
+                max_bs + 1, dtype=torch.int32, device=self.device
+            ),
+            # fake page_table for sparse_prefill
+            # Add extra columns for speculative draft tokens to avoid
+            # overflow during target_verify when max_seqlen_k = seq_len + num_draft_tokens
+            "page_table": torch.zeros(
+                max_num_tokens,
+                self.max_context_len + (self.speculative_num_draft_tokens or 0),
+                dtype=torch.int32,
+                device=self.device,
+            ),
+            "flashmla_metadata": (
+                self._compute_flashmla_metadata(
+                    cache_seqlens=torch.ones(
+                        max_num_tokens, dtype=torch.int32, device=self.device
+                    ),
+                    seq_len_q=1,
+                )
+                if self.nsa_decode_impl == "flashmla_kv"
+                else None
+            ),
+        }
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInput],
+    ):
+        self.set_nsa_prefill_impl(forward_batch=None)
+
+        """Initialize forward metadata for capturing CUDA graph."""
+        if forward_mode.is_decode_or_idle():
+            # Normal Decode
+            # Get sequence information
+            cache_seqlens_int32 = seq_lens.to(torch.int32)
+            cu_seqlens_k = compute_cu_seqlens(cache_seqlens_int32)
+
+            # Use max context length for seq_len_k
+            page_table_1 = self.decode_cuda_graph_metadata["page_table"][:bs, :]
+            max_seqlen_q = 1
+            max_seqlen_k = page_table_1.shape[1]
+
+            # Precompute page table
+            # Precompute cumulative sequence lengths
+
+            # NOTE(dark): this is always arange, since we are decoding
+            cu_seqlens_q = self.decode_cuda_graph_metadata["cu_seqlens_q"][: bs + 1]
+            nsa_cache_seqlens_int32 = compute_nsa_seqlens(
+                cache_seqlens_int32, nsa_index_topk=self.nsa_index_topk
+            )
+
+            seqlens_expanded = cache_seqlens_int32
+            nsa_extend_seq_lens_list = [1] * num_tokens
+            if self.nsa_decode_impl == "flashmla_kv":
+                flashmla_metadata = self.decode_cuda_graph_metadata[
+                    "flashmla_metadata"
+                ].slice(slice(0, num_tokens + 1))
+                flashmla_metadata.copy_(
+                    self._compute_flashmla_metadata(
+                        cache_seqlens=nsa_cache_seqlens_int32,
+                        seq_len_q=1,
+                    )
+                )
+            else:
+                flashmla_metadata = None
+        elif forward_mode.is_target_verify() or forward_mode.is_draft_extend(
+            include_v2=True
+        ):
+            cache_seqlens_int32 = (seq_lens + self.speculative_num_draft_tokens).to(
+                torch.int32
+            )
+            cu_seqlens_k = compute_cu_seqlens(cache_seqlens_int32)
+            max_seqlen_q = 1
+            page_table_1 = self.decode_cuda_graph_metadata["page_table"][
+                : bs * self.speculative_num_draft_tokens, :
+            ]
+            max_seqlen_k = page_table_1.shape[1]
+
+            cu_seqlens_q = torch.arange(
+                0,
+                bs * self.speculative_num_draft_tokens + 1,
+                1,
+                dtype=torch.int32,
+                device=self.device,
+            )
+
+            extend_seq_lens_cpu = [self.speculative_num_draft_tokens] * bs
+
+            seqlens_int32_cpu = [
+                self.speculative_num_draft_tokens + kv_len
+                for kv_len in seq_lens.tolist()
+            ]
+            seqlens_expanded = torch.cat(
+                [
+                    torch.arange(
+                        kv_len - qo_len + 1,
+                        kv_len + 1,
+                        dtype=torch.int32,
+                        device=self.device,
+                    )
+                    for qo_len, kv_len in zip(
+                        extend_seq_lens_cpu,
+                        seqlens_int32_cpu,
+                        strict=True,
+                    )
+                ]
+            )
+            nsa_cache_seqlens_int32 = compute_nsa_seqlens(
+                seqlens_expanded, nsa_index_topk=self.nsa_index_topk
+            )
+            nsa_extend_seq_lens_list = [1] * bs * self.speculative_num_draft_tokens
+
+            if self.nsa_decode_impl == "flashmla_kv":
+                flashmla_metadata = self.decode_cuda_graph_metadata[
+                    "flashmla_metadata"
+                ].slice(slice(0, bs * self.speculative_num_draft_tokens + 1))
+
+                flashmla_metadata.copy_(
+                    self._compute_flashmla_metadata(
+                        cache_seqlens=nsa_cache_seqlens_int32,
+                        seq_len_q=1,
+                    )
+                )
+            else:
+                flashmla_metadata = None
+
+        nsa_cu_seqlens_k = compute_cu_seqlens(nsa_cache_seqlens_int32)
+        nsa_cu_seqlens_q = self.get_device_int32_arange(len(nsa_cu_seqlens_k))
+        real_page_table = self._transform_table_1_to_real(page_table_1)
+
+        paged_mqa_schedule_metadata = None
+        if is_cuda() and (
+            forward_mode.is_decode_or_idle()
+            or forward_mode.is_target_verify()
+            or forward_mode.is_draft_extend()
+        ):
+            try:
+                import deep_gemm
+
+                seqlens_32 = (
+                    seqlens_expanded
+                    if (
+                        forward_mode.is_target_verify()
+                        or forward_mode.is_draft_extend()
+                    )
+                    else cache_seqlens_int32
+                )
+                paged_mqa_schedule_metadata = deep_gemm.get_paged_mqa_logits_metadata(
+                    seqlens_32, 64, deep_gemm.get_num_sms()
+                )
+            except (ImportError, ModuleNotFoundError):
+                paged_mqa_schedule_metadata = None
+
+        metadata = NSAMetadata(
+            page_size=self.real_page_size,
+            cache_seqlens_int32=cache_seqlens_int32,
+            max_seq_len_q=max_seqlen_q,
+            max_seq_len_k=max_seqlen_k,
+            cu_seqlens_q=cu_seqlens_q,
+            cu_seqlens_k=cu_seqlens_k,
+            page_table_1=page_table_1,
+            flashmla_metadata=flashmla_metadata,
+            paged_mqa_schedule_metadata=paged_mqa_schedule_metadata,
+            nsa_cache_seqlens_int32=nsa_cache_seqlens_int32,
+            nsa_cu_seqlens_q=nsa_cu_seqlens_q,
+            nsa_cu_seqlens_k=nsa_cu_seqlens_k,
+            nsa_seqlens_expanded=seqlens_expanded,
+            real_page_table=real_page_table,
+            nsa_extend_seq_lens_list=nsa_extend_seq_lens_list,
+        )
+        self.decode_cuda_graph_metadata[bs] = metadata
+        self.forward_metadata = metadata
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInput],
+        seq_lens_cpu: Optional[torch.Tensor],
+        out_cache_loc: Optional[torch.Tensor] = None,
+    ):
+        """Initialize forward metadata for replaying CUDA graph."""
+        assert seq_lens_cpu is not None
+
+        self.set_nsa_prefill_impl(forward_batch=None)
+
+        seq_lens = seq_lens[:bs]
+        seq_lens_cpu = seq_lens_cpu[:bs]
+        req_pool_indices = req_pool_indices[:bs]
+
+        # Normal Decode
+        metadata: NSAMetadata = self.decode_cuda_graph_metadata[bs]
+        if forward_mode.is_decode_or_idle():
+            # Normal Decode
+            max_len = int(seq_lens_cpu.max().item())
+
+            cache_seqlens = seq_lens.to(torch.int32)
+            metadata.cache_seqlens_int32.copy_(cache_seqlens)
+            metadata.cu_seqlens_k[1:].copy_(
+                torch.cumsum(cache_seqlens, dim=0, dtype=torch.int32)
+            )
+            page_indices = self.req_to_token[req_pool_indices, :max_len]
+            metadata.page_table_1[:, :max_len].copy_(page_indices)
+            nsa_cache_seqlens = compute_nsa_seqlens(
+                cache_seqlens, nsa_index_topk=self.nsa_index_topk
+            )
+            metadata.nsa_cache_seqlens_int32.copy_(nsa_cache_seqlens)
+            seqlens_expanded = cache_seqlens
+        elif forward_mode.is_target_verify():
+            max_seqlen_k = int(
+                seq_lens_cpu.max().item() + self.speculative_num_draft_tokens
+            )
+
+            cache_seqlens = (seq_lens + self.speculative_num_draft_tokens).to(
+                torch.int32
+            )
+            metadata.cache_seqlens_int32.copy_(cache_seqlens)
+            metadata.cu_seqlens_k[1:].copy_(
+                torch.cumsum(cache_seqlens, dim=0, dtype=torch.int32)
+            )
+            page_indices = self.req_to_token[req_pool_indices, :max_seqlen_k]
+            page_indices = torch.repeat_interleave(
+                page_indices, repeats=self.speculative_num_draft_tokens, dim=0
+            )
+            metadata.page_table_1[:, :max_seqlen_k].copy_(page_indices)
+            extend_seq_lens_cpu = [self.speculative_num_draft_tokens] * bs
+
+            seqlens_expanded = seqlens_expand_triton(
+                torch.tensor(
+                    extend_seq_lens_cpu, dtype=torch.int32, device=self.device
+                ),
+                cache_seqlens,
+                self.speculative_num_draft_tokens * bs,
+                self.speculative_num_draft_tokens,
+            )
+            metadata.nsa_seqlens_expanded.copy_(seqlens_expanded)
+            nsa_cache_seqlens = compute_nsa_seqlens(
+                seqlens_expanded, self.nsa_index_topk
+            )
+            metadata.nsa_cache_seqlens_int32.copy_(nsa_cache_seqlens)
+        elif forward_mode.is_draft_extend(include_v2=True):
+            max_seqlen_k = int(seq_lens_cpu.max().item())
+            cache_seqlens = seq_lens.to(torch.int32)
+            metadata.cache_seqlens_int32.copy_(cache_seqlens)
+            metadata.cu_seqlens_k[1:].copy_(
+                torch.cumsum(cache_seqlens, dim=0, dtype=torch.int32)
+            )
+
+            extend_seq_lens = spec_info.accept_length[:bs]
+            extend_seq_lens_cpu = extend_seq_lens.tolist()
+
+            page_indices = self.req_to_token[req_pool_indices, :max_seqlen_k]
+            page_indices = torch.repeat_interleave(
+                page_indices, repeats=extend_seq_lens, dim=0
+            )
+            metadata.page_table_1[: page_indices.shape[0], :max_seqlen_k].copy_(
+                page_indices
+            )
+
+            seqlens_expanded = seqlens_expand_triton(
+                extend_seq_lens,
+                cache_seqlens,
+                sum(extend_seq_lens_cpu),
+                self.speculative_num_draft_tokens,
+            )
+            metadata.nsa_seqlens_expanded[: seqlens_expanded.shape[0]].copy_(
+                seqlens_expanded
+            )
+            nsa_cache_seqlens = compute_nsa_seqlens(
+                seqlens_expanded, self.nsa_index_topk
+            )
+            metadata.nsa_cache_seqlens_int32[: seqlens_expanded.shape[0]].copy_(
+                nsa_cache_seqlens
+            )
+
+        # Update DeepGEMM paged MQA schedule metadata outside the captured graph.
+        if is_cuda() and (
+            forward_mode.is_decode_or_idle()
+            or forward_mode.is_target_verify()
+            or forward_mode.is_draft_extend()
+        ):
+            try:
+                import deep_gemm
+
+                seqlens_32 = (
+                    seqlens_expanded
+                    if (
+                        forward_mode.is_target_verify()
+                        or forward_mode.is_draft_extend()
+                    )
+                    else metadata.cache_seqlens_int32
+                )
+                new_schedule = deep_gemm.get_paged_mqa_logits_metadata(
+                    seqlens_32, 64, deep_gemm.get_num_sms()
+                )
+                if metadata.paged_mqa_schedule_metadata is None:
+                    metadata.paged_mqa_schedule_metadata = new_schedule
+                else:
+                    metadata.paged_mqa_schedule_metadata.copy_(new_schedule)
+            except (ImportError, ModuleNotFoundError):
+                metadata.paged_mqa_schedule_metadata = None
+        seqlens_expanded_size = seqlens_expanded.shape[0]
+        assert (
+            metadata.nsa_cache_seqlens_int32 is not None
+            and metadata.nsa_cu_seqlens_k is not None
+            and self.nsa_index_topk is not None
+        )
+
+        metadata.nsa_cu_seqlens_k[1 : 1 + seqlens_expanded_size].copy_(
+            torch.cumsum(nsa_cache_seqlens, dim=0, dtype=torch.int32)
+        )
+        # NOTE(dark): (nsa-) cu_seqlens_q is always arange, no need to copy
+
+        assert self.real_page_size == metadata.page_size
+        if self.real_page_size > 1:
+            real_table = self._transform_table_1_to_real(page_indices)
+            new_rows = real_table.shape[0]
+            new_cols = real_table.shape[1]
+            metadata.real_page_table[:new_rows, :new_cols].copy_(real_table)
+        else:
+            assert metadata.real_page_table is metadata.page_table_1
+
+        if self.nsa_decode_impl == "flashmla_kv":
+            flashmla_metadata = metadata.flashmla_metadata.slice(
+                slice(0, seqlens_expanded_size + 1)
+            )
+            flashmla_metadata.copy_(
+                self._compute_flashmla_metadata(
+                    cache_seqlens=nsa_cache_seqlens,
+                    seq_len_q=1,
+                )
+            )
+
+        self.forward_metadata = metadata
+
+    def init_forward_metadata_replay_cuda_graph_from_precomputed(
+        self,
+        bs: int,
+        precomputed: PrecomputedMetadata,
+        forward_mode: ForwardMode,
+    ):
+        """Fast path: copy precomputed metadata to this backend's metadata.
+
+        This function only performs copy operations, no computation.
+
+        Args:
+            bs: Batch size
+            precomputed: Precomputed metadata to copy from
+            forward_mode: Forward mode
+        """
+        self.set_nsa_prefill_impl(forward_batch=None)
+
+        metadata = self.decode_cuda_graph_metadata[bs]
+
+        # Track whether fused kernel succeeded
+        fused_kernel_succeeded = False
+
+        # Use fused CUDA kernel for all copy operations
+        if _USE_FUSED_METADATA_COPY:
+            try:
+                from sglang.jit_kernel.fused_metadata_copy import (
+                    fused_metadata_copy_cuda,
+                )
+
+                # Map forward_mode to integer enum
+                if forward_mode.is_decode_or_idle():
+                    mode_int = 0  # DECODE
+                elif forward_mode.is_target_verify():
+                    mode_int = 1  # TARGET_VERIFY
+                elif forward_mode.is_draft_extend():
+                    mode_int = 2  # DRAFT_EXTEND
+                else:
+                    raise ValueError(f"Unsupported forward_mode: {forward_mode}")
+
+                # Prepare FlashMLA tensors if needed
+                flashmla_num_splits_src = None
+                flashmla_num_splits_dst = None
+                flashmla_metadata_src = None
+                flashmla_metadata_dst = None
+                if precomputed.flashmla_metadata is not None:
+                    flashmla_num_splits_src = precomputed.flashmla_metadata.num_splits
+                    flashmla_num_splits_dst = metadata.flashmla_metadata.num_splits
+                    flashmla_metadata_src = (
+                        precomputed.flashmla_metadata.flashmla_metadata
+                    )
+                    flashmla_metadata_dst = metadata.flashmla_metadata.flashmla_metadata
+
+                # Call fused kernel
+                fused_metadata_copy_cuda(
+                    # Source tensors
+                    precomputed.cache_seqlens,
+                    precomputed.cu_seqlens_k,
+                    precomputed.page_indices,
+                    precomputed.nsa_cache_seqlens,
+                    precomputed.seqlens_expanded,
+                    precomputed.nsa_cu_seqlens_k,
+                    precomputed.real_page_table,
+                    flashmla_num_splits_src,
+                    flashmla_metadata_src,
+                    # Destination tensors
+                    metadata.cache_seqlens_int32,
+                    metadata.cu_seqlens_k,
+                    metadata.page_table_1,
+                    metadata.nsa_cache_seqlens_int32,
+                    metadata.nsa_seqlens_expanded,
+                    metadata.nsa_cu_seqlens_k,
+                    (
+                        metadata.real_page_table
+                        if precomputed.real_page_table is not None
+                        else None
+                    ),
+                    flashmla_num_splits_dst,
+                    flashmla_metadata_dst,
+                    # Parameters
+                    mode_int,
+                    bs,
+                    precomputed.max_len,
+                    precomputed.max_seqlen_k,
+                    precomputed.seqlens_expanded_size,
+                )
+
+                # Successfully used fused kernel
+                fused_kernel_succeeded = True
+
+                # Verification: compare fused kernel results against individual copies
+                if _VERIFY_FUSED_METADATA_COPY:
+                    verify_single_backend_fused_metadata_copy(
+                        metadata=metadata,
+                        precomputed=precomputed,
+                        forward_mode=forward_mode,
+                        bs=bs,
+                        flashmla_num_splits_src=flashmla_num_splits_src,
+                        flashmla_metadata_src=flashmla_metadata_src,
+                        flashmla_num_splits_dst=flashmla_num_splits_dst,
+                        flashmla_metadata_dst=flashmla_metadata_dst,
+                    )
+            except ImportError:
+                print(
+                    "Warning: Fused metadata copy kernel not available, falling back to individual copies."
+                )
+            except Exception as e:
+                print(
+                    f"Warning: Fused metadata copy kernel failed with error: {e}, falling back to individual copies."
+                )
+
+        # Fallback to individual copy operations if fused kernel disabled or failed
+        if not fused_kernel_succeeded:
+            # Copy basic seqlens
+            metadata.cache_seqlens_int32.copy_(precomputed.cache_seqlens)
+            metadata.cu_seqlens_k[1:].copy_(precomputed.cu_seqlens_k[1:])
+
+            # Mode-specific copy logic
+            if forward_mode.is_decode_or_idle():
+                # Decode mode
+                metadata.page_table_1[:, : precomputed.max_len].copy_(
+                    precomputed.page_indices
+                )
+                metadata.nsa_cache_seqlens_int32.copy_(precomputed.nsa_cache_seqlens)
+                # seqlens_expanded is same as cache_seqlens (already copied)
+
+            elif forward_mode.is_target_verify():
+                # Target verify mode
+                metadata.page_table_1[:, : precomputed.max_seqlen_k].copy_(
+                    precomputed.page_indices
+                )
+                metadata.nsa_seqlens_expanded.copy_(precomputed.seqlens_expanded)
+                metadata.nsa_cache_seqlens_int32.copy_(precomputed.nsa_cache_seqlens)
+
+            elif forward_mode.is_draft_extend():
+                # Draft extend mode
+                rows = precomputed.page_indices.shape[0]
+                cols = precomputed.max_seqlen_k
+                metadata.page_table_1[:rows, :cols].copy_(precomputed.page_indices)
+
+                size = precomputed.seqlens_expanded_size
+                metadata.nsa_seqlens_expanded[:size].copy_(precomputed.seqlens_expanded)
+                metadata.nsa_cache_seqlens_int32[:size].copy_(
+                    precomputed.nsa_cache_seqlens
+                )
+
+            # Copy NSA cu_seqlens
+            size = precomputed.seqlens_expanded_size
+            metadata.nsa_cu_seqlens_k[1 : 1 + size].copy_(
+                precomputed.nsa_cu_seqlens_k[1 : 1 + size]
+            )
+
+            # Copy real page table
+            if precomputed.real_page_table is not None:
+                rows, cols = precomputed.real_page_table.shape
+                metadata.real_page_table[:rows, :cols].copy_(
+                    precomputed.real_page_table
+                )
+
+            # Copy FlashMLA metadata in fallback path
+            if precomputed.flashmla_metadata is not None:
+                size = precomputed.seqlens_expanded_size
+                flashmla_metadata = metadata.flashmla_metadata.slice(slice(0, size + 1))
+                flashmla_metadata.copy_(precomputed.flashmla_metadata)
+
+        self.forward_metadata = metadata
+
+    def forward_extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+        # For multi-head latent attention
+        q_rope: Optional[torch.Tensor] = None,
+        k_rope: Optional[torch.Tensor] = None,
+        topk_indices: Optional[torch.Tensor] = None,
+        cos_sin_cache: Optional[torch.Tensor] = None,
+        is_neox: Optional[bool] = False,
+        llama_4_scaling: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+
+        causal = not layer.is_cross_attention
+        metadata = self.forward_metadata
+        assert causal, "NSA is causal only"
+
+        nsa_impl = (
+            self.nsa_decode_impl
+            if (
+                forward_batch.forward_mode.is_target_verify()
+                or forward_batch.forward_mode.is_draft_extend(include_v2=True)
+            )
+            else self.nsa_prefill_impl
+        )
+
+        if nsa_impl == "trtllm" and not self.use_mha:
+            return self._forward_trtllm(
+                q,
+                k,
+                v,
+                layer,
+                forward_batch,
+                metadata.nsa_cache_seqlens_int32,
+                save_kv_cache,
+                q_rope,
+                k_rope,
+                topk_indices,
+                cos_sin_cache,
+                is_neox,
+                llama_4_scaling,
+            )
+
+        if k is not None:
+            assert v is not None
+            if save_kv_cache:
+                cache_loc = (
+                    forward_batch.out_cache_loc
+                    if not layer.is_cross_attention
+                    else forward_batch.encoder_out_cache_loc
+                )
+                forward_batch.token_to_kv_pool.set_mla_kv_buffer(  # type: ignore
+                    layer,
+                    cache_loc,
+                    k,
+                    k_rope,
+                )
+
+        # Use MHA kernel if in MHA_ONE_SHOT mode
+        if self.use_mha:
+            assert k is not None and v is not None
+            assert q_rope is None, "MHA_ONE_SHOT path should not pass q_rope"
+            assert (
+                layer.tp_k_head_num == layer.tp_q_head_num > 1
+            ), "MHA_ONE_SHOT requires dense multi-head config"
+            return self._forward_standard_mha(
+                q=q,
+                k=k,
+                v=v,
+                layer=layer,
+                forward_batch=forward_batch,
+                metadata=metadata,
+            )
+
+        # Do absorbed multi-latent attention (MLA path)
+        assert q_rope is not None
+        kv_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
+
+        if q_rope is not None:
+            q_nope = q.view(-1, layer.tp_q_head_num, layer.v_head_dim)
+            q_rope = q_rope.view(
+                -1, layer.tp_q_head_num, layer.head_dim - layer.v_head_dim
+            )
+        else:
+            q_all = q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim)
+            q_nope = q_all[:, :, : layer.v_head_dim]
+            q_rope = q_all[:, :, layer.v_head_dim :]
+
+        # Align topk_indices with q dimensions
+        # This handles cases where q is padded (TP + partial DP attention)
+        if topk_indices is not None:
+            topk_indices = self._pad_topk_indices(topk_indices, q_nope.shape[0])
+
+        # NOTE(dark): here, we use page size = 1
+        topk_transform_method = self.get_topk_transform_method()
+        if envs.SGLANG_NSA_FUSE_TOPK.get():
+            page_table_1 = topk_indices
+        else:
+            if topk_transform_method == TopkTransformMethod.RAGGED:
+                topk_indices_offset = metadata.topk_indices_offset
+                assert topk_indices_offset is not None
+                mask = topk_indices != -1
+                topk_indices_offset = (
+                    topk_indices_offset.unsqueeze(1)
+                    if topk_indices_offset.ndim == 1
+                    else topk_indices_offset
+                )
+                topk_indices = torch.where(
+                    mask, topk_indices + topk_indices_offset, topk_indices
+                )
+            elif topk_transform_method == TopkTransformMethod.PAGED:
+                assert metadata.nsa_extend_seq_lens_list is not None
+                page_table_1 = transform_index_page_table_prefill(
+                    page_table=metadata.page_table_1,
+                    topk_indices=topk_indices,
+                    extend_lens_cpu=metadata.nsa_extend_seq_lens_list,
+                    page_size=1,
+                )
+
+        if nsa_impl == "tilelang":
+            if q_rope is not None:
+                q_all = concat_mla_absorb_q_general(q_nope, q_rope)
+            return self._forward_tilelang(
+                q_all=q_all,
+                kv_cache=kv_cache,
+                page_table_1=page_table_1,
+                sm_scale=layer.scaling,
+                v_head_dim=layer.v_head_dim,
+            )
+        elif nsa_impl == "flashmla_sparse":
+            if q_rope is not None:
+                q_all = concat_mla_absorb_q_general(q_nope, q_rope)
+
+            if topk_transform_method == TopkTransformMethod.RAGGED:
+                if any(forward_batch.extend_prefix_lens_cpu):
+                    page_table_1_flattened = (
+                        self.forward_metadata.page_table_1_flattened
+                    )
+                    assert page_table_1_flattened is not None
+                    kv_cache = dequantize_k_cache_paged(
+                        kv_cache, page_table_1_flattened
+                    )
+                else:
+                    kv_cache = _cat([k, k_rope], dim=-1)
+                page_table_1 = topk_indices
+
+            return self._forward_flashmla_sparse(
+                q_all=q_all,
+                kv_cache=kv_cache,
+                page_table_1=page_table_1,
+                sm_scale=layer.scaling,
+                v_head_dim=layer.v_head_dim,
+            )
+        elif nsa_impl == "flashmla_kv":
+            if q_rope is not None:
+                q_all = concat_mla_absorb_q_general(q_nope, q_rope)
+            return self._forward_flashmla_kv(
+                q_all=q_all,
+                kv_cache=kv_cache,
+                sm_scale=layer.scaling,
+                v_head_dim=layer.v_head_dim,
+                # TODO optimize args
+                layer=layer,
+                metadata=metadata,
+                page_table_1=page_table_1,
+            )
+        elif nsa_impl == "fa3":
+            return self._forward_fa3(
+                q_rope=q_rope,
+                kv_cache=kv_cache,
+                v_head_dim=layer.v_head_dim,
+                q_nope=q_nope,
+                page_table=page_table_1,
+                cache_seqlens=metadata.nsa_cache_seqlens_int32,
+                cu_seqlens_q=metadata.nsa_cu_seqlens_q,
+                cu_seqlens_k=metadata.nsa_cu_seqlens_k,
+                max_seqlen_q=metadata.nsa_max_seqlen_q,
+                sm_scale=layer.scaling,
+                logit_cap=layer.logit_cap,
+                page_size=1,
+            )
+        elif nsa_impl == "aiter":
+            if q_rope is not None:
+                q_all = torch.cat([q_nope, q_rope], dim=-1)
+            return self._forward_aiter_extend(
+                q_all=q_all,
+                kv_cache=kv_cache,
+                page_table_1=page_table_1,
+                layer=layer,
+            )
+        else:
+            raise ValueError(
+                f"Unsupported {nsa_impl = } for forward_extend. Consider using an other attention backend."
+            )
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+        # For multi-head latent attention
+        q_rope: Optional[torch.Tensor] = None,
+        k_rope: Optional[torch.Tensor] = None,
+        topk_indices: Optional[torch.Tensor] = None,
+        cos_sin_cache: Optional[torch.Tensor] = None,
+        is_neox: Optional[bool] = False,
+        llama_4_scaling: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+
+        causal = not layer.is_cross_attention
+        metadata = self.forward_metadata
+        assert causal, "NSA is causal only"
+
+        if self.nsa_decode_impl == "trtllm":
+            return self._forward_trtllm(
+                q,
+                k,
+                v,
+                layer,
+                forward_batch,
+                metadata.cache_seqlens_int32,
+                save_kv_cache,
+                q_rope,
+                k_rope,
+                topk_indices,
+                cos_sin_cache,
+                is_neox,
+                llama_4_scaling,
+            )
+
+        if k is not None:
+            assert v is not None
+            if save_kv_cache:
+                cache_loc = (
+                    forward_batch.out_cache_loc
+                    if not layer.is_cross_attention
+                    else forward_batch.encoder_out_cache_loc
+                )
+                forward_batch.token_to_kv_pool.set_mla_kv_buffer(  # type: ignore
+                    layer,
+                    cache_loc,
+                    k,
+                    k_rope,
+                )
+
+        # Do absorbed multi-latent attention
+        kv_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
+        if q_rope is not None:
+            q_nope = q.view(-1, layer.tp_q_head_num, layer.v_head_dim)
+            q_rope = q_rope.view(
+                -1, layer.tp_q_head_num, layer.head_dim - layer.v_head_dim
+            )
+        else:
+            q_all = q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim)
+            q_nope = q_all[:, :, : layer.v_head_dim]
+            q_rope = q_all[:, :, layer.v_head_dim :]
+
+        # Align topk_indices with q dimensions
+        if topk_indices is not None:
+            topk_indices = self._pad_topk_indices(topk_indices, q_nope.shape[0])
+
+        if envs.SGLANG_NSA_FUSE_TOPK.get():
+            page_table_1 = topk_indices
+        else:
+            page_table_1 = transform_index_page_table_decode(
+                page_table=metadata.page_table_1,
+                topk_indices=topk_indices,
+                page_size=1,
+            )
+
+        if self.nsa_decode_impl == "flashmla_sparse":
+            if q_rope is not None:
+                q_all = concat_mla_absorb_q_general(q_nope, q_rope)
+            return self._forward_flashmla_sparse(
+                q_all=q_all,
+                kv_cache=kv_cache,
+                page_table_1=page_table_1,
+                sm_scale=layer.scaling,
+                v_head_dim=layer.v_head_dim,
+            )
+        elif self.nsa_decode_impl == "flashmla_kv":
+            if q_rope is not None:
+                q_all = concat_mla_absorb_q_general(q_nope, q_rope)
+            return self._forward_flashmla_kv(
+                q_all=q_all,
+                kv_cache=kv_cache,
+                sm_scale=layer.scaling,
+                v_head_dim=layer.v_head_dim,
+                # TODO optimize args
+                layer=layer,
+                metadata=metadata,
+                page_table_1=page_table_1,
+            )
+        elif self.nsa_decode_impl == "tilelang":
+            if q_rope is not None:
+                q_all = concat_mla_absorb_q_general(q_nope, q_rope)
+            return self._forward_tilelang(
+                q_all=q_all,
+                kv_cache=kv_cache,
+                page_table_1=page_table_1,
+                sm_scale=layer.scaling,
+                v_head_dim=layer.v_head_dim,
+            )
+        elif self.nsa_decode_impl == "fa3":
+            return self._forward_fa3(
+                q_rope=q_rope,
+                kv_cache=kv_cache,
+                v_head_dim=layer.v_head_dim,
+                q_nope=q_nope,
+                page_table=page_table_1,
+                cache_seqlens=metadata.nsa_cache_seqlens_int32,
+                cu_seqlens_q=metadata.nsa_cu_seqlens_q,
+                cu_seqlens_k=metadata.nsa_cu_seqlens_k,
+                max_seqlen_q=metadata.nsa_max_seqlen_q,
+                sm_scale=layer.scaling,
+                logit_cap=layer.logit_cap,
+                page_size=1,
+            )
+        elif self.nsa_decode_impl == "aiter":
+            if q_rope is not None:
+                q_all = torch.cat([q_nope, q_rope], dim=-1)
+            return self._forward_aiter(
+                q_all=q_all,
+                kv_cache=kv_cache,
+                page_table_1=page_table_1,
+                layer=layer,
+                metadata=metadata,
+                bs=forward_batch.batch_size,
+            )
+
+        else:
+            assert False, f"Unsupported {self.nsa_decode_impl = }"
+
+    def _forward_fa3(
+        self,
+        q_rope: torch.Tensor,
+        kv_cache: torch.Tensor,
+        v_head_dim: int,
+        q_nope: torch.Tensor,
+        page_table: torch.Tensor,
+        cache_seqlens: torch.Tensor,
+        cu_seqlens_q: torch.Tensor,
+        cu_seqlens_k: torch.Tensor,
+        max_seqlen_q: int,
+        sm_scale: float,
+        logit_cap: float,
+        page_size: int,
+    ) -> torch.Tensor:
+        k_rope_cache = kv_cache[:, :, v_head_dim:]
+        c_kv_cache = kv_cache[:, :, :v_head_dim]
+        qk_rope_dim = k_rope_cache.shape[-1]
+        k_rope_cache = k_rope_cache.view(-1, page_size, 1, qk_rope_dim)
+        c_kv_cache = c_kv_cache.view(-1, page_size, 1, v_head_dim)
+        o = flash_attn_with_kvcache(
+            q=q_rope,
+            k_cache=k_rope_cache,
+            v_cache=c_kv_cache,
+            qv=q_nope,
+            page_table=page_table,
+            cache_seqlens=cache_seqlens,
+            cu_seqlens_q=cu_seqlens_q,
+            cu_seqlens_k_new=cu_seqlens_k,
+            max_seqlen_q=max_seqlen_q,
+            softmax_scale=sm_scale,
+            causal=True,
+            softcap=logit_cap,
+            return_softmax_lse=False,
+            num_splits=self.num_splits,
+        )
+        return o  # type: ignore
+
+    def _forward_flashmla_sparse(
+        self,
+        q_all: torch.Tensor,
+        kv_cache: torch.Tensor,
+        v_head_dim: int,
+        page_table_1: torch.Tensor,
+        sm_scale: float,
+    ) -> torch.Tensor:
+        from sgl_kernel.flash_mla import flash_mla_sparse_fwd
+
+        # FlashMLA sparse kernel requires num_heads to be a multiple of 64 (Hopper) or 128 (Blackwell)
+        # When using TP, num_heads might be smaller (e.g., 256//8=32)
+        num_tokens, num_heads, head_dim = q_all.shape
+
+        # Determine required padding based on GPU architecture (use cached value)
+        required_padding = 128 if self.device_sm_major >= 10 else 64
+
+        need_padding = num_heads % required_padding != 0
+
+        if need_padding:
+            assert required_padding % num_heads == 0, (
+                f"num_heads {num_heads} cannot be padded to {required_padding}. "
+                f"TP size may be too large for this model."
+            )
+
+            # Pad q to required size
+            q_padded = q_all.new_zeros((num_tokens, required_padding, head_dim))
+            q_padded[:, :num_heads, :] = q_all
+            q_input = q_padded
+        else:
+            q_input = q_all
+
+        # indices shape must be (s_q, h_kv=1, topk), keep h_kv=1 unchanged
+        indices_input = page_table_1.unsqueeze(1)
+
+        o, _, _ = flash_mla_sparse_fwd(
+            q=q_input,
+            kv=kv_cache,
+            indices=indices_input,
+            sm_scale=sm_scale,
+            d_v=v_head_dim,
+        )
+
+        # Trim output back to original num_heads if we padded
+        if need_padding:
+            o = o[:, :num_heads, :]
+
+        return o
+
+    def _forward_flashmla_kv(
+        self,
+        q_all: torch.Tensor,
+        kv_cache: torch.Tensor,
+        v_head_dim: int,
+        sm_scale: float,
+        layer,
+        metadata: NSAMetadata,
+        page_table_1,
+    ) -> torch.Tensor:
+        from sgl_kernel.flash_mla import flash_mla_with_kvcache
+
+        cache_seqlens = metadata.nsa_cache_seqlens_int32
+
+        # TODO the 2nd dim is seq_len_q, need to be >1 when MTP
+        q_all = q_all.view(-1, 1, layer.tp_q_head_num, layer.head_dim)
+        kv_cache = kv_cache.view(-1, self.real_page_size, 1, self.kv_cache_dim)
+        assert self.real_page_size == 64, "only page size 64 is supported"
+
+        if not self.nsa_kv_cache_store_fp8:
+            # inefficiently quantize the whole cache
+            kv_cache = quantize_k_cache(kv_cache)
+
+        indices = page_table_1.unsqueeze(1)
+        assert (
+            indices.shape[-1] == self.nsa_index_topk
+        )  # requirement of FlashMLA decode kernel
+
+        o, _ = flash_mla_with_kvcache(
+            q=q_all,
+            k_cache=kv_cache,
+            cache_seqlens=cache_seqlens,
+            head_dim_v=v_head_dim,
+            tile_scheduler_metadata=metadata.flashmla_metadata.flashmla_metadata,
+            num_splits=metadata.flashmla_metadata.num_splits,
+            softmax_scale=sm_scale,
+            indices=indices,
+            # doc says it is not used, but if pass in None then error
+            block_table=torch.empty(
+                (q_all.shape[0], 0), dtype=torch.int32, device=q_all.device
+            ),
+            is_fp8_kvcache=True,
+        )
+        return o
+
+    def _forward_standard_mha(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        metadata: NSAMetadata,
+    ) -> torch.Tensor:
+        """Standard MHA using FlashAttention varlen for MHA_ONE_SHOT mode."""
+        q = q.view(-1, layer.tp_q_head_num, layer.head_dim)
+        k = k.view(-1, layer.tp_k_head_num, layer.head_dim)
+        v = v.view(-1, layer.tp_v_head_num, layer.v_head_dim)
+
+        # MHA_ONE_SHOT: k/v include all tokens (prefix + current)
+        cu_seqlens_q = metadata.cu_seqlens_q
+        cu_seqlens_k = metadata.cu_seqlens_k
+        max_seqlen_k = metadata.max_seq_len_k
+        causal = True
+
+        # Verify batch sizes match (length of cu_seqlens should be batch_size + 1)
+        assert len(cu_seqlens_q) == len(cu_seqlens_k), (
+            f"batch_size mismatch: cu_seqlens_q has {len(cu_seqlens_q)-1} requests, "
+            f"cu_seqlens_k has {len(cu_seqlens_k)-1} requests"
+        )
+
+        # Use TRTLLm ragged attention for SM100 (Blackwell/B200) to avoid FA4 accuracy issues
+        if self.device_sm_major >= 10:
+            import flashinfer
+
+            seq_lens = metadata.cache_seqlens_int32
+            return flashinfer.prefill.trtllm_ragged_attention_deepseek(
+                query=q,
+                key=k,
+                value=v,
+                workspace_buffer=self.workspace_buffer,
+                seq_lens=seq_lens,
+                max_q_len=metadata.max_seq_len_q,
+                max_kv_len=max_seqlen_k,
+                bmm1_scale=layer.scaling,
+                bmm2_scale=1.0,
+                o_sf_scale=1.0,
+                batch_size=forward_batch.batch_size,
+                window_left=-1,
+                cum_seq_lens_q=cu_seqlens_q,
+                cum_seq_lens_kv=cu_seqlens_k,
+                enable_pdl=False,
+                is_causal=causal,
+                return_lse=False,
+            )
+
+        # Use FA3 for SM90 (Hopper/H200)
+        return flash_attn_varlen_func(
+            q=q,
+            k=k,
+            v=v,
+            cu_seqlens_q=cu_seqlens_q,
+            cu_seqlens_k=cu_seqlens_k,
+            max_seqlen_q=metadata.max_seq_len_q,
+            max_seqlen_k=max_seqlen_k,
+            softmax_scale=layer.scaling,
+            causal=causal,
+        )
+
+    def _forward_tilelang(
+        self,
+        q_all: torch.Tensor,
+        kv_cache: torch.Tensor,
+        v_head_dim: int,
+        page_table_1: torch.Tensor,
+        sm_scale: float,
+    ) -> torch.Tensor:
+        from sglang.srt.layers.attention.nsa.tilelang_kernel import tilelang_sparse_fwd
+
+        return tilelang_sparse_fwd(
+            q=q_all,
+            kv=kv_cache,
+            indices=page_table_1.unsqueeze(1),
+            sm_scale=sm_scale,
+            d_v=v_head_dim,
+        )
+
+    def _forward_aiter(
+        self,
+        q_all: torch.Tensor,
+        kv_cache: torch.Tensor,
+        page_table_1: torch.Tensor,
+        layer: RadixAttention,
+        metadata: NSAMetadata,
+        bs: int,
+    ) -> torch.Tensor:
+        q = q_all.reshape(-1, layer.tp_q_head_num * layer.head_dim)
+
+        if layer.head_dim != layer.v_head_dim:
+            o = q.new_empty((q.shape[0], layer.tp_q_head_num * layer.v_head_dim))
+        else:
+            o = torch.empty_like(q)
+
+        kv_indptr = self.kv_indptr
+
+        non_minus1_mask = page_table_1 != -1
+        non_minus1_counts = non_minus1_mask.sum(dim=1)
+        kv_indptr[1 : bs + 1] = torch.cumsum(non_minus1_counts, dim=0)
+
+        kv_indices = self.kv_indices
+        get_valid_kv_indices(page_table_1, kv_indptr, kv_indices, bs)
+
+        mla_decode_fwd(
+            q.view(-1, layer.tp_q_head_num, layer.head_dim),
+            kv_cache.view(-1, 1, 1, layer.head_dim),
+            o.view(-1, layer.tp_q_head_num, layer.v_head_dim),
+            metadata.cu_seqlens_q,
+            kv_indptr,
+            kv_indices,
+            metadata.cu_seqlens_q,
+            metadata.max_seq_len_q,
+            sm_scale=layer.scaling,
+            logit_cap=layer.logit_cap,
+        )
+        # kv_cache = kv_cache.view(-1, 1, layer.head_dim)
+        return o
+
+    def _forward_aiter_extend(
+        self,
+        q_all: torch.Tensor,
+        kv_cache: torch.Tensor,
+        page_table_1: torch.Tensor,
+        layer: RadixAttention,
+    ) -> torch.Tensor:
+        num_tokens = q_all.shape[0]
+        q = q_all.reshape(-1, layer.tp_q_head_num * layer.head_dim)
+
+        if layer.head_dim != layer.v_head_dim:
+            o = q.new_empty((num_tokens, layer.tp_q_head_num * layer.v_head_dim))
+        else:
+            o = torch.empty_like(q)
+
+        non_minus1_mask = page_table_1 != -1
+        non_minus1_counts = non_minus1_mask.sum(dim=1)
+
+        kv_indptr = torch.zeros(num_tokens + 1, dtype=torch.int32, device=self.device)
+        kv_indptr[1:] = torch.cumsum(non_minus1_counts, dim=0)
+
+        # Allocate kv_indices with upper-bound size (num_tokens * topk)
+        topk = page_table_1.shape[1]
+        kv_indices = torch.zeros(
+            num_tokens * topk, dtype=torch.int32, device=self.device
+        )
+
+        # Use get_valid_kv_indices kernel to extract valid indices
+        get_valid_kv_indices(page_table_1, kv_indptr, kv_indices, num_tokens)
+
+        # Build cu_seqlens_q for extend: each token is treated as seq_len_q=1
+        cu_seqlens_q = torch.arange(
+            0, num_tokens + 1, dtype=torch.int32, device=self.device
+        )
+        # TODO support more forward_mode
+        mla_decode_fwd(
+            q.view(-1, layer.tp_q_head_num, layer.head_dim),
+            kv_cache.view(-1, 1, 1, layer.head_dim),
+            o.view(-1, layer.tp_q_head_num, layer.v_head_dim),
+            cu_seqlens_q,
+            kv_indptr,
+            kv_indices,
+            cu_seqlens_q,
+            1,  # max_seq_len_q = 1 for per-token attention
+            sm_scale=layer.scaling,
+            logit_cap=layer.logit_cap,
+        )
+        return o
+
+    def _forward_trtllm(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        seq_lens: torch.Tensor,
+        save_kv_cache=True,
+        # For multi-head latent attention
+        q_rope: Optional[torch.Tensor] = None,
+        k_rope: Optional[torch.Tensor] = None,
+        topk_indices: Optional[torch.Tensor] = None,
+        cos_sin_cache: Optional[torch.Tensor] = None,
+        is_neox: Optional[bool] = False,
+        llama_4_scaling: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Forward using TRT-LLM sparse MLA kernel."""
+        import flashinfer.decode
+
+        metadata = self.forward_metadata
+
+        merge_query = q_rope is not None
+        if self.kv_cache_dtype == torch.float8_e4m3fn:
+            # For FP8 path, we quantize the query and rope parts and merge them into a single tensor
+            # Note: rope application in deepseek_v2.py:forward_absorb_prepare is skipped for FP8 decode path of this trtllm_mla backend
+            assert q_rope is not None, "For FP8 path q_rope should not be None."
+            assert k_rope is not None, "For FP8 path k_rope should not be None."
+            assert (
+                cos_sin_cache is not None
+            ), "For FP8 path cos_sin_cache should not be None."
+
+            q, k, k_rope = mla_quantize_and_rope_for_fp8(
+                q,
+                q_rope,
+                k.squeeze(1),
+                k_rope.squeeze(1),
+                forward_batch.positions,
+                cos_sin_cache,
+                is_neox,
+                self.kv_lora_rank,
+                self.qk_rope_head_dim,
+            )
+            merge_query = False
+
+            # Save KV cache if requested
+        if save_kv_cache:
+            assert (
+                k is not None and k_rope is not None
+            ), "For populating trtllm_mla kv cache, both k_nope and k_rope should be not None."
+            cache_loc = (
+                forward_batch.out_cache_loc
+                if not layer.is_cross_attention
+                else forward_batch.encoder_out_cache_loc
+            )
+            forward_batch.token_to_kv_pool.set_mla_kv_buffer(
+                layer, cache_loc, k, k_rope
+            )
+
+        k_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
+        kv_cache = k_cache.view(-1, self.real_page_size, self.kv_cache_dim).unsqueeze(1)
+
+        if merge_query:
+            q_nope = q.view(-1, layer.tp_q_head_num, layer.v_head_dim)
+            q_rope_reshaped = q_rope.view(
+                -1, layer.tp_q_head_num, layer.head_dim - layer.v_head_dim
+            )
+            q_all = concat_mla_absorb_q_general(q_nope, q_rope_reshaped)
+        else:
+            q_all = q.view(-1, layer.tp_q_head_num, layer.head_dim)
+
+        # Align topk_indices with q dimensions
+        if topk_indices is not None:
+            topk_indices = self._pad_topk_indices(topk_indices, q.shape[0])
+
+        if envs.SGLANG_NSA_FUSE_TOPK.get():
+            page_table_1 = topk_indices
+        else:
+            page_table_1 = transform_index_page_table_decode(
+                page_table=metadata.page_table_1,
+                topk_indices=topk_indices,
+                page_size=1,
+            )
+
+        q_scale = 1.0
+        k_scale = (
+            layer.k_scale_float
+            if getattr(layer, "k_scale_float", None) is not None
+            else 1.0
+        )
+        bmm1_scale = q_scale * k_scale * layer.scaling
+
+        batch_size = page_table_1.shape[0]
+        _, num_heads, head_dim = q_all.shape
+
+        q = q_all.view(batch_size, 1, num_heads, head_dim)
+        kv = kv_cache.view(-1, 1, self.real_page_size, self.kv_cache_dim)
+        block_tables = page_table_1.unsqueeze(1)
+        seq_lens = metadata.cache_seqlens_int32 if seq_lens is None else seq_lens
+
+        out = flashinfer.decode.trtllm_batch_decode_with_kv_cache_mla(
+            query=q,
+            kv_cache=kv,
+            workspace_buffer=self.workspace_buffer,
+            qk_nope_head_dim=self.qk_nope_head_dim,
+            kv_lora_rank=self.kv_lora_rank,
+            qk_rope_head_dim=self.qk_rope_head_dim,
+            block_tables=block_tables,
+            seq_lens=seq_lens,
+            max_seq_len=metadata.max_seq_len_k,
+            sparse_mla_top_k=self.nsa_index_topk,
+            bmm1_scale=bmm1_scale,
+            backend="trtllm-gen",
+        )
+        # Output: [batch, q_len=1, heads, v_dim] -> [batch, heads, v_dim]
+        return out.squeeze(1)
+
+    def _pad_topk_indices(
+        self, topk_indices: torch.Tensor, num_tokens: int
+    ) -> torch.Tensor:
+        current_tokens = topk_indices.shape[0]
+        if current_tokens == num_tokens:
+            return topk_indices
+
+        assert current_tokens <= num_tokens, (
+            f"topk_indices rows ({current_tokens}) > num_tokens ({num_tokens}); "
+            "this indicates a mismatch between indexer output and q layout."
+        )
+
+        pad_size = num_tokens - current_tokens
+        padding = torch.full(
+            (pad_size, topk_indices.shape[1]),
+            -1,
+            dtype=topk_indices.dtype,
+            device=topk_indices.device,
+        )
+        return torch.cat([topk_indices, padding], dim=0)
+
+    def get_cuda_graph_seq_len_fill_value(self):
+        """Get the fill value for sequence length in CUDA graph."""
+        return 1
+
+    def set_nsa_prefill_impl(self, forward_batch: Optional[ForwardBatch] = None):
+        """
+        Decide all attention prefill dispatch strategies for this batch.
+        """
+        from sglang.srt.utils import get_device_sm, is_blackwell
+
+        # Decide MHA vs MLA
+        if forward_batch and forward_batch.forward_mode.is_extend_without_speculative():
+            # Check if sequence meets criteria for MHA_ONE_SHOT
+            assert forward_batch.seq_lens_cpu is not None
+            max_kv_len = forward_batch.seq_lens_cpu.max().item()
+            sum_seq_lens = sum(forward_batch.seq_lens_cpu)
+            device_sm = get_device_sm()
+
+            # when nsa prefill impl is trtllm, use its max chunk capacity as mha max kv len
+            mha_max_kv_len = (
+                forward_batch.get_max_chunk_capacity()
+                if self.nsa_prefill_impl == "trtllm"
+                else self.nsa_index_topk
+            )
+
+            # Requirements: H200/B200, short sequences, supported dtype, fits in chunk
+            self.use_mha = (
+                (
+                    device_sm == 90 or (device_sm >= 100 and device_sm < 110)
+                )  # SM90/SM100 only
+                and max_kv_len <= mha_max_kv_len  # Short enough for MHA
+                and forward_batch.token_to_kv_pool.dtype
+                in [torch.bfloat16, torch.float8_e4m3fn]
+                and sum_seq_lens
+                <= forward_batch.get_max_chunk_capacity()  # Fits in chunk
+                and (not is_nsa_enable_prefill_cp())  # CP not enabled
+            )
+        else:
+            self.use_mha = False  # Decode/verify always use MLA
+        if self._force_attn_forward_mla:
+            self.use_mha = False
+
+        # Set MLA implementation only if not using MHA
+        if not self.use_mha and self.enable_auto_select_prefill_impl:
+            if self.nsa_kv_cache_store_fp8:
+                if (
+                    is_blackwell()
+                    and forward_batch is not None
+                    and forward_batch.forward_mode == ForwardMode.EXTEND
+                ):
+                    total_kv_tokens = forward_batch.seq_lens_sum
+                    total_q_tokens = forward_batch.extend_num_tokens
+                    # Heuristic based on benchmarking flashmla_kv vs flashmla_sparse + dequantize_k_cache_paged
+                    if total_kv_tokens < total_q_tokens * 512:
+                        self.nsa_prefill_impl = "flashmla_sparse"
+                        return
+                self.nsa_prefill_impl = "flashmla_kv"
+            else:
+                # bf16 kv cache
+                self.nsa_prefill_impl = "flashmla_sparse"
+
+    def get_topk_transform_method(self) -> TopkTransformMethod:
+        """
+        SGLANG_NSA_FUSE_TOPK controls whether to fuse the topk transform into the topk kernel.
+        This method is used to select the topk transform method which can be fused or unfused.
+        """
+        if (
+            # disable for MTP
+            self.nsa_kv_cache_store_fp8
+            and self.nsa_prefill_impl == "flashmla_sparse"
+        ):
+            topk_transform_method = TopkTransformMethod.RAGGED
+        else:
+            topk_transform_method = TopkTransformMethod.PAGED
+        return topk_transform_method
+
+    def get_indexer_metadata(
+        self, layer_id: int, forward_batch: ForwardBatch
+    ) -> NSAIndexerMetadata:
+        return NSAIndexerMetadata(
+            attn_metadata=self.forward_metadata,
+            topk_transform_method=self.get_topk_transform_method(),
+            paged_mqa_schedule_metadata=self.forward_metadata.paged_mqa_schedule_metadata,
+        )
+
+    def _compute_flashmla_metadata(self, cache_seqlens: torch.Tensor, seq_len_q: int):
+        from sgl_kernel.flash_mla import get_mla_metadata
+
+        flashmla_metadata, num_splits = get_mla_metadata(
+            cache_seqlens=cache_seqlens,
+            # TODO doc says `num_q_tokens_per_q_seq * num_heads_q // num_heads_k`
+            #      but the name looks like need seq_len_q?
+            num_q_tokens_per_head_k=seq_len_q * self.num_q_heads // 1,
+            num_heads_k=1,
+            num_heads_q=self.num_q_heads,
+            is_fp8_kvcache=True,
+            topk=self.nsa_index_topk,
+        )
+
+        return NSAFlashMLAMetadata(
+            flashmla_metadata=flashmla_metadata,
+            num_splits=num_splits,
+        )
+
+
+class NativeSparseAttnMultiStepBackend:
+
+    def __init__(
+        self, model_runner: ModelRunner, topk: int, speculative_num_steps: int
+    ):
+        self.model_runner = model_runner
+        self.topk = topk
+        self.speculative_num_steps = speculative_num_steps
+        self.attn_backends = []
+        for i in range(self.speculative_num_steps - 1):
+            self.attn_backends.append(
+                NativeSparseAttnBackend(
+                    model_runner,
+                    speculative_step_id=i,
+                    topk=self.topk,
+                    speculative_num_steps=self.speculative_num_steps,
+                )
+            )
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        for i in range(self.speculative_num_steps - 1):
+            self.attn_backends[i].init_forward_metadata(forward_batch)
+
+    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
+        for i in range(self.speculative_num_steps - 1):
+            self.attn_backends[i].init_cuda_graph_state(max_bs, max_num_tokens)
+
+    def init_forward_metadata_capture_cuda_graph(self, forward_batch: ForwardBatch):
+        for i in range(self.speculative_num_steps - 1):
+            self.attn_backends[i].init_forward_metadata_capture_cuda_graph(
+                forward_batch.batch_size,
+                forward_batch.batch_size * self.topk,
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                encoder_lens=None,
+                forward_mode=ForwardMode.DECODE,
+                spec_info=forward_batch.spec_info,
+            )
+
+    def init_forward_metadata_replay_cuda_graph(
+        self, forward_batch: ForwardBatch, bs: int
+    ):
+        if envs.SGLANG_NSA_ENABLE_MTP_PRECOMPUTE_METADATA.get():
+            # Precompute metadata once (shared across all backends)
+            precomputed = self.attn_backends[0]._precompute_replay_metadata(
+                bs=bs,
+                req_pool_indices=forward_batch.req_pool_indices,
+                seq_lens=forward_batch.seq_lens,
+                seq_lens_cpu=forward_batch.seq_lens_cpu,
+                forward_mode=ForwardMode.DECODE,
+                spec_info=forward_batch.spec_info,
+            )
+
+            # Use multi-backend fused copy when we have 3 or more backends
+            # This is 3x faster than calling the single-backend copy 3 times
+            if self.speculative_num_steps > 3:
+                try:
+                    from sglang.jit_kernel.fused_metadata_copy import (
+                        fused_metadata_copy_multi_cuda,
+                    )
+
+                    metadata0 = self.attn_backends[0].decode_cuda_graph_metadata[bs]
+                    metadata1 = self.attn_backends[1].decode_cuda_graph_metadata[bs]
+                    metadata2 = self.attn_backends[2].decode_cuda_graph_metadata[bs]
+
+                    # Set nsa_prefill_impl for first 3 backends (required by the method)
+                    for i in range(3):
+                        self.attn_backends[i].set_nsa_prefill_impl(forward_batch=None)
+
+                    # Prepare FlashMLA tensors if needed
+                    flashmla_num_splits_src = None
+                    flashmla_metadata_src = None
+                    flashmla_num_splits_dst0 = None
+                    flashmla_num_splits_dst1 = None
+                    flashmla_num_splits_dst2 = None
+                    flashmla_metadata_dst0 = None
+                    flashmla_metadata_dst1 = None
+                    flashmla_metadata_dst2 = None
+
+                    if precomputed.flashmla_metadata is not None:
+                        flashmla_num_splits_src = (
+                            precomputed.flashmla_metadata.num_splits
+                        )
+                        flashmla_metadata_src = (
+                            precomputed.flashmla_metadata.flashmla_metadata
+                        )
+                        flashmla_num_splits_dst0 = (
+                            metadata0.flashmla_metadata.num_splits
+                        )
+                        flashmla_num_splits_dst1 = (
+                            metadata1.flashmla_metadata.num_splits
+                        )
+                        flashmla_num_splits_dst2 = (
+                            metadata2.flashmla_metadata.num_splits
+                        )
+                        flashmla_metadata_dst0 = (
+                            metadata0.flashmla_metadata.flashmla_metadata
+                        )
+                        flashmla_metadata_dst1 = (
+                            metadata1.flashmla_metadata.flashmla_metadata
+                        )
+                        flashmla_metadata_dst2 = (
+                            metadata2.flashmla_metadata.flashmla_metadata
+                        )
+
+                    # Call the multi-backend fused kernel for first 3 backends
+                    fused_metadata_copy_multi_cuda(
+                        # Source tensors
+                        precomputed.cache_seqlens,
+                        precomputed.cu_seqlens_k,
+                        precomputed.page_indices,
+                        precomputed.nsa_cache_seqlens,
+                        precomputed.nsa_cu_seqlens_k,
+                        precomputed.real_page_table,
+                        flashmla_num_splits_src,
+                        flashmla_metadata_src,
+                        # Destination tensors for backend 0
+                        metadata0.cache_seqlens_int32,
+                        metadata0.cu_seqlens_k,
+                        metadata0.page_table_1,
+                        metadata0.nsa_cache_seqlens_int32,
+                        metadata0.nsa_cu_seqlens_k,
+                        (
+                            metadata0.real_page_table
+                            if precomputed.real_page_table is not None
+                            else None
+                        ),
+                        flashmla_num_splits_dst0,
+                        flashmla_metadata_dst0,
+                        # Destination tensors for backend 1
+                        metadata1.cache_seqlens_int32,
+                        metadata1.cu_seqlens_k,
+                        metadata1.page_table_1,
+                        metadata1.nsa_cache_seqlens_int32,
+                        metadata1.nsa_cu_seqlens_k,
+                        (
+                            metadata1.real_page_table
+                            if precomputed.real_page_table is not None
+                            else None
+                        ),
+                        flashmla_num_splits_dst1,
+                        flashmla_metadata_dst1,
+                        # Destination tensors for backend 2
+                        metadata2.cache_seqlens_int32,
+                        metadata2.cu_seqlens_k,
+                        metadata2.page_table_1,
+                        metadata2.nsa_cache_seqlens_int32,
+                        metadata2.nsa_cu_seqlens_k,
+                        (
+                            metadata2.real_page_table
+                            if precomputed.real_page_table is not None
+                            else None
+                        ),
+                        flashmla_num_splits_dst2,
+                        flashmla_metadata_dst2,
+                        # Parameters
+                        bs,
+                        precomputed.max_len,
+                        precomputed.seqlens_expanded_size,
+                    )
+
+                    # Verification: compare fused kernel results against individual copies
+                    if _VERIFY_FUSED_METADATA_COPY:
+                        verify_multi_backend_fused_metadata_copy(
+                            metadata0=metadata0,
+                            metadata1=metadata1,
+                            metadata2=metadata2,
+                            precomputed=precomputed,
+                            bs=bs,
+                            flashmla_num_splits_src=flashmla_num_splits_src,
+                            flashmla_metadata_src=flashmla_metadata_src,
+                        )
+
+                    # Copy remaining backends one by one (if > 3 backends)
+                    for i in range(3, self.speculative_num_steps - 1):
+                        self.attn_backends[
+                            i
+                        ].init_forward_metadata_replay_cuda_graph_from_precomputed(
+                            bs=bs,
+                            precomputed=precomputed,
+                            forward_mode=ForwardMode.DECODE,
+                        )
+                except (ImportError, Exception) as e:
+                    # Fallback to loop if multi-backend kernel not available or fails
+                    if isinstance(e, ImportError):
+                        print(
+                            "Warning: Multi-backend fused metadata copy kernel not available, falling back to loop."
+                        )
+                    else:
+                        print(
+                            f"Warning: Multi-backend fused metadata copy kernel failed with error: {e}, falling back to loop."
+                        )
+                    for i in range(self.speculative_num_steps - 1):
+                        self.attn_backends[
+                            i
+                        ].init_forward_metadata_replay_cuda_graph_from_precomputed(
+                            bs=bs,
+                            precomputed=precomputed,
+                            forward_mode=ForwardMode.DECODE,
+                        )
+            else:
+                # Less than 3 backends: copy to each backend individually
+                for i in range(self.speculative_num_steps - 1):
+                    self.attn_backends[
+                        i
+                    ].init_forward_metadata_replay_cuda_graph_from_precomputed(
+                        bs=bs,
+                        precomputed=precomputed,
+                        forward_mode=ForwardMode.DECODE,
+                    )
+        else:
+            # Fallback: compute metadata separately for each backend
+            for i in range(self.speculative_num_steps - 1):
+                self.attn_backends[i].init_forward_metadata_replay_cuda_graph(
+                    bs=bs,
+                    req_pool_indices=forward_batch.req_pool_indices,
+                    seq_lens=forward_batch.seq_lens,
+                    seq_lens_sum=forward_batch.seq_lens_sum,
+                    encoder_lens=None,
+                    forward_mode=ForwardMode.DECODE,
+                    spec_info=forward_batch.spec_info,
+                    seq_lens_cpu=forward_batch.seq_lens_cpu,
+                    out_cache_loc=None,
+                )
diff --git a/sglang/python/sglang/srt/layers/attention/tbo_backend.py b/sglang/python/sglang/srt/layers/attention/tbo_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..494d82d808e871a44dce56fadf9e038f026698d0
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/tbo_backend.py
@@ -0,0 +1,263 @@
+from typing import TYPE_CHECKING, Callable, List, Optional
+
+import torch
+
+from sglang.srt.batch_overlap import two_batch_overlap
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+from sglang.srt.speculative.spec_info import SpecInput
+
+if TYPE_CHECKING:
+    from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+
+
+class TboAttnBackend(AttentionBackend):
+    def __init__(self, primary: AttentionBackend, children: List[AttentionBackend]):
+        super().__init__()
+        self.primary = primary
+        self.children = children
+
+    @classmethod
+    def init_new(cls, creator: Callable[[], AttentionBackend]):
+        return cls(
+            primary=creator(),
+            children=[creator() for _ in range(2)],
+        )
+
+    def init_forward_metadata(self, forward_batch: "ForwardBatch"):
+        self.primary.init_forward_metadata(forward_batch=forward_batch)
+        if forward_batch.tbo_children is not None:
+            for child, forward_batch_child in zip(
+                self.children, forward_batch.tbo_children, strict=True
+            ):
+                if forward_batch_child.batch_size > 0:
+                    child.init_forward_metadata(forward_batch=forward_batch_child)
+
+    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
+        self.primary.init_cuda_graph_state(max_bs=max_bs, max_num_tokens=max_num_tokens)
+        for item in self.children:
+            # TODO for children, maybe can provide *smaller* max_bs to optimize
+            item.init_cuda_graph_state(max_bs=max_bs, max_num_tokens=max_num_tokens)
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: "ForwardMode",
+        spec_info: Optional[SpecInput],
+    ):
+        self.primary.init_forward_metadata_capture_cuda_graph(
+            bs=bs,
+            num_tokens=num_tokens,
+            req_pool_indices=req_pool_indices,
+            seq_lens=seq_lens,
+            encoder_lens=encoder_lens,
+            forward_mode=forward_mode,
+            spec_info=spec_info,
+        )
+
+        self._init_forward_metadata_cuda_graph_children(
+            fn_name="init_forward_metadata_capture_cuda_graph",
+            bs=bs,
+            req_pool_indices=req_pool_indices,
+            seq_lens=seq_lens,
+            encoder_lens=encoder_lens,
+            forward_mode=forward_mode,
+            spec_info=spec_info,
+            capture_num_tokens=num_tokens,
+        )
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: "ForwardMode",
+        spec_info: Optional[SpecInput],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+        self.primary.init_forward_metadata_replay_cuda_graph(
+            bs=bs,
+            req_pool_indices=req_pool_indices,
+            seq_lens=seq_lens,
+            seq_lens_sum=seq_lens_sum,
+            encoder_lens=encoder_lens,
+            forward_mode=forward_mode,
+            spec_info=spec_info,
+            seq_lens_cpu=seq_lens_cpu,
+        )
+
+        self._init_forward_metadata_cuda_graph_children(
+            fn_name="init_forward_metadata_replay_cuda_graph",
+            bs=bs,
+            req_pool_indices=req_pool_indices,
+            seq_lens=seq_lens,
+            encoder_lens=encoder_lens,
+            forward_mode=forward_mode,
+            spec_info=spec_info,
+            replay_seq_lens_sum=seq_lens_sum,
+            replay_seq_lens_cpu=seq_lens_cpu,
+        )
+
+    def _init_forward_metadata_cuda_graph_children(
+        self,
+        fn_name: str,
+        # common args
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: "ForwardMode",
+        spec_info: Optional[SpecInput],
+        # capture args
+        capture_num_tokens: int = None,
+        # replay args
+        replay_seq_lens_sum: int = None,
+        replay_seq_lens_cpu: Optional[torch.Tensor] = None,
+    ):
+        token_num_per_seq = two_batch_overlap.get_token_num_per_seq(
+            forward_mode=forward_mode, spec_info=spec_info
+        )
+        if fn_name == "init_forward_metadata_capture_cuda_graph":
+            assert (
+                capture_num_tokens == bs * token_num_per_seq
+            ), "For target-verify or decode mode, num_tokens should be equal to token_num_per_seq * bs"
+        num_tokens = bs * token_num_per_seq
+
+        tbo_split_seq_index, tbo_split_token_index = (
+            two_batch_overlap.compute_split_indices_for_cuda_graph_replay(
+                forward_mode=forward_mode,
+                cuda_graph_num_tokens=num_tokens,
+                spec_info=spec_info,
+            )
+        )
+
+        num_tokens_child_left = tbo_split_token_index
+        num_tokens_child_right = num_tokens - tbo_split_token_index
+        bs_child_left = tbo_split_seq_index
+        bs_child_right = bs - bs_child_left
+
+        assert (
+            num_tokens_child_left > 0 and num_tokens_child_right > 0
+        ), f"{num_tokens_child_left=} {num_tokens_child_right=} {forward_mode=} {num_tokens=}"
+
+        common_pre_split_args = dict(
+            fn_name=fn_name,
+            bs=bs,
+            req_pool_indices=req_pool_indices,
+            seq_lens=seq_lens,
+            encoder_lens=encoder_lens,
+            forward_mode=forward_mode,
+            spec_info=spec_info,
+            capture_num_tokens=capture_num_tokens,
+            replay_seq_lens_sum=replay_seq_lens_sum,
+            replay_seq_lens_cpu=replay_seq_lens_cpu,
+        )
+
+        args_left = _init_forward_metadata_cuda_graph_split(
+            output_bs=bs_child_left,
+            seq_slice=slice(None, tbo_split_seq_index),
+            **common_pre_split_args,
+        )
+        args_right = _init_forward_metadata_cuda_graph_split(
+            output_bs=bs_child_right,
+            seq_slice=slice(tbo_split_seq_index, None),
+            **common_pre_split_args,
+        )
+
+        child_left, child_right = self.children
+        getattr(child_left, fn_name)(**args_left)
+        getattr(child_right, fn_name)(**args_right)
+
+    def get_cuda_graph_seq_len_fill_value(self):
+        ans = self.primary.get_cuda_graph_seq_len_fill_value()
+        for child in self.children:
+            assert ans == child.get_cuda_graph_seq_len_fill_value()
+        return ans
+
+    def forward_extend(self, *args, **kwargs):
+        return self.primary.forward_extend(*args, **kwargs)
+
+    def forward_decode(self, *args, **kwargs):
+        return self.primary.forward_decode(*args, **kwargs)
+
+    def get_indexer_metadata(self, layer_id: int, forward_batch: "ForwardBatch"):
+        return self.primary.get_indexer_metadata(layer_id, forward_batch)
+
+
+def _init_forward_metadata_cuda_graph_split(
+    fn_name: str,
+    seq_slice: slice,
+    output_bs: int,
+    # common args
+    bs: int,
+    req_pool_indices: torch.Tensor,
+    seq_lens: torch.Tensor,
+    encoder_lens: Optional[torch.Tensor],
+    forward_mode: "ForwardMode",
+    spec_info: Optional[SpecInput],
+    # capture args
+    capture_num_tokens: int = None,
+    # replay args
+    replay_seq_lens_sum: int = None,
+    replay_seq_lens_cpu: Optional[torch.Tensor] = None,
+):
+    token_num_per_seq = two_batch_overlap.get_token_num_per_seq(
+        forward_mode=forward_mode, spec_info=spec_info
+    )
+    assert encoder_lens is None, "encoder_lens is not supported yet"
+    if spec_info is not None:
+        output_spec_info = two_batch_overlap.split_spec_info(
+            spec_info=spec_info,
+            start_seq_index=seq_slice.start if seq_slice.start is not None else 0,
+            end_seq_index=seq_slice.stop if seq_slice.stop is not None else bs,
+            start_token_index=(
+                seq_slice.start * token_num_per_seq
+                if seq_slice.start is not None
+                else 0
+            ),
+            end_token_index=(
+                seq_slice.stop * token_num_per_seq
+                if seq_slice.stop is not None
+                else bs * token_num_per_seq
+            ),
+        )
+
+    else:
+        output_spec_info = None
+    ans = dict(
+        bs=output_bs,
+        req_pool_indices=req_pool_indices[seq_slice],
+        seq_lens=seq_lens[seq_slice],
+        # directly forward
+        forward_mode=forward_mode,
+        # ignore
+        encoder_lens=None,
+        spec_info=output_spec_info,
+    )
+
+    if fn_name == "init_forward_metadata_capture_cuda_graph":
+        assert (
+            capture_num_tokens == bs * token_num_per_seq
+        ), "Only support num_tokens==bs * token_num_per_seq for target-verify or decode mode"
+        ans.update(
+            dict(
+                num_tokens=output_bs * token_num_per_seq,
+            )
+        )
+    elif fn_name == "init_forward_metadata_replay_cuda_graph":
+        output_seq_lens_cpu = replay_seq_lens_cpu[seq_slice]
+        ans.update(
+            dict(
+                seq_lens_sum=output_seq_lens_cpu.sum().item(),
+                seq_lens_cpu=output_seq_lens_cpu,
+            )
+        )
+    else:
+        raise NotImplementedError
+
+    return ans
diff --git a/sglang/python/sglang/srt/layers/attention/torch_flex_backend.py b/sglang/python/sglang/srt/layers/attention/torch_flex_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..69f097efd0066adec4b1092e288fe35d86b88081
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/torch_flex_backend.py
@@ -0,0 +1,325 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import torch
+from torch.nn.attention.flex_attention import create_block_mask, flex_attention
+
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+from sglang.srt.layers.radix_attention import AttentionType
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.radix_attention import RadixAttention
+    from sglang.srt.model_executor.model_runner import ModelRunner
+
+
+class TorchFlexAttnBackend(AttentionBackend):
+    def __init__(self, model_runner: ModelRunner):
+        super().__init__()
+        self.forward_metadata = None
+        self.device = model_runner.device
+        self.flex_attention = torch.compile(flex_attention, dynamic=True)
+        torch._dynamo.config.cache_size_limit = 1024
+        torch._dynamo.config.accumulated_cache_size_limit = 1024
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        """Init the metadata for a forward pass."""
+        # TODO: find a more elegant way to save memory
+        # Currently maintain the same memory as torch_native_backend
+        torch.cuda.empty_cache()
+
+        # Provide two block_mask Lists per seq_idx for lower latency, later will support per layer level mask generation
+        self.extend_block_masks = []
+        self.decode_block_masks = []
+
+        if forward_batch.forward_mode.is_extend():
+            for seq_idx in range(forward_batch.seq_lens.shape[0]):
+                seq_len_kv = forward_batch.seq_lens[seq_idx]
+                seq_len_q = seq_len_kv
+                self.extend_block_masks.append(
+                    create_block_mask(
+                        self._causal_mask,
+                        None,
+                        None,
+                        seq_len_q,
+                        seq_len_kv,
+                        device=self.device,
+                        _compile=False,
+                    )
+                )
+
+        elif forward_batch.forward_mode.is_decode():
+            for seq_idx in range(forward_batch.seq_lens.shape[0]):
+                seq_len_q = 1
+                seq_len_kv = forward_batch.seq_lens[seq_idx]
+
+                self.decode_block_masks.append(
+                    create_block_mask(
+                        self._decode_mask,
+                        None,
+                        None,
+                        seq_len_q,
+                        seq_len_kv,
+                        device=self.device,
+                        _compile=False,
+                    )
+                )
+
+    def _causal_mask(self, b, h, q_idx, kv_idx):
+        return q_idx >= kv_idx
+
+    def _decode_mask(self, b, h, q_idx, kv_idx):
+        return q_idx <= kv_idx
+
+    def _run_flex_forward_extend(
+        self,
+        query: torch.Tensor,
+        output: torch.Tensor,
+        k_cache: torch.Tensor,
+        v_cache: torch.Tensor,
+        req_to_token: torch.Tensor,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        extend_prefix_lens: torch.Tensor,
+        extend_seq_lens: torch.Tensor,
+        scaling=None,
+        enable_gqa=False,
+        causal=False,
+    ):
+        """Run the extend forward by using torch flex attention op.
+
+        Args:
+            query: [num_tokens, num_heads, head_size]
+            output: [num_tokens, num_heads, head_size]
+            k_cache: [max_total_num_tokens, num_heads, head_size]
+            v_cache: [max_total_num_tokens, num_heads, head_size]
+            req_to_token: [max_num_reqs, max_context_len]
+            req_pool_indices: [num_seqs]
+            seq_lens: [num_seqs]
+            extend_prefix_lens: [num_seqs]
+            extend_seq_lens: [num_seqs]
+            scaling: float or None
+            enable_gqa: bool
+            causal: bool
+
+        Returns:
+            output: [num_tokens, num_heads, head_size]
+        """
+
+        assert seq_lens.shape[0] == extend_prefix_lens.shape[0]
+        assert seq_lens.shape[0] == extend_seq_lens.shape[0]
+
+        # [num_tokens, num_heads, head_size] -> [num_heads, num_tokens, head_size]
+        query = query.movedim(0, query.dim() - 2)
+
+        start_q, start_kv = 0, 0
+
+        for seq_idx in range(seq_lens.shape[0]):
+            # TODO: this loop process a sequence per iter, this is inefficient.
+            # Need optimize the performance later.
+            extend_seq_len_q = extend_seq_lens[seq_idx]
+            prefill_seq_len_q = extend_prefix_lens[seq_idx]
+
+            seq_len_kv = seq_lens[seq_idx]
+            end_q = start_q + extend_seq_len_q
+            end_kv = start_kv + seq_len_kv
+
+            per_req_query = query[:, start_q:end_q, :]
+            per_req_query_redundant = torch.empty(
+                (per_req_query.shape[0], seq_len_kv, per_req_query.shape[2]),
+                dtype=per_req_query.dtype,
+                device=per_req_query.device,
+            )
+
+            per_req_query_redundant[:, prefill_seq_len_q:, :] = per_req_query
+
+            # get key and value from cache. per_req_tokens contains the kv cache
+            # index for each token in the sequence.
+            req_pool_idx = req_pool_indices[seq_idx]
+            per_req_tokens = req_to_token[req_pool_idx, :seq_len_kv]
+            per_req_key = k_cache[per_req_tokens].movedim(0, query.dim() - 2)
+            per_req_value = v_cache[per_req_tokens].movedim(0, query.dim() - 2)
+
+            if not causal:
+                raise NotImplementedError("Non-causal mode is not yet implemented.")
+
+            per_req_out_redundant = (
+                self.flex_attention(
+                    per_req_query_redundant.unsqueeze(0),
+                    per_req_key.unsqueeze(0),
+                    per_req_value.unsqueeze(0),
+                    block_mask=self.extend_block_masks[seq_idx],
+                    scale=scaling,
+                    enable_gqa=enable_gqa,
+                )
+                .squeeze(0)
+                .movedim(query.dim() - 2, 0)
+            )
+            output[start_q:end_q, :, :] = per_req_out_redundant[
+                prefill_seq_len_q:, :, :
+            ]
+            start_q, start_kv = end_q, end_kv
+        return output
+
+    def _run_flex_forward_decode(
+        self,
+        query: torch.Tensor,
+        output: torch.Tensor,
+        k_cache: torch.Tensor,
+        v_cache: torch.Tensor,
+        req_to_token: torch.Tensor,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        scaling=None,
+        enable_gqa=False,
+        causal=False,
+    ):
+        """Run the decode forward by using torch flex attention op.
+
+        Args:
+            query: [num_tokens, num_heads, head_size]
+            output: [num_tokens, num_heads, head_size]
+            k_cache: [max_total_num_tokens, num_heads, head_size]
+            v_cache: [max_total_num_tokens, num_heads, head_size]
+            req_to_token: [max_num_reqs, max_context_len]
+            req_pool_indices: [num_seqs]
+            seq_lens: [num_seqs]
+            scaling: float or None
+            enable_gqa: bool
+            causal: bool
+
+        Returns:
+            output: [num_tokens, num_heads, head_size]
+        """
+
+        # [num_tokens, num_heads, head_size] -> [num_heads, num_tokens, head_size]
+        query = query.movedim(0, query.dim() - 2)
+
+        start_q, start_kv = 0, 0
+        for seq_idx in range(seq_lens.shape[0]):
+            # TODO: this loop process a sequence per iter, this is inefficient.
+            # Need optimize the performance later.
+
+            seq_len_q = 1
+            seq_len_kv = seq_lens[seq_idx]
+            end_q = start_q + seq_len_q
+            end_kv = start_kv + seq_len_kv
+
+            per_req_query = query[:, start_q:end_q, :]
+
+            # get key and value from cache. per_req_tokens contains the kv cache
+            # index for each token in the sequence.
+            req_pool_idx = req_pool_indices[seq_idx]
+            per_req_tokens = req_to_token[req_pool_idx, :seq_len_kv]
+            per_req_key = k_cache[per_req_tokens].movedim(0, query.dim() - 2)
+            per_req_value = v_cache[per_req_tokens].movedim(0, query.dim() - 2)
+
+            per_req_out = (
+                self.flex_attention(
+                    per_req_query.unsqueeze(0),
+                    per_req_key.unsqueeze(0),
+                    per_req_value.unsqueeze(0),
+                    block_mask=self.decode_block_masks[seq_idx],
+                    scale=scaling,
+                    enable_gqa=enable_gqa,
+                )
+                .squeeze(0)
+                .movedim(query.dim() - 2, 0)
+            )
+
+            output[start_q:end_q, :, :] = per_req_out
+            start_q, start_kv = end_q, end_kv
+
+        return output
+
+    def forward_extend(
+        self,
+        q,
+        k,
+        v,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+    ):
+        if layer.qk_head_dim != layer.v_head_dim:
+            o = q.new_empty((q.shape[0], layer.tp_q_head_num * layer.v_head_dim))
+        else:
+            o = torch.empty_like(q)
+
+        if save_kv_cache:
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                layer, forward_batch.out_cache_loc, k, v
+            )
+
+        use_gqa = layer.tp_q_head_num != layer.tp_k_head_num
+
+        q_ = q.view(-1, layer.tp_q_head_num, layer.qk_head_dim)
+        o_ = o.view(-1, layer.tp_q_head_num, layer.v_head_dim)
+
+        causal = True
+        if layer.is_cross_attention or layer.attn_type == AttentionType.ENCODER_ONLY:
+            raise NotImplementedError(
+                "TorchFlexAttnBackend does not support non-causal attention for now."
+            )
+
+        self._run_flex_forward_extend(
+            q_,
+            o_,
+            forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id),
+            forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id),
+            forward_batch.req_to_token_pool.req_to_token,
+            forward_batch.req_pool_indices,
+            forward_batch.seq_lens,
+            forward_batch.extend_prefix_lens,
+            forward_batch.extend_seq_lens,
+            scaling=layer.scaling,
+            enable_gqa=use_gqa,
+            causal=causal,
+        )
+        return o
+
+    def forward_decode(
+        self,
+        q,
+        k,
+        v,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+    ):
+        # During torch.compile, there is a bug in rotary_emb that causes the
+        # output value to have a 3D tensor shape. This reshapes the output correctly.
+        q = q.reshape(-1, layer.tp_q_head_num * layer.qk_head_dim)
+
+        if layer.qk_head_dim != layer.v_head_dim:
+            o = q.new_empty((q.shape[0], layer.tp_q_head_num * layer.v_head_dim))
+        else:
+            o = torch.empty_like(q)
+
+        if save_kv_cache:
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                layer, forward_batch.out_cache_loc, k, v
+            )
+
+        use_gqa = layer.tp_q_head_num != layer.tp_k_head_num
+        q_ = q.view(-1, layer.tp_q_head_num, layer.qk_head_dim)
+        o_ = o.view(-1, layer.tp_q_head_num, layer.v_head_dim)
+
+        self._run_flex_forward_decode(
+            q_,
+            o_,
+            forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id),
+            forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id),
+            forward_batch.req_to_token_pool.req_to_token,
+            forward_batch.req_pool_indices,
+            forward_batch.seq_lens,
+            scaling=layer.scaling,
+            enable_gqa=use_gqa,
+            causal=False,
+        )
+
+        return o
+
+    def support_triton(self):
+        return False
diff --git a/sglang/python/sglang/srt/layers/attention/torch_native_backend.py b/sglang/python/sglang/srt/layers/attention/torch_native_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..85f0cb42f1691482b6fc606087c33918de717511
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/torch_native_backend.py
@@ -0,0 +1,286 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import torch
+from torch.nn.functional import scaled_dot_product_attention
+
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+from sglang.srt.layers.radix_attention import AttentionType
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.radix_attention import RadixAttention
+    from sglang.srt.model_executor.model_runner import ModelRunner
+
+
+class TorchNativeAttnBackend(AttentionBackend):
+    def __init__(self, model_runner: ModelRunner):
+        super().__init__()
+        self.forward_metadata = None
+        self.device = model_runner.device
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        """Init the metadata for a forward pass."""
+        pass
+
+    def _run_sdpa_forward_extend(
+        self,
+        query: torch.Tensor,
+        output: torch.Tensor,
+        k_cache: torch.Tensor,
+        v_cache: torch.Tensor,
+        req_to_token: torch.Tensor,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        extend_prefix_lens: torch.Tensor,
+        extend_seq_lens: torch.Tensor,
+        scaling=None,
+        enable_gqa=False,
+        causal=False,
+    ):
+        """Run the extend forward by using torch native sdpa op.
+
+        Args:
+            query: [num_tokens, num_heads, head_size]
+            output: [num_tokens, num_heads, head_size]
+            k_cache: [max_total_num_tokens, num_heads, head_size]
+            v_cache: [max_total_num_tokens, num_heads, head_size]
+            req_to_token: [max_num_reqs, max_context_len]
+            req_pool_indices: [num_seqs]
+            seq_lens: [num_seqs]
+            extend_prefix_lens: [num_seqs]
+            extend_seq_lens: [num_seqs]
+            scaling: float or None
+            enable_gqa: bool
+            causal: bool
+
+        Returns:
+            output: [num_tokens, num_heads, head_size]
+        """
+
+        assert seq_lens.shape[0] == extend_prefix_lens.shape[0]
+        assert seq_lens.shape[0] == extend_seq_lens.shape[0]
+
+        # [num_tokens, num_heads, head_size] -> [num_heads, num_tokens, head_size]
+        query = query.movedim(0, query.dim() - 2)
+
+        start_q, start_kv = 0, 0
+        for seq_idx in range(seq_lens.shape[0]):
+            # TODO: this loop process a sequence per iter, this is inefficient.
+            # Need optimize the performance later.
+
+            extend_seq_len_q = extend_seq_lens[seq_idx]
+            prefill_seq_len_q = extend_prefix_lens[seq_idx]
+
+            seq_len_kv = seq_lens[seq_idx]
+            end_q = start_q + extend_seq_len_q
+            end_kv = start_kv + seq_len_kv
+
+            per_req_query = query[:, start_q:end_q, :]
+            per_req_query_redudant = torch.empty(
+                (per_req_query.shape[0], seq_len_kv, per_req_query.shape[2]),
+                dtype=per_req_query.dtype,
+                device=per_req_query.device,
+            )
+
+            per_req_query_redudant[:, prefill_seq_len_q:, :] = per_req_query
+
+            # get key and value from cache. per_req_tokens contains the kv cache
+            # index for each token in the sequence.
+            req_pool_idx = req_pool_indices[seq_idx]
+            per_req_tokens = req_to_token[req_pool_idx, :seq_len_kv]
+            per_req_key = k_cache[per_req_tokens].movedim(0, query.dim() - 2)
+            per_req_value = v_cache[per_req_tokens].movedim(0, query.dim() - 2)
+
+            if not (per_req_query.dtype == per_req_key.dtype == per_req_value.dtype):
+                # scaled_dot_product_attention() expects query, key, and value to have the same dtype
+                per_req_key = per_req_key.to(per_req_query.dtype)
+                per_req_value = per_req_value.to(per_req_query.dtype)
+
+            per_req_out_redudant = (
+                scaled_dot_product_attention(
+                    per_req_query_redudant.unsqueeze(0),
+                    per_req_key.unsqueeze(0),
+                    per_req_value.unsqueeze(0),
+                    enable_gqa=enable_gqa,
+                    scale=scaling,
+                    is_causal=causal,
+                )
+                .squeeze(0)
+                .movedim(query.dim() - 2, 0)
+            )
+            output[start_q:end_q, :, :] = per_req_out_redudant[prefill_seq_len_q:, :, :]
+            start_q, start_kv = end_q, end_kv
+        return output
+
+    def _run_sdpa_forward_decode(
+        self,
+        query: torch.Tensor,
+        output: torch.Tensor,
+        k_cache: torch.Tensor,
+        v_cache: torch.Tensor,
+        req_to_token: torch.Tensor,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        scaling=None,
+        enable_gqa=False,
+        causal=False,
+    ):
+        """Run the decode forward by using torch native sdpa op.
+
+        Args:
+            query: [num_tokens, num_heads, head_size]
+            output: [num_tokens, num_heads, head_size]
+            k_cache: [max_total_num_tokens, num_heads, head_size]
+            v_cache: [max_total_num_tokens, num_heads, head_size]
+            req_to_token: [max_num_reqs, max_context_len]
+            req_pool_indices: [num_seqs]
+            seq_lens: [num_seqs]
+            scaling: float or None
+            enable_gqa: bool
+            causal: bool
+
+        Returns:
+            output: [num_tokens, num_heads, head_size]
+        """
+
+        # [num_tokens, num_heads, head_size] -> [num_heads, num_tokens, head_size]
+        query = query.movedim(0, query.dim() - 2)
+
+        start_q, start_kv = 0, 0
+        for seq_idx in range(seq_lens.shape[0]):
+            # TODO: this loop process a sequence per iter, this is inefficient.
+            # Need optimize the performance later.
+
+            seq_len_q = 1
+            seq_len_kv = seq_lens[seq_idx]
+            end_q = start_q + seq_len_q
+            end_kv = start_kv + seq_len_kv
+
+            per_req_query = query[:, start_q:end_q, :]
+
+            # get key and value from cache. per_req_tokens contains the kv cache
+            # index for each token in the sequence.
+            req_pool_idx = req_pool_indices[seq_idx]
+            per_req_tokens = req_to_token[req_pool_idx, :seq_len_kv]
+            per_req_key = k_cache[per_req_tokens].movedim(0, query.dim() - 2)
+            per_req_value = v_cache[per_req_tokens].movedim(0, query.dim() - 2)
+
+            if not (per_req_query.dtype == per_req_key.dtype == per_req_value.dtype):
+                # scaled_dot_product_attention() expects query, key, and value to have the same dtype
+                per_req_key = per_req_key.to(per_req_query.dtype)
+                per_req_value = per_req_value.to(per_req_query.dtype)
+
+            per_req_out = (
+                scaled_dot_product_attention(
+                    per_req_query.unsqueeze(0),
+                    per_req_key.unsqueeze(0),
+                    per_req_value.unsqueeze(0),
+                    enable_gqa=enable_gqa,
+                    scale=scaling,
+                    is_causal=causal,
+                )
+                .squeeze(0)
+                .movedim(query.dim() - 2, 0)
+            )
+            output[start_q:end_q, :, :] = per_req_out
+            start_q, start_kv = end_q, end_kv
+
+        return output
+
+    def forward_extend(
+        self,
+        q,
+        k,
+        v,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+    ):
+        if layer.qk_head_dim != layer.v_head_dim:
+            o = q.new_empty((q.shape[0], layer.tp_q_head_num * layer.v_head_dim))
+        else:
+            o = torch.empty_like(q)
+
+        if layer.is_cross_attention:
+            cache_loc = forward_batch.encoder_out_cache_loc
+        else:
+            cache_loc = forward_batch.out_cache_loc
+
+        if save_kv_cache:
+            forward_batch.token_to_kv_pool.set_kv_buffer(layer, cache_loc, k, v)
+
+        use_gqa = layer.tp_q_head_num != layer.tp_k_head_num
+
+        q_ = q.view(-1, layer.tp_q_head_num, layer.qk_head_dim)
+        o_ = o.view(-1, layer.tp_q_head_num, layer.v_head_dim)
+
+        causal = True
+        if layer.is_cross_attention or layer.attn_type == AttentionType.ENCODER_ONLY:
+            causal = False
+
+        self._run_sdpa_forward_extend(
+            q_,
+            o_,
+            forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id),
+            forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id),
+            forward_batch.req_to_token_pool.req_to_token,
+            forward_batch.req_pool_indices,
+            forward_batch.seq_lens,
+            forward_batch.extend_prefix_lens,
+            forward_batch.extend_seq_lens,
+            scaling=layer.scaling,
+            enable_gqa=use_gqa,
+            causal=causal,
+        )
+        return o
+
+    def forward_decode(
+        self,
+        q,
+        k,
+        v,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+    ):
+        # During torch.compile, there is a bug in rotary_emb that causes the
+        # output value to have a 3D tensor shape. This reshapes the output correctly.
+        q = q.reshape(-1, layer.tp_q_head_num * layer.qk_head_dim)
+
+        if layer.qk_head_dim != layer.v_head_dim:
+            o = q.new_empty((q.shape[0], layer.tp_q_head_num * layer.v_head_dim))
+        else:
+            o = torch.empty_like(q)
+
+        if layer.is_cross_attention:
+            cache_loc = forward_batch.encoder_out_cache_loc
+        else:
+            cache_loc = forward_batch.out_cache_loc
+
+        if save_kv_cache:
+            forward_batch.token_to_kv_pool.set_kv_buffer(layer, cache_loc, k, v)
+
+        use_gqa = layer.tp_q_head_num != layer.tp_k_head_num
+
+        q_ = q.view(-1, layer.tp_q_head_num, layer.qk_head_dim)
+        o_ = o.view(-1, layer.tp_q_head_num, layer.v_head_dim)
+
+        self._run_sdpa_forward_decode(
+            q_,
+            o_,
+            forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id),
+            forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id),
+            forward_batch.req_to_token_pool.req_to_token,
+            forward_batch.req_pool_indices,
+            forward_batch.seq_lens,
+            scaling=layer.scaling,
+            enable_gqa=use_gqa,
+            causal=False,
+        )
+
+        return o
+
+    def support_triton(self):
+        return False
diff --git a/sglang/python/sglang/srt/layers/attention/triton_backend.py b/sglang/python/sglang/srt/layers/attention/triton_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5c47d2fa67e000731bbba3c3a088068ab13c37d
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/triton_backend.py
@@ -0,0 +1,1395 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, List, Optional
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.configs.model_config import AttentionArch
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
+from sglang.srt.layers.dp_attention import get_attention_tp_size
+from sglang.srt.layers.radix_attention import AttentionType
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+from sglang.srt.speculative.spec_utils import generate_draft_decode_kv_indices
+from sglang.srt.utils import (
+    get_bool_env_var,
+    get_device_core_count,
+    get_int_env_var,
+    next_power_of_2,
+)
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.radix_attention import RadixAttention
+    from sglang.srt.model_executor.model_runner import ModelRunner
+    from sglang.srt.speculative.spec_info import SpecInput
+
+
+def logit_capping_mod(logit_capping_method, logit_cap):
+    # positive logit_cap -> tanh cap
+    if logit_capping_method == "tanh":
+        return logit_cap
+    else:
+        raise ValueError()
+
+
+@dataclass
+class ForwardMetadata:
+    attn_logits: torch.Tensor
+    attn_lse: torch.Tensor
+    max_extend_len: int
+    num_kv_splits: torch.Tensor
+    kv_indptr: torch.Tensor
+    kv_indices: torch.Tensor
+    qo_indptr: torch.Tensor
+    custom_mask: torch.Tensor
+    mask_indptr: torch.Tensor
+    # Sliding window
+    window_kv_indptr: torch.Tensor
+    window_kv_indices: torch.Tensor
+    window_num_kv_splits: torch.Tensor
+    window_kv_offsets: torch.Tensor
+
+
+class TritonAttnBackend(AttentionBackend):
+    def __init__(
+        self,
+        model_runner: ModelRunner,
+        skip_prefill: bool = False,
+        kv_indptr_buf: Optional[torch.Tensor] = None,
+    ):
+        # Lazy import to avoid the initialization of cuda context
+        from sglang.srt.layers.attention.triton_ops.decode_attention import (
+            decode_attention_fwd,
+        )
+        from sglang.srt.layers.attention.triton_ops.extend_attention import (
+            build_unified_kv_indices,
+            extend_attention_fwd,
+            extend_attention_fwd_unified,
+        )
+
+        super().__init__()
+
+        self.decode_attention_fwd = torch.compiler.disable(decode_attention_fwd)
+        self.extend_attention_fwd = torch.compiler.disable(extend_attention_fwd)
+        self.extend_attention_fwd_unified = torch.compiler.disable(
+            extend_attention_fwd_unified
+        )
+        self.build_unified_kv_indices = torch.compiler.disable(build_unified_kv_indices)
+
+        # Parse args
+        self.skip_prefill = skip_prefill
+        max_bs = model_runner.req_to_token_pool.size
+        self.sliding_window_size = model_runner.sliding_window_size
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+        self.token_to_kv_pool_allocator = model_runner.token_to_kv_pool_allocator
+        self.num_draft_tokens = model_runner.server_args.speculative_num_draft_tokens
+        self.speculative_num_steps = model_runner.server_args.speculative_num_steps
+        self.use_mla = model_runner.model_config.attention_arch == AttentionArch.MLA
+        self.num_head = (
+            model_runner.model_config.num_attention_heads // get_attention_tp_size()
+        )
+        self.num_kv_head = model_runner.model_config.get_num_kv_heads(
+            get_attention_tp_size()
+        )
+        if (
+            model_runner.hybrid_gdn_config is not None
+            or model_runner.kimi_linear_config is not None
+        ):
+            # For hybrid linear models, layer_id = 0 may not be full attention
+            self.v_head_dim = model_runner.token_to_kv_pool.get_v_head_dim()
+        else:
+            self.v_head_dim = model_runner.token_to_kv_pool.get_value_buffer(0).shape[
+                -1
+            ]
+        self.max_context_len = model_runner.model_config.context_len
+        self.device = model_runner.device
+        self.device_core_count = get_device_core_count(model_runner.gpu_id)
+        self.static_kv_splits = get_bool_env_var(
+            "SGLANG_TRITON_DECODE_ATTN_STATIC_KV_SPLITS", "false"
+        )
+        self.max_kv_splits = model_runner.server_args.triton_attention_num_kv_splits
+
+        self.allow_bidirectional_attention_in_extend = (
+            model_runner.server_args.disable_cuda_graph
+            and (model_runner.server_args.chunked_prefill_size == -1)
+        )
+
+        # Decide whether enable deterministic inference with batch-invariant operations
+        self.enable_deterministic = (
+            model_runner.server_args.enable_deterministic_inference
+        )
+
+        # Configure deterministic inference settings
+        if self.enable_deterministic:
+            # Use fixed split tile size for batch invariance
+            self.split_tile_size = get_int_env_var(
+                "SGLANG_TRITON_DECODE_SPLIT_TILE_SIZE", 256
+            )
+            # Set static_kv_splits to False to use deterministic logic instead
+            self.static_kv_splits = False
+        else:
+            self.split_tile_size = (
+                model_runner.server_args.triton_attention_split_tile_size
+            )
+
+        if self.split_tile_size is not None:
+            self.max_kv_splits = (
+                self.max_context_len + self.split_tile_size - 1
+            ) // self.split_tile_size
+
+        # Check arguments
+        assert not (
+            model_runner.sliding_window_size is not None
+            and model_runner.model_config.is_encoder_decoder
+        ), "Sliding window and cross attention are not supported together"
+
+        # Initialize buffers
+        # TODO(Jianan Ji): Make sure it behaves as expected when kv_indptr_buf is provided and sliding window is enabled
+        if kv_indptr_buf is None:
+            self.kv_indptr = torch.zeros(
+                (max_bs + 1,), dtype=torch.int32, device=model_runner.device
+            )
+        else:
+            self.kv_indptr = kv_indptr_buf
+
+        # If sliding window is enabled, we might need two sets of buffers
+        # because of interleaved attention types (e.g. for Gemma3)
+        self.window_kv_indptr = None
+        if self.sliding_window_size is not None and self.sliding_window_size > 0:
+            if kv_indptr_buf is None:
+                self.window_kv_indptr = torch.zeros(
+                    (max_bs + 1,), dtype=torch.int32, device=model_runner.device
+                )
+            else:
+                # When provided a buffer, create a clone for the second buffer
+                self.window_kv_indptr = torch.zeros_like(kv_indptr_buf)
+
+        if not self.skip_prefill:
+            self.qo_indptr = torch.zeros(
+                (max_bs + 1,), dtype=torch.int32, device=model_runner.device
+            )
+
+            self.mask_indptr = torch.zeros(
+                (max_bs + 1,), dtype=torch.int64, device=model_runner.device
+            )
+
+        # Initialize forward metadata
+        self.forward_metadata: ForwardMetadata = None
+
+        self.cuda_graph_custom_mask = None
+
+    def get_num_kv_splits(
+        self,
+        num_kv_splits: torch.Tensor,
+        seq_lens: torch.Tensor,
+    ):
+        num_token, num_seq = num_kv_splits.shape[0], seq_lens.shape[0]
+        # NOTE(alcanderian): Considering speculative_decodeing,
+        # num_kv_splits.shape[0] will be topk * real_num_token.
+        # And the real_num_token is num_seq in decoding phase.
+        num_group = num_token // num_seq
+
+        assert (
+            num_group * num_seq == num_token
+        ), f"num_seq({num_seq}), num_token({num_token}), something goes wrong!"
+
+        # Legacy dynamic splitting logic (non-deterministic)
+        if (
+            self.static_kv_splits or self.device_core_count <= 0
+        ) and not self.enable_deterministic:
+            num_kv_splits.fill_(self.max_kv_splits)
+            return
+
+        # deterministic
+        if self.split_tile_size is not None and self.enable_deterministic:
+            # expand seq_lens to match num_token
+            if num_group > 1:
+                expanded_seq_lens = seq_lens.repeat_interleave(num_group)
+            else:
+                expanded_seq_lens = seq_lens
+
+            num_kv_splits[:] = (
+                expanded_seq_lens + self.split_tile_size - 1
+            ) // self.split_tile_size
+            return
+
+        if num_seq < 256:
+            SCHEDULE_SEQ = 256
+        else:
+            SCHEDULE_SEQ = triton.next_power_of_2(num_seq)
+
+        get_num_kv_splits_triton[(1,)](
+            num_kv_splits,
+            seq_lens,
+            num_seq,
+            num_group,
+            self.num_head,
+            self.num_kv_head,
+            self.max_kv_splits,
+            self.device_core_count,
+            MAX_NUM_SEQ=SCHEDULE_SEQ,
+        )
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        """Init auxiliary variables for triton attention backend."""
+
+        bs = forward_batch.batch_size
+        kv_indptr = self.kv_indptr
+        window_kv_indptr = self.window_kv_indptr
+        window_kv_indices = None
+        window_num_kv_splits = None
+        window_kv_offsets = None
+        spec_info = forward_batch.spec_info
+
+        if forward_batch.forward_mode.is_decode_or_idle():
+            if spec_info is None:
+                kv_indptr[1 : bs + 1] = torch.cumsum(forward_batch.seq_lens, dim=0)
+                kv_indptr = kv_indptr[: bs + 1]
+                kv_indices = torch.empty(
+                    forward_batch.seq_lens_sum, dtype=torch.int64, device=self.device
+                )
+                create_flashinfer_kv_indices_triton[(bs,)](
+                    self.req_to_token,
+                    forward_batch.req_pool_indices,
+                    forward_batch.seq_lens,
+                    kv_indptr,
+                    None,
+                    kv_indices,
+                    self.req_to_token.stride(0),
+                )
+                # Sliding window
+                if (
+                    self.sliding_window_size is not None
+                    and self.sliding_window_size > 0
+                ):
+                    window_kv_indptr, window_kv_indices, window_kv_lens, _ = (
+                        update_sliding_window_buffer(
+                            self.window_kv_indptr,
+                            self.req_to_token,
+                            self.sliding_window_size,
+                            forward_batch.seq_lens,
+                            forward_batch.req_pool_indices,
+                            bs,
+                            self.device,
+                            self.token_to_kv_pool_allocator,
+                        )
+                    )
+                    window_num_kv_splits = torch.empty(
+                        (bs,), dtype=torch.int32, device=self.device
+                    )
+                    self.get_num_kv_splits(window_num_kv_splits, window_kv_lens)
+            else:
+                kv_indptr, kv_indices = spec_info.kv_indptr, spec_info.kv_indices
+                bs = kv_indptr.shape[0] - 1
+
+            attn_logits = torch.empty(
+                (bs, self.num_head, self.max_kv_splits, self.v_head_dim),
+                dtype=torch.float32,
+                device=self.device,
+            )
+            attn_lse = torch.empty(
+                (bs, self.num_head, self.max_kv_splits),
+                dtype=torch.float32,
+                device=self.device,
+            )
+            num_kv_splits = torch.empty((bs,), dtype=torch.int32, device=self.device)
+            self.get_num_kv_splits(num_kv_splits, forward_batch.seq_lens)
+
+            qo_indptr = None
+            custom_mask = None
+            mask_indptr = None
+            max_extend_len = None
+        elif forward_batch.forward_mode.is_target_verify():
+            bs = len(forward_batch.req_pool_indices)
+            qo_indptr = torch.arange(
+                0,
+                (1 + bs) * self.num_draft_tokens,
+                step=self.num_draft_tokens,
+                dtype=torch.int32,
+                device=self.device,
+            )
+            # Different with flashinfer kv_indptr and kv_indices construction
+            kv_indptr[1 : bs + 1] = torch.cumsum(forward_batch.seq_lens, dim=0)
+            kv_indptr = kv_indptr[: bs + 1]
+            kv_indices = torch.empty(
+                kv_indptr[-1], dtype=torch.int64, device=self.device
+            )
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                kv_indptr,
+                None,
+                kv_indices,
+                self.req_to_token.stride(0),
+            )
+
+            if self.sliding_window_size is not None and self.sliding_window_size > 0:
+                # window_kv_offsets is used to calculate the start position in custom mask
+                (
+                    window_kv_indptr,
+                    window_kv_indices,
+                    window_kv_lens,
+                    window_kv_offsets,
+                ) = update_sliding_window_buffer(
+                    self.window_kv_indptr,
+                    self.req_to_token,
+                    self.sliding_window_size,
+                    forward_batch.seq_lens,
+                    forward_batch.req_pool_indices,
+                    bs,
+                    self.device,
+                    self.token_to_kv_pool_allocator,
+                )
+
+            custom_mask = spec_info.custom_mask
+            seq_mask_len = self.num_draft_tokens * (
+                forward_batch.seq_lens + self.num_draft_tokens
+            )
+            mask_indptr = self.mask_indptr
+            mask_indptr[1 : bs + 1] = torch.cumsum(seq_mask_len[:bs], dim=0)
+            mask_indptr = mask_indptr[: bs + 1]
+            max_extend_len = self.num_draft_tokens
+            num_kv_splits = None
+            attn_logits = None
+            attn_lse = None
+
+        elif forward_batch.forward_mode.is_draft_extend():
+            kv_indices, kv_indptr, qo_indptr, custom_mask = (
+                spec_info.generate_attn_arg_prefill(
+                    forward_batch.req_pool_indices,
+                    forward_batch.seq_lens,
+                    None,
+                    self.req_to_token,
+                )
+            )
+            kv_indices = kv_indices.to(torch.int64)
+            mask_indptr = None
+            # TODO(FIXME): This will trigger an invalid Eagle tree when using
+            # `max(spec_info.accept_length_cpu)`.
+            # It might have been forgotten to update somewhere.
+            max_extend_len = torch.max(spec_info.accept_length).item()
+            num_kv_splits = None
+            attn_logits = None
+            attn_lse = None
+        else:
+            kv_indptr[1 : bs + 1] = torch.cumsum(
+                forward_batch.extend_prefix_lens, dim=0
+            )
+            kv_indptr = kv_indptr[: bs + 1]
+            kv_indices = torch.empty(
+                sum(forward_batch.extend_prefix_lens_cpu),
+                dtype=torch.int64,
+                device=self.device,
+            )
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                forward_batch.req_pool_indices,
+                forward_batch.extend_prefix_lens,
+                kv_indptr,
+                None,
+                kv_indices,
+                self.req_to_token.stride(0),
+            )
+            # Sliding window
+            if self.sliding_window_size is not None and self.sliding_window_size > 0:
+                (
+                    window_kv_indptr,
+                    window_kv_indices,
+                    window_kv_lens,
+                    window_kv_offsets,
+                ) = update_sliding_window_buffer(
+                    self.window_kv_indptr,
+                    self.req_to_token,
+                    self.sliding_window_size,
+                    forward_batch.extend_prefix_lens,
+                    forward_batch.req_pool_indices,
+                    bs,
+                    self.device,
+                    self.token_to_kv_pool_allocator,
+                )
+
+            qo_indptr = self.qo_indptr
+            qo_indptr[1 : bs + 1] = torch.cumsum(forward_batch.extend_seq_lens, dim=0)
+            qo_indptr = qo_indptr[: bs + 1]
+            custom_mask = None
+            mask_indptr = None
+            attn_logits = None
+            attn_lse = None
+            max_extend_len = max(forward_batch.extend_seq_lens_cpu)
+            num_kv_splits = None
+
+        self.forward_metadata = ForwardMetadata(
+            attn_logits,
+            attn_lse,
+            max_extend_len,
+            num_kv_splits,
+            kv_indptr,
+            kv_indices,
+            qo_indptr,
+            custom_mask,
+            mask_indptr,
+            window_kv_indptr,
+            window_kv_indices,
+            window_num_kv_splits,
+            window_kv_offsets,
+        )
+
+    def init_cuda_graph_state(
+        self,
+        max_bs: int,
+        max_num_tokens: int,
+        kv_indices_buf: Optional[torch.Tensor] = None,
+        cuda_graph_num_kv_splits_buf: Optional[torch.Tensor] = None,
+    ):
+        self.cuda_graph_attn_logits = torch.zeros(
+            (max_num_tokens, self.num_head, self.max_kv_splits, self.v_head_dim),
+            dtype=torch.float32,
+            device=self.device,
+        )
+        self.cuda_graph_attn_lse = torch.zeros(
+            (max_num_tokens, self.num_head, self.max_kv_splits),
+            dtype=torch.float32,
+            device=self.device,
+        )
+
+        if cuda_graph_num_kv_splits_buf is None:
+            self.cuda_graph_num_kv_splits = torch.full(
+                (max_num_tokens,),
+                self.max_kv_splits,
+                dtype=torch.int32,
+                device=self.device,
+            )
+        else:
+            self.cuda_graph_num_kv_splits = cuda_graph_num_kv_splits_buf
+
+        if kv_indices_buf is None:
+            self.cuda_graph_kv_indices = torch.zeros(
+                (max_num_tokens * self.max_context_len),
+                dtype=torch.int64,
+                device=self.device,
+            )
+        else:
+            self.cuda_graph_kv_indices = kv_indices_buf
+
+        if not self.skip_prefill:
+            self.cuda_graph_custom_mask = torch.zeros(
+                (max_num_tokens * self.max_context_len),
+                dtype=torch.uint8,
+                device=self.device,
+            )
+
+        if self.sliding_window_size is not None and self.sliding_window_size > 0:
+            if kv_indices_buf is None:
+                self.cuda_graph_window_kv_indices = torch.zeros(
+                    (max_num_tokens * self.sliding_window_size),
+                    dtype=torch.int64,
+                    device=self.device,
+                )
+            else:
+                self.cuda_graph_window_kv_indices = torch.zeros_like(kv_indices_buf)
+
+            self.cuda_graph_window_num_kv_splits = torch.full(
+                (max_num_tokens,),
+                self.max_kv_splits,
+                dtype=torch.int32,
+                device=self.device,
+            )
+
+            self.cuda_graph_window_kv_offsets = torch.zeros(
+                (max_bs,),
+                dtype=torch.int32,
+                device=self.device,
+            )
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInput],
+    ):
+        assert encoder_lens is None, "Not supported"
+        window_kv_indptr = self.window_kv_indptr
+        window_kv_indices = None
+        window_num_kv_splits = None
+        window_kv_offsets = None
+
+        if forward_mode.is_decode_or_idle():
+            if spec_info is None:
+                kv_indptr = self.kv_indptr
+                kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens, dim=0)
+                kv_indptr = kv_indptr[: bs + 1]
+                kv_indices = self.cuda_graph_kv_indices
+                create_flashinfer_kv_indices_triton[(bs,)](
+                    self.req_to_token,
+                    req_pool_indices,
+                    seq_lens,
+                    kv_indptr,
+                    None,
+                    kv_indices,
+                    self.req_to_token.stride(0),
+                )
+                if (
+                    self.sliding_window_size is not None
+                    and self.sliding_window_size > 0
+                ):
+                    window_kv_indices = self.cuda_graph_window_kv_indices
+                    window_num_kv_splits = self.cuda_graph_window_num_kv_splits
+                    window_kv_indptr, window_kv_indices, _, _ = (
+                        update_sliding_window_buffer_cuda_graph(
+                            self.window_kv_indptr,
+                            window_kv_indices,
+                            self.req_to_token,
+                            self.sliding_window_size,
+                            seq_lens[:bs],
+                            req_pool_indices,
+                            bs,
+                            self.token_to_kv_pool_allocator,
+                        )
+                    )
+            else:
+                kv_indptr, kv_indices = spec_info.kv_indptr, spec_info.kv_indices
+
+            attn_logits = self.cuda_graph_attn_logits
+            attn_lse = self.cuda_graph_attn_lse
+            max_extend_len = None
+            num_kv_splits = self.cuda_graph_num_kv_splits
+            qo_indptr = None
+            custom_mask = None
+            mask_indptr = None
+        elif forward_mode.is_target_verify():
+            qo_indptr = self.qo_indptr[: bs + 1]
+            qo_indptr[: bs + 1] = torch.arange(
+                0,
+                (1 + bs) * self.num_draft_tokens,
+                step=self.num_draft_tokens,
+                dtype=torch.int32,
+                device=self.device,
+            )
+            kv_indptr = self.kv_indptr[: bs + 1]
+            kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens, dim=0)
+            kv_indices = self.cuda_graph_kv_indices
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                seq_lens,
+                kv_indptr,
+                None,
+                kv_indices,
+                self.req_to_token.stride(0),
+            )
+
+            if self.sliding_window_size is not None and self.sliding_window_size > 0:
+                window_kv_indices = self.cuda_graph_window_kv_indices
+                window_num_kv_splits = self.cuda_graph_window_num_kv_splits
+                window_kv_offsets = self.cuda_graph_window_kv_offsets
+                window_kv_indptr, window_kv_indices, _, window_kv_offsets[:bs] = (
+                    update_sliding_window_buffer_cuda_graph(
+                        self.window_kv_indptr,
+                        window_kv_indices,
+                        self.req_to_token,
+                        self.sliding_window_size,
+                        seq_lens[:bs],
+                        req_pool_indices,
+                        bs,
+                        self.token_to_kv_pool_allocator,
+                    )
+                )
+
+            custom_mask = self.cuda_graph_custom_mask
+            custom_mask[: spec_info.custom_mask.shape[0]] = spec_info.custom_mask
+            seq_mask_len = self.num_draft_tokens * (seq_lens + self.num_draft_tokens)
+            mask_indptr = self.mask_indptr[: bs + 1]
+            mask_indptr[1 : bs + 1] = torch.cumsum(seq_mask_len, dim=0)
+            max_extend_len = self.num_draft_tokens
+            num_kv_splits = None
+            attn_logits = None
+            attn_lse = None
+        elif forward_mode.is_draft_extend(include_v2=True):
+            num_tokens_per_bs = self.speculative_num_steps + 1
+            qo_indptr = self.qo_indptr[: bs + 1]
+            qo_indptr[: bs + 1] = torch.arange(
+                0,
+                bs * num_tokens_per_bs + 1,
+                step=num_tokens_per_bs,
+                dtype=torch.int32,
+                device=self.device,
+            )
+            kv_indptr = self.kv_indptr[: bs + 1]
+            kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens, dim=0)
+            kv_indices = self.cuda_graph_kv_indices
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                seq_lens,
+                kv_indptr,
+                None,
+                kv_indices,
+                self.req_to_token.stride(0),
+            )
+            custom_mask = None
+            mask_indptr = None
+            max_extend_len = num_tokens_per_bs
+            num_kv_splits = None
+            attn_logits = None
+            attn_lse = None
+        else:
+            raise ValueError(
+                f"Invalid forward mode: {forward_mode=} for CUDA Graph capture."
+            )
+
+        self.forward_metadata = ForwardMetadata(
+            attn_logits,
+            attn_lse,
+            max_extend_len,
+            num_kv_splits,
+            kv_indptr,
+            kv_indices,
+            qo_indptr,
+            custom_mask,
+            mask_indptr,
+            window_kv_indptr,
+            window_kv_indices,
+            window_num_kv_splits,
+            window_kv_offsets,
+        )
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInput],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+        # NOTE: encoder_lens expected to be zeros or None
+        if forward_mode.is_decode_or_idle():
+            # Update kv_indptr, kv_indices
+            kv_indptr = self.kv_indptr
+            kv_indices = self.cuda_graph_kv_indices
+            num_kv_splits = self.cuda_graph_num_kv_splits
+            if spec_info is None:
+                kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens[:bs], dim=0)
+                kv_indptr = kv_indptr[: bs + 1]
+                create_flashinfer_kv_indices_triton[(bs,)](
+                    self.req_to_token,
+                    req_pool_indices[:bs],
+                    seq_lens[:bs],
+                    kv_indptr,
+                    None,
+                    kv_indices,
+                    self.req_to_token.stride(0),
+                )
+                num_token = bs
+                if (
+                    self.sliding_window_size is not None
+                    and self.sliding_window_size > 0
+                ):
+                    window_num_kv_splits = self.cuda_graph_window_num_kv_splits
+                    window_kv_indices = self.cuda_graph_window_kv_indices
+                    _, _, window_kv_lens, _ = update_sliding_window_buffer_cuda_graph(
+                        self.window_kv_indptr,
+                        window_kv_indices,
+                        self.req_to_token,
+                        self.sliding_window_size,
+                        seq_lens[:bs],
+                        req_pool_indices[:bs],
+                        bs,
+                        self.token_to_kv_pool_allocator,
+                    )
+                    self.get_num_kv_splits(
+                        window_num_kv_splits[:num_token], window_kv_lens[:bs]
+                    )
+
+            else:
+                assert False, "Multi-step cuda graph init is not done here."
+            self.get_num_kv_splits(num_kv_splits[:num_token], seq_lens[:bs])
+
+        elif forward_mode.is_target_verify():
+            # Update qo_indptr, kv_indptr, kv_indices, custom_mask, mask_indptr
+            bs = len(req_pool_indices)
+            qo_indptr = self.qo_indptr[: bs + 1]
+            qo_indptr[: bs + 1] = torch.arange(
+                0,
+                (1 + bs) * self.num_draft_tokens,
+                step=self.num_draft_tokens,
+                dtype=torch.int32,
+                device=self.device,
+            )
+            kv_indptr = self.kv_indptr[: bs + 1]
+            kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens, dim=0)
+            kv_indices = self.cuda_graph_kv_indices
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                seq_lens,
+                kv_indptr,
+                None,
+                kv_indices,
+                self.req_to_token.stride(0),
+            )
+            if self.sliding_window_size is not None and self.sliding_window_size > 0:
+                window_num_kv_splits = self.cuda_graph_window_num_kv_splits
+                window_kv_indices = self.cuda_graph_window_kv_indices
+                window_kv_offsets = self.cuda_graph_window_kv_offsets
+                _, _, window_kv_lens, window_kv_offsets[:bs] = (
+                    update_sliding_window_buffer_cuda_graph(
+                        self.window_kv_indptr,
+                        window_kv_indices,
+                        self.req_to_token,
+                        self.sliding_window_size,
+                        seq_lens[:bs],
+                        req_pool_indices,
+                        bs,
+                        self.token_to_kv_pool_allocator,
+                    )
+                )
+            custom_mask = self.cuda_graph_custom_mask
+            custom_mask[: spec_info.custom_mask.shape[0]] = spec_info.custom_mask
+            seq_mask_len = self.num_draft_tokens * (seq_lens + self.num_draft_tokens)
+            mask_indptr = self.mask_indptr[: bs + 1]
+            mask_indptr[1 : bs + 1] = torch.cumsum(seq_mask_len, dim=0)
+        elif forward_mode.is_draft_extend(include_v2=True):
+            seq_lens = seq_lens[:bs]
+            num_tokens_per_bs = self.speculative_num_steps + 1
+            qo_indptr = self.qo_indptr[: bs + 1]
+            qo_indptr[: bs + 1] = torch.arange(
+                0,
+                bs * num_tokens_per_bs + 1,
+                step=num_tokens_per_bs,
+                dtype=torch.int32,
+                device=self.device,
+            )
+            kv_indptr = self.kv_indptr[: bs + 1]
+            kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens, dim=0)
+            kv_indices = self.cuda_graph_kv_indices
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                seq_lens,
+                kv_indptr,
+                None,
+                kv_indices,
+                self.req_to_token.stride(0),
+            )
+        else:
+            raise ValueError(
+                f"Invalid forward mode: {forward_mode=} for CUDA Graph replay."
+            )
+
+    def get_cuda_graph_seq_len_fill_value(self):
+        return 1
+
+    def get_verify_buffers_to_fill_after_draft(self):
+        """
+        Return buffers for verify attention kernels that needs to be filled after draft.
+
+        Typically, these are tree mask and position buffers.
+        """
+        return [self.cuda_graph_custom_mask, None]
+
+    def update_verify_buffers_to_fill_after_draft(
+        self, spec_info: SpecInput, cuda_graph_bs: Optional[int]
+    ):
+        pass
+
+    def forward_extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+        sinks=None,
+    ):
+        # TODO: reuse the buffer across layers
+        if layer.qk_head_dim != layer.v_head_dim:
+            o = q.new_empty((q.shape[0], layer.tp_q_head_num * layer.v_head_dim))
+        else:
+            o = torch.empty_like(q)
+
+        # Save KV cache first (must do this before unified kernel)
+        if save_kv_cache:
+            if (
+                self.use_mla or layer.k_scale is None
+            ):  # Triton MLA currently doesn't support quantized kv cache
+                forward_batch.token_to_kv_pool.set_kv_buffer(
+                    layer,
+                    forward_batch.out_cache_loc,
+                    k,
+                    v,
+                )
+            else:
+                forward_batch.token_to_kv_pool.set_kv_buffer(
+                    layer,
+                    forward_batch.out_cache_loc,
+                    k.clone(),  # cloned to protect k,v from in-place mutation in set_kv_buffer
+                    v.clone(),
+                    layer.k_scale,
+                    layer.v_scale,
+                )
+
+        logits_soft_cap = logit_capping_mod(layer.logit_capping_method, layer.logit_cap)
+
+        causal = True
+        if (
+            layer.is_cross_attention
+            or layer.attn_type == AttentionType.ENCODER_ONLY
+            or (
+                layer.attn_type == AttentionType.DECODER_BIDIRECTIONAL
+                and self.allow_bidirectional_attention_in_extend
+            )
+        ):
+            causal = False
+
+        # Deterministic mode: use unified 1-stage kernel
+        if self.enable_deterministic:
+            return self._forward_extend_unified(
+                q, o, layer, forward_batch, causal, logits_soft_cap, sinks
+            )
+
+        # Normal mode: use original 2-stage kernel
+        if layer.sliding_window_size is not None and layer.sliding_window_size > -1:
+            sliding_window_size = (
+                layer.sliding_window_size
+            )  # Needed for sliding window mask
+            kv_indptr = self.forward_metadata.window_kv_indptr
+            kv_indices = self.forward_metadata.window_kv_indices
+            window_kv_offsets = self.forward_metadata.window_kv_offsets
+        else:
+            sliding_window_size = -1
+            kv_indptr = self.forward_metadata.kv_indptr
+            kv_indices = self.forward_metadata.kv_indices
+            window_kv_offsets = None
+
+        if layer.k_scale is not None and layer.v_scale is not None:
+            k_descale = layer.k_scale_float
+            v_descale = layer.v_scale_float
+        else:
+            k_descale = 1.0
+            v_descale = 1.0
+
+        self.extend_attention_fwd(
+            q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
+            k.contiguous(),
+            v.contiguous(),
+            o.view(-1, layer.tp_q_head_num, layer.v_head_dim),
+            forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id),
+            forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id),
+            self.forward_metadata.qo_indptr,
+            kv_indptr,
+            kv_indices,
+            self.forward_metadata.custom_mask,
+            causal,
+            self.forward_metadata.mask_indptr,
+            self.forward_metadata.max_extend_len,
+            k_descale,
+            v_descale,
+            layer.scaling,
+            logit_cap=logits_soft_cap,
+            sliding_window_size=sliding_window_size,
+            sinks=sinks,
+            window_kv_offsets=window_kv_offsets,
+            xai_temperature_len=layer.xai_temperature_len,
+        )
+        return o
+
+    def _forward_extend_unified(
+        self,
+        q: torch.Tensor,
+        o: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        causal: bool,
+        logits_soft_cap: float,
+        sinks: Optional[torch.Tensor],
+    ):
+        """
+        Unified 1-stage extend attention for deterministic inference.
+        Both prefix and extend KV are accessed through unified kv_indices.
+        """
+        bs = forward_batch.batch_size
+
+        # Determine sliding window settings
+        if layer.sliding_window_size is not None and layer.sliding_window_size > -1:
+            sliding_window_size = layer.sliding_window_size
+            # Note: for unified kernel, we use full kv_indptr (not window)
+            prefix_kv_indptr = self.forward_metadata.window_kv_indptr
+            prefix_kv_indices = self.forward_metadata.window_kv_indices
+            # Compute window start positions (absolute position of first key in window)
+            # window_start_pos = seq_len - window_len
+            window_kv_lens = prefix_kv_indptr[1 : bs + 1] - prefix_kv_indptr[:bs]
+            # Handle TARGET_VERIFY mode where extend_prefix_lens might not be set
+            if forward_batch.extend_prefix_lens is not None:
+                window_start_pos = (
+                    forward_batch.extend_prefix_lens[:bs] - window_kv_lens
+                )
+            else:
+                # Infer from spec_info: prefix_len = seq_len - draft_token_num
+                if forward_batch.spec_info is not None and hasattr(
+                    forward_batch.spec_info, "draft_token_num"
+                ):
+                    extend_prefix_lens = (
+                        forward_batch.seq_lens[:bs]
+                        - forward_batch.spec_info.draft_token_num
+                    )
+                    window_start_pos = extend_prefix_lens - window_kv_lens
+                else:
+                    window_start_pos = None
+        else:
+            sliding_window_size = -1
+            prefix_kv_indptr = self.forward_metadata.kv_indptr
+            prefix_kv_indices = self.forward_metadata.kv_indices
+            window_start_pos = None
+
+        # Build unified kv_indices using fused Triton kernel
+        extend_kv_indices = forward_batch.out_cache_loc
+
+        # Handle cases where extend_seq_lens or extend_start_loc might not be set
+        # In speculative decoding, we can infer these from spec_info or compute them
+        if forward_batch.extend_seq_lens is None:
+            # TARGET_VERIFY mode: infer extend_seq_lens from spec_info
+            if forward_batch.spec_info is not None and hasattr(
+                forward_batch.spec_info, "draft_token_num"
+            ):
+                draft_token_num = forward_batch.spec_info.draft_token_num
+                extend_seq_lens = torch.full(
+                    (bs,), draft_token_num, dtype=torch.int32, device=self.device
+                )
+            else:
+                raise RuntimeError(
+                    "extend_seq_lens is None but cannot infer from spec_info. "
+                    "This should not happen in TARGET_VERIFY mode."
+                )
+        else:
+            extend_seq_lens = forward_batch.extend_seq_lens
+
+        # Check extend_start_loc separately - it might be None even when extend_seq_lens is set
+        if forward_batch.extend_start_loc is None:
+            # Compute extend_start_loc from extend_seq_lens
+            # extend_start_loc[i] = sum(extend_seq_lens[0:i])
+            extend_start_loc = torch.cat(
+                [
+                    torch.zeros(1, dtype=torch.int32, device=self.device),
+                    torch.cumsum(extend_seq_lens[:-1], dim=0),
+                ]
+            )
+        else:
+            extend_start_loc = forward_batch.extend_start_loc
+
+        unified_kv_indptr, unified_kv_indices, prefix_lens = (
+            self.build_unified_kv_indices(
+                prefix_kv_indptr,
+                prefix_kv_indices,
+                extend_start_loc,
+                extend_seq_lens,
+                extend_kv_indices,
+                bs,
+            )
+        )
+
+        # Convert prefix_lens to int32 for the kernel
+        prefix_lens = prefix_lens.to(torch.int32)
+
+        if layer.k_scale is not None and layer.v_scale is not None:
+            k_descale = layer.k_scale_float
+            v_descale = layer.v_scale_float
+        else:
+            k_descale = 1.0
+            v_descale = 1.0
+
+        # Call unified kernel
+        self.extend_attention_fwd_unified(
+            q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
+            o.view(-1, layer.tp_q_head_num, layer.v_head_dim),
+            forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id),
+            forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id),
+            k_descale,
+            v_descale,
+            self.forward_metadata.qo_indptr,
+            unified_kv_indptr,
+            unified_kv_indices,
+            prefix_lens,
+            self.forward_metadata.max_extend_len,
+            custom_mask=self.forward_metadata.custom_mask,
+            mask_indptr=self.forward_metadata.mask_indptr,
+            sm_scale=layer.scaling,
+            logit_cap=logits_soft_cap,
+            is_causal=causal,
+            sliding_window_size=sliding_window_size,
+            sinks=sinks,
+            window_start_pos=window_start_pos,
+            xai_temperature_len=layer.xai_temperature_len,
+        )
+
+        return o
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+        sinks=None,
+    ):
+        # During torch.compile, there is a bug in rotary_emb that causes the
+        # output value to have a 3D tensor shape. This reshapes the output correctly.
+        q = q.reshape(-1, layer.tp_q_head_num * layer.qk_head_dim)
+
+        # TODO: reuse the buffer across layers
+        if layer.qk_head_dim != layer.v_head_dim:
+            o = q.new_empty((q.shape[0], layer.tp_q_head_num * layer.v_head_dim))
+        else:
+            o = torch.empty_like(q)
+
+        logits_soft_cap = logit_capping_mod(layer.logit_capping_method, layer.logit_cap)
+
+        if save_kv_cache:
+            if self.use_mla:  # Triton MLA currently doesn't support quantized kv cache
+                forward_batch.token_to_kv_pool.set_kv_buffer(
+                    layer,
+                    forward_batch.out_cache_loc,
+                    k,
+                    v,
+                )
+            else:
+                forward_batch.token_to_kv_pool.set_kv_buffer(
+                    layer,
+                    forward_batch.out_cache_loc,
+                    k,
+                    v,
+                    layer.k_scale,
+                    layer.v_scale,
+                )
+
+        if layer.sliding_window_size is not None and layer.sliding_window_size > -1:
+            kv_indptr = self.forward_metadata.window_kv_indptr
+            kv_indices = self.forward_metadata.window_kv_indices
+        else:
+            kv_indptr = self.forward_metadata.kv_indptr
+            kv_indices = self.forward_metadata.kv_indices
+
+        if layer.k_scale is not None and layer.v_scale is not None:
+            k_descale = layer.k_scale_float
+            v_descale = layer.v_scale_float
+        else:
+            k_descale = 1.0
+            v_descale = 1.0
+
+        self.decode_attention_fwd(
+            q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
+            forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id),
+            forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id),
+            o.view(-1, layer.tp_q_head_num, layer.v_head_dim),
+            kv_indptr,
+            kv_indices,
+            self.forward_metadata.attn_logits,
+            self.forward_metadata.attn_lse,
+            self.forward_metadata.num_kv_splits,
+            self.max_kv_splits,
+            layer.scaling,
+            k_descale,
+            v_descale,
+            logit_cap=logits_soft_cap,
+            sinks=sinks,
+            xai_temperature_len=layer.xai_temperature_len,
+        )
+        return o
+
+
+class TritonMultiStepDraftBackend:
+    """
+    Wrap multiple triton attention backends as one for multiple consecutive
+    draft decoding steps.
+    """
+
+    def __init__(
+        self,
+        model_runner: ModelRunner,
+        topk: int,
+        speculative_num_steps: int,
+    ):
+        self.topk = topk
+        self.speculative_num_steps = speculative_num_steps
+        max_bs = model_runner.req_to_token_pool.size * self.topk
+        self.kv_indptr = torch.zeros(
+            (
+                self.speculative_num_steps,
+                max_bs + 1,
+            ),
+            dtype=torch.int32,
+            device=model_runner.device,
+        )
+        self.attn_backends: List[TritonAttnBackend] = []
+        for i in range(self.speculative_num_steps - 1):
+            self.attn_backends.append(
+                TritonAttnBackend(
+                    model_runner,
+                    skip_prefill=True,
+                    kv_indptr_buf=self.kv_indptr[i],
+                )
+            )
+        self.max_context_len = self.attn_backends[0].max_context_len
+        self.num_head = (
+            model_runner.model_config.num_attention_heads // get_attention_tp_size()
+        )
+        self.device = model_runner.device
+        # Cached variables for generate_draft_decode_kv_indices
+        self.pool_len = model_runner.req_to_token_pool.req_to_token.shape[1]
+        self.page_size = model_runner.server_args.page_size
+
+    def common_template(
+        self,
+        forward_batch: ForwardBatch,
+        kv_indices_buffer: Optional[torch.Tensor],
+        call_fn: int,
+    ):
+        if kv_indices_buffer is None:
+            kv_indices_buffer = self.cuda_graph_kv_indices
+
+        num_seqs = forward_batch.batch_size
+        bs = self.topk * num_seqs
+        seq_lens_sum = forward_batch.seq_lens_sum
+
+        generate_draft_decode_kv_indices[
+            (self.speculative_num_steps, num_seqs, self.topk)
+        ](
+            forward_batch.req_pool_indices,
+            forward_batch.req_to_token_pool.req_to_token,
+            forward_batch.seq_lens,
+            kv_indices_buffer,
+            self.kv_indptr,
+            forward_batch.positions,
+            self.pool_len,
+            kv_indices_buffer.shape[1],
+            self.kv_indptr.shape[1],
+            next_power_of_2(num_seqs),
+            next_power_of_2(self.speculative_num_steps),
+            next_power_of_2(bs),
+            self.page_size,
+        )
+
+        if call_fn is None:
+            return
+
+        for i in range(self.speculative_num_steps - 1):
+            forward_batch.spec_info.kv_indptr = self.kv_indptr[i, : bs + 1]
+            forward_batch.spec_info.kv_indices = kv_indices_buffer[i][
+                : seq_lens_sum * self.topk + bs * (i + 1)
+            ]
+            call_fn(i, forward_batch)
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        kv_indices = torch.empty(
+            (
+                self.speculative_num_steps,
+                forward_batch.batch_size * self.topk * self.max_context_len,
+            ),
+            dtype=torch.int64,
+            device=self.device,
+        )
+
+        def call_fn(i, forward_batch):
+            forward_batch.spec_info.kv_indptr = (
+                forward_batch.spec_info.kv_indptr.clone()
+            )
+            forward_batch.spec_info.kv_indices = (
+                forward_batch.spec_info.kv_indices.clone()
+            )
+            self.attn_backends[i].init_forward_metadata(forward_batch)
+
+        self.common_template(forward_batch, kv_indices, call_fn)
+
+    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
+        self.cuda_graph_kv_indices = torch.zeros(
+            (self.speculative_num_steps, max_num_tokens * self.max_context_len),
+            dtype=torch.int64,
+            device=self.device,
+        )
+        self.cuda_graph_num_kv_splits = torch.full(
+            (max_num_tokens,),
+            self.attn_backends[0].max_kv_splits,
+            dtype=torch.int32,
+            device=self.device,
+        )
+
+        for i in range(self.speculative_num_steps - 1):
+            self.attn_backends[i].init_cuda_graph_state(
+                max_bs,
+                max_num_tokens,
+                kv_indices_buf=self.cuda_graph_kv_indices[i],
+                cuda_graph_num_kv_splits_buf=self.cuda_graph_num_kv_splits,
+            )
+
+    def init_forward_metadata_capture_cuda_graph(self, forward_batch: ForwardBatch):
+        def call_fn(i, forward_batch):
+            self.attn_backends[i].init_forward_metadata_capture_cuda_graph(
+                forward_batch.batch_size,
+                forward_batch.batch_size * self.topk,
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                encoder_lens=None,
+                forward_mode=ForwardMode.DECODE,
+                spec_info=forward_batch.spec_info,
+            )
+
+        self.common_template(forward_batch, None, call_fn)
+
+    def init_forward_metadata_replay_cuda_graph(
+        self, forward_batch: ForwardBatch, bs: int
+    ):
+        self.common_template(forward_batch, None, None)
+
+        # NOTE: Multi-step's attention backends use the slice of
+        # - kv_indptr buffer (cuda graph and non-cuda graph)
+        # - kv_indices buffer (cuda graph only)
+        # So we don't need to assign the KV indices inside the attention backend.
+
+        # Compute num_kv_splits only once
+        num_token = forward_batch.batch_size * self.topk
+        self.attn_backends[-1].get_num_kv_splits(
+            self.attn_backends[-1].cuda_graph_num_kv_splits[:num_token],
+            forward_batch.seq_lens[:bs],
+        )
+
+
+@triton.jit
+def get_num_kv_splits_triton(
+    num_kv_splits_ptr,
+    seq_lens_ptr,
+    num_seq,
+    num_group,
+    num_head,
+    num_kv_head,
+    max_kv_splits,
+    device_core_count,
+    MAX_NUM_SEQ: tl.constexpr,
+):
+    # TODO: this method is tunable, we need more online serving data to tune it
+    offs_seq = tl.arange(0, MAX_NUM_SEQ)
+    mask_seq = offs_seq < num_seq
+
+    seq_lens = tl.load(seq_lens_ptr + offs_seq, mask=mask_seq, other=0)
+    max_seq_len = tl.max(seq_lens)
+    seq_lens = tl.load(seq_lens_ptr + offs_seq, mask=mask_seq, other=max_seq_len)
+    min_seq_len = tl.min(seq_lens)
+    if max_seq_len * 8 < min_seq_len * 10:
+        min_seq_len = max_seq_len
+    max_kv_splits_1 = tl.minimum(tl.cdiv(max_seq_len, min_seq_len), max_kv_splits)
+    kv_chunk_size_1 = tl.cdiv(max_seq_len, max_kv_splits_1)
+
+    # NOTE: this is a hack to let num_kv_split grows up with seqlen gradually
+    ext_seq_len = tl.cast(max_seq_len, tl.float32) / 64.0
+    ext_device_core_count = tl.cast(
+        device_core_count * tl.maximum(tl.log2(ext_seq_len), 1.0), tl.int32
+    )
+    block_h, num_kv_group = 16, num_head // num_kv_head
+    if num_kv_group == 1:
+        token_grid = num_seq * num_group * num_head
+    else:
+        # from triton_ops/decode_attention.py:_decode_grouped_att_m_fwd
+        block_h = tl.minimum(block_h, num_kv_group)
+        token_grid = num_seq * num_group * tl.cdiv(num_head, block_h)
+    max_kv_splits_2 = tl.minimum(
+        tl.cdiv(ext_device_core_count, token_grid), max_kv_splits
+    )
+    kv_chunk_size_2 = tl.cdiv(max_seq_len, max_kv_splits_2)
+
+    num_kv_splits = tl.maximum(
+        tl.cdiv(seq_lens, kv_chunk_size_1), tl.cdiv(seq_lens, kv_chunk_size_2)
+    )
+
+    offs_token = offs_seq * num_group
+    mask_token = offs_token < num_seq * num_group
+    for i in range(0, num_group):
+        tl.store(num_kv_splits_ptr + i + offs_token, num_kv_splits, mask=mask_token)
+
+
+def update_sliding_window_buffer(
+    window_kv_indptr,
+    req_to_token,
+    sliding_window_size,
+    seq_lens,
+    req_pool_indices,
+    bs,
+    device,
+    token_to_kv_pool_allocator=None,
+):
+    window_kv_lens = torch.minimum(
+        seq_lens,
+        torch.tensor(sliding_window_size),
+    )
+    window_kv_indptr[1 : bs + 1] = torch.cumsum(window_kv_lens, dim=0)
+    window_kv_indptr = window_kv_indptr[: bs + 1]
+    window_kv_indices = torch.empty(
+        window_kv_indptr[-1], dtype=torch.int64, device=device
+    )
+    window_kv_start_idx = seq_lens - window_kv_lens
+    create_flashinfer_kv_indices_triton[(bs,)](
+        req_to_token,
+        req_pool_indices,
+        window_kv_lens,
+        window_kv_indptr,
+        window_kv_start_idx,
+        window_kv_indices,
+        req_to_token.stride(0),
+    )
+    # full to swa index mapping
+    if hasattr(token_to_kv_pool_allocator, "translate_loc_from_full_to_swa"):
+        kv_last_index = window_kv_indptr[-1]
+        window_kv_indices[:kv_last_index] = (
+            token_to_kv_pool_allocator.translate_loc_from_full_to_swa(
+                window_kv_indices[:kv_last_index]
+            )
+        )
+    return window_kv_indptr, window_kv_indices, window_kv_lens, window_kv_start_idx
+
+
+def update_sliding_window_buffer_cuda_graph(
+    window_kv_indptr,
+    window_kv_indices,
+    req_to_token,
+    sliding_window_size,
+    seq_lens,
+    req_pool_indices,
+    bs,
+    token_to_kv_pool_allocator=None,
+):
+    window_kv_lens = torch.minimum(
+        seq_lens,
+        torch.tensor(sliding_window_size),
+    )
+    window_kv_indptr[1 : bs + 1] = torch.cumsum(window_kv_lens, dim=0)
+    window_kv_indptr = window_kv_indptr[: bs + 1]
+    window_kv_start_idx = seq_lens - window_kv_lens
+    create_flashinfer_kv_indices_triton[(bs,)](
+        req_to_token,
+        req_pool_indices,
+        window_kv_lens,
+        window_kv_indptr,
+        window_kv_start_idx,
+        window_kv_indices,
+        req_to_token.stride(0),
+    )
+    # full to swa index mapping
+    if hasattr(token_to_kv_pool_allocator, "translate_loc_from_full_to_swa"):
+        kv_last_index = window_kv_indptr[-1]
+        window_kv_indices[:kv_last_index] = (
+            token_to_kv_pool_allocator.translate_loc_from_full_to_swa(
+                window_kv_indices[:kv_last_index]
+            )
+        )
+    return window_kv_indptr, window_kv_indices, window_kv_lens, window_kv_start_idx
diff --git a/sglang/python/sglang/srt/layers/attention/triton_ops/__pycache__/prefill_attention.cpython-311.pyc b/sglang/python/sglang/srt/layers/attention/triton_ops/__pycache__/prefill_attention.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4934075e51a75dedd0b0d344dc6ec24a75e9a5a8
Binary files /dev/null and b/sglang/python/sglang/srt/layers/attention/triton_ops/__pycache__/prefill_attention.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/attention/triton_ops/decode_attention.py b/sglang/python/sglang/srt/layers/attention/triton_ops/decode_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b166f3b0397ee83a26525b13a63e703ce32c2bb
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/triton_ops/decode_attention.py
@@ -0,0 +1,787 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+Memory-efficient attention for decoding.
+It supports page size = 1.
+"""
+
+# Adapted from
+# https://github.com/ModelTC/lightllm/blob/96353e868a840db4d103138caf15ed9dbea8c186/lightllm/models/deepseek2/triton_kernel/gqa_flash_decoding_stage1.py
+# https://github.com/ModelTC/lightllm/blob/96353e868a840db4d103138caf15ed9dbea8c186/lightllm/models/deepseek2/triton_kernel/gqa_flash_decoding_stage2.py
+
+import logging
+
+import triton
+import triton.language as tl
+
+from sglang.srt.utils import is_hip
+
+_is_hip = is_hip()
+
+logger = logging.getLogger(__name__)
+
+
+_MIN_BLOCK_KV = 32
+
+
+@triton.jit
+def tanh(x):
+    # Tanh is just a scaled sigmoid
+    return 2 * tl.sigmoid(2 * x) - 1
+
+
+@triton.jit
+def _fwd_kernel_stage1(
+    Q,
+    K_Buffer,
+    V_Buffer,
+    sm_scale_withk,
+    kv_indptr,
+    kv_indices,
+    Att_Out,
+    Att_Lse,
+    num_kv_splits,
+    stride_qbs,
+    stride_qh,
+    stride_buf_kbs,
+    stride_buf_kh,
+    stride_buf_vbs,
+    stride_buf_vh,
+    stride_mid_ob,
+    stride_mid_oh,
+    stride_mid_os,
+    kv_group_num: tl.constexpr,
+    BLOCK_DMODEL: tl.constexpr,
+    BLOCK_DV: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    MIN_BLOCK_KV: tl.constexpr,
+    logit_cap: tl.constexpr,
+    Lk: tl.constexpr,
+    Lv: tl.constexpr,
+    xai_temperature_len: tl.constexpr,
+):
+    cur_batch = tl.program_id(0)
+    cur_head = tl.program_id(1)
+    split_kv_id = tl.program_id(2)
+
+    cur_kv_head = cur_head // kv_group_num
+
+    offs_d = tl.arange(0, BLOCK_DMODEL)
+    offs_dv = tl.arange(0, BLOCK_DV)
+    mask_d = offs_d < Lk
+    mask_dv = offs_dv < Lv
+
+    cur_batch_kv_start_idx = tl.load(kv_indptr + cur_batch)
+    cur_batch_seq_len = tl.load(kv_indptr + cur_batch + 1) - cur_batch_kv_start_idx
+    kv_splits = tl.load(num_kv_splits + cur_batch)
+
+    if xai_temperature_len > 0:
+        offs_qidx = cur_batch_seq_len - 1
+        xai_temperature_scale = 1.0 / tl.log2(float(xai_temperature_len))
+        _qtemp = tl.log2(offs_qidx.to(tl.float32)) * xai_temperature_scale
+        xai_temperature_reg = tl.where(offs_qidx > xai_temperature_len, _qtemp, 1.0)
+
+    off_q = cur_batch * stride_qbs + cur_head * stride_qh + offs_d
+
+    kv_len_per_split = (
+        tl.cdiv(tl.cdiv(cur_batch_seq_len, kv_splits), MIN_BLOCK_KV) * MIN_BLOCK_KV
+    )
+    split_kv_start = kv_len_per_split * split_kv_id
+    split_kv_end = tl.minimum(split_kv_start + kv_len_per_split, cur_batch_seq_len)
+
+    e_max = -float("inf")
+    e_sum = 0.0
+    acc = tl.zeros([BLOCK_DV], dtype=tl.float32)
+
+    if split_kv_end > split_kv_start:
+        q = tl.load(Q + off_q, mask=mask_d, other=0.0)
+        for start_n in range(split_kv_start, split_kv_end, BLOCK_N):
+            offs_n = start_n + tl.arange(0, BLOCK_N)
+            kv_loc = tl.load(
+                kv_indices + cur_batch_kv_start_idx + offs_n,
+                mask=offs_n < split_kv_end,
+                other=0,
+            )
+            offs_buf_k = (
+                kv_loc[:, None] * stride_buf_kbs
+                + cur_kv_head * stride_buf_kh
+                + offs_d[None, :]
+            )
+            k = tl.load(
+                K_Buffer + offs_buf_k,
+                mask=(offs_n[:, None] < split_kv_end) & (mask_d[None, :]),
+                other=0.0,
+            )
+            qk = tl.sum(q[None, :] * k, 1)
+            qk *= sm_scale_withk
+
+            if logit_cap > 0:
+                qk = logit_cap * tanh(qk / logit_cap)
+
+            if xai_temperature_len > 0:
+                qk *= xai_temperature_reg
+
+            qk = tl.where(offs_n < split_kv_end, qk, float("-inf"))
+
+            offs_buf_v = (
+                kv_loc[:, None] * stride_buf_vbs
+                + cur_kv_head * stride_buf_vh
+                + offs_dv[None, :]
+            )
+            v = tl.load(
+                V_Buffer + offs_buf_v,
+                mask=(offs_n[:, None] < split_kv_end) & (mask_dv[None, :]),
+                other=0.0,
+            )
+
+            n_e_max = tl.maximum(tl.max(qk, 0), e_max)
+            re_scale = tl.exp(e_max - n_e_max)
+            p = tl.exp(qk - n_e_max)
+            acc *= re_scale
+            acc += tl.sum(p[:, None] * v, 0)
+
+            e_sum = e_sum * re_scale + tl.sum(p, 0)
+            e_max = n_e_max
+
+        offs_mid_o = (
+            cur_batch * stride_mid_ob
+            + cur_head * stride_mid_oh
+            + split_kv_id * stride_mid_os
+            + offs_dv
+        )
+
+        tl.store(
+            Att_Out + offs_mid_o,
+            acc / e_sum,
+            mask=(mask_dv),
+        )
+
+        offs_mid_o_1 = (
+            cur_batch * stride_mid_ob
+            + cur_head * stride_mid_oh
+            + split_kv_id * stride_mid_os
+        ) // Lv
+
+        tl.store(
+            Att_Lse + offs_mid_o_1,
+            e_max + tl.log(e_sum),
+        )
+
+
+def _decode_att_m_fwd(
+    q,
+    k_buffer,
+    v_buffer,
+    att_out,
+    att_lse,
+    kv_indptr,
+    kv_indices,
+    num_kv_splits,
+    max_kv_splits,
+    sm_scale_withk,
+    logit_cap,
+    xai_temperature_len=-1,
+):
+    BLOCK = 64
+    # [TODO] work around SGPR limit on MI3xx
+    if _is_hip:
+        BLOCK = 8
+    MAX_KV_SPLITS = max_kv_splits
+    Lk = k_buffer.shape[-1]
+    Lv = v_buffer.shape[-1]
+
+    batch, head_num = q.shape[0], q.shape[1]
+
+    grid = (batch, head_num, MAX_KV_SPLITS)
+    kv_group_num = q.shape[1] // k_buffer.shape[1]
+
+    if kv_group_num == 1:
+        num_warps = 4
+    else:
+        num_warps = 2
+        if _is_hip:
+            num_warps = 1
+
+    BLOCK_DMODEL = triton.next_power_of_2(Lk)
+    BLOCK_DV = triton.next_power_of_2(Lv)
+
+    _fwd_kernel_stage1[grid](
+        q,
+        k_buffer,
+        v_buffer,
+        sm_scale_withk,
+        kv_indptr,
+        kv_indices,
+        att_out,
+        att_lse,
+        num_kv_splits,
+        q.stride(0),
+        q.stride(1),
+        k_buffer.stride(0),
+        k_buffer.stride(1),
+        v_buffer.stride(0),
+        v_buffer.stride(1),
+        att_out.stride(0),
+        att_out.stride(1),
+        att_out.stride(2),
+        kv_group_num=kv_group_num,
+        BLOCK_DMODEL=BLOCK_DMODEL,
+        BLOCK_DV=BLOCK_DV,
+        BLOCK_N=BLOCK,
+        MIN_BLOCK_KV=_MIN_BLOCK_KV,
+        logit_cap=logit_cap,
+        xai_temperature_len=xai_temperature_len,
+        num_warps=num_warps,
+        num_stages=2,
+        Lk=Lk,
+        Lv=Lv,
+    )
+
+
+@triton.jit
+def _fwd_grouped_kernel_stage1(
+    Q,
+    K_Buffer,
+    V_Buffer,
+    sm_scale_withk,
+    kv_indptr,
+    kv_indices,
+    Att_Out,
+    Att_Lse,
+    num_kv_splits,
+    stride_qbs,
+    stride_qh,
+    stride_buf_kbs,
+    stride_buf_kh,
+    stride_buf_vbs,
+    stride_buf_vh,
+    stride_mid_ob,
+    stride_mid_oh,
+    stride_mid_os,
+    kv_group_num: tl.constexpr,
+    q_head_num: tl.constexpr,
+    BLOCK_DMODEL: tl.constexpr,
+    BLOCK_DPE: tl.constexpr,
+    BLOCK_DV: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_H: tl.constexpr,
+    MIN_BLOCK_KV: tl.constexpr,
+    logit_cap: tl.constexpr,
+    xai_temperature_len: tl.constexpr,
+    Lk: tl.constexpr,
+    Lv: tl.constexpr,
+):
+    cur_batch = tl.program_id(0)
+    cur_head_id = tl.program_id(1)
+    cur_kv_head = cur_head_id // tl.cdiv(kv_group_num, BLOCK_H)
+    split_kv_id = tl.program_id(2)
+
+    if BLOCK_H < kv_group_num:
+        VALID_BLOCK_H: tl.constexpr = BLOCK_H
+    else:
+        VALID_BLOCK_H: tl.constexpr = kv_group_num
+    cur_head = cur_head_id * VALID_BLOCK_H + tl.arange(0, BLOCK_H)
+    mask_h = cur_head < (cur_head_id + 1) * VALID_BLOCK_H
+    mask_h = mask_h & (cur_head < q_head_num)
+
+    offs_d = tl.arange(0, BLOCK_DMODEL)
+    offs_dv = tl.arange(0, BLOCK_DV)
+    mask_d = offs_d < Lk
+    mask_dv = offs_dv < Lv
+
+    cur_batch_kv_start_idx = tl.load(kv_indptr + cur_batch)
+    cur_batch_seq_len = tl.load(kv_indptr + cur_batch + 1) - cur_batch_kv_start_idx
+    kv_splits = tl.load(num_kv_splits + cur_batch)
+
+    if xai_temperature_len > 0:
+        offs_qidx = cur_batch_seq_len - 1
+        xai_temperature_scale = 1.0 / tl.log2(float(xai_temperature_len))
+        _qtemp = tl.log2(offs_qidx.to(tl.float32)) * xai_temperature_scale
+        xai_temperature_reg = tl.where(offs_qidx > xai_temperature_len, _qtemp, 1.0)
+
+    offs_q = cur_batch * stride_qbs + cur_head[:, None] * stride_qh + offs_d[None, :]
+
+    if BLOCK_DPE > 0:
+        offs_dpe = BLOCK_DMODEL + tl.arange(0, BLOCK_DPE)
+        mask_dpe = offs_dpe < Lk
+        off_qpe = (
+            cur_batch * stride_qbs + cur_head[:, None] * stride_qh + offs_dpe[None, :]
+        )
+
+    kv_len_per_split = (
+        tl.cdiv(tl.cdiv(cur_batch_seq_len, kv_splits), MIN_BLOCK_KV) * MIN_BLOCK_KV
+    )
+    split_kv_start = kv_len_per_split * split_kv_id
+    split_kv_end = tl.minimum(split_kv_start + kv_len_per_split, cur_batch_seq_len)
+
+    e_max = tl.zeros([BLOCK_H], dtype=tl.float32) - float("inf")
+    e_sum = tl.zeros([BLOCK_H], dtype=tl.float32)
+    acc = tl.zeros([BLOCK_H, BLOCK_DV], dtype=tl.float32)
+
+    if split_kv_end > split_kv_start:
+        q = tl.load(Q + offs_q, mask=(mask_h[:, None]) & (mask_d[None, :]), other=0.0)
+        if BLOCK_DPE > 0:
+            qpe = tl.load(
+                Q + off_qpe, mask=(mask_h[:, None]) & (mask_dpe[None, :]), other=0.0
+            )
+        for start_n in range(split_kv_start, split_kv_end, BLOCK_N):
+            offs_n = start_n + tl.arange(0, BLOCK_N)
+            kv_loc = tl.load(
+                kv_indices + cur_batch_kv_start_idx + offs_n,
+                mask=offs_n < split_kv_end,
+                other=0,
+            )
+            offs_buf_k = (
+                kv_loc[None, :] * stride_buf_kbs
+                + cur_kv_head * stride_buf_kh
+                + offs_d[:, None]
+            )
+            k = tl.load(
+                K_Buffer + offs_buf_k,
+                mask=(offs_n[None, :] < split_kv_end) & (mask_d[:, None]),
+                other=0.0,
+            )
+            qk = tl.dot(q, k.to(q.dtype))
+            if BLOCK_DPE > 0:
+                offs_buf_kpe = (
+                    kv_loc[None, :] * stride_buf_kbs
+                    + cur_kv_head * stride_buf_kh
+                    + offs_dpe[:, None]
+                )
+                kpe = tl.load(
+                    K_Buffer + offs_buf_kpe,
+                    mask=(offs_n[None, :] < split_kv_end) & (mask_dpe[:, None]),
+                    other=0.0,
+                )
+                qk += tl.dot(qpe, kpe.to(qpe.dtype))
+            qk *= sm_scale_withk
+
+            if logit_cap > 0:
+                qk = logit_cap * tanh(qk / logit_cap)
+
+            if xai_temperature_len > 0:
+                qk *= xai_temperature_reg[:, None]
+
+            qk = tl.where(
+                mask_h[:, None] & (offs_n[None, :] < split_kv_end), qk, float("-inf")
+            )
+
+            offs_buf_v = (
+                kv_loc[:, None] * stride_buf_vbs
+                + cur_kv_head * stride_buf_vh
+                + offs_dv[None, :]
+            )
+            v = tl.load(
+                V_Buffer + offs_buf_v,
+                mask=(offs_n[:, None] < split_kv_end) & (mask_dv[None, :]),
+                other=0.0,
+            )
+
+            n_e_max = tl.maximum(tl.max(qk, 1), e_max)
+            re_scale = tl.exp(e_max - n_e_max)
+            p = tl.exp(qk - n_e_max[:, None])
+            acc *= re_scale[:, None]
+            acc += tl.dot(p.to(v.dtype), v)
+
+            e_sum = e_sum * re_scale + tl.sum(p, 1)
+            e_max = n_e_max
+
+        offs_mid_o = (
+            cur_batch * stride_mid_ob
+            + cur_head[:, None] * stride_mid_oh
+            + split_kv_id * stride_mid_os
+            + offs_dv[None, :]
+        )
+
+        tl.store(
+            Att_Out + offs_mid_o,
+            acc / e_sum[:, None],
+            mask=(mask_h[:, None]) & (mask_dv[None, :]),
+        )
+
+        offs_mid_o_1 = (
+            cur_batch * stride_mid_ob
+            + cur_head * stride_mid_oh
+            + split_kv_id * stride_mid_os
+        ) // Lv
+
+        tl.store(
+            Att_Lse + offs_mid_o_1,
+            e_max + tl.log(e_sum),
+            mask=mask_h,
+        )
+
+
+def _decode_grouped_att_m_fwd(
+    q,
+    k_buffer,
+    v_buffer,
+    att_out,
+    att_lse,
+    kv_indptr,
+    kv_indices,
+    num_kv_splits,
+    max_kv_splits,
+    sm_scale_withk,
+    logit_cap,
+    xai_temperature_len=-1,
+):
+    BLOCK = 32
+    Lk = k_buffer.shape[-1]
+    Lv = v_buffer.shape[-1]
+
+    # [TODO] work around shmem limit on MI3xx
+    if _is_hip and Lk >= 576:
+        BLOCK = 16
+
+    if Lk == 576:
+        BLOCK_DMODEL = 512
+        BLOCK_DPE = 64
+    elif Lk == 288:
+        BLOCK_DMODEL = 256
+        BLOCK_DPE = 32
+    else:
+        BLOCK_DMODEL = triton.next_power_of_2(Lk)
+        BLOCK_DPE = 0
+    BLOCK_DV = triton.next_power_of_2(Lv)
+
+    batch, head_num = q.shape[0], q.shape[1]
+    kv_group_num = q.shape[1] // k_buffer.shape[1]
+
+    BLOCK_H = 16
+    MAX_KV_SPLITS = max_kv_splits
+    grid = (
+        batch,
+        triton.cdiv(head_num, min(BLOCK_H, kv_group_num)),
+        MAX_KV_SPLITS,
+    )
+
+    extra_kargs = {}
+    num_stages = 2
+    if _is_hip:
+        # https://rocm.docs.amd.com/en/docs-6.2.0/how-to/llm-fine-tuning-optimization/optimizing-triton-kernel.html
+        # https://github.com/triton-lang/triton/blob/main/third_party/amd/backend/compiler.py
+        extra_kargs = {"waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 2}
+        num_stages = 1
+
+    _fwd_grouped_kernel_stage1[grid](
+        q,
+        k_buffer,
+        v_buffer,
+        sm_scale_withk,
+        kv_indptr,
+        kv_indices,
+        att_out,
+        att_lse,
+        num_kv_splits,
+        q.stride(0),
+        q.stride(1),
+        k_buffer.stride(0),
+        k_buffer.stride(1),
+        v_buffer.stride(0),
+        v_buffer.stride(1),
+        att_out.stride(0),
+        att_out.stride(1),
+        att_out.stride(2),
+        kv_group_num=kv_group_num,
+        q_head_num=head_num,
+        BLOCK_DMODEL=BLOCK_DMODEL,
+        BLOCK_DPE=BLOCK_DPE,
+        BLOCK_DV=BLOCK_DV,
+        BLOCK_N=BLOCK,
+        BLOCK_H=BLOCK_H,
+        MIN_BLOCK_KV=_MIN_BLOCK_KV,
+        logit_cap=logit_cap,
+        xai_temperature_len=xai_temperature_len,
+        num_warps=4,
+        num_stages=num_stages,
+        Lk=Lk,
+        Lv=Lv,
+        **extra_kargs,
+    )
+
+
+@triton.jit
+def _fwd_kernel_stage2(
+    Mid_O,
+    Mid_O_1,
+    O,
+    v_scale,
+    kv_indptr,
+    num_kv_splits,
+    sink_ptr,
+    stride_mid_ob,
+    stride_mid_oh,
+    stride_mid_os,
+    stride_obs,
+    stride_oh,
+    MAX_KV_SPLITS: tl.constexpr,
+    MIN_BLOCK_KV: tl.constexpr,
+    BLOCK_DV: tl.constexpr,
+    Lv: tl.constexpr,
+    HAS_SINK: tl.constexpr,
+):
+    cur_batch = tl.program_id(0)
+    cur_head = tl.program_id(1)
+
+    cur_batch_seq_len = tl.load(kv_indptr + cur_batch + 1) - tl.load(
+        kv_indptr + cur_batch
+    )
+    kv_splits = tl.load(num_kv_splits + cur_batch)
+
+    offs_d = tl.arange(0, BLOCK_DV)
+    mask_d = offs_d < Lv
+
+    e_sum = 0.0
+    e_max = -float("inf")
+    acc = tl.zeros([BLOCK_DV], dtype=tl.float32)
+
+    offs_v = cur_batch * stride_mid_ob + cur_head * stride_mid_oh + offs_d
+    offs_logic = (cur_batch * stride_mid_ob + cur_head * stride_mid_oh) // Lv
+    kv_len_per_split = (
+        tl.cdiv(tl.cdiv(cur_batch_seq_len, kv_splits), MIN_BLOCK_KV) * MIN_BLOCK_KV
+    )
+
+    for split_kv_id in range(0, MAX_KV_SPLITS):
+        split_kv_start = kv_len_per_split * split_kv_id
+        split_kv_end = tl.minimum(split_kv_start + kv_len_per_split, cur_batch_seq_len)
+
+        if split_kv_end > split_kv_start:
+            tv = tl.load(
+                Mid_O + offs_v + split_kv_id * stride_mid_os, mask=mask_d, other=0.0
+            )
+            tlogic = tl.load(Mid_O_1 + offs_logic + split_kv_id * stride_mid_os // Lv)
+            n_e_max = tl.maximum(tlogic, e_max)
+
+            old_scale = tl.exp(e_max - n_e_max)
+            acc *= old_scale
+            exp_logic = tl.exp(tlogic - n_e_max)
+            acc += exp_logic * tv
+
+            e_sum = e_sum * old_scale + exp_logic
+            e_max = n_e_max
+
+    if HAS_SINK:
+        cur_sink = tl.load(sink_ptr + cur_head)
+        e_sum += tl.exp(cur_sink - e_max)
+
+    tl.store(
+        O + cur_batch * stride_obs + cur_head * stride_oh + offs_d,
+        acc / e_sum * v_scale,
+        mask=mask_d,
+    )
+
+
+def _decode_softmax_reducev_fwd(
+    logits,
+    lse,
+    q,
+    o,
+    v_scale,
+    v_buffer,
+    kv_indptr,
+    num_kv_splits,
+    max_kv_splits,
+    sinks=None,
+):
+    batch, head_num = q.shape[0], q.shape[1]
+    Lv = v_buffer.shape[-1]
+    BLOCK_DV = triton.next_power_of_2(Lv)
+
+    MAX_KV_SPLITS = max_kv_splits
+    HAS_SINK = sinks is not None
+
+    extra_kargs = {}
+    if _is_hip:
+        # https://rocm.docs.amd.com/en/docs-6.2.0/how-to/llm-fine-tuning-optimization/optimizing-triton-kernel.html
+        # https://github.com/triton-lang/triton/blob/main/third_party/amd/backend/compiler.py
+        extra_kargs = {"waves_per_eu": 4, "matrix_instr_nonkdim": 16, "kpack": 2}
+
+    grid = (batch, head_num)
+    _fwd_kernel_stage2[grid](
+        logits,
+        lse,
+        o,
+        v_scale,
+        kv_indptr,
+        num_kv_splits,
+        sinks,
+        logits.stride(0),
+        logits.stride(1),
+        logits.stride(2),
+        o.stride(0),
+        o.stride(1),
+        MAX_KV_SPLITS=MAX_KV_SPLITS,
+        MIN_BLOCK_KV=_MIN_BLOCK_KV,
+        BLOCK_DV=BLOCK_DV,
+        Lv=Lv,
+        HAS_SINK=HAS_SINK,
+        num_warps=4,
+        num_stages=2,
+        **extra_kargs,
+    )
+
+
+def decode_attention_fwd_normal(
+    q,
+    k_buffer,
+    v_buffer,
+    o,
+    kv_indptr,
+    kv_indices,
+    attn_logits,
+    attn_lse,
+    num_kv_splits,
+    max_kv_splits,
+    sm_scale_withk,
+    v_scale,
+    logit_cap=0.0,
+    sinks=None,
+    xai_temperature_len=-1,
+):
+    _decode_att_m_fwd(
+        q,
+        k_buffer,
+        v_buffer,
+        attn_logits,
+        attn_lse,
+        kv_indptr,
+        kv_indices,
+        num_kv_splits,
+        max_kv_splits,
+        sm_scale_withk,
+        logit_cap,
+        xai_temperature_len,
+    )
+    _decode_softmax_reducev_fwd(
+        attn_logits,
+        attn_lse,
+        q,
+        o,
+        v_scale,
+        v_buffer,
+        kv_indptr,
+        num_kv_splits,
+        max_kv_splits,
+        sinks,
+    )
+
+
+def decode_attention_fwd_grouped(
+    q,
+    k_buffer,
+    v_buffer,
+    o,
+    kv_indptr,
+    kv_indices,
+    attn_logits,
+    attn_lse,
+    num_kv_splits,
+    max_kv_splits,
+    sm_scale_withk,
+    v_scale,
+    logit_cap=0.0,
+    sinks=None,
+    xai_temperature_len=-1,
+):
+    _decode_grouped_att_m_fwd(
+        q,
+        k_buffer,
+        v_buffer,
+        attn_logits,
+        attn_lse,
+        kv_indptr,
+        kv_indices,
+        num_kv_splits,
+        max_kv_splits,
+        sm_scale_withk,
+        logit_cap,
+        xai_temperature_len,
+    )
+    _decode_softmax_reducev_fwd(
+        attn_logits,
+        attn_lse,
+        q,
+        o,
+        v_scale,
+        v_buffer,
+        kv_indptr,
+        num_kv_splits,
+        max_kv_splits,
+        sinks,
+    )
+
+
+def decode_attention_fwd(
+    q,
+    k_buffer,
+    v_buffer,
+    o,
+    kv_indptr,
+    kv_indices,
+    attn_logits,
+    attn_lse,
+    num_kv_splits,
+    max_kv_splits,
+    sm_scale,
+    k_scale,
+    v_scale,
+    logit_cap=0.0,
+    sinks=None,
+    xai_temperature_len=-1,
+):
+    assert max_kv_splits == attn_logits.shape[2]
+    assert q.shape[0] <= kv_indptr.shape[0] - 1
+    assert q.shape[0] <= attn_logits.shape[0]
+
+    kv_group_num = q.shape[1] // v_buffer.shape[1]
+
+    if kv_group_num == 1:
+        # MHA
+        decode_attention_fwd_normal(
+            q,
+            k_buffer,
+            v_buffer,
+            o,
+            kv_indptr,
+            kv_indices,
+            attn_logits,
+            attn_lse,
+            num_kv_splits,
+            max_kv_splits,
+            sm_scale * k_scale,
+            v_scale,
+            logit_cap=logit_cap,
+            sinks=sinks,
+            xai_temperature_len=xai_temperature_len,
+        )
+    else:
+        # GQA/MQA/MLA
+        decode_attention_fwd_grouped(
+            q,
+            k_buffer,
+            v_buffer,
+            o,
+            kv_indptr,
+            kv_indices,
+            attn_logits,
+            attn_lse,
+            num_kv_splits,
+            max_kv_splits,
+            sm_scale * k_scale,
+            v_scale,
+            logit_cap=logit_cap,
+            sinks=sinks,
+            xai_temperature_len=xai_temperature_len,
+        )
diff --git a/sglang/python/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py b/sglang/python/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e972f4089cbbb1545f4f8339bc9206a4dd42dd3
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py
@@ -0,0 +1,1106 @@
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import is_cuda, is_hip
+
+_is_cuda = is_cuda()
+if _is_cuda:
+    CUDA_CAPABILITY = torch.cuda.get_device_capability()
+
+_is_hip = is_hip()
+
+if get_global_server_args().triton_attention_reduce_in_fp32:
+    REDUCE_TRITON_TYPE = tl.float32
+    REDUCE_TORCH_TYPE = torch.float32
+else:
+    REDUCE_TRITON_TYPE = tl.float16
+    REDUCE_TORCH_TYPE = torch.float16
+
+
+@triton.jit
+def tanh(x):
+    # Tanh is just a scaled sigmoid
+    return 2 * tl.sigmoid(2 * x) - 1
+
+
+@triton.jit
+def _fwd_kernel_flash_decode_stage1(
+    Q,
+    K,
+    V,
+    sm_scale,
+    Req_to_tokens,
+    B_req_idx,
+    B_Seqlen,
+    Mid_O,  # [batch, head, seq_block_num, head_dim]
+    Mid_O_LogExpSum,  # [batch, head, seq_block_num]
+    stride_req_to_tokens_b,
+    stride_req_to_tokens_s,
+    stride_qbs,
+    stride_qh,
+    stride_qd,
+    stride_kbs,
+    stride_kh,
+    stride_kd,
+    stride_vbs,
+    stride_vh,
+    stride_vd,
+    stride_mid_ob,
+    stride_mid_oh,
+    stride_mid_os,
+    stride_mid_od,
+    stride_mid_o_eb,
+    stride_mid_o_eh,
+    stride_mid_o_es,
+    gqa_group_size,
+    BLOCK_SEQ: tl.constexpr,
+    BLOCK_DMODEL: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    cur_batch = tl.program_id(0)
+    cur_head = tl.program_id(1)
+    seq_start_block = tl.program_id(2)
+    cur_kv_head = cur_head // gqa_group_size
+
+    offs_d = tl.arange(0, BLOCK_DMODEL)
+    cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
+    cur_batch_req_idx = tl.load(B_req_idx + cur_batch)
+    cur_batch_start_index = seq_start_block * BLOCK_SEQ
+    cur_batch_end_index = tl.minimum(
+        cur_batch_seq_len, cur_batch_start_index + BLOCK_SEQ
+    )
+
+    off_q = cur_batch * stride_qbs + cur_head * stride_qh + offs_d
+
+    block_n_size = (
+        tl.where(
+            cur_batch_end_index - cur_batch_start_index <= 0,
+            0,
+            cur_batch_end_index - cur_batch_start_index + BLOCK_N - 1,
+        )
+        // BLOCK_N
+    )
+
+    offs_n = cur_batch_start_index + tl.arange(0, BLOCK_N)
+
+    q = tl.load(Q + off_q)
+
+    sum_exp = 0.0
+    max_logic = -float("inf")
+    acc = tl.zeros([BLOCK_DMODEL], dtype=tl.float32)
+
+    for start_n in range(0, block_n_size, 1):
+        offs_n_new = start_n * BLOCK_N + offs_n
+        k_loc = tl.load(
+            Req_to_tokens + stride_req_to_tokens_b * cur_batch_req_idx + offs_n_new,
+            mask=offs_n_new < cur_batch_end_index,
+            other=0,
+        )
+        off_k = k_loc[:, None] * stride_kbs + cur_kv_head * stride_kh + offs_d[None, :]
+        k = tl.load(
+            K + off_k, mask=offs_n_new[:, None] < cur_batch_end_index, other=0.0
+        )
+        att_value = tl.sum(q[None, :] * k, 1)
+        att_value *= sm_scale
+        att_value = tl.where(offs_n_new < cur_batch_end_index, att_value, float("-inf"))
+        v = tl.load(
+            V + off_k, mask=offs_n_new[:, None] < cur_batch_end_index, other=0.0
+        )
+
+        cur_max_logic = tl.max(att_value, axis=0)
+        new_max_logic = tl.maximum(cur_max_logic, max_logic)
+
+        exp_logic = tl.exp(att_value - new_max_logic)
+        logic_scale = tl.exp(max_logic - new_max_logic)
+        acc *= logic_scale
+        acc += tl.sum(exp_logic[:, None] * v, axis=0)
+
+        sum_exp = sum_exp * logic_scale + tl.sum(exp_logic, axis=0)
+        max_logic = new_max_logic
+
+    need_store = tl.where(block_n_size == 0, 0, 1)
+    for _ in range(0, need_store, 1):
+        off_mid_o = (
+            cur_batch * stride_mid_ob
+            + cur_head * stride_mid_oh
+            + seq_start_block * stride_mid_os
+            + offs_d
+        )
+        off_mid_o_logexpsum = (
+            cur_batch * stride_mid_o_eb + cur_head * stride_mid_o_eh + seq_start_block
+        )
+        tl.store(Mid_O + off_mid_o, acc / sum_exp)
+        tl.store(Mid_O_LogExpSum + off_mid_o_logexpsum, max_logic + tl.log(sum_exp))
+    return
+
+
+@triton.jit
+def _fwd_kernel_flash_decode_stage2(
+    B_Seqlen,
+    Mid_O,  # [batch, head, seq_block_num, head_dim]
+    Mid_O_LogExpSum,  # [batch, head, seq_block_num]
+    O,  # [batch, head, head_dim]
+    stride_mid_ob,
+    stride_mid_oh,
+    stride_mid_os,
+    stride_mid_od,
+    stride_mid_o_eb,
+    stride_mid_o_eh,
+    stride_mid_o_es,
+    stride_obs,
+    stride_oh,
+    stride_od,
+    BLOCK_SEQ: tl.constexpr,
+    BLOCK_DMODEL: tl.constexpr,
+):
+    cur_batch = tl.program_id(0)
+    cur_head = tl.program_id(1)
+
+    offs_d = tl.arange(0, BLOCK_DMODEL)
+    cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
+
+    block_n_size = (
+        tl.where(cur_batch_seq_len <= 0, 0, cur_batch_seq_len + BLOCK_SEQ - 1)
+        // BLOCK_SEQ
+    )
+
+    sum_exp = 0.0
+    max_logic = -float("inf")
+    acc = tl.zeros([BLOCK_DMODEL], dtype=tl.float32)
+
+    offs_v = cur_batch * stride_mid_ob + cur_head * stride_mid_oh + offs_d
+    offs_logic = cur_batch * stride_mid_o_eb + cur_head * stride_mid_o_eh
+    for block_seq_n in range(0, block_n_size, 1):
+        tv = tl.load(Mid_O + offs_v + block_seq_n * stride_mid_os)
+        tlogic = tl.load(Mid_O_LogExpSum + offs_logic + block_seq_n)
+        new_max_logic = tl.maximum(tlogic, max_logic)
+
+        old_scale = tl.exp(max_logic - new_max_logic)
+        acc *= old_scale
+        exp_logic = tl.exp(tlogic - new_max_logic)
+        acc += exp_logic * tv
+        sum_exp = sum_exp * old_scale + exp_logic
+        max_logic = new_max_logic
+
+    tl.store(O + cur_batch * stride_obs + cur_head * stride_oh + offs_d, acc / sum_exp)
+    return
+
+
+@torch.no_grad()
+def flash_decode_stage1(
+    q,
+    k,
+    v,
+    Req_to_tokens,
+    B_req_idx,
+    B_Seqlen,
+    max_len_in_batch,
+    mid_out,
+    mid_out_logsumexp,
+    block_seq,
+):
+    BLOCK_SEQ = block_seq
+    BLOCK_N = 16
+    assert BLOCK_SEQ % BLOCK_N == 0
+    # shape constraints
+    Lq, Lk = q.shape[-1], k.shape[-1]
+    assert Lq == Lk
+    assert Lk in {16, 32, 64, 128}
+    sm_scale = 1.0 / (Lk**0.5)
+    batch, head_num = B_req_idx.shape[0], q.shape[1]
+    grid = (batch, head_num, triton.cdiv(max_len_in_batch, BLOCK_SEQ))
+    gqa_group_size = q.shape[1] // k.shape[1]
+
+    _fwd_kernel_flash_decode_stage1[grid](
+        q,
+        k,
+        v,
+        sm_scale,
+        Req_to_tokens,
+        B_req_idx,
+        B_Seqlen,
+        mid_out,
+        mid_out_logsumexp,
+        Req_to_tokens.stride(0),
+        Req_to_tokens.stride(1),
+        q.stride(0),
+        q.stride(1),
+        q.stride(2),
+        k.stride(0),
+        k.stride(1),
+        k.stride(2),
+        v.stride(0),
+        v.stride(1),
+        v.stride(2),
+        mid_out.stride(0),
+        mid_out.stride(1),
+        mid_out.stride(2),
+        mid_out.stride(3),
+        mid_out_logsumexp.stride(0),
+        mid_out_logsumexp.stride(1),
+        mid_out_logsumexp.stride(2),
+        gqa_group_size,
+        BLOCK_SEQ=BLOCK_SEQ,
+        BLOCK_DMODEL=Lk,
+        BLOCK_N=BLOCK_N,
+        num_warps=1,
+        num_stages=2,
+    )
+    return
+
+
+@torch.no_grad()
+def flash_decode_stage2(mid_out, mid_out_logexpsum, B_Seqlen, O, block_seq):
+    Lk = mid_out.shape[-1]
+    assert Lk in {16, 32, 64, 128}
+    batch, head_num = mid_out.shape[0], mid_out.shape[1]
+    grid = (batch, head_num)
+
+    _fwd_kernel_flash_decode_stage2[grid](
+        B_Seqlen,
+        mid_out,
+        mid_out_logexpsum,
+        O,
+        mid_out.stride(0),
+        mid_out.stride(1),
+        mid_out.stride(2),
+        mid_out.stride(3),
+        mid_out_logexpsum.stride(0),
+        mid_out_logexpsum.stride(1),
+        mid_out_logexpsum.stride(2),
+        O.stride(0),
+        O.stride(1),
+        O.stride(2),
+        BLOCK_SEQ=block_seq,
+        BLOCK_DMODEL=Lk,
+        num_warps=4,
+        num_stages=2,
+    )
+    return
+
+
+def flash_decode_attention_fwd(
+    q,
+    k_buffer,
+    v_buffer,
+    o,
+    req_to_token,
+    b_req_idx,
+    b_start_loc,
+    b_seq_len,
+    attn_logits,
+    max_len_in_batch,
+    sm_scale,
+    logit_cap=0.0,
+):
+    BLOCK_SEQ = 256
+    kv_group_num = q.shape[1] // v_buffer.shape[1]
+    # batch_size = q.shape[0]
+
+    block_seq_num = (max_len_in_batch + BLOCK_SEQ - 1) // BLOCK_SEQ
+
+    mid_o = torch.empty(
+        [q.shape[0], q.shape[1], block_seq_num, q.shape[-1]],
+        dtype=torch.float32,
+        device="cuda",
+    )
+    mid_o_logexpsum = torch.empty(
+        [q.shape[0], q.shape[1], block_seq_num], dtype=torch.float32, device="cuda"
+    )
+
+    flash_decode_stage1(
+        q,
+        k_buffer,
+        v_buffer,
+        req_to_token,
+        b_req_idx,
+        b_seq_len,
+        max_len_in_batch,
+        mid_o,
+        mid_o_logexpsum,
+        BLOCK_SEQ,
+    )
+    flash_decode_stage2(mid_o, mid_o_logexpsum, b_seq_len, o, BLOCK_SEQ)
+
+
+@triton.jit
+def _sparse_fwd_kernel_flash_decode_stage1(  # Double Sparsity's approximate attention
+    Q_Label,
+    K_Label_Buffer,
+    sm_scale,
+    Req_to_tokens,  # shape: [B, S]
+    B_Seqlen,
+    Att_Out,  # shape: [H, B, S] easier for topk
+    stride_req_to_tokens_b,
+    stride_qbs,
+    stride_qh,
+    stride_buf_kbs,
+    stride_buf_kh,
+    att_stride_h,
+    att_stride_b,
+    kv_group_num: tl.constexpr,
+    BLOCK_DMODEL: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    logit_cap: tl.constexpr,
+):
+    cur_batch = tl.program_id(0)
+    cur_head = tl.program_id(1)
+    start_n = tl.program_id(2)
+
+    cur_kv_head = cur_head // kv_group_num
+
+    offs_d = tl.arange(0, BLOCK_DMODEL)
+    cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
+
+    cur_batch_start_index = 0
+    cur_batch_end_index = cur_batch_seq_len
+
+    min_val = -float("inf")
+    att_value = tl.full([BLOCK_N], min_val, dtype=tl.float32)
+
+    off_q = cur_batch * stride_qbs + cur_head * stride_qh + offs_d
+
+    offs_n = start_n * BLOCK_N + tl.arange(0, BLOCK_N)
+
+    block_index = start_n * BLOCK_N
+    block_mask = tl.where(block_index < cur_batch_seq_len, 1, 0)
+
+    for start_mark in range(0, block_mask, 1):
+        q = tl.load(Q_Label + off_q + start_mark).to(REDUCE_TRITON_TYPE)
+        offs_n_new = cur_batch_start_index + offs_n
+        k_loc = tl.load(
+            Req_to_tokens + stride_req_to_tokens_b * cur_batch + offs_n_new,
+            mask=offs_n_new < cur_batch_end_index,
+            other=0,
+        )
+        offs_buf_k = (
+            k_loc[:, None] * stride_buf_kbs
+            + cur_kv_head * stride_buf_kh
+            + offs_d[None, :]
+        )
+        k = tl.load(
+            K_Label_Buffer + offs_buf_k,
+            mask=offs_n_new[:, None] < cur_batch_end_index,
+            other=0.0,
+        ).to(REDUCE_TRITON_TYPE)
+
+        att_value = tl.sum(q[None, :] * k, 1)
+        att_value *= sm_scale
+
+        if logit_cap > 0:
+            att_value = logit_cap * tanh(att_value / logit_cap)
+
+    att_value = tl.where(offs_n < cur_batch_end_index, att_value, min_val)
+    off_o = cur_head * att_stride_h + (cur_batch * att_stride_b + offs_n)
+    tl.store(Att_Out + off_o, att_value)
+
+
+@triton.jit
+def _sparse_fwd_kernel_flash_decode_stage2(
+    Q,
+    K,
+    V,
+    sm_scale,
+    Req_to_tokens,  # shape: [B, S]
+    Topk_token_indices,  # shape: [H, B, k]
+    Mid_O,  # [batch, head, seq_block_num, head_dim]
+    Mid_O_LogExpSum,  # [batch, head, seq_block_num]
+    Heavy_token_num,  # NOTE: This can be used as constexpr but we may support dynamic heavy token number in the future
+    stride_req_to_tokens_b,
+    stride_topk_token_indices_h,
+    stride_topk_token_indices_b,
+    stride_qbs,
+    stride_qh,
+    stride_kbs,
+    stride_kh,
+    stride_vbs,
+    stride_vh,
+    stride_mid_ob,
+    stride_mid_oh,
+    stride_mid_os,
+    stride_mid_o_eb,
+    stride_mid_o_eh,
+    gqa_group_size,
+    BLOCK_SEQ: tl.constexpr,
+    BLOCK_DMODEL: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    cur_batch = tl.program_id(0)
+    cur_head = tl.program_id(1)
+    seq_start_block = tl.program_id(2)
+    cur_kv_head = cur_head // gqa_group_size
+
+    offs_d = tl.arange(0, BLOCK_DMODEL)
+    cur_batch_start_index = seq_start_block * BLOCK_SEQ
+    cur_batch_end_index = tl.minimum(Heavy_token_num, cur_batch_start_index + BLOCK_SEQ)
+
+    off_q = cur_batch * stride_qbs + cur_head * stride_qh + offs_d
+
+    block_n_size = (
+        tl.where(
+            cur_batch_end_index - cur_batch_start_index <= 0,
+            0,
+            cur_batch_end_index - cur_batch_start_index + BLOCK_N - 1,
+        )
+        // BLOCK_N
+    )
+
+    # offs_n = cur_batch_start_index + tl.arange(0, BLOCK_N)
+    offs_n = tl.arange(0, BLOCK_N)
+
+    q = tl.load(Q + off_q)
+
+    sum_exp = 0.0
+    max_logic = -float("inf")
+    acc = tl.zeros([BLOCK_DMODEL], dtype=tl.float32)
+
+    for start_n in range(cur_batch_start_index, cur_batch_end_index, BLOCK_N):
+        # for start_n in range(0, block_n_size, 1):
+        # offs_n_new = start_n * BLOCK_N + offs_n
+        offs_n_new = start_n + offs_n
+        # offs_n_new = cur_batch_start_index + start_n * BLOCK_N + offs_n
+        topk_token_indices = tl.load(
+            Topk_token_indices
+            + stride_topk_token_indices_h * cur_head
+            + stride_topk_token_indices_b * cur_batch
+            + offs_n_new,
+            mask=offs_n_new < cur_batch_end_index,
+            other=0,
+        )
+        k_loc = tl.load(
+            Req_to_tokens + stride_req_to_tokens_b * cur_batch + topk_token_indices,
+            mask=offs_n_new < cur_batch_end_index,
+            other=0,
+        )
+        off_k = k_loc[:, None] * stride_kbs + cur_kv_head * stride_kh + offs_d[None, :]
+        k = tl.load(
+            K + off_k, mask=offs_n_new[:, None] < cur_batch_end_index, other=0.0
+        )
+        att_value = tl.sum(q[None, :] * k, 1)
+        att_value *= sm_scale
+        att_value = tl.where(offs_n_new < cur_batch_end_index, att_value, float("-inf"))
+        v = tl.load(
+            V + off_k, mask=offs_n_new[:, None] < cur_batch_end_index, other=0.0
+        )
+
+        cur_max_logic = tl.max(att_value, axis=0)
+        new_max_logic = tl.maximum(cur_max_logic, max_logic)
+
+        exp_logic = tl.exp(att_value - new_max_logic)
+        logic_scale = tl.exp(max_logic - new_max_logic)
+        acc *= logic_scale
+        acc += tl.sum(exp_logic[:, None] * v, axis=0)
+
+        sum_exp = sum_exp * logic_scale + tl.sum(exp_logic, axis=0)
+        max_logic = new_max_logic
+
+    # need_store = tl.where(block_n_size == 0, 0, 1)
+    need_store = 1
+    for _ in range(0, need_store, 1):
+        off_mid_o = (
+            cur_batch * stride_mid_ob
+            + cur_head * stride_mid_oh
+            + seq_start_block * stride_mid_os
+            + offs_d
+        )
+        off_mid_o_logexpsum = (
+            cur_batch * stride_mid_o_eb + cur_head * stride_mid_o_eh + seq_start_block
+        )
+        tl.store(Mid_O + off_mid_o, acc / sum_exp)
+        tl.store(Mid_O_LogExpSum + off_mid_o_logexpsum, max_logic + tl.log(sum_exp))
+    return
+
+
+@triton.jit
+def _sparse_fwd_kernel_flash_decode_stage3(
+    Mid_O,  # [batch, head, seq_block_num, head_dim]
+    Mid_O_LogExpSum,  # [batch, head, seq_block_num]
+    O,  # [batch, head, head_dim]
+    seq_len,  # NOTE: This can be used as constexpr but we may support dynamic heavy token number in the future
+    stride_mid_ob,
+    stride_mid_oh,
+    stride_mid_os,
+    stride_mid_o_eb,
+    stride_mid_o_eh,
+    stride_obs,
+    stride_oh,
+    BLOCK_SEQ: tl.constexpr,
+    BLOCK_DMODEL: tl.constexpr,
+):
+    cur_batch = tl.program_id(0)
+    cur_head = tl.program_id(1)
+
+    offs_d = tl.arange(0, BLOCK_DMODEL)
+
+    block_n_size = tl.where(seq_len <= 0, 0, seq_len + BLOCK_SEQ - 1) // BLOCK_SEQ
+
+    sum_exp = 0.0
+    max_logic = -float("inf")
+    acc = tl.zeros([BLOCK_DMODEL], dtype=tl.float32)
+
+    offs_v = cur_batch * stride_mid_ob + cur_head * stride_mid_oh + offs_d
+    offs_logic = cur_batch * stride_mid_o_eb + cur_head * stride_mid_o_eh
+    for block_seq_n in range(0, block_n_size, 1):
+        tv = tl.load(Mid_O + offs_v + block_seq_n * stride_mid_os)
+        tlogic = tl.load(Mid_O_LogExpSum + offs_logic + block_seq_n)
+        new_max_logic = tl.maximum(tlogic, max_logic)
+
+        old_scale = tl.exp(max_logic - new_max_logic)
+        acc *= old_scale
+        exp_logic = tl.exp(tlogic - new_max_logic)
+        acc += exp_logic * tv
+        sum_exp = sum_exp * old_scale + exp_logic
+        max_logic = new_max_logic
+
+    tl.store(O + cur_batch * stride_obs + cur_head * stride_oh + offs_d, acc / sum_exp)
+    return
+
+
+def sparse_flash_decode_stage1(
+    q_label,
+    k_label_buffer,
+    att_out,
+    Req_to_tokens,
+    B_Seqlen,
+    max_len_in_batch,
+    sm_scale,
+    logit_cap,
+):
+    BLOCK = 32
+    # shape constraints
+    Lq, Lk = q_label.shape[-1], k_label_buffer.shape[-1]
+    assert Lq == Lk
+    assert Lk in {16, 32, 64, 128, 256, 576}
+
+    BLOCK_DMODEL = Lk
+
+    batch, head_num = q_label.shape[0], q_label.shape[1]
+
+    grid = (batch, head_num, triton.cdiv(max_len_in_batch, BLOCK))
+    kv_group_num = q_label.shape[1] // k_label_buffer.shape[1]
+
+    if kv_group_num == 1:
+        num_warps = 4
+    else:
+        num_warps = 2
+
+    _sparse_fwd_kernel_flash_decode_stage1[grid](
+        q_label,
+        k_label_buffer,
+        sm_scale,
+        Req_to_tokens,
+        B_Seqlen,
+        att_out,
+        Req_to_tokens.stride(0),
+        q_label.stride(0),
+        q_label.stride(1),
+        k_label_buffer.stride(0),
+        k_label_buffer.stride(1),
+        att_out.stride(0),
+        att_out.stride(1),
+        kv_group_num,
+        BLOCK_DMODEL,
+        BLOCK,
+        logit_cap,
+        num_warps=num_warps,
+        num_stages=1,
+    )
+
+
+@torch.no_grad()
+def sparse_flash_decode_stage2(
+    q,
+    k,
+    v,
+    Req_to_tokens,
+    Topk_token_indices,
+    heavy_token_num,
+    mid_out,
+    mid_out_logsumexp,
+    block_seq,
+    sm_scale,
+):
+    BLOCK_SEQ = block_seq
+    BLOCK_N = 16
+    assert BLOCK_SEQ % BLOCK_N == 0
+    # shape constraints
+    Lq, Lk = q.shape[-1], k.shape[-1]
+    assert Lq == Lk
+    assert Lk in {16, 32, 64, 128}
+    assert heavy_token_num == Topk_token_indices.shape[-1]
+    # sm_scale = 1.0 / (Lk ** 0.5)
+    batch, head_num = q.shape[0], q.shape[1]
+    grid = (batch, head_num, triton.cdiv(heavy_token_num, BLOCK_SEQ))
+
+    gqa_group_size = q.shape[1] // k.shape[1]
+
+    _sparse_fwd_kernel_flash_decode_stage2[grid](
+        q,
+        k,
+        v,
+        sm_scale,
+        Req_to_tokens,
+        Topk_token_indices,
+        mid_out,
+        mid_out_logsumexp,
+        heavy_token_num,
+        Req_to_tokens.stride(0),
+        Topk_token_indices.stride(0),
+        Topk_token_indices.stride(1),
+        q.stride(0),
+        q.stride(1),
+        k.stride(0),
+        k.stride(1),
+        v.stride(0),
+        v.stride(1),
+        mid_out.stride(0),
+        mid_out.stride(1),
+        mid_out.stride(2),
+        mid_out_logsumexp.stride(0),
+        mid_out_logsumexp.stride(1),
+        gqa_group_size,
+        BLOCK_SEQ=BLOCK_SEQ,
+        BLOCK_DMODEL=Lk,
+        BLOCK_N=BLOCK_N,
+        num_warps=1,
+        num_stages=2,
+    )
+    return
+
+
+@torch.no_grad()
+def sparse_flash_decode_stage3(Seqlen, mid_out, mid_out_logexpsum, O, block_seq):
+    Lk = mid_out.shape[-1]
+    assert Lk in {16, 32, 64, 128}
+    batch, head_num = mid_out.shape[0], mid_out.shape[1]
+    grid = (batch, head_num)
+
+    _sparse_fwd_kernel_flash_decode_stage3[grid](
+        mid_out,
+        mid_out_logexpsum,
+        O,
+        Seqlen,
+        mid_out.stride(0),
+        mid_out.stride(1),
+        mid_out.stride(2),
+        mid_out_logexpsum.stride(0),
+        mid_out_logexpsum.stride(1),
+        O.stride(0),
+        O.stride(1),
+        BLOCK_SEQ=block_seq,
+        BLOCK_DMODEL=Lk,
+        num_warps=4,
+        num_stages=2,
+    )
+    return
+
+
+def flash_decode_sparse_attention_fwd(
+    q,
+    k_buffer,
+    v_buffer,
+    o,
+    q_label,
+    k_label_buffer,
+    req_to_token,
+    b_seq_len,
+    max_len_in_batch,
+    sm_scale,
+    logit_cap,
+    heavy_token_num=32,
+    att_out_approx=None,
+    mid_out=None,
+    mid_o_logexpsum=None,
+    BLOCK_SEQ=256,
+):
+    # TODO(Andy): Tune BLOCK_SEQ & BLOCK_D
+    kv_group_num = q.shape[1] // v_buffer.shape[1]
+    # batch_size = q.shape[0]
+
+    # Step 1: BGEMV approximate attention (page implementation)
+
+    if att_out_approx is None:
+        att_out_approx = torch.empty(
+            [q.shape[1], q.shape[0], max_len_in_batch],
+            dtype=REDUCE_TORCH_TYPE,
+            device=q.device,
+        )
+
+    if mid_out is None:
+        block_seq_num = (heavy_token_num + BLOCK_SEQ - 1) // BLOCK_SEQ
+
+        mid_out = torch.empty(
+            [q.shape[0], q.shape[1], block_seq_num, q.shape[-1]],
+            dtype=torch.float32,
+            device=q.device,
+        )
+        mid_o_logexpsum = torch.empty(
+            [q.shape[0], q.shape[1], block_seq_num],
+            dtype=torch.float32,
+            device=q.device,
+        )
+
+    sparse_flash_decode_stage1(
+        q_label,
+        k_label_buffer,
+        att_out_approx,
+        req_to_token,
+        b_seq_len,
+        max_len_in_batch,
+        sm_scale,
+        logit_cap,
+    )
+
+    # Step 2: TopK token selection
+    # NOTE(Andy): Apply sparse decoding when min > heavy_token_num and max > sparse decoding threshold
+    # TODO(Andy): Change a faster topk implementation
+    topk_token_indices = torch.topk(att_out_approx, heavy_token_num, dim=-1).indices
+    # topk_token_indices: [H, B, k], Req_to_tokens: [B, S]
+    # topk_token_indices = torch.arange(0, heavy_token_num, device=q.device).unsqueeze(0).unsqueeze(0).expand(q.shape[1], q.shape[0], -1)
+
+    sparse_flash_decode_stage2(
+        q,
+        k_buffer,
+        v_buffer,
+        req_to_token,
+        topk_token_indices,
+        heavy_token_num,
+        mid_out,
+        mid_o_logexpsum,
+        BLOCK_SEQ,
+        sm_scale,
+    )
+
+    sparse_flash_decode_stage3(heavy_token_num, mid_out, mid_o_logexpsum, o, BLOCK_SEQ)
+
+
+# Extend attention kernel for Double Sparsity
+# Moved from https://github.com/sgl-project/sglang/blob/v0.4.2.post1/python/sglang/srt/layers/attention/triton_ops/extend_attention.py
+@triton.jit
+def _fwd_kernel(
+    Q_Extend,
+    K_Extend,
+    V_Extend,
+    O_Extend,
+    K_Buffer,
+    V_Buffer,
+    Req_to_tokens,
+    B_req_idx,
+    B_Seq_Len,
+    B_Start_Loc_Extend,
+    B_Seq_Len_Extend,
+    sm_scale,
+    kv_group_num,
+    stride_qbs,
+    stride_qh,
+    stride_kbs,
+    stride_kh,
+    stride_vbs,
+    stride_vh,
+    stride_obs,
+    stride_oh,
+    stride_buf_kbs,
+    stride_buf_kh,
+    stride_buf_vbs,
+    stride_buf_vh,
+    stride_req_to_tokens_b,
+    logit_cap: tl.constexpr,
+    Lq: tl.constexpr,
+    Lv: tl.constexpr,
+    BLOCK_DMODEL: tl.constexpr,
+    BLOCK_DPE: tl.constexpr,
+    BLOCK_DV: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    cur_seq = tl.program_id(0)
+    cur_head = tl.program_id(1)
+    cur_block_m = tl.program_id(2)
+    cur_kv_head = cur_head // kv_group_num
+
+    cur_seq_len = tl.load(B_Seq_Len + cur_seq)
+    cur_seq_len_extend = tl.load(B_Seq_Len_Extend + cur_seq)
+    cur_seq_len_prefix = cur_seq_len - cur_seq_len_extend
+
+    cur_seq_prefix_start_in_loc = 0
+    cur_seq_extend_start_contiguous = tl.load(B_Start_Loc_Extend + cur_seq)
+    cur_batch_req_idx = tl.load(B_req_idx + cur_seq)
+
+    offs_d = tl.arange(0, BLOCK_DMODEL)
+    offs_dv = tl.arange(0, BLOCK_DV)
+    offs_m = tl.arange(0, BLOCK_M)
+    mask_m = (cur_block_m * BLOCK_M + offs_m) < cur_seq_len_extend
+
+    mask_d = offs_d < Lq
+    mask_dv = offs_dv < Lv
+
+    offs_q = (
+        (cur_seq_extend_start_contiguous + cur_block_m * BLOCK_M + offs_m[:, None])
+        * stride_qbs
+        + cur_head * stride_qh
+        + offs_d[None, :]
+    )
+    q = tl.load(
+        Q_Extend + offs_q, mask=(mask_m[:, None]) & (mask_d[None, :]), other=0.0
+    )
+
+    if BLOCK_DPE > 0:
+        offs_dpe = BLOCK_DMODEL + tl.arange(0, BLOCK_DPE)
+        offs_qpe = (
+            (cur_seq_extend_start_contiguous + cur_block_m * BLOCK_M + offs_m[:, None])
+            * stride_qbs
+            + cur_head * stride_qh
+            + offs_dpe[None, :]
+        )
+        qpe = tl.load(Q_Extend + offs_qpe, mask=mask_m[:, None], other=0.0)
+
+    # stage 1: compute scores with prefix
+    offs_n = tl.arange(0, BLOCK_N)
+
+    acc = tl.zeros([BLOCK_M, BLOCK_DV], dtype=tl.float32)
+    deno = tl.zeros([BLOCK_M], dtype=tl.float32)
+    e_max = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
+
+    for start_n in range(0, cur_seq_len_prefix, BLOCK_N):
+        start_n = tl.multiple_of(start_n, BLOCK_N)
+        mask_n = (start_n + offs_n) < cur_seq_len_prefix
+        offs_b_loc_prefix = cur_batch_req_idx * stride_req_to_tokens_b + (
+            cur_seq_prefix_start_in_loc + start_n + offs_n
+        )
+        offs_kv_loc = tl.load(Req_to_tokens + offs_b_loc_prefix, mask=mask_n, other=0)
+
+        # load k in transposed way
+        offs_buf_k = (
+            offs_kv_loc[None, :] * stride_buf_kbs
+            + cur_kv_head * stride_buf_kh
+            + offs_d[:, None]
+        )
+        k = tl.load(
+            K_Buffer + offs_buf_k, mask=(mask_n[None, :]) & (mask_d[:, None]), other=0.0
+        )
+
+        qk = tl.dot(q.to(k.dtype), k)
+        if BLOCK_DPE > 0:
+            offs_kpe = (
+                offs_kv_loc[None, :] * stride_buf_kbs
+                + cur_kv_head * stride_buf_kh
+                + offs_dpe[:, None]
+            )
+            kpe = tl.load(
+                K_Buffer + offs_kpe,
+                mask=mask_n[None, :],
+                other=0.0,
+            )
+            qk += tl.dot(qpe.to(kpe.dtype), kpe)
+        qk *= sm_scale
+
+        if logit_cap > 0:
+            qk = logit_cap * tanh(qk / logit_cap)
+
+        qk = tl.where(mask_m[:, None] & mask_n[None, :], qk, float("-inf"))
+
+        n_e_max = tl.maximum(tl.max(qk, 1), e_max)
+        re_scale = tl.exp(e_max - n_e_max)
+        p = tl.exp(qk - n_e_max[:, None])
+        deno = deno * re_scale + tl.sum(p, 1)
+
+        offs_buf_v = (
+            offs_kv_loc[:, None] * stride_buf_vbs
+            + cur_kv_head * stride_buf_vh
+            + offs_dv[None, :]
+        )
+        v = tl.load(
+            V_Buffer + offs_buf_v, mask=mask_n[:, None] & mask_dv[None, :], other=0.0
+        )
+        p = p.to(v.dtype)
+        acc = acc * re_scale[:, None] + tl.dot(p, v)
+
+        e_max = n_e_max
+
+    # stage 2: compute the triangle part
+
+    cur_block_m_end = tl.minimum(cur_seq_len_extend, (cur_block_m + 1) * BLOCK_M)
+    for start_n in range(0, cur_block_m_end, BLOCK_N):
+        start_n = tl.multiple_of(start_n, BLOCK_N)
+        mask_n = (start_n + offs_n) < cur_block_m_end
+
+        # load k in transposed way
+        offs_k = (
+            (cur_seq_extend_start_contiguous + start_n + offs_n[None, :]) * stride_kbs
+            + cur_kv_head * stride_kh
+            + offs_d[:, None]
+        )
+        k = tl.load(
+            K_Extend + offs_k, mask=(mask_n[None, :]) & (mask_d[:, None]), other=0.0
+        )
+
+        qk = tl.dot(q, k, out_dtype=tl.float32)
+        if BLOCK_DPE > 0:
+            offs_kpe = (
+                (cur_seq_extend_start_contiguous + start_n + offs_n[None, :])
+                * stride_kbs
+                + cur_kv_head * stride_kh
+                + offs_dpe[:, None]
+            )
+            kpe = tl.load(
+                K_Extend + offs_kpe,
+                mask=mask_n[None, :],
+                other=0.0,
+            )
+            qk += tl.dot(qpe, kpe)
+
+        qk *= sm_scale
+
+        if logit_cap > 0:
+            qk = logit_cap * tanh(qk / logit_cap)
+
+        mask_causual = (cur_block_m * BLOCK_M + offs_m[:, None]) >= (
+            start_n + offs_n[None, :]
+        )
+        mask_causual &= mask_m[:, None] & mask_n[None, :]
+        qk = tl.where(mask_causual, qk, float("-inf"))
+
+        n_e_max = tl.maximum(tl.max(qk, 1), e_max)
+        re_scale = tl.exp(e_max - n_e_max)
+        p = tl.exp(qk - n_e_max[:, None])
+        deno = deno * re_scale + tl.sum(p, 1)
+
+        offs_v = (
+            (cur_seq_extend_start_contiguous + start_n + offs_n[:, None]) * stride_vbs
+            + cur_kv_head * stride_vh
+            + offs_dv[None, :]
+        )
+        v = tl.load(
+            V_Extend + offs_v, mask=mask_n[:, None] & mask_dv[None, :], other=0.0
+        )
+        p = p.to(v.dtype)
+        acc = acc * re_scale[:, None] + tl.dot(p, v)
+
+        e_max = n_e_max
+
+    offs_o = (
+        (cur_seq_extend_start_contiguous + cur_block_m * BLOCK_M + offs_m[:, None])
+        * stride_obs
+        + cur_head * stride_oh
+        + offs_dv[None, :]
+    )
+    tl.store(
+        O_Extend + offs_o, acc / deno[:, None], mask=mask_m[:, None] & mask_dv[None, :]
+    )
+
+
+def extend_attention_fwd(
+    q_extend,
+    k_extend,
+    v_extend,
+    o_extend,
+    k_buffer,
+    v_buffer,
+    req_to_tokens,
+    b_req_idx,
+    b_seq_len,
+    b_seq_len_extend,
+    b_start_loc_extend,
+    max_len_extend,
+    sm_scale=None,
+    logit_cap=0.0,
+):
+    """
+    q_extend, k_extend, v_extend, o_extend: contiguous tensors
+
+    k_buffer, v_buffer: (prefix + extend) tensors in mem_manager
+    """
+    Lq, Lk, Lv = (
+        q_extend.shape[-1],
+        k_extend.shape[-1],
+        v_extend.shape[-1],
+    )
+
+    if Lq == 576:
+        BLOCK_DMODEL = 512
+        BLOCK_DPE = 64
+    elif Lq == 288:
+        BLOCK_DMODEL = 256
+        BLOCK_DPE = 32
+    elif Lq == 192:
+        BLOCK_DMODEL = 128
+        BLOCK_DPE = 64
+    else:
+        BLOCK_DMODEL = triton.next_power_of_2(Lq)
+        BLOCK_DPE = 0
+    BLOCK_DV = triton.next_power_of_2(Lv)
+
+    if _is_hip:
+        BLOCK_M, BLOCK_N = (64, 64)
+        num_warps = 4
+
+    else:
+        if _is_cuda and CUDA_CAPABILITY[0] >= 9:
+            if Lq <= 256:
+                BLOCK_M, BLOCK_N = (128, 64)
+            else:
+                BLOCK_M, BLOCK_N = (32, 64)
+        elif _is_cuda and CUDA_CAPABILITY[0] >= 8:
+            if Lq <= 128:
+                BLOCK_M, BLOCK_N = (128, 128)
+            elif Lq <= 256:
+                BLOCK_M, BLOCK_N = (64, 64)
+            else:
+                BLOCK_M, BLOCK_N = (32, 64)
+        else:
+            BLOCK_M, BLOCK_N = (64, 64) if Lq <= 128 else (32, 32)
+
+        num_warps = 4 if Lk <= 64 else 8
+
+    sm_scale = sm_scale or 1.0 / (Lq**0.5)
+    batch_size, head_num = b_seq_len.shape[0], q_extend.shape[1]
+    kv_group_num = q_extend.shape[1] // k_extend.shape[1]
+
+    grid = (batch_size, head_num, triton.cdiv(max_len_extend, BLOCK_M))
+    num_stages = 1
+
+    extra_kargs = {}
+    if _is_hip:
+        extra_kargs = {"waves_per_eu": 4, "matrix_instr_nonkdim": 16, "kpack": 2}
+
+    _fwd_kernel[grid](
+        q_extend,
+        k_extend,
+        v_extend,
+        o_extend,
+        k_buffer,
+        v_buffer,
+        req_to_tokens,
+        b_req_idx,
+        b_seq_len,
+        b_start_loc_extend,
+        b_seq_len_extend,
+        sm_scale,
+        kv_group_num,
+        q_extend.stride(0),
+        q_extend.stride(1),
+        k_extend.stride(0),
+        k_extend.stride(1),
+        v_extend.stride(0),
+        v_extend.stride(1),
+        o_extend.stride(0),
+        o_extend.stride(1),
+        k_buffer.stride(0),
+        k_buffer.stride(1),
+        v_buffer.stride(0),
+        v_buffer.stride(1),
+        req_to_tokens.stride(0),
+        logit_cap=logit_cap,
+        BLOCK_DMODEL=BLOCK_DMODEL,
+        BLOCK_DPE=BLOCK_DPE,
+        BLOCK_DV=BLOCK_DV,
+        BLOCK_M=BLOCK_M,
+        BLOCK_N=BLOCK_N,
+        Lq=Lq,
+        Lv=Lv,
+        num_warps=num_warps,
+        num_stages=num_stages,
+        **extra_kargs,
+    )
diff --git a/sglang/python/sglang/srt/layers/attention/triton_ops/extend_attention.py b/sglang/python/sglang/srt/layers/attention/triton_ops/extend_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ce0e35ff79229d64ef81399da77c056ccf58a80
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/triton_ops/extend_attention.py
@@ -0,0 +1,1063 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+Memory-efficient attention for prefill.
+It supports page size = 1 and prefill with KV cache (i.e. extend).
+"""
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.layers.attention.triton_ops.prefill_attention import (
+    context_attention_fwd,
+)
+from sglang.srt.utils import is_cuda, is_hip
+
+_is_cuda = is_cuda()
+if _is_cuda:
+    CUDA_CAPABILITY = torch.cuda.get_device_capability()
+
+_is_hip = is_hip()
+
+
+def _get_block_sizes_for_extend_attention(Lq: int, Lv: int):
+    """
+    Get block sizes and configuration for extend attention kernels.
+
+    Args:
+        Lq: Query head dimension
+        Lv: Value head dimension
+
+    Returns:
+        tuple: (BLOCK_DMODEL, BLOCK_DPE, BLOCK_DV, BLOCK_M, BLOCK_N, num_warps)
+    """
+    # Determine BLOCK_DMODEL and BLOCK_DPE based on head dimension
+    if Lq == 576:
+        BLOCK_DMODEL = 512
+        BLOCK_DPE = 64
+    elif Lq == 288:
+        BLOCK_DMODEL = 256
+        BLOCK_DPE = 32
+    elif Lq == 192:
+        BLOCK_DMODEL = 128
+        BLOCK_DPE = 64
+    else:
+        BLOCK_DMODEL = triton.next_power_of_2(Lq)
+        BLOCK_DPE = 0
+
+    BLOCK_DV = triton.next_power_of_2(Lv)
+
+    # Determine BLOCK_M, BLOCK_N, and num_warps based on hardware
+    if _is_hip:
+        BLOCK_M, BLOCK_N = (64, 64)
+        num_warps = 4
+    else:
+        if _is_cuda and CUDA_CAPABILITY[0] == 12:
+            # sm120 workstation Blackwell architecture (RTX Pro 6000) has a much smaller shared memory size (100K)
+            if Lq <= 128:
+                BLOCK_M, BLOCK_N = (64, 128)
+            elif Lq <= 256:
+                BLOCK_M, BLOCK_N = (64, 64)
+            else:
+                BLOCK_M, BLOCK_N = (32, 32)
+        elif _is_cuda and CUDA_CAPABILITY[0] >= 9:
+            # Hopper architecture (H100, etc.)
+            if Lq <= 256:
+                BLOCK_M, BLOCK_N = (128, 64)
+            else:
+                BLOCK_M, BLOCK_N = (32, 64)
+        elif _is_cuda and CUDA_CAPABILITY[0] >= 8:
+            # Ampere architecture (A100, etc.)
+            # sm86/sm89 has a much smaller shared memory size (100K) than sm80 (160K)
+            if CUDA_CAPABILITY[1] == 9 or CUDA_CAPABILITY[1] == 6:
+                if Lq <= 128:
+                    BLOCK_M, BLOCK_N = (64, 128)
+                elif Lq <= 256:
+                    BLOCK_M, BLOCK_N = (64, 64)
+                else:
+                    BLOCK_M, BLOCK_N = (32, 32)
+            else:
+                if Lq <= 128:
+                    BLOCK_M, BLOCK_N = (128, 128)
+                elif Lq <= 256:
+                    BLOCK_M, BLOCK_N = (64, 64)
+                else:
+                    BLOCK_M, BLOCK_N = (32, 64)
+        else:
+            # Older architectures
+            BLOCK_M, BLOCK_N = (64, 64) if Lq <= 128 else (32, 32)
+
+        num_warps = 4 if Lq <= 64 else 8
+
+    return BLOCK_DMODEL, BLOCK_DPE, BLOCK_DV, BLOCK_M, BLOCK_N, num_warps
+
+
+@triton.jit
+def tanh(x):
+    # Tanh is just a scaled sigmoid
+    return 2 * tl.sigmoid(2 * x) - 1
+
+
+@triton.jit
+def _copy_unified_indices_kernel(
+    # Input buffers
+    prefix_kv_indptr,
+    prefix_kv_indices,
+    extend_start_loc,
+    extend_seq_lens,
+    extend_kv_indices,
+    unified_kv_indptr,
+    # Output buffer
+    unified_kv_indices,
+    # Size
+    bs,
+):
+    """
+    Triton kernel to copy indices to unified buffer (parallel per sequence).
+    Each thread block processes one sequence with vectorized loads/stores.
+    """
+    pid = tl.program_id(0)
+
+    if pid >= bs:
+        return
+
+    # Load sequence info
+    prefix_start = tl.load(prefix_kv_indptr + pid)
+    prefix_end = tl.load(prefix_kv_indptr + pid + 1)
+    extend_start = tl.load(extend_start_loc + pid)
+    extend_len = tl.load(extend_seq_lens + pid)
+
+    prefix_len = prefix_end - prefix_start
+    unified_start = tl.load(unified_kv_indptr + pid)
+
+    # Copy indices in vectorized chunks
+    BLOCK_SIZE: tl.constexpr = 128
+
+    # Process prefix indices
+    for block_start in range(0, prefix_len, BLOCK_SIZE):
+        offs = block_start + tl.arange(0, BLOCK_SIZE)
+        mask = offs < prefix_len
+
+        src_idx = prefix_start + offs
+        dst_idx = unified_start + offs
+
+        vals = tl.load(prefix_kv_indices + src_idx, mask=mask, other=0)
+        tl.store(unified_kv_indices + dst_idx, vals, mask=mask)
+
+    # Process extend indices
+    for block_start in range(0, extend_len, BLOCK_SIZE):
+        offs = block_start + tl.arange(0, BLOCK_SIZE)
+        mask = offs < extend_len
+
+        src_idx = extend_start + offs
+        dst_idx = unified_start + prefix_len + offs
+
+        vals = tl.load(extend_kv_indices + src_idx, mask=mask, other=0)
+        tl.store(unified_kv_indices + dst_idx, vals, mask=mask)
+
+
+def build_unified_kv_indices(
+    prefix_kv_indptr: torch.Tensor,
+    prefix_kv_indices: torch.Tensor,
+    extend_start_loc: torch.Tensor,
+    extend_seq_lens: torch.Tensor,
+    extend_kv_indices: torch.Tensor,
+    bs: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Build unified KV indices efficiently:
+    - Use PyTorch's optimized cumsum (NVIDIA CUB) for indptr
+    - Use Triton kernel for parallel index copying
+
+    Returns:
+        (unified_kv_indptr, unified_kv_indices, prefix_lens)
+    """
+    device = prefix_kv_indptr.device
+
+    prefix_lens = prefix_kv_indptr[1 : bs + 1] - prefix_kv_indptr[:bs]
+
+    # Create unified_kv_indptr avoiding direct assignment (for CUDA graph compatibility)
+    unified_lens = prefix_lens + extend_seq_lens[:bs]
+    unified_kv_indptr = torch.cat(
+        [
+            torch.zeros(1, dtype=torch.int32, device=device),
+            torch.cumsum(unified_lens, dim=0),
+        ]
+    )
+
+    max_unified_len = len(prefix_kv_indices) + len(extend_kv_indices)
+
+    unified_kv_indices = torch.empty(max_unified_len, dtype=torch.int64, device=device)
+
+    # Launch Triton kernel for parallel index copying
+    _copy_unified_indices_kernel[(bs,)](
+        prefix_kv_indptr,
+        prefix_kv_indices,
+        extend_start_loc,
+        extend_seq_lens,
+        extend_kv_indices,
+        unified_kv_indptr,
+        unified_kv_indices,
+        bs,
+    )
+
+    return unified_kv_indptr, unified_kv_indices, prefix_lens
+
+
+@triton.jit
+def _fwd_kernel(
+    Q_Extend,
+    K_Extend,
+    V_Extend,
+    O_Extend,
+    K_Buffer,
+    V_Buffer,
+    qo_indptr,
+    kv_indptr,
+    kv_indices,
+    mask_ptr,
+    mask_indptr,
+    sink_ptr,
+    window_kv_offset_ptr,
+    sm_scale,
+    k_scale,
+    v_scale,
+    kv_group_num,
+    stride_qbs,
+    stride_qh,
+    stride_kbs,
+    stride_kh,
+    stride_vbs,
+    stride_vh,
+    stride_obs,
+    stride_oh,
+    stride_buf_kbs,
+    stride_buf_kh,
+    stride_buf_vbs,
+    stride_buf_vh,
+    SLIDING_WINDOW_SIZE: tl.constexpr,
+    logit_cap: tl.constexpr,
+    xai_temperature_len: tl.constexpr,
+    Lq: tl.constexpr,
+    Lv: tl.constexpr,
+    BLOCK_DMODEL: tl.constexpr,
+    BLOCK_DPE: tl.constexpr,
+    BLOCK_DV: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    USE_CUSTOM_MASK: tl.constexpr,
+    IS_CAUSAL: tl.constexpr,
+    SKIP_PREFIX_CUSTOM_MASK: tl.constexpr,
+    STORE_TRANSPOSE: tl.constexpr,
+    HAS_SINK: tl.constexpr,
+):
+    cur_seq = tl.program_id(0)
+    cur_head = tl.program_id(1)
+    cur_block_m = tl.program_id(2)
+    cur_kv_head = cur_head // kv_group_num
+
+    cur_seq_extend_start_idx = tl.load(qo_indptr + cur_seq)
+    cur_seq_len_extend = tl.load(qo_indptr + cur_seq + 1) - cur_seq_extend_start_idx
+    cur_seq_kv_start_idx = tl.load(kv_indptr + cur_seq)
+    cur_seq_len_prefix = tl.load(kv_indptr + cur_seq + 1) - cur_seq_kv_start_idx
+    cur_seq_len = cur_seq_len_prefix + cur_seq_len_extend
+
+    if USE_CUSTOM_MASK:
+        cur_seq_mask_start_idx = tl.load(mask_indptr + cur_seq)
+
+    # For SWA, we should only load the mask in the sliding window
+    window_kv_offset = 0
+    if USE_CUSTOM_MASK and SLIDING_WINDOW_SIZE > 0:
+        window_kv_offset = tl.load(window_kv_offset_ptr + cur_seq)
+
+    offs_d = tl.arange(0, BLOCK_DMODEL)
+    offs_dv = tl.arange(0, BLOCK_DV)
+    offs_m = tl.arange(0, BLOCK_M)
+    mask_m = (cur_block_m * BLOCK_M + offs_m) < cur_seq_len_extend
+
+    mask_d = offs_d < Lq
+    mask_dv = offs_dv < Lv
+
+    if xai_temperature_len > 0:
+        offs_qidx = cur_seq_len_prefix + cur_block_m * BLOCK_M + offs_m
+        xai_temperature_scale = 1.0 / tl.log2(float(xai_temperature_len))
+        xai_temperature_reg = tl.where(
+            offs_qidx > xai_temperature_len,
+            tl.log2(offs_qidx.to(tl.float32)) * xai_temperature_scale,
+            1.0,
+        )
+
+    offs_q = (
+        (cur_seq_extend_start_idx + cur_block_m * BLOCK_M + offs_m[:, None])
+        * stride_qbs
+        + cur_head * stride_qh
+        + offs_d[None, :]
+    )
+    q = tl.load(
+        Q_Extend + offs_q, mask=(mask_m[:, None]) & (mask_d[None, :]), other=0.0
+    )
+
+    if BLOCK_DPE > 0:
+        offs_dpe = BLOCK_DMODEL + tl.arange(0, BLOCK_DPE)
+        offs_qpe = (
+            (cur_seq_extend_start_idx + cur_block_m * BLOCK_M + offs_m[:, None])
+            * stride_qbs
+            + cur_head * stride_qh
+            + offs_dpe[None, :]
+        )
+        qpe = tl.load(Q_Extend + offs_qpe, mask=mask_m[:, None], other=0.0)
+
+    # stage 1: compute scores with prefix
+    offs_n = tl.arange(0, BLOCK_N)
+
+    acc = tl.zeros([BLOCK_M, BLOCK_DV], dtype=tl.float32)
+    deno = tl.zeros([BLOCK_M], dtype=tl.float32)
+    e_max = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
+
+    for start_n in range(0, cur_seq_len_prefix, BLOCK_N):
+        start_n = tl.multiple_of(start_n, BLOCK_N)
+        mask_n = (start_n + offs_n) < cur_seq_len_prefix
+
+        final_mask = mask_m[:, None] & mask_n[None, :]
+        if USE_CUSTOM_MASK and not SKIP_PREFIX_CUSTOM_MASK:
+            custom_mask = tl.load(
+                mask_ptr
+                + cur_seq_mask_start_idx
+                + (cur_block_m * BLOCK_M + offs_m[:, None])
+                * (cur_seq_len + window_kv_offset)
+                + window_kv_offset
+                + start_n
+                + offs_n[None, :],
+                mask=(mask_m[:, None] & mask_n[None, :]),
+                other=0,
+            )
+            final_mask &= custom_mask
+        if SLIDING_WINDOW_SIZE > 0:
+            # Add mask where q_id <= kv_id + sliding_window_size
+            # q_id = prefix_len + cur_m, kv_id = cur_n
+            window_mask = (
+                cur_seq_len_prefix + cur_block_m * BLOCK_M + offs_m[:, None]
+            ) <= (start_n + offs_n[None, :] + SLIDING_WINDOW_SIZE)
+            final_mask &= window_mask
+
+        SKIP_TILE = False
+        if (USE_CUSTOM_MASK and not SKIP_PREFIX_CUSTOM_MASK) or SLIDING_WINDOW_SIZE > 0:
+            SKIP_TILE = tl.max(tl.max(final_mask.to(tl.int32), axis=1), axis=0) == 0
+
+        if not SKIP_TILE:
+            offs_kv_loc = tl.load(
+                kv_indices + cur_seq_kv_start_idx + start_n + offs_n,
+                mask=mask_n,
+                other=0,
+            )
+
+            # load k in transposed way
+            offs_buf_k = (
+                offs_kv_loc[None, :] * stride_buf_kbs
+                + cur_kv_head * stride_buf_kh
+                + offs_d[:, None]
+            )
+            k = tl.load(
+                K_Buffer + offs_buf_k,
+                mask=(mask_n[None, :]) & (mask_d[:, None]),
+                other=0.0,
+            )
+
+            qk = tl.dot(q.to(k.dtype), k)
+            if BLOCK_DPE > 0:
+                offs_kpe = (
+                    offs_kv_loc[None, :] * stride_buf_kbs
+                    + cur_kv_head * stride_buf_kh
+                    + offs_dpe[:, None]
+                )
+                kpe = tl.load(
+                    K_Buffer + offs_kpe,
+                    mask=mask_n[None, :],
+                    other=0.0,
+                )
+                qk += tl.dot(qpe.to(kpe.dtype), kpe)
+            qk *= sm_scale * k_scale
+
+            if logit_cap > 0:
+                qk = logit_cap * tanh(qk / logit_cap)
+
+            if xai_temperature_len > 0:
+                qk *= xai_temperature_reg[:, None]
+
+            qk = tl.where(final_mask, qk, float("-inf"))
+
+            row_max = tl.max(qk, 1)
+            row_max_fixed = tl.where(row_max == float("-inf"), -1e20, row_max)
+            n_e_max = tl.maximum(row_max_fixed, e_max)
+
+            re_scale = tl.exp(e_max - n_e_max)
+            p = tl.exp(qk - n_e_max[:, None])
+            deno = deno * re_scale + tl.sum(p, 1)
+
+            offs_buf_v = (
+                offs_kv_loc[:, None] * stride_buf_vbs
+                + cur_kv_head * stride_buf_vh
+                + offs_dv[None, :]
+            )
+            v = tl.load(
+                V_Buffer + offs_buf_v,
+                mask=mask_n[:, None] & mask_dv[None, :],
+                other=0.0,
+            )
+            p = p.to(v.dtype)
+            acc = acc * re_scale[:, None] + tl.dot(p, v) * v_scale
+
+            e_max = n_e_max
+
+    # stage 2: compute the triangle part
+
+    cur_block_m_end = (
+        cur_seq_len_extend
+        if not IS_CAUSAL
+        else tl.minimum(cur_seq_len_extend, (cur_block_m + 1) * BLOCK_M)
+    )
+    for start_n in range(0, cur_block_m_end, BLOCK_N):
+        start_n = tl.multiple_of(start_n, BLOCK_N)
+        mask_n = (start_n + offs_n) < cur_block_m_end
+
+        final_mask = mask_m[:, None] & mask_n[None, :]
+        if USE_CUSTOM_MASK:
+            custom_mask = tl.load(
+                mask_ptr
+                + cur_seq_mask_start_idx
+                + (cur_block_m * BLOCK_M + offs_m[:, None])
+                * (cur_seq_len + window_kv_offset)
+                + window_kv_offset
+                + cur_seq_len_prefix
+                + start_n
+                + offs_n[None, :],
+                mask=(mask_m[:, None] & mask_n[None, :]),
+                other=0,
+            )
+            custom_mask &= mask_m[:, None] & mask_n[None, :]
+            final_mask &= custom_mask
+        elif IS_CAUSAL:
+            mask_causual = (cur_block_m * BLOCK_M + offs_m[:, None]) >= (
+                start_n + offs_n[None, :]
+            )
+            mask_causual &= mask_m[:, None] & mask_n[None, :]
+            final_mask &= mask_causual
+        else:
+            mask_non_causal = mask_m[:, None] & mask_n[None, :]
+            final_mask &= mask_non_causal
+
+        if SLIDING_WINDOW_SIZE > 0:
+            # Add mask where q_id <= kv_id + sliding_window_size
+            window_mask = (cur_block_m * BLOCK_M + offs_m[:, None]) <= (
+                start_n + offs_n[None, :] + SLIDING_WINDOW_SIZE
+            )
+            final_mask &= window_mask
+
+        SKIP_TILE = False
+        if USE_CUSTOM_MASK or SLIDING_WINDOW_SIZE > 0:
+            SKIP_TILE = tl.max(tl.max(final_mask.to(tl.int32), axis=1), axis=0) == 0
+
+        if not SKIP_TILE:
+            # load k in transposed way
+            offs_k = (
+                (cur_seq_extend_start_idx + start_n + offs_n[None, :]) * stride_kbs
+                + cur_kv_head * stride_kh
+                + offs_d[:, None]
+            )
+            k = tl.load(
+                K_Extend + offs_k, mask=(mask_n[None, :]) & (mask_d[:, None]), other=0.0
+            )
+
+            qk = tl.dot(q, k, out_dtype=tl.float32)
+            if BLOCK_DPE > 0:
+                offs_kpe = (
+                    (cur_seq_extend_start_idx + start_n + offs_n[None, :]) * stride_kbs
+                    + cur_kv_head * stride_kh
+                    + offs_dpe[:, None]
+                )
+                kpe = tl.load(
+                    K_Extend + offs_kpe,
+                    mask=mask_n[None, :],
+                    other=0.0,
+                )
+                qk += tl.dot(qpe, kpe)
+
+            qk *= sm_scale
+
+            if logit_cap > 0:
+                qk = logit_cap * tanh(qk / logit_cap)
+
+            if xai_temperature_len > 0:
+                qk *= xai_temperature_reg[:, None]
+
+            qk = tl.where(final_mask, qk, float("-inf"))
+
+            row_max = tl.max(qk, 1)
+            row_max_fixed = tl.where(row_max == float("-inf"), -1e20, row_max)
+            n_e_max = tl.maximum(row_max_fixed, e_max)
+
+            re_scale = tl.exp(e_max - n_e_max)
+            p = tl.exp(qk - n_e_max[:, None])
+            deno = deno * re_scale + tl.sum(p, 1)
+
+            offs_v = (
+                (cur_seq_extend_start_idx + start_n + offs_n[:, None]) * stride_vbs
+                + cur_kv_head * stride_vh
+                + offs_dv[None, :]
+            )
+            v = tl.load(
+                V_Extend + offs_v, mask=mask_n[:, None] & mask_dv[None, :], other=0.0
+            )
+            p = p.to(v.dtype)
+            acc = acc * re_scale[:, None] + tl.dot(p, v)
+
+            e_max = n_e_max
+
+    if HAS_SINK:
+        cur_sink = tl.load(sink_ptr + cur_head)
+        deno += tl.exp(cur_sink - e_max)
+
+    offs_o = (
+        (cur_seq_extend_start_idx + cur_block_m * BLOCK_M + offs_m[:, None])
+        * stride_obs
+        + cur_head * stride_oh
+        + offs_dv[None, :]
+    )
+    if STORE_TRANSPOSE:
+        tl.store(
+            O_Extend + offs_o.T,
+            (acc / deno[:, None]).T,
+            mask=(mask_m[:, None] & mask_dv[None, :]).T,
+        )
+    else:
+        tl.store(
+            O_Extend + offs_o,
+            acc / deno[:, None],
+            mask=mask_m[:, None] & mask_dv[None, :],
+        )
+
+
+def extend_attention_fwd(
+    q_extend,
+    k_extend,
+    v_extend,
+    o_extend,
+    k_buffer,
+    v_buffer,
+    qo_indptr,
+    kv_indptr,
+    kv_indices,
+    custom_mask,
+    is_causal,
+    mask_indptr,
+    max_len_extend,
+    k_scale,
+    v_scale,
+    sm_scale=None,
+    logit_cap=0.0,
+    skip_prefix_custom_mask=True,
+    sliding_window_size=-1,
+    sinks=None,
+    window_kv_offsets=None,
+    xai_temperature_len=-1,
+):
+    """
+    q_extend, k_extend, v_extend, o_extend: contiguous tensors
+
+    k_buffer, v_buffer: (prefix + extend) tensors in mem_manager
+    """
+    Lq, Lk, Lv = (
+        q_extend.shape[-1],
+        k_extend.shape[-1],
+        v_extend.shape[-1],
+    )
+
+    # Get block sizes and configuration
+    BLOCK_DMODEL, BLOCK_DPE, BLOCK_DV, BLOCK_M, BLOCK_N, num_warps = (
+        _get_block_sizes_for_extend_attention(Lq, Lv)
+    )
+
+    sm_scale = sm_scale or 1.0 / (Lq**0.5)
+    batch_size, head_num = qo_indptr.shape[0] - 1, q_extend.shape[1]
+    kv_group_num = q_extend.shape[1] // k_extend.shape[1]
+
+    USE_CUSTOM_MASK = custom_mask is not None
+    # Skip custom mask for prefix part
+    SKIP_PREFIX_CUSTOM_MASK = skip_prefix_custom_mask
+
+    HAS_SINK = sinks is not None
+
+    grid = (batch_size, head_num, triton.cdiv(max_len_extend, BLOCK_M))
+    num_stages = 1
+
+    extra_kargs = {}
+    if _is_hip:
+        extra_kargs = {"waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 2}
+
+    _fwd_kernel[grid](
+        q_extend,
+        k_extend,
+        v_extend,
+        o_extend,
+        k_buffer,
+        v_buffer,
+        qo_indptr,
+        kv_indptr,
+        kv_indices,
+        custom_mask,
+        mask_indptr,
+        sinks,
+        window_kv_offsets,
+        sm_scale,
+        k_scale,
+        v_scale,
+        kv_group_num,
+        q_extend.stride(0),
+        q_extend.stride(1),
+        k_extend.stride(0),
+        k_extend.stride(1),
+        v_extend.stride(0),
+        v_extend.stride(1),
+        o_extend.stride(0),
+        o_extend.stride(1),
+        k_buffer.stride(0),
+        k_buffer.stride(1),
+        v_buffer.stride(0),
+        v_buffer.stride(1),
+        SLIDING_WINDOW_SIZE=sliding_window_size,
+        logit_cap=logit_cap,
+        xai_temperature_len=xai_temperature_len,
+        BLOCK_DMODEL=BLOCK_DMODEL,
+        BLOCK_DPE=BLOCK_DPE,
+        BLOCK_DV=BLOCK_DV,
+        BLOCK_M=BLOCK_M,
+        BLOCK_N=BLOCK_N,
+        Lq=Lq,
+        Lv=Lv,
+        USE_CUSTOM_MASK=USE_CUSTOM_MASK,
+        IS_CAUSAL=is_causal,
+        SKIP_PREFIX_CUSTOM_MASK=SKIP_PREFIX_CUSTOM_MASK,
+        HAS_SINK=HAS_SINK,
+        STORE_TRANSPOSE=_is_hip,
+        num_warps=num_warps,
+        num_stages=num_stages,
+        **extra_kargs,
+    )
+
+
+def redundant_attention(
+    q_extend,
+    o_extend,
+    k_buffer,
+    v_buffer,
+    b_req_idx,
+    b_start_loc,
+    b_seq_len,
+    b_seq_len_prefix,
+    max_len_in_batch,
+):
+    total_token_num = k_buffer.shape[0]
+    B, H_Q, D = b_req_idx.shape[0], q_extend.shape[-2], q_extend.shape[-1]
+    q_buffer = torch.empty(
+        (total_token_num, H_Q, D), dtype=q_extend.dtype, device=q_extend.device
+    )
+
+    pt = 0
+    for i in range(B):
+        cur_seq_len_extend = b_seq_len[i] - b_seq_len_prefix[i]
+        pl, pr = b_start_loc[i] + b_seq_len_prefix[i], b_start_loc[i] + b_seq_len[i]
+        q_buffer[pl:pr] = q_extend[pt : pt + cur_seq_len_extend]
+        pt += cur_seq_len_extend
+
+    o_buffer = torch.empty_like(q_buffer)
+    context_attention_fwd(
+        q_buffer, k_buffer, v_buffer, o_buffer, b_start_loc, b_seq_len, max_len_in_batch
+    )
+
+    pt = 0
+    for i in range(B):
+        cur_seq_len_extend = b_seq_len[i] - b_seq_len_prefix[i]
+        pl, pr = b_start_loc[i] + b_seq_len_prefix[i], b_start_loc[i] + b_seq_len[i]
+        o_extend[pt : pt + cur_seq_len_extend] = o_buffer[pl:pr]
+        pt += cur_seq_len_extend
+
+
+@triton.jit
+def _fwd_kernel_unified(
+    Q,
+    O,
+    K_Buffer,
+    V_Buffer,
+    qo_indptr,
+    kv_indptr,
+    kv_indices,
+    prefix_lens,
+    mask_ptr,
+    mask_indptr,
+    sink_ptr,
+    window_start_pos,
+    sm_scale_withk,
+    v_scale,
+    kv_group_num,
+    stride_qbs,
+    stride_qh,
+    stride_obs,
+    stride_oh,
+    stride_buf_kbs,
+    stride_buf_kh,
+    stride_buf_vbs,
+    stride_buf_vh,
+    SLIDING_WINDOW_SIZE: tl.constexpr,
+    logit_cap: tl.constexpr,
+    xai_temperature_len: tl.constexpr,
+    Lq: tl.constexpr,
+    Lv: tl.constexpr,
+    BLOCK_DMODEL: tl.constexpr,
+    BLOCK_DPE: tl.constexpr,
+    BLOCK_DV: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    IS_CAUSAL: tl.constexpr,
+    USE_CUSTOM_MASK: tl.constexpr,
+    HAS_SINK: tl.constexpr,
+):
+    """
+    Unified 1-stage kernel for deterministic extend attention.
+    Both prefix and extend KV are accessed through the unified kv_indices.
+    """
+    cur_seq = tl.program_id(0)
+    cur_head = tl.program_id(1)
+    cur_block_m = tl.program_id(2)
+    cur_kv_head = cur_head // kv_group_num
+
+    # Load sequence information
+    cur_seq_q_start_idx = tl.load(qo_indptr + cur_seq)
+    cur_seq_q_len = tl.load(qo_indptr + cur_seq + 1) - cur_seq_q_start_idx
+    cur_seq_kv_start_idx = tl.load(kv_indptr + cur_seq)
+    cur_seq_kv_len = tl.load(kv_indptr + cur_seq + 1) - cur_seq_kv_start_idx
+    cur_seq_prefix_len = tl.load(prefix_lens + cur_seq)
+
+    # Load window start position for sliding window attention
+    # This is the absolute position of the first key in the window (0 if no sliding window)
+    cur_window_start = 0
+    if SLIDING_WINDOW_SIZE > 0:
+        cur_window_start = tl.load(window_start_pos + cur_seq)
+
+    # Load custom mask start index if using custom mask (for speculative decoding)
+    if USE_CUSTOM_MASK:
+        cur_seq_mask_start_idx = tl.load(mask_indptr + cur_seq)
+
+    offs_d = tl.arange(0, BLOCK_DMODEL)
+    offs_dv = tl.arange(0, BLOCK_DV)
+    offs_m = tl.arange(0, BLOCK_M)
+    mask_m = (cur_block_m * BLOCK_M + offs_m) < cur_seq_q_len
+    mask_d = offs_d < Lq
+    mask_dv = offs_dv < Lv
+
+    # XAI temperature handling
+    if xai_temperature_len > 0:
+        offs_qidx = cur_seq_prefix_len + cur_block_m * BLOCK_M + offs_m
+        xai_temperature_reg = tl.where(
+            offs_qidx < xai_temperature_len,
+            1.0,
+            xai_temperature_len / (offs_qidx + 1.0),
+        )
+
+    # Load Q
+    offs_q = (
+        (cur_seq_q_start_idx + cur_block_m * BLOCK_M + offs_m[:, None]) * stride_qbs
+        + cur_head * stride_qh
+        + offs_d[None, :]
+    )
+    q = tl.load(Q + offs_q, mask=(mask_m[:, None]) & (mask_d[None, :]), other=0.0)
+
+    if BLOCK_DPE > 0:
+        offs_dpe = BLOCK_DMODEL + tl.arange(0, BLOCK_DPE)
+        offs_qpe = (
+            (cur_seq_q_start_idx + cur_block_m * BLOCK_M + offs_m[:, None]) * stride_qbs
+            + cur_head * stride_qh
+            + offs_dpe[None, :]
+        )
+        qpe = tl.load(Q + offs_qpe, mask=mask_m[:, None], other=0.0)
+
+    # Initialize accumulators
+    offs_n = tl.arange(0, BLOCK_N)
+    acc = tl.zeros([BLOCK_M, BLOCK_DV], dtype=tl.float32)
+    deno = tl.zeros([BLOCK_M], dtype=tl.float32)
+    e_max = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
+
+    # Unified loop: process all KV tokens (prefix + extend)
+    for start_n in range(0, cur_seq_kv_len, BLOCK_N):
+        start_n = tl.multiple_of(start_n, BLOCK_N)
+        mask_n = (start_n + offs_n) < cur_seq_kv_len
+
+        # Compute mask
+        final_mask = mask_m[:, None] & mask_n[None, :]
+
+        # Apply custom mask if provided
+        if USE_CUSTOM_MASK:
+            custom_mask = tl.load(
+                mask_ptr
+                + cur_seq_mask_start_idx
+                + (cur_block_m * BLOCK_M + offs_m[:, None]) * cur_seq_kv_len
+                + start_n
+                + offs_n[None, :],
+                mask=(mask_m[:, None] & mask_n[None, :]),
+                other=0,
+            )
+            final_mask &= custom_mask
+
+        # Apply causal mask for extend part
+        if IS_CAUSAL and not USE_CUSTOM_MASK:
+            # Determine if current KV block is in extend region
+            # Only apply causal mask when both Q and K are in extend region
+            q_idx = cur_block_m * BLOCK_M + offs_m[:, None]
+            k_idx_in_total = start_n + offs_n[None, :]
+
+            # Causal mask: q_idx >= (k_idx - prefix_len) when k_idx >= prefix_len
+            # For prefix region (k_idx < prefix_len), no causal mask
+            k_is_extend = k_idx_in_total >= cur_seq_prefix_len
+            k_idx_in_extend = k_idx_in_total - cur_seq_prefix_len
+            causal_mask = tl.where(
+                k_is_extend,
+                q_idx >= k_idx_in_extend,
+                True,  # No causal mask for prefix
+            )
+            final_mask &= causal_mask
+
+        if SLIDING_WINDOW_SIZE > 0:
+            # Sliding window mask with correct absolute positions
+            # Q absolute position: window_start + prefix_len + q_position_in_extend
+            q_abs_pos = (
+                cur_window_start
+                + cur_seq_prefix_len
+                + cur_block_m * BLOCK_M
+                + offs_m[:, None]
+            )
+
+            # K absolute position: window_start + k_index_in_unified_array
+            k_abs_pos = cur_window_start + start_n + offs_n[None, :]
+
+            # Sliding window: query can attend to keys within window_size
+            window_mask = q_abs_pos <= (k_abs_pos + SLIDING_WINDOW_SIZE)
+            final_mask &= window_mask
+
+        # Check if we can skip this tile
+        SKIP_TILE = False
+        if USE_CUSTOM_MASK or SLIDING_WINDOW_SIZE > 0:
+            SKIP_TILE = tl.max(tl.max(final_mask.to(tl.int32), axis=1), axis=0) == 0
+
+        if not SKIP_TILE:
+            # Load KV indices
+            offs_kv_loc = tl.load(
+                kv_indices + cur_seq_kv_start_idx + start_n + offs_n,
+                mask=mask_n,
+                other=0,
+            )
+
+            # Load K
+            offs_buf_k = (
+                offs_kv_loc[None, :] * stride_buf_kbs
+                + cur_kv_head * stride_buf_kh
+                + offs_d[:, None]
+            )
+            k = tl.load(
+                K_Buffer + offs_buf_k,
+                mask=(mask_n[None, :]) & (mask_d[:, None]),
+                other=0.0,
+            )
+
+            # Compute QK
+            qk = tl.dot(q.to(k.dtype), k)
+            if BLOCK_DPE > 0:
+                offs_kpe = (
+                    offs_kv_loc[None, :] * stride_buf_kbs
+                    + cur_kv_head * stride_buf_kh
+                    + offs_dpe[:, None]
+                )
+                kpe = tl.load(
+                    K_Buffer + offs_kpe,
+                    mask=mask_n[None, :],
+                    other=0.0,
+                )
+                qk += tl.dot(qpe.to(kpe.dtype), kpe)
+
+            qk *= sm_scale_withk
+
+            if logit_cap > 0:
+                qk = logit_cap * tanh(qk / logit_cap)
+
+            if xai_temperature_len > 0:
+                qk *= xai_temperature_reg[:, None]
+
+            qk = tl.where(final_mask, qk, float("-inf"))
+
+            # Online softmax
+            row_max = tl.max(qk, 1)
+            row_max_fixed = tl.where(row_max == float("-inf"), -1e20, row_max)
+            n_e_max = tl.maximum(row_max_fixed, e_max)
+
+            re_scale = tl.exp(e_max - n_e_max)
+            p = tl.exp(qk - n_e_max[:, None])
+            deno = deno * re_scale + tl.sum(p, 1)
+
+            # Load V
+            offs_buf_v = (
+                offs_kv_loc[:, None] * stride_buf_vbs
+                + cur_kv_head * stride_buf_vh
+                + offs_dv[None, :]
+            )
+            v = tl.load(
+                V_Buffer + offs_buf_v,
+                mask=mask_n[:, None] & mask_dv[None, :],
+                other=0.0,
+            )
+            p = p.to(v.dtype)
+            acc = acc * re_scale[:, None] + tl.dot(p, v)
+
+            e_max = n_e_max
+
+    # Handle sink tokens
+    if HAS_SINK:
+        cur_sink = tl.load(sink_ptr + cur_head)
+        deno += tl.exp(cur_sink - e_max)
+
+    # Store output
+    offs_o = (
+        (cur_seq_q_start_idx + cur_block_m * BLOCK_M + offs_m[:, None]) * stride_obs
+        + cur_head * stride_oh
+        + offs_dv[None, :]
+    )
+    tl.store(
+        O + offs_o,
+        acc / deno[:, None] * v_scale,
+        mask=mask_m[:, None] & mask_dv[None, :],
+    )
+
+
+def extend_attention_fwd_unified(
+    q,
+    o,
+    k_buffer,
+    v_buffer,
+    k_scale,
+    v_scale,
+    qo_indptr,
+    kv_indptr,
+    kv_indices,
+    prefix_lens,
+    max_len_extend,
+    custom_mask=None,
+    mask_indptr=None,
+    sm_scale=None,
+    logit_cap=0.0,
+    is_causal=True,
+    sliding_window_size=-1,
+    sinks=None,
+    window_start_pos=None,
+    xai_temperature_len=-1,
+):
+    """
+    Unified 1-stage extend attention for deterministic inference.
+
+    Args:
+        q: Query tensor [num_tokens, num_heads, head_dim]
+        o: Output tensor [num_tokens, num_heads, head_dim]
+        k_buffer: Key cache buffer
+        v_buffer: Value cache buffer
+        qo_indptr: Query offsets [batch_size + 1]
+        kv_indptr: KV offsets [batch_size + 1] (includes both prefix and extend)
+        kv_indices: Unified KV indices (both prefix and extend)
+        prefix_lens: Prefix length for each sequence [batch_size]
+        max_len_extend: Maximum extend length
+        custom_mask: Custom attention mask (for speculative decoding tree attention)
+        mask_indptr: Mask offsets [batch_size + 1]
+        sm_scale: Softmax scale
+        logit_cap: Logit capping value
+        is_causal: Whether to apply causal mask
+        sliding_window_size: Sliding window size (-1 for no sliding window)
+        sinks: Sink tokens
+        window_start_pos: Absolute position of first key in sliding window [batch_size]
+                         (None if sliding window not used)
+        xai_temperature_len: XAI temperature length
+    """
+    Lq, Lv = q.shape[-1], v_buffer.shape[-1]
+
+    # Get block sizes and configuration
+    BLOCK_DMODEL, BLOCK_DPE, BLOCK_DV, BLOCK_M, BLOCK_N, num_warps = (
+        _get_block_sizes_for_extend_attention(Lq, Lv)
+    )
+
+    sm_scale = sm_scale or 1.0 / (Lq**0.5)
+    batch_size, head_num = qo_indptr.shape[0] - 1, q.shape[1]
+    kv_group_num = q.shape[1] // k_buffer.shape[1]
+
+    USE_CUSTOM_MASK = custom_mask is not None
+    HAS_SINK = sinks is not None
+
+    # For sliding window attention, window_start_pos tracks the absolute position
+    # of the first key in each sequence's window
+    if sliding_window_size > 0 and window_start_pos is None:
+        # If not provided, assume window starts at position 0
+        window_start_pos = torch.zeros(batch_size, dtype=torch.int32, device=q.device)
+
+    grid = (batch_size, head_num, triton.cdiv(max_len_extend, BLOCK_M))
+    num_stages = 1
+
+    extra_kargs = {}
+    if _is_hip:
+        extra_kargs = {"waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 2}
+
+    _fwd_kernel_unified[grid](
+        q,
+        o,
+        k_buffer,
+        v_buffer,
+        qo_indptr,
+        kv_indptr,
+        kv_indices,
+        prefix_lens,
+        custom_mask,
+        mask_indptr,
+        sinks,
+        window_start_pos,
+        sm_scale * k_scale,
+        v_scale,
+        kv_group_num,
+        q.stride(0),
+        q.stride(1),
+        o.stride(0),
+        o.stride(1),
+        k_buffer.stride(0),
+        k_buffer.stride(1),
+        v_buffer.stride(0),
+        v_buffer.stride(1),
+        SLIDING_WINDOW_SIZE=sliding_window_size,
+        logit_cap=logit_cap,
+        xai_temperature_len=xai_temperature_len,
+        BLOCK_DMODEL=BLOCK_DMODEL,
+        BLOCK_DPE=BLOCK_DPE,
+        BLOCK_DV=BLOCK_DV,
+        BLOCK_M=BLOCK_M,
+        BLOCK_N=BLOCK_N,
+        Lq=Lq,
+        Lv=Lv,
+        IS_CAUSAL=is_causal,
+        USE_CUSTOM_MASK=USE_CUSTOM_MASK,
+        HAS_SINK=HAS_SINK,
+        num_warps=num_warps,
+        num_stages=num_stages,
+        **extra_kargs,
+    )
diff --git a/sglang/python/sglang/srt/layers/attention/triton_ops/merge_state.py b/sglang/python/sglang/srt/layers/attention/triton_ops/merge_state.py
new file mode 100644
index 0000000000000000000000000000000000000000..955097fc9140e3fe5a3b94800a3d182d1056a37b
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/triton_ops/merge_state.py
@@ -0,0 +1,96 @@
+from typing import Optional, Tuple
+
+import torch
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def merge_state_kernel(
+    output,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] v_merged
+    output_lse,  # [NUM_TOKENS, NUM_HEADS] s_merged
+    prefix_output,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] v_a
+    prefix_lse,  # [NUM_TOKENS, NUM_HEADS] s_a
+    suffix_output,  # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] v_b
+    suffix_lse,  # [NUM_TOKENS, NUM_HEADS] s_b
+    HEAD_SIZE: tl.constexpr,
+    PADDED_HEAD_SIZE: tl.constexpr,
+    OUTPUT_LSE: tl.constexpr,
+):
+    token_idx = tl.program_id(0)
+    num_tokens = tl.num_programs(0)
+    head_idx = tl.program_id(1)
+    num_heads = tl.num_programs(1)
+
+    p_lse = tl.load(prefix_lse + token_idx * num_heads + head_idx)
+    s_lse = tl.load(suffix_lse + token_idx * num_heads + head_idx)
+    p_lse = float("-inf") if p_lse == float("inf") else p_lse
+    s_lse = float("-inf") if s_lse == float("inf") else s_lse
+
+    max_lse = tl.maximum(p_lse, s_lse)
+    p_lse = p_lse - max_lse
+    s_lse = s_lse - max_lse
+    out_se = tl.exp(p_lse) + tl.exp(s_lse)
+
+    if OUTPUT_LSE:
+        out_lse = tl.log(out_se) + max_lse
+        tl.store(output_lse + token_idx * num_heads + head_idx, out_lse)
+
+    head_arange = tl.arange(0, PADDED_HEAD_SIZE)
+    head_mask = head_arange < HEAD_SIZE
+    p_out = tl.load(
+        prefix_output
+        + token_idx * num_heads * HEAD_SIZE
+        + head_idx * HEAD_SIZE
+        + head_arange,
+        mask=head_mask,
+    )
+    s_out = tl.load(
+        suffix_output
+        + token_idx * num_heads * HEAD_SIZE
+        + head_idx * HEAD_SIZE
+        + head_arange,
+        mask=head_mask,
+    )
+
+    p_scale = tl.exp(p_lse) / out_se
+    s_scale = tl.exp(s_lse) / out_se
+    out = p_out * p_scale + s_out * s_scale
+    tl.store(
+        output + token_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE + head_arange,
+        out,
+        mask=head_mask,
+    )
+
+
+def merge_state_triton(
+    prefix_output: torch.Tensor,
+    prefix_lse: torch.Tensor,
+    suffix_output: torch.Tensor,
+    suffix_lse: torch.Tensor,
+    output: Optional[torch.Tensor] = None,
+    output_lse: Optional[torch.Tensor] = None,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+    # Avoid creating new tensors if they are already provided
+    if output is None:
+        output = torch.empty_like(prefix_output)
+    if output_lse is None:
+        output_lse = torch.empty_like(prefix_lse)
+
+    num_tokens = output.shape[0]
+    num_query_heads = output.shape[1]
+    head_size = output.shape[2]
+    padded_head_size = triton.next_power_of_2(head_size)
+
+    merge_state_kernel[(num_tokens, num_query_heads)](
+        output,
+        output_lse,
+        prefix_output,
+        prefix_lse,
+        suffix_output,
+        suffix_lse,
+        head_size,
+        padded_head_size,
+        output_lse is not None,
+    )
+    return output, output_lse
diff --git a/sglang/python/sglang/srt/layers/attention/triton_ops/prefill_attention.py b/sglang/python/sglang/srt/layers/attention/triton_ops/prefill_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac0fc72af1407d0b19028bcdc07ae9a87fcb70a7
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/triton_ops/prefill_attention.py
@@ -0,0 +1,217 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+Memory-efficient attention for prefill.
+It supporst page size = 1.
+"""
+
+# Adapted from
+# https://github.com/ModelTC/lightllm/blob/f2a54f0912293f683bf1d1695fd12c4098a5bf82/lightllm/models/llama/triton_kernel/context_flashattention_nopad.py#L1
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.utils import is_cuda, is_hip
+
+_is_cuda = is_cuda()
+_is_hip = is_hip()
+
+if _is_cuda or _is_hip:
+    CUDA_CAPABILITY = torch.cuda.get_device_capability()
+
+
+@triton.jit
+def _fwd_kernel(
+    Q,
+    K,
+    V,
+    sm_scale,
+    B_Start_Loc,
+    B_Seqlen,
+    Out,
+    stride_qbs,
+    stride_qh,
+    stride_kbs,
+    stride_kh,
+    stride_vbs,
+    stride_vh,
+    stride_obs,
+    stride_oh,
+    kv_group_num: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_DMODEL: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    IS_CAUSAL: tl.constexpr,
+    Lk: tl.constexpr,
+):
+    cur_batch = tl.program_id(0)
+    cur_head = tl.program_id(1)
+    start_m = tl.program_id(2)
+
+    cur_kv_head = cur_head // kv_group_num
+
+    cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
+    cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch)
+
+    block_start_loc = BLOCK_M * start_m
+
+    # initialize offsets
+    offs_n = tl.arange(0, BLOCK_N)
+    offs_d = tl.arange(0, BLOCK_DMODEL)
+    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    off_q = (
+        (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs
+        + cur_head * stride_qh
+        + offs_d[None, :]
+    )
+    off_k = offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh + offs_d[:, None]
+    off_v = offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh + offs_d[None, :]
+
+    mask_d = offs_d < Lk
+
+    q = tl.load(
+        Q + off_q,
+        mask=(offs_m[:, None] < cur_batch_seq_len) & (mask_d[None, :]),
+        other=0.0,
+    )
+
+    k_ptrs = K + off_k
+    v_ptrs = V + off_v
+
+    # initialize pointer to m and l
+    m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
+    l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
+    acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
+
+    block_mask = tl.where(block_start_loc < cur_batch_seq_len, 1, 0)
+
+    end_n = (
+        cur_batch_seq_len
+        if not IS_CAUSAL
+        else tl.minimum((start_m + 1) * BLOCK_M, cur_batch_seq_len)
+    )
+    for start_n in range(0, block_mask * end_n, BLOCK_N):
+        start_n = tl.multiple_of(start_n, BLOCK_N)
+        # -- compute qk ----
+        k = tl.load(
+            k_ptrs + (cur_batch_in_all_start_index + start_n) * stride_kbs,
+            mask=((start_n + offs_n[None, :]) < cur_batch_seq_len) & (mask_d[:, None]),
+            other=0.0,
+        )
+        # mask = tl.load(mask_ptrs + start_n, mask=start_n + offs_n < cur_batch_end_loc, other=0.0)
+
+        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
+        qk += tl.dot(q, k)
+        qk *= sm_scale
+
+        if IS_CAUSAL:
+            qk += tl.where(
+                (start_n + offs_n[None, :] < cur_batch_seq_len)
+                & (offs_m[:, None] >= (start_n + offs_n[None, :])),
+                0,
+                float("-inf"),
+            )
+        else:
+            qk += tl.where(
+                (start_n + offs_n[None, :]) < cur_batch_seq_len, 0, float("-inf")
+            )
+
+        # -- compute m_ij, p, l_ij
+        m_ij = tl.max(qk, 1)
+        p = tl.exp(qk - m_ij[:, None])
+        l_ij = tl.sum(p, 1)
+        # -- update m_i and l_i
+        m_i_new = tl.maximum(m_i, m_ij)
+        alpha = tl.exp(m_i - m_i_new)
+        beta = tl.exp(m_ij - m_i_new)
+        l_i_new = alpha * l_i + beta * l_ij
+        # -- update output accumulator --
+        # scale p
+        p_scale = beta / l_i_new
+        p = p * p_scale[:, None]
+        # scale acc
+        acc_scale = l_i / l_i_new * alpha
+        acc = acc * acc_scale[:, None]
+        # update acc
+        v = tl.load(
+            v_ptrs + (cur_batch_in_all_start_index + start_n) * stride_vbs,
+            mask=((start_n + offs_n[:, None]) < cur_batch_seq_len) & (mask_d[None, :]),
+            other=0.0,
+        )
+
+        p = p.to(v.dtype)
+        acc += tl.dot(p, v)
+        # update m_i and l_i
+        l_i = l_i_new
+        m_i = m_i_new
+    # initialize pointers to output
+    off_o = (
+        (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs
+        + cur_head * stride_oh
+        + offs_d[None, :]
+    )
+    out_ptrs = Out + off_o
+    tl.store(
+        out_ptrs, acc, mask=(offs_m[:, None] < cur_batch_seq_len) & (mask_d[None, :])
+    )
+
+
+def context_attention_fwd(
+    q, k, v, o, b_start_loc, b_seq_len, max_input_len, is_causal=True
+):
+    """
+    q, k, v: [b * s, head, head_dim]
+    b_start_loc: [b]
+    b_seq_len: [b]
+    out: [b * s, head, head_dim]
+    """
+    if (_is_cuda or _is_hip) and CUDA_CAPABILITY[0] > 8:
+        BLOCK = 128
+    else:
+        BLOCK = 64
+
+    Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
+
+    sm_scale = 1.0 / (Lq**0.5)
+    batch, head = b_seq_len.shape[0], q.shape[1]
+    kv_group_num = q.shape[1] // k.shape[1]
+
+    grid = (batch, head, triton.cdiv(max_input_len, BLOCK))
+    num_warps = 4 if Lk <= 64 else 8
+
+    _fwd_kernel[grid](
+        q,
+        k,
+        v,
+        sm_scale,
+        b_start_loc,
+        b_seq_len,
+        o,
+        q.stride(0),
+        q.stride(1),
+        k.stride(0),
+        k.stride(1),
+        v.stride(0),
+        v.stride(1),
+        o.stride(0),
+        o.stride(1),
+        kv_group_num=kv_group_num,
+        BLOCK_M=BLOCK,
+        BLOCK_DMODEL=triton.next_power_of_2(Lk),
+        BLOCK_N=BLOCK,
+        IS_CAUSAL=is_causal,
+        num_warps=num_warps,
+        num_stages=1,
+        Lk=Lk,
+    )
diff --git a/sglang/python/sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py b/sglang/python/sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py
new file mode 100644
index 0000000000000000000000000000000000000000..89b17ce2db1545661d26686581036e759ec1f6ea
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py
@@ -0,0 +1,439 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+Memory-efficient attention for decoding.
+It supports page size = 1.
+"""
+
+# Adapted from
+# https://github.com/ModelTC/lightllm/blob/96353e868a840db4d103138caf15ed9dbea8c186/lightllm/models/deepseek2/triton_kernel/gqa_flash_decoding_stage1.py
+# https://github.com/ModelTC/lightllm/blob/96353e868a840db4d103138caf15ed9dbea8c186/lightllm/models/deepseek2/triton_kernel/gqa_flash_decoding_stage2.py
+
+import triton
+import triton.language as tl
+
+from sglang.srt.layers.attention.triton_ops.decode_attention import (
+    _decode_softmax_reducev_fwd,
+)
+
+
+def is_hip():
+    return triton.runtime.driver.active.get_current_target().backend == "hip"
+
+
+_is_hip = is_hip()
+
+
+@triton.jit
+def tanh(x):
+    # Tanh is just a scaled sigmoid
+    return 2 * tl.sigmoid(2 * x) - 1
+
+
+@triton.jit
+def _fwd_grouped_kernel_stage1_rope(
+    Q,  # Holds [Q_NOPE; Q_PE], b x h x (d+r)
+    K_Buffer,  # Holds [KV; K_PE], b*s x (c+r)
+    V_buffer,  # Holds [KV], b*s x (c)
+    cos_sin_cache,  # max_seq_len x (rotary_dim * 2)
+    positions,  # sequence positions
+    sm_scale,
+    kv_indptr,
+    kv_indices,
+    Att_Out,  # b x h x NUM_KV_SPLITS x (kv_lora_rank + 1)
+    k_pe_t_out,
+    stride_qb,
+    stride_qh,
+    stride_buf_kbs,
+    stride_buf_vbs,
+    stride_mid_ob,
+    stride_mid_oh,
+    stride_mid_os,
+    stride_kpe_tokens_out_b,
+    stride_cos_sin_cache_s,
+    stride_positions_b,
+    rotary_dim: tl.constexpr,
+    kv_lora_rank: tl.constexpr,
+    qk_rope_head_dim: tl.constexpr,
+    kv_group_num: tl.constexpr,
+    q_head_num: tl.constexpr,
+    BLOCK_C: tl.constexpr,
+    BLOCK_R: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_H: tl.constexpr,
+    NUM_KV_SPLITS: tl.constexpr,
+    logit_cap: tl.constexpr,
+    USE_ROPE: tl.constexpr,
+    IS_NEOX_STYLE: tl.constexpr,
+):
+
+    cur_batch = tl.program_id(0)
+    cur_head_id = tl.program_id(1)
+    split_kv_id = tl.program_id(2)
+
+    if BLOCK_H < kv_group_num:
+        VALID_BLOCK_H: tl.constexpr = BLOCK_H
+    else:
+        VALID_BLOCK_H: tl.constexpr = kv_group_num
+    cur_head = cur_head_id * VALID_BLOCK_H + tl.arange(0, BLOCK_H)
+    mask_h = cur_head < (cur_head_id + 1) * VALID_BLOCK_H
+    mask_h = mask_h & (cur_head < q_head_num)
+
+    offs_c = tl.arange(0, BLOCK_C)
+    offs_qk_r = tl.arange(kv_lora_rank, kv_lora_rank + BLOCK_R)  # to get the k_pe
+
+    off_q_pe = (
+        cur_batch * stride_qb + cur_head[:, None] * stride_qh + offs_qk_r[None, :]
+    )
+    offs_q = cur_batch * stride_qb + cur_head[:, None] * stride_qh + offs_c[None, :]
+
+    mask_c = offs_c < kv_lora_rank
+    mask_qk_r = offs_qk_r < (kv_lora_rank + qk_rope_head_dim)
+
+    cur_batch_kv_start_idx = tl.load(kv_indptr + cur_batch)
+    cur_batch_seq_len = tl.load(kv_indptr + cur_batch + 1) - cur_batch_kv_start_idx
+
+    q = tl.load(Q + offs_q, mask=(mask_h[:, None]) & (mask_c[None, :]), other=0.0)
+    q_pe = tl.load(
+        Q + off_q_pe, mask=(mask_h[:, None]) & (mask_qk_r[None, :]), other=0.0
+    )
+
+    kv_len_per_split = tl.cdiv(cur_batch_seq_len, NUM_KV_SPLITS)
+    split_kv_start = kv_len_per_split * split_kv_id
+    split_kv_end = tl.minimum(split_kv_start + kv_len_per_split, cur_batch_seq_len)
+
+    # apply rotary embedding for q_pe, and k_pe (last token per batch of K_PE)
+    LAST_SPLIT = split_kv_end == cur_batch_seq_len
+    k_pe_last_token = tl.zeros([BLOCK_R], dtype=q.dtype)
+
+    if USE_ROPE:
+        if IS_NEOX_STYLE:
+            # [BLOCK_ROTARY // 2, BLOCK_ROTARY // 2 + 1, BLOCK_ROTARY // 2 + 2, ..., 0, 1, 2, ..., BLOCK_ROTARY // 2 - 1, pass:]
+            offs_qk_rot_r = kv_lora_rank + (
+                (tl.arange(0, BLOCK_R) + (rotary_dim // 2)) % rotary_dim
+            )
+            # Which elements to flip
+            mask_rotate = tl.arange(0, BLOCK_R) < (rotary_dim // 2)
+            # [0 , 1, 2, ..., rotary_dim // 2 - 1, 0 , 1, 2, ..., rotary_dim // 2 - 1]
+            offs_rotary = tl.arange(0, BLOCK_R) % (rotary_dim // 2)
+        else:
+            # [1, 0, 3, 2, 5, 4, ..., BLOCK_R, BLOCK_R - 1]
+            offs_qk_rot_r = (
+                kv_lora_rank
+                + (((tl.arange(0, BLOCK_R) + 1) % 2) * 2)
+                - 1
+                + tl.arange(0, BLOCK_R)
+            )
+            mask_rotate = tl.arange(0, BLOCK_R) % 2 < 1
+            # [0, 0, 1, 1, ..., rotary_dim // 2 - 1, rotary_dim // 2 - 1]
+            offs_rotary = tl.arange(0, BLOCK_R) // 2
+
+        if qk_rope_head_dim > rotary_dim:
+            offs_qk_rot_r = tl.where(
+                tl.arange(0, BLOCK_R) < rotary_dim, offs_qk_rot_r, tl.arange(0, BLOCK_R)
+            )
+            offs_rotary = tl.where(
+                tl.arange(0, BLOCK_R) < rotary_dim, offs_rotary, tl.arange(0, BLOCK_R)
+            )
+
+        mask_rotary = tl.arange(0, BLOCK_R) < rotary_dim
+
+        pos = tl.load(positions + cur_batch * stride_positions_b)
+        cos = tl.load(
+            cos_sin_cache + pos * stride_cos_sin_cache_s + offs_rotary,
+            mask=mask_rotary,
+            other=1.0,
+        )
+        sin = tl.load(
+            cos_sin_cache
+            + pos * stride_cos_sin_cache_s
+            + offs_rotary
+            + rotary_dim // 2,
+            mask_rotary,
+            other=0.0,
+        )
+
+        off_q_pe_rot = (
+            cur_batch * stride_qb
+            + cur_head[:, None] * stride_qh
+            + offs_qk_rot_r[None, :]
+        )
+        mask_qk_rot_r = offs_qk_rot_r < (kv_lora_rank + qk_rope_head_dim)
+
+        # 0, 2, 4,.... 1, 3, 5...
+        q_pe_rot = tl.load(
+            Q + off_q_pe_rot,
+            mask=(mask_h[:, None]) & (mask_qk_rot_r[None, :]),
+            other=0.0,
+        )
+        q_pe_rot = tl.where(mask_rotate[None, :], -q_pe_rot, q_pe_rot)
+
+        q_pe = q_pe * cos + q_pe_rot * sin
+
+        # we only apply to the last token in the K_PE
+        if LAST_SPLIT:
+            # debug assert
+            if (cur_batch == 0 and cur_head == 0) and split_kv_id < NUM_KV_SPLITS - 1:
+                tl.device_assert(False, "Only last split should compute k_pe")
+
+            kv_loc = tl.load(
+                kv_indices + cur_batch_kv_start_idx + cur_batch_seq_len - 1
+            )
+            offs_buf_k_pe_last_token = kv_loc * stride_buf_kbs + offs_qk_r
+            offs_buf_k_pe_rot_last_token = kv_loc * stride_buf_kbs + offs_qk_rot_r
+            k_pe_last_token = tl.load(K_Buffer + offs_buf_k_pe_last_token)
+
+            k_pe_rot_last_token = tl.load(K_Buffer + offs_buf_k_pe_rot_last_token)
+            k_pe_rot_last_token = tl.where(
+                mask_rotate, -k_pe_rot_last_token, k_pe_rot_last_token
+            )
+
+            k_pe_last_token = k_pe_last_token * cos + k_pe_rot_last_token * sin
+
+    e_max = tl.zeros([BLOCK_H], dtype=tl.float32) - float("inf")
+    e_sum = tl.zeros([BLOCK_H], dtype=tl.float32)
+    acc = tl.zeros([BLOCK_H, BLOCK_C], dtype=tl.float32)
+
+    if split_kv_end > split_kv_start:
+        for start_n in range(split_kv_start, split_kv_end, BLOCK_N):
+            offs_n = start_n + tl.arange(0, BLOCK_N)
+            kv_loc = tl.load(
+                kv_indices + cur_batch_kv_start_idx + offs_n,
+                mask=offs_n < split_kv_end,
+                other=0,
+            )
+
+            offs_buf_kv = kv_loc[None, :] * stride_buf_kbs + offs_c[:, None]
+            offs_buf_k_pe = kv_loc[None, :] * stride_buf_kbs + offs_qk_r[:, None]
+
+            k_pe = tl.load(
+                K_Buffer + offs_buf_k_pe,
+                mask=(offs_n[None, :] < split_kv_end) & (mask_qk_r[:, None]),
+                other=0.0,
+            )  # positional embedding part of keys
+
+            if (USE_ROPE and LAST_SPLIT) and start_n >= cur_batch_seq_len - BLOCK_N:
+                k_pe = tl.where(
+                    offs_n[None, :] != (split_kv_end - 1),
+                    k_pe,
+                    k_pe_last_token[:, None],
+                )
+
+            # (16, 64) x (64, 32)
+            # dot product of rope parts
+            qk = tl.dot(q_pe, k_pe.to(q_pe.dtype))
+
+            kv = tl.load(
+                K_Buffer + offs_buf_kv,
+                mask=(offs_n[None, :] < split_kv_end) & (mask_c[:, None]),
+                other=0.0,
+            )  # the shared latent tensor for keys and values
+
+            # (16, 512) x (512, 32)
+            # dot product of nope parts
+            qk += tl.dot(q, kv)
+
+            qk *= sm_scale
+
+            if logit_cap > 0:
+                qk = logit_cap * tanh(qk / logit_cap)
+
+            qk = tl.where(
+                mask_h[:, None] & (offs_n[None, :] < split_kv_end), qk, float("-inf")
+            )
+
+            offs_buf_v = kv_loc[:, None] * stride_buf_vbs + offs_c[None, :]
+            v = tl.load(
+                V_buffer + offs_buf_v,
+                mask=(offs_n[:, None] < split_kv_end) & (mask_c[None, :]),
+                other=0.0,
+            )
+
+            n_e_max = tl.maximum(tl.max(qk, 1), e_max)
+            re_scale = tl.exp(e_max - n_e_max)
+            p = tl.exp(qk - n_e_max[:, None])
+            acc *= re_scale[:, None]
+            # (16, 32) x (32, 512)
+            acc += tl.dot(p.to(v.dtype), v)
+
+            e_sum = e_sum * re_scale + tl.sum(p, 1)
+            e_max = n_e_max
+
+        offs_mid_o = (
+            cur_batch * stride_mid_ob
+            + cur_head[:, None] * stride_mid_oh
+            + split_kv_id * stride_mid_os
+            + offs_c[None, :]
+        )
+
+        if USE_ROPE:
+            if LAST_SPLIT:
+                k_pe_last_token_ptrs = (
+                    k_pe_t_out
+                    + cur_batch * stride_kpe_tokens_out_b
+                    + tl.arange(0, BLOCK_R)
+                )
+                tl.store(k_pe_last_token_ptrs, k_pe_last_token, mask=mask_qk_r)
+
+        tl.store(
+            Att_Out + offs_mid_o,
+            acc / e_sum[:, None],
+            mask=(mask_h[:, None]) & (mask_c[None, :]),
+        )
+
+        offs_mid_o_1 = (
+            cur_batch * stride_mid_ob
+            + cur_head * stride_mid_oh
+            + split_kv_id * stride_mid_os
+            + kv_lora_rank
+        )
+
+        tl.store(
+            Att_Out + offs_mid_o_1,
+            e_max + tl.log(e_sum),
+            mask=mask_h,
+        )
+
+
+# TODO rope offset
+def _decode_grouped_att_m_fwd_rope(
+    q,
+    k_buffer,
+    v_buffer,
+    att_out,
+    k_pe_tokens_out,
+    kv_lora_rank,  # c
+    cos_sin_cache,
+    positions,
+    rotary_dim,
+    kv_indptr,
+    kv_indices,
+    num_kv_splits,
+    sm_scale,
+    logit_cap,
+    use_rope,
+    is_neox_style=True,
+):
+    if use_rope:
+        assert (
+            k_pe_tokens_out is not None
+        ), "We must output the k_pe tokens with rope applied if rope fusion enabled."
+
+    BLOCK = 32
+
+    # # [TODO] work around shmem limit on MI3xx
+    # if _is_hip and kv_lora_rank >= 576:
+    #     BLOCK = 16
+
+    qk_rope_head_dim = k_buffer.shape[-1] - kv_lora_rank
+    batch, head_num = kv_indptr.shape[0] - 1, q.shape[1]
+    kv_group_num = q.shape[1] // k_buffer.shape[1]
+
+    BLOCK_C = triton.next_power_of_2(kv_lora_rank)
+    BLOCK_R = triton.next_power_of_2(qk_rope_head_dim)
+
+    BLOCK_H = 16
+    NUM_KV_SPLITS = num_kv_splits
+    grid = (
+        batch,
+        triton.cdiv(head_num, min(BLOCK_H, kv_group_num)),
+        NUM_KV_SPLITS,
+    )
+
+    extra_kargs = {}
+    num_stages = 2
+    if _is_hip:
+        # https://rocm.docs.amd.com/en/docs-6.2.0/how-to/llm-fine-tuning-optimization/optimizing-triton-kernel.html
+        # https://github.com/triton-lang/triton/blob/main/third_party/amd/backend/compiler.py
+        extra_kargs = {"waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 2}
+        num_stages = 1
+
+    _fwd_grouped_kernel_stage1_rope[grid](
+        q,
+        k_buffer,
+        v_buffer,
+        cos_sin_cache,
+        positions,
+        sm_scale,
+        kv_indptr,
+        kv_indices,
+        att_out,
+        k_pe_tokens_out,
+        q.stride(0),
+        q.stride(1),
+        k_buffer.stride(0),
+        v_buffer.stride(0),
+        att_out.stride(0),
+        att_out.stride(1),
+        att_out.stride(2),
+        k_pe_tokens_out.stride(0) if use_rope else 0,
+        cos_sin_cache.stride(0) if use_rope else 0,
+        positions.stride(0) if use_rope else 0,
+        rotary_dim,
+        kv_lora_rank,
+        qk_rope_head_dim,
+        kv_group_num=kv_group_num,
+        q_head_num=head_num,
+        BLOCK_C=BLOCK_C,
+        BLOCK_R=BLOCK_R,
+        BLOCK_N=BLOCK,
+        BLOCK_H=BLOCK_H,
+        NUM_KV_SPLITS=NUM_KV_SPLITS,
+        logit_cap=logit_cap,
+        USE_ROPE=use_rope,
+        IS_NEOX_STYLE=is_neox_style,
+        num_warps=4,
+        num_stages=num_stages,
+        **extra_kargs
+    )
+
+
+def decode_attention_fwd_grouped_rope(
+    q,
+    k_buffer,
+    v_buffer,
+    o,
+    kv_indptr,
+    kv_indices,
+    k_pe_tokens,
+    kv_lora_rank,
+    rotary_dim,
+    cos_sin_cache,
+    positions,
+    attn_logits,
+    num_kv_splits,
+    sm_scale,
+    logit_cap=0.0,
+    use_rope=False,
+    is_neox_style=False,
+):
+    _decode_grouped_att_m_fwd_rope(
+        q,
+        k_buffer,
+        v_buffer,
+        attn_logits,
+        k_pe_tokens,
+        kv_lora_rank,
+        cos_sin_cache,
+        positions,
+        rotary_dim,
+        kv_indptr,
+        kv_indices,
+        num_kv_splits,
+        sm_scale,
+        logit_cap,
+        use_rope,
+        is_neox_style,
+    )
+    _decode_softmax_reducev_fwd(attn_logits, q, o, v_buffer, kv_indptr, num_kv_splits)
diff --git a/sglang/python/sglang/srt/layers/attention/triton_ops/trtllm_fp8_kv_kernel.py b/sglang/python/sglang/srt/layers/attention/triton_ops/trtllm_fp8_kv_kernel.py
new file mode 100644
index 0000000000000000000000000000000000000000..41cc9c9dc36739ae43a548124b32935fc4f54792
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/triton_ops/trtllm_fp8_kv_kernel.py
@@ -0,0 +1,504 @@
+"""
+Fused FP8 quantization + paged KV cache write kernel for TRTLLM MHA backend.
+
+This kernel fuses the following operations:
+1. FP8 quantization of K and V tensors (from BF16/FP16 to FP8)
+2. Per-token or per-page scale computation
+3. Writing quantized K/V to paged KV cache layout
+
+Performance benefits:
+- Eliminates intermediate FP8 tensors in memory
+- Reduces kernel launch overhead
+- Better memory bandwidth utilization
+"""
+
+import logging
+from typing import Optional
+
+import torch
+import triton
+import triton.language as tl
+
+logger = logging.getLogger(__name__)
+
+
+@triton.jit
+def _process_kv_tensor(
+    token_id,
+    head_block_id,
+    page_id,
+    page_offset,
+    input_ptr,
+    cache_ptr,
+    inv_scale,
+    use_provided_scale: tl.constexpr,
+    num_kv_heads: tl.constexpr,
+    head_dim: tl.constexpr,
+    input_stride_token: tl.constexpr,
+    input_stride_head: tl.constexpr,
+    input_stride_dim: tl.constexpr,
+    cache_stride_page: tl.constexpr,
+    cache_stride_offset: tl.constexpr,
+    cache_stride_head: tl.constexpr,
+    cache_stride_dim: tl.constexpr,
+    BLOCK_HEAD: tl.constexpr,
+    BLOCK_DIM: tl.constexpr,
+):
+    """Process a block of heads for a single K or V tensor."""
+    head_idx = head_block_id * BLOCK_HEAD
+    num_heads_in_block = min(BLOCK_HEAD, num_kv_heads - head_idx)
+
+    for dim_idx in range(0, head_dim, BLOCK_DIM):
+        num_dims_in_block = min(BLOCK_DIM, head_dim - dim_idx)
+
+        head_offsets = head_idx + tl.arange(0, BLOCK_HEAD)
+        dim_offsets = dim_idx + tl.arange(0, BLOCK_DIM)
+
+        head_mask = head_offsets < (head_idx + num_heads_in_block)
+        dim_mask = dim_offsets < (dim_idx + num_dims_in_block)
+
+        # Load from input using 3D strides
+        input_offsets = (
+            token_id * input_stride_token
+            + head_offsets[:, None] * input_stride_head
+            + dim_offsets[None, :] * input_stride_dim
+        )
+        mask = head_mask[:, None] & dim_mask[None, :]
+
+        block = tl.load(input_ptr + input_offsets, mask=mask, other=0.0)
+
+        # Quantize to FP8
+        if use_provided_scale:
+            block_fp8 = (block * inv_scale).to(tl.float8e4nv)
+        else:
+            block_fp8 = block.to(tl.float8e4nv)
+
+        # Write to cache at [page_id, page_offset, head, dim]
+        cache_offsets = (
+            page_id * cache_stride_page
+            + page_offset * cache_stride_offset
+            + head_offsets[:, None] * cache_stride_head
+            + dim_offsets[None, :] * cache_stride_dim
+        )
+
+        tl.store(cache_ptr + cache_offsets, block_fp8, mask=mask)
+
+
+@triton.jit
+def _fused_fp8_set_kv_buffer_kernel(
+    # Input tensors (post-RoPE K and V in FP16/BF16)
+    k_ptr,  # [num_tokens, num_kv_heads, head_dim]
+    v_ptr,  # [num_tokens, num_kv_heads, head_dim]
+    # Output KV cache buffers (FP8 paged layout)
+    k_cache_ptr,  # [total_slots, num_kv_heads, head_dim]
+    v_cache_ptr,  # [total_slots, num_kv_heads, head_dim]
+    # Cache location indices
+    cache_loc_ptr,  # [num_tokens] -> token to cache location mapping
+    # Pointers to scalar inverse scales (computed on GPU in wrapper)
+    inv_k_scale_ptr,  # pointer to 0-D tensor on GPU
+    inv_v_scale_ptr,  # pointer to 0-D tensor on GPU
+    use_provided_scale: tl.constexpr,  # whether to use provided scale
+    # Tensor dimensions
+    num_kv_heads: tl.constexpr,
+    head_dim: tl.constexpr,
+    page_size: tl.constexpr,
+    # Strides for K input [num_tokens, num_kv_heads, head_dim]
+    k_stride_token: tl.constexpr,
+    k_stride_head: tl.constexpr,
+    k_stride_dim: tl.constexpr,
+    # Strides for K cache [total_slots, num_kv_heads, head_dim] (logically paged)
+    k_cache_stride_page: tl.constexpr,
+    k_cache_stride_offset: tl.constexpr,
+    k_cache_stride_head: tl.constexpr,
+    k_cache_stride_dim: tl.constexpr,
+    # Strides for V input [num_tokens, num_kv_heads, head_dim]
+    v_stride_token: tl.constexpr,
+    v_stride_head: tl.constexpr,
+    v_stride_dim: tl.constexpr,
+    # Strides for V cache [total_slots, num_kv_heads, head_dim] (logically paged)
+    v_cache_stride_page: tl.constexpr,
+    v_cache_stride_offset: tl.constexpr,
+    v_cache_stride_head: tl.constexpr,
+    v_cache_stride_dim: tl.constexpr,
+    # Block sizes
+    BLOCK_HEAD: tl.constexpr,  # Number of heads per block
+    BLOCK_DIM: tl.constexpr,  # Head dimension block size
+):
+    """
+    Fused FP8 quantization + paged KV cache write kernel.
+
+    Each program processes one token-head_block-kv combination, quantizing and writing
+    to the appropriate page in the KV cache.
+
+    Grid: (num_tokens, num_head_blocks, 2) where dim2: 0=K, 1=V
+    """
+    # Get program IDs
+    token_id = tl.program_id(0)
+    head_block_id = tl.program_id(1)
+    kv_idx = tl.program_id(2)  # 0 for K, 1 for V
+
+    # Get cache location for this token
+    cache_loc = tl.load(cache_loc_ptr + token_id)
+
+    # Compute page_id and offset within page
+    page_id = cache_loc // page_size
+    page_offset = cache_loc % page_size
+
+    # Select K or V based on kv_idx
+    if kv_idx == 0:
+        # Process K tensor
+        if use_provided_scale:
+            inv_scale = tl.load(inv_k_scale_ptr)
+        else:
+            inv_scale = 1.0
+        _process_kv_tensor(
+            token_id,
+            head_block_id,
+            page_id,
+            page_offset,
+            k_ptr,
+            k_cache_ptr,
+            inv_scale,
+            use_provided_scale,
+            num_kv_heads,
+            head_dim,
+            k_stride_token,
+            k_stride_head,
+            k_stride_dim,
+            k_cache_stride_page,
+            k_cache_stride_offset,
+            k_cache_stride_head,
+            k_cache_stride_dim,
+            BLOCK_HEAD,
+            BLOCK_DIM,
+        )
+    else:
+        # Process V tensor
+        if use_provided_scale:
+            inv_scale = tl.load(inv_v_scale_ptr)
+        else:
+            inv_scale = 1.0
+        _process_kv_tensor(
+            token_id,
+            head_block_id,
+            page_id,
+            page_offset,
+            v_ptr,
+            v_cache_ptr,
+            inv_scale,
+            use_provided_scale,
+            num_kv_heads,
+            head_dim,
+            v_stride_token,
+            v_stride_head,
+            v_stride_dim,
+            v_cache_stride_page,
+            v_cache_stride_offset,
+            v_cache_stride_head,
+            v_cache_stride_dim,
+            BLOCK_HEAD,
+            BLOCK_DIM,
+        )
+
+
+def fused_fp8_set_kv_buffer(
+    k: torch.Tensor,  # [num_tokens, num_kv_heads, head_dim] or [num_tokens, num_kv_heads * head_dim]
+    v: torch.Tensor,  # [num_tokens, num_kv_heads, head_dim] or [num_tokens, num_kv_heads * head_dim]
+    k_cache: torch.Tensor,  # [total_slots, num_kv_heads, head_dim] or [num_pages, page_size, num_kv_heads, head_dim]
+    v_cache: torch.Tensor,  # [total_slots, num_kv_heads, head_dim] or [num_pages, page_size, num_kv_heads, head_dim]
+    cache_loc: torch.Tensor,  # [num_tokens], dtype=int32
+    k_scale: Optional[
+        float
+    ] = None,  # Scalar scale (matching original set_kv_buffer signature)
+    v_scale: Optional[float] = None,
+    page_size: int = 16,
+    use_triton: bool = True,  # Whether to use Triton kernel (set to False to force naive fallback)
+) -> None:
+    """
+    Python wrapper for the fused FP8 quantization + paged KV cache write kernel.
+
+    This function replicates the exact behavior of the original set_kv_buffer but with
+    a fused kernel that combines FP8 quantization and cache write.
+
+    Args:
+        k: Key tensor after RoPE, can be 2D or 3D
+        v: Value tensor, can be 2D or 3D
+        k_cache: Paged K cache buffer in FP8
+        v_cache: Paged V cache buffer in FP8
+        cache_loc: Cache location for each token, shape [num_tokens]
+        k_scale: Optional scalar scale for K (matching original set_kv_buffer)
+        v_scale: Optional scalar scale for V (matching original set_kv_buffer)
+        page_size: Number of tokens per page
+        use_triton: Whether to use optimized Triton kernel
+    """
+    num_tokens = k.shape[0]
+
+    # Step 1: Infer num_kv_heads and head_dim from cache shape
+    if k_cache.ndim == 3:
+        # 3D cache layout: [total_slots, num_kv_heads, head_dim]
+        total_slots, num_kv_heads, head_dim = k_cache.shape
+        assert (
+            total_slots % page_size == 0
+        ), f"total_slots ({total_slots}) must be divisible by page_size ({page_size})"
+        num_pages = total_slots // page_size
+    elif k_cache.ndim == 4:
+        # 4D cache layout: [num_pages, page_size, num_kv_heads, head_dim]
+        num_pages, ps, num_kv_heads, head_dim = k_cache.shape
+        assert (
+            ps == page_size
+        ), f"page_size mismatch: cache has {ps}, expected {page_size}"
+        total_slots = num_pages * page_size
+    else:
+        raise ValueError(f"Unsupported k_cache.ndim={k_cache.ndim}, expected 3 or 4")
+
+    # Step 2: Validate k, v shapes and normalize
+    # Store original 3D shape for Triton path
+    k_3d = None
+    v_3d = None
+
+    if k.ndim == 3:
+        # Input is [num_tokens, num_kv_heads, head_dim]
+        assert (
+            k.shape[1] == num_kv_heads
+        ), f"num_kv_heads mismatch: k.shape[1]={k.shape[1]} vs cache={num_kv_heads}"
+        assert (
+            k.shape[2] == head_dim
+        ), f"head_dim mismatch: k.shape[2]={k.shape[2]} vs cache={head_dim}"
+        assert v.shape[1] == num_kv_heads and v.shape[2] == head_dim, "v shape mismatch"
+
+        # Keep 3D for Triton kernel
+        k_3d = k
+        v_3d = v
+        # Create 2D view for naive fallback (will be used only if use_triton=False)
+        k_2d = k.reshape(num_tokens, num_kv_heads * head_dim)
+        v_2d = v.reshape(num_tokens, num_kv_heads * head_dim)
+    elif k.ndim == 2:
+        # Input is already [num_tokens, num_kv_heads * head_dim]
+        assert (
+            k.shape[1] == num_kv_heads * head_dim
+        ), f"k.shape[1]={k.shape[1]} != {num_kv_heads * head_dim}"
+        assert (
+            v.shape[1] == num_kv_heads * head_dim
+        ), f"v.shape[1]={v.shape[1]} != {num_kv_heads * head_dim}"
+
+        # Create 3D view for Triton kernel
+        k_3d = k.view(num_tokens, num_kv_heads, head_dim)
+        v_3d = v.view(num_tokens, num_kv_heads, head_dim)
+        # Keep 2D for naive
+        k_2d = k
+        v_2d = v
+    else:
+        raise ValueError(f"Unsupported k.ndim={k.ndim}, expected 2 or 3")
+
+    # Step 3: Compute cache strides based on layout
+    if k_cache.ndim == 3:
+        # 3D cache: [total_slots, num_kv_heads, head_dim]
+        stride_slot = k_cache.stride(0)
+        stride_head = k_cache.stride(1)
+        stride_dim = k_cache.stride(2)
+
+        k_cache_stride_page = stride_slot * page_size
+        k_cache_stride_offset = stride_slot
+        k_cache_stride_head = stride_head
+        k_cache_stride_dim = stride_dim
+
+        v_stride_slot = v_cache.stride(0)
+        v_stride_head = v_cache.stride(1)
+        v_stride_dim = v_cache.stride(2)
+
+        v_cache_stride_page = v_stride_slot * page_size
+        v_cache_stride_offset = v_stride_slot
+        v_cache_stride_head = v_stride_head
+        v_cache_stride_dim = v_stride_dim
+    else:
+        # 4D cache: [num_pages, page_size, num_kv_heads, head_dim]
+        k_cache_stride_page = k_cache.stride(0)
+        k_cache_stride_offset = k_cache.stride(1)
+        k_cache_stride_head = k_cache.stride(2)
+        k_cache_stride_dim = k_cache.stride(3)
+
+        v_cache_stride_page = v_cache.stride(0)
+        v_cache_stride_offset = v_cache.stride(1)
+        v_cache_stride_head = v_cache.stride(2)
+        v_cache_stride_dim = v_cache.stride(3)
+
+    # Decide whether to use provided scale
+    use_provided_scale = k_scale is not None and v_scale is not None
+
+    if use_triton and num_tokens > 0:
+        # Use optimized Triton kernel
+        # Compute input strides for 3D k, v: [num_tokens, num_kv_heads, head_dim]
+        k_stride_token = k_3d.stride(0)
+        k_stride_head = k_3d.stride(1)
+        k_stride_dim = k_3d.stride(2)
+
+        v_stride_token = v_3d.stride(0)
+        v_stride_head = v_3d.stride(1)
+        v_stride_dim = v_3d.stride(2)
+
+        # Block sizes for tiling (tunable)
+        BLOCK_HEAD = min(num_kv_heads, 8)  # Process up to 8 heads at once
+        BLOCK_DIM = min(head_dim, 128)  # Process up to 128 dims at once
+
+        # Compute number of head blocks
+        num_head_blocks = (num_kv_heads + BLOCK_HEAD - 1) // BLOCK_HEAD
+
+        # Grid: (num_tokens, num_head_blocks, 2)
+        # - dim 0: tokens
+        # - dim 1: head blocks
+        # - dim 2: K/V (0=K, 1=V)
+        grid = (num_tokens, num_head_blocks, 2)
+
+        device = k_3d.device
+
+        def _to_tensor_scale(scale):
+            """Convert scale to 0-D CUDA tensor (accepts Python float or Tensor)."""
+            if isinstance(scale, torch.Tensor):
+                return scale.to(device=device, dtype=torch.float32)
+            else:
+                # Python float / np scalar
+                return torch.tensor(float(scale), device=device, dtype=torch.float32)
+
+        # Compute inverse scales on GPU to avoid GPU→CPU sync in CUDA graph capture.
+        # Previously we used float(k_scale) which triggers synchronization and fails
+        # during CUDA graph capture with cudaErrorStreamCaptureUnsupported.
+        if use_provided_scale:
+            k_scale_tensor = _to_tensor_scale(k_scale)
+            v_scale_tensor = _to_tensor_scale(v_scale)
+
+            # Pure GPU scalar operation, safe for CUDA graph
+            inv_k_scale = (1.0 / k_scale_tensor).to(device=device, dtype=torch.float32)
+            inv_v_scale = (1.0 / v_scale_tensor).to(device=device, dtype=torch.float32)
+
+            inv_k_scale_ptr = inv_k_scale
+            inv_v_scale_ptr = inv_v_scale
+        else:
+            # When use_provided_scale=False, kernel uses constant 1.0 for inv_scale.
+            # Triton will optimize away the tl.load() calls via constant folding.
+            # We pass dummy pointers (k_3d) which won't be accessed in the kernel.
+            # This avoids creating new GPU tensors during CUDA graph capture.
+            inv_k_scale_ptr = k_3d
+            inv_v_scale_ptr = k_3d
+
+        # Launch Triton kernel
+        _fused_fp8_set_kv_buffer_kernel[grid](
+            k_3d,
+            v_3d,
+            k_cache,
+            v_cache,
+            cache_loc,
+            inv_k_scale_ptr,
+            inv_v_scale_ptr,
+            use_provided_scale,
+            num_kv_heads,
+            head_dim,
+            page_size,
+            k_stride_token,
+            k_stride_head,
+            k_stride_dim,
+            k_cache_stride_page,
+            k_cache_stride_offset,
+            k_cache_stride_head,
+            k_cache_stride_dim,
+            v_stride_token,
+            v_stride_head,
+            v_stride_dim,
+            v_cache_stride_page,
+            v_cache_stride_offset,
+            v_cache_stride_head,
+            v_cache_stride_dim,
+            BLOCK_HEAD=BLOCK_HEAD,
+            BLOCK_DIM=BLOCK_DIM,
+        )
+    else:
+        # Fallback to naive implementation
+        _naive_fp8_set_kv_buffer(
+            k_2d, v_2d, k_cache, v_cache, cache_loc, k_scale, v_scale, page_size
+        )
+
+
+def _naive_fp8_set_kv_buffer(
+    k: torch.Tensor,
+    v: torch.Tensor,
+    k_cache: torch.Tensor,
+    v_cache: torch.Tensor,
+    cache_loc: torch.Tensor,
+    k_scale: Optional[float],
+    v_scale: Optional[float],
+    page_size: int,
+) -> None:
+    """
+    Naive fallback implementation that mimics the original set_kv_buffer logic.
+
+    This directly replicates the behavior of MHATokenToKVPool.set_kv_buffer:
+    1. Apply scale (if k.dtype != cache.dtype and scale is provided)
+    2. Convert to FP8
+    3. Write to cache at cache_loc
+
+    Args:
+        k: [num_tokens, num_kv_heads * head_dim], already reshaped to 2D
+        v: [num_tokens, num_kv_heads * head_dim], already reshaped to 2D
+        k_cache: [total_slots, num_kv_heads, head_dim] or [num_pages, page_size, num_kv_heads, head_dim]
+        v_cache: Same shape as k_cache
+        cache_loc: [num_tokens]
+        k_scale: Optional scale for K
+        v_scale: Optional scale for V
+        page_size: Tokens per page
+    """
+    num_tokens = k.shape[0]
+
+    # Infer dimensions from cache
+    if k_cache.ndim == 3:
+        num_kv_heads = k_cache.shape[1]
+        head_dim = k_cache.shape[2]
+    elif k_cache.ndim == 4:
+        num_kv_heads = k_cache.shape[2]
+        head_dim = k_cache.shape[3]
+    else:
+        raise ValueError(f"Unsupported k_cache.ndim={k_cache.ndim}")
+
+    # Determine target dtype and storage dtype
+    # See: python/sglang/srt/mem_cache/memory_pool.py:445-449
+    store_dtype = k_cache.dtype
+    if store_dtype == torch.uint8:
+        # Cache is stored as uint8 for FP8 (due to index_put limitation)
+        dtype = torch.float8_e4m3fn  # Logical dtype
+    else:
+        dtype = store_dtype  # Cache dtype is the logical dtype
+
+    # Replicate the original set_kv_buffer behavior
+    # See: python/sglang/srt/mem_cache/memory_pool.py:777-799
+    if k.dtype != dtype:
+        # Need quantization - clone first to avoid modifying input
+        k = k.clone()
+        v = v.clone()
+
+        if k_scale is not None:
+            k.div_(k_scale)  # In-place division
+        if v_scale is not None:
+            v.div_(v_scale)  # In-place division
+
+        k = k.to(dtype)
+        v = v.to(dtype)
+
+    # View FP8 as uint8 if needed (for index_put compatibility)
+    if store_dtype == torch.uint8 and dtype in (torch.float8_e5m2, torch.float8_e4m3fn):
+        k = k.view(torch.uint8)
+        v = v.view(torch.uint8)
+
+    # Reshape from [T, H*D] to [T, H, D]
+    k = k.view(num_tokens, num_kv_heads, head_dim)
+    v = v.view(num_tokens, num_kv_heads, head_dim)
+
+    # Write to cache using advanced indexing (same as original)
+    if k_cache.ndim == 3:
+        # 3D cache: [total_slots, H, D]
+        k_cache[cache_loc] = k
+        v_cache[cache_loc] = v
+    else:
+        # 4D cache: [num_pages, page_size, H, D]
+        # Decompose loc into page_id and page_offset (vectorized)
+        page_ids = cache_loc // page_size
+        page_offsets = cache_loc % page_size
+        k_cache[page_ids, page_offsets] = k
+        v_cache[page_ids, page_offsets] = v
diff --git a/sglang/python/sglang/srt/layers/attention/trtllm_mha_backend.py b/sglang/python/sglang/srt/layers/attention/trtllm_mha_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..09f3f409a1feeeb3c1e41c7dd9c50285e134fcf5
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/trtllm_mha_backend.py
@@ -0,0 +1,942 @@
+from __future__ import annotations
+
+"""
+Support attention backend for TRTLLM MHA kernels from flashinfer.
+The kernel supports sm100 only, with sliding window and attention sink features.
+"""
+
+import logging
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Optional
+
+import torch
+
+from sglang.srt.environ import envs
+from sglang.srt.layers.attention.flashinfer_backend import (
+    FlashInferAttnBackend,
+    FlashInferMultiStepDraftBackend,
+)
+from sglang.srt.layers.attention.triton_ops.trtllm_fp8_kv_kernel import (
+    fused_fp8_set_kv_buffer,
+)
+from sglang.srt.layers.attention.utils import canonicalize_stride
+from sglang.srt.mem_cache.swa_memory_pool import SWAKVPool, SWATokenToKVPoolAllocator
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+from sglang.srt.utils import is_flashinfer_available
+from sglang.srt.utils.common import is_sm90_supported, is_sm120_supported
+
+logger = logging.getLogger(__name__)
+
+if is_flashinfer_available():
+    import flashinfer
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.radix_attention import RadixAttention
+    from sglang.srt.model_executor.model_runner import ModelRunner
+    from sglang.srt.speculative.spec_info import SpecInput
+
+# Constants
+# Default workspace size in MB for TRTLLM MHA
+# Can be configured via SGLANG_FLASHINFER_WORKSPACE_SIZE environment variable
+DEFAULT_WORKSPACE_SIZE_MB = 512
+
+# Reuse this workspace buffer across all TRTLLM MHA wrappers
+global_zero_init_workspace_buffer = None
+
+
+@dataclass
+class TRTLLMMHAMetadata:
+    # Sequence lengths for the forward batch
+    cache_seqlens_int32: torch.Tensor = None
+    # Maximum sequence length for query
+    max_seq_len_q: int = 1
+    # Maximum sequence length for key
+    max_seq_len_k: int = 0
+    # Cumulative sequence lengths for `query
+    cu_seqlens_q: torch.Tensor = None
+    # Cumulative sequence lengths for key
+    cu_seqlens_k: torch.Tensor = None
+    # Page table, the index of KV Cache Tables/Blocks
+    page_table: torch.Tensor = None
+    # Page table for SWA layers (translated from full pool indices to SWA pool indices)
+    swa_page_table: torch.Tensor = None
+
+
+class TRTLLMHAAttnBackend(FlashInferAttnBackend):
+    """TRTLLM MHA attention kernel from flashinfer."""
+
+    def __init__(
+        self,
+        model_runner: ModelRunner,
+        skip_prefill: bool = False,
+        kv_indptr_buf: Optional[torch.Tensor] = None,
+        kv_last_page_len_buf: Optional[torch.Tensor] = None,
+        speculative_step_id: int = 0,
+    ):
+        # Capture workspace size before super().__init__() to preserve user's
+        # SGLANG_FLASHINFER_WORKSPACE_SIZE setting (may be overridden by parent)
+        env_var = envs.SGLANG_FLASHINFER_WORKSPACE_SIZE
+        workspace_size_bytes = (
+            env_var.get()
+            if env_var.is_set()
+            else DEFAULT_WORKSPACE_SIZE_MB * 1024 * 1024
+        )
+
+        super().__init__(
+            model_runner, skip_prefill, kv_indptr_buf, kv_last_page_len_buf
+        )
+
+        config = model_runner.model_config
+
+        # MHA-specific dimensions
+        self.max_context_len = model_runner.model_config.context_len
+        self.hidden_size = config.hidden_size
+
+        # Runtime parameters
+        self.data_type = model_runner.kv_cache_dtype
+        self.q_data_type = model_runner.dtype
+        self.page_size = model_runner.page_size
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+        self.device = model_runner.device
+
+        # Workspace allocation
+        self.workspace_size = workspace_size_bytes
+        # Allocate buffers
+        global global_zero_init_workspace_buffer
+        if global_zero_init_workspace_buffer is None:
+            global_zero_init_workspace_buffer = torch.zeros(
+                self.workspace_size,
+                dtype=torch.uint8,
+                device=model_runner.device,
+            )
+        self.workspace_buffer = global_zero_init_workspace_buffer
+
+        # CUDA graph state
+        self.decode_cuda_graph_metadata = {}
+
+        # Speculative decoding
+        # Only support topk <= 1 for now.
+        self.topk = model_runner.server_args.speculative_eagle_topk or 0
+        self.speculative_step_id = speculative_step_id
+        self.target_verify_metadata = {}
+
+        self.speculative_num_draft_tokens = (
+            model_runner.server_args.speculative_num_draft_tokens
+        )
+
+        # Sliding Window Attention(SWA) hybrid model support.
+        # For hybrid SWA models, the KV cache is split into two pools (full and SWA)
+        # with separate index spaces. We maintain a translated page_table for SWA
+        # layers so the trtllm kernel reads from the correct pool.
+        allocator = model_runner.token_to_kv_pool_allocator
+        self.use_sliding_window_kv_pool = isinstance(
+            allocator, SWATokenToKVPoolAllocator
+        )
+        self._swa_kv_pool: Optional[SWAKVPool] = (
+            allocator.get_kvcache() if self.use_sliding_window_kv_pool else None
+        )
+
+        # Forward metadata
+        self.forward_metadata: Optional[TRTLLMMHAMetadata] = None
+
+        # Init backend (XQA or TRTLLM-GEN)
+        # We need to specify q_type and out_type for different backend
+        # XQA: (q_type must be bf16)
+        #   KV bf16: q_type = bf16, out_type=model_runner.dtype
+        #   KV fp8: q_type = bf16, out_type=model_runner.dtype
+        # TRTLLM-GEN:
+        #   KV bf16: q_type = bf16, out_type=model_runner.dtype
+        #   KV fp8: q_type = fp8, out_type=model_runner.dtype
+        self.is_xqa_impl = is_sm90_supported() or is_sm120_supported()
+
+    def _maybe_translate_swa(
+        self, token_indices: torch.Tensor
+    ) -> Optional[torch.Tensor]:
+        """Translate full-pool token indices to SWA-pool indices, or return None."""
+        if not self.use_sliding_window_kv_pool:
+            return None
+        shape = token_indices.shape
+        return self._swa_kv_pool.translate_loc_from_full_to_swa(
+            token_indices.reshape(-1)
+        ).reshape(shape)
+
+    def _alloc_swa_page_table(
+        self, max_bs: int, max_num_pages: int
+    ) -> Optional[torch.Tensor]:
+        """Allocate a SWA page_table buffer, or return None for non-SWA models."""
+        if not self.use_sliding_window_kv_pool:
+            return None
+        return torch.zeros(max_bs, max_num_pages, dtype=torch.int32, device=self.device)
+
+    def _copy_swa_page_table(
+        self,
+        metadata: TRTLLMMHAMetadata,
+        page_indices: torch.Tensor,
+        num_pages: int,
+    ):
+        """Translate and copy SWA page indices into metadata. No-op for non-SWA."""
+        if metadata.swa_page_table is None:
+            return
+        swa_indices = self._maybe_translate_swa(page_indices)
+        metadata.swa_page_table[:, :num_pages].copy_(swa_indices // self.page_size)
+
+    def _get_layer_cache_loc(
+        self,
+        layer: RadixAttention,
+        cache_loc: torch.Tensor,
+    ) -> torch.Tensor:
+        """Return cache locations in the correct index space for the given layer."""
+        if self.use_sliding_window_kv_pool:
+            _, is_swa = self._swa_kv_pool.layers_mapping[layer.layer_id]
+            if is_swa:
+                return self._swa_kv_pool.translate_loc_from_full_to_swa(cache_loc)
+        return cache_loc
+
+    def _bind_swa_page_table(
+        self, metadata: TRTLLMMHAMetadata, source: dict, key: str, bs: int
+    ):
+        """Bind a pre-allocated SWA page_table slice to metadata for CUDA graph."""
+        buf = source.get(key)
+        if buf is not None:
+            metadata.swa_page_table = buf[:bs, :]
+
+    def _get_layer_page_table(
+        self, layer: RadixAttention, forward_batch: ForwardBatch
+    ) -> torch.Tensor:
+        """Return the correct page_table for the given layer (SWA or full)."""
+        swa_pt = self.forward_metadata.swa_page_table
+        if swa_pt is not None:
+            _, is_swa = self._swa_kv_pool.layers_mapping[layer.layer_id]
+            if is_swa:
+                return swa_pt
+        return self.forward_metadata.page_table
+
+    def init_cuda_graph_state(
+        self,
+        max_bs: int,
+        max_num_tokens: int,
+        kv_indices_buf: Optional[torch.Tensor] = None,
+    ):
+        """Initialize CUDA graph state for TRTLLM MHA."""
+        max_num_pages = (self.max_context_len + self.page_size - 1) // self.page_size
+        self.decode_cuda_graph_metadata = {
+            "cache_seqlens": torch.zeros(max_bs, dtype=torch.int32, device=self.device),
+            "page_table": torch.zeros(
+                max_bs,
+                max_num_pages,
+                dtype=torch.int32,
+                device=self.device,
+            ),
+            "swa_page_table": self._alloc_swa_page_table(max_bs, max_num_pages),
+            "strided_indices": torch.arange(
+                0, self.max_context_len, self.page_size, device=self.device
+            ),
+        }
+
+        if (
+            self.speculative_num_draft_tokens is not None
+            and self.speculative_num_draft_tokens > 0
+        ):
+            self.decode_cuda_graph_metadata["cu_seqlens_q"] = torch.arange(
+                0, max_bs + 1, dtype=torch.int32, device=self.device
+            )
+            self.decode_cuda_graph_metadata["cu_seqlens_k"] = torch.zeros(
+                max_bs + 1, dtype=torch.int32, device=self.device
+            )
+            self.decode_cuda_graph_metadata["page_table_draft_decode"] = torch.zeros(
+                max_bs,
+                max_num_pages,
+                dtype=torch.int32,
+                device=self.device,
+            )
+            self.decode_cuda_graph_metadata["swa_page_table_draft_decode"] = (
+                self._alloc_swa_page_table(max_bs, max_num_pages)
+            )
+
+            self.target_verify_metadata = {
+                "cache_seqlens": torch.zeros(
+                    max_bs, dtype=torch.int32, device=self.device
+                ),
+                "cu_seqlens_q": torch.arange(
+                    0,
+                    max_bs * self.speculative_num_draft_tokens + 1,
+                    step=self.speculative_num_draft_tokens,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+                "cu_seqlens_k": torch.zeros(
+                    max_bs + 1, dtype=torch.int32, device=self.device
+                ),
+                "page_table": torch.zeros(
+                    max_bs,
+                    max_num_pages,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+                "swa_page_table": self._alloc_swa_page_table(max_bs, max_num_pages),
+                "strided_indices": torch.arange(
+                    0, self.max_context_len, self.page_size, device=self.device
+                ),
+            }
+
+            self.draft_extend_metadata = {
+                "cache_seqlens": torch.zeros(
+                    max_bs, dtype=torch.int32, device=self.device
+                ),
+                "cu_seqlens_q": torch.zeros(
+                    max_bs + 1,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+                "cu_seqlens_k": torch.zeros(
+                    max_bs + 1, dtype=torch.int32, device=self.device
+                ),
+                "page_table": torch.zeros(
+                    max_bs,
+                    max_num_pages,
+                    dtype=torch.int32,
+                    device=self.device,
+                ),
+                "swa_page_table": self._alloc_swa_page_table(max_bs, max_num_pages),
+                "strided_indices": torch.arange(
+                    0, self.max_context_len, self.page_size, device=self.device
+                ),
+            }
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInput],
+    ):
+        """Initialize metadata for CUDA graph capture."""
+        metadata = TRTLLMMHAMetadata()
+        device = seq_lens.device
+
+        if forward_mode.is_decode_or_idle():
+            if spec_info is not None:
+                # Draft Decode
+                # Here we only support topk = 1 for now.
+                metadata.cache_seqlens_int32 = self.decode_cuda_graph_metadata[
+                    "cache_seqlens"
+                ][:bs]
+                metadata.max_seq_len_k = seq_lens.max().item() + (
+                    self.speculative_step_id + 1
+                )
+                metadata.cu_seqlens_q = self.decode_cuda_graph_metadata["cu_seqlens_q"][
+                    : bs + 1
+                ]
+                metadata.cu_seqlens_k = torch.nn.functional.pad(
+                    torch.cumsum(
+                        metadata.cache_seqlens_int32, dim=0, dtype=torch.int32
+                    ),
+                    (1, 0),
+                )
+                metadata.page_table = self.decode_cuda_graph_metadata[
+                    "page_table_draft_decode"
+                ][:bs, :]
+                self._bind_swa_page_table(
+                    metadata,
+                    self.decode_cuda_graph_metadata,
+                    "swa_page_table_draft_decode",
+                    bs,
+                )
+                self.decode_cuda_graph_metadata[bs] = metadata
+            else:
+                # Normal Decode
+                # Get sequence information
+                metadata.cache_seqlens_int32 = seq_lens[:bs].to(torch.int32)
+                batch_size = len(seq_lens)
+                metadata.cu_seqlens_k = torch.nn.functional.pad(
+                    torch.cumsum(seq_lens, dim=0, dtype=torch.int32), (1, 0)
+                )
+
+                # Precompute maximum sequence length
+                metadata.max_seq_len_k = seq_lens.max().item()
+                # Precompute cumulative sequence lengths
+                metadata.cu_seqlens_q = torch.arange(
+                    0, batch_size + 1, dtype=torch.int32, device=device
+                )
+                # Precompute page table
+                metadata.page_table = self.decode_cuda_graph_metadata["page_table"][
+                    :bs, :
+                ]
+                self._bind_swa_page_table(
+                    metadata,
+                    self.decode_cuda_graph_metadata,
+                    "swa_page_table",
+                    bs,
+                )
+                self.decode_cuda_graph_metadata[bs] = metadata
+        elif forward_mode.is_target_verify():
+            # Target Verify
+            # Here we only support topk = 1 for now.
+            metadata.cache_seqlens_int32 = self.target_verify_metadata["cache_seqlens"][
+                :bs
+            ]
+            metadata.cache_seqlens_int32.copy_(
+                (seq_lens + self.speculative_num_draft_tokens)
+            )
+
+            metadata.cu_seqlens_q = torch.arange(
+                0,
+                bs * self.speculative_num_draft_tokens + 1,
+                self.speculative_num_draft_tokens,
+                dtype=torch.int32,
+                device=device,
+            )
+
+            metadata.cu_seqlens_k = self.target_verify_metadata["cu_seqlens_k"][
+                : (bs + 1)
+            ]
+
+            metadata.max_seq_len_q = self.speculative_num_draft_tokens
+            metadata.max_seq_len_k = (
+                seq_lens.max().item() + self.speculative_num_draft_tokens
+            )
+
+            metadata.page_table = self.target_verify_metadata["page_table"][:bs, :]
+            self._bind_swa_page_table(
+                metadata,
+                self.target_verify_metadata,
+                "swa_page_table",
+                bs,
+            )
+
+            self.target_verify_metadata[bs] = metadata
+        elif forward_mode.is_draft_extend():
+            metadata.cache_seqlens_int32 = self.draft_extend_metadata["cache_seqlens"][
+                :bs
+            ]
+            metadata.cache_seqlens_int32.copy_(seq_lens)
+            num_tokens_per_bs = num_tokens // bs
+            metadata.cu_seqlens_q = torch.arange(
+                0,
+                bs * num_tokens_per_bs + 1,
+                num_tokens_per_bs,
+                dtype=torch.int32,
+                device=device,
+            )
+
+            metadata.cu_seqlens_k = self.draft_extend_metadata["cu_seqlens_k"][
+                : (bs + 1)
+            ]
+            num_tokens_per_bs = num_tokens // bs
+            metadata.max_seq_len_q = num_tokens_per_bs
+            metadata.max_seq_len_k = seq_lens.max().item()
+
+            metadata.page_table = self.draft_extend_metadata["page_table"][:bs, :]
+            self._bind_swa_page_table(
+                metadata,
+                self.draft_extend_metadata,
+                "swa_page_table",
+                bs,
+            )
+
+            self.draft_extend_metadata[bs] = metadata
+        self.forward_metadata = metadata
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInput],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+        """Replay CUDA graph with new inputs."""
+        seq_lens = seq_lens[:bs]
+        seq_lens_cpu = seq_lens_cpu[:bs]
+        req_pool_indices = req_pool_indices[:bs]
+        metadata = None
+        if forward_mode.is_decode_or_idle():
+            if spec_info is not None:
+                # Draft Decode
+                # Here we only support topk = 1 for now.
+                metadata = self.decode_cuda_graph_metadata[bs]
+                max_len = seq_lens_cpu.max().item()
+                metadata.max_seq_len_k = max_len + self.speculative_step_id + 1
+
+                max_seq_pages = (
+                    metadata.max_seq_len_k + self.page_size - 1
+                ) // self.page_size
+
+                metadata.cache_seqlens_int32.copy_(
+                    seq_lens + self.speculative_step_id + 1
+                )
+            else:
+                # Normal Decode
+                metadata = self.decode_cuda_graph_metadata[bs]
+                max_len = seq_lens_cpu.max().item()
+                max_seq_pages = (max_len + self.page_size - 1) // self.page_size
+                metadata.max_seq_len_k = max_len
+
+                metadata.cache_seqlens_int32.copy_(seq_lens)
+
+            metadata.cu_seqlens_k[1:].copy_(
+                torch.cumsum(metadata.cache_seqlens_int32, dim=0, dtype=torch.int32)
+            )
+            page_indices = self.req_to_token[
+                req_pool_indices[:, None],
+                self.decode_cuda_graph_metadata["strided_indices"][:max_seq_pages][
+                    None, :
+                ],
+            ]
+            metadata.page_table[:, :max_seq_pages].copy_(page_indices // self.page_size)
+            self._copy_swa_page_table(metadata, page_indices, max_seq_pages)
+        elif forward_mode.is_target_verify():
+            # Here we only support topk = 1 for now.
+            metadata = self.target_verify_metadata[bs]
+            metadata.cache_seqlens_int32.copy_(
+                (seq_lens + self.speculative_num_draft_tokens)
+            )
+
+            metadata.max_seq_len_k = (
+                seq_lens_cpu.max().item() + self.speculative_num_draft_tokens
+            )
+            max_len = seq_lens_cpu.max().item()
+            metadata.cu_seqlens_k[1:].copy_(
+                torch.cumsum(metadata.cache_seqlens_int32, dim=0, dtype=torch.int32)
+            )
+            max_seq_pages = (
+                metadata.max_seq_len_k + self.page_size - 1
+            ) // self.page_size
+            page_indices = self.req_to_token[
+                req_pool_indices[:, None],
+                self.decode_cuda_graph_metadata["strided_indices"][:max_seq_pages],
+            ]
+            metadata.page_table[:, :max_seq_pages].copy_(page_indices // self.page_size)
+            self._copy_swa_page_table(metadata, page_indices, max_seq_pages)
+            metadata.max_seq_len_q = self.speculative_num_draft_tokens
+        elif forward_mode.is_draft_extend():
+            metadata = self.draft_extend_metadata[bs]
+            metadata.cache_seqlens_int32.copy_(seq_lens)
+
+            metadata.max_seq_len_k = seq_lens_cpu.max().item()
+            max_len = seq_lens_cpu.max().item()
+            metadata.cu_seqlens_k[1:].copy_(
+                torch.cumsum(metadata.cache_seqlens_int32, dim=0, dtype=torch.int32)
+            )
+            accept_length = spec_info.accept_length[:bs]
+            if spec_info.accept_length_cpu:
+                metadata.max_seq_len_q = max(spec_info.accept_length_cpu) + 1
+            else:
+                metadata.max_seq_len_q = 1
+
+            metadata.cu_seqlens_q[1:].copy_(
+                torch.cumsum(accept_length, dim=0, dtype=torch.int32)
+            )
+
+            max_seq_pages = (
+                metadata.max_seq_len_k + self.page_size - 1
+            ) // self.page_size
+            page_indices = self.req_to_token[
+                req_pool_indices[:, None],
+                self.draft_extend_metadata["strided_indices"][:max_seq_pages],
+            ]
+            metadata.page_table[:, :max_seq_pages].copy_(page_indices // self.page_size)
+            self._copy_swa_page_table(metadata, page_indices, max_seq_pages)
+        self.forward_metadata = metadata
+
+    def get_cuda_graph_seq_len_fill_value(self) -> int:
+        """Get the fill value for sequence lengths in CUDA graph."""
+        return 1
+
+    def _should_use_fused_fp8_path(self, save_kv_cache: bool, k: torch.Tensor) -> bool:
+        """Check if we should use the fused FP8 KV cache write path."""
+        return save_kv_cache and k is not None and self.data_type == torch.float8_e4m3fn
+
+    def _fused_fp8_set_kv_buffer(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        **kwargs,
+    ):
+        """Fused FP8 quantization and KV cache write."""
+        cache_loc = self._get_layer_cache_loc(layer, forward_batch.out_cache_loc)
+
+        # Get K/V cache buffers from token_to_kv_pool
+        k_cache, v_cache = forward_batch.token_to_kv_pool.get_kv_buffer(layer.layer_id)
+
+        fused_fp8_set_kv_buffer(
+            k=k,
+            v=v,
+            k_cache=k_cache,
+            v_cache=v_cache,
+            cache_loc=cache_loc,
+            k_scale=layer.k_scale,  # May be None
+            v_scale=layer.v_scale,  # May be None
+            page_size=self.page_size,
+        )
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        """Initialize the metadata for a forward pass."""
+
+        metadata = TRTLLMMHAMetadata()
+        seqlens_in_batch = forward_batch.seq_lens
+        batch_size = forward_batch.batch_size
+        device = seqlens_in_batch.device
+
+        if forward_batch.forward_mode.is_decode_or_idle():
+            if forward_batch.spec_info is not None:
+                # Draft Decode
+                # Here we only support topk = 1 for now.
+                metadata.cache_seqlens_int32 = (
+                    seqlens_in_batch + (self.speculative_step_id + 1)
+                ).to(torch.int32)
+                metadata.max_seq_len_k = forward_batch.seq_lens_cpu.max().item() + (
+                    self.speculative_step_id + 1
+                )
+                metadata.cu_seqlens_q = torch.arange(
+                    0, batch_size + 1, dtype=torch.int32, device=device
+                )
+                metadata.cu_seqlens_k = torch.nn.functional.pad(
+                    torch.cumsum(
+                        metadata.cache_seqlens_int32, dim=0, dtype=torch.int32
+                    ),
+                    (1, 0),
+                )
+                metadata.page_table = forward_batch.req_to_token_pool.req_to_token[
+                    forward_batch.req_pool_indices, : metadata.max_seq_len_k
+                ]
+            else:
+                # Normal Decode
+                metadata.cache_seqlens_int32 = seqlens_in_batch.to(torch.int32)
+                metadata.max_seq_len_k = forward_batch.seq_lens_cpu.max().item()
+                metadata.cu_seqlens_q = torch.arange(
+                    0, batch_size + 1, dtype=torch.int32, device=device
+                )
+                metadata.cu_seqlens_k = torch.nn.functional.pad(
+                    torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)
+                )
+                metadata.page_table = forward_batch.req_to_token_pool.req_to_token[
+                    forward_batch.req_pool_indices, : metadata.max_seq_len_k
+                ]
+        elif forward_batch.forward_mode.is_target_verify():
+            # Only support topk = 1 for now.
+            metadata.cache_seqlens_int32 = (
+                forward_batch.seq_lens + self.speculative_num_draft_tokens
+            ).to(torch.int32)
+            metadata.max_seq_len_q = self.speculative_num_draft_tokens
+            metadata.max_seq_len_k = (
+                forward_batch.seq_lens_cpu.max().item()
+                + self.speculative_num_draft_tokens
+            )
+            metadata.cu_seqlens_q = torch.arange(
+                0,
+                batch_size * self.speculative_num_draft_tokens + 1,
+                self.speculative_num_draft_tokens,
+                dtype=torch.int32,
+                device=device,
+            )
+            metadata.cu_seqlens_k = torch.nn.functional.pad(
+                torch.cumsum(metadata.cache_seqlens_int32, dim=0, dtype=torch.int32),
+                (1, 0),
+            )
+            metadata.page_table = forward_batch.req_to_token_pool.req_to_token[
+                forward_batch.req_pool_indices, : metadata.max_seq_len_k
+            ]
+
+        else:
+            metadata.cache_seqlens_int32 = seqlens_in_batch.to(torch.int32)
+            metadata.max_seq_len_k = forward_batch.seq_lens_cpu.max().item()
+            metadata.cu_seqlens_k = torch.nn.functional.pad(
+                torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)
+            )
+            metadata.page_table = forward_batch.req_to_token_pool.req_to_token[
+                forward_batch.req_pool_indices, : metadata.max_seq_len_k
+            ]
+
+            if any(
+                forward_batch.extend_prefix_lens_cpu
+            ) or forward_batch.forward_mode.is_draft_extend(include_v2=True):
+                extend_seq_lens = forward_batch.extend_seq_lens
+                # NOTE: in piecewise CUDA graph warmup, extend_seq_lens_cpu is a torch.Tensor;
+                # Python's max() returns a 0-d tensor, but flashinfer expects an int.
+                max_q = max(forward_batch.extend_seq_lens_cpu)
+                metadata.max_seq_len_q = (
+                    int(max_q.item()) if isinstance(max_q, torch.Tensor) else int(max_q)
+                )
+                metadata.cu_seqlens_q = torch.nn.functional.pad(
+                    torch.cumsum(extend_seq_lens, dim=0, dtype=torch.int32), (1, 0)
+                )
+            else:
+                metadata.max_seq_len_q = metadata.max_seq_len_k
+                metadata.cu_seqlens_q = metadata.cu_seqlens_k
+
+        # Compute SWA page table (None for non-SWA models)
+        metadata.swa_page_table = self._maybe_translate_swa(metadata.page_table)
+
+        # Convert the page tables to a strided format
+        if self.page_size > 1:
+            self.strided_indices = torch.arange(
+                0, metadata.page_table.shape[1], self.page_size, device=self.device
+            )
+            metadata.page_table = (
+                metadata.page_table[:, self.strided_indices] // self.page_size
+            )
+            if metadata.swa_page_table is not None:
+                metadata.swa_page_table = (
+                    metadata.swa_page_table[:, self.strided_indices] // self.page_size
+                )
+
+        self.forward_metadata = metadata
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+        **kwargs,
+    ) -> torch.Tensor:
+        """Run forward for decode using TRTLLM MHA kernel."""
+        cache_loc = forward_batch.out_cache_loc
+
+        use_fused_fp8_path = self._should_use_fused_fp8_path(save_kv_cache, k)
+
+        if use_fused_fp8_path:
+            # Use fused FP8 quantization + KV cache write path
+            self._fused_fp8_set_kv_buffer(
+                q=q,
+                k=k,
+                v=v,
+                layer=layer,
+                forward_batch=forward_batch,
+            )
+            k = None
+            v = None
+        else:
+            # Use original set_kv_buffer path
+            if save_kv_cache and k is not None:
+                forward_batch.token_to_kv_pool.set_kv_buffer(
+                    layer, cache_loc, k, v, layer.k_scale, layer.v_scale
+                )
+
+        # For XQA, q_dtype should be bf16
+        if self.data_type == torch.float8_e4m3fn and (not self.is_xqa_impl):
+            q = q.to(torch.float8_e4m3fn)
+        q = q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim)
+        k_cache, v_cache = forward_batch.token_to_kv_pool.get_kv_buffer(layer.layer_id)
+        # shape conversion:
+        # [num_pages, page_size, num_kv_heads, head_dim] -> [num_pages, num_kv_heads, page_size, head_dim]
+        k_cache = k_cache.view(
+            -1, self.page_size, layer.tp_k_head_num, layer.head_dim
+        ).permute(0, 2, 1, 3)
+        v_cache = v_cache.view(
+            -1, self.page_size, layer.tp_v_head_num, layer.head_dim
+        ).permute(0, 2, 1, 3)
+
+        if layer.tp_k_head_num == 1:
+            k_cache = canonicalize_stride(k_cache)
+        if layer.tp_v_head_num == 1:
+            v_cache = canonicalize_stride(v_cache)
+
+        kv_cache = (k_cache, v_cache)
+
+        # TODO: add support for quantization
+        q_scale = 1.0
+        k_scale = (
+            layer.k_scale_float
+            if getattr(layer, "k_scale_float", None) is not None
+            else 1.0
+        )
+        bmm1_scale = q_scale * k_scale * layer.scaling
+        bmm2_scale = 1.0
+        # sink: additional value per head in the denominator of the softmax.
+        attention_sink = kwargs.get("sinks", None)
+
+        page_table = self._get_layer_page_table(layer, forward_batch)
+
+        # Call TRT-LLM kernel
+        # raw_out: like q, [bs, acc_q_len, num_q_heads, head_dim] but with output dtype
+        o = flashinfer.decode.trtllm_batch_decode_with_kv_cache(
+            query=q,
+            kv_cache=kv_cache,
+            workspace_buffer=self.workspace_buffer,
+            block_tables=page_table,
+            seq_lens=self.forward_metadata.cache_seqlens_int32,
+            max_seq_len=self.max_context_len,
+            bmm1_scale=bmm1_scale,
+            bmm2_scale=bmm2_scale,
+            window_left=layer.sliding_window_size,
+            sinks=attention_sink,
+            out_dtype=self.q_data_type,  # model_runner.dtype
+        )
+
+        return o.view(-1, layer.tp_q_head_num * layer.head_dim)
+
+    def forward_extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+        **kwargs,
+    ):
+        cache_loc = forward_batch.out_cache_loc
+
+        use_fused_fp8_path = self._should_use_fused_fp8_path(save_kv_cache, k)
+
+        if use_fused_fp8_path:
+            # Use fused FP8 quantization + KV cache write path
+            self._fused_fp8_set_kv_buffer(
+                q=q,
+                k=k,
+                v=v,
+                layer=layer,
+                forward_batch=forward_batch,
+            )
+            k = None
+            v = None
+        else:
+            # Use original set_kv_buffer path
+            if save_kv_cache and k is not None:
+                forward_batch.token_to_kv_pool.set_kv_buffer(
+                    layer, cache_loc, k, v, layer.k_scale, layer.v_scale
+                )
+
+        if self.data_type == torch.float8_e4m3fn:
+            q = q.to(torch.float8_e4m3fn)
+        q = q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim)
+        # [num_pages, page_size, num_kv_heads, head_dim] -> [num_pages, num_kv_heads, page_size, head_dim]
+        k_cache, v_cache = forward_batch.token_to_kv_pool.get_kv_buffer(layer.layer_id)
+        k_cache = k_cache.view(
+            -1, self.page_size, layer.tp_k_head_num, layer.head_dim
+        ).permute(0, 2, 1, 3)
+        v_cache = v_cache.view(
+            -1, self.page_size, layer.tp_v_head_num, layer.head_dim
+        ).permute(0, 2, 1, 3)
+
+        if layer.tp_k_head_num == 1:
+            k_cache = canonicalize_stride(k_cache)
+        if layer.tp_v_head_num == 1:
+            v_cache = canonicalize_stride(v_cache)
+
+        kv_cache = (k_cache, v_cache)
+
+        # sink: additional value per head in the denominator of the softmax.
+        attention_sink = kwargs.get("sinks", None)
+        # TODO: add support for quantization
+        q_scale = 1.0
+        k_scale = (
+            layer.k_scale_float
+            if getattr(layer, "k_scale_float", None) is not None
+            else 1.0
+        )
+        bmm1_scale = q_scale * k_scale * layer.scaling
+        bmm2_scale = 1.0
+
+        page_table = self._get_layer_page_table(layer, forward_batch)
+
+        if forward_batch.forward_mode.is_target_verify():
+            o = flashinfer.decode.trtllm_batch_decode_with_kv_cache(
+                query=q,
+                kv_cache=kv_cache,
+                workspace_buffer=self.workspace_buffer,
+                block_tables=page_table,
+                seq_lens=self.forward_metadata.cache_seqlens_int32,
+                max_seq_len=self.max_context_len,
+                bmm1_scale=bmm1_scale,
+                bmm2_scale=bmm2_scale,
+                window_left=layer.sliding_window_size,
+                sinks=attention_sink,
+                out_dtype=self.q_data_type,  # model_runner.dtype
+                q_len_per_req=self.forward_metadata.max_seq_len_q,
+            )
+        else:
+            o = flashinfer.prefill.trtllm_batch_context_with_kv_cache(
+                query=q,
+                kv_cache=kv_cache,
+                workspace_buffer=self.workspace_buffer,
+                block_tables=page_table,
+                seq_lens=self.forward_metadata.cache_seqlens_int32,
+                max_q_len=self.forward_metadata.max_seq_len_q,
+                max_kv_len=self.max_context_len,
+                bmm1_scale=bmm1_scale,
+                bmm2_scale=bmm2_scale,
+                batch_size=forward_batch.batch_size,
+                cum_seq_lens_q=self.forward_metadata.cu_seqlens_q,
+                cum_seq_lens_kv=self.forward_metadata.cu_seqlens_k,
+                window_left=layer.sliding_window_size,
+                sinks=attention_sink,
+                out_dtype=self.q_data_type,  # model_runner.dtype
+            )
+
+        return o.view(-1, layer.tp_q_head_num * layer.head_dim)
+
+
+class TRTLLMHAAttnMultiStepDraftBackend(FlashInferMultiStepDraftBackend):
+    """Multi-step TRTLLM MHA attention kernel used by EAGLE."""
+
+    def __init__(
+        self, model_runner: ModelRunner, topk: int, speculative_num_steps: int
+    ):
+        super().__init__(model_runner, topk, speculative_num_steps)
+        for i in range(self.speculative_num_steps - 1):
+            self.attn_backends[i] = TRTLLMHAAttnBackend(
+                model_runner,
+                skip_prefill=True,
+                kv_indptr_buf=self.kv_indptr[i],
+                kv_last_page_len_buf=self.kv_last_page_len,
+                speculative_step_id=i,
+            )
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        for i in range(self.speculative_num_steps - 1):
+            self.attn_backends[i].init_forward_metadata(forward_batch)
+
+    def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
+        for i in range(self.speculative_num_steps - 1):
+            self.attn_backends[i].init_cuda_graph_state(max_bs, max_num_tokens)
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        forward_batch: ForwardBatch,
+    ):
+        assert forward_batch.spec_info is not None
+        assert forward_batch.spec_info.is_draft_input()
+
+        for i in range(self.speculative_num_steps - 1):
+            self.attn_backends[i].init_forward_metadata_capture_cuda_graph(
+                forward_batch.batch_size,
+                forward_batch.batch_size * self.topk,
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                encoder_lens=forward_batch.encoder_lens,
+                forward_mode=ForwardMode.DECODE,
+                spec_info=forward_batch.spec_info,
+            )
+
+    def init_forward_metadata_replay_cuda_graph(
+        self, forward_batch: ForwardBatch, bs: int
+    ):
+        assert forward_batch.spec_info is not None
+        assert forward_batch.spec_info.is_draft_input()
+
+        for i in range(self.speculative_num_steps - 1):
+
+            self.attn_backends[i].init_forward_metadata_replay_cuda_graph(
+                bs,
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                forward_batch.seq_lens_sum,
+                encoder_lens=forward_batch.encoder_lens,
+                forward_mode=ForwardMode.DECODE,
+                spec_info=forward_batch.spec_info,
+                seq_lens_cpu=forward_batch.seq_lens_cpu,
+            )
diff --git a/sglang/python/sglang/srt/layers/attention/trtllm_mla_backend.py b/sglang/python/sglang/srt/layers/attention/trtllm_mla_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..b74124da25ea90db10a6df42137cb83524c69329
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/trtllm_mla_backend.py
@@ -0,0 +1,1164 @@
+from __future__ import annotations
+
+"""
+Support attention backend for TRTLLM MLA kernels from flashinfer.
+"""
+
+import logging
+import math
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Optional, Union
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.compilation.piecewise_context_manager import is_in_piecewise_cuda_graph
+from sglang.srt.layers.attention.flashinfer_mla_backend import (
+    FlashInferMLAAttnBackend,
+    FlashInferMLAMultiStepDraftBackend,
+)
+from sglang.srt.layers.attention.utils import (
+    concat_mla_absorb_q_general,
+    create_flashmla_kv_indices_triton,
+    get_num_page_per_block_flashmla,
+    mla_quantize_and_rope_for_fp8,
+)
+from sglang.srt.layers.dp_attention import get_attention_tp_size
+from sglang.srt.layers.quantization.fp8_kernel import scaled_fp8_quant
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import is_flashinfer_available, is_float4_e2m1fn_x2
+
+if is_flashinfer_available():
+    import flashinfer
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.radix_attention import RadixAttention
+    from sglang.srt.model_executor.model_runner import ModelRunner
+    from sglang.srt.speculative.spec_info import SpecInput
+
+logger = logging.getLogger(__name__)
+
+# Constants
+DEFAULT_WORKSPACE_SIZE_MB = 150  # Memory workspace size in MB
+
+# Block constraint from flashinfer requirements
+# From flashinfer.decode._check_trtllm_gen_mla_shape:
+#   block_num % (128 / block_size) == 0
+# This imposes that the total number of blocks must be divisible by
+# (128 / block_size). We capture the 128 constant here so we can
+# compute the LCM with other padding constraints.
+TRTLLM_BLOCK_CONSTRAINT = 128
+
+
+@triton.jit
+def pad_draft_extend_query_kernel(
+    q_ptr,  # Input query tensor [total_seq_len, num_heads, head_dim]
+    padded_q_ptr,  # Output padded query tensor [batch_size, max_seq_len, num_heads, head_dim]
+    seq_lens_q_ptr,  # Sequence lengths for each sequence [batch_size]
+    cumsum_ptr,  # Cumulative sum of accept lengths [batch_size + 1]
+    batch_size,
+    max_seq_len,
+    num_heads,
+    head_dim,
+    BLOCK_SIZE: tl.constexpr,
+):
+    """Triton kernel for padding draft extended query tensor with parallelized head and dim processing."""
+    # Use 3D program IDs: (batch_seq, head_block, dim_block)
+    batch_seq_pid = tl.program_id(0)
+    head_pid = tl.program_id(1)
+    dim_pid = tl.program_id(2)
+
+    batch_id = batch_seq_pid // max_seq_len
+    seq_pos = batch_seq_pid % max_seq_len
+
+    if batch_id >= batch_size:
+        return
+
+    # Load accept length for this batch
+    seq_len = tl.load(seq_lens_q_ptr + batch_id)
+
+    if seq_pos >= seq_len:
+        return
+
+    # Load cumulative sum to get start position in input tensor
+    input_start = tl.load(cumsum_ptr + batch_id)
+    input_pos = input_start + seq_pos
+
+    # Calculate head and dim block ranges
+    head_start = head_pid * BLOCK_SIZE
+    head_end = tl.minimum(head_start + BLOCK_SIZE, num_heads)
+    head_mask = tl.arange(0, BLOCK_SIZE) < (head_end - head_start)
+
+    dim_start = dim_pid * BLOCK_SIZE
+    dim_end = tl.minimum(dim_start + BLOCK_SIZE, head_dim)
+    dim_mask = tl.arange(0, BLOCK_SIZE) < (dim_end - dim_start)
+
+    # Calculate input offset
+    input_offset = (
+        input_pos * num_heads * head_dim
+        + (head_start + tl.arange(0, BLOCK_SIZE))[:, None] * head_dim
+        + (dim_start + tl.arange(0, BLOCK_SIZE))[None, :]
+    )
+
+    # Load data
+    data = tl.load(
+        q_ptr + input_offset,
+        mask=head_mask[:, None] & dim_mask[None, :],
+        other=0.0,
+    )
+
+    # Calculate output offset
+    output_offset = (
+        batch_id * max_seq_len * num_heads * head_dim
+        + seq_pos * num_heads * head_dim
+        + (head_start + tl.arange(0, BLOCK_SIZE))[:, None] * head_dim
+        + (dim_start + tl.arange(0, BLOCK_SIZE))[None, :]
+    )
+
+    # Store data
+    tl.store(
+        padded_q_ptr + output_offset,
+        data,
+        mask=head_mask[:, None] & dim_mask[None, :],
+    )
+
+
+@triton.jit
+def unpad_draft_extend_output_kernel(
+    raw_out_ptr,  # Input raw output tensor (batch_size, token_per_batch, tp_q_head_num, v_head_dim)
+    output_ptr,  # Output tensor (-1, tp_q_head_num, v_head_dim)
+    accept_length_ptr,  # Accept lengths for each sequence [batch_size]
+    cumsum_ptr,  # Cumulative sum of accept lengths [batch_size + 1]
+    batch_size,
+    token_per_batch,
+    tp_q_head_num,
+    v_head_dim,
+    BLOCK_SIZE: tl.constexpr,
+):
+    """Triton kernel for unpadding draft extended output tensor with parallelized head and dim processing."""
+    batch_seq_pid = tl.program_id(0)
+    head_pid = tl.program_id(1)
+    dim_pid = tl.program_id(2)
+
+    batch_id = batch_seq_pid // token_per_batch
+    seq_pos = batch_seq_pid % token_per_batch
+
+    if batch_id >= batch_size:
+        return
+
+    # Load accept length for this batch
+    accept_len = tl.load(accept_length_ptr + batch_id)
+
+    if seq_pos >= accept_len:
+        return
+
+    # Load cumulative sum to get start position in output tensor
+    output_start = tl.load(cumsum_ptr + batch_id)
+    output_pos = output_start + seq_pos
+
+    # Calculate head and dim block ranges
+    head_start = head_pid * BLOCK_SIZE
+    head_end = tl.minimum(head_start + BLOCK_SIZE, tp_q_head_num)
+    head_mask = tl.arange(0, BLOCK_SIZE) < (head_end - head_start)
+
+    dim_start = dim_pid * BLOCK_SIZE
+    dim_end = tl.minimum(dim_start + BLOCK_SIZE, v_head_dim)
+    dim_mask = tl.arange(0, BLOCK_SIZE) < (dim_end - dim_start)
+
+    # Calculate input offset: (batch_id, seq_pos, head_id, dim_id)
+    input_offset = (
+        batch_id * token_per_batch * tp_q_head_num * v_head_dim
+        + seq_pos * tp_q_head_num * v_head_dim
+        + (head_start + tl.arange(0, BLOCK_SIZE))[:, None] * v_head_dim
+        + (dim_start + tl.arange(0, BLOCK_SIZE))[None, :]
+    )
+
+    # Load data
+    data = tl.load(
+        raw_out_ptr + input_offset,
+        mask=head_mask[:, None] & dim_mask[None, :],
+        other=0.0,
+    )
+
+    output_offset = (
+        output_pos * tp_q_head_num * v_head_dim
+        + (head_start + tl.arange(0, BLOCK_SIZE))[:, None] * v_head_dim
+        + (dim_start + tl.arange(0, BLOCK_SIZE))[None, :]
+    )
+
+    # Store data
+    tl.store(
+        output_ptr + output_offset,
+        data,
+        mask=head_mask[:, None] & dim_mask[None, :],
+    )
+
+
+def _quantize_fp8_qkv(q, k, v, layer):
+    q = q.to(torch.float8_e4m3fn)
+
+    k_scale = getattr(layer, "k_scale_float", None)
+    if k_scale is None:
+        k_scale = 1.0
+    if k_scale != 1.0:
+        assert hasattr(layer, "k_scale"), "k_scale is not set"
+        k_2d, _ = scaled_fp8_quant(
+            k.reshape(-1, k.shape[-1]).contiguous(), layer.k_scale
+        )
+        k = k_2d.reshape(k.shape)
+    else:
+        k = k.to(torch.float8_e4m3fn)
+
+    v_scale = getattr(layer, "v_scale_float", None)
+    if v_scale is None:
+        v_scale = 1.0
+    if v_scale != 1.0:
+        assert hasattr(layer, "v_scale"), "v_scale is not set"
+        v_2d, _ = scaled_fp8_quant(
+            v.reshape(-1, v.shape[-1]).contiguous(), layer.v_scale
+        )
+        v = v_2d.reshape(v.shape)
+    else:
+        v = v.to(torch.float8_e4m3fn)
+
+    return q, k, v, k_scale, v_scale
+
+
+global_zero_init_workspace_buffer = None
+
+
+@dataclass
+class TRTLLMMLAPrefillMetadata:
+    """Metadata for TRTLLM MLA prefill operations."""
+
+    max_seq_len: int
+    cum_seq_lens: torch.Tensor
+    seq_lens: torch.Tensor
+    fallback_to_flashinfer_impl: bool = False
+
+
+@dataclass
+class TRTLLMMLADecodeMetadata:
+    """Metadata for TRTLLM MLA decode operations."""
+
+    block_kv_indices: Optional[torch.Tensor] = None
+    max_seq_len_k: Optional[int] = None
+    max_seq_len_q: Optional[int] = None
+    sum_seq_lens_q: Optional[int] = None
+    cu_seqlens_q: Optional[torch.Tensor] = None
+    seq_lens_q: Optional[torch.Tensor] = None
+    seq_lens_k: Optional[torch.Tensor] = None
+
+
+class TRTLLMMLABackend(FlashInferMLAAttnBackend):
+    """TRTLLM MLA attention kernel from flashinfer."""
+
+    def __init__(
+        self,
+        model_runner: ModelRunner,
+        skip_prefill: bool = False,
+        kv_indptr_buf: Optional[torch.Tensor] = None,
+        q_indptr_decode_buf: Optional[torch.Tensor] = None,
+    ):
+        super().__init__(
+            model_runner,
+            skip_prefill,
+            kv_indptr_buf,
+            q_indptr_decode_buf,
+        )
+
+        config = model_runner.model_config
+
+        # Model parameters
+        self.num_q_heads = config.num_attention_heads // get_attention_tp_size()
+        self.num_kv_heads = config.get_num_kv_heads(get_attention_tp_size())
+        self.num_local_heads = config.num_attention_heads // get_attention_tp_size()
+
+        # MLA-specific dimensions
+        self.kv_lora_rank = config.kv_lora_rank
+        self.qk_nope_head_dim = config.qk_nope_head_dim
+        self.qk_rope_head_dim = config.qk_rope_head_dim
+        self.v_head_dim = config.v_head_dim
+        self.kv_cache_dim = self.kv_lora_rank + self.qk_rope_head_dim
+
+        # Runtime parameters
+        self.scaling = config.scaling
+        self.data_type = model_runner.kv_cache_dtype
+        self.q_data_type = model_runner.dtype
+        self.page_size = model_runner.page_size
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+
+        # Workspace allocation
+        self.workspace_size = DEFAULT_WORKSPACE_SIZE_MB * 1024 * 1024
+        global global_zero_init_workspace_buffer
+        if global_zero_init_workspace_buffer is None:
+            global_zero_init_workspace_buffer = torch.zeros(
+                self.workspace_size,
+                dtype=torch.uint8,
+                device=model_runner.device,
+            )
+        self.workspace_buffer = global_zero_init_workspace_buffer
+
+        # CUDA graph state
+        self.decode_cuda_graph_metadata = {}
+        self.decode_cuda_graph_kv_indices = None
+        self.padded_q_buffer = None
+        self.unpad_output_buffer = None
+        self.forward_prefill_metadata: Optional[TRTLLMMLAPrefillMetadata] = None
+        self.forward_decode_metadata: Union[TRTLLMMLADecodeMetadata, None] = None
+
+        self.disable_chunked_prefix_cache = (
+            get_global_server_args().disable_chunked_prefix_cache
+        )
+
+        self.num_draft_tokens = model_runner.server_args.speculative_num_draft_tokens
+
+    def _calc_padded_blocks(self, max_seq_len: int) -> int:
+        """
+        Calculate padded block count that satisfies both TRT-LLM and Triton constraints.
+
+        Args:
+            max_seq_len: Maximum sequence length in tokens
+
+        Returns:
+            Number of blocks padded to satisfy all constraints
+        """
+        blocks = triton.cdiv(max_seq_len, self.page_size)
+
+        # Apply dual constraints (take LCM to satisfy both):
+        # 1. TRT-LLM: block_num % (128 / page_size) == 0
+        # 2. Triton: number of pages per block
+        trtllm_constraint = TRTLLM_BLOCK_CONSTRAINT // self.page_size
+        triton_constraint = get_num_page_per_block_flashmla(self.page_size)
+        constraint_lcm = math.lcm(trtllm_constraint, triton_constraint)
+
+        if blocks % constraint_lcm != 0:
+            blocks = triton.cdiv(blocks, constraint_lcm) * constraint_lcm
+        return blocks
+
+    def _create_block_kv_indices(
+        self,
+        batch_size: int,
+        max_blocks: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        device: torch.device,
+    ) -> torch.Tensor:
+        """
+        Create block KV indices tensor using Triton kernel.
+
+        Args:
+            batch_size: Batch size
+            max_blocks: Maximum number of blocks per sequence
+            req_pool_indices: Request pool indices
+            seq_lens: Sequence lengths
+            device: Target device
+
+        Returns:
+            Block KV indices tensor
+        """
+        block_kv_indices = torch.full(
+            (batch_size, max_blocks), -1, dtype=torch.int32, device=device
+        )
+
+        create_flashmla_kv_indices_triton[(batch_size,)](
+            self.req_to_token,
+            req_pool_indices,
+            seq_lens,
+            None,
+            block_kv_indices,
+            self.req_to_token.stride(0),
+            max_blocks,
+            PAGED_SIZE=self.page_size,
+        )
+
+        return block_kv_indices
+
+    def init_cuda_graph_state(
+        self,
+        max_bs: int,
+        max_num_tokens: int,
+        kv_indices_buf: Optional[torch.Tensor] = None,
+    ):
+        """Initialize CUDA graph state for TRTLLM MLA."""
+
+        max_blocks_per_seq = self._calc_padded_blocks(self.max_context_len)
+
+        self.decode_cuda_graph_kv_indices = torch.full(
+            (max_bs, max_blocks_per_seq), -1, dtype=torch.int32, device=self.device
+        )
+        num_tokens_per_bs = max_num_tokens // max_bs
+
+        if is_float4_e2m1fn_x2(self.data_type):
+            # Buffer for padded query: (max_bs, max_draft_tokens, num_q_heads, v_head_dim)
+            self.store_dtype = torch.uint8
+            self.padded_q_buffer = torch.zeros(
+                (max_bs, num_tokens_per_bs // 2, self.num_q_heads, self.kv_cache_dim),
+                dtype=self.store_dtype,
+                device=self.device,
+            )
+
+            # Buffer for unpadded output: (max_num_tokens, num_q_heads, v_head_dim)
+            self.unpad_output_buffer = torch.zeros(
+                (max_num_tokens // 2, self.num_q_heads, 512),
+                dtype=self.store_dtype,
+                device=self.device,
+            )
+        else:
+            # Buffer for padded query: (max_bs, max_draft_tokens, num_q_heads, v_head_dim)
+            self.padded_q_buffer = torch.zeros(
+                (max_bs, num_tokens_per_bs, self.num_q_heads, self.kv_cache_dim),
+                dtype=self.data_type,
+                device=self.device,
+            )
+
+            # Buffer for unpadded output: (max_num_tokens, num_q_heads, v_head_dim)
+            self.unpad_output_buffer = torch.zeros(
+                (max_num_tokens, self.num_q_heads, 512),
+                dtype=self.data_type,
+                device=self.device,
+            )
+
+        super().init_cuda_graph_state(max_bs, max_num_tokens, kv_indices_buf)
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInput],
+    ):
+        """Initialize metadata for CUDA graph capture."""
+
+        # Delegate to parent for non-decode modes.
+        if (
+            not forward_mode.is_decode_or_idle()
+            and not forward_mode.is_target_verify()
+            and not forward_mode.is_draft_extend(include_v2=True)
+        ):
+            return super().init_forward_metadata_capture_cuda_graph(
+                bs,
+                num_tokens,
+                req_pool_indices,
+                seq_lens,
+                encoder_lens,
+                forward_mode,
+                spec_info,
+            )
+
+        metadata = TRTLLMMLADecodeMetadata()
+
+        if forward_mode.is_target_verify():
+            seq_lens = seq_lens + self.num_draft_tokens
+            metadata.seq_lens_k = torch.zeros(
+                (bs,), dtype=torch.int32, device=seq_lens.device
+            )
+            metadata.seq_lens_k.copy_(seq_lens.to(dtype=torch.int32))
+        elif forward_mode.is_draft_extend(include_v2=True):
+            num_tokens_per_bs = num_tokens // bs
+            metadata.max_seq_len_q = num_tokens_per_bs
+            metadata.sum_seq_lens_q = num_tokens_per_bs * bs
+            metadata.cu_seqlens_q = torch.arange(
+                0,
+                bs * num_tokens_per_bs + 1,
+                num_tokens_per_bs,
+                dtype=torch.int32,
+                device=seq_lens.device,
+            )
+            metadata.seq_lens_q = torch.full(
+                (bs,), num_tokens_per_bs, dtype=torch.int32, device=seq_lens.device
+            )
+            # NOTE(draft_extend seq_len handling):
+            # forward_batch.seq_lens is the seq_lens of the prev_context + verified tokens.
+            # To account for pad_draft_extend_query, we need seq_lens = prev_context + max_draft_tokens.
+            # This will ensure queries align with kvs correctly when calling
+            # flashinfer.decode.trtllm_batch_decode_with_kv_cache_mla.
+            seq_lens = seq_lens - metadata.seq_lens_q + metadata.max_seq_len_q
+            metadata.seq_lens_k = torch.zeros(
+                (bs,), dtype=torch.int32, device=seq_lens.device
+            )
+            metadata.seq_lens_k.copy_(seq_lens.to(dtype=torch.int32))
+
+        # Custom fast-path for decode/idle.
+        # Capture with full width so future longer sequences are safe during replay
+        max_blocks_per_seq = self._calc_padded_blocks(self.max_context_len)
+        block_kv_indices = self.decode_cuda_graph_kv_indices[:bs, :max_blocks_per_seq]
+
+        create_flashmla_kv_indices_triton[(bs,)](
+            self.req_to_token,
+            req_pool_indices,
+            seq_lens,
+            None,
+            block_kv_indices,
+            self.req_to_token.stride(0),
+            max_blocks_per_seq,
+            PAGED_SIZE=self.page_size,
+        )
+
+        metadata.block_kv_indices = block_kv_indices
+        metadata.max_seq_len_k = self.max_context_len
+
+        self.decode_cuda_graph_metadata[bs] = metadata
+        self.forward_decode_metadata = metadata
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInput],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+        """Replay CUDA graph with new inputs."""
+        # Delegate to parent for non-decode modes.
+        if (
+            not forward_mode.is_decode_or_idle()
+            and not forward_mode.is_target_verify()
+            and not forward_mode.is_draft_extend(include_v2=True)
+        ):
+            return super().init_forward_metadata_replay_cuda_graph(
+                bs,
+                req_pool_indices,
+                seq_lens,
+                seq_lens_sum,
+                encoder_lens,
+                forward_mode,
+                spec_info,
+                seq_lens_cpu,
+            )
+
+        metadata = self.decode_cuda_graph_metadata[bs]
+
+        if forward_mode.is_target_verify():
+            seq_lens = seq_lens[:bs] + self.num_draft_tokens
+            metadata.seq_lens_k.copy_(seq_lens.to(dtype=torch.int32))
+            del seq_lens_sum  # not handle "num_draft_tokens" but we do not need it
+        elif forward_mode.is_draft_extend(include_v2=True):
+            accept_length = spec_info.accept_length[:bs]
+            if spec_info.accept_length_cpu:
+                metadata.max_seq_len_q = max(spec_info.accept_length_cpu[:bs]) + 1
+                metadata.sum_seq_lens_q = sum(spec_info.accept_length_cpu[:bs]) + bs
+            else:
+                metadata.max_seq_len_q = 1
+                metadata.sum_seq_lens_q = bs
+            # draft_extend uses (accept_length + 1) query tokens per sequence
+            extend_seq_lens = accept_length + 1
+            metadata.cu_seqlens_q[1:].copy_(
+                torch.cumsum(extend_seq_lens, dim=0, dtype=torch.int32)
+            )
+            metadata.seq_lens_q.copy_(extend_seq_lens)
+            # see NOTE(draft_extend seq_len handling)
+            seq_lens = seq_lens[:bs] - metadata.seq_lens_q + metadata.max_seq_len_q
+            metadata.seq_lens_k.copy_(seq_lens.to(torch.int32))
+
+        # Update block indices for new sequences.
+        create_flashmla_kv_indices_triton[(bs,)](
+            self.req_to_token,
+            req_pool_indices[:bs],
+            seq_lens,
+            None,
+            metadata.block_kv_indices,
+            self.req_to_token.stride(0),
+            metadata.block_kv_indices.shape[1],
+            PAGED_SIZE=self.page_size,
+        )
+
+    def get_cuda_graph_seq_len_fill_value(self) -> int:
+        """Get the fill value for sequence lengths in CUDA graph."""
+        return 1
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        """Initialize the metadata for a forward pass."""
+        # Delegate to parent for non-decode modes.
+        if (
+            forward_batch.forward_mode.is_extend()
+            and not forward_batch.forward_mode.is_target_verify()
+            and not forward_batch.forward_mode.is_draft_extend(include_v2=True)
+        ):
+            # For extend batch with prefix length > 0, fallback to ragged kernel implemented in flashinfer MLA backend
+            # when chunked prefix cache is disabled.
+            # Also fallback to flashinfer MLA backend when in piecewise cuda graph, since it only supports MLA forward mode.
+            has_prefix = any(forward_batch.extend_prefix_lens_cpu)
+            fallback_to_flashinfer_impl = (
+                self.disable_chunked_prefix_cache and has_prefix
+            ) or is_in_piecewise_cuda_graph()
+            if fallback_to_flashinfer_impl:
+                super().init_forward_metadata(forward_batch)
+
+            seq_lens = forward_batch.seq_lens - forward_batch.extend_prefix_lens
+            cum_seq_lens_q = torch.cat(
+                (
+                    torch.zeros(
+                        1, dtype=torch.int32, device=forward_batch.seq_lens.device
+                    ),
+                    torch.cumsum(seq_lens, dim=0),
+                )
+            ).int()
+            max_seq_len = max(forward_batch.extend_seq_lens_cpu)
+            self.forward_prefill_metadata = TRTLLMMLAPrefillMetadata(
+                max_seq_len,
+                cum_seq_lens_q,
+                seq_lens,
+                fallback_to_flashinfer_impl,
+            )
+        elif (
+            forward_batch.forward_mode.is_decode_or_idle()
+            or forward_batch.forward_mode.is_target_verify()
+            or forward_batch.forward_mode.is_draft_extend(include_v2=True)
+        ):
+            bs = forward_batch.batch_size
+            self.forward_decode_metadata = TRTLLMMLADecodeMetadata()
+            # This is necessary because the backend instance persists across forward passes,
+            # and forward_prefill_metadata from a previous regular extend call could still be set.
+            if (
+                forward_batch.forward_mode.is_target_verify()
+                or forward_batch.forward_mode.is_draft_extend(include_v2=True)
+            ):
+                self.forward_prefill_metadata = None
+            # Get maximum sequence length.
+            if getattr(forward_batch, "seq_lens_cpu", None) is not None:
+                max_seq = forward_batch.seq_lens_cpu.max().item()
+            else:
+                max_seq = forward_batch.seq_lens.max().item()
+
+            seq_lens = forward_batch.seq_lens
+
+            if forward_batch.forward_mode.is_target_verify():
+                max_seq = max_seq + self.num_draft_tokens
+                seq_lens = seq_lens + self.num_draft_tokens
+                self.forward_decode_metadata.seq_lens_k = seq_lens.to(torch.int32)
+            elif forward_batch.forward_mode.is_draft_extend(include_v2=True):
+                sum_seq_lens_q = sum(forward_batch.extend_seq_lens_cpu)
+                max_seq_len_q = max(forward_batch.extend_seq_lens_cpu)
+                cu_seqlens_q = torch.nn.functional.pad(
+                    torch.cumsum(
+                        forward_batch.extend_seq_lens, dim=0, dtype=torch.int32
+                    ),
+                    (1, 0),
+                )
+                # see NOTE(draft_extend seq_len handling)
+                seq_lens = seq_lens - forward_batch.extend_seq_lens + max_seq_len_q
+
+                self.forward_decode_metadata.max_seq_len_q = max_seq_len_q
+                self.forward_decode_metadata.sum_seq_lens_q = sum_seq_lens_q
+                self.forward_decode_metadata.cu_seqlens_q = cu_seqlens_q
+                self.forward_decode_metadata.seq_lens_q = forward_batch.extend_seq_lens
+                self.forward_decode_metadata.seq_lens_k = seq_lens.to(torch.int32)
+
+            max_seqlen_pad = self._calc_padded_blocks(max_seq)
+            block_kv_indices = self._create_block_kv_indices(
+                bs,
+                max_seqlen_pad,
+                forward_batch.req_pool_indices,
+                seq_lens,
+                seq_lens.device,
+            )
+
+            self.forward_decode_metadata.block_kv_indices = block_kv_indices
+            self.forward_decode_metadata.max_seq_len_k = int(max_seq)
+            self.forward_decode_metadata.batch_size = bs
+
+            forward_batch.decode_trtllm_mla_metadata = self.forward_decode_metadata
+        else:
+            return super().init_forward_metadata(forward_batch)
+
+    def init_mha_chunk_metadata(self, forward_batch: ForwardBatch):
+        super().init_mha_chunk_metadata(forward_batch, disable_flashinfer_ragged=True)
+
+    def pad_draft_extend_query(
+        self,
+        q: torch.Tensor,
+        padded_q: torch.Tensor,
+        seq_lens_q: torch.Tensor,
+        cu_seqlens_q: torch.Tensor,
+    ) -> torch.Tensor:
+        """Pad draft extended query using Triton kernel."""
+        batch_size = cu_seqlens_q.shape[0] - 1
+        max_seq_len_q = padded_q.shape[1]
+        num_heads = padded_q.shape[2]
+        head_dim = padded_q.shape[3]
+
+        # Launch Triton kernel with 3D grid for parallelized head and dim processing
+        BLOCK_SIZE = 64
+        num_head_blocks = triton.cdiv(num_heads, BLOCK_SIZE)
+        num_dim_blocks = triton.cdiv(head_dim, BLOCK_SIZE)
+        grid = (batch_size * max_seq_len_q, num_head_blocks, num_dim_blocks)
+
+        pad_draft_extend_query_kernel[grid](
+            q_ptr=q,
+            padded_q_ptr=padded_q,
+            seq_lens_q_ptr=seq_lens_q,
+            cumsum_ptr=cu_seqlens_q,
+            batch_size=batch_size,
+            max_seq_len=max_seq_len_q,
+            num_heads=num_heads,
+            head_dim=head_dim,
+            BLOCK_SIZE=BLOCK_SIZE,
+        )
+        return padded_q
+
+    def unpad_draft_extend_output(
+        self,
+        raw_out: torch.Tensor,
+        cu_seqlens_q: torch.Tensor,
+        seq_lens_q: torch.Tensor,
+        sum_seq_lens_q: int,
+    ) -> torch.Tensor:
+        """Unpad draft extended output using Triton kernel."""
+        # raw_out: (batch_size, token_per_batch, layer.tp_q_head_num, layer.v_head_dim)
+        batch_size = seq_lens_q.shape[0]
+        token_per_batch = raw_out.shape[1]  # max_seq_len
+        tp_q_head_num = raw_out.shape[2]  # num_heads
+        v_head_dim = raw_out.shape[3]  # head_dim
+        total_tokens = sum_seq_lens_q
+
+        # Check if we're in CUDA graph mode (buffers are pre-allocated)
+        if self.unpad_output_buffer is not None:
+            # Use pre-allocated buffer for CUDA graph compatibility
+            output = self.unpad_output_buffer[:total_tokens, :, :].to(
+                dtype=raw_out.dtype
+            )
+        else:
+            # Dynamic allocation for non-CUDA graph mode
+            output = torch.empty(
+                (total_tokens, tp_q_head_num, v_head_dim),
+                dtype=raw_out.dtype,
+                device=raw_out.device,
+            )
+
+        # Launch Triton kernel with 3D grid for parallelized head and dim processing
+        BLOCK_SIZE = 64
+        num_head_blocks = triton.cdiv(tp_q_head_num, BLOCK_SIZE)
+        num_dim_blocks = triton.cdiv(v_head_dim, BLOCK_SIZE)
+        grid = (batch_size * token_per_batch, num_head_blocks, num_dim_blocks)
+
+        unpad_draft_extend_output_kernel[grid](
+            raw_out_ptr=raw_out,
+            output_ptr=output,
+            accept_length_ptr=seq_lens_q,
+            cumsum_ptr=cu_seqlens_q,
+            batch_size=batch_size,
+            token_per_batch=token_per_batch,
+            tp_q_head_num=tp_q_head_num,
+            v_head_dim=v_head_dim,
+            BLOCK_SIZE=BLOCK_SIZE,
+        )
+        return output[:total_tokens, :, :]
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,  # q_nope
+        k: torch.Tensor,  # k_nope
+        v: torch.Tensor,  # not used in this backend
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+        q_rope: Optional[torch.Tensor] = None,
+        k_rope: Optional[torch.Tensor] = None,
+        cos_sin_cache: Optional[torch.Tensor] = None,
+        is_neox: Optional[bool] = False,
+        llama_4_scaling: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Run forward for decode using TRTLLM MLA kernel."""
+        merge_query = q_rope is not None
+        if self.data_type == torch.float8_e4m3fn:
+            # For FP8 path, we quantize the query and rope parts and merge them into a single tensor
+            # Note: rope application in deepseek_v2.py:forward_absorb_prepare is skipped for FP8 decode path of this trtllm_mla backend
+            assert all(
+                x is not None for x in [q_rope, k_rope, cos_sin_cache]
+            ), "For FP8 path and using flashinfer.rope.mla_rope_quantize we need all of q_rope, k_rope and cos_sin_cache to be not None."
+            q, k, k_rope = mla_quantize_and_rope_for_fp8(
+                q,
+                q_rope,
+                k.squeeze(1),
+                k_rope.squeeze(1),
+                forward_batch.positions,
+                cos_sin_cache,
+                is_neox,
+                self.kv_lora_rank,
+                self.qk_rope_head_dim,
+            )
+            merge_query = False
+
+        # Save KV cache if requested
+        if save_kv_cache:
+            assert (
+                k is not None and k_rope is not None
+            ), "For populating trtllm_mla kv cache, both k_nope and k_rope should be not None."
+            forward_batch.token_to_kv_pool.set_mla_kv_buffer(
+                layer, forward_batch.out_cache_loc, k, k_rope
+            )
+
+        # Prepare query tensor inline
+        if merge_query:
+            # For FP16 path, we merge the query and rope parts into a single tensor
+            q_nope = q.view(-1, layer.tp_q_head_num, layer.v_head_dim)
+            q_rope_reshaped = q_rope.view(
+                -1, layer.tp_q_head_num, layer.head_dim - layer.v_head_dim
+            )
+            query = concat_mla_absorb_q_general(q_nope, q_rope_reshaped)
+        else:
+            # For FP8 path, we already have the query and rope parts merged because of the quantize_and_rope_for_fp8 function
+            query = q.view(-1, layer.tp_q_head_num, layer.head_dim)
+
+        # Apply llama 4 scaling if provided
+        if llama_4_scaling is not None:
+            query = query.to(self.q_data_type) * llama_4_scaling
+            query = query.to(self.data_type)
+
+        # Ensure query has shape [bs, acc_q_len, num_q_heads, head_dim] when seq_len 1
+        if query.dim() == 3:
+            query = query.unsqueeze(1)
+
+        # Prepare KV cache inline
+        k_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
+        kv_cache = k_cache.view(-1, self.page_size, self.kv_cache_dim).unsqueeze(1)
+
+        # Get metadata
+        metadata = (
+            getattr(forward_batch, "decode_trtllm_mla_metadata", None)
+            or self.forward_decode_metadata
+        )
+
+        # Ensure batch_size is sufficient, the batch size increase due to the padding from the forward batch
+        # FIXME(@rainj-me), refactor the skip_attn_backend_init, init_forward_metadata for attn backends
+        # and padding logic in prepare_mlp_sync_batch to avoid this
+        batch_size = getattr(metadata, "batch_size", None)
+        if batch_size is not None and batch_size < forward_batch.batch_size:
+            self.init_forward_metadata(forward_batch)
+            metadata = forward_batch.decode_trtllm_mla_metadata
+
+        # Scale computation for TRTLLM MLA kernel BMM1 operation:
+        # The final BMM1 scale is computed as: q_scale * k_scale * softmax_scale
+        # Scale components:
+        # - q_scale: Query scaling factor (set to 1.0 for both FP16/FP8 paths)
+        # - k_scale: Key scaling factor from model checkpoint. Only applied when KV cache
+        #   stores FP8-quantized values, to compensate for the quantization scaling.
+        #   For BF16/FP16 KV cache, k_scale must be 1.0 since values are unscaled.
+        # - softmax_scale: Attention softmax scaling = 1/sqrt(head_dim), pre-computed as layer.scaling
+        q_scale = 1.0
+        if self.data_type == torch.float8_e4m3fn:
+            k_scale = (
+                layer.k_scale_float
+                if getattr(layer, "k_scale_float", None) is not None
+                else 1.0
+            )
+        else:
+            if getattr(layer, "k_scale_float", None) is not None:
+                logger.warning_once(
+                    "Checkpoint has k_scale but KV cache dtype is not FP8. "
+                    "Ignoring k_scale for BMM1 (k_scale=%.4f, kv_dtype=%s).",
+                    layer.k_scale_float,
+                    self.data_type,
+                )
+            k_scale = 1.0
+
+        bmm1_scale = q_scale * k_scale * layer.scaling
+
+        # Call TRT-LLM kernel
+        raw_out = flashinfer.decode.trtllm_batch_decode_with_kv_cache_mla(
+            query=query,
+            kv_cache=kv_cache,
+            workspace_buffer=self.workspace_buffer,
+            qk_nope_head_dim=self.qk_nope_head_dim,
+            kv_lora_rank=self.kv_lora_rank,
+            qk_rope_head_dim=self.qk_rope_head_dim,
+            block_tables=metadata.block_kv_indices,
+            seq_lens=forward_batch.seq_lens.to(torch.int32),
+            max_seq_len=metadata.max_seq_len_k,
+            bmm1_scale=bmm1_scale,
+        )
+
+        # Reshape output directly without slicing
+        output = raw_out.view(-1, layer.tp_q_head_num * layer.v_head_dim)
+        return output
+
+    def forward_extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+        q_rope: Optional[torch.Tensor] = None,
+        k_rope: Optional[torch.Tensor] = None,
+        cos_sin_cache: Optional[torch.Tensor] = None,
+        is_neox: Optional[bool] = False,
+        llama_4_scaling: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+
+        if (
+            self.forward_prefill_metadata is not None
+            and self.forward_prefill_metadata.fallback_to_flashinfer_impl
+        ):
+            return super().forward_extend(
+                q, k, v, layer, forward_batch, save_kv_cache, q_rope, k_rope
+            )
+
+        # TODO refactor to avoid code duplication
+        merge_query = q_rope is not None
+        if (
+            self.data_type == torch.float8_e4m3fn
+        ) and forward_batch.forward_mode.is_target_verify():
+            # For FP8 path, we quantize the query and rope parts and merge them into a single tensor
+            # Note: rope application in deepseek_v2.py:forward_absorb_prepare is skipped for FP8 decode path of this trtllm_mla backend
+            assert all(
+                x is not None for x in [q_rope, k_rope, cos_sin_cache]
+            ), "For FP8 path and using flashinfer.rope.mla_rope_quantize we need all of q_rope, k_rope and cos_sin_cache to be not None."
+            q, k, k_rope = mla_quantize_and_rope_for_fp8(
+                q,
+                q_rope,
+                k.squeeze(1),
+                k_rope.squeeze(1),
+                forward_batch.positions,
+                cos_sin_cache,
+                is_neox,
+                self.kv_lora_rank,
+                self.qk_rope_head_dim,
+            )
+            merge_query = False
+
+        # Save KV cache if requested
+        if save_kv_cache:
+            assert (
+                k is not None and k_rope is not None
+            ), "For populating trtllm_mla kv cache, both k_nope and k_rope should be not None."
+            forward_batch.token_to_kv_pool.set_mla_kv_buffer(
+                layer, forward_batch.out_cache_loc, k, k_rope
+            )
+
+        # TODO refactor to avoid code duplication
+        # Prepare query tensor inline
+        if merge_query:
+            # For FP16 path, we merge the query and rope parts into a single tensor
+            q_nope = q.view(-1, layer.tp_q_head_num, layer.v_head_dim)
+            q_rope_reshaped = q_rope.view(
+                -1, layer.tp_q_head_num, layer.head_dim - layer.v_head_dim
+            )
+            q = concat_mla_absorb_q_general(q_nope, q_rope_reshaped)
+
+        q = q.view(-1, layer.tp_q_head_num, layer.head_dim)
+
+        # Apply llama 4 scaling if provided
+        if llama_4_scaling is not None:
+            q = q.to(self.q_data_type) * llama_4_scaling
+            q = q.to(self.data_type)
+
+        if (
+            forward_batch.forward_mode.is_target_verify()
+            or forward_batch.forward_mode.is_draft_extend(include_v2=True)
+        ):
+            metadata = (
+                getattr(forward_batch, "decode_trtllm_mla_metadata", None)
+                or self.forward_decode_metadata
+            )
+
+            # Ensure batch_size is sufficient, the batch size increase due to the padding from the forward batch
+            # FIXME(@rainj-me), refactor the skip_attn_backend_init, init_forward_metadata for attn backends
+            # and padding logic in prepare_mlp_sync_batch to avoid this
+            batch_size = getattr(metadata, "batch_size", None)
+            if batch_size is not None and batch_size < forward_batch.batch_size:
+                self.init_forward_metadata(forward_batch)
+                metadata = forward_batch.decode_trtllm_mla_metadata
+
+            # Ensure query has shape [bs, num_draft_tokens, num_q_heads, head_dim]
+            bs = forward_batch.batch_size
+
+            k_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
+            kv_cache = k_cache.view(-1, self.page_size, self.kv_cache_dim).unsqueeze(1)
+
+            q_scale = 1.0
+            if self.data_type == torch.float8_e4m3fn:
+                k_scale = (
+                    layer.k_scale_float
+                    if getattr(layer, "k_scale_float", None) is not None
+                    else 1.0
+                )
+            else:
+                if getattr(layer, "k_scale_float", None) is not None:
+                    logger.warning_once(
+                        "Checkpoint has k_scale but KV cache dtype is not FP8. "
+                        "Ignoring k_scale for BMM1 (k_scale=%.4f, kv_dtype=%s).",
+                        layer.k_scale_float,
+                        self.data_type,
+                    )
+                k_scale = 1.0
+            q = q.to(self.data_type)
+
+            bmm1_scale = q_scale * k_scale * layer.scaling
+            if forward_batch.forward_mode.is_target_verify():
+                max_seq_len = (
+                    metadata.max_seq_len_k + forward_batch.spec_info.draft_token_num
+                )
+                # For target_verify, all sequences have the same number of draft tokens
+                q = q.view(bs, -1, layer.tp_q_head_num, layer.head_dim)
+                needs_unpad = False
+            else:
+                # draft_extend: handle varying accept_lengths. If total_tokens % bs == 0,
+                # we can directly reshape q; otherwise, pad to max_seq_len_q.
+                total_tokens = q.shape[0]
+                tokens_per_seq = total_tokens // bs if bs > 0 else 0
+                can_direct_view = bs > 0 and (total_tokens % bs == 0)
+
+                if can_direct_view:
+                    max_seq_len = metadata.max_seq_len_k + tokens_per_seq
+                    q = q.view(bs, tokens_per_seq, layer.tp_q_head_num, layer.head_dim)
+                    needs_unpad = False
+                else:
+                    # Varying lengths: pad q to (bs, max_seq_len_q, ...)
+                    actual_seq_lens_q = forward_batch.extend_seq_lens
+                    actual_max_seq_len_q = max(forward_batch.extend_seq_lens_cpu)
+                    max_seq_len = metadata.max_seq_len_k + actual_max_seq_len_q
+
+                    actual_cu_seqlens_q = torch.nn.functional.pad(
+                        torch.cumsum(actual_seq_lens_q, dim=0, dtype=torch.int32),
+                        (1, 0),
+                    )
+
+                    if self.padded_q_buffer is not None:
+                        padded_q = self.padded_q_buffer[
+                            :bs, :actual_max_seq_len_q, :, :
+                        ].to(dtype=q.dtype)
+                        padded_q.zero_()
+                    else:
+                        padded_q = torch.zeros(
+                            (
+                                bs,
+                                actual_max_seq_len_q,
+                                layer.tp_q_head_num,
+                                layer.head_dim,
+                            ),
+                            dtype=q.dtype,
+                            device=q.device,
+                        )
+
+                    q = self.pad_draft_extend_query(
+                        q, padded_q, actual_seq_lens_q, actual_cu_seqlens_q
+                    )
+                    needs_unpad = True
+                    unpad_seq_lens_q = actual_seq_lens_q
+                    unpad_cu_seqlens_q = actual_cu_seqlens_q
+                    unpad_sum_seq_lens_q = total_tokens
+
+            assert kv_cache.dtype == self.data_type
+
+            raw_out = flashinfer.decode.trtllm_batch_decode_with_kv_cache_mla(
+                query=q,
+                kv_cache=kv_cache,
+                workspace_buffer=self.workspace_buffer,
+                qk_nope_head_dim=self.qk_nope_head_dim,
+                kv_lora_rank=self.kv_lora_rank,
+                qk_rope_head_dim=self.qk_rope_head_dim,
+                block_tables=metadata.block_kv_indices,
+                seq_lens=metadata.seq_lens_k,
+                max_seq_len=max_seq_len,
+                bmm1_scale=bmm1_scale,
+            )
+
+            if needs_unpad:
+                # Unpad the output for draft_extend mode with varying lengths
+                # Use the actual values computed during padding, not from metadata
+                output = self.unpad_draft_extend_output(
+                    raw_out,
+                    unpad_cu_seqlens_q,
+                    unpad_seq_lens_q,
+                    unpad_sum_seq_lens_q,
+                )
+                output = output.view(-1, layer.tp_q_head_num * layer.v_head_dim)
+            else:
+                output = raw_out.view(-1, layer.tp_q_head_num * layer.v_head_dim)
+            return output
+
+        if k_rope is not None:
+            k = torch.cat([k, k_rope], dim=-1)
+        k = k.view(-1, layer.tp_k_head_num, layer.head_dim)
+        v = v.view(-1, layer.tp_k_head_num, layer.v_head_dim)
+
+        q_scale = k_scale = v_scale = 1.0
+        if self.data_type == torch.float8_e4m3fn:
+            q, k, v, k_scale, v_scale = _quantize_fp8_qkv(q, k, v, layer)
+
+        common_trtllm_args = {
+            "query": q,
+            "key": k,
+            "value": v,
+            "workspace_buffer": self.workspace_buffer,
+            "batch_size": forward_batch.batch_size,
+            "window_left": -1,
+            "enable_pdl": False,
+            "max_q_len": self.forward_prefill_metadata.max_seq_len,
+            "bmm1_scale": q_scale * k_scale * layer.scaling,
+            "bmm2_scale": v_scale,
+            "cum_seq_lens_q": self.forward_prefill_metadata.cum_seq_lens,
+        }
+
+        # When chunked prefix cache is enabled, dispatch to different path for ragged attention.
+        if forward_batch.attn_attend_prefix_cache:
+            # MHA for chunked prefix kv cache when running model with MLA
+            assert forward_batch.prefix_chunk_idx is not None
+            assert forward_batch.prefix_chunk_cu_seq_lens is not None
+            assert q_rope is None
+            assert k_rope is None
+            chunk_idx = forward_batch.prefix_chunk_idx
+
+            out = torch.zeros(
+                q.shape[0],
+                layer.tp_q_head_num,
+                layer.v_head_dim,
+                dtype=self.q_data_type,
+                device=q.device,
+            )
+            return flashinfer.prefill.trtllm_ragged_attention_deepseek(
+                **common_trtllm_args,
+                seq_lens=forward_batch.prefix_chunk_seq_lens[chunk_idx],
+                max_kv_len=forward_batch.prefix_chunk_max_seq_lens[chunk_idx],
+                o_sf_scale=-1.0,
+                cum_seq_lens_kv=forward_batch.prefix_chunk_cu_seq_lens[chunk_idx],
+                is_causal=False,
+                return_lse=True,
+                out=out,
+            )
+        else:
+            out = torch.zeros(
+                q.shape[0],
+                q.shape[1],
+                v.shape[2],
+                device=q.device,
+                dtype=self.q_data_type,
+            )
+            return flashinfer.prefill.trtllm_ragged_attention_deepseek(
+                **common_trtllm_args,
+                seq_lens=self.forward_prefill_metadata.seq_lens,
+                max_kv_len=self.forward_prefill_metadata.max_seq_len,
+                o_sf_scale=1.0,
+                cum_seq_lens_kv=self.forward_prefill_metadata.cum_seq_lens,
+                is_causal=True,
+                return_lse=forward_batch.mha_return_lse,
+                out=out,
+            )
+
+
+class TRTLLMMLAMultiStepDraftBackend(FlashInferMLAMultiStepDraftBackend):
+    """Multi-step draft backend for TRT-LLM MLA used by EAGLE."""
+
+    def __init__(
+        self, model_runner: "ModelRunner", topk: int, speculative_num_steps: int
+    ):
+        super().__init__(model_runner, topk, speculative_num_steps)
+
+        for i in range(self.speculative_num_steps - 1):
+            self.attn_backends[i] = TRTLLMMLABackend(
+                model_runner,
+                skip_prefill=True,
+                kv_indptr_buf=self.kv_indptr[i],
+                q_indptr_decode_buf=self.q_indptr_decode,
+            )
diff --git a/sglang/python/sglang/srt/layers/attention/utils.py b/sglang/python/sglang/srt/layers/attention/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..d679025dd38cd1d275b157f4313607b7e870b71f
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/utils.py
@@ -0,0 +1,474 @@
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.utils import is_cuda
+
+_FLASHMLA_CREATE_KV_BLOCK_SIZE = 4096
+FLASHMLA_CREATE_KV_BLOCK_SIZE_TRITON = tl.constexpr(_FLASHMLA_CREATE_KV_BLOCK_SIZE)
+
+_is_cuda = is_cuda()
+
+if _is_cuda:
+    from sgl_kernel import concat_mla_absorb_q
+
+
+@triton.jit
+def create_flashinfer_kv_indices_triton(
+    req_to_token_ptr,  # [max_batch, max_context_len]
+    req_pool_indices_ptr,
+    page_kernel_lens_ptr,
+    kv_indptr,
+    kv_start_idx,
+    kv_indices_ptr,
+    req_to_token_ptr_stride: tl.constexpr,
+):
+    BLOCK_SIZE: tl.constexpr = 512
+    pid = tl.program_id(axis=0)
+
+    # find the req pool idx, this is for batch to token
+    req_pool_index = tl.load(req_pool_indices_ptr + pid)
+    kv_indices_offset = tl.load(kv_indptr + pid)
+
+    kv_start = 0
+    kv_end = 0
+    if kv_start_idx:
+        kv_start = tl.load(kv_start_idx + pid).to(tl.int32)
+        kv_end = kv_start
+    kv_end += tl.load(page_kernel_lens_ptr + pid).to(tl.int32)
+
+    num_loop = tl.cdiv(kv_end - kv_start, BLOCK_SIZE)
+    for i in range(num_loop):
+        # index into req_to_token_ptr needs to be int64
+        offset = tl.arange(0, BLOCK_SIZE).to(tl.int64) + i * BLOCK_SIZE
+        mask = offset < kv_end - kv_start
+        data = tl.load(
+            req_to_token_ptr
+            + req_pool_index * req_to_token_ptr_stride
+            + kv_start
+            + offset,
+            mask=mask,
+        )
+        tl.store(kv_indices_ptr + kv_indices_offset + offset, data, mask=mask)
+
+
+def get_num_page_per_block_flashmla(page_size: int = 64) -> int:
+    num_page_per_block = _FLASHMLA_CREATE_KV_BLOCK_SIZE // page_size
+    return num_page_per_block
+
+
+@triton.jit
+def create_flashmla_kv_indices_triton(
+    req_to_token_ptr,  # [max_batch, max_context_len]
+    req_pool_indices_ptr,
+    page_kernel_lens_ptr,
+    kv_start_idx,
+    kv_indices_ptr,
+    req_to_token_ptr_stride: tl.constexpr,
+    kv_indices_ptr_stride: tl.constexpr,
+    PAGED_SIZE: tl.constexpr = 64,
+):
+    NUM_PAGE_PER_BLOCK: tl.constexpr = (
+        FLASHMLA_CREATE_KV_BLOCK_SIZE_TRITON // PAGED_SIZE
+    )
+    pid = tl.program_id(axis=0)
+
+    # find the req pool idx, this is for batch to token
+    req_pool_index = tl.load(req_pool_indices_ptr + pid)
+
+    kv_start = 0
+    kv_end = 0
+    if kv_start_idx:
+        kv_start = tl.load(kv_start_idx + pid).to(tl.int32)
+        kv_end = kv_start
+
+    kv_end += tl.load(page_kernel_lens_ptr + pid).to(tl.int32)
+
+    num_paged = tl.cdiv(kv_end - kv_start, PAGED_SIZE)
+    num_pages_loop = tl.cdiv(kv_end - kv_start, FLASHMLA_CREATE_KV_BLOCK_SIZE_TRITON)
+
+    for i in range(num_pages_loop):
+        # index into req_to_token_ptr needs to be int64
+        paged_offset = (
+            tl.arange(0, NUM_PAGE_PER_BLOCK).to(tl.int64) + i * NUM_PAGE_PER_BLOCK
+        ) * PAGED_SIZE
+        paged_offset_out = tl.arange(0, NUM_PAGE_PER_BLOCK) + i * NUM_PAGE_PER_BLOCK
+
+        mask = paged_offset < num_paged * PAGED_SIZE
+        mask_out = paged_offset_out < num_paged
+
+        data = tl.load(
+            req_to_token_ptr
+            + req_pool_index * req_to_token_ptr_stride
+            + kv_start
+            + paged_offset,
+            mask=mask,
+        )
+        tl.store(
+            kv_indices_ptr + pid * kv_indices_ptr_stride + paged_offset_out,
+            data // PAGED_SIZE,
+            mask=mask_out,
+        )
+
+
+@triton.jit
+def concat_and_cast_mha_k_kernel(
+    k_ptr,
+    k_nope_ptr,
+    k_rope_ptr,
+    head_cnt: tl.constexpr,
+    k_stride0: tl.constexpr,
+    k_stride1: tl.constexpr,
+    nope_stride0: tl.constexpr,
+    nope_stride1: tl.constexpr,
+    rope_stride0: tl.constexpr,
+    nope_dim: tl.constexpr,
+    rope_dim: tl.constexpr,
+):
+    pid_loc = tl.program_id(0)
+    head_range = tl.arange(0, head_cnt)
+
+    k_head_ptr = k_ptr + pid_loc * k_stride0 + head_range[:, None] * k_stride1
+
+    nope_offs = tl.arange(0, nope_dim)
+
+    src_nope_ptr = (
+        k_nope_ptr
+        + pid_loc * nope_stride0
+        + head_range[:, None] * nope_stride1
+        + nope_offs[None, :]
+    )
+    dst_nope_ptr = k_head_ptr + nope_offs[None, :]
+
+    src_nope = tl.load(src_nope_ptr)
+    tl.store(dst_nope_ptr, src_nope)
+
+    rope_offs = tl.arange(0, rope_dim)
+    src_rope_ptr = k_rope_ptr + pid_loc * rope_stride0 + rope_offs[None, :]
+    dst_rope_ptr = k_head_ptr + nope_dim + rope_offs[None, :]
+    src_rope = tl.load(src_rope_ptr)
+    tl.store(dst_rope_ptr, src_rope)
+
+
+def concat_and_cast_mha_k_triton(
+    k: torch.Tensor,
+    k_nope: torch.Tensor,
+    k_rope: torch.Tensor,
+):
+    # The source data type will be implicitly converted to the target data type.
+    assert (
+        len(k.shape) == 3 and len(k_nope.shape) == 3 and len(k_rope.shape) == 3
+    ), f"shape should be 3d, but got {k.shape=}, {k_nope.shape=}, {k_rope.shape=}"
+    assert (
+        k.shape[0] == k_nope.shape[0] and k.shape[0] == k_rope.shape[0]
+    ), f"invalid shape, got {k.shape=}, {k_nope.shape=}, {k_rope.shape=}"
+    assert (
+        k.shape[1] == k_nope.shape[1] and 1 == k_rope.shape[1]
+    ), f"invalid shape, got {k.shape=}, {k_nope.shape=}, {k_rope.shape=}"
+    assert (
+        k.shape[-1] == k_nope.shape[-1] + k_rope.shape[-1]
+    ), f"invalid shape, got {k.shape=}, {k_nope.shape=}, {k_rope.shape=}"
+
+    nope_dim = k_nope.shape[-1]
+    rope_dim = k_rope.shape[-1]
+    grid = (k.shape[0],)
+
+    concat_and_cast_mha_k_kernel[grid](
+        k,
+        k_nope,
+        k_rope,
+        k.shape[1],
+        k.stride(0),
+        k.stride(1),
+        k_nope.stride(0),
+        k_nope.stride(1),
+        k_rope.stride(0),
+        nope_dim,
+        rope_dim,
+    )
+
+
+@triton.jit
+def pad_sequence_with_mask_kernel(
+    input_ptr,  # (total_tokens, hidden)
+    offsets_ptr,  # (B,)
+    lengths_ptr,  # (B,)
+    output_ptr,  # (B, max_len, hidden)
+    mask_ptr,  # (B, max_len)
+    max_len,
+    hidden_dim,
+    BLOCK_M: tl.constexpr,  # seq block
+    BLOCK_D: tl.constexpr,  # hidden block
+):
+    b = tl.program_id(0)  # batch index
+    m = tl.program_id(1)  # seq block index
+
+    offset = tl.load(offsets_ptr + b)
+    length = tl.load(lengths_ptr + b)
+
+    seq_ids = m * BLOCK_M + tl.arange(0, BLOCK_M)
+    hid_ids = tl.arange(0, BLOCK_D)
+
+    seq_mask = seq_ids < max_len
+    valid_token = seq_ids < length
+
+    # input index
+    in_token = offset + seq_ids
+    in_ptr = input_ptr + in_token[:, None] * hidden_dim + hid_ids[None, :]
+
+    # output index
+    out_ptr = (
+        output_ptr
+        + b * max_len * hidden_dim
+        + seq_ids[:, None] * hidden_dim
+        + hid_ids[None, :]
+    )
+
+    values = tl.load(
+        in_ptr,
+        mask=valid_token[:, None] & (hid_ids[None, :] < hidden_dim),
+        other=0.0,
+    )
+
+    tl.store(
+        out_ptr,
+        values,
+        mask=seq_mask[:, None] & (hid_ids[None, :] < hidden_dim),
+    )
+
+    # attention mask
+    if tl.program_id(2) == 0:
+        mask_out_ptr = mask_ptr + b * max_len + seq_ids
+        tl.store(mask_out_ptr, valid_token, mask=seq_mask)
+
+
+def pad_sequence_with_mask(
+    input_emb,  # (total_tokens, hidden)
+    offsets,  # (B,)
+    lengths,  # (B,)
+    max_len,
+):
+    B = offsets.shape[0]
+    hidden_dim = input_emb.shape[1]
+
+    output = torch.zeros(
+        (B, max_len, hidden_dim),
+        device=input_emb.device,
+        dtype=input_emb.dtype,
+    )
+    attn_mask = torch.empty(
+        (B * max_len),
+        device=input_emb.device,
+        dtype=torch.bool,
+    )
+
+    BLOCK_D = triton.next_power_of_2(hidden_dim)
+    BLOCK_M = triton.next_power_of_2(max_len)
+
+    grid = (
+        B,
+        triton.cdiv(max_len, BLOCK_M),
+        1,
+    )
+
+    pad_sequence_with_mask_kernel[grid](
+        input_emb,
+        offsets,
+        lengths,
+        output,
+        attn_mask,
+        max_len,
+        hidden_dim,
+        BLOCK_M=BLOCK_M,
+        BLOCK_D=BLOCK_D,
+    )
+
+    return B, output, attn_mask
+
+
+@triton.jit
+def seqlens_expand_kernel(
+    extend_seq_lens_ptr,  # [N]
+    seq_lens_ptr,  # [N]
+    offsets_ptr,  # [N+1]
+    output_ptr,  # [sum(extend_seq_lens)]
+    N,
+    BLOCK: tl.constexpr,
+):
+    pid = tl.program_id(0)
+
+    if pid >= N:
+        return
+
+    qo_len = tl.load(extend_seq_lens_ptr + pid)
+    kv_len = tl.load(seq_lens_ptr + pid)
+
+    start = kv_len - qo_len + 1
+    out_offset = tl.load(offsets_ptr + pid)
+
+    offs = tl.arange(0, BLOCK)
+    mask = offs < qo_len
+
+    values = start + offs
+    tl.store(output_ptr + out_offset + offs, values, mask=mask)
+
+
+def seqlens_expand_triton(
+    extend_seq_lens: torch.Tensor,
+    seq_lens: torch.Tensor,
+    total_len: int,
+    max_q_len: int,
+):
+    """
+    extend_seq_lens: [N], int32, CUDA
+    seq_lens:        [N], int32, CUDA
+    """
+    assert extend_seq_lens.is_cuda
+    assert seq_lens.is_cuda
+
+    N = extend_seq_lens.numel()
+
+    offsets = torch.zeros(N + 1, device=extend_seq_lens.device, dtype=torch.int32)
+    offsets[1:] = torch.cumsum(extend_seq_lens, dim=0)
+    output = torch.empty(total_len, device=extend_seq_lens.device, dtype=torch.int32)
+
+    BLOCK = triton.next_power_of_2(max_q_len)
+    grid = (N,)
+
+    seqlens_expand_kernel[grid](
+        extend_seq_lens,
+        seq_lens,
+        offsets,
+        output,
+        N,
+        BLOCK=BLOCK,
+    )
+
+    return output
+
+
+# When num_kv_heads=1, we have tensors with degenerate strides,
+# For example, as below, where we have stride[-3] == stride[-2]:
+# - shape: [num_pages, 1, 64, 128]
+# - stride: [8192, 128, 128, 1]
+# This will cause TMA desc validation fail in flashinfer (trtllm-mha backend).
+#
+# See: https://github.com/flashinfer-ai/flashinfer/issues/2232
+def canonicalize_stride(tensor: torch.Tensor) -> torch.Tensor:
+    """
+    Adjust degenerate strides for a tensor, make it canonical.
+    """
+    sizes = tensor.size()
+    strides = tensor.stride()
+    ndim = tensor.dim()
+
+    need_fix = any(
+        sizes[i] == 1 and strides[i] == strides[i + 1] for i in range(ndim - 1)
+    )
+
+    if not need_fix:
+        return tensor
+
+    # canonicalize the stride
+    # Example:
+    # - shape: [num_pages, 1, 64, 128]
+    # - stride: [8192, 128, 128, 1] (wrong!)
+    # Gives new stride: [8192, 8192, 128 ,1] (correct!)
+    new_strides = [0] * ndim
+    new_strides[-1] = 1
+    for i in range(ndim - 2, -1, -1):
+        new_strides[i] = new_strides[i + 1] * sizes[i + 1]
+
+    return tensor.as_strided(sizes, new_strides)
+
+
+def mla_quantize_and_rope_for_fp8(
+    q_nope: torch.Tensor,
+    q_rope: torch.Tensor,
+    k_nope: torch.Tensor,
+    k_rope: torch.Tensor,
+    pos_ids: torch.Tensor,
+    cos_sin_cache: torch.Tensor,
+    is_neox: bool,
+    kv_lora_rank: int,
+    qk_rope_head_dim: int,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    import flashinfer.rope
+
+    """Quantize and apply RoPE for FP8 attention path.
+
+        This function handles the FP8 quantization and RoPE application for MLA attention.
+        It takes separate query/key nope and rope components, applies RoPE to the rope parts,
+        quantizes all components to FP8, and merges the query components into a single tensor.
+
+        Args:
+            q_nope: Query no-position-encoding component [seq_len, num_heads, kv_lora_rank]
+                - expected dtype: torch.bfloat16
+            q_rope: Query RoPE component [seq_len, num_heads, qk_rope_head_dim]
+                - expected dtype: torch.bfloat16
+            k_nope: Key no-position-encoding component [seq_len, num_heads, kv_lora_rank]
+                - expected dtype: torch.bfloat16
+            k_rope: Key RoPE component [seq_len, num_heads, qk_rope_head_dim]
+                - expected dtype: torch.bfloat16
+            pos_ids: Position indices for each token
+                - expected dtype: torch.int64 or torch.int32
+            cos_sin_cache: Precomputed cosine/sine cache for RoPE
+                - expected dtype: matches q_/k_ input dtype (torch.bfloat16)
+            is_neox: Whether to use NeoX-style RoPE (interleaved) or GPT-style (half rotation)
+            kv_lora_rank: Dimension of the no-position-encoding component
+            qk_rope_head_dim: Dimension of the RoPE component
+
+        Returns:
+            tuple: (merged_q_out, k_nope_out, k_rope_out) quantized to FP8
+                - merged_q_out: [seq_len, num_heads, kv_lora_rank + qk_rope_head_dim], dtype=torch.float8_e4m3fn
+                - k_nope_out:   [seq_len, num_heads, kv_lora_rank], dtype=torch.float8_e4m3fn
+                - k_rope_out:   [seq_len, num_heads, qk_rope_head_dim], dtype=torch.float8_e4m3fn
+        """
+    attn_dtype = torch.float8_e4m3fn
+    q_len, num_heads = q_rope.shape[0], q_rope.shape[1]
+
+    # Allocate output tensors with FP8 dtype
+    # Query output will contain merged nope + rope components
+    q_out = q_rope.new_empty(
+        q_len,
+        num_heads,
+        kv_lora_rank + qk_rope_head_dim,
+        dtype=attn_dtype,
+    )
+
+    # Key outputs maintain original shapes but with FP8 dtype
+    k_rope_out = k_rope.new_empty(k_rope.shape, dtype=attn_dtype)
+    k_nope_out = k_nope.new_empty(k_nope.shape, dtype=attn_dtype)
+
+    # Apply RoPE and quantize all components in a single fused kernel call
+    # This kernel handles:
+    # 1. RoPE application to q_rope and k_rope using cos_sin_cache and positions
+    # 2. Quantization of all components to FP8 format
+    # 3. Output placement into pre-allocated tensors
+    flashinfer.rope.mla_rope_quantize_fp8(
+        q_rope=q_rope,
+        k_rope=k_rope,
+        q_nope=q_nope,
+        k_nope=k_nope,
+        cos_sin_cache=cos_sin_cache,
+        pos_ids=pos_ids,
+        is_neox=is_neox,
+        quantize_dtype=attn_dtype,
+        # Output tensor slicing: q_out contains [nope_part, rope_part]
+        q_rope_out=q_out[..., kv_lora_rank:],  # RoPE part goes to end
+        k_rope_out=k_rope_out,
+        q_nope_out=q_out[..., :kv_lora_rank],  # Nope part goes to beginning
+        k_nope_out=k_nope_out,
+        # Quantization scales (set to 1.0 for no additional scaling)
+        quant_scale_q=1.0,
+        quant_scale_kv=1.0,
+    )
+
+    return q_out, k_nope_out, k_rope_out
+
+
+def concat_mla_absorb_q_general(q_nope, q_rope):
+    if _is_cuda and q_nope.shape[-1] == 512 and q_rope.shape[-1] == 64:
+        return concat_mla_absorb_q(q_nope, q_rope)
+    else:
+        return torch.cat([q_nope, q_rope], dim=-1)
diff --git a/sglang/python/sglang/srt/layers/attention/vision.py b/sglang/python/sglang/srt/layers/attention/vision.py
new file mode 100644
index 0000000000000000000000000000000000000000..4abe5b2c4049e55f07a0f0b2abb2a7e5359c98e5
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/vision.py
@@ -0,0 +1,1128 @@
+from __future__ import annotations
+
+import dataclasses
+import functools
+import math
+from functools import lru_cache, partial
+from typing import Any, Callable, Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+
+from sglang.jit_kernel.norm import can_use_fused_inplace_qknorm as can_use_jit_qk_norm
+from sglang.srt.environ import envs
+from sglang.srt.layers.dp_attention import get_attention_tp_rank, get_attention_tp_size
+from sglang.srt.models.utils import apply_qk_norm
+from sglang.srt.utils import (
+    get_bool_env_var,
+    get_device_capability,
+    is_blackwell_supported,
+    is_cuda,
+    is_hip,
+    is_npu,
+    print_info_once,
+)
+from sglang.srt.utils.multi_stream_utils import (
+    maybe_execute_in_parallel,
+    with_multi_stream,
+)
+
+_is_cuda = is_cuda()
+_is_npu = is_npu()
+_is_hip = is_hip()
+
+if _is_cuda:
+    from flashinfer.prefill import cudnn_batch_prefill_with_kv_cache
+
+    try:
+        from sgl_kernel.flash_attn import flash_attn_varlen_func
+
+        from sglang.jit_kernel.flash_attention_v4 import (
+            flash_attn_varlen_func as flash_attn_varlen_func_fa4,
+        )
+
+        def flash_attn_func(*args, ver: int = 3, **kwargs):
+            if ver == 4:
+                return flash_attn_varlen_func_fa4(*args, **kwargs)
+            return flash_attn_varlen_func(*args, **kwargs)
+
+    except ImportError as e:
+        raise e
+
+
+if _is_npu:
+    import torch_npu
+
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+
+from sglang.srt.distributed import (
+    split_tensor_along_last_dim,
+    tensor_model_parallel_all_gather,
+)
+from sglang.srt.distributed import utils as dist_utils
+from sglang.srt.layers.attention.triton_ops.prefill_attention import (
+    context_attention_fwd,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.quantization import QuantizationConfig
+from sglang.srt.layers.rotary_embedding import apply_rotary_pos_emb
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import add_prefix, get_bool_env_var
+
+ROTARY_EMBED_CLASSES = {
+    "normal": apply_rotary_pos_emb,
+}
+
+# === Vision Encoder === #
+FLASHINFER_WORKSPACE_SIZE_BYTES = 128 * 1024 * 1024
+
+# Batch buckets for cuDNN graph caching - graphs are cached per bucket size
+# This avoids creating a new graph for each unique batch size at runtime
+BATCH_BUCKETS = [8, 16, 32, 64]
+
+# Bucketized max seqlens to reduce cuDNN recompilation frequency while
+# preserving a tighter upper bound than a single fixed max seqlen.
+FLASHINFER_MAX_SEQLEN_BUCKETS = [
+    4 * 1024,
+    8 * 1024,
+    16 * 1024,
+    32 * 1024,
+    64 * 1024,
+    128 * 1024,
+]
+
+
+@dataclasses.dataclass
+class SingletonCache:
+    data: Any = None
+
+    def set_data(self, value: Any) -> None:
+        self.data = value
+
+    def get_data(self) -> Optional[Any]:
+        return self.data
+
+    def empty(self) -> bool:
+        return self.get_data() is None
+
+
+# TODO: requires real seqlens from images
+@functools.lru_cache(maxsize=128)
+def _get_cu_seqlens_for_shape(batch_size: int, seqlen: int, device) -> torch.Tensor:
+    """
+    Generates cumulative sequence lengths (cu_seqlens) for a given batch_size, seqlen, and device.
+    Caches the result based on these parameters.
+    """
+    cu_seqlens = torch.arange(
+        0,
+        (batch_size + 1) * seqlen,
+        step=seqlen,
+        dtype=torch.int32,
+        device=device,
+    )
+    return cu_seqlens
+
+
+def resolve_seqlens(
+    cu_seqlens: torch.Tensor | SingletonCache | None,
+    bsz: int,
+    seq_len: int,
+    *,
+    device: torch.device,
+) -> torch.Tensor:
+    if cu_seqlens is None:
+        resolved_seqlens = _get_cu_seqlens_for_shape(bsz, seq_len, device=device)
+    elif isinstance(cu_seqlens, SingletonCache):
+        if cu_seqlens.empty():
+            cu_seqlens.set_data(_get_cu_seqlens_for_shape(bsz, seq_len, device=device))
+        resolved_seqlens = cu_seqlens.get_data()
+    else:
+        resolved_seqlens = cu_seqlens
+    assert isinstance(
+        resolved_seqlens, torch.Tensor
+    ), "cu_seqlens must be a torch.Tensor"
+    return resolved_seqlens
+
+
+class VisionSdpaAttention(nn.Module):
+    r"""
+    Scaled Dot Product Attention inner product
+
+    """
+
+    def __init__(
+        self,
+        head_dim: int,
+        num_heads: int,
+        num_kv_heads: int,
+        dropout: float = 0.0,
+        flatten_batch: bool = False,
+        softmax_in_single_precision: bool = False,
+        **kwargs,
+    ):
+        super().__init__()
+        self.head_size = head_dim
+        self.num_heads = num_heads
+        self.num_kv_heads = num_kv_heads
+        self.flatten_batch = flatten_batch
+        self.softmax_in_single_precision = softmax_in_single_precision
+        self.dropout = dropout
+        self.scale = 1.0 / math.sqrt(self.head_size)
+
+    @staticmethod
+    @lru_cache(maxsize=128)
+    def _generate_mask_cache(
+        s: int, flatten_batch: bool, cu_seqlens: tuple
+    ) -> torch.BoolTensor:
+        """
+        Generate a boolean attention mask with caching mechanism.
+        Args:
+            s: sequence length
+            flatten_batch: whether to flatten batch dimension
+            cu_seqlens: tuple of cumulative sequence lengths
+        Returns:
+            attention mask tensor of shape [b, 1, s, s] or [1, s, s]
+        """
+        if flatten_batch:
+            mask = torch.zeros([1, s, s], dtype=torch.bool)
+            for i in range(1, len(cu_seqlens)):
+                start = cu_seqlens[i - 1]
+                end = cu_seqlens[i]
+                mask[..., start:end, start:end] = True
+        else:
+            # [1, 1, 1, s]
+            row_indices = torch.arange(s).view(1, 1, 1, s)
+            # [1, 1, s, 1]
+            col_indices = torch.arange(s).view(1, 1, s, 1)
+            # [b, 1, 1, 1]
+            seq_lens = torch.tensor(
+                [end - start for start, end in zip(cu_seqlens[:-1], cu_seqlens[1:])],
+            ).view(-1, 1, 1, 1)
+
+            mask = (row_indices < seq_lens) & (col_indices < seq_lens)
+
+        return mask
+
+    def generate_patch_attention_mask(
+        self,
+        s: int,
+        cu_seqlens: Optional[torch.Tensor],
+        flatten_batch: bool = False,
+    ) -> Optional[torch.Tensor]:
+        r"""
+        Creates a non-causal 4D mask of shape `(b, 1, s, s)` or `(1, 1, s, s)`.
+        Args:
+            s: sequence length
+            cu_seqlens: cumulative sequence lengths tensor. If not, returns an empty mask
+            flatten_batch: whether to flatten batch dimension
+        Returns:
+            attention mask tensor or None
+        """
+        if cu_seqlens is None:
+            return None
+
+        cu_seqlens_tuple = tuple(cu_seqlens.cpu().tolist())
+
+        return self._generate_mask_cache(s, flatten_batch, cu_seqlens_tuple)
+
+    def forward(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        bsz: int,
+        cu_seqlens: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        r"""
+        Args:
+            cu_seqlens: [b]
+        Returns:
+             [b * s, h, head_size]
+        """
+        if self.flatten_batch:
+            assert bsz == 1, "flatten_batch is True, bsz must be 1"
+
+        assert q.dim() == 3, q.shape
+
+        s = q.shape[0] // bsz
+
+        # [b, 1, s, s]
+        if attention_mask is None:
+            attention_mask = self.generate_patch_attention_mask(
+                s, cu_seqlens, flatten_batch=self.flatten_batch
+            )
+
+        if attention_mask is None:
+            if self.softmax_in_single_precision:
+                raise RuntimeError("Empty attention mask")
+        else:
+            attention_mask = attention_mask.to(device=q.device)
+
+        q, k, v = [rearrange(x, "(b s) h d -> b h s d", b=bsz) for x in [q, k, v]]
+
+        if self.softmax_in_single_precision:
+            k = rearrange(k, "b h s d -> b h d s")
+            attn_weights = torch.matmul(q, k) * self.scale
+            del k
+            # masking
+            attention_mask = (~attention_mask) * torch.finfo(q.dtype).min
+            attn_weights = attn_weights + attention_mask
+            del attention_mask
+            # full-precision
+            attn_weights = nn.functional.softmax(
+                attn_weights, dim=-1, dtype=torch.float32
+            ).to(q.dtype)
+            attn_weights = nn.functional.dropout(
+                attn_weights, p=self.dropout, training=False
+            )
+            output = torch.matmul(attn_weights, v)
+            del attn_weights, v
+        else:
+            # SDPA
+            # [b, h, s, head_size]
+            output = F.scaled_dot_product_attention(
+                q,
+                k,
+                v,
+                attn_mask=attention_mask,
+                dropout_p=self.dropout,
+                is_causal=False,
+            )
+
+        # [b, h, s, head_size] --> [b * s, h, head_size]
+        output = rearrange(output, "b h s d -> (b s) h d")
+
+        return output
+
+
+class VisionTritonAttention(nn.Module):
+    """
+    Triton-implemented attention without a causal mask
+    """
+
+    def __init__(
+        self,
+        **kwargs,
+    ):
+        super().__init__()
+        use_data_parallel = (
+            kwargs["use_data_parallel"] if "use_data_parallel" in kwargs else False
+        )
+        self.tp_size = 1 if use_data_parallel else get_attention_tp_size()
+
+    def forward(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        cu_seqlens: torch.Tensor | SingletonCache | None,
+        bsz: int,
+        seq_len: int,
+        **kwargs,
+    ) -> torch.Tensor:
+        r"""
+        Args:
+            cu_seqlens: [b]
+        Returns:
+             [b * s, h, head_size]
+        """
+        if envs.SGLANG_VIT_ENABLE_CUDA_GRAPH.get():
+            if "output_ws" not in kwargs:
+                raise RuntimeError("output_ws should be prepared for cuda-graph mode")
+
+            if not isinstance(cu_seqlens, list):
+                raise RuntimeError("cuda-graph mode cu_seqlens should be a list")
+
+            output = kwargs["output_ws"]
+            context_attention_fwd(
+                q,
+                k,
+                v,
+                output,
+                cu_seqlens[0],
+                cu_seqlens[1],
+                cu_seqlens[2],
+                is_causal=False,
+            )
+        else:
+            cu_seqlens = resolve_seqlens(cu_seqlens, bsz, seq_len, device=q.device)
+
+            # [b * s, head, head_size]
+            output = torch.empty_like(q)
+
+            seq_lens = cu_seqlens[1:] - cu_seqlens[:-1]
+            max_seqlen = seq_lens.max().item()
+            context_attention_fwd(
+                q,
+                k,
+                v,
+                output,
+                cu_seqlens.cuda(),
+                seq_lens.cuda(),
+                max_seqlen,
+                is_causal=False,
+            )
+
+        return output
+
+
+class VisionFlash3Attention(nn.Module):
+    def __init__(
+        self,
+        **kwargs,
+    ):
+        if not _is_cuda:
+            raise Exception("VisionFlash3Attention is only available for cuda")
+        super().__init__()
+        use_data_parallel = (
+            kwargs["use_data_parallel"] if "use_data_parallel" in kwargs else False
+        )
+        self.tp_size = 1 if use_data_parallel else get_attention_tp_size()
+
+    def forward(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        cu_seqlens: torch.Tensor | SingletonCache | None,
+        bsz: int,
+        seq_len: int,
+        **kwargs,
+    ) -> torch.Tensor:
+        r"""
+        Args:
+            cu_seqlens: [b]
+        Returns:
+             [b * s, h, head_size]
+        """
+        if envs.SGLANG_VIT_ENABLE_CUDA_GRAPH.get():
+            max_seqlen = cu_seqlens[1]
+            output = flash_attn_func(
+                q,
+                k,
+                v,
+                cu_seqlens_q=cu_seqlens[0],
+                cu_seqlens_k=cu_seqlens[0],
+                max_seqlen_q=max_seqlen,
+                max_seqlen_k=max_seqlen,
+            )
+        else:
+            cu_seqlens = resolve_seqlens(cu_seqlens, bsz, seq_len, device=q.device)
+            cu_seqlens = cu_seqlens.to(dtype=torch.int32).to(q.device)
+            seq_lens = cu_seqlens[1:] - cu_seqlens[:-1]
+            max_seqlen = seq_lens.max().item()
+
+            output = flash_attn_func(
+                q,
+                k,
+                v,
+                cu_seqlens_q=cu_seqlens,
+                cu_seqlens_k=cu_seqlens,
+                max_seqlen_q=max_seqlen,
+                max_seqlen_k=max_seqlen,
+            )
+
+        return output
+
+
+class VisionFlash4Attention(nn.Module):
+    def __init__(
+        self,
+        **kwargs,
+    ):
+        if not _is_cuda:
+            raise Exception("VisionFlash4Attention is only available for cuda")
+        super().__init__()
+
+    def forward(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        cu_seqlens: torch.Tensor | SingletonCache | None,
+        bsz: int,
+        seq_len: int,
+        **kwargs,
+    ) -> torch.Tensor:
+        r"""
+        Args:
+            cu_seqlens: [b]
+        Returns:
+             [b * s, h, head_size]
+        """
+        if cu_seqlens is None:
+            cu_seqlens = _get_cu_seqlens_for_shape(bsz, seq_len, device=q.device)
+        elif isinstance(cu_seqlens, SingletonCache):
+            if cu_seqlens.empty():
+                cu_seqlens.set_data(
+                    _get_cu_seqlens_for_shape(bsz, seq_len, device=q.device)
+                )
+            cu_seqlens = cu_seqlens.get_data()
+
+        cu_seqlens = cu_seqlens.to(dtype=torch.int32).to(q.device)
+        seq_lens = cu_seqlens[1:] - cu_seqlens[:-1]
+        max_seqlen = seq_lens.max().item()
+
+        output = flash_attn_func(
+            q,
+            k,
+            v,
+            cu_seqlens_q=cu_seqlens,
+            cu_seqlens_k=cu_seqlens,
+            max_seqlen_q=max_seqlen,
+            max_seqlen_k=max_seqlen,
+            ver=4,
+        )
+
+        return output
+
+
+class VisionFlashInferAttention(nn.Module):
+    def __init__(
+        self,
+        **kwargs,
+    ):
+        if not _is_cuda:
+            raise Exception("VisionFlashInferAttention is only available for cuda")
+        super().__init__()
+        self.workspace_buffer = (
+            kwargs["workspace_buffer"] if "workspace_buffer" in kwargs else None
+        )
+
+    def forward(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        cu_seqlens: torch.Tensor | SingletonCache | None,
+        bsz: int,
+        seq_len: int,
+        **kwargs,
+    ) -> torch.Tensor:
+        r"""
+        Args:
+            cu_seqlens: [b]
+        Returns:
+             [b * s, h, head_size]
+        """
+        if "sequence_lengths" not in kwargs:
+            raise RuntimeError(
+                "sequence_lengths should be prepared for vision flashinfer_cudnn attention backend"
+            )
+        if "max_seqlen" not in kwargs:
+            raise RuntimeError(
+                "max_seqlen should be prepared for vision flashinfer_cudnn attention backend"
+            )
+
+        sequence_lengths = kwargs["sequence_lengths"]  # (B_padded,) or (B_padded,1,1,1)
+        max_seqlen = kwargs["max_seqlen"]
+
+        # max_seqlen must be python int
+        if isinstance(max_seqlen, torch.Tensor):
+            if max_seqlen.is_cuda:
+                max_seqlen = int(max_seqlen.detach().cpu().item())
+            else:
+                max_seqlen = int(max_seqlen.item())
+        else:
+            max_seqlen = int(max_seqlen)
+
+        # flatten if caller gives (b, s, h, d)
+        is_reshaped = q.dim() == 4
+        if is_reshaped:
+            reshape_batch_size = q.shape[0]
+            q, k, v = (rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v])
+
+        if not isinstance(cu_seqlens, torch.Tensor):
+            raise RuntimeError(
+                "flashinfer_cudnn expects packed indptrs as a torch.Tensor"
+            )
+
+        # sequence_lengths -> (B,)
+        if not isinstance(sequence_lengths, torch.Tensor):
+            raise RuntimeError("sequence_lengths must be a torch.Tensor")
+        seq_lens_1d = sequence_lengths.view(-1).to(device=q.device, dtype=torch.int32)
+        B = int(seq_lens_1d.numel())
+
+        # cu_seqlens contains packed *element indptrs*:
+        # [qk_indptr(B+1), v_indptr(B+1), o_indptr(B+1)] => total 3*(B+1)
+        cu_seqlens_1d = cu_seqlens.view(-1).to(device=q.device, dtype=torch.int32)
+        expected = 3 * (B + 1)
+        if int(cu_seqlens_1d.numel()) != expected:
+            raise RuntimeError(
+                f"packed indptr numel mismatch: got {cu_seqlens_1d.numel()}, expected {expected} (= 3*(B+1))"
+            )
+
+        split = B + 1
+        indptr_qk = cu_seqlens_1d[:split].view(split, 1, 1, 1)
+        indptr_v = cu_seqlens_1d[split : 2 * split].view(split, 1, 1, 1)
+        indptr_o = cu_seqlens_1d[2 * split :].view(split, 1, 1, 1)
+
+        # cuDNN style: (B,1,1,1)
+        seq_lens_4d = seq_lens_1d.view(B, 1, 1, 1)
+
+        # indptr are in ELEMENT offsets (not token offsets)
+        token_width_q = int(q.shape[1] * q.shape[2])  # heads * head_dim on this rank
+        total_elems_q = int(q.numel())
+
+        # check each real sequence fits
+        # (skip padded tail where seq_len==0)
+        start_elems = indptr_qk.view(-1)[:-1]  # (B,)
+        end_elems = start_elems + seq_lens_1d * token_width_q
+        if (end_elems > total_elems_q).any():
+            raise RuntimeError("offset + len out of bounds; packed indptr is wrong")
+
+        _, _, head_size = q.shape
+        scale = head_size**-0.5
+
+        output, _ = cudnn_batch_prefill_with_kv_cache(
+            q,
+            k,
+            v,
+            scale,
+            self.workspace_buffer,
+            max_token_per_sequence=max_seqlen,
+            max_sequence_kv=max_seqlen,
+            actual_seq_lens_q=seq_lens_4d,
+            actual_seq_lens_kv=seq_lens_4d,
+            causal=False,
+            return_lse=True,
+            batch_offsets_q=indptr_qk,
+            batch_offsets_k=indptr_qk,
+            batch_offsets_v=indptr_v,
+            batch_offsets_o=indptr_o,
+            is_cuda_graph_compatible=True,
+        )
+
+        if is_reshaped:
+            output = rearrange(output, "(b s) h d -> b s h d", b=reshape_batch_size)
+
+        return output
+
+
+class VisionAiterAttention(nn.Module):
+    def __init__(
+        self,
+        **kwargs,
+    ):
+        if not _is_hip:
+            raise Exception("aiter_attn is only available for AMD")
+        try:
+            from aiter import flash_attn_varlen_func as aiter_flash_attn_varlen_func
+        except ImportError as e:
+            raise ImportError(
+                "aiter is AMD specific kernel library. Please make sure aiter is installed on your AMD device."
+            ) from e
+
+        self.flash_attn_varlen_func = aiter_flash_attn_varlen_func
+        super().__init__()
+
+    def forward(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        cu_seqlens: torch.Tensor | SingletonCache | None,
+        bsz: int,
+        seq_len: int,
+        **kwargs,
+    ) -> torch.Tensor:
+        cu_seqlens = resolve_seqlens(cu_seqlens, bsz, seq_len, device=q.device)
+
+        cu_seqlens = cu_seqlens.to(dtype=torch.int32).to(q.device)
+        seq_lens = cu_seqlens[1:] - cu_seqlens[:-1]
+        max_seqlen = seq_lens.max().item()
+
+        return self.flash_attn_varlen_func(
+            q=q,
+            k=k,
+            v=v,
+            cu_seqlens_q=cu_seqlens,
+            cu_seqlens_k=cu_seqlens,
+            max_seqlen_q=max_seqlen,
+            max_seqlen_k=max_seqlen,
+        )
+
+
+class VisionAscendAttention(nn.Module):
+
+    def __init__(
+        self,
+        **kwargs,
+    ):
+        if not _is_npu:
+            raise Exception("VisionAscendAttention is only available for ascend npu")
+        super().__init__()
+
+    def forward(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        cu_seqlens: torch.Tensor | SingletonCache | None,
+        bsz: int,
+        seq_len: int,
+        **kwargs,
+    ) -> torch.Tensor:
+        r"""
+        Args:
+            cu_seqlens: [b]
+        Returns:
+             [b * s, h, head_size]
+        """
+        cu_seqlens = resolve_seqlens(cu_seqlens, bsz, seq_len, device="cpu")
+
+        seq_lens = cu_seqlens[1:] - cu_seqlens[:-1]
+        if seq_lens.is_npu:
+            # cu_seqlens must be on cpu because of operator restriction
+            seq_lens = seq_lens.to("cpu")
+        _, num_heads, head_size = q.shape
+        num_kv_heads = k.shape[1]
+        output = torch.empty_like(q)
+
+        # operator requires pta version >= 2.5.1
+        torch_npu._npu_flash_attention_unpad(
+            query=q,
+            key=k,
+            value=v,
+            seq_len=seq_lens.to(torch.int32),
+            scale_value=head_size**-0.5,
+            num_heads=num_heads,
+            num_kv_heads=num_kv_heads,
+            out=output,
+        )
+
+        return output
+
+
+QKV_BACKEND_IMPL = {
+    "triton_attn": VisionTritonAttention,
+    "sdpa": VisionSdpaAttention,
+    "fa3": VisionFlash3Attention,
+    "fa4": VisionFlash4Attention,
+    "flashinfer_cudnn": VisionFlashInferAttention,
+    "ascend_attn": VisionAscendAttention,
+    "aiter_attn": VisionAiterAttention,
+}
+
+
+class VisionAttention(nn.Module):
+    r"""
+        Multi-headed attention without any cache, mostly used for multimodal transformers.
+
+
+    Args:
+        use_qkv_parallel (bool, optional): If True, use QKV-parallel attention.
+        softmax_in_single_precision (bool, default to False):
+            if ``True``, the softmax will be performed in single-precision
+            Otherwise, it will be performed in half-precision
+
+    """
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        projection_size: int,
+        use_qkv_parallel: bool,
+        qkv_backend: Optional[str] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        dropout: float = 0.0,
+        softmax_in_single_precision: bool = False,
+        flatten_batch: bool = False,
+        prefix: str = "",
+        proj_bias: bool = True,
+        num_dummy_heads: int = 0,
+        qkv_bias: bool = True,
+        qk_normalization: bool = False,
+        qk_normalization_by_head_size: bool = False,
+        layer_norm_eps: float = 1e-06,
+        customized_position_embedding_applier: Callable[
+            [torch.Tensor, torch.Tensor, Any, Any], Tuple[torch.Tensor, torch.Tensor]
+        ] = None,
+        use_data_parallel: bool = False,
+        use_dp_attention_reduce: bool = False,
+        aux_stream: Optional[torch.cuda.Stream] = None,
+        workspace_buffer: Optional[torch.Tensor] = None,
+        **kwargs,
+    ):
+        super().__init__()
+        self.tp_size = 1 if use_data_parallel else get_attention_tp_size()
+        self.tp_rank = 0 if use_data_parallel else get_attention_tp_rank()
+        self.dropout = dropout
+        self.head_size = embed_dim // num_heads
+        self.hidden_size_per_attention_head = dist_utils.divide(
+            projection_size, num_heads
+        )
+        self.num_attention_heads_per_partition = dist_utils.divide(
+            num_dummy_heads + num_heads, self.tp_size
+        )
+        self.num_attention_kv_heads_per_partition = dist_utils.divide(
+            num_dummy_heads + num_heads, self.tp_size
+        )
+
+        self.q_size = self.num_attention_heads_per_partition * self.head_size
+        self.kv_size = self.num_attention_kv_heads_per_partition * self.head_size
+
+        self.qk_normalization = qk_normalization
+        self.qk_normalization_by_head_size = qk_normalization_by_head_size
+
+        # Additional dummy heads are used to enable TP for common GPU counts.
+        self.dummy_dim = (num_dummy_heads + num_heads) * self.head_size
+
+        if self.qk_normalization:
+            self.q_norm, self.k_norm = self._init_qk_norm(
+                self.dummy_dim, layer_norm_eps, embed_dim
+            )
+
+        elif self.qk_normalization_by_head_size:
+            self.q_norm, self.k_norm = self._init_qk_norm(
+                self.head_size, layer_norm_eps
+            )
+
+        # Select attention backend via a unified method
+        _passed_backend = qkv_backend
+        qkv_backend = self._determine_attention_backend(_passed_backend)
+        if (
+            get_global_server_args().mm_attention_backend is None
+            and _passed_backend is None
+        ):
+            print_info_once(f"Multimodal attention backend not set. Use {qkv_backend}.")
+        print_info_once(f"Using {qkv_backend} as multimodal attention backend.")
+
+        self.customized_position_embedding_applier = (
+            customized_position_embedding_applier
+        )
+        self.qkv_backend = QKV_BACKEND_IMPL[qkv_backend](
+            head_dim=self.head_size,
+            num_heads=self.num_attention_heads_per_partition,
+            num_kv_heads=self.num_attention_kv_heads_per_partition,
+            dropout=dropout,
+            flatten_batch=flatten_batch,
+            softmax_in_single_precision=softmax_in_single_precision,
+            use_data_parallel=use_data_parallel,
+            workspace_buffer=workspace_buffer,
+        )
+
+        self.use_qkv_parallel = use_qkv_parallel
+        if use_qkv_parallel:
+            self.qkv_proj = QKVParallelLinear(
+                hidden_size=embed_dim,
+                head_size=self.head_size,
+                total_num_heads=num_dummy_heads + num_heads,
+                total_num_kv_heads=num_dummy_heads + num_heads,
+                bias=qkv_bias,
+                quant_config=quant_config,
+                tp_rank=self.tp_rank,
+                tp_size=self.tp_size,
+                prefix=add_prefix("qkv_proj", prefix),
+            )
+        else:
+            self.qkv_proj = ColumnParallelLinear(
+                input_size=embed_dim,
+                output_size=3 * self.dummy_dim,
+                bias=qkv_bias,
+                quant_config=quant_config,
+                tp_rank=self.tp_rank,
+                tp_size=self.tp_size,
+                prefix=add_prefix("qkv_proj", prefix),
+            )
+        self.proj = RowParallelLinear(
+            input_size=self.dummy_dim,
+            output_size=embed_dim,
+            bias=proj_bias,
+            quant_config=quant_config,
+            tp_rank=self.tp_rank,
+            tp_size=self.tp_size,
+            prefix=add_prefix("proj", prefix),
+            use_dp_attention_reduce=use_dp_attention_reduce,
+        )
+
+        self.workspace_buffer = workspace_buffer
+        self.aux_stream = aux_stream
+        self.ln_events = [torch.cuda.Event(), torch.cuda.Event()] if aux_stream else []
+
+    def _init_qk_norm(
+        self, norm_dim: int, eps: float, var_hidden_size: Optional[int] = None
+    ):
+        norm_kwargs = (
+            dict(
+                weight_dtype=torch.float32,
+                cast_x_before_out_mul=True,
+            )
+            if get_global_server_args().rl_on_policy_target is not None
+            else {}
+        )
+        q_norm = RMSNorm(
+            norm_dim,
+            eps=eps,
+            var_hidden_size=var_hidden_size,
+            **norm_kwargs,
+        )
+        k_norm = RMSNorm(
+            norm_dim,
+            eps=eps,
+            var_hidden_size=var_hidden_size,
+            **norm_kwargs,
+        )
+        return q_norm, k_norm
+
+    def _determine_attention_backend(self, passed_backend: Optional[str]) -> str:
+        """Decide the multimodal attention backend string.
+
+        Priority: server args override > constructor arg > platform default.
+
+        Platform defaults:
+        - CUDA: "triton_attn"
+        - Non-CUDA: "sdpa"
+        """
+        override_backend = get_global_server_args().mm_attention_backend
+        if override_backend is not None:
+            backend = override_backend
+        elif passed_backend is not None:
+            backend = passed_backend
+        elif is_cuda():
+            major, minor = get_device_capability()
+            if major == 9:
+                backend = "fa3"
+            else:
+                backend = "triton_attn"
+        elif _is_hip:
+            if get_device_capability() >= (9, 4) and _use_aiter:
+                backend = "aiter_attn"
+            else:
+                backend = "triton_attn"
+        else:
+            backend = "sdpa"
+        if backend == "fa3" and is_blackwell_supported():
+            raise ValueError("The 'fa3' backend is not supported on Blackwell GPUs")
+
+        return backend
+
+    def _apply_qk_norm_head_size(self, q: torch.Tensor, k: torch.Tensor):
+        """apply qk norm for GLM-OCR vit attn"""
+        q_by_head = q.reshape(-1, self.head_size)
+        q_by_head = self.q_norm(q_by_head)
+        k_by_head = k.reshape(-1, self.head_size)
+        k_by_head = self.k_norm(k_by_head)
+        q = q_by_head.view(q.shape)
+        k = k_by_head.view(k.shape)
+        return q, k
+
+    def _apply_qk_norm(self, q: torch.Tensor, k: torch.Tensor):
+        """apply qk norm for internvl vit attn"""
+
+        def q_l2norm():
+            q_ = q.flatten(1, 2)
+            if self.tp_size > 1:
+                q_ = tensor_model_parallel_all_gather(q_.contiguous())
+            q_ = self.q_norm(q_)
+            if self.tp_size > 1:
+                splitter = partial(
+                    split_tensor_along_last_dim, num_partitions=self.tp_size
+                )
+                q_ = splitter(q_)[self.tp_rank]
+            q_ = q_.unflatten(-1, (-1, self.head_size))
+            return q_
+
+        def k_l2norm():
+            k_ = k.flatten(1, 2)
+            if self.tp_size > 1:
+                k_ = tensor_model_parallel_all_gather(k_.contiguous())
+            k_ = self.k_norm(k_)
+            if self.tp_size > 1:
+                splitter = partial(
+                    split_tensor_along_last_dim, num_partitions=self.tp_size
+                )
+                k_ = splitter(k_)[self.tp_rank]
+            k_ = k_.unflatten(-1, (-1, self.head_size))
+            return k_
+
+        with with_multi_stream(True):
+            q, k = maybe_execute_in_parallel(
+                q_l2norm,
+                k_l2norm,
+                self.ln_events,
+                self.aux_stream,
+            )
+        return q, k
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        cu_seqlens: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        rotary_pos_emb_cos: Optional[torch.Tensor] = None,
+        rotary_pos_emb_sin: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        r"""
+        Args:
+            x: [b, s, embed_dim]
+            cu_seqlens: [b]
+        Returns:
+             [s, b, head * head_size]
+        """
+        if x.dim() == 2:
+            x = x.unsqueeze(0)
+        assert x.dim() == 3, x.shape
+        if (
+            get_global_server_args().rl_on_policy_target is not None
+            and position_embeddings is not None
+        ):
+            assert isinstance(position_embeddings, tuple), (
+                "expected position_embeddings to be a tuple of two tensors,\n"
+                f"but got {type(position_embeddings)}, change if needed"
+            )
+            position_embeddings = tuple(p.to(x.dtype) for p in position_embeddings)
+        x_shape = x.shape
+        bsz, s, _ = x_shape
+        head = self.num_attention_heads_per_partition
+        kv_head = self.num_attention_kv_heads_per_partition
+
+        attn_output_ws = kwargs["output_ws"] if "output_ws" in kwargs else None
+        max_seqlen = kwargs["max_seqlen"] if "max_seqlen" in kwargs else None
+        sequence_lengths = (
+            kwargs["sequence_lengths"] if "sequence_lengths" in kwargs else None
+        )
+        if self.use_qkv_parallel:
+            # [b, s, embed_dim] --> [b, s, embed_dim]
+            qkv, _ = self.qkv_proj(x)
+            q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+
+            # [b, s, embed_dim] --> [b * s, head, head_size]
+            q = q.reshape(bsz * s, head, -1).contiguous()
+            k = k.reshape(bsz * s, kv_head, -1).contiguous()
+            v = v.reshape(bsz * s, kv_head, -1).contiguous()
+            if self.qk_normalization_by_head_size:
+                q, k = self._apply_qk_norm_head_size(q, k)
+        else:
+            # [b, s, embed_dim] --> [s, b, embed_dim]
+            x = rearrange(x, "b s ... -> s b ...")
+            # [s, b, embed_dim] --> [s, b, head * 3 * head_size]
+            qkv, _ = self.qkv_proj(x)
+
+            # [s, b, head, head_dim_sum]
+            new_x_shape = qkv.size()[:-1] + (
+                head,
+                self.q_size + 2 * self.kv_size,
+            )
+            qkv = qkv.view(*new_x_shape)
+
+            # [s, b, head, 3 * head_size] --> 3 [s, b, head, head_size]
+            q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+
+            # [s, b, head, head_size] --> [b, s, head, head_size]
+            q, k, v = [
+                rearrange(x, "s b ... -> b s ...").contiguous() for x in (q, k, v)
+            ]
+
+            if self.qk_normalization_by_head_size:
+                q, k = self._apply_qk_norm_head_size(q, k)
+
+        cos = None
+        sin = None
+
+        if position_embeddings is not None:
+            if self.customized_position_embedding_applier is not None:
+                q, k = self.customized_position_embedding_applier(
+                    q, k, position_embeddings, x_shape
+                )
+            else:
+                cos, sin = position_embeddings
+        elif rotary_pos_emb_cos is not None and rotary_pos_emb_sin is not None:
+            cos = rotary_pos_emb_cos
+            sin = rotary_pos_emb_sin
+
+        if cos is not None and sin is not None:
+            original_shape = q.shape
+
+            # [total_tokens, head, head_size]
+            q = q.view(-1, head, self.head_size)
+            k = k.view(-1, head, self.head_size)
+
+            if cos.size(-1) * 2 == self.head_size:
+                cos = torch.cat([cos, cos], dim=-1)
+                sin = torch.cat([sin, sin], dim=-1)
+
+            q, k = apply_rotary_pos_emb(q, k, cos, sin)
+            q = q.view(original_shape)
+            k = k.view(original_shape)
+
+        if q.dim() == 4:
+            # [b, s, head, head_size] --> [b * s, head, head_size]
+            q = rearrange(q, "b s ... -> (b s) ...")
+        if k.dim() == 4:
+            # [b, s, head, head_size] --> [b * s, head, head_size]
+            k = rearrange(k, "b s ... -> (b s) ...")
+        if v.dim() == 4:
+            # [b, s, head, head_size] --> [b * s, head, head_size]
+            v = rearrange(v, "b s ... -> (b s) ...")
+
+        assert q.dim() == 3, q.dim()
+        assert k.dim() == 3, k.dim()
+        assert v.dim() == 3, v.dim()
+
+        # internvl
+        if self.qk_normalization and not self.qk_normalization_by_head_size:
+            # jit kernel
+            if can_use_jit_qk_norm(self.head_size, q.dtype):
+
+                # q: [tokens, head, head_size]  ->  [tokens, embed_dim]
+                head_dim_for_norm = head * self.head_size
+
+                q, k = apply_qk_norm(
+                    q=q,
+                    k=k,
+                    q_norm=self.q_norm,
+                    k_norm=self.k_norm,
+                    head_dim=head_dim_for_norm,
+                    alt_stream=self.aux_stream,
+                )
+
+            else:
+                q, k = self._apply_qk_norm(q, k)
+
+        output = self.qkv_backend.forward(
+            q=q,
+            k=k,
+            v=v,
+            bsz=bsz,
+            seq_len=s,
+            cu_seqlens=cu_seqlens,
+            attention_mask=attention_mask,
+            sequence_lengths=sequence_lengths,
+            max_seqlen=max_seqlen,
+            output_ws=attn_output_ws,
+        )
+
+        assert output.dim() == 3, output.shape
+
+        if self.use_qkv_parallel:
+            # [b * s, h, head_size] --> [b, s, h * head_size]
+            output = rearrange(output, "(b s) ... h d -> b s ... (h d)", b=bsz)
+
+            # [b, s, h * head_size] --> [b, s, h * head_size]
+            output, _ = self.proj(output)
+        else:
+            # [b * s, h, head_size] --> [s, b, h * head_size]
+            context_layer = rearrange(
+                output, "(b s) h d -> s b (h d)", b=bsz, s=s
+            ).contiguous()
+
+            # [s, b, h * head_size] --> [s, b, h * head_size]
+            output, _ = self.proj(context_layer)
+
+            # [s, b, h * head_size] --> [b, s, h * head_size]
+            output = output.view(bsz, s, -1)
+
+        return output
diff --git a/sglang/python/sglang/srt/layers/attention/vision_utils.py b/sglang/python/sglang/srt/layers/attention/vision_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..ecccb1f852894a725eaeeda759298d13679c8915
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/vision_utils.py
@@ -0,0 +1,65 @@
+"""Utility functions for vision attention layers."""
+
+import torch
+
+from sglang.srt.layers.dp_attention import get_attention_tp_size
+
+
+def update_vit_attn_dummy_heads_config(config):
+    """Update HF config to ensure vision attention num_attention_heads is divisible by tp_size"""
+    tp_size = get_attention_tp_size()
+    num_heads = getattr(
+        config.vision_config,
+        "num_heads",
+        getattr(config.vision_config, "num_attention_heads", None),
+    )
+    head_dim = config.vision_config.hidden_size // num_heads
+    num_dummy_heads = 0
+
+    if num_heads % tp_size != 0:
+        num_dummy_heads = ((num_heads + tp_size - 1) // tp_size) * tp_size - num_heads
+
+    setattr(config.vision_config, "head_dim", head_dim)
+    setattr(config.vision_config, "num_dummy_heads", num_dummy_heads)
+
+
+def pad_vit_attn_dummy_heads(config, name: str, loaded_weight: torch.Tensor):
+    """Pad attention qkv weights for dummy heads"""
+    num_dummy_heads = config.vision_config.num_dummy_heads
+    if num_dummy_heads == 0:
+        return loaded_weight
+    head_dim = config.vision_config.head_dim
+
+    if "attn.qkv_proj" in name:
+        wq, wk, wv = loaded_weight.chunk(3, dim=0)
+        if name.endswith(".weight"):
+            dummy_shape = [num_dummy_heads, head_dim, wq.shape[-1]]
+        elif name.endswith(".bias"):
+            dummy_shape = [num_dummy_heads, head_dim]
+        else:
+            raise RuntimeError(f"Unsupported weight with name={name}")
+        pad_func = lambda x: torch.cat(
+            [x.unflatten(0, (-1, head_dim)), x.new_zeros(dummy_shape)], dim=0
+        ).flatten(0, 1)
+        wq, wk, wv = pad_func(wq), pad_func(wk), pad_func(wv)
+        loaded_weight = torch.cat([wq, wk, wv], dim=0)
+    elif any([_ in name for _ in ["attn.q_proj", "attn.k_proj", "attn.v_proj"]]):
+        if name.endswith(".weight"):
+            dummy_shape = [num_dummy_heads, head_dim, loaded_weight.shape[-1]]
+        elif name.endswith(".bias"):
+            dummy_shape = [num_dummy_heads, head_dim]
+        else:
+            raise RuntimeError(f"Unsupported weight with name={name}")
+        padded_weight = loaded_weight.new_zeros(dummy_shape)
+        loaded_weight = torch.cat(
+            [loaded_weight.unflatten(0, (-1, head_dim)), padded_weight], dim=0
+        ).flatten(0, 1)
+    elif "attn.proj.weight" in name:
+        padded_weight = loaded_weight.new_zeros(
+            loaded_weight.shape[0], head_dim * num_dummy_heads
+        )
+        loaded_weight = torch.cat([loaded_weight, padded_weight], dim=-1)
+    elif "attn.q_norm.weight" in name or "attn.k_norm.weight" in name:
+        padded_weight = loaded_weight.new_zeros(head_dim * num_dummy_heads)
+        loaded_weight = torch.cat([loaded_weight, padded_weight], dim=0)
+    return loaded_weight
diff --git a/sglang/python/sglang/srt/layers/attention/wave_backend.py b/sglang/python/sglang/srt/layers/attention/wave_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..9669a4568106f7f43d7639ec095c18c29acbd673
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/wave_backend.py
@@ -0,0 +1,627 @@
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Optional
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
+from sglang.srt.layers.dp_attention import get_attention_tp_size
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+from sglang.srt.utils import get_bool_env_var, get_device_core_count
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.radix_attention import RadixAttention
+    from sglang.srt.model_executor.model_runner import ModelRunner
+    from sglang.srt.speculative.spec_info import SpecInput
+
+logger = logging.getLogger(__name__)
+
+
+@triton.jit
+def get_num_kv_splits_triton(
+    num_kv_splits_ptr,
+    seq_lens_ptr,
+    num_seq,
+    num_group,
+    num_head,
+    num_kv_head,
+    max_kv_splits,
+    device_core_count,
+    MAX_NUM_SEQ: tl.constexpr,
+):
+    # TODO: this method is tunable, we need more online serving data to tune it
+    offs_seq = tl.arange(0, MAX_NUM_SEQ)
+    mask_seq = offs_seq < num_seq
+
+    seq_lens = tl.load(seq_lens_ptr + offs_seq, mask=mask_seq, other=0)
+    max_seq_len = tl.max(seq_lens)
+    seq_lens = tl.load(seq_lens_ptr + offs_seq, mask=mask_seq, other=max_seq_len)
+    min_seq_len = tl.min(seq_lens)
+    if max_seq_len * 8 < min_seq_len * 10:
+        min_seq_len = max_seq_len
+    max_kv_splits_1 = tl.minimum(tl.cdiv(max_seq_len, min_seq_len), max_kv_splits)
+    kv_chunk_size_1 = tl.cdiv(max_seq_len, max_kv_splits_1)
+
+    # NOTE: this is a hack to let num_kv_split grows up with seqlen gradually
+    ext_seq_len = tl.cast(max_seq_len, tl.float32) / 64.0
+    ext_device_core_count = tl.cast(
+        device_core_count * tl.maximum(tl.log2(ext_seq_len), 1.0), tl.int32
+    )
+    block_h, num_kv_group = 16, num_head // num_kv_head
+    if num_kv_group == 1:
+        token_grid = num_seq * num_group * num_head
+    else:
+        # from triton_ops/decode_attention.py:_decode_grouped_att_m_fwd
+        block_h = tl.minimum(block_h, num_kv_group)
+        token_grid = num_seq * num_group * tl.cdiv(num_head, block_h)
+    max_kv_splits_2 = tl.minimum(
+        tl.cdiv(ext_device_core_count, token_grid), max_kv_splits
+    )
+    kv_chunk_size_2 = tl.cdiv(max_seq_len, max_kv_splits_2)
+
+    num_kv_splits = tl.maximum(
+        tl.cdiv(seq_lens, kv_chunk_size_1), tl.cdiv(seq_lens, kv_chunk_size_2)
+    )
+
+    offs_token = offs_seq * num_group
+    mask_token = offs_token < num_seq * num_group
+    for i in range(0, num_group):
+        tl.store(num_kv_splits_ptr + i + offs_token, num_kv_splits, mask=mask_token)
+
+
+@dataclass
+class ForwardMetadata:
+    attn_logits: torch.Tensor
+    attn_lse: torch.Tensor
+    max_extend_len: int
+    num_kv_splits: torch.Tensor
+    kv_indptr: torch.Tensor
+    kv_indices: torch.Tensor
+    qo_indptr: torch.Tensor
+    custom_mask: torch.Tensor
+    mask_indptr: torch.Tensor
+
+
+class WaveAttnBackend(AttentionBackend):
+    def __init__(
+        self,
+        model_runner: ModelRunner,
+        skip_prefill: bool = False,
+        kv_indptr_buf: Optional[torch.Tensor] = None,
+    ):
+        # Lazy import to avoid the initialization of cuda context
+        from sglang.srt.layers.attention.wave_ops.decode_attention import (
+            decode_attention_fwd,
+        )
+        from sglang.srt.layers.attention.wave_ops.extend_attention import (
+            extend_attention_wave,
+        )
+
+        super().__init__()
+
+        # Set unique cache dir for each process to avoid cache write races
+        import wave_lang.kernel.wave.cache as cache
+
+        base_cache_dir = cache.CACHE_BASE_DIR
+        new_dir = base_cache_dir / f"worker_{model_runner.tp_rank}"
+        logger.info(f"Setting Wave cache dir: {new_dir}")
+        cache.CACHE_BASE_DIR = new_dir
+
+        self.decode_attention_fwd = decode_attention_fwd
+        self.extend_attention_fwd = extend_attention_wave
+
+        self.skip_prefill = skip_prefill
+
+        max_bs = model_runner.req_to_token_pool.size
+
+        if kv_indptr_buf is None:
+            self.kv_indptr = torch.zeros(
+                (max_bs + 1,), dtype=torch.int32, device=model_runner.device
+            )
+        else:
+            self.kv_indptr = kv_indptr_buf
+
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+
+        if not self.skip_prefill:
+            self.qo_indptr = torch.zeros(
+                (max_bs + 1,), dtype=torch.int32, device=model_runner.device
+            )
+
+            self.mask_indptr = torch.zeros(
+                (max_bs + 1,), dtype=torch.int64, device=model_runner.device
+            )
+
+        self.num_draft_tokens = model_runner.server_args.speculative_num_draft_tokens
+
+        self.num_head = (
+            model_runner.model_config.num_attention_heads // get_attention_tp_size()
+        )
+        self.num_kv_head = model_runner.model_config.get_num_kv_heads(
+            get_attention_tp_size()
+        )
+
+        self.static_kv_splits = get_bool_env_var(
+            "SGLANG_TRITON_DECODE_ATTN_STATIC_KV_SPLITS", "false"
+        )
+        self.max_kv_splits = model_runner.server_args.triton_attention_num_kv_splits
+        self.v_head_dim = model_runner.token_to_kv_pool.get_value_buffer(0).shape[-1]
+
+        self.forward_metadata: ForwardMetadata = None
+
+        self.max_context_len = model_runner.model_config.context_len
+
+        self.device = model_runner.device
+        self.device_core_count = get_device_core_count(model_runner.gpu_id)
+
+    def get_num_kv_splits(
+        self,
+        num_kv_splits: torch.Tensor,
+        seq_lens: torch.Tensor,
+    ):
+        num_token, num_seq = num_kv_splits.shape[0], seq_lens.shape[0]
+        num_group = num_token // num_seq
+
+        assert (
+            num_group * num_seq == num_token
+        ), f"num_seq({num_seq}), num_token({num_token}), something goes wrong!"
+
+        if self.static_kv_splits or self.device_core_count <= 0:
+            num_kv_splits.fill_(self.max_kv_splits)
+            return
+
+        if num_seq < 256:
+            SCHEDULE_SEQ = 256
+        else:
+            SCHEDULE_SEQ = triton.next_power_of_2(num_seq)
+
+        get_num_kv_splits_triton[(1,)](
+            num_kv_splits,
+            seq_lens,
+            num_seq,
+            num_group,
+            self.num_head,
+            self.num_kv_head,
+            self.max_kv_splits,
+            self.device_core_count,
+            MAX_NUM_SEQ=SCHEDULE_SEQ,
+        )
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        """Init auxiliary variables for wave attention backend."""
+
+        bs = forward_batch.batch_size
+        kv_indptr = self.kv_indptr
+        spec_info = forward_batch.spec_info
+
+        if forward_batch.forward_mode.is_decode_or_idle():
+            if spec_info is None:
+                kv_indptr[1 : bs + 1] = torch.cumsum(forward_batch.seq_lens, dim=0)
+                kv_indptr = kv_indptr[: bs + 1]
+                kv_indices = torch.empty(
+                    forward_batch.seq_lens_sum, dtype=torch.int32, device=self.device
+                )
+                create_flashinfer_kv_indices_triton[(bs,)](
+                    self.req_to_token,
+                    forward_batch.req_pool_indices,
+                    forward_batch.seq_lens,
+                    kv_indptr,
+                    None,
+                    kv_indices,
+                    self.req_to_token.stride(0),
+                )
+            else:
+                kv_indptr, kv_indices = spec_info.kv_indptr, spec_info.kv_indices
+                bs = kv_indptr.shape[0] - 1
+
+            from sglang.srt.layers.attention.wave_ops.decode_attention import (
+                decode_attention_intermediate_arrays_shapes,
+            )
+
+            attn_logits_shape, attn_logits_max_shape = (
+                decode_attention_intermediate_arrays_shapes(
+                    bs, self.v_head_dim, self.num_head, self.max_kv_splits
+                )
+            )
+            attn_logits = torch.empty(
+                attn_logits_shape,
+                dtype=torch.float32,
+                device=self.device,
+            )
+            attn_lse = torch.empty(
+                attn_logits_max_shape,
+                dtype=torch.float32,
+                device=self.device,
+            )
+            num_kv_splits = torch.empty((bs,), dtype=torch.int32, device=self.device)
+
+            self.get_num_kv_splits(num_kv_splits, forward_batch.seq_lens)
+
+            qo_indptr = None
+            custom_mask = None
+            mask_indptr = None
+            max_extend_len = None
+        elif forward_batch.forward_mode.is_target_verify():
+            bs = len(forward_batch.req_pool_indices)
+            qo_indptr = torch.arange(
+                0,
+                (1 + bs) * self.num_draft_tokens,
+                step=self.num_draft_tokens,
+                dtype=torch.int32,
+                device=self.device,
+            )
+            # Different with flashinfer kv_indptr and kv_indices construction
+            kv_indptr[1 : bs + 1] = torch.cumsum(forward_batch.seq_lens, dim=0)
+            kv_indptr = kv_indptr[: bs + 1]
+            kv_indices = torch.empty(
+                kv_indptr[-1], dtype=torch.int32, device=self.device
+            )
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                forward_batch.req_pool_indices,
+                forward_batch.seq_lens,
+                kv_indptr,
+                None,
+                kv_indices,
+                self.req_to_token.stride(0),
+            )
+
+            custom_mask = spec_info.custom_mask
+            seq_mask_len = self.num_draft_tokens * (
+                forward_batch.seq_lens + self.num_draft_tokens
+            )
+            mask_indptr = self.mask_indptr
+            mask_indptr[1 : bs + 1] = torch.cumsum(seq_mask_len[:bs], dim=0)
+            mask_indptr = mask_indptr[: bs + 1]
+            max_extend_len = self.num_draft_tokens
+            num_kv_splits = None
+            attn_logits = None
+            attn_lse = None
+        elif forward_batch.forward_mode.is_draft_extend():
+            kv_indices, kv_indptr, qo_indptr, custom_mask = (
+                spec_info.generate_attn_arg_prefill(
+                    forward_batch.req_pool_indices,
+                    forward_batch.seq_lens,
+                    None,
+                    self.req_to_token,
+                )
+            )
+            mask_indptr = None
+            # TODO(FIXME): This will trigger an invalid Eagle tree when using
+            # `max(spec_info.accept_length_cpu)`.
+            # It might have been forgotten to update somewhere.
+            max_extend_len = torch.max(spec_info.accept_length).item()
+            num_kv_splits = None
+            attn_logits = None
+            attn_lse = None
+        else:
+            kv_indptr[1 : bs + 1] = torch.cumsum(
+                forward_batch.extend_prefix_lens, dim=0
+            )
+            kv_indptr = kv_indptr[: bs + 1]
+            kv_indices = torch.empty(
+                forward_batch.extend_prefix_lens.sum().item(),
+                dtype=torch.int32,
+                device=self.device,
+            )
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                forward_batch.req_pool_indices,
+                forward_batch.extend_prefix_lens,
+                kv_indptr,
+                None,
+                kv_indices,
+                self.req_to_token.stride(0),
+            )
+
+            qo_indptr = self.qo_indptr
+            qo_indptr[1 : bs + 1] = torch.cumsum(forward_batch.extend_seq_lens, dim=0)
+            qo_indptr = qo_indptr[: bs + 1]
+            custom_mask = None
+            mask_indptr = None
+            attn_logits = None
+            attn_lse = None
+            max_extend_len = torch.max(forward_batch.extend_seq_lens).item()
+            num_kv_splits = None
+
+        self.forward_metadata = ForwardMetadata(
+            attn_logits,
+            attn_lse,
+            max_extend_len,
+            num_kv_splits,
+            kv_indptr,
+            kv_indices,
+            qo_indptr,
+            custom_mask,
+            mask_indptr,
+        )
+
+    def init_cuda_graph_state(
+        self,
+        max_bs: int,
+        max_num_tokens: int,
+        kv_indices_buf: Optional[torch.Tensor] = None,
+    ):
+        from sglang.srt.layers.attention.wave_ops.decode_attention import (
+            decode_attention_intermediate_arrays_shapes,
+        )
+
+        attn_logits_shape, attn_logits_max_shape = (
+            decode_attention_intermediate_arrays_shapes(
+                max_bs, self.v_head_dim, self.num_head, self.max_kv_splits
+            )
+        )
+        self.cuda_graph_attn_logits = torch.zeros(
+            attn_logits_shape,
+            dtype=torch.float32,
+            device=self.device,
+        )
+        self.cuda_graph_attn_lse = torch.zeros(
+            attn_logits_max_shape,
+            dtype=torch.float32,
+            device=self.device,
+        )
+        self.cuda_graph_num_kv_splits = torch.full(
+            (max_bs,), self.max_kv_splits, dtype=torch.int32, device=self.device
+        )
+        if kv_indices_buf is None:
+            self.cuda_graph_kv_indices = torch.zeros(
+                (max_bs * self.max_context_len),
+                dtype=torch.int32,
+                device=self.device,
+            )
+        else:
+            self.cuda_graph_kv_indices = kv_indices_buf
+
+        if not self.skip_prefill:
+            self.cuda_graph_custom_mask = torch.zeros(
+                (max_bs * self.max_context_len),
+                dtype=torch.uint8,
+                device=self.device,
+            )
+
+    def init_forward_metadata_capture_cuda_graph(
+        self,
+        bs: int,
+        num_tokens: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInput],
+    ):
+        assert encoder_lens is None, "Not supported"
+
+        if forward_mode.is_decode_or_idle():
+            if spec_info is None:
+                kv_indptr = self.kv_indptr
+                kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens, dim=0)
+                kv_indptr = kv_indptr[: bs + 1]
+                kv_indices = self.cuda_graph_kv_indices
+                create_flashinfer_kv_indices_triton[(bs,)](
+                    self.req_to_token,
+                    req_pool_indices,
+                    seq_lens,
+                    kv_indptr,
+                    None,
+                    kv_indices,
+                    self.req_to_token.stride(0),
+                )
+            else:
+                kv_indptr, kv_indices = spec_info.kv_indptr, spec_info.kv_indices
+
+            attn_logits = self.cuda_graph_attn_logits
+            attn_lse = self.cuda_graph_attn_lse
+            max_extend_len = None
+            num_kv_splits = self.cuda_graph_num_kv_splits
+            qo_indptr = None
+            custom_mask = None
+            mask_indptr = None
+        elif forward_mode.is_target_verify():
+            qo_indptr = self.qo_indptr[: bs + 1]
+            qo_indptr[: bs + 1] = torch.arange(
+                0,
+                (1 + bs) * self.num_draft_tokens,
+                step=self.num_draft_tokens,
+                dtype=torch.int32,
+                device=self.device,
+            )
+            kv_indptr = self.kv_indptr[: bs + 1]
+            kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens, dim=0)
+            kv_indices = self.cuda_graph_kv_indices
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                seq_lens,
+                kv_indptr,
+                None,
+                kv_indices,
+                self.req_to_token.stride(0),
+            )
+
+            custom_mask = self.cuda_graph_custom_mask
+            seq_mask_len = self.num_draft_tokens * (seq_lens + self.num_draft_tokens)
+            mask_indptr = self.mask_indptr[: bs + 1]
+            mask_indptr[1 : bs + 1] = torch.cumsum(seq_mask_len, dim=0)
+            max_extend_len = self.num_draft_tokens
+            num_kv_splits = None
+            attn_logits = None
+            attn_lse = None
+        else:
+            raise ValueError(
+                f"Invalid forward mode: {forward_mode=} for CUDA Graph capture."
+            )
+
+        self.forward_metadata = ForwardMetadata(
+            attn_logits,
+            attn_lse,
+            max_extend_len,
+            num_kv_splits,
+            kv_indptr,
+            kv_indices,
+            qo_indptr,
+            custom_mask,
+            mask_indptr,
+        )
+
+    def init_forward_metadata_replay_cuda_graph(
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInput],
+        seq_lens_cpu: Optional[torch.Tensor],
+    ):
+        # NOTE: encoder_lens expected to be zeros or None
+        if forward_mode.is_decode_or_idle():
+            # Update kv_indptr, kv_indices
+            kv_indptr = self.kv_indptr
+            kv_indices = self.cuda_graph_kv_indices
+            num_kv_splits = self.cuda_graph_num_kv_splits
+            if spec_info is None:
+                kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens[:bs], dim=0)
+                kv_indptr = kv_indptr[: bs + 1]
+                create_flashinfer_kv_indices_triton[(bs,)](
+                    self.req_to_token,
+                    req_pool_indices[:bs],
+                    seq_lens[:bs],
+                    kv_indptr,
+                    None,
+                    kv_indices,
+                    self.req_to_token.stride(0),
+                )
+                num_token = bs
+            else:
+                kv_indptr[: spec_info.kv_indptr.shape[0]] = spec_info.kv_indptr
+                kv_indices[: spec_info.kv_indices.shape[0]] = spec_info.kv_indices
+                num_token = spec_info.kv_indptr.shape[0] - 1
+            self.get_num_kv_splits(num_kv_splits[:num_token], seq_lens[:bs])
+        elif forward_mode.is_target_verify():
+            # Update qo_indptr, kv_indptr, kv_indices, custom_mask, mask_indptr
+            bs = len(req_pool_indices)
+            qo_indptr = self.qo_indptr[: bs + 1]
+            qo_indptr[: bs + 1] = torch.arange(
+                0,
+                (1 + bs) * self.num_draft_tokens,
+                step=self.num_draft_tokens,
+                dtype=torch.int32,
+                device=self.device,
+            )
+            kv_indptr = self.kv_indptr[: bs + 1]
+            kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens, dim=0)
+            kv_indices = self.cuda_graph_kv_indices
+            create_flashinfer_kv_indices_triton[(bs,)](
+                self.req_to_token,
+                req_pool_indices,
+                seq_lens,
+                kv_indptr,
+                None,
+                kv_indices,
+                self.req_to_token.stride(0),
+            )
+            custom_mask = self.cuda_graph_custom_mask
+            custom_mask[: spec_info.custom_mask.shape[0]] = spec_info.custom_mask
+            seq_mask_len = self.num_draft_tokens * (seq_lens + self.num_draft_tokens)
+            mask_indptr = self.mask_indptr[: bs + 1]
+            mask_indptr[1 : bs + 1] = torch.cumsum(seq_mask_len, dim=0)
+        else:
+            raise ValueError(
+                f"Invalid forward mode: {forward_mode=} for CUDA Graph replay."
+            )
+
+    def get_cuda_graph_seq_len_fill_value(self):
+        return 1
+
+    def forward_extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+    ):
+        # TODO: reuse the buffer across layers
+        if layer.qk_head_dim != layer.v_head_dim:
+            o = q.new_empty((q.shape[0], layer.tp_q_head_num * layer.v_head_dim))
+        else:
+            o = torch.empty_like(q)
+
+        if save_kv_cache:
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                layer, forward_batch.out_cache_loc, k, v
+            )
+
+        max_extend_len = self.forward_metadata.max_extend_len
+        computed_max_ext_seq_len = torch.max(forward_batch.extend_seq_lens)
+        if computed_max_ext_seq_len != max_extend_len:
+            assert len(forward_batch.extend_seq_lens) == 1
+            forward_batch.extend_seq_lens[0] = max_extend_len
+            forward_batch.seq_lens = max_extend_len
+
+        self.extend_attention_fwd(
+            q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
+            k.contiguous(),
+            v.contiguous(),
+            forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id),
+            forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id),
+            self.forward_metadata.qo_indptr,
+            self.forward_metadata.kv_indptr,
+            self.forward_metadata.kv_indices,
+            self.forward_metadata.custom_mask,
+            self.forward_metadata.mask_indptr,
+            self.forward_metadata.max_extend_len,
+            o.view(-1, layer.tp_q_head_num, layer.v_head_dim),
+            is_causal=True,
+            layer_scaling=layer.scaling,
+            logit_cap=layer.logit_cap,
+        )
+        return o
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+    ):
+        # During torch.compile, there is a bug in rotary_emb that causes the
+        # output value to have a 3D tensor shape. This reshapes the output correctly.
+        q = q.reshape(-1, layer.tp_q_head_num * layer.qk_head_dim)
+
+        # TODO: reuse the buffer across layers
+        if layer.qk_head_dim != layer.v_head_dim:
+            o = q.new_empty((q.shape[0], layer.tp_q_head_num * layer.v_head_dim))
+        else:
+            o = torch.empty_like(q)
+
+        if save_kv_cache:
+            forward_batch.token_to_kv_pool.set_kv_buffer(
+                layer, forward_batch.out_cache_loc, k, v
+            )
+
+        self.decode_attention_fwd(
+            q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
+            forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id),
+            forward_batch.token_to_kv_pool.get_value_buffer(layer.layer_id),
+            o.view(-1, layer.tp_q_head_num, layer.v_head_dim),
+            self.forward_metadata.kv_indptr,
+            self.forward_metadata.kv_indices,
+            self.forward_metadata.attn_logits,
+            self.forward_metadata.attn_lse,
+            self.forward_metadata.num_kv_splits,
+            self.max_kv_splits,
+            layer.scaling,
+            layer.logit_cap,
+        )
+        return o
diff --git a/sglang/python/sglang/srt/layers/attention/wave_ops/decode_attention.py b/sglang/python/sglang/srt/layers/attention/wave_ops/decode_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..c76bee9af561f13857f296f6e71dbccfcda4afa5
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/wave_ops/decode_attention.py
@@ -0,0 +1,184 @@
+"""
+Memory-efficient attention for decoding.
+It supports page size = 1.
+"""
+
+import functools
+import logging
+
+from wave_lang.kernel.lang.global_symbols import *
+from wave_lang.kernel.wave.compile import WaveCompileOptions, wave_compile
+from wave_lang.kernel.wave.constraints import GenericDot, MMAOperand, MMAType
+from wave_lang.kernel.wave.templates.paged_decode_attention import (
+    get_paged_decode_attention_kernels,
+    get_paged_decode_intermediate_arrays_shapes,
+    paged_decode_attention_shape,
+)
+from wave_lang.kernel.wave.utils.general_utils import get_default_scheduling_params
+from wave_lang.kernel.wave.utils.run_utils import set_default_run_config
+
+logger = logging.getLogger(__name__)
+import os
+
+dump_generated_mlir = int(os.environ.get("WAVE_DUMP_MLIR", 0))
+
+
+@functools.lru_cache(maxsize=4096)
+def get_wave_kernel(
+    shape: paged_decode_attention_shape,
+    max_kv_splits,
+    input_dtype,
+    output_dtype,
+    logit_cap,
+):
+    mha = (shape.num_query_heads // shape.num_kv_heads) == 1
+
+    # Get the kernels (either compile or load from cache).
+    if mha:
+        mfma_variant = (
+            GenericDot(along_dim=MMAOperand.M, k_vec_size=4, k_mult=1),
+            GenericDot(along_dim=MMAOperand.M, k_vec_size=1, k_mult=64),
+        )
+    else:
+        mfma_variant = (MMAType.F32_16x16x16_F16, MMAType.F32_16x16x16_F16)
+
+    (
+        phase_0,
+        phase_1,
+        hyperparams_0,
+        hyperparams_1,
+        dynamic_symbols_0,
+        dynamic_symbols_1,
+    ) = get_paged_decode_attention_kernels(
+        shape,
+        mfma_variant,
+        max_kv_splits,
+        input_dtype=input_dtype,
+        output_dtype=output_dtype,
+        logit_cap=logit_cap,
+    )
+    hyperparams_0.update(get_default_scheduling_params())
+    hyperparams_1.update(get_default_scheduling_params())
+
+    options = WaveCompileOptions(
+        subs=hyperparams_0,
+        canonicalize=True,
+        run_bench=False,
+        use_buffer_ops=True,
+        waves_per_eu=2,
+        dynamic_symbols=dynamic_symbols_0,
+        wave_runtime=True,
+    )
+    options = set_default_run_config(options)
+    phase_0 = wave_compile(options, phase_0)
+
+    options = WaveCompileOptions(
+        subs=hyperparams_1,
+        canonicalize=True,
+        run_bench=False,
+        use_buffer_ops=False,
+        waves_per_eu=4,
+        dynamic_symbols=dynamic_symbols_1,
+        wave_runtime=True,
+    )
+    options = set_default_run_config(options)
+    phase_1 = wave_compile(options, phase_1)
+
+    return phase_0, phase_1
+
+
+def decode_attention_intermediate_arrays_shapes(
+    num_seqs, head_size_kv, num_query_heads, max_kv_splits
+):
+    # Not all fields are used, but we need to pass them to the function
+    shape = paged_decode_attention_shape(
+        num_query_heads=num_query_heads,
+        num_kv_heads=0,
+        head_size=0,
+        head_size_kv=head_size_kv,
+        block_size=0,
+        num_seqs=num_seqs,
+    )
+    return get_paged_decode_intermediate_arrays_shapes(shape, max_kv_splits)
+
+
+def decode_attention_wave(
+    q,
+    k_buffer,
+    v_buffer,
+    o,
+    b_req_idx,
+    req_to_token,
+    attn_logits,
+    attn_logits_max,
+    num_kv_splits,
+    max_kv_splits,
+    sm_scale,
+    logit_cap,
+):
+    num_seqs, num_query_heads, head_size = q.shape
+    _, num_kv_heads, _ = k_buffer.shape
+    _, _, head_size_kv = v_buffer.shape
+    block_size = 32
+    shape = paged_decode_attention_shape(
+        num_query_heads,
+        num_kv_heads,
+        head_size,
+        head_size_kv,
+        block_size,
+        num_seqs,
+    )
+
+    phase_0, phase_1 = get_wave_kernel(
+        shape, max_kv_splits, q.dtype, o.dtype, logit_cap
+    )
+
+    mb_qk = phase_0(
+        q,
+        k_buffer,
+        v_buffer,
+        b_req_idx,
+        req_to_token,
+        attn_logits,
+        attn_logits_max,
+    )
+    if dump_generated_mlir:
+        filename = f"wave_decode_attention_phase0_{'x'.join(map(str, shape))}.mlir"
+        with open(filename, "w") as f:
+            f.write(mb_qk.module_op.get_asm())
+
+    mb_sv = phase_1(attn_logits, attn_logits_max, b_req_idx, o)
+    if dump_generated_mlir:
+        filename = f"wave_decode_attention_phase1_{'x'.join(map(str, shape))}.mlir"
+        with open(filename, "w") as f:
+            f.write(mb_sv.module_op.get_asm())
+
+
+def decode_attention_fwd(
+    q,
+    k_buffer,
+    v_buffer,
+    o,
+    b_req_idx,
+    req_to_token,
+    attn_logits,
+    attn_logits_max,
+    num_kv_splits,
+    max_kv_splits,
+    sm_scale,
+    logit_cap=0.0,
+):
+    decode_attention_wave(
+        q,
+        k_buffer,
+        v_buffer,
+        o,
+        b_req_idx,
+        req_to_token,
+        attn_logits,
+        attn_logits_max,
+        num_kv_splits,
+        max_kv_splits,
+        sm_scale,
+        logit_cap,
+    )
diff --git a/sglang/python/sglang/srt/layers/attention/wave_ops/extend_attention.py b/sglang/python/sglang/srt/layers/attention/wave_ops/extend_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..27e674db247dddeff9c9769c43754200a11329da
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/wave_ops/extend_attention.py
@@ -0,0 +1,147 @@
+"""
+Memory-efficient attention for prefill.
+It support page size = 1.
+"""
+
+import functools
+import os
+
+import torch
+from wave_lang.kernel.lang.global_symbols import *
+from wave_lang.kernel.wave.compile import WaveCompileOptions, wave_compile
+from wave_lang.kernel.wave.constraints import MMAType
+from wave_lang.kernel.wave.scheduling.schedule import SchedulingType
+from wave_lang.kernel.wave.templates.attention_common import AttentionShape
+from wave_lang.kernel.wave.templates.extend_attention import get_extend_attention_kernel
+from wave_lang.kernel.wave.utils.general_utils import get_default_scheduling_params
+from wave_lang.kernel.wave.utils.run_utils import set_default_run_config
+
+dump_generated_mlir = int(os.environ.get("WAVE_DUMP_MLIR", 0))
+
+
+@functools.lru_cache
+def get_wave_kernel(
+    shape: AttentionShape,
+    q_shape: tuple[int],
+    k_shape: tuple[int],
+    v_shape: tuple[int],
+    k_cache_shape: tuple[int],
+    v_cache_shape: tuple[int],
+    o_shape: tuple[int],
+    input_dtype: torch.dtype,
+    output_dtype: torch.dtype,
+    size_dtype: torch.dtype,
+    is_causal: bool,
+    logit_cap: float,
+    layer_scaling: float,
+):
+    assert shape.num_query_heads % shape.num_kv_heads == 0
+
+    mfma_variant = (MMAType.F32_16x16x32_K8_F16, MMAType.F32_16x16x16_F16)
+    (
+        extend_attention,
+        hyperparams,
+        dynamic_symbols,
+    ) = get_extend_attention_kernel(
+        shape,
+        mfma_variant,
+        q_shape,
+        k_shape,
+        v_shape,
+        k_cache_shape,
+        v_cache_shape,
+        o_shape,
+        input_dtype=input_dtype,
+        output_dtype=output_dtype,
+        size_dtype=size_dtype,
+        is_causal=is_causal,
+        layer_scaling=layer_scaling,
+        logit_cap=logit_cap,
+    )
+
+    hyperparams.update(get_default_scheduling_params())
+    options = WaveCompileOptions(
+        subs=hyperparams,
+        canonicalize=True,
+        run_bench=False,
+        schedule=SchedulingType.NONE,
+        use_scheduling_barriers=False,
+        dynamic_symbols=dynamic_symbols,
+        use_buffer_ops=True,
+        waves_per_eu=2,
+        denorm_fp_math_f32="preserve-sign",
+        wave_runtime=True,
+    )
+    options = set_default_run_config(options)
+    extend_attention = wave_compile(options, extend_attention)
+
+    return extend_attention
+
+
+def extend_attention_wave(
+    q_extend,
+    k_extend,
+    v_extend,
+    k_buffer,
+    v_buffer,
+    qo_indptr,
+    kv_indptr,
+    kv_indices,
+    custom_mask,
+    mask_indptr,
+    max_seq_len,
+    output,
+    is_causal=True,
+    layer_scaling=None,
+    logit_cap=0,
+):
+    shape = AttentionShape(
+        num_query_heads=q_extend.shape[1],
+        num_kv_heads=k_extend.shape[1],
+        head_size=q_extend.shape[2],
+        head_size_kv=k_extend.shape[2],
+        num_seqs=kv_indptr.shape[0] - 1,
+        max_seq_len=max_seq_len,
+    )
+
+    # Run the wave kernel.
+    extend_attention = get_wave_kernel(
+        shape,
+        q_extend.shape,
+        k_extend.shape,
+        v_extend.shape,
+        k_buffer.shape,
+        v_buffer.shape,
+        output.shape,
+        input_dtype=q_extend.dtype,
+        output_dtype=output.dtype,
+        size_dtype=qo_indptr.dtype,
+        is_causal=is_causal,
+        layer_scaling=layer_scaling,
+        logit_cap=logit_cap,
+    )
+
+    mb = extend_attention(
+        q_extend,
+        k_extend,
+        v_extend,
+        k_buffer,
+        v_buffer,
+        qo_indptr,
+        kv_indptr,
+        kv_indices,
+        max_seq_len,
+        output,
+    )
+
+    if dump_generated_mlir:
+        shape_list = [
+            q_extend.shape[0],
+            q_extend.shape[1],
+            k_extend.shape[1],
+            q_extend.shape[2],
+            k_extend.shape[2],
+        ]
+        filename = f"wave_prefill_attention_{'x'.join(map(str, shape_list))}.mlir"
+        with open(filename, "w") as f:
+            f.write(mb.module_op.get_asm())
diff --git a/sglang/python/sglang/srt/layers/attention/wave_ops/prefill_attention.py b/sglang/python/sglang/srt/layers/attention/wave_ops/prefill_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff446831b77f1e8d7d022d865c44918103dadb4a
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/wave_ops/prefill_attention.py
@@ -0,0 +1,79 @@
+"""
+Memory-efficient attention for prefill.
+It support page size = 1.
+"""
+
+import math
+import os
+
+from wave_lang.kernel.lang.global_symbols import *
+from wave_lang.kernel.wave.compile import WaveCompileOptions, wave_compile
+from wave_lang.kernel.wave.constraints import MMAType
+from wave_lang.kernel.wave.templates.attention_common import AttentionShape
+from wave_lang.kernel.wave.templates.prefill_attention import (
+    get_prefill_attention_kernel,
+)
+from wave_lang.kernel.wave.utils.general_utils import get_default_scheduling_params
+from wave_lang.kernel.wave.utils.run_utils import set_default_run_config
+
+dump_generated_mlir = int(os.environ.get("WAVE_DUMP_MLIR", 0))
+
+
+def prefill_attention_wave(
+    q, k, v, o, b_start_loc, b_seq_len, max_seq_len, is_causal=True
+):
+
+    shape = AttentionShape(
+        num_query_heads=q.shape[1],
+        num_kv_heads=k.shape[1],
+        head_size=q.shape[2],
+        head_size_kv=k.shape[2],
+        num_seqs=b_seq_len.shape[0],
+        max_seq_len=max_seq_len,
+        total_seq_len=q.shape[0],
+    )
+
+    assert shape.num_query_heads % shape.num_kv_heads == 0
+
+    output_shape = (shape.total_seq_len, shape.num_query_heads, shape.head_size_kv)
+    # Run the wave kernel.
+    mfma_variant = (MMAType.F32_16x16x16_F16, MMAType.F32_16x16x16_F16)
+    prefill, hyperparams = get_prefill_attention_kernel(
+        shape,
+        mfma_variant,
+        q.shape,
+        k.shape,
+        v.shape,
+        output_shape,
+        input_dtype=q.dtype,
+        output_dtype=o.dtype,
+        size_dtype=b_seq_len.dtype,
+    )
+
+    hyperparams.update(get_default_scheduling_params())
+
+    log2e = 1.44269504089
+    dk_sqrt = math.sqrt(1.0 / shape.head_size)
+
+    options = WaveCompileOptions(
+        subs=hyperparams,
+        canonicalize=True,
+        run_bench=False,
+        use_scheduling_barriers=False,
+    )
+    options = set_default_run_config(options)
+    prefill = wave_compile(options, prefill)
+
+    mb = prefill(
+        q * dk_sqrt * log2e,
+        k,
+        v,
+        b_start_loc,
+        b_seq_len,
+        o,
+    )
+    if dump_generated_mlir:
+        shape_list = [q.shape[0], q.shape[1], k.shape[1], q.shape[2], k.shape[2]]
+        filename = f"wave_prefill_attention_{'x'.join(map(str, shape_list))}.mlir"
+        with open(filename, "w") as f:
+            f.write(mb.module_op.get_asm())
diff --git a/sglang/python/sglang/srt/layers/attention/xpu_backend.py b/sglang/python/sglang/srt/layers/attention/xpu_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a40d25ee8c97c0a3e733df460b394f0d7fc7048
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/attention/xpu_backend.py
@@ -0,0 +1,1028 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Optional
+
+import torch
+
+from sglang.srt.configs.model_config import AttentionArch
+from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+from sglang.srt.layers.attention.flashattention_backend import (
+    FlashAttentionMetadata,
+    make_local_attention_virtual_batches,
+    merge_state_v2_wrapper,
+    prepare_swa_spec_page_table_triton,
+)
+from sglang.srt.managers.schedule_batch import get_global_server_args
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.radix_attention import RadixAttention
+    from sglang.srt.model_executor.model_runner import ModelRunner
+
+from sgl_kernel import merge_state_v2
+from sgl_kernel.flash_attn import flash_attn_varlen_func, flash_attn_with_kvcache
+
+
+class XPUAttentionBackend(AttentionBackend):
+    """XPU FlashAttention backend, currently based on FlashAttentionBackend, will be refactored later.
+
+    TODO:
+    - Prefill and Decode disaggregation, currently only chunked prefill is supported
+    - Speculative Decoding support
+    - XPU Graph support, see https://github.com/pytorch/pytorch/issues/162143
+    - MLA support
+    """
+
+    def __init__(
+        self,
+        model_runner: ModelRunner,
+        skip_prefill: bool = False,
+        speculative_step_id=0,
+        topk=0,
+        speculative_num_steps=0,
+    ):
+        super().__init__()
+
+        assert not (
+            model_runner.sliding_window_size is not None
+            and model_runner.model_config.is_encoder_decoder
+        ), "Sliding window and cross attention are not supported together"
+
+        self.forward_metadata: FlashAttentionMetadata = None
+        # extra metadata for handling speculative decoding topk > 1, extended draft decode and verify
+        self.forward_metadata_spec_decode_expand: FlashAttentionMetadata = None
+        self.max_context_len = model_runner.model_config.context_len
+        self.device = model_runner.device
+        self.decode_cuda_graph_metadata = {}
+        self.target_verify_metadata = {}
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+        self.kv_cache_dtype = model_runner.kv_cache_dtype
+        self.kv_cache_dtype_str = model_runner.server_args.kv_cache_dtype
+        self.page_size = model_runner.page_size
+        self.use_mla = model_runner.model_config.attention_arch == AttentionArch.MLA
+        assert (
+            self.use_mla is False
+        ), "XPUAttentionBackend doesn't support MLA yet, please use --attention-backend triton instead."
+        self.skip_prefill = skip_prefill
+        self.is_hybrid_swa = model_runner.is_hybrid_swa
+        if self.is_hybrid_swa:
+            self.full_to_swa_index_mapping = (
+                model_runner.token_to_kv_pool.full_to_swa_index_mapping
+            )
+        self.topk = model_runner.server_args.speculative_eagle_topk or 0
+        self.speculative_num_steps = speculative_num_steps
+        self.speculative_num_draft_tokens = (
+            model_runner.server_args.speculative_num_draft_tokens
+        )
+        self.speculative_step_id = speculative_step_id
+
+        # Local attention settings
+        self.attention_chunk_size = (
+            model_runner.attention_chunk_size
+            if hasattr(model_runner, "attention_chunk_size")
+            else None
+        )
+
+        # For each layer, the sliding_window_size can be different. This is only used for preparing SWA metadata.
+        # We use `layer.sliding_window_size` to decide whether to use SWA for each layer.
+        self.sliding_window_size = model_runner.sliding_window_size
+        self.has_swa = (
+            self.sliding_window_size is not None and self.sliding_window_size > -1
+        )
+
+    def init_forward_metadata(self, forward_batch: ForwardBatch):
+        """Initialize forward metadata hence all layers in the forward pass can reuse it."""
+        metadata = FlashAttentionMetadata()
+        seqlens_in_batch = forward_batch.seq_lens
+        batch_size = forward_batch.batch_size
+        device = seqlens_in_batch.device
+
+        if forward_batch.forward_mode.is_decode_or_idle():
+            # Draft Decode
+            if forward_batch.spec_info is not None:
+                assert (
+                    False
+                ), "XPUAttentionBackend doesn't support speculative decoding yet, please use --attention-backend triton instead."
+                if self.topk <= 1:
+                    metadata.cache_seqlens_int32 = (
+                        seqlens_in_batch + (self.speculative_step_id + 1)
+                    ).to(torch.int32)
+                    metadata.max_seq_len_k = forward_batch.seq_lens_cpu.max().item() + (
+                        self.speculative_step_id + 1
+                    )
+                    metadata.cu_seqlens_q = torch.arange(
+                        0, batch_size + 1, dtype=torch.int32, device=device
+                    )
+                    metadata.cu_seqlens_k = torch.nn.functional.pad(
+                        torch.cumsum(
+                            metadata.cache_seqlens_int32, dim=0, dtype=torch.int32
+                        ),
+                        (1, 0),
+                    )
+                    metadata.page_table = forward_batch.req_to_token_pool.req_to_token[
+                        forward_batch.req_pool_indices, : metadata.max_seq_len_k
+                    ]
+                else:
+                    metadata.cache_seqlens_int32 = (seqlens_in_batch).to(torch.int32)
+                    metadata.max_seq_len_q = self.topk
+                    metadata.max_seq_len_k = forward_batch.seq_lens_cpu.max().item()
+                    metadata.cu_seqlens_q = torch.arange(
+                        0,
+                        batch_size * self.topk + 1,
+                        step=self.topk,
+                        dtype=torch.int32,
+                        device=device,
+                    )
+                    metadata.cu_seqlens_k = torch.nn.functional.pad(
+                        torch.cumsum(
+                            metadata.cache_seqlens_int32, dim=0, dtype=torch.int32
+                        ),
+                        (1, 0),
+                    )
+                    metadata.page_table = forward_batch.req_to_token_pool.req_to_token[
+                        forward_batch.req_pool_indices, : metadata.max_seq_len_k
+                    ]
+
+                    metadata_expand = FlashAttentionMetadata()
+                    decode_length = self.speculative_step_id + 1
+                    metadata_expand.cache_seqlens_int32 = torch.full(
+                        (seqlens_in_batch.numel() * self.topk,),
+                        decode_length,
+                        device=device,
+                        dtype=torch.int32,
+                    )
+                    metadata_expand.max_seq_len_q = 1
+                    metadata_expand.cu_seqlens_q = torch.arange(
+                        0,
+                        metadata_expand.cache_seqlens_int32.numel() + 1,
+                        dtype=torch.int32,
+                        device=device,
+                    )
+                    metadata_expand.cu_seqlens_k = torch.arange(
+                        0,
+                        metadata_expand.cache_seqlens_int32.numel() * decode_length + 1,
+                        step=decode_length,
+                        dtype=torch.int32,
+                        device=device,
+                    )
+                    # shape: [bs, num_steps, topk] -> [bs x topk, num_steps]
+                    cache_loc = forward_batch.out_cache_loc.view(
+                        -1, self.speculative_num_steps
+                    )
+                    metadata_expand.page_table = (
+                        cache_loc[:, :decode_length].contiguous().to(torch.int32)
+                    )
+                    self.forward_metadata_spec_decode_expand = metadata_expand
+            else:
+                # Normal Decode
+                metadata.cache_seqlens_int32 = seqlens_in_batch.to(torch.int32)
+                metadata.max_seq_len_k = forward_batch.seq_lens_cpu.max().item()
+                metadata.cu_seqlens_q = torch.arange(
+                    0, batch_size + 1, dtype=torch.int32, device=device
+                )
+                metadata.cu_seqlens_k = torch.nn.functional.pad(
+                    torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)
+                )
+                metadata.page_table = forward_batch.req_to_token_pool.req_to_token[
+                    forward_batch.req_pool_indices, : metadata.max_seq_len_k
+                ]
+            # TODO: we need to test this part for llama 4 eagle case
+            self._init_local_attn_metadata(forward_batch, metadata, device)
+        elif forward_batch.forward_mode.is_target_verify():
+            if self.topk <= 1:
+                metadata.cache_seqlens_int32 = (
+                    forward_batch.seq_lens + self.speculative_num_draft_tokens
+                ).to(torch.int32)
+                metadata.max_seq_len_q = self.speculative_num_draft_tokens
+                metadata.max_seq_len_k = (
+                    forward_batch.seq_lens_cpu.max().item()
+                    + self.speculative_num_draft_tokens
+                )
+                metadata.cu_seqlens_q = torch.arange(
+                    0,
+                    batch_size * self.speculative_num_draft_tokens + 1,
+                    self.speculative_num_draft_tokens,
+                    dtype=torch.int32,
+                    device=device,
+                )
+                metadata.cu_seqlens_k = torch.nn.functional.pad(
+                    torch.cumsum(
+                        metadata.cache_seqlens_int32, dim=0, dtype=torch.int32
+                    ),
+                    (1, 0),
+                )
+                metadata.page_table = forward_batch.req_to_token_pool.req_to_token[
+                    forward_batch.req_pool_indices, : metadata.max_seq_len_k
+                ]
+
+                self._init_local_attn_metadata(forward_batch, metadata, device)
+            else:
+                metadata.cache_seqlens_int32 = forward_batch.seq_lens.to(torch.int32)
+                metadata.max_seq_len_q = self.speculative_num_draft_tokens
+                metadata.max_seq_len_k = forward_batch.seq_lens_cpu.max().item()
+                metadata.cu_seqlens_q = torch.arange(
+                    0,
+                    batch_size * self.speculative_num_draft_tokens + 1,
+                    step=self.speculative_num_draft_tokens,
+                    dtype=torch.int32,
+                    device=device,
+                )
+                metadata.cu_seqlens_k = torch.nn.functional.pad(
+                    torch.cumsum(
+                        metadata.cache_seqlens_int32, dim=0, dtype=torch.int32
+                    ),
+                    (1, 0),
+                )
+                metadata.page_table = forward_batch.req_to_token_pool.req_to_token[
+                    forward_batch.req_pool_indices, : metadata.max_seq_len_k
+                ]
+
+                metadata_expand = FlashAttentionMetadata()
+
+                metadata_expand.max_seq_len_q = 1
+                metadata_expand.cu_seqlens_q = torch.arange(
+                    0,
+                    forward_batch.seq_lens.numel() * self.speculative_num_draft_tokens
+                    + 1,
+                    dtype=torch.int32,
+                    device=device,
+                )
+
+                # create expand page table
+                offsets = torch.arange(
+                    self.speculative_num_draft_tokens, device=device
+                ).unsqueeze(
+                    0
+                )  # shape: (1, self.speculative_num_draft_tokens)
+                cols = offsets.expand(
+                    forward_batch.seq_lens.numel(), -1
+                ) + forward_batch.seq_lens.unsqueeze(1)
+                cum_len = torch.nn.functional.pad(
+                    torch.cumsum(
+                        (
+                            forward_batch.seq_lens + self.speculative_num_draft_tokens
+                        ).repeat_interleave(self.speculative_num_draft_tokens),
+                        dim=0,
+                    ),
+                    (1, 0),
+                )[:-1]
+                mask_extraction_indices = (
+                    cols.repeat_interleave(self.speculative_num_draft_tokens, dim=0)
+                    + cum_len[:, None]
+                ).view(1, -1)
+                mask = forward_batch.spec_info.custom_mask[
+                    mask_extraction_indices
+                ].view(
+                    -1, self.speculative_num_draft_tokens
+                )  # (bsz * draft_num, draft_num)
+
+                # shift table indices to avoid padding
+                # non_masked_page_table [[8, 9, 10],   mask (display with int format) [[1, 0, 0],
+                #                        [8, 9, 10],                                   [1, 1, 0],
+                #                        [8, 9, 10]]                                   [1, 0, 1]]
+                # if masked with padding [[8, 0, 0],   our mask without padding       [[8, 9, 10],
+                #                        [8, 9, 0],                                    [8, 9, 10],
+                #                        [8, 0, 10]]                                   [8, 10, 9]]
+                # note here cache_seqlens_int32 is [1, 2, 2] so extra page indices will be ignored in each row
+                col_indices = offsets.expand(
+                    mask.shape[0], self.speculative_num_draft_tokens
+                )
+                # Build keys: if an entry is valid (mask==True), keep its original index;
+                # if not, add self.speculative_num_draft_tokens so that it sorts after all valid entries.
+                keys = torch.where(
+                    mask, col_indices, col_indices + self.speculative_num_draft_tokens
+                )
+                _, sort_order = torch.sort(keys, dim=1)
+                non_masked_page_table = (
+                    forward_batch.req_to_token_pool.req_to_token[
+                        forward_batch.req_pool_indices, :
+                    ]
+                    .gather(1, cols)
+                    .repeat_interleave(self.speculative_num_draft_tokens, dim=0)
+                )  # (bsz, draft_num)
+                metadata_expand.page_table = non_masked_page_table.gather(1, sort_order)
+                metadata_expand.cache_seqlens_int32 = mask.sum(dim=1).to(torch.int32)
+                metadata_expand.cu_seqlens_k = torch.nn.functional.pad(
+                    torch.cumsum(
+                        metadata_expand.cache_seqlens_int32, dim=0, dtype=torch.int32
+                    ),
+                    (1, 0),
+                )
+                self.forward_metadata_spec_decode_expand = metadata_expand
+
+                if self.has_swa:
+                    self._init_sliding_window_attn_spec_metadata(
+                        metadata, metadata_expand
+                    )
+
+        elif forward_batch.forward_mode.is_extend_or_draft_extend_or_mixed():
+            metadata.cache_seqlens_int32 = seqlens_in_batch.to(torch.int32)
+            metadata.max_seq_len_k = forward_batch.seq_lens_cpu.max().item()
+            metadata.cu_seqlens_k = torch.nn.functional.pad(
+                torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)
+            )
+            metadata.page_table = forward_batch.req_to_token_pool.req_to_token[
+                forward_batch.req_pool_indices, : metadata.max_seq_len_k
+            ]
+
+            if (
+                any(forward_batch.extend_prefix_lens_cpu)
+                or forward_batch.forward_mode == ForwardMode.DRAFT_EXTEND
+            ):
+                extend_seq_lens = forward_batch.extend_seq_lens
+                metadata.max_seq_len_q = max(forward_batch.extend_seq_lens_cpu)
+                metadata.cu_seqlens_q = torch.nn.functional.pad(
+                    torch.cumsum(extend_seq_lens, dim=0, dtype=torch.int32), (1, 0)
+                )
+            else:
+                metadata.max_seq_len_q = metadata.max_seq_len_k
+                metadata.cu_seqlens_q = metadata.cu_seqlens_k
+
+            # Setup local attention if enabled
+            if forward_batch.forward_mode == ForwardMode.EXTEND:
+                self._init_local_attn_metadata(forward_batch, metadata, device)
+
+        # Encoder metadata for cross attention
+        if forward_batch.encoder_lens is not None:
+            assert (
+                forward_batch.encoder_lens.numel() == 1
+            ), "Only encoder size 1 is supported for now"
+
+            metadata.encoder_lens_int32 = forward_batch.encoder_lens.to(torch.int32)
+            metadata.encoder_cu_seqlens_k = torch.nn.functional.pad(
+                torch.cumsum(metadata.encoder_lens_int32, dim=0, dtype=torch.int32),
+                (1, 0),
+            )
+            metadata.encoder_max_seq_len_k = metadata.encoder_lens_int32.max().item()
+            metadata.encoder_page_table = forward_batch.req_to_token_pool.req_to_token[
+                forward_batch.req_pool_indices, : metadata.encoder_max_seq_len_k
+            ]
+
+            # Currently only support forward_batch.encoder_lens.numel() == 1
+            metadata.page_table = forward_batch.req_to_token_pool.req_to_token[
+                forward_batch.req_pool_indices,
+                metadata.encoder_max_seq_len_k : (
+                    metadata.encoder_max_seq_len_k + metadata.max_seq_len_k
+                ),
+            ]
+
+        # Convert the page table to a strided format which is needed by FA3 API
+        if self.page_size > 1:
+            self.strided_indices = torch.arange(
+                0, metadata.page_table.shape[1], self.page_size, device=self.device
+            )
+            metadata.page_table = (
+                metadata.page_table[:, self.strided_indices] // self.page_size
+            )
+
+        self.forward_metadata = metadata
+
+    def forward_extend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+        # For multi-head latent attention
+        q_rope: Optional[torch.Tensor] = None,
+        k_rope: Optional[torch.Tensor] = None,
+        sinks: Optional[torch.Tensor] = None,
+    ):
+        if k is not None:
+            assert v is not None
+            if save_kv_cache:
+                cache_loc = (
+                    forward_batch.out_cache_loc
+                    if not layer.is_cross_attention
+                    else forward_batch.encoder_out_cache_loc
+                )
+                if not self.use_mla:
+                    forward_batch.token_to_kv_pool.set_kv_buffer(
+                        layer, cache_loc, k, v, layer.k_scale, layer.v_scale
+                    )
+                else:
+                    forward_batch.token_to_kv_pool.set_mla_kv_buffer(
+                        layer,
+                        cache_loc,
+                        k,
+                        k_rope,
+                    )
+
+        # Use precomputed metadata across all layers
+        metadata = self.forward_metadata
+
+        # Calculate window size (can be moved to metadata if layer properties don't change)
+        # we don't do layer.sliding_window_size - 1 since in model.get_attention_sliding_window_size() we already - 1
+        # here is two side inclusive
+        is_hybrid_swa = (
+            layer.sliding_window_size is not None and layer.sliding_window_size > -1
+        )
+        window_size = (layer.sliding_window_size, 0) if is_hybrid_swa else (-1, -1)
+
+        # currently no FP8 KV cache supported
+        k_descale, v_descale = None, None
+        # # only use kv scaling if: 1) fp8 kv is explicitly enabled, 2) RadixAttention
+        # # has corresponding quantization method so that layer.k_scale is not None,
+        # # 3) layer.head_dim <= 256 since fa3 kernel require fp16 and bf16 data type in this case.
+        # if self.kv_cache_dtype_str != "auto" and layer.head_dim <= 256:
+        #     if layer.k_scale is not None:
+        #         descale_shape = (forward_batch.batch_size, layer.tp_k_head_num)
+        #         k_descale = layer.k_scale.expand(descale_shape)
+        #         v_descale = layer.v_scale.expand(descale_shape)
+        #     q = q.to(self.kv_cache_dtype)
+        #     q_rope = q_rope.to(self.kv_cache_dtype) if q_rope is not None else None
+        #     k_rope = k_rope.to(self.kv_cache_dtype) if k_rope is not None else None
+        causal = not layer.is_cross_attention
+
+        # Check if we should use local attention
+        use_local_attn = (
+            self.attention_chunk_size is not None
+            and metadata.local_attn_metadata is not None
+            and (hasattr(layer, "use_irope") and layer.use_irope)
+        )
+
+        # We do cascade attention for Target Verify with topk > 1
+        # We don't use cascade attention for Sliding Window Attention:
+        # - Different window sizes should be passed in for each q in the first stage of cascade attention, but FA3 interface doesn't support pass in a list of window sizes.
+        # - The overhead of duplicated computation of the common prefix part is small for sliding window layers (seq_len <= window_size), so we can just expand it.
+        use_cascade_attn = (
+            forward_batch.forward_mode.is_target_verify()
+            and self.topk > 1
+            and not is_hybrid_swa
+        )
+
+        # For fa3 interface version compatibility, we put new fields into conditional keyword args
+        kwargs = {}
+        if sinks is not None:
+            kwargs["sinks"] = sinks
+
+        # Get the appropriate page table based on whether we're using local attention
+        if use_local_attn:
+            local_metadata = metadata.local_attn_metadata
+            page_table = local_metadata.local_block_table
+            cu_seqlens_q = local_metadata.local_query_start_loc
+            cache_seqlens = local_metadata.local_seqused_k
+            max_seqlen_q = local_metadata.local_max_query_len
+        elif is_hybrid_swa and metadata.swa_spec_metadata is not None:
+            swa_spec_metadata = metadata.swa_spec_metadata
+            page_table = swa_spec_metadata.page_table
+            cu_seqlens_q = swa_spec_metadata.cu_seqlens_q
+            cache_seqlens = swa_spec_metadata.cache_seqlens_int32
+            max_seqlen_q = swa_spec_metadata.max_seq_len_q
+            cu_seqlens_k = swa_spec_metadata.cu_seqlens_k
+        else:
+            page_table = metadata.page_table
+            cu_seqlens_q = metadata.cu_seqlens_q
+            cache_seqlens = metadata.cache_seqlens_int32
+            max_seqlen_q = metadata.max_seq_len_q
+            cu_seqlens_k = metadata.cu_seqlens_k
+
+        # Use Flash Attention for prefill
+        if not self.use_mla:
+            # Do multi-head attention
+            key_cache, value_cache = forward_batch.token_to_kv_pool.get_kv_buffer(
+                layer.layer_id
+            )
+            key_cache = key_cache.view(
+                -1, self.page_size, layer.tp_k_head_num, layer.head_dim
+            )
+            value_cache = value_cache.view(
+                -1, self.page_size, layer.tp_v_head_num, layer.head_dim
+            )
+            if layer.is_cross_attention:
+                page_table = metadata.encoder_page_table
+                cache_seqlens = metadata.encoder_lens_int32
+                cu_seqlens_k = metadata.encoder_cu_seqlens_k
+                window_size = (-1, -1)
+
+            result = flash_attn_with_kvcache(
+                q=q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim),
+                k_cache=key_cache,
+                v_cache=value_cache,
+                page_table=page_table,
+                cache_seqlens=cache_seqlens,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_k_new=cu_seqlens_k if not use_local_attn else None,
+                max_seqlen_q=max_seqlen_q,
+                softmax_scale=layer.scaling,
+                causal=False if use_cascade_attn else causal,
+                window_size=window_size,
+                softcap=layer.logit_cap,
+                k_descale=k_descale,
+                v_descale=v_descale,
+                return_softmax_lse=use_cascade_attn,
+                **kwargs,
+            )
+
+            if use_cascade_attn:
+                o, softmax_lse, *rest = result
+                o_expand, softmax_lse_expand, *rest_expand = flash_attn_with_kvcache(
+                    q=q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim),
+                    k_cache=key_cache,
+                    v_cache=value_cache,
+                    page_table=self.forward_metadata_spec_decode_expand.page_table,
+                    cache_seqlens=self.forward_metadata_spec_decode_expand.cache_seqlens_int32,
+                    cu_seqlens_q=self.forward_metadata_spec_decode_expand.cu_seqlens_q,
+                    cu_seqlens_k_new=self.forward_metadata_spec_decode_expand.cu_seqlens_k,
+                    max_seqlen_q=self.forward_metadata_spec_decode_expand.max_seq_len_q,
+                    softmax_scale=layer.scaling,
+                    causal=False,
+                    window_size=window_size,
+                    softcap=layer.logit_cap,
+                    k_descale=k_descale,
+                    v_descale=v_descale,
+                    return_softmax_lse=True,
+                    **kwargs,
+                )
+                o, _ = merge_state_v2_wrapper(
+                    o,
+                    softmax_lse.T.contiguous(),
+                    o_expand,
+                    softmax_lse_expand.T.contiguous(),
+                )
+            else:
+                o = result
+        else:
+            if (
+                forward_batch.attn_attend_prefix_cache is not None
+                and not forward_batch.forward_mode.is_target_verify()
+                and not forward_batch.forward_mode.is_draft_extend()
+            ):
+                # Do multi-head attention with chunked prefix cache
+                if forward_batch.attn_attend_prefix_cache:
+                    assert not get_global_server_args().disable_chunked_prefix_cache
+                    # MHA for chunked prefix kv cache when running model with MLA
+                    assert forward_batch.prefix_chunk_idx is not None
+                    assert forward_batch.prefix_chunk_cu_seq_lens is not None
+                    assert forward_batch.prefix_chunk_max_seq_lens is not None
+
+                    chunk_idx = forward_batch.prefix_chunk_idx
+                    assert chunk_idx >= 0
+
+                    assert forward_batch.mha_return_lse
+                    output = flash_attn_varlen_func(
+                        q=q.view(-1, layer.tp_q_head_num, layer.head_dim),
+                        k=k.view(-1, layer.tp_k_head_num, layer.head_dim).to(q.dtype),
+                        v=v.view(-1, layer.tp_k_head_num, layer.v_head_dim).to(q.dtype),
+                        cu_seqlens_q=metadata.cu_seqlens_q,
+                        cu_seqlens_k=forward_batch.prefix_chunk_cu_seq_lens[chunk_idx],
+                        max_seqlen_q=metadata.max_seq_len_q,
+                        max_seqlen_k=forward_batch.prefix_chunk_max_seq_lens[chunk_idx],
+                        softmax_scale=layer.scaling,
+                        causal=False,
+                        return_softmax_lse=True,
+                    )
+                else:
+                    # MHA for extend part of sequence without attending prefix kv cache
+                    output = flash_attn_varlen_func(
+                        q=q.view(-1, layer.tp_q_head_num, layer.head_dim),
+                        k=k.view(-1, layer.tp_k_head_num, layer.head_dim).to(q.dtype),
+                        v=v.view(-1, layer.tp_k_head_num, layer.v_head_dim).to(q.dtype),
+                        cu_seqlens_q=metadata.cu_seqlens_q,
+                        cu_seqlens_k=metadata.cu_seqlens_q,
+                        max_seqlen_q=metadata.max_seq_len_q,
+                        max_seqlen_k=metadata.max_seq_len_q,
+                        softmax_scale=layer.scaling,
+                        causal=True,
+                        return_softmax_lse=forward_batch.mha_return_lse,
+                    )
+                if forward_batch.mha_return_lse:
+                    output, lse, *rest = output
+                    lse = torch.transpose(lse, 0, 1).contiguous()
+                    return output, lse
+                return output
+            else:
+                # Do absorbed multi-latent attention
+                kv_cache = forward_batch.token_to_kv_pool.get_key_buffer(
+                    layer.layer_id
+                ).to(q.dtype)
+                k_rope = kv_cache[:, :, layer.v_head_dim :]
+                c_kv = kv_cache[:, :, : layer.v_head_dim]
+                k_rope_cache = k_rope.view(
+                    -1,
+                    self.page_size,
+                    layer.tp_k_head_num,
+                    layer.head_dim - layer.v_head_dim,
+                )
+                c_kv_cache = c_kv.view(
+                    -1, self.page_size, layer.tp_v_head_num, layer.v_head_dim
+                )
+                if q_rope is not None:
+                    q_nope = q.view(-1, layer.tp_q_head_num, layer.v_head_dim)
+                    q_rope = q_rope.view(
+                        -1, layer.tp_q_head_num, layer.head_dim - layer.v_head_dim
+                    )
+                else:
+                    q_all = q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim)
+                    q_nope = q_all[:, :, : layer.v_head_dim]
+                    q_rope = q_all[:, :, layer.v_head_dim :]
+
+                result = flash_attn_with_kvcache(
+                    q=q_rope,
+                    k_cache=k_rope_cache,
+                    v_cache=c_kv_cache,
+                    qv=q_nope,
+                    page_table=page_table,
+                    cache_seqlens=cache_seqlens,
+                    cu_seqlens_q=cu_seqlens_q,
+                    cu_seqlens_k_new=cu_seqlens_k if not use_local_attn else None,
+                    max_seqlen_q=max_seqlen_q,
+                    softmax_scale=layer.scaling,
+                    causal=False if use_cascade_attn else causal,
+                    softcap=layer.logit_cap,
+                    k_descale=k_descale,
+                    v_descale=v_descale,
+                    return_softmax_lse=use_cascade_attn,
+                )
+                if use_cascade_attn:
+                    o, softmax_lse, *rest = result
+                    o_expand, softmax_lse_expand, *rest_expand = (
+                        flash_attn_with_kvcache(
+                            q=q_rope,
+                            k_cache=k_rope_cache,
+                            v_cache=c_kv_cache,
+                            qv=q_nope,
+                            page_table=self.forward_metadata_spec_decode_expand.page_table,
+                            cache_seqlens=self.forward_metadata_spec_decode_expand.cache_seqlens_int32,
+                            cu_seqlens_q=self.forward_metadata_spec_decode_expand.cu_seqlens_q,
+                            cu_seqlens_k_new=self.forward_metadata_spec_decode_expand.cu_seqlens_k,
+                            max_seqlen_q=self.forward_metadata_spec_decode_expand.max_seq_len_q,
+                            softmax_scale=layer.scaling,
+                            causal=False,
+                            window_size=window_size,
+                            softcap=layer.logit_cap,
+                            k_descale=k_descale,
+                            v_descale=v_descale,
+                            return_softmax_lse=True,
+                        )
+                    )
+                    o, _ = merge_state_v2_wrapper(
+                        o,
+                        softmax_lse.T.contiguous(),
+                        o_expand,
+                        softmax_lse_expand.T.contiguous(),
+                    )
+                else:
+                    o = result
+
+        return o.view(-1, layer.tp_q_head_num * layer.v_head_dim)
+
+    def forward_decode(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer: RadixAttention,
+        forward_batch: ForwardBatch,
+        save_kv_cache=True,
+        # For multi-head latent attention
+        q_rope: Optional[torch.Tensor] = None,
+        k_rope: Optional[torch.Tensor] = None,
+        sinks: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if k is not None:
+            assert v is not None
+            if save_kv_cache:
+                cache_loc = (
+                    forward_batch.out_cache_loc
+                    if not layer.is_cross_attention
+                    else forward_batch.encoder_out_cache_loc
+                )
+                if not self.use_mla:
+                    forward_batch.token_to_kv_pool.set_kv_buffer(
+                        layer, cache_loc, k, v, layer.k_scale, layer.v_scale
+                    )
+                else:
+                    forward_batch.token_to_kv_pool.set_mla_kv_buffer(
+                        layer,
+                        cache_loc,
+                        k,
+                        k_rope,
+                    )
+
+        # Use precomputed metadata across all layers
+        metadata = self.forward_metadata
+        local_attn_metadata = getattr(metadata, "local_attn_metadata", None)
+        use_local_attn = (
+            self.attention_chunk_size is not None
+            and local_attn_metadata is not None
+            and (hasattr(layer, "use_irope") and layer.use_irope)
+        )
+
+        # When Spec Decode enabled, forward_decode would be called with two mode:
+        # 1. DRAFT_DECODE: we enable cascade attention when top_k > 1
+        # 2. IDLE: we don’t need cascade attention, spec_info will be none in this case
+        use_cascade_attn = forward_batch.spec_info is not None and self.topk > 1
+
+        # Calculate window size (can be moved to metadata if layer properties don't change)
+        # we don't do layer.sliding_window_size - 1 since in model.get_attention_sliding_window_size() we already - 1
+        # here is two side inclusive
+        window_size = (
+            (layer.sliding_window_size, 0)
+            if layer.sliding_window_size is not None and layer.sliding_window_size > -1
+            else (-1, -1)
+        )
+        causal = not layer.is_cross_attention
+
+        # For fa3 interface version compatibility, we put new fields into conditional keyword args
+        kwargs = {}
+        if sinks is not None:
+            kwargs["sinks"] = sinks
+
+        k_descale, v_descale = None, None
+        # only use kv scaling if: 1) fp8 kv is explicitly enabled, 2) RadixAttention
+        # has corresponding quantization method so that layer.k_scale is not None,
+        # 3) layer.head_dim <= 256 since fa3 kernel require fp16 and bf16 data type in this case.
+        if self.kv_cache_dtype_str != "auto" and layer.head_dim <= 256:
+            if layer.k_scale is not None:
+                descale_shape = (forward_batch.batch_size, layer.tp_k_head_num)
+                k_descale = layer.k_scale.expand(descale_shape)
+                v_descale = layer.v_scale.expand(descale_shape)
+            q = q.to(self.kv_cache_dtype)
+            q_rope = q_rope.to(self.kv_cache_dtype) if q_rope is not None else None
+            k_rope = k_rope.to(self.kv_cache_dtype) if k_rope is not None else None
+        if not self.use_mla:
+            # Do multi-head attention
+
+            key_cache, value_cache = forward_batch.token_to_kv_pool.get_kv_buffer(
+                layer.layer_id
+            )
+            key_cache = key_cache.view(
+                -1, self.page_size, layer.tp_k_head_num, layer.head_dim
+            )
+            value_cache = value_cache.view(
+                -1, self.page_size, layer.tp_v_head_num, layer.head_dim
+            )
+
+            if layer.is_cross_attention:
+                # Always use non-chunked logic for cross-attention
+                o = flash_attn_with_kvcache(
+                    q=q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim),
+                    k_cache=key_cache,
+                    v_cache=value_cache,
+                    page_table=metadata.encoder_page_table,
+                    cache_seqlens=metadata.encoder_lens_int32,
+                    cu_seqlens_q=metadata.cu_seqlens_q,
+                    cu_seqlens_k_new=metadata.encoder_cu_seqlens_k,
+                    max_seqlen_q=1,
+                    softmax_scale=layer.scaling,
+                    causal=False,
+                    window_size=(-1, -1),
+                    softcap=layer.logit_cap,
+                    k_descale=k_descale,
+                    v_descale=v_descale,
+                    **kwargs,
+                )
+            elif use_local_attn:
+                # Use chunked (local) attention batching for self-attention
+                o = flash_attn_with_kvcache(
+                    q=q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim),
+                    k_cache=key_cache,
+                    v_cache=value_cache,
+                    page_table=local_attn_metadata.local_block_table,
+                    cache_seqlens=local_attn_metadata.local_seqused_k,
+                    cu_seqlens_q=local_attn_metadata.local_query_start_loc,
+                    cu_seqlens_k_new=None,
+                    max_seqlen_q=local_attn_metadata.local_max_query_len,
+                    softmax_scale=layer.scaling,
+                    causal=True,
+                    window_size=(-1, -1),
+                    softcap=layer.logit_cap,
+                    k_descale=k_descale,
+                    v_descale=v_descale,
+                    **kwargs,
+                )
+            else:
+                page_table = metadata.page_table
+                cache_seqlens = metadata.cache_seqlens_int32
+                cu_seqlens_k = metadata.cu_seqlens_k
+                max_seqlen_q = metadata.max_seq_len_q
+                q_reshaped = q.contiguous().view(
+                    -1, layer.tp_q_head_num, layer.head_dim
+                )
+
+                # Default: single-token self-attention
+                result = flash_attn_with_kvcache(
+                    q=q_reshaped,
+                    k_cache=key_cache,
+                    v_cache=value_cache,
+                    page_table=page_table,
+                    cache_seqlens=cache_seqlens,
+                    cu_seqlens_q=metadata.cu_seqlens_q,
+                    cu_seqlens_k_new=cu_seqlens_k,
+                    max_seqlen_q=max_seqlen_q,
+                    softmax_scale=layer.scaling,
+                    causal=False if use_cascade_attn else causal,
+                    window_size=window_size,
+                    softcap=layer.logit_cap,
+                    k_descale=k_descale,
+                    v_descale=v_descale,
+                    return_softmax_lse=use_cascade_attn,
+                    **kwargs,
+                )
+                if use_cascade_attn:
+                    o, softmax_lse, *rest = result
+                    o_expand, softmax_lse_expand, *rest_expand = (
+                        flash_attn_with_kvcache(
+                            q=q_reshaped,
+                            k_cache=key_cache,
+                            v_cache=value_cache,
+                            page_table=self.forward_metadata_spec_decode_expand.page_table,
+                            cache_seqlens=self.forward_metadata_spec_decode_expand.cache_seqlens_int32,
+                            cu_seqlens_q=self.forward_metadata_spec_decode_expand.cu_seqlens_q,
+                            cu_seqlens_k_new=self.forward_metadata_spec_decode_expand.cu_seqlens_k,
+                            max_seqlen_q=self.forward_metadata_spec_decode_expand.max_seq_len_q,
+                            softmax_scale=layer.scaling,
+                            causal=False,
+                            window_size=window_size,
+                            softcap=layer.logit_cap,
+                            k_descale=k_descale,
+                            v_descale=v_descale,
+                            return_softmax_lse=True,
+                            **kwargs,
+                        )
+                    )
+                    o, _ = merge_state_v2(
+                        o,
+                        softmax_lse.T.contiguous(),
+                        o_expand,
+                        softmax_lse_expand.T.contiguous(),
+                    )
+                else:
+                    o = result
+        else:
+            # Do absorbed multi-latent attention
+            kv_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id).to(
+                q.dtype
+            )
+            k_rope = kv_cache[:, :, layer.v_head_dim :]
+            c_kv = kv_cache[:, :, : layer.v_head_dim]
+            k_rope_cache = k_rope.view(
+                -1,
+                self.page_size,
+                layer.tp_k_head_num,
+                layer.head_dim - layer.v_head_dim,
+            )
+            c_kv_cache = c_kv.view(
+                -1, self.page_size, layer.tp_v_head_num, layer.v_head_dim
+            )
+
+            if q_rope is not None:
+                q_nope = q.view(-1, layer.tp_q_head_num, layer.v_head_dim)
+                q_rope = q_rope.view(
+                    -1, layer.tp_q_head_num, layer.head_dim - layer.v_head_dim
+                )
+            else:
+                q_all = q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim)
+                q_nope = q_all[:, :, : layer.v_head_dim]
+                q_rope = q_all[:, :, layer.v_head_dim :]
+            max_seqlen_q = metadata.max_seq_len_q
+
+            result = flash_attn_with_kvcache(
+                q=q_rope,
+                k_cache=k_rope_cache,
+                v_cache=c_kv_cache,
+                qv=q_nope,
+                page_table=metadata.page_table,
+                cache_seqlens=metadata.cache_seqlens_int32,
+                cu_seqlens_q=metadata.cu_seqlens_q,
+                cu_seqlens_k_new=metadata.cu_seqlens_k,
+                max_seqlen_q=max_seqlen_q,
+                softmax_scale=layer.scaling,
+                causal=False if use_cascade_attn else causal,
+                softcap=layer.logit_cap,
+                k_descale=k_descale,
+                v_descale=v_descale,
+                return_softmax_lse=use_cascade_attn,  # softmax_lse is needed for merge states
+            )
+            if use_cascade_attn:
+                o, softmax_lse, *rest = result
+                o_expand, softmax_lse_expand, *rest_expand = flash_attn_with_kvcache(
+                    q=q_rope,
+                    k_cache=k_rope_cache,
+                    v_cache=c_kv_cache,
+                    qv=q_nope,
+                    page_table=self.forward_metadata_spec_decode_expand.page_table,
+                    cache_seqlens=self.forward_metadata_spec_decode_expand.cache_seqlens_int32,
+                    cu_seqlens_q=self.forward_metadata_spec_decode_expand.cu_seqlens_q,
+                    cu_seqlens_k_new=self.forward_metadata_spec_decode_expand.cu_seqlens_k,
+                    max_seqlen_q=self.forward_metadata_spec_decode_expand.max_seq_len_q,
+                    softmax_scale=layer.scaling,
+                    causal=False,
+                    window_size=window_size,
+                    softcap=layer.logit_cap,
+                    k_descale=k_descale,
+                    v_descale=v_descale,
+                    return_softmax_lse=True,
+                )
+                o, _ = merge_state_v2(
+                    o,
+                    softmax_lse.T.contiguous(),
+                    o_expand,
+                    softmax_lse_expand.T.contiguous(),
+                )
+            else:
+                o = result
+
+        return o.view(-1, layer.tp_q_head_num * layer.v_head_dim)
+
+    def get_cuda_graph_seq_len_fill_value(self):
+        """Get the fill value for sequence length in CUDA graph."""
+        return 1
+
+    def _init_local_attn_metadata(
+        self, forwardbatch: ForwardBatch, metadata: FlashAttentionMetadata, device
+    ):
+        """Centralized utility to initialize local_attn_metadata if chunked attention is enabled."""
+        if self.attention_chunk_size is None:
+            metadata.local_attn_metadata = None
+            return
+
+        cu_seqlens_q = metadata.cu_seqlens_q
+        cache_seqlens_int32 = metadata.cache_seqlens_int32
+        if self.is_hybrid_swa:
+            page_table = self.full_to_swa_index_mapping[metadata.page_table].to(
+                torch.int32
+            )
+        else:
+            page_table = metadata.page_table
+        if cu_seqlens_q is None or cache_seqlens_int32 is None or page_table is None:
+            metadata.local_attn_metadata = None
+            return
+
+        cu_seqlens_q_np = cu_seqlens_q.cpu().numpy()
+        seq_lens_np = cache_seqlens_int32.cpu().numpy()
+        (
+            seqlens_q_local_np,
+            cu_seqlens_q_local_np,
+            seqlens_k_local_np,
+            block_table_local,
+        ) = make_local_attention_virtual_batches(
+            self.attention_chunk_size,
+            cu_seqlens_q_np,
+            seq_lens_np,
+            page_table,
+            self.page_size,
+        )
+
+        local_metadata = FlashAttentionMetadata.LocalAttentionMetadata(
+            local_query_start_loc=torch.from_numpy(cu_seqlens_q_local_np).to(device),
+            local_seqused_k=torch.from_numpy(seqlens_k_local_np).to(device),
+            local_block_table=block_table_local.to(device),
+            local_max_query_len=int(seqlens_q_local_np.max()),
+            local_max_seq_len=int(seqlens_k_local_np.max()),
+        )
+        metadata.local_attn_metadata = local_metadata
+
+    def _init_sliding_window_attn_spec_metadata(
+        self,
+        metadata: FlashAttentionMetadata,
+        metadata_expand: FlashAttentionMetadata,
+        metadata_swa: Optional[FlashAttentionMetadata] = None,
+    ):
+        # TODO: support page_size > 1 for swa spec
+        assert (
+            self.page_size == 1
+        ), "FlashAttention backend doesn't support topk > 1 speculative decoding with page size > 1 sliding window attention"
+
+        cache_seqlens_int32 = (
+            metadata.cache_seqlens_int32.repeat_interleave(
+                self.speculative_num_draft_tokens
+            )
+            + metadata_expand.cache_seqlens_int32
+        )
+        cu_seqlens_k = torch.nn.functional.pad(
+            torch.cumsum(cache_seqlens_int32, dim=0, dtype=torch.int32), (1, 0)
+        )
+        bs = cache_seqlens_int32.shape[0]
+        page_table = (
+            metadata.page_table.new_zeros(
+                (bs, metadata.max_seq_len_k + metadata_expand.page_table.shape[1])
+            )
+            if metadata_swa is None
+            else metadata_swa.page_table
+        )
+
+        prepare_swa_spec_page_table_triton(
+            page_table,
+            metadata.page_table,
+            metadata_expand.page_table,
+            metadata.cache_seqlens_int32,
+            metadata_expand.cache_seqlens_int32,
+            self.speculative_num_draft_tokens,
+        )
+
+        if metadata_swa is None:
+            metadata_swa = FlashAttentionMetadata()
+            metadata_swa.max_seq_len_q = 1
+            metadata_swa.cu_seqlens_q = metadata_expand.cu_seqlens_q
+            metadata_swa.cache_seqlens_int32 = cache_seqlens_int32
+            metadata_swa.cu_seqlens_k = cu_seqlens_k
+            metadata_swa.page_table = page_table
+        else:
+            metadata_swa.cache_seqlens_int32.copy_(cache_seqlens_int32)
+            metadata_swa.cu_seqlens_k.copy_(cu_seqlens_k)
+
+        metadata.swa_spec_metadata = metadata_swa
diff --git a/sglang/python/sglang/srt/layers/communicator.py b/sglang/python/sglang/srt/layers/communicator.py
new file mode 100644
index 0000000000000000000000000000000000000000..97c0f24cf83cea41873e1c56b70be7f71e8e44d7
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/communicator.py
@@ -0,0 +1,1037 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import logging
+from contextlib import contextmanager
+from dataclasses import dataclass
+from enum import Enum, auto
+from functools import partial
+from typing import Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    get_tp_group,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.distributed.device_communicators.pynccl_allocator import (
+    use_symmetric_memory,
+)
+from sglang.srt.environ import envs
+from sglang.srt.layers.attention.nsa.utils import (
+    is_nsa_enable_prefill_cp,
+    nsa_use_prefill_cp,
+)
+from sglang.srt.layers.dp_attention import (
+    attn_tp_all_gather_into_tensor,
+    attn_tp_reduce_scatter_tensor,
+    dp_gather_partial,
+    dp_reduce_scatter_tensor,
+    dp_scatter,
+    get_attention_cp_rank,
+    get_attention_cp_size,
+    get_attention_dp_size,
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    get_global_dp_buffer,
+    get_local_dp_buffer,
+    is_allocation_symmetric,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.moe import (
+    get_moe_a2a_backend,
+    should_use_flashinfer_cutlass_moe_fp4_allgather,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
+from sglang.srt.utils import (
+    get_bool_env_var,
+    is_cuda,
+    is_flashinfer_available,
+    is_gfx95_supported,
+    is_hip,
+    is_npu,
+    is_sm90_supported,
+    is_sm100_supported,
+)
+
+_is_cuda = is_cuda()
+_is_flashinfer_available = is_flashinfer_available()
+_is_sm90_supported = _is_cuda and is_sm90_supported()
+_is_sm100_supported = _is_cuda and is_sm100_supported()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and is_hip()
+_is_gfx95_supported = is_gfx95_supported()
+_is_npu = is_npu()
+_use_ag_after_qlora = envs.SGLANG_USE_AG_AFTER_QLORA.get()
+
+if _use_aiter and _is_gfx95_supported:
+    from aiter.ops.triton.fused_fp8_quant import fused_rms_fp8_group_quant
+
+    from sglang.srt.layers.quantization.rocm_mxfp4_utils import fused_rms_mxfp4_quant
+elif _is_npu:
+    from sglang.srt.hardware_backend.npu.cmo import prepare_weight_cache
+
+
+# TODO: According to the discussion in https://github.com/flashinfer-ai/flashinfer/issues/1223#issuecomment-3047256465
+# We set the max token num to 128 for allreduce fusion with min-latency case(use_oneshot=True).
+FUSE_ALLREDUCE_MAX_BATCH_SIZE = 2048
+
+
+def apply_flashinfer_allreduce_fusion(batch_size: int):
+    return (
+        # NOTE: flashinfer 0.6.1 caused performance regression on sm100 for allreduce fusion
+        # Ref: https://github.com/sgl-project/sglang/issues/17237
+        (_is_sm90_supported or _is_sm100_supported)
+        and _is_flashinfer_available
+        and batch_size > 0
+        and batch_size <= FUSE_ALLREDUCE_MAX_BATCH_SIZE
+        and not is_dp_attention_enabled()
+        and get_global_server_args().enable_flashinfer_allreduce_fusion
+    )
+
+
+def apply_aiter_all_reduce_fusion(input_tensor: torch.Tensor):
+    n = input_tensor.shape[-1]
+    total_bytes = input_tensor.numel() * input_tensor.element_size()
+    return (
+        _use_aiter
+        and total_bytes > 0
+        and n <= 16384
+        and total_bytes < 8 * 1024 * 8192
+        and get_tensor_model_parallel_world_size() != 6
+        and not is_dp_attention_enabled()
+        and get_global_server_args().enable_aiter_allreduce_fusion
+    )
+
+
+class ScatterMode(Enum):
+    """
+    Suppose we have TP=4, DP=2, enable-dp-attention, and the system handles seq a,b,c,d
+    Model input/output: [ab, ab, cd, cd] for four ranks respectively
+    SCATTERED: [a, b, c, d]
+    TP_ATTN_FULL: [ab, ab, cd, cd], i.e. all ranks inside a TP attn group have full data of the group
+    FULL: [abcd, abcd, abcd, abcd]
+    """
+
+    SCATTERED = auto()
+    TP_ATTN_FULL = auto()
+    FULL = auto()
+
+    @staticmethod
+    def model_input_output():
+        """The scatter mode for model forward pass input and output data"""
+        if is_nsa_enable_prefill_cp():
+            return ScatterMode.SCATTERED
+        return ScatterMode.TP_ATTN_FULL
+
+
+class AttentionInputs:
+
+    def __init__(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        qkv_latent_func: Callable,
+    ):
+        self.hidden_states_local = hidden_states
+        self.forward_batch = forward_batch
+        self.qkv_latent_func = qkv_latent_func
+        self.hidden_states_ = None
+        self.qkv_latent_ = None
+
+    def tp_all_gather_hidden_states(self, hidden_states, forward_batch):
+        total_tokens = forward_batch.input_ids.shape[0]
+        output = hidden_states.new_empty((total_tokens, hidden_states.shape[-1]))
+        get_tp_group().all_gather_into_tensor(output, hidden_states)
+        return output
+
+    def fetch_qkv_latent(self):
+        if self.qkv_latent_ is not None:
+            return self.qkv_latent_
+        assert self.qkv_latent_func is not None
+        self.qkv_latent_ = self.qkv_latent_func(
+            self.hidden_states_local, self.forward_batch
+        )
+        if get_attn_tp_context().input_scattered:
+            self.qkv_latent_ = self.tp_all_gather_hidden_states(
+                self.qkv_latent_, self.forward_batch
+            )
+        return self.qkv_latent_
+
+    def fetch_hidden_states(self):
+        if self.hidden_states_ is not None:
+            return self.hidden_states_
+        self.hidden_states_ = self.hidden_states_local
+        if get_attn_tp_context().input_scattered:
+            self.hidden_states_ = self.tp_all_gather_hidden_states(
+                self.hidden_states_, self.forward_batch
+            )
+        return self.hidden_states_
+
+
+class AttnTpContext:
+    def __init__(self):
+        self.allow_input_scattered = False
+        self.input_scattered_ = False
+        self.attn_inputs_: Optional[AttentionInputs] = None
+
+    def init_context(self, q_lora_rank, is_nsa):
+        self.allow_input_scattered = (
+            get_global_server_args().enable_attn_tp_input_scattered
+            and (_is_cuda or _is_npu)
+            and q_lora_rank is not None
+            and not is_nsa
+            and get_tensor_model_parallel_world_size() > 1
+            and not is_dp_attention_enabled()
+            and get_moe_a2a_backend().is_none()
+            and not enable_moe_dense_fully_dp()
+            and get_global_server_args().disable_piecewise_cuda_graph
+            and get_global_server_args().speculative_algorithm != "EAGLE3"
+        )
+        if get_global_server_args().enable_attn_tp_input_scattered:
+            if not self.allow_input_scattered:
+                logging.info(
+                    "attn_tp_input_scattered is not enabled while other conditions are not met"
+                )
+            else:
+                logging.info("attn_tp_input_scattered is enabled")
+
+    def use_input_scattered(self, forward_batch: ForwardBatch):
+        return (
+            self.allow_input_scattered
+            and forward_batch.forward_mode.is_extend()
+            and not forward_batch.forward_mode.is_target_verify()
+            and not forward_batch.forward_mode.is_draft_extend()
+            and forward_batch.input_ids is not None
+            and not forward_batch.can_run_tbo
+        )
+
+    @property
+    def input_scattered(self):
+        return self.input_scattered_
+
+    def set_attn_inputs(self, attn_inputs: AttentionInputs):
+        self.attn_inputs_ = attn_inputs
+
+    def fetch_qkv_latent(self):
+        assert self.attn_inputs_ is not None
+        return self.attn_inputs_.fetch_qkv_latent()
+
+    def fetch_hidden_states(self):
+        assert self.attn_inputs_ is not None
+        return self.attn_inputs_.fetch_hidden_states()
+
+    @contextmanager
+    def maybe_input_scattered(self, forward_batch: ForwardBatch):
+        flag = self.use_input_scattered(forward_batch)
+        old_flag = self.input_scattered
+        self.input_scattered_ = flag
+        yield
+        self.input_scattered_ = old_flag
+        self.attn_inputs_ = None
+
+
+ATTN_TP_CONTEXT = AttnTpContext()
+
+
+def get_attn_tp_context():
+    return ATTN_TP_CONTEXT
+
+
+@dataclass
+class _LayerModeComputationContext:
+    num_layers: int
+    layer_id: int
+    is_layer_sparse: bool
+    is_previous_layer_sparse: Optional[bool]
+    is_next_layer_sparse: Optional[bool]
+
+    def previous_layer(self):
+        assert self.is_previous_layer_sparse is not None
+        return _LayerModeComputationContext(
+            num_layers=self.num_layers,
+            layer_id=self.layer_id - 1,
+            is_layer_sparse=self.is_previous_layer_sparse,
+            is_previous_layer_sparse=None,
+            is_next_layer_sparse=self.is_layer_sparse,
+        )
+
+
+@dataclass
+class LayerScatterModes:
+    layer_input_mode: ScatterMode
+    attn_mode: ScatterMode
+    # Can be further split into e.g. mlp_input_mode and mlp_output_mode if needed
+    mlp_mode: ScatterMode
+    middle_residual_mode: ScatterMode
+    layer_output_mode: ScatterMode
+
+    @classmethod
+    def init_new(cls, **kwargs):
+        context = _LayerModeComputationContext(**kwargs)
+        return cls(
+            layer_input_mode=cls._compute_layer_input_mode(context),
+            attn_mode=ScatterMode.TP_ATTN_FULL,
+            mlp_mode=cls._compute_mlp_mode(context),
+            middle_residual_mode=cls._compute_middle_residual_mode(context),
+            layer_output_mode=cls._compute_layer_output_mode(context),
+        )
+
+    @classmethod
+    def _compute_layer_input_mode(cls, context: _LayerModeComputationContext):
+        if context.layer_id == 0:
+            return ScatterMode.model_input_output()
+        return cls._compute_layer_output_mode(context.previous_layer())
+
+    @classmethod
+    def _compute_mlp_mode(cls, context: _LayerModeComputationContext):
+        if context.is_layer_sparse:
+            return (
+                ScatterMode.SCATTERED
+                if (
+                    # Token dispatch/combine will be handled outside of LayerCommunicator for these modes.
+                    not get_moe_a2a_backend().is_none()
+                    or should_use_flashinfer_cutlass_moe_fp4_allgather()
+                )
+                else ScatterMode.FULL
+            )
+        else:
+            return (
+                ScatterMode.SCATTERED
+                if enable_moe_dense_fully_dp()
+                else ScatterMode.FULL
+            )
+
+    @classmethod
+    def _should_gather_for_tbo(cls, context: _LayerModeComputationContext):
+        return (
+            not context.is_layer_sparse
+            and context.is_next_layer_sparse
+            and enable_moe_dense_fully_dp()
+            and get_global_server_args().enable_two_batch_overlap
+        )
+
+    @classmethod
+    def _compute_middle_residual_mode(cls, context: _LayerModeComputationContext):
+        mlp_mode = cls._compute_mlp_mode(context)
+        if mlp_mode == ScatterMode.SCATTERED:
+            return ScatterMode.SCATTERED
+        if mlp_mode == ScatterMode.FULL:
+            return ScatterMode.TP_ATTN_FULL
+        raise NotImplementedError
+
+    @classmethod
+    def _compute_layer_output_mode(cls, context: _LayerModeComputationContext):
+        mlp_mode = cls._compute_mlp_mode(context)
+        if context.layer_id == context.num_layers - 1:
+            return ScatterMode.model_input_output()
+        if mlp_mode == ScatterMode.SCATTERED:
+            if cls._should_gather_for_tbo(context):
+                return ScatterMode.TP_ATTN_FULL
+            return ScatterMode.SCATTERED
+        if mlp_mode == ScatterMode.FULL:
+            return ScatterMode.TP_ATTN_FULL
+        raise NotImplementedError
+
+
+def enable_moe_dense_fully_dp():
+    return get_global_server_args().moe_dense_tp_size == 1
+
+
+class LayerCommunicator:
+    def __init__(
+        self,
+        layer_scatter_modes: LayerScatterModes,
+        input_layernorm: torch.nn.Module,
+        post_attention_layernorm: torch.nn.Module,
+        # Reduce scatter requires skipping all-reduce in model code after MoE/MLP, so only enable for models which have that implemented. Remove flag once done for all models that use LayerCommunicator.
+        allow_reduce_scatter: bool = False,
+        is_last_layer: bool = False,
+        qkv_latent_func: Optional[Callable] = None,
+    ):
+        self.layer_scatter_modes = layer_scatter_modes
+        self.input_layernorm = input_layernorm
+        self.post_attention_layernorm = post_attention_layernorm
+        self.allow_reduce_scatter = allow_reduce_scatter
+        self.is_last_layer = is_last_layer
+        self.qkv_latent_func = qkv_latent_func
+
+        self._context = CommunicateContext.init_new()
+        self._post_init_communicate()
+        self._speculative_algo = SpeculativeAlgorithm.from_string(
+            get_global_server_args().speculative_algorithm
+        )
+
+    def _post_init_communicate(self):
+        self._communicate_simple_fn = CommunicateSimpleFn.get_fn(
+            input_mode=self.layer_scatter_modes.layer_input_mode,
+            output_mode=self.layer_scatter_modes.attn_mode,
+            context=self._context,
+        )
+        self._communicate_with_all_reduce_and_layer_norm_fn = (
+            CommunicateWithAllReduceAndLayerNormFn.get_fn(
+                hidden_states_input_mode=self.layer_scatter_modes.attn_mode,
+                residual_input_mode=self.layer_scatter_modes.layer_input_mode,
+                hidden_states_output_mode=self.layer_scatter_modes.mlp_mode,
+                residual_output_mode=self.layer_scatter_modes.middle_residual_mode,
+                context=self._context,
+            )
+        )
+        self._communicate_summable_tensor_pair_fn = (
+            CommunicateSummableTensorPairFn.get_fn(
+                hidden_states_input_mode=self.layer_scatter_modes.mlp_mode,
+                residual_input_mode=self.layer_scatter_modes.middle_residual_mode,
+                output_mode=self.layer_scatter_modes.layer_output_mode,
+                context=self._context,
+            )
+        )
+
+    def prepare_attn_and_capture_last_layer_outputs(
+        self,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+        forward_batch: ForwardBatch,
+        captured_last_layer_outputs: Optional[List[torch.Tensor]] = None,
+        post_residual_addition: Optional[torch.Tensor] = None,
+    ):
+        hidden_states, residual = self.prepare_attn(
+            hidden_states,
+            residual,
+            forward_batch,
+            post_residual_addition=post_residual_addition,
+        )
+        if captured_last_layer_outputs is not None:
+            gathered_last_layer_output = self._communicate_simple_fn(
+                hidden_states=residual,
+                forward_batch=forward_batch,
+                context=self._context,
+            )
+            if gathered_last_layer_output is residual:
+                # Clone to avoid modifying the original residual by Custom RMSNorm inplace operation
+                gathered_last_layer_output = residual.clone()
+            captured_last_layer_outputs.append(gathered_last_layer_output)
+        return hidden_states, residual
+
+    def prepare_attn(
+        self,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+        forward_batch: ForwardBatch,
+        quant_format: str = "",
+        post_residual_addition: Optional[torch.Tensor] = None,
+    ):
+        if get_attn_tp_context().input_scattered:
+            hidden_states, residual = self._tp_reduce_scatter(
+                hidden_states,
+                residual,
+            )
+        if hidden_states.shape[0] == 0:
+            residual = hidden_states
+        else:
+            if (
+                residual is not None
+                and hasattr(hidden_states, "_sglang_needs_allreduce_fusion")
+                and hidden_states._sglang_needs_allreduce_fusion
+            ):
+                if (
+                    apply_aiter_all_reduce_fusion(hidden_states)
+                    or apply_flashinfer_allreduce_fusion(hidden_states.shape[0])
+                ) and hasattr(self.input_layernorm, "forward_with_allreduce_fusion"):
+                    hidden_states, residual = (
+                        self.input_layernorm.forward_with_allreduce_fusion(
+                            hidden_states, residual
+                        )
+                    )
+                else:
+                    hidden_states = tensor_model_parallel_all_reduce(hidden_states)
+                    hidden_states, residual = self.input_layernorm(
+                        hidden_states, residual
+                    )
+            else:
+                if residual is None:
+                    residual = hidden_states
+
+                    if _use_aiter and _is_gfx95_supported and ("mxfp4" in quant_format):
+                        hidden_states, *_, _ = fused_rms_mxfp4_quant(
+                            hidden_states,
+                            self.input_layernorm.weight,
+                            self.input_layernorm.variance_epsilon,
+                            None,
+                            None,
+                            None,
+                            None,
+                        )
+                    elif _use_aiter and _is_gfx95_supported and ("fp8" in quant_format):
+
+                        hidden_states, _, _, _res = fused_rms_fp8_group_quant(
+                            hidden_states,
+                            self.input_layernorm.weight,
+                            self.input_layernorm.variance_epsilon,
+                            inp2=None,
+                            inp2_weight=None,
+                            inp2_epsilon=None,
+                            group_size=128,
+                            dtype_quant=torch.float8_e4m3fn,
+                            res1=None,
+                            output_unquantized_inp1=False,
+                        )
+
+                    else:
+                        hidden_states = self.input_layernorm(hidden_states)
+                else:
+
+                    if _use_aiter and _is_gfx95_supported and ("mxfp4" in quant_format):
+                        hidden_states, *_, residual = fused_rms_mxfp4_quant(
+                            hidden_states,
+                            self.input_layernorm.weight,
+                            self.input_layernorm.variance_epsilon,
+                            None,
+                            None,
+                            None,
+                            residual,
+                        )
+                    elif _use_aiter and _is_gfx95_supported and ("fp8" in quant_format):
+                        # RMSNorm + FP8 per-group quant
+                        # return hidden_states：
+                        #   out_fp8  : FP8 activation →  a8w8 GEMM
+                        #   out_bs   : block-scale →  gemm_a8w8_blockscale.x_scale
+                        hidden_states, _, _, residual = fused_rms_fp8_group_quant(
+                            hidden_states,
+                            self.input_layernorm.weight,
+                            self.input_layernorm.variance_epsilon,
+                            inp2=None,
+                            inp2_weight=None,
+                            inp2_epsilon=None,
+                            group_size=128,
+                            dtype_quant=torch.float8_e4m3fn,
+                            res1=residual,
+                            output_unquantized_inp1=False,
+                        )
+                    else:
+                        hidden_states, residual = self.input_layernorm(
+                            hidden_states,
+                            residual,
+                            post_residual_addition,
+                        )
+
+        hidden_states = self._communicate_simple_fn(
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+            context=self._context,
+        )
+        if self.qkv_latent_func is not None:
+            attn_inputs = AttentionInputs(
+                hidden_states, forward_batch, self.qkv_latent_func
+            )
+            get_attn_tp_context().set_attn_inputs(attn_inputs)
+        return hidden_states, residual
+
+    def _tp_reduce_scatter(
+        self,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if hidden_states.shape[0] == 0:
+            return hidden_states, hidden_states
+        assert (
+            hidden_states.shape[0] % self._context.tp_size == 0
+        ), f"Expected total tokens {hidden_states.shape[0]} % tp_size {self._context.tp_size} to be 0"
+        local_tokens = hidden_states.shape[0] // self._context.tp_size
+        output = hidden_states.new_empty(local_tokens, *hidden_states.shape[1:])
+        get_tp_group().reduce_scatter_tensor(output, hidden_states)
+        if residual is not None:
+            residual = residual.tensor_split(self._context.tp_size)[
+                self._context.tp_rank
+            ]
+        return output, residual
+
+    def prepare_mlp(
+        self,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+        forward_batch: ForwardBatch,
+        cache=None,
+    ):
+        if cache is not None:
+            self._context.cache = cache
+
+        return self._communicate_with_all_reduce_and_layer_norm_fn(
+            hidden_states=hidden_states,
+            residual=residual,
+            forward_batch=forward_batch,
+            layernorm=self.post_attention_layernorm,
+            context=self._context,
+        )
+
+    def postprocess_layer(
+        self,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ):
+        return self._communicate_summable_tensor_pair_fn(
+            hidden_states=hidden_states,
+            residual=residual,
+            forward_batch=forward_batch,
+            context=self._context,
+            allow_reduce_scatter=self.allow_reduce_scatter,
+        )
+
+    def should_use_reduce_scatter(self, forward_batch: ForwardBatch):
+        if not self.allow_reduce_scatter:
+            return False
+        if (
+            self._communicate_summable_tensor_pair_fn
+            is CommunicateSummableTensorPairFn._scatter_hidden_states
+            and forward_batch.dp_padding_mode.is_max_len()
+        ):
+            return True
+        if nsa_use_prefill_cp(forward_batch):
+            return True
+        if get_attn_tp_context().input_scattered and not self.is_last_layer:
+            return True
+        return False
+
+    # NOTE: This function will cause torch recompilation
+    def should_fuse_mlp_allreduce_with_next_layer(
+        self, forward_batch: ForwardBatch
+    ) -> bool:
+        if (
+            is_dp_attention_enabled()
+            and self._speculative_algo is not None
+            and self._speculative_algo.is_eagle()
+        ):
+            return False
+
+        if get_attn_tp_context().input_scattered:
+            return False
+
+        batch_size = (
+            forward_batch.input_ids.shape[0]
+            if hasattr(forward_batch, "input_ids")
+            else 0
+        )
+
+        return (
+            (
+                apply_flashinfer_allreduce_fusion(batch_size)
+                or (
+                    _use_aiter
+                    and batch_size > 0
+                    and get_tensor_model_parallel_world_size() != 6
+                    and get_global_server_args().enable_aiter_allreduce_fusion
+                )
+            )
+            and (not self.is_last_layer)
+            and (self._context.tp_size > 1)
+        )
+
+
+@dataclass
+class CommunicateContext:
+    process_group_sizes: Dict[ScatterMode, int]
+    attn_tp_rank: int
+    attn_tp_size: int
+    attn_dp_size: int
+    attn_cp_rank: int
+    attn_cp_size: int
+    tp_size: int
+    cache = None
+    tp_rank: int
+
+    def is_same_group_size(self, a: ScatterMode, b: ScatterMode):
+        return self.process_group_sizes[a] == self.process_group_sizes[b]
+
+    @classmethod
+    def init_new(cls):
+        attn_tp_rank = get_attention_tp_rank()
+        attn_tp_size = get_attention_tp_size()
+        attn_dp_size = get_attention_dp_size()
+        attn_cp_size = get_attention_cp_size()
+        attn_cp_rank = get_attention_cp_rank()
+        tp_size = get_tensor_model_parallel_world_size()
+        tp_rank = get_tensor_model_parallel_rank()
+        process_group_sizes = {
+            ScatterMode.SCATTERED: 1,
+            ScatterMode.TP_ATTN_FULL: attn_tp_size,
+            # TODO: support --moe-dense-tp-size > 1
+            ScatterMode.FULL: tp_size,
+        }
+        return cls(
+            process_group_sizes=process_group_sizes,
+            attn_tp_rank=attn_tp_rank,
+            attn_tp_size=attn_tp_size,
+            attn_dp_size=attn_dp_size,
+            attn_cp_rank=attn_cp_rank,
+            attn_cp_size=attn_cp_size,
+            tp_size=tp_size,
+            tp_rank=tp_rank,
+        )
+
+
+class CommunicateSimpleFn:
+    @staticmethod
+    def get_fn(
+        input_mode: ScatterMode,
+        output_mode: ScatterMode,
+        context: CommunicateContext,
+    ):
+        if context.is_same_group_size(input_mode, output_mode):
+            return CommunicateSimpleFn._trivial
+
+        if (input_mode == ScatterMode.SCATTERED) and (
+            output_mode == ScatterMode.TP_ATTN_FULL
+        ):
+            if _use_ag_after_qlora:
+                return CommunicateSimpleFn._trivial
+            return CommunicateSimpleFn._scattered_to_tp_attn_full
+
+        raise NotImplementedError(f"{input_mode=} {output_mode=}")
+
+    @staticmethod
+    def _trivial(
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        context: CommunicateContext,
+    ) -> torch.Tensor:
+        return hidden_states
+
+    @staticmethod
+    def _scattered_to_tp_attn_full(
+        hidden_states: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
+        forward_batch: ForwardBatch,
+        context: CommunicateContext,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        if isinstance(hidden_states, tuple):
+            gathered_hidden_states = []
+            for local_hidden_states in hidden_states:
+                with use_symmetric_memory(
+                    get_tp_group(),
+                    disabled=not is_allocation_symmetric(),
+                ):
+                    output = torch.empty(
+                        (
+                            local_hidden_states.shape[0] * context.attn_tp_size,
+                            *local_hidden_states.shape[1:],
+                        ),
+                        dtype=local_hidden_states.dtype,
+                        device=local_hidden_states.device,
+                    )
+                attn_tp_all_gather_into_tensor(
+                    output,
+                    local_hidden_states,
+                )
+                gathered_hidden_states.append(output)
+            return tuple(gathered_hidden_states)
+
+        hidden_states, local_hidden_states = (
+            get_local_dp_buffer(),
+            hidden_states,
+        )
+        attn_tp_all_gather_into_tensor(
+            hidden_states,
+            local_hidden_states,
+        )
+        return hidden_states
+
+
+class CommunicateWithAllReduceAndLayerNormFn:
+    """Besides communication, needs to
+    1. All reduce in tp_attn_group on hidden_states
+    2. Apply layer norm
+    """
+
+    @staticmethod
+    def get_fn(
+        hidden_states_input_mode: ScatterMode,
+        residual_input_mode: ScatterMode,
+        hidden_states_output_mode: ScatterMode,
+        residual_output_mode: ScatterMode,
+        context: CommunicateContext,
+    ):
+
+        if (
+            context.is_same_group_size(
+                hidden_states_input_mode, hidden_states_output_mode
+            )
+            and context.is_same_group_size(residual_input_mode, residual_output_mode)
+            and context.attn_tp_size == 1
+        ):
+            return CommunicateWithAllReduceAndLayerNormFn._simple
+
+        if (
+            (hidden_states_input_mode == ScatterMode.TP_ATTN_FULL)
+            and (
+                residual_input_mode in [ScatterMode.SCATTERED, ScatterMode.TP_ATTN_FULL]
+            )
+            and (hidden_states_output_mode == ScatterMode.FULL)
+            and (residual_output_mode == ScatterMode.TP_ATTN_FULL)
+        ):
+            return partial(
+                CommunicateWithAllReduceAndLayerNormFn._gather_hidden_states_and_residual,
+                residual_input_mode=residual_input_mode,
+            )
+
+        if (
+            (hidden_states_input_mode == ScatterMode.TP_ATTN_FULL)
+            and (
+                residual_input_mode in [ScatterMode.SCATTERED, ScatterMode.TP_ATTN_FULL]
+            )
+            and (hidden_states_output_mode == ScatterMode.SCATTERED)
+            and (residual_output_mode == ScatterMode.SCATTERED)
+        ):
+            return partial(
+                CommunicateWithAllReduceAndLayerNormFn._scatter_hidden_states_and_residual,
+                residual_input_mode=residual_input_mode,
+            )
+
+        raise NotImplementedError(
+            f"{hidden_states_input_mode=} {residual_input_mode=} {hidden_states_output_mode=} {residual_output_mode=}"
+        )
+
+    @staticmethod
+    def _simple(
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+        forward_batch: ForwardBatch,
+        layernorm: torch.nn.Module,
+        context: CommunicateContext,
+    ):
+        # TODO move these `if shape != 0` into LayerNorm itself
+        if hidden_states.shape[0] != 0:
+            hidden_states, residual = layernorm(hidden_states, residual)
+        return hidden_states, residual
+
+    @staticmethod
+    def _gather_hidden_states_and_residual(
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+        forward_batch: ForwardBatch,
+        layernorm: torch.nn.Module,
+        context: CommunicateContext,
+        *,
+        residual_input_mode,
+    ):
+        if get_attn_tp_context().input_scattered:
+            return CommunicateWithAllReduceAndLayerNormFn._tp_all_reduce_with_scattered_residual(
+                hidden_states,
+                residual,
+                layernorm,
+                context,
+            )
+
+        if residual_input_mode == ScatterMode.SCATTERED and context.attn_tp_size > 1:
+            residual, local_residual = (
+                get_local_dp_buffer(),
+                residual,
+            )
+            attn_tp_all_gather_into_tensor(residual, local_residual)
+        if context.attn_dp_size != 1:
+            if context.attn_tp_rank == 0:
+                hidden_states += residual
+
+            # Perform layernorm on smaller data before comm. Only valid when attn_tp_size is 1 (tp_size == dp_size)
+            use_layer_norm_before_gather = context.attn_tp_size == 1
+            if use_layer_norm_before_gather and hidden_states.shape[0] != 0:
+                residual = hidden_states
+                with use_symmetric_memory(
+                    get_tp_group(),
+                    disabled=not is_allocation_symmetric(),
+                ):
+                    hidden_states = layernorm(hidden_states)
+
+            hidden_states, local_hidden_states = (
+                get_global_dp_buffer(),
+                hidden_states,
+            )
+            dp_gather_partial(hidden_states, local_hidden_states, forward_batch)
+
+            if not use_layer_norm_before_gather:
+                dp_scatter(residual, hidden_states, forward_batch)
+                if hidden_states.shape[0] != 0:
+                    hidden_states = layernorm(hidden_states)
+        else:
+            handled = False
+            if (
+                apply_aiter_all_reduce_fusion(hidden_states)
+                or apply_flashinfer_allreduce_fusion(hidden_states.shape[0])
+            ) and hasattr(layernorm, "forward_with_allreduce_fusion"):
+                hidden_states, residual = layernorm.forward_with_allreduce_fusion(
+                    hidden_states, residual
+                )
+                handled = True
+
+            if not handled:
+                hidden_states = tensor_model_parallel_all_reduce(hidden_states)
+                if _is_npu and context.cache is not None:
+                    _ = prepare_weight_cache(hidden_states, context.cache)
+                hidden_states, residual = layernorm(hidden_states, residual)
+        return hidden_states, residual
+
+    @staticmethod
+    def _scatter_hidden_states_and_residual(
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+        forward_batch: ForwardBatch,
+        layernorm: torch.nn.Module,
+        context: CommunicateContext,
+        *,
+        residual_input_mode,
+    ):
+        input_hidden_states = hidden_states
+        hidden_states = hidden_states.tensor_split(context.attn_tp_size)[
+            context.attn_tp_rank
+        ]
+        attn_tp_reduce_scatter_tensor(hidden_states, input_hidden_states)
+        if residual_input_mode == ScatterMode.TP_ATTN_FULL:
+            residual = residual.tensor_split(context.attn_tp_size)[context.attn_tp_rank]
+        if hidden_states.shape[0] != 0:
+            hidden_states, residual = layernorm(hidden_states, residual)
+        return hidden_states, residual
+
+    @staticmethod
+    def _tp_all_reduce_with_scattered_residual(
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+        layernorm: torch.nn.Module,
+        context: CommunicateContext,
+    ):
+        if hidden_states.shape[0] == 0:
+            return hidden_states, hidden_states
+
+        scattered_states = hidden_states.tensor_split(context.tp_size)[context.tp_rank]
+        scattered_states += residual
+        residual = tensor_model_parallel_all_reduce(hidden_states)
+        hidden_states = layernorm(residual)
+        return hidden_states, residual
+
+
+class CommunicateSummableTensorPairFn:
+    """It is allowed to make (hidden_states, residual) := (hidden_states + residual, None) if needed."""
+
+    @classmethod
+    def execute(
+        cls,
+        hidden_states_input_mode,
+        residual_input_mode,
+        output_mode,
+        context,
+        **kwargs,
+    ):
+        return cls.get_fn(
+            hidden_states_input_mode=hidden_states_input_mode,
+            residual_input_mode=residual_input_mode,
+            output_mode=output_mode,
+            context=context,
+        )(context=context, **kwargs)
+
+    @staticmethod
+    def get_fn(
+        hidden_states_input_mode: ScatterMode,
+        residual_input_mode: ScatterMode,
+        output_mode: ScatterMode,
+        context: CommunicateContext,
+    ):
+        if context.is_same_group_size(
+            hidden_states_input_mode, output_mode
+        ) and context.is_same_group_size(residual_input_mode, output_mode):
+            return CommunicateSummableTensorPairFn._trivial
+
+        if (
+            (hidden_states_input_mode == ScatterMode.FULL)
+            and (residual_input_mode == ScatterMode.TP_ATTN_FULL)
+            and (output_mode == ScatterMode.TP_ATTN_FULL)
+        ):
+            return CommunicateSummableTensorPairFn._scatter_hidden_states
+
+        if (
+            (hidden_states_input_mode == ScatterMode.SCATTERED)
+            and (residual_input_mode == ScatterMode.SCATTERED)
+            and (output_mode == ScatterMode.TP_ATTN_FULL)
+        ):
+            return CommunicateSummableTensorPairFn._gather
+
+        if (
+            (hidden_states_input_mode == ScatterMode.TP_ATTN_FULL)
+            and (residual_input_mode == ScatterMode.TP_ATTN_FULL)
+            and (output_mode == ScatterMode.SCATTERED)
+        ):
+            return CommunicateSummableTensorPairFn._scatter
+
+        raise NotImplementedError(
+            f"{hidden_states_input_mode=} {residual_input_mode=} {output_mode=}"
+        )
+
+    @staticmethod
+    def _trivial(
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+        forward_batch: ForwardBatch,
+        context: CommunicateContext,
+        **kwargs,
+    ):
+        return hidden_states, residual
+
+    @staticmethod
+    def _scatter_hidden_states(
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+        forward_batch: ForwardBatch,
+        context: CommunicateContext,
+        allow_reduce_scatter: bool = False,
+    ):
+        hidden_states, global_hidden_states = (
+            get_local_dp_buffer(),
+            hidden_states,
+        )
+        if allow_reduce_scatter and forward_batch.dp_padding_mode.is_max_len():
+            # When using padding, all_reduce is skipped after MLP and MOE and reduce scatter is used here instead.
+            dp_reduce_scatter_tensor(hidden_states, global_hidden_states)
+        else:
+            dp_scatter(hidden_states, global_hidden_states, forward_batch)
+        return hidden_states, residual
+
+    @staticmethod
+    def _gather(
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+        forward_batch: ForwardBatch,
+        context: CommunicateContext,
+        **kwargs,
+    ):
+        hidden_states += residual
+        residual = None
+        hidden_states, local_hidden_states = (
+            get_local_dp_buffer(),
+            hidden_states,
+        )
+        attn_tp_all_gather_into_tensor(
+            hidden_states,
+            local_hidden_states,
+        )
+        return hidden_states, residual
+
+    @staticmethod
+    def _scatter(
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+        forward_batch: ForwardBatch,
+        context: CommunicateContext,
+    ):
+        assert residual is None, "not yet handled residual!=None"
+        tensor_list = list(hidden_states.tensor_split(context.attn_tp_size))
+        hidden_states = tensor_list[context.attn_tp_rank]
+        return hidden_states, residual
diff --git a/sglang/python/sglang/srt/layers/communicator_nsa_cp.py b/sglang/python/sglang/srt/layers/communicator_nsa_cp.py
new file mode 100644
index 0000000000000000000000000000000000000000..296d1456812af0df2f9b21fcf6db9ba5e7f60578
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/communicator_nsa_cp.py
@@ -0,0 +1,210 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+
+from functools import partial
+from typing import Callable, Optional
+
+import torch
+
+from sglang.srt.layers.attention.nsa.utils import (
+    is_nsa_enable_prefill_cp,
+    nsa_use_prefill_cp,
+)
+from sglang.srt.layers.communicator import (
+    CommunicateContext,
+    CommunicateSimpleFn,
+    CommunicateSummableTensorPairFn,
+    CommunicateWithAllReduceAndLayerNormFn,
+    LayerCommunicator,
+    LayerScatterModes,
+    ScatterMode,
+)
+from sglang.srt.layers.dp_attention import (
+    attn_cp_all_gather_into_tensor,
+    attn_cp_reduce_scatter_tensor,
+    get_local_dp_buffer,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+
+
+def nsa_enable_prefill_cp():
+    # After using cp, the communication mode of this part changes.
+    # The three parts of prepare_attn, prepare_mlp, and postprocess_layer
+    # no longer require additional communication for reduce, scatter, etc.
+    return is_nsa_enable_prefill_cp()
+
+
+class NSACPLayerCommunicator(LayerCommunicator):
+    def __init__(
+        self,
+        layer_scatter_modes: LayerScatterModes,
+        input_layernorm: torch.nn.Module,
+        post_attention_layernorm: torch.nn.Module,
+        # Reduce scatter requires skipping all-reduce in model code after MoE/MLP, so only enable for models which have that implemented. Remove flag once done for all models that use LayerCommunicator.
+        allow_reduce_scatter: bool = False,
+        is_last_layer: bool = False,
+        qkv_latent_func: Optional[Callable] = None,
+    ):
+        super().__init__(
+            layer_scatter_modes,
+            input_layernorm,
+            post_attention_layernorm,
+            allow_reduce_scatter,
+            is_last_layer,
+            qkv_latent_func,
+        )
+
+    def _post_init_communicate(self):
+        # SCATTERED in attn tp is different from SCATTERED in global tp when dp_size > 1
+        if self.layer_scatter_modes.mlp_mode != ScatterMode.SCATTERED:
+            assert (
+                self._context.attn_dp_size == 1
+            ), f"dp_size should be 1 when moe_runner_backend is none"
+        self._communicate_simple_fn = NSACPCommunicateSimpleFn.get_fn(
+            input_mode=ScatterMode.SCATTERED,
+            output_mode=ScatterMode.SCATTERED,
+            context=self._context,
+        )
+        self._communicate_with_all_reduce_and_layer_norm_fn = NSACPCommunicateWithAllReduceAndLayerNormFn.get_fn(
+            hidden_states_input_mode=ScatterMode.SCATTERED,
+            residual_input_mode=ScatterMode.SCATTERED,
+            hidden_states_output_mode=self.layer_scatter_modes.mlp_mode,  # SCATTERED, FULL
+            residual_output_mode=ScatterMode.SCATTERED,
+            context=self._context,
+        )
+        self._communicate_summable_tensor_pair_fn = NSACPCommunicateSummableTensorPairFn.get_fn(
+            hidden_states_input_mode=self.layer_scatter_modes.mlp_mode,  # SCATTERED, FULL
+            residual_input_mode=ScatterMode.SCATTERED,
+            output_mode=ScatterMode.SCATTERED,
+            context=self._context,
+        )
+
+
+class NSACPCommunicateSimpleFn(CommunicateSimpleFn):
+    @staticmethod
+    def get_fn(
+        input_mode: ScatterMode,
+        output_mode: ScatterMode,
+        context: CommunicateContext,
+    ):
+        if context.is_same_group_size(input_mode, output_mode):
+            return NSACPCommunicateSimpleFn._trivial
+
+        raise NotImplementedError(f"{input_mode=} {output_mode=}")
+
+
+class NSACPCommunicateWithAllReduceAndLayerNormFn(
+    CommunicateWithAllReduceAndLayerNormFn
+):
+    """Besides communication, needs to
+    1. All reduce in tp_attn_group on hidden_states
+    2. Apply layer norm
+    """
+
+    @staticmethod
+    def get_fn(
+        hidden_states_input_mode: ScatterMode,
+        residual_input_mode: ScatterMode,
+        hidden_states_output_mode: ScatterMode,
+        residual_output_mode: ScatterMode,
+        context: CommunicateContext,
+    ):
+        assert hidden_states_input_mode == ScatterMode.SCATTERED
+        assert residual_input_mode == ScatterMode.SCATTERED
+        assert residual_output_mode == ScatterMode.SCATTERED
+        if hidden_states_output_mode == ScatterMode.SCATTERED:
+            return NSACPCommunicateWithAllReduceAndLayerNormFn._simple
+
+        if hidden_states_output_mode == ScatterMode.FULL:
+            return partial(
+                NSACPCommunicateWithAllReduceAndLayerNormFn._gather_hidden_states_and_residual,
+                residual_input_mode=residual_input_mode,
+            )
+
+        raise NotImplementedError(
+            f"{hidden_states_input_mode=} {residual_input_mode=} {hidden_states_output_mode=} {residual_output_mode=}"
+        )
+
+    @staticmethod
+    def _gather_hidden_states_and_residual(
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+        forward_batch: ForwardBatch,
+        layernorm: torch.nn.Module,
+        context: CommunicateContext,
+        *,
+        residual_input_mode,
+    ):
+        if hidden_states.shape[0] != 0:
+            hidden_states, residual = layernorm(hidden_states, residual)
+        # for prefill: attn tp scattered -> full
+        # for decode: attn tp full -> full
+        if nsa_use_prefill_cp(forward_batch):
+            assert context.attn_dp_size == 1
+            hidden_states, local_hidden_states = (
+                get_local_dp_buffer(),
+                hidden_states,
+            )
+            attn_cp_all_gather_into_tensor(
+                hidden_states,
+                local_hidden_states,
+            )
+        return hidden_states, residual
+
+
+class NSACPCommunicateSummableTensorPairFn(CommunicateSummableTensorPairFn):
+    """It is allowed to make (hidden_states, residual) := (hidden_states + residual, None) if needed."""
+
+    @staticmethod
+    def get_fn(
+        hidden_states_input_mode: ScatterMode,
+        residual_input_mode: ScatterMode,
+        output_mode: ScatterMode,
+        context: CommunicateContext,
+    ):
+        if context.is_same_group_size(
+            hidden_states_input_mode, output_mode
+        ) and context.is_same_group_size(residual_input_mode, output_mode):
+            return NSACPCommunicateSummableTensorPairFn._trivial
+
+        if (
+            (hidden_states_input_mode == ScatterMode.FULL)
+            and (residual_input_mode == ScatterMode.SCATTERED)
+            and (output_mode == ScatterMode.SCATTERED)
+        ):
+            return NSACPCommunicateSummableTensorPairFn._scatter_hidden_states
+
+        raise NotImplementedError(
+            f"{hidden_states_input_mode=} {residual_input_mode=} {output_mode=}"
+        )
+
+    @staticmethod
+    def _scatter_hidden_states(
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor,
+        forward_batch: ForwardBatch,
+        context: CommunicateContext,
+        allow_reduce_scatter: bool = False,
+    ):
+        # for prefill: full -> attn tp scattered
+        # for decode: full -> attn tp full
+        if nsa_use_prefill_cp(forward_batch):
+            assert context.attn_dp_size == 1
+            input_hidden_states = hidden_states
+            hidden_states = hidden_states.tensor_split(context.attn_cp_size)[
+                context.attn_cp_rank
+            ]
+            attn_cp_reduce_scatter_tensor(hidden_states, input_hidden_states)
+        return hidden_states, residual
diff --git a/sglang/python/sglang/srt/layers/deep_gemm_wrapper/__init__.py b/sglang/python/sglang/srt/layers/deep_gemm_wrapper/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..32f3ebe043645242d492972194be204a5411ef17
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/deep_gemm_wrapper/__init__.py
@@ -0,0 +1 @@
+from .entrypoint import *
diff --git a/sglang/python/sglang/srt/layers/deep_gemm_wrapper/__pycache__/__init__.cpython-311.pyc b/sglang/python/sglang/srt/layers/deep_gemm_wrapper/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1cd7877ea001f7b213d6ae1a754d957b140ff1d2
Binary files /dev/null and b/sglang/python/sglang/srt/layers/deep_gemm_wrapper/__pycache__/__init__.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/deep_gemm_wrapper/__pycache__/compile_utils.cpython-311.pyc b/sglang/python/sglang/srt/layers/deep_gemm_wrapper/__pycache__/compile_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a8d35694879a8ad2b24b26e1f4b6ce261af6bb9f
Binary files /dev/null and b/sglang/python/sglang/srt/layers/deep_gemm_wrapper/__pycache__/compile_utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/deep_gemm_wrapper/__pycache__/configurer.cpython-311.pyc b/sglang/python/sglang/srt/layers/deep_gemm_wrapper/__pycache__/configurer.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9e2925e3268e98d52f6712ad7abd7cd09ad9567d
Binary files /dev/null and b/sglang/python/sglang/srt/layers/deep_gemm_wrapper/__pycache__/configurer.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/deep_gemm_wrapper/__pycache__/entrypoint.cpython-311.pyc b/sglang/python/sglang/srt/layers/deep_gemm_wrapper/__pycache__/entrypoint.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ad4c2dea93713add6e5fc610d3200059ca922b38
Binary files /dev/null and b/sglang/python/sglang/srt/layers/deep_gemm_wrapper/__pycache__/entrypoint.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/deep_gemm_wrapper/compile_utils.py b/sglang/python/sglang/srt/layers/deep_gemm_wrapper/compile_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..6aef866f5693790c6c2337259ccdec77adb4112a
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/deep_gemm_wrapper/compile_utils.py
@@ -0,0 +1,341 @@
+import logging
+import os
+from contextlib import contextmanager
+from enum import IntEnum, auto
+from typing import Dict, List, Tuple
+
+import torch
+from tqdm import tqdm
+
+from sglang.srt.distributed.device_communicators.pynccl_allocator import (
+    disable_symmetric_memory_context,
+    restore_symmetric_memory_context,
+)
+from sglang.srt.environ import envs
+from sglang.srt.layers.deep_gemm_wrapper.configurer import ENABLE_JIT_DEEPGEMM
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import ceil_div, get_available_gpu_memory
+
+logger = logging.getLogger(__name__)
+
+if ENABLE_JIT_DEEPGEMM:
+    import deep_gemm
+
+
+_BUILTIN_M_LIST = list(range(1, 1024 * 16 + 1))
+_ENABLE_JIT_DEEPGEMM_PRECOMPILE = envs.SGLANG_JIT_DEEPGEMM_PRECOMPILE.get()
+_DO_COMPILE_ALL = True
+_IS_FIRST_RANK_ON_NODE = envs.SGLANG_IS_FIRST_RANK_ON_NODE.get()
+_IN_PRECOMPILE_STAGE = envs.SGLANG_IN_DEEPGEMM_PRECOMPILE_STAGE.get()
+_FAST_WARMUP = envs.SGLANG_JIT_DEEPGEMM_FAST_WARMUP.get()
+
+# Force redirect deep_gemm cache_dir
+os.environ["DG_JIT_CACHE_DIR"] = os.getenv(
+    "SGLANG_DG_CACHE_DIR", os.path.join(os.path.expanduser("~"), ".cache", "deep_gemm")
+)
+
+# Refer to https://github.com/deepseek-ai/DeepGEMM/commit/d75b218b7b8f4a5dd5406ac87905039ead3ae42f
+# NVRTC may have performance loss with some cases.
+# And NVCC JIT speed is also 9x faster in the ref commit
+os.environ["DG_JIT_USE_NVRTC"] = os.getenv("SGL_DG_USE_NVRTC", "0")
+
+
+def update_deep_gemm_config(gpu_id: int, server_args: ServerArgs):
+    global _BUILTIN_M_LIST
+    global _DO_COMPILE_ALL
+    global _IS_FIRST_RANK_ON_NODE
+
+    _BUILTIN_M_LIST = []
+
+    if _FAST_WARMUP:
+        # In fast warmup mode, only compile a small set of typical Ms
+
+        # First cover all the small bs to ensure decode performance
+        _BUILTIN_M_LIST += list(range(1, 1025))
+
+        # Then cover larger batch sizes with gradually increasing steps
+        # For example, when chunekd prefill size is 16384
+        # The sampled Ms would be:
+        #   1024, 1026, ... 2046 (step 2)
+        #   2048, 2052, ... 4092 (step 4)
+        #   4096, 5004, ... 8184 (step 8)
+        #   8192, 9008, ... 16384 (step 16)
+        # Totally 1024 + 1024 / 2 + 2048 / 4 + 4096 / 8 + 8192 / 16 = 3072 kernels
+        next_m, sample_step = 1024, 2
+        max_prefill_bs = (
+            min(server_args.chunked_prefill_size, 32 * 1024)
+            if server_args.chunked_prefill_size >= 1
+            else 16 * 1024
+        )
+        while next_m < max_prefill_bs:
+            _BUILTIN_M_LIST += list(range(next_m, 2 * next_m, sample_step))
+            next_m = next_m * 2
+            sample_step = sample_step * 2
+        _BUILTIN_M_LIST.append(max_prefill_bs)
+        _BUILTIN_M_LIST = sorted(list(set(_BUILTIN_M_LIST)))
+    else:
+        # When fast warmup isn't enabled, generate m_max and compile all the covered Ms.
+        m_max = 1024 * 16
+        if server_args.chunked_prefill_size < 1:
+            m_max = 1024 * 64
+        elif server_args.chunked_prefill_size > 8192:
+            m_max = server_args.chunked_prefill_size * 2
+        m_max = min(1024 * 128, m_max)
+        _BUILTIN_M_LIST += list(range(1, m_max + 1))
+
+    _IS_FIRST_RANK_ON_NODE = server_args.base_gpu_id == gpu_id
+
+    # Check if is the first rank on node.
+    # Default each rank will try compile all Ms to
+    # load all symbols at the launch stages.
+    # Avoid loading symbols at the serving stages.
+    _DO_COMPILE_ALL = _IS_FIRST_RANK_ON_NODE
+
+
+class DeepGemmKernelType(IntEnum):
+    GROUPED_GEMM_NT_F8F8BF16_MASKED = auto()
+    GROUPED_GEMM_NT_F8F8BF16_CONTIG = auto()
+    GEMM_NT_F8F8BF16 = auto()
+    GEMM_NT_BF16BF16F32 = auto()
+
+
+_INITIALIZATION_DICT: Dict[Tuple[DeepGemmKernelType, int, int, int], bool] = dict()
+
+
+# TODO improve code
+def _maybe_compile_deep_gemm_one_type_all(
+    kernel_type: DeepGemmKernelType,
+    n: int,
+    k: int,
+    num_groups: int,
+) -> None:
+    global _INITIALIZATION_DICT
+    global _BUILTIN_M_LIST
+
+    query_key = (kernel_type, n, k, num_groups)
+    if (
+        _ENABLE_JIT_DEEPGEMM_PRECOMPILE
+        and _DO_COMPILE_ALL
+        and _INITIALIZATION_DICT.get(query_key) is None
+    ):
+        _INITIALIZATION_DICT[query_key] = True
+
+        # TODO maybe improve logs
+        if not _IN_PRECOMPILE_STAGE and _IS_FIRST_RANK_ON_NODE:
+            logger.warning(
+                "Entering DeepGEMM JIT Pre-Compile session. "
+                "It may take a long time (typically 10-20 mins) "
+                "if you have not run `sglang.compile_deep_gemm`. "
+                "It is recommended to run `sglang.compile_deep_gemm` with same args as `sglang.launch_server`"
+                " for pre-compilation to reduce the overhead if you have not run it before. "
+                "For example: "
+                "`python3 -m sglang.compile_deep_gemm --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code`"
+            )
+
+        logger.info(
+            f"Try DeepGEMM JIT Compiling for "
+            f"<{kernel_type.name}> N={n}, K={k}, num_groups={num_groups} with all Ms."
+            f"{' It only takes a little time (typically 1 sec) if you have run `python3 -m sglang.compile_deep_gemm`. ' if not _IN_PRECOMPILE_STAGE else ''}"
+        )
+
+        _compile_deep_gemm_one_type_all(
+            kernel_type=kernel_type,
+            n=n,
+            k=k,
+            num_groups=num_groups,
+            m_list=_BUILTIN_M_LIST,
+        )
+
+
+# NOTE(alcanderian): get_num_sms should be change when 2-batch-overlap is introduced
+def _compile_deep_gemm_one_type_all(
+    kernel_type: DeepGemmKernelType,
+    n: int,
+    k: int,
+    num_groups: int,
+    m_list: List[int],
+) -> None:
+    # Symmetric memory allocation performs a collective operation across all the GPUs.
+    # Temporary disable symmetric memory during compilation since it only runs on the first rank.
+    saved_context = disable_symmetric_memory_context()
+    try:
+        if kernel_type == DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_CONTIG:
+            m_alignment = deep_gemm.get_mk_alignment_for_contiguous_layout()
+            m_list = sorted(list(set(m for m in m_list if m % m_alignment == 0)))
+
+        # Here the precompilation is only run on the first rank, so gpu_id should be 0
+        memory_budget = get_available_gpu_memory(device="cuda", gpu_id=0)
+
+        # If the memory budget is less memory requirement, we need to reduce max_m to avoid out of memory, which might further cause hanging during warmup
+        max_m = max(m_list)
+        required_memory = _BaseWarmupExecutor.get_memory_requirement(
+            kernel_type, max_m=max_m, n=n, k=k, num_groups=num_groups
+        )
+        logger.info(
+            f"Required memory for warmup: {required_memory}GB, Available memory: {memory_budget}GB"
+        )
+        if memory_budget < required_memory:
+            # TODO: Maybe compute the max_m based on the memory budget
+            while (
+                _BaseWarmupExecutor.get_memory_requirement(
+                    kernel_type, max_m=max_m, n=n, k=k, num_groups=num_groups
+                )
+                > memory_budget
+                and max_m > 4096
+            ):
+                max_m = max_m // 2
+            logger.warning(
+                f"Available memory {memory_budget}GB is less than required memory {required_memory}GB for warmup, reducing max_m to {max_m} to avoid out of memory"
+            )
+            m_list = [m for m in m_list if m <= max_m]
+
+        # Need some methods to estimate needed memory for warmup
+        executor = _BaseWarmupExecutor.create(
+            kernel_type, max_m=max_m, n=n, k=k, num_groups=num_groups
+        )
+
+        old_compile_mode = deep_gemm.get_compile_mode()
+        deep_gemm.set_compile_mode(1)
+        # TODO can use multi thread
+        for m in tqdm(m_list, desc=f"DeepGEMM warmup"):
+            executor.execute(m=m)
+        deep_gemm.set_compile_mode(old_compile_mode)
+
+        # clean up input buffers
+        torch.cuda.current_stream().synchronize()
+        del executor
+        torch.cuda.empty_cache()
+    finally:
+        # Restore symmetric memory context
+        restore_symmetric_memory_context(saved_context)
+
+
+class _BaseWarmupExecutor:
+    @staticmethod
+    def create(kernel_type: DeepGemmKernelType, **kwargs):
+        return {
+            DeepGemmKernelType.GEMM_NT_F8F8BF16: _NormalWarmupExecutor,
+            DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_CONTIG: _GroupedContWarmupExecutor,
+            DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_MASKED: _GroupedMaskedWarmupExecutor,
+            DeepGemmKernelType.GEMM_NT_BF16BF16F32: _BF16F32WarmupExecutor,
+        }[kernel_type](**kwargs)
+
+    @staticmethod
+    def get_memory_requirement(
+        kernel_type: DeepGemmKernelType, max_m: int, n: int, k: int, num_groups: int
+    ) -> int:
+        # Return the required memory space in GB for warmup executor
+        _GB = 1 << 30
+        if kernel_type == DeepGemmKernelType.GEMM_NT_F8F8BF16:
+            return (max_m * k + n * k + max_m * n * 2) / _GB
+        elif kernel_type == DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_CONTIG:
+            return (max_m * k + num_groups * n * k + max_m * 4 + max_m * n * 2) / _GB
+        elif kernel_type == DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_MASKED:
+            return (
+                num_groups * max_m * k
+                + num_groups * n * k
+                + num_groups * 4
+                + num_groups * max_m * n * 2
+            ) / _GB
+        elif kernel_type == DeepGemmKernelType.GEMM_NT_BF16BF16F32:
+            # bf16 lhs + bf16 rhs + fp32 out
+            return (max_m * k * 2 + n * k * 2 + max_m * n * 4) / _GB
+        else:
+            raise ValueError(f"Invalid kernel type: {kernel_type}")
+
+    def execute(self, m):
+        raise NotImplementedError
+
+
+def _empty_token_fp8(size):
+    *dims, k = size
+    return (
+        torch.empty(size, device="cuda", dtype=torch.float8_e4m3fn),
+        torch.empty(
+            (*dims, ceil_div(k, _BLOCK_SIZE)), device="cuda", dtype=torch.float32
+        ),
+    )
+
+
+def _empty_block_fp8(size):
+    *dims, n, k = size
+    return (
+        torch.empty(size, device="cuda", dtype=torch.float8_e4m3fn),
+        torch.empty(
+            (*dims, ceil_div(n, _BLOCK_SIZE), ceil_div(k, _BLOCK_SIZE)),
+            device="cuda",
+            dtype=torch.float32,
+        ),
+    )
+
+
+_BLOCK_SIZE = 128
+
+
+class _NormalWarmupExecutor(_BaseWarmupExecutor):
+    def __init__(self, max_m: int, n: int, k: int, num_groups: int):
+        self.lhs_q, self.lhs_s = _empty_token_fp8((max_m, k))
+        self.rhs_q, self.rhs_s = _empty_block_fp8((n, k))
+        self.out = torch.empty((max_m, n), device="cuda", dtype=torch.bfloat16)
+
+    def execute(self, m):
+        deep_gemm.fp8_gemm_nt(
+            (self.lhs_q[:m], self.lhs_s[:m]),
+            (self.rhs_q, self.rhs_s),
+            self.out[:m],
+        )
+
+
+class _GroupedContWarmupExecutor(_BaseWarmupExecutor):
+    def __init__(self, max_m: int, n: int, k: int, num_groups: int):
+        self.lhs_q, self.lhs_s = _empty_token_fp8((max_m, k))
+        self.rhs_q, self.rhs_s = _empty_block_fp8((num_groups, n, k))
+        self.m_indices = torch.zeros((max_m,), device="cuda", dtype=torch.int32)
+        self.out = torch.empty((max_m, n), device="cuda", dtype=torch.bfloat16)
+
+    def execute(self, m):
+        deep_gemm.m_grouped_fp8_gemm_nt_contiguous(
+            (self.lhs_q[:m], self.lhs_s[:m]),
+            (self.rhs_q, self.rhs_s),
+            self.out[:m],
+            m_indices=self.m_indices[:m],
+        )
+
+
+class _GroupedMaskedWarmupExecutor(_BaseWarmupExecutor):
+    def __init__(self, max_m: int, n: int, k: int, num_groups: int):
+        self.lhs_q, self.lhs_s = _empty_token_fp8((num_groups, max_m, k))
+        self.rhs_q, self.rhs_s = _empty_block_fp8((num_groups, n, k))
+        self.masked_m = torch.zeros((num_groups,), device="cuda", dtype=torch.int32)
+        self.out = torch.empty(
+            (num_groups, max_m, n), device="cuda", dtype=torch.bfloat16
+        )
+
+    def execute(self, m):
+        deep_gemm.fp8_m_grouped_gemm_nt_masked(
+            (self.lhs_q, self.lhs_s),
+            (self.rhs_q, self.rhs_s),
+            self.out,
+            masked_m=self.masked_m,
+            # DeepGEMM uses `expect_m` instead of input shape for `get_best_config`
+            expected_m=m,
+        )
+
+
+class _BF16F32WarmupExecutor(_BaseWarmupExecutor):
+    def __init__(self, max_m: int, n: int, k: int, num_groups: int):
+        self.lhs = torch.empty((max_m, k), device="cuda", dtype=torch.bfloat16)
+        self.rhs = torch.empty((n, k), device="cuda", dtype=torch.bfloat16)
+        self.out = torch.empty((max_m, n), device="cuda", dtype=torch.float32)
+
+    def execute(self, m):
+        deep_gemm.bf16_gemm_nt(self.lhs[:m], self.rhs, self.out[:m])
+
+
+@contextmanager
+def deep_gemm_execution_hook(
+    m: int, n: int, k: int, num_groups: int, kernel_type: DeepGemmKernelType
+):
+    if m > 0:
+        _maybe_compile_deep_gemm_one_type_all(kernel_type, n, k, num_groups)
+    yield
diff --git a/sglang/python/sglang/srt/layers/deep_gemm_wrapper/configurer.py b/sglang/python/sglang/srt/layers/deep_gemm_wrapper/configurer.py
new file mode 100644
index 0000000000000000000000000000000000000000..34494f59914cc5e7974dec19678801da94beaa07
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/deep_gemm_wrapper/configurer.py
@@ -0,0 +1,25 @@
+import logging
+
+from sglang.srt.environ import envs
+from sglang.srt.utils import get_device_sm, is_blackwell_supported
+
+logger = logging.getLogger(__name__)
+
+
+def _compute_enable_deep_gemm():
+    sm_version = get_device_sm()
+    if sm_version < 90:
+        return False
+
+    try:
+        import deep_gemm  # noqa: F401
+    except ImportError:
+        return False
+
+    return envs.SGLANG_ENABLE_JIT_DEEPGEMM.get()
+
+
+ENABLE_JIT_DEEPGEMM = _compute_enable_deep_gemm()
+
+DEEPGEMM_BLACKWELL = ENABLE_JIT_DEEPGEMM and is_blackwell_supported()
+DEEPGEMM_SCALE_UE8M0 = DEEPGEMM_BLACKWELL
diff --git a/sglang/python/sglang/srt/layers/deep_gemm_wrapper/entrypoint.py b/sglang/python/sglang/srt/layers/deep_gemm_wrapper/entrypoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..8eada5be66ede0b0624a6b721dc1075fb05f3306
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/deep_gemm_wrapper/entrypoint.py
@@ -0,0 +1,148 @@
+import logging
+from contextlib import contextmanager
+from typing import Any, Optional, Tuple
+
+import torch
+
+from sglang.srt.layers.deep_gemm_wrapper import compile_utils
+from sglang.srt.layers.deep_gemm_wrapper.configurer import (  # noqa: F401
+    DEEPGEMM_BLACKWELL,
+    DEEPGEMM_SCALE_UE8M0,
+    ENABLE_JIT_DEEPGEMM,
+)
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import get_bool_env_var
+
+logger = logging.getLogger(__name__)
+
+if ENABLE_JIT_DEEPGEMM:
+    import deep_gemm
+    from deep_gemm.utils.layout import get_mn_major_tma_aligned_tensor  # noqa: F401
+
+_SANITY_CHECK = get_bool_env_var("SGLANG_DEEPGEMM_SANITY_CHECK")
+
+
+# TODO maybe rename these functions
+def grouped_gemm_nt_f8f8bf16_masked(
+    lhs: Tuple[torch.Tensor, torch.Tensor],
+    rhs: Tuple[torch.Tensor, torch.Tensor],
+    out: torch.Tensor,
+    masked_m: torch.Tensor,
+    expected_m: int,
+    overlap_args: Optional[Any] = None,
+    max_block_n: int = 256,
+):
+    num_groups, _, k = lhs[0].shape
+    _, n, _ = rhs[0].shape
+    kernel_type = compile_utils.DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_MASKED
+
+    _sanity_check_input(lhs)
+    _sanity_check_input(rhs)
+
+    with compile_utils.deep_gemm_execution_hook(
+        expected_m, n, k, num_groups, kernel_type
+    ):
+        with configure_deep_gemm_num_sms(
+            overlap_args.num_sms if overlap_args is not None else None
+        ):
+
+            return deep_gemm.fp8_m_grouped_gemm_nt_masked(
+                lhs,
+                rhs,
+                out,
+                masked_m,
+                expected_m,
+                **(
+                    dict(
+                        enable_overlap=True,
+                        max_block_n=max_block_n,
+                        signal=overlap_args.signal,
+                    )
+                    if overlap_args is not None
+                    else {}
+                ),
+            )
+
+
+def grouped_gemm_nt_f8f8bf16_contig(
+    lhs: Tuple[torch.Tensor, torch.Tensor],
+    rhs: Tuple[torch.Tensor, torch.Tensor],
+    out: torch.Tensor,
+    m_indices: torch.Tensor,
+):
+    m, k = lhs[0].shape
+    num_groups, n, _ = rhs[0].shape
+    kernel_type = compile_utils.DeepGemmKernelType.GROUPED_GEMM_NT_F8F8BF16_CONTIG
+
+    _sanity_check_input(lhs)
+    _sanity_check_input(rhs)
+
+    with compile_utils.deep_gemm_execution_hook(m, n, k, num_groups, kernel_type):
+        deep_gemm.m_grouped_fp8_gemm_nt_contiguous(lhs, rhs, out, m_indices)
+
+
+def gemm_nt_f8f8bf16(
+    lhs: Tuple[torch.Tensor, torch.Tensor],
+    rhs: Tuple[torch.Tensor, torch.Tensor],
+    out: torch.Tensor,
+):
+    m, k = lhs[0].shape
+    n, _ = rhs[0].shape
+    num_groups = 1
+    kernel_type = compile_utils.DeepGemmKernelType.GEMM_NT_F8F8BF16
+
+    _sanity_check_input(lhs)
+    _sanity_check_input(rhs)
+
+    with compile_utils.deep_gemm_execution_hook(m, n, k, num_groups, kernel_type):
+        deep_gemm.fp8_gemm_nt(
+            lhs,
+            rhs,
+            out,
+        )
+
+
+def gemm_nt_bf16bf16f32(
+    lhs: torch.Tensor,
+    rhs: torch.Tensor,
+    out: torch.Tensor,
+):
+    m, k = lhs.shape
+    n, _ = rhs.shape
+    num_groups = 1
+    kernel_type = compile_utils.DeepGemmKernelType.GEMM_NT_BF16BF16F32
+
+    with compile_utils.deep_gemm_execution_hook(m, n, k, num_groups, kernel_type):
+        deep_gemm.bf16_gemm_nt(lhs, rhs, out)
+
+
+def update_deep_gemm_config(gpu_id: int, server_args: ServerArgs):
+    compile_utils.update_deep_gemm_config(gpu_id, server_args)
+
+
+@contextmanager
+def configure_deep_gemm_num_sms(num_sms):
+    if num_sms is None:
+        yield
+    else:
+        original_num_sms = deep_gemm.get_num_sms()
+        deep_gemm.set_num_sms(num_sms)
+        try:
+            yield
+        finally:
+            deep_gemm.set_num_sms(original_num_sms)
+
+
+def _sanity_check_input(x_fp8: Tuple[torch.Tensor, torch.Tensor]):
+    if not _SANITY_CHECK:
+        return
+
+    x, x_scale = x_fp8
+
+    if x_scale.dtype == torch.int:
+        return
+
+    from sglang.srt.layers.quantization.fp8_utils import ceil_to_ue8m0
+
+    x_scale_ceil = ceil_to_ue8m0(x_scale)
+    assert torch.all(x_scale == x_scale_ceil), f"{x_scale=} {x_scale_ceil=}"
diff --git a/sglang/python/sglang/srt/layers/dp_attention.py b/sglang/python/sglang/srt/layers/dp_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e9ea6bbd219b6b6bae1820887112a4ebcc77df2
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/dp_attention.py
@@ -0,0 +1,577 @@
+from __future__ import annotations
+
+import functools
+import logging
+from contextlib import contextmanager
+from enum import IntEnum, auto
+from typing import TYPE_CHECKING, List, Optional, Tuple
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.distributed import (
+    GroupCoordinator,
+    get_attn_context_model_parallel_rank,
+    get_attn_context_model_parallel_world_size,
+    get_attn_cp_group,
+    get_attn_tensor_model_parallel_rank,
+    get_attn_tensor_model_parallel_world_size,
+    get_attn_tp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    get_tp_group,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.distributed.device_communicators.pynccl_allocator import (
+    use_symmetric_memory,
+)
+from sglang.srt.utils import get_bool_env_var, is_hip
+
+if TYPE_CHECKING:
+    from sglang.srt.configs.model_config import ModelConfig
+    from sglang.srt.server_args import ServerArgs
+
+logger = logging.getLogger(__name__)
+
+if TYPE_CHECKING:
+    from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+
+_ATTN_DP_RANK: Optional[int] = None
+_ATTN_DP_SIZE: Optional[int] = None
+_LOCAL_ATTN_DP_SIZE: Optional[int] = None
+_LOCAL_ATTN_DP_RANK: Optional[int] = None
+_ENABLE_DP_ATTENTION_FLAG: bool = False
+
+_is_hip = is_hip()
+_USE_ROCM700A_WA = _is_hip and get_bool_env_var("SGLANG_USE_ROCM700A")
+
+
+class DpPaddingMode(IntEnum):
+
+    # Padding tokens to max length and then gather tokens using `all_gather_into_tensor`
+    MAX_LEN = auto()
+    # Padding tokens to sum length and then gather tokens using `all_reduce`
+    SUM_LEN = auto()
+
+    def is_max_len(self):
+        return self == DpPaddingMode.MAX_LEN
+
+    def is_sum_len(self):
+        return self == DpPaddingMode.SUM_LEN
+
+    @classmethod
+    def get_dp_padding_mode(
+        cls, is_extend_in_batch, global_num_tokens: List[int]
+    ) -> DpPaddingMode:
+        if is_extend_in_batch:
+            return DpPaddingMode.SUM_LEN
+
+        # we choose the mode that minimizes the communication cost
+        max_len = max(global_num_tokens)
+        sum_len = sum(global_num_tokens)
+        if sum_len * 2 > max_len * get_attention_dp_size():
+            return cls.MAX_LEN
+        else:
+            return cls.SUM_LEN
+
+    @classmethod
+    def get_default_mode_in_cuda_graph(cls) -> DpPaddingMode:
+        # TODO(kkhuang-amd): noqa, temporary work-around for rocm 7.0.0 alpha
+        # it can be safely removed later, once RCCL fixed
+        if _USE_ROCM700A_WA:
+            return cls.SUM_LEN
+        else:
+            return cls.MAX_LEN
+
+
+class _DpGatheredBufferWrapper:
+
+    _hidden_size: int
+    _dtype: torch.dtype
+    _device: torch.device
+    _global_dp_buffer_len: int
+    _local_dp_buffer_len: int
+    _dp_max_padding: bool
+    _global_num_tokens: Optional[List[int]]
+    _is_extend_in_batch: bool
+
+    @classmethod
+    def set_metadata(cls, hidden_size: int, dtype: torch.dtype, device: torch.device):
+        cls._hidden_size = hidden_size
+        cls._dtype = dtype
+        cls._device = device
+
+    @classmethod
+    def set_dp_buffer_len(
+        cls,
+        global_dp_buffer_len: int,
+        local_dp_buffer_len: int,
+        dp_max_padding: bool,
+        global_num_tokens: Optional[List[int]] = None,
+    ):
+        cls._global_dp_buffer_len = global_dp_buffer_len
+        cls._local_dp_buffer_len = local_dp_buffer_len
+        cls._dp_max_padding = dp_max_padding
+        cls._global_num_tokens = global_num_tokens
+
+    @classmethod
+    def get_global_dp_buffer(cls) -> torch.Tensor:
+        with use_symmetric_memory(get_tp_group(), disabled=not cls._dp_max_padding):
+            buffer = torch.empty(
+                (cls._global_dp_buffer_len, cls._hidden_size),
+                dtype=cls._dtype,
+                device=cls._device,
+            )
+        return buffer
+
+    @classmethod
+    def get_local_dp_buffer(cls) -> torch.Tensor:
+        with use_symmetric_memory(get_tp_group(), disabled=not cls._dp_max_padding):
+            buffer = torch.empty(
+                (cls._local_dp_buffer_len, cls._hidden_size),
+                dtype=cls._dtype,
+                device=cls._device,
+            )
+        return buffer
+
+    @classmethod
+    def get_global_dp_buffer_len(cls) -> int:
+        return cls._global_dp_buffer_len
+
+    @classmethod
+    def get_local_dp_buffer_len(cls) -> int:
+        return cls._local_dp_buffer_len
+
+    @classmethod
+    def get_dp_global_num_tokens(cls) -> List[int]:
+        return cls._global_num_tokens
+
+    @classmethod
+    def get_dp_hidden_size(cls) -> int:
+        return cls._hidden_size
+
+    @classmethod
+    def get_dp_dtype(cls) -> torch.dtype:
+        return cls._dtype
+
+    @classmethod
+    def get_dp_device(cls) -> torch.device:
+        return cls._device
+
+    @classmethod
+    def set_is_extend_in_batch(cls, is_extend_in_batch: bool):
+        cls._is_extend_in_batch = is_extend_in_batch
+
+    @classmethod
+    def get_is_extend_in_batch(cls) -> bool:
+        return cls._is_extend_in_batch
+
+    @classmethod
+    def is_dp_max_padding(cls) -> bool:
+        return cls._dp_max_padding
+
+
+def set_dp_buffer_len(
+    global_dp_buffer_len: int,
+    local_dp_buffer_len: int,
+    dp_max_padding: bool,
+    global_num_tokens: Optional[List[int]] = None,
+):
+    _DpGatheredBufferWrapper.set_dp_buffer_len(
+        global_dp_buffer_len, local_dp_buffer_len, dp_max_padding, global_num_tokens
+    )
+
+
+def get_global_dp_buffer() -> torch.Tensor:
+    return _DpGatheredBufferWrapper.get_global_dp_buffer()
+
+
+def get_local_dp_buffer() -> torch.Tensor:
+    return _DpGatheredBufferWrapper.get_local_dp_buffer()
+
+
+def get_global_dp_buffer_len() -> int:
+    return _DpGatheredBufferWrapper.get_global_dp_buffer_len()
+
+
+def get_local_dp_buffer_len() -> int:
+    return _DpGatheredBufferWrapper.get_local_dp_buffer_len()
+
+
+def get_dp_global_num_tokens() -> List[int]:
+    return _DpGatheredBufferWrapper.get_dp_global_num_tokens()
+
+
+def get_dp_hidden_size() -> int:
+    return _DpGatheredBufferWrapper.get_dp_hidden_size()
+
+
+def get_dp_dtype() -> torch.dtype:
+    return _DpGatheredBufferWrapper.get_dp_dtype()
+
+
+def get_dp_device() -> torch.device:
+    return _DpGatheredBufferWrapper.get_dp_device()
+
+
+def set_is_extend_in_batch(is_extend_in_batch: bool):
+    _DpGatheredBufferWrapper.set_is_extend_in_batch(is_extend_in_batch)
+
+
+def get_is_extend_in_batch() -> bool:
+    return _DpGatheredBufferWrapper.get_is_extend_in_batch()
+
+
+def is_dp_max_padding() -> bool:
+    return _DpGatheredBufferWrapper.is_dp_max_padding()
+
+
+def compute_dp_attention_world_info(
+    enable_dp_attention, tp_rank, tp_size, dp_size, attn_cp_size: int = 1
+):
+    attn_dp_size = dp_size if enable_dp_attention else 1
+    attn_tp_size = tp_size // attn_dp_size // attn_cp_size
+    attn_tp_rank = tp_rank % attn_tp_size
+
+    if not enable_dp_attention:
+        attn_dp_rank = 0
+    else:
+        # Rank layout is (dp, cp, tp) where tp is the fastest-changing dim:
+        # tp_rank = (attn_dp_rank * attn_cp_size + attn_cp_rank) * attn_tp_size + attn_tp_rank
+        attn_dp_rank = tp_rank // (attn_tp_size * attn_cp_size)
+
+    return attn_tp_rank, attn_tp_size, attn_dp_rank
+
+
+def compute_dp_attention_local_info(
+    enable_dp_attention, tp_rank, tp_size, dp_size, moe_dense_tp_size
+):
+    if not enable_dp_attention:
+        return tp_rank, tp_size, 0
+
+    local_tp_size = moe_dense_tp_size if moe_dense_tp_size else tp_size
+    local_tp_rank = tp_rank % local_tp_size
+    local_dp_size = max(1, dp_size // (tp_size // local_tp_size))
+
+    local_attn_tp_size = local_tp_size // local_dp_size
+    local_attn_dp_rank = local_tp_rank // local_attn_tp_size
+    local_attn_tp_rank = local_tp_rank % local_attn_tp_size
+
+    return local_attn_tp_rank, local_attn_tp_size, local_attn_dp_rank
+
+
+def initialize_dp_attention(
+    server_args: ServerArgs,
+    model_config: ModelConfig,
+):
+    global _ATTN_DP_RANK, _ATTN_DP_SIZE
+    global _LOCAL_ATTN_DP_SIZE, _LOCAL_ATTN_DP_RANK, _ENABLE_DP_ATTENTION_FLAG
+    enable_dp_attention = server_args.enable_dp_attention
+    dp_size = server_args.dp_size
+    moe_dense_tp_size = server_args.moe_dense_tp_size
+    attn_cp_size = server_args.attn_cp_size
+
+    _ENABLE_DP_ATTENTION_FLAG = enable_dp_attention
+
+    tp_rank = get_tensor_model_parallel_rank()
+    tp_size = get_tensor_model_parallel_world_size()
+
+    _, _, _ATTN_DP_RANK = compute_dp_attention_world_info(
+        enable_dp_attention, tp_rank, tp_size, dp_size, attn_cp_size
+    )
+    _, _, _LOCAL_ATTN_DP_RANK = compute_dp_attention_local_info(
+        enable_dp_attention, tp_rank, tp_size, dp_size, moe_dense_tp_size
+    )
+
+    if enable_dp_attention:
+        _ATTN_DP_SIZE = dp_size
+        if moe_dense_tp_size is None:
+            _LOCAL_ATTN_DP_SIZE = _ATTN_DP_SIZE
+        else:
+            _LOCAL_ATTN_DP_SIZE = max(1, dp_size // (tp_size // moe_dense_tp_size))
+    else:
+        _ATTN_DP_SIZE = 1
+        _LOCAL_ATTN_DP_SIZE = 1
+
+    _DpGatheredBufferWrapper.set_metadata(
+        hidden_size=model_config.hidden_size,
+        dtype=model_config.dtype,
+        device=torch.device(server_args.device),
+    )
+
+
+def is_dp_attention_enabled() -> bool:
+    return _ENABLE_DP_ATTENTION_FLAG
+
+
+def is_allocation_symmetric() -> bool:
+    return not is_dp_attention_enabled() or is_dp_max_padding()
+
+
+def get_attention_tp_group() -> GroupCoordinator:
+    return get_attn_tp_group()
+
+
+def get_attention_tp_rank() -> int:
+    return get_attn_tensor_model_parallel_rank()
+
+
+def get_attention_tp_size() -> int:
+    return get_attn_tensor_model_parallel_world_size()
+
+
+def get_attention_cp_group() -> GroupCoordinator:
+    return get_attn_cp_group()
+
+
+def get_attention_cp_rank() -> int:
+    return get_attn_context_model_parallel_rank()
+
+
+def get_attention_cp_size() -> int:
+    return get_attn_context_model_parallel_world_size()
+
+
+def get_attention_dp_rank() -> int:
+    assert _ATTN_DP_RANK is not None, "dp attention not initialized!"
+    return _ATTN_DP_RANK
+
+
+def get_attention_dp_size() -> int:
+    assert _ATTN_DP_SIZE is not None, "dp attention not initialized!"
+    return _ATTN_DP_SIZE
+
+
+def get_local_attention_dp_rank() -> int:
+    assert _LOCAL_ATTN_DP_RANK is not None, "dp attention not initialized!"
+    return _LOCAL_ATTN_DP_RANK
+
+
+def get_local_attention_dp_size() -> int:
+    assert _LOCAL_ATTN_DP_SIZE is not None, "dp attention not initialized!"
+    return _LOCAL_ATTN_DP_SIZE
+
+
+@contextmanager
+def disable_dp_size():
+    """Patch the tp group temporarily until this function ends.
+
+    This method is for draft workers of speculative decoding to run draft model
+    with different tp degree from that of target model workers.
+
+    Args:
+        tp_group (GroupCoordinator): the tp group coordinator
+    """
+    global _ATTN_DP_SIZE
+    assert _ATTN_DP_SIZE is not None, "dp attention not initialized!"
+
+    old_dp_size = _ATTN_DP_SIZE
+    _ATTN_DP_SIZE = 1
+    try:
+        yield
+    finally:
+        _ATTN_DP_SIZE = old_dp_size
+
+
+def get_dp_local_info(forward_batch: ForwardBatch) -> Tuple[torch.Tensor, torch.Tensor]:
+    # `get_dp_local_info` is only called in global DP gather and scatter. We use global DP rank here.
+    dp_rank = get_attention_dp_rank()
+
+    if forward_batch.dp_local_start_pos is None:
+        cumtokens = torch.cumsum(forward_batch.global_num_tokens_gpu, dim=0)
+        if dp_rank == 0:
+            local_start_pos = torch.zeros_like(cumtokens[0])
+        else:
+            local_start_pos = cumtokens[dp_rank - 1]
+        local_num_tokens = forward_batch.global_num_tokens_gpu[dp_rank]
+
+        forward_batch.dp_local_start_pos = local_start_pos
+        forward_batch.dp_local_num_tokens = local_num_tokens
+
+    return forward_batch.dp_local_start_pos, forward_batch.dp_local_num_tokens
+
+
+@triton.jit
+def memcpy_triton_kernel(
+    dst_ptr,
+    src_ptr,
+    offset_ptr,
+    sz_ptr,
+    offset_src: tl.constexpr,
+    chunk_size,  # multiplied for offset and sz
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(axis=0).to(tl.int64)
+    offset = tl.load(offset_ptr).to(tl.int64) * chunk_size
+    sz = tl.load(sz_ptr).to(tl.int64) * chunk_size
+
+    start_index = pid * BLOCK_SIZE
+    offs = tl.arange(0, BLOCK_SIZE)
+    mask = start_index + offs < sz
+
+    if offset_src:
+        data = tl.load(src_ptr + offset + start_index + offs, mask=mask)
+        tl.store(dst_ptr + start_index + offs, data, mask=mask)
+    else:
+        data = tl.load(src_ptr + start_index + offs, mask=mask)
+        tl.store(dst_ptr + offset + start_index + offs, data, mask=mask)
+
+
+def prod(x):
+    return functools.reduce(lambda a, b: a * b, x, 1)
+
+
+def memcpy_triton(dst, src, dim, offset, sz, offset_src):
+    max_size = min(src.numel(), dst.numel())
+    assert dim == 0, "dim != 0 unsupported"
+    assert src.shape[1:] == dst.shape[1:], "src and dst must have same shape"
+    chunk_size = prod(src.shape[1:])
+    BLOCK_SIZE = 8192
+    grid = (triton.cdiv(max_size, BLOCK_SIZE),)
+
+    memcpy_triton_kernel[grid](dst, src, offset, sz, offset_src, chunk_size, BLOCK_SIZE)
+
+
+def _dp_gather_via_all_reduce(
+    global_tokens: torch.Tensor,
+    local_tokens: torch.Tensor,
+    forward_batch: ForwardBatch,
+    is_partial: bool,
+):
+    local_start_pos, local_num_tokens = get_dp_local_info(forward_batch)
+
+    global_tokens.fill_(0)
+    assert local_tokens.is_contiguous()
+    assert global_tokens.is_contiguous()
+
+    if local_tokens.shape[0] > 0 and (is_partial or get_attention_tp_rank() == 0):
+        assert (
+            local_tokens.untyped_storage() is not global_tokens.untyped_storage()
+        ), "aliasing between global_tokens and local_tokens not allowed"
+
+        memcpy_triton(
+            global_tokens, local_tokens, 0, local_start_pos, local_num_tokens, False
+        )
+
+    # Input IDs are in int 32. We should use inplace_all_reduce for local case because of custom all reduce.
+    NUM_GPUS_PER_NODE = 8
+    if (
+        not local_tokens.dtype.is_floating_point
+        and get_tensor_model_parallel_world_size() <= NUM_GPUS_PER_NODE
+    ):
+        from sglang.srt.distributed.parallel_state import inplace_all_reduce
+
+        inplace_all_reduce(global_tokens, group_name=get_tp_group().unique_name)
+
+    else:
+        global_tokens[:] = tensor_model_parallel_all_reduce(global_tokens)
+
+
+def _dp_gather_via_all_gather(
+    global_tokens: torch.Tensor,
+    local_tokens: torch.Tensor,
+    forward_batch: ForwardBatch,
+    is_partial: bool,
+):
+    if get_attention_tp_size() == 1:
+        get_tp_group().all_gather_into_tensor(global_tokens, local_tokens)
+        return
+
+    if not is_partial:
+        if get_attention_tp_rank() != 0:
+            local_tokens.fill_(0)
+    scattered_local_tokens = local_tokens.tensor_split(get_attention_tp_size())[
+        get_attention_tp_rank()
+    ]
+    get_attention_tp_group().reduce_scatter_tensor(scattered_local_tokens, local_tokens)
+    get_tp_group().all_gather_into_tensor(global_tokens, scattered_local_tokens)
+
+
+def _dp_gather(
+    global_tokens: torch.Tensor,
+    local_tokens: torch.Tensor,
+    forward_batch: ForwardBatch,
+    is_partial: bool,
+):
+    if forward_batch.dp_padding_mode.is_max_len():
+        _dp_gather_via_all_gather(
+            global_tokens, local_tokens, forward_batch, is_partial
+        )
+    else:
+        _dp_gather_via_all_reduce(
+            global_tokens, local_tokens, forward_batch, is_partial
+        )
+
+
+def dp_gather_partial(
+    global_tokens: torch.Tensor,
+    local_tokens: torch.Tensor,
+    forward_batch: ForwardBatch,
+):
+    _dp_gather(global_tokens, local_tokens, forward_batch, is_partial=True)
+
+
+def dp_gather_replicate(
+    global_tokens: torch.Tensor,
+    local_tokens: torch.Tensor,
+    forward_batch: ForwardBatch,
+):
+    _dp_gather(global_tokens, local_tokens, forward_batch, is_partial=False)
+
+
+def dp_scatter(
+    local_tokens: torch.Tensor,  # output
+    global_tokens: torch.Tensor,  # input
+    forward_batch: ForwardBatch,
+):
+    # local_num_tokens is not necessarily the same as local_tokens.shape[0],
+    # since local_tokens may be padded for cuda graph
+    local_start_pos, local_num_tokens = get_dp_local_info(forward_batch)
+
+    local_tokens.fill_(0)
+    assert local_tokens.is_contiguous()
+    assert global_tokens.is_contiguous()
+    if local_tokens.shape[0] > 0:
+        assert (
+            local_tokens.untyped_storage() is not global_tokens.untyped_storage()
+        ), "aliasing between local_tokens and global_tokens not allowed"
+
+        memcpy_triton(
+            local_tokens, global_tokens, 0, local_start_pos, local_num_tokens, True
+        )
+
+
+def dp_reduce_scatter_tensor(output: torch.Tensor, input: torch.Tensor):
+    if get_tensor_model_parallel_world_size() == get_attention_dp_size():
+        get_tp_group().reduce_scatter_tensor(output, input)
+    else:
+        scattered_local_tokens = input.tensor_split(
+            get_tensor_model_parallel_world_size()
+        )[get_tensor_model_parallel_rank()]
+        get_tp_group().reduce_scatter_tensor(scattered_local_tokens, input)
+        get_attention_tp_group().all_gather_into_tensor(output, scattered_local_tokens)
+
+
+def attn_tp_reduce_scatter_tensor(output: torch.Tensor, input: torch.Tensor):
+    return get_attention_tp_group().reduce_scatter_tensor(output, input)
+
+
+def attn_cp_reduce_scatter_tensor(output: torch.Tensor, input: torch.Tensor):
+    return get_attention_cp_group().reduce_scatter_tensor(output, input)
+
+
+def attn_tp_all_reduce(input: torch.Tensor):
+    return get_attention_tp_group().all_reduce(input)
+
+
+def attn_tp_all_gather_into_tensor(output: torch.Tensor, input: torch.Tensor):
+    return get_attention_tp_group().all_gather_into_tensor(output, input)
+
+
+def attn_cp_all_gather_into_tensor(output: torch.Tensor, input: torch.Tensor):
+    return get_attention_cp_group().all_gather_into_tensor(output, input)
+
+
+def attn_tp_all_gather(output_list: List[torch.Tensor], input: torch.Tensor):
+    return get_attention_tp_group().all_gather(input, output_tensor_list=output_list)
diff --git a/sglang/python/sglang/srt/layers/elementwise.py b/sglang/python/sglang/srt/layers/elementwise.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8f2f7e48d5003ce85f40f9483446be6cef634f8
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/elementwise.py
@@ -0,0 +1,591 @@
+from typing import Optional, Tuple
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.utils import is_hip
+from sglang.srt.utils.custom_op import register_custom_op
+
+_is_hip = is_hip()
+
+
+fused_softcap_autotune = triton.autotune(
+    configs=[
+        triton.Config(kwargs={"BLOCK_SIZE": 128}, num_warps=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 128}, num_warps=8),
+        triton.Config(kwargs={"BLOCK_SIZE": 128}, num_warps=16),
+        triton.Config(kwargs={"BLOCK_SIZE": 256}, num_warps=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 256}, num_warps=8),
+        triton.Config(kwargs={"BLOCK_SIZE": 512}, num_warps=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 512}, num_warps=8),
+        triton.Config(kwargs={"BLOCK_SIZE": 512}, num_warps=16),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=8),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=16),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=32),
+        triton.Config(kwargs={"BLOCK_SIZE": 2048}, num_warps=32),
+        triton.Config(kwargs={"BLOCK_SIZE": 4096}, num_warps=32),
+        triton.Config(kwargs={"BLOCK_SIZE": 8192}, num_warps=32),
+        triton.Config(kwargs={"BLOCK_SIZE": 16384}, num_warps=32),
+        triton.Config(kwargs={"BLOCK_SIZE": 32768}, num_warps=32),
+    ],
+    key=["n_ele"],
+)
+
+
+@triton.jit
+def fused_softcap_kernel(
+    output_ptr,
+    input_ptr,
+    n_ele,
+    softcap_const: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+    block_start = pid * BLOCK_SIZE
+    offsets = block_start + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < n_ele
+    x = tl.load(input_ptr + offsets, mask=mask)
+    fx = x.to(tl.float32)
+    fxs = fx / softcap_const
+    exped = tl.exp(2 * fxs)
+    top = exped - 1
+    bottom = exped + 1
+    output = top / bottom * softcap_const
+    tl.store(output_ptr + offsets, output, mask=mask)
+
+
+fused_softcap_kernel_autotuned = fused_softcap_autotune(fused_softcap_kernel)
+
+
+def fused_softcap(x, softcap_const, autotune=False):
+    output = torch.empty_like(x, dtype=torch.float32)
+    n_elements = output.numel()
+    if autotune:
+        grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)
+        fused_softcap_kernel_autotuned[grid](output, x, n_elements, softcap_const)
+    else:
+        fused_softcap_kernel[(triton.cdiv(n_elements, 128),)](
+            output, x, n_elements, softcap_const, BLOCK_SIZE=128, num_warps=8
+        )
+    return output
+
+
+# cast to float + softcap
+class Softcap:
+    def __init__(self, softcap_const: float):
+        self.softcap_const = softcap_const
+
+    def __call__(self, *args, **kwargs):
+        return self.forward(*args, **kwargs)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if x.is_cuda:
+            return self.forward_cuda(x)
+        else:
+            return self.forward_native(x)
+
+    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
+        return torch.tanh(x.float() / self.softcap_const) * self.softcap_const
+
+    def forward_cuda(self, x: torch.Tensor, autotune=False) -> torch.Tensor:
+        return fused_softcap(x, self.softcap_const, autotune=autotune)
+
+
+rmsnorm_autotune = triton.autotune(
+    configs=[
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=4, num_stages=1),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=8, num_stages=1),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=16, num_stages=1),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=8),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=16),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=4, num_stages=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=8, num_stages=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=16, num_stages=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=8, num_stages=8),
+        triton.Config(kwargs={"BLOCK_SIZE": 1024}, num_warps=16, num_stages=8),
+        triton.Config(kwargs={"BLOCK_SIZE": 2048}, num_warps=8),
+        triton.Config(kwargs={"BLOCK_SIZE": 2048}, num_warps=16),
+        triton.Config(kwargs={"BLOCK_SIZE": 2048}, num_warps=8, num_stages=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 2048}, num_warps=16, num_stages=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 4096}, num_warps=8),
+        triton.Config(kwargs={"BLOCK_SIZE": 4096}, num_warps=16),
+        triton.Config(kwargs={"BLOCK_SIZE": 8192}, num_warps=8),
+        triton.Config(kwargs={"BLOCK_SIZE": 8192}, num_warps=16),
+        triton.Config(kwargs={"BLOCK_SIZE": 8192}, num_warps=32),
+        triton.Config(kwargs={"BLOCK_SIZE": 8192}, num_warps=8, num_stages=1),
+        triton.Config(kwargs={"BLOCK_SIZE": 8192}, num_warps=16, num_stages=1),
+        triton.Config(kwargs={"BLOCK_SIZE": 8192}, num_warps=32, num_stages=1),
+        triton.Config(kwargs={"BLOCK_SIZE": 8192}, num_warps=8, num_stages=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 8192}, num_warps=16, num_stages=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 8192}, num_warps=32, num_stages=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 16384}, num_warps=8),
+        triton.Config(kwargs={"BLOCK_SIZE": 16384}, num_warps=16),
+        triton.Config(kwargs={"BLOCK_SIZE": 16384}, num_warps=32),
+        triton.Config(kwargs={"BLOCK_SIZE": 16384}, num_warps=8, num_stages=1),
+        triton.Config(kwargs={"BLOCK_SIZE": 16384}, num_warps=16, num_stages=1),
+        triton.Config(kwargs={"BLOCK_SIZE": 16384}, num_warps=32, num_stages=1),
+        triton.Config(kwargs={"BLOCK_SIZE": 16384}, num_warps=8, num_stages=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 16384}, num_warps=16, num_stages=4),
+        triton.Config(kwargs={"BLOCK_SIZE": 16384}, num_warps=32, num_stages=4),
+    ],
+    key=["hidden_dim"],
+)
+
+
+@triton.jit
+def fused_dual_residual_rmsnorm_kernel(
+    output_ptr,
+    mid_ptr,
+    activ_ptr,
+    residual_ptr,
+    weight1_ptr,
+    weight2_ptr,
+    eps: tl.constexpr,
+    hidden_dim: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+    input_start = pid * hidden_dim
+
+    offsets = tl.arange(0, BLOCK_SIZE)
+    mask = offsets < hidden_dim
+
+    a_ = tl.load(activ_ptr + input_start + offsets, mask=mask, other=0.0)
+    a = a_.to(tl.float32)
+    rms = tl.sqrt(tl.sum(a * a, axis=0) / hidden_dim + eps)
+
+    r = tl.load(residual_ptr + input_start + offsets, mask=mask, other=0.0)
+    w1_ = tl.load(weight1_ptr + offsets, mask=mask, other=0.0)
+    w1 = w1_.to(tl.float32)
+
+    a2r = r + (a / rms * w1).to(r.dtype)
+    tl.store(
+        mid_ptr + input_start + offsets,
+        a2r,
+        mask=mask,
+    )
+
+    a2r = a2r.to(tl.float32)
+    rms2 = tl.sqrt(tl.sum(a2r * a2r, axis=0) / hidden_dim + eps)
+
+    w2_ = tl.load(weight2_ptr + offsets, mask=mask, other=0.0)
+    w2 = w2_.to(tl.float32)
+
+    tl.store(
+        output_ptr + input_start + offsets,
+        a2r / rms2 * w2,  # implicitly casts to output dtype here
+        mask=mask,
+    )
+
+
+fused_dual_residual_rmsnorm_kernel_autotune = rmsnorm_autotune(
+    fused_dual_residual_rmsnorm_kernel
+)
+
+
+def fused_dual_residual_rmsnorm(x, residual, weight1, weight2, eps, autotune=False):
+    assert len(x.shape) == 2
+    assert (
+        x.shape == residual.shape and x.dtype == residual.dtype
+    ), f"{x.shape=} {residual.shape=} {x.dtype=} {residual.dtype=}"
+    output, mid = torch.empty_like(x), torch.empty_like(x)
+    bs, hidden_dim = x.shape
+    if autotune:
+        fused_dual_residual_rmsnorm_kernel_autotune[(bs,)](
+            output, mid, x, residual, weight1, weight2, eps=eps, hidden_dim=hidden_dim
+        )
+    else:
+        max_warps = 16 if _is_hip else 32
+        config = {
+            "BLOCK_SIZE": triton.next_power_of_2(hidden_dim),
+            "num_warps": max(
+                min(triton.next_power_of_2(triton.cdiv(hidden_dim, 256)), max_warps), 4
+            ),
+        }
+
+        fused_dual_residual_rmsnorm_kernel[(bs,)](
+            output,
+            mid,
+            x,
+            residual,
+            weight1,
+            weight2,
+            eps=eps,
+            hidden_dim=hidden_dim,
+            **config,
+        )
+
+    return output, mid
+
+
+@triton.jit
+def fused_rmsnorm_kernel(
+    output_ptr,
+    activ_ptr,
+    weight_ptr,
+    eps: tl.constexpr,
+    hidden_dim: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(axis=0).to(tl.int64)
+    input_start = pid * hidden_dim
+
+    offsets = tl.arange(0, BLOCK_SIZE)
+    mask = offsets < hidden_dim
+
+    a_ = tl.load(activ_ptr + input_start + offsets, mask=mask, other=0.0)
+    a = a_.to(tl.float32)
+    rms = tl.sqrt(tl.sum(a * a, axis=0) / hidden_dim + eps)
+
+    w1_ = tl.load(weight_ptr + offsets, mask=mask, other=0.0)
+    w1 = w1_.to(tl.float32)
+
+    a_rms = a / rms * w1
+
+    tl.store(
+        output_ptr + input_start + offsets,
+        a_rms,  # implicitly casts to output dtype here
+        mask=mask,
+    )
+
+
+def fused_rmsnorm(x, weight, eps, autotune=False, inplace=False):
+    assert len(x.shape) == 2
+    if inplace:
+        output = x
+    else:
+        output = torch.empty_like(x)
+    bs, hidden_dim = x.shape
+    max_warps = 16 if _is_hip else 32
+    config = {
+        "BLOCK_SIZE": triton.next_power_of_2(hidden_dim),
+        "num_warps": max(
+            min(triton.next_power_of_2(triton.cdiv(hidden_dim, 256)), max_warps), 4
+        ),
+    }
+
+    fused_rmsnorm_kernel[(bs,)](
+        output, x, weight, eps=eps, hidden_dim=hidden_dim, **config
+    )
+    return output
+
+
+class FusedDualResidualRMSNorm:
+    """
+    Fused implementation of
+    y = RMSNorm2(RMSNorm1(x) + residual))
+    """
+
+    def __init__(self, rmsnorm1, rmsnorm2) -> None:  # the one after rmsnorm1
+        self.rmsnorm1 = rmsnorm1
+        self.rmsnorm2 = rmsnorm2
+        self.variance_epsilon = self.rmsnorm1.variance_epsilon
+        assert self.rmsnorm1.variance_epsilon == self.rmsnorm2.variance_epsilon
+        assert self.rmsnorm1.weight.shape == self.rmsnorm2.weight.shape
+
+    def __call__(self, *args, **kwargs):
+        return self.forward(*args, **kwargs)
+
+    def forward(
+        self, x: torch.Tensor, residual: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if x.is_cuda:
+            return self.forward_cuda(x, residual)
+        else:
+            return self.forward_flashinfer(x, residual)
+
+    def forward_cuda(
+        self, x: torch.Tensor, residual: torch.Tensor, autotune=False
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        return fused_dual_residual_rmsnorm(
+            x,
+            residual,
+            self.rmsnorm1.weight,
+            self.rmsnorm2.weight,
+            self.variance_epsilon,
+            autotune=autotune,
+        )
+
+    def forward_flashinfer(
+        self,
+        x: torch.Tensor,
+        residual: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        normed1 = self.rmsnorm1(x)
+        residual = normed1 + residual
+        return self.rmsnorm2(residual), residual
+
+    def forward_native(
+        self,
+        x: torch.Tensor,
+        residual: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        normed1 = self.rmsnorm1.forward_native(x)
+        residual = normed1 + residual
+        return self.rmsnorm2.forward_native(residual), residual
+
+
+@triton.jit
+def experts_combine_kernel(
+    out_hidden_states,
+    moe_hidden_states,
+    mlp_hidden_states,
+    combine_k: tl.constexpr,
+    hidden_dim: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    start_index_mlp = pid * hidden_dim
+    start_index_rmoe = pid * hidden_dim * combine_k
+    offsets = tl.arange(0, BLOCK_SIZE)
+    mask = offsets < hidden_dim
+    combine_k_offsets = tl.arange(0, combine_k)
+
+    moe_x = tl.load(
+        moe_hidden_states
+        + start_index_rmoe
+        + combine_k_offsets[:, None] * hidden_dim
+        + offsets[None, :],
+        mask=mask[None, :],
+        other=0.0,
+    )
+    moe_x = tl.sum(moe_x, axis=0)
+    mlp_x = tl.load(mlp_hidden_states + start_index_mlp + offsets, mask=mask, other=0.0)
+    combined_x = (moe_x + mlp_x) / 1.4142135623730951
+
+    tl.store(out_hidden_states + start_index_mlp + offsets, combined_x, mask=mask)
+
+
+@register_custom_op(out_shape="mlp_hidden_states")
+def experts_combine_triton(
+    moe_hidden_states: torch.Tensor,
+    mlp_hidden_states: torch.Tensor,
+    output_buffer: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    assert moe_hidden_states.is_contiguous()
+    assert mlp_hidden_states.is_contiguous()
+
+    if len(moe_hidden_states.shape) == 2:
+        combine_k = 1  # pre-combined
+    else:
+        combine_k = moe_hidden_states.shape[1]
+
+    if output_buffer is None:
+        out_hidden_states = torch.empty_like(mlp_hidden_states)
+    else:
+        flat_output_buffer = output_buffer.view(mlp_hidden_states.dtype).reshape(-1)
+        assert flat_output_buffer.numel() >= mlp_hidden_states.numel()
+        out_hidden_states = flat_output_buffer[: mlp_hidden_states.numel()].reshape(
+            mlp_hidden_states.shape
+        )
+
+    bs, hidden_dim = mlp_hidden_states.shape
+
+    config = {
+        "BLOCK_SIZE": triton.next_power_of_2(hidden_dim),
+        "num_warps": max(
+            min(triton.next_power_of_2(triton.cdiv(hidden_dim, 1024)), 8), 4
+        ),
+    }
+
+    experts_combine_kernel[(bs,)](
+        out_hidden_states,
+        moe_hidden_states,
+        mlp_hidden_states,
+        combine_k,
+        hidden_dim,
+        **config,
+    )
+
+    return out_hidden_states
+
+
+# gelu on first half of vector
+@triton.jit
+def gelu_and_mul_kernel(
+    out_hidden_states_ptr,  # (bs, hidden_dim)
+    out_scales_ptr,  # (bs,)
+    hidden_states_ptr,  # (bs, hidden_dim * 2)
+    quant_max: tl.constexpr,
+    static_scale: tl.constexpr,
+    hidden_dim: tl.constexpr,  # the output hidden_dim
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+
+    input_start = pid * hidden_dim * 2
+    output_start = pid * hidden_dim
+
+    input1_offs = tl.arange(0, BLOCK_SIZE)
+    mask = tl.arange(0, BLOCK_SIZE) < hidden_dim  # shared for input1, input3, output
+    input3_offs = hidden_dim + tl.arange(0, BLOCK_SIZE)
+    output_offs = tl.arange(0, BLOCK_SIZE)
+
+    x1 = tl.load(
+        hidden_states_ptr + input_start + input1_offs, mask=mask, other=0.0
+    ).to(tl.float32)
+    x3 = tl.load(
+        hidden_states_ptr + input_start + input3_offs, mask=mask, other=0.0
+    ).to(tl.float32)
+
+    # gelu
+    # cast down before mul to better match training?
+    gelu_x1 = 0.5 * (1.0 + tl.erf(x1 * 0.7071067811865475)) * x1
+    out = x3 * gelu_x1.to(hidden_states_ptr.dtype.element_ty)
+
+    if quant_max is not None:
+        raise NotImplementedError()
+
+    tl.store(out_hidden_states_ptr + output_start + output_offs, out, mask=mask)
+
+
+def gelu_and_mul_triton(
+    hidden_states,
+    scales=None,
+    quantize=None,  # dtype to quantize to
+    out=None,
+):
+    bs, in_hidden_dim = hidden_states.shape
+    hidden_dim = in_hidden_dim // 2
+
+    if out is None:
+        out_hidden_states = torch.empty(
+            (bs, hidden_dim),
+            dtype=quantize or hidden_states.dtype,
+            device=hidden_states.device,
+        )
+    else:
+        assert out.shape == (bs, hidden_dim)
+        assert out.dtype == (quantize or hidden_states.dtype)
+        out_hidden_states = out
+    out_scales = None
+    static_scale = False
+    if quantize is not None:
+        if scales is None:
+            out_scales = torch.empty(
+                (bs,), dtype=torch.float32, device=hidden_states.device
+            )
+        else:
+            out_scales = scales
+            static_scale = True
+
+    max_warps = 16 if _is_hip else 32
+    config = {
+        # 8 ele per thread (not tuned)
+        "num_warps": max(
+            min(triton.next_power_of_2(triton.cdiv(hidden_dim, 8 * 32)), max_warps), 4
+        ),
+    }
+
+    gelu_and_mul_kernel[(bs,)](
+        out_hidden_states,
+        out_scales,
+        hidden_states,
+        quant_max=torch.finfo(quantize).max if quantize is not None else None,
+        static_scale=static_scale,
+        hidden_dim=hidden_dim,
+        BLOCK_SIZE=triton.next_power_of_2(hidden_dim),
+        **config,
+    )
+
+    if quantize is not None:
+        return out_hidden_states, out_scales
+    else:
+        return out_hidden_states, None
+
+
+# silu on first half of vector
+@triton.jit
+def silu_and_mul_kernel(
+    out_hidden_states_ptr,  # (bs, hidden_dim)
+    out_scales_ptr,  # (bs,)
+    hidden_states_ptr,  # (bs, hidden_dim * 2)
+    quant_max: tl.constexpr,
+    static_scale: tl.constexpr,
+    hidden_dim: tl.constexpr,  # the output hidden_dim
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+
+    input_start = pid * hidden_dim * 2
+    output_start = pid * hidden_dim
+
+    input1_offs = tl.arange(0, BLOCK_SIZE)
+    mask = tl.arange(0, BLOCK_SIZE) < hidden_dim  # shared for input1, input3, output
+    input3_offs = hidden_dim + tl.arange(0, BLOCK_SIZE)
+    output_offs = tl.arange(0, BLOCK_SIZE)
+
+    x1 = tl.load(
+        hidden_states_ptr + input_start + input1_offs, mask=mask, other=0.0
+    ).to(tl.float32)
+    x3 = tl.load(
+        hidden_states_ptr + input_start + input3_offs, mask=mask, other=0.0
+    ).to(tl.float32)
+
+    # silu
+    # cast down before mul to better match training?
+    silu_x1 = x1 * tl.sigmoid(x1)
+    out = x3 * silu_x1.to(hidden_states_ptr.dtype.element_ty)
+
+    if quant_max is not None:
+        raise NotImplementedError()
+
+    tl.store(out_hidden_states_ptr + output_start + output_offs, out, mask=mask)
+
+
+def silu_and_mul_triton(
+    hidden_states,
+    scales=None,
+    quantize=None,  # dtype to quantize to
+    out=None,
+):
+    bs, in_hidden_dim = hidden_states.shape
+    hidden_dim = in_hidden_dim // 2
+
+    if out is None:
+        out_hidden_states = torch.empty(
+            (bs, hidden_dim),
+            dtype=quantize or hidden_states.dtype,
+            device=hidden_states.device,
+        )
+    else:
+        assert out.shape == (bs, hidden_dim)
+        assert out.dtype == (quantize or hidden_states.dtype)
+        out_hidden_states = out
+    out_scales = None
+    static_scale = False
+    if quantize is not None:
+        if scales is None:
+            out_scales = torch.empty(
+                (bs,), dtype=torch.float32, device=hidden_states.device
+            )
+        else:
+            out_scales = scales
+            static_scale = True
+
+    max_warps = 16 if _is_hip else 32
+    config = {
+        # 8 ele per thread (not tuned)
+        "num_warps": max(
+            min(triton.next_power_of_2(triton.cdiv(hidden_dim, 8 * 32)), max_warps), 4
+        ),
+    }
+
+    silu_and_mul_kernel[(bs,)](
+        out_hidden_states,
+        out_scales,
+        hidden_states,
+        quant_max=torch.finfo(quantize).max if quantize is not None else None,
+        static_scale=static_scale,
+        hidden_dim=hidden_dim,
+        BLOCK_SIZE=triton.next_power_of_2(hidden_dim),
+        **config,
+    )
+
+    if quantize is not None:
+        return out_hidden_states, out_scales
+    else:
+        return out_hidden_states, None
diff --git a/sglang/python/sglang/srt/layers/flashinfer_comm_fusion.py b/sglang/python/sglang/srt/layers/flashinfer_comm_fusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d1f919008841b2377e25830cb8f57e94982af4e
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/flashinfer_comm_fusion.py
@@ -0,0 +1,274 @@
+import logging
+from typing import Optional, Tuple
+
+import torch
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.utils import is_flashinfer_available
+from sglang.srt.utils.custom_op import register_custom_op
+
+logger = logging.getLogger(__name__)
+
+_flashinfer_comm = None
+_workspace_manager = None
+
+if is_flashinfer_available():
+    try:
+        import flashinfer.comm as comm
+
+        if hasattr(comm, "allreduce_fusion") and hasattr(
+            comm, "create_allreduce_fusion_workspace"
+        ):
+            _flashinfer_comm = comm
+        else:
+            logger.warning(
+                "flashinfer.comm unified allreduce_fusion API is not available, "
+                "falling back to standard implementation"
+            )
+    except ImportError:
+        logger.warning(
+            "flashinfer.comm is not available, falling back to standard "
+            "implementation"
+        )
+
+
+class FlashInferWorkspaceManager:
+    def __init__(self):
+        self.workspace = None
+        self.world_size = None
+        self.rank = None
+        self.max_token_num = None
+        self.hidden_dim = None
+        self.dtype = None
+        self.initialized = False
+
+    def initialize(
+        self,
+        world_size: int,
+        rank: int,
+        max_token_num: int,
+        hidden_dim: int,
+        dtype: torch.dtype,
+        use_oneshot: Optional[bool] = None,
+    ):
+        """Initialize workspace"""
+        if _flashinfer_comm is None:
+            logger.warning(
+                "FlashInfer comm not available, skipping workspace " "initialization"
+            )
+            return
+
+        self.cleanup()
+        try:
+            self.workspace = _flashinfer_comm.create_allreduce_fusion_workspace(
+                backend="trtllm",
+                world_size=world_size,
+                rank=rank,
+                max_token_num=max_token_num,
+                hidden_dim=hidden_dim,
+                dtype=dtype,
+                force_oneshot_support=bool(use_oneshot),
+            )
+        except Exception as e:
+            logger.warning(f"Failed to initialize FlashInfer workspace: {e}")
+            self.workspace = None
+            self.initialized = False
+            return
+
+        self.world_size = world_size
+        self.rank = rank
+        self.max_token_num = max_token_num
+        self.hidden_dim = hidden_dim
+        self.dtype = dtype
+        self.initialized = True
+
+        backend = getattr(self.workspace, "backend", "unknown")
+        logger.info(
+            f"FlashInfer workspace initialized for rank {rank}, "
+            f"world_size {world_size}, backend {backend}"
+        )
+
+    def is_buffer_size_sufficient(
+        self,
+        token_num: int,
+        hidden_dim: int,
+        dtype: torch.dtype,
+        use_oneshot: Optional[bool] = None,
+    ) -> bool:
+        if not self.initialized or self.workspace is None:
+            return False
+        try:
+            return self.workspace.is_buffer_size_sufficient(
+                tp_size=self.world_size,
+                num_tokens=token_num,
+                hidden_dim=hidden_dim,
+                dtype=dtype,
+                use_oneshot=use_oneshot,
+            )
+        except Exception as e:
+            logger.debug(f"FlashInfer workspace size check failed: {e}")
+            return False
+
+    def cleanup(self):
+        """Clean up workspace"""
+        if self.workspace is not None:
+            try:
+                self.workspace.destroy()
+            except Exception as e:
+                logger.warning(f"Failed to cleanup FlashInfer workspace: {e}")
+            finally:
+                self.workspace = None
+                self.initialized = False
+                self.world_size = None
+                self.rank = None
+                self.max_token_num = None
+                self.hidden_dim = None
+                self.dtype = None
+
+
+_workspace_manager = FlashInferWorkspaceManager()
+
+
+def ensure_workspace_initialized(
+    max_token_num: int = 2048,
+    hidden_dim: int = 4096,
+    dtype: torch.dtype = torch.float16,
+    token_num: Optional[int] = None,
+    use_oneshot: Optional[bool] = None,
+):
+    """Ensure workspace is initialized"""
+    if not is_flashinfer_available() or _flashinfer_comm is None:
+        return False
+
+    world_size = get_tensor_model_parallel_world_size()
+    if world_size <= 1:
+        return False
+
+    rank = get_tensor_model_parallel_rank()
+    token_num = token_num or max_token_num
+
+    if (
+        not _workspace_manager.initialized
+        or _workspace_manager.world_size != world_size
+        or _workspace_manager.rank != rank
+        or not _workspace_manager.is_buffer_size_sufficient(
+            token_num=token_num,
+            hidden_dim=hidden_dim,
+            dtype=dtype,
+            use_oneshot=use_oneshot,
+        )
+    ):
+        _workspace_manager.initialize(
+            world_size=world_size,
+            rank=rank,
+            max_token_num=max_token_num,
+            hidden_dim=hidden_dim,
+            dtype=dtype,
+            use_oneshot=use_oneshot,
+        )
+
+    return _workspace_manager.initialized
+
+
+def fake_flashinfer_allreduce_residual_rmsnorm(
+    input_tensor: torch.Tensor,
+    residual: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float = 1e-6,
+    max_token_num: int = 16384,
+    use_oneshot: Optional[bool] = None,
+    trigger_completion_at_end: bool = False,
+    fp32_acc: bool = False,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    residual_out = torch.empty_like(residual)
+    norm_out = torch.empty_like(input_tensor)
+    return norm_out, residual_out
+
+
+@register_custom_op(
+    mutates_args=["input_tensor", "residual", "weight"],
+    fake_impl=fake_flashinfer_allreduce_residual_rmsnorm,
+)
+def flashinfer_allreduce_residual_rmsnorm(
+    input_tensor: torch.Tensor,
+    residual: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float = 1e-6,
+    max_token_num: int = 2048,
+    use_oneshot: Optional[bool] = None,
+    trigger_completion_at_end: bool = False,
+    fp32_acc: bool = False,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Use FlashInfer's fused allreduce + residual + RMS norm operation
+
+    Args:
+        input_tensor: Input tensor that needs allreduce
+        residual: Residual tensor
+        weight: RMS norm weight
+        eps: RMS norm epsilon
+        max_token_num: Maximum token number
+        use_oneshot: Whether to use oneshot mode
+        trigger_completion_at_end: Whether to trigger completion at end
+        fp32_acc: Whether to use fp32 precision
+
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: (norm_output, residual_output)
+    """
+    if not is_flashinfer_available() or _flashinfer_comm is None:
+        logger.debug(
+            "FlashInfer not available, falling back to standard " "implementation"
+        )
+        return None, None
+
+    world_size = get_tensor_model_parallel_world_size()
+    if world_size <= 1:
+        logger.debug("Single GPU, no need for allreduce fusion")
+        return None, None
+
+    assert input_tensor.shape[0] <= max_token_num
+    if (
+        not input_tensor.is_contiguous()
+        or not residual.is_contiguous()
+        or not weight.is_contiguous()
+    ):
+        logger.debug("Non-contiguous tensors, skipping FlashInfer allreduce fusion")
+        return None, None
+
+    if not ensure_workspace_initialized(
+        max_token_num=max_token_num,
+        hidden_dim=input_tensor.shape[-1],
+        dtype=input_tensor.dtype,
+        token_num=input_tensor.shape[0],
+        use_oneshot=use_oneshot,
+    ):
+        logger.debug("FlashInfer workspace not available")
+        return None, None
+
+    residual_out = torch.empty_like(residual)
+    norm_out = torch.empty_like(input_tensor)
+
+    _flashinfer_comm.allreduce_fusion(
+        input=input_tensor,
+        workspace=_workspace_manager.workspace,
+        pattern=_flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNorm,
+        launch_with_pdl=True,
+        residual_out=residual_out,
+        norm_out=norm_out,
+        residual_in=residual,
+        rms_gamma=weight,
+        rms_eps=eps,
+        use_oneshot=use_oneshot,
+        fp32_acc=fp32_acc,
+    )
+
+    return norm_out, residual_out
+
+
+def cleanup_flashinfer_workspace():
+    global _workspace_manager
+    if _workspace_manager is not None:
+        _workspace_manager.cleanup()
diff --git a/sglang/python/sglang/srt/layers/int4fp8_utils.py b/sglang/python/sglang/srt/layers/int4fp8_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..92299b4c36fca04e991c4ae5b6bf9cbda0c96d4f
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/int4fp8_utils.py
@@ -0,0 +1,73 @@
+"""
+Common utilities for quark.
+"""
+
+import logging
+from typing import Tuple
+
+import torch
+
+logger = logging.getLogger(__name__)
+
+
+def quantize_fp8_scale_tensorwise(w: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    FP8_MAX = 448.0
+    scale = w.abs().amax().float() / FP8_MAX
+    scaled = (w / scale).clamp(-FP8_MAX, FP8_MAX).to(torch.float8_e4m3fn)
+    return scaled, scale
+
+
+def quantize_int4_scale_columnwise(
+    w: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    S4_MAX = 7
+    w_flat = w.reshape(-1, w.shape[-1]).float()
+    scale = w_flat.abs().amax(axis=-1) / S4_MAX
+    scaled = torch.round(w_flat / scale[:, None]).to(torch.int8).clamp(-S4_MAX, S4_MAX)
+    return scaled.reshape(w.shape), scale.reshape(w.shape[:-1])
+
+
+def pack_int4_to_int32(to_pack: torch.Tensor, reorder: bool = True) -> torch.Tensor:
+    if to_pack.ndim > 2:
+        raise ValueError(
+            "Pack: Only supports tensors with dimensions not greater than 2."
+        )
+
+    if reorder:
+        order_map = [0, 2, 4, 6, 1, 3, 5, 7]
+    else:
+        order_map = [0, 1, 2, 3, 4, 5, 6, 7]
+    pack_num = 8
+    if to_pack.ndim == 2:
+        packed = torch.zeros(
+            to_pack.shape[0],
+            to_pack.shape[1] // pack_num,
+            dtype=torch.int32,
+            device=to_pack.device,
+        )
+        new_c = to_pack.shape[1] // pack_num
+        for c in range(new_c):
+            for i in range(pack_num):
+                # Use -3 as an example, high_position is 11111111,cause bit_or generate errors, so we can't use int4 directly
+                packed_col = to_pack[:, c * pack_num + order_map[i]].to(torch.int32)
+                packed_col = packed_col & 0x0F
+                packed[:, c] = torch.bitwise_or(
+                    packed[:, c], torch.bitwise_left_shift(packed_col, i * 4)
+                )
+    elif to_pack.ndim == 0:
+        packed = to_pack.to(torch.int32)
+    else:
+        packed = torch.zeros(
+            to_pack.shape[0] // pack_num, dtype=torch.int32, device=to_pack.device
+        )
+        new_c = to_pack.shape[0] // pack_num
+        for c in range(new_c):
+            for i in range(pack_num):
+                # Use -3 as an example, high_position is 11111111,cause bit_or generate errors, so we can't use int4 directly
+                packed_col = to_pack[c * pack_num + order_map[i]]
+                packed_col = packed_col & 0x0F
+                packed[c] = torch.bitwise_or(
+                    packed[c], torch.bitwise_left_shift(packed_col, i * 4)
+                )
+
+    return packed.view(torch.uint32)
diff --git a/sglang/python/sglang/srt/layers/layernorm.py b/sglang/python/sglang/srt/layers/layernorm.py
new file mode 100644
index 0000000000000000000000000000000000000000..383f58399f5cd5b667551a610aa08599f988c58c
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/layernorm.py
@@ -0,0 +1,556 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Fused operators for normalization layers."""
+
+import logging
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from sglang.srt.batch_invariant_ops import (
+    is_batch_invariant_mode_enabled,
+    rms_norm_batch_invariant,
+)
+from sglang.srt.environ import envs
+from sglang.srt.layers.utils import MultiPlatformOp
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import (
+    cpu_has_amx_support,
+    get_bool_env_var,
+    is_cpu,
+    is_cuda,
+    is_flashinfer_available,
+    is_hip,
+    is_npu,
+    is_xpu,
+)
+
+_is_cuda = is_cuda()
+_is_flashinfer_available = is_flashinfer_available()
+_is_hip = is_hip()
+_is_npu = is_npu()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
+_is_xpu = is_xpu()
+_flashinfer_layernorm_available = False
+
+if _is_cuda or _is_xpu:
+    if _is_flashinfer_available:
+        try:
+            from flashinfer.norm import layernorm
+
+            _flashinfer_layernorm_available = True
+        except (ImportError, AttributeError):
+            _flashinfer_layernorm_available = False
+    else:
+        _flashinfer_layernorm_available = False
+
+    from sgl_kernel import (
+        fused_add_rmsnorm,
+        gemma_fused_add_rmsnorm,
+        gemma_rmsnorm,
+        rmsnorm,
+    )
+_has_vllm_rms_norm = False
+if _use_aiter:
+    from aiter import rmsnorm2d_fwd as rms_norm
+    from aiter import rmsnorm2d_fwd_with_add as fused_add_rms_norm
+
+    _has_vllm_rms_norm = True  # aiter provides the rms_norm functions
+elif _is_hip:
+    try:
+        from vllm._custom_ops import fused_add_rms_norm, rms_norm
+
+        _has_vllm_rms_norm = True
+    except ImportError:
+        # Fallback: vllm not available, will use forward_native
+        _has_vllm_rms_norm = False
+
+logger = logging.getLogger(__name__)
+
+if _is_npu:
+    import torch_npu
+
+
+class RMSNorm(MultiPlatformOp):
+    def __init__(
+        self,
+        hidden_size: int,
+        eps: float = 1e-6,
+        var_hidden_size: Optional[int] = None,
+        cast_x_before_out_mul: bool = False,
+        fp32_residual: bool = False,
+        has_weight: bool = True,
+        weight_dtype: Optional = None,
+        override_orig_dtype: Optional = None,
+    ) -> None:
+        super().__init__()
+        self.has_weight = has_weight
+        self.cast_x_before_out_mul = cast_x_before_out_mul
+        self.fp32_residual = fp32_residual
+        self.override_orig_dtype = override_orig_dtype
+        if self.has_weight:
+            self.weight = nn.Parameter(torch.ones(hidden_size, dtype=weight_dtype))
+        else:
+            self.weight = torch.ones(hidden_size, dtype=weight_dtype)
+        self.variance_epsilon = eps
+        self.hidden_size = hidden_size
+        self.variance_size_override = (
+            None if var_hidden_size == hidden_size else var_hidden_size
+        )
+        if _use_aiter:
+            self._forward_method = self.forward_aiter
+
+    def forward_cuda(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+        post_residual_addition: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        if x.numel() == 0:
+            return x
+        if self.variance_size_override is not None:
+            return self.forward_native(x, residual, post_residual_addition)
+        if is_batch_invariant_mode_enabled():
+            if (
+                residual is not None
+                or get_global_server_args().rl_on_policy_target == "fsdp"
+            ):
+                return self.forward_native(x, residual, post_residual_addition)
+            return rms_norm_batch_invariant(
+                x,
+                self.weight.data,
+                self.variance_epsilon,
+            )
+        if residual is not None:
+            # TODO: Ideally we want to have (hidden_states+residual)+post_residual_addition.
+            # but right now we can only have hidden_states+(residual+post_residual_addition).
+            # (hidden_states+residual)+post_residual_addition != hidden_states+(residual+post_residual_addition),
+            # we probably need to add another parameter to fused_add_rmsnorm
+            if post_residual_addition is not None:
+                residual = residual + post_residual_addition
+            fused_add_rmsnorm(x, residual, self.weight.data, self.variance_epsilon)
+            return x, residual
+        out = rmsnorm(x, self.weight.data, self.variance_epsilon)
+        return out
+
+    def forward_npu(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+        post_residual_addition: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        if residual is not None:
+            if post_residual_addition is not None:
+                residual = residual + post_residual_addition
+            out, _, residual_out = torch_npu.npu_add_rms_norm(
+                residual, x, self.weight.data, self.variance_epsilon
+            )
+            return out, residual_out
+        return torch_npu.npu_rms_norm(x, self.weight.data, self.variance_epsilon)[0]
+
+    def forward_aiter(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+        post_residual_addition: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        if residual is not None:
+            residual_out = torch.empty_like(x)
+            output = torch.empty_like(x)
+            if post_residual_addition is not None:
+                residual = residual + post_residual_addition
+            fused_add_rms_norm(
+                output,
+                x,
+                residual,
+                residual_out,
+                self.weight.data,
+                self.variance_epsilon,
+            )
+            return output, residual_out
+        return rms_norm(x, self.weight.data, self.variance_epsilon)
+
+    def forward_hip(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+        post_residual_addition: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        # Fallback to native implementation if vllm is not available
+        if not _has_vllm_rms_norm:
+            return self.forward_native(x, residual, post_residual_addition)
+
+        if not x.is_contiguous():
+            # NOTE: Remove this if aiter kernel supports discontinuous input
+            x = x.contiguous()
+        if residual is not None:
+            out = torch.empty_like(x)
+            residual_out = torch.empty_like(x)
+            if post_residual_addition is not None:
+                residual = residual + post_residual_addition
+            fused_add_rms_norm(
+                out, x, residual_out, residual, self.weight.data, self.variance_epsilon
+            )
+            return out, residual_out
+        out = torch.empty_like(x)
+        rms_norm(out, x, self.weight.data, self.variance_epsilon)
+        return out
+
+    def forward_native(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+        post_residual_addition: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        if not x.is_contiguous():
+            x = x.contiguous()
+        orig_dtype = self.override_orig_dtype or x.dtype
+        x = x.to(torch.float32)
+        if residual is not None:
+            x = x + residual.to(torch.float32)
+            if post_residual_addition is not None:
+                x = x + post_residual_addition.to(torch.float32)
+            if self.fp32_residual:
+                residual = x.clone()
+            else:
+                residual = x.to(orig_dtype)
+
+        hidden_size = x.shape[-1]
+        if hidden_size != self.hidden_size:
+            raise ValueError(
+                "Expected hidden_size to be "
+                f"{self.hidden_size}, but found: {hidden_size}"
+            )
+
+        if self.variance_size_override is None:
+            x_var = x
+        else:
+            if hidden_size < self.variance_size_override:
+                raise ValueError(
+                    "Expected hidden_size to be at least "
+                    f"{self.variance_size_override}, but found: {hidden_size}"
+                )
+
+            x_var = x[..., : self.variance_size_override]
+
+        variance = x_var.pow(2).mean(dim=-1, keepdim=True)
+        x = x * torch.rsqrt(variance + self.variance_epsilon)
+
+        if self.cast_x_before_out_mul:
+            x = self.weight * x.to(orig_dtype)
+        else:
+            x = (x * self.weight).to(orig_dtype)
+
+        if residual is None:
+            return x
+        else:
+            return x, residual
+
+    def forward_cpu(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+        post_residual_addition: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        if _is_cpu_amx_available:
+            if residual is not None:
+                if post_residual_addition is not None:
+                    residual = residual + post_residual_addition
+                torch.ops.sgl_kernel.fused_add_rmsnorm_cpu(
+                    x, residual, self.weight.data, self.variance_epsilon
+                )
+                return x, residual
+            return torch.ops.sgl_kernel.rmsnorm_cpu(
+                x, self.weight.data, self.variance_epsilon
+            )
+        else:
+            return self.forward_native(x, residual, post_residual_addition)
+
+    def forward_xpu(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+        post_residual_addition: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        if self.variance_size_override is not None:
+            return self.forward_native(x, residual, post_residual_addition)
+        if residual is not None:
+            if post_residual_addition is not None:
+                residual = residual + post_residual_addition
+            fused_add_rmsnorm(x, residual, self.weight.data, self.variance_epsilon)
+            return x, residual
+        out = rmsnorm(x, self.weight.data, self.variance_epsilon)
+        return out
+
+    def forward_with_allreduce_fusion(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+        post_residual_addition: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        """
+        Forward method with allreduce fusion, prioritizing flashinfer fused operations
+        """
+        if residual is not None:
+            from sglang.srt.distributed import (
+                get_tensor_model_parallel_world_size,
+                tensor_model_parallel_all_reduce,
+                tensor_model_parallel_fused_allreduce_rmsnorm,
+            )
+            from sglang.srt.layers.flashinfer_comm_fusion import (
+                flashinfer_allreduce_residual_rmsnorm,
+            )
+
+            if get_tensor_model_parallel_world_size() > 1:
+                if post_residual_addition is not None:
+                    residual = residual + post_residual_addition
+
+                # Prefer AITER fused AR+RMSNorm when enabled on AMD.
+                if _use_aiter:
+                    fused_result = tensor_model_parallel_fused_allreduce_rmsnorm(
+                        x, residual, self.weight, self.variance_epsilon
+                    )
+                    if fused_result is not None:
+                        return fused_result
+                else:
+                    fused_result = flashinfer_allreduce_residual_rmsnorm(
+                        input_tensor=x,
+                        residual=residual,
+                        weight=self.weight,
+                        eps=self.variance_epsilon,
+                    )
+                    if fused_result[0] is not None:
+                        return fused_result
+
+                # For AITER route, preserve correctness when fused path is unavailable.
+                if (
+                    _use_aiter
+                    and get_global_server_args().enable_aiter_allreduce_fusion
+                ):
+                    x = tensor_model_parallel_all_reduce(x)
+                    return self.forward(x, residual, None)
+
+        return self.forward(x, residual, post_residual_addition)
+
+
+class LayerNorm(MultiPlatformOp):
+    def __init__(
+        self,
+        hidden_size: int,
+        eps: float = 1e-6,
+        elementwise_affine: bool = True,
+        bias: bool = True,
+        dtype: torch.dtype = torch.float32,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.variance_epsilon = eps
+        self.elementwise_affine = elementwise_affine
+        self.use_bias = bias
+        self.dtype = dtype
+
+        self.bias = nn.Parameter(torch.zeros(hidden_size, dtype=self.dtype))
+        self.weight = nn.Parameter(torch.ones(hidden_size, dtype=self.dtype))
+
+    def forward_cuda(
+        self,
+        x: torch.Tensor,
+    ) -> torch.Tensor:
+        if (
+            _flashinfer_layernorm_available
+            and x.dtype == torch.bfloat16
+            and self.dtype == torch.float32
+        ):
+            return layernorm(x, self.weight, self.bias, self.variance_epsilon)
+        else:
+            return self.forward_native(x)
+
+    def forward_native(
+        self,
+        x: torch.Tensor,
+    ) -> torch.Tensor:
+        weight = self.weight if self.elementwise_affine else None
+        bias = self.bias if self.use_bias else None
+        orig_dtype = x.dtype
+        x = x.to(self.dtype)
+        return F.layer_norm(
+            x,
+            (self.hidden_size,),
+            weight=weight,
+            bias=bias,
+            eps=self.variance_epsilon,
+        ).to(orig_dtype)
+
+    def forward_hip(
+        self,
+        x: torch.Tensor,
+    ) -> torch.Tensor:
+        return self.forward_native(x)
+
+    def forward_npu(
+        self,
+        x: torch.Tensor,
+    ) -> torch.Tensor:
+        return self.forward_native(x)
+
+    def forward_cpu(
+        self,
+        x: torch.Tensor,
+    ) -> torch.Tensor:
+        if _is_cpu_amx_available:
+            return torch.ops.sgl_kernel.layernorm_cpu(
+                x, self.weight.data, self.variance_epsilon
+            )
+        else:
+            return self.forward_native(x)
+
+
+class GemmaRMSNorm(MultiPlatformOp):
+    def __init__(
+        self,
+        hidden_size: int,
+        eps: float = 1e-6,
+    ) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(torch.zeros(hidden_size))
+        self.variance_epsilon = eps
+
+        # Re-dispatch
+        if _is_hip:
+            self._forward_method = self.forward_native
+
+    def _forward_impl(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+        post_residual_addition: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        if residual is not None:
+            if post_residual_addition is not None:
+                residual = residual + post_residual_addition
+            gemma_fused_add_rmsnorm(
+                x, residual, self.weight.data, self.variance_epsilon
+            )
+            return x, residual
+        out = gemma_rmsnorm(x, self.weight.data, self.variance_epsilon)
+        return out
+
+    def forward_native(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+        post_residual_addition: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        orig_dtype = x.dtype
+        if residual is not None:
+            if post_residual_addition is not None:
+                residual = residual + post_residual_addition
+            x = x + residual
+            residual = x
+
+        x = x.float()
+        variance = x.pow(2).mean(dim=-1, keepdim=True)
+        x = x * torch.rsqrt(variance + self.variance_epsilon)
+        x = x * (1.0 + self.weight.float())
+        x = x.to(orig_dtype)
+        return x if residual is None else (x, residual)
+
+    def forward_cuda(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+        post_residual_addition: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        return self._forward_impl(x, residual, post_residual_addition)
+
+    def forward_cpu(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+        post_residual_addition: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        if _is_cpu_amx_available:
+            if residual is not None:
+                if post_residual_addition is not None:
+                    residual = residual + post_residual_addition
+                torch.ops.sgl_kernel.gemma_fused_add_rmsnorm_cpu(
+                    x, residual, self.weight.data, self.variance_epsilon
+                )
+                return x, residual
+            return torch.ops.sgl_kernel.gemma_rmsnorm_cpu(
+                x, self.weight.data, self.variance_epsilon
+            )
+        return self.forward_native(x, residual, post_residual_addition)
+
+    def forward_npu(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+        post_residual_addition: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        if envs.SGLANG_NPU_FORWARD_NATIVE_GEMMA_RMS_NORM.get():
+            return self.forward_native(x, residual)
+        if residual is not None:
+            if post_residual_addition is not None:
+                residual = residual + post_residual_addition
+            x = x + residual
+            residual = x
+
+        x, _ = torch_npu.npu_gemma_rms_norm(x, self.weight, self.variance_epsilon)
+        return x if residual is None else (x, residual)
+
+    def forward_xpu(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+        post_residual_addition: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        return self._forward_impl(x, residual, post_residual_addition)
+
+
+class Gemma3RMSNorm(MultiPlatformOp):
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.zeros(dim))
+        # Re-dispatch
+
+    def _norm(self, x):
+        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+
+    def forward_native(self, x):
+        output = self._norm(x.float())
+        # Llama does x.to(float16) * w whilst Gemma3 is (x * w).to(float16)
+        # See https://github.com/huggingface/transformers/pull/29402
+        output = output * (1.0 + self.weight.float())
+        return output.type_as(x)
+
+    def forward_cpu(self, x):
+        if _is_cpu_amx_available and x.stride(-1) == 1:
+            return torch.ops.sgl_kernel.gemma3_rmsnorm_cpu(x, self.weight, self.eps)
+        return self.forward_native(x)
+
+    def forward_cuda(self, x):
+        return self.forward_native(x)
+
+    def forward_npu(self, x):
+        output, _ = torch_npu.npu_gemma_rms_norm(x, self.weight, self.eps)
+        return output
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.eps}"
diff --git a/sglang/python/sglang/srt/layers/linear.py b/sglang/python/sglang/srt/layers/linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..02d11cc452ba1b4f8a867488d3a37fb04ae551e7
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/linear.py
@@ -0,0 +1,1609 @@
+"""Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/layers/linear.py"""
+
+from __future__ import annotations
+
+import itertools
+import logging
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
+
+import torch
+from torch import nn
+from torch.nn.parameter import Parameter, UninitializedParameter
+
+from sglang.srt.distributed import (
+    divide,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    get_tp_group,
+    split_tensor_along_last_dim,
+    tensor_model_parallel_all_gather,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.distributed.device_communicators.pynccl_allocator import (
+    use_symmetric_memory,
+)
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_group,
+    is_allocation_symmetric,
+)
+from sglang.srt.layers.parameter import (
+    BasevLLMParameter,
+    BlockQuantScaleParameter,
+    PackedColumnParameter,
+    PackedvLLMParameter,
+    PerTensorScaleParameter,
+    RowvLLMParameter,
+    _ColumnvLLMParameter,
+)
+from sglang.srt.layers.utils import pad_or_narrow_weight
+from sglang.srt.utils import get_bool_env_var, is_cpu, is_hip, is_npu, set_weight_attrs
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.quantization.base_config import (
+        QuantizationConfig,
+        QuantizeMethodBase,
+    )
+
+_is_hip = is_hip()
+_disable_hip_linear_quant = _is_hip and get_bool_env_var(
+    "SGLANG_ROCM_DISABLE_LINEARQUANT"
+)
+
+logger = logging.getLogger(__name__)
+
+WEIGHT_LOADER_V2_SUPPORTED = [
+    "CompressedTensorsLinearMethod",
+    "AWQMarlinLinearMethod",
+    "AWQLinearMethod",
+    "AWQLinearAscendMethod",
+    "GPTQMarlinLinearMethod",
+    "Fp8LinearMethod",
+    "BlockInt8LinearMethod",
+    "MarlinLinearMethod",
+    "QQQLinearMethod",
+    "GPTQMarlin24LinearMethod",
+    "TPUInt8LinearMethod",
+    "GPTQLinearMethod",
+    "GPTQLinearAscendMethod",
+    "GPTQMoEAscendMethod",
+    "FBGEMMFp8LinearMethod",
+    "GPTQLinearAscendMethod",
+    "ModelOptFp8LinearMethod",
+    "ModelOptFp4LinearMethod",
+    "IPEXAWQLinearMethod",
+    "PetitNvFp4LinearMethod",
+    "QuarkInt4Fp8LinearMethod",
+]
+
+_is_cpu = is_cpu()
+_is_npu = is_npu()
+
+
+def adjust_marlin_shard(param, shard_size, shard_offset):
+    marlin_tile_size = getattr(param, "marlin_tile_size", None)
+    if marlin_tile_size is None:
+        return shard_size, shard_offset
+
+    return shard_size * marlin_tile_size, shard_offset * marlin_tile_size
+
+
+def adjust_bitsandbytes_4bit_shard(
+    param: Parameter, shard_offsets: Dict[str, Tuple[int, int]], loaded_shard_id: str
+) -> Tuple[int, int]:
+    """Adjust the quantization offsets and sizes for BitsAndBytes sharding."""
+
+    total, _ = shard_offsets["total"]
+    orig_offset, orig_size = shard_offsets[loaded_shard_id]
+
+    quantized_total = param.data.shape[0]
+    quantized_offset = orig_offset * quantized_total // total
+    quantized_size = orig_size * quantized_total // total
+
+    return quantized_size, quantized_offset
+
+
+def adjust_scalar_to_fused_array(param, loaded_weight, shard_id):
+    """For fused modules (QKV and MLP) we have an array of length
+    N that holds 1 scale for each "logical" matrix. So the param
+    is an array of length N. The loaded_weight corresponds to
+    one of the shards on disk. Here, we slice the param based on
+    the shard_id for loading.
+    """
+    qkv_idxs = {"q": 0, "k": 1, "v": 2}
+
+    if isinstance(shard_id, str):
+        shard_id = qkv_idxs[shard_id]
+    elif not isinstance(shard_id, int):
+        raise ValueError(f"Unknown Shard Id {shard_id}")
+
+    # AutoFP8 scales do not have a shape
+    # compressed-tensors scales do have a shape
+    if len(loaded_weight.shape) != 0:
+        assert loaded_weight.shape[0] == 1
+        loaded_weight = loaded_weight[0]
+
+    return param[shard_id], loaded_weight
+
+
+def adjust_shard_offsets(shard_offsets, loaded_weight, dim):
+    actual_weight_size = loaded_weight.size(dim)
+    target_weight_size = shard_offsets[-1][-1] + shard_offsets[-1][-2]
+    if actual_weight_size != target_weight_size:
+        new_shard_offsets = []
+        new_offset = 0
+        for shard_id, shard_offset, shard_size in shard_offsets:
+            actual_shard_size = actual_weight_size * shard_size // target_weight_size
+            new_shard_offsets.append((shard_id, new_offset, actual_shard_size))
+            new_offset += actual_shard_size
+        return new_shard_offsets
+    return shard_offsets
+
+
+class LinearBase(torch.nn.Module):
+    """Base linear layer.
+
+    Args:
+        input_size: input dimension of the linear layer.
+        output_size: output dimension of the linear layer.
+        bias: If true, add bias.
+        skip_bias_add: If true, skip adding bias but instead return it.
+        params_dtype: Data type for the parameters.
+        quant_config: Quantization configure.
+    """
+
+    def __init__(
+        self,
+        input_size: int,
+        output_size: int,
+        skip_bias_add: bool = False,
+        params_dtype: Optional[torch.dtype] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        # Keep input parameters
+        self.input_size = input_size
+        self.output_size = output_size
+        self.skip_bias_add = skip_bias_add
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+        self.params_dtype = params_dtype
+        self.quant_config = quant_config
+        if quant_config is None:
+            from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
+
+            self.quant_method: Optional[QuantizeMethodBase] = UnquantizedLinearMethod()
+        else:
+            self.quant_method = quant_config.get_quant_method(self, prefix=prefix)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        raise NotImplementedError
+
+
+class ReplicatedLinear(LinearBase):
+    """Replicated linear layer.
+
+    Args:
+        input_size: input dimension of the linear layer.
+        output_size: output dimension of the linear layer.
+        bias: If true, add bias.
+        skip_bias_add: If true, skip adding bias but instead return it.
+        params_dtype: Data type for the parameters.
+        quant_config: Quantization configure.
+        prefix: The name of the layer in the state dict, including all parents
+                        (e.g. model.layers.0.qkv_proj)
+    """
+
+    def __init__(
+        self,
+        input_size: int,
+        output_size: int,
+        bias: bool = True,
+        skip_bias_add: bool = False,
+        params_dtype: Optional[torch.dtype] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__(
+            input_size,
+            output_size,
+            skip_bias_add,
+            params_dtype,
+            quant_config,
+            prefix=prefix,
+        )
+
+        # All the linear layer supports quant method.
+        assert self.quant_method is not None
+        self.quant_method.create_weights(
+            self,
+            self.input_size,
+            [self.output_size],
+            self.input_size,
+            self.output_size,
+            self.params_dtype,
+            weight_loader=self.weight_loader,
+        )
+
+        if bias:
+            self.bias = Parameter(
+                torch.empty(self.output_size, dtype=self.params_dtype)
+            )
+            set_weight_attrs(
+                self.bias,
+                {
+                    "output_dim": 0,
+                    "weight_loader": self.weight_loader,
+                },
+            )
+        else:
+            self.register_parameter("bias", None)
+
+    def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
+        # If the weight on disk does not have a shape, give it one
+        # (such scales for AutoFp8).
+        if len(loaded_weight.shape) == 0:
+            loaded_weight = loaded_weight.reshape(1)
+
+        # The per-tensor quant-scale must be 1 dimension
+        if _is_npu:
+            if param.size() != loaded_weight.size() and param.size(0) == 1:
+                if torch.allclose(loaded_weight, loaded_weight[0]):
+                    loaded_weight = loaded_weight[:1]
+                else:
+                    raise ValueError(f"{loaded_weight} are not all equal")
+
+            if param.dtype == torch.int8 or loaded_weight.dtype == torch.int8:
+                assert (
+                    param.dtype == loaded_weight.dtype
+                ), "init para dtype and loaded weight dtype should be the same"
+
+        assert param.size() == loaded_weight.size()
+        param.data.copy_(loaded_weight)
+
+    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        bias = self.bias if not self.skip_bias_add else None
+        assert self.quant_method is not None
+        output = self.quant_method.apply(self, x, bias)
+        output_bias = self.bias if self.skip_bias_add else None
+        return output, output_bias
+
+    def extra_repr(self) -> str:
+        s = f"in_features={self.input_size}"
+        s += f", output_features={self.output_size}"
+        s += f", bias={self.bias is not None}"
+        return s
+
+
+class ColumnParallelLinear(LinearBase):
+    """Linear layer with column parallelism.
+
+    The linear layer is defined as Y = XA + b. A is parallelized along
+    its second dimension as A = [A_1, ..., A_p].
+
+    Args:
+        input_size: first dimension of matrix A.
+        output_size: second dimension of matrix A.
+        bias: If true, add bias.
+        gather_output: If true, call all-gather on output and make Y available
+                       to all GPUs, otherwise, every GPU will have its output
+                       which is Y_i = XA_i
+        skip_bias_add: This was added to enable performance optimizations where
+                       bias can be fused with other element-wise operations. we
+                       skip adding bias but instead return it.
+        params_dtype: Data type for the parameters.
+        quant_config: Quantization configure.
+        output_sizes: list of output sizes packed into one output, like for QKV
+                       the list would be size 3.
+        prefix: The name of the layer in the state dict, including all parents
+                        (e.g. model.layers.0.qkv_proj)
+    """
+
+    def __init__(
+        self,
+        input_size: int,
+        output_size: int,
+        bias: bool = True,
+        gather_output: bool = False,
+        skip_bias_add: bool = False,
+        params_dtype: Optional[torch.dtype] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        output_sizes: Optional[List[int]] = None,
+        prefix: str = "",
+        tp_rank: Optional[int] = None,
+        tp_size: Optional[int] = None,
+        use_presharded_weights: bool = False,
+        skip_block_quant_check: bool = False,
+    ):
+        super().__init__(
+            input_size, output_size, skip_bias_add, params_dtype, quant_config, prefix
+        )
+
+        self.gather_output = gather_output
+        self.use_presharded_weights = use_presharded_weights
+
+        # Divide the weight matrix along the last dimension.
+        if tp_rank is None:
+            tp_rank = get_tensor_model_parallel_rank()
+        if tp_size is None:
+            tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank, self.tp_size = tp_rank, tp_size
+        assert self.quant_method is not None
+        self.output_size_per_partition = divide(self.output_size, tp_size)
+        self.output_partition_sizes = [self.output_size_per_partition]
+        # If QKV or MergedColumn, use output size of each partition.
+        if hasattr(self, "output_sizes"):
+            self.output_partition_sizes = [
+                divide(output_size, tp_size) for output_size in self.output_sizes
+            ]
+
+        if output_sizes is None:
+            output_sizes = [output_size]
+
+        self.quant_method.create_weights(
+            layer=self,
+            input_size_per_partition=self.input_size,
+            output_partition_sizes=self.output_partition_sizes,
+            input_size=self.input_size,
+            output_size=self.output_size,
+            params_dtype=self.params_dtype,
+            skip_block_quant_check=skip_block_quant_check,
+            weight_loader=(
+                self.weight_loader_v2
+                if self.quant_method.__class__.__name__ in WEIGHT_LOADER_V2_SUPPORTED
+                else self.weight_loader
+            ),
+        )
+        if bias:
+            self.bias = Parameter(
+                torch.zeros(self.output_size_per_partition, dtype=params_dtype)
+            )
+            set_weight_attrs(
+                self.bias,
+                {
+                    "output_dim": 0,
+                    "weight_loader": self.weight_loader,
+                },
+            )
+        else:
+            self.register_parameter("bias", None)
+
+    def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
+        output_dim = getattr(param, "output_dim", None)
+        param_data = param.data
+
+        # Special case for GGUF
+        is_gguf_weight = getattr(param, "is_gguf_weight", False)
+        is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
+        if is_gguf_weight_type:
+            param.weight_type = loaded_weight.item()
+
+        # Materialize GGUF UninitializedParameter
+        if is_gguf_weight and isinstance(param, UninitializedParameter):
+            param.materialize(loaded_weight.shape, dtype=loaded_weight.dtype)
+
+        # bitsandbytes loads the weights of the specific portion
+        # no need to narrow here
+        use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
+        if output_dim is not None and not use_bitsandbytes_4bit:
+            shard_size = param_data.shape[output_dim]
+            start_idx = self.tp_rank * shard_size
+
+            if _is_cpu:
+                from sglang.srt.model_loader.weight_utils import (
+                    narrow_padded_param_and_loaded_weight,
+                )
+
+                param_data, loaded_weight = narrow_padded_param_and_loaded_weight(
+                    param_data,
+                    loaded_weight,
+                    0,  # param_data_start
+                    start_idx,
+                    output_dim,
+                    shard_size,
+                    not self.use_presharded_weights,
+                )
+            else:
+                if not self.use_presharded_weights:
+                    loaded_weight = loaded_weight.narrow(
+                        output_dim, start_idx, shard_size
+                    )
+
+        # Special case for loading scales off disk, which often do not
+        # have a shape (such as in the case of AutoFP8).
+        if len(loaded_weight.shape) == 0:
+            loaded_weight = loaded_weight.reshape(1)
+
+        assert param_data.shape == loaded_weight.shape
+        param_data.copy_(loaded_weight)
+
+    def weight_loader_v2(self, param: Parameter, loaded_weight: torch.Tensor):
+        # Special case for loading scales off disk, which often do not
+        # have a shape (such as in the case of AutoFP8).
+        if len(loaded_weight.shape) == 0:
+            assert loaded_weight.numel() == 1
+            loaded_weight = loaded_weight.reshape(1)
+
+        if isinstance(param, _ColumnvLLMParameter):
+            param.load_column_parallel_weight(
+                loaded_weight,
+                tp_rank=self.tp_rank,
+                use_presharded_weights=self.use_presharded_weights,
+            )
+        else:
+            # FIXME: This branch is needed to load deepseek v3 awq.
+            # However, we should fix this and avoid the branching here.
+            # After QuantizedRL reload, params might still need tp_rank
+            try:
+                param.load_column_parallel_weight(
+                    loaded_weight,
+                    tp_rank=self.tp_rank,
+                    use_presharded_weights=self.use_presharded_weights,
+                )
+            except TypeError:
+                # Fallback for parameters that don't accept additional args
+                param.load_column_parallel_weight(loaded_weight)
+
+    def forward(self, input_):
+        bias = self.bias if not self.skip_bias_add else None
+
+        # Matrix multiply.
+        assert self.quant_method is not None
+        output_parallel = self.quant_method.apply(self, input_, bias)
+        if self.gather_output:
+            # All-gather across the partitions.
+            output = tensor_model_parallel_all_gather(output_parallel)
+        else:
+            output = output_parallel
+        output_bias = self.bias if self.skip_bias_add else None
+        return output, output_bias
+
+    def extra_repr(self) -> str:
+        s = f"in_features={self.input_size}"
+        s += f", output_features={self.output_size_per_partition}"
+        s += f", bias={self.bias is not None}"
+        s += f", tp_size={self.tp_size}"
+        s += f", gather_output={self.gather_output}"
+        return s
+
+
+class MergedColumnParallelLinear(ColumnParallelLinear):
+    """Packed linear layers with column parallelism.
+
+    Similar to ColumnParallelLinear, but the weight matrix is concatenated
+    along the output dimension. When the weight matrix is loaded, the
+    different partitions are sharded separately.
+
+    Args:
+        input_size: input dimension of the linear layer.
+        output_sizes: list of output dimensions of the linear layer.
+        bias: If true, add bias.
+        gather_output: If true, call all-gather on output and make the output
+                       available to all GPUs, otherwise, every GPU will have
+                       its own output.
+        skip_bias_add: This was added to enable performance optimizations where
+                       bias can be fused with other element-wise operations. we
+                       skip adding bias but instead return it.
+        params_dtype: Data type for the parameters.
+        quant_config: Quantization configure.
+        prefix: The name of the layer in the state dict, including all parents
+                        (e.g. model.layers.0.qkv_proj)
+    """
+
+    def __init__(
+        self,
+        input_size: int,
+        output_sizes: List[int],
+        bias: bool = True,
+        gather_output: bool = False,
+        skip_bias_add: bool = False,
+        params_dtype: Optional[torch.dtype] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        tp_rank: Optional[int] = None,
+        tp_size: Optional[int] = None,
+        use_presharded_weights: bool = False,
+    ):
+        self.output_sizes = output_sizes
+        if tp_rank is None:
+            tp_rank = get_tensor_model_parallel_rank()
+        if tp_size is None:
+            tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank, self.tp_size = tp_rank, tp_size
+        assert all(output_size % tp_size == 0 for output_size in output_sizes)
+        self.use_presharded_weights = use_presharded_weights
+        super().__init__(
+            input_size=input_size,
+            output_size=sum(output_sizes),
+            bias=bias,
+            gather_output=gather_output,
+            skip_bias_add=skip_bias_add,
+            params_dtype=params_dtype,
+            quant_config=quant_config,
+            prefix=prefix,
+            tp_rank=tp_rank,
+            tp_size=tp_size,
+            use_presharded_weights=use_presharded_weights,
+        )
+        self.prefix = prefix
+
+    def weight_loader(
+        self,
+        param: Parameter,
+        loaded_weight: torch.Tensor,
+        loaded_shard_id: Optional[int] = None,
+    ):
+
+        # Special case for GGUF
+        # initialize GGUF param after we know the quantize type
+        is_gguf_weight = getattr(param, "is_gguf_weight", False)
+        is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
+        if is_gguf_weight_type:
+            param.data[loaded_shard_id].copy_(loaded_weight)
+            param.shard_weight_type[loaded_shard_id] = loaded_weight.item()
+            return
+
+        if is_gguf_weight:
+            output_dim = getattr(param, "output_dim", None)
+            shard_size = loaded_weight.size(output_dim) // self.tp_size
+            start_idx = self.tp_rank * shard_size
+
+            loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
+
+            param.shard_id.append(loaded_shard_id)
+            param.shard_id_map[loaded_shard_id] = len(param.data_container)
+            param.data_container.append(loaded_weight)
+            return
+
+        param_data = param.data
+        output_dim = getattr(param, "output_dim", None)
+        # Special case for AQLM codebooks.
+        is_metadata = getattr(param, "is_metadata", False)
+        # Special case for per-tensor scale to load scalar into fused array.
+        needs_scalar_to_array = getattr(param, "needs_scalar_to_array", False)
+
+        if loaded_shard_id is None:
+            # Loaded weight is already fused on disk (qkv/mlp).
+            if output_dim is None:
+                if needs_scalar_to_array:
+                    param_data, loaded_weight = adjust_scalar_to_fused_array(
+                        param_data, loaded_weight, 0
+                    )
+
+                assert param_data.shape == loaded_weight.shape
+                param_data.copy_(loaded_weight)
+                return
+            current_shard_offset = 0
+            shard_offsets: List[Tuple[int, int, int]] = []
+            for i, output_size in enumerate(self.output_sizes):
+                effective_size = (
+                    output_size // self.tp_size
+                    if self.use_presharded_weights
+                    else output_size
+                )
+                shard_offsets.append((i, current_shard_offset, effective_size))
+                current_shard_offset += effective_size
+            packed_dim = getattr(param, "packed_dim", None)
+
+            use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
+            if _is_cpu:
+                shard_offsets = adjust_shard_offsets(
+                    shard_offsets, loaded_weight, output_dim
+                )
+
+            for shard_id, shard_offset, shard_size in shard_offsets:
+                # Special case for Quantization.
+                # If quantized, we need to adjust the offset and size to account
+                # for the packing.
+                if packed_dim == output_dim:
+                    shard_size = shard_size // param.pack_factor
+                    shard_offset = shard_offset // param.pack_factor
+                    # Special case for Marlin.
+                    shard_size, shard_offset = adjust_marlin_shard(
+                        param, shard_size, shard_offset
+                    )
+
+                if use_bitsandbytes_4bit:
+                    index = list(itertools.accumulate([0] + self.output_sizes))
+                    orig_offsets = {
+                        str(i): (index[i], size)
+                        for i, size in enumerate(self.output_sizes)
+                    }
+                    orig_offsets["total"] = (self.output_size, 0)
+                    shard_size, shard_offset = adjust_bitsandbytes_4bit_shard(
+                        param, orig_offsets, str(shard_id)
+                    )
+
+                loaded_weight_shard = loaded_weight.narrow(
+                    output_dim, shard_offset, shard_size
+                )
+                self.weight_loader(param, loaded_weight_shard, shard_id)
+            return
+
+        assert loaded_shard_id < len(self.output_sizes)
+        if output_dim is not None:
+            shard_offset = sum(self.output_sizes[:loaded_shard_id]) // self.tp_size
+            shard_size = self.output_sizes[loaded_shard_id] // self.tp_size
+            # Special case for quantization.
+            # If quantized, we need to adjust the offset and size to account
+            # for the packing.
+            packed_dim = getattr(param, "packed_dim", None)
+            if packed_dim == output_dim:
+                shard_size = shard_size // param.pack_factor
+                shard_offset = shard_offset // param.pack_factor
+                # Special case for Marlin.
+                shard_size, shard_offset = adjust_marlin_shard(
+                    param, shard_size, shard_offset
+                )
+
+            use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
+            if use_bitsandbytes_4bit:
+                shard_size = loaded_weight.shape[output_dim]
+                shard_offset = loaded_weight.shape[output_dim] * loaded_shard_id
+
+            param_data = param_data.narrow(output_dim, shard_offset, shard_size)
+            start_idx = self.tp_rank * shard_size
+
+            if _is_cpu:
+                from sglang.srt.model_loader.weight_utils import (
+                    narrow_padded_param_and_loaded_weight,
+                )
+
+                param_data, loaded_weight = narrow_padded_param_and_loaded_weight(
+                    param_data,
+                    loaded_weight,
+                    0,  # param_data_start
+                    start_idx,
+                    output_dim,
+                    shard_size,
+                    not use_bitsandbytes_4bit and not self.use_presharded_weights,
+                )
+            else:
+                # bitsandbytes loads the weights of the specific portion
+                # no need to narrow here
+                if not use_bitsandbytes_4bit and not self.use_presharded_weights:
+                    # Padding for special case like qwen2_5_VL's mlp which is not 8-aligned
+                    end_idx = start_idx + shard_size
+                    if end_idx > loaded_weight.shape[output_dim]:
+                        loaded_weight = pad_or_narrow_weight(
+                            loaded_weight, output_dim, start_idx, shard_size
+                        )
+                    else:
+                        loaded_weight = loaded_weight.narrow(
+                            output_dim, start_idx, shard_size
+                        )
+
+        # Special case for AQLM codebooks.
+        elif is_metadata:
+            # metadata indicates fixed size concatenated along dim 0
+            shard_size = loaded_weight.shape[0]
+            shard_offset = loaded_shard_id * shard_size
+            param_data = param_data.narrow(0, shard_offset, shard_size)
+
+        # Special case for per-tensor scales in fused case.
+        elif needs_scalar_to_array:
+            param_data, loaded_weight = adjust_scalar_to_fused_array(
+                param_data, loaded_weight, loaded_shard_id
+            )
+
+        else:
+            ignore_warning = getattr(param, "ignore_warning", False)
+            if not ignore_warning:
+                logger.warning(
+                    "Loading a weight without `output_dim` attribute in "
+                    "MergedColumnParallelLinear, assume the weight is "
+                    "the same for all partitions."
+                )
+
+        assert param_data.shape == loaded_weight.shape
+        param_data.copy_(loaded_weight)
+
+    def _load_fused_module_from_checkpoint(
+        self, param: BasevLLMParameter, loaded_weight: torch.Tensor
+    ):
+        """
+        Handle special case for models where MLP layers are already
+        fused on disk. In this case, we have no shard id. This function
+        determmines the shard id by splitting these layers and then calls
+        the weight loader using the shard id.
+
+        An example of a model with these fused layers:
+        https://huggingface.co/microsoft/Phi-3-mini-4k-instruct
+        """
+
+        current_shard_offset = 0
+        shard_offsets: List[Tuple[int, int, int]] = []
+        for i, output_size in enumerate(self.output_sizes):
+            shard_offsets.append((i, current_shard_offset, output_size))
+            current_shard_offset += output_size
+
+        for shard_id, shard_offset, shard_size in shard_offsets:
+            # Special case for Quantization.
+            # If quantized, we need to adjust the offset and size to account
+            # for the packing.
+            if (
+                isinstance(param, (PackedColumnParameter, PackedvLLMParameter))
+                and param.packed_dim == param.output_dim
+            ):
+                shard_size, shard_offset = param.adjust_shard_indexes_for_packing(
+                    shard_size=shard_size, shard_offset=shard_offset
+                )
+
+            loaded_weight_shard = loaded_weight.narrow(
+                param.output_dim, shard_offset, shard_size
+            )
+            self.weight_loader_v2(param, loaded_weight_shard, shard_id)
+
+    def _load_merged_block_scale(
+        self, param: BasevLLMParameter, loaded_weight: torch.Tensor
+    ):
+        """
+        Handle block-wise scale loading for MergedColumnParallelLinear.
+        Similar to QKVParallelLinear._load_qkv_block_scale, but for merged column layers.
+        """
+        weight_block_size = self.quant_method.quant_config.weight_block_size
+        block_n, _ = weight_block_size[0], weight_block_size[1]
+        block_n = 1 if getattr(param, "format_ue8m0", False) else block_n
+
+        # Calculate block sizes for each shard
+        shard_block_sizes = []
+        shard_block_offsets = []
+        current_block_offset = 0
+        for output_size in self.output_sizes:
+            shard_block_size = (output_size + block_n - 1) // block_n
+            shard_block_sizes.append(shard_block_size)
+            shard_block_offsets.append(current_block_offset)
+            current_block_offset += shard_block_size
+
+        # Load each shard
+        for shard_id, (shard_block_offset, shard_block_size) in enumerate(
+            zip(shard_block_offsets, shard_block_sizes)
+        ):
+            # Extract the shard from loaded_weight
+            loaded_weight_shard = loaded_weight.narrow(
+                param.output_dim, shard_block_offset, shard_block_size
+            )
+
+            # Calculate per-rank offset and size (considering TP)
+            rank_shard_offset = shard_block_offset // self.tp_size
+            rank_shard_size = shard_block_size // self.tp_size
+
+            # Load into the parameter
+            param.load_merged_column_weight(
+                loaded_weight=loaded_weight_shard,
+                shard_id=shard_id,
+                shard_offset=rank_shard_offset,
+                shard_size=rank_shard_size,
+                tp_rank=self.tp_rank,
+                tp_size=self.tp_size,
+                use_presharded_weights=self.use_presharded_weights,
+            )
+
+    def weight_loader_v2(
+        self,
+        param: BasevLLMParameter,
+        loaded_weight: torch.Tensor,
+        loaded_shard_id: Optional[int] = None,
+    ):
+        if loaded_shard_id is None:
+            if isinstance(param, PerTensorScaleParameter):
+                param.load_merged_column_weight(
+                    loaded_weight=loaded_weight,
+                    shard_id=0,
+                    tp_rank=self.tp_rank,
+                    tp_size=self.tp_size,
+                )
+                return
+            elif isinstance(param, BlockQuantScaleParameter):
+                self._load_merged_block_scale(param, loaded_weight)
+                return
+            elif type(param) in (RowvLLMParameter, BasevLLMParameter):
+                param.load_merged_column_weight(
+                    loaded_weight=loaded_weight,
+                    tp_rank=self.tp_rank,
+                    tp_size=self.tp_size,
+                )
+                return
+            # TODO: @dsikka - move to parameter.py
+            self._load_fused_module_from_checkpoint(param, loaded_weight)
+            return
+
+        assert loaded_shard_id < len(self.output_sizes)
+
+        if isinstance(param, BlockQuantScaleParameter):
+            weight_block_size = self.quant_method.quant_config.weight_block_size
+            raw_block_n, _ = weight_block_size[0], weight_block_size[1]
+            block_n = 1 if getattr(param, "format_ue8m0", False) else raw_block_n
+            shard_offset = (
+                (sum(self.output_sizes[:loaded_shard_id]) + block_n - 1) // block_n
+            ) // self.tp_size
+            shard_size = (
+                (self.output_sizes[loaded_shard_id] + block_n - 1)
+                // block_n
+                // self.tp_size
+            )
+        else:
+            shard_offset = sum(self.output_sizes[:loaded_shard_id]) // self.tp_size
+            shard_size = self.output_sizes[loaded_shard_id] // self.tp_size
+
+        param.load_merged_column_weight(
+            loaded_weight=loaded_weight,
+            shard_id=loaded_shard_id,
+            shard_offset=shard_offset,
+            shard_size=shard_size,
+            use_presharded_weights=self.use_presharded_weights,
+            tp_rank=self.tp_rank,
+            tp_size=self.tp_size,
+        )
+
+
+class QKVParallelLinear(ColumnParallelLinear):
+    """Linear layers for the attention's QKV transformation.
+
+    Linear layers for the linear transformation of the query, key, and value
+    vectors in the attention layer. The weight matrix is concatenated along
+    the output dimension. The layer is parallelized along the head dimension.
+    When the number of key/value heads is smaller than the number of query
+    heads (e.g., multi-query/grouped-query attention), the key/value head may
+    be replicated while the query heads are partitioned.
+
+    Args:
+        hidden_size: input hidden state size of the transformer.
+        head_size: size of each attention head.
+        total_num_heads: total number of attention query heads.
+        total_num_kv_heads: total number of attention key/value heads. If
+                            None, assume total_num_kv_heads = total_num_heads.
+        bias: If true, add bias.
+        skip_bias_add: This was added to enable performance optimizations where
+                       bias can be fused with other element-wise operations. we
+                       skip adding bias but instead return it.
+        params_dtype: Data type for the parameters.
+        quant_config: Quantization configure.
+        prefix: The name of the layer in the state dict, including all parents
+                        (e.g. model.layers.0.qkv_proj)
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,
+        head_size: int,
+        total_num_heads: int,
+        total_num_kv_heads: Optional[int] = None,
+        bias: bool = True,
+        skip_bias_add: bool = False,
+        params_dtype: Optional[torch.dtype] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        tp_rank: Optional[int] = None,
+        tp_size: Optional[int] = None,
+        load_presharded_attn: bool = False,
+        v_head_size: Optional[int] = None,
+        skip_block_quant_check: bool = False,
+    ):
+        self.hidden_size = hidden_size
+        self.head_size = head_size
+        self.v_head_size = v_head_size if v_head_size is not None else head_size
+        self.total_num_heads = total_num_heads
+        if total_num_kv_heads is None:
+            total_num_kv_heads = total_num_heads
+        self.total_num_kv_heads = total_num_kv_heads
+        # Divide the weight matrix along the last dimension.
+        if tp_rank is None:
+            tp_rank = get_tensor_model_parallel_rank()
+        if tp_size is None:
+            tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank, self.tp_size = tp_rank, tp_size
+        self.num_heads = divide(self.total_num_heads, tp_size)
+        if tp_size >= self.total_num_kv_heads:
+            self.num_kv_heads = 1
+            self.num_kv_head_replicas = divide(tp_size, self.total_num_kv_heads)
+        else:
+            self.num_kv_heads = divide(self.total_num_kv_heads, tp_size)
+            self.num_kv_head_replicas = 1
+        self.q_proj_shard_size = self.num_heads * self.head_size
+        self.kv_proj_shard_size = self.num_kv_heads * self.head_size
+        self.v_proj_shard_size = self.num_kv_heads * self.v_head_size
+        input_size = self.hidden_size
+        output_size = (
+            self.num_heads * self.head_size
+            + self.num_kv_heads * self.head_size
+            + self.num_kv_heads * self.v_head_size
+        ) * tp_size
+        self.output_sizes = [
+            self.num_heads * self.head_size * tp_size,  # q_proj
+            self.num_kv_heads * self.head_size * tp_size,  # k_proj
+            self.num_kv_heads * self.v_head_size * tp_size,  # v_proj
+        ]
+        self.use_presharded_weights = load_presharded_attn
+        quant_config = None if _disable_hip_linear_quant else quant_config
+
+        super().__init__(
+            input_size=input_size,
+            output_size=output_size,
+            bias=bias,
+            gather_output=False,
+            skip_bias_add=skip_bias_add,
+            params_dtype=params_dtype,
+            quant_config=quant_config,
+            prefix=prefix,
+            tp_rank=tp_rank,
+            tp_size=tp_size,
+            use_presharded_weights=self.use_presharded_weights,
+            skip_block_quant_check=skip_block_quant_check,
+        )
+
+    def _get_shard_offset_mapping(self, loaded_shard_id: str):
+        shard_offset_mapping = {
+            "q": 0,
+            "k": self.num_heads * self.head_size,
+            "v": (self.num_heads + self.num_kv_heads) * self.head_size,
+            "total": (self.num_heads + self.num_kv_heads) * self.head_size
+            + self.num_kv_heads * self.v_head_size,
+        }
+        return shard_offset_mapping.get(loaded_shard_id)
+
+    def _get_shard_size_mapping(self, loaded_shard_id: str):
+        shard_size_mapping = {
+            "q": self.num_heads * self.head_size,
+            "k": self.num_kv_heads * self.head_size,
+            "v": self.num_kv_heads * self.v_head_size,
+        }
+        return shard_size_mapping.get(loaded_shard_id)
+
+    def _load_fused_module_from_checkpoint(
+        self, param: BasevLLMParameter, loaded_weight: torch.Tensor
+    ):
+        """
+        Handle special case for models where QKV layers are already
+        fused on disk. In this case, we have no shard id. This function
+        determmines the shard id by splitting these layers and then calls
+        the weight loader using the shard id.
+
+        An example of a model with these fused layers:
+        https://huggingface.co/microsoft/Phi-3-mini-4k-instruct
+        """
+        shard_offsets = [
+            # (shard_id, shard_offset, shard_size)
+            ("q", 0, self.total_num_heads * self.head_size),
+            (
+                "k",
+                self.total_num_heads * self.head_size,
+                self.total_num_kv_heads * self.head_size,
+            ),
+            (
+                "v",
+                (self.total_num_heads + self.total_num_kv_heads) * self.head_size,
+                self.total_num_kv_heads * self.v_head_size,
+            ),
+        ]
+
+        for shard_id, shard_offset, shard_size in shard_offsets:
+            # Special case for Quantization.
+            # If quantized, we need to adjust the offset and size to account
+            # for the packing.
+            if (
+                isinstance(param, (PackedColumnParameter, PackedvLLMParameter))
+                and param.packed_dim == param.output_dim
+            ):
+                shard_size, shard_offset = param.adjust_shard_indexes_for_packing(
+                    shard_size=shard_size, shard_offset=shard_offset
+                )
+
+            if not self.use_presharded_weights:
+                loaded_weight_shard = loaded_weight.narrow(
+                    param.output_dim, shard_offset, shard_size
+                )
+            self.weight_loader_v2(param, loaded_weight_shard, shard_id)
+
+    def _load_qkv_block_scale(
+        self, param: BasevLLMParameter, loaded_weight: torch.Tensor
+    ):
+        block_n, _ = self.quant_method.quant_config.weight_block_size
+        q_size = self.total_num_heads * self.head_size // block_n
+        k_size = self.total_num_kv_heads * self.head_size // block_n
+        v_size = self.total_num_kv_heads * self.head_size // block_n
+        shard_offsets = [
+            # (shard_id, shard_offset, shard_size)
+            ("q", 0, q_size),
+            ("k", q_size, k_size),
+            ("v", q_size + k_size, v_size),
+        ]
+        for shard_id, shard_offset, shard_size in shard_offsets:
+            loaded_weight_shard = loaded_weight.narrow(
+                param.output_dim, shard_offset, shard_size
+            )
+            rank_shard_offset = self._get_shard_offset_mapping(shard_id) // block_n
+            rank_shard_size = self._get_shard_size_mapping(shard_id) // block_n
+            param.load_qkv_weight(
+                loaded_weight=loaded_weight_shard,
+                num_heads=self.num_kv_head_replicas,
+                shard_id=shard_id,
+                shard_offset=rank_shard_offset,
+                shard_size=rank_shard_size,
+                tp_rank=self.tp_rank,
+                use_presharded_weights=self.use_presharded_weights,
+            )
+
+    def weight_loader_v2(
+        self,
+        param: BasevLLMParameter,
+        loaded_weight: torch.Tensor,
+        loaded_shard_id: Optional[str] = None,
+    ):
+        if loaded_shard_id is None:  # special case for certain models
+            if isinstance(param, PerTensorScaleParameter):
+                param.load_qkv_weight(loaded_weight=loaded_weight, shard_id=0)
+                return
+            elif type(param) in (RowvLLMParameter, BasevLLMParameter):
+                param.load_qkv_weight(loaded_weight=loaded_weight)
+                return
+            elif isinstance(param, BlockQuantScaleParameter):
+                self._load_qkv_block_scale(param, loaded_weight)
+                return
+            # TODO: @dsikka - move to parameter.py
+            self._load_fused_module_from_checkpoint(param, loaded_weight)
+            return
+
+        assert loaded_shard_id in ["q", "k", "v"]
+
+        shard_offset = self._get_shard_offset_mapping(loaded_shard_id)
+        shard_size = self._get_shard_size_mapping(loaded_shard_id)
+
+        if isinstance(param, BlockQuantScaleParameter):
+            weight_block_size = self.quant_method.quant_config.weight_block_size
+            raw_block_n, _ = weight_block_size[0], weight_block_size[1]
+            block_n = 1 if getattr(param, "format_ue8m0", False) else raw_block_n
+            shard_offset = (shard_offset + block_n - 1) // block_n
+            shard_size = (shard_size + block_n - 1) // block_n
+
+        param.load_qkv_weight(
+            loaded_weight=loaded_weight,
+            num_heads=self.num_kv_head_replicas,
+            shard_id=loaded_shard_id,
+            shard_offset=shard_offset,
+            shard_size=shard_size,
+            tp_rank=self.tp_rank,
+            use_presharded_weights=self.use_presharded_weights,
+        )
+
+    def weight_loader(
+        self,
+        param: Parameter,
+        loaded_weight: torch.Tensor,
+        loaded_shard_id: Optional[str] = None,
+    ):
+
+        # Special case for GGUF
+        # initialize GGUF param after we know the quantize type
+        is_gguf_weight = getattr(param, "is_gguf_weight", False)
+        is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
+        if is_gguf_weight_type and loaded_shard_id is not None:
+            idx_map = {"q": 0, "k": 1, "v": 2}
+            param.data[idx_map[loaded_shard_id]].copy_(loaded_weight)
+            param.shard_weight_type[loaded_shard_id] = loaded_weight.item()
+            return
+
+        if is_gguf_weight:
+            output_dim = getattr(param, "output_dim", None)
+            shard_size = loaded_weight.size(output_dim) // self.tp_size
+            start_idx = self.tp_rank * shard_size
+
+            loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
+
+            param.shard_id.append(loaded_shard_id)
+            param.shard_id_map[loaded_shard_id] = len(param.data_container)
+            param.data_container.append(loaded_weight)
+            return
+
+        param_data = param.data
+        output_dim = getattr(param, "output_dim", None)
+        # Special case for AQLM codebooks.
+        is_metadata = getattr(param, "is_metadata", False)
+
+        # Special case for per-tensor scales in fused case.
+        needs_scalar_to_array = getattr(param, "needs_scalar_to_array", False)
+
+        if loaded_shard_id is None:
+            # Loaded weight is already fused on disk (qkv/mlp).
+            if output_dim is None:
+                if needs_scalar_to_array:
+                    param_data, loaded_weight = adjust_scalar_to_fused_array(
+                        param_data, loaded_weight, 0
+                    )
+
+                assert param_data.shape == loaded_weight.shape
+                param_data.copy_(loaded_weight)
+                return
+            shard_offsets = [
+                # (shard_id, shard_offset, shard_size)
+                ("q", 0, self.total_num_heads * self.head_size),
+                (
+                    "k",
+                    self.total_num_heads * self.head_size,
+                    self.total_num_kv_heads * self.head_size,
+                ),
+                (
+                    "v",
+                    (self.total_num_heads + self.total_num_kv_heads) * self.head_size,
+                    self.total_num_kv_heads * self.v_head_size,
+                ),
+            ]
+            use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
+
+            packed_dim = getattr(param, "packed_dim", None)
+            if _is_cpu:
+                shard_offsets = adjust_shard_offsets(
+                    shard_offsets, loaded_weight, output_dim
+                )
+
+            for shard_id, shard_offset, shard_size in shard_offsets:
+                # Special case for Quantized Weights.
+                # If quantized, we need to adjust the offset and size to account
+                # for the packing.
+                if packed_dim == output_dim:
+                    shard_size = shard_size // param.pack_factor
+                    shard_offset = shard_offset // param.pack_factor
+
+                    # Special case for Marlin.
+                    shard_size, shard_offset = adjust_marlin_shard(
+                        param, shard_size, shard_offset
+                    )
+
+                if use_bitsandbytes_4bit:
+                    orig_qkv_offsets = {
+                        "q": (0, self.total_num_heads * self.head_size),
+                        "k": (
+                            self.total_num_heads * self.head_size,
+                            self.total_num_kv_heads * self.head_size,
+                        ),
+                        "v": (
+                            (self.total_num_heads + self.total_num_kv_heads)
+                            * self.head_size,
+                            self.total_num_kv_heads * self.v_head_size,
+                        ),
+                        "total": (
+                            (self.total_num_heads + self.total_num_kv_heads)
+                            * self.head_size
+                            + self.total_num_kv_heads * self.v_head_size,
+                            0,
+                        ),
+                    }
+
+                    shard_size, shard_offset = adjust_bitsandbytes_4bit_shard(
+                        param, orig_qkv_offsets, shard_id
+                    )
+
+                if not self.use_presharded_weights:
+                    loaded_weight_shard = loaded_weight.narrow(
+                        output_dim, shard_offset, shard_size
+                    )
+                self.weight_loader(param, loaded_weight_shard, shard_id)
+            return
+
+        assert loaded_shard_id in ["q", "k", "v"]
+
+        # If output dim is defined, use the default loading process.
+        if output_dim is not None:
+            if loaded_shard_id == "q":
+                shard_offset = 0
+                shard_size = self.num_heads * self.head_size
+            elif loaded_shard_id == "k":
+                shard_offset = self.num_heads * self.head_size
+                shard_size = self.num_kv_heads * self.head_size
+            elif loaded_shard_id == "v":
+                shard_offset = (self.num_heads + self.num_kv_heads) * self.head_size
+                shard_size = self.num_kv_heads * self.v_head_size
+            # Special case for Quantized Weights.
+            # If quantized, we need to adjust the offset and size to account
+            # for the packing.
+            packed_dim = getattr(param, "packed_dim", None)
+            if packed_dim == output_dim:
+                shard_size = shard_size // param.pack_factor
+                shard_offset = shard_offset // param.pack_factor
+
+                # Special case for Marlin.
+                shard_size, shard_offset = adjust_marlin_shard(
+                    param, shard_size, shard_offset
+                )
+
+            use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
+            if use_bitsandbytes_4bit:
+                orig_qkv_offsets = {
+                    "q": (0, self.num_heads * self.head_size),
+                    "k": (
+                        self.num_heads * self.head_size,
+                        self.num_kv_heads * self.head_size,
+                    ),
+                    "v": (
+                        (self.num_heads + self.num_kv_heads) * self.head_size,
+                        self.num_kv_heads * self.v_head_size,
+                    ),
+                    "total": (
+                        (self.num_heads + self.num_kv_heads) * self.head_size
+                        + self.num_kv_heads * self.v_head_size,
+                        0,
+                    ),
+                }
+                shard_size, shard_offset = adjust_bitsandbytes_4bit_shard(
+                    param, orig_qkv_offsets, loaded_shard_id
+                )
+
+            param_data = param_data.narrow(output_dim, shard_offset, shard_size)
+            if loaded_shard_id == "q":
+                shard_id = self.tp_rank
+            else:
+                shard_id = self.tp_rank // self.num_kv_head_replicas
+            start_idx = shard_id * shard_size
+
+            if _is_cpu:
+                from sglang.srt.model_loader.weight_utils import (
+                    narrow_padded_param_and_loaded_weight,
+                )
+
+                param_data, loaded_weight = narrow_padded_param_and_loaded_weight(
+                    param_data,
+                    loaded_weight,
+                    0,  # param_data_start
+                    start_idx,
+                    output_dim,
+                    shard_size,
+                    not use_bitsandbytes_4bit and not self.use_presharded_weights,
+                )
+            else:
+                # bitsandbytes loads the weights of the specific portion
+                # no need to narrow here
+                if not use_bitsandbytes_4bit and not self.use_presharded_weights:
+                    loaded_weight = loaded_weight.narrow(
+                        output_dim, start_idx, shard_size
+                    )
+
+        # Special case for for AQLM codebooks.
+        elif is_metadata:
+            # metadata indicates fixed size concatenated along dim 0
+            shard_size = loaded_weight.shape[0]
+            shard_index = ["q", "k", "v"].index(loaded_shard_id)
+            param_data = param_data.narrow(0, shard_index * shard_size, shard_size)
+        # Special case for per-tensor scales in fused case.
+        elif needs_scalar_to_array:
+            param_data, loaded_weight = adjust_scalar_to_fused_array(
+                param_data, loaded_weight, loaded_shard_id
+            )
+        else:
+            ignore_warning = getattr(param, "ignore_warning", False)
+            if not ignore_warning:
+                logger.warning(
+                    "Loading a weight without `output_dim` attribute in "
+                    "QKVParallelLinear, assume the weight is the same "
+                    "for all partitions."
+                )
+
+        assert (
+            param_data.shape == loaded_weight.shape
+        ), f"{param_data.shape=} {loaded_weight.shape=}"
+        param_data.copy_(loaded_weight)
+
+
+class RowParallelLinear(LinearBase):
+    """Linear layer with row parallelism.
+
+    The linear layer is defined as Y = XA + b. A is parallelized along
+    its first dimension and X along its second dimension as:
+               -   -
+              | A_1 |
+              | .   |
+          A = | .   |        X = [X_1, ..., X_p]
+              | .   |
+              | A_p |
+               -   -
+    Arguments:
+        input_size: first dimension of matrix A.
+        output_size: second dimension of matrix A.
+        bias: If true, add bias. Note that bias is not parallelized.
+        input_is_parallel: If true, we assume that the input is already
+                           split across the GPUs and we do not split
+                           again.
+        skip_bias_add: This was added to enable performance optimization where
+                       bias can be fused with other element-wise operations.
+                       We skip adding bias but instead return it.
+        params_dtype: Data type for the parameters.
+        quant_config: Quantization configure.
+    """
+
+    def __init__(
+        self,
+        input_size: int,
+        output_size: int,
+        bias: bool = True,
+        input_is_parallel: bool = True,
+        skip_bias_add: bool = False,
+        params_dtype: Optional[torch.dtype] = None,
+        reduce_results: bool = True,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        tp_rank: Optional[int] = None,
+        tp_size: Optional[int] = None,
+        use_presharded_weights: bool = False,
+        use_dp_attention_reduce: bool = False,
+    ):
+        quant_config = None if _disable_hip_linear_quant else quant_config
+        super().__init__(
+            input_size, output_size, skip_bias_add, params_dtype, quant_config, prefix
+        )
+
+        self.input_is_parallel = input_is_parallel
+        self.reduce_results = reduce_results
+        self.use_dp_attention_reduce = use_dp_attention_reduce
+
+        # Divide the weight matrix along the last dimension.
+        if tp_rank is None:
+            tp_rank = get_tensor_model_parallel_rank()
+        if tp_size is None:
+            tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank, self.tp_size = tp_rank, tp_size
+        self.input_size_per_partition = divide(input_size, self.tp_size)
+        assert self.quant_method is not None
+        self.use_presharded_weights = use_presharded_weights
+
+        self.quant_method.create_weights(
+            layer=self,
+            input_size_per_partition=self.input_size_per_partition,
+            output_partition_sizes=[self.output_size],
+            input_size=self.input_size,
+            output_size=self.output_size,
+            params_dtype=self.params_dtype,
+            weight_loader=(
+                self.weight_loader_v2
+                if self.quant_method.__class__.__name__ in WEIGHT_LOADER_V2_SUPPORTED
+                else self.weight_loader
+            ),
+        )
+
+        if bias:
+            self.bias = Parameter(torch.zeros(self.output_size, dtype=params_dtype))
+            set_weight_attrs(
+                self.bias,
+                {
+                    "output_dim": 0,
+                    "weight_loader": self.weight_loader,
+                },
+            )
+        else:
+            self.register_parameter("bias", None)
+
+    def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
+        input_dim = getattr(param, "input_dim", None)
+        use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
+
+        # Special case for GGUF
+        is_gguf_weight = getattr(param, "is_gguf_weight", False)
+        is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
+        if is_gguf_weight_type:
+            param.weight_type = loaded_weight.item()
+
+        # Materialize GGUF UninitializedParameter
+        if is_gguf_weight and isinstance(param, UninitializedParameter):
+            weight_shape = list(loaded_weight.shape)
+            if input_dim:
+                weight_shape[input_dim] = weight_shape[input_dim] // self.tp_size
+            param.materialize(tuple(weight_shape), dtype=loaded_weight.dtype)
+
+        param_data = param.data
+        # bitsandbytes loads the weights of the specific portion
+        # no need to narrow here
+        if (
+            input_dim is not None
+            and not use_bitsandbytes_4bit
+            and not self.use_presharded_weights
+        ):
+            shard_size = param_data.shape[input_dim]
+            start_idx = self.tp_rank * shard_size
+
+            if _is_cpu:
+                from sglang.srt.model_loader.weight_utils import (
+                    narrow_padded_param_and_loaded_weight,
+                )
+
+                param_data, loaded_weight = narrow_padded_param_and_loaded_weight(
+                    param_data,
+                    loaded_weight,
+                    0,  # param_data_start
+                    start_idx,
+                    input_dim,
+                    shard_size,
+                )
+            else:
+                # Padding for special case like qwen2_5_VL's mlp which is not 8-aligned
+                end_idx = start_idx + shard_size
+                if end_idx > loaded_weight.shape[input_dim]:
+                    loaded_weight = pad_or_narrow_weight(
+                        loaded_weight, input_dim, start_idx, shard_size
+                    )
+                else:
+                    loaded_weight = loaded_weight.narrow(
+                        input_dim, start_idx, shard_size
+                    )
+
+        # Special case for loading scales off disk, which often do not
+        # have a shape (such as in the case of AutoFP8).
+        if len(loaded_weight.shape) == 0:
+            loaded_weight = loaded_weight.reshape(1)
+
+        assert (
+            param_data.shape == loaded_weight.shape
+        ), f"{param_data.shape=} {loaded_weight.shape=}"
+        param_data.copy_(loaded_weight)
+
+    def weight_loader_v2(self, param: BasevLLMParameter, loaded_weight: torch.Tensor):
+
+        # Special case for loading scales off disk, which often do not
+        # have a shape (such as in the case of AutoFP8).
+        if len(loaded_weight.shape) == 0:
+            assert loaded_weight.numel() == 1
+            loaded_weight = loaded_weight.reshape(1)
+
+        if isinstance(param, RowvLLMParameter):
+            # This `BasevLLMParameter` is defined in sglang/srt/layers/parameter.py,
+            # It supports additional parameters like tp_rank and use_presharded_weights.
+            param.load_row_parallel_weight(
+                loaded_weight,
+                tp_rank=self.tp_rank,
+                use_presharded_weights=self.use_presharded_weights,
+            )
+        else:
+            # `params` is defined in `vllm/model_executor/parameter.py`,
+            # It does not support additional parameters.
+            # However, after QuantizedRL reload, params might still need tp_rank
+            try:
+                param.load_row_parallel_weight(
+                    loaded_weight,
+                    tp_rank=self.tp_rank,
+                    use_presharded_weights=self.use_presharded_weights,
+                )
+            except TypeError:
+                # Fallback for parameters that don't accept additional args
+                param.load_row_parallel_weight(loaded_weight)
+
+    def forward(self, input_, skip_all_reduce=False):
+        if self.input_is_parallel:
+            input_parallel = input_
+        else:
+            splitted_input = split_tensor_along_last_dim(
+                input_, num_partitions=self.tp_size
+            )
+            input_parallel = splitted_input[self.tp_rank].contiguous()
+
+        # Matrix multiply.
+        assert self.quant_method is not None
+        # Only fuse bias add into GEMM for rank 0 (this ensures that
+        # bias will not get added more than once in TP>1 case)
+        bias_ = None if (self.tp_rank > 0 or self.skip_bias_add) else self.bias
+        with use_symmetric_memory(
+            get_tp_group(), disabled=not is_allocation_symmetric()
+        ):
+            output_parallel = self.quant_method.apply(self, input_parallel, bias=bias_)
+
+        if self.reduce_results and self.tp_size > 1 and not skip_all_reduce:
+            if self.use_dp_attention_reduce:
+                output = get_attention_tp_group().all_reduce(output_parallel)
+            else:
+                output = tensor_model_parallel_all_reduce(output_parallel)
+        else:
+            output = output_parallel
+
+        output_bias = self.bias if self.skip_bias_add else None
+
+        return output, output_bias
+
+    def extra_repr(self) -> str:
+        s = f"input_features={self.input_size_per_partition}"
+        s += f", output_features={self.output_size}"
+        s += f", bias={self.bias is not None}"
+        s += f", tp_size={self.tp_size}"
+        s += f", reduce_results={self.reduce_results}"
+        return s
+
+
+class MergedColumnParallelRepeatedLinear(LinearBase):
+    """Merged column parallel linear and repeated linear layer.
+
+    TODO: quantization is not supported yet.
+    Args:
+        input_size: input dimension of the linear layer.
+        column_output_sizes: output dimension of the column linear layers.
+        repeated_output_sizes: output dimension of the repeated linear layers.
+        skip_bias_add: If true, skip adding bias but instead return it.
+        params_dtype: Data type for the parameters.
+        quant_config: Quantization configure.
+    """
+
+    def __init__(
+        self,
+        input_size: int,
+        column_output_sizes: List[int],
+        repeated_output_sizes: List[int],
+        skip_bias_add: bool = False,
+        params_dtype: Optional[torch.dtype] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        output_size = sum(column_output_sizes) + sum(repeated_output_sizes)
+        super().__init__(
+            input_size=input_size,
+            output_size=output_size,
+            skip_bias_add=skip_bias_add,
+            params_dtype=params_dtype,
+            quant_config=quant_config,
+            prefix=prefix,
+        )
+        self.num_column_parallel = len(column_output_sizes)
+        self.tp_rank = get_tensor_model_parallel_rank()
+        self.tp_size = get_tensor_model_parallel_world_size()
+
+        self.output_partition_sizes = [
+            divide(x, self.tp_size) for x in column_output_sizes
+        ] + repeated_output_sizes
+        self.quant_method.create_weights(
+            layer=self,
+            input_size_per_partition=self.input_size,
+            output_partition_sizes=self.output_partition_sizes,
+            input_size=self.input_size,
+            output_size=self.output_size,
+            params_dtype=self.params_dtype,
+            skip_block_quant_check=True,
+            weight_loader=self.weight_loader,
+        )
+
+        self.prefix = prefix
+
+    def forward(self, input_: torch.Tensor) -> torch.Tensor:
+        return self.quant_method.apply(self, input_)
+
+    def weight_loader(
+        self, param: Parameter, loaded_weight: torch.Tensor, loaded_shard_id: int
+    ) -> torch.Tensor:
+        output_dim = param.output_dim
+        shard_offset = sum(self.output_partition_sizes[:loaded_shard_id])
+        shard_size = self.output_partition_sizes[loaded_shard_id]
+        param_data = param.data.narrow(output_dim, shard_offset, shard_size)
+
+        if loaded_shard_id < self.num_column_parallel:
+            start_idx = self.tp_rank * shard_size
+            loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
+
+        param_data.copy_(loaded_weight)
+
+
+class ColumnParallelBatchedLinear(nn.Module):
+    """Column parallel batched linear layer.
+
+    TODO: quantization is not supported yet.
+    Args:
+        batch: batch dimension of the linear layer.
+        input_size: input dimension of the linear layer.
+        output_size: output dimension of the linear layer.
+        dtype: Data type for the parameters.
+    """
+
+    def __init__(
+        self, batch: int, input_size: int, output_size: int, dtype: torch.dtype
+    ):
+        super().__init__()
+        self.tp_rank = get_tensor_model_parallel_rank()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.weight = nn.Parameter(
+            torch.empty(batch, output_size // self.tp_size, input_size, dtype=dtype),
+            requires_grad=False,
+        )
+        setattr(self.weight, "weight_loader", self.weight_loader)
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        return torch.bmm(input, self.weight.transpose(-1, -2))
+
+    def weight_loader(
+        self, param: Parameter, loaded_weight: torch.Tensor, loaded_shard_id: int
+    ) -> torch.Tensor:
+        shard_size = self.weight.shape[-2]
+        start_idx = self.tp_rank * shard_size
+        loaded_weight = loaded_weight.narrow(0, start_idx, shard_size)
+        param.data[loaded_shard_id].copy_(loaded_weight)
diff --git a/sglang/python/sglang/srt/layers/logits_processor.py b/sglang/python/sglang/srt/layers/logits_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..aff05bf4270335d1c1133235979d5a6d8e77fcd7
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/logits_processor.py
@@ -0,0 +1,1124 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Logits processing."""
+
+import dataclasses
+import logging
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+import triton
+import triton.language as tl
+from torch import nn
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_gather,
+)
+from sglang.srt.environ import envs
+from sglang.srt.layers.dp_attention import (
+    DpPaddingMode,
+    attn_tp_all_gather,
+    attn_tp_all_gather_into_tensor,
+    dp_gather_replicate,
+    dp_scatter,
+    get_attention_dp_rank,
+    get_attention_dp_size,
+    get_attention_tp_size,
+    get_dp_device,
+    get_dp_dtype,
+    get_dp_hidden_size,
+)
+from sglang.srt.layers.utils.logprob import (
+    InputLogprobsResult,
+    compute_temp_top_p_normalized_logprobs,
+    get_token_ids_logprobs_chunk,
+    get_token_ids_logprobs_prefill,
+    get_top_logprobs_chunk,
+    get_top_logprobs_prefill,
+)
+from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from sglang.srt.model_executor.forward_batch_info import (
+    CaptureHiddenMode,
+    ForwardBatch,
+    ForwardMode,
+)
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils.common import is_npu, use_intel_amx_backend
+
+logger = logging.getLogger(__name__)
+
+_is_npu = is_npu()
+
+
+@dataclasses.dataclass
+class LogitsProcessorOutput:
+    ## Part 1: This part will be assigned in python/sglang/srt/layers/logits_processor.py::LogitsProcessor
+    # The logits of the next tokens.       shape: [#seq, vocab_size]
+    # Can be None for certain prefill-only requests (e.g., multi-item scoring) that don't need next token generation
+    next_token_logits: Optional[torch.Tensor]
+    # Used by speculative decoding (EAGLE)
+    # The last hidden layers
+    hidden_states: Optional[torch.Tensor] = None
+
+    ## Part 2: This part will be assigned in python/sglang/srt/layers/sampler.py::Sampler
+    # he log probs of output tokens, if SGLANG_RETURN_ORIGINAL_LOGPROB = True, will get the log probs before applying temperature. If False, will get the log probs before applying temperature.
+    next_token_logprobs: Optional[torch.Tensor] = None
+    # The logprobs and ids of the top-k tokens in output positions. shape: [#seq, k]
+    next_token_top_logprobs_val: Optional[List] = None
+    next_token_top_logprobs_idx: Optional[List] = None
+    # The logprobs and ids of the requested token ids in output positions. shape: [#seq, n] (n is the number of requested token ids)
+    # Can contain either lists or GPU tensors (for delayed copy optimization in prefill-only requests)
+    next_token_token_ids_logprobs_val: Optional[
+        List[Union[List[float], torch.Tensor]]
+    ] = None
+    next_token_token_ids_logprobs_idx: Optional[List] = None
+
+    ## Part 3: Prefill-only. This part will be assigned in python/sglang/srt/layers/logits_processor.py::LogitsProcessor
+    # The logprobs of input tokens.        shape: [#token]
+    input_token_logprobs: Optional[torch.Tensor] = None
+    # The logprobs and ids of the top-k tokens in input positions.  shape: [#seq, #token, k]
+    input_top_logprobs_val: Optional[List] = None
+    input_top_logprobs_idx: Optional[List] = None
+    # The logprobs and ids of the requested token ids in input positions. shape: [#seq, n] (n is the number of requested token ids)
+    # Can contain either lists or GPU tensors (for delayed GPU-to-CPU transfer optimization)
+    input_token_ids_logprobs_val: Optional[List[Union[List[float], torch.Tensor]]] = (
+        None
+    )
+    input_token_ids_logprobs_idx: Optional[List] = None
+
+    ## Part 4: Diffusion LLM only.
+    full_logits: Optional[torch.Tensor] = None
+
+    ## Part 5: Customized Info
+    customized_info: Optional[Dict[str, List[Any]]] = None
+
+    mm_input_embeds: Optional[torch.Tensor] = None
+
+
+@dataclasses.dataclass
+class LogitsMetadata:
+    forward_mode: ForwardMode
+    capture_hidden_mode: CaptureHiddenMode = CaptureHiddenMode.NULL
+    next_token_logits_buffer: Optional[torch.Tensor] = None
+
+    extend_return_logprob: bool = False
+    extend_return_top_logprob: bool = False
+    extend_token_ids_logprob: bool = False
+    extend_seq_lens: Optional[torch.Tensor] = None
+    extend_seq_lens_cpu: Optional[List[int]] = None
+    extend_logprob_start_lens_cpu: Optional[List[int]] = None
+    extend_logprob_pruned_lens_cpu: Optional[List[int]] = None
+    top_logprobs_nums: Optional[List[int]] = None
+    extend_input_logprob_token_ids_gpu: Optional[torch.Tensor] = None
+    token_ids_logprobs: Optional[List[List[int]]] = None
+
+    # logits and logprobs post processing
+    temp_scaled_logprobs: bool = False
+    temperature: torch.Tensor = None
+    top_p_normalized_logprobs: bool = False
+    top_p: torch.Tensor = None
+
+    # DP attention metadata. Not needed when DP attention is not used.
+    # Number of tokens in the request.
+    global_num_tokens_gpu: Optional[torch.Tensor] = None
+    # The start position of local hidden states.
+    dp_local_start_pos: Optional[torch.Tensor] = None
+    dp_local_num_tokens: Optional[torch.Tensor] = None
+    global_dp_buffer_len: Optional[int] = None
+    # Number of tokens to sample per DP rank
+    global_num_tokens_for_logprob_cpu: Optional[torch.Tensor] = None
+    global_num_tokens_for_logprob_gpu: Optional[torch.Tensor] = None
+    # The gather mode for DP attention
+    dp_padding_mode: Optional[DpPaddingMode] = None
+    # for padding
+    padded_static_len: int = -1
+
+    # Whether this batch is prefill-only (no token generation needed)
+    is_prefill_only: bool = False
+
+    mm_input_embeds: Optional[torch.Tensor] = None
+
+    @classmethod
+    def from_forward_batch(cls, forward_batch: ForwardBatch):
+        if (
+            forward_batch.forward_mode.is_extend()
+            and forward_batch.return_logprob
+            and not forward_batch.forward_mode.is_target_verify()
+        ):
+            extend_return_top_logprob = any(
+                x > 0 for x in forward_batch.top_logprobs_nums
+            )
+            extend_token_ids_logprob = any(
+                x is not None for x in forward_batch.token_ids_logprobs
+            )
+            extend_return_logprob = False
+            extend_logprob_pruned_lens_cpu = []
+            for extend_len, start_len in zip(
+                forward_batch.extend_seq_lens_cpu,
+                forward_batch.extend_logprob_start_lens_cpu,
+            ):
+                if extend_len - start_len > 0:
+                    extend_return_logprob = True
+                extend_logprob_pruned_lens_cpu.append(extend_len - start_len)
+        else:
+            extend_return_logprob = extend_return_top_logprob = (
+                extend_token_ids_logprob
+            ) = extend_logprob_pruned_lens_cpu = False
+
+        return cls(
+            forward_mode=forward_batch.forward_mode,
+            capture_hidden_mode=forward_batch.capture_hidden_mode,
+            next_token_logits_buffer=forward_batch.next_token_logits_buffer,
+            extend_return_logprob=extend_return_logprob,
+            extend_return_top_logprob=extend_return_top_logprob,
+            extend_token_ids_logprob=extend_token_ids_logprob,
+            extend_seq_lens=forward_batch.extend_seq_lens,
+            extend_seq_lens_cpu=forward_batch.extend_seq_lens_cpu,
+            extend_logprob_start_lens_cpu=forward_batch.extend_logprob_start_lens_cpu,
+            extend_logprob_pruned_lens_cpu=extend_logprob_pruned_lens_cpu,
+            top_logprobs_nums=forward_batch.top_logprobs_nums,
+            token_ids_logprobs=forward_batch.token_ids_logprobs,
+            extend_input_logprob_token_ids_gpu=forward_batch.extend_input_logprob_token_ids_gpu,
+            padded_static_len=forward_batch.padded_static_len,
+            is_prefill_only=forward_batch.is_prefill_only,
+            global_num_tokens_gpu=forward_batch.global_num_tokens_gpu,
+            dp_local_start_pos=forward_batch.dp_local_start_pos,
+            dp_local_num_tokens=forward_batch.dp_local_num_tokens,
+            global_dp_buffer_len=forward_batch.global_dp_buffer_len,
+            global_num_tokens_for_logprob_cpu=forward_batch.global_num_tokens_for_logprob_cpu,
+            global_num_tokens_for_logprob_gpu=forward_batch.global_num_tokens_for_logprob_gpu,
+            dp_padding_mode=DpPaddingMode.SUM_LEN,
+            mm_input_embeds=forward_batch.mm_input_embeds,
+        )
+
+    def compute_dp_attention_metadata(self):
+
+        cumtokens = torch.cumsum(self.global_num_tokens_for_logprob_gpu, dim=0)
+        dp_rank = get_attention_dp_rank()
+        if dp_rank == 0:
+            dp_local_start_pos = torch.zeros_like(
+                self.global_num_tokens_for_logprob_gpu[0]
+            )
+        else:
+            dp_local_start_pos = cumtokens[dp_rank - 1]
+
+        self.dp_local_start_pos = dp_local_start_pos
+        self.dp_local_num_tokens = self.global_num_tokens_for_logprob_gpu[dp_rank]
+
+        hidden_size = get_dp_hidden_size()
+        dtype = get_dp_dtype()
+        device = get_dp_device()
+
+        if self.global_num_tokens_for_logprob_cpu is not None:
+            # create a smaller buffer to reduce peak memory usage
+            self.global_dp_buffer_len = sum(self.global_num_tokens_for_logprob_cpu)
+        else:
+            self.global_dp_buffer_len = self.global_dp_buffer_len
+
+        self.gathered_buffer = torch.empty(
+            (
+                self.global_dp_buffer_len,
+                hidden_size,
+            ),
+            dtype=dtype,
+            device=device,
+        )
+
+
+class LogitsProcessor(nn.Module):
+    def __init__(
+        self,
+        config,
+        skip_all_gather: bool = False,
+        logit_scale: Optional[float] = None,
+        return_full_logits: bool = False,
+    ):
+        super().__init__()
+        self.config = config
+        self.vocab_size = config.vocab_size
+        self.logit_scale = logit_scale
+        self.use_attn_tp_group = get_global_server_args().enable_dp_lm_head
+        self.use_fp32_lm_head = get_global_server_args().enable_fp32_lm_head
+        if self.use_attn_tp_group:
+            self.attn_tp_size = get_attention_tp_size()
+            self.do_tensor_parallel_all_gather = (
+                not skip_all_gather and self.attn_tp_size > 1
+            )
+            self.do_tensor_parallel_all_gather_dp_attn = False
+        else:
+            self.do_tensor_parallel_all_gather = (
+                not skip_all_gather and get_tensor_model_parallel_world_size() > 1
+            )
+            self.do_tensor_parallel_all_gather_dp_attn = (
+                self.do_tensor_parallel_all_gather and get_attention_dp_size() != 1
+            )
+        self.final_logit_softcapping = getattr(
+            self.config, "final_logit_softcapping", None
+        )
+        if (
+            self.final_logit_softcapping is not None
+            and self.final_logit_softcapping < 0
+        ):
+            self.final_logit_softcapping = None
+
+        self.return_full_logits = return_full_logits
+        self.multi_item_delimiter = (
+            get_global_server_args().multi_item_scoring_delimiter
+        )
+
+        # enable chunked logprobs processing
+        self.enable_logprobs_chunk = envs.SGLANG_ENABLE_LOGITS_PROCESSER_CHUNK.get()
+        # chunk size for logprobs processing
+        self.logprobs_chunk_size = envs.SGLANG_LOGITS_PROCESSER_CHUNK_SIZE.get()
+
+    def forward(
+        self,
+        input_ids,
+        hidden_states,
+        lm_head: VocabParallelEmbedding,
+        logits_metadata: Union[LogitsMetadata, ForwardBatch],
+        aux_hidden_states: Optional[torch.Tensor] = None,
+        hidden_states_before_norm: Optional[torch.Tensor] = None,
+    ) -> LogitsProcessorOutput:
+        if isinstance(logits_metadata, ForwardBatch):
+            logits_metadata = LogitsMetadata.from_forward_batch(logits_metadata)
+
+        # Multi-item scoring only for prefill-only requests.
+        if self.multi_item_delimiter is not None and logits_metadata.is_prefill_only:
+            return self.compute_logprobs_for_multi_item_scoring(
+                input_ids,
+                hidden_states,
+                lm_head,
+                logits_metadata,
+                self.multi_item_delimiter,
+            )
+
+        # Diffusion LLM only.
+        if logits_metadata.forward_mode.is_dllm_extend():
+            return self._get_dllm_logits(hidden_states, lm_head, logits_metadata)
+
+        # Get the last hidden states and last logits for the next token prediction
+        (
+            pruned_states,
+            pruned_states_before_norm,
+            aux_pruned_states,
+            sample_indices,
+            input_logprob_indices,
+            token_to_seq_idx,
+        ) = self._get_pruned_states(
+            hidden_states,
+            hidden_states_before_norm,
+            aux_hidden_states,
+            logits_metadata,
+        )
+
+        hidden_states_to_store = self._get_hidden_states_to_store(
+            hidden_states,
+            hidden_states_before_norm,
+            aux_hidden_states,
+            pruned_states,
+            pruned_states_before_norm,
+            aux_pruned_states,
+            sample_indices,
+            logits_metadata,
+        )
+        del hidden_states
+
+        if not logits_metadata.extend_return_logprob:
+            # Compute logits for both input and sampled tokens.
+            logits = self._get_logits(pruned_states, lm_head, logits_metadata)
+            sampled_logits = (
+                logits[sample_indices] if sample_indices is not None else logits
+            )
+
+            # Decode mode or extend mode without return_logprob.
+            return LogitsProcessorOutput(
+                next_token_logits=sampled_logits,
+                hidden_states=hidden_states_to_store,
+                mm_input_embeds=logits_metadata.mm_input_embeds,
+            )
+
+        # Start to process input logprobs
+        # Normalize the logprob w/o temperature, top-p
+        self._expand_metadata_for_logprobs(logits_metadata, pruned_states.device)
+
+        # Determine whether to use chunked or non-chunked logits processing.
+        # Skip chunking if:
+        # 1. Chunking is disabled
+        # 2. Total count is below chunk size threshold
+        # 3. DP attention all-gather is enabled (can use "enable_dp_lm_head" to enable chunking)
+        should_skip_chunking = (
+            not self.enable_logprobs_chunk
+            or pruned_states.shape[0] <= self.logprobs_chunk_size
+            or self.do_tensor_parallel_all_gather_dp_attn
+        )
+
+        if should_skip_chunking:
+            # Compute logits for both input and sampled tokens.
+            logits = self._get_logits(pruned_states, lm_head, logits_metadata)
+            sampled_logits = (
+                logits[sample_indices] if sample_indices is not None else logits
+            )
+            input_logits = logits[input_logprob_indices]
+            del logits
+
+            logprobs_result = self.process_input_logprobs(input_logits, logits_metadata)
+        else:
+            logprobs_result, sampled_logits = self.process_input_logprobs_by_chunk(
+                pruned_states,
+                sample_indices,
+                input_logprob_indices,
+                token_to_seq_idx,
+                lm_head,
+                logits_metadata,
+            )
+
+        return LogitsProcessorOutput(
+            next_token_logits=sampled_logits,
+            hidden_states=hidden_states_to_store,
+            input_token_logprobs=logprobs_result.input_token_logprobs,
+            input_top_logprobs_val=logprobs_result.input_top_logprobs_val,
+            input_top_logprobs_idx=logprobs_result.input_top_logprobs_idx,
+            input_token_ids_logprobs_val=logprobs_result.input_token_ids_logprobs_val,
+            input_token_ids_logprobs_idx=logprobs_result.input_token_ids_logprobs_idx,
+            mm_input_embeds=logits_metadata.mm_input_embeds,
+        )
+
+    def _get_pruned_states(
+        self,
+        hidden_states: torch.Tensor,
+        hidden_states_before_norm: Optional[torch.Tensor],
+        aux_hidden_states: Optional[torch.Tensor],
+        logits_metadata: LogitsMetadata,
+    ):
+        pruned_states_before_norm: Optional[torch.Tensor] = None
+        aux_pruned_states = None
+        token_to_seq_idx = []
+
+        if (
+            logits_metadata.forward_mode.is_decode_or_idle()
+            or logits_metadata.forward_mode.is_target_verify()
+            or logits_metadata.forward_mode.is_draft_extend_v2()
+        ):
+            pruned_states = hidden_states
+            pruned_states_before_norm = hidden_states_before_norm
+            if aux_hidden_states is not None:
+                aux_pruned_states = [hidden for hidden in aux_hidden_states]
+            sample_indices = None
+            input_logprob_indices = None
+
+        elif (
+            logits_metadata.forward_mode.is_extend()
+            and not logits_metadata.extend_return_logprob
+        ):
+            # Prefill without input logprobs.
+            if logits_metadata.padded_static_len < 0:
+                last_index = torch.cumsum(logits_metadata.extend_seq_lens, dim=0) - 1
+            else:
+                # If padding_static length is 5 and extended_seq_lens is [2, 3],
+                # then our batch looks like [t00, t01, p, p, p, t10, t11, t12, p, p]
+                # and this retrieves t01 and t12, which are the valid last tokens
+                idx = torch.arange(
+                    len(logits_metadata.extend_seq_lens),
+                    device=logits_metadata.extend_seq_lens.device,
+                )
+                last_index = (
+                    idx * logits_metadata.padded_static_len
+                    + logits_metadata.extend_seq_lens
+                    - 1
+                )
+            pruned_states = hidden_states[last_index]
+            if hidden_states_before_norm is not None:
+                pruned_states_before_norm = hidden_states_before_norm[last_index]
+            if aux_hidden_states is not None:
+                aux_pruned_states = [hidden[last_index] for hidden in aux_hidden_states]
+            sample_indices = None
+            input_logprob_indices = None
+        else:
+            # Prefill with input logprobs.
+            # Find 4 different indices.
+            # 1. pruned_states: hidden states that we want logprobs from.
+            # 2. sample_indices: Indices that have sampled tokens.
+            # 3. input_logprob_indices: Indices that have input logprob tokens.
+            # 4. token_to_seq_idx: map each token to its sequence index
+            #
+            # Example
+            # -------
+            # Suppose a batch (flattened by sequence):
+            # [t00, t01, t02, t03, t10, t11, t12, t13, t14, t20, t21, t22, t23, t24, t25]
+            # extend_seq_lens_cpu           = [4, 5, 6]
+            # extend_logprob_start_lens_cpu = [0, 5, 3]
+            #
+            # Then, the indices are:
+            # pruned_states         -> [t00, t01, t02, t03, t14, t23, t24, t25]
+            # sample_indices        -> [3, 4, 7]
+            # input_logprob_indices -> [0, 1, 2, 3, 5, 6, 7]
+            # token_to_seq_idx      -> [0, 0, 0, 0, 1, 2, 2, 2]
+            #
+            # If chunk is enabled and chunk_size = 3, the chunks will be computed in a chunked manner:
+            # [t00, t01, t02], [t03, t14, t23], [t24, t25]
+
+            sample_index_pt = -1
+            sample_indices = []
+            input_logprob_indices_pt = 0
+            input_logprob_indices = []
+            pt, pruned_states_list, pruned_states_before_norm_list = 0, [], []
+
+            for idx, (extend_logprob_start_len, extend_len) in enumerate(
+                zip(
+                    logits_metadata.extend_logprob_start_lens_cpu,
+                    logits_metadata.extend_seq_lens_cpu,
+                )
+            ):
+                # It can happen in chunked prefill. We still need to sample 1 token,
+                # But we don't want to include it in input logprob.
+                if extend_len == extend_logprob_start_len:
+                    start_len = extend_logprob_start_len - 1
+                else:
+                    start_len = extend_logprob_start_len
+
+                # We always need at least 1 token to sample because that's required
+                # by a caller.
+                assert extend_len > start_len
+                pruned_states_list.append(
+                    hidden_states[pt + start_len : pt + extend_len]
+                )
+                if hidden_states_before_norm is not None:
+                    pruned_states_before_norm_list.append(
+                        hidden_states_before_norm[pt + start_len : pt + extend_len]
+                    )
+                # Map each token to its sequence index, for chunked computation
+                # of input logprobs
+                token_to_seq_idx.extend([idx] * (extend_len - start_len))
+                pt += extend_len
+                sample_index_pt += extend_len - start_len
+                sample_indices.append(sample_index_pt)
+                input_logprob_indices.extend(
+                    [
+                        input_logprob_indices_pt + i
+                        for i in range(extend_len - extend_logprob_start_len)
+                    ]
+                )
+                input_logprob_indices_pt += extend_len - start_len
+
+            # Set the last token of the last sequence
+            token_to_seq_idx.append(len(logits_metadata.extend_seq_lens_cpu) - 1)
+            pruned_states = torch.cat(pruned_states_list)
+            if hidden_states_before_norm is not None:
+                pruned_states_before_norm = torch.cat(pruned_states_before_norm_list)
+            sample_indices = torch.tensor(
+                sample_indices, device=pruned_states.device, dtype=torch.int64
+            )
+            input_logprob_indices = torch.tensor(
+                input_logprob_indices, device=pruned_states.device, dtype=torch.int64
+            )
+
+        return (
+            pruned_states,
+            pruned_states_before_norm,
+            aux_pruned_states,
+            sample_indices,
+            input_logprob_indices,
+            token_to_seq_idx,
+        )
+
+    def _get_hidden_states_to_store(
+        self,
+        hidden_states: torch.Tensor,
+        hidden_states_before_norm: Optional[torch.Tensor],
+        aux_hidden_states: Optional[List[torch.Tensor]],
+        pruned_states: torch.Tensor,
+        pruned_states_before_norm: Optional[torch.Tensor],
+        aux_pruned_states: Optional[List[torch.Tensor]],
+        sample_indices: Optional[torch.Tensor],
+        logits_metadata: LogitsMetadata,
+    ) -> Optional[torch.Tensor]:
+        hidden_states_to_store: Optional[torch.Tensor] = None
+        hidden_states_to_store_before_norm: Optional[torch.Tensor] = None
+        if logits_metadata.capture_hidden_mode.need_capture():
+            if logits_metadata.capture_hidden_mode.is_full():
+                if aux_hidden_states is not None:
+                    aux_hidden_states = torch.cat(aux_hidden_states, dim=-1)
+                    hidden_states_to_store = aux_hidden_states
+                else:
+                    hidden_states_to_store = hidden_states
+                hidden_states_to_store_before_norm = hidden_states_before_norm
+            elif logits_metadata.capture_hidden_mode.is_last():
+                # Get the last token hidden states. If sample_indices is None,
+                # pruned states only contain the last tokens already.
+                if aux_hidden_states is not None:
+                    aux_pruned_states = torch.cat(aux_pruned_states, dim=-1)
+                    hidden_states_to_store = (
+                        aux_pruned_states[sample_indices]
+                        if sample_indices is not None
+                        else aux_pruned_states
+                    )
+                else:
+                    hidden_states_to_store = (
+                        pruned_states[sample_indices]
+                        if sample_indices is not None
+                        else pruned_states
+                    )
+                    if hidden_states_before_norm is not None:
+                        hidden_states_to_store_before_norm = (
+                            pruned_states_before_norm[sample_indices]
+                            if sample_indices is not None
+                            else pruned_states_before_norm
+                        )
+            else:
+                assert False, "Should never reach"
+
+        if hidden_states_to_store_before_norm is not None:
+            # NOTE: when hidden_states_before_norm is provided, we always
+            # prefer to return it.
+            hidden_states_to_store = hidden_states_to_store_before_norm
+
+        return hidden_states_to_store
+
+    def _expand_metadata_for_logprobs(
+        self, logits_metadata: LogitsMetadata, device: torch.device
+    ):
+        pruned_lens = torch.tensor(
+            logits_metadata.extend_logprob_pruned_lens_cpu,
+            device=device,
+        )
+        if logits_metadata.temp_scaled_logprobs:
+            logits_metadata.temperature = torch.repeat_interleave(
+                logits_metadata.temperature.view(-1),
+                pruned_lens,
+            ).view(-1, 1)
+        if logits_metadata.top_p_normalized_logprobs:
+            logits_metadata.top_p = torch.repeat_interleave(
+                logits_metadata.top_p,
+                pruned_lens,
+            )
+
+    def process_input_logprobs(self, input_logits, logits_metadata: LogitsMetadata):
+        input_logprobs = compute_temp_top_p_normalized_logprobs(
+            input_logits, logits_metadata
+        )
+
+        # Get the logprob of top-k tokens
+        if logits_metadata.extend_return_top_logprob:
+            (
+                input_top_logprobs_val,
+                input_top_logprobs_idx,
+            ) = get_top_logprobs_prefill(input_logprobs, logits_metadata)
+        else:
+            input_top_logprobs_val = input_top_logprobs_idx = None
+
+        # Get the logprob of given token id
+        if logits_metadata.extend_token_ids_logprob:
+            (
+                input_token_ids_logprobs_val,
+                input_token_ids_logprobs_idx,
+            ) = get_token_ids_logprobs_prefill(input_logprobs, logits_metadata)
+        else:
+            input_token_ids_logprobs_val = input_token_ids_logprobs_idx = None
+
+        input_token_logprobs = input_logprobs[
+            torch.arange(input_logprobs.shape[0], device=input_logprobs.device),
+            logits_metadata.extend_input_logprob_token_ids_gpu,
+        ]
+
+        return InputLogprobsResult(
+            input_token_logprobs=input_token_logprobs,
+            input_top_logprobs_val=input_top_logprobs_val,
+            input_top_logprobs_idx=input_top_logprobs_idx,
+            input_token_ids_logprobs_val=input_token_ids_logprobs_val,
+            input_token_ids_logprobs_idx=input_token_ids_logprobs_idx,
+        )
+
+    def process_input_logprobs_by_chunk(
+        self,
+        pruned_states: torch.Tensor,
+        sample_indices: torch.Tensor,
+        input_logprob_indices: torch.Tensor,
+        token_to_seq_idx: list[int],
+        lm_head: VocabParallelEmbedding,
+        logits_metadata: LogitsMetadata,
+    ) -> Tuple[InputLogprobsResult, torch.Tensor]:
+        """
+        compute logprobs for the output token from the hidden states.
+        To avoid using too much memory, we split pruned_states into chunks of
+        rows to compute input_logprobs separately, then concatenate the results.
+
+        Returns:
+            InputLogprobsResult: logprobs result
+            torch.Tensor: sampled logits
+        """
+
+        # The peak memory usage is proportional to the chunk size.
+        chunk_size = self.logprobs_chunk_size
+        total_size = pruned_states.shape[0]
+        num_chunks = (total_size + chunk_size - 1) // chunk_size
+
+        input_token_logprobs = []
+        if logits_metadata.extend_return_top_logprob:
+            input_top_logprobs_val = []
+            input_top_logprobs_idx = []
+        else:
+            input_top_logprobs_val = None
+            input_top_logprobs_idx = None
+        if logits_metadata.extend_token_ids_logprob:
+            input_token_ids_logprobs_val = []
+            input_token_ids_logprobs_idx = []
+        else:
+            input_token_ids_logprobs_val = None
+            input_token_ids_logprobs_idx = None
+
+        # If a single sequence is split into multiple chunks, we need to keep track
+        # of the pruned length of the sequences in the previous chunks.
+        split_len_topk = 0
+        split_len_token_ids = 0
+
+        for i in range(num_chunks):
+            start_idx = i * chunk_size
+            end_idx = min((i + 1) * chunk_size, total_size)
+
+            # Get indices for this chunk
+            chunk_mask = (input_logprob_indices >= start_idx) & (
+                input_logprob_indices < end_idx
+            )
+            global_indices = input_logprob_indices[chunk_mask]
+            chunk_indices = global_indices - start_idx
+            # Get the positions in the original array where chunk_mask is True
+            # This is needed to correctly index into extend_input_logprob_token_ids_gpu
+            mask_indices = torch.nonzero(chunk_mask, as_tuple=True)[0]
+
+            # Get the logits for this chunk
+            chunk_states = pruned_states[start_idx:end_idx]
+            chunk_logits = self._get_logits(chunk_states, lm_head, logits_metadata)
+
+            # Initialize sampled_logits on first chunk
+            if i == 0:
+                sampled_logits = torch.empty(
+                    (sample_indices.shape[0], chunk_logits.shape[1]),
+                    dtype=chunk_logits.dtype,
+                    device=chunk_logits.device,
+                )
+
+            # Handle sampled logits for the chunk if needed
+            # This must be done before the continue statement to ensure all sampled_logits are filled
+            chunk_sample_mask = (sample_indices >= start_idx) & (
+                sample_indices < end_idx
+            )
+            if chunk_sample_mask.any():
+                chunk_sample_indices = sample_indices[chunk_sample_mask] - start_idx
+                sampled_logits[chunk_sample_mask] = chunk_logits[chunk_sample_indices]
+
+            # If there are no input logprobs in this chunk, skip the rest
+            if chunk_indices.numel() == 0:
+                continue
+
+            # Compute the logprobs of the chunk
+            chunk_input_logprobs = chunk_logits[chunk_indices]
+            # Only index per-token arrays when the corresponding feature is active.
+            # Otherwise these tensors can be per-sequence (or scalars), which can
+            # cause out-of-bounds indexing on GPU.
+            chunk_temperature = (
+                logits_metadata.temperature[global_indices]
+                if logits_metadata.temp_scaled_logprobs
+                and logits_metadata.temperature is not None
+                else None
+            )
+            chunk_top_p = (
+                logits_metadata.top_p[global_indices]
+                if logits_metadata.top_p_normalized_logprobs
+                and logits_metadata.top_p is not None
+                else None
+            )
+            chunk_input_logprobs = compute_temp_top_p_normalized_logprobs(
+                chunk_input_logprobs,
+                logits_metadata,
+                chunk_top_p,
+                chunk_temperature,
+            )
+
+            # For each chunk, we need to get the slice of the token_to_seq_idx
+            chunk_slice = slice(
+                token_to_seq_idx[start_idx], token_to_seq_idx[end_idx] + 1
+            )
+
+            # Get the logprob of top-k tokens
+            if logits_metadata.extend_return_top_logprob:
+                top_k_nums = logits_metadata.top_logprobs_nums[chunk_slice]
+                pruned_lens = logits_metadata.extend_logprob_pruned_lens_cpu[
+                    chunk_slice
+                ]
+                split_len_topk = get_top_logprobs_chunk(
+                    chunk_input_logprobs,
+                    logits_metadata,
+                    top_k_nums,
+                    pruned_lens,
+                    input_top_logprobs_val,
+                    input_top_logprobs_idx,
+                    split_len_topk,
+                )
+
+            # Get the logprob of given token id
+            if logits_metadata.extend_token_ids_logprob:
+                token_ids_logprobs = logits_metadata.token_ids_logprobs[chunk_slice]
+                pruned_lens = logits_metadata.extend_logprob_pruned_lens_cpu[
+                    chunk_slice
+                ]
+                split_len_token_ids = get_token_ids_logprobs_chunk(
+                    chunk_input_logprobs,
+                    token_ids_logprobs,
+                    pruned_lens,
+                    input_token_ids_logprobs_val,
+                    input_token_ids_logprobs_idx,
+                    split_len_token_ids,
+                )
+
+            # Get the logprob of the requested token ids
+            chunk_input_token_logprobs = chunk_input_logprobs[
+                torch.arange(
+                    chunk_input_logprobs.shape[0], device=chunk_input_logprobs.device
+                ),
+                logits_metadata.extend_input_logprob_token_ids_gpu[mask_indices],
+            ]
+            input_token_logprobs.append(chunk_input_token_logprobs)
+
+        # Concatenate the results
+        input_token_logprobs = torch.cat(input_token_logprobs, dim=0)
+
+        return (
+            InputLogprobsResult(
+                input_token_logprobs=input_token_logprobs,
+                input_top_logprobs_val=input_top_logprobs_val,
+                input_top_logprobs_idx=input_top_logprobs_idx,
+                input_token_ids_logprobs_val=input_token_ids_logprobs_val,
+                input_token_ids_logprobs_idx=input_token_ids_logprobs_idx,
+            ),
+            sampled_logits,
+        )
+
+    def _get_logits(
+        self,
+        hidden_states: torch.Tensor,
+        lm_head: VocabParallelEmbedding,
+        logits_metadata: LogitsMetadata,
+        embedding_bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Get logits from hidden_states.
+
+        If sampled_logits_only is True, it means hidden_states only contain the
+        last position (e.g., extend without input logprobs). The caller should
+        guarantee the given hidden_states follow this constraint.
+        """
+        hidden_states, local_hidden_states = self._gather_dp_attn_hidden_states(
+            hidden_states, logits_metadata
+        )
+
+        logits = self._compute_lm_head(hidden_states, lm_head, embedding_bias)
+
+        if self.logit_scale is not None:
+            logits.mul_(self.logit_scale)
+
+        if self.do_tensor_parallel_all_gather:
+            if self.use_attn_tp_group:
+                logits = self._gather_attn_tp_logits(logits)
+            else:
+                logits = tensor_model_parallel_all_gather(logits)
+
+        logits = self._scatter_dp_attn_logits(
+            logits, local_hidden_states, logits_metadata
+        )
+
+        logits = self._copy_logits_to_buffer(logits, logits_metadata)
+
+        if self.final_logit_softcapping:
+            if not _is_npu:
+                fused_softcap(logits, self.final_logit_softcapping)
+            else:
+                logits = self.final_logit_softcapping * torch.tanh(
+                    logits / self.final_logit_softcapping
+                )
+
+        return logits
+
+    def _compute_lm_head(
+        self,
+        hidden_states: torch.Tensor,
+        lm_head: VocabParallelEmbedding,
+        embedding_bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if hasattr(lm_head, "set_lora") and hasattr(lm_head, "apply_lora"):
+            # This is a LoRA-wrapped module, use its forward method
+            logits = lm_head(hidden_states)
+        elif hasattr(lm_head, "weight"):
+            # Normal linear layer
+            if self.use_fp32_lm_head:
+                logits = torch.matmul(
+                    hidden_states.to(torch.float32), lm_head.weight.to(torch.float32).T
+                )
+            elif use_intel_amx_backend(lm_head):
+                logits = torch.ops.sgl_kernel.weight_packed_linear(
+                    hidden_states.to(lm_head.weight.dtype),
+                    lm_head.weight,
+                    None,  # bias
+                    True,  # is_vnni
+                )
+            elif get_global_server_args().rl_on_policy_target is not None:
+                # Due to tie-weight, we may not be able to change lm_head's weight dtype
+                logits = torch.matmul(
+                    hidden_states.bfloat16(), lm_head.weight.T.bfloat16()
+                )
+            else:
+                logits = torch.matmul(
+                    hidden_states.to(lm_head.weight.dtype), lm_head.weight.T
+                )
+        else:
+            # GGUF models
+            # TODO: use weight_packed_linear for GGUF models
+            if self.use_fp32_lm_head:
+                with torch.cuda.amp.autocast(enabled=False):
+                    logits = lm_head.quant_method.apply(
+                        lm_head, hidden_states.to(torch.float32), embedding_bias
+                    )
+            else:
+                logits = lm_head.quant_method.apply(
+                    lm_head, hidden_states, embedding_bias
+                )
+        return logits
+
+    def _gather_dp_attn_hidden_states(
+        self, hidden_states: torch.Tensor, logits_metadata: LogitsMetadata
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if self.do_tensor_parallel_all_gather_dp_attn:
+            logits_metadata.compute_dp_attention_metadata()
+            local_hidden_states = hidden_states
+            hidden_states = logits_metadata.gathered_buffer
+            dp_gather_replicate(hidden_states, local_hidden_states, logits_metadata)
+            return hidden_states, local_hidden_states
+        return hidden_states, hidden_states
+
+    def _gather_attn_tp_logits(self, logits: torch.Tensor) -> torch.Tensor:
+        if self.vocab_size % self.attn_tp_size == 0:
+            global_logits = torch.empty(
+                (
+                    self.attn_tp_size,
+                    logits.shape[0],
+                    self.vocab_size // self.attn_tp_size,
+                ),
+                device=logits.device,
+                dtype=logits.dtype,
+            )
+            attn_tp_all_gather_into_tensor(global_logits, logits)
+            global_logits = global_logits.permute(1, 0, 2).reshape(
+                logits.shape[0], self.vocab_size
+            )
+        else:
+            global_logits = torch.empty(
+                (self.vocab_size, logits.shape[0]),
+                device=logits.device,
+                dtype=logits.dtype,
+            )
+            global_logits = global_logits.T
+            attn_tp_all_gather(
+                list(global_logits.tensor_split(self.attn_tp_size, dim=-1)),
+                logits,
+            )
+        return global_logits
+
+    def _scatter_dp_attn_logits(
+        self,
+        logits: torch.Tensor,
+        local_hidden_states: torch.Tensor,
+        logits_metadata: LogitsMetadata,
+    ) -> torch.Tensor:
+        if self.do_tensor_parallel_all_gather_dp_attn:
+            global_logits = logits
+            logits = torch.empty(
+                (local_hidden_states.shape[0], global_logits.shape[1]),
+                device=global_logits.device,
+                dtype=global_logits.dtype,
+            )
+            dp_scatter(logits, global_logits, logits_metadata)
+        return logits
+
+    def _copy_logits_to_buffer(
+        self, logits: torch.Tensor, logits_metadata: LogitsMetadata
+    ) -> torch.Tensor:
+        if logits_metadata.next_token_logits_buffer is not None:
+            logits_buffer = logits_metadata.next_token_logits_buffer
+            assert logits_buffer.dtype == torch.float
+            logits_buffer.copy_(logits[:, : self.vocab_size])
+            logits = logits_buffer
+        else:
+            logits = logits[:, : self.vocab_size].float()
+        return logits
+
+    def _get_dllm_logits(
+        self,
+        hidden_states: torch.Tensor,
+        lm_head: VocabParallelEmbedding,
+        logits_metadata: LogitsMetadata,
+    ) -> LogitsProcessorOutput:
+        assert self.return_full_logits
+        full_logits = self._get_logits(hidden_states, lm_head, logits_metadata)
+        return LogitsProcessorOutput(
+            full_logits=full_logits,
+            next_token_logits=None,
+        )
+
+    def compute_logprobs_for_multi_item_scoring(
+        self,
+        input_ids,
+        hidden_states,
+        lm_head: VocabParallelEmbedding,
+        logits_metadata: Union[LogitsMetadata, ForwardBatch],
+        delimiter_token: int,
+    ):
+        """
+        Compute logprobs for multi-item scoring using delimiter-based token extraction.
+
+        This method is designed for scenarios where you want to score multiple items/candidates
+        against a single query by combining them into one sequence separated by delimiters.
+
+        Sequence format: Query<delimiter>Item1<delimiter>Item2<delimiter>...
+        Scoring positions: Extracts logprobs at positions before each <delimiter>
+
+        Args:
+            input_ids (torch.Tensor): Input token IDs containing query and items separated by delimiters.
+                Shape: [total_sequence_length] for single request or [batch_total_length] for batch.
+            hidden_states (torch.Tensor): Hidden states from the model.
+                Shape: [sequence_length, hidden_dim].
+            lm_head (VocabParallelEmbedding): Language model head for computing logits.
+            logits_metadata (Union[LogitsMetadata, ForwardBatch]): Metadata containing batch info
+                and token ID specifications for logprob extraction.
+            delimiter_token (int): Token ID used as delimiter between query and items.
+
+        Returns:
+            LogitsProcessorOutput: Contains:
+                - next_token_logits: None (not needed for scoring-only requests)
+                - input_token_logprobs: Logprobs of delimiter tokens at scoring positions
+                - input_top_logprobs_val: Top-k logprobs at delimiter positions (if requested)
+                - input_top_logprobs_idx: Top-k token indices at delimiter positions (if requested)
+                - input_token_ids_logprobs_val: Logprobs for user-requested token IDs (if any)
+                - input_token_ids_logprobs_idx: Indices for user-requested token IDs (if any)
+        """
+        multi_item_indices = (input_ids == delimiter_token).nonzero(as_tuple=True)[
+            0
+        ] - 1
+        # Extract hidden states at delimiter positions for multi-item scoring
+        sliced_hidden = hidden_states[multi_item_indices]
+
+        sliced_logits = self._get_logits(sliced_hidden, lm_head, logits_metadata)
+        sliced_logprobs = torch.nn.functional.log_softmax(sliced_logits, dim=-1)
+
+        # Initialize return values
+        input_token_ids_logprobs_val = []
+        input_token_ids_logprobs_idx = []
+        input_top_logprobs_val = None
+        input_top_logprobs_idx = None
+
+        # Recalculate extend_logprob_pruned_lens_cpu to match delimiter counts per request
+        # Original contains sequence lengths, but we need delimiter counts for sliced_logprobs
+        if (
+            logits_metadata.token_ids_logprobs
+            or logits_metadata.extend_return_top_logprob
+        ):
+            logits_metadata.extend_logprob_pruned_lens_cpu = []
+
+            if logits_metadata.extend_seq_lens_cpu is not None:
+                # Multi-request batch: count delimiters per request
+                input_pt = 0
+                for req_seq_len in logits_metadata.extend_seq_lens_cpu:
+                    req_input_ids = input_ids[input_pt : input_pt + req_seq_len]
+                    delimiter_count = (req_input_ids == delimiter_token).sum().item()
+                    logits_metadata.extend_logprob_pruned_lens_cpu.append(
+                        delimiter_count
+                    )
+                    input_pt += req_seq_len
+            else:
+                # Single request case: one request gets all delimiters
+                total_delimiters = (input_ids == delimiter_token).sum().item()
+                logits_metadata.extend_logprob_pruned_lens_cpu = [total_delimiters]
+
+        # Get the logprobs of specified token ids
+        if logits_metadata.extend_token_ids_logprob:
+            (
+                input_token_ids_logprobs_val,
+                input_token_ids_logprobs_idx,
+            ) = get_token_ids_logprobs_prefill(
+                sliced_logprobs, logits_metadata, delay_cpu_copy=True
+            )
+
+        # Get the logprob of top-k tokens
+        if logits_metadata.extend_return_top_logprob:
+            (
+                input_top_logprobs_val,
+                input_top_logprobs_idx,
+            ) = get_top_logprobs_prefill(sliced_logprobs, logits_metadata)
+
+        # For input_token_logprobs, use delimiter token logprobs
+        input_token_logprobs = sliced_logprobs[:, delimiter_token]
+
+        return LogitsProcessorOutput(
+            next_token_logits=None,  # Multi-item scoring doesn't need next token logits
+            input_token_logprobs=input_token_logprobs,
+            input_top_logprobs_val=input_top_logprobs_val,
+            input_top_logprobs_idx=input_top_logprobs_idx,
+            input_token_ids_logprobs_val=input_token_ids_logprobs_val,
+            input_token_ids_logprobs_idx=input_token_ids_logprobs_idx,
+            # FIXME: These fields are not logits-related but are passed through here as a
+            # workaround since ForwardBatch is local to forward_batch_generation().
+            # They should be moved to GenerationBatchResult to keep this class clean.
+            mm_input_embeds=logits_metadata.mm_input_embeds,
+        )
+
+
+@triton.jit
+def fused_softcap_kernel(
+    full_logits_ptr,
+    softcapping_value,
+    n_elements,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(0).to(tl.int64)
+    block_start = pid * BLOCK_SIZE
+    offsets = block_start + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < n_elements
+
+    # Load values
+    x = tl.load(full_logits_ptr + offsets, mask=mask)
+
+    # Perform operations in-place
+    x = x / softcapping_value
+
+    # Manual tanh implementation using exp
+    exp2x = tl.exp(2 * x)
+    x = (exp2x - 1) / (exp2x + 1)
+
+    x = x * softcapping_value
+
+    # Store result
+    tl.store(full_logits_ptr + offsets, x, mask=mask)
+
+
+def fused_softcap(full_logits, final_logit_softcapping):
+    n_elements = full_logits.numel()
+    BLOCK_SIZE = 1024
+    grid = ((n_elements + BLOCK_SIZE - 1) // BLOCK_SIZE, 1, 1)
+
+    fused_softcap_kernel[grid](
+        full_logits_ptr=full_logits,
+        softcapping_value=final_logit_softcapping,
+        n_elements=n_elements,
+        BLOCK_SIZE=BLOCK_SIZE,
+    )
+    return full_logits
diff --git a/sglang/python/sglang/srt/layers/model_parallel.py b/sglang/python/sglang/srt/layers/model_parallel.py
new file mode 100644
index 0000000000000000000000000000000000000000..d08754f3285e248e7e6a17525cae2d3beff4c568
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/model_parallel.py
@@ -0,0 +1,155 @@
+"""
+Common utilities for torch model parallelism.
+"""
+
+from typing import Optional, Sequence
+
+import torch
+import torch.nn as nn
+from torch.distributed.device_mesh import DeviceMesh
+
+try:
+    import torch.distributed.tensor as dt
+except ImportError:
+    # torch 2.4 or older
+    import torch.distributed._tensor as dt
+
+from torch.distributed.tensor.parallel import (
+    ColwiseParallel,
+    RowwiseParallel,
+    parallelize_module,
+)
+
+
+def _shard_tensor(
+    full_tensor: torch.Tensor,
+    device_mesh: DeviceMesh,
+    placements: Sequence[dt.Shard],
+) -> "dt.DTensor":
+    """
+    Locally shards a full tensor based on indicated sharding arrangement, and
+    returns a DTensor containing the local shard.
+
+    .. warning:: This is a private API that is subject to change. It skips the
+        communication otherwise required by `distribute_tensor`. It is only
+        applicable to cases where all ranks have the same `full_tensor`. For
+        example, in distributed inference all ranks load from the same
+        checkpoint. This API will not check for data equality between ranks, it
+        is thus user's responsibility to ensure the `full_tensor` is the same
+        across ranks.
+
+    Args:
+        full_tensor (torch.Tensor): the full tensor to be sharded.
+        device_mesh (:class:`DeviceMesh`): DeviceMesh to place the
+            DTensor.  Must have same dimension as the number of placements.
+        placements (Sequence[:class:`Shard`]): the placements that
+            describes how to place the local tensor on DeviceMesh.
+
+    Returns:
+        A :class:`DTensor` object with the shard as its local tensor.
+
+    Examples:
+        >>> # xdoctest: +SKIP("need world_size and rank")
+        >>> device_mesh = dist.init_device_mesh("cuda", (world_size,))
+        >>> full_tensor = torch.arange(world_size, device=f"cuda:{rank}")
+        >>> dtensor = _shard_tensor(full_tensor, device_mesh, [Shard(1)])
+    """
+    shape, offset = dt._utils.compute_local_shape_and_global_offset(
+        full_tensor.shape, device_mesh, placements
+    )
+    slices = [
+        slice(cur_offset, cur_offset + cur_shape)
+        for cur_shape, cur_offset in zip(shape, offset)
+    ]
+    local_tensor = full_tensor[slices]
+    return dt.DTensor.from_local(local_tensor, device_mesh, placements)
+
+
+class ColwiseParallelSharded(ColwiseParallel):
+    """
+    A version of ColwiseParallel where the local weight has been already
+    sharded.  This is used for the fused wqkv case, where during loading, we
+    already sharded wq, wk, wv before fusing them.
+    """
+
+    # Override the _partition_linear_fn in ColwiseParallel
+    def _partition_linear_fn(self, name, module, device_mesh):
+        # colwise shard weight/bias to Shard(0), weight be Shard(0)
+        # means Colwise as Linear is input * weight^T + bias, where
+        # weight would become Shard(1)
+        for name, param in module.named_parameters():
+            dtensor = dt.DTensor.from_local(param, device_mesh, [dt.Shard(0)])
+            dist_param = torch.nn.Parameter(dtensor, requires_grad=False)
+            module.register_parameter(name, dist_param)
+
+
+class RowwiseParallelMaybeWait(RowwiseParallel):
+    """
+    A version of RowwiseParallel that waits for the output (establish dependency
+    between comm stream and compute stream in CUDA sense) before going into the
+    next op. This is needed to workaround the current interaction between
+    AsyncCollectiveTensor and multi-platform ops, such as `RMSNorm`.
+    """
+
+    def _partition_linear_fn(self, name, module, device_mesh):
+        # Rowwise shard weight to Shard(1), bias to Replicate(), weight be Shard(1)
+        # means Rowwise as nn.Linear is input * weight^T + bias, where
+        # weight would become Shard(0)
+        module.register_parameter(
+            "weight",
+            nn.Parameter(_shard_tensor(module.weight, device_mesh, [dt.Shard(1)])),
+        )
+        if getattr(module, "bias", None) is not None:
+            # The Linear module has bias
+            module.register_parameter(
+                "bias",
+                nn.Parameter(
+                    dt.distribute_tensor(module.bias, device_mesh, [dt.Replicate()])
+                ),
+            )
+
+    @staticmethod
+    def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_mesh):
+        outputs = super(
+            RowwiseParallelMaybeWait, RowwiseParallelMaybeWait
+        )._prepare_output_fn(
+            output_layouts, use_local_output, mod, outputs, device_mesh
+        )
+        return torch.distributed._functional_collectives.wait_tensor(outputs)
+
+
+def tensor_parallel(
+    module: torch.nn.Module,
+    device_mesh: Optional[DeviceMesh] = None,
+):
+    """
+    Tensor parallelize the model across the given device mesh.
+    Args:
+        module (`torch.nn.Module`):
+            The module to tensor parallelize.
+        device_mesh (`torch.distributed.DeviceMesh`):
+            The device mesh to use for tensor parallelism.
+    """
+
+    # Tensor parallelize a nn.Module based on the `_tp_plan` attribute of the module.
+    # No op if `_tp_plan` attribute does not exist under the module.
+    # This is a helper function to be used with `model.apply` to recursively
+    # parallelize a model.
+    def tplize(mod: torch.nn.Module) -> None:
+        tp_plan = getattr(mod, "_tp_plan", None)
+        if tp_plan is None:
+            return
+        for child_name, tp_style in tp_plan.items():
+            submod = mod.get_submodule(child_name)
+            if tp_style == "Colwise":
+                parallelize_module(submod, device_mesh, ColwiseParallel())
+            elif tp_style == "Rowwise":
+                parallelize_module(submod, device_mesh, RowwiseParallelMaybeWait())
+            elif tp_style == "Colwise_Sharded":
+                parallelize_module(submod, device_mesh, ColwiseParallelSharded())
+            else:
+                raise ValueError(f"Unknown TP style {tp_style}")
+
+    # `apply` is a native method of `nn.Module` that recursively applies a
+    # function to every submodule.
+    module.apply(tplize)
diff --git a/sglang/python/sglang/srt/layers/modelopt_utils.py b/sglang/python/sglang/srt/layers/modelopt_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e9d8435102ac287a34770272e499d67938d41c0
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/modelopt_utils.py
@@ -0,0 +1,11 @@
+"""
+ModelOpt related constants
+"""
+
+QUANT_CFG_CHOICES = {
+    "fp8": "FP8_DEFAULT_CFG",
+    "int4_awq": "INT4_AWQ_CFG",  # TODO: add support for int4_awq
+    "w4a8_awq": "W4A8_AWQ_BETA_CFG",  # TODO: add support for w4a8_awq
+    "nvfp4": "NVFP4_DEFAULT_CFG",
+    "nvfp4_awq": "NVFP4_AWQ_LITE_CFG",  # TODO: add support for nvfp4_awq
+}
diff --git a/sglang/python/sglang/srt/layers/moe/__init__.py b/sglang/python/sglang/srt/layers/moe/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..74d23ecd7c7097d7a619c91a0ae4e1a3deda8e17
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/__init__.py
@@ -0,0 +1,30 @@
+from sglang.srt.layers.moe.moe_runner import MoeRunner, MoeRunnerConfig
+from sglang.srt.layers.moe.utils import (
+    DeepEPMode,
+    MoeA2ABackend,
+    MoeRunnerBackend,
+    get_deepep_config,
+    get_deepep_mode,
+    get_moe_a2a_backend,
+    get_moe_runner_backend,
+    get_tbo_token_distribution_threshold,
+    initialize_moe_config,
+    is_tbo_enabled,
+    should_use_flashinfer_cutlass_moe_fp4_allgather,
+)
+
+__all__ = [
+    "DeepEPMode",
+    "MoeA2ABackend",
+    "MoeRunner",
+    "MoeRunnerConfig",
+    "MoeRunnerBackend",
+    "initialize_moe_config",
+    "get_moe_a2a_backend",
+    "get_moe_runner_backend",
+    "get_deepep_mode",
+    "should_use_flashinfer_cutlass_moe_fp4_allgather",
+    "is_tbo_enabled",
+    "get_tbo_token_distribution_threshold",
+    "get_deepep_config",
+]
diff --git a/sglang/python/sglang/srt/layers/moe/cutlass_moe.py b/sglang/python/sglang/srt/layers/moe/cutlass_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..198599e437dde46eb13872cdc291d7c0203cbd6d
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/cutlass_moe.py
@@ -0,0 +1,497 @@
+"""CUTLASS based Fused MoE kernels."""
+
+from typing import Optional, Tuple
+
+import torch
+
+from sglang.srt.layers.moe.cutlass_moe_params import CutlassMoEParams
+from sglang.srt.utils import is_cuda, is_sm90_supported, is_sm100_supported
+
+_is_cuda = is_cuda()
+if _is_cuda:
+    from sgl_kernel import (
+        apply_shuffle_mul_sum,
+        es_fp8_blockwise_scaled_grouped_mm,
+        es_sm100_mxfp8_blockscaled_grouped_mm,
+        es_sm100_mxfp8_blockscaled_grouped_quant,
+        fp8_blockwise_scaled_grouped_mm,
+        prepare_moe_input,
+        shuffle_rows,
+        silu_and_mul,
+    )
+
+    from sglang.jit_kernel.nvfp4 import (
+        cutlass_fp4_group_mm,
+        scaled_fp4_experts_quant,
+    )
+
+
+def cutlass_fused_experts_fp8(
+    a: torch.Tensor,
+    w1_q: torch.Tensor,
+    w2_q: torch.Tensor,
+    w1_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    a1_strides: torch.Tensor,
+    c1_strides: torch.Tensor,
+    a2_strides: torch.Tensor,
+    c2_strides: torch.Tensor,
+    workspace: torch.Tensor,
+    a_ptrs: torch.Tensor,
+    b_ptrs: torch.Tensor,
+    out_ptrs: torch.Tensor,
+    a_scales_ptrs: torch.Tensor,
+    b_scales_ptrs: torch.Tensor,
+    expert_offsets: torch.Tensor,
+    problem_sizes1: torch.Tensor,
+    problem_sizes2: torch.Tensor,
+    use_fp8_blockscale: bool = True,
+    use_mxfp8: bool = False,
+    output: Optional[torch.Tensor] = None,
+    enable_es: Tuple[bool, bool] = (False, False),
+) -> torch.Tensor:
+    """Performs Fused MoE computation using CUTLASS-like kernels with FP8 weights and activations.
+
+    This function implements a Mixture of Experts (MoE) layer with a SwiGLU/SiLU
+    activation, leveraging custom kernels likely derived from CUTLASS principles
+    for grouped matrix multiplication (`fp8_blockwise_scaled_grouped_mm`) and
+    data preparation (`prepare_moe_input`, `silu_and_mul`).
+
+    It handles per-token routing, quantizes input activations to FP8 with
+    per-token scales, performs the expert computations using FP8 GEMMs with
+    pre-quantized FP8 weights (per-block scales), applies the SiLU activation,
+    and combines the results weighted by the router scores.
+
+    Args:
+        a (torch.Tensor): Input activations. Shape: `(m, k)`, where `m` is the total
+            number of tokens and `k` is the hidden size. Expected dtype: `torch.half`
+            or `torch.bfloat16`.
+        w1_q (torch.Tensor): Pre-quantized FP8 weight tensor for the first GEMM
+            (up-projection part of SwiGLU). Expected shape: `(E, k, n*2)`, where
+            `E` is the number of experts, `k` is the hidden size, and `n*2` is the
+            intermediate size (`I`). Expected dtype: `torch.float8_e4m3fn`.
+            Note: This shape implies weights are stored as (num_experts, hidden_size, intermediate_size).
+        w2_q (torch.Tensor): Pre-quantized FP8 weight tensor for the second GEMM
+            (down-projection). Expected shape: `(E, n, k)`, where `n` is half the
+            intermediate size (`I // 2`). Expected dtype: `torch.float8_e4m3fn`.
+            Note: This shape implies weights are stored as (num_experts, intermediate_size // 2, hidden_size).
+        w1_scale (torch.Tensor): Scales corresponding to `w1_q` (per-block scales).
+            Shape: `(E, num_blocks_n, num_blocks_k)`. Dtype: `torch.float32`.
+        w2_scale (torch.Tensor): Scales corresponding to `w2_q` (per-block scales).
+             Shape: `(E, num_blocks_k, num_blocks_n)`. Dtype: `torch.float32`.
+        topk_weights (torch.Tensor): Router weights for the selected top-k experts
+            for each token. Shape: `(m, topk)`. Dtype should ideally match `a`.
+        topk_ids (torch.Tensor): Indices of the selected top-k experts for each token.
+            Shape: `(m, topk)`. Dtype: `torch.int32`.
+        a1_strides (torch.Tensor): Stride information for the first GEMM's 'a' input.
+            Passed directly to the underlying kernel. Expected shape `(E,)`, dtype `torch.int64`.
+            Note: Its exact usage within `fp8_blockwise_scaled_grouped_mm` needs clarification
+            as it's passed as both a_stride and b_stride in the first call.
+        c1_strides (torch.Tensor): Stride information for the first GEMM's 'c' output.
+            Passed directly to the underlying kernel. Expected shape `(E,)`, dtype `torch.int64`.
+        a2_strides (torch.Tensor): Stride information for the second GEMM's 'a' input.
+            Passed directly to the underlying kernel. Expected shape `(E,)`, dtype `torch.int64`.
+            Note: Its exact usage within `fp8_blockwise_scaled_grouped_mm` needs clarification
+            as it's passed as both a_stride and b_stride in the second call.
+        c2_strides (torch.Tensor): Stride information for the second GEMM's 'c' output.
+            Passed directly to the underlying kernel. Expected shape `(E,)`, dtype `torch.int64`.
+        workspace (torch.Tensor): Reusable workspace for the underlying kernel.
+        a_ptrs (torch.Tensor): Pointers container for calculating offsets of the input activations for each expert.
+        b_ptrs (torch.Tensor): Pointers container for calculating offsets of the input weights for each expert.
+        out_ptrs (torch.Tensor): Pointers container for calculating offsets of the output activations for each expert.
+        a_scales_ptrs (torch.Tensor): Pointers container for calculating offsets of the input scales for each expert.
+        b_scales_ptrs (torch.Tensor): Pointers container for calculating offsets of the input scales for each expert.
+        use_fp8_blockscale (bool, optional): Flag indicating usage of FP8 with
+            block scaling. Currently, only `True` is supported. Defaults to `True`.
+        use_mxfp8 (bool, optional): Flag indicating usage of MXFP8 (UE8M0 scales)
+            with SM100 expert-specialization kernels. Defaults to `False`.
+        output (torch.Tensor, optional): Output tensor. If not provided, a new tensor will be created.
+        enable_es (tuple(bool, bool)): Flag indicating usage of expert specialization kernel for (up-projection, down-projection)
+    Returns:
+        torch.Tensor: The computed MoE layer output. Shape: `(m, k)`, dtype matches `a`.
+
+    Raises:
+        AssertionError: If input shapes, dtypes, or flags are inconsistent or unsupported.
+        NotImplementedError: If CUDA is not available or `sgl_kernel` is not properly installed.
+    """
+    assert use_fp8_blockscale, "Only support fp8 blockscale for now"
+    assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
+    assert w1_q.dtype == torch.float8_e4m3fn
+    assert w2_q.dtype == torch.float8_e4m3fn
+    assert a.shape[1] == w1_q.shape[1], "Hidden size mismatch w1"
+    assert w1_q.shape[2] == w2_q.shape[1] * 2, "Hidden size mismatch w2"
+    assert w1_q.shape[0] == w2_q.shape[0], "Expert number mismatch"
+    assert w1_q.shape[0] == w2_q.shape[0], "Weights expert number mismatch"
+    assert w1_q.shape[0] == w1_scale.shape[0], "w1 scales expert number mismatch"
+    assert w1_q.shape[0] == w2_scale.shape[0], "w2 scales expert number mismatch"
+    assert a.dtype in [torch.half, torch.bfloat16], "Invalid output dtype"
+
+    if is_cuda:
+        from sglang.srt.layers.quantization.fp8_kernel import (
+            sglang_per_token_group_quant_fp8,
+        )
+    es_up, es_down = enable_es
+    out_dtype = a.dtype
+    num_experts = w1_q.size(0)
+    m = a.size(0)
+    k = w1_q.size(1)
+    n = w2_q.size(1)
+
+    topk = topk_ids.size(1)
+    device = a.device
+
+    a_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device)
+    c_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device)
+
+    if use_mxfp8:
+        assert es_up and es_down, "MXFP8 requires expert-specialization for both GEMMs"
+        assert is_sm100_supported(), "MXFP8 requires SM100"
+        assert k % 32 == 0, "MXFP8 requires hidden size to be divisible by 32"
+        assert n % 32 == 0, "MXFP8 requires intermediate size to be divisible by 32"
+        assert w1_scale.dtype == torch.uint8, "MXFP8 w1_scale must be uint8"
+        assert w2_scale.dtype == torch.uint8, "MXFP8 w2_scale must be uint8"
+        expected_w1_scale_shape = (
+            num_experts,
+            w1_q.shape[1] // 32,
+            w1_q.shape[2],
+        )
+        expected_w2_scale_shape = (
+            num_experts,
+            w2_q.shape[1] // 32,
+            w2_q.shape[2],
+        )
+        assert (
+            w1_scale.shape == expected_w1_scale_shape
+        ), f"MXFP8 w1_scale must be {expected_w1_scale_shape}, got {w1_scale.shape}"
+        assert (
+            w2_scale.shape == expected_w2_scale_shape
+        ), f"MXFP8 w2_scale must be {expected_w2_scale_shape}, got {w2_scale.shape}"
+
+        mxfp8_blockscale_align = 128
+        total_tokens = m * topk
+        nonzero_experts = min(num_experts, total_tokens)
+        max_total = total_tokens + (mxfp8_blockscale_align - 1) * nonzero_experts
+        max_blockscale = (
+            (max_total + mxfp8_blockscale_align - 1) // mxfp8_blockscale_align
+        ) * mxfp8_blockscale_align
+
+    blockscale_offsets = None
+    if use_mxfp8 and (es_up or es_down):
+        blockscale_offsets = torch.empty(
+            (num_experts + 1,), dtype=torch.int32, device=device
+        )
+
+    prepare_moe_input(
+        topk_ids,
+        expert_offsets,
+        problem_sizes1,
+        problem_sizes2,
+        a_map,
+        c_map,
+        num_experts,
+        n,
+        k,
+        blockscale_offsets,
+    )
+
+    if use_mxfp8 and es_up:
+        rep_a = shuffle_rows(a, a_map, (m * topk, k))
+        rep_a_q = torch.empty_like(rep_a, dtype=torch.float8_e4m3fn)
+        rep_a1_scales = torch.empty(
+            (max_blockscale, k // 32), dtype=torch.uint8, device=device
+        )
+        es_sm100_mxfp8_blockscaled_grouped_quant(
+            rep_a,
+            problem_sizes1,
+            expert_offsets[:-1],
+            blockscale_offsets[:-1],
+            rep_a_q,
+            rep_a1_scales,
+        )
+    else:
+        a_q, a1_scale = sglang_per_token_group_quant_fp8(a, 128)
+        rep_a_q = shuffle_rows(a_q, a_map, (m * topk, k))
+        rep_a1_scales = shuffle_rows(a1_scale, a_map, (m * topk, int(k / 128)))
+
+    c1 = torch.empty((m * topk, n * 2), device=device, dtype=out_dtype)
+    c2 = torch.empty((m * topk, k), device=device, dtype=out_dtype)
+
+    a_sf_layout = torch.empty((num_experts, 5), device=device, dtype=torch.int)
+    w_sf_layout = torch.empty((num_experts, 5), device=device, dtype=torch.int)
+
+    if is_sm90_supported() and es_up:
+        es_fp8_blockwise_scaled_grouped_mm(
+            c1,
+            rep_a_q,
+            w1_q,
+            rep_a1_scales,
+            w1_scale,
+            a1_strides,
+            a1_strides,
+            c1_strides,
+            problem_sizes1,
+            expert_offsets[:-1],
+            workspace,
+        )
+    elif use_mxfp8 and es_up:
+        es_sm100_mxfp8_blockscaled_grouped_mm(
+            c1,
+            rep_a_q,
+            w1_q,
+            rep_a1_scales,
+            w1_scale,
+            problem_sizes1,
+            expert_offsets[:-1],
+            blockscale_offsets[:-1],
+        )
+    else:
+        fp8_blockwise_scaled_grouped_mm(
+            c1,
+            a_ptrs,
+            b_ptrs,
+            out_ptrs,
+            a_scales_ptrs,
+            b_scales_ptrs,
+            rep_a_q,
+            w1_q,
+            rep_a1_scales,
+            w1_scale,
+            a1_strides,
+            a1_strides,
+            c1_strides,
+            a_sf_layout,
+            w_sf_layout,
+            problem_sizes1,
+            expert_offsets[:-1],
+            workspace,
+        )
+
+    intermediate = torch.empty((m * topk, n), device=device, dtype=out_dtype)
+    silu_and_mul(c1, intermediate)
+
+    if use_mxfp8 and es_down:
+        intemediate_q = torch.empty_like(intermediate, dtype=torch.float8_e4m3fn)
+        a2_scale = torch.empty(
+            (max_blockscale, n // 32), dtype=torch.uint8, device=device
+        )
+        es_sm100_mxfp8_blockscaled_grouped_quant(
+            intermediate,
+            problem_sizes2,
+            expert_offsets[:-1],
+            blockscale_offsets[:-1],
+            intemediate_q,
+            a2_scale,
+        )
+    else:
+        intemediate_q, a2_scale = sglang_per_token_group_quant_fp8(intermediate, 128)
+
+    if is_sm90_supported() and es_down:
+        es_fp8_blockwise_scaled_grouped_mm(
+            c2,
+            intemediate_q,
+            w2_q,
+            a2_scale,
+            w2_scale,
+            a2_strides,
+            a2_strides,
+            c2_strides,
+            problem_sizes2,
+            expert_offsets[:-1],
+            workspace,
+        )
+    elif use_mxfp8 and es_down:
+        es_sm100_mxfp8_blockscaled_grouped_mm(
+            c2,
+            intemediate_q,
+            w2_q,
+            a2_scale,
+            w2_scale,
+            problem_sizes2,
+            expert_offsets[:-1],
+            blockscale_offsets[:-1],
+        )
+    else:
+        fp8_blockwise_scaled_grouped_mm(
+            c2,
+            a_ptrs,
+            b_ptrs,
+            out_ptrs,
+            a_scales_ptrs,
+            b_scales_ptrs,
+            intemediate_q,
+            w2_q,
+            a2_scale,
+            w2_scale,
+            a2_strides,
+            a2_strides,
+            c2_strides,
+            a_sf_layout,
+            w_sf_layout,
+            problem_sizes2,
+            expert_offsets[:-1],
+            workspace,
+        )
+
+    if output is None:
+        output = torch.empty((m, k), device=device, dtype=out_dtype)
+
+    apply_shuffle_mul_sum(c2, output, c_map, topk_weights.to(out_dtype))
+    return output
+
+
+FLOAT4_E2M1_MAX = 6.0
+FLOAT8_E4M3_MAX = 448.0
+
+
+def cutlass_moe_fp4(
+    a: torch.Tensor,
+    a1_gscale: torch.Tensor,
+    w1_fp4: torch.Tensor,
+    w1_blockscale: torch.Tensor,
+    w1_alphas: torch.Tensor,
+    a2_gscale: torch.Tensor,
+    w2_fp4: torch.Tensor,
+    w2_blockscale: torch.Tensor,
+    w2_alphas: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    params: CutlassMoEParams,
+    apply_router_weight_on_input: bool = False,
+):
+    """
+    MoE implementation for FP4 Inputs
+
+    # Gemm 1
+    a: Input tensor: [m, k] (half/bfloat16)
+    a1_gscale: Activation scale per expert: [e]  (float32)
+    w1(gate up) (not an argument to cutlass_moe_fp4): [e, 2 * n, k]
+    w1_fp4: [e, 2 * n, k // 2], dtype: torch.uint8 (stacked fp4: E2M1)
+    (Note: `n` is the up projection output dim, `k` is the input dim in
+     full precision)
+    w1_blockscale: [e, 2 * n, k // block_size] (float8_e4m3)
+                   (Block size = 16 for NVFP4)
+
+    # Gemm 2
+    a2_gscale: Activation scale per expert: [e]
+    w2(down projection) (not an argument to cutlass_moe_fp4): [e, k, n]
+    w2_fp4: [e, k, n // 2], dtype: torch.uint8 (stacked E2M1)
+    w2_blockscale: [e, k, n // block_size], dtype: float8_e4m3
+
+    Strides for activations, weights and output in logical number of elements.
+    The activations & output stride is the number of elements to the next row.
+    The weights stride is the number of elements to the next row per expert.
+    For example, if the weight is [e, n, k], then the b_stride is a tensor of
+    shape [e] with each element being k. Similarly for activations, if the
+    shape is [m, k], then the a_stride has shape [e] with each value k.
+    Similarly for output, if the output is [m, n], then the c_stride is a
+    tensor of shape [e] with each element being k.
+
+    Note: cutlass_fp4_group_mm is designed to accept the strides of
+    activations and weights to be the same, so it is passed in as a single
+    tensor.
+    ab_strides_13: [e] dtype: int64 [Gemm 1: Activation / Weight strides]
+    ab_strides_2: [e] dtype: int64 [Gemm 2: Activation / Weight strides]
+    c_strides_13: [e] dtype: int64 [Gemm 1: Output Strides]
+    c_strides_2: [e] dtype: int64 [Gemm 1: Output Strides]
+
+    topk_weights: [m, topk] dtype: float8
+    topk_ids: [m, topk] dtype: float8
+
+    m, n, k: Unquantized weight shapes, dtype: int
+    e: number of experts for the current rank, dtype: int
+    assumes that topk < k < n to satisfy - up/down projection expectations.
+    """
+    assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
+    assert w1_fp4.dtype == torch.uint8, "weight 1 must be uint8"
+    assert w2_fp4.dtype == torch.uint8, "weight 2 must be uint8"
+    assert (
+        w1_fp4.ndim == 3
+        and w2_fp4.ndim == 3
+        and w1_blockscale.ndim == 3
+        and w2_blockscale.ndim == 3
+    ), "All Weights must be of rank 3 for cutlass_moe_fp4"
+    m_a, k_a = a.shape
+    e_w1, nx2_w1, half_k_w1 = w1_fp4.shape
+    e_w2, k_w2, half_n_w2 = w2_fp4.shape
+
+    assert e_w1 == e_w2 and e_w1 == params.num_experts, (
+        "Number of experts must match",
+        " between weights.",
+    )
+    assert (
+        k_a // 2 == half_k_w1 and params.hidden_size == k_w2
+    ), "Hidden size mismatch between a, w1 and w2"
+    assert (
+        nx2_w1 == params.intermediate_size_per_partition * 2
+        and half_n_w2 == params.intermediate_size_per_partition // 2
+    ), ("mismatch in " "expected `n`")
+    assert 2 * half_k_w1 == k_w2, "Hidden size mismatch w2 and w1"
+    assert a.dtype in [torch.half, torch.bfloat16], "Invalid input dtype"
+
+    out_dtype = a.dtype
+    num_topk = topk_ids.shape[1]
+    device = a.device
+    a_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device)
+    c_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device)
+    prepare_moe_input(
+        topk_ids,
+        params.expert_offsets,
+        params.problem_sizes1,
+        params.problem_sizes2,
+        a_map,
+        c_map,
+        params.num_experts,
+        params.intermediate_size_per_partition,
+        params.hidden_size,
+        params.blockscale_offsets,
+    )
+
+    rep_a_fp4, rep_a_blockscale = scaled_fp4_experts_quant(
+        a,
+        a1_gscale,
+        params.expert_offsets,
+        params.blockscale_offsets,
+        num_topk,
+        expert_map=a_map,
+    )
+    c1 = cutlass_fp4_group_mm(
+        rep_a_fp4,
+        w1_fp4,
+        rep_a_blockscale,
+        w1_blockscale,
+        w1_alphas,
+        out_dtype,
+        params.to_gemm1_args(),
+    )
+    del rep_a_fp4, rep_a_blockscale
+
+    # hidden size dimension is split to one halfpytho sized tensor.
+    intermediate = torch.empty(
+        (m_a * num_topk, w1_fp4.shape[1] // 2), device=device, dtype=out_dtype
+    )
+    silu_and_mul(c1, intermediate)
+
+    int_fp4, int_blockscale = scaled_fp4_experts_quant(
+        intermediate,
+        a2_gscale,
+        params.expert_offsets,
+        params.blockscale_offsets,
+        num_topk,
+    )
+    c2 = cutlass_fp4_group_mm(
+        int_fp4,
+        w2_fp4,
+        int_blockscale,
+        w2_blockscale,
+        w2_alphas,
+        out_dtype,
+        params.to_gemm2_args(),
+    )
+    del int_fp4, int_blockscale
+    c2 = shuffle_rows(c2, c_map, (m_a * num_topk, params.hidden_size))
+    c2 = c2.view(m_a, num_topk, params.hidden_size)
+    if not apply_router_weight_on_input:
+        c2 = c2 * topk_weights.view(m_a, num_topk, 1).to(out_dtype)
+    return c2.sum(dim=1).to(out_dtype)
diff --git a/sglang/python/sglang/srt/layers/moe/cutlass_moe_params.py b/sglang/python/sglang/srt/layers/moe/cutlass_moe_params.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a71d606e89f243bff05523deb72bf24220b14c7
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/cutlass_moe_params.py
@@ -0,0 +1,187 @@
+from dataclasses import dataclass
+from enum import Enum, auto
+from typing import Optional
+
+import torch
+
+
+class CutlassMoEType(Enum):
+    """
+    Enum for the different types of cutlass moe operations
+    that are currently supported in SGLang.
+    """
+
+    BlockscaledFP8 = auto()
+    BlockscaledFP4 = auto()
+
+
+@dataclass
+class CutlassMoEParams:
+    """
+    Parameters for the cutlass moe operation.
+    """
+
+    #  Type as defined above
+    cutlass_moe_type: CutlassMoEType
+
+    # Strides for activations, weights and output in logical number of elements.
+    # The activations & output stride is the number of elements to the next row.
+    # The weights stride is the number of elements to the next row per expert.
+    # For example, if the weight is [e, n, k], then the b_stride is a tensor of
+    # shape [e] with each element being k. Similarly for activations, if the
+    # shape is [m, k], then the a_stride has shape [e] with each value k.
+    # Similarly for output, if the output is [m, n], then the c_stride is a
+    # tensor of shape [e] with each element being k.
+
+    # Note: cutlass_fp4_group_mm is designed to accept the strides of
+    # activations and weights to be the same, so it is passed in as a single
+    # tensor.
+    # ab_strides_13: [e] dtype: int64 [Gemm 1: Activation / Weight strides]
+    # ab_strides_2: [e] dtype: int64 [Gemm 2: Activation / Weight strides]
+    # c_strides_13: [e] dtype: int64 [Gemm 1: Output Strides]
+    # c_strides_2: [e] dtype: int64 [Gemm 2: Output Strides]
+    ab_strides_13: torch.Tensor
+    ab_strides_2: torch.Tensor
+    c_strides_13: torch.Tensor
+    c_strides_2: torch.Tensor
+
+    # m: Total number of tokens
+    # n: intermediate size per partition
+    # k: hidden size per expert
+    # e: Number of experts
+    # device: Device to run computation on and store tensors
+    m: int
+    intermediate_size_per_partition: int
+    hidden_size: int
+    num_experts: int
+    device: torch.device
+
+    # Pointers container for calculating offsets of the input activations for each expert
+    # a_ptrs: [e] dtype: int64
+    a_ptrs: torch.Tensor
+
+    # Pointers container for calculating offsets of the input weights for each expert
+    # b_ptrs: [e] dtype: int64
+    b_ptrs: torch.Tensor
+
+    # Pointers container for calculating offsets of the output activations for each expert
+    # out_ptrs: [e] dtype: int64
+    out_ptrs: torch.Tensor
+    # Pointers container for calculating offsets of the input scales for each expert
+    # a_scales_ptrs: [e] dtype: int64
+    # b_scales_ptrs: [e] dtype: int64
+    a_scales_ptrs: torch.Tensor
+    b_scales_ptrs: torch.Tensor
+    # Pointers for per-expert alpha values
+    alpha_ptrs: torch.Tensor
+    # CUTLASS blockscale layouts for A and B operands
+    layout_sfa: torch.Tensor
+    layout_sfb: torch.Tensor
+
+    # Offsets that mark at which token index each expert begins its computation
+    # The number of tokens computed with expert E is expert_offsets[E + 1] - expert_offsets[E]
+    # expert_offsets: [e+1] dtype: int32
+    expert_offsets: torch.Tensor
+
+    # Problem size: (num_experts, (m,2n,k)) for first GEMM
+    # problem_sizes1: [e, 3] dtype: int32
+    # Problem size: (num_experts, (m,n,k)) for second GEMM
+    # problem_sizes2: [e, 3] dtype: int32
+    problem_sizes1: torch.Tensor
+    problem_sizes2: torch.Tensor
+    # Similar to expert_offsets, but for blockscales for FP4 blockscaled Group GEMM
+    blockscale_offsets: Optional[torch.Tensor] = None
+
+    def __init__(
+        self,
+        cutlass_moe_type: CutlassMoEType,
+        device: torch.device,
+        num_experts: int,
+        intermediate_size_per_partition: int,
+        hidden_size: int,
+    ):
+        self.cutlass_moe_type = cutlass_moe_type
+        self.device = device
+        self.num_experts = num_experts
+        self.intermediate_size_per_partition = intermediate_size_per_partition
+        self.hidden_size = hidden_size
+        self.n = self.intermediate_size_per_partition
+        self.k = self.hidden_size
+        self.e = self.num_experts
+        self.ab_strides_13 = torch.full(
+            (self.e,), self.k, dtype=torch.int64, device=self.device
+        )
+        self.ab_strides_2 = torch.full(
+            (self.e,), self.n, dtype=torch.int64, device=self.device
+        )
+        self.c_strides_13 = torch.full(
+            (self.e,), 2 * self.n, dtype=torch.int64, device=self.device
+        )
+        self.c_strides_2 = torch.full(
+            (self.e,), self.k, dtype=torch.int64, device=self.device
+        )
+        self.expert_offsets = torch.empty(
+            (self.e + 1,), dtype=torch.int32, device=self.device
+        )
+        self.problem_sizes1 = torch.empty(
+            (self.e, 3), dtype=torch.int32, device=self.device
+        )
+        self.problem_sizes2 = torch.empty(
+            (self.e, 3), dtype=torch.int32, device=self.device
+        )
+        if self.cutlass_moe_type == CutlassMoEType.BlockscaledFP4:
+            self.blockscale_offsets = torch.empty(
+                (self.e + 1,), dtype=torch.int32, device=self.device
+            )
+        else:
+            self.blockscale_offsets = None
+        self.a_ptrs = torch.empty((self.e,), dtype=torch.int64, device=self.device)
+        self.b_ptrs = torch.empty((self.e,), dtype=torch.int64, device=self.device)
+        self.out_ptrs = torch.empty((self.e,), dtype=torch.int64, device=self.device)
+        self.a_scales_ptrs = torch.empty(
+            (self.e,), dtype=torch.int64, device=self.device
+        )
+        self.b_scales_ptrs = torch.empty(
+            (self.e,), dtype=torch.int64, device=self.device
+        )
+        self.alpha_ptrs = torch.empty((self.e,), dtype=torch.int64, device=self.device)
+        self.layout_sfa = torch.empty(
+            (self.e, 5), dtype=torch.int64, device=self.device
+        )
+        self.layout_sfb = torch.empty(
+            (self.e, 5), dtype=torch.int64, device=self.device
+        )
+
+    def to_gemm1_args(self) -> dict:
+        return {
+            "ab_strides": self.ab_strides_13,
+            "c_strides": self.c_strides_13,
+            "problem_sizes": self.problem_sizes1,
+            "expert_offsets": self.expert_offsets[:-1],
+            "blockscale_offsets": self.blockscale_offsets[:-1],
+            "a_ptrs": self.a_ptrs,
+            "b_ptrs": self.b_ptrs,
+            "out_ptrs": self.out_ptrs,
+            "a_scales_ptrs": self.a_scales_ptrs,
+            "b_scales_ptrs": self.b_scales_ptrs,
+            "alpha_ptrs": self.alpha_ptrs,
+            "layout_sfa": self.layout_sfa,
+            "layout_sfb": self.layout_sfb,
+        }
+
+    def to_gemm2_args(self) -> dict:
+        return {
+            "ab_strides": self.ab_strides_2,
+            "c_strides": self.c_strides_2,
+            "problem_sizes": self.problem_sizes2,
+            "expert_offsets": self.expert_offsets[:-1],
+            "blockscale_offsets": self.blockscale_offsets[:-1],
+            "a_ptrs": self.a_ptrs,
+            "b_ptrs": self.b_ptrs,
+            "out_ptrs": self.out_ptrs,
+            "a_scales_ptrs": self.a_scales_ptrs,
+            "b_scales_ptrs": self.b_scales_ptrs,
+            "alpha_ptrs": self.alpha_ptrs,
+            "layout_sfa": self.layout_sfa,
+            "layout_sfb": self.layout_sfb,
+        }
diff --git a/sglang/python/sglang/srt/layers/moe/cutlass_w4a8_moe.py b/sglang/python/sglang/srt/layers/moe/cutlass_w4a8_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ce5a3ddc43a39acbae5748d2e6c12b7be0faab3
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/cutlass_w4a8_moe.py
@@ -0,0 +1,546 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Cutlass W4A8 MoE kernel."""
+
+from typing import Optional
+
+import torch
+
+from sglang.srt.utils import is_cuda_alike
+
+_is_cuda_alike = is_cuda_alike()
+
+if _is_cuda_alike:
+    from sgl_kernel import (
+        cutlass_w4a8_moe_mm,
+        get_cutlass_w4a8_moe_mm_data,
+    )
+
+from sgl_kernel import silu_and_mul
+
+from sglang.jit_kernel.per_tensor_quant_fp8 import per_tensor_quant_fp8
+from sglang.srt.distributed import get_moe_expert_parallel_world_size
+from sglang.srt.layers.moe.ep_moe.kernels import (
+    cutlass_w4_run_moe_ep_preproess,
+    deepep_ll_get_cutlass_w4a8_moe_mm_data,
+    deepep_permute_triton_kernel,
+    deepep_post_reorder_triton_kernel,
+    deepep_run_moe_deep_preprocess,
+    post_reorder_for_cutlass_moe,
+    pre_reorder_for_cutlass_moe,
+    silu_and_mul_masked_post_per_tensor_quant_fwd,
+    silu_mul_static_tensorwise_quant_for_cutlass_moe,
+)
+
+
+def cutlass_w4a8_moe(
+    a: torch.Tensor,
+    w1_q: torch.Tensor,
+    w2_q: torch.Tensor,
+    w1_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    a_strides1: torch.Tensor,
+    b_strides1: torch.Tensor,
+    c_strides1: torch.Tensor,
+    a_strides2: torch.Tensor,
+    b_strides2: torch.Tensor,
+    c_strides2: torch.Tensor,
+    s_strides13: torch.Tensor,
+    s_strides2: torch.Tensor,
+    expert_offsets: torch.Tensor,
+    problem_sizes1: torch.Tensor,
+    problem_sizes2: torch.Tensor,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+    apply_router_weight_on_input: bool = False,
+    routed_scaling_factor: float = 1.0,
+) -> torch.Tensor:
+    """
+    This function computes a w4a8-quantized Mixture of Experts (MoE) layer
+    using two sets of quantized weights, w1_q and w2_q, and top-k gating
+    mechanism. The matrix multiplications are implemented with CUTLASS
+    grouped gemm.
+
+    Parameters:
+    - a (torch.Tensor): The input tensor to the MoE layer.
+        Shape: [M, K]
+    - w1_q (torch.Tensor): The first set of int4-quantized expert weights.
+        Shape: [num_experts, N * 2,  K // 2]
+        (the weights are passed transposed and int4-packed)
+    - w2_q (torch.Tensor): The second set of int4-quantized expert weights.
+        Shape: [num_experts, K, N // 2]
+        (the weights are passed transposed and int4-packed)
+    - w1_scale (torch.Tensor): The fp32 scale to dequantize w1_q.
+        Shape: [num_experts, K // 512, N * 8]
+    - w2_scale (torch.Tensor): The fp32 scale to dequantize w2_q.
+        Shape: [num_experts, N // 512, K * 4]
+    - topk_weights (torch.Tensor): The weights of each token->expert mapping.
+    - topk_ids (torch.Tensor): The ids of each token->expert mapping.
+    - a_strides1 (torch.Tensor): The input strides of the first grouped gemm.
+    - b_strides1 (torch.Tensor): The weights strides of the first grouped gemm.
+    - c_strides1 (torch.Tensor): The output strides of the first grouped gemm.
+    - a_strides2 (torch.Tensor): The input strides of the second grouped gemm.
+    - b_strides2 (torch.Tensor): The weights strides of the second grouped gemm.
+    - c_strides2 (torch.Tensor): The output strides of the second grouped gemm.
+    - s_strides13 (torch.Tensor): The input and scale strides of the first grouped gemm.
+    - s_strides2 (torch.Tensor): The scale strides of the second grouped gemm.
+    - a1_scale (Optional[torch.Tensor]): The optional fp32 scale to quantize a.
+        Shape: scalar or [1, K]
+    - a2_scale (Optional[torch.Tensor]): The optional fp32 scale to
+        quantize the intermediate result between the gemms.
+        Shape: scalar or [1, N]
+    - apply_router_weight_on_input (bool): When true, the topk weights are
+        applied directly on the inputs. This is only applicable when topk is 1.
+
+    Returns:
+    - torch.Tensor: The fp8 output tensor after applying the MoE layer.
+    """
+    assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
+    assert w1_q.dtype == torch.int8
+    assert w2_q.dtype == torch.int8
+    assert a.shape[1] // 2 == w1_q.shape[2], "Hidden size mismatch w1"
+    assert w1_q.shape[2] * 2 == w2_q.shape[1], "Hidden size mismatch w2"
+    assert w1_q.shape[0] == w2_q.shape[0], "Expert number mismatch"
+    assert w1_q.shape[0] == w1_scale.shape[0], "w1 scales expert number mismatch"
+    assert w1_q.shape[0] == w2_scale.shape[0], "w2 scales expert number mismatch"
+
+    assert a_strides1.shape[0] == w1_q.shape[0], "A Strides 1 expert number mismatch"
+    assert b_strides1.shape[0] == w1_q.shape[0], "B Strides 1 expert number mismatch"
+    assert a_strides2.shape[0] == w2_q.shape[0], "A Strides 2 expert number mismatch"
+    assert b_strides2.shape[0] == w2_q.shape[0], "B Strides 2 expert number mismatch"
+    num_local_experts = w1_q.size(0)
+    m = a.size(0)
+    k = w1_q.size(2) * 2  # w1_q is transposed and packed
+    n = w2_q.size(2) * 2  # w2_q is transposed and packed
+    topk = topk_ids.size(1)
+
+    if apply_router_weight_on_input:
+        assert topk == 1, "apply_router_weight_on_input is only implemented for topk=1"
+
+    device = a.device
+    if get_moe_expert_parallel_world_size() > 1:
+        topk_ids = torch.where(topk_ids == -1, num_local_experts, topk_ids)
+
+    src2dst = cutlass_w4_run_moe_ep_preproess(
+        topk_ids,
+    )
+
+    gateup_input = torch.empty(
+        (m * topk, k),
+        device=device,
+        dtype=torch.float8_e4m3fn,
+    )
+
+    pre_reorder_for_cutlass_moe(
+        a,
+        gateup_input,
+        src2dst,
+        topk_ids,
+        a1_scale,
+        num_local_experts,
+        topk,
+        m,
+        k,
+    )
+
+    # NOTE: a_map and c_map are not used in the get_cutlass_w4a8_moe_mm_data kernel,
+    # they are kept to allow for a quick switch of the permutation logic
+    # from the current triton kernel implementation to the cutlass-based one if needed.
+    a_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device)
+    c_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device)
+    get_cutlass_w4a8_moe_mm_data(
+        topk_ids,
+        expert_offsets,
+        problem_sizes1,
+        problem_sizes2,
+        a_map,
+        c_map,
+        num_local_experts,
+        n,
+        k,
+    )
+
+    c1 = torch.empty((m * topk, n * 2), device=device, dtype=torch.bfloat16)
+    c2 = torch.empty((m * topk, k), device=device, dtype=torch.bfloat16)
+
+    cutlass_w4a8_moe_mm(
+        c1,
+        gateup_input,
+        w1_q,
+        a1_scale.float(),
+        w1_scale,
+        expert_offsets[:-1],
+        problem_sizes1,
+        a_strides1,
+        b_strides1,
+        c_strides1,
+        s_strides13,
+        128,
+        topk,
+    )
+
+    intermediate_q = torch.empty(
+        (m * topk, n), dtype=torch.float8_e4m3fn, device=device
+    )
+    silu_mul_static_tensorwise_quant_for_cutlass_moe(
+        c1, intermediate_q, a2_scale.float(), expert_offsets[-1:], m * topk, n
+    )
+
+    cutlass_w4a8_moe_mm(
+        c2,
+        intermediate_q,
+        w2_q,
+        a2_scale.float(),
+        w2_scale,
+        expert_offsets[:-1],
+        problem_sizes2,
+        a_strides2,
+        b_strides2,
+        c_strides2,
+        s_strides2,
+        128,
+        topk,
+    )
+
+    output = torch.empty_like(a)
+
+    post_reorder_for_cutlass_moe(
+        c2,
+        output,
+        src2dst,
+        topk_ids,
+        topk_weights,
+        num_local_experts,
+        topk,
+        m,
+        k,
+        routed_scaling_factor,
+    )
+    return output
+
+
+def cutlass_w4a8_moe_deepep_normal(
+    a: torch.Tensor,
+    w1_q: torch.Tensor,
+    w2_q: torch.Tensor,
+    w1_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids_: torch.Tensor,
+    a_strides1: torch.Tensor,
+    b_strides1: torch.Tensor,
+    c_strides1: torch.Tensor,
+    a_strides2: torch.Tensor,
+    b_strides2: torch.Tensor,
+    c_strides2: torch.Tensor,
+    s_strides13: torch.Tensor,
+    s_strides2: torch.Tensor,
+    expert_offsets: torch.Tensor,
+    problem_sizes1: torch.Tensor,
+    problem_sizes2: torch.Tensor,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    """
+    This function computes a w4a8-quantized Mixture of Experts (MoE) layer
+    using two sets of quantized weights, w1_q and w2_q, and top-k gating
+    mechanism. The matrix multiplications are implemented with CUTLASS
+    grouped gemm.
+
+    Parameters:
+    - a (torch.Tensor): The input tensor to the MoE layer.
+        Shape: [M, K]
+    - w1_q (torch.Tensor): The first set of int4-quantized expert weights.
+        Shape: [num_experts, N * 2,  K // 2]
+        (the weights are passed transposed and int4-packed)
+    - w2_q (torch.Tensor): The second set of int4-quantized expert weights.
+        Shape: [num_experts, K, N // 2]
+        (the weights are passed transposed and int4-packed)
+    - w1_scale (torch.Tensor): The fp32 scale to dequantize w1_q.
+        Shape: [num_experts, K // 512, N * 8]
+    - w2_scale (torch.Tensor): The fp32 scale to dequantize w2_q.
+        Shape: [num_experts, N // 512, K * 4]
+    - topk_weights (torch.Tensor): The weights of each token->expert mapping.
+    - a_strides1 (torch.Tensor): The input strides of the first grouped gemm.
+    - b_strides1 (torch.Tensor): The weights strides of the first grouped gemm.
+    - c_strides1 (torch.Tensor): The output strides of the first grouped gemm.
+    - a_strides2 (torch.Tensor): The input strides of the second grouped gemm.
+    - b_strides2 (torch.Tensor): The weights strides of the second grouped gemm.
+    - c_strides2 (torch.Tensor): The output strides of the second grouped gemm.
+    - s_strides13 (torch.Tensor): The input and scale strides of the first grouped gemm.
+    - s_strides2 (torch.Tensor): The scale strides of the second grouped gemm.
+    - a1_scale (Optional[torch.Tensor]): The optional fp32 scale to quantize a.
+        Shape: scalar or [1, K]
+    - a2_scale (Optional[torch.Tensor]): The optional fp32 scale to
+        quantize the intermediate result between the gemms.
+        Shape: scalar or [1, N]
+    - apply_router_weight_on_input (bool): When true, the topk weights are
+        applied directly on the inputs. This is only applicable when topk is 1.
+
+    Returns:
+    - torch.Tensor: The fp8 output tensor after applying the MoE layer.
+    """
+    assert topk_weights.shape == topk_ids_.shape, "topk shape mismatch"
+    assert w1_q.dtype == torch.int8
+    assert w2_q.dtype == torch.int8
+    assert a.shape[1] // 2 == w1_q.shape[2], "Hidden size mismatch w1"
+    assert w1_q.shape[2] * 2 == w2_q.shape[1], "Hidden size mismatch w2"
+    assert w1_q.shape[0] == w2_q.shape[0], "Expert number mismatch"
+    assert w1_q.shape[0] == w1_scale.shape[0], "w1 scales expert number mismatch"
+    assert w1_q.shape[0] == w2_scale.shape[0], "w2 scales expert number mismatch"
+
+    assert a_strides1.shape[0] == w1_q.shape[0], "A Strides 1 expert number mismatch"
+    assert b_strides1.shape[0] == w1_q.shape[0], "B Strides 1 expert number mismatch"
+    assert a_strides2.shape[0] == w2_q.shape[0], "A Strides 2 expert number mismatch"
+    assert b_strides2.shape[0] == w2_q.shape[0], "B Strides 2 expert number mismatch"
+    num_experts = w1_q.size(0)
+    m = a.size(0)
+    k = w1_q.size(2) * 2  # w1_q is transposed and packed
+    n = w2_q.size(2) * 2  # w2_q is transposed and packed
+    topk = topk_ids_.size(1)
+
+    num_experts = w1_q.size(0)
+    m = a.size(0)
+    k = w1_q.size(2) * 2
+    n = w2_q.size(2) * 2
+    topk = topk_ids_.size(1)
+    device = a.device
+
+    reorder_topk_ids, src2dst, _ = deepep_run_moe_deep_preprocess(
+        topk_ids_, num_experts
+    )
+    num_total_tokens = reorder_topk_ids.numel()
+    gateup_input_pre_reorder = torch.empty(
+        (int(num_total_tokens), a.shape[1]),
+        device=device,
+        dtype=a.dtype,
+    )
+    deepep_permute_triton_kernel[(a.shape[0],)](
+        a,
+        gateup_input_pre_reorder,
+        src2dst,
+        topk_ids_.to(torch.int64),
+        None,
+        topk,
+        a.shape[1],
+        BLOCK_SIZE=512,
+    )
+    gateup_input = torch.empty(
+        gateup_input_pre_reorder.shape, dtype=torch.float8_e4m3fn, device=device
+    )
+    per_tensor_quant_fp8(gateup_input_pre_reorder, gateup_input, a1_scale.float(), True)
+    del gateup_input_pre_reorder
+    local_topk_ids = topk_ids_
+    local_topk_ids = (
+        torch.where(local_topk_ids == -1, num_experts, topk_ids_).to(torch.int32)
+    ).contiguous()
+
+    a_map = torch.empty((local_topk_ids.numel()), dtype=torch.int32, device=device)
+    c_map = torch.empty((local_topk_ids.numel()), dtype=torch.int32, device=device)
+    get_cutlass_w4a8_moe_mm_data(
+        local_topk_ids,
+        expert_offsets,
+        problem_sizes1,
+        problem_sizes2,
+        a_map,
+        c_map,
+        num_experts,
+        n,
+        k,
+    )
+    c1 = torch.empty((m * topk, n * 2), device=device, dtype=torch.bfloat16)
+    c2 = torch.zeros((m * topk, k), device=device, dtype=torch.bfloat16)
+
+    cutlass_w4a8_moe_mm(
+        c1,
+        gateup_input,
+        w1_q,
+        a1_scale.float(),
+        w1_scale,
+        expert_offsets[:-1],
+        problem_sizes1,
+        a_strides1,
+        b_strides1,
+        c_strides1,
+        s_strides13,
+        128,
+        topk,
+    )
+    intermediate = torch.empty((m * topk, n), device=device, dtype=torch.bfloat16)
+    silu_and_mul(c1, intermediate)
+
+    intermediate_q = torch.empty(
+        intermediate.shape, dtype=torch.float8_e4m3fn, device=device
+    )
+    per_tensor_quant_fp8(intermediate, intermediate_q, a2_scale.float(), True)
+
+    cutlass_w4a8_moe_mm(
+        c2,
+        intermediate_q,
+        w2_q,
+        a2_scale.float(),
+        w2_scale,
+        expert_offsets[:-1],
+        problem_sizes2,
+        a_strides2,
+        b_strides2,
+        c_strides2,
+        s_strides2,
+        128,
+        topk,
+    )
+    num_tokens = src2dst.shape[0] // topk
+    output = torch.empty(
+        (num_tokens, c2.shape[1]),
+        device=c2.device,
+        dtype=torch.bfloat16,
+    )
+    deepep_post_reorder_triton_kernel[(num_tokens,)](
+        c2,
+        output,
+        src2dst,
+        topk_ids_,
+        topk_weights,
+        topk,
+        c2.shape[1],
+        BLOCK_SIZE=512,
+    )
+
+    return output
+
+
+def cutlass_w4a8_moe_deepep_ll(
+    a: torch.Tensor,
+    w1_q: torch.Tensor,
+    w2_q: torch.Tensor,
+    w1_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    topk_ids_: torch.Tensor,
+    masked_m: torch.Tensor,
+    a_strides1: torch.Tensor,
+    b_strides1: torch.Tensor,
+    c_strides1: torch.Tensor,
+    a_strides2: torch.Tensor,
+    b_strides2: torch.Tensor,
+    c_strides2: torch.Tensor,
+    s_strides13: torch.Tensor,
+    s_strides2: torch.Tensor,
+    expert_offsets: torch.Tensor,
+    problem_sizes1: torch.Tensor,
+    problem_sizes2: torch.Tensor,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    """
+    This function computes a w4a8-quantized Mixture of Experts (MoE) layer
+    using two sets of quantized weights, w1_q and w2_q, and top-k gating
+    mechanism. The matrix multiplications are implemented with CUTLASS
+    grouped gemm.
+
+    Parameters:
+    - a (torch.Tensor): The input tensor to the MoE layer.
+        Shape: [num_local_experts, num_max_dispatch_tokens_per_rank * num_ranks, K]
+    - w1_q (torch.Tensor): The first set of int4-quantized expert weights.
+        Shape: [num_experts, N * 2,  K // 2]
+        (the weights are passed transposed and int4-packed)
+    - w2_q (torch.Tensor): The second set of int4-quantized expert weights.
+        Shape: [num_experts, K, N // 2]
+        (the weights are passed transposed and int4-packed)
+    - w1_scale (torch.Tensor): The fp32 scale to dequantize w1_q.
+        Shape: [num_experts, K // 512, N * 8]
+    - w2_scale (torch.Tensor): The fp32 scale to dequantize w2_q.
+        Shape: [num_experts, N // 512, K * 4]
+    - topk_weights (torch.Tensor): The weights of each token->expert mapping.
+    - a_strides1 (torch.Tensor): The input strides of the first grouped gemm.
+    - b_strides1 (torch.Tensor): The weights strides of the first grouped gemm.
+    - c_strides1 (torch.Tensor): The output strides of the first grouped gemm.
+    - a_strides2 (torch.Tensor): The input strides of the second grouped gemm.
+    - b_strides2 (torch.Tensor): The weights strides of the second grouped gemm.
+    - c_strides2 (torch.Tensor): The output strides of the second grouped gemm.
+    - s_strides13 (torch.Tensor): The input and scale strides of the first grouped gemm.
+    - s_strides2 (torch.Tensor): The scale strides of the second grouped gemm.
+    - a1_scale (Optional[torch.Tensor]): The optional fp32 scale to quantize a.
+        Shape: scalar or [1, K]
+    - a2_scale (Optional[torch.Tensor]): The optional fp32 scale to
+        quantize the intermediate result between the gemms.
+        Shape: scalar or [1, N]
+    - apply_router_weight_on_input (bool): When true, the topk weights are
+        applied directly on the inputs. This is only applicable when topk is 1.
+
+    Returns:
+    - torch.Tensor: The fp8 output tensor after applying the MoE layer.
+    """
+    assert w1_q.dtype == torch.int8
+    assert w2_q.dtype == torch.int8
+    assert a.shape[2] // 2 == w1_q.shape[2], "Hidden size mismatch w1"
+    assert w1_q.shape[2] * 2 == w2_q.shape[1], "Hidden size mismatch w2"
+    assert w1_q.shape[0] == w2_q.shape[0], "Expert number mismatch"
+    assert w1_q.shape[0] == w1_scale.shape[0], "w1 scales expert number mismatch"
+    assert w1_q.shape[0] == w2_scale.shape[0], "w2 scales expert number mismatch"
+
+    assert a_strides1.shape[0] == w1_q.shape[0], "A Strides 1 expert number mismatch"
+    assert b_strides1.shape[0] == w1_q.shape[0], "B Strides 1 expert number mismatch"
+    assert a_strides2.shape[0] == w2_q.shape[0], "A Strides 2 expert number mismatch"
+    assert b_strides2.shape[0] == w2_q.shape[0], "B Strides 2 expert number mismatch"
+    num_experts = w1_q.size(0)
+    m = a.size(1)
+    k = w1_q.size(2) * 2  # w1_q is transposed and packed
+    n = w2_q.size(2) * 2  # w2_q is transposed and packed
+    topk = topk_ids_.size(1)
+
+    device = a.device
+
+    problem_sizes1, problem_sizes2 = deepep_ll_get_cutlass_w4a8_moe_mm_data(
+        masked_m,
+        problem_sizes1,
+        problem_sizes2,
+        num_experts,
+        n,
+        k,
+    )
+
+    gateup_input = torch.empty(a.shape, dtype=torch.float8_e4m3fn, device=device)
+    per_tensor_quant_fp8(a, gateup_input, a1_scale.float(), True)
+    c1 = torch.empty((num_experts, m, n * 2), device=device, dtype=torch.bfloat16)
+    c2 = torch.empty((num_experts, m, k), device=device, dtype=torch.bfloat16)
+
+    cutlass_w4a8_moe_mm(
+        c1,
+        gateup_input,
+        w1_q,
+        a1_scale.float(),
+        w1_scale,
+        expert_offsets[:-1],
+        problem_sizes1,
+        a_strides1,
+        b_strides1,
+        c_strides1,
+        s_strides13,
+        128,
+        topk,
+    )
+
+    intermediate_q = torch.empty(
+        (num_experts, m, n), device=a.device, dtype=torch.float8_e4m3fn
+    )
+    silu_and_mul_masked_post_per_tensor_quant_fwd(
+        c1, intermediate_q, masked_m, a2_scale
+    )
+    cutlass_w4a8_moe_mm(
+        c2,
+        intermediate_q,
+        w2_q,
+        a2_scale.float(),
+        w2_scale,
+        expert_offsets[:-1],
+        problem_sizes2,
+        a_strides2,
+        b_strides2,
+        c_strides2,
+        s_strides2,
+        128,
+        topk,
+    )
+
+    return c2
diff --git a/sglang/python/sglang/srt/layers/moe/flashinfer_cutedsl_moe.py b/sglang/python/sglang/srt/layers/moe/flashinfer_cutedsl_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..74455c93121a2fd5cba2c626953fcf6493ee583a
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/flashinfer_cutedsl_moe.py
@@ -0,0 +1,183 @@
+from typing import Optional
+
+import torch
+from flashinfer import (
+    scaled_fp4_grouped_quantize,
+    silu_and_mul_scaled_nvfp4_experts_quantize,
+)
+from flashinfer.cute_dsl.blockscaled_gemm import grouped_gemm_nt_masked
+
+
+def get_cute_dtype(input: torch.Tensor) -> str:
+    if input.dtype == torch.bfloat16:
+        return "bfloat16"
+    elif input.dtype == torch.float16:
+        return "float16"
+    elif input.dtype == torch.float32:
+        return "float32"
+    else:
+        raise ValueError(f"Unsupported cute dtype {input.dtype}")
+
+
+def flashinfer_cutedsl_moe_masked(
+    hidden_states: tuple[torch.Tensor, Optional[torch.Tensor]],
+    input_global_scale: torch.Tensor,
+    w1: torch.Tensor,
+    w1_blockscale: torch.Tensor,
+    w1_alpha,
+    w2: torch.Tensor,
+    a2_global_scale: torch.Tensor,
+    w2_blockscale: torch.Tensor,
+    w2_alpha,
+    masked_m: torch.Tensor,
+    down_sm_count: Optional[int] = None,
+    down_signals: Optional[torch.Tensor] = None,
+    down_start_event: Optional[torch.cuda.Event] = None,
+):
+    """
+    Perform masked Mixture-of-Experts computation with FlashInfer's CuteDSL
+    kernels.
+
+    Args:
+        hidden_states: Either of the following case
+            * tuple[torch.Tensor, None]: [num_experts, m, k], bf16, None means no quant
+            * tuple[torch.Tensor, torch.Tensor]: [num_experts, m, k // 2], uint8, [num_experts, m, k // 16], float8_e4m3fn
+        input_global_scale (torch.Tensor): (l,)
+        w1 (torch.Tensor): fp4 weights, [l, 2 * n, k // 2], uint8
+        w1_blockscale (torch.Tensor): blockscale factors, e4m3,
+        w1_alpha (torch.Tensor): (l,)
+        w2 (torch.Tensor): fp4 weights, [l, k, n // 2], uint8
+        a2_global_scale (torch.Tensor): (l,)
+        w2_blockscale (torch.Tensor): blockscale factors, e4m3,
+        w2_alpha (torch.Tensor): (l,)
+        masked_m (torch.Tensor): Masked dimension indices
+
+    Notes:
+        - Assumes max(masked_m) == m.
+    """
+
+    # === Assertions on dtypes ===
+    assert w1.dtype == torch.uint8, f"w1 must be uint8 (fp4 packed), got {w1.dtype}"
+    assert (
+        w1_blockscale.dtype == torch.float8_e4m3fn
+    ), f"w1_blockscale must be float8_e4m3fn, got {w1_blockscale.dtype}"
+    assert (
+        w1_alpha.dtype == torch.float32
+    ), f"w1_alpha must be float32, got {w1_alpha.dtype}"
+    assert w2.dtype == torch.uint8, f"w2 must be uint8 (fp4 packed), got {w2.dtype}"
+    assert (
+        a2_global_scale.dtype == torch.float32
+    ), f"a2_global_scale must be float32, got {a2_global_scale.dtype}"
+    assert (
+        w2_blockscale.dtype == torch.float8_e4m3fn
+    ), f"w2_blockscale must be float8_e4m3fn, got {w2_blockscale.dtype}"
+    assert (
+        w2_alpha.dtype == torch.float32
+    ), f"w2_alpha must be float32, got {w2_alpha.dtype}"
+    assert (
+        len(hidden_states) == 2
+    ), f"hidden_states must be a tuple of length 2, got {len(hidden_states)}"
+
+    # === Assertions on shapes ===
+    n = w2.shape[-1] * 2  # intermediate dimension
+
+    if hidden_states[1] is not None:
+
+        a_q = hidden_states[0].view(torch.uint8)
+        a_q_sf = hidden_states[1].view(torch.float8_e4m3fn)
+        m, k_by_2, num_experts = a_q.shape
+        k = k_by_2 * 2
+    else:
+        num_experts, m, k = hidden_states[0].shape
+
+        assert (
+            input_global_scale.dtype == torch.float32
+        ), f"input_global_scale must be float32, got {input_global_scale.dtype}"
+        assert input_global_scale.shape == (
+            num_experts,
+        ), f"input_global_scale must be (l,), got {input_global_scale.shape}"
+
+        a_q, a_q_sf = scaled_fp4_grouped_quantize(
+            hidden_states[0],
+            masked_m,
+            input_global_scale,
+        )
+
+    assert w1.shape[-2] == 2 * n, f"w1 last-2 dim must be 2*n, got {w1.shape}"
+    assert (
+        w1.shape[-1] * 2 == k
+    ), f"w1 last dim * 2 must equal k, got {w1.shape[-1]} vs k={k}"
+    assert w2.shape[-2:] == (
+        k,
+        n // 2,
+    ), f"w2 shape mismatch, got {w2.shape[-2:]}, expected {(k, n//2)}"
+    assert w1_alpha.shape == (
+        num_experts,
+    ), f"w1_alpha must be (l,), got {w1_alpha.shape}"
+    assert a2_global_scale.shape == (
+        num_experts,
+    ), f"a2_global_scale must be (l,), got {a2_global_scale.shape}"
+    assert w2_alpha.shape == (
+        num_experts,
+    ), f"w2_alpha must be (l,), got {w2_alpha.shape}"
+
+    # TODO(kaixih@nvidia): dtype should be based on inputs.
+    gateup_output = torch.empty(
+        (num_experts, m, n * 2), dtype=torch.bfloat16, device=a_q.device
+    )
+    gateup_output = gateup_output.permute(1, 2, 0)  # requirement of kernel
+    sf_vec_size = 16
+    assert a_q_sf.dtype == torch.float8_e4m3fn
+    assert a_q.dtype == torch.uint8
+    ab_dtype = "float4_e2m1fn"
+    sf_dtype = "float8_e4m3fn"
+    c_dtype = "bfloat16"
+
+    # Gemm1
+    grouped_gemm_nt_masked(
+        (a_q, a_q_sf),
+        (w1.permute(1, 2, 0), w1_blockscale),
+        gateup_output,
+        masked_m,
+        ab_dtype=ab_dtype,
+        sf_dtype=sf_dtype,
+        c_dtype=c_dtype,
+        sf_vec_size=sf_vec_size,
+        alpha=w1_alpha.view(1, 1, num_experts),
+        alpha_dtype=get_cute_dtype(w1_alpha),
+    )  # in logical [m, n, l]
+
+    # SILU and quantization
+    diq, diq_sf = silu_and_mul_scaled_nvfp4_experts_quantize(
+        gateup_output.permute(2, 0, 1),
+        masked_m,
+        a2_global_scale,
+    )
+
+    if down_start_event is not None:
+        down_start_event.record()
+
+    # Gemm2
+    out = torch.empty((num_experts, m, k), dtype=torch.bfloat16, device=a_q.device)
+    out = out.permute(1, 2, 0)  # requirement of kernel
+    grouped_gemm_nt_masked(
+        (diq, diq_sf),
+        (w2.permute(1, 2, 0), w2_blockscale),
+        out,
+        masked_m,
+        ab_dtype=ab_dtype,
+        sf_dtype=sf_dtype,
+        c_dtype=c_dtype,
+        sf_vec_size=sf_vec_size,
+        alpha=w2_alpha.view(1, 1, num_experts),
+        alpha_dtype=get_cute_dtype(w2_alpha),
+        **(
+            dict(
+                sm_count=down_sm_count,
+                dst_signals=down_signals,
+            )
+            if down_sm_count is not None or down_signals is not None
+            else {}
+        ),
+    )  # in logical [m, k, l]
+    return out.permute(2, 0, 1)
diff --git a/sglang/python/sglang/srt/layers/moe/fused_moe_native.py b/sglang/python/sglang/srt/layers/moe/fused_moe_native.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a9070fe3122b222e2a5adcb63c6186041cb2d8b
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/fused_moe_native.py
@@ -0,0 +1,107 @@
+"""
+Torch-native implementation for FusedMoE. This is used for torch.compile.
+It is based on https://github.com/pytorch-labs/gpt-fast/blob/32971d3129541c5bfb4f715abc33d1c5f408d204/mixtral-moe/model.py#L204
+"""
+
+import torch
+from torch.nn import functional as F
+
+from sglang.srt.layers.activation import GeluAndMul, SiluAndMul
+from sglang.srt.layers.moe.moe_runner import MoeRunnerConfig
+from sglang.srt.layers.moe.token_dispatcher import (
+    StandardCombineInput,
+    StandardDispatchOutput,
+)
+from sglang.srt.layers.moe.topk import StandardTopKOutput
+
+
+def fused_moe_forward_native(
+    layer: torch.nn.Module,
+    dispatch_output: StandardDispatchOutput,
+) -> StandardCombineInput:
+
+    x, x_scale, topk_output = dispatch_output
+    moe_runner_config = layer.moe_runner_config
+
+    if moe_runner_config.apply_router_weight_on_input:
+        raise NotImplementedError()
+
+    topk_weights, topk_ids, _ = topk_output
+
+    w13_weights = layer.w13_weight[topk_ids]
+    w1_weights, w3_weights = torch.chunk(w13_weights, 2, dim=2)
+    w2_weights = layer.w2_weight[topk_ids]
+    x1 = torch.einsum("ti,taoi -> tao", x, w1_weights)
+    if moe_runner_config.activation == "silu":
+        x1 = F.silu(x1)
+    elif moe_runner_config.activation == "gelu":
+        x1 = F.gelu(x1)
+    else:
+        raise ValueError(f"Unsupported activation: {moe_runner_config.activation=}")
+    x3 = torch.einsum("ti, taoi -> tao", x, w3_weights)
+    expert_outs = torch.einsum("tao, taio -> tai", (x1 * x3), w2_weights)
+    expert_outs = torch.einsum(
+        "tai,ta -> ti", expert_outs, topk_weights.to(expert_outs.dtype)
+    )
+    return StandardCombineInput(hidden_states=expert_outs)
+
+
+def moe_forward_native(
+    layer: torch.nn.Module,
+    x: torch.Tensor,
+    topk_output: StandardTopKOutput,
+    moe_runner_config: MoeRunnerConfig,
+) -> torch.Tensor:
+
+    if moe_runner_config.apply_router_weight_on_input:
+        raise NotImplementedError()
+
+    topk_weights, topk_ids, _ = topk_output
+
+    # Ref code from https://huggingface.co/deepseek-ai/DeepSeek-V2/blob/e0828e3cc0a03408724b80c3cc92c8e072db8d01/modeling_deepseek.py#L589
+    len_experts = layer.num_experts
+
+    cnts = topk_ids.new_zeros((topk_ids.shape[0], len_experts))
+    cnts.scatter_(1, topk_ids.to(torch.int64), 1)
+    tokens_per_expert = cnts.sum(dim=0)
+    idxs = topk_ids.view(-1).argsort()
+
+    sorted_tokens = x[idxs // topk_ids.shape[1]]
+    tokens_per_expert = tokens_per_expert.cpu().numpy()
+
+    if moe_runner_config.activation == "silu":
+        act = SiluAndMul()
+    elif moe_runner_config.activation == "gelu":
+        act = GeluAndMul()
+    else:
+        raise ValueError(f"Unsupported activation: {moe_runner_config.activation=}")
+
+    outputs = []
+    start_idx = 0
+    for i, num_tokens in enumerate(tokens_per_expert):
+        end_idx = start_idx + num_tokens
+        if num_tokens == 0:
+            continue
+        tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
+
+        layer_w13_weight = layer.w13_weight[i]
+        layer_w2_weight = layer.w2_weight[i]
+
+        gate_up = F.linear(tokens_for_this_expert, layer_w13_weight)
+        gate_up = act(gate_up)
+        expert_out = F.linear(gate_up, layer_w2_weight)
+        outputs.append(expert_out)
+        start_idx = end_idx
+
+    outs = torch.cat(outputs, dim=0) if len(outputs) else sorted_tokens.new_empty(0)
+    new_x = torch.empty_like(outs)
+
+    new_x[idxs] = outs
+    final_out = (
+        new_x.view(*topk_ids.shape, -1)
+        .type(topk_weights.dtype)
+        .mul_(topk_weights.unsqueeze(dim=-1))
+        .sum(dim=1)
+        .type(new_x.dtype)
+    )
+    return final_out
diff --git a/sglang/python/sglang/srt/layers/moe/kt_ep_wrapper.py b/sglang/python/sglang/srt/layers/moe/kt_ep_wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..128853931805f4e798a334307787c58e09e9149e
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/kt_ep_wrapper.py
@@ -0,0 +1,393 @@
+# SPDX-License-Identifier: Apache-2.0
+"""
+KT Expert Parallelism Wrapper for MoE layers.
+
+This module provides a generic wrapper that enables CPU-GPU expert parallelism
+for any MoE quantization method. It coordinates parallel execution of GPU experts
+(using any quantization method) and CPU experts (using AMX/AVX instructions).
+"""
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Optional
+
+import torch
+
+from sglang.srt.distributed import get_tensor_model_parallel_rank
+from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase
+from sglang.srt.utils import get_compiler_backend
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe import MoeRunnerConfig
+    from sglang.srt.layers.moe.token_dispatcher import (
+        CombineInput,
+        StandardDispatchOutput,
+    )
+    from sglang.srt.server_args import ServerArgs
+
+try:
+    from kt_kernel import KTMoEWrapper
+
+    KTRANSFORMERS_AVAILABLE = True
+except ImportError:
+    KTRANSFORMERS_AVAILABLE = False
+
+
+@dataclass
+class KTConfig:
+    """Configuration for KTransformers heterogeneous computing CPU part.
+
+    Args:
+        layer_idx: Layer index in the model
+        num_gpu_experts: Number of experts to run on GPU
+        cpuinfer_threads: Number of CPU inference threads
+        threadpool_count: Number of thread pools for CPU computation
+        weight_path: Path to CPU quantized weights
+        chunked_prefill_size: Chunk size for prefill computation
+        method: CPU computation method (e.g., "int4")
+        num_layers: Total number of layers in the model (optional)
+    """
+
+    layer_idx: int
+    num_gpu_experts: int
+    cpuinfer_threads: int
+    threadpool_count: int
+    weight_path: str
+    chunked_prefill_size: int
+    max_deferred_experts_per_token: int
+    method: str
+    num_layers: Optional[int] = None
+
+
+def create_kt_config_from_server_args(
+    server_args: "ServerArgs", layer_idx: int
+) -> Optional[KTConfig]:
+    """Create KTConfig from ServerArgs if KT is configured.
+
+    Args:
+        server_args: Global server arguments
+        layer_idx: Layer index in the model
+
+    Returns:
+        KTConfig if KT is configured, None otherwise
+    """
+    if server_args.kt_weight_path is None:
+        return None
+
+    # Try to get num_layers from model config
+    num_layers = None
+    try:
+        hf_config = server_args.get_hf_config()
+        num_layers = getattr(hf_config, "num_hidden_layers", None)
+    except Exception:
+        # If we can't get the config, num_layers will be None
+        pass
+
+    return KTConfig(
+        layer_idx=layer_idx,
+        num_gpu_experts=server_args.kt_num_gpu_experts,
+        cpuinfer_threads=server_args.kt_cpuinfer,
+        threadpool_count=server_args.kt_threadpool_count,
+        weight_path=server_args.kt_weight_path,
+        chunked_prefill_size=server_args.chunked_prefill_size,
+        method=server_args.kt_method,
+        max_deferred_experts_per_token=server_args.kt_max_deferred_experts_per_token,
+        num_layers=num_layers,
+    )
+
+
+@torch.compile(dynamic=True, backend=get_compiler_backend())
+def mask_cpu_expert_ids(topk_ids: torch.Tensor, num_gpu_experts: int) -> torch.Tensor:
+    """Mask CPU expert IDs by setting them to -1.
+
+    This function masks expert IDs that should be computed on CPU (IDs >= num_gpu_experts)
+    so they won't be computed on GPU. The masked IDs are set to -1, which causes the
+    GPU MoE kernel to skip those experts.
+
+    Args:
+        topk_ids: Tensor of shape [num_tokens, top_k] containing expert IDs
+        num_gpu_experts: Number of experts that should run on GPU (experts 0 to num_gpu_experts-1)
+
+    Returns:
+        Modified topk_ids tensor with CPU expert IDs masked as -1
+    """
+    topk_ids[topk_ids >= num_gpu_experts] = -1
+    return topk_ids
+
+
+class KTEPWrapperMethod(FusedMoEMethodBase):
+    """Wrapper for any MoE quantization method to enable CPU-GPU expert parallelism.
+
+    This wrapper coordinates parallel execution of:
+    - GPU experts (0 to num_gpu_experts-1) using any quantization method
+    - CPU experts (num_gpu_experts to total_experts-1) using AMX/AVX instructions
+
+    The wrapper implements the submit-compute-sync pattern:
+    1. Submit CPU expert computation (non-blocking)
+    2. Execute GPU expert computation in parallel
+    3. Synchronize and merge CPU+GPU results
+
+    Example:
+        # Wrap any GPU method with AMX/AVX CPU expert support
+        gpu_method = CompressedTensorsWNA16MoE(quant_config, prefix)
+        kt_config = KTConfig(layer_idx=0, num_gpu_experts=4, ...)
+        method = KTEPWrapperMethod(gpu_method, kt_config)
+    """
+
+    def __init__(
+        self,
+        gpu_method: FusedMoEMethodBase,
+        kt_config: KTConfig,
+    ):
+        """Initialize the KT EP wrapper.
+
+        Args:
+            gpu_method: The quantization method to use for GPU experts
+            kt_config: Configuration for KT CPU expert computation
+        """
+        if not KTRANSFORMERS_AVAILABLE:
+            raise ImportError(
+                "kt_kernel is not installed. To use KTransformers EP wrapper, please install kt_kernel."
+            )
+
+        self.gpu_method = gpu_method
+        self.kt_config = kt_config
+        self.num_gpu_experts = kt_config.num_gpu_experts
+        self.override_num_local_experts = True
+        self.gpu_method.num_gpu_experts = self.num_gpu_experts
+        self.tp_rank = get_tensor_model_parallel_rank()
+
+        # KT wrapper will be initialized in create_weights
+        self.wrapper: Optional[KTMoEWrapper] = None
+
+        # Store parameters needed for KT initialization
+        self._layer_params = None
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        """Create weights for both GPU and CPU experts.
+
+        Args:
+            layer: The MoE layer module
+            num_experts: Total number of experts (GPU + CPU)
+            hidden_size: Hidden dimension size
+            intermediate_size_per_partition: Intermediate size per TP partition
+            params_dtype: Data type for parameters
+            **extra_weight_attrs: Additional weight attributes
+        """
+        self.global_num_experts = num_experts
+        self.hidden_size = hidden_size
+        self.intermediate_size_per_partition = intermediate_size_per_partition
+
+        # Get required parameters from layer object
+        # top_k: number of experts selected per token
+        num_experts_per_tok = layer.top_k
+
+        # intermediate_size_full: full intermediate size before TP partitioning
+        intermediate_size_full = (
+            layer.intermediate_size_per_partition * layer.moe_tp_size
+        )
+
+        layer_max_deferred = self.kt_config.max_deferred_experts_per_token or 0
+        if (
+            self.kt_config.max_deferred_experts_per_token is not None
+            and self.kt_config.num_layers is not None
+            and self.kt_config.layer_idx == self.kt_config.num_layers - 1
+        ):
+            layer_max_deferred = 0
+
+        # 1. Create weights for GPU experts using the wrapped method
+        # GPU experts: 0 to num_gpu_experts-1
+        self.gpu_method.create_weights(
+            layer=layer,
+            num_experts=self.num_gpu_experts,
+            hidden_size=hidden_size,
+            intermediate_size_per_partition=intermediate_size_per_partition,
+            params_dtype=params_dtype,
+            **extra_weight_attrs,
+        )
+
+        # 2. Initialize KT wrapper for CPU experts
+        # CPU experts: num_gpu_experts to num_experts-1
+        if self.tp_rank == 0:
+            self.wrapper = KTMoEWrapper(
+                layer_idx=self.kt_config.layer_idx,
+                num_experts=num_experts,
+                num_experts_per_tok=num_experts_per_tok,
+                hidden_size=hidden_size,
+                moe_intermediate_size=intermediate_size_full,
+                num_gpu_experts=self.num_gpu_experts,
+                cpuinfer_threads=self.kt_config.cpuinfer_threads,
+                threadpool_count=self.kt_config.threadpool_count,
+                weight_path=self.kt_config.weight_path,
+                chunked_prefill_size=self.kt_config.chunked_prefill_size,
+                method=self.kt_config.method,
+                max_deferred_experts_per_token=layer_max_deferred,
+            )
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        """Process weights after loading from checkpoint.
+
+        Args:
+            layer: The MoE layer module
+        """
+        # 1. Process GPU weights
+        if hasattr(self.gpu_method, "process_weights_after_loading"):
+            self.gpu_method.process_weights_after_loading(layer)
+
+        # 2. Load CPU weights using KT wrapper
+        if self.tp_rank == 0 and self.wrapper is not None:
+            torch.cuda.synchronize()
+
+            # Get expert location metadata for CPU expert mapping
+            from sglang.srt.eplb.expert_location_dispatch import (
+                get_global_expert_location_metadata,
+            )
+
+            physical_to_logical_map_cpu = (
+                get_global_expert_location_metadata()
+                .physical_to_logical_map_cpu[self.kt_config.layer_idx]
+                .contiguous()
+            )
+            self.wrapper.load_weights(physical_to_logical_map_cpu)
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig"
+    ):
+        """Create MoE runner for computation.
+
+        Args:
+            layer: The MoE layer module
+            moe_runner_config: Configuration for MoE runner
+        """
+        self.moe_runner_config = moe_runner_config
+        if self.override_num_local_experts:
+            moe_runner_config.num_local_experts = self.num_gpu_experts
+        # Delegate to GPU method to create its runner
+        self.gpu_method.create_moe_runner(layer, moe_runner_config)
+
+    def submit(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: "StandardDispatchOutput",
+    ) -> None:
+        """Submit CPU expert computation asynchronously (non-blocking).
+
+        This method submits the CPU expert computation to AMX/AVX without waiting
+        for completion, allowing GPU computation to proceed in parallel.
+
+        Args:
+            layer: The MoE layer module
+            dispatch_output: Dispatched tokens and routing information
+        """
+        assert (
+            self.moe_runner_config.activation == "silu"
+        ), "Only SiLU activation is supported."
+
+        if self.tp_rank != 0 or self.wrapper is None:
+            return
+
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+        topk_weights, topk_ids, _ = topk_output
+
+        # Submit forward task to CPU (non-blocking)
+        self.wrapper.submit_forward(
+            x, topk_ids, topk_weights, torch.cuda.current_stream(x.device).cuda_stream
+        )
+
+    def sync(self, x: torch.Tensor) -> torch.Tensor:
+        """Synchronize and retrieve CPU expert computation results.
+
+        This method waits for the CPU computation to complete and returns the results.
+
+        Args:
+            x: Reference tensor for shape and device information
+
+        Returns:
+            CPU expert computation results
+        """
+        if self.tp_rank != 0 or self.wrapper is None:
+            return torch.zeros_like(x)
+
+        # Wait for CPU computation and retrieve results
+        return self.wrapper.sync_forward(
+            x, torch.cuda.current_stream(x.device).cuda_stream
+        )
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: "StandardDispatchOutput",
+    ) -> "CombineInput":
+        """Execute hybrid CPU+GPU MoE forward pass with parallelism.
+
+        This is the main computation method that coordinates:
+        1. Submit CPU expert computation (non-blocking)
+        2. Execute GPU expert computation in parallel
+        3. Synchronize CPU results and merge with GPU results
+
+        Args:
+            layer: The MoE layer module
+            dispatch_output: Dispatched tokens and routing information
+
+        Returns:
+            Combined computation results from CPU and GPU experts
+        """
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+
+        # Step 1: Submit CPU expert computation (non-blocking)
+        if self.tp_rank == 0:
+            self.submit(layer, dispatch_output)
+
+        # Step 2: Prepare GPU computation by masking CPU expert IDs
+        # CPU expert IDs (>= num_gpu_experts) are set to -1 so GPU kernel skips them
+        topk_ids = topk_output.topk_ids
+        masked_topk_ids = mask_cpu_expert_ids(topk_ids, self.num_gpu_experts)
+
+        # Create modified dispatch output for GPU computation
+        masked_topk_output = topk_output._replace(topk_ids=masked_topk_ids)
+        masked_dispatch_output = dispatch_output._replace(
+            topk_output=masked_topk_output
+        )
+
+        # Step 3: Execute GPU expert computation (any quantization method)
+        # This runs in parallel with CPU computation
+        gpu_combine_input = self.gpu_method.apply(layer, masked_dispatch_output)
+
+        # Step 4: Synchronize CPU results and merge with GPU results
+        output = gpu_combine_input.hidden_states
+        if self.tp_rank == 0:
+            cpu_output = self.sync(x)
+            output = output + cpu_output
+
+        return StandardCombineInput(hidden_states=output)
+
+    def __getattr__(self, name: str):
+        """Delegate attribute access to the wrapped GPU method.
+
+        This allows the wrapper to transparently expose attributes and methods
+        from the wrapped GPU quantization method.
+
+        Args:
+            name: Attribute name
+
+        Returns:
+            Attribute value from gpu_method
+        """
+        # Avoid infinite recursion for internal attributes
+        if name in ("gpu_method", "wrapper", "kt_config"):
+            raise AttributeError(
+                f"'{type(self).__name__}' object has no attribute '{name}'"
+            )
+
+        return getattr(self.gpu_method, name)
diff --git a/sglang/python/sglang/srt/layers/moe/rocm_moe_utils.py b/sglang/python/sglang/srt/layers/moe/rocm_moe_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..77a58a2f4fe2aebb43aada9e4d1e8733274555ed
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/rocm_moe_utils.py
@@ -0,0 +1,189 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.9.1rc2/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from enum import IntEnum
+from typing import Optional
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.utils import get_bool_env_var, is_hip
+from sglang.srt.utils.custom_op import register_custom_op
+
+_is_hip = is_hip()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+
+
+class ActivationMethod(IntEnum):
+    # This allows interfacing with AITER ActivationType enum
+    # without importing the ActivationType enum from AITER globally.
+    SILU = 0
+    GELU = 1
+
+
+# NOTE: for non _use_aiter case, use lazy registration to avoid overhead
+# (registration may not be trigger actually, since it will not be called)
+@register_custom_op(out_shape="hidden_states", eager=_use_aiter)
+def rocm_aiter_asm_moe_tkw1(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    fc1_scale: Optional[torch.Tensor] = None,
+    fc2_scale: Optional[torch.Tensor] = None,
+    fc1_smooth_scale: Optional[torch.Tensor] = None,
+    fc2_smooth_scale: Optional[torch.Tensor] = None,
+    a16: bool = False,
+    per_tensor_quant_scale: Optional[torch.Tensor] = None,
+    expert_mask: Optional[torch.Tensor] = None,
+    activation_method: int = ActivationMethod.SILU.value,
+) -> torch.Tensor:
+
+    from aiter import ActivationType
+    from aiter.fused_moe_bf16_asm import asm_moe_tkw1
+
+    activation = ActivationType(activation_method)
+
+    return asm_moe_tkw1(
+        hidden_states,
+        w1,
+        w2,
+        topk_weights,
+        topk_ids,
+        fc1_scale=fc1_scale,
+        fc2_scale=fc2_scale,
+        fc1_smooth_scale=fc1_smooth_scale,
+        fc2_smooth_scale=fc2_smooth_scale,
+        a16=a16,
+        per_tensor_quant_scale=per_tensor_quant_scale,
+        expert_mask=expert_mask,
+        activation=activation,
+    )
+
+
+def rocm_fused_experts_tkw1(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    activation: str = "silu",
+    apply_router_weight_on_input: bool = False,
+    use_fp8_w8a8: bool = False,
+    per_channel_quant: bool = False,
+    w1_scale: Optional[torch.Tensor] = None,
+    w2_scale: Optional[torch.Tensor] = None,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+    block_shape: Optional[list[int]] = None,
+) -> torch.Tensor:
+
+    activation_method = (
+        ActivationMethod.SILU if activation == "silu" else ActivationMethod.GELU
+    )
+    # All AITER Fused MoE kernels are expecting the following datatypes
+    topk_weights = topk_weights.to(torch.float32)
+    topk_ids = topk_ids.to(torch.int32)
+
+    # w8a8 per-channel quantization
+    if per_channel_quant and apply_router_weight_on_input and use_fp8_w8a8:
+        # AITER tkw1 kernel for FP8 models with `apply_router_weight_on_input`
+        # This applies topk_weights on the GEMM output of the first FC layer
+        #  rather than the second FC.
+        assert (
+            topk_weights.dim() == 2
+        ), "`topk_weights` should be in shape (num_tokens, topk)"
+        assert topk_weights.shape[-1] == 1, (
+            "Only support topk=1 when" " `apply_router_weight_on_input` is True"
+        )
+
+        return rocm_aiter_asm_moe_tkw1(
+            hidden_states,
+            w1,
+            w2,
+            topk_weights,
+            topk_ids,
+            fc1_scale=w1_scale,
+            fc2_scale=w2_scale,
+            fc1_smooth_scale=None,
+            fc2_smooth_scale=None,
+            a16=False,
+            per_tensor_quant_scale=None,
+            expert_mask=None,
+            activation_method=activation_method,
+        )
+    else:
+        assert False, "This should not be called."
+
+
+@triton.jit
+def upscale_kernel(
+    A_ptr,  # *fp16 / *fp32
+    scale_ptr,  # *fp16 / *fp32
+    Out_ptr,  # *fp16 / *fp32
+    M,
+    N,
+    recv_token_num,
+    stride_am,
+    stride_an,
+    stride_sm,
+    stride_sn,
+    stride_om,
+    stride_on,
+    BLOCK_N: tl.constexpr,
+):
+    pid_m = tl.program_id(0)  # row id
+    pid_n = tl.program_id(1)  # block id along N
+
+    recv_token_num_val = tl.load(recv_token_num)
+
+    if pid_m >= recv_token_num_val:
+        return
+
+    # column offsets
+    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+    mask = offs_n < N
+
+    # A[m, n]
+    a_ptrs = A_ptr + pid_m * stride_am + offs_n * stride_an
+    a = tl.load(a_ptrs, mask=mask, other=0.0)
+
+    # scale index: n // 128
+    scale_idx = offs_n // 128
+    s_ptrs = scale_ptr + pid_m * stride_sm + scale_idx * stride_sn
+    s = tl.load(s_ptrs, mask=mask, other=1.0)
+
+    out = a * s
+
+    out_ptrs = Out_ptr + pid_m * stride_om + offs_n * stride_on
+    tl.store(out_ptrs, out, mask=mask)
+
+
+def upscale(hidden_state, hidden_state_scale, recv_token_num, output_dtype):
+    M, N = hidden_state.shape
+
+    Out = torch.empty_like(hidden_state, dtype=output_dtype)
+
+    BLOCK_N = 256
+
+    grid = (M, triton.cdiv(N, BLOCK_N))
+
+    upscale_kernel[grid](
+        hidden_state,
+        hidden_state_scale,
+        Out,
+        M,
+        N,
+        recv_token_num,
+        hidden_state.stride(0),
+        hidden_state.stride(1),
+        hidden_state_scale.stride(0),
+        hidden_state_scale.stride(1),
+        Out.stride(0),
+        Out.stride(1),
+        BLOCK_N=BLOCK_N,
+    )
+
+    return Out
diff --git a/sglang/python/sglang/srt/layers/moe/routed_experts_capturer.py b/sglang/python/sglang/srt/layers/moe/routed_experts_capturer.py
new file mode 100644
index 0000000000000000000000000000000000000000..00bd6875558759743301f438f2430df61e23396d
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/routed_experts_capturer.py
@@ -0,0 +1,289 @@
+import logging
+from abc import ABC
+from typing import Optional
+
+import numpy as np
+import pybase64
+import torch
+
+from sglang.srt.configs.model_config import ModelConfig
+from sglang.srt.layers.dp_attention import (
+    get_attention_dp_rank,
+    get_dp_local_info,
+    is_dp_attention_enabled,
+)
+from sglang.srt.mem_cache.memory_pool import ReqToTokenPool
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.server_args import get_global_server_args
+
+logger = logging.getLogger(__name__)
+
+_GB = 1024 * 1024 * 1024
+_MB = 1024 * 1024
+
+
+def get_tensor_size_bytes(t: torch.Tensor):
+    return np.prod(t.shape) * t.dtype.itemsize
+
+
+class _RoutedExpertsDeviceCache:
+    def __init__(
+        self,
+        max_running_requests: int,
+        num_hidden_layers: int,
+        num_experts_per_tok: int,
+        num_fused_shared_experts: int,
+        device: str,
+    ) -> None:
+        self.buffer = torch.zeros(
+            (
+                max(
+                    get_global_server_args().chunked_prefill_size
+                    * get_global_server_args().dp_size,
+                    max_running_requests,
+                ),
+                num_hidden_layers,
+                num_experts_per_tok + num_fused_shared_experts,
+            ),
+            dtype=torch.int32,
+            device=device,
+        )
+        self._finalize_allocation_log()
+
+    def get_buffer_size_bytes(self):
+        assert hasattr(self, "buffer")
+        return get_tensor_size_bytes(self.buffer)
+
+    def capture_fwd_routed_experts(self, layer_id: int, topk_ids: torch.Tensor):
+        assert layer_id is not None, "capturing routing experts but get layer_id None"
+        batch, _ = topk_ids.shape
+        self.buffer[:batch, layer_id, :] = topk_ids
+
+    def _finalize_allocation_log(self):
+        """Common logging and memory usage computation for captured experts buffers."""
+        buffer_size_MB = self.get_buffer_size_bytes() / _MB
+        logger.info(
+            f"Routing experts device buffer allocated. #shape: {tuple(self.buffer.shape)}, size: {buffer_size_MB:.2f} MB"
+        )
+
+
+class _RoutedExpertsHostCache:
+    def __init__(
+        self,
+        num_tokens: int,
+        num_hidden_layers: int,
+        num_experts_per_tok: int,
+    ) -> None:
+        self.num_tokens = num_tokens
+        self.buffer = torch.zeros(
+            (
+                num_tokens,
+                num_hidden_layers,
+                num_experts_per_tok,
+            ),
+            dtype=torch.int32,
+            device="cpu",
+            pin_memory=True,
+        )
+        self._finalize_allocation_log()
+
+    def get_buffer_size_bytes(self):
+        assert hasattr(self, "buffer")
+        return get_tensor_size_bytes(self.buffer)
+
+    def set_experts_buffer(self, layer_id: int, loc: torch.Tensor, top_k: torch.Tensor):
+        self.buffer[layer_id, loc, :] = top_k.to(device="cpu", non_blocking=True)
+
+    def _finalize_allocation_log(self):
+        """Common logging and memory usage computation for captured experts buffers."""
+        buffer_size_GB = self.get_buffer_size_bytes() / _GB
+        logger.info(
+            f"Routing experts host buffer allocated. #tokens: {self.num_tokens}, size: {buffer_size_GB:.2f} GB"
+        )
+
+
+class RoutedExpertsCapturer(ABC):
+    @staticmethod
+    def create(
+        enable: bool,
+        model_config: ModelConfig,
+        num_fused_shared_experts: int,
+        num_tokens: int,
+        max_running_requests: int,
+        device: str,
+    ):
+        if enable:
+            return _RoutedExpertsCapturerReal(
+                model_config,
+                num_tokens=num_tokens,
+                max_running_requests=max_running_requests,
+                num_fused_shared_experts=num_fused_shared_experts,
+                device=device,
+            )
+        else:
+            return _RoutedExpertsCapturerNoop()
+
+    def _sync_fwd_experts_buffer_DtoH(
+        self,
+        forward_batch: ForwardBatch,
+        can_run_graph: bool,
+        cuda_graph_batch: int,
+    ):
+        raise NotImplementedError
+
+    def capture(self, layer_id: int, topk_ids: torch.Tensor):
+        raise NotImplementedError
+
+    def get_routed_experts(
+        self,
+        req_pool_idx: int,
+        seqlen: int,
+        req_to_token_pool: ReqToTokenPool,
+    ):
+        raise NotImplementedError
+
+    def on_forward_end(self, forward_batch, can_run_graph, cuda_graph_batch):
+        raise NotImplementedError
+
+    def get_host_cache(self):
+        raise NotImplementedError
+
+    def get_device_cache(self):
+        raise NotImplementedError
+
+
+class _RoutedExpertsCapturerReal(RoutedExpertsCapturer):
+    """Capturer for routed experts with host buffer"""
+
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        num_tokens: int,
+        max_running_requests: int,
+        num_fused_shared_experts: int,
+        device: str,
+    ):
+        self.num_fused_shared_experts = num_fused_shared_experts
+        self.num_hidden_layers = model_config.hf_text_config.num_hidden_layers
+        self.num_experts_per_tok = model_config.hf_text_config.num_experts_per_tok
+
+        self.host_cache = _RoutedExpertsHostCache(
+            num_tokens=num_tokens,
+            num_hidden_layers=self.num_hidden_layers,
+            num_experts_per_tok=self.num_experts_per_tok,
+        )
+
+        self.device_cache = _RoutedExpertsDeviceCache(
+            max_running_requests=max_running_requests,
+            num_hidden_layers=self.num_hidden_layers,
+            num_experts_per_tok=self.num_experts_per_tok,
+            num_fused_shared_experts=self.num_fused_shared_experts,
+            device=device,
+        )
+
+    def _sync_fwd_experts_buffer_DtoH(
+        self,
+        forward_batch: ForwardBatch,
+        can_run_graph: bool,
+        cuda_graph_batch: int,
+    ):
+        if is_dp_attention_enabled():
+            local_start_pos, local_num_tokens = get_dp_local_info(forward_batch)
+            # handle with cuda graph padding
+            if can_run_graph:
+                local_start_pos = get_attention_dp_rank() * cuda_graph_batch
+                local_end_pos = local_start_pos + local_num_tokens
+            else:
+                local_end_pos = local_start_pos + local_num_tokens
+        else:
+            local_start_pos = 0
+            local_end_pos = forward_batch.out_cache_loc.shape[0]
+
+        # FIXME: sync explicitly here, overlap scheduler breaks here.
+        out_cache_loc_cpu = forward_batch.out_cache_loc.cpu()
+        self.host_cache.buffer[out_cache_loc_cpu] = self.device_cache.buffer[
+            local_start_pos:local_end_pos, :, : self.num_experts_per_tok
+        ].cpu()
+
+    def capture(self, layer_id: int, topk_ids: torch.Tensor):
+        self.device_cache.capture_fwd_routed_experts(layer_id, topk_ids)
+
+    def get_routed_experts(
+        self,
+        req_pool_idx: int,
+        seqlen: int,
+        req_to_token_pool: ReqToTokenPool,
+    ):
+        cache_pool_idx = (
+            req_to_token_pool.req_to_token[req_pool_idx][: seqlen - 1].cpu().clone()
+        )
+        return self.get_host_cache().buffer[cache_pool_idx]
+
+    def on_forward_end(self, forward_batch, can_run_graph, cuda_graph_batch):
+        self._sync_fwd_experts_buffer_DtoH(
+            forward_batch=forward_batch,
+            can_run_graph=can_run_graph,
+            cuda_graph_batch=cuda_graph_batch,
+        )
+
+    def get_host_cache(self):
+        return self.host_cache
+
+    def get_device_cache(self):
+        return self.device_cache
+
+
+class _RoutedExpertsCapturerNoop(RoutedExpertsCapturer):
+    def __init__(self):
+        pass
+
+    def _sync_fwd_experts_buffer_DtoH(
+        self,
+        forward_batch: ForwardBatch,
+        can_run_graph: bool,
+        cuda_graph_batch: int,
+    ):
+        pass
+
+    def capture(self, layer_id: int, topk_ids: torch.Tensor):
+        pass
+
+    def get_routed_experts(
+        self,
+        req_pool_idx: int,
+        seqlen: int,
+        req_to_token_pool: ReqToTokenPool,
+    ):
+        pass
+
+    def on_forward_end(self, forward_batch, can_run_graph, cuda_graph_batch):
+        pass
+
+    def get_host_cache(self):
+        pass
+
+    def get_device_cache(self):
+        pass
+
+
+_global_expert_capturer: Optional[RoutedExpertsCapturer] = _RoutedExpertsCapturerNoop()
+
+
+def get_global_experts_capturer():
+    return _global_expert_capturer
+
+
+def set_global_experts_capturer(capturer: RoutedExpertsCapturer):
+    global _global_expert_capturer
+    _global_expert_capturer = capturer
+
+
+def extract_routed_experts_from_meta_info(data):
+    # To solve the performance issue, we return the experts_ids in base64
+    # We left this function for user to change it back to normal int32
+    # See detokenizer_manager::_extract_routed_experts
+    routed_experts_base64 = data["meta_info"].get("routed_experts", None)
+    routed_experts = np.frombuffer(
+        pybase64.b64decode(routed_experts_base64.encode("utf-8")), dtype=np.int32
+    )
+    return routed_experts
diff --git a/sglang/python/sglang/srt/layers/moe/router.py b/sglang/python/sglang/srt/layers/moe/router.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c0b86e58d184064650022ee71a8fb0c7953d6d0
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/router.py
@@ -0,0 +1,428 @@
+from typing import Optional, Tuple
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.layers.moe.topk import fused_topk
+from sglang.srt.utils import is_hip
+
+_is_hip = is_hip()
+
+
+@triton.jit
+def fused_moe_router_cudacore_kernel(
+    input_ptr,  # input (bs, hidden_dim)
+    moe_router_weight_ptr,  # input (num_experts, hidden_dim)
+    topk_weights_ptr,  # output (bs, topk)
+    topk_ids_ptr,  # output (bs, topk)
+    correction_bias_ptr,
+    is_correction_bias: tl.constexpr,
+    num_experts: tl.constexpr,
+    topk: tl.constexpr,
+    moe_softcapping: tl.constexpr,
+    moe_renormalize: tl.constexpr,  # not supported
+    hidden_dim: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+
+    offsets = tl.arange(0, BLOCK_SIZE)
+    mask = offsets < hidden_dim
+
+    # moe_router_weight is k major
+    expert_offsets = tl.arange(0, num_experts)[:, None]
+    router_mask = mask[None, :]
+    w_router = tl.load(
+        moe_router_weight_ptr + expert_offsets * hidden_dim + offsets[None, :],
+        mask=router_mask,
+        other=0.0,
+    )
+
+    x = tl.load(input_ptr + pid * hidden_dim + offsets, mask=mask, other=0.0)
+
+    # todo: tl.dot?
+    logits = tl.sum((w_router.to(tl.float32) * x[None, :].to(tl.float32)), axis=-1)
+
+    # logit softcap
+    if moe_softcapping == 0:
+        logits_softcapped = logits
+    else:
+        logits_scaled = logits / moe_softcapping
+        exped = tl.exp(2 * logits_scaled)
+        top = exped - 1
+        bottom = exped + 1
+        logits_softcapped = top / bottom * moe_softcapping
+
+    # Add bias after softcapping
+    if is_correction_bias:
+        bias = tl.load(correction_bias_ptr + tl.arange(0, num_experts))
+        logits_softcapped = logits_softcapped + bias
+
+    # topk
+    # assert 1 <= topk <= num_experts
+
+    # 5.38 us
+
+    top1 = tl.argmax(logits_softcapped, axis=0)
+    tl.store(topk_ids_ptr + pid * topk + 0, top1)  # 5.63 us
+
+    top1_v = tl.max(logits_softcapped, axis=0)
+    invsumexp = 1.0 / tl.sum(tl.exp(logits_softcapped - top1_v), axis=0)
+
+    tl.store(
+        topk_weights_ptr + pid * topk + 0,
+        invsumexp,
+    )  # 5.73 us
+
+    if topk >= 2:
+        top2 = tl.argmax(
+            tl.where(
+                tl.arange(0, num_experts) != top1, logits_softcapped, float("-inf")
+            ),
+            axis=0,
+        )
+        tl.store(topk_ids_ptr + pid * topk + 1, top2)
+        top2_v = tl.sum(logits_softcapped * (tl.arange(0, num_experts) == top2), axis=0)
+        tl.store(
+            topk_weights_ptr + pid * topk + 1,
+            tl.exp(top2_v - top1_v) * invsumexp,
+        )  # 5.95us
+
+    # probably slow
+    if topk > 2:
+        topk_mask = tl.full(logits_softcapped.shape, 1.0, dtype=logits_softcapped.dtype)
+        topk_mask = tl.where(
+            tl.arange(0, num_experts) != top1, topk_mask, float("-inf")
+        )
+        topk_mask = tl.where(
+            tl.arange(0, num_experts) != top2, topk_mask, float("-inf")
+        )
+        for i in range(2, topk):
+            topi = tl.argmax(logits_softcapped + topk_mask, axis=0)
+            topk_mask = tl.where(
+                tl.arange(0, num_experts) != topi, topk_mask, float("-inf")
+            )
+            tl.store(topk_ids_ptr + pid * topk + i, topi)
+            topi_v = tl.sum(
+                logits_softcapped * (tl.arange(0, num_experts) == topi), axis=0
+            )
+            tl.store(
+                topk_weights_ptr + pid * topk + i,
+                tl.exp(topi_v - top1_v) * invsumexp,
+            )
+    # assert not moe_renormalize, "moe weight renormalization not implemented"
+
+
+def fused_moe_router_cudacore(
+    x: torch.Tensor,
+    router_weight: torch.Tensor,
+    topk: int,
+    moe_softcapping: float,
+    correction_bias: Optional[torch.Tensor] = None,
+):
+    assert len(x.shape) == 2 and x.shape[1] == router_weight.shape[1]
+    bs, hidden_dim = x.shape
+    num_experts = router_weight.shape[0]
+
+    # router_logits = torch.empty((bs, num_experts), dtype=torch.float32, device=x.device)
+    topk_weights = torch.empty((bs, topk), dtype=torch.float32, device=x.device)
+    topk_ids = torch.empty((bs, topk), dtype=torch.int32, device=x.device)
+    is_correction_bias = correction_bias is not None
+
+    max_warps = 16 if _is_hip else 32
+    config = {
+        "BLOCK_SIZE": triton.next_power_of_2(hidden_dim),
+        "num_warps": max(
+            min(triton.next_power_of_2(triton.cdiv(hidden_dim, 256)), max_warps), 4
+        ),
+    }
+
+    fused_moe_router_cudacore_kernel[(bs,)](
+        x,
+        router_weight,
+        topk_weights,
+        topk_ids,
+        correction_bias,
+        is_correction_bias=is_correction_bias,
+        num_experts=num_experts,
+        topk=topk,
+        moe_softcapping=moe_softcapping,
+        moe_renormalize=False,
+        hidden_dim=hidden_dim,
+        **config,
+    )
+
+    return topk_weights, topk_ids
+
+
+@triton.jit
+def fused_moe_router_tensorcore_kernel(
+    a_ptr,  # input (bs, hidden_dim)
+    b_ptr,  # input (num_experts, hidden_dim)
+    topk_weights_ptr,  # output (bs, topk)
+    topk_ids_ptr,  # output (bs, topk)
+    bs,
+    num_experts: tl.constexpr,
+    topk: tl.constexpr,  # only support topk <= 2
+    moe_softcapping: tl.constexpr,
+    moe_renormalize: tl.constexpr,  # not supported
+    correction_bias_ptr,
+    is_correction_bias: tl.constexpr,
+    K: tl.constexpr,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    stride_am: tl.constexpr,
+    stride_bn: tl.constexpr,
+    dp_attn_workaround_flag: tl.constexpr,
+):
+
+    # 1. get block id
+    pid = tl.program_id(axis=0)
+
+    # 2. create pointers for the first block of A and B
+    # 2.1. setup a_ptrs with offsets in m and k
+    offs_m = pid * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)[:, None]
+    bs_mask = offs_m < bs
+    offs_k = tl.arange(0, BLOCK_SIZE_K)[None, :]
+    a_ptrs = a_ptr + (offs_m * stride_am + offs_k)
+
+    # 2.2. setup b_ptrs with offsets in k and n.
+    #      Note: b matrix is k-major.
+    offs_k = tl.arange(0, BLOCK_SIZE_K)[None, :]
+    offs_n = tl.arange(0, BLOCK_SIZE_N)[:, None]
+    expert_mask = offs_n < num_experts
+    b_ptrs = b_ptr + (offs_n * stride_bn + offs_k)
+
+    # 3. Create an accumulator of float32 of size [BLOCK_SIZE_M, BLOCK_SIZE_N]
+    #    3.1. iterate in K dimension
+    #    3.2. transpose tile B
+    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+    for k in range(0, K // BLOCK_SIZE_K):  # hidden_dim % BLOCK_SIZE_K == 0
+        a = tl.load(
+            a_ptrs,
+            mask=bs_mask,
+            other=0.0,
+        ).to(tl.float32)
+        b = tl.load(b_ptrs, mask=expert_mask, other=0.0).to(tl.float32).T
+        acc += tl.dot(a, b)
+
+        # Advance the ptrs to the next K block.
+        a_ptrs += BLOCK_SIZE_K
+        b_ptrs += BLOCK_SIZE_K
+
+    # 4. logit softcap
+    if moe_softcapping == 0:
+        logits_softcapped = acc
+    else:
+        logits_scaled = acc / moe_softcapping
+        exped = tl.exp(2 * logits_scaled)
+        logits_softcapped = (exped - 1) / (exped + 1) * moe_softcapping
+
+    # Add bias after softcapping
+    if is_correction_bias:
+        bias = tl.load(
+            correction_bias_ptr + tl.arange(0, BLOCK_SIZE_N)[None, :],
+            mask=expert_mask.T,
+            other=0.0,
+        )
+        logits_softcapped = logits_softcapped + bias
+
+    if dp_attn_workaround_flag:
+        logits_softcapped = tl.where(
+            logits_softcapped != logits_softcapped, -1e9, logits_softcapped
+        )
+
+    # 5. top1
+    arange_block_size_n = tl.arange(0, BLOCK_SIZE_N)[None, :]
+    cond_top1 = arange_block_size_n < num_experts
+    top1 = tl.argmax(tl.where(cond_top1, logits_softcapped, float("-inf")), axis=1)
+    top1_v = tl.max(
+        tl.where(cond_top1, logits_softcapped, float("-inf")), axis=1, keep_dims=True
+    )
+    top1_invsumexp = 1.0 / tl.sum(
+        tl.where(cond_top1, tl.exp(logits_softcapped - top1_v), 0.0), axis=1
+    )
+
+    # 6. store top1 to output
+    offs_top1 = pid * topk * BLOCK_SIZE_M + topk * tl.arange(0, BLOCK_SIZE_M)
+    top1_mask = offs_top1 < bs * topk
+    tl.store(topk_ids_ptr + offs_top1, top1, mask=top1_mask)
+    tl.store(
+        topk_weights_ptr + offs_top1,
+        top1_invsumexp,
+        mask=top1_mask,
+    )
+
+    # 7. handle topk == 2
+    if topk == 2:
+        cond_top2 = (arange_block_size_n < num_experts) & (
+            arange_block_size_n != top1[:, None]
+        )
+        top2 = tl.argmax(
+            tl.where(cond_top2, logits_softcapped, float("-inf")),
+            axis=1,
+            keep_dims=True,
+        )
+        top2_v = tl.sum(
+            logits_softcapped * (arange_block_size_n == top2), axis=1, keep_dims=True
+        )
+        top2_invsumexp = tl.exp(top2_v - top1_v) * top1_invsumexp[:, None]
+
+        # store top2
+        offs_top2 = (
+            pid * topk * BLOCK_SIZE_M + topk * tl.arange(0, BLOCK_SIZE_M)[:, None] + 1
+        )
+        top2_mask = offs_top2 < bs * topk
+        tl.store(topk_ids_ptr + offs_top2, top2, mask=top2_mask)
+        tl.store(
+            topk_weights_ptr + offs_top2,
+            top2_invsumexp,
+            mask=top2_mask,
+        )
+
+
+def fused_moe_router_tensorcore(
+    x: torch.Tensor,
+    router_weight: torch.Tensor,
+    topk: int,
+    moe_softcapping: float,
+    BLOCK_SIZE_M: int,
+    BLOCK_SIZE_N: int,
+    BLOCK_SIZE_K: int,
+    correction_bias: Optional[torch.Tensor] = None,
+):
+    assert len(x.shape) == 2 and x.shape[1] == router_weight.shape[1]
+    bs, hidden_dim = x.shape
+    num_experts = router_weight.shape[0]
+
+    assert num_experts <= BLOCK_SIZE_N
+    assert hidden_dim % BLOCK_SIZE_K == 0
+    assert topk <= 2
+
+    topk_weights = torch.empty((bs, topk), dtype=torch.float32, device=x.device)
+    topk_ids = torch.empty((bs, topk), dtype=torch.int32, device=x.device)
+    is_correction_bias = correction_bias is not None
+
+    grid = (triton.cdiv(bs, BLOCK_SIZE_M) * triton.cdiv(num_experts, BLOCK_SIZE_N),)
+
+    # TODO(ch-wan): temporary workaround for dp attention. We should support masked
+    # router to skip padded tokens.
+    from sglang.srt.layers.dp_attention import is_dp_attention_enabled
+
+    dp_attn_workaround_flag = is_dp_attention_enabled()
+
+    fused_moe_router_tensorcore_kernel[grid](
+        a_ptr=x,
+        b_ptr=router_weight,
+        topk_weights_ptr=topk_weights,
+        topk_ids_ptr=topk_ids,
+        bs=bs,
+        num_experts=num_experts,
+        topk=topk,
+        moe_softcapping=moe_softcapping,
+        moe_renormalize=False,
+        K=hidden_dim,
+        correction_bias_ptr=correction_bias,
+        is_correction_bias=is_correction_bias,
+        BLOCK_SIZE_M=BLOCK_SIZE_M,
+        BLOCK_SIZE_N=BLOCK_SIZE_N,
+        BLOCK_SIZE_K=BLOCK_SIZE_K,
+        stride_am=hidden_dim,
+        stride_bn=hidden_dim,
+        dp_attn_workaround_flag=dp_attn_workaround_flag,
+    )
+
+    return topk_weights, topk_ids
+
+
+def fused_moe_router_shim(
+    moe_softcapping,
+    hidden_states,
+    gating_output,
+    topk,
+    renormalize,
+    correction_bias: Optional[torch.Tensor] = None,
+    enable_deterministic_inference: bool = False,
+):
+    assert not renormalize
+    assert (
+        len(hidden_states.shape) == 2
+        and hidden_states.shape[1] == gating_output.shape[1]
+    )
+    bs, hidden_dim = hidden_states.shape
+    num_experts = gating_output.shape[0]
+
+    BLOCK_SIZE_M = 32
+
+    BLOCK_SIZE_N = max(num_experts, 16)
+    BLOCK_SIZE_K = (
+        256 if num_experts < 256 else 64
+    )  # if experts are large, need to use smaller k block or shared memory OOM
+
+    if (
+        (bs >= 512 or num_experts > 8)
+        and hidden_dim % BLOCK_SIZE_K == 0
+        # we keep using single kernel to avoid non-deterministic behavior
+        and not enable_deterministic_inference
+    ):
+        # if large batch size or large expert, use kernel that uses tensorcore in matmul
+        return fused_moe_router_tensorcore(
+            x=hidden_states,
+            router_weight=gating_output,
+            topk=topk,
+            moe_softcapping=moe_softcapping,
+            BLOCK_SIZE_M=BLOCK_SIZE_M,
+            BLOCK_SIZE_N=BLOCK_SIZE_N,
+            BLOCK_SIZE_K=BLOCK_SIZE_K,
+            correction_bias=correction_bias,
+        )
+    else:
+        # if smaller, use kernel that does not use tensorcore in matmul
+        return fused_moe_router_cudacore(
+            x=hidden_states,
+            router_weight=gating_output,
+            topk=topk,
+            moe_softcapping=moe_softcapping,
+            correction_bias=correction_bias,
+        )
+
+
+class FusedMoeRouter:
+    def __init__(self, router_linear, topk, moe_softcapping) -> None:
+        self.router_linear = router_linear
+        self.topk = topk
+        self.moe_softcapping = moe_softcapping
+
+    def __call__(self, *args, **kwargs):
+        return self.forward(*args, **kwargs)
+
+    def forward(
+        self, x: torch.Tensor, residual: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if x.is_cuda:
+            return self.forward_cuda(x, residual)
+        else:
+            return self.forward_vllm(x, residual)
+
+    def forward_cuda(
+        self, x: torch.Tensor, autotune=False
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        return fused_moe_router_shim(
+            moe_softcapping=self.moe_softcapping,
+            hidden_states=x,
+            gating_output=self.router_linear.weight,
+            topk=self.topk,
+            renormalize=False,
+        )
+
+    def forward_torch(
+        self,
+        x: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        g = x.float() @ self.router_linear.weight.T.float()
+
+        g = torch.tanh(g.float() / self.moe_softcapping) * self.moe_softcapping
+
+        return fused_topk(x, g, self.topk, False)
diff --git a/sglang/python/sglang/srt/layers/moe/topk.py b/sglang/python/sglang/srt/layers/moe/topk.py
new file mode 100644
index 0000000000000000000000000000000000000000..2523ef851711a1c3b9bb2185707f5ae5d3cc29b8
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/topk.py
@@ -0,0 +1,1113 @@
+# Copyright 2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import annotations
+
+import logging
+import math
+from dataclasses import dataclass
+from enum import IntEnum, auto
+from typing import (
+    TYPE_CHECKING,
+    Callable,
+    NamedTuple,
+    Optional,
+    Protocol,
+    TypeGuard,
+    runtime_checkable,
+)
+
+import torch
+
+try:
+    from triton_kernels.routing import GatherIndx, RoutingData, ScatterIndx, routing
+except ImportError:
+    pass
+
+from sglang.srt.distributed import get_tp_group
+from sglang.srt.distributed.device_communicators.pynccl_allocator import (
+    use_symmetric_memory,
+)
+from sglang.srt.eplb import expert_location_dispatch
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.eplb.expert_location_dispatch import (
+    ExpertLocationDispatchInfo,
+    topk_ids_logical_to_physical,
+)
+from sglang.srt.layers.dp_attention import is_allocation_symmetric
+from sglang.srt.layers.moe import get_moe_runner_backend
+from sglang.srt.layers.moe.routed_experts_capturer import get_global_experts_capturer
+from sglang.srt.layers.utils import MultiPlatformOp
+from sglang.srt.utils import (
+    cpu_has_amx_support,
+    get_bool_env_var,
+    get_compiler_backend,
+    is_cpu,
+    is_cuda,
+    is_hip,
+    is_npu,
+    is_xpu,
+)
+from sglang.srt.utils.patch_torch import register_fake_if_exists
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.quantization import QuantizationConfig
+
+
+logger = logging.getLogger(__name__)
+_is_cuda = is_cuda()
+_is_hip = is_hip()
+_is_cpu = is_cpu()
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_xpu = is_xpu()
+_is_npu = is_npu()
+_is_xpu = is_xpu()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+
+if _is_cuda:
+    from sgl_kernel import moe_fused_gate
+
+    try:
+        from flashinfer.fused_moe import fused_topk_deepseek as _fused_topk_deepseek
+
+        from sglang.srt.utils.custom_op import register_custom_op
+
+        @register_custom_op(
+            op_name="fused_topk_deepseek",
+            mutates_args=["topk_weights", "topk_ids"],
+        )
+        def fused_topk_deepseek(
+            gating_output: torch.Tensor,
+            correction_bias: torch.Tensor,
+            num_expert_group: int,
+            topk_group: int,
+            topk: int,
+            scaling_factor: float,
+            topk_weights: torch.Tensor,
+            topk_ids: torch.Tensor,
+            renormalize: bool,
+        ) -> None:
+            _fused_topk_deepseek(
+                gating_output,
+                correction_bias,
+                num_expert_group,
+                topk_group,
+                topk,
+                scaling_factor,
+                topk_weights,
+                topk_ids,
+                renormalize,
+            )
+
+    except ImportError:
+        fused_topk_deepseek = None
+
+    try:
+        from sgl_kernel import kimi_k2_moe_fused_gate
+    except ImportError as e:
+        pass
+
+if _is_cuda or _is_hip or _is_xpu:
+    from sgl_kernel import topk_softmax
+
+    try:
+        from sgl_kernel import topk_sigmoid
+    except ImportError:
+        pass
+if _use_aiter:
+    try:
+        from aiter import biased_grouped_topk as aiter_biased_grouped_topk
+    except ImportError:
+        raise ImportError("aiter is required when SGLANG_USE_AITER is set to True")
+
+# -------------------------------- TopKConfig ---------------------------------------
+
+
+@dataclass
+class TopKConfig:
+    top_k: int
+    use_grouped_topk: bool = False
+    topk_group: Optional[int] = None
+    num_expert_group: Optional[int] = None
+    renormalize: bool = True
+    num_fused_shared_experts: int = 0
+    custom_routing_function: Optional[Callable] = None
+    correction_bias: Optional[torch.Tensor] = None
+    torch_native: bool = False
+    routed_scaling_factor: Optional[float] = None
+    apply_routed_scaling_factor_on_output: bool = False
+    fused_shared_experts_scaling_factor: Optional[float] = None
+    output_format: Optional[TopKOutputFormat] = None
+    scoring_func: str = "softmax"
+
+
+# -------------------------------- TopKOutput ---------------------------------------
+
+
+class TopKOutputChecker:
+
+    @staticmethod
+    def format_is_standard(topk_output: TopKOutput) -> TypeGuard[StandardTopKOutput]:
+        return isinstance(topk_output, StandardTopKOutput)
+
+    @staticmethod
+    def format_is_triton_kernels(
+        topk_output: TopKOutput,
+    ) -> TypeGuard[TritonKernelTopKOutput]:
+        return isinstance(topk_output, TritonKernelTopKOutput)
+
+    @staticmethod
+    def format_is_bypassed(topk_output: TopKOutput) -> TypeGuard[BypassedTopKOutput]:
+        return isinstance(topk_output, BypassedTopKOutput)
+
+
+class TopKOutputFormat(IntEnum):
+    STANDARD = auto()
+    TRITON_KERNEL = auto()
+    BYPASSED = auto()
+
+
+@runtime_checkable
+class TopKOutput(Protocol):
+    """Protocol for top-k outputs in different formats."""
+
+    @property
+    def format(self) -> TopKOutputFormat:
+        """The format of the output."""
+        ...
+
+
+class StandardTopKOutput(NamedTuple):
+    """Standard top-k output format."""
+
+    topk_weights: torch.Tensor
+    topk_ids: torch.Tensor
+    router_logits: torch.Tensor
+
+    @property
+    def format(self) -> TopKOutputFormat:
+        return TopKOutputFormat.STANDARD
+
+
+class TritonKernelTopKOutput(NamedTuple):
+    """Triton kernel top-k output format."""
+
+    routing_data: RoutingData
+    gather_indx: GatherIndx
+    scatter_indx: ScatterIndx
+
+    @property
+    def format(self) -> TopKOutputFormat:
+        return TopKOutputFormat.TRITON_KERNEL
+
+
+class BypassedTopKOutput(NamedTuple):
+    """Bypassed top-k output format."""
+
+    hidden_states: torch.Tensor
+    router_logits: torch.Tensor
+    topk_config: TopKConfig
+    num_token_non_padded: Optional[torch.Tensor] = None
+    expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None
+
+    @property
+    def format(self) -> TopKOutputFormat:
+        return TopKOutputFormat.BYPASSED
+
+
+# -------------------------------- TopK ---------------------------------------
+
+
+class TopK(MultiPlatformOp):
+    """
+    Parameters:
+    --top_k: The all number of top experts selected per token, including the fused shared expert(s).
+    --num_fused_shared_experts: num of shared experts, can be activate both in TP or EP mode.
+    --routed_scaling_factor: the scaling factor for routed experts in topk_weights.
+    --fused_shared_experts_scaling_factor: scaling factor for fused shared experts on AMD-platform.
+    """
+
+    def __init__(
+        self,
+        top_k: int,
+        *,
+        layer_id: Optional[int] = None,
+        use_grouped_topk: bool = False,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        renormalize: bool = True,
+        num_fused_shared_experts: int = 0,
+        custom_routing_function: Optional[Callable] = None,
+        scoring_func: str = "softmax",
+        correction_bias: Optional[torch.Tensor] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        routed_scaling_factor: Optional[float] = None,
+        apply_routed_scaling_factor_on_output: Optional[bool] = False,
+        output_format: Optional[TopKOutputFormat] = None,
+        fused_shared_experts_scaling_factor: Optional[float] = None,
+    ):
+        # NOTE: scoring_func is not used for now, but we keep it for future use
+        # see https://github.com/sgl-project/sglang/pull/4505 for more details
+        super().__init__()
+
+        if use_grouped_topk:
+            assert num_expert_group is not None and topk_group is not None
+
+        self.layer_id = layer_id
+        self.topk_config = TopKConfig(
+            top_k=top_k,
+            use_grouped_topk=use_grouped_topk,
+            renormalize=renormalize,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
+            custom_routing_function=custom_routing_function,
+            correction_bias=correction_bias,
+            routed_scaling_factor=routed_scaling_factor,
+            apply_routed_scaling_factor_on_output=apply_routed_scaling_factor_on_output,
+            fused_shared_experts_scaling_factor=fused_shared_experts_scaling_factor,
+            output_format=output_format,
+            scoring_func=scoring_func,
+        )
+
+    def forward_native(
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+        *,
+        num_token_non_padded: Optional[torch.Tensor] = None,
+        expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
+    ) -> TopKOutput:
+        self.topk_config.torch_native = True
+        return select_experts(
+            hidden_states=hidden_states,
+            layer_id=self.layer_id,
+            router_logits=router_logits,
+            topk_config=self.topk_config,
+            num_token_non_padded=num_token_non_padded,
+            expert_location_dispatch_info=expert_location_dispatch_info,
+        )
+
+    def forward_cuda(
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+        *,
+        num_token_non_padded: Optional[torch.Tensor] = None,
+        expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
+    ) -> TopKOutput:
+        if self.topk_config.output_format is not None:
+            output_format = self.topk_config.output_format
+        elif get_moe_runner_backend().is_triton_kernels():
+            output_format = TopKOutputFormat.TRITON_KERNEL
+        elif (
+            get_moe_runner_backend().is_flashinfer_trtllm()
+            or get_moe_runner_backend().is_flashinfer_mxfp4()
+        ):
+            output_format = TopKOutputFormat.BYPASSED
+        else:
+            output_format = TopKOutputFormat.STANDARD
+
+        if output_format == TopKOutputFormat.TRITON_KERNEL:
+            # renormalize=True is equivalent to sm_first=False
+            routing_data, gather_idx, scatter_idx = routing(
+                router_logits,
+                self.topk_config.top_k,
+                sm_first=not self.topk_config.renormalize,
+            )
+            return TritonKernelTopKOutput(routing_data, gather_idx, scatter_idx)
+        elif output_format == TopKOutputFormat.BYPASSED:
+            return BypassedTopKOutput(
+                hidden_states=hidden_states,
+                router_logits=router_logits,
+                topk_config=self.topk_config,
+                num_token_non_padded=num_token_non_padded,
+                expert_location_dispatch_info=expert_location_dispatch_info,
+            )
+        else:
+            self.topk_config.torch_native = False
+            with use_symmetric_memory(
+                get_tp_group(), disabled=not is_allocation_symmetric()
+            ):
+                topk_output = select_experts(
+                    hidden_states=hidden_states,
+                    layer_id=self.layer_id,
+                    router_logits=router_logits,
+                    topk_config=self.topk_config,
+                    num_token_non_padded=num_token_non_padded,
+                    expert_location_dispatch_info=expert_location_dispatch_info,
+                )
+            return topk_output
+
+    def forward_cpu(
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+        *,
+        num_token_non_padded: Optional[torch.Tensor] = None,
+        expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
+    ) -> TopKOutput:
+        return select_experts(
+            hidden_states=hidden_states,
+            layer_id=self.layer_id,
+            router_logits=router_logits,
+            topk_config=self.topk_config,
+            num_token_non_padded=num_token_non_padded,
+            expert_location_dispatch_info=expert_location_dispatch_info,
+        )
+
+    def forward_npu(
+        self,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+        *,
+        num_token_non_padded: Optional[torch.Tensor] = None,
+        expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
+    ) -> TopKOutput:
+
+        from sglang.srt.hardware_backend.npu.moe.topk import fused_topk_npu
+
+        return fused_topk_npu(
+            hidden_states=hidden_states,
+            router_logits=router_logits,
+            topk_config=self.topk_config,
+            num_token_non_padded=num_token_non_padded,
+            expert_location_dispatch_info=expert_location_dispatch_info,
+            layer_id=self.layer_id,
+        )
+
+    def empty_topk_output(self, device: torch.device) -> TopKOutput:
+        topk = self.topk_config.top_k - self.topk_config.num_fused_shared_experts
+        with use_symmetric_memory(
+            get_tp_group(), disabled=not is_allocation_symmetric()
+        ):
+            topk_weights = torch.empty((0, topk), dtype=torch.float32, device=device)
+            topk_ids = torch.full((0, topk), -1, dtype=torch.int32, device=device)
+        # FIXME: router_logits should be of size (0, num_experts)
+        router_logits = torch.empty((0, topk), dtype=torch.float32, device=device)
+        return StandardTopKOutput(topk_weights, topk_ids, router_logits)
+
+
+# ------------------------------- TopK implementation -------------------------------------
+
+
+def fused_topk_torch_native(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+    correction_bias: torch.Tensor = None,
+    scoring_func: str = "softmax",
+):
+    def scoring_func_impl(gating_output: torch.Tensor) -> torch.Tensor:
+        if scoring_func == "softmax":
+            return gating_output.softmax(dim=-1)
+        elif scoring_func == "sigmoid":
+            return gating_output.sigmoid()
+        else:
+            raise ValueError(f"Invalid scoring function: {scoring_func}")
+
+    if correction_bias is not None:
+        n_routed_experts = gating_output.shape[-1]
+        scores = scoring_func_impl(gating_output)
+        scores_for_choice = scores.view(
+            -1, n_routed_experts
+        ) + correction_bias.unsqueeze(0)
+        topk_ids = torch.topk(scores_for_choice, k=topk, dim=-1, sorted=False)[1]
+        topk_weights = scores.gather(1, topk_ids)
+    else:
+        assert (
+            hidden_states.shape[0] == gating_output.shape[0]
+        ), f"Number of tokens mismatch, {hidden_states.shape=} vs {gating_output.shape=}"
+        M, _ = hidden_states.shape
+        topk_weights = torch.empty(
+            M, topk, dtype=torch.float32, device=hidden_states.device
+        )
+        topk_ids = torch.empty(M, topk, dtype=torch.int32, device=hidden_states.device)
+        topk_weights = scoring_func_impl(gating_output.float())
+        topk_weights, topk_ids = torch.topk(topk_weights, topk, dim=-1)
+
+    if renormalize:
+        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
+    return topk_weights, topk_ids
+
+
+def fused_topk_cpu(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+    correction_bias: torch.Tensor = None,
+    scoring_func: str = "softmax",
+):
+    topk_weights, topk_ids = torch.ops.sgl_kernel.topk_softmax_cpu(
+        hidden_states=hidden_states,
+        gating_output=gating_output,
+        topk=topk,
+        renormalize=renormalize,
+    )
+    return topk_weights, topk_ids
+
+
+def apply_topk_weights_cpu(need_apply, topk_weights, inputs):
+    if not need_apply:
+        return inputs, topk_weights
+
+    # TODO: fuse below processing in fused_experts_cpu kernel
+    inputs = inputs * topk_weights.to(inputs.dtype)
+    topk_weights = torch.ones_like(
+        topk_weights, dtype=torch.float32
+    )  # clear topk_weights as already applied
+
+    return inputs, topk_weights
+
+
+def fused_topk(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+    correction_bias: Optional[torch.Tensor] = None,
+    scoring_func: str = "softmax",
+):
+    assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
+
+    M, _ = hidden_states.shape
+
+    topk_weights = torch.empty(
+        M, topk, dtype=torch.float32, device=hidden_states.device
+    )
+    topk_ids = torch.empty(M, topk, dtype=torch.int32, device=hidden_states.device)
+
+    if scoring_func == "softmax":
+        topk_softmax(
+            topk_weights,
+            topk_ids,
+            gating_output,
+            renormalize,
+        )
+    elif scoring_func == "sigmoid":
+        topk_sigmoid(
+            topk_weights,
+            topk_ids,
+            gating_output,
+            renormalize,
+            correction_bias,
+        )
+    else:
+        raise ValueError(f"Invalid scoring function: {scoring_func}")
+
+    return topk_weights, topk_ids
+
+
+# This is used by the Deepseek V2/V3/R1 series models
+@torch.compile(dynamic=True, backend=get_compiler_backend(), disable=_is_npu)
+def grouped_topk_gpu(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+    num_expert_group: Optional[int] = None,
+    topk_group: Optional[int] = None,
+    num_fused_shared_experts: int = 0,
+    routed_scaling_factor: Optional[float] = None,
+    apply_routed_scaling_factor_on_output: Optional[bool] = False,
+):
+    assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
+
+    scores = torch.softmax(gating_output, dim=-1)
+    num_token = scores.shape[0]
+    num_experts = scores.shape[1]
+    group_scores = (
+        scores.view(num_token, num_expert_group, -1).max(dim=-1).values
+    )  # [n, n_group]
+    group_idx = torch.topk(group_scores, k=topk_group, dim=-1, sorted=False)[
+        1
+    ]  # [n, top_k_group]
+    group_mask = torch.zeros_like(group_scores)  # [n, n_group]
+    group_mask.scatter_(1, group_idx, 1)  # [n, n_group]
+    score_mask = (
+        group_mask.unsqueeze(-1)
+        .expand(num_token, num_expert_group, scores.shape[-1] // num_expert_group)
+        .reshape(num_token, -1)
+    )  # [n, e]
+    tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0)  # [n, e]
+    topk_weights, topk_ids = torch.topk(
+        tmp_scores,
+        k=topk,
+        dim=-1,
+        sorted=(True if num_fused_shared_experts > 0 else False),
+    )
+    if num_fused_shared_experts:
+        topk_ids[:, -1] = torch.randint(
+            low=num_experts,
+            high=num_experts + num_fused_shared_experts,
+            size=(topk_ids.size(0),),
+            dtype=topk_ids.dtype,
+            device=topk_ids.device,
+        )
+        if routed_scaling_factor is not None:
+            topk_weights[:, -1] = (
+                topk_weights[:, :-1].sum(dim=-1) / routed_scaling_factor
+            )
+
+    if renormalize:
+        topk_weights_sum = (
+            topk_weights.sum(dim=-1, keepdim=True)
+            if num_fused_shared_experts == 0
+            else topk_weights[:, :-1].sum(dim=-1, keepdim=True)
+        )
+        topk_weights = topk_weights / topk_weights_sum
+        if apply_routed_scaling_factor_on_output:
+            topk_weights *= routed_scaling_factor
+
+    topk_weights, topk_ids = topk_weights.to(torch.float32), topk_ids.to(torch.int32)
+
+    return topk_weights, topk_ids
+
+
+def grouped_topk_cpu(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+    num_expert_group: Optional[int] = None,
+    topk_group: Optional[int] = None,
+    num_fused_shared_experts: int = 0,
+    routed_scaling_factor: Optional[float] = None,
+    apply_routed_scaling_factor_on_output: Optional[bool] = False,
+):
+    assert not apply_routed_scaling_factor_on_output
+    return torch.ops.sgl_kernel.grouped_topk_cpu(
+        hidden_states,
+        gating_output,
+        topk,
+        renormalize,
+        num_expert_group,
+        topk_group,
+        num_fused_shared_experts,
+        routed_scaling_factor,
+        # num_token_non_padded must be None since it is not supported in kernel
+        num_token_non_padded=None,
+    )
+
+
+@torch.compile(dynamic=True, backend=get_compiler_backend(), disable=_is_npu)
+def kimi_k2_biased_topk_impl(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    correction_bias: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+    routed_scaling_factor: Optional[float] = None,
+    apply_routed_scaling_factor_on_output: Optional[bool] = False,
+):
+    """
+    Optimized version for num_expert_group=1 case (e.g., Kimi K2 with 384 experts).
+    Simplifies the grouped topk logic by removing unnecessary group masking operations.
+    Note: This function assumes num_fused_shared_experts=0.
+    """
+    assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
+
+    scores = gating_output.sigmoid()
+    num_token = scores.shape[0]
+
+    # When num_expert_group=1, no need for group masking
+    # Directly compute scores with correction bias
+    tmp_scores = scores.view(num_token, -1) + correction_bias.unsqueeze(0)
+
+    # Directly select topk experts (no need to sort since num_fused_shared_experts=0)
+    _, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)
+    topk_weights = scores.gather(1, topk_ids)
+
+    if renormalize:
+        topk_weights_sum = topk_weights.sum(dim=-1, keepdim=True)
+        topk_weights = topk_weights / topk_weights_sum
+        if apply_routed_scaling_factor_on_output:
+            topk_weights *= routed_scaling_factor
+
+    topk_weights, topk_ids = topk_weights.to(torch.float32), topk_ids.to(torch.int32)
+    return topk_weights, topk_ids
+
+
+@torch.compile(dynamic=True, backend=get_compiler_backend(), disable=_is_npu)
+def biased_grouped_topk_impl(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    correction_bias: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+    num_expert_group: Optional[int] = None,
+    topk_group: Optional[int] = None,
+    num_fused_shared_experts: int = 0,
+    routed_scaling_factor: Optional[float] = None,
+    apply_routed_scaling_factor_on_output: Optional[bool] = False,
+):
+    assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
+
+    scores = gating_output.sigmoid()
+    num_token = scores.shape[0]
+    num_experts = scores.shape[1]
+    scores_for_choice = scores.view(num_token, -1) + correction_bias.unsqueeze(0)
+    group_scores = (
+        scores_for_choice.view(num_token, num_expert_group, -1)
+        .topk(2, dim=-1)[0]
+        .sum(dim=-1)
+    )  # [n, n_group]
+    group_idx = torch.topk(group_scores, k=topk_group, dim=-1, sorted=False)[
+        1
+    ]  # [n, top_k_group]
+    group_mask = torch.zeros_like(group_scores)  # [n, n_group]
+    group_mask.scatter_(1, group_idx, 1)  # [n, n_group]
+    score_mask = (
+        group_mask.unsqueeze(-1)
+        .expand(num_token, num_expert_group, scores.shape[-1] // num_expert_group)
+        .reshape(num_token, -1)
+    )  # [n, e]
+    tmp_scores = scores_for_choice.masked_fill(
+        ~score_mask.bool(), float("-inf")
+    )  # [n, e]
+    _, topk_ids = torch.topk(
+        tmp_scores,
+        k=topk,
+        dim=-1,
+        sorted=(True if num_fused_shared_experts > 0 else False),
+    )
+    topk_weights = scores.gather(1, topk_ids)
+
+    if num_fused_shared_experts:
+        topk_ids[:, -1] = torch.randint(
+            low=num_experts,
+            high=num_experts + num_fused_shared_experts,
+            size=(topk_ids.size(0),),
+            dtype=topk_ids.dtype,
+            device=topk_ids.device,
+        )
+        if routed_scaling_factor is not None:
+            topk_weights[:, -1] = (
+                topk_weights[:, :-1].sum(dim=-1) / routed_scaling_factor
+            )
+
+    if renormalize:
+        topk_weights_sum = (
+            topk_weights.sum(dim=-1, keepdim=True)
+            if num_fused_shared_experts == 0
+            else topk_weights[:, :-1].sum(dim=-1, keepdim=True)
+        )
+        topk_weights = topk_weights / topk_weights_sum
+        if apply_routed_scaling_factor_on_output:
+            topk_weights *= routed_scaling_factor
+
+    topk_weights, topk_ids = topk_weights.to(torch.float32), topk_ids.to(torch.int32)
+
+    return topk_weights, topk_ids
+
+
+def is_power_of_two(n):
+    return n > 0 and math.log2(n).is_integer()
+
+
+def _mask_topk_ids_padded_region(
+    topk_ids: torch.Tensor,
+    num_token_non_padded: Optional[torch.Tensor] = None,
+):
+    if num_token_non_padded is None:
+        return
+    indices = torch.arange(0, topk_ids.shape[0], device=topk_ids.device)
+    topk_ids[indices >= num_token_non_padded, :] = -1
+
+
+@torch.compile(dynamic=True, backend=get_compiler_backend())
+def _biased_grouped_topk_postprocess(
+    topk_ids, expert_location_dispatch_info, num_token_non_padded
+):
+    topk_ids = topk_ids_logical_to_physical(topk_ids, expert_location_dispatch_info)
+    _mask_topk_ids_padded_region(topk_ids, num_token_non_padded)
+    return topk_ids
+
+
+def biased_grouped_topk_gpu(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    correction_bias: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+    num_expert_group: Optional[int] = None,
+    topk_group: Optional[int] = None,
+    num_fused_shared_experts: int = 0,
+    routed_scaling_factor: Optional[float] = None,
+    apply_routed_scaling_factor_on_output: Optional[bool] = False,
+):
+
+    num_tokens = gating_output.shape[0]
+    num_experts = gating_output.shape[1]
+    experts_per_group = (
+        num_experts // num_expert_group if num_expert_group else num_experts
+    )
+
+    if (
+        _is_cuda
+        and fused_topk_deepseek is not None
+        and num_fused_shared_experts == 0
+        and is_power_of_two(num_experts)
+        # flashinfer constraints
+        and topk <= 8
+        and topk_group <= num_expert_group
+        and topk_group * num_expert_group >= topk
+        and (
+            (experts_per_group <= 32 and experts_per_group * topk_group <= 128)
+            if num_expert_group > 1
+            else num_experts <= 384
+        )
+    ):
+        # Pre-allocate output tensors (flashinfer mutates them in-place)
+        topk_weights = torch.empty(
+            (num_tokens, topk), dtype=torch.float32, device=gating_output.device
+        )
+        topk_ids = torch.empty(
+            (num_tokens, topk), dtype=torch.int32, device=gating_output.device
+        )
+
+        # flashinfer always applies the scaling_factor internally
+        scaling_factor = 1.0
+        if routed_scaling_factor is not None and apply_routed_scaling_factor_on_output:
+            scaling_factor = routed_scaling_factor
+
+        # flashinfer's fused_topk_deepseek
+        fused_topk_deepseek(
+            gating_output.to(dtype=torch.float32),
+            correction_bias,
+            num_expert_group,
+            topk_group,
+            topk,
+            scaling_factor,
+            topk_weights,
+            topk_ids,
+            True,
+        )
+
+        return topk_weights, topk_ids
+
+    elif (
+        _is_cuda
+        # moe_fused_gate kernel ensures that num_experts/num_expert_group does not exceed MAX_VPT=32 now. And when kernel can handle MAX_VPT > 32, we can remove this assertion.
+        and experts_per_group <= 32
+        and is_power_of_two(num_experts)
+    ):
+        topk_weights, topk_ids = moe_fused_gate(
+            gating_output.to(dtype=torch.float32),
+            correction_bias,
+            num_expert_group,
+            topk_group,
+            topk,
+            num_fused_shared_experts,
+            routed_scaling_factor if routed_scaling_factor is not None else 1.0,
+            apply_routed_scaling_factor_on_output,
+        )
+
+        return topk_weights, topk_ids
+
+    elif _use_aiter:
+        assert not apply_routed_scaling_factor_on_output, "Not implemented"
+        token = gating_output.shape[0]
+        device = gating_output.device
+        assert (
+            hidden_states.shape[0] == gating_output.shape[0]
+        ), f"Number of tokens mismatch: hidden_states.shape[0] = {hidden_states.shape[0]}, gating_output.shape[0] = {gating_output.shape[0]}"
+        topk_weights = torch.empty((token, topk), dtype=torch.float32, device=device)
+        topk_ids = torch.empty((token, topk), dtype=torch.int32, device=device)
+        aiter_biased_grouped_topk(
+            gating_output,
+            correction_bias.to(dtype=gating_output.dtype),
+            topk_weights,
+            topk_ids,
+            num_expert_group,
+            topk_group,
+            renormalize,
+            routed_scaling_factor if routed_scaling_factor is not None else 1.0,
+        )
+        return topk_weights, topk_ids
+    else:
+        # Use optimized path for Kimi K2 (384 experts with num_expert_group=1)
+        num_experts = gating_output.shape[1]
+        if _is_cuda and num_experts == 384 and num_expert_group == 1:
+            return kimi_k2_moe_fused_gate(
+                gating_output.to(dtype=torch.float32),
+                correction_bias,
+                topk=topk,
+                renormalize=renormalize,
+                routed_scaling_factor=routed_scaling_factor,
+                apply_routed_scaling_factor_on_output=apply_routed_scaling_factor_on_output,
+            )
+        else:
+            return biased_grouped_topk_impl(
+                hidden_states,
+                gating_output,
+                correction_bias,
+                topk,
+                renormalize,
+                num_expert_group,
+                topk_group,
+                num_fused_shared_experts=num_fused_shared_experts,
+                routed_scaling_factor=routed_scaling_factor,
+                apply_routed_scaling_factor_on_output=apply_routed_scaling_factor_on_output,
+            )
+
+
+def biased_grouped_topk_cpu(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    correction_bias: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+    num_expert_group: Optional[int] = None,
+    topk_group: Optional[int] = None,
+    compiled: bool = True,
+    num_fused_shared_experts: int = 0,
+    routed_scaling_factor: Optional[float] = None,
+    apply_routed_scaling_factor_on_output: Optional[bool] = False,
+):
+    assert not apply_routed_scaling_factor_on_output, "Not implemented"
+    return torch.ops.sgl_kernel.biased_grouped_topk_cpu(
+        hidden_states,
+        gating_output,
+        correction_bias,
+        topk,
+        renormalize,
+        num_expert_group,
+        topk_group,
+        num_fused_shared_experts,
+        routed_scaling_factor,
+        # num_token_non_padded must be None since it is not supported in kernel
+        num_token_non_padded=None,
+    )
+
+
+if _is_cpu and _is_cpu_amx_available:
+    biased_grouped_topk = biased_grouped_topk_cpu
+    grouped_topk = grouped_topk_cpu
+    fused_topk_native = fused_topk_cpu
+    fused_topk = fused_topk_cpu
+else:
+    biased_grouped_topk = biased_grouped_topk_gpu
+    grouped_topk = grouped_topk_gpu
+    fused_topk_native = fused_topk_torch_native
+
+
+def _post_process_topk_ids(
+    topk_ids: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_config: TopKConfig,
+    router_logits: torch.Tensor,
+    layer_id: int,
+    num_token_non_padded: Optional[torch.Tensor] = None,
+    expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
+) -> torch.Tensor:
+    num_fused_shared_experts = topk_config.num_fused_shared_experts
+    fused_shared_experts_scaling_factor = (
+        topk_config.fused_shared_experts_scaling_factor
+    )
+    get_global_experts_capturer().capture(
+        layer_id=layer_id,
+        topk_ids=topk_ids,
+    )
+    if _is_cuda:
+        topk_ids = topk_ids_logical_to_physical(topk_ids, expert_location_dispatch_info)
+        _mask_topk_ids_padded_region(topk_ids, num_token_non_padded)
+
+    if num_fused_shared_experts > 0 and _use_aiter:
+        M, N = router_logits.shape
+        scale_factor = (
+            1.0
+            if fused_shared_experts_scaling_factor is None
+            else fused_shared_experts_scaling_factor
+        )
+
+        # Lazy import to avoid circular-import issues
+        from sglang.srt.layers.moe.fused_moe_triton.fused_moe_triton_kernels import (
+            fused_append_shared_experts,
+        )
+
+        topk_ids, topk_weights = fused_append_shared_experts(
+            topk_ids,
+            topk_weights,
+            num_fused_shared_experts,
+            scale_factor,
+            N,  # base id for shared experts
+        )
+    return topk_ids, topk_weights
+
+
+def select_experts(
+    hidden_states: torch.Tensor,
+    router_logits: torch.Tensor,
+    topk_config: TopKConfig,
+    *,
+    layer_id: Optional[int] = None,
+    num_token_non_padded: Optional[torch.Tensor] = None,
+    expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
+) -> StandardTopKOutput:
+
+    top_k = topk_config.top_k
+    use_grouped_topk = topk_config.use_grouped_topk
+    topk_group = topk_config.topk_group
+    num_expert_group = topk_config.num_expert_group
+    renormalize = topk_config.renormalize
+    num_fused_shared_experts = topk_config.num_fused_shared_experts
+    custom_routing_function = topk_config.custom_routing_function
+    correction_bias = topk_config.correction_bias
+    torch_native = topk_config.torch_native
+    routed_scaling_factor = topk_config.routed_scaling_factor
+    apply_routed_scaling_factor_on_output = (
+        topk_config.apply_routed_scaling_factor_on_output
+    )
+
+    scoring_func = topk_config.scoring_func
+
+    router_logits, correction_bias = (
+        expert_location_dispatch.transform_select_experts_inputs(
+            router_logits=router_logits,
+            correction_bias=correction_bias,
+            info=expert_location_dispatch_info,
+        )
+    )
+
+    # DeepSeek V2/V3/R1 series models use grouped_top_k
+    # remove num_fused_shared_experts from grouped_topk/biased_grouped_topk
+    num_routed_topk = top_k - num_fused_shared_experts
+    if use_grouped_topk:
+        assert topk_group is not None
+        assert num_expert_group is not None
+        if correction_bias is None:
+            topk_weights, topk_ids = grouped_topk(
+                hidden_states=hidden_states,
+                gating_output=router_logits,
+                topk=num_routed_topk if _use_aiter else top_k,
+                renormalize=renormalize,
+                num_expert_group=num_expert_group,
+                topk_group=topk_group,
+                num_fused_shared_experts=num_fused_shared_experts,
+                routed_scaling_factor=routed_scaling_factor,
+                apply_routed_scaling_factor_on_output=apply_routed_scaling_factor_on_output,
+            )
+        else:
+            topk_weights, topk_ids = biased_grouped_topk(
+                hidden_states=hidden_states,
+                gating_output=router_logits,
+                correction_bias=correction_bias,
+                topk=num_routed_topk if _use_aiter else top_k,
+                renormalize=renormalize,
+                num_expert_group=num_expert_group,
+                topk_group=topk_group,
+                num_fused_shared_experts=num_fused_shared_experts,
+                routed_scaling_factor=routed_scaling_factor,
+                apply_routed_scaling_factor_on_output=apply_routed_scaling_factor_on_output,
+            )
+    elif torch_native and custom_routing_function is None:
+        assert (
+            num_token_non_padded is None
+        ), "num_token_non_padded is not yet supported in fused_topk_native"
+        assert expert_location_dispatch_info is None
+        assert not apply_routed_scaling_factor_on_output, "Not implemented"
+        topk_weights, topk_ids = fused_topk_native(
+            hidden_states=hidden_states,
+            gating_output=router_logits,
+            topk=num_routed_topk if _use_aiter else top_k,
+            renormalize=renormalize,
+            correction_bias=correction_bias,
+            scoring_func=scoring_func,
+        )
+    elif custom_routing_function is None:
+        assert not apply_routed_scaling_factor_on_output, "Not implemented"
+        # Qwen3MOE uses fused_topk
+        topk_weights, topk_ids = fused_topk(
+            hidden_states=hidden_states,
+            gating_output=router_logits,
+            topk=num_routed_topk if _use_aiter else top_k,
+            renormalize=renormalize,
+            correction_bias=correction_bias,
+            scoring_func=scoring_func,
+        )
+    else:
+        assert (
+            num_token_non_padded is None
+        ), "num_token_non_padded is not yet supported in custom_routing_function"
+        assert expert_location_dispatch_info is None
+        assert not apply_routed_scaling_factor_on_output, "Not implemented"
+        topk_weights, topk_ids = custom_routing_function(
+            hidden_states=hidden_states,
+            gating_output=router_logits,
+            topk=num_routed_topk if _use_aiter else top_k,
+            renormalize=renormalize,
+        )
+
+    topk_ids, topk_weights = _post_process_topk_ids(
+        topk_ids=topk_ids,
+        topk_weights=topk_weights,
+        topk_config=topk_config,
+        router_logits=router_logits,
+        num_token_non_padded=num_token_non_padded,
+        layer_id=layer_id,
+        expert_location_dispatch_info=expert_location_dispatch_info,
+    )
+
+    get_global_expert_distribution_recorder().on_select_experts(topk_ids=topk_ids)
+
+    return StandardTopKOutput(topk_weights, topk_ids, router_logits)
+
+
+# Register fake implementations for torch.compile support
+if _is_cuda:
+
+    @torch.library.register_fake("sgl_kernel::moe_fused_gate")
+    def _moe_fused_gate(
+        input_tensor,
+        bias,
+        num_expert_group,
+        topk_group,
+        topk,
+        num_fused_shared_experts=0,
+        routed_scaling_factor=0,
+        apply_routed_scaling_factor_on_output=False,
+    ):
+        num_rows = input_tensor.shape[0]
+        topk_weights = torch.empty(
+            (num_rows, topk), dtype=torch.float32, device=input_tensor.device
+        )
+        topk_ids = torch.empty(
+            (num_rows, topk), dtype=torch.int32, device=input_tensor.device
+        )
+        return topk_weights, topk_ids
+
+    @register_fake_if_exists("sgl_kernel::kimi_k2_moe_fused_gate")
+    def _kimi_k2_moe_fused_gate(
+        input_tensor,
+        bias,
+        topk,
+        renormalize,
+        routed_scaling_factor,
+        apply_routed_scaling_factor_on_output,
+    ):
+        num_rows = input_tensor.shape[0]
+        topk_weights = input_tensor.new_empty(
+            num_rows,
+            topk,
+            dtype=torch.float32,
+        )
+        topk_ids = input_tensor.new_empty(
+            num_rows,
+            topk,
+            dtype=torch.int32,
+        )
+        return topk_weights, topk_ids
diff --git a/sglang/python/sglang/srt/layers/moe/utils.py b/sglang/python/sglang/srt/layers/moe/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..70ad66e9c5144e04f8e56a8e88297b7939338f5b
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/moe/utils.py
@@ -0,0 +1,335 @@
+from __future__ import annotations
+
+import logging
+from contextlib import contextmanager
+from enum import Enum, IntEnum
+from typing import TYPE_CHECKING, Optional
+
+from sglang.srt.distributed.parallel_state import get_moe_expert_parallel_world_size
+from sglang.srt.layers.dp_attention import (
+    get_attention_dp_size,
+    is_dp_attention_enabled,
+)
+
+if TYPE_CHECKING:
+    from sglang.srt.server_args import ServerArgs
+
+logger = logging.getLogger(__name__)
+
+
+class MoeA2ABackend(Enum):
+
+    NONE = "none"
+    DEEPEP = "deepep"
+    MOONCAKE = "mooncake"
+    MORI = "mori"
+    ASCEND_FUSEEP = "ascend_fuseep"
+    FLASHINFER = "flashinfer"
+
+    @classmethod
+    def _missing_(cls, value):
+        if value is None:
+            return cls.NONE
+        for member in cls:
+            if value == member.value:
+                return member
+        raise ValueError(f"No {cls.__name__} member for value {value}")
+
+    def is_none(self):
+        return self == MoeA2ABackend.NONE
+
+    def is_deepep(self):
+        return self == MoeA2ABackend.DEEPEP
+
+    def is_mooncake(self):
+        return self == MoeA2ABackend.MOONCAKE
+
+    def is_flashinfer(self):
+        return self == MoeA2ABackend.FLASHINFER
+
+    def is_ascend_fuseep(self):
+        return self == MoeA2ABackend.ASCEND_FUSEEP
+
+    def is_mori(self):
+        return self == MoeA2ABackend.MORI
+
+
+class MoeRunnerBackend(Enum):
+
+    AUTO = "auto"
+    DEEP_GEMM = "deep_gemm"
+    TRITON = "triton"
+    TRITON_KERNELS = "triton_kernel"
+    FLASHINFER_TRTLLM = "flashinfer_trtllm"
+    FLASHINFER_CUTLASS = "flashinfer_cutlass"
+    FLASHINFER_MXFP4 = "flashinfer_mxfp4"
+    FLASHINFER_CUTEDSL = "flashinfer_cutedsl"
+    CUTLASS = "cutlass"
+    MARLIN = "marlin"
+
+    def is_auto(self):
+        return self == MoeRunnerBackend.AUTO
+
+    def is_deep_gemm(self):
+        return self == MoeRunnerBackend.DEEP_GEMM
+
+    def is_triton(self):
+        return self == MoeRunnerBackend.TRITON
+
+    def is_triton_kernels(self):
+        return self == MoeRunnerBackend.TRITON_KERNELS
+
+    def is_flashinfer_trtllm(self):
+        return self == MoeRunnerBackend.FLASHINFER_TRTLLM
+
+    def is_flashinfer_cutlass(self):
+        return self == MoeRunnerBackend.FLASHINFER_CUTLASS
+
+    def is_flashinfer_cutedsl(self):
+        return self == MoeRunnerBackend.FLASHINFER_CUTEDSL
+
+    def is_flashinfer_mxfp4(self):
+        return self == MoeRunnerBackend.FLASHINFER_MXFP4
+
+    def is_cutlass(self):
+        return self == MoeRunnerBackend.CUTLASS
+
+    def is_marlin(self):
+        return self == MoeRunnerBackend.MARLIN
+
+
+class DeepEPMode(Enum):
+
+    NORMAL = "normal"
+    LOW_LATENCY = "low_latency"
+    AUTO = "auto"
+
+    def enable_normal(self) -> bool:
+        return self in [DeepEPMode.NORMAL, DeepEPMode.AUTO]
+
+    def enable_low_latency(self) -> bool:
+        return self in [DeepEPMode.LOW_LATENCY, DeepEPMode.AUTO]
+
+    def resolve(self, is_extend_in_batch: bool) -> DeepEPMode:
+        if self != DeepEPMode.AUTO:
+            return self
+
+        if is_extend_in_batch:
+            return DeepEPMode.NORMAL
+        else:
+            return DeepEPMode.LOW_LATENCY
+
+    def is_normal(self) -> bool:
+        return self == DeepEPMode.NORMAL
+
+    def is_low_latency(self) -> bool:
+        return self == DeepEPMode.LOW_LATENCY
+
+    def is_auto(self) -> bool:
+        return self == DeepEPMode.AUTO
+
+
+MOE_A2A_BACKEND: Optional[MoeA2ABackend] = None
+MOE_RUNNER_BACKEND: Optional[MoeRunnerBackend] = None
+SPECULATIVE_MOE_RUNNER_BACKEND: Optional[MoeRunnerBackend] = None
+SPECULATIVE_MOE_A2A_BACKEND: Optional[MoeA2ABackend] = None
+DEEPEP_MODE: Optional[DeepEPMode] = None
+IS_TBO_ENABLED: Optional[bool] = None
+IS_SBO_ENABLED: Optional[bool] = None
+TBO_TOKEN_DISTRIBUTION_THRESHOLD: Optional[float] = None
+DEEPEP_CONFIG: Optional[str] = None
+DISABLE_FLASHINFER_CUTLASS_MOE_FP4_ALLGATHER: Optional[bool] = None
+MOE_QUANTIZATION: Optional[str] = None
+
+
+def initialize_moe_config(server_args: ServerArgs):
+    global MOE_A2A_BACKEND
+    global MOE_RUNNER_BACKEND
+    global SPECULATIVE_MOE_RUNNER_BACKEND
+    global SPECULATIVE_MOE_A2A_BACKEND
+    global DEEPEP_MODE
+    global DEEPEP_CONFIG
+    global IS_TBO_ENABLED
+    global IS_SBO_ENABLED
+    global TBO_TOKEN_DISTRIBUTION_THRESHOLD
+    global DISABLE_FLASHINFER_CUTLASS_MOE_FP4_ALLGATHER
+    global MOE_QUANTIZATION
+
+    MOE_A2A_BACKEND = MoeA2ABackend(server_args.moe_a2a_backend)
+    MOE_RUNNER_BACKEND = MoeRunnerBackend(server_args.moe_runner_backend)
+    SPECULATIVE_MOE_RUNNER_BACKEND = (
+        MoeRunnerBackend(server_args.speculative_moe_runner_backend)
+        if server_args.speculative_moe_runner_backend is not None
+        else MOE_RUNNER_BACKEND
+    )
+    SPECULATIVE_MOE_A2A_BACKEND = (
+        MoeA2ABackend(server_args.speculative_moe_a2a_backend)
+        if server_args.speculative_moe_a2a_backend is not None
+        else MOE_A2A_BACKEND
+    )
+    DEEPEP_MODE = DeepEPMode(server_args.deepep_mode)
+    DEEPEP_CONFIG = server_args.deepep_config or ""
+    IS_TBO_ENABLED = server_args.enable_two_batch_overlap
+    IS_SBO_ENABLED = server_args.enable_single_batch_overlap
+    TBO_TOKEN_DISTRIBUTION_THRESHOLD = server_args.tbo_token_distribution_threshold
+    DISABLE_FLASHINFER_CUTLASS_MOE_FP4_ALLGATHER = (
+        server_args.disable_flashinfer_cutlass_moe_fp4_allgather
+    )
+    MOE_QUANTIZATION = server_args.quantization
+
+
+def get_moe_a2a_backend() -> MoeA2ABackend:
+    global MOE_A2A_BACKEND
+    if MOE_A2A_BACKEND is None:
+        MOE_A2A_BACKEND = MoeA2ABackend.NONE
+    return MOE_A2A_BACKEND
+
+
+def get_moe_runner_backend() -> MoeRunnerBackend:
+    global MOE_RUNNER_BACKEND
+    if MOE_RUNNER_BACKEND is None:
+        MOE_RUNNER_BACKEND = MoeRunnerBackend.AUTO
+    return MOE_RUNNER_BACKEND
+
+
+def get_speculative_moe_runner_backend() -> MoeRunnerBackend:
+    global SPECULATIVE_MOE_RUNNER_BACKEND
+    if SPECULATIVE_MOE_RUNNER_BACKEND is None:
+        logger.warning(
+            "SPECULATIVE_MOE_RUNNER_BACKEND is not initialized, using auto backend"
+        )
+        SPECULATIVE_MOE_RUNNER_BACKEND = MoeRunnerBackend.AUTO
+    return SPECULATIVE_MOE_RUNNER_BACKEND
+
+
+def get_speculative_moe_a2a_backend() -> MoeA2ABackend:
+    global SPECULATIVE_MOE_A2A_BACKEND
+    if SPECULATIVE_MOE_A2A_BACKEND is None:
+        logger.warning(
+            "SPECULATIVE_MOE_A2A_BACKEND is not initialized, using none backend"
+        )
+        SPECULATIVE_MOE_A2A_BACKEND = MoeA2ABackend.NONE
+    return SPECULATIVE_MOE_A2A_BACKEND
+
+
+def get_deepep_mode() -> DeepEPMode:
+    global DEEPEP_MODE
+    if DEEPEP_MODE is None:
+        logger.warning("DEEPEP_MODE is not initialized, using auto mode")
+        DEEPEP_MODE = DeepEPMode.AUTO
+    return DEEPEP_MODE
+
+
+def get_deepep_config() -> str:
+    global DEEPEP_CONFIG
+    if DEEPEP_CONFIG is None:
+        logger.warning("DEEPEP_CONFIG is not initialized, using default config")
+        DEEPEP_CONFIG = ""
+    return DEEPEP_CONFIG
+
+
+def is_tbo_enabled() -> bool:
+    global IS_TBO_ENABLED
+    if IS_TBO_ENABLED is None:
+        IS_TBO_ENABLED = False
+    return IS_TBO_ENABLED
+
+
+def is_sbo_enabled() -> bool:
+    global IS_SBO_ENABLED
+    if IS_SBO_ENABLED is None:
+        IS_SBO_ENABLED = False
+    return IS_SBO_ENABLED
+
+
+def get_tbo_token_distribution_threshold() -> float:
+    global TBO_TOKEN_DISTRIBUTION_THRESHOLD
+    if TBO_TOKEN_DISTRIBUTION_THRESHOLD is None:
+        logger.warning(
+            "TBO_TOKEN_DISTRIBUTION_THRESHOLD is not initialized, using 0.48"
+        )
+        TBO_TOKEN_DISTRIBUTION_THRESHOLD = 0.48
+    return TBO_TOKEN_DISTRIBUTION_THRESHOLD
+
+
+def filter_moe_weight_param_global_expert(name, x, num_local_experts):
+    """
+    Filter out for MoE expert parameters that requires global expert.
+    """
+    return (
+        not getattr(x, "_sglang_require_global_experts", False)
+        and x.data.ndim > 0
+        and x.data.shape[0] == num_local_experts
+    )
+
+
+def should_use_flashinfer_cutlass_moe_fp4_allgather():
+    """
+    Perform FP4 quantize before all-gather for flashinfer cutlass moe to reduce communication cost for high-throughput serving.
+    """
+    return (
+        not DISABLE_FLASHINFER_CUTLASS_MOE_FP4_ALLGATHER
+        and get_moe_a2a_backend().is_none()
+        and get_moe_runner_backend().is_flashinfer_cutlass()
+        and is_dp_attention_enabled()
+        and MOE_QUANTIZATION == "modelopt_fp4"
+        and get_moe_expert_parallel_world_size() == get_attention_dp_size()
+    )
+
+
+@contextmanager
+def speculative_moe_backend_context():
+    """
+    Context manager to temporarily use the speculative MoE backend for draft model operations.
+    This ensures that draft models in speculative decoding use the configured speculative backend.
+    """
+    global MOE_RUNNER_BACKEND
+    original_backend = MOE_RUNNER_BACKEND
+    try:
+        MOE_RUNNER_BACKEND = get_speculative_moe_runner_backend()
+        yield
+    finally:
+        MOE_RUNNER_BACKEND = original_backend
+
+
+@contextmanager
+def speculative_moe_a2a_backend_context():
+    """
+    Context manager to temporarily use the speculative MoE A2A backend for draft model operations.
+    This ensures that draft models in speculative decoding use the configured speculative A2A backend.
+    """
+    global MOE_A2A_BACKEND
+    global DISABLE_FLASHINFER_CUTLASS_MOE_FP4_ALLGATHER
+    original_backend = MOE_A2A_BACKEND
+    original_disable_flashinfer_cutlass_moe_fp4_allgather = (
+        DISABLE_FLASHINFER_CUTLASS_MOE_FP4_ALLGATHER
+    )
+    try:
+        MOE_A2A_BACKEND = get_speculative_moe_a2a_backend()
+        # Disable FP4 allgather for spec decode since MTP layers are unquantized
+        DISABLE_FLASHINFER_CUTLASS_MOE_FP4_ALLGATHER = True
+        yield
+    finally:
+        MOE_A2A_BACKEND = original_backend
+        DISABLE_FLASHINFER_CUTLASS_MOE_FP4_ALLGATHER = (
+            original_disable_flashinfer_cutlass_moe_fp4_allgather
+        )
+
+
+# The type of method in top-K routing, for use in torch custom op
+# Please keep this in sync with the counterpart defined in https://github.com/flashinfer-ai/flashinfer/blob/main/include/flashinfer/trtllm/fused_moe/runner.h
+class RoutingMethodType(IntEnum):
+    # Default: Softmax -> TopK
+    Default = (0,)
+    # Renormalize: TopK -> Softmax
+    Renormalize = (1,)
+    # DeepSeekV3: Sigmoid -> RoutingBiasAdd -> Top2 in group -> Top4 groups -> Top8 experts from the Top4 groups
+    DeepSeekV3 = (2,)
+    # Llama4: Top1 -> Sigmoid
+    Llama4 = (3,)
+    # Qwen3: Softmax -> TopK -> Renormalize
+    RenormalizeNaive = (4,)
+    # TopK only (no softmax)
+    TopK = (5,)
+    # Unspecified
+    Unspecified = 6
diff --git a/sglang/python/sglang/srt/layers/multimodal.py b/sglang/python/sglang/srt/layers/multimodal.py
new file mode 100644
index 0000000000000000000000000000000000000000..738c658307a6b16506e49a094bf0352b766f0143
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/multimodal.py
@@ -0,0 +1,189 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Logits processing."""
+
+import torch
+import triton
+import triton.language as tl
+
+FMIX32_C1 = 0x85EBCA6B
+FMIX32_C2 = 0xC2B2AE35
+POS_C1 = 0x27D4EB2D
+POS_C2 = 0x165667B1
+
+
+@triton.jit
+def _rotl32(x, r: tl.constexpr):
+    return (x << r) | (x >> (32 - r))
+
+
+@triton.jit
+def _fmix32(x, C1: tl.constexpr, C2: tl.constexpr):
+    c1 = tl.full((), C1, tl.uint32)
+    c2 = tl.full((), C2, tl.uint32)
+    x ^= x >> 16
+    x = x * c1
+    x ^= x >> 13
+    x = x * c2
+    x ^= x >> 16
+    return x
+
+
+@triton.jit
+def hash_tiles32_kernel_blocked(
+    in_ptr,
+    out_ptr,
+    n_u32,
+    seed1,
+    seed2,
+    FM_C1: tl.constexpr,
+    FM_C2: tl.constexpr,
+    POS_A: tl.constexpr,
+    POS_B: tl.constexpr,
+    TILE: tl.constexpr,
+    BLOCK: tl.constexpr,
+    USE_CG: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+    base = pid * TILE
+
+    s1 = tl.full((), seed1, tl.uint32)
+    s2 = tl.full((), seed2, tl.uint32)
+    posA = tl.full((), POS_A, tl.uint32)
+    posB = tl.full((), POS_B, tl.uint32)
+
+    h1 = tl.zeros((), dtype=tl.uint32)
+    h2 = tl.zeros((), dtype=tl.uint32)
+
+    for off in tl.static_range(0, TILE, BLOCK):
+        idx = base + off + tl.arange(0, BLOCK)
+        m = idx < n_u32
+
+        if USE_CG:
+            v = tl.load(in_ptr + idx, mask=m, other=0, cache_modifier=".cg")
+        else:
+            v = tl.load(in_ptr + idx, mask=m, other=0)
+        v = v.to(tl.uint32)
+
+        iu = idx.to(tl.uint32)
+        p1 = (iu * posA + s1) ^ _rotl32(iu, 15)
+        p2 = (iu * posB + s2) ^ _rotl32(iu, 13)
+
+        k1 = _fmix32(v ^ p1, C1=FM_C1, C2=FM_C2)
+        k2 = _fmix32(v ^ p2, C1=FM_C1, C2=FM_C2)
+
+        zero32 = tl.zeros_like(k1)
+        k1 = tl.where(m, k1, zero32)
+        k2 = tl.where(m, k2, zero32)
+
+        h1 += tl.sum(k1, axis=0).to(tl.uint32)
+        h2 += tl.sum(k2, axis=0).to(tl.uint32)
+
+    nbytes = tl.full((), n_u32 * 4, tl.uint32)
+    h1 ^= nbytes
+    h2 ^= nbytes
+    h1 = _fmix32(h1, C1=FM_C1, C2=FM_C2)
+    h2 = (
+        _fmix32(h2, C1=FMIX32_C1, C2=FMIX32_C2)
+        if False
+        else _fmix32(h2, C1=FM_C1, C2=FM_C2)
+    )
+
+    out = (h1.to(tl.uint64) << 32) | h2.to(tl.uint64)
+    tl.store(out_ptr + pid, out)
+
+
+@triton.jit
+def add_tree_reduce_u64_kernel(in_ptr, out_ptr, n_elems, CHUNK: tl.constexpr):
+    pid = tl.program_id(axis=0)
+    start = pid * CHUNK
+    h = tl.zeros((), dtype=tl.uint64)
+    for i in tl.static_range(0, CHUNK):
+        idx = start + i
+        m = idx < n_elems
+        v = tl.load(in_ptr + idx, mask=m, other=0).to(tl.uint64)
+        h += v
+    tl.store(out_ptr + pid, h)
+
+
+def _as_uint32_words(t: torch.Tensor) -> torch.Tensor:
+    assert t.is_cuda, "Use .cuda() first"
+    tb = t.contiguous().view(torch.uint8)
+    nbytes = tb.numel()
+    pad = (4 - (nbytes & 3)) & 3
+    if pad:
+        tb_p = torch.empty(nbytes + pad, dtype=torch.uint8, device=tb.device)
+        tb_p[:nbytes].copy_(tb)
+        tb_p[nbytes:].zero_()
+        tb = tb_p
+    return tb.view(torch.uint32)
+
+
+def _final_splitmix64(x: int) -> int:
+    mask = (1 << 64) - 1
+    x &= mask
+    x ^= x >> 30
+    x = (x * 0xBF58476D1CE4E5B9) & mask
+    x ^= x >> 27
+    x = (x * 0x94D049BB133111EB) & mask
+    x ^= x >> 31
+    return x
+
+
+@torch.inference_mode()
+def gpu_tensor_hash(
+    tensor: torch.Tensor,
+    *,
+    seed: int = 0x243F6A88,
+    tile_words: int = 8192,
+    block_words: int = 256,
+    reduce_chunk: int = 1024,
+    num_warps: int = 4,
+    num_stages: int = 4,
+    use_cg: bool = True,
+) -> int:
+    assert tensor.is_cuda, "Use .cuda() first"
+    u32 = _as_uint32_words(tensor)
+    n = u32.numel()
+    if n == 0:
+        return 0
+
+    grid1 = (triton.cdiv(n, tile_words),)
+    partials = torch.empty(grid1[0], dtype=torch.uint64, device=u32.device)
+    hash_tiles32_kernel_blocked[grid1](
+        u32,
+        partials,
+        n,
+        seed1=seed & 0xFFFFFFFF,
+        seed2=((seed * 0x9E3779B1) ^ 0xDEADBEEF) & 0xFFFFFFFF,
+        FM_C1=FMIX32_C1,
+        FM_C2=FMIX32_C2,
+        POS_A=POS_C1,
+        POS_B=POS_C2,
+        TILE=tile_words,
+        BLOCK=block_words,
+        USE_CG=use_cg,
+        num_warps=num_warps,
+        num_stages=num_stages,
+    )
+
+    cur = partials
+    while cur.numel() > 1:
+        n_elems = cur.numel()
+        grid2 = (triton.cdiv(n_elems, reduce_chunk),)
+        nxt = torch.empty(grid2[0], dtype=torch.uint64, device=cur.device)
+        add_tree_reduce_u64_kernel[grid2](cur, nxt, n_elems, CHUNK=reduce_chunk)
+        cur = nxt
+
+    return _final_splitmix64(int(cur.item()))
diff --git a/sglang/python/sglang/srt/layers/parameter.py b/sglang/python/sglang/srt/layers/parameter.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff0deb03e5cf19d5992bc16cceb29a2d1d177aa5
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/parameter.py
@@ -0,0 +1,596 @@
+"""Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/parameter.py"""
+
+import logging
+from fractions import Fraction
+from typing import Callable, Optional, Union
+
+import torch
+from torch.nn import Parameter
+
+from sglang.srt.environ import envs
+from sglang.srt.layers.utils import pad_or_narrow_weight
+from sglang.srt.utils import is_cpu
+
+__all__ = [
+    "BasevLLMParameter",
+    "PackedvLLMParameter",
+    "PerTensorScaleParameter",
+    "ModelWeightParameter",
+    "ChannelQuantScaleParameter",
+    "GroupQuantScaleParameter",
+    "BlockQuantScaleParameter",
+    "PackedColumnParameter",
+    "RowvLLMParameter",
+]
+
+logger = logging.getLogger(__name__)
+
+_is_cpu = is_cpu()
+
+
+def _dtype_rank(dtype: torch.dtype) -> Optional[int]:
+    if dtype in (
+        torch.float8_e4m3fn,
+        torch.float8_e4m3fnuz,
+        torch.float8_e5m2,
+        torch.float8_e5m2fnuz,
+    ):
+        return 0
+    if dtype in (torch.float16, torch.bfloat16):
+        return 1
+    if dtype == torch.float32:
+        return 2
+    if dtype == torch.float64:
+        return 3
+    return None
+
+
+def copy_with_check(target: torch.Tensor, loaded_weight: torch.Tensor):
+    """
+    Copy `loaded_weight` into `target` while forbidding downcasts.
+    bf16/fp16 share the same rank, and all fp8 variants share the same rank.
+    """
+
+    assert (
+        target.shape == loaded_weight.shape
+    ), f"{target.shape=}, {loaded_weight.shape=}"
+
+    if target.dtype == loaded_weight.dtype:
+        target.copy_(loaded_weight)
+        return
+
+    target_rank = _dtype_rank(target.dtype)
+    loaded_rank = _dtype_rank(loaded_weight.dtype)
+
+    if target_rank is None or loaded_rank is None:
+        raise ValueError(
+            f"Unsupported copy between dtypes: {target.dtype=}, {loaded_weight.dtype=}"
+        )
+    if target_rank < loaded_rank and not envs.SGLANG_QUANT_ALLOW_DOWNCASTING.get():
+        raise ValueError(
+            f"Downcasting not allowed: {target.dtype=}, {loaded_weight.dtype=}"
+        )
+
+    target.copy_(loaded_weight)
+
+
+class BasevLLMParameter(Parameter):
+    """
+    Base parameter for vLLM linear layers. Extends the torch.nn.parameter
+    by taking in a linear weight loader. Will copy the loaded weight
+    into the parameter when the provided weight loader is called.
+    """
+
+    def __new__(cls, data: torch.Tensor, **kwargs):
+
+        return super().__new__(cls, data=data, requires_grad=False)
+
+    def __init__(self, data: torch.Tensor, weight_loader: Callable):
+        """
+        Initialize the BasevLLMParameter
+
+        :param data: torch tensor with the parameter data
+        :param weight_loader: weight loader callable
+
+        :returns: a torch.nn.parameter
+        """
+
+        self._weight_loader = weight_loader
+
+    @property
+    def weight_loader(self):
+        return self._weight_loader
+
+    def _assert_and_load(self, loaded_weight: torch.Tensor):
+        assert self.data.shape == loaded_weight.shape
+        self.data.copy_(loaded_weight)
+
+    def load_column_parallel_weight(self, loaded_weight: torch.Tensor):
+        self._assert_and_load(loaded_weight)
+
+    def load_row_parallel_weight(self, loaded_weight: torch.Tensor):
+        self._assert_and_load(loaded_weight)
+
+    def load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs):
+        self._assert_and_load(loaded_weight)
+
+    def load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs):
+        self._assert_and_load(loaded_weight)
+
+
+class _ColumnvLLMParameter(BasevLLMParameter):
+    """
+    Private class defining weight loading functionality
+    (load_merged_column_weight, load_qkv_weight)
+    for parameters being loaded into linear layers with column
+    parallelism. This includes QKV and MLP layers which are
+    not already fused on disk. Requires an output dimension
+    to be defined. Called within the weight loader of
+    each of the column parallel linear layers.
+    """
+
+    def __init__(self, output_dim: int, **kwargs):
+        self._output_dim = output_dim
+        super().__init__(**kwargs)
+
+    @property
+    def output_dim(self):
+        return self._output_dim
+
+    def load_column_parallel_weight(
+        self,
+        loaded_weight: torch.Tensor,
+        tp_rank: int,
+        use_presharded_weights: bool = False,
+    ):
+        if not use_presharded_weights:
+            shard_size = self.data.shape[self.output_dim]
+
+            from sglang.srt.model_loader.weight_utils import (
+                narrow_padded_param_and_loaded_weight,
+            )
+
+            if _is_cpu:
+                param_data, loaded_weight = narrow_padded_param_and_loaded_weight(
+                    self.data,
+                    loaded_weight,
+                    0,  # param_data_start
+                    tp_rank * shard_size,
+                    self.output_dim,
+                    shard_size,
+                )
+                assert param_data.shape == loaded_weight.shape
+                param_data.copy_(loaded_weight)
+                return
+            else:
+                loaded_weight = loaded_weight.narrow(
+                    self.output_dim, tp_rank * shard_size, shard_size
+                )
+
+        copy_with_check(self.data, loaded_weight)
+
+    def load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs):
+
+        shard_offset = kwargs.get("shard_offset")
+        shard_size = kwargs.get("shard_size")
+        tp_rank = kwargs.get("tp_rank")
+        use_presharded_weights = kwargs.get("use_presharded_weights")
+        if (
+            isinstance(self, (PackedColumnParameter, PackedvLLMParameter))
+            and self.packed_dim == self.output_dim
+        ):
+            shard_size, shard_offset = self.adjust_shard_indexes_for_packing(
+                shard_offset=shard_offset, shard_size=shard_size
+            )
+
+        param_data = self.data
+
+        param_data = param_data.narrow(self.output_dim, shard_offset, shard_size)
+
+        from sglang.srt.model_loader.weight_utils import (
+            narrow_padded_param_and_loaded_weight,
+        )
+
+        if _is_cpu:
+            param_data, loaded_weight = narrow_padded_param_and_loaded_weight(
+                param_data,
+                loaded_weight,
+                0,  # param_data_start
+                tp_rank * shard_size,
+                self.output_dim,
+                shard_size,
+                not use_presharded_weights,
+            )
+        else:
+            if not use_presharded_weights:
+                # Padding for special case like qwen2_5_VL's mlp which is not 8-aligned
+                start_idx = tp_rank * shard_size
+                end_idx = start_idx + shard_size
+                if end_idx > loaded_weight.shape[self.output_dim]:
+                    loaded_weight = pad_or_narrow_weight(
+                        loaded_weight, self.output_dim, start_idx, shard_size
+                    )
+                else:
+                    loaded_weight = loaded_weight.narrow(
+                        self.output_dim, start_idx, shard_size
+                    )
+
+        assert param_data.shape == loaded_weight.shape
+        param_data.copy_(loaded_weight)
+
+    def load_qkv_weight(
+        self,
+        loaded_weight: torch.Tensor,
+        tp_rank: int,
+        use_presharded_weights: bool = False,
+        **kwargs,
+    ):
+
+        shard_offset = kwargs.get("shard_offset")
+        shard_size = kwargs.get("shard_size")
+        shard_id = kwargs.get("shard_id")
+        num_heads = kwargs.get("num_heads")
+
+        if (
+            isinstance(self, (PackedColumnParameter, PackedvLLMParameter))
+            and self.output_dim == self.packed_dim
+        ):
+            shard_size, shard_offset = self.adjust_shard_indexes_for_packing(
+                shard_offset=shard_offset, shard_size=shard_size
+            )
+
+        param_data = self.data
+        shard_id = tp_rank if shard_id == "q" else tp_rank // num_heads
+        param_data = param_data.narrow(self.output_dim, shard_offset, shard_size)
+
+        if _is_cpu:
+            from sglang.srt.model_loader.weight_utils import (
+                narrow_padded_param_and_loaded_weight,
+            )
+
+            param_data, loaded_weight = narrow_padded_param_and_loaded_weight(
+                param_data,
+                loaded_weight,
+                0,  # param_data_start
+                shard_id * shard_size,
+                self.output_dim,
+                shard_size,
+                not use_presharded_weights,
+            )
+        else:
+            if not use_presharded_weights:
+                loaded_weight = loaded_weight.narrow(
+                    self.output_dim, shard_id * shard_size, shard_size
+                )
+
+        assert (
+            param_data.shape == loaded_weight.shape
+        ), f"{param_data.shape=}, {loaded_weight.shape=}"
+        param_data.copy_(loaded_weight)
+
+
+class RowvLLMParameter(BasevLLMParameter):
+    """
+    Parameter class defining weight_loading functionality
+    (load_row_parallel_weight) for parameters being loaded
+    into linear layers with row parallel functionality.
+    Requires an input_dim to be defined.
+    """
+
+    def __init__(self, input_dim: int, **kwargs):
+        self._input_dim = input_dim
+        super().__init__(**kwargs)
+
+    @property
+    def input_dim(self):
+        return self._input_dim
+
+    def load_row_parallel_weight(
+        self,
+        loaded_weight: torch.Tensor,
+        tp_rank: int,
+        use_presharded_weights: bool = False,
+    ):
+        if not use_presharded_weights:
+            shard_size = self.data.shape[self.input_dim]
+
+            from sglang.srt.model_loader.weight_utils import (
+                narrow_padded_param_and_loaded_weight,
+            )
+
+            if _is_cpu:
+                param_data, loaded_weight = narrow_padded_param_and_loaded_weight(
+                    self.data,
+                    loaded_weight,
+                    0,  # param_data_start
+                    tp_rank * shard_size,
+                    self.input_dim,
+                    shard_size,
+                )
+
+                assert param_data.shape == loaded_weight.shape
+                param_data.copy_(loaded_weight)
+
+                return
+            else:
+                # Padding for special case like qwen2_5_VL's mlp which is not 8-aligned
+                start_idx = tp_rank * shard_size
+                end_idx = start_idx + shard_size
+                if end_idx > loaded_weight.shape[self.input_dim]:
+                    loaded_weight = pad_or_narrow_weight(
+                        loaded_weight, self.input_dim, start_idx, shard_size
+                    )
+                else:
+                    loaded_weight = loaded_weight.narrow(
+                        self.input_dim, start_idx, shard_size
+                    )
+
+        if len(loaded_weight.shape) == 0:
+            loaded_weight = loaded_weight.reshape(1)
+
+        assert self.data.shape == loaded_weight.shape
+        self.data.copy_(loaded_weight)
+
+
+class ModelWeightParameter(_ColumnvLLMParameter, RowvLLMParameter):
+    """
+    Parameter class for linear layer weights. Uses both column and
+    row parallelism.
+    """
+
+    pass
+
+
+class GroupQuantScaleParameter(_ColumnvLLMParameter, RowvLLMParameter):
+    """
+    Parameter class for weight scales loaded for weights with
+    grouped quantization. Uses both column and row parallelism.
+    """
+
+    pass
+
+
+class ChannelQuantScaleParameter(_ColumnvLLMParameter):
+    """
+    Parameter class for weight scales loaded for weights with
+    channel-wise quantization. Equivalent to _ColumnvLLMParameter.
+    """
+
+    pass
+
+
+class BlockQuantScaleParameter(_ColumnvLLMParameter, RowvLLMParameter):
+    """
+    Parameter class for weight scales loaded for weights with
+    block-wise quantization. Uses both column and row parallelism.
+    """
+
+    pass
+
+
+class PerTensorScaleParameter(BasevLLMParameter):
+    """
+    Parameter class for scales where the number of scales is
+    equivalent to the number of logical matrices in fused linear
+    layers (e.g. for QKV, there are 3 scales loaded from disk).
+    This is relevant to weights with per-tensor quantization.
+    Adds functionality to map the scalers to a shard during
+    weight loading.
+
+    Note: additional parameter manipulation may be handled
+    for each quantization config specifically, within
+    process_weights_after_loading
+    """
+
+    def __init__(self, **kwargs):
+        self.qkv_idxs = {"q": 0, "k": 1, "v": 2}
+        super().__init__(**kwargs)
+
+    def _shard_id_as_int(self, shard_id: Union[str, int]) -> int:
+        if isinstance(shard_id, int):
+            return shard_id
+
+        # if not int, assume shard_id for qkv
+        # map to int and return
+        assert isinstance(shard_id, str)
+        assert shard_id in self.qkv_idxs
+        return self.qkv_idxs[shard_id]
+
+    # For row parallel layers, no sharding needed
+    # load weight into parameter as is
+    def load_row_parallel_weight(self, *args, **kwargs):
+        kwargs.pop("tp_rank", None)
+        kwargs.pop("use_presharded_weights", None)
+        super().load_row_parallel_weight(*args, **kwargs)
+
+    def load_merged_column_weight(self, *args, **kwargs):
+        self._load_into_shard_id(*args, **kwargs)
+
+    def load_qkv_weight(self, *args, **kwargs):
+        self._load_into_shard_id(*args, **kwargs)
+
+    def load_column_parallel_weight(self, *args, **kwargs):
+        kwargs.pop("tp_rank", None)
+        kwargs.pop("use_presharded_weights", None)
+        super().load_row_parallel_weight(*args, **kwargs)
+
+    def _load_into_shard_id(
+        self, loaded_weight: torch.Tensor, shard_id: Union[str, int], **kwargs
+    ):
+        """
+        Slice the parameter data based on the shard id for
+        loading.
+        """
+
+        param_data = self.data
+        shard_id = self._shard_id_as_int(shard_id)
+
+        # AutoFP8 scales do not have a shape
+        # compressed-tensors scales do have a shape
+        if len(loaded_weight.shape) != 0:
+            assert loaded_weight.shape[0] == 1
+            loaded_weight = loaded_weight[0]
+
+        param_data = param_data[shard_id]
+        assert param_data.shape == loaded_weight.shape
+        param_data.copy_(loaded_weight)
+
+
+class PackedColumnParameter(_ColumnvLLMParameter):
+    """
+    Parameter for model parameters which are packed on disk
+    and support column parallelism only. See PackedvLLMParameter
+    for more details on the packed properties.
+    """
+
+    def __init__(
+        self,
+        packed_factor: Union[int, Fraction],
+        packed_dim: int,
+        marlin_tile_size: Optional[int] = None,
+        **kwargs,
+    ):
+        self._packed_factor = packed_factor
+        self._packed_dim = packed_dim
+        self._marlin_tile_size = marlin_tile_size
+        super().__init__(**kwargs)
+
+    @property
+    def packed_dim(self):
+        return self._packed_dim
+
+    @property
+    def packed_factor(self):
+        return self._packed_factor
+
+    @property
+    def marlin_tile_size(self):
+        return self._marlin_tile_size
+
+    def adjust_shard_indexes_for_packing(self, shard_size, shard_offset):
+        return _adjust_shard_indexes_for_packing(
+            shard_size=shard_size,
+            shard_offset=shard_offset,
+            packed_factor=self.packed_factor,
+            marlin_tile_size=self.marlin_tile_size,
+        )
+
+
+class PackedvLLMParameter(ModelWeightParameter):
+    """
+    Parameter for model weights which are packed on disk.
+    Example: GPTQ Marlin weights are int4 or int8, packed into int32.
+    Extends the ModelWeightParameter to take in the
+    packed factor, the packed dimension, and optionally, marlin
+    tile size for marlin kernels. Adjusts the shard_size and
+    shard_offset for fused linear layers model weight loading
+    by accounting for packing and optionally, marlin tile size.
+    """
+
+    def __init__(
+        self,
+        packed_factor: Union[int, Fraction],
+        packed_dim: int,
+        marlin_tile_size: Optional[int] = None,
+        **kwargs,
+    ):
+        self._packed_factor = packed_factor
+        self._packed_dim = packed_dim
+        self._marlin_tile_size = marlin_tile_size
+        super().__init__(**kwargs)
+
+    @property
+    def packed_dim(self):
+        return self._packed_dim
+
+    @property
+    def packed_factor(self):
+        return self._packed_factor
+
+    @property
+    def marlin_tile_size(self):
+        return self._marlin_tile_size
+
+    def adjust_shard_indexes_for_packing(self, shard_size, shard_offset):
+        return _adjust_shard_indexes_for_packing(
+            shard_size=shard_size,
+            shard_offset=shard_offset,
+            packed_factor=self.packed_factor,
+            marlin_tile_size=self.marlin_tile_size,
+        )
+
+
+def permute_param_layout_(
+    param: BasevLLMParameter, input_dim: int, output_dim: int, **kwargs
+) -> BasevLLMParameter:
+    """
+    Permute a parameter's layout to the specified input and output dimensions,
+    useful for forcing the parameter into a known layout, for example, if I need
+    a packed (quantized) weight matrix to be in the layout
+        {input_dim = 0, output_dim = 1, packed_dim = 0}
+    then I can call:
+        permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0)
+    to ensure x is in the correct layout (permuting it to the correct layout if
+    required, asserting if it cannot get it to the correct layout)
+    """
+
+    curr_input_dim = getattr(param, "input_dim", None)
+    curr_output_dim = getattr(param, "output_dim", None)
+
+    if curr_input_dim is None or curr_output_dim is None:
+        assert param.data.dim() == 2, (
+            "permute_param_layout_ only supports 2D parameters when either "
+            "input_dim or output_dim is not set"
+        )
+
+    # if one of the dimensions is not set, set it to the opposite of the other
+    #  we can only do this since we asserted the parameter is 2D above
+    if curr_input_dim is None:
+        assert curr_output_dim is not None, "either input or output dim must be set"
+        curr_input_dim = (curr_output_dim + 1) % 2
+    if curr_output_dim is None:
+        assert curr_input_dim is not None, "either input or output dim must be set"
+        curr_output_dim = (curr_input_dim + 1) % 2
+
+    # create permutation from the current layout to the layout with
+    # self.input_dim at input_dim and self.output_dim at output_dim preserving
+    # other dimensions
+    perm = [
+        i for i in range(param.data.dim()) if i not in [curr_input_dim, curr_output_dim]
+    ]
+    perm.insert(input_dim, curr_input_dim)
+    perm.insert(output_dim, curr_output_dim)
+
+    if "packed_dim" in kwargs:
+        assert (
+            hasattr(param, "packed_dim")
+            and param.packed_dim == perm[kwargs["packed_dim"]]
+        ), "permute_param_layout_ currently doesn't support repacking"
+
+    param.data = param.data.permute(*perm)
+    if hasattr(param, "_input_dim"):
+        param._input_dim = input_dim
+    if hasattr(param, "_output_dim"):
+        param._output_dim = output_dim
+    if "packed_dim" in kwargs and hasattr(param, "_packed_dim"):
+        param._packed_dim = kwargs["packed_dim"]
+
+    return param
+
+
+def _adjust_shard_indexes_for_marlin(shard_size, shard_offset, marlin_tile_size):
+    return shard_size * marlin_tile_size, shard_offset * marlin_tile_size
+
+
+def _adjust_shard_indexes_for_packing(
+    shard_size, shard_offset, packed_factor, marlin_tile_size
+):
+    shard_size = shard_size // packed_factor
+    shard_offset = shard_offset // packed_factor
+    if marlin_tile_size is not None:
+        return _adjust_shard_indexes_for_marlin(
+            shard_size=shard_size,
+            shard_offset=shard_offset,
+            marlin_tile_size=marlin_tile_size,
+        )
+    return shard_size, shard_offset
diff --git a/sglang/python/sglang/srt/layers/pooler.py b/sglang/python/sglang/srt/layers/pooler.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9f8e1a181756a15c6fdab2d0a9a1c2461f86ca8
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/pooler.py
@@ -0,0 +1,131 @@
+# adapted from
+# https://github.com/vllm-project/vllm/blob/82a1b1a82b1fbb454c82a9ef95730b929c9b270c/vllm/model_executor/layers/pooler.py
+
+from dataclasses import dataclass
+from enum import IntEnum
+from typing import Optional
+
+import torch
+import torch.nn as nn
+from transformers import PretrainedConfig
+
+from sglang.srt.layers.activation import get_cross_encoder_activation_function
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+
+
+class PoolingType(IntEnum):
+    LAST = 0
+    CLS = 1
+
+
+@dataclass
+class EmbeddingPoolerOutput:
+    # Pooler can return list[tensor] instead of tensor if the dimension of each tensor in the batch is different
+    # due to different per-request matryoshka dim truncation
+    embeddings: torch.Tensor | list[torch.Tensor]
+
+
+class Pooler(nn.Module):
+    """A layer that pools specific information from hidden states.
+    This layer does the following:
+    1. Extracts specific tokens or aggregates data based on pooling method.
+    2. Normalizes output if specified.
+    3. Returns structured results as `PoolerOutput`.
+    Attributes:
+        pooling_type: The type of pooling to use (LAST, AVERAGE, MAX).
+        normalize: Whether to normalize the pooled data.
+    """
+
+    def __init__(self, pooling_type: PoolingType, normalize: bool):
+        super().__init__()
+        self.pooling_type = pooling_type
+        self.normalize = normalize
+
+    def forward(
+        self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
+    ) -> EmbeddingPoolerOutput:
+
+        if self.pooling_type == PoolingType.LAST:
+            last_token_indices = torch.cumsum(forward_batch.extend_seq_lens, dim=0) - 1
+            pooled_data = hidden_states[last_token_indices]
+        elif self.pooling_type == PoolingType.CLS:
+            prompt_lens = forward_batch.extend_seq_lens
+            first_token_flat_indices = torch.zeros_like(prompt_lens)
+            first_token_flat_indices[1:] += torch.cumsum(prompt_lens, dim=0)[:-1]
+            pooled_data = hidden_states[first_token_flat_indices]
+        else:
+            raise ValueError(f"Invalid pooling type: {self.pooling_type}")
+
+        if forward_batch.dimensions is not None:
+            all_same_dimensions = len(set(forward_batch.dimensions)) == 1
+            if all_same_dimensions:
+                pooled_data = pooled_data[..., : forward_batch.dimensions[0]]
+            else:
+                pooled_data = [
+                    tensor[..., :dim]
+                    for tensor, dim in zip(pooled_data, forward_batch.dimensions)
+                ]
+
+        if self.normalize:
+            if isinstance(pooled_data, list):
+                pooled_data = [
+                    nn.functional.normalize(tensor, p=2, dim=-1)
+                    for tensor in pooled_data
+                ]
+            else:
+                pooled_data = nn.functional.normalize(pooled_data, p=2, dim=-1)
+
+        return EmbeddingPoolerOutput(embeddings=pooled_data)
+
+
+class CrossEncodingPooler(nn.Module):
+    """A layer that pools specific information from hidden states.
+
+    This layer does the following:
+    1. Extracts specific tokens or aggregates data based on pooling method.
+    2. Normalizes output if specified.
+    3. Returns structured results as `EmbeddingPoolerOutput`.
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        classifier: nn.Module,
+        pooler: Optional[nn.Module] = None,
+    ):
+        super().__init__()
+        self.classifier = classifier
+        self.pooler = pooler
+        self.default_activation_function = get_cross_encoder_activation_function(config)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> EmbeddingPoolerOutput:
+        """Pools sentence pair scores from the hidden_states."""
+
+        prompt_lens = forward_batch.extend_seq_lens
+
+        offset = 0
+        pooled_data_lst = []
+        for prompt_len in prompt_lens:
+            pooled_data_i = hidden_states[offset : offset + prompt_len]
+
+            if self.pooler is not None:
+                final_shape_tensor = self.pooler(pooled_data_i, forward_batch)
+            else:
+                final_shape_tensor = self.classifier(pooled_data_i)
+
+            pooled_data_lst.append(final_shape_tensor)
+            offset += prompt_len
+
+        pooled_output = torch.stack(pooled_data_lst)
+
+        if self.pooler is not None:
+            # apply classifier once on the full batch if possible
+            pooled_output = self.classifier(pooled_output)
+
+        scores = self.default_activation_function(pooled_output).squeeze(-1)
+
+        return EmbeddingPoolerOutput(embeddings=scores)
diff --git a/sglang/python/sglang/srt/layers/quantization/__init__.py b/sglang/python/sglang/srt/layers/quantization/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9174c08ff559fa2a19c1b9b36520ec8334f983a9
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/__init__.py
@@ -0,0 +1,101 @@
+# Adapted from https://raw.githubusercontent.com/vllm-project/vllm/v0.5.5/vllm/model_executor/layers/quantization/__init__.py
+from __future__ import annotations
+
+import builtins
+import inspect
+from typing import TYPE_CHECKING, Dict, Optional, Type
+
+import torch
+
+
+# Define empty classes as placeholders when vllm is not available
+class DummyConfig:
+    def override_quantization_method(self, *args, **kwargs):
+        return None
+
+
+CompressedTensorsConfig = DummyConfig
+
+from sglang.srt.layers.quantization.auto_round import AutoRoundConfig
+from sglang.srt.layers.quantization.awq import AWQConfig, AWQMarlinConfig
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.quantization.bitsandbytes import BitsAndBytesConfig
+from sglang.srt.layers.quantization.blockwise_int8 import BlockInt8Config
+from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors import (
+    CompressedTensorsConfig,
+)
+from sglang.srt.layers.quantization.fp8 import Fp8Config
+from sglang.srt.layers.quantization.fpgemm_fp8 import FBGEMMFp8Config
+from sglang.srt.layers.quantization.gguf import GGUFConfig
+from sglang.srt.layers.quantization.gptq import GPTQConfig, GPTQMarlinConfig
+from sglang.srt.layers.quantization.modelopt_quant import (
+    ModelOptFp4Config,
+    ModelOptFp8Config,
+)
+from sglang.srt.layers.quantization.modelslim.modelslim import ModelSlimConfig
+from sglang.srt.layers.quantization.moe_wna16 import MoeWNA16Config
+from sglang.srt.layers.quantization.mxfp4 import Mxfp4Config
+from sglang.srt.layers.quantization.petit import PetitNvFp4Config
+from sglang.srt.layers.quantization.qoq import QoQConfig
+from sglang.srt.layers.quantization.quark.quark import QuarkConfig
+from sglang.srt.layers.quantization.quark_int4fp8_moe import QuarkInt4Fp8Config
+from sglang.srt.layers.quantization.w4afp8 import W4AFp8Config
+from sglang.srt.layers.quantization.w8a8_fp8 import W8A8Fp8Config
+from sglang.srt.layers.quantization.w8a8_int8 import W8A8Int8Config
+from sglang.srt.utils import is_cuda, is_hip, is_npu, mxfp_supported
+
+_is_mxfp_supported = mxfp_supported()
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.topk import TopKOutput
+
+# Base quantization methods
+BASE_QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
+    "fp8": Fp8Config,
+    "mxfp8": Fp8Config,
+    "blockwise_int8": BlockInt8Config,
+    "modelopt": ModelOptFp8Config,  # Auto-detect, defaults to FP8
+    "modelopt_fp8": ModelOptFp8Config,
+    "modelopt_fp4": ModelOptFp4Config,
+    "w8a8_int8": W8A8Int8Config,
+    "w8a8_fp8": W8A8Fp8Config,
+    "awq": AWQConfig,
+    "awq_marlin": AWQMarlinConfig,
+    "bitsandbytes": BitsAndBytesConfig,
+    "gguf": GGUFConfig,
+    "gptq": GPTQConfig,
+    "gptq_marlin": GPTQMarlinConfig,
+    "moe_wna16": MoeWNA16Config,
+    "compressed-tensors": CompressedTensorsConfig,
+    "qoq": QoQConfig,
+    "w4afp8": W4AFp8Config,
+    "petit_nvfp4": PetitNvFp4Config,
+    "fbgemm_fp8": FBGEMMFp8Config,
+    "quark": QuarkConfig,
+    "auto-round": AutoRoundConfig,
+    "modelslim": ModelSlimConfig,
+    "quark_int4fp8_moe": QuarkInt4Fp8Config,
+}
+
+
+if is_cuda() or (_is_mxfp_supported and is_hip()):
+    BASE_QUANTIZATION_METHODS.update(
+        {
+            "mxfp4": Mxfp4Config,
+        }
+    )
+
+QUANTIZATION_METHODS = {**BASE_QUANTIZATION_METHODS}
+
+
+def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
+    if quantization not in QUANTIZATION_METHODS:
+        raise ValueError(
+            f"Invalid quantization method: {quantization}. "
+            f"Available methods: {list(QUANTIZATION_METHODS.keys())}"
+        )
+
+    return QUANTIZATION_METHODS[quantization]
+
+
+original_isinstance = builtins.isinstance
diff --git a/sglang/python/sglang/srt/layers/quantization/__pycache__/__init__.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8d02768bd3662c18ce4d6579f40dda03eaa34722
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/__pycache__/__init__.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/__pycache__/auto_round.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/__pycache__/auto_round.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..183ffeea0564d7955fc16eb768f358874bc07574
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/__pycache__/auto_round.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/__pycache__/awq.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/__pycache__/awq.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3c6f5452268e7fe1882636fa7803f28122dcaf6a
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/__pycache__/awq.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/__pycache__/base_config.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/__pycache__/base_config.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5354de9f6b4579c62c45b9d5ffde7a539807bbf2
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/__pycache__/base_config.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/__pycache__/base_scheme.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/__pycache__/base_scheme.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e53e6e0b2a2e3f8162bbff57e0c0d8556d1b6eab
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/__pycache__/base_scheme.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/__pycache__/bitsandbytes.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/__pycache__/bitsandbytes.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2097570d3cd290cabd5d2d328ca395a0df836c39
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/__pycache__/bitsandbytes.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/__pycache__/blockwise_int8.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/__pycache__/blockwise_int8.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dc2aa19de1faff2c6d45e53e4017b24bc92039a1
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/__pycache__/blockwise_int8.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/__pycache__/fp4_utils.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/__pycache__/fp4_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9f6b355ffd8cecdbd6b7284da8336d809ce07570
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/__pycache__/fp4_utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/__pycache__/fp8_kernel.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/__pycache__/fp8_kernel.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..188643a91dcbcb75ef0eaebe61019b0451b3c35f
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/__pycache__/fp8_kernel.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/__pycache__/fp8_utils.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/__pycache__/fp8_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b812c71e21160fe709525390956dc9a40628cdcd
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/__pycache__/fp8_utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/__pycache__/fpgemm_fp8.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/__pycache__/fpgemm_fp8.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..03ff448a9ebd33d0c5f326f5898f952812363487
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/__pycache__/fpgemm_fp8.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/__pycache__/gguf.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/__pycache__/gguf.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cded3d6e7be0b5117b5be267f7f17c88a417067b
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/__pycache__/gguf.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/__pycache__/gptq.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/__pycache__/gptq.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2139277818da4bb7169274f4d7d0c59e0fb7f967
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/__pycache__/gptq.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/__pycache__/int8_kernel.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/__pycache__/int8_kernel.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..85c4b8784048cea169228eb027ce4500af99c1cf
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/__pycache__/int8_kernel.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/__pycache__/int8_utils.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/__pycache__/int8_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..71e5a6a913054f72db80526799ce5b20b7b44028
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/__pycache__/int8_utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/__pycache__/kv_cache.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/__pycache__/kv_cache.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..94c472c2ffa87bb5ce1c693b54bccc87ffa8f9f1
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/__pycache__/kv_cache.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/__pycache__/marlin_utils.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/__pycache__/marlin_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4b6c35c9e1e0094d8334d6b27cc92e5f14b1f992
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/__pycache__/marlin_utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/__pycache__/marlin_utils_fp8.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/__pycache__/marlin_utils_fp8.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..17302cb22726bb98d8374deb5c560ffa8583ca8e
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/__pycache__/marlin_utils_fp8.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/__pycache__/modelopt_quant.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/__pycache__/modelopt_quant.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..72d353a1b928206c9bdb1072f8c7b98a77aa64ff
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/__pycache__/modelopt_quant.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/__pycache__/moe_wna16.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/__pycache__/moe_wna16.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..14b696c55a61409dea887a9b229565ad0138b06a
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/__pycache__/moe_wna16.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/__pycache__/mxfp4.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/__pycache__/mxfp4.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1b2c034a680423034c8100fc8870e684626f7bed
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/__pycache__/mxfp4.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/__pycache__/mxfp4_tensor.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/__pycache__/mxfp4_tensor.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7f66da7ac996b6f303561f0448d5cff5f41054b9
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/__pycache__/mxfp4_tensor.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/__pycache__/petit.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/__pycache__/petit.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3544da2f06c286168d013bb412c51e51a5919e98
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/__pycache__/petit.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/__pycache__/petit_utils.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/__pycache__/petit_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eca58ed8380af694339d52d8ebea62be8a293f4d
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/__pycache__/petit_utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/__pycache__/qoq.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/__pycache__/qoq.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..52f94a976482ea7652eab818d1216f9094b3f0c3
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/__pycache__/qoq.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/__pycache__/quark_int4fp8_moe.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/__pycache__/quark_int4fp8_moe.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a31be19f17fa47949c226e5bb99de1699c3c0636
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/__pycache__/quark_int4fp8_moe.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/__pycache__/unquant.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/__pycache__/unquant.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1162c9f98fea3f43c9f359b1851baed1931ce865
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/__pycache__/unquant.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/__pycache__/utils.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/__pycache__/utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..91583b8e7fd83464a37236098a9b2950e4ec1e08
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/__pycache__/utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/__pycache__/w4afp8.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/__pycache__/w4afp8.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ad3a3b7bc8c10c4fbfc4346a591acd0aa8382681
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/__pycache__/w4afp8.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/__pycache__/w8a8_fp8.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/__pycache__/w8a8_fp8.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b7ab08b4ac62c845d4993257a8f0121fecd3d862
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/__pycache__/w8a8_fp8.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/__pycache__/w8a8_int8.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/__pycache__/w8a8_int8.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..16f6bae10d6a7c59500f7db6e4529a060be8c1d0
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/__pycache__/w8a8_int8.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/auto_round.py b/sglang/python/sglang/srt/layers/quantization/auto_round.py
new file mode 100644
index 0000000000000000000000000000000000000000..74c4f0231ead27dbed4a5c3b94e8cad9a99571b0
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/auto_round.py
@@ -0,0 +1,396 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+import re
+from fractions import Fraction
+from typing import Any, Optional, Union
+
+import torch
+
+logger = logging.getLogger(__name__)
+
+from sglang.srt.layers.quantization.utils import get_scalar_types
+
+ScalarType, scalar_types = get_scalar_types()
+
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+
+
+class AutoRoundConfig(QuantizationConfig):
+    """Config class for AutoRound.
+    Reference: https://arxiv.org/pdf/2309.05516
+    """
+
+    SUPPORTED_BITS = {2, 3, 4, 8}
+    SUPPORTED_DTYPES = {"int"}
+    SUPPORTED_FORMATS = {"auto_round:auto_gptq", "auto_round:auto_awq"}
+    SUPPORTED_BACKENDS = {"auto", "gptq", "gptq:marlin", "awq", "awq:marlin", "marlin"}
+
+    def __init__(
+        self,
+        weight_bits: int,
+        group_size: int,
+        sym: bool = True,
+        packing_format: str = "auto_round:auto_gptq",
+        block_name_to_quantize: Optional[Union[str, list[str]]] = None,
+        extra_config: Optional[dict[str, Any]] = None,
+        data_type: str = "int",
+        backend: str = "auto",
+    ) -> None:
+        super().__init__()
+        if weight_bits not in self.SUPPORTED_BITS:
+            raise ValueError(
+                f"Unsupported weight_bits: {weight_bits}, "
+                f"currently only support  {self.SUPPORTED_BITS}"
+            )
+        if data_type not in self.SUPPORTED_DTYPES:
+            raise ValueError(
+                f"Unsupported data_type: {data_type},"
+                f" currently only support  {self.SUPPORTED_DTYPES}"
+            )
+        if packing_format not in self.SUPPORTED_FORMATS:
+            raise ValueError(
+                f"Unsupported packing_format: {packing_format}, "
+                f"currently only support  {self.SUPPORTED_FORMATS}"
+            )
+        if backend not in self.SUPPORTED_BACKENDS:
+            raise ValueError(
+                f"Unsupported backend: {backend},  "
+                f"currently only support  {self.SUPPORTED_BACKENDS}"
+            )
+
+        self.weight_bits = weight_bits
+        self.group_size = group_size
+        self.sym = sym
+        self.packing_format = packing_format
+        self.block_name_to_quantize = (
+            block_name_to_quantize.split(",")
+            if isinstance(block_name_to_quantize, str)
+            else block_name_to_quantize
+        )
+        self.extra_config = extra_config
+        self.data_type = data_type
+        self.backend = backend
+        self.pack_factor = Fraction(32, weight_bits)
+
+    def __repr__(self) -> str:
+        return (
+            f"AutoRoundConfig(weight_bits={self.weight_bits}, "
+            f"group_size={self.group_size}, sym={self.sym})"
+        )
+
+    @classmethod
+    def get_name(cls):
+        return "auto-round"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
+        return [torch.half, torch.bfloat16]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 60
+
+    @classmethod
+    def get_config_filenames(cls) -> list[str]:
+        return ["quantization_config.json"]
+
+    @classmethod
+    def from_config(cls, config: dict[str, Any]) -> "AutoRoundConfig":
+        return cls(
+            weight_bits=cls.get_from_keys(config, ["bits"]),
+            group_size=cls.get_from_keys(config, ["group_size"]),
+            sym=cls.get_from_keys(config, ["sym"]),
+            packing_format=cls.get_from_keys_or(
+                config,
+                ["packing_format"],
+                "auto_round:auto_gptq",
+            ),
+            block_name_to_quantize=cls.get_from_keys_or(
+                config, ["block_name_to_quantize", "to_quant_block_names"], None
+            ),
+            extra_config=cls.get_from_keys_or(config, ["extra_config"], None),
+            data_type=cls.get_from_keys_or(config, ["data_type"], "int"),
+            backend=cls.get_from_keys_or(
+                config, ["backend", "vllm_backend", "sglang_backend"], "auto"
+            ),
+        )
+
+    def get_scaled_act_names(self) -> list[str]:
+        """Returns the activation function names that should be post-scaled.
+
+        For now, this is only used by AWQ.
+        """
+        raise NotImplementedError
+
+    def get_layer_config(self, layer, layer_name: str):
+        from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+
+        def get_config(name: str, quantized: bool = True):
+            if not self.extra_config:
+                return (
+                    self.weight_bits if quantized else 16,
+                    self.group_size if quantized else -1,
+                    self.sym if quantized else True,
+                )
+
+            # Exact match first
+            if name in self.extra_config:
+                cfg = self.extra_config[name]
+                return (
+                    cfg.get("bits", self.weight_bits if quantized else 16),
+                    cfg.get("group_size", self.group_size if quantized else -1),
+                    cfg.get("sym", self.sym if quantized else True),
+                )
+
+            REGEX_SPECIAL_CHARS = set(r"*+?^$()[]{}|\\")
+            for pattern, cfg in self.extra_config.items():
+                if not isinstance(pattern, str) or not any(
+                    c in REGEX_SPECIAL_CHARS for c in pattern
+                ):
+                    continue
+
+                try:
+                    if re.fullmatch(pattern, name):
+                        return (
+                            cfg.get("bits", self.weight_bits if quantized else 16),
+                            cfg.get("group_size", self.group_size if quantized else -1),
+                            cfg.get("sym", self.sym if quantized else True),
+                        )
+                except re.error:
+                    # Invalid regex, ignore.
+                    continue
+
+            return (
+                self.weight_bits if quantized else 16,
+                self.group_size if quantized else -1,
+                self.sym if quantized else True,
+            )
+
+        # 1. Exact match from config
+        if self.extra_config and layer_name in self.extra_config:
+            return get_config(layer_name)
+
+        # 2. Determine whether layer should be quantized
+        quantized = not isinstance(layer, ParallelLMHead)
+        if self.block_name_to_quantize:
+            quantized = any(
+                layer_name.startswith(name) for name in self.block_name_to_quantize
+            )
+
+        # 3. Handle fused MoE
+        if self.extra_config and "fusedmoe" in layer.__class__.__name__.lower():
+            moe_configs = [
+                get_config(name, quantized)
+                for name in self.extra_config
+                if name.startswith(layer_name)
+            ]
+            if moe_configs:
+                if len(set(moe_configs)) == 1:
+                    return moe_configs[0]
+                raise ValueError(
+                    f"Fused MoE layer '{layer_name}' requires "
+                    f"consistent quant config for all sub-layers"
+                )
+
+        # 4. Handle fused QKV or other patterns
+        if self.extra_config:
+            for fusion_key, sub_keys in self.packed_modules_mapping.items():
+                if fusion_key in layer_name and layer_name.count(fusion_key) == 1:
+                    sub_names = [
+                        layer_name.replace(fusion_key, sub_key) for sub_key in sub_keys
+                    ]
+                    sub_configs = [get_config(name, quantized) for name in sub_names]
+                    if len(set(sub_configs)) == 1:
+                        return sub_configs[0]
+                    raise ValueError(
+                        f"Fused module '{layer_name}' requires "
+                        f"consistent quant config for {sub_names}"
+                    )
+
+        # 5. Fallback or try a regular expression match
+        return get_config(layer_name, quantized)
+
+    def check_quantized(self, weight_bits: int) -> bool:
+        return weight_bits < 16
+
+    def apply_awq_quant_layer(self, layer, prefix: str, backend: str = "auto"):
+        from sglang.srt.layers.linear import LinearBase
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+        from sglang.srt.layers.quantization.marlin_utils import (
+            check_marlin_supported,
+            check_moe_marlin_supports_layer,
+        )
+        from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
+        from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+
+        weight_bits, group_size, sym = self.get_layer_config(layer, prefix)
+        if not self.check_quantized(weight_bits):
+            if isinstance(layer, (LinearBase, ParallelLMHead)):
+                return UnquantizedLinearMethod()
+            else:
+                return None
+        logger.debug(
+            "[%s] Type: %s, Bits: %s, Group Size: %s, Sym: %s",
+            prefix,
+            layer.__class__.__name__,
+            weight_bits,
+            group_size,
+            sym,
+        )
+        if backend == "auto" or "marlin" in backend:
+            AWQ_TYPE_MAP = {
+                4: scalar_types.uint4,
+                8: scalar_types.uint8,
+            }
+            use_marlin = (weight_bits in AWQ_TYPE_MAP) and check_marlin_supported(
+                AWQ_TYPE_MAP[weight_bits], group_size, not sym
+            )
+            if isinstance(layer, FusedMoE):
+                use_marlin = use_marlin and check_moe_marlin_supports_layer(
+                    layer, group_size
+                )
+
+        else:
+            use_marlin = False
+        if use_marlin:
+            from sglang.srt.layers.quantization.awq import (
+                AWQMarlinConfig,
+                AWQMarlinLinearMethod,
+                AWQMoEMethod,
+            )
+
+            quant_args_marlin = AWQMarlinConfig(
+                weight_bits=weight_bits,
+                group_size=group_size,
+                zero_point=not sym,
+                lm_head_quantized=False,
+                full_config={},
+                modules_to_not_convert=[],
+            )
+        else:
+            from sglang.srt.layers.quantization.awq import AWQConfig, AWQLinearMethod
+
+            quant_args = AWQConfig(
+                weight_bits=weight_bits,
+                group_size=group_size,
+                zero_point=not sym,
+            )
+
+        if isinstance(layer, FusedMoE):
+            if use_marlin:
+                return AWQMoEMethod(quant_args_marlin)
+            from sglang.srt.layers.quantization.moe_wna16 import MoeWNA16Config
+
+            config = {
+                "quant_method": "awq",
+                "bits": weight_bits,
+                "group_size": group_size,
+                "zero_point": not sym,
+                "lm_head": False,
+            }
+            return MoeWNA16Config.from_config(config).get_quant_method(layer, prefix)
+
+        if isinstance(layer, (LinearBase, ParallelLMHead)):
+            if use_marlin:
+                return AWQMarlinLinearMethod(quant_args_marlin)
+            else:
+                return AWQLinearMethod(quant_args)
+        return None
+
+    def apply_gptq_quant_layer(self, layer, prefix: str, backend: str = "auto"):
+        from sglang.srt.layers.linear import LinearBase
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+        from sglang.srt.layers.quantization.marlin_utils import (
+            check_marlin_supported,
+            check_moe_marlin_supports_layer,
+        )
+        from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
+        from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+
+        weight_bits, group_size, sym = self.get_layer_config(layer, prefix)
+        if not self.check_quantized(weight_bits):
+            if isinstance(layer, (LinearBase, ParallelLMHead)):
+                return UnquantizedLinearMethod()
+            else:
+                return None
+
+        logger.debug(
+            "[%s] Type: %s, Bits: %s, Group Size: %s, Sym: %s",
+            prefix,
+            layer.__class__.__name__,
+            weight_bits,
+            group_size,
+            sym,
+        )
+        if backend == "auto" or "marlin" in backend:
+            GPTQ_TYPE_MAP = {
+                (4, True): scalar_types.uint4b8,
+                (8, True): scalar_types.uint8b128,
+            }
+            use_marlin = (weight_bits, sym) in GPTQ_TYPE_MAP and check_marlin_supported(
+                GPTQ_TYPE_MAP[(weight_bits, sym)], group_size, has_zp=not sym
+            )
+            if isinstance(layer, FusedMoE):
+                use_marlin = use_marlin and check_moe_marlin_supports_layer(
+                    layer, group_size
+                )
+        else:
+            use_marlin = False
+        if use_marlin:
+            from sglang.srt.layers.quantization.gptq import (
+                GPTQMarlinConfig,
+                GPTQMarlinLinearMethod,
+                GPTQMarlinMoEMethod,
+            )
+
+            quant_args_marlin = GPTQMarlinConfig(
+                weight_bits=weight_bits,
+                group_size=group_size,
+                is_sym=sym,
+                lm_head_quantized=False,
+                desc_act=False,
+                dynamic={},
+                full_config={},
+            )
+        else:
+            from sglang.srt.layers.quantization.gptq import GPTQConfig, GPTQLinearMethod
+
+            quant_args = GPTQConfig(
+                weight_bits=weight_bits,
+                group_size=group_size,
+                lm_head_quantized=False,
+                desc_act=False,
+                dynamic={},
+            )
+
+        if isinstance(layer, FusedMoE):
+            if use_marlin:
+                from sglang.srt.layers.quantization.moe_wna16 import MoeWNA16Config
+
+                config = {
+                    "quant_method": "gptq",
+                    "bits": weight_bits,
+                    "group_size": group_size,
+                    "sym": sym,
+                    "lm_head": False,
+                }
+                return MoeWNA16Config.from_config(config).get_quant_method(
+                    layer, prefix
+                )
+            return GPTQMarlinMoEMethod(quant_args_marlin)
+
+        if isinstance(layer, (LinearBase, ParallelLMHead)):
+            if use_marlin:
+                return GPTQMarlinLinearMethod(quant_args_marlin)
+            else:
+                return GPTQLinearMethod(quant_args)
+
+        return None
+
+    def get_quant_method(self, layer: torch.nn.Module, prefix: str):
+        # TODO enable CPU quant method later
+        if "gptq" in self.packing_format or "gptq" in self.backend:
+            return self.apply_gptq_quant_layer(layer, prefix)
+        if "awq" in self.packing_format or "awq" in self.backend:
+            return self.apply_awq_quant_layer(layer, prefix)
diff --git a/sglang/python/sglang/srt/layers/quantization/awq.py b/sglang/python/sglang/srt/layers/quantization/awq.py
new file mode 100644
index 0000000000000000000000000000000000000000..c05aa091ad647c17811ce52af8eb494f62a16a01
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/awq.py
@@ -0,0 +1,966 @@
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import logging
+import warnings
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
+
+import torch
+
+from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import (
+    npu_fused_experts,
+)
+from sglang.srt.layers.linear import LinearBase, set_weight_attrs
+from sglang.srt.layers.moe import (
+    MoeRunner,
+    MoeRunnerBackend,
+    MoeRunnerConfig,
+    get_moe_runner_backend,
+)
+from sglang.srt.layers.moe.moe_runner.marlin import MarlinMoeQuantInfo
+from sglang.srt.layers.parameter import GroupQuantScaleParameter, PackedvLLMParameter
+from sglang.srt.layers.quantization.base_config import (
+    FusedMoEMethodBase,
+    LinearMethodBase,
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.marlin_utils import (
+    apply_awq_marlin_linear,
+    awq_to_marlin_zero_points,
+    check_marlin_supported,
+    check_marlin_supports_layer,
+    check_moe_marlin_supports_layer,
+    marlin_make_empty_g_idx,
+    marlin_make_workspace,
+    marlin_moe_permute_scales,
+    marlin_permute_scales,
+    moe_awq_to_marlin_zero_points,
+    verify_marlin_supported,
+    verify_marlin_supports_shape,
+)
+from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
+from sglang.srt.layers.quantization.utils import get_scalar_types, replace_parameter
+from sglang.srt.utils.patch_torch import register_fake_if_exists
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.token_dispatcher import (
+        CombineInput,
+        StandardDispatchOutput,
+    )
+
+from sglang.srt.utils import is_cuda, is_hip, is_npu, is_xpu
+
+_is_cuda = is_cuda()
+_is_hip = is_hip()
+_is_xpu = is_xpu()
+_is_npu = is_npu()
+
+if _is_npu:
+    import torch_npu
+
+if _is_cuda:
+    from sglang.jit_kernel.awq_dequantize import awq_dequantize
+    from sglang.jit_kernel.awq_marlin_repack import (
+        awq_marlin_moe_repack,
+        awq_marlin_repack,
+    )
+    from sglang.srt.utils.custom_op import register_custom_op_from_extern
+
+    awq_dequantize = register_custom_op_from_extern(
+        awq_dequantize,
+        fake_impl=lambda qweight, scales, qzeros: qweight.new_empty(
+            qweight.shape[:-1] + (qweight.shape[-1] * 8,), dtype=scales.dtype
+        ),
+    )
+
+elif _is_hip:
+    from sglang.srt.layers.quantization.awq_triton import (
+        awq_dequantize_triton as awq_dequantize,
+    )
+
+elif _is_xpu:
+    from sgl_kernel import awq_dequantize
+
+    warnings.warn(f"XPU does not support fused_marlin_moe currently.")
+else:
+    warnings.warn(f"Only CUDA, HIP and XPU support AWQ currently.")
+
+logger = logging.getLogger(__name__)
+
+
+ScalarType, scalar_types = get_scalar_types()
+
+
+def is_layer_skipped_awq(prefix: str, modules_to_not_convert: List[str]):
+    return any(module_name in prefix for module_name in modules_to_not_convert)
+
+
+class AWQConfig(QuantizationConfig):
+    """Config class for AWQ.
+
+    Reference: https://arxiv.org/abs/2306.00978
+    """
+
+    def __init__(
+        self,
+        weight_bits: int,
+        group_size: int,
+        zero_point: bool,
+        modules_to_not_convert: Optional[List[str]] = None,
+    ) -> None:
+        super().__init__()
+        self.weight_bits = weight_bits
+        self.group_size = group_size
+        self.zero_point = zero_point
+        self.modules_to_not_convert = modules_to_not_convert or []
+
+        if self.weight_bits != 4:
+            raise ValueError(
+                "Currently, only 4-bit weight quantization is supported for "
+                f"AWQ, but got {self.weight_bits} bits."
+            )
+        self.pack_factor = 32 // self.weight_bits
+
+    def __repr__(self) -> str:
+        return (
+            f"AWQConfig(weight_bits={self.weight_bits}, "
+            f"group_size={self.group_size}, "
+            f"zero_point={self.zero_point}, "
+            f"modules_to_not_convert={self.modules_to_not_convert})"
+        )
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+    def get_name(self) -> str:
+        return "awq"
+
+    def get_supported_act_dtypes(self) -> List[torch.dtype]:
+        return [torch.float16] if not _is_npu else [torch.float16, torch.bfloat16]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        # The AWQ kernel only supports Turing or newer GPUs.
+        if _is_npu:
+            raise NotImplementedError(
+                'NPU hardware does not support "get_min_capability" feature.'
+            )
+        else:
+            return 75
+
+    @staticmethod
+    def get_config_filenames() -> List[str]:
+        return [
+            "quant_config.json",  # E.g., casperhansen/vicuna-7b-v1.5-awq
+            # E.g., abhinavkulkarni/mosaicml-mpt-7b-instruct-w4-g128-awq
+            "quantize_config.json",
+        ]
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> AWQConfig:
+        weight_bits = cls.get_from_keys(config, ["w_bit", "bits"])
+        group_size = cls.get_from_keys(config, ["q_group_size", "group_size"])
+        zero_point = cls.get_from_keys(config, ["zero_point"])
+        modules_to_not_convert = cls.get_from_keys_or(
+            config, ["modules_to_not_convert"], None
+        )
+        return cls(weight_bits, group_size, zero_point, modules_to_not_convert)
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional[LinearMethodBase]:
+        from sglang.srt.layers.linear import LinearBase
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+
+        if _is_npu:
+            if isinstance(layer, LinearBase):
+                if is_layer_skipped_awq(prefix, self.modules_to_not_convert):
+                    return UnquantizedLinearMethod()
+                return AWQLinearAscendMethod(self)
+            elif isinstance(layer, FusedMoE):
+                return AWQMoEAscendMethod(self)
+            return None
+
+        if isinstance(layer, LinearBase):
+            if is_layer_skipped_awq(prefix, self.modules_to_not_convert):
+                return UnquantizedLinearMethod()
+            return AWQLinearMethod(self)
+        return None
+
+
+class AWQMarlinConfig(QuantizationConfig):
+    """Config class for AWQ Marlin"""
+
+    # num_bits -> type
+    TYPE_MAP = {
+        4: scalar_types.uint4,
+        8: scalar_types.uint8,
+    }
+
+    def __init__(
+        self,
+        weight_bits: int,
+        group_size: int,
+        zero_point: bool,
+        lm_head_quantized: bool,
+        modules_to_not_convert: Optional[list[str]],
+        full_config: dict[str, Any],
+    ) -> None:
+        super().__init__()
+        if _is_hip:
+            warnings.warn(f"HIP does not support fused_marlin_moe currently.")
+        self.pack_factor = 32 // weight_bits  # packed into int32
+        self.group_size = group_size
+        self.zero_point = zero_point
+        self.lm_head_quantized = lm_head_quantized
+        self.weight_bits = weight_bits
+        self.modules_to_not_convert = modules_to_not_convert or []
+        self.full_config = full_config
+
+        if self.weight_bits not in self.TYPE_MAP:
+            raise ValueError(
+                f"Unsupported num_bits = {self.weight_bits}. "
+                f"Supported num_bits = {self.TYPE_MAP.keys()}"
+            )
+
+        self.quant_type = self.TYPE_MAP[self.weight_bits]
+
+        verify_marlin_supported(
+            self.quant_type, group_size=self.group_size, has_zp=self.zero_point
+        )
+
+    def __repr__(self) -> str:
+        return (
+            f"AWQMarlinConfig(quant_type={self.quant_type}, "
+            f"group_size={self.group_size}, "
+            f"zero_point={self.zero_point}, "
+            f"lm_head_quantized={self.lm_head_quantized}, "
+            f"modules_to_not_convert={self.modules_to_not_convert})"
+        )
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "awq_marlin"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
+        return [torch.half, torch.bfloat16]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 80
+
+    @classmethod
+    def get_config_filenames(cls) -> list[str]:
+        return ["quantize_config.json"]
+
+    @classmethod
+    def from_config(cls, config: dict[str, Any]) -> AWQMarlinConfig:
+        weight_bits = cls.get_from_keys(config, ["bits"])
+        group_size = cls.get_from_keys(config, ["group_size"])
+        zero_point = cls.get_from_keys(config, ["zero_point"])
+        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False)
+        modules_to_not_convert = cls.get_from_keys_or(
+            config, ["modules_to_not_convert"], None
+        )
+        return cls(
+            weight_bits,
+            group_size,
+            zero_point,
+            lm_head_quantized,
+            modules_to_not_convert,
+            config,
+        )
+
+    @classmethod
+    def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]:
+        can_convert = cls.is_awq_marlin_compatible(hf_quant_cfg)
+        is_valid_user_quant = (
+            user_quant is None or user_quant == "marlin" or user_quant == "awq_marlin"
+        )
+
+        if can_convert and is_valid_user_quant:
+            msg = (
+                "The model is convertible to {} during runtime."
+                " Using {} kernel.".format(cls.get_name(), cls.get_name())
+            )
+            logger.info(msg)
+            return cls.get_name()
+
+        if can_convert and user_quant == "awq":
+            logger.info(
+                "Detected that the model can run with awq_marlin"
+                ", however you specified quantization=awq explicitly,"
+                " so forcing awq. Use quantization=awq_marlin for"
+                " faster inference"
+            )
+        return None
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional[QuantizeMethodBase]:
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+        from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+
+        if isinstance(layer, LinearBase) or (
+            isinstance(layer, ParallelLMHead) and self.lm_head_quantized
+        ):
+            if is_layer_skipped_awq(prefix, self.modules_to_not_convert):
+                return UnquantizedLinearMethod()
+            # Check if the layer is supported by AWQMarlin.
+            if not check_marlin_supports_layer(layer, self.group_size):
+                logger.warning_once(
+                    "Layer '%s' is not supported by AWQMarlin. Falling back to unoptimized AWQ kernels.",  # noqa: E501
+                    prefix,
+                )
+                return AWQConfig.from_config(self.full_config).get_quant_method(
+                    layer, prefix
+                )
+            return AWQMarlinLinearMethod(self)
+        elif isinstance(layer, FusedMoE):
+            from sglang.srt.layers.quantization.moe_wna16 import MoeWNA16Config
+
+            if not check_moe_marlin_supports_layer(layer, self.group_size):
+                logger.warning_once(
+                    f"Layer '{prefix}' is not supported by AWQMoeMarlin. "
+                    "Falling back to Moe WNA16 kernels."
+                )
+                return MoeWNA16Config.from_config(self.full_config).get_quant_method(
+                    layer, prefix
+                )
+            return AWQMoEMethod(self)
+        return None
+
+    @classmethod
+    def is_awq_marlin_compatible(cls, quant_config: dict[str, Any]):
+        # Extract data from quant config.
+        quant_method = quant_config.get("quant_method", "").lower()
+        num_bits = quant_config.get("bits")
+        group_size = quant_config.get("group_size")
+        zero_point = quant_config.get("zero_point")
+
+        if not _is_cuda:
+            return False
+
+        if quant_method != "awq":
+            return False
+
+        # If we cannot find the info needed in the config, cannot convert.
+        if num_bits is None or group_size is None or zero_point is None:
+            return False
+
+        if num_bits not in cls.TYPE_MAP:
+            return False
+
+        return check_marlin_supported(
+            quant_type=cls.TYPE_MAP[num_bits], group_size=group_size, has_zp=zero_point
+        )
+
+
+class AWQLinearMethod(LinearMethodBase):
+    """Linear method for AWQ.
+
+    Args:
+        quant_config: The AWQ quantization config.
+    """
+
+    def __init__(self, quant_config: AWQConfig):
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        if input_size_per_partition % self.quant_config.group_size != 0:
+            raise ValueError(
+                "The input size is not aligned with the quantized "
+                "weight shape. This can be caused by too large "
+                "tensor parallel size."
+            )
+
+        output_size_per_partition = sum(output_partition_sizes)
+        if output_size_per_partition % self.quant_config.pack_factor != 0:
+            raise ValueError(
+                "The output size is not aligned with the quantized "
+                "weight shape. This can be caused by too large "
+                "tensor parallel size."
+            )
+
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        qweight = PackedvLLMParameter(
+            data=torch.empty(
+                input_size_per_partition,
+                output_size_per_partition // self.quant_config.pack_factor,
+                dtype=torch.int32,
+            ),
+            input_dim=0,
+            output_dim=1,
+            packed_dim=1,
+            packed_factor=self.quant_config.pack_factor,
+            weight_loader=weight_loader,
+        )
+
+        qzeros = PackedvLLMParameter(
+            data=torch.empty(
+                input_size_per_partition // self.quant_config.group_size,
+                output_size_per_partition // self.quant_config.pack_factor,
+                dtype=torch.int32,
+            ),
+            input_dim=0,
+            output_dim=1,
+            packed_dim=1,
+            packed_factor=self.quant_config.pack_factor,
+            weight_loader=weight_loader,
+        )
+
+        scales = GroupQuantScaleParameter(
+            data=torch.empty(
+                input_size_per_partition // self.quant_config.group_size,
+                output_size_per_partition,
+                dtype=params_dtype,
+            ),
+            input_dim=0,
+            output_dim=1,
+            weight_loader=weight_loader,
+        )
+
+        layer.register_parameter("qweight", qweight)
+        layer.register_parameter("qzeros", qzeros)
+        layer.register_parameter("scales", scales)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        layer.qweight = torch.nn.Parameter(layer.qweight.data, requires_grad=False)
+        layer.qzeros = torch.nn.Parameter(layer.qzeros.data, requires_grad=False)
+        layer.scales = torch.nn.Parameter(layer.scales.data, requires_grad=False)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        qweight = layer.qweight
+        scales = layer.scales
+        qzeros = layer.qzeros
+        pack_factor = self.quant_config.pack_factor
+        out_shape = x.shape[:-1] + (qweight.shape[-1] * pack_factor,)
+        reshaped_x = x.reshape(-1, x.shape[-1])
+        out = awq_dequantize(qweight, scales, qzeros)
+        out = torch.matmul(reshaped_x, out)
+
+        if bias is not None:
+            out.add_(bias)
+        return out.reshape(out_shape)
+
+
+class AWQMarlinLinearMethod(LinearMethodBase):
+    """Linear method for AWQ Marlin.
+
+    Args:
+        quant_config: The AWQ Marlin quantization config.
+    """
+
+    def __init__(self, quant_config: AWQMarlinConfig) -> None:
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ) -> None:
+        del output_size
+        output_size_per_partition = sum(output_partition_sizes)
+        weight_loader = extra_weight_attrs.get("weight_loader")
+
+        # Normalize group_size
+        if self.quant_config.group_size != -1:
+            group_size = self.quant_config.group_size
+        else:
+            group_size = input_size
+
+        verify_marlin_supports_shape(
+            output_size_per_partition=output_size_per_partition,
+            input_size_per_partition=input_size_per_partition,
+            input_size=input_size,
+            group_size=group_size,
+        )
+
+        qweight = PackedvLLMParameter(
+            data=torch.empty(
+                input_size_per_partition,
+                output_size_per_partition // self.quant_config.pack_factor,
+                dtype=torch.int32,
+            ),
+            input_dim=0,
+            output_dim=1,
+            packed_dim=1,
+            packed_factor=self.quant_config.pack_factor,
+            weight_loader=weight_loader,
+        )
+
+        num_groups = input_size_per_partition // group_size
+
+        qzeros = PackedvLLMParameter(
+            data=torch.empty(
+                num_groups,
+                output_size_per_partition // self.quant_config.pack_factor,
+                dtype=torch.int32,
+            ),
+            input_dim=0,
+            output_dim=1,
+            packed_dim=1,
+            packed_factor=self.quant_config.pack_factor,
+            weight_loader=weight_loader,
+        )
+
+        scales = GroupQuantScaleParameter(
+            data=torch.empty(
+                num_groups,
+                output_size_per_partition,
+                dtype=params_dtype,
+            ),
+            input_dim=0,
+            output_dim=1,
+            weight_loader=weight_loader,
+        )
+
+        layer.register_parameter("qweight", qweight)
+        layer.register_parameter("qzeros", qzeros)
+        layer.register_parameter("scales", scales)
+
+        layer.input_size_per_partition = input_size_per_partition
+        layer.output_size_per_partition = output_size_per_partition
+        layer.num_groups = num_groups
+
+    # TODO: Update this docs
+    # Checkpoints are serialized in AutoAWQ format, which is different from the
+    # marlin format. This function is called after the weights are loaded.
+    # Here, we handle the repacking
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        device = layer.qweight.device
+        layer.qweight = torch.nn.Parameter(layer.qweight.data, requires_grad=False)
+        layer.qzeros = torch.nn.Parameter(layer.qzeros.data, requires_grad=False)
+        layer.scales = torch.nn.Parameter(layer.scales.data, requires_grad=False)
+
+        # Allocate marlin workspace
+        layer.workspace = marlin_make_workspace(device)
+
+        # Repack weights from AWQ format to marlin format.
+        marlin_qweight = awq_marlin_repack(
+            layer.qweight,
+            size_k=layer.input_size_per_partition,
+            size_n=layer.output_size_per_partition,
+            num_bits=self.quant_config.quant_type.size_bits,
+        )
+        replace_parameter(layer, "qweight", marlin_qweight)
+
+        # Permute scales from AWQ format to marlin format.
+        marlin_scales = marlin_permute_scales(
+            layer.scales,
+            size_k=layer.input_size_per_partition,
+            size_n=layer.output_size_per_partition,
+            group_size=self.quant_config.group_size,
+        )
+        replace_parameter(layer, "scales", marlin_scales)
+
+        # Permute zero-points from AWQ format to marlin format.
+        marlin_zp = awq_to_marlin_zero_points(
+            layer.qzeros,
+            size_k=layer.num_groups,
+            size_n=layer.output_size_per_partition,
+            num_bits=self.quant_config.quant_type.size_bits,
+        )
+        replace_parameter(layer, "qzeros", marlin_zp)
+
+        # Not-used
+        layer.g_idx = marlin_make_empty_g_idx(device)
+        layer.g_idx_sort_indices = marlin_make_empty_g_idx(device)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        return apply_awq_marlin_linear(
+            input=x,
+            weight=layer.qweight,
+            weight_scale=layer.scales,
+            weight_zp=layer.qzeros,
+            g_idx=layer.g_idx,
+            g_idx_sort_indices=layer.g_idx_sort_indices,
+            workspace=layer.workspace,
+            quant_type=self.quant_config.quant_type,
+            output_size_per_partition=layer.output_size_per_partition,
+            input_size_per_partition=layer.input_size_per_partition,
+            bias=bias,
+        )
+
+
+class AWQLinearAscendMethod(AWQLinearMethod):
+    """Linear method for AWQ on Ascend.
+
+    Args:
+        quant_config: The AWQ quantization config.
+    """
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        layer.scales = torch.nn.Parameter(layer.scales.data, requires_grad=False)
+        qweight_tmp = torch.zeros_like(layer.qweight.data)
+        qzeros_tmp = layer.qzeros.data
+        qzeros_list = []
+        shifts = [0, 4, 1, 5, 2, 6, 3, 7]
+
+        for i in range(0, self.quant_config.pack_factor):
+            shift_num = shifts[i] * 4
+            qzeros_list.append((qzeros_tmp.reshape(-1, 1) >> shift_num) & 0xF)
+            qweight_tmp.bitwise_or_(
+                ((layer.qweight.data >> shift_num) * (2 ** (4 * i))) & (0xF << (4 * i))
+            )
+
+        qweight_tmp.bitwise_xor_(0x88888888)
+
+        qzeros_tmp = torch.cat(qzeros_list, dim=-1).reshape(qzeros_tmp.shape[0], -1)
+        qzeros_tmp = -(qzeros_tmp - 8)
+        qzeros_tmp = qzeros_tmp.to(layer.scales.data.dtype)
+
+        layer.zeros = torch.nn.Parameter(qzeros_tmp, requires_grad=False)
+        layer.weight = torch.nn.Parameter(qweight_tmp, requires_grad=False)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        qweight = layer.weight
+        scales = layer.scales
+        qzeros = layer.zeros
+        pack_factor = self.quant_config.pack_factor
+        out_shape = x.shape[:-1] + (qweight.shape[-1] * pack_factor,)
+        reshaped_x = x.reshape(-1, x.shape[-1])
+
+        if bias is not None and bias.dtype == torch.bfloat16:
+            bias = bias.float()
+
+        out = torch_npu.npu_weight_quant_batchmatmul(
+            reshaped_x,
+            qweight,
+            antiquant_scale=scales,
+            antiquant_offset=qzeros,
+            antiquant_group_size=self.quant_config.group_size,
+            bias=bias,
+        )
+
+        return out.reshape(out_shape)
+
+
+class AWQMoEMethod(FusedMoEMethodBase):
+
+    def __init__(self, quant_config: AWQMarlinConfig):
+        self.quant_config = quant_config
+        if self.quant_config.weight_bits != 4:
+            raise ValueError("AWQMoEMethod only supports 4bit now.")
+        self.quant_type = scalar_types.uint4
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        # Delay the import to avoid circular dependency
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        extra_weight_attrs.update(
+            {
+                "is_transposed": True,
+                "quant_method": FusedMoeWeightScaleSupported.GROUP.value,
+            }
+        )
+
+        w13_qweight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                2 * intermediate_size_per_partition // self.quant_config.pack_factor,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_qweight", w13_qweight)
+        set_weight_attrs(w13_qweight, extra_weight_attrs)
+
+        w2_qweight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                intermediate_size_per_partition,
+                hidden_size // self.quant_config.pack_factor,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_qweight", w2_qweight)
+        set_weight_attrs(w2_qweight, extra_weight_attrs)
+
+        num_groups_w13 = hidden_size // self.quant_config.group_size
+        num_groups_w2 = intermediate_size_per_partition // self.quant_config.group_size
+
+        # WEIGHT_SCALES
+        # Allocate 2 scales for w1 and w3 respectively.
+        w13_scales = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                num_groups_w13,
+                intermediate_size_per_partition * 2,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_scales", w13_scales)
+        set_weight_attrs(w13_scales, extra_weight_attrs)
+
+        w2_scales = torch.nn.Parameter(
+            torch.empty(num_experts, num_groups_w2, hidden_size, dtype=params_dtype),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_scales", w2_scales)
+        set_weight_attrs(w2_scales, extra_weight_attrs)
+
+        # WEIGHT_ZERO_POINT
+        # Allocate 2 zero points for w1 and w3 respectively.
+        w13_qzeros = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                num_groups_w13,
+                2 * intermediate_size_per_partition // self.quant_config.pack_factor,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_qzeros", w13_qzeros)
+        set_weight_attrs(w13_qzeros, extra_weight_attrs)
+
+        w2_qzeros = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                num_groups_w2,
+                hidden_size // self.quant_config.pack_factor,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_qzeros", w2_qzeros)
+        set_weight_attrs(w2_qzeros, extra_weight_attrs)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        num_experts = layer.w13_qweight.shape[0]
+        device = layer.w13_qweight.device
+
+        layer.w13_g_idx_sort_indices = torch.nn.Parameter(
+            torch.empty((num_experts, 0), dtype=torch.int32, device=device),
+            requires_grad=False,
+        )
+        layer.w2_g_idx_sort_indices = torch.nn.Parameter(
+            torch.empty((num_experts, 0), dtype=torch.int32, device=device),
+            requires_grad=False,
+        )
+
+        marlin_w13_qweight = awq_marlin_moe_repack(
+            layer.w13_qweight,
+            layer.w13_g_idx_sort_indices,
+            size_k=layer.w13_qweight.shape[1],
+            size_n=layer.w13_qweight.shape[2] * self.quant_config.pack_factor,
+            num_bits=self.quant_config.weight_bits,
+        )
+        replace_parameter(layer, "w13_qweight", marlin_w13_qweight)
+
+        marlin_w2_qweight = awq_marlin_moe_repack(
+            layer.w2_qweight,
+            layer.w2_g_idx_sort_indices,
+            size_k=layer.w2_qweight.shape[1],
+            size_n=layer.w2_qweight.shape[2] * self.quant_config.pack_factor,
+            num_bits=self.quant_config.weight_bits,
+        )
+        replace_parameter(layer, "w2_qweight", marlin_w2_qweight)
+
+        # hidden_size->intermediate_size
+        marlin_w13_scales = marlin_moe_permute_scales(
+            s=layer.w13_scales,
+            size_k=layer.intermediate_size_per_partition,
+            size_n=layer.w13_scales.shape[2],
+            group_size=self.quant_config.group_size,
+        )
+
+        replace_parameter(layer, "w13_scales", marlin_w13_scales)
+
+        marlin_w2_scales = marlin_moe_permute_scales(
+            s=layer.w2_scales,
+            size_k=layer.intermediate_size_per_partition,
+            size_n=layer.w2_scales.shape[2],
+            group_size=self.quant_config.group_size,
+        )
+        replace_parameter(layer, "w2_scales", marlin_w2_scales)
+
+        marlin_w13_zp = moe_awq_to_marlin_zero_points(
+            layer.w13_qzeros,
+            size_k=layer.w13_qzeros.shape[1],
+            size_n=layer.w13_qzeros.shape[2] * self.quant_config.pack_factor,
+            num_bits=self.quant_config.weight_bits,
+        )
+        replace_parameter(layer, "w13_qzeros", marlin_w13_zp)
+
+        marlin_w2_zp = moe_awq_to_marlin_zero_points(
+            layer.w2_qzeros,
+            size_k=layer.w2_qzeros.shape[1],
+            size_n=layer.w2_qzeros.shape[2] * self.quant_config.pack_factor,
+            num_bits=self.quant_config.weight_bits,
+        )
+        replace_parameter(layer, "w2_qzeros", marlin_w2_zp)
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        assert get_moe_runner_backend().is_auto()
+        self.moe_runner_config = moe_runner_config
+        self.runner = MoeRunner(MoeRunnerBackend.MARLIN, moe_runner_config)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+
+        quant_info = MarlinMoeQuantInfo(
+            w13_qweight=layer.w13_qweight,
+            w2_qweight=layer.w2_qweight,
+            w13_scales=layer.w13_scales,
+            w2_scales=layer.w2_scales,
+            w13_g_idx_sort_indices=layer.w13_g_idx_sort_indices,
+            w2_g_idx_sort_indices=layer.w2_g_idx_sort_indices,
+            w13_qzeros=layer.w13_qzeros,
+            w2_qzeros=layer.w2_qzeros,
+            weight_bits=self.quant_config.weight_bits,
+        )
+
+        return self.runner.run(dispatch_output, quant_info)
+
+
+class AWQMoEAscendMethod(AWQMoEMethod):
+    def __init__(self, quant_config: AWQConfig):
+        self.quant_config = quant_config
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        w13_qweight_tmp = torch.zeros_like(layer.w13_qweight.data)
+        w2_qweight_tmp = torch.zeros_like(layer.w2_qweight.data)
+        w13_qzeros_list = []
+        w2_qzeros_list = []
+        shifts = [0, 4, 1, 5, 2, 6, 3, 7]
+        for i in range(0, self.quant_config.pack_factor):
+            shift_num = shifts[i] * 4
+            w13_qzeros_list.append(
+                (layer.w13_qzeros.data.reshape(-1, 1) >> shift_num) & 0xF
+            )
+            w2_qzeros_list.append(
+                (layer.w2_qzeros.data.reshape(-1, 1) >> shift_num) & 0xF
+            )
+            w13_qweight_tmp.bitwise_or_(
+                ((layer.w13_qweight.data >> shift_num) * (2 ** (4 * i)))
+                & (0xF << (4 * i))
+            )
+            w2_qweight_tmp.bitwise_or_(
+                ((layer.w2_qweight.data >> shift_num) * (2 ** (4 * i)))
+                & (0xF << (4 * i))
+            )
+
+        w13_qweight_tmp.bitwise_xor_(0x88888888)
+        w2_qweight_tmp.bitwise_xor_(0x88888888)
+
+        w13_qzeros_tmp = torch.cat(w13_qzeros_list, dim=-1).reshape(
+            layer.w13_qzeros.shape[0], layer.w13_qzeros.shape[1], -1
+        )
+        w13_qzeros_tmp = -(w13_qzeros_tmp - 8)
+        w13_qzeros_tmp = w13_qzeros_tmp.to(layer.w13_scales.data.dtype)
+        w2_qzeros_tmp = torch.cat(w2_qzeros_list, dim=-1).reshape(
+            layer.w2_qzeros.shape[0], layer.w2_qzeros.shape[1], -1
+        )
+        w2_qzeros_tmp = -(w2_qzeros_tmp - 8)
+        w2_qzeros_tmp = w2_qzeros_tmp.to(layer.w2_scales.data.dtype)
+
+        layer.register_parameter(
+            "w13_qzeros", torch.nn.Parameter(w13_qzeros_tmp, requires_grad=False)
+        )
+        layer.register_parameter(
+            "w13_qweight", torch.nn.Parameter(w13_qweight_tmp, requires_grad=False)
+        )
+        layer.register_parameter(
+            "w2_qzeros", torch.nn.Parameter(w2_qzeros_tmp, requires_grad=False)
+        )
+        layer.register_parameter(
+            "w2_qweight", torch.nn.Parameter(w2_qweight_tmp, requires_grad=False)
+        )
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> torch.Tensor:
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        assert (
+            self.moe_runner_config.activation == "silu"
+        ), "Only SiLU activation is supported."
+
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+
+        topk_weights, topk_ids, _ = topk_output
+        topk_ids = topk_ids.to(torch.int32)
+        topk_weights = topk_weights.to(x.dtype)
+        output = npu_fused_experts(
+            hidden_states=x,
+            w13=layer.w13_qweight,
+            w13_scale=layer.w13_scales,
+            w13_offset=layer.w13_qzeros,
+            w2=layer.w2_qweight,
+            w2_scale=layer.w2_scales,
+            w2_offset=layer.w2_qzeros,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            top_k=topk_ids.shape[1],
+            use_wna16=True,
+        )
+        return StandardCombineInput(hidden_states=output)
+
+
+# Register fake implementations for torch.compile support
+if _is_cuda:
+
+    @register_fake_if_exists("sgl_kernel::awq_marlin_repack")
+    def _(b_q_weight, size_k, size_n, num_bits):
+        return b_q_weight.new_empty(
+            (size_k // 16, size_n * (num_bits // 2)), dtype=b_q_weight.dtype
+        )
diff --git a/sglang/python/sglang/srt/layers/quantization/awq_triton.py b/sglang/python/sglang/srt/layers/quantization/awq_triton.py
new file mode 100644
index 0000000000000000000000000000000000000000..b83dd79fbcc34b9ceb63218a1302819aae6bcc23
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/awq_triton.py
@@ -0,0 +1,368 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/awq_triton.py
+
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import torch
+import triton
+import triton.language as tl
+
+AWQ_TRITON_SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128]
+
+
+@triton.jit
+def awq_dequantize_kernel(
+    qweight_ptr,  # quantized matrix
+    scales_ptr,  # scales, per group
+    zeros_ptr,  # zeros, per group
+    group_size,  # Should always be one of the supported group sizes
+    result_ptr,  # Output matrix
+    num_cols,  # input num cols in qweight
+    num_rows,  # input num rows in qweight
+    BLOCK_SIZE_X: tl.constexpr,
+    BLOCK_SIZE_Y: tl.constexpr,
+):
+    # Setup the pids.
+    pid_x = tl.program_id(axis=0)
+    pid_y = tl.program_id(axis=1)
+
+    # Compute offsets and masks for qweight_ptr.
+    offsets_y = pid_y * BLOCK_SIZE_Y + tl.arange(0, BLOCK_SIZE_Y)
+    offsets_x = pid_x * BLOCK_SIZE_X + tl.arange(0, BLOCK_SIZE_X)
+    offsets = num_cols * offsets_y[:, None] + offsets_x[None, :]
+
+    masks_y = offsets_y < num_rows
+    masks_x = offsets_x < num_cols
+
+    masks = masks_y[:, None] & masks_x[None, :]
+
+    # Compute offsets and masks for result output ptr.
+    result_offsets_y = pid_y * BLOCK_SIZE_Y + tl.arange(0, BLOCK_SIZE_Y)
+    result_offsets_x = pid_x * BLOCK_SIZE_X * 8 + tl.arange(0, BLOCK_SIZE_X * 8)
+    result_offsets = (
+        8 * num_cols * result_offsets_y[:, None] + result_offsets_x[None, :]
+    )
+
+    result_masks_y = result_offsets_y < num_rows
+    result_masks_x = result_offsets_x < num_cols * 8
+    result_masks = result_masks_y[:, None] & result_masks_x[None, :]
+
+    # Load the weights.
+    iweights = tl.load(qweight_ptr + offsets, masks, 0.0)
+    iweights = tl.interleave(iweights, iweights)
+    iweights = tl.interleave(iweights, iweights)
+    iweights = tl.interleave(iweights, iweights)
+
+    # Create reverse AWQ order as tensor: [0, 4, 1, 5, 2, 6, 3, 7]
+    # that will map given indices to the correct order.
+    reverse_awq_order_tensor = (
+        (tl.arange(0, 2) * 4)[None, :] + tl.arange(0, 4)[:, None]
+    ).reshape(8)
+
+    # Use this to compute a set of shifts that can be used to unpack and
+    # reorder the values in iweights and zeros.
+    shifts = reverse_awq_order_tensor * 4
+    shifts = tl.broadcast_to(shifts[None, :], (BLOCK_SIZE_Y * BLOCK_SIZE_X, 8))
+    shifts = tl.reshape(shifts, (BLOCK_SIZE_Y, BLOCK_SIZE_X * 8))
+
+    # Unpack and reorder: shift out the correct 4-bit value and mask.
+    iweights = (iweights >> shifts) & 0xF
+
+    # Compute zero offsets and masks.
+    zero_offsets_y = pid_y * BLOCK_SIZE_Y // group_size + tl.arange(0, 1)
+    zero_offsets_x = pid_x * BLOCK_SIZE_X + tl.arange(0, BLOCK_SIZE_X)
+    zero_offsets = num_cols * zero_offsets_y[:, None] + zero_offsets_x[None, :]
+
+    zero_masks_y = zero_offsets_y < num_rows // group_size
+    zero_masks_x = zero_offsets_x < num_cols
+    zero_masks = zero_masks_y[:, None] & zero_masks_x[None, :]
+
+    # Load the zeros.
+    zeros = tl.load(zeros_ptr + zero_offsets, zero_masks, 0.0)
+    zeros = tl.interleave(zeros, zeros)
+    zeros = tl.interleave(zeros, zeros)
+    zeros = tl.interleave(zeros, zeros)
+    zeros = tl.broadcast_to(zeros, (BLOCK_SIZE_Y, BLOCK_SIZE_X * 8))
+
+    # Unpack and reorder: shift out the correct 4-bit value and mask.
+    zeros = (zeros >> shifts) & 0xF
+
+    # Compute scale offsets and masks.
+    scale_offsets_y = pid_y * BLOCK_SIZE_Y // group_size + tl.arange(0, 1)
+    scale_offsets_x = pid_x * BLOCK_SIZE_X * 8 + tl.arange(0, BLOCK_SIZE_X * 8)
+    scale_offsets = num_cols * 8 * scale_offsets_y[:, None] + scale_offsets_x[None, :]
+    scale_masks_y = scale_offsets_y < num_rows // group_size
+    scale_masks_x = scale_offsets_x < num_cols * 8
+    scale_masks = scale_masks_y[:, None] & scale_masks_x[None, :]
+
+    # Load the scales.
+    scales = tl.load(scales_ptr + scale_offsets, scale_masks, 0.0)
+    scales = tl.broadcast_to(scales, (BLOCK_SIZE_Y, BLOCK_SIZE_X * 8))
+
+    # Dequantize.
+    iweights = (iweights - zeros) * scales
+    iweights = iweights.to(result_ptr.type.element_ty)
+
+    # Finally, store.
+    tl.store(result_ptr + result_offsets, iweights, result_masks)
+
+
+@triton.jit
+def awq_gemm_kernel(
+    a_ptr,
+    b_ptr,
+    c_ptr,
+    zeros_ptr,
+    scales_ptr,
+    M,
+    N,
+    K,
+    group_size,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    SPLIT_K: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+    pid_z = tl.program_id(1)
+
+    # NOTE: This doesn't work in TRITON_INTERPRET=1 mode.  Use below instead.
+    # num_pid_n = (N + BLOCK_SIZE_N - 1) // BLOCK_SIZE_N
+    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+
+    pid_m = pid // num_pid_n
+    pid_n = pid % num_pid_n
+
+    accumulator_dtype = c_ptr.type.element_ty
+
+    # NOTE: This doesn't work in TRITON_INTERPRET=1 mode.  Use below instead.
+    # accumulator = tl.arange(0, BLOCK_SIZE_N)
+    # accumulator = tl.broadcast_to(accumulator[None, :],
+    # (BLOCK_SIZE_M, BLOCK_SIZE_N))
+    # accumulator = accumulator & 0x0
+    # accumulator = accumulator.to(accumulator_dtype)
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=accumulator_dtype)
+
+    # Create reverse AWQ order as tensor: [0, 4, 1, 5, 2, 6, 3, 7]
+    # that will map given indices to the correct order.
+    reverse_awq_order_tensor = (
+        (tl.arange(0, 2) * 4)[None, :] + tl.arange(0, 4)[:, None]
+    ).reshape(8)
+
+    # Create the necessary shifts to use to unpack.
+    shifts = reverse_awq_order_tensor * 4
+    shifts = tl.broadcast_to(shifts[None, :], (BLOCK_SIZE_K * (BLOCK_SIZE_N // 8), 8))
+    shifts = tl.reshape(shifts, (BLOCK_SIZE_K, BLOCK_SIZE_N))
+
+    # Offsets and masks.
+    offsets_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    masks_am = offsets_am < M
+
+    offsets_bn = pid_n * (BLOCK_SIZE_N // 8) + tl.arange(0, BLOCK_SIZE_N // 8)
+    masks_bn = offsets_bn < N // 8
+
+    offsets_zn = pid_n * (BLOCK_SIZE_N // 8) + tl.arange(0, BLOCK_SIZE_N // 8)
+    masks_zn = offsets_zn < N // 8
+
+    offsets_sn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    masks_sn = offsets_sn < N
+
+    offsets_k = pid_z * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K)
+    offsets_a = K * offsets_am[:, None] + offsets_k[None, :]
+    offsets_b = (N // 8) * offsets_k[:, None] + offsets_bn[None, :]
+
+    a_ptrs = a_ptr + offsets_a
+    b_ptrs = b_ptr + offsets_b
+
+    # NOTE: Use this in TRITON_INTERPRET=1 mode instead of tl.cdiv
+    # block_offset = BLOCK_SIZE_K * SPLIT_K
+    # for k in range(0, (K + block_offset - 1) // (block_offset)):
+    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K * SPLIT_K)):
+        masks_k = offsets_k < K
+        masks_a = masks_am[:, None] & masks_k[None, :]
+        a = tl.load(a_ptrs, mask=masks_a, other=0.0)
+
+        masks_b = masks_k[:, None] & masks_bn[None, :]
+        b = tl.load(b_ptrs, mask=masks_b, other=0.0)
+        b = tl.interleave(b, b)
+        b = tl.interleave(b, b)
+        b = tl.interleave(b, b)
+
+        # Dequantize b.
+        offsets_szk = (
+            BLOCK_SIZE_K * SPLIT_K * k + pid_z * BLOCK_SIZE_K
+        ) // group_size + tl.arange(0, 1)
+        offsets_z = (N // 8) * offsets_szk[:, None] + offsets_zn[None, :]
+        masks_zk = offsets_szk < K // group_size
+        masks_z = masks_zk[:, None] & masks_zn[None, :]
+        zeros_ptrs = zeros_ptr + offsets_z
+        zeros = tl.load(zeros_ptrs, mask=masks_z, other=0.0)
+        zeros = tl.interleave(zeros, zeros)
+        zeros = tl.interleave(zeros, zeros)
+        zeros = tl.interleave(zeros, zeros)
+        zeros = tl.broadcast_to(zeros, (BLOCK_SIZE_K, BLOCK_SIZE_N))
+
+        offsets_s = N * offsets_szk[:, None] + offsets_sn[None, :]
+        masks_sk = offsets_szk < K // group_size
+        masks_s = masks_sk[:, None] & masks_sn[None, :]
+        scales_ptrs = scales_ptr + offsets_s
+        scales = tl.load(scales_ptrs, mask=masks_s, other=0.0)
+        scales = tl.broadcast_to(scales, (BLOCK_SIZE_K, BLOCK_SIZE_N))
+
+        b = (b >> shifts) & 0xF
+        zeros = (zeros >> shifts) & 0xF
+        b = (b - zeros) * scales
+        b = b.to(c_ptr.type.element_ty)
+
+        # Accumulate results.
+        accumulator = tl.dot(a, b, accumulator, out_dtype=accumulator_dtype)
+
+        offsets_k += BLOCK_SIZE_K * SPLIT_K
+        a_ptrs += BLOCK_SIZE_K * SPLIT_K
+        b_ptrs += BLOCK_SIZE_K * SPLIT_K * (N // 8)
+
+    c = accumulator.to(c_ptr.type.element_ty)
+    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    c_ptrs = c_ptr + pid_z * N * M + N * offs_cm[:, None] + offs_cn[None, :]
+    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
+    tl.store(c_ptrs, c, mask=c_mask)
+
+
+# qweights - [K     , M // 8], int32
+# scales   - [K // G, M     ], float16
+# zeros    - [K // G, M // 8], int32
+def awq_dequantize_triton(
+    qweight: torch.Tensor,
+    scales: torch.Tensor,
+    zeros: torch.Tensor,
+    block_size_x: int = 32,
+    block_size_y: int = 32,
+) -> torch.Tensor:
+    K = qweight.shape[0]
+    M = scales.shape[1]
+    group_size = qweight.shape[0] // scales.shape[0]
+
+    assert K > 0 and M > 0
+    assert scales.shape[0] == K // group_size and scales.shape[1] == M
+    assert zeros.shape[0] == K // group_size and zeros.shape[1] == M // 8
+    assert group_size <= K
+    assert group_size in AWQ_TRITON_SUPPORTED_GROUP_SIZES or group_size == K
+
+    # Result tensor:
+    # number of rows = same as input tensor
+    # number of cols = 8 x input tensor num cols
+    result = torch.empty(
+        qweight.shape[0],
+        qweight.shape[1] * 8,
+        device=qweight.device,
+        dtype=scales.dtype,
+    )
+
+    Y = qweight.shape[0]  # num rows
+    X = qweight.shape[1]  # num cols
+
+    grid = lambda META: (
+        triton.cdiv(X, META["BLOCK_SIZE_X"]),
+        triton.cdiv(Y, META["BLOCK_SIZE_Y"]),
+    )
+    awq_dequantize_kernel[grid](
+        qweight,
+        scales,
+        zeros,
+        group_size,
+        result,
+        X,
+        Y,
+        BLOCK_SIZE_X=block_size_x,
+        BLOCK_SIZE_Y=block_size_y,
+    )
+
+    return result
+
+
+# input   - [M, K]
+# qweight - [K, N // 8]
+# qzeros  - [K // G, N // 8]
+# scales  - [K // G, N]
+# split_k_iters - parallelism along K-dimension, int, power of 2.
+def awq_gemm_triton(
+    input: torch.Tensor,
+    qweight: torch.Tensor,
+    scales: torch.Tensor,
+    qzeros: torch.Tensor,
+    split_k_iters: int,
+    block_size_m: int = 32,
+    block_size_n: int = 32,
+    block_size_k: int = 32,
+) -> torch.Tensor:
+    M, K = input.shape
+    N = qweight.shape[1] * 8
+    group_size = qweight.shape[0] // qzeros.shape[0]
+
+    assert N > 0 and K > 0 and M > 0
+    assert qweight.shape[0] == K and qweight.shape[1] == N // 8
+    assert qzeros.shape[0] == K // group_size and qzeros.shape[1] == N // 8
+    assert scales.shape[0] == K // group_size and scales.shape[1] == N
+    assert split_k_iters & (split_k_iters - 1) == 0 and split_k_iters != 0
+    assert split_k_iters <= 32
+    assert group_size <= K
+    assert group_size in AWQ_TRITON_SUPPORTED_GROUP_SIZES or group_size == K
+
+    grid = lambda META: (
+        triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
+        split_k_iters,
+    )
+
+    result = torch.zeros((split_k_iters, M, N), dtype=scales.dtype, device=input.device)
+
+    # A = input, B = qweight, C = result
+    # A = M x K, B = K x N, C = M x N
+    awq_gemm_kernel[grid](
+        input,
+        qweight,
+        result,
+        qzeros,
+        scales,
+        M,
+        N,
+        K,
+        group_size,
+        BLOCK_SIZE_M=block_size_m,
+        BLOCK_SIZE_N=block_size_n,
+        BLOCK_SIZE_K=block_size_k,
+        SPLIT_K=split_k_iters,
+    )
+
+    result = result.sum(0)
+
+    return result
+
+
+def awq_dequantize_decomposition(
+    qweight: torch.Tensor,
+    scales: torch.Tensor,
+    zeros: torch.Tensor,
+) -> torch.Tensor:
+    qweight_tmp = qweight
+    qzeros_tmp = zeros
+    qweight_list = []
+    qzeros_list = []
+    shifts = [0, 4, 1, 5, 2, 6, 3, 7]
+    for i in range(0, 8):
+        shift_num = shifts[i] * 4
+        qzeros_list.append((qzeros_tmp.reshape(-1, 1) >> shift_num) & 0xF)
+        qweight_list.append((qweight_tmp.reshape(-1, 1) >> shift_num) & 0xF)
+    qzeros_tmp = (
+        torch.cat(qzeros_list, dim=-1).reshape(qzeros_tmp.shape[0], -1).to(scales.dtype)
+    )
+    qweight_tmp = (
+        torch.cat(qweight_list, dim=-1)
+        .reshape(qweight_tmp.shape[0], -1)
+        .to(scales.dtype)
+    )
+    res = (
+        qweight_tmp.reshape(qzeros_tmp.shape[0], -1, qzeros_tmp.shape[1])
+        - qzeros_tmp.unsqueeze(1)
+    ) * scales.unsqueeze(1)
+    return res.reshape(qweight_tmp.shape[0], -1)
diff --git a/sglang/python/sglang/srt/layers/quantization/base_config.py b/sglang/python/sglang/srt/layers/quantization/base_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..48511d09f0bfdda954d082b484ec9ff30571b4aa
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/base_config.py
@@ -0,0 +1,250 @@
+# Adapted from https://raw.githubusercontent.com/vllm-project/vllm/v0.5.5/vllm/model_executor/layers/quantization/base_config.py
+from __future__ import annotations
+
+import inspect
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type
+
+import torch
+from torch import nn
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.moe_runner import MoeRunnerConfig
+    from sglang.srt.layers.moe.token_dispatcher import CombineInput, DispatchOutput
+    from sglang.srt.models.utils import WeightsMapper
+
+
+class QuantizeMethodBase(ABC):
+    """Base class for different quantized methods."""
+
+    def create_weights(
+        self, layer: torch.nn.Module, *weight_args, **extra_weight_attrs
+    ):
+        """Create weights for a layer.
+
+        The weights will be set as attributes of the layer."""
+        raise NotImplementedError()
+
+    @abstractmethod
+    def apply(self, layer: torch.nn.Module, *args, **kwargs) -> torch.Tensor:
+        """Apply the weights in layer to the input tensor.
+
+        Expects create_weights to have been called before on the layer."""
+        raise NotImplementedError()
+
+    def process_weights_after_loading(self, layer: nn.Module) -> None:
+        """Process the weight after loading.
+
+        This can be used for example, to transpose weights for computation.
+        """
+        return
+
+
+class LinearMethodBase(QuantizeMethodBase):
+    """Base class for different (maybe quantized) linear methods."""
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        """Create weights for a linear layer.
+           The weights will be set as attributes of the layer.
+
+        Args:
+            layer: The layer that is using the LinearMethodBase factory.
+            input_size_per_partition: Size of the weight input dim on rank X.
+            output_partition_sizes: Sizes of the output dim of each logical
+                weight on rank X. E.g., output_partition_sizes for QKVLinear
+                is a list contains the width of Wq, Wk, Wv on rank X.
+            input_size: Size of the input dim of the weight across all ranks.
+            output_size: Size of the output dim of the weight across all ranks.
+            params_dtype: Datatype of the parameters.
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Apply the weights in layer to the input tensor.
+        Expects create_weights to have been called before on the layer."""
+        raise NotImplementedError()
+
+
+class FusedMoEMethodBase(QuantizeMethodBase):
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        raise NotImplementedError
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        raise NotImplementedError
+
+    @abstractmethod
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: DispatchOutput,
+    ) -> CombineInput:
+        raise NotImplementedError
+
+
+class QuantizationConfig(ABC):
+    """Base class for quantization configs."""
+
+    def __init__(self):
+        super().__init__()
+        # mapping is updated by models as they initialize
+        self.packed_modules_mapping: Dict[str, List[str]] = dict()
+
+    @abstractmethod
+    def get_name(self) -> str:
+        """Name of the quantization method."""
+        raise NotImplementedError()
+
+    @abstractmethod
+    def get_supported_act_dtypes(self) -> List[torch.dtype]:
+        """List of supported activation dtypes."""
+        raise NotImplementedError()
+
+    @classmethod
+    @abstractmethod
+    def get_min_capability(cls) -> int:
+        """Minimum GPU capability to support the quantization method.
+
+        E.g., 70 for Volta, 75 for Turing, 80 for Ampere.
+        This requirement is due to the custom CUDA kernels used by the
+        quantization method.
+        """
+        raise NotImplementedError()
+
+    @staticmethod
+    @abstractmethod
+    def get_config_filenames() -> List[str]:
+        """List of filenames to search for in the model directory."""
+        raise NotImplementedError()
+
+    @classmethod
+    @abstractmethod
+    def from_config(cls, config: Dict[str, Any]) -> "QuantizationConfig":
+        """Create a config class from the model's quantization config."""
+        raise NotImplementedError()
+
+    @classmethod
+    def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]:
+        """
+        Detects if this quantization method can support a given checkpoint
+        format by overriding the user specified quantization method --
+        this method should only be overwritten by subclasses in exceptional
+        circumstances
+        """
+        return None
+
+    @classmethod
+    def _modelopt_override_quantization_method(
+        cls, hf_quant_config, user_quant
+    ) -> Optional[str]:
+        """Shared ModelOpt quantization method override logic."""
+        if hf_quant_config is None:
+            return None
+
+        # Check if this is a ModelOpt config
+        quant_algo = hf_quant_config.get("quant_algo", "").upper()
+
+        # If user specified generic "modelopt", auto-detect the specific method
+        if user_quant == "modelopt":
+            if "FP8" in quant_algo:
+                return "modelopt_fp8"
+            elif "NVFP4" in quant_algo or "FP4" in quant_algo:
+                return "modelopt_fp4"
+
+        # The hf_quant_config may be a parsed quant config, so we need to check the
+        # quant_method.
+        if hf_quant_config.get("quant_method", "") == "modelopt_fp8":
+            return "modelopt_fp8"
+        elif hf_quant_config.get("quant_method", "") == "modelopt_fp4":
+            return "modelopt_fp4"
+
+        return None
+
+    @staticmethod
+    def get_from_keys(config: Dict[str, Any], keys: List[str]) -> Any:
+        """Get a value from the model's quantization config."""
+        for key in keys:
+            if key in config:
+                return config[key]
+        raise ValueError(
+            f"Cannot find any of {keys} in the model's " "quantization config."
+        )
+
+    @staticmethod
+    def get_from_keys_or(config: Dict[str, Any], keys: List[str], default: Any) -> Any:
+        """Get a optional value from the model's quantization config."""
+        try:
+            return QuantizationConfig.get_from_keys(config, keys)
+        except ValueError:
+            return default
+
+    @abstractmethod
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional[QuantizeMethodBase]:
+        """Get the quantize method to use for the quantized layer.
+
+        Args:
+            layer: The layer for the quant method.
+            prefix: The full name of the layer in the state dict
+        Returns:
+            The quantize method. None if the given layer doesn't support quant
+            method.
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def get_scaled_act_names(self) -> List[str]:
+        """Returns the activation function names that should be post-scaled.
+
+        For now, this is only used by AWQ.
+        """
+        raise NotImplementedError()
+
+    def apply_weight_name_mapper(
+        self, hf_to_sglang_mapper: "WeightsMapper"
+    ):  # noqa: B027
+        """
+        Interface for models to update module names referenced in
+        quantization configs in order to reflect the sglang model structure
+        :param hf_to_sglang_mapper: maps from hf model structure (the assumed
+            structure of the qconfig) to sglang model structure
+        """
+        pass
+
+
+def method_has_implemented_embedding(method_class: Type[QuantizeMethodBase]) -> bool:
+    """
+    Not all quant methods have embedding implemented, so we need to check that
+    it exists for our given method. We check this by making sure the function
+    has been changed from the base implementation.
+    """
+    base_embedding = inspect.getattr_static(QuantizeMethodBase, "embedding", None)
+    class_embedding = inspect.getattr_static(method_class, "embedding", None)
+
+    return class_embedding is not None and class_embedding is not base_embedding
diff --git a/sglang/python/sglang/srt/layers/quantization/base_scheme.py b/sglang/python/sglang/srt/layers/quantization/base_scheme.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee55caa3ceaed1d64e0189931bcffe84146d333e
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/base_scheme.py
@@ -0,0 +1,99 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, Optional
+
+import torch
+
+from sglang.srt.layers.moe import MoeRunnerConfig
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.token_dispatcher import StandardDispatchOutput
+
+__all__ = ["BaseLinearScheme", "BaseMoEScheme"]
+
+
+class BaseLinearScheme(ABC):
+    """
+    Abstract class used to describe the weight creation and forward pass
+    of different quantization schemes.
+    """
+
+    @abstractmethod
+    def create_weights(self, *args, **kwargs):
+        """
+        Weight creation for the particular scheme. Inputs to this function
+
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def process_weights_after_loading(self, layer: torch.nn.Module):
+        """
+        Called after weight loading is complete for any cleanup that
+        needs to occur.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def apply_weights(
+        self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor]
+    ):
+        """
+        Run the forward pass for the particular scheme. This is where
+        scheme-specific dequant/quant steps/kernels should be applied.
+
+        :param layer: torch.nn.Module with the registered weights and
+            other parameters relevant to the particular scheme.
+        :param x: input to the layer
+        :param bias: bias parameter
+
+        """
+        raise NotImplementedError
+
+
+class BaseMoEScheme(ABC):
+    """
+    Abstract class used to describe the weight creation and forward pass
+    of different quantization schemes.
+    """
+
+    @abstractmethod
+    def create_weights(self, *args, **kwargs):
+        """
+        Weight creation for the particular scheme. Inputs to this function
+
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        raise NotImplementedError
+
+    @abstractmethod
+    def process_weights_after_loading(self, layer: torch.nn.Module):
+        """
+        Called after weight loading is complete for any cleanup that
+        needs to occur.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: "StandardDispatchOutput",
+    ):
+        """
+        Run the forward pass for the particular scheme. This is where
+        scheme-specific dequant/quant steps/kernels should be applied.
+
+        :param layer: torch.nn.Module with the registered weights and
+            other parameters relevant to the particular scheme.
+        :param x: input to the layer
+        :param bias: bias parameter
+
+        """
+        raise NotImplementedError
diff --git a/sglang/python/sglang/srt/layers/quantization/bitsandbytes.py b/sglang/python/sglang/srt/layers/quantization/bitsandbytes.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f17da1a627d56695b5fa3b851540901810767fa
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/bitsandbytes.py
@@ -0,0 +1,620 @@
+# SPDX-License-Identifier: Apache-2.0
+# Adapted from: https://github.com/vllm-project/vllm/blob/d4d2751732c3ccae162a5a0160c7d4fe05d2779a/vllm/model_executor/layers/quantization/bitsandbytes.py
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, Optional
+
+import torch
+from packaging import version
+
+from sglang.srt.layers.linear import LinearBase
+from sglang.srt.layers.quantization.base_config import (
+    FusedMoEMethodBase,
+    LinearMethodBase,
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
+from sglang.srt.utils import direct_register_custom_op, set_weight_attrs
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.token_dispatcher import (
+        CombineInput,
+        StandardDispatchOutput,
+    )
+
+
+class BitsAndBytesConfig(QuantizationConfig):
+    """Config class for BitsAndBytes Quantization.
+
+    Reference: https://arxiv.org/abs/2305.14314
+    """
+
+    def __init__(
+        self,
+        load_in_8bit: bool = False,
+        load_in_4bit: bool = True,
+        bnb_4bit_compute_dtype: str = "float32",
+        bnb_4bit_quant_storage: str = "uint8",
+        bnb_4bit_quant_type: str = "fp4",
+        bnb_4bit_use_double_quant: bool = False,
+        llm_int8_enable_fp32_cpu_offload: bool = False,
+        llm_int8_has_fp16_weight: bool = False,
+        llm_int8_skip_modules: list[str] | None = None,
+        llm_int8_threshold: float = 6.0,
+    ) -> None:
+        super().__init__()
+        self.load_in_8bit = load_in_8bit
+        self.load_in_4bit = load_in_4bit
+        self.bnb_4bit_compute_dtype = bnb_4bit_compute_dtype
+        self.bnb_4bit_quant_storage = bnb_4bit_quant_storage
+        self.bnb_4bit_quant_type = bnb_4bit_quant_type
+        self.bnb_4bit_use_double_quant = bnb_4bit_use_double_quant
+        self.llm_int8_enable_fp32_cpu_offload = llm_int8_enable_fp32_cpu_offload
+        self.llm_int8_has_fp16_weight = llm_int8_has_fp16_weight
+        self.llm_int8_skip_modules = llm_int8_skip_modules or []
+        self.llm_int8_threshold = llm_int8_threshold
+
+        if self.bnb_4bit_quant_storage not in ["uint8"]:
+            raise ValueError(
+                f"Unsupported bnb_4bit_quant_storage: {self.bnb_4bit_quant_storage}"
+            )
+
+    def __repr__(self) -> str:
+        return (
+            f"BitsAndBytesConfig(load_in_8bit={self.load_in_8bit}, "
+            f"load_in_4bit={self.load_in_4bit}, "
+            f"bnb_4bit_compute_dtype={self.bnb_4bit_compute_dtype}, "
+            f"bnb_4bit_quant_storage={self.bnb_4bit_quant_storage}, "
+            f"bnb_4bit_quant_type={self.bnb_4bit_quant_type}, "
+            f"llm_int8_skip_modules={self.llm_int8_skip_modules})"
+        )
+
+    def get_name(self) -> str:
+        return "bitsandbytes"
+
+    def get_scaled_act_names(self) -> list[str]:
+        return []
+
+    def get_supported_act_dtypes(self) -> list[torch.dtype]:
+        return [torch.float32, torch.float16, torch.bfloat16]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 70
+
+    @staticmethod
+    def get_config_filenames() -> list[str]:
+        return []
+
+    @classmethod
+    def from_config(cls, config: dict[str, Any]) -> "BitsAndBytesConfig":
+        def get_safe_value(config, keys, default_value=None):
+            try:
+                value = QuantizationConfig.get_from_keys(config, keys)
+                return value if value is not None else default_value
+            except ValueError:
+                return default_value
+
+        load_in_8bit = get_safe_value(config, ["load_in_8bit"], default_value=False)
+        load_in_4bit = get_safe_value(config, ["load_in_4bit"], default_value=True)
+        bnb_4bit_compute_dtype = get_safe_value(
+            config, ["bnb_4bit_compute_dtype"], default_value="float32"
+        )
+        bnb_4bit_quant_storage = get_safe_value(
+            config, ["bnb_4bit_quant_storage"], default_value="uint8"
+        )
+        bnb_4bit_quant_type = get_safe_value(
+            config, ["bnb_4bit_quant_type"], default_value="fp4"
+        )
+        bnb_4bit_use_double_quant = get_safe_value(
+            config, ["bnb_4bit_use_double_quant"], default_value=False
+        )
+        llm_int8_enable_fp32_cpu_offload = get_safe_value(
+            config, ["llm_int8_enable_fp32_cpu_offload"], default_value=False
+        )
+        llm_int8_has_fp16_weight = get_safe_value(
+            config, ["llm_int8_has_fp16_weight"], default_value=False
+        )
+        llm_int8_skip_modules = get_safe_value(
+            config, ["llm_int8_skip_modules"], default_value=[]
+        )
+        llm_int8_threshold = get_safe_value(
+            config, ["llm_int8_threshold"], default_value=6.0
+        )
+
+        return cls(
+            load_in_8bit=load_in_8bit,
+            load_in_4bit=load_in_4bit,
+            bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
+            bnb_4bit_quant_storage=bnb_4bit_quant_storage,
+            bnb_4bit_quant_type=bnb_4bit_quant_type,
+            bnb_4bit_use_double_quant=bnb_4bit_use_double_quant,
+            llm_int8_enable_fp32_cpu_offload=llm_int8_enable_fp32_cpu_offload,
+            llm_int8_has_fp16_weight=llm_int8_has_fp16_weight,
+            llm_int8_skip_modules=llm_int8_skip_modules,
+            llm_int8_threshold=llm_int8_threshold,
+        )
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional[QuantizeMethodBase]:
+        from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+
+        if isinstance(layer, LinearBase):
+            if is_layer_skipped_bnb(prefix, self.llm_int8_skip_modules):
+                return UnquantizedLinearMethod()
+            return BitsAndBytesLinearMethod(self)
+        elif isinstance(layer, FusedMoE):
+            return BitsAndBytesMoEMethod(self)
+        return None
+
+
+def is_layer_skipped_bnb(prefix: str, llm_int8_skip_modules: list[str]):
+    # Split the prefix into its dot-separated components
+    components = prefix.split(".")
+
+    # Check if any of the skip modules exactly matches any component
+    substr_check = any(
+        module_name in components for module_name in llm_int8_skip_modules
+    )
+
+    # Allow certain layers to not be quantized
+    set_components = set(".".join(components[: i + 1]) for i in range(len(components)))
+    set_llm_int8_skip_modules = set(llm_int8_skip_modules)
+    prefix_check = len(set_llm_int8_skip_modules & set_components) != 0
+
+    return substr_check or prefix_check
+
+
+def calculate_quant_ratio(dtype):
+    if dtype.is_floating_point:
+        return torch.finfo(dtype).bits // torch.iinfo(torch.uint8).bits
+    else:
+        return torch.iinfo(dtype).bits // torch.iinfo(torch.uint8).bits
+
+
+class BitsAndBytesLinearMethod(LinearMethodBase):
+    """Linear method for BitsAndBytes.
+
+    Args:
+       quant_config: The BitsAndBytes quantization config.
+    """
+
+    def __init__(self, quant_config: BitsAndBytesConfig):
+        try:
+            import bitsandbytes
+
+            if version.parse(bitsandbytes.__version__) < version.parse("0.46.1"):
+                raise ImportError(
+                    "bitsandbytes version is wrong. Please "
+                    "install bitsandbytes>=0.46.1."
+                )
+        except ImportError as err:
+            raise ImportError(
+                "Please install bitsandbytes>=0.46.1 via "
+                "`pip install bitsandbytes>=0.46.1` to use "
+                "bitsandbytes quantizer."
+            ) from err
+
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        from bitsandbytes.nn import Int8Params
+
+        def create_qweight_for_8bit():
+            qweight = Int8Params(
+                data=torch.empty(
+                    sum(output_partition_sizes),
+                    input_size_per_partition,
+                    dtype=torch.int8,
+                ),
+                has_fp16_weights=self.quant_config.llm_int8_has_fp16_weight,
+                requires_grad=False,
+            )
+            set_weight_attrs(
+                qweight,
+                {
+                    "input_dim": 0,
+                    "output_dim": 0,
+                    "pack_factor": 1,
+                    "use_bitsandbytes_8bit": True,
+                    "generation": 0,
+                },
+            )
+            return qweight
+
+        def create_qweight_for_4bit():
+            quant_ratio = calculate_quant_ratio(params_dtype)
+
+            total_size = input_size_per_partition * sum(output_partition_sizes)
+            if total_size % quant_ratio != 0:
+                raise ValueError(
+                    "The input size is not aligned with the quantized weight shape."
+                )
+
+            qweight = torch.nn.Parameter(
+                torch.empty(total_size // quant_ratio, 1, dtype=torch.uint8),
+                requires_grad=False,
+            )
+            set_weight_attrs(
+                qweight,
+                {
+                    "input_dim": 0,
+                    "output_dim": 0,
+                    "pack_factor": quant_ratio,
+                    "use_bitsandbytes_4bit": True,
+                },
+            )
+            return qweight
+
+        if self.quant_config.load_in_8bit:
+            qweight = create_qweight_for_8bit()
+        else:
+            qweight = create_qweight_for_4bit()
+        # Enable parameters to have the same name as in the BNB
+        # checkpoint format.
+        layer.register_parameter("weight", qweight)
+        set_weight_attrs(qweight, extra_weight_attrs)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        if self.quant_config.load_in_8bit:
+            return self._apply_8bit_weight(layer, x, bias)
+        else:
+            return self._apply_4bit_weight(layer, x, bias)
+
+    def _apply_8bit_weight(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        # only load the bitsandbytes module when needed
+        from bitsandbytes import MatmulLtState, matmul
+
+        original_type = x.dtype
+        original_shape = x.shape
+        reshape_after_matmul = False
+        if x.ndim > 2:
+            x = x.reshape(-1, x.size(-1))
+            reshape_after_matmul = True
+        bf_x = x.to(torch.bfloat16)
+
+        qweight = layer.weight
+        offsets = qweight.bnb_shard_offsets
+        quant_states = qweight.bnb_quant_state
+        matmul_states = qweight.matmul_state
+        generation = qweight.generation
+
+        out_dim_0 = x.shape[0]
+        out_dim_1 = sum(
+            [quant_state[1].shape[0] for quant_state in quant_states.items()]
+        )
+        out = torch.empty(out_dim_0, out_dim_1, dtype=torch.float16, device=x.device)
+
+        current_index = 0
+        for i in range(len(quant_states)):
+            output_size = quant_states[i].shape[0]
+
+            # in profile_run or the first generation of inference,
+            # create new matmul_states
+            if generation == 0 or generation == 1:
+                matmul_states[i] = MatmulLtState()
+                matmul_states[i].CB = qweight[offsets[i] : offsets[i + 1]]
+                matmul_states[i].SCB = quant_states[i].to(x.device)
+                matmul_states[i].threshold = self.quant_config.llm_int8_threshold
+                matmul_states[i].has_fp16_weights = (
+                    self.quant_config.llm_int8_has_fp16_weight
+                )
+                matmul_states[i].is_training = False
+                if (
+                    matmul_states[i].threshold > 0.0
+                    and not matmul_states[i].has_fp16_weights
+                ):
+                    matmul_states[i].use_pool = True
+
+            new_x = bf_x.unsqueeze(0)
+
+            out[:, current_index : current_index + output_size] = matmul(
+                new_x, qweight[offsets[i] : offsets[i + 1]], state=matmul_states[i]
+            )
+
+            current_index += output_size
+
+            # only update the matmul_states if it is not profile_run
+            if (
+                generation > 0
+                and not self.quant_config.llm_int8_has_fp16_weight
+                and matmul_states[i].CB is not None
+                and matmul_states[i].CxB is not None
+            ):
+                del matmul_states[i].CB
+                qweight[offsets[i] : offsets[i + 1]] = matmul_states[i].CxB
+
+        out = out.to(original_type)
+
+        if reshape_after_matmul:
+            out = out.view(*original_shape[:-1], out.size(-1))
+
+        if bias is not None:
+            out += bias
+
+        qweight.generation += 1
+
+        return out
+
+    def _apply_4bit_weight(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        original_type = x.dtype
+        original_shape = x.shape
+        reshape_after_matmul = False
+        if x.ndim > 2:
+            x = x.reshape(-1, x.size(-1))
+            reshape_after_matmul = True
+        bf_x = x.to(torch.bfloat16)
+
+        qweight = layer.weight
+        quant_states = qweight.bnb_quant_state
+        offsets = qweight.bnb_shard_offsets
+
+        out_dim_0 = x.shape[0]
+        out_dim_1 = sum(
+            [quant_state[1].shape[0] for quant_state in quant_states.items()]
+        )
+        out = torch.empty(out_dim_0, out_dim_1, dtype=torch.bfloat16, device=x.device)
+        apply_bnb_4bit(bf_x, qweight, offsets, out)
+        out = out.to(original_type)
+
+        if reshape_after_matmul:
+            out = out.view(*original_shape[:-1], out.size(-1))
+
+        if bias is not None:
+            out += bias
+
+        return out
+
+
+def _apply_bnb_4bit(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    offsets: torch.Tensor,
+    out: torch.Tensor,
+) -> None:
+    # only load the bitsandbytes module when needed
+    from bitsandbytes import matmul_4bit
+
+    quant_states = weight.bnb_quant_state
+    current_index = 0
+    for i in range(len(quant_states)):
+        output_size = quant_states[i].shape[0]
+        # It is more efficient to use out kwarg like
+        # matmul_4bit(..., out = ...).  Infeasible now due to the bug
+        # https://github.com/TimDettmers/bitsandbytes/issues/1235.
+        # Need to change  after the bug is fixed.
+        out[:, current_index : current_index + output_size] = matmul_4bit(
+            x, weight[offsets[i] : offsets[i + 1]].t(), quant_states[i]
+        )
+        current_index += output_size
+
+
+def _apply_bnb_4bit_fake(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    offsets: torch.Tensor,
+    out: torch.Tensor,
+) -> None:
+    return
+
+
+try:
+    direct_register_custom_op(
+        op_name="apply_bnb_4bit",
+        op_func=_apply_bnb_4bit,
+        mutates_args=["out"],
+        fake_impl=_apply_bnb_4bit_fake,
+    )
+    apply_bnb_4bit = torch.ops.sglang.apply_bnb_4bit
+
+except AttributeError as error:
+    raise error
+
+
+class BitsAndBytesMoEMethod(FusedMoEMethodBase):
+    """MoE method for BitsAndBytes.
+
+    Args:
+       quant_config: The BitsAndBytes quantization config.
+    """
+
+    def __init__(
+        self,
+        quant_config: BitsAndBytesConfig,
+    ):
+        super().__init__()
+        try:
+            import bitsandbytes
+
+            if version.parse(bitsandbytes.__version__) < version.parse("0.46.1"):
+                raise ImportError(
+                    "bitsandbytes version is wrong. Please "
+                    "install bitsandbytes>=0.46.1."
+                )
+        except ImportError as err:
+            raise ImportError(
+                "Please install bitsandbytes>=0.46.1 via "
+                "`pip install bitsandbytes>=0.46.1` to use "
+                "bitsandbytes quantizer."
+            ) from err
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        if self.quant_config.load_in_8bit:
+            call_fun = self._create_weights_8bit
+        else:
+            call_fun = self._create_weights_4bit
+        call_fun(
+            layer,
+            num_experts,
+            hidden_size,
+            intermediate_size_per_partition,
+            params_dtype,
+            **extra_weight_attrs,
+        )
+
+    def create_moe_runner(self, layer: torch.nn.Module, moe_runner_config):
+        self.moe_runner_config = moe_runner_config
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+        from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+
+        # TODO(bnell): Do these need to be called on the hot path?
+        if self.quant_config.load_in_8bit:
+            w13, w2 = self._apply_8bit_dequant(layer)
+        else:
+            w13, w2 = self._apply_4bit_dequant(layer)
+
+        moe_runner_config = self.moe_runner_config
+        output = fused_moe(
+            hidden_states=x,
+            w1=w13,
+            w2=w2,
+            topk_output=topk_output,
+            moe_runner_config=moe_runner_config,
+        )
+        return StandardCombineInput(hidden_states=output)
+
+    def _create_weights_4bit(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        quant_ratio = calculate_quant_ratio(params_dtype)
+        # Fused gate_up_proj (column parallel)
+        w13_total_size = (
+            hidden_size * 2 * intermediate_size_per_partition
+        ) // quant_ratio
+        w13_qweight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                w13_total_size,
+                1,
+                dtype=torch.uint8,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight", w13_qweight)
+        set_weight_attrs(w13_qweight, extra_weight_attrs)
+        set_weight_attrs(
+            w13_qweight,
+            {
+                "num_experts": num_experts,
+                "input_dim": hidden_size,
+                "output_dim": 2 * intermediate_size_per_partition,
+                "experts_shape": (
+                    num_experts,
+                    intermediate_size_per_partition * 2,
+                    hidden_size,
+                ),
+                "pack_factor": quant_ratio,
+                "use_bitsandbytes_4bit": True,
+            },
+        )
+        # down_proj (row parallel)
+        w2_total_size = (hidden_size * intermediate_size_per_partition) // quant_ratio
+        w2_qweight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                w2_total_size,
+                1,
+                dtype=torch.uint8,
+            ),
+            requires_grad=False,
+        )
+        set_weight_attrs(
+            w2_qweight,
+            {
+                "num_experts": num_experts,
+                "input_dim": intermediate_size_per_partition,
+                "output_dim": hidden_size,
+                "experts_shape": (
+                    num_experts,
+                    hidden_size,
+                    intermediate_size_per_partition,
+                ),
+                "pack_factor": quant_ratio,
+                "use_bitsandbytes_4bit": True,
+            },
+        )
+        layer.register_parameter("w2_weight", w2_qweight)
+        set_weight_attrs(w2_qweight, extra_weight_attrs)
+
+    def _create_weights_8bit(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        raise NotImplementedError
+
+    def _apply_4bit_dequant(
+        self, layer: torch.nn.Module
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        from bitsandbytes.functional import dequantize_4bit
+
+        w13 = dequantize_4bit(
+            layer.w13_weight.reshape(-1, 1),
+            layer.w13_weight.bnb_quant_state,
+        )
+        w2 = dequantize_4bit(
+            layer.w2_weight.reshape(-1, 1),
+            layer.w2_weight.bnb_quant_state,
+        )
+        w13 = w13.reshape(layer.w13_weight.experts_shape)
+        w2 = w2.reshape(layer.w2_weight.experts_shape)
+        return w13, w2
+
+    def _apply_8bit_dequant(
+        self, layer: torch.nn.Module
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        raise NotImplementedError
diff --git a/sglang/python/sglang/srt/layers/quantization/blockwise_int8.py b/sglang/python/sglang/srt/layers/quantization/blockwise_int8.py
new file mode 100644
index 0000000000000000000000000000000000000000..60d4e3929b01947db21ac559b0a80e764aaa60de
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/blockwise_int8.py
@@ -0,0 +1,380 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/layers/quantization/fp8.py
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
+
+import torch
+from torch.nn import Module
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig
+from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo
+from sglang.srt.layers.parameter import BlockQuantScaleParameter, ModelWeightParameter
+from sglang.srt.layers.quantization.base_config import (
+    FusedMoEMethodBase,
+    LinearMethodBase,
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.int8_utils import apply_w8a8_block_int8_linear
+from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
+from sglang.srt.layers.quantization.utils import is_layer_skipped
+from sglang.srt.utils import set_weight_attrs
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.token_dispatcher import (
+        CombineInput,
+        StandardDispatchOutput,
+    )
+
+ACTIVATION_SCHEMES = ["static", "dynamic"]
+
+logger = logging.getLogger(__name__)
+
+
+class BlockInt8Config(QuantizationConfig):
+    """Config class for INT8."""
+
+    def __init__(
+        self,
+        is_checkpoint_int8_serialized: bool = False,
+        activation_scheme: str = "dynamic",
+        ignored_layers: Optional[List[str]] = None,
+        weight_block_size: List[int] = None,
+    ) -> None:
+        self.is_checkpoint_int8_serialized = is_checkpoint_int8_serialized
+        if is_checkpoint_int8_serialized:
+            logger.warning(
+                "Detected int8 checkpoint. Please note that the "
+                "format is experimental and subject to change."
+            )
+        if activation_scheme not in ACTIVATION_SCHEMES:
+            raise ValueError(f"Unsupported activation scheme {activation_scheme}")
+        self.activation_scheme = activation_scheme
+        self.ignored_layers = ignored_layers or []
+        if weight_block_size is not None:
+            if not is_checkpoint_int8_serialized:
+                raise ValueError(
+                    f"The block-wise quantization only supports int8-serialized checkpoint for now."
+                )
+            if len(weight_block_size) != 2:
+                raise ValueError(
+                    f"The quantization block size of weight must have 2 dimensions, but got {len(weight_block_size)} dimensions."
+                )
+            if activation_scheme != "dynamic":
+                raise ValueError(
+                    f"The block-wise quantization only supports dynamic activation scheme for now, but got {activation_scheme} activation scheme."
+                )
+        self.weight_block_size = weight_block_size
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "blockwise_int8"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.bfloat16, torch.half]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 80
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return []
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> BlockInt8Config:
+        quant_method = cls.get_from_keys(config, ["quant_method"])
+        is_checkpoint_int8_serialized = "int8" in quant_method
+        activation_scheme = cls.get_from_keys(config, ["activation_scheme"])
+        ignored_layers = cls.get_from_keys_or(config, ["ignored_layers"], None)
+        weight_block_size = cls.get_from_keys_or(config, ["weight_block_size"], None)
+        return cls(
+            is_checkpoint_int8_serialized=is_checkpoint_int8_serialized,
+            activation_scheme=activation_scheme,
+            ignored_layers=ignored_layers,
+            weight_block_size=weight_block_size,
+        )
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional[QuantizeMethodBase]:
+        from sglang.srt.layers.linear import LinearBase
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+
+        if isinstance(layer, LinearBase):
+            if is_layer_skipped(prefix, self.ignored_layers):
+                return UnquantizedLinearMethod()
+            return BlockInt8LinearMethod(self)
+        elif isinstance(layer, FusedMoE):
+            return BlockInt8MoEMethod(self)
+        return None
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+class BlockInt8LinearMethod(LinearMethodBase):
+    """Linear method for INT8.
+    Supports loading INT8 checkpoints with static weight scale and
+    dynamic activation scale.
+
+    Limitations:
+    Only support block-wise int8 quantization and int8 checkpoint
+
+    Args:
+        quant_config: The quantization config.
+    """
+
+    def __init__(self, quant_config: BlockInt8Config):
+        self.quant_config = quant_config
+        assert self.quant_config.weight_block_size is not None
+        assert self.quant_config.is_checkpoint_int8_serialized
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        output_size_per_partition = sum(output_partition_sizes)
+        weight_loader = extra_weight_attrs.get("weight_loader")
+
+        tp_size = get_tensor_model_parallel_world_size()
+
+        block_n, block_k = (
+            self.quant_config.weight_block_size[0],
+            self.quant_config.weight_block_size[1],
+        )
+        # Required by row parallel
+        if tp_size > 1 and input_size // input_size_per_partition == tp_size:
+            if input_size_per_partition % block_k != 0:
+                raise ValueError(
+                    f"Weight input_size_per_partition = "
+                    f"{input_size_per_partition} is not divisible by "
+                    f"weight quantization block_k = {block_k}."
+                )
+        # Required by column parallel or enabling merged weights
+        if (tp_size > 1 and output_size // output_size_per_partition == tp_size) or len(
+            output_partition_sizes
+        ) > 1:
+            for output_partition_size in output_partition_sizes:
+                if output_partition_size % block_n != 0:
+                    raise ValueError(
+                        f"Weight output_partition_size = "
+                        f"{output_partition_size} is not divisible by "
+                        f"weight quantization block_n = {block_n}."
+                    )
+
+        layer.logical_widths = output_partition_sizes
+
+        layer.input_size_per_partition = input_size_per_partition
+        layer.output_size_per_partition = output_size_per_partition
+        layer.orig_dtype = params_dtype
+
+        # WEIGHT
+        weight_dtype = (
+            torch.int8
+            if self.quant_config.is_checkpoint_int8_serialized
+            else params_dtype
+        )
+
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                output_size_per_partition, input_size_per_partition, dtype=weight_dtype
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight", weight)
+
+        # WEIGHT SCALE
+
+        scale = BlockQuantScaleParameter(
+            data=torch.empty(
+                (output_size_per_partition + block_n - 1) // block_n,
+                (input_size_per_partition + block_k - 1) // block_k,
+                dtype=torch.float32,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        scale[:] = torch.finfo(torch.float32).min
+        layer.register_parameter("weight_scale_inv", scale)
+
+        # INPUT ACTIVATION SCALE
+        assert self.quant_config.activation_scheme == "dynamic"
+        layer.register_parameter("input_scale", None)
+
+    def process_weights_after_loading(self, layer: Module) -> None:
+        # Block quant doesn't need to process weights after loading
+        # Use torch Parameter to avoid cuda graph capturing issue
+        layer.weight = torch.nn.Parameter(layer.weight.data, requires_grad=False)
+        layer.weight_scale_inv = torch.nn.Parameter(
+            layer.weight_scale_inv.data, requires_grad=False
+        )
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        return apply_w8a8_block_int8_linear(
+            input=x,
+            weight=layer.weight,
+            block_size=self.quant_config.weight_block_size,
+            weight_scale=layer.weight_scale_inv,
+            input_scale=None,
+            bias=bias,
+        )
+
+
+class BlockInt8MoEMethod(FusedMoEMethodBase):
+    """MoE method for INT8.
+    Supports loading INT8 checkpoints with static weight scale and
+    dynamic activation scale.
+
+    Limitations:
+    Only support block-wise int8 quantization and int8 checkpoint
+
+    Args:
+        quant_config: The quantization config.
+    """
+
+    def __init__(self, quant_config: BlockInt8Config):
+        self.quant_config = quant_config
+        assert self.quant_config.weight_block_size is not None
+        assert self.quant_config.is_checkpoint_int8_serialized
+
+    def create_weights(
+        self,
+        layer: Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        if self.quant_config.is_checkpoint_int8_serialized:
+            params_dtype = torch.int8
+        tp_size = get_tensor_model_parallel_world_size()
+
+        block_n, block_k = (
+            self.quant_config.weight_block_size[0],
+            self.quant_config.weight_block_size[1],
+        )
+        # NOTE(HandH1998): To ensure proper alignment of the block-wise quantization scales, the output_size of the weights for both the gate and up layers must be divisible by block_n.
+        # Required by column parallel or enabling merged weights
+        if intermediate_size_per_partition % block_n != 0:
+            raise ValueError(
+                f"The output_size of gate's and up's weight = "
+                f"{intermediate_size_per_partition} is not divisible by "
+                f"weight quantization block_n = {block_n}."
+            )
+        if tp_size > 1:
+            # Required by row parallel
+            if intermediate_size_per_partition % block_k != 0:
+                raise ValueError(
+                    f"The input_size of down's weight = "
+                    f"{intermediate_size_per_partition} is not divisible by "
+                    f"weight quantization block_k = {block_k}."
+                )
+
+        # WEIGHTS
+        w13_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+
+        w2_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight", w2_weight)
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+
+        # WEIGHT_SCALES
+        w13_weight_scale = torch.nn.Parameter(
+            torch.ones(
+                num_experts,
+                2 * ((intermediate_size_per_partition + block_n - 1) // block_n),
+                (hidden_size + block_k - 1) // block_k,
+                dtype=torch.float32,
+            ),
+            requires_grad=False,
+        )
+        w2_weight_scale = torch.nn.Parameter(
+            torch.ones(
+                num_experts,
+                (hidden_size + block_n - 1) // block_n,
+                (intermediate_size_per_partition + block_k - 1) // block_k,
+                dtype=torch.float32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight_scale_inv", w13_weight_scale)
+        layer.register_parameter("w2_weight_scale_inv", w2_weight_scale)
+
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.BLOCK.value}
+        )
+        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
+        set_weight_attrs(w2_weight_scale, extra_weight_attrs)
+
+        # INPUT_SCALES
+        assert self.quant_config.activation_scheme == "dynamic"
+        layer.w13_input_scale = None
+        layer.w2_input_scale = None
+
+    def process_weights_after_loading(self, layer: Module) -> None:
+        # Block quant doesn't need to process weights after loading
+        return
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+        self.runner = MoeRunner(MoeRunnerBackend.TRITON, moe_runner_config)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+
+        quant_info = TritonMoeQuantInfo(
+            w13_weight=layer.w13_weight,
+            w2_weight=layer.w2_weight,
+            use_int8_w8a8=True,
+            w13_scale=layer.w13_weight_scale_inv,
+            w2_scale=layer.w2_weight_scale_inv,
+            a13_scale=layer.w13_input_scale,
+            a2_scale=layer.w2_input_scale,
+            block_shape=self.quant_config.weight_block_size,
+        )
+
+        return self.runner.run(dispatch_output, quant_info)
diff --git a/sglang/python/sglang/srt/layers/quantization/compressed_tensors/README.md b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e8caf538b9a0f8d00e58ceb3e84be1d66898e878
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/README.md
@@ -0,0 +1,6 @@
+# quantization compressed_tensors module
+
+To support compressed_tensors format quantization models, we adapted https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization/compressed_tensors into SGLang.
+
+
+For practical purposes, we have only applied the compressed_tensors format of `w8a8_fp8`. If you have requirements for other formats, you can submit an issue through this [link](https://github.com/sgl-project/sglang/issues).
diff --git a/sglang/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
new file mode 100644
index 0000000000000000000000000000000000000000..a13c53af4d2dfa94a3a9a261811d5a00d785a7ae
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -0,0 +1,1043 @@
+# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization/compressed_tensors
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import logging
+from contextlib import suppress
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Dict,
+    List,
+    Literal,
+    NamedTuple,
+    Optional,
+    Tuple,
+    cast,
+)
+
+import torch
+from compressed_tensors.config import (
+    CompressionFormat,
+    SparsityCompressionConfig,
+    SparsityStructure,
+)
+from compressed_tensors.quantization import (
+    QuantizationArgs,
+    QuantizationStrategy,
+    QuantizationType,
+)
+from pydantic import BaseModel
+
+from sglang.srt.layers.moe import MoeRunnerConfig, get_moe_runner_backend
+from sglang.srt.layers.quantization.base_config import (
+    FusedMoEMethodBase,
+    LinearMethodBase,
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.compressed_tensors.schemes import (
+    WNA16_SUPPORTED_BITS,
+    CompressedTensorsLinearScheme,
+    CompressedTensorsMoEScheme,
+    CompressedTensorsMxInt4MoE,
+    CompressedTensorsW4A4Fp4,
+    CompressedTensorsW4A4Nvfp4MoE,
+    CompressedTensorsW8A8Fp8,
+    CompressedTensorsW8A8Fp8MoE,
+    CompressedTensorsW8A8Int8,
+    CompressedTensorsW8A16Fp8,
+    CompressedTensorsWNA16,
+    CompressedTensorsWNA16MoE,
+    CompressedTensorsWNA16TritonMoE,
+    NPUCompressedTensorsW4A8Int8DynamicMoE,
+    NPUCompressedTensorsW4A16Int4DynamicMoE,
+    NPUCompressedTensorsW8A8Int8,
+    NPUCompressedTensorsW8A8Int8DynamicMoE,
+)
+from sglang.srt.layers.quantization.compressed_tensors.utils import (
+    find_matched_target,
+    is_activation_quantization_format,
+    should_ignore_layer,
+)
+from sglang.srt.layers.quantization.fp8 import Fp8LinearMethod
+from sglang.srt.layers.quantization.unquant import (
+    UnquantizedFusedMoEMethod,
+    UnquantizedLinearMethod,
+)
+from sglang.srt.utils import is_cuda, is_hip, is_npu
+
+_is_cuda = is_cuda()
+_is_npu = is_npu()
+_is_hip = is_hip()
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.token_dispatcher import (
+        CombineInput,
+        StandardDispatchOutput,
+    )
+    from sglang.srt.models.utils import WeightsMapper
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["CompressedTensorsLinearMethod"]
+
+SPARSITY_CONFIG_NAME: Literal["sparsity_config"] = "sparsity_config"
+QUANTIZATION_SCHEME_MAP_TYPE = Dict[str, Optional[Dict[str, QuantizationArgs]]]
+
+
+class DeviceCapability(NamedTuple):
+    major: int
+    minor: int
+
+    def as_version_str(self) -> str:
+        return f"{self.major}.{self.minor}"
+
+    def to_int(self) -> int:
+        """
+        Express device capability as an integer ``<major><minor>``.
+
+        It is assumed that the minor version is always a single digit.
+        """
+        assert 0 <= self.minor < 10
+        return self.major * 10 + self.minor
+
+
+class CompressedTensorsConfig(QuantizationConfig):
+    def __init__(
+        self,
+        target_scheme_map: Dict[str, Any],
+        ignore: List[str],
+        quant_format: str,
+        sparsity_scheme_map: Dict[str, SparsityCompressionConfig],
+        sparsity_ignore_list: List[str],
+        kv_cache_scheme: Optional[Dict[str, Any]] = None,
+        config: Optional[Dict[str, Any]] = None,
+        packed_modules_mapping: Optional[Dict[str, List[str]]] = None,
+        linear_fp8_config: Optional[Any] = None,
+    ):
+        super().__init__()
+        self.ignore = ignore
+        self.quant_format = quant_format
+        # Map from [target -> scheme]
+        self.target_scheme_map = target_scheme_map
+        self.kv_cache_scheme = kv_cache_scheme
+        self.sparsity_scheme_map = sparsity_scheme_map
+        self.sparsity_ignore_list = sparsity_ignore_list
+        self.config = config
+        self.packed_modules_mapping = packed_modules_mapping or {}
+        self.linear_fp8_config = linear_fp8_config
+
+    def get_linear_method(self) -> CompressedTensorsLinearMethod:
+        return CompressedTensorsLinearMethod(self)
+
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.float16, torch.bfloat16]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 70
+
+    def get_name(self) -> str:
+        return "compressed_tensors"
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+    def apply_weight_name_mapper(self, hf_to_sglang_mapper: "WeightsMapper"):
+        self.target_scheme_map = hf_to_sglang_mapper.apply_dict(self.target_scheme_map)
+        self.ignore = hf_to_sglang_mapper.apply_list(self.ignore)
+        self.sparsity_scheme_map = hf_to_sglang_mapper.apply_dict(
+            self.sparsity_scheme_map
+        )
+        self.sparsity_ignore_list = hf_to_sglang_mapper.apply_list(
+            self.sparsity_ignore_list
+        )
+        if self.kv_cache_scheme is not None:
+            self.kv_cache_scheme = hf_to_sglang_mapper.apply_dict(self.kv_cache_scheme)
+
+    def get_quant_method(
+        self,
+        layer: torch.nn.Module,
+        prefix: str,
+    ) -> Optional[QuantizeMethodBase]:
+        from sglang.srt.layers.linear import LinearBase
+
+        if isinstance(layer, LinearBase):
+            # If linear_fp8_config is set, use FP8 for linear layers
+            # This allows mixed quantization: experts with int4, linear layers with fp8
+            if self.linear_fp8_config is not None:
+                return Fp8LinearMethod(self.linear_fp8_config)
+            scheme = self.get_linear_scheme(layer=layer, layer_name=prefix)
+            if scheme is None:
+                return UnquantizedLinearMethod()
+            layer.scheme = scheme
+            return CompressedTensorsLinearMethod(self)
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+
+        if isinstance(layer, FusedMoE):
+            layer.scheme = self.get_moe_scheme(layer=layer, layer_name=prefix)
+            if layer.scheme is None:  # ignored layer
+                use_triton_kernels = get_moe_runner_backend().is_triton_kernels()
+                use_flashinfer_trtllm_moe = (
+                    get_moe_runner_backend().is_flashinfer_trtllm()
+                )
+                return UnquantizedFusedMoEMethod(
+                    use_triton_kernels, use_flashinfer_trtllm_moe
+                )
+            return CompressedTensorsFusedMoEMethod(self)
+        return None
+
+    def _add_fused_moe_to_target_scheme_map(self):
+        """
+        Helper function to update target_scheme_map
+        since linear layers get fused into FusedMoE
+        targeting 'Linear' needs to also match
+        FusedMoE modules.
+        """
+        if (
+            "Linear" not in self.target_scheme_map
+            or "FusedMoE" in self.target_scheme_map
+        ):
+            return
+        self.target_scheme_map["FusedMoE"] = self.target_scheme_map["Linear"]
+        self.target_scheme_map["DeepEPMoE"] = self.target_scheme_map["Linear"]
+
+    @property
+    def weight_block_size(self) -> Optional[List[int]]:
+        """Get the weight block size from the quantization config."""
+        if "Linear" in self.target_scheme_map:
+            weights_config = self.target_scheme_map["Linear"].get("weights")
+            if weights_config and hasattr(weights_config, "block_structure"):
+                return weights_config.block_structure
+        return None
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> CompressedTensorsConfig:
+        ignore: List[str] = cast(List[str], config.get("ignore", []))
+        quant_format = cast(str, config.get("format"))
+        target_scheme_map = cls._quantization_scheme_map_from_config(config=config)
+        sparsity_scheme_map, sparsity_ignore_list = cls._parse_sparsity_config(
+            config=config
+        )
+        packed_modules_mapping = config.get("packed_modules_mapping", {})
+
+        # Parse linear_fp8_config if present (for mixed quantization scenarios)
+        # Format: {"activation_scheme": "dynamic", "fmt": "e4m3",
+        #          "quant_method": "fp8", "weight_block_size": [128, 128]}
+        linear_fp8_config = None
+        if "linear_fp8_config" in config:
+            from sglang.srt.layers.quantization.fp8 import Fp8Config
+
+            fp8_cfg = config["linear_fp8_config"]
+            # Check if it's fp8 format based on quant_method field
+            is_fp8 = fp8_cfg.get("quant_method") == "fp8"
+            linear_fp8_config = Fp8Config(
+                is_checkpoint_fp8_serialized=is_fp8,
+                activation_scheme=fp8_cfg.get("activation_scheme", "dynamic"),
+                ignored_layers=fp8_cfg.get("ignored_layers"),
+                weight_block_size=fp8_cfg.get("weight_block_size"),
+            )
+
+        return cls(
+            target_scheme_map=target_scheme_map,
+            ignore=ignore,
+            quant_format=quant_format,
+            sparsity_scheme_map=sparsity_scheme_map,
+            sparsity_ignore_list=sparsity_ignore_list,
+            config=config,
+            packed_modules_mapping=packed_modules_mapping,
+            linear_fp8_config=linear_fp8_config,
+        )
+
+    @classmethod
+    def _parse_sparsity_config(
+        cls, config: Dict[str, Any]
+    ) -> Tuple[Dict[str, SparsityCompressionConfig], List[str]]:
+        """
+        :param config: The `quantization_config` dictionary from config.json
+        :return: A tuple with two elements
+            1. A dictionary mapping target layer names to their corresponding
+                sparsity_config
+            2. A list of layer names to ignore for sparsity
+        """
+        if not (sparsity_config := config.get(SPARSITY_CONFIG_NAME)):
+            return dict(), []
+
+        sparsity_config = SparsityCompressionConfig.model_validate(sparsity_config)
+        sparse_scheme_map: Dict[str, SparsityCompressionConfig] = {
+            target: sparsity_config for target in sparsity_config.targets or list()
+        }
+        sparsity_ignore_list = sparsity_config.ignore or list()
+        return sparse_scheme_map, sparsity_ignore_list
+
+    @classmethod
+    def _quantization_scheme_map_from_config(
+        cls, config: Dict[str, Any]
+    ) -> QUANTIZATION_SCHEME_MAP_TYPE:
+        """
+        :param config: The `quantization_config` dictionary from config.json
+        :return: A dictionary mapping target layer names to their corresponding
+            quantization_args for weights and input activations
+        """
+        target_scheme_map: Dict[str, Any] = dict()
+        quant_format = cast(str, config.get("format"))
+
+        # The quant_config has multiple config_groups, each containing
+        # an input_activations key with details about how the activations are
+        # quantized, a weights key indicating how the weights are quantized,
+        # and a list of targets under the `targets` key, dictating which
+        # layers are impacted by the quantization details. The quantization
+        # details follow the structure defined by the QuantizationArgs
+        # pydantic model, which is used to verify the structure of the
+        # quant_config and also store the details for later use.
+
+        config_groups = config.get("config_groups", dict())
+        for _, quant_config in config_groups.items():
+            targets = quant_config.get("targets")
+            for target in targets:
+                target_scheme_map[target] = {}
+                target_scheme_map[target]["weights"] = QuantizationArgs.model_validate(
+                    quant_config.get("weights")
+                )
+
+                target_scheme_map[target]["input_activations"] = None
+                if is_activation_quantization_format(quant_format):
+                    input_activations = quant_config.get("input_activations")
+                    # The only case where we have activation quant supported
+                    # but no input_activations provided in the config
+                    # should be w8a16fp8 w8a16fp8 can also run for cases where
+                    # there is an input_quant but it is ignored
+                    if not input_activations:
+                        assert (
+                            target_scheme_map[target]["weights"].type
+                            == QuantizationType.FLOAT
+                        )
+                    else:
+                        target_scheme_map[target]["input_activations"] = (
+                            QuantizationArgs.model_validate(  # noqa: E501
+                                quant_config.get("input_activations")
+                            )
+                        )
+        return target_scheme_map
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return []
+
+    def _check_scheme_supported(self, min_capability: int, error: bool = True) -> bool:
+        capability_tuple = DeviceCapability(*torch.cuda.get_device_capability())
+
+        if capability_tuple is not None:
+            capability = capability_tuple.to_int()
+            supported = capability >= min_capability
+            if error and not supported:
+                raise RuntimeError(
+                    "Quantization scheme is not supported for ",
+                    f"the current GPU. Min capability: {min_capability}. ",
+                    f"Current capability: {capability}.",
+                )
+            return supported
+        else:
+            return False
+
+    def _is_dynamic_token_w4a8(
+        self, weight_quant: BaseModel, input_quant: BaseModel
+    ) -> bool:
+        is_weight_4_bits = weight_quant.num_bits == 4
+        is_activation_8_bits = input_quant.num_bits == 8
+        weight_strategy = (
+            weight_quant.strategy == QuantizationStrategy.GROUP.value
+            or weight_quant.strategy == QuantizationStrategy.CHANNEL.value
+        )
+        is_token = (
+            weight_strategy and input_quant.strategy == QuantizationStrategy.TOKEN.value
+        )
+        is_dynamic = not weight_quant.dynamic and input_quant.dynamic
+
+        return (
+            is_weight_4_bits
+            and is_activation_8_bits
+            and is_token
+            and weight_quant.symmetric
+            and is_dynamic
+        )
+
+    def _is_static_tensor_w8a8(
+        self, weight_quant: BaseModel, input_quant: BaseModel
+    ) -> bool:
+        is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
+        weight_strategy = (
+            weight_quant.strategy == QuantizationStrategy.TENSOR.value
+            or weight_quant.strategy == QuantizationStrategy.CHANNEL.value
+        )
+        is_tensor = (
+            weight_strategy
+            and input_quant.strategy == QuantizationStrategy.TENSOR.value
+        )
+        is_static = not weight_quant.dynamic and not input_quant.dynamic
+
+        # Both symmetric and asymmetric input quantization supported.
+        # Only symmetric weight quantization supported.
+        return is_8_bits and is_tensor and weight_quant.symmetric and is_static
+
+    def _is_dynamic_token_w8a8(
+        self, weight_quant: BaseModel, input_quant: BaseModel
+    ) -> bool:
+        is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
+        weight_strategy = (
+            weight_quant.strategy == QuantizationStrategy.TENSOR.value
+            or weight_quant.strategy == QuantizationStrategy.CHANNEL.value
+        )
+        is_token = (
+            weight_strategy and input_quant.strategy == QuantizationStrategy.TOKEN.value
+        )
+        is_dynamic = not weight_quant.dynamic and input_quant.dynamic
+
+        # Both symmetric and asymmetric input quantization supported.
+        # Only symmetric weight quantization supported.
+        return is_8_bits and is_token and weight_quant.symmetric and is_dynamic
+
+    def _is_fp8_w8a8(
+        self, weight_quant: QuantizationArgs, input_quant: QuantizationArgs
+    ) -> bool:
+        # Confirm weights and activations quantized.
+        if weight_quant is None or input_quant is None:
+            return False
+
+        # Confirm weight scheme is supported.
+        is_floating_point = (
+            weight_quant.type == QuantizationType.FLOAT
+            and input_quant.type == QuantizationType.FLOAT
+        )
+        is_symmetric_weight = weight_quant.symmetric
+        is_static_weight = not weight_quant.dynamic
+        is_tensor_or_channel_or_block_weight = weight_quant.strategy in [
+            QuantizationStrategy.TENSOR,
+            QuantizationStrategy.CHANNEL,
+            QuantizationStrategy.BLOCK,
+        ]
+        if not (
+            is_floating_point
+            and is_symmetric_weight
+            and is_static_weight
+            and is_tensor_or_channel_or_block_weight
+        ):
+            return False
+
+        # Dynamic quantization is always supported if weights supported.
+        if input_quant.dynamic:
+            return True
+
+        # Confirm activation scheme is supported.
+        is_symmetric_activation = input_quant.symmetric
+        is_per_tensor_activation = input_quant.strategy == QuantizationStrategy.TENSOR
+        return is_symmetric_activation and is_per_tensor_activation
+
+    def _is_fp8_w8a16(self, weight_quant: BaseModel, input_quant: BaseModel) -> bool:
+        # Confirm weights quantized.
+        if weight_quant is None:
+            return False
+
+        # Confirm we have floating points.
+        if weight_quant.type != QuantizationType.FLOAT:
+            return False
+
+        # Confirm weight scheme is supported.
+        is_symmetric_weight = weight_quant.symmetric
+        is_static_weight = not weight_quant.dynamic
+        is_per_tensor_or_channel_weight = weight_quant.strategy in [
+            QuantizationStrategy.TENSOR,
+            QuantizationStrategy.CHANNEL,
+        ]
+        if not (
+            is_symmetric_weight
+            and is_static_weight  # noqa: SIM103
+            and is_per_tensor_or_channel_weight
+        ):
+            return False
+
+        # All conditions satisfied.
+        return True
+
+    def _is_fp4a4_nvfp4(
+        self, weight_quant: QuantizationArgs, input_quant: QuantizationArgs
+    ):
+        if weight_quant is None or input_quant is None:
+            return False
+
+        is_tensor_group_quant = (
+            weight_quant.strategy == QuantizationStrategy.TENSOR_GROUP.value
+            and input_quant.strategy == QuantizationStrategy.TENSOR_GROUP.value
+        )
+        is_symmetric = weight_quant.symmetric and input_quant.symmetric
+
+        is_group_size_16 = (
+            weight_quant.group_size == 16 and input_quant.group_size == 16
+        )
+        is_float_type = (
+            weight_quant.type == QuantizationType.FLOAT
+            and input_quant.type == QuantizationType.FLOAT
+        )
+        is_4_bits = weight_quant.num_bits == 4 and input_quant.num_bits == 4
+
+        return (
+            is_tensor_group_quant
+            and is_float_type
+            and is_4_bits
+            and is_group_size_16
+            and is_symmetric
+        )
+
+    def _is_wNa16_group_channel(
+        self, weight_quant: BaseModel, input_quant: BaseModel
+    ) -> bool:
+        input_quant_none = input_quant is None
+        is_symmetric = weight_quant.symmetric
+        is_channel_group = (
+            weight_quant.strategy == QuantizationStrategy.CHANNEL.value
+            or weight_quant.strategy == QuantizationStrategy.GROUP.value
+        )
+        is_static = not weight_quant.dynamic
+
+        return is_channel_group and input_quant_none and is_symmetric and is_static
+
+    def _is_mxint4a16(self, weight_quant: BaseModel, input_quant: BaseModel) -> bool:
+        input_quant_none = input_quant is None
+        is_symmetric = weight_quant.symmetric
+        is_mxint4 = (
+            weight_quant.num_bits == 4
+            and weight_quant.type == QuantizationType.INT
+            and weight_quant.strategy == QuantizationStrategy.GROUP.value
+            and weight_quant.group_size == 32
+        )
+        is_static = not weight_quant.dynamic
+
+        return is_mxint4 and input_quant_none and is_symmetric and is_static
+
+    def _is_dynamic_token_w4(
+        self, weight_quant: BaseModel, input_quant: BaseModel
+    ) -> bool:
+        is_w4 = weight_quant.num_bits == 4
+        weight_strategy = (
+            weight_quant.strategy == QuantizationStrategy.TENSOR.value
+            or weight_quant.strategy == QuantizationStrategy.CHANNEL.value
+            or weight_quant.strategy == QuantizationStrategy.GROUP.value
+        )
+        if input_quant is not None:
+            is_token = (
+                weight_strategy
+                and input_quant.strategy == QuantizationStrategy.TOKEN.value
+            )
+            is_dynamic = not weight_quant.dynamic and input_quant.dynamic
+        else:
+            is_token = weight_strategy
+            is_dynamic = not weight_quant.dynamic
+
+        # Both symmetric and asymmetric input quantization supported.
+        # Only symmetric weight quantization supported.
+        return is_w4 and weight_quant.symmetric and is_token and is_dynamic
+
+    def _get_scheme_from_parts(
+        self, weight_quant: BaseModel, input_quant: BaseModel
+    ) -> CompressedTensorsLinearScheme:
+
+        # Detect If Mixed Precision
+        if self._is_wNa16_group_channel(weight_quant, input_quant):
+            if (
+                self.quant_format == CompressionFormat.pack_quantized.value
+                and weight_quant.num_bits in WNA16_SUPPORTED_BITS
+            ):
+                return CompressedTensorsWNA16(
+                    num_bits=weight_quant.num_bits,
+                    strategy=weight_quant.strategy,
+                    group_size=weight_quant.group_size,
+                    actorder=weight_quant.actorder,
+                )
+            else:
+                raise ImportError(
+                    "Other method (CompressedTensorsW4A16Sparse24) is not supported now"
+                )
+
+        if is_activation_quantization_format(self.quant_format):
+            if self._is_fp4a4_nvfp4(weight_quant, input_quant):
+                is_fp4a4_nvfp4_supported = self._check_scheme_supported(
+                    CompressedTensorsW4A4Fp4.get_min_capability(), error=False
+                )
+                if is_fp4a4_nvfp4_supported:
+                    return CompressedTensorsW4A4Fp4()
+                else:
+                    raise NotImplementedError(
+                        "Current platform does not support w4a4 nvfp4 quantization."
+                    )
+
+            if self._is_fp8_w8a8(weight_quant, input_quant):
+                is_fp8_w8a8_supported = self._check_scheme_supported(
+                    CompressedTensorsW8A8Fp8.get_min_capability(), error=False
+                )
+                if is_fp8_w8a8_supported:
+                    return CompressedTensorsW8A8Fp8(
+                        weight_quant=weight_quant,
+                        is_static_input_scheme=(
+                            input_quant and not input_quant.dynamic
+                        ),
+                    )
+                else:
+                    # note: input_quant will be present for converted models;
+                    # will be ignored during inference post loading
+                    return CompressedTensorsW8A16Fp8(
+                        strategy=weight_quant.strategy,
+                        is_static_input_scheme=not input_quant.dynamic,
+                    )
+
+            # note: input_quant can be None
+            if self._is_fp8_w8a16(weight_quant, input_quant):
+                is_static_input_scheme = input_quant and not input_quant.dynamic
+                return CompressedTensorsW8A16Fp8(
+                    strategy=weight_quant.strategy,
+                    is_static_input_scheme=is_static_input_scheme,
+                )
+
+            if self._is_static_tensor_w8a8(weight_quant, input_quant):
+                if not _is_npu:
+                    return CompressedTensorsW8A8Int8(
+                        strategy=weight_quant.strategy,
+                        is_static_input_scheme=True,
+                        input_symmetric=input_quant.symmetric,
+                    )
+                else:
+                    return NPUCompressedTensorsW8A8Int8(
+                        strategy=weight_quant.strategy,
+                        is_static_input_scheme=True,
+                        input_symmetric=input_quant.symmetric,
+                    )
+
+            if self._is_dynamic_token_w8a8(weight_quant, input_quant):
+                if not _is_npu:
+                    return CompressedTensorsW8A8Int8(
+                        strategy=weight_quant.strategy,
+                        is_static_input_scheme=False,
+                        input_symmetric=input_quant.symmetric,
+                    )
+                else:
+                    return NPUCompressedTensorsW8A8Int8(
+                        strategy=weight_quant.strategy,
+                        is_static_input_scheme=False,
+                        input_symmetric=input_quant.symmetric,
+                    )
+
+        raise NotImplementedError("No compressed-tensors compatible scheme was found.")
+
+    def get_moe_scheme(
+        self, layer: torch.nn.Module, layer_name: Optional[str] = None
+    ) -> Optional[CompressedTensorsMoEScheme]:
+        """
+        compressed-tensors supports non uniform in the following way:
+
+        targets of config_groups: There can be N config_groups which each
+            have a quantization scheme. Each config_group has a list of targets
+            which can be a full layer_name, a regex for a layer_name, or
+            an nn.Module name.
+
+        Detect whether a layer_name is found in any target and
+        use the quantization scheme corresponding to the matched target
+        to select the CompressedTensorsMoEScheme used for infernece.
+        """
+
+        # FusedMoE was made by combining multiple Linears so need to
+        # make sure quantization config for Linear can target it
+        self._add_fused_moe_to_target_scheme_map()
+        unfused_names = [
+            layer_name + proj_name
+            for proj_name in [".0.gate_proj", ".0.up_proj", ".0.down_proj"]
+        ]
+        # TODO: refactor this to use expert_mapping and check all layer numbers
+        all_scheme_dicts = [self.get_scheme_dict(layer, name) for name in unfused_names]
+        scheme_dict = all_scheme_dicts[0] if all_scheme_dicts else None
+
+        # multiple schemes found
+        if not all(d == scheme_dict for d in all_scheme_dicts):
+            raise ValueError(
+                "All MoE projections need to have same "
+                "quantization scheme but found multiple"
+            )
+
+        if scheme_dict is None:  # ignored layer
+            return None
+
+        weight_quant = scheme_dict.get("weights")
+        input_quant = scheme_dict.get("input_activations")
+
+        if self._is_wNa16_group_channel(weight_quant, input_quant):
+            if not _is_npu:
+                if (
+                    self._is_mxint4a16(weight_quant, input_quant)
+                    and get_moe_runner_backend().is_flashinfer_trtllm()
+                ):
+                    logger.info_once(
+                        "Using CompressedTensorsMxInt4MoE with flashinfer_trtllm backend"
+                    )
+                    return CompressedTensorsMxInt4MoE(self)
+                elif _is_hip:
+                    logger.info_once("Using CompressedTensorsWNA16TritonMoE (ROCm)")
+                    return CompressedTensorsWNA16TritonMoE(self)
+                else:
+                    logger.info_once("Using CompressedTensorsWNA16MarlinMoEMethod")
+                    return CompressedTensorsWNA16MoE(self)
+            else:
+                if (
+                    self._is_dynamic_token_w4(weight_quant, input_quant)
+                    and input_quant is None
+                ):
+                    logger.info_once("Using NPUCompressedTensorsW4A16Int4DynamicMoE")
+                    return NPUCompressedTensorsW4A16Int4DynamicMoE(self)
+        elif self._is_fp4a4_nvfp4(weight_quant, input_quant):
+            logger.info_once("Using CompressedTensorsW4A4Nvfp4MoE")
+            return CompressedTensorsW4A4Nvfp4MoE()
+        elif self._is_fp8_w8a8(weight_quant, input_quant):
+            logger.info_once("Using CompressedTensorsW8A8Fp8MoE")
+            return CompressedTensorsW8A8Fp8MoE(weight_quant, input_quant)
+        elif self._is_dynamic_token_w8a8(weight_quant, input_quant):
+            if _is_npu:
+                logger.info_once("Using NPUCompressedTensorsW8A8Int8DynamicMoE")
+                return NPUCompressedTensorsW8A8Int8DynamicMoE(weight_quant, input_quant)
+            else:
+                raise NotImplementedError(
+                    f"The W8A8Int8 Fused MoE scheme is implemented only for NPU for now."
+                )
+        elif self._is_dynamic_token_w4a8(weight_quant, input_quant):
+            if _is_npu:
+                logger.info_once("Using NPUCompressedTensorsW4A8Int8DynamicMoE")
+                return NPUCompressedTensorsW4A8Int8DynamicMoE(self)
+            else:
+                raise NotImplementedError(
+                    f"The W4A8Int8 Fused MoE scheme is implemented only for NPU for now."
+                )
+        else:
+            raise RuntimeError(
+                f"Unsupported FusedMoe scheme: {weight_quant}, {input_quant}"
+            )
+
+    def get_linear_scheme(
+        self, layer: torch.nn.Module, layer_name: Optional[str] = None
+    ) -> Optional[CompressedTensorsLinearScheme]:
+        """
+        compressed-tensors supports non uniform in the following way:
+
+        targets of config_groups: There can be N config_groups which each
+            have a quantization scheme. Each config_group has a list of targets
+            which can be a full layer_name, a regex for a layer_name, or
+            an nn.Module name.
+
+        Detect whether a layer_name is found in any target and
+        use the quantization scheme corresponding to the matched target
+        to select the CompressedTensorsScheme used for infernece.
+        """
+
+        # Find the "target" in the compressed-tensors config
+        # that our layer conforms to.
+        # TODO : add compressed-tensors as dep
+        # so we do not have to re-write these functions
+        # need to make accelerate optional in ct to do this
+
+        # Use the new get_scheme_dict method to extract QuantizationArgs
+        scheme_dict = self.get_scheme_dict(layer, layer_name)
+        weight_quant = None
+        input_quant = None
+        if scheme_dict:
+            weight_quant = scheme_dict.get("weights")
+            input_quant = scheme_dict.get("input_activations")
+
+        # Find the sparsity scheme of the layer
+        # assume that fused layers inerhit first component's sparsity scheme
+        sparsity_targets = self.sparsity_scheme_map.keys() - set(
+            self.sparsity_ignore_list
+        )
+        sparsity_scheme: Optional[SparsityCompressionConfig] = None
+        with suppress(ValueError):
+            matched_target = find_matched_target(
+                layer_name=layer_name,
+                module=layer,
+                targets=sparsity_targets,
+                fused_mapping=self.packed_modules_mapping,
+            )
+            sparsity_scheme = self.sparsity_scheme_map[matched_target]
+
+        if self.supports_cutlass_24(
+            weight_quant=weight_quant,
+            input_quant=input_quant,
+            sparsity_scheme=sparsity_scheme,
+        ):
+            raise ImportError("CompressedTensors24 is not supported now")
+        elif weight_quant is None:
+            logger.warning_once(
+                "Acceleration for non-quantized schemes is "
+                "not supported by Compressed Tensors. "
+                "Falling back to UnquantizedLinearMethod"
+            )
+            return None
+
+        else:
+            # Find the quant_scheme
+            scheme = self._get_scheme_from_parts(  # type: ignore
+                weight_quant=weight_quant,
+                input_quant=input_quant,
+            )
+
+        # Raise error if device does not support the scheme
+        # (e.g. fp8 needs ada lovelace)
+        # Note: NPU devices do not support min_capability function
+        if not _is_npu:
+            self._check_scheme_supported(scheme.get_min_capability())
+        logger.debug("Using scheme: %s for %s", scheme.__class__.__name__, layer_name)
+        return scheme
+
+    def get_scheme_dict(
+        self, layer: torch.nn.Module, layer_name: str | None = None
+    ) -> dict[str, QuantizationArgs | str | None] | None:
+        """
+        Extract the QuantizationArgs for a given layer.
+
+        Returns:
+            dict with {
+                "weights": QuantizationArgs,
+                "input_activations": QuantizationArgs | None,
+                "format": str | None
+            } | None
+        """
+        if should_ignore_layer(
+            layer_name, ignore=self.ignore, fused_mapping=self.packed_modules_mapping
+        ):
+            return None
+
+        # Will be empty for models with only sparsity
+        if self.target_scheme_map:
+            matched_target = find_matched_target(
+                layer_name=layer_name,
+                module=layer,
+                targets=self.target_scheme_map.keys(),
+                fused_mapping=self.packed_modules_mapping,
+            )
+
+            return self.target_scheme_map[matched_target]
+
+        return None
+
+    def get_cache_scale(self, name: str) -> Optional[str]:
+        """
+        Check whether the param name matches the format for k/v cache scales
+        in compressed-tensors. If this is the case, return its equivalent
+        param name expected by vLLM
+
+        :param name: param name
+        :return: matching param name for KV cache scale in vLLM
+        """
+        if name.endswith(".output_scale") and ".k_proj" in name:
+            return name.replace(".k_proj.output_scale", ".attn.k_scale")
+        if name.endswith(".output_scale") and ".v_proj" in name:
+            return name.replace(".v_proj.output_scale", ".attn.v_scale")
+        # If no matches, return None
+        return None
+
+    @staticmethod
+    def supports_cutlass_24(
+        weight_quant: Optional[QuantizationArgs],
+        input_quant: Optional[QuantizationArgs],
+        sparsity_scheme: Optional[SparsityCompressionConfig] = None,
+    ) -> bool:
+        """
+        Check if the layer is supported by the Cutlass 2:4 Kernel
+        Conditions:
+            - Overarching condition: Sparsity Structure is 2:4
+            - Unquantized cases are supported
+            - Weight only quantization is not-supported
+            - Supported weight quantization strategies are TENSOR and CHANNEL
+            - Supported input quantization strategies are TENSOR and TOKEN
+            - Only 8 bit quantization is supported
+
+        :return: True if the layer is supported by the Cutlass 2:4 Kernel
+            False otherwise
+        """
+        if sparsity_scheme is None:
+            return False
+
+        is_valid_sparsity_structure: bool = (
+            sparsity_scheme.sparsity_structure == SparsityStructure.TWO_FOUR.value
+        )
+
+        valid_compressors = {
+            CompressionFormat.dense.value,
+            CompressionFormat.sparse_24_bitmask.value,
+        }
+
+        is_valid_sparsity = (
+            is_valid_sparsity_structure and sparsity_scheme.format in valid_compressors
+        )
+
+        if not is_valid_sparsity:
+            return False
+
+        # Unquantized cases are supported
+        if weight_quant is None and input_quant is None:
+            return True
+
+        # Weight only quantization is not-supported
+        if weight_quant is not None and input_quant is None:
+            return False
+
+        supported_weight_quant_strategies = [
+            QuantizationStrategy.TENSOR.value,
+            QuantizationStrategy.CHANNEL.value,
+        ]
+
+        assert weight_quant is not None
+        assert input_quant is not None
+        if weight_quant.strategy not in supported_weight_quant_strategies:
+            return False
+
+        supported_input_quant_strategies = [
+            QuantizationStrategy.TENSOR.value,
+            QuantizationStrategy.TOKEN.value,
+        ]
+
+        if input_quant.strategy not in supported_input_quant_strategies:
+            return False
+
+        return weight_quant.num_bits == input_quant.num_bits == 8
+
+
+class CompressedTensorsLinearMethod(LinearMethodBase):
+
+    def __init__(self, quantization_config: CompressedTensorsConfig):
+        self.quantization_config = quantization_config
+        self.quant_config = quantization_config
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        layer.scheme.process_weights_after_loading(layer)
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        """
+        Use the CompressedTensorsScheme associated with each layer to create
+        the necessary parameters for the layer. See LinearMethodBase for param
+        details
+        """
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        layer.scheme.create_weights(
+            layer=layer,
+            input_size=input_size,
+            input_size_per_partition=input_size_per_partition,
+            output_partition_sizes=output_partition_sizes,
+            output_size=output_size,
+            params_dtype=params_dtype,
+            weight_loader=weight_loader,
+        )
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ):
+        """
+        Use the output of create_weights and the CompressedTensorsScheme
+        associated with the layer to apply the forward pass with the
+        layer input.  See LinearMethodBase for param details
+
+        """
+
+        scheme = layer.scheme
+        if scheme is None:
+            raise ValueError("A scheme must be defined for each layer")
+        return scheme.apply_weights(layer, x, bias=bias)
+
+
+class CompressedTensorsFusedMoEMethod(FusedMoEMethodBase):
+
+    def __init__(self, quantization_config: CompressedTensorsConfig):
+        self.quantization_config = quantization_config
+        self.quant_config = quantization_config
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        layer.scheme.process_weights_after_loading(layer)
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        """
+        Use the CompressedTensorsScheme associated with each layer to create
+        the necessary parameters for the layer. See LinearMethodBase for param
+        details
+        """
+        layer.scheme.create_weights(
+            layer=layer,
+            num_experts=num_experts,
+            hidden_size=hidden_size,
+            intermediate_size_per_partition=intermediate_size_per_partition,
+            params_dtype=params_dtype,
+            **extra_weight_attrs,
+        )
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        return layer.scheme.create_moe_runner(layer, moe_runner_config)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+        """
+        Use the output of create_weights and the CompressedTensorsScheme
+        associated with the layer to apply the forward pass with the
+        layer input.  See LinearMethodBase for param details
+
+        """
+
+        scheme = layer.scheme
+        if scheme is None:
+            raise ValueError("A scheme must be defined for each layer")
+        return scheme.apply_weights(layer, dispatch_output)
+
+    def apply_weights_with_router_logits(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> torch.Tensor:
+        scheme = layer.scheme
+        if scheme is None:
+            raise ValueError("A scheme must be defined for each layer")
+        return scheme.apply_weights_with_router_logits(layer, dispatch_output)
+
+    def apply_without_routing_weights(
+        self,
+        layer,
+        hidden_states,
+        hidden_states_scale,
+        group_list_type,
+        group_list,
+        output_dtype,
+    ):
+        return layer.scheme.apply_without_routing_weights(
+            layer,
+            hidden_states,
+            hidden_states_scale,
+            group_list_type,
+            group_list,
+            output_dtype,
+        )
diff --git a/sglang/python/sglang/srt/layers/quantization/compressed_tensors/utils.py b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..33d0d05b237b2d865b0f649fa4f8a1fe101cb573
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/compressed_tensors/utils.py
@@ -0,0 +1,219 @@
+# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization/compressed_tensors
+# SPDX-License-Identifier: Apache-2.0
+
+import re
+from types import MappingProxyType
+from typing import Iterable, List, Mapping, Optional
+
+from compressed_tensors import CompressionFormat
+from torch.nn import Module
+
+
+def is_activation_quantization_format(format: str) -> bool:
+    _ACTIVATION_QUANTIZATION_FORMATS = [
+        CompressionFormat.naive_quantized.value,
+        CompressionFormat.int_quantized.value,
+        CompressionFormat.float_quantized.value,
+        CompressionFormat.nvfp4_pack_quantized.value,
+    ]
+    return format in _ACTIVATION_QUANTIZATION_FORMATS
+
+
+def should_ignore_layer(
+    layer_name: Optional[str],
+    ignore: Iterable[str] = tuple(),
+    fused_mapping: Mapping[str, List[str]] = MappingProxyType({}),
+) -> bool:
+    if layer_name is None:
+        return False
+
+    # layer_name = model.layers.0.self_attn.qkv_proj
+    # proj_name = qkv_proj
+    proj_name = layer_name.split(".")[-1]
+
+    # Fused layers like gate_up_proj or qkv_proj will not be fused
+    # in the safetensors checkpoint. So, we convert the name
+    # from the fused version to unfused + check to make sure that
+    # each shard of the fused layer has the same scheme.
+    if proj_name in fused_mapping and layer_name not in ignore:
+        shard_proj_names = fused_mapping[proj_name]
+
+        # Convert fused_name --> [shard_names]
+        shard_names = [
+            layer_name.replace(proj_name, shard_proj_name)
+            for shard_proj_name in shard_proj_names
+        ]
+
+        # Layer should be ignored if shards are ignored.
+        should_ignore_layer = None
+        for shard_name in shard_names:
+            should_ignore_shard = check_equal_or_regex_match(
+                layer_name=shard_name, targets=ignore
+            )
+
+            # If shard_idx=0, set layer ignore to match shard.
+            if should_ignore_layer is None:
+                should_ignore_layer = should_ignore_shard
+
+            # If shard_idx=1+ confirm scheme matches prior shards.
+            elif should_ignore_shard != should_ignore_layer:
+                raise ValueError(
+                    f"Found a different quantization schemes for "
+                    f"{shard_proj_names} in {layer_name}. vLLM "
+                    "requires all to use the same scheme."
+                )
+
+    # Unfused layers like down_proj and o_proj will match
+    # the safetensors checkpoint already.
+    else:
+        should_ignore_layer = check_equal_or_regex_match(
+            layer_name=layer_name, targets=ignore
+        )
+
+    assert should_ignore_layer is not None
+    return should_ignore_layer
+
+
+def check_equal_or_regex_match(layer_name: str, targets: Iterable[str]) -> bool:
+    """
+    Checks whether a layer_name is exactly equal or a regex match for
+    if target starts with 're:' to any target in list.
+    """
+    for target in targets:
+        if _is_equal_or_regex_match(layer_name, target, check_contains=True):
+            return True
+    return False
+
+
+def find_matched_target(
+    layer_name: Optional[str],
+    module: Module,
+    targets: Iterable[str],
+    fused_mapping: Mapping[str, List[str]] = MappingProxyType({}),
+) -> str:
+    """
+    Helper function to look up which "target" in the compressed-tensors
+    config that a layer corresponds to.
+
+    Recall that a compressed-tensors configs has a concept of
+    config_groups, where each layer can be quantized with with a different
+    scheme.
+
+    targets in each config_group will be a list of either layer names
+    (or regexes corresponding to layer names) or names of torch Modules.
+
+    First, we try to match the layer_name with a target
+    Second, we try to match the module's name with a target
+    Third, we try to map the layer_name to a list of fused module names.
+        *All* component module names must match in order for a match to be
+        successful. A successful match returns the first component target
+
+    :param layer_name: layer name
+    :param module: torch.nn.Module
+    :param targets: list of targets to match the layer against
+    :param fused_mapping: map from fused layer names to its components
+    :param fused_strategy: either "all" or "any". If using "all", fused
+        layers match if "all" of its components match
+    """
+
+    if layer_name is None:
+        layer_name = ""
+
+    matched_target = (
+        _find_first_match(layer_name, targets)
+        or _find_first_match(module.__class__.__name__, targets, True)
+        or _match_fused_layer(layer_name, targets, fused_mapping)
+    )
+
+    if matched_target is None:
+        raise ValueError(
+            f"Unable to find matching target for {layer_name} in the "
+            "compressed-tensors config."
+        )
+
+    return matched_target
+
+
+def _find_first_match(
+    value: str, targets: Iterable[str], check_contains: bool = False
+) -> Optional[str]:
+    """
+    Returns first element of target that matches value either
+    exactly or as a regex after 're:'. If check_contains is set to True,
+    additionally checks if the target string is contained within the value.
+
+    :param value: string to compare the list of targets against
+    :param targets: list of targets to match the layer against
+    :param check_contains: whether or not to do a substring match
+    """
+
+    for target in targets:
+        if _is_equal_or_regex_match(value, target, check_contains=check_contains):
+            return target
+    return None
+
+
+def _is_equal_or_regex_match(
+    value: str, target: str, check_contains: bool = False
+) -> bool:
+    """
+    Checks whether a value is exactly equal or a regex match for target
+    if target starts with 're:'. If check_contains is set to True,
+    additionally checks if the target string is contained within the value.
+    """
+
+    if target.startswith("re:"):
+        pattern = target[3:]
+        if re.match(pattern, value):
+            return True
+    elif check_contains:
+        if target.lower() in value.lower():
+            return True
+    elif target == value:
+        return True
+    return False
+
+
+def _match_fused_layer(
+    layer_name: str,
+    target_layers: Iterable[str],
+    fused_mapping: Mapping[str, List[str]],
+) -> Optional[str]:
+    """
+    Match a fused layer name to its corresponding individual layer in
+    target_layers. Returns first value in fused_mapping which matches targets
+
+    Implements an "all" matching strategy where a fused layer matches iff
+    "all" of its components match
+
+    :param layer_name: layer name
+    :param target_layers: list of targets to match the layer against
+    :param fused_mapping: map from fused layer names to its components
+
+    Examples:
+        layer_name = "model.layers.0.self_attn.qkv_proj"
+        target_layers = ["model.layers.0.self_attn.q_proj",
+                        "model.layers.0.self_attn.k_proj",
+                        "model.layers.0.self_attn.v_proj"]
+    """
+    # find layer_name in mapping
+    fused = next((key for key in fused_mapping if layer_name.endswith(key)), None)
+    if fused is None:
+        return None
+
+    # expand path of unfused components
+    unfused_paths = [
+        layer_name.replace(fused, unfused) for unfused in fused_mapping[fused]
+    ]
+
+    # for each unfused component, find a match in targets
+    unfused_matches: List[Optional[str]] = []
+    for unfused in unfused_paths:
+        for target in target_layers:
+            if _is_equal_or_regex_match(unfused, target):
+                unfused_matches.append(target)
+                break
+        else:
+            unfused_matches.append(None)
+
+    return unfused_matches[0] if all(unfused_matches) else None
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..46a982f5ee9a4bd67ce244b101c576efeeb53b78
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..a5679be1310574d178bd24838d1821b3d13fbbf8
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,26 @@
+{
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..62ccf8e6f9bc2559bd174f3c4f2c6395a83f10a9
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..851bc9f9f0b50b41451b929eaa518869b6a05412
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..6c2eebe0fa0f377fdeec62001d9ab94b005f6991
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..6f5adbb9361229bc24b19098e3711a3fd15e939b
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..e76d41dfd77b5c5cd5a87b66d40ce49232ade2a6
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..1c61451fb34e52deec827f8f63c80fb15830c202
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..63e661c80de6a7b1422f7a994a2ee7a4b724911c
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..d521e2bd0e39945197e766ced0c354eea7bfe666
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,26 @@
+{
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..c65a16f3dcc4f28c35655cf9dd1f8b658d26d28c
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..a8ab31bfe7d74b4090bacd31faf3f9d53fede4d3
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..1477e063d5ccfd724bee9b09a533bba73e234088
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,26 @@
+{
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..6f5adbb9361229bc24b19098e3711a3fd15e939b
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..80d23e9692a6a4b83b537918950c319768d98f3e
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..0a5d7bfdba4852da9ed08d1bc27cd7d521d09965
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..97034527030137922b9dc1f8b1820750092da6aa
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..4225c78eb72cb55c7a65b72d27fab9b4f33cc87a
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..933170b7be9b9bcd30165418fcbc001b39a56f52
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..797ee3d45e49dd97b6d3d17c71e20840dc31000a
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..7febe3d272b4bb76500f7c6b523396129fd53680
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..56b939e52fac3ed53a4e0ba640c40010cb3af30a
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..7fa398c15a2a535401709b0f25e20f6e4b23e58e
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..f15d8f64c7090bd71d0091a524c65d7818fec38e
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..4225c78eb72cb55c7a65b72d27fab9b4f33cc87a
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..a5518d97987501b814ab2a735e9f72f69cdee2cc
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..932321ccaa9485479490122b3490ea167a855842
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..44c67c008882db68d0b25ecb79f678029e724f89
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..5ffd367df833d773355590220598a3c7eceba4e0
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=5120,K=3200,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=5120,K=3200,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..c714b7f1928c6c0f0607b2457650f3f64a8b63a4
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=5120,K=3200,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,26 @@
+{
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..dcbb0efc53e471434447c9a1402a7ba01b73ed91
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..dcbb0efc53e471434447c9a1402a7ba01b73ed91
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..386928de139ce718f28222b9c1a6555df3958491
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 256,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..a8ab31bfe7d74b4090bacd31faf3f9d53fede4d3
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..044868d2de39b2b16f8eabd9f5b574cce4f1524f
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..246062a221d9e9ee2f0ac8a3ea39b1561fa09302
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..a3a434e129efbb4c95da08d6e80cffcf2f5ae118
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..2f3962f6865953767d55b60f9797ebb7c54d7696
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,26 @@
+{
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..bfdff8adcbd9a08227835d2297b22cec871455dc
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..2bf5eb27e38208871d50348b170c8c74b80fc519
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..871a0cf2e90f65000984a2443eff1108d4b75131
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..dfe5c1e43d687e22d066295483e3324a803feb87
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..9c908e80406587da4d246ce4e3a8a98a14c875b1
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..15b1c93f60fc5068ba11b82b6d5924dd2024a824
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..59b79432f516d01a18198ee9aa7efcae31452f16
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..f78e7060e6840ff721d306db556636b0bbc8d9b3
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..8ff12e64c172f5a5d0fbdf900728fe60b33877e2
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..a3f5371f1ce4ff0829bda4cc62cf1f315ad9e95a
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,26 @@
+{
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..dfe5c1e43d687e22d066295483e3324a803feb87
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..72ab3dff208b57e42ad64241a0116768a9360ef6
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 8,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..1d3ce5c94c2d9a4a1637204efb3b14f7a5579bdb
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..ca7f32b9552b479dc05495792b7e426db5eb1b56
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..12e3058535401290e3cafa7a94c9514cf388872b
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..5acea242cc0ad094cba8ee5f568ff88afb1b41ae
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..a87f5de1b1830890ec665b9b37975c78e21e1caf
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..3ab5796ee15b6ec8d4ab1f4ab5a594fecb30e4b4
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "96": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "128": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 5
+    },
+    "256": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 4
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..468f9e78da00ceb18b8650424b48bcff980a25a9
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json
new file mode 100644
index 0000000000000000000000000000000000000000..468f9e78da00ceb18b8650424b48bcff980a25a9
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json	
@@ -0,0 +1,164 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "512": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 4,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "waves_per_eu": 0
+    }
+}
diff --git a/sglang/python/sglang/srt/layers/quantization/fp4_utils.py b/sglang/python/sglang/srt/layers/quantization/fp4_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..a40074770fc906b3dca95cff1e6412c589017539
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/fp4_utils.py
@@ -0,0 +1,86 @@
+from __future__ import annotations
+
+import logging
+from enum import Enum
+from typing import TYPE_CHECKING
+
+from sglang.srt.environ import envs
+
+if TYPE_CHECKING:
+    from sglang.srt.server_args import ServerArgs
+
+logger = logging.getLogger(__name__)
+
+
+class Fp4GemmRunnerBackend(Enum):
+    """Enum for FP4 GEMM runner backend selection."""
+
+    AUTO = "auto"
+    FLASHINFER_CUDNN = "flashinfer_cudnn"
+    FLASHINFER_CUTLASS = "flashinfer_cutlass"
+    FLASHINFER_TRTLLM = "flashinfer_trtllm"
+
+    def is_auto(self) -> bool:
+        return self == Fp4GemmRunnerBackend.AUTO
+
+    def is_flashinfer_cudnn(self) -> bool:
+        return self == Fp4GemmRunnerBackend.FLASHINFER_CUDNN
+
+    def is_flashinfer_cutlass(self) -> bool:
+        return self == Fp4GemmRunnerBackend.FLASHINFER_CUTLASS
+
+    def is_flashinfer_trtllm(self) -> bool:
+        return self == Fp4GemmRunnerBackend.FLASHINFER_TRTLLM
+
+    def get_flashinfer_backend(self) -> str:
+        """Get the backend string to pass to FlashInfer's mm_fp4 API.
+
+        This remaps SGLang's user-facing backend names to FlashInfer's API names.
+        Examples:
+            'flashinfer_trtllm' -> 'trtllm'
+            'flashinfer_cutlass' -> 'cutlass'
+            'flashinfer_cudnn' -> 'cudnn'
+        """
+        if self.value.startswith("flashinfer_"):
+            return self.value.removeprefix("flashinfer_")
+        else:
+            return self.value
+
+
+FP4_GEMM_RUNNER_BACKEND: Fp4GemmRunnerBackend | None = None
+
+
+def initialize_fp4_gemm_config(server_args: ServerArgs) -> None:
+    """Initialize FP4 GEMM configuration from server args."""
+    global FP4_GEMM_RUNNER_BACKEND
+
+    backend = server_args.fp4_gemm_runner_backend
+
+    # Handle deprecated env var for backward compatibility
+    # TODO: Remove this in a future version
+    if envs.SGLANG_FLASHINFER_FP4_GEMM_BACKEND.is_set():
+        env_backend = envs.SGLANG_FLASHINFER_FP4_GEMM_BACKEND.get()
+        if backend == "auto":
+            logger.warning(
+                "SGLANG_FLASHINFER_FP4_GEMM_BACKEND is deprecated. "
+                f"Please use '--fp4-gemm-backend={env_backend}' instead."
+            )
+            if not env_backend.startswith("flashinfer_"):
+                env_backend = "flashinfer_" + env_backend
+            backend = env_backend
+        else:
+            logger.warning(
+                f"FP4 GEMM backend set to '{backend}' via --fp4-gemm-backend overrides "
+                "environment variable SGLANG_FLASHINFER_FP4_GEMM_BACKEND. "
+                "Using server argument value."
+            )
+
+    FP4_GEMM_RUNNER_BACKEND = Fp4GemmRunnerBackend(backend)
+
+
+def get_fp4_gemm_runner_backend() -> Fp4GemmRunnerBackend:
+    """Get the current FP4 GEMM runner backend."""
+    global FP4_GEMM_RUNNER_BACKEND
+    if FP4_GEMM_RUNNER_BACKEND is None:
+        FP4_GEMM_RUNNER_BACKEND = Fp4GemmRunnerBackend.AUTO
+    return FP4_GEMM_RUNNER_BACKEND
diff --git a/sglang/python/sglang/srt/layers/quantization/fp8.py b/sglang/python/sglang/srt/layers/quantization/fp8.py
new file mode 100644
index 0000000000000000000000000000000000000000..24757958430c368114cf7ee5632bd40d2209aa28
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/fp8.py
@@ -0,0 +1,1697 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/layers/quantization/fp8.py
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch.nn import Module
+from torch.nn.parameter import Parameter
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size, get_tp_group
+from sglang.srt.distributed.device_communicators.pynccl_allocator import (
+    use_symmetric_memory,
+)
+from sglang.srt.environ import envs
+from sglang.srt.layers.amx_utils import (
+    CPUQuantMethod,
+    _amx_process_weight_after_loading,
+)
+from sglang.srt.layers.dp_attention import is_allocation_symmetric
+from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig
+from sglang.srt.layers.moe.moe_runner.deep_gemm import DeepGemmMoeQuantInfo
+from sglang.srt.layers.moe.moe_runner.flashinfer_trtllm import (
+    FlashInferTrtllmFp8MoeQuantInfo,
+)
+from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo
+from sglang.srt.layers.moe.utils import RoutingMethodType, get_moe_runner_backend
+from sglang.srt.layers.parameter import (
+    BlockQuantScaleParameter,
+    ModelWeightParameter,
+    PerTensorScaleParameter,
+)
+from sglang.srt.layers.quantization.base_config import (
+    FusedMoEMethodBase,
+    LinearMethodBase,
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.fp8_kernel import (
+    fp8_dtype,
+    is_fp8_fnuz,
+    per_token_group_quant_fp8,
+    scaled_fp8_quant,
+)
+from sglang.srt.layers.quantization.fp8_utils import (
+    apply_fp8_linear,
+    can_auto_enable_marlin_fp8,
+    cutlass_fp8_supported,
+    dispatch_w8a8_block_fp8_linear,
+    input_to_float8,
+    mxfp8_group_quantize,
+    normalize_e4m3fn_to_e4m3fnuz,
+    requant_weight_ue8m0_inplace,
+    triton_mxfp8_blockscaled_linear,
+)
+from sglang.srt.layers.quantization.kv_cache import BaseKVCacheMethod
+from sglang.srt.layers.quantization.marlin_utils_fp8 import (
+    apply_fp8_marlin_linear,
+    prepare_fp8_layer_for_marlin,
+)
+from sglang.srt.layers.quantization.unquant import (
+    UnquantizedFusedMoEMethod,
+    UnquantizedLinearMethod,
+)
+from sglang.srt.layers.quantization.utils import (
+    all_close_1d,
+    convert_to_channelwise,
+    is_layer_skipped,
+    per_tensor_dequantize,
+    requantize_with_max_scale,
+)
+from sglang.srt.utils import (
+    cpu_has_amx_support,
+    get_bool_env_var,
+    is_cpu,
+    is_cuda,
+    is_hip,
+    is_npu,
+    is_sm90_supported,
+    is_sm100_supported,
+    is_sm120_supported,
+    log_info_on_rank0,
+    print_warning_once,
+    set_weight_attrs,
+    use_intel_amx_backend,
+)
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.token_dispatcher import CombineInput, DispatchOutput
+    from sglang.srt.layers.moe.topk import TopKOutput
+    from sglang.srt.layers.quantization.w4afp8 import W4AFp8Config
+
+_is_hip = is_hip()
+_is_cuda = is_cuda()
+_is_npu = is_npu()
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
+_is_fp8_fnuz = is_fp8_fnuz()
+_use_hip_int4 = get_bool_env_var("SGLANG_INT4_WEIGHT") and _is_hip
+_use_aiter = envs.SGLANG_USE_AITER.get() and _is_hip
+
+if _use_aiter or _use_hip_int4:
+    from aiter import ActivationType, QuantType
+    from aiter.fused_moe import fused_moe
+    from aiter.ops.shuffle import shuffle_weight
+
+
+ACTIVATION_SCHEMES = ["static", "dynamic"]
+
+logger = logging.getLogger(__name__)
+
+
+class Fp8Config(QuantizationConfig):
+    """Config class for FP8."""
+
+    def __init__(
+        self,
+        is_checkpoint_fp8_serialized: bool = False,
+        activation_scheme: str = "dynamic",
+        ignored_layers: Optional[List[str]] = None,
+        weight_block_size: List[int] = None,
+        packed_modules_mapping: Optional[Dict[str, List[str]]] = None,
+        use_mxfp8: bool = False,
+    ) -> None:
+        super().__init__()
+        self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
+        if is_checkpoint_fp8_serialized:
+            log_info_on_rank0(logger, "Detected fp8 checkpoint.")
+        if activation_scheme not in ACTIVATION_SCHEMES:
+            raise ValueError(f"Unsupported activation scheme {activation_scheme}")
+        self.activation_scheme = activation_scheme
+        self.ignored_layers = ignored_layers or []
+        self.packed_modules_mapping = packed_modules_mapping or {}
+        self.use_mxfp8 = use_mxfp8
+        if weight_block_size is not None:
+            if not is_checkpoint_fp8_serialized:
+                raise ValueError(
+                    f"The block-wise quantization only supports fp8-serialized checkpoint for now."
+                )
+            if len(weight_block_size) != 2:
+                raise ValueError(
+                    f"The quantization block size of weight must have 2 dimensions, but got {len(weight_block_size)} dimensions."
+                )
+            if activation_scheme != "dynamic":
+                raise ValueError(
+                    f"The block-wise quantization only supports dynamic activation scheme for now, but got {activation_scheme} activation scheme."
+                )
+        if self.use_mxfp8:
+            if weight_block_size is None:
+                weight_block_size = [1, 32]
+            elif weight_block_size != [1, 32]:
+                raise ValueError("MXFP8 requires weight_block_size=[1, 32].")
+        self.weight_block_size = weight_block_size
+
+    def get_name(self) -> str:
+        return "mxfp8" if self.use_mxfp8 else "fp8"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.bfloat16, torch.half]
+
+    def get_min_capability(self) -> int:
+        return 100 if self.use_mxfp8 else 80
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return []
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> Fp8Config:
+        quant_method = cls.get_from_keys(config, ["quant_method"])
+        use_mxfp8 = "mxfp8" in quant_method
+        is_checkpoint_fp8_serialized = ("fp8" in quant_method) or use_mxfp8
+        activation_scheme = cls.get_from_keys(config, ["activation_scheme"])
+        packed_modules_mapping = (
+            cls.get_from_keys_or(config, ["packed_modules_mapping"], {}) or {}
+        )
+        ignored_layers = cls.get_from_keys_or(
+            config, ["ignored_layers", "modules_to_not_convert"], None
+        )
+        if ignored_layers:
+            # Keep both "model." and non-"model." variants for robust prefix matching.
+            normalized = []
+            for layer in ignored_layers:
+                base = layer.removeprefix("model.")
+                normalized.append(base)
+                normalized.append(f"model.{base}")
+            ignored_layers = normalized
+        weight_block_size = cls.get_from_keys_or(config, ["weight_block_size"], None)
+        if use_mxfp8 and weight_block_size is not None:
+            logger.warning(
+                "MXFP8 ignoring incoming weight_block_size in config.json; it is fixed to [1, 32]."
+            )
+            weight_block_size = [1, 32]
+        return cls(
+            is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized,
+            activation_scheme=activation_scheme,
+            ignored_layers=ignored_layers,
+            weight_block_size=weight_block_size,
+            packed_modules_mapping=packed_modules_mapping,
+            use_mxfp8=use_mxfp8,
+        )
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional[QuantizeMethodBase]:
+        from sglang.srt.layers.linear import LinearBase
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+        from sglang.srt.layers.radix_attention import RadixAttention
+
+        if isinstance(layer, LinearBase):
+            if is_layer_skipped(
+                prefix, self.ignored_layers, fused_mapping=self.packed_modules_mapping
+            ):
+                return UnquantizedLinearMethod()
+            return Fp8LinearMethod(self)
+        elif isinstance(layer, FusedMoE):
+            if is_layer_skipped(
+                prefix, self.ignored_layers, fused_mapping=self.packed_modules_mapping
+            ):
+                return UnquantizedFusedMoEMethod(
+                    layer.use_triton_kernels, layer.use_flashinfer_trtllm_moe
+                )
+            return Fp8MoEMethod(self)
+        elif isinstance(layer, RadixAttention):
+            return Fp8KVCacheMethod(self)
+        return None
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+class Fp8LinearMethod(LinearMethodBase):
+    """Linear method for FP8.
+
+    It supports the following quantization schemes:
+    - Per-channel weight quantization + per-token activation quantization
+    - Per-tensor weight quantization + per-tensor activation quantization
+    - Blockwise weight quantization + blockwise activation quantization
+
+    It supports the following checkpoint formats:
+    - FP8 checkpoint
+    - FP16/BF16 checkpoint. In this case, the weights will be quantized to FP8 during the weight loading.
+
+    Notes:
+    - The activation quantization scheme can be static or dynamic. The dynamic activation quantization is more commonly used.
+    - On NV platforms, the per-channel weight quantization is used by default, if block quantization is not enabled.
+
+    Args:
+        quant_config: The quantization config.
+    """
+
+    def __init__(self, quant_config: Union[Fp8Config, W4AFp8Config]):
+        self.quant_config = quant_config
+        self.cutlass_fp8_supported = cutlass_fp8_supported()
+
+        # For GPUs that lack FP8 hardware support, we can leverage the Marlin
+        # kernel for fast weight-only FP8 quantization
+        self.use_marlin = False
+        if _is_cuda:
+            force_marlin = get_bool_env_var("SGLANG_FORCE_FP8_MARLIN")
+            auto_enable = can_auto_enable_marlin_fp8()
+            self.use_marlin = force_marlin or auto_enable
+
+        self.use_mxfp8 = getattr(self.quant_config, "use_mxfp8", False)
+        self.block_quant = (
+            self.use_mxfp8 or self.quant_config.weight_block_size is not None
+        )
+        self.w8a8_block_fp8_linear = dispatch_w8a8_block_fp8_linear()
+        self.is_checkpoint_fp8_serialized = (
+            self.quant_config.is_checkpoint_fp8_serialized
+        )
+        self.use_aiter_fp8_per_token = envs.SGLANG_USE_AITER_FP8_PER_TOKEN.get()
+        self.use_per_token_if_dynamic = False
+
+    def validate_block_quant_shapes(
+        self,
+        input_size: int,
+        input_size_per_partition: int,
+        output_size: int,
+        output_size_per_partition: int,
+        output_partition_sizes: List[int],
+        skip_block_quant_check: bool = False,
+    ):
+        tp_size = get_tensor_model_parallel_world_size()
+        block_n, block_k = (
+            self.quant_config.weight_block_size[0],
+            self.quant_config.weight_block_size[1],
+        )
+
+        if skip_block_quant_check:
+            print_warning_once(
+                "Skipping block quantization checks for weight partition."
+            )
+        else:
+            # Required by row parallel
+            if tp_size > 1 and input_size // input_size_per_partition == tp_size:
+                if input_size_per_partition % block_k != 0:
+                    raise ValueError(
+                        f"Weight input_size_per_partition = "
+                        f"{input_size_per_partition} is not divisible by "
+                        f"weight quantization block_k = {block_k}."
+                    )
+            # Required by column parallel or enabling merged weights
+            if (
+                tp_size > 1 and output_size // output_size_per_partition == tp_size
+            ) or len(output_partition_sizes) > 1:
+                for output_partition_size in output_partition_sizes:
+                    if output_partition_size % block_n != 0:
+                        raise ValueError(
+                            f"Weight output_partition_size = "
+                            f"{output_partition_size} is not divisible by "
+                            f"weight quantization block_n = {block_n}."
+                        )
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        skip_block_quant_check: bool = False,
+        **extra_weight_attrs,
+    ):
+        # Copy the layer attributes
+        output_size_per_partition = sum(output_partition_sizes)
+        layer.logical_widths = output_partition_sizes
+        layer.input_size_per_partition = input_size_per_partition
+        layer.output_size_per_partition = output_size_per_partition
+        layer.orig_dtype = params_dtype
+        weight_loader = extra_weight_attrs.get("weight_loader")
+
+        if self.block_quant:
+            block_n, block_k = self.quant_config.weight_block_size
+            self.validate_block_quant_shapes(
+                input_size,
+                input_size_per_partition,
+                output_size,
+                output_size_per_partition,
+                output_partition_sizes,
+                skip_block_quant_check,
+            )
+
+        # Create the weight
+        weight_dtype = (
+            torch.float8_e4m3fn if self.is_checkpoint_fp8_serialized else params_dtype
+        )
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                output_size_per_partition, input_size_per_partition, dtype=weight_dtype
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight", weight)
+
+        # If checkpoint is serialized fp8, load them.
+        # Otherwise, wait until process_weights_after_loading.
+        if self.is_checkpoint_fp8_serialized:
+            # WEIGHT SCALE
+            if self.block_quant:
+                if hasattr(self.quant_config, "activation_scheme"):
+                    assert self.quant_config.activation_scheme == "dynamic"
+                elif hasattr(self.quant_config, "linear_activation_scheme"):
+                    assert self.quant_config.linear_activation_scheme == "dynamic"
+                if self.use_mxfp8 and not self.is_checkpoint_fp8_serialized:
+                    raise ValueError(
+                        "MXFP8 requires fp8-serialized checkpoint for linear layers."
+                    )
+                scale_dtype = torch.uint8 if self.use_mxfp8 else torch.float32
+                scale_init = torch.zeros if scale_dtype == torch.uint8 else torch.empty
+                scale = BlockQuantScaleParameter(
+                    data=scale_init(
+                        (output_size_per_partition + block_n - 1) // block_n,
+                        (input_size_per_partition + block_k - 1) // block_k,
+                        dtype=scale_dtype,
+                    ),
+                    input_dim=1,
+                    output_dim=0,
+                    weight_loader=weight_loader,
+                )
+                scale.format_ue8m0 = self.use_mxfp8
+                if scale_dtype != torch.uint8:
+                    scale[:] = torch.finfo(torch.float32).min
+                layer.register_parameter("weight_scale_inv", scale)
+            else:
+                scale = PerTensorScaleParameter(
+                    data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+                    weight_loader=weight_loader,
+                )
+                scale[:] = torch.finfo(torch.float32).min
+                layer.register_parameter("weight_scale", scale)
+
+            # INPUT ACTIVATION SCALE
+            if (
+                hasattr(self.quant_config, "activation_scheme")
+                and self.quant_config.activation_scheme == "static"
+            ) or (
+                hasattr(self.quant_config, "linear_activation_scheme")
+                and self.quant_config.linear_activation_scheme == "static"
+            ):
+                scale = PerTensorScaleParameter(
+                    data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+                    weight_loader=weight_loader,
+                )
+
+                scale[:] = torch.finfo(torch.float32).min
+                layer.register_parameter("input_scale", scale)
+            else:
+                layer.register_parameter("input_scale", None)
+
+    def process_weights_after_loading_block_quant(self, layer: Module) -> None:
+        # If ROCm, normalize the weights and scales to e4m3fnuz
+        if _is_fp8_fnuz:
+            # activation_scheme: dynamic
+            weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                weight=layer.weight,
+                weight_scale=layer.weight_scale_inv,
+                input_scale=None,
+            )
+            layer.input_scale = None
+        elif _is_cpu:
+            assert (
+                _is_cpu_amx_available
+            ), "Fp8LinearMethod on CPU requires that CPU has AMX support"
+            _amx_process_weight_after_loading(layer, ["weight"])
+            layer.weight_scale_inv = torch.nn.Parameter(
+                layer.weight_scale_inv.data, requires_grad=False
+            )
+            return
+        elif self.use_mxfp8:
+            if not self.is_checkpoint_fp8_serialized:
+                self._quantize_mxfp8_weights(layer)
+                return
+            # MXFP8 scales are stored as UE8M0 uint8; no requantization here.
+            # Keep parameter object to preserve weight_loader attrs for hot reload.
+            layer.weight_scale_inv.requires_grad_(False)
+            layer.weight_scale_inv.format_ue8m0 = True
+            return
+        else:
+            # For fp8 linear weights run with deepgemm, the weights and scales need be requantized to ue8m0
+            from sglang.srt.layers.quantization.fp8_utils import (
+                deepgemm_w8a8_block_fp8_linear_with_fallback,
+            )
+            from sglang.srt.model_loader.utils import (
+                should_deepgemm_weight_requant_ue8m0,
+            )
+
+            if (
+                should_deepgemm_weight_requant_ue8m0(
+                    weight_block_size=getattr(
+                        self.quant_config, "weight_block_size", None
+                    ),
+                )
+                and (
+                    self.w8a8_block_fp8_linear
+                    is deepgemm_w8a8_block_fp8_linear_with_fallback
+                )
+                and (not layer.weight_scale_inv.format_ue8m0)
+            ):
+                requant_weight_ue8m0_inplace(
+                    layer.weight,
+                    layer.weight_scale_inv,
+                    self.quant_config.weight_block_size,
+                )
+                layer.weight_scale_inv.format_ue8m0 = True
+            weight, weight_scale = layer.weight.data, layer.weight_scale_inv.data
+
+        layer.weight.data = weight.data
+        layer.weight_scale_inv.data = weight_scale.data
+
+    def _quantize_mxfp8_weights(self, layer: Module) -> None:
+        weight = layer.weight.data
+        qweight, weight_scale = mxfp8_group_quantize(weight)
+        # Keep parameter objects to preserve weight_loader attrs for hot reload.
+        layer.weight.data = qweight
+        layer.weight.requires_grad_(False)
+        if hasattr(layer, "weight_scale_inv") and layer.weight_scale_inv is not None:
+            layer.weight_scale_inv.data = weight_scale
+            layer.weight_scale_inv.requires_grad_(False)
+        else:
+            # First-time online MXFP8 quantization (no serialized scales).
+            layer.register_parameter(
+                "weight_scale_inv", Parameter(weight_scale, requires_grad=False)
+            )
+        layer.weight_scale_inv.format_ue8m0 = True
+        layer.input_scale = None
+
+    def process_weights_after_loading(self, layer: Module) -> None:
+        if self.block_quant:
+            self.process_weights_after_loading_block_quant(layer)
+        else:
+            layer.weight = Parameter(layer.weight.data, requires_grad=False)
+
+            # If checkpoint not serialized fp8, quantize the weights.
+            if not self.is_checkpoint_fp8_serialized:
+                if (
+                    self.cutlass_fp8_supported
+                    or self.use_marlin
+                    or (_use_aiter and self.use_aiter_fp8_per_token)
+                ):
+                    # apply per-channel quantization default as
+                    # cutlass sgl-kernel and marlin only support per-channel scale
+                    qweight, weight_scale = per_token_group_quant_fp8(
+                        layer.weight, layer.weight.shape[-1]
+                    )
+                    weight_scale = weight_scale.t().contiguous()
+                    if _use_aiter and self.use_aiter_fp8_per_token:
+                        self.use_per_token_if_dynamic = True
+                        qweight = shuffle_weight(qweight.contiguous(), (16, 16))
+                else:
+                    # per-tensor quantization
+                    qweight, weight_scale = input_to_float8(layer.weight)
+
+                # Update the layer with the new values.
+                layer.weight = Parameter(qweight.t(), requires_grad=False)
+                layer.weight_scale = Parameter(weight_scale, requires_grad=False)
+                layer.input_scale = None
+
+            # If checkpoint is fp8, handle that there are N scales for N
+            # shards in a fused module
+            else:
+                layer.weight_scale = Parameter(
+                    layer.weight_scale.data, requires_grad=False
+                )
+                if (
+                    hasattr(self.quant_config, "activation_scheme")
+                    and self.quant_config.activation_scheme == "static"
+                ) or (
+                    hasattr(self.quant_config, "linear_activation_scheme")
+                    and self.quant_config.linear_activation_scheme == "static"
+                ):
+                    layer.input_scale = Parameter(
+                        layer.input_scale.data, requires_grad=False
+                    )
+
+                # cutlass sgl-kernel and marlin only support per-channel scale; aiter supports per-channel scale
+                if (
+                    self.cutlass_fp8_supported
+                    or self.use_marlin
+                    or (_use_aiter and self.use_aiter_fp8_per_token)
+                ):
+                    weight = layer.weight
+                    weight_scale = convert_to_channelwise(
+                        layer.weight_scale, layer.logical_widths
+                    )
+                    if _use_aiter and self.use_aiter_fp8_per_token:
+                        # Otherwise, by default, aiter only uses per-tensor quantization
+                        self.use_per_token_if_dynamic = True
+                        if _is_fp8_fnuz:
+                            weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                                weight=weight,
+                                weight_scale=weight_scale,
+                            )
+                        weight = shuffle_weight(weight.contiguous(), (16, 16))
+                else:
+                    # Dequant -> Quant with max scale so we can run per tensor.
+                    weight = layer.weight
+                    weight_scale = layer.weight_scale
+                    # If ROCm, normalize the weights and scales to e4m3fnuz
+                    if _is_fp8_fnuz:
+                        weight, weight_scale, input_scale = (
+                            normalize_e4m3fn_to_e4m3fnuz(
+                                weight=weight,
+                                weight_scale=weight_scale,
+                                input_scale=layer.input_scale,
+                            )
+                        )
+                        if input_scale is not None:
+                            layer.input_scale = Parameter(
+                                input_scale, requires_grad=False
+                            )
+
+                    weight_scale, weight = requantize_with_max_scale(
+                        weight=weight,
+                        weight_scale=weight_scale,
+                        logical_widths=layer.logical_widths,
+                    )
+
+                # Update layer with new values.
+                layer.weight = Parameter(weight.t(), requires_grad=False)
+                layer.weight_scale = Parameter(weight_scale, requires_grad=False)
+                if (
+                    hasattr(self.quant_config, "activation_scheme")
+                    and self.quant_config.activation_scheme == "static"
+                ) or (
+                    hasattr(self.quant_config, "linear_activation_scheme")
+                    and self.quant_config.linear_activation_scheme == "static"
+                ):
+                    layer.input_scale = Parameter(
+                        layer.input_scale.max(), requires_grad=False
+                    )
+
+        if self.use_marlin:
+            if self.block_quant:
+                layer.weight_block_size = self.quant_config.weight_block_size
+            prepare_fp8_layer_for_marlin(layer, not self.block_quant)
+            # Activations not quantized for marlin.
+            del layer.input_scale
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if self.use_marlin:
+            return apply_fp8_marlin_linear(
+                input=x,
+                weight=layer.weight,
+                weight_scale=layer.weight_scale,
+                workspace=layer.workspace,
+                size_n=layer.output_size_per_partition,
+                size_k=layer.input_size_per_partition,
+                bias=bias,
+            )
+
+        if self.use_mxfp8:
+            if isinstance(x, tuple):
+                return triton_mxfp8_blockscaled_linear(
+                    input=x[0],
+                    weight=layer.weight,
+                    weight_scale=layer.weight_scale_inv,
+                    input_scale=x[1],
+                    bias=bias,
+                )
+            return triton_mxfp8_blockscaled_linear(
+                input=x,
+                weight=layer.weight,
+                weight_scale=layer.weight_scale_inv,
+                input_scale=None,
+                bias=bias,
+            )
+
+        if self.block_quant:
+            if use_intel_amx_backend(layer):
+                return torch.ops.sgl_kernel.fp8_scaled_mm_cpu(
+                    x,
+                    layer.weight,
+                    layer.weight_scale_inv,
+                    self.quant_config.weight_block_size,
+                    bias,
+                    x.dtype,
+                    True,  # is_vnni
+                )
+
+            if isinstance(x, tuple):
+                return self.w8a8_block_fp8_linear(
+                    input=x[0],
+                    weight=layer.weight,
+                    block_size=self.quant_config.weight_block_size,
+                    weight_scale=layer.weight_scale_inv,
+                    input_scale=x[1],
+                    bias=bias,
+                )
+
+            return self.w8a8_block_fp8_linear(
+                input=x,
+                weight=layer.weight,
+                block_size=self.quant_config.weight_block_size,
+                weight_scale=layer.weight_scale_inv,
+                input_scale=None,
+                bias=bias,
+            )
+
+        return apply_fp8_linear(
+            input=x,
+            weight=layer.weight,
+            weight_scale=layer.weight_scale,
+            input_scale=layer.input_scale,
+            bias=bias,
+            cutlass_fp8_supported=self.cutlass_fp8_supported,
+            use_per_token_if_dynamic=self.use_per_token_if_dynamic,
+        )
+
+
+class Fp8MoEMethod(FusedMoEMethodBase):
+    """MoE method for FP8.
+    Supports loading FP8 checkpoints with static weight scale and
+    dynamic/static activation scale.
+
+    Also supports loading quantized FP16/BF16 model checkpoints with dynamic
+    activation scaling. The weight scaling factor will be initialized after
+    the model weights are loaded.
+
+    Args:
+        quant_config: The quantization config.
+    """
+
+    def __init__(self, quant_config: Fp8Config):
+        self.quant_config = quant_config
+        self.use_mxfp8 = getattr(self.quant_config, "use_mxfp8", False)
+        self.block_quant = (
+            self.use_mxfp8 or self.quant_config.weight_block_size is not None
+        )
+        self.with_bias = False
+        if get_moe_runner_backend().is_cutlass():
+            assert (
+                cutlass_fp8_supported()
+            ), "cutlass_fp8 MoE requires CUDA 12.0+ with SM90 or CUDA 12.4+ with SM89"
+            assert self.block_quant, "cutlass_fp8 MoE requires block quantization"
+            assert (
+                is_sm100_supported() or is_sm90_supported() or is_sm120_supported()
+            ), "cutlass_fp8 MoE requires SM90, SM100, or SM120 GPUs"
+
+    @staticmethod
+    def is_deepgemm_moe_runner_backend_enabled() -> bool:
+        """Check if MoE will actually use DeepGEMM runner for FP8."""
+        from sglang.srt.layers import deep_gemm_wrapper
+        from sglang.srt.layers.moe.utils import get_moe_a2a_backend
+
+        moe_runner_backend = get_moe_runner_backend()
+        if moe_runner_backend.is_deep_gemm():
+            return True
+        if moe_runner_backend.is_auto():
+            return deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM and (
+                get_moe_a2a_backend().is_deepep() or get_moe_a2a_backend().is_mooncake()
+            )
+        return False
+
+    def create_weights(
+        self,
+        layer: Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        with_bias: bool = False,
+        **extra_weight_attrs,
+    ):
+        self.with_bias = with_bias
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        if self.quant_config.is_checkpoint_fp8_serialized:
+            params_dtype = torch.uint32 if _use_hip_int4 else torch.float8_e4m3fn
+        tp_size = get_tensor_model_parallel_world_size()
+        if self.block_quant:
+            block_n, block_k = (
+                self.quant_config.weight_block_size[0],
+                self.quant_config.weight_block_size[1],
+            )
+            # NOTE(HandH1998): To ensure proper alignment of the block-wise quantization scales, the output_size of the weights for both the gate and up layers must be divisible by block_n.
+            # Required by column parallel or enabling merged weights
+            if intermediate_size_per_partition % block_n != 0:
+                raise ValueError(
+                    f"The output_size of gate's and up's weight = "
+                    f"{intermediate_size_per_partition} is not divisible by "
+                    f"weight quantization block_n = {block_n}."
+                )
+            if tp_size > 1:
+                # Required by row parallel
+                if intermediate_size_per_partition % block_k != 0:
+                    raise ValueError(
+                        f"The input_size of down's weight = "
+                        f"{intermediate_size_per_partition} is not divisible by "
+                        f"weight quantization block_k = {block_k}."
+                    )
+
+        # WEIGHTS
+        if _is_hip and _use_hip_int4:
+            # INT4 MoE weight - INT32 packed
+            w13_weight = torch.nn.Parameter(
+                torch.empty(
+                    num_experts,
+                    2 * intermediate_size_per_partition,
+                    hidden_size // 8,
+                    dtype=params_dtype,
+                ),
+                requires_grad=False,
+            )
+            w2_weight = torch.nn.Parameter(
+                torch.empty(
+                    num_experts,
+                    hidden_size,
+                    intermediate_size_per_partition // 8,
+                    dtype=params_dtype,
+                ),
+                requires_grad=False,
+            )
+        else:
+            w13_weight = torch.nn.Parameter(
+                torch.empty(
+                    num_experts,
+                    2 * intermediate_size_per_partition,
+                    hidden_size,
+                    dtype=params_dtype,
+                ),
+                requires_grad=False,
+            )
+            w2_weight = torch.nn.Parameter(
+                torch.empty(
+                    num_experts,
+                    hidden_size,
+                    intermediate_size_per_partition,
+                    dtype=params_dtype,
+                ),
+                requires_grad=False,
+            )
+
+        layer.register_parameter("w13_weight", w13_weight)
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+
+        layer.register_parameter("w2_weight", w2_weight)
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+
+        # BIAS (optional, e.g. GPT-OSS)
+        if self.with_bias:
+            w13_up_dim = (
+                2 * intermediate_size_per_partition
+                if layer.moe_runner_config.is_gated
+                else intermediate_size_per_partition
+            )
+            w13_weight_bias = torch.nn.Parameter(
+                torch.empty(num_experts, w13_up_dim, dtype=torch.float32),
+                requires_grad=False,
+            )
+            layer.register_parameter("w13_weight_bias", w13_weight_bias)
+            set_weight_attrs(w13_weight_bias, extra_weight_attrs)
+
+            w2_weight_bias = torch.nn.Parameter(
+                torch.empty(num_experts, hidden_size, dtype=torch.float32),
+                requires_grad=False,
+            )
+            layer.register_parameter("w2_weight_bias", w2_weight_bias)
+            set_weight_attrs(w2_weight_bias, extra_weight_attrs)
+
+        # WEIGHT_SCALES
+        if self.block_quant:
+            scale_dtype = torch.uint8 if self.use_mxfp8 else torch.float32
+            scale_init = torch.zeros if scale_dtype == torch.uint8 else torch.ones
+            w13_weight_scale = torch.nn.Parameter(
+                scale_init(
+                    num_experts,
+                    2 * ((intermediate_size_per_partition + block_n - 1) // block_n),
+                    (hidden_size + block_k - 1) // block_k,
+                    dtype=scale_dtype,
+                ),
+                requires_grad=False,
+            )
+            w2_weight_scale = torch.nn.Parameter(
+                scale_init(
+                    num_experts,
+                    (hidden_size + block_n - 1) // block_n,
+                    (intermediate_size_per_partition + block_k - 1) // block_k,
+                    dtype=scale_dtype,
+                ),
+                requires_grad=False,
+            )
+            # w13_weight and w2_weight are always requanted together
+            w13_weight_scale.format_ue8m0 = self.use_mxfp8
+            w2_weight_scale.format_ue8m0 = self.use_mxfp8
+            layer.register_parameter("w13_weight_scale_inv", w13_weight_scale)
+            layer.register_parameter("w2_weight_scale_inv", w2_weight_scale)
+            assert self.quant_config.activation_scheme == "dynamic"
+            if get_moe_runner_backend().is_cutlass():
+                self._ensure_cutlass_buffers_initialized(layer)
+
+        else:
+            # Allocate 2 scales for w1 and w3 respectively.
+            # They will be combined to a single scale after weight loading.
+            w13_weight_scale = torch.nn.Parameter(
+                torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False
+            )
+            w2_weight_scale = torch.nn.Parameter(
+                torch.ones(num_experts, dtype=torch.float32), requires_grad=False
+            )
+            layer.register_parameter("w13_weight_scale", w13_weight_scale)
+            layer.register_parameter("w2_weight_scale", w2_weight_scale)
+
+            if _is_hip:  # _use_aiter: TODO: add check back after triton kernel
+                # ROCm - using column scaling, duplicate scaling numbers in case per tensor scaling
+                w13_weight_scale1 = torch.nn.Parameter(
+                    torch.ones(
+                        num_experts,
+                        2 * intermediate_size_per_partition,
+                        dtype=torch.float32,
+                    ),
+                    requires_grad=False,
+                )
+                w2_weight_scale1 = torch.nn.Parameter(
+                    torch.ones(num_experts, hidden_size, dtype=torch.float32),
+                    requires_grad=False,
+                )
+                layer.register_parameter("w13_weight_scale1", w13_weight_scale1)
+                layer.register_parameter("w2_weight_scale1", w2_weight_scale1)
+
+        # Add the quantization method used (per tensor/grouped/channel)
+        # to ensure the weight scales are loaded in properly
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.BLOCK.value}
+            if self.block_quant
+            else {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
+        )
+        # If loading fp8 checkpoint, pass the weight loaders.
+        # If loading an fp16 checkpoint, do not (we will quantize in
+        #   process_weights_after_loading()
+        if self.quant_config.is_checkpoint_fp8_serialized:
+            set_weight_attrs(w13_weight_scale, extra_weight_attrs)
+            set_weight_attrs(w2_weight_scale, extra_weight_attrs)
+
+            if _is_hip and _use_hip_int4:
+                extra_weight_attrs.update(
+                    {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
+                )
+                set_weight_attrs(w13_weight_scale1, extra_weight_attrs)
+                set_weight_attrs(w2_weight_scale1, extra_weight_attrs)
+
+        # INPUT_SCALES
+        if self.quant_config.activation_scheme == "static":
+            if not self.quant_config.is_checkpoint_fp8_serialized:
+                raise ValueError(
+                    "Found static activation scheme for checkpoint that "
+                    "was not serialized fp8."
+                )
+
+            w13_input_scale = torch.nn.Parameter(
+                torch.ones(num_experts, dtype=torch.float32), requires_grad=False
+            )
+            layer.register_parameter("w13_input_scale", w13_input_scale)
+            set_weight_attrs(w13_input_scale, extra_weight_attrs)
+
+            w2_input_scale = torch.nn.Parameter(
+                torch.ones(num_experts, dtype=torch.float32), requires_grad=False
+            )
+            layer.register_parameter("w2_input_scale", w2_input_scale)
+            set_weight_attrs(w2_input_scale, extra_weight_attrs)
+
+        else:
+            layer.w13_input_scale = None
+            layer.w2_input_scale = None
+
+    def process_weights_after_loading_block_quant(self, layer: Module) -> None:
+        # If ROCm, normalize the weights and scales to e4m3fnuz
+        if _is_fp8_fnuz:
+            # activation_scheme: dynamic
+            w13_weight, w13_weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                weight=layer.w13_weight,
+                weight_scale=layer.w13_weight_scale_inv,
+                input_scale=None,
+            )
+            w2_weight, w2_weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                weight=layer.w2_weight,
+                weight_scale=layer.w2_weight_scale_inv,
+                input_scale=None,
+            )
+            # Reset the parameter
+            layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
+            layer.w13_weight_scale_inv = torch.nn.Parameter(
+                w13_weight_scale, requires_grad=False
+            )
+            layer.w13_input_scale = None
+            layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
+            layer.w2_weight_scale_inv = torch.nn.Parameter(
+                w2_weight_scale, requires_grad=False
+            )
+            layer.w2_input_scale = None
+            if _use_aiter:
+                # add this section for MI300
+                # Pre-shuffle weights
+                layer.w13_weight.data = shuffle_weight(
+                    layer.w13_weight.contiguous(), (16, 16)
+                )
+                layer.w2_weight.data = shuffle_weight(
+                    layer.w2_weight.contiguous(), (16, 16)
+                )
+        elif _use_aiter:
+            # Pre-shuffle weights
+            layer.w13_weight.data = shuffle_weight(
+                layer.w13_weight.contiguous(), (16, 16)
+            )
+            layer.w2_weight.data = shuffle_weight(
+                layer.w2_weight.contiguous(), (16, 16)
+            )
+        elif _is_cpu:
+            assert (
+                _is_cpu_amx_available
+            ), "Fp8MoEMethod on CPU requires that CPU has AMX support"
+            _amx_process_weight_after_loading(layer, ["w13_weight", "w2_weight"])
+        elif self.use_mxfp8:
+            self._process_mxfp8_moe_weights(
+                layer, quantize=not self.quant_config.is_checkpoint_fp8_serialized
+            )
+        else:
+            # For fp8 moe run with deepgemm, the expert weights and scales need be requantized to ue8m0
+            from sglang.srt.layers.moe.ep_moe.layer import DeepEPMoE
+            from sglang.srt.model_loader.utils import (
+                should_deepgemm_weight_requant_ue8m0,
+            )
+
+            # Check if MoE will actually use DeepGEMM runner
+            will_use_deepgemm = self.is_deepgemm_moe_runner_backend_enabled()
+
+            if (
+                should_deepgemm_weight_requant_ue8m0(
+                    weight_block_size=getattr(
+                        self.quant_config, "weight_block_size", None
+                    ),
+                )
+                and will_use_deepgemm
+                and not layer.w13_weight_scale_inv.format_ue8m0
+            ):
+                assert isinstance(
+                    layer, DeepEPMoE
+                ), "DeepGemm MoE is only supported with DeepEPMoE"
+                weight_block_size = self.quant_config.weight_block_size
+                requant_weight_ue8m0_inplace(
+                    layer.w13_weight, layer.w13_weight_scale_inv, weight_block_size
+                )
+                requant_weight_ue8m0_inplace(
+                    layer.w2_weight, layer.w2_weight_scale_inv, weight_block_size
+                )
+                layer.w13_weight_scale_inv.format_ue8m0 = True
+                layer.w2_weight_scale_inv.format_ue8m0 = True
+
+    def _process_mxfp8_moe_weights(self, layer: Module, quantize: bool = True) -> None:
+
+        if not (_is_cuda and is_sm100_supported()):
+            raise RuntimeError("MXFP8 MoE quantization requires SM100.")
+
+        def _quantize_and_swizzle_with_cutlass_es_kernel(weight: torch.Tensor):
+            from sgl_kernel import es_sm100_mxfp8_blockscaled_grouped_quant
+
+            weight = weight.contiguous()
+            num_experts, m, k = weight.shape
+            assert k % 32 == 0, f"{k=} must be divisible by 32 for MXFP8"
+
+            weight_flat = weight.view(-1, k).contiguous()
+            problem_sizes = torch.empty(
+                (num_experts, 3), dtype=torch.int32, device=weight.device
+            )
+            problem_sizes[:, 0] = m
+            problem_sizes[:, 1] = 0
+            problem_sizes[:, 2] = k
+            expert_offsets = torch.arange(
+                0, num_experts * m, m, dtype=torch.int32, device=weight.device
+            )
+            aligned_m = ((m + 127) // 128) * 128
+            blockscale_offsets = torch.arange(
+                0,
+                num_experts * aligned_m,
+                aligned_m,
+                dtype=torch.int32,
+                device=weight.device,
+            )
+            qweight = torch.empty_like(weight_flat, dtype=torch.float8_e4m3fn)
+            scale = torch.empty(
+                (num_experts * aligned_m, k // 32),
+                dtype=torch.uint8,
+                device=weight.device,
+            )
+            es_sm100_mxfp8_blockscaled_grouped_quant(
+                weight_flat,
+                problem_sizes,
+                expert_offsets,
+                blockscale_offsets,
+                qweight,
+                scale,
+            )
+            qweight = qweight.view_as(weight)
+            scale = scale.view(num_experts, aligned_m, k // 32)
+            if aligned_m != m:
+                scale = scale[:, :m, :]
+            return qweight, scale
+
+        def _swizzle_mxfp8_sf(scale, num_warps):
+            from triton_kernels.tensor import convert_layout, wrap_torch_tensor
+            from triton_kernels.tensor_details import layout
+
+            scale_layout, scale_layout_opts = (
+                layout.make_default_matmul_mxfp4_w_scale_layout(
+                    mx_axis=1, num_warps=num_warps
+                )
+            )
+            scale = scale.transpose(-2, -1)
+            scale = convert_layout(
+                wrap_torch_tensor(scale), scale_layout, **scale_layout_opts
+            )
+            return scale
+
+        def _swizzle_with_triton_kernel(
+            weight_shape: tuple[int, int, int], scale: torch.Tensor
+        ):
+            num_experts, m, k = weight_shape
+            aligned_m = ((m + 127) // 128) * 128
+            scale = scale.view(num_experts, aligned_m, k // 32)
+            num_warps = 8
+            scale = _swizzle_mxfp8_sf(scale, num_warps)
+            scale = scale.data.view(num_experts, aligned_m, k // 32)
+            return scale
+
+        def _quantize_and_swizzle_with_triton_kernel(weight: torch.Tensor):
+
+            weight = weight.contiguous()
+            _, _, k = weight.shape
+            assert k % 32 == 0, f"{k=} must be divisible by 32 for MXFP8"
+
+            weight_flat = weight.view(-1, k).contiguous()
+            qweight, scale = mxfp8_group_quantize(weight_flat)
+            qweight = qweight.view_as(weight)
+            scale = _swizzle_with_triton_kernel(weight.shape, scale)
+            return qweight, scale
+
+        if quantize:
+            if get_moe_runner_backend().is_cutlass():
+                w13_q, w13_s = _quantize_and_swizzle_with_cutlass_es_kernel(
+                    layer.w13_weight.data
+                )
+                w2_q, w2_s = _quantize_and_swizzle_with_cutlass_es_kernel(
+                    layer.w2_weight.data
+                )
+            else:
+                w13_q, w13_s = _quantize_and_swizzle_with_triton_kernel(
+                    layer.w13_weight.data
+                )
+                w2_q, w2_s = _quantize_and_swizzle_with_triton_kernel(
+                    layer.w2_weight.data
+                )
+        else:
+            w13_q = layer.w13_weight.data
+            w2_q = layer.w2_weight.data
+            w13_s = _swizzle_with_triton_kernel(
+                layer.w13_weight.data.shape, layer.w13_weight_scale_inv.data
+            )
+            w2_s = _swizzle_with_triton_kernel(
+                layer.w2_weight.data.shape, layer.w2_weight_scale_inv.data
+            )
+
+        # Keep parameter objects to preserve weight_loader attrs for hot reload.
+        # Prefer in-place copy; rebind only when shape/dtype changes (online quantize).
+        def _copy_or_rebind(param: Parameter, new_value: torch.Tensor) -> None:
+            if (
+                param.data.shape == new_value.shape
+                and param.data.dtype == new_value.dtype
+            ):
+                param.data.copy_(new_value)
+            else:
+                param.data = new_value
+
+        _copy_or_rebind(layer.w13_weight, w13_q)
+        _copy_or_rebind(layer.w2_weight, w2_q)
+        _copy_or_rebind(layer.w13_weight_scale_inv, w13_s)
+        _copy_or_rebind(layer.w2_weight_scale_inv, w2_s)
+        layer.w13_weight.requires_grad_(False)
+        layer.w2_weight.requires_grad_(False)
+        layer.w13_weight_scale_inv.requires_grad_(False)
+        layer.w2_weight_scale_inv.requires_grad_(False)
+        layer.w13_weight_scale_inv.format_ue8m0 = True
+        layer.w2_weight_scale_inv.format_ue8m0 = True
+        layer.w13_input_scale = None
+        layer.w2_input_scale = None
+
+    def process_weights_after_loading(self, layer: Module) -> None:
+        if _is_hip and _use_hip_int4:
+            self.process_weights_hip_int4(layer)
+            return
+
+        # Block quant doesn't need to process weights after loading
+        if self.block_quant:
+            self.process_weights_after_loading_block_quant(layer)
+            return
+
+        # If checkpoint is fp16 or bfloat16, quantize in place.
+        if not self.quant_config.is_checkpoint_fp8_serialized:
+            # If ROCm, fp8_dtype will be float8_e4m3fnuz (MI300x HW)
+            w13_weight = torch.empty_like(layer.w13_weight.data, dtype=fp8_dtype)
+            w2_weight = torch.empty_like(layer.w2_weight.data, dtype=fp8_dtype)
+
+            # Re-initialize w13_scale because we directly quantize
+            # merged w13 weights and generate a single scaling factor.
+            layer.w13_weight_scale = torch.nn.Parameter(
+                torch.ones(
+                    layer.num_local_experts,
+                    dtype=torch.float32,
+                    device=w13_weight.device,
+                ),
+                requires_grad=False,
+            )
+            for expert in range(layer.num_local_experts):
+                w13_weight[expert, :, :], layer.w13_weight_scale[expert] = (
+                    scaled_fp8_quant(layer.w13_weight.data[expert, :, :])
+                )
+                w2_weight[expert, :, :], layer.w2_weight_scale[expert] = (
+                    scaled_fp8_quant(layer.w2_weight.data[expert, :, :])
+                )
+            layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
+            layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
+
+            if _is_hip:
+                self.process_weights_hip_scale_padding(layer)
+            return
+
+        # If checkpoint is fp8, we need to handle that the
+        # MoE kernels require single activation scale and single weight
+        # scale for w13 per expert.
+        else:
+            # Fp8 moe kernels require a single activation scale.
+            # We take the max of all the scales in case they differ.
+            if self.quant_config.activation_scheme == "static":
+                if layer.w13_input_scale is None or layer.w2_input_scale is None:
+                    raise ValueError(
+                        "QuantConfig has static quantization, but found "
+                        "activation scales are None."
+                    )
+                if not all_close_1d(layer.w13_input_scale) or not all_close_1d(
+                    layer.w2_input_scale
+                ):
+                    print_warning_once(
+                        "Found input_scales that are not equal for "
+                        "fp8 MoE layer. Using the maximum across experts "
+                        "for each layer. "
+                    )
+                layer.w13_input_scale = torch.nn.Parameter(
+                    layer.w13_input_scale.max(), requires_grad=False
+                )
+                layer.w2_input_scale = torch.nn.Parameter(
+                    layer.w2_input_scale.max(), requires_grad=False
+                )
+
+            # If ROCm, normalize the weights and scales to e4m3fnuz
+            if _is_fp8_fnuz:
+                # Normalize the weights and scales
+                w13_weight, w13_weight_scale, w13_input_scale = (
+                    normalize_e4m3fn_to_e4m3fnuz(
+                        layer.w13_weight, layer.w13_weight_scale, layer.w13_input_scale
+                    )
+                )
+                w2_weight, w2_weight_scale, w2_input_scale = (
+                    normalize_e4m3fn_to_e4m3fnuz(
+                        layer.w2_weight, layer.w2_weight_scale, layer.w2_input_scale
+                    )
+                )
+                # Reset the parameter
+                layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
+                layer.w13_weight_scale = torch.nn.Parameter(
+                    w13_weight_scale, requires_grad=False
+                )
+                if w13_input_scale is not None:
+                    layer.w13_input_scale = torch.nn.Parameter(
+                        w13_input_scale, requires_grad=False
+                    )
+                layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
+                layer.w2_weight_scale = torch.nn.Parameter(
+                    w2_weight_scale, requires_grad=False
+                )
+                if w2_input_scale is not None:
+                    layer.w2_input_scale = torch.nn.Parameter(
+                        w2_input_scale, requires_grad=False
+                    )
+            # Fp8 moe kernel needs single weight scale for w13 per expert.
+            # We take the max then dequant and requant each expert.
+            assert layer.w13_weight_scale is not None
+            shard_size = layer.intermediate_size_per_partition
+            max_w13_scales = layer.w13_weight_scale.max(dim=1).values
+            for expert_id in range(layer.num_local_experts):
+                start = 0
+                for shard_id in range(2):
+                    dq_weight = per_tensor_dequantize(
+                        layer.w13_weight[expert_id][start : start + shard_size, :],
+                        layer.w13_weight_scale[expert_id][shard_id],
+                    )
+                    (
+                        layer.w13_weight[expert_id][start : start + shard_size, :],
+                        _,
+                    ) = scaled_fp8_quant(dq_weight, max_w13_scales[expert_id])
+                    start += shard_size
+
+            layer.w13_weight_scale = torch.nn.Parameter(
+                max_w13_scales, requires_grad=False
+            )
+
+            if _is_hip:
+                self.process_weights_hip_scale_padding(layer)
+
+            # Align FP8 weights to FlashInfer per-tensor kernel layout if enabled
+            if get_moe_runner_backend().is_flashinfer_trtllm():
+                from sglang.srt.layers.moe.moe_runner.flashinfer_trtllm import (
+                    align_fp8_moe_weights_for_flashinfer_trtllm,
+                )
+
+                align_fp8_moe_weights_for_flashinfer_trtllm(layer)
+            return
+
+    def process_weights_hip_int4(self, layer: Module):
+        # TODO: _use_aiter: add after triton kernel added
+        # INT4-FP8 (INT4 MoE Weight, FP8 Compute)
+        # Weight Permutation
+        layer.w13_weight = torch.nn.Parameter(
+            shuffle_weight(layer.w13_weight.data, (16, 16)),
+            requires_grad=False,
+        )
+        torch.cuda.empty_cache()
+        layer.w2_weight = torch.nn.Parameter(
+            shuffle_weight(layer.w2_weight.data, (16, 16)),
+            requires_grad=False,
+        )
+        torch.cuda.empty_cache()
+
+        # INT4-FP8 : offset INT4 w13_weight_scale1 to single w13_weight_scale
+        # Fp8 moe kernel needs single fp8 w13_weight_scale for w13 per expert.
+        # We won't do requant each expert's fp8 weight (not direct available),
+        # instead we adjust half of INT4 w13_weight_scale1 numbers
+        assert layer.w13_weight_scale is not None
+        shard_size = layer.intermediate_size_per_partition
+        max_w13_scales = layer.w13_weight_scale.max(dim=1).values
+        for expert_id in range(layer.num_local_experts):
+            start = 0
+            max_w13_scale_fp8 = max_w13_scales[expert_id]
+            for shard_id in range(2):
+                if layer.w13_weight_scale[expert_id][shard_id] != max_w13_scale_fp8:
+                    int4_rescale = (
+                        layer.w13_weight_scale[expert_id][shard_id] / max_w13_scale_fp8
+                    )
+                    layer.w13_weight_scale1[expert_id][
+                        start : start + shard_size
+                    ] *= int4_rescale
+                start += shard_size
+
+        layer.w13_weight_scale = torch.nn.Parameter(max_w13_scales, requires_grad=False)
+
+        # special hack to asm_moe, which takes (weight_scale1 * weight_scale) as post GEMM scaling
+        # optimal design - shall apply per-column weight_scale1 before GEMM, and weight_scale post
+        for expert_id in range(layer.num_local_experts):
+            layer.w13_weight_scale1[expert_id] *= max_w13_scales[expert_id]
+            layer.w2_weight_scale1[expert_id] *= layer.w2_weight_scale[expert_id]
+
+    def process_weights_hip_scale_padding(self, layer: Module):
+        from sglang.srt.layers.moe.fused_moe_triton.fused_moe import (
+            padding_size,  # Avoid circular import
+        )
+
+        if _use_aiter:
+            layer.w13_weight = torch.nn.Parameter(
+                shuffle_weight(layer.w13_weight.data, (16, 16)),
+                requires_grad=False,
+            )
+            torch.cuda.empty_cache()
+            layer.w2_weight = torch.nn.Parameter(
+                shuffle_weight(layer.w2_weight.data, (16, 16)),
+                requires_grad=False,
+            )
+            torch.cuda.empty_cache()
+
+            # ROCm (_use_aiter): using column-wise scaling
+            layer.w13_weight_scale1 *= layer.w13_weight_scale.unsqueeze(-1)
+            layer.w2_weight_scale1 *= layer.w2_weight_scale.unsqueeze(-1)
+        elif get_bool_env_var("SGLANG_MOE_PADDING"):
+            # If ROCm, apply weight padding (min. Mem channel contention) only if set
+            layer.w13_weight = torch.nn.Parameter(
+                F.pad(layer.w13_weight.data, (0, padding_size), "constant", 0),
+                requires_grad=False,
+            )
+            torch.cuda.empty_cache()
+            layer.w2_weight = torch.nn.Parameter(
+                F.pad(layer.w2_weight.data, (0, padding_size), "constant", 0),
+                requires_grad=False,
+            )
+            torch.cuda.empty_cache()
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+        moe_runner_backend = get_moe_runner_backend()
+
+        if moe_runner_backend.is_auto():
+            if self.is_deepgemm_moe_runner_backend_enabled():
+                moe_runner_backend = MoeRunnerBackend.DEEP_GEMM
+            else:
+                moe_runner_backend = MoeRunnerBackend.TRITON
+        if (
+            moe_runner_backend.is_deep_gemm()
+            or moe_runner_backend.is_triton()
+            or moe_runner_backend.is_flashinfer_trtllm()
+        ):
+            self.runner = MoeRunner(moe_runner_backend, moe_runner_config)
+        else:
+            # TODO(cwan): refactor other backends
+            pass
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: DispatchOutput,
+    ) -> CombineInput:
+
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        x = dispatch_output.hidden_states
+        moe_runner_config = self.moe_runner_config
+
+        if use_intel_amx_backend(layer):
+            from sglang.srt.layers.moe.topk import apply_topk_weights_cpu
+
+            topk_weights, topk_ids, _ = dispatch_output.topk_output
+            x, topk_weights = apply_topk_weights_cpu(
+                moe_runner_config.apply_router_weight_on_input, topk_weights, x
+            )
+
+            output = torch.ops.sgl_kernel.fused_experts_cpu(
+                x,
+                layer.w13_weight,
+                layer.w2_weight,
+                topk_weights,
+                topk_ids,
+                False,  # inplace See [Note] inplace should be False in fused_experts.
+                CPUQuantMethod.FP8_W8A16,
+                layer.w13_weight_scale_inv,  # w1_scale
+                layer.w2_weight_scale_inv,  # w2_scale
+                None,  # w1_zp
+                None,  # w2_zp
+                self.quant_config.weight_block_size,  # block_size
+                True,  # is_vnni
+            )
+            return StandardCombineInput(hidden_states=output)
+
+        if _is_hip:
+            ret = self.maybe_apply_hip_fused_experts(
+                layer,
+                x,
+                dispatch_output.topk_output,
+                moe_runner_config.activation,
+                moe_runner_config.no_combine,
+            )
+            if ret is not None:
+                return StandardCombineInput(hidden_states=ret)
+
+        if get_moe_runner_backend().is_cutlass():
+            from sglang.srt.layers.moe.cutlass_moe import cutlass_fused_experts_fp8
+
+            with use_symmetric_memory(
+                get_tp_group(), disabled=not is_allocation_symmetric()
+            ):
+                symm_output = torch.empty_like(x)
+
+            topk_weights, topk_ids, _ = dispatch_output.topk_output
+            use_mxfp8 = getattr(self.quant_config, "use_mxfp8", False)
+            output = cutlass_fused_experts_fp8(
+                x,
+                layer.w13_weight.transpose(1, 2),
+                layer.w2_weight.transpose(1, 2),
+                layer.w13_weight_scale_inv.transpose(1, 2),
+                layer.w2_weight_scale_inv.transpose(1, 2),
+                topk_weights,
+                topk_ids,
+                self.ab_strides1,
+                self.c_strides1,
+                self.ab_strides2,
+                self.c_strides2,
+                self.workspace,
+                self.a_ptr,
+                self.b_ptr,
+                self.out_ptr,
+                self.a_scales_ptr,
+                self.b_scales_ptr,
+                self.expert_offsets,
+                self.problem_sizes1,
+                self.problem_sizes2,
+                use_fp8_blockscale=True,
+                use_mxfp8=use_mxfp8,
+                output=symm_output,
+                enable_es=(use_mxfp8, use_mxfp8),
+            )
+            return StandardCombineInput(hidden_states=output)
+
+        if self.runner.runner_backend.is_deep_gemm():
+
+            w13_weight = layer.w13_weight
+            w2_weight = layer.w2_weight
+
+            if self.block_quant:
+                block_shape = self.quant_config.weight_block_size
+                w13_scale = layer.w13_weight_scale_inv
+                w2_scale = layer.w2_weight_scale_inv
+            else:
+                # Convert per-tensor quant to per-block quant by repeating scales for forward_deepgemm
+                scale_block_size = 128
+                block_shape = [scale_block_size, scale_block_size]
+                w13_scale_n = (w13_weight.shape[1] - 1) // scale_block_size + 1
+                w13_scale_k = (w13_weight.shape[2] - 1) // scale_block_size + 1
+                w13_scale = (
+                    layer.w13_weight_scale.unsqueeze(1)
+                    .repeat_interleave(w13_scale_n, dim=1)
+                    .unsqueeze(2)
+                    .repeat_interleave(w13_scale_k, dim=2)
+                )
+                w2_scale_n = (w2_weight.shape[1] - 1) // scale_block_size + 1
+                w2_scale_k = (w2_weight.shape[2] - 1) // scale_block_size + 1
+                w2_scale = (
+                    layer.w2_weight_scale.unsqueeze(1)
+                    .repeat_interleave(w2_scale_n, dim=1)
+                    .unsqueeze(2)
+                    .repeat_interleave(w2_scale_k, dim=2)
+                )
+            quant_info = DeepGemmMoeQuantInfo(
+                w13_weight=w13_weight,
+                w2_weight=w2_weight,
+                use_fp8=True,
+                w13_scale=w13_scale,
+                w2_scale=w2_scale,
+                block_shape=block_shape,
+            )
+        elif self.runner.runner_backend.is_flashinfer_trtllm():
+            # FlashInfer TRT-LLM backend only supports fused execution and consumes
+            # router logits directly (no separate apply_with_router_logits needed).
+            global_num_experts = int(getattr(layer, "num_experts"))
+            num_local_experts = int(getattr(layer, "num_local_experts"))
+            moe_ep_rank = int(getattr(layer, "moe_ep_rank"))
+
+            quant_info = FlashInferTrtllmFp8MoeQuantInfo(
+                w13_weight=layer.w13_weight,
+                w2_weight=layer.w2_weight,
+                global_num_experts=global_num_experts,
+                local_expert_offset=moe_ep_rank * num_local_experts,
+                local_num_experts=num_local_experts,
+                intermediate_size=layer.w2_weight.shape[2],
+                routing_method_type=int(
+                    getattr(layer, "routing_method_type", RoutingMethodType.DeepSeekV3)
+                ),
+                block_quant=self.block_quant,
+                weight_block_k=(
+                    None
+                    if self.quant_config.weight_block_size is None
+                    else self.quant_config.weight_block_size[1]
+                ),
+                w13_weight_scale_inv=(
+                    layer.w13_weight_scale_inv if self.block_quant else None
+                ),
+                w2_weight_scale_inv=(
+                    layer.w2_weight_scale_inv if self.block_quant else None
+                ),
+                w13_input_scale=layer.w13_input_scale if not self.block_quant else None,
+                output1_scales_scalar=(
+                    getattr(layer, "output1_scales_scalar", None)
+                    if not self.block_quant
+                    else None
+                ),
+                output1_scales_gate_scalar=(
+                    getattr(layer, "output1_scales_gate_scalar", None)
+                    if not self.block_quant
+                    else None
+                ),
+                output2_scales_scalar=(
+                    getattr(layer, "output2_scales_scalar", None)
+                    if not self.block_quant
+                    else None
+                ),
+            )
+        elif self.runner.runner_backend.is_triton():
+            quant_info = TritonMoeQuantInfo(
+                w13_weight=layer.w13_weight,
+                w2_weight=layer.w2_weight,
+                b13=getattr(layer, "w13_weight_bias", None),
+                b2=getattr(layer, "w2_weight_bias", None),
+                use_fp8_w8a8=True,
+                w13_scale=(
+                    layer.w13_weight_scale_inv
+                    if self.block_quant
+                    else layer.w13_weight_scale
+                ),
+                w2_scale=(
+                    layer.w2_weight_scale_inv
+                    if self.block_quant
+                    else layer.w2_weight_scale
+                ),
+                a13_scale=layer.w13_input_scale,
+                a2_scale=layer.w2_input_scale,
+                block_shape=self.quant_config.weight_block_size,
+            )
+        else:
+            raise NotImplementedError(
+                "Unsupported runner backend: %s" % self.runner.runner_backend
+            )
+
+        return self.runner.run(dispatch_output, quant_info)
+
+    def _ensure_cutlass_buffers_initialized(self, layer: Module) -> None:
+        if getattr(self, "_cutlass_buffers_ready", False):
+            return
+
+        device = layer.w13_weight.device
+        num_experts = layer.w13_weight.shape[0]
+        hidden_size = layer.w2_weight.shape[1]
+        intermediate_size_per_partition = layer.intermediate_size_per_partition
+
+        self.ab_strides1 = torch.full(
+            (num_experts,), hidden_size, device=device, dtype=torch.int64
+        )
+        self.c_strides1 = torch.full(
+            (num_experts,),
+            2 * intermediate_size_per_partition,
+            device=device,
+            dtype=torch.int64,
+        )
+        self.ab_strides2 = torch.full(
+            (num_experts,),
+            intermediate_size_per_partition,
+            device=device,
+            dtype=torch.int64,
+        )
+        self.c_strides2 = torch.full(
+            (num_experts,), hidden_size, device=device, dtype=torch.int64
+        )
+        self.workspace = torch.empty(90000, device=device, dtype=torch.uint8)
+        self.a_ptr = torch.empty(num_experts, device=device, dtype=torch.int64)
+        self.b_ptr = torch.empty(num_experts, device=device, dtype=torch.int64)
+        self.out_ptr = torch.empty(num_experts, device=device, dtype=torch.int64)
+        self.a_scales_ptr = torch.empty(num_experts, device=device, dtype=torch.int64)
+        self.b_scales_ptr = torch.empty(num_experts, device=device, dtype=torch.int64)
+        self.expert_offsets = torch.empty(
+            num_experts + 1, device=device, dtype=torch.int32
+        )
+        self.problem_sizes1 = torch.empty(
+            num_experts, 3, device=device, dtype=torch.int32
+        )
+        self.problem_sizes2 = torch.empty(
+            num_experts, 3, device=device, dtype=torch.int32
+        )
+
+        self._cutlass_buffers_ready = True
+
+    def maybe_apply_hip_fused_experts(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        topk_output: TopKOutput,
+        activation: str = "silu",
+        no_combine: bool = False,
+    ) -> Optional[torch.Tensor]:
+        topk_weights, topk_ids, _ = topk_output
+        if _use_hip_int4:
+            # TODO: add triton kernel and add check _use_aiter
+            assert not no_combine, f"{no_combine=} is not supported."
+            return fused_moe(
+                x,
+                layer.w13_weight,
+                layer.w2_weight,
+                topk_weights,
+                topk_ids,
+                quant_type=QuantType.per_Token,
+                w1_scale=layer.w13_weight_scale1,
+                w2_scale=layer.w2_weight_scale1,
+                activation=(
+                    ActivationType.Silu if activation == "silu" else ActivationType.Gelu
+                ),
+            )
+
+        if _use_aiter:
+            assert not no_combine, f"{no_combine=} is not supported."
+            if self.block_quant:
+                return fused_moe(
+                    x,
+                    layer.w13_weight,
+                    layer.w2_weight,
+                    topk_weights,
+                    topk_ids,
+                    w1_scale=layer.w13_weight_scale_inv,
+                    w2_scale=layer.w2_weight_scale_inv,
+                    quant_type=QuantType.per_128x128,
+                    activation=(
+                        ActivationType.Silu
+                        if activation == "silu"
+                        else ActivationType.Gelu
+                    ),
+                    expert_mask=layer.expert_mask_gpu,
+                )
+            else:
+                return fused_moe(
+                    x,
+                    layer.w13_weight,
+                    layer.w2_weight,
+                    topk_weights,
+                    topk_ids,
+                    quant_type=QuantType.per_Token,
+                    w1_scale=layer.w13_weight_scale1,
+                    w2_scale=layer.w2_weight_scale1,
+                    activation=(
+                        ActivationType.Silu
+                        if activation == "silu"
+                        else ActivationType.Gelu
+                    ),
+                    expert_mask=layer.expert_mask_gpu,
+                )
+        return None
+
+
+class Fp8KVCacheMethod(BaseKVCacheMethod):
+    """
+    Supports loading kv-cache scaling factors from FP8 checkpoints.
+    """
+
+    def __init__(self, quant_config: Fp8Config):
+        super().__init__(quant_config)
diff --git a/sglang/python/sglang/srt/layers/quantization/fp8_kernel.py b/sglang/python/sglang/srt/layers/quantization/fp8_kernel.py
new file mode 100644
index 0000000000000000000000000000000000000000..1466bac6bec45d0e2668671e5b50703a29eb8f22
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/fp8_kernel.py
@@ -0,0 +1,2076 @@
+# Copyright 2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import functools
+import json
+import logging
+import os
+from functools import lru_cache
+from typing import Any, Dict, List, Optional, Tuple
+
+import torch
+import triton
+import triton.language as tl
+
+try:
+    from triton.tools.tensor_descriptor import TensorDescriptor
+except:
+    pass
+
+from sglang.srt.layers import deep_gemm_wrapper
+from sglang.srt.utils import (
+    ceil_align,
+    get_bool_env_var,
+    get_device_core_count,
+    get_device_name,
+    is_cpu,
+    is_cuda,
+    is_hip,
+    is_sm100_supported,
+    is_sm120_supported,
+    log_info_on_rank0,
+)
+from sglang.srt.utils.custom_op import register_custom_op
+
+_is_hip = is_hip()
+_is_cuda = is_cuda()
+_is_cpu = is_cpu()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+
+if _is_cuda:
+    from sgl_kernel import sgl_per_token_quant_fp8
+
+    from sglang.jit_kernel.per_tensor_quant_fp8 import (
+        per_tensor_quant_fp8 as sgl_per_tensor_quant_fp8,
+    )
+
+    # Temporary
+    try:
+        from sgl_kernel import sgl_per_token_group_quant_8bit
+
+        enable_sgl_per_token_group_quant_8bit = True
+    except ImportError:
+        from sgl_kernel import sgl_per_token_group_quant_fp8
+
+        enable_sgl_per_token_group_quant_8bit = False
+
+    from sglang.jit_kernel.per_token_group_quant_8bit import (
+        per_token_group_quant_8bit as sgl_per_token_group_quant_8bit_jit,
+    )
+
+if _is_hip:
+    _has_vllm = False
+    if _use_aiter:
+        try:
+            from aiter import (  # v0.1.3
+                dynamic_per_tensor_quant,
+                dynamic_per_token_scaled_quant,
+                static_per_tensor_quant,
+            )
+        except ImportError:
+            raise ImportError("aiter is required when SGLANG_USE_AITER is set to True")
+    else:
+        try:
+            import vllm._C  # noqa: F401
+
+            _has_vllm = True
+        except ImportError:
+            # Fallback: vllm not available, will use native PyTorch implementation
+            _has_vllm = False
+
+logger = logging.getLogger(__name__)
+
+
+@lru_cache()
+def is_fp8_fnuz() -> bool:
+    if _is_hip:
+        # only device 0 is checked, this assumes MI300 platforms are homogeneous
+        return "gfx94" in torch.cuda.get_device_properties(0).gcnArchName
+    return False
+
+
+if is_fp8_fnuz():
+    fp8_dtype = torch.float8_e4m3fnuz
+    fp8_max = 224.0
+else:
+    fp8_dtype = torch.float8_e4m3fn
+    fp8_max = torch.finfo(fp8_dtype).max
+fp8_min = -fp8_max
+
+
+@register_custom_op(mutates_args=["C"])
+def deep_gemm_fp8_fp8_bf16_nt(
+    A: torch.Tensor,
+    As: torch.Tensor,
+    B: torch.Tensor,
+    Bs: torch.Tensor,
+    C: torch.Tensor,
+) -> None:
+    deep_gemm_wrapper.gemm_nt_f8f8bf16((A, As), (B, Bs), C)
+
+
+@triton.jit
+def _per_token_group_quant_8bit(
+    # Pointers to inputs and output
+    y_ptr,
+    y_q_ptr,
+    y_s_ptr,
+    # Stride of input
+    y_stride,
+    # Columns of input
+    N,
+    # Avoid to divide zero
+    eps,
+    # Information for float8
+    bit8_min,
+    bit8_max,
+    # Meta-parameters
+    BLOCK: tl.constexpr,
+):
+    """A Triton-accelerated function to perform per-token-group quantization on a
+    tensor.
+
+    This function converts the tensor values into float8 values.
+    """
+    # Map the program id to the row of X and Y it should compute.
+    g_id = tl.program_id(0)
+    y_ptr += g_id * y_stride
+    y_q_ptr += g_id * y_stride
+    y_s_ptr += g_id
+
+    cols = tl.arange(0, BLOCK)  # N <= BLOCK
+    mask = cols < N
+
+    y = tl.load(y_ptr + cols, mask=mask, other=0.0).to(tl.float32)
+    # Quant
+    _absmax = tl.maximum(tl.max(tl.abs(y)), eps)
+    y_s = _absmax / bit8_max
+    y_s_inv = 1.0 / y_s
+    y_q = tl.clamp(y * y_s_inv, bit8_min, bit8_max).to(y_q_ptr.dtype.element_ty)
+
+    tl.store(y_q_ptr + cols, y_q, mask=mask)
+    tl.store(y_s_ptr, y_s)
+
+
+@triton.jit
+def _per_token_group_quant_8bit_colmajor(
+    # Pointers to inputs and output
+    y_ptr,
+    y_q_ptr,
+    y_s_ptr,
+    group_size,
+    # Num columns of y
+    y_num_columns,
+    # Stride from one column to the next of y_s
+    y_s_col_stride,
+    # Avoid to divide zero
+    eps,
+    # Information for float8
+    bit8_min,
+    bit8_max,
+    # Meta-parameters
+    BLOCK: tl.constexpr,
+    SCALE_UE8M0: tl.constexpr,
+):
+    """A Triton-accelerated function to perform per-token-group
+    quantization on a tensor.
+    This function converts the tensor values into float8 values.
+    """
+    # Map the program id to the row of X and Y it should compute.
+    g_id = tl.program_id(0)
+    y_ptr += g_id.to(tl.int64) * group_size
+    y_q_ptr += g_id.to(tl.int64) * group_size
+
+    # Convert g_id the flattened block coordinate to 2D so we can index
+    # into the output y_scales matrix
+    blocks_per_row = y_num_columns // group_size
+    scale_col = g_id % blocks_per_row
+    scale_row = g_id // blocks_per_row
+    y_s_ptr += scale_col * y_s_col_stride + scale_row
+
+    cols = tl.arange(0, BLOCK)  # group_size <= BLOCK
+    mask = cols < group_size
+
+    y = tl.load(y_ptr + cols, mask=mask, other=0.0).to(tl.float32)
+    # Quant
+    _absmax = tl.maximum(tl.max(tl.abs(y)), eps)
+    y_s = _absmax / bit8_max
+    if SCALE_UE8M0:
+        y_s = tl.exp2(tl.ceil(tl.log2(tl.abs(y_s))))
+    y_q = tl.clamp(y / y_s, bit8_min, bit8_max).to(y_q_ptr.dtype.element_ty)
+
+    tl.store(y_q_ptr + cols, y_q, mask=mask)
+    tl.store(y_s_ptr, y_s)
+
+
+def _per_token_group_quant_8bit_raw(
+    x: torch.Tensor,
+    group_size: int,
+    eps: float = 1e-10,
+    dtype: torch.dtype = fp8_dtype,
+    column_major_scales: bool = False,
+    scale_tma_aligned: bool = False,
+    scale_ue8m0: bool = False,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Function to perform per-token-group quantization on an input tensor `x`.
+
+    It converts the tensor values into signed float8 values and returns the
+    quantized tensor along with the scaling factor used for quantization.
+
+    Args:
+        x: The input tensor with ndim >= 2.
+        group_size: The group size used for quantization.
+        eps: The minimum to avoid dividing zero.
+        dtype: The dype of output tensor.
+
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the scaling factor for quantization.
+    """
+    assert (
+        x.shape[-1] % group_size == 0
+    ), "the last dimension of `x` cannot be divisible by `group_size`"
+    assert x.is_contiguous(), "`x` is not contiguous"
+
+    if _is_hip:
+        if dtype == torch.int8:
+            bit8_max = 127.0
+        else:
+            bit8_max = 224.0
+        bit8_min = -bit8_max  # TODO incorrect for int8
+    else:
+        if dtype == torch.int8:
+            info = torch.iinfo(dtype)
+        else:
+            info = torch.finfo(dtype)
+        bit8_max = info.max
+        bit8_min = info.min
+
+    x_q = torch.empty_like(x, device=x.device, dtype=dtype)
+    x_s = create_per_token_group_quant_fp8_output_scale(
+        x_shape=x.shape,
+        device=x.device,
+        group_size=group_size,
+        column_major_scales=column_major_scales,
+        scale_tma_aligned=scale_tma_aligned,
+        scale_ue8m0=False,
+    )
+
+    M = x.numel() // group_size
+    N = group_size
+
+    BLOCK = triton.next_power_of_2(N)
+    # heuristics for number of warps
+    num_warps = min(max(BLOCK // 256, 1), 8)
+    num_stages = 1
+    if column_major_scales:
+        _per_token_group_quant_8bit_colmajor[(M,)](
+            x,
+            x_q,
+            x_s,
+            group_size,
+            x.shape[1],
+            x_s.stride(1),
+            eps,
+            bit8_min=bit8_min,
+            bit8_max=bit8_max,
+            BLOCK=BLOCK,
+            num_warps=num_warps,
+            num_stages=num_stages,
+            SCALE_UE8M0=scale_ue8m0,
+        )
+    else:
+        assert not scale_ue8m0
+        _per_token_group_quant_8bit[(M,)](
+            x,
+            x_q,
+            x_s,
+            group_size,
+            N,
+            eps,
+            bit8_min=bit8_min,
+            bit8_max=bit8_max,
+            BLOCK=BLOCK,
+            num_warps=num_warps,
+            num_stages=num_stages,
+        )
+
+    if scale_ue8m0:
+        from deep_gemm import transform_sf_into_required_layout
+
+        assert group_size == 128
+        x_s = transform_sf_into_required_layout(
+            x_s,
+            num_groups=None,
+            mn=x_q.shape[0],
+            k=x_q.shape[1],
+            recipe=(1, group_size, group_size),
+            is_sfa=True,
+        )
+
+    return x_q, x_s
+
+
+# backward compatibility
+per_token_group_quant_fp8 = _per_token_group_quant_8bit_raw
+
+
+def _per_token_group_quant_8bit_fuse_silu_and_mul(
+    x: torch.Tensor,
+    group_size: int,
+    dst_dtype: torch.dtype,
+    column_major_scales: bool,
+    scale_tma_aligned: bool,
+    scale_ue8m0: bool,
+    masked_m: Optional[torch.Tensor],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    # Another way to implement (can be used in e.g. comparison tests)
+    # from sgl_kernel import silu_and_mul
+    # x_after_silu_and_mul = silu_and_mul(x)
+    # return per_token_group_quant_fp8(
+    #     x_after_silu_and_mul,
+    #     group_size=group_size,
+    #     eps=eps,
+    #     column_major_scales=column_major_scales,
+    #     scale_tma_aligned=scale_tma_aligned,
+    #     scale_ue8m0=scale_ue8m0,
+    # )
+
+    from deep_gemm import transform_sf_into_required_layout
+
+    from sglang.srt.layers.moe.ep_moe.kernels import silu_and_mul_masked_post_quant_fwd
+
+    assert column_major_scales
+    assert scale_tma_aligned
+    assert scale_ue8m0
+
+    needs_unsqueeze = x.dim() == 2
+    if needs_unsqueeze:
+        num_tokens, _ = x.shape
+        x = x.unsqueeze(0)
+        assert masked_m is None
+        masked_m = torch.tensor([num_tokens], device=x.device, dtype=torch.int32)
+
+    # Use `zeros` for easier testing
+    output = torch.zeros(
+        (*x.shape[:-1], x.shape[-1] // 2),
+        device=x.device,
+        dtype=dst_dtype,
+    )
+    # Use `zeros` for easier testing
+    output_scale_for_kernel = torch.zeros(
+        (*x.shape[:-1], x.shape[-1] // 2 // group_size),
+        device=x.device,
+        dtype=torch.float32,
+    )
+    silu_and_mul_masked_post_quant_fwd(
+        input=x,
+        output=output,
+        output_scale=output_scale_for_kernel,
+        quant_group_size=group_size,
+        masked_m=masked_m,
+        scale_ue8m0=scale_ue8m0,
+    )
+
+    assert group_size == 128
+    output_scale = transform_sf_into_required_layout(
+        output_scale_for_kernel,
+        num_groups=output.shape[0],
+        mn=output.shape[-2],
+        k=output.shape[-1],
+        recipe=(1, group_size, group_size),
+        is_sfa=True,
+    )
+
+    if needs_unsqueeze:
+        output = output.squeeze(0)
+        output_scale = output_scale.squeeze(0)
+
+    return output, output_scale
+
+
+def per_token_group_quant_8bit(
+    x: torch.Tensor,
+    group_size: int,
+    dst_dtype: torch.dtype,
+    eps: float = 1e-10,
+    column_major_scales: bool = False,
+    scale_tma_aligned: bool = False,
+    scale_ue8m0: bool = False,
+    fuse_silu_and_mul: bool = False,
+    masked_m: Optional[torch.Tensor] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    if fuse_silu_and_mul:
+        return _per_token_group_quant_8bit_fuse_silu_and_mul(
+            x=x,
+            group_size=group_size,
+            dst_dtype=dst_dtype,
+            column_major_scales=column_major_scales,
+            scale_tma_aligned=scale_tma_aligned,
+            scale_ue8m0=scale_ue8m0,
+            masked_m=masked_m,
+        )
+    else:
+        return _per_token_group_quant_8bit_raw(
+            x=x,
+            group_size=group_size,
+            eps=eps,
+            column_major_scales=column_major_scales,
+            scale_tma_aligned=scale_tma_aligned,
+            scale_ue8m0=scale_ue8m0,
+            dtype=dst_dtype,
+        )
+
+
+def create_per_token_group_quant_fp8_output_scale(
+    x_shape,
+    device,
+    group_size,
+    column_major_scales: bool,
+    scale_tma_aligned: bool,
+    scale_ue8m0: bool,
+):
+    if scale_ue8m0:
+        assert column_major_scales and scale_tma_aligned
+        *x_batch, x_q_mn, x_q_k = x_shape
+        x_s_mn, x_s_k = x_q_mn, x_q_k // 128
+        aligned_mn = ceil_align(x_s_mn, 4)
+        aligned_k = ceil_align(x_s_k, 4)
+        # TODO(FIXME): Fix cuda kernel and recover here to empty.
+        return torch.empty(
+            (*x_batch, aligned_k // 4, aligned_mn),
+            device=device,
+            dtype=torch.int,
+        ).transpose(-1, -2)[..., :x_s_mn, :]
+    elif column_major_scales:
+        if scale_tma_aligned:
+            # TODO extract "align" function
+            # aligned to 4 * sizeof(float)
+            aligned_size = (x_shape[-2] + 3) // 4 * 4
+            return torch.empty(
+                x_shape[:-2] + (x_shape[-1] // group_size, aligned_size),
+                device=device,
+                dtype=torch.float32,
+            ).transpose(-1, -2)[: x_shape[-2], :]
+        else:
+            return torch.empty(
+                (x_shape[-1] // group_size,) + x_shape[:-1],
+                device=device,
+                dtype=torch.float32,
+            ).permute(-1, -2)
+    else:
+        return torch.empty(
+            x_shape[:-1] + (x_shape[-1] // group_size,),
+            device=device,
+            dtype=torch.float32,
+        )
+
+
+def sglang_per_token_group_quant_fp8(
+    x: torch.Tensor,
+    group_size: int,
+    eps: float = 1e-10,
+    column_major_scales: bool = False,
+    scale_tma_aligned: bool = False,
+    scale_ue8m0: bool = False,
+    fuse_silu_and_mul: bool = False,
+    masked_m: Optional[torch.Tensor] = None,
+    enable_v2: Optional[bool] = None,
+):
+    assert (
+        x.shape[-1] % group_size == 0
+    ), "the last dimension of `x` cannot be divisible by `group_size`"
+    assert x.is_contiguous(), "`x` is not contiguous"
+
+    out_shape = (*x.shape[:-1], x.shape[-1] // (2 if fuse_silu_and_mul else 1))
+
+    x_q = torch.empty(out_shape, device=x.device, dtype=fp8_dtype)
+    x_s = create_per_token_group_quant_fp8_output_scale(
+        x_shape=out_shape,
+        device=x.device,
+        group_size=group_size,
+        column_major_scales=column_major_scales,
+        scale_tma_aligned=scale_tma_aligned,
+        scale_ue8m0=scale_ue8m0,
+    )
+
+    if x.shape[0] > 0:
+        # Temporary
+        if enable_sgl_per_token_group_quant_8bit:
+            if enable_v2:
+                sgl_per_token_group_quant_8bit(
+                    x,
+                    x_q,
+                    x_s,
+                    group_size,
+                    eps,
+                    fp8_min,
+                    fp8_max,
+                    scale_ue8m0,
+                    fuse_silu_and_mul,
+                    masked_m,
+                    enable_v2=True,
+                )
+            else:
+                sgl_per_token_group_quant_8bit_jit(
+                    input=x,
+                    output_q=x_q,
+                    output_s=x_s,
+                    group_size=group_size,
+                    eps=eps,
+                    fp8_min=fp8_min,
+                    fp8_max=fp8_max,
+                    scale_ue8m0=scale_ue8m0,
+                )
+        else:
+            assert not enable_v2
+            sgl_per_token_group_quant_fp8(
+                x, x_q, x_s, group_size, eps, fp8_min, fp8_max, scale_ue8m0
+            )
+
+    return x_q, x_s
+
+
+# TODO maybe unify int8 and fp8 code later
+def sglang_per_token_group_quant_8bit(
+    x: torch.Tensor,
+    group_size: int,
+    dst_dtype: torch.dtype,
+    eps: float = 1e-10,
+    column_major_scales: bool = False,
+    scale_tma_aligned: bool = False,
+    scale_ue8m0: bool = False,
+    fuse_silu_and_mul: bool = False,
+    masked_m: Optional[torch.Tensor] = None,
+    enable_v2: Optional[bool] = None,
+):
+    from sglang.srt.layers.quantization.int8_kernel import (
+        sglang_per_token_group_quant_int8,
+    )
+
+    if dst_dtype == torch.int8:
+        assert not column_major_scales
+        assert not scale_tma_aligned
+        assert not fuse_silu_and_mul
+        assert masked_m is None
+        return sglang_per_token_group_quant_int8(
+            x=x,
+            group_size=group_size,
+            eps=eps,
+            dtype=dst_dtype,
+            enable_v2=enable_v2,
+        )
+
+    return sglang_per_token_group_quant_fp8(
+        x=x,
+        group_size=group_size,
+        eps=eps,
+        column_major_scales=column_major_scales,
+        scale_tma_aligned=scale_tma_aligned,
+        scale_ue8m0=scale_ue8m0,
+        fuse_silu_and_mul=fuse_silu_and_mul,
+        masked_m=masked_m,
+        enable_v2=enable_v2,
+    )
+
+
+def sglang_per_token_quant_fp8(
+    x: torch.Tensor,
+    dtype: torch.dtype = fp8_dtype,
+):
+    assert x.is_contiguous(), "`x` is not contiguous"
+
+    x_q = torch.empty_like(x, device=x.device, dtype=dtype)
+    x_s = torch.empty(
+        x.shape[0],
+        1,
+        device=x.device,
+        dtype=torch.float32,
+    )
+
+    sgl_per_token_quant_fp8(x, x_q, x_s)
+
+    return x_q, x_s
+
+
+@triton.jit
+def _static_quant_fp8(
+    # Pointers to inputs and output
+    y_ptr,
+    y_q_ptr,
+    y_s_ptr,
+    y_s_repeat_ptr,
+    # Stride of input
+    y_stride,
+    # Columns of input
+    N,
+    # Information for float8
+    fp8_min,
+    fp8_max,
+    # Meta-parameters
+    BLOCK: tl.constexpr,
+    REPEAT_SCALE: tl.constexpr,
+):
+    """A Triton-accelerated function to perform quantization using the given scale on a
+    tensor
+
+    This function converts the tensor values into float8 values.
+    """
+    # Map the program id to the row of X and Y it should compute.
+    g_id = tl.program_id(0)
+    y_ptr += g_id * y_stride
+    y_q_ptr += g_id * y_stride
+    if REPEAT_SCALE:
+        y_s_repeat_ptr += g_id
+
+    cols = tl.arange(0, BLOCK)  # N <= BLOCK
+    mask = cols < N
+
+    y = tl.load(y_ptr + cols, mask=mask, other=0.0).to(tl.float32)
+    y_s = tl.load(y_s_ptr).to(tl.float32)
+    y_s_inv = 1.0 / y_s
+    y_q = tl.clamp(y * y_s_inv, fp8_min, fp8_max).to(y_q_ptr.dtype.element_ty)
+
+    tl.store(y_q_ptr + cols, y_q, mask=mask)
+    if REPEAT_SCALE:
+        tl.store(y_s_repeat_ptr, y_s)
+
+
+def static_quant_fp8(
+    x: torch.Tensor,
+    x_s: torch.Tensor,
+    repeat_scale: bool = False,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Function to perform static quantization using the given scale on an input tensor `x`.
+
+    It converts the tensor values into signed float8 values and returns the
+    quantized tensor along with the scaling factor used for quantization.
+
+    Args:
+        x: The input tensor with ndim >= 2.
+        x_s: The quantization scale.
+        repeat_scale: Whether to broadcast per-tensor scale to per-channel scale.
+        dtype: The dype of output tensor.
+
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the scaling factor for quantization.
+    """
+    assert x.is_contiguous(), "`x` is not contiguous"
+    assert x_s.numel() == 1, "only supports per-tensor scale"
+
+    x_q = torch.empty_like(x, device=x.device, dtype=fp8_dtype)
+    M = x.numel() // x.shape[-1]
+    N = x.shape[-1]
+    if repeat_scale:
+        x_s_repeat = torch.empty(
+            (M, 1),
+            device=x.device,
+            dtype=torch.float32,
+        )
+    else:
+        x_s_repeat = None
+
+    BLOCK = triton.next_power_of_2(N)
+    # heuristics for number of warps
+    num_warps = min(max(BLOCK // 256, 1), 8)
+    num_stages = 1
+    _static_quant_fp8[(M,)](
+        x,
+        x_q,
+        x_s,
+        x_s_repeat,
+        N,
+        N,
+        fp8_min=fp8_min,
+        fp8_max=fp8_max,
+        BLOCK=BLOCK,
+        REPEAT_SCALE=repeat_scale,
+        num_warps=num_warps,
+        num_stages=num_stages,
+    )
+    x_s = x_s_repeat if repeat_scale else x_s
+    return x_q, x_s
+
+
+@triton.jit
+def _w8a8_block_fp8_matmul(
+    # Pointers to inputs and output
+    A,
+    B,
+    C,
+    As,
+    Bs,
+    # Shape for matmul
+    M,
+    N,
+    K,
+    # Block size for block-wise quantization
+    group_n,
+    group_k,
+    # Stride for inputs and output
+    stride_am,
+    stride_ak,
+    stride_bk,
+    stride_bn,
+    stride_cm,
+    stride_cn,
+    stride_As_m,
+    stride_As_k,
+    stride_Bs_k,
+    stride_Bs_n,
+    # Meta-parameters
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+    needs_masking: tl.constexpr,
+):
+    """Triton-accelerated function used to perform linear operations (dot
+    product) on input tensors `A` and `B` with block-wise quantization, and store the result in output
+    tensor `C`.
+    """
+
+    pid = tl.program_id(axis=0)
+    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
+    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
+    pid_m = first_pid_m + (pid % group_size_m)
+    pid_n = (pid % num_pid_in_group) // group_size_m
+
+    offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
+    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+    a_ptrs = A + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
+    b_ptrs = B + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
+
+    As_ptrs = As + offs_am * stride_As_m
+    offs_bsn = offs_bn // group_n
+    Bs_ptrs = Bs + offs_bsn * stride_Bs_n
+    scale_step_k = BLOCK_SIZE_K // group_k
+
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
+        if needs_masking:
+            a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0)
+            b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)
+        else:
+            a = tl.load(a_ptrs)
+            b = tl.load(b_ptrs)
+
+        a_s = tl.load(As_ptrs)
+        b_s = tl.load(Bs_ptrs)
+
+        accumulator += tl.dot(a, b) * a_s[:, None] * b_s[None, :]
+        a_ptrs += BLOCK_SIZE_K * stride_ak
+        b_ptrs += BLOCK_SIZE_K * stride_bk
+        As_ptrs += scale_step_k * stride_As_k
+        Bs_ptrs += scale_step_k * stride_Bs_k
+
+    if C.dtype.element_ty == tl.bfloat16:
+        c = accumulator.to(tl.bfloat16)
+    elif C.dtype.element_ty == tl.float16:
+        c = accumulator.to(tl.float16)
+    else:
+        c = accumulator.to(tl.float32)
+
+    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    c_ptrs = C + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
+    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
+    tl.store(c_ptrs, c, mask=c_mask)
+
+
+@triton.jit
+def _w8a8_block_fp8_matmul_unrolledx4(
+    # Pointers to inputs and output
+    A,
+    B,
+    C,
+    As,
+    Bs,
+    # Shape for matmul
+    M,
+    N,
+    K,
+    # Block size for block-wise quantization
+    group_n,
+    group_k,
+    # Stride for inputs and output
+    stride_am,
+    stride_ak,
+    stride_bk,
+    stride_bn,
+    stride_cm,
+    stride_cn,
+    stride_As_m,
+    stride_As_k,
+    stride_Bs_k,
+    stride_Bs_n,
+    # Meta-parameters
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+    needs_masking: tl.constexpr,
+):
+    """Triton-accelerated function used to perform linear operations (dot
+    product) on input tensors `A` and `B` with block-wise quantization, and store the result in output
+    tensor `C`.
+    """
+
+    pid = tl.program_id(axis=0)
+    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
+    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
+    pid_m = first_pid_m + (pid % group_size_m)
+    pid_n = (pid % num_pid_in_group) // group_size_m
+
+    offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
+    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+    a_ptrs = A + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
+    b_ptrs = B + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
+
+    As_ptrs = As + offs_am * stride_As_m
+    offs_bsn = offs_bn // group_n
+    Bs_ptrs = Bs + offs_bsn * stride_Bs_n
+    scale_step_k = BLOCK_SIZE_K // group_k
+
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+    # manually unroll to 4 iterations
+    UNROLL_FACTOR = 4
+    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K * UNROLL_FACTOR)):
+        # 1st iteration
+        if needs_masking:
+            a = tl.load(
+                a_ptrs,
+                mask=offs_k[None, :] < K - (k * UNROLL_FACTOR) * BLOCK_SIZE_K,
+                other=0.0,
+            )
+            b = tl.load(
+                b_ptrs,
+                mask=offs_k[:, None] < K - (k * UNROLL_FACTOR) * BLOCK_SIZE_K,
+                other=0.0,
+            )
+        else:
+            a = tl.load(a_ptrs)
+            b = tl.load(b_ptrs)
+
+        a_s = tl.load(As_ptrs)
+        b_s = tl.load(Bs_ptrs)
+
+        accumulator += tl.dot(a, b) * a_s[:, None] * b_s[None, :]
+        a_ptrs += BLOCK_SIZE_K * stride_ak
+        b_ptrs += BLOCK_SIZE_K * stride_bk
+        As_ptrs += scale_step_k * stride_As_k
+        Bs_ptrs += scale_step_k * stride_Bs_k
+
+        # 2nd iteration
+        if needs_masking:
+            a = tl.load(
+                a_ptrs,
+                mask=offs_k[None, :] < K - (k * UNROLL_FACTOR + 1) * BLOCK_SIZE_K,
+                other=0.0,
+            )
+            b = tl.load(
+                b_ptrs,
+                mask=offs_k[:, None] < K - (k * UNROLL_FACTOR + 1) * BLOCK_SIZE_K,
+                other=0.0,
+            )
+        else:
+            a = tl.load(a_ptrs)
+            b = tl.load(b_ptrs)
+
+        a_s = tl.load(As_ptrs)
+        b_s = tl.load(Bs_ptrs)
+
+        accumulator += tl.dot(a, b) * a_s[:, None] * b_s[None, :]
+        a_ptrs += BLOCK_SIZE_K * stride_ak
+        b_ptrs += BLOCK_SIZE_K * stride_bk
+        As_ptrs += scale_step_k * stride_As_k
+        Bs_ptrs += scale_step_k * stride_Bs_k
+
+        # 3rd iteration
+        if needs_masking:
+            a = tl.load(
+                a_ptrs,
+                mask=offs_k[None, :] < K - (k * UNROLL_FACTOR + 2) * BLOCK_SIZE_K,
+                other=0.0,
+            )
+            b = tl.load(
+                b_ptrs,
+                mask=offs_k[:, None] < K - (k * UNROLL_FACTOR + 2) * BLOCK_SIZE_K,
+                other=0.0,
+            )
+        else:
+            a = tl.load(a_ptrs)
+            b = tl.load(b_ptrs)
+
+        a_s = tl.load(As_ptrs)
+        b_s = tl.load(Bs_ptrs)
+
+        accumulator += tl.dot(a, b) * a_s[:, None] * b_s[None, :]
+        a_ptrs += BLOCK_SIZE_K * stride_ak
+        b_ptrs += BLOCK_SIZE_K * stride_bk
+        As_ptrs += scale_step_k * stride_As_k
+        Bs_ptrs += scale_step_k * stride_Bs_k
+
+        # 4th iteration
+        if needs_masking:
+            a = tl.load(
+                a_ptrs,
+                mask=offs_k[None, :] < K - (k * UNROLL_FACTOR + 3) * BLOCK_SIZE_K,
+                other=0.0,
+            )
+            b = tl.load(
+                b_ptrs,
+                mask=offs_k[:, None] < K - (k * UNROLL_FACTOR + 3) * BLOCK_SIZE_K,
+                other=0.0,
+            )
+        else:
+            a = tl.load(a_ptrs)
+            b = tl.load(b_ptrs)
+
+        a_s = tl.load(As_ptrs)
+        b_s = tl.load(Bs_ptrs)
+
+        accumulator += tl.dot(a, b) * a_s[:, None] * b_s[None, :]
+        a_ptrs += BLOCK_SIZE_K * stride_ak
+        b_ptrs += BLOCK_SIZE_K * stride_bk
+        As_ptrs += scale_step_k * stride_As_k
+        Bs_ptrs += scale_step_k * stride_Bs_k
+
+    if C.dtype.element_ty == tl.bfloat16:
+        c = accumulator.to(tl.bfloat16)
+    elif C.dtype.element_ty == tl.float16:
+        c = accumulator.to(tl.float16)
+    else:
+        c = accumulator.to(tl.float32)
+
+    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    c_ptrs = C + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
+    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
+    tl.store(c_ptrs, c, mask=c_mask)
+
+
+@functools.lru_cache
+def get_w8a8_block_fp8_configs(
+    N: int, K: int, block_n: int, block_k: int
+) -> Optional[Dict[int, Any]]:
+    """
+    Return optimized configurations for the w8a8 block fp8 kernel.
+
+    The return value will be a dictionary that maps an irregular grid of
+    batch sizes to configurations of the w8a8 block fp8 kernel. To evaluate the
+    kernel on a given batch size bs, the closest batch size in the grid should
+    be picked and the associated configuration chosen to invoke the kernel.
+    """
+
+    # Skip config lookup during torch.compile to avoid non-Tensor ops (e.g., device name).
+    # Returning None forces the caller to use the default config path during compile.
+    if torch._dynamo.is_compiling():
+        return None
+
+    # First look up if an optimized configuration is available in the configs
+    # directory
+    device_name = get_device_name().replace(" ", "_")
+    json_file_name = f"N={N},K={K},device_name={device_name},dtype=fp8_w8a8,block_shape=[{block_n}, {block_k}].json"
+
+    config_file_path = os.path.join(
+        os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name
+    )
+    if os.path.exists(config_file_path):
+        with open(config_file_path) as f:
+            log_info_on_rank0(
+                logger,
+                f"Using configuration from {config_file_path} for W8A8 Block FP8 kernel.",
+            )
+            # If a configuration has been found, return it
+            return {int(key): val for key, val in json.load(f).items()}
+
+    # If no optimized configuration is available, we will use the default
+    # configuration
+    logger.warning(
+        (
+            "Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! "
+            "Config file not found at %s"
+        ),
+        config_file_path,
+    )
+    return None
+
+
+def select_w8a8_block_fp8_matmul_kernel(M, N, META):
+    return _w8a8_block_fp8_matmul
+
+
+if _is_hip:
+
+    def use_w8a8_block_fp8_matmul_unrolledx4(M, N, META):
+        # Use manually unrolledx4 kernel on AMD GPU when the grid size is small.
+        # Empirical testing shows the sweet spot lies when it's less than the # of
+        # compute units available on the device.
+        num_workgroups = triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(
+            N, META["BLOCK_SIZE_N"]
+        )
+        num_workgroups <= get_device_core_count()
+
+    def select_w8a8_block_fp8_matmul_kernel(M, N, META):
+        if use_w8a8_block_fp8_matmul_unrolledx4(M, N, META):
+            return _w8a8_block_fp8_matmul_unrolledx4
+        else:
+            return _w8a8_block_fp8_matmul
+
+
+def prepare_block_fp8_matmul_inputs(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    As: torch.Tensor,
+    Bs: torch.Tensor,
+    block_size: List[int],
+    output_dtype: torch.dtype = torch.float16,
+) -> Tuple[int, int, int]:
+    assert len(block_size) == 2
+    block_n, block_k = block_size[0], block_size[1]
+
+    assert A.shape[-1] == B.shape[-1]
+    assert A.shape[:-1] == As.shape[:-1]
+    assert A.is_contiguous()
+
+    if As.dtype == torch.float:
+        assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1]
+    elif As.dtype == torch.int:
+        assert (
+            triton.cdiv(triton.cdiv(A.shape[-1], block_k), 4) == As.shape[-1]
+        ), f"{A.shape=} {As.shape=} {block_size=}"
+    else:
+        raise NotImplementedError
+
+    M = A.numel() // A.shape[-1]
+
+    assert B.ndim == 2
+    assert B.is_contiguous()
+    assert Bs.ndim == 2
+    N, K = B.shape
+
+    if Bs.dtype == torch.float:
+        assert triton.cdiv(N, block_n) == Bs.shape[0]
+        assert triton.cdiv(K, block_k) == Bs.shape[1]
+    elif Bs.dtype == torch.int:
+        assert N == Bs.shape[0], f"{B.shape=} {Bs.shape=} {block_size=}"
+        assert (
+            triton.cdiv(triton.cdiv(K, block_k), 4) == Bs.shape[1]
+        ), f"{B.shape=} {Bs.shape=} {block_size=}"
+    else:
+        raise NotImplementedError
+
+    C_shape = A.shape[:-1] + (N,)
+    C = A.new_empty(C_shape, dtype=output_dtype)
+
+    return M, N, K, C
+
+
+def w8a8_block_fp8_matmul_deepgemm(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    As: torch.Tensor,
+    Bs: torch.Tensor,
+    block_size: List[int],
+    output_dtype: torch.dtype,
+) -> torch.Tensor:
+    M, N, K, C = prepare_block_fp8_matmul_inputs(A, B, As, Bs, block_size, output_dtype)
+
+    # Deepgemm only supports output tensor type as bfloat16
+    assert C.dtype == torch.bfloat16 and deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM
+
+    deep_gemm_fp8_fp8_bf16_nt(A, As, B, Bs, C)
+
+    return C
+
+
+def w8a8_block_fp8_matmul_triton(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    As: torch.Tensor,
+    Bs: torch.Tensor,
+    block_size: List[int],
+    output_dtype: torch.dtype = torch.float16,
+) -> torch.Tensor:
+    """This function performs matrix multiplication with block-wise quantization.
+
+    It takes two input tensors `A` and `B` with scales `As` and `Bs`.
+    The output is returned in the specified `output_dtype`.
+
+    Args:
+        A: The input tensor, e.g., activation.
+        B: The input tensor, e.g., weight.
+        As: The per-token-group quantization scale for `A`.
+        Bs: The per-block quantization scale for `B`.
+        block_size: The block size for per-block quantization. It should be 2-dim, e.g., [128, 128].
+        output_dytpe: The dtype of the returned tensor.
+
+    Returns:
+        torch.Tensor: The result of matmul.
+    """
+
+    M, N, K, C = prepare_block_fp8_matmul_inputs(A, B, As, Bs, block_size, output_dtype)
+
+    block_n, block_k = block_size
+
+    configs = get_w8a8_block_fp8_configs(N, K, block_size[0], block_size[1])
+    if configs:
+        # If an optimal configuration map has been found, look up the
+        # optimal config
+        config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
+    else:
+        # Default config
+        # Block-wise quant: BLOCK_SIZE_K must be divisible by block_size[1]
+        config = {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": block_size[0],
+            "BLOCK_SIZE_K": block_size[1],
+            "GROUP_SIZE_M": 32,
+            "num_warps": 4,
+            "num_stages": 3,
+        }
+
+    needs_masking = bool(K % config["BLOCK_SIZE_K"] != 0)
+
+    def grid(META):
+        return (
+            triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
+        )
+
+    kernel = select_w8a8_block_fp8_matmul_kernel(M, N, config)
+
+    kernel[grid](
+        A,
+        B,
+        C,
+        As,
+        Bs,
+        M,
+        N,
+        K,
+        block_n,
+        block_k,
+        A.stride(-2),
+        A.stride(-1),
+        B.stride(1),
+        B.stride(0),
+        C.stride(-2),
+        C.stride(-1),
+        As.stride(-2),
+        As.stride(-1),
+        Bs.stride(1),
+        Bs.stride(0),
+        **config,
+        needs_masking=needs_masking,
+    )
+
+    return C
+
+
+# universal entry point, for testing purposes
+def w8a8_block_fp8_matmul(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    As: torch.Tensor,
+    Bs: torch.Tensor,
+    block_size: List[int],
+    output_dtype: torch.dtype = torch.float16,
+) -> torch.Tensor:
+    if output_dtype == torch.bfloat16 and deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM:
+        return w8a8_block_fp8_matmul_deepgemm(
+            A, B, As, Bs, block_size, output_dtype=output_dtype
+        )
+
+    return w8a8_block_fp8_matmul_triton(
+        A, B, As, Bs, block_size, output_dtype=output_dtype
+    )
+
+
+# Copied and adapted from https://github.com/triton-lang/triton/blob/main/python/tutorials/10-block-scaled-matmul.py
+@triton.jit
+def _mxfp8_block_scaled_matmul_kernel(  #
+    a_desc,  #
+    a_scale_desc,  #
+    b_desc,  #
+    b_scale_desc,  #
+    c_desc,  #
+    M: tl.constexpr,  #
+    N: tl.constexpr,  #
+    K: tl.constexpr,  #
+    output_type: tl.constexpr,  #
+    BLOCK_M: tl.constexpr,  #
+    BLOCK_N: tl.constexpr,  #
+    BLOCK_K: tl.constexpr,  #
+    rep_m: tl.constexpr,  #
+    rep_n: tl.constexpr,  #
+    rep_k: tl.constexpr,  #
+    NUM_STAGES: tl.constexpr,  #
+):  #
+    if output_type == 0:
+        output_dtype = tl.float32
+    elif output_type == 1:
+        output_dtype = tl.float16
+    elif output_type == 2:
+        output_dtype = tl.bfloat16
+
+    pid = tl.program_id(axis=0)
+    num_pid_m = tl.cdiv(M, BLOCK_M)
+    pid_m = pid % num_pid_m
+    pid_n = pid // num_pid_m
+    offs_am = pid_m * BLOCK_M
+    offs_bn = pid_n * BLOCK_N
+    offs_k_a = 0
+    offs_k_b = 0
+    offs_scale_m = pid_m * rep_m
+    offs_scale_n = pid_n * rep_n
+    offs_scale_k = 0
+
+    VEC_SIZE: tl.constexpr = 32
+
+    accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+    for k in tl.range(0, tl.cdiv(K, BLOCK_K), num_stages=NUM_STAGES):
+        a = a_desc.load([offs_am, offs_k_a])
+        b = b_desc.load([offs_bn, offs_k_b])
+        scale_a = a_scale_desc.load([0, offs_scale_m, offs_scale_k, 0, 0])
+        scale_b = b_scale_desc.load([0, offs_scale_n, offs_scale_k, 0, 0])
+
+        scale_a = (
+            scale_a.reshape(rep_m, rep_k, 32, 4, 4)
+            .trans(0, 3, 2, 1, 4)
+            .reshape(BLOCK_M, BLOCK_K // VEC_SIZE)
+        )
+        scale_b = (
+            scale_b.reshape(rep_n, rep_k, 32, 4, 4)
+            .trans(0, 3, 2, 1, 4)
+            .reshape(BLOCK_N, BLOCK_K // VEC_SIZE)
+        )
+
+        accumulator = tl.dot_scaled(
+            a, scale_a, "e4m3", b.T, scale_b, "e4m3", accumulator
+        )
+
+        offs_k_a += BLOCK_K
+        offs_k_b += BLOCK_K
+        offs_scale_k += rep_k
+
+    c_desc.store([offs_am, offs_bn], accumulator.to(output_dtype))
+
+
+# Copied and adapted from https://github.com/triton-lang/triton/blob/main/python/tutorials/10-block-scaled-matmul.py
+def mxfp8_block_scaled_matmul_triton(
+    a: torch.Tensor,
+    a_scale: torch.Tensor,
+    b: torch.Tensor,
+    b_scale: torch.Tensor,
+    output_dtype: torch.dtype,
+    *,
+    block_m: int = 128,
+    block_n: int = 256,
+    block_k: int = 128,
+    num_stages: Optional[int] = None,
+) -> torch.Tensor:
+    """Block-scaled matmul for MXFP8 using Triton dot_scaled.
+
+    Args:
+        num_stages: Number of pipeline stages. If None, auto-selects based on GPU:
+            SM120: 1, SM100: 4.
+    """
+    if num_stages is None:
+        num_stages = 1 if is_sm120_supported() else (4 if is_sm100_supported() else 1)
+    M, K = a.shape
+    N, K_b = b.shape
+    assert K == K_b
+
+    if output_dtype == torch.float32:
+        output_type = 0
+    elif output_dtype == torch.float16:
+        output_type = 1
+    elif output_dtype == torch.bfloat16:
+        output_type = 2
+    else:
+        raise ValueError(f"Unsupported output dtype: {output_dtype}")
+
+    rep_m = block_m // 128
+    rep_n = block_n // 128
+    rep_k = block_k // 32 // 4
+
+    a_desc = TensorDescriptor.from_tensor(a, [block_m, block_k])
+    b_desc = TensorDescriptor.from_tensor(b, [block_n, block_k])
+
+    scale_block_shape = [1, rep_m, rep_k, 2, 256]
+    a_scale_desc = TensorDescriptor.from_tensor(a_scale, block_shape=scale_block_shape)
+    scale_block_shape = [1, rep_n, rep_k, 2, 256]
+    b_scale_desc = TensorDescriptor.from_tensor(b_scale, block_shape=scale_block_shape)
+
+    output = torch.empty((M, N), dtype=output_dtype, device=a.device)
+    c_desc = TensorDescriptor.from_tensor(output, [block_m, block_n])
+
+    grid = (triton.cdiv(M, block_m) * triton.cdiv(N, block_n), 1)
+    _mxfp8_block_scaled_matmul_kernel[grid](
+        a_desc,
+        a_scale_desc,
+        b_desc,
+        b_scale_desc,
+        c_desc,
+        M,
+        N,
+        K,
+        output_type,
+        block_m,
+        block_n,
+        block_k,
+        rep_m,
+        rep_n,
+        rep_k,
+        num_stages,
+    )
+    return output
+
+
+@triton.jit
+def _per_tensor_quant_mla_fp8_stage1(
+    x_ptr,
+    x_s_ptr,
+    head_size,
+    x_stride_h,
+    x_stride_s,
+    eps,
+    fp8_max,
+    BLOCK_SIZE: tl.constexpr,
+):
+    seq_id = tl.program_id(0)
+    head_id = tl.program_id(1)
+    offset = tl.arange(0, BLOCK_SIZE)
+    mask = offset < head_size
+
+    x_ptr += head_id * x_stride_h + seq_id * x_stride_s
+    x = tl.load(x_ptr + offset, mask=mask, other=0.0).to(tl.float32)
+    _absmax = tl.maximum(tl.max(tl.abs(x)), eps)
+
+    tl.atomic_max(x_s_ptr, _absmax / fp8_max)
+
+
+@triton.jit
+def _per_tensor_quant_mla_fp8_stage2(
+    x_ptr,
+    x_s_ptr,
+    x_q_ptr,
+    num_seq,
+    head_size,
+    x_stride_h,
+    x_stride_s,
+    fp8_min,
+    fp8_max,
+    BLOCK_SIZE: tl.constexpr,
+):
+    seq_id = tl.program_id(0)
+    head_id = tl.program_id(1)
+    offset = tl.arange(0, BLOCK_SIZE)
+    mask = offset < head_size
+
+    x_s = tl.load(x_s_ptr)
+    x_s_inv = 1.0 / x_s
+
+    x_ptr += head_id * x_stride_h + seq_id * x_stride_s
+    x_q_ptr += head_id * num_seq * head_size + seq_id * head_size
+
+    x = tl.load(x_ptr + offset, mask=mask, other=0.0).to(tl.float32)
+    x_q = tl.clamp(x * x_s_inv, fp8_min, fp8_max).to(x_q_ptr.dtype.element_ty)
+    tl.store(x_q_ptr + offset, x_q, mask=mask)
+
+
+def per_tensor_quant_mla_fp8(
+    x: torch.Tensor, x_s_out: torch.Tensor, eps: float = 1e-12
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    This function quantizes input values to float8 values with tensor-wise quantization
+    and specialized for mla absorbed case.
+    """
+    assert x.dim() == 3, "`x` is not a 3d-tensor"
+    assert (
+        x_s_out.shape == (1,)
+        and x_s_out.dtype == torch.float32
+        and x_s_out.device == x.device
+    )
+
+    x_q = x.new_empty(x.size(), dtype=fp8_dtype)
+
+    num_head, num_seq, head_size = x.shape
+    BLOCK_SIZE = triton.next_power_of_2(head_size)
+    grid = (num_seq, num_head)
+
+    _per_tensor_quant_mla_fp8_stage1[grid](
+        x,
+        x_s_out,
+        head_size,
+        x.stride(0),
+        x.stride(1),
+        eps,
+        fp8_max,
+        BLOCK_SIZE,
+    )
+    _per_tensor_quant_mla_fp8_stage2[grid](
+        x,
+        x_s_out,
+        x_q,
+        num_seq,
+        head_size,
+        x.stride(0),
+        x.stride(1),
+        fp8_min,
+        fp8_max,
+        BLOCK_SIZE,
+    )
+
+    return x_q, x_s_out
+
+
+@triton.jit
+def _per_token_group_quant_mla_deep_gemm_masked_fp8(
+    y_ptr,
+    y_q_ptr,
+    y_s_ptr,
+    masked_m_ptr,
+    group_size,
+    y_stride_b,
+    y_stride_t,
+    y_q_stride_b,
+    y_q_stride_t,
+    y_s_stride_b,
+    y_s_stride_g,
+    eps,
+    fp8_min,
+    fp8_max,
+    NUM_GROUP: tl.constexpr,
+    BLOCK: tl.constexpr,
+):
+    """A Triton-accelerated function to perform per-token-group
+    quantization on a tensor for deep_gemm grouped_gemm_masked.
+    This function converts the tensor values into float8 values.
+    y and y_q: (b, t, k)
+    y_s: (b, k//group_size, t)
+    """
+    t_id = tl.program_id(0)
+    b_id = tl.program_id(1)
+
+    y_ptr += b_id * y_stride_b + t_id * y_stride_t
+    y_q_ptr += b_id * y_q_stride_b + t_id * y_q_stride_t
+    y_s_ptr += b_id * y_s_stride_b + t_id
+
+    if t_id == 0:
+        tl.store(masked_m_ptr + b_id, tl.num_programs(0))
+
+    cols = tl.arange(0, BLOCK)  # group_size <= BLOCK
+    mask = cols < group_size
+
+    for gid in range(NUM_GROUP):
+        y = tl.load(y_ptr + gid * group_size + cols, mask=mask, other=0.0).to(
+            tl.float32
+        )
+        _absmax = tl.maximum(tl.max(tl.abs(y)), eps)
+        y_s = _absmax / fp8_max
+        y_q = tl.clamp(y / y_s, fp8_min, fp8_max).to(y_q_ptr.dtype.element_ty)
+
+        tl.store(y_q_ptr + gid * group_size + cols, y_q, mask=mask)
+        tl.store(y_s_ptr + gid * y_s_stride_g, y_s)
+
+
+def per_token_group_quant_mla_deep_gemm_masked_fp8(
+    x: torch.Tensor,
+    group_size: int = 128,
+    eps: float = 1e-12,
+    dtype: torch.dtype = fp8_dtype,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    This function quantizes input values to float8 values with per-token-group-quantization
+    for deep_gemm grouped_gemm_masked and specialized for mla absorbed case.
+    """
+    assert x.dim() == 3, "`x` is not a 3d-tensor"
+
+    b, m, k = x.shape
+    aligned_m = (m + 255) // 256 * 256  # 256 is the max block_m of the gemm kernel
+    num_tiles_k = k // group_size
+    assert num_tiles_k * group_size == k, f"k % {group_size} must be zero"
+
+    x_q = x.new_empty((b, aligned_m, k), dtype=dtype)
+    x_s = x.new_empty((b, num_tiles_k, aligned_m), dtype=torch.float32)
+    masked_m = x.new_empty((b,), dtype=torch.int32)
+
+    BLOCK_SIZE = triton.next_power_of_2(group_size)
+    grid = (m, b)
+
+    _per_token_group_quant_mla_deep_gemm_masked_fp8[grid](
+        x,
+        x_q,
+        x_s,
+        masked_m,
+        group_size,
+        x.stride(0),
+        x.stride(1),
+        x_q.stride(0),
+        x_q.stride(1),
+        x_s.stride(0),
+        x_s.stride(1),
+        eps,
+        -fp8_max,
+        fp8_max,
+        num_tiles_k,
+        BLOCK_SIZE,
+    )
+
+    return x_q, x_s.transpose(1, 2), masked_m, m, aligned_m
+
+
+"""
+Quantize input tensor to FP8 (8-bit floating point) format.
+
+Args:
+    input (torch.Tensor): Input tensor to be quantized
+    scale (Optional[torch.Tensor]): Pre-computed scaling factor for static quantization.
+        If None, scales will be computed dynamically.
+    num_token_padding (Optional[int]): If specified, pad the first dimension
+        of the output to at least this value.
+    use_per_token_if_dynamic (bool): When using dynamic scaling (scale=None),
+        determines the quantization granularity:
+        - True: compute scale per token
+        - False: compute single scale per tensor
+
+Returns:
+    Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
+        - quantized_tensor: The FP8 quantized version of input
+        - scale_tensor: The scaling factors used for quantization
+
+Raises:
+    AssertionError: If input is not 2D or if static scale's numel != 1
+"""
+if _is_hip:
+
+    def _native_dynamic_per_token_quant_fp8(output, input, scale):
+        """Native PyTorch fallback for dynamic per-token FP8 quantization when vLLM is unavailable."""
+        M, N = input.shape
+        eps = 1e-12
+        # Compute per-token scale
+        absmax = input.abs().max(dim=1, keepdim=True).values
+        absmax = torch.clamp(absmax, min=eps)
+        scale_val = absmax / fp8_max
+        scale.copy_(scale_val)
+        # Quantize
+        output_data = torch.clamp(input / scale_val, fp8_min, fp8_max).to(fp8_dtype)
+        output.copy_(output_data)
+
+    def _native_dynamic_per_tensor_quant_fp8(output, input, scale):
+        """Native PyTorch fallback for dynamic per-tensor FP8 quantization when vLLM is unavailable."""
+        eps = 1e-12
+        absmax = input.abs().max()
+        absmax = torch.clamp(absmax, min=eps)
+        scale_val = absmax / fp8_max
+        # Use copy_ instead of fill_ with .item() to avoid CPU-GPU sync
+        scale.view(-1).copy_(scale_val.view(-1))
+        # Quantize
+        output_data = torch.clamp(input / scale_val, fp8_min, fp8_max).to(fp8_dtype)
+        output.copy_(output_data)
+
+    def _native_static_quant_fp8(output, input, scale):
+        """Native PyTorch fallback for static FP8 quantization when vLLM is unavailable."""
+        # Use tensor directly instead of .item() to avoid CPU-GPU sync
+        output_data = torch.clamp(input / scale, fp8_min, fp8_max).to(fp8_dtype)
+        output.copy_(output_data)
+
+    def scaled_fp8_quant(
+        input: torch.Tensor,
+        scale: Optional[torch.Tensor] = None,
+        num_token_padding: Optional[int] = None,
+        use_per_token_if_dynamic: bool = False,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        assert input.ndim == 2, f"Expected 2D input tensor, got {input.ndim}D"
+        shape = input.shape
+        if num_token_padding:
+            shape = (max(num_token_padding, input.shape[0]), shape[1])
+        output = torch.empty(shape, device=input.device, dtype=fp8_dtype)
+
+        if scale is None:
+            # Dynamic scaling
+            if use_per_token_if_dynamic:
+                scale = torch.empty(
+                    (shape[0], 1), device=input.device, dtype=torch.float32
+                )
+                if _use_aiter:
+                    dynamic_per_token_scaled_quant(output, input, scale)
+                elif _has_vllm:
+                    torch.ops._C.dynamic_per_token_scaled_fp8_quant(
+                        output, input.contiguous(), scale, None
+                    )
+                else:
+                    _native_dynamic_per_token_quant_fp8(output, input, scale)
+            else:
+                scale = torch.zeros(1, device=input.device, dtype=torch.float32)
+                if _use_aiter:
+                    dynamic_per_tensor_quant(output, input, scale)
+                elif _has_vllm:
+                    torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale)
+                else:
+                    _native_dynamic_per_tensor_quant_fp8(output, input, scale)
+        else:
+            # Static scaling
+            assert (
+                scale.numel() == 1
+            ), f"Expected scalar scale, got numel={scale.numel()}"
+            if _use_aiter:
+                static_per_tensor_quant(output, input, scale)
+            elif _has_vllm:
+                torch.ops._C.static_scaled_fp8_quant(output, input, scale)
+            else:
+                _native_static_quant_fp8(output, input, scale)
+
+        return output, scale
+
+else:
+
+    def scaled_fp8_quant(
+        input: torch.Tensor,
+        scale: Optional[torch.Tensor] = None,
+        num_token_padding: Optional[int] = None,
+        use_per_token_if_dynamic: bool = False,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+
+        assert input.ndim == 2, f"Expected 2D input tensor, got {input.ndim}D"
+        shape = input.shape
+        if num_token_padding:
+            shape = (max(num_token_padding, input.shape[0]), shape[1])
+        output = torch.empty(shape, device=input.device, dtype=fp8_dtype)
+
+        if scale is None:
+            # Dynamic scaling
+            if use_per_token_if_dynamic:
+                scale = torch.empty(
+                    (shape[0], 1), device=input.device, dtype=torch.float32
+                )
+                sgl_per_token_quant_fp8(input, output, scale)
+            else:
+                scale = torch.zeros(1, device=input.device, dtype=torch.float32)
+                sgl_per_tensor_quant_fp8(
+                    input, output, scale, is_static=False
+                )  # False for dynamic
+        else:
+            # Static scaling
+            assert (
+                scale.numel() == 1
+            ), f"Expected scalar scale, got numel={scale.numel()}"
+            sgl_per_tensor_quant_fp8(
+                input, output, scale, is_static=True
+            )  # True for static
+
+        return output, scale
+
+
+fp8_autotune = triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_M": block_m}, num_warps=num_warps)
+        for block_m in [16, 32, 64, 128]
+        for num_warps in [2, 4, 8]
+    ],
+    key=["K", "BLOCK_K", "M_ALIGNMENT"],
+)
+
+
+@triton.jit
+def _per_token_group_quant_fp8_hopper_moe_mn_major(
+    a,  # (M, K):(K, 1)
+    expert_offsets,  # (num_experts,)
+    problem_sizes,  # (num_experts, 3)
+    a_fp8,  # (M, K):(K, 1)
+    sfa,  # (M, k)
+    K: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    M_ALIGNMENT: tl.constexpr,
+    BLOCK_M: tl.constexpr,  # tune
+):
+    k_offset = tl.program_id(0)
+    expert_id = tl.program_id(1)
+
+    m = tl.load(problem_sizes + expert_id * 3)
+    current_expert_offset = tl.load(expert_offsets + expert_id).to(tl.int64)
+    tl.multiple_of(m, M_ALIGNMENT)
+    tl.multiple_of(current_expert_offset, M_ALIGNMENT)
+
+    coord_k = k_offset * BLOCK_K + tl.arange(0, BLOCK_K)
+    for i in tl.range(tl.cdiv(m, BLOCK_M)):
+        coord_m = i * BLOCK_M + tl.arange(0, BLOCK_M)
+        a_ptrs = a + current_expert_offset * K + coord_m[:, None] * K + coord_k[None, :]
+        a_mask = (coord_m < m)[:, None] & (coord_k < K)[None, :]
+
+        inp = tl.load(a_ptrs, mask=a_mask).to(tl.float32)  # [BLOCK_M, BLOCK_K]
+        inp_amax = tl.max(tl.abs(inp), axis=1)  # [BLOCK_M,]
+        inp_amax = tl.clamp(inp_amax, min=1e-4, max=float("inf"))
+        inp_fp8 = (inp * (448.0 / inp_amax[:, None])).to(tl.float8e4nv)
+
+        # Store fp8
+        a_fp8_ptrs = (
+            a_fp8 + current_expert_offset * K + coord_m[:, None] * K + coord_k[None, :]
+        )
+        tl.store(a_fp8_ptrs, inp_fp8, mask=a_mask)
+
+        # Store sfa
+        k = tl.cdiv(K, BLOCK_K)
+        sfa_ptrs = (
+            sfa + current_expert_offset * k + k_offset * m + coord_m
+        )  # MN-Major with sfa
+        tl.store(sfa_ptrs, inp_amax / 448.0, mask=coord_m < m)
+
+
+if not _is_cpu:
+    _per_token_group_quant_fp8_hopper_moe_mn_major = fp8_autotune(
+        _per_token_group_quant_fp8_hopper_moe_mn_major
+    )
+
+
+def per_token_group_quant_fp8_hopper_moe_mn_major(
+    A: torch.Tensor,
+    expert_offsets: torch.Tensor,
+    problem_sizes: torch.Tensor,
+    group_size: int,
+    expert_tokens_alignment: int = 1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    assert A.dim() == 2
+    assert A.is_contiguous(), "`A` is not contiguous"
+    assert (
+        A.shape[-1] % group_size == 0
+    ), "the last dimension of `A` cannot be divisible by `group_size`"
+
+    a_q = torch.empty_like(A, device=A.device, dtype=fp8_dtype)
+    M, K = A.shape[0], A.shape[1]
+    k = K // group_size
+    sfa = torch.empty((M, k), device=A.device, dtype=torch.float32)
+    num_experts = problem_sizes.shape[0]
+    grid = (k, num_experts)
+    _per_token_group_quant_fp8_hopper_moe_mn_major[grid](
+        A,
+        expert_offsets,
+        problem_sizes,
+        a_q,
+        sfa,
+        K,
+        group_size,
+        expert_tokens_alignment,
+    )
+    return a_q, sfa
+
+
+@triton.jit
+def _per_group_transpose(
+    data_ptr: torch.Tensor,
+    trans_data_ptr: torch.Tensor,
+    expert_offsets: torch.Tensor,
+    k: int,
+    M_ALIGNMENT: tl.constexpr,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+):
+    expert_id = tl.program_id(0)
+    m_id = tl.program_id(1)
+    k_id = tl.program_id(2)
+
+    curr_expert_offset = tl.load(expert_offsets + expert_id)
+    next_expert_offset = tl.load(expert_offsets + expert_id + 1)
+    num_tokens_of_expert = next_expert_offset - curr_expert_offset
+    tl.multiple_of(curr_expert_offset, M_ALIGNMENT)
+    tl.multiple_of(next_expert_offset, M_ALIGNMENT)
+
+    data_start_ptr = data_ptr + curr_expert_offset * k
+    trans_data_start_ptr = trans_data_ptr + curr_expert_offset * k
+
+    k_coord = k_id * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K)
+    k_mask = k_coord < k
+    for start_m in tl.range(0, num_tokens_of_expert, BLOCK_SIZE_M * tl.num_programs(1)):
+        m_coord = start_m + m_id * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+        m_mask = m_coord < num_tokens_of_expert
+        off = m_coord[:, None] * k + k_coord[None, :]
+        trans_off = m_coord[:, None] + k_coord[None, :] * num_tokens_of_expert
+        mask = m_mask[:, None] & k_mask[None, :]
+
+        data = tl.load(data_start_ptr + off, mask=mask)
+        tl.store(trans_data_start_ptr + trans_off, data, mask=mask)
+
+
+def per_group_transpose(
+    a: torch.Tensor,
+    expert_offsets: torch.Tensor,
+    M_ALIGNMENT: int = 1,
+) -> torch.Tensor:
+    assert a.dim() == 2
+    assert a.is_contiguous(), "`a` is not contiguous"
+
+    m, k = a.size()
+    trans_a = torch.empty_like(a)
+    num_experts = expert_offsets.size(0) - 1
+
+    grid = lambda META: (
+        num_experts,
+        triton.cdiv((m + num_experts - 1) // num_experts, META["BLOCK_SIZE_M"]),
+        triton.cdiv(k, META["BLOCK_SIZE_K"]),
+    )
+    _per_group_transpose[grid](
+        a, trans_a, expert_offsets, k, M_ALIGNMENT, BLOCK_SIZE_M=16, BLOCK_SIZE_K=8
+    )
+    return trans_a
+
+
+def is_weak_contiguous(x: torch.Tensor):
+    strides = x.stride()
+    sizes = x.shape
+    is_not_transpose = strides[0] == 1 and (strides[1] >= max(1, sizes[0]))
+    is_transpose = strides[1] == 1 and (strides[0] >= max(1, sizes[1]))
+    return is_transpose or is_not_transpose
+
+
+@triton.jit
+def scaled_mm_kernel(
+    a_ptr,
+    b_ptr,
+    scale_a_ptr,
+    scale_b_ptr,
+    c_ptr,
+    bias_ptr,
+    M,
+    N,
+    K,
+    stride_am,
+    stride_ak,
+    stride_bk,
+    stride_bn,
+    stride_cm,
+    stride_cn,
+    ACCUMULATOR_DTYPE: tl.constexpr,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    BLOCK_SIZE_SCALE_A: tl.constexpr,
+    BLOCK_SIZE_SCALE_B: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+
+    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+
+    pid_m = pid // num_pid_n
+    pid_n = pid % num_pid_n
+
+    accumulator_dtype = ACCUMULATOR_DTYPE
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=accumulator_dtype)
+
+    # NOTE: Some tensor inputs are so large, they will cause int32 overflow
+    # so it is necessary to use tl.int64 for all the offsets, else SEGV will
+    # eventually occur.
+
+    # Offsets and masks.
+    offsets_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64)
+    masks_am = offsets_am < M
+
+    offsets_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)
+    masks_bn = offsets_bn < N
+
+    offsets_k = tl.arange(0, BLOCK_SIZE_K).to(tl.int64)
+    offsets_a = stride_am * offsets_am[:, None] + stride_ak * offsets_k[None, :]
+    offsets_b = stride_bk * offsets_k[:, None] + stride_bn * offsets_bn[None, :]
+
+    # NOTE: BLOCK_SIZE_SCALE_A could be 1 or BLOCK_SIZE_M, so need to create
+    # appropriate offsets and masks for each case. Same goes for
+    # BLOCK_SIZE_SCALE_B.
+    offsets_scale_am = (
+        tl.arange(0, BLOCK_SIZE_SCALE_A)
+        + (BLOCK_SIZE_SCALE_A > 1) * pid_m * BLOCK_SIZE_M
+    )
+    masks_scale_am = offsets_scale_am < M
+
+    offsets_scale_bn = (
+        tl.arange(0, BLOCK_SIZE_SCALE_B)
+        + (BLOCK_SIZE_SCALE_B > 1) * pid_n * BLOCK_SIZE_N
+    )
+    masks_scale_bn = offsets_scale_bn < N
+
+    a_ptrs = a_ptr + offsets_a
+    b_ptrs = b_ptr + offsets_b
+
+    scale_a_ptrs = scale_a_ptr + offsets_scale_am
+    scale_b_ptrs = scale_b_ptr + offsets_scale_bn
+
+    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
+        masks_k = offsets_k < K
+        masks_a = masks_am[:, None] & masks_k[None, :]
+        a = tl.load(a_ptrs, mask=masks_a)
+
+        masks_b = masks_k[:, None] & masks_bn[None, :]
+        b = tl.load(b_ptrs, mask=masks_b)
+
+        # Accumulate results.
+        accumulator = tl.dot(a, b, accumulator, out_dtype=accumulator_dtype)
+
+        offsets_k += BLOCK_SIZE_K
+        a_ptrs += BLOCK_SIZE_K * stride_ak
+        b_ptrs += BLOCK_SIZE_K * stride_bk
+
+    # Apply scale at end.
+    masks_scale_a = masks_scale_am[:, None] & (tl.arange(0, 1) < 1)[:, None]
+    scale_a = tl.load(scale_a_ptrs[:, None], masks_scale_a)
+    # Need to broadcast to the appropriate size, if scale_a is already
+    # (BLOCK_SIZE_M, 1) then it will broadcast to its own shape. Same goes
+    # for scale_b below.
+    scale_a = scale_a.broadcast_to((BLOCK_SIZE_M, 1))
+    accumulator = scale_a * accumulator.to(tl.float32)
+
+    masks_scale_b = masks_scale_bn[:, None] & (tl.arange(0, 1) < 1)[None, :]
+    scale_b = tl.load(scale_b_ptrs[:, None], masks_scale_b)
+    scale_b = scale_b.broadcast_to((BLOCK_SIZE_N, 1))
+    accumulator = scale_b.T * accumulator.to(tl.float32)
+
+    # Convert to output format.
+    c = accumulator.to(c_ptr.type.element_ty)
+
+    # Add bias, it's already in output format, so add it after conversion.
+    if bias_ptr:
+        offsets_bias = offsets_bn
+        bias_ptrs = bias_ptr + offsets_bias
+        bias_mask = offsets_bias < N
+        bias = tl.load(bias_ptrs, bias_mask)
+        c += bias
+
+    # Save output
+    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64)
+    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)
+    offs_cm = offs_cm.to(tl.int64)
+    offs_cn = offs_cn.to(tl.int64)
+    c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
+    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
+
+    tl.store(c_ptrs, c, mask=c_mask)
+
+
+# input  - [M, K]
+# weight - [K, N]
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py
+def triton_scaled_mm(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    scale_a: torch.Tensor,
+    scale_b: torch.Tensor,
+    out_dtype: type[torch.dtype],
+    bias: Optional[torch.Tensor] = None,
+    block_size_m: int = 32,
+    block_size_n: int = 32,
+    block_size_k: int = 32,
+    use_heuristic=True,
+) -> torch.Tensor:
+    M, K = input.shape
+    N = weight.shape[1]
+
+    assert N > 0 and K > 0 and M > 0
+    assert weight.shape[0] == K
+    assert input.dtype == weight.dtype
+
+    scale_a = scale_a.reshape(-1, 1) if scale_a.dim() <= 1 else scale_a
+    scale_b = scale_b.reshape(-1, 1) if scale_b.dim() <= 1 else scale_b
+
+    assert scale_a.dtype == scale_b.dtype and scale_a.is_floating_point()
+    assert scale_a.shape[1] == 1 and (scale_a.shape[0] == 1 or scale_a.shape[0] == M)
+    assert scale_b.shape[1] == 1 and (scale_b.shape[0] == 1 or scale_b.shape[0] == N)
+    assert out_dtype.is_floating_point
+    assert bias is None or bias.is_floating_point()
+    assert is_weak_contiguous(input)
+    assert is_weak_contiguous(weight)
+
+    grid = lambda META: (
+        triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
+    )
+
+    result = torch.empty((M, N), dtype=out_dtype, device=input.device)
+
+    has_scalar = lambda x: x.shape[0] == 1 and x.shape[1] == 1
+
+    if use_heuristic:
+        is_small_N = N < 8192
+        next_power_of_2_M = max(32, triton.next_power_of_2(M))
+        if next_power_of_2_M <= 32:
+            tile_shape = (64, 64, 256) if is_small_N else (64, 128, 256)
+        elif next_power_of_2_M <= 64:
+            tile_shape = (64, 64, 256)
+        elif next_power_of_2_M <= 128:
+            tile_shape = (64, 128, 128)
+        else:
+            tile_shape = (128, 128, 128)
+
+    block_size_m, block_size_n, block_size_k = tile_shape
+
+    block_size_sa = 1 if has_scalar(scale_a) else block_size_m
+    block_size_sb = 1 if has_scalar(scale_b) else block_size_n
+
+    accumulator_dtype = tl.float32 if input.is_floating_point() else tl.int32
+
+    # A = input, B = weight, C = result
+    # A = M x K, B = K x N, C = M x N
+    scaled_mm_kernel[grid](
+        input,
+        weight,
+        scale_a,
+        scale_b,
+        result,
+        bias,
+        M,
+        N,
+        K,
+        input.stride(0),
+        input.stride(1),
+        weight.stride(0),
+        weight.stride(1),
+        result.stride(0),
+        result.stride(1),
+        accumulator_dtype,
+        BLOCK_SIZE_M=block_size_m,
+        BLOCK_SIZE_N=block_size_n,
+        BLOCK_SIZE_K=block_size_k,
+        BLOCK_SIZE_SCALE_A=block_size_sa,
+        BLOCK_SIZE_SCALE_B=block_size_sb,
+    )
+
+    return result.to(out_dtype)
+
+
+if _is_cuda:
+    if enable_sgl_per_token_group_quant_8bit:
+
+        @torch.library.register_fake("sgl_kernel::sgl_per_token_group_quant_8bit")
+        def _(
+            input, output_q, output_s, group_size, eps, fp8_min, fp8_max, scale_ue8m0
+        ):
+            return
+
+    else:
+
+        @torch.library.register_fake("sgl_kernel::sgl_per_token_group_quant_fp8")
+        def _(
+            input, output_q, output_s, group_size, eps, fp8_min, fp8_max, scale_ue8m0
+        ):
+            return
+
+    @torch.library.register_fake("sgl_kernel::sgl_per_token_quant_fp8")
+    def _(input, output_q, output_s):
+        return
diff --git a/sglang/python/sglang/srt/layers/quantization/fp8_utils.py b/sglang/python/sglang/srt/layers/quantization/fp8_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce65b5a01e6af78ccba0b2e6fbe236493f8a630f
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/fp8_utils.py
@@ -0,0 +1,1453 @@
+from __future__ import annotations
+
+import logging
+from enum import Enum
+from functools import lru_cache
+from typing import TYPE_CHECKING, Callable, List, Optional, Tuple
+
+import torch
+
+from sglang.srt.environ import envs
+from sglang.srt.layers import deep_gemm_wrapper
+from sglang.srt.layers.quantization.fp8_kernel import sglang_per_token_group_quant_fp8
+from sglang.srt.layers.quantization.mxfp4_tensor import MXFP4QuantizeUtil
+from sglang.srt.utils.common import torch_release
+
+if TYPE_CHECKING:
+    from sglang.srt.server_args import ServerArgs
+
+from sglang.srt.layers.quantization.fp8_kernel import (
+    fp8_dtype,
+    fp8_max,
+    is_fp8_fnuz,
+    mxfp8_block_scaled_matmul_triton,
+    per_token_group_quant_fp8,
+    scaled_fp8_quant,
+    sglang_per_token_quant_fp8,
+    static_quant_fp8,
+    triton_scaled_mm,
+    w8a8_block_fp8_matmul_deepgemm,
+    w8a8_block_fp8_matmul_triton,
+)
+from sglang.srt.utils import (
+    ceil_align,
+    ceil_div,
+    get_bool_env_var,
+    get_cuda_version,
+    get_device_capability,
+    is_blackwell_supported,
+    is_cuda,
+    is_flashinfer_available,
+    is_hip,
+    is_sm90_supported,
+    is_sm100_supported,
+    is_sm120_supported,
+    offloader,
+)
+
+logger = logging.getLogger(__name__)
+
+_is_hip = is_hip()
+_is_cuda = is_cuda()
+_is_fp8_fnuz = is_fp8_fnuz()
+
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+
+if _use_aiter:
+    import aiter
+
+    # from aiter import gemm_a8w8_blockscale, gemm_a8w8_bpreshuffle, get_hip_quant
+    from aiter import gemm_a8w8_bpreshuffle, get_hip_quant
+    from aiter.ops.triton.gemm_a8w8_blockscale import gemm_a8w8_blockscale
+
+    aiter_per1x128_quant = get_hip_quant(aiter.QuantType.per_1x128)
+
+
+if _is_cuda:
+    from sgl_kernel import fp8_blockwise_scaled_mm, fp8_scaled_mm
+
+    @torch.library.register_fake("sgl_kernel::fp8_scaled_mm")
+    def _fp8_scaled_mm_abstract(mat_a, mat_b, scales_a, scales_b, out_dtype, bias=None):
+        # mat_a: [M, K], mat_b: [K, N] or [N, K] depending on callsite layout; output is [M, N].
+        M = mat_a.shape[-2]
+        N = mat_b.shape[-1]
+        return mat_a.new_empty((M, N), dtype=out_dtype)
+
+    @torch.library.register_fake("sgl_kernel::fp8_blockwise_scaled_mm")
+    def _fp8_blockwise_scaled_mm_abstract(mat_a, mat_b, scales_a, scales_b, out_dtype):
+        # mat_a: [M, K], mat_b: [K, N] or [N, K] depending on callsite layout; output is [M, N].
+        M = mat_a.shape[-2]
+        N = mat_b.shape[-1]
+        return mat_a.new_empty((M, N), dtype=out_dtype)
+
+
+use_vllm_cutlass_w8a8_fp8_kernel = get_bool_env_var("USE_VLLM_CUTLASS_W8A8_FP8_KERNEL")
+use_triton_w8a8_fp8_kernel = get_bool_env_var("USE_TRITON_W8A8_FP8_KERNEL")
+
+# Input scaling factors are no longer optional in _scaled_mm starting
+# from pytorch 2.5. Allocating a dummy tensor to pass as input_scale
+TORCH_DEVICE_IDENTITY = None
+
+
+def use_rowwise_torch_scaled_mm():
+    if _is_hip:
+        # The condition to determine if it is on a platform that supports
+        # torch._scaled_mm rowwise feature.
+        # The condition is determined once as the operations
+        # are time consuming.
+        return get_device_capability() >= (9, 4) and torch_release >= (2, 7)
+    return False
+
+
+USE_ROWWISE_TORCH_SCALED_MM = use_rowwise_torch_scaled_mm()
+
+
+@lru_cache(maxsize=1)
+def cutlass_fp8_supported():
+    if not _is_cuda:
+        return False
+    major, minor = get_device_capability()
+    cuda_version = get_cuda_version()
+    if major >= 9:
+        return cuda_version >= (12, 0)
+    elif major == 8 and minor == 9:
+        return cuda_version >= (12, 4)
+    return False
+
+
+def normalize_e4m3fn_to_e4m3fnuz(
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+    assert weight.dtype == torch.float8_e4m3fn
+    # The bits pattern 10000000(-128) represents zero in e4m3fn
+    # but NaN in e4m3fnuz. So here we set it to 0.
+    # https://onnx.ai/onnx/technical/float8.html
+    weight_as_int8 = weight.view(torch.int8)
+    ROCM_FP8_NAN_AS_INT = -128
+    weight_as_int8[weight_as_int8 == ROCM_FP8_NAN_AS_INT] = 0
+    weight = weight_as_int8.view(torch.float8_e4m3fnuz)
+
+    # For the same bits representation, e4m3fnuz value is half of
+    # the e4m3fn value, so we should double the scaling factor to
+    # get the same dequantized value.
+    # https://onnx.ai/onnx/technical/float8.html
+    weight_scale = weight_scale * 2.0
+    if input_scale is not None:
+        input_scale = input_scale * 2.0
+    return weight, weight_scale, input_scale
+
+
+class Fp8GemmRunnerBackend(Enum):
+    """Enum for FP8 GEMM runner backend selection."""
+
+    AUTO = "auto"
+    FLASHINFER_TRTLLM = "flashinfer_trtllm"
+    FLASHINFER_CUTLASS = "flashinfer_cutlass"
+    FLASHINFER_DEEPGEMM = "flashinfer_deepgemm"
+    CUTLASS = "cutlass"
+    DEEP_GEMM = "deep_gemm"
+    TRITON = "triton"
+    AITER = "aiter"
+
+    def is_auto(self) -> bool:
+        return self == Fp8GemmRunnerBackend.AUTO
+
+    def is_flashinfer_trtllm(self) -> bool:
+        return self == Fp8GemmRunnerBackend.FLASHINFER_TRTLLM
+
+    def is_flashinfer_cutlass(self) -> bool:
+        return self == Fp8GemmRunnerBackend.FLASHINFER_CUTLASS
+
+    def is_flashinfer_deepgemm(self) -> bool:
+        return self == Fp8GemmRunnerBackend.FLASHINFER_DEEPGEMM
+
+    def is_cutlass(self) -> bool:
+        return self == Fp8GemmRunnerBackend.CUTLASS
+
+    def is_deep_gemm(self) -> bool:
+        return self == Fp8GemmRunnerBackend.DEEP_GEMM
+
+    def is_triton(self) -> bool:
+        return self == Fp8GemmRunnerBackend.TRITON
+
+    def is_aiter(self) -> bool:
+        return self == Fp8GemmRunnerBackend.AITER
+
+
+FP8_GEMM_RUNNER_BACKEND: Fp8GemmRunnerBackend | None = None
+
+
+def _check_cutlass_block_fp8_hardware_support() -> bool:
+    """Return True if CUTLASS block FP8 is supported (Hopper or newer with CUDA 12.0+)."""
+    return is_sm90_supported() or is_blackwell_supported()
+
+
+if is_blackwell_supported() and is_flashinfer_available():
+    from flashinfer.gemm import gemm_fp8_nt_groupwise as _raw_gemm_fp8_nt_groupwise
+
+    from sglang.srt.utils.custom_op import register_custom_op
+
+    @lru_cache(maxsize=1)
+    def _get_flashinfer_groupwise_backend() -> str:
+        if get_fp8_gemm_runner_backend().is_flashinfer_cutlass():
+            return "cutlass"
+        if get_fp8_gemm_runner_backend().is_flashinfer_trtllm():
+            return "trtllm"
+
+        major, minor = get_device_capability()
+        # SM120/121: CUTLASS only.
+        # SM100/103: TRTLLM only.
+        if major >= 12:
+            return "cutlass"
+        return "trtllm"
+
+    # Wrap gemm_fp8_nt_groupwise as a custom op so torch.compile does not trace
+    # into flashinfer's JIT compilation code (pathlib/cubin_loader ops).
+    @register_custom_op(
+        op_name="flashinfer_gemm_fp8_nt_groupwise",
+        mutates_args=[],
+        fake_impl=lambda q_input, weight, x_scale, weight_scale, out_dtype: (
+            q_input.new_empty((q_input.shape[0], weight.shape[0]), dtype=out_dtype)
+        ),
+    )
+    def gemm_fp8_nt_groupwise(
+        q_input: torch.Tensor,
+        weight: torch.Tensor,
+        x_scale: torch.Tensor,
+        weight_scale: torch.Tensor,
+        out_dtype: torch.dtype,
+    ) -> torch.Tensor:
+        backend = _get_flashinfer_groupwise_backend()
+        if backend == "cutlass":
+            # FlashInfer CUTLASS groupwise kernel requires contiguous scale tensors
+            x_scale = x_scale.contiguous()
+            weight_scale = weight_scale.contiguous()
+            return _raw_gemm_fp8_nt_groupwise(
+                q_input,
+                weight,
+                x_scale,
+                weight_scale,
+                out_dtype=out_dtype,
+                backend="cutlass",
+                scale_major_mode="MN",
+            )
+        return _raw_gemm_fp8_nt_groupwise(
+            q_input,
+            weight,
+            x_scale,
+            weight_scale,
+            out_dtype=out_dtype,
+            backend=backend,
+        )
+
+
+if is_sm90_supported() and is_flashinfer_available():
+    # FlashInfer SM90 DeepGEMM with automatic swapAB optimization for small M
+    from flashinfer.gemm import fp8_blockscale_gemm_sm90
+
+
+def dispatch_w8a8_block_fp8_linear() -> Callable:
+    """
+    Dispatch to the appropriate FP8 block linear implementation.
+
+    This function selects the backend based on:
+    1. The --fp8-gemm-backend server argument (preferred)
+    2. Auto-detection based on hardware capabilities
+    """
+    backend = get_fp8_gemm_runner_backend()
+
+    # Handle explicit backend selection via --fp8-gemm-backend
+    if not backend.is_auto():
+        return _dispatch_explicit_backend(backend)
+
+    # Auto mode: Select based purely on hardware/backend availability
+    return _dispatch_auto_backend()
+
+
+def _dispatch_explicit_backend(backend: Fp8GemmRunnerBackend) -> Callable:
+    """Dispatch based on explicitly selected backend."""
+    if backend.is_flashinfer_trtllm():
+        if not (is_sm100_supported() and is_flashinfer_available()):
+            raise RuntimeError(
+                "FlashInfer FP8 GEMM requested via --fp8-gemm-backend=flashinfer_trtllm, "
+                "but FlashInfer is not available or not supported on this hardware. "
+                "FlashInfer TRTLLM FP8 GEMM requires SM100/SM103 GPUs and FlashInfer."
+            )
+        return flashinfer_gemm_w8a8_block_fp8_linear_with_fallback
+
+    elif backend.is_flashinfer_cutlass():
+        if not (is_blackwell_supported() and is_flashinfer_available()):
+            raise RuntimeError(
+                "FlashInfer FP8 GEMM requested via --fp8-gemm-backend=flashinfer_cutlass, "
+                "but FlashInfer is not available or not supported on this hardware. "
+                "FlashInfer CUTLASS FP8 GEMM requires Blackwell GPUs and FlashInfer."
+            )
+        return flashinfer_gemm_w8a8_block_fp8_linear_with_fallback
+
+    elif backend.is_flashinfer_deepgemm():
+        if not (is_sm90_supported() and is_flashinfer_available()):
+            raise RuntimeError(
+                "FlashInfer DeepGEMM with swapAB requested via --fp8-gemm-backend=flashinfer_deepgemm, "
+                "but it's not available. This backend requires Hopper (SM90) GPUs and FlashInfer "
+                "to be installed."
+            )
+        return flashinfer_deepgemm_w8a8_block_fp8_linear_with_fallback
+
+    elif backend.is_cutlass():
+        if not _check_cutlass_block_fp8_hardware_support():
+            raise RuntimeError(
+                "CUTLASS block FP8 requested via --fp8-gemm-backend=cutlass, "
+                "but hardware does not support it. CUTLASS block FP8 requires "
+                "Hopper (SM90+) GPUs with CUDA 12.0+."
+            )
+        return cutlass_w8a8_block_fp8_linear_with_fallback
+
+    elif backend.is_aiter():
+        if not _use_aiter:
+            raise RuntimeError(
+                "AITER backend requested via --fp8-gemm-backend=aiter, "
+                "but AITER is not available. AITER requires AMD GPUs with "
+                "SGLANG_USE_AITER=1 environment variable set."
+            )
+        return aiter_w8a8_block_fp8_linear
+
+    elif backend.is_deep_gemm():
+        if not deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM:
+            raise RuntimeError(
+                "DeepGEMM backend requested via --fp8-gemm-backend=deep_gemm, "
+                "but DeepGEMM is not available. This usually means the deep_gemm package "
+                "is not installed or has been disabled via SGLANG_ENABLE_JIT_DEEPGEMM=0."
+            )
+        return deepgemm_w8a8_block_fp8_linear_with_fallback
+
+    elif backend.is_triton():
+        return triton_w8a8_block_fp8_linear
+
+    else:
+        raise ValueError(f"Unknown FP8 GEMM backend: {backend}")
+
+
+def _dispatch_auto_backend() -> Callable:
+    """Auto-select the best backend based on hardware capabilities."""
+    # Priority order for auto selection:
+    # 1. DeepGEMM (if enabled and available)
+    # 2. FlashInfer TRTLLM (if Blackwell GPU and FlashInfer available)
+    # 3. CUTLASS (if Hopper+ GPU and CUDA 12.0+)
+    # 4. AITER (if AMD GPU with AITER enabled)
+    # 5. Triton (fallback)
+
+    if deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM:
+        return deepgemm_w8a8_block_fp8_linear_with_fallback
+    elif is_blackwell_supported() and is_flashinfer_available():
+        return flashinfer_gemm_w8a8_block_fp8_linear_with_fallback
+    elif _check_cutlass_block_fp8_hardware_support():
+        return cutlass_w8a8_block_fp8_linear_with_fallback
+    elif _use_aiter:
+        return aiter_w8a8_block_fp8_linear
+    else:
+        return triton_w8a8_block_fp8_linear
+
+
+def initialize_fp8_gemm_config(server_args: ServerArgs) -> None:
+    """Initialize FP8 GEMM configuration."""
+    global FP8_GEMM_RUNNER_BACKEND
+
+    backend = server_args.fp8_gemm_runner_backend
+
+    # TODO(brayden): Remove env-based overrides in v0.5.7, they will be fully removed in v0.5.7.
+    # Only check environment variables when the server args is not set, server args should take priority.
+    if backend == "auto":
+        if envs.SGLANG_ENABLE_FLASHINFER_FP8_GEMM.get():
+            backend = "flashinfer_trtllm"
+        elif envs.SGLANG_SUPPORT_CUTLASS_BLOCK_FP8.get():
+            backend = "cutlass"
+    else:
+        if (
+            envs.SGLANG_ENABLE_FLASHINFER_FP8_GEMM.get()
+            or envs.SGLANG_SUPPORT_CUTLASS_BLOCK_FP8.get()
+        ):
+            logger.warning(
+                f"FP8 GEMM backend set to '{backend}' via --fp8-gemm-backend overrides "
+                "environment variables SGLANG_ENABLE_FLASHINFER_FP8_GEMM and "
+                "SGLANG_SUPPORT_CUTLASS_BLOCK_FP8. Using server argument value."
+            )
+
+    if backend == "auto" and is_sm120_supported():
+        # TODO(brayden): Verify if CUTLASS can be set by default once SwapAB is supported
+        backend = "triton"
+
+    FP8_GEMM_RUNNER_BACKEND = Fp8GemmRunnerBackend(backend)
+
+
+def get_fp8_gemm_runner_backend() -> Fp8GemmRunnerBackend:
+    """Get the current FP8 GEMM runner backend."""
+    global FP8_GEMM_RUNNER_BACKEND
+    if FP8_GEMM_RUNNER_BACKEND is None:
+        FP8_GEMM_RUNNER_BACKEND = Fp8GemmRunnerBackend.AUTO
+    return FP8_GEMM_RUNNER_BACKEND
+
+
+def flashinfer_gemm_w8a8_block_fp8_linear_with_fallback(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    block_size: List[int],
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    assert input_scale is None
+
+    input_2d = input.view(-1, input.shape[-1])
+    backend = _get_flashinfer_groupwise_backend()
+    # TRTLLM backend requires K dimension >= 256.
+    if backend == "trtllm" and input_2d.shape[1] < 256:
+        return triton_w8a8_block_fp8_linear(
+            input, weight, block_size, weight_scale, input_scale, bias
+        )
+
+    output_shape = [*input.shape[:-1], weight.shape[0]]
+
+    # TRTLLM uses the existing SGLang column-major scale layout.
+    # CUTLASS with scale_major_mode="MN" expects (k//block_k, m), so we normalize below.
+    q_input, x_scale = sglang_per_token_group_quant_fp8(
+        input_2d, block_size[1], column_major_scales=(backend == "trtllm")
+    )
+    if backend == "cutlass":
+        block_n, block_k = block_size
+        m, k = input_2d.shape
+        n = weight.shape[0]
+        expected_x_scale_shape = (k // block_k, m)
+        expected_weight_scale_shape = (k // block_k, n // block_n)
+        if x_scale.shape == (m, k // block_k):
+            x_scale = x_scale.transpose(-1, -2).contiguous()
+        if weight_scale.shape == (n // block_n, k // block_k):
+            weight_scale = weight_scale.transpose(-1, -2).contiguous()
+        assert x_scale.shape == expected_x_scale_shape, (
+            "FlashInfer CUTLASS groupwise FP8 expects A scale layout "
+            f"(k//block_k, m) for scale_major_mode='MN', got {tuple(x_scale.shape)}; "
+            f"expected {expected_x_scale_shape}. "
+            f"strides={x_scale.stride()} is_contiguous={x_scale.is_contiguous()} "
+            f"m={m} n={n} k={k} block_size={block_size}"
+        )
+        assert weight_scale.shape == expected_weight_scale_shape, (
+            "FlashInfer CUTLASS groupwise FP8 expects B scale layout "
+            f"(k//block_k, n//block_n) for scale_major_mode='MN', got {tuple(weight_scale.shape)}; "
+            f"expected {expected_weight_scale_shape}. "
+            f"strides={weight_scale.stride()} is_contiguous={weight_scale.is_contiguous()} "
+            f"m={m} n={n} k={k} block_size={block_size}"
+        )
+        assert x_scale.dtype == torch.float32, (
+            "FlashInfer CUTLASS groupwise FP8 expects x_scale dtype float32, "
+            f"got {x_scale.dtype}."
+        )
+        assert weight_scale.dtype == torch.float32, (
+            "FlashInfer CUTLASS groupwise FP8 expects weight_scale dtype float32, "
+            f"got {weight_scale.dtype}."
+        )
+    # TRTLLM path continues using the original quantized scale layout.
+    output = gemm_fp8_nt_groupwise(
+        q_input,
+        weight,
+        x_scale,
+        weight_scale,
+        out_dtype=input_2d.dtype,
+    )
+
+    if bias is not None:
+        output += bias
+
+    return output.to(dtype=input_2d.dtype).view(*output_shape)
+
+
+def flashinfer_deepgemm_w8a8_block_fp8_linear_with_fallback(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    block_size: List[int],
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    """
+    FlashInfer DeepGEMM backend for SM90 (Hopper) with swapAB optimization.
+
+    Uses flashinfer.gemm.fp8_blockscale_gemm_sm90 which automatically selects
+    the swapAB kernel for small M dimensions (M < 32) for better performance
+    during decoding/low batch size scenarios.
+
+    For SM90 (Hopper), this uses the DeepGEMM JIT with automatic swapAB selection.
+    """
+    assert input_scale is None
+
+    output_dtype = input.dtype
+    dtype_supported = output_dtype == torch.bfloat16
+
+    # fp8_blockscale_gemm_sm90 requires: N % 64 == 0, K % 128 == 0
+    shape_supported = weight.shape[0] % 64 == 0 and weight.shape[1] % 128 == 0
+
+    if not (shape_supported and dtype_supported):
+        if weight_scale.dtype == torch.int32:
+            weight_scale = _unpack_ue8m0_scale_for_triton(
+                weight_scale, weight.shape, block_size
+            )
+        return triton_w8a8_block_fp8_linear(
+            input, weight, block_size, weight_scale, input_scale, bias
+        )
+
+    input_2d = input.view(-1, input.shape[-1])
+    output_shape = [*input.shape[:-1], weight.shape[0]]
+
+    # - input: (M, K) BF16 or FP8
+    # - weight: (N, K) FP8 with weight_scale
+    # - weight_scale: (N, K//128) for per-token or (N//128, K//128) for per-block
+
+    output = fp8_blockscale_gemm_sm90(
+        input_2d,
+        weight,
+        input_scale=None,  # BF16 input, internal quantization
+        weight_scale=weight_scale,
+        out_dtype=output_dtype,
+    )
+
+    if bias is not None:
+        output += bias
+    return output.view(*output_shape)
+
+
+def cutlass_w8a8_block_fp8_linear_with_fallback(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    block_size: List[int],
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    assert input_scale is None
+
+    # TODO: add more robust shape check here
+    shape_supported = weight.shape[0] % 128 == 0 and weight.shape[1] % 128 == 0
+
+    if not shape_supported:
+        # fallback to triton
+        return triton_w8a8_block_fp8_linear(
+            input, weight, block_size, weight_scale, input_scale, bias
+        )
+
+    input_2d = input.view(-1, input.shape[-1])
+    output_shape = [*input.shape[:-1], weight.shape[0]]
+
+    q_input, x_scale = per_token_group_quant_fp8(
+        input_2d, block_size[1], column_major_scales=True
+    )
+    output = fp8_blockwise_scaled_mm(
+        q_input, weight.T, x_scale, weight_scale.T, out_dtype=input_2d.dtype
+    )
+    if bias is not None:
+        output += bias
+    return output.to(dtype=input_2d.dtype).view(*output_shape)
+
+
+def deepgemm_w8a8_block_fp8_linear_with_fallback(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    block_size: List[int],
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    assert input_scale is None
+
+    output_dtype = input.dtype
+    dtype_supported = output_dtype == torch.bfloat16
+
+    # TODO: https://github.com/sgl-project/sglang/pull/6890#issuecomment-2943395737
+    shape_supported = weight.shape[0] % 64 == 0 and weight.shape[1] % 128 == 0
+
+    if not (shape_supported and dtype_supported):
+        # fall back to triton
+        # If weight_scale is in UE8M0 packed format (int32), convert back to float32
+        # UE8M0 format has shape (N, K//block_k//4) with dtype int32
+        # Triton expects shape (N//block_n, K//block_k) with dtype float32
+        if weight_scale.dtype == torch.int32:
+            weight_scale = _unpack_ue8m0_scale_for_triton(
+                weight_scale, weight.shape, block_size
+            )
+        return triton_w8a8_block_fp8_linear(
+            input, weight, block_size, weight_scale, input_scale, bias
+        )
+
+    input_2d = input.view(-1, input.shape[-1])
+    output_shape = [*input.shape[:-1], weight.shape[0]]
+
+    q_input, x_scale = sglang_per_token_group_quant_fp8(
+        input_2d,
+        block_size[1],
+        column_major_scales=True,
+        scale_tma_aligned=True,
+        scale_ue8m0=deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0,
+    )
+
+    output = w8a8_block_fp8_matmul_deepgemm(
+        q_input, weight, x_scale, weight_scale, block_size, output_dtype=output_dtype
+    )
+    if bias is not None:
+        output += bias
+    return output.to(dtype=output_dtype).view(*output_shape)
+
+
+def _unpack_ue8m0_scale_for_triton(
+    sf_packed: torch.Tensor,
+    weight_shape: Tuple[int, int],
+    block_size: List[int],
+) -> torch.Tensor:
+    """
+    Unpack UE8M0 packed scale tensor back to float32 format for triton kernel.
+
+    The UE8M0 format packs scales as:
+    - Shape: (N, K//block_k//4) with dtype int32
+    - Each int32 contains 4 uint8 scale values
+
+    Triton expects:
+    - Shape: (N//block_n, K//block_k) with dtype float32
+
+    Args:
+        sf_packed: Packed scale tensor with shape (N, packed_k_groups) and dtype int32
+        weight_shape: (N, K) shape of the weight tensor
+        block_size: [block_n, block_k] quantization block size
+
+    Returns:
+        Unpacked scale tensor with shape (n_groups, k_groups) and dtype float32
+    """
+    assert sf_packed.dtype == torch.int32
+    assert len(sf_packed.shape) == 2
+
+    N, K = weight_shape
+    block_n, block_k = block_size
+    n_groups = ceil_div(N, block_n)
+    k_groups = ceil_div(K, block_k)
+
+    mn_repeat, k_div_4 = sf_packed.shape
+    k_packed = k_div_4 * 4
+
+    # Unpack int32 -> 4x uint8 -> float32
+    # Each uint8 represents an exponent in UE8M0 format
+    sf_u8 = sf_packed.contiguous().view(torch.uint8).view(mn_repeat, k_packed)
+    sf_fp32 = (sf_u8.to(torch.int32) << 23).view(torch.float32)
+
+    # Handle row dimension - may have 128x replication or direct mapping
+    if mn_repeat == N:
+        # Rows are replicated 128 times, take every 128th row
+        # sf_fp32 shape: (N, k_packed) -> (n_groups, k_packed)
+        # Select representative rows at indices 0, 128, 256, ...
+        indices = torch.arange(0, N, block_n, device=sf_packed.device)
+        sf_fp32 = sf_fp32.index_select(0, indices)
+    elif mn_repeat == n_groups:
+        # Already in the correct n_groups format
+        pass
+    else:
+        raise ValueError(
+            f"Unexpected scale shape: sf_packed.shape={sf_packed.shape}, "
+            f"weight_shape={weight_shape}, block_size={block_size}"
+        )
+
+    # Crop k dimension to expected size (remove padding if any)
+    sf_fp32 = sf_fp32[:, :k_groups].contiguous()
+
+    return sf_fp32
+
+
+def aiter_w8a8_block_fp8_linear(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    block_size: List[int],
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    # assert input_scale is None
+    input_2d = input.view(-1, input.shape[-1])
+    output_shape = [*input.shape[:-1], weight.shape[0]]
+
+    # if input_scale not None, input is quanted
+    if input_scale is not None:
+        q_input = input_2d
+        x_scale = input_scale
+
+    else:
+        q_input, x_scale = aiter_per1x128_quant(input_2d, quant_dtype=aiter.dtypes.fp8)
+
+    output = gemm_a8w8_blockscale(
+        q_input,
+        weight,
+        x_scale,
+        weight_scale,
+        dtype=torch.bfloat16 if input_scale is not None else input.dtype,
+    )
+
+    if bias is not None:
+        output += bias
+
+    return output.to(
+        dtype=torch.bfloat16 if input_scale is not None else input_2d.dtype
+    ).view(*output_shape)
+
+
+def triton_w8a8_block_fp8_linear(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    block_size: List[int],
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    assert input_scale is None
+    input_2d = input.view(-1, input.shape[-1])
+    output_shape = [*input.shape[:-1], weight.shape[0]]
+
+    q_input, x_scale = per_token_group_quant_fp8(
+        input_2d, block_size[1], column_major_scales=False
+    )
+    output = w8a8_block_fp8_matmul_triton(
+        q_input, weight, x_scale, weight_scale, block_size, output_dtype=input_2d.dtype
+    )
+    if bias is not None:
+        output += bias
+    return output.to(dtype=input_2d.dtype).view(*output_shape)
+
+
+@lru_cache(maxsize=1)
+def _get_triton_mxfp8_downcast():
+    try:
+        from triton_kernels.numerics_details.mxfp import downcast_to_mxfp
+    except Exception as err:
+        raise RuntimeError(
+            "MXFP8 quantization requires triton_kernels with MXFP8 support."
+        ) from err
+    return downcast_to_mxfp
+
+
+def mxfp8_group_quantize(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Quantize a 2D contiguous tensor to MXFP8 with UE8M0 scales per group (32)."""
+    assert x.dim() == 2, f"Expected 2D input, got {x.dim()}D"
+    assert x.is_contiguous(), "MXFP8 quantization requires a contiguous 2D tensor."
+    _, k = x.shape
+    assert k % 32 == 0, f"{k=} must be divisible by 32"
+    downcast_to_mxfp = _get_triton_mxfp8_downcast()
+    q_input, scale_u8 = downcast_to_mxfp(x, torch.float8_e4m3fn, axis=1)
+    return q_input.contiguous(), scale_u8.contiguous()
+
+
+def _pack_mxfp8_scales(scale_u8: torch.Tensor) -> torch.Tensor:
+    # Pack (M, K//32) UE8M0 scales into the layout expected by tl.dot_scaled.
+    assert scale_u8.dim() == 2, f"Expected 2D scale tensor, got {scale_u8.dim()}D"
+    scale_u8 = scale_u8.contiguous()
+    m, k_groups = scale_u8.shape
+    assert (
+        k_groups % 4 == 0
+    ), f"{k_groups=} must be divisible by 4 (K must be multiple of 128)"
+
+    scale_m = ceil_div(m, 128)
+    if m % 128 != 0:
+        pad_rows = scale_m * 128 - m
+        pad = torch.full(
+            (pad_rows, k_groups),
+            127,
+            dtype=scale_u8.dtype,
+            device=scale_u8.device,
+        )
+        scale_u8 = torch.cat([scale_u8, pad], dim=0)
+
+    scale_k = k_groups // 4
+    scale_u8 = scale_u8.view(scale_m, 128, scale_k, 4)
+    scale_u8 = scale_u8.view(scale_m, 4, 32, scale_k, 4)
+    packed = scale_u8.permute(0, 3, 2, 1, 4).contiguous()
+    return packed.view(1, scale_m, scale_k, 2, 256)
+
+
+def triton_mxfp8_blockscaled_linear(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+    output_dtype: Optional[torch.dtype] = None,
+) -> torch.Tensor:
+    if not (_is_cuda and (is_sm100_supported() or is_sm120_supported())):
+        raise RuntimeError("MXFP8 dense linear requires Blackwell GPUs (SM100/SM120).")
+
+    input_2d = input.view(-1, input.shape[-1]).contiguous()
+    output_shape = [*input.shape[:-1], weight.shape[0]]
+
+    block_m = 128
+    block_n = 256 if weight.shape[0] % 256 == 0 else 128
+    block_k = 128
+
+    m, k = input_2d.shape
+    n, k_w = weight.shape
+    assert k == k_w, f"{k=} does not match {k_w=}"
+    assert k % 128 == 0, f"{k=} must be divisible by 128 for MXFP8"
+    assert n % block_n == 0, f"{n=} must be divisible by {block_n}"
+    assert weight.dtype == torch.float8_e4m3fn, "MXFP8 weight must be FP8 E4M3."
+    assert weight_scale.dtype == torch.uint8, "MXFP8 weight_scale must be UE8M0 uint8."
+
+    if input_scale is None:
+        q_input, x_scale_u8 = mxfp8_group_quantize(input_2d)
+    else:
+        q_input = input_2d
+        x_scale_u8 = input_scale
+        assert x_scale_u8.dtype == torch.uint8, "MXFP8 input_scale must be UE8M0 uint8."
+        assert x_scale_u8.shape == (m, k // 32)
+
+    if output_dtype is None:
+        if input_2d.dtype in (torch.float16, torch.bfloat16, torch.float32):
+            output_dtype = input_2d.dtype
+        else:
+            output_dtype = torch.bfloat16
+
+    if m % block_m != 0:
+        pad_rows = ceil_div(m, block_m) * block_m - m
+        q_input = torch.cat(
+            [
+                q_input,
+                torch.zeros((pad_rows, k), device=q_input.device, dtype=q_input.dtype),
+            ],
+            dim=0,
+        )
+        pad_scale = torch.full(
+            (pad_rows, k // 32),
+            127,
+            device=x_scale_u8.device,
+            dtype=x_scale_u8.dtype,
+        )
+        x_scale_u8 = torch.cat([x_scale_u8, pad_scale], dim=0)
+
+    a_scale_packed = _pack_mxfp8_scales(x_scale_u8)
+    b_scale_packed = _pack_mxfp8_scales(weight_scale)
+
+    num_stages = 1 if is_sm120_supported() else (4 if is_sm100_supported() else 1)
+    output = mxfp8_block_scaled_matmul_triton(
+        q_input,
+        a_scale_packed,
+        weight.contiguous(),
+        b_scale_packed,
+        output_dtype=output_dtype,
+        block_m=block_m,
+        block_n=block_n,
+        block_k=block_k,
+        num_stages=num_stages,
+    )
+    output = output[:m, :]
+    if bias is not None:
+        output += bias
+    return output.to(dtype=output_dtype).view(*output_shape)
+
+
+def dequant_mxfp4(
+    w_block: torch.Tensor,
+    w_scale: torch.Tensor,
+    out_dtype,
+) -> torch.Tensor:
+    """
+    :param w_block: (batch, n, k, 16), uint8, pack two mxfp4 into one byte
+    :param w_scale: (batch, n, k), uint8
+    :return: (batch, n, k * 32), float32
+    """
+
+    assert w_block.dtype == torch.uint8
+    assert w_scale.dtype == torch.uint8
+
+    batch, n, k, pack_dim = w_block.shape
+    batch_, n_, k_ = w_scale.shape
+    assert pack_dim == 16
+    assert batch == batch_
+    assert n == n_
+    assert k == k_
+
+    out_raw = MXFP4QuantizeUtil.dequantize(
+        quantized_data=w_block, scale=w_scale, dtype=out_dtype, block_sizes=[32]
+    )
+    return out_raw.reshape(batch, n, k * 32)
+
+
+def input_to_float8(
+    x: torch.Tensor, dtype: torch.dtype = fp8_dtype
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """This function quantizes input values to float8 values with tensor-wise quantization."""
+    min_val, max_val = x.aminmax()
+    amax = torch.maximum(min_val.abs(), max_val.abs()).float().clamp(min=1e-12)
+
+    if _is_fp8_fnuz:
+        dtype = fp8_dtype
+        fp_max = fp8_max
+    else:
+        finfo = torch.finfo(dtype)
+        fp_max = finfo.max
+
+    scale = fp_max / amax
+    x_scl_sat = (x.float() * scale).clamp(min=-fp_max, max=fp_max)
+    return x_scl_sat.to(dtype).contiguous(), scale.float().reciprocal()
+
+
+def block_quant_to_tensor_quant(
+    x_q_block: torch.Tensor,
+    x_s: torch.Tensor,
+    block_size: List[int],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """This function converts block-wise quantization to tensor-wise quantization.
+    The inputs are block-wise quantization tensor `x_q_block`, block-wise quantization scale
+    and the block size.
+    The outputs are tensor-wise quantization tensor and tensor-wise quantization scale.
+    Note only float8 is supported for now.
+    """
+    block_n, block_k = block_size[0], block_size[1]
+    n, k = x_q_block.shape
+    n_tiles = (n + block_n - 1) // block_n
+    k_tiles = (k + block_k - 1) // block_k
+    assert n_tiles == x_s.shape[0]
+    assert k_tiles == x_s.shape[1]
+
+    x_dq_block = x_q_block.to(torch.float32)
+
+    x_dq_block_tiles = [
+        [
+            x_dq_block[
+                j * block_n : min((j + 1) * block_n, n),
+                i * block_k : min((i + 1) * block_k, k),
+            ]
+            for i in range(k_tiles)
+        ]
+        for j in range(n_tiles)
+    ]
+
+    for i in range(k_tiles):
+        for j in range(n_tiles):
+            x_dq_block_tiles[j][i][:, :] = x_dq_block_tiles[j][i] * x_s[j][i]
+
+    x_q_tensor, scale = (
+        scaled_fp8_quant(x_dq_block)
+        if _is_cuda
+        else input_to_float8(x_dq_block, dtype=x_q_block.dtype)
+    )
+    return x_q_tensor, scale
+
+
+def block_quant_dequant(
+    x_q_block: torch.Tensor,
+    x_s: torch.Tensor,
+    block_size: List[int],
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    """This function converts block-wise quantization to unquantized.
+    The inputs are block-wise quantization tensor `x_q_block`, block-wise quantization scale
+    and the block size.
+    The output is an unquantized tensor with dtype.
+    """
+    block_n, block_k = block_size[0], block_size[1]
+    *_, n, k = x_q_block.shape
+
+    # ... n_scale k_scale -> ... (n_scale block_n) (k_scale block_k)
+    x_scale_repeat = x_s.repeat_interleave(block_n, dim=-2).repeat_interleave(
+        block_k, dim=-1
+    )
+    x_scale_repeat = x_scale_repeat[..., :n, :k]
+
+    return (x_q_block.to(torch.float32) * x_scale_repeat).to(dtype)
+
+
+def requant_weight_ue8m0_inplace(weight, weight_scale_inv, weight_block_size):
+    assert isinstance(weight, torch.nn.Parameter)
+    assert isinstance(weight_scale_inv, torch.nn.Parameter)
+
+    new_weight, new_weight_scale_inv = requant_weight_ue8m0(
+        weight.to(weight_scale_inv.device), weight_scale_inv, weight_block_size
+    )
+
+    offloader.update_param(weight, new_weight)
+    weight_scale_inv.data = new_weight_scale_inv
+
+
+def requant_weight_ue8m0(
+    weight: torch.Tensor,
+    weight_scale_inv: torch.Tensor,
+    weight_block_size: List[int],
+):
+    assert weight_block_size == [128, 128]
+
+    *_, n, k = weight.shape
+
+    weight_dequant = block_quant_dequant(
+        weight,
+        weight_scale_inv,
+        weight_block_size,
+        torch.bfloat16,
+    )
+
+    out_w, out_s = quant_weight_ue8m0(
+        weight_dequant=weight_dequant,
+        weight_block_size=weight_block_size,
+    )
+
+    out_s = transform_scale_ue8m0(out_s, mn=out_w.shape[-2])
+
+    return out_w, out_s
+
+
+def quant_weight_ue8m0(
+    weight_dequant: torch.Tensor,
+    weight_block_size: List[int],
+):
+    assert weight_block_size == [128, 128]
+    assert (
+        weight_dequant.dtype == torch.bfloat16
+    ), f"{weight_dequant.dtype=} {weight_dequant.shape=}"
+
+    *batch_dims, n, k = weight_dequant.shape
+
+    weight_dequant_flat = weight_dequant.view((-1, k))
+    out_w_flat, out_s_flat = per_block_cast_to_fp8(weight_dequant_flat)
+
+    out_w = out_w_flat.view((*batch_dims, n, k))
+    out_s = out_s_flat.view(
+        (
+            *batch_dims,
+            ceil_div(n, weight_block_size[0]),
+            ceil_div(k, weight_block_size[1]),
+        )
+    )
+
+    return out_w, out_s
+
+
+def transform_scale_ue8m0_inplace(param, mn):
+    param.data = transform_scale_ue8m0(param.data, mn=mn)
+
+
+# NOTE copy and modified from DeepGEMM
+def transform_scale_ue8m0(sf, mn, use_torch_impl: bool = False):
+    import deep_gemm.utils.layout
+
+    get_mn_major_tma_aligned_packed_ue8m0_tensor = (
+        _get_mn_major_tma_aligned_packed_ue8m0_tensor_torch_impl
+        if use_torch_impl
+        else deep_gemm.utils.layout.get_mn_major_tma_aligned_packed_ue8m0_tensor
+    )
+
+    sf = sf.index_select(-2, torch.arange(mn, device=sf.device) // 128)
+    sf = get_mn_major_tma_aligned_packed_ue8m0_tensor(sf)
+    return sf
+
+
+# Copied from DeepGEMM tests
+def _get_mn_major_tma_aligned_packed_ue8m0_tensor_torch_impl(
+    x: torch.Tensor,
+) -> torch.Tensor:
+    from deep_gemm.utils import align, get_tma_aligned_size
+
+    assert x.dtype == torch.float and x.dim() in (2, 3)
+
+    # First, convert into UE8M0 `uint8_t`
+    ue8m0_tensor = (x.view(torch.int) >> 23).to(torch.uint8)
+
+    # Second, make padded packed tensors
+    mn, k = x.shape[-2], x.shape[-1]
+    remove_dim = False
+    if x.dim() == 2:
+        x, remove_dim = x.unsqueeze(0), True
+    b = x.shape[0]
+    aligned_mn = get_tma_aligned_size(mn, 4)
+    aligned_k = align(k, 4)
+    padded = torch.zeros((b, aligned_mn, aligned_k), device=x.device, dtype=torch.uint8)
+    padded[:, :mn, :k] = ue8m0_tensor
+    padded = padded.view(-1).view(dtype=torch.int).view(b, aligned_mn, aligned_k // 4)
+
+    # Finally, transpose
+    transposed = torch.zeros(
+        (b, aligned_k // 4, aligned_mn), device=x.device, dtype=torch.int
+    ).mT
+    transposed[:, :, :] = padded
+    aligned_x = transposed[:, :mn, :]
+    return aligned_x.squeeze(0) if remove_dim else aligned_x
+
+
+def inverse_transform_scale_ue8m0(sf_packed, mn):
+    sf_fp32 = _inverse_transform_scale_ue8m0_impl(sf_packed)
+    # Can call consistency check every time since this is only called on startup
+    sf_packed_recreated = transform_scale_ue8m0(sf_fp32, mn=mn, use_torch_impl=True)
+    assert torch.all(
+        sf_packed == sf_packed_recreated
+    ), f"{sf_packed=} {sf_packed_recreated=} {sf_fp32=}"
+    return sf_fp32
+
+
+# Inverse impl can refer to DeepGEMM's torch impl in get_mn_major_tma_aligned_packed_ue8m0_tensor_torch_impl
+def _inverse_transform_scale_ue8m0_impl(sf_packed):
+    """
+    NOTE: We assume k is aligned
+    :param sf_packed: (scale_mn, scale_k/4) int32
+    :return: (scale_mn, scale_k), float32
+    """
+    if len(sf_packed.shape) == 3:
+        return torch.stack(
+            [_inverse_transform_scale_ue8m0_impl(x) for x in sf_packed], dim=0
+        )
+
+    block_size = 128
+    assert len(sf_packed.shape) == 2, f"{sf_packed.shape=}"
+    assert sf_packed.dtype == torch.int32
+
+    mn_repeat_128, k_div_4 = sf_packed.shape
+    mn = mn_repeat_128 // block_size
+    k = k_div_4 * 4
+
+    # packed u8 -> fp32
+    sf_u8 = sf_packed.contiguous().flatten().view(torch.uint8).view(mn_repeat_128, k)
+    sf_fp32 = (sf_u8.to(torch.int32) << 23).view(torch.float32)
+
+    # remove repeat
+    sf_reshaped = sf_fp32.view(mn, block_size, k)
+    sf_unrepeated = sf_reshaped[:, 0:1, :]
+    if not torch.all(sf_unrepeated == sf_reshaped):
+        from sglang.srt.debug_utils.dumper import get_tensor_info
+
+        raise AssertionError(
+            f"sf_unrepeated != sf_reshaped ({get_tensor_info(sf_unrepeated)=} {get_tensor_info(sf_reshaped)=})"
+        )
+    sf_unrepeated = sf_unrepeated.squeeze(1).contiguous()
+
+    assert sf_unrepeated.shape == (mn, k)
+    return sf_unrepeated
+
+
+# COPIED FROM DeepGEMM
+def per_block_cast_to_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    assert x.dim() == 2
+    m, n = x.shape
+    x_padded = torch.zeros(
+        (ceil_align(m, 128), ceil_align(n, 128)), dtype=x.dtype, device=x.device
+    )
+    x_padded[:m, :n] = x
+    x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
+    x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
+    sf = ceil_to_ue8m0(x_amax / 448.0)
+    x_scaled = (x_view * (1.0 / sf)).to(torch.float8_e4m3fn)
+    return x_scaled.view_as(x_padded)[:m, :n].contiguous(), sf.view(
+        x_view.size(0), x_view.size(2)
+    )
+
+
+# COPIED FROM DeepGEMM
+def ceil_to_ue8m0(x: torch.Tensor):
+    return torch.pow(2.0, torch.ceil(torch.log2(x.abs())))
+
+
+def channel_quant_to_tensor_quant(
+    x_q_channel: torch.Tensor,
+    x_s: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    x_dq_channel = x_q_channel.to(torch.float32) * x_s
+    x_q_tensor, scale = (
+        scaled_fp8_quant(x_dq_channel)
+        if _is_cuda
+        else input_to_float8(x_dq_channel, dtype=x_q_channel.dtype)
+    )
+    return x_q_tensor, scale
+
+
+def _process_scaled_mm_output(output, input_2d_shape, output_shape):
+    if type(output) is tuple and len(output) == 2:
+        output = output[0]
+    return torch.narrow(output, 0, 0, input_2d_shape[0]).view(*output_shape)
+
+
+def _apply_fallback_scaled_mm(
+    qinput,
+    weight,
+    x_scale,
+    weight_scale,
+    input_2d_shape,
+    output_shape,
+    bias,
+    input_dtype,
+):
+    global TORCH_DEVICE_IDENTITY
+    if TORCH_DEVICE_IDENTITY is None:
+        TORCH_DEVICE_IDENTITY = torch.ones(1, dtype=torch.float32, device=weight.device)
+
+    output = torch._scaled_mm(
+        qinput,
+        weight,
+        scale_a=TORCH_DEVICE_IDENTITY,
+        scale_b=TORCH_DEVICE_IDENTITY,
+        out_dtype=torch.float32,
+    )
+
+    output = _process_scaled_mm_output(output, input_2d_shape, output_shape)
+    x_scale = torch.narrow(x_scale, 0, 0, input_2d_shape[0])
+
+    output = output * x_scale * weight_scale.t()
+    if bias is not None:
+        output = output + bias
+    return output.to(dtype=input_dtype)
+
+
+def apply_fp8_linear(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    input_scale_ub: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+    cutlass_fp8_supported: bool = cutlass_fp8_supported(),
+    use_per_token_if_dynamic: bool = False,
+    pad_output: Optional[bool] = None,
+    compressed_tensor_quant: bool = False,
+) -> torch.Tensor:
+    # Note: we pad the input because torch._scaled_mm is more performant
+    # for matrices with batch dimension > 16.
+    # This could change in the future.
+    # We also don't pad when using torch.compile,
+    # as it breaks with dynamic shapes.
+    if pad_output is None:
+        pad_output = not cutlass_fp8_supported and not get_bool_env_var(
+            "SGLANG_ENABLE_TORCH_COMPILE"
+        )
+    output_padding = 17 if pad_output else None
+
+    # View input as 2D matrix for fp8 methods
+    input_2d = input.view(-1, input.shape[-1])
+    output_shape = [*input.shape[:-1], weight.shape[1]]
+
+    if compressed_tensor_quant:
+        # Maybe apply padding to output, see comment in __init__
+        num_token_padding = output_padding
+        if cutlass_fp8_supported and weight_scale.numel() == weight.shape[1]:
+            num_token_padding = None
+        qinput, x_scale = scaled_fp8_quant(
+            input_2d,
+            input_scale,
+            num_token_padding=num_token_padding,
+            use_per_token_if_dynamic=use_per_token_if_dynamic,
+        )
+    else:
+        # cutlass w8a8 fp8 sgl-kernel only supports per-token scale
+        if input_scale is not None:
+            assert input_scale.numel() == 1
+            # broadcast per-tensor scale to per-token scale when supporting cutlass
+            qinput, x_scale = static_quant_fp8(
+                input_2d, input_scale, repeat_scale=cutlass_fp8_supported
+            )
+        else:
+            # default use per-token quantization if dynamic
+            if _is_cuda:
+                qinput, x_scale = sglang_per_token_quant_fp8(input_2d)
+            else:
+                # TODO(kkhuang): temporarily enforce per-tensor activation scaling if weight is per-tensor scaling
+                # final solution should be: 1. add support to per-tensor activation scaling.
+                # 2. solve the torch.compile error from weight_scale.numel() == 1 and x_scale.numel() > 1 (below line#308)
+                if _is_hip and weight_scale.numel() == 1:
+                    qinput, x_scale = scaled_fp8_quant(
+                        input_2d,
+                        input_scale,
+                        use_per_token_if_dynamic=use_per_token_if_dynamic,
+                    )
+                else:
+                    qinput, x_scale = per_token_group_quant_fp8(
+                        input_2d, group_size=input_2d.shape[1]
+                    )
+
+    if cutlass_fp8_supported and weight_scale.numel() == weight.shape[1]:
+        cutlass_compatible_b = weight.shape[0] % 16 == 0 and weight.shape[1] % 16 == 0
+        if not cutlass_compatible_b or use_triton_w8a8_fp8_kernel:
+            # Massage the input to be 2D
+            qinput = qinput.view(-1, qinput.shape[-1])
+            output = triton_scaled_mm(
+                qinput, weight, x_scale, weight_scale, input.dtype, bias
+            )
+        else:
+            output = fp8_scaled_mm(
+                qinput,
+                weight,
+                x_scale,
+                weight_scale,
+                out_dtype=input.dtype,
+                bias=bias,
+            )
+        return output.view(*output_shape)
+
+    # torch.scaled_mm supports per tensor weights + activations only
+    # so fallback to naive if per channel or per token
+    per_tensor_weights = weight_scale.numel() == 1
+    # When the number of token is 1,
+    # per-token scale has shape (1, 1), per-tensor scale has shape (1) or ().
+    per_tensor_activations = (x_scale.numel() == 1) and x_scale.dim() < 2
+
+    if (
+        use_per_token_if_dynamic
+        and not per_tensor_weights
+        and not per_tensor_activations
+        and (USE_ROWWISE_TORCH_SCALED_MM or _use_aiter)
+    ):
+        # into this sector means use dynamic per-token-per-channel quant
+        # per-token scale quant for input matrix, every row(one token) have one scale factor
+        # per-channel scale quant for weight matrix, every col(one channel) have one scale factor
+        if _use_aiter:
+            # gemm_a8w8_bpreshuffle(XQ, WQ, x_scale, w_scale, dtype)
+            # XQ -> input tensor, shape = (m, k)
+            # WQ -> weight tensor, shape = (n, k), with preshuffe get better perf
+            # x_scale -> input scale tensor, shape = (m, 1)
+            # w_scale -> weight scale tensor, shape = (n ,1)
+            # dtype -> output dtype
+            output = gemm_a8w8_bpreshuffle(
+                XQ=qinput,
+                WQ=weight.T,
+                x_scale=x_scale,
+                w_scale=weight_scale,
+                dtype=input.dtype,
+            )
+            if bias is not None:
+                output += bias
+            return _process_scaled_mm_output(output, input_2d.shape, output_shape)
+        else:
+            # For now validated on ROCm platform
+            # fp8 rowwise scaling in torch._scaled_mm is introduced in
+            # https://github.com/pytorch/pytorch/pull/144432 using hipBLASLt
+            # and ROCm 6.3, which only exists in torch 2.7 and above.
+            # For CUDA platform please validate if the
+            # torch._scaled_mm support rowwise scaled GEMM
+            # Fused GEMM_DQ Rowwise GEMM
+            output = torch._scaled_mm(
+                qinput,
+                weight,
+                out_dtype=input.dtype,
+                scale_a=x_scale,
+                scale_b=weight_scale.t(),
+                bias=bias,
+            )
+            return _process_scaled_mm_output(output, input_2d.shape, output_shape)
+
+    if per_tensor_weights and per_tensor_activations:
+        # Fused GEMM_DQ; _scaled_mm with torch.compile requires len(weight_scale.shape) == len(x_scale.shape)
+        if weight_scale.ndim == 0 and x_scale.ndim == 1:
+            weight_scale = weight_scale.unsqueeze(0)
+        output = torch._scaled_mm(
+            qinput,
+            weight,
+            out_dtype=input.dtype,
+            scale_a=x_scale,
+            scale_b=weight_scale,
+            bias=bias,
+        )
+        return _process_scaled_mm_output(output, input_2d.shape, output_shape)
+
+    # Fallback for channelwise case, where we use unfused DQ
+    # due to limitations with scaled_mm
+
+    # Symmetric quantized GEMM by definition computes the following:
+    #   C = (s_x * X) (s_w * W) + bias
+    # This is equivalent to dequantizing the weights and activations
+    # before applying a GEMM.
+    #
+    # In order to compute quantized operands, a quantized kernel
+    # will rewrite the above like so:
+    #   C = s_w * s_x * (X * W) + bias
+    #
+    # For the scaled_mm fallback case, we break this down, since it
+    # does not support s_w being a vector.
+    return _apply_fallback_scaled_mm(
+        qinput,
+        weight,
+        x_scale,
+        weight_scale,
+        input_2d.shape,
+        output_shape,
+        bias,
+        input.dtype,
+    )
+
+
+def can_auto_enable_marlin_fp8() -> bool:
+    try:
+        major, minor = get_device_capability()
+        sm = major * 10 + minor
+        return 80 <= sm < 89
+    except Exception:
+        return False
+
+
+def apply_fp8_ptpc_linear(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    input_scale_ub: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+    cutlass_fp8_supported: bool = cutlass_fp8_supported(),
+    use_per_token_if_dynamic: bool = False,
+    pad_output: Optional[bool] = None,
+    compressed_tensor_quant: bool = False,
+) -> torch.Tensor:
+    # View input as 2D matrix for fp8 methods
+    input_2d = input.view(-1, input.shape[-1])
+
+    # weight is transposed (K, N)
+    output_shape = [*input.shape[:-1], weight.shape[1]]
+
+    q_input, x_scale = aiter.per_token_quant_hip(input_2d, quant_dtype=aiter.dtypes.fp8)
+
+    per_tensor_weights = (weight_scale.numel() == 1) and weight_scale.dim() < 2
+    per_tensor_activations = (x_scale.numel() == 1) and x_scale.dim() < 2
+
+    if not (per_tensor_weights and per_tensor_activations):
+        # weight is in (N, K)
+        output_shape = [*input.shape[:-1], weight.shape[0]]
+
+    output = aiter.gemm_a8w8_bpreshuffle(
+        q_input, weight, x_scale, weight_scale, None, input.dtype
+    )
+    if bias is not None:
+        output = output + bias
+    return output.view(*output_shape)
+
+
+def validate_fp8_block_shape(
+    layer: torch.nn.Module,
+    input_size: int,
+    output_size: int,
+    input_size_per_partition: int,
+    output_partition_sizes: list[int],
+    block_size: list[int],
+) -> None:
+    """Validate block quantization shapes for tensor parallelism."""
+    from sglang.srt.distributed import get_tensor_model_parallel_world_size
+
+    tp_size = getattr(layer, "tp_size", get_tensor_model_parallel_world_size())
+    block_n, block_k = block_size[0], block_size[1]
+
+    # Required by row parallel
+    if (
+        tp_size > 1
+        and input_size // input_size_per_partition == tp_size
+        and input_size_per_partition % block_k != 0
+    ):
+        raise ValueError(
+            f"Weight input_size_per_partition = {input_size_per_partition} "
+            f"is not divisible by weight quantization block_k = {block_k}."
+        )
+
+    # Required by column parallel or enabling merged weights
+    is_tp_split = tp_size > 1 and output_size // sum(output_partition_sizes) == tp_size
+    is_merged_gemm = len(output_partition_sizes) > 1
+    if is_tp_split or is_merged_gemm:
+        sizes_to_check = output_partition_sizes
+        if not is_tp_split and is_merged_gemm:
+            # In case of merged matrices, we allow the last
+            # matrix to not be a multiple of block size
+            sizes_to_check = output_partition_sizes[:-1]
+        for output_partition_size in sizes_to_check:
+            if output_partition_size % block_n != 0:
+                raise ValueError(
+                    f"Weight output_partition_size = "
+                    f"{output_partition_size} is not divisible by "
+                    f"weight quantization block_n = {block_n}."
+                )
diff --git a/sglang/python/sglang/srt/layers/quantization/fpgemm_fp8.py b/sglang/python/sglang/srt/layers/quantization/fpgemm_fp8.py
new file mode 100644
index 0000000000000000000000000000000000000000..352d746289c2a69050f7eeac9d998fa4f6c5b417
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/fpgemm_fp8.py
@@ -0,0 +1,202 @@
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import logging
+from typing import Any, List, Optional
+
+import torch
+from torch.nn import Module
+from torch.nn.parameter import Parameter
+
+from sglang.srt.layers.linear import LinearBase
+from sglang.srt.layers.parameter import ChannelQuantScaleParameter, ModelWeightParameter
+from sglang.srt.layers.quantization.base_config import (
+    LinearMethodBase,
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
+from sglang.srt.layers.quantization.fp8_utils import (
+    apply_fp8_linear,
+    can_auto_enable_marlin_fp8,
+    cutlass_fp8_supported,
+    normalize_e4m3fn_to_e4m3fnuz,
+)
+from sglang.srt.layers.quantization.marlin_utils_fp8 import (
+    apply_fp8_marlin_linear,
+    prepare_fp8_layer_for_marlin,
+)
+from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
+from sglang.srt.layers.quantization.utils import is_layer_skipped
+from sglang.srt.utils import get_bool_env_var, is_cuda
+
+_is_cuda = is_cuda()
+_is_fp8_fnuz = is_fp8_fnuz()
+
+logger = logging.getLogger(__name__)
+
+
+class FBGEMMFp8Config(QuantizationConfig):
+    """Config class for FBGEMM Fp8."""
+
+    def __init__(self, ignore_list: list[str], input_scale_ub: float):
+        super().__init__()
+        self.ignore_list = ignore_list if ignore_list else []
+        self.input_scale_ub = input_scale_ub
+
+        # For GPUs that lack FP8 hardware suspport, we can leverage the Marlin
+        # kernel for fast weight-only FP8 quantization
+        # self.use_marlin = not marlin_fp8_supported()
+        self.use_marlin = False
+        if _is_cuda:
+            force_marlin = get_bool_env_var("SGLANG_FORCE_FP8_MARLIN")
+            auto_enable = can_auto_enable_marlin_fp8()
+            self.use_marlin = force_marlin or auto_enable
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "fbgemm_fp8"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
+        return [torch.bfloat16, torch.float16]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 80
+
+    @classmethod
+    def get_config_filenames(cls) -> list[str]:
+        return []
+
+    @classmethod
+    def from_config(cls, config: dict[str, Any]) -> FBGEMMFp8Config:
+        ignore_list = cls.get_from_keys(config, ["modules_to_not_convert"])
+        input_scale_ub = cls.get_from_keys(config, ["activation_scale_ub"])
+        return cls(ignore_list=ignore_list, input_scale_ub=input_scale_ub)
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional[QuantizeMethodBase]:
+        if isinstance(layer, LinearBase):
+            if is_layer_skipped(
+                prefix=prefix,
+                ignored_layers=self.ignore_list,
+                fused_mapping=self.packed_modules_mapping,
+            ):
+                return UnquantizedLinearMethod()
+            return FBGEMMFp8LinearMethod(self)
+        return None
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+class FBGEMMFp8LinearMethod(LinearMethodBase):
+
+    def __init__(self, quant_config: FBGEMMFp8Config):
+        self.quant_config = quant_config
+        # self.fp8_linear = Fp8LinearOp(
+        #     act_quant_static=False, act_quant_group_shape=GroupShape.PER_TOKEN)
+        self.out_dtype = torch.get_default_dtype()
+        self.cutlass_fp8_supported = cutlass_fp8_supported()
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        # maybe_create_device_identity()
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        del input_size, output_size
+        output_size_per_partition = sum(output_partition_sizes)
+
+        layer.logical_widths = output_partition_sizes
+
+        layer.input_size_per_partition = input_size_per_partition
+        layer.output_size_per_partition = output_size_per_partition
+        layer.orig_dtype = params_dtype
+
+        # WEIGHT
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                output_size_per_partition,
+                input_size_per_partition,
+                dtype=torch.float8_e4m3fn,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight", weight)
+
+        # WEIGHT SCALE
+        weight_scale = ChannelQuantScaleParameter(
+            data=torch.empty((sum(output_partition_sizes), 1), dtype=torch.float32),
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        weight_scale[:] = torch.finfo(torch.float32).min
+        layer.register_parameter("weight_scale", weight_scale)
+
+        # INPUT SCALE UPPER BOUND
+        input_scale_ub = torch.nn.Parameter(
+            torch.tensor((self.quant_config.input_scale_ub), dtype=torch.float32),
+            requires_grad=False,
+        )
+        layer.input_scale_ub = input_scale_ub
+
+    def process_weights_after_loading(self, layer: Module) -> None:
+        # required by torch.compile
+        layer.weight_scale = Parameter(layer.weight_scale.data, requires_grad=False)
+        layer.weight = Parameter(layer.weight.data, requires_grad=False)
+
+        weight = layer.weight
+
+        if _is_fp8_fnuz:
+            weight, weight_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz(
+                weight=weight, weight_scale=layer.weight_scale, input_scale=None
+            )
+            if input_scale is not None:
+                layer.input_scale = Parameter(input_scale, requires_grad=False)
+            layer.weight_scale = Parameter(weight_scale, requires_grad=False)
+
+        layer.weight = Parameter(weight.t(), requires_grad=False)
+        if self.quant_config.use_marlin:
+            prepare_fp8_layer_for_marlin(layer)
+            # Activations not quantized for marlin.
+            del layer.input_scale_ub
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+
+        if self.quant_config.use_marlin:
+            return apply_fp8_marlin_linear(
+                input=x,
+                weight=layer.weight,
+                weight_scale=layer.weight_scale,
+                workspace=layer.workspace,
+                size_n=layer.output_size_per_partition,
+                size_k=layer.input_size_per_partition,
+                bias=bias,
+            )
+
+        return apply_fp8_linear(
+            input=x,
+            weight=layer.weight,
+            weight_scale=layer.weight_scale,
+            input_scale=None,
+            input_scale_ub=layer.input_scale_ub,
+            bias=bias,
+            cutlass_fp8_supported=self.cutlass_fp8_supported,
+            use_per_token_if_dynamic=False,
+        )
diff --git a/sglang/python/sglang/srt/layers/quantization/gguf.py b/sglang/python/sglang/srt/layers/quantization/gguf.py
new file mode 100644
index 0000000000000000000000000000000000000000..748b3c29463ddae450ce5e86a60005a2ddc29caf
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/gguf.py
@@ -0,0 +1,570 @@
+# SPDX-License-Identifier: Apache-2.0
+# Adapted from: https://github.com/vllm-project/vllm/blob/ab3e80042eac24dd362408e6d63ad98768046359/vllm/model_executor/layers/quantization/gguf.py
+from __future__ import annotations
+
+import logging
+import warnings
+from typing import TYPE_CHECKING, Any, List, Optional
+
+import gguf
+import torch
+from gguf import GGMLQuantizationType as WeightType
+from torch.nn.parameter import Parameter, UninitializedParameter
+
+from sglang.srt.layers.linear import LinearBase
+from sglang.srt.layers.moe import MoeRunnerConfig
+from sglang.srt.layers.quantization.base_config import (
+    FusedMoEMethodBase,
+    LinearMethodBase,
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
+from sglang.srt.utils import is_cuda, is_hip, is_musa, is_xpu, set_weight_attrs
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.token_dispatcher import (
+        CombineInput,
+        StandardDispatchOutput,
+    )
+
+_is_cuda = is_cuda()
+_is_hip = is_hip()
+_is_xpu = is_xpu()
+_is_musa = is_musa()
+
+if _is_cuda or _is_musa:
+    from sgl_kernel import gelu_and_mul, moe_align_block_size, moe_sum, silu_and_mul
+    from sgl_kernel.quantization import (
+        ggml_dequantize,
+        ggml_moe_a8,
+        ggml_moe_a8_vec,
+        ggml_moe_get_block_size,
+        ggml_mul_mat_a8,
+        ggml_mul_mat_vec_a8,
+    )
+else:
+    if not _is_hip:
+        warnings.warn(f"Only CUDA and MUSA support GGUF quantization currently.")
+
+logger = logging.getLogger(__name__)
+
+
+class GGUFConfig(QuantizationConfig):
+    """Config class for GGUF."""
+
+    def __init__(self, modules_to_not_convert: list[str] | None = None) -> None:
+        super().__init__()
+        if _is_hip:
+            warnings.warn(f"Only CUDA and MUSA support GGUF quantization currently.")
+        self.modules_to_not_convert = modules_to_not_convert or []
+
+    def __repr__(self) -> str:
+        return "GGUFConfig()"
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+    def get_name(self) -> "str":
+        return "gguf"
+
+    def get_supported_act_dtypes(self) -> list[torch.dtype]:
+        return [torch.half, torch.bfloat16, torch.float32]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 60 if not _is_musa else 21
+
+    @classmethod
+    def get_config_filenames(cls) -> list[str]:
+        return []  # no extra configs.
+
+    @classmethod
+    def from_config(cls, config: dict[str, Any]) -> "GGUFConfig":
+        modules_to_not_convert = cls.get_from_keys_or(
+            config, ["modules_to_not_convert"], None
+        )
+        return cls(modules_to_not_convert)
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["QuantizeMethodBase"]:
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+        from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
+
+        if isinstance(layer, LinearBase):
+            if is_layer_skipped_gguf(prefix, self.modules_to_not_convert):
+                return UnquantizedLinearMethod()
+            return GGUFLinearMethod(self)
+        elif isinstance(layer, VocabParallelEmbedding):
+            return GGUFEmbeddingMethod(self)
+        elif isinstance(layer, FusedMoE):
+            return GGUFMoEMethod(self)
+        return None
+
+
+def is_layer_skipped_gguf(prefix: str, modules_to_not_convert: list[str]):
+    return any(module_name in prefix for module_name in modules_to_not_convert)
+
+
+UNQUANTIZED_TYPES = {WeightType.F32, WeightType.F16, WeightType.BF16}
+STANDARD_QUANT_TYPES = {
+    WeightType.Q4_0,
+    WeightType.Q4_1,
+    WeightType.Q5_0,
+    WeightType.Q5_1,
+    WeightType.Q8_0,
+    WeightType.Q8_1,
+}
+KQUANT_TYPES = {
+    WeightType.Q2_K,
+    WeightType.Q3_K,
+    WeightType.Q4_K,
+    WeightType.Q5_K,
+    WeightType.Q6_K,
+}
+IMATRIX_QUANT_TYPES = {
+    WeightType.IQ1_M,
+    WeightType.IQ1_S,
+    WeightType.IQ2_XXS,
+    WeightType.IQ2_XS,
+    WeightType.IQ2_S,
+    WeightType.IQ3_XXS,
+    WeightType.IQ3_S,
+    WeightType.IQ4_XS,
+    WeightType.IQ4_NL,
+}
+# TODO(Isotr0py): Currently, we don't have MMQ kernel for I-Matrix quantization.
+# Consolidate DEQUANT_TYPES, MMVQ_QUANT_TYPES and MMQ_QUANT_TYPES after we add
+# MMQ kernel for I-Matrix quantization.
+DEQUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES | IMATRIX_QUANT_TYPES
+MMVQ_QUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES | IMATRIX_QUANT_TYPES
+MMQ_QUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES
+
+
+def fused_mul_mat_gguf(
+    x: torch.Tensor, qweight: torch.Tensor, qweight_type: int
+) -> torch.Tensor:
+    if qweight_type in IMATRIX_QUANT_TYPES:
+        mmvq_safe = 8 if qweight.shape[0] > 5120 else 16
+    else:
+        mmvq_safe = 2 if qweight.shape[0] > 5120 else 6
+    # HACK: when doing chunked prefill we don't generate output tokens
+    # so input to logits generator is empty which causes invalid parameter
+    if x.shape[0] == 0:
+        return torch.empty(x.shape[0], qweight.shape[0], dtype=x.dtype, device=x.device)
+    # there is no need to call any kernel for fp16/bf16
+    if qweight_type in UNQUANTIZED_TYPES:
+        return x @ qweight.T
+    # enable MMVQ in contiguous batching with batch_size=1
+    if x.shape[0] <= mmvq_safe and qweight_type in MMVQ_QUANT_TYPES:
+        y = ggml_mul_mat_vec_a8(qweight, x, qweight_type, qweight.shape[0])
+    # Use MMQ Kernel if it's available (standard + k-quants)
+    elif qweight_type in MMQ_QUANT_TYPES:
+        y = ggml_mul_mat_a8(qweight, x, qweight_type, qweight.shape[0])
+    # If there is no available MMQ kernel, fallback to dequantize
+    elif qweight_type in DEQUANT_TYPES:
+        block_size, type_size = gguf.GGML_QUANT_SIZES[qweight_type]
+        shape = (qweight.shape[0], qweight.shape[1] // type_size * block_size)
+        weight = ggml_dequantize(qweight, qweight_type, *shape, x.dtype)
+        y = x @ weight.T
+    else:
+        # Raise an error if the quantization type is not supported.
+        # Might be useful if llama.cpp adds a new quantization type.
+        # Wrap to GGMLQuantizationType IntEnum to make sure it's a valid type.
+        qweight_type = WeightType(qweight_type)
+        raise NotImplementedError(f"Unsupported GGUF quantization type: {qweight_type}")
+    return y
+
+
+def fused_moe_gguf(
+    x: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    qweight_type: int,
+    qweight_type2: int,
+    activation: str,
+) -> torch.Tensor:
+    def act(x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        if activation == "silu":
+            silu_and_mul(out, x)
+        elif activation == "gelu":
+            gelu_and_mul(out, x)
+        else:
+            raise ValueError(f"Unsupported activation: {activation}")
+        return out
+
+    out_hidden_states = torch.empty_like(x)
+    # unless we decent expert reuse we are better off running moe_vec kernel
+    if (
+        qweight_type2 in MMQ_QUANT_TYPES
+        and qweight_type in MMQ_QUANT_TYPES
+        and x.shape[0] > 64
+    ):
+        num_tokens, _ = x.shape
+        E, N, _ = w1.shape
+        top_k = topk_ids.shape[1]
+        BLOCK_SIZE = ggml_moe_get_block_size(qweight_type)
+
+        sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size(
+            topk_ids, BLOCK_SIZE, E
+        )
+        out = ggml_moe_a8(
+            x,
+            w1,
+            sorted_token_ids,
+            expert_ids,
+            num_tokens_post_padded,
+            qweight_type,
+            N,
+            top_k,
+            num_tokens,
+        )
+        out = act(out)
+        out = ggml_moe_a8(
+            out,
+            w2,
+            sorted_token_ids,
+            expert_ids,
+            num_tokens_post_padded,
+            qweight_type2,
+            w2.shape[1],
+            1,
+            num_tokens * top_k,
+        )
+        out = out.reshape(num_tokens, top_k, w2.shape[1]).mul_(
+            topk_weights.view(num_tokens, top_k, 1)
+        )
+        # TODO(FlamingoPg): maybe we can use moe_sum_reduce here?
+        moe_sum(out, out_hidden_states)
+    elif qweight_type2 in MMVQ_QUANT_TYPES and qweight_type in MMVQ_QUANT_TYPES:
+        num_tokens, _ = x.shape
+        E, N, _ = w1.shape
+        top_k = topk_ids.shape[1]
+
+        out = ggml_moe_a8_vec(x, w1, topk_ids, top_k, qweight_type, N, num_tokens)
+        out = act(out)
+
+        out = ggml_moe_a8_vec(
+            out, w2, topk_ids, 1, qweight_type2, w2.shape[1], num_tokens * top_k
+        )
+        out = out.reshape(num_tokens, top_k, w2.shape[1]).mul_(
+            topk_weights.view(num_tokens, top_k, 1)
+        )
+        moe_sum(out, out_hidden_states)
+    else:
+        logger.warning_once(
+            "There is no support for fast MoE kernel "
+            "for current quantization method. "
+            "Falling back to slow implementation. "
+        )
+        for tok, (w, idx) in enumerate(zip(topk_weights, topk_ids)):
+            inp = x[tok].reshape((1,) + x.shape[1:])
+            current_hidden_state = None
+            for ww, ii in zip(w, idx):
+                expert_up = w1[ii]
+
+                out = fused_mul_mat_gguf(inp, expert_up, qweight_type)
+                out = act(out)
+
+                expert_down = w2[ii]
+                current_state = fused_mul_mat_gguf(
+                    out, expert_down, qweight_type2
+                ).mul_(ww)
+                if current_hidden_state is None:
+                    current_hidden_state = current_state
+                else:
+                    current_hidden_state.add_(current_state)
+            out_hidden_states[tok] = current_hidden_state
+    return out_hidden_states
+
+
+def apply_gguf_embedding(
+    x: torch.Tensor,
+    qweight: torch.Tensor,
+    qweight_type: int,
+    hidden_size: int,
+    dtype: torch.dtype | None = None,
+) -> torch.Tensor:
+    if qweight_type in UNQUANTIZED_TYPES:
+        return torch.embedding(qweight, x)
+    elif qweight_type in DEQUANT_TYPES:
+        block_size, type_size = gguf.GGML_QUANT_SIZES[qweight_type]
+        x_flat = x.flatten()
+        assert hidden_size == qweight.shape[1] // type_size * block_size
+        quant = torch.index_select(qweight, dim=0, index=x_flat)
+        dequant = ggml_dequantize(
+            quant, qweight_type, hidden_size, x_flat.shape[0], dtype
+        )
+        return dequant.view(*x.shape, hidden_size)
+    else:
+        qweight_type = WeightType(qweight_type)
+        raise NotImplementedError(f"Unsupported GGUF quantization type: {qweight_type}")
+
+
+class GGUFLinearMethod(LinearMethodBase):
+    """Linear method for GGUF.
+
+    Args:
+        quant_config: The GGUF quantization config.
+    """
+
+    def __init__(self, quant_config: GGUFConfig):
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        self.params_dtype = params_dtype
+        output_size_per_partition = sum(output_partition_sizes)
+
+        tensor_shape = (output_size_per_partition, input_size_per_partition)
+        qweight = GGUFUninitializedParameter(requires_grad=False)
+        set_weight_attrs(
+            qweight,
+            {
+                "input_dim": 1,
+                "output_dim": 0,
+                "tensor_shape": tensor_shape,
+                "is_gguf_weight": True,
+                "data_container": [],
+                "shard_id": [],
+                "shard_id_map": {},
+            },
+        )
+        set_weight_attrs(qweight, extra_weight_attrs)
+        layer.register_parameter("qweight", qweight)
+
+        qweight_type = Parameter(
+            torch.empty(len(output_partition_sizes), dtype=torch.uint8),
+            requires_grad=False,
+        )
+        set_weight_attrs(
+            qweight_type,
+            {
+                "is_gguf_weight_type": True,
+                "weight_type": 0,
+                "shard_weight_type": {},
+                "ignore_warning": True,
+            },
+        )
+        set_weight_attrs(qweight_type, extra_weight_attrs)
+        layer.register_parameter("qweight_type", qweight_type)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module):
+        qweight_type = layer.qweight_type.weight_type
+        if not (qweight_type in UNQUANTIZED_TYPES or qweight_type in DEQUANT_TYPES):
+            qweight_type = WeightType(qweight_type)
+            raise ValueError(
+                f"Unsupported GGUF quantization type {qweight_type} in layer {layer}."
+            )
+        # For MergedColumnParallelLinear and QKVParallelLinear, we need to
+        # materialize the padded weight parameter for CUDA Graph compatibility.
+        self._create_padded_weight_param(layer)
+
+    def _create_padded_weight_param(self, layer: torch.nn.Module):
+        """Create padded weight parameter for GGUF MergedLinear layer."""
+        qweight = layer.qweight
+        shard_id_map = qweight.shard_id_map
+        shard_id = qweight.shard_id
+        if len(data_container := qweight.data_container) > 1:
+            dtype = {data.dtype for data in data_container}
+            assert len(dtype) == 1, ValueError(
+                f"Data container has mixed dtypes: {dtype}"
+            )
+            dtype = next(iter(dtype))
+            # concat dim0 and pad dim1
+            padded_side = max(x.size(1) for x in data_container)
+            concat_side = sum(x.size(0) for x in data_container)
+            # Pad the quantized weights to dense tensor, and create a map
+            # with the location of each shard in the padded tensor.
+            padded_data = torch.zeros(
+                (concat_side, padded_side), dtype=dtype, device=qweight.device
+            )
+            # (dim0_start, dim0_end, dim1_size)
+            shard_offset_map = dict[str, tuple[int, int, int]]()
+            for idx in shard_id:
+                id_in_container = shard_id_map[idx]
+                start = sum(x.size(0) for x in data_container[:id_in_container])
+                end = start + data_container[id_in_container].size(0)
+                size = data_container[id_in_container].size(1)
+                padded_data[start:end, :size] = data_container[id_in_container]
+                shard_offset_map[idx] = (start, end, size)
+            qweight.data_container.clear()
+            padded_param = Parameter(padded_data, requires_grad=False)
+            set_weight_attrs(padded_param, vars(qweight))
+            set_weight_attrs(padded_param, {"shard_offset_map": shard_offset_map})
+            layer.register_parameter("qweight", padded_param)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        shard_id = layer.qweight.shard_id
+
+        if shard_id:
+            # dequantize shard weights respectively
+            shard_id = ["q", "k", "v"] if "q" in shard_id else shard_id
+            qweight = layer.qweight
+            result = []
+            for idx in shard_id:
+                start, end, offset = layer.qweight.shard_offset_map[idx]
+                qweight_type = layer.qweight_type.shard_weight_type[idx]
+                result.append(
+                    fused_mul_mat_gguf(
+                        x, qweight[start:end, :offset].contiguous(), qweight_type
+                    )
+                )
+            out = torch.cat(result, axis=1)
+        else:
+            qweight = layer.qweight
+            qweight_type = layer.qweight_type.weight_type
+            out = fused_mul_mat_gguf(x, qweight, qweight_type)
+        if bias is not None:
+            out.add_(bias)
+        return out
+
+
+class GGUFMoEMethod(FusedMoEMethodBase):
+    """MoE method for GGUF.
+
+    Args:
+        quant_config: The GGUF quantization config.
+    """
+
+    def __init__(self, quant_config: GGUFConfig):
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        tensor_shape = (num_experts, 2 * intermediate_size_per_partition, hidden_size)
+        # gate up proj
+        w13_qweight = GGUFUninitializedParameter(requires_grad=False)
+        set_weight_attrs(
+            w13_qweight,
+            {
+                "input_dim": 1,
+                "output_dim": 0,
+                "tensor_shape": tensor_shape,
+                "is_gguf_weight": True,
+                "data_container": [],
+            },
+        )
+        set_weight_attrs(w13_qweight, extra_weight_attrs)
+        layer.register_parameter("w13_qweight", w13_qweight)
+
+        w13_qweight_type = Parameter(
+            torch.empty(1, dtype=torch.uint8), requires_grad=False
+        )
+        set_weight_attrs(
+            w13_qweight_type,
+            {"is_gguf_weight_type": True, "weight_type": 0, "ignore_warning": True},
+        )
+        set_weight_attrs(w13_qweight_type, extra_weight_attrs)
+        layer.register_parameter("w13_qweight_type", w13_qweight_type)
+
+        tensor_shape = (num_experts, intermediate_size_per_partition, hidden_size)
+        # gate down proj
+        w2_qweight = GGUFUninitializedParameter(requires_grad=False)
+        set_weight_attrs(
+            w2_qweight,
+            {
+                "input_dim": 1,
+                "output_dim": 0,
+                "tensor_shape": tensor_shape,
+                "is_gguf_weight": True,
+                "data_container": [],
+            },
+        )
+        set_weight_attrs(w2_qweight, extra_weight_attrs)
+        layer.register_parameter("w2_qweight", w2_qweight)
+
+        w2_qweight_type = Parameter(
+            torch.empty(1, dtype=torch.uint8), requires_grad=False
+        )
+        set_weight_attrs(
+            w2_qweight_type,
+            {"is_gguf_weight_type": True, "weight_type": 0, "ignore_warning": True},
+        )
+
+        set_weight_attrs(w2_qweight_type, extra_weight_attrs)
+        layer.register_parameter("w2_qweight_type", w2_qweight_type)
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+        assert self.fused_experts is None
+
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        assert (
+            self.moe_runner_config.activation == "silu"
+        ), "Only SiLU activation is supported."
+
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+
+        moe_runner_config = self.moe_runner_config
+
+        topk_weights, topk_ids, _ = topk_output
+        output = fused_moe_gguf(
+            x=x,
+            w1=layer.w13_qweight,
+            w2=layer.w2_qweight,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            qweight_type=layer.w13_qweight_type.weight_type,
+            qweight_type2=layer.w2_qweight_type.weight_type,
+            activation=moe_runner_config.activation,
+        )
+        return StandardCombineInput(hidden_states=output)
+
+
+class GGUFEmbeddingMethod(GGUFLinearMethod):
+    """Embedding method for GGUF.
+
+    Args:
+        quant_config: The GGUF quantization config.
+    """
+
+    def embedding(self, layer: torch.nn.Module, x: torch.Tensor) -> torch.Tensor:
+        qweight = layer.qweight
+        qweight_type = layer.qweight_type.weight_type
+        hidden_size = qweight.tensor_shape[1]
+
+        return apply_gguf_embedding(
+            x, qweight, qweight_type, hidden_size, dtype=self.params_dtype
+        )
+
+
+class GGUFUninitializedParameter(UninitializedParameter):
+    cls_to_become = Parameter
+    data_container: list[torch.Tensor]
diff --git a/sglang/python/sglang/srt/layers/quantization/gptq.py b/sglang/python/sglang/srt/layers/quantization/gptq.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e5c2945f70a642a6b6f2f38624593520671568e
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/gptq.py
@@ -0,0 +1,1562 @@
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from fractions import Fraction
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
+
+import torch
+
+from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import (
+    npu_fused_experts,
+)
+from sglang.srt.layers.moe import (
+    MoeRunner,
+    MoeRunnerBackend,
+    MoeRunnerConfig,
+    get_moe_runner_backend,
+)
+from sglang.srt.layers.moe.moe_runner.marlin import MarlinMoeQuantInfo
+from sglang.srt.layers.parameter import (
+    BasevLLMParameter,
+    ChannelQuantScaleParameter,
+    GroupQuantScaleParameter,
+    PackedColumnParameter,
+    PackedvLLMParameter,
+    RowvLLMParameter,
+    permute_param_layout_,
+)
+from sglang.srt.layers.quantization.base_config import (
+    FusedMoEMethodBase,
+    LinearMethodBase,
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.marlin_utils import (
+    apply_gptq_marlin_linear,
+    check_marlin_supported,
+    check_marlin_supports_shape,
+    marlin_is_k_full,
+    marlin_make_empty_g_idx,
+    marlin_make_workspace,
+    marlin_moe_permute_scales,
+    marlin_permute_scales,
+    marlin_repeat_scales_on_all_ranks,
+    marlin_sort_g_idx,
+    marlin_zero_points,
+    verify_marlin_supported,
+)
+from sglang.srt.layers.quantization.utils import (
+    get_linear_quant_method,
+    get_scalar_types,
+    replace_parameter,
+    unpack_cols,
+)
+from sglang.srt.utils import is_cuda, is_npu, set_weight_attrs
+from sglang.srt.utils.patch_torch import register_fake_if_exists
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.token_dispatcher import (
+        CombineInput,
+        StandardDispatchOutput,
+    )
+
+_is_cuda = is_cuda()
+
+if _is_cuda:
+    from sgl_kernel import gptq_gemm, gptq_shuffle
+
+    from sglang.jit_kernel.gptq_marlin_repack import gptq_marlin_repack
+
+_is_npu = is_npu()
+
+if _is_npu:
+    import torch_npu
+
+logger = logging.getLogger(__name__)
+ScalarType, scalar_types = get_scalar_types()
+
+
+def check_marlin_format(hf_quant_cfg: Dict[str, Any]) -> bool:
+    # compat: gptqmodel and autogptq (eol) main use checkpoint_format: str
+    # compat: autogptq <=0.7.1 is_marlin_format: bool
+    return hf_quant_cfg.get("checkpoint_format") == "marlin" or hf_quant_cfg.get(
+        "is_marlin_format", False
+    )
+
+
+def gptq_marlin_moe_repack(
+    b_q_weight: torch.Tensor,
+    perm: torch.Tensor,
+    size_k: int,
+    size_n: int,
+    num_bits: int,
+) -> torch.Tensor:
+    num_experts = b_q_weight.shape[0]
+    assert size_k % 16 == 0
+    output = torch.empty(
+        (num_experts, size_k // 16, size_n * (num_bits // 2)),
+        device=b_q_weight.device,
+        dtype=b_q_weight.dtype,
+    )
+    for e in range(num_experts):
+        output[e] = gptq_marlin_repack(b_q_weight[e], perm[e], size_k, size_n, num_bits)
+    return output
+
+
+@dataclass
+class MarlinLinearLayerConfig:
+    full_weight_shape: tuple[int, int]  # [in, out]
+    partition_weight_shape: tuple[int, int]
+    weight_type: ScalarType
+    act_type: torch.dtype
+    group_size: int
+    zero_points: bool
+    has_g_idx: bool
+
+
+class GPTQConfig(QuantizationConfig):
+    """Config class for GPTQ.
+
+    Reference: https://arxiv.org/abs/2210.17323
+    """
+
+    def __init__(
+        self,
+        weight_bits: int,
+        group_size: int,
+        desc_act: bool,
+        lm_head_quantized: bool,
+        dynamic: Dict[str, Dict[str, Union[int, bool]]],
+        checkpoint_format: str = "",
+    ) -> None:
+        # GPTQModel use `dynamic` config property to allow per module
+        # quantization config so each module can be individually optimized.
+        # Format is Dict[str, Dict] where key is a regex string that can
+        # perform both positive ("+:" prefixed) or negative ("-:" prefixed)
+        # matching of a module.
+        # Default to positive match, override base quant config mode, if no
+        # prefix is used. Value is in dict format of field key and override
+        # value.
+        # Negative matching will skip quantization init for this module
+        # entirely:
+        # non-quantized inference. More details and quantization examples can be
+        # found at: https://github.com/ModelCloud/GPTQModel
+        # Example:
+        #  # last 1/2 of the layers 10-21 has 8bit vs 4bit for 0-9
+        #  # last 1/4 of the layers 16-21 has 8bit and group_size 64
+        # dynamic = {
+        #  #`.*\.` matches the layers_node prefix
+        #  # positive match layer 10-15
+        #  r"+:.*\.(?:1[0-5])\..*": {"bits": 8,},
+        #  # positive match layer 16-21
+        #  r"+:.*\.(?:1[6-9]|20|21)\..*": {"bits": 8, "group_size": 64,},
+        #  r"-:.*\.moe\..*": {}, # negative match (skip) all `moe` layers
+        # }
+        super().__init__()
+        self.dynamic = dynamic
+
+        self.weight_bits = weight_bits
+        self.group_size = group_size
+        self.desc_act = desc_act
+        self.lm_head_quantized = lm_head_quantized
+        self.pack_factor = Fraction(32, self.weight_bits)
+        # GPTQ v1 and v2 format deals with zero points differently.
+        # Currently GPTQModel stores v1 format checkpoints by default,
+        # but provides the option to set `format="gptq_v2"` in `QuantizeConfig`.
+        self.checkpoint_format = checkpoint_format
+        if self.weight_bits not in [2, 3, 4, 8]:
+            raise ValueError(
+                "Currently, only 2/3/4/8-bit weight quantization is "
+                f"supported for GPTQ, but got {self.weight_bits} bits."
+            )
+
+    def __repr__(self) -> str:
+        return (
+            f"GPTQConfig(weight_bits={self.weight_bits}, "
+            f"group_size={self.group_size}, "
+            f"desc_act={self.desc_act}),"
+            f"lm_head_quantized={self.lm_head_quantized}), "
+            f"dynamic={self.dynamic},"
+            f"checkpoint_format={self.checkpoint_format})"
+        )
+
+    def get_scaled_act_names(self) -> List[str]:
+        """Returns the activation function names that should be post-scaled.
+
+        For now, this is only used by AWQ.
+        """
+        raise NotImplementedError
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "gptq"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.half] if not _is_npu else [torch.half, torch.bfloat16]
+
+    @classmethod
+    # Need to figure it out
+    def get_min_capability(cls) -> int:
+        if _is_npu:
+            raise NotImplementedError(
+                'NPU hardware does not support "get_min_capability" feature.'
+            )
+        else:
+            return 60
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return ["quantize_config.json"]
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> GPTQConfig:
+        dynamic = cls.get_from_keys_or(config, ["dynamic"], default={})
+        dynamic = {} if dynamic is None else dynamic
+
+        weight_bits = cls.get_from_keys(config, ["bits"])
+        group_size = cls.get_from_keys(config, ["group_size"])
+        desc_act = cls.get_from_keys(config, ["desc_act"])
+        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False)
+        checkpoint_format = cls.get_from_keys_or(
+            config, ["checkpoint_format"], default=""
+        )
+        return cls(
+            weight_bits,
+            group_size,
+            desc_act,
+            lm_head_quantized,
+            dynamic,
+            checkpoint_format,
+        )
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional[LinearMethodBase]:
+        # Delay the import to avoid circular dependency
+        from sglang.srt.layers.linear import LinearBase
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+
+        if _is_npu:
+            if isinstance(layer, FusedMoE):
+                return GPTQMoEAscendMethod(self)
+            if isinstance(layer, LinearBase):
+                return GPTQLinearAscendMethod(self)
+            return None
+
+        if isinstance(layer, FusedMoE):
+            raise TypeError("GPTQ Method does not support MoE, please use gptq_marlin")
+        else:
+            return get_linear_quant_method(
+                self, layer, prefix=prefix, linear_method_cls=GPTQLinearMethod
+            )
+
+
+class GPTQMarlinConfig(QuantizationConfig):
+    """Config class for GPTQ Marlin"""
+
+    # (num_bits, is_sym) -> quant_type
+    TYPE_MAP = {
+        (4, True): scalar_types.uint4b8,
+        (8, True): scalar_types.uint8b128,
+    }
+
+    def __init__(
+        self,
+        weight_bits: int,
+        group_size: int,
+        desc_act: bool,
+        is_sym: bool,
+        lm_head_quantized: bool,
+        dynamic: Dict[str, Dict[str, Union[int, bool]]],
+        full_config: Dict[str, Any],
+    ) -> None:
+        super().__init__()
+        if desc_act and group_size == -1:
+            # In this case, act_order == True is the same as act_order == False
+            # (since we have only one group per output channel)
+            desc_act = False
+
+        # GPTQModel use `dynamic` config property to allow per module
+        # quantization config so each module can be individually optimized.
+        # Format is Dict[str, Dict] where key is a regex string that can
+        # perform both positive ("+:" prefixed) or negative ("-:" prefixed)
+        # matching of a module.
+        # Default to positive match, override base quant config mode, if no
+        # prefix is used. Value is in dict format of field key and override
+        # value.
+        # Negative matching will skip quantization init for this module
+        # entirely:
+        # non-quantized inference. More details and quantization examples can be
+        # found at: https://github.com/ModelCloud/GPTQModel
+        # Example:
+        #  # last 1/2 of the layers 10-21 has 8bit vs 4bit for 0-9
+        #  # last 1/4 of the layers 16-21 has 8bit and group_size 64
+        # dynamic = {
+        #  #`.*\.` matches the layers_node prefix
+        #  # positive match layer 10-15
+        #  r"+:.*\.(?:1[0-5])\..*": {"bits": 8,},
+        #  # positive match layer 16-21
+        #  r"+:.*\.(?:1[6-9]|20|21)\..*": {"bits": 8, "group_size": 64,},
+        #  r"-:.*\.moe\..*": {}, # negative match (skip) all `moe` layers
+        # }
+        self.dynamic = dynamic
+
+        self.weight_bits = weight_bits
+        self.is_sym = is_sym
+
+        self.pack_factor = 32 // weight_bits  # packed into int32
+        self.group_size = group_size
+        self.desc_act = desc_act
+        self.lm_head_quantized = lm_head_quantized
+        self.full_config = full_config
+
+        if (weight_bits, is_sym) not in self.TYPE_MAP:
+            raise ValueError(
+                "Unsupported quantization config: " f"bits={weight_bits}, sym={is_sym}"
+            )
+
+        # (num_bits, is_sym) -> quant_type
+        self.quant_type = self.TYPE_MAP[(weight_bits, is_sym)]
+
+    def __repr__(self) -> str:
+        return (
+            f"GPTQMarlinConfig(quant_type={self.quant_type}, "
+            f"group_size={self.group_size}, "
+            f"desc_act={self.desc_act}, "
+            f"lm_head_quantized={self.lm_head_quantized}), "
+            f"dynamic={self.dynamic}"
+        )
+
+    def get_scaled_act_names(self) -> List[str]:
+        """Returns the activation function names that should be post-scaled.
+
+        For now, this is only used by AWQ.
+        """
+        raise NotImplementedError
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "gptq_marlin"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.half, torch.bfloat16]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 80
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return ["quantize_config.json"]
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> GPTQMarlinConfig:
+        dynamic = cls.get_from_keys_or(config, ["dynamic"], default={})
+        dynamic = {} if dynamic is None else dynamic
+
+        weight_bits = cls.get_from_keys(config, ["bits"])
+        group_size = cls.get_from_keys(config, ["group_size"])
+        desc_act = cls.get_from_keys(config, ["desc_act"])
+        is_sym = cls.get_from_keys(config, ["sym"])
+        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False)
+        return cls(
+            weight_bits,
+            group_size,
+            desc_act,
+            is_sym,
+            lm_head_quantized,
+            dynamic,
+            config,
+        )
+
+    @classmethod
+    def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]:
+        is_marlin_format = check_marlin_format(hf_quant_cfg)
+
+        can_convert = cls.is_gptq_marlin_compatible(hf_quant_cfg)
+
+        is_valid_user_quant = (
+            user_quant is None or user_quant == "marlin" or user_quant == "gptq_marlin"
+        )
+
+        if not is_marlin_format and can_convert and is_valid_user_quant:
+            msg = (
+                "The model is convertible to {} during runtime."
+                " Using {} kernel.".format(cls.get_name(), cls.get_name())
+            )
+            logger.info(msg)
+            return cls.get_name()
+
+        if not is_marlin_format and can_convert and user_quant == "gptq":
+            logger.info(
+                "Detected that the model can run with gptq_marlin"
+                ", however you specified quantization=gptq explicitly,"
+                " so forcing gptq. Use quantization=gptq_marlin for"
+                " faster inference"
+            )
+        return None
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional[QuantizeMethodBase]:
+        # Delay the import to avoid circular dependency
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+
+        if isinstance(layer, FusedMoE):
+            return GPTQMarlinMoEMethod(self)
+        return get_linear_quant_method(self, layer, prefix, GPTQMarlinLinearMethod)
+
+    @classmethod
+    def is_gptq_marlin_compatible(cls, quant_config: Dict[str, Any]):
+        quant_method = quant_config.get("quant_method", "").lower()
+        num_bits = quant_config.get("bits")
+        group_size = quant_config.get("group_size")
+        sym = quant_config.get("sym")
+        desc_act = quant_config.get("desc_act")
+
+        if not _is_cuda:
+            return False
+
+        if quant_method != "gptq":
+            return False
+
+        # Marlin conversion is only valid if required properties are found
+        if num_bits is None or group_size is None or sym is None or desc_act is None:
+            return False
+
+        if (num_bits, sym) not in cls.TYPE_MAP:
+            return False
+
+        return check_marlin_supported(
+            quant_type=cls.TYPE_MAP[(num_bits, sym)], group_size=group_size
+        )
+
+
+class GPTQLinearMethod(LinearMethodBase):
+    """Linear method for GPTQ.
+
+    Args:
+        quant_config: The GPTQ quantization config.
+    """
+
+    def __init__(self, quant_config: GPTQConfig):
+        self.quant_config = quant_config
+        # GPTQ v1 and v2 format deals with zero points differently
+        self.use_v2_format = quant_config.checkpoint_format == "gptq_v2"
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        del output_size  # Unused.
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        if input_size_per_partition % self.quant_config.group_size != 0:
+            raise ValueError(
+                "The input size is not aligned with the quantized "
+                "weight shape. This can be caused by too large "
+                "tensor parallel size."
+            )
+        output_size_per_partition = sum(output_partition_sizes)
+        if output_size_per_partition % self.quant_config.pack_factor.numerator != 0:
+            raise ValueError(
+                "The output size is not aligned with the quantized "
+                "weight shape. This can be caused by too large "
+                "tensor parallel size."
+            )
+
+        if self.quant_config.group_size != -1:
+            group_size = self.quant_config.group_size
+        else:
+            group_size = input_size
+
+        self.use_shuffle = True
+        scale_and_zero_size = input_size // group_size
+        scale_and_zero_input_dim = None
+        if (
+            input_size != input_size_per_partition
+            and self.quant_config.group_size != -1
+        ):
+            if self.quant_config.desc_act:
+                self.use_shuffle = False
+            else:
+                # we need to partition qzeros and scales for exllama kernel
+                scale_and_zero_size = input_size_per_partition // group_size
+                scale_and_zero_input_dim = 0
+
+        qweight = PackedvLLMParameter(
+            data=torch.empty(
+                input_size_per_partition // self.quant_config.pack_factor,
+                output_size_per_partition,
+                dtype=torch.int32,
+            ),
+            input_dim=0,
+            output_dim=1,
+            packed_dim=0,
+            packed_factor=self.quant_config.pack_factor,
+            weight_loader=weight_loader,
+        )
+
+        g_idx = RowvLLMParameter(
+            data=torch.tensor(
+                [
+                    i // self.quant_config.group_size
+                    for i in range(input_size_per_partition)
+                ],
+                dtype=torch.int32,
+            ),
+            input_dim=0,
+            weight_loader=weight_loader,
+        )
+        qzeros_args = {
+            "data": torch.empty(
+                scale_and_zero_size,
+                output_size_per_partition // self.quant_config.pack_factor,
+                dtype=torch.int32,
+            ),
+            "weight_loader": weight_loader,
+        }
+        weight_scale_args = {
+            "data": torch.empty(
+                scale_and_zero_size,
+                output_size_per_partition,
+                dtype=params_dtype,
+            ),
+            "weight_loader": weight_loader,
+        }
+        if scale_and_zero_input_dim is None:
+            scales = ChannelQuantScaleParameter(output_dim=1, **weight_scale_args)
+            qzeros = PackedColumnParameter(
+                output_dim=1,
+                packed_dim=1,
+                packed_factor=self.quant_config.pack_factor,
+                **qzeros_args,
+            )
+
+        else:
+            scales = GroupQuantScaleParameter(
+                output_dim=1, input_dim=0, **weight_scale_args
+            )
+            qzeros = PackedvLLMParameter(
+                input_dim=0,
+                output_dim=1,
+                packed_dim=1,
+                packed_factor=self.quant_config.pack_factor,
+                **qzeros_args,
+            )
+
+        layer.register_parameter("qweight", qweight)
+        layer.register_parameter("g_idx", g_idx)
+        layer.register_parameter("qzeros", qzeros)
+        layer.register_parameter("scales", scales)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        # for torch.compile
+        layer.qzeros = torch.nn.Parameter(layer.qzeros.data, requires_grad=False)
+        layer.qweight = torch.nn.Parameter(layer.qweight.data, requires_grad=False)
+        layer.g_idx = torch.nn.Parameter(layer.g_idx.data, requires_grad=False)
+        layer.scales = torch.nn.Parameter(layer.scales.data, requires_grad=False)
+
+        # exllama needs to shuffle the weight after the weight is loaded
+        # here we do the shuffle on first forward pass
+        if self.use_shuffle:
+            if self.quant_config.desc_act:
+                layer.g_idx.data = torch.argsort(layer.g_idx).to(torch.int)
+            else:
+                layer.g_idx.data = torch.empty(
+                    (0,), dtype=torch.int, device=layer.g_idx.device
+                )
+            gptq_shuffle(layer.qweight, layer.g_idx, self.quant_config.weight_bits)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        out_shape = x.shape[:-1] + (layer.qweight.shape[-1],)
+        reshaped_x = x.reshape(-1, x.shape[-1])
+
+        output = gptq_gemm(
+            reshaped_x,
+            layer.qweight,
+            layer.qzeros,
+            layer.scales,
+            layer.g_idx,
+            self.use_shuffle,
+            self.quant_config.weight_bits,
+        )
+        if bias is not None:
+            output.add_(bias)
+        return output.reshape(out_shape)
+
+
+class GPTQMoEAscendMethod(FusedMoEMethodBase):
+
+    def __init__(self, quant_config: GPTQConfig):
+        super().__init__()
+        self.quant_config = quant_config
+        self.use_v2_format = quant_config.checkpoint_format == "gptq_v2"
+        self.moe_runner_config: Optional[MoeRunnerConfig] = None
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        pack_factor = self.quant_config.pack_factor
+
+        num_groups_w13 = hidden_size // self.quant_config.group_size
+        num_groups_w2 = intermediate_size_per_partition // self.quant_config.group_size
+
+        extra_weight_attrs.update(
+            {
+                "is_transposed": True,
+                "quant_method": FusedMoeWeightScaleSupported.GROUP.value,
+            }
+        )
+
+        w13_qweight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size // pack_factor,
+                2 * intermediate_size_per_partition,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_qweight", w13_qweight)
+        set_weight_attrs(w13_qweight, extra_weight_attrs)
+
+        w2_qweight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                intermediate_size_per_partition // pack_factor,
+                hidden_size,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_qweight", w2_qweight)
+        set_weight_attrs(w2_qweight, extra_weight_attrs)
+
+        w13_scales = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                num_groups_w13,
+                2 * intermediate_size_per_partition,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_scales", w13_scales)
+        set_weight_attrs(w13_scales, extra_weight_attrs)
+
+        w2_scales = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                num_groups_w2,
+                hidden_size,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_scales", w2_scales)
+        set_weight_attrs(w2_scales, extra_weight_attrs)
+
+        w13_qzeros = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                num_groups_w13,
+                2 * intermediate_size_per_partition // pack_factor,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_qzeros", w13_qzeros)
+        set_weight_attrs(w13_qzeros, extra_weight_attrs)
+
+        w2_qzeros = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                num_groups_w2,
+                hidden_size // pack_factor,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_qzeros", w2_qzeros)
+        set_weight_attrs(w2_qzeros, extra_weight_attrs)
+
+    def create_moe_runner(
+        self,
+        layer: torch.nn.Module,
+        moe_runner_config: MoeRunnerConfig,
+        **extra_weight_attrs,
+    ):
+        self.moe_runner_config = moe_runner_config
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        w13_qzeros_2d = layer.w13_qzeros.data.contiguous().reshape(
+            -1, layer.w13_qzeros.shape[-1]
+        )
+        layer.w13_qzeros = torch.nn.Parameter(
+            unpack_from_int32(
+                w13_qzeros_2d,
+                self.quant_config.weight_bits,
+                packed_dim=1,
+            )
+            .reshape(layer.w13_qzeros.shape[0], layer.w13_qzeros.shape[1], -1)
+            .to(layer.w13_scales.dtype),
+            requires_grad=False,
+        )
+        if not self.use_v2_format:
+            layer.w13_qzeros += 1
+
+        w2_qzeros_2d = layer.w2_qzeros.data.contiguous().reshape(
+            -1, layer.w2_qzeros.shape[-1]
+        )
+        layer.w2_qzeros = torch.nn.Parameter(
+            unpack_from_int32(
+                w2_qzeros_2d,
+                self.quant_config.weight_bits,
+                packed_dim=1,
+            )
+            .reshape(layer.w2_qzeros.shape[0], layer.w2_qzeros.shape[1], -1)
+            .to(layer.w2_scales.dtype),
+            requires_grad=False,
+        )
+        if not self.use_v2_format:
+            layer.w2_qzeros += 1
+
+        w13_qweight_2d = (
+            layer.w13_qweight.data.transpose(-1, -2)
+            .contiguous()
+            .reshape(-1, layer.w13_qweight.shape[-2])
+        )
+        w13_qweight_tmp = unpack_from_int32(
+            w13_qweight_2d, self.quant_config.weight_bits, packed_dim=1
+        )
+
+        if self.quant_config.weight_bits == 4:
+            group_size = self.quant_config.group_size
+            scale_expanded = layer.w13_scales.data.repeat_interleave(group_size, dim=1)
+
+            neg_mask = scale_expanded < 0
+
+            if neg_mask.any():
+                neg_mask = neg_mask.transpose(-1, -2)
+                neg_mask = neg_mask.contiguous().reshape(w13_qweight_tmp.shape)
+                w13_qweight_tmp[neg_mask] = -w13_qweight_tmp[neg_mask]
+
+                if w13_qweight_tmp.max() > 7:
+                    w13_qweight_tmp.clamp_(max=7)
+
+                layer.w13_scales.data.abs_()
+
+            layer.w13_qweight = torch.nn.Parameter(
+                torch_npu.npu_convert_weight_to_int4pack(
+                    w13_qweight_tmp.reshape(
+                        layer.w13_qweight.shape[0], layer.w13_qweight.shape[2], -1
+                    )
+                    .transpose(-1, -2)
+                    .contiguous()
+                    .reshape(-1, layer.w13_qweight.shape[2])
+                    .to(torch.int32)
+                )
+                .reshape(layer.w13_qweight.shape[0], layer.w13_qweight.shape[1] * 8, -1)
+                .contiguous(),
+                requires_grad=False,
+            )
+        # use int8 to store weight by default
+        else:
+            layer.w13_qweight = torch.nn.Parameter(
+                w13_qweight_tmp.reshape(
+                    layer.w13_qweight.shape[0], layer.w13_qweight.shape[2], -1
+                )
+                .transpose(-1, -2)
+                .contiguous(),
+                requires_grad=False,
+            )
+
+        w2_qweight_2d = (
+            layer.w2_qweight.data.transpose(-1, -2)
+            .contiguous()
+            .reshape(-1, layer.w2_qweight.shape[-2])
+        )
+        w2_qweight_tmp = unpack_from_int32(
+            w2_qweight_2d, self.quant_config.weight_bits, packed_dim=1
+        )
+
+        if self.quant_config.weight_bits == 4:
+            group_size = self.quant_config.group_size
+            scale_expanded = layer.w2_scales.data.repeat_interleave(group_size, dim=1)
+
+            neg_mask = scale_expanded < 0
+
+            if neg_mask.any():
+                neg_mask = neg_mask.transpose(-1, -2)
+                neg_mask = neg_mask.contiguous().reshape(w2_qweight_tmp.shape)
+                w2_qweight_tmp[neg_mask] = -w2_qweight_tmp[neg_mask]
+
+                if w2_qweight_tmp.max() > 7:
+                    w2_qweight_tmp.clamp_(max=7)
+
+                layer.w2_scales.data.abs_()
+
+            layer.w2_qweight = torch.nn.Parameter(
+                torch_npu.npu_convert_weight_to_int4pack(
+                    w2_qweight_tmp.reshape(
+                        layer.w2_qweight.shape[0], layer.w2_qweight.shape[2], -1
+                    )
+                    .transpose(-1, -2)
+                    .contiguous()
+                    .reshape(-1, layer.w2_qweight.shape[2])
+                    .to(torch.int32)
+                )
+                .reshape(layer.w2_qweight.shape[0], layer.w2_qweight.shape[1] * 8, -1)
+                .contiguous(),
+                requires_grad=False,
+            )
+        # use int8 to store weight by default
+        else:
+            layer.w2_qweight = torch.nn.Parameter(
+                w2_qweight_tmp.reshape(
+                    layer.w2_qweight.shape[0], layer.w2_qweight.shape[2], -1
+                )
+                .transpose(-1, -2)
+                .contiguous(),
+                requires_grad=False,
+            )
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> torch.Tensor:
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        assert (
+            self.moe_runner_config is not None
+        ), "moe_runner_config is not set. Did you forget to call create_weights/create_moe_runner?"
+
+        assert self.moe_runner_config.activation in ("silu", "swiglu"), (
+            f"Only SiLU/Swiglu activation is supported, "
+            f"got {self.moe_runner_config.activation!r}."
+        )
+
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+        topk_weights, topk_ids, _ = topk_output
+
+        topk_ids = topk_ids.to(torch.int32)
+        topk_weights = topk_weights.to(x.dtype)
+
+        output = npu_fused_experts(
+            hidden_states=x,
+            w13=layer.w13_qweight,
+            w13_scale=layer.w13_scales,
+            w13_offset=layer.w13_qzeros,
+            w2=layer.w2_qweight,
+            w2_scale=layer.w2_scales,
+            w2_offset=layer.w2_qzeros,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            top_k=topk_ids.shape[1],
+            use_wna16=True,
+        )
+
+        return StandardCombineInput(hidden_states=output)
+
+
+class GPTQMarlinLinearMethod(LinearMethodBase):
+    """Linear method for GPTQ Marlin.
+
+    Args:
+        quant_config: The GPTQ Marlin quantization config.
+    """
+
+    _kernel_backends_being_used: set[str] = set()
+
+    def __init__(self, quant_config: GPTQMarlinConfig) -> None:
+        self.quant_config = quant_config
+
+        # Verify supported on platform.
+        verify_marlin_supported(
+            quant_type=self.quant_config.quant_type,
+            group_size=self.quant_config.group_size,
+        )
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ) -> None:
+        output_size_per_partition = sum(output_partition_sizes)
+        is_row_parallel = input_size != input_size_per_partition
+        weight_loader = extra_weight_attrs.get("weight_loader")
+
+        self.kernel_config = MarlinLinearLayerConfig(
+            full_weight_shape=(input_size, output_size),
+            partition_weight_shape=(
+                input_size_per_partition,
+                output_size_per_partition,
+            ),
+            weight_type=self.quant_config.quant_type,
+            act_type=params_dtype,
+            group_size=self.quant_config.group_size,
+            zero_points=False,
+            has_g_idx=self.quant_config.desc_act,
+        )
+        # Normalize group_size
+        if self.quant_config.group_size != -1:
+            group_size = self.quant_config.group_size
+        else:
+            group_size = input_size
+
+        # Determine sharding
+        if marlin_repeat_scales_on_all_ranks(
+            self.quant_config.desc_act, self.quant_config.group_size, is_row_parallel
+        ):
+            # By setting scale_dim == None, weight_loader will
+            # repeat the scales on each GPU in TP>1 case.
+            scales_and_zp_input_dim = None
+            scales_and_zp_size = input_size // group_size
+        else:
+            # By setting scale_dim == 0, weight_loader will
+            # shard the scales in TP>1 case.
+            scales_and_zp_input_dim = 0
+            scales_and_zp_size = input_size_per_partition // group_size
+
+        # Quantized weights
+        qweight = PackedvLLMParameter(
+            data=torch.empty(
+                input_size_per_partition // self.quant_config.pack_factor,
+                output_size_per_partition,
+                dtype=torch.int32,
+            ),
+            input_dim=0,
+            output_dim=1,
+            packed_dim=0,
+            packed_factor=self.quant_config.pack_factor,
+            weight_loader=weight_loader,
+        )
+
+        # Activation order
+        g_idx = RowvLLMParameter(
+            data=torch.empty(
+                input_size_per_partition,
+                dtype=torch.int32,
+            ),
+            input_dim=0,
+            weight_loader=weight_loader,
+        )
+
+        qzeros_args = {
+            "data": torch.empty(
+                scales_and_zp_size,
+                output_size_per_partition // self.quant_config.pack_factor,
+                dtype=torch.int32,
+            ),
+            "weight_loader": weight_loader,
+        }
+        weight_scale_args = {
+            "data": torch.empty(
+                scales_and_zp_size,
+                output_size_per_partition,
+                dtype=params_dtype,
+            ),
+            "weight_loader": weight_loader,
+        }
+
+        if scales_and_zp_input_dim is None:
+            scales = ChannelQuantScaleParameter(output_dim=1, **weight_scale_args)
+            qzeros = PackedColumnParameter(
+                output_dim=1,
+                packed_dim=1,
+                packed_factor=self.quant_config.pack_factor,
+                **qzeros_args,
+            )
+
+        else:
+            scales = GroupQuantScaleParameter(
+                output_dim=1, input_dim=0, **weight_scale_args
+            )
+            qzeros = PackedvLLMParameter(
+                input_dim=0,
+                output_dim=1,
+                packed_dim=1,
+                packed_factor=self.quant_config.pack_factor,
+                **qzeros_args,
+            )
+
+        layer.register_parameter("qweight", qweight)
+        layer.register_parameter("g_idx", g_idx)
+        layer.register_parameter("scales", scales)
+        layer.register_parameter("qzeros", qzeros)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        device = getattr(layer, "qweight").device
+        c = self.kernel_config
+
+        check_marlin_supports_shape(
+            c.partition_weight_shape[1],  # out_features
+            c.partition_weight_shape[0],  # in_features
+            c.full_weight_shape[0],  # in_features
+            c.group_size,
+        )
+
+        row_parallel = c.partition_weight_shape[0] != c.full_weight_shape[0]
+        self.is_k_full = marlin_is_k_full(c.has_g_idx, row_parallel)
+
+        # Allocate marlin workspace.
+        self.workspace = marlin_make_workspace(device)
+
+        # Default names since marlin requires empty parameters for these,
+        # TODO: remove this requirement from marlin (allow optional tensors)
+        self.w_q_name = "qweight"
+        self.w_s_name = "scales"
+        self.w_zp_name = "qzeros"
+        self.w_gidx_name = "g_idx"
+
+        def _transform_param(
+            layer: torch.nn.Module, name: Optional[str], fn: Callable
+        ) -> None:
+            if name is not None and getattr(layer, name, None) is not None:
+
+                old_param = getattr(layer, name)
+                new_param = fn(old_param)
+                # replace the parameter with torch.nn.Parameter for TorchDynamo
+                # compatibility
+                replace_parameter(
+                    layer, name, torch.nn.Parameter(new_param.data, requires_grad=False)
+                )
+
+        def transform_w_q(x):
+            assert isinstance(x, BasevLLMParameter)
+            permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0)
+            x.data = gptq_marlin_repack(
+                x.data.contiguous(),
+                perm=layer.g_idx_sort_indices,
+                size_k=c.partition_weight_shape[0],
+                size_n=c.partition_weight_shape[1],
+                num_bits=c.weight_type.size_bits,
+            )
+            return x
+
+        def transform_w_s(x):
+            assert isinstance(x, BasevLLMParameter)
+            permute_param_layout_(x, input_dim=0, output_dim=1)
+            x.data = marlin_permute_scales(
+                x.data.contiguous(),
+                size_k=c.partition_weight_shape[0],
+                size_n=c.partition_weight_shape[1],
+                group_size=c.group_size,
+            )
+            return x
+
+        if c.has_g_idx:
+            g_idx, g_idx_sort_indices = marlin_sort_g_idx(
+                getattr(layer, self.w_gidx_name)
+            )
+            _transform_param(layer, self.w_gidx_name, lambda _: g_idx)
+            layer.g_idx_sort_indices = g_idx_sort_indices
+        else:
+            setattr(layer, self.w_gidx_name, marlin_make_empty_g_idx(device))
+            layer.g_idx_sort_indices = marlin_make_empty_g_idx(device)
+
+        if c.zero_points:
+            grouped_k = (
+                c.partition_weight_shape[0] // c.group_size if c.group_size != -1 else 1
+            )
+            _transform_param(
+                layer,
+                self.w_zp_name,
+                lambda x: marlin_zero_points(
+                    unpack_cols(
+                        x.t(),
+                        c.weight_type.size_bits,
+                        grouped_k,
+                        c.partition_weight_shape[1],
+                    ),
+                    size_k=grouped_k,
+                    size_n=c.partition_weight_shape[1],
+                    num_bits=c.weight_type.size_bits,
+                ),
+            )
+        else:
+            setattr(layer, self.w_zp_name, marlin_make_empty_g_idx(device))
+        _transform_param(layer, self.w_q_name, transform_w_q)
+        _transform_param(layer, self.w_s_name, transform_w_s)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        c = self.kernel_config
+
+        def _get_weight_params(
+            layer: torch.nn.Module,
+        ) -> tuple[
+            torch.Tensor,  # w_q
+            torch.Tensor,  # w_s
+            Optional[torch.Tensor],  # w_zp,
+            Optional[torch.Tensor],  # w_gidx
+        ]:
+            return (
+                getattr(layer, self.w_q_name),
+                getattr(layer, self.w_s_name),
+                getattr(layer, self.w_zp_name or "", None),
+                getattr(layer, self.w_gidx_name or "", None),
+            )
+
+        w_q, w_s, w_zp, w_gidx = _get_weight_params(layer)
+
+        # `process_weights_after_loading` will ensure w_zp and w_gidx are not
+        #  None for marlin
+        return apply_gptq_marlin_linear(
+            input=x,
+            weight=w_q,
+            weight_scale=w_s,
+            weight_zp=w_zp,  # type: ignore
+            g_idx=w_gidx,  # type: ignore
+            g_idx_sort_indices=layer.g_idx_sort_indices,
+            workspace=self.workspace,
+            wtype=c.weight_type,
+            input_size_per_partition=c.partition_weight_shape[0],
+            output_size_per_partition=c.partition_weight_shape[1],
+            is_k_full=self.is_k_full,
+            bias=bias,
+        )
+
+
+def unpack_from_int32(
+    weight: torch.Tensor,
+    num_bits: int,
+    packed_dim: int = 1,
+) -> torch.Tensor:
+    """
+    Unpacks quantized weights from int32 format back to original bits.
+
+    :param weight: The packed int32 tensor containing quantized weights
+    :param num_bits: The number of bits used for quantization (<= 8)
+    :param packed_dim: Dimension along which weights are packed (0 or 1), defaults to 1
+    :return: Unpacked tensor with int8 dtype after applying offset correction
+    """
+    assert (
+        weight.dtype == torch.int32
+    ), f"Expecting `weight.dtype` is torch.int32 but got {weight.dtype}."
+    assert (
+        num_bits <= 8
+    ), f"Expecting `num_bits` should not be larger than 8 but got {num_bits}."
+
+    pack_factor = 32 // num_bits
+    mask = (1 << num_bits) - 1
+
+    if packed_dim == 1:
+        unpacked_weight = torch.zeros(
+            (weight.shape[0], weight.shape[1] * pack_factor),
+            device=weight.device,
+            dtype=torch.int32,
+        )
+        for i in range(pack_factor):
+            unpacked_weight[:, i::pack_factor] = (weight >> (num_bits * i)) & mask
+    else:
+        unpacked_weight = torch.zeros(
+            (weight.shape[0] * pack_factor, weight.shape[1]),
+            device=weight.device,
+            dtype=torch.int32,
+        )
+        for i in range(pack_factor):
+            unpacked_weight[i::pack_factor, :] = (weight >> (num_bits * i)) & mask
+    offset = pow(2, num_bits) // 2
+    unpacked_weight = (unpacked_weight - offset).to(torch.int8)
+    return unpacked_weight
+
+
+class GPTQLinearAscendMethod(GPTQLinearMethod):
+    """Linear method for GPTQ on Ascend NPU."""
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        super().create_weights(
+            layer,
+            input_size_per_partition,
+            output_partition_sizes,
+            input_size,
+            output_size,
+            params_dtype,
+            **extra_weight_attrs,
+        )
+        set_weight_attrs(layer.qzeros, {"pack_factor": self.quant_config.pack_factor})
+        set_weight_attrs(layer.qweight, {"pack_factor": self.quant_config.pack_factor})
+
+        if self.quant_config.desc_act:
+            raise ValueError(
+                "Currently, desc_act (True) is not supported by GPTQ quantization on npu."
+            )
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+
+        layer.qzeros = torch.nn.Parameter(
+            unpack_from_int32(
+                layer.qzeros.data.contiguous(),
+                self.quant_config.weight_bits,
+                packed_dim=1,
+            ).to(layer.scales.dtype),
+            requires_grad=False,
+        )
+        if not self.use_v2_format:
+            layer.qzeros += 1
+
+        qweight_tmp = unpack_from_int32(
+            layer.qweight.data.contiguous(), self.quant_config.weight_bits, packed_dim=0
+        )
+        # use int8 to store weight by default
+        if self.quant_config.weight_bits != 4:
+            layer.qweight = torch.nn.Parameter(
+                qweight_tmp,
+                requires_grad=False,
+            )
+            return
+
+        # for 4bit case we need to pack 4bit weight to int32 to save memory
+        layer.qweight = torch.nn.Parameter(
+            torch_npu.npu_convert_weight_to_int4pack(qweight_tmp.to(torch.int32)),
+            requires_grad=False,
+        )
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        qweight = layer.qweight
+        scales = layer.scales
+        qzeros = layer.qzeros
+
+        reshaped_x = x.reshape(-1, x.shape[-1])
+
+        if bias is not None and bias.dtype == torch.bfloat16:
+            bias = bias.float()
+
+        # 4bit weight is packed to int32(8 x int4)
+        if self.quant_config.weight_bits == 4:
+            out_shape = x.shape[:-1] + (qweight.shape[-1] * 8,)
+        else:
+            out_shape = x.shape[:-1] + (qweight.shape[-1],)
+
+        out = torch_npu.npu_weight_quant_batchmatmul(
+            reshaped_x,
+            qweight,
+            antiquant_scale=scales,
+            antiquant_offset=qzeros,
+            antiquant_group_size=self.quant_config.group_size,
+            bias=bias,
+        )
+
+        return out.reshape(out_shape)
+
+
+class GPTQMarlinMoEMethod(FusedMoEMethodBase):
+    """MoE Marlin method with quantization."""
+
+    def __init__(self, quant_config: GPTQMarlinConfig) -> None:
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        # Delay the import to avoid circular dependency
+        from sglang.srt.layers.linear import set_weight_attrs
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        self.is_k_full = (not self.quant_config.desc_act) or layer.moe_tp_size == 1
+
+        if self.quant_config.group_size != -1:
+            scales_size13 = hidden_size // self.quant_config.group_size
+            if self.quant_config.desc_act:
+                w2_scales_size = intermediate_size_per_partition
+            else:
+                w2_scales_size = intermediate_size_per_partition * layer.moe_tp_size
+            scales_size2 = w2_scales_size // self.quant_config.group_size
+            strategy = FusedMoeWeightScaleSupported.GROUP.value
+        else:
+            scales_size13 = 1
+            scales_size2 = 1
+            strategy = FusedMoeWeightScaleSupported.CHANNEL.value
+
+        extra_weight_attrs.update({"quant_method": strategy, "is_transposed": True})
+        # Fused gate_up_proj (column parallel)
+        w13_qweight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size // self.quant_config.pack_factor,
+                2 * intermediate_size_per_partition,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_qweight", w13_qweight)
+        set_weight_attrs(w13_qweight, extra_weight_attrs)
+        # down_proj (row parallel)
+        w2_qweight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                intermediate_size_per_partition // self.quant_config.pack_factor,
+                hidden_size,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_qweight", w2_qweight)
+        set_weight_attrs(w2_qweight, extra_weight_attrs)
+        # up_proj scales
+        w13_scales = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                scales_size13,
+                2 * intermediate_size_per_partition,
+                dtype=torch.half,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_scales", w13_scales)
+        set_weight_attrs(w13_scales, extra_weight_attrs)
+        # down_proj scales
+        w2_scales = torch.nn.Parameter(
+            torch.empty(num_experts, scales_size2, hidden_size, dtype=torch.half),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_scales", w2_scales)
+        set_weight_attrs(w2_scales, extra_weight_attrs)
+        # dont shard the w2 scales when running act order
+        set_weight_attrs(w2_scales, {"load_full_w2": self.quant_config.desc_act})
+        # up_proj scales
+        w13_qzeros = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                scales_size13,
+                2 * intermediate_size_per_partition // self.quant_config.pack_factor,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_qzeros", w13_qzeros)
+        set_weight_attrs(w13_qzeros, extra_weight_attrs)
+        # down_proj scales
+        w2_qzeros = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                scales_size2,
+                hidden_size // self.quant_config.pack_factor,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_qzeros", w2_qzeros)
+        set_weight_attrs(w2_qzeros, extra_weight_attrs)
+        # dont shard the w2 scales when running act order
+        set_weight_attrs(w2_qzeros, {"load_full_w2": self.quant_config.desc_act})
+        w13_g_idx = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_g_idx", w13_g_idx)
+        set_weight_attrs(w13_g_idx, extra_weight_attrs)
+        w2_g_idx = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                intermediate_size_per_partition,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_g_idx", w2_g_idx)
+        set_weight_attrs(w2_g_idx, extra_weight_attrs)
+        w13_g_idx_sort_indices = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_g_idx_sort_indices", w13_g_idx_sort_indices)
+        set_weight_attrs(w13_g_idx_sort_indices, extra_weight_attrs)
+        w2_g_idx_sort_indices = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                intermediate_size_per_partition,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_g_idx_sort_indices", w2_g_idx_sort_indices)
+        set_weight_attrs(w2_g_idx_sort_indices, extra_weight_attrs)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+
+        # Process act_order
+        if self.quant_config.desc_act:
+            # Get sorting based on g_idx
+            num_experts = layer.w13_g_idx.shape[0]
+            w13_g_idx_sort_indices = torch.empty_like(layer.w13_g_idx)
+            w2_g_idx_sort_indices = torch.empty_like(layer.w2_g_idx)
+            w13_sorted_g_idx = torch.empty_like(layer.w13_g_idx)
+            w2_sorted_g_idx = torch.empty_like(layer.w2_g_idx)
+            for e in range(num_experts):
+                w13_g_idx_sort_indices[e] = torch.argsort(layer.w13_g_idx[e]).to(
+                    torch.int32
+                )
+                w2_g_idx_sort_indices[e] = torch.argsort(layer.w2_g_idx[e]).to(
+                    torch.int32
+                )
+                w13_sorted_g_idx[e] = layer.w13_g_idx[e][w13_g_idx_sort_indices[e]]
+                w2_sorted_g_idx[e] = layer.w2_g_idx[e][w2_g_idx_sort_indices[e]]
+            replace_parameter(layer, "w13_g_idx", w13_sorted_g_idx)
+            replace_parameter(layer, "w2_g_idx", w2_sorted_g_idx)
+            replace_parameter(layer, "w13_g_idx_sort_indices", w13_g_idx_sort_indices)
+            replace_parameter(layer, "w2_g_idx_sort_indices", w2_g_idx_sort_indices)
+        else:
+            # Reset g_idx related tensors
+            num_experts = layer.w13_g_idx.shape[0]
+            device = layer.w13_g_idx.device
+            layer.w13_g_idx = torch.nn.Parameter(
+                torch.empty((num_experts, 0), dtype=torch.int32, device=device),
+                requires_grad=False,
+            )
+            layer.w2_g_idx = torch.nn.Parameter(
+                torch.empty((num_experts, 0), dtype=torch.int32, device=device),
+                requires_grad=False,
+            )
+            layer.w13_g_idx_sort_indices = torch.nn.Parameter(
+                torch.empty((num_experts, 0), dtype=torch.int32, device=device),
+                requires_grad=False,
+            )
+            layer.w2_g_idx_sort_indices = torch.nn.Parameter(
+                torch.empty((num_experts, 0), dtype=torch.int32, device=device),
+                requires_grad=False,
+            )
+        # Repack weights
+        marlin_w13_qweight = gptq_marlin_moe_repack(
+            layer.w13_qweight,
+            layer.w13_g_idx_sort_indices,
+            layer.w13_qweight.shape[1] * self.quant_config.pack_factor,
+            layer.w13_qweight.shape[2],
+            self.quant_config.weight_bits,
+        )
+        replace_parameter(layer, "w13_qweight", marlin_w13_qweight)
+        marlin_w2_qweight = gptq_marlin_moe_repack(
+            layer.w2_qweight,
+            layer.w2_g_idx_sort_indices,
+            layer.w2_qweight.shape[1] * self.quant_config.pack_factor,
+            layer.w2_qweight.shape[2],
+            self.quant_config.weight_bits,
+        )
+        replace_parameter(layer, "w2_qweight", marlin_w2_qweight)
+        # Repack scales
+        marlin_w13_scales = marlin_moe_permute_scales(
+            s=layer.w13_scales,
+            size_k=layer.intermediate_size_per_partition,
+            size_n=layer.w13_scales.shape[2],
+            group_size=self.quant_config.group_size,
+        )
+        replace_parameter(layer, "w13_scales", marlin_w13_scales)
+        marlin_w2_scales = marlin_moe_permute_scales(
+            s=layer.w2_scales,
+            size_k=layer.w2_scales.shape[1]
+            * (
+                self.quant_config.group_size
+                if self.quant_config.group_size != -1
+                else self.quant_config.pack_factor
+            ),
+            size_n=layer.w2_scales.shape[2],
+            group_size=self.quant_config.group_size,
+        )
+        replace_parameter(layer, "w2_scales", marlin_w2_scales)
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        assert get_moe_runner_backend().is_auto()
+        self.moe_runner_config = moe_runner_config
+        self.runner = MoeRunner(MoeRunnerBackend.MARLIN, moe_runner_config)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+        quant_info = MarlinMoeQuantInfo(
+            w13_qweight=layer.w13_qweight,
+            w2_qweight=layer.w2_qweight,
+            w13_scales=layer.w13_scales,
+            w2_scales=layer.w2_scales,
+            w13_g_idx=layer.w13_g_idx,
+            w2_g_idx=layer.w2_g_idx,
+            w13_g_idx_sort_indices=layer.w13_g_idx_sort_indices,
+            w2_g_idx_sort_indices=layer.w2_g_idx_sort_indices,
+            weight_bits=self.quant_config.weight_bits,
+            is_k_full=self.is_k_full,
+        )
+
+        return self.runner.run(dispatch_output, quant_info)
+
+
+# Register fake implementations for torch.compile support
+if _is_cuda:
+
+    @register_fake_if_exists("sgl_kernel::gptq_gemm")
+    def _(a, b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx, use_shuffle, bit):
+        return a.new_empty((a.shape[0], b_q_weight.shape[-1]), dtype=a.dtype)
+
+    @register_fake_if_exists("sgl_kernel::gptq_marlin_repack")
+    def _(b_q_weight, perm, size_k, size_n, num_bits):
+        return b_q_weight.new_empty(
+            (size_k // 16, size_n * (num_bits // 2)), dtype=b_q_weight.dtype
+        )
+
+    @register_fake_if_exists("sgl_kernel::gptq_shuffle")
+    def _(q_weight, q_perm, bit):
+        return
diff --git a/sglang/python/sglang/srt/layers/quantization/int8_kernel.py b/sglang/python/sglang/srt/layers/quantization/int8_kernel.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a122cad593db092b02dc023c11e008f7319b1a7
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/int8_kernel.py
@@ -0,0 +1,441 @@
+import functools
+import json
+import logging
+import os
+from typing import Any, Dict, List, Optional, Tuple
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.utils import get_device_name, is_cuda
+
+_is_cuda = is_cuda()
+if _is_cuda:
+    # Temporary
+    try:
+        from sgl_kernel import sgl_per_token_group_quant_8bit
+
+        enable_sgl_per_token_group_quant_8bit = True
+    except ImportError:
+        from sgl_kernel import sgl_per_token_group_quant_int8
+
+        enable_sgl_per_token_group_quant_8bit = False
+
+logger = logging.getLogger(__name__)
+
+
+@triton.jit
+def _per_token_quant_int8(
+    x_ptr,
+    xq_ptr,
+    scale_ptr,
+    x_sum_ptr,
+    stride_x,
+    stride_xq,
+    N,
+    CAL_SUM: tl.constexpr,
+    BLOCK: tl.constexpr,
+):
+    # Adapted from https://github.com/InternLM/lmdeploy/blob/086481ed84b59bee3b8e4274e5fc69620040c048/lmdeploy/pytorch/kernels/cuda/w8a8_triton_kernels.py#L282
+    row_id = tl.program_id(0)
+
+    cols = tl.arange(0, BLOCK)
+    mask = cols < N
+
+    x = tl.load(x_ptr + row_id * stride_x + cols, mask=mask, other=0.0).to(tl.float32)
+    absmax = tl.maximum(tl.max(tl.abs(x)), 1e-10)
+    scale_x = absmax / 127
+    x_q = x * (127 / absmax)
+    x_q = tl.extra.cuda.libdevice.round(x_q).to(tl.int8)
+    if CAL_SUM:
+        x_sum = tl.sum(x, axis=0)
+        tl.store(x_sum_ptr + row_id, x_sum.to(x_sum_ptr.dtype.element_ty))
+
+    tl.store(xq_ptr + row_id * stride_xq + cols, x_q, mask=mask)
+    tl.store(scale_ptr + row_id, scale_x.to(scale_ptr.dtype.element_ty))
+
+
+def per_token_quant_int8(x, scale_dtype=torch.float32, cal_sum=False):
+    M = x.numel() // x.shape[-1]
+    N = x.shape[-1]
+    x_q = torch.empty_like(x, device=x.device, dtype=torch.int8)
+    scales = torch.empty(x.shape[:-1] + (1,), device=x.device, dtype=scale_dtype)
+    if cal_sum:
+        x_sum = torch.empty(x.shape[:-1], device=x.device, dtype=x.dtype)
+    else:
+        x_sum = None
+    BLOCK = triton.next_power_of_2(N)
+    # heuristics for number of warps
+    num_warps = min(max(BLOCK // 256, 1), 8)
+
+    assert x.is_contiguous()
+    _per_token_quant_int8[(M,)](
+        x,
+        x_q,
+        scales,
+        x_sum,
+        stride_x=x.stride(-2),
+        stride_xq=x_q.stride(-2),
+        N=N,
+        CAL_SUM=cal_sum,
+        BLOCK=BLOCK,
+        num_warps=num_warps,
+        num_stages=1,
+    )
+    if cal_sum:
+        return x_q, scales, x_sum
+    else:
+        return x_q, scales
+
+
+@triton.jit
+def _per_token_group_quant_int8(
+    # Pointers to inputs and output
+    y_ptr,
+    y_q_ptr,
+    y_s_ptr,
+    # Stride of input
+    y_stride,
+    # Columns of input
+    N,
+    # Avoid to divide zero
+    eps,
+    # Information for int8
+    int8_min,
+    int8_max,
+    # Meta-parameters
+    BLOCK: tl.constexpr,
+):
+    """A Triton-accelerated function to perform per-token-group quantization on a
+    tensor.
+
+    This function converts the tensor values into int8 values.
+    """
+    # Map the program id to the row of X and Y it should compute.
+    g_id = tl.program_id(0)
+    y_ptr += g_id * y_stride
+    y_q_ptr += g_id * y_stride
+    y_s_ptr += g_id
+
+    cols = tl.arange(0, BLOCK)  # N <= BLOCK
+    mask = cols < N
+
+    y = tl.load(y_ptr + cols, mask=mask, other=0.0).to(tl.float32)
+    # Quant
+    _absmax = tl.maximum(tl.max(tl.abs(y)), eps)
+    y_s = _absmax / int8_max
+    y_q = tl.clamp(y / y_s, int8_min, int8_max).to(y_q_ptr.dtype.element_ty)
+
+    tl.store(y_q_ptr + cols, y_q, mask=mask)
+    tl.store(y_s_ptr, y_s)
+
+
+def per_token_group_quant_int8(
+    x: torch.Tensor,
+    group_size: int,
+    eps: float = 1e-10,
+    dtype: torch.dtype = torch.int8,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Function to perform per-token-group quantization on an input tensor `x`.
+
+    It converts the tensor values into signed int8 values and returns the
+    quantized tensor along with the scaling factor used for quantization.
+
+    Args:
+        x: The input tensor with ndim >= 2.
+        group_size: The group size used for quantization.
+        eps: The minimum to avoid dividing zero.
+        dtype: The dype of output tensor. Note that only `torch.int8` is supported for now.
+
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the scaling factor for quantization.
+    """
+    assert (
+        x.shape[-1] % group_size == 0
+    ), "the last dimension of `x` cannot be divisible by `group_size`"
+    assert x.is_contiguous(), "`x` is not contiguous"
+
+    iinfo = torch.iinfo(dtype)
+    int8_max = iinfo.max
+    int8_min = iinfo.min
+
+    x_q = torch.empty_like(x, device=x.device, dtype=dtype)
+    M = x.numel() // group_size
+    N = group_size
+    x_s = torch.empty(
+        x.shape[:-1] + (x.shape[-1] // group_size,),
+        device=x.device,
+        dtype=torch.float32,
+    )
+
+    BLOCK = triton.next_power_of_2(N)
+    # heuristics for number of warps
+    num_warps = min(max(BLOCK // 256, 1), 8)
+    num_stages = 1
+    _per_token_group_quant_int8[(M,)](
+        x,
+        x_q,
+        x_s,
+        group_size,
+        N,
+        eps,
+        int8_min=int8_min,
+        int8_max=int8_max,
+        BLOCK=BLOCK,
+        num_warps=num_warps,
+        num_stages=num_stages,
+    )
+
+    return x_q, x_s
+
+
+def sglang_per_token_group_quant_int8(
+    x: torch.Tensor,
+    group_size: int,
+    eps: float = 1e-10,
+    dtype: torch.dtype = torch.int8,
+    enable_v2: Optional[bool] = None,
+):
+    assert (
+        x.shape[-1] % group_size == 0
+    ), "the last dimension of `x` cannot be divisible by `group_size`"
+    assert x.is_contiguous(), "`x` is not contiguous"
+
+    iinfo = torch.iinfo(dtype)
+    int8_max = iinfo.max
+    int8_min = iinfo.min
+
+    x_q = torch.empty_like(x, device=x.device, dtype=dtype)
+    x_s = torch.empty(
+        x.shape[:-1] + (x.shape[-1] // group_size,),
+        device=x.device,
+        dtype=torch.float32,
+    )
+
+    # Temporary
+    if enable_sgl_per_token_group_quant_8bit:
+        sgl_per_token_group_quant_8bit(
+            x, x_q, x_s, group_size, eps, int8_min, int8_max, enable_v2=enable_v2
+        )
+    else:
+        assert not enable_v2
+        sgl_per_token_group_quant_int8(x, x_q, x_s, group_size, eps, int8_min, int8_max)
+
+    return x_q, x_s
+
+
+@triton.jit
+def _w8a8_block_int8_matmul(
+    # Pointers to inputs and output
+    A,
+    B,
+    C,
+    As,
+    Bs,
+    # Shape for matmul
+    M,
+    N,
+    K,
+    # Block size for block-wise quantization
+    group_n,
+    group_k,
+    # Stride for inputs and output
+    stride_am,
+    stride_ak,
+    stride_bk,
+    stride_bn,
+    stride_cm,
+    stride_cn,
+    stride_As_m,
+    stride_As_k,
+    stride_Bs_k,
+    stride_Bs_n,
+    # Meta-parameters
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+):
+    """Triton-accelerated function used to perform linear operations (dot
+    product) on input tensors `A` and `B` with block-wise quantization, and store the result in output
+    tensor `C`.
+    """
+
+    pid = tl.program_id(axis=0)
+    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
+    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
+    pid_m = first_pid_m + (pid % group_size_m)
+    pid_n = (pid % num_pid_in_group) // group_size_m
+
+    offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
+    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+    a_ptrs = A + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
+    b_ptrs = B + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
+
+    As_ptrs = As + offs_am * stride_As_m
+    offs_bsn = offs_bn // group_n
+    Bs_ptrs = Bs + offs_bsn * stride_Bs_n
+
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
+        a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0)
+        b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)
+
+        k_start = k * BLOCK_SIZE_K
+        offs_ks = k_start // group_k
+        a_s = tl.load(As_ptrs + offs_ks * stride_As_k)
+        b_s = tl.load(Bs_ptrs + offs_ks * stride_Bs_k)
+
+        accumulator += tl.dot(a, b).to(tl.float32) * a_s[:, None] * b_s[None, :]
+        a_ptrs += BLOCK_SIZE_K * stride_ak
+        b_ptrs += BLOCK_SIZE_K * stride_bk
+
+    if C.dtype.element_ty == tl.bfloat16:
+        c = accumulator.to(tl.bfloat16)
+    elif C.dtype.element_ty == tl.float16:
+        c = accumulator.to(tl.float16)
+    else:
+        c = accumulator.to(tl.float32)
+
+    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    c_ptrs = C + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
+    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
+    tl.store(c_ptrs, c, mask=c_mask)
+
+
+@functools.lru_cache
+def get_w8a8_block_int8_configs(
+    N: int, K: int, block_n: int, block_k: int
+) -> Optional[Dict[int, Any]]:
+    """
+    Return optimized configurations for the w8a8 block fp8 kernel.
+
+    The return value will be a dictionary that maps an irregular grid of
+    batch sizes to configurations of the w8a8 block fp8 kernel. To evaluate the
+    kernel on a given batch size bs, the closest batch size in the grid should
+    be picked and the associated configuration chosen to invoke the kernel.
+    """
+
+    # First look up if an optimized configuration is available in the configs
+    # directory
+    device_name = get_device_name().replace(" ", "_")
+    json_file_name = f"N={N},K={K},device_name={device_name},dtype=int8_w8a8,block_shape=[{block_n}, {block_k}].json"
+
+    config_file_path = os.path.join(
+        os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name
+    )
+    if os.path.exists(config_file_path):
+        with open(config_file_path) as f:
+            logger.info(
+                "Using configuration from %s for W8A8 Block INT8 kernel.",
+                config_file_path,
+            )
+            # If a configuration has been found, return it
+            return {int(key): val for key, val in json.load(f).items()}
+
+    # If no optimized configuration is available, we will use the default
+    # configuration
+    logger.warning(
+        (
+            "Using default W8A8 Block INT8 kernel config. Performance might be sub-optimal! "
+            "Config file not found at %s"
+        ),
+        config_file_path,
+    )
+    return None
+
+
+def w8a8_block_int8_matmul(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    As: torch.Tensor,
+    Bs: torch.Tensor,
+    block_size: List[int],
+    output_dtype: torch.dtype = torch.float16,
+) -> torch.Tensor:
+    """This function performs matrix multiplication with block-wise quantization.
+
+    It takes two input tensors `A` and `B` with scales `As` and `Bs`.
+    The output is returned in the specified `output_dtype`.
+
+    Args:
+        A: The input tensor, e.g., activation.
+        B: The input tensor, e.g., weight.
+        As: The per-token-group quantization scale for `A`.
+        Bs: The per-block quantization scale for `B`.
+        block_size: The block size for per-block quantization. It should be 2-dim, e.g., [128, 128].
+        output_dytpe: The dtype of the returned tensor.
+
+    Returns:
+        torch.Tensor: The result of matmul.
+    """
+    assert len(block_size) == 2
+    block_n, block_k = block_size[0], block_size[1]
+
+    assert A.shape[-1] == B.shape[-1]
+    assert A.shape[:-1] == As.shape[:-1] and A.is_contiguous()
+    assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1]
+    M = A.numel() // A.shape[-1]
+
+    assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
+    N, K = B.shape
+    assert triton.cdiv(N, block_n) == Bs.shape[0]
+    assert triton.cdiv(K, block_k) == Bs.shape[1]
+
+    C_shape = A.shape[:-1] + (N,)
+    C = A.new_empty(C_shape, dtype=output_dtype)
+
+    configs = get_w8a8_block_int8_configs(N, K, block_size[0], block_size[1])
+    if configs:
+        # If an optimal configuration map has been found, look up the
+        # optimal config
+        config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
+    else:
+        # Default config
+        # Block-wise quant: BLOCK_SIZE_K must be divisible by block_size[1]
+        config = {
+            "BLOCK_SIZE_M": 64,
+            "BLOCK_SIZE_N": block_size[0],
+            "BLOCK_SIZE_K": block_size[1],
+            "GROUP_SIZE_M": 32,
+            "num_warps": 4,
+            "num_stages": 3,
+        }
+
+    def grid(META):
+        return (
+            triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
+        )
+
+    _w8a8_block_int8_matmul[grid](
+        A,
+        B,
+        C,
+        As,
+        Bs,
+        M,
+        N,
+        K,
+        block_n,
+        block_k,
+        A.stride(-2),
+        A.stride(-1),
+        B.stride(1),
+        B.stride(0),
+        C.stride(-2),
+        C.stride(-1),
+        As.stride(-2),
+        As.stride(-1),
+        Bs.stride(1),
+        Bs.stride(0),
+        **config,
+    )
+
+    return C
diff --git a/sglang/python/sglang/srt/layers/quantization/int8_utils.py b/sglang/python/sglang/srt/layers/quantization/int8_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a7aa6d825f2a2dfe4bc71cf923f288bf0f7f6f8
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/int8_utils.py
@@ -0,0 +1,73 @@
+from typing import List, Optional, Tuple
+
+import torch
+
+from sglang.srt.layers.quantization.int8_kernel import (
+    per_token_group_quant_int8,
+    w8a8_block_int8_matmul,
+)
+
+
+def apply_w8a8_block_int8_linear(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    block_size: List[int],
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    assert input_scale is None
+    # View input as 2D matrix for fp8 methods
+    input_2d = input.view(-1, input.shape[-1])
+    output_shape = [*input.shape[:-1], weight.shape[0]]
+
+    q_input, x_scale = per_token_group_quant_int8(input_2d, block_size[1])
+    output = w8a8_block_int8_matmul(
+        q_input, weight, x_scale, weight_scale, block_size, output_dtype=input.dtype
+    )
+
+    if bias is not None:
+        output = output + bias
+    return output.to(dtype=input.dtype).view(*output_shape)
+
+
+def input_to_int8(
+    x: torch.Tensor, dtype: torch.dtype = torch.int8
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """This function quantizes input values to int8 values with tensor-wise quantization."""
+    iinfo = torch.iinfo(dtype)
+    min_val, max_val = x.aminmax()
+    amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12)
+    int8_min, int8_max = iinfo.min, iinfo.max
+    scale = int8_max / amax
+    x_scl_sat = (x * scale).clamp(min=int8_min, max=int8_max)
+    return x_scl_sat.to(dtype).contiguous(), scale.float().reciprocal()
+
+
+def block_dequant(
+    x_q_block: torch.Tensor,
+    x_s: torch.Tensor,
+    block_size: List[int],
+) -> torch.Tensor:
+    """This function conducts block-wise dequantization.
+    The inputs are block-wise quantization tensor `x_q_block`, block-wise quantization scale
+    and the block size.
+    The outputs are dequantized tensor.
+    """
+    block_n, block_k = block_size[0], block_size[1]
+    n, k = x_q_block.shape
+    n_tiles = (n + block_n - 1) // block_n
+    k_tiles = (k + block_k - 1) // block_k
+    assert n_tiles == x_s.shape[0]
+    assert k_tiles == x_s.shape[1]
+
+    x_dq_block = x_q_block.to(torch.float32)
+
+    for i in range(k_tiles):
+        for j in range(n_tiles):
+            x_dq_block[
+                j * block_n : min((j + 1) * block_n, n),
+                i * block_k : min((i + 1) * block_k, k),
+            ] *= x_s[j][i]
+
+    return x_dq_block
diff --git a/sglang/python/sglang/srt/layers/quantization/kv_cache.py b/sglang/python/sglang/srt/layers/quantization/kv_cache.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef7cbe74efb0730a6757378f64f9034460f6e425
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/kv_cache.py
@@ -0,0 +1,81 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/kv_cache.py
+
+import logging
+
+import torch
+
+from sglang.srt.layers.quantization.base_config import (
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
+
+logger = logging.getLogger(__name__)
+
+
+class BaseKVCacheMethod(QuantizeMethodBase):
+    """
+    Quant method that adds `k_scale` and `v_scale` attributes to the
+    Attention layer to support loading those scaling factors from checkpoints.
+    The k/v_scale will be used to:
+        - quantize k/v_cache entries before saving them to the cache
+        - dequantize k/v_cache entries before fetching them from the cache
+
+    :param quant_config: the appropriate QuantizationConfig
+    """
+
+    def __init__(self, quant_config: QuantizationConfig):
+        self.quant_config = quant_config
+
+    def create_weights(self, layer: torch.nn.Module):
+        """
+        Create "weight" (aka k_scale and v_scale) for an attention layer.
+        """
+        # Initialize the KV cache scales to -1.0, which is an invalid value.
+        # If the k/v_scale appears in the checkpoint, it will be
+        # overwritten when loading weights.
+        layer.k_scale = torch.nn.Parameter(
+            torch.tensor(-1.0, dtype=torch.float32), requires_grad=False
+        )
+        layer.v_scale = torch.nn.Parameter(
+            torch.tensor(-1.0, dtype=torch.float32), requires_grad=False
+        )
+
+    def apply(self, layer: torch.nn.Module) -> torch.Tensor:
+        raise RuntimeError(f"{self.__class__.__name__}.apply should not be called.")
+
+    def process_weights_after_loading(self, layer) -> None:
+        if layer.k_scale > 0.0 and layer.v_scale > 0.0:
+            # We prefer to use separate k_scale and v_scale if present
+            k_scale = layer.k_scale.to("cpu").tolist()
+            v_scale = layer.v_scale.to("cpu").tolist()
+            if is_fp8_fnuz():
+                k_scale *= 2
+                v_scale *= 2
+        elif layer.k_scale < 0.0 and layer.v_scale < 0.0:
+            # If no scales were loaded (both scales are invalid negative
+            # values), use the default value of 1.0
+            k_scale = 1.0
+            v_scale = 1.0
+        else:
+            # If we find a single kv_scale in the checkpoint, we remap
+            # kv_scale to k_scale during weight loading, and duplicate
+            # k_scale to v_scale here
+            assert layer.k_scale > 0.0
+            scale_to_duplicate = max(layer.k_scale, layer.v_scale)
+            k_scale = scale_to_duplicate.to("cpu").tolist()
+            v_scale = scale_to_duplicate.to("cpu").tolist()
+            if is_fp8_fnuz():
+                k_scale *= 2
+                v_scale *= 2
+
+        if not isinstance(k_scale, float) or not isinstance(v_scale, float):
+            raise ValueError(
+                "Only support per-tensor scaling factor " "for fp8 KV cache"
+            )
+
+        # These are used in the final Attention.forward()
+        layer.k_scale.copy_(k_scale)
+        layer.v_scale.copy_(v_scale)
+        layer.k_scale_float = k_scale
+        layer.v_scale_float = v_scale
diff --git a/sglang/python/sglang/srt/layers/quantization/kvfp4_tensor.py b/sglang/python/sglang/srt/layers/quantization/kvfp4_tensor.py
new file mode 100644
index 0000000000000000000000000000000000000000..545199ff0a35eb0a9c15e50fd8d4d0493290f35f
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/kvfp4_tensor.py
@@ -0,0 +1,112 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import torch
+
+E2M1_MAX = 6.0
+# Put constants directly on CUDA if available
+_device = "cuda" if torch.cuda.is_available() else "cpu"
+E2M1_VALUES = torch.tensor(
+    [0, 0.5, 1, 1.5, 2, 3, 4, 6], dtype=torch.float32, device=_device
+)
+E2M1_BOUNDS = torch.tensor(
+    [0.25, 0.75, 1.25, 1.75, 2.5, 3.5, 5], dtype=torch.float32, device=_device
+)
+
+
+class KVFP4QuantizeUtil:
+    """Utility class for MXFP4 quantization and dequantization operations."""
+
+    @staticmethod
+    @torch.compile
+    def batched_quantize(tensor: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Quantize tensor to KVFP4 format
+        Args:
+            tensor: Input tensor of shape [B, M, N]
+
+        Returns:
+            quant_tensor: Quantized tensor of shape [B, M, N/2]
+            scale_factors: Scale factors of shape [B, M*N/16]
+        """
+        b, m, n = tensor.shape
+
+        # Reshape to [B, M*N/16, 16] for block-wise quantization
+        reshaped = tensor.view(b, m * n // 16, 16)
+
+        # Compute scale factors per block
+        block_max = reshaped.abs().max(dim=-1, keepdim=True).values
+        scale_exp = torch.ceil(torch.log2(torch.clamp(block_max / E2M1_MAX, min=1e-10)))
+        scale_factors = (scale_exp + 127).squeeze(-1).to(torch.uint8)
+
+        # Apply scaling
+        scaled = reshaped / torch.exp2(scale_exp)
+
+        # Quantize to FP4
+        sign_bits = (scaled < 0).to(torch.uint8) << 3
+        abs_vals = scaled.abs()
+
+        # Pure tensor version (CUDA Graph safe)
+        magnitude_bits = torch.sum(abs_vals.unsqueeze(-1) >= E2M1_BOUNDS, dim=-1)
+
+        # Combine sign and magnitude
+        fp4_vals = sign_bits + magnitude_bits.to(torch.uint8)
+
+        # Pack two FP4 values into one uint8
+        fp4_reshaped = fp4_vals.view(b, m, n)
+        packed = (fp4_reshaped[..., 1::2] << 4) + fp4_reshaped[..., 0::2]
+
+        return packed, scale_factors
+
+    @staticmethod
+    @torch.compile
+    def batched_dequantize(
+        quant_tensor: torch.Tensor,
+        scale_factors: torch.Tensor,
+        dtype: torch.dtype = torch.bfloat16,
+    ) -> torch.Tensor:
+        """
+        Dequantize KVFP4 tensor
+        Args:
+            quant_tensor: Quantized tensor of shape [B, M, N/2]
+            scale_factors: Scale factors of shape [B, M*N/16]
+            dtype: Target dtype for output
+
+        Returns:
+            Dequantized tensor of shape [B, M, N]
+        """
+        b, m, n_half = quant_tensor.shape
+        n = n_half * 2
+
+        # More efficient unpacking using bit operations
+        fp4_vals = torch.empty(b, m, n, dtype=torch.uint8, device=quant_tensor.device)
+        fp4_vals[..., 0::2] = quant_tensor & 0x0F
+        fp4_vals[..., 1::2] = (quant_tensor >> 4) & 0x0F
+
+        # Extract sign and magnitude
+        sign_mask = (fp4_vals & 0x08) != 0
+        magnitude_idx = fp4_vals & 0x07
+
+        # Convert to float values
+        float_vals = E2M1_VALUES[magnitude_idx.long()]
+        float_vals = torch.where(sign_mask, -float_vals, float_vals)
+
+        # Reshape for block-wise scaling
+        reshaped = float_vals.view(b, m * n // 16, 16)
+
+        # Apply scale factors
+        scale_exp = scale_factors.float() - 127
+        scaled = reshaped * torch.exp2(scale_exp.unsqueeze(-1))
+
+        return scaled.view(b, m, n).to(dtype)
diff --git a/sglang/python/sglang/srt/layers/quantization/marlin_utils.py b/sglang/python/sglang/srt/layers/quantization/marlin_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2761fc8e88fe39f237d56140ca38fd5841354c4
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/marlin_utils.py
@@ -0,0 +1,983 @@
+# SPDX-License-Identifier: Apache-2.0
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/utils/marlin_utils.py
+
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Optional
+
+import numpy
+import torch
+
+from sglang.srt.layers.parameter import (
+    BasevLLMParameter,
+    ChannelQuantScaleParameter,
+    GroupQuantScaleParameter,
+    PackedvLLMParameter,
+)
+from sglang.srt.layers.quantization.base_config import (
+    LinearMethodBase,
+    QuantizationConfig,
+)
+from sglang.srt.layers.quantization.utils import (
+    get_scalar_types,
+    pack_cols,
+    unpack_cols,
+)
+from sglang.srt.utils import get_device_capability, is_cuda
+from sglang.srt.utils.custom_op import register_custom_op
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.linear import LinearBase
+    from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+
+from sglang.srt.compilation.piecewise_context_manager import get_forward_context
+
+try:
+    from vllm import _custom_ops as ops
+except ImportError:
+    ops = None
+
+
+_is_cuda = is_cuda()
+
+if _is_cuda:
+    from sglang.jit_kernel.gptq_marlin import gptq_marlin_gemm
+
+logger = logging.getLogger(__name__)
+
+ScalarType, scalar_types = get_scalar_types()
+
+GPTQ_MARLIN_TILE = 16
+GPTQ_MARLIN_MIN_THREAD_N = 64
+GPTQ_MARLIN_MIN_THREAD_K = 128
+GPTQ_MARLIN_MAX_PARALLEL = 16
+
+MARLIN_SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128]
+
+# In case there is a performance issue with Marlin, the variable below can be
+# changed to False, which allows Marlin to perform global reductions in fp16
+# precision (instead of fp32), and therefore, save on some memory movements.
+USE_FP32_REDUCE_DEFAULT = True
+
+
+@dataclass
+class MarlinLinearLayerConfig:
+    full_weight_shape: tuple[int, int]  # [in, out]
+    partition_weight_shape: tuple[int, int]
+    weight_type: ScalarType
+    act_type: torch.dtype
+    group_size: int
+    zero_points: bool
+    has_g_idx: bool
+
+
+# For binary size and compile time, we don't support the same types for with and
+#  without runtime zero-point. We support common cases, i.e. AWQ and GPTQ.
+#  TODO: we may want to move this into the C++ so its closer to the actual impl
+def query_marlin_supported_quant_types(
+    has_zp: Optional[bool] = None,
+    include_fp_type: bool = True,
+    device_capability: Optional[int] = None,
+):
+    if device_capability is None:
+        major, minor = get_device_capability()
+        capability = major * 10 + minor
+        device_capability = -1 if capability is None else capability
+
+    if device_capability < 80:
+        return []
+
+    # - has_zp is True: return quant_types that has zero points
+    # - has_zp is False: return quant_types that has not zero points
+    # - has_zp is None: both
+    if has_zp is None:
+        types0 = query_marlin_supported_quant_types(
+            False, include_fp_type, device_capability
+        )
+        types1 = query_marlin_supported_quant_types(
+            True, include_fp_type, device_capability
+        )
+        return types0 + types1
+
+    if has_zp:
+        # AWQ style, unsigned + runtime zero-point
+        return [scalar_types.uint4]
+    else:
+        # GPTQ style, unsigned + symmetric bias
+        res = [scalar_types.uint4b8, scalar_types.uint8b128]
+        if include_fp_type:
+            res += [scalar_types.float8_e4m3fn, scalar_types.float4_e2m1f]
+        return res
+
+
+def _check_marlin_supported(
+    quant_type: ScalarType,
+    group_size: Optional[int],
+    has_zp: bool,
+    device_capability: Optional[int] = None,
+) -> tuple[bool, Optional[str]]:
+
+    if device_capability is None:
+        major, minor = get_device_capability()
+        capability = major * 10 + minor
+        device_capability = -1 if capability is None else capability
+
+    supported_types = query_marlin_supported_quant_types(
+        has_zp, True, device_capability
+    )
+
+    if quant_type not in supported_types:
+        return (
+            False,
+            f"Marlin does not support weight_bits = {quant_type}. "
+            f"Only types = {supported_types} "
+            f"are supported (for group_size = {group_size}, "
+            f"device_capability = {device_capability}, zp = {has_zp}).",
+        )
+    if group_size is None or group_size not in MARLIN_SUPPORTED_GROUP_SIZES:
+        return (
+            False,
+            f"Marlin does not support group_size = {group_size}. "
+            f"Only group_sizes = {MARLIN_SUPPORTED_GROUP_SIZES} "
+            "are supported.",
+        )
+
+    return True, None
+
+
+def check_marlin_supported(
+    quant_type: ScalarType,
+    group_size: int,
+    has_zp: bool = False,
+    device_capability: Optional[int] = None,
+) -> bool:
+    cond, _ = _check_marlin_supported(quant_type, group_size, has_zp, device_capability)
+    return cond
+
+
+def verify_marlin_supported(
+    quant_type: ScalarType, group_size: int, has_zp: bool = False
+) -> None:
+    cond, err_msg = _check_marlin_supported(quant_type, group_size, has_zp)
+    if not cond:
+        assert err_msg is not None
+        raise ValueError(err_msg)
+
+
+def verify_marlin_supports_shape(
+    output_size_per_partition: int,
+    input_size_per_partition: int,
+    input_size: int,
+    group_size: int,
+) -> None:
+
+    # Validate output_size_per_partition
+    if output_size_per_partition % GPTQ_MARLIN_MIN_THREAD_N != 0:
+        raise ValueError(
+            f"Weight output_size_per_partition = "
+            f"{output_size_per_partition} is not divisible by "
+            f" min_thread_n = {GPTQ_MARLIN_MIN_THREAD_N}. "
+            "Consider reducing tensor_parallel_size or running "
+            "with --quantization gptq."
+        )
+
+    # Validate input_size_per_partition
+    if input_size_per_partition % GPTQ_MARLIN_MIN_THREAD_K != 0:
+        raise ValueError(
+            f"Weight input_size_per_partition = "
+            f"{input_size_per_partition} is not divisible "
+            f"by min_thread_k = {GPTQ_MARLIN_MIN_THREAD_K}. "
+            "Consider reducing tensor_parallel_size or running "
+            "with --quantization gptq."
+        )
+
+    if group_size < input_size and input_size_per_partition % group_size != 0:
+        raise ValueError(
+            f"Weight input_size_per_partition = {input_size_per_partition}"
+            f" is not divisible by group_size = {group_size}. "
+            "Consider reducing tensor_parallel_size or running "
+            "with --quantization gptq."
+        )
+
+
+def check_marlin_supports_shape(
+    output_size_per_partition: int,
+    input_size_per_partition: int,
+    input_size: int,
+    group_size: int,
+) -> tuple[bool, Optional[str]]:
+    try:
+        verify_marlin_supports_shape(
+            output_size_per_partition, input_size_per_partition, input_size, group_size
+        )
+    except ValueError as e:
+        return False, e.__str__()
+    return True, None
+
+
+def check_marlin_supports_layer(layer: LinearBase, group_size: int) -> bool:
+    output_size_per_partition = (
+        getattr(layer, "output_size_per_partition", None) or layer.output_size
+    )
+    input_size_per_partition = (
+        getattr(layer, "input_size_per_partition", None) or layer.input_size
+    )
+
+    return check_marlin_supports_shape(
+        output_size_per_partition=output_size_per_partition,
+        input_size_per_partition=input_size_per_partition,
+        input_size=layer.input_size,
+        group_size=group_size,
+    )[0]
+
+
+def check_moe_marlin_supports_layer(layer: FusedMoE, group_size: int) -> bool:
+    hidden_size = layer.hidden_size
+    intermediate_size_per_partition = layer.intermediate_size_per_partition
+    # apply_router_weight_on_input is not supported for moe marlin
+    supports_router_weight = not layer.moe_runner_config.apply_router_weight_on_input
+    # moe marlin requires the activation to be silu
+    supports_activation = layer.moe_runner_config.activation == "silu"
+
+    # gate-up: (n, k) = (intermediate_size_per_partition * 2, hidden_size)
+    # down: (n, k) = (hidden_size, intermediate_size_per_partition)
+    # moe marlin requires n % 128 == 0 and k % 64 == 0
+    supports_shape = (
+        hidden_size % 128 == 0
+        and intermediate_size_per_partition % max(64, group_size) == 0
+    )
+    supports_group_size = group_size in [-1, 32, 64, 128]
+    return (
+        supports_shape
+        and supports_group_size
+        and supports_router_weight
+        and supports_activation
+    )
+
+
+def marlin_make_workspace(
+    device: torch.device, max_blocks_per_sm: int = 1
+) -> torch.Tensor:
+    # In the new marlin kernel, we use the num of threadblocks as workspace
+    # size. The num of threadblocks is is sms_count * max_blocks_per_sm.
+    sms = torch.cuda.get_device_properties(device).multi_processor_count
+    return torch.zeros(
+        sms * max_blocks_per_sm, dtype=torch.int, device=device, requires_grad=False
+    )
+
+
+def marlin_is_k_full(act_order: bool, is_row_parallel: bool) -> bool:
+    return (not act_order) or (act_order and not is_row_parallel)
+
+
+def marlin_repeat_scales_on_all_ranks(
+    act_order: bool, group_size: int, is_row_parallel: bool
+) -> bool:
+    # Need to repeat scales on every rank if act_ordering or
+    # channelwise and RowParallelLinear
+    is_channelwise = group_size == -1
+    return act_order or (is_channelwise and is_row_parallel)
+
+
+def marlin_make_empty_g_idx(device: torch.device) -> torch.Tensor:
+    return torch.nn.Parameter(
+        torch.empty(0, dtype=torch.int, device=device), requires_grad=False
+    )
+
+
+def marlin_make_empty_zp(device: torch.device) -> torch.Tensor:
+    return torch.nn.Parameter(
+        torch.empty(0, dtype=torch.int, device=device), requires_grad=False
+    )
+
+
+def marlin_sort_g_idx(g_idx: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+    g_idx_sort_indices = torch.argsort(g_idx).to(torch.int)
+    return g_idx[g_idx_sort_indices], g_idx_sort_indices
+
+
+def get_scale_perms():
+    scale_perm: list[int] = []
+    for i in range(8):
+        scale_perm.extend([i + 8 * j for j in range(8)])
+    scale_perm_single: list[int] = []
+    for i in range(4):
+        scale_perm_single.extend([2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
+    return scale_perm, scale_perm_single
+
+
+def marlin_permute_scales(
+    s: torch.Tensor, size_k: int, size_n: int, group_size: int
+) -> torch.Tensor:
+
+    scale_perm, scale_perm_single = get_scale_perms()
+    if group_size < size_k and group_size != -1:
+        s = s.reshape((-1, len(scale_perm)))[:, scale_perm]
+    else:
+        s = s.reshape((-1, len(scale_perm_single)))[:, scale_perm_single]
+    s = s.reshape((-1, size_n)).contiguous()
+
+    return s
+
+
+def marlin_permute_bias(s: torch.Tensor) -> torch.Tensor:
+    origin_shape = s.shape
+    _, scale_perm_single = get_scale_perms()
+    s = s.reshape((-1, len(scale_perm_single)))[:, scale_perm_single]
+    return s.reshape(*origin_shape).contiguous()
+
+
+def marlin_moe_permute_scales(
+    s: torch.Tensor,
+    size_k: int,
+    size_n: int,
+    group_size: int,
+):
+    num_experts = s.shape[0]
+    output = torch.empty(
+        (num_experts, s.shape[1], s.shape[2]),
+        device=s.device,
+        dtype=s.dtype,
+    )
+
+    for e in range(num_experts):
+        output[e] = marlin_permute_scales(s[e], size_k, size_n, group_size)
+    return output
+
+
+def marlin_zero_points(
+    zp: torch.Tensor, size_k: int, size_n: int, num_bits: int
+) -> torch.Tensor:
+    # Permute zero-points in a similar way to scales, but do not use the
+    # "single" permutation, since zero-points are applied on every MMA
+    scale_perm, _ = get_scale_perms()
+    zp = zp.reshape((-1, len(scale_perm)))[:, scale_perm]
+
+    # Interleave column dim (for the dequantize code) and pack it to int32
+    if num_bits == 4:
+        interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7])
+    elif num_bits == 8:
+        interleave = numpy.array([0, 2, 1, 3])
+    else:
+        raise Exception("num_bits must be 4 or 8, got {}".format(num_bits))
+
+    zp = zp.reshape((-1, len(interleave)))[:, interleave].ravel()
+    zp = zp.reshape((-1, size_n)).contiguous()
+    zp = pack_cols(zp, num_bits, size_k, size_n)
+
+    return zp
+
+
+def awq_to_marlin_zero_points(
+    q_zp_packed: torch.Tensor, size_k: int, size_n: int, num_bits: int
+) -> torch.Tensor:
+    # AWQ zero-points are quantized and packed on the column dim.
+    # In addition, the values are permuted based on dequantizer.
+    # Here we undo both of these, and then apply marlin permutation
+    # and pack it back.
+    q_zp = unpack_cols(q_zp_packed, num_bits, size_k, size_n)
+
+    # Undo interleaving (use argsort(..) to get inverse perm)
+    if num_bits == 4:
+        undo_interleave = numpy.argsort(numpy.array([0, 2, 4, 6, 1, 3, 5, 7]))
+    elif num_bits == 8:
+        undo_interleave = numpy.argsort(numpy.array([0, 2, 1, 3]))
+    else:
+        raise Exception("num_bits must be 4 or 8, got {}".format(num_bits))
+
+    q_zp = q_zp.reshape((-1, len(undo_interleave)))[:, undo_interleave].ravel()
+    q_zp = q_zp.reshape((-1, size_n)).contiguous()
+
+    marlin_zp = marlin_zero_points(q_zp, size_k, size_n, num_bits)
+    return marlin_zp
+
+
+def moe_awq_to_marlin_zero_points(
+    q_zp_packed: torch.Tensor, size_k: int, size_n: int, num_bits: int
+):
+    num_experts = q_zp_packed.shape[0]
+    output = torch.empty(
+        (num_experts, q_zp_packed.shape[1], q_zp_packed.shape[2]),
+        device=q_zp_packed.device,
+        dtype=q_zp_packed.dtype,
+    )
+    for e in range(num_experts):
+        output[e] = awq_to_marlin_zero_points(q_zp_packed[e], size_k, size_n, num_bits)
+    return output
+
+
+def maybe_warn_marlin_atomic_add(device, dtype):
+    if torch.compiler.is_dynamo_compiling():
+        return
+    device_capability = torch.cuda.get_device_capability(device)
+    if device_capability[0] < 9 and dtype == torch.bfloat16:
+        logger.info_once(
+            "You are running Marlin kernel with bf16 on GPUs before SM90. "
+            "You can consider change to fp16 to achieve better performance "
+            "if possible."
+        )
+
+
+def maybe_warn_marlin_atomic_add_env():
+    if torch.compiler.is_dynamo_compiling():
+        return
+    # TODO(yiyun): Need to add sglang's MARLIN_USE_ATOMIC_ADD: bool = False
+    if True:
+        return
+    # if envs.VLLM_MARLIN_USE_ATOMIC_ADD:
+    #     return
+    logger.info_once(
+        "Marlin kernel can achieve better performance for small size_n "
+        "with experimental use_atomic_add feature. "
+        "You can consider set environment variable "
+        "VLLM_MARLIN_USE_ATOMIC_ADD to 1 if possible."
+    )
+
+
+def should_use_atomic_add_reduce(
+    m: int, n: int, k: int, device: torch.device, dtype: torch.dtype
+) -> bool:
+
+    # the performance of atomicAdd is better than global reduce
+    # only when m*n is small and k is large
+    if n >= 2048 or k < 2048 or device.type != "cuda":
+        return False
+
+    # disable atomicAdd reduce by default,
+    # one can enable it with VLLM_MARLIN_USE_ATOMIC_ADD=1
+    # TODO: Need to add sglang's MARLIN_USE_ATOMIC_ADD: bool = False
+    if not True:
+        maybe_warn_marlin_atomic_add_env()
+        return False
+
+    # sm8x doesn't support atomicAdd + bfloat16 natively
+    device_capability = torch.cuda.get_device_capability(device)
+    if device_capability[0] < 9 and dtype == torch.bfloat16:
+        maybe_warn_marlin_atomic_add(device, dtype)
+        return False
+
+    return True
+
+
+def apply_gptq_marlin_linear(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    weight_zp: torch.Tensor,
+    g_idx: torch.Tensor,
+    g_idx_sort_indices: torch.Tensor,
+    workspace: torch.Tensor,
+    wtype: ScalarType,
+    output_size_per_partition: int,
+    input_size_per_partition: int,
+    is_k_full: bool,
+    bias: Optional[torch.Tensor] = None,
+    use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT,
+) -> torch.Tensor:
+    reshaped_x = input.reshape(-1, input.shape[-1])
+    out_shape = input.shape[:-1] + (output_size_per_partition,)
+
+    use_atomic_add = should_use_atomic_add_reduce(
+        m=reshaped_x.size(0),
+        n=output_size_per_partition,
+        k=reshaped_x.size(1),
+        device=input.device,
+        dtype=input.dtype,
+    )
+
+    forward_context = get_forward_context()
+    if forward_context is None:
+        output = gptq_marlin_gemm(
+            reshaped_x,
+            None,
+            weight,
+            weight_scale,
+            None,
+            weight_zp,
+            g_idx,
+            g_idx_sort_indices,
+            workspace,
+            wtype,
+            size_m=reshaped_x.shape[0],
+            size_n=output_size_per_partition,
+            size_k=input_size_per_partition,
+            is_k_full=is_k_full,
+            use_atomic_add=use_atomic_add,
+            use_fp32_reduce=use_fp32_reduce,
+            is_zp_float=False,
+        )
+    else:
+        output = unified_apply_gptq_marlin_gemm_with_wtype(
+            input=reshaped_x,
+            weight=weight,
+            weight_scale=weight_scale,
+            weight_zp=weight_zp,
+            g_idx=g_idx,
+            g_idx_sort_indices=g_idx_sort_indices,
+            workspace=workspace,
+            wtype_id=wtype.id,
+            output_size_per_partition=output_size_per_partition,
+            input_size_per_partition=input_size_per_partition,
+            is_k_full=is_k_full,
+            use_atomic_add=use_atomic_add,
+            use_fp32_reduce=use_fp32_reduce,
+            is_zp_float=False,
+        )
+
+    if bias is not None:
+        output.add_(bias)  # In-place add
+
+    return output.reshape(out_shape)
+
+
+def apply_awq_marlin_linear(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    weight_zp: torch.Tensor,
+    g_idx: torch.Tensor,
+    g_idx_sort_indices: torch.Tensor,
+    workspace: torch.Tensor,
+    quant_type: ScalarType,
+    output_size_per_partition: int,
+    input_size_per_partition: int,
+    bias: Optional[torch.Tensor] = None,
+    use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT,
+) -> torch.Tensor:
+    reshaped_x = input.reshape(-1, input.shape[-1])
+    out_shape = input.shape[:-1] + (output_size_per_partition,)
+
+    use_atomic_add = should_use_atomic_add_reduce(
+        m=reshaped_x.size(0),
+        n=output_size_per_partition,
+        k=reshaped_x.size(1),
+        device=input.device,
+        dtype=input.dtype,
+    )
+
+    forward_context = get_forward_context()
+    if forward_context is None:
+        output = gptq_marlin_gemm(
+            reshaped_x,
+            None,
+            weight,
+            weight_scale,
+            None,
+            weight_zp,
+            g_idx,
+            g_idx_sort_indices,
+            workspace,
+            quant_type,
+            size_m=reshaped_x.shape[0],
+            size_n=output_size_per_partition,
+            size_k=input_size_per_partition,
+            use_atomic_add=use_atomic_add,
+            use_fp32_reduce=use_fp32_reduce,
+            is_zp_float=False,
+        )
+    else:
+        output = unified_apply_gptq_marlin_gemm(
+            input=reshaped_x,
+            weight=weight,
+            weight_scale=weight_scale,
+            weight_zp=weight_zp,
+            g_idx=g_idx,
+            g_idx_sort_indices=g_idx_sort_indices,
+            workspace=workspace,
+            output_size_per_partition=output_size_per_partition,
+            input_size_per_partition=input_size_per_partition,
+            use_atomic_add=use_atomic_add,
+            use_fp32_reduce=use_fp32_reduce,
+            is_zp_float=False,
+        )
+
+    if bias is not None:
+        output.add_(bias)  # In-place add
+
+    return output.reshape(out_shape)
+
+
+class MarlinConfig(QuantizationConfig):
+    """Config class for Marlin.
+
+    Reference: https://github.com/IST-DASLab/marlin/tree/master
+    """
+
+    def __init__(
+        self,
+        group_size: int,
+        lm_head_quantized: bool,
+    ) -> None:
+        super().__init__()
+
+        # Group size for the quantization.
+        self.group_size = group_size
+        self.lm_head_quantized = lm_head_quantized
+        if self.group_size != 128 and self.group_size != -1:
+            raise ValueError(
+                "Currently, only group size 128 and -1 (channelwise) "
+                "is supported for Marlin, but got group_size of "
+                f"{self.group_size}"
+            )
+
+        # 4 Bits packed into 32 bit datatype.
+        self.pack_factor = 32 // 4
+
+        # Tile size used by marlin kernels.
+        self.tile_size = 16
+
+        # Min out_features dim
+        self.min_n_threads = 64
+
+        # Min in_features dim
+        self.min_k_threads = 128
+
+        # Max parallel problems to solve at once (improves large
+        # batch performance)
+        self.max_parallel = 16
+
+        # Permutation length used by the marlin kernels.
+        self.perm_len = 1024
+
+    def __repr__(self) -> str:
+        return (
+            f"MarlinConfig(group_size={self.group_size}, "
+            f"lm_head_quantized={self.lm_head_quantized})"
+        )
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "marlin"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
+        return [torch.half]
+
+    @classmethod
+    # Need to figure it out
+    def get_min_capability(cls) -> int:
+        return 80
+
+    @classmethod
+    def get_config_filenames(cls) -> list[str]:
+        return ["quantize_config.json"]
+
+    @classmethod
+    def from_config(cls, config: dict[str, Any]) -> "MarlinConfig":
+        group_size = cls.get_from_keys(config, ["group_size"])
+        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False)
+        return cls(group_size, lm_head_quantized)
+
+    @classmethod
+    def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]:
+        # compat: autogptq >=0.8.0 use checkpoint_format: str
+        # compat: autogptq <=0.7.1 is_marlin_format: bool
+        is_marlin_format = hf_quant_cfg.get(
+            "checkpoint_format"
+        ) == "marlin" or hf_quant_cfg.get("is_marlin_format", False)
+
+        is_valid_user_quant = (
+            user_quant is None or user_quant == "gptq" or user_quant == "marlin"
+        )
+
+        if is_marlin_format and is_valid_user_quant:
+            msg = "The model is serialized in {} format. Using {} kernel.".format(
+                cls.get_name(), cls.get_name()
+            )
+            logger.info(msg)
+            return cls.get_name()
+
+        return None
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional[MarlinLinearMethod]:
+        from sglang.srt.layers.linear import LinearBase
+        from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+
+        if isinstance(layer, LinearBase) or (
+            isinstance(layer, ParallelLMHead) and self.lm_head_quantized
+        ):
+            return MarlinLinearMethod(self)
+        return None
+
+
+class MarlinLinearMethod(LinearMethodBase):
+    """Linear method for Marlin.
+
+    Args:
+        quant_config: The Marlin quantization config.
+    """
+
+    def __init__(self, quant_config: MarlinConfig):
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        del output_size  # Unused.
+        weight_loader = extra_weight_attrs["weight_loader"]
+
+        if params_dtype != torch.float16:
+            raise ValueError(
+                f"The params dtype must be float16, but got {params_dtype}"
+            )
+
+        # Validate output_size_per_partition
+        output_size_per_partition = sum(output_partition_sizes)
+        if output_size_per_partition % self.quant_config.min_n_threads != 0:
+            raise ValueError(
+                f"Weight output_size_per_partition = "
+                f"{output_size_per_partition} is not divisible by "
+                f"min_n_threads = {self.quant_config.min_n_threads}."
+            )
+        if output_size_per_partition % self.quant_config.pack_factor != 0:
+            raise ValueError(
+                f"Weight output_size_per_partition = "
+                f"{output_size_per_partition} is not divisible by "
+                f"pack_factor = {self.quant_config.pack_factor}."
+            )
+
+        # Validate input_size_per_partition
+        if input_size_per_partition % self.quant_config.min_k_threads != 0:
+            raise ValueError(
+                f"Weight input_size_per_partition = "
+                f"{input_size_per_partition} is not divisible by "
+                f"min_k_threads = {self.quant_config.min_k_threads}."
+            )
+        if (
+            self.quant_config.group_size != -1
+            and input_size_per_partition % self.quant_config.group_size != 0
+        ):
+            raise ValueError(
+                f"Weight input_size_per_partition = "
+                f"{input_size_per_partition} is not divisible by "
+                f"group_size = {self.quant_config.group_size}."
+            )
+
+        # Check that we have at least 4 tiles horizontally in the shard
+        num_tiles_per_perm = self.quant_config.perm_len // (
+            self.quant_config.tile_size**2
+        )
+        if output_size_per_partition % num_tiles_per_perm != 0:
+            raise ValueError("Each permutation group must reside on the same gpu")
+
+        # Quantized 4Bit weights packed into Int32.
+        qweight = PackedvLLMParameter(
+            data=torch.empty(
+                input_size_per_partition // self.quant_config.tile_size,
+                output_size_per_partition
+                * self.quant_config.tile_size
+                // self.quant_config.pack_factor,
+                device="cuda",
+                dtype=torch.int32,
+            ),
+            input_dim=0,
+            output_dim=1,
+            packed_dim=1,
+            packed_factor=self.quant_config.pack_factor,
+            marlin_tile_size=self.quant_config.tile_size,
+            weight_loader=weight_loader,
+        )
+
+        # Determine if channelwise or not
+        input_groups = (
+            1
+            if self.quant_config.group_size == -1
+            else input_size_per_partition // self.quant_config.group_size
+        )
+
+        weight_scale_args = {
+            "data": torch.empty(
+                input_groups,
+                output_size_per_partition,
+                device="cuda",
+                dtype=params_dtype,
+            ),
+            "weight_loader": weight_loader,
+        }
+        if input_groups == 1:
+            scales = ChannelQuantScaleParameter(output_dim=1, **weight_scale_args)
+        else:
+            scales = GroupQuantScaleParameter(
+                output_dim=1, input_dim=0, **weight_scale_args
+            )
+
+        # Allocate workspace (Used for internal locking mechanism)
+        max_workspace_size = (
+            output_size_per_partition // self.quant_config.min_n_threads
+        ) * self.quant_config.max_parallel
+
+        workspace = BasevLLMParameter(
+            data=torch.zeros(max_workspace_size, device="cuda", dtype=torch.int),
+            weight_loader=weight_loader,
+        )
+
+        layer.register_parameter("B", qweight)
+        layer.register_parameter("s", scales)
+        layer.register_parameter("workspace", workspace)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        # required by torch.compile
+        layer.B = torch.nn.Parameter(layer.B.data, requires_grad=False)
+        layer.s = torch.nn.Parameter(layer.s.data, requires_grad=False)
+        layer.workspace = torch.nn.Parameter(layer.workspace.data, requires_grad=False)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        qweight = layer.B
+        scales = layer.s
+        workspace = layer.workspace
+
+        x_2d = x.view(-1, x.shape[-1])
+
+        size_m = x_2d.shape[0]
+        size_k = x_2d.shape[1]
+        size_n = scales.shape[1]
+
+        output_2d = ops.marlin_gemm(
+            x_2d, qweight, scales, workspace, size_m, size_n, size_k
+        )
+
+        output = output_2d.view(x.shape[:-1] + (output_2d.shape[1],))
+
+        if bias is not None:
+            output.add_(bias)  # In-place add
+
+        return output
+
+
+def fake_unified_apply_gptq_marlin_gemm(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    weight_zp: torch.Tensor,
+    g_idx: torch.Tensor,
+    g_idx_sort_indices: torch.Tensor,
+    workspace: torch.Tensor,
+    output_size_per_partition: int,
+    input_size_per_partition: int,
+    use_atomic_add: bool,
+    use_fp32_reduce: bool,
+    is_zp_float: bool,
+) -> torch.Tensor:
+    return input.new_empty(
+        (input.shape[0], output_size_per_partition), dtype=input.dtype
+    )
+
+
+@register_custom_op(fake_impl=fake_unified_apply_gptq_marlin_gemm)
+def unified_apply_gptq_marlin_gemm(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    weight_zp: torch.Tensor,
+    g_idx: torch.Tensor,
+    g_idx_sort_indices: torch.Tensor,
+    workspace: torch.Tensor,
+    output_size_per_partition: int,
+    input_size_per_partition: int,
+    use_atomic_add: bool,
+    use_fp32_reduce: bool,
+    is_zp_float: bool,
+) -> torch.Tensor:
+    quant_config = get_forward_context().quant_config
+    quant_type = quant_config.quant_type
+    return gptq_marlin_gemm(
+        input,
+        None,
+        weight,
+        weight_scale,
+        None,
+        weight_zp,
+        g_idx,
+        g_idx_sort_indices,
+        workspace,
+        quant_type,
+        size_m=input.shape[0],
+        size_n=output_size_per_partition,
+        size_k=input_size_per_partition,
+        use_atomic_add=use_atomic_add,
+        use_fp32_reduce=use_fp32_reduce,
+        is_zp_float=is_zp_float,
+    )
+
+
+def fake_unified_apply_gptq_marlin_gemm_with_wtype(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    weight_zp: torch.Tensor,
+    g_idx: torch.Tensor,
+    g_idx_sort_indices: torch.Tensor,
+    workspace: torch.Tensor,
+    wtype_id: int,
+    output_size_per_partition: int,
+    input_size_per_partition: int,
+    is_k_full: bool,
+    use_atomic_add: bool,
+    use_fp32_reduce: bool,
+    is_zp_float: bool,
+) -> torch.Tensor:
+    return input.new_empty(
+        (input.shape[0], output_size_per_partition), dtype=input.dtype
+    )
+
+
+@register_custom_op(fake_impl=fake_unified_apply_gptq_marlin_gemm_with_wtype)
+def unified_apply_gptq_marlin_gemm_with_wtype(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    weight_zp: torch.Tensor,
+    g_idx: torch.Tensor,
+    g_idx_sort_indices: torch.Tensor,
+    workspace: torch.Tensor,
+    wtype_id: int,
+    output_size_per_partition: int,
+    input_size_per_partition: int,
+    is_k_full: bool,
+    use_atomic_add: bool,
+    use_fp32_reduce: bool,
+    is_zp_float: bool,
+) -> torch.Tensor:
+    # Reconstruct ScalarType from id
+    wtype = None
+    for attr_name in dir(scalar_types):
+        if not attr_name.startswith("_"):
+            st = getattr(scalar_types, attr_name)
+            if hasattr(st, "id") and st.id == wtype_id:
+                wtype = st
+                break
+    return gptq_marlin_gemm(
+        input,
+        None,
+        weight,
+        weight_scale,
+        None,
+        weight_zp,
+        g_idx,
+        g_idx_sort_indices,
+        workspace,
+        wtype,
+        size_m=input.shape[0],
+        size_n=output_size_per_partition,
+        size_k=input_size_per_partition,
+        is_k_full=is_k_full,
+        use_atomic_add=use_atomic_add,
+        use_fp32_reduce=use_fp32_reduce,
+        is_zp_float=is_zp_float,
+    )
diff --git a/sglang/python/sglang/srt/layers/quantization/marlin_utils_fp8.py b/sglang/python/sglang/srt/layers/quantization/marlin_utils_fp8.py
new file mode 100644
index 0000000000000000000000000000000000000000..e699b67980a68dd8e4f84261395b5cebfce261ad
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/marlin_utils_fp8.py
@@ -0,0 +1,355 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+from typing import Optional
+
+import torch
+
+from sglang.srt.layers.quantization.marlin_utils import (
+    USE_FP32_REDUCE_DEFAULT,
+    marlin_make_workspace,
+    marlin_permute_bias,
+    marlin_permute_scales,
+    should_use_atomic_add_reduce,
+)
+from sglang.srt.layers.quantization.utils import get_scalar_types
+from sglang.srt.utils import is_cuda
+
+_is_cuda = is_cuda()
+if _is_cuda:
+    from sglang.jit_kernel.gptq_marlin import gptq_marlin_gemm
+    from sglang.jit_kernel.gptq_marlin_repack import gptq_marlin_repack
+
+ScalarType, scalar_types = get_scalar_types()
+
+logger = logging.getLogger(__name__)
+
+
+def fp8_fused_exponent_bias_into_scales(scales):
+    fp8_exponent = 4
+    if scales.dtype == torch.half:
+        target_exponent = 5
+    elif scales.dtype == torch.bfloat16:
+        target_exponent = 8
+    # exponent_bias_fp16 = 2 ** 4 - 2 ** 3 = 8
+    # exponent_bias_bf16 = 2 ** 7 - 2 ** 3 = 120
+    exponent_bias = 2 ** (target_exponent - 1) - 2 ** (fp8_exponent - 1)
+    s = torch.ones_like(scales) * 2
+    s = s**exponent_bias
+    return scales * s
+
+
+def apply_fp8_marlin_linear(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    workspace: torch.Tensor,
+    size_n: int,
+    size_k: int,
+    bias: Optional[torch.Tensor],
+    use_fp32_reduce: bool = USE_FP32_REDUCE_DEFAULT,
+) -> torch.Tensor:
+    # For GPUs that lack FP8 hardware support, we can leverage the
+    # Marlin kernel for fast weight-only FP8 quantization
+
+    reshaped_x = input.reshape(-1, input.shape[-1])
+    out_shape = input.shape[:-1] + (size_n,)
+
+    use_atomic_add = should_use_atomic_add_reduce(
+        m=reshaped_x.size(0), n=size_n, k=size_k, device=input.device, dtype=input.dtype
+    )
+
+    output = gptq_marlin_gemm(
+        a=reshaped_x,
+        c=None,
+        b_q_weight=weight,
+        b_scales=weight_scale,
+        global_scale=None,
+        b_zeros=None,
+        g_idx=None,
+        perm=None,
+        workspace=workspace,
+        b_q_type=scalar_types.float8_e4m3fn,
+        size_m=reshaped_x.size(0),
+        size_n=size_n,
+        size_k=size_k,
+        use_atomic_add=use_atomic_add,
+        use_fp32_reduce=use_fp32_reduce,
+    )
+
+    if bias is not None:
+        output.add_(bias)
+
+    return output.reshape(out_shape)
+
+
+def prepare_fp8_layer_for_marlin(
+    layer: torch.nn.Module, size_k_first: bool = True
+) -> None:
+    logger.warning_once(
+        "Your GPU does not have native support for FP8 computation but "
+        "FP8 quantization is being used. Weight-only FP8 compression will "
+        "be used leveraging the Marlin kernel. This may degrade "
+        "performance for compute-heavy workloads."
+    )
+
+    part_size_n = layer.output_size_per_partition
+    part_size_k = layer.input_size_per_partition
+    weight_block_size = getattr(layer, "weight_block_size", None)
+
+    if size_k_first:
+        assert layer.weight.shape == (part_size_k, part_size_n)
+    else:
+        assert layer.weight.shape == (part_size_n, part_size_k)
+
+    device = layer.weight.device
+
+    # WORKSPACE
+    layer.workspace = marlin_make_workspace(device)
+
+    # WEIGHT
+    # Repack weights to marlin format
+    perm = torch.empty(0, dtype=torch.int, device=device)
+    qweight = pack_fp8_to_int32(layer.weight, size_k_first)
+    if not size_k_first:
+        qweight = qweight.T.contiguous()
+
+    marlin_qweight = gptq_marlin_repack(
+        b_q_weight=qweight,
+        perm=perm,
+        size_k=part_size_k,
+        size_n=part_size_n,
+        num_bits=8,
+    )
+    layer.weight = torch.nn.Parameter(marlin_qweight, requires_grad=False)
+
+    # WEIGHT SCALES
+    # Permute scales
+    if "weight_scale" in dir(layer):
+        scales = layer.weight_scale.to(layer.orig_dtype)
+    elif "weight_scale_inv" in dir(layer):
+        scales = layer.weight_scale_inv.to(layer.orig_dtype)
+        del layer.weight_scale_inv
+
+    group_size = -1 if weight_block_size is None else weight_block_size[1]
+
+    # marlin kernel only support channel-wise and group-wise quantization
+    # we need to convert the scales
+    if weight_block_size is None:
+        if scales.nelement() == 1:
+            # tensor-wise quantization -> channel-wise quantization
+            # (1, 1) =>(repeat)=> (1, size_n)
+            scales = scales.view(1, 1).repeat_interleave(part_size_n, 1)
+        elif scales.nelement() > 1 and scales.nelement() != part_size_n:
+            assert part_size_n % scales.nelement() == 0
+            s_size = scales.nelement()
+            # tensor-wise quantization (for gate-up proj)
+            #     -> channel-wise quantization
+            # (1, s_size) =>(repeat)=> (1, size_n)
+            scales = scales.view(1, s_size)
+            scales = scales.repeat_interleave(part_size_n // s_size, 1)
+        else:
+            # channel-wise quantization
+            # (1, size_n)
+            scales = scales.view(1, part_size_n)
+    else:
+        # block-wise quantization -> group-wise quantization
+        # (size_k // block_size[1], ceil(size_n / block_size[0]))
+        #  =>(repeat)=> (size_k // block_size[1], size_n)
+        if not size_k_first:
+            scales = scales.T.contiguous()
+        block_n = weight_block_size[0]
+        scales = scales.repeat_interleave(block_n, 1)
+        # size_n may not divisible by block_size[0]
+        scales = scales[:, :part_size_n]
+
+    marlin_scales = marlin_permute_scales(
+        s=scales, size_k=part_size_k, size_n=part_size_n, group_size=group_size
+    )
+    marlin_scales = fp8_fused_exponent_bias_into_scales(marlin_scales)
+    layer.weight_scale = torch.nn.Parameter(marlin_scales, requires_grad=False)
+
+    if hasattr(layer, "bias") and layer.bias is not None:
+        assert layer.bias.shape == (part_size_n,)
+        bias = marlin_permute_bias(layer.bias)
+        layer.bias = torch.nn.Parameter(bias, requires_grad=False)
+
+
+def prepare_moe_fp8_layer_for_marlin(
+    layer: torch.nn.Module, size_k_first: bool = True
+) -> None:
+    logger.warning_once(
+        "Your GPU does not have native support for FP8 computation but "
+        "FP8 quantization is being used. Weight-only FP8 compression will "
+        "be used leveraging the Marlin kernel. This may degrade "
+        "performance for compute-heavy workloads."
+    )
+
+    e = layer.num_experts
+    k = layer.hidden_size
+    n = layer.intermediate_size_per_partition
+    weight_block_size = getattr(layer, "weight_block_size", None)
+
+    # WORKSPACE
+    device = layer.w13_weight.device
+    layer.workspace = marlin_make_workspace(device, 4)
+    perm = torch.empty(0, dtype=torch.int, device=device)
+
+    # WEIGHT
+    # Repack weights to marlin format
+    for name in ["w13_weight", "w2_weight"]:
+        weight = getattr(layer, name)
+        tensor_list = []
+        if "w13" in name:
+            size_n, size_k = n * 2, k
+        else:
+            size_n, size_k = k, n
+
+        if size_k_first:
+            assert weight.shape == (e, size_k, size_n)
+        else:
+            assert weight.shape == (e, size_n, size_k)
+
+        for i in range(e):
+            qweight = pack_fp8_to_int32(weight[i], size_k_first)
+            if not size_k_first:
+                qweight = qweight.T.contiguous()
+
+            marlin_qweight = gptq_marlin_repack(
+                b_q_weight=qweight, perm=perm, size_k=size_k, size_n=size_n, num_bits=8
+            )
+            tensor_list.append(marlin_qweight)
+
+        weight = torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
+        weight = torch.nn.Parameter(weight, requires_grad=False)
+
+        setattr(layer, name, weight)
+
+    # WEIGHT SCALES
+    # Permute scales
+    group_size = -1 if weight_block_size is None else weight_block_size[1]
+
+    for name in ["w13", "w2"]:
+        if name + "_weight_scale" in dir(layer):
+            new_name = name + "_weight_scale"
+            scales = getattr(layer, new_name).to(layer.orig_dtype)
+            delattr(layer, new_name)
+        elif name + "_weight_scale_inv" in dir(layer):
+            new_name = name + "_weight_scale_inv"
+            scales = getattr(layer, new_name).to(layer.orig_dtype)
+            delattr(layer, new_name)
+
+        tensor_list = []
+        if "w13" in name:
+            size_n, size_k = n * 2, k
+        else:
+            size_n, size_k = k, n
+
+        # marlin kernel only support channel-wise and group-wise quantization
+        # we need to convert the scales
+        if weight_block_size is None:
+            if scales.nelement() == e:
+                # tensor-wise quantization -> channel-wise quantization
+                # (e, 1, 1) =>(repeat)=> (e, 1, size_n)
+                scales = scales.view(e, 1, 1).repeat_interleave(size_n, 2)
+            elif scales.nelement() > e and scales.nelement() != e * size_n:
+                assert (e * size_n) % scales.nelement() == 0
+                s_size = scales.nelement() // e
+                # tensor-wise quantization (for gate-up proj)
+                #     -> channel-wise quantization
+                # (e, 1, s_size) =>(repeat)=> (e, 1, size_n)
+                scales = scales.view(e, 1, s_size)
+                scales = scales.repeat_interleave(size_n // s_size, 2)
+            else:
+                # channel-wise quantization
+                # (e, 1, size_n)
+                scales = scales.view(e, 1, size_n)
+        else:
+            # block-wise quantization -> group-wise quantization
+            # (e, size_k // block_size[1], ceil(size_n / block_size[0]))
+            #  =>(repeat)=> (e, size_k // block_size[1], size_n)
+            if not size_k_first:
+                scales = scales.permute(0, 2, 1)
+            block_n = weight_block_size[0]
+            scales = scales.repeat_interleave(block_n, 2)
+            # size_n may not divisible by block_size[0]
+            scales = scales[..., :size_n].contiguous()
+
+        for i in range(e):
+            marlin_scales = marlin_permute_scales(
+                s=scales[i], size_k=size_k, size_n=size_n, group_size=group_size
+            )
+            tensor_list.append(marlin_scales)
+
+        scales = torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
+        scales = fp8_fused_exponent_bias_into_scales(scales)
+        scales = torch.nn.Parameter(scales, requires_grad=False)
+
+        setattr(layer, name + "_weight_scale", scales)
+
+    # BIAS
+    # Permute bias
+    for name in ["w13_bias", "w2_bias"]:
+        if not hasattr(layer, name):
+            continue
+        bias = getattr(layer, name).to(layer.orig_dtype)
+
+        tensor_list = []
+        for i in range(e):
+            expert_bias = bias[i]
+
+            tensor_list.append(marlin_permute_bias(expert_bias))
+
+        bias = torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
+        bias = torch.nn.Parameter(bias, requires_grad=False)
+        setattr(layer, name, bias)
+
+
+def pack_fp8_to_int32(
+    fp8_tensor: torch.Tensor, size_k_first: bool = True
+) -> torch.Tensor:
+    """
+    Repack FP8 weights to gptq format (packed int32 elements)
+    """
+    assert fp8_tensor.dtype == torch.float8_e4m3fn
+    assert fp8_tensor.ndim == 2
+
+    fp8_tensor = fp8_tensor.T if size_k_first else fp8_tensor
+    fp8_tensor = fp8_tensor.contiguous()
+    # fp8_tensor is contiguous and have shape (N, K) now
+    # with `.view(torch.int32)`, it become (N, K // 4)
+    int32_tensor = fp8_tensor.view(torch.int32)
+    return int32_tensor.T.contiguous() if size_k_first else int32_tensor
+
+
+def marlin_quant_fp8_torch(weight, group_size):
+    size_n, size_k = weight.shape
+    device = weight.device
+
+    if group_size != -1:
+        scales = weight.view(size_n, -1, group_size).abs().max(-1)[0] / 448
+        repeated_scales = scales.repeat_interleave(group_size, 1)
+        fp8_weight = (weight / repeated_scales).to(torch.float8_e4m3fn)
+        weight_ref = fp8_weight.to(weight.dtype) * repeated_scales
+    else:
+        scales = weight.view(size_n, 1, group_size).abs().max(-1)[0] / 448
+        repeated_scales = scales.repeat_interleave(size_k, 1)
+        fp8_weight = (weight / repeated_scales).to(torch.float8_e4m3fn)
+        weight_ref = fp8_weight.to(weight.dtype) * repeated_scales
+
+    packed_weight = pack_fp8_to_int32(fp8_weight, False).T.contiguous()
+    marlin_qweight = gptq_marlin_repack(
+        b_q_weight=packed_weight,
+        perm=torch.empty(0, dtype=torch.int, device=device),
+        size_k=size_k,
+        size_n=size_n,
+        num_bits=8,
+    )
+
+    marlin_scales = marlin_permute_scales(
+        s=scales.T, size_k=size_k, size_n=size_n, group_size=group_size
+    )
+
+    marlin_scales = fp8_fused_exponent_bias_into_scales(marlin_scales)
+
+    return weight_ref.T, marlin_qweight, marlin_scales
diff --git a/sglang/python/sglang/srt/layers/quantization/modelopt_quant.py b/sglang/python/sglang/srt/layers/quantization/modelopt_quant.py
new file mode 100644
index 0000000000000000000000000000000000000000..e47305e45d5da62252d97f8e51eff4ff3a20b63b
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/modelopt_quant.py
@@ -0,0 +1,1893 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/modelopt.py
+from __future__ import annotations
+
+import logging
+from enum import IntEnum
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
+
+import regex as re
+import torch
+from torch.nn.parameter import Parameter
+
+from sglang.srt.distributed import get_tp_group
+from sglang.srt.distributed.device_communicators.pynccl_allocator import (
+    use_symmetric_memory,
+)
+from sglang.srt.environ import envs
+from sglang.srt.layers.dp_attention import is_allocation_symmetric
+from sglang.srt.layers.moe import (
+    MoeRunner,
+    MoeRunnerBackend,
+    MoeRunnerConfig,
+    get_moe_a2a_backend,
+    get_moe_runner_backend,
+)
+from sglang.srt.layers.moe.cutlass_moe_params import CutlassMoEParams, CutlassMoEType
+from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo
+from sglang.srt.layers.moe.utils import should_use_flashinfer_cutlass_moe_fp4_allgather
+from sglang.srt.layers.parameter import ModelWeightParameter, PerTensorScaleParameter
+from sglang.srt.layers.quantization.base_config import (
+    FusedMoEMethodBase,
+    LinearMethodBase,
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.fp4_utils import get_fp4_gemm_runner_backend
+from sglang.srt.layers.quantization.fp8_kernel import scaled_fp8_quant
+from sglang.srt.layers.quantization.fp8_utils import (
+    apply_fp8_linear,
+    cutlass_fp8_supported,
+    is_blackwell_supported,
+)
+from sglang.srt.layers.quantization.kv_cache import BaseKVCacheMethod
+from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
+from sglang.srt.layers.quantization.utils import (
+    convert_to_channelwise,
+    is_layer_skipped,
+    per_tensor_dequantize,
+    requantize_with_max_scale,
+    swizzle_blockscale,
+)
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.utils import copy_or_rebind_param
+from sglang.srt.utils.common import (
+    get_bool_env_var,
+    is_cuda,
+    is_sm120_supported,
+    next_power_of_2,
+)
+from sglang.srt.utils.custom_op import register_custom_op
+from sglang.srt.utils.patch_torch import register_fake_if_exists
+
+if TYPE_CHECKING:
+    from sglang.srt.batch_overlap.single_batch_overlap import DownGemmOverlapArgs
+    from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+    from sglang.srt.layers.moe.token_dispatcher import (
+        CombineInput,
+        StandardDispatchOutput,
+    )
+    from sglang.srt.models.utils import WeightsMapper
+
+fp4_quantize = None
+try:
+    if is_sm120_supported():
+        try:
+            from flashinfer import fp4_quantize
+        except ImportError:
+            from sglang.jit_kernel.nvfp4 import scaled_fp4_quant as fp4_quantize
+    else:
+        from sglang.jit_kernel.nvfp4 import scaled_fp4_quant as fp4_quantize
+except ImportError:
+    fp4_quantize = None
+
+try:
+    from flashinfer import mm_fp4 as flashinfer_fp4_gemm
+    from flashinfer import reorder_rows_for_gated_act_gemm, shuffle_matrix_sf_a
+
+    enable_flashinfer_fp4_gemm = True
+except ImportError:
+    if is_cuda():
+        from sglang.jit_kernel.nvfp4 import cutlass_scaled_fp4_mm as cutlass_fp4_gemm
+    enable_flashinfer_fp4_gemm = False
+    reorder_rows_for_gated_act_gemm = None
+    shuffle_matrix_a = None
+    shuffle_matrix_sf_a = None
+
+try:
+    from flashinfer.fused_moe import cutlass_fused_moe as flashinfer_cutlass_fused_moe
+    from flashinfer.fused_moe.core import ActivationType
+except ImportError:
+    flashinfer_cutlass_fused_moe = None
+
+    # Define a minimal ActivationType enum if flashinfer is not available
+    class ActivationType(IntEnum):
+        Swiglu = 3
+        Relu2 = 6
+
+
+# Initialize logger for the module
+logger = logging.getLogger(__name__)
+
+
+def _sglang_fp4_gemm_fake(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    input_sf: torch.Tensor,
+    weight_sf: torch.Tensor,
+    alpha: torch.Tensor,
+    out_dtype: torch.dtype,
+    out_features: int,
+) -> torch.Tensor:
+    M = input.shape[-2]
+    N = int(out_features)
+    return input.new_empty((M, N), dtype=out_dtype)
+
+
+@register_custom_op(fake_impl=_sglang_fp4_gemm_fake)
+def fp4_gemm(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    input_sf: torch.Tensor,
+    weight_sf: torch.Tensor,
+    alpha: torch.Tensor,
+    out_dtype: torch.dtype,
+    out_features: int,
+) -> torch.Tensor:
+    fp4_backend = get_fp4_gemm_runner_backend()
+    if enable_flashinfer_fp4_gemm:
+        # Use the remapping logic to convert SGLang backend names to FlashInfer API names
+        backend = fp4_backend.get_flashinfer_backend()
+        return flashinfer_fp4_gemm(
+            input, weight, input_sf, weight_sf, alpha, out_dtype, backend=backend
+        )
+    else:
+        return cutlass_fp4_gemm(input, weight, input_sf, weight_sf, alpha, out_dtype)
+
+
+if is_cuda() and (not is_sm120_supported()) and (fp4_quantize is not None):
+
+    @register_fake_if_exists("sgl_kernel::scaled_fp4_quant")
+    def _sgl_kernel_scaled_fp4_quant_fake(
+        output, input, output_scale, input_global_scale
+    ):
+        return
+
+
+CUTEDSL_MOE_SCALAR_INPUT_SCALE = get_bool_env_var(
+    "SGLANG_CUTEDSL_MOE_SCALAR_INPUT_SCALE", "true"
+)
+
+# FP4 GEMM alignment constant - CUTLASS/FlashInfer kernels require dimensions divisible by 32
+FP4_GEMM_ALIGNMENT = 32
+
+
+def round_up_to_multiple(x: int, m: int) -> int:
+    """Round up x to the nearest multiple of m."""
+    return (x + m - 1) // m * m
+
+
+def pad_nvfp4_weight(
+    weight: torch.Tensor,
+    n_alignment: int = FP4_GEMM_ALIGNMENT,
+    k_alignment: int = FP4_GEMM_ALIGNMENT,
+) -> tuple[torch.Tensor, int]:
+    """
+    Pad packed NVFP4 weights to satisfy alignment constraints for FP4 GEMM kernels.
+
+    Different backends have different alignment requirements:
+    - CUTLASS/cuDNN: N % 32 == 0, K % 32 == 0
+    - TRTLLM: N % 128 == 0 (for shuffle_matrix_sf_a), K padding handled separately
+
+    Args:
+        weight: Packed FP4 weight tensor of shape [N, K//2] (2 FP4 values per byte)
+        n_alignment: Required alignment for N dimension (default 32, use 128 for TRTLLM)
+        k_alignment: Required alignment for K dimension (default 32, use 0 to skip)
+
+    Returns:
+        Tuple of (padded_weight, weights_padding_cols) where weights_padding_cols
+        is the number of columns added for K-dimension padding (in bytes).
+    """
+    weight_current_rows = weight.shape[0]  # N dimension
+    weight_current_col_bytes = weight.shape[1]  # K//2 (packed)
+
+    # Calculate padding for N dimension (rows)
+    pad_rows = 0
+    if n_alignment > 0 and weight_current_rows % n_alignment != 0:
+        total_rows = round_up_to_multiple(weight_current_rows, n_alignment)
+        pad_rows = total_rows - weight_current_rows
+
+    # Calculate padding for K dimension (columns)
+    # 2 FP4 items are packed per byte in the input dimension
+    weight_current_col_elements = weight_current_col_bytes * 2
+    pad_cols_bytes = 0
+    if k_alignment > 0 and weight_current_col_elements % k_alignment != 0:
+        total_cols = round_up_to_multiple(weight_current_col_elements, k_alignment)
+        pad_cols = total_cols - weight_current_col_elements
+        # pad_cols is in elements, but padding is in bytes (2 elements per byte)
+        pad_cols_bytes = pad_cols // 2
+
+    # Apply padding in a single operation if needed
+    # For 2D tensor, pad argument is (pad_left, pad_right, pad_top, pad_bottom)
+    if pad_rows > 0 or pad_cols_bytes > 0:
+        weight = torch.nn.functional.pad(
+            weight, (0, pad_cols_bytes, 0, pad_rows)
+        ).contiguous()
+
+    return weight, pad_cols_bytes
+
+
+def pad_nvfp4_activation_for_cutlass(
+    x_fp4: torch.Tensor,
+    weights_padding_cols: int,
+) -> torch.Tensor:
+    """
+    Pad packed FP4 activations to match the K-dimension padding applied to weights.
+
+    Args:
+        x_fp4: Packed FP4 activation tensor
+        weights_padding_cols: Number of padding columns (in bytes) from weight padding
+
+    Returns:
+        Padded activation tensor
+    """
+    if weights_padding_cols > 0:
+        return torch.nn.functional.pad(x_fp4, (0, weights_padding_cols)).contiguous()
+    return x_fp4
+
+
+def slice_nvfp4_output(
+    out: torch.Tensor,
+    output_size: int,
+) -> torch.Tensor:
+    """
+    Slice the output tensor to remove padding in N dimension if weight was padded.
+
+    Args:
+        out: Output tensor from FP4 GEMM
+        output_size: Original output size before padding
+
+    Returns:
+        Sliced output tensor with padding removed
+    """
+    if out.shape[-1] != output_size:
+        return out[..., :output_size].contiguous()
+    return out
+
+
+# TODO make it true by default when the DeepEP PR is merged
+MOE_NVFP4_DISPATCH = envs.SGLANG_MOE_NVFP4_DISPATCH.get()
+# Supported activation schemes for the current configuration
+ACTIVATION_SCHEMES = ["static"]
+
+ACT_STR_TO_TYPE_MAP = {
+    "silu": ActivationType.Swiglu,  # This is the default
+    "relu2": ActivationType.Relu2,
+}
+
+
+class ModelOptQuantConfig(QuantizationConfig):
+    def __init__(
+        self,
+        kv_cache_quant_algo: Optional[str],
+        exclude_modules: Optional[List[str]],
+        packed_modules_mapping: Optional[Dict[str, List[str]]],
+    ):
+        super().__init__()
+        self.packed_modules_mapping = packed_modules_mapping
+        self.exclude_modules = exclude_modules or []
+        self.kv_cache_quant_algo = kv_cache_quant_algo
+
+    def _get_quant_method(
+        self,
+        layer: torch.nn.Module,
+        prefix: str,
+        *,
+        Linear: type[LinearMethodBase],
+        Moe: type[FusedMoEMethodBase],
+    ) -> Optional[QuantizeMethodBase]:
+        from sglang.srt.layers.linear import LinearBase
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+
+        if isinstance(layer, LinearBase):
+            if is_layer_skipped(
+                prefix, self.exclude_modules, self.packed_modules_mapping
+            ) or self.is_layer_excluded(prefix):
+                return UnquantizedLinearMethod()
+            return Linear(self)
+        elif self.kv_cache_quant_algo and isinstance(layer, RadixAttention):
+            return ModelOptFp8KVCacheMethod(self)
+        elif isinstance(layer, FusedMoE):
+            # Check if MoE layer should be excluded from quantization
+            # (e.g., MTP layers that have no quantization scales in checkpoint)
+            if self.is_layer_excluded(prefix):
+                # Falls back to default unquantized MoE
+                return None
+            return Moe(self)
+        return None
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return ["hf_quant_config.json"]
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+    def apply_weight_name_mapper(
+        self, hf_to_sglang_mapper: "WeightsMapper"
+    ):  # noqa: B027
+        # Map excluded module patterns from HF layout to sglang layout.
+        # Ref: HF hf_quant_config.json for nvidia/Kimi-K2.5-NVFP4
+        # https://huggingface.co/nvidia/Kimi-K2.5-NVFP4/blob/main/hf_quant_config.json
+        if self.exclude_modules:
+            mapped = hf_to_sglang_mapper.apply_list(self.exclude_modules)
+            expanded: List[str] = []
+            for name in mapped:
+                expanded.append(name)
+                if name.startswith("language_model."):
+                    expanded.append(name.removeprefix("language_model."))
+            # Preserve order, drop duplicates.
+            self.exclude_modules = list(dict.fromkeys(expanded))
+
+    def is_layer_excluded(self, prefix: str) -> bool:
+        """Check if a layer should be excluded from quantization.
+
+        Handles:
+        - Exact matches (e.g., "lm_head" matching prefix "lm_head")
+        - Glob-style wildcards (e.g., "mtp*" matching "mtp_layers")
+        - Part-by-part matching (split prefix on "." and check each part)
+        - language_model. prefix stripping for vision-language models
+        - Fused module patterns (e.g., "q_a_proj" in "fused_qkv_a_proj_with_mqa")
+        """
+        if not self.exclude_modules:
+            return False
+
+        # Build prefix variants: some models wrap layers under "language_model."
+        prefixes_to_check = [prefix]
+        if prefix.startswith("language_model."):
+            prefixes_to_check.append(prefix.removeprefix("language_model."))
+
+        # Fused module patterns: the exclude list may reference a sub-component
+        # (e.g., "q_a_proj") that is fused into a combined parameter name
+        # (e.g., "fused_qkv_a_proj_with_mqa"). We check if the last segment of
+        # the exclude pattern is a substring of the last segment of the prefix.
+        fused_patterns = {"q_a_proj", "q_b_proj", "kv_a_proj_with_mqa", "kv_b_proj"}
+
+        for pattern in self.exclude_modules:
+            # Convert glob-style wildcard to regex (e.g., "mtp*" -> "mtp.*")
+            regex_str = pattern.replace(".", r"\.").replace("*", r".*")
+
+            for pfx in prefixes_to_check:
+                if re.fullmatch(regex_str, pfx):
+                    return True
+                # Part-by-part check: handles wildcards like "mtp*" matching
+                pfx_parts = pfx.split(".")
+                for part in pfx_parts:
+                    if re.fullmatch(regex_str, part):
+                        return True
+
+            # Check fused patterns: if the last segment of the exclude pattern
+            # is a known fused component, check if it appears in the prefix's
+            # last segment (handles fused_qkv_a_proj_with_mqa containing q_a_proj)
+            pattern_tail = pattern.rsplit(".", maxsplit=1)[-1]
+            if pattern_tail in fused_patterns:
+                for pfx in prefixes_to_check:
+                    if pattern_tail in pfx.rsplit(".", maxsplit=1)[-1]:
+                        return True
+
+        return False
+
+
+class ModelOptFp8Config(ModelOptQuantConfig):
+    """Configuration for ModelOpt FP8 quantization, including serialization and compatibility checks."""
+
+    def __init__(
+        self,
+        is_checkpoint_fp8_serialized: bool = False,
+        kv_cache_quant_method: Optional[str] = None,
+        exclude_modules: Optional[List[str]] = None,
+        packed_modules_mapping: Optional[Dict[str, List[str]]] = None,
+    ) -> None:
+        """
+        Args:
+            is_checkpoint_fp8_serialized (bool): Indicates if the checkpoint uses serialized FP8 format.
+        """
+        super().__init__(kv_cache_quant_method, exclude_modules, packed_modules_mapping)
+        self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
+        if is_checkpoint_fp8_serialized:
+            logger.warning(
+                "Detected ModelOpt FP8 checkpoint. The format is experimental and subject to change."
+            )
+
+    @classmethod
+    def override_quantization_method(cls, hf_quant_config, user_quant):
+        """Override quantization method based on the model's config."""
+        return cls._modelopt_override_quantization_method(hf_quant_config, user_quant)
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "modelopt_fp8"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.bfloat16, torch.half]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 89  # Minimum hardware capability (e.g., Hopper GPUs).
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> ModelOptFp8Config:
+        # Handle two different config formats:
+        # 1. hf_quant_config.json format: {"quantization": {"quant_algo": "FP8", ...}}
+        # 2. config.json quantization_config format: {"quant_algo": "FP8", ...}
+        # In future modelopt will deprecate hf_quant_config.json, and only keep config.json.
+        # For legacy reasons, we keep hf_quant_config.json for now.
+
+        # Initialize variables
+        kv_cache_quant_method = None
+        exclude_modules = None
+
+        # Try flat format first (config.json quantization_config - preferred format)
+        quant_method = config.get("quant_algo")
+        if quant_method is not None:
+            # Flat format (config.json quantization_config)
+            # Derive kv_cache quant from kv_cache_scheme dict
+            kv_cache_scheme = config.get("kv_cache_scheme")
+            if isinstance(kv_cache_scheme, dict):
+                if (
+                    kv_cache_scheme.get("type") == "float"
+                    and kv_cache_scheme.get("num_bits") == 8
+                ):
+                    kv_cache_quant_method = "FP8"
+
+            # Map 'ignore' field to 'exclude_modules'
+            exclude_modules = config.get("ignore")
+        else:
+            # Fall back to nested format (hf_quant_config.json - will be deprecated)
+            try:
+                quantization_section = cls.get_from_keys(config, ["quantization"])
+                quant_method = quantization_section.get("quant_algo")
+                kv_cache_quant_method = quantization_section.get("kv_cache_quant_algo")
+                exclude_modules = quantization_section.get("exclude_modules")
+            except ValueError:
+                raise ValueError(
+                    "Cannot find 'quant_algo' in the model's quantization config. "
+                    "Expected either flat format (config.json) or nested format (hf_quant_config.json)."
+                )
+        if quant_method is None:
+            raise ValueError(
+                "Cannot find 'quant_algo' in the model's quantization config. "
+            )
+        if "FP8" not in quant_method:
+            raise ValueError(
+                "ModelOptFp8Config only supports static FP8 quantization in SGLang. "
+                "For FP4 quantization, use ModelOptFp4Config. "
+                "Check the quantization config for your model's configuration."
+            )
+
+        return cls(
+            is_checkpoint_fp8_serialized=True,
+            kv_cache_quant_method=kv_cache_quant_method,
+            exclude_modules=exclude_modules,
+            packed_modules_mapping=config.get("packed_modules_mapping"),
+        )
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional[QuantizeMethodBase]:
+        return self._get_quant_method(
+            layer, prefix, Linear=ModelOptFp8LinearMethod, Moe=ModelOptFp8MoEMethod
+        )
+
+
+class ModelOptFp8LinearMethod(LinearMethodBase):
+    """Linear method for ModelOpt static FP8 quantization.
+
+    Supports loading FP8 checkpoints with static weight and activation scales.
+    Future support may include dynamic scales.
+
+    **Limitations**:
+    1. Only supports per-tensor quantization due to `torch._scaled_mm` limitations.
+    2. Only supports the `float8_e4m3fn` data type.
+
+    Args:
+        quant_config (ModelOptFp8Config): The ModelOpt quantization configuration.
+    """
+
+    def __init__(self, quant_config: ModelOptFp8Config):
+        super().__init__()
+        self.quant_config = quant_config
+        self.cutlass_fp8_supported = cutlass_fp8_supported()
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: Optional[int],
+        output_size: Optional[int],
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ) -> None:
+        """Creates and registers weights, weight scales, and input scales for FP8 quantization."""
+        output_size_per_partition = sum(output_partition_sizes)
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        weight_dtype = (
+            torch.float8_e4m3fn
+            if self.quant_config.is_checkpoint_fp8_serialized
+            else params_dtype
+        )
+
+        # Set layer attributes
+        layer.logical_widths = output_partition_sizes
+        layer.input_size_per_partition = input_size_per_partition
+        layer.output_size_per_partition = output_size_per_partition
+
+        # Register weight
+        layer.register_parameter(
+            "weight",
+            ModelWeightParameter(
+                data=torch.empty(
+                    output_size_per_partition,
+                    input_size_per_partition,
+                    dtype=weight_dtype,
+                ),
+                input_dim=1,
+                output_dim=0,
+                weight_loader=weight_loader,
+            ),
+        )
+
+        if self.quant_config.is_checkpoint_fp8_serialized:
+            # Register weight and input scales
+            for scale_name in ["weight_scale", "input_scale"]:
+                layer.register_parameter(
+                    scale_name,
+                    PerTensorScaleParameter(
+                        data=torch.full(
+                            (len(output_partition_sizes),),
+                            torch.finfo(torch.float32).min,
+                            dtype=torch.float32,
+                        ),
+                        weight_loader=weight_loader,
+                    ),
+                )
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        """Requantizes weights after loading using the maximum scale."""
+        max_w_scale, quantized_weight = requantize_with_max_scale(
+            layer.weight, layer.weight_scale, layer.logical_widths
+        )
+        layer.weight = Parameter(quantized_weight.t(), requires_grad=False)
+        # cutlass sgl-kernel only supports per-channel scale
+        if self.cutlass_fp8_supported:
+            max_w_scale = convert_to_channelwise(max_w_scale, layer.logical_widths)
+        layer.weight_scale = Parameter(max_w_scale, requires_grad=False)
+        layer.input_scale = Parameter(layer.input_scale.max(), requires_grad=False)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Applies FP8 linear transformation."""
+        return apply_fp8_linear(
+            input=x,
+            weight=layer.weight,
+            weight_scale=layer.weight_scale,
+            input_scale=layer.input_scale,
+            bias=bias,
+            cutlass_fp8_supported=self.cutlass_fp8_supported,
+        )
+
+
+class ModelOptFp8KVCacheMethod(BaseKVCacheMethod):
+    """
+    Handles loading FP8 kv-cache scaling factors from modelopt quantized checkpoints.
+    """
+
+    def __init__(self, quant_config: ModelOptFp8Config):
+        super().__init__(quant_config)
+
+
+class ModelOptFp8MoEMethod(FusedMoEMethodBase):
+    """MoE method for ModelOpt FP8.
+    Supports loading FP8 checkpoints with static weight scale and activation scale.
+
+    Args:
+        quant_config: The ModelOpt quantization config.
+    """
+
+    def __init__(self, quant_config: ModelOptFp8Config):
+        self.quant_config = quant_config
+        self.cutlass_fp8_supported = cutlass_fp8_supported()
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        # Use FP8 dtype if checkpoint is serialized, otherwise use the default dtype
+        weight_dtype = (
+            torch.float8_e4m3fn
+            if self.quant_config.is_checkpoint_fp8_serialized
+            else params_dtype
+        )
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        num_shards = 2 if layer.moe_runner_config.is_gated else 1
+        intermediate_size = num_shards * intermediate_size_per_partition
+        w13_weight = ModelWeightParameter(
+            data=torch.empty(
+                num_experts,
+                intermediate_size,
+                hidden_size,
+                dtype=weight_dtype,
+            ),
+            input_dim=2,
+            output_dim=1,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+
+        w2_weight = ModelWeightParameter(
+            data=torch.empty(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition,
+                dtype=weight_dtype,
+            ),
+            input_dim=2,
+            output_dim=1,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w2_weight", w2_weight)
+
+        if self.quant_config.is_checkpoint_fp8_serialized:
+            # WEIGHT SCALES - Per-tensor scaling for ModelOpts
+            # Allocate 2 scales for w1 and w3 respectively.
+            # They will be combined to a single scale after weight loading.
+            w13_scale_shape = (num_experts, num_shards)
+            w13_weight_scale = PerTensorScaleParameter(
+                data=torch.full(
+                    w13_scale_shape,
+                    torch.finfo(torch.float32).min,
+                    dtype=torch.float32,
+                ),
+                weight_loader=weight_loader,
+            )
+            w2_weight_scale = PerTensorScaleParameter(
+                data=torch.full(
+                    (num_experts,), torch.finfo(torch.float32).min, dtype=torch.float32
+                ),
+                weight_loader=weight_loader,
+            )
+            layer.register_parameter("w13_weight_scale", w13_weight_scale)
+            layer.register_parameter("w2_weight_scale", w2_weight_scale)
+
+            # Set weight loader attributes for scales
+            extra_weight_attrs.update(
+                {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
+            )
+
+            # INPUT SCALES - Per-tensor scaling for ModelOpt
+            w13_input_scale = PerTensorScaleParameter(
+                data=torch.full((num_experts,), 1.0, dtype=torch.float32),
+                weight_loader=weight_loader,
+            )
+            w2_input_scale = PerTensorScaleParameter(
+                data=torch.full((num_experts,), 1.0, dtype=torch.float32),
+                weight_loader=weight_loader,
+            )
+            layer.register_parameter("w13_input_scale", w13_input_scale)
+            layer.register_parameter("w2_input_scale", w2_input_scale)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        """Process FP8 MoE weights after loading from serialized checkpoint.
+
+        Only supports pre-quantized checkpoints with FP8 weights and scales.
+        """
+
+        layer.w13_weight = Parameter(layer.w13_weight.data, requires_grad=False)
+        layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False)
+
+        # Handle scale parameters
+        if hasattr(layer, "w13_weight_scale") and layer.w13_weight_scale is not None:
+            # Fp8 moe kernel needs single weight scale for w13 per expert.
+            # We take the max of the w1 and w3 scales then dequant and requant each expert.
+            if layer.w13_weight_scale.dim() == 2:  # Shape: (num_experts, 2)
+                # Get the maximum scale across w1 and w3 for each expert
+                max_w13_scales = layer.w13_weight_scale.max(dim=1).values
+
+                # Requantize each expert's weights using the combined scale
+                # w13_weight has shape (num_experts, 2 * intermediate_size_per_partition, hidden_size)
+                # where the first intermediate_size_per_partition rows are w1, the next are w3
+                num_shards = 2 if layer.moe_runner_config.is_gated else 1
+                intermediate_size_per_partition = (
+                    layer.w13_weight.shape[1] // num_shards
+                )
+                for expert_id in range(layer.w13_weight.shape[0]):
+                    start = 0
+                    for shard_id in range(num_shards):  # (w1 and w3) or w13
+                        # Dequantize using the original scale for this shard
+                        dq_weight = per_tensor_dequantize(
+                            layer.w13_weight[expert_id][
+                                start : start + intermediate_size_per_partition, :
+                            ],
+                            layer.w13_weight_scale[expert_id][shard_id],
+                        )
+                        # Requantize using the combined max scale
+                        (
+                            layer.w13_weight[expert_id][
+                                start : start + intermediate_size_per_partition, :
+                            ],
+                            _,
+                        ) = scaled_fp8_quant(dq_weight, max_w13_scales[expert_id])
+
+                        start += intermediate_size_per_partition
+
+                # Update the scale parameter to be per-expert instead of per-shard
+                layer.w13_weight_scale = Parameter(max_w13_scales, requires_grad=False)
+            else:
+                layer.w13_weight_scale = Parameter(
+                    layer.w13_weight_scale.data, requires_grad=False
+                )
+
+        if hasattr(layer, "w2_weight_scale") and layer.w2_weight_scale is not None:
+            layer.w2_weight_scale = Parameter(
+                layer.w2_weight_scale.data, requires_grad=False
+            )
+        if hasattr(layer, "w13_input_scale") and layer.w13_input_scale is not None:
+            layer.w13_input_scale = Parameter(
+                layer.w13_input_scale.max(), requires_grad=False
+            )
+        if hasattr(layer, "w2_input_scale") and layer.w2_input_scale is not None:
+            layer.w2_input_scale = Parameter(
+                layer.w2_input_scale.max(), requires_grad=False
+            )
+
+        # Align FP8 weights to FlashInfer per-tensor kernel layout if enabled
+        if get_moe_runner_backend().is_flashinfer_trtllm():
+            from sglang.srt.layers.moe.moe_runner.flashinfer_trtllm import (
+                align_fp8_moe_weights_for_flashinfer_trtllm,
+            )
+
+            # ModelOpt FP8 stores weights in [Up, Gate] order, so we need to swap
+            align_fp8_moe_weights_for_flashinfer_trtllm(layer, swap_w13_halves=True)
+        elif get_moe_runner_backend().is_flashinfer_cutlass():
+            assert (
+                hasattr(layer, "w13_input_scale") and layer.w13_input_scale is not None
+            )
+            assert hasattr(layer, "w2_input_scale") and layer.w2_input_scale is not None
+            assert (
+                hasattr(layer, "w13_weight_scale")
+                and layer.w13_weight_scale is not None
+            )
+            assert (
+                hasattr(layer, "w2_weight_scale") and layer.w2_weight_scale is not None
+            )
+
+            input_scale = layer.w13_input_scale.to(torch.float32)
+            activation_scale = layer.w2_input_scale.to(torch.float32)
+            w13_weight_scale = layer.w13_weight_scale.to(torch.float32)
+            w2_weight_scale = layer.w2_weight_scale.to(torch.float32)
+
+            layer.fc1_dequant = Parameter(
+                w13_weight_scale * input_scale, requires_grad=False
+            )
+            layer.fc2_quant = Parameter(
+                activation_scale.reciprocal(), requires_grad=False
+            )
+            layer.fc2_dequant = Parameter(
+                activation_scale * w2_weight_scale, requires_grad=False
+            )
+            layer.fc1_input_dequant = Parameter(input_scale, requires_grad=False)
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+        self.runner = MoeRunner(MoeRunnerBackend.TRITON, moe_runner_config)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+
+        # Fast path: TRT-LLM FP8 per-tensor MoE using BYPASSED TopK routing
+        from sglang.srt.layers.moe.topk import TopKOutputChecker
+
+        if (
+            get_moe_runner_backend().is_flashinfer_trtllm()
+            and TopKOutputChecker.format_is_bypassed(topk_output)
+        ):
+            from sglang.srt.layers.moe.moe_runner.flashinfer_trtllm import (
+                FlashInferTrtllmFp8MoeQuantInfo,
+                fused_experts_none_to_flashinfer_trtllm_fp8,
+            )
+            from sglang.srt.layers.moe.utils import RoutingMethodType
+
+            topk_config = topk_output.topk_config
+
+            # Constraints for ModelOpt FP8 MoE
+            assert (
+                self.moe_runner_config.activation == "silu"
+            ), "Only silu is supported for flashinfer fp8 moe"
+
+            # Enforce Llama4 routing for ModelOpt FP8 MoE for now.
+            # TODO(brayden): support other routing methods
+            assert topk_config.top_k == 1, "ModelOpt FP8 MoE requires top_k==1"
+            assert (
+                not topk_config.num_expert_group
+            ), "ModelOpt FP8 MoE does not support expert grouping"
+            assert (
+                not topk_config.topk_group
+            ), "ModelOpt FP8 MoE does not support grouped top-k"
+
+            quant_info = FlashInferTrtllmFp8MoeQuantInfo(
+                w13_weight=layer.w13_weight,
+                w2_weight=layer.w2_weight,
+                global_num_experts=layer.num_experts,
+                local_expert_offset=layer.moe_ep_rank * layer.num_local_experts,
+                local_num_experts=layer.num_local_experts,
+                intermediate_size=layer.w2_weight.shape[2],
+                routing_method_type=RoutingMethodType.Llama4,
+                block_quant=False,
+                w13_input_scale=layer.w13_input_scale,
+                output1_scales_scalar=layer.output1_scales_scalar,
+                output1_scales_gate_scalar=layer.output1_scales_gate_scalar,
+                output2_scales_scalar=layer.output2_scales_scalar,
+                use_routing_scales_on_input=True,
+            )
+
+            return fused_experts_none_to_flashinfer_trtllm_fp8(
+                dispatch_output, quant_info, self.moe_runner_config
+            )
+
+        if get_moe_runner_backend().is_flashinfer_cutlass():
+            activation = ACT_STR_TO_TYPE_MAP[self.moe_runner_config.activation]
+            assert (
+                (
+                    activation is ActivationType.Relu2
+                    and not self.moe_runner_config.is_gated
+                )
+                or activation is ActivationType.Swiglu
+                and self.moe_runner_config.is_gated
+            ), "Only Relu2 non-gated or Swiglu gated are supported for flashinfer cutlass fp8 moe"
+            topk_weights, topk_ids = topk_output.topk_weights, topk_output.topk_ids
+            x_fp8, _ = scaled_fp8_quant(x, layer.w13_input_scale)
+            output_dtype = x.dtype
+            original_col = x.shape[1]
+            x_sf = None
+
+            with use_symmetric_memory(
+                get_tp_group(), disabled=not is_allocation_symmetric()
+            ):
+                symm_output = torch.empty(
+                    x.shape[0], original_col, dtype=output_dtype, device=x.device
+                )
+            output = flashinfer_cutlass_fused_moe(
+                output=symm_output,
+                input=x_fp8,
+                token_selected_experts=topk_ids.to(torch.int),
+                token_final_scales=topk_weights,
+                fc1_expert_weights=layer.w13_weight,
+                fc2_expert_weights=layer.w2_weight,
+                output_dtype=output_dtype,
+                input_sf=x_sf,
+                quant_scales=[
+                    layer.fc1_dequant,
+                    layer.fc2_quant,
+                    layer.fc2_dequant,
+                    layer.fc1_input_dequant,
+                ],
+                ep_size=layer.moe_ep_size,
+                ep_rank=layer.moe_ep_rank,
+                tp_size=layer.moe_tp_size,
+                tp_rank=layer.moe_tp_rank,
+                tune_max_num_tokens=next_power_of_2(x.shape[0]),
+                activation_type=activation,
+            )[0]
+
+            from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+            return StandardCombineInput(hidden_states=output)
+
+        quant_info = TritonMoeQuantInfo(
+            w13_weight=layer.w13_weight,
+            w2_weight=layer.w2_weight,
+            use_fp8_w8a8=True,
+            per_channel_quant=False,
+            w13_scale=layer.w13_weight_scale,
+            w2_scale=layer.w2_weight_scale,
+            a13_scale=layer.w13_input_scale,
+            a2_scale=layer.w2_input_scale,
+        )
+
+        return self.runner.run(dispatch_output, quant_info)
+
+
+class ModelOptFp4Config(ModelOptQuantConfig):
+    """Config class for FP4."""
+
+    def __init__(
+        self,
+        is_checkpoint_nvfp4_serialized: bool = False,
+        kv_cache_quant_algo: str = None,
+        group_size: int = None,
+        exclude_modules: List[str] = None,
+        packed_modules_mapping: Optional[Dict[str, List[str]]] = None,
+    ) -> None:
+        super().__init__(kv_cache_quant_algo, exclude_modules, packed_modules_mapping)
+        self.is_checkpoint_nvfp4_serialized = is_checkpoint_nvfp4_serialized
+        if is_checkpoint_nvfp4_serialized:
+            logger.warning(
+                "Detected nvfp4 checkpoint. Please note that the "
+                "format is experimental and subject to change."
+            )
+        self.group_size = group_size
+
+    @classmethod
+    def override_quantization_method(cls, hf_quant_config, user_quant):
+        """Override quantization method based on the model's config."""
+        return cls._modelopt_override_quantization_method(hf_quant_config, user_quant)
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "modelopt_fp4"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.bfloat16, torch.half, torch.float8_e4m3fn]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 100
+
+    @staticmethod
+    def common_group_size(cfg: dict) -> int:
+        """Return the unique group_size across the config; raise if missing/mismatched."""
+        sizes = set()
+
+        # Top-level and 'quantization' block
+        v = cfg.get("group_size")
+        if isinstance(v, int):
+            sizes.add(v)
+        q = cfg.get("quantization")
+        if isinstance(q, dict):
+            v = q.get("group_size")
+            if isinstance(v, int):
+                sizes.add(v)
+
+        # config_groups: accept group-level or nested dicts (e.g., weights/input_activations)
+        for g in (cfg.get("config_groups") or {}).values():
+            if isinstance(g, dict):
+                v = g.get("group_size")
+                if isinstance(v, int):
+                    sizes.add(v)
+                for sub in g.values():
+                    if isinstance(sub, dict):
+                        v = sub.get("group_size")
+                        if isinstance(v, int):
+                            sizes.add(v)
+
+        if not sizes:
+            raise ValueError("No group_size found in config.")
+        if len(sizes) > 1:
+            raise ValueError(f"Inconsistent group_size values: {sorted(sizes)}")
+        return next(iter(sizes))
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> ModelOptFp4Config:
+        # Handle two different config formats:
+        # 1. hf_quant_config.json format: {"quantization": {"quant_algo": "NVFP4", ...}}
+        # 2. config.json quantization_config format: {"quant_algo": "NVFP4", ...}
+        # In future modelopt will deprecate hf_quant_config.json, and only keep config.json.
+        # For legacy reasons, we keep hf_quant_config.json for now.
+
+        # Initialize variables
+        kv_cache_quant_algo = None
+        group_size = None
+        exclude_modules = []
+
+        # Try flat format first (config.json quantization_config - preferred format)
+        quant_method = config.get("quant_algo")
+        if quant_method is not None:
+            # Flat format (config.json quantization_config)
+            # Derive kv_cache_quant_algo from kv_cache_scheme dict
+            kv_cache_scheme = config.get("kv_cache_scheme")
+            if isinstance(kv_cache_scheme, dict):
+                if (
+                    kv_cache_scheme.get("type") == "float"
+                    and kv_cache_scheme.get("num_bits") == 8
+                ):
+                    kv_cache_quant_algo = "FP8"
+                else:
+                    kv_cache_quant_algo = "auto"
+            elif isinstance(kv_cache_scheme, str):
+                scheme_name = kv_cache_scheme.strip().upper()
+                if scheme_name in ("FP8", "FLOAT8"):
+                    kv_cache_quant_algo = "FP8"
+                elif scheme_name in ("FP4", "FLOAT4", "NVFP4"):
+                    kv_cache_quant_algo = "NVFP4"
+                else:
+                    kv_cache_quant_algo = "auto"
+            else:
+                kv_cache_quant_algo = "auto"
+
+            group_size = config.get("group_size")
+            # If group_size is not at top level, try to extract from config_groups
+            if group_size is None:
+                config_groups = config.get("config_groups", {})
+                if config_groups:
+                    # Get group_size from the first group's weights config
+                    first_group = next(iter(config_groups.values()), {})
+                    weights_config = first_group.get("weights", {})
+                    group_size = weights_config.get("group_size")
+
+            exclude_modules = config.get("ignore", [])
+        else:
+            # Fall back to nested format (hf_quant_config.json - legacy format)
+            try:
+                quant_config = cls.get_from_keys(config, ["quantization"])
+                quant_method = quant_config["quant_algo"]
+                kv_cache_quant_algo = quant_config.get("kv_cache_quant_algo")
+                if not kv_cache_quant_algo:
+                    kv_cache_quant_algo = "auto"
+                group_size = ModelOptFp4Config.common_group_size(config)
+                exclude_modules = quant_config.get("exclude_modules", [])
+            except (ValueError, KeyError):
+                raise ValueError(
+                    "Cannot find 'quant_algo' in the model's quantization config. "
+                    "Expected either flat format (config.json) or nested format (hf_quant_config.json)."
+                )
+
+        if not quant_method in ["FP8", "NVFP4"]:
+            raise ValueError(
+                f"ModelOpt currently only supports: FP8, NVFP4"
+                " quantizations in sglang. Please check the "
+                "quantization config for your model's configuration."
+            )
+        is_checkpoint_nvfp4_serialized = "NVFP4" in quant_method
+
+        if group_size is None or exclude_modules is None:
+            logger.warning(
+                f"group_size: {group_size},"
+                f"kv_cache_quant_algo: {kv_cache_quant_algo},"
+                f"exclude_modules: {exclude_modules}"
+            )
+            raise ValueError(
+                "NVFP4 quantization requires group_size and exclude_modules "
+                "specified in the quantization config"
+            )
+        return cls(
+            is_checkpoint_nvfp4_serialized,
+            kv_cache_quant_algo,
+            group_size,
+            exclude_modules,
+            config.get("packed_modules_mapping"),
+        )
+
+    def get_quant_method(self, layer: torch.nn.Module, prefix: str):
+        return self._get_quant_method(
+            layer,
+            prefix,
+            Linear=ModelOptFp4LinearMethod,
+            Moe=ModelOptNvFp4FusedMoEMethod,  # FlashInferFP4MoE needs the same quantization method but with compatible attribute handling
+        )
+
+
+class ModelOptFp4LinearMethod(LinearMethodBase):
+    """Linear method for NVFP4.
+    Supports loading NVFP4 checkpoints with the following structure:
+
+    |Tensor Name           | datatype      |  shape      |
+    |----------------------------------------------------|
+    |input_scale           | torch.float32 | scalar      |
+    |weight                | NVFP4(SE2M1)  | [1, X, y/2] |
+    |weight_scale          | FP8-E4M3      | [X, Y]      |
+    |weight_scale_2        | torch.float32 | scalar      |
+
+    The weights are quantized per block of 16 elements.
+    Args: quant_config: The ModelOpt quantization config.
+    """
+
+    def __init__(self, quant_config: ModelOptFp4Config):
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        del input_size, output_size
+        if not self.quant_config.is_checkpoint_nvfp4_serialized:
+            raise ValueError(
+                "NVFP4 quantization was selected, "
+                " dynamic quantization is not supported."
+            )
+
+        output_size_per_partition = sum(output_partition_sizes)
+        weight_loader = extra_weight_attrs.get("weight_loader")
+
+        layer.logical_widths = output_partition_sizes
+
+        layer.input_size_per_partition = input_size_per_partition
+        layer.output_size_per_partition = output_size_per_partition
+        if input_size_per_partition % 16 != 0:
+            raise ValueError(
+                "Unsupported model when in features size is " "not multiple of 16"
+            )
+
+        weight_dtype = (
+            torch.float8_e4m3fn
+            if self.quant_config.is_checkpoint_nvfp4_serialized
+            else params_dtype
+        )
+
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                # 2 fp4 data is packed in one uint8 in the input dimension
+                output_size_per_partition,
+                input_size_per_partition // 2,
+                dtype=torch.uint8,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight", weight)
+
+        input_scale = PerTensorScaleParameter(
+            data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+
+        layer.register_parameter("input_scale", input_scale)
+
+        weight_scale_2 = PerTensorScaleParameter(
+            data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight_scale_2", weight_scale_2)
+
+        weight_scale = ModelWeightParameter(
+            data=torch.empty(
+                output_size_per_partition,
+                input_size_per_partition // self.quant_config.group_size,
+                dtype=weight_dtype,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+
+        layer.register_parameter("weight_scale", weight_scale)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        input_scale_2 = layer.input_scale.max().to(torch.float32)
+        weight_scale_2 = layer.weight_scale_2.max().to(torch.float32)
+
+        # Keep per-shard scales intact for hot reload; derive scalar params below.
+        copy_or_rebind_param(
+            layer, "alpha", (input_scale_2 * weight_scale_2).to(torch.float32)
+        )
+        copy_or_rebind_param(
+            layer, "input_scale_inv", (1 / input_scale_2).to(torch.float32)
+        )
+
+        # Store original output size before any padding
+        layer.output_size_per_partition = layer.weight.shape[0]
+
+        if get_fp4_gemm_runner_backend().is_flashinfer_trtllm():
+            # FlashInfer TRTLLM FP4 GEMM requires a different weight layout.
+            # FlashInfer provides nvfp4_quantize to quantize + shuffle the
+            # layout but we use our own quantization so we have to call
+            # shuffles ourselves.
+            #
+            # Alignment requirements:
+            #   - shuffle_matrix_a: weight.shape[0] (N) % 32 == 0
+            #   - shuffle_matrix_sf_a: scale.shape[0] (N) % 128 == 0, scale.shape[1] (K/16) % 4 == 0
+            # We pad N to multiple of 128 and K/16 to multiple of 4.
+            from flashinfer import shuffle_matrix_a, shuffle_matrix_sf_a
+
+            # Pad weight N dimension to 128
+            weight, _ = pad_nvfp4_weight(
+                layer.weight.data, n_alignment=128, k_alignment=0
+            )
+            # Pad scale N dimension to match weight
+            scale = layer.weight_scale
+            if scale.shape[0] != weight.shape[0]:
+                pad_n = weight.shape[0] - scale.shape[0]
+                scale = torch.nn.functional.pad(scale, (0, 0, 0, pad_n))
+
+            # Pad K dimension: scale K/16 must be multiple of 4
+            scale_k = scale.shape[1]  # K/16
+            weights_padding_cols = 0
+            if scale_k % 4 != 0:
+                padded_scale_k = round_up_to_multiple(scale_k, 4)
+                pad_scale_k = padded_scale_k - scale_k
+                # Pad scale K/16 dimension
+                scale = torch.nn.functional.pad(scale, (0, pad_scale_k, 0, 0))
+                # Pad weight K/2 dimension correspondingly (K/2 = K/16 * 8)
+                pad_weight_k = pad_scale_k * 8
+                weight = torch.nn.functional.pad(weight, (0, pad_weight_k, 0, 0))
+                # Store K padding for activation padding in apply()
+                weights_padding_cols = pad_weight_k
+
+            # Shuffle for TRTLLM layout
+            epilogue_tile_m = 128
+            shuffled_scale_shape = scale.shape
+            weight = shuffle_matrix_a(weight.view(torch.uint8), epilogue_tile_m)
+            scale = (
+                shuffle_matrix_sf_a(scale.view(torch.uint8), epilogue_tile_m)
+                .reshape(shuffled_scale_shape)
+                .view(torch.float8_e4m3fn)
+            )
+
+            copy_or_rebind_param(layer, "weight_scale_interleaved", scale)
+            copy_or_rebind_param(layer, "weight", weight)
+            layer.weights_padding_cols = weights_padding_cols
+            return
+
+        # Pad weights for CUTLASS/FlashInfer kernel alignment (K and N divisible by 32)
+        weight, weights_padding_cols = pad_nvfp4_weight(layer.weight.data)
+        layer.weights_padding_cols = weights_padding_cols
+        copy_or_rebind_param(layer, "weight", weight)
+
+        # Pad and blockwise interleave weight_scale
+        scales = layer.weight_scale
+        scale_ndim = scales.ndim
+        if scale_ndim == 2:
+            scales = scales.unsqueeze(0)
+        assert scales.ndim == 3
+        B, M, K = scales.shape
+        M_padded = round_up_to_multiple(M, 128)
+        K_padded = round_up_to_multiple(K, 4)
+        padded_scales = torch.zeros((B, M_padded, K_padded), dtype=scales.dtype)
+        padded_scales[:B, :M, :K] = scales
+        batches, rows, cols = padded_scales.shape
+        assert rows % 128 == 0
+        assert cols % 4 == 0
+        padded_scales = padded_scales.reshape(batches, rows // 128, 4, 32, cols // 4, 4)
+        padded_scales = padded_scales.permute((0, 1, 4, 3, 2, 5))
+        padded_scales = padded_scales.contiguous().cuda()
+        padded_scales = (
+            padded_scales.reshape(M_padded, K_padded)
+            if scale_ndim == 2
+            else padded_scales.reshape(B, M_padded, K_padded)
+        )
+        copy_or_rebind_param(layer, "weight_scale_interleaved", padded_scales)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        output_dtype = x.dtype
+        x_m, _ = x.shape
+
+        # Get original output size (before padding) and padded weight size
+        output_size = layer.output_size_per_partition
+        w_n, _ = layer.weight.shape
+        output_shape = [x_m, output_size]
+
+        # Quantize BF16 or FP16 to (FP4 and interleaved block scale)
+        x_fp4, x_scale_interleaved = fp4_quantize(x, layer.input_scale_inv)
+
+        assert x_fp4.dtype == torch.uint8
+        assert layer.weight.dtype == torch.uint8
+        assert layer.weight_scale_interleaved.dtype == torch.float8_e4m3fn
+        assert layer.alpha.dtype == torch.float32
+
+        # Pad activations to match weight K-dimension padding
+        weights_padding_cols = getattr(layer, "weights_padding_cols", 0)
+        x_fp4 = pad_nvfp4_activation_for_cutlass(x_fp4, weights_padding_cols)
+
+        w = layer.weight
+        w_scale_interleaved = layer.weight_scale_interleaved
+        if enable_flashinfer_fp4_gemm:
+            w = layer.weight.T
+            w_scale_interleaved = layer.weight_scale_interleaved.T
+
+        out = fp4_gemm(
+            x_fp4,
+            w,
+            x_scale_interleaved,
+            w_scale_interleaved,
+            layer.alpha,
+            output_dtype,
+            w_n,
+        )
+
+        # Slice output to remove N-dimension padding
+        out = slice_nvfp4_output(out, output_size)
+
+        if bias is not None:
+            out = out + bias
+        return out.view(*output_shape)
+
+
+class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
+    """
+       MoE Method for FP4 Quantization with Blockscales and PerTensorScales
+    Args:
+        quant_config: NVFP4 Quant Config
+    """
+
+    def __init__(self, quant_config: ModelOptFp4Config):
+        self.quant_config = quant_config
+        if not is_blackwell_supported():
+            raise ValueError(
+                "Current platform does not support NVFP4"
+                " quantization. Please use Blackwell and"
+                " above."
+            )
+        self.enable_flashinfer_trtllm_moe = (
+            get_moe_runner_backend().is_flashinfer_trtllm()
+        )
+        self._cache_permute_indices = {}
+
+    @property
+    def enable_flashinfer_cutlass_moe(self) -> bool:
+        from sglang.srt.layers.moe import get_moe_runner_backend
+
+        """Access the global enable_flashinfer_cutlass_moe setting."""
+        return get_moe_runner_backend().is_flashinfer_cutlass()
+
+    @property
+    def enable_flashinfer_cutedsl_moe(self) -> bool:
+        from sglang.srt.layers.moe import get_moe_runner_backend
+
+        """Access the global enable_flashinfer_cutedsl_moe setting."""
+        return get_moe_runner_backend().is_flashinfer_cutedsl()
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        if not self.quant_config.is_checkpoint_nvfp4_serialized:
+            raise ValueError(
+                "NVFP4 quantization was selected, "
+                " dynamic quantization is not supported."
+            )
+
+        # TODO(ch-wan): check if this is needed
+        layer.intermediate_size_per_partition = intermediate_size_per_partition
+        layer.params_dtype = params_dtype
+        layer.quant_config = self.quant_config
+
+        weight_dtype = torch.uint8
+        weight_scale_dtype = torch.float8_e4m3fn
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        # GEMM 1
+        num_shards = 2 if layer.moe_runner_config.is_gated else 1
+
+        w13_weight = ModelWeightParameter(
+            data=torch.empty(
+                layer.num_local_experts,
+                num_shards * intermediate_size_per_partition,
+                # 2 fp4 items are packed in the input dimension
+                hidden_size // 2,
+                dtype=weight_dtype,
+            ),
+            input_dim=1,
+            output_dim=2,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+
+        # GEMM 2
+        w2_weight = ModelWeightParameter(
+            data=torch.empty(
+                layer.num_local_experts,
+                hidden_size,
+                # 2 fp4 items are packed in the input dimension
+                intermediate_size_per_partition // 2,
+                dtype=weight_dtype,
+            ),
+            input_dim=1,
+            output_dim=2,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w2_weight", w2_weight)
+
+        w13_weight_scale = ModelWeightParameter(
+            data=torch.empty(
+                layer.num_local_experts,
+                num_shards * intermediate_size_per_partition,
+                hidden_size // self.quant_config.group_size,
+                dtype=weight_scale_dtype,
+            ),
+            input_dim=1,
+            output_dim=2,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w13_weight_scale", w13_weight_scale)
+
+        # Only use `swizzle_blockscale` for shapes, not for real content
+        layer.w13_blockscale_swizzled = Parameter(
+            swizzle_blockscale(layer.w13_weight_scale), requires_grad=False
+        )
+
+        w2_weight_scale = ModelWeightParameter(
+            data=torch.empty(
+                layer.num_local_experts,
+                hidden_size,
+                intermediate_size_per_partition // self.quant_config.group_size,
+                dtype=weight_scale_dtype,
+            ),
+            input_dim=1,
+            output_dim=2,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w2_weight_scale", w2_weight_scale)
+
+        layer.w2_blockscale_swizzled = Parameter(
+            swizzle_blockscale(layer.w2_weight_scale), requires_grad=False
+        )
+
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.BLOCK.value}
+        )
+
+        w13_weight_scale_shape = (
+            (layer.num_local_experts, 2)
+            if layer.moe_runner_config.is_gated
+            else (layer.num_local_experts,)
+        )
+        w13_weight_scale_2 = PerTensorScaleParameter(
+            data=torch.empty(w13_weight_scale_shape, dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w13_weight_scale_2", w13_weight_scale_2)
+
+        w2_weight_scale_2 = PerTensorScaleParameter(
+            data=torch.empty(layer.num_local_experts, dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w2_weight_scale_2", w2_weight_scale_2)
+
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
+        )
+
+        w13_input_scale_shape = (layer.num_experts, num_shards)
+        w13_input_scale = PerTensorScaleParameter(
+            data=torch.empty(w13_input_scale_shape, dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+        w13_input_scale._sglang_require_global_experts = True
+        layer.register_parameter("w13_input_scale", w13_input_scale)
+
+        w2_input_scale = PerTensorScaleParameter(
+            data=torch.empty(layer.num_experts, dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+        w2_input_scale._sglang_require_global_experts = True
+        layer.register_parameter("w2_input_scale", w2_input_scale)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        """Process FP4 MoE weights after loading from serialized checkpoint.
+
+        Only supports pre-quantized checkpoints with FP8 weights and scales.
+        """
+
+        # GEMM 1 scale processing
+        if layer.moe_runner_config.is_gated:
+            if layer.w13_weight_scale_2.dim() == 1:
+                # Some checkpoints store a shared scale for w1/w3.
+                w13_weight_scale_2 = layer.w13_weight_scale_2
+            else:
+                if layer.w13_weight_scale_2.shape[1] >= 2 and not torch.allclose(
+                    layer.w13_weight_scale_2[:, 0],
+                    layer.w13_weight_scale_2[:, 1],
+                ):
+                    logger.warning_once(
+                        "w1_weight_scale_2 must match w3_weight_scale_2. "
+                        "Accuracy may be affected."
+                    )
+
+                w13_weight_scale_2 = layer.w13_weight_scale_2[:, 0]
+        else:
+            w13_weight_scale_2 = layer.w13_weight_scale_2[:]
+
+        # Calculate input scales based on strategy
+        if self.enable_flashinfer_cutlass_moe or self.enable_flashinfer_trtllm_moe:
+            w13_input_scale = layer.w13_input_scale.max().to(torch.float32)
+            w2_input_scale = layer.w2_input_scale.max().to(torch.float32)
+        elif self.enable_flashinfer_cutedsl_moe:
+            # All-expert-one-input-scale is mathematically different from default per-expert-input-scale
+            # Thus we allow users to switch the flag to do thorough testing
+            if CUTEDSL_MOE_SCALAR_INPUT_SCALE:
+                w13_input_scale = (
+                    layer.w13_input_scale.max()
+                    .to(torch.float32)
+                    .repeat(layer.w13_input_scale.shape[0])
+                )
+            else:
+                w13_input_scale = layer.w13_input_scale.max(dim=1).values.to(
+                    torch.float32
+                )
+
+            w2_input_scale = layer.w2_input_scale
+
+            def _slice_scale(w):
+                assert w.shape == (layer.num_experts,)
+                assert layer.moe_ep_size * layer.num_local_experts == layer.num_experts
+                return w[
+                    layer.moe_ep_rank
+                    * layer.num_local_experts : (layer.moe_ep_rank + 1)
+                    * layer.num_local_experts
+                ]
+
+            w13_input_scale = _slice_scale(w13_input_scale)
+            w2_input_scale = _slice_scale(w2_input_scale)
+
+            if MOE_NVFP4_DISPATCH:
+                assert torch.all(w13_input_scale == w13_input_scale[0])
+                w13_input_scale = w13_input_scale[0]
+        else:
+            w13_input_scale = layer.w13_input_scale.max(dim=-1).values.to(torch.float32)
+            w2_input_scale = layer.w2_input_scale
+
+        # Create shared parameters
+        copy_or_rebind_param(
+            layer,
+            "g1_alphas",
+            (w13_input_scale * w13_weight_scale_2).to(torch.float32),
+        )
+        copy_or_rebind_param(
+            layer,
+            "g2_alphas",
+            (w2_input_scale * layer.w2_weight_scale_2).to(torch.float32),
+        )
+        copy_or_rebind_param(
+            layer,
+            "w13_input_scale_quant",
+            (1 / w13_input_scale).to(torch.float32),
+        )
+        copy_or_rebind_param(
+            layer,
+            "w2_input_scale_quant",
+            (1 / w2_input_scale).to(torch.float32),
+        )
+
+        # TODO: for flashinfer always do MOE_NVFP4_DISPATCH
+        layer.dispatcher.set_quant_config(
+            {
+                "input_global_scale": (
+                    layer.w13_input_scale_quant
+                    if MOE_NVFP4_DISPATCH
+                    or should_use_flashinfer_cutlass_moe_fp4_allgather()
+                    else None
+                )
+            }
+        )
+        block_size = 16
+        # Validate weight scales
+        assert_dim = 2 if layer.moe_runner_config.is_gated else 1
+        for name, weight_scale in [
+            ("w13", layer.w13_weight_scale),
+            ("w2", layer.w2_weight_scale),
+        ]:
+            # For NVFP4 TRTLLM we require one scale per 16 inputs (last dim == expected_blocks[name]).
+            if get_moe_runner_backend().is_flashinfer_trtllm():
+                expected_blocks = {
+                    "w13": layer.w13_weight.shape[2] * 2 // block_size,
+                    "w2": layer.w2_weight.shape[2] * 2 // block_size,
+                }
+                assert (
+                    weight_scale.shape[-1] == expected_blocks[name]
+                ), f"Expected {name}_weight_scale.dim(2) == {expected_blocks[name]}, got {weight_scale.shape[-1]}"
+            else:
+                if weight_scale.shape[assert_dim] % 4 != 0:
+                    logger.warning(
+                        "NVFP4 %s_weight_scale K' not multiple of 4: shape=%s, group_size=%s",
+                        name,
+                        tuple(weight_scale.shape),
+                        getattr(self.quant_config, "group_size", None),
+                    )
+            assert (
+                weight_scale.dtype == torch.float8_e4m3fn
+            ), f"{name} Weight Blockscale must be represented as FP8-E4M3"
+
+        # Weight processing based on strategy
+        if (
+            self.enable_flashinfer_trtllm_moe
+            and reorder_rows_for_gated_act_gemm is not None
+            and shuffle_matrix_sf_a is not None
+        ):
+            from sglang.srt.layers.moe.moe_runner.flashinfer_trtllm import (
+                align_fp4_moe_weights_for_flashinfer_trtllm,
+            )
+
+            # FlashInfer TRTLLM processing - handles both w13 and w2
+            align_fp4_moe_weights_for_flashinfer_trtllm(layer)
+
+        else:
+            # CUTLASS processing - handle w13 and w2 separately
+
+            # Process w13 weights
+            w13_blockscale_swizzled = swizzle_blockscale(layer.w13_weight_scale)
+            copy_or_rebind_param(
+                layer, "w13_blockscale_swizzled", w13_blockscale_swizzled
+            )
+
+            w13_weight = layer.w13_weight
+            intermediate_size_pad = w13_blockscale_swizzled.size(1) - w13_weight.size(1)
+            if intermediate_size_pad:
+                # padding gated activations will require to split w1 and w3
+                # and pad them individually
+                assert not layer.moe_runner_config.is_gated, (
+                    "The intermediate size required padding, "
+                    "but padding is also implemented for gated activations"
+                )
+
+                copy_or_rebind_param(
+                    layer,
+                    "w13_weight",
+                    torch.nn.functional.pad(
+                        w13_weight, (0, 0, 0, intermediate_size_pad)
+                    ),
+                )
+                copy_or_rebind_param(
+                    layer,
+                    "w2_weight",
+                    torch.nn.functional.pad(
+                        layer.w2_weight, (0, intermediate_size_pad // 2, 0, 0)
+                    ),
+                )
+                copy_or_rebind_param(
+                    layer,
+                    "w2_weight_scale",
+                    torch.nn.functional.pad(
+                        layer.w2_weight_scale, (0, intermediate_size_pad // 16)
+                    ),
+                )
+
+            # Process w2 weights
+            w2_blockscale_swizzled = swizzle_blockscale(layer.w2_weight_scale)
+            copy_or_rebind_param(
+                layer, "w2_blockscale_swizzled", w2_blockscale_swizzled
+            )
+
+            # Both flashinfer cutlass and regular cutlass use same processing for w2
+
+            # Set up CUTLASS MoE parameters (reuse to keep CUDA graph stable)
+            device = layer.w13_weight.device
+            inter_size = layer.w2_weight.shape[2] * 2
+            hidden_size = layer.w13_weight.shape[2] * 2
+            existing_params = getattr(layer, "cutlass_moe_params", None)
+            if (
+                existing_params is None
+                or existing_params.cutlass_moe_type != CutlassMoEType.BlockscaledFP4
+                or existing_params.num_experts != layer.num_experts
+                or existing_params.intermediate_size_per_partition != inter_size
+                or existing_params.hidden_size != hidden_size
+                or existing_params.device != device
+            ):
+                layer.cutlass_moe_params = CutlassMoEParams(
+                    CutlassMoEType.BlockscaledFP4,
+                    device,
+                    num_experts=layer.num_experts,  # global num experts
+                    intermediate_size_per_partition=inter_size,  # n
+                    hidden_size=hidden_size,
+                )  # k
+
+    @property
+    def load_up_proj_weight_first(self) -> bool:
+        # FlashInfer CUTLASS kernel assumes [Up, Gate] Proj as W13
+        return self.enable_flashinfer_cutlass_moe and self.moe_runner_config.is_gated
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+        if get_moe_runner_backend().is_flashinfer_trtllm():
+            self.runner = MoeRunner(
+                MoeRunnerBackend.FLASHINFER_TRTLLM, moe_runner_config
+            )
+
+    def apply(
+        self,
+        layer: FusedMoE,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+
+        x = dispatch_output.hidden_states
+        x_sf = dispatch_output.hidden_states_scale
+        topk_output = dispatch_output.topk_output
+
+        activation = self.moe_runner_config.activation
+
+        assert (
+            activation in ACT_STR_TO_TYPE_MAP
+        ), f"{activation=} missing from {ACT_STR_TO_TYPE_MAP.keys()=}"
+        moe_runner_config = self.moe_runner_config
+
+        # FlashInfer TRTLLM FP4 path - layer has shuffled weights only when
+        # backend is flashinfer_trtllm
+        if hasattr(layer, "gemm1_weights_fp4_shuffled"):
+            from sglang.srt.layers.moe.moe_runner.flashinfer_trtllm import (
+                FlashInferTrtllmFp4MoeQuantInfo,
+            )
+            from sglang.srt.layers.moe.utils import RoutingMethodType
+
+            # Determine routing method type based on layer configuration
+            routing_method_type = getattr(
+                layer, "routing_method_type", RoutingMethodType.Default
+            )
+
+            quant_info = FlashInferTrtllmFp4MoeQuantInfo(
+                gemm1_weights_fp4_shuffled=layer.gemm1_weights_fp4_shuffled.data,
+                gemm2_weights_fp4_shuffled=layer.gemm2_weights_fp4_shuffled.data,
+                gemm1_scales_fp4_shuffled=layer.gemm1_scales_fp4_shuffled.data,
+                gemm2_scales_fp4_shuffled=layer.gemm2_scales_fp4_shuffled.data,
+                g1_scale_c=layer.g1_scale_c.data,
+                g1_alphas=layer.g1_alphas.data,
+                g2_alphas=layer.g2_alphas.data,
+                w13_input_scale_quant=layer.w13_input_scale_quant,
+                global_num_experts=layer.num_experts,
+                local_expert_offset=layer.moe_ep_rank * layer.num_local_experts,
+                local_num_experts=layer.num_local_experts,
+                intermediate_size_per_partition=layer.intermediate_size_per_partition,
+                routing_method_type=routing_method_type,
+            )
+
+            return self.runner.run(dispatch_output, quant_info)
+
+        if self.enable_flashinfer_cutlass_moe:
+            from sglang.srt.layers.moe.token_dispatcher import DispatchOutputChecker
+
+            assert (
+                not moe_runner_config.apply_router_weight_on_input
+            ), "apply_router_weight_on_input is not supported for Flashinfer"
+            # TRTLLM Cutlass moe takes in activations in BF16/Half/nvfp4 precision
+            # and fp4 quantized weights loaded from the checkpoint
+            topk_weights, topk_ids = topk_output.topk_weights, topk_output.topk_ids
+
+            output_dtype = torch.bfloat16
+
+            if DispatchOutputChecker.format_is_flashinfer(dispatch_output):
+                symm_output = dispatch_output.moe_output
+            else:
+                # If x_sf is not None, x is FP4 packed (half size), so we need * 2
+                # If x_sf is None, x is not packed, so output_col = x.shape[1]
+                output_col = x.shape[1]
+                if x_sf is not None and layer.moe_runner_config.is_gated:
+                    output_col *= 2
+                with use_symmetric_memory(
+                    get_tp_group(), disabled=not is_allocation_symmetric()
+                ):
+                    symm_output = torch.empty(
+                        x.shape[0],
+                        output_col,
+                        dtype=output_dtype,
+                        device=x.device,
+                    )
+
+            output = flashinfer_cutlass_fused_moe(
+                output=symm_output,
+                input=x,
+                token_selected_experts=topk_ids.to(torch.int),
+                token_final_scales=topk_weights,
+                fc1_expert_weights=layer.w13_weight.view(torch.long),
+                fc2_expert_weights=layer.w2_weight.view(torch.long),
+                output_dtype=output_dtype,
+                input_sf=x_sf,
+                # swizzled_input_sf=not get_moe_a2a_backend().is_flashinfer(),
+                quant_scales=[
+                    layer.w13_input_scale_quant,
+                    layer.w13_blockscale_swizzled.view(torch.int32),
+                    layer.g1_alphas,
+                    layer.w2_input_scale_quant,
+                    layer.w2_blockscale_swizzled.view(torch.int32),
+                    layer.g2_alphas,
+                ],
+                ep_size=layer.moe_ep_size,
+                ep_rank=layer.moe_ep_rank,
+                tp_size=layer.moe_tp_size,
+                tp_rank=layer.moe_tp_rank,
+                tune_max_num_tokens=next_power_of_2(x.shape[0]),
+                activation_type=ACT_STR_TO_TYPE_MAP[activation],
+                enable_alltoall=get_moe_a2a_backend().is_flashinfer(),
+            )[0]
+
+            from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+            return StandardCombineInput(hidden_states=output)
+
+        from sglang.srt.layers.moe.cutlass_moe import cutlass_moe_fp4
+
+        topk_weights, topk_ids = topk_output.topk_weights, topk_output.topk_ids
+        output = cutlass_moe_fp4(
+            a=x,
+            a1_gscale=layer.w13_input_scale_quant,
+            w1_fp4=layer.w13_weight,
+            w1_blockscale=layer.w13_blockscale_swizzled,
+            w1_alphas=layer.g1_alphas,
+            a2_gscale=layer.w2_input_scale_quant,
+            w2_fp4=layer.w2_weight,
+            w2_blockscale=layer.w2_blockscale_swizzled,
+            w2_alphas=layer.g2_alphas,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            params=layer.cutlass_moe_params,
+            apply_router_weight_on_input=moe_runner_config.apply_router_weight_on_input,
+        ).to(x.dtype)
+        # Scale by routed_scaling_factor is fused into select_experts.
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        return StandardCombineInput(hidden_states=output)
+
+    def apply_without_routing_weights(
+        self,
+        layer: FusedMoE,
+        x: tuple[torch.Tensor, Optional[torch.Tensor]],
+        masked_m: torch.Tensor,
+        moe_runner_config: MoeRunnerConfig,
+    ) -> torch.Tensor:
+        assert (
+            moe_runner_config.activation == "silu"
+        ), "Only SiLU activation is supported."
+
+        assert self.enable_flashinfer_cutedsl_moe, "only support flashinfer cutedsl moe"
+        assert (
+            not moe_runner_config.apply_router_weight_on_input
+        ), "apply_router_weight_on_input is not supported for Flashinfer"
+
+        from sglang.srt.layers.moe.flashinfer_cutedsl_moe import (
+            flashinfer_cutedsl_moe_masked,
+        )
+
+        down_gemm_overlap_args: Optional[DownGemmOverlapArgs] = getattr(
+            layer, "down_gemm_overlap_args", None
+        )
+
+        out = flashinfer_cutedsl_moe_masked(
+            hidden_states=x,
+            input_global_scale=(
+                None if MOE_NVFP4_DISPATCH else layer.w13_input_scale_quant
+            ),
+            w1=layer.w13_weight,
+            w1_blockscale=layer.w13_blockscale_swizzled,
+            w1_alpha=layer.g1_alphas,
+            w2=layer.w2_weight,
+            a2_global_scale=layer.w2_input_scale_quant,
+            w2_blockscale=layer.w2_blockscale_swizzled,
+            w2_alpha=layer.g2_alphas,
+            masked_m=masked_m,
+            **(
+                dict(
+                    down_sm_count=down_gemm_overlap_args.num_sms,
+                    down_signals=down_gemm_overlap_args.signal,
+                    down_start_event=down_gemm_overlap_args.start_event,
+                )
+                if down_gemm_overlap_args is not None
+                else {}
+            ),
+        )
+        return out
diff --git a/sglang/python/sglang/srt/layers/quantization/modelslim/README.md b/sglang/python/sglang/srt/layers/quantization/modelslim/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..d2a43d696741419bd67eb0a24ca4fc7263d71846
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/modelslim/README.md
@@ -0,0 +1,14 @@
+Quantization [ModelSlim](https://gitcode.com/Ascend/msit) module.
+
+`--quantization modelslim` flag introduced. To load already quantized models, simply load the model weights. For models quantized with ModelSlim, there's no need to add `--quantization modelslim` argument when starting the engine. The quantization method will be automatically parsed from the downloaded `quant_model_description.json` config.
+
+ModelSlim was developed in the format of compressed_tensors and includes support for various quantization schemes, such as:
+- [x] W4A4 dynamic linear
+- [x] W8A8 static linear
+- [x] W8A8 dynamic linear
+- [x] W4A8 dynamic MOE
+- [x] W8A8 dynamic MOE
+
+Also ModelSlim module include:
+- [x] Automated config detection for modelslim format (without the need to specify --quantization modelslim flag)
+- [x] Unit-tests for w4a4 modelslim, w8a8 modelslim
diff --git a/sglang/python/sglang/srt/layers/quantization/modelslim/__pycache__/modelslim.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/modelslim/__pycache__/modelslim.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9c15b45890a49aa7be569b43f98a82bafb44299c
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/modelslim/__pycache__/modelslim.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/modelslim/modelslim.py b/sglang/python/sglang/srt/layers/quantization/modelslim/modelslim.py
new file mode 100644
index 0000000000000000000000000000000000000000..1212a40d48d3dcc74fd6107519ef54bd1ca24d82
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/modelslim/modelslim.py
@@ -0,0 +1,393 @@
+from __future__ import annotations
+
+import logging
+from types import MappingProxyType
+from typing import TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple, Union, cast
+
+import torch
+
+from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import (
+    _NPULinearMethodBase,
+)
+from sglang.srt.layers.quantization.base_config import (
+    FusedMoEMethodBase,
+    QuantizationConfig,
+)
+from sglang.srt.layers.quantization.compressed_tensors.utils import should_ignore_layer
+from sglang.srt.layers.quantization.modelslim.schemes import (
+    ModelSlimW4A4Int4,
+    ModelSlimW4A8Int8MoE,
+    ModelSlimW8A8Int8,
+    ModelSlimW8A8Int8MoE,
+)
+from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
+from sglang.srt.utils import apply_module_patch
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe import MoeRunnerConfig
+    from sglang.srt.layers.moe.token_dispatcher import (
+        CombineInput,
+        StandardDispatchOutput,
+    )
+    from sglang.srt.layers.quantization.base_config import QuantizeMethodBase
+    from sglang.srt.layers.quantization.modelslim.schemes import (
+        ModelSlimLinearScheme,
+        ModelSlimMoEScheme,
+    )
+
+logger = logging.getLogger(__name__)
+
+
+# func refers to RMSNorm.__init__
+def npu_wrapper_rmsnorm_init(func):
+    def init(self, hidden_size: int, **extra_args) -> None:
+        func(self, hidden_size, **extra_args)
+        self.ignore_anti = True
+        # The Ascend w8a8_int8 quantization requires adding a bias in rmsnorm
+        self.bias = torch.nn.Parameter(torch.zeros(hidden_size), requires_grad=False)
+
+    return init
+
+
+# func refers to RMSNorm.forward_oot
+def npu_wrapper_rmsnorm_forward(func):
+    def _rmsnorm_forward_oot(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+        post_residual_addition: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        from sgl_kernel_npu.norm.add_rmsnorm_bias import add_rmsnorm_bias
+
+        if not x.is_contiguous():
+            x = x.contiguous()
+        if residual is not None:
+            if post_residual_addition is not None:
+                residual = residual + post_residual_addition
+            out, residual_out = add_rmsnorm_bias(
+                x,
+                residual,
+                self.weight.data,
+                self.bias,
+                self.variance_epsilon,
+            )
+            return out.to(x.dtype), residual_out
+
+        out = torch.ops.npu.npu_rms_norm(x, self.weight.data, self.variance_epsilon)[0]
+        out = out + self.bias
+        return out.to(x.dtype)
+
+    return _rmsnorm_forward_oot
+
+
+class ModelSlimConfig(QuantizationConfig):
+    """
+    Config class for ModelSlim Quantization, a NPU-specific quantization type.
+    """
+
+    def __init__(self, quant_config: Dict[str, Any] = {}):
+        super().__init__()
+        self.quant_description = quant_config
+        ignore = cast(List[str], quant_config.get("ignore", []))
+        self.ignore = ignore if ignore is not None else []
+        packed_modules_mapping = quant_config.get("packed_modules_mapping", {})
+        self.packed_modules_mapping = (
+            packed_modules_mapping if packed_modules_mapping is not None else {}
+        )
+
+        for name in self.quant_description.keys():
+            if "norm.bias" in name:
+                apply_module_patch(
+                    "sglang.srt.layers.layernorm.RMSNorm",
+                    "__init__",
+                    [npu_wrapper_rmsnorm_init],
+                )
+                apply_module_patch(
+                    "sglang.srt.layers.layernorm.RMSNorm",
+                    "forward_npu",
+                    [npu_wrapper_rmsnorm_forward],
+                )
+
+    def get_linear_method(self) -> ModelSlimLinearMethod:
+        return ModelSlimLinearMethod(self)
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.int8, torch.float16, torch.bfloat16]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 0
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "modelslim"
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        filenames = ["quant_model_description.json"]
+        return filenames
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> ModelSlimConfig:
+        return cls(config)
+
+    def get_quant_method(
+        self,
+        layer: torch.nn.Module,
+        prefix: str,
+    ) -> Optional[QuantizeMethodBase]:
+        from sglang.srt.layers.linear import LinearBase
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+
+        if isinstance(layer, LinearBase):
+            if should_ignore_layer(
+                prefix,
+                ignore=self.ignore,
+                fused_mapping=self.packed_modules_mapping,
+            ):
+                return UnquantizedLinearMethod()
+            key = "model"
+            if "vision_model" in prefix:
+                key = "vision_model"
+            elif "visual" in prefix:
+                key = "visual"
+            packed_modules_mapping_subset = self.packed_modules_mapping.get(key, {})
+            prefix_in_quant_config = prefix
+            proj_name = prefix.split(".")[-1]
+            if proj_name in packed_modules_mapping_subset:
+                prefix_in_quant_config = prefix.replace(
+                    proj_name, packed_modules_mapping_subset[proj_name][0]
+                )
+
+            if self.is_layer_skipped(prefix, packed_modules_mapping_subset):
+                return UnquantizedLinearMethod()
+            scheme = self.get_linear_scheme(
+                layer=layer, layer_name=prefix_in_quant_config
+            )
+            layer.scheme = scheme
+            return ModelSlimLinearMethod(self)
+        elif isinstance(layer, FusedMoE):
+            layer.scheme = self.get_moe_scheme(layer, prefix)
+            return ModelSlimFusedMoEMethod(self)
+        return None
+
+    def _get_scheme_from_parts(
+        self,
+        layer_name: str,
+    ) -> ModelSlimLinearScheme:
+
+        quant_type = self.quant_description.get(layer_name + ".weight", "")
+        if quant_type == "W8A8_DYNAMIC" or quant_type == "W8A8":
+            return ModelSlimW8A8Int8(
+                quant_config=self.quant_description, prefix=layer_name
+            )
+        elif quant_type == "W4A4_DYNAMIC":
+            return ModelSlimW4A4Int4(
+                quant_config=self.quant_description, prefix=layer_name
+            )
+        raise NotImplementedError("No modelslim compatible scheme was found.")
+
+    def get_linear_scheme(
+        self, layer: torch.nn.Module, layer_name: Optional[str] = None
+    ) -> Optional[ModelSlimLinearScheme]:
+        """
+        get_scheme method adjusted for modelslim, taken from
+        python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py
+        """
+        scheme = self._get_scheme_from_parts(
+            layer_name=layer_name,
+        )
+
+        # Ascend doesn't support device capability
+        logger.debug("Using scheme: %s for %s", scheme.__class__.__name__, layer_name)
+        return scheme
+
+    def get_moe_scheme(
+        self,
+        layer: torch.nn.Module,
+        prefix: str,
+    ) -> Optional[ModelSlimMoEScheme]:
+        # TODO: @dsikka: refactor this to use schemes as other kernels
+        # are supported + check if the layer is being ignored.
+
+        prefix_in_quant_config = prefix + ".0.down_proj.weight"
+        is_moe_w4a8_dynamic = (
+            self.quant_description.get(prefix_in_quant_config, "STATIC")
+            == "W4A8_DYNAMIC"
+        )
+        is_moe_w8a8_dynamic = (
+            self.quant_description.get(prefix_in_quant_config, "STATIC")
+            == "W8A8_DYNAMIC"
+        )
+        if is_moe_w4a8_dynamic:
+            logger.info_once("Using ModelSlimW4A8Int8MoE")
+            return ModelSlimW4A8Int8MoE(self)
+        elif is_moe_w8a8_dynamic:
+            logger.info_once("Using ModelSlimW8A8Int8MoE")
+            return ModelSlimW8A8Int8MoE(self)
+        else:
+            logger.warning(
+                f"Unsupported FusedMoe modelslim scheme: "
+                f"{self.quant_description.get(prefix_in_quant_config.strip())} "
+                f"in layer: {prefix}"
+            )
+            return None
+
+    def is_layer_skipped(
+        self, prefix: str, fused_mapping: Mapping[str, List[str]] = MappingProxyType({})
+    ):
+        # adapted from vllm.model_executor.layers.quantization.utils.quant_utils.is_layer_skipped
+        proj_name = prefix.split(".")[-1]
+        if proj_name in fused_mapping:
+            shard_prefixes = [
+                prefix.replace(proj_name, shard_proj_name)
+                for shard_proj_name in fused_mapping[proj_name]
+            ]
+
+            is_skipped = None
+            for shard_prefix in shard_prefixes:
+                is_shard_skipped = (
+                    self.quant_description.get(shard_prefix + ".weight", "") == "FLOAT"
+                )
+
+                if is_skipped is None:
+                    is_skipped = is_shard_skipped
+                elif is_shard_skipped != is_skipped:
+                    raise ValueError(
+                        f"Detected some but not all shards of {prefix} "
+                        "are quantized. All shards of fused layers "
+                        "to have the same precision."
+                    )
+        else:
+            is_skipped = self.quant_description.get(prefix + ".weight", "") == "FLOAT"
+
+        assert is_skipped is not None
+        return is_skipped
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+class ModelSlimLinearMethod(_NPULinearMethodBase):
+
+    def __init__(self, quantization_config: ModelSlimConfig):
+        self.quantization_config = quantization_config
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        layer.scheme.process_weights_after_loading(layer)
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        """
+        Use the ModelSlimLinearScheme associated with the layer to create
+        the necessary parameters for the layer. See LinearMethodBase for param
+        details
+        """
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        layer.scheme.create_weights(
+            layer=layer,
+            input_size=input_size,
+            input_size_per_partition=input_size_per_partition,
+            output_partition_sizes=output_partition_sizes,
+            output_size=output_size,
+            params_dtype=params_dtype,
+            weight_loader=weight_loader,
+        )
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ):
+        """
+        Use the output of create_weights and the ModelSlimLinearScheme
+        associated with the layer to apply the forward pass with the
+        layer input.  See LinearMethodBase for param details
+
+        """
+
+        scheme = layer.scheme
+        if scheme is None:
+            raise ValueError("A scheme must be defined for each layer")
+        return scheme.apply_weights(layer, x, bias=bias)
+
+
+class ModelSlimFusedMoEMethod(FusedMoEMethodBase):
+
+    def __init__(self, quantization_config: ModelSlimConfig):
+        self.quantization_config = quantization_config
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        layer.scheme.process_weights_after_loading(layer)
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        """
+        Use the ModelSlimMoEScheme associated with the layer to create
+        the necessary parameters for the layer. See FusedMoEMethodBase for param
+        details
+        """
+        layer.scheme.create_weights(
+            layer=layer,
+            num_experts=num_experts,
+            hidden_size=hidden_size,
+            intermediate_size_per_partition=intermediate_size_per_partition,
+            params_dtype=params_dtype,
+            **extra_weight_attrs,
+        )
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        return layer.scheme.create_moe_runner(layer, moe_runner_config)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+        """
+        Use the output of create_weights and the ModelSlimMoEScheme
+        associated with the layer to apply the forward pass with the
+        layer input.  See FusedMoEMethodBase for param details
+
+        """
+        scheme = layer.scheme
+        if scheme is None:
+            raise ValueError("A scheme must be defined for each layer")
+        return scheme.apply_weights(layer, dispatch_output)
+
+    def apply_without_routing_weights(
+        self,
+        layer,
+        hidden_states,
+        hidden_states_scale,
+        group_list_type,
+        group_list,
+        output_dtype,
+    ):
+        return layer.scheme.apply_without_routing_weights(
+            layer,
+            hidden_states,
+            hidden_states_scale,
+            group_list_type,
+            group_list,
+            output_dtype,
+        )
diff --git a/sglang/python/sglang/srt/layers/quantization/modelslim/schemes/__init__.py b/sglang/python/sglang/srt/layers/quantization/modelslim/schemes/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..14005eb3d61e5678405506de416e89b034cbae03
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/modelslim/schemes/__init__.py
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from .modelslim_scheme import ModelSlimLinearScheme, ModelSlimMoEScheme
+from .modelslim_w4a4_int4 import ModelSlimW4A4Int4
+from .modelslim_w4a8_int8_moe import ModelSlimW4A8Int8MoE
+from .modelslim_w8a8_int8 import ModelSlimW8A8Int8
+from .modelslim_w8a8_int8_moe import ModelSlimW8A8Int8MoE
+
+__all__ = [
+    "ModelSlimLinearScheme",
+    "ModelSlimMoEScheme",
+    "ModelSlimW8A8Int8",
+    "ModelSlimW4A4Int4",
+    "ModelSlimW4A8Int8MoE",
+    "ModelSlimW8A8Int8MoE",
+]
diff --git a/sglang/python/sglang/srt/layers/quantization/modelslim/schemes/__pycache__/modelslim_scheme.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/modelslim/schemes/__pycache__/modelslim_scheme.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3642b2a31958b60a8a62e8481d32fe92b0da151d
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/modelslim/schemes/__pycache__/modelslim_scheme.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/modelslim/schemes/__pycache__/modelslim_w4a4_int4.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/modelslim/schemes/__pycache__/modelslim_w4a4_int4.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..daabbd63a0f8155dbecf8f71d1b8ea8113f530a1
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/modelslim/schemes/__pycache__/modelslim_w4a4_int4.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/modelslim/schemes/__pycache__/modelslim_w4a8_int8_moe.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/modelslim/schemes/__pycache__/modelslim_w4a8_int8_moe.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e8019d282bfcc09d29df2e6a926b38e1e0073892
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/modelslim/schemes/__pycache__/modelslim_w4a8_int8_moe.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/modelslim/schemes/__pycache__/modelslim_w8a8_int8_moe.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/modelslim/schemes/__pycache__/modelslim_w8a8_int8_moe.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5f0a25e58a7e974fce4704d6ae024d49824ff512
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/modelslim/schemes/__pycache__/modelslim_w8a8_int8_moe.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/modelslim/schemes/modelslim_scheme.py b/sglang/python/sglang/srt/layers/quantization/modelslim/schemes/modelslim_scheme.py
new file mode 100644
index 0000000000000000000000000000000000000000..26f958e7b633685a54331aa4bb5faeca055e709b
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/modelslim/schemes/modelslim_scheme.py
@@ -0,0 +1,100 @@
+# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization/compressed_tensors
+# SPDX-License-Identifier: Apache-2.0
+
+from abc import abstractmethod
+from typing import TYPE_CHECKING, Optional
+
+import torch
+
+from sglang.srt.layers.moe import MoeRunnerConfig
+from sglang.srt.layers.quantization.base_scheme import BaseLinearScheme, BaseMoEScheme
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.token_dispatcher import StandardDispatchOutput
+
+__all__ = ["ModelSlimLinearScheme", "ModelSlimMoEScheme"]
+
+
+class ModelSlimLinearScheme(BaseLinearScheme):
+    """
+    Abstract class used to describe the weight creation and forward pass
+    of different quantization schemes supported by ModelSlim.
+    """
+
+    @abstractmethod
+    def create_weights(self, *args, **kwargs):
+        """
+        Weight creation for the particular scheme. Inputs to this function
+
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def process_weights_after_loading(self, layer: torch.nn.Module):
+        """
+        Called after weight loading is complete for any cleanup that
+        needs to occur.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def apply_weights(
+        self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor]
+    ):
+        """
+        Run the forward pass for the particular scheme. This is where
+        scheme-specific dequant/quant steps/kernels should be applied.
+
+        :param layer: torch.nn.Module with the registered weights and
+            other parameters relevant to the particular scheme.
+        :param x: input to the layer
+        :param bias: bias parameter
+
+        """
+        raise NotImplementedError
+
+
+class ModelSlimMoEScheme(BaseMoEScheme):
+    """
+    Abstract class used to describe the weight creation and forward pass
+    of different quantization schemes supported by ModelSlim.
+    """
+
+    @abstractmethod
+    def create_weights(self, *args, **kwargs):
+        """
+        Weight creation for the particular scheme. Inputs to this function
+
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def process_weights_after_loading(self, layer: torch.nn.Module):
+        """
+        Called after weight loading is complete for any cleanup that
+        needs to occur.
+        """
+        raise NotImplementedError
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig"
+    ):
+        raise NotImplementedError
+
+    @abstractmethod
+    def apply_weights(
+        self,
+        layer,
+        dispatch_output: "StandardDispatchOutput",
+    ):
+        """
+        Run the forward pass for the particular scheme. This is where
+        scheme-specific dequant/quant steps/kernels should be applied.
+
+        :param layer: torch.nn.Module with the registered weights and
+            other parameters relevant to the particular scheme.
+        :param x: input to the layer
+        :param bias: bias parameter
+
+        """
+        raise NotImplementedError
diff --git a/sglang/python/sglang/srt/layers/quantization/modelslim/schemes/modelslim_w4a4_int4.py b/sglang/python/sglang/srt/layers/quantization/modelslim/schemes/modelslim_w4a4_int4.py
new file mode 100644
index 0000000000000000000000000000000000000000..1152aa80433f26f9b40db8debe4ec935f336c435
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/modelslim/schemes/modelslim_w4a4_int4.py
@@ -0,0 +1,99 @@
+# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization/compressed_tensors
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Dict, List, Optional
+
+import torch
+
+from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import (
+    NPU_W4A4DynamicLinearMethod,
+)
+from sglang.srt.layers.parameter import PerTensorScaleParameter
+from sglang.srt.layers.quantization.modelslim.schemes import ModelSlimLinearScheme
+from sglang.srt.utils import set_weight_attrs
+
+
+class ModelSlimW4A4Int4(ModelSlimLinearScheme):
+
+    def __init__(
+        self,
+        quant_config: Dict[str, any],
+        prefix: str,
+    ):
+        self.quant_config = quant_config
+        self.is_dynamic = self.quant_config[prefix + ".weight"] == "W4A4_DYNAMIC"
+        self.kernel = NPU_W4A4DynamicLinearMethod()
+
+    @staticmethod
+    def get_weight(
+        input_size: int, output_size: int, params_dtype: torch.dtype
+    ) -> Dict[str, Any]:
+        params_dict = {"weight": torch.empty(output_size, input_size, dtype=torch.int8)}
+        return params_dict
+
+    @staticmethod
+    def get_perchannel_param(
+        output_size: int,
+        params_dtype: torch.dtype,
+    ) -> Dict[str, Any]:
+        params_dict = {}
+        params_dict["weight_scale"] = torch.empty(output_size, 1, dtype=params_dtype)
+        params_dict["weight_offset"] = torch.empty(output_size, 1, dtype=params_dtype)
+        return params_dict
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ) -> None:
+        output_size_per_partition = sum(output_partition_sizes)
+        weight_loader = extra_weight_attrs.get("weight_loader")
+
+        weight_dict = {
+            "weight": torch.empty(
+                output_size_per_partition, input_size_per_partition, dtype=torch.int8
+            )
+        }
+        for weight_name, weight_param in weight_dict.items():
+            param = torch.nn.Parameter(weight_param, requires_grad=False)
+            set_weight_attrs(param, {"input_dim": 1, "output_dim": 0})
+            layer.register_parameter(weight_name, param)
+            set_weight_attrs(param, extra_weight_attrs)
+
+        pertensor_dict = {}
+        for pertensor_name, pertensor_param in pertensor_dict.items():
+            param = PerTensorScaleParameter(
+                data=pertensor_param, weight_loader=weight_loader
+            )
+            # disable warning
+            param.ignore_warning = True
+            layer.register_parameter(pertensor_name, param)
+
+        perchannel_dict = {}
+        perchannel_dict["weight_scale"] = torch.empty(
+            output_size_per_partition, 1, dtype=params_dtype
+        )
+        perchannel_dict["weight_offset"] = torch.empty(
+            output_size_per_partition, 1, dtype=params_dtype
+        )
+        for perchannel_name, perchannel_param in perchannel_dict.items():
+            param = torch.nn.Parameter(perchannel_param, requires_grad=False)
+            set_weight_attrs(param, {"output_dim": 0})
+            layer.register_parameter(perchannel_name, param)
+            set_weight_attrs(param, extra_weight_attrs)
+
+    def process_weights_after_loading(self, layer):
+        self.kernel.process_weights_after_loading(layer)
+
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        return self.kernel.apply(layer, x, bias)
diff --git a/sglang/python/sglang/srt/layers/quantization/modelslim/schemes/modelslim_w4a8_int8_moe.py b/sglang/python/sglang/srt/layers/quantization/modelslim/schemes/modelslim_w4a8_int8_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c3cd20f30e51833155c38b0c4148a0840f8a383
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/modelslim/schemes/modelslim_w4a8_int8_moe.py
@@ -0,0 +1,217 @@
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Any, Dict
+
+import torch
+
+from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import (
+    NPUW4A8Int8DynamicMoEMethod,
+)
+from sglang.srt.layers.quantization.modelslim.schemes import ModelSlimMoEScheme
+from sglang.srt.utils import set_weight_attrs
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe import MoeRunnerConfig
+    from sglang.srt.layers.moe.token_dispatcher import (
+        CombineInput,
+        StandardDispatchOutput,
+    )
+
+logger = logging.getLogger(__name__)
+
+__all__ = [
+    "ModelSlimW4A8Int8MoE",
+]
+
+
+class ModelSlimW4A8Int8MoE(ModelSlimMoEScheme):
+
+    def __init__(
+        self,
+        quant_config: Dict[str, Any],
+        prefix: str = None,
+    ):
+        self.quant_config = quant_config
+        self.group_size = 0
+        self.is_per_channel_weight = self.group_size == 0
+        self.tp_size = 1
+        self.activation_use_clip = False
+        self.kernel = NPUW4A8Int8DynamicMoEMethod()
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ) -> None:
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        self.is_per_channel_weight = self.group_size == 0
+        self.num_experts = num_experts
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
+        )
+
+        # >> weight
+        w13_output_size = intermediate_size_per_partition
+        w2_output_size = hidden_size // 2
+        w13_weight = torch.nn.Parameter(
+            torch.empty(num_experts, w13_output_size, hidden_size, dtype=torch.int8),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+        w2_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                w2_output_size,
+                intermediate_size_per_partition,
+                dtype=torch.int8,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight", w2_weight)
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+
+        # >> scale
+        w13_weight_scale = torch.nn.Parameter(
+            torch.empty(
+                num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight_scale", w13_weight_scale)
+        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
+
+        w2_weight_scale = torch.nn.Parameter(
+            torch.empty(num_experts, hidden_size, 1, dtype=torch.float32),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight_scale", w2_weight_scale)
+        set_weight_attrs(w2_weight_scale, extra_weight_attrs)
+
+        # >> offset
+        w13_weight_offset = torch.nn.Parameter(
+            torch.empty(
+                num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight_offset", w13_weight_offset)
+        set_weight_attrs(w13_weight_offset, extra_weight_attrs)
+
+        w2_weight_offset = torch.nn.Parameter(
+            torch.empty(num_experts, hidden_size, 1, dtype=torch.float32),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight_offset", w2_weight_offset)
+        set_weight_attrs(w2_weight_offset, extra_weight_attrs)
+
+        # >>> special param for w4a8
+        if not self.is_per_channel_weight:
+            w13_weight_scale_second = torch.nn.Parameter(
+                torch.empty(
+                    num_experts,
+                    2 * intermediate_size_per_partition,
+                    hidden_size // self.group_size,
+                    dtype=torch.float32,
+                ),
+                requires_grad=False,
+            )
+            layer.register_parameter("w13_weight_scale_second", w13_weight_scale_second)
+            set_weight_attrs(w13_weight_scale_second, extra_weight_attrs)
+            w13_weight_offset_second = torch.nn.Parameter(
+                torch.empty(
+                    num_experts,
+                    2 * intermediate_size_per_partition,
+                    hidden_size // self.group_size,
+                    dtype=torch.float32,
+                ),
+                requires_grad=False,
+            )
+            layer.register_parameter(
+                "w13_weight_offset_second", w13_weight_offset_second
+            )
+            set_weight_attrs(w13_weight_offset_second, extra_weight_attrs)
+
+            w2_weight_scale_second = torch.nn.Parameter(
+                torch.empty(
+                    num_experts,
+                    hidden_size,
+                    intermediate_size_per_partition // self.group_size,
+                    dtype=torch.float32,
+                ),
+                requires_grad=False,
+            )
+            layer.register_parameter("w2_weight_scale_second", w2_weight_scale_second)
+            set_weight_attrs(w2_weight_scale_second, extra_weight_attrs)
+
+            w2_weight_offset_second = torch.nn.Parameter(
+                torch.empty(
+                    num_experts,
+                    hidden_size,
+                    intermediate_size_per_partition // self.group_size,
+                    dtype=torch.float32,
+                ),
+                requires_grad=False,
+            )
+            layer.register_parameter("w2_weight_offset_second", w2_weight_offset_second)
+            set_weight_attrs(w2_weight_offset_second, extra_weight_attrs)
+
+        w13_scale_bias = torch.nn.Parameter(
+            torch.empty(
+                num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_scale_bias", w13_scale_bias)
+        set_weight_attrs(w13_scale_bias, extra_weight_attrs)
+
+        w2_scale_bias = torch.nn.Parameter(
+            torch.empty(
+                num_experts, hidden_size, 16 // self.tp_size, dtype=torch.float32
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_scale_bias", w2_scale_bias)
+        set_weight_attrs(w2_scale_bias, extra_weight_attrs)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        self.kernel.process_weights_after_loading(
+            layer, self.is_per_channel_weight, self.activation_use_clip
+        )
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig"
+    ):
+        self.moe_runner_config = moe_runner_config
+
+    def apply_weights(
+        self,
+        layer,
+        dispatch_output: "StandardDispatchOutput",
+    ) -> "CombineInput":
+        # FIXME W4A8 without EP can give 0 accuracy
+        return self.kernel.apply(layer, dispatch_output)
+
+    def apply_without_routing_weights(
+        self,
+        layer,
+        hidden_states,
+        hidden_states_scale,
+        group_list_type,
+        group_list,
+        output_dtype,
+    ):
+        return self.kernel.apply_without_routing_weights(
+            layer,
+            hidden_states,
+            hidden_states_scale,
+            group_list_type,
+            group_list,
+            output_dtype,
+        )
diff --git a/sglang/python/sglang/srt/layers/quantization/modelslim/schemes/modelslim_w8a8_int8.py b/sglang/python/sglang/srt/layers/quantization/modelslim/schemes/modelslim_w8a8_int8.py
new file mode 100644
index 0000000000000000000000000000000000000000..3770320ae29421df32f2273ed5c091ad0a3f4710
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/modelslim/schemes/modelslim_w8a8_int8.py
@@ -0,0 +1,117 @@
+# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization/compressed_tensors
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Dict, List, Optional
+
+import torch
+
+from sglang.srt.hardware_backend.npu.quantization.linear_method_npu import (
+    NPUW8A8Int8DynamicLinearMethod,
+    NPUW8A8Int8LinearMethod,
+)
+from sglang.srt.layers.parameter import (
+    ChannelQuantScaleParameter,
+    ModelWeightParameter,
+    PerTensorScaleParameter,
+)
+from sglang.srt.layers.quantization.modelslim.schemes import ModelSlimLinearScheme
+
+
+class ModelSlimW8A8Int8(ModelSlimLinearScheme):
+
+    def __init__(
+        self,
+        quant_config: Dict[str, any],
+        prefix: str,
+    ):
+        self.quant_config = quant_config
+        self.is_dynamic = (
+            self.quant_config.get(prefix + ".weight", "") == "W8A8_DYNAMIC"
+        )
+        if self.is_dynamic:
+            self.kernel = NPUW8A8Int8DynamicLinearMethod()
+        else:
+            self.kernel = NPUW8A8Int8LinearMethod()
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        output_size_per_partition = sum(output_partition_sizes)
+
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                (output_size_per_partition, input_size_per_partition), dtype=torch.int8
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight", weight)
+
+        weight_scale = ChannelQuantScaleParameter(
+            data=torch.empty((output_size_per_partition, 1), dtype=params_dtype),
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight_scale", weight_scale)
+
+        weight_offset = ChannelQuantScaleParameter(
+            data=torch.empty((output_size_per_partition, 1), dtype=params_dtype),
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight_offset", weight_offset)
+
+        if not self.is_dynamic:
+            input_scale = PerTensorScaleParameter(
+                data=torch.empty(1, dtype=params_dtype),
+                weight_loader=weight_loader,
+            )
+            input_scale.ignore_warning = True
+            layer.register_parameter("input_scale", input_scale)
+
+            input_offset = PerTensorScaleParameter(
+                data=torch.empty(1, dtype=params_dtype),
+                weight_loader=weight_loader,
+            )
+            input_offset.ignore_warning = True
+            layer.register_parameter("input_offset", input_offset)
+
+            quant_bias = ChannelQuantScaleParameter(
+                data=torch.empty(output_size_per_partition, dtype=torch.int32),
+                output_dim=0,
+                weight_loader=weight_loader,
+            )
+            layer.register_parameter("quant_bias", quant_bias)
+
+            if params_dtype == torch.bfloat16:
+                deq_scale_dtype = torch.float32
+            elif params_dtype == torch.float16:
+                deq_scale_dtype = torch.int64
+            else:
+                raise ValueError(f"Unsupported params_dtype: {params_dtype}")
+            deq_scale = ChannelQuantScaleParameter(
+                data=torch.empty(output_size_per_partition, dtype=deq_scale_dtype),
+                output_dim=0,
+                weight_loader=weight_loader,
+            )
+            layer.register_parameter("deq_scale", deq_scale)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module):
+        self.kernel.process_weights_after_loading(layer)
+
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        return self.kernel.apply(layer, x, bias)
diff --git a/sglang/python/sglang/srt/layers/quantization/modelslim/schemes/modelslim_w8a8_int8_moe.py b/sglang/python/sglang/srt/layers/quantization/modelslim/schemes/modelslim_w8a8_int8_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..b226797f36f07448d627fe7d9719d1f90f17e64d
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/modelslim/schemes/modelslim_w8a8_int8_moe.py
@@ -0,0 +1,139 @@
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Any, Dict
+
+import torch
+
+from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import (
+    NPUW8A8Int8DynamicMoEMethod,
+)
+from sglang.srt.layers.quantization.modelslim.schemes import ModelSlimMoEScheme
+from sglang.srt.utils import set_weight_attrs
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe import MoeRunnerConfig
+    from sglang.srt.layers.moe.token_dispatcher import (
+        CombineInput,
+        StandardDispatchOutput,
+    )
+
+logger = logging.getLogger(__name__)
+
+__all__ = [
+    "ModelSlimW8A8Int8MoE",
+]
+
+
+class ModelSlimW8A8Int8MoE(ModelSlimMoEScheme):
+
+    def __init__(
+        self,
+        quant_config: Dict[str, Any],
+        prefix: str = None,
+    ):
+        self.quant_config = quant_config
+        self.kernel = NPUW8A8Int8DynamicMoEMethod()
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ) -> None:
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        self.num_experts = num_experts
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
+        )
+
+        # weight
+        w13_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size,
+                dtype=torch.int8,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+        w2_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition,
+                dtype=torch.int8,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight", w2_weight)
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+        # scale
+        w13_weight_scale = torch.nn.Parameter(
+            torch.empty(
+                num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight_scale", w13_weight_scale)
+        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
+        w2_weight_scale = torch.nn.Parameter(
+            torch.empty(num_experts, hidden_size, 1, dtype=torch.float32),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight_scale", w2_weight_scale)
+        set_weight_attrs(w2_weight_scale, extra_weight_attrs)
+        # offset
+        w13_weight_offset = torch.nn.Parameter(
+            torch.empty(
+                num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight_offset", w13_weight_offset)
+        set_weight_attrs(w13_weight_offset, extra_weight_attrs)
+        w2_weight_offset = torch.nn.Parameter(
+            torch.empty(num_experts, hidden_size, 1, dtype=torch.float32),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight_offset", w2_weight_offset)
+        set_weight_attrs(w2_weight_offset, extra_weight_attrs)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        self.kernel.process_weights_after_loading(layer)
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: "MoeRunnerConfig"
+    ):
+        self.moe_runner_config = moe_runner_config
+
+    def apply_weights(
+        self,
+        layer,
+        dispatch_output: "StandardDispatchOutput",
+    ) -> "CombineInput":
+        return self.kernel.apply(layer, dispatch_output)
+
+    def apply_without_routing_weights(
+        self,
+        layer,
+        hidden_states,
+        hidden_states_scale,
+        group_list_type,
+        group_list,
+        output_dtype,
+    ):
+        return self.kernel.apply_without_routing_weights(
+            layer,
+            hidden_states,
+            hidden_states_scale,
+            group_list_type,
+            group_list,
+            output_dtype,
+        )
diff --git a/sglang/python/sglang/srt/layers/quantization/moe_wna16.py b/sglang/python/sglang/srt/layers/quantization/moe_wna16.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce6397dd1962bc3db2a391a871580c16955d446e
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/moe_wna16.py
@@ -0,0 +1,506 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/moe_wna16.py
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
+
+import numpy as np
+import torch
+
+from sglang.srt.distributed import get_tensor_model_parallel_rank
+from sglang.srt.distributed.parallel_state import get_tp_group
+from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig
+from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo
+from sglang.srt.layers.quantization.awq import AWQConfig
+from sglang.srt.layers.quantization.base_config import (
+    FusedMoEMethodBase,
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.gptq import GPTQConfig, GPTQMarlinConfig
+from sglang.srt.layers.quantization.unquant import (
+    UnquantizedFusedMoEMethod,
+    UnquantizedLinearMethod,
+)
+from sglang.srt.utils import get_device_capability, set_weight_attrs
+
+logger = logging.getLogger(__name__)
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.token_dispatcher import (
+        CombineInput,
+        StandardDispatchOutput,
+    )
+
+
+def get_weight_perm(num_bits: int):
+    perm_list: List[int] = []
+    for i in range(32):
+        perm1: List[int] = []
+        col = i // 4
+        for block in [0, 1]:
+            for row in [
+                2 * (i % 4),
+                2 * (i % 4) + 1,
+                2 * (i % 4 + 4),
+                2 * (i % 4 + 4) + 1,
+            ]:
+                perm1.append(16 * row + col + 8 * block)
+        for j in range(4):
+            perm_list.extend([p + 256 * j for p in perm1])
+
+    perm = np.array(perm_list)
+
+    if num_bits == 4:
+        interleave = np.array([0, 2, 4, 6, 1, 3, 5, 7])
+    elif num_bits == 8:
+        interleave = np.array([0, 2, 1, 3])
+    else:
+        raise Exception("num_bits must be 4 or 8, got {}".format(num_bits))
+
+    perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel()
+    perm = torch.from_numpy(perm)
+    return perm
+
+
+class MoeWNA16Config(QuantizationConfig):
+    """Config class for MOE WNA16 (W8A16/W4A16) quantization."""
+
+    def __init__(
+        self,
+        linear_quant_method: str,
+        weight_bits: int,
+        group_size: int,
+        has_zp: bool,
+        lm_head_quantized: bool,
+        modules_to_not_convert: Optional[List[str]],
+        full_config: Dict[str, Any],
+    ) -> None:
+        super().__init__()
+        self.weight_bits = weight_bits
+        self.group_size = group_size
+        self.has_zp = has_zp
+        self.bit8_pack_factor = 8 // self.weight_bits
+        self.lm_head_quantized = lm_head_quantized
+        self.linear_quant_method = linear_quant_method
+        self.full_config = full_config
+        self.use_marlin = False
+        # Avoid circular import
+
+        if self.linear_quant_method == "gptq":
+            self.use_marlin = GPTQMarlinConfig.is_gptq_marlin_compatible(full_config)
+        elif self.linear_quant_method == "awq":
+            capability_tuple = get_device_capability()
+            device_capability = (
+                -1
+                if capability_tuple is None
+                else capability_tuple[0] * 10 + capability_tuple[1]
+            )
+            awq_min_capability = AWQConfig.get_min_capability()
+            if device_capability < awq_min_capability:
+                raise ValueError(
+                    "The quantization method moe_wna16 + awq is not supported "
+                    "for the current GPU. "
+                    f"Minimum capability: {awq_min_capability}. "
+                    f"Current capability: {device_capability}."
+                )
+        else:
+            raise ValueError("moe_wna16 only support gptq and awq.")
+
+        if modules_to_not_convert is None:
+            self.modules_to_not_convert = []
+        else:
+            self.modules_to_not_convert = modules_to_not_convert
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "moe_wna16"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.bfloat16, torch.half]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 70
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return ["quantize_config.json"]
+
+    def get_scaled_act_names(self) -> List[str]:
+        raise NotImplementedError
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> MoeWNA16Config:
+        quant_method = cls.get_from_keys(config, ["quant_method"])
+        weight_bits = cls.get_from_keys(config, ["bits"])
+        group_size = cls.get_from_keys(config, ["group_size"])
+        lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False)
+        if quant_method == "gptq":
+            has_zp = not cls.get_from_keys(config, ["sym"])
+            modules_to_not_convert = []
+        elif quant_method == "awq":
+            has_zp = cls.get_from_keys(config, ["zero_point"])
+            modules_to_not_convert = cls.get_from_keys_or(
+                config, ["modules_to_not_convert"], None
+            )
+        else:
+            raise ValueError("moe_wna16 only support gptq and awq.")
+
+        return cls(
+            quant_method,
+            weight_bits,
+            group_size,
+            has_zp,
+            lm_head_quantized,
+            modules_to_not_convert,
+            config,
+        )
+
+    @classmethod
+    def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]:
+        if user_quant == "moe_wna16" and cls.is_moe_wna16_compatible(hf_quant_cfg):
+            return cls.get_name()
+        return None
+
+    @classmethod
+    def is_moe_wna16_compatible(cls, quant_config: Dict[str, Any]):
+        # Extract data from quant config.
+        quant_method = quant_config.get("quant_method", "").lower()
+        num_bits = quant_config.get("bits")
+        desc_act = quant_config.get("desc_act")
+
+        capability_tuple = get_device_capability()
+        device_capability = (
+            -1
+            if all(capability is None for capability in capability_tuple)
+            else capability_tuple[0] * 10 + capability_tuple[1]
+        )
+        # Avoid circular import
+        awq_min_capability = AWQConfig.get_min_capability()
+
+        gptq_compatible = quant_method == "gptq" and not desc_act and num_bits in [4, 8]
+        awq_compatible = (
+            quant_method == "awq"
+            and num_bits == 4
+            and device_capability >= awq_min_capability
+        )
+
+        return gptq_compatible or awq_compatible
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional[QuantizeMethodBase]:
+        # avoid circular import
+        from sglang.srt.layers.linear import LinearBase
+        from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+
+        if is_layer_skipped_quant(prefix, self.modules_to_not_convert):
+            if isinstance(layer, FusedMoE):
+                return UnquantizedFusedMoEMethod()
+            return UnquantizedLinearMethod()
+        elif isinstance(layer, LinearBase):
+
+            if self.linear_quant_method == "gptq":
+                if self.use_marlin:
+                    return GPTQMarlinConfig.from_config(
+                        self.full_config
+                    ).get_quant_method(layer, prefix)
+                else:
+                    return GPTQConfig.from_config(self.full_config).get_quant_method(
+                        layer, prefix
+                    )
+            elif self.linear_quant_method == "awq":
+                return AWQConfig.from_config(self.full_config).get_quant_method(
+                    layer, prefix
+                )
+            else:
+                raise ValueError("moe_wna16 only support gptq and awq.")
+        elif isinstance(layer, FusedMoE):
+            return MoeWNA16Method(self)
+        return None
+
+
+def is_layer_skipped_quant(prefix: str, modules_to_not_convert: List[str]):
+    return any(module_name in prefix for module_name in modules_to_not_convert)
+
+
+class MoeWNA16Method(FusedMoEMethodBase):
+    """Linear method for MOE WNA16 (W8A16/W4A16) quantization.
+
+    Args:
+        quant_config: The MOE WNA16 (W8A16/W4A16) quantization config.
+    """
+
+    def __init__(self, quant_config: MoeWNA16Config):
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        layer.quant_config = self.quant_config
+        bit8_pack_factor = self.quant_config.bit8_pack_factor
+        group_size = self.quant_config.group_size
+        group_size_div_factor = 1
+
+        # make intermediate_size and hidden_size diviable by group_size
+        # we reduce the group size to ensure that
+        # and we would repeat the loaded_weight later
+        while intermediate_size_per_partition % group_size or hidden_size % group_size:
+            group_size = group_size // 2
+            group_size_div_factor *= 2
+            assert group_size >= 32
+        layer.group_size = group_size
+        layer.group_size_div_factor = group_size_div_factor
+
+        strategy = FusedMoeWeightScaleSupported.GROUP.value
+        extra_weight_attrs.update({"quant_method": strategy, "is_transposed": False})
+
+        assert "weight_loader" in extra_weight_attrs
+        weight_loader = extra_weight_attrs["weight_loader"]
+        wrapped_weight_loader = MoeWNA16Method.get_weight_loader(layer, weight_loader)
+        extra_weight_attrs["weight_loader"] = wrapped_weight_loader
+
+        # Fused gate_up_proj (column parallel)
+        w13_qweight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size // bit8_pack_factor,
+                dtype=torch.uint8,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_qweight", w13_qweight)
+        set_weight_attrs(w13_qweight, extra_weight_attrs)
+
+        # down_proj (row parallel)
+        w2_qweight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition // bit8_pack_factor,
+                dtype=torch.uint8,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_qweight", w2_qweight)
+        set_weight_attrs(w2_qweight, extra_weight_attrs)
+
+        w13_scales = torch.nn.Parameter(
+            torch.zeros(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size // group_size,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_scales", w13_scales)
+        set_weight_attrs(w13_scales, extra_weight_attrs)
+
+        w2_scales = torch.nn.Parameter(
+            torch.zeros(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition // group_size,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_scales", w2_scales)
+        set_weight_attrs(w2_scales, extra_weight_attrs)
+
+        if self.quant_config.has_zp:
+            w13_qzeros = torch.nn.Parameter(
+                torch.zeros(
+                    num_experts,
+                    2 * intermediate_size_per_partition // bit8_pack_factor,
+                    hidden_size // group_size,
+                    dtype=torch.uint8,
+                ),
+                requires_grad=False,
+            )
+            layer.register_parameter("w13_qzeros", w13_qzeros)
+            set_weight_attrs(w13_qzeros, extra_weight_attrs)
+
+            w2_qzeros = torch.nn.Parameter(
+                torch.zeros(
+                    num_experts,
+                    hidden_size // bit8_pack_factor,
+                    intermediate_size_per_partition // group_size,
+                    dtype=torch.uint8,
+                ),
+                requires_grad=False,
+            )
+            layer.register_parameter("w2_qzeros", w2_qzeros)
+            set_weight_attrs(w2_qzeros, extra_weight_attrs)
+
+        if self.quant_config.linear_quant_method == "gptq":
+            # some param are unused, but we need to init them in order to
+            # load weights
+            invalid_param_keys = ["w13_g_idx", "w2_g_idx"]
+            if not self.quant_config.has_zp:
+                invalid_param_keys += ["w13_qzeros", "w2_qzeros"]
+            for key in invalid_param_keys:
+                param = torch.nn.Parameter(
+                    torch.empty((0,), dtype=torch.int32), requires_grad=False
+                )
+                layer.register_parameter(key, param)
+                set_weight_attrs(param, extra_weight_attrs)
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+        self.runner = MoeRunner(MoeRunnerBackend.TRITON, moe_runner_config)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+        assert (
+            self.moe_runner_config.activation == "silu"
+        ), "Only SiLU activation is supported."
+
+        weight_bits = self.quant_config.weight_bits
+        has_zp = self.quant_config.has_zp
+
+        quant_info = TritonMoeQuantInfo(
+            w13_weight=layer.w13_qweight,
+            w2_weight=layer.w2_qweight,
+            use_int4_w4a16=weight_bits == 4,
+            use_int8_w8a16=weight_bits == 8,
+            w13_scale=layer.w13_scales,
+            w2_scale=layer.w2_scales,
+            w13_zp=layer.w13_qzeros if has_zp else None,
+            w2_zp=layer.w2_qzeros if has_zp else None,
+            block_shape=[0, layer.group_size],
+        )
+        return self.runner.run(dispatch_output, quant_info)
+
+    @staticmethod
+    def get_weight_loader(layer, weight_loader):
+
+        def convert_awq_tensor(tensor, tensor_type):
+            # convert awq qweight/qzeros to a standard format (assume int4)
+            # qweight: (k, n // pack_factor_bit32) -> (n, k // pack_factor_bit8)
+            # qzeros: (k // group_size, n // pack_factor_bit32) ->
+            #         (n // pack_factor_bit8, k // group_size)
+            # pack_factor_bit32 = 32 // weight_bits
+            # pack_factor_bit8 = 8 // weight_bits
+
+            # 0. suppose origin shape (a, b), dtype int32
+            # 1. convert to uint8, shape (a, b) -> (a, 4 * b)
+            size0 = tensor.size(0)
+            tensor = tensor.view(torch.uint8)
+
+            # 2. unpack to uint4 (only when weight_bits == 4)
+            #    shape (a, 4 * b) -> (a, 4 * b, 2)
+            shifter = torch.tensor([0, 4], dtype=torch.uint8, device=tensor.device)
+            tensor = (tensor[:, :, None] >> shifter) & 0xF
+
+            # 3. change order, see
+            # https://github.com/casper-hansen/AutoAWQ/blob/v0.2.8/awq/utils/quant_utils.py
+            # shape -> (a, 4 * b * pack_factor_bit8)
+            reverse_awq_pack_order = [0, 4, 1, 5, 2, 6, 3, 7]
+            tensor = tensor.view(-1, 8)[:, reverse_awq_pack_order]
+            tensor = tensor.view(size0, -1)
+
+            # 4. transpose, shape -> (4 * b * pack_factor_bit8, a)
+            tensor = tensor.T.contiguous()
+
+            # 5. repack (only when weight_bits == 4)
+            # qweight shape -> (4 * b * pack_factor_bit8, a // pack_factor_bit8)
+            # qzeros shape -> (4 * b, a)
+
+            if tensor_type == "qweight":
+                tensor = tensor[:, 1::2] * 16 + tensor[:, ::2]
+            elif tensor_type == "qzeros":
+                tensor = tensor[1::2, :] * 16 + tensor[::2, :]
+            return tensor
+
+        def convert_gptq_int4_qzeros(tensor):
+            tensor = tensor.view(torch.uint8)
+            shifter = torch.tensor([0, 4], dtype=torch.uint8, device=tensor.device)
+            tensor = (tensor[:, :, None] >> shifter) & 0xF
+            tensor = tensor + 1
+            tensor = tensor[:, :, 0] + tensor[:, :, 1] * 16
+            return tensor
+
+        def moe_wna16_weight_loader(
+            param: torch.nn.Parameter,
+            loaded_weight: torch.Tensor,
+            weight_name: str,
+            shard_id: str,
+            expert_id: int,
+        ):
+            if "g_idx" in weight_name:
+                return
+            if not layer.quant_config.has_zp and "qzeros" in weight_name:
+                return
+
+            device = get_tp_group().device
+            tp_rank = get_tensor_model_parallel_rank()
+            loaded_weight = loaded_weight.to(device)
+            shard_size = layer.intermediate_size_per_partition
+
+            # convert gptq and awq weight to a standard format
+            if layer.quant_config.linear_quant_method == "awq":
+                assert layer.quant_config.weight_bits == 4
+                if "weight" in weight_name:
+                    loaded_weight = convert_awq_tensor(loaded_weight, "qweight")
+                elif "zeros" in weight_name:
+                    loaded_weight = convert_awq_tensor(loaded_weight, "qzeros")
+                else:
+                    loaded_weight = loaded_weight.T
+            elif layer.quant_config.linear_quant_method == "gptq":
+                assert layer.quant_config.weight_bits in [4, 8]
+                if "weight" in weight_name:
+                    loaded_weight = loaded_weight.T.contiguous().view(torch.uint8)
+                elif "zeros" in weight_name:
+                    # add 1 to gptq qzeros to align with awq
+                    loaded_weight = loaded_weight.view(torch.uint8)
+                    if layer.quant_config.weight_bits == 4:
+                        loaded_weight = convert_gptq_int4_qzeros(loaded_weight).T
+                    else:
+                        loaded_weight = loaded_weight.T + 1
+                else:
+                    loaded_weight = loaded_weight.T
+
+            # repeat the qzeros/scales to fit new group size
+            if (
+                layer.group_size_div_factor > 1
+                and "qzeros" in weight_name
+                or "scales" in weight_name
+            ):
+                loaded_weight = loaded_weight.repeat_interleave(
+                    layer.group_size_div_factor, 1
+                )
+
+            if "w13_qzeros" in weight_name:
+                tensor = loaded_weight.view(
+                    layer.moe_tp_size, -1, loaded_weight.size(1)
+                )[tp_rank]
+                if shard_id == "w1":
+                    param.data[expert_id, : shard_size // 2] = tensor
+                else:
+                    param.data[expert_id, shard_size // 2 :] = tensor
+            elif "w2_qzeros" in weight_name:
+                param.data[expert_id] = loaded_weight.view(
+                    loaded_weight.size(0), layer.moe_tp_size, -1
+                )[:, tp_rank]
+            else:
+                weight_loader(param, loaded_weight, weight_name, shard_id, expert_id)
+
+        return moe_wna16_weight_loader
diff --git a/sglang/python/sglang/srt/layers/quantization/mxfp4.py b/sglang/python/sglang/srt/layers/quantization/mxfp4.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c4b3ac12756f6aeab72b30849e8eeb58f19fd5e
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/mxfp4.py
@@ -0,0 +1,1050 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/mxfp4.py
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, List, Optional
+
+import torch
+from torch.nn.parameter import Parameter
+
+from sglang.srt.distributed import get_tp_group
+from sglang.srt.distributed.device_communicators.pynccl_allocator import (
+    use_symmetric_memory,
+)
+from sglang.srt.layers.dp_attention import is_allocation_symmetric
+from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig
+from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo
+from sglang.srt.layers.moe.utils import get_moe_runner_backend
+from sglang.srt.layers.quantization.base_config import (
+    FusedMoEMethodBase,
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.utils import is_layer_skipped
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import (
+    is_flashinfer_available,
+    is_gfx95_supported,
+    is_hip,
+    is_sm90_supported,
+    is_sm100_supported,
+    is_sm120_supported,
+    is_triton_kernels_available,
+    mxfp_supported,
+    next_power_of_2,
+    round_up,
+    set_weight_attrs,
+)
+from sglang.srt.utils.common import get_bool_env_var
+from sglang.srt.utils.custom_op import register_custom_op
+
+has_triton_kernels = is_triton_kernels_available()
+
+
+if is_flashinfer_available():
+    from flashinfer import (
+        mxfp8_quantize,
+        nvfp4_block_scale_interleave,
+        trtllm_fp4_block_scale_moe,
+    )
+    from flashinfer.fused_moe.core import get_w2_permute_indices_with_cache
+
+_flashinfer_mxfp4_permute_indices_cache: dict[torch.Size, torch.Tensor] = {}
+_flashinfer_mxfp4_permute_indices_device_cache: dict[
+    tuple[tuple[int, ...], int, int, str, int], torch.Tensor
+] = {}
+
+
+def _get_flashinfer_mxfp4_device_permute_indices(
+    x: torch.Tensor,
+    epilogue_tile_m: int,
+    num_elts_per_sf: Optional[int] = None,
+) -> torch.Tensor:
+    extra_args = {} if num_elts_per_sf is None else {"num_elts_per_sf": num_elts_per_sf}
+    permute_indices = get_w2_permute_indices_with_cache(
+        _flashinfer_mxfp4_permute_indices_cache,
+        x,
+        epilogue_tile_m,
+        **extra_args,
+    )
+
+    device_index = -1 if x.device.index is None else x.device.index
+    num_elts_per_sf_key = -1 if num_elts_per_sf is None else num_elts_per_sf
+    cache_key = (
+        tuple(x.shape),
+        epilogue_tile_m,
+        num_elts_per_sf_key,
+        x.device.type,
+        device_index,
+    )
+    cached_device_indices = _flashinfer_mxfp4_permute_indices_device_cache.get(
+        cache_key
+    )
+    if cached_device_indices is None:
+        cached_device_indices = permute_indices.to(x.device)
+        _flashinfer_mxfp4_permute_indices_device_cache[cache_key] = (
+            cached_device_indices
+        )
+
+    return cached_device_indices
+
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.token_dispatcher import (
+        CombineInput,
+        StandardDispatchOutput,
+    )
+
+_is_hip = is_hip()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+_is_shuffle_moe_mxfp4 = is_gfx95_supported()
+
+if _is_hip:
+    # import aiter
+    try:
+        from aiter import ActivationType, QuantType
+        from aiter.fused_moe import fused_moe
+        from aiter.ops.shuffle import (
+            shuffle_scale_a16w4,
+            shuffle_weight,
+            shuffle_weight_a16w4,
+        )
+        from aiter.ops.triton.quant import dynamic_mxfp4_quant
+        from aiter.utility.fp4_utils import e8m0_shuffle
+    except ImportError as err:
+        ActivationType = QuantType = fused_moe = dynamic_mxfp4_quant = e8m0_shuffle = (
+            err
+        )
+
+
+def _swizzle_mxfp4(quant_tensor, scale, num_warps):
+    """weight swizzle for mxfp4 moe, used for OAI mxfp4 kernel"""
+    import triton_kernels.matmul_ogs_details.opt_flags as opt_flags
+    from triton_kernels.numerics import InFlexData
+    from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor
+    from triton_kernels.tensor_details import layout
+
+    if is_sm120_supported():
+        # SM120 desktop Blackwell does not support the persistent/TMA MXFP4 path.
+        # This MXFP4 path uses StridedLayout and the non-persistent kernel with
+        # block_k=128 so the selected tile stays within the per-block shared-memory budget.
+        from triton_kernels.tensor_details.layout import StridedLayout
+
+        value_layout = StridedLayout
+        value_layout_opts = {}
+        scale_layout = StridedLayout
+        scale_layout_opts = {}
+        constraints = {
+            "is_persistent": False,
+            "block_k": 128,
+            "num_stages": 1,
+        }
+        opt_flags.update_opt_flags_constraints(constraints)
+    else:
+        value_layout, value_layout_opts = layout.make_default_matmul_mxfp4_w_layout(
+            mx_axis=1
+        )
+        scale_layout, scale_layout_opts = (
+            layout.make_default_matmul_mxfp4_w_scale_layout(
+                mx_axis=1, num_warps=num_warps
+            )
+        )
+        if is_sm100_supported():
+            constraints = {
+                "is_persistent": True,
+                "epilogue_subtile": 1,
+            }
+            opt_flags.update_opt_flags_constraints(constraints)
+        elif is_sm90_supported():
+            constraints = {
+                "split_k": 1,
+            }
+            opt_flags.update_opt_flags_constraints(constraints)
+    # transpose the tensor so that the quantization axis is on dim1
+    quant_tensor = quant_tensor.transpose(-2, -1)
+    scale = scale.transpose(-2, -1)
+    quant_tensor = convert_layout(
+        wrap_torch_tensor(quant_tensor, dtype=FP4), value_layout, **value_layout_opts
+    )
+    scale = convert_layout(wrap_torch_tensor(scale), scale_layout, **scale_layout_opts)
+    return quant_tensor, InFlexData(), scale
+
+
+def _dequant_mxfp4_fake(
+    x: torch.Tensor, scale: torch.Tensor, float_dtype: torch.dtype
+) -> torch.Tensor:
+    return torch.empty(
+        (*x.shape[:-1], x.shape[-1] * 2), dtype=float_dtype, device=x.device
+    )
+
+
+@register_custom_op(fake_impl=_dequant_mxfp4_fake)
+def dequant_mxfp4(
+    x: torch.Tensor, scale: torch.Tensor, float_dtype: torch.dtype
+) -> torch.Tensor:
+    try:
+        from quark.torch.kernel import mx
+    except ImportError as err:
+        raise ImportError(
+            "The package `amd-quark` is required to use "
+            "MX-FP4 models. Please install it with `pip install "
+            "amd-quark`."
+        ) from err
+
+    return mx.dq_mxfp4(x, scale, float_dtype)
+
+
+@register_custom_op(out_shape="x")
+def quant_dequant_mxfp4(
+    x: torch.Tensor, scale_calculation_mode: str = "even"
+) -> torch.Tensor:
+    try:
+        from quark.torch.kernel import mx
+    except ImportError as err:
+        raise ImportError(
+            "The package `amd-quark` is required to use "
+            "MX-FP4 models. Please install it with `pip install "
+            "amd-quark`."
+        ) from err
+
+    return mx.qdq_mxfp4(x, scale_calculation_mode)
+
+
+class Mxfp4Config(QuantizationConfig):
+
+    def __init__(
+        self,
+        ignored_layers: Optional[list[str]] = None,
+        is_checkpoint_mxfp4_serialized: bool = False,
+    ):
+        super().__init__()
+        self.is_checkpoint_mxfp4_serialized = is_checkpoint_mxfp4_serialized
+        self.ignored_layers = ignored_layers
+
+    @classmethod
+    def from_config(cls, config):
+
+        quant_method = cls.get_from_keys(config, ["quant_method"])
+        is_checkpoint_mxfp4_serialized = "mxfp4" in quant_method
+
+        if _is_hip:
+            if mxfp_supported():
+                return cls(
+                    is_checkpoint_mxfp4_serialized=is_checkpoint_mxfp4_serialized
+                )
+            else:
+
+                platform = torch.cuda.get_device_properties(0).gcnArchName
+                raise ValueError(
+                    f"Current platform {platform} not support mxfp4 computation"
+                )
+
+        return cls(is_checkpoint_mxfp4_serialized=is_checkpoint_mxfp4_serialized)
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 80
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "mxfp4"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
+        return [torch.bfloat16, torch.float16]
+
+    @classmethod
+    def get_config_filenames(cls) -> list[str]:
+        return []
+
+    def is_static_cfg(self):
+        return self.is_checkpoint_mxfp4_serialized
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["QuantizeMethodBase"]:
+
+        from sglang.srt.layers.linear import LinearBase
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+        from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
+
+        if isinstance(layer, LinearBase):
+            if self.ignored_layers and is_layer_skipped(
+                prefix=prefix,
+                ignored_layers=self.ignored_layers,
+                fused_mapping=self.packed_modules_mapping,
+            ):
+                return UnquantizedLinearMethod()
+            elif _is_hip:
+                return UnquantizedLinearMethod()
+        elif isinstance(layer, FusedMoE):
+            if self.is_checkpoint_mxfp4_serialized:
+                return Mxfp4MoEMethod(prefix=prefix)
+            else:
+                return Mxfp4DynamicQuantMoEMethod()
+        else:
+            if self.is_checkpoint_mxfp4_serialized:
+                raise NotImplementedError("Mxfp4 attention layer is not implemented")
+        return None
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+class Mxfp4MoEMethod(FusedMoEMethodBase):
+
+    def __init__(
+        self,
+        prefix: str,
+    ):
+        super().__init__()
+
+        self.prefix = prefix
+        self.topk_indices_dtype = None
+        self.use_triton_kernels = get_moe_runner_backend().is_triton_kernels()
+        self.with_bias = False
+        self.use_flashinfer = get_moe_runner_backend().is_flashinfer_mxfp4()
+        self.flashinfer_mxfp4_moe_precision = (
+            get_global_server_args().flashinfer_mxfp4_moe_precision
+        )
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        with_bias: bool = False,
+        **extra_weight_attrs,
+    ):
+        self.num_experts = num_experts
+        weight_dtype = torch.uint8
+        scale_dtype = torch.uint8
+        self.with_bias = with_bias
+        mxfp4_block = 32
+        triton_kernels_padding_alignment = 64
+
+        # pad the intermediate size to be a multiple of 2 * mxfp4_block
+        # for to hold non-uniform sharded tensor as well as swizzling
+        intermediate_size_per_partition_after_pad = intermediate_size_per_partition
+        if is_sm100_supported():
+            if self.use_flashinfer:
+                intermediate_size_per_partition_after_pad = round_up(
+                    intermediate_size_per_partition, 256
+                )
+                hidden_size = round_up(hidden_size, 256)
+            else:
+                intermediate_size_per_partition_after_pad = round_up(
+                    intermediate_size_per_partition, triton_kernels_padding_alignment
+                )
+        elif _use_aiter:
+
+            intermediate_size_per_partition_after_pad = round_up(
+                intermediate_size_per_partition, 256
+            )
+
+            hidden_size = round_up(hidden_size, 256)
+            self.hidden_pad = hidden_size - layer.hidden_size
+            self.intermediate_pad = (
+                intermediate_size_per_partition_after_pad
+                - layer.intermediate_size_per_partition
+            )
+        elif has_triton_kernels:
+            intermediate_size_per_partition_after_pad = round_up(
+                intermediate_size_per_partition, triton_kernels_padding_alignment
+            )
+
+        self.intermediate_size_per_partition = intermediate_size_per_partition_after_pad
+
+        self.hidden_size = hidden_size
+        # Fused gate_up_proj (column parallel)
+        w13_weight = torch.nn.Parameter(
+            torch.zeros(
+                layer.num_local_experts,
+                2 * intermediate_size_per_partition_after_pad,
+                hidden_size // 2,
+                dtype=weight_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+
+        w13_weight_scale = torch.nn.Parameter(
+            torch.zeros(
+                layer.num_local_experts,
+                2 * intermediate_size_per_partition_after_pad,
+                hidden_size // mxfp4_block,
+                dtype=scale_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight_scale", w13_weight_scale)
+        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
+
+        w13_weight_bias = torch.nn.Parameter(
+            torch.zeros(
+                layer.num_local_experts,
+                2 * intermediate_size_per_partition_after_pad,
+                dtype=torch.bfloat16,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight_bias", w13_weight_bias)
+        set_weight_attrs(w13_weight_bias, extra_weight_attrs)
+
+        # down_proj (row parallel)
+        w2_weight = torch.nn.Parameter(
+            torch.zeros(
+                layer.num_local_experts,
+                hidden_size,
+                intermediate_size_per_partition_after_pad // 2,
+                dtype=weight_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight", w2_weight)
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+
+        w2_weight_scale = torch.nn.Parameter(
+            torch.zeros(
+                layer.num_local_experts,
+                hidden_size,
+                intermediate_size_per_partition_after_pad // mxfp4_block,
+                dtype=scale_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight_scale", w2_weight_scale)
+        set_weight_attrs(w2_weight_scale, extra_weight_attrs)
+
+        w2_weight_bias = torch.nn.Parameter(
+            torch.zeros(layer.num_local_experts, hidden_size, dtype=torch.bfloat16),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight_bias", w2_weight_bias)
+        set_weight_attrs(w2_weight_bias, extra_weight_attrs)
+
+    def process_weights_after_loading(self, layer):
+        if self.use_flashinfer:
+            # TODO: these values are hardcoded for now, we need to get them from the model
+            layer.gemm1_alpha = Parameter(
+                torch.tensor([1.702] * self.num_experts, dtype=torch.float32).cuda(),
+                requires_grad=False,
+            )
+            layer.gemm1_beta = Parameter(
+                torch.tensor([1.0] * self.num_experts, dtype=torch.float32).cuda(),
+                requires_grad=False,
+            )
+            layer.gemm1_clamp_limit = Parameter(
+                torch.tensor([7.0] * self.num_experts, dtype=torch.float32).cuda(),
+                requires_grad=False,
+            )
+            sf_block_size = 32  # mxfp4 block size
+
+            assert (
+                layer.w13_weight.dim() == 3
+                and layer.w13_weight.shape[0] == self.num_experts
+                and layer.w13_weight.shape[1]
+                == self.intermediate_size_per_partition * 2
+                and layer.w13_weight.shape[2] == self.hidden_size // 2
+            )
+            assert (
+                layer.w13_weight_scale.dim() == 3
+                and layer.w13_weight_scale.shape[0] == self.num_experts
+                and layer.w13_weight_scale.shape[1]
+                == self.intermediate_size_per_partition * 2
+                and layer.w13_weight_scale.shape[2] == self.hidden_size // sf_block_size
+            )
+            assert (
+                layer.w2_weight.dim() == 3
+                and layer.w2_weight.shape[0] == self.num_experts
+                and layer.w2_weight.shape[1] == self.hidden_size
+                and layer.w2_weight.shape[2]
+                == self.intermediate_size_per_partition // 2
+            )
+            assert (
+                layer.w2_weight_scale.dim() == 3
+                and layer.w2_weight_scale.shape[1] == self.hidden_size
+                and layer.w2_weight_scale.shape[2]
+                == self.intermediate_size_per_partition // sf_block_size
+            )
+            assert (
+                layer.w13_weight_bias.dim() == 2
+                and layer.w13_weight_bias.shape[0] == self.num_experts
+                and layer.w13_weight_bias.shape[1]
+                == self.intermediate_size_per_partition * 2
+            )
+            assert (
+                layer.w2_weight_bias.dim() == 2
+                and layer.w2_weight_bias.shape[0] == self.num_experts
+                and layer.w2_weight_bias.shape[1] == self.hidden_size
+            )
+
+            w13_weight_scale = layer.w13_weight_scale.data
+            w2_weight_scale = layer.w2_weight_scale.data
+            w13_weight = layer.w13_weight.data
+            w2_weight = layer.w2_weight.data
+            w13_bias = layer.w13_weight_bias.data.to(torch.float32)
+            w2_bias = layer.w2_weight_bias.data.to(torch.float32)
+
+            # Swap w1 and w3 as the definition of
+            # swiglu is different in the trtllm-gen
+            def swap_every_two_rows(x, axis=-1):
+                shape = x.shape
+                if axis < 0:
+                    axis = len(shape) + axis
+
+                # Create a new shape with pairs swapped along specified axis
+                new_shape = list(shape)
+                new_shape[axis] = shape[axis] // 2
+                new_shape.insert(axis + 1, 2)
+
+                # Reshape to expose pairs, swap them, and reshape back
+                x = x.reshape(*new_shape)
+                x = x.flip(axis + 1)
+                new_shape = list(shape)
+                return x.reshape(*new_shape)
+
+            w13_weight_scale = swap_every_two_rows(w13_weight_scale, -2)
+            w13_weight = swap_every_two_rows(w13_weight, -2)
+            w13_bias = swap_every_two_rows(w13_bias, -1)
+
+            # Shuffle weights and scaling factors for transposed mma output
+            gemm1_weights_mxfp4_shuffled = []
+            gemm1_scales_mxfp4_shuffled = []
+            gemm2_weights_mxfp4_shuffled = []
+            gemm2_scales_mxfp4_shuffled = []
+            gemm1_bias_shuffled = []
+            gemm2_bias_shuffled = []
+            epilogue_tile_m = 128  # FIXME: this depends on the kernel internals
+            w13_weight_permute_indices = _get_flashinfer_mxfp4_device_permute_indices(
+                w13_weight[0].view(torch.uint8),
+                epilogue_tile_m,
+            )
+            w13_scale_permute_indices = _get_flashinfer_mxfp4_device_permute_indices(
+                w13_weight_scale[0].view(torch.uint8),
+                epilogue_tile_m,
+                num_elts_per_sf=16,
+            )
+            w13_bias_permute_indices = _get_flashinfer_mxfp4_device_permute_indices(
+                w13_bias[0].reshape(-1, 1),
+                epilogue_tile_m,
+            )
+
+            w2_weight_permute_indices = _get_flashinfer_mxfp4_device_permute_indices(
+                w2_weight[0].view(torch.uint8),
+                epilogue_tile_m,
+            )
+            w2_scale_permute_indices = _get_flashinfer_mxfp4_device_permute_indices(
+                w2_weight_scale[0].view(torch.uint8),
+                epilogue_tile_m,
+                num_elts_per_sf=16,
+            )
+            w2_bias_permute_indices = _get_flashinfer_mxfp4_device_permute_indices(
+                w2_bias[0].reshape(-1, 1),
+                epilogue_tile_m,
+            )
+
+            for i in range(self.num_experts):
+                gemm1_weights_mxfp4_shuffled.append(
+                    w13_weight[i]
+                    .view(torch.uint8)[w13_weight_permute_indices]
+                    .contiguous()
+                )
+
+                gemm1_scales_mxfp4_shuffled.append(
+                    nvfp4_block_scale_interleave(
+                        w13_weight_scale[i]
+                        .view(torch.uint8)[w13_scale_permute_indices]
+                        .contiguous()
+                    )
+                )
+
+                gemm1_bias_shuffled.append(
+                    w13_bias[i].reshape(-1, 1)[w13_bias_permute_indices].contiguous()
+                )
+
+                gemm2_weights_mxfp4_shuffled.append(
+                    w2_weight[i]
+                    .view(torch.uint8)[w2_weight_permute_indices]
+                    .contiguous()
+                )
+
+                gemm2_scales_mxfp4_shuffled.append(
+                    nvfp4_block_scale_interleave(
+                        w2_weight_scale[i]
+                        .view(torch.uint8)[w2_scale_permute_indices]
+                        .contiguous()
+                    )
+                )
+
+                gemm2_bias_shuffled.append(
+                    w2_bias[i].reshape(-1, 1)[w2_bias_permute_indices].contiguous()
+                )
+
+            w13_weight = torch.stack(gemm1_weights_mxfp4_shuffled)
+            w13_weight_scale = (
+                torch.stack(gemm1_scales_mxfp4_shuffled)
+                .reshape(
+                    self.num_experts,
+                    2 * self.intermediate_size_per_partition,
+                    self.hidden_size // sf_block_size,
+                )
+                .view(torch.float8_e4m3fn)
+            )
+
+            w2_weight = torch.stack(gemm2_weights_mxfp4_shuffled)
+            w2_weight_scale = (
+                torch.stack(gemm2_scales_mxfp4_shuffled)
+                .reshape(
+                    self.num_experts,
+                    self.hidden_size,
+                    self.intermediate_size_per_partition // sf_block_size,
+                )
+                .view(torch.float8_e4m3fn)
+            )
+
+            layer.w13_weight = Parameter(w13_weight, requires_grad=False)
+            layer.w13_weight_scale = Parameter(w13_weight_scale, requires_grad=False)
+            layer.w2_weight = Parameter(w2_weight, requires_grad=False)
+            layer.w2_weight_scale = Parameter(w2_weight_scale, requires_grad=False)
+            layer.w13_weight_bias = Parameter(
+                torch.stack(gemm1_bias_shuffled).reshape(self.num_experts, -1),
+                requires_grad=False,
+            )
+            layer.w2_weight_bias = Parameter(
+                torch.stack(gemm2_bias_shuffled).reshape(self.num_experts, -1),
+                requires_grad=False,
+            )
+            return
+        if _use_aiter:
+            if layer.w13_weight_bias is not None:
+                layer.w13_weight_bias.data = layer.w13_weight_bias.data.to(
+                    torch.float32
+                )
+            if layer.w2_weight_bias is not None:
+                layer.w2_weight_bias.data = layer.w2_weight_bias.data.to(torch.float32)
+
+            e, n, k = layer.w13_weight.shape
+            layer.w13_weight.view(torch.uint8).copy_(
+                layer.w13_weight.data.view(torch.uint8)
+                .view(e, n // 2, 2, k)
+                .permute(0, 2, 1, 3)
+                .contiguous()
+                .view(e, n, k)
+            )
+            layer.w13_weight_scale.data = (
+                layer.w13_weight_scale.data.view(e, n // 2, 2, -1)
+                .permute(0, 2, 1, 3)
+                .contiguous()
+                .view(e, n, -1)
+            )
+
+            layer.w13_weight.data = shuffle_weight_a16w4(layer.w13_weight, 16, True)
+            shuffled_w13_scale = shuffle_scale_a16w4(
+                layer.w13_weight_scale.view(-1, layer.w13_weight_scale.shape[-1]),
+                self.num_experts,
+                True,
+            )
+
+            layer.w2_weight.data = shuffle_weight_a16w4(layer.w2_weight, 16, False)
+            shuffled_w2_scale = shuffle_scale_a16w4(
+                layer.w2_weight_scale.view(-1, layer.w2_weight_scale.shape[-1]),
+                self.num_experts,
+                False,
+            )
+
+            layer.w13_weight_bias.data = (
+                layer.w13_weight_bias.data.view(-1, n // 2, 2)
+                .permute(0, 2, 1)
+                .contiguous()
+                .view(-1, n)
+            )
+
+            layer.w13_weight_scale = torch.nn.Parameter(
+                shuffled_w13_scale, requires_grad=False
+            )
+            layer.w2_weight_scale = torch.nn.Parameter(
+                shuffled_w2_scale, requires_grad=False
+            )
+
+            return
+
+        if self.use_triton_kernels:
+
+            from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig
+
+            w13_weight_bias = layer.w13_weight_bias.to(torch.float32)
+            w2_weight_bias = layer.w2_weight_bias.to(torch.float32)
+
+            layer.w13_weight_bias = Parameter(w13_weight_bias, requires_grad=False)
+            layer.w2_weight_bias = Parameter(w2_weight_bias, requires_grad=False)
+
+            num_warps = 8
+
+            w13_weight, w13_flex, w13_scale = _swizzle_mxfp4(
+                layer.w13_weight, layer.w13_weight_scale, num_warps
+            )
+            w2_weight, w2_flex, w2_scale = _swizzle_mxfp4(
+                layer.w2_weight, layer.w2_weight_scale, num_warps
+            )
+
+            self.w13_precision_config = PrecisionConfig(
+                weight_scale=w13_scale, flex_ctx=FlexCtx(rhs_data=w13_flex)
+            )
+            self.w2_precision_config = PrecisionConfig(
+                weight_scale=w2_scale, flex_ctx=FlexCtx(rhs_data=w2_flex)
+            )
+
+            self.w13_weight_triton_tensor = w13_weight
+            self.w2_weight_triton_tensor = w2_weight
+            del layer.w13_weight
+            del layer.w2_weight
+        else:
+            from triton_kernels.numerics_details.mxfp import upcast_from_mxfp
+
+            w13_weight = upcast_from_mxfp(
+                layer.w13_weight,
+                layer.w13_weight_scale,
+                target_dtype=torch.bfloat16,
+                axis=-1,
+            )
+            w2_weight = upcast_from_mxfp(
+                layer.w2_weight,
+                layer.w2_weight_scale,
+                target_dtype=torch.bfloat16,
+                axis=-1,
+            )
+            del layer.w13_weight
+            del layer.w2_weight
+            del layer.w13_weight_scale
+            del layer.w2_weight_scale
+            layer.w13_weight = Parameter(w13_weight.data, requires_grad=False)
+            layer.w2_weight = Parameter(w2_weight.data, requires_grad=False)
+        torch.cuda.empty_cache()
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+        backend = (
+            MoeRunnerBackend.TRITON_KERNELS
+            if self.use_triton_kernels
+            else MoeRunnerBackend.TRITON
+        )
+        self.runner = MoeRunner(backend, moe_runner_config)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+        from sglang.srt.layers.moe.topk import TopKOutputChecker
+
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+
+        if self.use_flashinfer:
+            # When bf16 mode is enabled, we don't need to quantize the input,
+            # TRT-LLM automatically handles quantization in the kernel implementation and pipelines it with GEMM operations,
+            # which can theoretically improve performance
+            origin_hidden_states_dim = x.shape[-1]
+            if self.flashinfer_mxfp4_moe_precision == "bf16":
+                assert x.dtype == torch.bfloat16
+                x_quant = x
+                x_scale = None
+
+                # May be fused later if this code branch is frequently needed
+                if self.hidden_size != origin_hidden_states_dim:
+                    x_quant = torch.nn.functional.pad(
+                        x_quant,
+                        (0, self.hidden_size - origin_hidden_states_dim),
+                        mode="constant",
+                        value=0.0,
+                    )
+            elif self.flashinfer_mxfp4_moe_precision == "default":
+                x_quant, x_scale = mxfp8_quantize(x, False, alignment=self.hidden_size)
+                x_scale = x_scale.view(torch.float8_e4m3fn).reshape(*x.shape[:-1], -1)
+            else:
+                raise NotImplementedError()
+
+            assert x_quant.shape[-1] == self.hidden_size
+            assert TopKOutputChecker.format_is_bypassed(topk_output)
+
+            top_k = topk_output.topk_config.top_k
+            router_logits = topk_output.router_logits
+
+            with use_symmetric_memory(
+                get_tp_group(), disabled=not is_allocation_symmetric()
+            ):
+                num_tokens = x_quant.shape[0]
+                hidden_size = origin_hidden_states_dim
+                symm_output = torch.empty(
+                    num_tokens, hidden_size, dtype=torch.bfloat16, device=x_quant.device
+                )
+            trtllm_gen_output = trtllm_fp4_block_scale_moe(
+                router_logits.to(torch.bfloat16),
+                None,  # routing_bias
+                x_quant,
+                x_scale,
+                layer.w13_weight,  # uint8 (e2m1 x 2)
+                layer.w13_weight_scale,  # uint8 (e4m3 x 2)
+                layer.w13_weight_bias,  # fp32 per expert per channel
+                layer.gemm1_alpha,  # fp32 per expert
+                layer.gemm1_beta,  # fp32 per expert
+                layer.gemm1_clamp_limit,  # fp32 per expert
+                layer.w2_weight,  # uint8 (e2m1 x 2)
+                layer.w2_weight_scale,  # ue8m0
+                layer.w2_weight_bias,  # fp32 per expert per channel
+                None,  # output1_scale_scalar
+                None,  # output1_scale_gate_scalar
+                None,  # output2_scale_scalar
+                layer.num_experts,
+                top_k,
+                None,  # n_group      # TODO: support n_group
+                None,  # topk_group   # TODO: support topk_group
+                self.intermediate_size_per_partition,  # padded to multiple of 256
+                layer.moe_ep_rank * layer.num_local_experts,  # local_expert_offset
+                layer.num_local_experts,  # local num experts
+                None,  # routed_scaling_factor
+                1,  # routing_method_type, renormalize
+                True,  # do finalize
+                tune_max_num_tokens=next_power_of_2(x_quant.shape[0]),
+                output=symm_output,
+            )[0]
+            return StandardCombineInput(hidden_states=trtllm_gen_output)
+        if _use_aiter:
+            topk_weights, topk_ids, _ = topk_output
+
+            if hasattr(torch, "float4_e2m1fn_x2"):
+                w13_weight = layer.w13_weight.view(torch.float4_e2m1fn_x2)
+                w2_weight = layer.w2_weight.view(torch.float4_e2m1fn_x2)
+            else:
+                w13_weight = layer.w13_weight
+                w2_weight = layer.w2_weight
+
+            origi_hidden_size = self.hidden_size - self.hidden_pad
+
+            x = torch.nn.functional.pad(
+                x,
+                (0, self.hidden_pad),
+                mode="constant",
+                value=0.0,
+            )
+
+            output = fused_moe(
+                x,
+                w13_weight,
+                w2_weight,
+                topk_weights,
+                topk_ids,
+                expert_mask=layer.expert_mask_gpu,
+                activation=ActivationType.Swiglu,
+                quant_type=QuantType.per_1x32,
+                w1_scale=layer.w13_weight_scale,
+                w2_scale=layer.w2_weight_scale,
+                doweight_stage1=self.moe_runner_config.apply_router_weight_on_input,
+                hidden_pad=self.hidden_pad,
+                intermediate_pad=self.intermediate_pad,
+                bias1=layer.w13_weight_bias,
+                bias2=layer.w2_weight_bias,
+            )
+            return StandardCombineInput(hidden_states=output)
+
+        backend = self.runner.runner_backend
+        if backend.is_triton_kernels():
+            from sglang.srt.layers.moe.moe_runner.triton_kernels import (
+                TritonKernelsQuantInfo,
+            )
+
+            assert (
+                layer.moe_ep_size == 1
+            ), "Expert parallel is not supported when using triton kernels"
+            quant_info = TritonKernelsQuantInfo(
+                w13_weight=(
+                    self.w13_weight_triton_tensor
+                    if self.w13_weight_triton_tensor is not None
+                    else layer.w13_weight
+                ),
+                w2_weight=(
+                    self.w2_weight_triton_tensor
+                    if self.w2_weight_triton_tensor is not None
+                    else layer.w2_weight
+                ),
+                w13_bias=getattr(layer, "w13_weight_bias", None),
+                w2_bias=getattr(layer, "w2_weight_bias", None),
+                w13_precision_config=getattr(self, "w13_precision_config", None),
+                w2_precision_config=getattr(self, "w2_precision_config", None),
+            )
+        else:
+            quant_info = TritonMoeQuantInfo(
+                w13_weight=layer.w13_weight,
+                w2_weight=layer.w2_weight,
+                b13=getattr(layer, "w13_weight_bias", None),
+                b2=getattr(layer, "w2_weight_bias", None),
+            )
+        return self.runner.run(dispatch_output, quant_info)
+
+
+class Mxfp4DynamicQuantMoEMethod(FusedMoEMethodBase):
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        w13_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        w2_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+
+        layer.register_parameter("w13_weight", w13_weight)
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+
+        layer.register_parameter("w2_weight", w2_weight)
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+
+        # Allocate 2 scales for w1 and w3 respectively.
+        # They will be combined to a single scale after weight loading.
+        w13_weight_scale = torch.nn.Parameter(
+            torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False
+        )
+        w2_weight_scale = torch.nn.Parameter(
+            torch.ones(num_experts, dtype=torch.float32), requires_grad=False
+        )
+        layer.register_parameter("w13_weight_scale", w13_weight_scale)
+        layer.register_parameter("w2_weight_scale", w2_weight_scale)
+
+        # Add the quantization method used (per tensor/grouped/channel)
+        # to ensure the weight scales are loaded in properly
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
+        )
+
+        layer.w13_input_scale = None
+        layer.w2_input_scale = None
+
+    def mxfp4_quantize(self, w):
+        w_shape = w.shape
+        w_need_reshape = True if w.dim() != 2 else False
+
+        if w_need_reshape:
+            w_last_dim_size = w_shape[-1]
+            w = w.view(-1, w_last_dim_size)
+
+        w, mx_scales = dynamic_mxfp4_quant(w)
+
+        if w_need_reshape:
+            w_new_shape = w_shape[:-1] + (w.shape[-1],)
+            w = w.view(w_new_shape)
+
+        mx_scales = e8m0_shuffle(mx_scales)
+
+        return w, mx_scales
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        w13, w13_mx_scales = self.mxfp4_quantize(layer.w13_weight.data)
+        w2, w2_mx_scales = self.mxfp4_quantize(layer.w2_weight.data)
+
+        # Pre-shuffle weight
+        is_shuffled = _is_shuffle_moe_mxfp4
+        if is_shuffled:
+            w13 = shuffle_weight(w13.contiguous(), (16, 16))
+            w2 = shuffle_weight(w2.contiguous(), (16, 16))
+
+        layer.w13_weight = torch.nn.Parameter(w13, requires_grad=False)
+        layer.w13_weight.is_shuffled = is_shuffled
+        layer.w13_weight_scale = torch.nn.Parameter(w13_mx_scales, requires_grad=False)
+
+        layer.w2_weight = torch.nn.Parameter(w2, requires_grad=False)
+        layer.w2_weight.is_shuffled = is_shuffled
+        layer.w2_weight_scale = torch.nn.Parameter(w2_mx_scales, requires_grad=False)
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+
+        topk_weights, topk_ids, _ = topk_output
+        if _is_hip:
+            topk_weights = topk_weights.to(
+                torch.float32
+            )  # aiter's moe_sorting requires topk_weights to be FP32
+
+        if hasattr(torch, "float4_e2m1fn_x2"):
+            w13_weight = layer.w13_weight.view(torch.float4_e2m1fn_x2)
+            w2_weight = layer.w2_weight.view(torch.float4_e2m1fn_x2)
+        else:
+            w13_weight = layer.w13_weight
+            w2_weight = layer.w2_weight
+
+        if hasattr(layer.w13_weight, "is_shuffled"):
+            w13_weight.is_shuffled = True
+            w2_weight.is_shuffled = True
+
+        output = fused_moe(
+            x,
+            w13_weight,
+            w2_weight,
+            topk_weights,
+            topk_ids,
+            quant_type=QuantType.per_1x32,
+            w1_scale=layer.w13_weight_scale,
+            w2_scale=layer.w2_weight_scale,
+            activation=(
+                ActivationType.Silu
+                if self.moe_runner_config.activation == "silu"
+                else ActivationType.Gelu
+            ),
+            doweight_stage1=False,
+            expert_mask=layer.expert_mask_gpu,
+        )
+        return StandardCombineInput(hidden_states=output)
diff --git a/sglang/python/sglang/srt/layers/quantization/mxfp4_tensor.py b/sglang/python/sglang/srt/layers/quantization/mxfp4_tensor.py
new file mode 100644
index 0000000000000000000000000000000000000000..76cb92c544f7f61b1dc3a92d6257468190b2f5a1
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/mxfp4_tensor.py
@@ -0,0 +1,135 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+import torch
+
+
+# https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/modelopt/torch/quantization/qtensor/mxfp4_tensor.py
+class MXFP4QuantizeUtil:
+    E2M1_max = 6.0
+
+    E2M1_values = [0, 0.5, 1, 1.5, 2, 3, 4, 6]
+    E2M1_bounds = torch.tensor([0.25, 0.75, 1.25, 1.75, 2.5, 3.5, 5])
+
+    @classmethod
+    def quantize(cls, input: torch.Tensor, block_size: Optional[int]) -> tuple:
+        """Converting a tensor to a quantized format based on MXFP4 quantization. Only E4M3 is supported.
+        Args:
+            input (torch.Tensor): The input tensor to be quantized.
+            block_sizes (dict | None): The block sizes for quantization.
+        """
+
+        def cast_fp4(x):
+            sign = torch.sign(x)
+            sign_bit = (2 - sign) // 2
+            ord_ = torch.sum(
+                (x.abs().unsqueeze(-1) - cls.E2M1_bounds.to(x.device)) > 0, dim=-1
+            )
+            fp4_val = (sign_bit * 0b1000 + ord_).to(torch.uint8)
+            return fp4_val
+
+        def fuse_uint4_to_uint8(x):
+            # If the last dimension is odd, pad with zeros
+            # If this behavior is not desired, please modify the code accordingly
+            left_side = x[..., 0::2]  # Even indices (0, 2, 4...)
+            right_side = x[..., 1::2]  # Odd indices (1, 3, 5...)
+            new_data = (
+                right_side.clone() << 4
+            )  # Put odd indices (higher addresses) in high bits
+            new_data[
+                ..., : left_side.shape[-1]
+            ] += left_side  # Put even indices in low bits
+            return new_data
+
+        if block_size is None:
+            block_size = 32
+
+        original_shape = input.shape
+        original_dtype = input.dtype
+        input = input.view(-1, block_size)
+        # get scales
+        input_amax = input.abs().max(dim=-1, keepdim=True).values
+        descale = input_amax / cls.E2M1_max
+        min_value = torch.tensor(-127.0, device=descale.device)
+        e8m0_scale = torch.ceil(torch.maximum(torch.log2(descale), min_value))
+
+        input = (input / torch.exp2(e8m0_scale)).view(original_shape)
+        input_q = cast_fp4(input)
+        input_q = fuse_uint4_to_uint8(input_q)
+        e8m0_scale = (e8m0_scale + 127).to(torch.uint8)
+        return cls(original_shape, original_dtype, input_q), e8m0_scale
+
+    @classmethod
+    def dequantize(cls, quantized_data, dtype: torch.dtype, scale, block_sizes):
+        """Dequantze MXFP4 packed tensor to a target dtype."""
+
+        def unfuse_uint8_to_uint4(x):
+            """Unfuse uint8 values back to uint4 values.
+            This is the inverse operation of fuse_uint4_to_uint8.
+            """
+            # Extract the lower 4 bits (even indices)
+            left_side = x & 0x0F
+
+            # Extract the upper 4 bits (odd indices)
+            right_side = (x >> 4) & 0x0F
+
+            # Create a new tensor with alternating values
+            shape = list(x.shape)
+            shape[-1] = shape[-1] * 2
+            result = torch.zeros(shape, dtype=torch.uint8, device=x.device)
+
+            # Fill in the values - even indices get low bits, odd indices get high bits
+            result[..., 0::2] = left_side  # Even indices from low bits
+            result[..., 1::2] = right_side  # Odd indices from high bits
+
+            return result
+
+        e8m0_scale = scale
+        block_size = block_sizes[-1]
+
+        # Unfuse the uint8 values back to uint4
+        x_unfused = unfuse_uint8_to_uint4(quantized_data)
+        # Extract sign and magnitude
+        sign = 1 - 2 * ((x_unfused & 0b1000) >> 3).to(
+            torch.float32
+        )  # Extract sign bit and convert to +1/-1
+        magnitude = x_unfused & 0b0111  # Extract magnitude bits
+        magnitude = magnitude.to(torch.long)
+
+        # Create a tensor with the E2M1 values
+        values = torch.tensor(cls.E2M1_values, device=quantized_data.device)
+
+        # Use gather to index the values tensor properly
+        # We need to reshape magnitude to match the dimensions we want to gather along
+        original_shape = magnitude.shape
+        x_float = values[magnitude.reshape(-1)].reshape(original_shape)
+
+        # Apply sign and scale
+        x_float = sign.float() * x_float
+
+        # Reshape to apply block-wise scaling
+        x_float = x_float.reshape(-1, block_size)
+
+        # Apply the E8M0 scale
+        scale_factor = torch.exp2(e8m0_scale.float() - 127)
+        scale_factor = scale_factor.reshape(-1, 1)  # Reshape for proper broadcasting
+
+        # Apply scaling and reshape back to original shape
+        x_float = x_float * scale_factor
+
+        # Reshape back to the original shape
+        return x_float.reshape(original_shape).to(dtype)
diff --git a/sglang/python/sglang/srt/layers/quantization/petit.py b/sglang/python/sglang/srt/layers/quantization/petit.py
new file mode 100644
index 0000000000000000000000000000000000000000..37b7fbc5497ee3a13ff387868168dbee3f3bb9ee
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/petit.py
@@ -0,0 +1,253 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/modelopt.py
+
+
+import logging
+from typing import Any, Dict, List, Optional
+
+import regex as re
+import torch
+from torch.nn.parameter import Parameter
+
+from sglang.srt.layers.linear import LinearBase
+from sglang.srt.layers.parameter import ModelWeightParameter, PerTensorScaleParameter
+from sglang.srt.layers.quantization.base_config import (
+    LinearMethodBase,
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.petit_utils import (
+    apply_petit_nvfp4_linear,
+    prepare_nvfp4_layer_for_petit,
+    verify_petit_nvfp4_supported,
+)
+from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
+from sglang.srt.layers.quantization.utils import is_layer_skipped
+from sglang.srt.utils import is_hip
+
+_is_hip = is_hip()
+
+# Initialize logger for the module
+logger = logging.getLogger(__name__)
+
+
+# Configuration class to support the NVFP4 quantized model generated by the ModelOpt quantization tool
+class PetitNvFp4Config(QuantizationConfig):
+    """Config class for Petit FP4."""
+
+    def __init__(
+        self,
+        is_checkpoint_nvfp4_serialized: bool = False,
+        kv_cache_quant_algo: str = None,
+        group_size: int = None,
+        exclude_modules: List[str] = None,
+    ) -> None:
+        self.is_checkpoint_nvfp4_serialized = is_checkpoint_nvfp4_serialized
+        if is_checkpoint_nvfp4_serialized:
+            logger.warning(
+                "Detected nvfp4 checkpoint. Please note that the "
+                "format is experimental and subject to change."
+            )
+        self.group_size = group_size
+        self.kv_cache_quant_algo = kv_cache_quant_algo
+        self.exclude_modules = exclude_modules
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "petit_nvfp4"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.bfloat16, torch.half]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        # Petit supports the gfx90a and gfx942 GPUs
+        return 90
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return ["hf_quant_config.json"]
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> "PetitNvFp4Config":
+        quant_config = cls.get_from_keys(config, ["quantization"])
+        quant_method = quant_config["quant_algo"]
+        group_size = quant_config.get("group_size", None)
+        verify_petit_nvfp4_supported(quant_method, group_size)
+
+        is_checkpoint_nvfp4_serialized = "NVFP4" in quant_method
+        kv_cache_quant_algo = quant_config["kv_cache_quant_algo"]
+        if not kv_cache_quant_algo:
+            kv_cache_quant_algo = "auto"
+        exclude_modules = quant_config.get("exclude_modules", None)
+        if not (group_size and kv_cache_quant_algo and (exclude_modules is not None)):
+            logger.warning(
+                f"group_size: {group_size},"
+                f"kv_cache_quant_algo: {kv_cache_quant_algo},"
+                f"exclude_modules: {exclude_modules}"
+            )
+            raise ValueError(
+                "NVFP4 quantization requires group size and "
+                "kv_cache_quant_algo specified in "
+                "hf_quant_config.json"
+            )
+        return cls(
+            is_checkpoint_nvfp4_serialized,
+            kv_cache_quant_algo,
+            group_size,
+            exclude_modules,
+        )
+
+    @classmethod
+    def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]:
+        can_convert = cls.is_petit_nvfp4_compatible(hf_quant_cfg)
+        if can_convert:
+            return cls.get_name()
+        return None
+
+    @classmethod
+    def is_petit_nvfp4_compatible(cls, quant_config: Dict[str, Any]) -> bool:
+        quant_method = quant_config.get("quant_method", "").lower()
+        return _is_hip and quant_method == "modelopt"
+
+    def is_layer_excluded(self, prefix: str, exclude_modules: list):
+        for pattern in exclude_modules:
+            regex_str = pattern.replace(".", r"\.").replace("*", r".*")
+            if re.fullmatch(regex_str, prefix):
+                return True
+        return False
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["QuantizeMethodBase"]:
+        if isinstance(layer, LinearBase):
+            if is_layer_skipped(prefix, self.exclude_modules) or self.is_layer_excluded(
+                prefix, self.exclude_modules
+            ):
+                return UnquantizedLinearMethod()
+            return PetitNvFp4LinearMethod(self)
+        return None
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+class PetitNvFp4LinearMethod(LinearMethodBase):
+    """Linear method for NVFP4.
+    Supports loading NVFP4 checkpoints with the following structure:
+
+    |Tensor Name           | datatype      |  shape      |
+    |----------------------------------------------------|
+    |input_scale           | torch.float32 | scalar      |
+    |weight                | NVFP4(SE2M1)  | [1, X, y/2] |
+    |weight_scale          | FP8-E4M3      | [X, Y]      |
+    |weight_scale_2        | torch.float32 | scalar      |
+
+    The weights are quantized per block of 16 elements.
+    Args: quant_config: The ModelOpt quantization config.
+    """
+
+    def __init__(self, quant_config: PetitNvFp4Config):
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        del input_size, output_size
+        if not self.quant_config.is_checkpoint_nvfp4_serialized:
+            raise ValueError(
+                "NVFP4 quantization was selected, "
+                " dynamic quantization is not supported."
+            )
+
+        output_size_per_partition = sum(output_partition_sizes)
+        weight_loader = extra_weight_attrs.get("weight_loader")
+
+        layer.logical_widths = output_partition_sizes
+
+        layer.input_size_per_partition = input_size_per_partition
+        layer.output_size_per_partition = output_size_per_partition
+        if input_size_per_partition % 16 != 0:
+            raise ValueError(
+                "Unsupported model when in features size is " "not multiple of 16"
+            )
+
+        weight_dtype = (
+            torch.float8_e4m3fn
+            if self.quant_config.is_checkpoint_nvfp4_serialized
+            else params_dtype
+        )
+
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                # 2 fp4 data is packed in one uint8 in the input dimension
+                output_size_per_partition,
+                input_size_per_partition // 2,
+                dtype=torch.uint8,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight", weight)
+
+        input_scale = PerTensorScaleParameter(
+            data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+
+        layer.register_parameter("input_scale", input_scale)
+
+        weight_scale_2 = PerTensorScaleParameter(
+            data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight_scale_2", weight_scale_2)
+
+        weight_scale = ModelWeightParameter(
+            data=torch.empty(
+                output_size_per_partition,
+                input_size_per_partition // self.quant_config.group_size,
+                dtype=weight_dtype,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+
+        layer.register_parameter("weight_scale", weight_scale)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        input_scale_2 = layer.input_scale.max().to(torch.float32)
+        weight_scale_2 = layer.weight_scale_2.max().to(torch.float32)
+        layer.input_scale = Parameter(input_scale_2, requires_grad=False)
+        layer.weight_scale_2 = Parameter(weight_scale_2, requires_grad=False)
+        layer.alpha = Parameter(
+            layer.input_scale * layer.weight_scale_2, requires_grad=False
+        )
+
+        prepare_nvfp4_layer_for_petit(layer)
+        del layer.input_scale
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        return apply_petit_nvfp4_linear(
+            input=x,
+            weight=layer.weight,
+            weight_scale=layer.weight_scale,
+            weight_scale_2=layer.weight_scale_2,
+            size_n=layer.output_size_per_partition,
+            size_k=layer.input_size_per_partition,
+            bias=bias,
+        )
diff --git a/sglang/python/sglang/srt/layers/quantization/petit_utils.py b/sglang/python/sglang/srt/layers/quantization/petit_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..529869f2413f31e6ab4fa96b36b3cc30a2e1c6c1
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/petit_utils.py
@@ -0,0 +1,104 @@
+from typing import Optional
+
+import torch
+
+try:
+    from petit_kernel import mul_nvfp4_a16, process_nvfp4_scales, repack_nvfp4
+except ImportError:
+
+    def _check_petit_nvfp4_supported(
+        quant_method: str, group_size: Optional[int]
+    ) -> tuple[bool, Optional[str]]:
+        return (
+            False,
+            "Petit is not installed. Please install it with `pip install petit-kernel`.",
+        )
+
+    def prepare_nvfp4_layer_for_petit(layer: torch.nn.Module) -> None:
+        raise ValueError(
+            "Petit is not installed. Please install it with `pip install petit-kernel`."
+        )
+
+    def apply_petit_nvfp4_linear(
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        weight_scale: torch.Tensor,
+        weight_scale_2: torch.Tensor,
+        size_n: int,
+        size_k: int,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        raise ValueError(
+            "Petit is not installed. Please install it with `pip install petit-kernel`."
+        )
+
+
+def _check_petit_nvfp4_supported(
+    quant_method: str, group_size: Optional[int]
+) -> tuple[bool, Optional[str]]:
+    if quant_method != "NVFP4":
+        return (
+            False,
+            "Petit currently only supports: NVFP4"
+            " quantizations in sglang. Please check the "
+            "`hf_quant_config.json` file for your model's "
+            "quant configuration.",
+        )
+    if group_size is not None and group_size != 16:
+        return (
+            False,
+            "Petit currently only supports: group_size=16" " quantizations.",
+        )
+    return (True, None)
+
+
+def verify_petit_nvfp4_supported(quant_method: str, group_size: Optional[int]) -> None:
+    supported, error_msg = _check_petit_nvfp4_supported(quant_method, group_size)
+    if not supported:
+        raise ValueError(error_msg)
+
+
+def prepare_nvfp4_layer_for_petit(layer: torch.nn.Module) -> None:
+    # Repack weights to petit format
+    part_size_n = layer.output_size_per_partition
+    part_size_k = layer.input_size_per_partition
+    qweight = layer.weight.view(torch.int32).contiguous()
+    petit_qweight = repack_nvfp4(qweight, size_n=part_size_n, size_k=part_size_k)
+    layer.weight = torch.nn.Parameter(petit_qweight, requires_grad=False)
+
+    # Permute scales
+    weight_scale = process_nvfp4_scales(
+        scales=layer.weight_scale, size_k=part_size_k, size_n=part_size_n
+    )
+    layer.weight_scale = torch.nn.Parameter(weight_scale, requires_grad=False)
+
+    return
+
+
+def apply_petit_nvfp4_linear(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    weight_scale_2: torch.Tensor,
+    size_n: int,
+    size_k: int,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    reshaped_x = input.reshape(-1, input.shape[-1])
+    out_shape = input.shape[:-1] + (size_n,)
+
+    # TODO: Use auto-tuning to find the performant solution_id
+    output = mul_nvfp4_a16(
+        a=reshaped_x,
+        b=weight,
+        s=weight_scale,
+        global_scale=weight_scale_2,
+        size_m=reshaped_x.size(0),
+        size_n=size_n,
+        size_k=size_k,
+        solution_id=-1,
+    )
+    if bias is not None:
+        output.add_(bias)  # In-place add
+
+    return output.reshape(out_shape)
diff --git a/sglang/python/sglang/srt/layers/quantization/qoq.py b/sglang/python/sglang/srt/layers/quantization/qoq.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec0fda482c4bd21408dca05454cb48788613daf4
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/qoq.py
@@ -0,0 +1,245 @@
+from __future__ import annotations
+
+from typing import Any, Dict, List, Optional
+
+import torch
+from torch.nn.parameter import Parameter
+
+from sglang.srt.layers.parameter import (
+    ChannelQuantScaleParameter,
+    GroupQuantScaleParameter,
+    ModelWeightParameter,
+)
+from sglang.srt.layers.quantization.base_config import (
+    LinearMethodBase,
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.int8_kernel import per_token_quant_int8
+from sglang.srt.utils import is_cuda
+
+_is_cuda = is_cuda()
+if _is_cuda:
+    from sgl_kernel import qserve_w4a8_per_chn_gemm, qserve_w4a8_per_group_gemm
+
+
+QoQ_SUPPORTED_WEIGHT_BITS = [4]
+QoQ_SUPPORTED_GROUP_SIZES = [-1, 128]
+
+
+class QoQConfig(QuantizationConfig):
+    """Config class for QoQ Quantization.
+
+    - Weight: static, per-channel/group, asymmetric
+    - Activation: dynamic, per-token, symmetric
+
+    Reference: https://arxiv.org/abs/2405.04532
+    https://github.com/mit-han-lab/omniserve
+    """
+
+    def __init__(self, weight_bits: int, group_size: int) -> None:
+        self.weight_bits = weight_bits
+        self.group_size = group_size
+
+        # Verify
+        if self.weight_bits not in QoQ_SUPPORTED_WEIGHT_BITS:
+            raise ValueError(
+                f"QoQ does not support weight_bits = {self.weight_bits}. "
+                f"Only weight_bits = {QoQ_SUPPORTED_WEIGHT_BITS} "
+                "are supported."
+            )
+        if self.group_size not in QoQ_SUPPORTED_GROUP_SIZES:
+            raise ValueError(
+                f"QoQ does not support group_size = {self.group_size}. "
+                f"Only group_sizes = {QoQ_SUPPORTED_GROUP_SIZES} "
+                "are supported."
+            )
+
+        # 4 bits packed into 8 bit datatype.
+        self.pack_factor = 8 // self.weight_bits
+
+    def __repr__(self) -> str:
+        return "QoQConfig(weight_bits={}, group_size={})".format(
+            self.weight_bits, self.group_size
+        )
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.float16]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 80
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "qoq"
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        """List of filenames to search for in the model directory."""
+        return [
+            "quant_config.json",
+            "quantize_config.json",
+        ]
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> QoQConfig:
+        weight_bits = cls.get_from_keys(config, ["wbits"])
+        group_size = cls.get_from_keys(config, ["group_size"])
+        return cls(weight_bits, group_size)
+
+    def get_quant_method(
+        self,
+        layer: torch.nn.Module,
+        prefix: str,
+    ) -> Optional[QuantizeMethodBase]:
+        from sglang.srt.layers.linear import LinearBase
+
+        if isinstance(layer, LinearBase):
+            return QoQLinearMethod(self)
+        return None
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+class QoQLinearMethod(LinearMethodBase):
+    """Linear method for QoQ.
+
+    Args:
+        quant_config: The QoQ quantization config.
+    """
+
+    def __init__(self, quant_config: QoQConfig):
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+
+        weight_loader = extra_weight_attrs.get("weight_loader")
+
+        # Validate output_size_per_partition
+        output_size_per_partition = sum(output_partition_sizes)
+        if output_size_per_partition % 32 != 0:
+            raise ValueError(
+                f"Weight output_size_per_partition = "
+                f"{output_size_per_partition} is not divisible by 32."
+            )
+
+        # Validate input_size_per_partition
+        if input_size_per_partition % self.quant_config.pack_factor != 0:
+            raise ValueError(
+                f"Weight input_size_per_partition = "
+                f"{input_size_per_partition} is not divisible by "
+                f"pack_factor = {self.quant_config.pack_factor}."
+            )
+        if (
+            self.quant_config.group_size != -1
+            and input_size_per_partition % self.quant_config.group_size != 0
+        ):
+            raise ValueError(
+                f"Weight input_size_per_partition = "
+                f"{input_size_per_partition} is not divisible by "
+                f"group_size = {self.quant_config.group_size}."
+            )
+
+        qweight = ModelWeightParameter(
+            data=torch.empty(
+                output_size_per_partition,
+                input_size_per_partition // self.quant_config.pack_factor,
+                dtype=torch.int8,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("qweight", qweight)
+
+        s1_scales = ChannelQuantScaleParameter(
+            data=torch.empty(output_size_per_partition, dtype=torch.float16),
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("s1_scales", s1_scales)
+
+        if self.quant_config.group_size == -1:
+            s1_szeros = ChannelQuantScaleParameter(
+                data=torch.empty(output_size_per_partition, dtype=torch.float16),
+                output_dim=0,
+                weight_loader=weight_loader,
+            )
+            layer.register_parameter("s1_szeros", s1_szeros)
+        else:
+            s2_scales = GroupQuantScaleParameter(
+                data=torch.empty(
+                    (
+                        input_size_per_partition // self.quant_config.group_size,
+                        output_size_per_partition,
+                    ),
+                    dtype=torch.int8,
+                ),
+                input_dim=0,
+                output_dim=1,
+                weight_loader=weight_loader,
+            )
+            layer.register_parameter("s2_scales", s2_scales)
+
+            s2_zeros = GroupQuantScaleParameter(
+                data=torch.empty(
+                    (
+                        input_size_per_partition // self.quant_config.group_size,
+                        output_size_per_partition,
+                    ),
+                    dtype=torch.int8,
+                ),
+                input_dim=0,
+                output_dim=1,
+                weight_loader=weight_loader,
+            )
+            layer.register_parameter("s2_zeros", s2_zeros)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        layer.qweight = Parameter(layer.qweight.data, requires_grad=False)
+        layer.s1_scales = Parameter(layer.s1_scales.data, requires_grad=False)
+        if self.quant_config.group_size == -1:
+            layer.s1_szeros = Parameter(layer.s1_szeros.data, requires_grad=False)
+        else:
+            layer.s2_scales = Parameter(layer.s2_scales.data, requires_grad=False)
+            layer.s2_zeros = Parameter(layer.s2_zeros.data, requires_grad=False)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ):
+        assert x.dtype == torch.float16, "QoQ only supports float16 input now"
+        if self.quant_config.group_size == -1:
+            x_q, x_scale, x_sum = per_token_quant_int8(
+                x, scale_dtype=x.dtype, cal_sum=True
+            )
+            out = qserve_w4a8_per_chn_gemm(
+                x_q, layer.qweight, layer.s1_scales, x_scale, layer.s1_szeros, x_sum
+            )
+        else:
+            x_q, x_scale = per_token_quant_int8(x, scale_dtype=x.dtype)
+            out = qserve_w4a8_per_group_gemm(
+                x_q,
+                layer.qweight,
+                layer.s2_zeros,
+                layer.s2_scales,
+                layer.s1_scales,
+                x_scale,
+            )
+        if bias is not None:
+            out = out + bias
+        return out
diff --git a/sglang/python/sglang/srt/layers/quantization/quark/__init__.py b/sglang/python/sglang/srt/layers/quantization/quark/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/sglang/python/sglang/srt/layers/quantization/quark/__pycache__/__init__.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/quark/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9449a53e01c22dbb18c321bb7688cc762cb79266
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/quark/__pycache__/__init__.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/quark/__pycache__/quark.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/quark/__pycache__/quark.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..30f7928f7e6f264ab2ab111099696908d6c57d26
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/quark/__pycache__/quark.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/quark/__pycache__/utils.cpython-311.pyc b/sglang/python/sglang/srt/layers/quantization/quark/__pycache__/utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8d680baab5e907b725316a09a83b03b3369a55e8
Binary files /dev/null and b/sglang/python/sglang/srt/layers/quantization/quark/__pycache__/utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/quantization/quark/quark.py b/sglang/python/sglang/srt/layers/quantization/quark/quark.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad3546aeb8bc490afeed553b887f42e4d520d147
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/quark/quark.py
@@ -0,0 +1,527 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import fnmatch
+import logging
+from typing import TYPE_CHECKING, Any, List, Optional, cast
+
+import torch
+
+from sglang.srt.layers.linear import LinearBase
+from sglang.srt.layers.moe import MoeRunnerConfig
+from sglang.srt.layers.quantization.base_config import (  # noqa: E501
+    FusedMoEMethodBase,
+    LinearMethodBase,
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.kv_cache import BaseKVCacheMethod
+from sglang.srt.layers.quantization.quark.schemes import (
+    QuarkLinearScheme,
+    QuarkMoEScheme,
+    QuarkW4A4MXFP4,
+    QuarkW4A4MXFp4MoE,
+    QuarkW8A8Fp8,
+    QuarkW8A8FP8MoE,
+)
+from sglang.srt.layers.quantization.quark.utils import deep_compare, should_ignore_layer
+from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.utils import get_device_capability
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.token_dispatcher import StandardDispatchOutput
+
+__all__ = ["QuarkLinearMethod", "QuarkFusedMoEMethod"]
+
+logger = logging.getLogger(__name__)
+
+
+class QuarkConfig(QuantizationConfig):
+
+    def __init__(
+        self,
+        quant_config: dict[str, Any],
+        kv_cache_group: Optional[list[str]] = None,
+        kv_cache_config: Optional[dict[str, Any]] = None,
+        pack_method: str = "reorder",
+    ):
+        super().__init__()
+        if kv_cache_group is None:
+            kv_cache_group = []
+        self.quant_config = quant_config
+        self.kv_cache_group = kv_cache_group
+        self.kv_cache_config = kv_cache_config
+        self.pack_method = pack_method
+        self.exclude_layers = cast(list[str], self.quant_config.get("exclude", []))
+
+        self.packed_modules_mapping = self.quant_config["packed_modules_mapping"]
+
+    def get_linear_method(self) -> "QuarkLinearMethod":
+        return QuarkLinearMethod(self)
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
+        return [torch.float16, torch.bfloat16]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 70
+
+    def get_name(self) -> str:
+        return "quark"
+
+    def apply_weight_name_mapper(self, hf_to_sglang_mapper):
+        self.exclude_layers = hf_to_sglang_mapper.apply_list(self.exclude_layers)
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["QuantizeMethodBase"]:
+        # Check if the layer is skipped for quantization.
+        if should_ignore_layer(
+            prefix,
+            ignore=self.exclude_layers,
+            fused_mapping=self.packed_modules_mapping,
+        ):
+            if isinstance(layer, LinearBase):
+                return UnquantizedLinearMethod()
+            elif isinstance(layer, RadixAttention):
+                return QuarkKVCacheMethod(self)
+            return None
+
+        if isinstance(layer, LinearBase):
+            scheme = self.get_linear_scheme(layer=layer, layer_name=prefix)
+            layer.scheme = scheme
+            return QuarkLinearMethod(self)
+
+        if isinstance(layer, RadixAttention):
+            return QuarkKVCacheMethod(self)
+
+        from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+
+        if isinstance(layer, FusedMoE):
+            layer.scheme = self.get_moe_scheme(layer, prefix)
+            return QuarkFusedMoEMethod(self)
+
+        return None
+
+    @classmethod
+    def from_config(cls, config: dict[str, Any]) -> "QuarkConfig":
+        export_config = config.get("export")
+        if export_config is None:
+            raise ValueError(
+                "The export key should be included in "
+                "the configurations of Quark quantized model"
+            )
+
+        kv_cache_group = cast(list[str], export_config.get("kv_cache_group"))
+        pack_method = cast(str, export_config.get("pack_method"))
+
+        # In the export model of quark, the quantization configuration
+        # of kv_cache is stored in layer_quant_config. First, it is
+        # judged whether kv_cache_group exists, and then it is judged
+        # whether layer_quant_config has a quantization configuration
+        # that matches kv_cache.
+        if len(kv_cache_group) == 0:
+            kv_cache_config = None
+        else:
+            kv_cache_set = set(kv_cache_group)
+            layer_quant_config = cast(dict[str, Any], config.get("layer_quant_config"))
+            layer_quant_names = list(layer_quant_config.keys())
+            layer_quant_set = set(layer_quant_names)
+
+            if not kv_cache_set.issubset(layer_quant_set):
+                raise ValueError(
+                    "The Quark quantized model has the "
+                    "kv_cache_group parameter setting, "
+                    "but no kv_cache quantization settings "
+                    "were found in the quantization "
+                    "configuration."
+                )
+
+            q_configs = [
+                cast(dict[str, Any], layer_quant_config.get(name))
+                for name in kv_cache_group
+            ]
+            if not all(deep_compare(q_config, q_configs[0]) for q_config in q_configs):
+                raise ValueError(
+                    "The quantization method used for kv_cache should "
+                    "be the same, but the quantization method for the "
+                    "kv_cache layer in the config is different."
+                )
+            kv_cache_config = q_configs[0].get("output_tensors")
+            if kv_cache_config is None:
+                raise ValueError("The kv_cache quantization configuration is empty.")
+
+            # Since we have already set kv_cache quantization configurations,
+            # we will remove the quantization configuration for the
+            # output_tensors corresponding to the kv_cache layer.
+            for q_config in q_configs:
+                q_config["output_tensors"] = None
+
+            # In case q_proj output is also quantized, remove the configuration
+            # to keep qkv consistency.
+            q_proj_q_config = cast(dict[str, Any], layer_quant_config.get("*q_proj"))
+            if q_proj_q_config is not None:
+                q_proj_q_config["output_tensors"] = None
+
+        return cls(
+            quant_config=config,
+            kv_cache_group=kv_cache_group,
+            kv_cache_config=kv_cache_config,
+            pack_method=pack_method,
+        )
+
+    @classmethod
+    def get_config_filenames(cls) -> list[str]:
+        return []
+
+    def _check_scheme_supported(self, min_capability: int, error: bool = True) -> bool:
+        capability_tuple = get_device_capability()
+
+        if capability_tuple is not None:
+            assert 0 <= capability_tuple[1] < 10
+            capability = capability_tuple[0] * 10 + capability_tuple[1]
+
+            supported = capability >= min_capability
+            if error and not supported:
+                raise RuntimeError(
+                    "Quantization scheme is not supported for ",
+                    f"the current GPU. Min capability: {min_capability}. ",
+                    f"Current capability: {capability}.",
+                )
+            return supported
+        else:
+            return False
+
+    def _is_fp8_w8a8(
+        self,
+        weight_quant: Optional[dict[str, Any]],
+        input_quant: Optional[dict[str, Any]],
+    ) -> bool:
+        # Confirm weights and input quantized.
+        if weight_quant is None or input_quant is None:
+            return False
+
+        # Confirm weight scheme is supported
+        is_fp8_dtype = (
+            weight_quant.get("dtype") == "fp8_e4m3"
+            and input_quant.get("dtype") == "fp8_e4m3"
+        )
+        is_static_weight = not weight_quant.get("is_dynamic")
+        is_per_tensor_or_channel_weight = weight_quant.get("qscheme") in [
+            "per_tensor",
+            "per_channel",
+        ]
+
+        if not (is_fp8_dtype and is_static_weight and is_per_tensor_or_channel_weight):
+            return False
+
+        # Dynamic quantization is always supported if weights supported.
+        if input_quant.get("is_dynamic"):
+            return True
+
+        # Confirm activation scheme is supported.
+        is_per_tensor_activation = input_quant.get("qscheme") == "per_tensor"
+        return is_per_tensor_activation
+
+    def _is_mx_fp4(
+        self,
+        weight_quant: Optional[dict[str, Any]],
+        input_quant: Optional[dict[str, Any]],
+    ) -> bool:
+        # Confirm weights and input quantized.
+        if weight_quant is None or input_quant is None:
+            logger.debug(
+                "Quark model is not in MX-FP4 format: "
+                "weight_quant or input_quant not set"
+            )
+            return False
+
+        # Input and weight dtype needs to be fp4.
+        if weight_quant.get("dtype") != "fp4" or input_quant.get("dtype") != "fp4":
+            logger.debug("Quark model is not in MX-FP4 format: dtype not fp4")
+            return False
+
+        # Input and weight qscheme needs to be per group.
+        if (
+            weight_quant.get("qscheme") != "per_group"
+            or input_quant.get("qscheme") != "per_group"
+        ):
+            logger.debug("Quark model is not in MX-FP4 format: not per_group")
+            return False
+
+        # Input and weight group size needs to be 32.
+        if weight_quant.get("group_size") != 32 or input_quant.get("group_size") != 32:
+            logger.debug("Quark model is not in MX-FP4 format: not group_size=32")
+            return False
+
+        # Weights need to use static quantization.
+        if weight_quant.get("is_dynamic") is True:
+            logger.debug("Quark model is not in MX-FP4 format: not weight static")
+            return False
+
+        # Activations need to use dynamic quantization.
+        if input_quant.get("is_dynamic") is False:
+            logger.debug("Quark model is not in MX-FP4 format: not activation dynamic")
+            return False
+
+        # Activations and weight scales need to be in e8m0 format.
+        if (
+            weight_quant.get("scale_format") != "e8m0"
+            or input_quant.get("scale_format") != "e8m0"
+        ):
+            logger.debug("Quark model is not in MX-FP4 format: not scale_format e8m0")
+            return False
+
+        return True
+
+    def _find_matched_config(
+        self, layer_name: str, module: torch.nn.Module
+    ) -> dict[str, Any]:
+
+        proj_name = layer_name.split(".")[-1]
+        if proj_name in self.packed_modules_mapping:
+            shard_proj_names = self.packed_modules_mapping[proj_name]
+
+            # Convert fused_name --> [shard_names]
+            shard_names = [
+                layer_name.replace(proj_name, shard_proj_name)
+                for shard_proj_name in shard_proj_names
+            ]
+            shard_configs = [
+                self._find_matched_config(shard_name, module)
+                for shard_name in shard_names
+            ]
+            if not all(
+                deep_compare(q_config, shard_configs[0]) for q_config in shard_configs
+            ):
+                raise ValueError(
+                    f"Found a different quantization configuration for "
+                    f"{shard_proj_names} in {layer_name}. vLLM "
+                    "requires all to use the same scheme."
+                )
+            return shard_configs[0]
+        else:
+            layer_quant_config = cast(
+                dict[str, Any], self.quant_config.get("layer_quant_config")
+            )
+            for name_pattern in layer_quant_config:
+                if fnmatch.fnmatch(layer_name, name_pattern):
+                    return layer_quant_config[name_pattern]
+
+            layer_type = type(module).__name__
+            layer_type_quant_config = cast(
+                dict[str, Any], self.quant_config.get("layer_type_quant_config")
+            )
+            if layer_type in layer_type_quant_config:
+                return layer_type_quant_config[layer_type]
+
+            global_quant_config = cast(
+                dict[str, Any], self.quant_config.get("global_quant_config")
+            )
+            return global_quant_config
+
+    def _get_scheme_from_config(self, config: dict[str, Any]) -> "QuarkLinearScheme":
+        if config.get("output_tensors") or config.get("bias"):
+            raise NotImplementedError(
+                "Currently, Quark models with output_tensors "
+                "and bias quantized are not supported"
+            )
+        weight_config = cast(dict[str, Any], config.get("weight"))
+        input_config = cast(dict[str, Any], config.get("input_tensors"))
+
+        if self._is_mx_fp4(weight_config, input_config):
+            return QuarkW4A4MXFP4(weight_config, input_config)
+        if self._is_fp8_w8a8(weight_config, input_config):
+            is_fp8_w8a8_supported = self._check_scheme_supported(
+                QuarkW8A8Fp8.get_min_capability(), error=False
+            )
+            if is_fp8_w8a8_supported:
+                return QuarkW8A8Fp8(weight_config, input_config)
+
+        raise NotImplementedError(
+            "No quark compatible scheme was found. "
+            f"Weight config: {weight_config}, "
+            f"Input config: {input_config}"
+        )
+
+    def get_linear_scheme(
+        self, layer: torch.nn.Module, layer_name: str
+    ) -> "QuarkLinearScheme":
+
+        layer_quant_config = self._find_matched_config(layer_name, layer)
+
+        # Find the quant_scheme
+        scheme = self._get_scheme_from_config(layer_quant_config)
+
+        # Raise error if device does not support the scheme
+        # (e.g. fp8 needs ada lovelace)
+        self._check_scheme_supported(scheme.get_min_capability())
+
+        return scheme
+
+    def get_moe_scheme(
+        self,
+        module: torch.nn.Module,
+        layer_name: str,
+    ) -> "QuarkMoEScheme":
+        layer_quant_config = self._find_matched_config(layer_name, module)
+
+        if layer_quant_config.get("output_tensors") or layer_quant_config.get("bias"):
+            raise NotImplementedError(
+                "Currently, Quark models with "
+                "output_tensors and bias "
+                "quantized are not supported"
+            )
+        weight_config = layer_quant_config.get("weight")
+        input_config = layer_quant_config.get("input_tensors")
+
+        if self._is_mx_fp4(weight_config, input_config):
+            return QuarkW4A4MXFp4MoE(weight_config, input_config)
+        elif self._is_fp8_w8a8(weight_config, input_config):
+            return QuarkW8A8FP8MoE(weight_config, input_config)
+        else:
+            raise RuntimeError("Unsupported FusedMoe scheme")
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+class QuarkLinearMethod(LinearMethodBase):
+
+    def __init__(self, quantization_config: QuarkConfig):
+        self.quantization_config = quantization_config
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        layer.scheme.process_weights_after_loading(layer)
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        """
+        Use the QuarkLinearScheme associated with the layer to create
+        the necessary parameters for the layer. See LinearMethodBase for param
+        details
+        """
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        layer.scheme.create_weights(
+            layer=layer,
+            input_size=input_size,
+            input_size_per_partition=input_size_per_partition,
+            output_partition_sizes=output_partition_sizes,
+            output_size=output_size,
+            params_dtype=params_dtype,
+            weight_loader=weight_loader,
+        )
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ):
+        """
+        Use the output of create_weights and the QuarkLinearScheme
+        associated with the layer to apply the forward pass with the
+        layer input.  See LinearMethodBase for param details
+
+        """
+        scheme = layer.scheme
+        if scheme is None:
+            raise ValueError("A scheme must be defined for each layer")
+        return scheme.apply_weights(layer, x, bias=bias)
+
+
+class QuarkFusedMoEMethod(FusedMoEMethodBase):
+
+    def __init__(self, quantization_config: QuarkConfig):
+        self.quantization_config = quantization_config
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        layer.scheme.process_weights_after_loading(layer)
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        """
+        Use the QuarkMoEScheme associated with the layer to create
+        the necessary parameters for the layer. See FusedMoEMethodBase for param
+        details
+        """
+        layer.scheme.create_weights(
+            layer=layer,
+            num_experts=num_experts,
+            hidden_size=hidden_size,
+            intermediate_size_per_partition=intermediate_size_per_partition,
+            params_dtype=params_dtype,
+            **extra_weight_attrs,
+        )
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        layer.scheme.create_moe_runner(layer, moe_runner_config)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: "StandardDispatchOutput",
+    ):
+        """
+        Use the output of create_weights and the QuarkMoEScheme
+        associated with the layer to apply the forward pass with the
+        fused MoE layer. See FusedMoEMethodBase for param details
+
+        """
+        scheme = layer.scheme
+        if scheme is None:
+            raise ValueError("A scheme must be defined for each layer")
+        return scheme.apply_weights(layer, dispatch_output)
+
+
+class QuarkKVCacheMethod(BaseKVCacheMethod):
+    """
+    Supports loading kv-cache scaling factors from quark checkpoints.
+    """
+
+    def __init__(self, quant_config: QuarkConfig):
+        self.validate_kv_cache_config(quant_config.kv_cache_config)
+        super().__init__(quant_config)
+
+    @staticmethod
+    def validate_kv_cache_config(kv_cache_config: Optional[dict[str, Any]]):
+        """
+        Validator for the kv cache configuration. Useful for controlling the
+        kv cache quantization schemes, that are being supported in vLLM
+        :param kv_cache_config: the quark kv cache scheme
+        """
+        if kv_cache_config is None:
+            return
+
+        dtype = kv_cache_config.get("dtype")
+        if dtype != "fp8_e4m3":
+            raise NotImplementedError(
+                "Currently supported kv cache quantization is "
+                f"dtype=fp8_e4m3, however received {dtype}"
+            )
+
+        qscheme = kv_cache_config.get("qscheme")
+        if qscheme != "per_tensor":
+            raise NotImplementedError(
+                "Only support per-tensor scaling factor "
+                "for quark KV cache. "
+                f"Expected qscheme: per_tensor, found qscheme: {qscheme}"
+            )
diff --git a/sglang/python/sglang/srt/layers/quantization/quark/schemes/__init__.py b/sglang/python/sglang/srt/layers/quantization/quark/schemes/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ce9e0c299d9c7115e1d50731da3251f6c006424
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/quark/schemes/__init__.py
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from .quark_scheme import QuarkLinearScheme, QuarkMoEScheme
+from .quark_w4a4_mxfp4 import QuarkW4A4MXFP4
+from .quark_w4a4_mxfp4_moe import QuarkW4A4MXFp4MoE
+from .quark_w8a8_fp8 import QuarkW8A8Fp8
+from .quark_w8a8_fp8_moe import QuarkW8A8FP8MoE
+
+__all__ = [
+    "QuarkLinearScheme",
+    "QuarkMoEScheme",
+    "QuarkW4A4MXFP4",
+    "QuarkW8A8Fp8",
+    "QuarkW4A4MXFp4MoE",
+    "QuarkW8A8FP8MoE",
+]
diff --git a/sglang/python/sglang/srt/layers/quantization/quark/schemes/quark_scheme.py b/sglang/python/sglang/srt/layers/quantization/quark/schemes/quark_scheme.py
new file mode 100644
index 0000000000000000000000000000000000000000..556de3152dabc63450bb197e96b02b00c3bb0214
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/quark/schemes/quark_scheme.py
@@ -0,0 +1,116 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from abc import abstractmethod
+from typing import TYPE_CHECKING, Optional
+
+import torch
+
+from sglang.srt.layers.moe import MoeRunnerConfig
+from sglang.srt.layers.quantization.base_scheme import BaseLinearScheme, BaseMoEScheme
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.token_dispatcher import StandardDispatchOutput
+
+__all__ = ["QuarkLinearScheme", "QuarkMoEScheme"]
+
+
+class QuarkLinearScheme(BaseLinearScheme):
+    """
+    Abstract class used to describe the weight creation and forward pass
+    of different quantization schemes supported by Quark.
+    """
+
+    @classmethod
+    @abstractmethod
+    def get_min_capability(cls) -> int:
+        """
+        Get minimum device capability.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def create_weights(self, *args, **kwargs):
+        """
+        Weight creation for the particular scheme. Inputs to this function
+
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def process_weights_after_loading(self, layer: torch.nn.Module):
+        """
+        Called after weight loading is complete for any cleanup that
+        needs to occur.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def apply_weights(
+        self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor]
+    ):
+        """
+        Run the forward pass for the particular scheme. This is where
+        scheme-specific dequant/quant steps/kernels should be applied.
+
+        :param layer: torch.nn.Module with the registered weights and
+            other parameters relevant to the particular scheme.
+        :param x: input to the layer
+        :param bias: bias parameter
+
+        """
+        raise NotImplementedError
+
+
+class QuarkMoEScheme(BaseMoEScheme):
+    """
+    Abstract class used to describe the weight creation and forward pass
+    of different quantization schemes supported by Quark.
+    """
+
+    @classmethod
+    @abstractmethod
+    def get_min_capability(cls) -> int:
+        """
+        Get minimum device capability.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def create_weights(self, *args, **kwargs):
+        """
+        Weight creation for the particular scheme. Inputs to this function
+
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        raise NotImplementedError
+
+    @abstractmethod
+    def process_weights_after_loading(self, layer: torch.nn.Module):
+        """
+        Called after weight loading is complete for any cleanup that
+        needs to occur.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: "StandardDispatchOutput",
+    ):
+        """
+        Run the forward pass for the particular scheme. This is where
+        scheme-specific dequant/quant steps/kernels should be applied.
+
+        :param layer: torch.nn.Module with the registered weights and
+            other parameters relevant to the particular scheme.
+        :param x: input to the layer
+        :param bias: bias parameter
+
+        """
+        raise NotImplementedError
diff --git a/sglang/python/sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py b/sglang/python/sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py
new file mode 100644
index 0000000000000000000000000000000000000000..04f7cd246ac08840a1a390b541dcbb916422ac82
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py
@@ -0,0 +1,160 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Optional
+
+import torch
+
+from sglang.srt.layers.parameter import GroupQuantScaleParameter, PackedvLLMParameter
+from sglang.srt.layers.quantization.quark.schemes import QuarkLinearScheme
+from sglang.srt.utils import is_hip
+
+_is_hip = is_hip()
+if _is_hip:
+    from aiter.ops.triton.gemm.fused.fused_gemm_afp4wfp4_split_cat import (
+        fused_gemm_afp4wfp4_split_cat,
+    )
+    from aiter.ops.triton.gemm_afp4wfp4 import gemm_afp4wfp4
+    from aiter.ops.triton.gemm_afp4wfp4_pre_quant_atomic import gemm_afp4wfp4_pre_quant
+    from aiter.ops.triton.quant import dynamic_mxfp4_quant
+
+
+__all__ = ["QuarkW4A4MXFP4"]
+
+OCP_MX_BLOCK_SIZE = 32
+
+
+class QuarkW4A4MXFP4(QuarkLinearScheme):
+
+    def __init__(
+        self, weight_quant_spec: dict[str, Any], input_quant_spec: dict[str, Any]
+    ):
+        self.out_dtype = torch.get_default_dtype()
+        self.qscheme = "per_group"
+        self.weight_quant_spec = weight_quant_spec
+        self.input_quant_spec = input_quant_spec
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 70
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        return
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        output_partition_sizes: list[int],
+        input_size_per_partition: int,
+        params_dtype: torch.dtype,
+        weight_loader: Callable,
+        **kwargs
+    ):
+        output_size_per_partition = sum(output_partition_sizes)
+        layer.logical_widths = output_partition_sizes
+
+        # WEIGHT
+        weight = PackedvLLMParameter(
+            data=torch.empty(
+                output_size_per_partition,
+                input_size_per_partition // 2,
+                dtype=torch.uint8,
+            ),
+            input_dim=1,
+            output_dim=0,
+            packed_dim=1,
+            packed_factor=2,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight", weight)
+
+        # WEIGHT SCALE
+        weight_scale = GroupQuantScaleParameter(
+            data=torch.empty(
+                output_size_per_partition,
+                input_size_per_partition // OCP_MX_BLOCK_SIZE,
+                dtype=torch.uint8,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight_scale", weight_scale)
+
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        # This path does not have support for bias currently
+        assert bias is None, "bias is not supported"
+
+        three_d = False
+        fused_gemm_split_cat = False
+        x_s = None
+        y = None
+
+        if isinstance(x, tuple):
+            assert len(x) in [
+                2,
+                3,
+                5,
+            ], "For tuple input, only (x, x_s), (x, x_s, y), or (x, y, S1, S2, out_dtype) formats are accepted"
+            if len(x) == 2:
+                x, x_s = x
+            elif len(x) == 3:
+                x, x_s, y = x
+            elif len(x) == 5:
+                x, y, S1, S2, out_dtype = x
+                fused_gemm_split_cat = True
+
+        use_fused_quant_gemm = (
+            not fused_gemm_split_cat
+            and x_s is None
+            and y is not None
+            and layer.weight.shape[0] == y.shape[1]
+        )
+
+        if x.dim() == 3:
+            three_d = True
+            x = x.view(-1, x.shape[-1])
+            output_shape = [*x.shape[:-1], layer.weight.shape[0]]
+
+        # use_fused_quant_gemm = true, x_q is a bf16/fp16 num
+        # x_s is not None = true, x_q is uint8 num
+        if use_fused_quant_gemm or x_s is not None:
+            x_q = x
+        else:
+            x_q, x_s = dynamic_mxfp4_quant(x)
+
+        if y is None:
+            y = torch.empty(
+                x_q.shape[0],
+                layer.weight.shape[0],
+                device=x_q.device,
+                dtype=self.out_dtype,
+            )
+
+        if use_fused_quant_gemm:
+            gemm_afp4wfp4_pre_quant(x_q, layer.weight, layer.weight_scale, y.dtype, y)
+            y = y.to(x.dtype)
+        elif fused_gemm_split_cat:
+            k, v = fused_gemm_afp4wfp4_split_cat(
+                x=x_q,
+                w=layer.weight,
+                y=y,
+                x_scale=x_s,
+                w_scale=layer.weight_scale,
+                S1=S1,
+                S2=S2,
+                dtype=out_dtype,
+            )
+        else:
+            gemm_afp4wfp4(x_q, layer.weight, x_s, layer.weight_scale, self.out_dtype, y)
+
+        if fused_gemm_split_cat:
+            return k, v
+        elif three_d:
+            return y.view(*output_shape)
+        else:
+            return y
diff --git a/sglang/python/sglang/srt/layers/quantization/quark/utils.py b/sglang/python/sglang/srt/layers/quantization/quark/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3a9cd6e6a1fd60720591dd2e9dc35058753d644
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/quark/utils.py
@@ -0,0 +1,214 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import re
+from collections.abc import Iterable, Mapping
+from types import MappingProxyType
+from typing import Any, Optional
+
+import torch
+
+try:
+    from aiter.ops.triton.quant import dynamic_mxfp4_quant
+except ImportError as err:
+
+    def raise_aiter_import_error(*args, **kwargs):
+        raise ImportError(
+            "Failed to import aiter. " "Make sure AITER is installed and accessible."
+        )
+
+    dynamic_mxfp4_quant = raise_aiter_import_error
+from torch import nn
+
+
+def deep_compare(dict1: Any, dict2: Any) -> bool:
+    if type(dict1) is not type(dict2):
+        return False
+    if isinstance(dict1, dict):
+        if dict1.keys() != dict2.keys():
+            return False
+        return all(deep_compare(dict1[k], dict2[k]) for k in dict1)
+    elif isinstance(dict1, list):
+        return set(dict1) == set(dict2)
+    else:
+        return dict1 == dict2
+
+
+def should_ignore_layer(
+    layer_name: Optional[str],
+    ignore: Iterable[str],
+    fused_mapping: Mapping[str, list[str]] = MappingProxyType({}),
+) -> bool:
+    if layer_name is None:
+        return False
+
+    # layer_name = model.layers.0.self_attn.qkv_proj
+    # proj_name = qkv_proj
+    proj_name = layer_name.split(".")[-1]
+
+    # Fused layers like gate_up_proj or qkv_proj will not be fused
+    # in the safetensors checkpoint. So, we convert the name
+    # from the fused version to unfused + check to make sure that
+    # each shard of the fused layer has the same scheme.
+    if proj_name in fused_mapping:
+        shard_proj_names = fused_mapping[proj_name]
+
+        # Convert fused_name --> [shard_names]
+        shard_names = [
+            layer_name.replace(proj_name, shard_proj_name)
+            for shard_proj_name in shard_proj_names
+        ]
+
+        # Layer should be ignored if shards are ignored.
+        should_ignore_layer = None
+        for shard_name in shard_names:
+            should_ignore_shard = check_equal_or_regex_match(
+                layer_name=shard_name, targets=ignore
+            )
+
+            # If shard_idx=0, set layer ignore to match shard.
+            if should_ignore_layer is None:
+                should_ignore_layer = should_ignore_shard
+
+            # If shard_idx=1+ confirm scheme matches prior shards.
+            elif should_ignore_shard != should_ignore_layer:
+                raise ValueError(
+                    f"Found a different quantization schemes for "
+                    f"{shard_proj_names} in {layer_name}. vLLM "
+                    "requires all to use the same scheme."
+                )
+
+    # Unfused layers like down_proj and o_proj will match
+    # the safetensors checkpoint already.
+    else:
+        should_ignore_layer = check_equal_or_regex_match(
+            layer_name=layer_name, targets=ignore
+        )
+
+    assert should_ignore_layer is not None
+
+    return should_ignore_layer
+
+
+def check_equal_or_regex_match(layer_name: str, targets: Iterable[str]) -> bool:
+    """
+    Checks whether a layer_name is exactly equal or a regex match for
+    if target starts with 're:' to any target in list.
+    """
+    for target in targets:
+        if _is_equal_or_regex_match(layer_name, target):
+            return True
+    return False
+
+
+def _is_equal_or_regex_match(
+    value: str, target: str, check_contains: bool = False
+) -> bool:
+    """
+    Checks whether a value is exactly equal or a regex match for target
+    if target starts with 're:'. If check_contains is set to True,
+    additionally checks if the target string is contained within the value.
+    """
+
+    if target.startswith("re:"):
+        pattern = target[3:]
+        if re.match(pattern, value):
+            return True
+    elif check_contains:
+        if target.lower() in value.lower():
+            return True
+    elif target == value:
+        return True
+    return False
+
+
+# utility for tensor dims > 2 cases
+def b_dynamic_mxfp4_quant(x):
+    h, b, d = x.shape
+    x, x_scales = dynamic_mxfp4_quant(x.reshape(-1, d))
+    return x.view(h, b, d // 2), x_scales.view(h, b, d // 32)
+
+
+def mxfp4_to_f32(x, is_3d):
+    # 2 because we pack fp4 in uint8.
+    x = x.repeat_interleave(2, dim=-1)
+    if is_3d:
+        x[..., ::2] = x[..., ::2] & 0xF
+        x[..., 1::2] = x[..., 1::2] >> 4
+    else:
+        x[:, ::2] = x[:, ::2] & 0xF
+        x[:, 1::2] = x[:, 1::2] >> 4
+
+    mxfp4_list = [
+        0.0,
+        0.5,
+        1.0,
+        1.5,
+        2.0,
+        3.0,
+        4.0,
+        6.0,
+        -0.0,
+        -0.5,
+        -1.0,
+        -1.5,
+        -2.0,
+        -3.0,
+        -4.0,
+        -6.0,
+    ]
+    mxfp4_in_f32 = torch.tensor(mxfp4_list, dtype=torch.float32, device="cuda")
+    return mxfp4_in_f32[x.long()]
+
+
+def e8m0_to_f32(x):
+    # Convert the input tensor `x` (assumed to be in e8m0 format) to float32.
+    # e8m0 is a custom 8-bit floating point format with 8 bits for exponent, 0 for mantissa.
+    # This means the value is essentially 2^(exponent - 127), similar to how IEEE-754 stores floats.
+
+    # Convert x to float32 for computation, and compute the power of 2 by subtracting the bias (127).
+    x_f32 = 2 ** ((x.to(torch.float32)) - 127)
+
+    # If the exponent value was 255 (i.e., 2^(128)), this is a special case usually used to represent NaN or Inf.
+    # Since this custom format has no mantissa, treat 2^128 as NaN.
+    x_f32[x_f32 == 128] = float("nan")
+    return x_f32
+
+
+def quark_post_load_weights(self_attn: nn.Module, w: torch.Tensor, quant_format: str):
+    if "mxfp4" in quant_format:
+        # when dtype is bf16, the processing flow is to dynamic quantize bf16 tensor to uint8 tensor
+        # do w_kc (bf16) first to get the w_kc(uint8) w_s_kc(uint8)
+        # and w_vc repeating the same procedure of w_kc to get  w_vc(uint8) w_s_vc(uint8)
+        if w.dtype == torch.bfloat16:
+            w_kc, w_vc = w.unflatten(
+                0, (-1, self_attn.qk_nope_head_dim + self_attn.v_head_dim)
+            ).split([self_attn.qk_nope_head_dim, self_attn.v_head_dim], dim=1)
+            w_kc, w_s_kc = b_dynamic_mxfp4_quant(w_kc.transpose(-2, -1))
+            w_kc = w_kc.transpose(-2, -1)
+            w_s_kc = w_s_kc.transpose(-2, -1)
+            w_vc, w_s_vc = b_dynamic_mxfp4_quant(w_vc)
+            w_s_kc = w_s_kc.transpose(1, 2).contiguous().transpose(1, 2)
+            w_s_vc = w_s_vc.contiguous().transpose(1, 2)
+        elif w.dtype == torch.uint8:  # static quant for mxfp4
+            # when dtype is uint8, it means the w has been quantized to mxfp4 format
+            # but we must separate it to w_kc and w_vc.
+            # The quantized tensor size is only half of original tensor size
+            # and the scaling factor is 1/32, the transpose behavior will be not correct
+            # need to upcast it to fp32 to separate w to w_kc and w_vc
+            # to ensure the following transpose behavior is correct
+            # and then do mxfp4 quant again
+            w = mxfp4_to_f32(w, True).to(torch.bfloat16)
+            w_scales = self_attn.kv_b_proj.weight_scale.repeat_interleave(32, dim=-1)
+            w_scales = e8m0_to_f32(w_scales).to(torch.bfloat16)
+            w = w * w_scales
+            w_kc, w_vc = w.unflatten(
+                0, (-1, (self_attn.qk_nope_head_dim + self_attn.v_head_dim))
+            ).split([self_attn.qk_nope_head_dim, self_attn.v_head_dim], dim=1)
+            w_kc, w_s_kc = b_dynamic_mxfp4_quant(w_kc.transpose(-2, -1))
+            w_kc = w_kc.transpose(-2, -1)
+            w_s_kc = w_s_kc.transpose(-2, -1)
+            w_vc, w_s_vc = b_dynamic_mxfp4_quant(w_vc)
+            w_s_kc = w_s_kc.transpose(1, 2).contiguous().transpose(1, 2)
+            w_s_vc = w_s_vc.contiguous().transpose(1, 2)
+
+        return w_kc, w_s_kc, w_vc, w_s_vc
diff --git a/sglang/python/sglang/srt/layers/quantization/quark_int4fp8_moe.py b/sglang/python/sglang/srt/layers/quantization/quark_int4fp8_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..7fbbc61131d9666878198461aa4476d81f5c2a08
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/quark_int4fp8_moe.py
@@ -0,0 +1,443 @@
+import logging
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
+
+import torch
+from tqdm import tqdm
+from tqdm.std import EMA
+
+from sglang.srt.distributed import get_tensor_model_parallel_rank
+from sglang.srt.layers.int4fp8_utils import (
+    pack_int4_to_int32,
+    quantize_fp8_scale_tensorwise,
+    quantize_int4_scale_columnwise,
+)
+from sglang.srt.layers.moe import MoeRunnerConfig
+from sglang.srt.layers.quantization.base_config import (
+    FusedMoEMethodBase,
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.fp8 import Fp8LinearMethod
+from sglang.srt.utils import BAR_FORMAT, is_hip, set_weight_attrs
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.token_dispatcher import DispatchOutput
+
+_is_hip = is_hip()
+
+
+if _is_hip:
+    from aiter import ActivationType, QuantType
+    from aiter.fused_moe import fused_moe
+    from aiter.ops.shuffle import shuffle_weight
+
+    ON_GFX950 = "gfx950" in torch.cuda.get_device_properties("cuda").gcnArchName
+
+logger = logging.getLogger(__name__)
+
+
+def tqdm_reset_no_print(tqdm_bar: tqdm, total=None):
+    tqdm_bar.n = 0
+    if total is not None:
+        tqdm_bar.total = total
+    if tqdm_bar.disable:
+        return
+    tqdm_bar.last_print_n = 0
+    tqdm_bar.last_print_t = tqdm_bar.start_t = tqdm_bar._time()
+    tqdm_bar._ema_dn = EMA(tqdm_bar.smoothing)
+    tqdm_bar._ema_dt = EMA(tqdm_bar.smoothing)
+    tqdm_bar._ema_miniters = EMA(tqdm_bar.smoothing)
+
+
+class QuarkInt4Fp8Config(QuantizationConfig):
+    """Config class for Quark Quantization.
+
+    - Weight: static, per-channel, symmetric
+    - Activation: dynamic, per-token, symmetric
+    """
+
+    def __init__(
+        self,
+        is_checkpoint_fp8_serialized: bool = False,
+        activation_scheme: str = "dynamic",
+    ):
+        self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
+        self.activation_scheme = activation_scheme
+
+        if activation_scheme != "dynamic":
+            raise NotImplementedError(
+                "QuarkInt4Fp8Config only supports activation_scheme='dynamic'."
+            )
+
+        self.weight_block_size = None
+
+        self.num_quant_layers = 0
+
+        tp_rank = get_tensor_model_parallel_rank()
+
+        # The weight iterator already has a progress bar on rank=0, account for that.
+        position = 1 + tqdm._get_free_pos()
+        self.online_quant_progress_bar = tqdm(
+            total=0,
+            desc=f"Online quark_int4fp8_moe quantization on rank={tp_rank}",
+            position=position,
+            bar_format=BAR_FORMAT,
+            mininterval=2.0,
+        )
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.float16, torch.bfloat16]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 70
+
+    @classmethod
+    def get_name(self) -> str:
+        return "quark_int4fp8_moe"
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return []
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> "QuarkInt4Fp8Config":
+        return cls()
+
+    def get_quant_method(
+        self,
+        layer: torch.nn.Module,
+        prefix: str,
+    ) -> Optional["QuantizeMethodBase"]:
+        # TODO: fix circular imports issues in sglang forcing us to import here instead of at
+        # the top of file.
+        from sglang.srt.layers.linear import LinearBase
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+
+        if isinstance(layer, LinearBase):
+            return Fp8LinearMethod(self)
+        elif isinstance(layer, FusedMoE):
+            return QuarkInt4Fp8MoEMethod(self)
+
+        return None
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+class QuarkInt4Fp8MoEMethod(FusedMoEMethodBase):
+    """MoE method for INT4FP8.
+
+    Supports loading BF16/FP16 checkpoints, quantizing down to INT4, and dequantizing to FP8 during inference.
+
+    Args:
+        quant_config: The quantization config.
+    """
+
+    def __init__(self, quant_config):
+        self.quant_config = quant_config
+
+        self.online_quant_progress_bar = self.quant_config.online_quant_progress_bar
+
+        self.tp_rank = get_tensor_model_parallel_rank()
+
+        if not _is_hip:
+            raise NotImplementedError(
+                "The quark_int4fp8_moe online quantization scheme is only supported on AMD GPUs."
+            )
+
+    def get_weight_loader(self, layer, original_weight_loader):
+        def online_int4_fp8_weight_loader(
+            param: torch.nn.Parameter,
+            loaded_weight: torch.Tensor,
+            weight_name: str,
+            shard_id: str,
+            expert_id: int,
+        ):
+            if shard_id in ["w1", "w3"]:
+                shard_size = self.w13_shard_size
+            else:
+                shard_size = self.w2_shard_size
+
+            original_use_presharded_weights = layer.use_presharded_weights
+
+            if not layer.use_presharded_weights:
+                # In case the model is not pre-sharded (most checkpoints on HF Hub),
+                # we shard the model here in order to run online quantization on
+                # already sharded weights.
+                # Some models as `lmzheng/grok-1` are already be sharded.
+                layer.use_presharded_weights = True
+
+                if shard_id in ["w1", "w3"]:
+                    shard_dim = 0
+                    loaded_weight = loaded_weight.narrow(
+                        shard_dim, shard_size * self.tp_rank, shard_size
+                    )
+                else:
+                    shard_dim = 1
+                    loaded_weight = loaded_weight.narrow(
+                        shard_dim, shard_size * self.tp_rank, shard_size
+                    )
+
+            # We want to run online quantization on-device for speed purposes.
+            loaded_weight = loaded_weight.to(param.device)
+
+            _, fp8_scale = quantize_fp8_scale_tensorwise(loaded_weight)
+
+            int4_w, int4_scale = quantize_int4_scale_columnwise(loaded_weight)
+
+            int4_w = pack_int4_to_int32(int4_w)
+            int4_scale /= fp8_scale
+
+            if shard_id in ["w1", "w3"]:
+                if shard_id == "w1":
+                    shard_slice = slice(0, shard_size)
+                    idx = 0
+                else:
+                    shard_slice = slice(shard_size, 2 * shard_size)
+                    idx = 1
+
+                assert param[expert_id][shard_slice].dtype == int4_w.dtype
+
+                assert (
+                    layer.w13_int4_scale[expert_id][shard_slice].shape
+                    == int4_scale.shape
+                )
+                assert (
+                    layer.w13_int4_scale[expert_id][shard_slice].dtype
+                    == int4_scale.dtype
+                )
+
+                layer.w13_int4_scale[expert_id][shard_slice].copy_(int4_scale)
+
+                assert layer.w13_fp8_scale[expert_id][idx].shape == fp8_scale.shape
+                assert layer.w13_fp8_scale[expert_id][idx].dtype == fp8_scale.dtype
+
+                layer.w13_fp8_scale[expert_id][idx].copy_(fp8_scale)
+            else:
+                assert param[expert_id].dtype == int4_w.dtype
+                assert param[expert_id].shape == int4_w.shape
+
+                assert layer.w2_int4_scale[expert_id].shape == int4_scale.shape
+                assert layer.w2_int4_scale[expert_id].dtype == int4_scale.dtype
+
+                layer.w2_int4_scale[expert_id].copy_(int4_scale)
+
+                assert layer.w2_fp8_scale[expert_id].shape == fp8_scale.shape
+                assert layer.w2_fp8_scale[expert_id].dtype == fp8_scale.dtype
+
+                layer.w2_fp8_scale[expert_id].copy_(fp8_scale)
+
+            original_weight_loader(
+                param,
+                int4_w,
+                shard_id=shard_id,
+                weight_name=weight_name,
+                expert_id=expert_id,
+            )
+
+            # Reset `use_presharded_weights` as the same layer may load several different weights.
+            layer.use_presharded_weights = original_use_presharded_weights
+
+            self.online_quant_progress_bar.update(1)
+
+        return online_int4_fp8_weight_loader
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        # TODO: fix circular imports issues in sglang forcing us to import here instead of at
+        # the top of file.
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        # print("intermediate_size_per_partition", intermediate_size_per_partition)
+        # fused moe logic already hands TP logic.
+        self.w13_shard_size = intermediate_size_per_partition
+        self.w2_shard_size = intermediate_size_per_partition
+
+        assert "weight_loader" in extra_weight_attrs
+        original_weight_loader = extra_weight_attrs.get("weight_loader")
+
+        online_int4fp8_weight_loader = self.get_weight_loader(
+            layer, original_weight_loader
+        )
+        extra_weight_attrs["weight_loader"] = online_int4fp8_weight_loader
+
+        params_dtype = torch.uint32
+        # WEIGHTS
+        # INT4 MoE weight - INT32 packed
+        w13_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size // 8,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        w2_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition // 8,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+
+        layer.register_parameter("w2_weight", w2_weight)
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+
+        # Allocate 2 scales for w1 and w3 respectively.
+        # They will be combined to a single scale after weight loading.
+        w13_fp8_scale = torch.nn.Parameter(
+            torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False
+        )
+        w2_fp8_scale = torch.nn.Parameter(
+            torch.ones(num_experts, dtype=torch.float32), requires_grad=False
+        )
+        layer.register_parameter("w13_fp8_scale", w13_fp8_scale)
+        layer.register_parameter("w2_fp8_scale", w2_fp8_scale)
+
+        if _is_hip:
+            w13_int4_scale = torch.nn.Parameter(
+                torch.ones(
+                    num_experts,
+                    2 * intermediate_size_per_partition,
+                    dtype=torch.float32,
+                ),
+                requires_grad=False,
+            )
+            w2_int4_scale = torch.nn.Parameter(
+                torch.ones(num_experts, hidden_size, dtype=torch.float32),
+                requires_grad=False,
+            )
+            layer.register_parameter("w13_int4_scale", w13_int4_scale)
+            layer.register_parameter("w2_int4_scale", w2_int4_scale)
+
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
+        )
+
+        set_weight_attrs(w13_fp8_scale, extra_weight_attrs)
+        set_weight_attrs(w2_fp8_scale, extra_weight_attrs)
+
+        # Add the quantization method used (per tensor/grouped/channel)
+        # to ensure the weight scales are loaded in properly
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
+        )
+
+        set_weight_attrs(w13_int4_scale, extra_weight_attrs)
+        set_weight_attrs(w2_int4_scale, extra_weight_attrs)
+
+        w13_input_scale = None
+        layer.register_parameter("w13_input_scale", w13_input_scale)
+
+        w2_input_scale = None
+        layer.register_parameter("w2_input_scale", w2_input_scale)
+
+        # Loading from the checkpoint w1, w2, w3 times the number of experts.
+        total = self.online_quant_progress_bar.total + num_experts * 3
+        tqdm_reset_no_print(self.online_quant_progress_bar, total=total)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        if _is_hip and not ON_GFX950:
+            # CDNA3 does not support OCP FP8E4M3FN, but uses FP8E4M3FNUZ.
+            # CDNA4 supports OCP FP8E4M3FN.
+            layer.w13_int4_scale *= 0.5
+            layer.w2_int4_scale *= 0.5
+
+            layer.w13_fp8_scale *= 2.0
+            layer.w2_fp8_scale *= 2.0
+
+        # TODO: and use_aiter_moe: add after triton kernel added
+        # INT4-FP8 (INT4 MoE Weight, FP8 Compute)
+        # Weight Permutation
+        layer.w13_weight = torch.nn.Parameter(
+            shuffle_weight(layer.w13_weight.data, (16, 16)),
+            requires_grad=False,
+        )
+        torch.cuda.empty_cache()
+        layer.w2_weight = torch.nn.Parameter(
+            shuffle_weight(layer.w2_weight.data, (16, 16)),
+            requires_grad=False,
+        )
+        torch.cuda.empty_cache()
+
+        # INT4-FP8 : offset INT4 w13_int4_scale to single w13_fp8_scale
+        # Fp8 moe kernel needs single fp8 w13_fp8_scale for w13 per expert.
+        # We won't do requant each expert's fp8 weight (not direct available),
+        # instead we adjust half of INT4 w13_int4_scale numbers
+        assert layer.w13_fp8_scale is not None
+        shard_size = layer.intermediate_size_per_partition
+        max_w13_scales = layer.w13_fp8_scale.max(dim=1).values
+        for expert_id in range(layer.num_experts):
+            start = 0
+            max_w13_scale_fp8 = max_w13_scales[expert_id]
+            for shard_id in range(2):
+                if layer.w13_fp8_scale[expert_id][shard_id] != max_w13_scale_fp8:
+                    int4_rescale = (
+                        layer.w13_fp8_scale[expert_id][shard_id] / max_w13_scale_fp8
+                    )
+                    layer.w13_int4_scale[expert_id][
+                        start : start + shard_size
+                    ] *= int4_rescale
+                start += shard_size
+
+        layer.w13_fp8_scale = torch.nn.Parameter(max_w13_scales, requires_grad=False)
+
+        # special hack to asm_moe, which takes (weight_int4_scale * weight_scale) as post GEMM scaling
+        # optimal design - shall apply per-column weight_int4_scale before GEMM, and weight_scale post
+        for expert_id in range(layer.num_experts):
+            layer.w13_int4_scale[expert_id] *= max_w13_scales[expert_id]
+            layer.w2_int4_scale[expert_id] *= layer.w2_fp8_scale[expert_id]
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: "DispatchOutput",
+    ) -> torch.Tensor:
+        # TODO: fix circular imports issues in sglang forcing us to import here instead of at
+        # the top of file.
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        topk_output = dispatch_output.topk_output
+        moe_runner_config = self.moe_runner_config
+
+        # TODO: add triton kernel and add check get_bool_env_var("CK_MOE")
+        assert (
+            not moe_runner_config.no_combine
+        ), f"no_combine={moe_runner_config.no_combine} is not supported."
+
+        output = fused_moe(
+            dispatch_output.hidden_states,
+            layer.w13_weight,
+            layer.w2_weight,
+            topk_output.topk_weights,
+            topk_output.topk_ids,
+            quant_type=QuantType.per_Token,
+            w1_scale=layer.w13_int4_scale,
+            w2_scale=layer.w2_int4_scale,
+            activation=(
+                ActivationType.Silu
+                if moe_runner_config.activation == "silu"
+                else ActivationType.Gelu
+            ),
+        )
+
+        return StandardCombineInput(hidden_states=output)
diff --git a/sglang/python/sglang/srt/layers/quantization/rocm_mxfp4_utils.py b/sglang/python/sglang/srt/layers/quantization/rocm_mxfp4_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..4659f76bd87ce2cdcc7f530ed1f5e043a712ded1
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/rocm_mxfp4_utils.py
@@ -0,0 +1,13 @@
+from aiter.ops.triton.batched_gemm_afp4wfp4_pre_quant import (
+    batched_gemm_afp4wfp4_pre_quant,
+)
+from aiter.ops.triton.fused_mxfp4_quant import (
+    fused_flatten_mxfp4_quant,
+    fused_rms_mxfp4_quant,
+)
+
+__all__ = [
+    "fused_rms_mxfp4_quant",
+    "fused_flatten_mxfp4_quant",
+    "batched_gemm_afp4wfp4_pre_quant",
+]
diff --git a/sglang/python/sglang/srt/layers/quantization/unquant.py b/sglang/python/sglang/srt/layers/quantization/unquant.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1edcf5b4d44e426517630ecf6a3b2bda8ed39c3
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/unquant.py
@@ -0,0 +1,635 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, List, Optional
+
+import torch
+import torch.nn.functional as F
+from torch.nn.parameter import Parameter
+
+from sglang.srt.layers.amx_utils import (
+    CPUQuantMethod,
+    _amx_process_weight_after_loading,
+)
+from sglang.srt.layers.moe import (
+    MoeRunner,
+    MoeRunnerBackend,
+    MoeRunnerConfig,
+    get_moe_runner_backend,
+)
+from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo
+from sglang.srt.layers.quantization.base_config import (
+    FusedMoEMethodBase,
+    LinearMethodBase,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.utils import MultiPlatformOp
+from sglang.srt.utils import (
+    cpu_has_amx_support,
+    get_bool_env_var,
+    is_cpu,
+    is_hip,
+    is_npu,
+    next_power_of_2,
+    set_weight_attrs,
+    use_intel_amx_backend,
+    use_intel_xpu_backend,
+)
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.token_dispatcher import (
+        CombineInput,
+        StandardDispatchOutput,
+    )
+
+
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_hip = is_hip()
+_is_cpu = is_cpu()
+_is_npu = is_npu()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+
+if _use_aiter:
+    from aiter import ActivationType
+    from aiter.fused_moe import fused_moe
+    from aiter.ops.shuffle import shuffle_weight
+
+if _is_npu:
+    from sglang.srt.hardware_backend.npu.utils import npu_format_cast
+
+try:
+    from flashinfer.fused_moe import cutlass_fused_moe as flashinfer_cutlass_fused_moe
+    from flashinfer.fused_moe.core import ActivationType
+except ImportError:
+    flashinfer_cutlass_fused_moe = None
+
+
+class UnquantizedEmbeddingMethod(QuantizeMethodBase):
+    """Unquantized method for embeddings."""
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        """Create weights for embedding layer."""
+        weight = Parameter(
+            torch.empty(
+                sum(output_partition_sizes),
+                input_size_per_partition,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
+        layer.register_parameter("weight", weight)
+        set_weight_attrs(weight, extra_weight_attrs)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        return F.linear(x, layer.weight, bias)
+
+    def embedding(self, layer: torch.nn.Module, input_: torch.Tensor) -> torch.Tensor:
+        return F.embedding(input_, layer.weight)
+
+
+class UnquantizedLinearMethod(LinearMethodBase):
+    """Linear method without quantization."""
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        weight = Parameter(
+            torch.empty(
+                sum(output_partition_sizes),
+                input_size_per_partition,
+                dtype=params_dtype,
+            ),
+            requires_grad=False,
+        )
+        set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
+        layer.register_parameter("weight", weight)
+        set_weight_attrs(weight, extra_weight_attrs)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        if _is_cpu and _is_cpu_amx_available:
+            _amx_process_weight_after_loading(layer, ["weight"])
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if use_intel_amx_backend(layer):
+            x_shapes = x.shape
+            if len(x_shapes) == 3:
+                x = x.view(-1, x.shape[-1])
+            output = torch.ops.sgl_kernel.weight_packed_linear(
+                x,
+                layer.weight,
+                bias,
+                True,  # is_vnni
+            )
+            if len(x_shapes) == 3:
+                output = output.view(x_shapes[0], x_shapes[1], -1)
+            return output
+
+        return F.linear(x, layer.weight, bias)
+
+
+class UnquantizedFusedMoEMethod(FusedMoEMethodBase, MultiPlatformOp):
+    """MoE method without quantization."""
+
+    def __init__(
+        self, use_triton_kernels: bool = False, use_flashinfer_trtllm_moe: bool = False
+    ):
+        super().__init__()
+        self.use_flashinfer_cutlass = get_moe_runner_backend().is_flashinfer_cutlass()
+        self.use_triton_kernels = use_triton_kernels
+        self.with_bias = False
+        self.use_flashinfer_trtllm_moe = use_flashinfer_trtllm_moe
+        self._cache_permute_indices = dict({})
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        with_bias: bool = False,
+        **extra_weight_attrs,
+    ):
+        self.with_bias = with_bias
+
+        # Fused gate_up_proj (column parallel)
+        w13_up_dim = (
+            2 * intermediate_size_per_partition
+            if layer.moe_runner_config.is_gated
+            else intermediate_size_per_partition
+        )
+        w13_weight_n, w13_weight_k = (w13_up_dim, hidden_size)
+        if self.use_triton_kernels:
+            w13_weight_n, w13_weight_k = w13_weight_k, w13_weight_n
+        w13_weight = torch.nn.Parameter(
+            torch.empty(num_experts, w13_weight_n, w13_weight_k, dtype=params_dtype),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+
+        if self.with_bias:
+            w13_weight_bias = torch.nn.Parameter(
+                torch.empty(num_experts, w13_up_dim, dtype=torch.float32),
+                requires_grad=False,
+            )
+            layer.register_parameter("w13_weight_bias", w13_weight_bias)
+            set_weight_attrs(w13_weight_bias, extra_weight_attrs)
+
+        # down_proj (row parallel)
+        w2_weight_n, w2_weight_k = (
+            hidden_size,
+            intermediate_size_per_partition,
+        )
+        if self.use_triton_kernels:
+            w2_weight_n, w2_weight_k = w2_weight_k, w2_weight_n
+        w2_weight = torch.nn.Parameter(
+            torch.empty(num_experts, w2_weight_n, w2_weight_k, dtype=params_dtype),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight", w2_weight)
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+
+        if self.with_bias:
+            w2_weight_bias = torch.nn.Parameter(
+                torch.empty(num_experts, hidden_size, dtype=torch.float32),
+                requires_grad=False,
+            )
+            layer.register_parameter("w2_weight_bias", w2_weight_bias)
+            set_weight_attrs(w2_weight_bias, extra_weight_attrs)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        # Skip aiter weight shuffle when using non-auto MoE backend (e.g., triton, triton_kernels)
+        # because aiter CK kernels don't support all GEMM dimensions
+        _should_use_aiter_moe = _use_aiter and get_moe_runner_backend().is_auto()
+        if _should_use_aiter_moe:
+            layer.w13_weight = torch.nn.Parameter(
+                shuffle_weight(layer.w13_weight.data, (16, 16)),
+                requires_grad=False,
+            )
+            torch.cuda.empty_cache()
+            layer.w2_weight = torch.nn.Parameter(
+                shuffle_weight(layer.w2_weight.data, (16, 16)),
+                requires_grad=False,
+            )
+            torch.cuda.empty_cache()
+
+        # Pack weight for get better performance on CPU
+        if _is_cpu and _is_cpu_amx_available:
+            _amx_process_weight_after_loading(layer, ["w13_weight", "w2_weight"])
+
+        # Reorder rows of W1 for fused gated activation
+        if self.use_flashinfer_trtllm_moe:
+            from flashinfer.fused_moe.core import (
+                _maybe_get_cached_w3_w1_permute_indices,
+                convert_to_block_layout,
+                get_w2_permute_indices_with_cache,
+            )
+
+            # w1 and w3 have been swapped, so we don't need do that here
+            epilogue_tile_m = 128
+            block_k = 128
+            old_shape_w13 = layer.w13_weight.data[0].shape
+            old_shape_w2 = layer.w2_weight.data[0].shape
+            new_shape_w13 = None
+            new_shape_w2 = None
+            for i in range(layer.num_local_experts):
+                permute_indices = _maybe_get_cached_w3_w1_permute_indices(
+                    self._cache_permute_indices,
+                    layer.w13_weight.data[i].view(torch.uint8),
+                    epilogue_tile_m,
+                )
+                tmp_weights1 = (
+                    layer.w13_weight.data[i]
+                    .clone()
+                    .view(torch.uint8)[permute_indices.to(layer.w13_weight.data.device)]
+                    .contiguous()
+                )
+
+                permute_indices = get_w2_permute_indices_with_cache(
+                    self._cache_permute_indices,
+                    layer.w2_weight.data[i].view(torch.uint8),
+                    epilogue_tile_m,
+                )
+                tmp_weights2 = (
+                    layer.w2_weight.data[i]
+                    .clone()
+                    .view(torch.uint8)[permute_indices.to(layer.w2_weight.data.device)]
+                    .contiguous()
+                )
+
+                tmp_weights1 = convert_to_block_layout(
+                    tmp_weights1.view(torch.uint8), block_k
+                )
+                tmp_weights2 = convert_to_block_layout(
+                    tmp_weights2.view(torch.uint8), block_k
+                )
+
+                new_shape_w13 = tmp_weights1.view(torch.bfloat16).shape
+                new_shape_w2 = tmp_weights2.view(torch.bfloat16).shape
+                layer.w13_weight.data[i] = (
+                    tmp_weights1.view(torch.bfloat16)
+                    .contiguous()
+                    .reshape(old_shape_w13)
+                )
+                layer.w2_weight.data[i] = (
+                    tmp_weights2.view(torch.bfloat16).contiguous().reshape(old_shape_w2)
+                )
+
+            layer.w13_weight.data = layer.w13_weight.data.reshape(
+                layer.num_local_experts, *new_shape_w13
+            )
+            layer.w2_weight.data = layer.w2_weight.data.reshape(
+                layer.num_local_experts, *new_shape_w2
+            )
+
+        if _is_npu:
+            for weight_name in ["w13_weight", "w2_weight"]:
+                weight = getattr(layer, weight_name)
+                weight.data = weight.data.transpose(1, 2)
+                weight.data = npu_format_cast(
+                    weight.data,
+                )
+
+        return
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+        if self.use_flashinfer_trtllm_moe:
+            backend = MoeRunnerBackend.FLASHINFER_TRTLLM
+        elif self.use_triton_kernels:
+            backend = MoeRunnerBackend.TRITON_KERNELS
+        else:
+            backend = MoeRunnerBackend.TRITON
+        self.runner = MoeRunner(backend, moe_runner_config)
+
+    @property
+    def load_up_proj_weight_first(self) -> bool:
+        # FlashInfer CUTLASS kernel assumes [Up, Gate] Proj as W13
+        return self.use_flashinfer_cutlass
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+        return self.forward(
+            layer=layer,
+            dispatch_output=dispatch_output,
+        )
+
+    def forward_cuda(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+
+        moe_runner_config = self.moe_runner_config
+
+        backend = self.runner.runner_backend
+        if backend.is_triton_kernels():
+            from sglang.srt.layers.moe.moe_runner.triton_kernels import (
+                TritonKernelsQuantInfo,
+            )
+
+            quant_info = TritonKernelsQuantInfo(
+                w13_weight=layer.w13_weight,
+                w2_weight=layer.w2_weight,
+                w13_bias=getattr(layer, "w13_weight_bias", None),
+                w2_bias=getattr(layer, "w2_weight_bias", None),
+            )
+            return self.runner.run(dispatch_output, quant_info)
+        elif self.use_flashinfer_cutlass:
+            output = flashinfer_cutlass_fused_moe(
+                input=x,
+                token_selected_experts=topk_output.topk_ids,
+                token_final_scales=topk_output.topk_weights,
+                fc1_expert_weights=layer.w13_weight,
+                fc2_expert_weights=layer.w2_weight,
+                output_dtype=x.dtype,
+                quant_scales=None,
+                ep_size=layer.moe_ep_size,
+                ep_rank=layer.moe_ep_rank,
+                tp_size=layer.moe_tp_size,
+                tp_rank=layer.moe_tp_rank,
+                tune_max_num_tokens=next_power_of_2(x.shape[0]),
+                activation_type=(
+                    ActivationType.Relu2
+                    if moe_runner_config.activation == "relu2"
+                    else ActivationType.Swiglu
+                ),
+            )[0]
+            return StandardCombineInput(hidden_states=output)
+        elif self.use_flashinfer_trtllm_moe:
+            from sglang.srt.layers.moe.moe_runner.flashinfer_trtllm import (
+                FlashInferTrtllmBf16MoeQuantInfo,
+            )
+
+            quant_info = FlashInferTrtllmBf16MoeQuantInfo(
+                gemm1_weights=layer.w13_weight,
+                gemm2_weights=layer.w2_weight,
+                global_num_experts=layer.num_experts,
+                local_expert_offset=layer.moe_ep_rank * layer.num_local_experts,
+            )
+            return self.runner.run(dispatch_output, quant_info)
+        else:
+            # Skip aiter fused_moe when using non-auto MoE backend (e.g., triton, triton_kernels)
+            # because aiter CK kernels don't support all GEMM dimensions
+            _should_use_aiter_moe = _use_aiter and get_moe_runner_backend().is_auto()
+            if _should_use_aiter_moe:
+                assert not moe_runner_config.no_combine, "unsupported"
+                topk_weights, topk_ids, _ = topk_output
+                if moe_runner_config.apply_router_weight_on_input:
+                    assert (
+                        topk_weights.dim() == 2
+                    ), "`topk_weights` should be in shape (num_tokens, topk)"
+                    _, topk = topk_weights.shape
+                    assert (
+                        topk == 1
+                    ), "Only support topk=1 when `apply_router_weight_on_input` is True"
+                    x = x * topk_weights.to(x.dtype)
+                    topk_weights = torch.ones_like(
+                        topk_weights, dtype=torch.float32
+                    )  # topk_weights must be FP32 (float32)
+                output = fused_moe(
+                    x,
+                    layer.w13_weight,
+                    layer.w2_weight,
+                    topk_weights,
+                    topk_ids,
+                    activation=(
+                        ActivationType.Silu
+                        if moe_runner_config.activation == "silu"
+                        else ActivationType.Gelu
+                    ),
+                    expert_mask=layer.expert_mask_gpu,
+                )
+                return StandardCombineInput(hidden_states=output)
+            else:
+                quant_info = TritonMoeQuantInfo(
+                    w13_weight=layer.w13_weight,
+                    w2_weight=layer.w2_weight,
+                    b13=getattr(layer, "w13_weight_bias", None),
+                    b2=getattr(layer, "w2_weight_bias", None),
+                )
+                return self.runner.run(dispatch_output, quant_info)
+
+    def forward_cpu(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+
+        moe_runner_config = self.moe_runner_config
+
+        assert (
+            moe_runner_config.activation == "silu"
+        ), f"activation = {moe_runner_config.activation} is not supported."
+
+        if use_intel_amx_backend(layer):
+            from sglang.srt.layers.moe.topk import apply_topk_weights_cpu
+
+            topk_weights, topk_ids, _ = topk_output
+            x, topk_weights = apply_topk_weights_cpu(
+                moe_runner_config.apply_router_weight_on_input, topk_weights, x
+            )
+            output = torch.ops.sgl_kernel.fused_experts_cpu(
+                x,
+                layer.w13_weight,
+                layer.w2_weight,
+                topk_weights,
+                topk_ids,
+                False,  # inplace # See [Note] inplace should be False in fused_experts.
+                CPUQuantMethod.UNQUANT,
+                None,  # w1_scale
+                None,  # w2_scale
+                None,  # w1_zp
+                None,  # w2_zp
+                None,  # block_size
+                True,  # is_vnni
+            )
+            return StandardCombineInput(hidden_states=output)
+        else:
+            from sglang.srt.layers.moe.fused_moe_native import moe_forward_native
+
+            output = moe_forward_native(
+                layer,
+                x,
+                topk_output,
+                moe_runner_config,
+            )
+            return StandardCombineInput(hidden_states=output)
+
+    def forward_xpu(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+
+        moe_runner_config = self.moe_runner_config
+        assert moe_runner_config.activation in [
+            "silu",
+            "gelu",
+        ], f"activation = {moe_runner_config.activation} is not supported."
+
+        backend = self.runner.runner_backend
+        if use_intel_xpu_backend():
+            # sgl-kernel-xpu path
+            from sgl_kernel import fused_experts
+
+            topk_weights, topk_ids, _ = topk_output
+            output = fused_experts(
+                x,
+                layer.w13_weight,
+                layer.w2_weight,
+                topk_weights,
+                topk_ids,
+                b1=getattr(layer, "w13_weight_bias", None),
+                b2=getattr(layer, "w2_weight_bias", None),
+                activation=moe_runner_config.activation,
+            )
+            return StandardCombineInput(hidden_states=output)
+        else:
+            assert backend.is_triton()
+            assert (
+                moe_runner_config.activation == "silu"
+            ), f"activation = {moe_runner_config.activation} is not supported \
+            for Triton PATH, please set ENV SGLANG_USE_SGL_XPU=1."
+
+            quant_info = TritonMoeQuantInfo(
+                w13_weight=layer.w13_weight,
+                w2_weight=layer.w2_weight,
+                b13=getattr(layer, "w13_weight_bias", None),
+                b2=getattr(layer, "w2_weight_bias", None),
+            )
+            return self.runner.run(dispatch_output, quant_info)
+
+    def forward_npu(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        # x.shape = [B*S, H]
+        x = dispatch_output.hidden_states
+        # topk_weights.shape = [B*S, K]; topk_ids.shape = [B*S, K]
+        topk_weights, topk_ids, _ = dispatch_output.topk_output
+
+        original_dtype = x.dtype
+        num_tokens = x.shape[0]
+        topk_weights = topk_weights.to(x.dtype)
+        topk_ids = topk_ids.to(torch.int32)
+        num_experts = layer.num_experts
+        top_k = layer.top_k or topk_ids.shape[1]  # in case layer.top_k is not set
+
+        hidden_states, expanded_row_idx, expert_tokens, _ = (
+            torch.ops.npu.npu_moe_init_routing_v2(
+                x,
+                topk_ids,
+                active_num=num_tokens * top_k,
+                expert_num=num_experts,
+                expert_tokens_num_type=1,
+                expert_tokens_num_flag=True,
+                active_expert_range=[0, num_experts],
+                quant_mode=-1,
+            )
+        )
+        expert_tokens = expert_tokens.to(torch.int64)
+        w13_bias = [layer.w13_weight_bias] if self.with_bias else None
+        w2_bias = [layer.w2_weight_bias] if self.with_bias else None
+
+        # gmm1: gate_up_proj
+        hidden_states = torch.ops.npu.npu_grouped_matmul(
+            x=[hidden_states],
+            weight=[layer.w13_weight],
+            bias=w13_bias,
+            split_item=2,
+            group_list_type=1,
+            group_type=0,
+            group_list=expert_tokens,
+            output_dtype=original_dtype,
+        )[0]
+
+        # act_fn:
+        if self.moe_runner_config.activation == "npu_swiglu_oai":
+            from sgl_kernel_npu.activation.swiglu_oai import swiglu_oai
+
+            hidden_states = swiglu_oai(layer, hidden_states)
+        elif self.moe_runner_config.activation == "silu":
+            hidden_states = torch.ops.npu.npu_swiglu(hidden_states)
+        else:
+            from sglang.srt.layers.activation import GeluAndMul
+
+            hidden_states = GeluAndMul()(hidden_states)
+
+        # gmm2: down_proj
+        hidden_states = torch.ops.npu.npu_grouped_matmul(
+            x=[hidden_states],
+            weight=[layer.w2_weight],
+            bias=w2_bias,
+            split_item=2,
+            group_list_type=1,
+            group_type=0,
+            group_list=expert_tokens,
+            output_dtype=original_dtype,
+        )[0]
+
+        final_hidden_states = torch.ops.npu.npu_moe_finalize_routing(
+            hidden_states,
+            skip1=None,
+            skip2=None,
+            bias=None,
+            scales=topk_weights,
+            expanded_src_to_dst_row=expanded_row_idx,
+            export_for_source_row=topk_ids,
+            drop_pad_mode=2,
+        )
+
+        return StandardCombineInput(hidden_states=final_hidden_states)
+
+    def forward_tpu(self, *args, **kwargs) -> CombineInput:
+        raise NotImplementedError("The TPU backend currently does not support MoE.")
+
+    forward_native = forward_cpu
diff --git a/sglang/python/sglang/srt/layers/quantization/utils.py b/sglang/python/sglang/srt/layers/quantization/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..198b201de159a0b405270781038267e541019e2b
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/utils.py
@@ -0,0 +1,740 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/utils/quant_utils.py
+
+from __future__ import annotations
+
+import re
+from copy import deepcopy
+from types import MappingProxyType
+from typing import TYPE_CHECKING, Dict, List, Mapping, Optional, Tuple, Union
+
+import numpy
+import torch
+
+from sglang.srt.layers.quantization.fp8_kernel import scaled_fp8_quant
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.quantization.base_config import QuantizationConfig
+
+
+def get_scalar_types():
+    """
+    Returns:
+        tuple: (ScalarType, scalar_types)
+    """
+    try:
+        from sgl_kernel.scalar_type import ScalarType, scalar_types
+
+        return ScalarType, scalar_types
+    except ImportError:
+
+        class MockScalarType:
+            pass
+
+        class MockScalarTypes:
+            uint4b8 = "uint4b8"
+            uint8b128 = "uint8b128"
+
+            def __getattr__(self, name):
+                return f"mock_{name}"
+
+        return MockScalarType, MockScalarTypes()
+
+
+ScalarType, scalar_types = get_scalar_types()
+
+
+def is_layer_skipped(
+    prefix: str,
+    ignored_layers: List[str],
+    fused_mapping: Mapping[str, List[str]] = MappingProxyType({}),
+) -> bool:
+    # prefix: model.layers.0.self_attn.q_proj
+    # proj_name: q_proj
+    proj_name = prefix.split(".")[-1]
+
+    # Fused layers like gate_up_proj or qkv_proj will not be fused
+    # in the safetensors checkpoint. So, we convert the name
+    # from the fused version to unfused + check to make sure that
+    # each shard of the fused layer has the same scheme.
+    if proj_name in fused_mapping:
+        shard_prefixes = [
+            prefix.replace(proj_name, shard_proj_name)
+            for shard_proj_name in fused_mapping[proj_name]
+        ]
+
+        is_skipped = None
+        for shard_prefix in shard_prefixes:
+            is_shard_skipped = any(
+                ignored in shard_prefix for ignored in ignored_layers
+            )
+
+            if is_skipped is None:
+                is_skipped = is_shard_skipped
+            elif is_shard_skipped != is_skipped:
+                raise ValueError(
+                    f"Detected some but not all shards of {prefix} "
+                    "are quantized. All shards of fused layers "
+                    "to have the same precision."
+                )
+    else:
+        is_skipped = any(ignored in prefix for ignored in ignored_layers)
+        if "gate_up_proj" in prefix:
+            prefix_gate = prefix.replace("gate_up_proj", "gate_proj")
+            prefix_up = prefix.replace("gate_up_proj", "up_proj")
+            if prefix_gate in ignored_layers and prefix_up in ignored_layers:
+                is_skipped = True
+        elif "experts" in prefix:
+            # Expert names can include full module paths; keep coarse prefix matches
+            # (e.g., "model.layers.{i}.") while also checking expert-specific entries.
+            is_skipped = is_skipped or any(
+                prefix in layer_name
+                for layer_name in ignored_layers
+                if "experts" in layer_name
+            )
+
+    assert is_skipped is not None
+    return is_skipped
+
+
+def per_tensor_dequantize(
+    tensor: torch.Tensor, inv_scale: Union[float, torch.Tensor]
+) -> torch.Tensor:
+    fake_qweight = tensor.to(torch.float16)
+    dq_weight = fake_qweight * inv_scale
+    return dq_weight
+
+
+def all_close_1d(x: torch.Tensor) -> bool:
+    assert len(x.shape) == 1
+    return all(torch.allclose(x[0], x[i]) for i in range(x.shape[0]))
+
+
+def convert_to_channelwise(
+    weight_scale: torch.Tensor, logical_widths: List[int]
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    # Create channelwise buffer
+    weight_scale_channel = torch.empty(
+        (sum(logical_widths), 1), dtype=torch.float32, device=weight_scale.device
+    )
+
+    # Handle scalar tensor case: broadcast same scale to all channels
+    if weight_scale.dim() == 0:
+        weight_scale_channel.fill_(weight_scale.item())
+        return weight_scale_channel
+
+    # Expand each scale to match the size of each logical matrix.
+    start = 0
+    for idx, logical_width in enumerate(logical_widths):
+        end = start + logical_width
+        weight_scale_channel[start:end, :] = weight_scale[idx]
+        start = end
+
+    return weight_scale_channel
+
+
+def requantize_with_max_scale(
+    weight: torch.Tensor, weight_scale: torch.Tensor, logical_widths: List[int]
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    # Max scale to be used for requanitzation.
+    max_w_scale = weight_scale.max()
+
+    # QKV / MLP is fused in the on disk checkpoint if any of the
+    # weight scales are still set to the default since we initialize
+    # N weight scales for N shards but we only load 1 weight scale
+    # from disk in this case. Skip requantization in this case (since)
+    # we already are quantized with the single scale.
+    # * Sample Model: nm-testing/Phi-3-mini-128k-instruct-FP8
+    unfused_module_in_checkpoint = (
+        weight_scale[-1] > torch.finfo(torch.float8_e4m3fn).min
+    )
+
+    # If unfused checkpoint, need requanize with the single scale.
+    if unfused_module_in_checkpoint:
+        start = 0
+        for idx, logical_width in enumerate(logical_widths):
+            end = start + logical_width
+            weight_dq = per_tensor_dequantize(weight[start:end, :], weight_scale[idx])
+            weight[start:end, :], _ = scaled_fp8_quant(weight_dq, max_w_scale)
+            start = end
+
+    return max_w_scale, weight
+
+
+def update_tensor_inplace(old: torch.Tensor, new: torch.Tensor) -> None:
+    old.copy_(new)
+
+
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/utils/layer_utils.py
+# Newly generated tensors need to replace existing tensors that are
+# already registered as parameters by vLLM (and won't be freed)
+def replace_parameter(
+    mod: torch.nn.Module, name: str, new: Union[torch.Tensor, torch.nn.Parameter]
+) -> None:
+
+    old = getattr(mod, name)
+    if (
+        type(old) is type(new)
+        and old.dtype == new.dtype
+        and old.untyped_storage().nbytes() == new.untyped_storage().nbytes()
+    ):
+        # If we can just update in-place to avoid re-registering
+        #   can be faster if the underlying storage is the same
+        update_tensor_inplace(old, new)
+    else:
+        # Fallback re-register parameter, convert to Parameter if necessary
+        # this not only ensures we don't register a tensor as a parameter, but
+        # also ensures that all parameter subclasses get re-registered as
+        # parameters for `torch.compile` compatibility
+        if not isinstance(new, torch.nn.Parameter):
+            new = torch.nn.Parameter(new, requires_grad=False)
+        mod.register_parameter(name, torch.nn.Parameter(new, requires_grad=False))
+
+
+def assert_fp8_all_close(a: torch.Tensor, b: torch.Tensor):
+    assert a.shape == b.shape
+    assert a.dtype == b.dtype == torch.float8_e4m3fn
+
+    a_u8 = a.view(torch.uint8)
+    b_u8 = b.view(torch.uint8)
+    diff_u8 = (a_u8.to(torch.int16) - b_u8.to(torch.int16)).abs()
+
+    numel = a.numel()
+
+    count_diff_sign = ((a_u8 >= 0) & (b_u8 < 0)).sum().item()
+    count_tiny_diff = (diff_u8 >= 1).sum().item()
+    count_large_diff = (diff_u8 >= 2).sum().item()
+
+    assert (
+        (count_diff_sign == 0)
+        and (count_tiny_diff / numel < 0.005)
+        and (count_large_diff == 0)
+    ), f"{count_diff_sign=} {count_tiny_diff=} {count_large_diff=} {numel=}"
+
+
+# Match dynamic rules with module name (prefix) and override quantize
+# config if module (prefix) matches a rule
+def override_config(config: QuantizationConfig, prefix: str):
+    weight_bits = get_dynamic_override(config, prefix, "bits", config.weight_bits)
+    if isinstance(weight_bits, int):
+        config.weight_bits = weight_bits
+    group_size = get_dynamic_override(config, prefix, "group_size", config.group_size)
+    if isinstance(group_size, int):
+        config.group_size = group_size
+    desc_act = get_dynamic_override(config, prefix, "desc_act", config.desc_act)
+    if isinstance(desc_act, bool):
+        config.desc_act = desc_act
+
+    config.pack_factor = 32 // config.weight_bits  # packed into int32
+    if config.get_name() == "gptq_marlin":
+        is_sym = get_dynamic_override(config, prefix, "sym", config.is_sym)
+        if isinstance(is_sym, bool):
+            config.is_sym = is_sym
+
+        if (config.weight_bits, config.is_sym) not in config.TYPE_MAP:
+            raise ValueError(
+                "Unsupported quantization config: "
+                f"bits={config.weight_bits}, sym={config.is_sym}"
+            )
+
+        config.quant_type = config.TYPE_MAP[(config.weight_bits, config.is_sym)]
+    elif config.get_name() == "gptq":
+        if config.weight_bits not in [2, 3, 4, 8]:
+            raise ValueError(
+                "Currently, only 2/3/4/8-bit weight quantization is "
+                f"supported for GPTQ, but got {config.weight_bits} bits."
+            )
+
+
+def get_dynamic_override(
+    config: QuantizationConfig,
+    layer_name: str,
+    key: Optional[str] = None,
+    default_value: Union[int, bool, None] = None,
+) -> Union[Dict, int, bool, None]:
+    for pattern, pattern_dict in config.dynamic.items():
+        # Negative match: matched modules are excluded from quantized init
+        if pattern.startswith("-:"):
+            if re.match(pattern.removeprefix("-:"), layer_name):
+                return False
+        # Positive match: matched modules have quant properties overrides
+        # base quant config
+        elif re.match(pattern.removeprefix("+:"), layer_name):
+            if key is None:
+                return pattern_dict
+            else:
+                return pattern_dict.get(key, default_value)
+    return default_value
+
+
+def get_linear_quant_method(
+    config: QuantizationConfig,
+    layer: torch.nn.Module,
+    prefix: str,
+    linear_method_cls: type,
+):
+    from sglang.srt.layers.linear import LinearBase
+    from sglang.srt.layers.quantization.unquant import (
+        UnquantizedEmbeddingMethod,
+        UnquantizedLinearMethod,
+    )
+    from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+
+    cloned_config = deepcopy(config)
+    parallel_lm_head_quantized = (
+        isinstance(layer, ParallelLMHead) and cloned_config.lm_head_quantized
+    )
+
+    if isinstance(layer, LinearBase) or parallel_lm_head_quantized:
+        # False = skip module, None = no override, else = Positive match
+        if get_dynamic_override(cloned_config, layer_name=prefix) is False:
+            if parallel_lm_head_quantized:
+                return UnquantizedEmbeddingMethod()
+            return UnquantizedLinearMethod()
+
+        if prefix:
+            # Dynamic per module/layer rules may override base config
+            override_config(cloned_config, prefix=prefix)
+
+        return linear_method_cls(cloned_config)
+    return None
+
+
+def get_pack_factor(num_bits):
+    assert 32 % num_bits == 0, f"Unsupported num_bits = {num_bits}"
+    return 32 // num_bits
+
+
+def permute_rows(
+    q_w: torch.Tensor,
+    w_ref: torch.Tensor,
+    group_size: int,
+    test_perm: Optional[torch.Tensor] = None,
+):
+    assert q_w.shape == w_ref.shape
+
+    orig_device = q_w.device
+    k_size, _ = q_w.shape
+
+    g_idx = torch.zeros((k_size,), dtype=torch.int32)
+    for i in range(k_size):
+        g_idx[i] = i // group_size
+
+    # Simulate act_order by doing a random permutation on K
+    rand_perm = test_perm if test_perm is not None else torch.randperm(k_size)
+
+    g_idx = g_idx[rand_perm].contiguous()
+    q_w = q_w[rand_perm, :].contiguous()
+    w_ref = w_ref[rand_perm, :].contiguous()
+
+    return (
+        w_ref.to(device=orig_device),
+        q_w.to(device=orig_device),
+        g_idx.to(device=orig_device),
+        rand_perm.to(device=orig_device),
+    )
+
+
+def pack_cols(
+    q_w: torch.Tensor,
+    num_bits: int,
+    size_k: int,
+    size_n: int,
+):
+    assert q_w.shape == (size_k, size_n)
+
+    pack_factor = get_pack_factor(num_bits)
+    assert size_n % pack_factor == 0
+
+    orig_device = q_w.device
+
+    q_w = q_w.cpu().numpy().astype(numpy.uint32)
+
+    q_res = numpy.zeros((size_k, size_n // pack_factor), dtype=numpy.uint32)
+
+    for i in range(pack_factor):
+        q_res |= q_w[:, i::pack_factor] << num_bits * i
+
+    q_res = torch.from_numpy(q_res.astype(numpy.int32)).to(orig_device)
+    q_res = q_res.contiguous()
+
+    return q_res
+
+
+def pack_rows(
+    q_w: torch.Tensor,
+    num_bits: int,
+    size_k: int,
+    size_n: int,
+):
+    assert q_w.shape == (size_k, size_n)
+
+    pack_factor = get_pack_factor(num_bits)
+    assert size_k % pack_factor == 0
+
+    orig_device = q_w.device
+
+    q_w = q_w.cpu().numpy().astype(numpy.uint32)
+
+    q_res = numpy.zeros((size_k // pack_factor, size_n), dtype=numpy.uint32)
+
+    for i in range(pack_factor):
+        q_res |= q_w[i::pack_factor, :] << num_bits * i
+
+    q_res = torch.from_numpy(q_res.astype(numpy.int32)).to(orig_device)
+    return q_res
+
+
+def unpack_cols(
+    packed_q_w: torch.Tensor,
+    num_bits: int,
+    size_k: int,
+    size_n: int,
+):
+    pack_factor = get_pack_factor(num_bits)
+    assert size_n % pack_factor == 0
+    assert packed_q_w.shape == (
+        size_k,
+        size_n // pack_factor,
+    ), "packed_q_w.shape = {} size_k = {}, size_n = {} pack_Factor = {}".format(
+        packed_q_w.shape, size_k, size_n, pack_factor
+    )
+
+    orig_device = packed_q_w.device
+
+    packed_q_w_cpu = packed_q_w.cpu().numpy().astype(numpy.uint32)
+    q_res = numpy.zeros((size_k, size_n), dtype=numpy.uint32)
+
+    mask = (1 << num_bits) - 1
+    for i in range(pack_factor):
+        vals = packed_q_w_cpu & mask
+        packed_q_w_cpu >>= num_bits
+        q_res[:, i::pack_factor] = vals
+
+    q_res = torch.from_numpy(q_res.astype(numpy.int32)).to(orig_device)
+    q_res = q_res.contiguous()
+
+    return q_res
+
+
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/utils/quant_utils.py
+def quantize_weights(
+    w: torch.Tensor,
+    quant_type: ScalarType,
+    group_size: Optional[int],
+    zero_points: bool = False,
+    ref_zero_points_after_scales: bool = False,
+):
+    assert (
+        quant_type.is_integer()
+    ), "Floating point quantization may work but has not been tested"
+    assert not zero_points or group_size is not None, (
+        "to have group zero points, group_size must be provided "
+        "(-1 group_size is channelwise)"
+    )
+
+    orig_device = w.device
+    orig_type = w.dtype
+    size_k, size_n = w.shape
+
+    assert w.is_floating_point(), "w must be float"
+
+    if group_size == -1:
+        group_size = size_k
+
+    # Reshape to [groupsize, -1]
+    if group_size is not None and group_size < size_k:
+        w = w.reshape((-1, group_size, size_n))
+        w = w.permute(1, 0, 2)
+        w = w.reshape((group_size, -1))
+
+    # Compute scale for each group
+    max_val = torch.max(w, 0, keepdim=True).values
+    min_val = torch.min(w, 0, keepdim=True).values
+
+    max_q_val = quant_type.max()
+    min_q_val = quant_type.min()
+
+    w_s = torch.Tensor([1.0]).to(w.device)  # unscaled case
+    maybe_w_zp = None
+    if group_size is not None:
+        if zero_points:
+            assert not quant_type.is_signed() and quant_type.max() > 0
+            w_s = (max_val - min_val).clamp(min=1e-5) / quant_type.max()
+            maybe_w_zp = (
+                torch.round(torch.abs(min_val / w_s)).clamp(min_q_val, max_q_val).int()
+            )
+        else:
+            # If the bias is such that there are no possible negative/positive
+            #  values, set the max value to inf to avoid divide by 0
+            w_s = torch.max(
+                abs(max_val / (max_q_val if max_q_val != 0 else torch.inf)),
+                abs(min_val / (min_q_val if min_q_val != 0 else torch.inf)),
+            )
+
+    # Quantize
+    w_q = torch.round(w / w_s).int() + (maybe_w_zp if zero_points else 0)
+    w_q = torch.clamp(w_q, min_q_val, max_q_val)
+
+    # Compute ref (dequantized)
+    # For some kernels (namely Machete) the zero-points are applied after the
+    # scales are applied, for this case computing the reference in similar way
+    # allows us to use tighter error tolerances in our unit tests.
+    if ref_zero_points_after_scales and maybe_w_zp is not None:
+        w_ref = w_q.to(orig_type) * w_s - maybe_w_zp.to(orig_type) * w_s
+    else:
+        w_ref = (w_q - (maybe_w_zp if zero_points else 0)).to(orig_type) * w_s
+
+    if quant_type.has_bias():
+        w_q += quant_type.bias
+
+    # Restore original shapes
+    if group_size is not None and group_size < size_k:
+
+        def reshape_w(w):
+            w = w.reshape((group_size, -1, size_n))
+            w = w.permute(1, 0, 2)
+            w = w.reshape((size_k, size_n)).contiguous()
+            return w
+
+        w_q = reshape_w(w_q)
+        w_ref = reshape_w(w_ref)
+        w_s = w_s.reshape((-1, size_n)).contiguous()
+
+    if maybe_w_zp is not None:
+        maybe_w_zp = maybe_w_zp.reshape((-1, size_n)).contiguous()
+        maybe_w_zp = maybe_w_zp.to(device=orig_device)
+
+    return (
+        w_ref.to(device=orig_device),
+        w_q.to(device=orig_device),
+        w_s if group_size is not None else None,
+        maybe_w_zp,
+    )
+
+
+SUPPORTED_GPTQ_QUANT_TYPES = [scalar_types.uint4b8, scalar_types.uint8b128]
+SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128]
+
+
+def gptq_quantize_weights(
+    w: torch.Tensor,
+    quant_type: ScalarType,
+    group_size: int,
+    act_order: bool,
+    test_perm: Optional[torch.Tensor] = None,
+):
+    size_k, _ = w.shape
+
+    assert w.is_floating_point(), "w must be float"
+    assert (
+        quant_type in SUPPORTED_GPTQ_QUANT_TYPES
+    ), f"Unsupported gptq type = {quant_type}"
+    assert group_size in SUPPORTED_GROUP_SIZES + [
+        size_k
+    ], f"Unsupported groupsize = {group_size}"
+
+    w_ref, w_q, w_s, _ = quantize_weights(w, quant_type, group_size)
+
+    # Apply act_order
+    g_idx = torch.empty(0, dtype=torch.int, device=w.device)
+    rand_perm = torch.empty(0, dtype=torch.int, device=w.device)
+    if act_order:
+        assert (
+            group_size < size_k
+        ), "For act_order, groupsize = {} must be less than size_k = {}".format(
+            group_size, size_k
+        )
+
+        w_ref, w_q, g_idx, rand_perm = permute_rows(w_q, w_ref, group_size, test_perm)
+
+    return w_ref, w_q, w_s, g_idx, rand_perm
+
+
+def sort_weights(q_w: torch.Tensor, g_idx: torch.Tensor):
+    orig_device = q_w.device
+
+    sort_indices = torch.argsort(g_idx).to(dtype=torch.int32)  # Sort based on g_idx
+
+    g_idx = g_idx[sort_indices].contiguous()
+    q_w = q_w[sort_indices, :].contiguous()
+
+    return (
+        q_w.to(device=orig_device),
+        g_idx.to(device=orig_device),
+        sort_indices.to(device=orig_device),
+    )
+
+
+def swizzle_blockscale(scale: torch.Tensor):
+    """
+    Swizzle the scale tensor into a blockwise interleaved format for NVFP4 quantization.
+    """
+    assert scale.dtype == torch.float8_e4m3fn
+    # Pad and blockwise interleave weight_scale
+    scale_ndim = scale.ndim
+    if scale.ndim == 2:
+        scale = scale.unsqueeze(0)
+    assert scale.ndim == 3
+    B, M, K = scale.shape
+    round_up_multiple = lambda x, m: (x + m - 1) // m * m
+    M_padded = round_up_multiple(M, 128)
+    K_padded = round_up_multiple(K, 4)
+    padded_scale = torch.zeros((B, M_padded, K_padded), dtype=scale.dtype)
+    padded_scale[:B, :M, :K] = scale
+    batches, rows, cols = padded_scale.shape
+    assert rows % 128 == 0
+    assert cols % 4 == 0
+    padded_scale = padded_scale.reshape(batches, rows // 128, 4, 32, cols // 4, 4)
+    swizzled_scale = padded_scale.permute((0, 1, 4, 3, 2, 5))
+    swizzled_scale = swizzled_scale.contiguous().cuda()
+    return (
+        swizzled_scale.reshape(M_padded, K_padded)
+        if scale_ndim == 2
+        else swizzled_scale.reshape(B, M_padded, K_padded)
+    )
+
+
+def swap_w13_to_w31(x: torch.Tensor) -> torch.Tensor:
+    return (
+        x.reshape(-1, 2, x.shape[-2] // 2, x.shape[-1]).flip(dims=[1]).reshape(x.shape)
+    )
+
+
+def reorder_w1w3_to_w3w1(
+    weight: torch.Tensor, scale: torch.Tensor, dim: int = -2
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Re-order the concatenated `[w1, w3]` tensors to `[w3, w1]`"""
+    size = weight.size(dim)
+    assert size % 2 == 0, f"Expected even size in dim {dim}, got {size}"
+    half = size // 2
+
+    w1, w3 = weight.split(half, dim=dim)
+    s1, s3 = scale.split(half, dim=dim)
+
+    return (
+        torch.cat([w3, w1], dim=dim).contiguous(),
+        torch.cat([s3, s1], dim=dim).contiguous(),
+    )
+
+
+def prepare_static_weights_for_trtllm_fp4_moe(
+    gemm1_weights,
+    gemm2_weights,
+    gemm1_scales_linear_fp4_bytes,
+    gemm2_scales_linear_fp4_bytes,
+    hidden_size,
+    intermediate_size,
+    num_experts,
+):
+    from flashinfer import nvfp4_block_scale_interleave
+    from flashinfer.fused_moe.core import (
+        _maybe_get_cached_w3_w1_permute_indices,
+        get_w2_permute_indices_with_cache,
+    )
+
+    """Prepare quantized weights for kernel (done offline with weights)."""
+    _cache_permute_indices: dict[torch.Size, torch.Tensor] = {}
+    epilogue_tile_m = 128  # FIXME: this depends on the kernel internals
+
+    # Convert quantized weights to proper formats
+    gemm1_weights_fp4 = gemm1_weights.view(torch.float8_e4m3fn).reshape(
+        num_experts, 2 * intermediate_size, hidden_size // 2
+    )  # packed fp4
+    gemm1_scales_linear_fp4 = gemm1_scales_linear_fp4_bytes.view(
+        torch.float8_e4m3fn
+    ).reshape(
+        num_experts, 2 * intermediate_size, hidden_size // 16
+    )  # fp8 scaling factors
+
+    gemm2_weights_fp4 = gemm2_weights.view(torch.float8_e4m3fn).reshape(
+        num_experts, hidden_size, intermediate_size // 2
+    )  # packed fp4
+    gemm2_scales_linear_fp4 = gemm2_scales_linear_fp4_bytes.view(
+        torch.float8_e4m3fn
+    ).reshape(
+        num_experts, hidden_size, intermediate_size // 16
+    )  # fp8 scaling factors
+
+    gemm1_weights_fp4_shuffled = []
+    gemm1_scales_fp4_shuffled = []
+    gemm2_weights_fp4_shuffled = []
+    gemm2_scales_fp4_shuffled = []
+    for i in range(num_experts):
+        # Calculate the permute indices for the following:
+        # 1. Reorder rows of W1 and scales for fused gated activation
+        # 2. Shuffle weights and scaling factors for transposed mma output
+        # for both w3_w1 and w2 weights and scale factors
+        permute_indices = _maybe_get_cached_w3_w1_permute_indices(
+            _cache_permute_indices,
+            gemm1_weights_fp4[i].view(torch.uint8),
+            epilogue_tile_m,
+        )
+        gemm1_weights_fp4_shuffled.append(
+            gemm1_weights_fp4[i]
+            .view(torch.uint8)[permute_indices.to(gemm1_weights_fp4.device)]
+            .contiguous()
+        )
+
+        permute_sf_indices = _maybe_get_cached_w3_w1_permute_indices(
+            _cache_permute_indices,
+            gemm1_scales_linear_fp4[i].view(torch.uint8),
+            epilogue_tile_m,
+            num_elts_per_sf=16,
+        )
+        gemm1_scales_fp4_shuffled.append(
+            nvfp4_block_scale_interleave(
+                gemm1_scales_linear_fp4[i]
+                .view(torch.uint8)[
+                    permute_sf_indices.to(gemm1_scales_linear_fp4.device)
+                ]
+                .contiguous()
+            )
+        )
+
+        permute_indices = get_w2_permute_indices_with_cache(
+            _cache_permute_indices,
+            gemm2_weights_fp4[i].view(torch.uint8),
+            epilogue_tile_m,
+        )
+        gemm2_weights_fp4_shuffled.append(
+            gemm2_weights_fp4[i]
+            .view(torch.uint8)[permute_indices.to(gemm2_weights_fp4.device)]
+            .contiguous()
+        )
+
+        permute_sf_indices = get_w2_permute_indices_with_cache(
+            _cache_permute_indices,
+            gemm2_scales_linear_fp4[i].view(torch.uint8),
+            epilogue_tile_m,
+            num_elts_per_sf=16,
+        )
+        gemm2_scales_fp4_shuffled.append(
+            nvfp4_block_scale_interleave(
+                gemm2_scales_linear_fp4[i]
+                .view(torch.uint8)[
+                    permute_sf_indices.to(gemm2_scales_linear_fp4.device)
+                ]
+                .contiguous()
+            )
+        )
+
+    # Stack weights for all experts
+    gemm1_weights_fp4_shuffled = torch.stack(gemm1_weights_fp4_shuffled)
+    gemm1_scales_fp4_shuffled = (
+        torch.stack(gemm1_scales_fp4_shuffled)
+        .view(torch.float8_e4m3fn)
+        .reshape(num_experts, 2 * intermediate_size, hidden_size // 16)
+    )
+
+    gemm2_weights_fp4_shuffled = torch.stack(gemm2_weights_fp4_shuffled)
+    gemm2_scales_fp4_shuffled = (
+        torch.stack(gemm2_scales_fp4_shuffled)
+        .view(torch.float8_e4m3fn)
+        .reshape(num_experts, hidden_size, intermediate_size // 16)
+    )
+    return (
+        gemm1_weights_fp4_shuffled,
+        gemm1_scales_fp4_shuffled,
+        gemm2_weights_fp4_shuffled,
+        gemm2_scales_fp4_shuffled,
+    )
diff --git a/sglang/python/sglang/srt/layers/quantization/w4afp8.py b/sglang/python/sglang/srt/layers/quantization/w4afp8.py
new file mode 100644
index 0000000000000000000000000000000000000000..83869289ec981ba6983a800b55e6d99a5f750a93
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/w4afp8.py
@@ -0,0 +1,406 @@
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
+
+import torch
+from torch.nn import Module
+from torch.nn.parameter import Parameter
+
+from sglang.srt.layers.quantization.base_config import (
+    FusedMoEMethodBase,
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.fp8 import Fp8LinearMethod
+from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
+from sglang.srt.layers.quantization.utils import is_layer_skipped
+from sglang.srt.utils import set_weight_attrs
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe import MoeRunnerConfig
+    from sglang.srt.layers.moe.ep_moe.layer import DeepEPMoE
+    from sglang.srt.layers.moe.token_dispatcher import (
+        CombineInput,
+        DeepEPLLDispatchOutput,
+        DeepEPNormalDispatchOutput,
+        StandardDispatchOutput,
+    )
+
+ACTIVATION_SCHEMES = ["static", "dynamic"]
+
+logger = logging.getLogger(__name__)
+
+
+class W4AFp8Config(QuantizationConfig):
+    """Config class for MIXED_PRECISION W4AFp8."""
+
+    def __init__(
+        self,
+        is_checkpoint_fp8_serialized: bool = True,
+        is_checkpoint_w4afp8_serialized: bool = True,
+        linear_activation_scheme: str = "dynamic",
+        moe_activation_scheme: str = "static",
+        ignored_layers: Optional[List[str]] = None,
+        weight_block_size: Optional[List[int]] = None,
+        group_size: int = 128,
+    ) -> None:
+        super().__init__()
+        self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
+        self.is_checkpoint_w4afp8_serialized = is_checkpoint_w4afp8_serialized
+        if is_checkpoint_w4afp8_serialized:
+            logger.warning("Detected w4afp8 checkpoint. Please note that")
+        if moe_activation_scheme not in ACTIVATION_SCHEMES:
+            raise ValueError(f"Unsupported activation scheme {moe_activation_scheme}")
+        self.linear_activation_scheme = linear_activation_scheme
+        self.moe_activation_scheme = moe_activation_scheme
+        self.ignored_layers = ignored_layers or []
+        self.weight_block_size = [128, 128]
+        self.group_size = group_size
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "w4afp8"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.bfloat16, torch.float8_e4m3fn]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 90
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return []
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> W4AFp8Config:
+        quant_method = cls.get_from_keys(config, ["quant_method"])
+        is_checkpoint_fp8_serialized = "fp8" in quant_method
+        is_checkpoint_w4afp8_serialized = "w4afp8" in quant_method
+        linear_activation_scheme = "dynamic"
+        moe_activation_scheme = "static"
+        weight_block_size = [128, 128]
+        return cls(
+            is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized,
+            is_checkpoint_w4afp8_serialized=is_checkpoint_w4afp8_serialized,
+            linear_activation_scheme=linear_activation_scheme,
+            moe_activation_scheme=moe_activation_scheme,
+            weight_block_size=weight_block_size,
+        )
+
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional[QuantizeMethodBase]:
+        from sglang.srt.layers.linear import LinearBase
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+
+        if isinstance(layer, LinearBase):
+            if is_layer_skipped(prefix, self.ignored_layers):
+                return UnquantizedLinearMethod()
+            return Fp8LinearMethod(self)
+        elif isinstance(layer, FusedMoE):
+            return W4AFp8MoEMethod(self)
+        return None
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+def interleave_scales(scales: torch.Tensor) -> torch.Tensor:
+    """Interleave scales in groups of 4 similar to TRT-LLM implementation."""
+    s_shape = scales.shape
+    # Reshape to separate groups of 4
+    alignment = 4 if s_shape[2] % 4 == 0 else 1
+    scales_interleaved = scales.reshape(
+        s_shape[0], s_shape[1], (s_shape[2] // alignment), alignment
+    )
+    # Permute dimensions to interleave
+    scales_interleaved = scales_interleaved.permute(0, 2, 1, 3)
+    # Reshape back to original dimensions but with interleaved values
+    scales_interleaved = scales_interleaved.reshape(
+        s_shape[0], s_shape[2] // alignment, s_shape[1] * alignment
+    )
+    return scales_interleaved.contiguous()
+
+
+class W4AFp8MoEMethod(FusedMoEMethodBase):
+    def __init__(self, quant_config: W4AFp8Config):
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        assert "weight_loader" in extra_weight_attrs
+
+        # Fused gate_up_proj (column parallel)
+        w13_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                intermediate_size_per_partition * 2,
+                hidden_size // 2,
+                dtype=torch.int8,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+
+        # down_proj (row parallel)
+        w2_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition // 2,
+                dtype=torch.int8,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight", w2_weight)
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.GROUP.value}
+        )
+        w13_weight_scale = torch.nn.Parameter(
+            torch.zeros(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size // self.quant_config.group_size,
+                dtype=torch.float32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight_scale_inv", w13_weight_scale)
+        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
+
+        w2_weight_scale = torch.nn.Parameter(
+            torch.zeros(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition // self.quant_config.group_size,
+                dtype=torch.float32,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight_scale_inv", w2_weight_scale)
+        set_weight_attrs(w2_weight_scale, extra_weight_attrs)
+
+        # Input scales
+        w13_input_scale = torch.nn.Parameter(
+            torch.ones((num_experts, 2), dtype=torch.bfloat16),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_input_scale", w13_input_scale)
+        set_weight_attrs(w13_input_scale, extra_weight_attrs)
+
+        w2_input_scale = torch.nn.Parameter(
+            torch.ones(num_experts, dtype=torch.bfloat16),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_input_scale", w2_input_scale)
+        set_weight_attrs(w2_input_scale, extra_weight_attrs)
+
+        # Pre-populate the strides
+        device = layer.w13_weight.device
+
+        self.a_strides1 = torch.full(
+            (num_experts, 3),
+            hidden_size,
+            device=device,
+            dtype=torch.int64,
+        )
+        self.c_strides1 = torch.full(
+            (num_experts, 3),
+            2 * intermediate_size_per_partition,
+            device=device,
+            dtype=torch.int64,
+        )
+        self.a_strides2 = torch.full(
+            (num_experts, 3),
+            intermediate_size_per_partition,
+            device=device,
+            dtype=torch.int64,
+        )
+        self.c_strides2 = torch.full(
+            (num_experts, 3),
+            hidden_size,
+            device=device,
+            dtype=torch.int64,
+        )
+        self.b_strides1 = self.a_strides1
+        self.s_strides13 = self.c_strides1
+        self.b_strides2 = self.a_strides2
+        self.s_strides2 = self.c_strides2
+
+        self.expert_offsets = torch.empty(
+            (num_experts + 1), dtype=torch.int32, device=device
+        )
+        self.problem_sizes1 = torch.empty(
+            (num_experts, 3), dtype=torch.int32, device=device
+        )
+        self.problem_sizes2 = torch.empty(
+            (num_experts, 3), dtype=torch.int32, device=device
+        )
+
+        return
+
+    def process_weights_after_loading(self, layer: Module) -> None:
+        dtype = torch.bfloat16
+        device = layer.w2_weight.device
+
+        # Interleave w13_weight_scale (gate_up_proj)
+        w13_weight_scale = layer.w13_weight_scale_inv.to(dtype)
+        w13_weight_scale = interleave_scales(w13_weight_scale)
+        layer.w13_weight_scale_inv = Parameter(w13_weight_scale, requires_grad=False)
+
+        # Interleave w2_weight_scale (down_proj)
+        w2_weight_scale = layer.w2_weight_scale_inv.to(dtype)
+        w2_weight_scale = interleave_scales(w2_weight_scale)
+        layer.w2_weight_scale_inv = Parameter(w2_weight_scale, requires_grad=False)
+
+        # Process input scales
+        w13_input_scale_max = layer.w13_input_scale.max().to(torch.float32).item()
+        new_w13_input_scale = torch.tensor(
+            [w13_input_scale_max],
+            dtype=torch.float32,
+            device=device,
+        )
+        layer.w13_input_scale = Parameter(new_w13_input_scale, requires_grad=False)
+
+        w2_input_scale_max = layer.w2_input_scale.max().to(torch.float32).item()
+        new_w2_input_scale = torch.tensor(
+            [w2_input_scale_max], dtype=torch.float32, device=device
+        )
+        layer.w2_input_scale = Parameter(new_w2_input_scale, requires_grad=False)
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+
+    def apply(
+        self,
+        layer: Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+
+        from sglang.srt.layers.moe.cutlass_w4a8_moe import cutlass_w4a8_moe
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+        topk_weights, topk_ids, _ = topk_output
+
+        output = cutlass_w4a8_moe(
+            x,
+            layer.w13_weight,
+            layer.w2_weight,
+            layer.w13_weight_scale_inv,
+            layer.w2_weight_scale_inv,
+            topk_weights,
+            topk_ids,
+            self.a_strides1,
+            self.b_strides1,
+            self.c_strides1,
+            self.a_strides2,
+            self.b_strides2,
+            self.c_strides2,
+            self.s_strides13,
+            self.s_strides2,
+            self.expert_offsets,
+            self.problem_sizes1,
+            self.problem_sizes2,
+            layer.w13_input_scale,
+            layer.w2_input_scale,
+            routed_scaling_factor=self.moe_runner_config.routed_scaling_factor or 1.0,
+        )
+        return StandardCombineInput(hidden_states=output)
+
+    def apply_deepep_ll(
+        self,
+        layer: DeepEPMoE,
+        dispatch_output: DeepEPLLDispatchOutput,
+    ) -> torch.Tensor:
+
+        from sglang.srt.layers.moe.cutlass_w4a8_moe import cutlass_w4a8_moe_deepep_ll
+
+        hidden_states, _, topk_ids, _, masked_m, _ = dispatch_output
+
+        output = cutlass_w4a8_moe_deepep_ll(
+            hidden_states,
+            layer.w13_weight,
+            layer.w2_weight,
+            layer.w13_weight_scale_inv,
+            layer.w2_weight_scale_inv,
+            topk_ids,
+            masked_m,
+            layer.quant_method.a_strides1,
+            layer.quant_method.b_strides1,
+            layer.quant_method.c_strides1,
+            layer.quant_method.a_strides2,
+            layer.quant_method.b_strides2,
+            layer.quant_method.c_strides2,
+            layer.quant_method.s_strides13,
+            layer.quant_method.s_strides2,
+            layer.quant_method.expert_offsets,
+            layer.quant_method.problem_sizes1,
+            layer.quant_method.problem_sizes2,
+            layer.w13_input_scale,
+            layer.w2_input_scale,
+        )
+
+        return output
+
+    def apply_deepep_normal(
+        self,
+        layer: DeepEPMoE,
+        dispatch_output: DeepEPNormalDispatchOutput,
+    ) -> torch.Tensor:
+        from sglang.srt.layers.moe.cutlass_w4a8_moe import (
+            cutlass_w4a8_moe_deepep_normal,
+        )
+
+        hidden_states, topk_idx, topk_weights = (
+            dispatch_output.hidden_states,
+            dispatch_output.topk_ids,
+            dispatch_output.topk_weights,
+        )
+        if isinstance(hidden_states, tuple):
+            hidden_states = hidden_states[0]
+
+        num_tokens = hidden_states.shape[0]
+        if num_tokens > 0:
+            return cutlass_w4a8_moe_deepep_normal(
+                hidden_states,
+                layer.w13_weight,
+                layer.w2_weight,
+                layer.w13_weight_scale_inv,
+                layer.w2_weight_scale_inv,
+                topk_weights,
+                topk_idx,
+                self.a_strides1,
+                self.b_strides1,
+                self.c_strides1,
+                self.a_strides2,
+                self.b_strides2,
+                self.c_strides2,
+                self.s_strides13,
+                self.s_strides2,
+                self.expert_offsets,
+                self.problem_sizes1,
+                self.problem_sizes2,
+                layer.w13_input_scale,
+                layer.w2_input_scale,
+            )
+        else:
+            return hidden_states
diff --git a/sglang/python/sglang/srt/layers/quantization/w8a8_fp8.py b/sglang/python/sglang/srt/layers/quantization/w8a8_fp8.py
new file mode 100644
index 0000000000000000000000000000000000000000..808e3e822f66091dab17104dfb5909dd17aed081
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/w8a8_fp8.py
@@ -0,0 +1,305 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
+
+import torch
+from torch.nn.parameter import Parameter
+
+from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig
+from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo
+from sglang.srt.layers.parameter import ChannelQuantScaleParameter, ModelWeightParameter
+from sglang.srt.layers.quantization.base_config import (
+    FusedMoEMethodBase,
+    LinearMethodBase,
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.fp8_kernel import (
+    fp8_dtype,
+    is_fp8_fnuz,
+    per_token_group_quant_fp8,
+)
+from sglang.srt.layers.quantization.fp8_utils import (
+    apply_fp8_linear,
+    cutlass_fp8_supported,
+    input_to_float8,
+    normalize_e4m3fn_to_e4m3fnuz,
+)
+from sglang.srt.utils import set_weight_attrs
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.token_dispatcher import (
+        CombineInput,
+        StandardDispatchOutput,
+    )
+
+_is_fp8_fnuz = is_fp8_fnuz()
+
+
+class W8A8Fp8Config(QuantizationConfig):
+    """Config class for W8A8 FP8 Quantization.
+
+    Weight Quantization:
+    - Method: Static quantization
+    - Granularity: Per-channel
+    - Type: Symmetric
+
+    Activation Quantization:
+    - Method: Dynamic quantization
+    - Granularity: Per-token
+    - Type: Symmetric
+
+    Note:
+    - For models without offline quantization, weights will be quantized during model loading
+    - If CUTLASS is supported: Per-channel weight quantization is used
+    - If CUTLASS is not supported: Falls back to per-tensor weight quantization
+    """
+
+    def __init__(self, is_checkpoint_fp8_serialized: bool = False):
+        self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.float16, torch.bfloat16]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 89
+
+    @classmethod
+    def get_name(self) -> str:
+        return "w8a8_fp8"
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return []
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> W8A8Fp8Config:
+        quant_method = cls.get_from_keys(config, ["quant_method"])
+        is_checkpoint_fp8_serialized = (
+            "compressed-tensors" in quant_method or "w8a8_fp8" in quant_method
+        )
+        return cls(is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized)
+
+    def get_quant_method(
+        self,
+        layer: torch.nn.Module,
+        prefix: str,
+    ) -> Optional[QuantizeMethodBase]:
+        from sglang.srt.layers.linear import LinearBase
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+
+        if isinstance(layer, LinearBase):
+            return W8A8Fp8LinearMethod(self)
+        elif isinstance(layer, FusedMoE):
+            return W8A8FP8MoEMethod(self)
+        return None
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+class W8A8Fp8LinearMethod(LinearMethodBase):
+
+    def __init__(self, quantization_config: W8A8Fp8Config):
+        self.cutlass_fp8_supported = cutlass_fp8_supported()
+        self.quantization_config = quantization_config
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        weight = layer.weight
+
+        if self.quantization_config.is_checkpoint_fp8_serialized:
+            weight_scale = layer.weight_scale.detach()
+            # If checkpoint offline quantized with w8a8_fp8, load the weight and weight_scale directly.
+            if _is_fp8_fnuz:
+                weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                    weight=weight, weight_scale=weight_scale
+                )
+
+            layer.weight = Parameter(weight.t(), requires_grad=False)
+            layer.weight_scale = Parameter(weight_scale, requires_grad=False)
+        else:
+            # If checkpoint not offline quantized, quantize the weights with per-channel quantization.
+            if self.cutlass_fp8_supported:
+                # if cutlass supported, we use cutlass_scaled_mm
+                # which requires per-channel quantization on weight
+                qweight, weight_scale = per_token_group_quant_fp8(
+                    layer.weight, layer.weight.shape[-1]
+                )
+                weight_scale = weight_scale.t().contiguous()
+            else:
+                # if cutlass not supported, we fall back to use torch._scaled_mm
+                # which requires per tensor quantization on weight
+                qweight, weight_scale = input_to_float8(layer.weight, dtype=fp8_dtype)
+
+            # Update the layer with the new values.
+            layer.weight = Parameter(qweight.t(), requires_grad=False)
+            layer.weight_scale = Parameter(weight_scale, requires_grad=False)
+            layer.input_scale = None
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        weight_dtype = (
+            torch.float8_e4m3fn
+            if self.quantization_config.is_checkpoint_fp8_serialized
+            else params_dtype
+        )
+
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        self.logical_widths = output_partition_sizes
+
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                sum(output_partition_sizes),
+                input_size_per_partition,
+                dtype=weight_dtype,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight", weight)
+
+        if self.quantization_config.is_checkpoint_fp8_serialized:
+            weight_scale = ChannelQuantScaleParameter(
+                data=torch.empty((sum(output_partition_sizes), 1), dtype=torch.float32),
+                output_dim=0,
+                weight_loader=weight_loader,
+            )
+            layer.register_parameter("weight_scale", weight_scale)
+        else:
+            layer.weight_scale = None
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ):
+        return apply_fp8_linear(
+            x,
+            layer.weight,
+            layer.weight_scale,
+            bias=bias,
+            cutlass_fp8_supported=self.cutlass_fp8_supported,
+        )
+
+
+class W8A8FP8MoEMethod(FusedMoEMethodBase):
+    """MoE method for FP8.
+    Supports loading FP8 checkpoints with static weight scale and
+    dynamic/static activation scale.
+    Also supports loading quantized FP16/BF16 model checkpoints with dynamic
+    activation scaling. The weight scaling factor will be initialized after
+    the model weights are loaded.
+    Args:
+        quant_config: The quantization config.
+    """
+
+    def __init__(self, quant_config: W8A8Fp8Config):
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        # WEIGHTS
+        w13_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size,
+                dtype=fp8_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+
+        w2_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition,
+                dtype=fp8_dtype,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight", w2_weight)
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+
+        w13_weight_scale = torch.nn.Parameter(
+            torch.ones(
+                num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32
+            ),
+            requires_grad=False,
+        )
+        w2_weight_scale = torch.nn.Parameter(
+            torch.ones(num_experts, hidden_size, 1, dtype=torch.float32),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight_scale", w13_weight_scale)
+        layer.register_parameter("w2_weight_scale", w2_weight_scale)
+
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
+        )
+
+        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
+        set_weight_attrs(w2_weight_scale, extra_weight_attrs)
+
+        w13_input_scale = None
+        layer.register_parameter("w13_input_scale", w13_input_scale)
+
+        w2_input_scale = None
+        layer.register_parameter("w2_input_scale", w2_input_scale)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        layer.w13_weight = Parameter(layer.w13_weight, requires_grad=False)
+        layer.w2_weight = Parameter(layer.w2_weight, requires_grad=False)
+        layer.w13_weight_scale = Parameter(
+            layer.w13_weight_scale.data, requires_grad=False
+        )
+        layer.w2_weight_scale = Parameter(
+            layer.w2_weight_scale.data, requires_grad=False
+        )
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+        self.runner = MoeRunner(MoeRunnerBackend.TRITON, moe_runner_config)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> CombineInput:
+
+        quant_info = TritonMoeQuantInfo(
+            w13_weight=layer.w13_weight,
+            w2_weight=layer.w2_weight,
+            use_fp8_w8a8=True,
+            per_channel_quant=True,
+            w13_scale=layer.w13_weight_scale,
+            w2_scale=layer.w2_weight_scale,
+            a13_scale=layer.w13_input_scale,
+            a2_scale=layer.w2_input_scale,
+        )
+        return self.runner.run(dispatch_output, quant_info)
diff --git a/sglang/python/sglang/srt/layers/quantization/w8a8_int8.py b/sglang/python/sglang/srt/layers/quantization/w8a8_int8.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb12bc4fe024fea52383582f88c7f1f5222a356b
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/quantization/w8a8_int8.py
@@ -0,0 +1,378 @@
+from __future__ import annotations
+
+import logging
+from types import MappingProxyType
+from typing import TYPE_CHECKING, Any, Dict, List, Mapping, Optional, cast
+
+import torch
+from torch.nn.parameter import Parameter
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.amx_utils import (
+    CPUQuantMethod,
+    _amx_process_weight_after_loading,
+)
+from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig
+from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo
+from sglang.srt.layers.parameter import ChannelQuantScaleParameter, ModelWeightParameter
+from sglang.srt.layers.quantization.base_config import (
+    FusedMoEMethodBase,
+    LinearMethodBase,
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.compressed_tensors.utils import should_ignore_layer
+from sglang.srt.layers.quantization.int8_kernel import per_token_quant_int8
+from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
+from sglang.srt.utils import (
+    cpu_has_amx_support,
+    is_cpu,
+    is_cuda,
+    set_weight_attrs,
+    use_intel_amx_backend,
+)
+from sglang.srt.utils.patch_torch import register_fake_if_exists
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.token_dispatcher import StandardDispatchOutput
+
+_is_cuda = is_cuda()
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
+
+if _is_cuda:
+    from sgl_kernel import int8_scaled_mm
+
+    @register_fake_if_exists("sgl_kernel::int8_scaled_mm")
+    def _int8_scaled_mm_abstract(
+        mat_a,
+        mat_b,
+        scales_a,
+        scales_b,
+        out_dtype,
+        bias=None,
+    ):
+        M = mat_a.shape[-2]
+        N = mat_b.shape[-1]
+        return mat_a.new_empty((M, N), dtype=out_dtype)
+
+
+logger = logging.getLogger(__name__)
+
+
+class W8A8Int8Config(QuantizationConfig):
+    """Config class for W8A8 Quantization.
+
+    - Weight: static, per-channel, symmetric
+    - Activation: dynamic, per-token, symmetric
+    """
+
+    def __init__(self, quant_config: Dict[str, Any] = {}):
+        super().__init__()
+        self.quant_description = quant_config
+        self.is_dynamic = quant_config.get("is_dynamic", False)
+        ignore = cast(List[str], quant_config.get("ignore", []))
+        self.ignore = ignore if ignore is not None else []
+        packed_modules_mapping = quant_config.get("packed_modules_mapping", {})
+        self.packed_modules_mapping = (
+            packed_modules_mapping if packed_modules_mapping is not None else {}
+        )
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.float16, torch.bfloat16]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 75
+
+    @classmethod
+    def get_name(self) -> str:
+        return "w8a8_int8"
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        filenames = []
+        return filenames
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> W8A8Int8Config:
+        return cls(config)
+
+    def get_quant_method(
+        self,
+        layer: torch.nn.Module,
+        prefix: str,
+    ) -> Optional[QuantizeMethodBase]:
+        from sglang.srt.layers.linear import LinearBase
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+
+        if should_ignore_layer(
+            prefix, ignore=self.ignore, fused_mapping=self.packed_modules_mapping
+        ):
+            return UnquantizedLinearMethod()
+        if isinstance(layer, LinearBase):
+            return W8A8Int8LinearMethod(self)
+        elif isinstance(layer, FusedMoE):
+            return W8A8Int8MoEMethod(self)
+        return None
+
+    def is_layer_skipped(
+        self, prefix: str, fused_mapping: Mapping[str, List[str]] = MappingProxyType({})
+    ):
+        # adapted from vllm.model_executor.layers.quantization.utils.quant_utils.is_layer_skipped
+        proj_name = prefix.split(".")[-1]
+        if proj_name in fused_mapping:
+            shard_prefixes = [
+                prefix.replace(proj_name, shard_proj_name)
+                for shard_proj_name in fused_mapping[proj_name]
+            ]
+
+            is_skipped = None
+            for shard_prefix in shard_prefixes:
+                is_shard_skipped = (
+                    self.quant_description[shard_prefix + ".weight"] == "FLOAT"
+                )
+
+                if is_skipped is None:
+                    is_skipped = is_shard_skipped
+                elif is_shard_skipped != is_skipped:
+                    raise ValueError(
+                        f"Detected some but not all shards of {prefix} "
+                        "are quantized. All shards of fused layers "
+                        "to have the same precision."
+                    )
+        else:
+            is_skipped = self.quant_description[prefix + ".weight"] == "FLOAT"
+
+        assert is_skipped is not None
+        return is_skipped
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+class W8A8Int8LinearMethod(LinearMethodBase):
+
+    def __init__(self, quantization_config: W8A8Int8Config):
+        self.quantization_config = quantization_config
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        if _is_cpu:
+            assert (
+                _is_cpu_amx_available
+            ), "W8A8Int8LinearMethod on CPU requires that CPU has AMX support"
+            _amx_process_weight_after_loading(layer, ["weight"])
+        else:
+            layer.weight = Parameter(layer.weight.t(), requires_grad=False)
+        layer.weight_scale = Parameter(layer.weight_scale.data, requires_grad=False)
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        self.logical_widths = output_partition_sizes
+
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                sum(output_partition_sizes), input_size_per_partition, dtype=torch.int8
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight", weight)
+
+        weight_scale = ChannelQuantScaleParameter(
+            data=torch.empty((sum(output_partition_sizes), 1), dtype=torch.float32),
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight_scale", weight_scale)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ):
+        if use_intel_amx_backend(layer):
+            return torch.ops.sgl_kernel.int8_scaled_mm_with_quant(
+                x,
+                layer.weight,
+                layer.weight_scale,
+                bias,
+                x.dtype,
+                True,  # is_vnni
+            )
+        x_q, x_scale = per_token_quant_int8(x)
+
+        x_q_2d = x_q.view(-1, x_q.shape[-1])
+        x_scale_2d = x_scale.view(-1, x_scale.shape[-1])
+        output_shape = [*x_q.shape[:-1], layer.weight.shape[1]]
+
+        output = int8_scaled_mm(
+            x_q_2d,
+            layer.weight,
+            x_scale_2d,
+            layer.weight_scale,
+            out_dtype=x.dtype,
+            bias=bias,
+        )
+
+        return output.view(output_shape)
+
+
+class W8A8Int8MoEMethod(FusedMoEMethodBase):
+    """MoE method for INT8.
+    Supports loading INT8 checkpoints with static weight scale and
+    dynamic/static activation scale.
+    Also supports loading quantized FP16/BF16 model checkpoints with dynamic
+    activation scaling. The weight scaling factor will be initialized after
+    the model weights are loaded.
+    Args:
+        quant_config: The quantization config.
+    """
+
+    def __init__(self, quant_config: W8A8Int8Config):
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        tp_size = get_tensor_model_parallel_world_size()
+
+        # WEIGHTS
+        w13_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                hidden_size,
+                dtype=torch.int8,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+        set_weight_attrs(w13_weight, extra_weight_attrs)
+
+        w2_weight = torch.nn.Parameter(
+            torch.empty(
+                num_experts,
+                hidden_size,
+                intermediate_size_per_partition,
+                dtype=torch.int8,
+            ),
+            requires_grad=False,
+        )
+        layer.register_parameter("w2_weight", w2_weight)
+        set_weight_attrs(w2_weight, extra_weight_attrs)
+
+        w13_weight_scale = torch.nn.Parameter(
+            torch.ones(
+                num_experts, 2 * intermediate_size_per_partition, 1, dtype=torch.float32
+            ),
+            requires_grad=False,
+        )
+        w2_weight_scale = torch.nn.Parameter(
+            torch.ones(num_experts, hidden_size, 1, dtype=torch.float32),
+            requires_grad=False,
+        )
+        layer.register_parameter("w13_weight_scale", w13_weight_scale)
+        layer.register_parameter("w2_weight_scale", w2_weight_scale)
+
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
+        )
+
+        set_weight_attrs(w13_weight_scale, extra_weight_attrs)
+        set_weight_attrs(w2_weight_scale, extra_weight_attrs)
+
+        w13_input_scale = None
+        layer.register_parameter("w13_input_scale", w13_input_scale)
+
+        w2_input_scale = None
+        layer.register_parameter("w2_input_scale", w2_input_scale)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        if _is_cpu:
+            assert (
+                _is_cpu_amx_available
+            ), "W8A8Int8MoEMethod on CPU requires that CPU has AMX support"
+            _amx_process_weight_after_loading(layer, ["w13_weight", "w2_weight"])
+        else:
+            layer.w13_weight = Parameter(layer.w13_weight, requires_grad=False)
+            layer.w2_weight = Parameter(layer.w2_weight, requires_grad=False)
+        layer.w13_weight_scale = Parameter(
+            layer.w13_weight_scale.data, requires_grad=False
+        )
+        layer.w2_weight_scale = Parameter(
+            layer.w2_weight_scale.data, requires_grad=False
+        )
+
+    def create_moe_runner(
+        self, layer: torch.nn.Module, moe_runner_config: MoeRunnerConfig
+    ):
+        self.moe_runner_config = moe_runner_config
+        self.runner = MoeRunner(MoeRunnerBackend.TRITON, moe_runner_config)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        dispatch_output: StandardDispatchOutput,
+    ) -> torch.Tensor:
+        from sglang.srt.layers.moe.token_dispatcher import StandardCombineInput
+
+        x = dispatch_output.hidden_states
+        topk_output = dispatch_output.topk_output
+
+        if use_intel_amx_backend(layer):
+            from sglang.srt.layers.moe.topk import apply_topk_weights_cpu
+
+            topk_weights, topk_ids, _ = topk_output
+            x, topk_weights = apply_topk_weights_cpu(
+                self.moe_runner_config.apply_router_weight_on_input, topk_weights, x
+            )
+            output = torch.ops.sgl_kernel.fused_experts_cpu(
+                x,
+                layer.w13_weight,
+                layer.w2_weight,
+                topk_weights,
+                topk_ids,
+                False,  # inplace See [Note] inplace should be False in fused_experts.
+                CPUQuantMethod.INT8_W8A8,
+                layer.w13_weight_scale,  # w1_scale
+                layer.w2_weight_scale,  # w2_scale
+                None,  # w1_zp
+                None,  # w2_zp
+                None,  # block_size
+                True,  # is_vnni
+            )
+            return StandardCombineInput(hidden_states=output)
+
+        quant_info = TritonMoeQuantInfo(
+            w13_weight=layer.w13_weight,
+            w2_weight=layer.w2_weight,
+            use_int8_w8a8=True,
+            per_channel_quant=True,
+            w13_scale=layer.w13_weight_scale,
+            w2_scale=layer.w2_weight_scale,
+            a13_scale=layer.w13_input_scale,
+            a2_scale=layer.w2_input_scale,
+        )
+        return self.runner.run(dispatch_output, quant_info)
diff --git a/sglang/python/sglang/srt/layers/radix_attention.py b/sglang/python/sglang/srt/layers/radix_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..449bc867067e588b2d88f1dc177de42bc3a53a93
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/radix_attention.py
@@ -0,0 +1,173 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Radix attention."""
+
+from __future__ import annotations
+
+from enum import Enum
+from typing import TYPE_CHECKING, Optional
+
+import torch
+from torch import nn
+
+from sglang.srt.compilation.compilation_config import register_split_op
+from sglang.srt.compilation.piecewise_context_manager import get_forward_context
+from sglang.srt.utils.custom_op import register_custom_op
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.quantization.base_config import QuantizationConfig
+    from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+
+
+class AttentionType(Enum):
+    """
+    Attention type.
+    Use string to be compatible with `torch.compile`.
+    """
+
+    # Decoder attention between previous layer Q/K/V
+    DECODER = "decoder"
+    # Decoder bidirectional attention between image tokens
+    DECODER_BIDIRECTIONAL = "decoder_bidirectional"
+    # Encoder attention between previous layer Q/K/V
+    ENCODER_ONLY = "encoder_only"
+
+
+class RadixAttention(nn.Module):
+    """
+    The attention layer implementation.
+    """
+
+    def __init__(
+        self,
+        num_heads: int,
+        head_dim: int,
+        scaling: float,
+        num_kv_heads: int,
+        layer_id: int,
+        logit_cap: float = 0.0,
+        v_head_dim: int = -1,
+        sliding_window_size: int = -1,
+        is_cross_attention: bool = False,
+        pos_encoding_mode: str = "NONE",
+        logit_capping_method: str = "tanh",
+        quant_config: Optional[QuantizationConfig] = None,
+        attn_type: AttentionType = AttentionType.DECODER,
+        use_irope: bool = False,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.tp_q_head_num = num_heads
+        self.tp_k_head_num = num_kv_heads
+        self.tp_v_head_num = num_kv_heads
+        self.head_dim = head_dim
+        self.qk_head_dim = head_dim
+        self.v_head_dim = v_head_dim if v_head_dim != -1 else head_dim
+        self.scaling = scaling
+        self.layer_id = layer_id
+        self.logit_cap = logit_cap
+        self.sliding_window_size = sliding_window_size or -1
+        self.is_cross_attention = is_cross_attention
+        self.use_irope = use_irope
+        self.k_scale = None
+        self.v_scale = None
+        self.k_scale_float = None
+        self.v_scale_float = None
+        self.quant_method = None
+
+        if quant_config is not None:
+            self.quant_method = quant_config.get_quant_method(self, prefix=prefix)
+        if self.quant_method is not None:
+            self.quant_method.create_weights(self)
+        self.attn_type = attn_type
+
+        self.pos_encoding_mode = pos_encoding_mode
+        self.logit_capping_method = logit_capping_method
+        self.xai_temperature_len = -1
+
+    def forward(
+        self,
+        q,
+        k,
+        v,
+        forward_batch: ForwardBatch,
+        save_kv_cache: bool = True,
+        **kwargs,
+    ):
+        if k is not None:
+            # For cross-layer sharing, kv can be None
+            assert v is not None
+            if "k_rope" not in kwargs:
+                k = k.view(-1, self.tp_k_head_num, self.qk_head_dim)
+                v = v.view(-1, self.tp_v_head_num, self.v_head_dim)
+            else:
+                k = k.view(-1, self.tp_k_head_num, self.v_head_dim)
+
+        if forward_batch.forward_mode.is_extend() and get_forward_context() is not None:
+            if self.qk_head_dim != self.v_head_dim:
+                output = q.new_empty((q.shape[0], self.tp_q_head_num * self.v_head_dim))
+            else:
+                output = torch.empty_like(q)
+            unified_attention_with_output(
+                q, k, v, output, save_kv_cache, self.layer_id, **kwargs
+            )
+            return output
+        else:
+            return forward_batch.attn_backend.forward(
+                q,
+                k,
+                v,
+                self,
+                forward_batch,
+                save_kv_cache,
+                **kwargs,
+            )
+
+
+@register_custom_op(mutates_args=["output"])
+@register_split_op()
+def unified_attention_with_output(
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    output: torch.Tensor,
+    save_kv_cache: bool,
+    layer_id: int,
+    *,
+    q_rope: Optional[torch.Tensor] = None,
+    k_rope: Optional[torch.Tensor] = None,
+    sinks: Optional[torch.Tensor] = None,
+) -> None:
+    context = get_forward_context()
+    forward_batch = context.forward_batch
+    attention_layers = context.attention_layers
+    attention_layer = attention_layers[layer_id]
+
+    kwargs = {}
+    if q_rope is not None:
+        kwargs["q_rope"] = q_rope
+    if k_rope is not None:
+        kwargs["k_rope"] = k_rope
+    if sinks is not None:
+        kwargs["sinks"] = sinks
+
+    ret = forward_batch.attn_backend.forward(
+        query, key, value, attention_layer, forward_batch, save_kv_cache, **kwargs
+    )
+    assert (
+        output.numel() == ret.numel()
+    ), f"Output tensor element mismatch: {output.numel()} != {ret.numel()}"
+
+    output.view(ret.shape).copy_(ret)
+    return
diff --git a/sglang/python/sglang/srt/layers/radix_linear_attention.py b/sglang/python/sglang/srt/layers/radix_linear_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..89821e774460d5da6b0c2e9c32439663a80508b7
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/radix_linear_attention.py
@@ -0,0 +1,134 @@
+# Copyright 2025-2026 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Radix linear attention."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Optional, Tuple, Union
+
+import torch
+from torch import nn
+
+from sglang.srt.compilation.compilation_config import register_split_op
+from sglang.srt.compilation.piecewise_context_manager import get_forward_context
+from sglang.srt.utils.custom_op import register_custom_op
+
+if TYPE_CHECKING:
+    from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+
+
+class RadixLinearAttention(nn.Module):
+    """
+    The Linear Attention Layer Implementation.
+    """
+
+    def __init__(
+        self,
+        layer_id: int,
+        num_q_heads: int,
+        num_k_heads: int,
+        num_v_heads: int,
+        head_q_dim: int,
+        head_k_dim: int,
+        head_v_dim: int,
+        # GDN KDA Shared Weights
+        conv_weights: Optional[Union[torch.Tensor, Tuple[torch.Tensor, ...]]] = None,
+        bias: Optional[Union[torch.Tensor, Tuple[torch.Tensor, ...]]] = None,
+        activation: str = "silu",
+        A_log: Optional[torch.Tensor] = None,
+        dt_bias: Optional[torch.Tensor] = None,
+    ):
+        super().__init__()
+        self.layer_id = layer_id
+        self.num_q_heads = num_q_heads
+        self.num_k_heads = num_k_heads
+        self.num_v_heads = num_v_heads
+        self.head_q_dim = head_q_dim
+        self.head_k_dim = head_k_dim
+        self.head_v_dim = head_v_dim
+        self.q_dim = num_q_heads * head_q_dim
+        self.k_dim = num_k_heads * head_k_dim
+        self.v_dim = num_v_heads * head_v_dim
+
+        self.conv_weights = conv_weights
+        self.bias = bias
+        self.activation = activation
+
+        self.A_log = A_log
+        self.dt_bias = dt_bias
+
+    def forward(
+        self,
+        forward_batch: ForwardBatch,
+        mixed_qkv: torch.Tensor,
+        a: torch.Tensor,
+        b: torch.Tensor,
+    ) -> torch.Tensor:
+        if forward_batch.forward_mode.is_extend() and get_forward_context() is not None:
+            # Output shape from linear attention: (1, seq_len, num_v_heads, head_v_dim)
+            seq_len = mixed_qkv.shape[0]
+            output = torch.empty(
+                (1, seq_len, self.num_v_heads, self.head_v_dim),
+                dtype=mixed_qkv.dtype,
+                device=mixed_qkv.device,
+            )
+            unified_linear_attention_with_output(
+                mixed_qkv,
+                a,
+                b,
+                output,
+                self.layer_id,
+            )
+            return output
+        else:
+            return forward_batch.attn_backend.forward(
+                layer=self,
+                forward_batch=forward_batch,
+                mixed_qkv=mixed_qkv,
+                a=a,
+                b=b,
+            )
+
+
+@register_custom_op(mutates_args=["output"])
+@register_split_op()
+def unified_linear_attention_with_output(
+    mixed_qkv: torch.Tensor,
+    a: torch.Tensor,
+    b: torch.Tensor,
+    output: torch.Tensor,
+    layer_id: int,
+) -> None:
+    """
+    Custom op wrapper for linear attention computation only.
+    """
+    context = get_forward_context()
+    forward_batch = context.forward_batch
+    attention_layers = context.attention_layers
+    attention_layer = attention_layers[layer_id]
+
+    ret = forward_batch.attn_backend.forward(
+        layer=attention_layer,
+        forward_batch=forward_batch,
+        mixed_qkv=mixed_qkv,
+        a=a,
+        b=b,
+    )
+
+    assert (
+        output.numel() == ret.numel()
+    ), f"Output tensor element mismatch: {output.numel()} != {ret.numel()}"
+
+    output.view(ret.shape).copy_(ret)
+    return
diff --git a/sglang/python/sglang/srt/layers/rocm_linear_utils.py b/sglang/python/sglang/srt/layers/rocm_linear_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c8a6a367e54e9b1446fa9f676ec459fcc223c10
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/rocm_linear_utils.py
@@ -0,0 +1,45 @@
+import torch
+from aiter.ops.triton.fused_kv_cache import fused_qk_rope_cat_and_cache_mla
+from aiter.ops.triton.fused_qk_concat import fused_qk_rope_cat
+from aiter.ops.triton.gemm_a16w16 import gemm_a16w16
+from aiter.ops.triton.gemm_a16w16_atomic import gemm_a16w16_atomic
+
+from sglang.srt.utils import BumpAllocator
+
+__all__ = ["fused_qk_rope_cat", "fused_qk_rope_cat_and_cache_mla"]
+
+
+def aiter_dsv3_router_gemm(
+    hidden_states: torch.Tensor,
+    weight: torch.Tensor,
+    gemm_output_zero_allocator: BumpAllocator = None,
+):
+    M = hidden_states.shape[0]
+    N = weight.shape[0]
+    y = None
+
+    if M <= 256:
+        # TODO (cagri): convert to bfloat16 as part of another kernel to save time
+        # for now it is also coupled with zero allocator.
+        if gemm_output_zero_allocator != None:
+            y = gemm_output_zero_allocator.allocate(M * N).view(M, N)
+        else:
+            y = torch.zeros((M, N), dtype=torch.float32, device=hidden_states.device)
+
+    if y is not None:
+        logits = gemm_a16w16_atomic(hidden_states, weight, y=y).to(hidden_states.dtype)
+    else:
+        logits = gemm_a16w16(hidden_states, weight)
+
+    return logits
+
+
+def get_dsv3_gemm_output_zero_allocator_size(
+    n_routed_experts: int, num_moe_layers: int, allocate_size: int, embedding_dim: int
+):
+    if embedding_dim != 7168 or n_routed_experts != 256:
+        return 0
+
+    per_layer_size = 256 * (allocate_size + n_routed_experts)
+
+    return num_moe_layers * per_layer_size
diff --git a/sglang/python/sglang/srt/layers/rotary_embedding/__init__.py b/sglang/python/sglang/srt/layers/rotary_embedding/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..97890ef9624fb19d7e1c4ec44686d7d2ad41461c
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/rotary_embedding/__init__.py
@@ -0,0 +1,31 @@
+# Adapted from https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.6.6.post1/vllm/model_executor/layers/rotary_embedding.py
+"""Rotary Positional Embeddings - public API (drop-in replacement for rotary_embedding.py)."""
+
+from sglang.srt.layers.rotary_embedding.base import RotaryEmbedding
+from sglang.srt.layers.rotary_embedding.factory import get_rope, get_rope_wrapper
+from sglang.srt.layers.rotary_embedding.mrope import (
+    Ernie4_5_VLRotaryEmbedding,
+    MRotaryEmbedding,
+)
+from sglang.srt.layers.rotary_embedding.utils import apply_rotary_pos_emb
+from sglang.srt.layers.rotary_embedding.yarn import (
+    yarn_find_correction_range,
+    yarn_get_mscale_simple,
+    yarn_linear_ramp_mask,
+)
+
+_yarn_find_correction_range = yarn_find_correction_range
+_yarn_get_mscale = yarn_get_mscale_simple
+_yarn_linear_ramp_mask = yarn_linear_ramp_mask
+
+__all__ = [
+    "RotaryEmbedding",
+    "get_rope",
+    "get_rope_wrapper",
+    "MRotaryEmbedding",
+    "Ernie4_5_VLRotaryEmbedding",
+    "apply_rotary_pos_emb",
+    "_yarn_find_correction_range",
+    "_yarn_get_mscale",
+    "_yarn_linear_ramp_mask",
+]
diff --git a/sglang/python/sglang/srt/layers/rotary_embedding/__pycache__/__init__.cpython-311.pyc b/sglang/python/sglang/srt/layers/rotary_embedding/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..da159cb8d9135f7bd6640e6aa8a19e20a813f6fb
Binary files /dev/null and b/sglang/python/sglang/srt/layers/rotary_embedding/__pycache__/__init__.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/rotary_embedding/__pycache__/base.cpython-311.pyc b/sglang/python/sglang/srt/layers/rotary_embedding/__pycache__/base.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..506c487ebbf8e9a67c4f761f32e8f32c13ae9053
Binary files /dev/null and b/sglang/python/sglang/srt/layers/rotary_embedding/__pycache__/base.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/rotary_embedding/__pycache__/factory.cpython-311.pyc b/sglang/python/sglang/srt/layers/rotary_embedding/__pycache__/factory.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..00f0a75e1a21e9a1ccfe8fe39892d29f8227b5fe
Binary files /dev/null and b/sglang/python/sglang/srt/layers/rotary_embedding/__pycache__/factory.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/rotary_embedding/__pycache__/mrope.cpython-311.pyc b/sglang/python/sglang/srt/layers/rotary_embedding/__pycache__/mrope.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0023144288d8a9d34e8cf1c780e4a759753f9b34
Binary files /dev/null and b/sglang/python/sglang/srt/layers/rotary_embedding/__pycache__/mrope.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/rotary_embedding/__pycache__/rope_variant.cpython-311.pyc b/sglang/python/sglang/srt/layers/rotary_embedding/__pycache__/rope_variant.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ac5e15578eabd876a628a5efb8a64591a591aa5b
Binary files /dev/null and b/sglang/python/sglang/srt/layers/rotary_embedding/__pycache__/rope_variant.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/rotary_embedding/__pycache__/triton_kernels.cpython-311.pyc b/sglang/python/sglang/srt/layers/rotary_embedding/__pycache__/triton_kernels.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8100deb97745001e41442300e75046c7b819eff6
Binary files /dev/null and b/sglang/python/sglang/srt/layers/rotary_embedding/__pycache__/triton_kernels.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/rotary_embedding/__pycache__/utils.cpython-311.pyc b/sglang/python/sglang/srt/layers/rotary_embedding/__pycache__/utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..67113d0a977f4b3b597313b39d8f020378af0016
Binary files /dev/null and b/sglang/python/sglang/srt/layers/rotary_embedding/__pycache__/utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/rotary_embedding/__pycache__/yarn.cpython-311.pyc b/sglang/python/sglang/srt/layers/rotary_embedding/__pycache__/yarn.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2b5040c9228f5f3e3e99cb39b5a0d0fcc0ca1217
Binary files /dev/null and b/sglang/python/sglang/srt/layers/rotary_embedding/__pycache__/yarn.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/rotary_embedding/base.py b/sglang/python/sglang/srt/layers/rotary_embedding/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa3068b3dd0356bc1e5621434e47762c6195ffd4
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/rotary_embedding/base.py
@@ -0,0 +1,432 @@
+"""RotaryEmbedding base class + LinearScalingRotaryEmbedding."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
+
+import torch
+
+from sglang.srt.layers.rotary_embedding.utils import apply_rotary_emb
+from sglang.srt.layers.utils import MultiPlatformOp
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import (
+    cpu_has_amx_support,
+    get_bool_env_var,
+    is_cpu,
+    is_cuda,
+    is_hip,
+    is_musa,
+    is_npu,
+    is_xpu,
+)
+
+if TYPE_CHECKING:
+    from sglang.jit_kernel.rope import FusedSetKVBufferArg  # For type check-only
+
+_is_cuda = is_cuda()
+_is_hip = is_hip()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+_is_npu = is_npu()
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
+_is_xpu = is_xpu()
+_is_musa = is_musa()
+
+if _is_cuda:
+    from sglang.jit_kernel.rope import apply_rope_with_cos_sin_cache_inplace
+
+if _is_npu:
+    import torch_npu
+
+
+class RotaryEmbedding(MultiPlatformOp):
+    """Original rotary positional embedding."""
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        dtype: torch.dtype,
+    ) -> None:
+        super().__init__()
+        self.head_size = head_size
+        self.rotary_dim = rotary_dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        self.is_neox_style = is_neox_style
+        self.dtype = dtype
+
+        cache = self._compute_cos_sin_cache()
+        # NOTE(ByronHsu): cache needs to be in FP32 for numerical stability
+        if not _is_cuda:
+            cache = cache.to(dtype)
+
+        if (
+            (not (_is_cuda) or self.head_size not in [64, 128, 256, 512])
+            and not (_is_cpu)
+            and not (_is_xpu)
+            and not (_is_npu)
+            and not (_is_musa)
+        ):
+            # rotary_embedding from sglang.jit_kernel.pos_enc and vllm._custom_ops has the same implementation.
+            # TODO: Test on different devices and remove this conditional.
+            if _is_cuda:
+                from sglang.jit_kernel.pos_enc import rotary_embedding
+            elif _is_hip:
+                from sgl_kernel import rotary_embedding
+            else:
+                from vllm._custom_ops import rotary_embedding
+
+            self.use_fallback_kernel = True
+            self.fallback_rotary_embedding = rotary_embedding
+        else:
+            self.use_fallback_kernel = False
+
+        self.cos_sin_cache: torch.Tensor
+        self.register_buffer("cos_sin_cache", cache, persistent=False)
+
+        self._apply_rotary_emb_wrapped = apply_rotary_emb
+
+        if get_global_server_args().rl_on_policy_target is not None:
+            self._forward_method = self.forward_native
+            self._apply_rotary_emb_wrapped = torch.compile(dynamic=True)(
+                apply_rotary_emb
+            )
+        self.position_cos, self.position_sin = None, None
+
+    def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
+        """Compute the inverse frequency."""
+        # NOTE(woosuk): To exactly match the HF implementation, we need to
+        # use CPU to compute the cache and then move it to GPU. However, we
+        # create the cache on GPU for faster initialization. This may cause
+        # a slight numerical difference between the HF implementation and ours.
+        init_device = (
+            "cpu" if get_global_server_args().rl_on_policy_target is not None else None
+        )
+        inv_freq = 1.0 / (
+            base
+            ** (
+                torch.arange(
+                    0, self.rotary_dim, 2, dtype=torch.float, device=init_device
+                )
+                / self.rotary_dim
+            )
+        )
+        if get_global_server_args().rl_on_policy_target is not None:
+            inv_freq = inv_freq.cuda()
+        return inv_freq
+
+    def _compute_cos_sin_cache(self) -> torch.Tensor:
+        """Compute the cos and sin cache."""
+        inv_freq = self._compute_inv_freq(self.base)
+        t = torch.arange(self.max_position_embeddings, dtype=torch.float)
+
+        freqs = torch.einsum("i,j -> ij", t, inv_freq)
+        cos = freqs.cos()
+        sin = freqs.sin()
+        cache = torch.cat((cos, sin), dim=-1)
+        return cache
+
+    def _ensure_cos_sin_cache_length(self, needed_max_pos: int):
+        """Ensure cos_sin_cache length > needed_max_pos."""
+        from sglang.srt.environ import envs
+
+        cur_len = int(self.cos_sin_cache.shape[0])
+        if needed_max_pos < cur_len:
+            return
+
+        # Align to reduce realloc frequency
+        align = envs.SGLANG_ROPE_CACHE_ALIGN.get()
+        new_len = ((needed_max_pos + align) // align) * align
+        device = self.cos_sin_cache.device
+        dtype = self.cos_sin_cache.dtype
+
+        # Compute inv_freq on same device
+        inv_freq = self._compute_inv_freq(self.base).to(device=device)
+
+        # Incremental computation for new positions only
+        start = cur_len
+        t_new = torch.arange(start, new_len, dtype=inv_freq.dtype, device=device)
+        if t_new.numel() == 0:
+            return
+
+        freqs_new = torch.einsum("i,j->ij", t_new, inv_freq)
+        cos_new = freqs_new.cos()
+        sin_new = freqs_new.sin()
+        new_rows = torch.cat((cos_new, sin_new), dim=-1).to(dtype=dtype)
+
+        # Update cache with new rows
+        self.cos_sin_cache = torch.cat((self.cos_sin_cache, new_rows), dim=0).to(
+            device=device, dtype=dtype
+        )
+
+    def get_cos_sin_with_position(self, positions):
+        assert positions.ndim == 1, (
+            "2D positions (multimodal RoPE) are not supported by the base "
+            "RotaryEmbedding. Override this method in a subclass (e.g. MRotaryEmbedding)."
+        )
+        cos_sin = self.cos_sin_cache.index_select(0, positions.flatten())
+        last_dim = cos_sin.size()[-1]
+        cos, sin = (
+            cos_sin.reshape(-1, 2, last_dim // 2).repeat(1, 1, 2).chunk(2, dim=-2)
+        )
+        # BSNH
+        self.position_cos, self.position_sin = (
+            cos.view(-1, 1, 1, last_dim).contiguous(),
+            sin.view(-1, 1, 1, last_dim).contiguous(),
+        )
+
+    def get_cos_sin(self, seqlen: int) -> tuple[torch.Tensor, torch.Tensor]:
+        cos_sin = self.cos_sin_cache[:seqlen]
+        cos, sin = cos_sin.chunk(2, dim=-1)
+        return cos, sin
+
+    def forward_native(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+        fused_set_kv_buffer_arg: Optional[FusedSetKVBufferArg] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """A PyTorch-native implementation of forward()."""
+        assert (
+            fused_set_kv_buffer_arg is None
+        ), "fused_set_kv_buffer_arg is not supported for native implementation"
+
+        if offsets is not None:
+            positions = positions + offsets
+        positions = positions.flatten()
+        num_tokens = positions.shape[0]
+        cos_sin = self.cos_sin_cache.index_select(0, positions)
+        cos, sin = cos_sin.chunk(2, dim=-1)
+
+        query_shape = query.shape
+        query = query.view(num_tokens, -1, self.head_size)
+        query_rot = query[..., : self.rotary_dim]
+        query_pass = query[..., self.rotary_dim :]
+        query_rot = self._apply_rotary_emb_wrapped(
+            query_rot, cos, sin, self.is_neox_style
+        )
+        query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
+
+        key_shape = key.shape
+        key = key.view(num_tokens, -1, self.head_size)
+        key_rot = key[..., : self.rotary_dim]
+        key_pass = key[..., self.rotary_dim :]
+        key_rot = self._apply_rotary_emb_wrapped(key_rot, cos, sin, self.is_neox_style)
+        key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
+        return query, key
+
+    def forward_npu(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+        fused_set_kv_buffer_arg: Optional[FusedSetKVBufferArg] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """A PyTorch-npu implementation of forward()."""
+        assert (
+            fused_set_kv_buffer_arg is None
+        ), "fused_set_kv_buffer_arg is not supported for npu implementation"
+        if query.dtype == torch.bfloat16 and self.cos_sin_cache.dtype == torch.float:
+            return self.forward_native(positions, query, key, offsets)
+        if self.is_neox_style:
+            rotary_mode = "half"
+        else:
+            rotary_mode = "interleave"
+        mrope_section = [0, 0, 0]
+        query_out, key_out = torch_npu.npu_mrope(
+            positions,
+            query,
+            key,
+            self.cos_sin_cache,
+            self.head_size,
+            mrope_section=mrope_section,
+            rotary_mode=rotary_mode,
+        )
+        return query_out, key_out
+
+    def forward_cpu(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+        fused_set_kv_buffer_arg: Optional[FusedSetKVBufferArg] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        assert (
+            fused_set_kv_buffer_arg is None
+        ), "fused_set_kv_buffer_arg is not supported for cpu implementation"
+
+        positions = torch.add(positions, offsets) if offsets is not None else positions
+        if _is_cpu_amx_available:
+            return torch.ops.sgl_kernel.rotary_embedding_cpu(
+                positions,
+                query,
+                key,
+                self.head_size,
+                self.cos_sin_cache,
+                self.is_neox_style,
+            )
+        else:
+            return self.forward_native(
+                positions, query, key, offsets, fused_set_kv_buffer_arg
+            )
+
+    def forward_cuda(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+        fused_set_kv_buffer_arg: Optional[FusedSetKVBufferArg] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if not self.use_fallback_kernel:
+            batch_size = positions.size(0)
+            q_rope = query.view(batch_size, -1, self.head_size)
+            k_rope = key.view(batch_size, -1, self.head_size)
+            if self.head_size != self.rotary_dim:
+                q_rope = q_rope[..., : self.rotary_dim]
+                k_rope = k_rope[..., : self.rotary_dim]
+            apply_rope_with_cos_sin_cache_inplace(
+                positions=positions,
+                q=q_rope,
+                k=k_rope,
+                cos_sin_cache=self.cos_sin_cache,
+                is_neox=self.is_neox_style,
+                fused_args=fused_set_kv_buffer_arg,
+            )
+        else:
+            assert (
+                fused_set_kv_buffer_arg is None
+            ), "save kv cache is not supported for fallback_rotary_embedding."
+            self.cos_sin_cache = self.cos_sin_cache.to(query.device, dtype=query.dtype)
+            self.fallback_rotary_embedding(
+                positions,
+                query,
+                key,
+                self.head_size,
+                self.cos_sin_cache,
+                self.is_neox_style,
+            )
+        return query, key
+
+    def extra_repr(self) -> str:
+        s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}"
+        s += f", max_position_embeddings={self.max_position_embeddings}"
+        s += f", base={self.base}, is_neox_style={self.is_neox_style}"
+        return s
+
+    def forward_xpu(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+        fused_set_kv_buffer_arg: Optional[FusedSetKVBufferArg] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        assert (
+            fused_set_kv_buffer_arg is None
+        ), "fused_set_kv_buffer_arg is not supported for xpu implementation"
+        positions = torch.add(positions, offsets) if offsets is not None else positions
+
+        return torch.ops.sgl_kernel.rotary_embedding(
+            positions,
+            query,
+            key,
+            self.head_size,
+            self.cos_sin_cache,
+            self.is_neox_style,
+        )
+
+
+class LinearScalingRotaryEmbedding(RotaryEmbedding):
+    """RotaryEmbedding extended with linear scaling.
+
+    It supports multiple scaling factors. Since multiple LoRA adapters may have
+    different scaling factors, we need multiple cos/sin caches. In this way,
+    instead of running rotary embedding kernel per lora, we can run multiple
+    lora in a batched way.
+
+    In addition to that, we also keep the cos/sin cache for the scaling factor
+    of 1 (default) at all times.
+
+    Exemplary for two scaling factors x=1, y and z with embeddings
+    [[x11, x12, ... x1m], ..., [xn1, xn2, ..., xnm]] and
+    [[y11, y12, ... y1o], ..., [yn1, yn2, ..., yno]], and
+    [[z11, z12, ... z1p], ..., [zn1, zn2, ..., znp]],
+
+    we construct the cos/sin cache as follows:
+    [[x11, x12, ... x1m, y11, y12, ... y1o, z11, z12, ... z1p],
+        ...
+     [xn1, xn2, ... xnm, yn1, yn2, ... yno, zn1, zn2, ... znp]]
+
+    We then use offsets to index into the cos/sin cache for
+    the respective scaling factors.
+
+    The offset to cache can be accessed via `scaling_factor_to_offset` API.
+
+    Credits to the Reddit user /u/kaiokendev
+    """
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        scaling_factors: Union[List[float], float],
+        dtype: torch.dtype,
+    ) -> None:
+        if isinstance(scaling_factors, float):
+            scaling_factors = [scaling_factors]
+        self.scaling_factors: List[float] = scaling_factors  # noqa
+        super().__init__(
+            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
+        )
+        # Lazy initialized.
+        self._scaling_factor_to_offset: Dict[float, int]
+
+    def _compute_cos_sin_cache(self) -> torch.Tensor:
+        inv_freq = self._compute_inv_freq(self.base)
+        cache_list: List[torch.Tensor] = []
+        # offsets to the next cache in a tensor.
+        # Each offset corresponds to the same index in scaling_factors.
+        offsets: List[int] = []
+        for scaling_factor in self.scaling_factors:
+            # NOTE(woosuk): self.max_position_embeddings is the original
+            # maximum length before applying the rope scaling.
+            # Thus, the maximum length after applying the rope scaling is
+            # self.max_position_embeddings * self.scaling_factor.
+            max_len = self.max_position_embeddings * scaling_factor
+            t = torch.arange(max_len, dtype=torch.float)
+            t = t / scaling_factor
+
+            freqs = torch.einsum("i,j -> ij", t, inv_freq)
+            cos = freqs.cos()
+            sin = freqs.sin()
+            cache = torch.cat((cos, sin), dim=-1)
+            if not cache_list:
+                offset = 0
+            else:
+                last_offset = offsets[-1]
+                next_max_len = cache_list[-1].shape[0]
+                offset = last_offset + next_max_len
+            offsets.append(offset)
+            cache_list.append(cache)
+        self._scaling_factor_to_offset = {
+            float(scaling_factor): offsets[i]
+            for i, scaling_factor in enumerate(self.scaling_factors)
+        }
+        assert len(self.scaling_factors) == len(offsets)
+        return torch.cat(cache_list, dim=0)
+
+    @property
+    def scaling_factor_to_offset(self) -> Dict[float, int]:
+        return self._scaling_factor_to_offset
diff --git a/sglang/python/sglang/srt/layers/rotary_embedding/factory.py b/sglang/python/sglang/srt/layers/rotary_embedding/factory.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e24a2e5ed60addea0d01233ca0ebb04e60afb63
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/rotary_embedding/factory.py
@@ -0,0 +1,388 @@
+"""Factory functions: get_rope, get_rope_cpu, get_rope_wrapper."""
+
+from __future__ import annotations
+
+from typing import Any, Dict, Optional, Tuple
+
+import torch
+
+from sglang.srt.layers.rotary_embedding.base import (
+    LinearScalingRotaryEmbedding,
+    RotaryEmbedding,
+)
+from sglang.srt.layers.rotary_embedding.mrope import (
+    MRotaryEmbedding,
+    YaRNScalingMRotaryEmbedding,
+)
+from sglang.srt.layers.rotary_embedding.rope_variant import (
+    DeepseekScalingRotaryEmbedding,
+    DualChunkRotaryEmbedding,
+    DynamicNTKAlphaRotaryEmbedding,
+    DynamicNTKScalingRotaryEmbedding,
+    FourierRotaryEmbedding,
+    Llama3RotaryEmbedding,
+    Phi3LongRoPEScaledRotaryEmbedding,
+)
+from sglang.srt.layers.rotary_embedding.yarn import YaRNScalingRotaryEmbedding
+from sglang.srt.utils import get_bool_env_var, is_hip
+
+_is_hip = is_hip()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+
+if _use_aiter:
+    from aiter.rotary_embedding import get_rope as aiter_get_rope
+
+_ROPE_DICT: Dict[Tuple, RotaryEmbedding] = {}
+
+
+def get_rope(
+    head_size: int,
+    rotary_dim: int,
+    max_position: int,
+    base: int,
+    is_neox_style: bool = True,
+    rope_scaling: Optional[Dict[str, Any]] = None,
+    dtype: Optional[torch.dtype] = None,
+    partial_rotary_factor: float = 1.0,
+    dual_chunk_attention_config: Optional[Dict[str, Any]] = None,
+) -> RotaryEmbedding:
+    if dtype is None:
+        dtype = torch.get_default_dtype()
+    if rope_scaling is not None:
+        rope_scaling_tuple = {
+            k: tuple(v) if isinstance(v, list) else v for k, v in rope_scaling.items()
+        }
+        rope_scaling_args = tuple(rope_scaling_tuple.items())
+    else:
+        rope_scaling_args = None
+
+    if dual_chunk_attention_config is not None:
+        dual_chunk_attention_tuple = {
+            k: tuple(v) if isinstance(v, list) else v
+            for k, v in dual_chunk_attention_config.items()
+            if k != "sparse_attention_config"
+        }
+        dual_chunk_attention_args = tuple(dual_chunk_attention_tuple.items())
+    else:
+        dual_chunk_attention_args = None
+
+    if partial_rotary_factor < 1.0:
+        rotary_dim = int(rotary_dim * partial_rotary_factor)
+    key = (
+        head_size,
+        rotary_dim,
+        max_position,
+        base,
+        is_neox_style,
+        rope_scaling_args,
+        dual_chunk_attention_args,
+        dtype,
+    )
+    if key in _ROPE_DICT:
+        return _ROPE_DICT[key]
+
+    if dual_chunk_attention_config is not None:
+        extra_kwargs = {
+            k: v
+            for k, v in dual_chunk_attention_config.items()
+            if k in ("chunk_size", "local_size")
+        }
+        rotary_emb = DualChunkRotaryEmbedding(
+            head_size,
+            rotary_dim,
+            max_position,
+            base,
+            is_neox_style,
+            dtype,
+            **extra_kwargs,
+        )
+    elif rope_scaling is None:
+        rotary_emb = RotaryEmbedding(
+            head_size, rotary_dim, max_position, base, is_neox_style, dtype
+        )
+    else:
+        if "rope_type" in rope_scaling:
+            scaling_type = rope_scaling["rope_type"]
+        elif "type" in rope_scaling:
+            scaling_type = rope_scaling["type"]
+        else:
+            raise ValueError(
+                f"Unknown RoPE scaling type, rope_scaling is {rope_scaling}"
+            )
+
+        if scaling_type == "llama3":
+            scaling_factor = rope_scaling["factor"]
+            low_freq_factor = rope_scaling["low_freq_factor"]
+            high_freq_factor = rope_scaling["high_freq_factor"]
+            original_max_position = rope_scaling["original_max_position_embeddings"]
+            rotary_emb = Llama3RotaryEmbedding(
+                head_size,
+                rotary_dim,
+                max_position,
+                base,
+                is_neox_style,
+                dtype,
+                scaling_factor,
+                low_freq_factor,
+                high_freq_factor,
+                original_max_position,
+            )
+        elif scaling_type == "default":
+            if "mrope_section" in rope_scaling:
+                rotary_emb = MRotaryEmbedding(
+                    head_size,
+                    rotary_dim,
+                    max_position,
+                    base,
+                    is_neox_style,
+                    dtype,
+                    mrope_section=rope_scaling["mrope_section"],
+                    mrope_interleaved=rope_scaling.get("mrope_interleaved", False),
+                )
+            elif rope_scaling.get("use_fope", False):
+                rotary_emb = FourierRotaryEmbedding(
+                    head_size,
+                    rotary_dim,
+                    max_position,
+                    base,
+                    is_neox_style,
+                    dtype,
+                    num_kv_heads=rope_scaling["num_kv_heads"],
+                    fope_init_factor=rope_scaling.get("fope_init_factor", 0.1),
+                    fope_sep_head=rope_scaling.get("fope_sep_head", True),
+                    num_inv_freq=rope_scaling.get("num_inv_freq", None),
+                )
+            else:
+                rotary_emb = RotaryEmbedding(
+                    head_size,
+                    rotary_dim,
+                    max_position,
+                    base,
+                    is_neox_style,
+                    dtype,
+                )
+        elif scaling_type == "linear":
+            scaling_factor = rope_scaling["factor"]
+            rotary_emb = LinearScalingRotaryEmbedding(
+                head_size,
+                rotary_dim,
+                max_position,
+                base,
+                is_neox_style,
+                scaling_factor,
+                dtype,
+            )
+        elif scaling_type == "dynamic":
+            scaling_factor = rope_scaling["factor"]
+            if "alpha" in rope_scaling:
+                rotary_emb = DynamicNTKAlphaRotaryEmbedding(
+                    head_size,
+                    rotary_dim,
+                    max_position,
+                    base,
+                    is_neox_style,
+                    rope_scaling["alpha"],
+                    dtype,
+                )
+            else:
+                rotary_emb = DynamicNTKScalingRotaryEmbedding(
+                    head_size,
+                    rotary_dim,
+                    max_position,
+                    base,
+                    is_neox_style,
+                    scaling_factor,
+                    dtype,
+                )
+        elif scaling_type == "yarn":
+            scaling_factor = rope_scaling["factor"]
+            original_max_position = rope_scaling["original_max_position_embeddings"]
+            extra_kwargs = {
+                k: v
+                for k, v in rope_scaling.items()
+                if k
+                in ("extrapolation_factor", "attn_factor", "beta_fast", "beta_slow")
+            }
+            extra_kwargs["truncate"] = rope_scaling.get("truncate", True)
+            if "mrope_section" in rope_scaling:
+                rotary_emb = YaRNScalingMRotaryEmbedding(
+                    head_size,
+                    rotary_dim,
+                    original_max_position,
+                    base,
+                    is_neox_style,
+                    scaling_factor,
+                    dtype,
+                    mrope_section=rope_scaling["mrope_section"],
+                    mrope_interleaved=rope_scaling.get("mrope_interleaved", False),
+                    **extra_kwargs,
+                )
+            else:
+                rotary_emb = YaRNScalingRotaryEmbedding(
+                    head_size,
+                    rotary_dim,
+                    original_max_position,
+                    base,
+                    is_neox_style,
+                    scaling_factor,
+                    dtype,
+                    **extra_kwargs,
+                )
+        elif scaling_type == "deepseek_yarn":
+            scaling_factor = rope_scaling["factor"]
+            original_max_position = rope_scaling["original_max_position_embeddings"]
+            extra_kwargs = {
+                k: v
+                for k, v in rope_scaling.items()
+                if k
+                in (
+                    "extrapolation_factor",
+                    "attn_factor",
+                    "beta_fast",
+                    "beta_slow",
+                    "mscale",
+                    "mscale_all_dim",
+                )
+            }
+            rotary_emb = DeepseekScalingRotaryEmbedding(
+                head_size,
+                rotary_dim,
+                original_max_position,
+                base,
+                is_neox_style,
+                scaling_factor,
+                dtype,
+                **extra_kwargs,
+            )
+        elif scaling_type == "longrope":
+            short_factor = rope_scaling["short_factor"]
+            long_factor = rope_scaling["long_factor"]
+            original_max_position = rope_scaling["original_max_position_embeddings"]
+            extra_kwargs = {
+                k: v
+                for k, v in rope_scaling.items()
+                if k in ("short_mscale", "long_mscale")
+            }
+            rotary_emb = Phi3LongRoPEScaledRotaryEmbedding(
+                head_size,
+                rotary_dim,
+                max_position,
+                original_max_position,
+                base,
+                is_neox_style,
+                dtype,
+                short_factor,
+                long_factor,
+                **extra_kwargs,
+            )
+        else:
+            raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
+    _ROPE_DICT[key] = rotary_emb
+    return rotary_emb
+
+
+def get_rope_cpu(
+    head_size: int,
+    rotary_dim: int,
+    max_position: int,
+    base: int,
+    is_neox_style: bool = True,
+    rope_scaling: Optional[Dict[str, Any]] = None,
+    dtype: Optional[torch.dtype] = None,
+    partial_rotary_factor: float = 1.0,
+    device: Optional[str] = None,
+) -> RotaryEmbedding:
+    if dtype is None:
+        dtype = torch.get_default_dtype()
+    if rope_scaling is not None:
+        rope_scaling_tuple = {
+            k: tuple(v) if isinstance(v, list) else v for k, v in rope_scaling.items()
+        }
+        rope_scaling_args = tuple(rope_scaling_tuple.items())
+    else:
+        rope_scaling_args = None
+    if partial_rotary_factor < 1.0:
+        rotary_dim = int(rotary_dim * partial_rotary_factor)
+    key = (
+        head_size,
+        rotary_dim,
+        max_position,
+        base,
+        is_neox_style,
+        rope_scaling_args,
+        dtype,
+    )
+    if key in _ROPE_DICT:
+        return _ROPE_DICT[key]
+
+    assert rope_scaling is not None
+    scaling_type = rope_scaling["rope_type"]
+    assert (
+        scaling_type == "deepseek_yarn"
+    ), "Only deepseek_yarn is supported for CPU for now"
+
+    scaling_factor = rope_scaling["factor"]
+    original_max_position = rope_scaling["original_max_position_embeddings"]
+    extra_kwargs = {
+        k: v
+        for k, v in rope_scaling.items()
+        if k
+        in (
+            "extrapolation_factor",
+            "attn_factor",
+            "beta_fast",
+            "beta_slow",
+            "mscale",
+            "mscale_all_dim",
+        )
+    }
+    extra_kwargs["device"] = device
+    rotary_emb = DeepseekScalingRotaryEmbedding(
+        head_size,
+        rotary_dim,
+        original_max_position,
+        base,
+        is_neox_style,
+        scaling_factor,
+        dtype,
+        **extra_kwargs,
+    )
+    _ROPE_DICT[key] = rotary_emb
+    return rotary_emb
+
+
+def get_rope_wrapper(
+    head_size: int,
+    rotary_dim: int,
+    max_position: int,
+    base: int,
+    is_neox_style: bool = True,
+    rope_scaling: Optional[Dict[str, Any]] = None,
+    dtype: Optional[torch.dtype] = None,
+    partial_rotary_factor: float = 1.0,
+    device: Optional[str] = None,
+):
+    if device != "cpu":
+        wrapper = aiter_get_rope if _use_aiter else get_rope
+        return wrapper(
+            head_size,
+            rotary_dim,
+            max_position,
+            base,
+            is_neox_style,
+            rope_scaling,
+            dtype,
+            partial_rotary_factor,
+        )
+
+    return get_rope_cpu(
+        head_size,
+        rotary_dim,
+        max_position,
+        base,
+        is_neox_style,
+        rope_scaling,
+        dtype,
+        partial_rotary_factor,
+        device,
+    )
diff --git a/sglang/python/sglang/srt/layers/rotary_embedding/mrope.py b/sglang/python/sglang/srt/layers/rotary_embedding/mrope.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a36550de1adbc7a047b82e9cf7e7e3caf02313d
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/rotary_embedding/mrope.py
@@ -0,0 +1,513 @@
+"""MRotaryEmbedding, YaRNScalingMRotaryEmbedding, Ernie4_5_VLRotaryEmbedding,
+apply_interleaved_rope for multimodal RoPE."""
+
+from __future__ import annotations
+
+from typing import List, Optional, Tuple
+
+import torch
+
+from sglang.srt.layers.rotary_embedding.base import RotaryEmbedding
+from sglang.srt.layers.rotary_embedding.triton_kernels import (
+    triton_ernie45_rope_fused_inplace,
+    triton_mrope_fused,
+)
+from sglang.srt.layers.rotary_embedding.utils import apply_rotary_emb
+from sglang.srt.layers.rotary_embedding.yarn import (
+    yarn_find_correction_range,
+    yarn_get_mscale_simple,
+    yarn_linear_ramp_mask,
+)
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import is_cuda, is_npu
+
+_is_cuda = is_cuda()
+_is_npu = is_npu()
+
+if _is_cuda:
+    from sglang.jit_kernel.rope import apply_rope_with_cos_sin_cache_inplace
+
+if _is_npu:
+    import torch_npu
+
+
+def apply_interleaved_rope(x: torch.Tensor, mrope_section: list) -> torch.Tensor:
+    x_t = x[0].clone()
+    x_t[..., 1 : mrope_section[1] * 3 : 3] = x[1, ..., 1 : mrope_section[1] * 3 : 3]
+    x_t[..., 2 : mrope_section[2] * 3 : 3] = x[2, ..., 2 : mrope_section[2] * 3 : 3]
+    return x_t
+
+
+class MRotaryEmbedding(RotaryEmbedding):
+    """Rotary Embedding with Multimodal Sections."""
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        dtype: torch.dtype,
+        mrope_section: Optional[List[int]] = None,
+        mrope_interleaved: bool = False,
+    ) -> None:
+        super().__init__(
+            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
+        )
+        self.mrope_section = mrope_section
+        self.mrope_interleaved = mrope_interleaved
+        if self.mrope_section:
+            expected_sum = rotary_dim // 2
+            actual_sum = sum(self.mrope_section)
+            if actual_sum != expected_sum:
+                print(
+                    f"MRoPE section sum mismatch: expected {expected_sum}, got {actual_sum}. "
+                    f"Adjusting mrope_section to match rotary_dim // 2 = {expected_sum}"
+                )
+                if actual_sum > 0:
+                    scale_factor = expected_sum / actual_sum
+                    self.mrope_section = [
+                        max(1, int(section * scale_factor))
+                        for section in self.mrope_section
+                    ]
+                    current_sum = sum(self.mrope_section)
+                    if current_sum != expected_sum:
+                        self.mrope_section[-1] += expected_sum - current_sum
+                else:
+                    self.mrope_section = [
+                        expected_sum // len(self.mrope_section)
+                    ] * len(self.mrope_section)
+                    remainder = expected_sum % len(self.mrope_section)
+                    for i in range(remainder):
+                        self.mrope_section[i] += 1
+                print(
+                    f"Corrected mrope_section: {self.mrope_section} (sum={sum(self.mrope_section)})"
+                )
+
+        if get_global_server_args().rl_on_policy_target is not None:
+            self._forward_method = self.forward_native
+
+    def get_cos_sin_with_position(self, positions):
+        if positions.ndim == 1:
+            return super().get_cos_sin_with_position(positions)
+        assert positions.ndim == 2
+        assert self.mrope_section
+        cos_sin = self.cos_sin_cache[positions]
+        last_dim = cos_sin.size()[-1]
+        cos, sin = cos_sin.chunk(2, dim=-1)
+        if self.mrope_interleaved:
+            cos = apply_interleaved_rope(cos, self.mrope_section)
+            sin = apply_interleaved_rope(sin, self.mrope_section)
+        else:
+            cos = torch.cat(
+                [m[i] for i, m in enumerate(cos.split(self.mrope_section, dim=-1))],
+                dim=-1,
+            )
+            sin = torch.cat(
+                [m[i] for i, m in enumerate(sin.split(self.mrope_section, dim=-1))],
+                dim=-1,
+            )
+        self.position_cos = cos.repeat(1, 2).view(-1, 1, 1, last_dim).contiguous()
+        self.position_sin = sin.repeat(1, 2).view(-1, 1, 1, last_dim).contiguous()
+
+    def _match_cos_sin_cache_dtype(self, query: torch.Tensor) -> None:
+        if (
+            self.cos_sin_cache.device != query.device
+            or self.cos_sin_cache.dtype != query.dtype
+        ):
+            self.cos_sin_cache = self.cos_sin_cache.to(query.device, dtype=query.dtype)
+
+    def forward_native(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        fused_set_kv_buffer_arg=None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        assert (
+            fused_set_kv_buffer_arg is None
+        ), "save kv cache is not supported for MRotaryEmbedding."
+        assert positions.ndim == 1 or positions.ndim == 2
+
+        cos_sin = self.cos_sin_cache[positions]
+        cos, sin = cos_sin.chunk(2, dim=-1)
+        if positions.ndim == 2:
+            assert self.mrope_section
+            if self.mrope_interleaved:
+                cos = apply_interleaved_rope(cos, self.mrope_section)
+                sin = apply_interleaved_rope(sin, self.mrope_section)
+            else:
+                cos = torch.cat(
+                    [m[i] for i, m in enumerate(cos.split(self.mrope_section, dim=-1))],
+                    dim=-1,
+                )
+                sin = torch.cat(
+                    [m[i] for i, m in enumerate(sin.split(self.mrope_section, dim=-1))],
+                    dim=-1,
+                )
+
+        seq_len_q = query.shape[0]
+        query_shape = query.shape
+        query = query.view(seq_len_q, -1, self.head_size)
+        query_rot = query[..., : self.rotary_dim]
+        query_pass = query[..., self.rotary_dim :]
+        query_rot = apply_rotary_emb(query_rot, cos, sin, self.is_neox_style)
+        query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
+
+        seq_len_k = key.shape[0]
+        key_shape = key.shape
+        key = key.view(seq_len_k, -1, self.head_size)
+        key_rot = key[..., : self.rotary_dim]
+        key_pass = key[..., self.rotary_dim :]
+        key_rot = apply_rotary_emb(key_rot, cos, sin, self.is_neox_style)
+        key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
+        return query, key
+
+    def forward_cuda(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        fused_set_kv_buffer_arg=None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        assert positions.ndim == 1 or positions.ndim == 2
+        if positions.ndim == 2 and self.mrope_section:
+            return self.forward_triton(positions, query, key)
+        return self.forward_native(positions, query, key, fused_set_kv_buffer_arg)
+
+    def forward_triton(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        assert self.mrope_section
+        self._match_cos_sin_cache_dtype(query)
+        triton_mrope_fused(
+            query,
+            key,
+            self.cos_sin_cache,
+            positions,
+            self.mrope_section,
+            self.head_size,
+            self.rotary_dim,
+            self.mrope_interleaved,
+            self.is_neox_style,
+        )
+        return query, key
+
+    def forward_npu(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        fused_set_kv_buffer_arg=None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        assert (
+            fused_set_kv_buffer_arg is None
+        ), "fused_set_kv_buffer_arg is not supported for npu implementation"
+        if query.shape[1] > 4096:
+            return self.forward_native(positions, query, key, fused_set_kv_buffer_arg)
+        rotary_mode = "half" if self.is_neox_style else "interleave"
+        mrope_section = [0, 0, 0]
+        query_out, key_out = torch_npu.npu_mrope(
+            positions,
+            query,
+            key,
+            self.cos_sin_cache,
+            self.head_size,
+            mrope_section=mrope_section,
+            rotary_mode=rotary_mode,
+        )
+        return query_out, key_out
+
+    @staticmethod
+    def get_rope_index(
+        spatial_merge_size,
+        image_token_id,
+        video_token_id,
+        vision_start_token_id,
+        model_type,
+        tokens_per_second=None,
+        input_ids=None,
+        image_grid_thw=None,
+        video_grid_thw=None,
+        second_per_grid_ts=None,
+        **kwargs,
+    ):
+        from sglang.srt.layers.rotary_embedding.mrope_rope_index import get_rope_index
+
+        return get_rope_index(
+            spatial_merge_size,
+            image_token_id,
+            video_token_id,
+            vision_start_token_id,
+            model_type,
+            tokens_per_second,
+            input_ids,
+            image_grid_thw,
+            video_grid_thw,
+            second_per_grid_ts,
+            **kwargs,
+        )
+
+    @staticmethod
+    def get_rope_index_qwen3_omni(
+        spatial_merge_size,
+        image_token_id,
+        video_token_id,
+        vision_start_token_id,
+        tokens_per_second=None,
+        input_ids=None,
+        image_grid_thw=None,
+        video_grid_thw=None,
+        second_per_grid_ts=None,
+        **kwargs,
+    ):
+        from sglang.srt.layers.rotary_embedding.mrope_rope_index import (
+            get_rope_index_qwen3_omni,
+        )
+
+        return get_rope_index_qwen3_omni(
+            spatial_merge_size,
+            image_token_id,
+            video_token_id,
+            vision_start_token_id,
+            tokens_per_second,
+            input_ids,
+            image_grid_thw,
+            video_grid_thw,
+            second_per_grid_ts,
+            **kwargs,
+        )
+
+    @staticmethod
+    def get_rope_index_glm4v(
+        input_ids, hf_config, image_grid_thw, video_grid_thw, attention_mask, **kwargs
+    ):
+        from sglang.srt.layers.rotary_embedding.mrope_rope_index import (
+            get_rope_index_glm4v,
+        )
+
+        return get_rope_index_glm4v(
+            input_ids,
+            hf_config,
+            image_grid_thw,
+            video_grid_thw,
+            attention_mask,
+            **kwargs,
+        )
+
+    @staticmethod
+    def get_rope_index_ernie45(
+        input_ids, hf_config, image_grid_thw, video_grid_thw, **kwargs
+    ):
+        from sglang.srt.layers.rotary_embedding.mrope_rope_index import (
+            get_rope_index_ernie45,
+        )
+
+        return get_rope_index_ernie45(
+            input_ids, hf_config, image_grid_thw, video_grid_thw, **kwargs
+        )
+
+
+class YaRNScalingMRotaryEmbedding(MRotaryEmbedding):
+    """MRoPE-enabled rotary embedding with YaRN context scaling."""
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        scaling_factor: float,
+        dtype: torch.dtype,
+        *,
+        mrope_section: Optional[List[int]] = None,
+        mrope_interleaved: bool = False,
+        extrapolation_factor: float = 1,
+        attn_factor: float = 1,
+        beta_fast: int = 32,
+        beta_slow: int = 1,
+        truncate: bool = True,
+    ) -> None:
+        self.scaling_factor = scaling_factor
+        self.extrapolation_factor = extrapolation_factor
+        self.attn_factor = attn_factor
+        self.beta_fast = beta_fast
+        self.beta_slow = beta_slow
+        self.truncate = truncate
+        self.mscale = float(yarn_get_mscale_simple(self.scaling_factor) * attn_factor)
+        super().__init__(
+            head_size,
+            rotary_dim,
+            max_position_embeddings,
+            base,
+            is_neox_style,
+            dtype,
+            mrope_section=mrope_section,
+            mrope_interleaved=mrope_interleaved,
+        )
+
+    def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor:
+        pos_freqs = self.base ** (
+            torch.arange(0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim
+        )
+        inv_freq_extrapolation = 1.0 / pos_freqs
+        inv_freq_interpolation = 1.0 / (scaling_factor * pos_freqs)
+        low, high = yarn_find_correction_range(
+            self.beta_fast,
+            self.beta_slow,
+            self.rotary_dim,
+            self.base,
+            self.max_position_embeddings,
+            self.truncate,
+        )
+        inv_freq_mask = (
+            1
+            - yarn_linear_ramp_mask(low, high, self.rotary_dim // 2, dtype=torch.float)
+        ) * self.extrapolation_factor
+        inv_freq = (
+            inv_freq_interpolation * (1 - inv_freq_mask)
+            + inv_freq_extrapolation * inv_freq_mask
+        )
+        return inv_freq
+
+    def _compute_cos_sin_cache(self) -> torch.Tensor:
+        inv_freq = self._compute_inv_freq(self.scaling_factor)
+        t = torch.arange(
+            self.max_position_embeddings * self.scaling_factor, dtype=torch.float32
+        )
+        freqs = torch.einsum("i,j -> ij", t, inv_freq)
+        cos = freqs.cos() * self.mscale
+        sin = freqs.sin() * self.mscale
+        cache = torch.cat((cos, sin), dim=-1)
+        return cache
+
+
+class Ernie4_5_VLRotaryEmbedding(MRotaryEmbedding):
+    """3D rotary positional embedding. [h w h w h w h w... t t t...]"""
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        dtype: torch.dtype,
+        mrope_section: Optional[List[int]] = None,
+        mrope_interleaved: bool = False,
+    ) -> None:
+        super().__init__(
+            head_size,
+            rotary_dim,
+            max_position_embeddings,
+            base,
+            is_neox_style,
+            dtype,
+            mrope_section=mrope_section,
+            mrope_interleaved=mrope_interleaved,
+        )
+        self._apply_rotary_emb_wrapped = torch.compile(dynamic=True)(apply_rotary_emb)
+
+    def forward_native(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor = None,
+    ):
+        assert positions.ndim == 1 or positions.ndim == 2
+        assert key is not None
+
+        num_tokens = positions.shape[-1]
+        cos_sin = self.cos_sin_cache[positions]
+        cos, sin = cos_sin.chunk(2, dim=-1)
+        if positions.ndim == 2:
+            assert self.mrope_section
+            section_h = self.mrope_section[0]
+            section_w = self.mrope_section[1]
+            section_t = self.mrope_section[2]
+            assert section_h == section_w
+            section_cos_t = cos[..., -section_t:]
+            section_cos_h = cos[..., : section_h + section_w : 2]
+            section_cos_w = cos[..., 1 : section_h + section_w : 2]
+            cos_t, cos_h, cos_w = section_cos_t[0], section_cos_h[1], section_cos_w[2]
+            cos_hw = torch.stack([cos_h, cos_w], dim=-1).reshape(
+                cos_h.shape[:-1] + (cos_h.shape[-1] * 2,)
+            )
+            cos = torch.cat([cos_hw, cos_t], dim=-1)
+            section_sin_t = sin[..., -section_t:]
+            section_sin_h = sin[..., : section_h + section_w : 2]
+            section_sin_w = sin[..., 1 : section_h + section_w : 2]
+            sin_t, sin_h, sin_w = section_sin_t[0], section_sin_h[1], section_sin_w[2]
+            sin_hw = torch.stack([sin_h, sin_w], dim=-1).reshape(
+                sin_h.shape[:-1] + (sin_h.shape[-1] * 2,)
+            )
+            sin = torch.cat([sin_hw, sin_t], dim=-1)
+
+        query_shape = query.shape
+        query = query.view(num_tokens, -1, self.head_size)
+        query_rot = query[..., : self.rotary_dim]
+        query_pass = query[..., self.rotary_dim :]
+        query_rot = self._apply_rotary_emb_wrapped(
+            query_rot, cos, sin, self.is_neox_style
+        )
+        query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
+
+        key_shape = key.shape
+        key = key.view(num_tokens, -1, self.head_size)
+        key_rot = key[..., : self.rotary_dim]
+        key_pass = key[..., self.rotary_dim :]
+        key_rot = self._apply_rotary_emb_wrapped(key_rot, cos, sin, self.is_neox_style)
+        key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
+        return query, key
+
+    def forward_cuda(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor = None,
+    ):
+        assert key is not None
+        assert positions.ndim in (1, 2)
+        self._match_cos_sin_cache_dtype(query)
+
+        if positions.ndim == 2:
+            assert self.mrope_section is not None
+            triton_ernie45_rope_fused_inplace(
+                q=query,
+                k=key,
+                cos_sin_cache=self.cos_sin_cache,
+                positions=positions,
+                mrope_section=self.mrope_section,
+                head_size=self.head_size,
+                rotary_dim=self.rotary_dim,
+                is_neox_style=self.is_neox_style,
+            )
+            return query, key
+
+        if _is_cuda and (apply_rope_with_cos_sin_cache_inplace is not None):
+            apply_rope_with_cos_sin_cache_inplace(
+                positions=positions,
+                query=query,
+                key=key,
+                head_size=self.head_size,
+                cos_sin_cache=self.cos_sin_cache,
+                is_neox=self.is_neox_style,
+            )
+            return query, key
+
+        return self.forward_native(positions, query, key)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        fused_set_kv_buffer_arg=None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        assert positions.ndim == 1 or positions.ndim == 2
+        return self.forward_cuda(positions, query, key)
diff --git a/sglang/python/sglang/srt/layers/rotary_embedding/mrope_rope_index.py b/sglang/python/sglang/srt/layers/rotary_embedding/mrope_rope_index.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1315f02903749f2654a5456e3c29c08e3e332ef
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/rotary_embedding/mrope_rope_index.py
@@ -0,0 +1,826 @@
+"""get_rope_index implementations for Qwen2-VL/Qwen3-VL, Qwen3-Omni, GLM4V, Ernie4.5."""
+
+from __future__ import annotations
+
+import itertools
+from typing import Any, List, Optional, Tuple, Union
+
+import torch
+
+
+def _get_feat_extract_output_lengths(input_lengths):
+    """
+    Computes the output length of the convolutional layers and the output length of the audio encoder
+    """
+    input_lengths_leave = input_lengths % 100
+    feat_lengths = (input_lengths_leave - 1) // 2 + 1
+    output_lengths = (
+        ((feat_lengths - 1) // 2 + 1 - 1) // 2 + 1 + (input_lengths // 100) * 13
+    )
+    return output_lengths
+
+
+def _get_llm_pos_ids_for_vision(
+    st_idx, vision_idx, spatial_merge_size, t_index, grid_hs, grid_ws, device
+):
+    grid_h = grid_hs[vision_idx] // spatial_merge_size
+    grid_w = grid_ws[vision_idx] // spatial_merge_size
+
+    h_index = (
+        torch.arange(grid_h, device=device)
+        .view(1, -1, 1)
+        .expand(len(t_index), -1, grid_w)
+        .flatten()
+    )
+    w_index = (
+        torch.arange(grid_w, device=device)
+        .view(1, 1, -1)
+        .expand(len(t_index), grid_h, -1)
+        .flatten()
+    )
+    t_index = t_index.view(-1, 1).expand(-1, grid_h * grid_w).flatten()
+
+    llm_pos_ids = torch.stack([t_index, h_index, w_index], dim=0) + st_idx
+    return llm_pos_ids
+
+
+def get_rope_index(
+    spatial_merge_size: int,
+    image_token_id: int,
+    video_token_id: int,
+    vision_start_token_id: int,
+    model_type: str,
+    tokens_per_second: Optional[int] = None,
+    input_ids: Optional[torch.LongTensor] = None,
+    image_grid_thw: Optional[torch.LongTensor] = None,
+    video_grid_thw: Optional[torch.LongTensor] = None,
+    second_per_grid_ts: Optional[torch.Tensor] = None,
+    **kwargs,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    if model_type == "qwen3_omni_moe":
+        return get_rope_index_qwen3_omni(
+            spatial_merge_size,
+            image_token_id,
+            video_token_id,
+            vision_start_token_id,
+            tokens_per_second,
+            input_ids,
+            image_grid_thw,
+            video_grid_thw,
+            second_per_grid_ts,
+            **kwargs,
+        )
+    if (
+        model_type.startswith("qwen3_vl")
+        or model_type.startswith("qwen3_vl_moe")
+        or model_type.startswith("qwen3_5")
+    ) and video_grid_thw is not None:
+        video_grid_thw = torch.repeat_interleave(
+            video_grid_thw, video_grid_thw[:, 0], dim=0
+        )
+        video_grid_thw[:, 0] = 1
+
+    mrope_position_deltas = []
+    if input_ids is not None and (
+        image_grid_thw is not None or video_grid_thw is not None
+    ):
+        total_input_ids = input_ids
+        position_ids = torch.ones(
+            3,
+            input_ids.shape[0],
+            input_ids.shape[1],
+            dtype=input_ids.dtype,
+            device=input_ids.device,
+        )
+        image_index, video_index = 0, 0
+        for i, input_ids in enumerate(total_input_ids):
+            image_nums, video_nums = 0, 0
+            vision_start_indices = torch.argwhere(
+                input_ids == vision_start_token_id
+            ).squeeze(1)
+            vision_tokens = input_ids[vision_start_indices + 1]
+            image_nums = (vision_tokens == image_token_id).sum()
+            video_nums = (vision_tokens == video_token_id).sum()
+            input_tokens = input_ids.tolist()
+            llm_pos_ids_list: list = []
+            st = 0
+            remain_images, remain_videos = image_nums, video_nums
+            for _ in range(image_nums + video_nums):
+                if image_token_id in input_tokens and remain_images > 0:
+                    ed_image = input_tokens.index(image_token_id, st)
+                else:
+                    ed_image = len(input_tokens) + 1
+                if video_token_id in input_tokens and remain_videos > 0:
+                    ed_video = input_tokens.index(video_token_id, st)
+                else:
+                    ed_video = len(input_tokens) + 1
+                if ed_image < ed_video:
+                    t, h, w = (
+                        image_grid_thw[image_index][0],
+                        image_grid_thw[image_index][1],
+                        image_grid_thw[image_index][2],
+                    )
+                    second_per_grid_t = 0
+                    image_index += 1
+                    remain_images -= 1
+                    ed = ed_image
+                else:
+                    t, h, w = (
+                        video_grid_thw[video_index][0],
+                        video_grid_thw[video_index][1],
+                        video_grid_thw[video_index][2],
+                    )
+                    if second_per_grid_ts is not None:
+                        second_per_grid_t = second_per_grid_ts[video_index]
+                    else:
+                        second_per_grid_t = 1.0
+                    video_index += 1
+                    remain_videos -= 1
+                    ed = ed_video
+                t_int, h_int, w_int = int(t), int(h), int(w)
+                llm_grid_t = t_int
+                llm_grid_h = h_int // spatial_merge_size
+                llm_grid_w = w_int // spatial_merge_size
+                text_len = ed - st
+                st_idx = (
+                    llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+                )
+                llm_pos_ids_list.append(
+                    torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+                )
+                if model_type in ("qwen2_5_vl", "paddleocr_vl"):
+                    range_tensor = torch.arange(llm_grid_t).view(-1, 1)
+                    expanded_range = range_tensor.expand(-1, llm_grid_h * llm_grid_w)
+                    time_tensor = expanded_range * second_per_grid_t * tokens_per_second
+                    t_index = time_tensor.long().flatten()
+                elif model_type in (
+                    "qwen2_vl",
+                    "qwen3_vl",
+                    "qwen3_vl_moe",
+                    "qwen3_5",
+                    "qwen3_5_moe",
+                ):
+                    t_index = (
+                        torch.arange(llm_grid_t, device=position_ids.device)
+                        .view(-1, 1)
+                        .expand(llm_grid_t, llm_grid_h * llm_grid_w)
+                        .reshape(-1)
+                    )
+                else:
+                    raise RuntimeError(f"Unimplemented model type: {model_type}")
+                h_index = (
+                    torch.arange(llm_grid_h, device=position_ids.device)
+                    .view(1, -1, 1)
+                    .expand(llm_grid_t, llm_grid_h, llm_grid_w)
+                    .reshape(-1)
+                )
+                w_index = (
+                    torch.arange(llm_grid_w, device=position_ids.device)
+                    .view(1, 1, -1)
+                    .expand(llm_grid_t, llm_grid_h, llm_grid_w)
+                    .reshape(-1)
+                )
+                llm_pos_ids_list.append(
+                    torch.stack([t_index, h_index, w_index]) + text_len + st_idx
+                )
+                st = ed + llm_grid_t * llm_grid_h * llm_grid_w
+            if st < len(input_tokens):
+                st_idx = (
+                    llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+                )
+                text_len = len(input_tokens) - st
+                llm_pos_ids_list.append(
+                    torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+                )
+            llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+            position_ids[..., i, :] = llm_positions.to(position_ids.device)
+            mrope_position_deltas.append(
+                llm_positions.max() + 1 - len(total_input_ids[i])
+            )
+        mrope_position_deltas = torch.tensor(
+            mrope_position_deltas, device=input_ids.device
+        ).unsqueeze(1)
+        return position_ids, mrope_position_deltas
+    else:
+        s = input_ids.shape[1]
+        position_ids = torch.arange(s)
+        position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(input_ids.device)
+        max_position_ids = position_ids.amax(dim=0, keepdim=False)
+        mrope_position_deltas = max_position_ids.amax(-1, keepdim=True) + 1 - s
+        return position_ids, mrope_position_deltas
+
+
+def get_rope_index_qwen3_omni(
+    spatial_merge_size: int,
+    image_token_id: int,
+    video_token_id: int,
+    vision_start_token_id: int,
+    tokens_per_second: Optional[int] = None,
+    input_ids: Optional[torch.LongTensor] = None,
+    image_grid_thw: Optional[torch.LongTensor] = None,
+    video_grid_thw: Optional[torch.LongTensor] = None,
+    second_per_grid_ts: Optional[torch.Tensor] = None,
+    **kwargs,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    audio_token_id = kwargs["audio_token_id"]
+    audio_start_token_id = kwargs["audio_start_token_id"]
+    position_id_per_seconds = kwargs["position_id_per_seconds"]
+    use_audio_in_video = kwargs.get("use_audio_in_video", False)
+    audio_seqlens = kwargs.get("audio_seqlens", None)
+    second_per_grids = second_per_grid_ts
+
+    mrope_position_deltas = []
+    if input_ids is not None and (
+        image_grid_thw is not None or video_grid_thw is not None
+    ):
+        total_input_ids = input_ids
+        position_ids = torch.zeros(
+            3,
+            input_ids.shape[0],
+            input_ids.shape[1],
+            dtype=torch.float,
+            device=input_ids.device,
+        )
+        image_idx, video_idx, audio_idx = 0, 0, 0
+        for i, current_input_ids in enumerate(total_input_ids):
+            image_nums, video_nums, audio_nums = 0, 0, 0
+            vision_start_indices = torch.argwhere(
+                current_input_ids == vision_start_token_id
+            ).squeeze(1)
+            if vision_start_indices.numel() > 0:
+                vision_tokens = current_input_ids[vision_start_indices + 1]
+                image_nums = (vision_tokens == image_token_id).sum()
+                video_nums = (
+                    (vision_tokens == audio_start_token_id).sum()
+                    if use_audio_in_video
+                    else (vision_tokens == video_token_id).sum()
+                )
+            audio_nums = torch.sum(current_input_ids == audio_start_token_id)
+            input_tokens = current_input_ids.tolist()
+            llm_pos_ids_list: list = []
+            st = 0
+            remain_images, remain_videos, remain_audios = (
+                image_nums,
+                video_nums,
+                audio_nums,
+            )
+            multimodal_nums = (
+                image_nums + audio_nums
+                if use_audio_in_video
+                else image_nums + video_nums + audio_nums
+            )
+            for _ in range(multimodal_nums):
+                st_idx = (
+                    llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+                )
+                ed_vision_start = (
+                    input_tokens.index(vision_start_token_id, st)
+                    if (
+                        (
+                            image_token_id in input_tokens
+                            or video_token_id in input_tokens
+                        )
+                        and (remain_videos > 0 or remain_images > 0)
+                    )
+                    else len(input_tokens) + 1
+                )
+                ed_audio_start = (
+                    input_tokens.index(audio_start_token_id, st)
+                    if (audio_token_id in input_tokens and remain_audios > 0)
+                    else len(input_tokens) + 1
+                )
+                min_ed = min(ed_vision_start, ed_audio_start)
+                text_len = min_ed - st
+                if text_len != 0:
+                    llm_pos_ids_list.append(
+                        torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+                    )
+                    st_idx += text_len
+                if min_ed == ed_vision_start and ed_vision_start + 1 == ed_audio_start:
+                    bos_len, eos_len = 2, 2
+                else:
+                    bos_len, eos_len = 1, 1
+                llm_pos_ids_list.append(
+                    torch.arange(bos_len).view(1, -1).expand(3, -1) + st_idx
+                )
+                st_idx += bos_len
+                # Audio Only
+                if min_ed == ed_audio_start:
+                    audio_len = _get_feat_extract_output_lengths(
+                        audio_seqlens[audio_idx]
+                    )
+                    llm_pos_ids = (
+                        torch.arange(audio_len).view(1, -1).expand(3, -1) + st_idx
+                    )
+                    llm_pos_ids_list.append(llm_pos_ids)
+                    st += int(text_len + bos_len + audio_len + eos_len)
+                    audio_idx += 1
+                    remain_audios -= 1
+                # Image Only
+                elif (
+                    min_ed == ed_vision_start
+                    and current_input_ids[ed_vision_start + 1] == image_token_id
+                ):
+                    grid_t = image_grid_thw[image_idx][0]
+                    grid_hs = image_grid_thw[:, 1]
+                    grid_ws = image_grid_thw[:, 2]
+                    t_index = (
+                        torch.arange(grid_t) * 1 * position_id_per_seconds
+                    ).float()
+                    llm_pos_ids = _get_llm_pos_ids_for_vision(
+                        st_idx,
+                        image_idx,
+                        spatial_merge_size,
+                        t_index,
+                        grid_hs,
+                        grid_ws,
+                        input_ids.device,
+                    )
+                    image_len = image_grid_thw[image_idx].prod() // (
+                        spatial_merge_size**2
+                    )
+                    llm_pos_ids_list.append(llm_pos_ids)
+                    st += int(text_len + bos_len + image_len + eos_len)
+                    image_idx += 1
+                    remain_images -= 1
+                # Video Only
+                elif (
+                    min_ed == ed_vision_start
+                    and current_input_ids[ed_vision_start + 1] == video_token_id
+                ):
+                    grid_t = video_grid_thw[video_idx][0]
+                    grid_hs = video_grid_thw[:, 1]
+                    grid_ws = video_grid_thw[:, 2]
+                    t_index = (
+                        torch.arange(grid_t)
+                        * second_per_grids[video_idx].cpu().float()
+                        * position_id_per_seconds
+                    ).float()
+                    llm_pos_ids = _get_llm_pos_ids_for_vision(
+                        st_idx,
+                        video_idx,
+                        spatial_merge_size,
+                        t_index,
+                        grid_hs,
+                        grid_ws,
+                        input_ids.device,
+                    )
+                    video_len = video_grid_thw[video_idx].prod() // (
+                        spatial_merge_size**2
+                    )
+                    llm_pos_ids_list.append(llm_pos_ids)
+                    st += int(text_len + bos_len + video_len + eos_len)
+                    video_idx += 1
+                    remain_videos -= 1
+                # Audio in Video
+                elif (
+                    min_ed == ed_vision_start and ed_vision_start + 1 == ed_audio_start
+                ):
+                    audio_len = _get_feat_extract_output_lengths(
+                        audio_seqlens[audio_idx]
+                    )
+                    audio_llm_pos_ids = (
+                        torch.arange(audio_len).view(1, -1).expand(3, -1) + st_idx
+                    )
+                    grid_t = video_grid_thw[video_idx][0]
+                    grid_hs = video_grid_thw[:, 1]
+                    grid_ws = video_grid_thw[:, 2]
+                    t_index = (
+                        torch.arange(grid_t)
+                        * second_per_grids[video_idx].cpu().float()
+                        * position_id_per_seconds
+                    ).float()
+                    video_llm_pos_ids = _get_llm_pos_ids_for_vision(
+                        st_idx,
+                        video_idx,
+                        spatial_merge_size,
+                        t_index,
+                        grid_hs,
+                        grid_ws,
+                        input_ids.device,
+                    )
+                    video_data_index, audio_data_index = 0, 0
+                    while (
+                        video_data_index < video_llm_pos_ids.shape[-1]
+                        and audio_data_index < audio_llm_pos_ids.shape[-1]
+                    ):
+                        if (
+                            video_llm_pos_ids[0][video_data_index]
+                            <= audio_llm_pos_ids[0][audio_data_index]
+                        ):
+                            llm_pos_ids_list.append(
+                                video_llm_pos_ids[
+                                    :, video_data_index : video_data_index + 1
+                                ]
+                            )
+                            video_data_index += 1
+                        else:
+                            llm_pos_ids_list.append(
+                                audio_llm_pos_ids[
+                                    :, audio_data_index : audio_data_index + 1
+                                ]
+                            )
+                            audio_data_index += 1
+                    if video_data_index < video_llm_pos_ids.shape[-1]:
+                        llm_pos_ids_list.append(
+                            video_llm_pos_ids[
+                                :, video_data_index : video_llm_pos_ids.shape[-1]
+                            ]
+                        )
+                    if audio_data_index < audio_llm_pos_ids.shape[-1]:
+                        llm_pos_ids_list.append(
+                            audio_llm_pos_ids[
+                                :, audio_data_index : audio_llm_pos_ids.shape[-1]
+                            ]
+                        )
+                    video_len = video_grid_thw[video_idx].prod() // (
+                        spatial_merge_size**2
+                    )
+                    st += int(text_len + bos_len + audio_len + video_len + eos_len)
+                    audio_idx += 1
+                    video_idx += 1
+                    remain_videos -= 1
+                    remain_audios -= 1
+                st_idx = (
+                    llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+                )
+                llm_pos_ids_list.append(
+                    torch.arange(eos_len).view(1, -1).expand(3, -1) + st_idx
+                )
+            if st < len(input_tokens):
+                st_idx = (
+                    llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+                )
+                text_len = len(input_tokens) - st
+                llm_pos_ids_list.append(
+                    torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+                )
+            llm_positions = torch.cat(
+                [item.float() for item in llm_pos_ids_list], dim=1
+            ).reshape(3, -1)
+            position_ids[..., i, :] = llm_positions.to(position_ids.device)
+            mrope_position_deltas.append(
+                llm_positions.max() + 1 - len(current_input_ids)
+            )
+        mrope_position_deltas = torch.tensor(
+            mrope_position_deltas, device=input_ids.device
+        ).unsqueeze(1)
+        return position_ids, mrope_position_deltas
+    else:
+        s = input_ids.shape[1]
+        position_ids = torch.arange(s)
+        position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(input_ids.device)
+        max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[
+            0
+        ]
+        mrope_position_deltas = max_position_ids + 1 - s
+        return position_ids, mrope_position_deltas
+
+
+def get_rope_index_glm4v(
+    input_ids: torch.Tensor,
+    hf_config: Any,
+    image_grid_thw: Union[List[List[int]], torch.Tensor],
+    video_grid_thw: Union[List[List[int]], torch.Tensor],
+    attention_mask: torch.Tensor,
+    **kwargs,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Get mrope input positions and delta value for GLM4V."""
+    image_token_id = hf_config.image_token_id
+    video_start_token_id = hf_config.video_start_token_id
+    video_end_token_id = hf_config.video_end_token_id
+    spatial_merge_size = hf_config.vision_config.spatial_merge_size
+
+    mrope_position_deltas = []
+
+    if input_ids is not None and (
+        image_grid_thw is not None or video_grid_thw is not None
+    ):
+        total_input_ids = input_ids
+
+        if attention_mask is None:
+            attention_mask = torch.ones_like(total_input_ids)
+
+        position_ids = torch.ones(
+            3,
+            input_ids.shape[0],
+            input_ids.shape[1],
+            dtype=input_ids.dtype,
+            device=input_ids.device,
+        )
+
+        image_index, video_index = 0, 0
+        video_group_index = 0
+        attention_mask = attention_mask.to(total_input_ids.device)
+
+        for i, ids in enumerate(total_input_ids):
+            curr_mask = attention_mask[i]
+            ids_masked = ids[curr_mask == 1]
+
+            input_tokens = ids_masked.tolist()
+            input_token_type = [""] * len(input_tokens)
+
+            video_check_flg = False
+            for j, token in enumerate(input_tokens):
+                if token == video_start_token_id:
+                    video_check_flg = True
+                elif token == video_end_token_id:
+                    video_check_flg = False
+
+                if token == image_token_id and not video_check_flg:
+                    input_token_type[j] = "image"
+                elif token == image_token_id and video_check_flg:
+                    input_token_type[j] = "video"
+                else:
+                    input_token_type[j] = "text"
+
+            input_type_group = []
+            for key, group in itertools.groupby(
+                enumerate(input_token_type), lambda x: x[1]
+            ):
+                group = list(group)
+                start_index = group[0][0]
+                end_index = group[-1][0] + 1
+                input_type_group.append((key, start_index, end_index))
+
+            llm_pos_ids_list = []
+            video_frame_num = 1
+
+            for modality_type, start_idx, end_idx in input_type_group:
+                if llm_pos_ids_list:
+                    st_idx = llm_pos_ids_list[-1].max().item() + 1
+                else:
+                    st_idx = 0
+
+                if modality_type == "image":
+                    t, h, w = (
+                        image_grid_thw[image_index][0],
+                        image_grid_thw[image_index][1],
+                        image_grid_thw[image_index][2],
+                    )
+                    t_int, h_int, w_int = int(t), int(h), int(w)
+                    llm_grid_t = t_int
+                    llm_grid_h = h_int // spatial_merge_size
+                    llm_grid_w = w_int // spatial_merge_size
+
+                    t_index = (
+                        torch.arange(llm_grid_t, device=position_ids.device)
+                        .view(-1, 1)
+                        .expand(llm_grid_t, llm_grid_h * llm_grid_w)
+                        .reshape(-1)
+                    )
+                    h_index = (
+                        torch.arange(llm_grid_h, device=position_ids.device)
+                        .view(1, -1, 1)
+                        .expand(llm_grid_t, llm_grid_h, llm_grid_w)
+                        .reshape(-1)
+                    )
+                    w_index = (
+                        torch.arange(llm_grid_w, device=position_ids.device)
+                        .view(1, 1, -1)
+                        .expand(llm_grid_t, llm_grid_h, llm_grid_w)
+                        .reshape(-1)
+                    )
+                    llm_pos_ids_list.append(
+                        torch.stack([t_index, h_index, w_index]) + st_idx
+                    )
+                    image_index += 1
+                    video_frame_num = 1
+
+                elif modality_type == "video":
+                    t = video_frame_num
+                    h = video_grid_thw[video_index][1]
+                    w = video_grid_thw[video_index][2]
+                    h_int, w_int = int(h), int(w)
+                    llm_grid_h = h_int // spatial_merge_size
+                    llm_grid_w = w_int // spatial_merge_size
+
+                    for t_idx in range(t):
+                        t_index = (
+                            torch.tensor(t_idx, device=position_ids.device)
+                            .view(-1, 1)
+                            .expand(1, llm_grid_h * llm_grid_w)
+                            .reshape(-1)
+                        )
+                        h_index = (
+                            torch.arange(llm_grid_h, device=position_ids.device)
+                            .view(1, -1, 1)
+                            .expand(1, llm_grid_h, llm_grid_w)
+                            .reshape(-1)
+                        )
+                        w_index = (
+                            torch.arange(llm_grid_w, device=position_ids.device)
+                            .view(1, 1, -1)
+                            .expand(1, llm_grid_h, llm_grid_w)
+                            .reshape(-1)
+                        )
+                        llm_pos_ids_list.append(
+                            torch.stack([t_index, h_index, w_index]) + st_idx
+                        )
+
+                    video_group_index += 1
+                    if video_group_index >= video_grid_thw[video_index][0]:
+                        video_index += 1
+                        video_group_index = 0
+                    video_frame_num += 1
+
+                else:  # text
+                    text_len = end_idx - start_idx
+                    text_range = torch.arange(text_len, device=position_ids.device)
+                    text_pos = text_range.view(1, -1).expand(3, text_len) + st_idx
+                    llm_pos_ids_list.append(text_pos)
+                    video_frame_num = 1
+
+            llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+            idx_mask = curr_mask == 1
+            position_ids[..., i, idx_mask] = llm_positions.to(position_ids.device)
+            mrope_position_deltas.append(
+                llm_positions.max() + 1 - len(total_input_ids[i])
+            )
+        mrope_position_deltas = torch.tensor(
+            mrope_position_deltas, device=input_ids.device
+        ).unsqueeze(1)
+        return position_ids, mrope_position_deltas
+    else:
+        if attention_mask is not None:
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            position_ids = (
+                position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device)
+            )
+            max_position_ids = position_ids.amax(dim=0, keepdim=False)
+            mrope_position_deltas = (
+                max_position_ids.amax(-1, keepdim=True) + 1 - attention_mask.shape[-1]
+            )
+        else:
+            length = input_ids.shape[1]
+            batch_size = input_ids.shape[0]
+            arange_ids = torch.arange(length, device=input_ids.device).view(1, 1, -1)
+            position_ids = arange_ids.expand(3, batch_size, length)
+            mrope_position_deltas = torch.zeros(
+                [batch_size, 1],
+                device=input_ids.device,
+                dtype=input_ids.dtype,
+            )
+        return position_ids, mrope_position_deltas
+
+
+def get_rope_index_ernie45(
+    input_ids: torch.Tensor,
+    hf_config: Any,
+    image_grid_thw: Union[List[List[int]], torch.Tensor],
+    video_grid_thw: Union[List[List[int]], torch.Tensor],
+    **kwargs,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Get mrope input positions and delta value for Ernie VL."""
+    image_token_id = hf_config.im_patch_id
+    video_start_token_id = hf_config.video_start_token_id
+    video_end_token_id = hf_config.video_end_token_id
+    spatial_conv_size = hf_config.spatial_conv_size
+    temporal_conv_size = hf_config.temporal_conv_size
+
+    mrope_position_deltas = []
+    if input_ids is not None and (
+        image_grid_thw is not None or video_grid_thw is not None
+    ):
+        total_input_ids = input_ids
+        position_ids = torch.ones(
+            3,
+            input_ids.shape[0],
+            input_ids.shape[1],
+            dtype=input_ids.dtype,
+            device=input_ids.device,
+        )
+        image_index, video_index = 0, 0
+        for i, input_ids in enumerate(total_input_ids):
+            input_tokens = input_ids.tolist()
+
+            input_token_type = []
+            video_check_flg = False
+            for token in input_tokens:
+                if token == video_start_token_id:
+                    video_check_flg = True
+                elif token == video_end_token_id:
+                    video_check_flg = False
+
+                if token == image_token_id and not video_check_flg:
+                    input_token_type.append("image")
+                elif token == image_token_id and video_check_flg:
+                    input_token_type.append("video")
+                else:
+                    input_token_type.append("text")
+
+            input_type_group = []
+            for key, group in itertools.groupby(
+                enumerate(input_token_type), lambda x: x[1]
+            ):
+                group = list(group)
+                start_index = group[0][0]
+                end_index = group[-1][0] + 1
+                input_type_group.append((key, start_index, end_index))
+
+            llm_pos_ids_list = []
+            video_frame_num = 1
+            for modality_type, start_idx, end_idx in input_type_group:
+                st_idx = (
+                    llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+                )
+
+                if modality_type == "image":
+                    t, h, w = (
+                        image_grid_thw[image_index][0],
+                        image_grid_thw[image_index][1],
+                        image_grid_thw[image_index][2],
+                    )
+                    llm_grid_t, llm_grid_h, llm_grid_w = (
+                        t.item(),
+                        h.item() // spatial_conv_size,
+                        w.item() // spatial_conv_size,
+                    )
+
+                    t_index = (
+                        torch.arange(llm_grid_t)
+                        .view(-1, 1)
+                        .expand(-1, llm_grid_h * llm_grid_w)
+                        .flatten()
+                    )
+                    h_index = (
+                        torch.arange(llm_grid_h)
+                        .view(1, -1, 1)
+                        .expand(llm_grid_t, -1, llm_grid_w)
+                        .flatten()
+                    )
+                    w_index = (
+                        torch.arange(llm_grid_w)
+                        .view(1, 1, -1)
+                        .expand(llm_grid_t, llm_grid_h, -1)
+                        .flatten()
+                    )
+                    llm_pos_ids_list.append(
+                        torch.stack([t_index, h_index, w_index]) + st_idx
+                    )
+                    image_index += 1
+                    video_frame_num = 1
+
+                elif modality_type == "video":
+                    t, h, w = (
+                        video_grid_thw[video_index][0],
+                        video_grid_thw[video_index][1],
+                        video_grid_thw[video_index][2],
+                    )
+                    llm_grid_t, llm_grid_h, llm_grid_w = (
+                        t.item() // temporal_conv_size,
+                        h.item() // spatial_conv_size,
+                        w.item() // spatial_conv_size,
+                    )
+
+                    for t_idx in range(llm_grid_t):
+                        t_index = (
+                            torch.tensor(t_idx)
+                            .view(-1, 1)
+                            .expand(-1, llm_grid_h * llm_grid_w)
+                            .flatten()
+                        )
+                        h_index = (
+                            torch.arange(llm_grid_h)
+                            .view(1, -1, 1)
+                            .expand(1, -1, llm_grid_w)
+                            .flatten()
+                        )
+                        w_index = (
+                            torch.arange(llm_grid_w)
+                            .view(1, 1, -1)
+                            .expand(1, llm_grid_h, -1)
+                            .flatten()
+                        )
+                        llm_pos_ids_list.append(
+                            torch.stack([t_index, h_index, w_index]) + st_idx
+                        )
+                    video_index += 1
+                    video_frame_num += 1
+
+                else:
+                    text_len = end_idx - start_idx
+                    llm_pos_ids_list.append(
+                        torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+                    )
+                    video_frame_num = 1
+
+            llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+            position_ids[..., i, :] = llm_positions.to(position_ids.device)
+            mrope_position_deltas.append(
+                llm_positions.max() + 1 - len(total_input_ids[i])
+            )
+        mrope_position_deltas = torch.tensor(
+            mrope_position_deltas, device=input_ids.device
+        ).unsqueeze(1)
+        return position_ids, mrope_position_deltas
+    else:
+        s = input_ids.shape[1]
+        position_ids = torch.arange(s)
+        position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(input_ids.device)
+        max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[
+            0
+        ]
+        mrope_position_deltas = max_position_ids + 1 - s
+        return position_ids, mrope_position_deltas
diff --git a/sglang/python/sglang/srt/layers/rotary_embedding/rope_variant.py b/sglang/python/sglang/srt/layers/rotary_embedding/rope_variant.py
new file mode 100644
index 0000000000000000000000000000000000000000..28aaae598bc8bf4571b3656b5765b8f5f5234c80
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/rotary_embedding/rope_variant.py
@@ -0,0 +1,868 @@
+"""RoPE scaling variants: Phi3LongRoPE, FourierRoPE, DeepseekScaling, Llama3,
+Llama4Vision, DynamicNTK, DynamicNTKAlpha, DualChunkRotaryEmbedding."""
+
+from __future__ import annotations
+
+import math
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from sglang.srt.layers.rotary_embedding.base import RotaryEmbedding
+from sglang.srt.layers.rotary_embedding.utils import (
+    apply_rotary_pos_emb_native,
+    rotate_gptj,
+    rotate_neox,
+)
+from sglang.srt.layers.rotary_embedding.yarn import (
+    yarn_find_correction_range,
+    yarn_get_mscale,
+    yarn_linear_ramp_mask,
+)
+from sglang.srt.layers.utils import MultiPlatformOp
+from sglang.srt.utils import cpu_has_amx_support, get_device, is_cuda, is_hip, is_npu
+
+_is_cuda = is_cuda()
+_is_hip = is_hip()
+_is_npu = is_npu()
+_is_cpu_amx_available = cpu_has_amx_support()
+
+if _is_npu:
+    import torch_npu
+
+
+class Phi3LongRoPEScaledRotaryEmbedding(nn.Module):
+    """Phi3 family of models scaled rotary embedding."""
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        original_max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        dtype: torch.dtype,
+        short_factor: List[float],
+        long_factor: List[float],
+        short_mscale: Optional[float] = None,
+        long_mscale: Optional[float] = None,
+    ):
+        super().__init__()
+
+        if is_neox_style is False:
+            raise ValueError(
+                "`Phi3LongRoPEScaledRotaryEmbedding` only supports neox_style."
+            )
+
+        self.rotary_dim = rotary_dim
+        self.head_size = head_size
+        self.max_position_embeddings = max_position_embeddings
+        self.original_max_position_embeddings = original_max_position_embeddings
+        self.base = base
+        self.short_factor = short_factor
+        self.long_factor = long_factor
+
+        scale = self.max_position_embeddings / self.original_max_position_embeddings
+        if scale <= 1.0:
+            scaling_factor = 1.0
+        else:
+            scaling_factor = math.sqrt(
+                1 + math.log(scale) / math.log(self.original_max_position_embeddings)
+            )
+        if short_mscale is None:
+            short_mscale = scaling_factor
+        if long_mscale is None:
+            long_mscale = scaling_factor
+
+        self.short_mscale = short_mscale
+        self.long_mscale = long_mscale
+
+        short_cache = self._compute_cos_sin_cache(
+            original_max_position_embeddings, short_factor, short_mscale
+        )
+        short_cache = short_cache.to(dtype)
+        self.register_buffer("short_cos_sin_cache", short_cache, persistent=False)
+
+        long_cache = self._compute_cos_sin_cache(
+            max_position_embeddings, long_factor, long_mscale
+        )
+        long_cache = long_cache.to(dtype)
+        self.register_buffer("long_cos_sin_cache", long_cache, persistent=False)
+
+        long_short_cache = torch.cat(
+            [self.short_cos_sin_cache, self.long_cos_sin_cache], dim=0
+        )
+        self.register_buffer(
+            "long_short_cos_sin_cache", long_short_cache, persistent=False
+        )
+
+    def _compute_inv_freq(self, rescale_factors: List[float]) -> torch.Tensor:
+        rescale_factors = torch.tensor(rescale_factors, dtype=torch.float32)
+        inv_freq = 1.0 / (
+            rescale_factors
+            * (
+                self.base
+                ** (
+                    torch.arange(0, self.rotary_dim, 2, dtype=torch.float)
+                    / self.rotary_dim
+                )
+            )
+        )
+        return inv_freq
+
+    def _compute_cos_sin_cache(
+        self,
+        max_position_embeddings: int,
+        rescale_factors: List[float],
+        mscale: float,
+    ) -> torch.Tensor:
+        inv_freq = self._compute_inv_freq(rescale_factors)
+        t = torch.arange(max_position_embeddings, dtype=torch.float)
+        freqs = torch.einsum("i,j -> ij", t, inv_freq)
+        cos = freqs.cos() * mscale
+        sin = freqs.sin() * mscale
+        cache = torch.cat((cos, sin), dim=-1)
+        return cache
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        query = query.unflatten(1, (-1, self.head_size))
+        key = key.unflatten(1, (-1, self.head_size))
+
+        k = self.original_max_position_embeddings
+        long_prompt_offset = (
+            torch.any(positions > k).float() * torch.full_like(positions, k)
+        ).long()
+        idx = (
+            torch.add(positions, long_prompt_offset)
+            if long_prompt_offset is not None
+            else positions
+        )
+        self.long_short_cos_sin_cache: torch.Tensor = self.long_short_cos_sin_cache.to(
+            idx.device
+        )
+        idx = torch.add(idx, offsets) if offsets is not None else idx
+        cos_sin = torch.index_select(self.long_short_cos_sin_cache, 0, idx)
+
+        cos, sin = cos_sin.chunk(2, dim=-1)
+        cos = cos.repeat(1, 2).unsqueeze(-2)
+        sin = sin.repeat(1, 2).unsqueeze(-2)
+
+        query_rot = query[..., : self.rotary_dim]
+        query_pass = query[..., self.rotary_dim :]
+        query_rot = query_rot * cos + rotate_neox(query_rot) * sin
+        query = torch.cat((query_rot, query_pass), dim=-1)
+
+        key_rot = key[..., : self.rotary_dim]
+        key_pass = key[..., self.rotary_dim :]
+        key_rot = key_rot * cos + rotate_neox(key_rot) * sin
+        key = torch.cat((key_rot, key_pass), dim=-1)
+
+        return query.flatten(-2), key.flatten(-2)
+
+
+class FourierRotaryEmbedding(nn.Module):
+    """Fourier RotaryEmbedding extended."""
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        dtype: torch.dtype,
+        num_kv_heads: int,
+        *,
+        fope_init_factor: float = 0.1,
+        fope_sep_head: bool = True,
+        num_inv_freq: int = None,
+        device: Optional[str] = "cuda",
+    ) -> None:
+        self.fope_init_factor = fope_init_factor
+        self.fope_sep_head = fope_sep_head
+        self.num_inv_freq = num_inv_freq
+        self.num_kv_heads = num_kv_heads
+        self.device = device
+
+        super().__init__()
+        self.head_size = head_size
+        self.rotary_dim = rotary_dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        self.is_neox_style = is_neox_style
+        self.dtype = dtype
+
+        self.inv_freq: torch.Tensor
+        self.register_buffer(
+            "inv_freq", self._compute_inv_freq(self.base), persistent=False
+        )
+        self.input_dim = self.inv_freq.shape[-1]
+        self.output_dim = self.inv_freq.shape[-1]
+        self.cos_coef = nn.Parameter(
+            torch.empty(
+                self.num_kv_heads, self.input_dim, self.output_dim, dtype=torch.float32
+            ),
+            requires_grad=False,
+        )
+        self.sin_coef = nn.Parameter(
+            torch.empty(
+                self.num_kv_heads, self.input_dim, self.output_dim, dtype=torch.float32
+            ),
+            requires_grad=False,
+        )
+        self.cos_sin_cache: torch.Tensor
+        self.register_buffer(
+            "cos_sin_cache", self._compute_cos_sin_cache(), persistent=False
+        )
+        self.update_buffer = False
+
+    def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
+        inv_freq = 1.0 / (
+            base
+            ** (
+                torch.arange(0, self.rotary_dim, 2, dtype=torch.int64).to(
+                    device=self.device, dtype=torch.float
+                )
+                / self.rotary_dim
+            )
+        )
+        assert (
+            inv_freq[:-1] > inv_freq[1:]
+        ).all(), "Expected inv_freq to be in decreasing order"
+        inv_freq_idx_selected = torch.ones_like(inv_freq, dtype=torch.bool)
+        if self.num_inv_freq is not None:
+            inv_freq_idx_selected[self.num_inv_freq :] = False
+        else:
+            inv_freq_idx_selected = inv_freq > (
+                2.0 * torch.pi / self.max_position_embeddings
+            )
+        inv_freq = inv_freq[inv_freq_idx_selected]
+        return inv_freq
+
+    def _compute_cos_sin_cache(self) -> torch.Tensor:
+        t = torch.arange(
+            self.max_position_embeddings, dtype=torch.float, device=self.device
+        )
+        freqs = torch.einsum("i,j -> ij", t, self.inv_freq)
+        if self.fope_sep_head:
+            pos_cos = freqs.cos().unsqueeze(0).expand(self.num_kv_heads, -1, -1)
+            pos_sin = freqs.sin().unsqueeze(0).expand(self.num_kv_heads, -1, -1)
+        else:
+            pos_cos = freqs.cos()
+            pos_sin = freqs.sin()
+        if self.fope_sep_head:
+            sin = torch.einsum("htD, hDd -> thd", pos_sin, self.sin_coef.float())
+            cos = torch.einsum("htD, hDd -> thd", pos_cos, self.cos_coef.float())
+        else:
+            sin = torch.einsum("tD, Dd -> td", pos_sin, self.sin_coef.float())
+            cos = torch.einsum("tD, Dd -> td", pos_cos, self.cos_coef.float())
+        sin = F.pad(
+            input=sin,
+            pad=(0, self.head_size // 2 - sin.size(-1)),
+            mode="constant",
+            value=1,
+        )
+        cos = F.pad(
+            input=cos,
+            pad=(0, self.head_size // 2 - cos.size(-1)),
+            mode="constant",
+            value=1,
+        )
+        sin = torch.cat((sin, sin), dim=-1)
+        cos = torch.cat((cos, cos), dim=-1)
+        cache = torch.cat((cos, sin), dim=-1)
+        return cache
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if not self.update_buffer:
+            self.cos_sin_cache = self._compute_cos_sin_cache()
+            self.update_buffer = True
+        query = query.unflatten(-1, (-1, self.head_size))
+        key = key.unflatten(-1, (-1, self.head_size))
+        positions_with_offsets = (
+            torch.add(positions, offsets) if offsets is not None else positions
+        )
+        cos_sin = torch.index_select(self.cos_sin_cache, 0, positions_with_offsets).to(
+            dtype=query.dtype
+        )
+        cos, sin = cos_sin.chunk(2, dim=-1)
+        assert (
+            query.dim() == key.dim() == 3
+        ), "Expected query key (seq_len, heads, head_dim)"
+        assert cos.dim() <= 3 and sin.dim() <= 3
+        need_reshape = False
+        if cos.dim() == 3:
+            need_reshape = True
+            query_shape = query.shape
+            key_shape = key.shape
+            cos = cos.flatten(0, 1)
+            sin = sin.flatten(0, 1)
+            seq_len = cos.size(0)
+            query = query.reshape(seq_len, -1, query.size(-1))
+            key = key.reshape(seq_len, -1, key.size(-1))
+        query, key = apply_rotary_pos_emb_native(query, key, cos, sin)
+        if need_reshape:
+            query = query.reshape(query_shape)
+            key = key.reshape(key_shape)
+        return query.flatten(-2), key.flatten(-2)
+
+    def extra_repr(self) -> str:
+        s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}"
+        s += f", max_position_embeddings={self.max_position_embeddings}"
+        s += f", base={self.base}, is_neox_style={self.is_neox_style}"
+        s += f", fope_init_factor={self.fope_init_factor}, fope_sep_head={self.fope_sep_head}"
+        s += f", num_inv_freq={self.num_inv_freq}, num_kv_heads={self.num_kv_heads}"
+        return s
+
+
+class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
+    """RotaryEmbedding extended with YaRN method.
+
+    Credits to Peng et al. github.com/jquesnelle/yarn
+    """
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        scaling_factor: float,
+        dtype: torch.dtype,
+        *,
+        extrapolation_factor: float = 1,
+        attn_factor: float = 1,
+        beta_fast: int = 32,
+        beta_slow: int = 1,
+        mscale: float = 1,
+        mscale_all_dim: float = 0,
+        device: Optional[str] = None,
+    ) -> None:
+        self.scaling_factor = scaling_factor
+        self.extrapolation_factor = extrapolation_factor
+        self.attn_factor = attn_factor
+        self.beta_fast = beta_fast
+        self.beta_slow = beta_slow
+        self.mscale = float(
+            yarn_get_mscale(self.scaling_factor, float(mscale))
+            / yarn_get_mscale(self.scaling_factor, float(mscale_all_dim))
+            * attn_factor
+        )
+        self.cos_cached_total = None
+        self.sin_cached_total = None
+        self.cos_cached = None
+        self.sin_cached = None
+        self.device = device if device is not None else get_device()
+        super().__init__(
+            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
+        )
+        if _is_hip:
+            self._forward_method = self.forward_native
+
+    def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor:
+        pos_freqs = self.base ** (
+            torch.arange(0, self.rotary_dim, 2, dtype=torch.float, device=self.device)
+            / self.rotary_dim
+        )
+        inv_freq_extrapolation = 1.0 / pos_freqs
+        inv_freq_interpolation = 1.0 / (scaling_factor * pos_freqs)
+        low, high = yarn_find_correction_range(
+            self.beta_fast,
+            self.beta_slow,
+            self.rotary_dim,
+            self.base,
+            self.max_position_embeddings,
+        )
+        inv_freq_mask = (
+            1
+            - yarn_linear_ramp_mask(
+                low, high, self.rotary_dim // 2, dtype=torch.float, device=self.device
+            )
+        ) * self.extrapolation_factor
+        inv_freq = (
+            inv_freq_interpolation * (1 - inv_freq_mask)
+            + inv_freq_extrapolation * inv_freq_mask
+        )
+        return inv_freq
+
+    def _compute_cos_sin_cache(self) -> torch.Tensor:
+        inv_freq = self._compute_inv_freq(self.scaling_factor)
+        t = torch.arange(
+            self.max_position_embeddings * self.scaling_factor,
+            device=self.device,
+            dtype=torch.float32,
+        )
+        freqs = torch.einsum("i,j -> ij", t, inv_freq)
+        cos = freqs.cos() * self.mscale
+        sin = freqs.sin() * self.mscale
+        cache = torch.cat((cos, sin), dim=-1)
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.cos_cached_total = torch.cos(emb) * self.mscale
+        self.sin_cached_total = torch.sin(emb) * self.mscale
+        return cache
+
+    def get_cos_cached_total(self):
+        return self.cos_cached_total
+
+    def get_sin_cached_total(self):
+        return self.sin_cached_total
+
+    def get_cos_sin_cache(
+        self, positions, dtype, offsets: Optional[torch.Tensor] = None
+    ):
+        self.cos_cached = (
+            self.cos_cached_total[
+                torch.add(positions, offsets) if offsets is not None else positions
+            ]
+            .unsqueeze(-2)
+            .unsqueeze(-2)
+            .to(dtype)
+        )
+        self.sin_cached = (
+            self.sin_cached_total[
+                torch.add(positions, offsets) if offsets is not None else positions
+            ]
+            .unsqueeze(-2)
+            .unsqueeze(-2)
+            .to(dtype)
+        )
+        cos = self.cos_cached.to(positions.device)
+        sin = self.sin_cached.to(positions.device)
+        return cos, sin
+
+    def forward_native(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """PyTorch-native implementation equivalent to forward()."""
+        dtype = query.dtype
+        query_rot = query[..., : self.rotary_dim]
+        key_rot = key[..., : self.rotary_dim]
+        if self.rotary_dim < self.head_size:
+            query_pass = query[..., self.rotary_dim :]
+            key_pass = key[..., self.rotary_dim :]
+        cos_sin = self.cos_sin_cache[
+            torch.add(positions, offsets) if offsets is not None else positions
+        ]
+        cos, sin = cos_sin.chunk(2, dim=-1)
+        if self.is_neox_style:
+            cos = cos.repeat(1, 1, 2).unsqueeze(-2)
+            sin = sin.repeat(1, 1, 2).unsqueeze(-2)
+        else:
+            cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2)
+            sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2)
+        rotate_fn = rotate_neox if self.is_neox_style else rotate_gptj
+        query_rot = query_rot * cos + rotate_fn(query_rot) * sin
+        key_rot = key_rot * cos + rotate_fn(key_rot) * sin
+        if self.rotary_dim < self.head_size:
+            query = torch.cat((query_rot, query_pass), dim=-1)
+            key = torch.cat((key_rot, key_pass), dim=-1)
+        else:
+            query = query_rot
+            key = key_rot
+        return query.to(dtype), key.to(dtype)
+
+    def forward_npu(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        num_tokens, num_q_heads, _ = query.shape
+        num_k_heads = key.shape[1]
+        cos, sin = self.get_cos_sin_cache(positions, query.dtype, offsets)
+        query_rot = query[..., : self.rotary_dim]
+        key_rot = key[..., : self.rotary_dim]
+        if self.rotary_dim < self.head_size:
+            query_pass = query[..., self.rotary_dim :]
+            key_pass = key[..., self.rotary_dim :]
+        query_rot = torch_npu.npu_interleave_rope(
+            query_rot.reshape(num_tokens, num_q_heads, 1, self.rotary_dim),
+            cos,
+            sin,
+        )
+        key_rot = torch_npu.npu_interleave_rope(
+            key_rot.reshape(num_tokens, num_k_heads, 1, self.rotary_dim),
+            cos,
+            sin,
+        )
+        query_rot = query_rot.reshape(num_tokens, -1, self.rotary_dim)
+        key_rot = key_rot.reshape(num_tokens, -1, self.rotary_dim)
+        if self.rotary_dim < self.head_size:
+            query = torch.cat((query_rot, query_pass), dim=-1)
+            key = torch.cat((key_rot, key_pass), dim=-1)
+        else:
+            query = query_rot
+            key = key_rot
+        return query, key
+
+    def forward_cpu(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        positions = torch.add(positions, offsets) if offsets is not None else positions
+        if _is_cpu_amx_available:
+            return torch.ops.sgl_kernel.rotary_embedding_cpu(
+                positions, query, key, self.head_size, self.cos_sin_cache, False
+            )
+        else:
+            return self.forward_native(positions, query, key, offsets)
+
+
+class Llama3RotaryEmbedding(RotaryEmbedding):
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        dtype: torch.dtype,
+        scaling_factor: float,
+        low_freq_factor: float,
+        high_freq_factor: float,
+        orig_max_position: int,
+    ) -> None:
+        self.scaling_factor = scaling_factor
+        self.low_freq_factor = low_freq_factor
+        self.high_freq_factor = high_freq_factor
+        self.orig_max_position = orig_max_position
+        super().__init__(
+            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
+        )
+
+    def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
+        inv_freqs = super()._compute_inv_freq(base)
+        low_freq_wavelen = self.orig_max_position / self.low_freq_factor
+        high_freq_wavelen = self.orig_max_position / self.high_freq_factor
+        wave_len = 2 * math.pi / inv_freqs
+        if self.low_freq_factor != self.high_freq_factor:
+            smooth = (self.orig_max_position / wave_len - self.low_freq_factor) / (
+                self.high_freq_factor - self.low_freq_factor
+            )
+        else:
+            smooth = 0
+        new_freqs = torch.where(
+            wave_len < high_freq_wavelen,
+            inv_freqs,
+            torch.where(
+                wave_len > low_freq_wavelen,
+                inv_freqs / self.scaling_factor,
+                (1 - smooth) * inv_freqs / self.scaling_factor + smooth * inv_freqs,
+            ),
+        )
+        return new_freqs
+
+
+class Llama4VisionRotaryEmbedding(RotaryEmbedding):
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        dtype: torch.dtype,
+    ):
+        super().__init__(
+            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
+        )
+
+    def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
+        inv_freqs = super()._compute_inv_freq(base)
+        inv_freqs = inv_freqs[: (self.rotary_dim // 2)]
+        return inv_freqs
+
+    def _compute_cos_sin_cache(self) -> torch.Tensor:
+        inv_freq = self._compute_inv_freq(self.base)
+        num_patches = self.max_position_embeddings
+        img_idx = torch.arange(num_patches, dtype=torch.int32).reshape(num_patches, 1)
+        img_idx = torch.cat([img_idx, img_idx[:1]], dim=0)
+        img_idx[-1, -1] = -2  # set to ID_CLS_TOKEN
+        num_patches_single_dim = int(math.sqrt(num_patches))
+        frequencies_x = img_idx % num_patches_single_dim
+        frequencies_y = img_idx // num_patches_single_dim
+        freqs_x = (
+            (frequencies_x + 1)[..., None] * inv_freq[None, None, :]
+        ).repeat_interleave(2, dim=-1)
+        freqs_y = (
+            (frequencies_y + 1)[..., None] * inv_freq[None, None, :]
+        ).repeat_interleave(2, dim=-1)
+        freqs = torch.cat([freqs_x, freqs_y], dim=-1).float().contiguous()[..., ::2]
+        freqs = freqs.masked_fill(img_idx.reshape(-1, 1, 1) < 0, 0)
+        cache = torch.view_as_complex(
+            torch.stack([torch.cos(freqs), torch.sin(freqs)], dim=-1)
+        )
+        return cache
+
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        self.cos_sin_cache: torch.Tensor = self.cos_sin_cache.to(query.device)
+        query_ = torch.view_as_complex(query.float().reshape(*query.shape[:-1], -1, 2))
+        key_ = torch.view_as_complex(key.float().reshape(*key.shape[:-1], -1, 2))
+        broadcast_shape = [
+            d if i == 1 or i == (query_.ndim - 1) else 1
+            for i, d in enumerate(query_.shape)
+        ]
+        freqs_ci = self.cos_sin_cache.view(*broadcast_shape)
+        query_out = torch.view_as_real(query_ * freqs_ci).flatten(3)
+        key_out = torch.view_as_real(key_ * freqs_ci).flatten(3)
+        return query_out.type_as(query), key_out.type_as(key)
+
+
+class DynamicNTKAlphaRotaryEmbedding(RotaryEmbedding):
+    """RotaryEmbedding extended with Dynamic NTK scaling.
+
+    Credits to the Reddit users /u/bloc97 and /u/emozilla
+    """
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        scaling_alpha: float,
+        dtype: torch.dtype,
+    ) -> None:
+        self.scaling_alpha = scaling_alpha
+        super().__init__(
+            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
+        )
+
+    def _compute_cos_sin_cache(self) -> torch.Tensor:
+        max_len = self.max_position_embeddings
+        base = self.base * self.scaling_alpha ** (
+            self.rotary_dim / (self.rotary_dim - 2)
+        )
+        inv_freq = self._compute_inv_freq(base)
+        t = torch.arange(max_len, dtype=torch.float)
+        freqs = torch.einsum("i,j -> ij", t, inv_freq)
+        cos = freqs.cos()
+        sin = freqs.sin()
+        cache = torch.cat((cos, sin), dim=-1)
+        return cache
+
+
+class DualChunkRotaryEmbedding(MultiPlatformOp):
+    """Rotary positional embedding for Dual Chunk Attention."""
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        dtype: torch.dtype,
+        chunk_size: int,
+        local_size: int,
+    ) -> None:
+        super().__init__()
+        self.head_size = head_size
+        self.rotary_dim = rotary_dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        self.is_neox_style = is_neox_style
+        self.chunk_size = chunk_size
+        self.local_size = local_size
+        self.dtype = dtype
+        self.device = torch.device(f"cuda:{torch.cuda.current_device()}")
+        q_cache, qc_cache, k_cache, qc_no_clamp_cache, q_inter_cache = (
+            self._compute_cos_sin_cache()
+        )
+        self.register_buffer("cos_sin_q_cache", q_cache, persistent=False)
+        self.register_buffer("cos_sin_qc_cache", qc_cache, persistent=False)
+        self.register_buffer("cos_sin_k_cache", k_cache, persistent=False)
+        self.register_buffer(
+            "cos_sin_qc_no_clamp_cache", qc_no_clamp_cache, persistent=False
+        )
+        self.register_buffer("cos_sin_q_inter_cache", q_inter_cache, persistent=False)
+
+    def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
+        inv_freq = 1.0 / (
+            base
+            ** (
+                torch.arange(0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim
+            )
+        )
+        return inv_freq
+
+    def _compute_cos_sin_cache(self) -> torch.Tensor:
+        inv_freq = self._compute_inv_freq(self.base)
+        chunk_len = self.chunk_size - self.local_size
+        q_t = torch.arange(chunk_len, dtype=torch.float)
+        qc_t = (torch.arange(chunk_len, dtype=torch.float) + chunk_len).clamp(
+            max=self.chunk_size
+        )
+        k_t = torch.arange(self.max_position_embeddings, dtype=torch.float) % chunk_len
+        qc_no_clamp_t = torch.arange(chunk_len, dtype=torch.float) + chunk_len
+        q_inter_t = torch.arange(chunk_len, dtype=torch.float) + self.chunk_size
+
+        q_freqs = torch.outer(q_t, inv_freq)
+        qc_freqs = torch.outer(qc_t, inv_freq)
+        k_freqs = torch.outer(k_t, inv_freq)
+        qc_no_clamp_freqs = torch.outer(qc_no_clamp_t, inv_freq)
+        q_inter_freqs = torch.outer(q_inter_t, inv_freq)
+
+        q_cache = torch.cat((q_freqs.cos(), q_freqs.sin()), dim=-1).to(
+            dtype=self.dtype, device=self.device
+        )
+        qc_cache = torch.cat((qc_freqs.cos(), qc_freqs.sin()), dim=-1).to(
+            dtype=self.dtype, device=self.device
+        )
+        k_cache = torch.cat((k_freqs.cos(), k_freqs.sin()), dim=-1).to(
+            dtype=self.dtype, device=self.device
+        )
+        qc_no_clamp_cache = torch.cat(
+            (qc_no_clamp_freqs.cos(), qc_no_clamp_freqs.sin()), dim=-1
+        ).to(dtype=self.dtype, device=self.device)
+        q_inter_cache = torch.cat(
+            (q_inter_freqs.cos(), q_inter_freqs.sin()), dim=-1
+        ).to(dtype=self.dtype, device=self.device)
+        return q_cache, qc_cache, k_cache, qc_no_clamp_cache, q_inter_cache
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        query = query.view(*query.shape[:-1], -1, self.head_size)
+        key = key.view(*key.shape[:-1], -1, self.head_size)
+        query_rot = query[..., : self.rotary_dim]
+        key_rot = key[..., : self.rotary_dim]
+        if self.rotary_dim < self.head_size:
+            query_pass = query[..., self.rotary_dim :]
+            key_pass = key[..., self.rotary_dim :]
+        else:
+            query_pass = None
+            key_pass = None
+
+        positions_with_offsets = (
+            torch.add(positions, offsets) if offsets is not None else positions
+        )
+        key = self._apply_rotary_embedding(
+            self.cos_sin_k_cache[positions_with_offsets], key_rot, key_pass
+        )
+        chunk_len = self.chunk_size - self.local_size
+        query = self._apply_rotary_embedding(
+            self.cos_sin_q_cache[positions_with_offsets % chunk_len],
+            query_rot,
+            query_pass,
+        )
+        query_succ = self._apply_rotary_embedding(
+            self.cos_sin_qc_cache[positions_with_offsets % chunk_len],
+            query_rot,
+            query_pass,
+        )
+        query_inter = self._apply_rotary_embedding(
+            self.cos_sin_qc_cache[chunk_len - 1].repeat(positions.shape[0], 1),
+            query_rot,
+            query_pass,
+        )
+        query_succ_critical = self._apply_rotary_embedding(
+            self.cos_sin_qc_no_clamp_cache[positions_with_offsets % chunk_len],
+            query_rot,
+            query_pass,
+        )
+        query_inter_critical = self._apply_rotary_embedding(
+            self.cos_sin_q_inter_cache[positions_with_offsets % chunk_len],
+            query_rot,
+            query_pass,
+        )
+        query = torch.cat(
+            (query, query_succ, query_inter, query_succ_critical, query_inter_critical),
+            dim=-1,
+        )
+        return query, key
+
+    def _apply_rotary_embedding(self, cos_sin, hidden_rot, hidden_pass):
+        cos, sin = cos_sin.chunk(2, dim=-1)
+        if self.is_neox_style:
+            cos = cos.repeat(1, 1, 2).unsqueeze(-2)
+            sin = sin.repeat(1, 1, 2).unsqueeze(-2)
+        else:
+            cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2)
+            sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2)
+        rotate_fn = rotate_neox if self.is_neox_style else rotate_gptj
+        hidden_rot = hidden_rot * cos + rotate_fn(hidden_rot) * sin
+        if self.rotary_dim < self.head_size:
+            hidden = torch.cat((hidden_rot, hidden_pass), dim=-1)
+        else:
+            hidden = hidden_rot
+        return hidden.flatten(-2).squeeze(0)
+
+    def extra_repr(self) -> str:
+        s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}"
+        s += f", max_position_embeddings={self.max_position_embeddings}"
+        s += f", base={self.base}, is_neox_style={self.is_neox_style}"
+        s += f", chunk_size={self.chunk_size}, local_size={self.local_size}"
+        return s
+
+
+class DynamicNTKScalingRotaryEmbedding(RotaryEmbedding):
+    """RotaryEmbedding extended with Dynamic NTK scaling.
+
+    Credits to the Reddit users /u/bloc97 and /u/emozilla
+    """
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        scaling_factor: float,
+        dtype: torch.dtype,
+    ) -> None:
+        self.scaling_factor = scaling_factor
+        super().__init__(
+            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
+        )
+
+    def _compute_cos_sin_cache(self) -> torch.Tensor:
+        max_len = self.max_position_embeddings * self.scaling_factor
+        base = self.base * (
+            (self.scaling_factor * max_len / self.max_position_embeddings)
+            - (self.scaling_factor - 1)
+        ) ** (self.rotary_dim / (self.rotary_dim - 2))
+        inv_freq = self._compute_inv_freq(base)
+        t = torch.arange(max_len, dtype=torch.float)
+        freqs = torch.einsum("i,j -> ij", t, inv_freq)
+        cos = freqs.cos()
+        sin = freqs.sin()
+        cache = torch.cat((cos, sin), dim=-1)
+        return cache
diff --git a/sglang/python/sglang/srt/layers/rotary_embedding/triton_kernels.py b/sglang/python/sglang/srt/layers/rotary_embedding/triton_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a3d21bf83bbca2d6487310b75f92969f4e8d57d
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/rotary_embedding/triton_kernels.py
@@ -0,0 +1,260 @@
+"""Triton JIT kernels for multimodal rotary positional embeddings."""
+
+from __future__ import annotations
+
+from typing import List
+
+import torch
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def _triton_mrope_forward_fused(
+    q_ptr,
+    k_ptr,
+    cos_sin_cache_ptr,
+    positions_ptr,
+    q_stride,
+    k_stride,
+    positions_stride,
+    n_qh: tl.constexpr,
+    n_kh: tl.constexpr,
+    hd: tl.constexpr,
+    rd: tl.constexpr,
+    pad_n_qh: tl.constexpr,
+    pad_n_kh: tl.constexpr,
+    pad_hd: tl.constexpr,
+    mrope_section_t: tl.constexpr,
+    mrope_section_h: tl.constexpr,
+    mrope_section_w: tl.constexpr,
+    is_interleaved: tl.constexpr,
+    is_neox_style: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    q_ptr = q_ptr + pid * q_stride
+    k_ptr = k_ptr + pid * k_stride
+    half_rd = rd // 2
+    t = tl.load(positions_ptr + 0 * positions_stride + pid)
+    h = tl.load(positions_ptr + 1 * positions_stride + pid)
+    w = tl.load(positions_ptr + 2 * positions_stride + pid)
+    t_cos = cos_sin_cache_ptr + t * rd
+    h_cos = cos_sin_cache_ptr + h * rd
+    w_cos = cos_sin_cache_ptr + w * rd
+    t_sin = t_cos + half_rd
+    h_sin = h_cos + half_rd
+    w_sin = w_cos + half_rd
+    cos_offsets = tl.arange(0, pad_hd // 2)
+    if is_interleaved:
+        h_mask = ((cos_offsets % 3) == 1) & (cos_offsets <= 3 * mrope_section_h)
+        w_mask = ((cos_offsets % 3) == 2) & (cos_offsets <= 3 * mrope_section_w)
+        t_mask = ~(h_mask | w_mask)
+    else:
+        t_end = mrope_section_t
+        h_end = t_end + mrope_section_h
+        t_mask = cos_offsets < mrope_section_t
+        h_mask = (t_end <= cos_offsets) & (cos_offsets < h_end)
+        w_mask = (h_end <= cos_offsets) & (cos_offsets < half_rd)
+    t_cos_row = tl.load(t_cos + cos_offsets, mask=t_mask, other=0)
+    t_sin_row = tl.load(t_sin + cos_offsets, mask=t_mask, other=0)
+    h_cos_row = tl.load(h_cos + cos_offsets, mask=h_mask, other=0)
+    h_sin_row = tl.load(h_sin + cos_offsets, mask=h_mask, other=0)
+    w_cos_row = tl.load(w_cos + cos_offsets, mask=w_mask, other=0)
+    w_sin_row = tl.load(w_sin + cos_offsets, mask=w_mask, other=0)
+    cos_row = t_cos_row + h_cos_row + w_cos_row
+    sin_row = t_sin_row + h_sin_row + w_sin_row
+    if is_neox_style:
+        fhq = tl.arange(0, pad_n_qh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
+        fhk = tl.arange(0, pad_n_kh)[:, None] * hd + tl.arange(0, pad_hd // 2)[None, :]
+        fqm = (tl.arange(0, pad_n_qh)[:, None] < n_qh) & (
+            tl.arange(0, pad_hd // 2)[None, :] < rd // 2
+        )
+        fkm = (tl.arange(0, pad_n_kh)[:, None] < n_kh) & (
+            tl.arange(0, pad_hd // 2)[None, :] < rd // 2
+        )
+        q1 = tl.load(q_ptr + fhq, mask=fqm, other=0).to(sin_row.dtype)
+        k1 = tl.load(k_ptr + fhk, mask=fkm, other=0).to(sin_row.dtype)
+        shq = fhq + (rd // 2)
+        shk = fhk + (rd // 2)
+        q2 = tl.load(q_ptr + shq, mask=fqm, other=0).to(sin_row.dtype)
+        k2 = tl.load(k_ptr + shk, mask=fkm, other=0).to(sin_row.dtype)
+        tl.store(q_ptr + fhq, q1 * cos_row - q2 * sin_row, mask=fqm)
+        tl.store(q_ptr + shq, q2 * cos_row + q1 * sin_row, mask=fqm)
+        tl.store(k_ptr + fhk, k1 * cos_row - k2 * sin_row, mask=fkm)
+        tl.store(k_ptr + shk, k2 * cos_row + k1 * sin_row, mask=fkm)
+    else:
+        bq = tl.arange(0, pad_n_qh)[:, None] * hd
+        bk = tl.arange(0, pad_n_kh)[:, None] * hd
+        ei = 2 * tl.arange(0, pad_hd // 2)[None, :]
+        oi = ei + 1
+        im = tl.arange(0, pad_hd // 2)[None, :] < (rd // 2)
+        qm = (tl.arange(0, pad_n_qh)[:, None] < n_qh) & im
+        km = (tl.arange(0, pad_n_kh)[:, None] < n_kh) & im
+        qe = tl.load(q_ptr + bq + ei, mask=qm, other=0).to(sin_row.dtype)
+        qo = tl.load(q_ptr + bq + oi, mask=qm, other=0).to(sin_row.dtype)
+        ke = tl.load(k_ptr + bk + ei, mask=km, other=0).to(sin_row.dtype)
+        ko = tl.load(k_ptr + bk + oi, mask=km, other=0).to(sin_row.dtype)
+        tl.store(q_ptr + bq + ei, qe * cos_row - qo * sin_row, mask=qm)
+        tl.store(q_ptr + bq + oi, qo * cos_row + qe * sin_row, mask=qm)
+        tl.store(k_ptr + bk + ei, ke * cos_row - ko * sin_row, mask=km)
+        tl.store(k_ptr + bk + oi, ko * cos_row + ke * sin_row, mask=km)
+
+
+def triton_mrope_fused(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos_sin_cache: torch.Tensor,
+    positions: torch.Tensor,
+    mrope_section: List[int],
+    head_size: int,
+    rotary_dim: int,
+    mrope_interleaved: bool,
+    is_neox_style: bool,
+) -> None:
+    num_tokens, n_q_dim = q.shape
+    n_k_dim = k.shape[1]
+    n_qh = n_q_dim // head_size
+    n_kh = n_k_dim // head_size
+    pad_n_qh = triton.next_power_of_2(n_qh)
+    pad_n_kh = triton.next_power_of_2(n_kh)
+    pad_hd = triton.next_power_of_2(head_size)
+    _triton_mrope_forward_fused[(num_tokens,)](
+        q,
+        k,
+        cos_sin_cache,
+        positions,
+        q.stride(0),
+        k.stride(0),
+        positions.stride(0),
+        n_qh,
+        n_kh,
+        head_size,
+        rotary_dim,
+        pad_n_qh,
+        pad_n_kh,
+        pad_hd,
+        mrope_section[0],
+        mrope_section[1],
+        mrope_section[2],
+        mrope_interleaved,
+        is_neox_style,
+    )
+
+
+@triton.jit
+def _triton_ernie45_rope_qk_fused(
+    q_ptr,
+    k_ptr,
+    cos_sin_cache_ptr,
+    positions_ptr,
+    q_stride0: tl.constexpr,
+    k_stride0: tl.constexpr,
+    pos_stride0: tl.constexpr,
+    n_qh: tl.constexpr,
+    n_kh: tl.constexpr,
+    hd: tl.constexpr,
+    rd: tl.constexpr,
+    pad_n_qh: tl.constexpr,
+    pad_n_kh: tl.constexpr,
+    pad_hd: tl.constexpr,
+    section_hw: tl.constexpr,
+    is_neox_style: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    q_ptr = q_ptr + pid * q_stride0
+    k_ptr = k_ptr + pid * k_stride0
+    half_rd = rd // 2
+    tpos = tl.load(positions_ptr + 0 * pos_stride0 + pid).to(tl.int32)
+    hpos = tl.load(positions_ptr + 1 * pos_stride0 + pid).to(tl.int32)
+    wpos = tl.load(positions_ptr + 2 * pos_stride0 + pid).to(tl.int32)
+    ridx = tl.arange(0, pad_hd // 2)
+    rmask = ridx < half_rd
+    use_hw = ridx < section_hw
+    use_h = (ridx & 1) == 0
+    pos = tl.where(use_hw, tl.where(use_h, hpos, wpos), tpos)
+    cos = tl.load(cos_sin_cache_ptr + pos * rd + ridx, mask=rmask, other=0.0)
+    sin = tl.load(
+        cos_sin_cache_ptr + pos * rd + (ridx + half_rd), mask=rmask, other=0.0
+    )
+    if is_neox_style:
+        qh = tl.arange(0, pad_n_qh)[:, None]
+        kh = tl.arange(0, pad_n_kh)[:, None]
+        d = tl.arange(0, pad_hd // 2)[None, :]
+        qm = (qh < n_qh) & (d < half_rd)
+        km = (kh < n_kh) & (d < half_rd)
+        qo0 = qh * hd + d
+        ko0 = kh * hd + d
+        qo1 = qo0 + half_rd
+        ko1 = ko0 + half_rd
+        q0 = tl.load(q_ptr + qo0, mask=qm, other=0.0).to(cos.dtype)
+        q1 = tl.load(q_ptr + qo1, mask=qm, other=0.0).to(cos.dtype)
+        k0 = tl.load(k_ptr + ko0, mask=km, other=0.0).to(cos.dtype)
+        k1 = tl.load(k_ptr + ko1, mask=km, other=0.0).to(cos.dtype)
+        cb = cos[None, :]
+        sb = sin[None, :]
+        tl.store(q_ptr + qo0, q0 * cb - q1 * sb, mask=qm)
+        tl.store(q_ptr + qo1, q1 * cb + q0 * sb, mask=qm)
+        tl.store(k_ptr + ko0, k0 * cb - k1 * sb, mask=km)
+        tl.store(k_ptr + ko1, k1 * cb + k0 * sb, mask=km)
+    else:
+        qh = tl.arange(0, pad_n_qh)[:, None]
+        kh = tl.arange(0, pad_n_kh)[:, None]
+        p = tl.arange(0, pad_hd // 2)[None, :]
+        qm = (qh < n_qh) & (p < half_rd)
+        km = (kh < n_kh) & (p < half_rd)
+        even = 2 * p
+        odd = even + 1
+        qe = tl.load(q_ptr + qh * hd + even, mask=qm, other=0.0).to(cos.dtype)
+        qo = tl.load(q_ptr + qh * hd + odd, mask=qm, other=0.0).to(cos.dtype)
+        ke = tl.load(k_ptr + kh * hd + even, mask=km, other=0.0).to(cos.dtype)
+        ko = tl.load(k_ptr + kh * hd + odd, mask=km, other=0.0).to(cos.dtype)
+        cb = cos[None, :]
+        sb = sin[None, :]
+        tl.store(q_ptr + qh * hd + even, qe * cb - qo * sb, mask=qm)
+        tl.store(q_ptr + qh * hd + odd, qo * cb + qe * sb, mask=qm)
+        tl.store(k_ptr + kh * hd + even, ke * cb - ko * sb, mask=km)
+        tl.store(k_ptr + kh * hd + odd, ko * cb + ke * sb, mask=km)
+
+
+def triton_ernie45_rope_fused_inplace(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos_sin_cache: torch.Tensor,
+    positions: torch.Tensor,
+    mrope_section: list,
+    head_size: int,
+    rotary_dim: int,
+    is_neox_style: bool,
+) -> None:
+    num_tokens = q.shape[0]
+    n_qh = q.shape[1] // head_size
+    n_kh = k.shape[1] // head_size
+    rd = rotary_dim
+    section_h, section_w, section_t = mrope_section
+    assert section_h == section_w, "Ernie4.5 layout assumes section_h == section_w"
+    assert section_h + section_w + section_t == rd // 2
+    if cos_sin_cache.dtype != q.dtype or cos_sin_cache.device != q.device:
+        cos_sin_cache = cos_sin_cache.to(device=q.device, dtype=q.dtype)
+    pad_n_qh = triton.next_power_of_2(n_qh)
+    pad_n_kh = triton.next_power_of_2(n_kh)
+    pad_hd = triton.next_power_of_2(head_size)
+    num_warps = 4 if (pad_n_qh * pad_hd) <= 8192 else 8
+    _triton_ernie45_rope_qk_fused[(num_tokens,)](
+        q,
+        k,
+        cos_sin_cache,
+        positions,
+        q.stride(0),
+        k.stride(0),
+        positions.stride(0),
+        n_qh=n_qh,
+        n_kh=n_kh,
+        hd=head_size,
+        rd=rd,
+        pad_n_qh=pad_n_qh,
+        pad_n_kh=pad_n_kh,
+        pad_hd=pad_hd,
+        section_hw=section_h + section_w,
+        is_neox_style=is_neox_style,
+        num_warps=num_warps,
+    )
diff --git a/sglang/python/sglang/srt/layers/rotary_embedding/utils.py b/sglang/python/sglang/srt/layers/rotary_embedding/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d9defabe1f05af5e032185535a1cd64b2f4ced9
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/rotary_embedding/utils.py
@@ -0,0 +1,132 @@
+"""Primitive rotary embedding ops: _rotate_neox, _rotate_gptj, _apply_rotary_emb,
+apply_rotary_pos_emb variants."""
+
+from __future__ import annotations
+
+from typing import Tuple
+
+import torch
+
+from sglang.srt.utils import get_compiler_backend, is_npu
+
+_is_npu = is_npu()
+
+if _is_npu:
+    import torch_npu
+
+    NPU_ROTARY_MUL_MAX_NUM_HEADS = 1000
+    NPU_ROTARY_MUL_MAX_HEAD_SIZE = 896
+
+
+def rotate_neox(x: torch.Tensor) -> torch.Tensor:
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def rotate_gptj(x: torch.Tensor) -> torch.Tensor:
+    x1 = x[..., ::2]
+    x2 = x[..., 1::2]
+    x = torch.stack((-x2, x1), dim=-1)
+    return x.flatten(-2)
+
+
+def apply_rotary_emb(
+    x: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    is_neox_style: bool,
+) -> torch.Tensor:
+    """
+    Args:
+        x: [num_tokens, num_heads, head_size]
+        cos: [num_tokens, head_size // 2]
+        sin: [num_tokens, head_size // 2]
+        is_neox_style: Whether to use the Neox-style or GPT-J-style rotary
+            positional embeddings.
+    """
+    cos = cos.unsqueeze(-2).to(x.dtype)
+    sin = sin.unsqueeze(-2).to(x.dtype)
+    if is_neox_style:
+        x1, x2 = torch.chunk(x, 2, dim=-1)
+    else:
+        x1 = x[..., ::2]
+        x2 = x[..., 1::2]
+    o1 = x1 * cos - x2 * sin
+    o2 = x2 * cos + x1 * sin
+    if is_neox_style:
+        return torch.cat((o1, o2), dim=-1)
+    else:
+        return torch.stack((o1, o2), dim=-1).flatten(-2)
+
+
+# Copied from transformers
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+@torch.compile(dynamic=True, backend=get_compiler_backend())
+def apply_rotary_pos_emb_native(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim=1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    orig_q_dtype = q.dtype
+    orig_k_dtype = k.dtype
+    q, k = q.float(), k.float()
+
+    # embedding is performed in float
+    cos = cos.unsqueeze(unsqueeze_dim).float()
+    sin = sin.unsqueeze(unsqueeze_dim).float()
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+
+    q_embed = q_embed.to(orig_q_dtype)
+    k_embed = k_embed.to(orig_k_dtype)
+
+    return q_embed, k_embed
+
+
+def apply_rotary_pos_emb_npu(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    unsqueeze_dim=1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Ascend implementation equivalent to apply_rotary_pos_emb_native.
+
+    Args:
+        q: [num_tokens, num_heads, head_size]
+        k: [num_tokens, num_kv_heads, head_size]
+        cos: [num_tokens, head_size]
+        sin: [num_tokens, head_size]
+    """
+    if (
+        cos.dim() != 2
+        or q.dim() != 3
+        or q.shape[1] >= NPU_ROTARY_MUL_MAX_NUM_HEADS
+        or q.shape[2] >= NPU_ROTARY_MUL_MAX_HEAD_SIZE
+    ):
+        # Note: num_heads and head_size of q must be less than 1000 and 896, respectively
+        return apply_rotary_pos_emb_native(q, k, cos, sin, unsqueeze_dim)
+    cos = cos.unsqueeze(unsqueeze_dim).unsqueeze(0)
+    sin = sin.unsqueeze(unsqueeze_dim).unsqueeze(0)
+    q = q.unsqueeze(0)
+    k = k.unsqueeze(0)
+    q_embed = torch_npu.npu_rotary_mul(q, cos, sin)
+    k_embed = torch_npu.npu_rotary_mul(k, cos, sin)
+    q_embed = q_embed.squeeze(0)
+    k_embed = k_embed.squeeze(0)
+    return q_embed, k_embed
+
+
+if _is_npu:
+    apply_rotary_pos_emb = apply_rotary_pos_emb_npu
+else:
+    apply_rotary_pos_emb = apply_rotary_pos_emb_native
diff --git a/sglang/python/sglang/srt/layers/rotary_embedding/yarn.py b/sglang/python/sglang/srt/layers/rotary_embedding/yarn.py
new file mode 100644
index 0000000000000000000000000000000000000000..61f648ac061f767f4466532870908740900cee5e
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/rotary_embedding/yarn.py
@@ -0,0 +1,134 @@
+"""YaRNScalingRotaryEmbedding + YaRN helper functions."""
+
+from __future__ import annotations
+
+import math
+from typing import Tuple
+
+import torch
+
+from sglang.srt.layers.rotary_embedding.base import RotaryEmbedding
+
+
+# Inverse dim formula to find dim based on number of rotations
+def yarn_find_correction_dim(
+    num_rotations: int,
+    dim: int,
+    base: float = 10000,
+    max_position_embeddings: int = 2048,
+) -> float:
+    return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (
+        2 * math.log(base)
+    )
+
+
+# Find dim range bounds based on rotations
+def yarn_find_correction_range(
+    low_rot: int,
+    high_rot: int,
+    dim: int,
+    base: float = 10000,
+    max_position_embeddings: int = 2048,
+    truncate: bool = True,
+) -> Tuple[int, int]:
+    low = yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings)
+    high = yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings)
+    if truncate:
+        low = math.floor(low)
+        high = math.ceil(high)
+    return max(low, 0), min(high, dim - 1)  # Clamp values just in case
+
+
+def yarn_linear_ramp_mask(
+    low: float, high: float, dim: int, dtype: torch.dtype, device: torch.device = None
+) -> torch.Tensor:
+    if low == high:
+        high += 0.001  # Prevent singularity
+
+    linear_func = (torch.arange(dim, dtype=dtype, device=device) - low) / (high - low)
+    ramp_func = torch.clamp(linear_func, 0, 1)
+    return ramp_func
+
+
+def yarn_get_mscale_simple(scale: float = 1) -> float:
+    if scale <= 1:
+        return 1.0
+    return 0.1 * math.log(scale) + 1.0
+
+
+def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
+    if scale <= 1:
+        return 1.0
+    return 0.1 * mscale * math.log(scale) + 1.0
+
+
+class YaRNScalingRotaryEmbedding(RotaryEmbedding):
+    """RotaryEmbedding extended with YaRN method.
+
+    Credits to Peng et al. github.com/jquesnelle/yarn
+    """
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        scaling_factor: float,
+        dtype: torch.dtype,
+        *,
+        extrapolation_factor: float = 1,
+        attn_factor: float = 1,
+        beta_fast: int = 32,
+        beta_slow: int = 1,
+        truncate: bool = True,
+    ) -> None:
+        self.scaling_factor = scaling_factor
+        self.extrapolation_factor = extrapolation_factor
+        self.attn_factor = attn_factor
+        self.beta_fast = beta_fast
+        self.beta_slow = beta_slow
+        self.truncate = truncate
+        # Get n-d magnitude scaling corrected for interpolation
+        self.mscale = float(yarn_get_mscale_simple(self.scaling_factor) * attn_factor)
+        super().__init__(
+            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
+        )
+
+    def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor:
+        pos_freqs = self.base ** (
+            torch.arange(0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim
+        )
+        inv_freq_extrapolation = 1.0 / pos_freqs
+        inv_freq_interpolation = 1.0 / (scaling_factor * pos_freqs)
+
+        low, high = yarn_find_correction_range(
+            self.beta_fast,
+            self.beta_slow,
+            self.rotary_dim,
+            self.base,
+            self.max_position_embeddings,
+            self.truncate,
+        )
+        # Get n-d rotational scaling corrected for extrapolation
+        inv_freq_mask = (
+            1
+            - yarn_linear_ramp_mask(low, high, self.rotary_dim // 2, dtype=torch.float)
+        ) * self.extrapolation_factor
+        inv_freq = (
+            inv_freq_interpolation * (1 - inv_freq_mask)
+            + inv_freq_extrapolation * inv_freq_mask
+        )
+        return inv_freq
+
+    def _compute_cos_sin_cache(self) -> torch.Tensor:
+        inv_freq = self._compute_inv_freq(self.scaling_factor)
+        t = torch.arange(
+            self.max_position_embeddings * self.scaling_factor, dtype=torch.float32
+        )
+        freqs = torch.einsum("i,j -> ij", t, inv_freq)
+        cos = freqs.cos() * self.mscale
+        sin = freqs.sin() * self.mscale
+        cache = torch.cat((cos, sin), dim=-1)
+        return cache
diff --git a/sglang/python/sglang/srt/layers/sampler.py b/sglang/python/sglang/srt/layers/sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4c7c7db037ea019e1d04b1c3967c482738e9cc7
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/sampler.py
@@ -0,0 +1,748 @@
+import logging
+from typing import Callable, Dict, List, Optional, Tuple
+
+import torch
+import torch.distributed as dist
+from torch import nn
+
+from sglang.srt.distributed import get_tp_group
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_group,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.layers.utils.hash import murmur_hash32
+from sglang.srt.layers.utils.logprob import get_token_ids_logprobs, get_top_logprobs
+from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
+from sglang.srt.sampling.sampling_params import TOP_K_ALL
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils.common import crash_on_warnings, get_bool_env_var, is_cuda, is_npu
+
+if is_cuda():
+    from flashinfer.sampling import (
+        min_p_sampling_from_probs,
+        top_k_top_p_sampling_from_probs,
+    )
+    from sgl_kernel import (
+        top_k_renorm_prob,
+        top_p_renorm_prob,
+    )
+if is_npu():
+    import torch_npu
+
+logger = logging.getLogger(__name__)
+
+SYNC_TOKEN_IDS_ACROSS_TP = get_bool_env_var("SYNC_TOKEN_IDS_ACROSS_TP")
+SGLANG_RETURN_ORIGINAL_LOGPROB = get_bool_env_var("SGLANG_RETURN_ORIGINAL_LOGPROB")
+_CUSTOM_SAMPLER_FACTORIES: Dict[str, Callable[[], "Sampler"]] = {}
+_BUILT_IN_SAMPLING_BACKENDS = {"flashinfer", "pytorch", "ascend"}
+
+
+class Sampler(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.use_nan_detection = get_global_server_args().enable_nan_detection
+        self.tp_sync_group = get_tp_group().device_group
+        if is_dp_attention_enabled():
+            self.tp_sync_group = get_attention_tp_group().device_group
+
+        self.rl_on_policy_target = get_global_server_args().rl_on_policy_target
+        # In RL on-policy mode, deterministic inference is automatically enabled.
+        self.enable_deterministic = (
+            get_global_server_args().enable_deterministic_inference
+        )
+        # In RL on-policy mode, we use log_softmax to compute logprobs to match the trainer.
+        self.use_log_softmax_logprob = self.rl_on_policy_target is not None
+        self.use_ascend_backend = get_global_server_args().sampling_backend == "ascend"
+
+    def _preprocess_logits(
+        self, logits: torch.Tensor, sampling_info: SamplingBatchInfo
+    ) -> torch.Tensor:
+        """Apply custom logit processors and handle NaN detection."""
+        # Apply the custom logit processors if registered in the sampling info
+        if sampling_info.has_custom_logit_processor:
+            apply_custom_logit_processor(logits, sampling_info)
+
+        # Detect and handle NaN values in logits
+        if self.use_nan_detection and torch.any(torch.isnan(logits)):
+            logger.warning("Detected errors during sampling! NaN in the logits.")
+            logits = torch.where(
+                torch.isnan(logits), torch.full_like(logits, -1e5), logits
+            )
+            if crash_on_warnings():
+                raise ValueError("Detected errors during sampling! NaN in the logits.")
+
+        return logits
+
+    def forward(
+        self,
+        logits_output: LogitsProcessorOutput,
+        sampling_info: SamplingBatchInfo,
+        return_logprob: bool,
+        top_logprobs_nums: List[int],
+        token_ids_logprobs: List[List[int]],
+        positions: torch.Tensor,
+    ):
+        """Run a sampler & compute logprobs and update logits_output accordingly.
+
+        Args:
+            logits_output: The logits from the model forward
+            sampling_info: Metadata for sampling
+            return_logprob: If set, store the output logprob information to
+                logits_output
+            top_logprobs_nums: Number of top lobprobs per sequence in a batch
+            token_ids_logprobs: Per-sequence list of specific token IDs to retrieve
+                logprobs for. Each element is a list of token IDs (or None) for one
+                sequence in the batch. This is used in speculative decoding.
+            positions: The positions of the tokens in the sequence. Used for deterministic sampling
+                to get the unique seed for each position.
+        """
+        logits = logits_output.next_token_logits
+
+        # Preprocess logits (custom processors and NaN handling)
+        logits = self._preprocess_logits(logits, sampling_info)
+
+        if sampling_info.is_all_greedy:
+            # Use torch.argmax if all requests use greedy sampling
+            batch_next_token_ids = torch.argmax(logits, -1)
+            if return_logprob:
+                original_logprobs = logprobs = torch.nn.functional.log_softmax(
+                    logits, dim=-1
+                )
+        else:
+            simple_sampling_case = (
+                not sampling_info.need_top_p_sampling
+                and not sampling_info.need_top_k_sampling
+                and not sampling_info.need_min_p_sampling
+            )
+
+            # If requested, cache original logprobs before temperature scaling.
+            if return_logprob and SGLANG_RETURN_ORIGINAL_LOGPROB:
+                original_logprobs = torch.log_softmax(logits, dim=-1)
+
+            # In RL on-policy mode, we use log_softmax to compute logprobs to match the trainer.
+            logprobs_via_logsoftmax_kernel = None
+            if self.rl_on_policy_target is not None:
+                # TODO: use more inplace ops to save memory
+                logits_div_temperature = (
+                    logits.bfloat16().div(sampling_info.temperatures).bfloat16()
+                )
+                logprobs_via_logsoftmax_kernel = torch.log_softmax(
+                    logits_div_temperature, dim=-1
+                )
+                del logits_div_temperature
+
+            if self.use_ascend_backend:
+                # Ascend backend: sample from logits directly.
+                batch_next_token_ids, logprobs = self._forward_ascend_backend(
+                    logits, sampling_info, simple_sampling_case, return_logprob
+                )
+            elif (
+                self.use_log_softmax_logprob
+                and self.enable_deterministic
+                and simple_sampling_case
+            ):
+                # RL on-policy path: sample from logprobs to match the trainer.
+                batch_next_token_ids = self._sample_from_logprobs(
+                    logprobs_via_logsoftmax_kernel,
+                    sampling_info,
+                    positions,
+                )
+                if return_logprob and not SGLANG_RETURN_ORIGINAL_LOGPROB:
+                    logprobs = logprobs_via_logsoftmax_kernel
+            else:
+                # Standard path: do softmax and sample from probs.
+                logits.div_(sampling_info.temperatures)
+
+                # In-place op to save memory
+                logits[:] = torch.softmax(logits, dim=-1)
+                probs = logits
+
+                batch_next_token_ids = self._sample_from_probs(
+                    probs, sampling_info, positions, simple_sampling_case
+                )
+                if return_logprob and not SGLANG_RETURN_ORIGINAL_LOGPROB:
+                    logprobs = (
+                        logprobs_via_logsoftmax_kernel
+                        if logprobs_via_logsoftmax_kernel is not None
+                        else torch.log(probs)
+                    )
+                del probs
+
+        # Attach logprobs to logits_output (in-place modification)
+        if return_logprob:
+            if SGLANG_RETURN_ORIGINAL_LOGPROB:
+                logprobs = original_logprobs
+            self._attach_logprobs_to_output(
+                logits_output,
+                logprobs,
+                top_logprobs_nums,
+                token_ids_logprobs,
+                sampling_info,
+                batch_next_token_ids,
+            )
+
+        self._sync_token_ids_across_tp(batch_next_token_ids, sampling_info)
+
+        return batch_next_token_ids
+
+    def _sample_from_probs(
+        self,
+        probs: torch.Tensor,
+        sampling_info: SamplingBatchInfo,
+        positions: torch.Tensor,
+        simple_sampling_case: bool,
+    ) -> torch.Tensor:
+        """Sample from probability distribution (after softmax).
+
+        Used for standard sampling with flashinfer/pytorch backends.
+        Handles both simple (direct multinomial) and complex (top-k/top-p/min-p) cases.
+        """
+        if simple_sampling_case:
+            batch_next_token_ids = sampling_from_probs_torch(
+                probs,
+                sampling_seed=sampling_info.sampling_seed,
+                positions=positions,
+            )
+        else:
+            backend = get_global_server_args().sampling_backend
+            if backend == "flashinfer":
+                assert (
+                    sampling_info.sampling_seed is None
+                ), "Sampling seed is not supported for flashinfer backend"
+                if sampling_info.need_min_p_sampling:
+                    probs = top_k_renorm_prob(probs, sampling_info.top_ks)
+                    probs = top_p_renorm_prob(probs, sampling_info.top_ps)
+                    batch_next_token_ids = min_p_sampling_from_probs(
+                        probs, sampling_info.min_ps
+                    )
+                else:
+                    batch_next_token_ids = top_k_top_p_sampling_from_probs(
+                        probs.contiguous(),
+                        sampling_info.top_ks,
+                        sampling_info.top_ps,
+                        filter_apply_order="joint",
+                        check_nan=self.use_nan_detection,
+                    )
+            elif backend == "pytorch":
+                # A slower fallback implementation with torch native operations.
+                batch_next_token_ids = top_k_top_p_min_p_sampling_from_probs_torch(
+                    probs,
+                    sampling_info.top_ks,
+                    sampling_info.top_ps,
+                    sampling_info.min_ps,
+                    sampling_info.need_min_p_sampling,
+                    sampling_info.sampling_seed,
+                    positions,
+                )
+            else:
+                raise ValueError(f"Invalid sampling backend: {backend}")
+        return batch_next_token_ids
+
+    def _sample_from_logprobs(
+        self,
+        logprobs: torch.Tensor,
+        sampling_info: SamplingBatchInfo,
+        positions: torch.Tensor,
+    ) -> torch.Tensor:
+        """Sample from log-probabilities using the Gumbel trick.
+
+        Used for deterministic sampling with simple cases (no top-k/top-p/min-p).
+        Requires sampling_seed to be set in sampling_info.
+        """
+        assert (
+            sampling_info.sampling_seed is not None
+        ), "sampling_seed is required for sampling from logprobs"
+        sampled_index = multinomial_with_seed(
+            logprobs, sampling_info.sampling_seed, positions
+        )
+        return sampled_index.view(-1).to(torch.int32)
+
+    def _sample_from_logits(
+        self,
+        logits: torch.Tensor,
+        sampling_info: SamplingBatchInfo,
+        simple_sampling_case: bool,
+    ) -> torch.Tensor:
+        """Sample from temperature-scaled logits without softmax.
+
+        Used for the Ascend NPU backend which handles softmax internally.
+        """
+        if simple_sampling_case:
+            probs = torch.softmax(logits, dim=-1)
+            batch_next_token_ids = torch.multinomial(probs, num_samples=1).view(-1)
+            return batch_next_token_ids.to(torch.int32)
+        else:
+            assert (
+                self.use_ascend_backend
+            ), "Only ascend backend supports sampling from logits"
+            batch_next_token_ids = top_k_top_p_min_p_sampling_from_logits_ascend(
+                logits,
+                sampling_info.top_ks,
+                sampling_info.top_ps,
+                sampling_info.min_ps,
+                sampling_info.need_min_p_sampling,
+            )
+            return batch_next_token_ids.to(torch.int32)
+
+    def _forward_ascend_backend(
+        self,
+        logits: torch.Tensor,
+        sampling_info: SamplingBatchInfo,
+        simple_sampling_case: bool,
+        return_logprob: bool,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Handle the full Ascend backend sampling path.
+
+        Ascend backend has fused kernels that handle softmax internally,
+        so we sample directly from temperature-scaled logits.
+
+        Returns:
+            A tuple of (batch_next_token_ids, logprobs). logprobs is None
+            when return_logprob is False or SGLANG_RETURN_ORIGINAL_LOGPROB is set.
+        """
+        logits.div_(sampling_info.temperatures)
+        batch_next_token_ids = self._sample_from_logits(
+            logits, sampling_info, simple_sampling_case
+        )
+        logprobs = None
+        if return_logprob and not SGLANG_RETURN_ORIGINAL_LOGPROB:
+            logprobs = torch.log_softmax(logits, dim=-1)
+        return batch_next_token_ids, logprobs
+
+    def _attach_logprobs_to_output(
+        self,
+        logits_output: LogitsProcessorOutput,
+        logprobs: torch.Tensor,
+        top_logprobs_nums: List[int],
+        token_ids_logprobs: List[List[int]],
+        sampling_info: SamplingBatchInfo,
+        batch_next_token_ids: torch.Tensor,
+    ):
+        # clamp to avoid -inf values
+        logprobs.clamp_(min=torch.finfo(logprobs.dtype).min)
+
+        # Attach logprobs to logits_output (in-place modification)
+        if any(x > 0 for x in top_logprobs_nums):
+            (
+                logits_output.next_token_top_logprobs_val,
+                logits_output.next_token_top_logprobs_idx,
+            ) = get_top_logprobs(logprobs, top_logprobs_nums)
+
+        if any(x is not None for x in token_ids_logprobs):
+            (
+                logits_output.next_token_token_ids_logprobs_val,
+                logits_output.next_token_token_ids_logprobs_idx,
+            ) = get_token_ids_logprobs(logprobs, token_ids_logprobs)
+
+        logits_output.next_token_logprobs = logprobs[
+            torch.arange(len(batch_next_token_ids), device=sampling_info.device),
+            batch_next_token_ids,
+        ]
+
+    def _sync_token_ids_across_tp(
+        self, batch_next_token_ids: torch.Tensor, sampling_info: SamplingBatchInfo
+    ):
+        if SYNC_TOKEN_IDS_ACROSS_TP or sampling_info.grammars:
+            # For performance reasons, SGLang does not sync the final token IDs across TP ranks by default.
+            # This saves one all-reduce, but the correctness of this approach depends on the determinism of several operators:
+            # the last all-reduce, the last lm_head matmul, and all sampling kernels.
+            # These kernels are deterministic in most cases, but there are some rare instances where they are not deterministic.
+            # In such cases, enable this env variable to prevent hanging due to TP ranks becoming desynchronized.
+            # When using xgrammar, this becomes more likely so we also do the sync when grammar is used.
+
+            torch.distributed.all_reduce(
+                batch_next_token_ids,
+                op=dist.ReduceOp.MIN,
+                group=self.tp_sync_group,
+            )
+
+    def compute_logprobs_only(
+        self,
+        logits_output: LogitsProcessorOutput,
+        sampling_info: SamplingBatchInfo,
+        return_logprob: bool,
+        top_logprobs_nums: List[int],
+        token_ids_logprobs: List[List[int]],
+    ) -> None:
+        """
+        Compute logprobs for requested token IDs without performing sampling.
+
+        Optimized for prefill-only scoring requests that need token probabilities
+        but don't require next token generation.
+        """
+
+        if logits_output.next_token_logits is None:
+            logger.warning("No logits available for logprob computation")
+            return
+
+        # Check if any requests actually need logprobs computation
+        needs_token_ids_logprobs = any(
+            token_ids is not None and len(token_ids) > 0
+            for token_ids in token_ids_logprobs
+        )
+        needs_top_logprobs = any(x > 0 for x in top_logprobs_nums)
+
+        if not (needs_token_ids_logprobs or needs_top_logprobs):
+            return
+
+        # Preprocess logits (custom processors and NaN handling)
+        logits = self._preprocess_logits(logits_output.next_token_logits, sampling_info)
+
+        # Compute logprobs
+        logprobs = torch.nn.functional.log_softmax(logits, dim=-1)
+
+        # Handle top logprobs if requested
+        if needs_top_logprobs:
+            (
+                logits_output.next_token_top_logprobs_val,
+                logits_output.next_token_top_logprobs_idx,
+            ) = get_top_logprobs(logprobs, top_logprobs_nums)
+
+        # Handle token_ids logprobs if requested
+        if needs_token_ids_logprobs:
+            (
+                logits_output.next_token_token_ids_logprobs_val,
+                logits_output.next_token_token_ids_logprobs_idx,
+            ) = get_token_ids_logprobs_batch_optimized(logprobs, token_ids_logprobs)
+
+
+def register_sampler_backend(backend: str, factory: Callable[[], "Sampler"]) -> None:
+    """Register a custom sampler factory for a backend string."""
+
+    if not backend:
+        raise ValueError("backend must be a non-empty string")
+
+    from sglang.srt.server_args import SAMPLING_BACKEND_CHOICES
+
+    if backend in _CUSTOM_SAMPLER_FACTORIES:
+        logger.warning("Overriding existing sampler factory for backend '%s'", backend)
+    SAMPLING_BACKEND_CHOICES.add(backend)
+    _CUSTOM_SAMPLER_FACTORIES[backend] = factory
+
+
+def create_sampler(backend: Optional[str] = None) -> "Sampler":
+    """Create a sampler honoring custom backend registrations."""
+
+    server_args = get_global_server_args()
+    backend = backend or (server_args.sampling_backend if server_args else None)
+
+    if backend in _CUSTOM_SAMPLER_FACTORIES:
+        sampler = _CUSTOM_SAMPLER_FACTORIES[backend]()
+        if not isinstance(sampler, Sampler):
+            raise TypeError(
+                f"Custom sampler factory for backend '{backend}' must return a Sampler"
+            )
+        return sampler
+
+    if backend is None or backend in _BUILT_IN_SAMPLING_BACKENDS:
+        return Sampler()
+
+    raise ValueError(
+        f"Unknown sampling backend '{backend}'. Register it via register_sampler_backend()."
+    )
+
+
+def top_k_top_p_min_p_sampling_from_probs_torch(
+    probs: torch.Tensor,
+    top_ks: torch.Tensor,
+    top_ps: torch.Tensor,
+    min_ps: torch.Tensor,
+    need_min_p_sampling: bool,
+    sampling_seed: Optional[torch.Tensor],
+    positions: torch.Tensor,
+):
+    """
+    A top-k, top-p and min-p sampling implementation with native pytorch operations.
+    When sampling_seed is not None, deterministic inference will be enabled, it will sample
+    with the sampling_seed of each request.
+    """
+    probs_sort, probs_idx = probs.sort(dim=-1, descending=True)
+    probs_sum = torch.cumsum(probs_sort, dim=-1)
+    probs_sort[
+        torch.arange(0, probs.shape[-1], device=probs.device).view(1, -1)
+        >= top_ks.view(-1, 1)
+    ] = 0.0
+    probs_sort[(probs_sum - probs_sort) > top_ps.view(-1, 1)] = 0.0
+
+    if need_min_p_sampling:
+        # TODO: probs_sort should be re-normalized for the use of multinomial_with_seed
+        assert (
+            sampling_seed is None
+        ), "With sampling seed, multinomial_with_seed will provide wrong results"
+        min_p_thresholds = probs_sort[:, 0] * min_ps
+        probs_sort[probs_sort < min_p_thresholds.view(-1, 1)] = 0.0
+
+    if sampling_seed is None:
+        sampled_index = torch.multinomial(probs_sort, num_samples=1)
+    else:
+        # NOTE: when using top-k/top-p/min-p sampling, we need to modify probs before we
+        # apply log to get logprobs. Therefore, we cannot use log_softmax directly.
+        # For now, we use log to the modified probs to get logprobs, but for numerical
+        # stability, we'd better come up with a solution to use log_softmax.
+        logprobs = probs_sort.to(torch.float64)  # Using float64 for numerical stability
+        del probs_sort
+        logprobs.log_()
+        sampled_index = multinomial_with_seed(logprobs, sampling_seed, positions)
+
+    # int32 range is enough to represent the token ids
+    probs_idx = probs_idx.to(torch.int32)
+    batch_next_token_ids = torch.gather(probs_idx, dim=1, index=sampled_index).view(-1)
+    return batch_next_token_ids
+
+
+def top_k_top_p_min_p_sampling_from_logits_ascend(
+    logits: torch.Tensor,
+    top_ks: torch.Tensor,
+    top_ps: torch.Tensor,
+    min_ps: torch.Tensor,
+    need_min_p_sampling: bool,
+):
+    """A top-k, top-p and min-p sampling implementation for ascend npu with torch_npu interface.
+
+    Takes temperature-scaled logits as input (softmax is applied internally).
+    """
+    # torch_npu.npu_top_k_top_p requires top_k value range in [1, 1024]
+    if hasattr(torch_npu, "npu_top_k_top_p") and torch.all(
+        (top_ks <= 1024) & (top_ks >= 1)
+    ):
+        logits_top_k_top_p = torch_npu.npu_top_k_top_p(logits, top_ps, top_ks)
+        probs_top_k_top_p = logits_top_k_top_p.softmax(dim=-1)
+
+        if need_min_p_sampling:
+            min_p_thresholds = probs_top_k_top_p.max(dim=-1) * min_ps
+            min_p_mask = probs_top_k_top_p < min_p_thresholds.view(-1, 1)
+            probs_top_k_top_p.masked_fill_(min_p_mask, 0.0)
+
+        batch_next_token_ids = torch.multinomial(probs_top_k_top_p, num_samples=1)
+    else:
+        probs = torch.softmax(logits, dim=-1)
+        probs_sort, probs_idx = probs.sort(dim=-1, descending=True)
+
+        # when top_k is -1 (in which sglang turns it to TOP_K_ALL), make it explicitly equal to logit's size
+        topk_all_mask = top_ks == TOP_K_ALL
+        top_ks.masked_fill_(topk_all_mask, probs.shape[1])
+        top_k_mask = torch.arange(0, probs.shape[-1], device=probs.device).view(
+            1, -1
+        ) >= top_ks.view(-1, 1)
+        probs_sort.masked_fill_(top_k_mask, 0.0)
+
+        probs_sum = torch.cumsum(probs_sort, dim=-1)
+        top_p_mask = probs_sum - probs_sort > top_ps.view(-1, 1)
+        probs_sort.masked_fill_(top_p_mask, 0.0)
+
+        if need_min_p_sampling:
+            min_p_thresholds = probs_sort[:, 0] * min_ps
+            min_p_mask = probs_sort < min_p_thresholds.view(-1, 1)
+            probs_sort.masked_fill_(min_p_mask, 0.0)
+
+        sampled_index = torch.multinomial(probs_sort, num_samples=1)
+        probs_idx = probs_idx.to(torch.int32)
+        batch_next_token_ids = torch.gather(probs_idx, dim=1, index=sampled_index)
+
+    return batch_next_token_ids.view(-1)
+
+
+@torch.compile(dynamic=True)
+def multinomial_with_seed(
+    logprobs: torch.Tensor, seed: torch.Tensor, positions: torch.Tensor
+) -> torch.Tensor:
+    """
+    Samples n elements from an input tensor `inputs` of shape (n, m) using
+    a unique random seed for each row. This is a deterministic batched alternative to
+    `torch.multinomial`.
+
+    Args:
+        inputs: A float tensor of shape (n, m) representing n categorical
+                distributions with m categories each. The values are treated
+                as weights and do not need to sum to 1.
+        seed:   An integer tensor of shape (n,) containing the random seed
+                for each corresponding row in `inputs`.
+        positions: The positions of the tokens in the sequence. Used for deterministic sampling
+                to get the unique seed for each position.
+
+    Returns:
+        A tensor of shape (n,) where the i-th element is an index sampled
+        from the distribution in `inputs[i]` using `seed[i]`.
+    """
+    n, m = logprobs.shape
+    seed = seed.to(torch.uint64)
+    col_indices = torch.arange(m, device=logprobs.device)
+    hashed = murmur_hash32(seed, positions, col_indices)
+
+    # NOTE (sehoon): it is critical to keep gumbel noise calculation in float64 to avoid numerical instability.
+    # keeping logprobs in float64 is less critical, but we found it's still safer to keep it in float64.
+    x = hashed.to(torch.float64) / torch.iinfo(torch.uint32).max
+
+    # x is a uniform sample in [0, 1]. get gumbel noise from it.
+    # which is equivalent to -log(-log(x))
+    # keep everything in in-place operations to avoid unnecessary memory allocations.
+    x.log_().clamp_(min=torch.finfo(x.dtype).min).neg_()  # -log(x)
+    x.log_().neg_()  # -log(-log(x)) == gumbel noise
+
+    # add gumbel noise to logprobs
+    x.add_(logprobs.to(torch.float64))
+
+    return torch.argmax(x, dim=1, keepdim=True)
+
+
+def sampling_from_probs_torch(
+    probs: torch.Tensor,
+    sampling_seed: Optional[torch.Tensor] = None,
+    positions: Optional[torch.Tensor] = None,
+):
+    """A sampling implementation with native pytorch operations, without
+    top-k, top-p, or min-p filtering.
+
+    Note: For deterministic sampling from logprobs, use Sampler._sample_from_logprobs instead.
+    """
+    if sampling_seed is None:
+        sampled_index = torch.multinomial(probs, num_samples=1)
+    else:
+        # Deterministic sampling: convert probs to logprobs and use gumbel trick
+        sampled_index = multinomial_with_seed(
+            torch.log(probs), sampling_seed, positions
+        )
+    batch_next_token_ids = sampled_index.view(-1).to(torch.int32)
+    return batch_next_token_ids
+
+
+def top_p_normalize_probs_torch(
+    probs: torch.Tensor,
+    top_ps: torch.Tensor,
+):
+    # See also top_k_top_p_min_p_sampling_from_probs_torch
+    probs_sort, probs_idx = probs.sort(dim=-1, descending=True)
+    probs_sum = torch.cumsum(probs_sort, dim=-1)
+    probs_sort[(probs_sum - probs_sort) > top_ps.view(-1, 1)] = 0.0
+    probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
+    return torch.zeros_like(probs_sort).scatter_(-1, probs_idx, probs_sort)
+
+
+def get_token_ids_logprobs_batch_optimized(
+    logprobs: torch.Tensor,
+    token_ids_logprobs: List[List[int]],
+) -> Tuple[List, List]:
+    """
+    Vectorized batch processing for token ID logprobs extraction.
+
+    Uses a single GPU kernel call for the entire batch instead of multiple
+    separate calls, significantly improving performance for large batches.
+
+    Args:
+        logprobs: Log probabilities tensor [batch_size, vocab_size]
+        token_ids_logprobs: List of token IDs to extract logprobs for
+
+    Example:
+        # Input: batch_size=3, vocab_size=5
+        logprobs = torch.tensor([
+            [-1.2, -2.1, -0.8, -3.0, -1.5],  # batch 0
+            [-0.5, -1.8, -2.2, -1.1, -2.7],  # batch 1
+            [-2.0, -0.9, -1.4, -2.8, -1.6],  # batch 2
+        ])
+        token_ids_logprobs = [[1, 3], [2], [0, 2, 4]]
+
+        # Output:
+        # values = [tensor([-2.1, -3.0]), tensor([-2.2]), tensor([-2.0, -1.4, -1.6])]
+        # indices = [[1, 3], [2], [0, 2, 4]]
+    """
+    batch_size = len(token_ids_logprobs)
+    device = logprobs.device
+
+    # Step 1: Calculate lengths for each request, treating None as empty list
+    # Example: [[1, 3], [2], [0, 2, 4]] -> token_lengths = tensor([2, 1, 3])
+    token_lengths = torch.tensor(
+        [len(token_ids or []) for token_ids in token_ids_logprobs], device=device
+    )
+    total_tokens = int(token_lengths.sum().item())  # 2 + 1 + 3 = 6
+
+    # Handle edge case where no tokens are requested
+    if total_tokens == 0:
+        return [logprobs.new_empty(0) for _ in token_ids_logprobs], [
+            [] for _ in token_ids_logprobs
+        ]
+
+    # Step 2: Build flattened indices using torch operations
+    # Example: row_indices = [0, 0, 1, 2, 2, 2] (batch indices repeated by their lengths)
+    row_indices = torch.repeat_interleave(
+        torch.arange(batch_size, device=device), token_lengths
+    )
+    # Example: col_indices = [1, 3, 2, 0, 2, 4] (flattened token IDs from all requests)
+    col_indices = torch.tensor(
+        [
+            token_id
+            for token_ids in token_ids_logprobs
+            for token_id in (token_ids or [])
+        ],
+        device=device,
+        dtype=torch.long,
+    )
+
+    # Step 3: Single vectorized gather operation
+    # Example: logprobs[row_indices, col_indices] -> [-2.1, -3.0, -2.2, -2.0, -1.4, -1.6]
+    gathered_logprobs = logprobs[row_indices, col_indices]
+
+    # Step 4: Split results back per request using torch operations
+    # Example: split tensor [6] into chunks of sizes [2, 1, 3] -> [tensor(2), tensor(1), tensor(3)]
+    split_logprobs = torch.split_with_sizes(
+        gathered_logprobs, token_lengths.tolist(), dim=0
+    )
+
+    # Step 5: Format output to match expected return structure
+    # Example: Convert split tensors back to list format with proper empty handling
+    # i=0: [1,3] -> append split_logprobs[0] and [1,3]
+    # i=1: [2] -> append split_logprobs[1] and [2]
+    # i=2: [0,2,4] -> append split_logprobs[2] and [0,2,4]
+    output_token_ids_logprobs_val = []
+    output_token_ids_logprobs_idx = []
+
+    for i, token_ids in enumerate(token_ids_logprobs):
+        if token_ids is not None and len(token_ids) > 0:
+            output_token_ids_logprobs_val.append(split_logprobs[i])
+            output_token_ids_logprobs_idx.append(token_ids)
+        else:
+            output_token_ids_logprobs_val.append(logprobs.new_empty(0))
+            output_token_ids_logprobs_idx.append([])
+
+    return output_token_ids_logprobs_val, output_token_ids_logprobs_idx
+
+
+def apply_custom_logit_processor(
+    logits: torch.Tensor,
+    sampling_batch_info: SamplingBatchInfo,
+    num_tokens_in_batch: int = 1,
+):
+    """Apply custom logit processors to the logits.
+    This function will modify the logits in-place.
+    num_tokens_in_batch is needed to support spec decoding, where each batch can contain multiple
+    tokens. By default, we assume each batch contains only 1 token.
+    """
+
+    assert logits.shape[0] == len(sampling_batch_info) * num_tokens_in_batch, (
+        f"The batch size of logits ({logits.shape[0]}) does not match the batch size of "
+        f"sampling_batch_info ({len(sampling_batch_info)}) x num_tokens_in_batch "
+        f"({num_tokens_in_batch})"
+    )
+
+    for _, (
+        processor,
+        batch_mask,
+    ) in sampling_batch_info.custom_logit_processor.items():
+        # Get the batch indices that need to be processed
+        batch_indices = batch_mask.nonzero(as_tuple=True)[0]
+
+        assert batch_mask.shape[0] == len(sampling_batch_info), (
+            f"The number of batch mask ({batch_mask.shape[0]}) does not match the number of "
+            f"sampling_batch_info ({len(sampling_batch_info)})"
+        )
+        batch_mask = torch.repeat_interleave(batch_mask, num_tokens_in_batch)
+
+        # Apply the processor to the logits
+        logits[batch_mask] = processor(
+            logits[batch_mask],
+            [sampling_batch_info.custom_params[i] for i in batch_indices],
+        )
+
+        logger.debug(
+            f"Custom logit processor {processor.__class__.__name__} is applied."
+        )
diff --git a/sglang/python/sglang/srt/layers/sparse_pooler.py b/sglang/python/sglang/srt/layers/sparse_pooler.py
new file mode 100644
index 0000000000000000000000000000000000000000..331b23c942bb9af3d3880c233b4c25a2657597b2
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/sparse_pooler.py
@@ -0,0 +1,98 @@
+from dataclasses import dataclass
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PretrainedConfig
+
+from sglang.srt.model_executor.model_runner import ForwardBatch
+
+
+@dataclass
+class SparseEmbeddingOutput:
+    embeddings: torch.Tensor  # [batch_size, vocab_size]
+
+
+class SparsePooler(nn.Module):
+    """A layer that pools hidden states into sparse vocabulary-space embeddings.
+
+    This layer does the following:
+    1. Applies a linear transformation + ReLU to get token-level weights
+    2. Maps these weights to vocabulary positions using token IDs
+    3. Aggregates weights for repeated tokens using max pooling
+    4. Returns sparse embeddings in vocabulary space
+
+    Attributes:
+        config: Model configuration containing vocab_size and hidden_size
+        sparse_linear: Linear layer for computing token weights
+        vocab_size: Size of vocabulary for output embeddings
+    """
+
+    def __init__(self, config: PretrainedConfig):
+        super().__init__()
+
+        # Validate required attributes
+        if not hasattr(config, "vocab_size"):
+            raise AttributeError(
+                f"Config {type(config)} missing required 'vocab_size' attribute"
+            )
+        if not hasattr(config, "hidden_size"):
+            raise AttributeError(
+                f"Config {type(config)} missing required 'hidden_size' attribute"
+            )
+
+        self.vocab_size = config.vocab_size
+        self.sparse_linear = nn.Linear(config.hidden_size, 1)
+        self._weights_loaded = False
+
+    def forward(
+        self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
+    ) -> SparseEmbeddingOutput:
+        """
+        Forward pass for sparse pooling.
+
+        Args:
+            hidden_states: Packed sequence hidden states [total_tokens, hidden_size]
+            forward_batch: Batch information with sequence lengths and input_ids
+
+        Returns:
+            SparseEmbeddingOutput with embeddings of shape [batch_size, vocab_size]
+        """
+        if not self._weights_loaded:
+            raise ValueError(
+                "Sparse pooling weights not loaded. Call load_weights() first"
+            )
+
+        # Apply sparse linear + ReLU to get token weights
+        token_weights = F.relu(self.sparse_linear(hidden_states)).squeeze(
+            -1
+        )  # [total_tokens]
+
+        # Create batch indices for packed sequences
+        batch_indices = torch.repeat_interleave(
+            torch.arange(
+                len(forward_batch.extend_seq_lens), device=hidden_states.device
+            ),
+            forward_batch.extend_seq_lens,
+        )
+
+        # Initialize sparse embedding output
+        sparse_embedding = torch.zeros(
+            len(forward_batch.extend_seq_lens),
+            self.vocab_size,
+            dtype=token_weights.dtype,
+            device=token_weights.device,
+        )
+
+        # Map to vocabulary space using scatter_reduce with amax
+        flat_indices = batch_indices * self.vocab_size + forward_batch.input_ids
+        sparse_embedding.view(-1).scatter_reduce_(
+            0, flat_indices, token_weights, reduce="amax"
+        )
+
+        return SparseEmbeddingOutput(embeddings=sparse_embedding)
+
+    def load_weights(self, state_dict: dict):
+        """Load weights from state dict (called by the model)."""
+        self.sparse_linear.load_state_dict(state_dict)
+        self._weights_loaded = True
diff --git a/sglang/python/sglang/srt/layers/torchao_utils.py b/sglang/python/sglang/srt/layers/torchao_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..4276e122aa359663d3b7f3f608994c1f6b8408a3
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/torchao_utils.py
@@ -0,0 +1,95 @@
+"""
+Common utilities for torchao.
+"""
+
+import logging
+from typing import Callable, Optional
+
+import torch
+
+logger = logging.getLogger(__name__)
+
+
+def proj_filter(
+    module: torch.nn.Module,
+    fqn: str,
+):
+    """Filter function for quantizing projection layers."""
+    return "proj" in fqn
+
+
+# TODO: implement a more general filter function
+def proj_filter_conv3d(
+    module: torch.nn.Module,
+    fqn: str,
+):
+    if isinstance(module, torch.nn.Conv3d):
+        logger.warning(f"Quantize: skipping {fqn} because it's a Conv3d")
+        return False
+    return "proj" in fqn
+
+
+def apply_torchao_config_to_model(
+    model: torch.nn.Module,
+    torchao_config: str,
+    filter_fn: Optional[Callable] = proj_filter,
+):
+    """Quantize a modelwith torchao quantization specified by torchao_config
+
+    Args:
+       `model`: a model to be quantized based on torchao_config
+       `torchao_config` (str): type of quantization and their arguments we want to use to
+        quantize the model, e.g. int4wo-128 means int4 weight only quantization with group_size
+        128
+    """
+    if torchao_config == "" or torchao_config is None:
+        return model
+
+    # Lazy import to suppress some warnings
+    from torchao.quantization import (
+        float8_dynamic_activation_float8_weight,
+        float8_weight_only,
+        int4_weight_only,
+        int8_dynamic_activation_int8_weight,
+        int8_weight_only,
+        quantize_,
+    )
+    from torchao.quantization.observer import PerRow, PerTensor
+
+    if "int8wo" in torchao_config:
+        quantize_(model, int8_weight_only(), filter_fn=proj_filter_conv3d)
+    elif "int8dq" in torchao_config:
+        quantize_(model, int8_dynamic_activation_int8_weight(), filter_fn=filter_fn)
+    elif "int4wo" in torchao_config:
+        group_size = int(torchao_config.split("-")[-1])
+        assert group_size in [
+            32,
+            64,
+            128,
+            256,
+        ], f"int4wo groupsize needs to be one of [32, 64, 128, 256] but got {group_size}"
+        quantize_(model, int4_weight_only(group_size=group_size), filter_fn=filter_fn)
+    elif "fp8wo" in torchao_config:
+        # this requires newer hardware
+        # [rank0]: AssertionError: fp8e4nv data type is not supported on CUDA arch < 89
+        quantize_(model, float8_weight_only(), filter_fn=proj_filter_conv3d)
+    elif "fp8dq" in torchao_config:
+        granularity = torchao_config.split("-")[-1]
+        GRANULARITY_MAP = {
+            "per_row": PerRow(),
+            "per_tensor": PerTensor(),
+        }
+        assert (
+            granularity in GRANULARITY_MAP
+        ), f"Supported granularity are: {GRANULARITY_MAP.keys()}, got {granularity}"
+        quantize_(
+            model,
+            float8_dynamic_activation_float8_weight(
+                granularity=GRANULARITY_MAP[granularity]
+            ),
+            filter_fn=proj_filter_conv3d,
+        )
+    else:
+        raise ValueError(f"Unexpected config: {torchao_config}")
+
+    return model
diff --git a/sglang/python/sglang/srt/layers/utils/__init__.py b/sglang/python/sglang/srt/layers/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3101d534ed235fbd0f0bc7ddcf693450bc1dc04
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/utils/__init__.py
@@ -0,0 +1,3 @@
+# Temp workaround, make layer utils more fine-grained later
+from sglang.srt.layers.utils.common import *
+from sglang.srt.layers.utils.multi_platform import MultiPlatformOp
diff --git a/sglang/python/sglang/srt/layers/utils/__pycache__/__init__.cpython-311.pyc b/sglang/python/sglang/srt/layers/utils/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5a3d2eb1951301b0ca4374a68f689bedc8dc471b
Binary files /dev/null and b/sglang/python/sglang/srt/layers/utils/__pycache__/__init__.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/utils/__pycache__/common.cpython-311.pyc b/sglang/python/sglang/srt/layers/utils/__pycache__/common.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5554a505c415ebef8c9db274cae30c7687da20f7
Binary files /dev/null and b/sglang/python/sglang/srt/layers/utils/__pycache__/common.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/utils/__pycache__/hash.cpython-311.pyc b/sglang/python/sglang/srt/layers/utils/__pycache__/hash.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9224f5fcbdd91cd36e69f95750e9bc8bf2839d13
Binary files /dev/null and b/sglang/python/sglang/srt/layers/utils/__pycache__/hash.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/utils/__pycache__/logprob.cpython-311.pyc b/sglang/python/sglang/srt/layers/utils/__pycache__/logprob.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3e96ad13592e1da7ece64970b26c2118ff9f7822
Binary files /dev/null and b/sglang/python/sglang/srt/layers/utils/__pycache__/logprob.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/utils/__pycache__/multi_platform.cpython-311.pyc b/sglang/python/sglang/srt/layers/utils/__pycache__/multi_platform.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ab595377f1b9559148d6aa67bbfe6b775aba5aae
Binary files /dev/null and b/sglang/python/sglang/srt/layers/utils/__pycache__/multi_platform.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/layers/utils/common.py b/sglang/python/sglang/srt/layers/utils/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..63e32409bf540eb82faeda14500927c2268df602
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/utils/common.py
@@ -0,0 +1,91 @@
+import logging
+import re
+
+import torch
+from torch.nn.parameter import Parameter
+
+logger = logging.getLogger(__name__)
+
+
+def get_layer_id(weight_name):
+    # example weight name: model.layers.10.self_attn.qkv_proj.weight
+    match = re.search(r"layers\.(\d+)\.", weight_name)
+    if match:
+        return int(match.group(1))
+    return None
+
+
+def pad_or_narrow_weight(
+    loaded_weight: torch.Tensor, input_dim: int, start_idx: int, shard_size: int
+) -> torch.Tensor:
+    # Padding with zeros for special case such as qwen2_5_VL's mlp which is not 8-aligned
+    valid_size = max(loaded_weight.shape[input_dim] - start_idx, 0)
+
+    if valid_size > 0:
+        loaded_slice = loaded_weight.narrow(input_dim, start_idx, valid_size)
+        pad_shape = list(loaded_weight.shape)
+        pad_shape[input_dim] = shard_size - valid_size
+        pad = torch.zeros(
+            pad_shape, dtype=loaded_weight.dtype, device=loaded_weight.device
+        )
+        return torch.cat([loaded_slice, pad], dim=input_dim)
+
+    # All padding
+    pad_shape = list(loaded_weight.shape)
+    pad_shape[input_dim] = shard_size
+    return torch.zeros(
+        pad_shape, dtype=loaded_weight.dtype, device=loaded_weight.device
+    )
+
+
+def copy_or_rebind_param(
+    module: torch.nn.Module, name: str, new_value: torch.Tensor
+) -> None:
+    """Keep parameter identities stable for CUDA graph reuse and hot reload."""
+    new_value = new_value.detach()
+    param = getattr(module, name, None)
+    if isinstance(param, Parameter):
+        if param.data.shape == new_value.shape and param.data.dtype == new_value.dtype:
+            param.data.copy_(new_value)
+        else:
+            param.data = new_value
+        param.requires_grad_(False)
+    else:
+        setattr(module, name, Parameter(new_value, requires_grad=False))
+
+
+class PPMissingLayer(torch.nn.Identity):
+    # Adapted from
+    # https://github.com/vllm-project/vllm/blob/18ed3132d2bfe1df9a74729457b69243955221e8/vllm/model_executor/models/utils.py#L468C1-L486C1
+    """
+    A placeholder layer for missing layers in a pipeline parallel model.
+
+    Attribute access on this placeholder returns 'self' so that common
+    patterns like ``isinstance(layer.mlp, SomeMoE)`` safely evaluate to
+    ``False`` instead of raising ``AttributeError``.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__()
+        self.return_tuple = kwargs.get("return_tuple", False)
+
+    def __getattr__(self, name: str):
+        try:
+            return super().__getattr__(name)
+        except (AttributeError, KeyError):
+            return self
+
+    def __setattr__(self, name: str, value):
+        if name == "return_tuple" or name.startswith("_"):
+            super().__setattr__(name, value)
+        else:
+            pass
+
+    def forward(self, *args, **kwargs):
+        """
+        Return the first arg from args or the first value from kwargs.
+
+        Wraps the input in a tuple if `self.return_tuple` is True.
+        """
+        input = args[0] if args else next(iter(kwargs.values()))
+        return (input,) if self.return_tuple else input
diff --git a/sglang/python/sglang/srt/layers/utils/hash.py b/sglang/python/sglang/srt/layers/utils/hash.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a090a4cab6c3c2bb8cf33371af7334af50c5ad7
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/utils/hash.py
@@ -0,0 +1,121 @@
+import torch
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def rotl32(x, r: tl.constexpr) -> tl.uint32:
+    """
+    rotate left 32-bit integer x by r bits
+    e.g. x = 01110001, r = 2 -> 11000101
+    """
+    x = x.to(tl.uint64)
+    return ((x << r) | (x >> (32 - r))) & 0xFFFFFFFF
+
+
+@triton.jit
+def fmix32(h: tl.uint32) -> tl.uint32:
+    """
+    final mix of 32-bit hash value for MurmurHash
+    """
+    h ^= h >> 16
+    h = (h * 0x85EBCA6B) & 0xFFFFFFFF
+    h ^= h >> 13
+    h = (h * 0xC2B2AE35) & 0xFFFFFFFF
+    h ^= h >> 16
+    return h
+
+
+@triton.jit
+def murmur3_mix(h: tl.uint32, k: tl.uint32) -> tl.uint32:
+    """
+    Mixes a 32-bit key into the hash state.
+    """
+    c1: tl.uint32 = 0xCC9E2D51
+    c2: tl.uint32 = 0x1B873593
+    r1: tl.constexpr = 15
+    r2: tl.constexpr = 13
+    mm: tl.uint32 = 5
+    nn: tl.uint32 = 0xE6546B64
+
+    k = (k * c1) & 0xFFFFFFFF
+    k = rotl32(k, r1)
+    k = (k * c2) & 0xFFFFFFFF
+    h ^= k
+    h = rotl32(h, r2)
+    h = (h * mm + nn) & 0xFFFFFFFF
+    return h
+
+
+@triton.jit
+def murmur_hash32_kernel(
+    seed_ptr,
+    positions_ptr,
+    col_indices_ptr,
+    output_ptr,
+    num_rows,
+    num_cols,
+    BLOCK_SIZE: tl.constexpr,
+):
+    """
+    MurmurHash 32-bit implementation for Triton.
+    Reference:
+    - https://medium.com/@thealonemusk/murmurhash-the-scrappy-algorithm-that-secretly-powers-half-the-internet-2d3f79b4509b
+    - https://en.wikipedia.org/wiki/MurmurHash
+
+    We treat 64-bit seed, 32-bit position, and 32-bit col_index as 4 4-byte blocks, and bit-blend them together.
+    """
+    pid_row = tl.program_id(0)
+    pid_col = tl.program_id(1)
+
+    row_idx = pid_row
+    col_offsets = pid_col * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    mask = col_offsets < num_cols
+
+    # Load inputs
+    seed = tl.load(seed_ptr + row_idx).to(tl.uint64)
+    pos = tl.load(positions_ptr + row_idx).to(tl.uint32)
+    col = tl.load(col_indices_ptr + col_offsets, mask=mask, other=0).to(tl.uint32)
+
+    h: tl.uint32 = 0  # hash accumulator
+
+    # Process seed_low
+    k: tl.uint32 = (seed & 0xFFFFFFFF).to(tl.uint32)
+    h = murmur3_mix(h, k)
+
+    # Process seed_high
+    k = ((seed >> 32) & 0xFFFFFFFF).to(tl.uint32)
+    h = murmur3_mix(h, k)
+
+    # Process position block starting from seed32
+    h = murmur3_mix(h, pos)
+
+    # Process col block
+    h = murmur3_mix(h, col)
+
+    # Finalize (len=16 for seed + pos + col)
+    h ^= 16
+    h = fmix32(h)
+
+    # Store result as uint32
+    tl.store(output_ptr + row_idx * num_cols + col_offsets, h, mask=mask)
+
+
+def murmur_hash32(seed, positions, col_indices):
+    assert (
+        seed.shape == positions.shape
+    ), "Seed and positions must have the same shape (n,)"
+    assert (
+        len(seed.shape) == 1 and len(col_indices.shape) == 1
+    ), f"Inputs must be 1D tensors {seed.shape=} {col_indices.shape=}"
+    n = seed.shape[0]
+    m = col_indices.shape[0]
+    device = seed.device
+    hashed = torch.empty((n, m), dtype=torch.uint32, device=device)
+
+    BLOCK_SIZE = 1024
+    grid = (n, triton.cdiv(m, BLOCK_SIZE))
+    murmur_hash32_kernel[grid](
+        seed, positions, col_indices, hashed, n, m, BLOCK_SIZE=BLOCK_SIZE
+    )
+    return hashed
diff --git a/sglang/python/sglang/srt/layers/utils/logprob.py b/sglang/python/sglang/srt/layers/utils/logprob.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f84c15ffb098c1b2bb2e7f6c79dff1fbcb3f4ea
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/utils/logprob.py
@@ -0,0 +1,407 @@
+from __future__ import annotations
+
+import dataclasses
+from enum import Enum, auto
+from typing import TYPE_CHECKING, List, Optional, Union
+
+import torch
+
+from sglang.srt.environ import envs
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.logits_processor import LogitsMetadata, LogitsProcessorOutput
+    from sglang.srt.managers.schedule_batch import ScheduleBatch
+    from sglang.srt.speculative.eagle_info import EagleVerifyOutput
+    from sglang.srt.speculative.ngram_info import NgramVerifyInput
+
+
+class LogprobStage(Enum):
+    PREFILL = auto()
+    DECODE = auto()
+
+
+@dataclasses.dataclass
+class InputLogprobsResult:
+    input_token_logprobs: torch.Tensor
+    input_top_logprobs_val: Optional[List] = None
+    input_top_logprobs_idx: Optional[List] = None
+    input_token_ids_logprobs_val: Optional[List] = None
+    input_token_ids_logprobs_idx: Optional[List] = None
+
+
+def compute_temp_top_p_normalized_logprobs(
+    last_logits: torch.Tensor,
+    logits_metadata: LogitsMetadata,
+    top_p: Optional[torch.Tensor] = None,
+    temperature: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    """
+    compute logprobs for the output token from the given logits.
+
+    Returns:
+        torch.Tensor: logprobs from logits
+    """
+    if top_p is None:
+        top_p = logits_metadata.top_p
+    if temperature is None:
+        temperature = logits_metadata.temperature
+
+    # Scale logits if temperature scaling is enabled
+    if logits_metadata.temp_scaled_logprobs:
+        last_logits = last_logits / temperature
+
+    # Normalize logprobs if top_p normalization is enabled
+    # NOTE: only normalize logprobs when top_p is set and not equal to 1.0
+    if logits_metadata.top_p_normalized_logprobs and (top_p != 1.0).any():
+        from sglang.srt.layers.sampler import top_p_normalize_probs_torch
+
+        probs = torch.softmax(last_logits, dim=-1)
+        del last_logits
+        probs = top_p_normalize_probs_torch(probs, top_p)
+        return torch.log(probs)
+    else:
+        return torch.nn.functional.log_softmax(last_logits, dim=-1)
+
+
+def get_top_logprobs_raw(
+    logprobs: torch.Tensor,
+    top_logprobs_nums: List[int],
+    stage: LogprobStage,
+    extend_logprob_pruned_lens_cpu: Optional[List[int]] = None,
+):
+    max_k = max(top_logprobs_nums)
+    values, indices = logprobs.topk(max_k, dim=-1)
+    values = values.tolist()
+    indices = indices.tolist()
+
+    top_logprobs_val = []
+    top_logprobs_idx = []
+
+    if stage == LogprobStage.DECODE:
+        for i, k in enumerate(top_logprobs_nums):
+            top_logprobs_val.append(values[i][:k])
+            top_logprobs_idx.append(indices[i][:k])
+    else:
+        pt = 0
+        for k, pruned_len in zip(top_logprobs_nums, extend_logprob_pruned_lens_cpu):
+            if pruned_len <= 0:
+                top_logprobs_val.append([])
+                top_logprobs_idx.append([])
+                continue
+
+            top_logprobs_val.append([values[pt + j][:k] for j in range(pruned_len)])
+            top_logprobs_idx.append([indices[pt + j][:k] for j in range(pruned_len)])
+            pt += pruned_len
+
+    return top_logprobs_val, top_logprobs_idx
+
+
+def get_top_logprobs_prefill(
+    all_logprobs: torch.Tensor, logits_metadata: LogitsMetadata
+):
+    return get_top_logprobs_raw(
+        all_logprobs,
+        logits_metadata.top_logprobs_nums,
+        stage=LogprobStage.PREFILL,
+        extend_logprob_pruned_lens_cpu=logits_metadata.extend_logprob_pruned_lens_cpu,
+    )
+
+
+def get_top_logprobs(
+    logprobs: torch.Tensor,
+    top_logprobs_nums: List[int],
+):
+    return get_top_logprobs_raw(logprobs, top_logprobs_nums, stage=LogprobStage.DECODE)
+
+
+def get_token_ids_logprobs_raw(
+    logprobs: torch.Tensor,
+    token_ids_logprobs: List[Optional[List[int]]],
+    stage: LogprobStage,
+    extend_logprob_pruned_lens_cpu: Optional[List[int]] = None,
+    delay_cpu_copy: bool = False,
+):
+    vals, idxs = [], []
+    if stage == LogprobStage.DECODE:
+        for i, token_ids in enumerate(token_ids_logprobs):
+            if token_ids is None:
+                vals.append([])
+                idxs.append([])
+            else:
+                vals.append(logprobs[i, token_ids].tolist())
+                idxs.append(token_ids)
+    else:  # prefill
+        pt = 0
+        for token_ids, pruned_len in zip(
+            token_ids_logprobs, extend_logprob_pruned_lens_cpu
+        ):
+            if pruned_len <= 0:
+                vals.append([])
+                idxs.append([])
+                continue
+            pos_logprobs = logprobs[pt : pt + pruned_len, token_ids]
+            vals.append(pos_logprobs if delay_cpu_copy else pos_logprobs.tolist())
+            idxs.append([token_ids for _ in range(pruned_len)])
+            pt += pruned_len
+    return vals, idxs
+
+
+def get_token_ids_logprobs_prefill(
+    all_logprobs, logits_metadata: LogitsMetadata, delay_cpu_copy=False
+):
+    return get_token_ids_logprobs_raw(
+        all_logprobs,
+        logits_metadata.token_ids_logprobs,
+        stage=LogprobStage.PREFILL,
+        extend_logprob_pruned_lens_cpu=logits_metadata.extend_logprob_pruned_lens_cpu,
+        delay_cpu_copy=delay_cpu_copy,
+    )
+
+
+def get_token_ids_logprobs(logprobs, token_ids_logprobs):
+    return get_token_ids_logprobs_raw(
+        logprobs, token_ids_logprobs, stage=LogprobStage.DECODE
+    )
+
+
+def get_top_logprobs_chunk(
+    logprobs: torch.Tensor,
+    logits_metadata: LogitsMetadata,
+    top_k_nums: List[int],
+    pruned_lens: List[int],
+    input_top_logprobs_val: List,
+    input_top_logprobs_idx: List,
+    split_pruned_len: int,
+) -> int:
+    """Get top-k logprobs for each sequence in the chunk.
+
+    Args:
+        logprobs: Log probabilities tensor of shape [seq_len, vocab_size]
+        logits_metadata: Metadata containing top-k and pruned length info
+        top_k_nums: List of top-k numbers for each sequence
+        pruned_lens: List of pruned lengths for each sequence
+        input_top_logprobs_val: List to store top-k logprob values
+        input_top_logprobs_idx: List to store top-k token indices
+        split_pruned_len: Length of pruned tokens from previous chunk
+
+    Returns:
+        int: Number of remaining tokens to process in next chunk
+    """
+    # No sequences in the chunk
+    if logprobs.shape[0] == 0:
+        return 0
+
+    max_k = max(logits_metadata.top_logprobs_nums)
+    ret = logprobs.topk(max_k, dim=1)
+    values = ret.values.tolist()
+    indices = ret.indices.tolist()
+
+    pt = 0
+    next_split_pruned_len = 0
+    for n, (k, pruned_len) in enumerate(zip(top_k_nums, pruned_lens)):
+        if n == 0:
+            # For the first sequence, adjust the pruned length
+            pruned_len -= split_pruned_len
+        else:
+            # After the first sequence, no split in the middle
+            split_pruned_len = 0
+
+        if pruned_len <= 0:
+            # if pruned length is less than or equal to 0,
+            # there is no top-k logprobs to process
+            input_top_logprobs_val.append([])
+            input_top_logprobs_idx.append([])
+            continue
+
+        # Get the top-k logprobs
+        val = []
+        idx = []
+        for j in range(pruned_len):
+            # Handle remaining tokens in next chunk if any
+            if pt + j >= len(values):
+                next_split_pruned_len = split_pruned_len + j
+                break
+            # Append the top-k logprobs
+            val.append(values[pt + j][:k])
+            idx.append(indices[pt + j][:k])
+
+        # Append or extend based on whether the sequence was split across chunks
+        if len(val) > 0:
+            if split_pruned_len > 0:
+                input_top_logprobs_val[-1].extend(val)
+                input_top_logprobs_idx[-1].extend(idx)
+            else:
+                input_top_logprobs_val.append(val)
+                input_top_logprobs_idx.append(idx)
+
+        pt += pruned_len
+    return next_split_pruned_len
+
+
+def get_token_ids_logprobs_chunk(
+    logprobs: torch.Tensor,
+    token_ids_logprobs: List[int],
+    pruned_lens: List[int],
+    input_token_ids_logprobs_val: List,
+    input_token_ids_logprobs_idx: List,
+    split_pruned_len: int = 0,
+):
+    """Get token_ids logprobs for each sequence in the chunk.
+
+    Args:
+        logprobs: Log probabilities tensor of shape [seq_len, vocab_size]
+        logits_metadata: Metadata containing token IDs and pruned length info
+        token_ids_logprobs: List of token IDs for each sequence
+        pruned_lens: List of pruned lengths for each sequence
+        input_token_ids_logprobs_val: List to store token logprob values
+        input_token_ids_logprobs_idx: List to store token indices
+        split_pruned_len: Length of pruned tokens from previous chunk
+
+    Returns:
+        int: Number of remaining tokens to process in next chunk
+    """
+
+    # No sequences in the chunk
+    if logprobs.shape[0] == 0:
+        return 0
+
+    pt = 0
+    next_split_pruned_len = 0
+    for n, (token_ids, pruned_len) in enumerate(
+        zip(
+            token_ids_logprobs,
+            pruned_lens,
+        )
+    ):
+        # Adjust pruned length for first sequence
+        if n == 0:
+            pruned_len -= split_pruned_len
+        else:
+            split_pruned_len = 0
+
+        if pruned_len <= 0:
+            # if pruned length is less than or equal to 0,
+            # there is no token ids logprobs to process
+            input_token_ids_logprobs_val.append([])
+            input_token_ids_logprobs_idx.append([])
+            continue
+
+        # Get the token ids logprobs
+        val = []
+        idx = []
+        for j in range(pruned_len):
+            # Handle remaining tokens in next chunk if any
+            if pt + j >= logprobs.shape[0]:
+                next_split_pruned_len = split_pruned_len + j
+                break
+            if token_ids is not None:
+                val.append(logprobs[pt + j, token_ids].tolist())
+                idx.append(token_ids)
+
+        # Append or extend based on whether the sequence was split across chunks
+        if len(val) > 0:
+            if split_pruned_len > 0:
+                input_token_ids_logprobs_val[-1].extend(val)
+                input_token_ids_logprobs_idx[-1].extend(idx)
+            else:
+                input_token_ids_logprobs_val.append(val)
+                input_token_ids_logprobs_idx.append(idx)
+
+        pt += pruned_len
+    return next_split_pruned_len
+
+
+def add_output_logprobs_for_spec_v1(
+    batch: ScheduleBatch,
+    res: Union[EagleVerifyOutput, NgramVerifyInput],
+    logits_output: Optional[LogitsProcessorOutput] = None,
+):
+    # Extract args
+    if logits_output is None:
+        logits_output = res.logits_output
+
+    if hasattr(res, "accept_length_per_req_cpu"):
+        accept_length_per_req_cpu = res.accept_length_per_req_cpu
+    else:
+        # FIXME: Get a NgramVerifyOutput class and use that instead of this hack.
+        accept_length_per_req_cpu = res.accept_length.tolist()
+
+    top_logprobs_nums = batch.top_logprobs_nums
+    token_ids_logprobs = batch.token_ids_logprobs
+    accepted_indices = res.accepted_indices
+    assert len(accepted_indices) == len(logits_output.next_token_logits)
+
+    temperatures = batch.sampling_info.temperatures
+    num_draft_tokens = batch.spec_info.draft_token_num
+    # acceptance indices are the indices in a "flattened" batch.
+    # dividing it to num_draft_tokens will yield the actual batch index.
+    temperatures = temperatures[accepted_indices // num_draft_tokens]
+    if envs.SGLANG_RETURN_ORIGINAL_LOGPROB.get():
+        logprobs = torch.nn.functional.log_softmax(
+            logits_output.next_token_logits, dim=-1
+        )
+    else:
+        logprobs = torch.nn.functional.log_softmax(
+            logits_output.next_token_logits / temperatures, dim=-1
+        )
+    batch_next_token_ids = res.verified_id
+    num_tokens_per_req = [accept + 1 for accept in accept_length_per_req_cpu]
+
+    # We should repeat top_logprobs_nums to match num_tokens_per_req.
+    top_logprobs_nums_repeat_interleaved = [
+        num
+        for num, num_tokens in zip(top_logprobs_nums, num_tokens_per_req)
+        for _ in range(num_tokens)
+    ]
+
+    token_ids_logprobs_repeat_interleaved = [
+        token_ids
+        for token_ids, num_tokens in zip(token_ids_logprobs, num_tokens_per_req)
+        for _ in range(num_tokens)
+    ]
+
+    # Extract logprobs
+    should_top_logprobs = any(x > 0 for x in top_logprobs_nums)
+    should_token_ids_logprobs = any(x is not None for x in token_ids_logprobs)
+    if should_top_logprobs:
+        (
+            logits_output.next_token_top_logprobs_val,
+            logits_output.next_token_top_logprobs_idx,
+        ) = get_top_logprobs(
+            logprobs,
+            top_logprobs_nums_repeat_interleaved,
+        )
+
+    if should_token_ids_logprobs:
+        (
+            logits_output.next_token_token_ids_logprobs_val,
+            logits_output.next_token_token_ids_logprobs_idx,
+        ) = get_token_ids_logprobs(
+            logprobs,
+            token_ids_logprobs_repeat_interleaved,
+        )
+
+    logits_output.next_token_logprobs = logprobs[
+        torch.arange(len(batch_next_token_ids), device=batch.sampling_info.device),
+        batch_next_token_ids,
+    ]
+
+    # Add output logprobs to the request
+    pt = 0
+    next_token_logprobs = logits_output.next_token_logprobs.tolist()
+    verified_ids = batch_next_token_ids.tolist()
+    token_top_logprobs_val = logits_output.next_token_top_logprobs_val
+    token_top_logprobs_idx = logits_output.next_token_top_logprobs_idx
+    for req, num_tokens in zip(batch.reqs, num_tokens_per_req, strict=True):
+        for _ in range(num_tokens):
+            # TODO: add token_ids_logprobs to each request
+            if req.return_logprob:
+                req.output_token_logprobs_val.append(next_token_logprobs[pt])
+                req.output_token_logprobs_idx.append(verified_ids[pt])
+                if req.top_logprobs_num > 0:
+                    assert (
+                        should_top_logprobs
+                    ), "Inconsistent state: should_top_logprobs is False"
+                    req.output_top_logprobs_val.append(token_top_logprobs_val[pt])
+                    req.output_top_logprobs_idx.append(token_top_logprobs_idx[pt])
+            pt += 1
diff --git a/sglang/python/sglang/srt/layers/utils/multi_platform.py b/sglang/python/sglang/srt/layers/utils/multi_platform.py
new file mode 100644
index 0000000000000000000000000000000000000000..eff8766e184cd3d3e1654870324f9b174a9d9a04
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/utils/multi_platform.py
@@ -0,0 +1,114 @@
+from typing import Callable
+
+from torch import nn
+
+from sglang.srt.utils import (
+    cpu_has_amx_support,
+    is_cpu,
+    is_cuda,
+    is_hip,
+    is_musa,
+    is_npu,
+    is_xpu,
+)
+
+_is_cuda = is_cuda()
+_is_hip = is_hip()
+_is_cpu = is_cpu()
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_npu = is_npu()
+_is_xpu = is_xpu()
+_is_musa = is_musa()
+
+
+class MultiPlatformOp(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self._forward_method: Callable = self.dispatch_forward()
+
+        # States for torch.compile
+        self._original_forward_method = None
+        self.is_torch_compile = False
+
+    def enter_torch_compile(self, num_tokens: int):
+        # Skip if Op is already entered compile mode.
+        # NOTE(alcanderian): Some Ops(for example RotaryEmbedding) will be reused
+        # among layers and `enter_torch_compile` will be called many times.
+        # We should prevent `self._original_forward_method` from being overridden when
+        # it is not the first time `enter_torch_compile` called.
+        if self.is_torch_compile:
+            return
+
+        self._original_forward_method = self._forward_method
+        # NOTE: Temporarily workaround MoE
+        # The performance of torch.compile on this layer is not always good when bs > 1,
+        # so we decide to only use torch.compile when bs=1
+        if "FusedMoE" in self.__class__.__name__:
+            if num_tokens == 1:
+                from sglang.srt.layers.moe.fused_moe_native import (
+                    fused_moe_forward_native,
+                )
+
+                self._forward_method = fused_moe_forward_native
+        elif "TopK" in self.__class__.__name__:
+            if num_tokens == 1:
+                self._forward_method = self.forward_native
+        else:
+            self._forward_method = self.forward_native
+        self.is_torch_compile = True
+
+    def leave_torch_compile(self):
+        # Skip if Op is already exited compile mode.
+        if not self.is_torch_compile:
+            return
+
+        self._forward_method = self._original_forward_method
+        self._original_forward_method = None
+        self.is_torch_compile = False
+
+    # Please do not override this method, because `self._forward_method` can change when in torch compile mode
+    def forward(self, *args, **kwargs):
+        return self._forward_method(*args, **kwargs)
+
+    def forward_native(self, *args, **kwargs):
+        raise NotImplementedError
+
+    def forward_cuda(self, *args, **kwargs):
+        raise NotImplementedError
+
+    def forward_npu(self, *args, **kwargs):
+        return self.forward_native(*args, **kwargs)
+
+    def forward_hip(self, *args, **kwargs):
+        return self.forward_cuda(*args, **kwargs)
+
+    def forward_xpu(self, *args, **kwargs):
+        return self.forward_native(*args, **kwargs)
+
+    def forward_musa(self, *args, **kwargs):
+        # XXX (MUSA): MUSA kernels follow the CUDA path by default.
+        # At this stage, sgl-kernel support for MUSA is still under active
+        # development, so we fall back to the PyTorch-native implementation.
+        return self.forward_native(*args, **kwargs)
+
+    def forward_hpu(self, *args, **kwargs):
+        return self.forward_native(*args, **kwargs)
+
+    def forward_cpu(self, *args, **kwargs):
+        return self.forward_native(*args, **kwargs)
+
+    def dispatch_forward(self):
+        if _is_cuda:
+            return self.forward_cuda
+        elif _is_hip:
+            return self.forward_hip
+        elif _is_cpu and _is_cpu_amx_available:
+            return self.forward_cpu
+        elif _is_npu:
+            return self.forward_npu
+        elif _is_xpu:
+            return self.forward_xpu
+        elif _is_musa:
+            return self.forward_musa
+        else:
+            return self.forward_native
diff --git a/sglang/python/sglang/srt/layers/vocab_parallel_embedding.py b/sglang/python/sglang/srt/layers/vocab_parallel_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..63b376c2478719b1e78533e16020b684beeec33f
--- /dev/null
+++ b/sglang/python/sglang/srt/layers/vocab_parallel_embedding.py
@@ -0,0 +1,588 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.3.post1/vllm/model_executor/layers/vocab_parallel_embedding.py
+
+import logging
+from dataclasses import dataclass
+from typing import List, Optional, Sequence, Tuple
+
+import torch
+from torch.nn.parameter import Parameter, UninitializedParameter
+
+from sglang.srt.distributed import (
+    divide,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    get_tp_group,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.distributed.device_communicators.pynccl_allocator import (
+    use_symmetric_memory,
+)
+from sglang.srt.layers.amx_utils import PackWeightMethod
+from sglang.srt.layers.communicator import get_attn_tp_context
+from sglang.srt.layers.dp_attention import (
+    attn_tp_all_reduce,
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    is_allocation_symmetric,
+)
+from sglang.srt.layers.parameter import BasevLLMParameter
+from sglang.srt.layers.quantization.base_config import (
+    QuantizationConfig,
+    QuantizeMethodBase,
+    method_has_implemented_embedding,
+)
+from sglang.srt.layers.quantization.unquant import UnquantizedEmbeddingMethod
+from sglang.srt.utils import (
+    cpu_has_amx_support,
+    get_compiler_backend,
+    is_cpu,
+    is_npu,
+    set_weight_attrs,
+)
+
+DEFAULT_VOCAB_PADDING_SIZE = 64
+
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
+_is_npu = is_npu()
+
+logger = logging.getLogger(__name__)
+
+
+def pad_vocab_size(vocab_size: int, pad_to: int = DEFAULT_VOCAB_PADDING_SIZE) -> int:
+    """Pad the vocab size to the given value."""
+    return ((vocab_size + pad_to - 1) // pad_to) * pad_to
+
+
+def vocab_range_from_per_partition_vocab_size(
+    per_partition_vocab_size: int, rank: int, offset: int = 0
+) -> Sequence[int]:
+    index_f = rank * per_partition_vocab_size
+    index_l = index_f + per_partition_vocab_size
+    return index_f + offset, index_l + offset
+
+
+def vocab_range_from_global_vocab_size(
+    global_vocab_size: int, rank: int, world_size: int, offset: int = 0
+) -> Sequence[int]:
+    per_partition_vocab_size = divide(global_vocab_size, world_size)
+    return vocab_range_from_per_partition_vocab_size(
+        per_partition_vocab_size, rank, offset=offset
+    )
+
+
+@dataclass
+class VocabParallelEmbeddingShardIndices:
+    """Indices for a shard of a vocab parallel embedding."""
+
+    padded_org_vocab_start_index: int
+    padded_org_vocab_end_index: int
+    padded_added_vocab_start_index: int
+    padded_added_vocab_end_index: int
+
+    org_vocab_start_index: int
+    org_vocab_end_index: int
+    added_vocab_start_index: int
+    added_vocab_end_index: int
+
+    @property
+    def num_org_elements(self) -> int:
+        return self.org_vocab_end_index - self.org_vocab_start_index
+
+    @property
+    def num_added_elements(self) -> int:
+        return self.added_vocab_end_index - self.added_vocab_start_index
+
+    @property
+    def num_org_elements_padded(self) -> int:
+        return self.padded_org_vocab_end_index - self.padded_org_vocab_start_index
+
+    @property
+    def num_added_elements_padded(self) -> int:
+        return self.padded_added_vocab_end_index - self.padded_added_vocab_start_index
+
+    @property
+    def num_org_vocab_padding(self) -> int:
+        return self.num_org_elements_padded - self.num_org_elements
+
+    @property
+    def num_added_vocab_padding(self) -> int:
+        return self.num_added_elements_padded - self.num_added_elements
+
+    @property
+    def num_elements_padded(self) -> int:
+        return self.num_org_elements_padded + self.num_added_elements_padded
+
+    def __post_init__(self):
+        # sanity checks
+        assert self.padded_org_vocab_start_index <= self.padded_org_vocab_end_index
+        assert self.padded_added_vocab_start_index <= self.padded_added_vocab_end_index
+
+        assert self.org_vocab_start_index <= self.org_vocab_end_index
+        assert self.added_vocab_start_index <= self.added_vocab_end_index
+
+        assert self.org_vocab_start_index <= self.padded_org_vocab_start_index
+        assert self.added_vocab_start_index <= self.padded_added_vocab_start_index
+        assert self.org_vocab_end_index <= self.padded_org_vocab_end_index
+        assert self.added_vocab_end_index <= self.padded_added_vocab_end_index
+
+        assert self.num_org_elements <= self.num_org_elements_padded
+        assert self.num_added_elements <= self.num_added_elements_padded
+
+
+@torch.compile(dynamic=True, backend=get_compiler_backend(), disable=_is_npu)
+def get_masked_input_and_mask(
+    input_: torch.Tensor,
+    org_vocab_start_index: int,
+    org_vocab_end_index: int,
+    num_org_vocab_padding: int,
+    added_vocab_start_index: int,
+    added_vocab_end_index: int,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    # torch.compile will fuse all of the pointwise ops below
+    # into a single kernel, making it very fast
+    org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < org_vocab_end_index)
+    added_vocab_mask = (input_ >= added_vocab_start_index) & (
+        input_ < added_vocab_end_index
+    )
+    added_offset = (
+        added_vocab_start_index
+        - (org_vocab_end_index - org_vocab_start_index)
+        - num_org_vocab_padding
+    )
+    valid_offset = (org_vocab_start_index * org_vocab_mask) + (
+        added_offset * added_vocab_mask
+    )
+    vocab_mask = org_vocab_mask | added_vocab_mask
+    input_ = vocab_mask * (input_ - valid_offset)
+    return input_, ~vocab_mask
+
+
+class VocabParallelEmbedding(torch.nn.Module):
+    """Embedding parallelized in the vocabulary dimension.
+
+    Adapted from torch.nn.Embedding, note that we pad the vocabulary size to
+    make sure it is divisible by the number of model parallel GPUs.
+
+    In order to support various loading methods, we ensure that LoRA-added
+    embeddings are always at the end of TP-sharded tensors. In other words,
+    we shard base embeddings and LoRA embeddings separately (both padded),
+    and place them in the same tensor.
+    In this example, we will have the original vocab size = 1010,
+    added vocab size = 16 and padding to 64. Therefore, the total
+    vocab size with padding will be 1088 (because we first pad 1010 to
+    1024, add 16, and then pad to 1088).
+    Therefore, the tensor format looks like the following:
+    TP1, rank 0 (no sharding):
+                            |< --------BASE-------- >|< -BASE PADDING-- >|< -----LORA------ >|< -LORA PADDING-- >|
+    corresponding token_id: |  0  |  1  | ... | 1009 |  -1  | ... |  -1  | 1010 | ... | 1015 |  -1  | ... |  -1  |
+                     index: |  0  |  1  | ... | 1009 | 1010 | ... | 1023 | 1024 | ... | 1039 | 1040 | ... | 1087 |
+
+    TP2, rank 0:
+                            |< --------------------BASE--------------------- >|< -----LORA------ >|< -LORA PADDING- >|
+    corresponding token_id: |  0  |  1  |  2  | ... | 497  | 498 | ...  | 511 | 1000 | ... | 1015 |  -1  | ... |  -1 |
+                     index: |  0  |  1  |  2  | ... | 497  | 498 | ...  | 511 | 512  | ... | 527  |  520 | ... | 543 |
+    TP2, rank 1:
+                            |< -----------BASE----------- >|< -BASE PADDING- >|< -----------LORA PADDING----------- >|
+    corresponding token_id: | 512 | 513 | 514 | ... | 1009 | -1  | ...  | -1  |  -1  | ... |  -1  | -1  | ... |   -1 |
+                     index: |  0  |  1  |  2  | ... | 497  | 498 | ...  | 511 | 512  | ... | 519  | 520 | ... |  543 |
+
+    Args:
+        num_embeddings: vocabulary size.
+        embedding_dim: size of hidden state.
+        params_dtype: type of the parameters.
+        org_num_embeddings: original vocabulary size (without LoRA).
+        padding_size: padding size for the vocabulary.
+        quant_config: quant config for the layer
+        prefix: full name of the layer in the state dict
+    """  # noqa: E501
+
+    def __init__(
+        self,
+        num_embeddings: int,
+        embedding_dim: int,
+        *,
+        params_dtype: Optional[torch.dtype] = None,
+        org_num_embeddings: Optional[int] = None,
+        padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        enable_tp: bool = True,
+        use_attn_tp_group: bool = False,
+        use_presharded_weights: bool = False,
+    ):
+        super().__init__()
+        self.quant_config = quant_config
+
+        self.enable_tp = enable_tp
+        self.use_attn_tp_group = use_attn_tp_group
+        if self.enable_tp:
+            if use_attn_tp_group:
+                tp_rank = get_attention_tp_rank()
+                self.tp_size = get_attention_tp_size()
+            else:
+                tp_rank = get_tensor_model_parallel_rank()
+                self.tp_size = get_tensor_model_parallel_world_size()
+        else:
+            assert use_attn_tp_group is False
+            tp_rank = 0
+            self.tp_size = 1
+
+        self.num_embeddings = num_embeddings
+        self.org_vocab_size = org_num_embeddings or num_embeddings
+
+        # Support the case where the vocab size is not divisible by the TP size.
+        if (
+            _is_cpu
+            and pad_vocab_size(self.org_vocab_size, padding_size) % self.tp_size != 0
+        ):
+            padding_size *= self.tp_size
+        self.padding_size = padding_size
+
+        num_added_embeddings = num_embeddings - self.org_vocab_size
+        self.use_presharded_weights = use_presharded_weights
+        if use_presharded_weights:
+            assert (
+                num_added_embeddings == 0
+            ), "Lora is not supported with presharded weights."
+
+        self.org_vocab_size_padded = pad_vocab_size(
+            self.org_vocab_size, self.padding_size
+        )
+        self.num_embeddings_padded = pad_vocab_size(
+            self.org_vocab_size_padded + num_added_embeddings, self.padding_size
+        )
+        assert self.org_vocab_size_padded <= self.num_embeddings_padded
+
+        self.shard_indices = self._get_indices(
+            self.num_embeddings_padded,
+            self.org_vocab_size_padded,
+            self.num_embeddings,
+            self.org_vocab_size,
+            tp_rank,
+            self.tp_size,
+        )
+        self.embedding_dim = embedding_dim
+
+        quant_method = None
+        if quant_config is not None:
+            quant_method = quant_config.get_quant_method(self, prefix=prefix)
+        if quant_method is None:
+            quant_method = UnquantizedEmbeddingMethod()
+
+        # If we are making an embedding layer, then our quantization linear
+        # method must implement the embedding operation. If we are another
+        # layer type like ParallelLMHead, this is not important.
+        is_embedding_layer = type(self.__class__) is VocabParallelEmbedding
+        quant_method_implements_embedding = method_has_implemented_embedding(
+            type(quant_method)
+        )
+        if is_embedding_layer and not quant_method_implements_embedding:
+            raise NotImplementedError(
+                f"The class {type(quant_method).__name__} must implement "
+                "the 'embedding' method, see UnquantizedEmbeddingMethod."
+            )
+
+        self.quant_method: QuantizeMethodBase = quant_method
+
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+        # Divide the weight matrix along the vocaburaly dimension.
+        self.num_added_embeddings = self.num_embeddings - self.org_vocab_size
+        self.num_embeddings_per_partition = divide(
+            self.num_embeddings_padded, self.tp_size
+        )
+        assert (
+            self.shard_indices.num_elements_padded == self.num_embeddings_per_partition
+        )
+        self.num_org_embeddings_per_partition = (
+            self.shard_indices.org_vocab_end_index
+            - self.shard_indices.org_vocab_start_index
+        )
+        self.num_added_embeddings_per_partition = (
+            self.shard_indices.added_vocab_end_index
+            - self.shard_indices.added_vocab_start_index
+        )
+
+        self.quant_method.create_weights(
+            self,
+            self.embedding_dim,
+            [self.num_embeddings_per_partition],
+            self.embedding_dim,
+            self.num_embeddings_padded,
+            params_dtype=params_dtype,
+            weight_loader=self.weight_loader,
+        )
+
+    @classmethod
+    def _get_indices(
+        cls,
+        vocab_size_padded: int,
+        org_vocab_size_padded: int,
+        vocab_size: int,
+        org_vocab_size: int,
+        tp_rank: int,
+        tp_size: int,
+    ) -> VocabParallelEmbeddingShardIndices:
+        """Get start and end indices for vocab parallel embedding, following the
+        layout outlined in the class docstring, based on the given tp_rank and
+        tp_size."""
+        num_added_embeddings_padded = vocab_size_padded - org_vocab_size_padded
+        padded_org_vocab_start_index, padded_org_vocab_end_index = (
+            vocab_range_from_global_vocab_size(org_vocab_size_padded, tp_rank, tp_size)
+        )
+        padded_added_vocab_start_index, padded_added_vocab_end_index = (
+            vocab_range_from_global_vocab_size(
+                num_added_embeddings_padded, tp_rank, tp_size, offset=org_vocab_size
+            )
+        )
+        # remove padding
+        org_vocab_start_index = min(padded_org_vocab_start_index, org_vocab_size)
+        org_vocab_end_index = min(padded_org_vocab_end_index, org_vocab_size)
+        added_vocab_start_index = min(padded_added_vocab_start_index, vocab_size)
+        added_vocab_end_index = min(padded_added_vocab_end_index, vocab_size)
+        return VocabParallelEmbeddingShardIndices(
+            padded_org_vocab_start_index,
+            padded_org_vocab_end_index,
+            padded_added_vocab_start_index,
+            padded_added_vocab_end_index,
+            org_vocab_start_index,
+            org_vocab_end_index,
+            added_vocab_start_index,
+            added_vocab_end_index,
+        )
+
+    def get_sharded_to_full_mapping(self) -> Optional[List[int]]:
+        """Get a mapping that can be used to reindex the gathered
+        logits for sampling.
+
+        During sampling, we gather logits from all ranks. The relationship
+        of index->token_id will follow the same format as outlined in the class
+        docstring. However, after the gather, we want to reindex the final
+        logits tensor to map index->token_id one-to-one (the index is always
+        equal the token_id it corresponds to). The indices returned by this
+        method allow us to do that.
+        """
+        if self.tp_size < 2:
+            return None
+
+        base_embeddings: List[int] = []
+        added_embeddings: List[int] = []
+        padding: List[int] = []
+        for tp_rank in range(self.tp_size):
+            shard_indices = self._get_indices(
+                self.num_embeddings_padded,
+                self.org_vocab_size_padded,
+                self.num_embeddings,
+                self.org_vocab_size,
+                tp_rank,
+                self.tp_size,
+            )
+            range_start = self.num_embeddings_per_partition * tp_rank
+            range_end = self.num_embeddings_per_partition * (tp_rank + 1)
+            base_embeddings.extend(
+                range(range_start, range_start + shard_indices.num_org_elements)
+            )
+            padding.extend(
+                range(
+                    range_start + shard_indices.num_org_elements,
+                    range_start + shard_indices.num_org_elements_padded,
+                )
+            )
+            added_embeddings.extend(
+                range(
+                    range_start + shard_indices.num_org_elements_padded,
+                    range_start
+                    + shard_indices.num_org_elements_padded
+                    + shard_indices.num_added_elements,
+                )
+            )
+            padding.extend(
+                range(
+                    range_start
+                    + shard_indices.num_org_elements_padded
+                    + shard_indices.num_added_elements,
+                    range_start
+                    + shard_indices.num_org_elements_padded
+                    + shard_indices.num_added_elements_padded,
+                )
+            )
+            assert (
+                range_start
+                + shard_indices.num_org_elements_padded
+                + shard_indices.num_added_elements_padded
+                == range_end
+            )
+        ret = base_embeddings + added_embeddings + padding
+        assert len(ret) == self.num_embeddings_padded
+        return ret
+
+    def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
+        output_dim = getattr(param, "output_dim", None)
+        packed_dim = getattr(param, "packed_dim", None)
+
+        # If the parameter is a gguf weight, then load it directly.
+        if getattr(param, "is_gguf_weight_type", None):
+            param.data.copy_(loaded_weight)
+            param.weight_type = loaded_weight.item()
+            return
+        elif isinstance(param, UninitializedParameter):
+            shape = list(loaded_weight.shape)
+            if output_dim is not None:
+                shape[output_dim] = shape[output_dim] // self.tp_size
+            param.materialize(tuple(shape), dtype=loaded_weight.dtype)
+
+        # If parameter does not have output dim, then it should
+        # be copied onto all gpus (e.g. g_idx for act_order gptq).
+        if output_dim is None:
+            assert param.data.shape == loaded_weight.shape
+            param.data.copy_(loaded_weight)
+            return
+
+        # Shard indexes for loading the weight
+        start_idx = self.shard_indices.org_vocab_start_index
+        shard_size = self.shard_indices.org_vocab_end_index - start_idx
+
+        # If param packed on the same dim we are sharding on, then
+        # need to adjust offsets of loaded weight by pack_factor.
+        if packed_dim is not None and packed_dim == output_dim:
+            packed_factor = (
+                param.packed_factor
+                if isinstance(param, BasevLLMParameter)
+                else param.packed_factor
+            )
+            assert loaded_weight.shape[output_dim] == (
+                self.org_vocab_size // param.packed_factor
+            )
+            start_idx = start_idx // packed_factor
+            shard_size = shard_size // packed_factor
+        else:
+            assert loaded_weight.shape[output_dim] == (
+                self.org_vocab_size
+                // (self.tp_size if self.use_presharded_weights else 1)
+            ), f"{self.org_vocab_size=} {self.use_presharded_weights=} {loaded_weight.shape[output_dim]=}"
+
+        # Copy the data.
+        if not self.use_presharded_weights:
+            loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
+        param[: loaded_weight.shape[0]].data.copy_(loaded_weight)
+        param[loaded_weight.shape[0] :].data.fill_(0)
+
+    def forward(self, input_):
+        if self.tp_size > 1:
+            # Build the mask.
+            masked_input, input_mask = get_masked_input_and_mask(
+                input_,
+                self.shard_indices.org_vocab_start_index,
+                self.shard_indices.org_vocab_end_index,
+                self.shard_indices.num_org_vocab_padding,
+                self.shard_indices.added_vocab_start_index,
+                self.shard_indices.added_vocab_end_index,
+            )
+        else:
+            masked_input = input_
+
+        # Get the embeddings.
+        with use_symmetric_memory(
+            get_tp_group(), disabled=not is_allocation_symmetric()
+        ):
+            output_parallel = self.quant_method.embedding(self, masked_input.long())
+
+        if self.tp_size > 1:
+            # Mask the output embedding.
+            output_parallel.masked_fill_(input_mask.unsqueeze(-1), 0)
+            if not get_attn_tp_context().input_scattered:
+                if self.use_attn_tp_group:
+                    output_parallel = attn_tp_all_reduce(output_parallel)
+                else:
+                    # Reduce across all the model parallel GPUs.
+                    output_parallel = tensor_model_parallel_all_reduce(output_parallel)
+        return output_parallel
+
+    def extra_repr(self) -> str:
+        s = f"num_embeddings={self.num_embeddings_per_partition}"
+        s += f", embedding_dim={self.embedding_dim}"
+        s += f", org_vocab_size={self.org_vocab_size}"
+        s += f", num_embeddings_padded={self.num_embeddings_padded}"
+        if self.enable_tp:
+            s += f", tp_size={self.tp_size}"
+        return s
+
+
+class ParallelLMHead(VocabParallelEmbedding):
+    """Parallelized LM head.
+
+    Output logits weight matrices used in the Sampler. The weight and bias
+    tensors are padded to make sure they are divisible by the number of
+    model parallel GPUs.
+
+    Args:
+        num_embeddings: vocabulary size.
+        embedding_dim: size of hidden state.
+        bias: whether to use bias.
+        params_dtype: type of the parameters.
+        org_num_embeddings: original vocabulary size (without LoRA).
+        padding_size: padding size for the vocabulary.
+    """
+
+    def __init__(
+        self,
+        num_embeddings: int,
+        embedding_dim: int,
+        *,
+        bias: bool = False,
+        params_dtype: Optional[torch.dtype] = None,
+        org_num_embeddings: Optional[int] = None,
+        padding_size: int = DEFAULT_VOCAB_PADDING_SIZE,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        use_attn_tp_group: bool = False,
+        use_presharded_weights: bool = False,
+    ):
+        super().__init__(
+            num_embeddings,
+            embedding_dim,
+            params_dtype=params_dtype,
+            org_num_embeddings=org_num_embeddings,
+            padding_size=padding_size,
+            quant_config=quant_config,
+            prefix=prefix,
+            use_attn_tp_group=use_attn_tp_group,
+            use_presharded_weights=use_presharded_weights,
+        )
+        self.quant_config = quant_config
+
+        # We only support pack LMHead if it's not quantized.
+        if _is_cpu and _is_cpu_amx_available:
+            if hasattr(self, "weight") and self.weight.dtype in [
+                torch.bfloat16,
+                torch.float16,
+            ]:
+                self.quant_method = PackWeightMethod(weight_names=["weight"])
+
+        if bias:
+            self.bias = Parameter(
+                torch.empty(self.num_embeddings_per_partition, dtype=params_dtype)
+            )
+            set_weight_attrs(
+                self.bias,
+                {
+                    "output_dim": 0,
+                    "weight_loader": self.weight_loader,
+                },
+            )
+        else:
+            self.register_parameter("bias", None)
+
+    def tie_weights(self, embed_tokens: VocabParallelEmbedding):
+        """Tie the weights with word embeddings."""
+        # GGUF quantized embed_tokens.
+        if self.quant_config and self.quant_config.get_name() == "gguf":
+            return embed_tokens
+        else:
+            self.weight = embed_tokens.weight
+            return self
+
+    def forward(self, input_):
+        del input_
+        raise RuntimeError("LMHead's weights should be used in the sampler.")
diff --git a/sglang/python/sglang/srt/managers/__pycache__/async_dynamic_batch_tokenizer.cpython-311.pyc b/sglang/python/sglang/srt/managers/__pycache__/async_dynamic_batch_tokenizer.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5550834dc7e6361d3551ccbb51bcdd3c5069a248
Binary files /dev/null and b/sglang/python/sglang/srt/managers/__pycache__/async_dynamic_batch_tokenizer.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/managers/__pycache__/async_mm_data_processor.cpython-311.pyc b/sglang/python/sglang/srt/managers/__pycache__/async_mm_data_processor.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8ce3989ad17d6e1d29945021451ddf916540ddbd
Binary files /dev/null and b/sglang/python/sglang/srt/managers/__pycache__/async_mm_data_processor.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/managers/__pycache__/cache_controller.cpython-311.pyc b/sglang/python/sglang/srt/managers/__pycache__/cache_controller.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..308e9d0755b599b37d9796383fd38bd40fe79d57
Binary files /dev/null and b/sglang/python/sglang/srt/managers/__pycache__/cache_controller.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/managers/__pycache__/data_parallel_controller.cpython-311.pyc b/sglang/python/sglang/srt/managers/__pycache__/data_parallel_controller.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3cf424fe0e82f8114e423a1019db079787be82d7
Binary files /dev/null and b/sglang/python/sglang/srt/managers/__pycache__/data_parallel_controller.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/managers/__pycache__/detokenizer_manager.cpython-311.pyc b/sglang/python/sglang/srt/managers/__pycache__/detokenizer_manager.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8307001270c593d78dc1a754eba217a2a86405cb
Binary files /dev/null and b/sglang/python/sglang/srt/managers/__pycache__/detokenizer_manager.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/managers/__pycache__/disagg_service.cpython-311.pyc b/sglang/python/sglang/srt/managers/__pycache__/disagg_service.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ca2ac14e879d48770ffaeba0dae5e79156ec7fae
Binary files /dev/null and b/sglang/python/sglang/srt/managers/__pycache__/disagg_service.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/managers/__pycache__/io_struct.cpython-311.pyc b/sglang/python/sglang/srt/managers/__pycache__/io_struct.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d040ce75e0eb4d0259a30792898a964fcb470b48
Binary files /dev/null and b/sglang/python/sglang/srt/managers/__pycache__/io_struct.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/managers/__pycache__/mm_utils.cpython-311.pyc b/sglang/python/sglang/srt/managers/__pycache__/mm_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c5cc8080b7e48df0397ac1af52d9e1c3d87d4662
Binary files /dev/null and b/sglang/python/sglang/srt/managers/__pycache__/mm_utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/managers/__pycache__/multi_tokenizer_mixin.cpython-311.pyc b/sglang/python/sglang/srt/managers/__pycache__/multi_tokenizer_mixin.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..43901f4be4a163ce6813906f5d21e3bddea8a97b
Binary files /dev/null and b/sglang/python/sglang/srt/managers/__pycache__/multi_tokenizer_mixin.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/managers/__pycache__/multimodal_processor.cpython-311.pyc b/sglang/python/sglang/srt/managers/__pycache__/multimodal_processor.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..54178a8767cdcb95557387781012dc76b85789ca
Binary files /dev/null and b/sglang/python/sglang/srt/managers/__pycache__/multimodal_processor.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/managers/__pycache__/overlap_utils.cpython-311.pyc b/sglang/python/sglang/srt/managers/__pycache__/overlap_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0c436958d7b97acf175e0e3d6050d16539f8ed6e
Binary files /dev/null and b/sglang/python/sglang/srt/managers/__pycache__/overlap_utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/managers/__pycache__/prefill_delayer.cpython-311.pyc b/sglang/python/sglang/srt/managers/__pycache__/prefill_delayer.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ba9aa65208a2bc9f713b7a1ac7a46e17e17d7517
Binary files /dev/null and b/sglang/python/sglang/srt/managers/__pycache__/prefill_delayer.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/managers/__pycache__/schedule_policy.cpython-311.pyc b/sglang/python/sglang/srt/managers/__pycache__/schedule_policy.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3dcfdb3f8e465b0ed6da0d25a10973e092d46b3a
Binary files /dev/null and b/sglang/python/sglang/srt/managers/__pycache__/schedule_policy.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/managers/__pycache__/scheduler_dp_attn_mixin.cpython-311.pyc b/sglang/python/sglang/srt/managers/__pycache__/scheduler_dp_attn_mixin.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f7a9e498d721ee12c5bf596875ce01dd1a97dc4c
Binary files /dev/null and b/sglang/python/sglang/srt/managers/__pycache__/scheduler_dp_attn_mixin.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/managers/__pycache__/scheduler_input_blocker.cpython-311.pyc b/sglang/python/sglang/srt/managers/__pycache__/scheduler_input_blocker.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5d10799191f1bbc77daa464fb53288eb841f356d
Binary files /dev/null and b/sglang/python/sglang/srt/managers/__pycache__/scheduler_input_blocker.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/managers/__pycache__/scheduler_output_processor_mixin.cpython-311.pyc b/sglang/python/sglang/srt/managers/__pycache__/scheduler_output_processor_mixin.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8bb8e8e0fb33ad7621e963c6954db2e89a76b087
Binary files /dev/null and b/sglang/python/sglang/srt/managers/__pycache__/scheduler_output_processor_mixin.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/managers/__pycache__/scheduler_pp_mixin.cpython-311.pyc b/sglang/python/sglang/srt/managers/__pycache__/scheduler_pp_mixin.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8c8bddf3201fd1411ca7f183f37d5c68856c49d0
Binary files /dev/null and b/sglang/python/sglang/srt/managers/__pycache__/scheduler_pp_mixin.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/managers/__pycache__/scheduler_profiler_mixin.cpython-311.pyc b/sglang/python/sglang/srt/managers/__pycache__/scheduler_profiler_mixin.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..48d5c5030e52f69696c554f4ae454635c0db3f11
Binary files /dev/null and b/sglang/python/sglang/srt/managers/__pycache__/scheduler_profiler_mixin.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/managers/__pycache__/scheduler_recv_skipper.cpython-311.pyc b/sglang/python/sglang/srt/managers/__pycache__/scheduler_recv_skipper.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b9ff83074e2a0f6bcc4baed2570ba31db1f4b7e3
Binary files /dev/null and b/sglang/python/sglang/srt/managers/__pycache__/scheduler_recv_skipper.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/managers/__pycache__/scheduler_runtime_checker_mixin.cpython-311.pyc b/sglang/python/sglang/srt/managers/__pycache__/scheduler_runtime_checker_mixin.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9bc058fba195495edd807307ab49fdb898d97d4a
Binary files /dev/null and b/sglang/python/sglang/srt/managers/__pycache__/scheduler_runtime_checker_mixin.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/managers/__pycache__/scheduler_update_weights_mixin.cpython-311.pyc b/sglang/python/sglang/srt/managers/__pycache__/scheduler_update_weights_mixin.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a52dd154fe2490094beb1c690ce6d544a361e89b
Binary files /dev/null and b/sglang/python/sglang/srt/managers/__pycache__/scheduler_update_weights_mixin.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/managers/__pycache__/session_controller.cpython-311.pyc b/sglang/python/sglang/srt/managers/__pycache__/session_controller.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8a64fe26dc6f49edac661c2ba9d9ad996a8b3018
Binary files /dev/null and b/sglang/python/sglang/srt/managers/__pycache__/session_controller.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/managers/__pycache__/template_manager.cpython-311.pyc b/sglang/python/sglang/srt/managers/__pycache__/template_manager.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ed9cb841663fa5adec26ce37d8ab2b1a611b88be
Binary files /dev/null and b/sglang/python/sglang/srt/managers/__pycache__/template_manager.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/managers/__pycache__/tokenizer_communicator_mixin.cpython-311.pyc b/sglang/python/sglang/srt/managers/__pycache__/tokenizer_communicator_mixin.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3f85de3639c4973241f72748a83943ef1a1a50ca
Binary files /dev/null and b/sglang/python/sglang/srt/managers/__pycache__/tokenizer_communicator_mixin.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/managers/__pycache__/tokenizer_manager_multiitem_mixin.cpython-311.pyc b/sglang/python/sglang/srt/managers/__pycache__/tokenizer_manager_multiitem_mixin.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7483cf3533b62a4a142ff8c48dd1f082efeab1d1
Binary files /dev/null and b/sglang/python/sglang/srt/managers/__pycache__/tokenizer_manager_multiitem_mixin.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/managers/__pycache__/tp_worker.cpython-311.pyc b/sglang/python/sglang/srt/managers/__pycache__/tp_worker.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c6bdbe33a2f0a1d388de653e3c445e56301c2f65
Binary files /dev/null and b/sglang/python/sglang/srt/managers/__pycache__/tp_worker.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/managers/__pycache__/utils.cpython-311.pyc b/sglang/python/sglang/srt/managers/__pycache__/utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8d48aec006a9596b602f11ed9a0d6f90ed775aa9
Binary files /dev/null and b/sglang/python/sglang/srt/managers/__pycache__/utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/managers/async_dynamic_batch_tokenizer.py b/sglang/python/sglang/srt/managers/async_dynamic_batch_tokenizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef1a8307f3c7da8bf32f84127f8f8168fe17d61a
--- /dev/null
+++ b/sglang/python/sglang/srt/managers/async_dynamic_batch_tokenizer.py
@@ -0,0 +1,170 @@
+"""
+Asynchronous dynamic batch tokenizer for SGLang.
+
+This module provides an async tokenizer with dynamic batching capabilities
+to reduce tokenization overhead when multiple requests arrive concurrently.
+"""
+
+import asyncio
+import logging
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+from typing import Any, Dict, List, Optional
+
+logger = logging.getLogger(__name__)
+
+
+class AsyncDynamicbatchTokenizer:
+    """Asynchronous tokenizer with dynamic batching for single string prompts.
+
+    Dynamically batches pending encode requests from a queue to reduce overhead.
+    Only handles single string prompts - regular batch processing of multiple
+    strings per request should be handled at a higher level.
+    A single-thread ThreadPoolExecutor is used so the event loop stays responsive.
+
+    Note: Uses lazy initialization for asyncio components because this class
+    is instantiated in TokenizerManager.__init__() before the event loop starts.
+    """
+
+    def __init__(
+        self,
+        tokenizer,
+        max_batch_size: int = 32,
+        batch_wait_timeout_s: float = 0.002,
+    ) -> None:
+        self.tokenizer = tokenizer
+        self.max_batch_size = max_batch_size
+        self.batch_wait_timeout_s = batch_wait_timeout_s
+
+        # Single queue for all encode requests - initialized lazily
+        self._queue: Optional[asyncio.Queue] = None
+        self._batcher_task: Optional[asyncio.Task] = None
+
+        # Single-thread executor for blocking tokenizer calls
+        self._executor = ThreadPoolExecutor(max_workers=1)
+        self._initialized = False
+
+    def _ensure_initialized(self):
+        """Lazy initialization of event loop dependent components."""
+        if not self._initialized:
+            self._queue = asyncio.Queue()
+            self._batcher_task = asyncio.create_task(self._dynamic_batch_loop())
+            self._initialized = True
+
+    async def __call__(self, prompt: str, **kwargs) -> Any:
+        """Encode a single prompt."""
+        return await self.encode(prompt, **kwargs)
+
+    async def encode(self, prompt: str, **kwargs) -> Any:
+        """Encode a single prompt."""
+        self._ensure_initialized()
+        result_future: asyncio.Future = asyncio.get_running_loop().create_future()
+        await self._queue.put((prompt, kwargs, result_future))
+        return await result_future
+
+    async def _dynamic_batch_loop(self):
+        """Dynamically batch incoming encode requests for efficiency."""
+        while True:
+            try:
+                # Get the first request
+                prompt, kwargs, result_future = await self._queue.get()
+
+                # Collect requests into dynamic batch
+                prompts = [prompt]
+                kwargs_list = [kwargs]
+                result_futures = [result_future]
+
+                # Check if there are more items immediately available in the queue
+                # If queue is empty, process single item immediately without timeout
+                if self._queue.empty():
+                    # No other requests waiting, process immediately
+                    pass
+                else:
+                    # There might be more requests, wait for dynamic batching opportunity
+                    start_time = asyncio.get_running_loop().time()
+
+                    # Collect more requests up to max_batch_size or batch_wait_timeout_s
+                    while len(prompts) < self.max_batch_size:
+                        elapsed = asyncio.get_running_loop().time() - start_time
+                        if elapsed >= self.batch_wait_timeout_s:
+                            break
+
+                        remaining_time = self.batch_wait_timeout_s - elapsed
+                        try:
+                            prompt, kwargs, result_future = await asyncio.wait_for(
+                                self._queue.get(), remaining_time
+                            )
+                            prompts.append(prompt)
+                            kwargs_list.append(kwargs)
+                            result_futures.append(result_future)
+                        except asyncio.TimeoutError:
+                            break
+
+                # Log dynamic batch information
+                logger.debug(
+                    f"AsyncDynamicbatchTokenizer: Processing dynamic batch of size {len(prompts)}"
+                )
+
+                # Process the dynamic batch
+                await self._process_dynamic_batch(prompts, kwargs_list, result_futures)
+
+            except Exception as e:
+                logger.error(f"Error in dynamic batch loop: {e}")
+                # Continue the loop to handle other requests
+
+    async def _process_dynamic_batch(
+        self,
+        prompts: List[str],
+        kwargs_list: List[Dict],
+        result_futures: List[asyncio.Future],
+    ) -> None:
+        """Process a dynamic batch of encode requests for single string prompts."""
+        # Check if all kwargs are identical for efficient batch processing
+        can_batch = len(set(str(sorted(kw.items())) for kw in kwargs_list)) == 1
+        kwargs = kwargs_list[0] if can_batch else None
+
+        try:
+            # If every request uses identical kwargs we can run a single
+            # batch tokenizer call for a big speed-up.
+            if can_batch and len(prompts) > 1:
+                encode_fn = partial(self.tokenizer, prompts, **kwargs)
+                results = await asyncio.get_running_loop().run_in_executor(
+                    self._executor, encode_fn
+                )
+
+                for i, fut in enumerate(result_futures):
+                    if not fut.done():
+                        data = {k: v[i] for k, v in results.items()}
+                        fut.set_result(data)
+            else:
+                # Process each request individually due to different kwargs
+                if len(prompts) > 1 and not can_batch:
+                    logger.warning(
+                        f"AsyncDynamicbatchTokenizer: Dynamic batching disabled for batch of {len(prompts)} "
+                        f"requests due to differing kwargs. This reduces performance benefits. "
+                        f"Consider using consistent tokenization parameters across requests."
+                    )
+
+                encode_fn = lambda prompts=prompts, kwargs=kwargs_list: [
+                    self.tokenizer(p, **kw) for p, kw in zip(prompts, kwargs_list)
+                ]
+                results = await asyncio.get_running_loop().run_in_executor(
+                    self._executor, encode_fn
+                )
+
+                for fut, res in zip(result_futures, results):
+                    if not fut.done():
+                        fut.set_result(res)
+        except Exception as e:
+            logger.error(f"Error in dynamic batch processing: {e}")
+            for fut in result_futures:
+                if not fut.done():
+                    fut.set_exception(e)
+
+    def __del__(self):
+        """Clean up background tasks."""
+        if hasattr(self, "_batcher_task") and self._batcher_task:
+            if not self._batcher_task.done():
+                self._batcher_task.cancel()
+        if hasattr(self, "_executor"):
+            self._executor.shutdown(wait=False)
diff --git a/sglang/python/sglang/srt/managers/async_mm_data_processor.py b/sglang/python/sglang/srt/managers/async_mm_data_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..85e8580cb7697afbe583324cfbcb7c18efc54431
--- /dev/null
+++ b/sglang/python/sglang/srt/managers/async_mm_data_processor.py
@@ -0,0 +1,122 @@
+import asyncio
+import logging
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+from typing import Any, Dict, List, Optional, Union
+
+logger = logging.getLogger(__name__)
+
+
+class AsyncMMDataProcessor:
+    """
+    Async wrapper for a multimodal processor.
+
+    Behavior:
+      - If the underlying processor exposes `process_mm_data_async`, call/await it directly.
+      - Otherwise, fall back to running a synchronous `process_mm_data` in a thread pool.
+      - Optionally guard per-call concurrency via an asyncio.Semaphore.
+      - Optionally enforce per-call timeout via asyncio.wait_for.
+    """
+
+    def __init__(
+        self,
+        mm_processor: Any,
+        *,
+        max_concurrent_calls: Optional[int] = None,
+        timeout_s: Optional[float] = None,
+    ) -> None:
+        """
+        Args:
+            mm_processor: An object exposing either
+                - async def process_mm_data_async(...): -> Dict[str, Any]
+              or
+                - def process_mm_data(...): -> Dict[str, Any]
+            max_concurrent_calls: Optional concurrency cap for per-call execution.
+            timeout_s: Optional timeout (seconds) for each `process()` call.
+        """
+        self.mm_processor = mm_processor
+        self.timeout_s = timeout_s
+
+        # Concurrency guard (None -> unlimited)
+        self.semaphore = (
+            asyncio.Semaphore(max_concurrent_calls) if max_concurrent_calls else None
+        )
+
+        # Detect async path; if missing, prepare a fallback executor for sync path
+        self._proc_async = getattr(mm_processor, "process_mm_data_async", None)
+        self.is_async = asyncio.iscoroutinefunction(self._proc_async)
+        self.fallback_exec: Optional[ThreadPoolExecutor] = (
+            ThreadPoolExecutor(max_workers=max_concurrent_calls)
+            if not self.is_async
+            else None
+        )
+
+    async def process(
+        self,
+        *,
+        image_data: Optional[List[Union[str, bytes]]] = None,
+        audio_data: Optional[List[Union[str, bytes]]] = None,
+        input_text_or_ids: Union[str, List[int], None] = None,
+        request_obj: Any,
+        **kwargs: Any,
+    ) -> Dict[str, Any]:
+        """
+        Public entrypoint: process a single multimodal request without blocking the event loop.
+        """
+
+        async def _invoke() -> Dict[str, Any]:
+            if self.is_async:
+                # Native async implementation
+                return await self._proc_async(
+                    image_data=image_data,
+                    audio_data=audio_data,
+                    input_text=input_text_or_ids,
+                    request_obj=request_obj,
+                    **kwargs,
+                )
+
+            # Synchronous fallback
+            sync_fn = getattr(self.mm_processor, "process_mm_data", None)
+            if not callable(sync_fn):
+                raise RuntimeError(
+                    "mm_processor has neither 'process_mm_data_async' nor 'process_mm_data'."
+                )
+            loop = asyncio.get_running_loop()
+            fn = partial(
+                sync_fn,
+                image_data=image_data,
+                audio_data=audio_data,
+                input_text=input_text_or_ids,
+                request_obj=request_obj,
+                **kwargs,
+            )
+            return await loop.run_in_executor(self.fallback_exec, fn)
+
+        # Apply optional concurrency guard
+        if self.semaphore is not None:
+            async with self.semaphore:
+                if self.timeout_s is not None:
+                    return await asyncio.wait_for(_invoke(), timeout=self.timeout_s)
+                return await _invoke()
+
+        # No concurrency guard
+        if self.timeout_s is not None:
+            return await asyncio.wait_for(_invoke(), timeout=self.timeout_s)
+        return await _invoke()
+
+    def shutdown(self) -> None:
+        """Gracefully shutdown resources owned by this wrapper."""
+        try:
+            if self.fallback_exec:
+                self.fallback_exec.shutdown(wait=False)
+        except Exception:
+            logger.exception(
+                "Error while shutting down fallback executor in AsyncMMDataProcessor"
+            )
+
+    def __del__(self):
+        # Best-effort shutdown
+        try:
+            self.shutdown()
+        except Exception:
+            pass
diff --git a/sglang/python/sglang/srt/managers/cache_controller.py b/sglang/python/sglang/srt/managers/cache_controller.py
new file mode 100644
index 0000000000000000000000000000000000000000..51030ae1d1bdf949493576ffc6eeb7dbd004c16f
--- /dev/null
+++ b/sglang/python/sglang/srt/managers/cache_controller.py
@@ -0,0 +1,1042 @@
+from __future__ import annotations
+
+"""
+Copyright 2023-2025 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import logging
+import threading
+import time
+from queue import Empty, Full, Queue
+from typing import TYPE_CHECKING, List, NamedTuple, Optional
+
+import torch
+
+from sglang.srt.mem_cache.hicache_storage import (
+    HiCacheStorageConfig,
+    HiCacheStorageExtraInfo,
+)
+
+if TYPE_CHECKING:
+    from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
+    from sglang.srt.mem_cache.memory_pool_host import HostKVCache
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.layers.dp_attention import (
+    get_attention_dp_rank,
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.mem_cache.memory_pool import MLATokenToKVPool
+from sglang.srt.utils import get_device_module
+
+logger = logging.getLogger(__name__)
+
+device_module = get_device_module()
+
+
+class LayerLoadingEvent:
+    def __init__(self, num_layers: int):
+        self._num_layers = num_layers
+        self.load_events = [device_module.Event() for _ in range(num_layers)]
+        self.start_event = device_module.Event()  # start event on controller stream
+
+    def complete(self, layer_index: int):
+        assert 0 <= layer_index < self._num_layers
+        self.load_events[layer_index].record()
+
+    def wait(self, layer_index: int):
+        device_module.current_stream().wait_event(self.load_events[layer_index])
+
+    @property
+    def finish_event(self):
+        return self.load_events[-1]
+
+
+class LayerDoneCounter:
+    def __init__(self, num_layers: int):
+        self.num_layers = num_layers
+        # extra producer and consumer counters for overlap mode
+        self.num_counters = 3
+        self.events = [LayerLoadingEvent(num_layers) for _ in range(self.num_counters)]
+        self.producer_index = -1
+        self.consumer_index = -1
+
+    def update_producer(self):
+        self.producer_index = (self.producer_index + 1) % self.num_counters
+        assert self.events[
+            self.producer_index
+        ].finish_event.query(), (
+            "Producer finish event should be ready before being reused."
+        )
+        return self.producer_index
+
+    def set_consumer(self, index: int):
+        self.consumer_index = index
+
+    def wait_until(self, threshold: int):
+        if self.consumer_index < 0:
+            return
+        self.events[self.consumer_index].wait(threshold)
+
+    def reset(self):
+        self.producer_index = -1
+        self.consumer_index = -1
+
+
+class CacheOperation:
+
+    counter = 0
+
+    def __init__(
+        self,
+        host_indices: torch.Tensor,
+        device_indices: torch.Tensor,
+        node_id: int,
+        priority: Optional[int] = None,
+    ):
+        self.host_indices = host_indices
+        self.device_indices = device_indices
+        self.node_ids = [node_id]
+        self.data = None
+
+        self.id = CacheOperation.counter
+        CacheOperation.counter += 1
+        # default priority is the order of creation
+        self.priority = priority if priority is not None else self.id
+
+    @staticmethod
+    def merge_ops(ops: List[CacheOperation]) -> CacheOperation:
+        assert len(ops) > 0
+        if len(ops) == 1:
+            return ops[0]
+
+        host_indices = torch.cat([op.host_indices for op in ops])
+        device_indices = torch.cat([op.device_indices for op in ops])
+        node_ids = []
+        priority = min(op.priority for op in ops)
+        for op in ops:
+            node_ids.extend(op.node_ids)
+        merged_op = CacheOperation(host_indices, device_indices, -1, priority)
+        merged_op.node_ids = node_ids
+        return merged_op
+
+    def __lt__(self, other: CacheOperation):
+        return self.priority < other.priority
+
+
+class HiCacheAck(NamedTuple):
+    start_event: device_module.Event
+    finish_event: device_module.Event
+    node_ids: List[int]
+
+
+class TransferBuffer:
+    """
+    Overlapping buffer preparation and transfer operations to improve throughput.
+    """
+
+    def __init__(
+        self, stop_event, buffer_count: int = 3, max_buffer_size: int = 1024
+    ) -> None:
+        self.stop_event = stop_event
+        self.buffers = Queue(maxsize=buffer_count)
+        # todo: adjust the buffer size based on throughput profile of the system
+        self.max_buffer_size = max_buffer_size
+
+    def full(self) -> bool:
+        return self.buffers.full()
+
+    def empty(self) -> bool:
+        return self.buffers.empty()
+
+    def put(self, item, block=True, timeout=1) -> None:
+        while not self.stop_event.is_set():
+            try:
+                self.buffers.put(item, block=block, timeout=timeout)
+                break
+            except Full:
+                if not block:
+                    break
+                continue
+            except Exception as e:
+                logger.error(e)
+
+    def get(self, block=True, timeout=1) -> Optional[CacheOperation]:
+        try:
+            return self.buffers.get(block=block, timeout=timeout)
+        except Empty:
+            return None
+        except Exception as e:
+            logger.error(e)
+
+    def clear(self):
+        self.buffers.queue.clear()
+
+
+class StorageOperation:
+    counter = 0
+
+    def __init__(
+        self,
+        host_indices: torch.Tensor,
+        token_ids: List[int],
+        last_hash: Optional[str] = None,
+        hash_value: Optional[List[str]] = None,
+        prefix_keys: Optional[List[str]] = None,
+    ):
+        self.host_indices = host_indices
+        self.token_ids = token_ids
+        self.last_hash = last_hash
+        self.completed_tokens = 0
+        self.hash_value = hash_value if hash_value is not None else []
+        self.prefix_keys = prefix_keys
+
+        self.id = StorageOperation.counter
+        StorageOperation.counter += 1
+
+    def __lt__(self, other: "StorageOperation"):
+        return self.id < other.id
+
+
+class PrefetchOperation(StorageOperation):
+    def __init__(
+        self,
+        request_id: str,
+        host_indices: torch.Tensor,
+        token_ids: List[int],
+        last_hash: Optional[str] = None,
+        prefix_keys: Optional[List[str]] = None,
+    ):
+        self.request_id = request_id
+
+        self._lock = threading.Lock()
+        self._terminated_flag = False
+        self.start_time = time.monotonic()
+
+        super().__init__(host_indices, token_ids, last_hash, prefix_keys=prefix_keys)
+
+    def increment(self, num_tokens: int):
+        with self._lock:
+            if self._terminated_flag:
+                return False
+            self.completed_tokens += num_tokens
+            return True
+
+    def mark_terminate(self):
+        with self._lock:
+            self._terminated_flag = True
+
+    def is_terminated(self) -> bool:
+        return self._terminated_flag
+
+
+class HiCacheController:
+
+    def __init__(
+        self,
+        token_to_kv_pool_allocator: BaseTokenToKVPoolAllocator,
+        mem_pool_host: HostKVCache,
+        page_size: int,
+        tp_group: torch.distributed.ProcessGroup,
+        load_cache_event: threading.Event,
+        write_policy: str = "write_through_selective",
+        io_backend: str = "",
+        storage_backend: Optional[str] = None,
+        prefetch_threshold: int = 256,
+        model_name: Optional[str] = None,
+        storage_backend_extra_config: Optional[dict] = None,
+        pp_rank: int = 0,
+        pp_size: int = 1,
+    ):
+        self.tp_group = tp_group
+        self.mem_pool_device_allocator = token_to_kv_pool_allocator
+        self.mem_pool_device = token_to_kv_pool_allocator.get_kvcache()
+        self.mem_pool_host = mem_pool_host
+        self.write_policy = write_policy
+        self.page_size = page_size
+        self.io_backend = io_backend
+        self.enable_storage = False
+        self.storage_backend = None
+        self.storage_backend_type = None
+        self.pp_rank = pp_rank
+        self.pp_size = pp_size
+
+        # Default storage page IO functions (may be overridden by attach).
+        self.page_get_func = self._generic_page_get
+        self.page_set_func = self._generic_page_set
+
+        # Dedicated stop event for storage background threads (prefetch/backup).
+        # NOTE: Do NOT reuse `self.stop_event` here since it also guards core HiCache
+        # transfer buffers (CPU<->GPU). We want to allow runtime attach/detach of
+        # storage without stopping the whole controller.
+        self.storage_stop_event = threading.Event()
+
+        self.device = self.mem_pool_device.device
+        self.layer_num = self.mem_pool_device.layer_num
+        self.layer_done_counter = LayerDoneCounter(self.layer_num)
+        self.mem_pool_device.register_layer_transfer_counter(self.layer_done_counter)
+
+        if write_policy not in [
+            "write_through",
+            "write_through_selective",
+            "write_back",
+        ]:
+            raise ValueError(f"Invalid write policy: {write_policy}")
+
+        # self.write_queue = PriorityQueue[CacheOperation]()
+        self.load_queue: List[CacheOperation] = []
+        self.write_queue: List[CacheOperation] = []
+        self.ack_load_queue: List[HiCacheAck] = []
+        self.ack_write_queue: List[HiCacheAck] = []
+
+        self.stop_event = threading.Event()
+        self.write_buffer = TransferBuffer(self.stop_event)
+        self.load_buffer = TransferBuffer(
+            self.stop_event, buffer_count=10, max_buffer_size=100
+        )
+
+        self.write_stream = device_module.Stream()
+        self.load_stream = device_module.Stream()
+
+        # If a storage backend is provided at startup, treat it as an implicit attach,
+        # so init/runtime share the same lifecycle semantics and code paths.
+        if storage_backend is not None:
+            try:
+                self.attach_storage_backend(
+                    storage_backend=storage_backend,
+                    prefetch_threshold=prefetch_threshold,
+                    model_name=model_name,
+                    storage_backend_extra_config=storage_backend_extra_config,
+                )
+            except ValueError as e:
+                # Preserve the historical error shape on init for unknown backends.
+                raise ValueError(f"Failed to create storage backend: {e}") from e
+
+    def _start_storage_threads(self):
+        """Start storage prefetch/backup threads and their queues.
+
+        This is used by runtime attach, and also by reset when storage is enabled.
+        """
+        assert self.enable_storage
+        assert not self.storage_stop_event.is_set()
+
+        self.prefetch_thread = threading.Thread(
+            target=self.prefetch_thread_func, daemon=True
+        )
+        self.backup_thread = threading.Thread(
+            target=self.backup_thread_func, daemon=True
+        )
+        self.prefetch_queue = Queue()
+        self.backup_queue = Queue()
+
+        self.prefetch_revoke_queue = Queue()
+        self.ack_backup_queue = Queue()
+        self.host_mem_release_queue = Queue()
+
+        self.prefetch_thread.start()
+        self.backup_thread.start()
+
+    def _stop_storage_threads(self):
+        """Stop storage prefetch/backup threads and drain internal queues.
+
+        Caller should ensure no in-flight requests.
+        """
+        # Always request stop. This is safe even when storage is already disabled,
+        # and makes detach truly idempotent (previous partial detach may have left
+        # threads alive).
+        # NOTE: do NOT clear stop_event unless threads have fully stopped; otherwise
+        # a still-alive thread may resume and touch released state.
+        self.storage_stop_event.set()
+
+        # Best-effort wakeups so threads exit promptly even if blocked on queues.
+        try:
+            if hasattr(self, "prefetch_queue"):
+                self.prefetch_queue.put_nowait(None)
+            if hasattr(self, "backup_queue"):
+                self.backup_queue.put_nowait(None)
+            if hasattr(self, "prefetch_buffer"):
+                self.prefetch_buffer.put_nowait(None)
+        except Exception:
+            pass
+
+        # Best-effort joins (threads are daemon, but join keeps state clean).
+        threads = []
+        if hasattr(self, "prefetch_thread"):
+            threads.append(self.prefetch_thread)
+        if hasattr(self, "backup_thread"):
+            threads.append(self.backup_thread)
+        if hasattr(self, "prefetch_io_aux_thread"):
+            threads.append(self.prefetch_io_aux_thread)
+
+        for t in threads:
+            try:
+                t.join(timeout=10)
+            except Exception:
+                pass
+
+        alive = [t for t in threads if getattr(t, "is_alive", lambda: False)()]
+        if alive:
+            logger.error(
+                "Failed to stop HiCache storage threads cleanly: %s",
+                [getattr(t, "name", repr(t)) for t in alive],
+            )
+            raise RuntimeError("Failed to stop HiCache storage threads cleanly.")
+
+    def attach_storage_backend(
+        self,
+        storage_backend: str,
+        prefetch_threshold: int = 256,
+        model_name: Optional[str] = None,
+        storage_backend_extra_config: Optional[dict] = None,
+    ):
+        """Attach (enable) storage backend at runtime.
+
+        Requirement: no in-flight requests. This call is expected to run on the scheduler
+        thread (control path), not concurrently with prefetch/backup.
+        """
+        if self.enable_storage:
+            raise RuntimeError("Storage backend already attached.")
+
+        # Defensive: a previous partial detach may have flipped `enable_storage` but
+        # left background threads alive. Attaching on top of them is unsafe.
+        try:
+            self._stop_storage_threads()
+        except Exception as e:
+            raise RuntimeError(
+                "Cannot attach storage backend: previous detach did not stop storage threads cleanly."
+            ) from e
+
+        # Rollback-safe init: if creation fails, keep controller state consistent
+        # for future attach attempts.
+        self.storage_backend_type = storage_backend
+        from sglang.srt.mem_cache.hicache_storage import get_hash_str
+
+        self.get_hash_str = get_hash_str
+        self.storage_config = self._generate_storage_config(
+            model_name, storage_backend_extra_config
+        )
+        # for MLA models, only one rank needs to backup the KV cache
+        self.backup_skip = (
+            self.storage_config.is_mla_model
+            # todo: load balancing
+            and self.storage_config.tp_rank != 0
+        )
+
+        # Use storage backend factory for dynamic backend creation
+        from sglang.srt.mem_cache.storage import StorageBackendFactory
+
+        try:
+            self.storage_backend = StorageBackendFactory.create_backend(
+                storage_backend, self.storage_config, self.mem_pool_host
+            )
+            self.storage_backend.register_mem_pool_host(self.mem_pool_host)
+
+            self.enable_storage = True
+            # todo: threshold policy for prefetching
+            self.prefetch_threshold = max(prefetch_threshold, self.page_size)
+            self.prefetch_capacity_limit = max(
+                0, int(0.8 * (self.mem_pool_host.size - self.mem_pool_device.size))
+            )
+            # granularity of batch storage IO operations, in number of pages
+            self.storage_batch_size = 128
+            # tracking the number of tokens locked in prefetching, updated by the main scheduler thread
+            self.prefetch_tokens_occupied = 0
+
+            # create a new communication group for synchronizing storage operations across TP workers
+            self.tp_world_size = torch.distributed.get_world_size(group=self.tp_group)
+            if self.tp_world_size > 1:
+                from sglang.srt.distributed.parallel_state import (
+                    create_custom_parallel_group,
+                )
+
+                group_ranks = torch.distributed.get_process_group_ranks(self.tp_group)
+                self.prefetch_tp_group = create_custom_parallel_group(
+                    group_ranks=group_ranks, backend="gloo"
+                )
+
+            # Select the get and set functions
+            self.page_get_func = self._generic_page_get
+            self.page_set_func = self._generic_page_set
+
+            if (self.storage_backend_type in ["hf3fs", "mooncake", "eic", "nixl"]) or (
+                self.storage_backend_type == "dynamic"
+                and bool(self.storage_config.extra_config.get("interface_v1", 0))
+            ):
+                self.page_get_func = self._page_get_zero_copy
+                self.page_set_func = self._page_set_zero_copy
+
+            # Ensure stop_event is clear before starting threads.
+            self.storage_stop_event.clear()
+            self._start_storage_threads()
+        except Exception:
+            # Best-effort cleanup for partial init.
+            try:
+                self._stop_storage_threads()
+            except Exception:
+                pass
+            try:
+                if hasattr(self, "prefetch_tp_group"):
+                    try:
+                        torch.distributed.destroy_process_group(self.prefetch_tp_group)
+                    except Exception:
+                        pass
+                    self.prefetch_tp_group = None
+            except Exception:
+                pass
+            try:
+                if (
+                    hasattr(self, "storage_backend")
+                    and self.storage_backend is not None
+                ):
+                    if hasattr(self.storage_backend, "close"):
+                        self.storage_backend.close()
+            except Exception:
+                pass
+            self.storage_backend = None
+            self.storage_backend_type = None
+            self.enable_storage = False
+            self.page_get_func = self._generic_page_get
+            self.page_set_func = self._generic_page_set
+            raise
+
+    def detach_storage_backend(self):
+        """Detach (disable) storage backend at runtime.
+
+        Requirement: no in-flight requests. This will stop storage threads and release
+        the backend instance (best-effort close).
+        """
+        # Idempotent cleanup: even if `enable_storage` is already False,
+        # we may still have leftover resources (threads/backend/process group) from a
+        # previous partial detach. We attempt cleanup whenever possible.
+        try:
+            self._stop_storage_threads()
+        except Exception as e:
+            # Do not proceed tearing down backend/process group if threads are not
+            # fully stopped; otherwise still-alive threads may touch released state.
+            # Caller can retry detach.
+            logger.exception("Stop storage threads failed: %s", e)
+            # IMPORTANT: Do not silently succeed. Upper layers rely on exceptions here
+            # to avoid flipping `enable_storage` flags while threads are still alive.
+            raise RuntimeError("Stop storage threads failed; detach aborted.") from e
+
+        # Best-effort destroy process group created for storage ops.
+        try:
+            if (
+                hasattr(self, "prefetch_tp_group")
+                and self.prefetch_tp_group is not None
+            ):
+                try:
+                    torch.distributed.destroy_process_group(self.prefetch_tp_group)
+                except Exception:
+                    pass
+                self.prefetch_tp_group = None
+        except Exception:
+            pass
+
+        # Best-effort close (some backends rely on GC/destructor).
+        try:
+            if (
+                hasattr(self, "storage_backend")
+                and self.storage_backend is not None
+                and hasattr(self.storage_backend, "close")
+            ):
+                self.storage_backend.close()
+        except Exception:
+            logger.exception("Failed to close storage backend cleanly.")
+
+        self.storage_backend = None
+        self.storage_backend_type = None
+        self.enable_storage = False
+        self.page_get_func = self._generic_page_get
+        self.page_set_func = self._generic_page_set
+        # Now it's safe to clear the stop event for future re-attach.
+        self.storage_stop_event.clear()
+
+    def _generate_storage_config(
+        self,
+        model_name: Optional[str] = None,
+        storage_backend_extra_config: Optional[dict] = None,
+    ):
+
+        if is_dp_attention_enabled():
+            self.tp_rank = get_attention_tp_rank()
+            self.tp_size = get_attention_tp_size()
+            self.dp_rank = get_attention_dp_rank()
+        else:
+            self.tp_rank = get_tensor_model_parallel_rank()
+            self.tp_size = get_tensor_model_parallel_world_size()
+            self.dp_rank = 0
+
+        # Currently, NPUMLATokenToKVPool is the subclass of MLATokenToKVPool.
+        is_mla_backend = isinstance(self.mem_pool_device, MLATokenToKVPool)
+        # Least Common Multiple among heterogeneous tp size
+        tp_lcm_size = storage_backend_extra_config.pop("tp_lcm_size", None)
+        should_split_heads = False
+
+        if tp_lcm_size:
+            assert (
+                tp_lcm_size % self.tp_size == 0
+            ), "tp_lcm_size must be divisible by tp_size."
+            should_split_heads = (
+                not is_mla_backend
+                and self.mem_pool_host.layout == "page_head"
+                and tp_lcm_size > self.tp_size
+            )
+
+        return HiCacheStorageConfig(
+            tp_rank=self.tp_rank,
+            tp_size=self.tp_size,
+            pp_rank=self.pp_rank,
+            pp_size=self.pp_size,
+            is_mla_model=is_mla_backend,
+            is_page_first_layout=self.mem_pool_host.layout == "page_first",
+            model_name=model_name,
+            tp_lcm_size=tp_lcm_size,
+            should_split_heads=should_split_heads,
+            extra_config=storage_backend_extra_config,
+        )
+
+    def reset(self):
+        self.stop_event.set()
+        self.storage_stop_event.set()
+
+        self.write_queue.clear()
+        self.load_queue.clear()
+        self.write_buffer.clear()
+        self.load_buffer.clear()
+        self.ack_write_queue.clear()
+        self.ack_load_queue.clear()
+        if self.enable_storage:
+            self.prefetch_thread.join()
+            self.backup_thread.join()
+            self.prefetch_queue.queue.clear()
+            self.backup_queue.queue.clear()
+            self.prefetch_revoke_queue.queue.clear()
+            self.ack_backup_queue.queue.clear()
+
+        self.stop_event.clear()
+        self.storage_stop_event.clear()
+
+        if self.enable_storage:
+            self.prefetch_thread = threading.Thread(
+                target=self.prefetch_thread_func, daemon=True
+            )
+            self.backup_thread = threading.Thread(
+                target=self.backup_thread_func, daemon=True
+            )
+            self.prefetch_thread.start()
+            self.backup_thread.start()
+
+    def write(
+        self,
+        device_indices: torch.Tensor,
+        priority: Optional[int] = None,
+        node_id: int = -1,
+    ) -> Optional[torch.Tensor]:
+        """
+        Back up KV caches from device memory to host memory.
+        """
+        host_indices = self.mem_pool_host.alloc(len(device_indices))
+        if host_indices is None:
+            return None
+        self.write_queue.append(
+            CacheOperation(host_indices, device_indices, node_id, priority)
+        )
+        self.start_writing()
+        return host_indices
+
+    def start_writing(self) -> None:
+        if len(self.write_queue) == 0:
+            return
+
+        op = CacheOperation.merge_ops(self.write_queue)
+        host_indices, device_indices = self.move_indices(op)
+        self.write_queue.clear()
+
+        start_event = device_module.Event()
+        finish_event = device_module.Event()
+
+        start_event.record()
+        with device_module.stream(self.write_stream):
+            start_event.wait(self.write_stream)
+            self.mem_pool_host.backup_from_device_all_layer(
+                self.mem_pool_device, host_indices, device_indices, self.io_backend
+            )
+            finish_event.record()
+            # NOTE: We must save the host indices and device indices here,
+            # this is because we need to guarantee that these tensors are
+            # still alive when the write stream is executing.
+            if host_indices.is_cuda:
+                host_indices.record_stream(self.write_stream)
+            if device_indices.is_cuda:
+                device_indices.record_stream(self.write_stream)
+
+        self.ack_write_queue.append(HiCacheAck(start_event, finish_event, op.node_ids))
+
+    def load(
+        self,
+        host_indices: torch.Tensor,
+        priority: Optional[int] = None,
+        node_id: int = -1,
+    ) -> Optional[torch.Tensor]:
+        """
+        Load KV caches from host memory to device memory.
+        """
+        device_indices = self.mem_pool_device_allocator.alloc(len(host_indices))
+        if device_indices is None:
+            return None
+        self.load_queue.append(
+            CacheOperation(host_indices, device_indices, node_id, priority)
+        )
+        return device_indices
+
+    def move_indices(self, op: CacheOperation):
+        host_indices, device_indices = op.host_indices, op.device_indices
+        # move indices to GPU if using kernels, to host if using direct indexing
+        if self.io_backend == "kernel":
+            if not host_indices.is_cuda:
+                host_indices = host_indices.to(self.device, non_blocking=True)
+            return host_indices, device_indices
+        elif self.io_backend == "direct":
+            if self.mem_pool_host.layout == "layer_first":
+                device_indices = device_indices.cpu()
+                host_indices, idx = host_indices.sort()
+                return host_indices, device_indices.index_select(0, idx)
+            elif self.mem_pool_host.layout == "page_first_direct":
+                return host_indices, device_indices.cpu()
+        elif self.io_backend == "kernel_ascend":
+            return host_indices, device_indices.cpu()
+        else:
+            raise ValueError(f"Unsupported io backend")
+
+    def start_loading(self) -> int:
+        if len(self.load_queue) == 0:
+            return -1
+
+        producer_id = self.layer_done_counter.update_producer()
+        op = CacheOperation.merge_ops(self.load_queue)
+        host_indices, device_indices = self.move_indices(op)
+        self.load_queue.clear()
+        producer_event = self.layer_done_counter.events[producer_id]
+        producer_event.start_event.record()
+
+        with device_module.stream(self.load_stream):
+            producer_event.start_event.wait(self.load_stream)
+            for i in range(self.layer_num):
+                self.mem_pool_host.load_to_device_per_layer(
+                    self.mem_pool_device,
+                    host_indices,
+                    device_indices,
+                    i,
+                    self.io_backend,
+                )
+                producer_event.complete(i)
+            # NOTE: We must save the host indices and device indices here,
+            # this is because we need to guarantee that these tensors are
+            # still alive when the load stream is executing.
+            if host_indices.is_cuda:
+                host_indices.record_stream(self.load_stream)
+            if device_indices.is_cuda:
+                device_indices.record_stream(self.load_stream)
+
+        self.ack_load_queue.append(
+            HiCacheAck(
+                start_event=producer_event.start_event,
+                finish_event=producer_event.finish_event,
+                node_ids=op.node_ids,
+            )
+        )
+        return producer_id
+
+    def evict_device(self, device_indices: torch.Tensor) -> int:
+        self.mem_pool_device_allocator.free(device_indices)
+        return len(device_indices)
+
+    def evict_host(self, host_indices: torch.Tensor, backup_only: bool = True) -> int:
+        if not backup_only:
+            raise ValueError("Other eviction policies are not supported yet.")
+
+        self.mem_pool_host.free(host_indices)
+        return len(host_indices)
+
+    def prefetch(
+        self,
+        request_id: str,
+        host_indices: torch.Tensor,
+        new_input_tokens: List[int],
+        last_hash: Optional[str] = None,
+        prefix_keys: Optional[List[str]] = None,
+    ) -> PrefetchOperation:
+        """
+        Prefetch KV caches from storage backend to host memory.
+        """
+        operation = PrefetchOperation(
+            request_id, host_indices, new_input_tokens, last_hash, prefix_keys
+        )
+        self.prefetch_queue.put(operation)
+        return operation
+
+    def terminate_prefetch(self, operation):
+        operation.mark_terminate()
+        return operation.completed_tokens, operation.hash_value
+
+    def append_host_mem_release(self, host_indices: torch.Tensor):
+        if host_indices.numel() == 0:
+            return
+        pages = host_indices.split(self.mem_pool_host.page_size)
+        for page in pages:
+            self.host_mem_release_queue.put(page)
+
+    def _page_get_zero_copy(
+        self, operation, hash_values, host_indices, extra_info=None
+    ):
+        results = self.storage_backend.batch_get_v1(
+            hash_values, host_indices, extra_info
+        )
+        inc = 0
+        for i in range(len(hash_values)):
+            if not results[i]:
+                logger.warning(
+                    f"Prefetch operation {operation.request_id} failed to retrieve page {hash_values[i]}."
+                )
+                break
+            inc += self.page_size
+        operation.increment(inc)
+
+    # todo: deprecate
+    def _generic_page_get(self, operation, hash_values, host_indices, extra_info=None):
+        dummy_page_dst = [
+            self.mem_pool_host.get_dummy_flat_data_page() for _ in hash_values
+        ]
+        page_data = self.storage_backend.batch_get(hash_values, dummy_page_dst)
+        if page_data is None:
+            return
+        for i in range(len(hash_values)):
+            if page_data[i] is None:
+                logger.warning(
+                    f"Prefetch operation {operation.request_id} failed to retrieve page {hash_values[i]}."
+                )
+                break
+            # Must set the data before increasing the completed tokens.
+            # Otherwise this page may be read before being set.
+            self.mem_pool_host.set_from_flat_data_page(
+                host_indices[i * self.page_size],
+                page_data[i],
+            )
+            if not operation.increment(self.page_size):
+                break  # Operation terminated by controller
+
+    def _page_transfer(self, operation):
+        # Transfer batch by batch
+        prefix_keys = operation.prefix_keys
+        for i in range(0, len(operation.hash_value), self.storage_batch_size):
+            batch_hashes = operation.hash_value[i : i + self.storage_batch_size]
+            batch_host_indices = operation.host_indices[
+                i * self.page_size : (i + len(batch_hashes)) * self.page_size
+            ]
+            prev_completed_tokens = operation.completed_tokens
+            # Get one batch token, and update the completed_tokens if succeed
+            extra_info = HiCacheStorageExtraInfo(prefix_keys=prefix_keys)
+            self.page_get_func(operation, batch_hashes, batch_host_indices, extra_info)
+            # Check termination
+            if (
+                operation.completed_tokens
+                != prev_completed_tokens + len(batch_hashes) * self.page_size
+            ):
+                operation.mark_terminate()
+                break  # Some operations fail or operation terminated by controller
+
+            if prefix_keys and len(prefix_keys) > 0:
+                prefix_keys += batch_hashes
+
+    def prefetch_io_aux_func(self):
+        """
+        Auxiliary function conducting IO operations for prefetching.
+        """
+        while not self.storage_stop_event.is_set():
+            try:
+                operation = self.prefetch_buffer.get(block=True, timeout=1)
+                if operation is None:
+                    continue
+                self._page_transfer(operation)
+                # operation terminated by controller, release pre-allocated memory
+                self.append_host_mem_release(
+                    operation.host_indices[operation.completed_tokens :]
+                )
+            except Empty:
+                continue
+
+    def prefetch_rate_limited(self) -> bool:
+        """
+        Rate limit the prefetching operations to avoid overwhelming the storage backend.
+        """
+        # cancel prefetch if too much memory is occupied
+        if self.prefetch_tokens_occupied >= self.prefetch_capacity_limit:
+            return True
+        # todo: more sophisticated rate limiting based on storage backend performance
+        return False
+
+    def _storage_hit_query(self, operation) -> tuple[list[str], int]:
+        last_hash = operation.last_hash
+        tokens_to_fetch = operation.token_ids
+        prefix_keys = operation.prefix_keys.copy() if operation.prefix_keys else None
+
+        storage_query_count = 0
+        hash_value = []
+
+        for start in range(
+            0, len(tokens_to_fetch), self.page_size * self.storage_batch_size
+        ):
+            end = min(
+                start + self.page_size * self.storage_batch_size, len(tokens_to_fetch)
+            )
+            batch_tokens = tokens_to_fetch[start:end]
+            batch_hashes = []
+            for i in range(0, len(batch_tokens), self.page_size):
+                last_hash = self.get_hash_str(
+                    batch_tokens[i : i + self.page_size], last_hash
+                )
+                batch_hashes.append(last_hash)
+            extra_info = HiCacheStorageExtraInfo(prefix_keys=prefix_keys)
+            hit_page_num = self.storage_backend.batch_exists(batch_hashes, extra_info)
+            hash_value.extend(batch_hashes[:hit_page_num])
+            storage_query_count += hit_page_num * self.page_size
+            if hit_page_num < len(batch_hashes):
+                break
+            if prefix_keys and len(prefix_keys) > 0:
+                prefix_keys += batch_hashes
+
+        return hash_value, storage_query_count
+
+    def prefetch_thread_func(self):
+        """
+        Manage prefetching operations from storage backend to host memory.
+        """
+        self.prefetch_buffer = Queue()
+        self.prefetch_io_aux_thread = threading.Thread(
+            target=self.prefetch_io_aux_func, daemon=True
+        )
+        self.prefetch_io_aux_thread.start()
+        while (not self.storage_stop_event.is_set()) or not self.prefetch_queue.empty():
+            try:
+                operation = self.prefetch_queue.get(block=True, timeout=1)
+                if operation is None:
+                    continue
+                hash_value, storage_hit_count = self._storage_hit_query(operation)
+                if self.tp_world_size > 1:
+                    storage_hit_count_tensor = torch.tensor(
+                        storage_hit_count, dtype=torch.int
+                    )
+                    torch.distributed.all_reduce(
+                        storage_hit_count_tensor,
+                        op=torch.distributed.ReduceOp.MIN,
+                        group=self.prefetch_tp_group,
+                    )
+                    storage_hit_count = storage_hit_count_tensor.item()
+
+                if storage_hit_count < self.prefetch_threshold:
+                    # not to prefetch if not enough benefits
+                    self.prefetch_revoke_queue.put(operation.request_id)
+                    self.append_host_mem_release(operation.host_indices)
+                    logger.debug(
+                        f"Revoking prefetch for request {operation.request_id} due to insufficient hits ({storage_hit_count})."
+                    )
+                else:
+                    operation.hash_value = hash_value[
+                        : (storage_hit_count // self.page_size)
+                    ]
+                    # free the pre-allocated memory for pages that are not hit
+                    self.append_host_mem_release(
+                        operation.host_indices[storage_hit_count:]
+                    )
+                    operation.host_indices = operation.host_indices[:storage_hit_count]
+                    logger.debug(
+                        f"Prefetching {len(operation.hash_value)} pages for request {operation.request_id}."
+                    )
+                    self.prefetch_buffer.put(operation)
+
+            except Empty:
+                continue
+
+    def write_storage(
+        self,
+        host_indices: torch.Tensor,
+        token_ids: List[int],
+        hash_value: Optional[List[str]] = None,
+        prefix_keys: Optional[List[str]] = None,
+    ) -> int:
+        """
+        Write KV caches from host memory to storage backend.
+        """
+        operation = StorageOperation(
+            host_indices, token_ids, hash_value=hash_value, prefix_keys=prefix_keys
+        )
+        self.backup_queue.put(operation)
+        return operation.id
+
+    # todo: deprecate
+    def _generic_page_set(self, hash_values, host_indices, extra_info=None) -> bool:
+        data = [
+            self.mem_pool_host.get_data_page(host_indices[i * self.page_size])
+            for i in range(len(hash_values))
+        ]
+        return self.storage_backend.batch_set(hash_values, data)
+
+    def _page_set_zero_copy(self, hash_values, host_indices, extra_info=None) -> bool:
+        return all(
+            self.storage_backend.batch_set_v1(hash_values, host_indices, extra_info)
+        )
+
+    # Backup batch by batch
+    def _page_backup(self, operation):
+        # Backup batch by batch
+        prefix_keys = operation.prefix_keys
+        for i in range(0, len(operation.hash_value), self.storage_batch_size):
+            batch_hashes = operation.hash_value[i : i + self.storage_batch_size]
+            batch_host_indices = operation.host_indices[
+                i * self.page_size : (i + len(batch_hashes)) * self.page_size
+            ]
+            # Set one batch token, and record if success.
+            # todo: allow partial success
+            extra_info = HiCacheStorageExtraInfo(prefix_keys=prefix_keys)
+            success = self.page_set_func(batch_hashes, batch_host_indices, extra_info)
+            if not success:
+                logger.warning(
+                    f"Write page to storage: {len(batch_hashes)} pages failed."
+                )
+                break
+
+            if prefix_keys and len(prefix_keys) > 0:
+                prefix_keys += batch_hashes
+            operation.completed_tokens += self.page_size * len(batch_hashes)
+
+    def backup_thread_func(self):
+        """
+        Manage backup operations from host memory to storage backend.
+        """
+        while not self.storage_stop_event.is_set():
+            try:
+                operation = self.backup_queue.get(block=True, timeout=1)
+                if operation is None:
+                    continue
+
+                if not self.backup_skip:
+                    self._page_backup(operation)
+                self.ack_backup_queue.put(operation)
+
+            except Empty:
+                continue
diff --git a/sglang/python/sglang/srt/managers/configure_logging.py b/sglang/python/sglang/srt/managers/configure_logging.py
new file mode 100644
index 0000000000000000000000000000000000000000..0dc78edfa0753690ab5e6a0dd28e66841ed4851b
--- /dev/null
+++ b/sglang/python/sglang/srt/managers/configure_logging.py
@@ -0,0 +1,47 @@
+"""
+Copyright 2023-2025 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""
+Configure the logging settings of a server.
+
+Usage:
+python3 -m sglang.srt.managers.configure_logging --url http://localhost:30000
+"""
+
+import argparse
+
+import requests
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--url", type=str, default="http://localhost:30000")
+    parser.add_argument("--log-requests", action="store_true")
+    parser.add_argument("--log-requests-level", type=int, default=3)
+    parser.add_argument(
+        "--dump-requests-folder", type=str, default="/tmp/sglang_request_dump"
+    )
+    parser.add_argument("--dump-requests-threshold", type=int, default=1000)
+    args = parser.parse_args()
+
+    response = requests.post(
+        args.url + "/configure_logging",
+        json={
+            "log_requests": args.log_requests,
+            "log_requests_level": args.log_requests_level,  # Log full requests
+            "dump_requests_folder": args.dump_requests_folder,
+            "dump_requests_threshold": args.dump_requests_threshold,
+        },
+    )
+    assert response.status_code == 200
diff --git a/sglang/python/sglang/srt/managers/data_parallel_controller.py b/sglang/python/sglang/srt/managers/data_parallel_controller.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae5b38ec7160500439104594b53fb396b436ab0e
--- /dev/null
+++ b/sglang/python/sglang/srt/managers/data_parallel_controller.py
@@ -0,0 +1,605 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A controller that dispatches requests to multiple data parallel workers."""
+
+import faulthandler
+import logging
+import multiprocessing as mp
+import signal
+import threading
+import time
+from enum import Enum, auto
+from typing import Callable, List, Optional
+
+import psutil
+import setproctitle
+import zmq
+
+from sglang.srt.environ import envs
+from sglang.srt.layers.dp_attention import compute_dp_attention_world_info
+from sglang.srt.managers.io_struct import (
+    ActiveRanksOutput,
+    BlockReqInput,
+    TokenizedEmbeddingReqInput,
+    TokenizedGenerateReqInput,
+    WatchLoadUpdateReq,
+)
+from sglang.srt.managers.schedule_batch import Req
+from sglang.srt.managers.scheduler import run_scheduler_process
+from sglang.srt.observability.cpu_monitor import start_cpu_monitor_thread
+from sglang.srt.observability.req_time_stats import DPControllerReqTimeStats
+from sglang.srt.observability.trace import process_tracing_init, trace_set_thread_info
+from sglang.srt.server_args import (
+    DP_ATTENTION_HANDSHAKE_PORT_DELTA,
+    PortArgs,
+    ServerArgs,
+)
+from sglang.srt.utils import numa_utils
+from sglang.srt.utils.common import (
+    bind_port,
+    configure_ipv6,
+    configure_logger,
+    get_zmq_socket,
+    kill_itself_when_parent_died,
+    maybe_reindex_device_id,
+)
+from sglang.srt.utils.torch_memory_saver_adapter import TorchMemorySaverAdapter
+from sglang.srt.utils.watchdog import Watchdog
+from sglang.utils import TypeBasedDispatcher, get_exception_traceback
+
+logger = logging.getLogger(__name__)
+
+
+class LoadBalanceMethod(Enum):
+    """Load balance method."""
+
+    ROUND_ROBIN = auto()
+    FOLLOW_BOOTSTRAP_ROOM = auto()
+    TOTAL_REQUESTS = auto()
+    TOTAL_TOKENS = auto()
+
+    @classmethod
+    def from_str(cls, method: str):
+        method = method.upper()
+        try:
+            return cls[method]
+        except KeyError as exc:
+            raise ValueError(f"Invalid load balance method: {method}") from exc
+
+
+class DPBudget:
+    def __init__(self, dp_size: int):
+        self.dp_size = dp_size
+        self.total_requests = [0] * dp_size
+        self.total_tokens = [0] * dp_size
+
+    def update_budget(self, load_update: WatchLoadUpdateReq):
+        """Update the budget."""
+        for load in load_update.loads:
+            self.total_requests[load.dp_rank] = load.num_reqs
+            self.total_tokens[load.dp_rank] = load.num_tokens
+
+    def dispatch(self, method: LoadBalanceMethod):
+        if method == LoadBalanceMethod.TOTAL_REQUESTS:
+            target_rank = self.total_requests.index(min(self.total_requests))
+        elif method == LoadBalanceMethod.TOTAL_TOKENS:
+            # Use total_requests as a tie-breaker when total_tokens are equal
+            target_rank = min(
+                range(self.dp_size),
+                key=lambda i: (self.total_tokens[i], self.total_requests[i]),
+            )
+        else:
+            return None
+
+        # Increment the load of that worker by one as a heuristic
+        self.total_requests[target_rank] += 1
+        return target_rank
+
+
+class DataParallelController:
+    """A controller that dispatches requests to multiple data parallel workers."""
+
+    def __init__(
+        self,
+        server_args: ServerArgs,
+        port_args: PortArgs,
+        run_scheduler_process_func: Callable,
+    ) -> None:
+        # Parse args
+        self.server_args = server_args
+        self.port_args = port_args
+        self.load_balance_method = LoadBalanceMethod.from_str(
+            server_args.load_balance_method
+        )
+        self.run_scheduler_process_func = run_scheduler_process_func
+
+        # For DP balance
+        self.global_balance_id = 0
+
+        # Init inter-process communication
+        self.context = zmq.Context(1 + server_args.dp_size)
+        if server_args.node_rank == 0:
+            self.recv_from_tokenizer = get_zmq_socket(
+                self.context, zmq.PULL, port_args.scheduler_input_ipc_name, False
+            )
+
+        # Dispatch method
+        self.round_robin_counter = 0
+        dispatch_lookup = {
+            LoadBalanceMethod.ROUND_ROBIN: self.round_robin_scheduler,
+            LoadBalanceMethod.FOLLOW_BOOTSTRAP_ROOM: self.follow_bootstrap_room_scheduler,
+            LoadBalanceMethod.TOTAL_REQUESTS: self.total_requests_scheduler,
+            LoadBalanceMethod.TOTAL_TOKENS: self.total_tokens_scheduler,
+        }
+        self.dispatching = dispatch_lookup[self.load_balance_method]
+
+        # Load balance budget
+        self.dp_budget = DPBudget(server_args.dp_size)
+
+        # To protect changing env vars to set CUDA_VISIBLE_DEVICES.
+        self.env_lock = threading.Lock()
+
+        # Launch data parallel workers
+        self.scheduler_procs = []
+        self.workers: List[zmq.Socket] = [None] * server_args.dp_size
+        self.status: List[bool] = [True] * server_args.dp_size
+
+        if server_args.enable_dp_attention:
+            self.launch_dp_attention_schedulers(server_args, port_args)
+            self.control_message_step = server_args.tp_size
+        else:
+            self.launch_dp_schedulers(server_args, port_args)
+            self.control_message_step = 1
+
+        self.init_dispatcher()
+
+        self.soft_watchdog = Watchdog.create(
+            debug_name="DataParallelController",
+            watchdog_timeout=server_args.soft_watchdog_timeout,
+            soft=True,
+            test_stuck_time=envs.SGLANG_TEST_STUCK_DP_CONTROLLER.get(),
+        )
+
+        if server_args.enable_metrics:
+            start_cpu_monitor_thread("data_parallel_controller")
+
+    def send_to_all_workers(self, obj):
+        for i, worker in enumerate(self.workers):
+            if self.status[i]:
+                worker.send_pyobj(obj)
+
+    def send_control_message(self, obj):
+        # Send control messages to first worker of tp group
+        for worker in self.workers[:: self.control_message_step]:
+            worker.send_pyobj(obj)
+
+    def handle_load_update_req(self, obj):
+        self.dp_budget.update_budget(obj)
+
+    def update_active_ranks(self, ranks: ActiveRanksOutput):
+        self.status = ranks.status
+
+    def dispatching_with_trace(self, req: Req):
+        req.time_stats = DPControllerReqTimeStats.new_from_obj(req.time_stats)
+
+        req.time_stats.set_dp_dispatch_time()
+        self.dispatching(req)
+        req.time_stats.set_dp_dispatch_finish_time()
+
+    def init_dispatcher(self):
+        self._request_dispatcher = TypeBasedDispatcher(
+            [
+                (TokenizedGenerateReqInput, self.dispatching_with_trace),
+                (TokenizedEmbeddingReqInput, self.dispatching_with_trace),
+                (BlockReqInput, self.send_to_all_workers),
+                (WatchLoadUpdateReq, self.handle_load_update_req),
+                (ActiveRanksOutput, self.update_active_ranks),
+            ]
+        )
+        self._request_dispatcher.add_fallback_fn(self.send_control_message)
+
+    def launch_dp_schedulers(self, server_args, port_args):
+        base_gpu_id = 0
+
+        threads = []
+        sockets = []
+        ready_events = []
+        for dp_rank in range(server_args.dp_size):
+            tmp_port_args = PortArgs.init_new(server_args)
+            tmp_port_args.tokenizer_ipc_name = port_args.tokenizer_ipc_name
+            tmp_port_args.detokenizer_ipc_name = port_args.detokenizer_ipc_name
+
+            # This port is checked free in PortArgs.init_new.
+            # We hold it first so that the next dp worker gets a different port
+            sockets.append(bind_port(tmp_port_args.nccl_port))
+
+            ready_event = threading.Event()
+            ready_events.append(ready_event)
+
+            # Create a thread for each worker
+            thread = threading.Thread(
+                target=self.launch_tensor_parallel_group_thread,
+                args=(server_args, tmp_port_args, base_gpu_id, dp_rank, ready_event),
+            )
+            threads.append(thread)
+            base_gpu_id += (
+                server_args.tp_size * server_args.pp_size * server_args.gpu_id_step
+            )
+
+            if server_args.node_rank == 0:
+                self.workers[dp_rank] = get_zmq_socket(
+                    self.context,
+                    zmq.PUSH,
+                    tmp_port_args.scheduler_input_ipc_name,
+                    True,
+                )
+
+        # Free all sockets before starting the threads to launch TP workers
+        for sock in sockets:
+            sock.close()
+
+        # Start all threads
+        for thread in threads:
+            thread.start()
+        for event in ready_events:
+            event.wait()
+
+    def launch_tensor_parallel_group_thread(
+        self,
+        server_args: ServerArgs,
+        port_args: PortArgs,
+        base_gpu_id: int,
+        dp_rank: int,
+        ready_event: threading.Event,
+    ):
+        self.launch_tensor_parallel_group(server_args, port_args, base_gpu_id, dp_rank)
+        ready_event.set()
+
+        # This thread cannot be closed because otherwise the `kill_itself_when_parent_died`
+        # function in scheduler.py will kill the scheduler.
+        while True:
+            time.sleep(30 * 24 * 3600)
+
+    def _broadcast_worker_ports(
+        self, server_args: ServerArgs, worker_ports: Optional[List[int]] = None
+    ) -> List[int]:
+        """Broadcast worker ports from node 0 to all other nodes.
+
+        Node 0 acts as the server, waiting for all other nodes to connect and
+        sending them the pre-allocated worker ports. Other nodes act as clients,
+        connecting to node 0 to receive their copy of the worker ports.
+
+        Args:
+            server_args: Server arguments containing node configuration.
+            worker_ports: Pre-allocated worker ports to broadcast.
+
+        Returns:
+            List of worker ports (same on all nodes after broadcast).
+        """
+        # Determine the endpoint for inter-node communication
+        if server_args.dist_init_addr is None:
+            endpoint = f"tcp://127.0.0.1:{server_args.port + DP_ATTENTION_HANDSHAKE_PORT_DELTA}"
+        elif server_args.dist_init_addr.startswith("["):  # ipv6 address
+            port, host = configure_ipv6(server_args.dist_init_addr)
+            endpoint = f"tcp://{host}:{int(port) + DP_ATTENTION_HANDSHAKE_PORT_DELTA}"
+        else:
+            host, port = server_args.dist_init_addr.split(":")
+            endpoint = f"tcp://{host}:{int(port) + DP_ATTENTION_HANDSHAKE_PORT_DELTA}"
+
+        if server_args.node_rank == 0:
+            # Node 0: Broadcast worker ports to all other nodes
+            return self._broadcast_ports_as_server(
+                endpoint, server_args.nnodes - 1, worker_ports
+            )
+        else:
+            # Other nodes: Receive worker ports from node 0
+            return self._receive_ports_as_client(endpoint, server_args.node_rank)
+
+    def _broadcast_ports_as_server(
+        self, endpoint: str, expected_clients: int, worker_ports: List[int]
+    ) -> List[int]:
+        """Broadcast worker ports to all client nodes."""
+        logger.debug(f"Broadcasting worker ports to {expected_clients} client nodes")
+        logger.debug(f"Worker ports: {worker_ports}")
+
+        rep_socket = get_zmq_socket(self.context, zmq.REP, endpoint, True)
+
+        try:
+            connected_clients = 0
+            while connected_clients < expected_clients:
+                # Wait for client handshake
+                client_rank = rep_socket.recv().decode()
+                logger.debug(f"Received handshake from node {client_rank}")
+
+                # Send worker ports to client
+                rep_socket.send_pyobj(worker_ports)
+                connected_clients += 1
+                logger.debug(
+                    f"Sent worker ports to {connected_clients}/{expected_clients} nodes"
+                )
+
+            logger.debug("Worker port broadcast completed")
+            return worker_ports
+        finally:
+            rep_socket.close()
+
+    def _receive_ports_as_client(self, endpoint: str, node_rank: int) -> List[int]:
+        """Receive worker ports from the server node."""
+        logger.debug(f"Connecting to node 0 to receive worker ports")
+
+        req_socket = get_zmq_socket(self.context, zmq.REQ, endpoint, False)
+        req_socket.setsockopt(zmq.RCVTIMEO, 600 * 1000)  # 10 minute timeout
+        req_socket.setsockopt(zmq.SNDTIMEO, 600 * 1000)
+
+        try:
+            # Send handshake with our node rank
+            req_socket.send(str(node_rank).encode())
+
+            # Receive worker ports
+            worker_ports = req_socket.recv_pyobj()
+            logger.debug(f"Received {len(worker_ports)} worker ports from node 0")
+            return worker_ports
+        except zmq.Again:
+            logger.error("Timeout waiting for worker ports from node 0")
+            raise RuntimeError(
+                "Failed to receive worker ports from node 0 within timeout"
+            )
+        finally:
+            req_socket.close()
+
+    def launch_dp_attention_schedulers(
+        self, server_args: ServerArgs, port_args: PortArgs
+    ):
+        # Pre-allocate worker ports on node 0 to avoid conflicts
+        worker_ports = []
+        if server_args.node_rank == 0:
+            for dp_rank in range(server_args.dp_size):
+                port_and_socket = get_zmq_socket(self.context, zmq.PUSH)
+                worker_ports.append(port_and_socket[0])
+                self.workers[dp_rank] = port_and_socket[1]
+                logger.debug(f"Assigned port {port_and_socket[0]} to worker {dp_rank}")
+
+        broadcasted_ports = self._broadcast_worker_ports(
+            server_args, worker_ports if worker_ports else None
+        )
+        self.launch_tensor_parallel_group(
+            server_args, port_args, 0, None, broadcasted_ports
+        )
+
+    def launch_tensor_parallel_group(
+        self,
+        server_args: ServerArgs,
+        port_args: PortArgs,
+        base_gpu_id: int,
+        dp_rank: Optional[int],
+        worker_ports: Optional[List[int]] = None,
+    ):
+        if not server_args.enable_dp_attention:
+            logger.info(f"Launch DP{dp_rank} starting at GPU #{base_gpu_id}.")
+
+        memory_saver_adapter = TorchMemorySaverAdapter.create(
+            enable=server_args.enable_memory_saver
+        )
+
+        scheduler_pipe_readers = []
+
+        pp_size_per_node = max(server_args.pp_size // server_args.nnodes, 1)
+        nnodes_per_pp_rank = max(server_args.nnodes // server_args.pp_size, 1)
+        pp_rank_range = range(
+            pp_size_per_node * (server_args.node_rank // nnodes_per_pp_rank),
+            pp_size_per_node * (server_args.node_rank // nnodes_per_pp_rank + 1),
+        )
+
+        nnodes_per_tp_group = nnodes_per_pp_rank
+        tp_size_per_node = server_args.tp_size // nnodes_per_tp_group
+        tp_rank_range = range(
+            tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group),
+            tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group + 1),
+        )
+
+        attn_cp_rank = 0
+        moe_dp_rank = 0
+        for pp_rank in pp_rank_range:
+            for tp_rank in tp_rank_range:
+                rank_port_args = port_args
+
+                if server_args.enable_dp_attention:
+                    # dp attention has different sharding logic
+                    _, _, dp_rank = compute_dp_attention_world_info(
+                        server_args.enable_dp_attention,
+                        tp_rank,
+                        server_args.tp_size,
+                        server_args.dp_size,
+                        server_args.attn_cp_size,
+                    )
+                    # compute zmq ports for this dp rank
+                    rank_port_args = PortArgs.init_new(
+                        server_args, dp_rank, worker_ports
+                    )
+                    # Data parallelism reuses the tensor parallelism group,
+                    # so all dp ranks should use the same nccl port.
+                    rank_port_args.nccl_port = port_args.nccl_port
+
+                reader, writer = mp.Pipe(duplex=False)
+                gpu_id = (
+                    server_args.base_gpu_id
+                    + base_gpu_id
+                    + ((pp_rank % pp_size_per_node) * tp_size_per_node)
+                    + (tp_rank % tp_size_per_node) * server_args.gpu_id_step
+                )
+                attn_dp_size = (
+                    server_args.dp_size if server_args.enable_dp_attention else 1
+                )
+
+                # Parallelism hierarchy (outermost to innermost):
+                # - Attention: Global(TP) -> DP -> ATTN_CP -> ATTN_TP (innermost)
+                # - MoE: Global(TP) -> MOE_DP -> EP -> MOE_TP (innermost)
+                attn_tp_size = (
+                    server_args.tp_size // attn_dp_size // server_args.attn_cp_size
+                )
+                attn_cp_rank = (tp_rank // attn_tp_size) % server_args.attn_cp_size
+                moe_dp_rank = tp_rank // (
+                    server_args.tp_size // server_args.moe_dp_size
+                )
+                moe_ep_rank = (
+                    tp_rank
+                    % (server_args.tp_size // server_args.moe_dp_size)
+                    // (
+                        server_args.tp_size
+                        // server_args.moe_dp_size
+                        // server_args.ep_size
+                    )
+                )
+
+                with self.env_lock, maybe_reindex_device_id(gpu_id) as gpu_id:
+                    proc = mp.Process(
+                        target=self.run_scheduler_process_func,
+                        args=(
+                            server_args,
+                            rank_port_args,
+                            gpu_id,
+                            tp_rank,
+                            attn_cp_rank,
+                            moe_dp_rank,
+                            moe_ep_rank,
+                            pp_rank,
+                            dp_rank,
+                            writer,
+                        ),
+                    )
+                    with memory_saver_adapter.configure_subprocess(), numa_utils.configure_subprocess(
+                        server_args, gpu_id
+                    ):
+                        proc.start()
+                self.scheduler_procs.append(proc)
+                scheduler_pipe_readers.append(reader)
+
+        # Wait for model to finish loading
+        scheduler_info = []
+        for i in range(len(scheduler_pipe_readers)):
+            scheduler_info.append(scheduler_pipe_readers[i].recv())
+
+        self.max_total_num_tokens = scheduler_info[0]["max_total_num_tokens"]
+        self.max_req_input_len = scheduler_info[0]["max_req_input_len"]
+
+    def maybe_external_dp_rank_routing(self, req: Req):
+        if req.routed_dp_rank is not None:
+            logger.debug(f"Direct routing to DP rank {req.routed_dp_rank}")
+            self.workers[req.routed_dp_rank].send_pyobj(req)
+            return True
+        return False
+
+    def round_robin_scheduler(self, req: Req):
+        if self.maybe_external_dp_rank_routing(req):
+            return
+
+        while True:
+            if self.status[self.round_robin_counter]:
+                logger.debug(f"Choose worker {self.round_robin_counter}")
+                self.workers[self.round_robin_counter].send_pyobj(req)
+                self.round_robin_counter = (self.round_robin_counter + 1) % len(
+                    self.workers
+                )
+                break
+            self.round_robin_counter = (self.round_robin_counter + 1) % len(
+                self.workers
+            )
+
+    def follow_bootstrap_room_scheduler(self, req: Req):
+        if self.maybe_external_dp_rank_routing(req):
+            return
+
+        # Set default bootstrap_room if in FAKE auto mode and room is None
+        if (
+            req.bootstrap_room is None
+            and self.server_args.disaggregation_transfer_backend == "fake"
+        ):
+            req.bootstrap_room = self.round_robin_counter
+            self.round_robin_counter = (self.round_robin_counter + 1) % len(
+                self.workers
+            )
+
+        assert req.bootstrap_room is not None, (
+            "req.bootstrap_room should not be None. Do not send requests directly to "
+            "prefill or decode instances; send to the router instead."
+        )
+        target_rank = req.bootstrap_room % len(self.workers)
+        self.workers[target_rank].send_pyobj(req)
+
+    def total_requests_scheduler(self, req: Req):
+        if self.maybe_external_dp_rank_routing(req):
+            return
+        target_worker = self.dp_budget.dispatch(LoadBalanceMethod.TOTAL_REQUESTS)
+        self.workers[target_worker].send_pyobj(req)
+
+    def total_tokens_scheduler(self, req: Req):
+        if self.maybe_external_dp_rank_routing(req):
+            return
+        target_worker = self.dp_budget.dispatch(LoadBalanceMethod.TOTAL_TOKENS)
+        self.workers[target_worker].send_pyobj(req)
+
+    def event_loop(self):
+        while True:
+            while True:
+                self.soft_watchdog.feed()
+                try:
+                    recv_req = self.recv_from_tokenizer.recv_pyobj(zmq.NOBLOCK)
+                except zmq.ZMQError:
+                    break
+                self._request_dispatcher(recv_req)
+
+
+def run_data_parallel_controller_process(
+    server_args: ServerArgs,
+    port_args: PortArgs,
+    pipe_writer,
+    run_scheduler_process_func: Callable = run_scheduler_process,
+):
+    setproctitle.setproctitle("sglang::data_parallel_controller")
+    faulthandler.enable()
+    kill_itself_when_parent_died()
+    parent_process = psutil.Process().parent()
+
+    configure_logger(server_args)
+    if server_args.enable_trace:
+        process_tracing_init(server_args.otlp_traces_endpoint, "sglang")
+        thread_label = "DP Controller"
+        if server_args.disaggregation_mode == "prefill":
+            thread_label = "Prefill DP Controller"
+        elif server_args.disaggregation_mode == "decode":
+            thread_label = "Decode DP Controller"
+        trace_set_thread_info(thread_label)
+
+    try:
+        controller = DataParallelController(
+            server_args, port_args, run_scheduler_process_func
+        )
+        pipe_writer.send(
+            {
+                "status": "ready",
+                "max_total_num_tokens": controller.max_total_num_tokens,
+                "max_req_input_len": controller.max_req_input_len,
+            }
+        )
+        if server_args.node_rank == 0:
+            controller.event_loop()
+        for proc in controller.scheduler_procs:
+            proc.join()
+            logger.error(
+                f"Scheduler or DataParallelController {proc.pid} terminated with {proc.exitcode}"
+            )
+    except Exception:
+        traceback = get_exception_traceback()
+        logger.error(f"DataParallelController hit an exception: {traceback}")
+        parent_process.send_signal(signal.SIGQUIT)
diff --git a/sglang/python/sglang/srt/managers/detokenizer_manager.py b/sglang/python/sglang/srt/managers/detokenizer_manager.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c6db4b2f7d694c50e8be9171929403a3e142fea
--- /dev/null
+++ b/sglang/python/sglang/srt/managers/detokenizer_manager.py
@@ -0,0 +1,448 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""DetokenizerManager is a process that detokenizes the token ids."""
+
+import dataclasses
+import logging
+import os
+import signal
+from collections import OrderedDict, defaultdict
+from typing import Dict, List, Optional, Tuple, Union
+
+import psutil
+import pybase64
+import setproctitle
+import zmq
+
+from sglang.srt.environ import envs
+from sglang.srt.managers.io_struct import (
+    BatchEmbeddingOutput,
+    BatchMultimodalDecodeReq,
+    BatchStrOutput,
+    BatchTokenIDOutput,
+    FreezeGCReq,
+)
+from sglang.srt.managers.multi_tokenizer_mixin import MultiHttpWorkerDetokenizerMixin
+from sglang.srt.observability.cpu_monitor import start_cpu_monitor_thread
+from sglang.srt.server_args import PortArgs, ServerArgs
+from sglang.srt.utils import (
+    configure_logger,
+    freeze_gc,
+    get_zmq_socket,
+    kill_itself_when_parent_died,
+)
+from sglang.srt.utils.hf_transformers_utils import get_tokenizer
+from sglang.srt.utils.watchdog import Watchdog
+from sglang.utils import (
+    TypeBasedDispatcher,
+    find_printable_text,
+    get_exception_traceback,
+)
+
+logger = logging.getLogger(__name__)
+
+# Maximum number of request states that detokenizer can hold. When exceeded,
+# oldest request states will be evicted. Default: 65536 (1<<16).
+# For more details, see: https://github.com/sgl-project/sglang/issues/2812
+# Use power of 2 values for better memory allocation.
+DETOKENIZER_MAX_STATES = int(os.environ.get("SGLANG_DETOKENIZER_MAX_STATES", 1 << 16))
+
+
+@dataclasses.dataclass
+class DecodeStatus:
+    """Store the status of incremental decoding."""
+
+    decoded_text: str
+    decode_ids: List[int]
+    surr_offset: int
+    read_offset: int
+    # Offset that's sent to tokenizer for incremental update.
+    sent_offset: int = 0
+
+
+class DetokenizerManager(MultiHttpWorkerDetokenizerMixin):
+    """DetokenizerManager is a process that detokenizes the token ids."""
+
+    def __init__(
+        self,
+        server_args: ServerArgs,
+        port_args: PortArgs,
+    ):
+        # Init inter-process communication
+        self.init_ipc_channels(port_args)
+
+        # Init tokenizer
+        self.init_tokenizer(server_args)
+
+        # Init running status
+        self.init_running_status(server_args)
+
+        if server_args.enable_metrics:
+            start_cpu_monitor_thread("detokenizer")
+
+        # Init dispatcher
+        self.init_request_dispatcher()
+
+    @staticmethod
+    def is_health_check_request(rid: Optional[str]) -> bool:
+        return isinstance(rid, str) and rid.startswith("HEALTH_CHECK")
+
+    def init_ipc_channels(self, port_args: PortArgs):
+        context = zmq.Context(2)
+        self.recv_from_scheduler = get_zmq_socket(
+            context, zmq.PULL, port_args.detokenizer_ipc_name, True
+        )
+        self.send_to_tokenizer = get_zmq_socket(
+            context, zmq.PUSH, port_args.tokenizer_ipc_name, False
+        )
+
+    def init_tokenizer(self, server_args: ServerArgs):
+        if server_args.skip_tokenizer_init:
+            self.tokenizer = None
+        else:
+            self.tokenizer = get_tokenizer(
+                server_args.tokenizer_path,
+                tokenizer_mode=server_args.tokenizer_mode,
+                trust_remote_code=server_args.trust_remote_code,
+                revision=server_args.revision,
+            )
+
+    def init_running_status(self, server_args: ServerArgs):
+        self.decode_status = LimitedCapacityDict(capacity=DETOKENIZER_MAX_STATES)
+        self.is_dummy = False
+        self.is_tool_call_parser_gpt_oss = server_args.tool_call_parser == "gpt-oss"
+        self.disable_tokenizer_batch_decode = server_args.disable_tokenizer_batch_decode
+
+        self.soft_watchdog = Watchdog.create(
+            debug_name="DetokenizerManager",
+            watchdog_timeout=server_args.soft_watchdog_timeout,
+            soft=True,
+            test_stuck_time=envs.SGLANG_TEST_STUCK_DETOKENIZER.get(),
+        )
+
+    def init_request_dispatcher(self):
+        self._request_dispatcher = TypeBasedDispatcher(
+            [
+                (BatchEmbeddingOutput, self.handle_batch_embedding_out),
+                (BatchTokenIDOutput, self.handle_batch_token_id_out),
+                (BatchMultimodalDecodeReq, self.handle_multimodal_decode_req),
+                (FreezeGCReq, self.handle_freeze_gc_req),
+            ]
+        )
+
+    def event_loop(self):
+        """The event loop that handles requests"""
+        while True:
+            with self.soft_watchdog.disable():
+                recv_obj = self.recv_from_scheduler.recv_pyobj()
+            output = self._request_dispatcher(recv_obj)
+            if output is not None:
+                self.send_to_tokenizer.send_pyobj(output)
+            self.soft_watchdog.feed()
+
+    def trim_matched_stop(
+        self, output: Union[str, List[int]], finished_reason: Dict, no_stop_trim: bool
+    ):
+        if no_stop_trim or not finished_reason:
+            return output
+
+        matched = finished_reason.get("matched", None)
+        if not matched:
+            return output
+
+        # TODO(lmzheng): handle the case where multiple stop strs are hit
+
+        # Trim stop str.
+        if isinstance(matched, str) and isinstance(output, str):
+            pos = output.find(matched)
+            return output[:pos] if pos != -1 else output
+
+        # Trim stop token.
+        if isinstance(matched, int) and isinstance(output, list):
+            # 200012 <|call|> is the tool call token and one of eos tokens for gpt-oss model
+            if output[-1] == 200012 and self.is_tool_call_parser_gpt_oss:
+                return output
+            assert len(output) > 0
+            # NOTE: We can always assume the last token is the matched stop token
+            return output[:-1]
+        return output
+
+    def handle_batch_embedding_out(self, recv_obj: BatchEmbeddingOutput):
+        # If it is embedding model, no detokenization is needed.
+        return recv_obj
+
+    def _grouped_batch_decode(
+        self,
+        ids_list: List[List[int]],
+        skip_list: List[bool],
+        space_list: List[bool],
+    ) -> List[str]:
+        """Batch decode with grouping by (skip_special_tokens, spaces_between_special_tokens)."""
+
+        assert self.tokenizer is not None
+
+        # fast path
+        first_skip, first_space = skip_list[0], space_list[0]
+        if all(s == first_skip for s in skip_list) and all(
+            sp == first_space for sp in space_list
+        ):
+            return self.tokenizer.batch_decode(
+                ids_list,
+                skip_special_tokens=first_skip,
+                spaces_between_special_tokens=first_space,
+            )
+
+        # Group indices by (skip, space) tuple
+        groups: Dict[Tuple[bool, bool], List[int]]
+        groups = defaultdict(list)
+        for idx, (skip, space) in enumerate(zip(skip_list, space_list)):
+            groups[(skip, space)].append(idx)
+
+        # Decode each group and collect results
+        results: List[str] = [""] * len(ids_list)
+        for (skip, space), indices in groups.items():
+            decoded = self.tokenizer.batch_decode(
+                [ids_list[idx] for idx in indices],
+                skip_special_tokens=skip,
+                spaces_between_special_tokens=space,
+            )
+            for idx, text in zip(indices, decoded):
+                results[idx] = text
+
+        return results
+
+    def _decode_batch_token_id_output(self, recv_obj: BatchTokenIDOutput):
+        bs = len(recv_obj.rids)
+
+        # Initialize decode status
+        read_ids, surr_ids = [], []
+        for i in range(bs):
+            rid = recv_obj.rids[i]
+            if rid not in self.decode_status:
+                s = DecodeStatus(
+                    decoded_text=recv_obj.decoded_texts[i],
+                    decode_ids=recv_obj.decode_ids[i],
+                    surr_offset=0,
+                    read_offset=recv_obj.read_offsets[i],
+                )
+                if not self.is_health_check_request(rid):
+                    # for health check requests, we do not store the decode status
+                    self.decode_status[rid] = s
+            else:
+                s = self.decode_status[rid]
+                s.decode_ids.extend(recv_obj.decode_ids[i])
+
+            read_ids.append(
+                self.trim_matched_stop(
+                    s.decode_ids[s.surr_offset :],
+                    recv_obj.finished_reasons[i],
+                    recv_obj.no_stop_trim[i],
+                )
+            )
+            surr_ids.append(s.decode_ids[s.surr_offset : s.read_offset])
+
+        # Decode token ids to strings
+        if not self.disable_tokenizer_batch_decode:
+            if not self.is_dummy:
+                # Run normal batch decode
+                surr_texts = self._grouped_batch_decode(
+                    surr_ids,
+                    recv_obj.skip_special_tokens,
+                    recv_obj.spaces_between_special_tokens,
+                )
+                read_texts = self._grouped_batch_decode(
+                    read_ids,
+                    recv_obj.skip_special_tokens,
+                    recv_obj.spaces_between_special_tokens,
+                )
+            else:
+                # If it is dummy weights, just return dummy strings to prevent potential detokenization edge cases
+                surr_texts = ["dog" for _ in surr_ids]
+                read_texts = ["cat" for _ in read_ids]
+        else:
+            # Do not use batch decode to prevent some detokenization edge cases (e.g., gpt-oss).
+            surr_texts = [
+                self.tokenizer.decode(
+                    surr, skip_special_tokens=skip, spaces_between_special_tokens=space
+                )
+                for surr, skip, space in zip(
+                    surr_ids,
+                    recv_obj.skip_special_tokens,
+                    recv_obj.spaces_between_special_tokens,
+                )
+            ]
+            read_texts = [
+                self.tokenizer.decode(
+                    read, skip_special_tokens=skip, spaces_between_special_tokens=space
+                )
+                for read, skip, space in zip(
+                    read_ids,
+                    recv_obj.skip_special_tokens,
+                    recv_obj.spaces_between_special_tokens,
+                )
+            ]
+
+        # Incremental decoding
+        output_strs = []
+        for i in range(bs):
+            rid = recv_obj.rids[i]
+            if self.is_health_check_request(rid):
+                s = DecodeStatus(
+                    decoded_text=recv_obj.decoded_texts[i],
+                    decode_ids=recv_obj.decode_ids[i],
+                    surr_offset=0,
+                    read_offset=recv_obj.read_offsets[i],
+                )
+            else:
+                try:
+                    s = self.decode_status[rid]
+                except KeyError:
+                    raise RuntimeError(
+                        f"Decode status not found for request {rid}. "
+                        "It may be due to the request being evicted from the decode status due to memory pressure. "
+                        "Please increase the maximum number of requests by setting "
+                        "the SGLANG_DETOKENIZER_MAX_STATES environment variable to a bigger value than the default value. "
+                        f"The current value is {DETOKENIZER_MAX_STATES}. "
+                        "For more details, see: https://github.com/sgl-project/sglang/issues/2812"
+                    )
+            new_text = read_texts[i][len(surr_texts[i]) :]
+            if recv_obj.finished_reasons[i] is None:
+                # Streaming chunk: update the decode status
+                if len(new_text) > 0 and not new_text.endswith("�"):
+                    s.decoded_text = s.decoded_text + new_text
+                    s.surr_offset = s.read_offset
+                    s.read_offset = len(s.decode_ids)
+                    new_text = ""
+                else:
+                    new_text = find_printable_text(new_text)
+            else:
+                if rid in self.decode_status:
+                    del self.decode_status[rid]
+
+            output_str = self.trim_matched_stop(
+                s.decoded_text + new_text,
+                recv_obj.finished_reasons[i],
+                recv_obj.no_stop_trim[i],
+            )
+            # Incrementally send text.
+            incremental_output = output_str[s.sent_offset :]
+            s.sent_offset = len(output_str)
+            output_strs.append(incremental_output)
+
+        return output_strs
+
+    def _extract_routed_experts(
+        self, recv_obj: BatchTokenIDOutput
+    ) -> list[str | None] | None:
+        routed_experts = None
+        if recv_obj.routed_experts is not None:
+            routed_experts = [
+                (
+                    pybase64.b64encode(routed_experts.numpy().tobytes()).decode("utf-8")
+                    if routed_experts is not None
+                    else None
+                )
+                for routed_experts in recv_obj.routed_experts
+            ]
+        return routed_experts
+
+    def handle_batch_token_id_out(self, recv_obj: BatchTokenIDOutput):
+        # If handling idle batch, set output_strs to [].
+        output_strs = (
+            self._decode_batch_token_id_output(recv_obj)
+            if len(recv_obj.rids) > 0
+            else []
+        )
+        routed_experts = self._extract_routed_experts(recv_obj)
+
+        return BatchStrOutput(
+            rids=recv_obj.rids,
+            http_worker_ipcs=recv_obj.http_worker_ipcs,
+            finished_reasons=recv_obj.finished_reasons,
+            output_strs=output_strs,
+            output_ids=recv_obj.output_ids,
+            prompt_tokens=recv_obj.prompt_tokens,
+            completion_tokens=recv_obj.completion_tokens,
+            cached_tokens=recv_obj.cached_tokens,
+            cached_tokens_details=recv_obj.cached_tokens_details,
+            spec_verify_ct=recv_obj.spec_verify_ct,
+            spec_accepted_tokens=recv_obj.spec_accepted_tokens,
+            spec_acceptance_histogram=recv_obj.spec_acceptance_histogram,
+            input_token_logprobs_val=recv_obj.input_token_logprobs_val,
+            input_token_logprobs_idx=recv_obj.input_token_logprobs_idx,
+            output_token_logprobs_val=recv_obj.output_token_logprobs_val,
+            output_token_logprobs_idx=recv_obj.output_token_logprobs_idx,
+            input_top_logprobs_val=recv_obj.input_top_logprobs_val,
+            input_top_logprobs_idx=recv_obj.input_top_logprobs_idx,
+            output_top_logprobs_val=recv_obj.output_top_logprobs_val,
+            output_top_logprobs_idx=recv_obj.output_top_logprobs_idx,
+            input_token_ids_logprobs_val=recv_obj.input_token_ids_logprobs_val,
+            input_token_ids_logprobs_idx=recv_obj.input_token_ids_logprobs_idx,
+            output_token_ids_logprobs_val=recv_obj.output_token_ids_logprobs_val,
+            output_token_ids_logprobs_idx=recv_obj.output_token_ids_logprobs_idx,
+            output_token_entropy_val=recv_obj.output_token_entropy_val,
+            output_hidden_states=recv_obj.output_hidden_states,
+            routed_experts=routed_experts,
+            customized_info=recv_obj.customized_info,
+            placeholder_tokens_idx=None,
+            placeholder_tokens_val=None,
+            retraction_counts=recv_obj.retraction_counts,
+            token_steps=recv_obj.token_steps,
+            load=recv_obj.load,
+            dp_ranks=recv_obj.dp_ranks,
+            time_stats=recv_obj.time_stats,
+        )
+
+    def handle_multimodal_decode_req(self, recv_obj: BatchMultimodalDecodeReq):
+        raise NotImplementedError()
+
+    def handle_freeze_gc_req(self, recv_req: FreezeGCReq):
+        freeze_gc("Detokenizer Manager")
+        return None
+
+
+class LimitedCapacityDict(OrderedDict):
+    def __init__(self, capacity: int, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.capacity = capacity
+
+    def __setitem__(self, key, value):
+        if len(self) >= self.capacity:
+            # Remove the oldest element (first item in the dict)
+            self.popitem(last=False)
+        # Set the new item
+        super().__setitem__(key, value)
+
+
+def run_detokenizer_process(
+    server_args: ServerArgs,
+    port_args: PortArgs,
+    detokenizer_manager_class=DetokenizerManager,
+):
+    kill_itself_when_parent_died()
+    setproctitle.setproctitle("sglang::detokenizer")
+    configure_logger(server_args)
+    parent_process = psutil.Process().parent()
+
+    try:
+        manager = detokenizer_manager_class(server_args, port_args)
+        if server_args.tokenizer_worker_num == 1:
+            manager.event_loop()
+        else:
+            manager.multi_http_worker_event_loop()
+    except Exception:
+        traceback = get_exception_traceback()
+        logger.error(f"DetokenizerManager hit an exception: {traceback}")
+        manager.maybe_clear_socket_mapping()
+        parent_process.send_signal(signal.SIGQUIT)
diff --git a/sglang/python/sglang/srt/managers/disagg_service.py b/sglang/python/sglang/srt/managers/disagg_service.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b02add202d75bec3e75f581f04b7c0783780e0a
--- /dev/null
+++ b/sglang/python/sglang/srt/managers/disagg_service.py
@@ -0,0 +1,44 @@
+"""Start bootstrap/kv-store-related server"""
+
+import os
+
+from sglang.srt.disaggregation.utils import (
+    DisaggregationMode,
+    KVClassType,
+    TransferBackend,
+    get_kv_class,
+)
+from sglang.srt.server_args import ServerArgs
+
+
+def start_disagg_service(
+    server_args: ServerArgs,
+):
+    # Start kv bootstrap server on prefill
+    disagg_mode = DisaggregationMode(server_args.disaggregation_mode)
+    transfer_backend = TransferBackend(server_args.disaggregation_transfer_backend)
+
+    if disagg_mode == DisaggregationMode.PREFILL:
+        # only start bootstrap server on prefill tm
+        kv_bootstrap_server_class = get_kv_class(
+            transfer_backend, KVClassType.BOOTSTRAP_SERVER
+        )
+        bootstrap_server = kv_bootstrap_server_class(
+            host=server_args.host,
+            port=server_args.disaggregation_bootstrap_port,
+        )
+        is_create_store = (
+            server_args.node_rank == 0 and transfer_backend == TransferBackend.ASCEND
+        )
+        if is_create_store:
+            try:
+                from memfabric_hybrid import create_config_store
+
+                ascend_url = os.getenv("ASCEND_MF_STORE_URL")
+                create_config_store(ascend_url)
+            except Exception as e:
+                error_message = f"Failed create mf store, invalid ascend_url."
+                error_message += f" With exception {e}"
+                raise error_message
+
+        return bootstrap_server
diff --git a/sglang/python/sglang/srt/managers/io_struct.py b/sglang/python/sglang/srt/managers/io_struct.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f0db81374ed3233900a7c0abeb855ea5c179b04
--- /dev/null
+++ b/sglang/python/sglang/srt/managers/io_struct.py
@@ -0,0 +1,2018 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+The definition of objects transferred between different
+processes (TokenizerManager, DetokenizerManager, Scheduler).
+"""
+
+from __future__ import annotations
+
+import copy
+import uuid
+from abc import ABC
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union
+
+import torch
+
+from sglang.srt.lora.lora_registry import LoRARef
+from sglang.srt.managers.schedule_batch import BaseFinishReason
+from sglang.srt.multimodal.mm_utils import has_valid_data
+from sglang.srt.observability.req_time_stats import (
+    APIServerReqTimeStats,
+    DPControllerReqTimeStats,
+    SchedulerReqTimeStats,
+)
+from sglang.srt.sampling.sampling_params import SamplingParams
+from sglang.srt.utils import ImageData
+
+# Handle serialization of Image for pydantic
+if TYPE_CHECKING:
+    from PIL.Image import Image
+else:
+    Image = Any
+
+
+@dataclass
+class BaseReq(ABC):
+    rid: Optional[Union[str, List[str]]] = field(default=None, kw_only=True)
+    http_worker_ipc: Optional[str] = field(default=None, kw_only=True)
+
+    def regenerate_rid(self):
+        """Generate a new request ID and return it."""
+        if isinstance(self.rid, list):
+            self.rid = [uuid.uuid4().hex for _ in range(len(self.rid))]
+        else:
+            self.rid = uuid.uuid4().hex
+        return self.rid
+
+
+@dataclass
+class BaseBatchReq(ABC):
+    rids: Optional[List[str]] = field(default=None, kw_only=True)
+    http_worker_ipcs: Optional[List[str]] = field(default=None, kw_only=True)
+
+    def regenerate_rids(self):
+        """Generate new request IDs and return them."""
+        self.rids = [uuid.uuid4().hex for _ in range(len(self.rids))]
+        return self.rids
+
+
+@dataclass
+class SpeculativeDecodingMetricsMixin:
+    """
+    Mixin class containing speculative decoding metrics.
+
+    This class consolidates speculative decoding metrics that are shared across
+    batch output types that support speculative decoding to avoid code duplication.
+    """
+
+    # Verify count: number of verification forward passes
+    spec_verify_ct: List[int]
+
+    # Accepted tokens: Number of accepted tokens during speculative decoding
+    spec_accepted_tokens: List[int]
+
+    # Acceptance histogram: List of lists, where each inner list represents histogram counts.
+    # List index = number of accepted tokens in a step, List value = count of steps with that many accepted tokens.
+    # Example: histogram[0] = 5 means 5 steps with 0 accepted tokens, histogram[3] = 10 means 10 steps with 3 accepted tokens.
+    # Empty list [] when speculative decoding is disabled.
+    spec_acceptance_histogram: List[List[int]]
+
+
+# Parameters for a session
+@dataclass
+class SessionParams:
+    id: Optional[str] = None
+    rid: Optional[str] = None
+    offset: Optional[int] = None
+    replace: Optional[bool] = None
+    drop_previous_output: Optional[bool] = None
+
+
+# Type definitions for multimodal input data
+# Individual data item types for each modality
+ImageDataInputItem = Union[Image, str, ImageData, Dict]
+AudioDataInputItem = Union[str, Dict]
+VideoDataInputItem = Union[str, Dict]
+# Union type for any multimodal data item
+MultimodalDataInputItem = Union[
+    ImageDataInputItem, VideoDataInputItem, AudioDataInputItem
+]
+# Format types supporting single items, lists, or nested lists for batch processing
+MultimodalDataInputFormat = Union[
+    List[List[MultimodalDataInputItem]],
+    List[MultimodalDataInputItem],
+    MultimodalDataInputItem,
+]
+
+
+@dataclass
+class GenerateReqInput(BaseReq):
+    # The input prompt. It can be a single prompt or a batch of prompts.
+    text: Optional[Union[List[str], str]] = None
+    # The token ids for text; one can specify either text or input_ids
+    input_ids: Optional[Union[List[List[int]], List[int]]] = None
+    # The embeddings for input_ids; one can specify either text or input_ids or input_embeds.
+    input_embeds: Optional[Union[List[List[List[float]]], List[List[float]]]] = None
+    # The image input. It can be an image instance, file name, URL, or base64 encoded string.
+    # Can be formatted as:
+    # - Single image for a single request
+    # - List of images (one per request in a batch)
+    # - List of lists of images (multiple images per request)
+    # See also python/sglang/srt/utils.py:load_image for more details.
+    image_data: Optional[MultimodalDataInputFormat] = None
+    # The video input. Like image data, it can be a file name, a url, or base64 encoded string.
+    video_data: Optional[MultimodalDataInputFormat] = None
+    # The audio input. Like image data, it can be a file name, a url, or base64 encoded string.
+    audio_data: Optional[MultimodalDataInputFormat] = None
+    # The sampling_params. See descriptions below.
+    sampling_params: Optional[Union[List[Dict], Dict]] = None
+    # Whether to return logprobs.
+    return_logprob: Optional[Union[List[bool], bool]] = None
+    # If return logprobs, the start location in the prompt for returning logprobs.
+    # By default, this value is "-1", which means it will only return logprobs for output tokens.
+    logprob_start_len: Optional[Union[List[int], int]] = None
+    # If return logprobs, the number of top logprobs to return at each position.
+    top_logprobs_num: Optional[Union[List[int], int]] = None
+    # If return logprobs, the token ids to return logprob for.
+    token_ids_logprob: Optional[Union[List[List[int]], List[int]]] = None
+    # Whether to detokenize tokens in text in the returned logprobs.
+    return_text_in_logprobs: bool = False
+    # Whether to stream output.
+    stream: bool = False
+    # Whether to log metrics for this request (e.g. health_generate calls do not log metrics)
+    log_metrics: bool = True
+    # Whether to return hidden states
+    return_hidden_states: Union[List[bool], bool] = False
+    # Whether to return captured routed experts
+    return_routed_experts: bool = False
+    # The start location in the prompt for returning routed experts.
+    routed_experts_start_len: int = 0
+
+    # The modalities of the image data [image, multi-images, video]
+    modalities: Optional[List[str]] = None
+    # Session info for continual prompting
+    session_params: Optional[Union[List[Dict], Dict]] = None
+
+    # The path to the LoRA adaptors
+    lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
+    # The uid of LoRA adaptors, should be initialized by tokenizer manager
+    lora_id: Optional[Union[List[Optional[str]], Optional[str]]] = None
+
+    # Custom logit processor for advanced sampling control. Must be a serialized instance
+    # of `CustomLogitProcessor` in python/sglang/srt/sampling/custom_logit_processor.py
+    # Use the processor's `to_str()` method to generate the serialized string.
+    custom_logit_processor: Optional[Union[List[Optional[str]], str]] = None
+
+    # For disaggregated inference
+    bootstrap_host: Optional[Union[List[str], str]] = None
+    bootstrap_port: Optional[Union[List[Optional[int]], int]] = None
+    bootstrap_room: Optional[Union[List[int], int]] = None
+    bootstrap_pair_key: Optional[Union[List[str], str]] = None
+    decode_tp_size: Optional[Union[List[Optional[int]], int]] = None
+
+    # Require reasoning for the request (hybrid reasoning model only)
+    require_reasoning: bool = False
+
+    # For DP routing — external router assigns a specific DP worker
+    routed_dp_rank: Optional[int] = None
+    # For PD disagg — hint telling decode which prefill DP worker has the KV cache
+    disagg_prefill_dp_rank: Optional[int] = None
+    # Deprecated: use routed_dp_rank instead
+    data_parallel_rank: Optional[int] = None
+
+    # For background responses (OpenAI responses API)
+    background: bool = False
+
+    # Conversation id used for tracking requests
+    conversation_id: Optional[str] = None
+
+    # Priority for the request
+    priority: Optional[int] = None
+
+    # Extra key for classifying the request (e.g. cache_salt)
+    extra_key: Optional[Union[List[str], str]] = None
+
+    # Routing key for routing-key schedule policy
+    routing_key: Optional[str] = None
+
+    # Whether to disallow logging for this request (e.g. due to ZDR)
+    no_logs: bool = False
+
+    # For custom metric labels
+    custom_labels: Optional[Dict[str, str]] = None
+
+    # (Internal) Whether to return bytes for image generation
+    return_bytes: bool = False
+
+    # Whether to return entropy
+    return_entropy: bool = False
+
+    # Propagates trace context via Engine.generate/async_generate
+    external_trace_header: Optional[Dict] = None
+    received_time: Optional[float] = None
+
+    # For EPD-disaggregated inference
+    need_wait_for_image: Optional[bool] = None
+    num_items_assigned: Optional[List] = None
+
+    # Multimodal tiling controls (extensions)
+    max_dynamic_patch: Optional[int] = None
+    min_dynamic_patch: Optional[int] = None
+    image_max_dynamic_patch: Optional[int] = None
+    video_max_dynamic_patch: Optional[int] = None
+
+    def contains_mm_input(self) -> bool:
+        return (
+            has_valid_data(self.image_data)
+            or has_valid_data(self.video_data)
+            or has_valid_data(self.audio_data)
+        )
+
+    def normalize_batch_and_arguments(self):
+        """
+        Normalize the batch size and arguments for the request.
+
+        This method resolves various input formats and ensures all parameters
+        are properly formatted as either single values or batches depending on the input.
+        It also handles parallel sampling expansion and sets default values for
+        unspecified parameters.
+
+        Raises:
+            ValueError: If inputs are not properly specified (e.g., none or all of
+                       text, input_ids, input_embeds are provided)
+        """
+        if self.data_parallel_rank is not None:
+            import warnings
+
+            warnings.warn(
+                "'data_parallel_rank' is deprecated, use 'routed_dp_rank' instead.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+            if self.routed_dp_rank is None:
+                self.routed_dp_rank = self.data_parallel_rank
+            self.data_parallel_rank = None
+
+        self._validate_inputs()
+        self._determine_batch_size()
+        self._handle_parallel_sampling()
+
+        if self.is_single:
+            self._normalize_single_inputs()
+        else:
+            self._normalize_batch_inputs()
+
+    def _validate_inputs(self):
+        """Validate that the input configuration is valid."""
+        if (
+            self.text is None and self.input_ids is None and self.input_embeds is None
+        ) or (
+            self.text is not None
+            and self.input_ids is not None
+            and self.input_embeds is not None
+        ):
+            raise ValueError(
+                "Either text, input_ids or input_embeds should be provided."
+            )
+
+    def _determine_batch_size(self):
+        """Determine if this is a single example or a batch and the batch size."""
+        if self.text is not None:
+            if isinstance(self.text, str):
+                self.is_single = True
+                self.batch_size = 1
+            else:
+                self.is_single = False
+                self.batch_size = len(self.text)
+            self.input_embeds = None
+        elif self.input_ids is not None:
+            if len(self.input_ids) == 0:
+                raise ValueError("input_ids cannot be empty.")
+            if isinstance(self.input_ids[0], int):
+                self.is_single = True
+                self.batch_size = 1
+            else:
+                self.is_single = False
+                self.batch_size = len(self.input_ids)
+            self.input_embeds = None
+        else:
+            if isinstance(self.input_embeds[0][0], float):
+                self.is_single = True
+                self.batch_size = 1
+            else:
+                self.is_single = False
+                self.batch_size = len(self.input_embeds)
+
+    def _handle_parallel_sampling(self):
+        """Handle parallel sampling parameters and adjust batch size if needed."""
+        # Determine parallel sample count
+        if self.sampling_params is None:
+            self.parallel_sample_num = 1
+            return
+        elif isinstance(self.sampling_params, dict):
+            self.parallel_sample_num = self.sampling_params.get("n", 1)
+        else:  # isinstance(self.sampling_params, list):
+            self.parallel_sample_num = self.sampling_params[0].get("n", 1)
+            for sampling_params in self.sampling_params:
+                if self.parallel_sample_num != sampling_params.get("n", 1):
+                    raise ValueError(
+                        "The parallel_sample_num should be the same for all samples in sample params."
+                    )
+
+        # If using parallel sampling with a single example, convert to batch
+        if self.parallel_sample_num > 1 and self.is_single:
+            self.is_single = False
+            if self.text is not None:
+                self.text = [self.text]
+            if self.input_ids is not None:
+                self.input_ids = [self.input_ids]
+            if self.input_embeds is not None:
+                self.input_embeds = [self.input_embeds]
+
+    def _normalize_single_inputs(self):
+        """Normalize inputs for a single example."""
+        if self.sampling_params is None:
+            self.sampling_params = {}
+        if self.rid is None:
+            self.rid = uuid.uuid4().hex
+        if self.return_logprob is None:
+            self.return_logprob = False
+        if self.logprob_start_len is None:
+            self.logprob_start_len = -1
+        if self.top_logprobs_num is None:
+            self.top_logprobs_num = 0
+        if not self.token_ids_logprob:  # covers both None and []
+            self.token_ids_logprob = None
+
+    def _normalize_batch_inputs(self):
+        """Normalize inputs for a batch of examples, including parallel sampling expansion."""
+        # Calculate expanded batch size
+        if self.parallel_sample_num == 1:
+            num = self.batch_size
+        else:
+            # Expand parallel_sample_num
+            num = self.batch_size * self.parallel_sample_num
+
+        # Expand input based on type
+        self._expand_inputs(num)
+        self._normalize_rid(num)
+        self._normalize_lora_paths(num)
+        self._normalize_image_data(num)
+        self._normalize_video_data(num)
+        self._normalize_audio_data(num)
+        self._normalize_sampling_params(num)
+        self._normalize_logprob_params(num)
+        self._normalize_custom_logit_processor(num)
+        self._normalize_bootstrap_params(num)
+
+    def _expand_inputs(self, num):
+        """Expand the main inputs (text, input_ids, input_embeds) for parallel sampling."""
+        if self.text is not None:
+            if not isinstance(self.text, list):
+                raise ValueError("Text should be a list for batch processing.")
+            self.text = self.text * self.parallel_sample_num
+        elif self.input_ids is not None:
+            if not isinstance(self.input_ids, list) or not isinstance(
+                self.input_ids[0], list
+            ):
+                raise ValueError(
+                    "input_ids should be a list of lists for batch processing."
+                )
+            self.input_ids = self.input_ids * self.parallel_sample_num
+        elif self.input_embeds is not None:
+            if not isinstance(self.input_embeds, list):
+                raise ValueError("input_embeds should be a list for batch processing.")
+            self.input_embeds = self.input_embeds * self.parallel_sample_num
+
+    def _normalize_lora_paths(self, num):
+        """Normalize LoRA paths for batch processing."""
+        if self.lora_path is not None:
+            if isinstance(self.lora_path, str):
+                self.lora_path = [self.lora_path] * num
+            elif isinstance(self.lora_path, list):
+                self.lora_path = self.lora_path * self.parallel_sample_num
+            else:
+                raise ValueError("lora_path should be a list or a string.")
+
+    def _normalize_image_data(self, num):
+        """Normalize image data for batch processing."""
+        if self.image_data is None:
+            self.image_data = [None] * num
+        elif not isinstance(self.image_data, list):
+            # Single image, convert to list of single-image lists
+            self.image_data = [[self.image_data]] * num
+            self.modalities = ["image"] * num
+        elif isinstance(self.image_data, list):
+            # Handle empty list case - treat as no images
+            if len(self.image_data) == 0:
+                self.image_data = [None] * num
+                return
+
+            if len(self.image_data) != self.batch_size:
+                raise ValueError(
+                    "The length of image_data should be equal to the batch size."
+                )
+
+            self.modalities = []
+            if len(self.image_data) > 0 and isinstance(self.image_data[0], list):
+                # Already a list of lists, keep as is
+                for i in range(len(self.image_data)):
+                    if self.image_data[i] is None or self.image_data[i] == [None]:
+                        self.modalities.append(None)
+                    elif len(self.image_data[i]) == 1:
+                        self.modalities.append("image")
+                    elif len(self.image_data[i]) > 1:
+                        self.modalities.append("multi-images")
+                    else:
+                        # Ensure len(self.modalities) == len(self.image_data)
+                        self.modalities.append(None)
+                # Expand parallel_sample_num
+                self.image_data = self.image_data * self.parallel_sample_num
+                self.modalities = self.modalities * self.parallel_sample_num
+            else:
+                # List of images for a batch, wrap each in a list
+                wrapped_images = [[img] for img in self.image_data]
+                # Expand for parallel sampling
+                self.image_data = wrapped_images * self.parallel_sample_num
+                self.modalities = ["image"] * num
+
+    def _normalize_video_data(self, num):
+        """Normalize video data for batch processing."""
+        if self.video_data is None:
+            self.video_data = [None] * num
+        elif not isinstance(self.video_data, list):
+            self.video_data = [self.video_data] * num
+        elif isinstance(self.video_data, list):
+            self.video_data = self.video_data * self.parallel_sample_num
+
+    def _normalize_audio_data(self, num):
+        """Normalize audio data for batch processing."""
+        if self.audio_data is None:
+            self.audio_data = [None] * num
+        elif not isinstance(self.audio_data, list):
+            self.audio_data = [self.audio_data] * num
+        elif isinstance(self.audio_data, list):
+            self.audio_data = self.audio_data * self.parallel_sample_num
+
+    def _normalize_sampling_params(self, num):
+        """Normalize sampling parameters for batch processing."""
+        if self.sampling_params is None:
+            self.sampling_params = [{}] * num
+        elif isinstance(self.sampling_params, dict):
+            self.sampling_params = [self.sampling_params] * num
+        else:  # Already a list
+            self.sampling_params = self.sampling_params * self.parallel_sample_num
+
+    def _normalize_rid(self, num):
+        """Normalize request IDs for batch processing."""
+        if self.rid is None:
+            self.rid = [uuid.uuid4().hex for _ in range(num)]
+        elif isinstance(self.rid, str):
+            new_rids = [f"{self.rid}_{i}" for i in range(num)]
+            self.rid = new_rids
+        elif isinstance(self.rid, list):
+            # Note: the length of rid shall be the same as the batch_size,
+            # as the rid would be expanded for parallel sampling in tokenizer_manager
+            if len(self.rid) != self.batch_size:
+                raise ValueError(
+                    "The specified rids length mismatch with the batch_size for batch processing."
+                )
+        else:
+            raise ValueError("The rid should be a string or a list of strings.")
+
+    def _normalize_logprob_params(self, num):
+        """Normalize logprob-related parameters for batch processing."""
+
+        # Helper function to normalize a parameter
+        def normalize_param(param, default_value, param_name):
+            if param is None:
+                return [default_value] * num
+            elif not isinstance(param, list):
+                return [param] * num
+            else:
+                if self.parallel_sample_num > 1:
+                    raise ValueError(
+                        f"Cannot use list {param_name} with parallel_sample_num > 1"
+                    )
+                return param
+
+        # Normalize each logprob parameter
+        self.return_logprob = normalize_param(
+            self.return_logprob, False, "return_logprob"
+        )
+        self.logprob_start_len = normalize_param(
+            self.logprob_start_len, -1, "logprob_start_len"
+        )
+        self.top_logprobs_num = normalize_param(
+            self.top_logprobs_num, 0, "top_logprobs_num"
+        )
+
+        # Handle token_ids_logprob specially due to its nested structure
+        if not self.token_ids_logprob:  # covers both None and []
+            self.token_ids_logprob = [None] * num
+        elif not isinstance(self.token_ids_logprob, list):
+            self.token_ids_logprob = [[self.token_ids_logprob] for _ in range(num)]
+        elif not isinstance(self.token_ids_logprob[0], list):
+            self.token_ids_logprob = [
+                copy.deepcopy(self.token_ids_logprob) for _ in range(num)
+            ]
+        elif self.parallel_sample_num > 1:
+            raise ValueError(
+                "Cannot use list token_ids_logprob with parallel_sample_num > 1"
+            )
+
+    def _normalize_custom_logit_processor(self, num):
+        """Normalize custom logit processor for batch processing."""
+        if self.custom_logit_processor is None:
+            self.custom_logit_processor = [None] * num
+        elif not isinstance(self.custom_logit_processor, list):
+            self.custom_logit_processor = [self.custom_logit_processor] * num
+        elif self.parallel_sample_num > 1:
+            raise ValueError(
+                "Cannot use list custom_logit_processor with parallel_sample_num > 1"
+            )
+
+    def _normalize_bootstrap_params(self, num):
+        """Normalize bootstrap parameters for batch processing."""
+        # Normalize bootstrap_host
+        if self.bootstrap_host is None:
+            self.bootstrap_host = [None] * num
+        elif not isinstance(self.bootstrap_host, list):
+            self.bootstrap_host = [self.bootstrap_host] * num
+        elif isinstance(self.bootstrap_host, list):
+            self.bootstrap_host = self.bootstrap_host * self.parallel_sample_num
+
+        # Normalize bootstrap_port
+        if self.bootstrap_port is None:
+            self.bootstrap_port = [None] * num
+        elif not isinstance(self.bootstrap_port, list):
+            self.bootstrap_port = [self.bootstrap_port] * num
+        elif isinstance(self.bootstrap_port, list):
+            self.bootstrap_port = self.bootstrap_port * self.parallel_sample_num
+
+        # Normalize bootstrap_room
+        if self.bootstrap_room is None:
+            self.bootstrap_room = [None] * num
+        elif not isinstance(self.bootstrap_room, list):
+            self.bootstrap_room = [self.bootstrap_room + i for i in range(num)]
+        elif isinstance(self.bootstrap_room, list):
+            self.bootstrap_room = self.bootstrap_room * self.parallel_sample_num
+
+        # Normalize bootstrap_pair_key
+        if self.bootstrap_pair_key is None:
+            self.bootstrap_pair_key = [None] * num
+        elif not isinstance(self.bootstrap_pair_key, list):
+            self.bootstrap_pair_key = [self.bootstrap_pair_key] * num
+        elif isinstance(self.bootstrap_pair_key, list):
+            self.bootstrap_pair_key = self.bootstrap_pair_key * self.parallel_sample_num
+
+    def _validate_session_params(self):
+        """Validate that session parameters are properly formatted."""
+        if self.session_params is not None:
+            if not isinstance(self.session_params, dict) and not isinstance(
+                self.session_params[0], dict
+            ):
+                raise ValueError("Session params must be a dict or a list of dicts.")
+
+    def __getitem__(self, i):
+        return GenerateReqInput(
+            text=self.text[i] if self.text is not None else None,
+            input_ids=self.input_ids[i] if self.input_ids is not None else None,
+            input_embeds=(
+                self.input_embeds[i] if self.input_embeds is not None else None
+            ),
+            image_data=self.image_data[i],
+            video_data=self.video_data[i],
+            audio_data=self.audio_data[i],
+            sampling_params=self.sampling_params[i],
+            rid=self.rid[i],
+            return_logprob=self.return_logprob[i],
+            logprob_start_len=self.logprob_start_len[i],
+            top_logprobs_num=self.top_logprobs_num[i],
+            token_ids_logprob=self.token_ids_logprob[i],
+            return_text_in_logprobs=self.return_text_in_logprobs,
+            stream=self.stream,
+            log_metrics=self.log_metrics,
+            return_hidden_states=(
+                self.return_hidden_states[i]
+                if isinstance(self.return_hidden_states, list)
+                else self.return_hidden_states
+            ),
+            return_routed_experts=self.return_routed_experts,
+            modalities=self.modalities[i] if self.modalities else None,
+            session_params=self.session_params,
+            lora_path=self.lora_path[i] if self.lora_path is not None else None,
+            lora_id=self.lora_id[i] if self.lora_id is not None else None,
+            custom_logit_processor=(
+                self.custom_logit_processor[i]
+                if self.custom_logit_processor is not None
+                else None
+            ),
+            # if `__getitem__` is called, the bootstrap_host, bootstrap_port, bootstrap_room must be a list
+            bootstrap_host=(
+                self.bootstrap_host[i] if self.bootstrap_host is not None else None
+            ),
+            bootstrap_port=(
+                self.bootstrap_port[i] if self.bootstrap_port is not None else None
+            ),
+            bootstrap_room=(
+                self.bootstrap_room[i] if self.bootstrap_room is not None else None
+            ),
+            bootstrap_pair_key=(
+                self.bootstrap_pair_key[i]
+                if self.bootstrap_pair_key is not None
+                else None
+            ),
+            decode_tp_size=(
+                self.decode_tp_size[i] if self.decode_tp_size is not None else None
+            ),
+            routed_dp_rank=self.routed_dp_rank,
+            disagg_prefill_dp_rank=self.disagg_prefill_dp_rank,
+            conversation_id=self.conversation_id,
+            priority=self.priority,
+            extra_key=self.extra_key,
+            no_logs=self.no_logs,
+            custom_labels=self.custom_labels,
+            return_bytes=self.return_bytes,
+            return_entropy=self.return_entropy,
+            external_trace_header=self.external_trace_header,
+            http_worker_ipc=self.http_worker_ipc,
+            received_time=self.received_time,
+        )
+
+
+@dataclass
+class TokenizedGenerateReqInput(BaseReq):
+    # The input text
+    input_text: str
+    # The input token ids
+    input_ids: List[int]
+    # The multimodal inputs
+    mm_inputs: dict
+    # The sampling parameters
+    sampling_params: SamplingParams
+    # Whether to return the logprobs
+    return_logprob: bool
+    # If return logprobs, the start location in the prompt for returning logprobs.
+    logprob_start_len: int
+    # If return logprobs, the number of top logprobs to return at each position.
+    top_logprobs_num: int
+    # If return logprobs, the token id to return logprob for
+    token_ids_logprob: List[int]
+    # Whether to stream output
+    stream: bool
+
+    # Whether to return hidden states
+    return_hidden_states: bool = False
+
+    # Whether to return captured routed experts
+    return_routed_experts: bool = False
+    # The start location in the prompt for returning routed experts.
+    routed_experts_start_len: int = 0
+
+    # The input embeds
+    input_embeds: Optional[Union[List[List[List[float]]], List[List[float]]]] = None
+
+    # Session info for continual prompting
+    session_params: Optional[SessionParams] = None
+
+    # LoRA related
+    lora_id: Optional[str] = None  # None means just use the base model
+
+    # Custom logit processor for advanced sampling control. Must be a serialized instance
+    # of `CustomLogitProcessor` in python/sglang/srt/sampling/custom_logit_processor.py
+    # Use the processor's `to_str()` method to generate the serialized string.
+    custom_logit_processor: Optional[str] = None
+
+    # For disaggregated inference
+    bootstrap_host: Optional[str] = None
+    bootstrap_port: Optional[int] = None
+    bootstrap_room: Optional[int] = None
+    bootstrap_pair_key: Optional[str] = None
+    decode_tp_size: Optional[int] = None
+
+    # Require reasoning for the request (hybrid reasoning model only)
+    require_reasoning: bool = False
+
+    # For DP routing
+    routed_dp_rank: Optional[int] = None
+    # For PD disagg — hint telling decode which prefill DP worker has the KV cache
+    disagg_prefill_dp_rank: Optional[int] = None
+
+    # Priority for the request
+    priority: Optional[int] = None
+
+    # Extra key for classifying the request (e.g. cache_salt)
+    extra_key: Optional[str] = None
+
+    # Routing key for routing-key schedule policy
+    routing_key: Optional[str] = None
+
+    # Whether to disallow logging for this request (e.g. due to ZDR)
+    no_logs: bool = False
+
+    # (Internal) Whether to return bytes for image generation
+    return_bytes: bool = False
+
+    # Whether to return entropy
+    return_entropy: bool = False
+
+    need_wait_for_image: bool = False
+    num_items_assigned: Optional[List] = None
+
+    # For observability
+    time_stats: Optional[Union[APIServerReqTimeStats, DPControllerReqTimeStats]] = None
+
+
+@dataclass
+class BatchTokenizedGenerateReqInput(BaseBatchReq):
+    # The batch of tokenized requests
+    batch: List[TokenizedGenerateReqInput]
+
+    def __len__(self):
+        return len(self.batch)
+
+    def __getitem__(self, i):
+        return self.batch[i]
+
+    def __iter__(self):
+        return iter(self.batch)
+
+
+@dataclass
+class EmbeddingReqInput(BaseReq):
+    # The input prompt. It can be a single prompt or a batch of prompts.
+    text: Optional[Union[List[List[str]], List[str], str]] = None
+    # The image input. It can be an image instance, file name, URL, or base64 encoded string.
+    # Can be formatted as:
+    # - Single image for a single request
+    # - List of images (one per request in a batch)
+    # - List of lists of images (multiple images per request)
+    # See also python/sglang/srt/utils.py:load_image for more details.
+    image_data: Optional[MultimodalDataInputFormat] = None
+    # The video input. Like image data, it can be a file name, a url, or base64 encoded string.
+    video_data: Optional[MultimodalDataInputFormat] = None
+    # The audio input. Like image data, it can be a file name, a url, or base64 encoded string.
+    audio_data: Optional[MultimodalDataInputFormat] = None
+    # The token ids for text; one can either specify text or input_ids.
+    input_ids: Optional[Union[List[List[int]], List[int]]] = None
+    # Dummy sampling params for compatibility
+    sampling_params: Optional[Union[List[Dict], Dict]] = None
+    # Dummy input embeds for compatibility
+    input_embeds: Optional[Union[List[List[List[float]]], List[List[float]]]] = None
+    # Whether to log metrics for this request (e.g. health_generate calls do not log metrics)
+    log_metrics: bool = True
+    # The modalities of the image data [image, multi-images, video]
+    modalities: Optional[List[str]] = None
+    # For cross-encoder requests
+    is_cross_encoder_request: bool = False
+    # Priority for the request
+    priority: Optional[int] = None
+    # Routing key for routing-key schedule policy
+    routing_key: Optional[str] = None
+
+    # For background responses (OpenAI responses API)
+    background: bool = False
+
+    # Propagates trace context via Engine.encode/async_encode
+    external_trace_header: Optional[Dict] = None
+    received_time: Optional[float] = None
+
+    # The number of dimensions the resulting output embeddings should have. It is applicable for Matryoshka Embeddings.
+    dimensions: Optional[int] = None
+
+    # The path to the LoRA adaptors
+    lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
+    # The uid of LoRA adaptors, should be initialized by tokenizer manager
+    lora_id: Optional[Union[List[Optional[str]], Optional[str]]] = None
+
+    def normalize_batch_and_arguments(self):
+        # at least one of text, input_ids, or image should be provided
+        if self.text is None and self.input_ids is None and self.image_data is None:
+            raise ValueError(
+                "At least one of text, input_ids, or image should be provided"
+            )
+
+        # text and input_ids cannot be provided at the same time
+        if self.text is not None and self.input_ids is not None:
+            raise ValueError("text and input_ids cannot be provided at the same time")
+
+        # Derive the batch size
+        self.batch_size = 0
+        self.is_single = True
+
+        # check the batch size of text
+        if self.text is not None:
+            if isinstance(self.text, list):
+                self.batch_size += len(self.text)
+                self.is_single = False
+            else:
+                self.batch_size += 1
+
+        # check the batch size of input_ids
+        if self.input_ids is not None:
+            if isinstance(self.input_ids[0], list):
+                self.batch_size += len(self.input_ids)
+                self.is_single = False
+            else:
+                self.batch_size += 1
+
+        # Fill in default arguments
+        if self.is_single:
+            if self.rid is None:
+                self.rid = uuid.uuid4().hex
+            if self.sampling_params is None:
+                self.sampling_params = {}
+            self.sampling_params["max_new_tokens"] = 0
+        else:
+            if self.rid is None:
+                self.rid = [uuid.uuid4().hex for _ in range(self.batch_size)]
+            else:
+                assert isinstance(self.rid, list), "The rid should be a list."
+
+            if self.sampling_params is None:
+                self.sampling_params = [{}] * self.batch_size
+            elif isinstance(self.sampling_params, dict):
+                self.sampling_params = [self.sampling_params] * self.batch_size
+            for i in range(self.batch_size):
+                self.sampling_params[i]["max_new_tokens"] = 0
+
+            self._normalize_lora_paths(self.batch_size)
+
+    def _normalize_lora_paths(self, num):
+        """Normalize LoRA paths for batch processing."""
+        if self.lora_path is not None:
+            if isinstance(self.lora_path, str):
+                self.lora_path = [self.lora_path] * num
+            elif isinstance(self.lora_path, list):
+                if len(self.lora_path) != num:
+                    raise ValueError(
+                        f"lora_path list length ({len(self.lora_path)}) must match batch size ({num})"
+                    )
+            else:
+                raise ValueError("lora_path should be a list or a string.")
+
+    def contains_mm_input(self) -> bool:
+        return (
+            has_valid_data(self.image_data)
+            or has_valid_data(self.video_data)
+            or has_valid_data(self.audio_data)
+        )
+
+    def __getitem__(self, i):
+        if self.is_cross_encoder_request:
+            return EmbeddingReqInput(
+                text=[self.text[i]] if self.text is not None else None,
+                sampling_params=self.sampling_params[i],
+                rid=self.rid[i],
+                lora_path=self.lora_path[i] if self.lora_path is not None else None,
+                lora_id=self.lora_id[i] if self.lora_id is not None else None,
+                is_cross_encoder_request=True,
+                http_worker_ipc=self.http_worker_ipc,
+            )
+
+        return EmbeddingReqInput(
+            text=self.text[i] if self.text is not None else None,
+            input_ids=self.input_ids[i] if self.input_ids is not None else None,
+            image_data=self.image_data[i] if self.image_data is not None else None,
+            audio_data=self.audio_data[i] if self.audio_data is not None else None,
+            video_data=self.video_data[i] if self.video_data is not None else None,
+            sampling_params=self.sampling_params[i],
+            rid=self.rid[i],
+            lora_path=self.lora_path[i] if self.lora_path is not None else None,
+            lora_id=self.lora_id[i] if self.lora_id is not None else None,
+            external_trace_header=self.external_trace_header,
+            dimensions=self.dimensions,
+            http_worker_ipc=self.http_worker_ipc,
+            received_time=self.received_time,
+        )
+
+
+@dataclass
+class TokenizedEmbeddingReqInput(BaseReq):
+    # The input text
+    input_text: str
+    # The input token ids
+    input_ids: List[int]
+    # The image inputs
+    image_inputs: dict
+    # The token type ids
+    token_type_ids: List[int]
+    # Dummy sampling params for compatibility
+    sampling_params: SamplingParams
+    # For DP routing
+    routed_dp_rank: Optional[int] = None
+    # Priority for the request
+    priority: Optional[int] = None
+    # The number of dimensions the resulting output embeddings should have. It is applicable for Matryoshka Embeddings.
+    dimensions: Optional[int] = None
+    # LoRA related
+    lora_id: Optional[str] = None  # None means just use the base model
+    # For observability
+    time_stats: Optional[Union[APIServerReqTimeStats, DPControllerReqTimeStats]] = None
+
+
+@dataclass
+class BatchTokenizedEmbeddingReqInput(BaseBatchReq):
+    # The batch of tokenized embedding requests
+    batch: List[TokenizedEmbeddingReqInput]
+
+    def __len__(self):
+        return len(self.batch)
+
+    def __getitem__(self, i):
+        return self.batch[i]
+
+    def __iter__(self):
+        return iter(self.batch)
+
+
+@dataclass
+class BatchTokenIDOutput(BaseBatchReq, SpeculativeDecodingMetricsMixin):
+    # The finish reason
+    finished_reasons: List[BaseFinishReason]
+    # For incremental decoding
+    decoded_texts: List[str]
+    decode_ids: List[int]
+    read_offsets: List[int]
+    # Only used when `--skip-tokenizer-init` is on
+    output_ids: Optional[List[int]]
+    # Detokenization configs
+    skip_special_tokens: List[bool]
+    spaces_between_special_tokens: List[bool]
+    no_stop_trim: List[bool]
+
+    # Token counts
+    prompt_tokens: List[int]
+    completion_tokens: List[int]
+    cached_tokens: List[int]
+
+    # Logprobs
+    input_token_logprobs_val: List[float]
+    input_token_logprobs_idx: List[int]
+    output_token_logprobs_val: List[float]
+    output_token_logprobs_idx: List[int]
+    input_top_logprobs_val: List[List]
+    input_top_logprobs_idx: List[List]
+    output_top_logprobs_val: List[List]
+    output_top_logprobs_idx: List[List]
+    input_token_ids_logprobs_val: List[List]
+    input_token_ids_logprobs_idx: List[List]
+    output_token_ids_logprobs_val: List[List]
+    output_token_ids_logprobs_idx: List[List]
+    output_token_entropy_val: List[float]
+
+    # Hidden states
+    output_hidden_states: List[List[float]]
+
+    # The routed experts for each token, including both input and output tokens
+    # routed_experts[i] is a tensor of shape (token, layer, top_k) for request i
+    routed_experts: List[Optional[torch.Tensor]]
+
+    # The information of placeholder tokens (e.g., image token)
+    # idx is the index of the token in the prompt after expansion.
+    # val is the length of padded tokens after expansion.
+    placeholder_tokens_idx: List[Optional[List[int]]]
+    placeholder_tokens_val: List[Optional[List[int]]]
+
+    # Number of times each request was retracted.
+    retraction_counts: List[int]
+
+    # The trainer step id. Used to know which step's weights are used for sampling.
+    token_steps: List[List[int]] = None
+
+    # Load for DP balance
+    load: GetLoadReqOutput = None
+    # Customized info
+    customized_info: Optional[Dict[str, List[Any]]] = None
+    # Detailed breakdown of cached tokens by source (device/host/storage)
+    cached_tokens_details: Optional[List[Optional[Dict[str, Any]]]] = None
+    # DP rank of the scheduler that processed each request
+    dp_ranks: Optional[List[int]] = None
+
+    # For observability
+    time_stats: Optional[List[SchedulerReqTimeStats]] = None
+
+
+@dataclass
+class BatchMultimodalDecodeReq(BaseBatchReq):
+    decoded_ids: List[int]
+    input_token_logprobs_val: List[float]
+    input_token_logprobs_idx: List[int]
+    output_token_logprobs_val: List[float]
+    output_token_logprobs_idx: List[int]
+    read_offsets: List[int]
+    skip_special_tokens: List[bool]
+    spaces_between_special_tokens: List[bool]
+    image_resolutions: List[List[int]]
+    resize_image_resolutions: List[List[int]]
+
+    finished_reasons: List[BaseFinishReason]
+
+    # Token counts
+    prompt_tokens: List[int]
+    completion_tokens: List[int]
+    cached_tokens: List[int]
+
+    # The information of placeholder tokens (e.g., image token)
+    # idx is the index of the token in the prompt after expansion.
+    # val is the length of padded tokens after expansion.
+    placeholder_tokens_idx: List[Optional[List[int]]]
+    placeholder_tokens_val: List[Optional[List[int]]]
+
+    return_bytes: List[bool]
+
+    # The trainer step id. Used to know which step's weights are used for sampling.
+    token_steps: List[List[int]] = None
+
+
+@dataclass
+class BatchStrOutput(BaseBatchReq, SpeculativeDecodingMetricsMixin):
+    # The finish reason
+    finished_reasons: List[dict]
+    # The output decoded strings
+    output_strs: List[str]
+    # The token ids
+    output_ids: Optional[List[int]]
+
+    # Token counts
+    prompt_tokens: List[int]
+    completion_tokens: List[int]
+    cached_tokens: List[int]
+
+    # Logprobs
+    input_token_logprobs_val: List[float]
+    input_token_logprobs_idx: List[int]
+    output_token_logprobs_val: List[float]
+    output_token_logprobs_idx: List[int]
+    input_top_logprobs_val: List[List]
+    input_top_logprobs_idx: List[List]
+    output_top_logprobs_val: List[List]
+    output_top_logprobs_idx: List[List]
+    input_token_ids_logprobs_val: List[List]
+    input_token_ids_logprobs_idx: List[List]
+    output_token_ids_logprobs_val: List[List]
+    output_token_ids_logprobs_idx: List[List]
+    output_token_entropy_val: List[float]
+
+    # Hidden states
+    output_hidden_states: List[List[float]]
+
+    # The routed experts for each token, including both input and output tokens
+    # routed_experts[i] is a tensor of shape (token, layer, top_k) for request i
+    routed_experts: List[Optional[torch.Tensor]]
+
+    # The information of placeholder tokens (e.g., image token)
+    # idx is the index of the token in the prompt after expansion.
+    # val is the length of padded tokens after expansion.
+    placeholder_tokens_idx: List[Optional[List[int]]]
+    placeholder_tokens_val: List[Optional[List[int]]]
+
+    # Number of times each request was retracted.
+    retraction_counts: List[int]
+
+    # The trainer step id. Used to know which step's weights are used for sampling.
+    token_steps: List[List[int]] = None
+
+    # Load for DP balance
+    load: GetLoadReqOutput = None
+
+    # Customized info
+    customized_info: Optional[Dict[str, List[Any]]] = None
+    # Detailed breakdown of cached tokens by source (device/host/storage)
+    cached_tokens_details: Optional[List[Optional[Dict[str, Any]]]] = None
+    # DP rank of the scheduler that processed each request
+    dp_ranks: Optional[List[int]] = None
+
+    # For observability
+    time_stats: Optional[List[SchedulerReqTimeStats]] = None
+
+
+@dataclass
+class BatchMultimodalOutput(BaseBatchReq):
+    # The finish reason
+    finished_reasons: List[dict]
+    decoded_ids: List[List[int]]
+    # The outputs
+    outputs: Union[List[str | bytes], List[List[Dict]]]
+
+    # probability values for input tokens and output tokens
+    input_token_logprobs_val: List[List[float]]
+    input_token_logprobs_idx: List[List[int]]
+    output_token_logprobs_val: List[List[float]]
+    output_token_logprobs_idx: List[List[int]]
+
+    # Token counts
+    prompt_tokens: List[int]
+    completion_tokens: List[int]
+    cached_tokens: List[int]
+
+    placeholder_tokens_idx: List[Optional[List[int]]]
+    placeholder_tokens_val: List[Optional[List[int]]]
+
+    return_bytes: List[bool]
+    # Detailed breakdown of cached tokens by source (device/host/storage)
+    cached_tokens_details: Optional[List[Optional[Dict[str, Any]]]] = None
+
+    # For observability
+    time_stats: Optional[List[SchedulerReqTimeStats]] = None
+
+
+@dataclass
+class BatchEmbeddingOutput(BaseBatchReq):
+    # The finish reason
+    finished_reasons: List[BaseFinishReason]
+    # The output embedding
+    embeddings: Union[List[List[float]], List[Dict[int, float]]]
+    # Token counts
+    prompt_tokens: List[int]
+    cached_tokens: List[int]
+    # Placeholder token info
+    placeholder_tokens_idx: List[Optional[List[int]]]
+    placeholder_tokens_val: List[Optional[List[int]]]
+
+    # Number of times each request was retracted.
+    retraction_counts: List[int]
+    # Detailed breakdown of cached tokens by source (device/host/storage)
+    cached_tokens_details: Optional[List[Optional[Dict[str, Any]]]] = None
+
+    # For observability
+    time_stats: Optional[List[SchedulerReqTimeStats]] = None
+
+
+@dataclass
+class ClearHiCacheReqInput(BaseReq):
+    pass
+
+
+@dataclass
+class ClearHiCacheReqOutput(BaseReq):
+    success: bool
+
+
+@dataclass
+class FlushCacheReqInput(BaseReq):
+    pass
+
+
+@dataclass
+class FlushCacheReqOutput(BaseReq):
+    success: bool
+
+
+@dataclass
+class AttachHiCacheStorageReqInput(BaseReq):
+    """Dynamically attach (enable) HiCache storage backend at runtime.
+
+    Note: `hicache_storage_backend_extra_config_json` is a JSON string. It may contain both:
+    - backend-specific configs (e.g., mooncake master address)
+    - prefetch-related knobs (prefetch_threshold, prefetch_timeout_*, hicache_storage_pass_prefix_keys)
+    """
+
+    hicache_storage_backend: str
+    hicache_storage_backend_extra_config_json: Optional[str] = None
+    hicache_storage_prefetch_policy: Optional[str] = None
+    hicache_write_policy: Optional[str] = None
+
+    def __post_init__(self):
+        if self.hicache_storage_prefetch_policy is None:
+            pass
+        else:
+            allowed = ["best_effort", "wait_complete", "timeout"]
+            if self.hicache_storage_prefetch_policy not in allowed:
+                raise ValueError(
+                    f"Invalid hicache_storage_prefetch_policy: {self.hicache_storage_prefetch_policy!r}. "
+                    f"Expected one of {allowed}."
+                )
+
+        if self.hicache_write_policy is None:
+            return
+        allowed = ["write_back", "write_through", "write_through_selective"]
+        if self.hicache_write_policy not in allowed:
+            raise ValueError(
+                f"Invalid hicache_write_policy: {self.hicache_write_policy!r}. "
+                f"Expected one of {allowed}."
+            )
+
+
+@dataclass
+class AttachHiCacheStorageReqOutput(BaseReq):
+    success: bool
+    message: str = ""
+
+
+@dataclass
+class DetachHiCacheStorageReqInput(BaseReq):
+    """Dynamically detach (disable) HiCache storage backend at runtime."""
+
+    pass
+
+
+@dataclass
+class DetachHiCacheStorageReqOutput(BaseReq):
+    success: bool
+    message: str = ""
+
+
+@dataclass
+class PinPrefixReqInput(BaseReq):
+    """Pin a prefix by token_ids to resist eviction."""
+
+    token_ids: List[int] = field(default_factory=list)
+    ttl_seconds: int = 300  # TTL in seconds, default 5 minutes
+
+
+@dataclass
+class PinPrefixReqOutput(BaseReq):
+    success: bool
+    nodes_pinned: int = 0
+    message: str = ""
+
+
+@dataclass
+class PauseGenerationReqInput(BaseReq):
+    """
+    Note that the PauseGenerationRequests is only supported in SGLang Server.
+    abort: Abort and return all requests currently being processed.
+
+    in_place: Pause the scheduler's event_loop from performing inference;
+            only non-inference requests (e.g., control commands) will be handled.
+            The requests in the engine will be paused and stay in the event_loop,
+            then continue generation after continue_generation with the old kv cache.
+            Note: In 'inplace' mode, flush_cache will fail if there are any requests
+            in the running_batch.
+
+    retract: Pause the scheduler's event loop from performing inference;
+            only non-inference requests will be handled, and all currently running
+            requests will be retracted back to the waiting_queue.
+            Note: The KV cache can be flushed in this mode and will be automatically
+            recomputed after continue_generation.
+    """
+
+    mode: Literal["abort", "retract", "in_place"] = "abort"
+
+    def __post_init__(self):
+        allowed = ["abort", "retract", "in_place"]
+        if self.mode not in allowed:
+            raise ValueError(
+                f"Invalid mode: {self.mode!r}. " f"Expected one of {allowed}."
+            )
+
+
+@dataclass
+class ContinueGenerationReqInput(BaseReq):
+    pass
+
+
+@dataclass
+class UpdateWeightFromDiskReqInput(BaseReq):
+    # The model path with the new weights
+    model_path: str
+    # The format to load the weights
+    load_format: Optional[str] = None
+    # Whether to abort all requests before updating weights
+    abort_all_requests: bool = False
+    # Optional: Update weight version along with weights
+    weight_version: Optional[str] = None
+    # Whether to update weights asynchronously
+    is_async: bool = False
+    # Whether to empty torch cache
+    torch_empty_cache: bool = False
+    # Whether to keep the scheduler paused after weight update
+    keep_pause: bool = False
+    # Whether to recapture cuda graph after weight update
+    recapture_cuda_graph: bool = False
+    # The trainer step id. Used to know which step's weights are used for sampling.
+    token_step: int = 0
+    # Whether to flush the cache after updating weights
+    flush_cache: bool = True
+    # Tensor metadata
+    manifest: Optional[Dict[str, Any]] = None
+
+
+@dataclass
+class UpdateWeightFromDiskReqOutput(BaseReq):
+    success: bool
+    message: str
+    # Number of paused requests during weight sync.
+    num_paused_requests: Optional[int] = 0
+
+
+@dataclass
+class UpdateWeightsFromDistributedReqInput(BaseReq):
+    names: List[str]
+    dtypes: List[str]
+    shapes: List[List[int]]
+    # The group name
+    group_name: str = "weight_update_group"
+    # Whether to flush the cache after updating weights
+    flush_cache: bool = True
+    # Whether to abort all requests before updating weights
+    abort_all_requests: bool = False
+    # Optional: Update weight version along with weights
+    weight_version: Optional[str] = None
+    # Optional format specification for loading
+    load_format: Optional[str] = None
+
+
+@dataclass
+class UpdateWeightsFromDistributedReqOutput(BaseReq):
+    success: bool
+    message: str
+
+
+@dataclass
+class UpdateWeightsFromTensorReqInput(BaseReq):
+    """Update model weights from tensor input.
+
+    - Tensors are serialized for transmission
+    - Data is structured in JSON for easy transmission over HTTP
+    """
+
+    serialized_named_tensors: List[Union[str, bytes]]
+    # Optional format specification for loading
+    load_format: Optional[str] = None
+    # Whether to flush the cache after updating weights
+    flush_cache: bool = True
+    # Whether to abort all requests before updating weights
+    abort_all_requests: bool = False
+    # Optional: Update weight version along with weights
+    weight_version: Optional[str] = None
+    # Optional: Determine whether to disable updating the draft model
+    disable_draft_model: Optional[bool] = None
+
+
+@dataclass
+class UpdateWeightsFromTensorReqOutput(BaseReq):
+    success: bool
+    message: str
+
+
+@dataclass
+class InitWeightsSendGroupForRemoteInstanceReqInput(BaseReq):
+    # The master address
+    master_address: str
+    # The ports for each rank's communication group
+    ports: str
+    # The rank in the communication group
+    group_rank: int
+    # The world size
+    world_size: int
+    # The group name
+    group_name: str = "weight_send_group"
+    # The backend
+    backend: str = "nccl"
+
+
+# Now UpdateWeightsFromIPCReqInput and UpdateWeightsFromIPCReqOutput
+# are only used by Checkpoint Engine (https://github.com/MoonshotAI/checkpoint-engine)
+@dataclass
+class UpdateWeightsFromIPCReqInput(BaseReq):
+    # ZMQ socket paths for each device UUID
+    zmq_handles: Dict[str, str]
+    # Whether to flush cache after weight update
+    flush_cache: bool = True
+    # Optional: Update weight version along with weights
+    weight_version: Optional[str] = None
+
+
+@dataclass
+class UpdateWeightsFromIPCReqOutput(BaseReq):
+    success: bool
+    message: str
+
+
+@dataclass
+class InitWeightsSendGroupForRemoteInstanceReqOutput(BaseReq):
+    success: bool
+    message: str
+
+
+@dataclass
+class SendWeightsToRemoteInstanceReqInput(BaseReq):
+    # The master address
+    master_address: str
+    # The ports for each rank's communication group
+    ports: str
+    # The group name
+    group_name: str = "weight_send_group"
+
+
+@dataclass
+class SendWeightsToRemoteInstanceReqOutput(BaseReq):
+    success: bool
+    message: str
+
+
+@dataclass
+class UpdateExpertBackupReq(BaseReq):
+    pass
+
+
+@dataclass
+class BackupDramReq(BaseReq):
+    rank: int
+    weight_pointer_map: Dict[str, Any]
+    session_id: str
+    buffer_size: int
+
+
+@dataclass
+class InitWeightsUpdateGroupReqInput(BaseReq):
+    # The master address
+    master_address: str
+    # The master port
+    master_port: int
+    # The rank offset
+    rank_offset: int
+    # The world size
+    world_size: int
+    # The group name
+    group_name: str = "weight_update_group"
+    # The backend
+    backend: str = "nccl"
+
+
+@dataclass
+class InitWeightsUpdateGroupReqOutput(BaseReq):
+    success: bool
+    message: str
+
+
+@dataclass
+class DestroyWeightsUpdateGroupReqInput(BaseReq):
+    group_name: str = "weight_update_group"
+
+
+@dataclass
+class DestroyWeightsUpdateGroupReqOutput(BaseReq):
+    success: bool
+    message: str
+
+
+@dataclass
+class UpdateWeightVersionReqInput(BaseReq):
+    # The new weight version
+    new_version: str
+    # Whether to abort all running requests before updating
+    abort_all_requests: bool = True
+
+
+@dataclass
+class GetWeightsByNameReqInput(BaseReq):
+    name: str
+    truncate_size: int = 100
+
+
+@dataclass
+class GetWeightsByNameReqOutput(BaseReq):
+    parameter: list
+
+
+@dataclass
+class ReleaseMemoryOccupationReqInput(BaseReq):
+    # Optional tags to identify the memory region, which is primarily used for RL
+    # Currently we only support `weights` and `kv_cache`
+    tags: Optional[List[str]] = None
+
+
+@dataclass
+class ReleaseMemoryOccupationReqOutput(BaseReq):
+    pass
+
+
+@dataclass
+class ResumeMemoryOccupationReqInput(BaseReq):
+    # Optional tags to identify the memory region, which is primarily used for RL
+    # Currently we only support `weights` and `kv_cache`
+    tags: Optional[List[str]] = None
+
+
+@dataclass
+class ResumeMemoryOccupationReqOutput(BaseReq):
+    pass
+
+
+@dataclass
+class CheckWeightsReqInput(BaseReq):
+    action: str
+
+
+@dataclass
+class CheckWeightsReqOutput(BaseReq):
+    success: bool
+    message: str
+
+
+@dataclass
+class SlowDownReqInput(BaseReq):
+    forward_sleep_time: Optional[float]
+
+
+@dataclass
+class SlowDownReqOutput(BaseReq):
+    pass
+
+
+@dataclass
+class AbortReq(BaseReq):
+    # Whether to abort all requests
+    abort_all: bool = False
+    # The finished reason data
+    finished_reason: Optional[Dict[str, Any]] = None
+    abort_message: Optional[str] = None
+
+    def __post_init__(self):
+        # FIXME: This is a hack to keep the same with the old code
+        if self.rid is None:
+            self.rid = ""
+
+
+@dataclass
+class ActiveRanksOutput(BaseReq):
+    status: List[bool]
+
+
+@dataclass
+class GetInternalStateReq(BaseReq):
+    pass
+
+
+@dataclass
+class GetInternalStateReqOutput(BaseReq):
+    internal_state: Dict[Any, Any]
+
+
+@dataclass
+class SetInternalStateReq(BaseReq):
+    server_args: Dict[str, Any]
+
+
+@dataclass
+class SetInternalStateReqOutput(BaseReq):
+    updated: bool
+    server_args: Dict[str, Any]
+
+
+@dataclass
+class ProfileReqInput(BaseReq):
+    # The output directory
+    output_dir: Optional[str] = None
+    # Specify the steps to start the profiling
+    start_step: Optional[int] = None
+    # If set, it profile as many as this number of steps.
+    # If it is set, profiling is automatically stopped after this step, and
+    # the caller doesn't need to run stop_profile.
+    num_steps: Optional[int] = None
+    # The activities to record. The choices are ["CPU", "GPU", "MEM", "RPD"]
+    activities: Optional[List[str]] = None
+    # Whether profile by stages (e.g., prefill and decode) separately
+    profile_by_stage: bool = False
+    # Whether to record source information (file and line number) for the ops.
+    with_stack: Optional[bool] = None
+    # Whether to save information about operator’s input shapes.
+    record_shapes: Optional[bool] = None
+    # Merge profiles from all ranks into a single trace
+    merge_profiles: bool = False
+    # The prefix of the profile filenames
+    profile_prefix: Optional[str] = None
+    # Only profile these stages and ignore others
+    profile_stages: Optional[List[str]] = None
+
+
+class ProfileReqType(Enum):
+    START_PROFILE = 1
+    STOP_PROFILE = 2
+
+
+@dataclass
+class ProfileReq(BaseReq):
+    type: ProfileReqType
+    output_dir: Optional[str] = None
+    start_step: Optional[int] = None
+    num_steps: Optional[int] = None
+    activities: Optional[List[str]] = None
+    profile_by_stage: bool = False
+    with_stack: Optional[bool] = None
+    record_shapes: Optional[bool] = None
+    profile_id: Optional[str] = None
+    merge_profiles: bool = False
+    profile_prefix: Optional[str] = None
+    profile_stages: Optional[List[str]] = None
+
+
+@dataclass
+class ProfileReqOutput(BaseReq):
+    success: bool
+    message: str
+
+
+@dataclass
+class FreezeGCReq(BaseReq):
+    pass
+
+
+@dataclass
+class ConfigureLoggingReq(BaseReq):
+    log_requests: Optional[bool] = None
+    log_requests_level: Optional[int] = None
+    log_requests_format: Optional[str] = None
+    dump_requests_folder: Optional[str] = None
+    dump_requests_threshold: Optional[int] = None
+    crash_dump_folder: Optional[str] = None
+
+
+@dataclass
+class OpenSessionReqInput(BaseReq):
+    capacity_of_str_len: int
+    session_id: Optional[str] = None
+    streaming: Optional[bool] = None
+    timeout: Optional[float] = None
+
+
+@dataclass
+class CloseSessionReqInput(BaseReq):
+    session_id: str
+
+
+@dataclass
+class OpenSessionReqOutput(BaseReq):
+    session_id: Optional[str]
+    success: bool
+
+
+@dataclass
+class HealthCheckOutput(BaseReq):
+    pass
+
+
+class ExpertDistributionReqType(Enum):
+    START_RECORD = 1
+    STOP_RECORD = 2
+    DUMP_RECORD = 3
+
+
+@dataclass
+class ExpertDistributionReq(BaseReq):
+    action: ExpertDistributionReqType
+
+
+@dataclass
+class ExpertDistributionReqOutput(BaseReq):
+    pass
+
+
+@dataclass
+class Function:
+    description: Optional[str] = None
+    name: Optional[str] = None
+    parameters: Optional[object] = None
+
+
+@dataclass
+class Tool:
+    function: Function
+    type: Optional[str] = "function"
+
+
+@dataclass
+class ParseFunctionCallReq(BaseReq):
+    text: str  # The text to parse.
+    tools: List[Tool] = field(
+        default_factory=list
+    )  # A list of available function tools (name, parameters, etc.).
+    tool_call_parser: Optional[str] = (
+        None  # Specify the parser type, e.g. 'llama3', 'qwen25', or 'mistral'. If not specified, tries all.
+    )
+
+
+@dataclass
+class SeparateReasoningReqInput(BaseReq):
+    text: str  # The text to parse.
+    reasoning_parser: str  # Specify the parser type, e.g., "deepseek-r1".
+
+
+@dataclass
+class VertexGenerateReqInput(BaseReq):
+    instances: List[dict]
+    parameters: Optional[dict] = None
+
+
+@dataclass
+class RpcReqInput(BaseReq):
+    method: str
+    parameters: Optional[Dict] = None
+
+
+@dataclass
+class RpcReqOutput(BaseReq):
+    success: bool
+    message: str
+
+
+@dataclass
+class LoadLoRAAdapterReqInput(BaseReq):
+    # The name of the lora module to newly loaded.
+    lora_name: str
+    # The path of loading.
+    lora_path: str
+    # Whether to pin the LoRA adapter in memory.
+    pinned: bool = False
+    # The unique identifier for the LoRA adapter, which automatically generated in the `TokenizerManager`.
+    lora_id: Optional[str] = None
+
+    def to_ref(self) -> LoRARef:
+        return LoRARef(
+            lora_id=self.lora_id,
+            lora_name=self.lora_name,
+            lora_path=self.lora_path,
+            pinned=self.pinned,
+        )
+
+
+@dataclass
+class UnloadLoRAAdapterReqInput(BaseReq):
+    # The name of lora module to unload.
+    lora_name: str
+    # The unique identifier for the LoRA adapter, which automatically generated in the `TokenizerManager`.
+    lora_id: Optional[str] = None
+
+    def to_ref(self) -> LoRARef:
+        return LoRARef(
+            lora_id=self.lora_id,
+            lora_name=self.lora_name,
+        )
+
+
+@dataclass
+class LoadLoRAAdapterFromTensorsReqInput(BaseReq):
+    lora_name: str
+    config_dict: Dict[str, Any]
+    serialized_tensors: str
+    pinned: bool = False
+    added_tokens_config: Optional[Dict[str, Any]] = None
+    lora_id: Optional[str] = None
+    load_format: Optional[str] = None
+
+    def to_ref(self) -> LoRARef:
+        return LoRARef(
+            lora_id=self.lora_id,
+            lora_name=self.lora_name,
+            lora_path="__tensor__",
+            pinned=self.pinned,
+        )
+
+
+@dataclass
+class LoRAUpdateOutput(BaseReq):
+    success: bool
+    error_message: Optional[str] = None
+    loaded_adapters: Optional[Dict[str, LoRARef]] = None
+
+
+LoadLoRAAdapterReqOutput = UnloadLoRAAdapterReqOutput = (
+    LoadLoRAAdapterFromTensorsReqOutput
+) = LoRAUpdateOutput
+
+
+class BlockReqType(Enum):
+    BLOCK = 1
+    UNBLOCK = 2
+
+
+@dataclass
+class BlockReqInput(BaseReq):
+    type: BlockReqType
+
+
+@dataclass
+class GetLoadReqInput(BaseReq):
+    pass
+
+
+@dataclass
+class GetLoadReqOutput(BaseReq):
+    dp_rank: int
+    num_reqs: int
+    num_waiting_reqs: int
+    num_tokens: int
+    ts_tic: float
+
+
+@dataclass
+class MemoryMetrics:
+    """Memory breakdown metrics."""
+
+    weight_gb: float = field(
+        metadata={"metric": ("gauge", "Model weight memory in GB")}
+    )
+    kv_cache_gb: float = field(metadata={"metric": ("gauge", "KV cache memory in GB")})
+    graph_gb: float = field(metadata={"metric": ("gauge", "CUDA graph memory in GB")})
+    token_capacity: int = field(
+        metadata={"metric": ("gauge", "Max tokens in KV cache")}
+    )
+
+
+@dataclass
+class SpeculativeMetrics:
+    """Speculative decoding metrics."""
+
+    accept_length: float = field(
+        metadata={"metric": ("gauge", "Avg accepted tokens per step")}
+    )
+    accept_rate: float = field(
+        metadata={"metric": ("gauge", "Speculative acceptance rate")}
+    )
+
+
+@dataclass
+class LoRAMetrics:
+    """LoRA adapter pool metrics."""
+
+    slots_used: int = field(metadata={"metric": ("gauge", "LoRA adapter slots in use")})
+    slots_total: int = field(metadata={"metric": ("gauge", "Total LoRA adapter slots")})
+    utilization: float = field(
+        metadata={"metric": ("gauge", "LoRA pool utilization ratio")}
+    )
+
+
+@dataclass
+class DisaggregationMetrics:
+    """PD disaggregation metrics."""
+
+    mode: str  # "prefill", "decode", or "null" - not a metric
+    prefill_prealloc_queue_reqs: int = field(
+        default=0, metadata={"metric": ("gauge", "Prefill prealloc queue requests")}
+    )
+    prefill_inflight_queue_reqs: int = field(
+        default=0, metadata={"metric": ("gauge", "Prefill inflight queue requests")}
+    )
+    decode_prealloc_queue_reqs: int = field(
+        default=0, metadata={"metric": ("gauge", "Decode prealloc queue requests")}
+    )
+    decode_transfer_queue_reqs: int = field(
+        default=0, metadata={"metric": ("gauge", "Decode transfer queue requests")}
+    )
+    decode_retracted_queue_reqs: int = field(
+        default=0, metadata={"metric": ("gauge", "Decode retracted queue requests")}
+    )
+    kv_transfer_speed_gb_s: float = field(
+        default=0.0, metadata={"metric": ("gauge", "KV transfer speed in GB/s")}
+    )
+    kv_transfer_latency_ms: float = field(
+        default=0.0, metadata={"metric": ("gauge", "KV transfer latency in ms")}
+    )
+
+
+@dataclass
+class QueueMetrics:
+    """Detailed queue breakdown."""
+
+    waiting: int = field(metadata={"metric": ("gauge", "Main waiting queue size")})
+    grammar: int = field(
+        metadata={"metric": ("gauge", "Grammar compilation queue size")}
+    )
+    paused: int = field(
+        metadata={"metric": ("gauge", "Requests paused by weight sync")}
+    )
+    retracted: int = field(metadata={"metric": ("gauge", "Retracted requests count")})
+
+
+@dataclass
+class GetLoadsReqInput(BaseReq):
+    """Request for /v1/loads endpoint."""
+
+    VALID_SECTIONS = frozenset(
+        {"core", "memory", "spec", "lora", "disagg", "queues", "all"}
+    )
+
+    include: List[str] = field(default_factory=lambda: ["all"])
+    dp_rank: Optional[int] = None
+
+    def __post_init__(self):
+        """Validate include sections."""
+        if self.include:
+            invalid = set(self.include) - self.VALID_SECTIONS
+            if invalid:
+                raise ValueError(
+                    f"Invalid include sections: {invalid}. "
+                    f"Valid options: {sorted(self.VALID_SECTIONS)}"
+                )
+
+
+@dataclass
+class GetLoadsReqOutput(BaseReq):
+    """Per-DP-rank load metrics for /v1/loads endpoint."""
+
+    dp_rank: int
+    timestamp: float
+
+    num_running_reqs: int = field(
+        metadata={"metric": ("gauge", "Number of running requests")}
+    )
+    num_waiting_reqs: int = field(
+        metadata={"metric": ("gauge", "Number of waiting requests")}
+    )
+    num_used_tokens: int = field(
+        metadata={"metric": ("gauge", "Number of tokens in use")}
+    )
+    max_total_num_tokens: int = field(
+        metadata={"metric": ("gauge", "Maximum token capacity")}
+    )
+    token_usage: float = field(metadata={"metric": ("gauge", "Token pool usage ratio")})
+    gen_throughput: float = field(
+        metadata={"metric": ("gauge", "Generation throughput tokens/sec")}
+    )
+    cache_hit_rate: float = field(
+        metadata={"metric": ("gauge", "Prefix cache hit rate")}
+    )
+    utilization: float = field(
+        metadata={"metric": ("gauge", "Overall utilization ratio")}
+    )
+    max_running_requests: int = field(
+        metadata={"metric": ("gauge", "Maximum running requests capacity")}
+    )
+
+    memory: Optional[MemoryMetrics] = None
+    speculative: Optional[SpeculativeMetrics] = None
+    lora: Optional[LoRAMetrics] = None
+    disaggregation: Optional[DisaggregationMetrics] = None
+    queues: Optional[QueueMetrics] = None
+
+
+@dataclass
+class WatchLoadUpdateReq(BaseReq):
+    loads: List[GetLoadReqOutput]
+
+
+@dataclass
+class SetInjectDumpMetadataReqInput(BaseReq):
+    dump_metadata: Dict[str, Any]
+
+
+@dataclass
+class SetInjectDumpMetadataReqOutput(BaseReq):
+    success: bool
+
+
+@dataclass
+class LazyDumpTensorsReqInput(BaseReq):
+    pass
+
+
+@dataclass
+class LazyDumpTensorsReqOutput(BaseReq):
+    success: bool
+
+
+@dataclass
+class DumperControlReqInput(BaseReq):
+    method: str
+    body: Dict[str, Any]
+
+
+@dataclass
+class DumperControlReqOutput(BaseReq):
+    success: bool
+    response: List[Dict[str, Any]]
+    error: str = ""
+
+
+def _check_all_req_types():
+    """A helper function to check all request types are defined in this file."""
+    import inspect
+    import sys
+
+    all_classes = inspect.getmembers(sys.modules[__name__], inspect.isclass)
+    for class_type in all_classes:
+        # check its name
+        name = class_type[0]
+        is_io_struct = (
+            name.endswith("Req") or name.endswith("Input") or name.endswith("Output")
+        )
+        is_base_req = issubclass(class_type[1], BaseReq) or issubclass(
+            class_type[1], BaseBatchReq
+        )
+        if is_io_struct and not is_base_req:
+            raise ValueError(f"{name} is not a subclass of BaseReq or BaseBatchReq.")
+        if is_base_req and not is_io_struct:
+            raise ValueError(
+                f"{name} is a subclass of BaseReq but not follow the naming convention."
+            )
+
+
+_check_all_req_types()
diff --git a/sglang/python/sglang/srt/managers/mm_utils.py b/sglang/python/sglang/srt/managers/mm_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..48cd1e7fc51ff1d941118a0cec219d0d67ce6c1d
--- /dev/null
+++ b/sglang/python/sglang/srt/managers/mm_utils.py
@@ -0,0 +1,1726 @@
+"""
+Multi-modality utils
+"""
+
+import copy
+import hashlib
+import pickle
+from abc import abstractmethod
+from collections import defaultdict
+from multiprocessing import shared_memory
+from typing import Any, Callable, Dict, List, Literal, Optional, Tuple
+
+import numpy as np
+import torch
+from torch import nn
+
+from sglang.srt.environ import envs
+from sglang.srt.layers.multimodal import gpu_tensor_hash
+from sglang.srt.managers.schedule_batch import (
+    CudaIpcTensorTransportProxy,
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+)
+from sglang.srt.mem_cache.multimodal_cache import EmbeddingResult, MultiModalStaticCache
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.multimodal.evs import EVSEmbeddingResult
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import flatten_nested_list, is_npu, print_warning_once
+from sglang.utils import logger
+
+_is_npu = is_npu()
+
+# NOTE: Using the shared logger from sglang.utils instead of creating a module-specific logger
+# to ensure consistent logging behavior across the codebase. This prevents issues with log
+# propagation that can cause some log messages (like 'server is fired up') to not appear
+# in the console when multimodal support is enabled.
+
+# TODO(mick): nccl
+# cuda_ipc: for intranode tensor sharing
+TensorTransportMode = Literal["cuda_ipc", "auto", "default"]
+
+
+_GPU_FEATURE_BUFFER: Optional[torch.Tensor] = None
+_BUFFER_OFFSET = 0
+
+_EXTRA_PRE_TOKENS = 0  # pre chunk extra token (0 for the moment)
+_EXTRA_POST_TOKENS = 0  # post chunk extra token (0 for the moment)
+
+_is_default_tensor_transport = None
+
+
+def init_feature_buffer(device):
+    global _GPU_FEATURE_BUFFER, _BUFFER_OFFSET
+    if (
+        device == "cpu"
+        or envs.SGLANG_MM_BUFFER_SIZE_MB.get() == 0
+        or _GPU_FEATURE_BUFFER is not None
+    ):
+        return
+    try:
+        size_mb = envs.SGLANG_MM_BUFFER_SIZE_MB.get()
+        num_elements = int(size_mb * 1024 * 1024 / 4)
+        _GPU_FEATURE_BUFFER = torch.empty(
+            num_elements, dtype=torch.float32, device=device
+        )
+        logger.info(f"Preallocated {size_mb}MB GPU buffer")
+    except RuntimeError as e:
+        _GPU_FEATURE_BUFFER = None
+
+
+def reset_buffer_offset():
+    global _BUFFER_OFFSET
+    _BUFFER_OFFSET = 0
+
+
+def is_feature_buffer_initialized():
+    global _GPU_FEATURE_BUFFER
+    if _GPU_FEATURE_BUFFER is None:
+        return False
+    return True
+
+
+def try_add_to_buffer(tensor: torch.Tensor) -> Optional[torch.Tensor]:
+    global _BUFFER_OFFSET
+
+    if _GPU_FEATURE_BUFFER is None:
+        return tensor
+
+    tensor_size = tensor.numel()
+
+    if _BUFFER_OFFSET + tensor_size <= _GPU_FEATURE_BUFFER.numel():
+        buffer_view = _GPU_FEATURE_BUFFER[_BUFFER_OFFSET : _BUFFER_OFFSET + tensor_size]
+        buffer_view.copy_(tensor.flatten(), non_blocking=True)
+        result = buffer_view.view(tensor.shape)
+        _BUFFER_OFFSET += tensor_size
+        return result
+    else:
+        return tensor
+
+
+class TransportProxyTensor(torch.Tensor):
+    """
+    A convenient torch.Tensor subclass that carries extra metadata and supports
+    efficient inter-process communications
+    """
+
+    @staticmethod
+    def __new__(
+        cls,
+        data: torch.Tensor,
+        name: Optional[str] = None,
+        fields: Optional[Dict[str, Any]] = None,
+        transport_mode: TensorTransportMode = "default",
+        *args,
+        **kwargs,
+    ):
+
+        if not isinstance(data, torch.Tensor):
+            raise TypeError(
+                f"Input 'data' must be a torch.Tensor, but got {type(data)}"
+            )
+
+        instance = data.as_subclass(cls)
+
+        instance._metadata = {
+            "name": name,
+            "fields": fields if fields is not None else {},
+            "transport_mode": transport_mode,
+        }
+
+        return instance
+
+    def __getstate__(self):
+        """
+        Called during pickling. Implements the serialization logic.
+        """
+        # acquire all serialize metadata from _metadata
+        state = {
+            "metadata": self._metadata,
+            "tensor_data": None,
+            "ipc_extra": None,
+        }
+        transport_mode = self._metadata.get("transport_mode", "default")
+
+        if transport_mode == "cuda_ipc" and self.is_cuda:
+            try:
+                storage = self.untyped_storage()
+                handle = storage._share_cuda_()
+
+                state["ipc_extra"] = {
+                    "handle": handle,
+                    "shape": self.shape,
+                    "dtype": self.dtype,
+                    "stride": self.stride(),
+                    "device_index": self.device.index,
+                    "storage_offset": self.storage_offset(),
+                }
+                state["tensor_data"] = None
+            except Exception as e:
+                # Failed to get CUDA IPC handle (possibly tp). Falling back to default transport.
+                state["metadata"]["transport_mode"] = "default"
+                state["tensor_data"] = self.as_subclass(torch.Tensor)
+        else:
+            state["metadata"]["transport_mode"] = "default"
+            state["tensor_data"] = self.as_subclass(torch.Tensor)
+
+        return state
+
+    def __setstate__(self, state: Dict[str, Any]):
+        """
+        Called during unpickling. Implements the deserialization logic.
+        """
+        self._metadata = state["metadata"]
+
+        transport_mode = self._metadata.get("transport_mode", "default")
+
+        if transport_mode == "cuda_ipc" and state["ipc_extra"] is not None:
+            ipc_extra = state["ipc_extra"]
+            handle, shape, dtype, stride, source_device_index, s_offset = (
+                ipc_extra["handle"],
+                ipc_extra["shape"],
+                ipc_extra["dtype"],
+                ipc_extra["stride"],
+                ipc_extra["device_index"],
+                ipc_extra["storage_offset"],
+            )
+
+            try:
+                target_device = torch.device(f"cuda:{source_device_index}")
+                with torch.cuda.device(target_device):
+                    storage = torch.UntypedStorage._new_shared_cuda(*handle)
+                    reconstructed_tensor = torch.empty(
+                        0, dtype=dtype, device=target_device
+                    ).set_(storage, storage_offset=s_offset, size=shape, stride=stride)
+                    self.set_(reconstructed_tensor)
+            except Exception as e:
+                print(f"Error: Failed to deserialize from CUDA IPC handle ({e}).")
+                raise e
+
+        elif state["tensor_data"] is not None:
+            self.set_(state["tensor_data"])
+        else:
+            raise pickle.UnpicklingError(
+                "Invalid state for TransportProxyTensor: no tensor data found."
+            )
+
+    @property
+    def name(self) -> Optional[str]:
+        return self._metadata.get("name")
+
+    @property
+    def fields(self) -> Dict[str, Any]:
+        return self._metadata.get("fields", {})
+
+    @property
+    def transport_mode(self) -> TensorTransportMode:
+        return self._metadata.get("transport_mode", "default")
+
+
+class MultiModalityDataPaddingPattern:
+    """
+    Data tokens (like image tokens) often need special handling during padding
+    to maintain model compatibility. This class provides the interface for
+    implementing different padding strategies for data tokens
+    """
+
+    @abstractmethod
+    def pad_input_tokens(
+        self, input_ids: List[int], mm_inputs: MultimodalInputs
+    ) -> List[int]:
+        """
+        Pad the input ids sequence containing data tokens, and replace them with pad_values
+        """
+        pass
+
+
+class MultiModalityDataPaddingPatternTokenPairs(MultiModalityDataPaddingPattern):
+    """In this pattern, data tokens should be enclosed by special token pairs (e.g. <image>...</image>, data_token_pairs)
+
+    The padded value in a region enclosed by a token pair with be the same one, as the MultimodalDataItem's pad value
+
+    This strategy should be applied when data content is marked by start/end token pairs in the input sequence.
+    """
+
+    def __init__(
+        self,
+        data_token_pairs: Optional[List[Tuple[int, int]]],
+        data_start_token_ids: Optional[List[int]] = None,
+    ) -> None:
+        """
+
+        Args:
+            data_start_token_ids marks the start of a single multimodal data
+            See Minicpmo's slice_start_id for example
+        """
+        self.data_token_id_pairs = data_token_pairs
+        self.data_start_token_ids = data_start_token_ids or [
+            s for s, _e in data_token_pairs
+        ]
+
+    def pad_input_tokens(
+        self, input_ids: List[int], mm_inputs: MultimodalInputs
+    ) -> List[int]:
+        """
+        This function will replace the data-tokens in between with pad_values accordingly
+        """
+        pad_values = [item.pad_value for item in mm_inputs.mm_items]
+        data_token_pairs = self.data_token_id_pairs
+        mm_inputs.data_offsets = []
+        if data_token_pairs is None:
+            data_token_pairs = [mm_inputs.im_start_id, mm_inputs.im_end_id]
+        if data_token_pairs is None:
+            print_warning_once(
+                "No data_token_pairs provided, RadixAttention might be influenced."
+            )
+            return input_ids
+        start_token_ids = {s for s, _e in data_token_pairs}
+        end_tokens_ids = {e for _s, e in data_token_pairs}
+
+        padded_ids = []
+        last_idx = 0
+        data_idx = -1
+
+        start_indices = [i for i, x in enumerate(input_ids) if x in start_token_ids]
+        end_indices = [i for i, x in enumerate(input_ids) if x in end_tokens_ids]
+
+        if len(start_indices) != len(end_indices):
+            return input_ids
+
+        for start_idx, end_idx in zip(start_indices, end_indices):
+            padded_ids.extend(input_ids[last_idx : start_idx + 1])
+
+            if input_ids[start_idx] in self.data_start_token_ids:
+                data_idx += 1
+                mm_inputs.data_offsets += [start_idx]
+
+            if data_idx >= len(pad_values):
+                data_idx = len(pad_values) - 1
+
+            num_tokens = end_idx - start_idx - 1
+            pad_value = pad_values[data_idx]
+            padded_ids.extend([pad_value] * num_tokens)
+
+            last_idx = end_idx
+
+        padded_ids.extend(input_ids[last_idx:])
+
+        assert len(input_ids) == len(padded_ids), "Length validation fails"
+        return padded_ids
+
+
+class MultiModalityDataPaddingPatternMultimodalTokens(MultiModalityDataPaddingPattern):
+    """In this pattern, data tokens should be represented as repetitions of a single token
+    e.g. <image><image>....<image>, or <audio><audio>...<audio>
+    """
+
+    def pad_input_tokens(
+        self, input_ids: List[int], mm_inputs: MultimodalInputs
+    ) -> List[int]:
+        """
+        Replaces multimodal tokens in input_ids with corresponding pad_values from mm_items.
+        Each modality (image, audio, video) is handled separately based on its token_id.
+        """
+        if not input_ids or not mm_inputs.mm_items:
+            return input_ids
+
+        input_ids_tensor = torch.as_tensor(input_ids)
+
+        # Check if MM splitting is enabled
+        if envs.SGLANG_ENABLE_MM_SPLITTING.get():
+            items_by_modality = defaultdict(list)
+            for item in mm_inputs.mm_items:
+                items_by_modality[item.modality].append(item)
+
+            token_id_map = {
+                Modality.IMAGE: mm_inputs.im_token_id,
+                Modality.MULTI_IMAGES: mm_inputs.im_token_id,
+                Modality.AUDIO: mm_inputs.audio_token_id,
+                Modality.VIDEO: mm_inputs.video_token_id,
+            }
+
+            for modality, items in items_by_modality.items():
+                token_id = token_id_map.get(modality)
+
+                if not items or token_id is None:
+                    continue
+
+                for i, item in enumerate(items):
+                    for offset in items[i].offsets:
+                        input_ids_tensor[offset[0] : offset[1] + 1] = item.pad_value
+        else:
+            # Create mapping of token_ids to pad_values for each modality
+            token_to_pad_mapping = {}
+            for item in mm_inputs.mm_items:
+                if item.is_image() and mm_inputs.im_token_id is not None:
+                    token_to_pad_mapping[mm_inputs.im_token_id] = item.pad_value
+                elif item.is_audio() and mm_inputs.audio_token_id is not None:
+                    token_to_pad_mapping[mm_inputs.audio_token_id] = item.pad_value
+                elif item.is_video() and mm_inputs.video_token_id is not None:
+                    token_to_pad_mapping[mm_inputs.video_token_id] = item.pad_value
+                else:
+                    raise ValueError(
+                        f"No multimodal token id provided for {item.modality}"
+                    )
+
+            # Apply replacements for all tokens at once
+            for token_id, pad_value in token_to_pad_mapping.items():
+                input_ids_tensor[input_ids_tensor == token_id] = pad_value
+
+        ret_input_ids = input_ids_tensor.tolist()
+        return ret_input_ids
+
+
+embedding_cache: Optional[MultiModalStaticCache] = None
+
+
+def init_mm_embedding_cache(max_size: int = 0):
+    global embedding_cache
+    embedding_cache = MultiModalStaticCache(max_size)
+
+
+def get_embedding_chunk(
+    embedding: torch.Tensor,
+    extend_prefix_len: int,
+    extend_seq_len: int,
+    items_offset: List[Tuple[int, int]],
+) -> Tuple[torch.Tensor, int, int]:
+    """
+    Extract a chunk of embeddings based on the specified prefix length, sequence length, and offset ranges.
+
+    Args:
+        embedding: The full embedding tensor to extract a chunk from
+        extend_prefix_len: The starting position (prefix length) for extraction
+        extend_seq_len: The number of tokens to extract
+        items_offset: List of [start, end] offset ranges for multimodal items in the input sequence
+
+    Returns:
+        A tuple containing:
+        - The extracted embedding chunk as a tensor
+        - The start index used for extraction
+        - The end index used for extraction
+
+    Note:
+        If there's no overlap between the requested range and the offset ranges,
+        an empty tensor is returned with zeros for start and end indices.
+    """
+    start_index, end_index = 0, 0
+    extend_start_index = extend_prefix_len
+    extend_end_index = extend_prefix_len + extend_seq_len - 1
+
+    for start, end in items_offset:
+        if extend_start_index >= start and extend_start_index <= end:
+            start_index += extend_start_index - start
+        elif extend_start_index > end:
+            start_index += end - start + 1
+
+        if extend_end_index >= start and extend_end_index <= end:
+            end_index += extend_end_index - start + 1
+        elif extend_end_index > end:
+            end_index += end - start + 1
+    # some models' embedding is 3-dim, reshape it to 2-dim
+    embedding = embedding.reshape(-1, embedding.shape[-1])
+    embedding_chunk = embedding[start_index:end_index]
+    return embedding_chunk, start_index, end_index
+
+
+def _get_precomputed_embedding(
+    items: List[MultimodalDataItem],
+    prefix_length: List[int],
+    extend_length: List[int],
+    items_offset_list: List[List[Tuple[int, int]]],
+) -> Optional[torch.Tensor]:
+    """
+    If all items have precomputed_embeddings, return their concatenation.
+    If some but not all have precomputed_embeddings, raise NotImplementedError.
+    If none have precomputed_embeddings, return None.
+    """
+    precomputed_embeddings = []
+    for idx, item in enumerate(items):
+        if item.precomputed_embeddings is None:
+            precomputed_embeddings.append(None)
+            continue
+        seq_start_idx = prefix_length[idx]
+        seq_end_idx = seq_start_idx + extend_length[idx] - 1
+        prefix_embedding_length = []
+        extend_embedding_length = []
+        for mm_start_idx, mm_end_idx in items_offset_list[idx]:
+            if mm_start_idx > seq_end_idx:
+                break
+            if seq_start_idx > mm_start_idx:
+                prefix_embedding_length.append(
+                    min(seq_start_idx - mm_start_idx, mm_end_idx - mm_start_idx + 1)
+                )
+            if mm_end_idx >= seq_start_idx:
+                extend_embedding_length.append(
+                    min(
+                        mm_end_idx - seq_start_idx + 1,
+                        seq_end_idx - mm_start_idx + 1,
+                        mm_end_idx - mm_start_idx + 1,
+                        seq_end_idx - seq_start_idx + 1,
+                    )
+                )
+        prefix_embedding_length = int(np.sum(prefix_embedding_length))
+        extend_embedding_length = int(np.sum(extend_embedding_length))
+        precomputed_embeddings.append(
+            item.precomputed_embeddings[
+                prefix_embedding_length : prefix_embedding_length
+                + extend_embedding_length
+            ]
+        )
+
+    if any(feature is not None for feature in precomputed_embeddings):
+        if not all(feature is not None for feature in precomputed_embeddings):
+            raise NotImplementedError(
+                "MM inputs where only some items are precomputed."
+            )
+        result = torch.concat(precomputed_embeddings)
+        # some models embedding is 3-dim, reshape it to 2-dim (similar to get_embedding_chunk)
+        result = result.reshape(-1, result.shape[-1])
+        return result
+    return None
+
+
+DataEmbeddingFunc = Callable[
+    [List[MultimodalDataItem]], torch.Tensor | EVSEmbeddingResult
+]
+
+
+def get_embedding_items_per_chunk_with_extra_padding(
+    embedding_items_per_req: List["MultimodalDataItem"],
+    extend_prefix_len: int,
+    extend_seq_len: int,
+    items_offset: List[Tuple[int, int]],
+) -> List["MultimodalDataItem"]:
+    """
+    From all multimodal items of a request, select the subset that is "relevant to
+    this prefill chunk", and allow a small amount of extra padding on both sides
+    of the chunk boundary (for easier caching or cross-chunk reuse).
+
+    Assumptions:
+        - len(embedding_items_per_req) == len(items_offset)
+        - items_offset[j] = (start, end), meaning the multimodal tokens of the j-th
+        item correspond to [start, end) (left-closed, right-open) in the entire
+        token sequence
+        - The item order in embedding_items_per_req is one-to-one aligned with
+        items_offset
+
+    Args:
+        embedding_items_per_req: all items of this modality under the current
+            request (e.g. each frame in a 500-frame video)
+        extend_prefix_len: number of tokens already prefilled before the current
+            chunk
+        extend_seq_len: number of tokens in the current chunk
+        items_offset: (start, end) position of each item in the whole sentence
+
+    Returns:
+        The subset of items to feed into ViT for this chunk (preserving the
+        original order)
+    """
+    assert len(embedding_items_per_req) == len(
+        items_offset
+    ), f"items_per_req({len(embedding_items_per_req)}) vs items_offset({len(items_offset)}) mismatch"
+
+    if extend_seq_len <= 0:
+        return []
+
+    # Current chunk's token range
+    chunk_start = extend_prefix_len
+    chunk_end = extend_prefix_len + extend_seq_len
+
+    # Current chunk's token range with extra padding
+    window_start = max(0, chunk_start - _EXTRA_PRE_TOKENS)
+    window_end = chunk_end + _EXTRA_POST_TOKENS
+
+    selected_items: List["MultimodalDataItem"] = []
+
+    for item, (start, end) in zip(embedding_items_per_req, items_offset):
+        if start >= end:
+            continue
+
+        # Check whether this item has overlap with [window_start, window_end)
+        # If has overlap, add the item into selected_item.
+        if end > window_start and start < window_end:
+            selected_items.append(item)
+
+    return selected_items
+
+
+# TODO: To be obsoleted.
+def _get_chunked_prefill_embedding(
+    data_embedding_func: DataEmbeddingFunc,
+    embedding_items: List[MultimodalDataItem],
+    items_size: List[int],
+    prefix_length: List[int],
+    extend_length: List[int],
+    items_offset_list: List[List[Tuple[int, int]]],
+    input_ids: torch.Tensor,
+) -> tuple[torch.Tensor | None, torch.Tensor]:
+    # Calculate embedding for each request, try to get it from cache to avoid repeated calculation
+    embedding_list = []
+    # FIXME(Xinyuan): temporary workaround for eagle3, which may have len(items_size) > len(prefix_length)
+    max_iterations = min(len(items_size) - 1, len(prefix_length))
+    for i in range(max_iterations):
+        if items_size[i] == items_size[i + 1]:
+            continue
+        embedding_items_per_req = embedding_items[items_size[i] : items_size[i + 1]]
+        items_offset = items_offset_list[i]
+        assert items_offset is not None, items_offset
+        # if all items has been prefixed, we do not need to calculate embedding
+        if all([offset_end < prefix_length[i] for _, offset_end in items_offset]):
+            continue
+        item_hashes = [item.hash for item in embedding_items_per_req]
+        embedding_items_hash = MultiModalStaticCache.combine_hashes(item_hashes)
+        embedding_per_req = embedding_cache.get(item_hashes)
+        if embedding_per_req is None:
+            embedding = data_embedding_func(embedding_items_per_req)
+            embedding_per_req = (
+                EmbeddingResult(embedding=embedding)
+                if isinstance(embedding, torch.Tensor)
+                else embedding
+            )
+            if not embedding_cache.set(embedding_items_hash, embedding_per_req):
+                print_warning_once(
+                    "Multimodal embedding cache is full. This typically occurs when a single "
+                    "embedding exceeds the cache size limit. Consider increasing the "
+                    "`SGLANG_VLM_CACHE_SIZE_MB` environment variable or reducing the input "
+                    "embedding size."
+                )
+
+        extend_prefix_len = prefix_length[i]
+        extend_seq_len = extend_length[i] if i < len(extend_length) else 0
+
+        if isinstance(embedding_per_req, EVSEmbeddingResult):
+            item = embedding_items_per_req[0]
+            input_ids, items_offset = (
+                embedding_per_req.redistribute_pruned_frames_placeholders(
+                    input_ids,
+                    items_offset,
+                    item=item,
+                    extend_prefix_len=extend_prefix_len,
+                    extend_seq_len=extend_seq_len,
+                )
+            )
+
+        embedding_per_req_chunk, _, _ = get_embedding_chunk(
+            embedding=embedding_per_req.embedding,
+            extend_prefix_len=extend_prefix_len,
+            extend_seq_len=extend_seq_len,
+            items_offset=items_offset,
+        )
+        embedding_list.append(embedding_per_req_chunk)
+    if len(embedding_list) == 0:
+        return None, input_ids
+    return torch.concat(embedding_list, dim=0), input_ids
+
+
+def get_embedding_chunk_remove_extra_padding(
+    embedding: torch.Tensor,
+    extend_prefix_len: int,
+    extend_seq_len: int,
+    items_offset: List[Tuple[int, int]],
+) -> Tuple[Optional[torch.Tensor], int, int]:
+    """
+    From the embedding computed on "items related to this chunk + extra padding",
+    trim out the token embeddings that are not needed for the current chunk, and
+    keep only those mm tokens covered by
+    [extend_prefix_len, extend_prefix_len + extend_seq_len).
+
+    Assumptions:
+        - Each (start, end) in items_offset represents an item's multimodal token
+        interval [start, end) in the whole token sequence, and their order is
+        consistent with the order of items in `embedding`.
+        - The layout of `embedding`: each selected item is concatenated in order,
+        and item j occupies seg_len_j = end_j - start_j rows.
+
+    Args:
+        embedding: output of data_embedding_func(embedding_items_per_chunk),
+                shape = (T_total, D)
+        extend_prefix_len: number of tokens before the chunk (prefix_len)
+        extend_seq_len: number of tokens in this chunk (chunk_len)
+        items_offset: list of (start, end) for all items of the current request
+
+    Returns:
+        - trimmed_embedding: embedding that contains only the mm tokens needed
+        by this chunk, concatenated in token order
+        - num_tokens_before: number of mm tokens "before the chunk" that are
+        trimmed off (optional info, not used by the current caller)
+        - num_tokens_after: number of mm tokens "after the chunk" that are
+        trimmed off (optional info, not used by the current caller)
+    """
+    if embedding is None or embedding.numel() == 0:
+        return None, 0, 0
+
+    chunk_start = extend_prefix_len
+    chunk_end = extend_prefix_len + extend_seq_len
+
+    if extend_seq_len <= 0 or chunk_start >= chunk_end:
+        return None, 0, 0
+
+    # The window with extra padding
+    window_start = max(0, chunk_start - _EXTRA_PRE_TOKENS)
+    window_end = chunk_end + _EXTRA_POST_TOKENS
+
+    # Iterate item_offset to choose item.
+    # We need to forward an embedding_idx to locate the item start-end position in embedding.
+    embedding_idx = 0
+    kept_slices: List[torch.Tensor] = []
+
+    num_tokens_before = 0
+    num_tokens_after = 0
+
+    for start, end in items_offset:
+        if start >= end:
+            continue
+
+        seg_len = end - start
+
+        # Check whether this item has been chosen into embedding_items_per_chunk or not.
+        selected = end > window_start and start < window_end
+
+        if not selected:
+            # Not in embedding_items_per_chunk, not forward embedding_idx.
+            continue
+
+        # embedding has the whole item
+        # embedding[embedding_idx : embedding_idx + seg_len]
+
+        # Calculate the overlap range between item and the current chunk
+        overlap_start = max(start, chunk_start)
+        overlap_end = min(end, chunk_end)
+
+        if overlap_start < overlap_end:
+            # The item has a portion mm tokens in the current chunk
+            # The offset inside item
+            local_start = overlap_start - start
+            local_end = overlap_end - start
+
+            # The embedding index
+            slice_start = embedding_idx + local_start
+            slice_end = embedding_idx + local_end
+
+            kept_slices.append(embedding[slice_start:slice_end])
+
+            # Stats the token number before and after this chunk
+            num_tokens_before += max(0, local_start)
+            num_tokens_after += max(0, seg_len - local_end)
+        else:
+            # Although item is chosen into embedding_items_per_chunk as extra padding,
+            # Its mm tokens has no overlap with chunk, so don't count into the current
+            # chunk's embedding.
+            if end <= chunk_start:
+                num_tokens_before += seg_len
+            elif start >= chunk_end:
+                num_tokens_after += seg_len
+
+        # No matter whether this item has overlap with chunk, once it's selected, it
+        # counts seg_len in embedding, so embedding_idx has to forward.
+        embedding_idx += seg_len
+
+    if not kept_slices:
+        # No mm tokens in this chunk
+        return None, num_tokens_before, num_tokens_after
+
+    trimmed_embedding = torch.cat(kept_slices, dim=0)
+    return trimmed_embedding, num_tokens_before, num_tokens_after
+
+
+# This function is for chunked prefill vit for multiple items in the next feature.
+def _get_chunked_prefill_embedding_for_chunked_items(
+    data_embedding_func: Callable[[List["MultimodalDataItem"]], torch.Tensor],
+    embedding_items: List["MultimodalDataItem"],
+    items_size: List[int],
+    prefix_length: List[int],
+    extend_length: List[int],
+    items_offset_list: List[List[Tuple[int, int]]],
+) -> Optional[torch.Tensor]:
+    """
+    Multi-modal embedding computation for chunked prefill.
+
+    For each request:
+    1. Use items_size to split embedding_items into per-request sublists embedding_items_per_req;
+    2. Use get_embedding_items_per_chunk_with_extra_padding to select the subset of items related to this chunk;
+    3. Call data_embedding_func (ViT) on this subset to obtain embedding_per_chunk;
+    4. Concatenate embedding_per_req_chunk for all requests in order.
+
+    In this way, the ViT for each request only processes the frames / images related to the current chunk,
+    avoiding OOM caused by processing all the frames at once.
+    """
+    # Calculate embedding for each request, try to get it from cache to avoid repeated calculation
+    embedding_list = []
+    # FIXME(Xinyuan): temporary workaround for eagle3, which may have len(items_size) > len(prefix_length)
+    max_iterations = min(len(items_size) - 1, len(prefix_length))
+
+    for i in range(max_iterations):
+        if items_size[i] == items_size[i + 1]:
+            continue
+        embedding_items_per_req = embedding_items[items_size[i] : items_size[i + 1]]
+        items_offset = items_offset_list[i]
+        assert items_offset is not None, items_offset
+
+        # if all items has been prefixed, we do not need to calculate embedding
+        if all([offset_end < prefix_length[i] for _, offset_end in items_offset]):
+            continue
+
+        # 1) Pick up items related with this chunk
+        embedding_items_per_chunk = get_embedding_items_per_chunk_with_extra_padding(
+            embedding_items_per_req,
+            extend_prefix_len=prefix_length[i],
+            extend_seq_len=extend_length[i] if i < len(extend_length) else 0,
+            items_offset=items_offset,
+        )
+
+        if not embedding_items_per_chunk:
+            continue
+
+        # 2) construct cache key
+        # embedding_items_hash = MultiModalStaticCache.combine_hashes(
+        #     embedding_items_per_chunk
+        # )
+        item_hashes = [item.hash for item in embedding_items_per_chunk]
+        embedding_items_hash = MultiModalStaticCache.combine_hashes(item_hashes)
+
+        embedding_per_chunk = embedding_cache.get(embedding_items_hash)
+        if embedding_per_chunk is None:
+            # ViT forward for items related with per chunk
+            embedding_per_chunk = data_embedding_func(embedding_items_per_chunk)
+
+            embedding_for_cache = embedding_per_chunk.detach().cpu()
+            if not embedding_cache.set(embedding_items_hash, embedding_for_cache):
+                print(
+                    "[WARN] Multimodal embedding cache is full. "
+                    "Consider increasing `SGLANG_VLM_CACHE_SIZE_MB` or reducing "
+                    "video frame count / resolution for a single request."
+                )
+        else:
+            target_device = embedding_items_per_req[0].feature.device
+            if embedding_per_chunk.device != target_device:
+                embedding_per_chunk = embedding_per_chunk.to(target_device)
+
+        # 3) remove extra padding from embedding_per_chunk, only keep current chunk part
+        #    We probably don't need this part.
+        # embedding_per_req_chunk, _, _ = get_embedding_chunk_remove_extra_padding(
+        #     embedding=embedding_per_chunk,
+        #     extend_prefix_len=prefix_len,
+        #     extend_seq_len=chunk_len,
+        #     items_offset=items_offset,
+        # )
+
+        if embedding_per_chunk is not None and embedding_per_chunk.numel() > 0:
+            embedding_list.append(embedding_per_chunk)
+
+    if not embedding_list:
+        return None
+
+    # concat all the request's chunk embedding in token
+    return torch.cat(embedding_list, dim=0)
+
+
+def _get_multimodal_mask(
+    input_ids: torch.Tensor, placeholder_tensor: torch.Tensor
+) -> torch.Tensor:
+    return torch.isin(input_ids, placeholder_tensor).unsqueeze(-1)
+
+
+def _adjust_embedding_length(
+    embedding: torch.Tensor,
+    mask: torch.Tensor,
+    logger,
+) -> torch.Tensor:
+    num_mm_tokens_in_embedding = embedding.shape[0]
+    num_mm_tokens_in_input_ids = mask.sum().item()
+    if num_mm_tokens_in_input_ids != num_mm_tokens_in_embedding:
+        logger.warning(
+            f"Number of tokens in multimodal embedding does not match those in the input text. "
+            f"Got {num_mm_tokens_in_input_ids} tokens in the text but {num_mm_tokens_in_embedding} "
+            f"tokens from multimodal embeddings."
+        )
+        if num_mm_tokens_in_input_ids < num_mm_tokens_in_embedding:
+            chunked_prefill_size = get_global_server_args().chunked_prefill_size
+            if chunked_prefill_size != -1:
+                logger.warning(
+                    "You may want to avoid this issue by raising `chunked_prefill_size`, or disabling chunked prefill"
+                )
+            # extract from the end: this is a compromise
+            if embedding.dim() == 2:
+                embedding = embedding[-num_mm_tokens_in_input_ids:, :]
+            else:
+                num_multimodal = num_mm_tokens_in_input_ids // embedding.shape[0]
+                embedding = embedding[-num_multimodal:, :]
+        else:
+            raise RuntimeError(
+                f"Insufficient multimodal embedding length: {num_mm_tokens_in_input_ids=} vs {num_mm_tokens_in_embedding=}. This is an internal error"
+            )
+    return embedding
+
+
+def get_embedding_and_mask(
+    data_embedding_func: DataEmbeddingFunc,
+    embedding_items: List[MultimodalDataItem],
+    placeholder_tensor: torch.Tensor,
+    input_ids: torch.Tensor,
+    items_size: List[int],
+    prefix_length: List[int],
+    extend_length: List[int],
+    items_offset_list: List[List[Tuple[int, int]]],
+) -> Tuple[torch.Tensor | None, torch.Tensor | None, torch.Tensor]:
+    """
+    Generate multimodal embeddings and create a mask for identifying their positions in the input sequence.
+
+    Args:
+        data_embedding_func: Function that generates embeddings for multimodal items
+        embedding_items: List of multimodal items to embed
+        placeholder_tensor: Tensor containing token IDs that serve as placeholders for multimodal content
+        input_ids: The input token IDs tensor
+        items_size: Cumulative sizes of multimodal items per request
+        prefix_length: Prefix lengths for each request
+        extend_length: Sequence lengths for each request
+        items_offset_list: List of offset ranges for multimodal items in each request
+
+    Returns:
+        A tuple containing:
+        - The generated embeddings tensor
+        - A boolean mask tensor indicating where these embeddings should be placed
+        - If EVS is used, the pruned input ids tensor; otherwise, the original input ids tensor
+    """
+    # 1. Get embedding
+    embedding = _get_precomputed_embedding(
+        embedding_items, prefix_length, extend_length, items_offset_list
+    )
+    if embedding is None:
+        embedding, input_ids = _get_chunked_prefill_embedding(
+            data_embedding_func,
+            embedding_items,
+            items_size,
+            prefix_length,
+            extend_length,
+            items_offset_list,
+            input_ids,
+        )
+        if embedding is None:
+            return None, None, input_ids
+    # 2. Get mask
+    if _is_npu:
+        torch.npu.current_stream().synchronize()
+    special_multimodal_mask = _get_multimodal_mask(input_ids, placeholder_tensor)
+    # 3. Adjust embedding length if needed
+    embedding = _adjust_embedding_length(embedding, special_multimodal_mask, logger)
+    return embedding, special_multimodal_mask, input_ids
+
+
+def embed_mm_inputs(
+    mm_inputs_list: List[MultimodalInputs],
+    extend_prefix_lens: List[int],
+    extend_seq_lens: List[int],
+    input_ids: torch.Tensor,
+    input_embedding: nn.Embedding,
+    multimodal_model: nn.Module = None,
+    data_embedding_func_mapping: Dict[Modality, DataEmbeddingFunc] = None,
+    placeholder_tokens: dict[Modality, List[int]] = None,
+    use_deepstack: Dict[Modality, bool] = {},
+) -> Optional[torch.Tensor]:
+    """
+    Embed multimodal inputs and integrate them with text token embeddings.
+
+    Args:
+        mm_inputs_list: List of multimodal inputs to process
+        extend_prefix_lens: Prefix lengths for each request
+        extend_seq_lens: Sequence lengths for each request
+        input_ids: Input token IDs tensor
+        input_embedding: Embedding layer for text tokens
+        placeholder_tokens: Token IDs for multimodal placeholders (uses pad_values if None)
+
+    Returns:
+        Combined embedding tensor with multimodal content integrated
+    """
+    other_info = {}
+    if mm_inputs_list is None:
+        return None
+
+    # 1. Calculate the multimodal data which exists in input_ids, with the help of pad_values
+    # we assume that multimodal data are represented with its pad_values in input_ids
+    item_flatten_list = []
+    for mm_inputs in mm_inputs_list:
+        item_flatten_list += [item for item in mm_inputs.mm_items if item is not None]
+
+    # deepstack_embeddings: per-modality
+    modalities, embeddings, masks, deepstack_embeddings = [], [], [], []
+
+    # 2. Get multimodal embedding separately
+    # Try get mm embedding if any
+    for modality in Modality.all():
+        items = [
+            item for item in item_flatten_list if item.is_modality(modality=modality)
+        ]
+        embedder = (
+            None
+            if data_embedding_func_mapping is None
+            else data_embedding_func_mapping.get(modality, None)
+        )
+        if embedder is None:
+            # "image", "video", etc
+            modality_id = modality.name.lower()
+            embedder = getattr(multimodal_model, f"get_{modality_id}_feature", None)
+        if len(items) != 0:
+            assert embedder is not None, f"no embedding method found for {modality}"
+            placeholder_tensor = torch.as_tensor(
+                [item.pad_value for item in items],
+                device=input_ids.device,
+            )
+            # calculate per request items length offset
+            items_size = torch.zeros(len(mm_inputs_list) + 1, dtype=int)
+            items_offsets = []
+            for i, mm_inputs in enumerate(mm_inputs_list):
+                mm_items = [
+                    item
+                    for item in mm_inputs.mm_items
+                    if item.is_modality(modality=modality)
+                ]
+                items_size[i + 1] = len(mm_items)
+                items_offsets.append(
+                    flatten_nested_list([item.offsets for item in mm_items])
+                )
+            items_size = torch.cumsum(items_size, dim=0).tolist()
+
+            embedding, mask, input_ids = get_embedding_and_mask(
+                data_embedding_func=embedder,
+                embedding_items=items,
+                placeholder_tensor=placeholder_tensor,
+                input_ids=input_ids,
+                items_size=items_size,
+                prefix_length=extend_prefix_lens,
+                extend_length=extend_seq_lens,
+                items_offset_list=items_offsets,
+            )
+
+            if use_deepstack.get(modality, None) and embedding is not None:
+                embedding, deepstack_embedding = (
+                    multimodal_model.separate_deepstack_embeds(embedding)
+                )
+                deepstack_embeddings += [deepstack_embedding]
+            modalities += [modality]
+            embeddings += [embedding]
+            masks += [mask]
+
+    # 3. Get input embeddings
+    vocab_size = input_embedding.num_embeddings
+    # Important: clamp after getting original multimodal regions
+    # Clamp input ids. This is because the input_ids for the multimodal tokens are
+    # filled with the hash values of the multimodal for the prefix matching in the radix attention.
+    # There values are useless because their embeddings will be replaced by vision embeddings anyway.
+    input_ids.clamp_(min=0, max=vocab_size - 1)
+    input_embeds = input_embedding(input_ids)
+
+    # deepstack embedding
+    if use_deepstack:
+        num_deepstack_embeddings = len(multimodal_model.deepstack_visual_indexes)
+
+        deepstack_embedding_shape = input_embeds.shape[:-1] + (
+            input_embeds.shape[-1] * num_deepstack_embeddings,
+        )
+        # a zero-filled embedding, with the same length of input_embeds, but different hidden_size
+        input_deepstack_embeds = torch.zeros(
+            deepstack_embedding_shape,
+            device=input_embeds.device,
+            dtype=input_embeds.dtype,
+        )
+
+        other_info["input_deepstack_embeds"] = input_deepstack_embeds
+
+    # 4. scatter embeddings into input embedding
+    for i, modality, embedding, mask in zip(
+        range(len(embeddings)), modalities, embeddings, masks
+    ):
+        if embedding is None or mask is None:
+            continue
+        # in-place update
+        indices = torch.where(mask.squeeze(dim=-1))[0]
+        input_embeds[indices] = embedding.to(input_embeds.device, input_embeds.dtype)
+        if use_deepstack.get(modality, None):
+            input_deepstack_embeds[indices] = deepstack_embeddings[i].to(
+                input_embeds.device, input_embeds.dtype
+            )
+
+    return input_embeds, other_info
+
+
+def _embed_mm_inputs_with_split(
+    mm_inputs_list: List[MultimodalInputs],
+    extend_prefix_lens: List[int],
+    extend_seq_lens: List[int],
+    input_ids: torch.Tensor,
+    forward_batch: ForwardBatch,
+    input_embedding: nn.Embedding,
+    multimodal_model: nn.Module = None,
+    data_embedding_func_mapping: Dict[Modality, DataEmbeddingFunc] = None,
+    placeholder_tokens: dict[Modality, List[int]] = None,
+    use_deepstack: Dict[Modality, bool] = {},
+):
+    """Split batch into precomputed vs non-precomputed, embed each group, merge back."""
+    precomputed_req_indices = []
+    non_precomputed_req_indices = []
+    for idx, mm_input in enumerate(mm_inputs_list):
+        items = [item for item in mm_input.mm_items if item is not None]
+        if items and all(
+            getattr(item, "precomputed_embeddings", None) is not None for item in items
+        ):
+            precomputed_req_indices.append(idx)
+        else:
+            non_precomputed_req_indices.append(idx)
+
+    embed_kwargs = dict(
+        multimodal_model=multimodal_model,
+        input_embedding=input_embedding,
+        data_embedding_func_mapping=data_embedding_func_mapping,
+        placeholder_tokens=placeholder_tokens,
+        use_deepstack=use_deepstack,
+    )
+
+    if not precomputed_req_indices or not non_precomputed_req_indices:
+        return embed_mm_inputs(
+            mm_inputs_list=mm_inputs_list,
+            extend_prefix_lens=extend_prefix_lens,
+            extend_seq_lens=extend_seq_lens,
+            input_ids=input_ids,
+            **embed_kwargs,
+        )
+
+    all_seq_lens = forward_batch.extend_seq_lens_cpu
+    mm_batch_indices = [
+        i for i, mm in enumerate(forward_batch.mm_inputs) if mm is not None
+    ]
+    token_starts = []
+    cumulative = 0
+    for sl in all_seq_lens:
+        token_starts.append(cumulative)
+        cumulative += sl
+
+    vocab_size = input_embedding.num_embeddings
+    input_embeds = input_embedding(input_ids.clamp(min=0, max=vocab_size - 1))
+    other_info = {}
+
+    input_deepstack_embeds = None
+    if use_deepstack and multimodal_model is not None:
+        num_deepstack_embeddings = len(multimodal_model.deepstack_visual_indexes)
+        input_deepstack_embeds = torch.zeros(
+            input_ids.shape[0],
+            input_embedding.embedding_dim * num_deepstack_embeddings,
+            device=input_ids.device,
+            dtype=input_embedding.weight.dtype,
+        )
+        other_info["input_deepstack_embeds"] = input_deepstack_embeds
+
+    for group_req_indices in [precomputed_req_indices, non_precomputed_req_indices]:
+        sub_mm_inputs = [mm_inputs_list[i] for i in group_req_indices]
+        sub_prefix_lens = [extend_prefix_lens[i] for i in group_req_indices]
+        sub_seq_lens = [extend_seq_lens[i] for i in group_req_indices]
+        group_batch_indices = [mm_batch_indices[i] for i in group_req_indices]
+        sub_slices = [
+            input_ids[token_starts[bi] : token_starts[bi] + all_seq_lens[bi]]
+            for bi in group_batch_indices
+        ]
+        sub_input_ids = torch.cat(sub_slices)
+
+        sub_embeds, sub_info = embed_mm_inputs(
+            mm_inputs_list=sub_mm_inputs,
+            extend_prefix_lens=sub_prefix_lens,
+            extend_seq_lens=sub_seq_lens,
+            input_ids=sub_input_ids,
+            **embed_kwargs,
+        )
+
+        offset = 0
+        for bi in group_batch_indices:
+            req_len = all_seq_lens[bi]
+            start = token_starts[bi]
+            input_embeds[start : start + req_len] = sub_embeds[
+                offset : offset + req_len
+            ]
+            if (
+                input_deepstack_embeds is not None
+                and "input_deepstack_embeds" in sub_info
+            ):
+                input_deepstack_embeds[start : start + req_len] = sub_info[
+                    "input_deepstack_embeds"
+                ][offset : offset + req_len]
+            offset += req_len
+
+    return input_embeds, other_info
+
+
+def general_mm_embed_routine(
+    input_ids: torch.Tensor,
+    forward_batch: ForwardBatch,
+    language_model: nn.Module,
+    multimodal_model: Optional[nn.Module] = None,
+    data_embedding_funcs: Dict[Modality, DataEmbeddingFunc] = None,
+    placeholder_tokens: Optional[dict[Modality, List[int]]] = None,
+    use_deepstack: Dict[Modality, bool] = {},
+    **kwargs,
+) -> torch.Tensor:
+    """
+    Process multimodal inputs and forward through language model.
+
+    Args:
+        input_ids: Input token IDs tensor
+        forward_batch: Batch information for model forward pass
+        language_model: Base language model to use
+        data_embedding_funcs: A dictionary mapping from modality type to the corresponding embedding function.
+        placeholder_tokens: Token IDs for multimodal placeholders
+        use_deepstack: Whether to use deepstack embeddings for each modality, default False
+        **kwargs: Additional arguments passed to language model
+
+    Returns:
+        Hidden states from language model forward pass
+    """
+    assert hasattr(language_model, "get_input_embeddings")
+    embed_tokens = language_model.get_input_embeddings()
+    if not hasattr(language_model, "pp_group") or language_model.pp_group.is_first_rank:
+        if (
+            not forward_batch.forward_mode.is_decode()
+            and not forward_batch.forward_mode.is_target_verify()
+            and forward_batch.contains_mm_inputs()
+        ):
+            mm_inputs_list = [
+                mm_input for mm_input in forward_batch.mm_inputs if mm_input is not None
+            ]
+            extend_prefix_lens = [
+                prefix_len
+                for i, prefix_len in enumerate(forward_batch.extend_prefix_lens_cpu)
+                if forward_batch.mm_inputs[i] is not None
+            ]
+            extend_seq_lens = [
+                seq_len
+                for i, seq_len in enumerate(forward_batch.extend_seq_lens_cpu)
+                if forward_batch.mm_inputs[i] is not None
+            ]
+            server_args = get_global_server_args()
+            if server_args and server_args.enable_adaptive_dispatch_to_encoder:
+                # Split by precomputed vs non-precomputed so get_embedding_and_mask only sees uniform batches
+                input_embeds, other_info = _embed_mm_inputs_with_split(
+                    mm_inputs_list=mm_inputs_list,
+                    extend_prefix_lens=extend_prefix_lens,
+                    extend_seq_lens=extend_seq_lens,
+                    input_ids=input_ids,
+                    forward_batch=forward_batch,
+                    input_embedding=embed_tokens,
+                    multimodal_model=multimodal_model,
+                    data_embedding_func_mapping=data_embedding_funcs,
+                    placeholder_tokens=placeholder_tokens,
+                    use_deepstack=use_deepstack,
+                )
+            else:
+                input_embeds, other_info = embed_mm_inputs(
+                    mm_inputs_list=mm_inputs_list,
+                    extend_prefix_lens=extend_prefix_lens,
+                    extend_seq_lens=extend_seq_lens,
+                    input_ids=input_ids,
+                    input_embedding=embed_tokens,
+                    multimodal_model=multimodal_model,
+                    data_embedding_func_mapping=data_embedding_funcs,
+                    placeholder_tokens=placeholder_tokens,
+                    use_deepstack=use_deepstack,
+                )
+
+            # add for qwen3_vl deepstack
+            if use_deepstack:
+                kwargs["input_deepstack_embeds"] = other_info["input_deepstack_embeds"]
+            # Offload GPU features to CPU instead of discarding them to balance memory
+            # efficiency and data persistence.
+            # In chunked-prefill, a request is processed across multiple batches, and
+            # the original multimodal data must remain accessible until the entire
+            # prefill phase is complete. Since the multimodal embedding cache is
+            # best-effort, offloading to CPU ensures we have a reliable fallback
+            # if a cache miss occurs in subsequent chunks, while still freeing up
+            # critical GPU memory.
+            if mm_inputs_list:
+                for mm_input_obj in mm_inputs_list:
+                    if mm_input_obj and hasattr(mm_input_obj, "mm_items"):
+                        for mm_item in mm_input_obj.mm_items:
+                            feature = getattr(mm_item, "feature", None)
+                            if isinstance(feature, torch.Tensor) and feature.is_cuda:
+                                mm_item.feature = feature.to("cpu", non_blocking=True)
+            forward_batch.mm_inputs = None
+            forward_batch.mm_input_embeds = input_embeds
+        else:
+            input_embeds = embed_tokens(input_ids)
+        # Copy to pre-allocated buffer if available (for CUDA graph address stability)
+        if forward_batch.input_embeds is not None:
+            forward_batch.input_embeds.copy_(input_embeds)
+            input_embeds = forward_batch.input_embeds
+    else:
+        input_embeds = None
+
+    hidden_states = language_model(
+        input_ids=None,
+        forward_batch=forward_batch,
+        input_embeds=input_embeds,
+        **kwargs,
+    )
+    return hidden_states
+
+
+def get_multimodal_data_bounds(
+    input_ids: torch.Tensor, pad_values: List[int], token_pairs: List[Tuple[int, int]]
+) -> torch.Tensor:
+    """
+    Returns a tensor indicating the bounds of multimodal data (images, video, audio, etc.)
+
+    Returns:
+        [bounds_count, 2]
+    """
+    # All the multimodal data in the batch should share the same special bound token ids.
+    start_tokens = {s for s, _e in token_pairs}
+    end_tokens = {e for _s, e in token_pairs}
+
+    assert all(isinstance(t, int) for t in start_tokens)
+    assert all(isinstance(t, int) for t in end_tokens)
+
+    start_cond = torch.isin(
+        input_ids, torch.as_tensor(start_tokens, device=input_ids.device)
+    )
+    end_cond = torch.isin(
+        input_ids, torch.as_tensor(end_tokens, device=input_ids.device)
+    )
+
+    (data_start_tokens,) = torch.where(start_cond)
+    (data_end_tokens,) = torch.where(end_cond)
+
+    data_start_tokens_cpu = data_start_tokens.cpu().tolist()
+    data_end_tokens_cpu = data_end_tokens.cpu().tolist()
+
+    # the im_start_id sometimes can be cached as prefix, but it is needed for the embedding of the multimodal data
+    if len(data_start_tokens_cpu) != len(data_end_tokens_cpu):
+        if (
+            len(data_start_tokens_cpu) + 1 == len(data_end_tokens_cpu)
+            and input_ids[0].item() in pad_values
+            and data_end_tokens_cpu
+            and data_start_tokens_cpu
+            and data_end_tokens_cpu[0] < data_start_tokens_cpu[0]
+        ):
+            data_start_tokens_cpu.insert(0, 0)
+    valid_mm_data_nums = min(len(data_start_tokens_cpu), len(data_end_tokens_cpu))
+
+    if valid_mm_data_nums == 0:
+        return torch.zeros((0, 2), device=input_ids.device)
+
+    # Filter out pairs where start_token >= end_token
+    valid_pairs = []
+    for i in range(valid_mm_data_nums):
+        start_token = data_start_tokens_cpu[i]
+        end_token = data_end_tokens_cpu[i]
+        if start_token < end_token:
+            valid_pairs.append((start_token + 1, end_token - 1))
+
+    if not valid_pairs:
+        return torch.zeros((0, 2), device=input_ids.device)
+
+    # Convert valid pairs to tensor
+    valid_pairs_tensor = torch.as_tensor(valid_pairs, device=input_ids.device)
+    return valid_pairs_tensor
+
+
+def data_hash(data) -> int:
+    hash_bytes = hashlib.sha256(data).digest()[:8]
+    return int.from_bytes(hash_bytes, byteorder="big", signed=False)
+
+
+def tensor_hash(tensor_list) -> int:
+    """
+    hash a tensor or a tensor list
+    """
+    tensor = tensor_list
+    if isinstance(tensor_list, list):
+        tensor_list = flatten_nested_list(tensor_list)
+        tensor_list = [
+            x.flatten() if isinstance(x, torch.Tensor) else x for x in tensor_list
+        ]
+        tensor = torch.concat(tensor_list)
+    if tensor.is_cuda:
+        return gpu_tensor_hash(tensor.cuda())
+    tensor = tensor.detach().contiguous()
+
+    if tensor.dtype == torch.bfloat16:
+        # memoryview() doesn't support PyTorch's BFloat16 dtype
+        tensor = tensor.float()
+
+    assert isinstance(tensor, torch.Tensor)
+    tensor_cpu = tensor.cpu()
+
+    mv = memoryview(tensor_cpu.numpy())
+    return data_hash(mv.tobytes())
+
+
+def hash_feature(f):
+    if isinstance(f, list):
+        if isinstance(f[0], torch.Tensor):
+            return tensor_hash(f)
+        return data_hash(tuple(flatten_nested_list(f)))
+    elif isinstance(f, np.ndarray):
+        arr = np.ascontiguousarray(f)
+        arr_bytes = arr.tobytes()
+        return data_hash(arr_bytes)
+    elif isinstance(f, torch.Tensor):
+        return tensor_hash([f])
+    elif isinstance(f, CudaIpcTensorTransportProxy):
+        reconstruct_t = f.reconstruct_on_target_device(torch.cuda.current_device())
+        return tensor_hash([reconstruct_t])
+    return data_hash(f)
+
+
+def extend_mrope_positions_for_retracted_request(
+    mrope_positions: torch.Tensor, output_ids_len: int
+) -> torch.Tensor:
+    """
+    Extend mrope_positions for retracted requests by appending positions for output_ids.
+
+    When a request is retracted and has multimodal inputs with mrope_positions,
+    we need to extend the positions to cover the output_ids that were already generated.
+    For pure text tokens, all three dimensions use the same incremental sequence.
+
+    Args:
+        mrope_positions: The original mrope positions tensor, shape (3, origin_input_ids_len)
+        output_ids_len: The number of output tokens to generate positions for
+
+    Returns:
+        Extended mrope_positions tensor with shape (3, origin_input_ids_len + output_ids_len)
+    """
+    if output_ids_len <= 0:
+        return mrope_positions
+
+    # Get the last position value corresponding to origin_input_ids
+    # mrope_positions shape: (3, origin_input_ids_len)
+    last_position = mrope_positions[:, -1]  # shape: (3,)
+
+    # Generate pure text mrope positions for output_ids
+    # All three dimensions for pure text are the same incremental sequence
+    start_pos = last_position[0] + 1  # Start from last position + 1
+    output_positions = (
+        torch.arange(
+            start_pos,
+            start_pos + output_ids_len,
+            dtype=torch.int64,
+            device=mrope_positions.device,
+        )
+        .unsqueeze(0)
+        .expand(3, -1)
+    )  # shape: (3, output_ids_len)
+
+    # Concatenate to the original mrope_positions
+    return torch.cat([mrope_positions, output_positions], dim=1)
+
+
+def _get_length(value):
+    if value is None:
+        return None
+    if isinstance(value, torch.Tensor):
+        return value.shape[0] if value.ndim > 0 else None
+    if isinstance(value, np.ndarray):
+        return value.shape[0] if value.ndim > 0 else None
+    if isinstance(value, (list, tuple)):
+        return len(value)
+    return None
+
+
+def _slice_value(value, start, end):
+    if isinstance(value, torch.Tensor):
+        return value[start:end]
+    if isinstance(value, np.ndarray):
+        return value[start:end]
+    if isinstance(value, list):
+        return value[start:end]
+    if isinstance(value, tuple):
+        return value[start:end]
+    try:
+        return value[start:end]
+    except Exception:
+        return value
+
+
+def _slice_model_data(
+    data: dict,
+    index: int,
+    start: int,
+    end: int,
+    num_items: int,
+    total_feature_len: Optional[int],
+):
+    sliced = {}
+    for key, value in data.items():
+        length = _get_length(value)
+        if length == num_items:
+            sliced[key] = _slice_value(value, index, index + 1)
+        elif total_feature_len is not None and length == total_feature_len:
+            sliced[key] = _slice_value(value, start, end)
+        else:
+            sliced[key] = value
+    return sliced
+
+
+def get_new_expanded_mm_items(original_mm_items):
+    expanded_mm_items = []
+    for item in original_mm_items:
+        is_bundled = item.offsets is not None and len(item.offsets) > 1
+
+        if is_bundled:
+            num_items = len(item.offsets)
+
+            if item.is_image():
+                image_grid_thw = item.model_specific_data.get("image_grid_thw")
+                grid_len = _get_length(image_grid_thw)
+                if image_grid_thw is None or grid_len != num_items:
+                    expanded_mm_items.append(item)
+                    continue
+
+                patches_per_item = []
+                for grid in image_grid_thw:
+                    grid_tensor = torch.as_tensor(grid, dtype=torch.long)
+                    patches_per_item.append(int(torch.prod(grid_tensor).item()))
+
+                cumulative = torch.cumsum(
+                    torch.tensor(patches_per_item, dtype=torch.long), dim=0
+                )
+                slice_indices = [0] + cumulative.tolist()
+
+                feature_len = _get_length(item.feature)
+                if feature_len is None:
+                    feature_len = _get_length(item.precomputed_embeddings)
+                if feature_len is None or slice_indices[-1] != feature_len:
+                    expanded_mm_items.append(item)
+                    continue
+
+                total_feature_len = feature_len
+                for i in range(num_items):
+                    start, end = slice_indices[i], slice_indices[i + 1]
+                    new_item = copy.deepcopy(item)
+                    if item.feature is not None:
+                        new_item.feature = _slice_value(item.feature, start, end)
+                    if item.precomputed_embeddings is not None:
+                        new_item.precomputed_embeddings = _slice_value(
+                            item.precomputed_embeddings, start, end
+                        )
+                    new_item.offsets = [item.offsets[i]]
+                    new_item.model_specific_data = _slice_model_data(
+                        item.model_specific_data,
+                        index=i,
+                        start=start,
+                        end=end,
+                        num_items=num_items,
+                        total_feature_len=total_feature_len,
+                    )
+                    new_item.hash = None
+                    expanded_mm_items.append(new_item)
+
+            elif item.is_video():
+                video_grid_thw = item.model_specific_data.get("video_grid_thw")
+                if video_grid_thw is None:
+                    expanded_mm_items.append(item)
+                    continue
+
+                # video_grid_thw shape: [num_videos, 3] where each row is [T, H, W]
+                # When T > 1, item.offsets contains frames (num_items = total frames)
+                # grid_len = num_videos, num_items = sum(T for each video) = total frames
+                grid_len = _get_length(video_grid_thw)
+                num_videos = grid_len
+
+                # Calculate total frames and frames per video
+                frames_per_video = []
+                total_frames = 0
+                for i in range(num_videos):
+                    grid = video_grid_thw[i]
+                    if isinstance(grid, torch.Tensor):
+                        T = int(grid[0].item())  # T is the first element [T, H, W]
+                    else:
+                        grid_tensor = torch.as_tensor(grid, dtype=torch.long)
+                        T = int(grid_tensor[0].item())
+                    frames_per_video.append(T)
+                    total_frames += T
+
+                # num_items should equal total_frames when T > 1
+                if num_items != total_frames:
+                    expanded_mm_items.append(item)
+                    continue
+
+                # Calculate patches per video: T * H * W for each video
+                patches_per_video = []
+                for i in range(num_videos):
+                    grid = video_grid_thw[i]
+                    if isinstance(grid, torch.Tensor):
+                        patches_per_video.append(int(torch.prod(grid).item()))
+                    else:
+                        grid_tensor = torch.as_tensor(grid, dtype=torch.long)
+                        patches_per_video.append(int(torch.prod(grid_tensor).item()))
+
+                # Calculate cumulative patches to get slice indices for each video
+                cumulative = torch.cumsum(
+                    torch.tensor(patches_per_video, dtype=torch.long), dim=0
+                )
+                slice_indices = [0] + cumulative.tolist()
+
+                feature_len = _get_length(item.feature)
+                if feature_len is None:
+                    feature_len = _get_length(item.precomputed_embeddings)
+                if feature_len is None or slice_indices[-1] != feature_len:
+                    expanded_mm_items.append(item)
+                    continue
+
+                total_feature_len = feature_len
+                # Group frames by video: calculate frame indices for each video
+                frame_start_indices = [0]
+                for i in range(num_videos):
+                    frame_start_indices.append(
+                        frame_start_indices[-1] + frames_per_video[i]
+                    )
+
+                # Expand each video into a separate item
+                for video_idx in range(num_videos):
+                    start, end = (
+                        slice_indices[video_idx],
+                        slice_indices[video_idx + 1],
+                    )
+                    frame_start, frame_end = (
+                        frame_start_indices[video_idx],
+                        frame_start_indices[video_idx + 1],
+                    )
+
+                    new_item = copy.deepcopy(item)
+                    if item.feature is not None:
+                        new_item.feature = _slice_value(item.feature, start, end)
+                    if item.precomputed_embeddings is not None:
+                        new_item.precomputed_embeddings = _slice_value(
+                            item.precomputed_embeddings, start, end
+                        )
+                    # Group offsets for this video (all frames of this video)
+                    new_item.offsets = item.offsets[frame_start:frame_end]
+                    # For video_grid_thw, slice the corresponding row [T, H, W] for this video
+                    new_item.model_specific_data = _slice_model_data(
+                        item.model_specific_data,
+                        index=video_idx,
+                        start=start,
+                        end=end,
+                        num_items=num_videos,
+                        total_feature_len=total_feature_len,
+                    )
+                    new_item.hash = None
+                    expanded_mm_items.append(new_item)
+            else:
+                expanded_mm_items.append(item)
+
+        else:
+            expanded_mm_items.append(item)
+    return expanded_mm_items
+
+
+class ShmPointerMMData:
+    """
+    Wraps a tensor to be sent via a shared memory handle.
+    This acts as a "pointer" to the tensor data across process boundaries.
+    """
+
+    def __init__(self, tensor: torch.Tensor):
+        self.cpu_tensor = tensor.cpu().contiguous()
+        self.shape = self.cpu_tensor.shape
+        self.dtype = self.cpu_tensor.dtype
+
+        nbytes = self.cpu_tensor.numel() * self.cpu_tensor.element_size()
+
+        self.shm = shared_memory.SharedMemory(create=True, size=nbytes)
+
+        try:
+            shm_view = np.ndarray((nbytes,), dtype=np.uint8, buffer=self.shm.buf)
+
+            shm_view[:] = self.cpu_tensor.view(torch.uint8).numpy().flatten()
+        finally:
+            self.shm.close()
+
+    def __getstate__(self):
+        if not hasattr(self, "shm") or self.shm is None:
+            tensor = getattr(self, "cpu_tensor", None)
+            if tensor is None:
+                tensor = getattr(self, "tensor", None)
+            if tensor is None:
+                raise RuntimeError(
+                    "ShmPointerMMData cannot recreate shared memory without tensor"
+                )
+
+            cpu_tensor = tensor.cpu().contiguous()
+            self.shape = cpu_tensor.shape
+            self.dtype = cpu_tensor.dtype
+
+            nbytes = cpu_tensor.numel() * cpu_tensor.element_size()
+            self.shm = shared_memory.SharedMemory(create=True, size=nbytes)
+            try:
+                shm_view = np.ndarray((nbytes,), dtype=np.uint8, buffer=self.shm.buf)
+                shm_view[:] = cpu_tensor.view(torch.uint8).numpy().flatten()
+            finally:
+                self.shm.close()
+
+        return {
+            "shm_name": self.shm.name,
+            "shape": self.shape,
+            "dtype": self.dtype,
+        }
+
+    def __setstate__(self, state):
+        self.shm_name = state["shm_name"]
+        self.shape = state["shape"]
+        self.dtype = state["dtype"]
+        self.shm = None
+
+        shm_handle = shared_memory.SharedMemory(name=self.shm_name)
+        try:
+            self.tensor = (
+                torch.frombuffer(shm_handle.buf, dtype=self.dtype)
+                .reshape(self.shape)
+                .clone()
+            )
+        finally:
+            shm_handle.close()
+            shm_handle.unlink()
+
+
+def _get_is_default_transport():
+    global _is_default_tensor_transport
+    if _is_default_tensor_transport is None:
+        from sglang.srt.managers.tokenizer_manager import (
+            _determine_tensor_transport_mode,
+        )
+
+        _is_default_tensor_transport = (
+            _determine_tensor_transport_mode(get_global_server_args()) == "default"
+        )
+    return _is_default_tensor_transport
+
+
+def wrap_shm_features(obj):
+    """
+    Scan the object for multimodal tensors and wrap them in SHM pointers.
+    """
+    if _get_is_default_transport() or get_global_server_args().skip_tokenizer_init:
+        return obj
+
+    if hasattr(obj, "mm_inputs") and obj.mm_inputs:
+        mm_items = obj.mm_inputs.get("mm_items", [])
+        for item in mm_items:
+            if (
+                hasattr(item, "feature")
+                and isinstance(item.feature, torch.Tensor)
+                and item.feature.is_cpu
+            ):
+                item.feature = ShmPointerMMData(item.feature)
+    return obj
+
+
+def unwrap_shm_features(obj):
+    """
+    Restore ShmPointerMMData wrappers back into standard torch.Tensors.
+    """
+    if _get_is_default_transport() or get_global_server_args().skip_tokenizer_init:
+        return obj
+    if hasattr(obj, "mm_inputs") and obj.mm_inputs:
+        mm_items = obj.mm_inputs.get("mm_items", [])
+        for item in mm_items:
+            if isinstance(item.feature, ShmPointerMMData):
+                item.feature = item.feature.tensor
+    return obj
diff --git a/sglang/python/sglang/srt/managers/multi_tokenizer_mixin.py b/sglang/python/sglang/srt/managers/multi_tokenizer_mixin.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2586497df949f33ad73f52bf4fb886f84a82f7a
--- /dev/null
+++ b/sglang/python/sglang/srt/managers/multi_tokenizer_mixin.py
@@ -0,0 +1,517 @@
+from __future__ import annotations
+
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""
+Mixin classes and utils for multi-http-worker mode
+This file uses multiple processes to handle requests and tokenization, reducing the overhead of python and http server.
+"""
+
+import asyncio
+import logging
+import multiprocessing as multiprocessing
+import os
+import pickle
+import sys
+import threading
+from functools import partialmethod
+from multiprocessing import shared_memory
+from typing import TYPE_CHECKING, Any, Dict, Union
+
+import setproctitle
+import zmq
+import zmq.asyncio
+
+from sglang.srt.disaggregation.utils import DisaggregationMode, TransferBackend
+from sglang.srt.managers.disagg_service import start_disagg_service
+from sglang.srt.managers.io_struct import (
+    BaseBatchReq,
+    BaseReq,
+    BatchEmbeddingOutput,
+    BatchMultimodalOutput,
+    BatchStrOutput,
+    BatchTokenIDOutput,
+)
+from sglang.srt.managers.tokenizer_communicator_mixin import _Communicator
+from sglang.srt.managers.tokenizer_manager import TokenizerManager
+from sglang.srt.server_args import PortArgs, ServerArgs
+from sglang.srt.utils import get_zmq_socket, kill_process_tree
+from sglang.utils import get_exception_traceback
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.detokenizer_manager import DetokenizerManager
+
+logger = logging.getLogger(__name__)
+
+
+class SocketMapping:
+    def __init__(self):
+        self._zmq_context = zmq.Context()
+        self._mapping: Dict[str, zmq.Socket] = {}
+
+    def clear_all_sockets(self):
+        for socket in self._mapping.values():
+            socket.close()
+        self._mapping.clear()
+
+    def _register_ipc_mapping(self, ipc_name: str, is_tokenizer: bool):
+        type_str = "tokenizer" if is_tokenizer else "detokenizer"
+        if ipc_name in self._mapping:
+            logger.warning(f"{type_str} already registered {ipc_name=}, skipping...")
+            return
+        logger.info(f"Registering {type_str} {ipc_name=} in SocketMapping...")
+        socket = get_zmq_socket(self._zmq_context, zmq.PUSH, ipc_name, False)
+        self._mapping[ipc_name] = socket
+
+    def send_output(self, ipc_name: str, output: Any):
+        if ipc_name is None:
+            # Some unhandled cases
+            logger.warning(f"IPC name is None, output type={type(output)}, skipping...")
+            return
+
+        if ipc_name not in self._mapping:
+            self._register_ipc_mapping(ipc_name, is_tokenizer=False)
+        self._mapping[ipc_name].send_pyobj(output)
+
+
+def _extract_field_by_index(
+    output: Any, field_name: str, index: int, check_length: bool = True
+) -> Any:
+    """Extract a field value from output by index, handling None and length checks.
+
+    Args:
+        output: The output object containing the field
+        field_name: The name of the field to extract
+        index: The index to access in the field list
+        check_length: If True, check both field existence and length. If False, only check field existence.
+
+    Returns:
+        A list containing the field value at index, or None if not available.
+    """
+    field = getattr(output, field_name, None)
+    if field is None:
+        return None
+
+    if isinstance(field, dict):
+        new_field = {}
+        for k, v in field.items():
+            if len(v) <= index:
+                new_field[k] = None
+            new_field[k] = v[index]
+        return new_field
+
+    if check_length:
+        if len(field) <= index:
+            return None
+
+    return [field[index]]
+
+
+def _handle_output_by_index(output, i):
+    """NOTE: A maintainable method is better here."""
+    if isinstance(output, BatchTokenIDOutput):
+        new_output = BatchTokenIDOutput(
+            rids=[output.rids[i]],
+            spec_verify_ct=_extract_field_by_index(output, "spec_verify_ct", i),
+            spec_accepted_tokens=_extract_field_by_index(
+                output, "spec_accepted_tokens", i
+            ),
+            spec_acceptance_histogram=_extract_field_by_index(
+                output, "spec_acceptance_histogram", i
+            ),
+            time_stats=_extract_field_by_index(output, "time_stats", i),
+            finished_reasons=_extract_field_by_index(output, "finished_reasons", i),
+            decoded_texts=_extract_field_by_index(output, "decoded_texts", i),
+            decode_ids=_extract_field_by_index(output, "decode_ids", i),
+            read_offsets=_extract_field_by_index(output, "read_offsets", i),
+            output_ids=_extract_field_by_index(output, "output_ids", i),
+            skip_special_tokens=_extract_field_by_index(
+                output, "skip_special_tokens", i
+            ),
+            spaces_between_special_tokens=_extract_field_by_index(
+                output, "spaces_between_special_tokens", i
+            ),
+            no_stop_trim=_extract_field_by_index(output, "no_stop_trim", i),
+            prompt_tokens=_extract_field_by_index(output, "prompt_tokens", i),
+            completion_tokens=_extract_field_by_index(output, "completion_tokens", i),
+            cached_tokens=_extract_field_by_index(output, "cached_tokens", i),
+            cached_tokens_details=_extract_field_by_index(
+                output, "cached_tokens_details", i
+            ),
+            input_token_logprobs_val=_extract_field_by_index(
+                output, "input_token_logprobs_val", i, check_length=False
+            ),
+            input_token_logprobs_idx=_extract_field_by_index(
+                output, "input_token_logprobs_idx", i, check_length=False
+            ),
+            output_token_logprobs_val=_extract_field_by_index(
+                output, "output_token_logprobs_val", i, check_length=False
+            ),
+            output_token_logprobs_idx=_extract_field_by_index(
+                output, "output_token_logprobs_idx", i, check_length=False
+            ),
+            input_top_logprobs_val=_extract_field_by_index(
+                output, "input_top_logprobs_val", i, check_length=False
+            ),
+            input_top_logprobs_idx=_extract_field_by_index(
+                output, "input_top_logprobs_idx", i, check_length=False
+            ),
+            output_top_logprobs_val=_extract_field_by_index(
+                output, "output_top_logprobs_val", i, check_length=False
+            ),
+            output_top_logprobs_idx=_extract_field_by_index(
+                output, "output_top_logprobs_idx", i, check_length=False
+            ),
+            input_token_ids_logprobs_val=_extract_field_by_index(
+                output, "input_token_ids_logprobs_val", i, check_length=False
+            ),
+            input_token_ids_logprobs_idx=_extract_field_by_index(
+                output, "input_token_ids_logprobs_idx", i, check_length=False
+            ),
+            output_token_ids_logprobs_val=_extract_field_by_index(
+                output, "output_token_ids_logprobs_val", i, check_length=False
+            ),
+            output_token_ids_logprobs_idx=_extract_field_by_index(
+                output, "output_token_ids_logprobs_idx", i, check_length=False
+            ),
+            output_token_entropy_val=_extract_field_by_index(
+                output, "output_token_entropy_val", i, check_length=False
+            ),
+            output_hidden_states=_extract_field_by_index(
+                output, "output_hidden_states", i, check_length=False
+            ),
+            placeholder_tokens_idx=None,
+            placeholder_tokens_val=None,
+            token_steps=_extract_field_by_index(
+                output, "token_steps", i, check_length=False
+            ),
+        )
+    elif isinstance(output, BatchEmbeddingOutput):
+        new_output = BatchEmbeddingOutput(
+            rids=[output.rids[i]],
+            finished_reasons=_extract_field_by_index(output, "finished_reasons", i),
+            embeddings=_extract_field_by_index(output, "embeddings", i),
+            prompt_tokens=_extract_field_by_index(output, "prompt_tokens", i),
+            cached_tokens=_extract_field_by_index(output, "cached_tokens", i),
+            placeholder_tokens_idx=None,
+            placeholder_tokens_val=None,
+        )
+    elif isinstance(output, BatchStrOutput):
+        new_output = BatchStrOutput(
+            rids=[output.rids[i]],
+            spec_verify_ct=_extract_field_by_index(output, "spec_verify_ct", i),
+            spec_accepted_tokens=_extract_field_by_index(
+                output, "spec_accepted_tokens", i
+            ),
+            spec_acceptance_histogram=_extract_field_by_index(
+                output, "spec_acceptance_histogram", i
+            ),
+            time_stats=_extract_field_by_index(output, "time_stats", i),
+            finished_reasons=_extract_field_by_index(output, "finished_reasons", i),
+            output_strs=_extract_field_by_index(output, "output_strs", i),
+            output_ids=_extract_field_by_index(output, "output_ids", i),
+            prompt_tokens=_extract_field_by_index(output, "prompt_tokens", i),
+            completion_tokens=_extract_field_by_index(output, "completion_tokens", i),
+            cached_tokens=_extract_field_by_index(output, "cached_tokens", i),
+            input_token_logprobs_val=_extract_field_by_index(
+                output, "input_token_logprobs_val", i, check_length=False
+            ),
+            input_token_logprobs_idx=_extract_field_by_index(
+                output, "input_token_logprobs_idx", i, check_length=False
+            ),
+            output_token_logprobs_val=_extract_field_by_index(
+                output, "output_token_logprobs_val", i, check_length=False
+            ),
+            output_token_logprobs_idx=_extract_field_by_index(
+                output, "output_token_logprobs_idx", i, check_length=False
+            ),
+            input_top_logprobs_val=_extract_field_by_index(
+                output, "input_top_logprobs_val", i, check_length=False
+            ),
+            input_top_logprobs_idx=_extract_field_by_index(
+                output, "input_top_logprobs_idx", i, check_length=False
+            ),
+            output_top_logprobs_val=_extract_field_by_index(
+                output, "output_top_logprobs_val", i, check_length=False
+            ),
+            output_top_logprobs_idx=_extract_field_by_index(
+                output, "output_top_logprobs_idx", i, check_length=False
+            ),
+            input_token_ids_logprobs_val=_extract_field_by_index(
+                output, "input_token_ids_logprobs_val", i, check_length=False
+            ),
+            input_token_ids_logprobs_idx=_extract_field_by_index(
+                output, "input_token_ids_logprobs_idx", i, check_length=False
+            ),
+            output_token_ids_logprobs_val=_extract_field_by_index(
+                output, "output_token_ids_logprobs_val", i, check_length=False
+            ),
+            output_token_ids_logprobs_idx=_extract_field_by_index(
+                output, "output_token_ids_logprobs_idx", i, check_length=False
+            ),
+            output_token_entropy_val=_extract_field_by_index(
+                output, "output_token_entropy_val", i, check_length=False
+            ),
+            output_hidden_states=_extract_field_by_index(
+                output, "output_hidden_states", i, check_length=False
+            ),
+            routed_experts=_extract_field_by_index(
+                output, "routed_experts", i, check_length=False
+            ),
+            customized_info=_extract_field_by_index(
+                output, "customized_info", i, check_length=False
+            ),
+            dp_ranks=_extract_field_by_index(output, "dp_ranks", i, check_length=False),
+            placeholder_tokens_idx=None,
+            placeholder_tokens_val=None,
+            retraction_counts=_extract_field_by_index(output, "retraction_counts", i),
+            token_steps=_extract_field_by_index(
+                output, "token_steps", i, check_length=False
+            ),
+        )
+    elif isinstance(output, BatchMultimodalOutput):
+        new_output = BatchMultimodalOutput(
+            rids=[output.rids[i]],
+            finished_reasons=_extract_field_by_index(output, "finished_reasons", i),
+            outputs=_extract_field_by_index(output, "outputs", i),
+            prompt_tokens=_extract_field_by_index(output, "prompt_tokens", i),
+            completion_tokens=_extract_field_by_index(output, "completion_tokens", i),
+            cached_tokens=_extract_field_by_index(output, "cached_tokens", i),
+            placeholder_tokens_idx=None,
+            placeholder_tokens_val=None,
+        )
+    else:
+        new_output = output
+    return new_output
+
+
+class MultiHttpWorkerDetokenizerMixin:
+    """Mixin class for DetokenizerManager"""
+
+    def maybe_clear_socket_mapping(self: DetokenizerManager):
+        if hasattr(self, "socket_mapping"):
+            self.socket_mapping.clear_all_sockets()
+
+    def multi_http_worker_event_loop(self: DetokenizerManager):
+        """The event loop that handles requests, for multi multi-http-worker mode"""
+        self.socket_mapping = SocketMapping()
+        while True:
+            recv_obj = self.recv_from_scheduler.recv_pyobj()
+            output = self._request_dispatcher(recv_obj)
+            if output is None:
+                continue
+
+            assert isinstance(
+                recv_obj, BaseBatchReq
+            ), "for multi-http-worker, recv_obj must be BaseBatchReq"
+
+            # Send data using the corresponding socket
+            for i, ipc_name in enumerate(recv_obj.http_worker_ipcs):
+                new_output = _handle_output_by_index(output, i)
+                self.socket_mapping.send_output(ipc_name, new_output)
+
+
+class MultiTokenizerRouter:
+    """A router to receive requests from TokenizerWorker"""
+
+    def __init__(
+        self,
+        server_args: ServerArgs,
+        port_args: PortArgs,
+    ):
+        self.server_args = server_args
+        context = zmq.asyncio.Context(3)
+        self.recv_from_detokenizer = get_zmq_socket(
+            context, zmq.PULL, port_args.tokenizer_ipc_name, True
+        )
+        self.send_to_scheduler = get_zmq_socket(
+            context, zmq.PUSH, port_args.scheduler_input_ipc_name, True
+        )
+        self.receive_from_worker = get_zmq_socket(
+            context, zmq.PULL, port_args.tokenizer_worker_ipc_name, True
+        )
+        self._loop = asyncio.new_event_loop()
+        self._thread = threading.Thread(target=self._run_loop, daemon=True)
+        self._thread.start()
+        self._task = asyncio.run_coroutine_threadsafe(
+            self.router_worker_obj(), self._loop
+        )
+        # Start handle_loop simultaneously
+        self._handle_task = asyncio.run_coroutine_threadsafe(
+            print_exception_wrapper(self.handle_loop), self._loop
+        )
+        self.disaggregation_bootstrap_server = start_disagg_service(self.server_args)
+
+    def _run_loop(self):
+        self._loop.run_forever()
+
+    async def router_worker_obj(self):
+        while True:
+            recv_obj = await self.receive_from_worker.recv_pyobj()
+            await self.send_to_scheduler.send_pyobj(recv_obj)
+
+    async def handle_loop(self):
+        # special reqs will recv from scheduler, need to route to right worker
+        self.socket_mapping = SocketMapping()
+        while True:
+            recv_obj = await self.recv_from_detokenizer.recv_pyobj()
+            await self._distribute_result_to_workers(recv_obj)
+
+    async def _distribute_result_to_workers(self, recv_obj):
+        # Distribute result to each worker
+        if isinstance(recv_obj, BaseReq):
+            ipc_names = [recv_obj.http_worker_ipc]
+        elif isinstance(recv_obj, BaseBatchReq):
+            ipc_names = recv_obj.http_worker_ipcs
+        else:
+            raise ValueError(f"Unknown recv_obj type: {type(recv_obj)}")
+
+        for i, ipc_name in enumerate(ipc_names):
+            new_recv_obj = _handle_output_by_index(recv_obj, i)
+            self.socket_mapping.send_output(ipc_name, new_recv_obj)
+
+
+class TokenizerWorker(TokenizerManager):
+    """Tokenizer Worker in multi-http-worker mode"""
+
+    def __init__(
+        self,
+        server_args: ServerArgs,
+        port_args: PortArgs,
+    ):
+        setproctitle.setproctitle(f"sglang::tokenizer_worker:{os.getpid()}")
+        # prevent init prefill bootstrapserver again
+        disaggregation_mode = server_args.disaggregation_mode
+        server_args.disaggregation_mode = "null"
+        super().__init__(server_args, port_args)
+
+        self.worker_id = os.getpid()
+        self.tokenizer_ipc_name = port_args.tokenizer_ipc_name
+
+        # For PD disaggregtion
+        self.server_args.disaggregation_mode = disaggregation_mode
+        self.disaggregation_mode = DisaggregationMode(
+            self.server_args.disaggregation_mode
+        )
+        self.disaggregation_transfer_backend = TransferBackend(
+            self.server_args.disaggregation_transfer_backend
+        )
+        # Communicator
+        self.register_multi_tokenizer_communicator = _Communicator(
+            self.send_to_scheduler, 2
+        )
+
+    def _attach_multi_http_worker_info(self, req: Union[BaseReq, BaseBatchReq]):
+
+        if isinstance(req, BaseReq):
+            req.http_worker_ipc = self.tokenizer_ipc_name
+        elif isinstance(req, BaseBatchReq):
+            req.http_worker_ipcs = [self.tokenizer_ipc_name] * len(req.rids)
+        else:
+            raise ValueError(f"Unknown req type: {type(req)}")
+
+
+async def print_exception_wrapper(func):
+    """
+    Sometimes an asyncio function does not print exception.
+    We do another wrapper to handle the exception.
+    """
+    try:
+        await func()
+    except Exception:
+        traceback = get_exception_traceback()
+        logger.error(f"MultiTokenizerRouter hit an exception: {traceback}")
+        if hasattr(func, "__self__") and isinstance(
+            func.__self__, MultiTokenizerRouter
+        ):
+            func.__self__.dump_requests_before_crash()
+        kill_process_tree(os.getpid(), include_parent=True)
+        sys.exit(1)
+
+
+def get_main_process_id() -> int:
+    """Get the main process ID"""
+    return multiprocessing.current_process()._parent_pid
+
+
+def write_to_shared_memory(obj, name: str) -> shared_memory.SharedMemory:
+    """Write data to shared memory"""
+    serialized = pickle.dumps(obj)
+    size = len(serialized)
+    try:
+        # Try to open existing shared memory
+        shm = shared_memory.SharedMemory(name=name)
+        # If size is insufficient, close and recreate
+        if shm.size < size:
+            shm.close()
+            shm.unlink()
+            shm = shared_memory.SharedMemory(create=True, size=size, name=name)
+    except FileNotFoundError:
+        # If not present, create new shared memory
+        shm = shared_memory.SharedMemory(create=True, size=size, name=name)
+
+    shm.buf[:size] = serialized
+    return shm
+
+
+def read_from_shared_memory(name: str) -> Any:
+    """Read data from shared memory"""
+    try:
+        shm = shared_memory.SharedMemory(name=name)
+        data = pickle.loads(bytes(shm.buf))
+        shm.close()
+        return data
+    except FileNotFoundError:
+        raise FileNotFoundError(f"Shared memory {name} not found")
+
+
+def write_data_for_multi_tokenizer(
+    port_args: PortArgs, server_args: ServerArgs, scheduler_info: Dict
+):
+    """Write args information to share memory for multi-tokenizer"""
+    # get main process ID
+    main_pid = get_main_process_id()
+    current_pid = os.getpid()
+    logger.info(f"main process ID: {main_pid}, current process ID: {current_pid}")
+    args = (port_args, server_args, scheduler_info)
+    args_shm = write_to_shared_memory(args, f"multi_tokenizer_args_{current_pid}")
+    args_shm.close()
+
+    return args_shm
+
+
+def monkey_patch_uvicorn_multiprocessing(timeout: float = 10):
+    """Monkey patch uvicorn multiprocessing is_alive timeout"""
+    # from default 5s -> 10s
+    try:
+        from uvicorn.supervisors.multiprocess import Process
+
+        Process.is_alive = partialmethod(Process.is_alive, timeout=timeout)
+
+    except ImportError:
+        logger.warning(
+            "uvicorn.supervisors.multiprocess not found, skipping monkey patch"
+        )
+
+
+class SenderWrapper:
+    def __init__(self, port_args: PortArgs, send_to_scheduler: zmq.Socket):
+        self.port_args = port_args
+        self.send_to_scheduler = send_to_scheduler
+
+    def send_pyobj(self, obj):
+        if isinstance(obj, BaseReq):
+            obj.http_worker_ipc = self.port_args.tokenizer_ipc_name
+        self.send_to_scheduler.send_pyobj(obj)
diff --git a/sglang/python/sglang/srt/managers/multimodal_processor.py b/sglang/python/sglang/srt/managers/multimodal_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2c9e68cb9f94306efed4c777fe1bdda38390da3
--- /dev/null
+++ b/sglang/python/sglang/srt/managers/multimodal_processor.py
@@ -0,0 +1,55 @@
+# TODO: also move pad_input_ids into this module
+import importlib
+import inspect
+import logging
+import pkgutil
+
+from sglang.srt.multimodal.processors.base_processor import BaseMultimodalProcessor
+from sglang.srt.server_args import ServerArgs
+
+logger = logging.getLogger(__name__)
+
+PROCESSOR_MAPPING = {}
+
+
+def import_processors(package_name: str, overwrite: bool = False):
+    package = importlib.import_module(package_name)
+    for _, name, ispkg in pkgutil.iter_modules(package.__path__, package_name + "."):
+        if not ispkg:
+            try:
+                module = importlib.import_module(name)
+            except Exception as e:
+                logger.warning(f"Ignore import error when loading {name}: {e}")
+                continue
+            all_members = inspect.getmembers(module, inspect.isclass)
+            classes = [
+                member
+                for name, member in all_members
+                if member.__module__ == module.__name__
+            ]
+            for cls in (
+                cls for cls in classes if issubclass(cls, BaseMultimodalProcessor)
+            ):
+                assert hasattr(cls, "models")
+                for arch in getattr(cls, "models"):
+                    if overwrite:
+                        for model_cls, processor_cls in PROCESSOR_MAPPING.items():
+                            if model_cls.__name__ == arch.__name__:
+                                del PROCESSOR_MAPPING[model_cls]
+                                break
+                    PROCESSOR_MAPPING[arch] = cls
+
+
+def get_mm_processor(
+    hf_config, server_args: ServerArgs, processor, transport_mode, **kwargs
+) -> BaseMultimodalProcessor:
+    for model_cls, processor_cls in PROCESSOR_MAPPING.items():
+        if model_cls.__name__ in hf_config.architectures:
+            return processor_cls(
+                hf_config, server_args, processor, transport_mode, **kwargs
+            )
+
+    raise ValueError(
+        f"No processor registered for architecture: {hf_config.architectures}.\n"
+        f"Registered architectures: {[model_cls.__name__ for model_cls in PROCESSOR_MAPPING.keys()]}"
+    )
diff --git a/sglang/python/sglang/srt/managers/overlap_utils.py b/sglang/python/sglang/srt/managers/overlap_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..9246c20980e39640c85c4ac9b3202ed7ec97c281
--- /dev/null
+++ b/sglang/python/sglang/srt/managers/overlap_utils.py
@@ -0,0 +1,177 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Optional
+
+import torch
+
+from sglang.srt.speculative.spec_utils import spec_need_hidden_states
+from sglang.srt.utils import get_compiler_backend, is_npu
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.schedule_batch import ModelWorkerBatch
+    from sglang.srt.managers.scheduler import GenerationBatchResult
+    from sglang.srt.speculative.eagle_info import EagleDraftInput
+    from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
+
+_is_npu = is_npu()
+
+
+@torch.compile(dynamic=True, backend=get_compiler_backend(), disable=_is_npu)
+def _resolve_future_token_ids(input_ids, future_token_ids_map):
+    input_ids[:] = torch.where(
+        input_ids < 0,
+        future_token_ids_map[torch.clamp(-input_ids, min=0)],
+        input_ids,
+    )
+
+
+@dataclass
+class FutureIndices:
+    indices: torch.Tensor
+    interval: Optional[slice] = None
+
+
+class FutureMap:
+    def __init__(
+        self,
+        max_running_requests: int,
+        chunked_prefill_size: int,
+        context_len: int,
+        device: torch.device,
+        spec_algo: Optional[SpeculativeAlgorithm] = None,
+    ):
+        # FIXME: the calculation of future_limit and future_buffer_len maybe too conservative
+        self.future_ct = 0
+
+        # Circular buffer layout (wraps in this order):
+        # Running decode batch -> Prefill chunk 1 -> ... -> Prefill chunk N
+        # A running decode batch's result will be resolved after all prefill chunks are done.
+        # reserve `max_num_chunks` extra future slots on top of `max_running_requests * 3`.
+        max_num_chunks = (
+            (context_len + chunked_prefill_size - 1) // chunked_prefill_size
+            if chunked_prefill_size
+            else 0
+        )
+        self.future_limit = max_running_requests * (3 + max_num_chunks)
+        # Adding 2 * max_running_requests to future_limit ensures the buffer is sufficiently large.
+        self.future_buffer_len = self.future_limit + 2 * max_running_requests
+        self.device = device
+        self.spec_algo = spec_algo
+
+        if self.spec_algo.is_none():
+            # For non-speculative decoding, we only need to store the token ids.
+            self.buf_initialized = True
+            self.token_ids_buf = torch.empty(
+                (self.future_buffer_len,), dtype=torch.int64, device=self.device
+            )
+        else:
+            # For speculative decoding, we lazily initialize the buffers
+            # This is to make the shape derivation easier.
+            self.buf_initialized = False
+
+    def _lazy_init_buf(self, draft_input: EagleDraftInput):
+        self.buf_initialized = True
+
+        # Get a reference for each tensor
+        topk_p0 = draft_input.topk_p[0]
+        topk_index0 = draft_input.topk_index[0]
+        verified_id0 = draft_input.verified_id[0]
+        new_seq_lens0 = draft_input.new_seq_lens[0]
+
+        self.topk_p_buf = torch.empty(
+            (self.future_buffer_len, *topk_p0.shape),
+            dtype=topk_p0.dtype,
+            device=self.device,
+        )
+        self.topk_index_buf = torch.empty(
+            (self.future_buffer_len, *topk_index0.shape),
+            dtype=topk_index0.dtype,
+            device=self.device,
+        )
+        self.verified_id_buf = torch.empty(
+            (self.future_buffer_len, *verified_id0.shape),
+            dtype=verified_id0.dtype,
+            device=self.device,
+        )
+        self.new_seq_lens_buf = torch.empty(
+            (self.future_buffer_len, *new_seq_lens0.shape),
+            dtype=new_seq_lens0.dtype,
+            device=self.device,
+        )
+
+        if spec_need_hidden_states():
+            hidden_states0 = draft_input.hidden_states[0]
+            self.hidden_states_buf = torch.empty(
+                (self.future_buffer_len, *hidden_states0.shape),
+                dtype=hidden_states0.dtype,
+                device=self.device,
+            )
+
+    def alloc_future_indices(self, bs: int) -> FutureIndices:
+        """Update the circular buffer pointer and allocate future indices."""
+        cur_future_ct = self.future_ct
+        self.future_ct = (cur_future_ct + bs) % self.future_limit
+        start = cur_future_ct + 1
+        end = cur_future_ct + 1 + bs
+        indices = torch.arange(start, end, dtype=torch.int64, device=self.device)
+        return FutureIndices(indices=indices, interval=slice(start, end))
+
+    def resolve_future(self, model_worker_batch: ModelWorkerBatch):
+        if self.spec_algo.is_none():
+            _resolve_future_token_ids(model_worker_batch.input_ids, self.token_ids_buf)
+        else:
+            # TODO(lsyin): write future indices into spec_info.future_indices
+            draft_input: EagleDraftInput = model_worker_batch.spec_info
+            if draft_input is None:
+                # FIXME(lsyin): No future exists, only for prefill batch, not compatible with mixed mode
+                return
+            indices = draft_input.future_indices.indices
+            # The indices tensor was allocated on the default stream but is
+            # used here on the forward stream. Meanwhile, the old spec_info
+            # holding this tensor will lose all Python references (replaced at
+            # model_worker_batch.spec_info and batch.spec_info), so the
+            # caching allocator (torch GC) could reclaim the memory before
+            # the GPU finishes reading it.
+            indices.record_stream(torch.get_device_module(self.device).current_stream())
+            draft_input.topk_p = self.topk_p_buf[indices]
+            draft_input.topk_index = self.topk_index_buf[indices]
+            draft_input.verified_id = self.verified_id_buf[indices]
+            draft_input.new_seq_lens = self.new_seq_lens_buf[indices]
+            if spec_need_hidden_states():
+                draft_input.hidden_states = self.hidden_states_buf[indices]
+
+    def is_empty_slice(self, s: slice) -> bool:
+        start, stop, step = s.indices(self.future_buffer_len)
+        if step > 0:
+            return start >= stop
+        else:
+            return start <= stop
+
+    def store_to_map(
+        self, future_indices: FutureIndices, batch_result: GenerationBatchResult
+    ):
+        if self.spec_algo.is_none():
+            intv = future_indices.interval
+            self.token_ids_buf[intv] = batch_result.next_token_ids
+        else:
+            draft_input: EagleDraftInput = batch_result.next_draft_input
+            self.store_to_map_for_new_batch(future_indices, draft_input)
+
+    def store_to_map_for_new_batch(
+        self, future_indices: FutureIndices, draft_input: EagleDraftInput
+    ):
+        intv = future_indices.interval
+        if self.is_empty_slice(intv):
+            # idle indices in dp attention do not need store info
+            return
+
+        if not self.buf_initialized:
+            self._lazy_init_buf(draft_input)
+
+        self.topk_p_buf[intv] = draft_input.topk_p
+        self.topk_index_buf[intv] = draft_input.topk_index
+        self.verified_id_buf[intv] = draft_input.verified_id
+        self.new_seq_lens_buf[intv] = draft_input.new_seq_lens
+        if spec_need_hidden_states():
+            self.hidden_states_buf[intv] = draft_input.hidden_states
diff --git a/sglang/python/sglang/srt/managers/prefill_delayer.py b/sglang/python/sglang/srt/managers/prefill_delayer.py
new file mode 100644
index 0000000000000000000000000000000000000000..503d7c6f84490087f4d9dc89e63f62e85f3fe126
--- /dev/null
+++ b/sglang/python/sglang/srt/managers/prefill_delayer.py
@@ -0,0 +1,301 @@
+import dataclasses
+import logging
+import time
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, NamedTuple, Optional
+
+import torch
+
+from sglang.srt.utils import get_bool_env_var
+
+if TYPE_CHECKING:
+    from sglang.srt.observability.metrics_collector import SchedulerMetricsCollector
+
+_DEBUG_LOG = get_bool_env_var("SGLANG_PREFILL_DELAYER_DEBUG_LOG")
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True)
+class _State:
+    delayed_count: int = 0
+    start_time: float = field(default_factory=time.perf_counter)
+
+    def bump_delayed_count(self) -> "_State":
+        return dataclasses.replace(self, delayed_count=self.delayed_count + 1)
+
+
+class _NegotiateOutput(NamedTuple):
+    next_state: Optional[_State]
+    input_estimation: str
+    output_allow: bool
+    output_reason: str
+    num_prefillable: int
+    num_token_watermark_force_allow: int
+
+
+class PrefillDelayer:
+    def __init__(
+        self,
+        dp_size: int,
+        attn_tp_size: int,
+        cpu_group,
+        server_args,
+        max_delay_passes: int,
+        token_usage_low_watermark: Optional[float],
+        metrics_collector: Optional["SchedulerMetricsCollector"] = None,
+        device: Optional["torch.device"] = "cpu",
+    ):
+        self._max_delay_passes = max_delay_passes
+        self._token_usage_low_watermark = token_usage_low_watermark
+        logger.info(
+            f"PrefillDelayer initialized with "
+            f"max_delay_passes={self._max_delay_passes} "
+            f"token_usage_low_watermark={self._token_usage_low_watermark}"
+        )
+        # The global_info contains four pieces of information:
+        # prefillable, token_watermark_force_allow, running_batch, and max_prefill_bs.
+        self._global_info_buffer = torch.empty(
+            (dp_size, attn_tp_size, 4),
+            dtype=torch.int64,
+            device=device,
+        )
+        self.enable_dp_attention = server_args.enable_dp_attention
+        self._cpu_group = cpu_group
+
+        self._metrics_collector = metrics_collector
+
+        self._curr_state: Optional[_State] = None
+        self.skip_first_delayer = True
+
+        assert (
+            server_args.disaggregation_mode == "null"
+        ), "To use PrefillDelayer, disaggregation_mode must be null."
+        assert (
+            not server_args.disable_overlap_schedule
+        ), "To use PrefillDelayer, disable_overlap_schedule must be False."
+
+    def _negotiate_should_allow_prefill(
+        self,
+        local_prefillable: bool,
+        token_usage: float,
+        **kwargs,
+    ) -> _NegotiateOutput:
+        out = self._negotiate_should_allow_prefill_pure(
+            prev_state=self._curr_state,
+            local_prefillable=local_prefillable,
+            token_usage=token_usage,
+            **kwargs,
+        )
+        self._curr_state = out.next_state
+        return out
+
+    # (Almost) pure function, do not modify self state
+    def _negotiate_should_allow_prefill_pure(
+        self,
+        prev_state: Optional[_State],
+        local_prefillable: bool,
+        token_usage: float,
+        **kwargs,
+    ) -> _NegotiateOutput:
+        # Compute local states
+        local_token_watermark_force_allow = (
+            local_prefillable
+            and ((x := self._token_usage_low_watermark) is not None)
+            and (token_usage < x)
+        )
+
+        # Gather global states
+        tp0_info = self._gather_info(
+            local_prefillable=local_prefillable,
+            local_token_watermark_force_allow=local_token_watermark_force_allow,
+            **kwargs,
+        )
+        global_prefillable = tp0_info[:, 0]
+        global_token_watermark_force_allow = tp0_info[:, 1]
+        global_running_batch = tp0_info[:, 2]
+        global_max_prefill_bs = tp0_info[:, 3]
+
+        # Compute derived global states
+        if global_prefillable.min().item() > 0:
+            prefillable_status = "all"
+        elif global_prefillable.max().item() == 0:
+            prefillable_status = "none"
+        else:
+            prefillable_status = "mixed"
+        global_exists_token_watermark_force_allow = (
+            global_token_watermark_force_allow.max().item() > 0
+        )
+        debug_info = dict(
+            input_estimation=prefillable_status,
+            num_prefillable=global_prefillable.sum().item(),
+            num_token_watermark_force_allow=global_token_watermark_force_allow.sum().item(),
+        )
+
+        # Compute outputs
+        if prefillable_status == "all":
+            if kwargs is None:
+                exist_previous_wait = prev_state is not None
+                return _NegotiateOutput(
+                    next_state=None,
+                    output_allow=True,
+                    output_reason="wait_success" if exist_previous_wait else "no_wait",
+                    **debug_info,
+                )
+
+            max_running_requests = kwargs.get("max_running_requests", 0)
+            if (
+                max_running_requests - global_running_batch.max().item()
+                < global_max_prefill_bs.max().item()
+            ):
+                # When the "max_decode_bs - running_bs < max_prefill_bs" condition is met,
+                # the first merge_batch causes the decoding to fail to reach the maximum batch size.
+                if self.skip_first_delayer:
+                    self.skip_first_delayer = False
+                    pass
+                else:
+                    next_state = prev_state or _State()
+                    next_state = next_state.bump_delayed_count()
+                    return _NegotiateOutput(
+                        next_state=next_state,
+                        output_allow=False,
+                        output_reason="delay",
+                        **debug_info,
+                    )
+            exist_previous_wait = prev_state is not None
+            return _NegotiateOutput(
+                next_state=None,
+                output_allow=True,
+                output_reason="wait_success" if exist_previous_wait else "no_wait",
+                **debug_info,
+            )
+        elif prefillable_status == "none":
+            return _NegotiateOutput(
+                next_state=None,
+                # It does not matter whether we allow or not, thus we allow for simplicity
+                output_allow=True,
+                output_reason="",
+                **debug_info,
+            )
+        elif prefillable_status == "mixed":
+            if global_exists_token_watermark_force_allow:
+                return _NegotiateOutput(
+                    next_state=None,
+                    output_allow=True,
+                    output_reason="token_watermark",
+                    **debug_info,
+                )
+
+            prev_delayed_count = prev_state.delayed_count if prev_state else 0
+            if prev_delayed_count < self._max_delay_passes - 1:
+                next_state = prev_state or _State()
+                next_state = next_state.bump_delayed_count()
+                return _NegotiateOutput(
+                    next_state=next_state,
+                    output_allow=False,
+                    output_reason="delay",
+                    **debug_info,
+                )
+            else:
+                return _NegotiateOutput(
+                    next_state=None,
+                    output_allow=True,
+                    output_reason="wait_timeout",
+                    **debug_info,
+                )
+        else:
+            raise NotImplementedError
+
+    def _gather_info(
+        self, local_prefillable: bool, local_token_watermark_force_allow: bool, **kwargs
+    ):
+        local_info = torch.tensor(
+            [
+                int(local_prefillable),
+                int(local_token_watermark_force_allow),
+                kwargs.get("running_batch", 0),
+                kwargs.get("max_prefill_bs", 0),
+            ],
+            device="cpu",
+            dtype=torch.int64,
+        )
+        torch.distributed.all_gather_into_tensor(
+            self._global_info_buffer.flatten(),
+            local_info,
+            group=self._cpu_group,
+        )
+        tp0_info = self._global_info_buffer[:, 0, :]
+        return tp0_info
+
+
+class PrefillDelayerSinglePassExecutor:
+    def __init__(self, prefill_delayer: PrefillDelayer, token_usage: float):
+        self._prefill_delayer = prefill_delayer
+        self._token_usage = token_usage
+        self._result: Optional[_NegotiateOutput] = None
+
+    @property
+    def _called(self) -> bool:
+        return self._result is not None
+
+    def finalize(self, *, actual_prefill: bool):
+        if not self._called:
+            self.negotiate_should_allow_prefill(local_prefillable=False)
+
+        _record_single_pass_result(
+            actual_execution=actual_prefill,
+            output=self._result,
+            metrics_collector=self._prefill_delayer._metrics_collector,
+        )
+
+    def negotiate_should_allow_prefill(self, local_prefillable: bool, **kwargs) -> bool:
+        if not self._called:
+            self._result = self._prefill_delayer._negotiate_should_allow_prefill(
+                local_prefillable=local_prefillable,
+                token_usage=self._token_usage,
+                **kwargs,
+            )
+        return self._result.output_allow
+
+
+def _record_single_pass_result(
+    actual_execution: bool,
+    output: _NegotiateOutput,
+    metrics_collector: Optional["SchedulerMetricsCollector"],
+) -> None:
+    if _DEBUG_LOG:
+        if output.output_allow and (output.output_reason == "wait_timeout"):
+            logger.info(
+                f"PrefillDelayer timeout thus not forbid prefill "
+                f"(num_prefillable={output.num_prefillable}, "
+                f"actual_execution={actual_execution})"
+            )
+        elif output.output_allow and (output.output_reason == "token_watermark"):
+            logger.info(
+                f"PrefillDelayer force allow prefill due to low watermark. "
+                f"(num_prefillable={output.num_prefillable}, "
+                f"num_token_watermark_force_allow={output.num_token_watermark_force_allow}, "
+                f"actual_execution={actual_execution})"
+            )
+        else:
+            assert output.output_reason in {
+                "",
+                "wait_success",
+                "no_wait",
+                "delay",
+            }
+
+    if metrics_collector is not None:
+        if (s := output.next_state) is not None:
+            wait_seconds = time.perf_counter() - s.start_time
+            forward_passes = s.delayed_count
+        else:
+            wait_seconds = forward_passes = 0
+        metrics_collector.observe_prefill_delayer_outcome(
+            forward_passes=forward_passes,
+            wait_seconds=wait_seconds,
+            input_estimation=output.input_estimation,
+            output_allow=output.output_allow,
+            output_reason=output.output_reason,
+            actual_execution=actual_execution,
+        )
diff --git a/sglang/python/sglang/srt/managers/schedule_batch.py b/sglang/python/sglang/srt/managers/schedule_batch.py
new file mode 100644
index 0000000000000000000000000000000000000000..23d6dfc3523f8afc530d82ee8d9a21a4ec9a6270
--- /dev/null
+++ b/sglang/python/sglang/srt/managers/schedule_batch.py
@@ -0,0 +1,2432 @@
+from __future__ import annotations
+
+from sglang.srt.dllm.config import DllmConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.utils.common import ceil_align, is_pin_memory_available
+
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+Store information about requests and batches.
+
+The following is the flow of data structures for a batch:
+
+ScheduleBatch -> ModelWorkerBatch -> ForwardBatch
+
+- ScheduleBatch is managed by `scheduler.py::Scheduler`.
+  It contains high-level scheduling data. Most of the data is on the CPU.
+- ModelWorkerBatch is managed by `tp_worker.py::TpModelWorker`.
+  It is a subset of `ScheduleBatch` that only contains data related to the model forward on GPU.
+  It will be transformed from CPU scheduler to GPU model runner.
+- ForwardBatch is managed by `model_runner.py::ModelRunner`.
+  It contains low-level tensor data. Most of the data consists of GPU tensors.
+
+TODO(lmzheng): ModelWorkerBatch seems a bit redundant and we consider removing it in the future.
+"""
+
+import copy
+import dataclasses
+import logging
+import re
+from enum import Enum, auto
+from functools import lru_cache
+from http import HTTPStatus
+from itertools import chain
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Union
+
+import numpy as np
+import torch
+
+from sglang.srt.constrained.base_grammar_backend import BaseGrammarObject
+from sglang.srt.disaggregation.base import BaseKVSender
+from sglang.srt.disaggregation.decode_schedule_batch_mixin import (
+    ScheduleBatchDisaggregationDecodeMixin,
+)
+from sglang.srt.disaggregation.utils import DisaggregationMode
+from sglang.srt.distributed.parallel_state import get_tensor_model_parallel_rank
+from sglang.srt.dllm.mixin.req import ReqDllmMixin
+from sglang.srt.environ import envs
+from sglang.srt.layers.attention.fla.chunk_delta_h import CHUNK_SIZE as FLA_CHUNK_SIZE
+from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
+from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache, MatchPrefixParams
+from sglang.srt.mem_cache.common import (
+    alloc_for_decode,
+    alloc_for_extend,
+    evict_from_tree_cache,
+    release_kv_cache,
+)
+from sglang.srt.mem_cache.memory_pool import ReqToTokenPool
+from sglang.srt.mem_cache.radix_cache import RadixKey
+from sglang.srt.mem_cache.swa_memory_pool import SWATokenToKVPoolAllocator
+from sglang.srt.model_executor.forward_batch_info import (
+    CaptureHiddenMode,
+    ForwardBatch,
+    ForwardMode,
+)
+from sglang.srt.observability.metrics_collector import (
+    DPCooperationInfo,
+    SchedulerMetricsCollector,
+)
+from sglang.srt.observability.req_time_stats import (
+    APIServerReqTimeStats,
+    DPControllerReqTimeStats,
+    SchedulerReqTimeStats,
+)
+from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
+from sglang.srt.sampling.sampling_params import SamplingParams
+from sglang.srt.server_args import ServerArgs, get_global_server_args
+from sglang.srt.utils import flatten_nested_list
+from sglang.srt.utils.cuda_ipc_transport_utils import CudaIpcTensorTransportProxy
+
+if TYPE_CHECKING:
+    from typing import Any, Dict
+
+    from sglang.srt.configs.model_config import ModelConfig
+    from sglang.srt.managers.session_controller import Session
+    from sglang.srt.observability.scheduler_metrics_mixin import PrefillStats
+    from sglang.srt.speculative.eagle_info import EagleDraftInput
+    from sglang.srt.speculative.spec_info import SpecInput, SpeculativeAlgorithm
+
+INIT_INCREMENTAL_DETOKENIZATION_OFFSET = 5
+
+# Constant used as the base offset for MM (multimodal) pad values.
+# This ensures pad_values don't overlap with valid text token IDs.
+MM_PAD_SHIFT_VALUE = 1_000_000
+
+logger = logging.getLogger(__name__)
+
+
+@lru_cache(maxsize=1)
+def sanity_check_mm_pad_shift_value(vocab_size: int) -> None:
+    if vocab_size > MM_PAD_SHIFT_VALUE:
+        raise ValueError(
+            f"Model vocab_size ({vocab_size}) exceeds MM_PAD_SHIFT_VALUE ({MM_PAD_SHIFT_VALUE}). "
+            f"MM pad_values may overlap with valid token IDs. "
+            f"Please increase MM_PAD_SHIFT_VALUE in schedule_batch.py."
+        )
+
+
+def _compute_pad_value(hash: int) -> int:
+    """Compute pad value from hash."""
+    return MM_PAD_SHIFT_VALUE + (hash % (1 << 30))
+
+
+class BaseFinishReason:
+    def __init__(self, is_error: bool = False):
+        self.is_error = is_error
+
+    def to_json(self):
+        raise NotImplementedError()
+
+
+class FINISH_MATCHED_TOKEN(BaseFinishReason):
+    def __init__(self, matched: Union[int, List[int]]):
+        super().__init__()
+        self.matched = matched
+
+    def to_json(self):
+        return {
+            "type": "stop",  # to match OpenAI API's return value
+            "matched": self.matched,
+        }
+
+
+class FINISH_MATCHED_STR(BaseFinishReason):
+    def __init__(self, matched: str):
+        super().__init__()
+        self.matched = matched
+
+    def to_json(self):
+        return {
+            "type": "stop",  # to match OpenAI API's return value
+            "matched": self.matched,
+        }
+
+
+class FINISHED_MATCHED_REGEX(BaseFinishReason):
+    def __init__(self, matched: str):
+        super().__init__()
+        self.matched = matched
+
+    def to_json(self):
+        return {
+            "type": "stop",  # to match OpenAI API's return value
+            "matched": self.matched,
+        }
+
+
+class FINISH_LENGTH(BaseFinishReason):
+    def __init__(self, length: int):
+        super().__init__()
+        self.length = length
+
+    def to_json(self):
+        return {
+            "type": "length",  # to match OpenAI API's return value
+            "length": self.length,
+        }
+
+
+class FINISH_ABORT(BaseFinishReason):
+    def __init__(self, message=None, status_code=None, err_type=None):
+        super().__init__(is_error=True)
+        self.message = message or "Aborted"
+        self.status_code = status_code
+        self.err_type = err_type
+
+    def to_json(self):
+        return {
+            "type": "abort",
+            "message": self.message,
+            "status_code": self.status_code,
+            "err_type": self.err_type,
+        }
+
+
+class Modality(Enum):
+    IMAGE = auto()
+    MULTI_IMAGES = auto()
+    VIDEO = auto()
+    AUDIO = auto()
+
+    @staticmethod
+    def from_str(modality_str: str):
+        try:
+            return Modality[modality_str.upper()]
+        except KeyError:
+            raise ValueError(
+                f"Invalid modality string: {modality_str}. Valid modalities are: {[m.name for m in Modality]}"
+            )
+
+    @staticmethod
+    def all():
+        return [Modality.IMAGE, Modality.VIDEO, Modality.AUDIO]
+
+
+class MultimodalInputFormat(Enum):
+    NORMAL = auto()
+    PROCESSOR_OUTPUT = auto()
+    PRECOMPUTED_EMBEDDING = auto()
+
+
+@dataclasses.dataclass
+class MultimodalDataItem:
+    """
+    One MultimodalDataItem contains all inputs for one modality.
+    For example, if there are 3 images and 1 audio inputs, there will be 2 MultimodalDataItem.
+    One for images and one for audio.
+
+    We put the common fields first and the model-specific fields in model_specific_data.
+    """
+
+    modality: Modality
+    hash: int = None
+    pad_value: int = None
+    offsets: Optional[list] = None
+
+    format: MultimodalInputFormat = MultimodalInputFormat.NORMAL
+
+    # the raw features returned by processor, e.g. pixel_values or audio_features
+    feature: Union[torch.Tensor, np.ndarray] = None
+    # the precomputed embeddings, passed as final encoder embeddings
+    # One and only one of the feature and precomputed_embeddings will be empty
+    precomputed_embeddings: Optional[Union[torch.Tensor, np.ndarray]] = None
+
+    # Model-specific data stored in a dictionary
+    model_specific_data: dict[str, Any] = dataclasses.field(default_factory=dict)
+
+    def __getattr__(self, name: str):
+        if (
+            "model_specific_data" in self.__dict__
+            and name in self.__dict__["model_specific_data"]
+        ):
+            return self.__dict__["model_specific_data"][name]
+        else:
+            raise AttributeError(
+                f"'{self.__class__.__name__}' object has no attribute '{name}'"
+            )
+
+    def __setitem__(self, key: str, value: Any):
+        if key in self.__dict__:
+            self.__dict__[key] = value
+        else:
+            self.model_specific_data[key] = value
+
+    def set(self, key: str, value: Any):
+        self.__setitem__(key, value)
+
+    @staticmethod
+    def is_empty_list(l):
+        if l is None:
+            return True
+        return len([item for item in flatten_nested_list(l) if item is not None]) == 0
+
+    def set_pad_value(self):
+        """
+        Set the pad value after first hashing the data
+        """
+        if self.pad_value is not None:
+            return
+
+        from sglang.srt.managers.mm_utils import hash_feature
+
+        if envs.SGLANG_MM_SKIP_COMPUTE_HASH.get():
+            import uuid
+
+            self.hash = uuid.uuid4().int
+            self.pad_value = _compute_pad_value(self.hash)
+            return
+        if self.hash is None:
+            if self.feature is not None:
+                hashed_feature = self.feature
+            else:
+                hashed_feature = self.precomputed_embeddings
+            self.hash = hash_feature(hashed_feature)
+        assert self.hash is not None
+        self.pad_value = _compute_pad_value(self.hash)
+
+    def is_modality(self, modality: Modality) -> bool:
+        return self.modality == modality
+
+    def is_audio(self):
+        return self.modality == Modality.AUDIO
+
+    def is_image(self):
+        return self.modality in [Modality.IMAGE, Modality.MULTI_IMAGES]
+
+    def is_video(self):
+        return self.modality == Modality.VIDEO
+
+    def is_valid(self) -> bool:
+        return self.is_image() or self.is_video() or self.is_audio()
+
+    def validate(self):
+        ...
+        # TODO
+
+    def is_precomputed_embedding(self):
+        return self.format == MultimodalInputFormat.PRECOMPUTED_EMBEDDING
+
+    @staticmethod
+    def from_dict(obj: dict):
+        kwargs = dict(obj)
+        modality = kwargs.pop("modality")
+        if isinstance(modality, str):
+            modality = Modality[modality]
+        ret = MultimodalDataItem(modality=modality, **kwargs)
+        ret.validate()
+        return ret
+
+    def merge(self, other):
+        self.feature += other.feature
+        self.offsets += other.offsets
+        self.hash = hash((self.hash, other.hash))
+        self.set_pad_value()
+
+
+@dataclasses.dataclass
+class MultimodalInputs:
+    """The multimodal data related inputs."""
+
+    # items of data
+    mm_items: List[MultimodalDataItem]
+    image_pad_len: Optional[list] = None
+    num_image_tokens: Optional[int] = None
+
+    # image
+    im_token_id: Optional[int] = None
+    im_start_id: Optional[int] = None
+    im_end_id: Optional[int] = None
+    slice_start_id: Optional[int] = None
+    slice_end_id: Optional[int] = None
+
+    # video
+    video_token_id: Optional[int] = None
+
+    # audio
+    audio_token_id: Optional[int] = None
+    audio_start_id: Optional[int] = None
+    audio_end_id: Optional[int] = None
+
+    # QWen2-VL related
+    mrope_positions: Optional[torch.Tensor] = None
+    mrope_position_delta: Optional[torch.Tensor] = None
+    mrope_position_delta_repeated_cache: Optional[torch.Tensor] = None
+
+    @staticmethod
+    def from_dict(obj: dict):
+        # Check if MM splitting is enabled
+        if not envs.SGLANG_ENABLE_MM_SPLITTING.get():
+            mm_items = obj["mm_items"]
+        else:
+            from sglang.srt.managers.mm_utils import get_new_expanded_mm_items
+
+            original_mm_items = obj["mm_items"]
+            # Now, `mm_items` contains one item per image.
+            mm_items = get_new_expanded_mm_items(original_mm_items)
+
+        ret = MultimodalInputs(
+            mm_items=mm_items,
+        )
+
+        assert isinstance(ret.mm_items, list)
+        ret.mm_items = [item for item in ret.mm_items if item.is_valid()]
+
+        if envs.SGLANG_MM_BUFFER_SIZE_MB.get() > 0:
+            # Multi-modal feature hashing optimization:
+            # When SGLANG_MM_BUFFER_SIZE_MB > 0, we temporarily move feature tensors to GPU
+            # for faster hash computation, while avoiding OOM issues.
+            from sglang.srt.managers.mm_utils import (
+                init_feature_buffer,
+                is_feature_buffer_initialized,
+                reset_buffer_offset,
+                try_add_to_buffer,
+            )
+
+            device = torch.cuda.current_device() if torch.cuda.is_available() else "cpu"
+            if not is_feature_buffer_initialized():
+                init_feature_buffer(device)
+            reset_buffer_offset()
+            for item in ret.mm_items:
+                if item.feature is not None:
+                    if isinstance(item.feature, torch.Tensor):
+                        item.feature = try_add_to_buffer(item.feature)
+
+        for item in ret.mm_items:
+            item.set_pad_value()
+
+        if envs.SGLANG_MM_BUFFER_SIZE_MB.get() > 0:
+            for item in ret.mm_items:
+                if item.feature is not None:
+                    item.feature = item.feature.to("cpu", non_blocking=True)
+
+        optional_args = [
+            "mrope_positions",
+            "mrope_position_delta",
+            "im_token_id",
+            "im_start_id",
+            "im_end_id",
+            "video_token_id",
+            "slice_start_id",
+            "slice_end_id",
+            "audio_start_id",
+            "audio_end_id",
+            "audio_token_id",
+        ]
+        for arg in optional_args:
+            if arg in obj:
+                setattr(ret, arg, obj[arg])
+
+        return ret
+
+    def contains_image_inputs(self) -> bool:
+        return any(item.is_image() for item in self.mm_items)
+
+    def contains_video_inputs(self) -> bool:
+        return any(item.is_video() for item in self.mm_items)
+
+    def contains_audio_inputs(self) -> bool:
+        return any(item.is_audio() for item in self.mm_items)
+
+    def contains_mm_input(self) -> bool:
+        return any(True for item in self.mm_items if item.is_valid())
+
+    def merge(self, other: MultimodalInputs):
+        """
+        merge image inputs when requests are being merged
+        """
+
+        # args needed to be merged
+        optional_args = [
+            "mm_items",
+            "image_pad_len",
+        ]
+        for arg in optional_args:
+            self_arg = getattr(self, arg, None)
+            if self_arg is not None:
+                setattr(self, arg, self_arg + getattr(other, arg))
+
+        mrope_positions = self.mrope_positions
+        if mrope_positions is not None:
+            if other.mrope_positions is None:
+                self.mrope_positions = mrope_positions
+            else:
+                self.mrope_positions = torch.cat(
+                    [self.mrope_positions, other.mrope_positions], dim=1
+                )
+
+        mrope_position_delta = self.mrope_position_delta
+        if mrope_position_delta is not None:
+            if other.mrope_position_delta is None:
+                self.mrope_position_delta = mrope_position_delta
+            else:
+                self.mrope_position_delta = torch.cat(
+                    [self.mrope_position_delta, other.mrope_position_delta], dim=0
+                )
+
+        for key, val in other.__dict__.items():
+            if "_id" in key:
+                # set token_ids
+                if getattr(self, key, None) is None:
+                    setattr(self, key, getattr(other, key, None))
+        # other args would be kept intact
+
+
+class Req(ReqDllmMixin):
+    """The input and output status of a request."""
+
+    def __init__(
+        self,
+        rid: str,
+        origin_input_text: str,
+        origin_input_ids: List[int],
+        sampling_params: SamplingParams,
+        return_logprob: bool = False,
+        top_logprobs_num: int = 0,
+        dllm_config: Optional[DllmConfig] = None,
+        token_ids_logprob: List[int] = None,
+        stream: bool = False,
+        origin_input_ids_unpadded: Optional[Tuple[int]] = None,
+        lora_id: Optional[str] = None,
+        input_embeds: Optional[List[List[float]]] = None,
+        token_type_ids: List[int] = None,
+        session: Optional[Session] = None,
+        custom_logit_processor: Optional[str] = None,
+        require_reasoning: bool = False,
+        return_hidden_states: bool = False,
+        return_routed_experts: bool = False,
+        eos_token_ids: Optional[Set[int]] = None,
+        bootstrap_host: Optional[str] = None,
+        bootstrap_port: Optional[int] = None,
+        bootstrap_room: Optional[int] = None,
+        disagg_mode: Optional[DisaggregationMode] = None,
+        routed_dp_rank: Optional[int] = None,
+        disagg_prefill_dp_rank: Optional[int] = None,
+        vocab_size: Optional[int] = None,
+        priority: Optional[int] = None,
+        metrics_collector: Optional[SchedulerMetricsCollector] = None,
+        extra_key: Optional[str] = None,
+        routing_key: Optional[str] = None,
+        dimensions: Optional[int] = None,
+        http_worker_ipc: Optional[str] = None,
+        time_stats: Optional[
+            Union[APIServerReqTimeStats, DPControllerReqTimeStats]
+        ] = None,
+    ):
+        # Input and output info
+        self.rid = rid
+        self.origin_input_text = origin_input_text
+        self.origin_input_ids_unpadded = (
+            origin_input_ids_unpadded
+            if origin_input_ids_unpadded
+            else origin_input_ids  # Before image padding
+        )
+        self.origin_input_ids = origin_input_ids
+        # Each decode stage's output ids
+        self.output_ids = []
+        # fill_ids = origin_input_ids + output_ids. Updated if chunked.
+        self.fill_ids = []
+        self.session = session
+        self.input_embeds = input_embeds
+
+        # For req-level memory management
+        self.kv_committed_len = 0
+        self.kv_allocated_len = 0
+        self.kv_committed_freed = False
+        self.kv_overallocated_freed = False
+
+        # for corss-endoder model
+        self.token_type_ids = token_type_ids
+
+        # The length of KV that have been removed in swa cache.
+        # SWA KV cache eviction behavior differs by cache type:
+        # - Radix cache: KV in range [cache_protected_len, swa_evicted_seqlen) is freed manually in
+        #   `ScheduleBatch.maybe_evict_swa`; KV in range [0, cache_protected_len) is freed during radix cache eviction.
+        # - Chunk cache: KV in range [0, swa_evicted_seqlen) is freed manually in `ScheduleBatch.maybe_evict_swa`.
+        self.swa_evicted_seqlen = 0
+
+        # The index of the extend / decode batch
+        self.extend_batch_idx = 0
+        self.decode_batch_idx = 0
+
+        # For multi-http worker
+        self.http_worker_ipc = http_worker_ipc
+
+        # Require reasoning for the request (hybrid reasoning model only)
+        self.require_reasoning = require_reasoning
+
+        # Sampling info
+        if isinstance(sampling_params.custom_params, dict):
+            sampling_params = copy.copy(sampling_params)
+            sampling_params.custom_params = sampling_params.custom_params | {
+                "__req__": self
+            }
+        self.sampling_params = sampling_params
+        self.custom_logit_processor = custom_logit_processor
+        self.return_hidden_states = return_hidden_states
+
+        # extra key for classifying the request (e.g. cache_salt)
+        if lora_id is not None:
+            extra_key = (
+                extra_key or ""
+            ) + lora_id  # lora_id is concatenated to the extra key
+
+        self.extra_key = extra_key
+        self.lora_id = lora_id
+        self.routing_key = routing_key
+
+        # Memory pool info
+        self.req_pool_idx: Optional[int] = None
+        self.mamba_pool_idx: Optional[torch.Tensor] = None  # shape (1)
+        self.mamba_ping_pong_track_buffer: Optional[torch.Tensor] = None  # shape (2)
+        self.mamba_next_track_idx: Optional[int] = None  # 0 or 1
+        self.mamba_last_track_seqlen: Optional[int] = (
+            None  # seq len of the last cached mamba state
+        )
+        # the branching point seqlen to track mamba state. If set, given by prefix match,
+        # it will be the tracked seqlen in the ping pong buffer for the right prefill pass.
+        self.mamba_branching_seqlen: Optional[int] = None
+
+        # Check finish
+        self.tokenizer = None
+        self.finished_reason: Optional[BaseFinishReason] = None
+        # finished position (in output_ids), used when checking stop conditions with speculative decoding
+        self.finished_len = None
+        # Whether this request has finished output
+        self.finished_output = None
+        # If we want to abort the request in the middle of the event loop,
+        # set to_finish instead of directly setting finished_reason.
+        # Note: We should never set finished_reason in the middle, the req will get filtered and never respond
+        self.to_finish: Optional[BaseFinishReason] = None
+        self.stream = stream
+        self.eos_token_ids = eos_token_ids
+        self.vocab_size = vocab_size
+        self.priority = priority
+
+        # For incremental decoding
+        # ----- | --------- read_ids -------|
+        # ----- |   surr_ids  |
+        # xxxxx | xxxxxxxxxxx | xxxxxxxxxxx |
+        # ----- ^ ----------- ^ ----------- ^
+        # ----- 1 ----------- 2 ----------- 3
+        # 1: surr_offset
+        # 2: read_offset
+        # 3: last token
+        self.surr_offset = None  # Surrounding offset to defeat the cleanup algorithm
+        self.read_offset = None
+        self.decoded_text = ""
+
+        # For multimodal inputs
+        self.multimodal_inputs: Optional[MultimodalInputs] = None
+
+        # Prefix info
+        # The indices to kv cache for the shared prefix.
+        self.prefix_indices: torch.Tensor = torch.empty((0,), dtype=torch.int64)
+        # Number of tokens to run prefill.
+        self.extend_input_len = 0
+        # The relative logprob_start_len in an extend batch
+        self.extend_logprob_start_len = 0
+        self.last_node: Any = None
+        self.last_host_node: Any = None
+        self.host_hit_length = 0
+        # Tokens loaded from storage backend (L3) during prefetch for this request
+        self.storage_hit_length = 0
+        # The node to lock until for swa radix tree lock ref
+        self.swa_uuid_for_lock: Optional[int] = None
+        # The prefix length that is inserted into the tree cache
+        self.cache_protected_len: int = 0
+
+        # Whether or not if it is chunked. It increments whenever
+        # it is chunked, and decrement whenever chunked request is
+        # processed.
+        self.is_chunked = 0
+
+        # For retraction
+        self.is_retracted = False
+        # Indicates if the req has ever been retracted.
+        self.retracted_stain = False
+
+        # Incremental streamining
+        self.send_token_offset: int = 0
+        self.send_decode_id_offset: int = 0
+        # TODO (Byron): send_output_token_logprobs_offset and send_decode_id_offset can be different in disaggregation mode
+        # because the decode server does not have the first output token logprobs
+        self.send_output_token_logprobs_offset: int = 0
+
+        # Logprobs (arguments)
+        self.return_logprob = return_logprob
+        # Start index to compute logprob from.
+        self.logprob_start_len = 0
+        self.top_logprobs_num = top_logprobs_num
+        self.token_ids_logprob = token_ids_logprob
+        self.temp_scaled_logprobs = False
+        self.top_p_normalized_logprobs = False
+
+        # Logprobs (return values)
+        # True means the input logprob has been already sent to detokenizer.
+        self.input_logprob_sent: bool = False
+        self.input_token_logprobs_val: Optional[List[float]] = None
+        self.input_token_logprobs_idx: Optional[List[int]] = None
+        self.input_top_logprobs_val: Optional[List[float]] = None
+        self.input_top_logprobs_idx: Optional[List[int]] = None
+        self.input_token_ids_logprobs_val: Optional[List[float]] = None
+        self.input_token_ids_logprobs_idx: Optional[List[int]] = None
+        # Temporary holder to store input_token_logprobs.
+        self.input_token_logprobs: Optional[List[Tuple[int]]] = None
+        self.temp_input_top_logprobs_val: Optional[List[torch.Tensor]] = None
+        self.temp_input_top_logprobs_idx: Optional[List[int]] = None
+        self.temp_input_token_ids_logprobs_val: Optional[List[float]] = None
+        self.temp_input_token_ids_logprobs_idx: Optional[List[int]] = None
+
+        if return_logprob:
+            # shape: (bs, 1)
+            self.output_token_logprobs_val = []
+            self.output_token_logprobs_idx = []
+            # shape: (bs, k)
+            self.output_top_logprobs_val = []
+            self.output_top_logprobs_idx = []
+            # Can contain either lists or GPU tensors (delayed copy optimization for prefill-only scoring)
+            self.output_token_ids_logprobs_val: List[
+                Union[List[float], torch.Tensor]
+            ] = []
+            self.output_token_ids_logprobs_idx = []
+        else:
+            self.output_token_logprobs_val = self.output_token_logprobs_idx = (
+                self.output_top_logprobs_val
+            ) = self.output_top_logprobs_idx = self.output_token_ids_logprobs_val = (
+                self.output_token_ids_logprobs_idx
+            ) = None
+        self.hidden_states: List[List[float]] = []
+        self.hidden_states_tensor = None  # Note: use tensor instead of list to transfer hidden_states when PD + MTP
+        self.output_topk_p = None
+        self.output_topk_index = None
+
+        # capture routed experts
+        self.return_routed_experts = return_routed_experts
+        self.routed_experts: Optional[torch.Tensor] = (
+            None  # cpu tensor: shape (seqlen, topk)
+        )
+        # Customized info
+        self.customized_info: Optional[Dict[str, List[Any]]] = None
+
+        # Embedding (return values)
+        self.embedding = None
+
+        # Constrained decoding
+        self.grammar_key: Optional[str] = None
+        self.grammar: Optional[BaseGrammarObject] = None
+        self.grammar_wait_ct = 0
+
+        # The number of cached tokens that were already cached in the KV cache
+        self.cached_tokens = 0
+        self.already_computed = 0
+
+        # Detailed breakdown of cached tokens by source (for HiCache)
+        self.cached_tokens_device = 0  # Tokens from device cache (GPU)
+        self.cached_tokens_host = 0  # Tokens from host cache (CPU memory)
+        self.cached_tokens_storage = 0  # Tokens from L3 storage backend
+        self._cache_breakdown_computed = (
+            False  # Track if breakdown was already computed
+        )
+
+        # The number of verification forward passes in the speculative decoding.
+        # This is used to compute the average acceptance length per request.
+        self.spec_verify_ct = 0
+
+        # The number of accepted tokens in speculative decoding for this request.
+        # This is used to compute the acceptance rate and average acceptance length per request.
+        self.spec_accepted_tokens = 0
+
+        # Acceptance histogram for speculative decoding.
+        # List index = number of accepted tokens in a step, List value = count of steps with that many accepted tokens.
+        # Example: histogram[0] = 5 means 5 steps with 0 accepted tokens, histogram[3] = 10 means 10 steps with 3 accepted tokens.
+        self.spec_acceptance_histogram: List[int] = []
+
+        # The number of times this request has been retracted / preempted.
+        self.retraction_count = 0
+        self.retraction_mb_id = None
+
+        # For observability
+        self.metrics_collector = metrics_collector
+        if time_stats is not None:
+            self.time_stats = SchedulerReqTimeStats.new_from_obj(time_stats)
+        else:
+            self.time_stats = SchedulerReqTimeStats(disagg_mode=disagg_mode)
+        self.time_stats.set_metrics_collector(metrics_collector)
+        self.time_stats.set_scheduler_recv_time()
+        self.has_log_time_stats: bool = False
+
+        # For disaggregation
+        self.bootstrap_host: str = bootstrap_host
+        self.bootstrap_port: Optional[int] = bootstrap_port
+        self.bootstrap_room: Optional[int] = bootstrap_room
+        self.disagg_kv_sender: Optional[BaseKVSender] = None
+
+        self.routed_dp_rank: Optional[int] = routed_dp_rank
+        self.disagg_prefill_dp_rank: Optional[int] = disagg_prefill_dp_rank
+
+        # the start index of the sent kv cache
+        # We want to send it chunk by chunk for chunked prefill.
+        # After every chunk forward, we do the following:
+        # kv_send(req.input_ids[req.start_send_idx:len(req.fill_ids)])
+        # start_send_idx = len(req.fill_ids)
+        self.start_send_idx: int = 0
+
+        # For overlap schedule, we delay the kv transfer until `process_batch_result_disagg_prefill` rather than `process_prefill_chunk` in non-overlap
+        # This is because kv is not ready in `process_prefill_chunk`.
+        # We use `tmp_end_idx` to store the end index of the kv cache to send.
+        self.tmp_end_idx: int = -1
+        self.metadata_buffer_index: int = -1
+
+        # For Matryoshka embeddings
+        self.dimensions = dimensions
+
+        # For diffusion LLM
+        self.init_diffusion_llm(dllm_config)
+
+    @property
+    def seqlen(self) -> int:
+        """Get the current sequence length of the request."""
+        return len(self.origin_input_ids) + len(self.output_ids)
+
+    @property
+    def is_prefill_only(self) -> bool:
+        """Check if this request is prefill-only (no token generation needed)."""
+        # NOTE: when spec is enabled, prefill_only optimizations are disabled
+
+        spec_alg = get_global_server_args().speculative_algorithm
+        return self.sampling_params.max_new_tokens == 0 and spec_alg is None
+
+    @property
+    def output_ids_through_stop(self) -> List[int]:
+        """Get the output ids through the stop condition. Stop position is included."""
+        if self.finished_len is not None:
+            return self.output_ids[: self.finished_len]
+        return self.output_ids
+
+    def pop_committed_kv_cache(self) -> int:
+        """Return the length of committed KV cache and mark them as freed."""
+        assert (
+            not self.kv_committed_freed
+        ), f"Committed KV cache already freed ({self.kv_committed_len=})"
+        self.kv_committed_freed = True
+        return self.kv_committed_len
+
+    def pop_overallocated_kv_cache(self) -> Tuple[int, int]:
+        """Return the range of over-allocated KV cache and mark them as freed."""
+
+        # NOTE: This function is called when there is over-allocation of KV cache.
+        # Over-allocation: we allocate more KV cache than the committed length.
+        # e.g., speculative decoding may allocate more KV cache than actually used.
+        assert (
+            not self.kv_overallocated_freed
+        ), f"Overallocated KV cache already freed, {self.kv_committed_len=}, {self.kv_allocated_len=}"
+        self.kv_overallocated_freed = True
+        return self.kv_committed_len, self.kv_allocated_len
+
+    def update_spec_acceptance_histogram(self, accepted_draft_tokens: int):
+        """Update the speculative decoding acceptance histogram.
+
+        Args:
+            accepted_draft_tokens: Number of draft tokens accepted in this step.
+        """
+        if len(self.spec_acceptance_histogram) <= accepted_draft_tokens:
+            self.spec_acceptance_histogram.extend(
+                [0] * (accepted_draft_tokens - len(self.spec_acceptance_histogram) + 1)
+            )
+        self.spec_acceptance_histogram[accepted_draft_tokens] += 1
+
+    def extend_image_inputs(self, image_inputs):
+        if self.multimodal_inputs is None:
+            self.multimodal_inputs = image_inputs
+        else:
+            self.multimodal_inputs.merge(image_inputs)
+
+    def finished(self) -> bool:
+        # Whether request reached finished condition
+        return self.finished_reason is not None
+
+    def init_next_round_input(self, tree_cache: Optional[BasePrefixCache] = None):
+        if self.is_dllm():
+            self._init_fill_ids_for_dllm()
+            self.determine_dllm_phase()
+        else:
+            self.fill_ids = self.origin_input_ids + self.output_ids
+
+        input_len = len(self.fill_ids)
+        # NOTE: the matched length is at most 1 less than the input length to enable logprob computation
+        max_prefix_len = input_len - 1
+        if self.return_logprob and self.logprob_start_len >= 0:
+            max_prefix_len = min(max_prefix_len, self.logprob_start_len)
+        max_prefix_len = max(max_prefix_len, 0)
+        token_ids = self.fill_ids[:max_prefix_len]
+
+        if tree_cache is not None:
+            match_result = tree_cache.match_prefix(
+                MatchPrefixParams(
+                    key=RadixKey(token_ids=token_ids, extra_key=self.extra_key),
+                    req=self,
+                    cow_mamba=tree_cache.supports_mamba(),
+                )
+            )
+            (
+                self.prefix_indices,
+                self.last_node,
+                self.last_host_node,
+                self.host_hit_length,
+                self.mamba_branching_seqlen,
+            ) = (
+                match_result.device_indices,
+                match_result.last_device_node,
+                match_result.last_host_node,
+                match_result.host_hit_length,
+                match_result.mamba_branching_seqlen,
+            )
+            self.cache_protected_len = len(self.prefix_indices)
+
+            if self.is_dllm():
+                self._update_block_offset_for_dllm()
+
+        if (
+            self.is_retracted
+            and self.multimodal_inputs is not None
+            and self.multimodal_inputs.mrope_positions is not None
+        ):
+            from sglang.srt.managers.mm_utils import (
+                extend_mrope_positions_for_retracted_request,
+            )
+
+            self.multimodal_inputs.mrope_positions = (
+                extend_mrope_positions_for_retracted_request(
+                    self.multimodal_inputs.mrope_positions, len(self.output_ids)
+                )
+            )
+
+        self.set_extend_input_len(len(self.fill_ids) - len(self.prefix_indices))
+
+    # Based on https://github.com/vllm-project/vllm/blob/7a64d24aad69e4d2548aa0bf528d9fe63428ab01/vllm/transformers_utils/detokenizer.py#L194-L313
+    def init_incremental_detokenize(self):
+        first_iter = self.surr_offset is None or self.read_offset is None
+
+        output_ids = self.output_ids_through_stop
+
+        if first_iter:
+            self.read_offset = len(self.origin_input_ids_unpadded)
+            self.surr_offset = max(
+                self.read_offset - INIT_INCREMENTAL_DETOKENIZATION_OFFSET, 0
+            )
+            self.surr_and_decode_ids = (
+                self.origin_input_ids_unpadded[self.surr_offset :] + output_ids
+            )
+            self.cur_decode_ids_len = len(output_ids)
+        else:
+            self.surr_and_decode_ids.extend(output_ids[self.cur_decode_ids_len :])
+            self.cur_decode_ids_len = len(output_ids)
+
+        return self.surr_and_decode_ids, self.read_offset - self.surr_offset
+
+    def tail_str(self) -> str:
+        # Check stop strings and stop regex patterns together
+        if (
+            len(self.sampling_params.stop_strs) == 0
+            and len(self.sampling_params.stop_regex_strs) == 0
+        ):
+            return ""
+
+        max_len_tail_str = max(
+            self.sampling_params.stop_str_max_len + 1,
+            self.sampling_params.stop_regex_max_len + 1,
+        )
+
+        tail_len = min(max_len_tail_str, len(self.output_ids))
+        return self.tokenizer.decode(self.output_ids[-tail_len:])
+
+    def check_match_stop_str_prefix(self) -> bool:
+        """
+        Check if the suffix of tail_str overlaps with any stop_str prefix
+        """
+        if not self.sampling_params.stop_strs:
+            return False
+
+        tail_str = self.tail_str()
+
+        # Early return if tail_str is empty
+        if not tail_str:
+            return False
+
+        for stop_str in self.sampling_params.stop_strs:
+            if not stop_str:
+                continue
+            # Check if stop_str is contained in tail_str (fastest check first)
+            if stop_str in tail_str:
+                return True
+
+            # Check if tail_str suffix matches stop_str prefix
+            # Only check if stop_str is not empty, it's for stream output
+            min_len = min(len(tail_str), len(stop_str))
+            for i in range(1, min_len + 1):
+                if tail_str[-i:] == stop_str[:i]:
+                    return True
+
+        return False
+
+    def _check_token_based_finish(self, new_accepted_tokens: List[int]) -> bool:
+        if self.sampling_params.ignore_eos:
+            return False
+
+        # Check stop token ids
+        matched_eos = False
+
+        for i, token_id in enumerate(new_accepted_tokens):
+            if self.sampling_params.stop_token_ids:
+                matched_eos |= token_id in self.sampling_params.stop_token_ids
+            if self.eos_token_ids:
+                matched_eos |= token_id in self.eos_token_ids
+            if self.tokenizer is not None:
+                matched_eos |= token_id == self.tokenizer.eos_token_id
+                if self.tokenizer.additional_stop_token_ids:
+                    matched_eos |= token_id in self.tokenizer.additional_stop_token_ids
+            if matched_eos:
+                self.finished_reason = FINISH_MATCHED_TOKEN(matched=token_id)
+                matched_pos = len(self.output_ids) - len(new_accepted_tokens) + i
+                self.finished_len = matched_pos + 1
+                return True
+
+        return False
+
+    def _check_str_based_finish(self):
+        if (
+            len(self.sampling_params.stop_strs) > 0
+            or len(self.sampling_params.stop_regex_strs) > 0
+        ):
+            tail_str = self.tail_str()
+
+            # Check stop strings
+            if len(self.sampling_params.stop_strs) > 0:
+                for stop_str in self.sampling_params.stop_strs:
+                    if stop_str in tail_str or stop_str in self.decoded_text:
+                        self.finished_reason = FINISH_MATCHED_STR(matched=stop_str)
+                        return True
+
+            # Check stop regex
+            if len(self.sampling_params.stop_regex_strs) > 0:
+                for stop_regex_str in self.sampling_params.stop_regex_strs:
+                    if re.search(stop_regex_str, tail_str):
+                        self.finished_reason = FINISHED_MATCHED_REGEX(
+                            matched=stop_regex_str
+                        )
+                        return True
+
+        return False
+
+    def _check_vocab_boundary_finish(self, new_accepted_tokens: List[int] = None):
+        for i, token_id in enumerate(new_accepted_tokens):
+            if token_id > self.vocab_size or token_id < 0:
+                offset = len(self.output_ids) - len(new_accepted_tokens) + i
+                if self.sampling_params.stop_token_ids:
+                    self.output_ids[offset] = next(
+                        iter(self.sampling_params.stop_token_ids)
+                    )
+                if self.eos_token_ids:
+                    self.output_ids[offset] = next(iter(self.eos_token_ids))
+                self.finished_reason = FINISH_MATCHED_STR(matched="NaN happened")
+                self.finished_len = offset + 1
+                return True
+
+        return False
+
+    def check_finished(self, new_accepted_len: int = 1):
+        if self.finished():
+            return
+
+        if self.to_finish:
+            self.finished_reason = self.to_finish
+            self.to_finish = None
+            return
+
+        if len(self.output_ids) >= self.sampling_params.max_new_tokens:
+            self.finished_reason = FINISH_LENGTH(
+                length=self.sampling_params.max_new_tokens
+            )
+            self.finished_len = self.sampling_params.max_new_tokens
+            return
+
+        if self.grammar is not None:
+            if self.grammar.is_terminated():
+                self.finished_reason = FINISH_MATCHED_TOKEN(matched=self.output_ids[-1])
+                return
+
+        new_accepted_tokens = self.output_ids[-new_accepted_len:]
+
+        if self._check_token_based_finish(new_accepted_tokens):
+            return
+
+        if self._check_vocab_boundary_finish(new_accepted_tokens):
+            return
+
+        if self._check_str_based_finish():
+            return
+
+    def reset_for_retract(self):
+        # Increment retraction count before resetting other state. We should not reset this
+        # since we are tracking the total number of retractions for each request.
+        self.retraction_count += 1
+
+        self.prefix_indices = torch.empty((0,), dtype=torch.int64)
+        self.routed_experts = None
+        self.last_node = None
+        self.swa_uuid_for_lock = None
+        self.extend_input_len = 0
+        self.is_retracted = True
+        self.retracted_stain = True
+        self.input_token_logprobs = None
+        self.temp_input_top_logprobs_val = None
+        self.temp_input_top_logprobs_idx = None
+        self.extend_logprob_start_len = 0
+        self.is_chunked = 0
+        self.mamba_pool_idx = None
+        self.mamba_ping_pong_track_buffer = None
+        self.mamba_next_track_idx = None
+        self.mamba_last_track_seqlen = None
+        self.mamba_branching_seqlen = None
+        self.already_computed = 0
+        self.kv_allocated_len = 0
+        self.kv_committed_len = 0
+        self.kv_committed_freed = False
+        self.kv_overallocated_freed = False
+        self.swa_evicted_seqlen = 0
+        self.extend_batch_idx = 0
+        self.decode_batch_idx = 0
+
+    def offload_kv_cache(self, req_to_token_pool, token_to_kv_pool_allocator):
+        token_indices = req_to_token_pool.req_to_token[
+            self.req_pool_idx, : self.seqlen - 1
+        ]
+        self.kv_cache_cpu = token_to_kv_pool_allocator.get_cpu_copy(token_indices)
+
+    def load_kv_cache(self, req_to_token_pool, token_to_kv_pool_allocator):
+        token_indices = req_to_token_pool.req_to_token[
+            self.req_pool_idx, : self.seqlen - 1
+        ]
+        token_to_kv_pool_allocator.load_cpu_copy(self.kv_cache_cpu, token_indices)
+        del self.kv_cache_cpu
+
+    def log_time_stats(self):
+        # If overlap schedule, we schedule one decode batch ahead so this gets called twice.
+        if self.has_log_time_stats:
+            return
+
+        bootstrap_info = (
+            f", bootstrap_room={self.bootstrap_room}"
+            if self.bootstrap_room is not None
+            else ""
+        )
+        prefix = f"Req Time Stats(rid={self.rid}{bootstrap_info}, input len={len(self.origin_input_ids)}, output len={len(self.output_ids)}, type={self.time_stats.disagg_mode_str()})"
+        logger.info(f"{prefix}: {self.time_stats.convert_to_duration()}")
+        self.has_log_time_stats = True
+
+    def set_extend_input_len(self, extend_input_len: int):
+        # Setting extend_input_len and computing the relative logprob_start_len in an extend batch
+        #
+        # Key variables:
+        # - logprob_start_len: Absolute position in full sequence where logprob computation begins
+        # - extend_logprob_start_len: Relative position within current extend batch where logprob computation begins
+        # - extend_input_len: Number of tokens that need to be processed in this extend batch
+        self.extend_input_len = extend_input_len
+        if self.logprob_start_len == -1:
+            logprob_start_len = len(self.fill_ids) - 1
+        else:
+            # logprob_start_len should be at least the length of the prefix indices
+            logprob_start_len = max(self.logprob_start_len, len(self.prefix_indices))
+        self.extend_logprob_start_len = min(
+            logprob_start_len - len(self.prefix_indices),
+            self.extend_input_len,
+        )
+
+    def set_finish_with_abort(self, error_msg: str):
+        if get_tensor_model_parallel_rank() == 0:
+            logger.error(f"{error_msg}, {self.rid=}")
+        self.multimodal_inputs = None
+        self.grammar = None
+        self.origin_input_ids = [0]  # set it to one token to skip the long prefill
+        self.return_logprob = False
+        self.logprob_start_len = -1
+        self.to_finish = FINISH_ABORT(
+            error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
+        )
+
+    def __repr__(self):
+        return (
+            f"Req(rid={self.rid}, "
+            f"input_ids={self.origin_input_ids}, output_ids={self.output_ids}, "
+            f"{self.grammar=}, "
+            f"{self.sampling_params=})"
+        )
+
+
+@dataclasses.dataclass
+class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
+    """Store all information of a batch on the scheduler."""
+
+    # Request, memory pool, and cache
+    reqs: List[Req]
+    req_to_token_pool: ReqToTokenPool = None
+    token_to_kv_pool_allocator: BaseTokenToKVPoolAllocator = None
+    tree_cache: BasePrefixCache = None
+    is_hybrid_swa: bool = False
+
+    # Batch configs
+    model_config: ModelConfig = None
+    forward_mode: ForwardMode = None
+    enable_overlap: bool = False
+    # Tell whether the current running batch is full so that we can skip
+    # the check of whether to prefill new requests.
+    # This is an optimization to reduce the overhead of the prefill check.
+    batch_is_full: bool = False
+
+    # For chunked prefill in PP
+    chunked_req: Optional[Req] = None
+
+    # Sampling info
+    sampling_info: SamplingBatchInfo = None
+
+    # Batched arguments to model runner
+    input_ids: torch.Tensor = None  # shape: [b], int64
+    input_embeds: torch.Tensor = None  # shape: [b, hidden_size], float32
+    token_type_ids: torch.Tensor = None  # shape: [b], int64
+    req_pool_indices: torch.Tensor = None  # shape: [b], int64
+    seq_lens: torch.Tensor = None  # shape: [b], int64
+    seq_lens_cpu: torch.Tensor = None  # shape: [b], int64
+    # The output locations of the KV cache
+    out_cache_loc: torch.Tensor = None  # shape: [b], int64
+    output_ids: torch.Tensor = None  # shape: [b], int64
+
+    # For hybrid GDN prefix cache
+    mamba_track_indices: torch.Tensor = None  # shape: [b], int64
+    mamba_track_mask: torch.Tensor = None  # shape: [b], bool
+    mamba_track_seqlens: torch.Tensor = None  # shape: [b], int64
+
+    # For multimodal inputs
+    multimodal_inputs: Optional[List] = None
+
+    # The sum of all sequence lengths
+    seq_lens_sum: int = None
+    # The original sequence lengths, Qwen-1M related
+    orig_seq_lens: torch.Tensor = None  # shape: [b], int32
+
+    # For DP attention
+    inner_idle_batch: Optional[ScheduleBatch] = None
+    global_num_tokens: Optional[List[int]] = None
+    global_num_tokens_for_logprob: Optional[List[int]] = None
+    is_extend_in_batch: bool = False
+    all_extend_in_batch: bool = False
+    can_run_dp_cuda_graph: bool = False
+    tbo_split_seq_index: Optional[int] = None
+    global_forward_mode: Optional[ForwardMode] = None
+
+    # For processing logprobs
+    return_logprob: bool = False
+    top_logprobs_nums: Optional[List[int]] = None
+    token_ids_logprobs: Optional[List[List[int]]] = None
+
+    # For logits and logprob post processing
+    temp_scaled_logprobs: bool = False
+    top_p_normalized_logprobs: bool = False
+
+    # For extend and mixed chunekd prefill
+    prefix_lens: List[int] = None
+    extend_lens: List[int] = None
+    extend_num_tokens: Optional[int] = None
+    decoding_reqs: List[Req] = None
+    extend_logprob_start_lens: List[int] = None
+    # It comes empty list if logprob is not required.
+    extend_input_logprob_token_ids: Optional[torch.Tensor] = None
+
+    # For encoder-decoder architectures
+    encoder_cached: Optional[List[bool]] = None
+    encoder_lens: Optional[torch.Tensor] = None
+    encoder_lens_cpu: Optional[List[int]] = None
+    encoder_out_cache_loc: Optional[torch.Tensor] = None
+
+    # For matryoshka embeddings
+    dimensions: Optional[list[int]] = None
+
+    # For split prefill
+    split_index: int = 0
+    split_prefill_finished: bool = False
+    split_forward_count: int = 1
+    split_forward_batch: ForwardBatch = None
+    seq_lens_cpu_cache: torch.Tensor = None
+
+    # Stream
+    has_stream: bool = False
+
+    # Has grammar
+    has_grammar: bool = False
+
+    # Device
+    device: str = "cuda"
+
+    # Speculative decoding
+    spec_algorithm: SpeculativeAlgorithm = None
+    # spec_info: Optional[SpecInput] = None
+    spec_info: Optional[SpecInput] = None
+
+    # Whether to return hidden states
+    return_hidden_states: bool = False
+
+    # Whether to return captured experts
+    return_routed_experts: bool = False
+
+    # Whether this batch is prefill-only (no token generation needed)
+    is_prefill_only: bool = False
+
+    # hicache pointer for synchronizing data loading from CPU to GPU
+    hicache_consumer_index: int = -1
+
+    # Diffusion LLM
+    dllm_config: Optional[DllmConfig] = None
+
+    # Metrics
+    dp_cooperation_info: Optional[DPCooperationInfo] = None
+    prefill_stats: Optional[PrefillStats] = None
+
+    @classmethod
+    def init_new(
+        cls,
+        reqs: List[Req],
+        req_to_token_pool: ReqToTokenPool,
+        token_to_kv_pool_allocator: BaseTokenToKVPoolAllocator,
+        tree_cache: BasePrefixCache,
+        model_config: ModelConfig,
+        enable_overlap: bool,
+        spec_algorithm: SpeculativeAlgorithm,
+        chunked_req: Optional[Req] = None,
+        dllm_config: Optional[DllmConfig] = None,
+    ):
+        return_logprob = any(req.return_logprob for req in reqs)
+
+        is_hybrid_swa = False
+        if isinstance(token_to_kv_pool_allocator, SWATokenToKVPoolAllocator):
+            is_hybrid_swa = True
+
+        return cls(
+            reqs=reqs,
+            req_to_token_pool=req_to_token_pool,
+            token_to_kv_pool_allocator=token_to_kv_pool_allocator,
+            tree_cache=tree_cache,
+            is_hybrid_swa=is_hybrid_swa,
+            model_config=model_config,
+            enable_overlap=enable_overlap,
+            return_logprob=return_logprob,
+            has_stream=any(req.stream for req in reqs),
+            has_grammar=any(req.grammar for req in reqs),
+            device=req_to_token_pool.device,
+            spec_algorithm=spec_algorithm,
+            return_hidden_states=any(req.return_hidden_states for req in reqs),
+            return_routed_experts=any(req.return_routed_experts for req in reqs),
+            is_prefill_only=all(req.is_prefill_only for req in reqs),
+            chunked_req=chunked_req,
+            dllm_config=dllm_config,
+        )
+
+    def batch_size(self):
+        return len(self.reqs)
+
+    def is_empty(self):
+        return len(self.reqs) == 0
+
+    def is_dllm(self):
+        return self.dllm_config is not None
+
+    def prepare_encoder_info_extend(self, input_ids: List[int], seq_lens: List[int]):
+        _pin = is_pin_memory_available(self.device)
+        self.encoder_lens_cpu = []
+        self.encoder_cached = []
+
+        for req in self.reqs:
+            im = req.multimodal_inputs
+            if im is None or im.num_image_tokens is None:
+                # No image input
+                self.encoder_lens_cpu.append(0)
+                self.encoder_cached.append(True)
+            else:
+                self.encoder_lens_cpu.append(im.num_image_tokens)
+                self.encoder_cached.append(
+                    self.forward_mode.is_decode()
+                    or len(req.prefix_indices) >= im.num_image_tokens
+                )
+
+        self.encoder_lens = torch.tensor(
+            self.encoder_lens_cpu, dtype=torch.int64, pin_memory=_pin
+        ).to(self.device, non_blocking=True)
+
+        # Strip encoder infos
+        pt = 0
+        decoder_out_cache_loc = []
+        encoder_out_cache_loc = []
+        for i, req in enumerate(self.reqs):
+            encoder_len = self.encoder_lens_cpu[i]
+            seq_lens[i] -= encoder_len
+
+            if len(req.prefix_indices) < encoder_len:
+                # NOTE: the encoder part should be considered as a whole
+                assert len(req.prefix_indices) == 0
+                input_ids[i] = input_ids[i][encoder_len:]
+                encoder_out_cache_loc.append(self.out_cache_loc[pt : pt + encoder_len])
+                decoder_out_cache_loc.append(
+                    self.out_cache_loc[pt + encoder_len : pt + req.extend_input_len]
+                )
+                self.extend_lens[i] -= encoder_len
+                self.extend_num_tokens -= encoder_len
+            else:
+                decoder_out_cache_loc.append(
+                    self.out_cache_loc[pt : pt + req.extend_input_len]
+                )
+                self.prefix_lens[i] -= encoder_len
+
+            pt += req.extend_input_len
+
+        # Reassign
+        self.input_ids = torch.tensor(
+            sum(input_ids, []), dtype=torch.int64, pin_memory=_pin
+        ).to(self.device, non_blocking=True)
+        self.seq_lens = torch.tensor(seq_lens, dtype=torch.int64, pin_memory=_pin).to(
+            self.device, non_blocking=True
+        )
+        self.seq_lens_cpu = torch.tensor(seq_lens, dtype=torch.int64)
+
+        if not decoder_out_cache_loc:
+            self.out_cache_loc = torch.zeros(0, dtype=torch.int64).to(
+                self.device, non_blocking=True
+            )
+        else:
+            self.out_cache_loc = torch.cat(decoder_out_cache_loc)
+
+        if not encoder_out_cache_loc:
+            self.encoder_out_cache_loc = torch.zeros(0, dtype=torch.int64).to(
+                self.device, non_blocking=True
+            )
+        else:
+            self.encoder_out_cache_loc = torch.cat(encoder_out_cache_loc)
+
+        assert (
+            len(self.out_cache_loc) == self.extend_num_tokens
+        ), f"Expected {len(self.out_cache_loc)}, got {self.extend_num_tokens}"
+
+    def prepare_for_extend(self):
+        self.forward_mode = ForwardMode.EXTEND
+
+        if self.is_dllm():
+            # For DLLM, we use a separate forward mode
+            self.forward_mode = ForwardMode.DLLM_EXTEND
+
+        # Init tensors
+        reqs = self.reqs
+        input_ids = [r.fill_ids[len(r.prefix_indices) :] for r in reqs]
+        extend_num_tokens = sum(len(ids) for ids in input_ids)
+        seq_lens = [len(r.fill_ids) for r in reqs]
+        orig_seq_lens = [max(len(r.fill_ids), len(r.origin_input_ids)) for r in reqs]
+        prefix_lens = [len(r.prefix_indices) for r in reqs]
+        extend_lens = [r.extend_input_len for r in reqs]
+
+        # For matryoshka embeddings
+        if self.model_config.is_matryoshka and any(
+            r.dimensions is not None for r in reqs
+        ):
+            self.dimensions = [
+                r.dimensions if r.dimensions else self.model_config.hidden_size
+                for r in reqs
+            ]
+
+        token_type_ids = [
+            r.token_type_ids for r in reqs if r.token_type_ids is not None
+        ]
+
+        _pin = is_pin_memory_available(self.device)
+        input_ids_tensor = torch.tensor(
+            list(chain.from_iterable(input_ids)), dtype=torch.int64, pin_memory=_pin
+        ).to(self.device, non_blocking=True)
+        seq_lens_tensor = torch.tensor(seq_lens, dtype=torch.int64, pin_memory=_pin).to(
+            self.device, non_blocking=True
+        )
+        seq_lens_cpu = torch.tensor(seq_lens, dtype=torch.int64)
+        orig_seq_lens_tensor = torch.tensor(
+            orig_seq_lens, dtype=torch.int32, pin_memory=_pin
+        ).to(self.device, non_blocking=True)
+
+        token_type_ids_tensor = None
+        if len(token_type_ids) > 0:
+            token_type_ids_tensor = torch.tensor(
+                sum(token_type_ids, []), dtype=torch.int64, pin_memory=_pin
+            ).to(self.device, non_blocking=True)
+
+        # Set batch fields needed by alloc_for_extend
+        self.prefix_lens = prefix_lens
+        self.extend_lens = extend_lens
+        self.seq_lens = seq_lens_tensor
+        self.seq_lens_cpu = seq_lens_cpu
+        self.extend_num_tokens = extend_num_tokens
+
+        # Allocate memory
+        out_cache_loc, req_pool_indices_tensor, req_pool_indices = alloc_for_extend(
+            self
+        )
+
+        # Set fields
+        input_embeds = []
+        extend_input_logprob_token_ids = []
+        multimodal_inputs = []
+        mamba_track_mask_cpu = []
+        mamba_track_indices_cpu = []
+        mamba_track_seqlens_cpu = []
+
+        for i, (req, seq_len, pre_len) in enumerate(zip(reqs, seq_lens, prefix_lens)):
+            req.req_pool_idx = req_pool_indices[i]
+            assert seq_len - pre_len == req.extend_input_len
+
+            req.extend_batch_idx += 1
+
+            # update req-level memory management fields
+            req.kv_committed_len = seq_len
+            req.kv_allocated_len = seq_len
+
+            # If input_embeds are available, store them
+            if req.input_embeds is not None:
+                # If req.input_embeds is already a list, append its content directly
+                input_embeds.extend(req.input_embeds)  # Use extend to avoid nesting
+
+            multimodal_inputs.append(req.multimodal_inputs)
+
+            # Only calculate cached_tokens once. Once retracted, the 'retracted_stain'
+            # flag will always True
+            if not req.retracted_stain:
+                new_cached = pre_len - req.already_computed
+                req.cached_tokens += new_cached
+
+                # Calculate detailed breakdown of cached tokens by source (for HiCache)
+                # Only compute once on FIRST chunk - subsequent chunks in chunked prefill
+                # would incorrectly count previously computed tokens as cache hits.
+                if not req._cache_breakdown_computed:
+                    # At this point, prefix_indices has been extended with host data
+                    # via init_load_back in schedule_policy, so:
+                    # - len(prefix_indices) = device_original + host_loaded
+                    # - host_hit_length = total tokens from host cache (including storage-prefetched)
+                    # - storage_hit_length = tokens loaded from storage backend (L3 hits)
+                    # - device_portion = len(prefix_indices) - host_hit_length
+                    #
+                    # Storage hits are now tracked via scheduler after prefetch completes.
+                    # storage_hit_length is set by scheduler.pop_prefetch_loaded_tokens()
+                    host_total = req.host_hit_length
+                    # Clamp storage to host_total to handle edge cases
+                    storage_portion = min(host_total, req.storage_hit_length)
+                    host_portion = host_total - storage_portion
+                    device_portion = max(0, len(req.prefix_indices) - host_total)
+
+                    req.cached_tokens_device = device_portion
+                    req.cached_tokens_host = host_portion
+                    req.cached_tokens_storage = storage_portion
+                    req._cache_breakdown_computed = True
+
+                req.already_computed = seq_len
+            req.is_retracted = False
+
+            if get_global_server_args().enable_mamba_extra_buffer():
+                self._mamba_radix_cache_v2_req_prepare_for_extend(
+                    req,
+                    mamba_track_mask_cpu,
+                    mamba_track_indices_cpu,
+                    mamba_track_seqlens_cpu,
+                )
+
+            if self.return_logprob:
+                # Find input logprob token ids.
+                # First, find a global index within origin_input_ids and slide it by 1
+                # to compute input logprobs. It is because you need the next token
+                # to compute input logprobs. E.g., (chunk size 2)
+                #
+                # input_logprobs = [1, 2, 3, 4]
+                # fill_ids = [1, 2]
+                # extend_input_logprob_token_id = [2, 3]
+                #
+                # Note that it can also overflow. In this case, we pad it with 0.
+                # input_logprobs = [1, 2, 3, 4]
+                # fill_ids = [3, 4]
+                # extend_input_logprob_token_id = [4, 0]
+                global_start_idx, global_end_idx = (
+                    len(req.prefix_indices),
+                    len(req.fill_ids),
+                )
+                if req.logprob_start_len == -1:
+                    logprob_start_len = len(req.origin_input_ids) - 1
+                else:
+                    logprob_start_len = req.logprob_start_len
+                # Apply logprob_start_len
+                if global_start_idx < logprob_start_len:
+                    global_start_idx = logprob_start_len
+
+                logprob_token_ids = req.origin_input_ids[
+                    global_start_idx + 1 : global_end_idx + 1
+                ]
+                extend_input_logprob_token_ids.extend(logprob_token_ids)
+
+                # We will need req.extend_input_len - req.extend_logprob_start_len number of
+                # tokens, and logprob_token_ids is for input logprob, so pad the rest of them by 0.
+                extend_input_logprob_token_ids.extend(
+                    [0]
+                    * (
+                        req.extend_input_len
+                        - req.extend_logprob_start_len
+                        - len(logprob_token_ids)
+                    )
+                )
+
+        if self.return_logprob:
+            extend_input_logprob_token_ids = torch.tensor(
+                extend_input_logprob_token_ids
+            )
+            # Clamp placeholder or out-of-range token IDs (e.g., multimodal hashes)
+            # so they stay within the vocab boundary before being sent to GPU.
+            extend_input_logprob_token_ids.clamp_(0, self.model_config.vocab_size - 1)
+        else:
+            extend_input_logprob_token_ids = None
+
+        self.input_ids = input_ids_tensor
+        self.req_pool_indices = req_pool_indices_tensor
+        self.orig_seq_lens = orig_seq_lens_tensor
+        self.out_cache_loc = out_cache_loc
+        self.input_embeds = (
+            torch.tensor(input_embeds, pin_memory=_pin).to(
+                self.device, non_blocking=True
+            )
+            if input_embeds
+            else None
+        )
+        for mm_input in multimodal_inputs:
+            if mm_input is None:
+                continue
+            for mm_item in mm_input.mm_items:
+                pixel_values = getattr(mm_item, "feature", None)
+                if isinstance(pixel_values, torch.Tensor):
+                    mm_item.feature = pixel_values.to(self.device, non_blocking=True)
+                elif isinstance(pixel_values, CudaIpcTensorTransportProxy):
+                    mm_item.feature = pixel_values.reconstruct_on_target_device(
+                        torch.cuda.current_device()
+                    )
+                    # The reference by CudaIpcTensorTransportProxy was cut off,
+                    # proactively delete to avoid slow gc.
+                    del pixel_values
+        self.multimodal_inputs = multimodal_inputs
+        self.token_type_ids = token_type_ids_tensor
+        self.seq_lens_sum = sum(seq_lens)
+
+        if self.return_logprob:
+            self.top_logprobs_nums = [r.top_logprobs_num for r in reqs]
+            self.token_ids_logprobs = [r.token_ids_logprob for r in reqs]
+
+        self.extend_logprob_start_lens = [r.extend_logprob_start_len for r in reqs]
+        self.extend_input_logprob_token_ids = extend_input_logprob_token_ids
+
+        if get_global_server_args().enable_mamba_extra_buffer():
+            self.mamba_track_indices = torch.tensor(
+                mamba_track_indices_cpu,
+                dtype=torch.int64,
+                device=self.device,
+            )
+            self.mamba_track_mask = torch.tensor(
+                mamba_track_mask_cpu,
+                dtype=torch.bool,
+                device=self.device,
+            )
+            self.mamba_track_seqlens = torch.tensor(
+                mamba_track_seqlens_cpu,
+                dtype=torch.int64,
+                device=self.device,
+            )
+
+        if self.model_config.is_encoder_decoder:
+            self.prepare_encoder_info_extend(input_ids, seq_lens)
+
+        # Build sampling info
+        self.sampling_info = SamplingBatchInfo.from_schedule_batch(
+            self,
+            self.model_config.vocab_size,
+        )
+
+    def _mamba_radix_cache_v2_req_prepare_for_extend(
+        self,
+        req: Req,
+        mamba_track_mask_cpu: List[bool],
+        mamba_track_indices_cpu: List[int],
+        mamba_track_seqlens_cpu: List[int],
+    ):
+        def _force_track_h(i: int) -> int:
+            assert i % FLA_CHUNK_SIZE == 0
+            # There are 3 cases for mamba_track_seqlen passed to mamba_track_seqlens_cpu:
+            # 1) aligned with FLA_CHUNK_SIZE-> retrieve from last_recurrent_state
+            #    a) is the last position -> retrieve from last_recurrent_state
+            #    b) is NOT the last position -> retrieve from h
+            # 2) unaligned with FLA_CHUNK_SIZE -> retrieve from h
+            # Currently, the math calculation only supports case 1a and 2. So for 1b, we need to add 1
+            # to force the math calculation to retrieve the correct mamba state from h.
+            return i + 1
+
+        mamba_cache_chunk_size = get_global_server_args().mamba_cache_chunk_size
+        mask = req.extend_input_len >= mamba_cache_chunk_size
+        mamba_track_mask_cpu.append(mask)
+        mamba_track_indices_cpu.append(
+            req.mamba_ping_pong_track_buffer[req.mamba_next_track_idx].item()
+        )
+        mamba_track_seqlen = -1
+        if mask:
+            # mamba_track_seqlen is used to calculate the indices to track in
+            # hybrid_linear_attn_backend's _init_track_ssm_indices. Due to the
+            # fact that the ssm state between aligned and non-aligned are retrieved differently,
+            # if 1) last pos and 2) is aligned, then retrieved from the last_recurrent_state,
+            # otherwise retrieved from h (i.e. unaligned).
+            # We need to pass the non-aligned seqlen to the calculation. Even though
+            # we pass in mamba_track_seqlen, the actual tracked seqlen is mamba_last_track_seqlen.
+            mamba_track_seqlen = len(req.prefix_indices) + req.extend_input_len
+
+            # mamba_track_seqlen_aligned/mamba_last_track_seqlen is actual tracked seqlen. Used to pass to
+            # mamba radix cache to track which seqlen this mamba state should store at.
+            mamba_track_seqlen_aligned = (
+                len(req.prefix_indices)
+                + (req.extend_input_len // mamba_cache_chunk_size)
+                * mamba_cache_chunk_size
+            )
+
+            # mamba_track_fla_chunk_aligned is the aligned seqlen based on FLA_CHUNK_SIZE
+            # If mamba_track_fla_chunk_aligned != mamba_track_seqlen_aligned, which can be true when
+            # page_size > FLA_CHUNK_SIZE, we need to force the math calculation to retrieve the correct mamba state from h
+            # by _force_track_h()
+            mamba_track_fla_chunk_aligned = (
+                len(req.prefix_indices)
+                + (req.extend_input_len // FLA_CHUNK_SIZE) * FLA_CHUNK_SIZE
+            )
+            if mamba_track_fla_chunk_aligned != mamba_track_seqlen_aligned:
+                # We want to track mamba_track_seqlen_aligned, and it's not the last position,
+                # so we need to add 1 to the seqlen to retrieve the correct mamba state from h.
+                mamba_track_seqlen = _force_track_h(mamba_track_seqlen_aligned)
+
+            req.mamba_next_track_idx = (
+                self.req_to_token_pool.get_mamba_ping_pong_other_idx(
+                    req.mamba_next_track_idx
+                )
+            )
+            if req.mamba_branching_seqlen is not None:
+                # track branching point in this forward if the branching point
+                # is within the current extend batch.
+                branching_seqlen_aligned_mask = (
+                    req.mamba_branching_seqlen - len(req.prefix_indices)
+                ) % mamba_cache_chunk_size == 0
+                if (
+                    req.mamba_branching_seqlen > len(req.prefix_indices)
+                    and req.mamba_branching_seqlen < mamba_track_seqlen
+                    and branching_seqlen_aligned_mask
+                ):
+                    # We want to track mamba_track_seqlen_aligned, and it's not the last position,
+                    # so we need to add 1 to the seqlen to retrieve the correct mamba state from h.
+                    # See _force_track_h() for more details.
+                    mamba_track_seqlen = _force_track_h(req.mamba_branching_seqlen)
+                    mamba_track_seqlen_aligned = req.mamba_branching_seqlen
+            req.mamba_last_track_seqlen = mamba_track_seqlen_aligned
+        mamba_track_seqlens_cpu.append(mamba_track_seqlen)
+
+    def prepare_for_split_prefill(self):
+        self.prepare_for_extend()
+        # For split prefill, we need to set the forward mode to SPLIT_PREFILL
+        self.forward_mode = ForwardMode.SPLIT_PREFILL
+
+    def mix_with_running(self, running_batch: "ScheduleBatch"):
+        self.forward_mode = ForwardMode.MIXED
+        running_bs = running_batch.batch_size()
+
+        for req in running_batch.reqs:
+            req.fill_ids = req.origin_input_ids + req.output_ids
+            req.set_extend_input_len(1)
+
+        input_ids = torch.cat([self.input_ids, running_batch.input_ids])
+        out_cache_loc = torch.cat([self.out_cache_loc, running_batch.out_cache_loc])
+
+        self.merge_batch(running_batch)
+        self.input_ids = input_ids
+        self.out_cache_loc = out_cache_loc
+
+        # For overlap scheduler, the output_ids has one step delay
+        delta = 0 if self.enable_overlap else -1
+
+        # NOTE: prefix_indices is what has been cached, but we don't cache each decode step
+        self.prefix_lens.extend(
+            [
+                len(r.origin_input_ids) + len(r.output_ids) + delta
+                for r in running_batch.reqs
+            ]
+        )
+        self.extend_lens.extend([1] * running_bs)
+        self.extend_num_tokens += running_bs
+        # TODO (lianmin): Revisit this. It should be seq_len - 1
+        self.extend_logprob_start_lens.extend([0] * running_bs)
+        self.is_prefill_only = False
+
+    def new_tokens_required_next_decode(
+        self, selected_indices: Optional[List[int]] = None
+    ):
+        page_size = self.token_to_kv_pool_allocator.page_size
+        requests = (
+            self.reqs
+            if selected_indices is None
+            else [self.reqs[i] for i in selected_indices]
+        )
+
+        if self.spec_algorithm.is_none():
+            new_pages = sum(1 for r in requests if r.kv_committed_len % page_size == 0)
+            return new_pages * page_size
+
+        server_args = get_global_server_args()
+        len_per_topk = server_args.speculative_num_steps or 1
+        spec_topk = server_args.speculative_eagle_topk or 1
+        spec_tokens = server_args.speculative_num_draft_tokens
+
+        if page_size > 1 and spec_topk > 1:
+            # last partial page and ceil alignment
+            len_per_topk = ceil_align(len_per_topk + page_size, page_size)
+            spec_tokens = ceil_align(spec_tokens, page_size)
+        elif page_size > 1:
+            # only page alignment
+            len_per_topk = ceil_align(len_per_topk, page_size)
+            spec_tokens = ceil_align(spec_tokens, page_size)
+
+        num_tokens = max(len_per_topk * spec_topk, spec_tokens) * len(requests)
+
+        # v2 eagle has over-allocation
+        return num_tokens * (1 + self.is_spec_v2)
+
+    def check_decode_mem(self, selected_indices: Optional[List[int]] = None):
+        num_tokens = self.new_tokens_required_next_decode(selected_indices)
+        evict_from_tree_cache(self.tree_cache, num_tokens)
+        return self.token_to_kv_pool_allocator.available_size() >= num_tokens
+
+    def retract_all(self, server_args: ServerArgs):
+        retracted_reqs = self.reqs
+        for idx in range(len(self.reqs)):
+            self.release_req(idx, len(self.reqs) - idx, server_args)
+
+        self.filter_batch(retracted_reqs)
+        return retracted_reqs
+
+    def retract_decode(
+        self, server_args: ServerArgs
+    ) -> Tuple[List[Req], float, List[Req]]:
+        """Retract the decoding requests when there is not enough memory."""
+        sorted_indices = list(range(len(self.reqs)))
+
+        # TODO(lsyin): improve retraction policy for radix cache
+        # For spec decoding, filter_batch API can only filter
+        # requests from the back, so we can only retract from the back.
+        # TODO(sang): Clean up finish path and support better retract
+        # policy.
+        if not server_args.speculative_algorithm:
+            sorted_indices.sort(
+                key=lambda i: (
+                    len(self.reqs[i].output_ids),
+                    -len(self.reqs[i].origin_input_ids),
+                ),
+                reverse=True,
+            )
+
+        retracted_reqs = []
+        first_iter = True
+        while first_iter or (
+            not self.check_decode_mem(selected_indices=sorted_indices)
+        ):
+            if len(sorted_indices) == 1:
+                # Always keep at least one request
+                break
+
+            first_iter = False
+            idx = sorted_indices.pop()
+            req = self.reqs[idx]
+            retracted_reqs.append(req)
+            # release memory and don't insert into the tree because we need the space instantly
+            self.release_req(idx, len(sorted_indices), server_args)
+
+        if len(sorted_indices) <= 1 and not self.check_decode_mem(
+            selected_indices=sorted_indices
+        ):
+            # Retracting loops ends and still not enough memory
+            raise ValueError(
+                "Out of memory even after retracting all other requests in the decode batch."
+            )
+
+        self.filter_batch(keep_indices=sorted_indices)
+
+        # Reqs in batch are filtered
+        total_decoded_tokens = sum(len(r.output_ids) for r in self.reqs)
+        total_max_new_tokens = sum(r.sampling_params.max_new_tokens for r in self.reqs)
+
+        new_estimate_ratio = (
+            total_decoded_tokens
+            + envs.SGLANG_RETRACT_DECODE_STEPS.get() * len(self.reqs)
+        ) / (
+            total_max_new_tokens + 1
+        )  # avoid zero division
+        new_estimate_ratio = min(1.0, new_estimate_ratio)
+
+        return retracted_reqs, new_estimate_ratio, []
+
+    def release_req(self, idx: int, remaing_req_count: int, server_args: ServerArgs):
+        req = self.reqs[idx]
+
+        if server_args.disaggregation_mode == "decode":
+            req.offload_kv_cache(
+                self.req_to_token_pool, self.token_to_kv_pool_allocator
+            )
+        # TODO (csy): for preempted requests, we may want to insert into the tree
+        release_kv_cache(req, self.tree_cache, is_insert=False)
+        # NOTE(lsyin): we should use the newly evictable memory instantly.
+        num_tokens = remaing_req_count * envs.SGLANG_RETRACT_DECODE_STEPS.get()
+        evict_from_tree_cache(self.tree_cache, num_tokens)
+
+        req.reset_for_retract()
+
+    def prepare_encoder_info_decode(self):
+        # Reset the encoder cached status
+        self.encoder_cached = [True] * len(self.reqs)
+
+    def prepare_for_idle(self):
+        self.forward_mode = ForwardMode.IDLE
+        self.input_ids = torch.empty(0, dtype=torch.int64, device=self.device)
+        self.seq_lens = torch.empty(0, dtype=torch.int64, device=self.device)
+        self.seq_lens_cpu = torch.empty(0, dtype=torch.int64)
+        self.orig_seq_lens = torch.empty(0, dtype=torch.int32, device=self.device)
+        self.out_cache_loc = torch.empty(0, dtype=torch.int64, device=self.device)
+        self.req_pool_indices = torch.empty(0, dtype=torch.int32, device=self.device)
+        self.seq_lens_sum = 0
+        self.extend_num_tokens = 0
+        self.sampling_info = SamplingBatchInfo.from_schedule_batch(
+            self,
+            self.model_config.vocab_size,
+        )
+
+    @property
+    def is_spec_v2(self):
+        # FIXME: finally deprecate is_spec_v2
+        ret = self.enable_overlap and not self.spec_algorithm.is_none()
+        assert not ret or self.spec_algorithm.supports_spec_v2()
+        return ret
+
+    def prepare_for_decode(self):
+        self.forward_mode = ForwardMode.DECODE
+        bs = len(self.reqs)
+
+        if self.is_spec_v2:
+            # TODO(spec-v2): all spec v2 should go through this path
+            draft_input: EagleDraftInput = self.spec_info
+            draft_input.prepare_for_decode(self)
+
+        if not self.spec_algorithm.is_none():
+            # if spec decoding is used, the decode batch is prepared inside
+            # `forward_batch_speculative_generation` after running draft models.
+            return
+
+        if self.sampling_info.penalizer_orchestrator.is_required:
+            if self.enable_overlap:
+                # TODO: this can be slow, optimize this.
+                delayed_output_ids = torch.tensor(
+                    [
+                        (
+                            req.output_ids[-1]
+                            if len(req.output_ids)
+                            else req.origin_input_ids[-1]
+                        )
+                        for req in self.reqs
+                    ],
+                    dtype=torch.int64,
+                    device=self.device,
+                )
+                self.sampling_info.penalizer_orchestrator.cumulate_output_tokens(
+                    delayed_output_ids
+                )
+            else:
+                self.sampling_info.penalizer_orchestrator.cumulate_output_tokens(
+                    self.output_ids.to(torch.int64)
+                )
+
+        # Update fields
+        self.input_ids = self.output_ids
+        self.output_ids = None
+
+        if self.model_config.is_encoder_decoder:
+            self.prepare_encoder_info_decode()
+
+        # Allocate memory
+        self.out_cache_loc = alloc_for_decode(self, token_per_req=1)
+
+        # Update req-level memory management fields
+        for req in self.reqs:
+            req.decode_batch_idx += 1
+            req.kv_committed_len += 1
+            req.kv_allocated_len += 1
+
+        # Update seq_lens after allocation
+        if self.enable_overlap:
+            # Do not use in-place operations in the overlap mode
+            self.seq_lens = self.seq_lens + 1
+            self.seq_lens_cpu = self.seq_lens_cpu + 1
+            self.orig_seq_lens = self.orig_seq_lens + 1
+        else:
+            # A faster in-place version
+            self.seq_lens.add_(1)
+            self.seq_lens_cpu.add_(1)
+            self.orig_seq_lens.add_(1)
+        self.seq_lens_sum += bs
+
+        if get_global_server_args().enable_mamba_extra_buffer():
+            if len(self.reqs) == 0:
+                self.mamba_track_indices = torch.empty(
+                    (0,), dtype=torch.int64, device=self.device
+                )
+            else:
+                # already on device
+                all_buffers = torch.stack(
+                    [req.mamba_ping_pong_track_buffer for req in self.reqs]
+                )
+                idx = (
+                    torch.tensor(
+                        [req.mamba_next_track_idx for req in self.reqs],
+                        dtype=torch.int64,
+                        pin_memory=True,
+                    )
+                    .unsqueeze(1)
+                    .to(device=all_buffers.device, non_blocking=True)
+                )
+                self.mamba_track_indices = (
+                    torch.gather(all_buffers, 1, idx).squeeze(1).to(torch.int64)
+                )
+
+            # async H2D
+            self.mamba_track_mask = (
+                (self.seq_lens_cpu % get_global_server_args().mamba_track_interval == 0)
+                .pin_memory()
+                .to(device=self.device, non_blocking=True)
+            )
+
+    def maybe_wait_verify_done(self):
+        if self.is_spec_v2:
+            draft_input: EagleDraftInput = self.spec_info
+            if draft_input.verify_done is not None:
+                draft_input.verify_done.synchronize()
+
+    def filter_batch(
+        self,
+        chunked_req_to_exclude: Optional[Union[Req, List[Req]]] = None,
+        keep_indices: Optional[List[int]] = None,
+        # FIXME(lsyin): deprecate this API after spec v1 is deprecated
+        v1_spec_info_filtered: Optional[bool] = False,
+    ):
+        # FIXME(lsyin): used here to get the correct seq_lens
+        # The batch has been launched but we need it verified to get correct next batch info
+        self.maybe_wait_verify_done()
+
+        if keep_indices is None:
+            if isinstance(chunked_req_to_exclude, Req):
+                chunked_req_to_exclude = [chunked_req_to_exclude]
+            elif chunked_req_to_exclude is None:
+                chunked_req_to_exclude = []
+            keep_indices = [
+                i
+                for i in range(len(self.reqs))
+                if not self.reqs[i].finished()
+                and self.reqs[i] not in chunked_req_to_exclude
+            ]
+
+        if keep_indices is None or len(keep_indices) == 0:
+            # Filter out all requests
+            self.reqs = []
+            return
+
+        if len(keep_indices) == len(self.reqs):
+            # No need to filter
+            return
+
+        keep_indices_device = torch.tensor(
+            keep_indices,
+            dtype=torch.int64,
+            pin_memory=is_pin_memory_available(self.device),
+        ).to(self.device, non_blocking=True)
+
+        if self.model_config.is_encoder_decoder:
+            self.encoder_lens = self.encoder_lens[keep_indices_device]
+            self.encoder_lens_cpu = [self.encoder_lens_cpu[i] for i in keep_indices]
+
+        self.reqs = [self.reqs[i] for i in keep_indices]
+        if self.multimodal_inputs is not None:
+            self.multimodal_inputs = [self.multimodal_inputs[i] for i in keep_indices]
+        self.req_pool_indices = self.req_pool_indices[keep_indices_device]
+        self.seq_lens = self.seq_lens[keep_indices_device]
+        self.seq_lens_cpu = self.seq_lens_cpu[keep_indices]
+        self.orig_seq_lens = self.orig_seq_lens[keep_indices_device]
+        self.out_cache_loc = None
+        self.seq_lens_sum = self.seq_lens.sum().item()
+
+        if self.output_ids is not None:
+            self.output_ids = self.output_ids[keep_indices_device]
+
+        self.mamba_track_indices = None
+        self.mamba_track_mask = None
+        self.mamba_track_seqlens = None
+        self.return_logprob = any(req.return_logprob for req in self.reqs)
+        if self.return_logprob:
+            self.top_logprobs_nums = [self.top_logprobs_nums[i] for i in keep_indices]
+            self.token_ids_logprobs = [self.token_ids_logprobs[i] for i in keep_indices]
+        else:
+            self.top_logprobs_nums = None
+            self.token_ids_logprobs = None
+
+        self.has_stream = any(req.stream for req in self.reqs)
+        self.has_grammar = any(req.grammar for req in self.reqs)
+
+        self.sampling_info.filter_batch(keep_indices, keep_indices_device)
+        # NOTE: spec_info filtered before batch filtering only happens in:
+        # - Spec v1's verify phase
+        # - Only for decode batch (running_batch)
+        has_been_filtered = v1_spec_info_filtered and not self.is_spec_v2
+
+        if self.spec_info:
+            self.spec_info.filter_batch(
+                new_indices=keep_indices_device,
+                has_been_filtered=has_been_filtered,
+            )
+
+    def merge_batch(self, other: "ScheduleBatch"):
+        # In the regular scheduler path:
+        # 1) self is always prefill, whose seq_lens is not a future
+        # 2) other is always decode, which is finished in previous step
+        # so verify_done is already synced and this is a no-op.
+        # In disagg decode + overlap, merge_batch can be called before
+        # filter_batch, so running_batch.seq_lens may still be a forward_stream
+        # future. Synchronize here to avoid a cross-stream data race.
+        self.maybe_wait_verify_done()
+
+        # Penalizer orchestrator must be merged before Batch.reqs is merged. This is because
+        # orchestrator.merge() depends on Batch.reqs during preparation of each penalizers, so it
+        # needs to be called with pre-merged Batch.reqs.
+        self.sampling_info.merge_batch(other.sampling_info)
+
+        # Encoder-decoder infos
+        if self.model_config.is_encoder_decoder:
+            self.encoder_lens = torch.cat([self.encoder_lens, other.encoder_lens])
+            self.encoder_lens_cpu.extend(other.encoder_lens_cpu)
+        self.req_pool_indices = torch.cat(
+            [self.req_pool_indices, other.req_pool_indices]
+        )
+        self.seq_lens = torch.cat([self.seq_lens, other.seq_lens])
+        self.seq_lens_cpu = torch.cat([self.seq_lens_cpu, other.seq_lens_cpu])
+        self.orig_seq_lens = torch.cat([self.orig_seq_lens, other.orig_seq_lens])
+        self.out_cache_loc = None
+        self.seq_lens_sum += other.seq_lens_sum
+        if self.output_ids is not None:
+            self.output_ids = torch.cat([self.output_ids, other.output_ids])
+        self.mamba_track_indices = None
+        self.mamba_track_mask = None
+        self.mamba_track_seqlens = None
+        if self.return_logprob and other.return_logprob:
+            self.top_logprobs_nums.extend(other.top_logprobs_nums)
+            self.token_ids_logprobs.extend(other.token_ids_logprobs)
+        elif self.return_logprob:
+            self.top_logprobs_nums.extend([0] * len(other.reqs))
+            self.token_ids_logprobs.extend([None] * len(other.reqs))
+        elif other.return_logprob:
+            self.top_logprobs_nums = [0] * len(self.reqs) + other.top_logprobs_nums
+            self.token_ids_logprobs = [None] * len(self.reqs) + other.token_ids_logprobs
+        self.reqs.extend(other.reqs)
+        if self.multimodal_inputs is not None:
+            self.multimodal_inputs.extend(other.multimodal_inputs)
+
+        self.return_logprob |= other.return_logprob
+        self.has_stream |= other.has_stream
+        self.has_grammar |= other.has_grammar
+        self.return_hidden_states |= other.return_hidden_states
+
+        if self.spec_info:
+            self.spec_info.merge_batch(other.spec_info)
+
+    def get_model_worker_batch(
+        self, seq_lens_cpu_cache: Optional[torch.Tensor] = None
+    ) -> ModelWorkerBatch:
+        if self.forward_mode.is_decode_or_idle():
+            extend_seq_lens = extend_prefix_lens = extend_logprob_start_lens = None
+        else:
+            extend_seq_lens = self.extend_lens
+            extend_prefix_lens = self.prefix_lens
+            extend_logprob_start_lens = self.extend_logprob_start_lens
+
+        if self.sampling_info:
+            if self.has_grammar:
+                self.sampling_info.grammars = [req.grammar for req in self.reqs]
+            else:
+                self.sampling_info.grammars = None
+
+        seq_lens_cpu = (
+            seq_lens_cpu_cache if seq_lens_cpu_cache is not None else self.seq_lens_cpu
+        )
+
+        return ModelWorkerBatch(
+            forward_mode=self.forward_mode,
+            input_ids=self.input_ids,
+            req_pool_indices=self.req_pool_indices,
+            seq_lens=self.seq_lens,
+            orig_seq_lens=self.orig_seq_lens,
+            out_cache_loc=self.out_cache_loc,
+            seq_lens_cpu=seq_lens_cpu,
+            seq_lens_sum=self.seq_lens_sum,
+            return_logprob=self.return_logprob,
+            top_logprobs_nums=self.top_logprobs_nums,
+            token_ids_logprobs=self.token_ids_logprobs,
+            global_num_tokens=self.global_num_tokens,
+            global_num_tokens_for_logprob=self.global_num_tokens_for_logprob,
+            is_extend_in_batch=self.is_extend_in_batch,
+            all_extend_in_batch=self.all_extend_in_batch,
+            can_run_dp_cuda_graph=self.can_run_dp_cuda_graph,
+            tbo_split_seq_index=self.tbo_split_seq_index,
+            global_forward_mode=self.global_forward_mode,
+            extend_num_tokens=self.extend_num_tokens,
+            extend_seq_lens=extend_seq_lens,
+            extend_prefix_lens=extend_prefix_lens,
+            extend_logprob_start_lens=extend_logprob_start_lens,
+            multimodal_inputs=self.multimodal_inputs,
+            encoder_cached=self.encoder_cached,
+            encoder_lens=self.encoder_lens,
+            encoder_lens_cpu=self.encoder_lens_cpu,
+            encoder_out_cache_loc=self.encoder_out_cache_loc,
+            lora_ids=[req.lora_id for req in self.reqs],
+            sampling_info=self.sampling_info,
+            input_embeds=self.input_embeds,
+            token_type_ids=self.token_type_ids,
+            spec_algorithm=self.spec_algorithm,
+            spec_info=self.spec_info,
+            hicache_consumer_index=self.hicache_consumer_index,
+            capture_hidden_mode=(
+                CaptureHiddenMode.FULL
+                if self.return_hidden_states
+                else (
+                    getattr(
+                        self.spec_info, "capture_hidden_mode", CaptureHiddenMode.NULL
+                    )
+                    if self.spec_info
+                    else CaptureHiddenMode.NULL
+                )
+            ),
+            extend_input_logprob_token_ids=self.extend_input_logprob_token_ids,
+            is_prefill_only=self.is_prefill_only,
+            dimensions=self.dimensions,
+            dllm_block_offsets=[req.dllm_block_offset for req in self.reqs],
+            dllm_config=self.dllm_config,
+            reqs=self.reqs,
+            has_grammar=self.has_grammar,
+            mamba_track_indices=self.mamba_track_indices,
+            mamba_track_mask=self.mamba_track_mask,
+            mamba_track_seqlens=self.mamba_track_seqlens,
+        )
+
+    def copy(self):
+        # Only contain fields that will be used by process_batch_result.
+        # Shallow-copy the reqs list so that in-place mutations (filter_batch,
+        # merge_batch) on the original don't corrupt this snapshot.
+        return ScheduleBatch(
+            reqs=self.reqs[:],
+            req_to_token_pool=self.req_to_token_pool,
+            req_pool_indices=self.req_pool_indices,
+            model_config=self.model_config,
+            forward_mode=self.forward_mode,
+            out_cache_loc=self.out_cache_loc,
+            return_logprob=self.return_logprob,
+            decoding_reqs=self.decoding_reqs,
+            spec_algorithm=self.spec_algorithm,
+            global_num_tokens=self.global_num_tokens,
+            global_num_tokens_for_logprob=self.global_num_tokens_for_logprob,
+            can_run_dp_cuda_graph=self.can_run_dp_cuda_graph,
+            all_extend_in_batch=self.all_extend_in_batch,
+            is_extend_in_batch=self.is_extend_in_batch,
+            is_prefill_only=self.is_prefill_only,
+            seq_lens_cpu=self.seq_lens_cpu,
+            enable_overlap=self.enable_overlap,
+            mamba_track_indices=self.mamba_track_indices,
+            mamba_track_mask=self.mamba_track_mask,
+            mamba_track_seqlens=self.mamba_track_seqlens,
+            dp_cooperation_info=self.dp_cooperation_info,
+            prefill_stats=self.prefill_stats,
+        )
+
+    def maybe_evict_swa(self):
+        if self.tree_cache.supports_swa():
+            sliding_window_size = self.tree_cache.sliding_window_size
+            server_args = get_global_server_args()
+
+            if (
+                self.forward_mode.is_decode()
+                and not server_args.disable_piecewise_cuda_graph
+                and not self.tree_cache.is_chunk_cache()
+            ):
+                return
+
+            for idx, req in enumerate(self.reqs):
+                if self.forward_mode.is_decode():
+                    # We set evict_swa condition here with two reasons:
+                    # 1. In overlap scheduler, we cannot evict swa when req.decode_batch_idx == 0 since the prev extend batch is still running.
+                    # 2. Evict swa every window_size tokens to reduce the overhead.
+                    if req.decode_batch_idx % sliding_window_size == 1:
+                        self._evict_swa(req, req.seqlen - 1)
+                elif self.forward_mode.is_extend() and self.tree_cache.is_chunk_cache():
+                    pre_len = self.prefix_lens[idx]
+                    if self.enable_overlap:
+                        # In chunked prefill case, when the second extend batch is scheduling, the first extend batch is still running, so we cannot evict swa tokens
+                        if req.extend_batch_idx < 2:
+                            continue
+                        else:
+                            pre_len = (
+                                pre_len - server_args.chunked_prefill_size
+                                if server_args.chunked_prefill_size > 0
+                                else pre_len
+                            )
+                            self._evict_swa(req, pre_len)
+                    else:
+                        self._evict_swa(req, pre_len)
+
+    def _evict_swa(self, req: Req, pre_len: int):
+        assert self.tree_cache.supports_swa(), "prefix cache must support swa"
+        sliding_window_size = self.tree_cache.sliding_window_size
+
+        # For swa radix cache, we need to evict the tokens that are not in the tree cache and also not in the sliding window
+        assert (
+            req.cache_protected_len % self.tree_cache.page_size == 0
+        ), "cache_protected_len must be page aligned"
+        req.swa_evicted_seqlen = max(req.swa_evicted_seqlen, req.cache_protected_len)
+
+        new_swa_evicted_seqlen = max(
+            req.swa_evicted_seqlen, pre_len - sliding_window_size
+        )
+
+        if self.tree_cache.page_size > 1:
+            new_swa_evicted_seqlen = (
+                new_swa_evicted_seqlen // self.tree_cache.page_size
+            ) * self.tree_cache.page_size
+
+        if new_swa_evicted_seqlen > req.swa_evicted_seqlen:
+            free_slots = self.req_to_token_pool.req_to_token[
+                req.req_pool_idx, req.swa_evicted_seqlen : new_swa_evicted_seqlen
+            ]
+            self.token_to_kv_pool_allocator.free_swa(free_slots)
+            req.swa_evicted_seqlen = new_swa_evicted_seqlen
+
+    def __str__(self):
+        return (
+            f"ScheduleBatch(forward_mode={self.forward_mode.name if self.forward_mode else 'None'}, "
+            f"#req={(len(self.reqs))})"
+        )
+
+
+@dataclasses.dataclass
+class ModelWorkerBatch:
+    # The forward mode
+    forward_mode: ForwardMode
+    # The input ids
+    input_ids: torch.Tensor
+    # The indices of requests in the req_to_token_pool
+    req_pool_indices: torch.Tensor
+    # The sequence length
+    seq_lens: torch.Tensor
+    # The indices of output tokens in the token_to_kv_pool_allocator
+    out_cache_loc: torch.Tensor
+    # The sequence length tensor on CPU
+    seq_lens_cpu: Optional[torch.Tensor]
+    seq_lens_sum: int
+
+    # For logprob
+    return_logprob: bool
+    top_logprobs_nums: Optional[List[int]]
+    token_ids_logprobs: Optional[List[List[int]]]
+
+    # For DP attention
+    global_num_tokens: Optional[List[int]]
+    global_num_tokens_for_logprob: Optional[List[int]]
+    is_extend_in_batch: bool
+    all_extend_in_batch: bool
+    can_run_dp_cuda_graph: bool
+    tbo_split_seq_index: Optional[int]
+    global_forward_mode: Optional[ForwardMode]
+
+    # For extend
+    extend_num_tokens: Optional[int]
+    extend_seq_lens: Optional[List[int]]
+    extend_prefix_lens: Optional[List[int]]
+    extend_logprob_start_lens: Optional[List[int]]
+    extend_input_logprob_token_ids: Optional[torch.Tensor]
+
+    # For multimodal
+    multimodal_inputs: Optional[List[MultimodalInputs]]
+
+    # For encoder-decoder
+    encoder_cached: Optional[List[bool]]
+    encoder_lens: Optional[torch.Tensor]
+    encoder_lens_cpu: Optional[List[int]]
+    encoder_out_cache_loc: Optional[torch.Tensor]
+
+    # For LoRA
+    lora_ids: Optional[List[str]]
+
+    # Sampling info
+    sampling_info: SamplingBatchInfo
+
+    # The original sequence lengths, Qwen-1M related
+    orig_seq_lens: Optional[torch.Tensor] = None
+
+    # The input Embeds
+    input_embeds: Optional[torch.Tensor] = None
+
+    # For corss-encoder model
+    token_type_ids: Optional[torch.Tensor] = None
+
+    # Speculative decoding
+    spec_algorithm: SpeculativeAlgorithm = None
+
+    spec_info: Optional[SpecInput] = None
+
+    # If set, the output of the batch contains the hidden states of the run.
+    capture_hidden_mode: CaptureHiddenMode = None
+    hicache_consumer_index: int = -1
+
+    # For matryoshka embeddings
+    dimensions: Optional[list[int]] = None
+
+    # Whether this batch is prefill-only (no token generation needed)
+    is_prefill_only: bool = False
+
+    # Diffusion LLM
+    dllm_block_offsets: Optional[List[int]] = None
+    dllm_config: Optional[DllmConfig] = None
+
+    # For constrained decoding
+    # FIXME(lsyin): remove this after fully overlap grammar
+    reqs: Optional[List[Req]] = None
+    has_grammar: bool = False
+
+    # For hidden states before normal
+    return_hidden_states_before_norm: bool = False
+
+    # For mamba state tracking
+    mamba_track_indices: Optional[torch.Tensor] = None  # shape: [b], int64
+    mamba_track_mask: Optional[torch.Tensor] = None  # shape: [b], bool
+    mamba_track_seqlens: Optional[torch.Tensor] = None  # shape: [b], int64
diff --git a/sglang/python/sglang/srt/managers/schedule_policy.py b/sglang/python/sglang/srt/managers/schedule_policy.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ebd5cd7bb38d4f54ed8132c3bc2bf9f048e6043
--- /dev/null
+++ b/sglang/python/sglang/srt/managers/schedule_policy.py
@@ -0,0 +1,906 @@
+from __future__ import annotations
+
+import logging
+
+from sglang.srt.managers.prefill_delayer import PrefillDelayerSinglePassExecutor
+from sglang.srt.utils import get_bool_env_var
+
+_ROUTING_KEY_POLICY_DEBUG_LOG = get_bool_env_var("SGLANG_ROUTING_KEY_POLICY_DEBUG_LOG")
+logger = logging.getLogger(__name__)
+
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Request scheduler policy"""
+
+import os
+import random
+from collections import Counter, defaultdict
+from contextlib import contextmanager
+from enum import Enum, auto
+from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union
+
+import torch
+
+from sglang.srt.dllm.config import DllmConfig
+from sglang.srt.layers.attention.nsa.utils import is_nsa_prefill_cp_in_seq_split
+from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
+from sglang.srt.mem_cache.base_prefix_cache import (
+    BasePrefixCache,
+    InsertParams,
+    MatchPrefixParams,
+)
+from sglang.srt.mem_cache.radix_cache import RadixCache, RadixKey, TreeNode
+from sglang.srt.mem_cache.swa_memory_pool import SWATokenToKVPoolAllocator
+from sglang.srt.server_args import ServerArgs
+
+if TYPE_CHECKING:
+    from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
+
+# Clip the estimation of max_new_tokens for the request whose max_new_tokens is very large.
+# This can prevent the server from being too conservative.
+# Note that this only clips the estimation in the scheduler but does not change the stop
+# condition. The request can still generate tokens until it hits the unclipped max_new_tokens.
+CLIP_MAX_NEW_TOKENS = int(
+    os.environ.get("SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION", "4096")
+)
+
+# Threshold for in-batch prefix cache.
+# If a request has a matched prefix length (against existing cache) less than this value,
+# the scheduler runs the in-batch prefix caching check for this request.
+# If we set it to -1, it means we disable in-batch prefix caching.
+IN_BATCH_PREFIX_CACHING_CHECK_THRESHOLD = int(
+    os.environ.get("IN_BATCH_PREFIX_CACHING_CHECK_THRESHOLD", "32")
+)
+
+# Threshold for in-batch prefix cache.
+# If a request has a matched prefix length (within the waiting queue) larger than this value,
+# the scheduler deprioritizes this request
+IN_BATCH_PREFIX_CACHING_DEPRIORITIZE_THRESHOLD = int(
+    os.environ.get("IN_BATCH_PREFIX_CACHING_DEPRIORITIZE_THRESHOLD", "32")
+)
+
+
+IGNORE_EOS_RESERVE_TOKENS = 1
+
+
+class CacheAwarePolicy(Enum):
+    """Scheduling policies that are aware of the tree cache."""
+
+    LPM = "lpm"  # longest prefix match
+    DFS_WEIGHT = "dfs-weight"  # depth-first search weighting
+
+
+class CacheAgnosticPolicy(Enum):
+    """Scheduling policies that are not aware of the tree cache."""
+
+    FCFS = "fcfs"  # first come first serve
+    LOF = "lof"  # longest output first
+    RANDOM = "random"
+    ROUTING_KEY = "routing-key"  # prioritize by routing key frequency in running batch
+
+
+class SchedulePolicy:
+    Policy = Union[CacheAwarePolicy, CacheAgnosticPolicy]
+
+    def __init__(
+        self,
+        policy: str,
+        tree_cache: BasePrefixCache,
+        enable_hierarchical_cache: bool,
+        enable_priority_scheduling: bool,
+        schedule_low_priority_values_first: bool,
+    ):
+        self.policy = self._validate_and_adjust_policy(policy, tree_cache)
+        self.tree_cache = tree_cache
+        self.enable_hierarchical_cache = enable_hierarchical_cache
+        self.enable_priority_scheduling = enable_priority_scheduling
+        self.schedule_low_priority_values_first = schedule_low_priority_values_first
+        self.priority_sign = 1 if schedule_low_priority_values_first else -1
+
+        # It is used to find the matching prefix for in-batch prefix caching.
+        self.waiting_queue_radix_tree = RadixCache.create_simulated()
+
+    def calc_priority(
+        self, waiting_queue: List[Req], running_batch: Optional[ScheduleBatch] = None
+    ) -> bool:
+        if self.policy == CacheAgnosticPolicy.FCFS:
+            if self.enable_priority_scheduling:
+                SchedulePolicy._sort_by_priority_and_fcfs(
+                    waiting_queue, self.priority_sign
+                )
+            return False
+
+        policy = self._determine_active_policy(waiting_queue)
+
+        prefix_computed = False
+        if isinstance(policy, CacheAwarePolicy):
+            prefix_computed = True
+            temporary_deprioritized = self._compute_prefix_matches(
+                waiting_queue, policy
+            )
+            if policy == CacheAwarePolicy.LPM:
+                SchedulePolicy._sort_by_longest_prefix(
+                    waiting_queue, temporary_deprioritized
+                )
+            elif policy == CacheAwarePolicy.DFS_WEIGHT:
+                SchedulePolicy._sort_by_dfs_weight(waiting_queue, self.tree_cache)
+            else:
+                raise ValueError(f"Unknown CacheAware Policy: {policy=}")
+        else:
+            if policy == CacheAgnosticPolicy.FCFS:
+                pass
+            elif policy == CacheAgnosticPolicy.LOF:
+                SchedulePolicy._sort_by_longest_output(
+                    waiting_queue,
+                    self.enable_priority_scheduling,
+                    self.priority_sign,
+                )
+            elif policy == CacheAgnosticPolicy.RANDOM:
+                SchedulePolicy._sort_randomly(waiting_queue)
+            elif policy == CacheAgnosticPolicy.ROUTING_KEY:
+                if running_batch is not None:
+                    SchedulePolicy._sort_by_routing_key(waiting_queue, running_batch)
+            else:
+                raise ValueError(f"Unknown CacheAgnostic Policy: {policy=}")
+        return prefix_computed
+
+    def _determine_active_policy(self, waiting_queue: List[Req]) -> Policy:
+        if self.policy == CacheAwarePolicy.LPM and len(waiting_queue) > 128:
+            # Turn off the expensive prefix matching and sorting when the #queue is large.
+            return CacheAgnosticPolicy.FCFS
+        return self.policy
+
+    def _validate_and_adjust_policy(
+        self, policy: str, tree_cache: BasePrefixCache
+    ) -> Policy:
+        """
+        Validates the policy and adjusts it if necessary based on tree cache settings.
+        """
+        try:
+            policy_enum = CacheAwarePolicy(policy)
+            if getattr(tree_cache, "disable", True):
+                # If tree_cache is disabled, using CacheAgnosticPolicy policy
+                return CacheAgnosticPolicy.FCFS
+            return policy_enum
+        except ValueError:
+            try:
+                return CacheAgnosticPolicy(policy)
+            except ValueError:
+                raise ValueError(f"Unknown schedule_policy: {policy=}")
+
+    def _compute_prefix_matches(
+        self, waiting_queue: List[Req], policy: CacheAwarePolicy
+    ) -> Set[int]:
+        """
+        Computes and caches the matching prefixes for requests in the waiting queue,
+            and handles in-batch prefix caching logic.
+        """
+        temporary_deprioritized: Set[int] = set()
+        self.waiting_queue_radix_tree.reset()
+
+        for r in waiting_queue:
+            prefix_ids = r.origin_input_ids + r.output_ids
+            extra_key = r.extra_key
+            # NOTE: the prefix_indices must always be aligned with last_node
+            match_result = self.tree_cache.match_prefix(
+                MatchPrefixParams(
+                    key=RadixKey(token_ids=prefix_ids, extra_key=extra_key)
+                )
+            )
+            (
+                r.prefix_indices,
+                r.last_node,
+                r.last_host_node,
+                r.host_hit_length,
+            ) = (
+                match_result.device_indices,
+                match_result.last_device_node,
+                match_result.last_host_node,
+                match_result.host_hit_length,
+            )
+
+            # NOTE(sang): This logic is for in-batch prefix caching;
+            # If there are more than 1 request that have small matching prefix from
+            # existing cache, but all those requests share the same prefix, we prefer
+            # to schedule only one of them so that we can increase the cache hit rate.
+            # We prefer to set IN_BATCH_PREFIX_CACHING_CHECK_THRESHOLD > 0 because too small
+            # threshold means we cannot use in-batch prefix caching for short prefixes.
+            # It is kind of common when the engine is long running (e.g., imagine the prefix "the").
+            if len(r.prefix_indices) <= IN_BATCH_PREFIX_CACHING_CHECK_THRESHOLD:
+                match_result = self.waiting_queue_radix_tree.match_prefix(
+                    MatchPrefixParams(
+                        key=RadixKey(token_ids=prefix_ids, extra_key=extra_key)
+                    )
+                )
+                in_batch_matching_prefixes = match_result.device_indices
+                if (
+                    len(in_batch_matching_prefixes)
+                    >= IN_BATCH_PREFIX_CACHING_DEPRIORITIZE_THRESHOLD
+                ):
+                    temporary_deprioritized.add(r.rid)
+                else:
+                    # Insert with a dummy key
+                    self.waiting_queue_radix_tree.insert(
+                        InsertParams(
+                            key=RadixKey(token_ids=prefix_ids, extra_key=extra_key),
+                            value=torch.empty(len(prefix_ids), dtype=torch.bool),
+                        )
+                    )
+        return temporary_deprioritized
+
+    @staticmethod
+    def _sort_by_longest_prefix(
+        waiting_queue: List[Req], temporary_deprioritized: Set[int]
+    ) -> None:
+        """Sorts the waiting queue based on the longest prefix match."""
+        waiting_queue.sort(
+            key=lambda r: (
+                -len(r.prefix_indices)
+                if r.rid not in temporary_deprioritized
+                else float("inf")
+            )
+        )
+
+    @staticmethod
+    def _sort_by_dfs_weight(
+        waiting_queue: List[Req], tree_cache: BasePrefixCache
+    ) -> None:
+        """Sorts the waiting queue based on a depth-first search weighting."""
+        last_node_to_reqs = defaultdict(list)
+        for req in waiting_queue:
+            last_node_to_reqs[req.last_node].append(req)
+
+        node_to_weight = defaultdict(int)
+        for node in last_node_to_reqs:
+            node_to_weight[node] = len(last_node_to_reqs[node])
+        SchedulePolicy._calc_weight(tree_cache.root_node, node_to_weight)
+
+        waiting_queue.clear()
+        SchedulePolicy._get_dfs_priority(
+            tree_cache.root_node,
+            node_to_weight,
+            last_node_to_reqs,
+            waiting_queue,
+        )
+
+    @staticmethod
+    def _sort_by_longest_output(
+        waiting_queue: List[Req],
+        enable_priority_scheduling: bool,
+        priority_sign: int,
+    ) -> None:
+        """Sorts the waiting queue based on the longest output (max_new_tokens). If using priority scheduling, sort by priority first."""
+        if enable_priority_scheduling:
+            waiting_queue.sort(
+                key=lambda x: (
+                    x.priority * priority_sign,
+                    -x.sampling_params.max_new_tokens,
+                )
+            )
+        else:
+            waiting_queue.sort(key=lambda x: -x.sampling_params.max_new_tokens)
+
+    @staticmethod
+    def _sort_randomly(waiting_queue: List[Req]) -> None:
+        """Shuffles the waiting queue randomly."""
+        random.shuffle(waiting_queue)
+
+    @staticmethod
+    def _sort_by_priority_and_fcfs(
+        waiting_queue: List[Req], priority_sign: int
+    ) -> None:
+        """Sorts the waiting queue based on the request priority then received titmestamp."""
+        waiting_queue.sort(
+            key=lambda x: (
+                x.priority * priority_sign,
+                x.time_stats.wait_queue_entry_time,
+            )
+        )
+
+    @staticmethod
+    def _sort_by_routing_key(
+        waiting_queue: List[Req], running_batch: ScheduleBatch
+    ) -> None:
+        """Sorts waiting queue by routing key frequency in running batch."""
+        routing_key_counts = Counter(
+            r.routing_key for r in running_batch.reqs if r.routing_key
+        )
+
+        if _ROUTING_KEY_POLICY_DEBUG_LOG:
+            waiting_keys_before = [r.routing_key for r in waiting_queue]
+            logger.info(
+                f"routing_key_counts={dict(routing_key_counts)}, "
+                f"waiting_keys_before={waiting_keys_before}"
+            )
+
+        if not routing_key_counts:
+            return
+
+        def sort_key(req: Req):
+            key = req.routing_key
+            if key and key in routing_key_counts:
+                count = routing_key_counts[key]
+                return (0, -count, key)
+            else:
+                return (1, 0, key or "")
+
+        waiting_queue.sort(key=sort_key)
+
+        if _ROUTING_KEY_POLICY_DEBUG_LOG:
+            waiting_keys_after = [r.routing_key for r in waiting_queue]
+            logger.info(f"waiting_keys_after={waiting_keys_after}")
+
+    @staticmethod
+    def _calc_weight(cur_node: TreeNode, node_to_weight: Dict[TreeNode, int]) -> None:
+        for child in cur_node.children.values():
+            SchedulePolicy._calc_weight(child, node_to_weight)
+            node_to_weight[cur_node] += node_to_weight[child]
+
+    @staticmethod
+    def _get_dfs_priority(
+        cur_node: TreeNode,
+        node_to_priority: Dict[TreeNode, int],
+        last_node_to_reqs: Dict[TreeNode, List[Req]],
+        q: List,
+    ) -> None:
+        children = [child for child in cur_node.children.values()]
+        children.sort(key=lambda x: -node_to_priority[x])
+        for child in children:
+            SchedulePolicy._get_dfs_priority(
+                child, node_to_priority, last_node_to_reqs, q
+            )
+        q.extend(last_node_to_reqs[cur_node])
+
+
+class AddReqResult(Enum):
+    CONTINUE = auto()  # Continue to add requests
+    NO_TOKEN = auto()  # No token left
+    OTHER = auto()  # Other reasons to stop adding requests
+
+
+class PrefillAdder:
+    def __init__(
+        self,
+        page_size: int,
+        tree_cache: BasePrefixCache,
+        token_to_kv_pool_allocator: BaseTokenToKVPoolAllocator,
+        running_batch: ScheduleBatch,
+        new_token_ratio: float,
+        rem_input_tokens: int,
+        rem_chunk_tokens: Optional[int],
+        mixed_with_decode_tokens: int = 0,
+        priority_scheduling_preemption_threshold: int = 0,
+        max_prefill_bs: int = 0,
+        max_running_requests: Optional[int] = None,
+        prefill_max_requests: Optional[int] = None,
+        prefill_delayer_single_pass: Optional[PrefillDelayerSinglePassExecutor] = None,
+        dllm_config: Optional[DllmConfig] = None,
+    ):
+        self.page_size = page_size
+        self.tree_cache = tree_cache
+        self.token_to_kv_pool_allocator = token_to_kv_pool_allocator
+        self.running_batch = running_batch
+        self.new_token_ratio = new_token_ratio
+        self.rem_input_tokens = rem_input_tokens - mixed_with_decode_tokens
+        self.rem_chunk_tokens = rem_chunk_tokens
+        self.dllm_config = dllm_config
+
+        if self.dllm_config is not None:
+            self._init_dllm_meta(dllm_config)
+
+        if self.rem_chunk_tokens is not None:
+            self.rem_chunk_tokens -= mixed_with_decode_tokens
+        self.rem_total_token_offset = mixed_with_decode_tokens
+        self.cur_rem_token_offset = mixed_with_decode_tokens
+
+        self.req_states = None
+        self.can_run_list = []
+        self.preempt_list = []
+        self.new_chunked_req = None
+        self.log_hit_tokens = 0
+        # TODO(lsyin): report the real input tokens excluding page alignment
+        self.log_input_tokens = 0
+
+        if running_batch is not None:
+            self.rem_total_token_offset += sum(
+                [
+                    self._get_running_request_total_token_offset(r)
+                    for r in running_batch.reqs
+                ]
+            )
+
+        self.is_hybrid_swa = isinstance(
+            self.token_to_kv_pool_allocator, SWATokenToKVPoolAllocator
+        )
+        self.is_hybrid_ssm_cache = self.tree_cache.supports_mamba()
+
+        self.priority_scheduling_preemption_threshold = (
+            priority_scheduling_preemption_threshold
+        )
+        self.nsa_prefill_cp_in_seq_split = is_nsa_prefill_cp_in_seq_split()
+        self.max_running_requests = max_running_requests
+        self.prefill_max_requests = prefill_max_requests
+        self.prefill_delayer_single_pass = prefill_delayer_single_pass
+        self.max_prefill_bs = max_prefill_bs
+
+    def _init_dllm_meta(self, dllm_config: DllmConfig):
+        self.dllm_block_size = dllm_config.block_size
+        max_running_reqs = dllm_config.max_running_requests
+
+        self.rem_dllm_tokens = max_running_reqs * self.dllm_block_size
+
+    def _get_running_request_total_token_offset(self, req: Req) -> int:
+        return (
+            min(
+                (req.sampling_params.max_new_tokens - len(req.output_ids)),
+                CLIP_MAX_NEW_TOKENS,
+            )
+            * self.new_token_ratio
+        )
+
+    @property
+    def rem_total_tokens(self):
+        if self.is_hybrid_swa:
+            available_and_evictable = min(
+                self.token_to_kv_pool_allocator.full_available_size()
+                + self.tree_cache.full_evictable_size(),
+                self.token_to_kv_pool_allocator.swa_available_size()
+                + self.tree_cache.swa_evictable_size(),
+            )
+        elif self.is_hybrid_ssm_cache:
+            available_and_evictable = (
+                self.token_to_kv_pool_allocator.available_size()
+                + self.tree_cache.full_evictable_size()
+            )
+        else:
+            available_and_evictable = (
+                self.token_to_kv_pool_allocator.available_size()
+                + self.tree_cache.evictable_size()
+            )
+        return available_and_evictable - self.rem_total_token_offset
+
+    @property
+    def cur_rem_tokens(self):
+        if self.is_hybrid_swa:
+            available_and_evictable = min(
+                self.token_to_kv_pool_allocator.full_available_size()
+                + self.tree_cache.full_evictable_size(),
+                self.token_to_kv_pool_allocator.swa_available_size()
+                + self.tree_cache.swa_evictable_size(),
+            )
+        elif self.is_hybrid_ssm_cache:
+            available_and_evictable = (
+                self.token_to_kv_pool_allocator.available_size()
+                + self.tree_cache.full_evictable_size()
+            )
+        else:
+            available_and_evictable = (
+                self.token_to_kv_pool_allocator.available_size()
+                + self.tree_cache.evictable_size()
+            )
+
+        return available_and_evictable - self.cur_rem_token_offset
+
+    def ceil_paged_tokens(self, tokens: int) -> int:
+        return -(-tokens // self.page_size) * self.page_size
+
+    def budget_state(self):
+        if self.rem_total_tokens <= 0 or self.cur_rem_tokens <= 0:
+            return AddReqResult.NO_TOKEN
+
+        if self.rem_input_tokens <= 0:
+            return AddReqResult.OTHER
+
+        if self.dllm_config is not None:
+            if self.rem_dllm_tokens <= 0:
+                return AddReqResult.OTHER
+        else:
+            if self.rem_chunk_tokens is not None and self.rem_chunk_tokens <= 0:
+                return AddReqResult.OTHER
+
+        return AddReqResult.CONTINUE
+
+    def _update_prefill_budget(
+        self, prefix_len: int, extend_input_len: int, max_new_tokens: int
+    ):
+        # TODO(lsyin): check this workaround logic, which only ensures the prefill will not out of memory, and may be too conservative
+        extend_input_len = self.ceil_paged_tokens(extend_input_len)
+
+        # alloc_extend reserves an extra page_size per request to make sure the budget doesn't over-commit
+        page_overhead = self.page_size
+        self.rem_total_token_offset += extend_input_len + max_new_tokens + page_overhead
+        self.cur_rem_token_offset += extend_input_len + page_overhead
+        self.rem_input_tokens -= extend_input_len
+
+        if self.dllm_config is not None:
+            self.rem_dllm_tokens -= extend_input_len
+        elif self.rem_chunk_tokens is not None:
+            self.rem_chunk_tokens -= extend_input_len
+
+        self.log_hit_tokens += prefix_len
+        self.log_input_tokens += extend_input_len
+
+    def _get_dllm_remain_tokens(self) -> int:
+        _rem_tokens = min(
+            self.rem_dllm_tokens,
+            self.dllm_block_size,
+            int(self.rem_total_tokens),
+        )
+        if _rem_tokens <= 0:
+            _rem_tokens = self.rem_dllm_tokens
+
+        return _rem_tokens
+
+    def _add_dllm_req(self, req: Req, prefix_len: int):
+        # FIXME: consider the case when rem_dllm_tokens < dllm_block_size,
+        # the diffusion unmask process may have some problems
+        # Make sure at least one page is available
+        trunc_len = (
+            min(self.rem_dllm_tokens, self.dllm_block_size)
+            // self.page_size
+            * self.page_size
+        )
+
+        req.extend_input_len = trunc_len
+        req.fill_ids = req.fill_ids[: prefix_len + trunc_len]
+
+        self.can_run_list.append(req)
+
+        self._update_prefill_budget(prefix_len, trunc_len, 0)
+
+    def _req_inc_lock_ref(self, req: Req):
+        if self.is_hybrid_swa:
+            swa_uuid_for_lock = self.tree_cache.inc_lock_ref(req.last_node)
+            req.swa_uuid_for_lock = swa_uuid_for_lock
+        else:
+            self.tree_cache.inc_lock_ref(req.last_node)
+
+    def add_dllm_staging_req(self, req: Req):
+        assert self.dllm_config is not None
+        _rem_tokens = self._get_dllm_remain_tokens()
+
+        if _rem_tokens <= 0:
+            return AddReqResult.NO_TOKEN
+
+        # Truncate input length to available tokens and update request metadata
+        truncated = req.extend_input_len > _rem_tokens
+        req.extend_input_len = min(req.extend_input_len, _rem_tokens)
+        req.fill_ids = req.fill_ids[: len(req.prefix_indices) + req.extend_input_len]
+        self.can_run_list.append(req)
+
+        # Update budget: reserve max_new_tokens only if not truncated
+        max_new_tokens = (
+            min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS)
+            if not truncated
+            else 0
+        )
+        self._update_prefill_budget(0, req.extend_input_len, max_new_tokens)
+
+        # Return based on remaining token availability
+        return (
+            AddReqResult.NO_TOKEN
+            if self._get_dllm_remain_tokens() <= 0
+            else AddReqResult.CONTINUE
+        )
+
+    def add_chunked_req(self, req: Req):
+        if self.dllm_config is not None:
+            _rem_tokens = self._get_dllm_remain_tokens()
+        else:
+            _rem_tokens = min(self.rem_chunk_tokens, int(self.rem_total_tokens))
+            # The chunked_req must be added to the list; otherwise, it will cause a memory leak.
+            # Therefore, in certain cases where _rem_tokens <= 0, it should be replaced with rem_chunk_tokens.
+            if _rem_tokens <= 0:
+                _rem_tokens = self.rem_chunk_tokens
+
+        truncated = req.extend_input_len > _rem_tokens
+        req.set_extend_input_len(min(req.extend_input_len, _rem_tokens))
+        req.fill_ids = req.fill_ids[: len(req.prefix_indices) + req.extend_input_len]
+        self.can_run_list.append(req)
+        self._update_prefill_budget(
+            0,
+            req.extend_input_len,
+            (
+                min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS)
+                if not truncated
+                else 0
+            ),
+        )
+
+        # Return if chunked prefill not finished
+        return req if truncated else None
+
+    @contextmanager
+    def _lock_node(self, last_node: TreeNode):
+        try:
+            if self.tree_cache.supports_swa() and self.tree_cache.is_tree_cache():
+                swa_uuid_for_lock = self.tree_cache.inc_lock_ref(last_node)
+            else:
+                self.tree_cache.inc_lock_ref(last_node)
+            yield None
+        finally:
+            if self.tree_cache.supports_swa() and self.tree_cache.is_tree_cache():
+                self.tree_cache.dec_lock_ref(last_node, swa_uuid_for_lock)
+            else:
+                self.tree_cache.dec_lock_ref(last_node)
+
+    def add_one_req_ignore_eos(self, req: Req):
+        # Early exit if no enough tokens for the input tokens
+        if self.ceil_paged_tokens(req.extend_input_len) > min(
+            self.cur_rem_tokens, self.rem_total_tokens
+        ):
+            return AddReqResult.NO_TOKEN
+
+        def add_req_state(r, insert_sort=False):
+            new_token_ratio = (
+                1.0 if r.sampling_params.ignore_eos else self.new_token_ratio
+            )
+            tokens_left = r.sampling_params.max_new_tokens * new_token_ratio - len(
+                r.output_ids
+            )
+            tokens_occupied = len(r.origin_input_ids) + len(r.output_ids)
+
+            if tokens_left <= 0:
+                return
+
+            if not insert_sort:
+                self.req_states.append((tokens_left, tokens_occupied))
+            else:
+                i = 0
+                for i in range(len(self.req_states)):
+                    if tokens_left <= self.req_states[i][0]:
+                        break
+                self.req_states.insert(i, (tokens_left, tokens_occupied))
+
+        if self.req_states is None:
+            self.req_states = []
+            add_req_state(req)
+            if self.running_batch is not None:
+                for r in self.running_batch.reqs:
+                    add_req_state(r)
+            for r in self.can_run_list:
+                add_req_state(r)
+            self.req_states.sort(key=lambda x: x[0])
+        else:
+            add_req_state(req, insert_sort=True)
+
+        if not self.is_hybrid_swa:
+            # Skip this logic for swa. The SWA has different memory management, and
+            # this mechanism is underestimating the memory usage.
+            cur_rem_tokens = self.cur_rem_tokens - self.ceil_paged_tokens(
+                req.extend_input_len
+            )
+            tokens_freed = 0
+            for i, (tokens_left, tokens_occupied) in enumerate(self.req_states):
+                # tokens_left gives a reservative calculation as the last token is not stored
+                bs = len(self.req_states) - i
+                min_free_tokens = cur_rem_tokens + tokens_freed - tokens_left * bs
+                # reserve tokens for corner cases
+                if min_free_tokens <= IGNORE_EOS_RESERVE_TOKENS * bs:
+                    return AddReqResult.NO_TOKEN
+                tokens_freed += tokens_occupied
+
+        if self.dllm_config is not None:
+            if self.rem_dllm_tokens <= 0:
+                return AddReqResult.OTHER
+
+            self._add_dllm_req(req, 0)
+        elif (
+            self.rem_chunk_tokens is None  # chunked prefill is disabled
+            or req.extend_input_len <= self.rem_chunk_tokens  # it is the last chunk
+        ):
+            # Non-chunked prefill
+            self.can_run_list.append(req)
+            self._update_prefill_budget(
+                0,
+                req.extend_input_len,
+                min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS),
+            )
+        else:
+            if self.rem_chunk_tokens <= 0:
+                return AddReqResult.OTHER
+
+            # Chunked prefill
+            trunc_len = self.rem_chunk_tokens
+
+            req.set_extend_input_len(trunc_len)
+            req.fill_ids = req.fill_ids[:trunc_len]
+            self.can_run_list.append(req)
+            self.new_chunked_req = req
+            self._update_prefill_budget(0, trunc_len, 0)
+
+        return self.budget_state()
+
+    def add_one_req(
+        self, req: Req, has_chunked_req: bool, truncation_align_size: Optional[int]
+    ):
+        if (self.prefill_delayer_single_pass is not None) and (
+            not self.prefill_delayer_single_pass.negotiate_should_allow_prefill(
+                local_prefillable=True,
+                running_batch=self.running_batch.batch_size(),
+                max_prefill_bs=self.max_prefill_bs,
+                max_running_requests=self.max_running_requests,
+            )
+        ):
+            return AddReqResult.OTHER
+        # TODO support cp with multiple requests
+        # Enabling context parallelism currently presents precision issues;
+        # therefore, the prefill-batch setting is temporarily set to 1.
+        if self.nsa_prefill_cp_in_seq_split and len(self.can_run_list) >= 1:
+            return AddReqResult.OTHER
+
+        if (x := self.prefill_max_requests) is not None and len(self.can_run_list) >= x:
+            return AddReqResult.OTHER
+
+        if req.sampling_params.ignore_eos and getattr(self.tree_cache, "disable", True):
+            return self.add_one_req_ignore_eos(req)
+
+        total_tokens = req.extend_input_len + min(
+            max(req.sampling_params.max_new_tokens - len(req.output_ids), 0),
+            CLIP_MAX_NEW_TOKENS,
+        )
+
+        # adjusting the input_tokens based on host_hit_length and page_size
+        real_input_tokens = req.extend_input_len - req.host_hit_length
+        real_input_tokens = self.ceil_paged_tokens(real_input_tokens)
+        prefix_len = len(req.prefix_indices)
+
+        if total_tokens >= self.rem_total_tokens:
+            return AddReqResult.NO_TOKEN
+
+        if real_input_tokens >= self.rem_input_tokens and len(self.can_run_list) != 0:
+            return AddReqResult.OTHER
+
+        with self._lock_node(req.last_node):
+            # self.rem_total_tokens may decrease after the lock acquisition
+            if total_tokens >= self.rem_total_tokens:
+                return AddReqResult.NO_TOKEN
+
+            if req.host_hit_length > 0:
+                new_indices, req.last_node = self.tree_cache.init_load_back(
+                    req.last_host_node, req.host_hit_length
+                )
+                req.prefix_indices = torch.cat([req.prefix_indices, new_indices])
+                req.set_extend_input_len(len(req.fill_ids) - len(req.prefix_indices))
+                prefix_len = len(req.prefix_indices)
+                req.cache_protected_len = prefix_len
+
+            input_tokens = self.ceil_paged_tokens(req.extend_input_len)
+
+            if input_tokens >= self.rem_input_tokens and len(self.can_run_list) != 0:
+                return AddReqResult.OTHER
+
+            if self.dllm_config is not None:
+                if self.rem_dllm_tokens <= 0:
+                    return AddReqResult.OTHER
+
+                assert (
+                    truncation_align_size is None
+                ), "truncation_align_size is not supported for dllm prefill"
+
+                self._add_dllm_req(req, prefix_len)
+                self._req_inc_lock_ref(req)
+            elif self.rem_chunk_tokens is None or input_tokens <= self.rem_chunk_tokens:
+                # Non-chunked prefill
+                self.can_run_list.append(req)
+
+                self._req_inc_lock_ref(req)
+                self._update_prefill_budget(
+                    prefix_len,
+                    input_tokens,
+                    min(
+                        req.sampling_params.max_new_tokens,
+                        CLIP_MAX_NEW_TOKENS,
+                    ),
+                )
+            else:
+                # Make sure at least one page is available
+                trunc_len = self.rem_chunk_tokens // self.page_size * self.page_size
+
+                if trunc_len <= 0:
+                    return AddReqResult.OTHER
+
+                # When truncation align size is set, we want to assert that the prefill prefix length is multiple of truncation align size
+                # A typical use case is when deterministic inference is enabled with flashinfer attention backend,
+                # we need the prefill prefix length to be multiple of attention split size
+                if truncation_align_size is not None:
+                    if trunc_len < truncation_align_size:
+                        return AddReqResult.OTHER
+                    else:
+                        trunc_len = truncation_align_size * (
+                            trunc_len // truncation_align_size
+                        )
+
+                # Chunked prefill
+                req.set_extend_input_len(trunc_len)
+                req.fill_ids = req.fill_ids[: len(req.prefix_indices) + trunc_len]
+
+                self.can_run_list.append(req)
+                self.new_chunked_req = req
+
+                self._req_inc_lock_ref(req)
+                self._update_prefill_budget(prefix_len, trunc_len, 0)
+
+        return self.budget_state()
+
+    def preempt_to_schedule(self, req: Req, server_args: ServerArgs) -> bool:
+        """
+        Preempt running requests to serve the new request if the priority threshold is met and token count sum is verified.
+        Returns True if preemption was committed, and the new request can be scheduled.
+        """
+        # Iterate running requests to find preemptible requests
+        priority_sign = 1 if server_args.schedule_low_priority_values_first else -1
+
+        # NOTE: A request finishes in two phases:
+        #   1) check_finished + release_kv_cache  (in process_batch_result)
+        #   2) filter out of batch                (in get_next_batch_to_run / update_running_batch)
+        # Preemption runs between these two phases (inside get_new_batch_prefill),
+        # so running_batch may still contain requests whose KV cache is already freed.
+        # We must skip them here to avoid a double-free on release_req.
+        valid_running_reqs = (
+            r
+            for r in self.running_batch.reqs
+            if r not in self.preempt_list and not r.finished()
+        )
+
+        sorted_valid_running_reqs = sorted(
+            valid_running_reqs,
+            key=lambda x: (
+                x.priority * (-priority_sign),
+                -x.time_stats.wait_queue_entry_time,
+            ),
+        )
+
+        preemptible_reqs = []
+        min_tokens_to_remove = (
+            req.extend_input_len
+            + min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS)
+            - self.rem_total_tokens
+        )
+        for running_req in sorted_valid_running_reqs:
+            # Priority difference needs to meet the threshold to be preemptible.
+            priority_diff = (req.priority - running_req.priority) * (-priority_sign)
+
+            if priority_diff > self.priority_scheduling_preemption_threshold:
+                preemptible_reqs.append(running_req)
+                min_tokens_to_remove -= self._get_running_request_total_token_offset(
+                    running_req
+                )
+                if min_tokens_to_remove <= 0:
+                    break
+            else:
+                break
+
+        # Check max token count limit can be met
+        if len(preemptible_reqs) == 0 or min_tokens_to_remove > 0:
+            return False
+
+        # Preempt running requests. Release allocated resources for immediate usage.
+        preemptible_reqs = set(preemptible_reqs)
+        keep_indices = []
+        release_counter = 0
+        for i, running_req in enumerate(self.running_batch.reqs):
+            if running_req in preemptible_reqs:
+                self.rem_total_token_offset -= (
+                    self._get_running_request_total_token_offset(running_req)
+                )
+                release_counter += 1
+                self.running_batch.release_req(
+                    i, len(self.running_batch.reqs) - release_counter, server_args
+                )
+            else:
+                keep_indices.append(i)
+        self.running_batch.filter_batch(keep_indices=keep_indices)
+        self.preempt_list.extend(preemptible_reqs)
+        return True
diff --git a/sglang/python/sglang/srt/managers/scheduler.py b/sglang/python/sglang/srt/managers/scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7d3c4550a7ead380f0ba80fd16141905c67d137
--- /dev/null
+++ b/sglang/python/sglang/srt/managers/scheduler.py
@@ -0,0 +1,3260 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A scheduler that manages a tensor parallel GPU worker."""
+
+import faulthandler
+import logging
+import os
+import signal
+import sys
+import time
+from collections import deque
+from dataclasses import dataclass
+from http import HTTPStatus
+from typing import Any, Deque, Dict, List, Optional, Tuple, Union
+
+import psutil
+import setproctitle
+import torch
+import torch.distributed
+import zmq
+from torch.cuda import Stream as CudaStream
+from torch.cuda import StreamContext as CudaStreamContext
+from torch.distributed import barrier
+
+from sglang.srt.configs.model_config import ModelConfig
+from sglang.srt.constrained.grammar_manager import GrammarManager
+from sglang.srt.disaggregation.decode import (
+    DecodePreallocQueue,
+    DecodeTransferQueue,
+    SchedulerDisaggregationDecodeMixin,
+)
+from sglang.srt.disaggregation.decode_kvcache_offload_manager import (
+    DecodeKVCacheOffloadManager,
+)
+from sglang.srt.disaggregation.encode_receiver import create_mm_receiver
+from sglang.srt.disaggregation.prefill import (
+    PrefillBootstrapQueue,
+    SchedulerDisaggregationPrefillMixin,
+    release_req_to_metadata_buffer,
+)
+from sglang.srt.disaggregation.utils import (
+    DisaggregationMode,
+    MetadataBuffers,
+    ReqToMetadataIdxAllocator,
+    TransferBackend,
+    prepare_abort,
+)
+from sglang.srt.distributed import get_pp_group, get_world_group
+from sglang.srt.distributed.parallel_state import get_tp_group
+from sglang.srt.dllm.mixin.scheduler import SchedulerDllmMixin
+from sglang.srt.environ import envs
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.layers.attention.mamba.ops import (
+    initialize_mamba_selective_state_update_backend,
+)
+from sglang.srt.layers.dp_attention import (
+    compute_dp_attention_world_info,
+    get_attention_cp_group,
+    get_attention_tp_group,
+)
+from sglang.srt.layers.moe import initialize_moe_config
+from sglang.srt.layers.quantization.fp4_utils import initialize_fp4_gemm_config
+from sglang.srt.layers.quantization.fp8_utils import initialize_fp8_gemm_config
+from sglang.srt.lora.lora_overlap_loader import LoRAOverlapLoader
+from sglang.srt.managers.io_struct import (
+    AbortReq,
+    ActiveRanksOutput,
+    AttachHiCacheStorageReqInput,
+    AttachHiCacheStorageReqOutput,
+    BaseBatchReq,
+    BaseReq,
+    BatchTokenizedEmbeddingReqInput,
+    BatchTokenizedGenerateReqInput,
+    CheckWeightsReqInput,
+    ClearHiCacheReqInput,
+    ClearHiCacheReqOutput,
+    CloseSessionReqInput,
+    ContinueGenerationReqInput,
+    DestroyWeightsUpdateGroupReqInput,
+    DetachHiCacheStorageReqInput,
+    DetachHiCacheStorageReqOutput,
+    DumperControlReqInput,
+    DumperControlReqOutput,
+    ExpertDistributionReq,
+    ExpertDistributionReqOutput,
+    ExpertDistributionReqType,
+    FlushCacheReqInput,
+    FlushCacheReqOutput,
+    FreezeGCReq,
+    GetInternalStateReq,
+    GetInternalStateReqOutput,
+    GetLoadReqInput,
+    GetLoadsReqInput,
+    GetWeightsByNameReqInput,
+    HealthCheckOutput,
+    InitWeightsSendGroupForRemoteInstanceReqInput,
+    InitWeightsSendGroupForRemoteInstanceReqOutput,
+    InitWeightsUpdateGroupReqInput,
+    LoadLoRAAdapterFromTensorsReqInput,
+    LoadLoRAAdapterFromTensorsReqOutput,
+    LoadLoRAAdapterReqInput,
+    LoadLoRAAdapterReqOutput,
+    OpenSessionReqInput,
+    PauseGenerationReqInput,
+    PinPrefixReqInput,
+    PinPrefixReqOutput,
+    ProfileReq,
+    ReleaseMemoryOccupationReqInput,
+    ResumeMemoryOccupationReqInput,
+    RpcReqInput,
+    RpcReqOutput,
+    SendWeightsToRemoteInstanceReqInput,
+    SendWeightsToRemoteInstanceReqOutput,
+    SetInternalStateReq,
+    SetInternalStateReqOutput,
+    SlowDownReqInput,
+    SlowDownReqOutput,
+    TokenizedEmbeddingReqInput,
+    TokenizedGenerateReqInput,
+    UnloadLoRAAdapterReqInput,
+    UnloadLoRAAdapterReqOutput,
+    UpdateWeightFromDiskReqInput,
+    UpdateWeightsFromDistributedReqInput,
+    UpdateWeightsFromIPCReqInput,
+    UpdateWeightsFromTensorReqInput,
+)
+from sglang.srt.managers.mm_utils import init_mm_embedding_cache, unwrap_shm_features
+from sglang.srt.managers.overlap_utils import FutureMap
+from sglang.srt.managers.prefill_delayer import (
+    PrefillDelayer,
+    PrefillDelayerSinglePassExecutor,
+)
+from sglang.srt.managers.schedule_batch import (
+    FINISH_ABORT,
+    ModelWorkerBatch,
+    MultimodalInputs,
+    Req,
+    ScheduleBatch,
+)
+from sglang.srt.managers.schedule_policy import (
+    AddReqResult,
+    PrefillAdder,
+    SchedulePolicy,
+)
+from sglang.srt.managers.scheduler_dp_attn_mixin import SchedulerDPAttnMixin
+from sglang.srt.managers.scheduler_input_blocker import SchedulerInputBlocker
+from sglang.srt.managers.scheduler_output_processor_mixin import (
+    SchedulerOutputProcessorMixin,
+)
+from sglang.srt.managers.scheduler_pp_mixin import SchedulerPPMixin
+from sglang.srt.managers.scheduler_profiler_mixin import SchedulerProfilerMixin
+from sglang.srt.managers.scheduler_recv_skipper import SchedulerRecvSkipper
+from sglang.srt.managers.scheduler_runtime_checker_mixin import (
+    SchedulerRuntimeCheckerMixin,
+    create_scheduler_watchdog,
+)
+from sglang.srt.managers.scheduler_update_weights_mixin import (
+    SchedulerUpdateWeightsMixin,
+)
+from sglang.srt.managers.session_controller import SessionController
+from sglang.srt.managers.utils import GenerationBatchResult, validate_input_length
+from sglang.srt.mem_cache.cache_init_params import CacheInitParams
+from sglang.srt.mem_cache.common import release_kv_cache
+from sglang.srt.mem_cache.radix_cache import RadixCache
+from sglang.srt.mem_cache.session_aware_cache import SessionAwareCache
+from sglang.srt.model_executor.forward_batch_info import ForwardMode, PPProxyTensors
+from sglang.srt.multiplex.multiplexing_mixin import SchedulerMultiplexMixin
+from sglang.srt.observability.req_time_stats import (
+    real_time,
+    set_schedule_time_batch,
+    set_time_batch,
+)
+from sglang.srt.observability.scheduler_metrics_mixin import (
+    RECORD_STEP_TIME,
+    PrefillStats,
+    SchedulerMetricsMixin,
+)
+from sglang.srt.observability.trace import process_tracing_init, trace_set_thread_info
+from sglang.srt.parser.reasoning_parser import ReasoningParser
+from sglang.srt.server_args import PortArgs, ServerArgs, get_global_server_args
+from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
+from sglang.srt.utils import (
+    DynamicGradMode,
+    broadcast_pyobj,
+    configure_gc_logger,
+    configure_logger,
+    freeze_gc,
+    get_available_gpu_memory,
+    get_bool_env_var,
+    get_int_env_var,
+    get_zmq_socket,
+    kill_itself_when_parent_died,
+    numa_bind_to_node,
+    point_to_point_pyobj,
+    require_mlp_sync,
+    set_gpu_proc_affinity,
+    set_random_seed,
+    suppress_other_loggers,
+)
+from sglang.srt.utils.hf_transformers_utils import (
+    get_processor,
+    get_tokenizer,
+    get_tokenizer_from_processor,
+)
+from sglang.srt.utils.torch_memory_saver_adapter import TorchMemorySaverAdapter
+from sglang.utils import TypeBasedDispatcher, get_exception_traceback
+
+logger = logging.getLogger(__name__)
+
+# Test retract decode for debugging purposes
+TEST_RETRACT = envs.SGLANG_TEST_RETRACT.get()
+TEST_RETRACT_INTERVAL = envs.SGLANG_TEST_RETRACT_INTERVAL.get()
+TEST_RETRACT_NO_PREFILL_BS = envs.SGLANG_TEST_RETRACT_NO_PREFILL_BS.get()
+
+
+@dataclass
+class EmbeddingBatchResult:
+    embeddings: torch.Tensor
+    copy_done: Optional[torch.cuda.Event] = None
+
+    def copy_to_cpu(self):
+        """Copy embeddings tensor to CPU in overlap scheduling."""
+
+        if isinstance(self.embeddings, torch.Tensor):
+            self.copy_done = torch.get_device_module(self.embeddings.device).Event()
+            self.embeddings = self.embeddings.to("cpu", non_blocking=True)
+        else:
+            assert isinstance(self.embeddings, list)
+            if len(self.embeddings) == 0:
+                return
+
+            self.copy_done = torch.get_device_module(self.embeddings[0].device).Event()
+            self.embeddings = [
+                emb.to("cpu", non_blocking=True) for emb in self.embeddings
+            ]
+
+        self.copy_done.record()
+
+
+class Scheduler(
+    SchedulerOutputProcessorMixin,
+    SchedulerUpdateWeightsMixin,
+    SchedulerProfilerMixin,
+    SchedulerMetricsMixin,
+    SchedulerDisaggregationDecodeMixin,
+    SchedulerDisaggregationPrefillMixin,
+    SchedulerMultiplexMixin,
+    SchedulerRuntimeCheckerMixin,
+    SchedulerPPMixin,
+    SchedulerDPAttnMixin,
+    SchedulerDllmMixin,
+):
+    """A scheduler that manages a tensor parallel GPU worker."""
+
+    def __init__(
+        self,
+        server_args: ServerArgs,
+        port_args: PortArgs,
+        gpu_id: int,
+        tp_rank: int,
+        moe_ep_rank: int,
+        pp_rank: int,
+        attn_cp_rank: int,
+        moe_dp_rank: int,
+        dp_rank: Optional[int],
+    ):
+        self.is_initializing = True
+        self.init_soft_watchdog(server_args)
+
+        # Parse args
+        self.server_args = server_args
+        self.tp_rank = tp_rank
+        self.moe_ep_rank = moe_ep_rank
+        self.pp_rank = pp_rank
+        self.attn_cp_rank = attn_cp_rank
+        self.attn_cp_size = server_args.attn_cp_size
+        self.moe_dp_rank = moe_dp_rank
+        self.moe_dp_size = server_args.moe_dp_size
+        self.dp_rank = dp_rank
+        self.tp_size = server_args.tp_size
+        self.moe_ep_size = server_args.ep_size
+        self.pp_size = server_args.pp_size
+        self.dp_size = server_args.dp_size
+        self.nccl_port = port_args.nccl_port
+        self.schedule_policy = server_args.schedule_policy
+        self.enable_priority_scheduling = server_args.enable_priority_scheduling
+        self.abort_on_priority_when_disabled = (
+            server_args.abort_on_priority_when_disabled
+        )
+        self.schedule_low_priority_values_first = (
+            server_args.schedule_low_priority_values_first
+        )
+        self.priority_scheduling_preemption_threshold = (
+            server_args.priority_scheduling_preemption_threshold
+        )
+        self.enable_lora = server_args.enable_lora
+        self.enable_lora_overlap_loading = server_args.enable_lora_overlap_loading
+        self.max_loras_per_batch = server_args.max_loras_per_batch
+        self.enable_overlap = not server_args.disable_overlap_schedule
+        self.enable_pdmux = server_args.enable_pdmux
+        self.skip_tokenizer_init = server_args.skip_tokenizer_init
+        self.enable_metrics = server_args.enable_metrics
+        self.enable_metrics_for_all_schedulers = (
+            server_args.enable_metrics_for_all_schedulers
+        )
+        self.stream_interval = server_args.stream_interval
+        self.spec_algorithm = SpeculativeAlgorithm.from_string(
+            server_args.speculative_algorithm
+        )
+        self.gpu_id = gpu_id
+        self.page_size = server_args.page_size
+        self.enable_hierarchical_cache = server_args.enable_hierarchical_cache
+        self.enable_hicache_storage = server_args.hicache_storage_backend is not None
+        self.max_recv_per_poll = envs.SGLANG_SCHEDULER_MAX_RECV_PER_POLL.get()
+
+        # Distributed rank info
+        self.attn_tp_rank, self.attn_tp_size, self.attn_dp_rank = (
+            compute_dp_attention_world_info(
+                server_args.enable_dp_attention,
+                self.tp_rank,
+                self.tp_size,
+                self.dp_size,
+                self.attn_cp_size,
+            )
+        )
+
+        self.enable_kv_cache_events = bool(
+            server_args.kv_events_config and self.attn_tp_rank == 0
+        )
+
+        # Init model configs
+        self.init_model_config()
+
+        # Init metrics stats
+        self.init_metrics(tp_rank, pp_rank, dp_rank)
+
+        # Init inter-process communication
+        self.init_ipc_channels(port_args)
+
+        # Init PD-multiplexing context
+        if self.enable_pdmux:
+            self.init_pdmux()
+
+        # Init tokenizer
+        self.init_tokenizer()
+
+        # Init moe config and GEMM config (FP8 GEMM, etc.)
+        self.init_moe_gemm_config()
+
+        # Init mamba backend
+        self.init_mamba_backend()
+
+        # Launch a model worker and draft model worker if using speculative decoding
+        self.init_model_worker()
+
+        if (t := envs.SGLANG_TEST_STUCK_SCHEDULER_INIT.get()) > 0:
+            time.sleep(t)
+
+        # Init cache and memory pool
+        self.init_cache_with_memory_pool()
+
+        # Init running status
+        self.init_running_status()
+
+        # Init chunked prefill
+        self.init_chunked_prefill()
+
+        # Init diffusion LLM
+        self.init_diffusion_llm()
+
+        # Init schedule policy and new token estimation
+        self.init_schedule_policy()
+
+        # Init watchdog, memory saver, input blocker and recv skipper
+        self.init_watch_dog_memory_saver_input_blocker()
+
+        # Init profiler
+        self.init_profiler()
+
+        # Init prefill-decodedisaggregation
+        self.init_disaggregation()
+
+        # Init overlap schedule
+        self.init_overlap()
+
+        # Init prefill kv split size when deterministic inference is enabled with various attention backends
+        self.init_deterministic_inference_config()
+
+        # Init request dispatcher
+        self.init_request_dispatcher()
+
+        # Init LoRA overlap loader
+        if self.enable_lora_overlap_loading:
+            self.lora_overlap_loader = LoRAOverlapLoader(
+                self.tp_worker.model_runner.lora_manager
+            )
+
+        # Init the grammar backend for constrained generation
+        self.grammar_manager = GrammarManager(self)
+
+        self.is_initializing = False
+
+    def init_model_config(self):
+        self.model_config = ModelConfig.from_server_args(self.server_args)
+
+    def init_ipc_channels(self, port_args: PortArgs):
+        context = zmq.Context(2)
+        self.idle_sleeper = None
+
+        if self.pp_rank == 0 and self.attn_tp_rank == 0 and self.attn_cp_rank == 0:
+            self.recv_from_tokenizer = get_zmq_socket(
+                context, zmq.PULL, port_args.scheduler_input_ipc_name, False
+            )
+            self.recv_from_rpc = get_zmq_socket(
+                context, zmq.DEALER, port_args.rpc_ipc_name, False
+            )
+
+            send_to_tokenizer = get_zmq_socket(
+                context, zmq.PUSH, port_args.tokenizer_ipc_name, False
+            )
+            if self.server_args.skip_tokenizer_init:
+                # Directly send to the TokenizerManager
+                send_to_detokenizer = get_zmq_socket(
+                    context, zmq.PUSH, port_args.tokenizer_ipc_name, False
+                )
+            else:
+                # Send to the DetokenizerManager
+                send_to_detokenizer = get_zmq_socket(
+                    context, zmq.PUSH, port_args.detokenizer_ipc_name, False
+                )
+
+            self.send_to_tokenizer = SenderWrapper(send_to_tokenizer)
+            self.send_to_detokenizer = SenderWrapper(send_to_detokenizer)
+
+            if self.server_args.sleep_on_idle:
+                self.idle_sleeper = IdleSleeper(
+                    [
+                        self.recv_from_tokenizer,
+                        self.recv_from_rpc,
+                    ]
+                )
+        else:
+            self.recv_from_tokenizer = None
+            self.recv_from_rpc = None
+            self.send_to_tokenizer = SenderWrapper(None)
+            self.send_to_detokenizer = SenderWrapper(None)
+
+        if self.current_scheduler_metrics_enabled:
+            self.send_metrics_from_scheduler = get_zmq_socket(
+                context, zmq.PUSH, port_args.metrics_ipc_name, False
+            )
+
+    def init_tokenizer(self):
+        server_args = self.server_args
+        self.is_generation = self.model_config.is_generation
+
+        if server_args.skip_tokenizer_init:
+            self.tokenizer = self.processor = None
+        else:
+            if self.model_config.is_multimodal:
+                self.processor = get_processor(
+                    server_args.tokenizer_path,
+                    tokenizer_mode=server_args.tokenizer_mode,
+                    trust_remote_code=server_args.trust_remote_code,
+                    revision=server_args.revision,
+                    use_fast=not server_args.disable_fast_image_processor,
+                )
+                self.tokenizer = get_tokenizer_from_processor(self.processor)
+            else:
+                self.tokenizer = get_tokenizer(
+                    server_args.tokenizer_path,
+                    tokenizer_mode=server_args.tokenizer_mode,
+                    trust_remote_code=server_args.trust_remote_code,
+                    revision=server_args.revision,
+                )
+
+        # Set reasoning_parser and think_end_id if --reasoning_parser is enabled
+        if self.server_args.reasoning_parser and self.tokenizer:
+            reasoning_parser = ReasoningParser(
+                model_type=self.server_args.reasoning_parser, stream_reasoning=False
+            )
+            self.tokenizer.think_end_id = self.tokenizer.encode(
+                reasoning_parser.detector.think_end_token, add_special_tokens=False
+            )[0]
+
+    def init_mamba_backend(self) -> None:
+        initialize_mamba_selective_state_update_backend(self.server_args)
+
+    def init_moe_gemm_config(self):
+        # For the MM models, check the text_config for MoE settings
+        config_to_check = getattr(
+            self.model_config.hf_config, "text_config", self.model_config.hf_config
+        )
+
+        if hasattr(config_to_check, "num_experts_per_tok"):
+            initialize_moe_config(self.server_args)
+
+        # Initialize GEMM-related configuration for FP8 and FP4 backends.
+        initialize_fp8_gemm_config(self.server_args)
+        initialize_fp4_gemm_config(self.server_args)
+
+        # This must be called after initialize_moe_config
+        self.require_mlp_sync = require_mlp_sync(self.server_args)
+
+    def init_tp_model_worker(self):
+        from sglang.srt.managers.tp_worker import TpModelWorker
+
+        self.tp_worker = TpModelWorker(
+            server_args=self.server_args,
+            gpu_id=self.gpu_id,
+            tp_rank=self.tp_rank,
+            moe_ep_rank=self.moe_ep_rank,
+            pp_rank=self.pp_rank,
+            attn_cp_rank=self.attn_cp_rank,
+            moe_dp_rank=self.moe_dp_rank,
+            dp_rank=self.dp_rank,
+            nccl_port=self.nccl_port,
+        )
+
+    def maybe_init_draft_worker(self):
+        if self.spec_algorithm.is_none():
+            self.draft_worker = None
+            return
+
+        # Launch a draft worker for speculative decoding
+        draft_worker_kwargs = dict(
+            server_args=self.server_args,
+            gpu_id=self.gpu_id,
+            tp_rank=self.tp_rank,
+            moe_ep_rank=self.moe_ep_rank,
+            nccl_port=self.nccl_port,
+            target_worker=self.tp_worker,
+            dp_rank=self.dp_rank,
+            attn_cp_rank=self.attn_cp_rank,
+            moe_dp_rank=self.moe_dp_rank,
+        )
+
+        if self.server_args.speculative_draft_load_format is not None:
+            self.server_args.load_format = (
+                self.server_args.speculative_draft_load_format
+            )
+            logger.info(
+                f"Using draft model load_format: '{self.server_args.speculative_draft_load_format}'"
+            )
+
+        DraftWorkerClass = self.spec_algorithm.create_worker(self.server_args)
+        self.draft_worker = DraftWorkerClass(**draft_worker_kwargs)
+
+    def init_model_worker(self):
+        self.init_tp_model_worker()
+        self.maybe_init_draft_worker()
+
+        # Dispatch the model worker
+        if self.spec_algorithm.is_none():
+            self.model_worker = self.tp_worker
+        else:
+            self.model_worker = self.draft_worker
+
+        # Get token and memory info from the model worker
+        (
+            self.max_total_num_tokens,
+            self.max_prefill_tokens,
+            self.max_running_requests,
+            self.max_queued_requests,
+            self.max_req_len,
+            self.max_req_input_len,
+            self.random_seed,
+            self.device,
+            self.forward_stream,
+            _,
+            _,
+            _,
+        ) = self.tp_worker.get_worker_info()
+        if get_global_server_args().pp_max_micro_batch_size is None:
+            get_global_server_args().pp_max_micro_batch_size = max(
+                self.max_running_requests // self.pp_size, 1
+            )
+
+        self.tp_group = get_tp_group()
+        self.tp_cpu_group = self.tp_group.cpu_group
+        self.attn_tp_group = get_attention_tp_group()
+        self.attn_tp_cpu_group = self.attn_tp_group.cpu_group
+        self.attn_cp_group = get_attention_cp_group()
+        self.attn_cp_cpu_group = self.attn_cp_group.cpu_group
+        self.pp_group = get_pp_group()
+        self.world_group = get_world_group()
+
+        # NOTE: dp_tp_* are request/data-plane coordination groups (not tensor collectives).
+        # When DP attention is enabled, scope to the attention-TP group; otherwise use
+        # the base TP group. Entry rank is the local rank 0 in that group.
+        # Use the CPU (gloo) group to broadcast VLM Python objects and avoid CUDA
+        # stream/device coupling (#11910).
+        self.dp_tp_group = (
+            self.attn_tp_group
+            if self.server_args.enable_dp_attention
+            else self.tp_group
+        )
+        self.dp_tp_cpu_group = self.dp_tp_group.cpu_group
+
+        self.pad_input_ids_func = self.tp_worker.get_pad_input_ids_func()
+        set_random_seed(self.random_seed)
+
+        # Print debug info
+        if self.tp_rank == 0:
+            avail_mem = get_available_gpu_memory(
+                self.device, self.gpu_id, empty_cache=False
+            )
+            logger.info(
+                f"max_total_num_tokens={self.max_total_num_tokens}, "
+                f"chunked_prefill_size={self.server_args.chunked_prefill_size}, "
+                f"max_prefill_tokens={self.max_prefill_tokens}, "
+                f"max_running_requests={self.max_running_requests}, "
+                f"context_len={self.model_config.context_len}, "
+                f"{'available_cpu_mem' if self.device == 'cpu' else 'available_gpu_mem'}={avail_mem:.2f} GB"
+            )
+
+        if self.enable_metrics and hasattr(self, "metrics_collector"):
+            self.metrics_collector.emit_cache_config_info(
+                self.page_size, self.max_total_num_tokens // self.page_size
+            )
+
+    def init_cache_with_memory_pool(self):
+        server_args = self.server_args
+
+        # Hybrid memory pool
+        self.is_hybrid_swa = self.tp_worker.is_hybrid_swa
+        self.is_hybrid_ssm = (
+            self.tp_worker.model_runner.hybrid_gdn_config is not None
+            or self.tp_worker.model_runner.mamba2_config is not None
+        )
+
+        self.sliding_window_size = None
+        if self.is_hybrid_swa:
+            self.sliding_window_size = self.tp_worker.sliding_window_size
+            self.full_tokens_per_layer, self.swa_tokens_per_layer = (
+                self.tp_worker.get_tokens_per_layer_info()
+            )
+
+        self.req_to_token_pool, self.token_to_kv_pool_allocator = (
+            self.tp_worker.get_memory_pool()
+        )
+
+        # Create cache
+        params = CacheInitParams(
+            disable=server_args.disable_radix_cache,
+            req_to_token_pool=self.req_to_token_pool,
+            token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
+            page_size=self.page_size,
+            is_eagle=self.spec_algorithm.is_eagle(),
+            tp_cache_group=(
+                self.attn_tp_cpu_group
+                if self.server_args.enable_dp_attention
+                else self.tp_cpu_group
+            ),
+            eviction_policy=server_args.radix_eviction_policy,
+            enable_metrics=self.enable_metrics,
+            enable_kv_cache_events=self.enable_kv_cache_events,
+            enable_mamba_extra_buffer=server_args.enable_mamba_extra_buffer(),
+            pp_rank=self.pp_rank,
+            pp_size=self.pp_size,
+            chunked_prefill_size=server_args.chunked_prefill_size,
+            sliding_window_size=self.sliding_window_size,
+        )
+
+        if (
+            server_args.chunked_prefill_size is not None
+            and server_args.disable_radix_cache
+        ):
+            if not self.is_hybrid_swa:
+                from sglang.srt.mem_cache.chunk_cache import ChunkCache
+
+                self.tree_cache = ChunkCache(params)
+            else:
+                from sglang.srt.mem_cache.chunk_cache import SWAChunkCache
+
+                self.tree_cache = SWAChunkCache(params)
+        else:
+
+            if envs.SGLANG_EXPERIMENTAL_CPP_RADIX_TREE.get():
+                # lazy import to avoid JIT overhead
+                from sglang.srt.mem_cache.radix_cache_cpp import RadixCacheCpp
+
+                logger.info("Using experimental C++ radix tree implementation.")
+                self.tree_cache = RadixCacheCpp(params=params, server_args=server_args)
+            elif self.enable_hierarchical_cache:
+                from sglang.srt.mem_cache.hiradix_cache import HiRadixCache
+
+                self.tree_cache = HiRadixCache(params=params, server_args=server_args)
+                self.tp_worker.register_hicache_layer_transfer_counter(
+                    self.tree_cache.cache_controller.layer_done_counter
+                )
+            elif self.is_hybrid_swa:
+                from sglang.srt.mem_cache.swa_radix_cache import SWARadixCache
+
+                self.tree_cache = SWARadixCache(params=params)
+            elif self.is_hybrid_ssm:
+                from sglang.srt.mem_cache.mamba_radix_cache import MambaRadixCache
+
+                self.tree_cache = MambaRadixCache(params)
+            elif server_args.enable_lmcache:
+                from sglang.srt.mem_cache.storage.lmcache.lmc_radix_cache import (
+                    LMCRadixCache,
+                )
+
+                self.tree_cache = LMCRadixCache(
+                    params=params,
+                    model_config=self.model_config,
+                    tp_size=self.tp_size,
+                    rank=self.tp_rank,
+                    tp_group=self.tp_group,
+                )
+            else:
+                self.tree_cache = RadixCache(params)
+
+        if server_args.enable_streaming_session:
+
+            self.tree_cache = SessionAwareCache(self.tree_cache)
+
+        if (
+            server_args.disaggregation_mode == "decode"
+            and server_args.disaggregation_decode_enable_offload_kvcache
+        ):
+            self.decode_offload_manager = DecodeKVCacheOffloadManager(
+                req_to_token_pool=self.req_to_token_pool,
+                token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
+                tp_group=params.tp_cache_group,
+                tree_cache=self.tree_cache,
+                server_args=self.server_args,
+            )
+        else:
+            self.decode_offload_manager = None
+
+        embedding_cache_size = envs.SGLANG_VLM_CACHE_SIZE_MB.get()
+        init_mm_embedding_cache(embedding_cache_size * 1024 * 1024)
+
+    def init_running_status(self):
+        self.waiting_queue: List[Req] = []
+        # The running decoding batch for continuous batching
+        self.running_batch: ScheduleBatch = ScheduleBatch(reqs=[], batch_is_full=False)
+        # The current forward batch
+        self.cur_batch: Optional[ScheduleBatch] = None
+        # The last forward batch
+        self.last_batch: Optional[ScheduleBatch] = None
+        self.forward_ct = 0
+        self.return_health_check_ct = 0
+        self.num_retracted_reqs: int = 0
+        self.num_paused_reqs: int = 0
+        self.session_controller = SessionController(self.tree_cache)
+        self.forward_sleep_time = None
+        self._engine_paused = False
+
+    def init_chunked_prefill(self):
+        # Init chunked prefill
+        self.chunked_prefill_size = self.server_args.chunked_prefill_size
+        if self.chunked_prefill_size <= 0:  # -1 means disable
+            self.chunked_prefill_size = None
+        self.chunked_req = None
+        self.is_mixed_chunk = (
+            self.chunked_prefill_size is not None
+            and self.server_args.enable_mixed_chunk
+        )
+
+        # Init the dynamic chunking predictor for PP
+        self.enable_dynamic_chunking = (
+            self.server_args.enable_dynamic_chunking and self.pp_size > 1
+        )
+        if self.enable_dynamic_chunking:
+            try:
+                self.profile_and_init_predictor()
+            except Exception as e:
+                logger.warning(
+                    f"[PP Dynamic Chunk] Failed to profile prefill latency: {e}. "
+                    "Dynamic chunking will be disabled."
+                )
+                self.enable_dynamic_chunking = False
+
+    def init_schedule_policy(self):
+        # Init schedule policy and new token estimation
+        self.policy = SchedulePolicy(
+            self.schedule_policy,
+            self.tree_cache,
+            self.enable_hierarchical_cache,
+            self.enable_priority_scheduling,
+            self.schedule_low_priority_values_first,
+        )
+        self.prefill_delayer: Optional[PrefillDelayer] = None
+        self.max_prefill_bs: int = 0
+        if self.server_args.enable_prefill_delayer:
+            self.prefill_delayer = PrefillDelayer(
+                dp_size=self.dp_size,
+                attn_tp_size=self.attn_tp_size,
+                cpu_group=self.tp_cpu_group,
+                server_args=self.server_args,
+                metrics_collector=(
+                    self.metrics_collector if self.enable_metrics else None
+                ),
+                max_delay_passes=self.server_args.prefill_delayer_max_delay_passes,
+                token_usage_low_watermark=self.server_args.prefill_delayer_token_usage_low_watermark,
+                device=(
+                    self.tp_group.device
+                    if self.server_args.disable_overlap_schedule
+                    else "cpu"
+                ),
+            )
+
+        # NOTE: preemption is enabled by default for priority scheduling.
+        self.enable_priority_preemption = (
+            self.enable_priority_scheduling
+            and not self.server_args.disable_priority_preemption
+        )
+
+        self.init_new_token_ratio = min(
+            envs.SGLANG_INIT_NEW_TOKEN_RATIO.get()
+            * self.server_args.schedule_conservativeness,
+            1.0,
+        )
+        self.min_new_token_ratio = min(
+            self.init_new_token_ratio * envs.SGLANG_MIN_NEW_TOKEN_RATIO_FACTOR.get(),
+            1.0,
+        )
+        self.new_token_ratio_decay = (
+            self.init_new_token_ratio - self.min_new_token_ratio
+        ) / envs.SGLANG_NEW_TOKEN_RATIO_DECAY_STEPS.get()
+        self.new_token_ratio = self.init_new_token_ratio
+
+    def init_soft_watchdog(self, server_args: ServerArgs):
+        if (x := server_args.soft_watchdog_timeout) is not None:
+            self.soft_watchdog = create_scheduler_watchdog(
+                self, watchdog_timeout=x, soft=True
+            )
+
+    def init_watch_dog_memory_saver_input_blocker(self):
+        # Start watchdog thread
+        self.watchdog = create_scheduler_watchdog(
+            self, watchdog_timeout=self.server_args.watchdog_timeout
+        )
+
+        # Init memory saver, profiler and metric stats
+        self.memory_saver_adapter = TorchMemorySaverAdapter.create(
+            enable=self.server_args.enable_memory_saver
+        )
+        self.offload_tags = set()
+
+        # Init recv skipper and input blocker
+        self.recv_skipper = SchedulerRecvSkipper.maybe_create(self.server_args)
+        self.input_blocker = (
+            SchedulerInputBlocker(noop=self.attn_tp_rank != 0)
+            if get_bool_env_var("SGLANG_ENABLE_COLOCATED_BATCH_GEN")
+            else None
+        )
+
+        # Configure GC logger
+        if envs.SGLANG_LOG_GC.get():
+            configure_gc_logger()
+
+    def init_disaggregation(self):
+        self.disaggregation_mode = DisaggregationMode(
+            self.server_args.disaggregation_mode
+        )
+        self.transfer_backend = TransferBackend(
+            self.server_args.disaggregation_transfer_backend
+        )
+
+        if self.draft_worker is None or self.spec_algorithm.is_ngram():
+            draft_token_to_kv_pool = None
+        elif self.spec_algorithm.supports_spec_v2() and self.enable_overlap:
+            if self.server_args.enable_multi_layer_eagle:
+                draft_runner = self.draft_worker.draft_worker.draft_runner_list[0]
+            else:
+                draft_runner = self.draft_worker.draft_worker.draft_runner
+            draft_token_to_kv_pool = draft_runner.token_to_kv_pool
+            model_config = draft_runner.model_config
+        else:
+            # todo: should we fix this when enabling mtp or it doesn't matter since we only enable mtp in decode node thus we don't transfer draft kvs between P and D?
+            draft_token_to_kv_pool = self.draft_worker.model_runner.token_to_kv_pool
+            model_config = self.draft_worker.model_config
+
+        if (
+            self.disaggregation_mode == DisaggregationMode.DECODE
+        ):  # *2 for the headroom.
+            buffer_size = (self.req_to_token_pool.size) * 2
+            self.req_to_metadata_buffer_idx_allocator = ReqToMetadataIdxAllocator(
+                buffer_size
+            )
+            self.disagg_metadata_buffers = MetadataBuffers(
+                buffer_size,
+                hidden_size=(
+                    model_config.hidden_size
+                    if self.spec_algorithm.is_eagle()
+                    else 16  # minimal padding size for RDMA
+                ),
+                hidden_states_dtype=(
+                    model_config.dtype
+                    if self.spec_algorithm.is_eagle()
+                    else torch.float32
+                ),
+                custom_mem_pool=self.token_to_kv_pool_allocator.get_kvcache().maybe_get_custom_mem_pool(),
+            )
+
+            # The decode requests polling kv cache
+            self.disagg_decode_transfer_queue = DecodeTransferQueue(
+                gloo_group=self.attn_tp_cpu_group,
+                req_to_metadata_buffer_idx_allocator=self.req_to_metadata_buffer_idx_allocator,
+                tp_rank=self.tp_rank,
+                metadata_buffers=self.disagg_metadata_buffers,
+                scheduler=self,
+                tree_cache=self.tree_cache,
+            )
+
+            # The decode requests pending for pre-allocation
+            self.disagg_decode_prealloc_queue = DecodePreallocQueue(
+                req_to_token_pool=self.req_to_token_pool,
+                token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
+                draft_token_to_kv_pool=draft_token_to_kv_pool,
+                req_to_metadata_buffer_idx_allocator=self.req_to_metadata_buffer_idx_allocator,
+                metadata_buffers=self.disagg_metadata_buffers,
+                scheduler=self,
+                transfer_queue=self.disagg_decode_transfer_queue,
+                tree_cache=self.tree_cache,
+                gloo_group=self.attn_tp_cpu_group,
+                tp_rank=self.tp_rank,
+                tp_size=self.tp_size,
+                dp_size=self.server_args.dp_size,
+                gpu_id=self.gpu_id,
+                bootstrap_port=self.server_args.disaggregation_bootstrap_port,
+                max_total_num_tokens=self.max_total_num_tokens,
+                pp_rank=self.pp_rank,
+                num_reserved_decode_tokens=self.server_args.num_reserved_decode_tokens,
+                transfer_backend=self.transfer_backend,
+            )
+
+        elif self.disaggregation_mode == DisaggregationMode.PREFILL:
+            # *2 for the headroom.
+            buffer_size = self.max_running_requests * 2
+            self.req_to_metadata_buffer_idx_allocator = ReqToMetadataIdxAllocator(
+                buffer_size
+            )
+            self.disagg_metadata_buffers = MetadataBuffers(
+                buffer_size,
+                hidden_size=(
+                    model_config.hidden_size
+                    if self.spec_algorithm.is_eagle()
+                    or self.spec_algorithm.is_standalone()
+                    else 16  # minimal padding size for RDMA
+                ),
+                hidden_states_dtype=(
+                    model_config.dtype
+                    if self.spec_algorithm.is_eagle()
+                    or self.spec_algorithm.is_standalone()
+                    else torch.float32
+                ),
+                custom_mem_pool=self.token_to_kv_pool_allocator.get_kvcache().maybe_get_custom_mem_pool(),
+            )
+
+            self.disagg_prefill_bootstrap_queue = PrefillBootstrapQueue(
+                token_to_kv_pool=self.token_to_kv_pool_allocator.get_kvcache(),
+                draft_token_to_kv_pool=draft_token_to_kv_pool,
+                req_to_metadata_buffer_idx_allocator=self.req_to_metadata_buffer_idx_allocator,
+                metadata_buffers=self.disagg_metadata_buffers,
+                tp_rank=self.tp_rank,
+                tp_size=self.tp_size,
+                gpu_id=self.gpu_id,
+                bootstrap_port=self.server_args.disaggregation_bootstrap_port,
+                gloo_group=self.attn_tp_cpu_group,
+                max_total_num_tokens=self.max_total_num_tokens,
+                scheduler=self,
+                pp_rank=self.pp_rank,
+                pp_size=self.pp_size,
+                transfer_backend=self.transfer_backend,
+            )
+            # The prefill requests that are in the middle of kv sending
+            self.disagg_prefill_inflight_queue: List[Req] = []
+
+        # Init mm receiver for EPD disaggregation mode
+        if (
+            self.server_args.language_only
+            and self.server_args.encoder_transfer_backend == "zmq_to_scheduler"
+        ):
+            self.mm_receiver = create_mm_receiver(
+                self.server_args,
+                hf_config=self.model_config.hf_config,
+                pp_rank=self.pp_rank,
+                tp_rank=self.tp_rank,
+                tp_group=self.tp_group,
+                scheduler=self,
+            )
+
+    def init_overlap(self):
+        self.device_module = torch.get_device_module(self.device)
+
+        self.forward_stream_ctx: CudaStreamContext = self.device_module.stream(
+            self.forward_stream
+        )
+        self.copy_stream: CudaStream = self.device_module.Stream()
+        self.copy_stream_ctx: CudaStreamContext = self.device_module.stream(
+            self.copy_stream
+        )
+
+        if not self.enable_overlap:
+            self.future_map = None
+            return
+
+        self.future_map = FutureMap(
+            self.max_running_requests,
+            self.chunked_prefill_size,
+            self.model_config.context_len,
+            self.device,
+            self.spec_algorithm,
+        )
+        self.batch_record_buf = [None] * 2
+        self.batch_record_ct = 0
+
+    def init_deterministic_inference_config(self):
+        """Initialize deterministic inference configuration for different attention backends."""
+        if not self.server_args.enable_deterministic_inference:
+            self.truncation_align_size = None
+            return
+
+        backend_sizes = {
+            "flashinfer": ("SGLANG_FLASHINFER_PREFILL_SPLIT_TILE_SIZE", 4096),
+            "triton": ("SGLANG_TRITON_PREFILL_TRUNCATION_ALIGN_SIZE", 4096),
+        }
+        env_var, default_size = backend_sizes.get(
+            self.server_args.attention_backend, (None, None)
+        )
+        self.truncation_align_size = (
+            get_int_env_var(env_var, default_size) if env_var else None
+        )
+
+    def init_request_dispatcher(self):
+        self._request_dispatcher = TypeBasedDispatcher(
+            [
+                (TokenizedGenerateReqInput, self.handle_generate_request),
+                (TokenizedEmbeddingReqInput, self.handle_embedding_request),
+                (BatchTokenizedGenerateReqInput, self.handle_batch_generate_request),
+                (BatchTokenizedEmbeddingReqInput, self.handle_batch_embedding_request),
+                (FlushCacheReqInput, self.flush_cache_wrapped),
+                (ClearHiCacheReqInput, self.clear_hicache_storage_wrapped),
+                (AttachHiCacheStorageReqInput, self.attach_hicache_storage_wrapped),
+                (DetachHiCacheStorageReqInput, self.detach_hicache_storage_wrapped),
+                (PinPrefixReqInput, self.pin_prefix_wrapped),
+                (AbortReq, self.abort_request),
+                (OpenSessionReqInput, self.open_session),
+                (CloseSessionReqInput, self.close_session),
+                (UpdateWeightFromDiskReqInput, self.update_weights_from_disk),
+                (InitWeightsUpdateGroupReqInput, self.init_weights_update_group),
+                (DestroyWeightsUpdateGroupReqInput, self.destroy_weights_update_group),
+                (
+                    InitWeightsSendGroupForRemoteInstanceReqInput,
+                    self.init_weights_send_group_for_remote_instance,
+                ),
+                (
+                    SendWeightsToRemoteInstanceReqInput,
+                    self.send_weights_to_remote_instance,
+                ),
+                (
+                    UpdateWeightsFromDistributedReqInput,
+                    self.update_weights_from_distributed,
+                ),
+                (UpdateWeightsFromTensorReqInput, self.update_weights_from_tensor),
+                (UpdateWeightsFromIPCReqInput, self.update_weights_from_ipc),
+                (GetWeightsByNameReqInput, self.get_weights_by_name),
+                (ReleaseMemoryOccupationReqInput, self.release_memory_occupation),
+                (ResumeMemoryOccupationReqInput, self.resume_memory_occupation),
+                (CheckWeightsReqInput, self.check_weights),
+                (SlowDownReqInput, self.slow_down),
+                (ProfileReq, self.profile),
+                (FreezeGCReq, self.handle_freeze_gc),
+                (GetInternalStateReq, self.get_internal_state),
+                (SetInternalStateReq, self.set_internal_state),
+                (RpcReqInput, self.handle_rpc_request),
+                (ExpertDistributionReq, self.expert_distribution_handle),
+                (LoadLoRAAdapterReqInput, self.load_lora_adapter),
+                (
+                    LoadLoRAAdapterFromTensorsReqInput,
+                    self.load_lora_adapter_from_tensors,
+                ),
+                (UnloadLoRAAdapterReqInput, self.unload_lora_adapter),
+                (GetLoadReqInput, self.get_load),
+                (GetLoadsReqInput, self.get_loads),
+                (PauseGenerationReqInput, self.pause_generation),
+                (ContinueGenerationReqInput, self.continue_generation),
+                (DumperControlReqInput, self.handle_dumper_control),
+            ]
+        )
+
+    def _abort_on_running_timeout(self):
+        # NOTE: this should be called before a batch is launched,
+        # as current spec-v1 still filters batch inside verify stage.
+        timeout_s = envs.SGLANG_REQ_RUNNING_TIMEOUT.get()
+        if timeout_s <= 0:
+            return
+        if self.running_batch.is_empty():
+            return
+
+        deadline = time.perf_counter() - timeout_s
+        for req in self.running_batch.reqs:
+            if not req.finished() and 0 < req.time_stats.forward_entry_time < deadline:
+                req.to_finish = FINISH_ABORT(
+                    "Request running timeout reached.", HTTPStatus.SERVICE_UNAVAILABLE
+                )
+
+    def get_init_info(self) -> Dict[str, Any]:
+        """Return scheduler initialization info for handshake.
+
+        This method provides the initialization info needed by the tokenizer manager
+        and other components to verify the scheduler is ready.
+        """
+        result_dict = {
+            "status": "ready",
+            "max_total_num_tokens": self.max_total_num_tokens,
+            "max_req_input_len": self.max_req_input_len,
+        }
+
+        if self.server_args.remote_instance_weight_loader_use_transfer_engine():
+            (
+                remote_instance_transfer_engine_session_id,
+                remote_instance_transfer_engine_weights_info_dict,
+            ) = self.get_remote_instance_transfer_engine_info()
+            result_dict.update(
+                {
+                    "tp_rank": self.tp_rank,
+                    "remote_instance_transfer_engine_session_id": remote_instance_transfer_engine_session_id,
+                    "remote_instance_transfer_engine_weights_info_dict": remote_instance_transfer_engine_weights_info_dict,
+                }
+            )
+
+        return result_dict
+
+    def run_event_loop(self) -> None:
+        """Run the scheduler's event loop.
+
+        Sets up the schedule stream and dispatches to the appropriate event loop.
+        The event loop blocks until shutdown.
+        """
+        self.schedule_stream = self.device_module.Stream(priority=0)
+        if self.device == "cpu":
+            self.schedule_stream.synchronize = lambda: None  # No-op for CPU
+        with CudaStreamContext(self.schedule_stream):
+            dispatch_event_loop(self)
+
+    @DynamicGradMode()
+    def event_loop_normal(self):
+        """A normal scheduler loop."""
+        while True:
+            # Receive requests
+            recv_reqs = self.recv_requests()
+            self.process_input_requests(recv_reqs)
+            if self._engine_paused:
+                self.cancel_bubble_timer()
+                continue
+
+            # Get the next batch to run
+            batch = self.get_next_batch_to_run()
+            self.cur_batch = batch
+
+            # Launch the current batch
+            if batch:
+                result = self.run_batch(batch)
+                self.process_batch_result(batch, result)
+            else:
+                # When the server is idle, do self-check and re-init some states.
+                self.self_check_during_idle()
+
+            # Update last_batch
+            self.last_batch = batch
+            if envs.SGLANG_ENABLE_STRICT_MEM_CHECK_DURING_BUSY.get():
+                self.self_check_during_busy()
+
+    @DynamicGradMode()
+    def event_loop_overlap(self):
+        """A scheduler loop that overlaps the CPU processing and GPU computation."""
+        self.result_queue: Deque[
+            Tuple[ScheduleBatch, Union[GenerationBatchResult, EmbeddingBatchResult]]
+        ] = deque()
+
+        def pop_and_process():
+            # Process the results of the last batch
+            tmp_batch, tmp_result = self.result_queue.popleft()
+            self.process_batch_result(tmp_batch, tmp_result)
+
+        while True:
+            # Receive requests
+            recv_reqs = self.recv_requests()
+            self.process_input_requests(recv_reqs)
+            if self._engine_paused:
+                continue
+
+            # Get the next batch to run
+            batch = self.get_next_batch_to_run()
+            self.cur_batch = batch
+            disable_overlap_for_batch = self.is_disable_overlap_for_batch(batch)
+
+            # If we do not need to overlap the current batch with the last batch,
+            # we can process the last batch immediately.
+            if disable_overlap_for_batch:
+                pop_and_process()
+
+            # Launch the current batch
+            if batch:
+                batch_result = self.run_batch(batch)
+                self.result_queue.append((batch.copy(), batch_result))
+            else:
+                batch_result = None
+                self.cancel_bubble_timer()
+
+            # Process the last batch
+            if self.last_batch:
+                if not disable_overlap_for_batch:
+                    pop_and_process()
+            elif batch is None:
+                # When the server is idle, do self-check and re-init some states
+                self.self_check_during_idle()
+
+            # Run sample of the current batch
+            # It depends on the result of the last batch (e.g., grammar), so we run it after the last batch is processed.
+            if self.is_generation:
+                self.launch_batch_sample_if_needed(batch_result)
+
+            # Update last_batch
+            self.last_batch = batch
+            if envs.SGLANG_ENABLE_STRICT_MEM_CHECK_DURING_BUSY.get():
+                self.self_check_during_busy()
+
+    def is_disable_overlap_for_batch(self, batch: ScheduleBatch) -> bool:
+        # For two consecutive prefill batches, we disable overlap to improve the TTFT of the first batch.
+        # This might slightly hurt the throughput, so we use an environment variable to control it.
+        disable_overlap_for_batch = (
+            envs.SGLANG_DISABLE_CONSECUTIVE_PREFILL_OVERLAP.get()
+            and batch
+            and batch.forward_mode.is_extend()
+            and self.last_batch
+            and self.last_batch.forward_mode.is_extend()
+        )
+
+        # We do not support overlap + spec + grammar yet,
+        # so we need to turn off overlap for this batch.
+        # TODO(lsyin): support overlap + spec + grammar
+        need_grammar_sync = (
+            batch
+            and batch.is_spec_v2
+            and batch.has_grammar
+            and batch.forward_mode.is_decode()
+            and len(self.result_queue) > 0
+        )
+
+        return disable_overlap_for_batch or need_grammar_sync
+
+    def recv_limit_reached(self, num_recv_reqs: int) -> bool:
+        if self.max_recv_per_poll < 0:
+            return False
+        return num_recv_reqs >= self.max_recv_per_poll
+
+    def recv_requests(
+        self,
+    ) -> List[Union[TokenizedGenerateReqInput, TokenizedEmbeddingReqInput, Any]]:
+        """Receive results at tp_rank = 0 and broadcast it to all other TP ranks."""
+
+        if self.recv_skipper is not None:
+            last_forward_mode = (
+                self.last_batch.forward_mode if self.last_batch is not None else None
+            )
+            if not self.recv_skipper.handle(last_forward_mode):
+                return []
+
+        if self.pp_rank == 0:
+            if self.attn_tp_rank == 0 and self.attn_cp_rank == 0:
+                recv_reqs = []
+
+                while True:
+                    try:
+                        if self.recv_limit_reached(len(recv_reqs)):
+                            break
+                        recv_req = self.recv_from_tokenizer.recv_pyobj(zmq.NOBLOCK)
+                        recv_req = unwrap_shm_features(recv_req)
+                    except zmq.ZMQError:
+                        break
+                    recv_reqs.append(recv_req)
+
+                while True:
+                    try:
+                        if self.recv_limit_reached(len(recv_reqs)):
+                            break
+                        recv_rpc = self.recv_from_rpc.recv_pyobj(zmq.NOBLOCK)
+                    except zmq.ZMQError:
+                        break
+                    recv_reqs.append(recv_rpc)
+            else:
+                recv_reqs = None
+        else:
+            if self.attn_tp_rank == 0 and self.attn_cp_rank == 0:
+                dp_offset = self.attn_dp_rank * self.attn_tp_size
+                recv_reqs = point_to_point_pyobj(
+                    [],
+                    self.pp_rank * self.tp_size + dp_offset,
+                    self.world_group.cpu_group,
+                    (self.pp_rank - 1) * self.tp_size + dp_offset,
+                    self.pp_rank * self.tp_size + dp_offset,
+                )
+            else:
+                recv_reqs = None
+
+        if self.input_blocker is not None:
+            recv_reqs = self.input_blocker.handle(recv_reqs)
+
+        if self.server_args.enable_dp_attention:
+            if self.attn_tp_rank == 0 and self.attn_cp_rank == 0:
+                work_reqs, control_reqs = self._split_work_and_control_reqs(recv_reqs)
+            else:
+                work_reqs = None
+                control_reqs = None
+
+            if self.attn_tp_size != 1:
+                work_reqs = broadcast_pyobj(
+                    work_reqs,
+                    self.attn_tp_group.rank,
+                    self.attn_tp_cpu_group,
+                    src=self.attn_tp_group.ranks[0],
+                )
+
+            if self.attn_cp_size != 1:
+                work_reqs = broadcast_pyobj(
+                    work_reqs,
+                    self.attn_cp_group.rank,
+                    self.attn_cp_cpu_group,
+                    src=self.attn_cp_group.ranks[0],
+                )
+
+            if self.tp_size != 1:
+                control_reqs = broadcast_pyobj(
+                    control_reqs,
+                    self.tp_group.rank,
+                    self.tp_cpu_group,
+                    src=self.tp_group.ranks[0],
+                )
+            recv_reqs = work_reqs + control_reqs
+        elif self.tp_size != 1:
+            recv_reqs = broadcast_pyobj(
+                recv_reqs,
+                self.tp_group.rank,
+                self.tp_cpu_group,
+                src=self.tp_group.ranks[0],
+            )
+
+        # Process MM requests under EPD-disaggregation mode
+        if (
+            self.pp_rank == 0
+            and self.server_args.language_only
+            and self.server_args.encoder_transfer_backend == "zmq_to_scheduler"
+        ):
+            recv_reqs, abort_reqs = self.mm_receiver.process_waiting_requests(recv_reqs)
+            for req, error_msg, error_code in abort_reqs:
+
+                status_code = (
+                    HTTPStatus.BAD_REQUEST
+                    if error_code == 400
+                    else HTTPStatus.INTERNAL_SERVER_ERROR
+                )
+                prepare_abort(req, error_msg, status_code=status_code)
+                self.stream_output([req], req.return_logprob)
+
+        return recv_reqs
+
+    def _split_work_and_control_reqs(self, recv_reqs: List):
+        work_reqs = [
+            req
+            for req in recv_reqs
+            if isinstance(
+                req,
+                (
+                    TokenizedGenerateReqInput,
+                    TokenizedEmbeddingReqInput,
+                    BatchTokenizedGenerateReqInput,
+                    BatchTokenizedEmbeddingReqInput,
+                ),
+            )
+        ]
+        control_reqs = [
+            req
+            for req in recv_reqs
+            if not isinstance(
+                req,
+                (
+                    TokenizedGenerateReqInput,
+                    TokenizedEmbeddingReqInput,
+                    BatchTokenizedGenerateReqInput,
+                    BatchTokenizedEmbeddingReqInput,
+                ),
+            )
+        ]
+        return work_reqs, control_reqs
+
+    def process_input_requests(self, recv_reqs: List):
+        now = time.monotonic()
+        self.session_controller.maybe_reap(now)
+        for recv_req in recv_reqs:
+            # If it is a health check generation request and there are running requests, ignore it.
+            if is_health_check_generate_req(recv_req) and (
+                self.chunked_req is not None
+                or self.dllm_manager.any_staging_reqs()
+                or not self.running_batch.is_empty()
+                or len(self.offload_tags) > 0
+            ):
+                self.return_health_check_ct += 1
+                continue
+
+            output = self._request_dispatcher(recv_req)
+            if output is not None:
+                if not isinstance(output, RpcReqOutput):
+                    self.send_to_tokenizer.send_output(output, recv_req)
+                else:
+                    if self.recv_from_rpc is not None:
+                        self.recv_from_rpc.send_pyobj(output)
+
+    def init_req_max_new_tokens(self, req):
+        req.sampling_params.max_new_tokens = min(
+            (
+                req.sampling_params.max_new_tokens
+                if req.sampling_params.max_new_tokens is not None
+                else 1 << 30
+            ),
+            self.max_req_len - len(req.origin_input_ids) - 1,
+        )
+
+    def _process_and_broadcast_mm_inputs(
+        self,
+        raw_mm_inputs: Optional[dict],
+    ):
+        """Materialize MultimodalInputs once on the entry rank and broadcast to others.
+
+        Entry rank:
+        - constructs MultimodalInputs.from_dict(raw_mm_inputs) once
+        - broadcasts to other ranks in self.cpu_group (if world_size > 1)
+
+        Non-entry ranks:
+        - receive the object via broadcast (if world_size > 1)
+        - otherwise (single-rank / no group) fall back to local from_dict
+
+        Returns:
+            MultimodalInputs | None
+        """
+        if raw_mm_inputs is None:
+            return None
+
+        group_world_size = 1
+        try:
+            if (
+                torch.distributed.is_available()
+                and torch.distributed.is_initialized()
+                and self.dp_tp_cpu_group is not None
+            ):
+                group_world_size = torch.distributed.get_world_size(
+                    group=self.dp_tp_cpu_group
+                )
+        except Exception as e:
+            logger.warning(
+                f"Failed to get world size in mm_inputs handling with {e}, fallback to 1."
+            )
+
+        # In case tp size > 1, all the Scheduler TP ranks runs the duplicated computing
+        # process in CPU which occupies the main thread CPU cycle. This computing logic
+        # merely needs to be run on TP0 and be broadcast to other TP ranks.
+        # Since the Scheduler is single-threaded, any large CPU cost will impact
+        # handling of other messages. For example, CPU hits 99.9% can significantly
+        # increase the CUDA kernel launch time.
+        if self.dp_tp_group.rank_in_group == 0:
+            # Only the entry rank materializes once from dict.
+            image_inputs = MultimodalInputs.from_dict(raw_mm_inputs)
+            # Broadcast to other TP ranks (use src=0 within the group).
+            if group_world_size > 1:
+                obj_list = [image_inputs]
+                torch.distributed.broadcast_object_list(
+                    obj_list,
+                    src=self.dp_tp_group.first_rank,
+                    group=self.dp_tp_cpu_group,
+                )
+                image_inputs = obj_list[0]
+        else:
+            # Non-entry ranks: receive if group size > 1; otherwise materialize locally.
+            if group_world_size > 1:
+                obj_list = [None]
+                torch.distributed.broadcast_object_list(
+                    obj_list,
+                    src=self.dp_tp_group.first_rank,
+                    group=self.dp_tp_cpu_group,
+                )
+                image_inputs = obj_list[0]
+            else:
+                image_inputs = MultimodalInputs.from_dict(raw_mm_inputs)
+
+        return image_inputs
+
+    def _get_multimodal_inputs(self, mm_inputs_dict: dict):
+        if self.server_args.enable_broadcast_mm_inputs_process:
+            return self._process_and_broadcast_mm_inputs(mm_inputs_dict)
+        else:
+            return MultimodalInputs.from_dict(mm_inputs_dict)
+
+    def _maybe_clear_mm_inputs(self, batch: ScheduleBatch) -> None:
+        for req in batch.reqs:
+            if not req.finished() or not (mm_inputs := req.multimodal_inputs):
+                continue
+            # For session requests, keep mm_inputs for the next request
+            if req.session:
+                continue
+            # For non-session requests, clear features and mm_inputs
+            for item in mm_inputs.mm_items:
+                item.feature = None
+            req.multimodal_inputs = None
+
+    def handle_generate_request(
+        self,
+        recv_req: TokenizedGenerateReqInput,
+    ):
+        # Route: normal request / session request / session-not-found
+        session_id = (
+            recv_req.session_params.id if recv_req.session_params is not None else None
+        )
+
+        if session_id is None:
+            # Normal non-session request
+            if recv_req.input_embeds is not None:
+                # Generate fake input_ids based on the length of input_embeds
+                seq_length = len(recv_req.input_embeds)
+                fake_input_ids = [1] * seq_length
+                recv_req.input_ids = fake_input_ids
+
+            if recv_req.bootstrap_port is None:
+                # Use default bootstrap port
+                recv_req.bootstrap_port = self.server_args.disaggregation_bootstrap_port
+
+            req = Req(
+                recv_req.rid,
+                recv_req.input_text,
+                recv_req.input_ids,
+                recv_req.sampling_params,
+                return_logprob=recv_req.return_logprob,
+                top_logprobs_num=recv_req.top_logprobs_num,
+                token_ids_logprob=recv_req.token_ids_logprob,
+                stream=recv_req.stream,
+                lora_id=recv_req.lora_id,
+                input_embeds=recv_req.input_embeds,
+                custom_logit_processor=recv_req.custom_logit_processor,
+                require_reasoning=recv_req.require_reasoning,
+                return_hidden_states=recv_req.return_hidden_states,
+                return_routed_experts=recv_req.return_routed_experts,
+                eos_token_ids=self.model_config.hf_eos_token_id,
+                bootstrap_host=recv_req.bootstrap_host,
+                bootstrap_port=recv_req.bootstrap_port,
+                bootstrap_room=recv_req.bootstrap_room,
+                disagg_mode=self.disaggregation_mode,
+                routed_dp_rank=recv_req.routed_dp_rank,
+                disagg_prefill_dp_rank=recv_req.disagg_prefill_dp_rank,
+                vocab_size=self.model_config.vocab_size,
+                priority=recv_req.priority,
+                metrics_collector=(
+                    self.metrics_collector if self.enable_metrics else None
+                ),
+                routing_key=recv_req.routing_key,
+                http_worker_ipc=recv_req.http_worker_ipc,
+                dllm_config=self.dllm_config,
+                time_stats=recv_req.time_stats,
+            )
+            req.tokenizer = self.tokenizer
+
+            if self.disaggregation_mode != DisaggregationMode.NULL:
+                # Invalid request for disaggregated mode
+                if (
+                    recv_req.bootstrap_room is None
+                    and self.transfer_backend != TransferBackend.FAKE
+                ):
+                    error_msg = (
+                        f"Invalid request: Disaggregated request received without "
+                        f"bootstrap room id. {req.rid=}"
+                    )
+                    logger.error(error_msg)
+                    recv_req.time_stats.trace_ctx.abort(
+                        abort_info={"reason": error_msg}
+                    )
+                    prepare_abort(req, error_msg, status_code=HTTPStatus.BAD_REQUEST)
+                    self.stream_output([req], req.return_logprob)
+                    return
+
+        elif session_id in self.session_controller:
+            # Session exists: create request from session
+            session = self.session_controller.get(session_id)
+            req = session.create_req(
+                recv_req,
+                self.tokenizer,
+                self.model_config.vocab_size,
+                eos_token_ids=self.model_config.hf_eos_token_id,
+            )
+            # TODO: set trace context
+            if self.enable_metrics:
+                req.time_stats.set_metrics_collector(self.metrics_collector)
+            if isinstance(req.finished_reason, FINISH_ABORT):
+                self.init_req_max_new_tokens(req)
+                self._add_request_to_queue(req)
+                return
+
+        else:
+            # Session ID provided but session not found
+            req = Req(
+                recv_req.rid,
+                recv_req.input_text,
+                recv_req.input_ids,
+                recv_req.sampling_params,
+                vocab_size=self.model_config.vocab_size,
+            )
+            req.tokenizer = self.tokenizer
+            req.set_finish_with_abort(
+                f"Invalid request: session id {session_id} does not exist"
+            )
+            self.init_req_max_new_tokens(req)
+            self._add_request_to_queue(req)
+            return
+
+        # Handle multimodal inputs
+        if recv_req.mm_inputs is not None:
+            image_inputs = self._get_multimodal_inputs(recv_req.mm_inputs)
+
+            SessionController.adjust_mm_offsets(recv_req, req, image_inputs)
+
+            # The following steps are already fast, execute locally on each rank.
+            # Expand a single image token into multiple dummy tokens for receiving image embeddings
+            req.origin_input_ids = self.pad_input_ids_func(
+                req.origin_input_ids, image_inputs
+            )
+            req.extend_image_inputs(image_inputs)
+
+            if len(req.origin_input_ids) >= self.max_req_input_len:
+                req.set_finish_with_abort(
+                    error_msg=(
+                        "Multimodal prompt is too long after expanding multimodal tokens. "
+                        f"After expanding {len(req.origin_input_ids_unpadded)=} => {len(req.origin_input_ids)} >= {self.max_req_input_len}."
+                    )
+                )
+                self.init_req_max_new_tokens(req)
+                self._add_request_to_queue(req)
+                return
+
+        # initialize before returning
+        self.init_req_max_new_tokens(req)
+
+        # Validate prompt length
+        error_msg = validate_input_length(
+            req,
+            self.max_req_input_len,
+            self.server_args.allow_auto_truncate,
+        )
+        if error_msg:
+            req.set_finish_with_abort(error_msg)
+            self._add_request_to_queue(req)
+            return
+
+        if not recv_req.return_logprob and recv_req.logprob_start_len != -1:
+            # When return_logprob is False, logprob_start_len should be ignored
+            recv_req.logprob_start_len = -1
+
+        if recv_req.logprob_start_len == -1:
+            if recv_req.return_logprob and recv_req.token_ids_logprob is None:
+                # If logprob is required but neither token_ids_logprob nor logprob_start_len is
+                # set, return the logprobs for output tokens by default
+                req.logprob_start_len = len(req.origin_input_ids) - 1
+            elif req.is_prefill_only:
+                # For prefill-only requests with logprob_start_len == -1, set logprob_start_len
+                # beyond input sequence to skip input logprob computation entirely
+                req.logprob_start_len = len(req.origin_input_ids)
+            else:
+                # If return_logprob is False, only the last token requires logprob computation
+                req.logprob_start_len = -1
+        else:
+            req.logprob_start_len = recv_req.logprob_start_len
+
+        if req.logprob_start_len > len(req.origin_input_ids):
+            error_msg = f"{req.logprob_start_len=} is higher than the number of input tokens {len(req.origin_input_ids)=}. Please use a smaller logprob_start_len."
+            req.logprob_start_len = -1
+            req.set_finish_with_abort(error_msg)
+            self._add_request_to_queue(req)
+            return
+
+        added_to_grammar_queue = self.grammar_manager.process_req_with_grammar(req)
+        if not added_to_grammar_queue:
+            self._add_request_to_queue(req)
+
+    def handle_batch_generate_request(
+        self,
+        recv_req: BatchTokenizedGenerateReqInput,
+    ):
+        """Handle optimized batch generate request."""
+        logger.debug(f"Processing batch generate request with {len(recv_req)} requests")
+
+        # Process each request in the batch
+        for tokenized_req in recv_req:
+            self.handle_generate_request(tokenized_req)
+
+    def _prefetch_kvcache(self, req: Req):
+        if self.enable_hicache_storage:
+            req.init_next_round_input(self.tree_cache)
+            if req.last_node.backuped:
+                # only to initiate the prefetch if the last node is backuped
+                # otherwise, the allocated GPU memory must be locked for integrity
+                last_hash = req.last_host_node.get_last_hash_value()
+                matched_len = len(req.prefix_indices) + req.host_hit_length
+                new_input_tokens = req.fill_ids[matched_len:]
+
+                prefix_keys = (
+                    req.last_node.get_prefix_hash_values(req.last_node.parent)
+                    if self.tree_cache.hicache_storage_pass_prefix_keys
+                    else None
+                )
+                self.tree_cache.prefetch_from_storage(
+                    req.rid,
+                    req.last_host_node,
+                    new_input_tokens,
+                    last_hash,
+                    prefix_keys,
+                )
+
+    def _add_request_to_queue(self, req: Req, is_retracted: bool = False):
+        if self.disaggregation_mode == DisaggregationMode.NULL:
+            if not self._set_or_validate_priority(req):
+                return
+            if self._abort_on_queued_limit(req):
+                return
+            self._prefetch_kvcache(req)
+            self.waiting_queue.append(req)
+            req.time_stats.set_wait_queue_entry_time()
+        elif self.disaggregation_mode == DisaggregationMode.PREFILL:
+            self._prefetch_kvcache(req)
+            self.disagg_prefill_bootstrap_queue.add(
+                req, self.model_config.num_key_value_heads
+            )
+            req.time_stats.set_prefill_bootstrap_queue_entry_time()
+        elif self.disaggregation_mode == DisaggregationMode.DECODE:
+            self.disagg_decode_prealloc_queue.add(req, is_retracted=is_retracted)
+            if not is_retracted:
+                req.time_stats.set_decode_prealloc_queue_entry_time()
+            else:
+                req.time_stats.set_retract_time()
+        else:
+            raise ValueError(f"Invalid {self.disaggregation_mode=}")
+
+    def _set_or_validate_priority(self, req: Req) -> bool:
+        """Set the default priority value, or abort the request based on the priority scheduling mode."""
+        if self.enable_priority_scheduling and req.priority is None:
+            if self.schedule_low_priority_values_first:
+                req.priority = sys.maxsize
+            else:
+                req.priority = -sys.maxsize - 1
+        elif (
+            not self.enable_priority_scheduling
+            and req.priority is not None
+            and self.abort_on_priority_when_disabled
+        ):
+            abort_req = AbortReq(
+                finished_reason={
+                    "type": "abort",
+                    "status_code": HTTPStatus.SERVICE_UNAVAILABLE,
+                    "message": "Using priority is disabled for this server. Please send a new request without a priority.",
+                },
+                rid=req.rid,
+            )
+            req.time_stats.trace_ctx.abort(abort_info=abort_req.finished_reason)
+            self.send_to_tokenizer.send_output(abort_req, req)
+            return False
+        return True
+
+    def _abort_on_queued_limit(self, recv_req: Req) -> bool:
+        """Abort an incoming or existing request if the waiting queue is full. Returns True if the incoming request is aborted."""
+        if (
+            self.max_queued_requests is None
+            or len(self.waiting_queue) + 1 <= self.max_queued_requests
+        ):
+            return False
+
+        # Reject the incoming request by default.
+        req_to_abort = recv_req
+        message = "The request queue is full."
+        if self.enable_priority_scheduling:
+            # With priority scheduling, consider aboritng an existing request based on the priority.
+            # direction = 1  => smaller number = higher priority; -1 => larger number = higher priority.
+            # max(...) + (direction * priority, queue_time_start) picks the least-preferred request.
+            # Tie: later queue_time_start (newer) is evicted first. Preempt only if strictly better.
+            direction = 1 if self.schedule_low_priority_values_first else -1
+            key_fn = lambda item: (
+                direction * item[1].priority,
+                item[1].time_stats.wait_queue_entry_time,
+            )
+            idx, candidate_req = max(enumerate(self.waiting_queue), key=key_fn)
+            abort_existing_req = (
+                direction * recv_req.priority < direction * candidate_req.priority
+            )
+            if abort_existing_req:
+                if self.enable_hicache_storage:
+                    # Release prefetch events associated with the request
+                    self.tree_cache.release_aborted_request(candidate_req.rid)
+                elif self.enable_hierarchical_cache:
+                    self.tree_cache.terminate_prefetch(candidate_req.rid)
+                self.waiting_queue.pop(idx)
+                req_to_abort = candidate_req
+                message = "The request is aborted by a higher priority request."
+
+        self.send_to_tokenizer.send_output(
+            AbortReq(
+                finished_reason={
+                    "type": "abort",
+                    "status_code": HTTPStatus.SERVICE_UNAVAILABLE,
+                    "message": message,
+                },
+                rid=req_to_abort.rid,
+            ),
+            req_to_abort,
+        )
+        req_to_abort.time_stats.trace_ctx.abort(abort_info={"reason": message})
+        return req_to_abort.rid == recv_req.rid
+
+    def _abort_on_waiting_timeout(self):
+        if (timeout_s := envs.SGLANG_REQ_WAITING_TIMEOUT.get()) <= 0:
+            return
+
+        deleted_reqs = set()
+        deadline = time.perf_counter() - timeout_s
+        for req in self.waiting_queue:
+            entry_time = req.time_stats.wait_queue_entry_time
+            if 0 < entry_time < deadline:
+                if self.enable_hicache_storage:
+                    # Release prefetch events associated with the request
+                    self.tree_cache.release_aborted_request(req.rid)
+                self.send_to_tokenizer.send_output(
+                    AbortReq(
+                        finished_reason={
+                            "type": "abort",
+                            "status_code": HTTPStatus.SERVICE_UNAVAILABLE,
+                            "message": "Request waiting timeout reached.",
+                        },
+                        rid=req.rid,
+                    ),
+                    req,
+                )
+                deleted_reqs.add(req)
+
+        if deleted_reqs:
+            self.waiting_queue = [
+                req for req in self.waiting_queue if req not in deleted_reqs
+            ]
+
+    def handle_embedding_request(
+        self,
+        recv_req: TokenizedEmbeddingReqInput,
+    ):
+        req = Req(
+            recv_req.rid,
+            recv_req.input_text,
+            recv_req.input_ids,
+            recv_req.sampling_params,
+            token_type_ids=recv_req.token_type_ids,
+            routed_dp_rank=recv_req.routed_dp_rank,
+            priority=recv_req.priority,
+            dimensions=recv_req.dimensions,
+            lora_id=recv_req.lora_id,
+            http_worker_ipc=recv_req.http_worker_ipc,
+            time_stats=recv_req.time_stats,
+        )
+        req.tokenizer = self.tokenizer
+
+        # Handle multimodal inputs
+        if recv_req.image_inputs is not None:
+            image_inputs = self._get_multimodal_inputs(recv_req.image_inputs)
+            # Expand a single image token into multiple dummy tokens for receiving image embeddings
+            # The `pad_input_ids_func` is model-specific and may be None for
+            # embedding models or models not requiring special padding.
+            # If None, `req.origin_input_ids` is expected to be correctly populated already.
+            if self.pad_input_ids_func:
+                req.origin_input_ids = self.pad_input_ids_func(
+                    req.origin_input_ids, image_inputs
+                )
+
+            req.extend_image_inputs(image_inputs)
+
+            if len(req.origin_input_ids) >= self.max_req_input_len:
+                req.set_finish_with_abort(
+                    error_msg=(
+                        "Multimodal prompt is too long after expanding multimodal tokens. "
+                        f"After expanding {len(req.origin_input_ids_unpadded)=} => {len(req.origin_input_ids)} >= {self.max_req_input_len}."
+                    )
+                )
+                self._add_request_to_queue(req)
+                return
+
+        # Validate prompts length
+        error_msg = validate_input_length(
+            req,
+            self.max_req_input_len,
+            self.server_args.allow_auto_truncate,
+        )
+        if error_msg:
+            self._add_request_to_queue(req)
+            return
+
+        # Copy more attributes
+        req.logprob_start_len = -1
+        self._add_request_to_queue(req)
+
+    def handle_batch_embedding_request(
+        self,
+        recv_req: BatchTokenizedEmbeddingReqInput,
+    ):
+        """Handle optimized batch embedding request."""
+        logger.debug(
+            f"Processing batch embedding request with {len(recv_req)} requests"
+        )
+
+        # Process each request in the batch
+        for tokenized_req in recv_req:
+            self.handle_embedding_request(tokenized_req)
+
+    def stash_chunked_request(self, req: Req):
+        self.tree_cache.cache_unfinished_req(req, chunked=True)
+
+    def get_next_batch_to_run(self) -> Optional[ScheduleBatch]:
+        self._abort_on_waiting_timeout()
+        self._abort_on_running_timeout()
+        if self.dllm_config is not None:
+            self.dllm_manager.filter_finished_reqs()
+
+        # Merge the prefill batch into the running batch
+        chunked_req_to_exclude = set()
+
+        if self.dllm_config is not None and self.dllm_manager.any_staging_reqs():
+            chunked_req_to_exclude.update(self.dllm_manager.staging_queue)
+            for req in self.dllm_manager.staging_queue:
+                self.stash_chunked_request(req)
+
+        if self.chunked_req is not None:
+            # Move the chunked request out of the batch so that we can merge
+            # only finished requests to running_batch.
+            chunked_req_to_exclude.add(self.chunked_req)
+            self.stash_chunked_request(self.chunked_req)
+
+        if self.last_batch and self.last_batch.forward_mode.is_extend():
+            if self.last_batch.chunked_req is not None:
+                # In the context pipeline parallelism, after the last chunk, the current microbatch still track outdated chunked_req.
+                # We need to discard it.
+                chunked_req_to_exclude.add(self.last_batch.chunked_req)
+
+            if self.dllm_config is not None and self.last_batch.reqs:
+                chunked_req_to_exclude.update(self.last_batch.reqs)
+
+            # Filter batch
+            last_bs = self.last_batch.batch_size()
+            self.last_batch.filter_batch(
+                chunked_req_to_exclude=list(chunked_req_to_exclude)
+            )
+            if self.last_batch.batch_size() < last_bs:
+                self.running_batch.batch_is_full = False
+
+            # Merge the new batch into the running batch.
+            # For prefill-only batch, we can avoid going through decoding step.
+            if not self.last_batch.is_empty() and not self.last_batch.is_prefill_only:
+                if self.running_batch.is_empty():
+                    self.running_batch = self.last_batch
+                else:
+                    # Merge running_batch with prefill batch
+                    self.running_batch.merge_batch(self.last_batch)
+
+        if self.dllm_config is not None:
+            new_batch = self.get_new_batch_dllm()
+        else:
+            new_batch = self.get_new_batch_prefill()
+
+        need_mlp_sync = self.require_mlp_sync
+        if need_mlp_sync and not self.spec_algorithm.is_none():
+            # NOTE: This branch makes sure prefill and decode batches will not be mixed when spec and dp-attn is enabled.
+            # Before merging the new batch into running batch:
+            # 1. All new batches are none -> need_mlp_sync remains true (sync is needed for decode batch).
+            # 2. All new batches are some (prefill / idle) -> we do not need prepare mlp sync one more time.
+            new_batch = self.maybe_prepare_mlp_sync_batch(new_batch)
+            need_mlp_sync = new_batch is None
+
+        if new_batch is not None:
+            # Run prefill first if possible
+            ret = new_batch
+        else:
+            # Run decode
+            if not self.running_batch.is_empty():
+                self.running_batch = self.update_running_batch(self.running_batch)
+                ret = self.running_batch if not self.running_batch.is_empty() else None
+            else:
+                ret = None
+
+        # Handle DP attention and log stats
+        ret = self.maybe_prepare_mlp_sync_batch(ret, need_sync=need_mlp_sync)
+
+        if ret:
+            set_schedule_time_batch(ret)
+
+        return ret
+
+    def get_num_allocatable_reqs(self, running_bs):
+        res = get_global_server_args().pp_max_micro_batch_size - running_bs
+        if self.pp_size > 1:
+            res = min(res, self.req_to_token_pool.available_size())
+        return res
+
+    def get_new_batch_prefill(self) -> Optional[ScheduleBatch]:
+        prefill_delayer_single_pass = None
+        if self.prefill_delayer:
+            _, token_usage, _, _ = self._get_token_info()
+            prefill_delayer_single_pass = PrefillDelayerSinglePassExecutor(
+                self.prefill_delayer, token_usage=token_usage
+            )
+
+        ret = self._get_new_batch_prefill_raw(
+            prefill_delayer_single_pass=prefill_delayer_single_pass
+        )
+
+        if self.prefill_delayer:
+            prefill_delayer_single_pass.finalize(actual_prefill=ret is not None)
+
+        return ret
+
+    def _get_new_batch_prefill_raw(
+        self, prefill_delayer_single_pass: Optional[PrefillDelayerSinglePassExecutor]
+    ) -> Optional[ScheduleBatch]:
+        # Check if the grammar is ready in the grammar queue
+        if self.grammar_manager.has_waiting_grammars():
+            ready_grammar_requests = self.grammar_manager.get_ready_grammar_requests()
+            for req in ready_grammar_requests:
+                self._add_request_to_queue(req)
+
+        if self.enable_priority_preemption:
+            # Reset batch_is_full to try preemption with a prefill adder.
+            self.running_batch.batch_is_full = False
+
+        if (
+            self.running_batch.batch_is_full or len(self.waiting_queue) == 0
+        ) and self.chunked_req is None:
+            return None
+
+        running_bs = len(self.running_batch.reqs)
+
+        # Ignore the check if self.chunked_req is not None.
+        # In the non-PP case, when self.chunked_req is not None, num_allocatable_reqs should always be greater than 0,
+        # as the space for the chunked requests has just been released.
+        # In PP case, chunked requests (or dllm requests) can start in one microbatch and end in another microbatch, so the max_running_requests per microbatch should not be strict.
+        # Instead, we should always allow chunked requests to be added, otherwise, there will be a memory leak.
+        if (
+            self.get_num_allocatable_reqs(running_bs) <= 0
+            and self.chunked_req is not None
+            and not self.enable_priority_preemption
+        ):
+            self.running_batch.batch_is_full = True
+            return None
+
+        if self.enable_hierarchical_cache:
+            self.tree_cache.check_hicache_events()
+
+        # Get priority queue
+        self.policy.calc_priority(self.waiting_queue, self.running_batch)
+
+        if TEST_RETRACT and running_bs > TEST_RETRACT_NO_PREFILL_BS:
+            # If we are testing retraction and the running batch size exceeds
+            # TEST_RETRACT_NO_PREFILL_BS, we skip the prefill to keep the requests
+            # in the waiting queue.
+            return None
+
+        # Determine chunked_prefill_size for this batch
+        chunked_prefill_size = self.chunked_prefill_size
+        if self.chunked_req is not None and self.enable_dynamic_chunking:
+            history_len = len(self.chunked_req.prefix_indices)
+            dynamic_size = self.predict_next_chunk_size(history_len)
+            if dynamic_size is not None:
+                chunked_prefill_size = dynamic_size
+
+        # Prefill policy
+        adder = PrefillAdder(
+            self.page_size,
+            self.tree_cache,
+            self.token_to_kv_pool_allocator,
+            self.running_batch,
+            self.new_token_ratio,
+            self.max_prefill_tokens,
+            chunked_prefill_size,
+            running_bs if self.is_mixed_chunk else 0,
+            self.priority_scheduling_preemption_threshold,
+            max_prefill_bs=self.max_prefill_bs,
+            max_running_requests=self.max_running_requests,
+            prefill_max_requests=self.server_args.prefill_max_requests,
+            prefill_delayer_single_pass=prefill_delayer_single_pass,
+            dllm_config=self.dllm_config,
+        )
+
+        if self.chunked_req is not None:
+            self.chunked_req.init_next_round_input()
+            self.chunked_req = adder.add_chunked_req(self.chunked_req)
+
+        if self.enable_lora:
+            running_loras = {req.lora_id for req in self.running_batch.reqs}
+
+        # Get requests from the waiting queue to a new prefill batch
+        for req in self.waiting_queue:
+            if self.enable_lora and req.lora_id not in running_loras:
+                if self.enable_lora_overlap_loading:
+                    # For overlapping loading of LoRA weights with computation, we will load each adapter one at a time,
+                    # as opposed to loading them in one batch
+                    res = self.lora_overlap_loader.try_overlap_load_lora(
+                        req.lora_id, running_loras
+                    )
+                    if not res:
+                        continue
+                else:
+                    new_lora_set = {req.lora_id} | running_loras
+                    if not self.tp_worker.model_runner.lora_manager.validate_lora_batch(
+                        new_lora_set
+                    ):
+                        continue
+
+            running_bs = len(self.running_batch.reqs)
+            if len(adder.can_run_list) >= self.get_num_allocatable_reqs(running_bs):
+                self.running_batch.batch_is_full = True
+            if self.disaggregation_mode == DisaggregationMode.PREFILL:
+                # In prefill mode, prealloc queue and transfer queue can also take memory,
+                # so we need to check if the available size for the actual available size.
+                if len(adder.can_run_list) >= self.req_to_token_pool.available_size():
+                    self.running_batch.batch_is_full = True
+
+            if self.running_batch.batch_is_full:
+                if (
+                    not self.enable_priority_preemption
+                    or not adder.preempt_to_schedule(req, self.server_args)
+                ):
+                    break
+
+            if self.enable_hicache_storage:
+                prefetch_done = self.tree_cache.check_prefetch_progress(req.rid)
+                if not prefetch_done:
+                    # skip staging requests that are ongoing prefetch
+                    continue
+                # Pop the number of tokens loaded from storage (L3 hits)
+                req.storage_hit_length = self.tree_cache.pop_prefetch_loaded_tokens(
+                    req.rid
+                )
+
+            req.init_next_round_input(self.tree_cache)
+            res = adder.add_one_req(
+                req,
+                has_chunked_req=(self.chunked_req is not None),
+                truncation_align_size=self.truncation_align_size,
+            )
+
+            if self.enable_lora:
+                running_loras.add(req.lora_id)
+
+            if res != AddReqResult.CONTINUE:
+                if res == AddReqResult.NO_TOKEN:
+                    if self.enable_hierarchical_cache:
+                        # Set batch_is_full after making sure there are requests that can be served
+                        self.running_batch.batch_is_full = len(
+                            adder.can_run_list
+                        ) > 0 or (not self.running_batch.is_empty())
+                    else:
+                        self.running_batch.batch_is_full = True
+                break
+
+        # Update waiting queue
+        can_run_list: List[Req] = adder.can_run_list
+        if len(can_run_list) == 0:
+            return None
+
+        self.waiting_queue = [
+            x for x in self.waiting_queue if x not in set(can_run_list)
+        ]
+        if adder.preempt_list:
+            for req in adder.preempt_list:
+                self._add_request_to_queue(req)
+
+        if adder.new_chunked_req is not None:
+            # Update chunked prefill
+            assert self.chunked_req is None
+            self.chunked_req = adder.new_chunked_req
+
+        if self.chunked_req is not None:
+            self.chunked_req.is_chunked += 1
+
+        # Record for logging prefill stats after forward
+        self.adder = adder
+        self.can_run_list = can_run_list
+        self.running_bs = len(self.running_batch.reqs)
+
+        set_time_batch(can_run_list, "set_forward_entry_time")
+
+        # Create a new batch
+        new_batch = ScheduleBatch.init_new(
+            can_run_list,
+            self.req_to_token_pool,
+            self.token_to_kv_pool_allocator,
+            self.tree_cache,
+            self.model_config,
+            self.enable_overlap,
+            self.spec_algorithm,
+            chunked_req=self.chunked_req,
+        )
+        self.max_prefill_bs = max(self.max_prefill_bs, len(can_run_list))
+        if self.enable_hierarchical_cache:
+            # todo (zhiqiang): disable cuda graph execution if hicache loading triggered
+            new_batch.hicache_consumer_index = (
+                self.tree_cache.ready_to_load_host_cache()
+            )
+
+        new_batch.prepare_for_extend()
+
+        # Record prefill stats for logging after forward
+        new_batch.prefill_stats = PrefillStats.from_adder(
+            adder, self.running_batch.reqs, self.enable_priority_scheduling
+        )
+
+        # Mixed-style chunked prefill
+        if (
+            self.is_mixed_chunk
+            and not self.running_batch.is_empty()
+            and not (new_batch.return_logprob or self.running_batch.return_logprob)
+        ):
+            # TODO (lianmin): support return_logprob + mixed chunked prefill
+            self.running_batch.filter_batch(v1_spec_info_filtered=True)
+            if not self.running_batch.is_empty():
+                self.running_batch.prepare_for_decode()
+                new_batch.mix_with_running(self.running_batch)
+                new_batch.decoding_reqs = self.running_batch.reqs
+            self.running_batch = ScheduleBatch(
+                reqs=[], batch_is_full=self.running_batch.batch_is_full
+            )
+        else:
+            new_batch.decoding_reqs = None
+
+        return new_batch
+
+    def update_running_batch(self, batch: ScheduleBatch) -> Optional[ScheduleBatch]:
+        """Update the current running decoding batch."""
+        initial_bs = batch.batch_size()
+
+        batch.filter_batch(v1_spec_info_filtered=True)
+        if batch.is_empty():
+            batch.batch_is_full = False
+            return batch
+
+        # Check if decode out of memory
+        if (kv_full_retract_flag := not batch.check_decode_mem()) or (
+            TEST_RETRACT and self.forward_ct % TEST_RETRACT_INTERVAL == 0
+        ):
+            old_available_tokens = self.token_to_kv_pool_allocator.available_size()
+            old_ratio = self.new_token_ratio
+            retracted_reqs, new_token_ratio, reqs_to_abort = batch.retract_decode(
+                self.server_args
+            )
+            new_available_tokens = self.token_to_kv_pool_allocator.available_size()
+            new_token_gained = new_available_tokens - old_available_tokens
+
+            self.num_retracted_reqs = len(retracted_reqs)
+            if self.enable_metrics and len(retracted_reqs) > 0:
+                self.metrics_collector.increment_retracted_reqs(
+                    num_retracted_reqs=len(retracted_reqs),
+                    num_retracted_input_tokens=sum(
+                        len(r.origin_input_ids) for r in retracted_reqs
+                    ),
+                    num_retracted_output_tokens=sum(
+                        len(r.output_ids) for r in retracted_reqs
+                    ),
+                )
+            self.new_token_ratio = new_token_ratio
+            for req in reqs_to_abort:
+                abort_reason: FINISH_ABORT = req.to_finish
+                self.send_to_tokenizer.send_output(
+                    AbortReq(abort_message=abort_reason.message, rid=req.rid), req
+                )
+
+            msg_prefix = (
+                "KV cache pool is full. Retract requests. "
+                if kv_full_retract_flag
+                else "Testing retraction. "
+            )
+            msg_details = f"#retracted_reqs: {len(retracted_reqs)}, #new_tokens_gained: {new_token_gained}"
+            if kv_full_retract_flag:
+                msg_details += (
+                    f", #new_token_ratio: {old_ratio:.4f} -> {new_token_ratio:.4f}"
+                )
+            logger.warning(msg_prefix + msg_details)
+
+            for req in retracted_reqs:
+                self._add_request_to_queue(req, is_retracted=True)
+        else:
+            self.new_token_ratio = max(
+                self.new_token_ratio - self.new_token_ratio_decay,
+                self.min_new_token_ratio,
+            )
+
+        if batch.batch_size() < initial_bs:
+            batch.batch_is_full = False
+
+        # Update batch tensors
+        batch.prepare_for_decode()
+        return batch
+
+    def record_batch_in_overlap(self, model_worker_batch: ModelWorkerBatch):
+        # FIXME(lsyin): hacky way to keep a reference to avoid GPU tensors being freed by torch GC
+        # NOTE: More Reliable: record all tensors into the forward stream
+        # NOTE: - for all future tensors, we shall always read from future map
+        #       - for all non-future tensors (produced only by schedule stream),
+        #       we shall keep its reference not being release during all the forwarding pass
+        self.batch_record_ct = (self.batch_record_ct + 1) % 2
+        self.batch_record_buf[self.batch_record_ct] = model_worker_batch
+
+    def run_batch(
+        self,
+        batch: ScheduleBatch,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[GenerationBatchResult, EmbeddingBatchResult]:
+        """Run a batch."""
+        self.forward_ct += 1
+
+        # Whether to run the profiler
+        self._profile_batch_predicate(batch)
+        if self.forward_sleep_time is not None:
+            logger.info(f"Scheduler.run_batch sleep {self.forward_sleep_time}s")
+            time.sleep(self.forward_sleep_time)
+
+        # Capture prefill start time for EXTEND mode
+        if batch.forward_mode == ForwardMode.EXTEND:
+            set_time_batch(batch.reqs, "set_prefill_run_batch_start_time")
+
+        # Place holder handling for pd-disagg decode event loop
+        if batch.forward_mode.is_prebuilt():
+            return self._run_batch_prebuilt(batch)
+
+        # Run forward
+        if self.is_generation:
+            if self.spec_algorithm.is_none() or self.enable_overlap:
+                # In most cases, we use the model worker batch to run the forward.
+                worker_batch_or_batch = batch.get_model_worker_batch()
+            else:
+                # In speculative decoding v1 (non-overlap) case, we use the batch directly.
+                # TODO(lsyin): delete this branch after unifying the abstraction.
+                worker_batch_or_batch = batch
+
+            if self.enable_overlap:
+                model_worker_batch = worker_batch_or_batch
+                self.record_batch_in_overlap(model_worker_batch)
+
+                # Sampling info will be modified during forward, so we store a copy.
+                model_worker_batch.sampling_info = (
+                    model_worker_batch.sampling_info.copy_for_forward()
+                )
+
+                bs = len(model_worker_batch.seq_lens)
+                future_indices = self.future_map.alloc_future_indices(bs)
+
+                with self.forward_stream_ctx, self.record_bubble_metrics(batch):
+                    self.forward_stream.wait_stream(self.schedule_stream)
+                    self.future_map.resolve_future(model_worker_batch)
+                    with self.record_forward_metrics(batch):
+                        batch_result = self.model_worker.forward_batch_generation(
+                            model_worker_batch
+                            # here pp is not compatible with overlap
+                        )
+                    # FIXME(lsyin): maybe move this to forward_batch_generation
+                    batch_result.copy_done = self.device_module.Event()
+                    if batch_result.delay_sample_func is None:
+                        self.future_map.store_to_map(future_indices, batch_result)
+                        batch_result.copy_to_cpu(return_logprob=batch.return_logprob)
+                    else:
+                        batch_result.future_indices = future_indices
+
+                # FIXME(lsyin): move this assignment elsewhere
+                future_indices_or_next_token_ids = -future_indices.indices
+
+                if batch.is_spec_v2:
+                    # FIXME(lsyin): tmp code for spec v2
+                    # We only keep future indices for next draft input
+
+                    batch.spec_info = batch_result.next_draft_input
+                    batch.spec_info.future_indices = future_indices
+
+                    # batch.spec_info = EagleDraftInput(
+                    #     future_indices=future_indices,
+                    #     verify_done=batch_result.next_draft_input.verify_done,
+                    # )
+
+                    # The future value, usually for next batch preparation
+                    # Current implementation strictly synchronizes the seq_lens
+                    batch.seq_lens = batch_result.next_draft_input.new_seq_lens
+            elif self.enable_pdmux and batch.forward_mode.is_split_prefill():
+                batch_result = self.tp_worker.forward_batch_split_prefill(batch)
+                future_indices_or_next_token_ids = batch_result.next_token_ids
+            else:
+                kwargs = (
+                    {"pp_proxy_tensors": pp_proxy_tensors}
+                    if self.spec_algorithm.is_none()
+                    else {}
+                )
+                with self.record_forward_metrics(batch):
+                    batch_result = self.model_worker.forward_batch_generation(
+                        worker_batch_or_batch, **kwargs
+                    )
+                future_indices_or_next_token_ids = batch_result.next_token_ids
+                self.update_cache_from_scheduler(batch, batch_result)
+
+            # NOTE: future_indices_or_next_token_ids is used in ScheduleBatch,
+            #       which can probably be replaced by future_indices later [TODO(lsyin)].
+            #       we shall still keep the original outputs, e.g. next_token_ids
+            #       in the GenerationBatchOutput for processing after copy_done.
+            batch.output_ids = future_indices_or_next_token_ids
+
+            # These 2 values are needed for processing the output, but the values can be
+            # modified by overlap schedule. So we have to copy them here so that
+            # we can use the correct values in output processing.
+            if batch.return_logprob:
+                batch_result.extend_input_len_per_req = [
+                    req.extend_input_len for req in batch.reqs
+                ]
+                batch_result.extend_logprob_start_len_per_req = [
+                    req.extend_logprob_start_len for req in batch.reqs
+                ]
+            else:
+                batch_result.extend_input_len_per_req = None
+                batch_result.extend_logprob_start_len_per_req = None
+
+            ret = batch_result
+        else:  # embedding or reward model
+            model_worker_batch = batch.get_model_worker_batch()
+
+            if self.enable_overlap:
+                self.record_batch_in_overlap(model_worker_batch)
+                with self.forward_stream_ctx, self.record_bubble_metrics(batch):
+                    self.forward_stream.wait_stream(self.schedule_stream)
+                    embeddings = self.tp_worker.forward_batch_embedding(
+                        model_worker_batch
+                    )
+                    ret = EmbeddingBatchResult(embeddings=embeddings)
+                    ret.copy_to_cpu()
+            else:
+                embeddings = self.tp_worker.forward_batch_embedding(model_worker_batch)
+                ret = EmbeddingBatchResult(embeddings=embeddings)
+
+        # Capture prefill end time for EXTEND mode
+        if batch.forward_mode == ForwardMode.EXTEND:
+            set_time_batch(batch.reqs, "set_prefill_run_batch_end_time")
+
+        if (
+            self.server_args.enable_dp_attention
+            and self.server_args.elastic_ep_backend is not None
+        ):
+            # Get the tensors indicating rank activeness
+            tp_active_ranks = self.tp_group.active_ranks.detach().cpu().numpy()
+            tp_active_ranks_cpu = self.tp_group.active_ranks_cpu.detach().numpy()
+            tp_active_ranks &= tp_active_ranks_cpu
+            dp_active_ranks = tp_active_ranks.reshape(self.dp_size, -1).prod(axis=1)
+            self.send_to_tokenizer.send_output(
+                ActiveRanksOutput(status=dp_active_ranks.tolist())
+            )
+
+        return ret
+
+    def launch_batch_sample_if_needed(
+        self, batch_result: GenerationBatchResult
+    ) -> Union[GenerationBatchResult]:
+        # TODO(lsyin): make the delayed sample a default behavior after
+        # unifying the forward_batch_generation interface (related to spec V2).
+        if batch_result is None or batch_result.delay_sample_func is None:
+            return
+
+        with self.forward_stream_ctx:
+            self.forward_stream.wait_stream(self.schedule_stream)
+            _batch_result = batch_result.delay_sample_func()
+            assert _batch_result is batch_result
+            self.future_map.store_to_map(batch_result.future_indices, batch_result)
+            batch_result.copy_to_cpu(return_logprob=self.cur_batch.return_logprob)
+
+    def process_batch_result(
+        self,
+        batch: ScheduleBatch,
+        result: Union[GenerationBatchResult, EmbeddingBatchResult],
+    ):
+        if batch.forward_mode.is_decode():
+            self.process_batch_result_decode(batch, result)
+        elif batch.forward_mode.is_extend():
+            if batch.is_dllm():
+                self.process_batch_result_dllm(batch, result)
+            elif self.disaggregation_mode == DisaggregationMode.PREFILL:
+                self.process_batch_result_disagg_prefill(batch, result)
+            else:
+                self.process_batch_result_prefill(batch, result)
+        elif batch.forward_mode.is_prebuilt():
+            self.process_batch_result_prebuilt(batch)
+        elif batch.forward_mode.is_idle():
+            self.process_batch_result_idle(batch, result)
+
+        self.log_batch_result_stats(batch, result)
+        self._maybe_clear_mm_inputs(batch)
+        self.maybe_send_health_check_signal()
+
+    def maybe_send_health_check_signal(self):
+        if self.return_health_check_ct:
+            # Return some signal for the health check.
+            # This is used to prevent the health check signal being blocked by long context prefill.
+            # However, one minor issue is that this code path does not check the status of detokenizer manager.
+            self.return_health_check_ct -= 1
+            self.send_to_tokenizer.send_output(HealthCheckOutput())
+
+    def flush_cache_wrapped(self, recv_req: FlushCacheReqInput):
+        success = self.flush_cache()
+        return FlushCacheReqOutput(success=success)
+
+    def clear_hicache_storage_wrapped(self, recv_req: ClearHiCacheReqInput):
+        if self.enable_hierarchical_cache:
+            self.tree_cache.clear_storage_backend()
+            logger.info("Hierarchical cache cleared successfully!")
+            if_success = True
+        else:
+            logging.warning("Hierarchical cache is not enabled.")
+            if_success = False
+        return ClearHiCacheReqOutput(success=if_success)
+
+    def _is_idle_for_hicache_storage_op(self) -> bool:
+        """Stricter idle check for storage attach/detach.
+
+        We require:
+        - no running batches (including overlap/pp/disagg paths) via `_is_no_request()`
+        - no queued requests in scheduler queues (waiting/grammar/disagg queues)
+        """
+        if not self._is_no_request():
+            return False
+        if len(self.waiting_queue) != 0:
+            return False
+        if len(self.grammar_manager.grammar_queue) != 0:
+            return False
+        return True
+
+    def attach_hicache_storage_wrapped(
+        self, recv_req: AttachHiCacheStorageReqInput
+    ) -> AttachHiCacheStorageReqOutput:
+        if not self.enable_hierarchical_cache:
+            return AttachHiCacheStorageReqOutput(
+                success=False, message="Hierarchical cache is not enabled."
+            )
+
+        if not self._is_idle_for_hicache_storage_op():
+            return AttachHiCacheStorageReqOutput(
+                success=False,
+                message=(
+                    "Reject attach: scheduler is not idle. "
+                    f"#queue-req={len(self.waiting_queue)} "
+                    f"#running-req={len(self.running_batch.reqs)}"
+                ),
+            )
+
+        if not hasattr(self.tree_cache, "attach_storage_backend"):
+            return AttachHiCacheStorageReqOutput(
+                success=False,
+                message="Current tree_cache implementation does not support dynamic attach.",
+            )
+
+        try:
+            ok, msg = self.tree_cache.attach_storage_backend(
+                storage_backend=recv_req.hicache_storage_backend,
+                storage_backend_extra_config_json=recv_req.hicache_storage_backend_extra_config_json,
+                served_model_name=self.server_args.served_model_name,
+                hicache_storage_prefetch_policy=recv_req.hicache_storage_prefetch_policy,
+                hicache_write_policy=recv_req.hicache_write_policy,
+            )
+        except Exception as e:
+            logger.exception("Attach HiCache storage backend failed with exception.")
+            return AttachHiCacheStorageReqOutput(success=False, message=str(e))
+        if ok:
+            self.enable_hicache_storage = True
+            self.server_args.hicache_storage_backend = recv_req.hicache_storage_backend
+            if recv_req.hicache_storage_backend_extra_config_json is not None:
+                self.server_args.hicache_storage_backend_extra_config = (
+                    recv_req.hicache_storage_backend_extra_config_json
+                )
+            if recv_req.hicache_storage_prefetch_policy is not None:
+                self.server_args.hicache_storage_prefetch_policy = (
+                    recv_req.hicache_storage_prefetch_policy
+                )
+            if recv_req.hicache_write_policy is not None:
+                self.server_args.hicache_write_policy = recv_req.hicache_write_policy
+            logger.info(
+                f"Attached HiCache storage backend: {recv_req.hicache_storage_backend}"
+            )
+        return AttachHiCacheStorageReqOutput(success=ok, message=msg)
+
+    def detach_hicache_storage_wrapped(
+        self, recv_req: DetachHiCacheStorageReqInput
+    ) -> DetachHiCacheStorageReqOutput:
+        if not self.enable_hierarchical_cache:
+            return DetachHiCacheStorageReqOutput(
+                success=False, message="Hierarchical cache is not enabled."
+            )
+
+        if not self._is_idle_for_hicache_storage_op():
+            return DetachHiCacheStorageReqOutput(
+                success=False,
+                message=(
+                    "Reject detach: scheduler is not idle. "
+                    f"#queue-req={len(self.waiting_queue)} "
+                    f"#running-req={len(self.running_batch.reqs)}"
+                ),
+            )
+
+        if not hasattr(self.tree_cache, "detach_storage_backend"):
+            return DetachHiCacheStorageReqOutput(
+                success=False,
+                message="Current tree_cache implementation does not support dynamic detach.",
+            )
+
+        # Idempotent detach: even if scheduler thinks storage is disabled, we still
+        # attempt best-effort cleanup in tree_cache (it may have leftover state).
+        try:
+            ok, msg = self.tree_cache.detach_storage_backend()
+        except Exception as e:
+            logger.exception("Detach HiCache storage backend failed with exception.")
+            return DetachHiCacheStorageReqOutput(success=False, message=str(e))
+
+        if ok or (not self.enable_hicache_storage):
+            # Treat "already disabled / nothing to do" as success for idempotence.
+            self.enable_hicache_storage = False
+            self.server_args.hicache_storage_backend = None
+            self.server_args.hicache_storage_backend_extra_config = None
+            logger.info("Detached HiCache storage backend.")
+            return DetachHiCacheStorageReqOutput(
+                success=True, message=msg or "HiCache storage backend is detached."
+            )
+
+        return DetachHiCacheStorageReqOutput(success=False, message=msg)
+
+    def pin_prefix_wrapped(self, recv_req: PinPrefixReqInput):
+        if not hasattr(self.tree_cache, "pin_prefix"):
+            return PinPrefixReqOutput(
+                success=False,
+                nodes_pinned=0,
+                message="PIN requires --enable-hierarchical-cache",
+            )
+        if getattr(self.tree_cache, "_max_pinned_tokens", 0) <= 0:
+            return PinPrefixReqOutput(
+                success=False,
+                nodes_pinned=0,
+                message="Pinning is disabled (SGLANG_HICACHE_MAX_PINNED_RATIO is 0)",
+            )
+        nodes_pinned, reject_reason = self.tree_cache.pin_prefix(
+            recv_req.token_ids, recv_req.ttl_seconds
+        )
+        if nodes_pinned == 0:
+            return PinPrefixReqOutput(
+                success=False,
+                nodes_pinned=0,
+                message=reject_reason or "No matching prefix found in cache to pin",
+            )
+        msg = f"Pinned {nodes_pinned} nodes (ttl={recv_req.ttl_seconds}s)"
+        if reject_reason:
+            msg += f"; {reject_reason}"
+        return PinPrefixReqOutput(
+            success=True,
+            nodes_pinned=nodes_pinned,
+            message=msg,
+        )
+
+    def _is_no_request(self):
+        no_request = (
+            self.running_batch.is_empty()
+            and (self.last_batch is None or self.last_batch.is_empty())
+            and (self.cur_batch is None or self.cur_batch.is_empty())
+            and (not self.enable_overlap or len(self.result_queue) == 0)
+            and (self.pp_size == 1 or all(x.is_empty() for x in self.running_mbs))
+        )
+        if self.disaggregation_mode == DisaggregationMode.PREFILL:
+            no_request &= (
+                len(self.disagg_prefill_bootstrap_queue.queue) == 0
+                and len(self.disagg_prefill_inflight_queue) == 0
+            )
+        if self.disaggregation_mode == DisaggregationMode.DECODE:
+            no_request &= (
+                len(self.disagg_decode_prealloc_queue.queue) == 0
+                and len(self.disagg_decode_transfer_queue.queue) == 0
+            )
+        return no_request
+
+    def flush_cache(self):
+        """Flush the memory pool and cache."""
+        if self._is_no_request():
+            self.cur_batch = None
+            self.last_batch = None
+            self.tree_cache.reset()
+            self.req_to_token_pool.clear()
+            self.token_to_kv_pool_allocator.clear()
+            self.grammar_manager.clear()
+            self.reset_metrics()
+
+            if self.draft_worker:
+                self.draft_worker.clear_cache_pool()
+
+            # TODO: allow optional empty cache
+            torch.cuda.empty_cache()
+            logger.info("Cache flushed successfully!")
+            success = True
+        else:
+            logging.warning(
+                f"Cache not flushed because there are pending requests. "
+                f"#queue-req: {len(self.waiting_queue)}, "
+                f"#running-req: {len(self.running_batch.reqs)}"
+            )
+            success = False
+        return success
+
+    def get_internal_state(self, recv_req: GetInternalStateReq):
+        ret = vars(get_global_server_args())
+        ret["last_gen_throughput"] = self.last_gen_throughput
+        ret["memory_usage"] = {
+            "weight": round(self.tp_worker.model_runner.weight_load_mem_usage, 2),
+            "kvcache": round(
+                self.token_to_kv_pool_allocator.get_kvcache().mem_usage, 2
+            ),
+            "token_capacity": int(self.max_total_num_tokens),
+            "graph": round(self.tp_worker.model_runner.graph_mem_usage, 2),
+        }
+        ret["effective_max_running_requests_per_dp"] = self.max_running_requests
+
+        if not self.spec_algorithm.is_none() and self.spec_total_num_forward_ct > 0:
+            ret["avg_spec_accept_length"] = (
+                self.spec_total_num_accepted_tokens / self.spec_total_num_forward_ct
+            )
+
+        if RECORD_STEP_TIME:
+            ret["step_time_dict"] = self.step_time_dict
+
+        # This field is not serializable.
+        ret.pop("model_config", None)
+
+        return GetInternalStateReqOutput(internal_state=ret)
+
+    def set_internal_state(self, recv_req: SetInternalStateReq):
+        server_args_dict = recv_req.server_args
+        args_allow_update = set(
+            [
+                "pp_max_micro_batch_size",
+                "speculative_accept_threshold_single",
+                "speculative_accept_threshold_acc",
+            ]
+        )
+
+        if_success = True
+        for k, v in server_args_dict.items():
+            if k not in args_allow_update:
+                logging.warning(f"Updating {k} is not supported.")
+                if_success = False
+                break
+            elif k == "pp_max_micro_batch_size" and (
+                v > self.max_running_requests // self.pp_size or v < 1
+            ):
+                logging.warning(
+                    f"Updating {k} to {v} is rejected because it is out of the valid range [1, {self.max_running_requests // self.pp_size}]."
+                )
+                if_success = False
+                break
+
+        if if_success:
+            if not self.spec_algorithm.is_none() and self.spec_total_num_forward_ct > 0:
+                avg_spec_accept_length = (
+                    self.spec_total_num_accepted_tokens / self.spec_total_num_forward_ct
+                )
+                logger.info(f"{avg_spec_accept_length=}")
+            self.spec_total_num_accepted_tokens = self.spec_total_num_forward_ct = 0
+            for k, v in server_args_dict.items():
+                setattr(get_global_server_args(), k, v)
+            logger.info(f"Global server args updated! {get_global_server_args()=}")
+        return SetInternalStateReqOutput(
+            updated=True,
+            server_args=vars(get_global_server_args()),
+        )
+
+    def handle_rpc_request(self, recv_req: RpcReqInput):
+        # Handle RPC requests
+        logger.info(
+            f"handle_rpc_request: {recv_req.method}, param: {recv_req.parameters}"
+        )
+
+        success = True
+        exec = None
+        try:
+            func = getattr(self, recv_req.method)
+            if recv_req.parameters is not None:
+                func(**recv_req.parameters)
+            else:
+                func()
+        except Exception as e:
+            success = False
+            exec = e
+            logger.error(f"Failed to call rpc {recv_req.method}: {str(e)}")
+
+        barrier()
+        return RpcReqOutput(success, "" if not exec else str(exec))
+
+    def abort_request(self, recv_req: AbortReq):
+        # Delete requests in the waiting queue
+        to_del = []
+        for i, req in enumerate(self.waiting_queue):
+            if recv_req.abort_all or req.rid.startswith(recv_req.rid):
+                to_del.append(i)
+
+        # Sort in reverse order to avoid index issues when deleting
+        for i in reversed(to_del):
+            # Abort method 1: directly pop from the queue
+            # This only works for requests that have not started anything.
+            # We still need to send something back to TokenizerManager to clean up the state.
+            req = self.waiting_queue.pop(i)
+            if self.enable_hicache_storage:
+                # to release prefetch events associated with the request
+                self.tree_cache.release_aborted_request(req.rid)
+            self.send_to_tokenizer.send_output(AbortReq(rid=req.rid), req)
+            # For disaggregation decode mode, the request in the waiting queue has KV cache allocated.
+            if self.disaggregation_mode == DisaggregationMode.DECODE:
+                release_kv_cache(req, self.tree_cache)
+            # For disaggregation prefill mode, free the metadata buffer index
+            if self.disaggregation_mode == DisaggregationMode.PREFILL:
+                release_req_to_metadata_buffer(
+                    req, self.req_to_metadata_buffer_idx_allocator
+                )
+
+            # For mamba radix cache
+            if (
+                req.mamba_pool_idx is not None
+                and self.disaggregation_mode != DisaggregationMode.DECODE
+            ):
+                release_kv_cache(req, self.tree_cache, is_insert=False)
+            logger.debug(f"Abort queued request. {req.rid=}")
+
+        # Delete the requests in the grammar queue
+        # Abort method 2: call `set_finish_with_abort`
+        # The request will still run one prefill forward pass.
+        # In this case, we change the input_ids to be only one token to make this prefill cheap.
+        self.grammar_manager.abort_requests(recv_req)
+
+        # Delete requests not in the waiting queue when PD disaggregation is enabled
+        if self.disaggregation_mode == DisaggregationMode.PREFILL:
+            # Abort requests that have not yet been bootstrapped
+            for req in self.disagg_prefill_bootstrap_queue.queue:
+                if recv_req.abort_all or req.rid.startswith(recv_req.rid):
+                    logger.debug(f"Abort bootstrap queue request. {req.rid=}")
+                    if hasattr(req.disagg_kv_sender, "abort"):
+                        req.disagg_kv_sender.abort()
+
+            # Abort in-flight requests
+            for req in self.disagg_prefill_inflight_queue:
+                if recv_req.abort_all or req.rid.startswith(recv_req.rid):
+                    logger.debug(f"Abort inflight queue request. {req.rid=}")
+                    if hasattr(req.disagg_kv_sender, "abort"):
+                        req.disagg_kv_sender.abort()
+
+        elif self.disaggregation_mode == DisaggregationMode.DECODE:
+            # Abort requests that have not yet finished preallocation
+            for decode_req in self.disagg_decode_prealloc_queue.queue:
+                if recv_req.abort_all or decode_req.req.rid.startswith(recv_req.rid):
+                    logger.debug(f"Abort prealloc queue request. {decode_req.req.rid=}")
+                    decode_req.kv_receiver.abort()
+
+            # Abort requests waiting for kvcache to release tree cache
+            for decode_req in self.disagg_decode_transfer_queue.queue:
+                if recv_req.abort_all or decode_req.req.rid.startswith(recv_req.rid):
+                    logger.debug(f"Abort transfer queue request. {decode_req.req.rid=}")
+                    decode_req.kv_receiver.abort()
+
+            # Abort requests already retracted to CPU cache
+            if self.disagg_decode_prealloc_queue.retracted_queue:
+                remaining_retracted = []
+                for decode_req in self.disagg_decode_prealloc_queue.retracted_queue:
+                    if recv_req.abort_all or decode_req.rid.startswith(recv_req.rid):
+                        assert hasattr(decode_req, "kv_cache_cpu")
+                        del decode_req.kv_cache_cpu
+                        self.send_to_tokenizer.send_output(
+                            AbortReq(rid=decode_req.rid), decode_req
+                        )
+                    else:
+                        remaining_retracted.append(decode_req)
+                self.disagg_decode_prealloc_queue.retracted_queue = remaining_retracted
+
+        # Delete requests in the running batch
+        if self.cur_batch is self.running_batch or self.cur_batch is None:
+            reqs = self.running_batch.reqs
+        else:
+            reqs = self.running_batch.reqs + self.cur_batch.reqs
+
+        for req in reqs:
+            if not req.finished() and (
+                recv_req.abort_all or req.rid.startswith(recv_req.rid)
+            ):
+                # Abort method 3: set `to_finish`
+                # The request will still run one decode forward pass.
+                # Then we reuse all existing code to clean up the KV cache allocation.
+                logger.debug(f"Abort running request. {req.rid=}")
+                req.to_finish = FINISH_ABORT()
+
+    def _pause_engine(self) -> Tuple[List[Req], int]:
+        raise NotImplementedError()
+
+    def pause_generation(self, recv_req: PauseGenerationReqInput):
+        self._engine_paused = True
+
+        if self.enable_overlap and self.last_batch:
+            # Process the results of the last batch
+            tmp_batch, tmp_result = self.result_queue.popleft()
+            self.process_batch_result(tmp_batch, tmp_result)
+
+        if self.last_batch and self.last_batch.forward_mode.is_extend():
+            chunked_req_to_exclude = set()
+            if recv_req.mode == "in_place":
+                if self.chunked_req is not None:
+                    chunked_req_to_exclude.add(self.chunked_req)
+            self.last_batch.filter_batch(
+                chunked_req_to_exclude=list(chunked_req_to_exclude)
+            )
+            self.running_batch.merge_batch(self.last_batch)
+
+        self.last_batch = None
+        self.cur_batch = None
+
+        if recv_req.mode == "retract":
+            self.running_batch.filter_batch(v1_spec_info_filtered=True)
+            if len(self.running_batch.reqs) != 0:
+                retracted_reqs = self.running_batch.retract_all(self.server_args)
+                for req in retracted_reqs:
+                    self._add_request_to_queue(req)
+
+            self.running_batch.batch_is_full = False
+            self.chunked_req = None
+
+    def continue_generation(self, recv_req: ContinueGenerationReqInput):
+        self._engine_paused = False
+
+    def load_lora_adapter(
+        self, recv_req: LoadLoRAAdapterReqInput
+    ) -> LoadLoRAAdapterReqOutput:
+        """In-place loading a new lora adapter from disk or huggingface."""
+
+        result = self.tp_worker.load_lora_adapter(recv_req)
+        return result
+
+    def load_lora_adapter_from_tensors(
+        self, recv_req: LoadLoRAAdapterFromTensorsReqInput
+    ) -> LoadLoRAAdapterFromTensorsReqOutput:
+        """In-place loading a new lora adapter from serialized tensors."""
+
+        result = self.tp_worker.load_lora_adapter_from_tensors(recv_req)
+        return result
+
+    def unload_lora_adapter(
+        self, recv_req: UnloadLoRAAdapterReqInput
+    ) -> UnloadLoRAAdapterReqOutput:
+        """Unload the lora adapter."""
+
+        result = self.tp_worker.unload_lora_adapter(recv_req)
+        return result
+
+    def init_weights_send_group_for_remote_instance(
+        self, recv_req: InitWeightsSendGroupForRemoteInstanceReqInput
+    ):
+        """Init the seed and client instance communication group."""
+        success, message = self.tp_worker.init_weights_send_group_for_remote_instance(
+            recv_req
+        )
+        return InitWeightsSendGroupForRemoteInstanceReqOutput(success, message)
+
+    def send_weights_to_remote_instance(
+        self, recv_req: SendWeightsToRemoteInstanceReqInput
+    ):
+        """Send the seed instance weights to the destination instance."""
+        success, message = self.tp_worker.send_weights_to_remote_instance(recv_req)
+        return SendWeightsToRemoteInstanceReqOutput(success, message)
+
+    def slow_down(self, recv_req: SlowDownReqInput):
+        t = recv_req.forward_sleep_time
+        if t is not None and t <= 0:
+            t = None
+        self.forward_sleep_time = t
+        return SlowDownReqOutput()
+
+    def expert_distribution_handle(self, recv_req: ExpertDistributionReq):
+        action = recv_req.action
+        if action == ExpertDistributionReqType.START_RECORD:
+            get_global_expert_distribution_recorder().start_record()
+        elif action == ExpertDistributionReqType.STOP_RECORD:
+            get_global_expert_distribution_recorder().stop_record()
+        elif action == ExpertDistributionReqType.DUMP_RECORD:
+            get_global_expert_distribution_recorder().dump_record()
+        else:
+            raise ValueError(f"Unrecognized ExpertDistributionReq value: {recv_req=}")
+        return ExpertDistributionReqOutput()
+
+    def open_session(self, recv_req: OpenSessionReqInput):
+        return self.session_controller.open(recv_req)
+
+    def close_session(self, recv_req: CloseSessionReqInput):
+        self.session_controller.close(recv_req)
+
+    def maybe_sleep_on_idle(self):
+        if self.idle_sleeper is not None:
+            self.idle_sleeper.maybe_sleep()
+
+    def handle_freeze_gc(self, recv_req: FreezeGCReq):
+        """Handle freeze_gc request: freeze scheduler's GC and forward to detokenizer."""
+        freeze_gc("Scheduler")
+        self.send_to_detokenizer.send_output(recv_req, recv_req)
+        return None
+
+    def handle_dumper_control(self, recv_req: DumperControlReqInput):
+        from sglang.srt.debug_utils.dumper import dumper
+
+        try:
+            response: list = []
+            if (
+                not torch.distributed.is_initialized()
+                or torch.distributed.get_rank() == 0
+            ):
+                response = dumper._http_manager.handle_request(
+                    method=recv_req.method, body=recv_req.body
+                )
+            self.send_to_tokenizer.send_output(
+                DumperControlReqOutput(success=True, response=response), recv_req
+            )
+        except Exception as e:
+            print(f"[Scheduler] handle_dumper_control error: {e}", flush=True)
+            self.send_to_tokenizer.send_output(
+                DumperControlReqOutput(success=False, response=[], error=str(e)),
+                recv_req,
+            )
+
+    # placeholder for override
+    def update_cache_from_scheduler(
+        self, schedule_batch: ScheduleBatch, batch_result: GenerationBatchResult
+    ):
+        pass
+
+    def get_remote_instance_transfer_engine_info(self):
+        return self.tp_worker.get_remote_instance_transfer_engine_info()
+
+
+class IdleSleeper:
+    """
+    In setups which have long inactivity periods it is desirable to reduce
+    system power consumption when sglang does nothing. This would lead not only
+    to power savings, but also to more CPU thermal headroom when a request
+    eventually comes. This is important in cases when multiple GPUs are connected
+    as each GPU would otherwise pin one thread at 100% CPU usage.
+
+    The simplest solution is to use zmq.Poller on all sockets that may receive
+    data that needs handling immediately.
+    """
+
+    def __init__(self, sockets):
+        self.poller = zmq.Poller()
+        self.last_empty_time = real_time()
+        for s in sockets:
+            self.poller.register(s, zmq.POLLIN)
+
+        self.empty_cache_interval = envs.SGLANG_EMPTY_CACHE_INTERVAL.get()
+
+    def maybe_sleep(self):
+        self.poller.poll(1000)
+        if (
+            self.empty_cache_interval > 0
+            and real_time() - self.last_empty_time > self.empty_cache_interval
+        ):
+            self.last_empty_time = real_time()
+            torch.cuda.empty_cache()
+
+
+def is_health_check_generate_req(recv_req):
+    rid = getattr(recv_req, "rid", None)
+    return rid is not None and rid.startswith("HEALTH_CHECK")
+
+
+def is_work_request(recv_req):
+    return isinstance(
+        recv_req,
+        (
+            TokenizedGenerateReqInput,
+            TokenizedEmbeddingReqInput,
+            BatchTokenizedGenerateReqInput,
+            BatchTokenizedEmbeddingReqInput,
+        ),
+    )
+
+
+class SenderWrapper:
+    def __init__(self, socket: zmq.Socket):
+        self.socket = socket
+
+    def send_output(
+        self,
+        output: Union[BaseReq, BaseBatchReq],
+        recv_obj: Optional[Union[BaseReq, BaseBatchReq]] = None,
+    ):
+        if self.socket is None:
+            return
+
+        if (
+            isinstance(recv_obj, BaseReq)
+            and recv_obj.http_worker_ipc is not None
+            and output.http_worker_ipc is None
+        ):
+            # handle communicator reqs for multi-http worker case
+            output.http_worker_ipc = recv_obj.http_worker_ipc
+
+        self.socket.send_pyobj(output)
+
+
+def dispatch_event_loop(scheduler: Scheduler):
+    # Dispatch to the appropriate event loop based on the disaggregation mode
+    server_args = scheduler.server_args
+    disaggregation_mode: DisaggregationMode = scheduler.disaggregation_mode
+    if disaggregation_mode == DisaggregationMode.NULL:
+        if scheduler.enable_pdmux:
+            scheduler.event_loop_pdmux()
+        elif server_args.pp_size > 1:
+            scheduler.event_loop_pp()
+        elif scheduler.enable_overlap:
+            scheduler.event_loop_overlap()
+        else:
+            scheduler.event_loop_normal()
+    elif disaggregation_mode == DisaggregationMode.PREFILL:
+        if server_args.pp_size > 1:
+            scheduler.event_loop_pp_disagg_prefill()
+        elif scheduler.enable_overlap:
+            scheduler.event_loop_overlap_disagg_prefill()
+        else:
+            scheduler.event_loop_normal_disagg_prefill()
+    elif disaggregation_mode == DisaggregationMode.DECODE:
+        if server_args.pp_size > 1:
+            scheduler.event_loop_pp_disagg_decode()
+        elif scheduler.enable_overlap:
+            scheduler.event_loop_overlap_disagg_decode()
+        else:
+            scheduler.event_loop_normal_disagg_decode()
+
+
+def configure_scheduler(
+    server_args: ServerArgs,
+    tp_rank: int,
+    attn_cp_rank: int,
+    moe_dp_rank: int,
+    moe_ep_rank: int,
+    pp_rank: int,
+    dp_rank: Optional[int],
+) -> Optional[int]:
+    """Configure scheduler worker: logging, process title, etc.
+
+    Returns:
+        dp_rank
+    """
+    # Generate the logger prefix
+    if dp_rank is None and "SGLANG_DP_RANK" in os.environ:
+        # [For Router] if env var "SGLANG_DP_RANK" exist, set dp_rank to the value of the env var
+        dp_rank = int(os.environ["SGLANG_DP_RANK"])
+
+    prefix = ""
+    if dp_rank is not None:
+        prefix += f" DP{dp_rank}"
+    if server_args.pp_size > 1:
+        prefix += f" PP{pp_rank}"
+    if server_args.attn_cp_size > 1:
+        prefix += f" ATTN_CP{attn_cp_rank}"
+    if server_args.moe_dp_size > 1:
+        prefix += f" MOE_DP{moe_dp_rank}"
+    if server_args.tp_size > 1:
+        prefix += f" TP{tp_rank}"
+    if server_args.ep_size > 1:
+        prefix += f" EP{moe_ep_rank}"
+
+    # Config the process
+    setproctitle.setproctitle(f"sglang::scheduler{prefix.replace(' ', '_')}")
+    faulthandler.enable()
+
+    # Configure the logger
+    configure_logger(server_args, prefix=prefix)
+    suppress_other_loggers()
+
+    return dp_rank
+
+
+def run_scheduler_process(
+    server_args: ServerArgs,
+    port_args: PortArgs,
+    gpu_id: int,
+    tp_rank: int,
+    attn_cp_rank: int,
+    moe_dp_rank: int,
+    moe_ep_rank: int,
+    pp_rank: int,
+    dp_rank: Optional[int],
+    pipe_writer,
+):
+    dp_rank = configure_scheduler(
+        server_args, tp_rank, attn_cp_rank, moe_dp_rank, moe_ep_rank, pp_rank, dp_rank
+    )
+
+    kill_itself_when_parent_died()
+    parent_process = psutil.Process().parent()
+
+    # Set cpu affinity to this gpu process
+    if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"):
+        set_gpu_proc_affinity(
+            server_args.pp_size, server_args.tp_size, server_args.nnodes, gpu_id
+        )
+    if (
+        numa_node := server_args.numa_node
+    ) is not None and not envs.SGLANG_NUMA_BIND_V2.get():
+        numa_bind_to_node(numa_node[gpu_id])
+
+    # Set up tracing
+    if server_args.enable_trace:
+        process_tracing_init(server_args.otlp_traces_endpoint, "sglang")
+        thread_label = "Scheduler"
+        if server_args.disaggregation_mode == "prefill":
+            thread_label = "Prefill Scheduler"
+        elif server_args.disaggregation_mode == "decode":
+            thread_label = "Decode Scheduler"
+        trace_set_thread_info(thread_label, tp_rank, dp_rank)
+
+    # Create a scheduler and run the event loop
+    try:
+        scheduler = Scheduler(
+            server_args,
+            port_args,
+            gpu_id,
+            tp_rank,
+            moe_ep_rank,
+            pp_rank,
+            attn_cp_rank,
+            moe_dp_rank,
+            dp_rank,
+        )
+
+        # Send initialization info back to the parent process
+        pipe_writer.send(scheduler.get_init_info())
+
+        # Run the event loop (blocks until shutdown)
+        scheduler.run_event_loop()
+
+    except Exception:
+        traceback = get_exception_traceback()
+        logger.error(f"Scheduler hit an exception: {traceback}")
+        parent_process.send_signal(signal.SIGQUIT)
diff --git a/sglang/python/sglang/srt/managers/scheduler_dp_attn_mixin.py b/sglang/python/sglang/srt/managers/scheduler_dp_attn_mixin.py
new file mode 100644
index 0000000000000000000000000000000000000000..5331fc03314a87ee23ee99ad6f2c4422006c2f7c
--- /dev/null
+++ b/sglang/python/sglang/srt/managers/scheduler_dp_attn_mixin.py
@@ -0,0 +1,271 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Callable, Optional
+
+import torch
+
+from sglang.srt.batch_overlap.two_batch_overlap import TboDPAttentionPreparer
+from sglang.srt.distributed.parallel_state import get_tp_group
+from sglang.srt.environ import envs
+from sglang.srt.managers.schedule_batch import ScheduleBatch
+from sglang.srt.model_executor.forward_batch_info import ForwardMode
+from sglang.srt.observability.metrics_collector import DPCooperationInfo
+from sglang.srt.utils.common import require_mlp_tp_gather
+
+if TYPE_CHECKING:
+    from sglang.srt.distributed.parallel_state import GroupCoordinator
+    from sglang.srt.managers.scheduler import Scheduler
+
+
+_ENABLE_METRICS_DP_ATTENTION = envs.SGLANG_ENABLE_METRICS_DP_ATTENTION.get()
+
+
+@dataclass
+class MLPSyncBatchInfo:
+    dp_size: int
+    tp_size: int
+    cp_size: int
+
+    num_tokens: int
+    num_tokens_for_logprob: int
+    can_cuda_graph: bool
+    is_extend_in_batch: bool
+    local_can_run_tbo: bool
+    local_forward_mode: int
+
+    # some gathered elements
+    tp0_info: torch.Tensor = None
+    global_num_tokens: list[int] = None
+    global_num_tokens_for_logprob: list[int] = None
+    tbo_split_seq_index: torch.Tensor = None
+    global_forward_mode: int = None
+    dp_cooperation_info: Optional[DPCooperationInfo] = None
+
+    def _get_local_tensor(self, device, dtype=torch.int64) -> torch.Tensor:
+        return torch.tensor(
+            [
+                self.num_tokens,
+                self.num_tokens_for_logprob,
+                int(self.can_cuda_graph),
+                int(self.is_extend_in_batch),
+                int(self.local_can_run_tbo),
+                self.local_forward_mode,
+            ],
+            device=device,
+            dtype=dtype,
+        )
+
+    def _get_fallback_tensor(self, device, dtype=torch.int64) -> torch.Tensor:
+        return torch.tensor(
+            [
+                0,  # num_tokens
+                0,  # num_tokens_for_logprob
+                1,  # can_cuda_graph
+                0,  # is_extend_in_batch
+                1,  # local_can_run_tbo
+                ForwardMode.IDLE.value,  # local_forward_mode
+            ],
+            device=device,
+            dtype=dtype,
+        )
+
+    def all_gather(self, device, group: torch.distributed.ProcessGroup):
+        local_info_tensor = self._get_local_tensor(device=device)
+        global_info_tensor = torch.empty(
+            (self.dp_size, self.tp_size * self.cp_size, 6),
+            dtype=torch.int64,
+            device=device,
+        )
+
+        torch.distributed.all_gather_into_tensor(
+            global_info_tensor.flatten(),
+            local_info_tensor,
+            group=group,
+        )
+        if device == "cpu":
+            tp_active_ranks = get_tp_group().active_ranks_cpu
+        else:
+            tp_active_ranks = get_tp_group().active_ranks
+
+        # Set fallback values for inactive ranks
+        tp_info = global_info_tensor.view(self.dp_size * self.tp_size * self.cp_size, 6)
+        tp_info[tp_active_ranks == 0] = self._get_fallback_tensor(device=device)
+
+        tp0_info = global_info_tensor[:, 0, :]
+        self.tp0_info = tp0_info
+        # Perform only one Device-to-Host (D2H) memory copy
+        cpu_data = tp0_info[:, :2].cpu()
+        self.global_num_tokens = cpu_data[:, 0].tolist()
+        self.global_num_tokens_for_logprob = cpu_data[:, 1].tolist()
+        self.can_cuda_graph = bool(tp0_info[:, 2].min().item())
+        self.is_extend_in_batch = bool(tp0_info[:, 3].max().item())
+        if _ENABLE_METRICS_DP_ATTENTION:
+            self.dp_cooperation_info = DPCooperationInfo.create(tp0_info[:, 5].tolist())
+
+
+def _update_gather_batch(
+    batch: ScheduleBatch,
+    mlp_sync_info: MLPSyncBatchInfo,
+    require_mlp_tp_gather: bool,
+    skip_all_gather=False,
+):
+    # TODO: handle the case when moe_dense_tp_size != 1
+    if not require_mlp_tp_gather:
+        batch.global_num_tokens = [mlp_sync_info.num_tokens]
+        batch.global_num_tokens_for_logprob = [mlp_sync_info.num_tokens_for_logprob]
+    else:
+        batch.global_num_tokens = mlp_sync_info.global_num_tokens
+        batch.global_num_tokens_for_logprob = (
+            mlp_sync_info.global_num_tokens_for_logprob
+        )
+    if not skip_all_gather:
+        batch.is_extend_in_batch = mlp_sync_info.is_extend_in_batch
+        batch.tbo_split_seq_index = mlp_sync_info.tbo_split_seq_index
+        batch.global_forward_mode = mlp_sync_info.global_forward_mode
+
+    # Check forward mode for cuda graph
+    batch.can_run_dp_cuda_graph = mlp_sync_info.can_cuda_graph
+
+
+def prepare_mlp_sync_batch_raw(
+    local_batch: ScheduleBatch,
+    dp_size: int,
+    attn_tp_size: int,
+    attn_cp_size: int,
+    tp_group: GroupCoordinator,
+    get_idle_batch: Callable[[], ScheduleBatch],
+    disable_cuda_graph: bool,
+    require_mlp_tp_gather: bool,
+    disable_overlap_schedule: bool,
+    offload_tags: set[str],
+):
+    # Check if other DP workers have running batches
+    if local_batch is None or local_batch.forward_mode.is_prebuilt():
+        num_tokens = 0
+        num_tokens_for_logprob = 0
+    elif local_batch.forward_mode.is_decode():
+        num_tokens = local_batch.batch_size()
+        num_tokens_for_logprob = num_tokens
+    else:
+        num_tokens = local_batch.extend_num_tokens
+        num_tokens_for_logprob = sum(
+            # We should have at least 1 token for sample in every case.
+            max(extend_len - logprob_start_len, 1)
+            for logprob_start_len, extend_len in zip(
+                local_batch.extend_logprob_start_lens,
+                local_batch.extend_lens,
+            )
+        )
+        assert (
+            local_batch.return_logprob
+            or num_tokens_for_logprob == local_batch.batch_size()
+        )
+
+    skip_all_gather = envs.SGLANG_SCHEDULER_SKIP_ALL_GATHER.get()
+    can_cuda_graph = (
+        local_batch is None
+        or local_batch.forward_mode.is_decode_or_idle()
+        or local_batch.forward_mode.is_prebuilt()
+    ) and not disable_cuda_graph
+
+    is_extend_in_batch = local_batch.forward_mode.is_extend() if local_batch else False
+    if local_batch is not None:
+        local_batch.is_extend_in_batch = is_extend_in_batch
+
+    tbo_preparer = TboDPAttentionPreparer()
+    if len(offload_tags) == 0 and (
+        disable_overlap_schedule
+        or envs.SGLANG_NCCL_ALL_GATHER_IN_OVERLAP_SCHEDULER_SYNC_BATCH.get()
+    ):
+        group = tp_group.device_group
+        device = tp_group.device
+    else:
+        group = tp_group.cpu_group
+        device = "cpu"
+
+    local_can_run_tbo, local_forward_mode = tbo_preparer.prepare_all_gather(local_batch)
+
+    mlp_sync_info = MLPSyncBatchInfo(
+        dp_size=dp_size,
+        tp_size=attn_tp_size,
+        cp_size=attn_cp_size,
+        num_tokens=num_tokens,
+        num_tokens_for_logprob=num_tokens_for_logprob,
+        can_cuda_graph=can_cuda_graph,
+        is_extend_in_batch=is_extend_in_batch,
+        local_can_run_tbo=local_can_run_tbo,
+        local_forward_mode=local_forward_mode,
+    )
+
+    if not skip_all_gather:
+        mlp_sync_info.all_gather(device=device, group=group)
+
+        mlp_sync_info.tbo_split_seq_index, mlp_sync_info.global_forward_mode = (
+            tbo_preparer.compute_output(
+                mlp_sync_info.tp0_info[:, 4:6],
+            )
+        )
+
+    need_idle_batch = skip_all_gather or max(mlp_sync_info.global_num_tokens) > 0
+    if need_idle_batch:
+        batch_to_gather = local_batch
+        if local_batch is None:
+            batch_to_gather = local_batch = get_idle_batch()
+        elif local_batch.forward_mode.is_prebuilt():
+            # NOTE: for prebuilt batch, we add an inner idle batch to run MLP sync
+            batch_to_gather = local_batch.inner_idle_batch = get_idle_batch()
+        _update_gather_batch(
+            batch_to_gather, mlp_sync_info, require_mlp_tp_gather, skip_all_gather
+        )
+
+    if _ENABLE_METRICS_DP_ATTENTION and local_batch is not None:
+        local_batch.dp_cooperation_info = mlp_sync_info.dp_cooperation_info
+
+    return local_batch
+
+
+class SchedulerDPAttnMixin:
+    def prepare_mlp_sync_batch(self: Scheduler, local_batch: ScheduleBatch):
+        return prepare_mlp_sync_batch_raw(
+            local_batch,
+            dp_size=self.server_args.dp_size,
+            attn_tp_size=self.attn_tp_size,
+            attn_cp_size=self.attn_cp_size,
+            tp_group=self.tp_group,
+            get_idle_batch=self.get_idle_batch,
+            disable_cuda_graph=self.server_args.disable_cuda_graph,
+            require_mlp_tp_gather=require_mlp_tp_gather(self.server_args),
+            disable_overlap_schedule=self.server_args.disable_overlap_schedule,
+            offload_tags=self.offload_tags,
+        )
+
+    def maybe_prepare_mlp_sync_batch(
+        self: Scheduler,
+        batch: Optional[ScheduleBatch],
+        need_sync: Optional[bool] = None,
+    ) -> Optional[ScheduleBatch]:
+        """
+        Helper to prepare MLP sync batch for DP attention.
+        Should be called after get_new_batch_prefill().
+
+        Args:
+            batch: The batch to process
+            need_sync: If specified, overrides self.require_mlp_sync for prepare_mlp_sync_batch decision
+        """
+        if need_sync if need_sync is not None else self.require_mlp_sync:
+            batch = self.prepare_mlp_sync_batch(batch)
+        return batch
+
+    def get_idle_batch(self: Scheduler) -> ScheduleBatch:
+        idle_batch = ScheduleBatch.init_new(
+            [],
+            self.req_to_token_pool,
+            self.token_to_kv_pool_allocator,
+            self.tree_cache,
+            self.model_config,
+            self.enable_overlap,
+            self.spec_algorithm,
+        )
+        idle_batch.prepare_for_idle()
+        return idle_batch
diff --git a/sglang/python/sglang/srt/managers/scheduler_input_blocker.py b/sglang/python/sglang/srt/managers/scheduler_input_blocker.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6838ae431804d4beb635ddbf3b997cd699f5684
--- /dev/null
+++ b/sglang/python/sglang/srt/managers/scheduler_input_blocker.py
@@ -0,0 +1,106 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import logging
+from contextlib import contextmanager
+from enum import Enum, auto
+from typing import Any, List, Optional
+
+from sglang.srt.managers.io_struct import BlockReqInput, BlockReqType
+from sglang.srt.utils.poll_based_barrier import PollBasedBarrier
+
+logger = logging.getLogger(__name__)
+
+
+class SchedulerInputBlocker:
+    def __init__(self, noop: bool):
+        self._state = _State.UNBLOCKED
+        self._pending_reqs = []
+        self._noop = noop
+        self._global_unblock_barrier = PollBasedBarrier(noop=noop)
+
+    def handle(self, recv_reqs: Optional[List[Any]]):
+        assert (recv_reqs is None) == self._noop
+
+        if not self._noop:
+            output_reqs = []
+            for recv_req in recv_reqs:
+                output_reqs += self._handle_recv_req(recv_req)
+
+        global_arrived_unblock_barrier = (
+            self._global_unblock_barrier.poll_global_arrived()
+        )
+        if (
+            self._state == _State.GLOBAL_UNBLOCK_BARRIER
+            and global_arrived_unblock_barrier
+        ):
+            output_reqs += self._handle_arrive_unblock_barrier()
+
+        if not self._noop:
+            return output_reqs
+
+    def _handle_recv_req(self, recv_req):
+        if isinstance(recv_req, BlockReqInput):
+            if recv_req.type == BlockReqType.BLOCK:
+                self._execute_block_req()
+                return []
+            elif recv_req.type == BlockReqType.UNBLOCK:
+                self._execute_unblock_req()
+                return []
+            else:
+                raise NotImplementedError(f"{recv_req=}")
+        else:
+            if self._state == _State.UNBLOCKED:
+                return [recv_req]
+            else:
+                self._pending_reqs.append(recv_req)
+                return []
+
+    def _execute_block_req(self):
+        logger.info("Handle block req")
+        self._change_state(original=_State.UNBLOCKED, target=_State.BLOCKED)
+
+    def _execute_unblock_req(self):
+        logger.info("Handle unblock req")
+        self._change_state(
+            original=_State.BLOCKED, target=_State.GLOBAL_UNBLOCK_BARRIER
+        )
+        self._global_unblock_barrier.local_arrive()
+
+    def _handle_arrive_unblock_barrier(self):
+        logger.info(f"Arrived at unblock barrier ({len(self._pending_reqs)=})")
+        self._change_state(
+            original=_State.GLOBAL_UNBLOCK_BARRIER, target=_State.UNBLOCKED
+        )
+        output_reqs = [*self._pending_reqs]
+        self._pending_reqs.clear()
+        return output_reqs
+
+    def _change_state(self, original: "_State", target: "_State"):
+        assert self._state == original, f"{self._state=} {original=} {target=}"
+        self._state = target
+
+
+class _State(Enum):
+    UNBLOCKED = auto()
+    BLOCKED = auto()
+    GLOBAL_UNBLOCK_BARRIER = auto()
+
+
+@contextmanager
+def input_blocker_guard_region(send_to_scheduler):
+    send_to_scheduler.send_pyobj(BlockReqInput(BlockReqType.BLOCK))
+    try:
+        yield
+    finally:
+        send_to_scheduler.send_pyobj(BlockReqInput(BlockReqType.UNBLOCK))
diff --git a/sglang/python/sglang/srt/managers/scheduler_output_processor_mixin.py b/sglang/python/sglang/srt/managers/scheduler_output_processor_mixin.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3b2d01ee4db61ccaa99a023e8409b07db82bf5e
--- /dev/null
+++ b/sglang/python/sglang/srt/managers/scheduler_output_processor_mixin.py
@@ -0,0 +1,1175 @@
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, List, Optional, Tuple, Union
+
+import torch
+
+from sglang.srt.disaggregation.utils import DisaggregationMode
+from sglang.srt.environ import envs
+from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.layers.moe.routed_experts_capturer import get_global_experts_capturer
+from sglang.srt.managers.io_struct import (
+    AbortReq,
+    BatchEmbeddingOutput,
+    BatchTokenIDOutput,
+)
+from sglang.srt.managers.schedule_batch import (
+    BaseFinishReason,
+    Req,
+    ScheduleBatch,
+)
+from sglang.srt.mem_cache.common import release_kv_cache
+from sglang.srt.server_args import get_global_server_args
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.scheduler import (
+        EmbeddingBatchResult,
+        GenerationBatchResult,
+        ScheduleBatch,
+        Scheduler,
+    )
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_FORCE_STREAM_INTERVAL = 50
+
+
+class SchedulerOutputProcessorMixin:
+    """
+    This class implements the output processing logic for Scheduler.
+    We put them into a separate file to make the `scheduler.py` shorter.
+    """
+
+    def _get_storage_backend_type(self) -> str:
+        """Get storage backend type from tree_cache."""
+        storage_backend_type = "none"
+        cache_controller = getattr(self.tree_cache, "cache_controller", None)
+        if cache_controller and hasattr(cache_controller, "storage_backend"):
+            storage_backend = cache_controller.storage_backend
+            if storage_backend is not None:
+                storage_backend_type = type(storage_backend).__name__
+        return storage_backend_type
+
+    def _get_cached_tokens_details(self, req: Req) -> Optional[dict]:
+        """Get detailed cache breakdown for a request, if available.
+
+        Returns:
+            - None if HiCache is not enabled
+            - {"device": X, "host": Y} if HiCache enabled but L3 storage is not
+            - {"device": X, "host": Y, "storage": Z, "storage_backend": "..."} if L3 enabled
+        """
+        # Only show details if HiCache is enabled
+        if not getattr(self, "enable_hierarchical_cache", False):
+            return None
+
+        # Only show if there are any cached tokens
+        if (
+            req.cached_tokens_device > 0
+            or req.cached_tokens_host > 0
+            or req.cached_tokens_storage > 0
+        ):
+            details = {
+                "device": req.cached_tokens_device,
+                "host": req.cached_tokens_host,
+            }
+            # Only include storage fields if L3 storage is enabled
+            if getattr(self, "enable_hicache_storage", False):
+                details["storage"] = req.cached_tokens_storage
+                details["storage_backend"] = self._get_storage_backend_type()
+            return details
+        return None
+
+    def process_batch_result_prebuilt(self: Scheduler, batch: ScheduleBatch):
+        assert self.disaggregation_mode == DisaggregationMode.DECODE
+        for req in batch.reqs:
+            req.time_stats.set_decode_prebuilt_finish_time()
+            req.check_finished()
+            if req.finished():
+                req.time_stats.set_quick_finish_time()
+                release_kv_cache(req, self.tree_cache)
+
+        # Note: Logprobs should be handled on the prefill engine.
+        self.stream_output(batch.reqs, batch.return_logprob)
+
+    def maybe_collect_routed_experts(self: Scheduler, req: Req):
+        """Collect routed experts for a finished request."""
+        req.routed_experts = get_global_experts_capturer().get_routed_experts(
+            req_pool_idx=req.req_pool_idx,
+            seqlen=req.seqlen,
+            req_to_token_pool=self.req_to_token_pool,
+        )
+
+    def maybe_collect_customized_info(
+        self: Scheduler, i: int, req: Req, logits_output: LogitsProcessorOutput
+    ):
+        if logits_output is not None and logits_output.customized_info is not None:
+            if req.customized_info is None:
+                req.customized_info = {}
+            for k, v in logits_output.customized_info.items():
+                if k not in req.customized_info:
+                    req.customized_info[k] = []
+                # Copy the element so it doesn't retain the entire batch
+                # tensor/array via a view reference.
+                elem = v[i]
+                if isinstance(elem, torch.Tensor):
+                    elem = elem.clone()
+                elif hasattr(elem, "copy") and callable(elem.copy):
+                    elem = elem.copy()
+                req.customized_info[k].append(elem)
+
+    def process_batch_result_prefill(
+        self: Scheduler,
+        batch: ScheduleBatch,
+        result: Union[GenerationBatchResult, EmbeddingBatchResult],
+    ):
+        skip_stream_req = None
+
+        if self.is_generation:
+            if result.copy_done is not None:
+                result.copy_done.synchronize()
+
+            (
+                logits_output,
+                next_token_ids,
+                extend_input_len_per_req,
+                extend_logprob_start_len_per_req,
+            ) = (
+                result.logits_output,
+                result.next_token_ids,
+                result.extend_input_len_per_req,
+                result.extend_logprob_start_len_per_req,
+            )
+
+            # Move next_token_ids and logprobs to cpu
+            next_token_ids = next_token_ids.tolist()
+            if batch.return_logprob:
+                if logits_output.next_token_logprobs is not None:
+                    logits_output.next_token_logprobs = (
+                        logits_output.next_token_logprobs.tolist()
+                    )
+                if logits_output.input_token_logprobs is not None:
+                    logits_output.input_token_logprobs = tuple(
+                        logits_output.input_token_logprobs.tolist()
+                    )
+
+            hidden_state_offset = 0
+
+            # Check finish conditions
+            logprob_pt = 0
+
+            for i, (req, next_token_id) in enumerate(zip(batch.reqs, next_token_ids)):
+                if req.finished() or req.is_retracted:
+                    # decode req in mixed batch or retracted req
+                    continue
+
+                if req.is_chunked <= 0:
+                    req.time_stats.set_prefill_finished_time()
+
+                    # req output_ids are set here
+                    req.output_ids.append(next_token_id)
+                    req.check_finished()
+
+                    if req.finished():
+                        self.maybe_collect_routed_experts(req)
+                        release_kv_cache(req, self.tree_cache)
+                        req.time_stats.set_completion_time()
+                    elif not batch.decoding_reqs or req not in batch.decoding_reqs:
+                        self.tree_cache.cache_unfinished_req(req)
+
+                    self.maybe_collect_customized_info(i, req, logits_output)
+
+                    if batch.return_logprob:
+                        assert extend_logprob_start_len_per_req is not None
+                        assert extend_input_len_per_req is not None
+                        extend_logprob_start_len = extend_logprob_start_len_per_req[i]
+                        extend_input_len = extend_input_len_per_req[i]
+
+                        num_input_logprobs = self._calculate_num_input_logprobs(
+                            req, extend_input_len, extend_logprob_start_len
+                        )
+
+                        if req.return_logprob:
+                            self.add_logprob_return_values(
+                                i,
+                                req,
+                                logprob_pt,
+                                next_token_ids,
+                                num_input_logprobs,
+                                logits_output,
+                            )
+                        logprob_pt += num_input_logprobs
+
+                    if (
+                        req.return_hidden_states
+                        and logits_output.hidden_states is not None
+                    ):
+                        req.hidden_states.append(
+                            logits_output.hidden_states[
+                                hidden_state_offset : (
+                                    hidden_state_offset := hidden_state_offset
+                                    + len(req.origin_input_ids)
+                                )
+                            ]
+                            .cpu()
+                            .clone()
+                            .tolist()
+                        )
+
+                    if req.grammar is not None:
+                        # FIXME: this try-except block is for handling unexpected xgrammar issue.
+                        try:
+                            req.grammar.accept_token(next_token_id)
+                        except ValueError as e:
+                            # Grammar accept_token can raise ValueError if the token is not in the grammar.
+                            # This can happen if the grammar is not set correctly or the token is invalid.
+                            logger.error(
+                                f"Grammar accept_token failed for req {req.rid} with token {next_token_id}: {e}"
+                            )
+                            self.abort_request(AbortReq(rid=req.rid))
+                        req.grammar.finished = req.finished()
+
+                else:
+                    # being chunked reqs' prefill is not finished
+                    req.is_chunked -= 1
+                    # There is only at most one request being currently chunked.
+                    # Because this request does not finish prefill,
+                    # we don't want to stream the request currently being chunked.
+                    skip_stream_req = req
+
+                    # Incrementally update input logprobs.
+                    if batch.return_logprob:
+                        extend_logprob_start_len = extend_logprob_start_len_per_req[i]
+                        extend_input_len = extend_input_len_per_req[i]
+                        if extend_logprob_start_len < extend_input_len:
+                            # Update input logprobs.
+                            num_input_logprobs = self._calculate_num_input_logprobs(
+                                req, extend_input_len, extend_logprob_start_len
+                            )
+                            if req.return_logprob:
+                                self.add_input_logprob_return_values(
+                                    i,
+                                    req,
+                                    logits_output,
+                                    logprob_pt,
+                                    num_input_logprobs,
+                                    last_prefill_chunk=False,
+                                )
+                            logprob_pt += num_input_logprobs
+
+                    req.time_stats.set_last_chunked_prefill_finish_time()
+
+        else:  # embedding or reward model
+            if result.copy_done is not None:
+                result.copy_done.synchronize()
+
+            is_sparse = envs.SGLANG_EMBEDDINGS_SPARSE_HEAD.is_set()
+
+            embeddings = result.embeddings
+
+            if is_sparse:
+                batch_ids, token_ids = embeddings.indices()
+                values = embeddings.values()
+
+                embeddings = [{} for _ in range(embeddings.size(0))]
+                for i in range(batch_ids.shape[0]):
+                    embeddings[batch_ids[i].item()][token_ids[i].item()] = values[
+                        i
+                    ].item()
+            else:
+                if isinstance(embeddings, torch.Tensor):
+                    embeddings = embeddings.tolist()
+                else:
+                    embeddings = [tensor.tolist() for tensor in embeddings]
+
+            # Check finish conditions
+            for i, req in enumerate(batch.reqs):
+                if req.is_retracted:
+                    continue
+
+                req.embedding = embeddings[i]
+                if req.is_chunked <= 0:
+                    req.time_stats.set_prefill_finished_time()
+                    # Dummy output token for embedding models
+                    req.output_ids.append(0)
+                    req.check_finished()
+
+                    if req.finished():
+                        release_kv_cache(req, self.tree_cache)
+                        req.time_stats.set_completion_time()
+                    else:
+                        self.tree_cache.cache_unfinished_req(req)
+                else:
+                    # being chunked reqs' prefill is not finished
+                    req.is_chunked -= 1
+                    req.time_stats.set_last_chunked_prefill_finish_time()
+
+        self.stream_output(batch.reqs, batch.return_logprob, skip_stream_req)
+
+        if self.current_scheduler_metrics_enabled:
+            can_run_cuda_graph = getattr(result, "can_run_cuda_graph", False)
+            if self.enable_metrics:
+                self.metrics_collector.increment_prefill_cuda_graph_pass(
+                    value=can_run_cuda_graph
+                )
+            self.log_prefill_stats(
+                prefill_stats=batch.prefill_stats,
+                can_run_cuda_graph=can_run_cuda_graph,
+                dp_cooperation_info=batch.dp_cooperation_info,
+            )
+
+    def _resolve_spec_overlap_token_ids(
+        self: Scheduler, result: GenerationBatchResult, batch: ScheduleBatch
+    ) -> List[List[int]]:
+        """Resolve the padding next token ids for speculative decoding with overlap."""
+        assert result.next_token_ids.is_cpu
+        assert result.accept_lens.is_cpu
+
+        next_token_ids = result.next_token_ids.tolist()
+        accept_lens = result.accept_lens.tolist()
+        result.num_accepted_tokens = sum(accept_lens) - len(batch.reqs)
+        result.accept_length_per_req_cpu = [x - 1 for x in accept_lens]
+
+        predict_tokens = []
+        stride = self.draft_worker.speculative_num_draft_tokens
+
+        for i, req in enumerate(batch.reqs):
+            req.kv_committed_len += accept_lens[i]
+            predict_tokens.append(
+                next_token_ids[i * stride : i * stride + accept_lens[i]]
+            )
+            req.spec_verify_ct += 1
+
+            accepted_draft_tokens = result.accept_length_per_req_cpu[i]
+            req.spec_accepted_tokens += accepted_draft_tokens
+            req.update_spec_acceptance_histogram(accepted_draft_tokens)
+
+        return predict_tokens
+
+    def process_batch_result_idle(
+        self: Scheduler,
+        batch: ScheduleBatch,
+        result: GenerationBatchResult,
+    ):
+        if result.copy_done is not None:
+            result.copy_done.synchronize()
+
+        self.stream_output_generation(
+            batch.reqs, batch.return_logprob, is_idle_batch=True
+        )
+
+    def process_batch_result_decode(
+        self: Scheduler,
+        batch: ScheduleBatch,
+        result: GenerationBatchResult,
+    ):
+        if result.copy_done is not None:
+            result.copy_done.synchronize()
+
+        logits_output, next_token_ids, can_run_cuda_graph = (
+            result.logits_output,
+            result.next_token_ids,
+            result.can_run_cuda_graph,
+        )
+
+        if batch.spec_algorithm.is_none():
+            next_token_ids = next_token_ids.tolist()
+            if batch.return_logprob:
+                next_token_logprobs = logits_output.next_token_logprobs.tolist()
+        elif batch.is_spec_v2:
+            next_token_ids = self._resolve_spec_overlap_token_ids(result, batch)
+
+        self.num_generated_tokens += len(batch.reqs)
+        if not batch.spec_algorithm.is_none():
+            self.update_spec_metrics(batch.batch_size(), result.num_accepted_tokens)
+        if self.enable_metrics:
+            self.metrics_collector.increment_decode_cuda_graph_pass(
+                value=can_run_cuda_graph
+            )
+
+        self.token_to_kv_pool_allocator.free_group_begin()
+
+        # NOTE: in any case, we should check finish here
+        # if finished, also clean up committed kv cache and over-allocated kv cache here
+
+        # Check finish condition
+        for i, (req, next_token_id) in enumerate(zip(batch.reqs, next_token_ids)):
+            req: Req
+
+            if self.enable_overlap and (req.finished() or req.is_retracted):
+                # NOTE: This (req.finished() or req.is_retracted) should only happen when overlap scheduling is enabled.
+                # (currently not, e.g. Eagle V1 still check finish during forward)
+                # And all the over-allocated tokens will be freed in `release_kv_cache`.
+                continue
+
+            new_accepted_len = 1
+            if batch.spec_algorithm.is_none():
+                req.output_ids.append(next_token_id)
+            elif batch.is_spec_v2:
+                # Only spec v2's output_ids are updated here.
+                req.output_ids.extend(next_token_id)
+                new_accepted_len = len(next_token_id)
+
+            # Update Mamba last track seqlen
+            self._mamba_prefix_cache_update(req, batch, result, i)
+
+            req.time_stats.set_last_decode_finish_time()
+
+            req.check_finished(new_accepted_len)
+
+            if req.finished():
+                # delete feature to save memory
+                if req.multimodal_inputs is not None:
+                    for mm_item in req.multimodal_inputs.mm_items:
+                        pixel_values = mm_item.feature
+                        if isinstance(pixel_values, torch.Tensor):
+                            mm_item.feature = None
+                            del pixel_values
+                self.maybe_collect_routed_experts(req)
+
+                if self.server_args.disaggregation_decode_enable_offload_kvcache:
+                    # Asynchronously offload KV cache; release_kv_cache will be called after Device->Host transfer completes
+                    if not self.decode_offload_manager.offload_kv_cache(req):
+                        release_kv_cache(req, self.tree_cache)
+                else:
+                    release_kv_cache(req, self.tree_cache)
+
+                req.time_stats.set_completion_time()
+
+            self.maybe_collect_customized_info(i, req, logits_output)
+
+            if req.return_logprob and batch.spec_algorithm.is_none():
+                # speculative worker handles logprob in speculative decoding
+                req.output_token_logprobs_val.append(next_token_logprobs[i])
+                req.output_token_logprobs_idx.append(next_token_id)
+                if req.top_logprobs_num > 0:
+                    req.output_top_logprobs_val.append(
+                        logits_output.next_token_top_logprobs_val[i]
+                    )
+                    req.output_top_logprobs_idx.append(
+                        logits_output.next_token_top_logprobs_idx[i]
+                    )
+                if req.token_ids_logprob is not None:
+                    req.output_token_ids_logprobs_val.append(
+                        logits_output.next_token_token_ids_logprobs_val[i]
+                    )
+                    req.output_token_ids_logprobs_idx.append(
+                        logits_output.next_token_token_ids_logprobs_idx[i]
+                    )
+
+            if req.return_hidden_states and logits_output.hidden_states is not None:
+                req.hidden_states.append(
+                    logits_output.hidden_states[i].cpu().clone().tolist()
+                )
+
+            if req.grammar is not None:
+                # FIXME: this try-except block is for handling unexpected xgrammar issue.
+                try:
+                    if batch.spec_algorithm.is_none():
+                        # Normal decode: single token
+                        req.grammar.accept_token(next_token_id)
+                    elif batch.is_spec_v2:
+                        # Speculative decode: next_token_id is a list of accepted tokens
+                        for token_id in next_token_id:
+                            req.grammar.accept_token(token_id)
+                except ValueError as e:
+                    # Grammar accept_token can raise ValueError if the token is not in the grammar.
+                    # This can happen if the grammar is not set correctly or the token is invalid.
+                    logger.error(
+                        f"Grammar accept_token failed for req {req.rid} with token {next_token_id}: {e}"
+                    )
+                    self.abort_request(AbortReq(rid=req.rid))
+                req.grammar.finished = req.finished()
+
+        self.stream_output(batch.reqs, batch.return_logprob)
+        self.token_to_kv_pool_allocator.free_group_end()
+
+        self.forward_ct_decode = (self.forward_ct_decode + 1) % (1 << 30)
+        if self.current_scheduler_metrics_enabled:
+            if self.forward_ct_decode % self.server_args.decode_log_interval == 0:
+                self.log_decode_stats(can_run_cuda_graph, running_batch=batch)
+            self.log_decode_stats_every_iteration(
+                batch, num_accepted_tokens=result.num_accepted_tokens
+            )
+
+    def _mamba_prefix_cache_update(
+        self, req: Req, batch: ScheduleBatch, result: GenerationBatchResult, i: int
+    ) -> None:
+        seq_len = len(req.origin_input_ids) + len(req.output_ids) - 1
+        if req.mamba_ping_pong_track_buffer is not None:
+            mamba_track_interval = get_global_server_args().mamba_track_interval
+            if batch.spec_algorithm.is_none() and seq_len % mamba_track_interval == 0:
+                # for non-spec decode, we update mamba_last_track_seqlen at the end of each track interval
+                req.mamba_next_track_idx = (
+                    batch.req_to_token_pool.get_mamba_ping_pong_other_idx(
+                        req.mamba_next_track_idx
+                    )
+                )
+                req.mamba_last_track_seqlen = seq_len
+            elif (
+                not batch.spec_algorithm.is_none()
+                and result.accept_length_per_req_cpu is not None
+            ):
+                # for spec decode, update mamba_last_track_seqlen if this iteration crosses a track interval
+                actual_seq_len = req.seqlen - 1
+                if (
+                    actual_seq_len // mamba_track_interval
+                    != (actual_seq_len - result.accept_length_per_req_cpu[i])
+                    // mamba_track_interval
+                ):
+                    req.mamba_next_track_idx = (
+                        batch.req_to_token_pool.get_mamba_ping_pong_other_idx(
+                            req.mamba_next_track_idx
+                        )
+                    )
+                    req.mamba_last_track_seqlen = (
+                        actual_seq_len // mamba_track_interval * mamba_track_interval
+                    )
+
+    def _process_input_token_logprobs(
+        self, req: Req, input_token_logprobs: List
+    ) -> None:
+        """Process input token logprobs values and indices."""
+        is_multi_item_scoring = self._is_multi_item_scoring(req)
+
+        # Process logprob values - handle multi-item scoring vs regular requests
+        if is_multi_item_scoring:
+            # Multi-item scoring: use all logprobs as-is
+            req.input_token_logprobs_val = input_token_logprobs
+        else:
+            # Regular request: add None at start, remove last (sampling token)
+            req.input_token_logprobs_val = [None] + input_token_logprobs[:-1]
+
+        # Process logprob indices based on scoring type
+        if is_multi_item_scoring:
+            # Multi-item scoring: only include delimiter token positions
+            relevant_tokens = req.origin_input_ids[req.logprob_start_len :]
+            input_token_logprobs_idx = [
+                token_id
+                for token_id in relevant_tokens
+                if token_id == self.server_args.multi_item_scoring_delimiter
+            ]
+        else:
+            # Regular request: include all tokens from logprob_start_len onwards
+            input_token_logprobs_idx = req.origin_input_ids[req.logprob_start_len :]
+
+        # Clip padded hash values from image tokens to prevent detokenization errors
+        req.input_token_logprobs_idx = [
+            x if x < self.model_config.vocab_size - 1 else 0
+            for x in input_token_logprobs_idx
+        ]
+
+    def _process_input_top_logprobs(self, req: Req) -> None:
+        """Process input top logprobs."""
+        if req.top_logprobs_num <= 0:
+            return
+
+        is_multi_item_scoring = self._is_multi_item_scoring(req)
+
+        # Initialize arrays - multi-item scoring starts empty, others start with None
+        req.input_top_logprobs_val = [] if is_multi_item_scoring else [None]
+        req.input_top_logprobs_idx = [] if is_multi_item_scoring else [None]
+
+        # Extend arrays with temp values
+        for val, idx in zip(
+            req.temp_input_top_logprobs_val,
+            req.temp_input_top_logprobs_idx,
+            strict=True,
+        ):
+            req.input_top_logprobs_val.extend(val)
+            req.input_top_logprobs_idx.extend(idx)
+
+        # Remove last token (sampling token) for non multi-item scoring requests
+        if not is_multi_item_scoring:
+            req.input_top_logprobs_val.pop()
+            req.input_top_logprobs_idx.pop()
+
+        # Clean up temp storage
+        req.temp_input_top_logprobs_idx = None
+        req.temp_input_top_logprobs_val = None
+
+    def _process_input_token_ids_logprobs(self, req: Req) -> None:
+        """Process input token IDs logprobs."""
+        if req.token_ids_logprob is None:
+            return
+
+        is_multi_item_scoring = self._is_multi_item_scoring(req)
+
+        # Initialize arrays - multi-item scoring starts empty, others start with None
+        req.input_token_ids_logprobs_val = [] if is_multi_item_scoring else [None]
+        req.input_token_ids_logprobs_idx = [] if is_multi_item_scoring else [None]
+
+        # Process temp values - convert tensors to lists and extend arrays
+        for val, idx in zip(
+            req.temp_input_token_ids_logprobs_val,
+            req.temp_input_token_ids_logprobs_idx,
+            strict=True,
+        ):
+            val_list = val.tolist() if isinstance(val, torch.Tensor) else val
+            req.input_token_ids_logprobs_val.extend(
+                val_list if isinstance(val_list, list) else [val_list]
+            )
+            req.input_token_ids_logprobs_idx.extend(idx)
+
+        # Remove last token (sampling token) for non multi-item scoring requests
+        if not is_multi_item_scoring:
+            req.input_token_ids_logprobs_val.pop()
+            req.input_token_ids_logprobs_idx.pop()
+
+        # Clean up temp storage
+        req.temp_input_token_ids_logprobs_idx = None
+        req.temp_input_token_ids_logprobs_val = None
+
+    def _calculate_relevant_tokens_len(self, req: Req) -> int:
+        """Calculate the expected length of logprob arrays based on whether multi-item scoring is enabled.
+
+        For multi-item scoring, only delimiter positions have logprobs.
+        For regular requests, all positions from logprob_start_len onwards have logprobs.
+        """
+        is_multi_item_scoring = self._is_multi_item_scoring(req)
+        relevant_tokens = req.origin_input_ids[req.logprob_start_len :]
+
+        if is_multi_item_scoring:
+            # Multi-item scoring: count delimiter tokens from logprob_start_len onwards
+            return sum(
+                1
+                for token_id in relevant_tokens
+                if token_id == self.server_args.multi_item_scoring_delimiter
+            )
+        else:
+            # Regular request: all tokens from logprob_start_len onwards
+            return len(relevant_tokens)
+
+    def _calculate_num_input_logprobs(
+        self, req: Req, extend_input_len: int, extend_logprob_start_len: int
+    ) -> int:
+        """Calculate the number of input logprobs based on whether multi-item scoring is enabled.
+
+        For multi-item scoring, only delimiter positions have logprobs.
+        For regular requests, all positions in the range have logprobs.
+        """
+        is_multi_item_scoring = self._is_multi_item_scoring(req)
+
+        if is_multi_item_scoring:
+            # Multi-item scoring: count delimiter tokens in the relevant portion
+            relevant_tokens = req.origin_input_ids[
+                extend_logprob_start_len:extend_input_len
+            ]
+            return sum(
+                1
+                for token_id in relevant_tokens
+                if token_id == self.server_args.multi_item_scoring_delimiter
+            )
+        else:
+            # Regular request: all tokens in the range
+            return extend_input_len - extend_logprob_start_len
+
+    def _is_multi_item_scoring(self, req: Req) -> bool:
+        """Check if request uses multi-item scoring.
+
+        Multi-item scoring applies to prefill-only requests when a delimiter
+        token is configured. In this mode, only positions containing the
+        delimiter token receive logprobs.
+        """
+        return req.is_prefill_only and self.server_args.multi_item_scoring_delimiter
+
+    def add_input_logprob_return_values(
+        self: Scheduler,
+        i: int,
+        req: Req,
+        output: LogitsProcessorOutput,
+        logprob_pt: int,
+        num_input_logprobs: int,
+        last_prefill_chunk: bool,  # If True, it means prefill is finished.
+    ):
+        """Incrementally add input logprobs to `req`.
+
+        Args:
+            i: The request index in a batch.
+            req: The request. Input logprobs inside req are modified as a
+                consequence of the API
+            fill_ids: The prefill ids processed.
+            output: Logit processor output that's used to compute input logprobs
+            last_prefill_chunk: True if it is the last prefill (when chunked).
+                Some of input logprob operation should only happen at the last
+                prefill (e.g., computing input token logprobs).
+        """
+        assert output.input_token_logprobs is not None
+        if req.input_token_logprobs is None:
+            req.input_token_logprobs = []
+        if req.temp_input_top_logprobs_val is None:
+            req.temp_input_top_logprobs_val = []
+        if req.temp_input_top_logprobs_idx is None:
+            req.temp_input_top_logprobs_idx = []
+        if req.temp_input_token_ids_logprobs_val is None:
+            req.temp_input_token_ids_logprobs_val = []
+        if req.temp_input_token_ids_logprobs_idx is None:
+            req.temp_input_token_ids_logprobs_idx = []
+
+        if req.input_token_logprobs_val is not None:
+            # The input logprob has been already computed. It only happens
+            # upon retract.
+            if req.top_logprobs_num > 0:
+                assert req.input_token_logprobs_val is not None
+            return
+
+        # Important for the performance.
+        assert isinstance(output.input_token_logprobs, tuple)
+        input_token_logprobs: Tuple[int] = output.input_token_logprobs
+        input_token_logprobs = input_token_logprobs[
+            logprob_pt : logprob_pt + num_input_logprobs
+        ]
+        req.input_token_logprobs.extend(input_token_logprobs)
+
+        if req.top_logprobs_num > 0:
+            req.temp_input_top_logprobs_val.append(output.input_top_logprobs_val[i])
+            req.temp_input_top_logprobs_idx.append(output.input_top_logprobs_idx[i])
+
+        if req.token_ids_logprob is not None:
+            req.temp_input_token_ids_logprobs_val.append(
+                output.input_token_ids_logprobs_val[i]
+            )
+            req.temp_input_token_ids_logprobs_idx.append(
+                output.input_token_ids_logprobs_idx[i]
+            )
+
+        if last_prefill_chunk:
+            input_token_logprobs = req.input_token_logprobs
+            req.input_token_logprobs = None
+            assert req.input_token_logprobs_val is None
+            assert req.input_token_logprobs_idx is None
+            assert req.input_top_logprobs_val is None
+            assert req.input_top_logprobs_idx is None
+
+            # Process all input logprob types using helper functions
+            self._process_input_token_logprobs(req, input_token_logprobs)
+            self._process_input_top_logprobs(req)
+
+            self._process_input_token_ids_logprobs(req)
+
+            if req.return_logprob:
+                relevant_tokens_len = self._calculate_relevant_tokens_len(req)
+                assert len(req.input_token_logprobs_val) == relevant_tokens_len
+                assert len(req.input_token_logprobs_idx) == relevant_tokens_len
+                if req.top_logprobs_num > 0:
+                    assert len(req.input_top_logprobs_val) == relevant_tokens_len
+                    assert len(req.input_top_logprobs_idx) == relevant_tokens_len
+                if req.token_ids_logprob is not None:
+                    assert len(req.input_token_ids_logprobs_val) == relevant_tokens_len
+                    assert len(req.input_token_ids_logprobs_idx) == relevant_tokens_len
+
+    def add_logprob_return_values(
+        self: Scheduler,
+        i: int,
+        req: Req,
+        pt: int,
+        next_token_ids: List[int],
+        num_input_logprobs: int,
+        output: LogitsProcessorOutput,
+    ):
+        """Attach logprobs to the return values."""
+        if output.next_token_logprobs is not None:
+            req.output_token_logprobs_val.append(output.next_token_logprobs[i])
+            req.output_token_logprobs_idx.append(next_token_ids[i])
+
+        # Only add input logprobs if there are input tokens to process
+        # Note: For prefill-only requests with default logprob_start_len, this will be 0,
+        # meaning we only compute output logprobs (which is the intended behavior)
+        if num_input_logprobs > 0:
+            self.add_input_logprob_return_values(
+                i, req, output, pt, num_input_logprobs, last_prefill_chunk=True
+            )
+        else:
+            self._initialize_empty_logprob_containers(req)
+
+        if req.top_logprobs_num > 0:
+            req.output_top_logprobs_val.append(output.next_token_top_logprobs_val[i])
+            req.output_top_logprobs_idx.append(output.next_token_top_logprobs_idx[i])
+
+        if (
+            req.token_ids_logprob is not None
+            and output.next_token_token_ids_logprobs_val is not None
+        ):
+            # Convert GPU tensor to list if needed
+            logprobs_val = output.next_token_token_ids_logprobs_val[i]
+            if isinstance(logprobs_val, torch.Tensor):
+                logprobs_val = logprobs_val.tolist()
+            req.output_token_ids_logprobs_val.append(logprobs_val)
+            req.output_token_ids_logprobs_idx.append(
+                output.next_token_token_ids_logprobs_idx[i]
+            )
+
+        return num_input_logprobs
+
+    def _initialize_empty_logprob_containers(self, req: Req) -> None:
+        """
+        Initialize logprob fields to empty lists if unset.
+
+        This is needed for prefill-only requests where the normal initialization
+        flow might be bypassed, but downstream code expects these fields to be lists.
+        """
+        if req.input_token_logprobs_val is None:
+            req.input_token_logprobs_val = []
+        if req.input_token_logprobs_idx is None:
+            req.input_token_logprobs_idx = []
+        if req.input_top_logprobs_val is None:
+            req.input_top_logprobs_val = []
+        if req.input_top_logprobs_idx is None:
+            req.input_top_logprobs_idx = []
+        if req.input_token_ids_logprobs_val is None:
+            req.input_token_ids_logprobs_val = []
+        if req.input_token_ids_logprobs_idx is None:
+            req.input_token_ids_logprobs_idx = []
+
+    def stream_output(
+        self: Scheduler,
+        reqs: List[Req],
+        return_logprob: bool,
+        skip_req: Optional[Req] = None,
+    ):
+        """Stream the output to detokenizer."""
+        if self.is_generation:
+            self.stream_output_generation(reqs, return_logprob, skip_req)
+        else:  # embedding or reward model
+            self.stream_output_embedding(reqs)
+
+        if envs.SGLANG_TEST_CRASH_AFTER_STREAM_OUTPUTS.get() > 0:
+            self._trigger_crash_for_tests(
+                envs.SGLANG_TEST_CRASH_AFTER_STREAM_OUTPUTS.get()
+            )
+
+    def _trigger_crash_for_tests(self, crash_threshold: int):
+        # Crash trigger: crash after stream_output is called N times
+        # This is used for testing purposes.
+        if not hasattr(self, "_test_stream_output_count"):
+            self._test_stream_output_count = 0
+        self._test_stream_output_count += 1
+        if self._test_stream_output_count >= crash_threshold:
+            raise RuntimeError(
+                f"Test crash after stream_output called {self._test_stream_output_count} times"
+            )
+
+    def stream_output_generation(
+        self: Scheduler,
+        reqs: List[Req],
+        return_logprob: bool,
+        skip_req: Optional[Req] = None,
+        is_idle_batch: bool = False,
+    ):
+        rids = []
+        http_worker_ipcs = []
+        finished_reasons: List[BaseFinishReason] = []
+
+        decoded_texts = []
+        decode_ids_list = []
+        read_offsets = []
+        output_ids = []
+
+        skip_special_tokens = []
+        spaces_between_special_tokens = []
+        no_stop_trim = []
+        prompt_tokens = []
+        completion_tokens = []
+        cached_tokens = []
+        cached_tokens_details = []  # Detailed breakdown by cache source
+        spec_verify_ct = []
+        spec_accepted_tokens = []
+        spec_acceptance_histogram = []
+        retraction_counts = []
+        output_hidden_states = None
+        load = self.get_load()
+        routed_experts = None
+        customized_info = {}
+
+        time_stats = []
+
+        if return_logprob:
+            input_token_logprobs_val = []
+            input_token_logprobs_idx = []
+            output_token_logprobs_val = []
+            output_token_logprobs_idx = []
+            input_top_logprobs_val = []
+            input_top_logprobs_idx = []
+            output_top_logprobs_val = []
+            output_top_logprobs_idx = []
+            input_token_ids_logprobs_val = []
+            input_token_ids_logprobs_idx = []
+            output_token_ids_logprobs_val = []
+            output_token_ids_logprobs_idx = []
+        else:
+            input_token_logprobs_val = input_token_logprobs_idx = (
+                output_token_logprobs_val
+            ) = output_token_logprobs_idx = input_top_logprobs_val = (
+                input_top_logprobs_idx
+            ) = output_top_logprobs_val = output_top_logprobs_idx = (
+                input_token_ids_logprobs_val
+            ) = input_token_ids_logprobs_idx = output_token_ids_logprobs_val = (
+                output_token_ids_logprobs_idx
+            ) = None
+
+        for req in reqs:
+            if req is skip_req:
+                continue
+
+            # Multimodal partial stream chunks break the detokenizer, so drop aborted requests here.
+            if self.model_config.is_multimodal_gen and req.to_finish:
+                continue
+
+            if req.finished():
+                if req.finished_output:
+                    # With the overlap schedule, a request will try to output twice and hit this line twice
+                    # because of the one additional delayed token. This "continue" prevented the dummy output.
+                    continue
+                req.finished_output = True
+                if req.finished_len is None:
+                    req.finished_len = len(req.output_ids)
+                should_output = True
+            else:
+                if req.stream:
+                    stream_interval = (
+                        req.sampling_params.stream_interval or self.stream_interval
+                    )
+
+                    # origin stream_interval logic
+                    should_output = (
+                        len(req.output_ids) % stream_interval == 1
+                        if not self.model_config.is_multimodal_gen
+                        and stream_interval > 1
+                        else len(req.output_ids) % stream_interval == 0
+                    )
+
+                    if should_output:
+                        # check_match_stop_str_prefix if  tail_str's suffix match stop_str prefix
+                        should_output &= not req.check_match_stop_str_prefix()
+                else:
+                    should_output = (
+                        len(req.output_ids) % DEFAULT_FORCE_STREAM_INTERVAL == 0
+                        if not self.model_config.is_multimodal_gen
+                        else False
+                    )
+
+            if should_output:
+                send_token_offset = req.send_token_offset
+                send_output_token_logprobs_offset = (
+                    req.send_output_token_logprobs_offset
+                )
+                rids.append(req.rid)
+                http_worker_ipcs.append(req.http_worker_ipc)
+                finished_reasons.append(
+                    req.finished_reason.to_json() if req.finished_reason else None
+                )
+                decoded_texts.append(req.decoded_text)
+                decode_ids, read_offset = req.init_incremental_detokenize()
+
+                if self.model_config.is_multimodal_gen:
+                    decode_ids_list.append(decode_ids)
+                else:
+                    decode_ids_list.append(decode_ids[req.send_decode_id_offset :])
+
+                # Exclude the tokens after stop condition
+                output_ids_ = req.output_ids_through_stop
+
+                req.send_decode_id_offset = len(decode_ids)
+                read_offsets.append(read_offset)
+                output_ids.append(output_ids_[send_token_offset:])
+                req.send_token_offset = len(output_ids_)
+                skip_special_tokens.append(req.sampling_params.skip_special_tokens)
+                spaces_between_special_tokens.append(
+                    req.sampling_params.spaces_between_special_tokens
+                )
+                no_stop_trim.append(req.sampling_params.no_stop_trim)
+                prompt_tokens.append(len(req.origin_input_ids))
+                completion_tokens.append(len(output_ids_))
+                cached_tokens.append(req.cached_tokens)
+
+                # Collect detailed cache breakdown if available
+                cached_tokens_details.append(self._get_cached_tokens_details(req))
+
+                retraction_counts.append(req.retraction_count)
+
+                time_stats.append(req.time_stats)
+
+                if not self.spec_algorithm.is_none():
+                    spec_verify_ct.append(req.spec_verify_ct)
+                    spec_accepted_tokens.append(req.spec_accepted_tokens)
+                    spec_acceptance_histogram.append(req.spec_acceptance_histogram)
+
+                if return_logprob:
+                    if (
+                        req.return_logprob
+                        and not req.input_logprob_sent
+                        # Decode server does not send input logprobs
+                        and self.disaggregation_mode != DisaggregationMode.DECODE
+                    ):
+                        input_token_logprobs_val.append(req.input_token_logprobs_val)
+                        input_token_logprobs_idx.append(req.input_token_logprobs_idx)
+                        input_top_logprobs_val.append(req.input_top_logprobs_val)
+                        input_top_logprobs_idx.append(req.input_top_logprobs_idx)
+                        input_token_ids_logprobs_val.append(
+                            req.input_token_ids_logprobs_val
+                        )
+                        input_token_ids_logprobs_idx.append(
+                            req.input_token_ids_logprobs_idx
+                        )
+                        req.input_logprob_sent = True
+                    else:
+                        input_token_logprobs_val.append([])
+                        input_token_logprobs_idx.append([])
+                        input_top_logprobs_val.append([])
+                        input_top_logprobs_idx.append([])
+                        input_token_ids_logprobs_val.append([])
+                        input_token_ids_logprobs_idx.append([])
+
+                    if req.return_logprob:
+                        output_token_logprobs_val.append(
+                            req.output_token_logprobs_val[
+                                send_output_token_logprobs_offset:
+                            ]
+                        )
+                        output_token_logprobs_idx.append(
+                            req.output_token_logprobs_idx[
+                                send_output_token_logprobs_offset:
+                            ]
+                        )
+                        output_top_logprobs_val.append(
+                            req.output_top_logprobs_val[
+                                send_output_token_logprobs_offset:
+                            ]
+                        )
+                        output_top_logprobs_idx.append(
+                            req.output_top_logprobs_idx[
+                                send_output_token_logprobs_offset:
+                            ]
+                        )
+                        output_token_ids_logprobs_val.append(
+                            req.output_token_ids_logprobs_val[
+                                send_output_token_logprobs_offset:
+                            ]
+                        )
+                        output_token_ids_logprobs_idx.append(
+                            req.output_token_ids_logprobs_idx[
+                                send_output_token_logprobs_offset:
+                            ]
+                        )
+                        req.send_output_token_logprobs_offset = len(
+                            req.output_token_logprobs_val
+                        )
+                    else:
+                        output_token_logprobs_val.append([])
+                        output_token_logprobs_idx.append([])
+                        output_top_logprobs_val.append([])
+                        output_top_logprobs_idx.append([])
+                        output_token_ids_logprobs_val.append([])
+                        output_token_ids_logprobs_idx.append([])
+
+                if req.return_hidden_states:
+                    if output_hidden_states is None:
+                        output_hidden_states = []
+                    output_hidden_states.append(req.hidden_states)
+                if req.return_routed_experts:
+                    if routed_experts is None:
+                        routed_experts = []
+                    routed_experts.append(req.routed_experts)
+
+                if req.customized_info is not None:
+                    for k, v in req.customized_info.items():
+                        if k not in customized_info:
+                            customized_info[k] = []
+                        customized_info[k].append(v[send_token_offset:])
+
+            if (
+                req.finished()
+                and self.attn_tp_rank == 0
+                and self.server_args.enable_request_time_stats_logging
+            ):
+                req.log_time_stats()
+
+        dp_ranks = [self.dp_rank] * len(rids) if rids else None
+
+        # Send to detokenizer
+        if reqs or is_idle_batch:
+            if self.model_config.is_multimodal_gen:
+                return
+            self.send_to_detokenizer.send_output(
+                BatchTokenIDOutput(
+                    rids=rids,
+                    http_worker_ipcs=http_worker_ipcs,
+                    spec_verify_ct=spec_verify_ct,
+                    spec_accepted_tokens=spec_accepted_tokens,
+                    spec_acceptance_histogram=spec_acceptance_histogram,
+                    time_stats=time_stats,
+                    finished_reasons=finished_reasons,
+                    decoded_texts=decoded_texts,
+                    decode_ids=decode_ids_list,
+                    read_offsets=read_offsets,
+                    output_ids=output_ids,
+                    skip_special_tokens=skip_special_tokens,
+                    spaces_between_special_tokens=spaces_between_special_tokens,
+                    no_stop_trim=no_stop_trim,
+                    prompt_tokens=prompt_tokens,
+                    completion_tokens=completion_tokens,
+                    cached_tokens=cached_tokens,
+                    cached_tokens_details=cached_tokens_details,
+                    input_token_logprobs_val=input_token_logprobs_val,
+                    input_token_logprobs_idx=input_token_logprobs_idx,
+                    output_token_logprobs_val=output_token_logprobs_val,
+                    output_token_logprobs_idx=output_token_logprobs_idx,
+                    input_top_logprobs_val=input_top_logprobs_val,
+                    input_top_logprobs_idx=input_top_logprobs_idx,
+                    output_top_logprobs_val=output_top_logprobs_val,
+                    output_top_logprobs_idx=output_top_logprobs_idx,
+                    input_token_ids_logprobs_val=input_token_ids_logprobs_val,
+                    input_token_ids_logprobs_idx=input_token_ids_logprobs_idx,
+                    output_token_ids_logprobs_val=output_token_ids_logprobs_val,
+                    output_token_ids_logprobs_idx=output_token_ids_logprobs_idx,
+                    output_token_entropy_val=None,
+                    output_hidden_states=output_hidden_states,
+                    routed_experts=routed_experts,
+                    customized_info=customized_info,
+                    placeholder_tokens_idx=None,
+                    placeholder_tokens_val=None,
+                    retraction_counts=retraction_counts,
+                    load=load,
+                    dp_ranks=dp_ranks,
+                )
+            )
+
+    def stream_output_embedding(self: Scheduler, reqs: List[Req]):
+        rids = []
+        http_worker_ipcs = []
+        finished_reasons: List[BaseFinishReason] = []
+
+        embeddings = []
+        prompt_tokens = []
+        cached_tokens = []
+        cached_tokens_details = []  # Detailed breakdown by cache source
+        time_stats = []
+        retraction_counts = []
+        for req in reqs:
+            if req.finished():
+                rids.append(req.rid)
+                http_worker_ipcs.append(req.http_worker_ipc)
+                finished_reasons.append(req.finished_reason.to_json())
+                embeddings.append(req.embedding)
+                prompt_tokens.append(len(req.origin_input_ids))
+                cached_tokens.append(req.cached_tokens)
+
+                # Collect detailed cache breakdown if available
+                cached_tokens_details.append(self._get_cached_tokens_details(req))
+                time_stats.append(req.time_stats)
+                retraction_counts.append(req.retraction_count)
+        self.send_to_detokenizer.send_output(
+            BatchEmbeddingOutput(
+                rids=rids,
+                http_worker_ipcs=http_worker_ipcs,
+                time_stats=time_stats,
+                finished_reasons=finished_reasons,
+                embeddings=embeddings,
+                prompt_tokens=prompt_tokens,
+                cached_tokens=cached_tokens,
+                cached_tokens_details=cached_tokens_details,
+                placeholder_tokens_idx=None,
+                placeholder_tokens_val=None,
+                retraction_counts=retraction_counts,
+            )
+        )
diff --git a/sglang/python/sglang/srt/managers/scheduler_pp_mixin.py b/sglang/python/sglang/srt/managers/scheduler_pp_mixin.py
new file mode 100644
index 0000000000000000000000000000000000000000..c39a8fc139ef0f76746e5845dd155935b1a0d675
--- /dev/null
+++ b/sglang/python/sglang/srt/managers/scheduler_pp_mixin.py
@@ -0,0 +1,1387 @@
+from __future__ import annotations
+
+import logging
+import math
+import time
+from collections import deque
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
+
+import numpy as np
+import torch
+import torch.distributed
+from tqdm import tqdm
+
+from sglang.srt.disaggregation.base.conn import KVPoll
+from sglang.srt.disaggregation.utils import poll_and_all_reduce_attn_cp_tp_group
+from sglang.srt.distributed.parallel_state import P2PWork
+from sglang.srt.environ import envs
+from sglang.srt.layers.dp_attention import (
+    get_attention_dp_rank,
+    get_attention_dp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
+from sglang.srt.managers.utils import (
+    GenerationBatchResult,
+    get_logprob_dict_from_result,
+    get_logprob_from_pp_outputs,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.sampling.sampling_params import SamplingParams
+from sglang.srt.utils import DynamicGradMode, broadcast_pyobj, point_to_point_pyobj
+
+logger = logging.getLogger(__name__)
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.scheduler import Scheduler
+
+
+@dataclass
+class PPBatchMetadata:
+    can_run_cuda_graph: bool
+
+
+class SchedulerPPMixin:
+    @DynamicGradMode()
+    def event_loop_pp(self: Scheduler):
+        """
+        A scheduler loop for pipeline parallelism.
+        Notes:
+        1. Each stage runs in the same order and is notified by the previous stage.
+        2. We use async send but sync recv to avoid desynchronization while minimizing the communication overhead.
+        3. We can use async batch depth to buffer the outputs in the last stage for to allow overlapping the GPU computation and CPU processing and avoid last PP rank staggler.
+
+        Unified Schedule:
+        ====================================================================
+        Stage P
+        recv ith req from previous stage
+        recv ith proxy from previous stage
+        run ith batch
+        recv prev (i+1)% mb_size th outputs
+        process batch result of prev (i+1)% mb_size th batch (can be run in parallel with the curr batch GPU computation)
+        send ith req to next stage
+        send ith proxy to next stage
+        send current stage's outputs to next stage(can be stashed and delayed to send later)
+
+        the above order can be optimized and reordered to minimize communication-related CPU stall and overhead bubbles.
+
+        ====================================================================
+        """
+        self.init_pp_loop_state()
+        while True:
+            server_is_idle = True
+            for mb_id in range(self.pp_loop_size):
+                self.running_batch = self.running_mbs[mb_id]
+                self.last_batch = self.last_mbs[mb_id]
+                next_first_rank_mb_id = (mb_id + self.pp_size) % self.pp_loop_size
+                next_mb_id = (mb_id + 1) % self.pp_loop_size
+                with torch.profiler.record_function("recv_requests"):
+                    recv_reqs = self.recv_requests()
+                    self.process_input_requests(recv_reqs)
+                if not self.pp_group.is_last_rank:
+                    self._pp_commit_comm_work(self.send_req_work)
+                    with torch.profiler.record_function("send_reqs_to_next_stage"):
+                        self.send_req_work = self._pp_send_pyobj_to_next_stage(
+                            recv_reqs,
+                            async_send=True,
+                        )
+                with torch.profiler.record_function("get_next_batch_to_run"):
+                    self.mbs[mb_id] = self.get_next_batch_to_run()
+                self.running_mbs[mb_id] = self.running_batch
+                self.cur_batch: Optional[ScheduleBatch] = self.mbs[mb_id]
+                if self.cur_batch:
+                    server_is_idle = False
+                    pp_proxy_tensors = self._pp_recv_proxy_tensors()
+                next_pp_outputs = None
+                next_batch_result = None
+                d2h_event = None
+                if self.server_args.pp_async_batch_depth > 0:
+                    next_pp_outputs, next_batch_result, d2h_event = (
+                        self._pp_commit_send_output_work_and_preprocess_output_tensors(
+                            next_first_rank_mb_id,
+                            next_mb_id,
+                        )
+                    )
+                self._pp_commit_comm_work(self.send_proxy_work)
+                if self.cur_batch:
+                    result, self.launch_event = self._pp_launch_batch(
+                        mb_id,
+                        pp_proxy_tensors,
+                        self.mb_metadata,
+                        self.last_rank_comm_queue,
+                    )
+                if self.server_args.pp_async_batch_depth == 0:
+                    next_pp_outputs, next_batch_result, d2h_event = (
+                        self._pp_commit_send_output_work_and_preprocess_output_tensors(
+                            next_first_rank_mb_id,
+                            next_mb_id,
+                        )
+                    )
+                if self.mbs[next_mb_id] is not None:
+                    d2h_event.synchronize()
+                    with torch.profiler.record_function("process_batch_result"):
+                        self._pp_process_batch_result(
+                            self.mbs[next_mb_id],
+                            next_batch_result,
+                        )
+                    self.last_mbs[next_mb_id] = self.mbs[next_mb_id]
+                if not self.pp_group.is_last_rank:
+                    if self.cur_batch:
+                        torch.cuda.current_stream().wait_event(self.launch_event)
+                        with torch.profiler.record_function(
+                            "send_proxy_dict_to_next_stage"
+                        ):
+                            self.send_proxy_work = self._pp_send_dict_to_next_stage(
+                                result.pp_hidden_states_proxy_tensors.tensors,
+                                async_send=True,
+                            )
+
+                self.pp_outputs = next_pp_outputs
+
+            # When the server is idle, self-check and re-init some states
+            if server_is_idle:
+                self.self_check_during_idle()
+
+    @DynamicGradMode()
+    def event_loop_pp_disagg_prefill(self: Scheduler):
+        """
+        This is the prefill server event loop for pipeline parallelism.
+
+        Notes:
+        1. Following the same rules as the event_loop_pp.
+        2. Adds extra steps for KV transfer process: bootstrap + release.
+
+        Prefill Server Schedule:
+        ====================================================================
+        Stage P
+        recv ith req from previous stage
+        recv ith bootstrap req from previous stage
+        recv ith transferred req from previous stage
+        recv ith proxy from previous stage
+        run ith batch
+        recv prev (i+1) % mb_size th consensus bootstrapped req from previous stage
+        local consensus on bootstrapped req
+        recv prev (i+1) % mb_size th release req from previous stage
+        local consensus on release req
+        recv prev (i+1) % mb_size th outputs
+        process batch result of prev (i+1)% mb_size th batch (can be run in parallel with the curr batch GPU computation)
+        send ith req to next stage
+        send ith bootstrap req to next stage
+        send ith transferred req to next stage
+        send ith proxy to next stage
+        send current stage's outputs to next stage (can be stashed and delayed to send later)
+
+        the above order can be optimized and reordered to minimize communication-related CPU stall and overhead bubbles.
+        ====================================================================
+
+        There are two additional elements compared to the regular schedule:
+
+        Bootstrap Requests + Release Requests:
+        - Both can have local failure and need to be consensus on. PP needs to guarantee eventual consistency of local failure and flush malfunc requests out as soft error.
+
+        """
+        self.init_pp_loop_state()
+
+        # PD additional state initialization
+        bmbs = [None] * self.pp_loop_size
+        tmbs = [None] * self.pp_loop_size
+        consensus_bootstrapped_rids: Optional[List[str]] = None
+        transferred_rids: List[str] = []
+        release_rids: Optional[List[str]] = None
+        send_bootstrapped_work = []
+        send_transfer_work = []
+        send_consensus_bootstrapped_work = []
+        send_release_work = []
+
+        while True:
+            server_is_idle = True
+            for mb_id in range(self.pp_loop_size):
+                self.running_batch = self.running_mbs[mb_id]
+                self.last_batch = self.last_mbs[mb_id]
+                next_first_rank_mb_id = (mb_id + self.pp_size) % self.pp_loop_size
+                next_mb_id = (mb_id + 1) % self.pp_loop_size
+
+                next_pp_outputs = None
+                next_release_rids = None
+                next_consensus_bootstrapped_rids = None
+                d2h_event = None
+                next_batch_result = None
+
+                recv_reqs = self.recv_requests()
+                self.process_input_requests(recv_reqs)
+
+                if not self.pp_group.is_last_rank:
+                    self._pp_commit_comm_work(self.send_req_work)
+
+                bootstrapped_rids = self._pp_pd_get_bootstrapped_ids()
+                bmbs[mb_id] = bootstrapped_rids
+                self._pp_commit_comm_work(send_bootstrapped_work)
+
+                transferred_rids = self._pp_pd_get_prefill_transferred_ids()
+                self._pp_commit_comm_work(send_transfer_work)
+                tmbs[mb_id] = transferred_rids
+
+                self.process_prefill_chunk()
+                batch = self.get_new_batch_prefill()
+                batch = self.maybe_prepare_mlp_sync_batch(batch)
+                self.mbs[mb_id] = batch
+                self.running_mbs[mb_id] = self.running_batch
+
+                self.cur_batch: Optional[ScheduleBatch] = self.mbs[mb_id]
+                if self.cur_batch:
+                    server_is_idle = False
+                    pp_proxy_tensors = self._pp_recv_proxy_tensors()
+
+                if self.server_args.pp_async_batch_depth > 0:
+                    next_pp_outputs, next_batch_result, d2h_event = (
+                        self._pp_commit_send_output_work_and_preprocess_output_tensors(
+                            next_first_rank_mb_id,
+                            next_mb_id,
+                        )
+                    )
+                self._pp_commit_comm_work(self.send_proxy_work)
+                if self.cur_batch:
+                    result, self.launch_event = self._pp_launch_batch(
+                        mb_id,
+                        pp_proxy_tensors,
+                        self.mb_metadata,
+                        self.last_rank_comm_queue,
+                    )
+                if self.server_args.pp_async_batch_depth == 0:
+                    next_pp_outputs, next_batch_result, d2h_event = (
+                        self._pp_commit_send_output_work_and_preprocess_output_tensors(
+                            next_first_rank_mb_id,
+                            next_mb_id,
+                        )
+                    )
+                send_consensus_bootstrapped_work, consensus_bootstrapped_rids = (
+                    self._pp_pd_send_consensus_bootstrapped_ids(
+                        bmbs,
+                        next_first_rank_mb_id,
+                        consensus_bootstrapped_rids,
+                        bootstrapped_rids,
+                    )
+                )
+                send_release_work, release_rids = (
+                    self._pp_pd_send_consensus_release_ids(
+                        tmbs, next_first_rank_mb_id, release_rids, transferred_rids
+                    )
+                )
+
+                if bmbs[next_mb_id] is not None:
+                    next_consensus_bootstrapped_rids = (
+                        self._pp_recv_pyobj_from_prev_stage()
+                    )
+                    next_consensus_bootstrapped_rids = self.process_bootstrapped_queue(
+                        next_consensus_bootstrapped_rids
+                    )
+                self._pp_commit_comm_work(send_consensus_bootstrapped_work)
+                if tmbs[next_mb_id] is not None:
+                    next_release_rids = self._pp_recv_pyobj_from_prev_stage()
+                self._pp_commit_comm_work(send_release_work)
+                # post-process the coming microbatch
+                if self.mbs[next_mb_id] is not None:
+                    d2h_event.synchronize()
+                    self._pp_process_batch_result(
+                        self.mbs[next_mb_id],
+                        next_batch_result,
+                    )
+                    self.last_mbs[next_mb_id] = self.mbs[next_mb_id]
+
+                if tmbs[next_mb_id] is not None:
+                    self.process_disagg_prefill_inflight_queue(next_release_rids)
+                if not self.pp_group.is_last_rank:
+                    self.send_req_work = self._pp_send_pyobj_to_next_stage(
+                        recv_reqs, async_send=True
+                    )
+                    send_bootstrapped_work = self._pp_send_pyobj_to_next_stage(
+                        bootstrapped_rids, async_send=True
+                    )
+                    send_transfer_work = self._pp_send_pyobj_to_next_stage(
+                        transferred_rids, async_send=True
+                    )
+                    if self.cur_batch:
+                        torch.cuda.current_stream().wait_event(self.launch_event)
+                        self.send_proxy_work = self._pp_send_dict_to_next_stage(
+                            result.pp_hidden_states_proxy_tensors.tensors,
+                            async_send=True,
+                        )
+
+                self.pp_outputs = next_pp_outputs
+                release_rids = next_release_rids
+                consensus_bootstrapped_rids = next_consensus_bootstrapped_rids
+
+                self.running_batch.batch_is_full = False
+
+            # When the server is idle, self-check and re-init some states
+            if server_is_idle and len(self.disagg_prefill_inflight_queue) == 0:
+                self.self_check_during_idle()
+
+    @DynamicGradMode()
+    def event_loop_pp_disagg_decode(self: Scheduler):
+        self.init_pp_loop_state()
+
+        # PD additional state initialization
+        rmbs = [None] * self.pp_loop_size
+        pmbs = [None] * self.pp_loop_size
+        tmbs = [None] * self.pp_loop_size
+        consensus_retract_rids: Optional[List[str]] = None
+        consensus_prealloc_rids: Optional[List[str]] = None
+        release_rids: Optional[List[str]] = None  # consensus transferred rids
+        send_retract_work = []
+        send_prealloc_work = []
+        send_transfer_work = []
+        send_consensus_retract_work = []
+        send_consensus_prealloc_work = []
+        send_release_work = []
+
+        while True:
+            server_is_idle = True
+            for mb_id in range(self.pp_loop_size):
+                self.running_batch = self.running_mbs[mb_id]
+                self.last_batch = self.last_mbs[mb_id]
+                next_first_rank_mb_id = (mb_id + self.pp_size) % self.pp_loop_size
+                next_mb_id = (mb_id + 1) % self.pp_loop_size
+
+                next_pp_outputs = None
+                next_consensus_retract_rids = None
+                next_consensus_prealloc_rids = None
+                next_release_rids = None
+                d2h_event = None
+                next_batch_result = None
+
+                recv_reqs = self.recv_requests()
+                self.process_input_requests(recv_reqs)
+
+                if not self.pp_group.is_last_rank:
+                    self._pp_commit_comm_work(self.send_req_work)
+
+                # reaching consensus through PP ranks
+                retract_rids = self._pp_pd_get_retract_ids(mb_id)
+                rmbs[mb_id] = retract_rids
+                self._pp_commit_comm_work(send_retract_work)
+
+                prealloc_rids = self._pp_pd_get_prealloc_ids()
+                pmbs[mb_id] = prealloc_rids
+                self._pp_commit_comm_work(send_prealloc_work)
+
+                transferred_rids = self._pp_pd_get_decode_transferred_ids()
+                tmbs[mb_id] = transferred_rids
+                self._pp_commit_comm_work(send_transfer_work)
+
+                # get batch to run and proxy tensors if needed
+                batch = self.get_next_disagg_decode_batch_to_run()
+                self.mbs[mb_id] = batch
+                self.running_mbs[mb_id] = self.running_batch
+
+                self.cur_batch: Optional[ScheduleBatch] = self.mbs[mb_id]
+                if self.cur_batch:
+                    server_is_idle = False
+                    pp_proxy_tensors = None
+                    if not self.cur_batch.forward_mode.is_prebuilt():
+                        pp_proxy_tensors = self._pp_recv_proxy_tensors()
+
+                # early send output if possible
+                if self.server_args.pp_async_batch_depth > 0:
+                    next_pp_outputs, next_batch_result, d2h_event = (
+                        self._pp_commit_send_output_work_and_preprocess_output_tensors(
+                            next_first_rank_mb_id,
+                            next_mb_id,
+                        )
+                    )
+                self._pp_commit_comm_work(self.send_proxy_work)
+
+                if self.cur_batch:
+                    result, self.launch_event = self._pp_launch_batch(
+                        mb_id,
+                        pp_proxy_tensors,
+                        self.mb_metadata,
+                        self.last_rank_comm_queue,
+                    )
+
+                if self.server_args.pp_async_batch_depth == 0:
+                    next_pp_outputs, next_batch_result, d2h_event = (
+                        self._pp_commit_send_output_work_and_preprocess_output_tensors(
+                            next_first_rank_mb_id,
+                            next_mb_id,
+                        )
+                    )
+
+                # reach consensus on last rank and send to PP=0
+                # otherwise, just pass along previous consensus
+                send_consensus_retract_work, consensus_retract_rids = (
+                    self._pp_pd_send_consensus_bootstrapped_ids(
+                        rmbs,
+                        next_first_rank_mb_id,
+                        consensus_retract_rids,
+                        retract_rids,
+                    )
+                )
+
+                send_consensus_prealloc_work, consensus_prealloc_rids = (
+                    self._pp_pd_send_consensus_bootstrapped_ids(
+                        pmbs,
+                        next_first_rank_mb_id,
+                        consensus_prealloc_rids,
+                        prealloc_rids,
+                    )
+                )
+
+                send_release_work, release_rids = (
+                    self._pp_pd_send_consensus_release_ids(
+                        tmbs, next_first_rank_mb_id, release_rids, transferred_rids
+                    )
+                )
+
+                if self.server_args.disaggregation_decode_enable_offload_kvcache:
+                    self.decode_offload_manager.check_offload_progress()
+
+                if rmbs[next_mb_id] is not None:
+                    next_consensus_retract_rids = self._pp_recv_pyobj_from_prev_stage()
+                    next_consensus_retract_rids = self.process_retract_queue(
+                        next_consensus_retract_rids
+                    )
+                self._pp_commit_comm_work(send_consensus_retract_work)
+
+                if pmbs[next_mb_id] is not None:
+                    next_consensus_prealloc_rids = self._pp_recv_pyobj_from_prev_stage()
+                    next_consensus_prealloc_rids = self.process_prealloc_queue(
+                        next_consensus_prealloc_rids
+                    )
+                self._pp_commit_comm_work(send_consensus_prealloc_work)
+
+                if tmbs[next_mb_id] is not None:
+                    next_release_rids = self._pp_recv_pyobj_from_prev_stage()
+                    next_release_rids = self.process_decode_transfer_queue(
+                        next_release_rids
+                    )
+                self._pp_commit_comm_work(send_release_work)
+
+                # post-process the coming microbatch
+                if self.mbs[next_mb_id] is not None:
+                    if not self.mbs[next_mb_id].forward_mode.is_prebuilt():
+                        d2h_event.synchronize()
+                        self._pp_process_batch_result(
+                            self.mbs[next_mb_id],
+                            next_batch_result,
+                        )
+                    self.last_mbs[next_mb_id] = self.mbs[next_mb_id]
+
+                if not self.pp_group.is_last_rank:
+                    self.send_req_work = self._pp_send_pyobj_to_next_stage(
+                        recv_reqs, async_send=True
+                    )
+                    send_retract_work = self._pp_send_pyobj_to_next_stage(
+                        retract_rids, async_send=True
+                    )
+                    send_prealloc_work = self._pp_send_pyobj_to_next_stage(
+                        prealloc_rids, async_send=True
+                    )
+                    send_transfer_work = self._pp_send_pyobj_to_next_stage(
+                        transferred_rids, async_send=True
+                    )
+                    if self.cur_batch and not self.cur_batch.forward_mode.is_prebuilt():
+                        torch.cuda.current_stream().wait_event(self.launch_event)
+                        self.send_proxy_work = self._pp_send_dict_to_next_stage(
+                            result.pp_hidden_states_proxy_tensors.tensors,
+                            async_send=True,
+                        )
+
+                self.pp_outputs = next_pp_outputs
+                release_rids = next_release_rids
+                consensus_retract_rids = next_consensus_retract_rids
+                consensus_prealloc_rids = next_consensus_prealloc_rids
+
+                self.running_batch.batch_is_full = False
+
+            # When the server is idle, self-check and re-init some states
+            queue_size = (
+                len(self.waiting_queue)
+                + len(self.disagg_decode_transfer_queue.queue)
+                + len(self.disagg_decode_prealloc_queue.queue)
+            )
+            if self.server_args.disaggregation_decode_enable_offload_kvcache:
+                queue_size += len(self.decode_offload_manager.ongoing_offload)
+
+            if server_is_idle and queue_size == 0:
+                self.self_check_during_idle()
+
+    def init_pp_loop_state(self: Scheduler):
+        self.pp_loop_size: int = self.pp_size + self.server_args.pp_async_batch_depth
+        # In CP mode, attention weights are duplicated, eliminating the need for the attention TP all-gather operation.
+        self.require_attn_tp_allgather = (
+            not self.server_args.enable_nsa_prefill_context_parallel
+        )
+        self.mbs = [None] * self.pp_loop_size
+        self.last_mbs = [None] * self.pp_loop_size
+        self.running_mbs = [
+            ScheduleBatch(reqs=[], batch_is_full=False)
+            for _ in range(self.pp_loop_size)
+        ]
+        self.mb_metadata: List[Optional[PPBatchMetadata]] = [None] * self.pp_loop_size
+        self.pp_outputs: Optional[PPProxyTensors] = None
+        self.last_rank_comm_queue: deque[Tuple[torch.cuda.Event, PPProxyTensors]] = (
+            deque()
+        )
+
+        self.send_req_work = []
+        self.send_proxy_work = []
+        self.send_output_work = []
+        self.launch_event = None
+
+    def profile_and_init_predictor(self: Scheduler):
+        """
+        Profile prefill latency for dynamic chunk sizing.
+
+        Only runs on PP0 (first rank), then broadcasts data to all ranks.
+        All ranks fit coefficients using the same data.
+        """
+        seq_lens: List[int] = []
+        latencies: List[float] = []
+
+        if self.pp_group.is_first_rank:
+            model_runner = self.tp_worker.model_runner
+            model_config = model_runner.model_config
+            input_ids_list = []
+            for i in range(128):
+                chunk_size = int(
+                    self.chunked_prefill_size * 1.25
+                    - i * (self.chunked_prefill_size * 1.25 // 128)
+                )
+                if chunk_size <= 0:
+                    break
+                input_ids = np.random.randint(
+                    0, 10000, size=chunk_size, dtype=np.int64
+                ).tolist()
+                input_ids_list.append(input_ids)
+
+            sampling_params = SamplingParams(
+                temperature=0,
+                max_new_tokens=1,
+            )
+            # Create and profile requests
+            for i, input_ids in enumerate(
+                tqdm(
+                    input_ids_list,
+                    desc="Profiling prefill latency for dynamic chunking",
+                )
+            ):
+                req = Req(
+                    rid=str(i),
+                    origin_input_text="",
+                    origin_input_ids=input_ids,
+                    sampling_params=sampling_params,
+                )
+                req.fill_ids = req.origin_input_ids
+                req.logprob_start_len = -1
+                req.set_extend_input_len(len(req.fill_ids) - len(req.prefix_indices))
+
+                # Prepare batch
+                batch = ScheduleBatch.init_new(
+                    [req],
+                    self.req_to_token_pool,
+                    self.token_to_kv_pool_allocator,
+                    self.tree_cache,
+                    self.model_config,
+                    False,
+                    self.spec_algorithm,
+                )
+
+                current_seq_len = len(req.fill_ids)
+
+                if is_dp_attention_enabled():
+                    # For profiling, we only have one request on PP0
+                    # Set global_num_tokens to indicate this rank has tokens, others have 0
+                    dp_size = get_attention_dp_size()
+                    global_num_tokens = [0] * dp_size
+                    dp_rank = get_attention_dp_rank()
+                    global_num_tokens[dp_rank] = current_seq_len
+                    batch.global_num_tokens = global_num_tokens
+                    batch.global_num_tokens_for_logprob = global_num_tokens
+
+                proxy_tensors = {
+                    "hidden_states": torch.zeros(
+                        (current_seq_len, model_config.hidden_size),
+                        dtype=model_config.dtype,
+                        device="cuda",
+                    ),
+                    "residual": torch.zeros(
+                        (current_seq_len, model_config.hidden_size),
+                        dtype=model_config.dtype,
+                        device="cuda",
+                    ),
+                }
+
+                pp_proxy = PPProxyTensors(proxy_tensors)
+
+                # Measure latency with CUDA synchronization for accurate timing
+                # Synchronize before starting timing to ensure clean measurement
+                if torch.cuda.is_available():
+                    torch.cuda.synchronize()
+
+                start = time.perf_counter()
+                batch.prepare_for_extend()
+                model_worker_batch = batch.get_model_worker_batch()
+
+                forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
+                _ = model_runner.forward(
+                    forward_batch=forward_batch, pp_proxy_tensors=pp_proxy
+                )
+
+                # Synchronize after forward to ensure GPU operations complete
+                if torch.cuda.is_available():
+                    torch.cuda.synchronize()
+
+                latency_seconds = time.perf_counter() - start
+                latency_ms = latency_seconds * 1e3  # Convert to milliseconds
+                seq_lens.append(len(input_ids))
+                latencies.append(latency_ms)
+
+                # Release KV cache
+                if req.req_pool_idx is not None:
+                    kv_indices = self.req_to_token_pool.req_to_token[
+                        req.req_pool_idx, : len(req.fill_ids)
+                    ]
+                    self.token_to_kv_pool_allocator.free(kv_indices)
+                    self.req_to_token_pool.free(req)
+
+            logger.info(
+                f"[PP Dynamic Chunk] [PP0] Profiled {len(seq_lens)} samples: "
+                f"seq_lens={seq_lens}, latencies_ms={latencies}"
+            )
+
+            if self.attn_tp_size > 1:
+                data_to_sync_tp = [seq_lens, latencies]
+                data_to_sync_tp = broadcast_pyobj(
+                    data_to_sync_tp,
+                    self.attn_tp_group.rank,
+                    self.attn_tp_cpu_group,
+                    src=self.attn_tp_group.ranks[0],
+                )
+                seq_lens, latencies = data_to_sync_tp
+
+        # Broadcast data to all ranks
+        if torch.distributed.is_available() and torch.distributed.is_initialized():
+            data_to_sync = [seq_lens, latencies]
+            self.pp_group.broadcast_object_list(data_to_sync, src=0)
+            seq_lens, latencies = data_to_sync
+
+        # Quadratic model: f(l) = al^2 + bl + c
+        self.length_predictor = ChunkSizePredictor()
+        self.length_predictor.fit(seq_lens, latencies)
+        self.length_predictor.set_target_latency(self.chunked_prefill_size)
+        self.length_predictor.is_ready = True
+        logger.info(
+            f"[PP Dynamic Chunk] [PP{self.pp_rank}] Predictor ready (quadratic). "
+            f"Target latency: {self.length_predictor.target_latency:.2f}ms"
+        )
+
+    def predict_next_chunk_size(self: Scheduler, history_len: int) -> Optional[int]:
+        """
+        Predict next chunk size dynamically based on current history length.
+
+        Args:
+            history_len: Current sequence length
+
+        Returns:
+            Predicted chunk size, or None to use default chunked_prefill_size
+        """
+        if (
+            not self.enable_dynamic_chunking
+            or self.length_predictor is None
+            or not self.length_predictor.is_ready
+        ):
+            return None
+
+        max_chunk_size = getattr(self, "max_prefill_tokens", None)
+        predicted_size = self.length_predictor.predict_next_chunk_size(
+            history_len=history_len,
+            base_chunk_size=self.chunked_prefill_size,
+            page_size=self.page_size,
+            context_len=self.model_config.context_len,
+            max_chunk_size=max_chunk_size,
+        )
+
+        if predicted_size is not None:
+            logger.debug(
+                f"[PP Dynamic Chunk] [PP{self.pp_rank}] Predicted chunk size: "
+                f"{predicted_size} (history_len={history_len})"
+            )
+
+        return predicted_size
+
+    def process_bootstrapped_queue(
+        self: Scheduler, bootstrapped_rids: Optional[List[str]]
+    ):
+        # finished consensus bootstrapped reqs and prepare the waiting queue
+        if bootstrapped_rids is not None:
+            (
+                good_consensus_bootstrapped_rids,
+                bad_consensus_bootstrapped_rids,
+            ) = bootstrapped_rids
+            good_reqs, failed_reqs = (
+                self.disagg_prefill_bootstrap_queue.pop_bootstrapped(
+                    return_failed_reqs=True,
+                    rids_to_check=good_consensus_bootstrapped_rids
+                    + bad_consensus_bootstrapped_rids,
+                )
+            )
+            self.waiting_queue.extend(good_reqs)
+            return [[req.rid for req in good_reqs], [req.rid for req in failed_reqs]]
+        return None
+
+    def _pp_pd_get_bootstrapped_ids(self: Scheduler):
+        # communicate pre-consensus bootstrapp reqs
+        if self.pp_group.is_first_rank:
+            # First rank, pop the bootstrap reqs from the bootstrap queue
+            good_bootstrapped_rids, bad_bootstrapped_rids = self.get_rids(
+                self.disagg_prefill_bootstrap_queue.queue,
+                True,
+                [KVPoll.WaitingForInput],
+                [KVPoll.Failed],
+            )
+        else:
+            # Other ranks, receive the bootstrap reqs info from the previous rank and ensure the consensus
+            prev_bootstrapped_rids = self._pp_recv_pyobj_from_prev_stage()
+            prev_good_bootstrapped_rids, prev_bad_bootstrapped_rids = (
+                prev_bootstrapped_rids
+            )
+            curr_good_bootstrapped_rids, curr_bad_bootstrapped_rids = self.get_rids(
+                self.disagg_prefill_bootstrap_queue.queue,
+                True,
+                [KVPoll.WaitingForInput],
+                [KVPoll.Failed],
+            )
+            good_bootstrapped_rids = list(
+                set(prev_good_bootstrapped_rids) & set(curr_good_bootstrapped_rids)
+            )
+            bad_bootstrapped_rids = list(
+                set(prev_bad_bootstrapped_rids) | set(curr_bad_bootstrapped_rids)
+            )
+        return [good_bootstrapped_rids, bad_bootstrapped_rids]
+
+    def _pp_pd_get_prefill_transferred_ids(self: Scheduler):
+        # get the current stage transfer success
+        if self.pp_group.is_first_rank:
+            transferred_rids = self.get_rids(
+                self.disagg_prefill_inflight_queue,
+                True,
+                [KVPoll.Success, KVPoll.Failed],
+            )
+        # if other ranks, do intersection with the previous rank's transferred rids
+        else:
+            # 2 (Release): Receive the transferred rids from the previous rank
+            # 1. recv previous stage's transferred reqs info
+            prev_transferred_rids = self._pp_recv_pyobj_from_prev_stage()
+            # 2. get the current stage's transferred reqs info
+            curr_transferred_rids = self.get_rids(
+                self.disagg_prefill_inflight_queue,
+                True,
+                [KVPoll.Success, KVPoll.Failed],
+            )
+            # 3. new consensus rids = intersection(previous consensus rids, transfer finished rids)
+            transferred_rids = list(
+                set(prev_transferred_rids) & set(curr_transferred_rids)
+            )
+        return transferred_rids
+
+    def _pp_pd_send_consensus_bootstrapped_ids(
+        self: Scheduler,
+        bmbs: List[List[str]],
+        next_first_rank_mb_id: int,
+        consensus_bootstrapped_rids: List[str],
+        bootstrapped_rids: List[str],
+    ):
+        # 3 (Release): send the release rids from last stage to the first stage
+        send_consensus_bootstrapped_work = []
+        if self.pp_group.is_last_rank:
+            if bmbs[next_first_rank_mb_id] is not None:
+                consensus_bootstrapped_rids = bootstrapped_rids
+                send_consensus_bootstrapped_work = self._pp_send_pyobj_to_next_stage(
+                    consensus_bootstrapped_rids, async_send=True
+                )
+        # 4 (Release): send the release rids from non last rank to the next rank
+        else:
+            if consensus_bootstrapped_rids is not None:
+                send_consensus_bootstrapped_work = self._pp_send_pyobj_to_next_stage(
+                    consensus_bootstrapped_rids, async_send=True
+                )
+        return send_consensus_bootstrapped_work, consensus_bootstrapped_rids
+
+    def _pp_pd_send_consensus_release_ids(
+        self: Scheduler,
+        tmbs: List[List[str]],
+        next_first_rank_mb_id: int,
+        release_rids: List[str],
+        transferred_rids: List[str],
+    ):
+        send_release_work = []
+        if self.pp_group.is_last_rank:
+            if tmbs[next_first_rank_mb_id] is not None:
+                release_rids = transferred_rids
+                send_release_work = self._pp_send_pyobj_to_next_stage(
+                    release_rids, async_send=True
+                )
+        # 4 (Release): send the release rids from non last rank to the next rank
+        else:
+            if release_rids is not None:
+                send_release_work = self._pp_send_pyobj_to_next_stage(
+                    release_rids, async_send=True
+                )
+        return send_release_work, release_rids
+
+    def _pp_commit_comm_work(self: Scheduler, work: List[P2PWork]) -> None:
+        for p2p_work in work:
+            p2p_work.work.wait()
+        work.clear()
+
+    def _pp_commit_send_output_work_and_preprocess_output_tensors(
+        self: Scheduler,
+        next_first_rank_mb_id: int,
+        next_mb_id: int,
+    ) -> Tuple[PPProxyTensors, GenerationBatchResult, torch.cuda.Event]:
+        self._pp_commit_comm_work(work=self.send_output_work)
+        (
+            next_pp_outputs,
+            next_batch_result,
+            d2h_event,
+            self.send_output_work,
+        ) = self._pp_send_recv_and_preprocess_output_tensors(
+            next_first_rank_mb_id,
+            next_mb_id,
+            self.mbs,
+            self.mb_metadata,
+            self.last_rank_comm_queue,
+            self.pp_outputs,
+        )
+        return next_pp_outputs, next_batch_result, d2h_event
+
+    def _pp_send_pyobj_to_next_stage(self: Scheduler, data, async_send: bool = False):
+        p2p_work = []
+        if self.attn_tp_rank == 0:
+            dp_offset = self.attn_dp_rank * self.attn_tp_size
+            p2p_work = point_to_point_pyobj(
+                data,
+                self.pp_rank * self.tp_size + dp_offset,
+                self.world_group.cpu_group,
+                self.pp_rank * self.tp_size + dp_offset,
+                ((self.pp_rank + 1) % self.pp_size) * self.tp_size + dp_offset,
+                async_send=async_send,
+            )
+        return p2p_work
+
+    def _pp_recv_pyobj_from_prev_stage(self: Scheduler):
+        if self.attn_tp_rank == 0:
+            dp_offset = self.attn_dp_rank * self.attn_tp_size
+            data = point_to_point_pyobj(
+                [],
+                self.pp_rank * self.tp_size + dp_offset,
+                self.world_group.cpu_group,
+                ((self.pp_rank - 1) % self.pp_size) * self.tp_size + dp_offset,
+                self.pp_rank * self.tp_size + dp_offset,
+            )
+        else:
+            data = None
+
+        if self.attn_tp_size > 1:
+            data = broadcast_pyobj(
+                data,
+                self.attn_tp_group.rank,
+                self.attn_tp_cpu_group,
+                src=self.attn_tp_group.ranks[0],
+            )
+
+        return data
+
+    def _pp_prepare_tensor_dict(
+        self: Scheduler, result: GenerationBatchResult, batch: ScheduleBatch
+    ) -> Dict[str, torch.Tensor]:
+        tensor_dict = {
+            "next_token_ids": result.next_token_ids,
+        }
+
+        if batch.return_logprob:
+            logprob_dict = get_logprob_dict_from_result(result)
+            tensor_dict = {
+                **tensor_dict,
+                **logprob_dict,
+            }
+        return tensor_dict
+
+    def _pp_send_dict_to_next_stage(
+        self: Scheduler,
+        tensor_dict: Dict[str, torch.Tensor],
+        async_send: bool = True,
+    ):
+        p2p_work = []
+        p2p_work.extend(
+            self.pp_group.send_tensor_dict(
+                tensor_dict=tensor_dict,
+                all_gather_group=(
+                    self.attn_tp_group if self.require_attn_tp_allgather else None
+                ),
+                async_send=async_send,
+            )
+        )
+        return p2p_work
+
+    def _pp_recv_proxy_tensors(self: Scheduler) -> Optional[PPProxyTensors]:
+        pp_proxy_tensors = None
+        if not self.pp_group.is_first_rank:
+            pp_proxy_tensors = PPProxyTensors(
+                self.pp_group.recv_tensor_dict(
+                    all_gather_group=(
+                        self.attn_tp_group if self.require_attn_tp_allgather else None
+                    )
+                )
+            )
+        return pp_proxy_tensors
+
+    def _pp_recv_dict_from_prev_stage(
+        self: Scheduler,
+    ) -> Dict[str, torch.Tensor]:
+        res = self.pp_group.recv_tensor_dict(
+            all_gather_group=(
+                self.attn_tp_group if self.require_attn_tp_allgather else None
+            ),
+        )
+        return res
+
+    def _pp_prep_batch_result(
+        self: Scheduler,
+        batch: ScheduleBatch,
+        mb_metadata: PPBatchMetadata,
+        pp_outputs: PPProxyTensors,
+    ):
+        from sglang.srt.managers.scheduler import GenerationBatchResult
+
+        logits_output = None
+        extend_input_len_per_req = None
+        extend_logprob_start_len_per_req = None
+
+        if batch.return_logprob:
+            (
+                logits_output,
+                extend_input_len_per_req,
+                extend_logprob_start_len_per_req,
+            ) = get_logprob_from_pp_outputs(pp_outputs)
+        batch.output_ids = pp_outputs["next_token_ids"]
+        output_result = GenerationBatchResult(
+            logits_output=logits_output,
+            pp_hidden_states_proxy_tensors=None,
+            next_token_ids=pp_outputs["next_token_ids"],
+            extend_input_len_per_req=extend_input_len_per_req,
+            extend_logprob_start_len_per_req=extend_logprob_start_len_per_req,
+            can_run_cuda_graph=mb_metadata.can_run_cuda_graph,
+        )
+        return output_result
+
+    def _pp_process_batch_result(
+        self: Scheduler, batch: ScheduleBatch, output_result: GenerationBatchResult
+    ):
+        self.process_batch_result(batch, output_result)
+
+    def _pp_send_output_to_next_stage(
+        self: Scheduler,
+        next_first_rank_mb_id: int,
+        mbs: List[ScheduleBatch],
+        last_rank_comm_queue: deque[Tuple[torch.cuda.Event, PPProxyTensors]],
+        pp_outputs: PPProxyTensors | None,
+    ) -> List[P2PWork]:
+        send_output_work = []
+        if self.pp_group.is_last_rank:
+            # send ready PP output to rank 0
+            if mbs[next_first_rank_mb_id] is not None:
+                q_event, pp_outputs_to_send = last_rank_comm_queue.popleft()
+                if not mbs[next_first_rank_mb_id].forward_mode.is_prebuilt():
+                    torch.cuda.current_stream().wait_event(q_event)
+                    with torch.profiler.record_function("send_res_dict_to_next_stage"):
+                        send_output_work = self._pp_send_dict_to_next_stage(
+                            pp_outputs_to_send.tensors,
+                            async_send=True,
+                        )
+        # send the outputs from the last round to let the next stage worker run post processing
+        if not self.pp_group.is_last_rank:
+            if pp_outputs:
+                with torch.profiler.record_function("send_res_dict_to_next_stage"):
+                    send_output_work = self._pp_send_dict_to_next_stage(
+                        pp_outputs.tensors,
+                        async_send=True,
+                    )
+        return send_output_work
+
+    def _pp_send_recv_and_preprocess_output_tensors(
+        self: Scheduler,
+        next_first_rank_mb_id: int,
+        next_mb_id: int,
+        mbs: List[ScheduleBatch],
+        mb_metadata: List[PPBatchMetadata],
+        last_rank_comm_queue: deque[Tuple[torch.cuda.Event, PPProxyTensors]],
+        pp_outputs: PPProxyTensors | None,
+    ) -> Tuple[PPProxyTensors, List[P2PWork], torch.cuda.Event]:
+        next_pp_outputs = None
+        d2h_event = None
+        batch_result = None
+        send_output_work = self._pp_send_output_to_next_stage(
+            next_first_rank_mb_id,
+            mbs,
+            last_rank_comm_queue,
+            pp_outputs,
+        )
+
+        if mbs[next_mb_id] is not None:
+            with torch.profiler.record_function("recv_res_dict_from_prev_stage"):
+                next_pp_outputs = None
+                if not mbs[next_mb_id].forward_mode.is_prebuilt():
+                    next_pp_outputs = PPProxyTensors(
+                        self._pp_recv_dict_from_prev_stage()
+                    )
+            if not mbs[next_mb_id].forward_mode.is_prebuilt():
+                with self.copy_stream_ctx:
+                    self.copy_stream.wait_stream(self.schedule_stream)
+                    batch_result = self._pp_prep_batch_result(
+                        mbs[next_mb_id], mb_metadata[next_mb_id], next_pp_outputs
+                    )
+                    d2h_event = torch.cuda.Event()
+                    d2h_event.record(torch.cuda.current_stream())
+
+        return next_pp_outputs, batch_result, d2h_event, send_output_work
+
+    def _pp_launch_batch(
+        self: Scheduler,
+        mb_id: int,
+        pp_proxy_tensors: PPProxyTensors,
+        mb_metadata: List[Optional[PPBatchMetadata]],
+        last_rank_comm_queue: deque[Tuple[torch.cuda.Event, PPProxyTensors]],
+    ):
+        with torch.profiler.record_function("run_batch"):
+            with self.forward_stream_ctx:
+                self.forward_stream.wait_stream(self.schedule_stream)
+                result = self.run_batch(self.cur_batch, pp_proxy_tensors)
+                mb_metadata[mb_id] = PPBatchMetadata(
+                    can_run_cuda_graph=result.can_run_cuda_graph,
+                )
+                event = torch.cuda.Event()
+                event.record(torch.cuda.current_stream())
+                if self.pp_group.is_last_rank:
+                    # (last rank) buffer the outputs for async batch depth
+                    last_rank_comm_queue.append(
+                        (
+                            event,
+                            PPProxyTensors(
+                                self._pp_prepare_tensor_dict(result, self.cur_batch)
+                            ),
+                        )
+                    )
+        return result, event
+
+    def get_rids(
+        self: Scheduler, req_queue: List[Req], is_send: bool, *poll_statuses_group
+    ):
+        """
+        Used by PP, get the required rids with the given poll statuses.
+        """
+        polls = poll_and_all_reduce_attn_cp_tp_group(
+            [req.disagg_kv_sender if is_send else req.kv_receiver for req in req_queue],
+            self.attn_cp_cpu_group,
+            self.attn_tp_cpu_group,
+        )
+        rids: List = []
+        for poll_statuses in poll_statuses_group:
+            rids.append(
+                [
+                    req.rid if is_send else req.req.rid
+                    for req, poll in zip(req_queue, polls)
+                    if poll in poll_statuses
+                ]
+            )
+        return tuple(rids) if len(rids) > 1 else rids[0]
+
+    def _pp_pd_get_retract_ids(self: Scheduler, mb_id: int):
+        # communicate pre-consensus retracted reqs
+        for req in self.disagg_decode_prealloc_queue.retracted_queue:
+            # assign retracted reqs to the current microbatch
+            if req.retraction_mb_id is None:
+                req.retraction_mb_id = mb_id
+        curr_retract_rids = [
+            req.rid
+            for req in self.disagg_decode_prealloc_queue.retracted_queue
+            if req.retraction_mb_id == mb_id
+        ]
+        if self.pp_group.is_first_rank:
+            # First rank, get all retracted req ids for the microbatch
+            return curr_retract_rids
+        else:
+            # Other ranks, receive the retracted reqs info from the previous rank and ensure the consensus
+            prev_retract_rids = self._pp_recv_pyobj_from_prev_stage()
+            return list(set(prev_retract_rids) & set(curr_retract_rids))
+
+    def _pp_pd_get_prealloc_ids(self: Scheduler):
+        # communicate pre-consensus prealloc reqs
+        if self.pp_group.is_first_rank:
+            # First rank, pop the preallocated reqs from the prealloc queue
+            good_prealloc_rids, bad_prealloc_rids = self.get_rids(
+                self.disagg_decode_prealloc_queue.queue,
+                False,
+                [KVPoll.WaitingForInput],
+                [KVPoll.Failed],
+            )
+        else:
+            # Other ranks, receive the preallocated reqs info from the previous rank and ensure the consensus
+            prev_prealloc_rids = self._pp_recv_pyobj_from_prev_stage()
+            prev_good_prealloc_rids, prev_bad_prealloc_rids = prev_prealloc_rids
+            curr_good_prealloc_rids, curr_bad_prealloc_rids = self.get_rids(
+                self.disagg_decode_prealloc_queue.queue,
+                False,
+                [KVPoll.WaitingForInput],
+                [KVPoll.Failed],
+            )
+            good_prealloc_rids = list(
+                set(prev_good_prealloc_rids) & set(curr_good_prealloc_rids)
+            )
+            bad_prealloc_rids = list(
+                set(prev_bad_prealloc_rids) | set(curr_bad_prealloc_rids)
+            )
+        return [good_prealloc_rids, bad_prealloc_rids]
+
+    def _pp_pd_get_decode_transferred_ids(self: Scheduler):
+        # get the current stage transfer success
+        if self.pp_group.is_first_rank:
+            transferred_rids = self.get_rids(
+                self.disagg_decode_transfer_queue.queue,
+                False,
+                [KVPoll.Success, KVPoll.Failed],
+            )
+        # if other ranks, do intersection with the previous rank's transferred rids
+        else:
+            # 2 (Release): Receive the transferred rids from the previous rank
+            # 1. recv previous stage's transferred reqs info
+            prev_transferred_rids = self._pp_recv_pyobj_from_prev_stage()
+            # 2. get the current stage's transferred reqs info
+            curr_transferred_rids = self.get_rids(
+                self.disagg_decode_transfer_queue.queue,
+                False,
+                [KVPoll.Success, KVPoll.Failed],
+            )
+            # 3. new consensus rids = intersection(previous consensus rids, transfer finished rids)
+            transferred_rids = list(
+                set(prev_transferred_rids) & set(curr_transferred_rids)
+            )
+        return transferred_rids
+
+    def process_retract_queue(self: Scheduler, retract_rids: Optional[List[str]]):
+        if retract_rids is not None:
+            # try to resume retracted requests if there are enough space for another `num_reserved_decode_tokens` decode steps
+            resumed_reqs = self.disagg_decode_prealloc_queue.resume_retracted_reqs(
+                retract_rids
+            )
+            self.waiting_queue.extend(resumed_reqs)
+            return [req.rid for req in resumed_reqs]
+        return None
+
+    def process_prealloc_queue(self: Scheduler, prealloc_rids: Optional[List[str]]):
+        if len(self.disagg_decode_prealloc_queue.retracted_queue) > 0:
+            # if there are still retracted requests, we do not allocate new requests
+            return [[], []]
+
+        if prealloc_rids is not None:
+            (
+                good_consensus_prealloc_rids,
+                bad_consensus_prealloc_rids,
+            ) = prealloc_rids
+            good_reqs, failed_reqs = self.disagg_decode_prealloc_queue.pop_preallocated(
+                rids_to_check=good_consensus_prealloc_rids
+                + bad_consensus_prealloc_rids,
+            )
+            self.disagg_decode_transfer_queue.extend(good_reqs)
+            return [
+                [req.req.rid for req in good_reqs],
+                [req.req.rid for req in failed_reqs],
+            ]
+        return None
+
+    def process_decode_transfer_queue(
+        self: Scheduler, release_rids: Optional[List[str]]
+    ):
+        if release_rids is not None:
+            released_reqs = self.disagg_decode_transfer_queue.pop_transferred(
+                release_rids
+            )
+            self.waiting_queue.extend(released_reqs)
+            return [req.rid for req in released_reqs]
+        return None
+
+
+class ChunkSizePredictor:
+    """
+    Predictor for dynamic chunk size based on quadratic latency model.
+
+    Models latency as: f(l) = a*l^2 + b*l + c
+    Predicts next chunk size x such that: f(L+x) - f(L) = target_latency
+    """
+
+    def __init__(self):
+        self.quadratic_coeff_a = 0.0
+        self.linear_coeff_b = 0.0
+        self.constant_coeff_c = 0.0
+        self.target_latency: Optional[float] = None
+        self.is_ready = False
+
+    def fit(self, seq_lens: List[int], latencies: List[float]):
+        """Fit quadratic coefficients f(l) = al^2 + bl + c from data points."""
+        # Skip the first data point to reduce fitting bias, as the first run is slower without warmup
+        L = np.array(seq_lens[1:], dtype=np.float64)
+        T = np.array(latencies[1:], dtype=np.float64)
+
+        if len(L) < 8:
+            raise ValueError(
+                f"Not enough data points for quadratic fitting ({len(L)} < 8). "
+                "Need at least 8 samples with different sequence lengths."
+            )
+
+        # Build design matrix for f(l) = al^2 + bl + c
+        X = np.column_stack([L * L, L, np.ones_like(L)])  # [l^2, l, 1]
+
+        try:
+            coeffs, residuals, rank, s = np.linalg.lstsq(X, T, rcond=None)
+            if len(coeffs) >= 3:
+                fitted_a = float(coeffs[0])  # quadratic coefficient
+                fitted_b = float(coeffs[1])  # linear coefficient
+                fitted_c = float(coeffs[2])  # constant coefficient
+            else:
+                raise ValueError("Failed to fit coefficients: insufficient rank")
+        except np.linalg.LinAlgError as e:
+            raise ValueError(f"Failed to fit f(l) = al^2 + bl + c: {e}")
+
+        # Validate coefficients
+        if fitted_a <= 0:
+            raise ValueError(
+                f"Fitted quadratic coefficient a={fitted_a:.2e} is not positive. "
+                "Attention has O(n^2) complexity, so a must be positive. "
+                "Check warmup data quality."
+            )
+
+        if fitted_b < 0:
+            logger.warning(
+                f"Fitted linear coefficient b={fitted_b:.2e} is negative. Setting b=0."
+            )
+            fitted_b = 0.0
+
+        self.quadratic_coeff_a = fitted_a
+        self.linear_coeff_b = fitted_b
+        self.constant_coeff_c = fitted_c
+
+        logger.info(
+            f"[ChunkSizePredictor] Fitted coefficients: a={fitted_a:.2e}, "
+            f"b={fitted_b:.2e}, c={fitted_c:.2e}"
+        )
+
+    def set_target_latency(self, base_chunk_size: int):
+        """Set target latency based on base chunk size: target = f(base_chunk_size) - f(0)."""
+
+        def f(l: float) -> float:
+            """Total latency function: f(l) = al^2 + bl + c (or bl + c for linear)"""
+            return (
+                self.quadratic_coeff_a * l * l
+                + self.linear_coeff_b * l
+                + self.constant_coeff_c
+            )
+
+        self.target_latency = f(float(base_chunk_size)) - f(0.0)
+
+        if self.target_latency <= 0:
+            raise ValueError(
+                f"Calculated target_latency={self.target_latency:.2f}ms is not positive. "
+                "Check warmup data quality."
+            )
+
+        logger.info(
+            f"[ChunkSizePredictor] Target latency: {self.target_latency:.2f}ms "
+            f"(base_chunk_size={base_chunk_size})"
+        )
+
+    def predict_next_chunk_size(
+        self,
+        history_len: int,
+        base_chunk_size: int,
+        page_size: int,
+        context_len: int,
+        max_chunk_size: Optional[int] = None,
+    ) -> Optional[int]:
+        """
+        Predict next chunk size x such that f(history_len + x) - f(history_len) = target_latency.
+
+        Args:
+            history_len: Current sequence length (L)
+            base_chunk_size: Base chunk size
+            page_size: Page size for alignment
+            context_len: Maximum context length
+            max_chunk_size: Maximum allowed chunk size (optional)
+
+        Returns:
+            Predicted chunk size, or None if prediction fails
+        """
+        if not self.is_ready or self.target_latency is None:
+            return None
+
+        # Handle quadratic model: f(l) = al^2 + bl + c
+        if self.quadratic_coeff_a <= 0:
+            return None
+
+        # Solve f(L+x) - f(L) = T
+        # where f(L) = a*L^2 + b*L + c
+        # This expands to: ax^2 + (2aL+b)x - T = 0
+        # A = a, B = 2aL + b, C = -T
+        A = self.quadratic_coeff_a
+        B = 2 * self.quadratic_coeff_a * history_len + self.linear_coeff_b
+        C = -self.target_latency
+
+        discriminant = B * B - 4 * A * C
+
+        if discriminant < 0:
+            logger.warning(
+                f"Discriminant is negative ({discriminant:.2e}). "
+                f"No real solution for chunk size. L={history_len}, T={self.target_latency:.2f}ms."
+            )
+            return None
+
+        sqrt_discriminant = math.sqrt(discriminant)
+        calculated_chunk_size_float = (-B + sqrt_discriminant) / (2 * A)
+
+        if calculated_chunk_size_float <= 0:
+            logger.warning(
+                f"Calculated chunk size is non-positive ({calculated_chunk_size_float:.2f}). "
+                f"L={history_len}, T={self.target_latency:.2f}ms."
+            )
+            return None
+
+        # Use a smooth coefficient to reduce the abrupt decrease in chunk size
+        smooth_coeff = envs.SGLANG_DYNAMIC_CHUNKING_SMOOTH_FACTOR.get()
+        smoothed_chunk_size = base_chunk_size + smooth_coeff * (
+            calculated_chunk_size_float - base_chunk_size
+        )
+        # Make sure the dynamic chunk size is at least 1/4 of the base chunk size
+        calculated_chunk_size = max(int(smoothed_chunk_size), base_chunk_size // 4)
+
+        # Align to page_size (minimum alignment size is 64)
+        alignment_size = max(page_size, 64)
+        dynamic_chunk_size = (calculated_chunk_size // alignment_size) * alignment_size
+
+        # Ensure aligned size is at least alignment_size
+        if dynamic_chunk_size < alignment_size:
+            dynamic_chunk_size = alignment_size
+
+        # Apply constraints
+        max_allowed = context_len - history_len - 100  # Leave 100 tokens margin
+        if max_chunk_size is not None:
+            max_allowed = min(max_allowed, max_chunk_size)
+        dynamic_chunk_size = min(dynamic_chunk_size, max_allowed)
+
+        # Align again after min operation
+        dynamic_chunk_size = (dynamic_chunk_size // alignment_size) * alignment_size
+
+        if dynamic_chunk_size < alignment_size:
+            return None
+
+        return dynamic_chunk_size
diff --git a/sglang/python/sglang/srt/managers/scheduler_profiler_mixin.py b/sglang/python/sglang/srt/managers/scheduler_profiler_mixin.py
new file mode 100644
index 0000000000000000000000000000000000000000..c02ed7997d03a7fb18882ab422695542eeaf46c4
--- /dev/null
+++ b/sglang/python/sglang/srt/managers/scheduler_profiler_mixin.py
@@ -0,0 +1,410 @@
+from __future__ import annotations
+
+import logging
+import os
+import time
+from pathlib import Path
+from typing import TYPE_CHECKING, List, Optional
+
+import torch
+
+from sglang.srt.environ import envs
+from sglang.srt.managers.io_struct import ProfileReq, ProfileReqOutput, ProfileReqType
+from sglang.srt.model_executor.forward_batch_info import ForwardMode
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import is_npu
+from sglang.srt.utils.profile_merger import ProfileMerger
+from sglang.srt.utils.profile_utils import ProfileManager
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.schedule_batch import ScheduleBatch
+    from sglang.srt.managers.scheduler import Scheduler
+
+_is_npu = is_npu()
+if _is_npu:
+    import torch_npu
+
+    patches = [
+        ["profiler.profile", torch_npu.profiler.profile],
+        ["profiler.ProfilerActivity.CUDA", torch_npu.profiler.ProfilerActivity.NPU],
+        ["profiler.ProfilerActivity.CPU", torch_npu.profiler.ProfilerActivity.CPU],
+    ]
+    torch_npu._apply_patches(patches)
+
+logger = logging.getLogger(__name__)
+
+
+class SchedulerProfilerMixin:
+    def init_profiler(self: Scheduler):
+        if envs.SGLANG_PROFILE_V2.get():
+            self._profile_manager = ProfileManager(
+                tp_rank=self.tp_rank,
+                cpu_group=self.dp_tp_cpu_group,
+                gpu_id=self.gpu_id,
+            )
+            return
+
+        self.torch_profiler = None
+        self.torch_profiler_output_dir: Optional[Path] = None
+        self.profiler_activities: Optional[List[str]] = None
+        self.profile_id: Optional[str] = None
+
+        self.profiler_start_forward_ct: Optional[int] = None
+        self.profiler_target_forward_ct: Optional[int] = None
+
+        self.profiler_prefill_ct: Optional[int] = None
+        self.profiler_decode_ct: Optional[int] = None
+        self.profiler_target_prefill_ct: Optional[int] = None
+        self.profiler_target_decode_ct: Optional[int] = None
+
+        self.profile_by_stage: bool = False
+        self.profile_in_progress: bool = False
+        self.merge_profiles = False
+
+        # For ROCM
+        self.rpd_profiler = None
+
+    def init_profile(
+        self: Scheduler,
+        output_dir: Optional[str],
+        start_step: Optional[int],
+        num_steps: Optional[int],
+        activities: Optional[List[str]],
+        with_stack: Optional[bool],
+        record_shapes: Optional[bool],
+        profile_by_stage: bool,
+        profile_id: str,
+        merge_profiles: bool = False,
+        profile_prefix: str = "",
+        profile_stages: Optional[List[str]] = None,
+    ) -> ProfileReqOutput:
+        if envs.SGLANG_PROFILE_V2.get():
+            return self._profile_manager.configure(
+                output_dir=output_dir,
+                start_step=start_step,
+                num_steps=num_steps,
+                activities=activities,
+                with_stack=with_stack,
+                record_shapes=record_shapes,
+                profile_by_stage=profile_by_stage,
+                profile_id=profile_id,
+                merge_profiles=merge_profiles,
+                profile_prefix=profile_prefix,
+                profile_stages=profile_stages,
+            )
+
+        if self.profile_in_progress:
+            return ProfileReqOutput(
+                success=False,
+                message="Profiling is already in progress. Call /stop_profile first.",
+            )
+
+        self.profile_by_stage = profile_by_stage
+        self.merge_profiles = merge_profiles
+
+        if output_dir is None:
+            output_dir = os.getenv("SGLANG_TORCH_PROFILER_DIR", "/tmp")
+        if activities is None:
+            activities = ["CPU", "GPU"]
+
+        self.torch_profiler_output_dir = Path(output_dir).expanduser()
+        self.torch_profiler_with_stack = with_stack
+        self.torch_profiler_record_shapes = record_shapes
+        self.profiler_activities = activities
+        self.profile_id = profile_id
+        self.profile_prefix = profile_prefix
+
+        if start_step:
+            self.profiler_start_forward_ct = max(start_step, self.forward_ct + 1)
+
+        if num_steps:
+            if self.profile_by_stage:
+                self.profiler_prefill_ct = 0
+                self.profiler_decode_ct = 0
+                self.profiler_target_prefill_ct = num_steps
+                self.profiler_target_decode_ct = num_steps
+            elif start_step:
+                self.profiler_target_forward_ct = (
+                    self.profiler_start_forward_ct + num_steps
+                )
+            else:
+                self.profiler_target_forward_ct = self.forward_ct + num_steps
+            # The caller will be notified when reaching profiler_target_forward_ct
+        else:
+            self.profiler_target_forward_ct = None
+
+        return ProfileReqOutput(success=True, message="Succeeded")
+
+    def start_profile(
+        self: Scheduler, stage: Optional[ForwardMode] = None
+    ) -> ProfileReqOutput | None:
+        if envs.SGLANG_PROFILE_V2.get():
+            return self._profile_manager.manual_start()
+
+        stage_str = f" for {stage.name}" if stage else ""
+        logger.info(
+            f"Profiling starts{stage_str}. Traces will be saved to: {self.torch_profiler_output_dir} (with profile id: {self.profile_id})",
+        )
+
+        activities = self.profiler_activities
+        with_stack = self.torch_profiler_with_stack
+        record_shapes = self.torch_profiler_record_shapes
+
+        activity_map = {
+            "CPU": torch.profiler.ProfilerActivity.CPU,
+            "GPU": torch.profiler.ProfilerActivity.CUDA,
+        }
+        if hasattr(torch.profiler.ProfilerActivity, "XPU"):
+            activity_map["XPU"] = torch.profiler.ProfilerActivity.XPU
+        torchprof_activities = [
+            activity_map[a] for a in activities if a in activity_map
+        ]
+
+        if "RPD" in activities:  # for ROCM
+            from rpdTracerControl import rpdTracerControl
+
+            rpdTracerControl.skipCreate()
+
+            self.rpd_profile_path = os.path.join(
+                self.torch_profiler_output_dir,
+                "rpd-" + str(time.time()) + f"-TP-{self.tp_rank}" + ".trace.json.gz",
+            )
+
+            if self.tp_rank == 0:
+                import sqlite3
+
+                from rocpd.schema import RocpdSchema
+
+                if os.path.exists("trace.rpd"):
+                    os.unlink("trace.rpd")
+                schema = RocpdSchema()
+                connection = sqlite3.connect("trace.rpd")
+                schema.writeSchema(connection)
+                connection.commit()
+                del connection
+            torch.distributed.barrier(self.dp_tp_cpu_group)
+
+            self.rpd_profiler = rpdTracerControl()
+            self.rpd_profiler.setPythonTrace(True)
+            self.rpd_profiler.start()
+            self.rpd_profiler.rangePush("", "rpd profile range", "")
+            self.profile_in_progress = True
+        elif torchprof_activities:
+            self.torch_profiler = torch.profiler.profile(
+                activities=torchprof_activities,
+                with_stack=with_stack if with_stack is not None else True,
+                record_shapes=record_shapes if record_shapes is not None else False,
+                on_trace_ready=(
+                    None
+                    if not _is_npu
+                    else torch_npu.profiler.tensorboard_trace_handler(
+                        str(self.torch_profiler_output_dir)
+                    )
+                ),
+            )
+            self.torch_profiler.start()
+            self.profile_in_progress = True
+
+        if "MEM" in activities:
+            torch.cuda.memory._record_memory_history(max_entries=100000)
+            self.profile_in_progress = True
+
+        if "CUDA_PROFILER" in activities:
+            if self.gpu_id == get_global_server_args().base_gpu_id:
+                torch.cuda.cudart().cudaProfilerStart()
+            self.profile_in_progress = True
+
+        return ProfileReqOutput(success=True, message="Succeeded")
+
+    def _merge_profile_traces(self: Scheduler) -> str:
+        if not self.merge_profiles:
+            return ""
+
+        if self.tp_rank != 0:
+            return ""
+        if getattr(self, "dp_size", 1) > 1 and getattr(self, "dp_rank", 0) != 0:
+            return ""
+        if getattr(self, "pp_size", 1) > 1 and getattr(self, "pp_rank", 0) != 0:
+            return ""
+        if getattr(self, "moe_ep_size", 1) > 1 and getattr(self, "moe_ep_rank", 0) != 0:
+            return ""
+
+        try:
+            logger.info("Starting profile merge...")
+            merger = ProfileMerger(self.torch_profiler_output_dir, self.profile_id)
+            merged_path = merger.merge_chrome_traces()
+
+            summary = merger.get_merge_summary()
+            merge_message = (
+                f" Merged trace: {merged_path} "
+                f"(Events: {summary.get('total_events', '?')}, "
+                f"Files: {summary.get('total_files', '?')})"
+            )
+
+            logger.info(f"Profile merge completed: {merged_path}")
+        except Exception as e:
+            logger.error(f"Failed to merge profiles: {e}", exc_info=True)
+            return f" Merge failed: {e!s}"
+        else:
+            return merge_message
+
+    def stop_profile(
+        self: Scheduler, stage: Optional[ForwardMode] = None
+    ) -> ProfileReqOutput | None:
+        if envs.SGLANG_PROFILE_V2.get():
+            return self._profile_manager.manual_stop()
+
+        if not self.profile_in_progress:
+            return ProfileReqOutput(
+                success=False,
+                message="Profiling is not in progress. Call /start_profile first.",
+            )
+
+        self.torch_profiler_output_dir.mkdir(parents=True, exist_ok=True)
+
+        if self.profile_prefix:
+            stage_prefix = self.profile_prefix + "-"
+        else:
+            stage_prefix = ""
+
+        stage_suffix = f"-{stage.name}" if stage else ""
+        logger.info("Stop profiling" + stage_suffix + "...")
+        if self.torch_profiler is not None:
+            self.torch_profiler.stop()
+            if not _is_npu:
+                # Build filename with only non-zero ranks to maintain backward compatibility
+                filename_parts = [self.profile_id, f"TP-{self.tp_rank}"]
+
+                # Only add other ranks if parallelism is enabled (size > 1)
+                if getattr(self, "dp_size", 1) > 1:
+                    filename_parts.append(f"DP-{getattr(self, 'dp_rank', 0)}")
+                if getattr(self, "pp_size", 1) > 1:
+                    filename_parts.append(f"PP-{getattr(self, 'pp_rank', 0)}")
+                if getattr(self, "moe_ep_size", 1) > 1:
+                    filename_parts.append(f"EP-{getattr(self, 'moe_ep_rank', 0)}")
+
+                filename = (
+                    stage_prefix
+                    + "-".join(filename_parts)
+                    + stage_suffix
+                    + ".trace.json.gz"
+                )
+
+                self.torch_profiler.export_chrome_trace(
+                    os.path.join(self.torch_profiler_output_dir, filename)
+                )
+            torch.distributed.barrier(self.dp_tp_cpu_group)
+
+        if self.rpd_profiler is not None:
+            self.rpd_profiler.rangePop()
+            self.rpd_profiler.stop()
+            self.rpd_profiler.flush()
+
+            torch.distributed.barrier(self.dp_tp_cpu_group)
+            if self.tp_rank == 0:
+                from sglang.srt.utils.rpd_utils import rpd_to_chrome_trace
+
+                rpd_to_chrome_trace("trace.rpd", self.rpd_profile_path)
+            self.rpd_profiler = None
+            self.rpd_profile_path = None
+
+        if self.profiler_activities is not None and "MEM" in self.profiler_activities:
+            memory_profile_path = os.path.join(
+                self.torch_profiler_output_dir,
+                str(time.time())
+                + f"-TP-{self.tp_rank}-memory"
+                + stage_suffix
+                + ".pickle",
+            )
+            torch.cuda.memory._dump_snapshot(memory_profile_path)
+            torch.cuda.memory._record_memory_history(enabled=None)
+
+        if "CUDA_PROFILER" in self.profiler_activities:
+            if self.gpu_id == get_global_server_args().base_gpu_id:
+                torch.cuda.cudart().cudaProfilerStop()
+
+        merge_message = self._merge_profile_traces()
+
+        logger.info(
+            "Profiling done. Traces are saved to: %s%s",
+            self.torch_profiler_output_dir,
+            merge_message,
+        )
+        self.torch_profiler = None
+        self.profile_in_progress = False
+        self.profiler_start_forward_ct = None
+
+        return ProfileReqOutput(success=True, message=f"Succeeded.{merge_message}")
+
+    def _profile_batch_predicate(self: Scheduler, batch: ScheduleBatch):
+        if envs.SGLANG_PROFILE_V2.get():
+            self._profile_manager.step(forward_mode=batch.forward_mode)
+            return
+
+        if self.profile_by_stage:
+            if batch.forward_mode.is_prefill():
+                if self.profiler_prefill_ct == 0:
+                    self.start_profile(batch.forward_mode)
+                self.profiler_prefill_ct += 1
+                if self.profiler_prefill_ct > self.profiler_target_prefill_ct:
+                    if self.profile_in_progress:
+                        self.stop_profile(stage=ForwardMode.EXTEND)
+            elif batch.forward_mode.is_decode():
+                if self.profiler_decode_ct == 0:
+                    if self.profile_in_progress:
+                        # force trace flush
+                        self.stop_profile(stage=ForwardMode.EXTEND)
+                    self.start_profile(batch.forward_mode)
+                self.profiler_decode_ct += 1
+                if self.profiler_decode_ct > self.profiler_target_decode_ct:
+                    if self.profile_in_progress:
+                        self.stop_profile(stage=ForwardMode.DECODE)
+            elif batch.forward_mode.is_idle():
+                pass
+            else:
+                raise RuntimeError(f"unsupported profile stage: {batch.forward_mode}")
+        else:
+            # Check profiler
+            if (
+                self.profiler_target_forward_ct
+                and self.profiler_target_forward_ct <= self.forward_ct
+            ):
+                self.stop_profile()
+            if (
+                self.profiler_start_forward_ct
+                and self.profiler_start_forward_ct == self.forward_ct
+            ):
+                self.start_profile()
+
+    def profile(self: Scheduler, recv_req: ProfileReq):
+        if recv_req.type == ProfileReqType.START_PROFILE:
+            if recv_req.profile_by_stage or recv_req.start_step:
+                return self.init_profile(
+                    recv_req.output_dir,
+                    recv_req.start_step,
+                    recv_req.num_steps,
+                    recv_req.activities,
+                    recv_req.with_stack,
+                    recv_req.record_shapes,
+                    recv_req.profile_by_stage,
+                    recv_req.profile_id,
+                    recv_req.merge_profiles,
+                    recv_req.profile_prefix,
+                    recv_req.profile_stages,
+                )
+            else:
+                self.init_profile(
+                    recv_req.output_dir,
+                    recv_req.start_step,
+                    recv_req.num_steps,
+                    recv_req.activities,
+                    recv_req.with_stack,
+                    recv_req.record_shapes,
+                    recv_req.profile_by_stage,
+                    recv_req.profile_id,
+                    recv_req.merge_profiles,
+                    recv_req.profile_prefix,
+                )
+                return self.start_profile()
+        else:
+            return self.stop_profile()
diff --git a/sglang/python/sglang/srt/managers/scheduler_recv_skipper.py b/sglang/python/sglang/srt/managers/scheduler_recv_skipper.py
new file mode 100644
index 0000000000000000000000000000000000000000..69c3e19a5080faa2987615fbedf5b066ca571d97
--- /dev/null
+++ b/sglang/python/sglang/srt/managers/scheduler_recv_skipper.py
@@ -0,0 +1,38 @@
+from sglang.srt.environ import envs
+from sglang.srt.model_executor.forward_batch_info import ForwardMode
+from sglang.srt.server_args import ServerArgs
+
+
+class SchedulerRecvSkipper:
+    @staticmethod
+    def maybe_create(server_args: ServerArgs):
+        if server_args.scheduler_recv_interval <= 1:
+            return None
+        return SchedulerRecvSkipper(server_args)
+
+    def __init__(self, server_args: ServerArgs):
+        # Can be supported if needed, but may need e.g. `global_forward_mode`
+        assert not server_args.enable_dp_attention
+        self._counter = 0
+        self._threshold = server_args.scheduler_recv_interval
+        # All can be tuned if needed
+        self._default_weight = envs.SGLANG_SCHEDULER_RECV_SKIPPER_WEIGHT_DEFAULT.get()
+        self._weight_of_forward_mode = {
+            ForwardMode.DECODE: envs.SGLANG_SCHEDULER_RECV_SKIPPER_WEIGHT_DECODE.get(),
+            ForwardMode.TARGET_VERIFY: envs.SGLANG_SCHEDULER_RECV_SKIPPER_WEIGHT_TARGET_VERIFY.get(),
+            None: envs.SGLANG_SCHEDULER_RECV_SKIPPER_WEIGHT_NONE.get(),
+        }
+
+    def handle(self, last_forward_mode: ForwardMode):
+        should_recv = False
+
+        last_weight = self._weight_of_forward_mode.get(
+            last_forward_mode, self._default_weight
+        )
+        self._counter += last_weight
+
+        if self._counter >= self._threshold:
+            self._counter = 0
+            should_recv = True
+
+        return should_recv
diff --git a/sglang/python/sglang/srt/managers/scheduler_runtime_checker_mixin.py b/sglang/python/sglang/srt/managers/scheduler_runtime_checker_mixin.py
new file mode 100644
index 0000000000000000000000000000000000000000..0441be4c5c5249f3880fdadc6dd55b84949bf16b
--- /dev/null
+++ b/sglang/python/sglang/srt/managers/scheduler_runtime_checker_mixin.py
@@ -0,0 +1,389 @@
+from __future__ import annotations
+
+import logging
+import time
+import warnings
+from typing import TYPE_CHECKING
+
+from sglang.srt.disaggregation.utils import DisaggregationMode
+from sglang.srt.environ import envs
+from sglang.srt.managers.schedule_batch import ScheduleBatch
+from sglang.srt.mem_cache.session_aware_cache import SessionAwareCache
+from sglang.srt.observability.metrics_collector import QueueCount
+from sglang.srt.utils.common import ceil_align, raise_error_or_warn
+from sglang.srt.utils.request_logger import disable_request_logging
+from sglang.srt.utils.watchdog import WatchdogRaw
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.scheduler import Scheduler
+
+logger = logging.getLogger(__name__)
+
+
+class SchedulerRuntimeCheckerMixin:
+    def _session_held_tokens(self: Scheduler) -> int:
+        if isinstance(self.tree_cache, SessionAwareCache):
+            return self.tree_cache.session_held_tokens()
+        return 0
+
+    def _session_held_req_count(self: Scheduler) -> int:
+        if isinstance(self.tree_cache, SessionAwareCache):
+            return self.tree_cache.session_held_req_count()
+        return 0
+
+    def _get_token_info(self: Scheduler):
+        available_size = self.token_to_kv_pool_allocator.available_size()
+        evictable_size = self.tree_cache.evictable_size()
+        num_used = self.max_total_num_tokens - (available_size + evictable_size)
+        token_usage = num_used / self.max_total_num_tokens
+        return num_used, token_usage, available_size, evictable_size
+
+    def _get_mamba_token_info(self: Scheduler):
+        is_mamba_radix_cache = (
+            self.tree_cache.supports_mamba() and self.tree_cache.is_tree_cache()
+        )
+        full_available_size = self.token_to_kv_pool_allocator.available_size()
+        full_evictable_size = (
+            self.tree_cache.full_evictable_size() if is_mamba_radix_cache else 0
+        )
+        mamba_available_size = self.req_to_token_pool.mamba_pool.available_size()
+        mamba_evictable_size = (
+            self.tree_cache.mamba_evictable_size() if is_mamba_radix_cache else 0
+        )
+        full_num_used = self.token_to_kv_pool_allocator.size - (
+            full_available_size + full_evictable_size
+        )
+        mamba_num_used = self.req_to_token_pool.mamba_pool.size - (
+            mamba_available_size + mamba_evictable_size
+        )
+        full_token_usage = full_num_used / self.token_to_kv_pool_allocator.size
+        mamba_usage = mamba_num_used / self.req_to_token_pool.mamba_pool.size
+        return (
+            full_num_used,
+            mamba_num_used,
+            full_token_usage,
+            mamba_usage,
+            full_available_size,
+            full_evictable_size,
+            mamba_available_size,
+            mamba_evictable_size,
+        )
+
+    def _get_swa_token_info(self: Scheduler):
+        full_available_size = self.token_to_kv_pool_allocator.full_available_size()
+        full_evictable_size = self.tree_cache.full_evictable_size()
+        swa_available_size = self.token_to_kv_pool_allocator.swa_available_size()
+        swa_evictable_size = self.tree_cache.swa_evictable_size()
+        full_num_used = self.full_tokens_per_layer - (
+            full_available_size + full_evictable_size
+        )
+        swa_num_used = self.swa_tokens_per_layer - (
+            swa_available_size + swa_evictable_size
+        )
+        full_token_usage = full_num_used / self.full_tokens_per_layer
+        swa_token_usage = swa_num_used / self.swa_tokens_per_layer
+        return (
+            full_num_used,
+            swa_num_used,
+            full_token_usage,
+            swa_token_usage,
+            full_available_size,
+            full_evictable_size,
+            swa_available_size,
+            swa_evictable_size,
+        )
+
+    def _check_hybrid_memory(self: Scheduler):
+        (
+            full_num_used,
+            swa_num_used,
+            _,
+            _,
+            full_available_size,
+            full_evictable_size,
+            swa_available_size,
+            swa_evictable_size,
+        ) = self._get_swa_token_info()
+        session_held = self._session_held_tokens()
+        memory_leak = (full_num_used - session_held) != 0 or swa_num_used != 0
+        token_msg = (
+            f"{self.full_tokens_per_layer=}, {full_available_size=}, {full_evictable_size=}, {self.tree_cache.full_protected_size()=}\n"
+            f"{self.swa_tokens_per_layer=}, {swa_available_size=}, {swa_evictable_size=}, {self.tree_cache.swa_protected_size()=}, {session_held=}\n"
+        )
+        return memory_leak, token_msg
+
+    def _check_mamba_memory(self: Scheduler):
+        (
+            full_num_used,
+            mamba_num_used,
+            _,
+            _,
+            full_available_size,
+            full_evictable_size,
+            mamba_available_size,
+            mamba_evictable_size,
+        ) = self._get_mamba_token_info()
+        session_held = self._session_held_tokens()
+        memory_leak = (
+            full_num_used != self.tree_cache.full_protected_size() + session_held
+            or mamba_num_used != self.tree_cache.mamba_protected_size()
+        )
+        if memory_leak:
+            free_full_pages = set(
+                self.token_to_kv_pool_allocator.free_pages.tolist()
+                + self.token_to_kv_pool_allocator.release_pages.tolist()
+            )
+            cached_full_pages = set(self.tree_cache.all_values_flatten().tolist())
+            expected_full_pages = set(
+                range(1, self.token_to_kv_pool_allocator.size + 1)
+            )
+            leaked_full_pages = (
+                expected_full_pages - free_full_pages - cached_full_pages
+            )
+            free_mamba_pages = set(
+                self.req_to_token_pool.mamba_pool.free_slots.tolist()
+            )
+            cached_mamba_pages = set(
+                self.tree_cache.all_mamba_values_flatten().tolist()
+            )
+            expected_mamba_pages = set(range(self.req_to_token_pool.mamba_pool.size))
+            leaked_mamba_pages = (
+                expected_mamba_pages - free_mamba_pages - cached_mamba_pages
+            )
+            token_msg = (
+                f"{full_available_size=}, {full_evictable_size=}, {self.token_to_kv_pool_allocator.size=}, {self.tree_cache.full_protected_size()=}\n"
+                f"{mamba_available_size=}, {mamba_evictable_size=}, {self.req_to_token_pool.mamba_pool.size=}, {self.tree_cache.mamba_protected_size()=}, leaked_full_pages={leaked_full_pages if len(leaked_full_pages) > 0 else None}, leaked_mamba_pages={leaked_mamba_pages if len(leaked_mamba_pages) > 0 else None}\n"
+            )
+        else:
+            token_msg = (
+                f"{full_available_size=}, {full_evictable_size=}, {self.token_to_kv_pool_allocator.size=}, {self.tree_cache.full_protected_size()=}\n"
+                f"{mamba_available_size=}, {mamba_evictable_size=}, {self.req_to_token_pool.mamba_pool.size=}, {self.tree_cache.mamba_protected_size()=}\n"
+            )
+        return memory_leak, token_msg
+
+    def _check_radix_cache_memory(self: Scheduler):
+        _, _, available_size, evictable_size = self._get_token_info()
+        protected_size = self.tree_cache.protected_size()
+        session_held = self._session_held_tokens()
+        memory_leak = (available_size + evictable_size) != (
+            self.max_total_num_tokens - protected_size - session_held
+        )
+        token_msg = f"{self.max_total_num_tokens=}, {available_size=}, {evictable_size=}, {protected_size=}, {session_held=}\n"
+        return memory_leak, token_msg
+
+    def _get_batch_uncached_size(self: Scheduler, batch: ScheduleBatch) -> int:
+        ret = 0
+        for req in batch.reqs:
+            assert req.kv_committed_freed == req.kv_overallocated_freed
+            uncached_len = 0
+            if not req.kv_committed_freed:
+                allocated_len = req.kv_allocated_len
+                if self.page_size > 1:
+                    allocated_len = ceil_align(allocated_len, self.page_size)
+                    assert req.cache_protected_len % self.page_size == 0
+                uncached_len = allocated_len - req.cache_protected_len
+
+            ret += uncached_len
+
+        return ret
+
+    def self_check_during_busy(self: Scheduler):
+        current_batch: ScheduleBatch = self.last_batch
+
+        if current_batch is None:
+            return
+
+        spec_topk = self.server_args.speculative_eagle_topk or 1
+        if spec_topk > 1:
+            warnings.warn(
+                "Runtime memory check (busy) is not supported when speculation topk > 1."
+            )
+            return
+
+        _, _, available_size, evictable_size = self._get_token_info()
+        protected_size = self.tree_cache.protected_size()
+
+        uncached_size = self._get_batch_uncached_size(current_batch)
+
+        if (
+            current_batch.forward_mode.is_extend()
+            and self.running_batch is not None
+            and not self.running_batch.is_empty()
+        ):
+            uncached_size += self._get_batch_uncached_size(self.running_batch)
+
+        if envs.SGLANG_ENABLE_STRICT_MEM_CHECK_DURING_BUSY.get() > 1:
+            log_msg = f"[Mem Check (BUSY)] {available_size=}, {evictable_size=}, {protected_size=}, {uncached_size=}"
+            logger.info(log_msg)
+
+        session_held = self._session_held_tokens()
+        total_tokens = (
+            available_size
+            + evictable_size
+            + protected_size
+            + uncached_size
+            + session_held
+        )
+        assert (
+            total_tokens == self.max_total_num_tokens
+        ), f"Mem Leak Detected! {total_tokens=} vs {self.max_total_num_tokens=}"
+
+    def _check_req_pool(self: Scheduler):
+        if self.disaggregation_mode == DisaggregationMode.DECODE:
+            req_total_size = (
+                self.req_to_token_pool.size + self.req_to_token_pool.pre_alloc_size
+            )
+        else:
+            req_total_size = self.req_to_token_pool.size
+
+        session_req_count = self._session_held_req_count()
+        if len(self.req_to_token_pool.free_slots) + session_req_count != req_total_size:
+            msg = (
+                "req_to_token_pool memory leak detected!"
+                f"available_size={len(self.req_to_token_pool.free_slots)}, "
+                f"session_held={session_req_count}, "
+                f"total_size={self.req_to_token_pool.size}\n"
+            )
+            raise_error_or_warn(
+                self,
+                envs.SGLANG_ENABLE_STRICT_MEM_CHECK_DURING_IDLE.get(),
+                "count_req_pool_leak_warnings",
+                msg,
+            )
+
+    def check_memory(self: Scheduler):
+        if self.is_hybrid_swa:
+            memory_leak, token_msg = self._check_hybrid_memory()
+        elif self.is_hybrid_ssm and self.tree_cache.supports_mamba():
+            memory_leak, token_msg = self._check_mamba_memory()
+        else:
+            memory_leak, token_msg = self._check_radix_cache_memory()
+
+        if memory_leak:
+            msg = "token_to_kv_pool_allocator memory leak detected! " f"{token_msg}"
+            raise_error_or_warn(
+                self,
+                envs.SGLANG_ENABLE_STRICT_MEM_CHECK_DURING_IDLE.get(),
+                "count_memory_leak_warnings",
+                msg,
+            )
+
+        self._check_req_pool()
+
+        if (
+            self.enable_metrics
+            and self.current_scheduler_metrics_enabled
+            and time.perf_counter() > self.metrics_collector.last_log_time + 30
+        ):
+            # During idle time, also collect metrics every 30 seconds.
+            if self.is_hybrid_swa:
+                (
+                    full_num_used,
+                    swa_num_used,
+                    full_token_usage,
+                    swa_token_usage,
+                    _,
+                    _,
+                    _,
+                    _,
+                ) = self._get_swa_token_info()
+                num_used = max(full_num_used, swa_num_used)
+                token_usage = max(full_token_usage, swa_token_usage)
+            elif self.is_hybrid_ssm:
+                (
+                    num_used,
+                    _,
+                    token_usage,
+                    _,
+                    _,
+                    _,
+                    _,
+                    _,
+                ) = self._get_mamba_token_info()
+            else:
+                num_used, token_usage, _, _ = self._get_token_info()
+
+            priority_enabled = self.enable_priority_scheduling
+            self.stats.num_running_reqs = QueueCount.from_reqs(
+                self.running_batch.reqs, priority_enabled
+            )
+            self.stats.num_used_tokens = num_used
+            self.stats.token_usage = round(token_usage, 2)
+            self.stats.gen_throughput = 0
+            self.stats.num_queue_reqs = QueueCount.from_reqs(
+                self.waiting_queue, priority_enabled
+            )
+            self.stats.num_grammar_queue_reqs = len(self.grammar_manager)
+            if self.disaggregation_mode == DisaggregationMode.PREFILL:
+                self.stats.num_prefill_prealloc_queue_reqs = QueueCount.from_reqs(
+                    self.disagg_prefill_bootstrap_queue.queue, priority_enabled
+                )
+                self.stats.num_prefill_inflight_queue_reqs = QueueCount.from_reqs(
+                    self.disagg_prefill_inflight_queue, priority_enabled
+                )
+            if self.disaggregation_mode == DisaggregationMode.DECODE:
+                self.stats.num_decode_prealloc_queue_reqs = QueueCount.from_reqs(
+                    self.disagg_decode_prealloc_queue.queue, priority_enabled
+                )
+                self.stats.num_decode_transfer_queue_reqs = QueueCount.from_reqs(
+                    self.disagg_decode_transfer_queue.queue, priority_enabled
+                )
+            self.metrics_collector.log_stats(self.stats)
+        self._publish_kv_events()
+
+    def check_tree_cache(self: Scheduler):
+        if (
+            self.tree_cache.is_tree_cache()
+            and (self.is_hybrid_swa and self.tree_cache.supports_swa())
+            or (self.is_hybrid_ssm and self.tree_cache.supports_mamba())
+        ):
+            self.tree_cache.sanity_check()
+
+    def self_check_during_idle(self: Scheduler):
+        if self.disaggregation_mode == DisaggregationMode.PREFILL:
+            if len(self.disagg_prefill_inflight_queue) > 0:
+                return
+        elif self.disaggregation_mode == DisaggregationMode.DECODE:
+            queue_size = (
+                len(self.waiting_queue)
+                + len(self.disagg_decode_transfer_queue.queue)
+                + len(self.disagg_decode_prealloc_queue.queue)
+            )
+            if self.server_args.disaggregation_decode_enable_offload_kvcache:
+                queue_size += len(self.decode_offload_manager.ongoing_offload)
+            if queue_size:
+                return
+
+        self.check_memory()
+        self.check_tree_cache()
+        self.new_token_ratio = self.init_new_token_ratio
+        self.maybe_sleep_on_idle()
+
+
+def create_scheduler_watchdog(
+    scheduler: Scheduler, watchdog_timeout: float, soft: bool = False
+) -> WatchdogRaw:
+    def dump_info() -> str:
+        if scheduler.is_initializing or disable_request_logging():
+            return ""
+        if scheduler.is_hybrid_swa:
+            _, info_msg = scheduler._check_hybrid_memory()
+        elif scheduler.is_hybrid_ssm and scheduler.tree_cache.supports_mamba():
+            _, info_msg = scheduler._check_mamba_memory()
+        else:
+            _, info_msg = scheduler._check_radix_cache_memory()
+        return (
+            f"{scheduler.cur_batch.batch_size()=}\n"
+            f"{scheduler.cur_batch.reqs=}\n"
+            f"{info_msg}"
+        )
+
+    return WatchdogRaw(
+        debug_name="Scheduler",
+        get_counter=lambda: getattr(scheduler, "forward_ct", 0),
+        is_active=lambda: scheduler.is_initializing
+        or getattr(scheduler, "cur_batch", None) is not None,
+        watchdog_timeout=watchdog_timeout,
+        soft=soft,
+        dump_info=dump_info,
+    )
diff --git a/sglang/python/sglang/srt/managers/scheduler_update_weights_mixin.py b/sglang/python/sglang/srt/managers/scheduler_update_weights_mixin.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba038877b432f2ceaaf7ff6c192777480dd67b3a
--- /dev/null
+++ b/sglang/python/sglang/srt/managers/scheduler_update_weights_mixin.py
@@ -0,0 +1,225 @@
+from __future__ import annotations
+
+import logging
+import traceback
+from typing import TYPE_CHECKING, Tuple
+
+import torch
+
+from sglang.srt.constants import (
+    GPU_MEMORY_ALL_TYPES,
+    GPU_MEMORY_TYPE_CUDA_GRAPH,
+    GPU_MEMORY_TYPE_KV_CACHE,
+    GPU_MEMORY_TYPE_WEIGHTS,
+)
+from sglang.srt.managers.io_struct import (
+    CheckWeightsReqInput,
+    CheckWeightsReqOutput,
+    DestroyWeightsUpdateGroupReqInput,
+    DestroyWeightsUpdateGroupReqOutput,
+    GetWeightsByNameReqInput,
+    GetWeightsByNameReqOutput,
+    InitWeightsUpdateGroupReqInput,
+    InitWeightsUpdateGroupReqOutput,
+    ReleaseMemoryOccupationReqInput,
+    ReleaseMemoryOccupationReqOutput,
+    ResumeMemoryOccupationReqInput,
+    ResumeMemoryOccupationReqOutput,
+    UpdateWeightFromDiskReqInput,
+    UpdateWeightFromDiskReqOutput,
+    UpdateWeightsFromDistributedReqInput,
+    UpdateWeightsFromDistributedReqOutput,
+    UpdateWeightsFromIPCReqInput,
+    UpdateWeightsFromIPCReqOutput,
+    UpdateWeightsFromTensorReqInput,
+    UpdateWeightsFromTensorReqOutput,
+)
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.scheduler import Scheduler
+
+logger = logging.getLogger(__name__)
+
+
+class SchedulerUpdateWeightsMixin:
+
+    def update_weights_from_disk(
+        self: Scheduler, recv_req: UpdateWeightFromDiskReqInput
+    ):
+        """In-place update of the weights from disk."""
+        success, message = self.tp_worker.update_weights_from_disk(recv_req)
+        if success:
+            if recv_req.flush_cache:
+                flush_cache_success = self.flush_cache()
+                assert flush_cache_success, "Cache flush failed after updating weights"
+        else:
+            logger.error(message)
+        return UpdateWeightFromDiskReqOutput(success, message, 0)
+
+    def init_weights_update_group(
+        self: Scheduler, recv_req: InitWeightsUpdateGroupReqInput
+    ):
+        """Initialize the online model parameter update group."""
+        success, message = self.tp_worker.init_weights_update_group(recv_req)
+        return InitWeightsUpdateGroupReqOutput(success, message)
+
+    def destroy_weights_update_group(
+        self: Scheduler, recv_req: DestroyWeightsUpdateGroupReqInput
+    ):
+        """Destroy the online model parameter update group."""
+        success, message = self.tp_worker.destroy_weights_update_group(recv_req)
+        return DestroyWeightsUpdateGroupReqOutput(success, message)
+
+    def update_weights_from_distributed(
+        self,
+        recv_req: UpdateWeightsFromDistributedReqInput,
+    ) -> Tuple[bool, str]:
+        """Update the online model parameter."""
+        success, message = self.tp_worker.update_weights_from_distributed(recv_req)
+        if success:
+            if recv_req.flush_cache:
+                flush_cache_success = self.flush_cache()
+                assert flush_cache_success, "Cache flush failed after updating weights"
+        else:
+            logger.error(message)
+        return UpdateWeightsFromDistributedReqOutput(success, message)
+
+    def update_weights_from_tensor(
+        self: Scheduler, recv_req: UpdateWeightsFromTensorReqInput
+    ):
+        """Update the online model parameter from tensors."""
+        if recv_req.disable_draft_model:
+            worker = self.tp_worker
+        else:
+            worker = self.draft_worker or self.tp_worker
+        success, message = worker.update_weights_from_tensor(recv_req)
+        # TODO extract common code b/t update_weights_from_distributed and update_weights_from_tensor later
+        if success:
+            if recv_req.flush_cache:
+                flush_cache_success = self.flush_cache()
+                assert flush_cache_success, "Cache flush failed after updating weights"
+        else:
+            logger.error(message)
+        torch.distributed.barrier(group=self.tp_cpu_group)
+        return UpdateWeightsFromTensorReqOutput(success, message)
+
+    def update_weights_from_ipc(
+        self: Scheduler, recv_req: UpdateWeightsFromIPCReqInput
+    ):
+        """Update the online model parameter from IPC for checkpoint-engine integration."""
+        success, message = self.tp_worker.update_weights_from_ipc(recv_req)
+        if success:
+            if recv_req.flush_cache:
+                flush_cache_success = self.flush_cache()
+                assert flush_cache_success, "Cache flush failed after updating weights"
+        else:
+            logger.error(message)
+        torch.distributed.barrier(group=self.tp_cpu_group)
+        return UpdateWeightsFromIPCReqOutput(success, message)
+
+    def get_weights_by_name(self: Scheduler, recv_req: GetWeightsByNameReqInput):
+        parameter = self.tp_worker.get_weights_by_name(recv_req)
+        return GetWeightsByNameReqOutput(parameter)
+
+    def release_memory_occupation(
+        self: Scheduler, recv_req: ReleaseMemoryOccupationReqInput
+    ):
+        assert (
+            self._is_no_request()
+        ), "release_memory_occupation should be called only when no ongoing request."
+
+        tags = recv_req.tags
+
+        if tags is None or len(tags) == 0:
+            tags = GPU_MEMORY_ALL_TYPES
+
+        for tag in tags:
+            self.offload_tags.add(tag)
+
+        if GPU_MEMORY_TYPE_KV_CACHE in tags:
+            self.memory_saver_adapter.pause(GPU_MEMORY_TYPE_KV_CACHE)
+            self.flush_cache()
+
+        if GPU_MEMORY_TYPE_WEIGHTS in tags:
+            self.stashed_model_static_state = _export_static_state(
+                self.tp_worker.model_runner.model
+            )
+            torch.distributed.barrier(self.tp_cpu_group)
+            self.memory_saver_adapter.pause(GPU_MEMORY_TYPE_WEIGHTS)
+
+        if GPU_MEMORY_TYPE_CUDA_GRAPH in tags:
+            self.memory_saver_adapter.pause(GPU_MEMORY_TYPE_CUDA_GRAPH)
+
+        torch.get_device_module().synchronize()
+
+        return ReleaseMemoryOccupationReqOutput()
+
+    def resume_memory_occupation(
+        self: Scheduler, recv_req: ResumeMemoryOccupationReqInput
+    ):
+        tags = recv_req.tags
+
+        if tags is None or len(tags) == 0:
+            tags = GPU_MEMORY_ALL_TYPES
+
+        for tag in tags:
+            self.offload_tags.remove(tag)
+
+        if GPU_MEMORY_TYPE_CUDA_GRAPH in tags:
+            self.memory_saver_adapter.resume(GPU_MEMORY_TYPE_CUDA_GRAPH)
+
+        if GPU_MEMORY_TYPE_WEIGHTS in tags:
+            self.memory_saver_adapter.resume(GPU_MEMORY_TYPE_WEIGHTS)
+            torch.distributed.barrier(self.tp_cpu_group)
+            _import_static_state(
+                self.tp_worker.model_runner.model,
+                self.stashed_model_static_state,
+            )
+            del self.stashed_model_static_state
+
+        if GPU_MEMORY_TYPE_KV_CACHE in tags:
+            self.memory_saver_adapter.resume(GPU_MEMORY_TYPE_KV_CACHE)
+
+        return ResumeMemoryOccupationReqOutput()
+
+    def check_weights(self: Scheduler, recv_req: CheckWeightsReqInput):
+        try:
+            self.tp_worker.model_runner.check_weights(action=recv_req.action)
+            return CheckWeightsReqOutput(success=True, message="Success.")
+        except Exception as e:
+            logger.warning(f"check_weights see error: {e}")
+            traceback.print_exc()
+            return CheckWeightsReqOutput(success=False, message=f"{e}")
+
+    def save_remote_model(self: Scheduler, params):
+        url = params["url"]
+
+        self.tp_worker.model_runner.save_remote_model(url)
+
+        if self.draft_worker is not None:
+            draft_url = params.get("draft_url", None)
+            assert (
+                draft_url is not None
+            ), "draft_url must be provided when draft model is enabled"
+            self.draft_worker.model_runner.save_remote_model(draft_url)
+
+    def save_sharded_model(self: Scheduler, params):
+        self.tp_worker.model_runner.save_sharded_model(
+            path=params["path"],
+            pattern=params["pattern"],
+            max_size=params["max_size"],
+        )
+
+
+def _export_static_state(model):
+    return dict(
+        buffers=[
+            (name, buffer.detach().clone()) for name, buffer in model.named_buffers()
+        ]
+    )
+
+
+def _import_static_state(model, static_params):
+    self_named_buffers = dict(model.named_buffers())
+    for name, tensor in static_params["buffers"]:
+        self_named_buffers[name][...] = tensor
diff --git a/sglang/python/sglang/srt/managers/session_controller.py b/sglang/python/sglang/srt/managers/session_controller.py
new file mode 100644
index 0000000000000000000000000000000000000000..05cd15cfa6bfd42e8714d43db982e4f627078a00
--- /dev/null
+++ b/sglang/python/sglang/srt/managers/session_controller.py
@@ -0,0 +1,309 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import annotations
+
+import logging
+import time
+import uuid
+from typing import TYPE_CHECKING, Dict, Optional
+
+from sglang.srt.managers.io_struct import (
+    CloseSessionReqInput,
+    OpenSessionReqInput,
+    OpenSessionReqOutput,
+    TokenizedGenerateReqInput,
+)
+from sglang.srt.managers.schedule_batch import FINISH_ABORT, Req
+from sglang.srt.mem_cache.session_aware_cache import SessionAwareCache
+
+if TYPE_CHECKING:
+    from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
+
+logger = logging.getLogger(__name__)
+
+
+class SessionReqNode:
+    def __init__(
+        self,
+        req: Req,
+        parent: Optional["SessionReqNode"] = None,
+        children=None,
+    ):
+        self.req = req
+        self.parent = parent
+        if parent is not None:
+            parent.children.append(self)
+        self.children = [] if not children else children
+
+    def clear_children(self, req_dict):
+        for req_node in self.children:
+            req_node.clear(req_dict)
+        self.children = []
+
+    def clear(self, req_dict):
+        for req_node in self.children:
+            req_node.clear(req_dict)
+
+        if self.req.finished_reason is None:
+            self.req.to_finish = FINISH_ABORT()
+        del req_dict[self.req.rid]
+
+    def abort(self):
+        if self.req.finished_reason is None:
+            self.req.to_finish = FINISH_ABORT()
+
+    def __str__(self):
+        return self._str_helper(self.req.rid)
+
+    def _str_helper(self, prefix=""):
+        if len(self.children) == 0:
+            return prefix + "\n"
+        else:
+            origin_prefix = prefix
+            prefix += " -- " + self.children[0].req.rid
+            ret = self.children[0]._str_helper(prefix)
+            for child in self.children[1:]:
+                prefix = " " * len(origin_prefix) + " \\- " + child.req.rid
+                ret += child._str_helper(prefix)
+            return ret
+
+
+class Session:
+    def __init__(
+        self,
+        capacity_of_str_len: int,
+        session_id: Optional[str] = None,
+        streaming: bool = False,
+        timeout: Optional[float] = None,
+    ):
+        self.session_id = session_id if session_id is not None else uuid.uuid4().hex
+        self.capacity_of_str_len = capacity_of_str_len
+        self.streaming = streaming
+        self.timeout = timeout
+        self.last_active_time: float = time.monotonic()
+        self.req_nodes: Dict[str, SessionReqNode] = {}
+
+    def is_timed_out(self) -> bool:
+        if self.timeout is None:
+            return False
+        return time.monotonic() - self.last_active_time > self.timeout
+
+    def create_req(
+        self,
+        req: TokenizedGenerateReqInput,
+        tokenizer,
+        vocab_size: int,
+        eos_token_ids=None,
+    ):
+        assert req.session_params is not None
+        self.last_active_time = time.monotonic()
+        session_params = req.session_params
+
+        last_req_node = None
+        last_req = None
+        abort = False
+        abort_message = ""
+        if self.streaming:
+            # Streaming sessions: only simple appends allowed; reject otherwise.
+            if session_params.replace:
+                abort = True
+                abort_message = "Streaming sessions do not support replace."
+            elif session_params.drop_previous_output:
+                abort = True
+                abort_message = (
+                    "Streaming sessions do not support drop_previous_output."
+                )
+            elif session_params.offset and session_params.offset != 0:
+                abort = True
+                abort_message = "Streaming sessions do not support offset."
+            elif self.req_nodes:
+                assert len(self.req_nodes) == 1
+                _, last_req_node = self.req_nodes.popitem()
+                last_req = last_req_node.req
+        elif session_params.replace:
+            if session_params.rid is None:
+                for _, req_node in self.req_nodes.items():
+                    req_node.clear(self.req_nodes)
+            else:
+                if session_params.rid not in self.req_nodes:
+                    abort = True
+                    abort_message = "Invalid request session id"
+                else:
+                    last_req_node = self.req_nodes[session_params.rid]
+                    last_req_node.abort()
+                    last_req = last_req_node.req
+                    last_req_node.clear_children(self.req_nodes)
+        else:
+            if session_params.rid is not None:
+                if session_params.rid not in self.req_nodes:
+                    abort = True
+                    abort_message = "Invalid request session id"
+                else:
+                    last_req_node = self.req_nodes[session_params.rid]
+                    last_req = last_req_node.req
+                    if not last_req.finished():
+                        abort = True
+                        abort_message = "Session request is appending to a request that hasn't finished."
+                        logging.warning(abort_message)
+
+        if last_req is not None:
+            # trim bos token if it is an append
+            if (
+                tokenizer is not None
+                and req.input_ids
+                and req.input_ids[0] == tokenizer.bos_token_id
+            ):
+                req.input_ids = req.input_ids[1:]
+
+            input_ids = (
+                last_req.origin_input_ids
+                + last_req.output_ids[: last_req.sampling_params.max_new_tokens]
+            )
+
+            if session_params.drop_previous_output:
+                input_ids = last_req.origin_input_ids[:]
+
+            if session_params.offset and session_params.offset != 0:
+                input_ids = input_ids[: session_params.offset] + req.input_ids
+            else:
+                input_ids += req.input_ids
+
+            input_ids_unpadded = (
+                last_req.origin_input_ids_unpadded
+                + last_req.output_ids[: last_req.sampling_params.max_new_tokens]
+            )
+            if session_params.drop_previous_output:
+                input_ids_unpadded = last_req.origin_input_ids_unpadded[:]
+
+            if session_params.offset and session_params.offset != 0:
+                input_ids_unpadded = (
+                    input_ids_unpadded[: session_params.offset] + req.input_ids
+                )
+            else:
+                input_ids_unpadded += req.input_ids
+        else:
+            input_ids = req.input_ids
+            input_ids_unpadded = req.input_ids
+
+        new_req = Req(
+            rid=req.rid,
+            origin_input_text=None,
+            origin_input_ids=input_ids,
+            origin_input_ids_unpadded=input_ids_unpadded,
+            sampling_params=req.sampling_params,
+            lora_id=req.lora_id,
+            session=self,
+            custom_logit_processor=req.custom_logit_processor,
+            stream=req.stream,
+            return_logprob=req.return_logprob,
+            top_logprobs_num=req.top_logprobs_num,
+            token_ids_logprob=req.token_ids_logprob,
+            vocab_size=vocab_size,
+            eos_token_ids=eos_token_ids,
+            require_reasoning=req.require_reasoning,
+            return_hidden_states=req.return_hidden_states,
+            return_routed_experts=req.return_routed_experts,
+            priority=req.priority,
+            routing_key=req.routing_key,
+            http_worker_ipc=req.http_worker_ipc,
+            time_stats=req.time_stats,
+        )
+        if last_req is not None:
+            new_req.multimodal_inputs = last_req.multimodal_inputs
+        new_req.tokenizer = tokenizer
+
+        if abort:
+            new_req.set_finish_with_abort(abort_message)
+        elif self.streaming:
+            if last_req is not None:
+                last_req.session = None
+            self.req_nodes[req.rid] = SessionReqNode(new_req)
+        else:
+            new_req_node = SessionReqNode(new_req, last_req_node)
+            self.req_nodes[req.rid] = new_req_node
+
+        return new_req
+
+
+class SessionController:
+    def __init__(self, tree_cache: BasePrefixCache):
+        self.sessions: Dict[str, Session] = {}
+        self._last_reap_time: float = 0.0
+        self.tree_cache = tree_cache
+
+    def __contains__(self, session_id: str) -> bool:
+        return session_id in self.sessions
+
+    def get(self, session_id: str) -> Optional[Session]:
+        return self.sessions.get(session_id)
+
+    def open(self, recv_req: OpenSessionReqInput) -> OpenSessionReqOutput:
+        session_id = recv_req.session_id
+        if session_id in self.sessions:
+            logger.warning(f"session id {session_id} already exist, cannot open.")
+            return OpenSessionReqOutput(session_id, False)
+        elif session_id is None:
+            logger.warning("session id is None, cannot open.")
+            return OpenSessionReqOutput(session_id, False)
+        else:
+            self.sessions[session_id] = Session(
+                recv_req.capacity_of_str_len,
+                session_id,
+                streaming=bool(recv_req.streaming),
+                timeout=recv_req.timeout,
+            )
+            return OpenSessionReqOutput(session_id, True)
+
+    def close(self, recv_req: CloseSessionReqInput):
+        session_id = recv_req.session_id
+        if session_id not in self.sessions:
+            logger.warning(f"session id {session_id} does not exist, cannot delete.")
+        else:
+            self._close(session_id)
+
+    def _close(self, session_id: str):
+        session = self.sessions[session_id]
+        if session.streaming and session.req_nodes:
+            assert len(session.req_nodes) == 1
+            req = next(iter(session.req_nodes.values())).req
+            if not req.finished():
+                req.session = None
+        if isinstance(self.tree_cache, SessionAwareCache):
+            self.tree_cache.release_session(session_id)
+        del self.sessions[session_id]
+
+    def maybe_reap(self, now: float, interval: float = 1.0):
+        # reap sessions every second
+        if now - self._last_reap_time > interval:
+            self._last_reap_time = now
+            timed_out = [
+                sid for sid, session in self.sessions.items() if session.is_timed_out()
+            ]
+            for sid in timed_out:
+                logger.info(f"Session {sid} timed out, closing.")
+                self._close(sid)
+
+    @staticmethod
+    def adjust_mm_offsets(recv_req: TokenizedGenerateReqInput, req: Req, image_inputs):
+        # For session requests, adjust mm_inputs offsets by the prefix length.
+        # Session.create_req prepends previous context to origin_input_ids,
+        # so offsets from the new prompt need to be shifted.
+        if len(recv_req.input_ids) >= len(req.origin_input_ids):
+            return
+        prefix_len = len(req.origin_input_ids) - len(recv_req.input_ids)
+        for mm_item in image_inputs.mm_items:
+            if mm_item.offsets:
+                mm_item.offsets = [
+                    (start + prefix_len, end + prefix_len)
+                    for start, end in mm_item.offsets
+                ]
diff --git a/sglang/python/sglang/srt/managers/template_manager.py b/sglang/python/sglang/srt/managers/template_manager.py
new file mode 100644
index 0000000000000000000000000000000000000000..996758477020ac47f7d53d3ff65a937b6a9f9eb6
--- /dev/null
+++ b/sglang/python/sglang/srt/managers/template_manager.py
@@ -0,0 +1,352 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+Centralized template management for chat templates and completion templates.
+
+This module provides a unified interface for managing both chat conversation templates
+and code completion templates, eliminating global state and improving modularity.
+"""
+
+import json
+import logging
+import os
+import re
+from typing import Dict, Optional
+
+from sglang.srt.managers.tokenizer_manager import TokenizerManager
+from sglang.srt.parser.code_completion_parser import (
+    CompletionTemplate,
+    FimPosition,
+    completion_template_exists,
+    register_completion_template,
+    set_completion_template,
+)
+from sglang.srt.parser.conversation import (
+    Conversation,
+    SeparatorStyle,
+    chat_template_exists,
+    get_conv_template_by_model_path,
+    register_conv_template,
+)
+from sglang.srt.parser.jinja_template_utils import detect_jinja_template_content_format
+
+logger = logging.getLogger(__name__)
+
+
+class TemplateManager:
+    """
+    Centralized manager for chat and completion templates.
+
+    This class encapsulates all template-related state and operations,
+    eliminating the need for global variables and providing a clean
+    interface for template management.
+    """
+
+    def __init__(self):
+        self._chat_template_name: Optional[str] = None
+        self._completion_template_name: Optional[str] = None
+        self._jinja_template_content_format: Optional[str] = "openai"
+        self._force_reasoning: bool = False
+
+    @property
+    def chat_template_name(self) -> Optional[str]:
+        """Get the current chat template name."""
+        return self._chat_template_name
+
+    @property
+    def completion_template_name(self) -> Optional[str]:
+        """Get the current completion template name."""
+        return self._completion_template_name
+
+    @property
+    def jinja_template_content_format(self) -> Optional[str]:
+        """Get the detected template content format ('string' or 'openai' or None)."""
+        return self._jinja_template_content_format
+
+    @property
+    def force_reasoning(self) -> bool:
+        """
+        Check if the current chat template enforces reasoning/thinking.
+
+        Returns:
+            True if the template contains reasoning patterns like <think> tags
+        """
+        return self._force_reasoning
+
+    def _detect_reasoning_pattern(self, template: str) -> bool:
+        """
+        Detect if the chat template contains reasoning/thinking patterns.
+        """
+        if template is None:
+            return False
+
+        # TODO: remove this hard code the reasoning pattern
+        force_reasoning_pattern = r"<\|im_start\|>assistant\\n<think>\\n"
+        has_reasoning = re.search(force_reasoning_pattern, template) is not None
+
+        if has_reasoning:
+            logger.info("Detected the force reasoning pattern in chat template.")
+
+        return has_reasoning
+
+    def load_chat_template(
+        self,
+        tokenizer_manager: TokenizerManager,
+        chat_template_arg: Optional[str],
+        model_path: str,
+    ) -> None:
+        """
+        Load a chat template from various sources.
+
+        Args:
+            tokenizer_manager: The tokenizer manager instance
+            chat_template_arg: Template name, file path, or None to auto-detect
+            model_path: Path to the model
+        """
+        if chat_template_arg:
+            self._load_explicit_chat_template(tokenizer_manager, chat_template_arg)
+        else:
+            # Guess chat template from model path
+            self.guess_chat_template_from_model_path(model_path)
+
+            # If no pre-defined template was found, fallback to HuggingFace template
+            if self._chat_template_name is None:
+                # Try HuggingFace template first
+                hf_template = self._resolve_hf_chat_template(tokenizer_manager)
+                if hf_template:
+                    # override the chat template
+                    if tokenizer_manager.tokenizer:
+                        tokenizer_manager.tokenizer.chat_template = hf_template
+                    self._jinja_template_content_format = (
+                        detect_jinja_template_content_format(hf_template)
+                    )
+                    logger.info(
+                        f"Using default HuggingFace chat template with detected content format: {self._jinja_template_content_format}"
+                    )
+                else:
+                    # Default to string content format if no template was found
+                    self._jinja_template_content_format = "string"
+                    logger.info(
+                        "No chat template found, defaulting to 'string' content format"
+                    )
+
+        # Detect reasoning pattern from chat template
+        if tokenizer_manager.tokenizer:
+            self._force_reasoning = self._detect_reasoning_pattern(
+                tokenizer_manager.tokenizer.chat_template
+            )
+
+    def _load_explicit_chat_template(
+        self, tokenizer_manager: TokenizerManager, chat_template_arg: str
+    ) -> None:
+        """Load explicitly specified chat template."""
+        logger.info(f"Loading chat template from argument: {chat_template_arg}")
+
+        if chat_template_exists(chat_template_arg):
+            self._chat_template_name = chat_template_arg
+            return
+
+        if not os.path.exists(chat_template_arg):
+            raise RuntimeError(
+                f"Chat template {chat_template_arg} is not a built-in template name "
+                "or a valid chat template file path."
+            )
+
+        if chat_template_arg.endswith(".jinja"):
+            self._load_jinja_template(tokenizer_manager, chat_template_arg)
+        else:
+            self._load_json_chat_template(chat_template_arg)
+
+    def guess_chat_template_from_model_path(self, model_path: str) -> None:
+        """
+        Infer chat template name from model path.
+
+        Args:
+            model_path: Path to the model
+        """
+        template_name = get_conv_template_by_model_path(model_path)
+        if template_name is not None:
+            logger.info(f"Inferred chat template from model path: {template_name}")
+            self._chat_template_name = template_name
+
+    def load_completion_template(self, completion_template_arg: str) -> None:
+        """
+        Load completion template for code completion.
+
+        Args:
+            completion_template_arg: Template name or file path
+        """
+        logger.info(f"Loading completion template: {completion_template_arg}")
+
+        if not completion_template_exists(completion_template_arg):
+            if not os.path.exists(completion_template_arg):
+                raise RuntimeError(
+                    f"Completion template {completion_template_arg} is not a built-in template name "
+                    "or a valid completion template file path."
+                )
+
+            self._load_json_completion_template(completion_template_arg)
+        else:
+            self._completion_template_name = completion_template_arg
+
+        set_completion_template(self._completion_template_name)
+
+    def initialize_templates(
+        self,
+        tokenizer_manager: TokenizerManager,
+        model_path: str,
+        chat_template: Optional[str] = None,
+        completion_template: Optional[str] = None,
+    ) -> None:
+        """
+        Initialize all templates based on provided configuration.
+
+        Args:
+            tokenizer_manager: The tokenizer manager instance
+            model_path: Path to the model
+            chat_template: Optional chat template name/path
+            completion_template: Optional completion template name/path
+        """
+        # Load chat template
+        self.load_chat_template(tokenizer_manager, chat_template, model_path)
+
+        # Load completion template
+        if completion_template:
+            self.load_completion_template(completion_template)
+
+    def _load_jinja_template(
+        self, tokenizer_manager: TokenizerManager, template_path: str
+    ) -> None:
+        """Load a Jinja template file."""
+        with open(template_path, "r") as f:
+            chat_template = "".join(f.readlines()).strip("\n")
+        tokenizer_manager.tokenizer.chat_template = chat_template.replace("\\n", "\n")
+        self._chat_template_name = None
+        # Detect content format from the loaded template
+        self._jinja_template_content_format = detect_jinja_template_content_format(
+            chat_template
+        )
+        logger.info(
+            f"Detected user specified Jinja chat template with content format: {self._jinja_template_content_format}"
+        )
+
+    def _load_json_chat_template(self, template_path: str) -> None:
+        """Load a JSON chat template file."""
+        assert template_path.endswith(
+            ".json"
+        ), "unrecognized format of chat template file"
+
+        with open(template_path, "r") as filep:
+            template = json.load(filep)
+            try:
+                sep_style = SeparatorStyle[template["sep_style"]]
+            except KeyError:
+                raise ValueError(
+                    f"Unknown separator style: {template['sep_style']}"
+                ) from None
+
+            register_conv_template(
+                Conversation(
+                    name=template["name"],
+                    system_template=template["system"] + "\n{system_message}",
+                    system_message=template.get("system_message", ""),
+                    roles=(template["user"], template["assistant"]),
+                    sep_style=sep_style,
+                    sep=template.get("sep", "\n"),
+                    stop_str=template["stop_str"],
+                ),
+                override=True,
+            )
+        self._chat_template_name = template["name"]
+
+    def _load_json_completion_template(self, template_path: str) -> None:
+        """Load a JSON completion template file."""
+        assert template_path.endswith(
+            ".json"
+        ), "unrecognized format of completion template file"
+
+        with open(template_path, "r") as filep:
+            template = json.load(filep)
+            try:
+                fim_position = FimPosition[template["fim_position"]]
+            except KeyError:
+                raise ValueError(
+                    f"Unknown fim position: {template['fim_position']}"
+                ) from None
+
+            register_completion_template(
+                CompletionTemplate(
+                    name=template["name"],
+                    fim_begin_token=template["fim_begin_token"],
+                    fim_middle_token=template["fim_middle_token"],
+                    fim_end_token=template["fim_end_token"],
+                    fim_position=fim_position,
+                ),
+                override=True,
+            )
+        self._completion_template_name = template["name"]
+
+    def _resolve_hf_chat_template(
+        self, tokenizer_manager: TokenizerManager
+    ) -> Optional[str]:
+        try:
+            # Try (mm-)processor first, then tokenizer
+            template = (
+                getattr(tokenizer_manager.processor, "chat_template", None)
+                if tokenizer_manager.processor
+                else None
+            ) or (
+                getattr(tokenizer_manager.tokenizer, "chat_template", None)
+                if tokenizer_manager.tokenizer
+                else None
+            )
+
+            if template is None:
+                logger.warning("No HuggingFace chat template found")
+                return None
+
+            # Handle dict templates (multiple named templates)
+            if isinstance(template, dict):
+                return self._select_named_template(template, tokenizer_manager)
+
+            # Single string template
+            return template
+
+        except Exception as e:
+            logger.warning(f"Error getting chat template: {e}")
+            return None
+
+    def _select_named_template(
+        self, templates: Dict[str, str], tokenizer_manager: TokenizerManager
+    ) -> str:
+        if not templates:
+            raise ValueError("Empty templates dict provided")
+
+        available_names = list(templates.keys())
+        logger.info(f"Multiple HuggingFace chat templates available: {available_names}")
+
+        # Use specified template if provided
+        if preferred_name := tokenizer_manager.server_args.hf_chat_template_name:
+            if preferred_name not in templates:
+                raise ValueError(
+                    f"Specified template '{preferred_name}' not found. "
+                    f"Available templates: {available_names}"
+                )
+            logger.info(f"Using specified chat template: '{preferred_name}'")
+            return templates[preferred_name]
+
+        # Fallback: Use first available template
+        first_name = available_names[0]
+        logger.info(f"Using first available template: '{first_name}'")
+        return templates[first_name]
diff --git a/sglang/python/sglang/srt/managers/tokenizer_communicator_mixin.py b/sglang/python/sglang/srt/managers/tokenizer_communicator_mixin.py
new file mode 100644
index 0000000000000000000000000000000000000000..3faf15cd36896ec4ef71ed8d3d5e22d08741536e
--- /dev/null
+++ b/sglang/python/sglang/srt/managers/tokenizer_communicator_mixin.py
@@ -0,0 +1,986 @@
+from __future__ import annotations
+
+import asyncio
+import copy
+import logging
+import time
+import uuid
+from collections import deque
+from contextlib import nullcontext
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Deque,
+    Dict,
+    Generic,
+    List,
+    Optional,
+    Tuple,
+    TypeVar,
+)
+
+import fastapi
+import zmq
+
+from sglang.srt.managers.io_struct import (
+    AttachHiCacheStorageReqInput,
+    AttachHiCacheStorageReqOutput,
+    CheckWeightsReqInput,
+    CheckWeightsReqOutput,
+    ClearHiCacheReqInput,
+    ClearHiCacheReqOutput,
+    CloseSessionReqInput,
+    DestroyWeightsUpdateGroupReqInput,
+    DestroyWeightsUpdateGroupReqOutput,
+    DetachHiCacheStorageReqInput,
+    DetachHiCacheStorageReqOutput,
+    DumperControlReqInput,
+    DumperControlReqOutput,
+    ExpertDistributionReq,
+    ExpertDistributionReqOutput,
+    ExpertDistributionReqType,
+    FlushCacheReqInput,
+    FlushCacheReqOutput,
+    GetInternalStateReq,
+    GetInternalStateReqOutput,
+    GetLoadReqInput,
+    GetLoadReqOutput,
+    GetLoadsReqInput,
+    GetLoadsReqOutput,
+    GetWeightsByNameReqInput,
+    GetWeightsByNameReqOutput,
+    InitWeightsSendGroupForRemoteInstanceReqInput,
+    InitWeightsSendGroupForRemoteInstanceReqOutput,
+    InitWeightsUpdateGroupReqInput,
+    InitWeightsUpdateGroupReqOutput,
+    LoadLoRAAdapterFromTensorsReqInput,
+    LoadLoRAAdapterFromTensorsReqOutput,
+    LoadLoRAAdapterReqInput,
+    LoadLoRAAdapterReqOutput,
+    LoRAUpdateOutput,
+    OpenSessionReqInput,
+    PinPrefixReqInput,
+    PinPrefixReqOutput,
+    ProfileReq,
+    ProfileReqOutput,
+    ProfileReqType,
+    ReleaseMemoryOccupationReqInput,
+    ReleaseMemoryOccupationReqOutput,
+    ResumeMemoryOccupationReqInput,
+    ResumeMemoryOccupationReqOutput,
+    SendWeightsToRemoteInstanceReqInput,
+    SendWeightsToRemoteInstanceReqOutput,
+    SetInternalStateReq,
+    SetInternalStateReqOutput,
+    SlowDownReqInput,
+    SlowDownReqOutput,
+    UnloadLoRAAdapterReqInput,
+    UnloadLoRAAdapterReqOutput,
+    UpdateWeightsFromDistributedReqInput,
+    UpdateWeightsFromDistributedReqOutput,
+    UpdateWeightsFromIPCReqInput,
+    UpdateWeightsFromIPCReqOutput,
+    UpdateWeightsFromTensorReqInput,
+    UpdateWeightsFromTensorReqOutput,
+)
+from sglang.srt.server_args import LoRARef, ServerArgs
+from sglang.srt.utils import get_bool_env_var
+from sglang.utils import TypeBasedDispatcher
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
+T = TypeVar("T")
+
+logger = logging.getLogger(__name__)
+
+
+class _Communicator(Generic[T]):
+    """Note: The communicator now only run up to 1 in-flight request at any time."""
+
+    def __init__(self, sender: zmq.Socket, fan_out: int, mode="queueing"):
+        self._sender = sender
+        self._fan_out = fan_out
+        self._mode = mode
+        self._result_event: Optional[asyncio.Event] = None
+        self._result_values: Optional[List[T]] = None
+        self._ready_queue: Deque[asyncio.Future] = deque()
+
+        assert mode in ["queueing", "watching"]
+
+    async def queueing_call(self, obj: T):
+        ready_event = asyncio.Event()
+        if self._result_event is not None or len(self._ready_queue) > 0:
+            self._ready_queue.append(ready_event)
+            await ready_event.wait()
+            assert self._result_event is None
+            assert self._result_values is None
+
+        if obj:
+            self._sender.send_pyobj(obj)
+
+        self._result_event = asyncio.Event()
+        self._result_values = []
+        await self._result_event.wait()
+        result_values = self._result_values
+        self._result_event = self._result_values = None
+
+        if len(self._ready_queue) > 0:
+            self._ready_queue.popleft().set()
+
+        return result_values
+
+    async def watching_call(self, obj):
+        if self._result_event is None:
+            assert self._result_values is None
+            self._result_values = []
+            self._result_event = asyncio.Event()
+
+            if obj:
+                self._sender.send_pyobj(obj)
+
+        await self._result_event.wait()
+        result_values = copy.deepcopy(self._result_values)
+        self._result_event = self._result_values = None
+        return result_values
+
+    async def __call__(self, obj):
+        if self._mode == "queueing":
+            return await self.queueing_call(obj)
+        else:
+            return await self.watching_call(obj)
+
+    def handle_recv(self, recv_obj: T):
+        self._result_values.append(recv_obj)
+        if len(self._result_values) == self._fan_out:
+            self._result_event.set()
+
+    @staticmethod
+    def merge_results(results):
+        all_success = all([r.success for r in results])
+        all_message = [r.message for r in results]
+        all_message = " | ".join(all_message)
+        return all_success, all_message
+
+
+class TokenizerCommunicatorMixin:
+    """Mixin class for TokenizerManager to handle communication with the scheduler."""
+
+    def init_communicators(self: TokenizerManager, server_args: ServerArgs):
+        # Communicators
+        self.init_weights_update_group_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.destroy_weights_update_group_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.update_weights_from_distributed_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.init_weights_send_group_for_remote_instance_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.send_weights_to_remote_instance_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.update_weights_from_tensor_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.update_weights_from_ipc_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.get_weights_by_name_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.release_memory_occupation_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.resume_memory_occupation_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.check_weights_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.slow_down_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.flush_cache_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.clear_hicache_storage_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.attach_hicache_storage_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.detach_hicache_storage_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.pin_prefix_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.profile_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.get_internal_state_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.set_internal_state_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.expert_distribution_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.update_lora_adapter_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.get_load_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size, mode="watching"
+        )
+        self.get_loads_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+        self.dumper_control_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
+
+        self._result_dispatcher += self._get_communicator_dispatcher()
+
+    def _get_communicator_dispatcher(self: TokenizerManager):
+        return TypeBasedDispatcher(
+            [
+                (
+                    InitWeightsUpdateGroupReqOutput,
+                    self.init_weights_update_group_communicator.handle_recv,
+                ),
+                (
+                    DestroyWeightsUpdateGroupReqOutput,
+                    self.destroy_weights_update_group_communicator.handle_recv,
+                ),
+                (
+                    UpdateWeightsFromDistributedReqOutput,
+                    self.update_weights_from_distributed_communicator.handle_recv,
+                ),
+                (
+                    InitWeightsSendGroupForRemoteInstanceReqOutput,
+                    self.init_weights_send_group_for_remote_instance_communicator.handle_recv,
+                ),
+                (
+                    SendWeightsToRemoteInstanceReqOutput,
+                    self.send_weights_to_remote_instance_communicator.handle_recv,
+                ),
+                (
+                    UpdateWeightsFromTensorReqOutput,
+                    self.update_weights_from_tensor_communicator.handle_recv,
+                ),
+                (
+                    UpdateWeightsFromIPCReqOutput,
+                    self.update_weights_from_ipc_communicator.handle_recv,
+                ),
+                (
+                    GetWeightsByNameReqOutput,
+                    self.get_weights_by_name_communicator.handle_recv,
+                ),
+                (
+                    ReleaseMemoryOccupationReqOutput,
+                    self.release_memory_occupation_communicator.handle_recv,
+                ),
+                (
+                    ResumeMemoryOccupationReqOutput,
+                    self.resume_memory_occupation_communicator.handle_recv,
+                ),
+                (
+                    CheckWeightsReqOutput,
+                    self.check_weights_communicator.handle_recv,
+                ),
+                (
+                    SlowDownReqOutput,
+                    self.slow_down_communicator.handle_recv,
+                ),
+                (
+                    ClearHiCacheReqOutput,
+                    self.clear_hicache_storage_communicator.handle_recv,
+                ),
+                (
+                    AttachHiCacheStorageReqOutput,
+                    self.attach_hicache_storage_communicator.handle_recv,
+                ),
+                (
+                    DetachHiCacheStorageReqOutput,
+                    self.detach_hicache_storage_communicator.handle_recv,
+                ),
+                (
+                    PinPrefixReqOutput,
+                    self.pin_prefix_communicator.handle_recv,
+                ),
+                (
+                    FlushCacheReqOutput,
+                    self.flush_cache_communicator.handle_recv,
+                ),
+                (
+                    ProfileReqOutput,
+                    self.profile_communicator.handle_recv,
+                ),
+                (
+                    GetInternalStateReqOutput,
+                    self.get_internal_state_communicator.handle_recv,
+                ),
+                (
+                    SetInternalStateReqOutput,
+                    self.set_internal_state_communicator.handle_recv,
+                ),
+                (
+                    ExpertDistributionReqOutput,
+                    self.expert_distribution_communicator.handle_recv,
+                ),
+                (
+                    LoRAUpdateOutput,
+                    self.update_lora_adapter_communicator.handle_recv,
+                ),
+                (
+                    GetLoadReqOutput,
+                    self.get_load_communicator.handle_recv,
+                ),
+                (
+                    GetLoadsReqOutput,
+                    self.get_loads_communicator.handle_recv,
+                ),
+                (
+                    DumperControlReqOutput,
+                    self.dumper_control_communicator.handle_recv,
+                ),
+            ]
+        )
+
+    async def flush_cache(self: TokenizerManager) -> FlushCacheReqOutput:
+        self.auto_create_handle_loop()
+        return (await self.flush_cache_communicator(FlushCacheReqInput()))[0]
+
+    async def clear_hicache_storage(self: TokenizerManager) -> ClearHiCacheReqOutput:
+        """Clear the hierarchical cache storage."""
+        self.auto_create_handle_loop()
+        # Delegate to the scheduler to handle HiCacheStorage clearing
+        return (await self.clear_hicache_storage_communicator(ClearHiCacheReqInput()))[
+            0
+        ]
+
+    async def attach_hicache_storage(
+        self: TokenizerManager,
+        hicache_storage_backend: str,
+        hicache_storage_backend_extra_config_json: Optional[str] = None,
+        hicache_storage_prefetch_policy: Optional[str] = None,
+        hicache_write_policy: Optional[str] = None,
+    ) -> AttachHiCacheStorageReqOutput:
+        """Attach (enable) HiCache storage backend at runtime."""
+        self.auto_create_handle_loop()
+        results = await self.attach_hicache_storage_communicator(
+            AttachHiCacheStorageReqInput(
+                hicache_storage_backend=hicache_storage_backend,
+                hicache_storage_backend_extra_config_json=hicache_storage_backend_extra_config_json,
+                hicache_storage_prefetch_policy=hicache_storage_prefetch_policy,
+                hicache_write_policy=hicache_write_policy,
+            )
+        )
+
+        all_success, all_message = _Communicator.merge_results(results)
+        out = AttachHiCacheStorageReqOutput(success=all_success, message=all_message)
+        # TODO: partial rollback if failed
+        if all_success:
+            # Keep tokenizer side server_info consistent with scheduler side.
+            self.server_args.hicache_storage_backend = hicache_storage_backend
+            if hicache_storage_backend_extra_config_json is not None:
+                self.server_args.hicache_storage_backend_extra_config = (
+                    hicache_storage_backend_extra_config_json
+                )
+            if hicache_storage_prefetch_policy is not None:
+                self.server_args.hicache_storage_prefetch_policy = (
+                    hicache_storage_prefetch_policy
+                )
+            if hicache_write_policy is not None:
+                self.server_args.hicache_write_policy = hicache_write_policy
+        return out
+
+    async def detach_hicache_storage(
+        self: TokenizerManager,
+    ) -> DetachHiCacheStorageReqOutput:
+        """Detach (disable) HiCache storage backend at runtime."""
+        self.auto_create_handle_loop()
+        results = await self.detach_hicache_storage_communicator(
+            DetachHiCacheStorageReqInput()
+        )
+
+        all_success, all_message = _Communicator.merge_results(results)
+        out = DetachHiCacheStorageReqOutput(success=all_success, message=all_message)
+        # TODO: partial rollback if failed
+        if all_success:
+            self.server_args.hicache_storage_backend = None
+            self.server_args.hicache_storage_backend_extra_config = None
+        return out
+
+    async def pin_prefix(
+        self: TokenizerManager, token_ids: List[int], ttl_seconds: int = 300
+    ) -> PinPrefixReqOutput:
+        """Pin a prefix by token_ids to resist eviction."""
+        results = await self.pin_prefix_communicator(
+            PinPrefixReqInput(token_ids=token_ids, ttl_seconds=ttl_seconds)
+        )
+        all_success, all_message = _Communicator.merge_results(results)
+        total = sum(r.nodes_pinned for r in results)
+        return PinPrefixReqOutput(
+            success=all_success, nodes_pinned=total, message=all_message
+        )
+
+    async def start_profile(
+        self: TokenizerManager,
+        output_dir: Optional[str] = None,
+        start_step: Optional[int] = None,
+        num_steps: Optional[int] = None,
+        activities: Optional[List[str]] = None,
+        with_stack: Optional[bool] = None,
+        record_shapes: Optional[bool] = None,
+        profile_by_stage: bool = False,
+        merge_profiles: bool = False,
+        profile_prefix: Optional[str] = None,
+        profile_stages: Optional[List[str]] = None,
+    ):
+        self.auto_create_handle_loop()
+        env_with_stack: bool = get_bool_env_var("SGLANG_PROFILE_WITH_STACK", "true")
+        with_stack = False if with_stack is False or env_with_stack is False else True
+        env_record_shapes: bool = get_bool_env_var(
+            "SGLANG_PROFILE_RECORD_SHAPES", "true"
+        )
+        record_shapes = (record_shapes is not False) and env_record_shapes
+        req = ProfileReq(
+            type=ProfileReqType.START_PROFILE,
+            output_dir=output_dir,
+            start_step=start_step,
+            num_steps=num_steps,
+            activities=activities,
+            with_stack=with_stack,
+            record_shapes=record_shapes,
+            profile_by_stage=profile_by_stage,
+            profile_id=str(time.time()),
+            merge_profiles=merge_profiles,
+            profile_prefix=profile_prefix,
+            profile_stages=profile_stages,
+        )
+        return await self._execute_profile(req)
+
+    async def stop_profile(self: TokenizerManager):
+        self.auto_create_handle_loop()
+        req = ProfileReq(type=ProfileReqType.STOP_PROFILE)
+        return await self._execute_profile(req)
+
+    async def _execute_profile(self: TokenizerManager, req: ProfileReq):
+        result = (await self.profile_communicator(req))[0]
+        if not result.success:
+            raise RuntimeError(result.message)
+        return result
+
+    async def start_expert_distribution_record(self: TokenizerManager):
+        self.auto_create_handle_loop()
+        req = ExpertDistributionReq(action=ExpertDistributionReqType.START_RECORD)
+        await self.expert_distribution_communicator(req)
+
+    async def stop_expert_distribution_record(self: TokenizerManager):
+        self.auto_create_handle_loop()
+        req = ExpertDistributionReq(action=ExpertDistributionReqType.STOP_RECORD)
+        await self.expert_distribution_communicator(req)
+
+    async def dump_expert_distribution_record(self: TokenizerManager):
+        self.auto_create_handle_loop()
+        req = ExpertDistributionReq(action=ExpertDistributionReqType.DUMP_RECORD)
+        await self.expert_distribution_communicator(req)
+
+    async def init_weights_update_group(
+        self: TokenizerManager,
+        obj: InitWeightsUpdateGroupReqInput,
+        request: Optional[fastapi.Request] = None,
+    ) -> Tuple[bool, str]:
+        self.auto_create_handle_loop()
+        assert (
+            self.server_args.dp_size == 1 or self.server_args.enable_dp_attention
+        ), "dp_size must be 1 or dp attention must be enabled for update weights from distributed"
+
+        results = await self.init_weights_update_group_communicator(obj)
+        return _Communicator.merge_results(results)
+
+    async def destroy_weights_update_group(
+        self: TokenizerManager,
+        obj: DestroyWeightsUpdateGroupReqInput,
+        request: Optional[fastapi.Request] = None,
+    ) -> Tuple[bool, str]:
+        self.auto_create_handle_loop()
+        assert (
+            self.server_args.dp_size == 1 or self.server_args.enable_dp_attention
+        ), "dp_size must be 1 or dp attention must be enabled for destroy parameter update group"
+
+        results = await self.destroy_weights_update_group_communicator(obj)
+        return _Communicator.merge_results(results)
+
+    async def update_weights_from_distributed(
+        self: TokenizerManager,
+        obj: UpdateWeightsFromDistributedReqInput,
+        request: Optional[fastapi.Request] = None,
+    ) -> Tuple[bool, str]:
+        self.auto_create_handle_loop()
+        assert (
+            self.server_args.dp_size == 1 or self.server_args.enable_dp_attention
+        ), "dp_size must be 1 or dp attention must be enabled for update weights from distributed"
+
+        if obj.abort_all_requests:
+            self.abort_request(abort_all=True)
+
+        # Immediately update the weights if the engine is in paused state
+        async with self.is_pause_cond:
+            is_paused = self.is_pause
+
+        lock_context = (
+            self.model_update_lock.writer_lock if not is_paused else nullcontext()
+        )
+        async with lock_context:
+            results = await self.update_weights_from_distributed_communicator(obj)
+
+        success, message = _Communicator.merge_results(results)
+        if success and obj.weight_version is not None:
+            self._update_weight_version_if_provided(obj.weight_version)
+            message += f" Weight version updated to {obj.weight_version}."
+
+        return success, message
+
+    async def init_weights_send_group_for_remote_instance(
+        self: TokenizerManager,
+        obj: InitWeightsSendGroupForRemoteInstanceReqInput,
+        request: Optional[fastapi.Request] = None,
+    ) -> Tuple[bool, str]:
+        self.auto_create_handle_loop()
+        # TODO: support DP
+        assert (
+            self.server_args.dp_size == 1
+        ), "dp_size must be 1 for init_weights_send_group_for_remote_instance"
+        result = (
+            await self.init_weights_send_group_for_remote_instance_communicator(obj)
+        )[0]
+        return result.success, result.message
+
+    async def send_weights_to_remote_instance(
+        self: TokenizerManager,
+        obj: SendWeightsToRemoteInstanceReqInput,
+        request: Optional[fastapi.Request] = None,
+    ) -> Tuple[bool, str]:
+        self.auto_create_handle_loop()
+        # TODO: support DP
+        assert (
+            self.server_args.dp_size == 1
+        ), "dp_size must be 1 for send_weights_to_remote_instance"
+        result = (await self.send_weights_to_remote_instance_communicator(obj))[0]
+        return result.success, result.message
+
+    async def update_weights_from_tensor(
+        self: TokenizerManager,
+        obj: UpdateWeightsFromTensorReqInput,
+        request: Optional[fastapi.Request] = None,
+    ) -> Tuple[bool, str]:
+        self.auto_create_handle_loop()
+        assert (
+            self.server_args.dp_size == 1 or self.server_args.enable_dp_attention
+        ), "dp_size must be 1 or dp attention must be enabled for update weights from tensor"
+
+        if obj.abort_all_requests:
+            self.abort_request(abort_all=True)
+
+        # Immediately update the weights if the engine is in paused state
+        async with self.is_pause_cond:
+            is_paused = self.is_pause
+
+        lock_context = (
+            self.model_update_lock.writer_lock if not is_paused else nullcontext()
+        )
+        async with lock_context:
+            results = await self.update_weights_from_tensor_communicator(obj)
+
+        success, message = _Communicator.merge_results(results)
+        if success and obj.weight_version is not None:
+            self._update_weight_version_if_provided(obj.weight_version)
+            message += f" Weight version updated to {obj.weight_version}."
+
+        return success, message
+
+    async def update_weights_from_ipc(
+        self: TokenizerManager,
+        obj: UpdateWeightsFromIPCReqInput,
+        request: Optional[fastapi.Request] = None,
+    ) -> Tuple[bool, str]:
+        """Update weights via IPC for checkpoint-engine integration."""
+        self.auto_create_handle_loop()
+        try:
+            # For now, we only support single data parallel instance
+            assert (
+                self.server_args.dp_size == 1 or self.server_args.enable_dp_attention
+            ), "dp_size must be 1 or dp attention must be enabled for update weights from IPC"
+            logger.info("Starting IPC weight update")
+            # This means that weight sync cannot run while requests are in progress.
+            async with self.model_update_lock.writer_lock:
+                result = (await self.update_weights_from_ipc_communicator(obj))[0]
+                success, message = result.success, result.message
+        except Exception as e:
+            error_msg = f"IPC weight update failed: {str(e)}"
+            logger.error(error_msg)
+            success, message = False, error_msg
+
+        if success and obj.weight_version is not None:
+            self._update_weight_version_if_provided(obj.weight_version)
+            message += f" Weight version updated to {obj.weight_version}."
+
+        return success, message
+
+    async def _unload_lora_adapter_locked(
+        self: TokenizerManager,
+        obj: UnloadLoRAAdapterReqInput,
+    ) -> UnloadLoRAAdapterReqOutput:
+        assert (
+            self.lora_update_lock.locked()
+        ), "self.lora_update_lock must be locked in order for self._unload_lora_adapter_locked() to be called"
+
+        # Unregister the LoRA adapter from the registry to stop new requests for this adapter
+        # from being started.
+        lora_id = await self.lora_registry.unregister(obj.lora_name)
+        obj.lora_id = lora_id
+
+        # Initiate the actual unloading operation at the backend processes only after all
+        # ongoing requests using this LoRA adapter are finished.
+        await self.lora_registry.wait_for_unload(lora_id)
+        result = (await self.update_lora_adapter_communicator(obj))[0]
+
+        return result
+
+    async def load_lora_adapter(
+        self: TokenizerManager,
+        obj: LoadLoRAAdapterReqInput,
+        _: Optional[fastapi.Request] = None,
+    ) -> LoadLoRAAdapterReqOutput:
+        self.auto_create_handle_loop()
+
+        try:
+            if not self.server_args.enable_lora:
+                raise ValueError(
+                    "LoRA is not enabled. Please set `--enable-lora` to enable LoRA."
+                )
+
+            # TODO (lifuhuang): Remove this after we verify that dynamic lora loading works
+            # with dp_size > 1.
+            assert (
+                self.server_args.dp_size == 1
+            ), "dp_size must be 1 for dynamic lora loading"
+            logger.info(
+                "Start load Lora adapter. Lora name=%s, path=%s",
+                obj.lora_name,
+                obj.lora_path,
+            )
+
+            async with self.lora_update_lock:
+                # Generate new uniquely identifiable LoRARef object.
+                new_adapter = LoRARef(
+                    lora_name=obj.lora_name,
+                    lora_path=obj.lora_path,
+                    pinned=obj.pinned,
+                )
+
+                # Trigger the actual loading operation at the backend processes.
+                obj.lora_id = new_adapter.lora_id
+                result = (await self.update_lora_adapter_communicator(obj))[0]
+
+                # Register the LoRA adapter only after loading is successful.
+                if result.success:
+                    await self.lora_registry.register(new_adapter)
+                    self.lora_ref_cache[obj.lora_name] = new_adapter
+
+                if self.server_args.max_loaded_loras is not None:
+                    while (
+                        self.lora_registry.num_registered_loras
+                        > self.server_args.max_loaded_loras
+                    ):
+                        lru_lora_name = await self.lora_registry.lru_lora_name(
+                            exclude_pinned=True
+                        )
+                        if lru_lora_name is None:
+                            raise ValueError(
+                                "Didn't find any LoRA adapters when trying to evict LRU LoRA adapter. "
+                                f"LoRA registry is: {self.lora_registry._registry}"
+                            )
+
+                        logger.info(
+                            f"Unloading least recently used LoRA adapter '{lru_lora_name}' "
+                            f"(current number of adapters: {self.lora_registry.num_registered_loras}, "
+                            f"max allowed: {self.server_args.max_loaded_loras})"
+                        )
+
+                        unload_result = await self._unload_lora_adapter_locked(
+                            UnloadLoRAAdapterReqInput(lora_name=lru_lora_name)
+                        )
+                        if not unload_result.success:
+                            raise ValueError(
+                                f"Error while unloading LRU LoRA adapter '{lru_lora_name}': "
+                                f"{unload_result.error_message}"
+                            )
+                        del result.loaded_adapters[lru_lora_name]
+
+                return result
+        except ValueError as e:
+            return LoadLoRAAdapterReqOutput(
+                success=False,
+                error_message=str(e),
+            )
+
+    async def load_lora_adapter_from_tensors(
+        self: TokenizerManager,
+        obj: LoadLoRAAdapterFromTensorsReqInput,
+        _: Optional[fastapi.Request] = None,
+    ) -> LoadLoRAAdapterFromTensorsReqOutput:
+        self.auto_create_handle_loop()
+
+        try:
+            if not self.server_args.enable_lora:
+                raise ValueError(
+                    "LoRA is not enabled. Please set `--enable-lora` to enable LoRA."
+                )
+
+            assert (
+                self.server_args.dp_size == 1
+            ), "dp_size must be 1 for dynamic lora loading"
+            logger.info(
+                "Start load Lora adapter from tensors. Lora name=%s",
+                obj.lora_name,
+            )
+
+            async with self.lora_update_lock:
+                new_adapter = LoRARef(
+                    lora_name=obj.lora_name,
+                    lora_path="__tensor__",
+                    pinned=obj.pinned,
+                )
+                obj.lora_id = new_adapter.lora_id
+                result = (await self.update_lora_adapter_communicator(obj))[0]
+
+                if result.success:
+                    await self.lora_registry.register(new_adapter)
+                    self.lora_ref_cache[obj.lora_name] = new_adapter
+                if self.server_args.max_loaded_loras is not None:
+                    while (
+                        self.lora_registry.num_registered_loras
+                        > self.server_args.max_loaded_loras
+                    ):
+                        lru_lora_name = await self.lora_registry.lru_lora_name(
+                            exclude_pinned=True
+                        )
+                        if lru_lora_name is None:
+                            raise ValueError(
+                                "Didn't find any LoRA adapters when trying to evict LRU LoRA adapter. "
+                                f"LoRA registry is: {self.lora_registry._registry}"
+                            )
+
+                        logger.info(
+                            f"Unloading least recently used LoRA adapter '{lru_lora_name}' "
+                            f"(current number of adapters: {self.lora_registry.num_registered_loras}, "
+                            f"max allowed: {self.server_args.max_loaded_loras})"
+                        )
+
+                        unload_result = await self._unload_lora_adapter_locked(
+                            UnloadLoRAAdapterReqInput(lora_name=lru_lora_name)
+                        )
+                        if not unload_result.success:
+                            raise ValueError(
+                                f"Error while unloading LRU LoRA adapter '{lru_lora_name}': "
+                                f"{unload_result.error_message}"
+                            )
+                        del result.loaded_adapters[lru_lora_name]
+
+                return result
+        except ValueError as e:
+            return LoadLoRAAdapterFromTensorsReqOutput(
+                success=False,
+                error_message=str(e),
+            )
+
+    async def unload_lora_adapter(
+        self: TokenizerManager,
+        obj: UnloadLoRAAdapterReqInput,
+        _: Optional[fastapi.Request] = None,
+    ) -> UnloadLoRAAdapterReqOutput:
+        self.auto_create_handle_loop()
+
+        try:
+            if not self.server_args.enable_lora:
+                raise ValueError(
+                    "LoRA is not enabled. Please set `--enable-lora` to enable LoRA."
+                )
+
+            assert (
+                obj.lora_name is not None
+            ), "lora_name must be provided to unload LoRA adapter"
+
+            # TODO (lifuhuang): Remove this after we verify that dynamic lora loading works
+            # with dp_size > 1.
+            assert (
+                self.server_args.dp_size == 1
+            ), "dp_size must be 1 for dynamic lora loading"
+            logger.info(
+                "Start unload Lora adapter. Lora name=%s",
+                obj.lora_name,
+            )
+
+            async with self.lora_update_lock:
+                return await self._unload_lora_adapter_locked(obj)
+        except ValueError as e:
+            return UnloadLoRAAdapterReqOutput(success=False, error_message=str(e))
+
+    async def get_weights_by_name(
+        self: TokenizerManager,
+        obj: GetWeightsByNameReqInput,
+        request: Optional[fastapi.Request] = None,
+    ):
+        self.auto_create_handle_loop()
+        results = await self.get_weights_by_name_communicator(obj)
+        all_parameters = [r.parameter for r in results]
+        if self.server_args.dp_size == 1:
+            return all_parameters[0]
+        else:
+            return all_parameters
+
+    async def release_memory_occupation(
+        self: TokenizerManager,
+        obj: ReleaseMemoryOccupationReqInput,
+        request: Optional[fastapi.Request] = None,
+    ):
+        self.auto_create_handle_loop()
+        await self.release_memory_occupation_communicator(obj)
+
+    async def resume_memory_occupation(
+        self: TokenizerManager,
+        obj: ResumeMemoryOccupationReqInput,
+        request: Optional[fastapi.Request] = None,
+    ):
+        self.auto_create_handle_loop()
+        await self.resume_memory_occupation_communicator(obj)
+
+    async def check_weights(
+        self: TokenizerManager,
+        obj: CheckWeightsReqInput,
+        request: Optional[fastapi.Request] = None,
+    ) -> CheckWeightsReqOutput:
+        self.auto_create_handle_loop()
+        results = await self.check_weights_communicator(obj)
+        return _Communicator.merge_results(results)
+
+    async def slow_down(
+        self: TokenizerManager,
+        obj: SlowDownReqInput,
+        request: Optional[fastapi.Request] = None,
+    ):
+        self.auto_create_handle_loop()
+        await self.slow_down_communicator(obj)
+
+    async def get_internal_state(self: TokenizerManager) -> List[Dict[Any, Any]]:
+        self.auto_create_handle_loop()
+        req = GetInternalStateReq()
+        responses: List[GetInternalStateReqOutput] = (
+            await self.get_internal_state_communicator(req)
+        )
+        # Many DP ranks
+        return [res.internal_state for res in responses]
+
+    async def set_internal_state(
+        self: TokenizerManager, obj: SetInternalStateReq
+    ) -> List[bool]:
+        self.auto_create_handle_loop()
+        responses: List[SetInternalStateReqOutput] = (
+            await self.set_internal_state_communicator(obj)
+        )
+        return [res.updated for res in responses]
+
+    async def dumper_control(
+        self: TokenizerManager, obj: DumperControlReqInput
+    ) -> List[DumperControlReqOutput]:
+        self.auto_create_handle_loop()
+        return await self.dumper_control_communicator(obj)
+
+    async def get_load(self: TokenizerManager) -> List[GetLoadReqOutput]:
+        self.auto_create_handle_loop()
+        req = GetLoadReqInput()
+        return await self.get_load_communicator(req)
+
+    async def get_loads(
+        self: TokenizerManager,
+        include: Optional[List[str]] = None,
+        dp_rank: Optional[int] = None,
+    ) -> List[GetLoadsReqOutput]:
+        """
+        Get comprehensive load metrics for /v1/loads endpoint.
+
+        Args:
+            include: List of sections to include. Options: core, memory, spec, lora, disagg, queues, all
+            dp_rank: Optional filter for specific DP rank
+
+        Returns:
+            List of GetLoadsReqOutput, one per scheduler (filtered by dp_rank if specified)
+        """
+        self.auto_create_handle_loop()
+        req = GetLoadsReqInput(
+            include=include if include else ["all"],
+            dp_rank=dp_rank,
+        )
+        results = await self.get_loads_communicator(req)
+
+        # Filter by dp_rank if specified
+        if dp_rank is not None:
+            results = [r for r in results if r.dp_rank == dp_rank]
+
+        return results
+
+    async def open_session(
+        self: TokenizerManager,
+        obj: OpenSessionReqInput,
+        request: Optional[fastapi.Request] = None,
+    ):
+        self.auto_create_handle_loop()
+        if obj.streaming:
+            if not self.server_args.enable_streaming_session:
+                raise ValueError(
+                    "Streaming sessions are disabled. "
+                    "Please relaunch with --enable-streaming-session."
+                )
+            if (
+                self.server_args.speculative_algorithm is not None
+                and not self.server_args.disable_overlap_schedule
+            ):
+                raise ValueError(
+                    "Streaming sessions are incompatible with speculative decoding v2 "
+                    "(overlap + speculative). Use --disable-overlap-schedule or "
+                    "disable speculative decoding."
+                )
+
+        if obj.session_id is None:
+            obj.session_id = uuid.uuid4().hex
+        elif obj.session_id in self.session_futures:
+            return None
+
+        self.send_to_scheduler.send_pyobj(obj)
+
+        self.session_futures[obj.session_id] = asyncio.Future()
+        session_id = await self.session_futures[obj.session_id]
+        del self.session_futures[obj.session_id]
+        return session_id
+
+    async def close_session(
+        self: TokenizerManager,
+        obj: CloseSessionReqInput,
+        request: Optional[fastapi.Request] = None,
+    ):
+        await self.send_to_scheduler.send_pyobj(obj)
+
+    def _update_weight_version_if_provided(
+        self: TokenizerManager, weight_version: Optional[str]
+    ) -> None:
+        """Update weight version if provided."""
+        if weight_version is not None:
+            self.server_args.weight_version = weight_version
diff --git a/sglang/python/sglang/srt/managers/tokenizer_manager.py b/sglang/python/sglang/srt/managers/tokenizer_manager.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac8512fbc9d5317bb722d57173f3eca53a584c99
--- /dev/null
+++ b/sglang/python/sglang/srt/managers/tokenizer_manager.py
@@ -0,0 +1,2539 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TokenizerManager is a process that tokenizes the text."""
+
+import asyncio
+import copy
+import dataclasses
+import json
+import logging
+import os
+import pickle
+import signal
+import socket
+import sys
+import threading
+from collections import deque
+from contextlib import nullcontext
+from datetime import datetime
+from enum import Enum
+from http import HTTPStatus
+from typing import Any, Awaitable, Dict, List, Optional, Tuple, Union
+
+import fastapi
+import uvloop
+import zmq
+import zmq.asyncio
+from fastapi import BackgroundTasks
+
+from sglang.srt.configs.model_config import ModelConfig
+from sglang.srt.disaggregation.encode_receiver import create_mm_receiver
+from sglang.srt.disaggregation.utils import DisaggregationMode
+from sglang.srt.environ import envs
+from sglang.srt.lora.lora_registry import LoRARef, LoRARegistry
+from sglang.srt.managers.async_dynamic_batch_tokenizer import AsyncDynamicbatchTokenizer
+from sglang.srt.managers.async_mm_data_processor import AsyncMMDataProcessor
+from sglang.srt.managers.disagg_service import start_disagg_service
+from sglang.srt.managers.io_struct import (
+    AbortReq,
+    ActiveRanksOutput,
+    BatchEmbeddingOutput,
+    BatchMultimodalOutput,
+    BatchStrOutput,
+    BatchTokenIDOutput,
+    BatchTokenizedEmbeddingReqInput,
+    BatchTokenizedGenerateReqInput,
+    ConfigureLoggingReq,
+    ContinueGenerationReqInput,
+    EmbeddingReqInput,
+    FreezeGCReq,
+    GenerateReqInput,
+    HealthCheckOutput,
+    LoadLoRAAdapterReqInput,
+    OpenSessionReqOutput,
+    PauseGenerationReqInput,
+    SessionParams,
+    TokenizedEmbeddingReqInput,
+    TokenizedGenerateReqInput,
+    UpdateWeightFromDiskReqInput,
+    UpdateWeightFromDiskReqOutput,
+    WatchLoadUpdateReq,
+)
+from sglang.srt.managers.mm_utils import TensorTransportMode, wrap_shm_features
+from sglang.srt.managers.multimodal_processor import get_mm_processor, import_processors
+from sglang.srt.managers.schedule_batch import MultimodalDataItem
+from sglang.srt.managers.scheduler import is_health_check_generate_req
+from sglang.srt.managers.scheduler_input_blocker import input_blocker_guard_region
+from sglang.srt.managers.tokenizer_communicator_mixin import TokenizerCommunicatorMixin
+from sglang.srt.managers.tokenizer_manager_multiitem_mixin import (
+    TokenizerManagerMultiItemMixin,
+)
+from sglang.srt.observability.cpu_monitor import start_cpu_monitor_thread
+from sglang.srt.observability.metrics_collector import TokenizerMetricsCollector
+from sglang.srt.observability.req_time_stats import (
+    APIServerReqTimeStats,
+    calibrate_time_diff,
+    convert_time_to_realtime,
+    real_time,
+    set_time_batch,
+)
+from sglang.srt.observability.request_metrics_exporter import (
+    RequestMetricsExporterManager,
+)
+from sglang.srt.observability.trace import SpanAttributes, extract_trace_headers
+from sglang.srt.sampling.sampling_params import SamplingParams
+from sglang.srt.server_args import (
+    PortArgs,
+    ServerArgs,
+    set_global_server_args_for_tokenizer,
+)
+from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
+from sglang.srt.utils import (
+    configure_gc_warning,
+    freeze_gc,
+    get_bool_env_var,
+    get_or_create_event_loop,
+    get_zmq_socket,
+    kill_process_tree,
+)
+from sglang.srt.utils.aio_rwlock import RWLock
+from sglang.srt.utils.hf_transformers_utils import (
+    get_processor,
+    get_tokenizer,
+    get_tokenizer_from_processor,
+)
+from sglang.srt.utils.request_logger import RequestLogger
+from sglang.srt.utils.watchdog import Watchdog
+from sglang.utils import TypeBasedDispatcher, get_exception_traceback
+
+asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
+
+_REQUEST_STATE_WAIT_TIMEOUT = envs.SGLANG_REQUEST_STATE_WAIT_TIMEOUT.get()
+
+logger = logging.getLogger(__name__)
+
+
+@dataclasses.dataclass
+class ReqState:
+    """Store the state a request."""
+
+    out_list: List[Dict[Any, Any]]
+    finished: bool
+    event: asyncio.Event
+    obj: Union[GenerateReqInput, EmbeddingReqInput]
+    time_stats: APIServerReqTimeStats
+    last_completion_tokens: int = 1
+
+    # For streaming output
+    last_output_offset: int = 0
+
+    # For incremental state update.
+    # TODO(lianmin): do not initialize some lists if not needed.
+    text: str = ""
+    output_ids: List[int] = dataclasses.field(default_factory=list)
+    input_token_logprobs_val: List[float] = dataclasses.field(default_factory=list)
+    input_token_logprobs_idx: List[int] = dataclasses.field(default_factory=list)
+    output_token_logprobs_val: List[float] = dataclasses.field(default_factory=list)
+    output_token_logprobs_idx: List[int] = dataclasses.field(default_factory=list)
+    input_top_logprobs_val: List[List[float]] = dataclasses.field(default_factory=list)
+    input_top_logprobs_idx: List[List[int]] = dataclasses.field(default_factory=list)
+    output_top_logprobs_val: List[List[float]] = dataclasses.field(default_factory=list)
+    output_top_logprobs_idx: List[List[int]] = dataclasses.field(default_factory=list)
+    input_token_ids_logprobs_val: List = dataclasses.field(default_factory=list)
+    input_token_ids_logprobs_idx: List = dataclasses.field(default_factory=list)
+    output_token_ids_logprobs_val: List = dataclasses.field(default_factory=list)
+    output_token_ids_logprobs_idx: List = dataclasses.field(default_factory=list)
+
+    # For detokenized logprobs
+    input_token_logprobs: List[Any] = dataclasses.field(default_factory=list)
+    output_token_logprobs: List[Any] = dataclasses.field(default_factory=list)
+    input_top_logprobs: List[Any] = dataclasses.field(default_factory=list)
+    output_top_logprobs: List[Any] = dataclasses.field(default_factory=list)
+    input_token_ids_logprobs: List[Any] = dataclasses.field(default_factory=list)
+    output_token_ids_logprobs: List[Any] = dataclasses.field(default_factory=list)
+
+
+class InputFormat(Enum):
+    """Input format types for tokenization handling."""
+
+    SINGLE_STRING = 1  # Regular single text like "Hello world"
+    BATCH_STRINGS = 2  # Regular batch like ["Hello", "World"]
+    CROSS_ENCODER_PAIRS = 3  # Cross-encoder pairs like [["query", "document"]]
+
+
+class TokenizerManager(TokenizerCommunicatorMixin, TokenizerManagerMultiItemMixin):
+    """TokenizerManager is a process that tokenizes the text."""
+
+    def __init__(
+        self,
+        server_args: ServerArgs,
+        port_args: PortArgs,
+    ):
+        # Parse args
+        self.server_args = server_args
+        self.enable_metrics = server_args.enable_metrics
+        self.preferred_sampling_params = server_args.preferred_sampling_params
+        self.crash_dump_folder = server_args.crash_dump_folder
+        set_global_server_args_for_tokenizer(server_args)
+
+        # Init model config
+        self.init_model_config()
+
+        # Initialize tokenizer and multimodalprocessor
+        self.init_tokenizer_and_processor()
+
+        # Init inter-process communication
+        self.init_ipc_channels(port_args)
+
+        # Init running status
+        self.init_running_status()
+
+        # Init logging and dumping
+        self.init_request_logging_and_dumping()
+
+        # Init weight update
+        self.init_weight_update()
+
+        # Init LoRA status
+        self.init_lora()
+
+        # Init PD disaggregation and encoder disaggregation
+        self.init_disaggregation()
+
+        # Init metric collector and watchdog
+        self.init_metric_collector_watchdog()
+
+        if self.enable_metrics:
+            start_cpu_monitor_thread("tokenizer")
+
+        # Init request dispatcher
+        self.init_request_dispatcher()
+
+    def init_model_config(self):
+        server_args = self.server_args
+        model_config_class = getattr(self, "model_config_class", ModelConfig)
+
+        # Read model args
+        self.model_path = server_args.model_path
+        self.served_model_name = server_args.served_model_name
+        self.model_config = model_config_class.from_server_args(server_args)
+        self.is_generation = self.model_config.is_generation
+        self.is_image_gen = self.model_config.is_image_gen
+        self.context_len = self.model_config.context_len
+        self.image_token_id = self.model_config.image_token_id
+        self.max_req_input_len = None  # Will be set later in engine.py
+        self.enable_priority_scheduling = server_args.enable_priority_scheduling
+        self.default_priority_value = server_args.default_priority_value
+        speculative_algorithm = SpeculativeAlgorithm.from_string(
+            server_args.speculative_algorithm
+        )
+        if speculative_algorithm.is_eagle():
+            # In the current eagle implementation, we store the draft tokens in the output token slots,
+            # so we need to reserve the space for the draft tokens.
+            self.num_reserved_tokens = max(
+                server_args.speculative_eagle_topk * server_args.speculative_num_steps,
+                server_args.speculative_num_draft_tokens,
+            )
+        else:
+            self.num_reserved_tokens = 0
+        self.validate_total_tokens = True
+
+    def init_tokenizer_and_processor(self):
+        server_args = self.server_args
+
+        # Initialize tokenizer and processor
+        if self.model_config.is_multimodal:
+            import_processors("sglang.srt.multimodal.processors")
+            if mm_process_pkg := envs.SGLANG_EXTERNAL_MM_PROCESSOR_PACKAGE.get():
+                import_processors(mm_process_pkg, overwrite=True)
+            _processor = _get_processor_wrapper(server_args)
+            transport_mode = _determine_tensor_transport_mode(self.server_args)
+
+            # We want to parallelize the image pre-processing so we create an executor for it
+            # We create mm_processor for any skip_tokenizer_init to make sure we still encode
+            # images even with skip_tokenizer_init=False.
+            self.mm_processor = get_mm_processor(
+                self.model_config.hf_config, server_args, _processor, transport_mode
+            )
+            self.mm_data_processor = AsyncMMDataProcessor(
+                self.mm_processor,
+                max_concurrent_calls=self.server_args.mm_max_concurrent_calls,
+                timeout_s=self.server_args.mm_per_request_timeout,
+            )
+
+            if server_args.skip_tokenizer_init:
+                self.tokenizer = self.processor = None
+            else:
+                self.processor = _processor
+                self.tokenizer = get_tokenizer_from_processor(self.processor)
+                os.environ["TOKENIZERS_PARALLELISM"] = "false"
+                self._initialize_multi_item_delimiter_text()
+        else:
+            self.mm_processor = self.processor = None
+
+            if server_args.skip_tokenizer_init:
+                self.tokenizer = None
+            else:
+                self.tokenizer = get_tokenizer(
+                    server_args.tokenizer_path,
+                    tokenizer_mode=server_args.tokenizer_mode,
+                    trust_remote_code=server_args.trust_remote_code,
+                    revision=server_args.revision,
+                )
+                self._initialize_multi_item_delimiter_text()
+
+        # Initialize async dynamic batch tokenizer if enabled (common for both multimodal and non-multimodal)
+        if (
+            server_args.enable_dynamic_batch_tokenizer
+            and not server_args.skip_tokenizer_init
+        ):
+            self.async_dynamic_batch_tokenizer = AsyncDynamicbatchTokenizer(
+                self.tokenizer,
+                max_batch_size=server_args.dynamic_batch_tokenizer_batch_size,
+                batch_wait_timeout_s=server_args.dynamic_batch_tokenizer_batch_timeout,
+            )
+        else:
+            self.async_dynamic_batch_tokenizer = None
+
+    def init_ipc_channels(self, port_args: PortArgs):
+        context = zmq.asyncio.Context(2)
+        self.recv_from_detokenizer = get_zmq_socket(
+            context, zmq.PULL, port_args.tokenizer_ipc_name, True
+        )
+        if self.server_args.tokenizer_worker_num == 1:
+            self.send_to_scheduler = get_zmq_socket(
+                context, zmq.PUSH, port_args.scheduler_input_ipc_name, True
+            )
+        else:
+            from sglang.srt.managers.multi_tokenizer_mixin import SenderWrapper
+
+            # Use tokenizer_worker_ipc_name in multi-tokenizer mode
+            send_to_scheduler = get_zmq_socket(
+                context, zmq.PUSH, port_args.tokenizer_worker_ipc_name, False
+            )
+
+            # Make sure that each request carries the tokenizer_ipc_name for response routing
+            self.send_to_scheduler = SenderWrapper(port_args, send_to_scheduler)
+
+    def init_running_status(self):
+        # Request states
+        self.rid_to_state: Dict[str, ReqState] = {}
+        self.event_loop = None
+        self.asyncio_tasks = set()
+
+        # Health check
+        self.server_status = ServerStatus.Starting
+        self.gracefully_exit = False
+        self.last_receive_tstamp = real_time()
+
+        # For load balancing
+        self.current_load = 0
+        self.current_load_lock = asyncio.Lock()
+
+        # Session
+        self.session_futures = {}  # session_id -> asyncio event
+
+    def init_request_logging_and_dumping(self):
+        # TODO: Refactor and organize the log export code.
+        # Request logging
+        self.request_logger = RequestLogger(
+            log_requests=self.server_args.log_requests,
+            log_requests_level=self.server_args.log_requests_level,
+            log_requests_format=self.server_args.log_requests_format,
+            log_requests_target=self.server_args.log_requests_target,
+        )
+
+        # Dumping
+        self.dump_requests_folder = ""  # By default do not dump
+        self.dump_requests_threshold = 1000
+        self.dump_request_list: List[Tuple] = []
+        self.crash_dump_request_list: deque[Tuple] = deque()
+        self.crash_dump_performed = False  # Flag to ensure dump is only called once
+        self.straggler_request_list: List[Tuple] = []
+
+        # Initialize performance metrics loggers with proper skip names
+        _, obj_skip_names, out_skip_names = self.request_logger.metadata
+        self.request_metrics_exporter_manager = RequestMetricsExporterManager(
+            self.server_args, obj_skip_names, out_skip_names
+        )
+
+    def init_weight_update(self):
+        # Initial weights status
+        self.initial_weights_loaded = True
+        if self.server_args.checkpoint_engine_wait_weights_before_ready:
+            self.initial_weights_loaded = False
+
+        # Weight updates
+        # The event to notify the weight sync is finished.
+        self.model_update_lock = RWLock()
+        self.model_update_result: Optional[Awaitable[UpdateWeightFromDiskReqOutput]] = (
+            None
+        )
+        self.is_pause = False
+        self.is_pause_cond = asyncio.Condition()
+
+    def init_lora(self):
+        # LoRA
+        # Initialize the `LoRARegistry` with initial LoRA adapter paths provided in `server_args`.
+        # The registry dynamically updates as adapters are loaded / unloaded during runtime. It
+        # serves as the source of truth for available adapters and maps user-friendly LoRA names
+        # to internally used unique LoRA IDs.
+        self.lora_registry = LoRARegistry(self.server_args.lora_paths)
+        # Lock to serialize LoRA update operations.
+        # Please note that, unlike `model_update_lock`, this does not block inference, allowing
+        # LoRA updates and inference to overlap.
+        self.lora_update_lock = asyncio.Lock()
+        # A cache for mapping the lora_name for LoRA adapters that have been loaded at any
+        # point to their latest LoRARef objects, so that they can be
+        # dynamically loaded if needed for inference
+        self.lora_ref_cache: Dict[str, LoRARef] = {}
+        if self.server_args.lora_paths is not None:
+            for lora_ref in self.server_args.lora_paths:
+                self.lora_ref_cache[lora_ref.lora_name] = lora_ref
+
+    def init_disaggregation(self):
+        # PD Disaggregation
+        self.disaggregation_mode = DisaggregationMode(
+            self.server_args.disaggregation_mode
+        )
+        self.bootstrap_server = start_disagg_service(self.server_args)
+
+        # Encoder Disaggregation
+        if self.server_args.language_only:
+            self.mm_receiver = create_mm_receiver(
+                self.server_args,
+                dtype=self.model_config.dtype,
+            )
+
+    def init_metric_collector_watchdog(self):
+        # Metrics
+        if self.enable_metrics:
+            labels = {
+                "model_name": self.server_args.served_model_name,
+                # TODO: Add lora name/path in the future,
+            }
+            if self.enable_priority_scheduling:
+                labels["priority"] = ""
+            if self.server_args.tokenizer_metrics_allowed_custom_labels:
+                for label in self.server_args.tokenizer_metrics_allowed_custom_labels:
+                    labels[label] = ""
+            if self.server_args.extra_metric_labels:
+                labels.update(self.server_args.extra_metric_labels)
+            self.metrics_collector = TokenizerMetricsCollector(
+                server_args=self.server_args,
+                labels=labels,
+                bucket_time_to_first_token=self.server_args.bucket_time_to_first_token,
+                bucket_e2e_request_latency=self.server_args.bucket_e2e_request_latency,
+                bucket_inter_token_latency=self.server_args.bucket_inter_token_latency,
+                collect_tokens_histogram=self.server_args.collect_tokens_histogram,
+            )
+
+        if self.server_args.gc_warning_threshold_secs > 0.0:
+            configure_gc_warning(self.server_args.gc_warning_threshold_secs)
+        self.soft_watchdog = Watchdog.create(
+            debug_name="TokenizerManager",
+            watchdog_timeout=self.server_args.soft_watchdog_timeout,
+            soft=True,
+            test_stuck_time=envs.SGLANG_TEST_STUCK_TOKENIZER.get(),
+        )
+
+    def init_request_dispatcher(self):
+        self._result_dispatcher = TypeBasedDispatcher(
+            [
+                (
+                    (
+                        BatchStrOutput,
+                        BatchEmbeddingOutput,
+                        BatchTokenIDOutput,
+                        BatchMultimodalOutput,
+                    ),
+                    self._handle_batch_output,
+                ),
+                (AbortReq, self._handle_abort_req),
+                (OpenSessionReqOutput, self._handle_open_session_req_output),
+                (
+                    UpdateWeightFromDiskReqOutput,
+                    self._handle_update_weights_from_disk_req_output,
+                ),
+                (FreezeGCReq, lambda x: None),
+                # For handling case when scheduler skips detokenizer and forwards back to the tokenizer manager, we ignore it.
+                (HealthCheckOutput, lambda x: None),
+                (ActiveRanksOutput, self.update_active_ranks),
+            ]
+        )
+        self.init_communicators(self.server_args)
+
+        self.sampling_params_class = SamplingParams
+        self.signal_handler_class = SignalHandler
+
+    async def generate_request(
+        self,
+        obj: Union[GenerateReqInput, EmbeddingReqInput],
+        request: Optional[fastapi.Request] = None,
+    ):
+        self.auto_create_handle_loop()
+
+        # Normalize the request
+        obj.normalize_batch_and_arguments()
+        self._set_default_priority(obj)
+        self._validate_rid(obj)
+
+        if isinstance(obj, GenerateReqInput) and obj.routed_dp_rank is not None:
+            dp_size = self.server_args.dp_size
+            if dp_size <= 1 and obj.routed_dp_rank == 0:
+                logger.warning(
+                    f"routed_dp_rank={obj.routed_dp_rank} is ignored because dp_size={dp_size}"
+                )
+            elif obj.routed_dp_rank < 0 or obj.routed_dp_rank >= dp_size:
+                raise ValueError(
+                    f"routed_dp_rank={obj.routed_dp_rank} out of range [0, {dp_size})"
+                )
+
+        self._req_stats_init(obj, request)
+        if self.server_args.language_only:
+            self._handle_epd_disaggregation_encode_request(obj)
+        if self.server_args.tokenizer_worker_num > 1:
+            self._attach_multi_http_worker_info(obj)
+
+        # Log the request
+        self.request_logger.log_received_request(obj, self.tokenizer, request)
+
+        async with self.is_pause_cond:
+            await self.is_pause_cond.wait_for(lambda: not self.is_pause)
+
+        async with self.model_update_lock.reader_lock:
+            await self._validate_and_resolve_lora(obj)
+
+            # Tokenize the request and send it to the scheduler
+            if obj.is_single:
+                tokenized_obj = await self._tokenize_one_request(obj)
+                state = self.rid_to_state[obj.rid]
+                self._send_one_request(tokenized_obj)
+                async for response in self._wait_one_response(obj, state, request):
+                    yield response
+            else:
+                async for response in self._handle_batch_request(obj, request):
+                    yield response
+
+    def _detect_input_format(
+        self, texts: Union[str, List[str]], is_cross_encoder: bool
+    ) -> InputFormat:
+        """Detect the format of input texts for proper tokenization handling.
+
+        Returns:
+            - InputFormat.SINGLE_STRING: Regular single text like "Hello world"
+            - InputFormat.BATCH_STRINGS: Regular batch like ["Hello", "World"]
+            - InputFormat.CROSS_ENCODER_PAIRS: Cross-encoder pairs like [["query", "document"]]
+        """
+        if isinstance(texts, str):
+            return InputFormat.SINGLE_STRING
+
+        if (
+            is_cross_encoder
+            and len(texts) > 0
+            and isinstance(texts[0], list)
+            and len(texts[0]) == 2
+        ):
+            return InputFormat.CROSS_ENCODER_PAIRS
+
+        return InputFormat.BATCH_STRINGS
+
+    def _prepare_tokenizer_input(
+        self, texts: Union[str, List[str]], input_format: InputFormat
+    ) -> Union[List[str], List[List[str]]]:
+        """Prepare input for the tokenizer based on detected format."""
+        if input_format == InputFormat.SINGLE_STRING:
+            return [texts]  # Wrap single string for batch processing
+        elif input_format == InputFormat.CROSS_ENCODER_PAIRS:
+            return texts  # Already in correct format: [["query", "doc"]]
+        else:  # BATCH_STRINGS
+            return texts  # Already in correct format: ["text1", "text2"]
+
+    def _extract_tokenizer_results(
+        self,
+        input_ids: List[List[int]],
+        token_type_ids: Optional[List[List[int]]],
+        input_format: InputFormat,
+        original_batch_size: int,
+    ) -> Union[
+        Tuple[List[int], Optional[List[int]]],
+        Tuple[List[List[int]], Optional[List[List[int]]]],
+    ]:
+        """Extract results from tokenizer output based on input format."""
+
+        # For single inputs (string or single cross-encoder pair), extract first element
+        if (
+            input_format in [InputFormat.SINGLE_STRING, InputFormat.CROSS_ENCODER_PAIRS]
+            and original_batch_size == 1
+        ):
+            single_input_ids = input_ids[0] if input_ids else []
+            single_token_type_ids = token_type_ids[0] if token_type_ids else None
+            return single_input_ids, single_token_type_ids
+
+        # For true batches, return as-is
+        return input_ids, token_type_ids
+
+    async def _tokenize_texts(
+        self, texts: Union[str, List[str]], is_cross_encoder: bool = False
+    ) -> Union[
+        Tuple[List[int], Optional[List[int]]],
+        Tuple[List[List[int]], Optional[List[List[int]]]],
+    ]:
+        """
+        Tokenize text(s) using the appropriate tokenizer strategy.
+
+        This method handles multiple input formats and chooses between async dynamic
+        batch tokenizer (for single texts only) and regular tokenizer.
+
+        Args:
+            texts: Text input in various formats:
+
+                   Regular cases:
+                   - Single string: "How are you?"
+                   - Batch of strings: ["Hello", "World", "How are you?"]
+
+                   Cross-encoder cases (sentence pairs for similarity/ranking):
+                   - Single pair: [["query text", "document text"]]
+                   - Multiple pairs: [["q1", "d1"], ["q2", "d2"], ["q3", "d3"]]
+
+            is_cross_encoder: Whether to return token_type_ids for cross-encoder models.
+                             Enables proper handling of sentence pairs with segment IDs.
+
+        Returns:
+            Single input cases:
+                Tuple[List[int], Optional[List[int]]]: (input_ids, token_type_ids)
+                Example: ([101, 2129, 102], [0, 0, 0]) for single text
+                Example: ([101, 2129, 102, 4068, 102], [0, 0, 0, 1, 1]) for cross-encoder pair
+
+            Batch input cases:
+                Tuple[List[List[int]], Optional[List[List[int]]]]: (batch_input_ids, batch_token_type_ids)
+                Example: ([[101, 2129, 102], [101, 4068, 102]], None) for regular batch
+
+            Note: token_type_ids is None unless is_cross_encoder=True.
+        """
+        if not texts or self.tokenizer is None:
+            raise ValueError("texts cannot be empty and tokenizer must be initialized")
+
+        # Step 1: Detect input format and prepare for tokenization
+        input_format = self._detect_input_format(texts, is_cross_encoder)
+        tokenizer_input = self._prepare_tokenizer_input(texts, input_format)
+        original_batch_size = len(texts) if not isinstance(texts, str) else 1
+
+        # Step 2: Set up tokenizer arguments
+        tokenizer_kwargs = (
+            {"return_token_type_ids": is_cross_encoder} if is_cross_encoder else {}
+        )
+
+        # Step 3: Choose tokenization strategy
+        use_async_tokenizer = (
+            self.async_dynamic_batch_tokenizer is not None
+            and input_format == InputFormat.SINGLE_STRING
+        )
+
+        if use_async_tokenizer:
+            logger.debug("Using async dynamic batch tokenizer for single text")
+            result = await self.async_dynamic_batch_tokenizer.encode(
+                tokenizer_input[0], **tokenizer_kwargs
+            )
+            # Convert to batch format for consistency
+            input_ids = [result["input_ids"]]
+            token_type_ids = (
+                [result["token_type_ids"]]
+                if is_cross_encoder and result.get("token_type_ids")
+                else None
+            )
+        else:
+            logger.debug(f"Using regular tokenizer for {len(tokenizer_input)} inputs")
+            encoded = self.tokenizer(tokenizer_input, **tokenizer_kwargs)
+            input_ids = encoded["input_ids"]
+            token_type_ids = encoded.get("token_type_ids") if is_cross_encoder else None
+
+        # Step 4: Extract results based on input format
+        return self._extract_tokenizer_results(
+            input_ids, token_type_ids, input_format, original_batch_size
+        )
+
+    async def _tokenize_one_request(
+        self,
+        obj: Union[GenerateReqInput, EmbeddingReqInput],
+    ):
+        """Tokenize one request."""
+        # Tokenize
+        input_embeds = None
+        input_text = obj.text
+        token_type_ids = None
+        is_cross_encoder_request = (
+            isinstance(obj, EmbeddingReqInput) and obj.is_cross_encoder_request
+        )
+        if obj.input_embeds is not None:
+            if not self.server_args.disable_radix_cache:
+                raise ValueError(
+                    "input_embeds is provided while disable_radix_cache is False. "
+                    "Please add `--disable-radix-cache` when you launch the server "
+                    "if you want to use input_embeds as inputs."
+                )
+            input_embeds = obj.input_embeds
+            input_ids = obj.input_ids
+        elif obj.input_ids is not None:
+            input_ids = obj.input_ids
+        else:
+            if self.tokenizer is None:
+                raise ValueError(
+                    "The engine initialized with skip_tokenizer_init=True cannot "
+                    "accept text prompts. Please provide input_ids or re-initialize "
+                    "the engine with skip_tokenizer_init=False."
+                )
+
+            # For audio-only requests (e.g., Whisper), text may be empty.
+            # The multimodal processor will provide input_ids later.
+            if not input_text and self.mm_processor and obj.contains_mm_input():
+                # Use empty placeholder - multimodal processor will override
+                input_ids = []
+            else:
+                input_ids, token_type_ids = await self._tokenize_texts(
+                    input_text, is_cross_encoder_request
+                )
+
+        if self.mm_processor and obj.contains_mm_input():
+            if obj.image_data is not None and not isinstance(obj.image_data, list):
+                obj.image_data = [obj.image_data]
+            if obj.video_data is not None and not isinstance(obj.video_data, list):
+                obj.video_data = [obj.video_data]
+            if obj.audio_data is not None and not isinstance(obj.audio_data, list):
+                obj.audio_data = [obj.audio_data]
+            self._validate_mm_limits(obj)
+
+            mm_inputs = None
+
+            if (
+                not self.server_args.language_only
+                or self.server_args.encoder_transfer_backend
+                in ["zmq_to_tokenizer", "mooncake"]
+            ):
+                if self.server_args.language_only:
+                    mm_inputs = await self.mm_receiver.recv_mm_data(
+                        img_data=obj.image_data,
+                        mm_processor=self.mm_processor,
+                        prompt=(input_text or input_ids),
+                        need_wait_for_image=obj.need_wait_for_image,
+                    )
+                if mm_inputs is None:
+                    mm_inputs: Dict = await self.mm_data_processor.process(
+                        image_data=obj.image_data,
+                        audio_data=obj.audio_data,
+                        input_text_or_ids=(input_text or input_ids),
+                        request_obj=obj,
+                        max_req_input_len=self.max_req_input_len,
+                    )
+            elif (
+                self.server_args.language_only
+                and self.server_args.encoder_transfer_backend == "zmq_to_scheduler"
+                and not obj.need_wait_for_image
+            ):
+                # In language_only mode with zmq_to_scheduler, if we didn't dispatch
+                # to encoder (e.g., only one image), process locally like non-language_only mode
+                mm_inputs: Dict = await self.mm_data_processor.process(
+                    image_data=obj.image_data,
+                    audio_data=obj.audio_data,
+                    input_text_or_ids=(input_text or input_ids),
+                    request_obj=obj,
+                    max_req_input_len=self.max_req_input_len,
+                )
+
+            if mm_inputs and "input_ids" in mm_inputs:
+                input_ids = mm_inputs["input_ids"]
+            if (
+                envs.SGLANG_MM_PRECOMPUTE_HASH.get()
+                and mm_inputs
+                and "mm_items" in mm_inputs
+            ):
+                for item in mm_inputs["mm_items"]:
+                    if isinstance(item, MultimodalDataItem):
+                        item.set_pad_value()
+        else:
+            mm_inputs = None
+
+        self._validate_one_request(obj, input_ids)
+        return self._create_tokenized_object(
+            obj, input_text, input_ids, input_embeds, mm_inputs, token_type_ids
+        )
+
+    def _validate_rid(self, obj: Union[GenerateReqInput, EmbeddingReqInput]) -> None:
+        """Validate the request ID (rid) uniqueness."""
+        rid = obj.rid
+        if rid is None:
+            return
+        ids = rid if isinstance(rid, list) else [rid]
+        if len(ids) != len(set(ids)):
+            raise ValueError(
+                f"Duplicate request IDs detected within the request: {ids}"
+            )
+
+        for i in ids:
+            if i in self.rid_to_state:
+                raise ValueError(f"Duplicate request ID detected: {i}")
+
+    def _validate_one_request(
+        self, obj: Union[GenerateReqInput, EmbeddingReqInput], input_ids: List[int]
+    ) -> None:
+        """Validates that the input token count and the requested token count doesn't exceed the model's context length."""
+        # FIXME: unify the length validation logic with the one in the scheduler.
+        _max_req_len = self.context_len
+        input_token_num = len(input_ids) if input_ids is not None else 0
+        input_token_num += self.num_reserved_tokens
+
+        # Validate input length
+        if input_token_num >= self.context_len:
+            if self.server_args.allow_auto_truncate:
+                logger.warning(
+                    f"The input ({input_token_num} tokens) is longer than the "
+                    f"model's context length ({self.context_len} tokens). "
+                    "Truncating the input."
+                )
+                del input_ids[_max_req_len:]
+                input_token_num = len(input_ids)
+            else:
+                raise ValueError(
+                    f"The input ({input_token_num} tokens) is longer than the "
+                    f"model's context length ({self.context_len} tokens)."
+                )
+
+        # Validate total tokens (input + max_new_tokens)
+        max_new_tokens = obj.sampling_params.get("max_new_tokens")
+        if (
+            self.validate_total_tokens
+            and max_new_tokens is not None
+            and (max_new_tokens + input_token_num) >= _max_req_len
+        ):
+            if self.server_args.allow_auto_truncate:
+                logger.warning(
+                    f"Requested token count ({input_token_num} input + {max_new_tokens} new) "
+                    f"exceeds the model's context length ({self.context_len} tokens). "
+                    "Truncating max_new_tokens."
+                )
+                obj.sampling_params["max_new_tokens"] = max(
+                    0, _max_req_len - input_token_num
+                )
+            else:
+                total_tokens = max_new_tokens + input_token_num
+                error_msg = (
+                    f"Requested token count exceeds the model's maximum context length "
+                    f"of {self.context_len} tokens. You requested a total of {total_tokens} "
+                    f"tokens: {input_token_num} tokens from the input messages and "
+                    f"{max_new_tokens} tokens for the completion. Please reduce the number "
+                    f"of tokens in the input messages or the completion to fit within the limit."
+                )
+                raise ValueError(error_msg)
+
+        # Validate embedding requests
+        if isinstance(obj, EmbeddingReqInput) and self.is_generation:
+            raise ValueError(
+                "This model does not appear to be an embedding model by default. "
+                "Please add `--is-embedding` when launching the server or try another model."
+            )
+
+        # Validate Matryoshka embeddings
+        if isinstance(obj, EmbeddingReqInput):
+            self._validate_for_matryoshka_dim(obj)
+
+        # Validate custom logit processor
+        if isinstance(obj, GenerateReqInput):
+            if (
+                obj.return_hidden_states
+                and not self.server_args.enable_return_hidden_states
+            ):
+                raise ValueError(
+                    "The server is not configured to return the hidden states. "
+                    "Please set `--enable-return-hidden-states` to enable this feature."
+                )
+            if (
+                obj.custom_logit_processor
+                and not self.server_args.enable_custom_logit_processor
+            ):
+                raise ValueError(
+                    "The server is not configured to enable custom logit processor. "
+                    "Please set `--enable-custom-logit-processor` to enable this feature."
+                )
+
+    def _validate_mm_limits(
+        self, obj: Union[GenerateReqInput, EmbeddingReqInput]
+    ) -> None:
+        if not self.server_args.limit_mm_data_per_request:
+            return
+
+        for modality, limit in self.server_args.limit_mm_data_per_request.items():
+            data = getattr(obj, f"{modality}_data", None)
+            if data:
+                count = len(data) if isinstance(data, list) else 1
+                if count > limit:
+                    raise ValueError(
+                        f"{modality.capitalize()} count {count} exceeds limit {limit} per request."
+                    )
+
+    def _validate_for_matryoshka_dim(self, obj: EmbeddingReqInput) -> None:
+        """Validate the request for Matryoshka dim if it has the field set."""
+        if obj.dimensions is None:
+            return
+
+        if not self.model_config.is_matryoshka:
+            raise ValueError(
+                f"Model '{self.model_config.model_path}' does not support matryoshka representation, "
+                f"changing output dimensions will lead to poor results."
+            )
+
+        if obj.dimensions < 1:
+            raise ValueError("Requested dimensions must be greater than 0")
+
+        if (
+            self.model_config.matryoshka_dimensions
+            and obj.dimensions not in self.model_config.matryoshka_dimensions
+        ):
+            raise ValueError(
+                f"Model '{self.model_config.model_path}' only supports {self.model_config.matryoshka_dimensions} matryoshka dimensions, "
+                f"using other output dimensions will lead to poor results."
+            )
+
+        if obj.dimensions > self.model_config.hidden_size:
+            raise ValueError(
+                f"Provided dimensions are greater than max embedding dimension: {self.model_config.hidden_size}"
+            )
+
+    def _validate_input_ids_in_vocab(
+        self, input_ids: Union[List[int], List[List[int]]], vocab_size: int
+    ) -> None:
+        # Handle both single sequence and batch of sequences
+        if isinstance(input_ids[0], list):
+            # Batch of sequences
+            for seq in input_ids:
+                if any(id >= vocab_size for id in seq):
+                    raise ValueError(
+                        f"The input_ids {seq} contains values greater than the vocab size ({vocab_size})."
+                    )
+        else:
+            # Single sequence
+            if any(id >= vocab_size for id in input_ids):
+                raise ValueError(
+                    f"The input_ids {input_ids} contains values greater than the vocab size ({vocab_size})."
+                )
+
+    def _create_tokenized_object(
+        self,
+        obj: Union[GenerateReqInput, EmbeddingReqInput],
+        input_text: str,
+        input_ids: List[int],
+        input_embeds: Optional[Union[List[float], None]] = None,
+        mm_inputs: Optional[Dict] = None,
+        token_type_ids: Optional[List[int]] = None,
+    ) -> Union[TokenizedGenerateReqInput, TokenizedEmbeddingReqInput]:
+        """Create a tokenized request object from common parameters."""
+        # Parse sampling parameters
+        # Note: if there are preferred sampling params, we use them if they are not
+        # explicitly passed in sampling_params
+        if self.preferred_sampling_params:
+            sampling_kwargs = {**self.preferred_sampling_params, **obj.sampling_params}
+        else:
+            sampling_kwargs = obj.sampling_params
+        sampling_params = self.sampling_params_class(**sampling_kwargs)
+        sampling_params.normalize(self.tokenizer)
+        sampling_params.verify(self.model_config.vocab_size)
+
+        # Build return object
+        if isinstance(obj, GenerateReqInput):
+            session_params = (
+                SessionParams(**obj.session_params) if obj.session_params else None
+            )
+
+            tokenized_obj = TokenizedGenerateReqInput(
+                input_text,
+                input_ids,
+                mm_inputs,
+                sampling_params,
+                obj.return_logprob,
+                obj.logprob_start_len,
+                obj.top_logprobs_num,
+                obj.token_ids_logprob,
+                obj.stream,
+                rid=obj.rid,
+                http_worker_ipc=obj.http_worker_ipc,
+                bootstrap_host=obj.bootstrap_host,
+                bootstrap_port=obj.bootstrap_port,
+                bootstrap_room=obj.bootstrap_room,
+                lora_id=obj.lora_id,
+                input_embeds=input_embeds,
+                session_params=session_params,
+                custom_logit_processor=obj.custom_logit_processor,
+                require_reasoning=obj.require_reasoning,
+                return_hidden_states=obj.return_hidden_states,
+                return_routed_experts=obj.return_routed_experts,
+                routed_dp_rank=obj.routed_dp_rank,
+                disagg_prefill_dp_rank=obj.disagg_prefill_dp_rank,
+                priority=obj.priority,
+                extra_key=obj.extra_key,
+                routing_key=obj.routing_key,
+                need_wait_for_image=obj.need_wait_for_image,
+                num_items_assigned=obj.num_items_assigned,
+            )
+        elif isinstance(obj, EmbeddingReqInput):
+            tokenized_obj = TokenizedEmbeddingReqInput(
+                input_text,
+                input_ids,
+                mm_inputs,
+                token_type_ids,
+                sampling_params,
+                rid=obj.rid,
+                priority=obj.priority,
+                dimensions=obj.dimensions,
+                lora_id=obj.lora_id,
+                http_worker_ipc=obj.http_worker_ipc,
+            )
+
+        tokenized_obj.time_stats = self.rid_to_state[obj.rid].time_stats
+        self.rid_to_state[obj.rid].time_stats.set_tokenize_finish_time()
+
+        return tokenized_obj
+
+    async def _batch_tokenize_and_process(
+        self, batch_size: int, obj: Union[GenerateReqInput, EmbeddingReqInput]
+    ) -> List[Union[TokenizedGenerateReqInput, TokenizedEmbeddingReqInput]]:
+        """Handle batch tokenization for text inputs only."""
+        logger.debug(f"Starting batch tokenization for {batch_size} text requests")
+
+        # If batch does not have text nothing to tokenize
+        # so lets construct the return object
+        if not self._batch_has_text(batch_size, obj):
+            # All requests already have input_ids, no need to tokenize
+            return [await self._tokenize_one_request(obj[i]) for i in range(batch_size)]
+
+        self._validate_batch_tokenization_constraints(batch_size, obj)
+
+        # Collect requests and texts
+        requests = [obj[i] for i in range(batch_size)]
+        texts = [req.text for req in requests]
+
+        # Check if any request is a cross-encoder request
+        is_cross_encoder_request = any(
+            isinstance(req, EmbeddingReqInput) and req.is_cross_encoder_request
+            for req in requests
+        )
+
+        # Batch tokenize all texts using unified method
+        input_ids_list, token_type_ids_list = await self._tokenize_texts(
+            texts, is_cross_encoder_request
+        )
+
+        # Process all requests
+        tokenized_objs = []
+        for i, req in enumerate(requests):
+            self._validate_one_request(obj[i], input_ids_list[i])
+            token_type_ids = (
+                token_type_ids_list[i] if token_type_ids_list is not None else None
+            )
+            tokenized_objs.append(
+                self._create_tokenized_object(
+                    req, req.text, input_ids_list[i], None, None, token_type_ids
+                )
+            )
+        logger.debug(f"Completed batch processing for {batch_size} requests")
+        return tokenized_objs
+
+    def _validate_batch_tokenization_constraints(
+        self, batch_size: int, obj: Union[GenerateReqInput, EmbeddingReqInput]
+    ) -> None:
+        """Validate constraints for batch tokenization processing."""
+        for i in range(batch_size):
+            if self.is_generation and obj[i].contains_mm_input():
+                raise ValueError(
+                    "For multimodal input processing do not set `enable_tokenizer_batch_encode`."
+                )
+            if obj[i].input_ids is not None:
+                raise ValueError(
+                    "Batch tokenization is not needed for pre-tokenized input_ids. Do not set `enable_tokenizer_batch_encode`."
+                )
+            if obj[i].input_embeds is not None:
+                raise ValueError(
+                    "Batch tokenization is not needed for input_embeds. Do not set `enable_tokenizer_batch_encode`."
+                )
+
+    def _batch_has_text(
+        self, batch_size: int, obj: Union[GenerateReqInput, EmbeddingReqInput]
+    ) -> bool:
+        """Check if any request in the batch contains text input."""
+        for i in range(batch_size):
+            if obj[i].text:
+                return True
+            elif self.is_generation and obj[i].contains_mm_input():
+                return True
+
+        return False
+
+    def _should_use_batch_tokenization(self, batch_size, requests) -> bool:
+        """Return True if we should run the tokenizer in batch mode.
+
+        Current policy:
+        - Respect explicit server flag `enable_tokenizer_batch_encode`.
+        - Or, if no request has text or multimodal input (all use pre-tokenized input_ids or input_embeds), batch the requests without tokenization.
+        - Batch tokenization does not support DP attention yet, and it will make everything goes to the first rank currently
+        """
+        return batch_size > 0 and (
+            self.server_args.enable_tokenizer_batch_encode
+            or (
+                (not self.server_args.enable_dp_attention)
+                and (not self._batch_has_text(batch_size, requests))
+            )
+        )
+
+    def _send_one_request(
+        self,
+        tokenized_obj: Union[TokenizedGenerateReqInput, TokenizedEmbeddingReqInput],
+    ):
+        tokenized_obj.time_stats.set_api_server_dispatch_time()
+        tokenized_obj = wrap_shm_features(tokenized_obj)
+        self.send_to_scheduler.send_pyobj(tokenized_obj)
+        tokenized_obj.time_stats.set_api_server_dispatch_finish_time()
+
+    def _send_batch_request(
+        self,
+        tokenized_objs: List[
+            Union[TokenizedGenerateReqInput, TokenizedEmbeddingReqInput]
+        ],
+    ):
+        """Send a batch of tokenized requests as a single batched request to the scheduler."""
+        if isinstance(tokenized_objs[0], TokenizedGenerateReqInput):
+            batch_req = BatchTokenizedGenerateReqInput(batch=tokenized_objs)
+        else:
+            batch_req = BatchTokenizedEmbeddingReqInput(batch=tokenized_objs)
+
+        set_time_batch(tokenized_objs, "set_api_server_dispatch_time")
+        self.send_to_scheduler.send_pyobj(batch_req)
+        set_time_batch(tokenized_objs, "set_api_server_dispatch_finish_time")
+
+    async def _wait_one_response(
+        self,
+        obj: Union[GenerateReqInput, EmbeddingReqInput],
+        state: ReqState,
+        request: Optional[fastapi.Request] = None,
+    ):
+        """Wait for the response of one request."""
+        # Not all request types have `stream` (e.g., EmbeddingReqInput). Default to non-streaming.
+        is_stream = getattr(obj, "stream", False)
+        while True:
+            try:
+                await asyncio.wait_for(
+                    state.event.wait(), timeout=_REQUEST_STATE_WAIT_TIMEOUT
+                )
+            except asyncio.TimeoutError:
+                if (
+                    request is not None
+                    and not obj.background
+                    and await request.is_disconnected()
+                ):
+                    # Abort the request for disconnected requests (non-streaming, waiting queue)
+                    self.abort_request(obj.rid)
+                    # Use exception to kill the whole call stack and asyncio task
+                    raise ValueError(
+                        f"Request is disconnected from the client side (type 1). Abort request {obj.rid=}"
+                    )
+                continue
+
+            out = state.out_list[-1]
+
+            state.out_list = []
+            if state.finished:
+                # For non-streaming cases, response has not been sent yet (`response_sent_to_client_time` has not been set yet).
+                # Record response sent time right before we log finished results and metrics.
+                if not state.time_stats.response_sent_to_client_time:
+                    state.time_stats.set_response_sent_to_client_time()
+                    out["meta_info"][
+                        "response_sent_to_client_ts"
+                    ] = state.time_stats.get_response_sent_to_client_realtime()
+                self.request_logger.log_finished_request(
+                    obj,
+                    out,
+                    is_multimodal_gen=self.model_config.is_multimodal_gen,
+                    request=request,
+                )
+
+                if self.request_metrics_exporter_manager.exporter_enabled():
+                    # Asynchronously write metrics for this request using the exporter manager.
+                    asyncio.create_task(
+                        self.request_metrics_exporter_manager.write_record(obj, out)
+                    )
+
+                # Check if this was an abort/error created by scheduler
+                if isinstance(out["meta_info"].get("finish_reason"), dict):
+                    finish_reason = out["meta_info"]["finish_reason"]
+                    if (
+                        finish_reason.get("type") == "abort"
+                        and finish_reason.get("status_code") == HTTPStatus.BAD_REQUEST
+                    ):
+                        if not is_stream:
+                            raise ValueError(finish_reason["message"])
+                        else:
+                            yield out
+                            break
+
+                    if finish_reason.get("type") == "abort" and finish_reason.get(
+                        "status_code"
+                    ) in (
+                        HTTPStatus.SERVICE_UNAVAILABLE,
+                        HTTPStatus.INTERNAL_SERVER_ERROR,
+                    ):
+                        # This is an abort request initiated by scheduler.
+                        # Delete the key to prevent resending abort request to the scheduler and
+                        # to ensure aborted request state is cleaned up.
+                        if state.obj.rid in self.rid_to_state:
+                            del self.rid_to_state[state.obj.rid]
+
+                        # Mark ongoing LoRA request as finished.
+                        if self.server_args.enable_lora and state.obj.lora_path:
+                            await self.lora_registry.release(state.obj.lora_id)
+                        if not is_stream:
+                            raise fastapi.HTTPException(
+                                status_code=finish_reason["status_code"],
+                                detail=finish_reason["message"],
+                            )
+                        else:
+                            yield out
+                            break
+                yield out
+                break
+
+            state.event.clear()
+
+            if is_stream:
+                # Record response sent time right before we send response.
+                if not state.time_stats.response_sent_to_client_time:
+                    state.time_stats.set_response_sent_to_client_time()
+                    out["meta_info"][
+                        "response_sent_to_client_ts"
+                    ] = state.time_stats.get_response_sent_to_client_realtime()
+                yield out
+            else:
+                if (
+                    request is not None
+                    and not obj.background
+                    and await request.is_disconnected()
+                ):
+                    # Abort the request for disconnected requests (non-streaming, running)
+                    self.abort_request(obj.rid)
+                    # Use exception to kill the whole call stack and asyncio task
+                    raise ValueError(
+                        f"Request is disconnected from the client side (type 3). Abort request {obj.rid=}"
+                    )
+
+    async def _handle_batch_request(
+        self,
+        obj: Union[GenerateReqInput, EmbeddingReqInput],
+        request: Optional[fastapi.Request] = None,
+    ):
+        batch_size = obj.batch_size
+
+        generators = []
+        rids = []
+        if getattr(obj, "parallel_sample_num", 1) == 1:
+            if self._should_use_batch_tokenization(batch_size, obj):
+                tokenized_objs = await self._batch_tokenize_and_process(batch_size, obj)
+                self._send_batch_request(tokenized_objs)
+
+                # Set up generators for each request in the batch
+                for i in range(batch_size):
+                    tmp_obj = obj[i]
+                    state = self.rid_to_state[tmp_obj.rid]
+                    state.obj = tmp_obj
+                    generators.append(self._wait_one_response(tmp_obj, state, request))
+                    rids.append(tmp_obj.rid)
+            else:
+                # Sequential tokenization and processing
+                with (
+                    input_blocker_guard_region(send_to_scheduler=self.send_to_scheduler)
+                    if get_bool_env_var("SGLANG_ENABLE_COLOCATED_BATCH_GEN")
+                    else nullcontext()
+                ):
+                    for i in range(batch_size):
+                        tmp_obj = obj[i]
+                        tokenized_obj = await self._tokenize_one_request(tmp_obj)
+                        state = self.rid_to_state[tmp_obj.rid]
+                        state.obj = tmp_obj
+                        self._send_one_request(tokenized_obj)
+                        generators.append(
+                            self._wait_one_response(tmp_obj, state, request)
+                        )
+                        rids.append(tmp_obj.rid)
+        else:
+            # FIXME: When using batch and parallel_sample_num together, the perf is not optimal.
+            if batch_size > 128:
+                logger.warning(
+                    "Sending a single large batch with parallel sampling (n > 1) has not been well optimized. "
+                    "The performance might be better if you just duplicate the requests n times or use "
+                    "many threads to send them one by one with parallel sampling (n > 1)."
+                )
+
+            # Tokenize all requests
+            objs = [obj[i] for i in range(batch_size)]
+            tokenized_objs = await asyncio.gather(
+                *(self._tokenize_one_request(obj) for obj in objs)
+            )
+
+            # Cache the common prefix for parallel sampling
+            for i in range(batch_size):
+                tmp_obj = copy.copy(objs[i])
+                tokenized_obj = copy.copy(tokenized_objs[i])
+                tokenized_obj.rid = tmp_obj.regenerate_rid()
+                tokenized_obj.sampling_params = copy.copy(tokenized_obj.sampling_params)
+                tokenized_obj.sampling_params.max_new_tokens = 0
+                tokenized_obj.stream = False
+                self._req_stats_init(tmp_obj)
+                state = self.rid_to_state[tmp_obj.rid]
+                tokenized_obj.time_stats = state.time_stats
+                self._send_one_request(tokenized_obj)
+                await self._wait_one_response(tmp_obj, state, request).__anext__()
+
+            # Expand requests, assign new rids for them, and send them
+            for i in range(batch_size):
+                for _ in range(obj.parallel_sample_num):
+                    tmp_obj = copy.copy(objs[i])
+                    tokenized_obj = copy.copy(tokenized_objs[i])
+                    tokenized_obj.rid = tmp_obj.regenerate_rid()
+                    self._req_stats_init(tmp_obj)
+                    state = self.rid_to_state[tmp_obj.rid]
+                    tokenized_obj.time_stats = state.time_stats
+                    self._send_one_request(tokenized_obj)
+                    generators.append(self._wait_one_response(tmp_obj, state, request))
+                    rids.append(tmp_obj.rid)
+
+                self.rid_to_state[objs[i].rid].time_stats.set_finished_time()
+                del self.rid_to_state[objs[i].rid]
+
+        # Wait for all requests
+        is_stream = hasattr(obj, "stream") and obj.stream
+        if not is_stream:
+            outputs = await asyncio.gather(*(gen.__anext__() for gen in generators))
+            yield outputs
+        else:
+            rid_to_index = {rid: i for i, rid in enumerate(rids)}
+            task_map = {asyncio.create_task(gen.__anext__()): gen for gen in generators}
+            while task_map:
+                done, _ = await asyncio.wait(
+                    task_map.keys(), return_when=asyncio.FIRST_COMPLETED
+                )
+
+                for task in done:
+                    gen = task_map.pop(task)
+                    try:
+                        result = task.result()
+                        result["index"] = rid_to_index[result["meta_info"]["id"]]
+                        yield result
+                        new_task = asyncio.create_task(gen.__anext__())
+                        task_map[new_task] = gen
+                    except StopAsyncIteration:
+                        pass
+
+    def abort_request(self, rid: str = "", abort_all: bool = False):
+        if not abort_all and rid not in self.rid_to_state:
+            return
+        req = AbortReq(rid=rid, abort_all=abort_all)
+        self.send_to_scheduler.send_pyobj(req)
+        if self.enable_metrics:
+            # TODO: also use custom_labels from the request
+            self.metrics_collector.observe_one_aborted_request(
+                self.metrics_collector.labels
+            )
+
+    async def pause_generation(self, obj: PauseGenerationReqInput):
+        async with self.is_pause_cond:
+            self.is_pause = True
+            if obj.mode != "abort":
+                await self.send_to_scheduler.send_pyobj(obj)
+            else:
+                # we are using the model_update_lock to check if there is still on-going requests.
+                while True:
+                    # TODO: maybe make it async instead of fire-and-forget
+                    self.abort_request(abort_all=True)
+                    is_locked = await self.model_update_lock.is_locked()
+                    if not is_locked:
+                        break
+                    await asyncio.sleep(1.0)
+
+    async def continue_generation(self, obj: ContinueGenerationReqInput):
+        async with self.is_pause_cond:
+            self.is_pause = False
+            await self.send_to_scheduler.send_pyobj(obj)
+            self.is_pause_cond.notify_all()
+
+    async def update_weights_from_disk(
+        self,
+        obj: UpdateWeightFromDiskReqInput,
+        request: Optional[fastapi.Request] = None,
+    ) -> Tuple[bool, str]:
+        self.auto_create_handle_loop()
+
+        # default the load format to the server_args
+        if obj.load_format is None:
+            obj.load_format = self.server_args.load_format
+        logger.info("Start update_weights. Load format=%s", obj.load_format)
+
+        if obj.abort_all_requests:
+            self.abort_request(abort_all=True)
+
+        # Immediately update the weights if the engine is in paused state
+        async with self.is_pause_cond:
+            is_paused = self.is_pause
+
+        lock_context = (
+            self.model_update_lock.writer_lock if not is_paused else nullcontext()
+        )
+        async with lock_context:
+            success, message, num_paused_requests = (
+                await self._wait_for_model_update_from_disk(obj)
+            )
+
+        if success and obj.weight_version is not None:
+            self._update_weight_version_if_provided(obj.weight_version)
+            message += f" Weight version updated to {obj.weight_version}."
+
+        return success, message, num_paused_requests
+
+    def _update_model_path_info(self, model_path: str, load_format: str):
+        self.served_model_name = model_path
+        self.server_args.model_path = model_path
+        self.server_args.load_format = load_format
+        self.model_path = model_path
+
+    async def _wait_for_model_update_from_disk(
+        self, obj: UpdateWeightFromDiskReqInput
+    ) -> Tuple[bool, str]:
+        self.send_to_scheduler.send_pyobj(obj)
+        self.model_update_result = asyncio.Future()
+        if self.server_args.dp_size == 1:
+            result = await self.model_update_result
+            if result.success:
+                self._update_model_path_info(obj.model_path, obj.load_format)
+            return result.success, result.message, result.num_paused_requests
+        else:  # self.server_args.dp_size > 1
+            self.model_update_tmp = []
+            result = await self.model_update_result
+
+            all_success = all([r.success for r in result])
+            if all_success is True:
+                self._update_model_path_info(obj.model_path, obj.load_format)
+            all_message = [r.message for r in result]
+            all_message = " | ".join(all_message)
+            all_paused_requests = [r.num_paused_requests for r in result]
+            return all_success, all_message, all_paused_requests
+
+    def configure_logging(self, obj: ConfigureLoggingReq):
+        self.request_logger.configure(
+            log_requests=obj.log_requests,
+            log_requests_level=obj.log_requests_level,
+            log_requests_format=obj.log_requests_format,
+        )
+        if obj.dump_requests_folder is not None:
+            self.dump_requests_folder = obj.dump_requests_folder
+        if obj.dump_requests_threshold is not None:
+            self.dump_requests_threshold = obj.dump_requests_threshold
+        if obj.crash_dump_folder is not None:
+            self.crash_dump_folder = obj.crash_dump_folder
+        logging.info(f"Config logging: {obj=}")
+
+    async def freeze_gc(self):
+        """Send a freeze_gc message to the scheduler first, then freeze locally."""
+        self.send_to_scheduler.send_pyobj(FreezeGCReq())
+        freeze_gc("Tokenizer Manager")
+        return None
+
+    def create_abort_task(self, obj: GenerateReqInput):
+        # Abort the request if the client is disconnected.
+        async def abort_request():
+            await asyncio.sleep(2)
+            if obj.is_single:
+                self.abort_request(obj.rid)
+            else:
+                for rid in obj.rid:
+                    self.abort_request(rid)
+
+        background_tasks = BackgroundTasks()
+        background_tasks.add_task(abort_request)
+        return background_tasks
+
+    def auto_create_handle_loop(self):
+        if self.event_loop is not None:
+            return
+
+        # Create and start the handle_loop task
+        loop = get_or_create_event_loop()
+        self.asyncio_tasks.add(
+            loop.create_task(print_exception_wrapper(self.handle_loop))
+        )
+        self.event_loop = loop
+
+        # We only add signal handler when the tokenizer manager is in the main thread
+        # due to the CPython limitation.
+        if threading.current_thread() is threading.main_thread():
+            signal_handler = self.signal_handler_class(self)
+            loop.add_signal_handler(signal.SIGTERM, signal_handler.sigterm_handler)
+            # Update the signal handler for the process. It overrides the sigquit handler in the launch phase.
+            loop.add_signal_handler(
+                signal.SIGQUIT, signal_handler.running_phase_sigquit_handler
+            )
+
+        self.asyncio_tasks.add(
+            loop.create_task(print_exception_wrapper(self.sigterm_watchdog))
+        )
+
+    async def handle_loop(self):
+        """The event loop that handles requests"""
+        while True:
+            with self.soft_watchdog.disable():
+                recv_obj = await self.recv_from_detokenizer.recv_pyobj()
+            self._result_dispatcher(recv_obj)
+            self.last_receive_tstamp = real_time()
+            self.soft_watchdog.feed()
+
+    def _handle_batch_output(
+        self,
+        recv_obj: Union[
+            BatchStrOutput,
+            BatchEmbeddingOutput,
+            BatchMultimodalOutput,
+            BatchTokenIDOutput,
+        ],
+    ):
+        for i, rid in enumerate(recv_obj.rids):
+            state = self.rid_to_state.get(rid, None)
+            if state is None:
+                logger.error(
+                    f"Received output for {rid=} but the state was deleted in TokenizerManager."
+                )
+                continue
+
+            # Build meta_info and return value
+            meta_info = {
+                "id": rid,
+                "finish_reason": recv_obj.finished_reasons[i],
+                "prompt_tokens": recv_obj.prompt_tokens[i],
+                "weight_version": self.server_args.weight_version,
+                "total_retractions": recv_obj.retraction_counts[i],
+            }
+
+            if self.enable_metrics:
+                if recv_obj.time_stats is not None:
+                    scheduler_time_stats = recv_obj.time_stats[i]
+                    meta_info.update(scheduler_time_stats.convert_to_output_meta_info())
+
+            if getattr(state.obj, "return_logprob", False):
+                self.convert_logprob_style(
+                    meta_info,
+                    state,
+                    state.obj.top_logprobs_num,
+                    state.obj.token_ids_logprob,
+                    state.obj.return_text_in_logprobs
+                    and not self.server_args.skip_tokenizer_init,
+                    recv_obj,
+                    i,
+                )
+
+            if not isinstance(recv_obj, BatchEmbeddingOutput):
+                meta_info.update(
+                    {
+                        "completion_tokens": recv_obj.completion_tokens[i],
+                        "cached_tokens": recv_obj.cached_tokens[i],
+                    }
+                )
+                # Add detailed cache breakdown if available
+                if (
+                    hasattr(recv_obj, "cached_tokens_details")
+                    and recv_obj.cached_tokens_details
+                ):
+                    meta_info["cached_tokens_details"] = recv_obj.cached_tokens_details[
+                        i
+                    ]
+
+            if getattr(recv_obj, "output_hidden_states", None):
+                meta_info["hidden_states"] = recv_obj.output_hidden_states[i]
+            if getattr(recv_obj, "routed_experts", None):
+                meta_info["routed_experts"] = recv_obj.routed_experts[i]
+            if getattr(recv_obj, "customized_info", None):
+                for k, v in recv_obj.customized_info.items():
+                    meta_info[k] = v[i]
+            if getattr(recv_obj, "dp_ranks", None):
+                meta_info["dp_rank"] = recv_obj.dp_ranks[i]
+
+            if isinstance(recv_obj, BatchStrOutput):
+                state.text += recv_obj.output_strs[i]
+                # Not all request types have `stream` (e.g., EmbeddingReqInput). Default to non-streaming.
+                is_stream = getattr(state.obj, "stream", False)
+                if self.server_args.stream_output and is_stream:
+                    state.output_ids.extend(recv_obj.output_ids[i])
+                    output_token_ids = state.output_ids[state.last_output_offset :]
+                    state.last_output_offset = len(state.output_ids)
+                else:
+                    state.output_ids.extend(recv_obj.output_ids[i])
+                    output_token_ids = state.output_ids.copy()
+
+                out_dict = {
+                    "text": state.text,
+                    "output_ids": output_token_ids,
+                    "meta_info": meta_info,
+                }
+
+            elif isinstance(recv_obj, BatchTokenIDOutput):
+                is_stream = getattr(state.obj, "stream", False)
+                if self.server_args.stream_output and is_stream:
+                    state.output_ids.extend(recv_obj.output_ids[i])
+                    output_token_ids = state.output_ids[state.last_output_offset :]
+                    state.last_output_offset = len(state.output_ids)
+                else:
+                    state.output_ids.extend(recv_obj.output_ids[i])
+                    output_token_ids = state.output_ids.copy()
+
+                out_dict = {
+                    "output_ids": output_token_ids,
+                    "meta_info": meta_info,
+                }
+            elif isinstance(recv_obj, BatchMultimodalOutput):
+                raise NotImplementedError("BatchMultimodalOut not implemented")
+            else:
+                assert isinstance(recv_obj, BatchEmbeddingOutput)
+                out_dict = {
+                    "embedding": recv_obj.embeddings[i],
+                    "meta_info": meta_info,
+                }
+
+            state.finished = recv_obj.finished_reasons[i] is not None
+            if state.finished:
+                # Ensure first_token_time is set before computing decode_throughput.
+                # Without this, requests that finish on their first output batch
+                # would have first_token_time=0.0, producing bogus throughput.
+                if state.time_stats.first_token_time == 0.0:
+                    state.time_stats.set_first_token_time()
+                state.time_stats.trace_ctx.trace_set_root_attrs(
+                    self.convert_to_span_attrs(state, recv_obj, i)
+                )
+                state.time_stats.set_finished_time()
+                meta_info["e2e_latency"] = state.time_stats.get_e2e_latency()
+
+                if self.server_args.speculative_algorithm:
+                    self._calculate_spec_decoding_metrics(meta_info, recv_obj, i)
+                if self.enable_metrics:
+                    scheduler_time_stats = (
+                        recv_obj.time_stats[i]
+                        if recv_obj.time_stats is not None
+                        else None
+                    )
+                    completion_tokens = (
+                        recv_obj.completion_tokens[i]
+                        if not isinstance(recv_obj, BatchEmbeddingOutput)
+                        else 0
+                    )
+                    meta_info.update(
+                        state.time_stats.convert_to_output_meta_info(
+                            scheduler_time_stats, completion_tokens
+                        )
+                    )
+
+                del self.rid_to_state[rid]
+
+                # Mark ongoing LoRA request as finished.
+                if self.server_args.enable_lora and state.obj.lora_path:
+                    asyncio.create_task(self.lora_registry.release(state.obj.lora_id))
+
+            state.out_list.append(out_dict)
+            state.event.set()
+
+            # Log metrics and dump
+            if self.enable_metrics and state.obj.log_metrics:
+                self.collect_metrics(state, recv_obj, i)
+            if self.dump_requests_folder and state.finished and state.obj.log_metrics:
+                self.dump_requests(state, out_dict)
+            if self.crash_dump_folder and state.finished and state.obj.log_metrics:
+                self.record_request_for_crash_dump(state, out_dict)
+
+        # When skip_tokenizer_init is enabled, tokensizer_manager receives
+        # BatchTokenIDOutput.
+        if (
+            self.server_args.dp_size > 1
+            and isinstance(recv_obj, (BatchStrOutput, BatchTokenIDOutput))
+            and recv_obj.load is not None
+        ):
+            load_update_req = WatchLoadUpdateReq(loads=[recv_obj.load])
+            self.send_to_scheduler.send_pyobj(load_update_req)
+
+    def add_logprob_to_meta_info(
+        self,
+        meta_info: dict,
+        state: ReqState,
+        top_logprobs_num: int,
+        token_ids_logprob: List[int],
+        return_text_in_logprobs: bool,
+    ):
+        # 1. Handle regular logprobs
+        if len(state.input_token_logprobs_val) > len(state.input_token_logprobs):
+            state.input_token_logprobs.extend(
+                self.detokenize_logprob_tokens(
+                    state.input_token_logprobs_val[len(state.input_token_logprobs) :],
+                    state.input_token_logprobs_idx[len(state.input_token_logprobs) :],
+                    return_text_in_logprobs,
+                )
+            )
+
+        if len(state.output_token_logprobs_val) > len(state.output_token_logprobs):
+            state.output_token_logprobs.extend(
+                self.detokenize_logprob_tokens(
+                    state.output_token_logprobs_val[len(state.output_token_logprobs) :],
+                    state.output_token_logprobs_idx[len(state.output_token_logprobs) :],
+                    return_text_in_logprobs,
+                )
+            )
+
+        meta_info["input_token_logprobs"] = state.input_token_logprobs
+        meta_info["output_token_logprobs"] = state.output_token_logprobs
+
+        # 2. Handle top logprobs
+        if top_logprobs_num > 0:
+            if len(state.input_top_logprobs_val) > len(state.input_top_logprobs):
+                state.input_top_logprobs.extend(
+                    self.detokenize_top_logprobs_tokens(
+                        state.input_top_logprobs_val[len(state.input_top_logprobs) :],
+                        state.input_top_logprobs_idx[len(state.input_top_logprobs) :],
+                        return_text_in_logprobs,
+                    )
+                )
+            if len(state.output_top_logprobs_val) > len(state.output_top_logprobs):
+                state.output_top_logprobs.extend(
+                    self.detokenize_top_logprobs_tokens(
+                        state.output_top_logprobs_val[len(state.output_top_logprobs) :],
+                        state.output_top_logprobs_idx[len(state.output_top_logprobs) :],
+                        return_text_in_logprobs,
+                    )
+                )
+
+            meta_info["input_top_logprobs"] = state.input_top_logprobs
+            meta_info["output_top_logprobs"] = state.output_top_logprobs
+
+        # 3. Handle token_ids_logprob
+        if token_ids_logprob is not None:
+            if len(state.input_token_ids_logprobs_val) > len(
+                state.input_token_ids_logprobs
+            ):
+                state.input_token_ids_logprobs.extend(
+                    self.detokenize_top_logprobs_tokens(
+                        state.input_token_ids_logprobs_val[
+                            len(state.input_token_ids_logprobs) :
+                        ],
+                        state.input_token_ids_logprobs_idx[
+                            len(state.input_token_ids_logprobs) :
+                        ],
+                        return_text_in_logprobs,
+                    )
+                )
+            if len(state.output_token_ids_logprobs_val) > len(
+                state.output_token_ids_logprobs
+            ):
+                state.output_token_ids_logprobs.extend(
+                    self.detokenize_top_logprobs_tokens(
+                        state.output_token_ids_logprobs_val[
+                            len(state.output_token_ids_logprobs) :
+                        ],
+                        state.output_token_ids_logprobs_idx[
+                            len(state.output_token_ids_logprobs) :
+                        ],
+                        return_text_in_logprobs,
+                    )
+                )
+
+            meta_info["input_token_ids_logprobs"] = state.input_token_ids_logprobs
+            meta_info["output_token_ids_logprobs"] = state.output_token_ids_logprobs
+
+    def convert_logprob_style(
+        self,
+        meta_info: dict,
+        state: ReqState,
+        top_logprobs_num: int,
+        token_ids_logprob: List[int],
+        return_text_in_logprobs: bool,
+        recv_obj: BatchStrOutput,
+        recv_obj_index: int,
+    ):
+        if recv_obj.input_token_logprobs_val is None:
+            return
+
+        if (
+            len(recv_obj.input_token_logprobs_val) > 0
+            and recv_obj.input_token_logprobs_val[recv_obj_index] is not None
+        ):
+            state.input_token_logprobs_val.extend(
+                recv_obj.input_token_logprobs_val[recv_obj_index]
+            )
+            state.input_token_logprobs_idx.extend(
+                recv_obj.input_token_logprobs_idx[recv_obj_index]
+            )
+        state.output_token_logprobs_val.extend(
+            recv_obj.output_token_logprobs_val[recv_obj_index]
+        )
+        state.output_token_logprobs_idx.extend(
+            recv_obj.output_token_logprobs_idx[recv_obj_index]
+        )
+
+        if top_logprobs_num > 0:
+            if len(recv_obj.input_top_logprobs_val) > 0:
+                state.input_top_logprobs_val.extend(
+                    recv_obj.input_top_logprobs_val[recv_obj_index]
+                )
+                state.input_top_logprobs_idx.extend(
+                    recv_obj.input_top_logprobs_idx[recv_obj_index]
+                )
+            state.output_top_logprobs_val.extend(
+                recv_obj.output_top_logprobs_val[recv_obj_index]
+            )
+            state.output_top_logprobs_idx.extend(
+                recv_obj.output_top_logprobs_idx[recv_obj_index]
+            )
+
+        if token_ids_logprob is not None:
+            if len(recv_obj.input_token_ids_logprobs_val) > 0:
+                state.input_token_ids_logprobs_val.extend(
+                    recv_obj.input_token_ids_logprobs_val[recv_obj_index]
+                )
+                state.input_token_ids_logprobs_idx.extend(
+                    recv_obj.input_token_ids_logprobs_idx[recv_obj_index]
+                )
+            state.output_token_ids_logprobs_val.extend(
+                recv_obj.output_token_ids_logprobs_val[recv_obj_index]
+            )
+            state.output_token_ids_logprobs_idx.extend(
+                recv_obj.output_token_ids_logprobs_idx[recv_obj_index]
+            )
+
+        self.add_logprob_to_meta_info(
+            meta_info,
+            state,
+            state.obj.top_logprobs_num,
+            state.obj.token_ids_logprob,
+            return_text_in_logprobs,
+        )
+
+    def detokenize_logprob_tokens(
+        self,
+        token_logprobs_val: List[float],
+        token_logprobs_idx: List[int],
+        decode_to_text: bool,
+    ):
+        if not decode_to_text:
+            return [
+                (logprob, token_id, None)
+                for logprob, token_id in zip(token_logprobs_val, token_logprobs_idx)
+            ]
+        else:
+            assert self.tokenizer is not None
+            token_texts = self.tokenizer.batch_decode(token_logprobs_idx)
+            return list(zip(token_logprobs_val, token_logprobs_idx, token_texts))
+
+    def detokenize_top_logprobs_tokens(
+        self,
+        token_logprobs_val: List[float],
+        token_logprobs_idx: List[int],
+        decode_to_text: bool,
+    ):
+        # TODO: The current implementation only batches the detokenization for top-k tokens per single position.
+        # We should batch all top-k tokens in all positions.
+        ret = []
+        for i in range(len(token_logprobs_val)):
+            if token_logprobs_val[i]:
+                ret.append(
+                    self.detokenize_logprob_tokens(
+                        token_logprobs_val[i], token_logprobs_idx[i], decode_to_text
+                    )
+                )
+            else:
+                ret.append(None)
+        return ret
+
+    def _calculate_spec_decoding_metrics(
+        self,
+        meta_info: Dict[str, Any],
+        recv_obj: Union[
+            BatchStrOutput,
+            BatchEmbeddingOutput,
+            BatchMultimodalOutput,
+            BatchTokenIDOutput,
+        ],
+        i: int,
+    ) -> None:
+        """Calculate speculative decoding metrics, such as acceptance rate and acceptance length metrics."""
+        if (
+            hasattr(recv_obj, "spec_verify_ct")
+            and recv_obj.spec_verify_ct[i] > 0
+            and hasattr(recv_obj, "spec_accepted_tokens")
+            and len(recv_obj.spec_accepted_tokens) > i
+        ):
+            # The draft tokens per speculative step (excluding the target-sampled token).
+            num_guess_tokens = self.server_args.speculative_num_draft_tokens - 1
+            total_draft_tokens = recv_obj.spec_verify_ct[i] * num_guess_tokens
+            accepted_tokens = recv_obj.spec_accepted_tokens[i]
+
+            # Calculate per-request acceptance rate and average acceptance length.
+            if total_draft_tokens > 0:
+                # Calculate acceptance rate: accepted / (steps * lookahead)
+                meta_info["spec_accept_rate"] = accepted_tokens / total_draft_tokens
+                meta_info["spec_accept_length"] = (
+                    recv_obj.completion_tokens[i] / recv_obj.spec_verify_ct[i]
+                )
+                meta_info["spec_accept_token_num"] = accepted_tokens
+                meta_info["spec_draft_token_num"] = total_draft_tokens
+                meta_info["spec_verify_ct"] = recv_obj.spec_verify_ct[i]
+
+            # Acceptance histogram: tracks how many decoding steps accepted a certain number of draft tokens.
+            if (
+                recv_obj.spec_acceptance_histogram
+                and len(recv_obj.spec_acceptance_histogram) > i
+                and recv_obj.spec_acceptance_histogram[i]
+            ):
+                meta_info["spec_accept_histogram"] = recv_obj.spec_acceptance_histogram[
+                    i
+                ]
+
+    def _request_has_grammar(self, obj: GenerateReqInput) -> bool:
+        return (
+            obj.sampling_params.get("json_schema", None)
+            or obj.sampling_params.get("regex", None)
+            or obj.sampling_params.get("ebnf", None)
+            or obj.sampling_params.get("structural_tag", None)
+        )
+
+    def collect_metrics(self, state: ReqState, recv_obj: BatchStrOutput, i: int):
+        completion_tokens = (
+            recv_obj.completion_tokens[i]
+            if getattr(recv_obj, "completion_tokens", None)
+            else 0
+        )
+
+        custom_labels = getattr(state.obj, "custom_labels", None)
+        labels = dict(self.metrics_collector.labels)
+        if custom_labels:
+            labels.update(custom_labels)
+        if self.enable_priority_scheduling:
+            priority = getattr(state.obj, "priority", None)
+            if priority is not None:
+                labels["priority"] = str(priority)
+        if (
+            state.time_stats.first_token_time == 0.0
+            and self.disaggregation_mode != DisaggregationMode.PREFILL
+        ):
+            state.time_stats.set_first_token_time()
+            state.last_completion_tokens = completion_tokens
+            self.metrics_collector.observe_time_to_first_token(
+                labels, state.time_stats.get_first_token_latency()
+            )
+        else:
+            num_new_tokens = completion_tokens - state.last_completion_tokens
+            if num_new_tokens:
+                self.metrics_collector.observe_inter_token_latency(
+                    labels,
+                    state.time_stats.get_interval(),
+                    num_new_tokens,
+                )
+                state.time_stats.set_last_time()
+                state.last_completion_tokens = completion_tokens
+
+        if state.finished:
+            retraction_count = (
+                recv_obj.retraction_counts[i]
+                if getattr(recv_obj, "retraction_counts", None)
+                and i < len(recv_obj.retraction_counts)
+                else 0
+            )
+
+            # Get detailed cache breakdown if available
+            cached_tokens_details = None
+            if (
+                hasattr(recv_obj, "cached_tokens_details")
+                and recv_obj.cached_tokens_details
+            ):
+                cached_tokens_details = recv_obj.cached_tokens_details[i]
+
+            self.metrics_collector.observe_one_finished_request(
+                labels,
+                recv_obj.prompt_tokens[i],
+                completion_tokens,
+                recv_obj.cached_tokens[i],
+                state.time_stats.get_e2e_latency(),
+                self._request_has_grammar(state.obj),
+                retraction_count,
+                cached_tokens_details,
+            )
+
+    def dump_requests(self, state: ReqState, out_dict: dict):
+        self.dump_request_list.append(
+            (
+                state.obj,
+                out_dict,
+                convert_time_to_realtime(state.time_stats.created_time),
+                convert_time_to_realtime(state.time_stats.finished_time),
+            )
+        )
+
+        if len(self.dump_request_list) >= self.dump_requests_threshold:
+            filename = os.path.join(
+                self.dump_requests_folder,
+                datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + ".pkl",
+            )
+            self._dump_data_to_file(
+                data_list=self.dump_request_list,
+                filename=filename,
+                log_message=f"Dump {len(self.dump_request_list)} requests to {filename}",
+            )
+            self.dump_request_list = []
+
+    def record_request_for_crash_dump(self, state: ReqState, out_dict: dict):
+        current_time = real_time()
+        self.crash_dump_request_list.append(
+            (
+                state.obj,
+                out_dict,
+                convert_time_to_realtime(state.time_stats.created_time),
+                current_time,
+            )
+        )
+        # Remove requests older than 5 minutes based on finish time
+        while (
+            self.crash_dump_request_list
+            and current_time - self.crash_dump_request_list[0][3] >= 300
+        ):
+            self.crash_dump_request_list.popleft()
+
+    def _dump_data_to_file(
+        self, data_list: List[Tuple], filename: str, log_message: str
+    ):
+        logger.info(log_message)
+        to_dump_with_server_args = {
+            "server_args": self.server_args,
+            "requests": data_list.copy(),
+        }
+
+        def background_task():
+            os.makedirs(os.path.dirname(filename), exist_ok=True)
+            with open(filename, "wb") as f:
+                pickle.dump(to_dump_with_server_args, f)
+
+        asyncio.create_task(asyncio.to_thread(background_task))
+
+    def dump_requests_before_crash(
+        self, hostname: str = os.getenv("HOSTNAME", socket.gethostname())
+    ):
+        if not self.crash_dump_folder:
+            return
+
+        if self.crash_dump_performed:
+            logger.info(
+                "SIGTERM/SIGQUIT/Exception triggered, but crash dump already performed, skipping."
+            )
+            return
+        else:
+            self.crash_dump_performed = True
+
+        logger.error(f"Dumping requests before crash. {self.crash_dump_folder=}")
+
+        # Add finished requests from crash_dump_request_list
+        data_to_dump = []
+        if self.crash_dump_request_list:
+            data_to_dump.extend(self.crash_dump_request_list)
+
+        # Add unfinished requests from rid_to_state
+        unfinished_requests = []
+        for rid, state in self.rid_to_state.items():
+            if not state.finished:
+                state.time_stats.set_finished_time()
+                unfinished_requests.append(
+                    (
+                        state.obj,
+                        state.out_list[-1] if state.out_list else {},
+                        convert_time_to_realtime(state.time_stats.created_time),
+                        convert_time_to_realtime(state.time_stats.finished_time),
+                    )
+                )
+        if unfinished_requests:
+            data_to_dump.extend(unfinished_requests)
+
+        if not data_to_dump:
+            return
+
+        # Create a file
+        filename = os.path.join(
+            self.crash_dump_folder,
+            hostname,
+            f'crash_dump_{datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}.pkl',
+        )
+        os.makedirs(os.path.dirname(filename), exist_ok=True)
+
+        # Write the data to the file
+        data_to_dump_with_server_args = {
+            "server_args": self.server_args,  # Include server_args in the dump
+            "requests": data_to_dump,
+            "launch_command": " ".join(sys.argv),
+        }
+        with open(filename, "wb") as f:
+            pickle.dump(data_to_dump_with_server_args, f)
+        logger.error(
+            f"Dumped {len(self.crash_dump_request_list)} finished and {len(unfinished_requests)} unfinished requests before crash to {filename}"
+        )
+        return filename
+
+    async def sigterm_watchdog(self):
+        while not self.gracefully_exit:
+            await asyncio.sleep(5)
+
+        # Drain requests
+        while True:
+            remain_num_req = len(self.rid_to_state)
+            remaining_rids = list(self.rid_to_state.keys())
+
+            if self.server_status == ServerStatus.UnHealthy:
+                # if health check failed, we should exit immediately
+                logger.error(
+                    "Signal SIGTERM received while health check failed. Force exiting."
+                )
+                self.dump_requests_before_crash()
+                self.force_exit_handler()
+                break
+
+            elif get_bool_env_var("SGL_FORCE_SHUTDOWN"):
+                # if force shutdown flag set, exit immediately
+                logger.error(
+                    "Signal SIGTERM received while force shutdown flag set. Force exiting."
+                )
+                self.force_exit_handler()
+                break
+
+            logger.info(
+                f"Gracefully exiting... Remaining number of requests {remain_num_req}. Remaining requests {remaining_rids=}."
+            )
+            if remain_num_req > 0:
+                await asyncio.sleep(5)
+            else:
+                self.dump_requests_before_crash()
+                break
+
+        kill_process_tree(os.getpid(), include_parent=True)
+        sys.exit(0)
+
+    def force_exit_handler(self):
+        """Put some custom force exit logic here."""
+        pass
+
+    def _handle_abort_req(self, recv_obj: AbortReq):
+        if is_health_check_generate_req(recv_obj):
+            return
+        state = self.rid_to_state[recv_obj.rid]
+        state.finished = True
+        state.time_stats.set_finished_time()
+
+        abort_message = recv_obj.abort_message or "Abort in waiting queue"
+        finish_reason = {
+            "type": "abort",
+            "message": abort_message,
+        }
+        if recv_obj.finished_reason:
+            finish_reason = recv_obj.finished_reason
+        meta_info = {
+            "id": recv_obj.rid,
+            "finish_reason": finish_reason,
+            "weight_version": self.server_args.weight_version,
+            "e2e_latency": state.time_stats.get_e2e_latency(),
+        }
+        is_stream = getattr(state.obj, "stream", False)
+        if getattr(state.obj, "return_logprob", False):
+            self.add_logprob_to_meta_info(
+                meta_info,
+                state,
+                state.obj.top_logprobs_num,
+                state.obj.token_ids_logprob,
+                state.obj.return_text_in_logprobs
+                and not self.server_args.skip_tokenizer_init,
+            )
+
+        output_ids = state.output_ids
+        meta_info["completion_tokens"] = len(output_ids)
+        if is_stream:
+            output_ids = [output_ids[-1]] if len(output_ids) > 0 else []
+        out = {
+            "text": state.text,
+            "output_ids": output_ids,
+            "meta_info": meta_info,
+        }
+        state.out_list.append(out)
+        state.event.set()
+
+    def update_active_ranks(self, ranks: ActiveRanksOutput):
+        self.send_to_scheduler.send_pyobj(ranks)
+
+    def _handle_open_session_req_output(self, recv_obj):
+        self.session_futures[recv_obj.session_id].set_result(
+            recv_obj.session_id if recv_obj.success else None
+        )
+
+    def _handle_update_weights_from_disk_req_output(self, recv_obj):
+        if self.server_args.dp_size == 1:
+            self.model_update_result.set_result(recv_obj)
+        else:  # self.server_args.dp_size > 1
+            self.model_update_tmp.append(recv_obj)
+            # set future if the all results are received
+            if len(self.model_update_tmp) == self.server_args.dp_size:
+                self.model_update_result.set_result(self.model_update_tmp)
+
+    async def _validate_and_resolve_lora(
+        self, obj: Union[GenerateReqInput, EmbeddingReqInput]
+    ) -> None:
+        if not obj.lora_path:
+            return
+
+        if not self.server_args.enable_lora:
+            first_adapter = (
+                obj.lora_path
+                if isinstance(obj.lora_path, str)
+                else next((a for a in obj.lora_path if a), None)
+            )
+
+            raise ValueError(
+                f"LoRA adapter '{first_adapter}' was requested, but LoRA is not enabled. "
+                "Please launch the server with --enable-lora flag and preload adapters "
+                "using --lora-paths or /load_lora_adapter endpoint."
+            )
+
+        await self._resolve_lora_path(obj)
+
+    async def _resolve_lora_path(self, obj: Union[GenerateReqInput, EmbeddingReqInput]):
+        if isinstance(obj.lora_path, str):
+            unique_lora_paths = set([obj.lora_path])
+        else:
+            unique_lora_paths = set(obj.lora_path)
+
+        if (
+            self.server_args.max_loaded_loras is not None
+            and len(unique_lora_paths) > self.server_args.max_loaded_loras
+        ):
+            raise ValueError(
+                f"Received request with {len(unique_lora_paths)} unique loras requested "
+                f"but max loaded loras is {self.server_args.max_loaded_loras}"
+            )
+
+        # Reload all existing LoRA adapters that have been dynamically unloaded
+        unregistered_loras = await self.lora_registry.get_unregistered_loras(
+            unique_lora_paths
+        )
+        for lora_path in unregistered_loras:
+            if lora_path is None:
+                continue
+
+            if lora_path not in self.lora_ref_cache:
+                raise ValueError(
+                    f"Got LoRA adapter that has never been loaded: {lora_path}\n"
+                    f"All loaded adapters: {self.lora_ref_cache.keys()}."
+                )
+
+            logger.info(f"Reloading evicted adapter: {lora_path}")
+            new_lora_ref = self.lora_ref_cache[lora_path]
+            load_result = await self.load_lora_adapter(
+                LoadLoRAAdapterReqInput(
+                    lora_name=new_lora_ref.lora_name,
+                    lora_path=new_lora_ref.lora_path,
+                    pinned=new_lora_ref.pinned,
+                )
+            )
+            if (
+                not load_result.success
+                and "already loaded" not in load_result.error_message
+            ):
+                raise ValueError(
+                    f"Failed to implicitly load LoRA adapter {lora_path}: {load_result.error_message}"
+                )
+
+        # Look up the LoRA ID from the registry and start tracking ongoing LoRA requests.
+        obj.lora_id = await self.lora_registry.acquire(obj.lora_path)
+
+    def _req_stats_init(
+        self,
+        obj: Union[GenerateReqInput, EmbeddingReqInput],
+        request: Optional[fastapi.Request] = None,
+    ):
+        calibrate_time_diff()
+        created_time = obj.received_time
+
+        external_trace_header = None
+        if self.server_args.enable_trace:
+            if request:
+                external_trace_header = extract_trace_headers(request.headers)
+                obj.external_trace_header = external_trace_header
+            elif obj.external_trace_header:
+                # When the request comes form the rust grpc server or Engine there isn't a
+                # real request object but we still need to propagate the trace context from
+                # the trace context that is explicitly passed in
+                external_trace_header = obj.external_trace_header
+
+        if not hasattr(obj, "is_single") or obj.is_single:
+            time_stats = APIServerReqTimeStats(disagg_mode=self.disaggregation_mode)
+            state = ReqState([], False, asyncio.Event(), obj, time_stats)
+            self.rid_to_state[obj.rid] = state
+
+            if self.server_args.enable_trace:
+                bootstrap_room = (
+                    obj.bootstrap_room if hasattr(obj, "bootstrap_room") else None
+                )
+                time_stats.init_trace_ctx(
+                    obj.rid,
+                    bootstrap_room,
+                    external_trace_header,
+                )
+            time_stats.set_created_time(created_time)
+        else:
+            for i in range(len(obj.rid)):
+                time_stats = APIServerReqTimeStats(disagg_mode=self.disaggregation_mode)
+                state = ReqState([], False, asyncio.Event(), obj[i], time_stats)
+                self.rid_to_state[obj.rid[i]] = state
+
+                if self.server_args.enable_trace:
+                    bootstrap_room = (
+                        obj.bootstrap_room[i]
+                        if hasattr(obj, "bootstrap_room") and obj.bootstrap_room
+                        else None
+                    )
+                    time_stats.init_trace_ctx(
+                        obj.rid[i],
+                        bootstrap_room,
+                        external_trace_header,
+                    )
+                time_stats.set_created_time(created_time)
+
+    def _should_dispatch_to_encoder(
+        self, obj: Union[GenerateReqInput, EmbeddingReqInput]
+    ) -> bool:
+        """Check if the request should be dispatched to encoder for processing.
+
+        Returns True if the request should be dispatched to encoder (multiple multimodal items),
+        False if it should be processed locally (single multimodal item or no multimodal items).
+
+        Args:
+            obj: The request input object
+
+        Returns:
+            bool: True if should dispatch to encoder, False otherwise
+        """
+        if obj.batch_size > 1:
+            logger.warning(
+                "Batch request (batch_size=%d) is not supported in EPD disaggregation mode; skipping encoder dispatch.",
+                obj.batch_size,
+            )
+            return False
+        if not isinstance(obj, GenerateReqInput) or not obj.contains_mm_input():
+            return False
+
+        # Count image / video / audio items for dispatch threshold
+        def _count_mm_items(data):
+            return (
+                len(data) if isinstance(data, list) else (1 if data is not None else 0)
+            )
+
+        total_mm_items = (
+            _count_mm_items(getattr(obj, "image_data", None))
+            + _count_mm_items(getattr(obj, "video_data", None))
+            + _count_mm_items(getattr(obj, "audio_data", None))
+        )
+        return total_mm_items >= envs.SGLANG_ENCODER_DISPATCH_MIN_ITEMS.get()
+
+    def _handle_epd_disaggregation_encode_request(
+        self, obj: Union[GenerateReqInput, EmbeddingReqInput]
+    ):
+        """Handle EPD-disaggregation mode encoding request."""
+        if isinstance(obj, GenerateReqInput) and obj.contains_mm_input():
+            # dispatch to encoder by default
+            should_dispatch = True
+            if self.server_args.enable_adaptive_dispatch_to_encoder:
+                should_dispatch = self._should_dispatch_to_encoder(obj)
+
+            # Set need_wait_for_image flag based on whether we dispatch to encoder
+            # This flag will be used in _tokenize_one_request to determine processing path
+            if should_dispatch:
+                obj.need_wait_for_image = True
+                if self.server_args.encoder_transfer_backend == "zmq_to_scheduler":
+                    self.mm_receiver.send_encode_request(obj)
+            else:
+                obj.need_wait_for_image = False
+
+    def convert_to_span_attrs(
+        self,
+        state: ReqState,
+        recv_obj: Union[
+            BatchStrOutput,
+            BatchEmbeddingOutput,
+            BatchMultimodalOutput,
+            BatchTokenIDOutput,
+        ],
+        i: int,
+    ) -> Dict[str, Any]:
+        """Convert attributes to span attributes."""
+        span_attrs = {}
+
+        if not self.server_args.enable_trace:
+            return span_attrs
+
+        # Token usage attributes
+        span_attrs[SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS] = (
+            recv_obj.completion_tokens[i]
+        )
+        span_attrs[SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS] = recv_obj.prompt_tokens[
+            i
+        ]
+        span_attrs[SpanAttributes.GEN_AI_USAGE_CACHED_TOKENS] = recv_obj.cached_tokens[
+            i
+        ]
+
+        # Request identifiers
+        span_attrs[SpanAttributes.GEN_AI_REQUEST_ID] = (
+            str(state.obj.rid) if state.obj.rid else None
+        )
+
+        # Sampling parameters
+        sampling_params = state.obj.sampling_params or {}
+
+        if max_new_tokens := sampling_params.get("max_new_tokens"):
+            span_attrs[SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS] = max_new_tokens
+
+        if top_p := sampling_params.get("top_p"):
+            span_attrs[SpanAttributes.GEN_AI_REQUEST_TOP_P] = top_p
+
+        if temperature := sampling_params.get("temperature"):
+            span_attrs[SpanAttributes.GEN_AI_REQUEST_TEMPERATURE] = temperature
+
+        if top_k := sampling_params.get("top_k"):
+            span_attrs[SpanAttributes.GEN_AI_REQUEST_TOP_K] = top_k
+
+        if n := sampling_params.get("n"):
+            span_attrs[SpanAttributes.GEN_AI_REQUEST_N] = n
+
+        # Response attributes
+        span_attrs[SpanAttributes.GEN_AI_RESPONSE_MODEL] = self.served_model_name
+
+        finish_reason = (
+            recv_obj.finished_reasons[i].get("type")
+            if recv_obj.finished_reasons[i]
+            else None
+        )
+        if finish_reason:
+            span_attrs[SpanAttributes.GEN_AI_RESPONSE_FINISH_REASONS] = json.dumps(
+                [finish_reason]
+            )
+
+        # Latency attributes
+        span_attrs.update(state.time_stats.convert_to_gen_ai_span_attrs())
+
+        return span_attrs
+
+    def _set_default_priority(self, obj: Union[GenerateReqInput, EmbeddingReqInput]):
+        """Set the default priority value."""
+        if (
+            self.enable_priority_scheduling
+            and obj.priority is None
+            and self.default_priority_value is not None
+        ):
+            obj.priority = self.default_priority_value
+
+
+class ServerStatus(Enum):
+    Up = "Up"
+    Starting = "Starting"
+    UnHealthy = "UnHealthy"
+
+
+async def print_exception_wrapper(func):
+    """
+    Sometimes an asyncio function does not print exception.
+    We do another wrapper to handle the exception.
+    """
+    try:
+        await func()
+    except Exception:
+        traceback = get_exception_traceback()
+        logger.error(f"TokenizerManager hit an exception: {traceback}")
+        if hasattr(func, "__self__") and isinstance(func.__self__, TokenizerManager):
+            func.__self__.dump_requests_before_crash()
+        kill_process_tree(os.getpid(), include_parent=True)
+        sys.exit(1)
+
+
+def _get_processor_wrapper(server_args):
+    try:
+        processor = get_processor(
+            server_args.tokenizer_path,
+            tokenizer_mode=server_args.tokenizer_mode,
+            trust_remote_code=server_args.trust_remote_code,
+            revision=server_args.revision,
+            use_fast=not server_args.disable_fast_image_processor,
+        )
+    except ValueError as e:
+        error_message = str(e)
+        if "does not have a slow version" in error_message:
+            logger.info(
+                f"Processor {server_args.tokenizer_path} does not have a slow version. Automatically use fast version"
+            )
+            processor = get_processor(
+                server_args.tokenizer_path,
+                tokenizer_mode=server_args.tokenizer_mode,
+                trust_remote_code=server_args.trust_remote_code,
+                revision=server_args.revision,
+                use_fast=True,
+            )
+        else:
+            raise e
+    return processor
+
+
+def _determine_tensor_transport_mode(server_args: ServerArgs) -> TensorTransportMode:
+    is_cross_node = server_args.dist_init_addr
+
+    if is_cross_node:
+        # Fallback to default CPU transport for multi-node
+        return "default"
+    else:
+        return "cuda_ipc"
+
+
+class SignalHandler:
+    def __init__(self, tokenizer_manager: TokenizerManager):
+        self.tokenizer_manager = tokenizer_manager
+
+    def sigterm_handler(self, signum=None, frame=None):
+        logger.warning(
+            f"SIGTERM received. {signum=} {frame=}. Draining requests and shutting down..."
+        )
+        self.tokenizer_manager.gracefully_exit = True
+
+    def running_phase_sigquit_handler(self, signum=None, frame=None):
+        logger.error(
+            f"SIGQUIT received. {signum=}, {frame=}. It usually means one child failed."
+        )
+        self.tokenizer_manager.dump_requests_before_crash()
+        kill_process_tree(os.getpid())
+
+
+# Note: request abort handling logic
+# We should handle all of the following cases correctly.
+#
+# | entrypoint | is_streaming | status          | abort engine    | cancel asyncio task   | rid_to_state                |
+# | ---------- | ------------ | --------------- | --------------- | --------------------- | --------------------------- |
+# | http       | yes          | validation      | background task | fast api              | del in _handle_abort_req    |
+# | http       | yes          | waiting queue   | background task | fast api              | del in _handle_abort_req    |
+# | http       | yes          | running         | background task | fast api              | del in _handle_batch_output |
+# | http       | no           | validation      | http exception  | http exception        | del in _handle_abort_req    |
+# | http       | no           | waiting queue   | type 1          | type 1 exception      | del in _handle_abort_req    |
+# | http       | no           | running         | type 3          | type 3 exception      | del in _handle_batch_output |
+#
diff --git a/sglang/python/sglang/srt/managers/tokenizer_manager_multiitem_mixin.py b/sglang/python/sglang/srt/managers/tokenizer_manager_multiitem_mixin.py
new file mode 100644
index 0000000000000000000000000000000000000000..bbc685d3f4a4541f828421d2ead9b66d89beb871
--- /dev/null
+++ b/sglang/python/sglang/srt/managers/tokenizer_manager_multiitem_mixin.py
@@ -0,0 +1,405 @@
+import logging
+import math
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Union
+
+from sglang.srt.managers.io_struct import GenerateReqInput
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True, slots=True)
+class ScoreResult:
+    scores: List[List[float]]
+    prompt_tokens: int
+
+
+class TokenizerManagerMultiItemMixin:
+    async def score_prompts(
+        self,
+        prompts: Union[str, List[str], List[List[int]]],
+        label_token_ids: List[int],
+        apply_softmax: bool = False,
+        request: Optional[Any] = None,
+    ) -> ScoreResult:
+        """
+        Score probabilities of specified token IDs after each *full prompt*.
+
+        This is a thin wrapper over `score_request` that treats `prompts` as
+        already-composed inputs (i.e., no query/item concatenation needed).
+
+        Args:
+            prompts: A single prompt string, a list of prompt strings, or a list of
+                pre-tokenized prompt token ID sequences.
+            label_token_ids: Token IDs to compute probabilities for.
+            apply_softmax: Whether to normalize probabilities using softmax.
+            request: Optional FastAPI request object.
+
+        Returns:
+            ScoreResult with:
+                scores: List of score lists, one for each prompt, each in the order of label_token_ids.
+                prompt_tokens: The number of prompt tokens processed.
+        """
+        # Text prompts
+        if isinstance(prompts, str) or (
+            isinstance(prompts, list) and (not prompts or isinstance(prompts[0], str))
+        ):
+            return await self.score_request(
+                query="",
+                items=prompts,  # type: ignore[arg-type]
+                label_token_ids=label_token_ids,
+                apply_softmax=apply_softmax,
+                item_first=False,
+                request=request,
+            )
+
+        # Tokenized prompts
+        if isinstance(prompts, list) and (not prompts or isinstance(prompts[0], list)):
+            return await self.score_request(
+                query=[],
+                items=prompts,
+                label_token_ids=label_token_ids,
+                apply_softmax=apply_softmax,
+                item_first=False,
+                request=request,
+            )
+
+        raise ValueError("Invalid prompts type for score_prompts.")
+
+    def _initialize_multi_item_delimiter_text(self):
+        """Initialize multi-item delimiter text from token ID after tokenizer is loaded."""
+        if (
+            hasattr(self.server_args, "multi_item_scoring_delimiter")
+            and self.server_args.multi_item_scoring_delimiter is not None
+            and self.tokenizer is not None
+        ):
+            try:
+                self.multi_item_delimiter_text = self.tokenizer.decode(
+                    [self.server_args.multi_item_scoring_delimiter],
+                    skip_special_tokens=False,
+                )
+            except Exception as e:
+                logger.warning(
+                    f"Failed to decode delimiter token {self.server_args.multi_item_scoring_delimiter}: {e}"
+                )
+                self.multi_item_delimiter_text = None
+
+    def _build_multi_item_token_sequence(
+        self, query: List[int], items: List[List[int]], delimiter_token_id: int
+    ) -> List[int]:
+        """
+        Build a single token sequence for multi-item scoring.
+        Format: query<delimiter>item1<delimiter>item2<delimiter>item3<delimiter>
+
+        Args:
+            query: Query token IDs
+            items: List of item token ID sequences
+            delimiter_token_id: Token ID to use as delimiter
+
+        Returns:
+            Combined token sequence
+        """
+        combined_sequence = query[:]  # Start with query
+
+        for item in items:
+            combined_sequence.append(delimiter_token_id)  # Add delimiter
+            combined_sequence.extend(item)  # Add item tokens
+
+        # Add final delimiter after the last item for logprob extraction
+        combined_sequence.append(delimiter_token_id)
+
+        return combined_sequence
+
+    def _process_multi_item_scoring_results(
+        self,
+        results: Any,
+        items: List,
+        label_token_ids: List[int],
+        apply_softmax: bool,
+        batch_request=None,
+    ) -> ScoreResult:
+        """
+        Process results from multi-item scoring request.
+        Extracts logprobs at delimiter positions from input_token_ids_logprobs.
+
+        Args:
+            results: Results from generate_request
+            items: List of items being scored
+            label_token_ids: Token IDs to extract scores for
+            apply_softmax: Whether to apply softmax normalization
+            batch_request: The original batch request containing input sequence
+
+        Returns:
+            ScoreResult with:
+                scores: List of score lists, one for each prompt, each in the order of label_token_ids.
+                prompt_tokens: The number of prompt tokens processed.
+        """
+        result = results[0] if isinstance(results, list) else results
+        meta_info = result.get("meta_info", {})
+
+        # For multi-item scoring, logprobs are in input_token_ids_logprobs
+        input_logprobs = meta_info.get("input_token_ids_logprobs", [])
+        prompt_tokens = meta_info.get("prompt_tokens", 0)
+        request_id = meta_info.get("id", "<unknown>")
+
+        if not input_logprobs:
+            raise RuntimeError(
+                f"input_token_ids_logprobs is empty for multi-item scoring request {request_id}. "
+                "This indicates token_ids_logprobs were not computed properly for Multi-Item Scoring."
+            )
+
+        scores = []
+        num_items = len(items) if isinstance(items, list) else 1
+
+        # Check if we have the expected number of logprobs
+        expected_logprobs_count = num_items + 1
+        if len(input_logprobs) != expected_logprobs_count:
+            raise RuntimeError(
+                f"Expected {expected_logprobs_count} input_token_ids_logprobs for multi-item scoring "
+                f"with {num_items} items, but got {len(input_logprobs)}. "
+                f"Request ID: {request_id}"
+            )
+
+        # Skip the first delimiter (between query and first item) and process remaining delimiter positions
+        # We want to exclude the first one since it represents the boundary between query and first item, not an item boundary
+        start_idx = 1 if len(input_logprobs) > 1 else 0
+
+        # Process logprobs for each item position (excluding first delimiter)
+        for item_idx in range(num_items):
+            logprob_idx = start_idx + item_idx
+            item_logprobs_data = input_logprobs[logprob_idx]
+            logprobs = self._extract_logprobs_for_tokens(
+                item_logprobs_data, label_token_ids
+            )
+            score_list = self._convert_logprobs_to_scores(
+                logprobs, label_token_ids, apply_softmax
+            )
+            scores.append(score_list)
+
+        return ScoreResult(scores=scores, prompt_tokens=prompt_tokens)
+
+    def _process_single_item_scoring_results(
+        self, results: Any, label_token_ids: List[int], apply_softmax: bool
+    ) -> ScoreResult:
+        """
+        Process results from single-item scoring request.
+        Single-item scoring results are stored in output_token_ids_logprobs.
+
+        Args:
+            results: Results from generate_request
+            label_token_ids: Token IDs to extract scores for
+            apply_softmax: Whether to apply softmax normalization
+
+        Returns:
+            ScoreResult with:
+                scores: List of score lists, one for each prompt, each in the order of label_token_ids.
+                prompt_tokens: The number of prompt tokens processed.
+        """
+        scores = []
+        prompt_tokens = 0
+
+        for result in results:
+            # For single-item scoring, logprobs are in output_token_ids_logprobs
+            output_logprobs = result["meta_info"].get("output_token_ids_logprobs", [])
+            prompt_tokens += result["meta_info"].get("prompt_tokens", 0)
+
+            if not output_logprobs or len(output_logprobs) == 0:
+                raise RuntimeError(
+                    f"output_logprobs is empty for request {result['meta_info'].get('id', '<unknown>')}."
+                )
+
+            # Extract logprobs for the first (and only) position
+            logprobs = self._extract_logprobs_for_tokens(
+                output_logprobs[0], label_token_ids
+            )
+            score_list = self._convert_logprobs_to_scores(
+                logprobs, label_token_ids, apply_softmax
+            )
+            scores.append(score_list)
+
+        return ScoreResult(scores=scores, prompt_tokens=prompt_tokens)
+
+    async def score_request(
+        self,
+        query: Optional[Union[str, List[int]]] = None,
+        items: Optional[Union[str, List[str], List[List[int]]]] = None,
+        label_token_ids: Optional[List[int]] = None,
+        apply_softmax: bool = False,
+        item_first: bool = False,
+        request: Optional[Any] = None,
+    ) -> ScoreResult:
+        """
+        Score the probability of specified token IDs appearing after the given (query + item) pair.
+
+        This method supports two scoring approaches:
+        1. Single-Item scoring (default): Process each query+item pair independently
+        2. Multi-Item scoring: When multi_item_scoring_delimiter is set, combine query and
+           multiple items into a single sequence using delimiter for efficient processing.
+           Note: item_first parameter is ignored in multi-item scoring mode since it uses
+           a fixed format: query<delimiter>item1<delimiter>item2<delimiter>item3<delimiter>
+
+           Multi-item scoring works with both text and pre-tokenized inputs:
+           - Text: query<delimiter_text>item1<delimiter_text>item2<delimiter_text>item3<delimiter_text>
+           - Tokens: query<delimiter_token_id>item1<delimiter_token_id>item2<delimiter_token_id>item3<delimiter_token_id>
+
+        Args:
+            query: The query text or pre-tokenized query token IDs
+            items: The item text(s) or pre-tokenized item token IDs
+            label_token_ids: List of token IDs to compute probabilities for
+            apply_softmax: Whether to normalize probabilities using softmax
+            item_first: If True, prepend items to query. Ignored for multi-item scoring.
+            request: Optional FastAPI request object
+
+        Returns:
+            ScoreResult with:
+                scores: List of score lists, one for each prompt, each in the order of label_token_ids.
+                prompt_tokens: The number of prompt tokens processed.
+        """
+        if label_token_ids is None:
+            raise ValueError("label_token_ids must be provided")
+
+        if items is None:
+            raise ValueError("items must be provided")
+        if not items:
+            return ScoreResult(scores=[], prompt_tokens=0)
+
+        if self.tokenizer is not None:
+            vocab_size = self.tokenizer.vocab_size
+            for token_id in label_token_ids:
+                if token_id >= vocab_size:
+                    raise ValueError(
+                        f"Token ID {token_id} is out of vocabulary (vocab size: {vocab_size})"
+                    )
+
+        # Check if multi-item scoring is enabled by presence of delimiter
+        use_multi_item_scoring = (
+            self.server_args.multi_item_scoring_delimiter is not None
+            and self.multi_item_delimiter_text is not None
+        )
+
+        batch_request = GenerateReqInput(
+            token_ids_logprob=label_token_ids,
+            return_logprob=True,
+            # Set logprob_start_len=0 for multi-item scoring since we want logprobs at all delimiter positions
+            logprob_start_len=0 if use_multi_item_scoring else -1,
+            stream=False,
+            sampling_params={"max_new_tokens": 0},
+        )
+
+        # Handle string or tokenized query/items
+        if isinstance(query, str) and (
+            isinstance(items, str)
+            or (isinstance(items, list) and (not items or isinstance(items[0], str)))
+        ):
+            # Both query and items are text
+            items_list = [items] if isinstance(items, str) else items
+
+            if use_multi_item_scoring:
+                # Multi-item scoring: create single prompt with delimiter text
+                # Always use format: query<delimiter>item1<delimiter>item2<delimiter>item3<delimiter>
+                # (item_first is ignored for multi-item scoring)
+                delimiter = self.multi_item_delimiter_text
+                combined_items = delimiter.join(items_list)
+                # Add final delimiter after the last item for logprob extraction
+                single_prompt = f"{query}{delimiter}{combined_items}{delimiter}"
+                batch_request.text = [single_prompt]
+            else:
+                # Single-item scoring: create separate prompts for each item
+                if item_first:
+                    prompts = [f"{item}{query}" for item in items_list]
+                else:
+                    prompts = [f"{query}{item}" for item in items_list]
+                batch_request.text = prompts
+
+        elif (
+            isinstance(query, list)
+            and isinstance(items, list)
+            and items
+            and isinstance(items[0], list)
+        ):
+            # Both query and items are token IDs
+            if use_multi_item_scoring:
+                # Multi-item scoring: concatenate with delimiter token ID
+                # Format: query<delimiter_token_id>item1<delimiter_token_id>item2<delimiter_token_id>item3<delimiter_token_id>
+                delimiter_token_id = self.server_args.multi_item_scoring_delimiter
+                combined_input_ids = self._build_multi_item_token_sequence(
+                    query, items, delimiter_token_id
+                )
+                batch_request.input_ids = [combined_input_ids]
+            else:
+                # Single-item scoring: process each item separately
+                if item_first:
+                    input_ids_list = [item + query for item in items]
+                else:
+                    input_ids_list = [query + item for item in items]
+                batch_request.input_ids = input_ids_list
+        else:
+            raise ValueError(
+                "Invalid combination of query/items types for score_request."
+            )
+
+        results = await self.generate_request(batch_request, request).__anext__()
+
+        if use_multi_item_scoring:
+            # Multi-item scoring: extract scores from input_token_ids_logprobs
+            return self._process_multi_item_scoring_results(
+                results, items, label_token_ids, apply_softmax, batch_request
+            )
+        else:
+            # Single-item scoring: process each result separately
+            return self._process_single_item_scoring_results(
+                results, label_token_ids, apply_softmax
+            )
+
+    def _convert_logprobs_to_scores(
+        self,
+        logprobs: Dict[int, float],
+        label_token_ids: List[int],
+        apply_softmax: bool,
+    ) -> List[float]:
+        """
+        Convert logprobs dictionary to ordered score list.
+
+        Args:
+            logprobs: Dictionary mapping token_id to logprob
+            label_token_ids: Token IDs in desired order
+            apply_softmax: Whether to apply softmax normalization
+
+        Returns:
+            List of scores in the same order as label_token_ids
+        """
+        import torch
+
+        score_list = [
+            logprobs.get(token_id, float("-inf")) for token_id in label_token_ids
+        ]
+
+        if apply_softmax:
+            score_list = torch.softmax(torch.tensor(score_list), dim=0).tolist()
+        else:
+            # Convert logprobs to probabilities if not using softmax
+            score_list = [
+                math.exp(x) if x != float("-inf") else 0.0 for x in score_list
+            ]
+
+        return score_list
+
+    def _extract_logprobs_for_tokens(
+        self, logprobs_data: List, label_token_ids: List[int]
+    ) -> Dict[int, float]:
+        """
+        Extract logprobs for specified token IDs from logprobs data.
+
+        Args:
+            logprobs_data: List of (logprob, token_id, text) tuples
+            label_token_ids: Token IDs to extract logprobs for
+
+        Returns:
+            Dictionary mapping token_id to logprob
+        """
+        logprobs = {}
+        if logprobs_data:
+            for logprob, token_id, _ in logprobs_data:
+                if token_id in label_token_ids:
+                    logprobs[token_id] = logprob
+        return logprobs
diff --git a/sglang/python/sglang/srt/managers/tp_worker.py b/sglang/python/sglang/srt/managers/tp_worker.py
new file mode 100644
index 0000000000000000000000000000000000000000..8408cc9055f2c977fe4057bd68925fd2865e3f48
--- /dev/null
+++ b/sglang/python/sglang/srt/managers/tp_worker.py
@@ -0,0 +1,557 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A tensor parallel worker."""
+
+from __future__ import annotations
+
+import logging
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, List, Optional
+
+import torch
+
+from sglang.srt.distributed import get_pp_group, get_world_group
+from sglang.srt.managers.io_struct import (
+    DestroyWeightsUpdateGroupReqInput,
+    GetWeightsByNameReqInput,
+    InitWeightsSendGroupForRemoteInstanceReqInput,
+    InitWeightsUpdateGroupReqInput,
+    LoadLoRAAdapterFromTensorsReqInput,
+    LoadLoRAAdapterReqInput,
+    SendWeightsToRemoteInstanceReqInput,
+    UnloadLoRAAdapterReqInput,
+    UpdateWeightFromDiskReqInput,
+    UpdateWeightsFromDistributedReqInput,
+    UpdateWeightsFromIPCReqInput,
+    UpdateWeightsFromTensorReqInput,
+)
+from sglang.srt.managers.schedule_batch import ModelWorkerBatch, ScheduleBatch
+from sglang.srt.managers.scheduler import GenerationBatchResult
+from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
+from sglang.srt.mem_cache.memory_pool import ReqToTokenPool
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import MultiprocessingSerializer, broadcast_pyobj, set_random_seed
+from sglang.srt.utils.hf_transformers_utils import (
+    get_processor,
+    get_tokenizer,
+    get_tokenizer_from_processor,
+)
+from sglang.srt.utils.patch_torch import monkey_patch_torch_reductions
+from sglang.srt.weight_sync.tensor_bucket import FlattenedTensorBucket
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.cache_controller import LayerDoneCounter
+    from sglang.srt.model_executor.model_runner import ModelRunner
+
+logger = logging.getLogger(__name__)
+
+
+class BaseTpWorker(ABC):
+    @abstractmethod
+    def forward_batch_generation(self, forward_batch: ForwardBatch):
+        pass
+
+    @property
+    @abstractmethod
+    def model_runner(self) -> "ModelRunner":
+        pass
+
+    @property
+    def sliding_window_size(self) -> Optional[int]:
+        return self.model_runner.sliding_window_size
+
+    @property
+    def is_hybrid_swa(self) -> bool:
+        return self.model_runner.is_hybrid_swa
+
+    def get_tokens_per_layer_info(self):
+        return (
+            self.model_runner.full_max_total_num_tokens,
+            self.model_runner.swa_max_total_num_tokens,
+        )
+
+    def get_pad_input_ids_func(self):
+        return getattr(self.model_runner.model, "pad_input_ids", None)
+
+    def get_memory_pool(self):
+        return (
+            self.model_runner.req_to_token_pool,
+            self.model_runner.token_to_kv_pool_allocator,
+        )
+
+    def update_weights_from_disk(self, recv_req: UpdateWeightFromDiskReqInput):
+        success, message = self.model_runner.update_weights_from_disk(
+            recv_req.model_path,
+            recv_req.load_format,
+            recapture_cuda_graph=recv_req.recapture_cuda_graph,
+        )
+        return success, message
+
+    def init_weights_update_group(self, recv_req: InitWeightsUpdateGroupReqInput):
+        success, message = self.model_runner.init_weights_update_group(
+            recv_req.master_address,
+            recv_req.master_port,
+            recv_req.rank_offset,
+            recv_req.world_size,
+            recv_req.group_name,
+            recv_req.backend,
+        )
+        return success, message
+
+    def destroy_weights_update_group(self, recv_req: DestroyWeightsUpdateGroupReqInput):
+        success, message = self.model_runner.destroy_weights_update_group(
+            recv_req.group_name,
+        )
+        return success, message
+
+    def init_weights_send_group_for_remote_instance(
+        self, recv_req: InitWeightsSendGroupForRemoteInstanceReqInput
+    ):
+        success, message = (
+            self.model_runner.init_weights_send_group_for_remote_instance(
+                recv_req.master_address,
+                recv_req.ports,
+                recv_req.group_rank,
+                recv_req.world_size,
+                recv_req.group_name,
+                recv_req.backend,
+            )
+        )
+        return success, message
+
+    def send_weights_to_remote_instance(
+        self, recv_req: SendWeightsToRemoteInstanceReqInput
+    ):
+        success, message = self.model_runner.send_weights_to_remote_instance(
+            recv_req.master_address,
+            recv_req.ports,
+            recv_req.group_name,
+        )
+        return success, message
+
+    def update_weights_from_distributed(
+        self, recv_req: UpdateWeightsFromDistributedReqInput
+    ):
+        success, message = self.model_runner.update_weights_from_distributed(
+            recv_req.names,
+            recv_req.dtypes,
+            recv_req.shapes,
+            recv_req.group_name,
+            recv_req.load_format,
+        )
+        return success, message
+
+    def update_weights_from_tensor(self, recv_req: UpdateWeightsFromTensorReqInput):
+
+        monkey_patch_torch_reductions()
+        success, message = self.model_runner.update_weights_from_tensor(
+            named_tensors=MultiprocessingSerializer.deserialize(
+                recv_req.serialized_named_tensors[self.tp_rank]
+            ),
+            load_format=recv_req.load_format,
+        )
+        return success, message
+
+    def update_weights_from_ipc(self, recv_req: UpdateWeightsFromIPCReqInput):
+        """Update weights from IPC for checkpoint-engine integration."""
+        success, message = self.model_runner.update_weights_from_ipc(recv_req)
+        return success, message
+
+    def get_weights_by_name(self, recv_req: GetWeightsByNameReqInput):
+        parameter = self.model_runner.get_weights_by_name(
+            recv_req.name, recv_req.truncate_size
+        )
+        return parameter
+
+    def load_lora_adapter(self, recv_req: LoadLoRAAdapterReqInput):
+        result = self.model_runner.load_lora_adapter(recv_req.to_ref())
+        return result
+
+    def unload_lora_adapter(self, recv_req: UnloadLoRAAdapterReqInput):
+        result = self.model_runner.unload_lora_adapter(recv_req.to_ref())
+        return result
+
+    def load_lora_adapter_from_tensors(
+        self, recv_req: LoadLoRAAdapterFromTensorsReqInput
+    ):
+        # The LoRA code handles TP sharding internally using slice_lora_a_weights
+        # and slice_lora_b_weights methods (see lora/layers.py:46-49, mem_pool.py:437-440).
+        if recv_req.load_format == "flattened_bucket":
+            flattened_data = MultiprocessingSerializer.deserialize(
+                recv_req.serialized_tensors
+            )
+            bucket = FlattenedTensorBucket(
+                flattened_tensor=flattened_data["flattened_tensor"],
+                metadata=flattened_data["metadata"],
+            )
+            tensors = dict(bucket.reconstruct_tensors())
+        else:
+            tensors = MultiprocessingSerializer.deserialize(recv_req.serialized_tensors)
+        result = self.model_runner.load_lora_adapter_from_tensors(
+            recv_req.to_ref(),
+            tensors,
+            recv_req.config_dict,
+            recv_req.added_tokens_config,
+        )
+        return result
+
+    def forward_batch_embedding(self, model_worker_batch: ModelWorkerBatch):
+        forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
+        logits_output = self.model_runner.forward(forward_batch).logits_output
+        embeddings = logits_output.embeddings
+        return embeddings
+
+
+class TpModelWorker(BaseTpWorker):
+    """A tensor parallel model worker."""
+
+    def __init__(
+        self,
+        server_args: ServerArgs,
+        gpu_id: int,
+        tp_rank: int,
+        moe_ep_rank: int,
+        pp_rank: int,
+        attn_cp_rank: int,
+        moe_dp_rank: int,
+        dp_rank: Optional[int],
+        nccl_port: int,
+        is_draft_worker: bool = False,
+        req_to_token_pool: Optional[ReqToTokenPool] = None,
+        token_to_kv_pool_allocator: Optional[BaseTokenToKVPoolAllocator] = None,
+        is_multi_layer_eagle: bool = False,
+    ):
+        # Parse args
+        self.server_args = server_args
+        self.tp_size = server_args.tp_size
+        self.ep_size = server_args.ep_size
+        self.pp_size = server_args.pp_size
+        self.tp_rank = tp_rank
+        self.moe_ep_rank = moe_ep_rank
+        self.pp_rank = pp_rank
+        self.dp_rank = dp_rank
+        self.gpu_id = gpu_id
+        self.nccl_port = nccl_port
+        self.is_draft_worker = is_draft_worker
+        self.is_multi_layer_eagle = is_multi_layer_eagle
+        self.req_to_token_pool = req_to_token_pool
+        self.token_to_kv_pool_allocator = token_to_kv_pool_allocator
+        self.attn_cp_rank = attn_cp_rank
+        self.moe_dp_rank = moe_dp_rank
+
+        # MTP model runners
+        self.model_runner_list: List[ModelRunner] = []
+
+        self._init_model_config()
+        self._init_model_runner()
+
+        if is_multi_layer_eagle:
+            self._init_multi_layer_eagle_model_runners()
+
+        self._init_dllm_algorithm()
+
+        if server_args.skip_tokenizer_init:
+            self.tokenizer = self.processor = None
+        else:
+            if self.model_config.is_multimodal:
+                self.processor = get_processor(
+                    server_args.tokenizer_path,
+                    tokenizer_mode=server_args.tokenizer_mode,
+                    trust_remote_code=server_args.trust_remote_code,
+                    revision=server_args.revision,
+                )
+                self.tokenizer = get_tokenizer_from_processor(self.processor)
+            else:
+                self.tokenizer = get_tokenizer(
+                    server_args.tokenizer_path,
+                    tokenizer_mode=server_args.tokenizer_mode,
+                    trust_remote_code=server_args.trust_remote_code,
+                    revision=server_args.revision,
+                )
+        self.device = self.model_runner.device
+
+        # Init nccl groups
+        self.pp_group = get_pp_group()
+        self.world_group = get_world_group()
+
+        # Profile number of tokens
+        self.max_total_num_tokens = self.model_runner.max_total_num_tokens
+        self.max_prefill_tokens = server_args.max_prefill_tokens
+        self.max_running_requests = self.model_runner.max_running_requests
+        assert self.max_running_requests > 0, "max_running_request is zero"
+        self.max_queued_requests = server_args.max_queued_requests
+        assert (
+            self.max_queued_requests is None or self.max_queued_requests >= 1
+        ), "If configured, max_queued_requests must be at least 1 for any work to be scheduled."
+        self.max_req_len = min(
+            self.model_config.context_len - 1,
+            self.model_runner.max_token_pool_size - 1,
+        )
+        self.max_req_input_len = self.max_req_len - 5
+        assert (
+            self.max_req_len > 0 and self.max_req_input_len > 0
+        ), "Memory pool size is too small"
+
+        # Sync random seed across TP workers
+        self.random_seed = broadcast_pyobj(
+            [server_args.random_seed],
+            self.tp_size * self.pp_rank + tp_rank,
+            self.world_group.cpu_group,
+            src=self.world_group.ranks[0],
+        )[0]
+        set_random_seed(self.random_seed)
+
+        self.enable_overlap = not server_args.disable_overlap_schedule
+        self.enable_spec = server_args.speculative_algorithm is not None
+        self.hicache_layer_transfer_counter = None
+
+    def _init_model_config(self):
+        from sglang.srt.configs.model_config import ModelConfig
+
+        self.model_config = ModelConfig.from_server_args(
+            self.server_args,
+            model_path=(
+                self.server_args.model_path
+                if not self.is_draft_worker
+                else self.server_args.speculative_draft_model_path
+            ),
+            model_revision=(
+                self.server_args.revision
+                if not self.is_draft_worker
+                else self.server_args.speculative_draft_model_revision
+            ),
+            is_draft_model=self.is_draft_worker,
+        )
+
+    def _init_model_runner(self):
+        from sglang.srt.model_executor.model_runner import ModelRunner
+
+        self._model_runner = ModelRunner(
+            model_config=self.model_config,
+            mem_fraction_static=self.server_args.mem_fraction_static,
+            gpu_id=self.gpu_id,
+            tp_rank=self.tp_rank,
+            tp_size=self.tp_size,
+            moe_ep_rank=self.moe_ep_rank,
+            moe_ep_size=self.ep_size,
+            pp_rank=self.pp_rank,
+            pp_size=self.pp_size,
+            nccl_port=self.nccl_port,
+            dp_rank=self.dp_rank,
+            server_args=self.server_args,
+            is_draft_worker=self.is_draft_worker,
+            req_to_token_pool=self.req_to_token_pool,
+            token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
+            draft_model_idx=0 if self.is_multi_layer_eagle else None,
+        )
+
+    def _init_multi_layer_eagle_model_runners(self):
+        from sglang.srt.model_executor.model_runner import ModelRunner
+
+        self.model_runner_list.append(self.model_runner)
+        for i in range(1, self.server_args.speculative_num_steps):
+            self.model_runner_list.append(
+                ModelRunner(
+                    model_config=self.model_config,
+                    mem_fraction_static=self.server_args.mem_fraction_static,
+                    gpu_id=self.gpu_id,
+                    tp_rank=self.tp_rank,
+                    tp_size=self.tp_size,
+                    moe_ep_rank=self.moe_ep_rank,
+                    moe_ep_size=self.ep_size,
+                    pp_rank=self.pp_rank,
+                    pp_size=self.pp_size,
+                    nccl_port=self.nccl_port,
+                    dp_rank=self.dp_rank,
+                    server_args=self.server_args,
+                    is_draft_worker=self.is_draft_worker,
+                    req_to_token_pool=self.req_to_token_pool,
+                    token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
+                    draft_model_idx=i,
+                )
+            )
+
+    def _init_dllm_algorithm(self):
+        from sglang.srt.dllm.algorithm.base import DllmAlgorithm
+
+        if self.server_args.dllm_algorithm is not None:
+            self.dllm_algorithm = DllmAlgorithm.from_server_args(self.server_args)
+        else:
+            self.dllm_algorithm = None
+
+    @property
+    def model_runner(self) -> "ModelRunner":
+        return self._model_runner
+
+    def register_hicache_layer_transfer_counter(self, counter: LayerDoneCounter):
+        self.hicache_layer_transfer_counter = counter
+
+    def set_hicache_consumer(self, consumer_index: int):
+        if self.hicache_layer_transfer_counter is not None:
+            self.hicache_layer_transfer_counter.set_consumer(consumer_index)
+
+    def get_worker_info(self):
+        return (
+            self.max_total_num_tokens,
+            self.max_prefill_tokens,
+            self.max_running_requests,
+            self.max_queued_requests,
+            self.max_req_len,
+            self.max_req_input_len,
+            self.random_seed,
+            self.device,
+            self.model_runner.forward_stream,
+            self.model_runner.req_to_token_pool.size,
+            self.model_runner.req_to_token_pool.max_context_len,
+            self.model_runner.token_to_kv_pool.size,
+        )
+
+    def is_dllm(self):
+        return self.dllm_algorithm is not None
+
+    def _forward_batch_generation_dllm(
+        self, forward_batch: ForwardBatch
+    ) -> GenerationBatchResult:
+        logits_output, next_token_ids, can_run_cuda_graph = self.dllm_algorithm.run(
+            self.model_runner, forward_batch
+        )
+        return GenerationBatchResult(
+            logits_output=logits_output,
+            next_token_ids=next_token_ids,
+            can_run_cuda_graph=can_run_cuda_graph,
+        )
+
+    def get_remote_instance_transfer_engine_info(self):
+        return (
+            self.model_runner.remote_instance_transfer_engine_session_id,
+            self.model_runner.remote_instance_transfer_engine_weight_info,
+        )
+
+    def forward_batch_generation(
+        self,
+        model_worker_batch: ModelWorkerBatch,
+        forward_batch: Optional[ForwardBatch] = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+        is_verify: bool = False,
+        skip_attn_backend_init=False,
+    ) -> GenerationBatchResult:
+        # FIXME(lsyin): maybe remove skip_attn_backend_init in forward_batch_generation,
+        #               which requires preparing replay to always be in this function
+
+        # Get forward batch from model worker batch
+        if model_worker_batch is not None:
+            # update the consumer index of hicache to the running batch
+            self.set_hicache_consumer(model_worker_batch.hicache_consumer_index)
+
+            forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
+        else:
+            # FIXME(lsyin): unify the interface of forward_batch
+            assert forward_batch is not None
+
+        if self.is_dllm():
+            return self._forward_batch_generation_dllm(forward_batch)
+
+        if self.pp_group.is_last_rank:
+            out = self.model_runner.forward(
+                forward_batch,
+                pp_proxy_tensors=pp_proxy_tensors,
+                skip_attn_backend_init=skip_attn_backend_init,
+            )
+            logits_output, can_run_cuda_graph = out.logits_output, out.can_run_graph
+            batch_result = GenerationBatchResult(
+                logits_output=logits_output,
+                can_run_cuda_graph=can_run_cuda_graph,
+                expert_distribution_metrics=out.expert_distribution_metrics,
+            )
+
+            if is_verify:
+                # Skip sampling and return logits for target forward
+                return batch_result
+
+            if (
+                self.enable_overlap
+                and not self.enable_spec
+                and model_worker_batch.sampling_info.grammars is not None
+            ):
+
+                def sample_batch_func():
+                    batch_result.next_token_ids = self.model_runner.sample(
+                        logits_output, forward_batch
+                    )
+                    return batch_result
+
+                batch_result.delay_sample_func = sample_batch_func
+                return batch_result
+
+            if not model_worker_batch.is_prefill_only:
+                # For normal requests, sample the next token ids.
+                batch_result.next_token_ids = self.model_runner.sample(
+                    logits_output, forward_batch
+                )
+            else:
+                # For prefill-only requests, create dummy token IDs on CPU
+                # The size should match the batch size (number of sequences), not total tokens
+                batch_result.next_token_ids = torch.zeros(
+                    len(model_worker_batch.seq_lens),
+                    dtype=torch.long,
+                    device=model_worker_batch.input_ids.device,
+                )
+                if (
+                    model_worker_batch.return_logprob
+                    and logits_output.next_token_logits is not None
+                ):
+                    # NOTE: Compute logprobs without full sampling
+                    self.model_runner.compute_logprobs_only(
+                        logits_output, model_worker_batch
+                    )
+
+            return batch_result
+        else:
+            out = self.model_runner.forward(
+                forward_batch,
+                pp_proxy_tensors=pp_proxy_tensors,
+                skip_attn_backend_init=skip_attn_backend_init,
+            )
+            pp_proxy_tensors, can_run_cuda_graph = out.logits_output, out.can_run_graph
+            return GenerationBatchResult(
+                pp_hidden_states_proxy_tensors=pp_proxy_tensors,
+                can_run_cuda_graph=can_run_cuda_graph,
+                expert_distribution_metrics=out.expert_distribution_metrics,
+            )
+
+    def forward_batch_split_prefill(self, batch: ScheduleBatch):
+        if batch.split_index == 0:
+            model_worker_batch = batch.get_model_worker_batch()
+            forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)
+            batch.split_forward_batch = forward_batch
+            batch.seq_lens_cpu_cache = model_worker_batch.seq_lens_cpu
+        else:
+            model_worker_batch = batch.get_model_worker_batch(batch.seq_lens_cpu_cache)
+
+        out = self.model_runner.forward(
+            batch.split_forward_batch, split_forward_count=batch.split_forward_count
+        )
+        logits_output, can_run_cuda_graph = out.logits_output, out.can_run_graph
+        if logits_output:
+            next_token_ids = self.model_runner.sample(logits_output, model_worker_batch)
+        else:
+            next_token_ids = None
+        batch_result = GenerationBatchResult(
+            logits_output=logits_output,
+            can_run_cuda_graph=can_run_cuda_graph,
+            expert_distribution_metrics=out.expert_distribution_metrics,
+        )
+        batch_result.next_token_ids = next_token_ids
+        return batch_result
diff --git a/sglang/python/sglang/srt/managers/utils.py b/sglang/python/sglang/srt/managers/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..9eea302b767988be7d2b7952e9b716fa1ba0f0ea
--- /dev/null
+++ b/sglang/python/sglang/srt/managers/utils.py
@@ -0,0 +1,206 @@
+from __future__ import annotations
+
+import dataclasses
+import logging
+from typing import TYPE_CHECKING, List, Optional, Union
+
+import torch
+
+from sglang.srt.eplb.expert_distribution import ExpertDistributionMetrics
+from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.managers.overlap_utils import FutureIndices
+from sglang.srt.managers.schedule_batch import Req
+from sglang.srt.model_executor.forward_batch_info import PPProxyTensors
+from sglang.srt.server_args import ServerArgs
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.scheduler import GenerationBatchResult
+    from sglang.srt.speculative.eagle_info import EagleDraftInput
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclasses.dataclass
+class GenerationBatchResult:
+    logits_output: Optional[LogitsProcessorOutput] = None
+    pp_hidden_states_proxy_tensors: Optional[PPProxyTensors] = None
+    next_token_ids: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None
+    num_accepted_tokens: int = 0
+    accept_length_per_req_cpu: Optional[List[int]] = None
+    can_run_cuda_graph: bool = False
+
+    # For output processing
+    extend_input_len_per_req: Optional[List[int]] = None
+    extend_logprob_start_len_per_req: Optional[List[int]] = None
+
+    # For overlap scheduling
+    copy_done: Optional[torch.cuda.Event] = None
+    delay_sample_func: Optional[callable] = None
+    future_indices: Optional[FutureIndices] = None
+
+    # FIXME(lsyin): maybe move to a better place?
+    # sync path: forward stream -> output processor
+    accept_lens: Optional[torch.Tensor] = None
+
+    # relay path: forward stream -> next step forward
+    next_draft_input: Optional[EagleDraftInput] = None
+
+    # metrics
+    expert_distribution_metrics: Optional[ExpertDistributionMetrics] = None
+
+    def copy_to_cpu(self, return_logprob: bool):
+        """Copy tensors to CPU in overlap scheduling.
+        Only the tensors which are needed for processing results are copied,
+        e.g., next_token_ids, logits outputs
+        """
+        if return_logprob:
+            if self.logits_output.next_token_logprobs is not None:
+                self.logits_output.next_token_logprobs = (
+                    self.logits_output.next_token_logprobs.to("cpu", non_blocking=True)
+                )
+            if self.logits_output.input_token_logprobs is not None:
+                self.logits_output.input_token_logprobs = (
+                    self.logits_output.input_token_logprobs.to("cpu", non_blocking=True)
+                )
+        if self.logits_output.hidden_states is not None:
+            self.logits_output.hidden_states = self.logits_output.hidden_states.to(
+                "cpu", non_blocking=True
+            )
+        self.next_token_ids = self.next_token_ids.to("cpu", non_blocking=True)
+
+        if self.accept_lens is not None:
+            self.accept_lens = self.accept_lens.to("cpu", non_blocking=True)
+
+        if (x := self.expert_distribution_metrics) is not None:
+            x.copy_to_cpu()
+
+        self.copy_done.record()
+
+    @classmethod
+    def from_pp_proxy(
+        cls, logits_output, next_pp_outputs: PPProxyTensors, can_run_cuda_graph
+    ):
+        # TODO(lsyin): refactor PP and avoid using dict
+        proxy_dict = next_pp_outputs.tensors
+        return cls(
+            logits_output=logits_output,
+            pp_hidden_states_proxy_tensors=None,
+            next_token_ids=next_pp_outputs["next_token_ids"],
+            extend_input_len_per_req=proxy_dict.get("extend_input_len_per_req", None),
+            extend_logprob_start_len_per_req=proxy_dict.get(
+                "extend_logprob_start_len_per_req", None
+            ),
+            can_run_cuda_graph=can_run_cuda_graph,
+        )
+
+
+def validate_input_length(
+    req: Req, max_req_input_len: int, allow_auto_truncate: bool
+) -> Optional[str]:
+    """Validate and potentially truncate input length.
+
+    Args:
+        req: The request containing input_ids to validate
+        max_req_input_len: Maximum allowed input length
+        allow_auto_truncate: Whether to truncate long inputs
+
+    Returns:
+        Error message if validation fails, None if successful
+    """
+    if len(req.origin_input_ids) >= max_req_input_len:
+        if allow_auto_truncate:
+            logger.warning(
+                "Request length is longer than the KV cache pool size or "
+                "the max context length. Truncated. "
+                f"{len(req.origin_input_ids)=}, {max_req_input_len=}."
+            )
+            req.origin_input_ids = req.origin_input_ids[:max_req_input_len]
+            return None
+        else:
+            error_msg = (
+                f"Input length ({len(req.origin_input_ids)} tokens) exceeds "
+                f"the maximum allowed length ({max_req_input_len} tokens). "
+                f"Use a shorter input or enable --allow-auto-truncate."
+            )
+            return error_msg
+
+    return None
+
+
+def get_logprob_dict_from_result(result: GenerationBatchResult) -> dict:
+
+    logits_output = result.logits_output
+    assert logits_output is not None
+
+    return {
+        "extend_input_len_per_req": result.extend_input_len_per_req,
+        "extend_logprob_start_len_per_req": result.extend_logprob_start_len_per_req,
+        "next_token_logprobs": result.logits_output.next_token_logprobs,
+        "next_token_top_logprobs_val": result.logits_output.next_token_top_logprobs_val,
+        "next_token_top_logprobs_idx": result.logits_output.next_token_top_logprobs_idx,
+        "next_token_token_ids_logprobs_val": result.logits_output.next_token_token_ids_logprobs_val,
+        "next_token_token_ids_logprobs_idx": result.logits_output.next_token_token_ids_logprobs_idx,
+        "input_token_logprobs": result.logits_output.input_token_logprobs,
+        "input_top_logprobs_val": result.logits_output.input_top_logprobs_val,
+        "input_top_logprobs_idx": result.logits_output.input_top_logprobs_idx,
+        "input_token_ids_logprobs_val": result.logits_output.input_token_ids_logprobs_val,
+        "input_token_ids_logprobs_idx": result.logits_output.input_token_ids_logprobs_idx,
+    }
+
+
+def get_logprob_from_pp_outputs(
+    next_pp_outputs: PPProxyTensors,
+) -> tuple[LogitsProcessorOutput, list[int], list[int]]:
+    logits_output = LogitsProcessorOutput(
+        # Do not send logits and hidden states because they are large
+        next_token_logits=None,
+        hidden_states=None,
+        next_token_logprobs=next_pp_outputs["next_token_logprobs"],
+        next_token_top_logprobs_val=next_pp_outputs["next_token_top_logprobs_val"],
+        next_token_top_logprobs_idx=next_pp_outputs["next_token_top_logprobs_idx"],
+        next_token_token_ids_logprobs_val=next_pp_outputs[
+            "next_token_token_ids_logprobs_val"
+        ],
+        next_token_token_ids_logprobs_idx=next_pp_outputs[
+            "next_token_token_ids_logprobs_idx"
+        ],
+        input_token_logprobs=next_pp_outputs["input_token_logprobs"],
+        input_top_logprobs_val=next_pp_outputs["input_top_logprobs_val"],
+        input_top_logprobs_idx=next_pp_outputs["input_top_logprobs_idx"],
+        input_token_ids_logprobs_val=next_pp_outputs["input_token_ids_logprobs_val"],
+        input_token_ids_logprobs_idx=next_pp_outputs["input_token_ids_logprobs_idx"],
+    )
+    extend_input_len_per_req = next_pp_outputs["extend_input_len_per_req"]
+    extend_logprob_start_len_per_req = next_pp_outputs[
+        "extend_logprob_start_len_per_req"
+    ]
+
+    return logits_output, extend_input_len_per_req, extend_logprob_start_len_per_req
+
+
+def get_alloc_len_per_decode(server_args: Optional[ServerArgs] = None) -> int:
+    if server_args is None:
+        from sglang.srt.server_args import get_global_server_args
+
+        server_args = get_global_server_args()
+
+    if server_args.speculative_algorithm is None:
+        return 1
+
+    # Spec v1:
+    # 1) alloc topk * num_steps when draft decoding and then restore the allocation
+    # 2) alloc num_draft_tokens when verifying the drafts
+    # Sepc v2: allocate max(topk * num_steps, num_draft_tokens)
+
+    spec_steps = server_args.speculative_num_steps or 1
+    spec_topk = server_args.speculative_eagle_topk or 1
+    spec_tokens = server_args.speculative_num_draft_tokens
+    page_size = server_args.page_size
+
+    if page_size == 1 or spec_topk == 1:
+        return max(spec_steps * spec_topk, spec_tokens)
+    else:
+        raise NotImplementedError(
+            "get_alloc_len_per_decode not implemented for page_size > 1 and spec_topk > 1"
+        )
diff --git a/sglang/python/sglang/srt/mem_cache/__pycache__/allocator.cpython-311.pyc b/sglang/python/sglang/srt/mem_cache/__pycache__/allocator.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c7d35210f9079b1594639120742fe86f15f34275
Binary files /dev/null and b/sglang/python/sglang/srt/mem_cache/__pycache__/allocator.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/mem_cache/__pycache__/base_prefix_cache.cpython-311.pyc b/sglang/python/sglang/srt/mem_cache/__pycache__/base_prefix_cache.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..badd4dfbae9da098bf75209b70927ac5061bd296
Binary files /dev/null and b/sglang/python/sglang/srt/mem_cache/__pycache__/base_prefix_cache.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/mem_cache/__pycache__/cache_init_params.cpython-311.pyc b/sglang/python/sglang/srt/mem_cache/__pycache__/cache_init_params.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5277f60ebae2c7ac5156ad1e8a8670989a327a63
Binary files /dev/null and b/sglang/python/sglang/srt/mem_cache/__pycache__/cache_init_params.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/mem_cache/__pycache__/common.cpython-311.pyc b/sglang/python/sglang/srt/mem_cache/__pycache__/common.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f34587db345c288b87c230b2d140f90928f82e2f
Binary files /dev/null and b/sglang/python/sglang/srt/mem_cache/__pycache__/common.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/mem_cache/__pycache__/evict_policy.cpython-311.pyc b/sglang/python/sglang/srt/mem_cache/__pycache__/evict_policy.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8b3c55d31863d1e3a2c79474c4a1739b7ae0ae22
Binary files /dev/null and b/sglang/python/sglang/srt/mem_cache/__pycache__/evict_policy.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/mem_cache/__pycache__/hicache_storage.cpython-311.pyc b/sglang/python/sglang/srt/mem_cache/__pycache__/hicache_storage.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4b4fd30b45ade3d6adbf2e690a0b511f511877d0
Binary files /dev/null and b/sglang/python/sglang/srt/mem_cache/__pycache__/hicache_storage.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/mem_cache/__pycache__/memory_pool_host.cpython-311.pyc b/sglang/python/sglang/srt/mem_cache/__pycache__/memory_pool_host.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4da8cfc4aaee65089b8ec1c84cc8cf5090bb4ded
Binary files /dev/null and b/sglang/python/sglang/srt/mem_cache/__pycache__/memory_pool_host.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/mem_cache/__pycache__/multimodal_cache.cpython-311.pyc b/sglang/python/sglang/srt/mem_cache/__pycache__/multimodal_cache.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..41f6dfb3699d60c4012afac75f23ab8b7c75457b
Binary files /dev/null and b/sglang/python/sglang/srt/mem_cache/__pycache__/multimodal_cache.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/mem_cache/__pycache__/radix_cache.cpython-311.pyc b/sglang/python/sglang/srt/mem_cache/__pycache__/radix_cache.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aa89d2696c249b2044dcfb8db8d95fa060f024a2
Binary files /dev/null and b/sglang/python/sglang/srt/mem_cache/__pycache__/radix_cache.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/mem_cache/__pycache__/session_aware_cache.cpython-311.pyc b/sglang/python/sglang/srt/mem_cache/__pycache__/session_aware_cache.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..70bae1b2d675931b147ea3d6530778dd6a0db4d2
Binary files /dev/null and b/sglang/python/sglang/srt/mem_cache/__pycache__/session_aware_cache.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/mem_cache/__pycache__/swa_memory_pool.cpython-311.pyc b/sglang/python/sglang/srt/mem_cache/__pycache__/swa_memory_pool.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6b8f15d35b0322133413d8e6b813b70c850ae8fb
Binary files /dev/null and b/sglang/python/sglang/srt/mem_cache/__pycache__/swa_memory_pool.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/mem_cache/__pycache__/utils.cpython-311.pyc b/sglang/python/sglang/srt/mem_cache/__pycache__/utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..70ea69507dcf7fcfe9d7619370a38faabe9e53c3
Binary files /dev/null and b/sglang/python/sglang/srt/mem_cache/__pycache__/utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/mem_cache/allocator.py b/sglang/python/sglang/srt/mem_cache/allocator.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0ce15e20a84ddb975ece73da188b9346f833543
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/allocator.py
@@ -0,0 +1,519 @@
+from __future__ import annotations
+
+"""
+Copyright 2025 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""
+Page-aligned memory pool.
+"""
+
+import abc
+from typing import TYPE_CHECKING
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.utils import get_bool_env_var, get_num_new_pages, next_power_of_2
+
+if TYPE_CHECKING:
+    from sglang.srt.mem_cache.memory_pool import KVCache
+
+
+class BaseTokenToKVPoolAllocator(abc.ABC):
+    @abc.abstractmethod
+    def __init__(
+        self,
+        size: int,
+        page_size: int,
+        dtype: torch.dtype,
+        device: str,
+        kvcache: KVCache,
+        need_sort: bool,
+    ):
+        self.size = size
+        self.page_size = page_size
+        self.dtype = dtype
+        self.device = device
+        self._kvcache = kvcache
+        self.need_sort = need_sort
+
+        self.free_pages = None
+        self.release_pages = None
+        self.is_not_in_free_group = True
+        self.free_group = []
+
+    def debug_print(self) -> str:
+        return ""
+
+    def available_size(self):
+        return (len(self.free_pages) + len(self.release_pages)) * self.page_size
+
+    def get_kvcache(self):
+        return self._kvcache
+
+    def restore_state(self, state):
+        self.free_pages, self.release_pages = state
+
+    def backup_state(self):
+        return (self.free_pages, self.release_pages)
+
+    def free_group_begin(self):
+        self.is_not_in_free_group = False
+        self.free_group = []
+
+    def free_group_end(self):
+        self.is_not_in_free_group = True
+        if self.free_group:
+            self.free(torch.cat(self.free_group))
+
+    def merge_and_sort_free(self):
+        if len(self.release_pages) > 0:
+            self.free_pages = torch.cat((self.free_pages, self.release_pages))
+            self.free_pages, _ = torch.sort(self.free_pages)
+            self.release_pages = torch.empty(
+                (0,), dtype=self.release_pages.dtype, device=self.device
+            )
+
+    def get_cpu_copy(self, *args, **kwargs):
+        # FIXME: reuse the get_cpu_copy after paged allocator is implemented
+        raise NotImplementedError()
+
+    def load_cpu_copy(self, *args, **kwargs):
+        # FIXME: reuse the load_cpu_copy after paged allocator is implemented
+        raise NotImplementedError()
+
+    def alloc_extend(self, *args, **kwargs):
+        raise NotImplementedError("alloc_extend is only for paged allocator")
+
+    def alloc_decode(self, *args, **kwargs):
+        raise NotImplementedError("alloc_decode is only for paged allocator")
+
+    @abc.abstractmethod
+    def clear(self):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def alloc(self, need_size: int):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def free(self, free_index: torch.Tensor):
+        raise NotImplementedError()
+
+
+class TokenToKVPoolAllocator(BaseTokenToKVPoolAllocator):
+    """An allocator managing the indices to kv cache data."""
+
+    def __init__(
+        self,
+        size: int,
+        dtype: torch.dtype,
+        device: str,
+        kvcache: KVCache,
+        need_sort: bool,
+    ):
+        super().__init__(size, 1, dtype, device, kvcache, need_sort)
+        self.clear()
+
+    def clear(self):
+        # The padded slot 0 is used for writing dummy outputs from padded tokens.
+        self.free_pages = torch.arange(
+            1, self.size + 1, dtype=torch.int64, device=self.device
+        )
+        self.is_not_in_free_group = True
+        self.free_group = []
+        self.release_pages = torch.empty((0,), dtype=torch.int64, device=self.device)
+
+    def available_size(self):
+        # To avoid minor "len(free_pages) * 1" overhead
+        return len(self.free_pages) + len(self.release_pages)
+
+    def alloc(self, need_size: int):
+        if self.need_sort and need_size > len(self.free_pages):
+            self.merge_and_sort_free()
+
+        if need_size > len(self.free_pages):
+            return None
+
+        select_index = self.free_pages[:need_size]
+        self.free_pages = self.free_pages[need_size:]
+        return select_index
+
+    def free(self, free_index: torch.Tensor):
+        if free_index.numel() == 0:
+            return
+
+        if self.is_not_in_free_group:
+            if self.need_sort:
+                self.release_pages = torch.cat((self.release_pages, free_index))
+            else:
+                self.free_pages = torch.cat((self.free_pages, free_index))
+        else:
+            self.free_group.append(free_index)
+
+    def get_cpu_copy(self, indices):
+        return self._kvcache.get_cpu_copy(indices)
+
+    def load_cpu_copy(self, kv_cache_cpu, indices):
+        return self._kvcache.load_cpu_copy(kv_cache_cpu, indices)
+
+
+def alloc_extend_naive(
+    prefix_lens,
+    seq_lens,
+    last_loc,
+    free_pages,
+    out_indices,
+    page_size,
+    device,
+):
+    extend_lens = seq_lens - prefix_lens
+    end_pos = torch.cumsum(extend_lens, 0)
+    start_pos = end_pos - extend_lens
+    num_new_pages = (seq_lens + page_size - 1) // page_size - (
+        prefix_lens + page_size - 1
+    ) // page_size
+    num_full_new_pages = (seq_lens) // page_size - (
+        prefix_lens + page_size - 1
+    ) // page_size
+    need_page = num_new_pages - num_full_new_pages
+    end_new_pages = torch.cumsum(num_new_pages, 0)
+    start_new_pages = end_new_pages - num_new_pages
+    pos_in_page = torch.arange(page_size, device=device, dtype=torch.int32)
+    for i in range(len(prefix_lens)):
+        num1 = (
+            min(
+                seq_lens[i],
+                (prefix_lens[i] + page_size - 1) // page_size * page_size,
+            )
+            - prefix_lens[i]
+        )
+        if num1:
+            out_indices[start_pos[i] : start_pos[i] + num1] = (
+                last_loc[i] + 1 + pos_in_page[:num1].view(-1)
+            )
+
+        if prefix_lens[i] + num1 == seq_lens[i]:
+            continue
+
+        num2 = (
+            seq_lens[i] // page_size - (prefix_lens[i] + page_size - 1) // page_size
+        ) * page_size
+        if num2:
+            pages = (
+                free_pages[start_new_pages[i] : end_new_pages[i] - need_page[i]]
+                * page_size
+            )
+            out_indices[start_pos[i] + num1 : start_pos[i] + num1 + num2] = (
+                pages.view(-1, 1) + pos_in_page.view(1, -1)
+            ).view(-1)
+
+        if prefix_lens[i] + num1 + num2 == seq_lens[i]:
+            continue
+
+        num3 = seq_lens[i] - seq_lens[i] // page_size * page_size
+        if num3:
+            out_indices[end_pos[i] - num3 : end_pos[i]] = (
+                free_pages[end_new_pages[i] - 1] * page_size + pos_in_page[:num3]
+            ).view(-1)
+
+
+@triton.jit
+def alloc_extend_kernel(
+    pre_lens_ptr,
+    seq_lens_ptr,
+    last_loc_ptr,
+    free_page_ptr,
+    out_indices,
+    bs_upper: tl.constexpr,
+    page_size: tl.constexpr,
+):
+    pid = tl.program_id(0)
+
+    load_offset = tl.arange(0, bs_upper)
+    seq_lens = tl.load(seq_lens_ptr + load_offset, mask=load_offset <= pid)
+    pre_lens = tl.load(pre_lens_ptr + load_offset, mask=load_offset <= pid)
+    extend_lens = seq_lens - pre_lens
+
+    seq_len = tl.load(seq_lens_ptr + pid)
+    pre_len = tl.load(pre_lens_ptr + pid)
+    extend_len = seq_len - pre_len
+
+    sum_extend_lens = tl.sum(extend_lens)
+    output_start_loc = sum_extend_lens - extend_len
+
+    num_pages_after = (seq_lens + page_size - 1) // page_size
+    num_pages_before = (pre_lens + page_size - 1) // page_size
+    num_new_pages = num_pages_after - num_pages_before
+
+    num_page_start_loc_self = (seq_len + page_size - 1) // page_size - (
+        pre_len + page_size - 1
+    ) // page_size
+    sum_num_new_pages = tl.sum(num_new_pages)
+    new_page_start_loc = sum_num_new_pages - num_page_start_loc_self
+
+    # Part 1: fill the old partial page
+    last_loc = tl.load(last_loc_ptr + pid)
+    num_part1 = (
+        min(seq_len, (pre_len + page_size - 1) // page_size * page_size) - pre_len
+    )
+    offset_one_page = tl.arange(0, page_size)
+    tl.store(
+        out_indices + output_start_loc + offset_one_page,
+        last_loc + 1 + offset_one_page,
+        mask=offset_one_page < num_part1,
+    )
+    if pre_len + num_part1 == seq_len:
+        return
+
+    # Part 2: fill the new full pages using a dynamic blocked loop.
+    # The loop bound is derived from num_part2 (runtime value), so Triton
+    # generates a real loop instead of unrolling — no constexpr dependency
+    # on extend size and only one kernel compilation.
+    num_part2 = (
+        seq_len // page_size * page_size
+        - (pre_len + page_size - 1) // page_size * page_size
+    )
+    BLOCK_EXTEND: tl.constexpr = 4096
+    num_blocks = (num_part2 + BLOCK_EXTEND - 1) // BLOCK_EXTEND
+    for block_id in range(num_blocks):
+        offset_in_block = tl.arange(0, BLOCK_EXTEND)
+        offset = block_id * BLOCK_EXTEND + offset_in_block
+        mask = offset < num_part2
+        page_start = tl.load(
+            free_page_ptr + new_page_start_loc + offset // page_size,
+            mask=mask,
+        )
+        tl.store(
+            out_indices + output_start_loc + num_part1 + offset,
+            page_start * page_size + offset % page_size,
+            mask=mask,
+        )
+    if pre_len + num_part1 + num_part2 == seq_len:
+        return
+
+    # Part 3: fill the new partial page
+    num_part3 = seq_len - seq_len // page_size * page_size
+    start_loc = tl.load(
+        free_page_ptr + new_page_start_loc + num_page_start_loc_self - 1
+    )
+    tl.store(
+        out_indices + output_start_loc + num_part1 + num_part2 + offset_one_page,
+        start_loc * page_size + offset_one_page,
+        mask=offset_one_page < num_part3,
+    )
+
+
+@triton.jit
+def alloc_decode_kernel(
+    seq_lens_ptr,
+    last_loc_ptr,
+    free_page_ptr,
+    out_indices,
+    bs_upper: tl.constexpr,
+    page_size: tl.constexpr,
+):
+    pid = tl.program_id(0)
+
+    load_offset = tl.arange(0, bs_upper)
+    seq_lens = tl.load(seq_lens_ptr + load_offset, mask=load_offset <= pid)
+    pre_lens = tl.where(load_offset <= pid, seq_lens - 1, seq_lens)
+
+    seq_len = tl.load(seq_lens_ptr + pid)
+    pre_len = seq_len - 1
+
+    num_pages_after = (seq_lens + page_size - 1) // page_size
+    num_pages_before = (pre_lens + page_size - 1) // page_size
+    num_new_pages = num_pages_after - num_pages_before
+
+    num_page_start_loc_self = (seq_len + page_size - 1) // page_size - (
+        pre_len + page_size - 1
+    ) // page_size
+    sum_num_new_pages = tl.sum(num_new_pages)
+    new_page_start_loc = sum_num_new_pages - num_page_start_loc_self
+
+    if num_page_start_loc_self == 0:
+        last_loc = tl.load(last_loc_ptr + pid)
+        tl.store(out_indices + pid, last_loc + 1)
+    else:
+        page = tl.load(free_page_ptr + new_page_start_loc)
+        tl.store(out_indices + pid, page * page_size)
+
+
+class PagedTokenToKVPoolAllocator(BaseTokenToKVPoolAllocator):
+    """
+    An allocator managing the indices to kv cache data.
+
+    This class has the same interface as `TokenToKVPoolAllocator` but the output
+    of one request is always page-aligned.
+
+    TODO: fuse last_loc into the kernel.
+    """
+
+    def __init__(
+        self,
+        size: int,
+        page_size: int,
+        dtype: torch.dtype,
+        device: str,
+        kvcache: KVCache,
+        need_sort: bool,
+    ):
+        super().__init__(size, page_size, dtype, device, kvcache, need_sort)
+        self.num_pages = size // page_size
+        self.debug_mode = get_bool_env_var("SGLANG_DEBUG_MEMORY_POOL")
+        self.clear()
+
+    def alloc(self, need_size: int):
+        # page-aligned allocation, returning contiguous indices of pages
+        if self.debug_mode:
+            assert (
+                need_size % self.page_size == 0
+            ), "The allocation size should be page-aligned"
+
+        num_pages = need_size // self.page_size
+        if self.need_sort and num_pages > len(self.free_pages):
+            self.merge_and_sort_free()
+        if num_pages > len(self.free_pages):
+            return None
+
+        out_pages = self.free_pages[:num_pages]
+        self.free_pages = self.free_pages[num_pages:]
+
+        out_indices = (
+            out_pages[:, None] * self.page_size
+            + torch.arange(self.page_size, device=self.device)
+        ).reshape(-1)
+
+        return out_indices
+
+    def alloc_extend(
+        self,
+        prefix_lens: torch.Tensor,
+        prefix_lens_cpu: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_cpu: torch.Tensor,
+        last_loc: torch.Tensor,
+        extend_num_tokens: int,
+    ):
+        if self.debug_mode:
+            assert torch.all(
+                (last_loc + 1) % self.page_size == prefix_lens % self.page_size
+            )
+
+        bs = len(prefix_lens)
+        if self.need_sort and extend_num_tokens // self.page_size + bs + 1 > len(
+            self.free_pages
+        ):
+            self.merge_and_sort_free()
+
+        out_indices = torch.empty(
+            (extend_num_tokens,), dtype=torch.int64, device=self.device
+        )
+
+        alloc_extend_kernel[(bs,)](
+            prefix_lens,
+            seq_lens,
+            last_loc,
+            self.free_pages,
+            out_indices,
+            next_power_of_2(bs),
+            self.page_size,
+        )
+
+        if self.debug_mode:
+            assert len(torch.unique(out_indices)) == len(out_indices)
+
+        num_new_pages = get_num_new_pages(
+            seq_lens=seq_lens_cpu,
+            page_size=self.page_size,
+            prefix_lens=prefix_lens_cpu,
+        )
+        if num_new_pages > len(self.free_pages):
+            return None
+
+        self.free_pages = self.free_pages[num_new_pages:]
+        return out_indices
+
+    def alloc_decode(
+        self,
+        seq_lens: torch.Tensor,
+        seq_lens_cpu: torch.Tensor,
+        last_loc: torch.Tensor,
+    ):
+        if self.debug_mode:
+            assert torch.all(
+                (last_loc + 2) % self.page_size == seq_lens % self.page_size
+            )
+
+        bs = len(seq_lens)
+        if self.need_sort and bs > len(self.free_pages):
+            self.merge_and_sort_free()
+
+        out_indices = torch.empty((bs,), dtype=torch.int64, device=self.device)
+        alloc_decode_kernel[(bs,)](
+            seq_lens,
+            last_loc,
+            self.free_pages,
+            out_indices,
+            next_power_of_2(bs),
+            self.page_size,
+        )
+
+        if self.debug_mode:
+            assert len(torch.unique(out_indices)) == len(out_indices)
+
+        num_new_pages = get_num_new_pages(
+            seq_lens=seq_lens_cpu,
+            page_size=self.page_size,
+            decode=True,
+        )
+        if num_new_pages > len(self.free_pages):
+            return None
+
+        self.free_pages = self.free_pages[num_new_pages:]
+        return out_indices
+
+    def free(self, free_index: torch.Tensor):
+        if free_index.numel() == 0:
+            return
+
+        if self.is_not_in_free_group:
+            free_page_indices = torch.unique(free_index // self.page_size)
+            if self.need_sort:
+                self.release_pages = torch.cat((free_page_indices, self.release_pages))
+            else:
+                self.free_pages = torch.cat((free_page_indices, self.free_pages))
+        else:
+            self.free_group.append(free_index)
+
+        if self.debug_mode:
+            assert len(torch.unique(self.free_pages)) == len(self.free_pages)
+
+    def clear(self):
+        # The padded slot 0 is used for writing dummy outputs from padded tokens.
+        self.free_pages = torch.arange(
+            1, self.num_pages + 1, dtype=torch.int64, device=self.device
+        )
+        self.is_not_in_free_group = True
+        self.free_group = []
+        self.release_pages = torch.empty((0,), dtype=torch.int64, device=self.device)
+
+    def get_cpu_copy(self, indices):
+        return self._kvcache.get_cpu_copy(indices)
+
+    def load_cpu_copy(self, kv_cache_cpu, indices):
+        return self._kvcache.load_cpu_copy(kv_cache_cpu, indices)
diff --git a/sglang/python/sglang/srt/mem_cache/base_prefix_cache.py b/sglang/python/sglang/srt/mem_cache/base_prefix_cache.py
new file mode 100644
index 0000000000000000000000000000000000000000..a383c5aa2c63e6d4e55c09b61227d5e421f8d3a5
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/base_prefix_cache.py
@@ -0,0 +1,229 @@
+from __future__ import annotations
+
+import dataclasses
+import time
+from abc import ABC, abstractmethod
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    NamedTuple,
+    Optional,
+    Protocol,
+    Tuple,
+    runtime_checkable,
+)
+
+import torch
+
+from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
+from sglang.srt.mem_cache.memory_pool import ReqToTokenPool
+from sglang.srt.observability.metrics_collector import RadixCacheMetricsCollector
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.schedule_batch import Req
+    from sglang.srt.mem_cache.radix_cache import RadixKey
+
+
+@runtime_checkable
+class PrefixCacheTrait(Protocol):
+    req_to_token_pool: ReqToTokenPool
+    token_to_kv_pool_allocator: BaseTokenToKVPoolAllocator
+    page_size: int
+    disable: bool
+
+
+@dataclasses.dataclass
+class MatchPrefixParams:
+    """Unified parameters for match_prefix across different cache types"""
+
+    key: RadixKey
+
+    # Mamba specific
+    cow_mamba: bool = False
+    req: Optional[Req] = None
+
+
+@dataclasses.dataclass
+class InsertParams:
+    """Unified parameters for insert across different cache types"""
+
+    key: RadixKey
+    value: Optional[torch.Tensor] = None
+
+    # Mamba specific
+    mamba_value: Optional[torch.Tensor] = None
+
+    # SWA specific
+    prev_prefix_len: int = 0
+    swa_evicted_seqlen: int = 0
+
+    # General
+    chunked: bool = False
+    priority: int = 0
+
+
+@dataclasses.dataclass
+class InsertResult:
+    """Result of an insert operation"""
+
+    prefix_len: int
+    mamba_exist: bool = False
+
+
+@dataclasses.dataclass
+class EvictParams:
+    """Unified parameters for evict across different cache types"""
+
+    num_tokens: int
+    swa_num_tokens: int = 0
+    mamba_num: int = 0
+
+
+@dataclasses.dataclass
+class EvictResult:
+    """Result of an evict operation"""
+
+    num_tokens_evicted: int = 0
+    swa_num_tokens_evicted: int = 0
+    mamba_num_evicted: int = 0
+
+
+class MatchResult(NamedTuple):
+    """Result of a prefix match operation.
+
+    Attributes:
+        device_indices  :   Indices of the KV cache on the device matched by common prefix.
+        last_device_node:   The last TreeNode on the device that was matched.
+        last_host_node  :   The last TreeNode on the host that was matched.
+                            Note that if HiCache is not enabled,
+                            this **must** be the same as `last_device_node`.
+        host_hit_length :   Length of the KV cache hit on the host, if applicable.
+                            0 if HiCache is not enabled.
+        mamba_branching_seqlen: The mamba radix cache branching point, which is the longest
+                                page-aligned position that could've been cache hit if there
+                                exists a mamba state.
+    """
+
+    device_indices: torch.Tensor
+    last_device_node: Any
+    last_host_node: Any
+    host_hit_length: int = 0
+    mamba_branching_seqlen: Optional[int] = None
+
+
+class BasePrefixCache(ABC, PrefixCacheTrait):
+    """Cache can be indexed by either rid or key."""
+
+    metrics_collector: Optional[RadixCacheMetricsCollector] = (
+        None  # metrics collector for the cache
+    )
+
+    def init_metrics_collector(self):
+        from sglang.srt.server_args import get_global_server_args
+
+        server_args = get_global_server_args()
+        labels = {"cache_type": self.__class__.__name__}
+        if server_args.extra_metric_labels:
+            labels.update(server_args.extra_metric_labels)
+        self.metrics_collector = RadixCacheMetricsCollector(labels=labels)
+
+    def update_eviction_metrics(self, num_evicted: int, start_time: float):
+        if self.metrics_collector is not None and num_evicted > 0:
+            self.metrics_collector.observe_eviction_duration(
+                time.perf_counter() - start_time
+            )
+            self.metrics_collector.increment_eviction_num_tokens(num_evicted)
+
+    @abstractmethod
+    def reset(self):
+        pass
+
+    @abstractmethod
+    def match_prefix(self, params: MatchPrefixParams) -> MatchResult:
+        pass
+
+    @abstractmethod
+    def cache_finished_req(self, req: Req, is_insert: bool = True, **kwargs):
+        pass
+
+    @abstractmethod
+    def cache_unfinished_req(self, req: Req, **kwargs):
+        pass
+
+    @abstractmethod
+    def evict(self, params: EvictParams) -> EvictResult:
+        pass
+
+    @abstractmethod
+    def inc_lock_ref(self, node: Any):
+        pass
+
+    @abstractmethod
+    def dec_lock_ref(self, node: Any, swa_uuid_for_lock: Optional[str] = None):
+        pass
+
+    def evictable_size(self):
+        return 0
+
+    def full_evictable_size(self):
+        return 0
+
+    def swa_evictable_size(self):
+        return 0
+
+    def protected_size(self):
+        return 0
+
+    def full_protected_size(self):
+        return 0
+
+    def swa_protected_size(self):
+        return 0
+
+    def total_size(self):
+        raise NotImplementedError()
+
+    def pretty_print(self):
+        raise NotImplementedError()
+
+    def init_load_back(
+        self,
+        last_host_node: Any,
+        host_hit_length: int,
+    ) -> Tuple[torch.Tensor, Any]:
+        """
+        Preparing KV cache loading from host to device.
+        """
+        raise NotImplementedError()
+
+    def ready_to_load_host_cache(self) -> Any:
+        """
+        Notify the cache controller to start the KV cache loading
+        """
+        raise NotImplementedError()
+
+    def check_hicache_events(self) -> Any:
+        """
+        Check HiCache related activities to update radix tree and synchronize across TP workers if needed
+        """
+        raise NotImplementedError()
+
+    def take_events(self):
+        return []
+
+    def supports_swa(self) -> bool:
+        return False
+
+    def supports_mamba(self) -> bool:
+        return False
+
+    def is_chunk_cache(self) -> bool:
+        return False
+
+    def is_tree_cache(self) -> bool:
+        return not self.is_chunk_cache()
+
+    def available_and_evictable_str(self) -> str:
+        available_size = self.token_to_kv_pool_allocator.available_size()
+        evictable_size = self.evictable_size()
+        return f"Available tokens: {available_size + evictable_size} ({available_size=} + {evictable_size=})\n"
diff --git a/sglang/python/sglang/srt/mem_cache/cache_init_params.py b/sglang/python/sglang/srt/mem_cache/cache_init_params.py
new file mode 100644
index 0000000000000000000000000000000000000000..6de3f984f77b70507607766a4f55913da106481c
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/cache_init_params.py
@@ -0,0 +1,38 @@
+from __future__ import annotations
+
+import dataclasses
+from typing import TYPE_CHECKING, Optional
+
+import torch
+
+if TYPE_CHECKING:
+    from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
+    from sglang.srt.mem_cache.memory_pool import ReqToTokenPool
+
+
+@dataclasses.dataclass
+class CacheInitParams:
+    disable: bool
+    req_to_token_pool: ReqToTokenPool
+    token_to_kv_pool_allocator: BaseTokenToKVPoolAllocator
+    page_size: int
+
+    is_eagle: bool = False
+    tp_cache_group: Optional[torch.distributed.ProcessGroup] = None
+    eviction_policy: str = "lru"
+    disable_finished_insert: bool = False
+
+    enable_metrics: bool = False
+    enable_kv_cache_events: bool = False
+
+    enable_mamba_extra_buffer: bool = False
+
+    pp_rank: int = 0
+    pp_size: int = 1
+
+    chunked_prefill_size: Optional[int] = None
+
+    sliding_window_size: Optional[int] = None
+
+    # Time-to-live for cache entries in seconds. If None, TTL is disabled.
+    cache_ttl_seconds: Optional[float] = None
diff --git a/sglang/python/sglang/srt/mem_cache/chunk_cache.py b/sglang/python/sglang/srt/mem_cache/chunk_cache.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8e05328521bfeb4ac2c09abe2cca65730019667
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/chunk_cache.py
@@ -0,0 +1,114 @@
+from __future__ import annotations
+
+"""Cache for chunked prefill, used when RadixCache is disabled."""
+
+import logging
+from typing import TYPE_CHECKING, Any, Optional
+
+import torch
+
+from sglang.srt.mem_cache.base_prefix_cache import (
+    BasePrefixCache,
+    EvictParams,
+    EvictResult,
+    InsertParams,
+    InsertResult,
+    MatchPrefixParams,
+    MatchResult,
+)
+from sglang.srt.mem_cache.swa_memory_pool import SWATokenToKVPoolAllocator
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.schedule_batch import Req
+    from sglang.srt.mem_cache.cache_init_params import CacheInitParams
+
+
+logger = logging.getLogger(__name__)
+
+
+class ChunkCache(BasePrefixCache):
+    def __init__(self, params: CacheInitParams):
+        self.req_to_token_pool = params.req_to_token_pool
+        self.token_to_kv_pool_allocator = params.token_to_kv_pool_allocator
+        self.page_size = params.page_size
+        if self.token_to_kv_pool_allocator:
+            self.device = self.token_to_kv_pool_allocator.device
+        else:
+            self.device = torch.device("cpu")
+
+        self.protected_size_ = 0
+
+    def is_chunk_cache(self) -> bool:
+        return True
+
+    # NOTE (csy): this is to determine if a cache has prefix matching feature.
+    # Chunk cache always return True to indicate no prefix matching.
+    # TODO (csy): Using a prefix cache trait to replace this
+    @property
+    def disable(self):
+        return True
+
+    def reset(self):
+        pass
+
+    def match_prefix(self, params: MatchPrefixParams) -> MatchResult:
+        return MatchResult(
+            device_indices=torch.empty((0,), dtype=torch.int64),
+            last_device_node=None,
+            last_host_node=None,
+        )
+
+    def insert(self, params: InsertParams) -> InsertResult:
+        # ChunkCache does not support prefix caching, so insert is a no-op
+        return InsertResult(prefix_len=0)
+
+    def cache_finished_req(self, req: Req, is_insert: bool = True):
+        kv_committed_len = req.pop_committed_kv_cache()
+        # For decode server: if req.output_ids is empty, we want to free all req.origin_input_ids
+        kv_indices = self.req_to_token_pool.req_to_token[
+            req.req_pool_idx, :kv_committed_len
+        ]
+        self.token_to_kv_pool_allocator.free(kv_indices)
+
+    def cache_unfinished_req(self, req: Req, chunked=False):
+        kv_indices = self.req_to_token_pool.req_to_token[
+            req.req_pool_idx, : len(req.fill_ids)
+        ]
+        # `req.prefix_indices` will be used in `PrefillAdder::add_chunked_req` later
+        req.prefix_indices = kv_indices.to(dtype=torch.int64, copy=True)
+
+    def evict(self, params: EvictParams) -> EvictResult:
+        return EvictResult()
+
+    def inc_lock_ref(self, node: Any):
+        return 0
+
+    def dec_lock_ref(self, node: Any, swa_uuid_for_lock: Optional[str] = None):
+        return 0
+
+    def protected_size(self):
+        # NOTE: no protected size in chunk cache. Chunk cache's eviction is the same with request's lifecycle.
+        return 0
+
+    def pretty_print(self):
+        return ""
+
+
+class SWAChunkCache(ChunkCache):
+    """ChunkCache with support for sliding window attention."""
+
+    def __init__(self, params: CacheInitParams):
+        assert isinstance(params.token_to_kv_pool_allocator, SWATokenToKVPoolAllocator)
+        super().__init__(params)
+
+        self.sliding_window_size = params.sliding_window_size
+        self.chunked_prefill_size = params.chunked_prefill_size
+
+    def supports_swa(self) -> bool:
+        assert (
+            self.sliding_window_size is not None
+        ), "sliding_window_size must be set for SWAChunkCache"
+        return True
+
+    def evict(self, params: EvictParams) -> EvictResult:
+        return EvictResult()
diff --git a/sglang/python/sglang/srt/mem_cache/common.py b/sglang/python/sglang/srt/mem_cache/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a759ed11bcd333a22de48dd45e6c1d345e3e235
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/common.py
@@ -0,0 +1,519 @@
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache, EvictParams
+from sglang.srt.mem_cache.memory_pool import HybridReqToTokenPool, ReqToTokenPool
+from sglang.srt.mem_cache.swa_memory_pool import SWATokenToKVPoolAllocator
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import support_triton
+from sglang.srt.utils.common import ceil_align
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
+
+# Needs 2 + 1 slots for mamba request with prefix cache. 2 for ping pong cache, 1 for running mamba state.
+MAMBA_STATE_PER_REQ_PREFIX_CACHE = 3
+MAMBA_STATE_PER_REQ_NO_CACHE = 1
+
+logger = logging.getLogger(__name__)
+
+
+@triton.jit
+def write_req_to_token_pool_triton(
+    req_to_token_ptr,  # [max_batch, max_context_len]
+    req_pool_indices,
+    prefix_tensors,
+    pre_lens,
+    seq_lens,
+    extend_lens,
+    out_cache_loc,
+    req_to_token_ptr_stride: tl.constexpr,
+):
+    BLOCK_SIZE: tl.constexpr = 512
+    pid = tl.program_id(0)
+
+    req_pool_index = tl.load(req_pool_indices + pid)
+    pre_len = tl.load(pre_lens + pid)
+    seq_len = tl.load(seq_lens + pid)
+    prefix_tensor = tl.load(prefix_tensors + pid).to(tl.pointer_type(tl.int64))
+
+    # write prefix
+    num_loop = tl.cdiv(pre_len, BLOCK_SIZE)
+    for i in range(num_loop):
+        offset = tl.arange(0, BLOCK_SIZE) + i * BLOCK_SIZE
+        mask = offset < pre_len
+        value = tl.load(prefix_tensor + offset, mask=mask)
+        tl.store(
+            req_to_token_ptr + req_pool_index * req_to_token_ptr_stride + offset,
+            value,
+            mask=mask,
+        )
+
+    # NOTE: This can be slow for large bs
+    cumsum_start = tl.cast(0, tl.int64)
+    for i in range(pid):
+        cumsum_start += tl.load(extend_lens + i)
+
+    num_loop = tl.cdiv(seq_len - pre_len, BLOCK_SIZE)
+    for i in range(num_loop):
+        offset = tl.arange(0, BLOCK_SIZE) + i * BLOCK_SIZE
+        mask = offset < (seq_len - pre_len)
+        value = tl.load(out_cache_loc + cumsum_start + offset, mask=mask)
+        tl.store(
+            req_to_token_ptr
+            + req_pool_index * req_to_token_ptr_stride
+            + offset
+            + pre_len,
+            value,
+            mask=mask,
+        )
+
+
+def write_cache_indices(
+    out_cache_loc: torch.Tensor,
+    req_pool_indices_tensor: torch.Tensor,
+    req_pool_indices_cpu: torch.Tensor,
+    prefix_lens_tensor: torch.Tensor,
+    prefix_lens_cpu: torch.Tensor,
+    seq_lens_tensor: torch.Tensor,
+    seq_lens_cpu: torch.Tensor,
+    extend_lens_tensor: torch.Tensor,
+    extend_lens_cpu: torch.Tensor,
+    prefix_tensors: list[torch.Tensor],
+    req_to_token_pool: ReqToTokenPool,
+):
+    if support_triton(get_global_server_args().attention_backend):
+        prefix_pointers = torch.tensor(
+            [t.data_ptr() for t in prefix_tensors],
+            device=req_to_token_pool.device,
+            dtype=torch.uint64,
+        )
+        # TODO: some tensors can be reused for ForwardBatchInfo (e.g., extend_lens, cumsum_start)
+        write_req_to_token_pool_triton[(req_pool_indices_tensor.shape[0],)](
+            req_to_token_pool.req_to_token,
+            req_pool_indices_tensor,
+            prefix_pointers,
+            prefix_lens_tensor,
+            seq_lens_tensor,
+            extend_lens_tensor,
+            out_cache_loc,
+            req_to_token_pool.req_to_token.shape[1],
+        )
+    else:
+        pt = 0
+        for i in range(req_pool_indices_cpu.shape[0]):
+            req_idx = req_pool_indices_cpu[i].item()
+            prefix_len = prefix_lens_cpu[i].item()
+            seq_len = seq_lens_cpu[i].item()
+            extend_len = extend_lens_cpu[i].item()
+
+            req_to_token_pool.write(
+                (req_idx, slice(0, prefix_len)),
+                prefix_tensors[i],
+            )
+            req_to_token_pool.write(
+                (req_idx, slice(prefix_len, seq_len)),
+                out_cache_loc[pt : pt + extend_len],
+            )
+            pt += extend_len
+
+
+def get_last_loc(
+    req_to_token: torch.Tensor,
+    req_pool_indices_tensor: torch.Tensor,
+    prefix_lens_tensor: torch.Tensor,
+) -> torch.Tensor:
+    if (
+        get_global_server_args().attention_backend != "ascend"
+        and get_global_server_args().attention_backend != "torch_native"
+    ):
+        impl = get_last_loc_triton
+    else:
+        impl = get_last_loc_torch
+
+    return impl(req_to_token, req_pool_indices_tensor, prefix_lens_tensor)
+
+
+def get_last_loc_torch(
+    req_to_token: torch.Tensor,
+    req_pool_indices_tensor: torch.Tensor,
+    prefix_lens_tensor: torch.Tensor,
+) -> torch.Tensor:
+    return torch.where(
+        prefix_lens_tensor > 0,
+        req_to_token[req_pool_indices_tensor, prefix_lens_tensor - 1],
+        torch.full_like(prefix_lens_tensor, -1),
+    )
+
+
+@triton.jit
+def get_last_loc_kernel(
+    req_to_token,
+    req_pool_indices_tensor,
+    prefix_lens_tensor,
+    result,
+    num_tokens,
+    req_to_token_stride,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    offset = tl.arange(0, BLOCK_SIZE) + pid * BLOCK_SIZE
+    mask = offset < num_tokens
+
+    prefix_lens = tl.load(prefix_lens_tensor + offset, mask=mask, other=0)
+    req_pool_indices = tl.load(req_pool_indices_tensor + offset, mask=mask, other=0)
+
+    token_mask = prefix_lens > 0
+    token_index = req_pool_indices * req_to_token_stride + (prefix_lens - 1)
+    tokens = tl.load(req_to_token + token_index, mask=token_mask, other=-1)
+
+    tl.store(result + offset, tokens, mask=mask)
+
+
+def get_last_loc_triton(
+    req_to_token: torch.Tensor,
+    req_pool_indices_tensor: torch.Tensor,
+    prefix_lens_tensor: torch.Tensor,
+) -> torch.Tensor:
+    BLOCK_SIZE = 256
+    num_tokens = prefix_lens_tensor.shape[0]
+    result = torch.empty_like(prefix_lens_tensor)
+    grid = (triton.cdiv(num_tokens, BLOCK_SIZE),)
+
+    get_last_loc_kernel[grid](
+        req_to_token,
+        req_pool_indices_tensor,
+        prefix_lens_tensor,
+        result,
+        num_tokens,
+        req_to_token.stride(0),
+        BLOCK_SIZE,
+    )
+    return result
+
+
+def alloc_token_slots(
+    tree_cache: BasePrefixCache,
+    num_tokens: int,
+    backup_state: bool = False,
+):
+    allocator = tree_cache.token_to_kv_pool_allocator
+    evict_from_tree_cache(tree_cache, num_tokens)
+
+    state = None
+    if backup_state:
+        state = allocator.backup_state()
+
+    out_cache_loc = allocator.alloc(num_tokens)
+
+    if out_cache_loc is None:
+        error_msg = (
+            f"Out of memory. Try to lower your batch size.\n"
+            f"Try to allocate {num_tokens} tokens.\n"
+            f"{available_and_evictable_str(tree_cache)}"
+        )
+        logger.error(error_msg)
+        if tree_cache is not None:
+            tree_cache.pretty_print()
+        raise RuntimeError(error_msg)
+
+    return (out_cache_loc, state) if backup_state else out_cache_loc
+
+
+def evict_from_tree_cache(tree_cache: BasePrefixCache | None, num_tokens: int):
+    if tree_cache is None:
+        return
+
+    if tree_cache.is_chunk_cache():
+        return
+
+    allocator = tree_cache.token_to_kv_pool_allocator
+
+    if isinstance(allocator, SWATokenToKVPoolAllocator):
+        # Hybrid allocator
+        full_available_size = allocator.full_available_size()
+        swa_available_size = allocator.swa_available_size()
+
+        if full_available_size < num_tokens or swa_available_size < num_tokens:
+            full_num_tokens = max(0, num_tokens - full_available_size)
+            swa_num_tokens = max(0, num_tokens - swa_available_size)
+            tree_cache.evict(
+                EvictParams(num_tokens=full_num_tokens, swa_num_tokens=swa_num_tokens)
+            )
+    else:
+        # Standard allocator
+        if allocator.available_size() < num_tokens:
+            tree_cache.evict(EvictParams(num_tokens=num_tokens))
+
+
+def alloc_paged_token_slots_extend(
+    tree_cache: BasePrefixCache,
+    prefix_lens: torch.Tensor,
+    prefix_lens_cpu: torch.Tensor,
+    seq_lens: torch.Tensor,
+    seq_lens_cpu: torch.Tensor,
+    last_loc: torch.Tensor,
+    extend_num_tokens: int,
+    backup_state: bool = False,
+):
+    # Over estimate the number of tokens: assume each request needs a new page.
+    allocator = tree_cache.token_to_kv_pool_allocator
+    num_tokens = extend_num_tokens + len(seq_lens_cpu) * allocator.page_size
+    evict_from_tree_cache(tree_cache, num_tokens)
+
+    state = None
+    if backup_state:
+        state = allocator.backup_state()
+
+    out_cache_loc = allocator.alloc_extend(
+        prefix_lens,
+        prefix_lens_cpu,
+        seq_lens,
+        seq_lens_cpu,
+        last_loc,
+        extend_num_tokens,
+    )
+
+    if out_cache_loc is None:
+        error_msg = (
+            f"Prefill out of memory. Try to lower your batch size.\n"
+            f"Try to allocate {extend_num_tokens} tokens.\n"
+            f"{available_and_evictable_str(tree_cache)}"
+        )
+        logger.error(error_msg)
+        if tree_cache is not None:
+            tree_cache.pretty_print()
+        raise RuntimeError(error_msg)
+
+    return (out_cache_loc, state) if backup_state else out_cache_loc
+
+
+def alloc_req_slots(
+    req_to_token_pool: ReqToTokenPool,
+    reqs: list[Req],
+    tree_cache: BasePrefixCache | None,
+) -> list[int]:
+    """Allocate request slots from the pool."""
+    num_reqs = len(reqs)
+    if isinstance(req_to_token_pool, HybridReqToTokenPool):
+        mamba_available_size = req_to_token_pool.mamba_pool.available_size()
+        factor = (
+            MAMBA_STATE_PER_REQ_PREFIX_CACHE
+            if tree_cache.supports_mamba()
+            else MAMBA_STATE_PER_REQ_NO_CACHE
+        )
+        mamba_state_needed = num_reqs * factor
+        if mamba_available_size < mamba_state_needed:
+            if tree_cache is not None and tree_cache.supports_mamba():
+                mamba_num = max(0, mamba_state_needed - mamba_available_size)
+                tree_cache.evict(EvictParams(num_tokens=0, mamba_num=mamba_num))
+    req_pool_indices = req_to_token_pool.alloc(reqs)
+
+    if req_pool_indices is None:
+        raise RuntimeError(
+            "alloc_req_slots runs out of memory. "
+            "Please set a smaller number for `--max-running-requests`. "
+            f"{req_to_token_pool.available_size()=}, "
+            f"{num_reqs=}, "
+        )
+    return req_pool_indices
+
+
+def alloc_for_extend(
+    batch: ScheduleBatch,
+) -> tuple[torch.Tensor, torch.Tensor, list[int]]:
+    """
+    Allocate KV cache for extend batch and write to req_to_token_pool.
+
+    Returns:
+        out_cache_loc: allocated cache locations
+        req_pool_indices_device: request pool indices at a device tensor
+        req_pool_indices: request pool indices as list
+    """
+    # free out-of-window swa tokens
+    batch.maybe_evict_swa()
+
+    prefix_tensors = [r.prefix_indices for r in batch.reqs]
+
+    # Create tensors for allocation
+    prefix_lens_cpu = torch.tensor(batch.prefix_lens, dtype=torch.int64)
+    extend_lens_cpu = torch.tensor(batch.extend_lens, dtype=torch.int64)
+    prefix_lens_device = prefix_lens_cpu.to(batch.device, non_blocking=True)
+    extend_lens_device = extend_lens_cpu.to(batch.device, non_blocking=True)
+
+    # Allocate req slots
+    req_pool_indices = alloc_req_slots(
+        batch.req_to_token_pool, batch.reqs, batch.tree_cache
+    )
+    req_pool_indices_cpu = torch.tensor(req_pool_indices, dtype=torch.int64)
+    req_pool_indices_device = req_pool_indices_cpu.to(batch.device, non_blocking=True)
+
+    # Allocate KV cache (throws exception on failure)
+    if batch.tree_cache.page_size == 1:
+        out_cache_loc = alloc_token_slots(batch.tree_cache, batch.extend_num_tokens)
+    else:
+        # Paged allocation - build last_loc
+        last_loc = [
+            (t[-1:] if len(t) > 0 else torch.tensor([-1], device=batch.device))
+            for t in prefix_tensors
+        ]
+        out_cache_loc = alloc_paged_token_slots_extend(
+            tree_cache=batch.tree_cache,
+            prefix_lens=prefix_lens_device,
+            prefix_lens_cpu=prefix_lens_cpu,
+            seq_lens=batch.seq_lens,
+            seq_lens_cpu=batch.seq_lens_cpu,
+            last_loc=torch.cat(last_loc),
+            extend_num_tokens=batch.extend_num_tokens,
+        )
+
+    # Write to req_to_token_pool
+    write_cache_indices(
+        out_cache_loc,
+        req_pool_indices_device,
+        req_pool_indices_cpu,
+        prefix_lens_device,
+        prefix_lens_cpu,
+        batch.seq_lens,
+        batch.seq_lens_cpu,
+        extend_lens_device,
+        extend_lens_cpu,
+        prefix_tensors,
+        batch.req_to_token_pool,
+    )
+
+    return out_cache_loc, req_pool_indices_device, req_pool_indices
+
+
+def alloc_paged_token_slots_decode(
+    tree_cache: BasePrefixCache,
+    seq_lens: torch.Tensor,
+    seq_lens_cpu: torch.Tensor,
+    last_loc: torch.Tensor,
+    token_per_req: int = 1,
+) -> torch.Tensor:
+    """Allocate paged KV cache for decode batch."""
+    allocator = tree_cache.token_to_kv_pool_allocator
+    # Over estimate the number of tokens: assume each request needs a new page.
+    num_tokens = len(seq_lens) * allocator.page_size
+    evict_from_tree_cache(tree_cache, num_tokens)
+
+    out_cache_loc = allocator.alloc_decode(seq_lens, seq_lens_cpu, last_loc)
+
+    if out_cache_loc is None:
+        error_msg = (
+            f"Decode out of memory. Try to lower your batch size.\n"
+            f"Try to allocate {len(seq_lens) * token_per_req} tokens.\n"
+            f"{available_and_evictable_str(tree_cache)}"
+        )
+        logger.error(error_msg)
+        if tree_cache is not None:
+            tree_cache.pretty_print()
+        raise RuntimeError(error_msg)
+
+    return out_cache_loc
+
+
+def alloc_for_decode(batch: ScheduleBatch, token_per_req: int) -> torch.Tensor:
+    """
+    Allocate KV cache for decode batch and write to req_to_token_pool.
+
+    Returns:
+        out_cache_loc: allocated cache locations
+    """
+
+    batch.maybe_evict_swa()
+
+    bs = batch.seq_lens.shape[0]
+
+    if batch.tree_cache.page_size == 1:
+        # Non-paged allocation
+        out_cache_loc = alloc_token_slots(batch.tree_cache, bs * token_per_req)
+    else:
+        # Paged allocation
+        last_loc = batch.req_to_token_pool.req_to_token[
+            batch.req_pool_indices, batch.seq_lens - 1
+        ]
+        seq_lens_next = batch.seq_lens + token_per_req
+        out_cache_loc = alloc_paged_token_slots_decode(
+            tree_cache=batch.tree_cache,
+            seq_lens=seq_lens_next,
+            seq_lens_cpu=batch.seq_lens_cpu + token_per_req,
+            last_loc=last_loc,
+            token_per_req=token_per_req,
+        )
+
+    # Write to req_to_token_pool
+    if batch.model_config.is_encoder_decoder:
+        locs = batch.encoder_lens + batch.seq_lens
+    else:
+        locs = batch.seq_lens.clone()
+
+    batch.req_to_token_pool.write(
+        (batch.req_pool_indices, locs), out_cache_loc.to(torch.int32)
+    )
+
+    return out_cache_loc
+
+
+def release_kv_cache(req: Req, tree_cache: BasePrefixCache, is_insert: bool = True):
+    # MambaRadixCache may alloc mamba state before alloc KV cache
+    if req.req_pool_idx is None:
+        assert (
+            tree_cache.supports_mamba()
+        ), "Only MambaRadixCache allow freeing before alloc"
+        # TODO (csy, hanming): clean up this early allocation logic
+        if req.mamba_pool_idx is not None:
+            tree_cache.req_to_token_pool.mamba_pool.free(
+                req.mamba_pool_idx.unsqueeze(-1)
+            )
+            req.mamba_pool_idx = None
+        return
+
+    tree_cache.cache_finished_req(req, is_insert=is_insert)
+
+    # FIXME: SessionAwareCache.cache_finished_req sets req_pool_idx = None to
+    # transfer KV ownership to the SessionSlot, so we skip the remaining
+    # cleanup (overalloc free + pool slot free). This means over-allocated
+    # tokens from speculative decoding are NOT freed between turns.
+    if req.req_pool_idx is None:
+        return
+
+    start_p, end_p = req.pop_overallocated_kv_cache()
+
+    global_server_args = get_global_server_args()
+    page_size = global_server_args.page_size
+    spec_algo = global_server_args.speculative_algorithm
+
+    if spec_algo is None:
+        assert (
+            start_p == end_p
+        ), f"Unexpected overallocated KV cache, {req.kv_committed_len=}, {req.kv_allocated_len=}"
+
+    if page_size > 1:
+        start_p = ceil_align(start_p, page_size)
+
+    if start_p < end_p:
+        indices_to_free = tree_cache.req_to_token_pool.req_to_token[req.req_pool_idx][
+            start_p:end_p
+        ]
+        tree_cache.token_to_kv_pool_allocator.free(indices_to_free)
+    # If the prefix cache doesn't manage mamba states, we must free them here.
+    if isinstance(tree_cache.req_to_token_pool, HybridReqToTokenPool) and (
+        not tree_cache.supports_mamba()
+    ):
+        assert (
+            req.mamba_pool_idx is not None
+        ), "mamba state is freed while the tree cache does not manage mamba states"
+        tree_cache.req_to_token_pool.free_mamba_cache(req)
+    tree_cache.req_to_token_pool.free(req)
+
+
+def available_and_evictable_str(tree_cache: BasePrefixCache) -> str:
+    return tree_cache.available_and_evictable_str()
diff --git a/sglang/python/sglang/srt/mem_cache/cpp_radix_tree/.clang-format b/sglang/python/sglang/srt/mem_cache/cpp_radix_tree/.clang-format
new file mode 100644
index 0000000000000000000000000000000000000000..afbd654a7903e970ec7a3a005e646788b68dea1f
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/cpp_radix_tree/.clang-format
@@ -0,0 +1,15 @@
+BasedOnStyle: Google
+IndentWidth: 2
+ColumnLimit: 120
+AllowShortFunctionsOnASingleLine: Empty
+DerivePointerAlignment: false
+PointerAlignment: Left
+NamespaceIndentation: None
+SortIncludes: true
+AllowShortLoopsOnASingleLine: false
+BinPackParameters: false              # Prevents packing parameters in declarations
+BinPackArguments: false               # Prevents packing arguments in function calls
+AlignAfterOpenBracket: AlwaysBreak    # Forces a break after the opening parenthesis
+AlignOperands: Align                  # Aligns arguments vertically
+PenaltyBreakBeforeFirstCallParameter: 1  # Encourages breaking before the first argument
+PenaltyReturnTypeOnItsOwnLine: 100    # Keeps return type with function name
diff --git a/sglang/python/sglang/srt/mem_cache/cpp_radix_tree/common.h b/sglang/python/sglang/srt/mem_cache/cpp_radix_tree/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..72c2c78be58453b64d197c3defb2f9a2d6c070fc
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/cpp_radix_tree/common.h
@@ -0,0 +1,29 @@
+#pragma once
+#include <cstddef>
+#include <cstdint>
+#include <source_location>
+#include <span>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace radix_tree_v2 {
+
+using token_t = std::int32_t;
+using token_vec_t = std::vector<token_t>;
+using token_slice = std::span<const token_t>;
+using NodeHandle = std::size_t;
+using IOTicket = std::uint32_t;
+
+inline void _assert(
+    bool condition,
+    const char* message = "Assertion failed",
+    std::source_location loc = std::source_location::current()) {
+  if (!condition) [[unlikely]] {
+    std::string msg = message;
+    msg = msg + " at " + loc.file_name() + ":" + std::to_string(loc.line()) + " in " + loc.function_name();
+    throw std::runtime_error(msg);
+  }
+}
+
+}  // namespace radix_tree_v2
diff --git a/sglang/python/sglang/srt/mem_cache/cpp_radix_tree/radix_tree.py b/sglang/python/sglang/srt/mem_cache/cpp_radix_tree/radix_tree.py
new file mode 100644
index 0000000000000000000000000000000000000000..592727aac45feae6f177fc269cd74348fe301540
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/cpp_radix_tree/radix_tree.py
@@ -0,0 +1,182 @@
+from __future__ import annotations
+
+import os
+from typing import TYPE_CHECKING, List, Optional, Tuple
+
+import torch
+from torch.utils.cpp_extension import load
+
+_abs_path = os.path.dirname(os.path.abspath(__file__))
+radix_tree_cpp = load(
+    name="radix_tree_cpp",
+    sources=[
+        f"{_abs_path}/tree_v2_binding.cpp",
+        f"{_abs_path}/tree_v2_debug.cpp",
+        f"{_abs_path}/tree_v2.cpp",
+    ],
+    extra_cflags=["-O3", "-std=c++20"],
+)
+
+if TYPE_CHECKING:
+
+    class TreeNodeCpp:
+        """
+        A placeholder for the TreeNode class. Cannot be constructed elsewhere.
+        """
+
+    class IOHandle:
+        """
+        A placeholder for the IOHandle class. Cannot be constructed elsewhere.
+        """
+
+    class RadixTreeCpp:
+        def __init__(
+            self,
+            disabled: bool,
+            host_size: Optional[int],
+            page_size: int,
+            write_through_threshold: int,
+        ):
+            """
+            Initializes the RadixTreeCpp instance.
+            Args:
+                disabled (bool): If True, the radix tree is disabled.
+                host_size (Optional[int]): Size of the radix tree on the CPU. None means no CPU tree.
+                page_size (int): Size of the page for the radix tree.
+                write_through_threshold (int): Threshold for writing through from GPU to CPU.
+            """
+            self.tree = radix_tree_cpp.RadixTree(  # type: ignore
+                disabled, host_size, page_size, write_through_threshold
+            )
+
+        def match_prefix(
+            self, prefix: List[int]
+        ) -> Tuple[List[torch.Tensor], int, TreeNodeCpp, TreeNodeCpp]:
+            """
+            Matches a prefix in the radix tree.
+            Args:
+                prefix (List[int]): The prefix to match.
+            Returns:
+                Tuple[List[torch.Tensor], TreeNodeCpp, TreeNodeCpp]:
+                    0. A list of indices that is matched by the prefix on the GPU.
+                    1. Sum length of the indices matched on the CPU.
+                    2. The last node of the prefix matched on the GPU.
+                    3. The last node of the prefix matched on the CPU.
+            """
+            return self.tree.match_prefix(prefix)
+
+        def evict(self, num_tokens: int) -> List[torch.Tensor]:
+            """
+            Evicts a number of tokens from the radix tree.
+            Args:
+                num_tokens (int): The number of tokens to evict.
+            Returns:
+                List[torch.Tensor]: A list of indices that were evicted.
+            """
+            return self.tree.evict(num_tokens)
+
+        def lock_ref(self, handle: TreeNodeCpp, lock: bool) -> None:
+            """
+            Locks or unlocks a reference to a tree node.
+            After locking, the node will not be evicted from the radix tree.
+            Args:
+                handle (TreeNodeCpp): The tree node to lock or unlock.
+                lock (bool): If True, locks the node; if False, unlocks it.
+            """
+            return self.tree.lock_ref(handle, lock)
+
+        def writing_through(
+            self, key: List[int], indices: torch.Tensor
+        ) -> Tuple[List[Tuple[IOHandle, torch.Tensor, torch.Tensor]], int]:
+            """
+            Inserts a key-value pair into the radix tree and perform write-through check.
+            Args:
+                key (List[int]): The key to insert.
+                indices (torch.Tensor): The value associated with the key.
+            Returns:
+                Tuple[List[Tuple[IOHandle, torch.Tensor, torch.Tensor]], int]:
+                    0. A list of (IOHandle, device indices, host indices) tuples.
+                       These IOhandles require write-through to the CPU in python side.
+                    1. The number of indices that are matched on device.
+            """
+            return self.tree.writing_through(key, indices)
+
+        def loading_onboard(
+            self,
+            host_node: TreeNodeCpp,
+            new_device_indices: torch.Tensor,
+        ) -> Tuple[IOHandle, List[torch.Tensor]]:
+            """
+            Updates the device indices of tree nodes within a range on the tree.
+            Args:
+                host_node (TreeNodeCpp): The tree node on the host, must be descendant of device_node.
+                new_device_indices (torch.Tensor): The new device indices to set.
+                    The length of this tensor must be exactly host indices length.
+            Returns:
+                Tuple[IOHandle, List[torch.Tensor]]:
+                    0. An IOHandle that requires loading to the CPU in python side.
+                    1. A list of host indices corresponding to the new device indices.
+            """
+            return self.tree.loading_onboard(host_node, new_device_indices)
+
+        def commit_writing_through(self, handle: IOHandle, success: bool) -> None:
+            """
+            Commits the write-through process for a tree node.
+            Args:
+                handle (IOHandle): The IOHandle to commit.
+                success (bool): If True, commits the write-through; if False, just indicates failure.
+            """
+            return self.tree.commit_writing_through(handle, success)
+
+        def commit_loading_onboard(self, handle: IOHandle, success: bool) -> None:
+            """
+            Commits the load onboard process for tree nodes within a range on the tree.
+            Args:
+                handle (IOHandle): The IOHandle to commit.
+                success (bool): If True, commits the load-onboard; if False, just indicates failure.
+            """
+            return self.tree.commit_loading_onboard(handle, success)
+
+        def evictable_size(self) -> int:
+            """
+            Returns the size of the evictable part of the radix tree.
+            This is the size of the part that can be evicted from the GPU (ref_count = 0).
+            Returns:
+                int: The size of the evictable part.
+            """
+            return self.tree.evictable_size()
+
+        def protected_size(self) -> int:
+            """
+            Returns the size of the protected part of the radix tree.
+            This is the size of the part that cannot be evicted from the GPU (ref_count > 0).
+            Returns:
+                int: The size of the protected part.
+            """
+            return self.tree.protected_size()
+
+        def total_size(self) -> int:
+            """
+            Returns the total size of the radix tree (including CPU nodes).
+            Returns:
+                int: The total size of the radix tree.
+            """
+            return self.tree.total_size()
+
+        def reset(self) -> None:
+            """
+            Resets the radix tree, clearing all nodes and indices.
+            """
+            return self.tree.reset()
+
+        def debug_print(self) -> None:
+            """
+            Prints the internal state of the radix tree for debugging purposes.
+            """
+            return self.tree.debug_print()
+
+else:
+    # Real implementation of the classes for runtime
+    RadixTreeCpp = radix_tree_cpp.RadixTree
+    TreeNodeCpp = object
+    IOHandle = object
diff --git a/sglang/python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2.cpp b/sglang/python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2a5433221873b2d88d76ca2921984a8a1b14695b
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2.cpp
@@ -0,0 +1,143 @@
+#include "tree_v2.h"
+
+#include <ATen/core/TensorBody.h>
+#include <ATen/ops/empty.h>
+#include <ATen/ops/tensor.h>
+#include <ATen/ops/zeros.h>
+#include <c10/util/irange.h>
+
+#include <cstddef>
+#include <memory>
+#include <queue>
+#include <stdexcept>
+#include <utility>
+#include <vector>
+
+#include "common.h"
+#include "tree_v2_impl.h"
+#include "tree_v2_node.h"
+
+namespace radix_tree_v2 {
+
+static NodeHandle node2id(TreeNode* node) {
+  return node->node_id;
+}
+
+// compare function for the TreeNode pointers based on their time
+// we use LRU, so we want to evict the least recently used nodes
+// since std::priority_queue is a max-heap, we need to reverse the comparison
+static constexpr auto cmp = [](TreeNode* lhs, TreeNode* rhs) { return lhs->time() > rhs->time(); };
+
+RadixTree::RadixTree(bool disabled, std::optional<std::size_t> host_size, std::size_t page_size, std::size_t threshold)
+    : m_impl(std::make_unique<Impl>(disabled, host_size.has_value(), page_size, host_size.value_or(0), threshold)) {}
+
+RadixTree::~RadixTree() = default;
+
+std::tuple<std::vector<at::Tensor>, std::size_t, NodeHandle, NodeHandle>
+RadixTree::match_prefix(const token_vec_t& _key) {
+  if (m_impl->disabled) return {};
+
+  const auto key = token_slice{_key.data(), m_impl->align(_key.size())};
+  const auto [host_node, _] = m_impl->tree_walk(key);
+
+  // walk up to the first non-evicted node
+  std::size_t host_hit_length = 0;
+  const auto device_node = host_node;
+
+  // collect all the device indices
+  std::vector<at::Tensor> indices{};
+  walk_to_root(device_node, [&](TreeNode* n) { indices.push_back(n->device_indices()); });
+  std::reverse(indices.begin(), indices.end());
+
+  return {std::move(indices), host_hit_length, node2id(device_node), node2id(host_node)};
+}
+
+std::vector<at::Tensor> RadixTree::evict(std::size_t num_tokens) {
+  if (m_impl->disabled || num_tokens == 0) return {};
+  auto heap = std::priority_queue{cmp, m_impl->collect_leaves_device()};
+  std::vector<at::Tensor> evicted_values;
+  // evict nodes until we reach the desired number of tokens
+  std::size_t num_evict = 0;
+  while (num_evict < num_tokens && !heap.empty()) {
+    const auto node = heap.top();
+    heap.pop();
+    // when ref_count == 0, can't be writing through
+    _assert(node->on_gpu() && node->ref_count == 0);
+    if (!node->is_io_free()) continue;  // skip nodes that are undergoing IO (i.e. indices protected)
+    evicted_values.push_back(node->device_indices());
+    num_evict += node->length();
+    const auto parent = node->parent();
+    m_impl->remove_device_node(node);
+    if (parent->is_leaf_device() && parent->ref_count == 0)
+      heap.push(parent);  // push parent to the heap if it is now a free leaf
+  }
+
+  return evicted_values;
+}
+
+std::tuple<std::vector<std::tuple<IOTicket, at::Tensor, at::Tensor>>, std::size_t>
+RadixTree::writing_through(const token_vec_t& _key, at::Tensor value) {
+  if (m_impl->disabled) return {};
+  _assert(_key.size() == std::size_t(value.size(0)), "Key and value must have the same size");
+
+  // just align the key to the page size, clip the unaligned tail
+  const auto key = token_slice{_key.data(), m_impl->align(_key.size())};
+
+  // walk the tree to find the right place to insert
+  const auto [host_node, host_prefix_length] = m_impl->tree_walk(key);
+
+  // insert and create a new node if the remaining part of the key is not empty
+  if (host_prefix_length != key.size()) {
+    m_impl->create_device_node(
+        host_node,
+        {key.begin() + host_prefix_length, key.end()},
+        value.slice(/*dim=*/0, host_prefix_length, key.size()));
+  }
+
+  // add the hit count for the device node
+  walk_to_root(host_node, [&](TreeNode* n) { n->hit_count++; });
+
+  std::vector<std::tuple<IOTicket, at::Tensor, at::Tensor>> result;
+
+  // don't write through if hicache is disabled (no host memory), fast path
+  if (!m_impl->use_hicache) return {std::move(result), host_prefix_length};
+  throw std::runtime_error("Not implemented yet");
+}
+
+std::tuple<IOTicket, std::vector<at::Tensor>> RadixTree::loading_onboard(NodeHandle, at::Tensor) {
+  if (m_impl->disabled) return {};
+  throw std::runtime_error("Not implemented yet");
+}
+
+void RadixTree::commit_writing_through(IOTicket, bool) {
+  if (m_impl->disabled) return;
+  throw std::runtime_error("Not implemented yet");
+}
+
+void RadixTree::commit_loading_onboard(IOTicket, bool) {
+  if (m_impl->disabled) return;
+  throw std::runtime_error("Not implemented yet");
+}
+
+void RadixTree::reset() {
+  m_impl->reset();
+}
+
+void RadixTree::lock_ref(NodeHandle node_id, bool increment) {
+  if (m_impl->disabled) return;
+  m_impl->lock_ref(node_id, increment);
+}
+
+std::size_t RadixTree::evictable_size() const {
+  return m_impl->evictable_size();
+}
+
+std::size_t RadixTree::protected_size() const {
+  return m_impl->protected_size();
+}
+
+std::size_t RadixTree::total_size() const {
+  return m_impl->total_size();
+}
+
+}  // namespace radix_tree_v2
diff --git a/sglang/python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2.h b/sglang/python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2.h
new file mode 100644
index 0000000000000000000000000000000000000000..68da9b9e179d6f477987176f0aa96b9a391739f5
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2.h
@@ -0,0 +1,59 @@
+#pragma once
+#include <ATen/core/TensorBody.h>
+#include <c10/core/Device.h>
+
+#include <cstddef>
+#include <memory>
+#include <optional>
+#include <tuple>
+#include <vector>
+
+#include "common.h"
+
+namespace radix_tree_v2 {
+
+struct RadixTree {
+ public:
+  RadixTree(bool disabled, std::optional<std::size_t> host_size, std::size_t page_size, std::size_t threshold);
+  ~RadixTree();
+
+  // Trees should not be copied or moved, as they manage their own memory and state.
+  RadixTree(const RadixTree&) = delete;
+  RadixTree(RadixTree&&) = delete;
+  RadixTree& operator=(const RadixTree&) = delete;
+  RadixTree& operator=(RadixTree&&) = delete;
+
+  /// @return (device indices that are matched, host indices length, device node, host node)
+  std::tuple<std::vector<at::Tensor>, std::size_t, NodeHandle, NodeHandle> match_prefix(const token_vec_t& key);
+  /// @return Device indices that need to be evicted (on python side).
+  std::vector<at::Tensor> evict(std::size_t num_tokens);
+  /// @brief (Un-)Lock a node.
+  void lock_ref(NodeHandle node_id, bool increment /* increment or decrement */);
+  /// @brief Update new key-value pair and try to perform write-through.
+  std::tuple<std::vector<std::tuple<IOTicket, at::Tensor, at::Tensor>>, std::size_t>
+  writing_through(const token_vec_t& key, at::Tensor value);
+  /// @brief Load to device from host within a range of nodes.
+  std::tuple<IOTicket, std::vector<at::Tensor>> loading_onboard(NodeHandle host_id, at::Tensor indices);
+  /// @brief Commit a transaction of write-through.
+  void commit_writing_through(IOTicket ticket, bool success);
+  /// @brief Commit a transaction of load onboard.
+  void commit_loading_onboard(IOTicket ticket, bool success);
+  /// @brief Clear and reset the tree.
+  void reset();
+
+  /// @return How many size are still evictable (on device + not locked).
+  std::size_t evictable_size() const;
+  /// @return How many size are protected (locked).
+  std::size_t protected_size() const;
+  /// @return How many size are used on device.
+  std::size_t total_size() const;
+
+  /// @brief Print debug information of the tree.
+  void debug_print() const;
+
+ private:
+  struct Impl;
+  std::unique_ptr<Impl> m_impl;
+};
+
+}  // namespace radix_tree_v2
diff --git a/sglang/python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2_binding.cpp b/sglang/python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2_binding.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..81069e4fe4d29e67eb4dcf40c2bb838a733a59b3
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2_binding.cpp
@@ -0,0 +1,32 @@
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <torch/extension.h>
+
+#include <cstddef>
+#include <optional>
+
+#include "tree_v2.h"
+
+PYBIND11_MODULE(radix_tree_cpp, m) {
+  using namespace radix_tree_v2;
+  namespace py = pybind11;
+  py::class_<RadixTree>(m, "RadixTree")
+      .def(
+          py::init<bool, std::optional<std::size_t>, std::size_t, std::size_t>(),
+          py::arg("disabled"),
+          py::arg("host_size"),
+          py::arg("page_size"),
+          py::arg("write_through_threshold"))
+      .def("match_prefix", &RadixTree::match_prefix)
+      .def("evict", &RadixTree::evict)
+      .def("lock_ref", &RadixTree::lock_ref)
+      .def("evictable_size", &RadixTree::evictable_size)
+      .def("protected_size", &RadixTree::protected_size)
+      .def("total_size", &RadixTree::total_size)
+      .def("writing_through", &RadixTree::writing_through)
+      .def("loading_onboard", &RadixTree::loading_onboard)
+      .def("commit_writing_through", &RadixTree::commit_writing_through)
+      .def("commit_loading_onboard", &RadixTree::commit_loading_onboard)
+      .def("reset", &RadixTree::reset)
+      .def("debug_print", &RadixTree::debug_print);
+}
diff --git a/sglang/python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2_debug.cpp b/sglang/python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2_debug.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..89b6290b1f87fa8519fa975b7ed5a684b84ac3d0
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2_debug.cpp
@@ -0,0 +1,194 @@
+#include <c10/core/DeviceType.h>
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/ScalarType.h>
+
+#include <cstddef>
+#include <cstdlib>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+
+#include "tree_v2.h"
+#include "tree_v2_impl.h"
+
+namespace radix_tree_v2 {
+
+void RadixTree::debug_print() const {
+  m_impl->debug_print(std::clog);
+}
+
+static constexpr auto npos = std::size_t(-1);
+
+void RadixTree::Impl::debug_print(std::ostream& os) const {
+  static constexpr auto _check = [](bool condition, auto msg, std::size_t id = npos) {
+    if (!condition) {
+      std::string suffix = id == npos ? "" : " [id = " + std::to_string(id) + "]";
+      throw std::runtime_error(std::string("RadixTree::debug_print failed: ") + msg + suffix);
+    }
+  };
+
+  static constexpr auto _print_node = [](TreeNode* node, std::size_t depth, std::ostream& os) {
+    const auto length = node->length();
+    os << node->node_id << " [depth = " << depth << "] [len = " << length << "]";
+
+    // placement status
+    if (node->on_both()) {
+      os << " [cpu + gpu]";
+    } else if (node->on_gpu()) {
+      os << " [gpu]";
+    } else if (node->on_cpu()) {
+      os << " [cpu]";
+    } else {
+      _check(false, "Node is not on GPU or CPU", node->node_id);
+    }
+
+    // IO status
+    if (node->is_io_free()) {
+      os << " [io = free]";
+    } else if (node->is_io_device_to_host()) {
+      os << " [io = gpu -> cpu]";
+    } else if (node->is_io_host_to_device()) {
+      os << " [io = cpu -> gpu]";
+    } else {
+      _check(false, "Node is in unknown IO state", node->node_id);
+    }
+
+    os << " [rc = " << node->ref_count << "]";
+    os << " [hit = " << node->hit_count << "]";
+  };
+
+  static constexpr auto _print_indices = [](at::Tensor indices, std::ostream& os) {
+    if (!indices.defined()) {
+      os << "[[N/A]]";
+      return indices;
+    }
+    indices = indices.to(c10::kCPU, c10::kLong, false, false, c10::MemoryFormat::Contiguous);
+    const auto length = indices.numel();
+    os << "[";
+    auto* data_ptr = indices.data_ptr<int64_t>();
+    for (const auto i : c10::irange(indices.size(0))) {
+      os << data_ptr[i];
+      if (i != length - 1) os << ", ";
+    }
+    os << "]";
+    return indices;
+  };
+
+  os << "Evictable size: " << evictable_size() << std::endl;
+  os << "Protected size: " << protected_size() << std::endl;
+  os << "Total size: " << const_cast<Impl*>(this)->total_size() << std::endl;
+  std::vector<std::tuple<TreeNode*, TreeNode*, token_slice>> stack;
+  auto root = const_cast<TreeNode*>(&m_root);
+  os << root->node_id << " [root]" << std::endl;
+  for (const auto& [key, child] : *root) {
+    stack.push_back({child.get(), root, key});
+  }
+
+  std::unordered_map<TreeNode*, std::size_t> depth_map;
+  std::string indent_buffer;
+  depth_map[root] = 0;
+  std::vector<NodeHandle> visited_id;
+  std::size_t evictable_size_real = 0;
+  while (!stack.empty()) {
+    const auto [node, parent, key] = stack.back();
+    stack.pop_back();
+    visited_id.push_back(node->node_id);
+    const auto nid = node->node_id;
+    _check(node != nullptr, "Node is null", nid);
+    _check(node->on_gpu() || node->on_cpu(), "Node is not on GPU or CPU", nid);
+    _check(node->parent() == parent, "Parent is not correct", nid);
+    _check(key.size() == page_size && node->diff_key(key, 0) == page_size, "Key is not correct", nid);
+    _check(depth_map.count(node) == 0, "Node is visited twice", nid);
+    _check(m_node_map.count(nid) == 1, "Node is not in the map", nid);
+    _check(m_node_map.at(nid) == node, "Node in the map is not the same as the one in the stack", nid);
+    _check(!node->on_gpu() || parent->is_root() || parent->on_gpu(), "Node on GPU must have a GPU/root parent", nid);
+    if (!node->is_io_free()) {
+      _check(node->ref_count > 0, "Node is in IO state but not protected", nid);
+      _check(node->on_both(), "Node in IO state must be on both CPU and GPU", nid);
+    }
+
+    if (node->on_gpu() && node->ref_count == 0) {
+      evictable_size_real += node->length();
+    }
+
+    const auto depth = (depth_map[node] = depth_map[parent] + 1);
+    indent_buffer.resize(depth * 2, ' ');
+    os << indent_buffer;
+    _print_node(node, depth, os);
+    os << std::endl;
+    for (const auto& [key, child] : *node) {
+      stack.push_back({child.get(), node, key});
+    }
+  }
+
+  _check(evictable_size_real == evictable_size(), "Evictable size is wrong");
+  _check(m_node_map.count(root->node_id) == 1, "Root node is not in the map");
+  _check(m_node_map.at(root->node_id) == root, "Root node in the map is not correct");
+
+  std::sort(visited_id.begin(), visited_id.end());
+  if (visited_id.size() != m_node_map.size() - 1) {
+    // Some error in the tree, not all nodes are visited
+    std::string id_list;
+    id_list += "(visited: ";
+    id_list += std::to_string(root->node_id) + " ";
+    for (const auto& id : visited_id) {
+      id_list += std::to_string(id) + " ";
+    }
+    id_list += "), (in map: ";
+    for (const auto& [id, _] : m_node_map) {
+      id_list += std::to_string(id) + " ";
+    }
+    id_list += ")";
+    _check(false, "Not all nodes are visited " + id_list);
+  }
+
+  static const auto kSGLANG_RADIX_CPP_DEBUG_LIMIT = [] {
+    const char* env = std::getenv("SGLANG_RADIX_CPP_DEBUG_LIMIT");
+    const std::size_t default_limit = 16;
+    if (env != nullptr) {
+      try {
+        return static_cast<std::size_t>(std::stoull(env));
+      } catch (const std::exception& e) {
+        std::cerr << "Invalid SGLANG_RADIX_CPP_DEBUG_LIMIT value: " << env  //
+                  << ". Using default value =" << default_limit << std::endl;
+      }
+    }
+    return default_limit;
+  }();
+
+  for (const auto nid : visited_id) {
+    const auto node = m_node_map.at(nid);
+    // print key and indices
+    const auto& key = node->_unsafe_tokens();
+    if (key.size() > kSGLANG_RADIX_CPP_DEBUG_LIMIT) {
+      os << "Node " << nid << ": key is too long (" << key.size() << " tokens), skipping..." << std::endl;
+      continue;
+    }
+    os << "Node " << nid << ": key = [";
+    for (const auto& i : c10::irange(key.size())) {
+      os << key[i];
+      if (i != key.size() - 1) os << ", ";
+    }
+
+    _check(key.size() % page_size == 0, "Misaligned key", nid);
+
+    os << "] device_indices = ";
+    const auto device_indices = _print_indices(node->device_indices(), os);
+    if (device_indices.defined()) {
+      std::size_t length = device_indices.numel();
+      _check(device_indices.dim() == 1, "Device indices must be 1D tensor", nid);
+      _check(length == node->length(), "Wrong device indices size", nid);
+    }
+
+    os << " host_indices = ";
+    const auto host_indices = _print_indices(node->host_indices(), os);
+    if (host_indices.defined()) {
+      std::size_t length = host_indices.numel();
+      _check(host_indices.dim() == 1, "Host indices must be 1D tensor", nid);
+      _check(length == node->length(), "Wrong host indices size", nid);
+    }
+    os << std::endl;
+  }
+}
+
+}  // namespace radix_tree_v2
diff --git a/sglang/python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2_impl.h b/sglang/python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..1f4a23cd578267c19615c4542409cd5fac7ea467
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2_impl.h
@@ -0,0 +1,286 @@
+#pragma once
+#include <c10/util/irange.h>
+
+#include <chrono>
+#include <cstddef>
+#include <iosfwd>
+#include <memory>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "common.h"
+#include "tree_v2.h"
+#include "tree_v2_node.h"
+
+namespace radix_tree_v2 {
+
+using node_iterator_t = typename TreeNode::iterator_t;
+
+struct RadixTree::Impl {
+ public:
+  Impl(bool disabled, bool use_hicache, std::size_t page_size, std::size_t host_size, std::size_t threshold)
+      : m_root(/*node_id_=*/0),
+        m_evictable_size(0),
+        m_protected_size(0),
+        m_cached_vec(),
+        m_node_map(),
+        m_node_counter(1),  // start from 1 to avoid confusion with root node
+        disabled(disabled),
+        use_hicache(use_hicache),
+        page_size(page_size),
+        threshold(threshold) {
+    _assert(page_size > 0, "Page size must be greater than zero");
+    _assert(use_hicache == (host_size > 0), "Hierarchical cache is enabled iff host size > 0");
+    m_root.ref_count = 1;                  // root node is always protected
+    m_cached_vec.reserve(page_size);       // to avoid repeated allocations
+    m_node_map[m_root.node_id] = &m_root;  // add root to the map
+  }
+
+  TreeNode* split_node(node_iterator_t iterator, std::size_t prefix_length) {
+    // from `parent -> old_node` to `parent-> new_node -> old_node`
+    // the prefix part of the old node is moved to the new node
+    auto old_node_ptr = std::move(iterator->second);
+    auto new_node_ptr = std::make_unique<TreeNode>(m_node_counter++);
+    auto* old_node = old_node_ptr.get();
+    auto* new_node = new_node_ptr.get();
+    auto* parent = old_node->parent();
+    // set up data structures
+    split_prefix(new_node, old_node, prefix_length);
+    // set up parent-child relationship
+    add_child(new_node, std::move(old_node_ptr));
+    add_child(parent, std::move(new_node_ptr), iterator);
+    m_node_map[new_node->node_id] = new_node;  // add to the map
+    return new_node;
+  }
+
+  // node: x -> [GPU]
+  TreeNode* create_device_node(TreeNode* parent, token_vec_t vec, at::Tensor indices) {
+    auto new_node_ptr = std::make_unique<TreeNode>(m_node_counter++);
+    auto new_node = new_node_ptr.get();
+    new_node_ptr->_unsafe_tokens() = std::move(vec);
+    new_node_ptr->_unsafe_device_indices() = std::move(indices);
+    m_evictable_size += new_node_ptr->length();
+    add_child(parent, std::move(new_node_ptr));
+    m_node_map[new_node->node_id] = new_node;  // add to the map
+    return new_node;
+  }
+
+  // node: [GPU] -> x
+  void remove_device_node(TreeNode* node) {
+    _assert(node->on_gpu_only() && node->ref_count == 0);
+    m_evictable_size -= node->length();
+    node->parent()->erase_child(get_key(node));
+    m_node_map.erase(node->node_id);  // remove from the map
+  }
+
+  /**
+   * @brief Walk the tree to find the node that matches the key.
+   * If the key partially matches a node, it will split that node.
+   * @return A pair containing the last node that matches the key and
+   * the total prefix length matched (on gpu and cpu) so far.
+   */
+  std::pair<TreeNode*, std::size_t> tree_walk(token_slice key) {
+    _assert(key.size() % page_size == 0, "Key should be page-aligned");
+
+    std::size_t total_prefix_length = 0;
+    TreeNode* node = &m_root;
+
+    const auto now = std::chrono::steady_clock::now();
+    while (key.size() > 0) {
+      const auto iterator = node->find_child(get_key(key));
+      if (iterator == node->end()) break;
+
+      // walk to the child node
+      node = iterator->second.get();
+
+      // at least `page_size` tokens are matched, and there may be more tokens to match
+      // the return value prefix_length is no less than `page_size`
+      const auto prefix_length = align(node->diff_key(key, page_size) + page_size);
+      total_prefix_length += prefix_length;
+
+      // split the node if the prefix is not the whole token vector
+      if (prefix_length < node->length()) {
+        return {split_node(iterator, prefix_length), total_prefix_length};
+      }
+
+      // we have matched the whole key, continue to the next node
+      node->access(now);
+      key = key.subspan(prefix_length);
+    }
+
+    return {node, total_prefix_length};
+  }
+
+  std::vector<TreeNode*> collect_leaves() const {
+    std::vector<TreeNode*> leaves;
+    std::vector<TreeNode*> stack = {};
+
+    auto process_node = [&](TreeNode* node) {
+      if (node->is_leaf()) {
+        if (node->ref_count == 0) {
+          leaves.push_back(node);
+        }
+      } else {
+        stack.push_back(node);
+      }
+    };
+    for (const auto& [_, child] : m_root) {
+      process_node(child.get());
+    }
+    while (!stack.empty()) {
+      const auto node = stack.back();
+      stack.pop_back();
+      for (const auto& [_, child] : *node) {
+        process_node(child.get());
+      }
+    }
+    return leaves;
+  }
+
+  std::vector<TreeNode*> collect_leaves_device() const {
+    // for non-hicache, every leaf device node is a leaf node (since no backup on host)
+    if (!use_hicache) return collect_leaves();
+    std::vector<TreeNode*> leaves;
+    std::vector<TreeNode*> stack = {};
+
+    auto process_node = [&](TreeNode* node) {
+      if (!node->on_gpu()) return;  // skip nodes that are not on GPU
+
+      if (node->is_leaf_device()) {
+        if (node->ref_count == 0) {
+          leaves.push_back(node);
+        }
+      } else {
+        stack.push_back(node);
+      }
+    };
+    for (const auto& [_, child] : m_root) {
+      process_node(child.get());
+    }
+    while (!stack.empty()) {
+      const auto node = stack.back();
+      stack.pop_back();
+      for (const auto& [_, child] : *node) {
+        process_node(child.get());
+      }
+    }
+
+    return leaves;
+  }
+
+  void lock_ref(TreeNode* node, bool increment) {
+    if (node->is_root()) return;  // skip root node
+    _assert(node->on_gpu(), "Cannot lock reference on an evicted node");
+    if (increment)
+      walk_to_root(node, [this](TreeNode* n) {
+        if (n->ref_count == 0) {
+          m_evictable_size -= n->length();
+          m_protected_size += n->length();
+        }
+        n->ref_count++;
+      });
+    else
+      walk_to_root(node, [this](TreeNode* n) {
+        _assert(n->ref_count != 0, "Cannot decrement reference count = zero");
+        n->ref_count--;
+        if (n->ref_count == 0) {
+          m_protected_size -= n->length();
+          m_evictable_size += n->length();
+        }
+      });
+  }
+
+  void lock_ref(NodeHandle node_ptr, bool increment) {
+    return lock_ref(id2node(node_ptr), increment);
+  }
+
+  void lock(TreeNode* node) {
+    return lock_ref(node, /*increment=*/true);
+  }
+
+  void unlock(TreeNode* node) {
+    return lock_ref(node, /*increment=*/false);
+  }
+
+  std::size_t total_size() const {
+    std::size_t size = 0;
+    std::vector<const TreeNode*> stack = {&m_root};
+    while (!stack.empty()) {
+      auto* node = stack.back();
+      stack.pop_back();
+      size += node->length();
+      for (const auto& [_, child] : *node)
+        stack.push_back(child.get());
+    }
+    return size;
+  }
+
+  std::size_t evictable_size() const {
+    return m_evictable_size;
+  }
+
+  std::size_t protected_size() const {
+    return m_protected_size;
+  }
+
+  std::size_t align(std::size_t size) const {
+    return (size / page_size) * page_size;  // align to page size
+  }
+
+  TreeNode* id2node(NodeHandle node_id) const {
+    const auto iterator = m_node_map.find(node_id);
+    _assert(iterator != m_node_map.end(), "Node not found in the map");
+    return iterator->second;
+  }
+
+  void reset() {
+    _assert(m_root.ref_count == 1, "Root node must be protected during reset");
+    m_node_counter = 1;  // reset node counter
+    m_root.root_reset();
+    m_evictable_size = 0;
+    m_protected_size = 0;
+    m_node_map.clear();
+    m_node_map[m_root.node_id] = &m_root;  // re-add root to the map
+  }
+
+  void debug_print(std::ostream& os) const;
+
+ private:
+  // some auxiliary functions
+  token_vec_t& get_key(token_slice tokens) {
+    _assert(tokens.size() >= page_size, "Key should be at least page-sized");
+    tokens = tokens.subspan(0, page_size);
+    m_cached_vec.assign(tokens.begin(), tokens.end());
+    return m_cached_vec;
+  }
+
+  // justify for _unsafe call: we need to read the key part of the tokens
+  token_vec_t& get_key(TreeNode* node) {
+    return get_key(node->_unsafe_tokens());
+  }
+
+  void add_child(TreeNode* parent, std::unique_ptr<TreeNode>&& child) {
+    parent->add_child(get_key(child.get()), std::move(child));
+  }
+
+  void add_child(TreeNode* parent, std::unique_ptr<TreeNode>&& child, node_iterator_t it) {
+    parent->add_child(it, std::move(child));
+  }
+
+  TreeNode m_root;                                        // root node of the tree
+  std::size_t m_evictable_size;                           // number of evictable tokens on GPU (lock ref = 0)
+  std::size_t m_protected_size;                           // number of protected tokens on GPU (lock ref > 0)
+  token_vec_t m_cached_vec;                               // cached vector of tokens for the current operation
+  std::unordered_map<std::size_t, TreeNode*> m_node_map;  // map of node keys to nodes
+  std::size_t m_node_counter;                             // counter for node IDs
+
+ public:
+  // some public constant configurations (without m_ prefix)
+  const bool disabled;          // whether the cache is enabled, or just a temporary cache
+  const bool use_hicache;       // whether to use the HiCache for this tree
+  const std::size_t page_size;  // size of each page in the cache
+  const std::size_t threshold;  // threshold for write_through
+};
+
+}  // namespace radix_tree_v2
diff --git a/sglang/python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2_node.h b/sglang/python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2_node.h
new file mode 100644
index 0000000000000000000000000000000000000000..4eac86ea4402d39253beccdb3eac4ee97c729cb6
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/cpp_radix_tree/tree_v2_node.h
@@ -0,0 +1,257 @@
+#pragma once
+#include <ATen/core/TensorBody.h>
+
+#include <algorithm>
+#include <array>
+#include <chrono>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <optional>
+#include <ranges>
+#include <unordered_map>
+
+#include "common.h"
+
+namespace radix_tree_v2 {
+
+struct std_vector_hash {
+  // see https://stackoverflow.com/questions/20511347/a-good-hash-function-for-a-vector
+  std::size_t operator()(const token_vec_t& vec) const {
+    std::size_t hash = 0;
+    for (const auto& token : vec) {
+      hash ^= token + 0x9e3779b9 + (hash << 6) + (hash >> 2);
+    }
+    return hash;
+  }
+};
+
+struct TreeNode {
+ public:
+  using childern_map_t = std::unordered_map<token_vec_t, std::unique_ptr<TreeNode>, std_vector_hash>;
+  using iterator_t = typename childern_map_t::iterator;
+  using const_iterator_t = typename childern_map_t::const_iterator;
+  using timestamp_t = std::chrono::steady_clock::time_point;
+
+  TreeNode(std::size_t node_id_)
+      : ref_count(0),
+        hit_count(0),
+        m_io_locked(std::nullopt),
+        m_io_status(IOStatus::None),
+        m_io_ticket(),
+        m_tokens(),
+        m_device_indices(),
+        m_host_indices(),
+        m_parent(),
+        m_children(),
+        m_last_access_time(std::chrono::steady_clock::now()),
+        node_id(node_id_) {}
+
+  void access(timestamp_t time = std::chrono::steady_clock::now()) {
+    m_last_access_time = time;
+  }
+
+  bool is_root() const {
+    return m_parent == nullptr;
+  }
+
+  timestamp_t time() const {
+    return m_last_access_time;
+  }
+
+  bool on_gpu() const {
+    return m_device_indices.defined();
+  }
+
+  bool on_cpu() const {
+    return m_host_indices.defined();
+  }
+
+  bool on_gpu_only() const {
+    return on_gpu() && !on_cpu();
+  }
+
+  bool on_cpu_only() const {
+    return !on_gpu() && on_cpu();
+  }
+
+  bool on_both() const {
+    return on_gpu() && on_cpu();
+  }
+
+  std::size_t length() const {
+    return m_tokens.size();
+  }
+
+  bool is_leaf() const {
+    return m_children.empty();
+  }
+
+  bool is_leaf_device() const {
+    for (const auto& [_, child] : m_children)
+      if (child->on_gpu()) return false;  // at least one child is on the device
+    return true;
+  }
+
+  void add_child(const token_vec_t& v, std::unique_ptr<TreeNode>&& child) {
+    child->m_parent = this;
+    m_children[v] = std::move(child);
+  }
+
+  void add_child(iterator_t it, std::unique_ptr<TreeNode>&& child) {
+    child->m_parent = this;
+    it->second = std::move(child);
+  }
+
+  void erase_child(const token_vec_t& v) {
+    _assert(m_children.erase(v) > 0, "Child node not found");
+  }
+
+  iterator_t find_child(const token_vec_t& v) {
+    return m_children.find(v);
+  }
+
+  iterator_t begin() {
+    return m_children.begin();
+  }
+
+  iterator_t end() {
+    return m_children.end();
+  }
+
+  const_iterator_t begin() const {
+    return m_children.begin();
+  }
+
+  const_iterator_t end() const {
+    return m_children.end();
+  }
+
+  TreeNode* parent() {
+    return m_parent;
+  }
+
+  // set up all data structures except for parent-child relationship
+  friend void split_prefix(TreeNode* new_node, TreeNode* old_node, std::size_t prefix_length) {
+    auto tokens = std::move(old_node->m_tokens);
+    _assert(0 < prefix_length && prefix_length < tokens.size(), "Invalid prefix size for split");
+
+    // set up tokens
+    old_node->m_tokens = token_vec_t(tokens.begin() + prefix_length, tokens.end());
+    new_node->m_tokens = std::move(tokens);
+    new_node->m_tokens.resize(prefix_length);
+
+    // set up values
+    const int64_t new_size = new_node->length();
+    const int64_t old_size = old_node->length();
+    if (old_node->m_device_indices.defined()) {
+      auto new_indices = old_node->m_device_indices.split_with_sizes({new_size, old_size});
+      new_node->m_device_indices = std::move(new_indices[0]);
+      old_node->m_device_indices = std::move(new_indices[1]);
+    }
+    if (old_node->m_host_indices.defined()) {
+      auto new_indices = old_node->m_host_indices.split_with_sizes({new_size, old_size});
+      new_node->m_host_indices = std::move(new_indices[0]);
+      old_node->m_host_indices = std::move(new_indices[1]);
+    }
+
+    // set up ref counts and hit counts
+    new_node->ref_count = old_node->ref_count;
+    new_node->hit_count = old_node->hit_count;
+
+    // If the old node (child) was locked for IO, the new node (parent) does not need
+    // to be locked, since it is naturally protected by the child node's lock.
+    if (old_node->m_io_locked.has_value()) {
+      new_node->m_io_locked = false;
+      new_node->m_io_status = old_node->m_io_status;
+      new_node->m_io_ticket = old_node->m_io_ticket;
+    }
+  }
+
+  /// @return The first index in `m_tokens` that differs from `key`.
+  std::size_t diff_key(token_slice key, std::size_t offset) const {
+    const auto a = token_slice{key}.subspan(offset);
+    const auto b = token_slice{m_tokens}.subspan(offset);
+    const auto [it_a, it_b] = std::ranges::mismatch(a, b);
+    return it_a - a.begin();  // return the index of the first differing token
+  }
+
+  at::Tensor device_indices() const {
+    return m_device_indices;
+  }
+  at::Tensor host_indices() const {
+    return m_host_indices;
+  }
+
+  // visiting tokens are always unsafe (use `diff_key` instead)
+  token_vec_t& _unsafe_tokens() {
+    return m_tokens;
+  }
+  at::Tensor& _unsafe_device_indices() {
+    return m_device_indices;
+  }
+  at::Tensor& _unsafe_host_indices() {
+    return m_host_indices;
+  }
+
+  bool is_io_free() const {
+    return m_io_status == IOStatus::None;
+  }
+
+  bool is_io_device_to_host() const {
+    return m_io_status == IOStatus::DeviceToHost;
+  }
+
+  bool is_io_host_to_device() const {
+    return m_io_status == IOStatus::HostToDevice;
+  }
+
+  void root_reset() {
+    _assert(is_root(), "Only root node can call root_reset");
+    _assert(
+        m_io_status == IOStatus::None && m_io_locked == std::nullopt,
+        "IO operation in progress, cannot reset root node");
+    _assert(this->m_tokens.empty(), "Root node tokens should be empty on reset");
+    _assert(
+        !this->m_device_indices.defined() && !this->m_host_indices.defined(),
+        "Root node indices should be always be empty and never assigned");
+    m_children.clear();
+    this->access();
+  }
+
+ public:
+  std::size_t ref_count;
+  std::size_t hit_count;
+
+ private:
+  enum class IOStatus : std::uint8_t {
+    None,
+    HostToDevice,
+    DeviceToHost,
+  };
+
+  std::optional<bool> m_io_locked;  // whether the node is locked in IO operation
+  IOStatus m_io_status;
+  IOTicket m_io_ticket;
+
+  token_vec_t m_tokens;
+  at::Tensor m_device_indices;  // indices of device value
+  at::Tensor m_host_indices;    // indices of host value
+  TreeNode* m_parent;
+  childern_map_t m_children;
+  timestamp_t m_last_access_time;
+
+ public:
+  const std::size_t node_id;  // unique ID for the node
+};
+
+template <typename F>
+inline TreeNode* walk_to_root(TreeNode* t, const F& f) {
+  while (!t->is_root()) {
+    f(t);
+    t = t->parent();
+  }
+  return t;  // return the root node
+}
+
+}  // namespace radix_tree_v2
diff --git a/sglang/python/sglang/srt/mem_cache/evict_policy.py b/sglang/python/sglang/srt/mem_cache/evict_policy.py
new file mode 100644
index 0000000000000000000000000000000000000000..687593418e33ec4148a744cd1830b4977ab885e4
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/evict_policy.py
@@ -0,0 +1,46 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, Tuple, Union
+
+if TYPE_CHECKING:
+    from sglang.srt.mem_cache.radix_cache import TreeNode
+
+
+class EvictionStrategy(ABC):
+    @abstractmethod
+    def get_priority(self, node: "TreeNode") -> Union[float, Tuple]:
+        pass
+
+
+class LRUStrategy(EvictionStrategy):
+    def get_priority(self, node: "TreeNode") -> float:
+        return node.last_access_time
+
+
+class LFUStrategy(EvictionStrategy):
+    def get_priority(self, node: "TreeNode") -> Tuple[int, float]:
+        return (node.hit_count, node.last_access_time)
+
+
+class FIFOStrategy(EvictionStrategy):
+    def get_priority(self, node: "TreeNode") -> float:
+        return node.creation_time
+
+
+class MRUStrategy(EvictionStrategy):
+    def get_priority(self, node: "TreeNode") -> float:
+        return -node.last_access_time
+
+
+class FILOStrategy(EvictionStrategy):
+    def get_priority(self, node: "TreeNode") -> float:
+        return -node.creation_time
+
+
+class PriorityStrategy(EvictionStrategy):
+    """Priority-aware eviction: lower priority values evicted first, then LRU within same priority."""
+
+    def get_priority(self, node: "TreeNode") -> Tuple[int, float]:
+        # Return (priority, last_access_time) so lower priority nodes are evicted first
+        return (node.priority, node.last_access_time)
diff --git a/sglang/python/sglang/srt/mem_cache/flush_cache.py b/sglang/python/sglang/srt/mem_cache/flush_cache.py
new file mode 100644
index 0000000000000000000000000000000000000000..73f80a2dbb6f03a29669caef6939f2f77ef89049
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/flush_cache.py
@@ -0,0 +1,33 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""
+Flush the KV cache.
+
+Usage:
+python3 -m sglang.srt.mem_cache.flush_cache --url http://localhost:30000
+"""
+
+import argparse
+
+import requests
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--url", type=str, default="http://localhost:30000")
+    args = parser.parse_args()
+
+    response = requests.post(args.url + "/flush_cache")
+    assert response.status_code == 200
diff --git a/sglang/python/sglang/srt/mem_cache/hicache_storage.py b/sglang/python/sglang/srt/mem_cache/hicache_storage.py
new file mode 100644
index 0000000000000000000000000000000000000000..02680db04a04b1b04d9d0b159c090f646b2348dd
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/hicache_storage.py
@@ -0,0 +1,292 @@
+import hashlib
+import logging
+import os
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Any, List, Optional
+
+import torch
+
+from sglang.srt.environ import envs
+from sglang.srt.mem_cache.memory_pool_host import HostKVCache
+
+logger = logging.getLogger(__name__)
+
+
+def get_hash_str(token_ids: List[int], prior_hash: str = None) -> str:
+    hasher = hashlib.sha256()
+
+    if prior_hash:
+        hasher.update(bytes.fromhex(prior_hash))
+
+    for t in token_ids:
+        if isinstance(t, tuple):
+            # EAGLE bigram mode: hash both elements to uniquely identify the bigram
+            for elem in t:
+                hasher.update(elem.to_bytes(4, byteorder="little", signed=False))
+        else:
+            # Regular mode: single integer token
+            hasher.update(t.to_bytes(4, byteorder="little", signed=False))
+
+    return hasher.hexdigest()
+
+
+def hash_str_to_int64(hash_str: str) -> int:
+    """Convert SHA256 hex string to signed 64-bit integer for events.
+
+    Takes first 16 hex characters (64 bits) and converts to signed int64 range.
+    """
+    # Take first 16 hex chars to get 64-bit value
+    uint64_val = int(hash_str[:16], 16)
+    # Convert to signed int64 range [-2^63, 2^63-1]
+    if uint64_val >= 2**63:
+        return uint64_val - 2**64
+    return uint64_val
+
+
+@dataclass
+class HiCacheStorageConfig:
+    tp_rank: int
+    tp_size: int
+    pp_rank: int
+    pp_size: int
+    is_mla_model: bool
+    is_page_first_layout: bool
+    model_name: Optional[str]
+    tp_lcm_size: Optional[int] = None
+    should_split_heads: bool = False
+    extra_config: Optional[dict] = None
+
+
+@dataclass
+class HiCacheStorageExtraInfo:
+    prefix_keys: Optional[List[str]] = (None,)
+    extra_info: Optional[dict] = None
+
+
+class HiCacheStorage(ABC):
+    """
+    HiCacheStorage is a class that provides a generic key-value interface for storing and retrieving KV cache.
+    It abstracts the underlying storage mechanism, allowing different implementations to be used.
+    """
+
+    # todo, the page size of storage backend does not have to be the same as the same as host memory pool
+
+    def register_mem_pool_host(self, mem_pool_host: HostKVCache):
+        self.mem_pool_host = mem_pool_host
+
+    def batch_get_v1(
+        self,
+        keys: List[str],
+        host_indices: torch.Tensor,
+        extra_info: Optional[HiCacheStorageExtraInfo] = None,
+    ) -> List[bool]:
+        """
+        Retrieve values for multiple keys.
+        Returns a list of booleans indicating success for each key.
+        """
+        pass
+
+    def batch_set_v1(
+        self,
+        keys: List[str],
+        host_indices: torch.Tensor,
+        extra_info: Optional[HiCacheStorageExtraInfo] = None,
+    ) -> List[bool]:
+        """
+        Store multiple key-value pairs.
+        Returns a list of booleans indicating success for each key.
+        """
+        pass
+
+    @abstractmethod
+    def get(
+        self,
+        key: str,
+        target_location: Optional[Any] = None,
+        target_sizes: Optional[Any] = None,
+    ) -> torch.Tensor | None:
+        """
+        Retrieve the value associated with the given key.
+        Returns None if the key does not exist.
+        """
+        pass
+
+    # TODO: Deprecate
+    @abstractmethod
+    def batch_get(
+        self,
+        keys: List[str],
+        target_locations: Optional[Any] = None,
+        target_sizes: Optional[Any] = None,
+    ) -> List[torch.Tensor | None] | int:
+        """
+        Retrieve values for multiple keys.
+        Returns a list of tensors or None for each key.
+        """
+        pass
+
+    @abstractmethod
+    def set(
+        self,
+        key: str,
+        value: Optional[Any] = None,
+        target_location: Optional[Any] = None,
+        target_sizes: Optional[Any] = None,
+    ) -> bool:
+        """
+        Store the value associated with the given key.
+        Returns True if the operation was successful, False otherwise.
+        """
+        pass
+
+    # TODO: Deprecate
+    @abstractmethod
+    def batch_set(
+        self,
+        keys: List[str],
+        values: Optional[Any] = None,
+        target_locations: Optional[Any] = None,
+        target_sizes: Optional[Any] = None,
+    ) -> bool:
+        """
+        Store multiple key-value pairs.
+        Returns True if all operations were successful, False otherwise.
+        """
+        pass
+
+    @abstractmethod
+    def exists(self, key: str) -> bool:
+        """
+        Check if the key exists in the storage.
+        Returns True if the key exists, False otherwise.
+        """
+        pass
+
+    # TODO: Use a finer-grained return type (e.g., List[bool])
+    def batch_exists(
+        self, keys: List[str], extra_info: Optional[HiCacheStorageExtraInfo] = None
+    ) -> int:
+        """
+        Check if the keys exist in the storage.
+        return the number of consecutive existing keys from the start.
+        Can be overridden by subclasses for more efficient implementation.
+        """
+        for i in range(len(keys)):
+            if not self.exists(keys[i]):
+                return i
+        return len(keys)
+
+    def clear(self) -> None:
+        pass
+
+    def get_stats(self):
+        return None
+
+
+class HiCacheFile(HiCacheStorage):
+
+    def __init__(
+        self, storage_config: HiCacheStorageConfig, file_path: str = "/tmp/hicache"
+    ):
+        self.file_path = envs.SGLANG_HICACHE_FILE_BACKEND_STORAGE_DIR.get() or file_path
+
+        tp_rank, tp_size, model_name, is_mla_model = (
+            storage_config.tp_rank,
+            storage_config.tp_size,
+            storage_config.model_name,
+            storage_config.is_mla_model,
+        )
+        model_name = "-".join(model_name.split("/")) if model_name else ""
+        if is_mla_model:
+            self.config_suffix = f"_{model_name}"
+        else:
+            self.config_suffix = f"_{model_name}_{tp_rank}_{tp_size}"
+
+        if not os.path.exists(self.file_path) and tp_rank == 0:
+            os.makedirs(self.file_path)
+            logger.info(f"Created HiCacheFile storage directory at {self.file_path}")
+
+    def _get_suffixed_key(self, key: str) -> str:
+        return key + self.config_suffix
+
+    def get(
+        self,
+        key: str,
+        target_location: torch.Tensor,
+        target_sizes: Optional[Any] = None,
+    ) -> torch.Tensor | None:
+        key = self._get_suffixed_key(key)
+        tensor_path = os.path.join(self.file_path, f"{key}.bin")
+        try:
+            expected = target_location.numel() * target_location.element_size()
+            with open(tensor_path, "rb", buffering=0) as f:
+                buf = memoryview(target_location.view(torch.uint8).contiguous().numpy())
+                if f.readinto(buf) != expected:
+                    raise IOError(f"Short read for {key}")
+            return target_location
+        except FileNotFoundError:
+            logger.warning(f"Failed to fetch {key} from HiCacheFile storage.")
+            return None
+
+    def batch_get(
+        self,
+        keys: List[str],
+        target_locations: List[torch.Tensor],
+        target_sizes: Optional[Any] = None,
+    ) -> List[torch.Tensor | None]:
+        return [
+            self.get(key, target_location)
+            for key, target_location in zip(
+                keys, target_locations or [None] * len(keys)
+            )
+        ]
+
+    def set(
+        self,
+        key: str,
+        value: Optional[Any] = None,
+        target_location: Optional[Any] = None,
+        target_sizes: Optional[Any] = None,
+    ) -> bool:
+        if self.exists(key):
+            logger.debug(f"Key {key} already exists. Skipped.")
+            return True
+
+        key = self._get_suffixed_key(key)
+        tensor_path = os.path.join(self.file_path, f"{key}.bin")
+        try:
+            value.contiguous().view(dtype=torch.uint8).numpy().tofile(tensor_path)
+            return True
+        except Exception as e:
+            logger.error(f"Failed to save tensor {key}: {e}")
+            return False
+
+    def batch_set(
+        self,
+        keys: List[str],
+        values: Optional[Any] = None,
+        target_locations: Optional[Any] = None,
+        target_sizes: Optional[Any] = None,
+    ) -> bool:
+        for key, value in zip(keys, values):
+            if not self.set(key, value):
+                return False
+        return True
+
+    def exists(self, key: str) -> bool:
+        key = self._get_suffixed_key(key)
+        tensor_path = os.path.join(self.file_path, f"{key}.bin")
+        return os.path.exists(tensor_path)
+
+    def clear(self) -> bool:
+        try:
+            for filename in os.listdir(self.file_path):
+                file_path = os.path.join(self.file_path, filename)
+                if os.path.isfile(file_path):
+                    os.remove(file_path)
+            logger.info("Cleared all entries in HiCacheFile storage.")
+            return True
+        except Exception as e:
+            logger.error(f"Failed to clear HiCacheFile storage: {e}")
+            return False
diff --git a/sglang/python/sglang/srt/mem_cache/hiradix_cache.py b/sglang/python/sglang/srt/mem_cache/hiradix_cache.py
new file mode 100644
index 0000000000000000000000000000000000000000..2acb01157931462682fb935c8d00b17a785ebe2a
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/hiradix_cache.py
@@ -0,0 +1,1545 @@
+from __future__ import annotations
+
+import atexit
+import heapq
+import json
+import logging
+import os
+import threading
+import time
+from queue import Empty
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
+
+import torch
+
+from sglang.srt.environ import envs
+from sglang.srt.managers.cache_controller import HiCacheController, PrefetchOperation
+from sglang.srt.mem_cache.base_prefix_cache import (
+    EvictParams,
+    EvictResult,
+    InsertParams,
+    InsertResult,
+    MatchPrefixParams,
+    MatchResult,
+)
+from sglang.srt.mem_cache.memory_pool import (
+    MHATokenToKVPool,
+    MLATokenToKVPool,
+    NSATokenToKVPool,
+)
+from sglang.srt.mem_cache.memory_pool_host import (
+    MHATokenToKVPoolHost,
+    MLATokenToKVPoolHost,
+    NSATokenToKVPoolHost,
+)
+from sglang.srt.mem_cache.radix_cache import (
+    RadixCache,
+    RadixKey,
+    TreeNode,
+    compute_node_hash_values,
+    split_node_hash_value,
+)
+from sglang.srt.mem_cache.utils import convert_to_bigram_key
+from sglang.srt.observability.metrics_collector import StorageMetricsCollector
+from sglang.srt.utils import bind_to_closest_numa_node_cuda
+
+if TYPE_CHECKING:
+    from sglang.srt.mem_cache.cache_init_params import CacheInitParams
+    from sglang.srt.server_args import ServerArgs
+
+logger = logging.getLogger(__name__)
+
+
+class HiRadixCache(RadixCache):
+
+    def __init__(self, params: CacheInitParams, server_args: ServerArgs):
+        self._enable_metrics_flag = params.enable_metrics
+        if server_args.hicache_io_backend == "direct":
+            # FIXME: move this logic into server_args parsing
+            if server_args.hicache_mem_layout == "page_first":
+                server_args.hicache_mem_layout = "page_first_direct"
+                logger.warning(
+                    "Page first layout is not supported with direct IO backend, switching to page first direct layout"
+                )
+
+        if not server_args.disable_hicache_numa_detect:
+            bind_to_closest_numa_node_cuda()
+
+        self.page_size = params.page_size
+        self.kv_cache = params.token_to_kv_pool_allocator.get_kvcache()
+
+        if isinstance(self.kv_cache, MHATokenToKVPool):
+            self.token_to_kv_pool_host = MHATokenToKVPoolHost(
+                self.kv_cache,
+                server_args.hicache_ratio,
+                server_args.hicache_size,
+                self.page_size,
+                server_args.hicache_mem_layout,
+                allocator_type=server_args.hicache_storage_backend,
+            )
+        elif isinstance(self.kv_cache, NSATokenToKVPool):
+            self.token_to_kv_pool_host = NSATokenToKVPoolHost(
+                self.kv_cache,
+                server_args.hicache_ratio,
+                server_args.hicache_size,
+                self.page_size,
+                server_args.hicache_mem_layout,
+                allocator_type=server_args.hicache_storage_backend,
+            )
+        elif isinstance(self.kv_cache, MLATokenToKVPool):
+            self.token_to_kv_pool_host = MLATokenToKVPoolHost(
+                self.kv_cache,
+                server_args.hicache_ratio,
+                server_args.hicache_size,
+                self.page_size,
+                server_args.hicache_mem_layout,
+                allocator_type=server_args.hicache_storage_backend,
+            )
+        else:
+            raise ValueError(f"HiRadixCache only supports MHA and MLA yet")
+
+        self.tp_group = params.tp_cache_group
+        self.tp_world_size = torch.distributed.get_world_size(group=self.tp_group)
+        self.pp_rank = params.pp_rank
+        self.pp_size = params.pp_size
+        self.enable_storage = server_args.hicache_storage_backend is not None
+        self.enable_storage_metrics = self.enable_storage and params.enable_metrics
+        self.extra_metric_labels = server_args.extra_metric_labels
+
+        (
+            extra_config,
+            prefetch_threshold,
+            prefetch_timeout_base,
+            prefetch_timeout_per_ki_token,
+            hicache_storage_pass_prefix_keys,
+        ) = self._parse_storage_backend_extra_config(
+            server_args.hicache_storage_backend_extra_config
+        )
+        # TODO: support more timeout check functions
+        self.is_prefetch_timeout = self._prefetch_timeout_check_linear_func
+        self.prefetch_stop_policy = server_args.hicache_storage_prefetch_policy
+
+        self.load_cache_event = threading.Event()
+        self.cache_controller = HiCacheController(
+            params.token_to_kv_pool_allocator,
+            self.token_to_kv_pool_host,
+            self.page_size,
+            self.tp_group,
+            load_cache_event=self.load_cache_event,
+            write_policy=server_args.hicache_write_policy,
+            io_backend=server_args.hicache_io_backend,
+            storage_backend=server_args.hicache_storage_backend,
+            prefetch_threshold=prefetch_threshold,
+            model_name=server_args.served_model_name,
+            storage_backend_extra_config=extra_config,
+            pp_rank=self.pp_rank,
+            pp_size=self.pp_size,
+        )
+        self._apply_storage_runtime_config(
+            storage_backend=server_args.hicache_storage_backend,
+            prefetch_threshold=prefetch_threshold,
+            prefetch_timeout_base=prefetch_timeout_base,
+            prefetch_timeout_per_ki_token=prefetch_timeout_per_ki_token,
+            hicache_storage_pass_prefix_keys=hicache_storage_pass_prefix_keys,
+            enable_storage=self.enable_storage,
+            enable_storage_metrics=self.enable_storage_metrics,
+            extra_metric_labels=self.extra_metric_labels,
+        )
+
+        # record the nodes with ongoing write through
+        self.ongoing_write_through = {}
+        # record the node segments with ongoing load back
+        self.ongoing_load_back = {}
+        # record the ongoing prefetch requests
+        self.ongoing_prefetch = {}
+        self.ongoing_backup = {}
+        # track per-request tokens loaded from storage (L3 hits)
+        # key: request_id, value: number of tokens actually loaded from storage
+        self.prefetch_loaded_tokens_by_reqid: dict[str, int] = {}
+        # todo: dynamically adjust the threshold
+        self.write_through_threshold = (
+            1 if server_args.hicache_write_policy == "write_through" else 2
+        )
+        self.load_back_threshold = 10
+
+        # Detach storage backend automatically on process shutdown
+        atexit.register(self.shutdown)
+
+        self.evictable_host_leaves = set()
+
+        # Pin budget: max tokens that can be pinned = ratio * host pool capacity.
+        pin_ratio = envs.SGLANG_HICACHE_MAX_PINNED_RATIO.get()
+        if pin_ratio < 0 or pin_ratio >= 1:
+            raise ValueError(
+                f"SGLANG_HICACHE_MAX_PINNED_RATIO must be in [0, 1), got {pin_ratio}"
+            )
+        self._max_pinned_tokens = int(self.token_to_kv_pool_host.size * pin_ratio)
+        self.pinned_size_ = 0
+        logger.info(
+            "Pin budget: %d tokens (ratio=%.3f)", self._max_pinned_tokens, pin_ratio
+        )
+
+        super().__init__(params=params)
+
+    def shutdown(self):
+        """Best-effort auto-detach of storage backend on process shutdown.
+
+        This keeps startup and runtime behavior consistent: if a backend was attached
+        (either via CLI args or via admin API), we attempt to detach it on exit.
+        """
+        try:
+            if self.enable_storage:
+                self.detach_storage_backend()
+        except Exception:
+            logger.exception("Failed to detach storage backend on process shutdown.")
+
+    def _apply_storage_runtime_config(
+        self,
+        *,
+        storage_backend: Optional[str],
+        prefetch_threshold: int,
+        prefetch_timeout_base: float,
+        prefetch_timeout_per_ki_token: float,
+        hicache_storage_pass_prefix_keys: bool,
+        enable_storage: bool,
+        enable_storage_metrics: bool,
+        extra_metric_labels: Optional[Dict[str, str]],
+    ) -> None:
+        prefetch_timeout_per_page = (
+            self.page_size / 1024 * prefetch_timeout_per_ki_token
+        )
+
+        self.enable_storage = enable_storage
+        self.prefetch_threshold = prefetch_threshold
+        self.prefetch_timeout_base = prefetch_timeout_base
+        self.prefetch_timeout_per_page = prefetch_timeout_per_page
+        self.hicache_storage_pass_prefix_keys = hicache_storage_pass_prefix_keys
+        self.enable_storage_metrics = enable_storage_metrics
+
+        if self.enable_storage_metrics:
+            labels = {
+                "storage_backend": storage_backend,
+                "tp_rank": self.cache_controller.tp_rank,
+                "dp_rank": self.cache_controller.dp_rank,
+                "pp_rank": self.cache_controller.pp_rank,
+                "pp_size": self.cache_controller.pp_size,
+            }
+            if extra_metric_labels:
+                labels.update(extra_metric_labels)
+            existing_collector = getattr(self, "storage_metrics_collector", None)
+            if existing_collector is None:
+                self.storage_metrics_collector = StorageMetricsCollector(labels=labels)
+            elif set(existing_collector.labels.keys()) == set(labels.keys()):
+                existing_collector.labels = labels
+            else:
+                logger.warning(
+                    "Storage metrics labels changed (%s -> %s). Keep existing labels to "
+                    "avoid duplicate metric registration.",
+                    sorted(existing_collector.labels.keys()),
+                    sorted(labels.keys()),
+                )
+
+    def attach_storage_backend(
+        self,
+        storage_backend: str,
+        storage_backend_extra_config_json: Optional[str] = None,
+        served_model_name: Optional[str] = None,
+        hicache_storage_prefetch_policy: Optional[str] = None,
+        hicache_write_policy: Optional[str] = None,
+    ) -> tuple[bool, str]:
+        """Attach (enable) storage backend at runtime.
+
+        This will start storage threads inside `HiCacheController` and enable
+        prefetch/backup paths. Caller must ensure there are no running/queued
+        requests to avoid races.
+        """
+        # Validate inputs first (no side effects).
+        if hicache_storage_prefetch_policy is not None:
+            allowed = ["best_effort", "wait_complete", "timeout"]
+            if hicache_storage_prefetch_policy not in allowed:
+                return (
+                    False,
+                    f"Invalid hicache_storage_prefetch_policy: {hicache_storage_prefetch_policy!r}. "
+                    f"Expected one of {allowed}.",
+                )
+
+        if hicache_write_policy is not None:
+            allowed = ["write_back", "write_through", "write_through_selective"]
+            if hicache_write_policy not in allowed:
+                return (
+                    False,
+                    f"Invalid hicache_write_policy: {hicache_write_policy!r}. "
+                    f"Expected one of {allowed}.",
+                )
+
+        # If already enabled:
+        # - backend unchanged: treat as success, update policies only.
+        # - backend changed: treat as failure, do NOT update policies.
+        if self.enable_storage:
+            current_backend = self.cache_controller.storage_backend_type
+
+            if current_backend == storage_backend:
+                if hicache_storage_prefetch_policy is not None:
+                    self.prefetch_stop_policy = hicache_storage_prefetch_policy
+                    logger.info(
+                        f"Set hicache_storage_prefetch_policy to {hicache_storage_prefetch_policy}"
+                    )
+                if hicache_write_policy is not None:
+                    self.cache_controller.write_policy = hicache_write_policy
+                    self.write_through_threshold = (
+                        1 if hicache_write_policy == "write_through" else 2
+                    )
+                    logger.info(f"Set hicache_write_policy to {hicache_write_policy}")
+                return (
+                    True,
+                    "HiCache storage backend already enabled with same backend; policies updated.",
+                )
+
+            return (
+                False,
+                f"HiCache storage backend is already enabled with backend '{current_backend}'. "
+                f"Cannot attach different backend '{storage_backend}'. Detach first.",
+            )
+
+        # Not enabled: update policies before controller attach so storage threads observe new values.
+        if hicache_storage_prefetch_policy is not None:
+            self.prefetch_stop_policy = hicache_storage_prefetch_policy
+            logger.info(
+                f"Set hicache_storage_prefetch_policy to {hicache_storage_prefetch_policy}"
+            )
+
+        if hicache_write_policy is not None:
+            self.cache_controller.write_policy = hicache_write_policy
+            self.write_through_threshold = (
+                1 if hicache_write_policy == "write_through" else 2
+            )
+            logger.info(f"Set hicache_write_policy to {hicache_write_policy}")
+
+        logger.info(f"Attaching HiCache storage backend: {storage_backend}")
+        try:
+            (
+                extra_config,
+                prefetch_threshold,
+                prefetch_timeout_base,
+                prefetch_timeout_per_ki_token,
+                hicache_storage_pass_prefix_keys,
+            ) = self._parse_storage_backend_extra_config(
+                storage_backend_extra_config_json
+            )
+        except Exception as e:
+            logger.exception(f"Failed to parse storage_backend_extra_config_json: {e}")
+            return (
+                False,
+                f"Failed to parse storage_backend_extra_config_json '{storage_backend_extra_config_json}': {e}",
+            )
+
+        try:
+            self.cache_controller.attach_storage_backend(
+                storage_backend=storage_backend,
+                prefetch_threshold=prefetch_threshold,
+                model_name=served_model_name,
+                storage_backend_extra_config=extra_config,
+            )
+        except Exception as e:
+            logger.exception(
+                f"Failed to attach storage backend '{storage_backend}': {e}"
+            )
+            return False, f"Failed to attach storage backend '{storage_backend}': {e}"
+
+        self._apply_storage_runtime_config(
+            storage_backend=storage_backend,
+            prefetch_threshold=prefetch_threshold,
+            prefetch_timeout_base=prefetch_timeout_base,
+            prefetch_timeout_per_ki_token=prefetch_timeout_per_ki_token,
+            hicache_storage_pass_prefix_keys=hicache_storage_pass_prefix_keys,
+            enable_storage=True,
+            enable_storage_metrics=self._enable_metrics_flag,
+            extra_metric_labels=self.extra_metric_labels,
+        )
+        return True, "Attached HiCache storage backend successfully."
+
+    def detach_storage_backend(self) -> tuple[bool, str]:
+        """Detach (disable) storage backend at runtime.
+
+        Caller must ensure there are no running/queued requests to avoid races.
+        """
+        try:
+            # Drain any pending control queues before tearing down storage threads/backend.
+            # IMPORTANT: this must happen before we clear `ongoing_*`, otherwise acks/releases
+            # cannot be matched to nodes and may leak host pages / locks.
+            self._drain_storage_control_queues_local()
+            # Idempotent detach: always ask controller to best-effort cleanup, even if
+            # `self.enable_storage` is already False (may be leftover state from a
+            # previous partial detach).
+            self.cache_controller.detach_storage_backend()
+        except Exception as e:
+            logger.exception("Failed to detach storage backend.")
+            # Do NOT crash the server for admin operations. Return failure with detail.
+            return False, f"Failed to detach HiCache storage backend: {e}"
+
+        # Best-effort cleanup of any leftover bookkeeping.
+        self._drain_storage_control_queues_local()
+        # After controller threads are fully stopped, it's safe to force-release any
+        # leftover pending ops (e.g., async prefetch/backup that didn't get a revoke/ack).
+        self._force_release_pending_storage_ops()
+
+        self.enable_storage = False
+        self.enable_storage_metrics = False
+        return True, "Detached HiCache storage backend successfully."
+
+    def _force_release_pending_storage_ops(self):
+        """Force release any leftover pending prefetch/backup bookkeeping.
+
+        This is a safety net for detach/shutdown paths. It assumes storage threads
+        have been stopped already (via controller.detach), so no concurrent access
+        to these structures should happen.
+        """
+        cc = self.cache_controller
+
+        # Force release leftover prefetch ops: free pre-allocated host pages and
+        # drop the host protection on the matched prefix node.
+        try:
+            for req_id, info in list(self.ongoing_prefetch.items()):
+                try:
+                    last_host_node, token_ids, host_indices, _operation = info
+                except Exception:
+                    # Unexpected shape; just drop it.
+                    self.ongoing_prefetch.pop(req_id, None)
+                    continue
+
+                try:
+                    if host_indices is not None:
+                        cc.mem_pool_host.free(host_indices)
+                except Exception:
+                    logger.exception(
+                        "Failed to free host indices for prefetch %s", req_id
+                    )
+
+                try:
+                    last_host_node.release_host()
+                except Exception:
+                    logger.exception(
+                        "Failed to release host protection for prefetch %s", req_id
+                    )
+
+                try:
+                    cc.prefetch_tokens_occupied -= len(token_ids)
+                    if cc.prefetch_tokens_occupied < 0:
+                        cc.prefetch_tokens_occupied = 0
+                except Exception:
+                    pass
+
+                self.ongoing_prefetch.pop(req_id, None)
+        except Exception:
+            logger.exception("Force release pending prefetch ops failed.")
+
+        # Force release leftover backup ops: drop host protection on nodes.
+        try:
+            for ack_id, node in list(self.ongoing_backup.items()):
+                try:
+                    node.release_host()
+                except Exception:
+                    logger.exception(
+                        "Failed to release host protection for backup op %s", ack_id
+                    )
+                self.ongoing_backup.pop(ack_id, None)
+        except Exception:
+            logger.exception("Force release pending backup ops failed.")
+
+    def _drain_storage_control_queues_local(self):
+        """Drain storage control queues without TP synchronization.
+
+        This is intended for shutdown/detach paths where we want to make best-effort
+        cleanup even if queue sizes temporarily differ across ranks.
+        """
+        self._drain_storage_control_queues_impl(
+            n_revoke=None,
+            n_backup=None,
+            n_release=None,
+            log_metrics=False,
+        )
+
+    def _drain_storage_control_queues_impl(
+        self,
+        n_revoke: Optional[int],
+        n_backup: Optional[int],
+        n_release: Optional[int],
+        log_metrics: bool,
+    ):
+        cc = self.cache_controller
+
+        def _drain_queue(q, limit: Optional[int]):
+            drained = 0
+            while limit is None or drained < limit:
+                try:
+                    item = q.get_nowait()
+                except Empty:
+                    break
+                drained += 1
+                yield item
+
+        def _drain_revoke():
+            for req_id in _drain_queue(cc.prefetch_revoke_queue, n_revoke):
+                info = self.ongoing_prefetch.pop(req_id, None)
+                if info is not None:
+                    last_host_node, token_ids, _, _ = info
+                    last_host_node.release_host()
+                    cc.prefetch_tokens_occupied -= len(token_ids)
+                    if cc.prefetch_tokens_occupied < 0:
+                        cc.prefetch_tokens_occupied = 0
+
+        def _drain_backup():
+            for operation in _drain_queue(cc.ack_backup_queue, n_backup):
+                ack_id = operation.id
+                entry = self.ongoing_backup.pop(ack_id, None)
+                if entry is not None:
+                    entry.release_host()
+                if log_metrics and self.enable_storage_metrics:
+                    self.storage_metrics_collector.log_backuped_tokens(
+                        operation.completed_tokens
+                    )
+
+        def _drain_release():
+            host_indices_list = []
+            for host_indices in _drain_queue(cc.host_mem_release_queue, n_release):
+                host_indices_list.append(host_indices)
+            if host_indices_list:
+                host_indices = torch.cat(host_indices_list, dim=0)
+                cc.mem_pool_host.free(host_indices)
+
+        _drain_revoke()
+        _drain_backup()
+        _drain_release()
+
+    def _parse_storage_backend_extra_config(
+        self, storage_backend_extra_config: Optional[str]
+    ):
+        """
+        Parse storage backend extra config JSON and extract specific parameters.
+
+        Args:
+            storage_backend_extra_config: JSON string containing extra configuration
+
+        Returns:
+            tuple: (extra_config_dict, prefetch_threshold, prefetch_timeout_base, prefetch_timeout_per_ki_token, hicache_storage_pass_prefix_keys)
+        """
+        # Parse extra config if provided. Extra config can be a JSON string or a json/toml/yaml file path prefixed with "@".
+        extra_config = {}
+        if storage_backend_extra_config:
+            try:
+                if storage_backend_extra_config.startswith("@"):
+                    # Read config from a json/toml/yaml file
+                    path = storage_backend_extra_config[1:]
+                    ext = os.path.splitext(path)[1].lower()
+                    with open(path, "rb" if ext == ".toml" else "r") as f:
+                        if ext == ".json":
+                            extra_config = json.load(f)
+                        elif ext == ".toml":
+                            import tomllib
+
+                            extra_config = tomllib.load(f)
+                        elif ext in (".yaml", ".yml"):
+                            import yaml
+
+                            extra_config = yaml.safe_load(f)
+                        else:
+                            raise ValueError(
+                                f"Unsupported config file {path} (config format: {ext})"
+                            )
+                else:
+                    # read config from JSON string
+                    extra_config = json.loads(storage_backend_extra_config)
+            except Exception as e:
+                logger.error(f"Invalid backend extra config JSON: {e}")
+                raise e
+
+        prefetch_threshold = extra_config.pop("prefetch_threshold", 256)  # tokens
+        prefetch_timeout_base = extra_config.pop("prefetch_timeout_base", 1)  # seconds
+        prefetch_timeout_per_ki_token = extra_config.pop(
+            "prefetch_timeout_per_ki_token", 0.25
+        )  # seconds per 1024 tokens
+        hicache_storage_pass_prefix_keys = extra_config.pop(
+            "hicache_storage_pass_prefix_keys", False
+        )
+
+        if not isinstance(prefetch_threshold, int):
+            raise ValueError(
+                f"prefetch_threshold must be int, got {type(prefetch_threshold).__name__}"
+            )
+        if not isinstance(prefetch_timeout_base, (int, float)):
+            raise ValueError(
+                f"prefetch_timeout_base must be number, got {type(prefetch_timeout_base).__name__}"
+            )
+        if not isinstance(prefetch_timeout_per_ki_token, (int, float)):
+            raise ValueError(
+                f"prefetch_timeout_per_ki_token must be number, got {type(prefetch_timeout_per_ki_token).__name__}"
+            )
+        if not isinstance(hicache_storage_pass_prefix_keys, bool):
+            raise ValueError(
+                "hicache_storage_pass_prefix_keys must be bool, got "
+                f"{type(hicache_storage_pass_prefix_keys).__name__}"
+            )
+
+        return (
+            extra_config,
+            prefetch_threshold,
+            float(prefetch_timeout_base),
+            float(prefetch_timeout_per_ki_token),
+            hicache_storage_pass_prefix_keys,
+        )
+
+    def reset(self):
+        TreeNode.counter = 0
+        self.cache_controller.reset()
+        self.token_to_kv_pool_host.clear()
+        # Clear per-request tracking dicts
+        self.prefetch_loaded_tokens_by_reqid.clear()
+        self.evictable_host_leaves.clear()
+        self.pinned_size_ = 0
+        super().reset()
+
+    def get_height(self, node: TreeNode):
+        height = 0
+        while node != self.root_node:
+            node = node.parent
+            height += 1
+        return height
+
+    def clear_storage_backend(self) -> bool:
+        if self.enable_storage:
+            try:
+                # Check if the storage backend has a clear method (for nixl backends)
+                if hasattr(self.cache_controller.storage_backend, "clear"):
+                    self.cache_controller.storage_backend.clear()
+                    logger.info(
+                        "Hierarchical cache storage backend cleared successfully!"
+                    )
+                    return True
+                else:
+                    logger.warning(
+                        f"Storage backend {type(self.cache_controller.storage_backend).__name__} does not support clear operation."
+                    )
+                    return False
+            except Exception as e:
+                logger.error(f"Failed to clear hierarchical cache storage backend: {e}")
+                return False
+        else:
+            logger.warning("Hierarchical cache storage backend is not enabled.")
+            return False
+
+    def write_backup(self, node: TreeNode, write_back=False):
+        host_indices = self.cache_controller.write(
+            device_indices=node.value,
+            node_id=node.id,
+        )
+        if host_indices is None:
+            self.evict_host(len(node.value))
+            host_indices = self.cache_controller.write(
+                device_indices=node.value,
+                node_id=node.id,
+            )
+        if host_indices is not None:
+            node.host_value = host_indices
+            assert len(node.host_value) > 0
+            self.ongoing_write_through[node.id] = node
+            if not write_back:
+                # no need to lock nodes if write back
+                self.inc_lock_ref(node)
+        else:
+            return 0
+
+        return len(host_indices)
+
+    def write_backup_storage(self, node: TreeNode):
+        prefix_keys = (
+            node.get_prefix_hash_values(node.parent)
+            if self.hicache_storage_pass_prefix_keys
+            else None
+        )
+
+        operation_id = self.cache_controller.write_storage(
+            node.host_value, node.key, node.hash_value, prefix_keys
+        )
+        self.ongoing_backup[operation_id] = node
+        node.protect_host()
+
+    def _inc_hit_count(self, node: TreeNode, chunked=False):
+        # skip the hit count update for chunked requests
+        if self.cache_controller.write_policy == "write_back" or chunked:
+            return
+        node.hit_count += 1
+
+        if not node.backuped:
+            if node.hit_count >= self.write_through_threshold:
+                # write to host if the node is not backuped
+                self.write_backup(node)
+
+    def writing_check(self, write_back=False):
+        if write_back:
+            # blocking till all write back complete
+            while len(self.ongoing_write_through) > 0:
+                for _, finish_event, ack_list in self.cache_controller.ack_write_queue:
+                    finish_event.synchronize()
+                    for ack_id in ack_list:
+                        backuped_node = self.ongoing_write_through.pop(ack_id)
+                        if self.enable_storage:
+                            self.write_backup_storage(backuped_node)
+                self.cache_controller.ack_write_queue.clear()
+                assert len(self.ongoing_write_through) == 0
+            return
+
+        # NOTE: all ranks has the same ongoing_write_through, can skip sync if empty
+        if len(self.ongoing_write_through) == 0:
+            return
+
+        finish_count = 0
+        for _, finish_event, ack_list in self.cache_controller.ack_write_queue:
+            if not finish_event.query():
+                break
+            finish_count += 1
+        queue_size = torch.tensor(finish_count, dtype=torch.int, device="cpu")
+        if self.tp_world_size > 1:
+            # synchronize TP workers to make the same update to radix cache
+            torch.distributed.all_reduce(
+                queue_size,
+                op=torch.distributed.ReduceOp.MIN,
+                group=self.tp_group,
+            )
+
+        finish_count = int(queue_size.item())
+        while finish_count > 0:
+            _, finish_event, ack_list = self.cache_controller.ack_write_queue.pop(0)
+            finish_event.synchronize()
+            for ack_id in ack_list:
+                backuped_node = self.ongoing_write_through.pop(ack_id)
+                self.dec_lock_ref(backuped_node)
+                if self.enable_storage:
+                    self.write_backup_storage(backuped_node)
+            finish_count -= 1
+
+    def loading_check(self):
+        finish_count = 0
+        for _, finish_event, ack_list in self.cache_controller.ack_load_queue:
+            if not finish_event.query():
+                # the KV cache loading is still ongoing
+                break
+            finish_count += 1
+            # no need to sync across TP workers as batch forwarding is synced
+            for ack_id in ack_list:
+                end_node = self.ongoing_load_back.pop(ack_id)
+                self.dec_lock_ref(end_node)
+
+        # ACK until all events are processed
+        del self.cache_controller.ack_load_queue[:finish_count]
+
+    def evictable_size(self):
+        return self.evictable_size_
+
+    def _is_pinned(self, node: TreeNode) -> bool:
+        """Check if a node has an active (non-expired) pin."""
+        return node.pin_expiry > 0 and time.monotonic() <= node.pin_expiry
+
+    def _clear_pin(self, node: TreeNode):
+        """Clear expired pin state and release host_ref_counter hold."""
+        if node.pin_expiry > 0:
+            self.pinned_size_ = max(0, self.pinned_size_ - len(node.key))
+            node.host_ref_counter = max(0, node.host_ref_counter - 1)
+        node.pin_expiry = 0.0
+        node.pin_ttl = 0
+
+    def pin_prefix(
+        self, token_ids: List[int], ttl_seconds: int = 300
+    ) -> Tuple[int, Optional[str]]:
+        """Pin nodes along a prefix path. Returns (nodes_pinned, reject_reason)."""
+        if self.disable or not token_ids:
+            return (0, None)
+
+        key, _ = self.maybe_bigram_convert(self._to_radix_key(token_ids))
+        if self.page_size != 1:
+            page_aligned_len = len(key) // self.page_size * self.page_size
+            key = key[:page_aligned_len]
+        if len(key) == 0:
+            return (0, None)
+
+        expiry = time.monotonic() + ttl_seconds
+        nodes_pinned = 0
+        budget_exceeded = False
+        node = self.root_node
+        child_key = self.get_child_key_fn(key)
+
+        while len(key) > 0 and child_key in node.children:
+            child = node.children[child_key]
+            prefix_len = self.key_match_fn(child.key, key)
+
+            # First pin on this node: check budget, then acquire hold
+            if child.pin_expiry == 0:
+                if self.pinned_size_ + len(child.key) > self._max_pinned_tokens:
+                    budget_exceeded = True
+                    break
+                child.host_ref_counter += 1
+                self.pinned_size_ += len(child.key)
+
+                # Eagerly back up to host so eviction finds pinned nodes
+                # already backuped and never enters the write_back drain
+                # path, which would leak lock_ref on in-flight
+                # write-through entries. No-op under write_back policy.
+                self._inc_hit_count(child)
+
+            # Extend expiry and store TTL for refresh-on-hit
+            child.pin_expiry = max(child.pin_expiry, expiry)
+            child.pin_ttl = max(child.pin_ttl, ttl_seconds)
+            nodes_pinned += 1
+
+            if prefix_len < len(child.key):
+                break
+
+            node = child
+            key = key[prefix_len:]
+            if len(key):
+                child_key = self.get_child_key_fn(key)
+
+        logger.info(
+            "[PIN] pin_prefix: nodes_pinned=%d, ttl=%ds", nodes_pinned, ttl_seconds
+        )
+        if budget_exceeded:
+            msg = f"Pin budget exhausted ({self.pinned_size_}/{self._max_pinned_tokens} tokens pinned)"
+            if nodes_pinned == 0:
+                return (0, msg)
+            return (nodes_pinned, f"prefix partially pinned; {msg}")
+        return (nodes_pinned, None)
+
+    def _to_radix_key(self, token_ids: List[int]) -> RadixKey:
+        """Convert raw token_ids to a RadixKey for tree walking.
+
+        Must use list (not tuple) to match scheduler's RadixKey format,
+        since _key_match_paged compares slices directly and list != tuple.
+        """
+        return RadixKey(token_ids=list(token_ids))
+
+    def inc_lock_ref(self, node: TreeNode):
+        if self.disable:
+            return 0
+
+        delta = 0
+        while node != self.root_node:
+            if node.lock_ref == 0:
+                self.evictable_size_ -= len(node.key)
+                self.protected_size_ += len(node.key)
+                delta -= len(node.key)
+            node.lock_ref += 1
+            self._update_leaf_status(node)
+            self._update_host_leaf_status(node)
+            node = node.parent
+        return delta
+
+    def dec_lock_ref(self, node: TreeNode):
+        if self.disable:
+            return 0
+
+        delta = 0
+        while node != self.root_node:
+            if node.lock_ref == 1:
+                self.evictable_size_ += len(node.key)
+                self.protected_size_ -= len(node.key)
+                delta += len(node.key)
+            node.lock_ref -= 1
+            self._update_leaf_status(node)
+            self._update_host_leaf_status(node)
+            if node.parent is None:
+                assert (
+                    node is self.root_node
+                ), f"This request holds the node from another tree"
+            node = node.parent
+        return delta
+
+    def _update_host_leaf_status(self, node: TreeNode):
+        if not node.evicted or node.lock_ref > 0:
+            if node in self.evictable_host_leaves:
+                self.evictable_host_leaves.remove(node)
+            return
+
+        for child in node.children.values():
+            if child.evicted:
+                if node in self.evictable_host_leaves:
+                    self.evictable_host_leaves.remove(node)
+                return
+
+        if node not in self.evictable_host_leaves:
+            self.evictable_host_leaves.add(node)
+
+    def evict(self, params: EvictParams) -> EvictResult:
+        start_time = time.perf_counter()
+        num_tokens = params.num_tokens
+        leaves = list(self.evictable_leaves)
+        eviction_heap = [
+            (self.eviction_strategy.get_priority(node), node) for node in leaves
+        ]
+        heapq.heapify(eviction_heap)
+
+        num_evicted = 0
+        write_back_nodes = []
+        while num_evicted < num_tokens and len(eviction_heap):
+            _priority, x = heapq.heappop(eviction_heap)
+
+            if x.lock_ref > 0:
+                continue
+
+            if self._is_pinned(x):
+                # Still active: demote to host if possible
+                if x.backuped:
+                    num_evicted += self._evict_backuped(x)
+                    continue
+                written = self.write_backup(x, write_back=True)
+                if written > 0:
+                    num_evicted += written
+                    write_back_nodes.append(x)
+                    continue  # backup succeeded, pin holds on host
+                # Host full -- drop pin so GPU can be freed
+                self._clear_pin(x)
+                logger.warning(
+                    "[PIN] evict: can't backup node %d to host, releasing pin",
+                    x.id,
+                )
+            elif x.pin_expiry > 0:
+                # Expired pin: clear and fall through to normal eviction
+                self._clear_pin(x)
+
+            if not x.backuped:
+                if self.cache_controller.write_policy == "write_back":
+                    # write to host if the node is not backuped
+                    num_evicted += self.write_backup(x, write_back=True)
+                    write_back_nodes.append(x)
+                else:
+                    num_evicted += self._evict_regular(x)
+            else:
+                num_evicted += self._evict_backuped(x)
+
+            for child in x.parent.children.values():
+                if child in write_back_nodes:
+                    continue
+                if not child.evicted:
+                    break
+            else:
+                # all children are evicted or no children
+                new_priority = self.eviction_strategy.get_priority(x.parent)
+                heapq.heappush(eviction_heap, (new_priority, x.parent))
+
+        if self.cache_controller.write_policy == "write_back":
+            self.writing_check(write_back=True)
+            for node in write_back_nodes:
+                assert node.backuped
+                self._evict_backuped(node)
+
+        self.update_eviction_metrics(num_evicted, start_time)
+        return EvictResult(num_tokens_evicted=num_evicted)
+
+    def _evict_backuped(self, node: TreeNode):
+        # GPU -> CPU demotion: no BlockRemoved since block is still reachable via load_back
+        num_evicted = self.cache_controller.evict_device(node.value)
+        assert num_evicted > 0
+        self.evictable_size_ -= num_evicted
+        node.value = None
+        self._update_leaf_status(node)
+        self._update_host_leaf_status(node)
+        # update leaf status for the parent because the node is evicted
+        self._update_leaf_status(node.parent)
+        return num_evicted
+
+    def _evict_regular(self, node: TreeNode):
+        # evict a node not initiated write to host -- emit BlockRemoved
+        self._record_remove_event(node)
+        self.cache_controller.mem_pool_device_allocator.free(node.value)
+        num_evicted = len(node.value)
+        self._delete_leaf(node)
+        return num_evicted
+
+    def evict_host(self, num_tokens: int):
+        leaves = list(self.evictable_host_leaves)
+        eviction_heap = [
+            (self.eviction_strategy.get_priority(node), node) for node in leaves
+        ]
+        heapq.heapify(eviction_heap)
+
+        num_evicted = 0
+        while num_evicted < num_tokens and len(eviction_heap):
+            _priority, x = heapq.heappop(eviction_heap)
+            if x == self.root_node:
+                break
+            # only evict the host value of evicted nodes
+            if not x.evicted:
+                continue
+
+            # Expire stale pins before checking host_ref_counter
+            if x.pin_expiry > 0 and time.monotonic() > x.pin_expiry:
+                self._clear_pin(x)
+
+            # node is protected from eviction as it has ongoing prefetch, backup, or pin
+            if x.host_ref_counter > 0:
+                continue
+
+            # Block deleted entirely (GPU already evicted, now CPU freed) --
+            # emit BlockRemoved so the router removes this block from its index.
+            self._record_remove_event(x)
+            num_evicted += self.cache_controller.evict_host(x.host_value)
+
+            key = self.get_child_key_fn(x.key)
+            v = x.parent.children.pop(key, None)
+            assert v == x, f"parent does not have child key, {key}"
+            if x in self.evictable_host_leaves:
+                self.evictable_host_leaves.remove(x)
+            self._update_host_leaf_status(x.parent)
+
+            if len(x.parent.children) == 0 and x.parent.evicted:
+                new_priority = self.eviction_strategy.get_priority(x.parent)
+                heapq.heappush(eviction_heap, (new_priority, x.parent))
+
+    def load_back(
+        self, node: TreeNode, mem_quota: Optional[int] = None
+    ) -> Optional[torch.Tensor]:
+
+        start_time = time.perf_counter()
+        last_hit_node = node
+        nodes_to_load = []
+        while node.evicted:
+            assert (
+                node.backuped
+            ), "No backup available on evicted nodes, should not happen"
+            nodes_to_load.insert(0, node)
+            node = node.parent
+        else:
+            ancester_node = node
+
+        # protect the ancestor nodes from eviction
+        delta = self.inc_lock_ref(ancester_node)
+
+        # load it all or not at all
+        host_indices = torch.cat([n.host_value for n in nodes_to_load])
+        if len(host_indices) < self.load_back_threshold or (
+            len(host_indices) > mem_quota + delta if mem_quota is not None else False
+        ):
+            # skip loading back if the total size is too small or exceeding the memory quota
+            self.dec_lock_ref(ancester_node)
+            return None
+
+        device_indices = self.cache_controller.load(
+            host_indices=host_indices, node_id=last_hit_node.id
+        )
+        if device_indices is None:
+            self.evict(EvictParams(num_tokens=len(host_indices)))
+            device_indices = self.cache_controller.load(
+                host_indices=host_indices, node_id=last_hit_node.id
+            )
+        self.dec_lock_ref(ancester_node)
+        if device_indices is None:
+            # no sufficient GPU memory to load back KV caches
+            logger.warning(
+                "load_back: FAILED to load %d tokens for node %d "
+                "even after eviction (evictable_size=%d)",
+                len(host_indices),
+                last_hit_node.id,
+                self.evictable_size_,
+            )
+            return None
+
+        self.ongoing_load_back[last_hit_node.id] = last_hit_node
+        offset = 0
+        for node in nodes_to_load:
+            node.value = device_indices[offset : offset + len(node.host_value)]
+            offset += len(node.host_value)
+        self.evictable_size_ += len(device_indices)
+        self.inc_lock_ref(last_hit_node)
+
+        if self.metrics_collector is not None:
+            self.metrics_collector.observe_load_back_duration(
+                time.perf_counter() - start_time
+            )
+            self.metrics_collector.increment_load_back_num_tokens(len(device_indices))
+
+        return device_indices
+
+    def init_load_back(
+        self,
+        last_node: TreeNode,
+        host_hit_length: int,
+        mem_quota: Optional[int] = None,
+    ):
+        _ = host_hit_length  # unused, but kept for compatibility
+        if last_node.evicted:
+            loading_values = self.load_back(last_node, mem_quota)
+            if loading_values is not None:
+                logger.debug(
+                    f"loading back {len(loading_values)} tokens for node {last_node.id}"
+                )
+                return loading_values, last_node
+
+            while last_node.evicted:
+                last_node = last_node.parent
+
+        return (
+            torch.empty((0,), dtype=torch.int64, device=self.device),
+            last_node,
+        )
+
+    def ready_to_load_host_cache(self) -> int:
+        """
+        Notify the cache controller to start the KV cache loading.
+        Return the consumer index for the schedule batch manager to track.
+        """
+        return self.cache_controller.start_loading()
+
+    def check_hicache_events(self):
+        self.writing_check()
+        self.loading_check()
+        if self.enable_storage:
+            self.drain_storage_control_queues()
+        if self.enable_storage_metrics:
+            self.storage_metrics_collector.log_storage_metrics(
+                self.cache_controller.storage_backend.get_stats()
+            )
+
+    def drain_storage_control_queues(self):
+        """
+        Combine prefetch revoke, backup ack, and host mem release checks
+        to minimize TP synchronization and Python overhead.
+        """
+        cc = self.cache_controller
+
+        qsizes = torch.tensor(
+            [
+                cc.prefetch_revoke_queue.qsize(),
+                cc.ack_backup_queue.qsize(),
+                cc.host_mem_release_queue.qsize(),
+            ],
+            dtype=torch.int,
+        )
+        if self.tp_world_size > 1:
+            torch.distributed.all_reduce(
+                qsizes, op=torch.distributed.ReduceOp.MIN, group=self.tp_group
+            )
+
+        n_revoke, n_backup, n_release = map(int, qsizes.tolist())
+        self._drain_storage_control_queues_impl(
+            n_revoke=n_revoke,
+            n_backup=n_backup,
+            n_release=n_release,
+            log_metrics=True,
+        )
+
+    # Timeout is linearly increasing with the number of pages
+    def _prefetch_timeout_check_linear_func(self, operation: PrefetchOperation):
+        # If hash_value has not been computed in timeout_base seconds, terminate it.
+        return (
+            time.monotonic() - operation.start_time
+            > self.prefetch_timeout_base
+            + len(operation.hash_value) * self.prefetch_timeout_per_page
+        )
+
+    def can_terminate_prefetch(self, operation: PrefetchOperation):
+        can_terminate = True
+
+        if self.prefetch_stop_policy == "best_effort":
+            return can_terminate
+
+        if len(operation.hash_value) == 0:
+            completed = False
+        else:
+            completed = (
+                operation.completed_tokens == len(operation.hash_value) * self.page_size
+            )
+
+        if self.prefetch_stop_policy == "wait_complete":
+            can_terminate = completed
+        elif self.prefetch_stop_policy == "timeout":
+            can_terminate = completed or self.is_prefetch_timeout(operation)
+        else:
+            # unknown prefetch stop policy, just return True
+            return True
+
+        operation_terminated = operation.is_terminated()
+        if self.tp_world_size > 1:
+            states = torch.tensor(
+                [1 - int(can_terminate), int(operation_terminated)],
+                dtype=torch.int,
+            )
+            torch.distributed.all_reduce(
+                states,
+                op=torch.distributed.ReduceOp.MAX,
+                group=self.tp_group,
+            )
+            can_terminate = states[0].item() == 0
+            operation_terminated = states[1].item() == 1
+        # the operation should be terminated if it is already terminated on any TP worker
+        # or it meets the termination condition on all TP workers
+        can_terminate = can_terminate or operation_terminated
+        return can_terminate
+
+    def check_prefetch_progress(self, req_id: str) -> bool:
+        if req_id not in self.ongoing_prefetch:
+            # there is no ongoing prefetch for this request or it has been revoked
+            return True
+
+        # todo: more policies for prefetch progress such as timeout
+        # the current policy is to prefetch with best effort and terminate when queuing is over
+        last_host_node, token_ids, host_indices, operation = self.ongoing_prefetch[
+            req_id
+        ]
+
+        if operation.host_indices is None:
+            # prefetch has not been issued due to insufficient host memory
+            return True
+
+        if not self.can_terminate_prefetch(operation):
+            return False
+
+        completed_tokens, hash_value = self.cache_controller.terminate_prefetch(
+            operation
+        )
+        logger.debug(f"Prefetch {req_id} completed with {completed_tokens} tokens")
+
+        min_completed_tokens = completed_tokens
+        if self.tp_world_size > 1:
+            # synchrnoize TP workers to make the same update to hiradix cache
+            completed_tokens_tensor = torch.tensor(
+                min_completed_tokens, dtype=torch.int
+            )
+            torch.distributed.all_reduce(
+                completed_tokens_tensor,
+                op=torch.distributed.ReduceOp.MIN,
+                group=self.tp_group,
+            )
+            min_completed_tokens = completed_tokens_tensor.item()
+        fetched_token_ids = token_ids[:min_completed_tokens]
+        written_indices = host_indices[:min_completed_tokens]
+        matched_length = self._insert_helper_host(
+            last_host_node,
+            RadixKey(
+                token_ids=fetched_token_ids, extra_key=last_host_node.key.extra_key
+            ),
+            written_indices,
+            hash_value[: min_completed_tokens // self.page_size],
+        )
+
+        self.cache_controller.mem_pool_host.free(host_indices[:matched_length])
+        self.cache_controller.append_host_mem_release(
+            host_indices[min_completed_tokens:completed_tokens]
+        )
+        last_host_node.release_host()
+        del self.ongoing_prefetch[req_id]
+        self.cache_controller.prefetch_tokens_occupied -= len(token_ids)
+
+        # Track tokens actually loaded from storage for this request (L3 hits)
+        loaded_from_storage = min_completed_tokens - matched_length
+        self.prefetch_loaded_tokens_by_reqid[req_id] = loaded_from_storage
+
+        if self.enable_storage_metrics:
+            self.storage_metrics_collector.log_prefetched_tokens(loaded_from_storage)
+
+        return True
+
+    def terminate_prefetch(self, req_id: str):
+        if req_id not in self.ongoing_prefetch:
+            return
+
+        _, _, _, operation = self.ongoing_prefetch[req_id]
+        if operation.host_indices is None:
+            return
+        operation.mark_terminate()
+
+    def pop_prefetch_loaded_tokens(self, req_id: str) -> int:
+        """
+        Pop and return the number of tokens loaded from storage for a request.
+        Returns 0 if no prefetch was done or was revoked.
+        This should be called after check_prefetch_progress() returns True.
+        """
+        return self.prefetch_loaded_tokens_by_reqid.pop(req_id, 0)
+
+    def match_prefix(self, params: MatchPrefixParams):
+        key = params.key
+        empty_value = torch.empty((0,), dtype=torch.int64, device=self.device)
+        key, _ = self.maybe_bigram_convert(key)
+        if self.disable or len(key) == 0:
+            return MatchResult(
+                device_indices=empty_value,
+                last_device_node=self.root_node,
+                last_host_node=self.root_node,
+                host_hit_length=0,
+            )
+
+        page_aligned_len = len(key)
+        if self.page_size != 1:
+            page_aligned_len = len(key) // self.page_size * self.page_size
+            key = key[:page_aligned_len]
+
+        value, last_node = self._match_prefix_helper(self.root_node, key)
+        if value:
+            value = torch.cat(value)
+        else:
+            value = empty_value
+
+        host_hit_length = 0
+        last_host_node = last_node
+        while last_node.evicted:
+            host_hit_length += len(last_node.host_value)
+            last_node = last_node.parent
+        while not last_host_node.backuped:
+            last_host_node = last_host_node.parent
+
+        return MatchResult(
+            device_indices=value,
+            last_device_node=last_node,
+            last_host_node=last_host_node,
+            host_hit_length=host_hit_length,
+        )
+
+    def prefetch_from_storage(
+        self,
+        req_id: str,
+        last_host_node: TreeNode,
+        new_input_tokens: List[int],
+        last_hash: Optional[str] = None,
+        prefix_keys: Optional[List[str]] = None,
+    ):
+        new_input_tokens = (
+            convert_to_bigram_key(new_input_tokens)
+            if self.is_eagle
+            else new_input_tokens
+        )
+        # align the number of fetching tokens to the page size
+        prefetch_length = len(new_input_tokens) - (
+            len(new_input_tokens) % self.page_size
+        )
+        new_input_tokens = new_input_tokens[:prefetch_length]
+        if (
+            not self.enable_storage
+            or prefetch_length < self.prefetch_threshold
+            or self.cache_controller.prefetch_rate_limited()
+        ):
+            return
+
+        last_host_node.protect_host()
+        host_indices = self.cache_controller.mem_pool_host.alloc(prefetch_length)
+        if host_indices is None:
+            self.evict_host(prefetch_length)
+            host_indices = self.cache_controller.mem_pool_host.alloc(prefetch_length)
+        if host_indices is None:
+            last_host_node.release_host()
+            # no sufficient host memory for prefetch
+            return
+        operation = self.cache_controller.prefetch(
+            req_id, host_indices, new_input_tokens, last_hash, prefix_keys
+        )
+        self.ongoing_prefetch[req_id] = (
+            last_host_node,
+            new_input_tokens,
+            host_indices,
+            operation,
+        )
+        self.cache_controller.prefetch_tokens_occupied += len(new_input_tokens)
+
+    def _insert_helper_host(
+        self, node: TreeNode, key: RadixKey, host_value, hash_value
+    ):
+        node.last_access_time = time.monotonic()
+        if len(key) == 0:
+            return 0
+
+        child_key = self.get_child_key_fn(key)
+
+        matched_length = 0
+        while len(key) > 0 and child_key in node.children.keys():
+            node = node.children[child_key]
+            node.last_access_time = time.monotonic()
+            # Refresh pin TTL on host insert hit
+            if self._is_pinned(node):
+                node.pin_expiry = time.monotonic() + node.pin_ttl
+            prefix_len = self.key_match_fn(node.key, key)
+            key = key[prefix_len:]
+            host_value = host_value[prefix_len:]
+            hash_value = hash_value[prefix_len // self.page_size :]
+            matched_length += prefix_len
+
+            if prefix_len < len(node.key):
+                new_node = self._split_node(node.key, node, prefix_len)
+                node = new_node
+
+            if len(key):
+                child_key = self.get_child_key_fn(key)
+
+        if len(key):
+            new_node = TreeNode(priority=node.priority)
+            new_node.parent = node
+            new_node.key = key
+            new_node.value = None
+            new_node.host_value = host_value.clone()
+            new_node.hash_value = hash_value
+            node.children[child_key] = new_node
+            self._update_host_leaf_status(new_node)
+            self._update_leaf_status(node)
+            self._update_host_leaf_status(node)
+
+        return matched_length
+
+    def _match_prefix_helper(self, node: TreeNode, key: RadixKey):
+        node.last_access_time = time.monotonic()
+        child_key = self.get_child_key_fn(key)
+        value = []
+
+        while len(key) > 0 and child_key in node.children.keys():
+            child = node.children[child_key]
+            child.last_access_time = time.monotonic()
+            # Refresh pin TTL on cache hit
+            if self._is_pinned(child):
+                child.pin_expiry = time.monotonic() + child.pin_ttl
+            prefix_len = self.key_match_fn(child.key, key)
+            if prefix_len < len(child.key):
+                new_node = self._split_node(child.key, child, prefix_len)
+                if not new_node.evicted:
+                    value.append(new_node.value)
+                node = new_node
+                break
+            else:
+                if not child.evicted:
+                    value.append(child.value)
+                node = child
+                key = key[prefix_len:]
+
+                if len(key):
+                    child_key = self.get_child_key_fn(key)
+
+        return value, node
+
+    def _split_node(self, key: RadixKey, child: TreeNode, split_len: int):
+        # child node split into new_node -> child
+        new_node = TreeNode(priority=child.priority)
+        new_node.children = {self.get_child_key_fn(key[split_len:]): child}
+        new_node.parent = child.parent
+        new_node.lock_ref = child.lock_ref
+        new_node.pin_expiry = child.pin_expiry
+        new_node.pin_ttl = child.pin_ttl
+        # If child is pinned, new parent inherits a host_ref_counter hold
+        if child.pin_expiry > 0:
+            new_node.host_ref_counter += 1
+        new_node.key = child.key[:split_len]
+        new_node.hit_count = child.hit_count
+
+        # split value and host value if exists
+        if child.evicted:
+            new_node.value = None
+        else:
+            new_node.value = child.value[:split_len].clone()
+            child.value = child.value[split_len:].clone()
+        if child.backuped:
+            new_node.host_value = child.host_value[:split_len].clone()
+            child.host_value = child.host_value[split_len:].clone()
+
+        new_node.hash_value, child.hash_value = split_node_hash_value(
+            child.hash_value, split_len, self.page_size
+        )
+        child.parent = new_node
+        child.key = child.key[split_len:]
+        new_node.parent.children[self.get_child_key_fn(key)] = new_node
+
+        return new_node
+
+    def insert(self, params: InsertParams) -> InsertResult:
+        key = params.key
+        value = params.value
+        chunked = params.chunked
+        priority = params.priority
+
+        if priority is None:
+            priority = 0
+        key, value = self.maybe_bigram_convert(key, value)
+
+        if len(key) == 0:
+            return InsertResult(prefix_len=0)
+
+        if self.is_eagle and value is not None:
+            # Make sure the value len equal to the EAGLE bigram key len
+            value = value[: len(key)]
+
+        node = self.root_node
+        child_key = self.get_child_key_fn(key)
+        total_prefix_length = 0
+
+        while len(key) > 0 and child_key in node.children.keys():
+            node = node.children[child_key]
+            node.last_access_time = time.monotonic()
+            node.priority = max(node.priority, priority)
+            prefix_len = self.key_match_fn(node.key, key)
+
+            if prefix_len == len(node.key):
+                if node.evicted:
+                    # change the reference if the node is evicted
+                    # this often happens in the case of KV cache recomputation
+                    node.value = value[:prefix_len]
+                    self.evictable_size_ += len(node.value)
+                    self._update_leaf_status(node)
+                    self._update_host_leaf_status(node)
+                    # update parent status as a new leaf is added into device
+                    self._update_leaf_status(node.parent)
+                else:
+                    self._inc_hit_count(node, chunked)
+                    total_prefix_length += prefix_len
+            else:
+                # partial match, split the node
+                new_node = self._split_node(node.key, node, prefix_len)
+                # shared-prefix node should also reflect max priority
+                new_node.priority = max(new_node.priority, priority)
+                if new_node.evicted:
+                    new_node.value = value[:prefix_len].clone()
+                    self.evictable_size_ += len(new_node.value)
+                    self._update_leaf_status(new_node)
+                    self._update_host_leaf_status(new_node)
+                    # update parent status as a new leaf is added into device
+                    self._update_leaf_status(new_node.parent)
+                else:
+                    self._inc_hit_count(new_node, chunked)
+                    total_prefix_length += prefix_len
+                node = new_node
+
+            key = key[prefix_len:]
+            value = value[prefix_len:]
+
+            if len(key):
+                child_key = self.get_child_key_fn(key)
+
+        if len(key):
+            new_node = TreeNode(priority=priority)
+            new_node.parent = node
+            new_node.key = key
+            new_node.value = value.clone()
+            node.children[child_key] = new_node
+            self.evictable_size_ += len(value)
+            self._update_leaf_status(node)
+            self._update_leaf_status(new_node)
+
+            # Compute hash_value if storage or kv events are enabled
+            if self.enable_storage or self.enable_kv_cache_events:
+                new_node.hash_value = compute_node_hash_values(new_node, self.page_size)
+
+            # Emit BlockStored so the router indexes this block.
+            self._record_store_event(new_node)
+
+            if self.cache_controller.write_policy != "write_back":
+                self._inc_hit_count(new_node, chunked)
+        return InsertResult(prefix_len=total_prefix_length)
+
+    def release_aborted_request(self, rid: str):
+        # Clean up storage hit tracking for aborted request
+        self.prefetch_loaded_tokens_by_reqid.pop(rid, None)
+
+        if rid not in self.ongoing_prefetch:
+            return
+
+        last_host_node, token_ids, host_indices, operation = self.ongoing_prefetch[rid]
+        if operation.host_indices is None:
+            return
+
+        completed_tokens, _ = self.cache_controller.terminate_prefetch(operation)
+        if self.tp_world_size > 1:
+            torch.distributed.barrier(group=self.tp_group)
+        last_host_node.release_host()
+        del self.ongoing_prefetch[rid]
+        self.cache_controller.append_host_mem_release(host_indices[:completed_tokens])
+        self.cache_controller.prefetch_tokens_occupied -= len(token_ids)
diff --git a/sglang/python/sglang/srt/mem_cache/mamba_radix_cache.py b/sglang/python/sglang/srt/mem_cache/mamba_radix_cache.py
new file mode 100644
index 0000000000000000000000000000000000000000..07f5278d964c2b9307c6268c8d38709504d0500a
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/mamba_radix_cache.py
@@ -0,0 +1,1232 @@
+from __future__ import annotations
+
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""
+The radix tree data structure for managing the hybrid (full and Mamba) KV cache.
+"""
+
+import heapq
+from collections import defaultdict
+from functools import partial
+from typing import TYPE_CHECKING, List, Optional, Tuple
+
+import torch
+from numpy import float64
+
+from sglang.srt.distributed import get_tensor_model_parallel_rank
+from sglang.srt.layers.attention.fla.chunk_delta_h import CHUNK_SIZE as FLA_CHUNK_SIZE
+from sglang.srt.mem_cache.allocator import (
+    PagedTokenToKVPoolAllocator,
+    TokenToKVPoolAllocator,
+)
+from sglang.srt.mem_cache.base_prefix_cache import (
+    BasePrefixCache,
+    EvictParams,
+    EvictResult,
+    InsertParams,
+    InsertResult,
+    MatchPrefixParams,
+    MatchResult,
+)
+from sglang.srt.mem_cache.memory_pool import HybridReqToTokenPool
+from sglang.srt.mem_cache.radix_cache import (
+    RadixKey,
+    _key_match_page_size1,
+    _key_match_paged,
+    get_child_key,
+)
+from sglang.srt.server_args import get_global_server_args
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.schedule_batch import Req
+    from sglang.srt.mem_cache.cache_init_params import CacheInitParams
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class TreeNode:
+
+    counter = 0
+    last_access_time_counter_float = float64(1.0)
+
+    def __init__(self, id: Optional[int] = None):
+        self.children = defaultdict(TreeNode)
+        self.parent: TreeNode = None
+        self.key: RadixKey = None
+        self.value: Optional[torch.Tensor] = None
+        self.mamba_value: Optional[torch.Tensor] = None
+        # invariant: for any node, if mamba_lock_ref is locked, full_lock_ref must be locked;
+        # if full_lock_ref is locked, mamba_lock_ref doesn't need to be locked. So,
+        # full_lock_ref is always >= mamba_lock_ref.
+        # for full_lock, once it is locked, its parent must be locked as well
+        # for mamba_lock, it only need lock node itself
+        self.full_lock_ref = 0
+        self.mamba_lock_ref = 0
+        # last access time is only used for sanity check. LRU is maintained by the lru list.
+        self.last_access_time = get_last_access_time()
+
+        self.hit_count = 0
+        # store the host indices of KV cache
+        self.host_value = None
+
+        # for lru list, invariant:
+        # 1. prev has greater last_access_time
+        # 2. next has smaller last_access_time
+        self.prev = None
+        self.next = None
+        self.mamba_prev = None
+        self.mamba_next = None
+
+        self.id = TreeNode.counter if id is None else id
+        TreeNode.counter += 1
+
+    @property
+    def evicted(self):
+        return self.value is None
+
+    @property
+    def backuped(self):
+        return self.host_value is not None
+
+    def __lt__(self, other: "TreeNode"):
+        return self.last_access_time < other.last_access_time
+
+
+def get_last_access_time() -> float64:
+    ret = TreeNode.last_access_time_counter_float
+    TreeNode.last_access_time_counter_float += 1.0
+    return ret
+
+
+class LRUList:
+    def __init__(self, mamba: bool = False):
+        self.mamba = mamba
+        if self.mamba:
+            self.prv = "mamba_prev"
+            self.nxt = "mamba_next"
+            self.lock_ref = "mamba_lock_ref"
+        else:
+            self.prv = "prev"
+            self.nxt = "next"
+            self.lock_ref = "full_lock_ref"
+        # Initialize dummy head and tail nodes
+        self.head = TreeNode()  # Most recently used side
+        self.tail = TreeNode()  # Least recently used side
+        setattr(self.head, self.nxt, self.tail)  # self.head.next = self.tail
+        setattr(self.tail, self.prv, self.head)  # self.tail.prev = self.head
+        self.cache = {}
+
+    def _add_node(self, node):
+        """Helper to add node right after head (most recently used)"""
+        self._add_node_after(self.head, node)
+
+    def _add_node_after(self, old_node, new_node):
+        """Helper to add node right after old_node"""
+        setattr(new_node, self.prv, old_node)  # new_node.prev = old_node
+        setattr(
+            new_node, self.nxt, getattr(old_node, self.nxt)
+        )  # new_node.next = old_node.next
+        setattr(
+            getattr(old_node, self.nxt), self.prv, new_node
+        )  # old_node.next.prev = new_node
+        setattr(old_node, self.nxt, new_node)  # old_node.next = new_node
+
+    def _remove_node(self, node):
+        """Helper to remove node from linked list"""
+        setattr(
+            getattr(node, self.prv), self.nxt, getattr(node, self.nxt)
+        )  # node.prev.next = node.next
+        setattr(
+            getattr(node, self.nxt), self.prv, getattr(node, self.prv)
+        )  # node.next.prev = node.prev
+
+    def _get_lru(self) -> Optional[TreeNode]:
+        """
+        Get the least recently used node
+        """
+        if len(self.cache) == 0:
+            return None
+        return getattr(self.tail, self.prv)
+
+    def reset_node_mru(self, node):
+        """
+        Move a (existing) node to most recently used position
+        """
+        assert node.id in self.cache, f"Resetting node {node.id=} not in lru list"
+        assert (
+            not self.mamba or node.mamba_value is not None
+        ), f"Resetting mamba tombstone node in mamba lru list: {node.id=}"
+        self._remove_node(node)
+        self._add_node(node)
+
+    def reset_node_and_parents_mru(self, node, root_node):
+        """
+        Move an (existing) node and its parents to most recently used position. Child node is
+        more recently used than parent node.
+        """
+        prev_node = self.head
+        while node != root_node:
+            if not self.mamba or node.mamba_value is not None:
+                assert (
+                    node.id in self.cache
+                ), f"Resetting node {node.id=} not in lru list when resetting node and parents mru"
+                self._remove_node(node)
+                self._add_node_after(prev_node, node)
+                prev_node = node
+            node = node.parent
+
+    def insert_mru(self, node):
+        """
+        Insert a (new) node as most recently used
+        """
+        assert (
+            not self.mamba or node.mamba_value is not None
+        ), f"Inserting mamba tombstone node in mamba lru list: {node.id=}"
+        assert (
+            node.id not in self.cache
+        ), f"Inserting node {node.id=} already in lru list, existing node: {self.cache[node.id].id=}"
+        self.cache[node.id] = node
+        self._add_node(node)
+
+    def remove_node(self, node: TreeNode):
+        """
+        Remove node from lru list
+        """
+        assert node.id in self.cache, f"Removing node {node.id=} not in lru list"
+        assert (
+            not self.mamba or node.mamba_value is not None
+        ), f"Removing mamba tombstone node from mamba lru list: {node.id=}"
+        del self.cache[node.id]
+        self._remove_node(node)
+
+    def get_lru_no_lock(self) -> Optional[TreeNode]:
+        """
+        Get the least recently used node that is not locked
+        """
+        return self.get_prev_no_lock(self.tail, check_id=False)
+
+    def get_leaf_lru_no_lock(self) -> Optional[TreeNode]:
+        """
+        Get the least recently used leaf node that is not locked
+        """
+        return self.get_prev_leaf_no_lock(self.tail, check_id=False)
+
+    def get_prev_no_lock(
+        self, node: TreeNode, check_id: bool = True
+    ) -> Optional[TreeNode]:
+        """
+        Get the previous (i.e. more recently used) node that is not locked
+        """
+        if check_id:
+            assert (
+                node.id in self.cache
+            ), f"Getting prev of node {node.id=} not in lru list"
+        x = getattr(node, self.prv)  # x = node.prev
+        while getattr(x, self.lock_ref) > 0:
+            x = getattr(x, self.prv)  # x = x.prev
+        # if x is the head, it means there is no node in the lru list without lock
+        if x == self.head:
+            return None
+        return x
+
+    def get_prev_leaf_no_lock(self, node: TreeNode, check_id: bool = True):
+        """
+        Get the previous (i.e. more recently used) leaf node that is not locked
+        """
+        if check_id:
+            assert (
+                node.id in self.cache
+            ), f"Getting prev of node {node.id=} not in lru list"
+        x = getattr(node, self.prv)  # x = node.prev
+        while getattr(x, self.lock_ref) > 0 or len(x.children) > 0:
+            x = getattr(x, self.prv)  # x = x.prev
+        # if x is the head, it means there is no leaf node in the lru list without lock
+        if x == self.head:
+            return None
+        return x
+
+    def in_list(self, node: Optional[TreeNode]):
+        """
+        Check if the node is in the lru list
+        """
+        if not node:
+            return False
+        return node.id in self.cache
+
+    def pretty_print(self, tree_cache: Optional["MambaRadixCache"] = None):
+        """
+        Pretty print the lru list
+        """
+        msg = f"{self.mamba=} LRU list: "
+        x_lru = self._get_lru()
+        while x_lru is not None and x_lru.id in self.cache:
+            msg += f"[{x_lru.id}] {x_lru.last_access_time:f} -> "
+            x_lru = getattr(x_lru, self.prv)
+        print(msg)
+
+        if not tree_cache:
+            return
+        msg = f"{self.mamba=} Nodes (sorted by last_access_time): "
+        if self.mamba:
+            nodes = tree_cache._collect_nontombstone_nodes()
+        else:
+            nodes = tree_cache._collect_all_nodes()
+        heapq.heapify(nodes)
+        while len(nodes):
+            x = heapq.heappop(nodes)
+            msg += f"[{x.id}] {x.last_access_time:f} -> "
+        print(msg)
+
+    # Note: this is expensive, only use for debug
+    def sanity_check_evictable_size(self):
+        """
+        Check the evictable size (i.e. the size of the nodes that are not locked)
+        """
+        node = self.get_lru_no_lock()
+        evictable_size = 0
+        while self.in_list(node):
+            evictable_size += (
+                len(node.value) if not self.mamba else len(node.mamba_value)
+            )
+            node = self.get_prev_no_lock(node)
+        return evictable_size
+
+    # Note: this is expensive, only use for debug or idle check
+    def sanity_check(self, tree_cache: "MambaRadixCache"):
+        """
+        Check if the lru list is valid by rebuilding the lru list from the tree, heapifying it, and
+        checking if the lru list is valid.
+        """
+        try:
+            if self.mamba:
+                nodes = tree_cache._collect_nontombstone_nodes()
+            else:
+                nodes = tree_cache._collect_all_nodes()
+            total_nodes = len(nodes)
+            total_lru = len(self.cache)
+            # heapify based on last_access_time
+            heapq.heapify(nodes)
+            # the root node is not in the lru list
+            assert len(nodes) == (
+                total_lru + (0 if self.mamba else 1)
+            ), f"len(nodes): {len(nodes)}, total_lru: {total_lru}"
+
+            x_lru = self._get_lru()
+            while len(nodes):
+                x = heapq.heappop(nodes)
+                if x == tree_cache.root_node:
+                    # root node is not in the lru list
+                    continue
+                assert (
+                    x_lru is not None and x_lru.id in self.cache
+                ), f"Incorrect LRU list, x_lru is None or not in cache: {x_lru=}, {x.id=}"
+
+                assert (
+                    x == x_lru
+                ), f"Incorrect LRU list, {self.mamba=}, x: {x.id=} != x_lru: {x_lru.id=}, {x.last_access_time=}, {x_lru.last_access_time=}"
+                assert (
+                    x_lru.full_lock_ref == 0
+                ), f"x_lru should not be locked when idle, {x_lru.full_lock_ref=}, {x_lru.id=}"
+                assert (
+                    x_lru.mamba_lock_ref == 0
+                ), f"x_lru should not be locked when idle, {x_lru.mamba_lock_ref=}, {x_lru.id=}"
+                x_lru = getattr(x, self.prv)
+
+            if self.mamba:
+                evictable_size = tree_cache.mamba_evictable_size()
+                lru_list_evictable_size = self.sanity_check_evictable_size()
+            else:
+                evictable_size = tree_cache.full_evictable_size()
+                lru_list_evictable_size = self.sanity_check_evictable_size()
+
+            assert (
+                evictable_size == lru_list_evictable_size
+            ), f"{self.mamba=}, total nodes: {total_nodes}, total lru: {total_lru}, evictable size: {evictable_size} != lru list evictable size: {lru_list_evictable_size}"
+        except Exception as e:
+            if get_tensor_model_parallel_rank() == 0:
+                msg = f"Mamba Radix tree sanity check failed, ping @yizhang2077: {e}"
+                logger.error(msg)
+                tree_cache.pretty_print()
+                tree_cache.full_lru_list.pretty_print(tree_cache)
+                tree_cache.mamba_lru_list.pretty_print(tree_cache)
+                raise Exception(msg)
+
+
+class MambaRadixCache(BasePrefixCache):
+    def __init__(self, params: CacheInitParams):
+        assert isinstance(
+            params.token_to_kv_pool_allocator, TokenToKVPoolAllocator
+        ) or isinstance(params.token_to_kv_pool_allocator, PagedTokenToKVPoolAllocator)
+        self.req_to_token_pool: HybridReqToTokenPool = params.req_to_token_pool
+        self.token_to_kv_pool_allocator = params.token_to_kv_pool_allocator
+
+        self.page_size = params.page_size
+        self.disable = params.disable
+        self.enable_mamba_extra_buffer = params.enable_mamba_extra_buffer
+
+        if not self.enable_mamba_extra_buffer:
+            assert (
+                self.page_size == 1
+            ), f"Page size must be 1 for MambaRadixCache v1, got {self.page_size}"
+
+        if self.token_to_kv_pool_allocator:
+            self.device = self.token_to_kv_pool_allocator.device
+        else:
+            self.device = torch.device("cpu")
+
+        if params.enable_metrics:
+            self.init_metrics_collector()
+
+        if self.page_size == 1:
+            self.key_match_fn = _key_match_page_size1
+            self.get_child_key_fn = get_child_key
+        else:
+            self.key_match_fn = partial(_key_match_paged, page_size=self.page_size)
+            self.get_child_key_fn = partial(get_child_key, page_size=self.page_size)
+        self.reset()
+
+    ##### Public API #####
+
+    def supports_mamba(self) -> bool:
+        return True
+
+    def reset(self) -> None:
+        self.root_node = TreeNode()
+        self.root_node.key = RadixKey([], None)
+        self.root_node.value = []
+        self.root_node.full_lock_ref = 1
+        self.root_node.mamba_lock_ref = 1
+        self.full_evictable_size_ = 0
+        self.mamba_evictable_size_ = 0
+        self.full_protected_size_ = 0
+        self.mamba_protected_size_ = 0
+        # LRU lists are used to maintain the order of eviction of the nodes in the tree
+        self.full_lru_list = LRUList(mamba=False)
+        self.mamba_lru_list = LRUList(mamba=True)
+
+    def match_prefix(self, params: MatchPrefixParams) -> MatchResult:
+        """Find the matching prefix from the radix tree.
+        Args:
+            params: MatchPrefixParams containing key and optional Mamba-specific parameters.
+        Returns:
+            A tuple of a tensor of matching prefix token IDs and
+            the last node that contains the prefix values. Note that
+            this API can modify the internal state of the Radix tree.
+            The last node create a new child if the prefix is shorter
+            than the last node's value.
+        """
+        key = self._match_pre_processor(params)
+        if key is None:
+            return MatchResult(
+                device_indices=torch.empty(
+                    (0,),
+                    dtype=torch.int64,
+                    device=self.device,
+                ),
+                last_device_node=self.root_node,
+                last_host_node=self.root_node,
+            )
+
+        value, last_node, best_value_len = self._match_prefix_helper(key)
+        return self._match_post_processor(params, value, last_node, best_value_len)
+
+    def insert(self, params: InsertParams) -> InsertResult:
+        if self.disable:
+            return InsertResult(prefix_len=0, mamba_exist=False)
+
+        key = params.key
+        value = params.value
+        mamba_value = params.mamba_value
+
+        if value is None:
+            value = torch.tensor([x for x in key.token_ids], dtype=torch.int64)
+        prefix_len, mamba_exist = self._insert_helper(
+            self.root_node, key, value, mamba_value
+        )
+        return InsertResult(prefix_len=prefix_len, mamba_exist=mamba_exist)
+
+    def cache_finished_req(self, req: Req, is_insert: bool = True) -> None:
+        """Cache request when it finishes."""
+        kv_committed_len = req.pop_committed_kv_cache()
+
+        if self.disable:
+            kv_indices = self.req_to_token_pool.req_to_token[
+                req.req_pool_idx, :kv_committed_len
+            ]
+            self.token_to_kv_pool_allocator.free(kv_indices)
+            self.req_to_token_pool.free_mamba_cache(req)
+            return
+
+        token_ids = (req.origin_input_ids + req.output_ids)[:kv_committed_len]
+        kv_indices = self.req_to_token_pool.req_to_token[
+            req.req_pool_idx, :kv_committed_len
+        ]
+
+        if is_insert:
+            cache_len = (
+                req.mamba_last_track_seqlen
+                if self.enable_mamba_extra_buffer
+                else len(token_ids)
+            )
+            if cache_len is None:
+                cache_len = 0
+            if cache_len != len(token_ids):
+                cache_end_idx = max(cache_len, req.cache_protected_len)
+                self.token_to_kv_pool_allocator.free(kv_indices[cache_end_idx:])
+                token_ids = token_ids[:cache_len]
+                kv_indices = kv_indices[:cache_len]
+
+            if self.page_size != 1:
+                page_aligned_len = len(kv_indices) // self.page_size * self.page_size
+                page_aligned_kv_indices = kv_indices[:page_aligned_len].to(
+                    dtype=torch.int64, copy=True
+                )
+            else:
+                page_aligned_len = len(kv_indices)
+                page_aligned_kv_indices = kv_indices.to(dtype=torch.int64, copy=True)
+
+            assert (
+                cache_len == page_aligned_len
+            ), f"It is required {cache_len=}, {page_aligned_len=}, {kv_committed_len=}, {len(req.origin_input_ids)=}, {len(req.output_ids)=} ping @yizhang2077 if you see this"
+
+            # Radix Cache takes one ref in memory pool
+            # insert the token_ids and kv_indices into the radix tree
+            if self.enable_mamba_extra_buffer:
+                mamba_ping_pong_track_buffer_to_keep = (
+                    self.req_to_token_pool.get_mamba_ping_pong_other_idx(
+                        req.mamba_next_track_idx
+                    )
+                )
+                mamba_value = (
+                    req.mamba_ping_pong_track_buffer[
+                        mamba_ping_pong_track_buffer_to_keep
+                    ]
+                    .unsqueeze(-1)
+                    .clone()
+                )
+            else:
+                mamba_value = req.mamba_pool_idx.unsqueeze(-1).clone()
+                mamba_ping_pong_track_buffer_to_keep = None
+
+            result = self.insert(
+                InsertParams(
+                    key=RadixKey(token_ids[:page_aligned_len], req.extra_key),
+                    value=page_aligned_kv_indices,
+                    mamba_value=mamba_value,
+                )
+            )
+            new_prefix_len, mamba_exist = result.prefix_len, result.mamba_exist
+
+            self.token_to_kv_pool_allocator.free(
+                kv_indices[req.cache_protected_len : new_prefix_len]
+            )
+        else:
+            self.token_to_kv_pool_allocator.free(kv_indices[req.cache_protected_len :])
+            mamba_exist = True
+
+        if mamba_exist:
+            mamba_ping_pong_track_buffer_to_keep = None
+
+        free_mamba_cache = True if self.enable_mamba_extra_buffer else mamba_exist
+
+        if free_mamba_cache:
+            self.req_to_token_pool.free_mamba_cache(
+                req,
+                mamba_ping_pong_track_buffer_to_keep=mamba_ping_pong_track_buffer_to_keep,
+            )
+
+        self.dec_lock_ref(req.last_node)
+
+    def cache_unfinished_req(self, req: Req, chunked=False) -> None:
+        """Cache request when it is unfinished."""
+
+        def _skip_cache_unfinished_req(req: Req) -> None:
+            kv_indices = self.req_to_token_pool.req_to_token[
+                req.req_pool_idx, : len(req.fill_ids)
+            ]
+
+            # `req.prefix_indices` will be used in `PrefillAdder::add_chunked_req` later
+            req.prefix_indices = kv_indices.to(dtype=torch.int64, copy=True)
+            return
+
+        token_ids = req.fill_ids
+        cache_len = (
+            req.mamba_last_track_seqlen
+            if self.enable_mamba_extra_buffer
+            else len(token_ids)
+        )
+        if self.disable or cache_len is None:
+            return _skip_cache_unfinished_req(req)
+
+        kv_indices_orig = self.req_to_token_pool.req_to_token[
+            req.req_pool_idx, : len(token_ids)
+        ]
+        # kv_indices is the kv indices to be cached
+        kv_indices = kv_indices_orig[:cache_len]
+        if self.page_size != 1:
+            page_aligned_len = len(kv_indices) // self.page_size * self.page_size
+            page_aligned_kv_indices = kv_indices[:page_aligned_len].to(
+                dtype=torch.int64, copy=True
+            )
+        else:
+            page_aligned_len = len(kv_indices)
+            page_aligned_kv_indices = kv_indices.to(dtype=torch.int64, copy=True)
+
+        assert page_aligned_len == len(
+            kv_indices
+        ), f"page_aligned_len != len(kv_indices), {page_aligned_len=}, {len(kv_indices)=}, {cache_len=}, {self.page_size=}, {FLA_CHUNK_SIZE=}"
+
+        page_aligned_token_ids = token_ids[:page_aligned_len]
+
+        if self.enable_mamba_extra_buffer:
+            # copy from the ping pong track buffer
+            mamba_ping_pong_track_buffer_to_keep = (
+                self.req_to_token_pool.get_mamba_ping_pong_other_idx(
+                    req.mamba_next_track_idx
+                )
+            )
+            mamba_value = (
+                req.mamba_ping_pong_track_buffer[mamba_ping_pong_track_buffer_to_keep]
+                .unsqueeze(-1)
+                .clone()
+            )
+        else:
+            mamba_value = self.req_to_token_pool.get_mamba_indices(
+                req.req_pool_idx
+            ).unsqueeze(-1)
+        # radix tree mamba value is forked from req space
+        mamba_value_forked = self.req_to_token_pool.mamba_pool.fork_from(mamba_value)
+
+        # if alloc mamba cache failed, do evict and alloc again
+        if mamba_value_forked is None:
+            self.evict(EvictParams(num_tokens=0, mamba_num=1))
+            mamba_value_forked = self.req_to_token_pool.mamba_pool.fork_from(
+                mamba_value
+            )
+            assert mamba_value_forked is not None, "Can not alloc mamba cache"
+        result = self.insert(
+            InsertParams(
+                key=RadixKey(page_aligned_token_ids, req.extra_key),
+                value=page_aligned_kv_indices,
+                mamba_value=mamba_value_forked,
+            )
+        )
+        new_prefix_len, mamba_exist = result.prefix_len, result.mamba_exist
+        self.token_to_kv_pool_allocator.free(
+            kv_indices[req.cache_protected_len : new_prefix_len]
+        )
+        # there is a mamba cache in radix cache, release it
+        if mamba_exist:
+            self.req_to_token_pool.mamba_pool.free(mamba_value_forked)
+
+        # The prefix indices could be updated, reuse it
+        match_result = self.match_prefix(
+            MatchPrefixParams(key=RadixKey(page_aligned_token_ids, req.extra_key))
+        )
+        new_indices, new_last_node = (
+            match_result.device_indices,
+            match_result.last_device_node,
+        )
+
+        if not mamba_exist:
+            assert torch.equal(new_last_node.mamba_value, mamba_value_forked)
+
+        assert (
+            req.cache_protected_len <= len(new_indices) + self.page_size - 1
+        ), f"{req.cache_protected_len=}, {len(new_indices)=}, {len(page_aligned_token_ids)=}, {mamba_exist=}"
+        assert new_prefix_len <= len(
+            new_indices
+        ), f"{new_prefix_len=}, {len(new_indices)=}"
+
+        self.req_to_token_pool.write(
+            (req.req_pool_idx, slice(req.cache_protected_len, len(new_indices))),
+            new_indices[req.cache_protected_len :],
+        )
+
+        self.dec_lock_ref(req.last_node)
+        self.inc_lock_ref(new_last_node)
+
+        # `req.prefix_indices` will be used in `PrefillAdder::add_chunked_req` later
+        # NOTE: this is needed for both page_size == 1 and page_size > 1
+        req.prefix_indices = torch.cat(
+            [new_indices, kv_indices_orig[len(new_indices) :]]
+        )
+        req.cache_protected_len = len(new_indices)
+        req.mamba_last_track_seqlen = None
+        req.last_node = new_last_node
+
+    def pretty_print(self) -> None:
+        self._print_helper(self.root_node, 0)
+        total_size, total_mamba_size = self._total_size_helper()
+        print(f"#full_tokens: {total_size}, #mamba_num: {total_mamba_size}")
+
+    def total_size(self) -> Tuple[int, int]:
+        return self._total_size_helper()
+
+    def _evict_leaf_node(
+        self, x: TreeNode, is_evict_mamba: bool
+    ) -> Tuple[int, int, TreeNode, TreeNode]:
+        assert (
+            x.full_lock_ref == 0 and x.mamba_lock_ref == 0
+        ), f"evict leaf node invalid with {x.id=} {x.full_lock_ref=} {x.mamba_lock_ref=}"
+
+        assert x.mamba_value is not None, f"leaf node mamba value is not None, {x.id=}"
+        # 1. a leaf node, free full tokens and mamba
+        self.token_to_kv_pool_allocator.free(x.value)
+        full_num_evicted = len(x.value)
+        self.req_to_token_pool.mamba_pool.free(x.mamba_value)
+        mamba_num_evicted = len(x.mamba_value)
+
+        # 2. get the next node, update the lru lists
+        if is_evict_mamba:
+            x_next = self.mamba_lru_list.get_prev_no_lock(x)
+        else:
+            x_next = self.full_lru_list.get_prev_leaf_no_lock(x)
+        self.full_lru_list.remove_node(x)
+        self.mamba_lru_list.remove_node(x)
+
+        # 3. delete the leaf node
+        self._delete_leaf(x)
+
+        # 4. Iteratively delete tombstone leaves to maintain invariant that leaf nodes are not tombstone
+        x, leaf_full_num_evicted = self._iteratively_delete_tombstone_leaf(x)
+        full_num_evicted += leaf_full_num_evicted
+        return full_num_evicted, mamba_num_evicted, x, x_next
+
+    def evict(self, params: EvictParams) -> EvictResult:
+        if self.disable:
+            return EvictResult()
+
+        full_num_evicted = 0
+        mamba_num_evicted = 0
+
+        if params.num_tokens > 0:
+            full_num_evicted = self.evict_full(params.num_tokens)
+        if params.mamba_num > 0:
+            mamba_num_evicted = self.evict_mamba(params.mamba_num)
+
+        return EvictResult(
+            num_tokens_evicted=full_num_evicted, mamba_num_evicted=mamba_num_evicted
+        )
+
+    def evict_mamba(self, mamba_num: int) -> int:
+        """Evict mamba states. Returns the number of mamba states evicted."""
+        if self.disable or mamba_num <= 0:
+            return 0
+        # get the least recently used node that is not locked, doesn't have to be a leaf
+        x = self.mamba_lru_list.get_lru_no_lock()
+        mamba_num_evicted = 0
+        # evict lru leaf nodes until mamba_num_tokens is reached
+        while mamba_num_evicted < mamba_num and (self.mamba_lru_list.in_list(x)):
+            assert x.mamba_value is not None, f"node has no mamba value, {x.id=}"
+            assert (
+                len(x.mamba_value) == 1
+            ), f"node has abnormal mamba length, {x.id=}, {len(x.mamba_value)=}"
+            assert x != self.root_node, f"root node is not evictable, {x.id=}"
+            assert x.mamba_lock_ref == 0, f"node is in use by mamba kv indices, {x.id=}"
+
+            if len(x.children) > 0:
+                # 1. an internal node, free mamba tokens.
+                self.req_to_token_pool.mamba_pool.free(x.mamba_value)
+                mamba_num_evicted += len(x.mamba_value)
+
+                # 2. get the next node, update the lru lists
+                x_next = self.mamba_lru_list.get_prev_no_lock(x)
+                self.mamba_lru_list.remove_node(x)
+
+                # 3. tombstone the node
+                self._tombstone_internal_node(x)
+            else:
+                _, mamba_evicted_delta, _, x_next = self._evict_leaf_node(x, True)
+                mamba_num_evicted += mamba_evicted_delta
+
+            x = x_next
+
+        return mamba_num_evicted
+
+    def evict_full(self, full_num_tokens: int) -> int:
+        """Evict full KV cache. Returns the number of tokens evicted."""
+        if self.disable or full_num_tokens <= 0:
+            return 0
+
+        full_num_evicted = 0
+        # get the least recently used leaf node that is not locked
+        x = self.full_lru_list.get_leaf_lru_no_lock()
+
+        while full_num_evicted < full_num_tokens and self.full_lru_list.in_list(x):
+            assert (
+                x != self.root_node
+            ), f"root node should not exist in full lru list, {x.id=}"
+            full_num_evicted_delta, _, x, x_next = self._evict_leaf_node(x, False)
+            full_num_evicted += full_num_evicted_delta
+
+            # if parent has no more children, it is a leaf. It is possible that this node is lru, so
+            # we need to get the first leaf node in the lru list
+            if len(x.parent.children) == 0:
+                x_next = self.full_lru_list.get_leaf_lru_no_lock()
+
+            x = x_next
+
+        return full_num_evicted
+
+    def inc_lock_ref(self, node: TreeNode) -> Optional[int]:
+        """
+        Increment the lock reference count for the node.
+        It locks the full_lock_ref for nodes between the [last node, root), exclusive.
+        It locks the mamba_lock_ref for current node if its mamba_value exists.
+        """
+        if self.disable:
+            return None
+
+        # protect mamba value in current node if it exists
+        if node.mamba_value is not None:
+            if node.mamba_lock_ref == 0:
+                self.mamba_evictable_size_ -= len(node.mamba_value)
+                self.mamba_protected_size_ += len(node.mamba_value)
+            node.mamba_lock_ref += 1
+
+        while node != self.root_node:
+            # lock full from node to root
+            assert (
+                node.full_lock_ref >= 0
+            ), f"inc_lock_ref on node with {node.full_lock_ref=}, {node.id=}"
+            if node.full_lock_ref == 0:
+                self.full_evictable_size_ -= len(node.value)
+                self.full_protected_size_ += len(node.value)
+            node.full_lock_ref += 1
+            node = node.parent
+        return None
+
+    def dec_lock_ref(self, node: TreeNode):
+        """
+        Decrement the lock reference count for the node.
+        It unlocks the full_lock_ref for nodes between the [last node, root), exclusive.
+        It unlocks the mamba_lock_ref for current node if its mamba_value exists.
+        """
+        if self.disable:
+            return None
+
+        if node.mamba_value is not None:
+            assert (
+                node.mamba_lock_ref > 0
+            ), f"dec_lock_ref on node with {node.mamba_lock_ref=}, {node.id=}"
+            if node.mamba_lock_ref == 1:
+                self.mamba_evictable_size_ += len(node.mamba_value)
+                self.mamba_protected_size_ -= len(node.mamba_value)
+            node.mamba_lock_ref -= 1
+
+        while node != self.root_node:
+            assert (
+                node.full_lock_ref > 0
+            ), f"dec_lock_ref on node with {node.full_lock_ref=}, {node.id=}"
+            if node.full_lock_ref == 1:
+                self.full_evictable_size_ += len(node.value)
+                self.full_protected_size_ -= len(node.value)
+            node.full_lock_ref -= 1
+            node = node.parent
+
+    def sanity_check(self):
+        if self.disable:
+            return
+        self.full_lru_list.sanity_check(self)
+        self.mamba_lru_list.sanity_check(self)
+
+    def evictable_size(self) -> Tuple[int, int]:
+        # Note: use full_evictable_size() and mamba_evictable_size() instead.
+        raise NotImplementedError
+
+    def full_evictable_size(self) -> int:
+        return self.full_evictable_size_
+
+    def mamba_evictable_size(self) -> int:
+        return self.mamba_evictable_size_
+
+    def protected_size(self) -> Tuple[int, int]:
+        # Note: use full_protected_size() and mamba_protected_size() instead.
+        raise NotImplementedError
+
+    def full_protected_size(self) -> int:
+        # protected size refers to the size of the full cache that is locked
+        return self.full_protected_size_
+
+    def mamba_protected_size(self) -> int:
+        # protected size refers to the size of the mamba cache that is locked
+        return self.mamba_protected_size_
+
+    def all_values_flatten(self) -> torch.Tensor:
+        values = []
+
+        def _dfs_helper(node: TreeNode):
+            for _, child in node.children.items():
+                values.append(child.value)
+                _dfs_helper(child)
+
+        _dfs_helper(self.root_node)
+        return torch.cat(values) if len(values) > 0 else torch.tensor([])
+
+    def all_mamba_values_flatten(self) -> torch.Tensor:
+        values = []
+
+        def _dfs_helper(node: TreeNode):
+            if node.mamba_value is not None:
+                values.append(node.mamba_value)
+            for _, child in node.children.items():
+                _dfs_helper(child)
+
+        _dfs_helper(self.root_node)
+        return torch.cat(values) if len(values) > 0 else torch.tensor([])
+
+    def available_and_evictable_str(self) -> str:
+        full_available_size = self.token_to_kv_pool_allocator.available_size()
+        full_evictable_size = self.full_evictable_size()
+        return (
+            f"Available full tokens: {full_available_size + full_evictable_size} ({full_available_size=} + {full_evictable_size=})\n"
+            f"Full LRU list evictable size: {self.full_lru_list.sanity_check_evictable_size()}\n"
+        )
+
+    ##### Internal Helper Functions #####
+
+    def _match_prefix_helper(
+        self, key: RadixKey
+    ) -> Tuple[List[torch.Tensor], TreeNode, int]:
+        """
+        Mamba prefix matching helper. It factors in the sliding window size such that
+        the matched node is guaranteed to either 1. connected to root without mamba tombstone,
+        or 2. the number of matching tokens from the matched node to the last mamba tombstone
+        node is greater than or equal to the sliding window size.
+        """
+        node = self.root_node
+        child_key = self.get_child_key_fn(key)
+
+        value: List[torch.Tensor] = []
+        best_value_len = 0
+        best_last_node = node
+        while len(key) > 0 and child_key in node.children.keys():
+            child = node.children[child_key]
+            # update best_value_len and best_last_node if needed
+            if node.mamba_value is not None:
+                best_value_len = len(value)
+                best_last_node = node
+
+            prefix_len = self.key_match_fn(child.key, key)
+            if prefix_len < len(child.key):
+                new_node = self._split_node(child.key, child, prefix_len)
+                value.append(new_node.value)
+                node = new_node
+                break
+            else:
+                value.append(child.value)
+                node = child
+                key = key[prefix_len:]
+
+                if len(key):
+                    child_key = self.get_child_key_fn(key)
+        # handle best_value_len and best_last_node, for the case that last node is fully matched
+        if node.mamba_value is not None:
+            best_value_len = len(value)
+            best_last_node = node
+
+        return value, best_last_node, best_value_len
+
+    def _match_pre_processor(self, params: MatchPrefixParams) -> Optional[RadixKey]:
+        """Preprocess the key before matching."""
+        key = params.key
+
+        if self.disable or len(key) == 0:
+            return None
+
+        return key
+
+    def _match_post_processor(
+        self,
+        params: MatchPrefixParams,
+        value: List[torch.Tensor],
+        last_node: TreeNode,
+        best_value_len: int,
+    ) -> MatchResult:
+        """Post-process the matched result."""
+        cow_mamba = params.cow_mamba
+        req = params.req
+
+        # update time for matched nodes, and make nodes closer to root to be least recently used
+        # this allows mamba to evict nodes closer to root first
+        node_update = last_node
+        self.full_lru_list.reset_node_and_parents_mru(node_update, self.root_node)
+        self.mamba_lru_list.reset_node_and_parents_mru(node_update, self.root_node)
+
+        # This last_access_time is for sanity check, can be deleted after validation in production
+        cur_time = get_last_access_time()
+        while node_update:
+            node_update.last_access_time = cur_time
+            cur_time -= (
+                0.00001  # assuming less than 100000 nodes in a branch of the tree
+            )
+            node_update = node_update.parent
+
+        # Calculate the branching point. It is defined as the last aligned position that
+        # does not have a mamba value.
+        if len(value) > best_value_len:
+            mamba_cache_chunk_size = get_global_server_args().mamba_cache_chunk_size
+            mamba_cache_chunk_aligned_seqlen = (
+                sum(len(v) for v in value) // mamba_cache_chunk_size
+            ) * mamba_cache_chunk_size
+            mamba_branching_seqlen = (
+                mamba_cache_chunk_aligned_seqlen
+                if mamba_cache_chunk_aligned_seqlen > 0
+                else None
+            )
+        else:
+            mamba_branching_seqlen = None
+
+        # Copy mamba state to req local space if cow is true
+        if cow_mamba and last_node.mamba_value is not None:
+            # for reqs without mamba cache
+            if req.mamba_pool_idx is None:
+                dst_index = self.req_to_token_pool.mamba_pool.alloc(1)
+                # try to alloc again, protect last_node from eviction
+                if dst_index is None:
+                    self.inc_lock_ref(last_node)
+                    self.evict(EvictParams(num_tokens=0, mamba_num=1))
+                    dst_index = self.req_to_token_pool.mamba_pool.alloc(1)
+                    self.dec_lock_ref(last_node)
+                    assert dst_index is not None, "Can not alloc mamba cache"
+                src_index = last_node.mamba_value
+                self.req_to_token_pool.mamba_pool.copy_from(src_index, dst_index)
+                req.mamba_pool_idx = dst_index[0]
+            else:
+                src_index = last_node.mamba_value
+                dst_index = req.mamba_pool_idx.unsqueeze(0)
+                self.req_to_token_pool.mamba_pool.copy_from(src_index, dst_index)
+
+        value = value[:best_value_len]
+        if value:
+            value = torch.cat(value)
+        else:
+            value = torch.empty((0,), dtype=torch.int64, device=self.device)
+
+        return MatchResult(
+            device_indices=value,
+            last_device_node=last_node,
+            last_host_node=last_node,
+            mamba_branching_seqlen=mamba_branching_seqlen,
+        )
+
+    def _split_node(self, key: RadixKey, child: TreeNode, split_len: int) -> TreeNode:
+        # new_node -> child
+        new_node = TreeNode()
+        new_node.children = {self.get_child_key_fn(key[split_len:]): child}
+        new_node.parent = child.parent
+        new_node.mamba_value = None  # mamba cache can not be split
+        new_node.full_lock_ref = child.full_lock_ref
+        new_node.mamba_lock_ref = 0
+        new_node.key = child.key[:split_len]
+        new_node.value = child.value[:split_len].clone()
+
+        # child time should be later than parent's time for mamba tombstone
+        child.last_access_time = get_last_access_time()
+
+        self.full_lru_list.remove_node(child)
+        if child.mamba_value is not None:
+            self.mamba_lru_list.remove_node(child)
+        child.parent = new_node
+        child.key = child.key[split_len:]
+        child.value = child.value[split_len:].clone()
+        new_node.parent.children[self.get_child_key_fn(key)] = new_node
+
+        # insert the new node and child into the lru lists, insert
+        # parent first so that parent is after child in the lru list
+        self.full_lru_list.insert_mru(new_node)
+        self.full_lru_list.insert_mru(child)
+        if child.mamba_value is not None:
+            self.mamba_lru_list.insert_mru(child)
+        return new_node
+
+    def _insert_helper(
+        self,
+        node: TreeNode,
+        key: RadixKey,
+        value,
+        mamba_value,
+    ) -> Tuple[int, bool]:
+        # Update the last access time from root to leaf, so that
+        # mamba will tombstone the node closer to root first
+        assert mamba_value is not None, "Mamba value should not be None here."
+        node.last_access_time = get_last_access_time()
+        if node != self.root_node:
+            self.full_lru_list.reset_node_mru(node)
+            if node.mamba_value is not None:
+                self.mamba_lru_list.reset_node_mru(node)
+        if len(key) == 0:
+            return 0, True
+
+        child_key = self.get_child_key_fn(key)
+
+        total_prefix_length = 0
+        while len(key) > 0 and child_key in node.children.keys():
+            node = node.children[child_key]
+            node.last_access_time = get_last_access_time()
+            self.full_lru_list.reset_node_mru(node)
+            if node.mamba_value is not None:
+                self.mamba_lru_list.reset_node_mru(node)
+            prefix_len = self.key_match_fn(node.key, key)
+            total_prefix_length += prefix_len
+            key = key[prefix_len:]
+            value = value[prefix_len:]
+
+            if prefix_len < len(node.key):
+                new_node = self._split_node(node.key, node, prefix_len)
+                node = new_node
+
+            if len(key):
+                child_key = self.get_child_key_fn(key)
+
+        mamba_value_exist = False
+        if len(key):
+            new_node = TreeNode()
+            new_node.parent = node
+            new_node.key = key
+            new_node.value = value.clone()
+            new_node.mamba_value = mamba_value
+            self.full_lru_list.insert_mru(new_node)
+            self.mamba_lru_list.insert_mru(new_node)
+            node.children[child_key] = new_node
+            self.full_evictable_size_ += len(value)
+            self.mamba_evictable_size_ += len(mamba_value)
+        elif node.mamba_value is None:  # add for mamba tombstone
+            node.mamba_value = mamba_value
+            self.full_lru_list.reset_node_mru(node)
+            self.mamba_lru_list.insert_mru(node)
+            self.mamba_evictable_size_ += len(mamba_value)
+            node.last_access_time = get_last_access_time()
+        else:  # mamba value already exists
+            mamba_value_exist = True
+            self.full_lru_list.reset_node_mru(node)
+            self.mamba_lru_list.reset_node_mru(node)
+            node.last_access_time = get_last_access_time()
+
+        return total_prefix_length, mamba_value_exist
+
+    def _iteratively_delete_tombstone_leaf(
+        self, node: TreeNode
+    ) -> Tuple[TreeNode, int]:
+        full_num_evicted = 0
+        while node.parent.mamba_value is None and len(node.parent.children) == 0:
+            # root node is not evictable
+            if node.parent == self.root_node:
+                break
+            # if locked, means node is in use, skip
+            if node.parent.full_lock_ref > 0:
+                break
+            assert (
+                node.parent.mamba_lock_ref == 0
+            ), f"tombstone mamba_lock_ref should always be 0, {node.parent.full_lock_ref=}, {node.parent.mamba_lock_ref=}, {node.parent.id=}"
+            # delete tombstone node evicts full tokens
+            self.token_to_kv_pool_allocator.free(node.parent.value)
+            full_num_evicted += len(node.parent.value)
+            self.full_lru_list.remove_node(node.parent)
+            self._delete_tombstone_leaf(node.parent)
+            node = node.parent
+
+        return node, full_num_evicted
+
+    def _delete_leaf(self, node: TreeNode) -> None:
+        assert (
+            node.mamba_value is not None
+        ), f"Invariant violated: leaf node is a tombstone, {node.id=}"
+        assert len(node.children) == 0, f"leaf node has children, {node.id=}"
+        key = self.get_child_key_fn(node.key)
+        v = node.parent.children.pop(key, None)
+        assert v == node, f"parent does not have child key, {key}"
+
+        self.full_evictable_size_ -= len(node.key)
+        self.mamba_evictable_size_ -= len(node.mamba_value)
+
+    def _tombstone_internal_node(self, node: TreeNode) -> None:
+        assert len(node.children) != 0, f"Cannot tombstone a leaf node, {node.id=}"
+        self.mamba_evictable_size_ -= len(node.mamba_value)
+        node.mamba_value = None
+
+    def _delete_tombstone_leaf(self, node: TreeNode) -> None:
+        assert (
+            node.mamba_value is None
+        ), f"Deleting a unexpected non-tombstone leaf node, {node.id=}"
+        assert len(node.children) == 0, f"leaf node has children, {node.id=}"
+        key = self.get_child_key_fn(node.key)
+        v = node.parent.children.pop(key, None)
+        assert v == node, f"parent does not have child key, {key}"
+
+        self.full_evictable_size_ -= len(node.key)
+
+    def _collect_nontombstone_nodes(self) -> List[TreeNode]:
+        ret_list = []
+        stack = [self.root_node]
+
+        while stack:
+            cur_node = stack.pop()
+            if cur_node.mamba_value is not None:
+                ret_list.append(cur_node)
+            stack.extend(cur_node.children.values())
+
+        return ret_list
+
+    def _collect_all_nodes(self) -> List[TreeNode]:
+        ret_list = []
+        stack = [self.root_node]
+        while stack:
+            cur_node = stack.pop()
+            ret_list.append(cur_node)
+            stack.extend(cur_node.children.values())
+        return ret_list
+
+    def _print_helper(self, node: TreeNode, indent: int) -> None:
+        """Prints the radix tree in a human-readable format."""
+        stack = [(node, indent)]
+        while stack:
+            current_node, current_indent = stack.pop()
+            print(
+                " " * current_indent,
+                f"[{current_node.id}]",
+                len(current_node.key),
+                f"fr={current_node.full_lock_ref}",
+                f"mr={current_node.mamba_lock_ref}",
+                f"fll={self.full_lru_list.in_list(current_node)}",
+                f"mll={self.mamba_lru_list.in_list(current_node)}",
+                f"mv={current_node.mamba_value}",
+            )
+            for key, child in current_node.children.items():
+                stack.append((child, current_indent + 2))
+
+                assert key == self.get_child_key_fn(
+                    child.key
+                ), f"{key=}, {self.get_child_key_fn(child.key)=}"
+
+    def _total_size_helper(self) -> Tuple[int, int]:
+        total_size = 0
+        total_mamba_size = 0
+        stack = [self.root_node]
+        while stack:
+            current_node = stack.pop()
+            total_size += len(current_node.value)
+            if current_node.mamba_value is not None:
+                total_mamba_size += len(current_node.mamba_value)
+            for child in current_node.children.values():
+                if child.evicted:
+                    continue
+                stack.append(child)
+        return total_size, total_mamba_size
diff --git a/sglang/python/sglang/srt/mem_cache/memory_pool.py b/sglang/python/sglang/srt/mem_cache/memory_pool.py
new file mode 100644
index 0000000000000000000000000000000000000000..16b1410c30903baf4cbca3992478fb953e02bcca
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/memory_pool.py
@@ -0,0 +1,2052 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from __future__ import annotations
+
+"""
+Memory pool.
+
+SGLang has two levels of memory pool.
+ReqToTokenPool maps a request to its token locations.
+TokenToKVPoolAllocator manages the indices to kv cache data.
+KVCache actually holds the physical kv cache.
+"""
+
+import abc
+import dataclasses
+import logging
+from contextlib import contextmanager, nullcontext
+from dataclasses import dataclass, fields
+from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+import triton
+import triton.language as tl
+
+from sglang.jit_kernel.kvcache import can_use_store_cache, store_cache
+from sglang.srt.configs.mamba_utils import BaseLinearStateParams
+from sglang.srt.constants import GPU_MEMORY_TYPE_KV_CACHE
+from sglang.srt.environ import envs
+from sglang.srt.layers.attention.nsa import index_buf_accessor
+from sglang.srt.layers.attention.nsa.quant_k_cache import (
+    quantize_k_cache,
+    quantize_k_cache_separate,
+)
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.mem_cache.utils import (
+    get_mla_kv_buffer_triton,
+    maybe_init_custom_mem_pool,
+    set_mla_kv_buffer_triton,
+    set_mla_kv_scale_buffer_triton,
+)
+from sglang.srt.utils import (
+    cpu_has_amx_support,
+    is_cpu,
+    is_cuda,
+    is_hip,
+    is_npu,
+    next_power_of_2,
+)
+from sglang.srt.utils.custom_op import register_custom_op
+from sglang.srt.utils.torch_memory_saver_adapter import TorchMemorySaverAdapter
+
+store_cache = register_custom_op(store_cache, mutates_args=["k_cache", "v_cache"])
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.cache_controller import LayerDoneCounter
+    from sglang.srt.managers.schedule_batch import Req
+
+
+logger = logging.getLogger(__name__)
+
+GB = 1024 * 1024 * 1024
+_is_cuda = is_cuda()
+_is_npu = is_npu()
+_is_cpu = is_cpu()
+_cpu_has_amx_support = cpu_has_amx_support()
+_is_hip = is_hip()
+
+
+def get_tensor_size_bytes(t: Union[torch.Tensor, List[torch.Tensor]]):
+    if isinstance(t, list):
+        return sum(get_tensor_size_bytes(x) for x in t)
+    return np.prod(t.shape) * t.dtype.itemsize
+
+
+def _set_kv_buffer_impl(
+    k: torch.Tensor,
+    v: torch.Tensor,
+    k_cache: torch.Tensor,
+    v_cache: torch.Tensor,
+    indices: torch.Tensor,
+    row_dim: int,  # head_num * head_dim
+    store_dtype: torch.dtype,
+    device_module: Any,
+    alt_stream: Optional[torch.cuda.Stream] = None,
+    same_kv_dim: bool = True,
+) -> None:
+    row_bytes = row_dim * store_dtype.itemsize
+    if (_is_cuda or _is_hip) and same_kv_dim and can_use_store_cache(row_bytes):
+        return store_cache(
+            k.view(-1, row_dim),
+            v.view(-1, row_dim),
+            k_cache.view(-1, row_dim),
+            v_cache.view(-1, row_dim),
+            indices,
+            row_bytes=row_bytes,
+        )
+
+    from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
+
+    if get_is_capture_mode() and alt_stream is not None:
+        current_stream = device_module.current_stream()
+        alt_stream.wait_stream(current_stream)
+        k_cache[indices] = k
+        with device_module.stream(alt_stream):
+            v_cache[indices] = v
+        current_stream.wait_stream(alt_stream)
+    else:  # fallback to naive implementation
+        k_cache[indices] = k
+        v_cache[indices] = v
+
+
+class ReqToTokenPool:
+    """A memory pool that maps a request to its token locations."""
+
+    def __init__(
+        self,
+        size: int,
+        max_context_len: int,
+        device: str,
+        enable_memory_saver: bool,
+    ):
+        memory_saver_adapter = TorchMemorySaverAdapter.create(
+            enable=enable_memory_saver
+        )
+
+        self.size = size
+        self.max_context_len = max_context_len
+        self.device = device
+        with memory_saver_adapter.region(GPU_MEMORY_TYPE_KV_CACHE):
+            self.req_to_token = torch.zeros(
+                (size, max_context_len), dtype=torch.int32, device=device
+            )
+        self.free_slots = list(range(size))
+
+    def write(self, indices, values):
+        self.req_to_token[indices] = values
+
+    def available_size(self):
+        return len(self.free_slots)
+
+    def alloc(self, reqs: list[Req]) -> Optional[List[int]]:
+        # Indices of reqs that already have a req_pool_idx and will reuse
+        # their existing slot (e.g. chunked prefill continuing across chunks).
+        reusing = [i for i, r in enumerate(reqs) if r.req_pool_idx is not None]
+        if not any(r.is_dllm() for r in reqs):
+            assert (
+                sum(1 for i in reusing if reqs[i].is_chunked > 0) <= 1
+            ), "only one chunked request may reuse req_pool_idx in a batch"
+        assert all(
+            reqs[i].is_chunked > 0 or reqs[i].kv_committed_len > 0 for i in reusing
+        ), "reusing request must be chunked or have committed KV"
+
+        need_size = len(reqs) - len(reusing)
+        if need_size > len(self.free_slots):
+            return None
+        select_index = self.free_slots[:need_size]
+        self.free_slots = self.free_slots[need_size:]
+        offset = 0
+        for r in reqs:
+            if r.req_pool_idx is None:
+                r.req_pool_idx = select_index[offset]
+                offset += 1
+        return [r.req_pool_idx for r in reqs]
+
+    def free(self, req: Req):
+        assert req.req_pool_idx is not None, "request must have req_pool_idx"
+        self.free_slots.append(req.req_pool_idx)
+        req.req_pool_idx = None
+
+    def clear(self):
+        self.free_slots = list(range(self.size))
+
+
+class MambaPool:
+    @dataclass(frozen=True, kw_only=True)
+    class State:
+        conv: List[torch.Tensor]
+        temporal: torch.Tensor
+
+        def at_layer_idx(self, layer: int):
+            kwargs = {}
+            # Use fields instead of vars to avoid torch.compile graph break
+            for f in fields(self):
+                name = f.name
+                v = getattr(self, name)
+                if name in ("conv", "intermediate_conv_window"):
+                    kwargs[name] = [conv[layer] for conv in v]
+                else:
+                    kwargs[name] = v[layer]
+
+            return type(self)(**kwargs)
+
+        def mem_usage_bytes(self):
+            return sum(
+                get_tensor_size_bytes(getattr(self, f.name))
+                for f in dataclasses.fields(self)
+            )
+
+    @dataclass(frozen=True, kw_only=True)
+    class SpeculativeState(State):
+        intermediate_ssm: torch.Tensor
+        intermediate_conv_window: List[torch.Tensor]
+
+    def __init__(
+        self,
+        *,
+        size: int,
+        spec_state_size: int,
+        cache_params: BaseLinearStateParams,
+        device: str,
+        enable_memory_saver: bool = False,
+        speculative_num_draft_tokens: Optional[int] = None,
+    ):
+        conv_state_shape = cache_params.shape.conv
+        temporal_state_shape = cache_params.shape.temporal
+        conv_dtype = cache_params.dtype.conv
+        ssm_dtype = cache_params.dtype.temporal
+        self.memory_saver_adapter = TorchMemorySaverAdapter.create(
+            enable=enable_memory_saver
+        )
+        num_mamba_layers = len(cache_params.layers)
+
+        self.size = size
+        self.device = device
+
+        # for disagg with nvlink
+        self.enable_custom_mem_pool, self.custom_mem_pool, _ = (
+            maybe_init_custom_mem_pool(device=self.device)
+        )
+
+        with self.memory_saver_adapter.region(GPU_MEMORY_TYPE_KV_CACHE), (
+            torch.cuda.use_mem_pool(self.custom_mem_pool)
+            if self.enable_custom_mem_pool
+            else nullcontext()
+        ):
+            conv_state = [
+                torch.zeros(
+                    size=(num_mamba_layers, size + 1) + conv_shape,
+                    dtype=conv_dtype,
+                    device=device,
+                )
+                for conv_shape in conv_state_shape
+            ]
+
+            if _is_cpu and _cpu_has_amx_support:
+                from sglang.srt.layers.amx_utils import _init_amx_conv_state
+
+                # CPU uses a different layout of conv_state for kernel optimization
+                conv_state = _init_amx_conv_state(conv_state)
+
+            temporal_state = torch.zeros(
+                size=(num_mamba_layers, size + 1) + temporal_state_shape,
+                dtype=ssm_dtype,
+                device=device,
+            )
+            if speculative_num_draft_tokens is not None:
+                # Cache intermediate SSM states per draft token during target verify
+                # Shape: [num_layers, size + 1, speculative_num_draft_tokens, HV, K, V]
+                intermediate_ssm_state_cache = torch.zeros(
+                    size=(
+                        num_mamba_layers,
+                        spec_state_size + 1,
+                        speculative_num_draft_tokens,
+                        temporal_state_shape[0],
+                        temporal_state_shape[1],
+                        temporal_state_shape[2],
+                    ),
+                    dtype=ssm_dtype,
+                    device="cuda",
+                )
+                # Cache intermediate conv windows (last K-1 inputs) per draft token during target verify
+                # Shape: [num_layers, size + 1, speculative_num_draft_tokens, dim, K-1]
+                intermediate_conv_window_cache = [
+                    torch.zeros(
+                        size=(
+                            num_mamba_layers,
+                            spec_state_size + 1,
+                            speculative_num_draft_tokens,
+                            conv_shape[0],
+                            conv_shape[1],
+                        ),
+                        dtype=conv_dtype,
+                        device="cuda",
+                    )
+                    for conv_shape in conv_state_shape
+                ]
+                self.mamba_cache = self.SpeculativeState(
+                    conv=conv_state,
+                    temporal=temporal_state,
+                    intermediate_ssm=intermediate_ssm_state_cache,
+                    intermediate_conv_window=intermediate_conv_window_cache,
+                )
+                logger.info(
+                    f"Mamba Cache is allocated. "
+                    f"max_mamba_cache_size: {size}, "
+                    f"conv_state size: {get_tensor_size_bytes(conv_state) / GB:.2f}GB, "
+                    f"ssm_state size: {get_tensor_size_bytes(temporal_state) / GB:.2f}GB "
+                    f"intermediate_ssm_state_cache size: {get_tensor_size_bytes(intermediate_ssm_state_cache) / GB:.2f}GB "
+                    f"intermediate_conv_window_cache size: {get_tensor_size_bytes(intermediate_conv_window_cache) / GB:.2f}GB "
+                )
+            else:
+                self.mamba_cache = self.State(conv=conv_state, temporal=temporal_state)
+                logger.info(
+                    f"Mamba Cache is allocated. "
+                    f"max_mamba_cache_size: {size}, "
+                    f"conv_state size: {get_tensor_size_bytes(conv_state) / GB:.2f}GB, "
+                    f"ssm_state size: {get_tensor_size_bytes(temporal_state) / GB:.2f}GB "
+                )
+            # The padded slot 0 is used for writing dummy outputs from padded tokens.
+            self.free_slots = torch.arange(
+                1, self.size + 1, dtype=torch.int64, device=self.device
+            )
+            self.mem_usage = self.mamba_cache.mem_usage_bytes() / GB
+            self.num_mamba_layers = num_mamba_layers
+
+    def get_speculative_mamba2_params_all_layers(self) -> SpeculativeState:
+        assert isinstance(self.mamba_cache, self.SpeculativeState)
+        return self.mamba_cache
+
+    def mamba2_layer_cache(self, layer_id: int):
+        return self.mamba_cache.at_layer_idx(layer_id)
+
+    def available_size(self):
+        return len(self.free_slots)
+
+    def alloc(self, need_size: int) -> Optional[torch.Tensor]:
+        if need_size > len(self.free_slots):
+            return None
+
+        select_index = self.free_slots[:need_size]
+        self.free_slots = self.free_slots[need_size:]
+        # clear at alloc time — expand a scalar GPU zero to the right shape, no CPU-GPU sync
+        for i in range(len(self.mamba_cache.conv)):
+            t = self.mamba_cache.conv[i]
+            z = torch.zeros(1, dtype=t.dtype, device=t.device).expand(
+                t.shape[0], need_size, *t.shape[2:]
+            )
+            t[:, select_index] = z
+        t = self.mamba_cache.temporal
+        z = torch.zeros(1, dtype=t.dtype, device=t.device).expand(
+            t.shape[0], need_size, *t.shape[2:]
+        )
+        t[:, select_index] = z
+
+        return select_index
+
+    def free(self, free_index: torch.Tensor):
+        if free_index.numel() == 0:
+            return
+        self.free_slots = torch.cat((self.free_slots, free_index))
+
+    def clear(self):
+        self.free_slots = torch.arange(
+            1, self.size + 1, dtype=torch.int64, device=self.device
+        )
+
+    def copy_from(self, src_index: torch.Tensor, dst_index: torch.Tensor):
+        for i in range(len(self.mamba_cache.conv)):
+            self.mamba_cache.conv[i][:, dst_index] = self.mamba_cache.conv[i][
+                :, src_index
+            ]
+        self.mamba_cache.temporal[:, dst_index] = self.mamba_cache.temporal[
+            :, src_index
+        ]
+        return
+
+    def fork_from(self, src_index: torch.Tensor) -> Optional[torch.Tensor]:
+        dst_index = self.alloc(1)
+        if dst_index is None:
+            return None
+        self.copy_from(src_index, dst_index)
+        return dst_index
+
+    def get_contiguous_buf_infos(self):
+        """
+        Get buffer info for RDMA registration.
+        Only returns conv and temporal state buffers, excluding intermediate buffers
+        used for speculative decoding (intermediate_ssm, intermediate_conv_window).
+        """
+        state_tensors = []
+        for field in vars(self.mamba_cache):
+            # Skip intermediate buffers used only for speculative decoding
+            # These buffers have different size (spec_state_size + 1) and should not be transferred
+            if field in ("intermediate_ssm", "intermediate_conv_window"):
+                continue
+            value = getattr(self.mamba_cache, field)
+            if isinstance(value, list):
+                state_tensors.extend(value)
+            else:
+                state_tensors.append(value)
+        data_ptrs, data_lens, item_lens = [], [], []
+
+        for _, state_tensor in enumerate(state_tensors):
+            data_ptrs += [
+                state_tensor[i].data_ptr() for i in range(self.num_mamba_layers)
+            ]
+            data_lens += [state_tensor[i].nbytes for i in range(self.num_mamba_layers)]
+            item_lens += [
+                state_tensor[i][0].nbytes for i in range(self.num_mamba_layers)
+            ]
+        return data_ptrs, data_lens, item_lens
+
+    def get_state_dim_per_tensor(self):
+        """Get the sliceable dimension size for each state tensor.
+
+        For mamba state, the layout is:
+        - conv_state: [num_layers, size+1, conv_dim/tp, conv_kernel-1]
+        - temporal_state: [num_layers, size+1, num_heads/tp, head_dim, state_size]
+
+        The 3rd dimension (index 2) is the one that gets sliced by TP.
+        Returns the size of this dimension for each tensor (repeated for each layer).
+        """
+        state_tensors = []
+        for field in vars(self.mamba_cache):
+            value = getattr(self.mamba_cache, field)
+            if isinstance(value, list):
+                state_tensors.extend(value)
+            else:
+                state_tensors.append(value)
+
+        dim_per_tensor = []
+        for state_tensor in state_tensors:
+            # state_tensor shape: [num_layers, size+1, sliceable_dim, ...]
+            # The sliceable dimension is at index 2 (after num_layers and size)
+            sliceable_dim = state_tensor.shape[2]
+            # Repeat for each layer since we have per-layer data_ptrs
+            dim_per_tensor += [sliceable_dim] * self.num_mamba_layers
+        return dim_per_tensor
+
+
+class HybridReqToTokenPool(ReqToTokenPool):
+    """A memory pool that maps a request to its token locations."""
+
+    def __init__(
+        self,
+        *,
+        size: int,
+        mamba_size: int,
+        mamba_spec_state_size: int,
+        max_context_len: int,
+        device: str,
+        enable_memory_saver: bool,
+        cache_params: BaseLinearStateParams,
+        enable_mamba_extra_buffer: bool,
+        speculative_num_draft_tokens: int = None,
+        enable_overlap_schedule: bool = True,
+    ):
+        super().__init__(
+            size=size,
+            max_context_len=max_context_len,
+            device=device,
+            enable_memory_saver=enable_memory_saver,
+        )
+
+        self.mamba_ping_pong_track_buffer_size = 2 if enable_overlap_schedule else 1
+        self.enable_mamba_extra_buffer = enable_mamba_extra_buffer
+        self.enable_memory_saver = enable_memory_saver
+        self._init_mamba_pool(
+            size=mamba_size,
+            mamba_spec_state_size=mamba_spec_state_size,
+            cache_params=cache_params,
+            device=device,
+            enable_mamba_extra_buffer=enable_mamba_extra_buffer,
+            speculative_num_draft_tokens=speculative_num_draft_tokens,
+        )
+
+    def _init_mamba_pool(
+        self,
+        size: int,
+        mamba_spec_state_size: int,
+        cache_params: BaseLinearStateParams,
+        device: str,
+        enable_mamba_extra_buffer: bool,
+        speculative_num_draft_tokens: int = None,
+    ):
+        self.mamba_pool = MambaPool(
+            size=size,
+            spec_state_size=mamba_spec_state_size,
+            cache_params=cache_params,
+            device=device,
+            enable_memory_saver=self.enable_memory_saver,
+            speculative_num_draft_tokens=speculative_num_draft_tokens,
+        )
+        self.mamba_map = {layer_id: i for i, layer_id in enumerate(cache_params.layers)}
+
+        self.device = device
+        self.req_index_to_mamba_index_mapping: torch.Tensor = torch.zeros(
+            size, dtype=torch.int32, device=self.device
+        )
+        if enable_mamba_extra_buffer:
+            self.req_index_to_mamba_ping_pong_track_buffer_mapping: torch.Tensor = (
+                torch.zeros(
+                    (size, self.mamba_ping_pong_track_buffer_size),
+                    dtype=torch.int32,
+                    device=self.device,
+                )
+            )
+
+    # For chunk prefill req, we do not need to allocate mamba cache,
+    # We could use allocated mamba cache instead.
+    def alloc(self, reqs: List["Req"]) -> Optional[List[int]]:
+        select_index = super().alloc(reqs)
+        if select_index is None:
+            return None
+
+        mamba_indices: list[torch.Tensor] = []
+        mamba_ping_pong_track_buffers: list[torch.Tensor] = []
+        for req in reqs:
+            mid = None
+            if req.mamba_pool_idx is not None:  # for radix cache
+                mid = req.mamba_pool_idx
+            else:
+                mid = self.mamba_pool.alloc(1)
+                assert (
+                    mid is not None
+                ), f"Not enough space for mamba cache, try to increase --mamba-full-memory-ratio or --max-mamba-cache-size. {mid=}, {self.mamba_pool.size=}, {self.mamba_pool.available_size()=}, {len(reqs)=}"
+                mid = mid[0]
+                req.mamba_pool_idx = mid
+            mamba_indices.append(mid)
+            if self.enable_mamba_extra_buffer:
+                if req.mamba_ping_pong_track_buffer is None:
+                    req.mamba_ping_pong_track_buffer = self.mamba_pool.alloc(
+                        self.mamba_ping_pong_track_buffer_size
+                    )
+                    assert (
+                        req.mamba_ping_pong_track_buffer is not None
+                    ), "Not enough space for mamba ping pong idx, try to increase --mamba-full-memory-ratio."
+                    req.mamba_next_track_idx = 0
+                mamba_ping_pong_track_buffers.append(req.mamba_ping_pong_track_buffer)
+        assert len(select_index) == len(
+            mamba_indices
+        ), f"Not enough space for mamba cache, try to increase --mamba-full-memory-ratio or --max-mamba-cache-size."
+        if self.enable_mamba_extra_buffer:
+            assert len(select_index) == len(
+                mamba_ping_pong_track_buffers
+            ), f"Not enough space for mamba ping pong idx, try to increase --mamba-full-memory-ratio."
+        mamba_index_tensor = torch.stack(mamba_indices).to(dtype=torch.int32)
+        self.req_index_to_mamba_index_mapping[select_index] = mamba_index_tensor
+        if self.enable_mamba_extra_buffer:
+            ping_pong_tensor = torch.stack(mamba_ping_pong_track_buffers).to(
+                dtype=torch.int32
+            )
+            self.req_index_to_mamba_ping_pong_track_buffer_mapping[select_index] = (
+                ping_pong_tensor
+            )
+        return select_index
+
+    def get_mamba_indices(self, req_indices: torch.Tensor) -> torch.Tensor:
+        return self.req_index_to_mamba_index_mapping[req_indices]
+
+    def mamba2_layer_cache(self, layer_id: int):
+        assert layer_id in self.mamba_map
+        return self.mamba_pool.mamba2_layer_cache(self.mamba_map[layer_id])
+
+    def get_speculative_mamba2_params_all_layers(self) -> MambaPool.SpeculativeState:
+        return self.mamba_pool.get_speculative_mamba2_params_all_layers()
+
+    def get_mamba_ping_pong_other_idx(self, mamba_next_track_idx: int) -> int:
+        if self.mamba_ping_pong_track_buffer_size == 2:
+            return 1 - mamba_next_track_idx
+        else:
+            return mamba_next_track_idx
+
+    def free_mamba_cache(
+        self, req: "Req", mamba_ping_pong_track_buffer_to_keep: Optional[int] = None
+    ):
+        mamba_index = req.mamba_pool_idx
+        assert mamba_index is not None, "double free? mamba_index is None"
+        self.mamba_pool.free(mamba_index.unsqueeze(0))
+        req.mamba_pool_idx = None
+
+        if self.enable_mamba_extra_buffer:
+            mamba_ping_pong_track_buffer_to_free = (
+                self.req_index_to_mamba_ping_pong_track_buffer_mapping[req.req_pool_idx]
+            )
+            if mamba_ping_pong_track_buffer_to_keep is not None:
+                assert mamba_ping_pong_track_buffer_to_keep in [
+                    0,
+                    1,
+                ], f"mamba_ping_pong_track_buffer_to_keep must be 0 or 1, {mamba_ping_pong_track_buffer_to_keep=}"
+                # Avoid Python-list advanced indexing on a device tensor.
+                # The ping-pong buffer size is either 2 (normal) or 1 (spec decode).
+                if self.mamba_ping_pong_track_buffer_size == 2:
+                    idx_to_free = 1 - mamba_ping_pong_track_buffer_to_keep
+                    mamba_ping_pong_track_buffer_to_free = (
+                        mamba_ping_pong_track_buffer_to_free[
+                            idx_to_free : idx_to_free + 1
+                        ]
+                    )
+                else:
+                    assert self.mamba_ping_pong_track_buffer_size == 1, (
+                        f"Unexpected mamba_ping_pong_track_buffer_size="
+                        f"{self.mamba_ping_pong_track_buffer_size}"
+                    )
+                    assert mamba_ping_pong_track_buffer_to_keep == 0, (
+                        "mamba_ping_pong_track_buffer_to_keep must be 0 when "
+                        "mamba_ping_pong_track_buffer_size is 1"
+                    )
+                    # Keep the only slot, so free nothing.
+                    mamba_ping_pong_track_buffer_to_free = (
+                        mamba_ping_pong_track_buffer_to_free[0:0]
+                    )
+            self.mamba_pool.free(mamba_ping_pong_track_buffer_to_free)
+
+    def clear(self):
+        logger.info("Reset HybridReqToTokenPool")
+        super().clear()
+        self.mamba_pool.clear()
+        self.req_index_to_mamba_index_mapping.zero_()
+        if self.enable_mamba_extra_buffer:
+            self.req_index_to_mamba_ping_pong_track_buffer_mapping.zero_()
+
+
+class KVCache(abc.ABC):
+    @abc.abstractmethod
+    def __init__(
+        self,
+        size: int,
+        page_size: int,
+        dtype: torch.dtype,
+        layer_num: int,
+        device: str,
+        enable_memory_saver: bool,
+        start_layer: Optional[int] = None,
+        end_layer: Optional[int] = None,
+    ):
+        self.size = size
+        self.page_size = page_size
+        self.dtype = dtype
+        self.device = device
+        if dtype in (torch.float8_e5m2, torch.float8_e4m3fn):
+            # NOTE: Store as torch.uint8 because Tensor.index_put is not implemented for torch.float8_e5m2
+            self.store_dtype = torch.uint8
+        else:
+            self.store_dtype = dtype
+        self.layer_num = layer_num
+        self.start_layer = start_layer or 0
+        self.end_layer = end_layer or layer_num - 1
+        self.memory_saver_adapter = TorchMemorySaverAdapter.create(
+            enable=enable_memory_saver
+        )
+        self.mem_usage = 0
+
+        # used for chunked cpu-offloading
+        self.cpu_offloading_chunk_size = 8192
+
+        # default state for optional layer-wise transfer control
+        self.layer_transfer_counter = None
+
+        # for disagg with nvlink
+        self.enable_custom_mem_pool, self.custom_mem_pool, _ = (
+            maybe_init_custom_mem_pool(device=self.device)
+        )
+
+    def _finalize_allocation_log(self, num_tokens: int):
+        """Common logging and mem_usage computation for KV cache allocation.
+        Supports both tuple (K, V) size returns and single KV size returns.
+        """
+        kv_size_bytes = self.get_kv_size_bytes()
+        if isinstance(kv_size_bytes, tuple):
+            k_size, v_size = kv_size_bytes
+            k_size_GB = k_size / GB
+            v_size_GB = v_size / GB
+            logger.info(
+                f"KV Cache is allocated. #tokens: {num_tokens}, K size: {k_size_GB:.2f} GB, V size: {v_size_GB:.2f} GB"
+            )
+            self.mem_usage = k_size_GB + v_size_GB
+        else:
+            kv_size_GB = kv_size_bytes / GB
+            logger.info(
+                f"KV Cache is allocated. #tokens: {num_tokens}, KV size: {kv_size_GB:.2f} GB"
+            )
+            self.mem_usage = kv_size_GB
+
+    @abc.abstractmethod
+    def get_key_buffer(self, layer_id: int) -> torch.Tensor:
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def get_value_buffer(self, layer_id: int) -> torch.Tensor:
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def get_kv_buffer(self, layer_id: int) -> Tuple[torch.Tensor, torch.Tensor]:
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def set_kv_buffer(
+        self,
+        layer: RadixAttention,
+        loc: torch.Tensor,
+        cache_k: torch.Tensor,
+        cache_v: torch.Tensor,
+    ) -> None:
+        raise NotImplementedError()
+
+    def register_layer_transfer_counter(self, layer_transfer_counter: LayerDoneCounter):
+        self.layer_transfer_counter = layer_transfer_counter
+
+    def get_cpu_copy(self, indices):
+        raise NotImplementedError()
+
+    def load_cpu_copy(self, kv_cache_cpu, indices):
+        raise NotImplementedError()
+
+    def maybe_get_custom_mem_pool(self):
+        return self.custom_mem_pool
+
+
+class MHATokenToKVPool(KVCache):
+
+    def __init__(
+        self,
+        size: int,
+        page_size: int,
+        dtype: torch.dtype,
+        head_num: int,
+        head_dim: int,
+        layer_num: int,
+        device: str,
+        enable_memory_saver: bool,
+        v_head_dim: Optional[int] = None,
+        swa_head_num: Optional[int] = None,
+        swa_head_dim: Optional[int] = None,
+        swa_v_head_dim: Optional[int] = None,
+        start_layer: Optional[int] = None,
+        end_layer: Optional[int] = None,
+        enable_alt_stream: bool = True,
+        enable_kv_cache_copy: bool = False,
+    ):
+        super().__init__(
+            size,
+            page_size,
+            dtype,
+            layer_num,
+            device,
+            enable_memory_saver,
+            start_layer,
+            end_layer,
+        )
+        self.head_num = swa_head_num if swa_head_num is not None else head_num
+        self.head_dim = swa_head_dim if swa_head_dim is not None else head_dim
+        self.v_head_dim = (
+            swa_v_head_dim
+            if swa_v_head_dim is not None
+            else v_head_dim if v_head_dim is not None else head_dim
+        )
+
+        self._create_buffers()
+
+        self.device_module = torch.get_device_module(self.device)
+        self.alt_stream = (
+            self.device_module.Stream() if _is_cuda and enable_alt_stream else None
+        )
+
+        if enable_kv_cache_copy:
+            self._init_kv_copy_and_warmup()
+        else:
+            self._kv_copy_config = None
+
+        self._finalize_allocation_log(size)
+
+        # for store_cache JIT kernel
+        self.row_dim = self.head_num * self.head_dim
+        self.same_kv_dim = self.head_dim == self.v_head_dim
+
+    def _init_kv_copy_and_warmup(self):
+        # Heuristics for KV copy tiling
+        _KV_COPY_STRIDE_THRESHOLD_LARGE = 8192
+        _KV_COPY_STRIDE_THRESHOLD_MEDIUM = 4096
+        _KV_COPY_TILE_SIZE_LARGE = 512
+        _KV_COPY_TILE_SIZE_MEDIUM = 256
+        _KV_COPY_TILE_SIZE_SMALL = 128
+        _KV_COPY_NUM_WARPS_LARGE_TILE = 8
+        _KV_COPY_NUM_WARPS_SMALL_TILE = 4
+
+        stride_bytes = int(self.data_strides[0].item())
+        if stride_bytes >= _KV_COPY_STRIDE_THRESHOLD_LARGE:
+            bytes_per_tile = _KV_COPY_TILE_SIZE_LARGE
+        elif stride_bytes >= _KV_COPY_STRIDE_THRESHOLD_MEDIUM:
+            bytes_per_tile = _KV_COPY_TILE_SIZE_MEDIUM
+        else:
+            bytes_per_tile = _KV_COPY_TILE_SIZE_SMALL
+
+        # Calculate num_locs_upper to avoid large Triton specialization (e.g. 8192)
+        chunk_upper = 128 if bytes_per_tile >= _KV_COPY_TILE_SIZE_LARGE else 256
+
+        self._kv_copy_config = {
+            "bytes_per_tile": bytes_per_tile,
+            "byte_tiles": (stride_bytes + bytes_per_tile - 1) // bytes_per_tile,
+            "num_warps": (
+                _KV_COPY_NUM_WARPS_SMALL_TILE
+                if bytes_per_tile <= _KV_COPY_TILE_SIZE_MEDIUM
+                else _KV_COPY_NUM_WARPS_LARGE_TILE
+            ),
+            "num_locs_upper": chunk_upper,
+        }
+
+        dummy_loc = torch.zeros(chunk_upper, dtype=torch.int64, device=self.device)
+        grid = (self.data_ptrs.numel(), self._kv_copy_config["byte_tiles"])
+
+        copy_all_layer_kv_cache_tiled[grid](
+            self.data_ptrs,
+            self.data_strides,
+            dummy_loc,
+            dummy_loc,
+            1,
+            chunk_upper,
+            BYTES_PER_TILE=self._kv_copy_config["bytes_per_tile"],
+            num_warps=self._kv_copy_config["num_warps"],
+            num_stages=2,
+        )
+
+    def _create_buffers(self):
+        with self.memory_saver_adapter.region(GPU_MEMORY_TYPE_KV_CACHE):
+            with (
+                torch.cuda.use_mem_pool(self.custom_mem_pool)
+                if self.enable_custom_mem_pool
+                else nullcontext()
+            ):
+                # [size, head_num, head_dim] for each layer
+                # The padded slot 0 is used for writing dummy outputs from padded tokens.
+                self.k_buffer = [
+                    torch.zeros(
+                        (self.size + self.page_size, self.head_num, self.head_dim),
+                        dtype=self.store_dtype,
+                        device=self.device,
+                    )
+                    for _ in range(self.layer_num)
+                ]
+                self.v_buffer = [
+                    torch.zeros(
+                        (self.size + self.page_size, self.head_num, self.v_head_dim),
+                        dtype=self.store_dtype,
+                        device=self.device,
+                    )
+                    for _ in range(self.layer_num)
+                ]
+
+        self.k_data_ptrs = torch.tensor(
+            [x.data_ptr() for x in self.k_buffer],
+            dtype=torch.uint64,
+            device=self.device,
+        )
+        self.v_data_ptrs = torch.tensor(
+            [x.data_ptr() for x in self.v_buffer],
+            dtype=torch.uint64,
+            device=self.device,
+        )
+        self.data_ptrs = torch.cat([self.k_data_ptrs, self.v_data_ptrs], dim=0)
+        self.data_strides = torch.tensor(
+            [
+                np.prod(x.shape[1:]) * x.dtype.itemsize
+                for x in self.k_buffer + self.v_buffer
+            ],
+            device=self.device,
+        )
+
+    def _clear_buffers(self):
+        del self.k_buffer
+        del self.v_buffer
+
+    def get_kv_size_bytes(self):
+        assert hasattr(self, "k_buffer")
+        assert hasattr(self, "v_buffer")
+        k_size_bytes = 0
+        for k_cache in self.k_buffer:
+            k_size_bytes += get_tensor_size_bytes(k_cache)
+        v_size_bytes = 0
+        for v_cache in self.v_buffer:
+            v_size_bytes += get_tensor_size_bytes(v_cache)
+        return k_size_bytes, v_size_bytes
+
+    # for disagg
+    def get_contiguous_buf_infos(self):
+        # layer_num x [seq_len, head_num, head_dim]
+        # layer_num x [page_num, page_size, head_num, head_dim]
+        kv_data_ptrs = [
+            self._get_key_buffer(i).data_ptr()
+            for i in range(self.start_layer, self.start_layer + self.layer_num)
+        ] + [
+            self._get_value_buffer(i).data_ptr()
+            for i in range(self.start_layer, self.start_layer + self.layer_num)
+        ]
+        kv_data_lens = [
+            self._get_key_buffer(i).nbytes
+            for i in range(self.start_layer, self.start_layer + self.layer_num)
+        ] + [
+            self._get_value_buffer(i).nbytes
+            for i in range(self.start_layer, self.start_layer + self.layer_num)
+        ]
+        kv_item_lens = [
+            self._get_key_buffer(i)[0].nbytes * self.page_size
+            for i in range(self.start_layer, self.start_layer + self.layer_num)
+        ] + [
+            self._get_value_buffer(i)[0].nbytes * self.page_size
+            for i in range(self.start_layer, self.start_layer + self.layer_num)
+        ]
+        return kv_data_ptrs, kv_data_lens, kv_item_lens
+
+    def get_cpu_copy(self, indices):
+        torch.cuda.synchronize()
+        kv_cache_cpu = []
+        chunk_size = self.cpu_offloading_chunk_size
+        for layer_id in range(self.layer_num):
+            kv_cache_cpu.append([])
+            for i in range(0, len(indices), chunk_size):
+                chunk_indices = indices[i : i + chunk_size]
+                k_cpu = self.k_buffer[layer_id][chunk_indices].to(
+                    "cpu", non_blocking=True
+                )
+                v_cpu = self.v_buffer[layer_id][chunk_indices].to(
+                    "cpu", non_blocking=True
+                )
+                kv_cache_cpu[-1].append([k_cpu, v_cpu])
+        torch.cuda.synchronize()
+        return kv_cache_cpu
+
+    def load_cpu_copy(self, kv_cache_cpu, indices):
+        torch.cuda.synchronize()
+        chunk_size = self.cpu_offloading_chunk_size
+        for layer_id in range(self.layer_num):
+            for i in range(0, len(indices), chunk_size):
+                chunk_indices = indices[i : i + chunk_size]
+                k_cpu, v_cpu = (
+                    kv_cache_cpu[layer_id][i // chunk_size][0],
+                    kv_cache_cpu[layer_id][i // chunk_size][1],
+                )
+                assert k_cpu.shape[0] == v_cpu.shape[0] == len(chunk_indices)
+                k_chunk = k_cpu.to(self.k_buffer[0].device, non_blocking=True)
+                v_chunk = v_cpu.to(self.v_buffer[0].device, non_blocking=True)
+                self.k_buffer[layer_id][chunk_indices] = k_chunk
+                self.v_buffer[layer_id][chunk_indices] = v_chunk
+        torch.cuda.synchronize()
+
+    def _get_key_buffer(self, layer_id: int):
+        # for internal use of referencing
+        if self.store_dtype != self.dtype:
+            return self.k_buffer[layer_id - self.start_layer].view(self.dtype)
+        return self.k_buffer[layer_id - self.start_layer]
+
+    def get_key_buffer(self, layer_id: int):
+        # note: get_key_buffer is hooked with synchronization for layer-wise KV cache loading
+        # it is supposed to be used only by attention backend not for information purpose
+        # same applies to get_value_buffer and get_kv_buffer
+        if self.layer_transfer_counter is not None:
+            self.layer_transfer_counter.wait_until(layer_id - self.start_layer)
+        return self._get_key_buffer(layer_id)
+
+    def _get_value_buffer(self, layer_id: int):
+        # for internal use of referencing
+        if self.store_dtype != self.dtype:
+            return self.v_buffer[layer_id - self.start_layer].view(self.dtype)
+        return self.v_buffer[layer_id - self.start_layer]
+
+    def get_value_buffer(self, layer_id: int):
+        if self.layer_transfer_counter is not None:
+            self.layer_transfer_counter.wait_until(layer_id - self.start_layer)
+        return self._get_value_buffer(layer_id)
+
+    def get_kv_buffer(self, layer_id: int):
+        return self.get_key_buffer(layer_id), self.get_value_buffer(layer_id)
+
+    def set_kv_buffer(
+        self,
+        layer: RadixAttention,
+        loc: torch.Tensor,
+        cache_k: torch.Tensor,
+        cache_v: torch.Tensor,
+        k_scale: Optional[float] = None,
+        v_scale: Optional[float] = None,
+        layer_id_override: Optional[int] = None,
+    ):
+        if layer_id_override is not None:
+            layer_id = layer_id_override
+        else:
+            layer_id = layer.layer_id
+        if cache_k.dtype != self.dtype:
+            if k_scale is not None:
+                cache_k.div_(k_scale)
+            if v_scale is not None:
+                cache_v.div_(v_scale)
+            cache_k = cache_k.to(self.dtype)
+            cache_v = cache_v.to(self.dtype)
+
+        if self.store_dtype != self.dtype:
+            cache_k = cache_k.view(self.store_dtype)
+            cache_v = cache_v.view(self.store_dtype)
+
+        _set_kv_buffer_impl(
+            cache_k,
+            cache_v,
+            self.k_buffer[layer_id - self.start_layer],
+            self.v_buffer[layer_id - self.start_layer],
+            loc,
+            row_dim=self.row_dim,
+            store_dtype=self.store_dtype,
+            device_module=self.device_module,
+            alt_stream=self.alt_stream,
+            same_kv_dim=self.same_kv_dim,
+        )
+
+    def move_kv_cache(self, tgt_loc: torch.Tensor, src_loc: torch.Tensor):
+        if envs.SGLANG_NATIVE_MOVE_KV_CACHE.get():
+            move_kv_cache_native(self.k_buffer, self.v_buffer, tgt_loc, src_loc)
+            return
+
+        N = tgt_loc.numel()
+        if N == 0:
+            return
+
+        assert (
+            self._kv_copy_config is not None
+        ), "KV copy not initialized. Set enable_kv_cache_copy=True in __init__"
+
+        cfg = self._kv_copy_config
+        cap = int(cfg.get("num_locs_upper", 256))
+        grid = (self.data_ptrs.numel(), cfg["byte_tiles"])
+
+        if N <= cap:
+            upper = next_power_of_2(N)
+            copy_all_layer_kv_cache_tiled[grid](
+                self.data_ptrs,
+                self.data_strides,
+                tgt_loc,
+                src_loc,
+                N,
+                upper,
+                BYTES_PER_TILE=cfg["bytes_per_tile"],
+                num_warps=cfg["num_warps"],
+                num_stages=2,
+            )
+            return
+
+        # Huge N: chunk, but each chunk's upper is still pow2(<= cap)
+        for start in range(0, N, cap):
+            end = min(start + cap, N)
+            chunk_len = end - start
+            upper = next_power_of_2(chunk_len)
+            copy_all_layer_kv_cache_tiled[grid](
+                self.data_ptrs,
+                self.data_strides,
+                tgt_loc[start:end],
+                src_loc[start:end],
+                chunk_len,
+                upper,
+                BYTES_PER_TILE=cfg["bytes_per_tile"],
+                num_warps=cfg["num_warps"],
+                num_stages=2,
+            )
+
+
+class MHATokenToKVPoolFP4(MHATokenToKVPool):
+
+    def _create_buffers(self):
+        with self.memory_saver_adapter.region(GPU_MEMORY_TYPE_KV_CACHE):
+            with (
+                torch.cuda.use_mem_pool(self.custom_mem_pool)
+                if self.enable_custom_mem_pool
+                else nullcontext()
+            ):
+                # [size, head_num, head_dim] for each layer
+                # The padded slot 0 is used for writing dummy outputs from padded tokens.
+                m = self.size + self.page_size
+                n = self.head_num
+                k = self.head_dim
+
+                scale_block_size = 16
+                self.store_dtype = torch.uint8
+                self.k_buffer = [
+                    torch.zeros(
+                        (m, n, k // 2),
+                        dtype=self.store_dtype,
+                        device=self.device,
+                    )
+                    for _ in range(self.layer_num)
+                ]
+                self.v_buffer = [
+                    torch.zeros(
+                        (m, n, k // 2),
+                        dtype=self.store_dtype,
+                        device=self.device,
+                    )
+                    for _ in range(self.layer_num)
+                ]
+
+                self.k_scale_buffer = [
+                    torch.zeros(
+                        (m, (n * k) // scale_block_size),
+                        dtype=self.store_dtype,
+                        device=self.device,
+                    )
+                    for _ in range(self.layer_num)
+                ]
+                self.v_scale_buffer = [
+                    torch.zeros(
+                        (m, (n * k) // scale_block_size),
+                        dtype=self.store_dtype,
+                        device=self.device,
+                    )
+                    for _ in range(self.layer_num)
+                ]
+
+    def _clear_buffers(self):
+        del self.k_buffer
+        del self.v_buffer
+        del self.k_scale_buffer
+        del self.v_scale_buffer
+
+    def _get_key_buffer(self, layer_id: int):
+        # for internal use of referencing
+        if self.store_dtype != self.dtype:
+            cache_k_nope_fp4 = self.k_buffer[layer_id - self.start_layer].view(
+                torch.uint8
+            )
+            cache_k_nope_fp4_sf = self.k_scale_buffer[layer_id - self.start_layer]
+
+            from sglang.srt.layers.quantization.kvfp4_tensor import KVFP4QuantizeUtil
+
+            cache_k_nope_fp4_dequant = KVFP4QuantizeUtil.batched_dequantize(
+                cache_k_nope_fp4, cache_k_nope_fp4_sf
+            )
+            return cache_k_nope_fp4_dequant
+        return self.k_buffer[layer_id - self.start_layer]
+
+    def _get_value_buffer(self, layer_id: int):
+        # for internal use of referencing
+        if self.store_dtype != self.dtype:
+            cache_v_nope_fp4 = self.v_buffer[layer_id - self.start_layer].view(
+                torch.uint8
+            )
+            cache_v_nope_fp4_sf = self.v_scale_buffer[layer_id - self.start_layer]
+
+            from sglang.srt.layers.quantization.kvfp4_tensor import KVFP4QuantizeUtil
+
+            cache_v_nope_fp4_dequant = KVFP4QuantizeUtil.batched_dequantize(
+                cache_v_nope_fp4, cache_v_nope_fp4_sf
+            )
+            return cache_v_nope_fp4_dequant
+        return self.v_buffer[layer_id - self.start_layer]
+
+    def set_kv_buffer(
+        self,
+        layer: RadixAttention,
+        loc: torch.Tensor,
+        cache_k: torch.Tensor,
+        cache_v: torch.Tensor,
+        k_scale: Optional[float] = None,
+        v_scale: Optional[float] = None,
+        layer_id_override: Optional[int] = None,
+    ):
+        from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
+
+        if layer_id_override is not None:
+            layer_id = layer_id_override
+        else:
+            layer_id = layer.layer_id
+        if cache_k.dtype != self.dtype:
+            if k_scale is not None:
+                cache_k.div_(k_scale)
+            if v_scale is not None:
+                cache_v.div_(v_scale)
+
+            from sglang.srt.layers.quantization.kvfp4_tensor import KVFP4QuantizeUtil
+
+            cache_k, cache_k_fp4_sf = KVFP4QuantizeUtil.batched_quantize(cache_k)
+            cache_v, cache_v_fp4_sf = KVFP4QuantizeUtil.batched_quantize(cache_v)
+
+        if self.store_dtype != self.dtype:
+            cache_k = cache_k.view(self.store_dtype)
+            cache_v = cache_v.view(self.store_dtype)
+
+            cache_k_fp4_sf = cache_k_fp4_sf.view(self.store_dtype)
+            cache_v_fp4_sf = cache_v_fp4_sf.view(self.store_dtype)
+
+        if get_is_capture_mode() and self.alt_stream is not None:
+            # Overlap the copy of K and V cache for small batch size
+            current_stream = self.device_module.current_stream()
+            self.alt_stream.wait_stream(current_stream)
+            self.k_buffer[layer_id - self.start_layer][loc] = cache_k
+
+            self.k_scale_buffer[layer_id - self.start_layer][loc] = cache_k_fp4_sf
+            with self.device_module.stream(self.alt_stream):
+                self.v_buffer[layer_id - self.start_layer][loc] = cache_v
+
+                self.v_scale_buffer[layer_id - self.start_layer][loc] = cache_v_fp4_sf
+            current_stream.wait_stream(self.alt_stream)
+        else:
+            self.k_buffer[layer_id - self.start_layer][loc] = cache_k
+            self.v_buffer[layer_id - self.start_layer][loc] = cache_v
+
+            self.k_scale_buffer[layer_id - self.start_layer][loc] = cache_k_fp4_sf
+            self.v_scale_buffer[layer_id - self.start_layer][loc] = cache_v_fp4_sf
+
+
+class HybridLinearKVPool(KVCache):
+    """KV cache with separate pools for full and linear attention layers."""
+
+    def __init__(
+        self,
+        size: int,
+        dtype: torch.dtype,
+        page_size: int,
+        head_num: int,
+        head_dim: int,
+        full_attention_layer_ids: List[int],
+        enable_kvcache_transpose: bool,
+        device: str,
+        mamba_pool: MambaPool,
+        enable_memory_saver: bool = False,
+        # TODO: refactor mla related args
+        use_mla: bool = False,
+        kv_lora_rank: int = None,
+        qk_rope_head_dim: int = None,
+    ):
+        self.size = size
+        self.dtype = dtype
+        self.device = device
+        self.full_layer_nums = len(full_attention_layer_ids)
+        self.page_size = page_size
+        # TODO support pp?
+        self.start_layer = 0
+        self.head_num = head_num
+        self.head_dim = head_dim
+        self.mamba_pool = mamba_pool
+        # TODO MHATransposedTokenToKVPool if enable_kvcache_transpose is True
+        assert not enable_kvcache_transpose
+        self.use_mla = use_mla
+        if not use_mla:
+
+            TokenToKVPoolClass = MHATokenToKVPool
+
+            if _is_npu:
+                from sglang.srt.hardware_backend.npu.memory_pool_npu import (
+                    NPUMHATokenToKVPool,
+                )
+
+                TokenToKVPoolClass = NPUMHATokenToKVPool
+
+            self.full_kv_pool = TokenToKVPoolClass(
+                size=size,
+                page_size=self.page_size,
+                dtype=dtype,
+                head_num=head_num,
+                head_dim=head_dim,
+                layer_num=self.full_layer_nums,
+                device=device,
+                enable_memory_saver=enable_memory_saver,
+            )
+        else:
+
+            TokenToKVPoolClass = MLATokenToKVPool
+
+            if _is_npu:
+                from sglang.srt.hardware_backend.npu.memory_pool_npu import (
+                    NPUMLATokenToKVPool,
+                )
+
+                TokenToKVPoolClass = NPUMLATokenToKVPool
+
+            self.full_kv_pool = TokenToKVPoolClass(
+                size=size,
+                page_size=self.page_size,
+                dtype=dtype,
+                layer_num=self.full_layer_nums,
+                device=device,
+                kv_lora_rank=kv_lora_rank,
+                qk_rope_head_dim=qk_rope_head_dim,
+                enable_memory_saver=enable_memory_saver,
+            )
+        self.full_attention_layer_id_mapping = {
+            id: i for i, id in enumerate(full_attention_layer_ids)
+        }
+        if use_mla:
+            self.mem_usage = self.get_kv_size_bytes() / GB
+        else:
+            k_size, v_size = self.get_kv_size_bytes()
+            self.mem_usage = (k_size + v_size) / GB
+
+    def get_kv_size_bytes(self):
+        return self.full_kv_pool.get_kv_size_bytes()
+
+    def get_contiguous_buf_infos(self):
+        return self.full_kv_pool.get_contiguous_buf_infos()
+
+    def get_state_buf_infos(self):
+        mamba_data_ptrs, mamba_data_lens, mamba_item_lens = (
+            self.mamba_pool.get_contiguous_buf_infos()
+        )
+        return mamba_data_ptrs, mamba_data_lens, mamba_item_lens
+
+    def get_state_dim_per_tensor(self):
+        """Get the sliceable dimension size for each mamba state tensor."""
+        return self.mamba_pool.get_state_dim_per_tensor()
+
+    def maybe_get_custom_mem_pool(self):
+        return self.full_kv_pool.maybe_get_custom_mem_pool()
+
+    def _transfer_full_attention_id(self, layer_id: int):
+        if layer_id not in self.full_attention_layer_id_mapping:
+            raise ValueError(
+                f"{layer_id=} not in full attention layers: {self.full_attention_layer_id_mapping.keys()}"
+            )
+        return self.full_attention_layer_id_mapping[layer_id]
+
+    def get_key_buffer(self, layer_id: int):
+        layer_id = self._transfer_full_attention_id(layer_id)
+        return self.full_kv_pool.get_key_buffer(layer_id)
+
+    def get_value_buffer(self, layer_id: int):
+        layer_id = self._transfer_full_attention_id(layer_id)
+        return self.full_kv_pool.get_value_buffer(layer_id)
+
+    def get_kv_buffer(self, layer_id: int):
+        layer_id = self._transfer_full_attention_id(layer_id)
+        return self.full_kv_pool.get_kv_buffer(layer_id)
+
+    @contextmanager
+    def _transfer_id_context(self, layer: RadixAttention):
+
+        @contextmanager
+        def _patch_layer_id(layer):
+            original_layer_id = layer.layer_id
+            layer.layer_id = self._transfer_full_attention_id(layer.layer_id)
+            try:
+                yield
+            finally:
+                layer.layer_id = original_layer_id
+
+        with _patch_layer_id(layer):
+            yield
+
+    def set_kv_buffer(
+        self,
+        layer: RadixAttention,
+        loc: torch.Tensor,
+        cache_k: torch.Tensor,
+        cache_v: torch.Tensor,
+        k_scale: float = 1.0,
+        v_scale: float = 1.0,
+    ):
+        layer_id = self._transfer_full_attention_id(layer.layer_id)
+        if not self.use_mla:
+            self.full_kv_pool.set_kv_buffer(
+                None,
+                loc,
+                cache_k,
+                cache_v,
+                k_scale,
+                v_scale,
+                layer_id_override=layer_id,
+            )
+        else:
+            with self._transfer_id_context(layer):
+                self.full_kv_pool.set_kv_buffer(
+                    layer,
+                    loc,
+                    cache_k,
+                    cache_v,
+                )
+
+    def move_kv_cache(self, tgt_loc: torch.Tensor, src_loc: torch.Tensor):
+        self.full_kv_pool.move_kv_cache(tgt_loc, src_loc)
+
+    def get_v_head_dim(self):
+        return self.full_kv_pool.get_value_buffer(0).shape[-1]
+
+    def set_mla_kv_buffer(
+        self,
+        layer: RadixAttention,
+        loc: torch.Tensor,
+        cache_k_nope: torch.Tensor,
+        cache_k_rope: torch.Tensor,
+    ):
+        assert self.use_mla, "set_mla_kv_buffer called when use_mla is False"
+        with self._transfer_id_context(layer):
+            self.full_kv_pool.set_mla_kv_buffer(layer, loc, cache_k_nope, cache_k_rope)
+
+    def get_mla_kv_buffer(
+        self,
+        layer: RadixAttention,
+        loc: torch.Tensor,
+        dst_dtype: Optional[torch.dtype] = None,
+    ):
+        assert self.use_mla, "get_mla_kv_buffer called when use_mla is False"
+        with self._transfer_id_context(layer):
+            return self.full_kv_pool.get_mla_kv_buffer(layer, loc, dst_dtype)
+
+
+class MLATokenToKVPool(KVCache):
+    def __init__(
+        self,
+        size: int,
+        page_size: int,
+        dtype: torch.dtype,
+        kv_lora_rank: int,
+        qk_rope_head_dim: int,
+        layer_num: int,
+        device: str,
+        enable_memory_saver: bool,
+        start_layer: Optional[int] = None,
+        end_layer: Optional[int] = None,
+        use_nsa: bool = False,
+        override_kv_cache_dim: Optional[int] = None,
+    ):
+        super().__init__(
+            size,
+            page_size,
+            dtype,
+            layer_num,
+            device,
+            enable_memory_saver,
+            start_layer,
+            end_layer,
+        )
+
+        self.kv_lora_rank = kv_lora_rank
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.use_nsa = use_nsa
+        self.nsa_kv_cache_store_fp8 = (
+            use_nsa
+            and dtype == torch.float8_e4m3fn
+            and override_kv_cache_dim is not None
+        )
+        # When override_kv_cache_dim is provided with nsa model, we assume the
+        # override kv cache dim is correct and use it directly.
+        self.kv_cache_dim = (
+            override_kv_cache_dim
+            if self.nsa_kv_cache_store_fp8
+            else (kv_lora_rank + qk_rope_head_dim)
+        )
+
+        self._create_buffers()
+
+        self.data_ptrs = torch.tensor(
+            [x.data_ptr() for x in self.kv_buffer],
+            dtype=torch.uint64,
+            device=self.device,
+        )
+        if not use_nsa:
+            # NSA will allocate indexer KV cache later and then log the total size
+            self._finalize_allocation_log(size)
+
+    def _create_buffers(self):
+        with self.memory_saver_adapter.region(GPU_MEMORY_TYPE_KV_CACHE):
+            with (
+                torch.cuda.use_mem_pool(self.custom_mem_pool)
+                if self.custom_mem_pool
+                else nullcontext()
+            ):
+                # The padded slot 0 is used for writing dummy outputs from padded tokens.
+                self.kv_buffer = [
+                    torch.zeros(
+                        (self.size + self.page_size, 1, self.kv_cache_dim),
+                        dtype=self.store_dtype,
+                        device=self.device,
+                    )
+                    for _ in range(self.layer_num)
+                ]
+
+    def _clear_buffers(self):
+        del self.kv_buffer
+
+    def get_kv_size_bytes(self):
+        assert hasattr(self, "kv_buffer")
+        kv_size_bytes = 0
+        for kv_cache in self.kv_buffer:
+            kv_size_bytes += get_tensor_size_bytes(kv_cache)
+        return kv_size_bytes
+
+    # for disagg
+    def get_contiguous_buf_infos(self):
+        # MLA has only one kv_buffer, so only the information of this buffer needs to be returned.
+        kv_data_ptrs = [self.kv_buffer[i].data_ptr() for i in range(self.layer_num)]
+        kv_data_lens = [self.kv_buffer[i].nbytes for i in range(self.layer_num)]
+        kv_item_lens = [
+            self.kv_buffer[i][0].nbytes * self.page_size for i in range(self.layer_num)
+        ]
+        return kv_data_ptrs, kv_data_lens, kv_item_lens
+
+    def get_key_buffer(self, layer_id: int):
+        if self.layer_transfer_counter is not None:
+            self.layer_transfer_counter.wait_until(layer_id - self.start_layer)
+
+        if self.store_dtype != self.dtype:
+            return self.kv_buffer[layer_id - self.start_layer].view(self.dtype)
+
+        return self.kv_buffer[layer_id - self.start_layer]
+
+    def get_value_buffer(self, layer_id: int):
+        if self.layer_transfer_counter is not None:
+            self.layer_transfer_counter.wait_until(layer_id - self.start_layer)
+
+        if self.store_dtype != self.dtype:
+            return self.kv_buffer[layer_id - self.start_layer][
+                ..., : self.kv_lora_rank
+            ].view(self.dtype)
+        return self.kv_buffer[layer_id - self.start_layer][..., : self.kv_lora_rank]
+
+    def get_kv_buffer(self, layer_id: int):
+        return self.get_key_buffer(layer_id), self.get_value_buffer(layer_id)
+
+    def set_kv_buffer(
+        self,
+        layer: RadixAttention,
+        loc: torch.Tensor,
+        cache_k: torch.Tensor,
+        cache_v: torch.Tensor,
+    ):
+        layer_id = layer.layer_id
+        assert not self.nsa_kv_cache_store_fp8
+        if cache_k.dtype != self.dtype:
+            cache_k = cache_k.to(self.dtype)
+
+        if self.store_dtype != self.dtype:
+            self.kv_buffer[layer_id - self.start_layer][loc] = cache_k.view(
+                self.store_dtype
+            )
+        else:
+            self.kv_buffer[layer_id - self.start_layer][loc] = cache_k
+
+    def set_mla_kv_buffer(
+        self,
+        layer: RadixAttention,
+        loc: torch.Tensor,
+        cache_k_nope: torch.Tensor,
+        cache_k_rope: torch.Tensor,
+    ):
+        layer_id = layer.layer_id
+
+        if self.nsa_kv_cache_store_fp8:
+            # OPTIMIZATION: Quantize k_nope and k_rope separately to avoid concat overhead
+            # This also enables reuse of set_mla_kv_buffer_triton two-tensor write path
+            # quantize_k_cache_separate returns (nope_part, rope_part) as uint8 bytes
+            cache_k_nope_fp8, cache_k_rope_fp8 = quantize_k_cache_separate(
+                cache_k_nope, cache_k_rope
+            )
+
+            # Reuse existing two-tensor write kernel (works with FP8 byte layout)
+            # cache_k_nope_fp8: (num_tokens, 1, 528) uint8 [nope_fp8(512) | scales(16)]
+            # cache_k_rope_fp8: (num_tokens, 1, 128) uint8 [rope_bf16_bytes(128)]
+            set_mla_kv_buffer_triton(
+                self.kv_buffer[layer_id - self.start_layer],
+                loc,
+                cache_k_nope_fp8,
+                cache_k_rope_fp8,
+            )
+        else:
+            if cache_k_nope.dtype != self.dtype:
+                cache_k_nope = cache_k_nope.to(self.dtype)
+                cache_k_rope = cache_k_rope.to(self.dtype)
+            if self.store_dtype != self.dtype:
+                cache_k_nope = cache_k_nope.view(self.store_dtype)
+                cache_k_rope = cache_k_rope.view(self.store_dtype)
+
+            set_mla_kv_buffer_triton(
+                self.kv_buffer[layer_id - self.start_layer],
+                loc,
+                cache_k_nope,
+                cache_k_rope,
+            )
+
+    def get_mla_kv_buffer(
+        self,
+        layer: RadixAttention,
+        loc: torch.Tensor,
+        dst_dtype: Optional[torch.dtype] = None,
+    ):
+        # get k nope and k rope from the kv buffer, and optionally cast them to dst_dtype.
+        layer_id = layer.layer_id
+        kv_buffer = self.get_key_buffer(layer_id)
+        dst_dtype = dst_dtype or self.dtype
+        cache_k_nope = torch.empty(
+            (loc.shape[0], 1, self.kv_lora_rank),
+            dtype=dst_dtype,
+            device=kv_buffer.device,
+        )
+        cache_k_rope = torch.empty(
+            (loc.shape[0], 1, self.qk_rope_head_dim),
+            dtype=dst_dtype,
+            device=kv_buffer.device,
+        )
+        get_mla_kv_buffer_triton(kv_buffer, loc, cache_k_nope, cache_k_rope)
+        return cache_k_nope, cache_k_rope
+
+    def get_cpu_copy(self, indices):
+        torch.cuda.synchronize()
+        kv_cache_cpu = []
+        chunk_size = self.cpu_offloading_chunk_size
+        for layer_id in range(self.layer_num):
+            kv_cache_cpu.append([])
+            for i in range(0, len(indices), chunk_size):
+                chunk_indices = indices[i : i + chunk_size]
+                kv_cpu = self.kv_buffer[layer_id][chunk_indices].to(
+                    "cpu", non_blocking=True
+                )
+                kv_cache_cpu[-1].append(kv_cpu)
+        torch.cuda.synchronize()
+        return kv_cache_cpu
+
+    def load_cpu_copy(self, kv_cache_cpu, indices):
+        torch.cuda.synchronize()
+        chunk_size = self.cpu_offloading_chunk_size
+        for layer_id in range(self.layer_num):
+            for i in range(0, len(indices), chunk_size):
+                chunk_indices = indices[i : i + chunk_size]
+                kv_cpu = kv_cache_cpu[layer_id][i // chunk_size]
+                assert kv_cpu.shape[0] == len(chunk_indices)
+                kv_chunk = kv_cpu.to(self.kv_buffer[0].device, non_blocking=True)
+                self.kv_buffer[layer_id][chunk_indices] = kv_chunk
+        torch.cuda.synchronize()
+
+
+class MLATokenToKVPoolFP4(MLATokenToKVPool):
+
+    def _create_buffers(self):
+        with self.memory_saver_adapter.region(GPU_MEMORY_TYPE_KV_CACHE):
+            with (
+                torch.cuda.use_mem_pool(self.custom_mem_pool)
+                if self.custom_mem_pool
+                else nullcontext()
+            ):
+                # The padded slot 0 is used for writing dummy outputs from padded tokens.
+                m = self.size + self.page_size
+                n = 1  # head_num
+                k = self.kv_cache_dim  # head_dim
+
+                scale_block_size = 16
+                self.store_dtype = torch.uint8
+
+                self.kv_buffer = [
+                    torch.zeros(
+                        (m, n, k // 2),
+                        dtype=self.store_dtype,
+                        device=self.device,
+                    )
+                    for _ in range(self.layer_num)
+                ]
+
+                self.kv_scale_buffer = [
+                    torch.zeros(
+                        (m, k // scale_block_size),
+                        dtype=self.store_dtype,
+                        device=self.device,
+                    )
+                    for _ in range(self.layer_num)
+                ]
+
+    def _clear_buffers(self):
+        del self.kv_buffer
+        del self.kv_scale_buffer
+
+    def get_key_buffer(self, layer_id: int):
+        if self.layer_transfer_counter is not None:
+            self.layer_transfer_counter.wait_until(layer_id - self.start_layer)
+
+        if self.store_dtype != self.dtype:
+            cache_k_nope_fp4 = self.kv_buffer[layer_id - self.start_layer].view(
+                torch.uint8
+            )
+            cache_k_nope_fp4_sf = self.kv_scale_buffer[layer_id - self.start_layer]
+
+            from sglang.srt.layers.quantization.kvfp4_tensor import KVFP4QuantizeUtil
+
+            cache_k_nope_fp4_dequant = KVFP4QuantizeUtil.batched_dequantize(
+                cache_k_nope_fp4, cache_k_nope_fp4_sf
+            )
+            return cache_k_nope_fp4_dequant
+
+        return self.kv_buffer[layer_id - self.start_layer]
+
+    def set_kv_buffer(
+        self,
+        layer: RadixAttention,
+        loc: torch.Tensor,
+        cache_k: torch.Tensor,
+        cache_v: torch.Tensor,
+    ):
+        layer_id = layer.layer_id
+        assert not self.nsa_kv_cache_store_fp8
+        if cache_k.dtype != self.dtype:
+            from sglang.srt.layers.quantization.kvfp4_tensor import KVFP4QuantizeUtil
+
+            cache_k_fp4, cache_k_fp4_sf = KVFP4QuantizeUtil.batched_quantize(cache_k)
+
+        if self.store_dtype != self.dtype:
+            self.kv_buffer[layer_id - self.start_layer][loc] = cache_k_fp4.view(
+                self.store_dtype
+            )
+            self.kv_scale_buffer[layer_id - self.start_layer][loc] = (
+                cache_k_fp4_sf.view(self.store_dtype)
+            )
+        else:
+            self.kv_buffer[layer_id - self.start_layer][loc] = cache_k
+
+    def set_mla_kv_buffer(
+        self,
+        layer: RadixAttention,
+        loc: torch.Tensor,
+        cache_k_nope: torch.Tensor,
+        cache_k_rope: torch.Tensor,
+    ):
+        layer_id = layer.layer_id
+
+        if self.nsa_kv_cache_store_fp8:
+            # original cache_k: (num_tokens, num_heads 1, hidden 576); we unsqueeze the page_size=1 dim here
+            # TODO no need to cat
+            cache_k = torch.cat([cache_k_nope, cache_k_rope], dim=-1)
+            cache_k = quantize_k_cache(cache_k.unsqueeze(1)).squeeze(1)
+            cache_k = cache_k.view(self.store_dtype)
+            self.kv_buffer[layer_id - self.start_layer][loc] = cache_k
+        else:
+            if cache_k_nope.dtype != self.dtype:
+                from sglang.srt.layers.quantization.kvfp4_tensor import (
+                    KVFP4QuantizeUtil,
+                )
+
+                cache_k_nope_fp4, cache_k_nope_fp4_sf = (
+                    KVFP4QuantizeUtil.batched_quantize(cache_k_nope)
+                )
+                cache_k_rope_fp4, cache_k_rope_fp4_sf = (
+                    KVFP4QuantizeUtil.batched_quantize(cache_k_rope)
+                )
+
+            if self.store_dtype != self.dtype:
+                cache_k_nope = cache_k_nope.view(self.store_dtype)
+                cache_k_rope = cache_k_rope.view(self.store_dtype)
+
+            set_mla_kv_buffer_triton(
+                self.kv_buffer[layer_id - self.start_layer],
+                loc,
+                cache_k_nope_fp4,
+                cache_k_rope_fp4,
+            )
+            set_mla_kv_scale_buffer_triton(
+                self.kv_scale_buffer[layer_id - self.start_layer],
+                loc,
+                cache_k_nope_fp4_sf,
+                cache_k_rope_fp4_sf,
+            )
+
+
+class NSATokenToKVPool(MLATokenToKVPool):
+    quant_block_size = 128
+    index_k_with_scale_buffer_dtype = torch.uint8
+    rope_storage_dtype = torch.bfloat16  # rope is always stored in bf16
+
+    def __init__(
+        self,
+        size: int,
+        page_size: int,
+        kv_lora_rank: int,
+        dtype: torch.dtype,
+        qk_rope_head_dim: int,
+        layer_num: int,
+        device: str,
+        index_head_dim: int,
+        enable_memory_saver: bool,
+        kv_cache_dim: int,
+        start_layer: Optional[int] = None,
+        end_layer: Optional[int] = None,
+    ):
+
+        override_dim = (
+            kv_cache_dim if kv_cache_dim != kv_lora_rank + qk_rope_head_dim else None
+        )
+
+        super().__init__(
+            size,
+            page_size,
+            dtype,
+            kv_lora_rank,
+            qk_rope_head_dim,
+            layer_num,
+            device,
+            enable_memory_saver,
+            start_layer,
+            end_layer,
+            use_nsa=True,
+            override_kv_cache_dim=override_dim,
+        )
+        # self.index_k_dtype = torch.float8_e4m3fn
+        # self.index_k_scale_dtype = torch.float32
+        self.index_head_dim = index_head_dim
+        # num head == 1 and head dim == 128 for index_k in NSA
+        assert index_head_dim == 128
+
+        if _is_hip:
+            assert self.page_size == 1
+        else:
+            assert self.page_size == 64
+        with (
+            torch.cuda.use_mem_pool(self.custom_mem_pool)
+            if self.custom_mem_pool
+            else nullcontext()
+        ):
+            self.index_k_with_scale_buffer = [
+                torch.zeros(
+                    # Layout:
+                    #     ref: test_attention.py :: kv_cache_cast_to_fp8
+                    #     shape: (num_pages, page_size 64 * head_dim 128 + page_size 64 * fp32_nbytes 4)
+                    #     data: for page i,
+                    #         * buf[i, :page_size * head_dim] for fp8 data
+                    #         * buf[i, page_size * head_dim:].view(float32) for scale
+                    (
+                        (size + page_size + 1) // self.page_size,
+                        self.page_size
+                        * (
+                            index_head_dim + index_head_dim // self.quant_block_size * 4
+                        ),
+                    ),
+                    dtype=self.index_k_with_scale_buffer_dtype,
+                    device=device,
+                )
+                for _ in range(layer_num)
+            ]
+        self._finalize_allocation_log(size)
+
+    def get_index_k_with_scale_buffer(self, layer_id: int) -> torch.Tensor:
+        if self.layer_transfer_counter is not None:
+            self.layer_transfer_counter.wait_until(layer_id - self.start_layer)
+        return self.index_k_with_scale_buffer[layer_id - self.start_layer]
+
+    def get_index_k_continuous(
+        self,
+        layer_id: int,
+        seq_len: int,
+        page_indices: torch.Tensor,
+    ):
+        buf = self.index_k_with_scale_buffer[layer_id - self.start_layer]
+        return index_buf_accessor.GetK.execute(
+            self, buf, seq_len=seq_len, page_indices=page_indices
+        )
+
+    def get_index_k_scale_continuous(
+        self,
+        layer_id: int,
+        seq_len: int,
+        page_indices: torch.Tensor,
+    ):
+        buf = self.index_k_with_scale_buffer[layer_id - self.start_layer]
+        return index_buf_accessor.GetS.execute(
+            self, buf, seq_len=seq_len, page_indices=page_indices
+        )
+
+    def get_index_k_scale_buffer(
+        self,
+        layer_id: int,
+        seq_len: int,
+        page_indices: torch.Tensor,
+    ):
+        """
+        Fused method to get both index K and scale data in a single call using Triton.
+        More efficient than calling get_index_k_continuous and get_index_k_scale_continuous separately.
+
+        :param layer_id: Layer index
+        :param seq_len: Sequence length
+        :param page_indices: Page indices tensor
+        :return: tuple of (k_fp8, k_scale) where
+                 k_fp8: (seq_len, index_head_dim), uint8
+                 k_scale: (seq_len, 4), uint8
+        """
+        buf = self.index_k_with_scale_buffer[layer_id - self.start_layer]
+        return index_buf_accessor.GetKAndS.execute(
+            self, buf, seq_len=seq_len, page_indices=page_indices
+        )
+
+    def set_index_k_scale_buffer(
+        self,
+        layer_id: int,
+        loc: torch.Tensor,
+        index_k: torch.Tensor,
+        index_k_scale: torch.Tensor,
+    ) -> None:
+        buf = self.index_k_with_scale_buffer[layer_id - self.start_layer]
+        index_buf_accessor.SetKAndS.execute(
+            pool=self, buf=buf, loc=loc, index_k=index_k, index_k_scale=index_k_scale
+        )
+
+    def get_state_buf_infos(self):
+        data_ptrs = [
+            self.index_k_with_scale_buffer[i].data_ptr() for i in range(self.layer_num)
+        ]
+        data_lens = [
+            self.index_k_with_scale_buffer[i].nbytes for i in range(self.layer_num)
+        ]
+        item_lens = [
+            self.index_k_with_scale_buffer[i][0].nbytes for i in range(self.layer_num)
+        ]
+        return data_ptrs, data_lens, item_lens
+
+    def get_kv_size_bytes(self):
+        kv_size_bytes = super().get_kv_size_bytes()
+        for index_k_cache in self.index_k_with_scale_buffer:
+            kv_size_bytes += get_tensor_size_bytes(index_k_cache)
+        return kv_size_bytes
+
+
+class DoubleSparseTokenToKVPool(KVCache):
+    def __init__(
+        self,
+        size: int,
+        page_size: int,
+        dtype: torch.dtype,
+        head_num: int,
+        head_dim: int,
+        layer_num: int,
+        device: str,
+        heavy_channel_num: int,
+        enable_memory_saver: bool,
+        start_layer: Optional[int] = None,
+        end_layer: Optional[int] = None,
+    ):
+        super().__init__(
+            size,
+            page_size,
+            dtype,
+            layer_num,
+            device,
+            enable_memory_saver,
+            start_layer,
+            end_layer,
+        )
+
+        with self.memory_saver_adapter.region(GPU_MEMORY_TYPE_KV_CACHE):
+            with (
+                torch.cuda.use_mem_pool(self.custom_mem_pool)
+                if self.enable_custom_mem_pool
+                else nullcontext()
+            ):
+                # [size, head_num, head_dim] for each layer
+                self.k_buffer = [
+                    torch.zeros(
+                        (size + page_size, head_num, head_dim),
+                        dtype=dtype,
+                        device=device,
+                    )
+                    for _ in range(layer_num)
+                ]
+                self.v_buffer = [
+                    torch.zeros(
+                        (size + page_size, head_num, head_dim),
+                        dtype=dtype,
+                        device=device,
+                    )
+                    for _ in range(layer_num)
+                ]
+
+                # [size, head_num, heavy_channel_num] for each layer
+                self.label_buffer = [
+                    torch.zeros(
+                        (size + 1, head_num, heavy_channel_num),
+                        dtype=dtype,
+                        device=device,
+                    )
+                    for _ in range(layer_num)
+                ]
+
+    def get_key_buffer(self, layer_id: int):
+        return self.k_buffer[layer_id - self.start_layer]
+
+    def get_value_buffer(self, layer_id: int):
+        return self.v_buffer[layer_id - self.start_layer]
+
+    def get_label_buffer(self, layer_id: int):
+        return self.label_buffer[layer_id - self.start_layer]
+
+    def get_kv_buffer(self, layer_id: int):
+        return (
+            self.k_buffer[layer_id - self.start_layer],
+            self.v_buffer[layer_id - self.start_layer],
+        )
+
+    def set_kv_buffer(
+        self,
+        layer: RadixAttention,
+        loc: torch.Tensor,
+        cache_k: torch.Tensor,
+        cache_v: torch.Tensor,
+        cache_label: torch.Tensor,
+    ):
+        # NOTE(Andy): ignore the dtype check
+        layer_id = layer.layer_id
+        self.k_buffer[layer_id - self.start_layer][loc] = cache_k
+        self.v_buffer[layer_id - self.start_layer][loc] = cache_v
+        self.label_buffer[layer_id - self.start_layer][loc] = cache_label
+
+
+def move_kv_cache_native(
+    k_buffer: List[torch.Tensor],
+    v_buffer: List[torch.Tensor],
+    tgt_loc: torch.Tensor,
+    src_loc: torch.Tensor,
+):
+    if tgt_loc.numel() == 0:
+        return
+
+    tgt_loc_flat = tgt_loc.view(-1).long()
+    src_loc_flat = src_loc.view(-1).long()
+    for k_cache, v_cache in zip(k_buffer, v_buffer):
+        k_cache[tgt_loc_flat] = k_cache[src_loc_flat]
+        v_cache[tgt_loc_flat] = v_cache[src_loc_flat]
+
+
+@triton.jit
+def copy_all_layer_kv_cache_tiled(
+    data_ptrs,
+    strides,
+    tgt_loc_ptr,
+    src_loc_ptr,
+    num_locs,
+    num_locs_upper: tl.constexpr,
+    BYTES_PER_TILE: tl.constexpr,
+):
+    """2D tiled kernel. Safe for in-place copy."""
+    bid = tl.program_id(0)
+    tid = tl.program_id(1)
+
+    stride = tl.load(strides + bid)
+    base_ptr = tl.load(data_ptrs + bid)
+    base_ptr = tl.cast(base_ptr, tl.pointer_type(tl.uint8))
+
+    byte_off = tid * BYTES_PER_TILE + tl.arange(0, BYTES_PER_TILE)
+    mask_byte = byte_off < stride
+    tl.multiple_of(byte_off, 16)
+
+    loc_idx = tl.arange(0, num_locs_upper)
+    mask_loc = loc_idx < num_locs
+
+    src = tl.load(src_loc_ptr + loc_idx, mask=mask_loc, other=0)
+    tgt = tl.load(tgt_loc_ptr + loc_idx, mask=mask_loc, other=0)
+
+    src_ptr = base_ptr + src[:, None] * stride + byte_off[None, :]
+    tgt_ptr = base_ptr + tgt[:, None] * stride + byte_off[None, :]
+
+    mask = mask_loc[:, None] & mask_byte[None, :]
+    vals = tl.load(src_ptr, mask=mask)
+    tl.store(tgt_ptr, vals, mask=mask)
diff --git a/sglang/python/sglang/srt/mem_cache/memory_pool_host.py b/sglang/python/sglang/srt/mem_cache/memory_pool_host.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b8199d9f7e9af7677d21f680295593d4d0c47aa
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/memory_pool_host.py
@@ -0,0 +1,1238 @@
+import abc
+import logging
+import threading
+from collections import defaultdict
+from functools import wraps
+from typing import Optional
+
+import psutil
+import torch
+
+from sglang.jit_kernel.hicache import (
+    can_use_hicache_jit_kernel,
+)
+from sglang.jit_kernel.hicache import (
+    transfer_hicache_all_layer as jit_transfer_hicache_all_layer,
+)
+from sglang.jit_kernel.hicache import (
+    transfer_hicache_one_layer as jit_transfer_hicache_one_layer,
+)
+from sglang.srt.mem_cache.memory_pool import (
+    KVCache,
+    MHATokenToKVPool,
+    MLATokenToKVPool,
+    NSATokenToKVPool,
+)
+from sglang.srt.utils import is_cuda, is_npu, is_xpu
+
+_is_cuda = is_cuda()
+_is_npu = is_npu()
+_is_xpu = is_xpu()
+if not (_is_npu or _is_xpu):
+    from sgl_kernel.kvcacheio import (
+        transfer_kv_all_layer,
+        transfer_kv_all_layer_direct_lf_pf,
+        transfer_kv_all_layer_lf_pf,
+        transfer_kv_all_layer_lf_ph,
+        transfer_kv_all_layer_mla,
+        transfer_kv_all_layer_mla_lf_pf,
+        transfer_kv_direct,
+        transfer_kv_per_layer,
+        transfer_kv_per_layer_direct_pf_lf,
+        transfer_kv_per_layer_mla,
+        transfer_kv_per_layer_mla_pf_lf,
+        transfer_kv_per_layer_pf_lf,
+        transfer_kv_per_layer_ph_lf,
+    )
+if _is_npu:
+    from sgl_kernel_npu.kvcacheio import TransferDirection, transfer_kv_dim_exchange
+
+logger = logging.getLogger(__name__)
+
+
+def synchronized(func):
+    @wraps(func)
+    def wrapper(self, *args, **kwargs):
+        with self.lock:
+            return func(self, *args, **kwargs)
+
+    return wrapper
+
+
+class HostTensorAllocator(abc.ABC):
+    def __init__(self):
+        """Initialize the HostTensorAllocator."""
+        self.dtype = None
+        self.dims = None
+
+    def allocate(self, dims: tuple, dtype: torch.dtype, device: str) -> torch.Tensor:
+        """Allocate a tensor of given dims and dtype on the memory."""
+        self.dtype = dtype
+        self.dims = dims
+        tensor = torch.empty(dims, dtype=dtype, device=device)
+        return tensor
+
+
+def get_allocator_from_storage(allocator_type):
+    if allocator_type == "mooncake":
+        try:
+            from sglang.srt.mem_cache.storage.mooncake_store.mooncake_store import (
+                MooncakeHostTensorAllocator,
+            )
+
+            return MooncakeHostTensorAllocator()
+        except ImportError:
+            logger.warning(
+                "Mooncake's tensor allocator requires mooncake >= 0.3.8.post1. "
+                "Please upgrade Mooncake by 'pip install mooncake-transfer-engine --upgrade'. "
+                "Fallback to use default allocator."
+            )
+            return HostTensorAllocator()
+    else:
+        return HostTensorAllocator()
+
+
+def alloc_with_host_register(
+    dims,
+    dtype: torch.dtype,
+    device: str,
+    pin_memory: bool,
+    allocator: HostTensorAllocator,
+) -> torch.Tensor:
+    """
+    Allocate tensor and register host memory with cudaHostRegister.
+    CudaHostRegister only applies when pin_memory=True.
+    """
+    buffer = allocator.allocate(dims, dtype=dtype, device=device)
+    if pin_memory:
+        torch.cuda.cudart().cudaHostRegister(
+            buffer.data_ptr(), buffer.numel() * buffer.element_size(), 0
+        )
+    return buffer
+
+
+def alloc_with_pin_memory(
+    dims,
+    dtype: torch.dtype,
+    device: str,
+    pin_memory: bool,
+    allocator: None,
+) -> torch.Tensor:
+    """
+    Allocate tensor using PyTorch's built-in pin_memory flag.
+    """
+    buffer = torch.empty(dims, dtype=dtype, device=device, pin_memory=pin_memory)
+    return buffer
+
+
+ALLOC_MEMORY_FUNCS = defaultdict(
+    lambda: alloc_with_host_register,
+    {
+        "npu": alloc_with_pin_memory,
+    },
+)
+
+
+class HostKVCache(abc.ABC):
+
+    def __init__(
+        self,
+        device_pool: KVCache,
+        host_to_device_ratio: float,
+        host_size: int,
+        page_size: int,
+        layout: str,
+        pin_memory: bool,
+        device: str,
+        allocator_type: str = "default",
+    ):
+        self.device_pool = device_pool
+        self.page_size = page_size
+        self.layout = layout
+        self.pin_memory = pin_memory
+        self.device = device
+        self.allocator = get_allocator_from_storage(allocator_type)
+
+        self.dtype = device_pool.store_dtype
+        self.size_per_token = self.get_size_per_token()
+        if host_size > 0:
+            self.size = int(host_size * 1e9 // self.size_per_token)
+        else:
+            self.size = int(device_pool.size * host_to_device_ratio)
+        # Align up the host memory pool size to the page size
+        self.page_num = self.size // self.page_size + 1
+        self.size = self.page_num * self.page_size
+        self.start_layer = device_pool.start_layer
+        self.end_layer = device_pool.end_layer
+
+        assert (
+            self.size > device_pool.size
+        ), "The host memory should be larger than the device memory with the current protocol"
+
+        # Verify there is enough available host memory.
+        host_mem = psutil.virtual_memory()
+        requested_bytes = self.size * self.size_per_token
+        # preserve at least 10GB for other usage
+        ten_gb = 10 * (1024**3)
+        available_bytes = host_mem.available - ten_gb
+        if requested_bytes > available_bytes:
+            raise ValueError(
+                f"Not enough host memory available. Requesting "
+                f"{requested_bytes / 1e9:.2f} GB but only have "
+                f"{available_bytes / 1e9:.2f} GB free. Please reduce the "
+                f"size of the hierarchical cache."
+            )
+        else:
+            logger.info(
+                f"Allocating {requested_bytes / 1e9:.2f} GB host memory for hierarchical KV cache."
+            )
+
+        self.kv_buffer = self.init_kv_buffer()
+
+        # A lock for synchronized operations on memory allocation and state transitions.
+        self.lock = threading.RLock()
+        self.clear()
+
+    @abc.abstractmethod
+    def get_size_per_token(self):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def init_kv_buffer(self):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def load_to_device_per_layer(
+        self, device_pool, host_indices, device_indices, layer_id, io_backend
+    ) -> None:
+        """
+        Load KV data from the host memory pool to the device memory pool for a specific layer.
+        """
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def backup_from_device_all_layer(
+        self, device_pool, host_indices, device_indices, io_backend
+    ) -> None:
+        """
+        Backup KV data from the device memory pool to the host memory pool for all layers.
+        """
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def get_data_page(self, index, flat: bool = True) -> torch.Tensor:
+        """
+        Get a flat data page from the host memory pool.
+        """
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def get_dummy_flat_data_page(self) -> torch.Tensor:
+        """
+        Get a dummy flat data page from the host memory pool.
+        This is used for prefetching or initializing empty pages.
+        """
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def set_from_flat_data_page(self, index: int, data_page: torch.Tensor) -> None:
+        """
+        Set a flat data page to the host memory pool.
+        """
+        raise NotImplementedError()
+
+    @synchronized
+    def clear(self):
+        # Initialize memory states and tracking structures.
+        self.mem_state = torch.zeros(
+            (self.size,), dtype=torch.uint8, device=self.device
+        )
+        self.free_slots = torch.arange(self.size, dtype=torch.int64)
+
+    def available_size(self):
+        return len(self.free_slots)
+
+    @synchronized
+    def alloc(self, need_size: int) -> Optional[torch.Tensor]:
+        assert (
+            need_size % self.page_size == 0
+        ), "The requested size should be a multiple of the page size."
+        if need_size > self.available_size():
+            return None
+
+        select_index = self.free_slots[:need_size]
+        self.free_slots = self.free_slots[need_size:]
+
+        return select_index
+
+    @synchronized
+    def free(self, indices: torch.Tensor) -> int:
+        self.free_slots = torch.cat([self.free_slots, indices])
+        return len(indices)
+
+
+class MHATokenToKVPoolHost(HostKVCache):
+    device_pool: MHATokenToKVPool
+
+    def __init__(
+        self,
+        device_pool: MHATokenToKVPool,
+        host_to_device_ratio: float,
+        host_size: int,
+        page_size: int,
+        layout: str,
+        pin_memory: bool = True,
+        device: str = "cpu",
+        allocator_type: str = "default",
+    ):
+        super().__init__(
+            device_pool,
+            host_to_device_ratio,
+            host_size,
+            page_size,
+            layout,
+            pin_memory,
+            device,
+            allocator_type,
+        )
+        self.element_dim = self.device_pool.head_num * self.device_pool.head_dim
+        self.can_use_jit = _is_cuda and can_use_hicache_jit_kernel(
+            element_size=self.element_dim * self.dtype.itemsize
+        )
+
+        self.k_data_refs = [self.k_buffer[i] for i in range(self.layer_num)]
+        self.v_data_refs = [self.v_buffer[i] for i in range(self.layer_num)]
+        self.k_data_ptrs = torch.tensor(
+            [x.data_ptr() for x in self.k_data_refs],
+            dtype=torch.uint64,
+            device=self.device_pool.device,
+        )
+        self.v_data_ptrs = torch.tensor(
+            [x.data_ptr() for x in self.v_data_refs],
+            dtype=torch.uint64,
+            device=self.device_pool.device,
+        )
+
+    def get_size_per_token(self):
+        self.head_num = self.device_pool.head_num
+        self.head_dim = self.device_pool.head_dim
+        self.layer_num = self.device_pool.layer_num
+
+        return self.head_dim * self.head_num * self.layer_num * self.dtype.itemsize * 2
+
+    def get_ksize_per_token(self):
+        return self.get_size_per_token() // 2
+
+    def init_kv_buffer(self):
+        if self.layout == "layer_first":
+            dims = (2, self.layer_num, self.size, self.head_num, self.head_dim)
+        elif self.layout == "page_first":
+            dims = (2, self.size, self.layer_num, self.head_num, self.head_dim)
+        elif self.layout == "page_first_direct":
+            dims = (
+                2,
+                self.page_num,
+                self.layer_num,
+                self.page_size,
+                self.head_num,
+                self.head_dim,
+            )
+        elif self.layout == "page_head":
+            dims = (
+                2,
+                self.page_num,
+                self.head_num,
+                self.page_size,
+                self.layer_num,
+                self.head_dim,
+            )
+        else:
+            raise ValueError(f"Unsupported layout: {self.layout}")
+        self.token_stride_size = self.head_num * self.head_dim * self.dtype.itemsize
+        self.layout_dim = self.token_stride_size * self.layer_num
+
+        alloc_func = ALLOC_MEMORY_FUNCS[self.device_pool.device]
+        buffer = alloc_func(
+            dims,
+            dtype=self.dtype,
+            device=self.device,
+            pin_memory=self.pin_memory,
+            allocator=self.allocator,
+        )
+        return buffer
+
+    @property
+    def k_buffer(self):
+        return self.kv_buffer[0]
+
+    @property
+    def v_buffer(self):
+        return self.kv_buffer[1]
+
+    def load_to_device_per_layer(
+        self,
+        device_pool,
+        host_indices,
+        device_indices,
+        layer_id,
+        io_backend,
+    ):
+        if io_backend == "kernel":
+            if self.layout == "layer_first":
+                if self.can_use_jit:
+                    jit_transfer_hicache_one_layer(
+                        k_cache_dst=device_pool.k_buffer[layer_id],
+                        v_cache_dst=device_pool.v_buffer[layer_id],
+                        k_cache_src=self.k_buffer[layer_id],
+                        v_cache_src=self.v_buffer[layer_id],
+                        indices_dst=device_indices,
+                        indices_src=host_indices,
+                        element_dim=self.element_dim,
+                    )
+                else:
+                    transfer_kv_per_layer(
+                        src_k=self.k_buffer[layer_id],
+                        dst_k=device_pool.k_buffer[layer_id],
+                        src_v=self.v_buffer[layer_id],
+                        dst_v=device_pool.v_buffer[layer_id],
+                        src_indices=host_indices,
+                        dst_indices=device_indices,
+                        item_size=self.token_stride_size,
+                    )
+            elif self.layout == "page_first":
+                transfer_kv_per_layer_pf_lf(
+                    src_k=self.k_buffer,
+                    dst_k=device_pool.k_buffer[layer_id],
+                    src_v=self.v_buffer,
+                    dst_v=device_pool.v_buffer[layer_id],
+                    src_indices=host_indices,
+                    dst_indices=device_indices,
+                    layer_id=layer_id,
+                    item_size=self.token_stride_size,
+                    src_layout_dim=self.layout_dim,
+                )
+            elif self.layout == "page_head":
+                transfer_kv_per_layer_ph_lf(
+                    src_k=self.k_buffer,
+                    dst_k=device_pool.k_buffer[layer_id],
+                    src_v=self.v_buffer,
+                    dst_v=device_pool.v_buffer[layer_id],
+                    src_indices=host_indices,
+                    dst_indices=device_indices,
+                    layer_id=layer_id,
+                    item_size=self.token_stride_size,
+                    src_layout_dim=self.layout_dim,
+                    page_size=self.page_size,
+                    head_num=self.head_num,
+                )
+            else:
+                raise ValueError(f"Unsupported layout: {self.layout}")
+        elif io_backend == "direct":
+            if self.layout == "layer_first":
+                transfer_kv_direct(
+                    src_layers=[self.k_buffer[layer_id], self.v_buffer[layer_id]],
+                    dst_layers=[
+                        device_pool.k_buffer[layer_id],
+                        device_pool.v_buffer[layer_id],
+                    ],
+                    src_indices=host_indices,
+                    dst_indices=device_indices,
+                    page_size=self.page_size,
+                )
+            elif self.layout == "page_first_direct":
+                transfer_kv_per_layer_direct_pf_lf(
+                    src_ptrs=[self.k_buffer, self.v_buffer],
+                    dst_ptrs=[
+                        device_pool.k_buffer[layer_id],
+                        device_pool.v_buffer[layer_id],
+                    ],
+                    src_indices=host_indices,
+                    dst_indices=device_indices,
+                    layer_id=layer_id,
+                    page_size=self.page_size,
+                )
+            else:
+                raise ValueError(f"Unsupported layout: {self.layout}")
+        elif io_backend == "kernel_ascend":
+            if self.layout == "page_first_direct":
+                # Ascend-specific: transfer KV data for all layers when layer_id == 0
+                if layer_id == 0:
+                    transfer_kv_dim_exchange(
+                        device_indices=device_indices,
+                        host_indices=host_indices,
+                        device_k=device_pool.k_buffer,
+                        host_k=self.k_buffer,
+                        device_v=device_pool.v_buffer,
+                        host_v=self.v_buffer,
+                        page_size=self.page_size,
+                        direction=TransferDirection.H2D,
+                    )
+            else:
+                raise ValueError(f"Unsupported layout: {self.layout}")
+        else:
+            raise ValueError(f"Unsupported IO backend: {io_backend}")
+
+    def backup_from_device_all_layer(
+        self, device_pool, host_indices, device_indices, io_backend
+    ):
+        if io_backend == "kernel":
+            if self.layout == "layer_first":
+                if self.can_use_jit:
+                    jit_transfer_hicache_all_layer(
+                        k_ptr_dst=self.k_data_ptrs,
+                        v_ptr_dst=self.v_data_ptrs,
+                        indices_dst=host_indices,
+                        k_ptr_src=device_pool.k_data_ptrs,
+                        v_ptr_src=device_pool.v_data_ptrs,
+                        indices_src=device_indices,
+                        kv_cache_dst_stride_bytes=self.token_stride_size,
+                        kv_cache_src_stride_bytes=self.token_stride_size,
+                        element_size=self.element_dim * self.dtype.itemsize,
+                    )
+                else:
+                    transfer_kv_all_layer(
+                        src_k_layers=device_pool.k_data_ptrs,
+                        dst_k_layers=self.k_data_ptrs,
+                        src_v_layers=device_pool.v_data_ptrs,
+                        dst_v_layers=self.v_data_ptrs,
+                        src_indices=device_indices,
+                        dst_indices=host_indices,
+                        item_size=self.token_stride_size,
+                        num_layers=self.layer_num,
+                    )
+            elif self.layout == "page_first":
+                transfer_kv_all_layer_lf_pf(
+                    src_k_layers=device_pool.k_data_ptrs,
+                    dst_k=self.k_buffer,
+                    src_v_layers=device_pool.v_data_ptrs,
+                    dst_v=self.v_buffer,
+                    src_indices=device_indices,
+                    dst_indices=host_indices,
+                    item_size=self.token_stride_size,
+                    dst_layout_dim=self.layout_dim,
+                    num_layers=self.layer_num,
+                )
+            elif self.layout == "page_head":
+                transfer_kv_all_layer_lf_ph(
+                    src_k_layers=device_pool.k_data_ptrs,
+                    dst_k=self.k_buffer,
+                    src_v_layers=device_pool.v_data_ptrs,
+                    dst_v=self.v_buffer,
+                    src_indices=device_indices,
+                    dst_indices=host_indices,
+                    item_size=self.token_stride_size,
+                    dst_layout_dim=self.layout_dim,
+                    num_layers=self.layer_num,
+                    page_size=self.page_size,
+                    head_num=self.head_num,
+                )
+            else:
+                raise ValueError(f"Unsupported layout: {self.layout}")
+        elif io_backend == "direct":
+            if self.layout == "layer_first":
+                transfer_kv_direct(
+                    src_layers=device_pool.k_buffer + device_pool.v_buffer,
+                    dst_layers=self.k_data_refs + self.v_data_refs,
+                    src_indices=device_indices,
+                    dst_indices=host_indices,
+                    page_size=self.page_size,
+                )
+            elif self.layout == "page_first_direct":
+                transfer_kv_all_layer_direct_lf_pf(
+                    src_ptrs=device_pool.k_buffer + device_pool.v_buffer,
+                    dst_ptrs=[self.k_buffer, self.v_buffer],
+                    src_indices=device_indices,
+                    dst_indices=host_indices,
+                    page_size=self.page_size,
+                )
+            else:
+                raise ValueError(f"Unsupported layout: {self.layout}")
+        elif io_backend == "kernel_ascend":
+            if self.layout == "page_first_direct":
+                transfer_kv_dim_exchange(
+                    device_indices=device_indices,
+                    host_indices=host_indices,
+                    device_k=device_pool.k_buffer,
+                    host_k=self.k_buffer,
+                    device_v=device_pool.v_buffer,
+                    host_v=self.v_buffer,
+                    page_size=self.page_size,
+                    direction=TransferDirection.D2H,
+                )
+            else:
+                raise ValueError(f"Unsupported layout: {self.layout}")
+        else:
+            raise ValueError(f"Unsupported IO backend: {io_backend}")
+
+    def get_data_page(self, index, flat: bool = True) -> torch.Tensor:
+        if self.layout == "layer_first":
+            data_page = self.kv_buffer[:, :, index : index + self.page_size, :, :]
+        elif self.layout == "page_first":
+            data_page = self.kv_buffer[:, index : index + self.page_size, :, :, :]
+        elif self.layout in ["page_first_direct", "page_head"]:
+            real_index = index // self.page_size
+            data_page = self.kv_buffer[:, real_index : real_index + 1, :, :, :, :]
+        else:
+            raise ValueError(f"Unsupported layout: {self.layout}")
+        if flat:
+            data_page = data_page.flatten()
+        return data_page
+
+    def get_dummy_flat_data_page(self) -> torch.Tensor:
+        return torch.zeros(
+            (2, self.layer_num, self.page_size, self.head_num, self.head_dim),
+            dtype=self.dtype,
+            device=self.device,
+            pin_memory=self.pin_memory,
+        ).flatten()
+
+    def set_from_flat_data_page(self, index: int, data_page: torch.Tensor) -> None:
+        if self.layout == "layer_first":
+            self.kv_buffer[:, :, index : index + self.page_size, :, :] = (
+                data_page.reshape(
+                    2,
+                    self.layer_num,
+                    self.page_size,
+                    self.head_num,
+                    self.head_dim,
+                )
+            )
+        elif self.layout == "page_first":
+            self.kv_buffer[:, index : index + self.page_size, :, :, :] = (
+                data_page.reshape(
+                    2, self.page_size, self.layer_num, self.head_num, self.head_dim
+                )
+            )
+        elif self.layout == "page_first_direct":
+            real_index = index // self.page_size
+            self.kv_buffer[:, real_index : real_index + 1, :, :, :, :] = (
+                data_page.reshape(
+                    2, 1, self.layer_num, self.page_size, self.head_num, self.head_dim
+                )
+            )
+        elif self.layout == "page_head":
+            real_index = index // self.page_size
+            self.kv_buffer[:, real_index : real_index + 1, :, :, :, :] = (
+                data_page.reshape(
+                    2, 1, self.head_num, self.page_size, self.layer_num, self.head_dim
+                )
+            )
+        else:
+            raise ValueError(f"Unsupported layout: {self.layout}")
+
+    def get_split_heads_page_buffer_meta(
+        self, indices: torch.Tensor, split_factor: int
+    ):
+        """
+        get meta data for zero copy of heterogeneous ranks' KVCache
+        """
+        assert self.layout == "page_head"
+        assert len(indices) % self.page_size == 0
+        assert self.head_num % split_factor == 0
+        ptr_list = []
+        kv_buffer_data_ptr = self.kv_buffer.data_ptr()
+        indices = indices.tolist()
+        v_offset = (
+            self.layer_num
+            * self.size
+            * self.head_num
+            * self.head_dim
+            * self.dtype.itemsize
+        )
+        for index in range(0, len(indices), self.page_size):
+            for head_id in range(0, self.head_num, self.head_num // split_factor):
+                k_ptr = (
+                    kv_buffer_data_ptr
+                    + indices[index]
+                    * self.layer_num
+                    * self.head_num
+                    * self.head_dim
+                    * self.dtype.itemsize
+                    + head_id
+                    * self.page_size
+                    * self.layer_num
+                    * self.head_dim
+                    * self.dtype.itemsize
+                )
+                v_ptr = k_ptr + v_offset
+                ptr_list.append(k_ptr)
+                ptr_list.append(v_ptr)
+        element_size = (
+            self.layer_num
+            * self.dtype.itemsize
+            * self.page_size
+            * self.head_num
+            * self.head_dim
+            // split_factor
+        )
+        element_size_list = [element_size] * len(ptr_list)
+        return ptr_list, element_size_list
+
+    def get_page_buffer_meta(self, indices):
+        """ "
+        meta data for zero copy
+        """
+        assert len(indices) % self.page_size == 0
+        ptr_list = []
+        kv_buffer_data_ptr = self.kv_buffer.data_ptr()
+        indices = indices.tolist()
+        v_offset = (
+            self.layer_num
+            * self.size
+            * self.head_num
+            * self.head_dim
+            * self.dtype.itemsize
+        )
+        if self.layout == "layer_first":
+            for index in range(0, len(indices), self.page_size):
+                for layer_id in range(self.layer_num):
+                    k_ptr = (
+                        kv_buffer_data_ptr
+                        + indices[index]
+                        * self.head_num
+                        * self.head_dim
+                        * self.dtype.itemsize
+                        + layer_id
+                        * self.size
+                        * self.head_num
+                        * self.head_dim
+                        * self.dtype.itemsize
+                    )
+                    v_ptr = k_ptr + v_offset
+                    ptr_list.append(k_ptr)
+                    ptr_list.append(v_ptr)
+            element_size = (
+                self.dtype.itemsize * self.page_size * self.head_num * self.head_dim
+            )
+            element_size_list = [element_size] * len(ptr_list)
+        elif self.layout in ["page_first", "page_first_direct", "page_head"]:
+            for index in range(0, len(indices), self.page_size):
+                k_ptr = (
+                    kv_buffer_data_ptr
+                    + indices[index]
+                    * self.layer_num
+                    * self.head_num
+                    * self.head_dim
+                    * self.dtype.itemsize
+                )
+                v_ptr = k_ptr + v_offset
+                ptr_list.append(k_ptr)
+                ptr_list.append(v_ptr)
+            element_size = (
+                self.layer_num
+                * self.dtype.itemsize
+                * self.page_size
+                * self.head_num
+                * self.head_dim
+            )
+            element_size_list = [element_size] * len(ptr_list)
+        else:
+            raise ValueError(f"Unsupported layout: {self.layout}")
+        return ptr_list, element_size_list
+
+
+class MLATokenToKVPoolHost(HostKVCache):
+    device_pool: MLATokenToKVPool
+
+    def __init__(
+        self,
+        device_pool: MLATokenToKVPool,
+        host_to_device_ratio: float,
+        host_size: int,
+        page_size: int,
+        layout: str,
+        pin_memory: bool = True,
+        device: str = "cpu",
+        allocator_type: str = "default",
+        override_kv_cache_dim: Optional[int] = None,
+    ):
+        self.override_kv_cache_dim = override_kv_cache_dim
+        super().__init__(
+            device_pool,
+            host_to_device_ratio,
+            host_size,
+            page_size,
+            layout,
+            pin_memory,
+            device,
+            allocator_type,
+        )
+        self.data_refs = [self.kv_buffer[i] for i in range(self.layer_num)]
+        self.data_ptrs = torch.tensor(
+            [x.data_ptr() for x in self.data_refs],
+            dtype=torch.uint64,
+            device=self.device_pool.device,
+        )
+
+    def get_size_per_token(self):
+        self.kv_lora_rank = self.device_pool.kv_lora_rank
+        self.qk_rope_head_dim = self.device_pool.qk_rope_head_dim
+        self.layer_num = self.device_pool.layer_num
+        self.kv_cache_dim = self.override_kv_cache_dim or (
+            self.kv_lora_rank + self.qk_rope_head_dim
+        )
+        return self.kv_cache_dim * self.dtype.itemsize * self.layer_num
+
+    def get_ksize_per_token(self):
+        return self.get_size_per_token()
+
+    def init_kv_buffer(self):
+        if self.layout == "layer_first":
+            dims = (
+                self.layer_num,
+                self.size,
+                1,
+                self.kv_cache_dim,
+            )
+        elif self.layout == "page_first":
+            dims = (
+                self.size,
+                self.layer_num,
+                1,
+                self.kv_cache_dim,
+            )
+        elif self.layout == "page_first_direct":
+            dims = (
+                self.page_num,
+                self.layer_num,
+                self.page_size,
+                1,
+                self.kv_cache_dim,
+            )
+        # Ascend-specific: Aligns with NPUMLATokenToKVPool layout
+        # Separately allocate k_buffer and v_buffer for easier data transfer.
+        elif self.layout == "page_first_kv_split":
+            base_dims = (
+                self.page_num,
+                self.layer_num,
+                self.page_size,
+                1,
+            )
+            alloc_func = ALLOC_MEMORY_FUNCS[self.device_pool.device]
+            self.k_buffer = alloc_func(
+                (*base_dims, self.kv_lora_rank),
+                dtype=self.dtype,
+                device=self.device,
+                pin_memory=self.pin_memory,
+                allocator=self.allocator,
+            )
+            self.v_buffer = alloc_func(
+                (*base_dims, self.qk_rope_head_dim),
+                dtype=self.dtype,
+                device=self.device,
+                pin_memory=self.pin_memory,
+                allocator=self.allocator,
+            )
+            self.index_k_buffer = None
+            if self.device_pool.index_head_dim is not None:
+                self.index_k_buffer = alloc_func(
+                    (*base_dims, self.device_pool.index_head_dim),
+                    dtype=self.dtype,
+                    device=self.device,
+                    pin_memory=self.pin_memory,
+                    allocator=self.allocator,
+                )
+            # Return k_buffer to preserve original kv_buffer and data_refs init logic,
+            # though Ascend doesn't use these parameters.
+            return self.k_buffer
+        else:
+            raise ValueError(f"Unsupported layout: {self.layout}")
+        self.token_stride_size = self.kv_cache_dim * self.dtype.itemsize
+        self.layout_dim = self.token_stride_size * self.layer_num
+
+        alloc_func = ALLOC_MEMORY_FUNCS[self.device_pool.device]
+        buffer = alloc_func(
+            dims,
+            dtype=self.dtype,
+            device=self.device,
+            pin_memory=self.pin_memory,
+            allocator=self.allocator,
+        )
+        return buffer
+
+    def load_to_device_per_layer(
+        self, device_pool, host_indices, device_indices, layer_id, io_backend
+    ):
+        if io_backend == "kernel":
+            if self.layout == "layer_first":
+                transfer_kv_per_layer_mla(
+                    src=self.kv_buffer[layer_id],
+                    dst=device_pool.kv_buffer[layer_id],
+                    src_indices=host_indices,
+                    dst_indices=device_indices,
+                    item_size=self.token_stride_size,
+                )
+            elif self.layout == "page_first":
+                transfer_kv_per_layer_mla_pf_lf(
+                    src=self.kv_buffer,
+                    dst=device_pool.kv_buffer[layer_id],
+                    src_indices=host_indices,
+                    dst_indices=device_indices,
+                    layer_id=layer_id,
+                    item_size=self.token_stride_size,
+                    src_layout_dim=self.layout_dim,
+                )
+            else:
+                raise ValueError(f"Unsupported layout: {self.layout}")
+        elif io_backend == "direct":
+            if self.layout == "layer_first":
+                transfer_kv_direct(
+                    src_layers=[self.kv_buffer[layer_id]],
+                    dst_layers=[device_pool.kv_buffer[layer_id]],
+                    src_indices=host_indices,
+                    dst_indices=device_indices,
+                    page_size=self.page_size,
+                )
+            elif self.layout == "page_first_direct":
+                transfer_kv_per_layer_direct_pf_lf(
+                    src_ptrs=[self.kv_buffer],
+                    dst_ptrs=[device_pool.kv_buffer[layer_id]],
+                    src_indices=host_indices,
+                    dst_indices=device_indices,
+                    layer_id=layer_id,
+                    page_size=self.page_size,
+                )
+            else:
+                raise ValueError(f"Unsupported layout: {self.layout}")
+        elif io_backend == "kernel_ascend":
+            if self.layout == "page_first_kv_split":
+                # Ascend-specific: transfer KV data for all layers when layer_id == 0
+                if layer_id == 0:
+                    transfer_kv_dim_exchange(
+                        device_indices=device_indices,
+                        host_indices=host_indices,
+                        device_k=device_pool.k_buffer,
+                        host_k=self.k_buffer,
+                        device_v=device_pool.v_buffer,
+                        host_v=self.v_buffer,
+                        device_index_k=device_pool.index_k_buffer,
+                        host_index_k=self.index_k_buffer,
+                        page_size=self.page_size,
+                        direction=TransferDirection.H2D,
+                    )
+            else:
+                raise ValueError(f"Unsupported layout: {self.layout}")
+        else:
+            raise ValueError(f"Unsupported IO backend: {io_backend}")
+
+    def backup_from_device_all_layer(
+        self, device_pool, host_indices, device_indices, io_backend
+    ):
+        if io_backend == "kernel":
+            if self.layout == "layer_first":
+                transfer_kv_all_layer_mla(
+                    src_layers=device_pool.data_ptrs,
+                    dst_layers=self.data_ptrs,
+                    src_indices=device_indices,
+                    dst_indices=host_indices,
+                    item_size=self.token_stride_size,
+                    num_layers=self.layer_num,
+                )
+            elif self.layout == "page_first":
+                transfer_kv_all_layer_mla_lf_pf(
+                    src_layers=device_pool.data_ptrs,
+                    dst=self.kv_buffer,
+                    src_indices=device_indices,
+                    dst_indices=host_indices,
+                    item_size=self.token_stride_size,
+                    dst_layout_dim=self.layout_dim,
+                    num_layers=self.layer_num,
+                )
+            else:
+                raise ValueError(f"Unsupported layout: {self.layout}")
+        elif io_backend == "direct":
+            if self.layout == "layer_first":
+                transfer_kv_direct(
+                    src_layers=device_pool.kv_buffer,
+                    dst_layers=self.data_refs,
+                    src_indices=device_indices,
+                    dst_indices=host_indices,
+                    page_size=self.page_size,
+                )
+            elif self.layout == "page_first_direct":
+                transfer_kv_all_layer_direct_lf_pf(
+                    src_ptrs=device_pool.kv_buffer,
+                    dst_ptrs=[self.kv_buffer],
+                    src_indices=device_indices,
+                    dst_indices=host_indices,
+                    page_size=self.page_size,
+                )
+            else:
+                raise ValueError(f"Unsupported layout: {self.layout}")
+        elif io_backend == "kernel_ascend":
+            if self.layout == "page_first_kv_split":
+                transfer_kv_dim_exchange(
+                    device_indices=device_indices,
+                    host_indices=host_indices,
+                    device_k=device_pool.k_buffer,
+                    host_k=self.k_buffer,
+                    device_v=device_pool.v_buffer,
+                    host_v=self.v_buffer,
+                    device_index_k=device_pool.index_k_buffer,
+                    host_index_k=self.index_k_buffer,
+                    page_size=self.page_size,
+                    direction=TransferDirection.D2H,
+                )
+            else:
+                raise ValueError(f"Unsupported layout: {self.layout}")
+        else:
+            raise ValueError(f"Unsupported IO backend: {io_backend}")
+
+    def get_data_page(self, index, flat: bool = True) -> torch.Tensor:
+        if self.layout == "layer_first":
+            data_page = self.kv_buffer[:, index : index + self.page_size, :, :]
+        elif self.layout == "page_first":
+            data_page = self.kv_buffer[index : index + self.page_size, :, :, :]
+        elif self.layout == "page_first_direct":
+            real_index = index // self.page_size
+            data_page = self.kv_buffer[real_index : real_index + 1, :, :, :, :]
+        else:
+            raise ValueError(f"Unsupported layout: {self.layout}")
+        if flat:
+            data_page = data_page.flatten()
+        return data_page
+
+    def get_dummy_flat_data_page(self) -> torch.Tensor:
+        return torch.zeros(
+            (
+                self.layer_num,
+                self.page_size,
+                1,
+                self.kv_cache_dim,
+            ),
+            dtype=self.dtype,
+            device=self.device,
+            pin_memory=self.pin_memory,
+        ).flatten()
+
+    def set_from_flat_data_page(self, index: int, data_page: torch.Tensor) -> None:
+        if self.layout == "layer_first":
+            self.kv_buffer[:, index : index + self.page_size, :, :] = data_page.reshape(
+                self.layer_num,
+                self.page_size,
+                1,
+                self.kv_cache_dim,
+            )
+        elif self.layout == "page_first":
+            self.kv_buffer[index : index + self.page_size, :, :, :] = data_page.reshape(
+                self.page_size,
+                self.layer_num,
+                1,
+                self.kv_cache_dim,
+            )
+        elif self.layout == "page_first_direct":
+            real_index = index // self.page_size
+            self.kv_buffer[real_index : real_index + 1, :, :, :, :] = data_page.reshape(
+                1,
+                self.layer_num,
+                self.page_size,
+                1,
+                self.kv_cache_dim,
+            )
+        else:
+            raise ValueError(f"Unsupported layout: {self.layout}")
+
+    def get_page_buffer_meta(self, indices):
+        """ "
+        meta data for zero copy
+        """
+        assert len(indices) % self.page_size == 0
+        ptr_list = []
+        kv_buffer_data_ptr = self.kv_buffer.data_ptr()
+        indices = indices.tolist()
+        if self.layout == "layer_first":
+            for index in range(0, len(indices), self.page_size):
+                for layer_id in range(self.layer_num):
+                    k_ptr = (
+                        kv_buffer_data_ptr
+                        + indices[index] * self.kv_cache_dim * self.dtype.itemsize
+                        + layer_id * self.size * self.kv_cache_dim * self.dtype.itemsize
+                    )
+                    ptr_list.append(k_ptr)
+            element_size = self.dtype.itemsize * self.page_size * self.kv_cache_dim
+            element_size_list = [element_size] * len(ptr_list)
+        elif self.layout in ["page_first", "page_first_direct"]:
+            for index in range(0, len(indices), self.page_size):
+                k_ptr = (
+                    kv_buffer_data_ptr
+                    + indices[index]
+                    * self.layer_num
+                    * self.kv_cache_dim
+                    * self.dtype.itemsize
+                )
+                ptr_list.append(k_ptr)
+            element_size = (
+                self.layer_num
+                * self.dtype.itemsize
+                * self.page_size
+                * self.kv_cache_dim
+            )
+            element_size_list = [element_size] * len(ptr_list)
+        else:
+            raise ValueError(f"Unsupported layout: {self.layout}")
+        return ptr_list, element_size_list
+
+
+class NSATokenToKVPoolHost(MLATokenToKVPoolHost):
+    device_pool: NSATokenToKVPool
+
+    def __init__(
+        self,
+        device_pool: NSATokenToKVPool,
+        host_to_device_ratio: float,
+        host_size: int,
+        page_size: int,
+        layout: str,
+        pin_memory: bool = True,
+        device: str = "cpu",
+        allocator_type: str = "default",
+    ):
+        # Initialize indexer metadata before HostKVCache.__init__ calls get_size_per_token.
+        self.index_head_dim = device_pool.index_head_dim
+        self.indexer_quant_block_size = device_pool.quant_block_size
+        self.indexer_dtype = NSATokenToKVPool.index_k_with_scale_buffer_dtype
+        self.indexer_size_per_token = (
+            self.index_head_dim
+            + self.index_head_dim // self.indexer_quant_block_size * 4
+        )
+        super().__init__(
+            device_pool,
+            host_to_device_ratio,
+            host_size,
+            page_size,
+            layout,
+            pin_memory,
+            device,
+            allocator_type,
+            override_kv_cache_dim=device_pool.kv_cache_dim,
+        )
+        self.indexer_page_stride_size = (
+            self.indexer_size_per_token * self.page_size * self.indexer_dtype.itemsize
+        )
+        self.indexer_page_num = (self.size + self.page_size + 1) // self.page_size
+        self._init_indexer_buffers()
+        logger.info(
+            f"NSATokenToKVPoolHost initialized with indexer page stride size: {self.indexer_page_stride_size}, page num: {self.indexer_page_num}"
+        )
+
+    def get_size_per_token(self):
+        base = super().get_size_per_token()
+        return (
+            base
+            + self.indexer_size_per_token * self.layer_num * self.indexer_dtype.itemsize
+        )
+
+    def _init_indexer_buffers(self):
+        alloc_func = ALLOC_MEMORY_FUNCS[self.device_pool.device]
+        self.index_k_with_scale_buffer = [
+            alloc_func(
+                (self.indexer_page_num, self.indexer_page_stride_size),
+                dtype=self.indexer_dtype,
+                device=self.device,
+                pin_memory=self.pin_memory,
+                allocator=self.allocator,
+            )
+            for _ in range(self.layer_num)
+        ]
+        self.index_k_data_refs = [
+            self.index_k_with_scale_buffer[i] for i in range(self.layer_num)
+        ]
+        self.index_k_data_ptrs = torch.tensor(
+            [x.data_ptr() for x in self.index_k_data_refs],
+            dtype=torch.uint64,
+            device=self.device_pool.device,
+        )
+        self.index_k_device_ptrs = torch.tensor(
+            [x.data_ptr() for x in self.device_pool.index_k_with_scale_buffer],
+            dtype=torch.uint64,
+            device=self.device_pool.device,
+        )
+
+    def _get_indexer_page_indices(self, host_indices, device_indices):
+        if host_indices.numel() == 0:
+            return host_indices, device_indices
+        if host_indices.numel() % self.page_size != 0:
+            raise ValueError(
+                "Index buffer transfer expects page-aligned indices for NSA."
+            )
+        host_page_indices = (
+            host_indices.reshape(-1, self.page_size)[:, 0] // self.page_size
+        )
+        device_page_indices = (
+            device_indices.reshape(-1, self.page_size)[:, 0] // self.page_size
+        )
+        return host_page_indices, device_page_indices
+
+    def _load_indexer_to_device_per_layer(
+        self, device_pool, host_indices, device_indices, layer_id, io_backend
+    ):
+        host_page_indices, device_page_indices = self._get_indexer_page_indices(
+            host_indices, device_indices
+        )
+        use_kernel = io_backend == "kernel" and self.indexer_page_stride_size % 8 == 0
+        if use_kernel:
+            transfer_kv_per_layer_mla(
+                src=self.index_k_with_scale_buffer[layer_id],
+                dst=device_pool.index_k_with_scale_buffer[layer_id],
+                src_indices=host_page_indices,
+                dst_indices=device_page_indices,
+                item_size=self.indexer_page_stride_size,
+            )
+        else:
+            transfer_kv_direct(
+                src_layers=[self.index_k_with_scale_buffer[layer_id]],
+                dst_layers=[device_pool.index_k_with_scale_buffer[layer_id]],
+                src_indices=host_page_indices,
+                dst_indices=device_page_indices,
+                page_size=1,
+            )
+
+    def _backup_indexer_from_device_all_layer(
+        self, device_pool, host_indices, device_indices, io_backend
+    ):
+        host_page_indices, device_page_indices = self._get_indexer_page_indices(
+            host_indices, device_indices
+        )
+        use_kernel = io_backend == "kernel" and self.indexer_page_stride_size % 8 == 0
+        if use_kernel:
+            transfer_kv_all_layer_mla(
+                src_layers=self.index_k_device_ptrs,
+                dst_layers=self.index_k_data_ptrs,
+                src_indices=device_page_indices,
+                dst_indices=host_page_indices,
+                item_size=self.indexer_page_stride_size,
+                num_layers=self.layer_num,
+            )
+        else:
+            transfer_kv_direct(
+                src_layers=device_pool.index_k_with_scale_buffer,
+                dst_layers=self.index_k_with_scale_buffer,
+                src_indices=device_page_indices,
+                dst_indices=host_page_indices,
+                page_size=1,
+            )
+
+    def load_to_device_per_layer(
+        self,
+        device_pool,
+        host_indices,
+        device_indices,
+        layer_id,
+        io_backend,
+    ):
+        super().load_to_device_per_layer(
+            device_pool, host_indices, device_indices, layer_id, io_backend
+        )
+        self._load_indexer_to_device_per_layer(
+            device_pool, host_indices, device_indices, layer_id, io_backend
+        )
+
+    def backup_from_device_all_layer(
+        self, device_pool, host_indices, device_indices, io_backend
+    ):
+        super().backup_from_device_all_layer(
+            device_pool, host_indices, device_indices, io_backend
+        )
+        self._backup_indexer_from_device_all_layer(
+            device_pool, host_indices, device_indices, io_backend
+        )
diff --git a/sglang/python/sglang/srt/mem_cache/multimodal_cache.py b/sglang/python/sglang/srt/mem_cache/multimodal_cache.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac1cb93de4c75a832d0589582188eceae7fba652
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/multimodal_cache.py
@@ -0,0 +1,143 @@
+import abc
+from collections import OrderedDict
+from dataclasses import dataclass
+from typing import List, Optional
+
+import torch
+
+from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
+
+
+class MultimodalCache(abc.ABC):
+    @abc.abstractmethod
+    def __init__(
+        self,
+    ): ...
+
+    @staticmethod
+    def combine_hashes(mm_hashes: List[int]) -> Optional[int]:
+        """
+        Get a combined hash from individual mm item hashes
+        """
+        if not mm_hashes:
+            return None
+        return hash(tuple(mm_hashes))
+
+    @abc.abstractmethod
+    def get(
+        self, mm_hashes: List[int], combined_hash: Optional[int] = None
+    ) -> Optional[torch.Tensor]:
+        """
+        Extract the embedding with the hash-ids of the queried items. Try combined hash first, if missed, fallback to individual hashes
+        The returned tensor may not be contiguous
+        """
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def set(
+        self,
+        mm_hash: int,
+        embedding: torch.Tensor,
+        mm_embedding_allocator: BaseTokenToKVPoolAllocator,
+    ) -> bool:
+        """
+        Set the embedding to the pre-allocated locations with a hash id
+        """
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def has(self, mm_hash: int) -> bool:
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def free(
+        self, mm_hash: int, mm_embedding_allocator: BaseTokenToKVPoolAllocator
+    ) -> bool:
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def clear(self):
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def available_size(self):
+        raise NotImplementedError()
+
+
+def _get_tensor_size(embedding: torch.Tensor):
+    return embedding.element_size() * embedding.numel()
+
+
+@dataclass(kw_only=True)
+class EmbeddingResult:
+    embedding: torch.Tensor
+
+
+class MultiModalStaticCache(MultimodalCache):
+    """
+    A server-level cache for multimodal embedding.
+    Embeddings are computed prior, and this cache does not really pre-alloc
+    """
+
+    def __init__(
+        self,
+        max_size: int,
+    ):
+        super().__init__()
+        self.max_size = max_size
+        self.mm_cache: OrderedDict[int, EmbeddingResult] = OrderedDict()
+        self.current_size = 0
+
+    def get(
+        self, mm_hashes: List[int], combined_hash: Optional[int] = None
+    ) -> Optional[EmbeddingResult]:
+        combined_hash = self.combine_hashes(mm_hashes)
+        # MultiModalStaticCache does not fallback to individual item lookup
+
+        embedding = self.mm_cache.get(combined_hash)
+        if embedding is not None:
+            self.mm_cache.move_to_end(combined_hash)
+        return embedding
+
+    def set(
+        self,
+        mm_hash: int,
+        embedding: EmbeddingResult,
+        loc: Optional[torch.Tensor] = None,
+    ) -> bool:
+        assert isinstance(embedding, EmbeddingResult), embedding
+        if mm_hash in self.mm_cache:
+            self.mm_cache.move_to_end(mm_hash)
+            return True
+        data_size = _get_tensor_size(embedding.embedding)
+        while self.current_size + data_size > self.max_size:
+            if not self.mm_cache:
+                return False
+            lru_hash, lru_embedding = self.mm_cache.popitem(last=False)
+            self.current_size -= _get_tensor_size(lru_embedding.embedding)
+
+        self.mm_cache[mm_hash] = embedding
+        self.current_size += data_size
+        return True
+
+    def has(self, mm_hash: int) -> bool:
+        return mm_hash in self.mm_cache
+
+    def free(
+        self, mm_hash: int, mm_embedding_allocator: BaseTokenToKVPoolAllocator
+    ) -> bool:
+        if mm_hash not in self.mm_cache:
+            return False
+        old_embedding = self.mm_cache.pop(mm_hash)
+        self.current_size -= _get_tensor_size(old_embedding.embedding)
+        return True
+
+    def clear(self):
+        self.mm_cache.clear()
+        self.current_size = 0
+
+    def __len__(self):
+        return len(self.mm_cache)
+
+    def available_size(self):
+        return self.__len__()
diff --git a/sglang/python/sglang/srt/mem_cache/radix_cache.py b/sglang/python/sglang/srt/mem_cache/radix_cache.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9c32f54deb91442af417c6ee15d2272c5f539e7
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/radix_cache.py
@@ -0,0 +1,903 @@
+from __future__ import annotations
+
+from sglang.srt.mem_cache.cache_init_params import CacheInitParams
+from sglang.srt.mem_cache.utils import convert_to_bigram_key
+
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""
+The radix tree data structure for managing the KV cache.
+"""
+
+import heapq
+import logging
+import sys
+import time
+from collections import defaultdict
+from functools import lru_cache, partial
+from typing import TYPE_CHECKING, Any, Iterator, List, Optional, Tuple, Union
+
+import torch
+
+logger = logging.getLogger(__name__)
+
+from sglang.srt.disaggregation.kv_events import (
+    MEDIUM_GPU,
+    AllBlocksCleared,
+    BlockRemoved,
+    BlockStored,
+)
+from sglang.srt.mem_cache.base_prefix_cache import (
+    BasePrefixCache,
+    EvictParams,
+    EvictResult,
+    InsertParams,
+    InsertResult,
+    MatchPrefixParams,
+    MatchResult,
+)
+from sglang.srt.mem_cache.evict_policy import (
+    EvictionStrategy,
+    FIFOStrategy,
+    FILOStrategy,
+    LFUStrategy,
+    LRUStrategy,
+    MRUStrategy,
+    PriorityStrategy,
+)
+from sglang.srt.mem_cache.hicache_storage import get_hash_str, hash_str_to_int64
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.schedule_batch import Req
+
+
+class RadixKey:
+    def __init__(
+        self,
+        token_ids: List[int],
+        extra_key: Optional[str] = None,
+        is_bigram: bool = False,
+    ):
+        # token ids sequence
+        self.token_ids = token_ids
+        # extra key (e.g. lora_id, cache_salt)
+        self.extra_key = extra_key
+        # is bigram key
+        self.is_bigram = is_bigram
+
+    def __len__(self) -> int:
+        return len(self.token_ids)
+
+    def __iter__(self) -> Iterator[int]:
+        return iter(self.token_ids)
+
+    def __getitem__(self, idx: Union[int, slice]) -> "RadixKey":
+        if isinstance(idx, slice):
+            return RadixKey(self.token_ids[idx], self.extra_key)
+        return RadixKey([self.token_ids[idx]], self.extra_key)
+
+    def __repr__(self) -> str:
+        preview = self.token_ids[:10]
+        return f"RadixKey(extra_key={self.extra_key!r}, token_ids={preview}{'...' if len(self.token_ids) > 10 else ''})"
+
+
+def maybe_bigram_convert(
+    is_eagle: bool,
+    key: RadixKey,
+    value: Optional[torch.Tensor] = None,
+) -> Tuple[RadixKey, Optional[torch.Tensor]]:
+    if is_eagle and not key.is_bigram:
+        key.token_ids = convert_to_bigram_key(key.token_ids)
+        key.is_bigram = True
+        if value is not None:
+            value = value[: len(key)]
+    return key, value
+
+
+def page_align_keys(key: list, page_size) -> list:
+    if page_size == 1:
+        return key
+    page_aligned_len = len(key) // page_size * page_size
+    return key[:page_aligned_len]
+
+
+class TreeNode:
+
+    counter = 0
+
+    def __init__(self, id: Optional[int] = None, priority: int = 0):
+        self.children = defaultdict(TreeNode)
+        self.parent: TreeNode = None
+        self.key: RadixKey = None
+        self.value: Optional[torch.Tensor] = None
+        self.lock_ref = 0
+        self.pin_expiry: float = (
+            0.0  # absolute expiry time (time.monotonic()), 0 = not pinned
+        )
+        self.pin_ttl: int = 0  # original TTL in seconds, for refresh-on-hit
+        self.last_access_time = time.monotonic()
+        self.creation_time = time.monotonic()
+
+        self.hit_count = 0
+        # indicating the node is locked to protect from eviction
+        # incremented when the node is referenced by a storage operation
+        self.host_ref_counter = 0
+        # store the host indices of KV cache
+        self.host_value: Optional[torch.Tensor] = None
+        # store hash values of each pages
+        self.hash_value: Optional[List[str]] = None
+        # priority for priority-aware eviction
+        self.priority = priority
+
+        self.id = TreeNode.counter if id is None else id
+        TreeNode.counter += 1
+
+    @property
+    def evicted(self):
+        return self.value is None
+
+    @property
+    def backuped(self):
+        return self.host_value is not None
+
+    def protect_host(self):
+        """Protect the host value from eviction."""
+        self.host_ref_counter += 1
+
+    def release_host(self):
+        """Release the host value, allowing it to be evicted."""
+        if self.host_ref_counter > 0:
+            self.host_ref_counter -= 1
+        else:
+            raise RuntimeError("Host reference counter is already zero.")
+
+    def get_last_hash_value(self) -> Optional[str]:
+        """Returns the hash value of the last page in this node."""
+        if self.hash_value is None or len(self.hash_value) == 0:
+            return None
+        return self.hash_value[-1]
+
+    @lru_cache(maxsize=1)
+    def get_prefix_hash_values(self, node: TreeNode) -> List[str]:
+        if node is None or node.hash_value is None:
+            return []
+
+        return node.get_prefix_hash_values(node.parent) + node.hash_value
+
+    def __lt__(self, other: "TreeNode"):
+        return self.last_access_time < other.last_access_time
+
+
+def _check_extra_key(key0: RadixKey, key1: RadixKey):
+    if key0.extra_key != key1.extra_key:
+        raise ValueError(
+            f"_key_match should be run on the same extra key, but got key0.extra_key={key0.extra_key} != key1.extra_key={key1.extra_key}"
+        )
+
+
+def _key_match_page_size1(key0: RadixKey, key1: RadixKey):
+    _check_extra_key(key0, key1)
+    i = 0
+    for k0, k1 in zip(key0.token_ids, key1.token_ids):
+        if k0 != k1:
+            break
+        i += 1
+    return i
+
+
+def _key_match_paged(key0: RadixKey, key1: RadixKey, page_size: int):
+    _check_extra_key(key0, key1)
+    min_len = min(len(key0), len(key1))
+
+    i = 0
+    while i < min_len:
+        if key0.token_ids[i : i + page_size] != key1.token_ids[i : i + page_size]:
+            break
+        i += page_size
+
+    return i
+
+
+def get_child_key(key: RadixKey, page_size: int = 1):
+    if page_size == 1:
+        plain_key = key.token_ids[0]
+    else:
+        plain_key = tuple(key.token_ids[:page_size])
+    if key.extra_key is None:
+        return plain_key
+    else:
+        return (key.extra_key, plain_key)
+
+
+def compute_node_hash_values(node: "TreeNode", page_size: int) -> List[str]:
+    """Compute SHA256-based hash values for position-aware identification.
+
+    Args:
+        node: The TreeNode to compute hash values for
+        page_size: The page size for chunking tokens
+
+    Returns:
+        List of SHA256 hex strings, one per page
+    """
+    hash_values = []
+
+    # Get parent's last hash value if parent exists
+    parent_hash = None
+    if node.parent is not None and node.parent.hash_value is not None:
+        # Check if parent is root by checking if it has empty key
+        if len(node.parent.key) > 0 and len(node.parent.hash_value) > 0:
+            parent_hash = node.parent.hash_value[-1]
+
+    # Iterate through node's pages
+    for start in range(0, len(node.key), page_size):
+        page_tokens = node.key.token_ids[start : start + page_size]
+        if not page_tokens:
+            continue
+
+        # Use SHA256-based chaining via get_hash_str
+        hash_val = get_hash_str(page_tokens, prior_hash=parent_hash)
+        hash_values.append(hash_val)
+        parent_hash = hash_val
+
+    return hash_values
+
+
+def split_node_hash_value(
+    child_hash_value: Optional[List[str]], split_len: int, page_size: int
+) -> tuple[Optional[List[str]], Optional[List[str]]]:
+    """Split hash_value between parent and child nodes during node splitting.
+
+    Args:
+        child_hash_value: The hash_value list from the child node being split
+        split_len: The length at which to split (in tokens)
+        page_size: The page size for calculating number of pages
+
+    Returns:
+        Tuple of (new_node_hash_value, updated_child_hash_value)
+    """
+    if child_hash_value is None:
+        return None, None
+
+    if page_size == 1:
+        split_pages = split_len
+    else:
+        split_pages = split_len // page_size
+
+    new_node_hash = child_hash_value[:split_pages]
+    child_hash = child_hash_value[split_pages:]
+
+    return new_node_hash, child_hash
+
+
+class RadixCache(BasePrefixCache):
+    def __init__(self, params: CacheInitParams):
+        self.disable = params.disable
+        self.req_to_token_pool = params.req_to_token_pool
+        self.token_to_kv_pool_allocator = params.token_to_kv_pool_allocator
+        self.page_size = params.page_size
+        self.enable_kv_cache_events = params.enable_kv_cache_events
+        self.is_eagle = params.is_eagle
+        self.disable_finished_insert = params.disable_finished_insert
+        self.eviction_policy = params.eviction_policy.lower()
+
+        self.kv_event_queue = []
+
+        if params.enable_metrics:
+            self.init_metrics_collector()
+
+        if self.token_to_kv_pool_allocator:
+            self.device = self.token_to_kv_pool_allocator.device
+        else:
+            self.device = torch.device("cpu")
+
+        if self.page_size == 1:
+            self.key_match_fn = _key_match_page_size1
+            self.get_child_key_fn = get_child_key
+        else:
+            self.key_match_fn = partial(_key_match_paged, page_size=self.page_size)
+            self.get_child_key_fn = partial(get_child_key, page_size=self.page_size)
+
+        if self.eviction_policy == "lru":
+            self.eviction_strategy: EvictionStrategy = LRUStrategy()
+        elif self.eviction_policy == "lfu":
+            self.eviction_strategy: EvictionStrategy = LFUStrategy()
+        elif self.eviction_policy == "fifo":
+            self.eviction_strategy: EvictionStrategy = FIFOStrategy()
+        elif self.eviction_policy == "mru":
+            self.eviction_strategy: EvictionStrategy = MRUStrategy()
+        elif self.eviction_policy == "filo":
+            self.eviction_strategy: EvictionStrategy = FILOStrategy()
+        elif self.eviction_policy == "priority":
+            self.eviction_strategy: EvictionStrategy = PriorityStrategy()
+        else:
+            raise ValueError(
+                f"Unknown eviction policy: {self.eviction_policy}. Supported policies: 'lru', 'lfu', 'fifo', 'mru', 'filo', 'priority'."
+            )
+
+        self.evictable_leaves = set()
+        self.reset()
+
+    @classmethod
+    def create_simulated(
+        self,
+        disable: bool = False,
+        mock_allocator: Optional[Any] = None,
+        page_size: int = 1,
+        enable_kv_cache_events: bool = False,
+    ) -> RadixCache:
+        """Init a radix cache without memory pools for simulation purpose."""
+        params = CacheInitParams(
+            disable=disable,
+            req_to_token_pool=None,
+            token_to_kv_pool_allocator=mock_allocator,
+            page_size=page_size,
+            enable_kv_cache_events=enable_kv_cache_events,
+        )
+        return RadixCache(params)
+
+    ##### Public API #####
+
+    def reset(self):
+        # Initialize root with minimum priority so any real priority overrides it
+        self.root_node = TreeNode(priority=-sys.maxsize)
+        self.root_node.key = RadixKey(token_ids=[], extra_key=None)
+        self.root_node.value = []
+        self.root_node.host_value = []
+        self.root_node.lock_ref = 1
+        self.root_node.hash_value = []
+        self.evictable_size_ = 0
+        self.protected_size_ = 0
+        self.evictable_leaves.clear()
+        self._record_all_cleared_event()
+
+    def maybe_bigram_convert(
+        self, key: RadixKey, value: Optional[torch.Tensor] = None
+    ) -> Tuple[RadixKey, Optional[torch.Tensor]]:
+        return maybe_bigram_convert(self.is_eagle, key, value)
+
+    def match_prefix(self, params: MatchPrefixParams) -> MatchResult:
+        """Find the longest cached prefix of ``key`` in the radix tree.
+
+        The logical namespace for prefix matching is determined by both the
+        token id sequence and the optional ``extra_key`` carried by ``RadixKey``.
+        Entries that share identical leading token ids but have *different*
+        ``extra_key`` values are intentionally kept disjoint and never share
+        prefix nodes. This is useful to:
+
+        * Isolate KV cache lines for different LoRA / adapter IDs.
+        * Separate requests that intentionally should not share state (e.g.,
+          different sampling salt, cache version, or retrieval augmentation
+          context) by supplying a distinct ``extra_key``.
+
+        Args:
+            params (MatchPrefixParams): Parameters containing the lookup key
+                with a list of token ids and an optional ``extra_key`` namespace tag.
+                If ``page_size > 1`` the length is internally truncated to a multiple
+                of ``page_size`` before matching. Passing an empty key returns an
+                empty result with the root as the last node.
+
+        Returns:
+            MatchResult: ``device_indices`` is a 1-D ``torch.int64`` tensor of
+            the concatenated KV cache indices corresponding to the longest
+            cached prefix (may be length 0). ``last_device_node`` and
+            ``last_host_node`` (currently the same) are the tree node objects
+            representing the terminal node of the matched prefix. This method
+            may mutate internal structure by splitting an existing node if the
+            match ends inside a stored segment.
+
+        Internal updates:
+            * Refreshes access metadata (timestamps) used by the
+                configured eviction strategy.
+            * If the lookup ends inside a stored segment the node is split once
+                to expose a precise boundary; this structural refinement improves
+                subsequent match efficiency and does not duplicate data.
+        """
+        key = params.key
+        key, _ = self.maybe_bigram_convert(key)
+
+        def empty_match_result():
+            return MatchResult(
+                device_indices=torch.empty(
+                    (0,),
+                    dtype=torch.int64,
+                    device=self.device,
+                ),
+                last_device_node=self.root_node,
+                last_host_node=self.root_node,
+            )
+
+        if self.disable or len(key) == 0:
+            return empty_match_result()
+
+        if self.page_size != 1:
+            page_aligned_len = len(key) // self.page_size * self.page_size
+            key = key[:page_aligned_len]
+
+        if len(key) == 0:
+            return empty_match_result()
+
+        value, last_node = self._match_prefix_helper(self.root_node, key)
+        if value:
+            value = torch.cat(value)
+        else:
+            value = torch.empty((0,), dtype=torch.int64, device=self.device)
+        return MatchResult(
+            device_indices=value,
+            last_device_node=last_node,
+            last_host_node=last_node,
+        )
+
+    def insert(self, params: InsertParams) -> InsertResult:
+        if self.disable:
+            return InsertResult(prefix_len=0)
+
+        key = params.key
+        value = params.value
+        priority = params.priority
+
+        if value is None:
+            value = torch.tensor(key.token_ids, dtype=torch.int64)
+
+        key, value = self.maybe_bigram_convert(key, value)
+
+        prefix_len = self._insert_helper(self.root_node, key, value, priority)
+        return InsertResult(prefix_len=prefix_len)
+
+    def cache_finished_req(self, req: Req, is_insert: bool = True):
+        """Cache request when it finishes."""
+        # In deterministic mode, disable finished request insertion to radix cache
+        if self.disable_finished_insert:
+            is_insert = False
+
+        kv_committed_len = req.pop_committed_kv_cache()
+        if self.disable:
+            kv_indices = self.req_to_token_pool.req_to_token[
+                req.req_pool_idx, :kv_committed_len
+            ]
+            self.token_to_kv_pool_allocator.free(kv_indices)
+            return
+
+        token_ids = (req.origin_input_ids + req.output_ids)[:kv_committed_len]
+        kv_indices = self.req_to_token_pool.req_to_token[
+            req.req_pool_idx, : len(token_ids)
+        ]
+
+        # Maybe convert to bigram keys for EAGLE
+        keys = convert_to_bigram_key(token_ids) if self.is_eagle else token_ids
+        keys = page_align_keys(keys, self.page_size)
+        values = kv_indices[: len(keys)].to(dtype=torch.int64, copy=True)
+        radix_key = RadixKey(keys, req.extra_key, is_bigram=self.is_eagle)
+
+        # Radix Cache takes one ref in memory pool
+        if is_insert:
+            priority = getattr(req, "priority", 0) or 0
+            result = self.insert(
+                InsertParams(key=radix_key, value=values, priority=priority)
+            )
+            new_prefix_len = result.prefix_len
+            # Free the duplicates that were already in the tree
+            self.token_to_kv_pool_allocator.free(
+                kv_indices[req.cache_protected_len : new_prefix_len]
+            )
+        else:
+            self.token_to_kv_pool_allocator.free(
+                kv_indices[req.cache_protected_len : len(keys)]
+            )
+
+        # free the unaligned tail
+        self.token_to_kv_pool_allocator.free(kv_indices[len(keys) :])
+
+        # Remove req slot release the cache lock
+        self.dec_lock_ref(req.last_node)
+
+    def cache_unfinished_req(self, req: Req, chunked=False):
+        """Cache request when it is unfinished."""
+        if self.disable:
+            return
+
+        token_ids = req.fill_ids
+        kv_indices = self.req_to_token_pool.req_to_token[
+            req.req_pool_idx, : len(token_ids)
+        ]
+
+        # Maybe convert to bigram keys for EAGLE
+        keys = convert_to_bigram_key(token_ids) if self.is_eagle else token_ids
+        keys = page_align_keys(keys, self.page_size)
+        values = kv_indices[: len(keys)].to(dtype=torch.int64, copy=True)
+        radix_key = RadixKey(keys, req.extra_key, is_bigram=self.is_eagle)
+
+        # Radix Cache takes one ref in memory pool
+        result = self.insert(
+            InsertParams(
+                key=radix_key,
+                value=values,
+                chunked=chunked,
+                priority=getattr(req, "priority", 0) or 0,
+            )
+        )
+        new_prefix_len = result.prefix_len
+
+        self.token_to_kv_pool_allocator.free(
+            kv_indices[req.cache_protected_len : new_prefix_len]
+        )
+
+        # The prefix indices could be updated, reuse it
+        match_result = self.match_prefix(MatchPrefixParams(key=radix_key))
+        new_indices, new_last_node = (
+            match_result.device_indices,
+            match_result.last_device_node,
+        )
+        assert len(new_indices) == len(keys), f"{len(new_indices)=}, {len(keys)=}"
+
+        self.req_to_token_pool.write(
+            (req.req_pool_idx, slice(req.cache_protected_len, len(new_indices))),
+            new_indices[req.cache_protected_len :],
+        )
+
+        # The cache_protected_len is not always equal to len(req.prefix_indices)
+        # since for page_size > 1, the partial part is added to req.prefix_indices, but that part of kv indices is not added to the tree.
+        # It should be freed in the next cache_unfinished_req and final cache_finished_req to avoid memory leak.
+        # So we introduce this `cache_protected_len` field to make sure the partial part can be freed correctly.
+        req.cache_protected_len = len(new_indices)
+
+        self.dec_lock_ref(req.last_node)
+        self.inc_lock_ref(new_last_node)
+
+        # `req.prefix_indices` will be used in `PrefillAdder::add_chunked_req` later
+        # - page_size != 1: there is a partial page at the end, keep the full kv_indices
+        # - eagle case: bigram keys will only cache len - 1 kv indices
+        if len(new_indices) < len(kv_indices):
+            req.prefix_indices = torch.cat(
+                [new_indices, kv_indices[len(new_indices) :]]
+            )
+        else:
+            req.prefix_indices = new_indices
+
+        req.last_node = new_last_node
+
+    def pretty_print(self):
+        self._print_helper(self.root_node, 0)
+        print(f"#tokens: {self.total_size()}")
+
+    def total_size(self):
+        return self._total_size_helper()
+
+    def evict(self, params: EvictParams) -> EvictResult:
+        if self.disable:
+            return EvictResult()
+
+        start_time = time.perf_counter()
+        num_tokens = params.num_tokens
+        leaves = list(self.evictable_leaves)
+        eviction_heap = [
+            (self.eviction_strategy.get_priority(node), node) for node in leaves
+        ]
+        heapq.heapify(eviction_heap)
+
+        num_evicted = 0
+        while num_evicted < num_tokens and len(eviction_heap):
+            _priority, x = heapq.heappop(eviction_heap)
+
+            self.token_to_kv_pool_allocator.free(x.value)
+            num_evicted += len(x.value)
+            self._delete_leaf(x)
+
+            if len(x.parent.children) == 0 and x.parent.lock_ref == 0:
+                new_priority = self.eviction_strategy.get_priority(x.parent)
+                heapq.heappush(eviction_heap, (new_priority, x.parent))
+
+            self._record_remove_event(x)
+
+        self.update_eviction_metrics(num_evicted, start_time)
+        return EvictResult(num_tokens_evicted=num_evicted)
+
+    def inc_lock_ref(self, node: TreeNode):
+        if self.disable:
+            return 0
+
+        delta = 0
+        while node != self.root_node:
+            if node.lock_ref == 0:
+                self.evictable_size_ -= len(node.key)
+                self.protected_size_ += len(node.key)
+                delta -= len(node.key)
+            node.lock_ref += 1
+            self._update_leaf_status(node)
+            node = node.parent
+        return delta
+
+    def dec_lock_ref(self, node: TreeNode):
+        if self.disable:
+            return 0
+
+        delta = 0
+        while node != self.root_node:
+            if node.lock_ref == 1:
+                self.evictable_size_ += len(node.key)
+                self.protected_size_ -= len(node.key)
+                delta += len(node.key)
+            node.lock_ref -= 1
+            self._update_leaf_status(node)
+            if node.parent is None:
+                assert (
+                    node is self.root_node
+                ), f"This request holds the node from another tree"
+            node = node.parent
+        return delta
+
+    def evictable_size(self):
+        return self.evictable_size_
+
+    def protected_size(self):
+        # protected size refers to the size of the cache that is locked
+        return self.protected_size_
+
+    def all_values_flatten(self):
+        values = []
+
+        def _dfs_helper(node: TreeNode):
+            for _, child in node.children.items():
+                values.append(child.value)
+                _dfs_helper(child)
+
+        _dfs_helper(self.root_node)
+        return torch.cat(values)
+
+    ##### Internal Helper Functions #####
+
+    def _match_prefix_helper(self, node: TreeNode, key: RadixKey):
+        access_time = time.monotonic()
+        node.last_access_time = access_time
+
+        child_key = self.get_child_key_fn(key)
+
+        value = []
+        while len(key) > 0 and child_key in node.children.keys():
+            child = node.children[child_key]
+            child.last_access_time = access_time
+            prefix_len = self.key_match_fn(child.key, key)
+            if prefix_len < len(child.key):
+                new_node = self._split_node(child.key, child, prefix_len)
+                value.append(new_node.value)
+                node = new_node
+                break
+            else:
+                value.append(child.value)
+                node = child
+                key = key[prefix_len:]
+
+                if len(key):
+                    child_key = self.get_child_key_fn(key)
+
+        return value, node
+
+    def _split_node(self, key: RadixKey, child: TreeNode, split_len: int):
+        # new_node -> child
+        # New node inherits child's priority (represents shared prefix)
+        new_node = TreeNode(priority=child.priority)
+        new_node.children = {self.get_child_key_fn(key[split_len:]): child}
+        new_node.parent = child.parent
+        new_node.lock_ref = child.lock_ref
+        new_node.key = child.key[:split_len]
+        new_node.value = child.value[:split_len].clone()
+        child.parent = new_node
+        child.key = child.key[split_len:]
+        child.value = child.value[split_len:].clone()
+        new_node.parent.children[self.get_child_key_fn(key)] = new_node
+
+        # Split hash_value if it was already computed, otherwise leave as None
+        new_node.hash_value, child.hash_value = split_node_hash_value(
+            child.hash_value, split_len, self.page_size
+        )
+
+        return new_node
+
+    def _insert_helper(self, node: TreeNode, key: RadixKey, value, priority: int = 0):
+        # Convert None priority to 0
+        if priority is None:
+            priority = 0
+        access_time = time.monotonic()
+        node.last_access_time = access_time
+        # Update priority along the path (take max to propagate higher priority)
+        node.priority = max(node.priority, priority)
+        if len(key) == 0:
+            return 0
+
+        child_key = self.get_child_key_fn(key)
+
+        total_prefix_length = 0
+        while len(key) > 0 and child_key in node.children.keys():
+            node = node.children[child_key]
+            node.last_access_time = access_time
+            prefix_len = self.key_match_fn(node.key, key)
+            total_prefix_length += prefix_len
+            key = key[prefix_len:]
+            value = value[prefix_len:]
+
+            if prefix_len < len(node.key):
+                new_node = self._split_node(node.key, node, prefix_len)
+                new_node.priority = max(new_node.priority, priority)
+                node = new_node
+            else:
+                node.priority = max(node.priority, priority)
+
+            if len(key):
+                child_key = self.get_child_key_fn(key)
+
+        if len(key):
+            new_node = TreeNode(priority=priority)
+            new_node.parent = node
+            new_node.key = key
+            new_node.value = value.clone()
+            node.children[child_key] = new_node
+            self.evictable_size_ += len(key)
+            self._update_leaf_status(node)
+            self._update_leaf_status(new_node)
+            # Hash will be computed lazily during event emission
+            self._record_store_event(new_node)
+        return total_prefix_length
+
+    def _print_helper(self, node: TreeNode, indent: int):
+        """Prints the radix tree in a human-readable format."""
+        stack = [(node, indent)]
+        while stack:
+            current_node, current_indent = stack.pop()
+            print(
+                " " * current_indent,
+                len(current_node.key),
+                current_node.key.token_ids[:10],
+                f"r={current_node.lock_ref}",
+            )
+            for key, child in current_node.children.items():
+                stack.append((child, current_indent + 2))
+
+                assert key == self.get_child_key_fn(
+                    child.key
+                ), f"{key=}, {self.get_child_key_fn(child.key)=}"
+
+    def _delete_leaf(self, node):
+        key = self.get_child_key_fn(node.key)
+        v = node.parent.children.pop(key, None)
+        assert v == node, f"parent does not have child key, {key}"
+
+        self.evictable_size_ -= len(node.key)
+        if node in self.evictable_leaves:
+            self.evictable_leaves.remove(node)
+        self._update_leaf_status(node.parent)
+
+    def _update_leaf_status(self, node: TreeNode):
+        if node.evicted or node.lock_ref > 0:
+            if node in self.evictable_leaves:
+                self.evictable_leaves.remove(node)
+            return
+
+        for child in node.children.values():
+            if not child.evicted:
+                if node in self.evictable_leaves:
+                    self.evictable_leaves.remove(node)
+                return
+
+        if node not in self.evictable_leaves:
+            self.evictable_leaves.add(node)
+
+    def _total_size_helper(self):
+        total_size = 0
+        stack = [self.root_node]
+        while stack:
+            current_node = stack.pop()
+            total_size += len(current_node.value)
+            for child in current_node.children.values():
+                if child.evicted:
+                    continue
+                stack.append(child)
+        return total_size
+
+    def _record_store_event(self, node: TreeNode):
+        # One BlockStored per ``page_size`` chunk.
+        if self.enable_kv_cache_events:
+            # Compute hash_value lazily if not already set
+            if node.hash_value is None:
+                node.hash_value = compute_node_hash_values(node, self.page_size)
+
+            # Get parent's last hash value for first page
+            parent_block_hash = None
+            if node.parent is not None and node.parent != self.root_node:
+                if (
+                    node.parent.hash_value is not None
+                    and len(node.parent.hash_value) > 0
+                ):
+                    parent_block_hash = hash_str_to_int64(node.parent.hash_value[-1])
+
+            page_index = 0
+            for start in range(0, len(node.key), self.page_size):
+                page_tokens = node.key.token_ids[start : start + self.page_size]
+                if not page_tokens:
+                    continue
+
+                block_hash = hash_str_to_int64(node.hash_value[page_index])
+
+                self.kv_event_queue.append(
+                    BlockStored(
+                        block_hashes=[block_hash],
+                        parent_block_hash=parent_block_hash,
+                        token_ids=page_tokens,
+                        block_size=len(page_tokens),
+                        lora_id=None,
+                        medium=MEDIUM_GPU,
+                    )
+                )
+
+                parent_block_hash = block_hash
+                page_index += 1
+
+    def _record_remove_event(self, node: TreeNode):
+        # One BlockRemoved per chunk.
+        if self.enable_kv_cache_events:
+            # Compute hash_value lazily if not already set (must match what was stored)
+            if node.hash_value is None:
+                node.hash_value = compute_node_hash_values(node, self.page_size)
+
+            page_index = 0
+            for start in range(0, len(node.key), self.page_size):
+                page_tokens = node.key.token_ids[start : start + self.page_size]
+                if not page_tokens:
+                    continue
+
+                block_hash = hash_str_to_int64(node.hash_value[page_index])
+
+                self.kv_event_queue.append(
+                    BlockRemoved(block_hashes=[block_hash], medium=MEDIUM_GPU)
+                )
+
+                page_index += 1
+
+    def _record_all_cleared_event(self):
+        if self.enable_kv_cache_events:
+            self.kv_event_queue.append(AllBlocksCleared())
+
+    def take_events(self):
+        """Atomically takes all events and clears the queue.
+
+        Returns:
+            A list of KV cache events.
+        """
+        if not self.enable_kv_cache_events:
+            return []
+        events = self.kv_event_queue
+        self.kv_event_queue = []
+        return events
+
+
+if __name__ == "__main__":
+    tree = RadixCache.create_simulated()
+
+    # Example token id sequences (as lists of ints)
+    tree.insert(InsertParams(key=RadixKey(token_ids=[1, 2, 3], extra_key=None)))
+    tree.insert(InsertParams(key=RadixKey(token_ids=[1, 2, 3], extra_key=None)))
+    tree.insert(InsertParams(key=RadixKey(token_ids=[1, 2, 4, 5], extra_key=None)))
+    tree.insert(
+        InsertParams(key=RadixKey(token_ids=[1, 2, 4, 5, 6, 7], extra_key=None))
+    )
+    tree.insert(
+        InsertParams(key=RadixKey(token_ids=[8, 9, 10, 11, 12], extra_key=None))
+    )
+    tree.pretty_print()
+
+    print(
+        tree.match_prefix(
+            MatchPrefixParams(key=RadixKey(token_ids=[1, 2, 3, 13, 14], extra_key=None))
+        )
+    )
diff --git a/sglang/python/sglang/srt/mem_cache/radix_cache_cpp.py b/sglang/python/sglang/srt/mem_cache/radix_cache_cpp.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ed293d1b39daf23e2e97b73cd834f5fbfa0a8ab
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/radix_cache_cpp.py
@@ -0,0 +1,253 @@
+from __future__ import annotations
+
+import logging
+import time
+from typing import TYPE_CHECKING, List, Set
+
+import torch
+
+from sglang.srt.mem_cache.base_prefix_cache import (
+    BasePrefixCache,
+    EvictParams,
+    EvictResult,
+    MatchPrefixParams,
+    MatchResult,
+)
+from sglang.srt.mem_cache.cpp_radix_tree.radix_tree import (
+    IOHandle,
+    RadixTreeCpp,
+    TreeNodeCpp,
+)
+from sglang.srt.mem_cache.radix_cache import RadixKey
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.schedule_batch import Req
+    from sglang.srt.mem_cache.cache_init_params import CacheInitParams
+    from sglang.srt.server_args import ServerArgs
+
+
+logger = logging.getLogger(__name__)
+
+
+class RadixCacheCpp(BasePrefixCache):
+    def __init__(
+        self,
+        params: CacheInitParams,
+        server_args: ServerArgs,
+        enable_write_cancel: bool = False,
+    ):
+        self.disable = params.disable
+        self.enable_write_cancel = enable_write_cancel
+
+        assert (
+            params.enable_kv_cache_events is False
+        ), "HiRadixCache does not support kv cache events yet"
+
+        # record the nodes with ongoing write through
+        self.ongoing_write_through: Set[IOHandle] = set()
+        # record the node segments with ongoing load back
+        self.ongoing_load_back: Set[IOHandle] = set()
+        # todo: dynamically adjust the threshold
+        self.write_through_threshold = (
+            1 if server_args.hicache_write_policy == "write_through" else 2
+        )
+        self.token_to_kv_pool_allocator = params.token_to_kv_pool_allocator
+        self.device = self.token_to_kv_pool_allocator.device
+        self.req_to_token_pool = params.req_to_token_pool
+        self.page_size = params.page_size
+        self.kv_cache = self.token_to_kv_pool_allocator.get_kvcache()
+
+        self.tp_group = params.tp_cache_group
+
+        if params.enable_metrics:
+            self.init_metrics_collector()
+
+        if not server_args.enable_hierarchical_cache:
+            self.tree = RadixTreeCpp(
+                disabled=self.disable,
+                page_size=self.page_size,
+                host_size=None,  # no host cache, this should be removed in the future
+                write_through_threshold=self.write_through_threshold,
+            )
+            self.cache_controller = None
+            return  # early return if hicache is not used
+
+        raise NotImplementedError("Host cache is not supported yet")
+
+    def _merge_tensor(self, l: List[torch.Tensor]) -> torch.Tensor:
+        """
+        Merge a list of tensors into a single tensor.
+        Args:
+            l (List[torch.Tensor]): List of tensors to merge.
+        Returns:
+            torch.Tensor: Merged tensor.
+        """
+        if len(l) == 0:
+            return torch.empty(0, dtype=torch.int64, device=self.device)
+        elif len(l) == 1:
+            return l[0]
+        else:
+            return torch.cat(l)
+
+    def reset(self):
+        if self.cache_controller is not None:
+            # need to clear the acks before resetting the cache controller
+            raise NotImplementedError("Host cache is not supported yet")
+        self.tree.reset()
+
+    def match_prefix(self, params: MatchPrefixParams) -> MatchResult:
+        key = params.key
+        device_indices_vec, host_indices_length, node_gpu, node_cpu = (
+            self.tree.match_prefix(key.token_ids)
+        )
+        return MatchResult(
+            device_indices=self._merge_tensor(device_indices_vec),
+            last_device_node=node_gpu,
+            last_host_node=node_cpu,
+            host_hit_length=host_indices_length,
+        )
+
+    def _insert(self, key: RadixKey, value: torch.Tensor) -> int:
+        """
+        Insert a key-value pair into the radix tree.
+        Args:
+            key (RadixKey): The key to insert, represented as a RadixKey.
+            value (torch.Tensor): The value to associate with the key.
+        Returns:
+            int: Number of device indices that were already present in the tree before the insertion.
+        """
+        ongoing_write, length = self.tree.writing_through(key.token_ids, value)
+        if self.cache_controller is None:
+            assert len(ongoing_write) == 0, "Implementation error"
+            return length
+
+        raise NotImplementedError("Host cache is not supported yet")
+
+    def dec_lock_ref(self, node: TreeNodeCpp):
+        """
+        Decrement the reference count of a node to root of the radix tree.
+        Args:
+            node (TreeNodeCpp): The handle of the node to decrement the reference count for.
+        """
+        self.tree.lock_ref(node, False)  # do not increment
+
+    def inc_lock_ref(self, node: TreeNodeCpp):
+        """
+        Increment the reference count of from a node to root of the radix tree.
+        Args:
+            node (TreeNodeCpp): The handle of the node to increment the reference count for.
+        """
+        self.tree.lock_ref(node, True)
+
+    def evict(self, params: EvictParams) -> EvictResult:
+        start_time = time.perf_counter()
+        num_tokens = params.num_tokens
+        evicted_device_indices = self.tree.evict(num_tokens)
+
+        num_evicted = 0
+        for indice in evicted_device_indices:
+            num_evicted += len(indice)
+            self.token_to_kv_pool_allocator.free(indice)
+
+        self.update_eviction_metrics(num_evicted, start_time)
+        return EvictResult(num_tokens_evicted=num_evicted)
+
+    def evictable_size(self):
+        return self.tree.evictable_size()
+
+    def protected_size(self):
+        return self.tree.protected_size()
+
+    def total_size(self):
+        return self.tree.total_size()
+
+    def cache_finished_req(self, req: Req, is_insert: bool = True):
+        """Cache request when it finishes."""
+        assert req.req_pool_idx is not None
+        kv_committed_len = req.pop_committed_kv_cache()
+        token_ids = (req.origin_input_ids + req.output_ids)[:kv_committed_len]
+        kv_indices = self.req_to_token_pool.req_to_token[
+            req.req_pool_idx, :kv_committed_len
+        ].to(dtype=torch.int64, copy=True)
+
+        # NOTE: our C++ implementation don't need `token_ids` and `kv_indices` to be page-aligned
+        # it will automatically align them, but length of them should be equal
+        old_prefix_len = len(req.prefix_indices) // self.page_size * self.page_size
+        page_aligned_overall_len = kv_committed_len // self.page_size * self.page_size
+
+        if is_insert:
+            new_prefix_len = self._insert(
+                RadixKey(token_ids, req.extra_key), kv_indices
+            )
+            # NOTE: kv_indices[:old_prefix_len] == req.prefix_indices
+            assert old_prefix_len <= new_prefix_len, "Wrong prefix indices"
+            # Free duplicates that were already in the pool
+            if old_prefix_len < new_prefix_len:
+                self.token_to_kv_pool_allocator.free(
+                    kv_indices[old_prefix_len:new_prefix_len]
+                )
+        else:
+            self.token_to_kv_pool_allocator.free(
+                kv_indices[old_prefix_len:page_aligned_overall_len]
+            )
+
+        # need to free the unaligned part, since it cannot be inserted into the radix tree
+        if page_aligned_overall_len < kv_committed_len:
+            # NOTE: sglang PagedAllocator support unaligned free (which will automatically align it)
+            self.token_to_kv_pool_allocator.free(kv_indices[page_aligned_overall_len:])
+
+        # Remove req slot release the cache lock
+        self.dec_lock_ref(req.last_node)
+
+    def cache_unfinished_req(self, req: Req, chunked=False):
+        """Cache request when it is unfinished."""
+        assert req.req_pool_idx is not None
+        token_ids = req.fill_ids
+        prefill_len = len(token_ids)  # prefill only (maybe chunked)
+        kv_indices = self.req_to_token_pool.req_to_token[
+            req.req_pool_idx, :prefill_len
+        ].to(dtype=torch.int64, copy=True)
+
+        # NOTE: our C++ implementation don't need `token_ids` and `kv_indices` to be page-aligned
+        # it will automatically align them, but length of them should be equal
+        old_prefix_len = len(req.prefix_indices) // self.page_size * self.page_size
+        new_prefix_len = self._insert(RadixKey(token_ids, req.extra_key), kv_indices)
+
+        # NOTE: kv_indices[:old_prefix_len] == req.prefix_indices
+        assert old_prefix_len <= new_prefix_len, "Wrong prefix indices"
+
+        # TODO(dark): optimize the `insert` and `match` (e.g. merge into 1 function)
+        # The prefix indices need to updated to reuse the kv indices in the pool
+        new_indices_vec, _, new_last_node, _ = self.tree.match_prefix(
+            RadixKey(token_ids, req.extra_key).token_ids
+        )
+        new_indices = self._merge_tensor(new_indices_vec)
+        assert new_prefix_len <= len(new_indices)
+
+        # KVCache between old & new is newly generated, but already exists in the pool
+        # we need to free this newly generated kv indices and reuse the indices in the pool
+        if old_prefix_len < new_prefix_len:
+            self.token_to_kv_pool_allocator.free(
+                kv_indices[old_prefix_len:new_prefix_len]
+            )
+            reused_indices = new_indices[old_prefix_len:new_prefix_len]
+            self.req_to_token_pool.req_to_token[
+                req.req_pool_idx, old_prefix_len:new_prefix_len
+            ] = reused_indices
+
+        if req.last_node != new_last_node:
+            self.dec_lock_ref(req.last_node)
+            self.inc_lock_ref(new_last_node)
+
+        # NOTE: there might be unaligned tail, so we may need to append it
+        assert len(new_indices) <= prefill_len < len(new_indices) + self.page_size
+        if self.page_size != 1 and len(new_indices) < prefill_len:
+            req.prefix_indices = torch.cat(
+                [new_indices, kv_indices[len(new_indices) :]]
+            )
+        else:
+            req.prefix_indices = new_indices
+        req.last_node = new_last_node
+
+    def pretty_print(self):
+        return self.tree.debug_print()
diff --git a/sglang/python/sglang/srt/mem_cache/session_aware_cache.py b/sglang/python/sglang/srt/mem_cache/session_aware_cache.py
new file mode 100644
index 0000000000000000000000000000000000000000..512e1347b9a09ef48526b087e07968fcb544689f
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/session_aware_cache.py
@@ -0,0 +1,311 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any, Dict, Optional
+
+import torch
+
+from sglang.srt.mem_cache.base_prefix_cache import (
+    BasePrefixCache,
+    EvictParams,
+    EvictResult,
+    MatchPrefixParams,
+    MatchResult,
+)
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.schedule_batch import Req
+
+
+class _VirtualNode:
+    """Sentinel node for streaming session requests.
+
+    Passed to inc_lock_ref / dec_lock_ref so the wrapper can distinguish
+    streaming-session locks (no-op) from real radix-tree locks (forwarded).
+    """
+
+    pass
+
+
+@dataclass
+class SessionSlot:
+    """Holds KV state between streaming session turns."""
+
+    virtual_node: _VirtualNode = field(default_factory=_VirtualNode)
+
+    # KV pool state (None means no KV is currently held by this slot)
+    req_pool_idx: Optional[int] = None
+    kv_committed_len: int = 0
+    kv_allocated_len: int = 0
+
+    # First req's radix tree node (for dec_lock_ref on session close)
+    last_node: Any = None
+    cache_protected_len: int = 0
+    swa_uuid_for_lock: Optional[str] = None
+
+    # SWA state
+    swa_evicted_seqlen: int = 0
+
+    # Mamba states
+    mamba_pool_idx: Any = None
+    mamba_ping_pong_track_buffer: Any = None
+    mamba_next_track_idx: Any = None
+    mamba_last_track_seqlen: Any = None
+    mamba_branching_seqlen: Any = None
+
+    def save_from_req(self, req: Req, is_first: bool):
+        """Save KV state from a finishing request into this slot."""
+        self.req_pool_idx = req.req_pool_idx
+        self.kv_committed_len = req.kv_committed_len
+        self.kv_allocated_len = req.kv_allocated_len
+        self.swa_evicted_seqlen = req.swa_evicted_seqlen
+
+        if is_first:
+            self.last_node = req.last_node
+            self.cache_protected_len = req.cache_protected_len
+            self.swa_uuid_for_lock = req.swa_uuid_for_lock
+
+        self.mamba_pool_idx = req.mamba_pool_idx
+        self.mamba_ping_pong_track_buffer = req.mamba_ping_pong_track_buffer
+        self.mamba_next_track_idx = req.mamba_next_track_idx
+        self.mamba_last_track_seqlen = req.mamba_last_track_seqlen
+        self.mamba_branching_seqlen = req.mamba_branching_seqlen
+
+        req.req_pool_idx = None
+        req.mamba_pool_idx = None
+
+    def restore_to_req(self, req: Req):
+        """Restore KV state from this slot into an incoming request."""
+        req.req_pool_idx = self.req_pool_idx
+        req.kv_committed_len = self.kv_committed_len
+        req.kv_allocated_len = self.kv_allocated_len
+        req.swa_evicted_seqlen = self.swa_evicted_seqlen
+        req.swa_uuid_for_lock = self.swa_uuid_for_lock
+
+        req.mamba_pool_idx = self.mamba_pool_idx
+        req.mamba_ping_pong_track_buffer = self.mamba_ping_pong_track_buffer
+        req.mamba_next_track_idx = self.mamba_next_track_idx
+        req.mamba_last_track_seqlen = self.mamba_last_track_seqlen
+        req.mamba_branching_seqlen = self.mamba_branching_seqlen
+
+        self.req_pool_idx = None
+        self.mamba_pool_idx = None
+
+
+def _is_streaming(req: Optional[Req]) -> bool:
+    return req is not None and req.session is not None and req.session.streaming
+
+
+class SessionAwareCache(BasePrefixCache):
+    """Decorator around any BasePrefixCache that manages streaming session KV.
+
+    Non-streaming requests are pure pass-through. Streaming requests have their
+    KV lifecycle managed by SessionSlot objects, avoiding any invasive changes
+    to the scheduling pipeline.
+    """
+
+    def __init__(self, inner: BasePrefixCache):
+        self.inner = inner
+        self.slots: Dict[str, SessionSlot] = {}
+
+    # -- Forward PrefixCacheTrait properties to inner cache --
+
+    @property
+    def req_to_token_pool(self):
+        return self.inner.req_to_token_pool
+
+    @req_to_token_pool.setter
+    def req_to_token_pool(self, value):
+        self.inner.req_to_token_pool = value
+
+    @property
+    def token_to_kv_pool_allocator(self):
+        return self.inner.token_to_kv_pool_allocator
+
+    @token_to_kv_pool_allocator.setter
+    def token_to_kv_pool_allocator(self, value):
+        self.inner.token_to_kv_pool_allocator = value
+
+    @property
+    def page_size(self):
+        return self.inner.page_size
+
+    @page_size.setter
+    def page_size(self, value):
+        self.inner.page_size = value
+
+    @property
+    def disable(self):
+        return self.inner.disable
+
+    @disable.setter
+    def disable(self, value):
+        self.inner.disable = value
+
+    @property
+    def metrics_collector(self):
+        return self.inner.metrics_collector
+
+    @metrics_collector.setter
+    def metrics_collector(self, value):
+        self.inner.metrics_collector = value
+
+    # -- BasePrefixCache abstract methods --
+
+    def reset(self):
+        self.slots.clear()
+        self.inner.reset()
+
+    def match_prefix(self, params: MatchPrefixParams) -> MatchResult:
+        req = params.req
+        if not _is_streaming(req):
+            return self.inner.match_prefix(params)
+
+        session_id = req.session.session_id
+        slot = self.slots.get(session_id)
+        if slot is None or slot.req_pool_idx is None:
+            return self.inner.match_prefix(params)
+
+        slot.restore_to_req(req)
+
+        max_prefix_len = len(params.key.token_ids)
+        prefix_len = min(req.kv_committed_len, max_prefix_len)
+        device_indices = self.req_to_token_pool.req_to_token[
+            req.req_pool_idx, :prefix_len
+        ].to(dtype=torch.int64)
+
+        return MatchResult(
+            device_indices=device_indices,
+            last_device_node=slot.virtual_node,
+            last_host_node=slot.virtual_node,
+        )
+
+    def cache_finished_req(self, req: Req, is_insert: bool = True, **kwargs):
+        if not _is_streaming(req):
+            return self.inner.cache_finished_req(req, is_insert=is_insert, **kwargs)
+
+        session_id = req.session.session_id
+        slot = self.slots.get(session_id)
+        is_first = slot is None
+        if is_first:
+            slot = SessionSlot()
+            self.slots[session_id] = slot
+
+        slot.save_from_req(req, is_first=is_first)
+
+    def cache_unfinished_req(self, req: Req, **kwargs):
+        if _is_streaming(req) and req.session.session_id in self.slots:
+            return
+        self.inner.cache_unfinished_req(req, **kwargs)
+
+    def evict(self, params: EvictParams) -> EvictResult:
+        return self.inner.evict(params)
+
+    def inc_lock_ref(self, node: Any):
+        if isinstance(node, _VirtualNode):
+            return None
+        return self.inner.inc_lock_ref(node)
+
+    def dec_lock_ref(self, node: Any, swa_uuid_for_lock: Optional[str] = None):
+        if isinstance(node, _VirtualNode):
+            return
+        if swa_uuid_for_lock is not None:
+            return self.inner.dec_lock_ref(node, swa_uuid_for_lock)
+        return self.inner.dec_lock_ref(node)
+
+    # -- Session lifecycle --
+
+    def release_session(self, session_id: str):
+        """Release all KV resources held by a streaming session."""
+        slot = self.slots.pop(session_id, None)
+        if slot is None:
+            return
+
+        if slot.last_node is not None:
+            if slot.swa_uuid_for_lock is not None:
+                self.inner.dec_lock_ref(slot.last_node, slot.swa_uuid_for_lock)
+            else:
+                self.inner.dec_lock_ref(slot.last_node)
+
+        if slot.req_pool_idx is not None:
+            start = slot.cache_protected_len
+            end = slot.kv_allocated_len
+            if start < end:
+                kv_indices = self.req_to_token_pool.req_to_token[
+                    slot.req_pool_idx, start:end
+                ]
+                self.token_to_kv_pool_allocator.free(kv_indices)
+            self.req_to_token_pool.free_slots.append(slot.req_pool_idx)
+
+    def session_held_tokens(self) -> int:
+        """Total KV tokens held by session slots, not tracked by the tree."""
+        total = 0
+        for slot in self.slots.values():
+            if slot.req_pool_idx is not None:
+                total += slot.kv_allocated_len - slot.cache_protected_len
+        return total
+
+    def session_held_req_count(self) -> int:
+        """Number of req pool slots held by session slots."""
+        return sum(1 for s in self.slots.values() if s.req_pool_idx is not None)
+
+    # -- Pass-through methods --
+
+    def evictable_size(self):
+        return self.inner.evictable_size()
+
+    def full_evictable_size(self):
+        return self.inner.full_evictable_size()
+
+    def swa_evictable_size(self):
+        return self.inner.swa_evictable_size()
+
+    def protected_size(self):
+        return self.inner.protected_size()
+
+    def full_protected_size(self):
+        return self.inner.full_protected_size()
+
+    def swa_protected_size(self):
+        return self.inner.swa_protected_size()
+
+    def total_size(self):
+        return self.inner.total_size()
+
+    def pretty_print(self):
+        return self.inner.pretty_print()
+
+    def init_load_back(self, last_host_node, host_hit_length):
+        return self.inner.init_load_back(last_host_node, host_hit_length)
+
+    def ready_to_load_host_cache(self):
+        return self.inner.ready_to_load_host_cache()
+
+    def check_hicache_events(self):
+        return self.inner.check_hicache_events()
+
+    def take_events(self):
+        return self.inner.take_events()
+
+    def supports_swa(self):
+        return self.inner.supports_swa()
+
+    def supports_mamba(self):
+        return self.inner.supports_mamba()
+
+    def is_chunk_cache(self):
+        return self.inner.is_chunk_cache()
+
+    def is_tree_cache(self):
+        return self.inner.is_tree_cache()
+
+    def available_and_evictable_str(self):
+        return self.inner.available_and_evictable_str()
+
+    def init_metrics_collector(self):
+        return self.inner.init_metrics_collector()
+
+    # Forward attribute access for cache-specific methods (e.g. sanity_check,
+    # sliding_window_size, all_values_flatten, etc.)
+    def __getattr__(self, name):
+        return getattr(self.inner, name)
diff --git a/sglang/python/sglang/srt/mem_cache/sparsity/__init__.py b/sglang/python/sglang/srt/mem_cache/sparsity/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..66e9ee89922953bcff8b3fd012dfe7dfaef51a86
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/sparsity/__init__.py
@@ -0,0 +1,27 @@
+from sglang.srt.mem_cache.sparsity.algorithms import (
+    BaseSparseAlgorithm,
+    BaseSparseAlgorithmImpl,
+    DeepSeekNSAAlgorithm,
+    QuestAlgorithm,
+)
+from sglang.srt.mem_cache.sparsity.backend import BackendAdaptor, FlashAttentionAdaptor
+from sglang.srt.mem_cache.sparsity.core import SparseConfig, SparseCoordinator
+from sglang.srt.mem_cache.sparsity.factory import (
+    create_sparse_coordinator,
+    get_sparse_coordinator,
+    register_sparse_coordinator,
+)
+
+__all__ = [
+    "BaseSparseAlgorithm",
+    "BaseSparseAlgorithmImpl",
+    "QuestAlgorithm",
+    "DeepSeekNSAAlgorithm",
+    "BackendAdaptor",
+    "FlashAttentionAdaptor",
+    "SparseConfig",
+    "SparseCoordinator",
+    "create_sparse_coordinator",
+    "get_sparse_coordinator",
+    "register_sparse_coordinator",
+]
diff --git a/sglang/python/sglang/srt/mem_cache/sparsity/algorithms/__init__.py b/sglang/python/sglang/srt/mem_cache/sparsity/algorithms/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e05203af03b88d248855e94973c33cf14706cd3
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/sparsity/algorithms/__init__.py
@@ -0,0 +1,13 @@
+from sglang.srt.mem_cache.sparsity.algorithms.base_algorithm import (
+    BaseSparseAlgorithm,
+    BaseSparseAlgorithmImpl,
+)
+from sglang.srt.mem_cache.sparsity.algorithms.deepseek_nsa import DeepSeekNSAAlgorithm
+from sglang.srt.mem_cache.sparsity.algorithms.quest_algorithm import QuestAlgorithm
+
+__all__ = [
+    "BaseSparseAlgorithm",
+    "BaseSparseAlgorithmImpl",
+    "DeepSeekNSAAlgorithm",
+    "QuestAlgorithm",
+]
diff --git a/sglang/python/sglang/srt/mem_cache/sparsity/algorithms/base_algorithm.py b/sglang/python/sglang/srt/mem_cache/sparsity/algorithms/base_algorithm.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ed0fb9435cb2f33c0e0444fc9c628bb79832612
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/sparsity/algorithms/base_algorithm.py
@@ -0,0 +1,383 @@
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING
+
+import torch
+
+if TYPE_CHECKING:
+    from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+
+
+class BaseSparseAlgorithm(ABC):
+    """
+    Abstract base class for sparse attention algorithms.
+
+    This class provides a unified interface for implementing various retrievable KVCache
+    compression algorithms. Token-wise sparsity is treated as page-wise with page_size=1.
+
+    References:
+        - ChunkKV: https://arxiv.org/abs/2502.00299
+        - Quest: https://arxiv.org/pdf/2406.10774
+        - PQCache: https://arxiv.org/abs/2407.12820
+        - SnapKV: https://arxiv.org/pdf/2404.14469
+        - Look-ahead QCache: https://arxiv.org/pdf/2505.20334
+        - and more...
+    """
+
+    def __init__(self, config, device: torch.device, **kwargs):
+        self.config = config
+        self.device = device
+        self.req_to_token_pool = None
+        self.states = None
+
+    def initialize_representation_pool(
+        self,
+        start_layer: int,
+        end_layer: int,
+        token_to_kv_pool,
+        req_to_token_pool,
+        states,
+    ):
+        """
+        Initialize algorithm-specific representation pool and set context.
+
+        Called once during SparseCoordinator initialization. Algorithms allocate
+        their own representation tensors and store references to context.
+
+        Algorithm-specific implementations:
+            - ChunkKV: Allocate chunk scores [num_chunks, 1] for tracking semantic chunk importance
+            - Quest: Allocate page representations [num_pages, repr_dim] via key pooling
+            - PQCache: Allocate centroids [n_subvec, n_centroids, subvec_dim] and token codes [num_tokens, n_subvec]
+            - SnapKV: Allocate voting scores [num_tokens] and selected positions mask for retention strategy
+            - Look-ahead QCache: Allocate importance scores [num_tokens], eviction mask, and optional pseudo query cache [cache_size, hidden_dim]
+        """
+        pass
+
+    def construct_representations(
+        self,
+        layer_id: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        k_buffer: torch.Tensor,
+        forward_batch: "ForwardBatch",
+    ):
+        """
+        Construct initial representations during prefill phase.
+
+        Called at every layer during forward pass. Algorithm internally decides
+        whether to perform construction.
+        Typically only constructs once per request during prefill/extend phase.
+
+        Algorithm-specific implementations:
+            - ChunkKV: Compute chunk importance scores via aggregated key L2 norms within semantic chunks
+            - Quest: Compute page representations via mean pooling of keys within each page
+            - PQCache: Run K-means clustering to generate centroids and assign each token to nearest centroid
+            - SnapKV: Select observation window (recent tokens), compute attention weights, aggregate via voting to identify important prefix positions, apply 1D pooling to preserve context
+            - Look-ahead QCache: Generate pseudo lookahead query (e.g., mean of last k queries), compute KV importance scores, mark low-importance KVs for eviction
+        """
+        pass
+
+    def update_representations(
+        self,
+        layer_id: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        k_buffer: torch.Tensor,
+        forward_batch: "ForwardBatch",
+    ):
+        """
+        Incrementally update representations during decode phase.
+
+        Called at every layer during forward pass. Algorithm internally decides
+        whether to update based on:
+        - self.states.repr_constructed[req_id]: Whether initial construction done
+        - self.states.last_constructed_page[req_id]: Last constructed page index
+        - Current seq_lens: To detect new tokens/pages
+
+        Algorithm-specific implementations:
+            - ChunkKV: Incrementally compute importance scores for newly generated chunks during decode
+            - Quest: Incrementally compute representations for newly generated pages during decode
+            - PQCache: Assign new tokens to existing centroids (no centroid update during decode)
+            - SnapKV: Optional: periodically re-run voting with sliding observation window (typically static after prefill)
+            - Look-ahead QCache: Periodically regenerate pseudo queries and re-evaluate importance scores to adapt to generation dynamics
+        """
+        pass
+
+    @abstractmethod
+    def retrieve_topk(
+        self,
+        queries: torch.Tensor,
+        layer_id: int,
+        req_pool_indices: torch.Tensor,
+        sparse_mask: torch.Tensor,
+        **kwargs,
+    ) -> tuple:
+        """
+        Retrieve top-k important KV indices for sparse attention.
+
+        Called before attention computation at each layer. Uses current query
+        and pre-computed representations to select the most important subset
+        of KV cache for attention computation.
+
+        Args:
+            queries: [bs, num_heads, head_dim] Current query vectors
+            layer_id: Current layer index
+            req_pool_indices: [bs] Request pool indices
+            sparse_mask: [bs] bool, which requests need sparse attention
+            attn_metadata: Attention metadata (contains seq_lens, etc.)
+            **kwargs: Algorithm-specific arguments
+
+        Returns:
+            selected_indices: [bs, max_selected] Selected page/token indices, padded with -1
+            valid_lengths: [bs] Actual number of selected indices per request
+
+        Note:
+            - Indices are logical positions that will be mapped to physical KV cache by BackendAdaptor
+
+        Algorithm-specific implementations:
+            - ChunkKV: Select top-k chunks based on pre-computed importance scores with layer-wise index reuse
+            - Quest: Compute query-page similarity using current query and stored page representations, select top-k pages
+            - PQCache: Calculate query-centroid similarity, use centroid scores to rank tokens, select top-k tokens
+            - SnapKV: Return union of voted important prefix positions (with clustered neighbors) and observation window tokens
+            - Look-ahead QCache: Return KVs not marked for eviction (eviction based on pseudo query importance evaluation)
+        """
+        pass
+
+
+class BaseSparseAlgorithmImpl(BaseSparseAlgorithm):
+    """
+    Implementation base class for sparse attention algorithms.
+
+    Provides common infrastructure for algorithms that operate at page/chunk granularity
+    (token-wise is simply page_size=1):
+    - Generic construct/update flow with state tracking
+    - TopK retrieval with recent page retention (can be overridden)
+
+    Subclasses need to implement:
+    - _initialize_representation_pools(): Initialize algorithm-specific representation pools
+    - _compute_page_representations(): Compute page scores/representations
+    - _retrieve_page_scores(): Retrieve page scores for TopK selection
+
+    Subclasses can also override any method for specialized behavior
+    """
+
+    def __init__(self, config, device: torch.device, **kwargs):
+        super().__init__(config, device, **kwargs)
+        self.sparsity_ratio = config.sparse_extra_config.get("sparsity_ratio", 0.7)
+        self.num_recent_pages = config.sparse_extra_config.get("num_recent_pages", 4)
+        self.page_size = config.page_size
+
+    def initialize_representation_pool(
+        self,
+        start_layer: int,
+        end_layer: int,
+        token_to_kv_pool,
+        req_to_token_pool,
+        states,
+    ):
+        self.req_to_token_pool = req_to_token_pool
+        self.token_to_kv_pool = token_to_kv_pool
+        self.start_layer = start_layer
+        self.end_layer = end_layer
+        self.states = states
+
+        total_num_tokens = token_to_kv_pool.get_key_buffer(start_layer).shape[0]
+        total_num_pages = (total_num_tokens + self.page_size - 1) // self.page_size
+
+        # Initialize algorithm-specific representation pools
+        self._initialize_representation_pools(start_layer, end_layer, total_num_pages)
+
+    def construct_representations(
+        self,
+        layer_id,
+        req_pool_indices,
+        seq_lens,
+        k_buffer,
+        forward_batch,
+    ) -> torch.Tensor:
+
+        if not forward_batch.forward_mode.is_extend():
+            return
+
+        num_pages = seq_lens // self.page_size
+        valid_mask = (
+            ~self.states.repr_constructed[req_pool_indices]
+            & (seq_lens >= self.states.prompt_lens[req_pool_indices])
+            & (num_pages > 0)
+        )
+
+        if not valid_mask.any():
+            return
+
+        # Compute page representations by subclass
+        self._compute_page_representations(
+            layer_id,
+            req_pool_indices[valid_mask],
+            seq_lens[valid_mask],
+            0,
+            num_pages[valid_mask],
+            k_buffer,
+        )
+
+        # Update tracking states
+        if layer_id == self.end_layer - 1:
+            success_indices = req_pool_indices[valid_mask]
+            self.states.repr_constructed[success_indices] = True
+            self.states.last_constructed_page[success_indices] = num_pages[valid_mask]
+
+    def update_representations(
+        self,
+        layer_id,
+        req_pool_indices,
+        seq_lens,
+        k_buffer,
+        forward_batch,
+    ) -> torch.Tensor:
+        if not forward_batch.forward_mode.is_decode_or_idle():
+            return
+
+        start_page = self.states.last_constructed_page[req_pool_indices]
+        end_page = seq_lens // self.page_size
+        valid_mask = self.states.repr_constructed[req_pool_indices] & (
+            start_page < end_page
+        )
+
+        if not valid_mask.any():
+            return
+
+        # Compute page representations by subclass
+        self._compute_page_representations(
+            layer_id,
+            req_pool_indices[valid_mask],
+            seq_lens[valid_mask],
+            start_page[valid_mask],
+            end_page[valid_mask],
+            k_buffer,
+        )
+
+        # Update tracking states
+        if layer_id == self.end_layer - 1:
+            success_indices = req_pool_indices[valid_mask]
+            self.states.last_constructed_page[success_indices] = end_page[valid_mask]
+
+    def retrieve_topk(
+        self,
+        queries: torch.Tensor,
+        layer_id: int,
+        req_pool_indices: torch.Tensor,
+        sparse_mask: torch.Tensor,
+        **kwargs,
+    ) -> tuple:
+        """
+        Default TopK retrieval: score-based selection + recent pages.
+        Subclasses can override for query-dependent retrieval.
+
+        TODO:
+            1. Using triton kernel to speed up this function
+            2. Support CUDA Graph
+        """
+        bs, device = queries.shape[0], queries.device
+
+        seq_lens_source = kwargs.get("forward_batch", None)
+        if seq_lens_source is None or not hasattr(seq_lens_source, "seq_lens"):
+            raise ValueError(
+                "forward_batch with seq_lens is required for TopK retrieval"
+            )
+        seq_lens = seq_lens_source.seq_lens.to(device)
+
+        req_to_token = self.req_to_token_pool.req_to_token
+        max_req_tokens = req_to_token.shape[1]
+
+        per_request_indices = []
+        per_request_lengths = []
+
+        for i in range(bs):
+            if not sparse_mask[i]:
+                per_request_indices.append(
+                    torch.empty(0, device=device, dtype=torch.int32)
+                )
+                per_request_lengths.append(0)
+                continue
+
+            num_pages = int((seq_lens[i].item() + self.page_size - 1) // self.page_size)
+            if num_pages <= self.num_recent_pages:
+                per_request_indices.append(
+                    torch.empty(0, device=device, dtype=torch.int32)
+                )
+                per_request_lengths.append(0)
+                continue
+
+            page_idx = torch.arange(num_pages, device=device)
+            page_start_token = req_to_token[
+                req_pool_indices[i],
+                (page_idx * self.page_size).clamp(0, max_req_tokens - 1),
+            ]
+            phys_pages = (page_start_token // self.page_size).unsqueeze(0)
+
+            scores = self._retrieve_page_scores(
+                layer_id,
+                phys_pages,
+                req_pool_indices[i : i + 1],
+                queries[i : i + 1],
+            )
+
+            recent_start = max(num_pages - self.num_recent_pages, 0)
+            scores = scores.clone()
+            scores[:, recent_start:] = float("-inf")
+
+            history_pages = max(recent_start, 1)
+            k = max(int(history_pages * self.sparsity_ratio), 1)
+            k = min(k, history_pages)
+            topk_idx = torch.topk(scores, k=k, dim=1, sorted=False)[1].squeeze(0)
+
+            recent_idx = torch.arange(
+                recent_start, recent_start + self.num_recent_pages, device=device
+            )
+            recent_idx = recent_idx[recent_idx < num_pages]
+
+            combined = (
+                torch.cat([topk_idx, recent_idx], dim=0).sort()[0].to(torch.int32)
+            )
+
+            per_request_indices.append(combined)
+            per_request_lengths.append(int(combined.numel()))
+
+        max_len = max(max(per_request_lengths, default=0), 1)
+        out_indices = torch.full((bs, max_len), -1, dtype=torch.int32, device=device)
+        out_lengths = torch.zeros(bs, dtype=torch.int32, device=device)
+
+        for i, selected in enumerate(per_request_indices):
+            length = per_request_lengths[i]
+            if length == 0:
+                continue
+            out_indices[i, :length] = selected
+            out_lengths[i] = length
+
+        return out_indices, out_lengths
+
+    def _initialize_representation_pools(
+        self, start_layer: int, end_layer: int, total_num_pages: int
+    ):
+        """Initialize algorithm-specific representation pools for all layers."""
+        raise NotImplementedError
+
+    def _compute_page_representations(
+        self,
+        layer_id: int,
+        reqs: torch.Tensor,
+        seq_lens: torch.Tensor,
+        start_page,
+        end_page: torch.Tensor,
+        k_buffer: torch.Tensor,
+    ):
+        """Compute and store page representations for given page range."""
+        raise NotImplementedError
+
+    def _retrieve_page_scores(
+        self,
+        layer_id: int,
+        phys_pages: torch.Tensor,
+        req_pool_indices: torch.Tensor,
+        queries: torch.Tensor,
+    ) -> torch.Tensor:
+        """Retrieve page scores for TopK selection."""
+        raise NotImplementedError
diff --git a/sglang/python/sglang/srt/mem_cache/sparsity/algorithms/deepseek_nsa.py b/sglang/python/sglang/srt/mem_cache/sparsity/algorithms/deepseek_nsa.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d64f10eed626039b12882e584ee008bf51932d2
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/sparsity/algorithms/deepseek_nsa.py
@@ -0,0 +1,80 @@
+from typing import Any, Optional
+
+import torch
+
+from sglang.srt.mem_cache.sparsity.algorithms.base_algorithm import (
+    BaseSparseAlgorithmImpl,
+)
+
+
+class DeepSeekNSAAlgorithm(BaseSparseAlgorithmImpl):
+    """
+    Sparse attention algorithm for DeepSeek NSA.
+
+    This algorithm uses NSA's native indexer for TopK retrieval.
+    Overrides all parent methods as NSA has its own specialized flow.
+    """
+
+    def __init__(self, config, device: torch.device, **kwargs):
+        super().__init__(config, device, **kwargs)
+
+    def retrieve_topk(
+        self,
+        queries: torch.Tensor,
+        layer_id: int,
+        req_pool_indices: torch.Tensor,
+        sparse_mask: torch.Tensor,
+        attn_metadata: Optional[Any],
+        **kwargs,
+    ) -> tuple:
+        indexer, forward_batch, x, q_lora, positions = (
+            kwargs.get("indexer"),
+            kwargs.get("forward_batch"),
+            kwargs.get("x"),
+            kwargs.get("q_lora"),
+            kwargs.get("positions"),
+        )
+
+        if any(v is None for v in [indexer, x, q_lora, positions, forward_batch]):
+            raise ValueError("Required: indexer, forward_batch, x, q_lora, positions")
+
+        return (
+            indexer(
+                x=x,
+                q_lora=q_lora,
+                positions=positions,
+                forward_batch=forward_batch,
+                layer_id=layer_id,
+            ),
+            None,
+        )
+
+    def initialize_representation_pool(
+        self,
+        start_layer: int,
+        end_layer: int,
+        token_to_kv_pool,
+        req_to_token_pool,
+        states,
+    ):
+        pass
+
+    def construct_representations(
+        self,
+        layer_id,
+        req_pool_indices,
+        seq_lens,
+        k_buffer,
+        forward_batch,
+    ):
+        pass
+
+    def update_representations(
+        self,
+        layer_id,
+        req_pool_indices,
+        seq_lens,
+        k_buffer,
+        forward_batch,
+    ):
+        pass
diff --git a/sglang/python/sglang/srt/mem_cache/sparsity/algorithms/quest_algorithm.py b/sglang/python/sglang/srt/mem_cache/sparsity/algorithms/quest_algorithm.py
new file mode 100644
index 0000000000000000000000000000000000000000..94678546af313c549a99f883fedb09179893e4cc
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/sparsity/algorithms/quest_algorithm.py
@@ -0,0 +1,166 @@
+"""
+Quest sparse attention algorithm.
+
+This implementation follows the Quest paper's bounding-box estimation for
+query-aware page selection. For each KV page, it maintains per-dimension
+min/max of keys and uses them to upper-bound attention scores without
+materializing full dot products.
+"""
+
+import logging
+
+import torch
+
+from sglang.srt.mem_cache.sparsity.algorithms.base_algorithm import (
+    BaseSparseAlgorithmImpl,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class QuestAlgorithm(BaseSparseAlgorithmImpl):
+    """Quest page-wise sparse attention using bounding-box criticality."""
+
+    def __init__(self, config, device: torch.device, **kwargs):
+        super().__init__(config, device, **kwargs)
+        self.page_k_min = {}
+        self.page_k_max = {}
+        self.page_valid = {}
+
+    def _initialize_representation_pools(
+        self, start_layer: int, end_layer: int, total_num_pages: int
+    ):
+        key_buf = self.token_to_kv_pool.get_key_buffer(start_layer)
+        head_num, head_dim = key_buf.shape[1], key_buf.shape[2]
+
+        for layer_id in range(start_layer, end_layer):
+            self.page_k_min[layer_id] = torch.zeros(
+                (total_num_pages, head_num, head_dim),
+                dtype=torch.float32,
+                device=self.device,
+            )
+            self.page_k_max[layer_id] = torch.zeros_like(self.page_k_min[layer_id])
+            self.page_valid[layer_id] = torch.zeros(
+                total_num_pages, dtype=torch.bool, device=self.device
+            )
+
+        logger.info(
+            "Initialized Quest page reps: %d pages, %d layers, head_num=%d, head_dim=%d",
+            total_num_pages,
+            end_layer - start_layer,
+            head_num,
+            head_dim,
+        )
+
+    def _compute_page_representations(
+        self,
+        layer_id: int,
+        reqs: torch.Tensor,
+        seq_lens: torch.Tensor,
+        start_page,
+        end_page: torch.Tensor,
+        k_buffer: torch.Tensor,
+    ):
+        if isinstance(start_page, int):
+            start_page = torch.full_like(end_page, start_page)
+
+        device = k_buffer.device
+        req_to_token = self.req_to_token_pool.req_to_token
+        n = reqs.shape[0]
+        max_pages = int((end_page - start_page).max().item())
+        if max_pages <= 0:
+            return
+
+        pg_off = torch.arange(max_pages, device=device).unsqueeze(0)
+        pg_id = start_page.unsqueeze(1) + pg_off
+        pg_mask = pg_id < end_page.unsqueeze(1)
+
+        tok_start = pg_id * self.page_size
+        tok_off = torch.arange(self.page_size, device=device).view(1, 1, -1)
+        tok_pos = tok_start.unsqueeze(2) + tok_off
+        tok_mask = (
+            tok_pos
+            < (tok_start + self.page_size).clamp(max=seq_lens.unsqueeze(1)).unsqueeze(2)
+        ) & pg_mask.unsqueeze(2)
+
+        phys_tok = req_to_token[
+            reqs.view(n, 1, 1).expand(n, max_pages, self.page_size),
+            tok_pos.clamp(0, req_to_token.shape[1] - 1),
+        ].clamp(0, k_buffer.shape[0] - 1)
+
+        keys = k_buffer[phys_tok].to(torch.float32)
+        mask = tok_mask.unsqueeze(-1).unsqueeze(-1)
+
+        page_min = torch.where(mask, keys, torch.full_like(keys, float("inf"))).amin(
+            dim=2
+        )
+        page_max = torch.where(mask, keys, torch.full_like(keys, float("-inf"))).amax(
+            dim=2
+        )
+
+        phys_pg = (
+            req_to_token[
+                reqs.unsqueeze(1).expand(n, max_pages),
+                tok_start.clamp(0, req_to_token.shape[1] - 1),
+            ]
+            // self.page_size
+        )
+
+        idx = pg_mask.nonzero(as_tuple=False)
+        if idx.numel() == 0:
+            return
+
+        target_pages = phys_pg[idx[:, 0], idx[:, 1]].clamp(
+            0, self.page_k_min[layer_id].shape[0] - 1
+        )
+        self.page_k_min[layer_id][target_pages] = page_min[idx[:, 0], idx[:, 1]]
+        self.page_k_max[layer_id][target_pages] = page_max[idx[:, 0], idx[:, 1]]
+        self.page_valid[layer_id][target_pages] = True
+
+    def _retrieve_page_scores(
+        self,
+        layer_id: int,
+        phys_pages: torch.Tensor,
+        req_pool_indices: torch.Tensor,
+        queries: torch.Tensor,
+    ) -> torch.Tensor:
+        # Clamp pages to valid storage range
+        phys_pages_clamped = phys_pages.clamp(0, self.page_k_min[layer_id].shape[0] - 1)
+
+        k_min = self.page_k_min[layer_id][phys_pages_clamped]
+        k_max = self.page_k_max[layer_id][phys_pages_clamped]
+        valid_mask = self.page_valid[layer_id][phys_pages_clamped]
+        # Align query shape to KV heads.
+        head_dim = k_min.shape[-1]
+        if queries.dim() == 2:
+            bs, hidden = queries.shape
+            if hidden % head_dim != 0:
+                raise ValueError(
+                    f"Quest query hidden size {hidden} not divisible by head_dim {head_dim}"
+                )
+            q_heads = hidden // head_dim
+            q = queries.view(bs, q_heads, head_dim)
+        elif queries.dim() == 3:
+            q = queries
+        else:
+            raise ValueError(f"Unsupported query shape for Quest: {queries.shape}")
+
+        kv_heads = k_min.shape[-2]
+        q_heads = q.shape[1]
+        if q_heads != kv_heads:
+            if q_heads % kv_heads != 0:
+                raise ValueError(
+                    f"Query heads {q_heads} not divisible by KV heads {kv_heads}"
+                )
+            group = q_heads // kv_heads
+            # Average grouped query heads to align with KV heads (approximation for MQA/GQA).
+            q = q.view(q.shape[0], kv_heads, group, head_dim).mean(dim=2)
+
+        q = q.to(k_min.dtype).unsqueeze(1)  # [bs, 1, kv_heads, head_dim]
+
+        criticality = torch.where(q >= 0, q * k_max, q * k_min).sum(dim=(2, 3))
+        criticality = torch.where(
+            valid_mask, criticality, torch.full_like(criticality, float("-inf"))
+        )
+
+        return criticality
diff --git a/sglang/python/sglang/srt/mem_cache/sparsity/backend/__init__.py b/sglang/python/sglang/srt/mem_cache/sparsity/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a6e1b710fa9ca2cb29e96a225c1852c867a4641
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/sparsity/backend/__init__.py
@@ -0,0 +1,7 @@
+from sglang.srt.mem_cache.sparsity.backend.backend_adaptor import (
+    BackendAdaptor,
+    FlashAttentionAdaptor,
+    NSABackendAdaptor,
+)
+
+__all__ = ["BackendAdaptor", "FlashAttentionAdaptor", "NSABackendAdaptor"]
diff --git a/sglang/python/sglang/srt/mem_cache/sparsity/backend/backend_adaptor.py b/sglang/python/sglang/srt/mem_cache/sparsity/backend/backend_adaptor.py
new file mode 100644
index 0000000000000000000000000000000000000000..baf6a5c995b29a9e5d33ccad85602c5e377febcd
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/sparsity/backend/backend_adaptor.py
@@ -0,0 +1,176 @@
+import logging
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, Any, Optional
+
+import torch
+
+if TYPE_CHECKING:
+    from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+
+logger = logging.getLogger(__name__)
+
+
+class BackendAdaptor(ABC):
+    """Base class for attention backend adaptors."""
+
+    def __init__(self, device: torch.device):
+        self.device = device
+        self._original_metadata = None
+
+    def save_original_metadata(self, metadata: Any) -> None:
+        """Save original metadata in the beginning of the forward pass."""
+        pass
+
+    @abstractmethod
+    def adapt_for_attn_metadata(
+        self,
+        selected_indices: torch.Tensor,
+        valid_lengths: torch.Tensor,
+        sparse_mask: torch.Tensor,
+        current_metadata: Any,
+        forward_batch: "ForwardBatch",
+        req_to_token: torch.Tensor,
+        page_size: int,
+        layer_id: int,
+        **kwargs,
+    ) -> Any:
+        """
+        Adapt attention metadata for sparse KVCache access.
+
+        Transforms sparse retrieval results (logical indices of important KV pages/tokens)
+        into backend-specific attention metadata format.
+
+        Returns:
+            Modified attention metadata compatible with the backend
+        """
+        pass
+
+
+class NSABackendAdaptor(BackendAdaptor):
+    """Adaptor for NSA (Native Sparse Attention) backend."""
+
+    def __init__(
+        self,
+        device: torch.device,
+        req_to_token_pool,
+    ):
+        super().__init__(device)
+        self.req_to_token_pool = req_to_token_pool
+
+    def adapt_for_attn_metadata(
+        self,
+        selected_indices: torch.Tensor,
+        valid_lengths: torch.Tensor,
+        sparse_mask: torch.Tensor,
+        current_metadata: Any,
+        forward_batch: "ForwardBatch",
+        req_to_token: torch.Tensor,
+        page_size: int,
+        layer_id: int,
+        **kwargs,
+    ) -> Optional[torch.Tensor]:
+        """
+        Transform logical page indices to physical device indices for NSA backend.
+        """
+        # TODO: Implement NSA backend adaptor logic
+        pass
+
+
+class FlashAttentionAdaptor(BackendAdaptor):
+    """Adaptor for FlashAttention backend."""
+
+    def save_original_metadata(self, metadata: Any) -> None:
+        self._original_metadata = {
+            "page_table": metadata.page_table.clone(),
+            "cache_seqlens_int32": metadata.cache_seqlens_int32.clone(),
+            "cu_seqlens_k": metadata.cu_seqlens_k.clone(),
+            "max_seq_len_k": metadata.max_seq_len_k,
+        }
+
+    def adapt_for_attn_metadata(
+        self,
+        selected_indices: torch.Tensor,
+        valid_lengths: torch.Tensor,
+        sparse_mask: torch.Tensor,
+        current_metadata: Any,
+        forward_batch: "ForwardBatch",
+        req_to_token: torch.Tensor,
+        page_size: int,
+        layer_id: int,
+        **kwargs,
+    ) -> Any:
+        """
+        Adapt FlashAttention metadata for sparse KVCache access.
+
+        Modifies page_table, cache_seqlens, and related metadata to redirect
+        FlashAttention to only process selected sparse pages.
+
+        # TODO: Optimize performance
+        """
+        if self._original_metadata is None:
+            return current_metadata
+
+        if not sparse_mask.any():
+            return current_metadata
+
+        current_metadata.page_table.copy_(self._original_metadata["page_table"])
+        current_metadata.cache_seqlens_int32.copy_(
+            self._original_metadata["cache_seqlens_int32"]
+        )
+
+        physical_pages = self._logical_to_physical_pages_batch(
+            selected_indices,
+            forward_batch.req_pool_indices,
+            req_to_token,
+            page_size,
+        )
+
+        max_selected = physical_pages.shape[1]
+        valid_mask = torch.arange(max_selected, device=physical_pages.device).unsqueeze(
+            0
+        ) < valid_lengths.unsqueeze(1)
+        update_mask = sparse_mask.unsqueeze(1) & valid_mask
+
+        current_metadata.page_table[:, :max_selected] = torch.where(
+            update_mask, physical_pages, current_metadata.page_table[:, :max_selected]
+        )
+
+        seq_lens = forward_batch.seq_lens
+        positions_in_page = (seq_lens - 1) % page_size
+        diff = page_size - positions_in_page - 1
+        sparse_seq_lens = (valid_lengths * page_size - diff).to(torch.int32)
+
+        current_metadata.cache_seqlens_int32 = torch.where(
+            sparse_mask, sparse_seq_lens, self._original_metadata["cache_seqlens_int32"]
+        )
+
+        current_metadata.cu_seqlens_k = torch.nn.functional.pad(
+            torch.cumsum(
+                current_metadata.cache_seqlens_int32, dim=0, dtype=torch.int32
+            ),
+            (1, 0),
+        )
+        current_metadata.max_seq_len_k = int(current_metadata.cache_seqlens_int32.max())
+        return current_metadata
+
+    def _logical_to_physical_pages_batch(
+        self,
+        logical_pages: torch.Tensor,
+        req_pool_indices: torch.Tensor,
+        req_to_token: torch.Tensor,
+        page_size: int,
+    ) -> torch.Tensor:
+        bs, max_pages = logical_pages.shape
+
+        page_starts = logical_pages * page_size
+        page_starts_clamped = page_starts.clamp(min=0)
+
+        req_indices_expanded = req_pool_indices.unsqueeze(1).expand(-1, max_pages)
+        first_tokens = req_to_token[req_indices_expanded, page_starts_clamped]
+
+        physical_pages = first_tokens // page_size
+        physical_pages = torch.where(
+            logical_pages >= 0, physical_pages, torch.zeros_like(physical_pages)
+        )
+
+        return physical_pages.to(torch.int32)
diff --git a/sglang/python/sglang/srt/mem_cache/sparsity/core/__init__.py b/sglang/python/sglang/srt/mem_cache/sparsity/core/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1622cffb2a8e2f83eee928de8f7f3a5c1d0b38c
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/sparsity/core/__init__.py
@@ -0,0 +1,11 @@
+from sglang.srt.mem_cache.sparsity.core.sparse_coordinator import (
+    RequestTrackers,
+    SparseConfig,
+    SparseCoordinator,
+)
+
+__all__ = [
+    "RequestTrackers",
+    "SparseConfig",
+    "SparseCoordinator",
+]
diff --git a/sglang/python/sglang/srt/mem_cache/sparsity/core/sparse_coordinator.py b/sglang/python/sglang/srt/mem_cache/sparsity/core/sparse_coordinator.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3c37dc2c2cbc4861f05c03a5110fccf4e524938
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/sparsity/core/sparse_coordinator.py
@@ -0,0 +1,272 @@
+import logging
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any, Optional
+
+import torch
+
+from sglang.srt.mem_cache.memory_pool import KVCache, ReqToTokenPool
+from sglang.srt.mem_cache.sparsity.algorithms.base_algorithm import BaseSparseAlgorithm
+from sglang.srt.mem_cache.sparsity.backend.backend_adaptor import BackendAdaptor
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.radix_attention import RadixAttention
+    from sglang.srt.managers.schedule_batch import Req
+    from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+
+logger = logging.getLogger(__name__)
+
+
+class RequestTrackers:
+    """State tracker for sparse attention requests."""
+
+    def __init__(
+        self,
+        max_pool_size: int,
+        device: torch.device,
+        num_layers: int,
+        min_sparse_prompt_len: int,
+        max_context_len: int,
+    ):
+        self.device = device
+        self.num_layers = num_layers
+
+        self.repr_constructed = torch.zeros(
+            max_pool_size, dtype=torch.bool, device=device
+        )
+        self.prompt_lens = torch.zeros(max_pool_size, dtype=torch.int64, device=device)
+        self.last_constructed_page = torch.zeros(
+            max_pool_size, dtype=torch.int64, device=device
+        )
+
+        # TODO: Add more trackers for hierarchical KVCache management
+
+    def register(self, idx: int, prompt_len: int) -> None:
+        self.repr_constructed[idx] = False
+        self.prompt_lens[idx] = prompt_len
+        self.last_constructed_page[idx] = 0
+
+    def clear(self, idx: int) -> None:
+        self.repr_constructed[idx] = False
+        self.prompt_lens[idx] = 0
+        self.last_constructed_page[idx] = 0
+
+
+@dataclass
+class SparseConfig:
+    """Configuration for sparse attention."""
+
+    backend: str
+    algorithm: str
+    page_size: int = 64
+    min_sparse_prompt_len: int = 2048
+    sparse_extra_config: dict = field(
+        default_factory=dict
+    )  # Algorithm-specific config, parsed by each algorithm
+
+
+class SparseCoordinator:
+    """
+    Coordinator for sparse attention with retrievable KV cache compression.
+
+    This coordinator framework is designed for decode-phase retrievable algorithms
+    (e.g., Quest, PQCache, SnapKV) that dynamically select important KV cache entries
+    based on current queries. It manages the lifecycle of sparse attention including
+    representation construction, sparse retrieval, and token offloading.
+
+    Request Lifecycle and API Calls:
+        1. Request Start:
+           - on_request_begin(req) -> Register request and initialize state
+
+        2. Prefill Phase:
+           - attention_end(...)    -> Construct representations
+
+        3. Decode Phase:
+           - forward_begin(batch)  -> Wait for pending KVCache offloading
+           - attention_begin(...)  -> Identify important KV, load offloaded KVCache, adapt attention metadata
+           - attention_end(...)    -> Construct/update representations
+           - forward_end(batch)    -> Trigger KVCache offloading
+
+        4. Request End:
+           - on_request_end(req) -> Clean up state and resources
+    """
+
+    def __init__(
+        self,
+        config: SparseConfig,
+        algorithm: BaseSparseAlgorithm,
+        backend_adaptor: Optional[BackendAdaptor],
+        req_to_token_pool: ReqToTokenPool,
+        token_to_kv_pool: KVCache,
+        start_layer: int,
+        end_layer: int,
+        device: torch.device,
+    ):
+        self.config = config
+        self.algorithm = algorithm
+        self.backend_adaptor = backend_adaptor
+        self.req_to_token_pool = req_to_token_pool
+        self.token_to_kv_pool = token_to_kv_pool
+        self.start_layer = start_layer
+        self.end_layer = end_layer
+        self.device = device
+        self.page_size = config.page_size
+
+        self.states = RequestTrackers(
+            req_to_token_pool.req_to_token.shape[0],
+            device,
+            end_layer - start_layer + 1,
+            self.config.min_sparse_prompt_len,
+            self.req_to_token_pool.max_context_len,
+        )
+
+        # Initialize algorithm representation pool and context
+        self.algorithm.initialize_representation_pool(
+            start_layer,
+            end_layer,
+            self.token_to_kv_pool,
+            self.req_to_token_pool,
+            self.states,
+        )
+
+        logger.info(
+            f"SparseCoordinator initialized with sparse algorithm={type(algorithm).__name__}"
+        )
+
+    def on_request_begin(self, req: "Req") -> None:
+        """
+        Handle request begin event. Called when a new request is created.
+
+        Registers the request in the state tracker to enable sparse attention processing.
+        """
+        if req.req_pool_idx is not None:
+            self.states.register(req.req_pool_idx, len(req.origin_input_ids))
+
+    def on_request_end(self, req: "Req") -> None:
+        """
+        Handle request end event. Called when a request is completed or aborted.
+        Cleans up request-specific state and releases resources.
+        """
+        if req.req_pool_idx is None:
+            return
+
+        self.states.clear(req.req_pool_idx)
+
+        # TODO: Implement request end handling
+        # - Release host indices if any were allocated for offloading
+
+    def forward_begin(self, forward_batch: "ForwardBatch") -> None:
+        """
+        Handle forward pass begin event. Called before each forward pass starts.
+
+        Wait for pending KVCache offloading operations to complete before forward pass.
+        Ensures memory consistency for subsequent sparse attention operations.
+        """
+        # TODO: Implement forward begin handling
+        # - Check if there are pending offloading operations
+        pass
+
+    def forward_end(self, forward_batch: "ForwardBatch") -> None:
+        """
+        Handle forward pass end event. Called after each forward pass completes.
+
+        Trigger async KVCache offloading operations.
+        """
+        # TODO: Implement forward end handling
+        # - Identify tokens to offload
+        # - Trigger async offloading operations
+        pass
+
+    def attention_begin(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        layer: "RadixAttention",
+        forward_batch: "ForwardBatch",
+        attn_metadata: Optional[Any],
+        **kwargs,
+    ) -> Optional[Any]:
+        """
+        Handle attention begin event. Called before each attention pass starts.
+
+        Identify important KV entries via sparse algorithm, load offloaded KVCache if needed,
+        and adapt attention metadata for the attention backend.
+        """
+        if layer.layer_id == self.start_layer:
+            self.backend_adaptor.save_original_metadata(attn_metadata)
+
+        return self._handle_sparse_retrieve(
+            query, layer, forward_batch, attn_metadata, **kwargs
+        )
+
+    def attention_end(
+        self,
+        output: torch.Tensor,
+        layer: "RadixAttention",
+        forward_batch: "ForwardBatch",
+    ) -> None:
+        """
+        Handle attention end event. Called after each attention pass completes.
+
+        Maybe construct and update sparse representations.
+        """
+        layer_id = layer.layer_id
+
+        # Maybe construct representations
+        self.algorithm.construct_representations(
+            layer_id=layer_id,
+            req_pool_indices=forward_batch.req_pool_indices,
+            seq_lens=forward_batch.seq_lens,
+            k_buffer=self.token_to_kv_pool.get_key_buffer(layer_id),
+            forward_batch=forward_batch,
+        )
+
+        # Maybe update representations
+        self.algorithm.update_representations(
+            layer_id=layer_id,
+            req_pool_indices=forward_batch.req_pool_indices,
+            seq_lens=forward_batch.seq_lens,
+            k_buffer=self.token_to_kv_pool.get_key_buffer(layer_id),
+            forward_batch=forward_batch,
+        )
+
+    def _handle_sparse_retrieve(
+        self,
+        query: torch.Tensor,
+        layer: "RadixAttention",
+        forward_batch: "ForwardBatch",
+        attn_metadata: Optional[Any],
+        **kwargs,
+    ) -> Optional[torch.Tensor]:
+        req_pool_indices = forward_batch.req_pool_indices
+        # Compute Topk
+        sparse_mask = self._compute_sparse_mask(req_pool_indices)
+        selected_indices, valid_lengths = self.algorithm.retrieve_topk(
+            queries=query,
+            layer_id=layer.layer_id,
+            req_pool_indices=req_pool_indices,
+            sparse_mask=sparse_mask,
+            forward_batch=forward_batch,
+            attn_metadata=attn_metadata,
+            **kwargs,
+        )
+
+        # Adapt Attention Metadata
+        return self.backend_adaptor.adapt_for_attn_metadata(
+            selected_indices=selected_indices,
+            valid_lengths=valid_lengths,
+            sparse_mask=sparse_mask,
+            current_metadata=attn_metadata,
+            forward_batch=forward_batch,
+            req_to_token=self.req_to_token_pool.req_to_token,
+            page_size=self.page_size,
+            layer_id=layer.layer_id,
+        )
+
+    def _compute_sparse_mask(self, req_pool_indices):
+        mask = (
+            self.states.prompt_lens[req_pool_indices]
+            >= self.config.min_sparse_prompt_len
+        )
+
+        return mask
diff --git a/sglang/python/sglang/srt/mem_cache/sparsity/factory.py b/sglang/python/sglang/srt/mem_cache/sparsity/factory.py
new file mode 100644
index 0000000000000000000000000000000000000000..39d0f46822f72a4b0068a68875218424bb7f5349
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/sparsity/factory.py
@@ -0,0 +1,126 @@
+import json
+import logging
+from typing import Optional
+
+import torch
+
+from sglang.srt.mem_cache.sparsity.algorithms.base_algorithm import BaseSparseAlgorithm
+from sglang.srt.mem_cache.sparsity.algorithms.deepseek_nsa import DeepSeekNSAAlgorithm
+from sglang.srt.mem_cache.sparsity.algorithms.quest_algorithm import QuestAlgorithm
+from sglang.srt.mem_cache.sparsity.backend.backend_adaptor import (
+    FlashAttentionAdaptor,
+    NSABackendAdaptor,
+)
+from sglang.srt.mem_cache.sparsity.core.sparse_coordinator import (
+    SparseConfig,
+    SparseCoordinator,
+)
+
+logger = logging.getLogger(__name__)
+
+_global_sparse_coordinator: Optional[SparseCoordinator] = None
+
+_ALGORITHM_REGISTRY = {
+    "quest": lambda config, device, **kw: QuestAlgorithm(config, device, **kw),
+    "deepseek_nsa": lambda config, device, **kw: DeepSeekNSAAlgorithm(
+        config, device, **kw
+    ),
+}
+
+
+def _create_sparse_algorithm(
+    config: SparseConfig,
+    device: torch.device,
+    **kwargs,
+) -> BaseSparseAlgorithm:
+    algorithm_name = config.algorithm.lower()
+    factory = _ALGORITHM_REGISTRY.get(algorithm_name)
+
+    if factory is None:
+        raise ValueError(f"Unknown sparse algorithm: {algorithm_name}")
+
+    return factory(config, device, **kwargs)
+
+
+def _create_backend_adaptor(
+    backend: str,
+    device: torch.device,
+    sparse_algorithm: BaseSparseAlgorithm,
+    req_to_token_pool,
+):
+    """Create backend adaptor."""
+    if isinstance(sparse_algorithm, DeepSeekNSAAlgorithm):
+        return NSABackendAdaptor(device, req_to_token_pool)
+
+    if backend in ["fa3", "flashattention"]:
+        return FlashAttentionAdaptor(device)
+
+    raise ValueError(f"Unknown attention backend: {backend}")
+
+
+def _parse_sparse_config(server_args) -> SparseConfig:
+    """Parse hierarchical sparse config"""
+    # Parse extra config if provided
+    extra_config_str = server_args.hierarchical_sparse_attention_extra_config
+    if extra_config_str is not None:
+        try:
+            extra_config = json.loads(extra_config_str)
+
+            # Extract algorithm and backend
+            algorithm = extra_config.pop("algorithm", "quest")
+            backend = extra_config.pop("backend", "flashattention")
+            min_sparse_prompt_len = extra_config.pop("min_sparse_prompt_len", 2048)
+
+            # Everything else goes to algorithm_extra_config
+            sparse_extra_config = extra_config
+        except json.JSONDecodeError as e:
+            logger.warning(
+                f"Failed to parse hierarchical_sparse_attention_extra_config: {e}"
+            )
+
+    config = SparseConfig(
+        algorithm=algorithm,
+        backend=backend,
+        page_size=server_args.page_size,
+        min_sparse_prompt_len=min_sparse_prompt_len,
+        sparse_extra_config=sparse_extra_config,
+    )
+    return config
+
+
+def create_sparse_coordinator(
+    device: torch.device,
+    req_to_token_pool,
+    token_to_kv_pool,
+    start_layer: int,
+    end_layer: int,
+    server_args,
+    **kwargs,
+) -> SparseCoordinator:
+    config = _parse_sparse_config(server_args)
+    algorithm = _create_sparse_algorithm(config, device, **kwargs)
+    backend_adaptor = _create_backend_adaptor(
+        config.backend, device, algorithm, req_to_token_pool
+    )
+
+    coordinator = SparseCoordinator(
+        config=config,
+        algorithm=algorithm,
+        backend_adaptor=backend_adaptor,
+        req_to_token_pool=req_to_token_pool,
+        token_to_kv_pool=token_to_kv_pool,
+        start_layer=start_layer,
+        end_layer=end_layer,
+        device=device,
+    )
+    register_sparse_coordinator(coordinator)
+    return coordinator
+
+
+def register_sparse_coordinator(coordinator: SparseCoordinator) -> None:
+    global _global_sparse_coordinator
+    _global_sparse_coordinator = coordinator
+
+
+def get_sparse_coordinator() -> Optional[SparseCoordinator]:
+    return _global_sparse_coordinator
diff --git a/sglang/python/sglang/srt/mem_cache/storage/__init__.py b/sglang/python/sglang/srt/mem_cache/storage/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..34ac35508e114a7b03a2215879fb5a6ccfdf1a07
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/storage/__init__.py
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to SGLang project
+
+"""Storage backend module for SGLang HiCache."""
+
+from .backend_factory import StorageBackendFactory
+
+__all__ = [
+    "StorageBackendFactory",
+]
diff --git a/sglang/python/sglang/srt/mem_cache/storage/aibrix_kvcache/README.md b/sglang/python/sglang/srt/mem_cache/storage/aibrix_kvcache/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..16941967f6d3045e0a39bfa0f218149b5f0dd772
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/storage/aibrix_kvcache/README.md
@@ -0,0 +1,37 @@
+# AIBrix KVCache as L3 KV Cache
+This document provides brief instructions for setting up a AIBrixKVCache storage backend +  AIBrixKVCache + SGLang runtime environment from scratch, describing how to utilize AIBrixKVCache as the L3 KV cache for SGLang.
+The process consists of three main steps:
+
+## Step1:Install AIbrix KVCache
+Refer to the [AIBrix KVCache documentation](https://github.com/vllm-project/aibrix/blob/main/python/aibrix_kvcache/README.md) to install  AIBrix KVCache.
+
+## Step2: Deploy AIBrix Distributed KVCache Storage
+
+AIBrix KVCache currently supports multiple distributed KVCache backends, including ByteDance's open-source Infinistore and the not-yet-open source PrisKV incubated by ByteDance's PrisDB & IAAS & DMI team.
+
+For the Infinistore installation process, please refer to [this link](https://github.com/bytedance/InfiniStore).
+
+PrisKV for AIBrix KVCache is currently in the open-source preparation stage, and no public documentation is available yet.
+
+
+## Step3: Deploy Model Serving
+
+For information on configuring a distributed KVCache backend for AIBrixKVCache, please refer to [this link](https://aibrix.readthedocs.io/latest/designs/aibrix-kvcache-offloading-framework.html)
+
+Using PrisKV as an example, the startup command is as follows:
+```bash
+export AIBRIX_KV_CACHE_OL_L1_CACHE_ENABLED="0"
+export AIBRIX_KV_CACHE_OL_L2_CACHE_BACKEND="PRIS"
+export AIBRIX_KV_CACHE_OL_PRIS_REMOTE_ADDR="127.0.0.1"
+export AIBRIX_KV_CACHE_OL_PRIS_REMOTE_PORT="6379"
+export AIBRIX_KV_CACHE_OL_PRIS_PASSWORD="kvcache-redis"
+MODEL_LENGTH=32768&&NCCL_MIN_NCHANNELS=24&&NCCL_IB_QPS_PER_CONNECTION=8&&NCCL_DEBUG=INFO \
+python3 -m sglang.launch_server \
+	--model-path /code/models/Qwen3-32B \
+	--host 0.0.0.0 --port 8080 \
+	--enable-hierarchical-cache \
+	--hicache-storage-backend aibrix \
+	--page-size 16 \
+	--hicache-write-policy write_back \
+	--enable-metrics --hicache-ratio=2
+```
diff --git a/sglang/python/sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py b/sglang/python/sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py
new file mode 100644
index 0000000000000000000000000000000000000000..bcc8271095c36d9f5955baa254e4e461ad3c8cf2
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py
@@ -0,0 +1,157 @@
+import logging
+from typing import Any, List, Optional
+
+import torch
+from aibrix_kvcache import (
+    BaseKVCacheManager,
+    BlockHashes,
+    KVCacheBlockLayout,
+    KVCacheBlockSpec,
+    KVCacheConfig,
+    KVCacheTensorSpec,
+    ModelSpec,
+)
+from aibrix_kvcache.common.absl_logging import log_every_n_seconds
+
+from sglang.srt.mem_cache.hicache_storage import (
+    HiCacheStorage,
+    HiCacheStorageConfig,
+    HiCacheStorageExtraInfo,
+)
+from sglang.srt.mem_cache.memory_pool_host import HostKVCache
+
+logger = logging.getLogger(__name__)
+
+
+class AibrixKVCacheStorage(HiCacheStorage):
+    def __init__(self, storage_config: HiCacheStorageConfig, mem_pool: HostKVCache):
+        if storage_config is not None:
+            self.is_mla_backend = storage_config.is_mla_model
+            self.local_rank = storage_config.tp_rank
+        else:
+            self.is_mla_backend = False
+            self.local_rank = 0
+        kv_cache = mem_pool.device_pool
+        self.page_size = mem_pool.page_size
+        self.kv_cache_dtype = kv_cache.dtype
+        self.layer_num = kv_cache.layer_num
+        self.kv_head_ids = [
+            self.local_rank * kv_cache.head_num + i for i in range(kv_cache.head_num)
+        ]
+        if not self.is_mla_backend:
+            self.layer_ids = range(
+                kv_cache.start_layer, kv_cache.end_layer
+            )  # for pipeline parallel
+
+            self.block_spec = KVCacheBlockSpec(
+                block_ntokens=self.page_size,
+                block_dtype=self.kv_cache_dtype,
+                block_layout=KVCacheBlockLayout(KVCacheBlockLayout.NCLD),
+                tensor_spec=KVCacheTensorSpec(
+                    heads=self.kv_head_ids,
+                    layers=self.layer_ids,
+                    head_size=kv_cache.head_dim,
+                ),
+            )
+            logger.info(self.block_spec)
+            config = KVCacheConfig(
+                block_spec=self.block_spec, model_spec=ModelSpec(102400)
+            )
+            self.kv_cache_manager = BaseKVCacheManager(config)
+        else:
+            raise NotImplementedError(
+                "MLA is not supported by AibrixKVCacheStorage yet."
+            )
+
+    def _aibrix_kvcache_metrics_report(self):
+        self.kv_cache_manager.metrics.summary()
+        self.kv_cache_manager.metrics.reset()
+
+    def batch_get(
+        self,
+        keys: List[str],
+        target_locations: List[torch.Tensor],
+        target_sizes: Optional[Any] = None,
+    ) -> List[torch.Tensor | None]:
+        block_hash = BlockHashes(keys, self.page_size)
+        status = self.kv_cache_manager.acquire(None, block_hash)
+        log_every_n_seconds(
+            logger, logging.INFO, self._aibrix_kvcache_metrics_report(), 1
+        )
+        if status.is_ok():
+            num_fetched_tokens, handle = status.value
+            kv_blocks = handle.to_tensors()
+            assert len(kv_blocks) == len(target_locations)
+            for i in range(len(kv_blocks)):
+                assert (
+                    target_locations[i].nbytes == kv_blocks[i].nbytes
+                ), f"{target_locations[i].nbytes}, {kv_blocks[i].nbytes}"
+                target_locations[i].copy_(kv_blocks[i].flatten())
+            handle.release()
+            return target_locations
+
+        return [None] * len(keys)
+
+    def get(
+        self,
+        key: str,
+        target_location: Optional[Any] = None,
+        target_size: Optional[Any] = None,
+    ) -> torch.Tensor | None:
+        return self.batch_get([key], [target_location], [target_size])[0]
+
+    def batch_set(
+        self,
+        keys: List[str],
+        values: Optional[Any] = None,
+        target_locations: Optional[Any] = None,
+        target_sizes: Optional[Any] = None,
+    ) -> bool:
+        block_hash = BlockHashes(keys, self.page_size)
+        status = self.kv_cache_manager.allocate_for(None, block_hash)
+        if not status.is_ok():
+            logger.warning(
+                f"aibrix_kvcache set allocate failed, error_code {status.error_code}"
+            )
+            return False
+        handle = status.value
+        tensors = handle.to_tensors()
+        if len(tensors) != len(values):
+            logger.warning("aibrix_kvcache set allocate not enough")
+            return False
+        for i in range(len(tensors)):
+            assert (
+                tensors[i].nbytes == values[i].nbytes
+            ), f"{tensors[i].nbytes}, {values[i].nbytes}"
+            tensors[i].reshape(values[i].shape).copy_(values[i]).reshape(
+                tensors[i].shape
+            )
+        status = self.kv_cache_manager.put(None, block_hash, handle)
+        if not status.is_ok():
+            logger.info(
+                f"AIBrix KVCache Storage set failed, error_code {status.error_code}"
+            )
+            return False
+        completed = status.value
+        return completed == len(keys) * self.page_size
+
+    def set(
+        self,
+        key: str,
+        value: Optional[Any] = None,
+        target_location: Optional[Any] = None,
+        target_size: Optional[Any] = None,
+    ) -> bool:
+        return self.batch_set([key], [value], [target_location], [target_size])
+
+    def batch_exists(
+        self, keys: List[str], extra_info: Optional[HiCacheStorageExtraInfo] = None
+    ) -> int:
+        block_hash = BlockHashes(keys, self.page_size)
+        status = self.kv_cache_manager.exists(None, block_hash)
+        if status.is_ok():
+            return status.value // self.page_size
+        return 0
+
+    def exists(self, key: str) -> bool | dict:
+        return self.batch_exists([key]) > 0
diff --git a/sglang/python/sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py b/sglang/python/sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..14494d819808697eac7c9655854dd9af012d08ff
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py
@@ -0,0 +1,97 @@
+import logging
+import os
+
+import torch
+import torch.distributed
+from aibrix_kvcache.common.absl_logging import log_every_n_seconds
+from aibrix_kvcache_storage import AibrixKVCacheStorage
+
+from sglang.srt.mem_cache.hicache_storage import HiCacheStorageConfig
+from sglang.srt.mem_cache.memory_pool import MHATokenToKVPool
+from sglang.srt.mem_cache.memory_pool_host import MHATokenToKVPoolHost
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+
+logger = logging.getLogger(__name__)
+
+
+def setup():
+    os.environ["RANK"] = "0"
+    os.environ["WORLD_SIZE"] = "1"
+    os.environ["MASTER_ADDR"] = "127.0.0.1"
+    os.environ["MASTER_PORT"] = "63886"
+
+
+class AIBrixKVCacheStorageTest:
+    def test_with_page_size(self):
+        config = HiCacheStorageConfig(
+            tp_rank=0,
+            tp_size=1,
+            is_mla_model=False,
+            is_page_first_layout=True,
+            model_name="test",
+        )
+        for page_size in range(1, 3):
+            logger.info(f"page_size: {page_size}")
+            batch_size = 2
+            head_num = 1
+            layer_num = 64
+            head_dim = 128
+            kv_cache = MHATokenToKVPool(
+                1024,
+                page_size,
+                torch.float16,
+                head_num,
+                head_dim,
+                layer_num,
+                "cpu",
+                False,
+                0,
+                layer_num,
+            )
+            mem_pool = MHATokenToKVPoolHost(kv_cache, 2, 0, page_size, "layer_first")
+            query_length = batch_size * 2
+            partial = batch_size
+            self.aibrix_kvcache = AibrixKVCacheStorage(config, mem_pool)
+            target_shape = (2, layer_num, page_size, head_num, head_dim)
+            rand_tensor = [
+                torch.rand(target_shape, dtype=torch.float16)
+                for _ in range(query_length)
+            ]
+            keys = ["hash" + str(i) for i in range(query_length)]
+            partial_keys = keys[batch_size:query_length]
+            assert self.aibrix_kvcache.batch_exists(keys) == 0
+            assert self.aibrix_kvcache.batch_set(keys, rand_tensor)
+            get_tensor = [
+                torch.rand(target_shape, dtype=torch.float16).flatten()
+                for _ in range(query_length)
+            ]
+            self.aibrix_kvcache.batch_get(keys, get_tensor)
+            for i in range(query_length):
+                assert torch.equal(get_tensor[i], rand_tensor[i].flatten())
+            ret = self.aibrix_kvcache.batch_exists(keys)
+            assert self.aibrix_kvcache.batch_exists(keys) == query_length
+            assert self.aibrix_kvcache.batch_exists(partial_keys) == partial
+            partial_get_tensor = [
+                torch.rand(target_shape, dtype=torch.float16).flatten()
+                for _ in range(partial)
+            ]
+            self.aibrix_kvcache.batch_get(partial_keys, partial_get_tensor)
+            for i in range(partial):
+                assert torch.equal(
+                    partial_get_tensor[i], rand_tensor[i + partial].flatten()
+                )
+            log_every_n_seconds(
+                logger,
+                logging.INFO,
+                self.aibrix_kvcache.kv_cache_manager.metrics.summary(),
+                1,
+            )
+
+
+if __name__ == "__main__":
+    setup()
+    test = AIBrixKVCacheStorageTest()
+    test.test_with_page_size()
diff --git a/sglang/python/sglang/srt/mem_cache/storage/backend_factory.py b/sglang/python/sglang/srt/mem_cache/storage/backend_factory.py
new file mode 100644
index 0000000000000000000000000000000000000000..eaa5a3e18c646d755b1995a3e8e58e5c61754408
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/storage/backend_factory.py
@@ -0,0 +1,223 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to SGLang project
+
+import importlib
+import logging
+from typing import TYPE_CHECKING, Any, Dict
+
+from sglang.srt.mem_cache.hicache_storage import HiCacheStorage, HiCacheStorageConfig
+
+if TYPE_CHECKING:
+    pass
+
+logger = logging.getLogger(__name__)
+
+
+class StorageBackendFactory:
+    """Factory for creating storage backend instances with support for dynamic loading."""
+
+    _registry: Dict[str, Dict[str, Any]] = {}
+
+    @staticmethod
+    def _load_backend_class(
+        module_path: str, class_name: str, backend_name: str
+    ) -> type[HiCacheStorage]:
+        """Load and validate a backend class from module path."""
+        try:
+            module = importlib.import_module(module_path)
+            backend_class = getattr(module, class_name)
+            if not issubclass(backend_class, HiCacheStorage):
+                raise TypeError(
+                    f"Backend class {class_name} must inherit from HiCacheStorage"
+                )
+            return backend_class
+        except ImportError as e:
+            raise ImportError(
+                f"Failed to import backend '{backend_name}' from '{module_path}': {e}"
+            ) from e
+        except AttributeError as e:
+            raise AttributeError(
+                f"Class '{class_name}' not found in module '{module_path}': {e}"
+            ) from e
+
+    @classmethod
+    def register_backend(cls, name: str, module_path: str, class_name: str) -> None:
+        """Register a storage backend with lazy loading.
+
+        Args:
+            name: Backend identifier
+            module_path: Python module path containing the backend class
+            class_name: Name of the backend class
+        """
+        if name in cls._registry:
+            logger.warning(f"Backend '{name}' is already registered, overwriting")
+
+        def loader() -> type[HiCacheStorage]:
+            """Lazy loader function to import the backend class."""
+            return cls._load_backend_class(module_path, class_name, name)
+
+        cls._registry[name] = {
+            "loader": loader,
+            "module_path": module_path,
+            "class_name": class_name,
+        }
+
+    @classmethod
+    def create_backend(
+        cls,
+        backend_name: str,
+        storage_config: HiCacheStorageConfig,
+        mem_pool_host: Any,
+        **kwargs,
+    ) -> HiCacheStorage:
+        """Create a storage backend instance.
+        Args:
+            backend_name: Name of the backend to create
+            storage_config: Storage configuration
+            mem_pool_host: Memory pool host object
+            **kwargs: Additional arguments passed to external backends
+        Returns:
+            Initialized storage backend instance
+        Raises:
+            ValueError: If backend is not registered and cannot be dynamically loaded
+            ImportError: If backend module cannot be imported
+            Exception: If backend initialization fails
+        """
+        # First check if backend is already registered
+        if backend_name in cls._registry:
+            registry_entry = cls._registry[backend_name]
+            backend_class = registry_entry["loader"]()
+            logger.info(
+                f"Creating storage backend '{backend_name}' "
+                f"({registry_entry['module_path']}.{registry_entry['class_name']})"
+            )
+            return cls._create_builtin_backend(
+                backend_name, backend_class, storage_config, mem_pool_host
+            )
+
+        # Try to dynamically load backend from extra_config
+        if backend_name == "dynamic" and storage_config.extra_config is not None:
+            backend_config = storage_config.extra_config
+            return cls._create_dynamic_backend(
+                backend_config, storage_config, mem_pool_host, **kwargs
+            )
+
+        # Backend not found
+        available_backends = list(cls._registry.keys())
+
+        raise ValueError(
+            f"Unknown storage backend '{backend_name}'. "
+            f"Registered backends: {available_backends}. "
+        )
+
+    @classmethod
+    def _create_dynamic_backend(
+        cls,
+        backend_config: Dict[str, Any],
+        storage_config: HiCacheStorageConfig,
+        mem_pool_host: Any,
+        **kwargs,
+    ) -> HiCacheStorage:
+        """Create a backend dynamically from configuration."""
+        required_fields = ["backend_name", "module_path", "class_name"]
+        for field in required_fields:
+            if field not in backend_config:
+                raise ValueError(
+                    f"Missing required field '{field}' in backend config for 'dynamic' backend"
+                )
+
+        backend_name = backend_config["backend_name"]
+        module_path = backend_config["module_path"]
+        class_name = backend_config["class_name"]
+
+        try:
+            # Import the backend class
+            backend_class = cls._load_backend_class(
+                module_path, class_name, backend_name
+            )
+
+            logger.info(
+                f"Creating dynamic storage backend '{backend_name}' "
+                f"({module_path}.{class_name})"
+            )
+
+            # Create the backend instance with storage_config
+            return backend_class(storage_config, kwargs)
+        except Exception as e:
+            logger.error(
+                f"Failed to create dynamic storage backend '{backend_name}': {e}"
+            )
+            raise
+
+    @classmethod
+    def _create_builtin_backend(
+        cls,
+        backend_name: str,
+        backend_class: type[HiCacheStorage],
+        storage_config: HiCacheStorageConfig,
+        mem_pool_host: Any,
+    ) -> HiCacheStorage:
+        """Create built-in backend with original initialization logic."""
+        if backend_name == "file":
+            return backend_class(storage_config)
+        elif backend_name == "nixl":
+            return backend_class(storage_config)
+        elif backend_name == "mooncake":
+            backend = backend_class(storage_config, mem_pool_host)
+            return backend
+        elif backend_name == "aibrix":
+            backend = backend_class(storage_config, mem_pool_host)
+            return backend
+        elif backend_name == "hf3fs":
+            # Calculate bytes_per_page based on memory pool layout
+            if mem_pool_host.layout in ["page_first", "page_first_direct"]:
+                bytes_per_page = (
+                    mem_pool_host.get_ksize_per_token() * mem_pool_host.page_size
+                )
+            elif mem_pool_host.layout == "layer_first":
+                bytes_per_page = (
+                    mem_pool_host.get_size_per_token() * mem_pool_host.page_size
+                )
+
+            dtype = mem_pool_host.dtype
+            return backend_class.from_env_config(bytes_per_page, dtype, storage_config)
+        elif backend_name == "eic":
+            return backend_class(storage_config, mem_pool_host)
+        else:
+            raise ValueError(f"Unknown built-in backend: {backend_name}")
+
+
+# Register built-in storage backends
+StorageBackendFactory.register_backend(
+    "file", "sglang.srt.mem_cache.hicache_storage", "HiCacheFile"
+)
+
+StorageBackendFactory.register_backend(
+    "nixl",
+    "sglang.srt.mem_cache.storage.nixl.hicache_nixl",
+    "HiCacheNixl",
+)
+
+StorageBackendFactory.register_backend(
+    "mooncake",
+    "sglang.srt.mem_cache.storage.mooncake_store.mooncake_store",
+    "MooncakeStore",
+)
+
+StorageBackendFactory.register_backend(
+    "hf3fs",
+    "sglang.srt.mem_cache.storage.hf3fs.storage_hf3fs",
+    "HiCacheHF3FS",
+)
+
+StorageBackendFactory.register_backend(
+    "aibrix",
+    "sglang.srt.mem_cache.storage.aibrix_kvcache.aibrix_kvcache_storage",
+    "AibrixKVCacheStorage",
+)
+
+StorageBackendFactory.register_backend(
+    "eic",
+    "sglang.srt.mem_cache.storage.eic.eic_storage",
+    "EICStorage",
+)
diff --git a/sglang/python/sglang/srt/mem_cache/storage/eic/README.md b/sglang/python/sglang/srt/mem_cache/storage/eic/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1f91d03781f299198b9160cfca6bca3cb0547042
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/storage/eic/README.md
@@ -0,0 +1,24 @@
+# EIC as sglang HiCache Storage
+EIC(Elastic Instant Cache) is a distributed database designed for LLM KV Cache. It supports RDMA, GDR and has the capabilities of distributed disaster tolerance and expansion.
+You can understand the principles and architecture of EIC through these articles: https://mp.weixin.qq.com/s/tasDqXf0Gxr3o_WCJ2IJUQ https://mp.weixin.qq.com/s/b_4YhTa96Zeklh23lv8qBw
+
+
+## Deploy EIC
+You can visit the official link https://console.volcengine.com/eic and deploy EIC KVCache on your compute cluster with web UI.In addition, we provide particular image in volcano engine, which integrates various optimizations based on the official image.
+You may use test_unit.py to detect the connectivity of EIC.
+
+
+
+## Deploy Model With EIC
+You can enable EIC KVCache offload with the official interface, such as
+
+```bash
+python -m sglang.launch_server \
+    --model-path [model_path]
+    --enable-hierarchical-cache \
+    --hicache-storage-backend eic \
+    --hicache-write-policy 'write_through' \
+    --hicache-mem-layout 'page_first' \
+
+```
+For more details, you can see https://www.volcengine.com/docs/85848/1749188 .
diff --git a/sglang/python/sglang/srt/mem_cache/storage/eic/eic_storage.py b/sglang/python/sglang/srt/mem_cache/storage/eic/eic_storage.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3cc1563257045163c48be924bab77edf0ffda75
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/storage/eic/eic_storage.py
@@ -0,0 +1,777 @@
+import json
+import logging
+import os
+import time
+from typing import Any, List, Optional, Tuple
+
+import eic
+import torch
+import yaml
+
+from sglang.srt.mem_cache.hicache_storage import (
+    HiCacheStorage,
+    HiCacheStorageConfig,
+    HiCacheStorageExtraInfo,
+)
+from sglang.srt.mem_cache.memory_pool_host import HostKVCache
+
+logger = logging.getLogger(__name__)
+
+
+TensorPoolSize = 2048
+
+REMOTE_EIC_YAML_ENV_VAR = "REMOTE_EIC_YAML"
+
+# gpu direct rdma for kv set
+G_EnableKVSetGPUDirect = False
+
+# gpu direct rdma for kv get
+G_EnableKVGetGPUDirect = False
+
+# gpu nic affinity
+G_EnableGPUNicAffinity = False
+
+# default H20 gpu nic affinity
+GPUNicAffinity = {
+    "cuda:0": "eth1",
+    "cuda:1": "eth1",
+    "cuda:2": "eth2",
+    "cuda:3": "eth2",
+    "cuda:4": "eth3",
+    "cuda:5": "eth3",
+    "cuda:6": "eth4",
+    "cuda:7": "eth4",
+}
+
+# default H20 cpu nic affinity
+CPUNicAffinity = {
+    "cuda:0": "cpu",
+    "cuda:1": "cpu",
+    "cuda:2": "cpu",
+    "cuda:3": "cpu",
+    "cuda:4": "cpu",
+    "cuda:5": "cpu",
+    "cuda:6": "cpu",
+    "cuda:7": "cpu",
+}
+
+
+def get_eic_config_file_path():
+    if os.environ.get(REMOTE_EIC_YAML_ENV_VAR) is not None:
+        logger.info(f"eic init with env var {REMOTE_EIC_YAML_ENV_VAR}")
+        config_file = os.environ.get(REMOTE_EIC_YAML_ENV_VAR)
+    else:
+        config_file = "/sgl-workspace/config/remote-eic.yaml"
+        logger.info(f"eic init with default config, config_file {config_file}")
+    return config_file
+
+
+class FlexibleKVCacheMemoryPool:
+    def __init__(self, conn, kvcache_shape, kvcache_dtype, device):
+        self.connection = conn
+
+        if device.startswith("cpu") and G_EnableGPUNicAffinity:
+            gpu_id = torch.cuda.current_device()
+            self.device = CPUNicAffinity["cuda:" + str(gpu_id)]
+            # current memory pool size is 5 times of CPU TensorPoolSize
+            mempool_size = TensorPoolSize * 5
+        else:
+            self.device = device
+            mempool_size = TensorPoolSize
+
+        self.kvcache_shape = kvcache_shape
+        self.kvcache_dtype = kvcache_dtype
+
+        self.kv_cache_numel = 1
+        for i in self.kvcache_shape:
+            self.kv_cache_numel *= i
+
+        self.free_data_addr = set()
+        self.data_ptr_to_index = dict()
+
+        if self.device.startswith("cpu"):
+            self.kvcache_mempool = torch.zeros(
+                (mempool_size,) + kvcache_shape,
+                dtype=kvcache_dtype,
+                device=self.device,
+                pin_memory=True,
+            )
+        else:
+            self.kvcache_mempool = torch.zeros(
+                (mempool_size,) + kvcache_shape, dtype=kvcache_dtype, device=self.device
+            )
+
+        for i in range(mempool_size):
+            self.free_data_addr.add(i)
+            self.data_ptr_to_index[self.kvcache_mempool[i].data_ptr()] = i
+
+        meminfo = eic.MemoryInfo()
+        meminfo.type = eic.MemoryType.MEMORY_CUDA
+        meminfo.cuda_id = 0
+        vals = eic.IOBuffers()
+        vals.append(
+            self.kvcache_mempool.data_ptr(),
+            self.kvcache_mempool.numel() * self.kvcache_mempool.element_size(),
+            True,
+        )
+        self.connection.register_memory(vals, meminfo)
+        logger.info(
+            f"allocate memory pool, size {self.kvcache_mempool.numel() * self.kvcache_mempool.element_size()}, device {self.device}"
+        )
+
+    def try_allocate_kv_cache(self, shape, dtype, count=1):
+        if len(self.free_data_addr) < count:
+            return None
+
+        numel = 1
+        for i in shape:
+            numel *= i
+        if numel != self.kv_cache_numel or dtype != self.kvcache_dtype:
+            logger.error(
+                f"allocate from mempool failed, self.kvcache_shape {self.kvcache_shape}, dtype {self.kvcache_dtype}, require shape {shape}, dtype {dtype}"
+            )
+            return None
+
+        ret = []
+        for _ in range(count):
+            free_index = self.free_data_addr.pop()
+            ret.append(self.kvcache_mempool[free_index])
+        return ret
+
+    def free_to_mempool(self, data_ptr):
+        if data_ptr not in self.data_ptr_to_index:
+            logger.error(
+                f"free_to_mempool failed, data_ptr {data_ptr} not in allocated_data_addr"
+            )
+            return
+        self.free_data_addr.add(self.data_ptr_to_index[data_ptr])
+
+    def check_data_ptr_allocated(self, data_ptr):
+        return data_ptr in self.data_ptr_to_index
+
+    def left_count(self):
+        return len(self.free_data_addr)
+
+
+class EICStorage(HiCacheStorage):
+    def __init__(
+        self, hicache_config: HiCacheStorageConfig, memory_pool_host: HostKVCache
+    ):
+        global G_EnableKVSetGPUDirect, G_EnableKVGetGPUDirect
+        global GPUNicAffinity, CPUNicAffinity, G_EnableGPUNicAffinity
+
+        config_file = get_eic_config_file_path()
+        if os.path.exists(config_file) is False:
+            logger.error(f"config file {config_file} not exists")
+            raise RuntimeError(f"eic config file {config_file} not exists")
+
+        with open(config_file, "r") as fin:
+            config = yaml.safe_load(fin)
+
+        remote_url = config.get("remote_url", None)
+        if remote_url is None:
+            AssertionError("remote_url is None")
+
+        endpoint = remote_url[len("eic://") :]
+
+        logger.info(f"eic remote_url:" + remote_url + " endpoint: " + endpoint)
+
+        eic_instance_id = config.get("eic_instance_id", None)
+        logger.info(f"eic instance_id: {eic_instance_id}")
+
+        eic_thread_num = config.get("eic_thread_num", 1)
+        logger.info(f"eic thread_num: {eic_thread_num}")
+
+        eic_log_dir = config.get("eic_log_dir", None)
+        logger.info(f"eic log_dir: {eic_log_dir}")
+
+        eic_log_level = config.get("eic_log_level", 2)
+        logger.info(f"eic log_level: {eic_log_level}")
+
+        eic_trans_type = config.get("eic_trans_type", 3)
+        logger.info(f"eic trans_type: {eic_trans_type}")
+
+        eic_flag_file = config.get("eic_flag_file", None)
+        logger.info(f"eic flag_file: {eic_flag_file}")
+
+        # GDR now is not used
+        G_EnableKVSetGPUDirect = (
+            config.get("enable_kvset_gpu_direct", False) and torch.cuda.is_available()
+        )
+        logger.debug(f"eic enable_kvset_gpu_direct: {G_EnableKVSetGPUDirect}")
+
+        G_EnableKVGetGPUDirect = (
+            config.get("enable_kvget_gpu_direct", False) and torch.cuda.is_available()
+        )
+        logger.debug(f"eic enable_kvget_gpu_direct: {G_EnableKVGetGPUDirect}")
+
+        self.model_name = hicache_config.model_name
+
+        # rdma
+        enable_kv_set_direct = config.get("enable_kvset_direct", True)
+        logger.info(f"eic enable_kv_set_direct: {enable_kv_set_direct}")
+        self.enable_kv_set_direct = enable_kv_set_direct
+
+        enable_kv_get_direct = config.get("enable_kvget_direct", True)
+        logger.info(f"eic enable_kv_get_direct: {enable_kv_get_direct}")
+        self.enable_kv_get_direct = enable_kv_get_direct
+
+        # gpu nic affinity
+        G_EnableGPUNicAffinity = config.get("enable_gpu_nic_affinity", False)
+        logger.info(f"eic enable_gpu_nic_affinity: {G_EnableGPUNicAffinity}")
+        self.enable_gpu_nic_affinity = G_EnableGPUNicAffinity
+
+        if G_EnableGPUNicAffinity:
+            if "gpu_nic_affinity_config" in config:
+                GPUNicAffinity = json.loads(config["gpu_nic_affinity_config"])
+            if "cpu_nic_affinity_config" in config:
+                CPUNicAffinity = json.loads(config["cpu_nic_affinity_config"])
+            logger.info(f"eic gpu nic affinity {GPUNicAffinity}")
+            logger.info(f"eic cpu nic affinity {CPUNicAffinity}")
+
+        eic_namespace = config.get("eic_namespace", "")
+        logger.info(f"eic namespace: {eic_namespace}")
+        self.eic_namespace = eic_namespace
+
+        if not os.path.exists(eic_log_dir) and not os.path.isdir(eic_log_dir):
+            os.makedirs(eic_log_dir, exist_ok=True)
+
+        self.connection = eic.Client()
+        init_option = eic.InitOption()
+        init_option.log_dir = eic_log_dir
+        init_option.log_level = eic.LogLevel(eic_log_level)
+        init_option.transport_type = eic.TransportType(eic_trans_type)
+        init_option.flag_file = eic_flag_file
+
+        if G_EnableGPUNicAffinity:
+            gpu_id = torch.cuda.current_device()
+            init_option.multi_net_local_interface_names = GPUNicAffinity[
+                "cuda:" + str(gpu_id)
+            ]
+            logger.info(
+                f"gpu {gpu_id} set gpu nic affinity to {init_option.multi_net_local_interface_names}"
+            )
+
+        ret = self.connection.init(eic_instance_id, endpoint, init_option)
+        if ret != 0:
+            logger.error(f"fail to init eic client, ret: {ret}")
+            raise RuntimeError("EIC Client Init Failed.")
+        self.warmup()
+
+        self.memory_pool_host = memory_pool_host
+        self.host_kvcache_layout = self.memory_pool_host.layout
+        self.trans_type = eic.TransportType(eic_trans_type)
+        self.kv_cache_dtype = self.memory_pool_host.dtype
+        self.is_mla_model = hicache_config.is_mla_model
+        self.rank = hicache_config.tp_rank
+        self.world_size = hicache_config.tp_size
+        self.page_size = self.memory_pool_host.page_size
+        self.use_zero_copy = self.memory_pool_host.layout == "page_first"
+        if not self.use_zero_copy:
+            self.kv_cache_shape = self.memory_pool_host.get_data_page(
+                0, flat=True
+            ).shape
+            if self.enable_kv_set_direct:
+                self.kv_cache_write_mem_pool = FlexibleKVCacheMemoryPool(
+                    self.connection, self.kv_cache_shape, self.kv_cache_dtype, "cpu"
+                )
+            if self.enable_kv_get_direct:
+                self.kv_cache_get_mem_pool = FlexibleKVCacheMemoryPool(
+                    self.connection, self.kv_cache_shape, self.kv_cache_dtype, "cpu"
+                )
+        self._init_eic_prefix()
+
+    def warmup(self):
+        logger.info("begin warm up eic client")
+        start_time = time.perf_counter()
+        num_warmup = 1024
+        preheat_keys = ["warmup_key_" + str(i) for i in range(num_warmup)]
+        batch_size = 32
+        for i in range(0, num_warmup, batch_size):
+            keys_vec = eic.StringVector()
+            for key in preheat_keys[i : i + batch_size]:
+                keys_vec.append(key)
+            exist_option = eic.ExistOption()
+            _, _ = self.connection.mexist(keys_vec, exist_option)
+        logger.info(
+            f"finish eic client warm up, warm up cost {time.perf_counter() - start_time:.2f} seconds"
+        )
+
+    def register_mem_pool_host(self, memory_pool_host: HostKVCache) -> None:
+        # no need judge meminfo type, cuda_id, etc.
+        meminfo = eic.MemoryInfo()
+        meminfo.type = eic.MemoryType.MEMORY_CUDA
+        meminfo.cuda_id = 0
+        vals = eic.IOBuffers()
+        buffer = memory_pool_host.kv_buffer
+        vals.append(
+            buffer.data_ptr(),
+            buffer.numel() * buffer.element_size(),
+            True,
+        )
+        self.connection.register_memory(vals, meminfo)
+
+    def _init_eic_prefix(self):
+        if self.is_mla_model:
+            self.eic_prefix = (
+                f"{self.model_name}_mla_att_{self.host_kvcache_layout}@sglang"
+            )
+        else:
+            self.eic_prefix = f"{self.model_name}_mha_attn_{self.host_kvcache_layout}_{self.rank}_{self.world_size}_@sglang"
+
+    def _get_eic_key(self, keys: List[str]) -> str:
+        return [f"{self.eic_prefix}_{key}" for key in keys]
+
+    def set(
+        self,
+        key: str,
+        value: Optional[Any] = None,
+        target_location: Optional[Any] = None,
+        target_size: Optional[Any] = None,
+    ) -> bool:
+        # now is not used
+        if self.use_zero_copy:
+            return self.zero_copy_batch_set([key], [target_location])
+        else:
+            return self.generic_batch_set([key], [value])
+
+    # target_locations and target_sizes are not used for now
+    def batch_set(
+        self,
+        keys: List[str],
+        values: Optional[Any] = None,
+        target_locations: Optional[Any] = None,
+        target_sizes: Optional[Any] = None,
+    ) -> bool:
+        if len(keys) == 0:
+            return True
+        if self.use_zero_copy:
+            return self.zero_copy_batch_set(keys, values)
+        else:
+            return self.generic_batch_set(keys, values)
+
+    def get(
+        self,
+        key,
+        target_location: Optional[Any] = None,
+        target_size: Optional[Any] = None,
+    ) -> torch.Tensor | None:
+        # now is not used
+        if self.use_zero_copy:
+            return self.zero_copy_batch_get([key], [target_location])
+        else:
+            return self.generic_batch_get([key], [target_location])
+
+    # use for v1 interface, and shound not be called directly
+    def batch_get(
+        self,
+        keys: List[str],
+        target_locations: Optional[Any] = None,
+        target_sizes: Optional[Any] = None,
+    ) -> List[torch.Tensor | None]:
+        assert len(keys) == len(target_locations)
+        if len(keys) == 0:
+            return None
+        if self.use_zero_copy:
+            return self.zero_copy_batch_get(keys, target_locations)
+        else:
+            return self.generic_batch_get(keys, target_locations)
+
+    def _batch_exists_impl(self, keys) -> List[bool]:
+        if len(keys) == 0:
+            return 0
+        eic_keys = self._get_eic_key(keys)
+        logger.debug(f"eic exists {len(keys)}")
+        result = []
+        exist_bs = 1024
+        for i in range(0, len(eic_keys), exist_bs):
+            batch_keys = eic_keys[i : i + exist_bs]
+            keys_vec = eic.StringVector()
+            for key in batch_keys:
+                keys_vec.append(key)
+            exist_option = eic.ExistOption()
+            exist_option.ns = self.eic_namespace
+            status_code, exist_outcome = self.connection.mexist(keys_vec, exist_option)
+            if status_code != eic.StatusCode.SUCCESS:
+                logger.error(
+                    f"eic exists {len(keys)} failed, status_code {status_code}"
+                )
+                result.extend([False] * len(batch_keys))
+            for err_code in exist_outcome.status_codes:
+                result.append(err_code == eic.StatusCode.SUCCESS)
+        return result
+
+    def exists(self, key) -> bool:
+        exist_num = self.batch_exists([key])
+        return exist_num == 1
+
+    def batch_exists(
+        self, keys, extra_info: Optional[HiCacheStorageExtraInfo] = None
+    ) -> int:
+        if len(keys) == 0:
+            return 0
+        if self.use_zero_copy and not self.is_mla_model:
+            keys = self._get_mha_zero_copy_keys(keys)
+        exist_mask = self._batch_exists_impl(keys)
+        prefix_success = 0
+        for exist in exist_mask:
+            if exist:
+                prefix_success += 1
+            else:
+                break
+        if not self.is_mla_model and self.use_zero_copy:
+            prefix_success = prefix_success // 2
+        return prefix_success
+
+    def delete(self, key) -> None:
+        eic_keys = self._get_eic_key([key])
+        keys_vec = eic.StringVector()
+        for eic_key in eic_keys:
+            keys_vec.append(eic_key)
+        del_option = eic.DelOption()
+        self.connection.mdel(keys_vec, del_option)
+
+    def clear(self) -> None:
+        return
+
+    # Not used for now
+    def _filter_kv_cache(self, total_len) -> Tuple[int, int]:
+        mean_len = total_len // self.world_size
+        remainder = total_len % self.world_size
+        tp_keys_len = mean_len + (1 if self.rank < remainder else 0)
+        start = self.rank * mean_len + min(self.rank, remainder)
+        end = start + tp_keys_len
+        logger.debug(f"start: {start}, end: {end}, tp_keys_len: {tp_keys_len}")
+        return start, end
+
+    def zero_copy_batch_set(self, keys: List[str], values: List[torch.Tensor]) -> bool:
+        logger.debug(f"eic zero copy set {len(keys)} keys")
+        if len(keys) == 0:
+            return True
+        eic_keys = self._get_eic_key(keys)
+        keys_vec = eic.StringVector()
+        vals_vec = eic.IOBuffers()
+        # set data key & value
+        for i, key in enumerate(eic_keys):
+            # set data key & value
+            keys_vec.append(key)
+            vals_vec.append(
+                values[i].data_ptr(),
+                values[i].element_size() * values[i].numel(),
+                True,
+            )
+        # set options
+        set_option = eic.SetOption()
+        set_option.ns = self.eic_namespace
+        set_option.ttl_second = -1
+        status_code, set_outcome = self.connection.mset(keys_vec, vals_vec, set_option)
+        if status_code != eic.StatusCode.SUCCESS:
+            logger.error(f"eic mset {len(keys)} failed, status_code {status_code}")
+            return [False] * len(keys)
+        else:
+            logger.debug(f"eic zero copy mset {len(keys)} success")
+        return [True] * len(keys)
+
+    def zero_copy_batch_get(
+        self, keys: List[str], values: List[torch.Tensor]
+    ) -> List[bool]:
+        logger.debug(f"eic zero copy get {len(keys)} keys")
+        # Get Data: generate data keys and vals
+        get_data_start_time = time.perf_counter()
+        eic_keys = self._get_eic_key(keys)
+        data_keys = eic.StringVector()
+        data_vals = eic.IOBuffers()
+        success_mask = [True] * len(keys)
+        count = len(keys)
+        for i, key in enumerate(eic_keys):
+            data_keys.append(key)
+            data_vals.append(
+                values[i].data_ptr(),
+                values[i].element_size() * values[i].numel(),
+                True,
+            )
+
+        # Get data: recv data buffer tensor
+        get_option = eic.GetOption()
+        get_option.ns = self.eic_namespace
+        status_code, data_vals, get_outcome = self.connection.mget(
+            data_keys, get_option, data_vals
+        )
+
+        if status_code != eic.StatusCode.SUCCESS:
+            if status_code == eic.StatusCode.PARTIAL_FAILED:
+                for i, err_code in enumerate(get_outcome.status_codes):
+                    success = err_code == eic.StatusCode.SUCCESS
+                    if success:
+                        logger.debug(f"eic get data {eic_keys[i]} success")
+                    else:
+                        logger.error(
+                            f"eic get data {eic_keys[i]} failed, err_code {err_code}"
+                        )
+                        success_mask[i] = False
+            else:
+                logger.error(
+                    f"eic mget {len(eic_keys)} keys failed, status_code {status_code}"
+                )
+                success_mask = [False] * len(keys)
+                return success_mask
+
+        get_data_end_time = time.perf_counter()
+        get_data_execution_time = (get_data_end_time - get_data_start_time) * 1e6
+        logger.debug(f"eic get {count} keys data cost %.2f us", get_data_execution_time)
+        return success_mask
+
+    def generic_batch_set(
+        self,
+        keys: List[str],
+        values: List[torch.Tensor],
+    ) -> List[bool]:
+        assert len(keys) == len(values)
+        logger.debug(f"eic generic set {len(keys)} keys")
+        if len(keys) == 0:
+            return True
+        eic_keys = self._get_eic_key(keys)
+        keys_vec = eic.StringVector()
+        vals_vec = eic.IOBuffers()
+        count = len(keys)
+        registered = False
+        items = []
+        if self.enable_kv_set_direct:
+            values_data_ptrs = []
+            items = self.kv_cache_write_mem_pool.try_allocate_kv_cache(
+                self.kv_cache_shape, self.kv_cache_dtype, count
+            )
+            if items is None:
+                logger.warning("can not allocate tensor from pool")
+                for i, value in enumerate(values):
+                    values_data_ptrs.append(
+                        (value.data_ptr(), value.element_size() * value.numel(), False)
+                    )
+            else:
+                objs = items
+                registered = True
+                for i, key in enumerate(eic_keys):
+                    temp = objs[i].reshape(values[i].shape).contiguous()
+                    temp.copy_(values[i])
+                    if temp.data_ptr() != objs[i].data_ptr():
+                        registered = False
+                        temp = temp.cpu()
+                    values_data_ptrs.append(
+                        (
+                            temp.data_ptr(),
+                            temp.element_size() * temp.numel(),
+                            registered,
+                        )
+                    )
+
+            for i, key in enumerate(eic_keys):
+                keys_vec.append(key)
+                data_ptr, data_size, registered = values_data_ptrs[i]
+                vals_vec.append(data_ptr, data_size, registered)
+        else:
+            # use tensor direct
+            for i, key in enumerate(eic_keys):
+                keys_vec.append(key)
+                vals_vec.append(
+                    values[i].data_ptr(),
+                    values[i].element_size() * values[i].numel(),
+                    False,
+                )
+
+        # set options
+        set_option = eic.SetOption()
+        set_option.ns = self.eic_namespace
+        set_option.ttl_second = -1
+        status_code, set_outcome = self.connection.mset(keys_vec, vals_vec, set_option)
+        if status_code != eic.StatusCode.SUCCESS:
+            logger.error(f"eic mset {len(eic_keys)} failed, status_code {status_code}")
+        else:
+            logger.debug(f"eic mset {len(eic_keys)} success")
+
+        if self.enable_kv_set_direct and items is not None:
+            for item in items:
+                self.kv_cache_write_mem_pool.free_to_mempool(item.data_ptr())
+
+        err_code = set_outcome.status_codes[0]
+        if err_code != eic.StatusCode.SUCCESS:
+            logger.error(f"set data key {len(eic_keys)} failed, err_code {err_code}")
+            return [False] * len(keys)
+
+        logger.debug(f"set data key {len(eic_keys)} success")
+        return [True] * len(keys)
+
+    def generic_batch_get(
+        self, keys: List[str], buffers: List[torch.Tensor]
+    ) -> List[bool]:
+        # all success or all fail
+        logger.debug(f"eic generic get {len(keys)} keys")
+        eic_keys = self._get_eic_key(keys)
+        get_data_start_time = time.perf_counter()
+        data_keys = eic.StringVector()
+        data_vals = eic.IOBuffers()
+        count = len(eic_keys)
+        registered = False
+        items = []
+        success_mask = [True] * len(keys)
+        if self.enable_kv_get_direct:
+            items = self.kv_cache_get_mem_pool.try_allocate_kv_cache(
+                self.kv_cache_shape, self.kv_cache_dtype, count
+            )
+            if items is None:
+                logger.warning("can not allocate tensor from pool")
+                for i, key in enumerate(eic_keys):
+                    data_keys.append(key)
+                    data_vals.append(
+                        buffers[i].data_ptr(),
+                        buffers[i].element_size() * buffers[i].numel(),
+                        False,
+                    )
+            else:
+                registered = True
+                for i, key in enumerate(eic_keys):
+                    data_keys.append(key)
+                    data_vals.append(
+                        items[i].data_ptr(),
+                        items[i].element_size() * items[i].numel(),
+                        registered,
+                    )
+
+        else:
+            for i, key in enumerate(eic_keys):
+                data_keys.append(key)
+                data_vals.append(
+                    buffers[i].data_ptr(),
+                    buffers[i].element_size() * buffers[i].numel(),
+                    False,
+                )
+
+        # Get data: recv data buffer tensor
+        get_option = eic.GetOption()
+        get_option.ns = self.eic_namespace
+        status_code, data_vals, get_outcome = self.connection.mget(
+            data_keys, get_option, data_vals
+        )
+
+        if status_code != eic.StatusCode.SUCCESS:
+            if status_code == eic.StatusCode.PARTIAL_FAILED:
+                for i, err_code in enumerate(get_outcome.status_codes):
+                    success = err_code == eic.StatusCode.SUCCESS
+                    if success:
+                        logger.debug(f"eic get data {eic_keys[i]} success")
+                    else:
+                        logger.error(
+                            f"eic get data {eic_keys[i]} failed, err_code {err_code}"
+                        )
+                        success_mask[i] = False
+            else:
+                logger.error(
+                    f"eic mget {len(eic_keys)} keys failed, status_code {status_code}"
+                )
+                success_mask = [False] * len(keys)
+
+        if registered:
+            for i, item in enumerate(items):
+                if success_mask[i]:
+                    buffers[i].copy_(item)
+                self.kv_cache_get_mem_pool.free_to_mempool(item.data_ptr())
+
+        get_data_end_time = time.perf_counter()
+        get_data_execution_time = (get_data_end_time - get_data_start_time) * 1e6
+        logger.debug(f"eic get {count} keys data cost %.2f us", get_data_execution_time)
+        return success_mask
+
+    def _get_mha_zero_copy_keys(self, keys: List[str]) -> List[str]:
+        new_keys = []
+        for k in keys:
+            new_keys.append(f"{k}_k")
+            new_keys.append(f"{k}_v")
+        return new_keys
+
+    def _get_mha_zero_copy_values(
+        self, values: List[torch.Tensor]
+    ) -> List[torch.Tensor]:
+        new_values = []
+        for value in values:
+            new_values.append(value[0])
+            new_values.append(value[1])
+        return new_values
+
+    def _batch_get_preprocess(self, keys, host_indices):
+        page_num = len(host_indices) // self.page_size
+        # use memory pool directly or dummy page
+        values = (
+            [
+                self.memory_pool_host.get_data_page(
+                    host_indices[i * self.page_size], flat=False
+                )
+                for i in range(page_num)
+            ]
+            if self.use_zero_copy
+            else [
+                self.memory_pool_host.get_dummy_flat_data_page()
+                for _ in range(page_num)
+            ]
+        )
+
+        if self.use_zero_copy and not self.is_mla_model:
+            keys = self._get_mha_zero_copy_keys(keys)
+            values = self._get_mha_zero_copy_values(values)
+
+        return keys, values
+
+    def _batch_get_postprocess(self, host_indices, values, results):
+        page_num = len(host_indices) // self.page_size
+
+        if self.use_zero_copy:
+            if not self.is_mla_model:
+                results = [
+                    (results[2 * i] and results[2 * i + 1]) for i in range(page_num)
+                ]
+                results = results[:page_num]
+            return results
+
+        # dummy page copy to host memory pool
+        for i in range(page_num):
+            if not results[i]:
+                break
+            self.memory_pool_host.set_from_flat_data_page(
+                host_indices[i * self.memory_pool_host.page_size], values[i]
+            )
+
+        return results
+
+    def batch_get_v1(
+        self,
+        keys: List[str],
+        host_indices: torch.Tensor,
+        extra_info: Optional[HiCacheStorageExtraInfo] = None,
+    ) -> List[bool]:
+        keys, values = self._batch_get_preprocess(keys, host_indices)
+        results = self.batch_get(keys, values)
+        return self._batch_get_postprocess(host_indices, values, results)
+
+    def _batch_set_preprocess(self, keys, host_indices):
+        page_num = len(host_indices) // self.page_size
+        flat = not self.use_zero_copy
+        values = [
+            self.memory_pool_host.get_data_page(
+                host_indices[i * self.page_size], flat=flat
+            )
+            for i in range(page_num)
+        ]
+
+        if self.use_zero_copy and not self.is_mla_model:
+            keys = self._get_mha_zero_copy_keys(keys)
+            values = self._get_mha_zero_copy_values(values)
+
+        return keys, values
+
+    def batch_set_v1(
+        self,
+        keys: List[str],
+        host_indices: torch.Tensor,
+        extra_info: Optional[HiCacheStorageExtraInfo] = None,
+    ) -> List[bool]:
+        keys, values = self._batch_set_preprocess(keys, host_indices)
+        results = self.batch_set(keys, values)
+        return results
diff --git a/sglang/python/sglang/srt/mem_cache/storage/eic/test_unit.py b/sglang/python/sglang/srt/mem_cache/storage/eic/test_unit.py
new file mode 100644
index 0000000000000000000000000000000000000000..03d348ad8fc8dd53a6bf41e8699a0dc0ef57c807
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/storage/eic/test_unit.py
@@ -0,0 +1,115 @@
+import argparse
+import os
+
+import eic
+import torch
+import yaml
+
+
+def pase_args():
+    parser = argparse.ArgumentParser(description="EIC Storage Unit Test")
+    parser.add_argument(
+        "--config",
+        "-c",
+        type=str,
+        default="/sgl-workspace/config/remote-eic.yaml",
+        help="EIC yaml config",
+    )
+    args, _ = parser.parse_known_args()
+    return args
+
+
+def init_eic_client():
+    args = pase_args()
+    config_path = os.path.abspath(args.config)
+    if not os.path.exists(config_path):
+        raise FileNotFoundError(f"Config file not found: {config_path}")
+    with open(config_path, "r") as fin:
+        config = yaml.safe_load(fin)
+
+    remote_url = config.get("remote_url", None)
+    if remote_url is None:
+        AssertionError("remote_url is None")
+    endpoint = remote_url[len("eic://") :]
+    eic_instance_id = config.get("eic_instance_id", None)
+    eic_log_dir = config.get("eic_log_dir", None)
+    eic_log_level = config.get("eic_log_level", 2)
+    eic_trans_type = config.get("eic_trans_type", 3)
+    eic_flag_file = config.get("eic_flag_file", None)
+
+    if not os.path.exists(eic_log_dir):
+        os.makedirs(eic_log_dir, exist_ok=True)
+    eic_client = eic.Client()
+    init_option = eic.InitOption()
+    init_option.log_dir = eic_log_dir
+    init_option.log_level = eic.LogLevel(eic_log_level)
+    init_option.transport_type = eic.TransportType(eic_trans_type)
+    init_option.flag_file = eic_flag_file
+    ret = eic_client.init(eic_instance_id, endpoint, init_option)
+    if ret != 0:
+        raise RuntimeError(f"EIC Client init failed with error code: {ret}")
+    return eic_client
+
+
+def test_set(eic_client):
+    test_key = ["test_key_" + str(i) for i in range(16)]
+    tensors = [
+        torch.ones([12, 6, 1, 512], dtype=torch.bfloat16, device="cpu")
+        for _ in range(16)
+    ]
+    data_keys = eic.StringVector()
+    data_vals = eic.IOBuffers()
+    for i in range(16):
+        data_keys.append(test_key[i])
+        data_vals.append(
+            tensors[i].data_ptr(), tensors[i].numel() * tensors[i].element_size(), False
+        )
+    set_opt = eic.SetOption()
+    set_opt.ttl_second = 3
+    status_code, set_outcome = eic_client.mset(data_keys, data_vals, set_opt)
+    assert (
+        status_code == eic.StatusCode.SUCCESS
+    ), f"Set failed with status code: {status_code}"
+
+
+def test_get(eic_client):
+    test_key = ["test_key_" + str(i) for i in range(16)]
+    tensors = [
+        torch.zeros([12, 6, 1, 512], dtype=torch.bfloat16, device="cpu")
+        for _ in range(16)
+    ]
+    data_keys = eic.StringVector()
+    data_vals = eic.IOBuffers()
+    for i in range(16):
+        data_keys.append(test_key[i])
+        data_vals.append(
+            tensors[i].data_ptr(), tensors[i].numel() * tensors[i].element_size(), False
+        )
+    get_opt = eic.GetOption()
+    status_code, data_vals, get_outcome = eic_client.mget(data_keys, get_opt, data_vals)
+    assert (
+        status_code == eic.StatusCode.SUCCESS
+    ), f"Get failed with status code: {status_code}"
+
+
+def test_exists(eic_client):
+    test_key = ["test_key_" + str(i) for i in range(16)]
+    data_keys = eic.StringVector()
+    for key in test_key:
+        data_keys.append(key)
+    exists_opt = eic.ExistOption()
+    status_code, exists_outcome = eic_client.mexist(data_keys, exists_opt)
+    assert (
+        status_code == eic.StatusCode.SUCCESS
+    ), f"Exists failed with status code: {status_code}"
+
+
+def main():
+    eic_client = init_eic_client()
+    test_set(eic_client)
+    test_exists(eic_client)
+    test_get(eic_client)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/sglang/python/sglang/srt/mem_cache/storage/hf3fs/docs/README.md b/sglang/python/sglang/srt/mem_cache/storage/hf3fs/docs/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..2423b7135a2d1f2249fc22defee54fee5ebed9d5
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/storage/hf3fs/docs/README.md
@@ -0,0 +1,71 @@
+# Using HF3FS as L3 Global KV Cache
+
+This document provides step-by-step instructions for setting up a k8s + 3FS + SGLang runtime environment from scratch, describing how to utilize deepseek-hf3fs as the L3 KV cache for SGLang.
+The process consists of five main steps:
+
+## Step 1: Install deepseek-3fs via 3fs-Operator
+Refer to the [3fs-operator documentation](https://github.com/aliyun/kvc-3fs-operator/blob/main/README_en.md) to deploy 3FS components in your Kubernetes environment using the Operator with one-click deployment.
+
+## Step 2: Launch SGLang Pod
+Start your SGLang Pod while specifying 3FS-related labels in the YAML configuration. Follow the [fuse-client-creation guide](https://github.com/aliyun/kvc-3fs-operator/blob/main/README_en.md#fuse-client-creation).
+
+## Step 3: Configure Usrbio Client in SGLang Pod
+The Usrbio client is required for accessing 3FS. Install it in your SGLang Pod using either method below:
+
+**Alternative 1 (Recommend):** Built from the source code, the following provides quick installation commands (refer to [setup_usrbio_client.md](setup_usrbio_client.md))
+
+```
+set -e; \
+. /etc/os-release; \
+case "$VERSION_ID" in \
+  "22.04") \
+    CLANG_VERSION="14"; \
+    GIT_BRANCH=main; \
+    GIT_COMMIT_ID=6f029c439d0d22995900ca357d51b37975c6ffb5; \
+    ;; \
+  "24.04") \
+    CLANG_VERSION="18"; \
+    GIT_BRANCH="ubuntu24.04"; \
+    GIT_COMMIT_ID=d0cf83a42395cdb2a66d3ce83cb0a11a46bee9f3; \
+  ;; \
+  *) \
+    echo "Unsupported Ubuntu version: $VERSION_ID"; \
+    exit 1; \
+  ;; \
+esac; \
+apt-get update && apt-get install -y --no-install-recommends \
+        clang-format-$CLANG_VERSION clang-$CLANG_VERSION clang-tidy-$CLANG_VERSION lld-$CLANG_VERSION meson google-perftools \
+        libaio-dev libdouble-conversion-dev libdwarf-dev libgflags-dev libgmock-dev libgoogle-perftools-dev liblz4-dev liblzma-dev libuv1-dev \
+        && rm -rf /var/lib/apt/lists/* \
+        && apt-get clean \
+        && git clone https://github.com/novitalabs/3FS.git -b $GIT_BRANCH 3fs \
+        && cd 3fs \
+        && git checkout $GIT_COMMIT_ID \
+        && git submodule update --init --recursive \
+        && ./patches/apply.sh \
+        && CMAKE_BUILD_PARALLEL_LEVEL=32 python3 setup.py bdist_wheel -d dist \
+        && pip install dist/*.whl \
+        && cd .. \
+        && rm -rf 3fs
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages
+```
+
+**Alternative 2:** Run `pip3 install hf3fs-py-usrbio` (Follow https://pypi.org/project/hf3fs-py-usrbio/#files)
+
+## Step 4: Deploy Model Serving
+
+### Single Node Deployment
+```bash
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages
+python3 -m sglang.launch_server \
+    --model-path /code/models/Qwen3-32B/ \
+    --host 0.0.0.0 --port 10000 \
+    --page-size 64 \
+    --enable-hierarchical-cache \
+    --hicache-ratio 2 --hicache-size 0 \
+    --hicache-write-policy write_through \
+    --hicache-storage-backend hf3fs
+```
+
+### Multi-Node Deployment (Shared KV Cache)
+Follow the [deploy_sglang_3fs_multinode.md](deploy_sglang_3fs_multinode.md) guide to deploy SGLang with 3FS across multiple nodes for shared KV caching.
diff --git a/sglang/python/sglang/srt/mem_cache/storage/hf3fs/docs/deploy_sglang_3fs_multinode.md b/sglang/python/sglang/srt/mem_cache/storage/hf3fs/docs/deploy_sglang_3fs_multinode.md
new file mode 100644
index 0000000000000000000000000000000000000000..889f9ad85d55881ff32bc355711887faea1a7b6a
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/storage/hf3fs/docs/deploy_sglang_3fs_multinode.md
@@ -0,0 +1,65 @@
+# 1. Startup 3fs metadata service
+```bash
+nohup python3 -m sglang.srt.mem_cache.storage.hf3fs.mini_3fs_metadata_server > meta.out &
+```
+
+
+# 2. Startup sglang engine
+## HF3fs configures
+```bash
+vim /sgl-workspace/sglang/benchmark/hf3fs/hf3fs_config.json
+{
+    "file_path_prefix": "/data/hicache",
+    "file_size": 1099511627776,
+    "numjobs": 16,
+    "entries": 8,
+    "metadata_server_url": "http://metaServerIp:18000"
+}
+```
+
+## node1
+```bash
+export SGLANG_HICACHE_HF3FS_CONFIG_PATH=/sgl-workspace/sglang/benchmark/hf3fs/hf3fs_config.json
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages
+rm -rf instance1.out && \
+nohup python3 -m sglang.launch_server \
+    --model-path /code/models/Qwen3-32B/ \
+    --host 0.0.0.0 --port 10000 \
+    --page-size 64 \
+    --enable-hierarchical-cache \
+    --hicache-ratio 2 --hicache-size 0 \
+    --hicache-write-policy write_through \
+    --hicache-storage-backend hf3fs --tp 2 > instance1.out &
+```
+
+## node2
+```bash
+export SGLANG_HICACHE_HF3FS_CONFIG_PATH=/sgl-workspace/sglang/benchmark/hf3fs/hf3fs_config.json
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages
+rm -rf instance2.out && \
+nohup python3 -m sglang.launch_server \
+    --model-path /code/models/Qwen3-32B/ \
+    --host 0.0.0.0 --port 10000 \
+    --page-size 64 \
+    --enable-hierarchical-cache \
+    --hicache-ratio 2 --hicache-size 0 \
+    --hicache-write-policy write_through \
+    --hicache-storage-backend hf3fs --tp 2 > instance2.out &
+```
+
+# 3. Startup router
+```bash
+rm -rf router.out && \
+nohup python -m sglang_router.launch_router --worker-urls http://node1:10000 http://node2:10000 > router.out &
+```
+
+# 4. Startup multiturn benchmark
+```bash
+rm -rf bench_multiturn.out && \
+nohup python3 benchmark/hicache/bench_multiturn.py \
+    --model-path /code/models/Qwen3-32B \
+    --dataset-path /code/models/ShareGPT_V3_unfiltered_cleaned_split.json \
+    --port 30000 \
+    --request-length 2048 --num-clients 512 --num-rounds 5 --max-parallel 8 \
+    > bench_multiturn.out &
+```
diff --git a/sglang/python/sglang/srt/mem_cache/storage/hf3fs/docs/setup_usrbio_client.md b/sglang/python/sglang/srt/mem_cache/storage/hf3fs/docs/setup_usrbio_client.md
new file mode 100644
index 0000000000000000000000000000000000000000..7c7c0bfb26453332bf1d048d3888859e028cd858
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/storage/hf3fs/docs/setup_usrbio_client.md
@@ -0,0 +1,68 @@
+# HiCacheHF3FS Setup
+
+## Build & Package
+### Source Code
+https://github.com/deepseek-ai/3FS/blob/main/README.md#check-out-source-code
+```sh
+git clone https://github.com/deepseek-ai/3fs
+
+cd 3fs
+git submodule update --init --recursive
+./patches/apply.sh
+```
+
+### Build Dev Container
+https://github.com/deepseek-ai/3FS/blob/main/dockerfile/dev.dockerfile
+```sh
+cd 3fs/dockerfile
+docker build -t hf3fs:dev -f dev.dockerfile .
+```
+
+### Generate Python Wheel
+```sh
+docker run -it hf3fs:dev bash
+
+# Inside the development container
+git clone https://github.com/deepseek-ai/3fs
+
+cd 3fs
+git submodule update --init --recursive
+./patches/apply.sh
+
+apt-get update \
+&& apt-get install -y --no-install-recommends \
+python3 python3-pip \
+&& apt-get clean \
+&& rm -rf /var/lib/apt/lists/*
+# apt install python3.12 python3.12-venv python3.12-dev
+# curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
+# python3.12 get-pip.py
+
+# Generated wheel location: dist/hf3fs_py_usrbio-1.2.9+2db69ce-cp310-cp310-linux_x86_64.whl
+python3 setup.py bdist_wheel
+```
+
+## Installation
+```sh
+# Install Dependencies
+# https://github.com/deepseek-ai/3FS/blob/main/dockerfile/dev.dockerfile
+apt update && apt install -y                            \
+  libaio-dev                                            \
+  libboost-all-dev                                      \
+  libdouble-conversion-dev                              \
+  libdwarf-dev                                          \
+  libgflags-dev                                         \
+  libgmock-dev                                          \
+  libgoogle-glog-dev                                    \
+  libgoogle-perftools-dev                               \
+  libgtest-dev                                          \
+  liblz4-dev                                            \
+  liblzma-dev                                           \
+  libssl-dev                                            \
+  libunwind-dev                                         \
+  libuv1-dev
+
+# Install Python Package
+pip install hf3fs_py_usrbio-1.2.9+394583d-cp312-cp312-linux_x86_64.whl
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/python3.12/dist-packages
+```
diff --git a/sglang/python/sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py b/sglang/python/sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py
new file mode 100644
index 0000000000000000000000000000000000000000..d789a20534891163249fef6f51fa67d243e37c7a
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py
@@ -0,0 +1,163 @@
+import logging
+import os
+from abc import ABC, abstractmethod
+from typing import List
+
+import torch
+
+
+class Hf3fsClient(ABC):
+    """Abstract interface for HF3FS clients."""
+
+    @abstractmethod
+    def __init__(self, path: str, size: int, bytes_per_page: int, entries: int):
+        """Initialize the HF3FS client.
+
+        Args:
+            path: File path for storage
+            size: Total size of storage file
+            bytes_per_page: Bytes per page
+            entries: Number of entries for batch operations
+        """
+        pass
+
+    @abstractmethod
+    def batch_read(self, offsets: List[int], tensors: List[torch.Tensor]) -> List[int]:
+        """Batch read from storage."""
+        pass
+
+    @abstractmethod
+    def batch_write(self, offsets: List[int], tensors: List[torch.Tensor]) -> List[int]:
+        """Batch write to storage."""
+        pass
+
+    @abstractmethod
+    def check(self, offsets: List[int], tensors: List[torch.Tensor]) -> None:
+        """Validate batch operation parameters."""
+        pass
+
+    @abstractmethod
+    def get_size(self) -> int:
+        """Get total storage size."""
+        pass
+
+    @abstractmethod
+    def close(self) -> None:
+        """Close the client and cleanup resources."""
+        pass
+
+    @abstractmethod
+    def flush(self) -> None:
+        """Flush data to disk."""
+        pass
+
+
+logger = logging.getLogger(__name__)
+
+
+class Hf3fsMockClient(Hf3fsClient):
+    """Mock implementation of Hf3fsClient for CI testing purposes."""
+
+    def __init__(self, path: str, size: int, bytes_per_page: int, entries: int):
+        """Initialize mock HF3FS client."""
+        self.path = path
+        self.size = size
+        self.bytes_per_page = bytes_per_page
+        self.entries = entries
+
+        # Create directory if it doesn't exist
+        os.makedirs(os.path.dirname(self.path), exist_ok=True)
+
+        # Create and initialize the file
+        self.file = os.open(self.path, os.O_RDWR | os.O_CREAT)
+        os.ftruncate(self.file, size)
+
+        logger.info(
+            f"Hf3fsMockClient initialized: path={path}, size={size}, "
+            f"bytes_per_page={bytes_per_page}, entries={entries}"
+        )
+
+    def batch_read(self, offsets: List[int], tensors: List[torch.Tensor]) -> List[int]:
+        """Batch read from mock storage."""
+        self.check(offsets, tensors)
+
+        results = []
+
+        for offset, tensor in zip(offsets, tensors):
+            size = tensor.numel() * tensor.itemsize
+
+            try:
+                os.lseek(self.file, offset, os.SEEK_SET)
+                bytes_read = os.read(self.file, size)
+
+                if len(bytes_read) == size:
+                    # Convert bytes to tensor and copy to target
+                    bytes_tensor = torch.frombuffer(bytes_read, dtype=torch.uint8)
+                    typed_tensor = bytes_tensor.view(tensor.dtype).view(tensor.shape)
+                    tensor.copy_(typed_tensor)
+                    results.append(size)
+                else:
+                    logger.warning(
+                        f"Short read: expected {size}, got {len(bytes_read)}"
+                    )
+                    results.append(len(bytes_read))
+
+            except Exception as e:
+                logger.error(f"Error reading from offset {offset}: {e}")
+                results.append(0)
+
+        return results
+
+    def batch_write(self, offsets: List[int], tensors: List[torch.Tensor]) -> List[int]:
+        """Batch write to mock storage."""
+        self.check(offsets, tensors)
+
+        results = []
+
+        for offset, tensor in zip(offsets, tensors):
+            size = tensor.numel() * tensor.itemsize
+
+            try:
+                # Convert tensor to bytes and write directly to file
+                tensor_bytes = tensor.contiguous().view(torch.uint8).flatten()
+                data = tensor_bytes.numpy().tobytes()
+
+                os.lseek(self.file, offset, os.SEEK_SET)
+                bytes_written = os.write(self.file, data)
+
+                if bytes_written == size:
+                    results.append(size)
+                else:
+                    logger.warning(f"Short write: expected {size}, got {bytes_written}")
+                    results.append(bytes_written)
+
+            except Exception as e:
+                logger.error(f"Error writing to offset {offset}: {e}")
+                results.append(0)
+
+        return results
+
+    def check(self, offsets: List[int], tensors: List[torch.Tensor]) -> None:
+        """Validate batch operation parameters."""
+        pass
+
+    def get_size(self) -> int:
+        """Get total storage size."""
+        return self.size
+
+    def close(self) -> None:
+        """Close the mock client and cleanup resources."""
+        try:
+            if hasattr(self, "file") and self.file >= 0:
+                os.close(self.file)
+                self.file = -1  # Mark as closed
+            logger.info(f"MockHf3fsClient closed: {self.path}")
+        except Exception as e:
+            logger.error(f"Error closing MockHf3fsClient: {e}")
+
+    def flush(self) -> None:
+        """Flush data to disk."""
+        try:
+            os.fsync(self.file)
+        except Exception as e:
+            logger.error(f"Error flushing MockHf3fsClient: {e}")
diff --git a/sglang/python/sglang/srt/mem_cache/storage/hf3fs/hf3fs_usrbio_client.py b/sglang/python/sglang/srt/mem_cache/storage/hf3fs/hf3fs_usrbio_client.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e4e686c38fa369e77c420ba8b297391a19cd7c2
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/storage/hf3fs/hf3fs_usrbio_client.py
@@ -0,0 +1,220 @@
+import datetime
+import logging
+import multiprocessing
+import os
+import threading
+from functools import wraps
+from pathlib import Path
+from typing import List
+
+import torch
+from torch.utils.cpp_extension import load
+
+from sglang.srt.mem_cache.storage.hf3fs.hf3fs_client import Hf3fsClient
+
+root = Path(__file__).parent.resolve()
+hf3fs_utils = load(name="hf3fs_utils", sources=[f"{root}/hf3fs_utils.cpp"])
+
+logger = logging.getLogger(__name__)
+
+HF3FS_AVAILABLE = True
+try:
+    from hf3fs_fuse.io import (
+        deregister_fd,
+        extract_mount_point,
+        make_ioring,
+        make_iovec,
+        register_fd,
+    )
+except ImportError:
+    HF3FS_AVAILABLE = False
+
+
+def rsynchronized():
+    def _decorator(func):
+        @wraps(func)
+        def wrapper(self, *args, **kwargs):
+            with self.rlock:
+                return func(self, *args, **kwargs)
+
+        return wrapper
+
+    return _decorator
+
+
+def wsynchronized():
+    def _decorator(func):
+        @wraps(func)
+        def wrapper(self, *args, **kwargs):
+            with self.wlock:
+                return func(self, *args, **kwargs)
+
+        return wrapper
+
+    return _decorator
+
+
+class Hf3fsUsrBioClient(Hf3fsClient):
+    """HF3FS client implementation using usrbio."""
+
+    def __init__(
+        self,
+        path: str,
+        size: int,
+        bytes_per_page: int,
+        entries: int,
+        client_timeout: int,
+    ):
+        if not HF3FS_AVAILABLE:
+            raise ImportError(
+                "hf3fs_fuse.io is not available. Please install the hf3fs_fuse package."
+            )
+
+        self.path = path
+        self.size = size
+        self.bytes_per_page = bytes_per_page
+        self.entries = entries
+        self.client_timeout = client_timeout
+
+        self.file = os.open(self.path, os.O_RDWR | os.O_CREAT)
+        os.ftruncate(self.file, size)
+        register_fd(self.file)
+
+        self.hf3fs_mount_point = extract_mount_point(path)
+        self.bs = self.bytes_per_page
+        self.shm_r = multiprocessing.shared_memory.SharedMemory(
+            size=self.bs * self.entries, create=True
+        )
+        self.shm_w = multiprocessing.shared_memory.SharedMemory(
+            size=self.bs * self.entries, create=True
+        )
+
+        self.shm_r_tensor = torch.frombuffer(self.shm_r.buf, dtype=torch.uint8)
+        self.shm_w_tensor = torch.frombuffer(self.shm_w.buf, dtype=torch.uint8)
+
+        self.numa = -1
+        self.ior_r = make_ioring(
+            self.hf3fs_mount_point,
+            self.entries,
+            for_read=True,
+            timeout=1,
+            numa=self.numa,
+        )
+        self.ior_w = make_ioring(
+            self.hf3fs_mount_point,
+            self.entries,
+            for_read=False,
+            timeout=1,
+            numa=self.numa,
+        )
+        self.iov_r = make_iovec(self.shm_r, self.hf3fs_mount_point)
+        self.iov_w = make_iovec(self.shm_w, self.hf3fs_mount_point)
+        self.shm_r.unlink()
+        self.shm_w.unlink()
+
+        self.rlock = threading.RLock()
+        self.wlock = threading.RLock()
+
+    @rsynchronized()
+    def batch_read(self, offsets: List[int], tensors: List[torch.Tensor]) -> List[int]:
+        self.check(offsets, tensors)
+        results = [0] * len(offsets)
+        # prepare
+        current = 0
+        for offset, tensor in zip(offsets, tensors):
+            size = tensor.numel() * tensor.itemsize
+            try:
+                self.ior_r.prepare(
+                    self.iov_r[current : current + size], True, self.file, offset
+                )
+                current += size
+            except Exception as e:
+                logger.error(f"Error preparing batch read: {e}")
+                return results
+        # submit
+        ionum = len(offsets)
+        try:
+            resv = self.ior_r.submit().wait(
+                min_results=ionum,
+                timeout=datetime.timedelta(seconds=self.client_timeout),
+            )
+        except Exception as e:
+            logger.error(f"Error submitting batch read: {e}")
+            return results
+        # results
+        try:
+            hf3fs_utils.read_shm(self.shm_r_tensor, tensors)
+            results = [res.result for res in resv]
+        except Exception as e:
+            logger.error(f"[Hf3fsUsrBioClient] read_shm failed: {e}", exc_info=True)
+            return results
+
+        return results
+
+    @wsynchronized()
+    def batch_write(self, offsets: List[int], tensors: List[torch.Tensor]) -> List[int]:
+        self.check(offsets, tensors)
+        results = [0] * len(offsets)
+        # prepare
+        hf3fs_utils.write_shm(tensors, self.shm_w_tensor)
+        current = 0
+        for offset, tensor in zip(offsets, tensors):
+            size = tensor.numel() * tensor.itemsize
+            try:
+                self.ior_w.prepare(
+                    self.iov_w[current : current + size], False, self.file, offset
+                )
+                current += size
+            except Exception as e:
+                logger.error(f"Error preparing batch write: {e}")
+                return results
+
+        # submit
+        ionum = len(offsets)
+        try:
+            resv = self.ior_w.submit().wait(
+                min_results=ionum,
+                timeout=datetime.timedelta(seconds=self.client_timeout),
+            )
+        except Exception as e:
+            logger.error(f"Error submitting batch write: {e}")
+            return results
+
+        # results
+        results = [res.result for res in resv]
+
+        return results
+
+    def check(self, offsets: List[int], tensors: List[torch.Tensor]) -> None:
+        sizes = [t.numel() * t.itemsize for t in tensors]
+        if any(
+            [
+                len(offsets) > self.entries,
+                len(offsets) != len(sizes),
+                all(
+                    [
+                        offset < 0 or offset + size > self.size
+                        for offset, size in zip(offsets, sizes)
+                    ]
+                ),
+                all([size > self.bytes_per_page for size in sizes]),
+            ]
+        ):
+            self.close()
+            raise ValueError(f"Hf3fsClient.check: {offsets=}, {sizes=}")
+
+    def get_size(self) -> int:
+        return self.size
+
+    def close(self) -> None:
+        deregister_fd(self.file)
+        os.close(self.file)
+        del self.ior_r
+        del self.ior_w
+        del self.iov_r
+        del self.iov_w
+        self.shm_r.close()
+        self.shm_w.close()
+
+    def flush(self) -> None:
+        os.fsync(self.file)
diff --git a/sglang/python/sglang/srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp b/sglang/python/sglang/srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3a4b7dcc09446c2f0bbdd12d27f508d4c33e5999
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp
@@ -0,0 +1,35 @@
+#include <torch/extension.h>
+
+#include <cstring>
+#include <vector>
+
+void read_shm(const torch::Tensor &shm, std::vector<torch::Tensor> dst) {
+  py::gil_scoped_release release;
+  char *src_ptr = static_cast<char *>(shm.data_ptr());
+  size_t current = 0;
+  for (size_t i = 0; i < dst.size(); ++i) {
+    auto &t = dst[i];
+    size_t t_bytes = t.numel() * t.element_size();
+    char *dst_ptr = static_cast<char *>(t.data_ptr());
+    std::memcpy(dst_ptr, src_ptr + current, t_bytes);
+    current += t_bytes;
+  }
+}
+
+void write_shm(const std::vector<torch::Tensor> src, torch::Tensor &shm) {
+  py::gil_scoped_release release;
+  char *dst_ptr = static_cast<char *>(shm.data_ptr());
+  size_t current = 0;
+  for (size_t i = 0; i < src.size(); ++i) {
+    auto &t = src[i];
+    size_t t_bytes = t.numel() * t.element_size();
+    char *src_ptr = static_cast<char *>(t.data_ptr());
+    std::memcpy(dst_ptr + current, src_ptr, t_bytes);
+    current += t_bytes;
+  }
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("read_shm", &read_shm, "Read tensors from shared memory");
+  m.def("write_shm", &write_shm, "Write tensors to shared memory");
+}
diff --git a/sglang/python/sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py b/sglang/python/sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py
new file mode 100644
index 0000000000000000000000000000000000000000..03fec2080dfabbecef263d92413e8bb78944b89f
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py
@@ -0,0 +1,471 @@
+import argparse
+import atexit
+import json
+import logging
+import threading
+from collections import OrderedDict
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+import orjson
+import requests
+from fastapi import FastAPI, HTTPException, Request, Response
+from fastapi.responses import ORJSONResponse
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+
+from sglang.srt.mem_cache.storage.hf3fs.storage_hf3fs import Hf3fsMetadataInterface
+
+# --- Configuration ---
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+
+
+# --- Data Models ---
+class RankMetadata:
+    """Holds all metadata for a single rank."""
+
+    def __init__(self, num_pages: int):
+        self.lock = threading.Lock()
+        self.num_pages = num_pages
+        self.free_pages: List[int] = list(range(num_pages))
+        self.key_to_index: OrderedDict[str, int] = OrderedDict()
+        # Todo: Support multi files for HF3FS
+
+    def exists_keys(self, keys: List[str]) -> List[bool]:
+        """Check if keys exist in metadata."""
+        with self.lock:
+            return [key in self.key_to_index for key in keys]
+
+    def reserve_and_allocate_page_indices(
+        self, keys: List[Tuple[str, str]]
+    ) -> List[Tuple[bool, int]]:
+        """Reserve and allocate page indices for keys."""
+        with self.lock:
+            results = [None] * len(keys)
+            new_keys_to_process = []
+
+            for i, (key, prefix_key) in enumerate(keys):
+                if key in self.key_to_index:
+                    results[i] = (True, self.key_to_index[key])
+                    self.key_to_index.move_to_end(key)
+                else:
+                    new_keys_to_process.append((i, key, prefix_key))
+
+            # Todo: Implementing data eviction logic after HiCache supports prefix information pass-through
+            for i, key, prefix_key in new_keys_to_process:
+                if len(self.free_pages) > 0:
+                    page_index = self.free_pages.pop()
+                else:
+                    page_index = self.key_to_index.popitem(last=False)[1]
+
+                results[i] = (False, page_index)
+
+            return results
+
+    def confirm_write(
+        self,
+        written_keys_to_confirm: List[Tuple[str, int]],
+        pages_to_release: List[int],
+    ) -> None:
+        """Confirm write operations and release pages."""
+        with self.lock:
+            for key, page_index in written_keys_to_confirm:
+                self.key_to_index[key] = page_index
+                self.key_to_index.move_to_end(key)
+
+            for page_index in pages_to_release:
+                if page_index not in self.free_pages:
+                    self.free_pages.append(page_index)
+
+    def delete_keys(self, keys: List[str]) -> int:
+        """Delete keys and return count of deleted keys."""
+        with self.lock:
+            count = 0
+            for key in keys:
+                if key in self.key_to_index:
+                    page_index = self.key_to_index.pop(key)
+                    if page_index not in self.free_pages:
+                        self.free_pages.append(page_index)
+                    count += 1
+            return count
+
+    def clear_all(self) -> None:
+        """Clear all metadata."""
+        with self.lock:
+            self.free_pages = list(range(self.num_pages))
+            self.key_to_index.clear()
+
+    def get_page_indices(self, keys: List[str]) -> List[Optional[int]]:
+        """Get page indices for keys."""
+        with self.lock:
+            results = []
+            for key in keys:
+                if key in self.key_to_index:
+                    results.append(self.key_to_index[key])
+                    self.key_to_index.move_to_end(key)
+                else:
+                    results.append(None)
+            return results
+
+
+class GlobalMetadataState:
+    """Manages the state for all ranks and persistence."""
+
+    def __init__(self, persistence_path: Optional[str], save_interval: int):
+        self.global_lock = threading.RLock()
+        self.ranks: Dict[int, RankMetadata] = {}
+        self.persistence_path = Path(persistence_path) if persistence_path else None
+        self.save_interval = save_interval
+        self.save_timer: Optional[threading.Timer] = None
+        self.is_shutting_down = False
+
+    def load_from_disk(self):
+        if not self.persistence_path or not self.persistence_path.exists():
+            logging.info("Persistence file not found. Starting with a clean state.")
+            return
+
+        logging.info(f"Loading state from {self.persistence_path}")
+        try:
+            with open(self.persistence_path, "r") as f:
+                persisted_data = json.load(f)
+
+            with self.global_lock:
+                for rank_id_str, data in persisted_data.items():
+                    rank_id = int(rank_id_str)
+                    num_pages = data["num_pages"]
+                    rank_meta = RankMetadata(num_pages)
+                    rank_meta.free_pages = data["free_pages"]
+                    rank_meta.key_to_index = OrderedDict(data["key_to_index"])
+                    self.ranks[rank_id] = rank_meta
+                logging.info(
+                    f"Successfully loaded metadata for {len(self.ranks)} ranks."
+                )
+        except (json.JSONDecodeError, KeyError, TypeError) as e:
+            logging.error(
+                f"Failed to load or parse persistence file: {e}. Starting fresh.",
+                exc_info=True,
+            )
+            self.ranks.clear()
+
+    def save_to_disk(self):
+        if not self.persistence_path:
+            return
+
+        logging.info("Persisting metadata to disk...")
+        with self.global_lock:
+            serializable_state = {}
+            for rank_id, rank_meta in self.ranks.items():
+                with rank_meta.lock:
+                    serializable_state[rank_id] = {
+                        "num_pages": rank_meta.num_pages,
+                        "free_pages": rank_meta.free_pages,
+                        "key_to_index": list(rank_meta.key_to_index.items()),
+                    }
+
+        try:
+            temp_path = self.persistence_path.with_suffix(".tmp")
+            with open(temp_path, "w") as f:
+                json.dump(serializable_state, f, indent=4)
+            temp_path.rename(self.persistence_path)
+            logging.info(f"Metadata successfully persisted to {self.persistence_path}")
+        except Exception as e:
+            logging.error(f"Failed to save metadata to disk: {e}", exc_info=True)
+
+    def schedule_save(self):
+        if self.is_shutting_down or not self.persistence_path:
+            return
+        self.save_to_disk()
+        self.save_timer = threading.Timer(self.save_interval, self.schedule_save)
+        self.save_timer.start()
+
+    def shutdown(self):
+        logging.info("Shutting down metadata server...")
+        self.is_shutting_down = True
+        if self.save_timer:
+            self.save_timer.cancel()
+        self.save_to_disk()
+        logging.info("Shutdown complete.")
+
+
+# --- Global MetadataServer implementation ---
+class Hf3fsMetadataServer:
+    """HF3FS Metadata Server that manages metadata for multiple ranks."""
+
+    def __init__(self, persistence_path: Optional[str] = None, save_interval: int = 60):
+        self.state = GlobalMetadataState(persistence_path, save_interval)
+        self.app = FastAPI(default_response_class=ORJSONResponse)
+
+        self._setup_routes()
+
+    def _setup_routes(self):
+        """Setup FastAPI routes."""
+        self.app.post("/{rank}/initialize")(self.initialize)
+        self.app.post("/{rank}/exists")(self.exists)
+        self.app.post("/{rank}/reserve_and_allocate_page_indices")(
+            self.reserve_and_allocate_page_indices
+        )
+        self.app.post("/{rank}/confirm_write")(self.confirm_write)
+        self.app.post("/{rank}/delete_keys")(self.delete_keys)
+        self.app.post("/{rank}/clear")(self.clear)
+        self.app.post("/{rank}/get_page_indices")(self.get_page_indices)
+
+    def get_rank_metadata(self, rank: int) -> RankMetadata:
+        """Get rank metadata with proper error handling."""
+        if rank not in self.state.ranks:
+            raise HTTPException(
+                status_code=404,
+                detail=f"Rank {rank} not initialized. Please call /{rank}/initialize first.",
+            )
+        return self.state.ranks[rank]
+
+    async def _read_json(self, request: Request) -> dict:
+        """Parse request JSON using orjson if available."""
+        body = await request.body()
+        return orjson.loads(body)
+
+    def _json_response(self, content: dict):
+        """Return ORJSONResponse when available to bypass jsonable_encoder."""
+        return ORJSONResponse(content)
+
+    async def initialize(self, rank: int, request: Request):
+        """Initialize a rank with specified number of pages."""
+        data = await self._read_json(request)
+        num_pages = data["num_pages"]
+        with self.state.global_lock:
+            if rank in self.state.ranks:
+                logging.info(
+                    f"Rank {rank} already exists. Initialization request ignored."
+                )
+                if self.state.ranks[rank].num_pages != num_pages:
+                    logging.warning(
+                        f"Rank {rank} initialized with different num_pages. Existing: {self.state.ranks[rank].num_pages}, New: {num_pages}"
+                    )
+            else:
+                logging.info(f"Initializing new Rank {rank} with {num_pages} pages.")
+                self.state.ranks[rank] = RankMetadata(num_pages)
+        return Response(status_code=204)
+
+    async def exists(self, rank: int, request: Request):
+        """Check if keys exist in metadata."""
+        data = await self._read_json(request)
+        keys = data["keys"]
+        metadata = self.get_rank_metadata(rank)
+        results = metadata.exists_keys(keys)
+        return self._json_response({"exists": results})
+
+    async def reserve_and_allocate_page_indices(self, rank: int, request: Request):
+        """Reserve and allocate page indices for keys."""
+        data = await self._read_json(request)
+        metadata = self.get_rank_metadata(rank)
+        keys = data["keys"]
+        results = metadata.reserve_and_allocate_page_indices(keys)
+        return self._json_response({"indices": results})
+
+    async def confirm_write(self, rank: int, request: Request):
+        """Confirm write operations and release pages."""
+        data = await self._read_json(request)
+        metadata = self.get_rank_metadata(rank)
+        success_written_keys = data.get("written_keys_to_confirm", [])
+        released_pages = data.get("pages_to_release", [])
+
+        metadata.confirm_write(success_written_keys, released_pages)
+
+        return Response(status_code=204)
+
+    async def delete_keys(self, rank: int, request: Request):
+        """Delete keys from metadata."""
+        data = await self._read_json(request)
+        metadata = self.get_rank_metadata(rank)
+        count = metadata.delete_keys(data["keys"])
+        return Response(status_code=204)
+
+    async def clear(self, rank: int):
+        """Clear all metadata for a rank."""
+        metadata = self.get_rank_metadata(rank)
+        metadata.clear_all()
+        return Response(status_code=204)
+
+    async def get_page_indices(self, rank: int, request: Request):
+        """Get page indices for keys."""
+        data = await self._read_json(request)
+        metadata = self.get_rank_metadata(rank)
+        keys = data["keys"]
+        results = metadata.get_page_indices(keys)
+        return self._json_response({"indices": results})
+
+    def run(self, host: str = "0.0.0.0", port: int = 18000):
+        """Run the metadata server."""
+        self.state.load_from_disk()
+        if self.state.persistence_path:
+            self.state.schedule_save()
+            atexit.register(self.state.shutdown)
+
+        import uvicorn
+
+        logging.info(f"Starting metadata server on http://{host}:{port}")
+        if self.state.persistence_path:
+            logging.info(
+                f"Persistence is ENABLED. Saving to '{self.state.persistence_path}' every {self.state.save_interval} seconds."
+            )
+        else:
+            logging.info("Persistence is DISABLED.")
+
+        uvicorn.run(self.app, host=host, port=port)
+
+
+# --- Client implementation ---
+class Hf3fsGlobalMetadataClient(Hf3fsMetadataInterface):
+    """Global http metadata client for HF3FS."""
+
+    def __init__(self, base_url: str, max_retries: int = 3):
+        self.base_url = base_url.rstrip("/")
+        self._session = requests.Session()
+
+        retry_strategy = Retry(
+            total=max_retries,
+            backoff_factor=0.3,
+            status_forcelist=[500, 502, 503, 504],
+            allowed_methods=["GET", "POST"],
+        )
+        adapter = HTTPAdapter(
+            max_retries=retry_strategy, pool_connections=256, pool_maxsize=256
+        )
+        self._session.mount("http://", adapter)
+
+    def _post(self, endpoint: str, json_data: dict) -> dict:
+        try:
+            url = f"{self.base_url}/{endpoint}"
+            headers = {"Content-Type": "application/json"}
+            payload = orjson.dumps(json_data)  # type: ignore[union-attr]
+            response = self._session.post(url, data=payload, headers=headers)
+            response.raise_for_status()
+
+            if response.status_code == 204 or not response.content:
+                return {}
+            return orjson.loads(response.content)  # type: ignore[union-attr]
+        except requests.exceptions.RequestException as e:
+            logging.error(f"Failed to POST to {endpoint} after retries: {e}")
+            raise RuntimeError(f"Failed to connect to metadata server: {e}") from e
+
+    def initialize(self, rank: int, num_pages: int) -> None:
+        self._post(f"{rank}/initialize", {"num_pages": num_pages})
+
+    def reserve_and_allocate_page_indices(
+        self, rank: int, keys: List[Tuple[str, str]]
+    ) -> List[Tuple[bool, int]]:
+        response = self._post(
+            f"{rank}/reserve_and_allocate_page_indices", {"keys": keys}
+        )
+        return [tuple(item) for item in response.get("indices")]
+
+    def confirm_write(
+        self,
+        rank: int,
+        written_keys_to_confirm: List[Tuple[str, int]],
+        pages_to_release: List[int],
+    ) -> None:
+        self._post(
+            f"{rank}/confirm_write",
+            {
+                "written_keys_to_confirm": written_keys_to_confirm,
+                "pages_to_release": pages_to_release,
+            },
+        )
+
+    def delete_keys(self, rank: int, keys: List[str]) -> None:
+        self._post(f"{rank}/delete_keys", {"keys": keys})
+
+    def exists(self, rank: int, keys: List[str]) -> List[bool]:
+        response = self._post(f"{rank}/exists", {"keys": keys})
+        return response.get("exists", [])
+
+    def clear(self, rank: int) -> None:
+        self._post(f"{rank}/clear", {})
+
+    def get_page_indices(self, rank: int, keys: List[str]) -> List[Optional[int]]:
+        response = self._post(f"{rank}/get_page_indices", {"keys": keys})
+        return response.get("indices")
+
+
+class Hf3fsLocalMetadataClient(Hf3fsMetadataInterface):
+    """Local metadata client that directly operates on single RankMetadata in memory without metadata server."""
+
+    def __init__(self):
+        self.rank_metadata = None
+
+    def initialize(self, rank: int, num_pages: int) -> None:
+        self.rank_metadata = RankMetadata(num_pages)
+
+    def reserve_and_allocate_page_indices(
+        self, rank: int, keys: List[Tuple[str, str]]
+    ) -> List[Tuple[bool, int]]:
+        """Reserve and allocate page indices for keys."""
+        return self.rank_metadata.reserve_and_allocate_page_indices(keys)
+
+    def confirm_write(
+        self,
+        rank: int,
+        written_keys_to_confirm: List[Tuple[str, int]],
+        pages_to_release: List[int],
+    ) -> None:
+        """Confirm write operations."""
+        self.rank_metadata.confirm_write(written_keys_to_confirm, pages_to_release)
+
+    def delete_keys(self, rank: int, keys: List[str]) -> None:
+        """Delete keys."""
+        self.rank_metadata.delete_keys(keys)
+
+    def exists(self, rank: int, keys: List[str]) -> List[bool]:
+        """Check if keys exist."""
+        return self.rank_metadata.exists_keys(keys)
+
+    def clear(self, rank: int) -> None:
+        """Clear all metadata for rank."""
+        self.rank_metadata.clear_all()
+
+    def get_page_indices(self, rank: int, keys: List[str]) -> List[Optional[int]]:
+        """Get page indices for keys."""
+        return self.rank_metadata.get_page_indices(keys)
+
+
+def run_metadata_server(
+    host: str = "0.0.0.0",
+    port: int = 18000,
+    persistence_path: Optional[str] = None,
+    save_interval: int = 60,
+):
+    """Run the HF3FS metadata server."""
+    global server
+    server = Hf3fsMetadataServer(
+        persistence_path=persistence_path, save_interval=save_interval
+    )
+
+    server.run(host=host, port=port)
+
+
+# --- Main Execution ---
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="HF3FS Metadata Server")
+    parser.add_argument(
+        "--host", type=str, default="0.0.0.0", help="Host to bind the server to."
+    )
+    parser.add_argument(
+        "--port", type=int, default=18000, help="Port to run the server on."
+    )
+    parser.add_argument(
+        "--persistence-path",
+        type=str,
+        default=None,
+        help="Path to the file for persisting metadata. If not provided, persistence is disabled.",
+    )
+    parser.add_argument(
+        "--save-interval",
+        type=int,
+        default=60,
+        help="Interval in seconds for periodically saving metadata to disk.",
+    )
+    args = parser.parse_args()
+
+    run_metadata_server(args.host, args.port, args.persistence_path, args.save_interval)
diff --git a/sglang/python/sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py b/sglang/python/sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py
new file mode 100644
index 0000000000000000000000000000000000000000..b202968429f35e8eb63f4fb7145808280548af7d
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py
@@ -0,0 +1,664 @@
+import atexit
+import concurrent.futures
+import json
+import logging
+import os
+import signal
+import threading
+import time
+from abc import ABC, abstractmethod
+from functools import wraps
+from typing import Any, List, Optional, Tuple
+
+import torch
+
+from sglang.srt.mem_cache.hicache_storage import (
+    HiCacheStorage,
+    HiCacheStorageConfig,
+    HiCacheStorageExtraInfo,
+)
+from sglang.srt.mem_cache.memory_pool_host import HostKVCache
+from sglang.srt.mem_cache.storage.hf3fs.hf3fs_client import Hf3fsClient
+from sglang.srt.observability.metrics_collector import StorageMetrics
+
+logger = logging.getLogger(__name__)
+
+
+class Hf3fsMetadataInterface(ABC):
+    """Interface for HF3FS metadata operations."""
+
+    @abstractmethod
+    def initialize(self, rank: int, num_pages: int) -> None:
+        """Initialize the metadata service with specified number of pages."""
+        pass
+
+    @abstractmethod
+    def reserve_and_allocate_page_indices(
+        self,
+        rank: int,
+        keys: List[Tuple[str, str]],
+    ) -> List[Tuple[bool, int]]:
+        """
+        Reserve and allocate page indices for the specified keys.
+        Args:
+            rank: The rank of the process.
+            keys: The keys to reserve and allocate page indices for. Each tuple contains a key and the key of its prefix block.
+        Returns:
+            List[Tuple[bool, int]]: A list of tuples, where each tuple contains a boolean indicating whether the key has existed and an integer indicating the allocated page index.
+        """
+        pass
+
+    @abstractmethod
+    def confirm_write(
+        self,
+        rank: int,
+        written_keys_to_confirm: List[Tuple[str, int]],
+        pages_to_release: List[int],
+    ) -> None:
+        """
+        Confirm that key-value pairs have been successfully written to storage.
+        Args:
+            rank: The rank of the process.
+            written_keys_to_confirm: A list of tuples, where each tuple contains a key and its corresponding page index.
+            pages_to_release: A list of page indices to be released.
+        """
+        pass
+
+    @abstractmethod
+    def get_page_indices(self, rank: int, keys: List[str]) -> List[Optional[int]]:
+        """
+        Get page indices for the specified keys.
+        Args:
+            rank: The rank of the process.
+            keys: A list of keys.
+        Returns:
+            List[Optional[int]]: A list of integers representing the page indices for the specified keys.
+                                 If a key is not found, the corresponding index will be None.
+        """
+        pass
+
+    @abstractmethod
+    def delete_keys(self, rank: int, keys: List[str]) -> None:
+        """Delete specified keys and their associated pages."""
+        pass
+
+    @abstractmethod
+    def exists(self, rank: int, keys: List[str]) -> List[bool]:
+        """Check if the specified keys exist."""
+        pass
+
+    @abstractmethod
+    def clear(self, rank: int) -> None:
+        """Clear all key-value pairs and page allocations for the specified rank."""
+        pass
+
+
+class AtomicCounter:
+    def __init__(self, n: int):
+        assert n > 0
+        self.n = n
+        self._value = 0
+        self._lock = threading.Lock()
+
+    def next(self) -> int:
+        with self._lock:
+            current = self._value
+            self._value = (current + 1) % self.n
+            return current
+
+
+def synchronized():
+    def _decorator(func):
+        @wraps(func)
+        def wrapper(self, *args, **kwargs):
+            with self.lock:
+                return func(self, *args, **kwargs)
+
+        return wrapper
+
+    return _decorator
+
+
+def create_hf3fs_client(
+    path: str,
+    size: int,
+    bytes_per_page: int,
+    entries: int,
+    client_timeout: int,
+    use_mock: bool = False,
+) -> Hf3fsClient:
+    """Factory function to create appropriate HF3FS client.
+
+    Args:
+        path: File path for storage
+        size: Total size of storage file
+        bytes_per_page: Bytes per page
+        entries: Number of entries for batch operations
+        use_mock: Whether to use mock client instead of real usrbio client
+
+    Returns:
+    """
+    if use_mock:
+        from sglang.srt.mem_cache.storage.hf3fs.hf3fs_client import Hf3fsMockClient
+
+        logger.info(f"[Rank Using Hf3fsMockClient for testing")
+        return Hf3fsMockClient(path, size, bytes_per_page, entries)
+    else:
+        from sglang.srt.mem_cache.storage.hf3fs.hf3fs_usrbio_client import (
+            Hf3fsUsrBioClient,
+        )
+
+        return Hf3fsUsrBioClient(path, size, bytes_per_page, entries, client_timeout)
+
+
+class HiCacheHF3FS(HiCacheStorage):
+    """HiCache backend that stores KV cache pages in HF3FS files."""
+
+    default_env_var: str = "SGLANG_HICACHE_HF3FS_CONFIG_PATH"
+
+    def __init__(
+        self,
+        rank: int,
+        file_path: str,
+        file_size: int,
+        numjobs: int,
+        bytes_per_page: int,
+        entries: int,
+        client_timeout: int,
+        dtype: torch.dtype,
+        metadata_client: Hf3fsMetadataInterface,
+        is_mla_model: bool = False,
+        is_page_first_layout: bool = False,
+        use_mock_client: bool = False,
+    ):
+        self.rank = rank
+        self.file_path = file_path
+        self.file_size = file_size
+        self.numjobs = numjobs
+        self.bytes_per_page = bytes_per_page
+        self.gb_per_page = bytes_per_page / (1 << 30)
+        self.entries = entries
+        self.client_timeout = client_timeout
+        self.dtype = dtype
+        self.metadata_client = metadata_client
+        self.is_mla_model = is_mla_model
+        self.is_page_first_layout = is_page_first_layout
+        self.numel = self.bytes_per_page // self.dtype.itemsize
+        self.num_pages = self.file_size // self.bytes_per_page
+        self.skip_backup = False
+        if self.is_mla_model and self.rank != 0:
+            self.skip_backup = True
+            self.rank = 0
+
+        self.is_zero_copy = False
+
+        logger.info(
+            f"[Rank {self.rank}] HiCacheHF3FS Client Initializing: "
+            f"file_path={self.file_path}, "
+            f"file_size={self.file_size / (2 ** 30):.2f} GB, "
+            f"num_pages={self.num_pages}, "
+            f"is_mla_model={self.is_mla_model}"
+        )
+
+        self.ac = AtomicCounter(self.numjobs)
+        self.clients = [
+            create_hf3fs_client(
+                self.file_path,
+                self.file_size,
+                self.bytes_per_page,
+                self.entries,
+                self.client_timeout,
+                use_mock_client,
+            )
+            for _ in range(numjobs)
+        ]
+        self.executor = concurrent.futures.ThreadPoolExecutor(
+            max_workers=self.numjobs, thread_name_prefix=f"HiCacheHF3FS-Rank{self.rank}"
+        )
+
+        self.metadata_client.initialize(self.rank, self.num_pages)
+        self.lock = threading.RLock()
+
+        atexit.register(self.close)
+
+        signal.signal(signal.SIGINT, lambda sig, frame: self.close())
+        signal.signal(signal.SIGTERM, lambda sig, frame: self.close())
+        signal.signal(signal.SIGQUIT, lambda sig, frame: self.close())
+
+        self.prefetch_pgs = []
+        self.backup_pgs = []
+        self.prefetch_bandwidth = []
+        self.backup_bandwidth = []
+
+    @staticmethod
+    def from_env_config(
+        bytes_per_page: int,
+        dtype: torch.dtype,
+        storage_config: HiCacheStorageConfig = None,
+    ) -> "HiCacheHF3FS":
+        """Create a HiCacheHF3FS instance from environment configuration.
+
+        Environment:
+            - Uses env var stored in `HiCacheHF3FS.default_env_var` to locate a JSON config.
+            - Falls back to a local single-machine config when the env var is not set.
+
+        Raises:
+            ValueError: If MLA Model is requested without global metadata server or required keys are missing.
+        """
+        from sglang.srt.mem_cache.storage.hf3fs.mini_3fs_metadata_server import (
+            Hf3fsGlobalMetadataClient,
+            Hf3fsLocalMetadataClient,
+        )
+
+        use_mock_client = False
+        if storage_config is not None:
+            rank, is_mla_model, is_page_first_layout = (
+                storage_config.tp_rank,
+                storage_config.is_mla_model,
+                storage_config.is_page_first_layout,
+            )
+
+            if storage_config.extra_config is not None:
+                use_mock_client = storage_config.extra_config.get(
+                    "use_mock_hf3fs_client", False
+                )
+        else:
+            rank, is_mla_model, is_page_first_layout = (
+                0,
+                False,
+                False,
+            )
+
+        mla_unsupported_msg = f"MLA model is not supported without global metadata server, please refer to https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/mem_cache/storage/hf3fs/docs/deploy_sglang_3fs_multinode.md"
+
+        config_path = os.getenv(HiCacheHF3FS.default_env_var)
+        if not config_path:
+            if is_mla_model:
+                raise ValueError(mla_unsupported_msg)
+
+            return HiCacheHF3FS(
+                rank=rank,
+                file_path=f"/data/hicache.{rank}.bin",
+                file_size=1 << 40,
+                numjobs=16,
+                bytes_per_page=bytes_per_page,
+                entries=8,
+                client_timeout=5,
+                dtype=dtype,
+                metadata_client=Hf3fsLocalMetadataClient(),
+                is_page_first_layout=is_page_first_layout,
+                use_mock_client=use_mock_client,
+            )
+
+        try:
+            with open(config_path, "r") as f:
+                config = json.load(f)
+        except Exception as e:
+            raise RuntimeError(f"Failed to load config from {config_path}: {str(e)}")
+
+        # Check required keys (metadata_server_url is now optional)
+        required_keys = {
+            "file_path_prefix",
+            "file_size",
+            "numjobs",
+            "entries",
+        }
+        missing_keys = required_keys - set(config.keys())
+        if missing_keys:
+            raise ValueError(f"Missing required keys in config: {missing_keys}")
+
+        # Choose metadata client based on configuration
+        if config.get("metadata_server_url"):
+            # Use global metadata client to connect to metadata server
+            metadata_server_url = config["metadata_server_url"]
+            metadata_client = Hf3fsGlobalMetadataClient(metadata_server_url)
+
+            logger.info(
+                f"Using global metadata client with server url: {metadata_server_url}"
+            )
+        else:
+            # Enable MLA optimization only when using the global metadata client
+            if is_mla_model:
+                raise ValueError(mla_unsupported_msg)
+
+            # Use local metadata client for single-machine deployment
+            metadata_client = Hf3fsLocalMetadataClient()
+
+        rank_for_path = 0 if is_mla_model else rank
+        return HiCacheHF3FS(
+            rank=rank,
+            # Let all ranks use the same file path for MLA model
+            file_path=f"{config['file_path_prefix']}.{rank_for_path}.bin",
+            file_size=int(config["file_size"]),
+            numjobs=int(config["numjobs"]),
+            bytes_per_page=bytes_per_page,
+            entries=int(config["entries"]),
+            client_timeout=config.get("client_timeout", 5),
+            dtype=dtype,
+            metadata_client=metadata_client,
+            is_mla_model=is_mla_model,
+            is_page_first_layout=is_page_first_layout,
+            use_mock_client=use_mock_client,
+        )
+
+    def _batch_get(
+        self,
+        keys: List[str],
+        values: List[torch.Tensor],
+    ) -> List[bool]:
+        page_indices = self.metadata_client.get_page_indices(self.rank, keys)
+        if len(page_indices) != len(keys):
+            logger.error(
+                f"[Rank {self.rank}] HiCacheHF3FS get: page_indices length {len(page_indices)} mismatch keys length {len(keys)}."
+            )
+            return [False] * len(keys)
+        batch_indices, file_offsets = [], []
+        for i, page_index in enumerate(page_indices):
+            if page_index is not None:
+                batch_indices.append(i)
+                file_offsets.append(page_index * self.bytes_per_page)
+
+        for target_location in values:
+            assert target_location.is_contiguous()
+        file_results = values
+
+        start_time = time.perf_counter()
+
+        futures = [
+            self.executor.submit(
+                self.clients[self.ac.next()].batch_read,
+                file_offsets[i : i + self.entries],
+                file_results[i : i + self.entries],
+            )
+            for i in range(0, len(batch_indices), self.entries)
+        ]
+        read_results = [result for future in futures for result in future.result()]
+
+        end_time = time.perf_counter()
+        ionum = len(batch_indices)
+        self.prefetch_pgs.append(ionum)
+        self.prefetch_bandwidth.append(
+            ionum / (end_time - start_time) * self.gb_per_page
+        )
+
+        results = [False] * len(keys)
+        for batch_index, read_result in zip(batch_indices, read_results):
+            if read_result == self.bytes_per_page:
+                results[batch_index] = True
+            else:
+                logger.error(
+                    f"[Rank {self.rank}] HiCacheHF3FS get {keys[batch_index]} failed"
+                )
+
+        return results
+
+    def _batch_set(
+        self,
+        keys: List[str],
+        values: Optional[Any] = None,
+    ) -> List[bool]:
+        # In MLA backend, only one rank needs to backup the KV cache
+        if self.skip_backup:
+            return True
+
+        # Todo: Add prefix block's hash key
+        key_with_prefix = [(key, "") for key in keys]
+        indices = self.metadata_client.reserve_and_allocate_page_indices(
+            self.rank, key_with_prefix
+        )
+        if len(indices) != len(keys):
+            logger.error(
+                f"[Rank {self.rank}] HiCacheHF3FS batch_get: mismatched lengths {len(indices)} != {len(keys)}"
+            )
+            # free allocated pages
+            if indices:
+                self.metadata_client.confirm_write(
+                    self.rank, [], [index[1] for index in indices]
+                )
+            return [False] * len(keys)
+        batch_indices, file_offsets, file_values = [], [], []
+        pages_to_release = []
+
+        for i, (value, (is_written, page_index)) in enumerate(zip(values, indices)):
+            if is_written or page_index == -1:
+                continue
+
+            batch_indices.append(i)
+            file_offsets.append(page_index * self.bytes_per_page)
+            assert value.is_contiguous()
+            file_values.append(value)
+
+        start_time = time.perf_counter()
+
+        futures = [
+            self.executor.submit(
+                self.clients[self.ac.next()].batch_write,
+                file_offsets[i : i + self.entries],
+                file_values[i : i + self.entries],
+            )
+            for i in range(0, len(batch_indices), self.entries)
+        ]
+        write_results = [
+            result == self.bytes_per_page
+            for future in futures
+            for result in future.result()
+        ]
+
+        end_time = time.perf_counter()
+        ionum = len(batch_indices)
+        self.backup_pgs.append(ionum)
+        self.backup_bandwidth.append(ionum / (end_time - start_time) * self.gb_per_page)
+
+        written_keys_to_confirm = []
+        results = [index[0] for index in indices]
+        for batch_index, write_result in zip(batch_indices, write_results):
+            key = keys[batch_index]
+            page_index = indices[batch_index][1]
+            if write_result:
+                written_keys_to_confirm.append((key, page_index))
+            else:
+                logger.error(f"[Rank {self.rank}] HiCacheHF3FS set {key} failed")
+                pages_to_release.append(page_index)
+            results[batch_index] = write_result
+
+        if len(written_keys_to_confirm) > 0 or len(pages_to_release) > 0:
+            self.metadata_client.confirm_write(
+                self.rank, written_keys_to_confirm, pages_to_release
+            )
+
+        return results
+
+    def delete(self, key: str) -> None:
+        self.metadata_client.delete_keys(self.rank, [key])
+
+    def exists(self, key: str) -> bool:
+        result = self.metadata_client.exists(self.rank, [key])
+        return result[0] if result else False
+
+    def batch_exists(
+        self, keys: List[str], extra_info: Optional[HiCacheStorageExtraInfo] = None
+    ) -> int:
+        factor = 1
+        if self.is_zero_copy and not self.is_mla_model:
+            keys = self._get_mha_zero_copy_keys(keys)
+            factor = 2
+
+        results = self.metadata_client.exists(self.rank, keys)
+
+        i = 0
+        while i < len(keys) and results[i]:
+            i += 1
+
+        return i // factor
+
+    def clear(self) -> None:
+        try:
+            self.metadata_client.clear(self.rank)
+            logger.info(f"Cleared HiCacheHF3FS for rank {self.rank}")
+        except Exception as e:
+            logger.error(f"Failed to clear HiCacheHF3FS: {e}")
+
+    def close(self) -> None:
+        try:
+            for c in self.clients:
+                c.close()
+            self.executor.shutdown(wait=True)
+        except Exception as e:
+            logger.error(f"close HiCacheHF3FS: {e}")
+        logger.info("close HiCacheHF3FS")
+
+    def get_stats(self):
+        storage_metrics = StorageMetrics()
+        storage_metrics.prefetch_pgs.extend(self.prefetch_pgs)
+        storage_metrics.backup_pgs.extend(self.backup_pgs)
+        storage_metrics.prefetch_bandwidth.extend(self.prefetch_bandwidth)
+        storage_metrics.backup_bandwidth.extend(self.backup_bandwidth)
+        self.prefetch_pgs.clear()
+        self.backup_pgs.clear()
+        self.prefetch_bandwidth.clear()
+        self.backup_bandwidth.clear()
+        return storage_metrics
+
+    def register_mem_pool_host(self, mem_pool_host: HostKVCache):
+        super().register_mem_pool_host(mem_pool_host)
+        self.is_zero_copy = self.mem_pool_host.layout in [
+            "page_first",
+            "page_first_direct",
+        ]
+
+        logger.info(f"{self.is_zero_copy=}, layout={self.mem_pool_host.layout}")
+
+    def _get_mha_zero_copy_keys(self, keys: List[str]) -> List[str]:
+        _keys = []
+        for k in keys:
+            _keys.append(f"{k}-k")
+            _keys.append(f"{k}-v")
+        return _keys
+
+    def _get_mha_zero_copy_values(
+        self, values: List[torch.Tensor]
+    ) -> List[torch.Tensor]:
+        _values = []
+        for value in values:
+            _values.append(value[0])
+            _values.append(value[1])
+        return _values
+
+    def _batch_get_preprocess(self, keys, host_indices):
+        page_num = len(host_indices) // self.mem_pool_host.page_size
+        # host_indices to kv_buffer
+        flat = not self.is_zero_copy
+        values = (
+            [
+                self.mem_pool_host.get_data_page(
+                    host_indices[i * self.mem_pool_host.page_size], flat=flat
+                )
+                for i in range(page_num)
+            ]
+            if self.is_zero_copy
+            else [
+                self.mem_pool_host.get_dummy_flat_data_page() for _ in range(page_num)
+            ]
+        )
+
+        if self.is_zero_copy and not self.is_mla_model:
+            keys = self._get_mha_zero_copy_keys(keys)
+            values = self._get_mha_zero_copy_values(values)
+
+        return keys, values
+
+    def _batch_get_postprocess(self, host_indices, values, results):
+        page_num = len(host_indices) // self.mem_pool_host.page_size
+
+        if self.is_zero_copy:
+            if not self.is_mla_model:
+                results = [
+                    (results[2 * i] and results[2 * i + 1]) for i in range(page_num)
+                ]
+                results = results[:page_num]
+            return results
+
+        for i in range(page_num):
+            if not results[i]:
+                break
+            self.mem_pool_host.set_from_flat_data_page(
+                host_indices[i * self.mem_pool_host.page_size], values[i]
+            )
+
+        return results
+
+    def batch_get_v1(
+        self,
+        keys: List[str],
+        host_indices: torch.Tensor,
+        extra_info: Optional[HiCacheStorageExtraInfo] = None,
+    ) -> List[bool]:
+        keys, values = self._batch_get_preprocess(keys, host_indices)
+        results = self._batch_get(keys, values)
+        return self._batch_get_postprocess(host_indices, values, results)
+
+    def _batch_set_preprocess(self, keys, host_indices):
+        page_num = len(host_indices) // self.mem_pool_host.page_size
+        # host_indices to kv_buffer
+        flat = not self.is_zero_copy
+        values = [
+            self.mem_pool_host.get_data_page(
+                host_indices[i * self.mem_pool_host.page_size], flat=flat
+            )
+            for i in range(page_num)
+        ]
+
+        if self.is_zero_copy and not self.is_mla_model:
+            keys = self._get_mha_zero_copy_keys(keys)
+            values = self._get_mha_zero_copy_values(values)
+
+        return keys, values
+
+    def batch_set_v1(
+        self,
+        keys: List[str],
+        host_indices: torch.Tensor,
+        extra_info: Optional[HiCacheStorageExtraInfo] = None,
+    ) -> List[bool]:
+        len_keys = len(keys)
+        keys, values = self._batch_set_preprocess(keys, host_indices)
+        results = self._batch_set(keys, values)
+        return results
+
+    # Deprecated
+    def get(
+        self,
+        key: str,
+        target_location: Optional[Any] = None,
+        target_sizes: Optional[Any] = None,
+    ) -> torch.Tensor | None:
+        pass
+
+    # Deprecated
+    def batch_get(
+        self,
+        keys: List[str],
+        target_locations: Optional[Any] = None,
+        target_sizes: Optional[Any] = None,
+    ) -> List[torch.Tensor | None] | int:
+        pass
+
+    # Deprecated
+    def set(
+        self,
+        key: str,
+        value: Optional[Any] = None,
+        target_location: Optional[Any] = None,
+        target_sizes: Optional[Any] = None,
+    ) -> bool:
+        pass
+
+    # Deprecated
+    def batch_set(
+        self,
+        keys: List[str],
+        values: Optional[Any] = None,
+        target_locations: Optional[Any] = None,
+        target_sizes: Optional[Any] = None,
+    ) -> bool:
+        pass
diff --git a/sglang/python/sglang/srt/mem_cache/storage/hf3fs/test_hf3fs_utils.py b/sglang/python/sglang/srt/mem_cache/storage/hf3fs/test_hf3fs_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..365effdef14a1b30de57c5e650d357aaa0b49771
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/storage/hf3fs/test_hf3fs_utils.py
@@ -0,0 +1,43 @@
+import multiprocessing.shared_memory
+from pathlib import Path
+
+import pytest
+import torch
+from torch.utils.cpp_extension import load
+from tqdm import tqdm
+
+root = Path(__file__).parent.resolve()
+hf3fs_utils = load(
+    name="hf3fs_utils", sources=[f"{root}/hf3fs_utils.cpp"], verbose=True
+)
+
+
+def test_rw_shm():
+    numel = 8 << 20
+    dtype = torch.bfloat16
+    page_num = 128
+    page_bytes = numel * dtype.itemsize
+    shm = multiprocessing.shared_memory.SharedMemory(
+        size=page_num * page_bytes, create=True
+    )
+    tshm = torch.frombuffer(shm.buf, dtype=torch.uint8)
+    a = [
+        torch.randn(numel, dtype=dtype)
+        for _ in tqdm(range(page_num), desc="prepare input")
+    ]
+    b = [
+        torch.empty(numel, dtype=dtype)
+        for _ in tqdm(range(page_num), desc="prepare output")
+    ]
+    hf3fs_utils.write_shm(a, tshm)
+    hf3fs_utils.read_shm(tshm, b)
+    for _a, _b in tqdm(zip(a, b), desc="assert_close"):
+        torch.testing.assert_close(_a, _b)
+
+    del tshm
+    shm.close()
+    shm.unlink()
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/sglang/python/sglang/srt/mem_cache/storage/lmcache/README.md b/sglang/python/sglang/srt/mem_cache/storage/lmcache/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..7177e21e5f550cca76c4faa6ba6721b00bd506ae
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/storage/lmcache/README.md
@@ -0,0 +1,43 @@
+# LMCache Connector for SGLang
+
+This document describes how to use LMCache as KV Cache Management Backend for SGLang engine.
+For more details about LMCache, please refer to: https://lmcache.ai
+
+## Install LMCache
+
+### Method 1: with pip
+
+```bash
+pip install lmcache
+```
+
+### Method 2: from source
+
+Clone LMCache project:
+
+```bash
+git clone https://github.com/LMCache/LMCache
+```
+
+Install:
+
+```bash
+cd LMCache
+pip install -e . --no-build-isolation
+```
+
+
+## Use LMCache
+
+Firstly, setup LMCache config. An example config is set at `example_config.yaml`. For more settings please refer to https://docs.lmcache.ai/api_reference/configurations.html.
+
+Secondly, setup SGLang serving engine with lmcache:
+
+```bash
+export LMCACHE_USE_EXPERIMENTAL=True
+export LMCACHE_CONFIG_FILE=example_config.yaml
+
+python -m sglang.launch_server \
+  --model-path MODEL \
+  --enable-lmcache
+```
diff --git a/sglang/python/sglang/srt/mem_cache/storage/lmcache/example_config.yaml b/sglang/python/sglang/srt/mem_cache/storage/lmcache/example_config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..549110b7cd43590bf3d415134e978a3e32b56b70
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/storage/lmcache/example_config.yaml
@@ -0,0 +1,7 @@
+# Basic configurations
+chunk_size: 256
+
+# CPU offloading configurations
+local_cpu: true
+use_layerwise: true
+max_local_cpu_size: 10 # number of CPU backend GB
diff --git a/sglang/python/sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py b/sglang/python/sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a82aa31f4efaf4e24a4c927746ac7599ea9ccef
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py
@@ -0,0 +1,276 @@
+from __future__ import annotations
+
+import logging
+import threading
+from typing import TYPE_CHECKING, Optional
+
+import torch
+
+from sglang.srt.mem_cache.base_prefix_cache import (
+    EvictParams,
+    EvictResult,
+    MatchPrefixParams,
+    MatchResult,
+)
+from sglang.srt.mem_cache.radix_cache import RadixCache, RadixKey, TreeNode
+
+try:
+    from lmcache.integration.sglang.sglang_adapter import (
+        LMCacheLayerwiseConnector,
+        LoadMetadata,
+        StoreMetadata,
+    )
+except ImportError as e:
+    raise RuntimeError(
+        "LMCache is not installed. Please install it by running `pip install lmcache`"
+    ) from e
+
+
+if TYPE_CHECKING:
+    from sglang.srt.configs.model_config import ModelConfig
+    from sglang.srt.managers.schedule_batch import Req
+    from sglang.srt.mem_cache.cache_init_params import CacheInitParams
+
+logger = logging.getLogger(__name__)
+
+
+class LayerTransferCounter:
+    """Minimal adapter that lets the memory pool notify LMCache per-layer.
+
+    The KV pool calls `wait_until(layer_id)` after finishing a layer, which we
+    translate into a `load_kv_layerwise(layer_id)` call on the LMCache connector
+    within the provided CUDA stream.
+    """
+
+    def __init__(
+        self,
+        num_layers: int,
+        load_stream: torch.cuda.Stream,
+        lmc_connector: LMCacheLayerwiseConnector,
+        printable: bool = False,
+    ):
+        self.num_layers = num_layers
+        self.load_stream = load_stream
+        self.lmc_connector = lmc_connector
+
+    def wait_until(self, layer_id: int):
+        # Ensure ordering of the async loads wrt compute stream(s).
+        self.load_stream.synchronize()
+        with self.load_stream:
+            self.lmc_connector.load_kv_layerwise(layer_id)
+
+
+class LMCRadixCache(RadixCache):
+    """RadixCache + LMCache IO.
+
+    This subclass adds:
+      - LMCache connector setup (device/host buffers, TP rank/size)
+      - Two CUDA streams for async load/store
+      - Layer-wise transfer executor wiring to the KV cache
+      - Overridden `match_prefix` to fetch missing prefix chunks from LMCache
+      - Extended cache_finalization paths to store back into LMCache
+      - Eviction barrier that respects any in-flight host->device stores
+    """
+
+    def __init__(
+        self,
+        params: CacheInitParams,
+        model_config: Optional["ModelConfig"] = None,
+        tp_size: int = 1,
+        rank: int = 0,
+        tp_group: Optional[torch.distributed.ProcessGroup] = None,
+    ):
+        super().__init__(params)
+
+        kvcache = self.token_to_kv_pool_allocator.get_kvcache()
+        self.lmcache_connector = LMCacheLayerwiseConnector(
+            sgl_config=model_config,
+            tp_size=tp_size,
+            rank=rank,
+            # NOTE: The original implementation accessed private buffers via
+            # `_kvcache.k_buffer` / `.v_buffer`. We prefer public accessors when
+            # available; fall back to private fields if needed.
+            k_pool=getattr(
+                kvcache,
+                "k_buffer",
+                getattr(self.token_to_kv_pool_allocator._kvcache, "k_buffer"),
+            ),
+            v_pool=getattr(
+                kvcache,
+                "v_buffer",
+                getattr(self.token_to_kv_pool_allocator._kvcache, "v_buffer"),
+            ),
+            tp_group=tp_group.device_group if tp_group is not None else None,
+        )
+
+        self.load_stream = torch.cuda.Stream()
+        self.store_stream = torch.cuda.Stream()
+
+        self.layer_done_executor = LayerTransferCounter(
+            num_layers=(
+                model_config.num_hidden_layers if model_config is not None else 0
+            ),
+            load_stream=self.load_stream,
+            lmc_connector=self.lmcache_connector,
+        )
+        kvcache.register_layer_transfer_counter(self.layer_done_executor)
+
+        self._in_flight_nodes: list[TreeNode] = []
+        self._node_lock = threading.Lock()
+
+    def reset(self):  # type: ignore[override]
+        super().reset()
+        if hasattr(self, "_in_flight_nodes"):
+            with self._node_lock:
+                self._in_flight_nodes.clear()
+
+    def match_prefix(self, params: MatchPrefixParams) -> MatchResult:  # type: ignore[override]
+        """Match cached prefix; if there's a tail miss, prefetch from LMCache.
+
+        Reuses the base matching logic to obtain (value, last_node). If there
+        remains a *page-aligned* uncached suffix and there is room (or after
+        eviction), we allocate token slots and trigger an async LMCache load
+        into those slots, then materialize a new child node for the retrieved
+        chunk.
+        """
+        key = params.key
+        if self.disable or not key:
+            return super().match_prefix(params)
+
+        if self.page_size != 1:
+            aligned_len = len(key) // self.page_size * self.page_size
+            key = key[:aligned_len]
+
+        base_res = super().match_prefix(params)
+        value: torch.Tensor = base_res.device_indices
+        last_node: TreeNode = base_res.last_device_node
+
+        if value.numel() == len(key):
+            return base_res
+
+        uncached_len = len(key) - value.numel()
+        if uncached_len == 0:
+            return base_res
+
+        chunk_size = self.lmcache_connector.chunk_size()
+        prefix_pad = value.numel() % chunk_size
+
+        if self.token_to_kv_pool_allocator.available_size() < uncached_len:
+            self.evict(EvictParams(num_tokens=uncached_len))
+
+        token_slots = self.token_to_kv_pool_allocator.alloc(uncached_len)
+        if token_slots is None:
+            return base_res
+
+        slot_mapping = torch.cat(
+            [
+                torch.full((value.numel(),), -1, dtype=torch.int64, device=self.device),
+                token_slots.detach().clone().to(torch.int64).to(self.device),
+            ]
+        )
+
+        with torch.cuda.stream(self.load_stream):
+            num_retrieved = self.lmcache_connector.start_load_kv(
+                LoadMetadata(
+                    token_ids=key.token_ids,  # full page-aligned key
+                    slot_mapping=slot_mapping,
+                    offset=value.numel() - prefix_pad,  # LMCache offset convention
+                )
+            )
+        logger.debug("num_retrieved_tokens: %s", num_retrieved)
+
+        if num_retrieved > 0:
+            self.token_to_kv_pool_allocator.free(
+                token_slots[(num_retrieved - prefix_pad) :]
+            )
+        else:
+            self.token_to_kv_pool_allocator.free(token_slots)
+
+        if num_retrieved > 0:
+            fetched = num_retrieved - prefix_pad
+            new_node = TreeNode(priority=last_node.priority)
+            start = value.numel()
+            end = start + fetched
+            new_node.key = key[start:end]
+            new_node.value = token_slots[:fetched]
+            new_node.parent = last_node
+            last_node.children[self.get_child_key_fn(new_node.key)] = new_node
+            last_node = new_node
+
+            value = torch.cat([value, token_slots[:fetched]])
+            self.evictable_size_ += fetched
+
+            self._record_store_event(new_node.parent)
+            self._record_store_event(new_node)
+
+            return MatchResult(
+                device_indices=value,
+                last_device_node=last_node,
+                last_host_node=last_node,
+            )
+
+        return base_res
+
+    def cache_finished_req(self, req: Req, is_insert: bool = True) -> None:  # type: ignore[override]
+        """On request completion, insert device KV into radix and store to LMCache."""
+
+        super().cache_finished_req(req, is_insert=is_insert)
+        if not is_insert:
+            return
+
+        from sglang.srt.server_args import get_global_server_args
+
+        global_server_args = get_global_server_args()
+        topk = global_server_args.speculative_eagle_topk
+        enable_kv_committed_len = topk is None or topk == 1
+        if enable_kv_committed_len:
+            kv_committed_len = req.kv_committed_len
+        else:
+            kv_committed_len = len(req.origin_input_ids) + max(
+                len(req.output_ids) - 1, 0
+            )
+
+        token_ids = (req.origin_input_ids + req.output_ids)[:kv_committed_len]
+        kv_indices = self.req_to_token_pool.req_to_token[
+            req.req_pool_idx, :kv_committed_len
+        ]
+
+        match_result = self.match_prefix(
+            MatchPrefixParams(key=RadixKey(token_ids, req.extra_key))
+        )
+        new_last_node = match_result.last_device_node
+        assert new_last_node is not None
+
+        self.inc_lock_ref(new_last_node)
+        store_md = StoreMetadata(
+            last_node=new_last_node,
+            token_ids=token_ids,
+            kv_indices=kv_indices,
+            offset=0,
+        )
+        with torch.cuda.stream(self.store_stream):
+            self.lmcache_connector.store_kv(store_md)
+        with self._node_lock:
+            self._in_flight_nodes.append(new_last_node)
+
+    def evict(self, params: EvictParams) -> EvictResult:
+        """Before base eviction, wait for any outstanding stores and release locks."""
+        if self.disable:
+            return EvictResult()
+
+        self.store_stream.synchronize()
+        with self._node_lock:
+            for node in self._in_flight_nodes:
+                self.dec_lock_ref(node)
+            self._in_flight_nodes.clear()
+
+        return super().evict(params)
+
+    def pretty_print(self):  # type: ignore[override]
+        super().pretty_print()
+        try:
+            logger.debug(
+                "evictable=%d protected=%d", self.evictable_size_, self.protected_size_
+            )
+        except Exception:  # pragma: no cover
+            pass
diff --git a/sglang/python/sglang/srt/mem_cache/storage/lmcache/unit_test.py b/sglang/python/sglang/srt/mem_cache/storage/lmcache/unit_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..68dfe939d69ac08793e4998c4d46770536ac5cd0
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/storage/lmcache/unit_test.py
@@ -0,0 +1,121 @@
+try:
+    from lmcache.integration.sglang.sglang_adapter import (
+        LMCacheLayerwiseConnector,
+        LoadMetadata,
+        StoreMetadata,
+    )
+except ImportError:
+    raise RuntimeError(
+        "LMCache is not installed. Please install it by running `pip install lmcache` in the root directory of LMCache"
+    )
+
+import os
+
+import torch
+
+from sglang.srt.configs.model_config import ModelConfig
+
+os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
+os.environ["LMCACHE_CONFIG_FILE"] = "example_config.yaml"
+
+
+def test_load_store_metadata():
+    model_config = ModelConfig(
+        model_path="Qwen/Qwen3-4B",
+    )
+
+    # Generate Dummy KV Cache
+    head_num = model_config.num_key_value_heads
+    head_dim = model_config.head_dim
+    layer_num = model_config.num_hidden_layers
+    buffer_size = 256
+    input_id_len = 16
+
+    k_buffer = [
+        torch.randn(buffer_size, head_num, head_dim, dtype=torch.bfloat16).cuda()
+        for _ in range(layer_num)
+    ]
+    v_buffer = [
+        torch.randn(buffer_size, head_num, head_dim, dtype=torch.bfloat16).cuda()
+        for _ in range(layer_num)
+    ]
+
+    connector = LMCacheLayerwiseConnector(model_config, 1, 0, k_buffer, v_buffer)
+
+    fake_token_ids = torch.randint(0, model_config.vocab_size, (input_id_len,)).tolist()
+    fake_kv_indices = torch.randint(0, buffer_size, (input_id_len,))
+    offset = 0
+
+    store_metadata = StoreMetadata(
+        last_node=None,
+        token_ids=fake_token_ids,
+        kv_indices=fake_kv_indices,
+        offset=offset,
+    )
+
+    load_metadata = LoadMetadata(
+        token_ids=fake_token_ids,
+        slot_mapping=fake_kv_indices,
+        offset=offset,
+    )
+
+    current_stream = torch.cuda.current_stream()
+
+    retrieve_token_num = connector.start_load_kv(load_metadata)
+    assert retrieve_token_num == 0
+
+    connector.store_kv(store_metadata)
+    current_stream.synchronize()
+
+    # check retrieve
+    gt_key_buffer = [
+        torch.zeros(input_id_len, head_num, head_dim, dtype=torch.bfloat16).cuda()
+        for _ in range(layer_num)
+    ]
+    gt_value_buffer = [
+        torch.zeros(input_id_len, head_num, head_dim, dtype=torch.bfloat16).cuda()
+        for _ in range(layer_num)
+    ]
+
+    for i in range(layer_num):
+        gt_key_buffer[i] = k_buffer[i][fake_kv_indices]
+        gt_value_buffer[i] = v_buffer[i][fake_kv_indices]
+
+    # clear the k_buffer and v_buffer
+    for _ in range(layer_num):
+        k_buffer[i].zero_()
+        v_buffer[i].zero_()
+
+    retrieve_token_num = connector.start_load_kv(load_metadata)
+    assert retrieve_token_num == input_id_len
+
+    for i in range(layer_num):
+        current_stream.synchronize()
+        connector.load_kv_layerwise(i)
+
+    current_stream.synchronize()
+    test_key_buffer = [
+        torch.zeros(input_id_len, head_num, head_dim, dtype=torch.bfloat16).cuda()
+        for _ in range(layer_num)
+    ]
+    test_value_buffer = [
+        torch.zeros(input_id_len, head_num, head_dim, dtype=torch.bfloat16).cuda()
+        for _ in range(layer_num)
+    ]
+
+    for i in range(layer_num):
+        test_key_buffer[i] = k_buffer[i][fake_kv_indices]
+        test_value_buffer[i] = v_buffer[i][fake_kv_indices]
+
+    for i in range(layer_num):
+        assert torch.allclose(test_key_buffer[i], gt_key_buffer[i])
+        assert torch.allclose(test_value_buffer[i], gt_value_buffer[i])
+
+    print("================================================")
+    print("TEST_LOAD_STORE_METADATA PASSED!")
+    print("================================================")
+    connector.close()
+
+
+if __name__ == "__main__":
+    test_load_store_metadata()
diff --git a/sglang/python/sglang/srt/mem_cache/storage/mooncake_store/README.md b/sglang/python/sglang/srt/mem_cache/storage/mooncake_store/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e9eadb0ee64e95d05f45c9e7e1cbca80c99a5aa5
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/storage/mooncake_store/README.md
@@ -0,0 +1,433 @@
+# Mooncake as L3 KV Cache
+
+This document describes how to use Mooncake as the L3 KV cache for SGLang.
+
+Related documentation:
+* [Quick Start: SGLang HiCache with Mooncake Backend](https://kvcache-ai.github.io/Mooncake/getting_started/examples/sglang-integration/hicache-quick-start.html)
+* [Complete Guide: SGLang HiCache with Mooncake Backend](https://kvcache-ai.github.io/Mooncake/getting_started/examples/sglang-integration/hicache-integration-v1.html)
+* [Mooncake x SGLang HiCache System Design](https://kvcache-ai.github.io/Mooncake/design/hicache-design.html)
+* [HiCache System Design and Optimization](https://docs.sglang.io/advanced_features/hicache_design.html)
+* [SGLang HiCache with Mooncake Backend Benchmark](https://kvcache-ai.github.io/Mooncake/performance/sglang-hicache-benchmark-results-v1.html)
+
+## About Mooncake
+
+Mooncake aims to enhance the inference efficiency of large language models (LLMs), especially in slow object storage environments, by constructing a multi-level caching pool on high-speed interconnected DRAM/SSD resources. Compared to traditional caching systems, Mooncake utilizes (GPUDirect) RDMA technology to transfer data directly in a zero-copy manner, while maximizing the use of multi-NIC resources on a single machine.
+
+For more details about Mooncake, please refer to [Mooncake project](https://github.com/kvcache-ai/Mooncake) and [Mooncake documents](https://kvcache-ai.github.io/Mooncake/).
+
+### Mooncake & SGLang HiCache
+
+Mooncake serves as a high-performance L3 storage backend for SGLang HiCache, enabling distributed KV cache storage across multiple servers with RDMA-accelerated data transfer. This integration addresses the capacity limitations of traditional GPU-only or GPU+CPU caching by providing virtually unlimited cache storage through a distributed memory pool.
+
+When a cache miss occurs in L1 and L2, HiCache automatically fetches the required KV cache from Mooncake's distributed memory pool. The system uses intelligent prefetching strategies to minimize latency, and utilize RDMA technology and zero-copy technique to ensure high-bandwidth, low-latency data transfer between SGLang instances and Mooncake storage nodes.
+
+**Key Advantages:**
+
+- **Scalable Capacity**: Aggregate memory across entire clusters into large distributed pools.
+- **Cache Sharing**: KV caches can be shared by all SGLang instances in the cluster.
+- **RDMA Acceleration**: Direct memory access eliminates CPU overhead and reduces latency.
+- **Zero Copy**: Direct data transfer between L2 and Mooncake without intermediate copying, maximizing throughput.
+- **Fault Tolerance**: Distributed architecture provides resilience against individual node failures.
+
+This integration is particularly valuable for production deployments involving long-context models, multi-turn conversations, and high-throughput serving scenarios where traditional caching approaches become capacity-constrained.
+
+## Install Mooncake
+
+**Method 1: with pip**
+
+```bash
+pip install mooncake-transfer-engine
+```
+
+**Method 2: from source**
+
+Clone Mooncake project:
+
+```bash
+git clone https://github.com/kvcache-ai/Mooncake --recursive
+```
+
+Install dependencies:
+
+```bash
+cd Mooncake
+bash dependencies.sh
+```
+
+Build the project:
+
+```bash
+mkdir build
+cd build
+cmake ..
+make -j
+```
+
+Install Mooncake:
+
+```bash
+sudo make install
+```
+
+For more details, please refer to [Mooncake official installation guide](https://kvcache-ai.github.io/Mooncake/getting_started/build.html).
+
+## Deployment
+
+**Mooncake** is a distributed system that efficiently aggregates memory resources across multiple servers. It can also be deployed on a single server for simpler setups.
+
+When integrated with **SGLang**, the system conceptually consists of four key components: `the master service`, `metadata service` (Optional), `store service`  (Optional), and the `SGLang server`. Among them, the `master service` and `metadata service` are responsible for object and metadata maintenance. The `store service` manages a contiguous memory segment that contributes to the distributed KV cache, making its memory accessible to both local and remote `SGLang servers`. Data transfer occurs directly between the `store service` and `SGLang servers`, bypassing the `master service`.
+
+### Single Server Deployment
+
+**Launch Mooncake `metadata service` (Optional):**
+
+```bash
+python -m mooncake.http_metadata_server
+```
+
+This service is responsible for centralized metadata management including internal connection status and related metadata.
+
+Deployment of the `metadata service` can be skipped in the following cases:
+* Mooncake supports non-centralized metadata management via a P2P handshake mechanism to exchange metadata. When using this mode, deployment of the `metadata service` can be skipped.
+* Mooncake also supports embedding `mededata service` into `master service`. In this case, only the `master service` needs to be started.
+
+**Launch Mooncake `master service`:**
+
+The `master service` orchestrates the logical storage space pool across the entire cluster, managing KV cache space allocation and eviction.
+
+To start `mooncake_master`:
+
+```bash
+mooncake_master --eviction_high_watermark_ratio=0.95
+```
+
+To start `mooncake_master` with embedded `metadata service` (so that a separate `metadata service` deployment can be skipped):
+
+```bash
+mooncake_master --enable_http_metadata_server=true --http_metadata_server_port=8080 --eviction_high_watermark_ratio=0.95
+```
+
+**Understanding `eviction_high_watermark_ratio`:**
+
+When a `PutStart` request fails due to insufficient memory, or when the eviction thread detects that space usage has reached the configured high watermark ratio, an eviction task is triggered to free up space by evicting a portion of objects.
+
+Due to memory fragmentation, allocation failures may occur even when memory usage has not yet reached 100%. The actual threshold depends on the workload. This [benchmark document](https://kvcache-ai.github.io/Mooncake/performance/allocator-benchmark-result.html) provides memory allocation efficiency results under different scenarios. if excessive allocation failures are observed, consider lowering this parameter accordingly.
+
+**Launch Mooncake `store service` (Optional):**
+
+First, create and save a configuration file in JSON format. For example:
+
+```json
+{
+    "local_hostname": "localhost",
+    "metadata_server": "http://127.0.0.1:8080/metadata",
+    "master_server_address": "127.0.0.1:50051",
+    "protocol": "rdma",
+    "device_name": "",
+    "global_segment_size": "4gb",
+    "local_buffer_size": 0
+}
+```
+
+Note: If the `metadata service` is not deployed, set this field to:
+
+```json
+    "metadata_server": "P2PHANDSHAKE",
+```
+
+Then start the `store service`:
+
+```bash
+python -m mooncake.mooncake_store_service --config=[config_path] --port=8081
+```
+
+Mooncake `store service` configuration can also be provided via environment variables:
+
+```bash
+MOONCAKE_LOCAL_HOSTNAME="localhost" \
+MOONCAKE_TE_META_DATA_SERVER="http://127.0.0.1:8080/metadata" \
+MOONCAKE_MASTER="127.0.0.1:50051" \
+MOONCAKE_PROTOCOL="rdma" \
+MOONCAKE_DEVICE="" \
+MOONCAKE_GLOBAL_SEGMENT_SIZE="4gb" \
+MOONCAKE_LOCAL_BUFFER_SIZE=0 \
+python -m mooncake.mooncake_store_service --port=8081
+```
+
+**Parameter Explanation:**
+
+* `local_hostname`, `MOONCAKE_LOCAL_HOSTNAME`: The hostname of the `store service`.
+* `metadata_server`, `MOONCAKE_TE_META_DATA_SERVER` : The network address of the `metadata service`. The default port is 8080. If the `metadata service` is not deployed, set this field to: `"metadata_server": "P2PHANDSHAKE"`.
+* `master_server_address`, `MOONCAKE_MASTER`: The network address of the `master service`. The default port is 50051.
+* `protocol`, `MOONCAKE_PROTOCOL`: The protocol used by Mooncake. Supported values are `"rdma"` or `"tcp"`. For optimal performance, `"rdma"` is recommended.
+* `device_name`, `MOONCAKE_DEVICE`: The RDMA devices used by Mooncake. This field can usually be left empty, as Mooncake automatically discovers available NICs by default. This parameter is required only when the protocol is set to `"rdma"` **and** a specific set of NICs needs to be used. Example: `"device_name": "mlx5_0,mlx5_1"`. To list available devices, run `ibv_devices`. **Note:** If the environment variable `MC_MS_AUTO_DISC` is set to `1`, any `device_name` or `MOONCAKE_DEVICE` configuration will be overridden, and Mooncake will switch to auto-discovery mode.
+  - For tensor parallel deployments where different ranks should use different devices, you can specify device configurations using JSON format:
+    ```json
+    {
+    "device_name": "{0: \"ib0,ib1\", 1: \"ib2,ib3\", 2: \"ib4,ib5\"}"
+    }
+    ```
+  - Or in environment variables:
+    ```bash
+    MOONCAKE_DEVICE="{\"0\": \"ib0,ib1\", \"1\": \"ib2,ib3\", \"2\": \"ib4,ib5\"}"
+    ```
+* `global_segment_size`, `MOONCAKE_GLOBAL_SEGMENT_SIZE`: The amount of memory contributed to the global memory pool. Accepts either bytes (integer) or a string with the `gb` suffix, e.g., `"4294967296"` or `"4gb"`. A larger value allows Mooncake to cache more KV tensors.
+* `local_buffer_size`, `MOONCAKE_LOCAL_BUFFER_SIZE`: Local buffer is used to do request operations such as `Get` or `Put`. In this case, it is set to 0 because the instance functions solely as a storage server, contributing memory to the global pool without issuing any request operations.
+
+**Important: Understanding Global Segment Size**
+
+`global_segment_size` and `MOONCAKE_GLOBAL_SEGMENT_SIZE`: This parameter specifies the amount of memory each instance contributes to the distributed memory pool. The total memory available for KV cache storage across the cluster is the sum of the memory contributed by all instances.
+
+Adjust this value according to system’s available memory and expected cache requirements.
+
+Note: If `MOONCAKE_GLOBAL_SEGMENT_SIZE` is set to a non-zero value when starting the `SGLang server`, launching the `store service` can be skipped. In this case, the `SGLang server` also takes on the role of the `store service`, which simplifies deployment but couples the two components together. Users can choose the deployment approach that best fits their needs.
+
+**Start the `SGLang server` with Mooncake enabled:**
+
+There are three ways to configure Mooncake:
+
+1. Via extra configuration passed through sglang parameters
+2. Using JSON configuration files
+3. Using environment variables
+
+Mooncake loads configuration in the following priority order:
+
+1. If Mooncake-specific options are provided in `--hicache-storage-backend-extra-config`, they are used first.
+2. If not, Mooncake checks whether the environment variable `DEFAULT_MOONCAKE_CONFIG_PATH_ENV` is set, and loads the JSON config file from that path.
+3. If neither of the above is provided, Mooncake falls back to environment variables.
+
+**Using extra-config of sglang arguments to configure Mooncake**
+
+```bash
+python -m sglang.launch_server \
+    --enable-hierarchical-cache \
+    --hicache-storage-backend mooncake \
+    --model-path [model_path] \
+    --hicache-storage-backend-extra-config '{"master_server_address": "127.0.0.1:50051", "local_hostname": "localhost", "metadata_server": "http://127.0.0.1:8080/metadata", "global_segment_size": "4gb", "protocol": "rdma", "device_name": ""}'
+```
+
+**Using JSON file to configure Mooncake**
+
+SGLang server can load Mooncake config from `SGLANG_HICACHE_MOONCAKE_CONFIG_PATH`.
+
+```bash
+export SGLANG_HICACHE_MOONCAKE_CONFIG_PATH=/sgl-workspace/sglang/benchmark/hicache/mooncake_config.json
+
+echo '{
+    "local_hostname": "localhost",
+    "metadata_server": "http://127.0.0.1:8080/metadata",
+    "master_server_address": "127.0.0.1:50051",
+    "protocol": "rdma",
+    "device_name": "",
+    "global_segment_size": "4gb"
+}' > ${SGLANG_HICACHE_MOONCAKE_CONFIG_PATH}
+
+python -m sglang.launch_server \
+    --enable-hierarchical-cache \
+    --hicache-storage-backend mooncake \
+    --model-path [model_path]
+```
+
+**Using env variables to configure Mooncake**
+
+```bash
+MOONCAKE_TE_META_DATA_SERVER="http://127.0.0.1:8080/metadata" \
+MOONCAKE_MASTER="127.0.0.1:50051" \
+MOONCAKE_PROTOCOL="rdma" \
+MOONCAKE_DEVICE="" \
+MOONCAKE_GLOBAL_SEGMENT_SIZE="4gb" \
+python -m sglang.launch_server \
+    --enable-hierarchical-cache \
+    --hicache-storage-backend mooncake\
+    --model-path [model_path]
+```
+
+**Parameter Explanation:**
+
+The Mooncake parameters used here are essentially the same as those configured for the `store service`.
+
+In particular, for the `global segment size`, if at least one `store service` instance is running, this value can be set to `0`. In this case, the SGLang server will not contribute any memory to the system. Note that KV tensors stored in this contributed memory will be lost when the process exits; however, this will **not** cause any system errors.
+
+**Important:** when `tp > 1`, each Tensor Parallel (TP) rank launches its own Mooncake backend instance and contributes `1/global_segment_size` memory. Therefore, the total memory consumption equals `global segment size`.
+
+**HiCache Related Parameters for SGLang Server**
+
+For a comprehensive overview of HiCache-related parameters, please refer to [this document](https://docs.sglang.io/advanced_features/hicache_design.html#related-parameters).
+
+
+Note that, for `--hicache-mem-layout {layer_first,page_first,page_first_direct}`, which specifies the memory layout for the host memory pool, `page_first` or `page_first_direct` are required if use Mooncake backend.
+
+### Distributed Deployment
+
+Distributed deployment of Mooncake is straightforward. Similar to the single-node setup, start one `metadata service` and one `master service` for this cluster. Then start a `store service` on each server.
+
+Mooncake also supports high availability mode. This mode enhances fault tolerance by running the `master service` as a cluster of multiple master nodes coordinated through an `etcd` cluster. The master nodes use `etcd` to elect a leader, which is responsible for handling client requests. For more details about how to deploy in this mode, please refer to our [documents](https://kvcache-ai.github.io/Mooncake/).
+
+### Deployment with Dummy Client (Experimental)
+
+In addition to the standard deployment where SGLang acts as a full Mooncake node, you can use the **Dummy Client** mode. In this mode, SGLang connects to a local **Mooncake Store Service** (Real Client) via RPC/IPC. This decouples the SGLang process from the heavy RDMA and memory management, potentially improving stability and allowing the cache to persist even if the SGLang process restarts.
+
+**Architecture:**
+* **Mooncake Master**: Manages the cluster topology (same as standard).
+* **Mooncake Store Service (Real Client)**: Manages the actual memory pool and RDMA connections. Must be running locally.
+* **SGLang Server (Dummy Client)**: Connects to the local Store Service to access the cache.
+
+#### 1. Launch Services (Master & Store)
+
+First, start the `master service` and the `store service`. The `store service` acts as the Real Client.
+
+**Start Master:**
+```bash
+mooncake_master --eviction_high_watermark_ratio=0.95
+```
+
+**Start Store Service (Real Client):** Crucially, the default port (50052) is used for internal RPC, which the Dummy Client will connect to.
+```bash
+mooncake_client --global_segment_size=4GB
+```
+
+**Parameter Explanation:**
+
+- **`host`**: (string, default: "0.0.0.0"): The hostname of the client.
+
+- **`port`**: (int, default: 50052): The port number the client service listens on.
+
+- **`global_segment_size`**: (string, default: "4GB"): The size of the global segment to be allocated by the client.
+
+- **`master_server_address`**: (string, default: "localhost:50051"): The address of the Master Service.
+
+- **`metadata_server`**: (string, default: "http://localhost:8080/metadata"): The address of the metadata service.
+
+- **`protocol`**: (string, default: "tcp"): The protocol used by the Transfer Engine.
+
+- **`device_name`**: (string, default: ""): The device name used by the Transfer Engine.
+
+- **`threads`**: (int, default: 1): The number of threads used by the client.
+
+#### 2. Launch SGLang (Dummy Client)
+Configure SGLang to connect to the Real Client using the client_server_address parameter.
+
+**Using extra-config of sglang arguments to configure Mooncake**
+
+```bash
+python -m sglang.launch_server \
+    --enable-hierarchical-cache \
+    --hicache-storage-backend mooncake \
+    --model-path [model_path] \
+    --hicache-storage-backend-extra-config '{"standalone_storage": true, "client_server_address": "127.0.0.1:50052"}'
+```
+
+**Using JSON file to configure Mooncake**
+
+SGLang server can load Mooncake config from `SGLANG_HICACHE_MOONCAKE_CONFIG_PATH`.
+
+```bash
+export SGLANG_HICACHE_MOONCAKE_CONFIG_PATH=/sgl-workspace/sglang/benchmark/hicache/mooncake_config.json
+
+echo '{
+    "standalone_storage": true,
+    "client_server_address": "127.0.0.1:50052"
+}' > ${SGLANG_HICACHE_MOONCAKE_CONFIG_PATH}
+
+python -m sglang.launch_server \
+    --enable-hierarchical-cache \
+    --hicache-storage-backend mooncake \
+    --model-path [model_path]
+```
+
+**Using env variables to configure Mooncake**
+
+```bash
+MOONCAKE_STANDALONE_STORAGE=1
+MOONCAKE_CLIENT="127.0.0.1:50052"
+python -m sglang.launch_server \
+    --enable-hierarchical-cache \
+    --hicache-storage-backend mooncake \
+    --model-path [model_path]
+```
+
+### Prefill/Decode Disaggregation
+
+In **PD disaggregation**, the configurations for the `metadata service`, `mooncake master`, and the optional `store service` remain the same as described above. The difference is that SGLang introduces three distinct roles: `prefill worker`, `decode worker`, and `router`.
+
+Among these, the `prefill worker` supports enabling **HiCache**. To run with PD disaggregation, start from the [PD configuration](https://kvcache-ai.github.io/Mooncake/getting_started/examples/sglang-integration-v1.html), and add the HiCache-related parameters (as previously described for the `SGLang server`) to the `prefill worker`.
+
+In the example below, one `prefill worker`, one `decode worker`, and one `router` are launched. HiCache is enabled on the `prefill worker` to optimize prefill performance.
+
+**Prefill worker**:
+
+```bash
+MOONCAKE_TE_META_DATA_SERVER="http://127.0.0.1:8080/metadata" \
+MOONCAKE_MASTER=127.0.0.1:50051 \
+MOONCAKE_PROTOCOL="rdma" \
+MOONCAKE_DEVICE="mlx5_1" \
+MOONCAKE_GLOBAL_SEGMENT_SIZE=4294967296 \
+python -m sglang.launch_server \
+    --model-path [model_path] \
+    --page-size 64 \
+    --enable-hierarchical-cache \
+    --hicache-storage-prefetch-policy timeout \
+    --hicache-storage-backend mooncake \
+    --disaggregation-mode prefill \
+    --disaggregation-ib-device "mlx5_1" \
+    --base-gpu-id 0 \
+    --port 30000
+```
+
+**Decode worker**:
+
+```bash
+python -m sglang.launch_server \
+    --model-path [model_path] \
+    --page-size 64 \
+    --disaggregation-mode decode \
+    --disaggregation-ib-device "mlx5_1" \
+    --base-gpu-id 1 \
+    --port 30001
+```
+
+**Router**:
+
+```bash
+python -m sglang_router.launch_router \
+    --pd-disaggregation \
+    --prefill "http://127.0.0.1:30000" \
+    --decode "http://127.0.0.1:30001" \
+    --host 0.0.0.0 \
+    --port 8000
+```
+
+## Troubleshooting
+
+**RDMA Registration Failure:**
+
+* In some environments, RDMA registration may require root privileges. In this case, try running the program as root.
+* In certain environments (e.g., eRDMA), there is an upper limit on the total amount of RDMA memory that can be registered. Once this limit is exceeded, registration will fail. To resolve this, you can lower the value of `MOONCAKE_GLOBAL_SEGMENT_SIZE`, or reduce the host memory allocated to HiCache in the `SGLang server` (since this memory is fully registered with RDMA to enable zero-copy).
+
+**HiCache CPU Memory Usage:**
+
+When using HiCache, the default L2 host DRAM (CPU memory) size for KV cache is **2 times** the size of the L1 device memory (GPU memory) for KV cache.
+
+If the model is small but the GPU memory is large — especially in multi-TP (tensor parallel) setups — this may cause the L1 KV cache to become very large, which in turn can consume excessive CPU DRAM.
+
+In such cases, you should manually configure an appropriate L2 cache size based on your hardware. This can be done by setting `--hicache-ratio` or `--hicache-size`.
+
+**More Information:**
+
+Additional troubleshooting information can be found [here](https://kvcache-ai.github.io/Mooncake/troubleshooting/troubleshooting.html).
+
+## Test Mooncake Store
+
+This test is intended for developers to quickly verify that the MooncakeStore class interfaces are functioning correctly.
+
+First, start the `metadata service` and `master service`. Then run the `test_mooncake_store.py`. 16MB global segments size is enough to run this test.
+
+```bash
+MOONCAKE_TE_META_DATA_SERVER="http://127.0.0.1:8080/metadata" \
+MOONCAKE_MASTER=127.0.0.1:50051 \
+MOONCAKE_PROTOCOL="rdma" \
+MOONCAKE_GLOBAL_SEGMENT_SIZE=16777216 \
+python3 [path of test_mooncake_store.py]
+```
+
+If all tests pass, the message "✅ All tests passed" will be printed at the end.
diff --git a/sglang/python/sglang/srt/mem_cache/storage/mooncake_store/embedding_cache_controller.py b/sglang/python/sglang/srt/mem_cache/storage/mooncake_store/embedding_cache_controller.py
new file mode 100644
index 0000000000000000000000000000000000000000..87090c9d7f721c8367c6b22864a871ad4c9d9a70
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/storage/mooncake_store/embedding_cache_controller.py
@@ -0,0 +1,315 @@
+import asyncio
+import logging
+import threading
+import time
+from queue import Empty, Queue
+from typing import List, Optional
+
+import torch
+
+from sglang.srt.mem_cache.storage.mooncake_store.mooncake_embedding_store import (
+    MooncakeEmbeddingStore,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class ContiguousMemoryAllocator:
+    """
+    A simple allocator to manage variable-sized contiguous blocks
+    within a large pre-allocated flat buffer.
+    """
+
+    def __init__(self, total_size_bytes: int):
+        self.total_size = total_size_bytes
+        # List of (offset, size) for free blocks
+        self.free_blocks = [(0, total_size_bytes)]
+        self.allocated_map = {}  # {handle: (offset, size)}
+        self.lock = threading.Lock()
+
+    def allocate(self, size_bytes: int) -> Optional[int]:
+        with self.lock:
+            # Simple First-Fit allocation
+            for i, (offset, block_size) in enumerate(self.free_blocks):
+                if block_size >= size_bytes:
+                    # Allocate from this block
+                    remaining_size = block_size - size_bytes
+                    if remaining_size > 0:
+                        self.free_blocks[i] = (offset + size_bytes, remaining_size)
+                    else:
+                        self.free_blocks.pop(i)
+                    return offset
+            return None
+
+    def free(self, offset: int, size_bytes: int):
+        with self.lock:
+            # Return block and merge adjacent free blocks
+            self.free_blocks.append((offset, size_bytes))
+            self.free_blocks.sort()
+
+            merged = []
+            if not self.free_blocks:
+                return
+
+            curr_offset, curr_size = self.free_blocks[0]
+            for next_offset, next_size in self.free_blocks[1:]:
+                if curr_offset + curr_size == next_offset:
+                    curr_size += next_size
+                else:
+                    merged.append((curr_offset, curr_size))
+                    curr_offset, curr_size = next_offset, next_size
+            merged.append((curr_offset, curr_size))
+            self.free_blocks = merged
+
+
+class EmbeddingPrefetchOperation:
+    """Groups all missing images of a request for a single batch GET."""
+
+    def __init__(self, req_id: str, keys: List[str], ptrs: List[int], sizes: List[int]):
+        self.req_id = req_id
+        self.keys = keys
+        self.ptrs = ptrs
+        self.sizes = sizes
+        self.is_finished = False
+        self.success = False
+        self._lock = threading.Lock()
+
+    def mark_done(self, success: bool):
+        with self._lock:
+            self.success = success
+            self.is_finished = True
+
+
+class EmbeddingInsertOperation:
+    """Groups all newly computed images of a request for a single batch PUT."""
+
+    def __init__(self, keys: List[str], ptrs: List[int], sizes: List[int]):
+        self.keys = keys
+        self.ptrs = ptrs
+        self.sizes = sizes
+
+
+class EmbeddingCacheController:
+    def __init__(
+        self,
+        tp_rank,
+        tp_size,
+        max_pool_size_gb=4.0,
+        hidden_dim=1024,
+        tp_group=None,
+        all_rank_get=False,
+    ):
+        self.tp_world_size = tp_size
+        self.tp_group = tp_group
+        self.all_rank_get = all_rank_get
+        self.hidden_dim = hidden_dim
+        self.element_size = torch.float32.itemsize
+
+        # 1. Mooncake Backend & Pinned Buffer
+        self.mooncake_store = MooncakeEmbeddingStore()
+        self.total_pool_size_bytes = int(max_pool_size_gb * 1024**3)
+        self.cpu_pool = torch.empty(
+            self.total_pool_size_bytes, dtype=torch.uint8, pin_memory=True
+        )
+        self.mooncake_store.register_buffer(self.cpu_pool)
+
+        # 2. Variable Size Memory Management
+        self.allocator = ContiguousMemoryAllocator(self.total_pool_size_bytes)
+        self.hash_to_metadata = {}  # {image_hash: (offset, num_tokens, size_bytes)}
+
+        # 3. Task Tracking
+        self.ongoing_prefetch = {}  # {req_id: EmbeddingPrefetchOperation}
+        self.prefetch_queue = Queue()
+        self.insert_queue = Queue()
+
+        self.lock = threading.Lock()
+        self.stop_event = threading.Event()
+        self.io_thread = threading.Thread(target=self._io_loop, daemon=True)
+        self.io_thread.start()
+
+        if self.tp_world_size > 1:
+            if self.tp_group is None:
+                raise ValueError("tp_group must be provided when tp_size > 1")
+            from sglang.srt.distributed.parallel_state import (
+                create_custom_parallel_group,
+            )
+
+            group_ranks = torch.distributed.get_process_group_ranks(self.tp_group)
+            self.prefetch_tp_group = create_custom_parallel_group(
+                group_ranks=group_ranks, backend="gloo"
+            )
+        else:
+            self.prefetch_tp_group = None
+
+    def prefetch(
+        self, req_id: str, image_hashes: List[str], expected_tokens: List[int]
+    ):
+        """Issues ONE batch GET for all missing images in the request."""
+        keys, ptrs, sizes = [], [], []
+
+        with self.lock:
+            for h, num_tokens in zip(image_hashes, expected_tokens):
+                if h in self.hash_to_metadata:
+                    logger.debug(
+                        f"Req {req_id}: Hash  already in local metadata, skipping prefetch."
+                    )
+                    continue
+
+                size_bytes = num_tokens * self.hidden_dim * self.element_size
+                offset = self.allocator.allocate(size_bytes)
+                if offset is None:
+                    continue
+
+                self.hash_to_metadata[h] = (offset, num_tokens, size_bytes)
+                keys.append(h)
+                ptrs.append(self.cpu_pool.data_ptr() + offset)
+                sizes.append(size_bytes)
+
+            if not keys:
+                return
+
+            logger.info(
+                f"Req {req_id}: Starting global fetch for {len(keys)} images from Mooncake."
+            )
+
+            op = EmbeddingPrefetchOperation(req_id, keys, ptrs, sizes)
+            self.ongoing_prefetch[req_id] = op
+            self.prefetch_queue.put(op)
+
+    def insert_batch(
+        self, image_hashes: List[str], embedding_tensors: List[torch.Tensor]
+    ):
+        """Issues ONE batch PUT for all embeddings computed by this request."""
+        keys, ptrs, sizes = [], [], []
+
+        with self.lock:
+            for h, tensor in zip(image_hashes, embedding_tensors):
+                if h in self.hash_to_metadata:
+                    continue
+
+                num_tokens = tensor.shape[0]
+                size_bytes = num_tokens * self.hidden_dim * self.element_size
+                offset = self.allocator.allocate(size_bytes)
+                if offset is None:
+                    continue
+
+                # Copy to pinned pool for RDMA
+                self.hash_to_metadata[h] = (offset, num_tokens, size_bytes)
+                target_view = (
+                    self.cpu_pool[offset : offset + size_bytes]
+                    .view(torch.float32)
+                    .view(num_tokens, self.hidden_dim)
+                )
+                target_view.copy_(tensor.cpu())
+
+                keys.append(h)
+                ptrs.append(self.cpu_pool.data_ptr() + offset)
+                sizes.append(size_bytes)
+
+            if keys:
+                logger.info(
+                    f"Global Cache: Inserting {len(keys)} new embeddings into Mooncake cluster."
+                )
+                self.insert_queue.put(EmbeddingInsertOperation(keys, ptrs, sizes))
+
+    def _io_loop(self):
+        """Asynchronous worker handling both Batch GET and Batch PUT."""
+        while not self.stop_event.is_set():
+            processed_any = False
+
+            try:
+                op = self.prefetch_queue.get_nowait()
+                results = self.mooncake_store.batch_get(op.keys, op.ptrs, op.sizes)
+                success_count = sum(results)
+                logger.info(
+                    f"Mooncake GET Finished: Req {op.req_id}, Successfully fetched {success_count}/{len(op.keys)} images."
+                )
+                op.mark_done(all(results))
+                self.prefetch_queue.task_done()
+                processed_any = True
+            except Empty:
+                pass
+
+            try:
+                op = self.insert_queue.get_nowait()
+                self.mooncake_store.batch_put(op.keys, op.ptrs, op.sizes)
+                logger.info(
+                    f"Mooncake PUT Finished: Successfully stored {len(op.keys)} keys in cluster."
+                )
+                self.insert_queue.task_done()
+                processed_any = True
+            except Empty:
+                pass
+
+            if not processed_any:
+                time.sleep(0.001)
+
+    def check_prefetch_progress(self, req_id: str) -> bool:
+        """TP-Group barrier: ensures all cards have the request batch ready."""
+        local_ready = False
+        with self.lock:
+            if req_id not in self.ongoing_prefetch:
+                local_ready = True
+            else:
+                op = self.ongoing_prefetch[req_id]
+                if op.is_finished:
+                    local_ready = op.success
+
+        if self.all_rank_get and self.tp_world_size > 1:
+            ready_tensor = torch.tensor(
+                [1 if local_ready else 0], dtype=torch.int, device="cpu"
+            )
+            torch.distributed.all_reduce(
+                ready_tensor,
+                op=torch.distributed.ReduceOp.MIN,
+                group=self.prefetch_tp_group,
+            )
+            local_ready = ready_tensor.item() == 1
+
+        if local_ready:
+            with self.lock:
+                self.ongoing_prefetch.pop(req_id, None)
+            return True
+        return False
+
+    def get_embeddings(self, image_hashes: List[str]) -> List[torch.Tensor]:
+        """Final reconstruction for model input."""
+        with self.lock:
+            tensors = []
+            for h in image_hashes:
+                offset, num_tokens, size_bytes = self.hash_to_metadata[h]
+                tensors.append(
+                    self.cpu_pool[offset : offset + size_bytes]
+                    .view(torch.float32)
+                    .view(num_tokens, self.hidden_dim)
+                )
+            return tensors
+
+    async def batch_is_exist(self, image_hashes: List[str]) -> List[bool]:
+        with self.lock:
+            local_results = [h in self.hash_to_metadata for h in image_hashes]
+        local_hit_count = sum(local_results)
+
+        global_hit_count = 0
+        if not all(local_results):
+            missing_indices = [i for i, res in enumerate(local_results) if not res]
+            missing_hashes = [image_hashes[i] for i in missing_indices]
+
+            global_exists = await asyncio.to_thread(
+                self.mooncake_store.batch_is_exist, missing_hashes
+            )
+            global_hit_count = sum(global_exists)
+
+            for i, exists in zip(missing_indices, global_exists):
+                local_results[i] = exists
+
+        total = len(image_hashes)
+        miss_count = total - local_hit_count - global_hit_count
+        logger.info(
+            f"=== Multi-Level Cache Check === "
+            f"Total: {total} | "
+            f"Local Hits: {local_hit_count} | "
+            f"Global Hits: {global_hit_count} | "
+            f"Misses (GPU Work): {miss_count}"
+        )
+        return local_results
diff --git a/sglang/python/sglang/srt/mem_cache/storage/mooncake_store/mooncake_embedding_store.py b/sglang/python/sglang/srt/mem_cache/storage/mooncake_store/mooncake_embedding_store.py
new file mode 100644
index 0000000000000000000000000000000000000000..f358d97dcc796815f61c4f0648e2d8baaf5ec4de
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/storage/mooncake_store/mooncake_embedding_store.py
@@ -0,0 +1,68 @@
+import logging
+from typing import Any, List
+
+from sglang.srt.mem_cache.storage.mooncake_store.mooncake_store import MooncakeBaseStore
+
+logger = logging.getLogger(__name__)
+
+
+class MooncakeEmbeddingStore(MooncakeBaseStore):
+    def __init__(
+        self,
+        storage_config: Any = None,
+    ):
+        super().__init__()
+
+        MooncakeDistributedStore = self._import_mooncake_store()
+        self.store = MooncakeDistributedStore()
+        self.config = self._load_config(storage_config)
+        ret_code = self.store.setup(
+            self.config.local_hostname,
+            self.config.metadata_server,
+            self.config.global_segment_size,
+            16 * 1024 * 1024,  # Internal local buffer size
+            self.config.protocol,
+            self.config.device_name,
+            self.config.master_server_address,
+        )
+        if ret_code != 0:
+            raise RuntimeError(f"Failed to setup Mooncake Embedding Store: {ret_code}")
+
+        logger.info("Mooncake Embedding Store initialized successfully.")
+
+    def get_key(self, image_hash: str) -> str:
+        return f"emb_{image_hash}"
+
+    def batch_get(
+        self, hashes: List[str], ptrs: List[int], sizes: List[int]
+    ) -> List[bool]:
+        keys = [self.get_key(h) for h in hashes]
+        results = self.store.batch_get_into(keys, ptrs, sizes)
+        return [res > 0 for res in results]
+
+    def batch_put(
+        self, hashes: List[str], ptrs: List[int], sizes: List[int]
+    ) -> List[bool]:
+        keys = [self.get_key(h) for h in hashes]
+        exists = self.store.batch_is_exist(keys)
+
+        put_keys, put_ptrs, put_sizes, indices = [], [], [], []
+        success_map = [True] * len(hashes)
+
+        for i, status in enumerate(exists):
+            if status != 1:
+                put_keys.append(keys[i])
+                put_ptrs.append(ptrs[i])
+                put_sizes.append(sizes[i])
+                indices.append(i)
+
+        if put_keys:
+            results = self.store.batch_put_from(put_keys, put_ptrs, put_sizes)
+            for i, res in enumerate(results):
+                success_map[indices[i]] = res == 0
+        return success_map
+
+    def batch_is_exist(self, hashes: List[str]) -> List[bool]:
+        keys = [self.get_key(h) for h in hashes]
+        results = self.store.batch_is_exist(keys)
+        return [res == 1 for res in results]
diff --git a/sglang/python/sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py b/sglang/python/sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py
new file mode 100644
index 0000000000000000000000000000000000000000..892ab0d462ecce1efdcc3f8429f41e2e8db1efd1
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py
@@ -0,0 +1,810 @@
+import ctypes
+import json
+import logging
+import os
+import time
+import uuid
+from dataclasses import dataclass
+from typing import Any, List, Optional
+
+import requests
+import torch
+
+from sglang.srt.environ import envs
+from sglang.srt.mem_cache.hicache_storage import (
+    HiCacheStorage,
+    HiCacheStorageConfig,
+    HiCacheStorageExtraInfo,
+)
+from sglang.srt.mem_cache.memory_pool_host import HostKVCache, HostTensorAllocator
+from sglang.srt.observability.metrics_collector import StorageMetrics
+
+DEFAULT_LOCAL_BUFFER_SIZE = 16 * 1024 * 1024  # 16 MB
+SETUP_TIMEOUT = 600  # 10min
+
+logger = logging.getLogger(__name__)
+
+
+class MooncakeHostTensorAllocator(HostTensorAllocator):
+    def __init__(self):
+        super().__init__()
+        from mooncake.store import MooncakeHostMemAllocator
+
+        self.allocator = MooncakeHostMemAllocator()
+        self.ptr = None
+
+    def allocate(
+        self, dims: tuple, dtype: torch.dtype, device: str = "cpu"
+    ) -> torch.Tensor:
+        """
+        Allocates memory using MooncakeHostMemAllocator and wraps it in a PyTorch tensor.
+        """
+        self.dims = dims
+        self.dtype = dtype
+        size = 1
+        for d in dims:
+            size *= d
+        size *= torch.tensor([], dtype=self.dtype).element_size()
+        ptr_int = self.allocator.alloc(size)
+        self.ptr = ptr_int
+        c_type = ctypes.c_byte * size
+        c_array = c_type.from_address(ptr_int)
+
+        tensor = torch.frombuffer(c_array, dtype=torch.uint8, count=size)
+
+        if dtype != torch.uint8:
+            element_size = torch.tensor([], dtype=dtype).element_size()
+            assert size % element_size == 0, "Size must be divisible by element size"
+            tensor = tensor.view(dtype)
+
+        return tensor.view(dims)
+
+
+def _parse_global_segment_size(value) -> int:
+    if isinstance(value, int):
+        return value
+    if isinstance(value, str):
+        s = value.strip().lower()
+        if s.endswith("gb"):
+            num = s[:-2].strip()
+            if not num:
+                raise ValueError(
+                    "Invalid global_segment_size: missing number before 'gb'"
+                )
+            return int(num) * 1024 * 1024 * 1024
+        return int(s)
+    return int(value)
+
+
+@dataclass
+class MooncakeStoreConfig:
+    local_hostname: str
+    metadata_server: str
+    global_segment_size: int
+    protocol: str
+    device_name: str
+    master_server_address: str
+    master_metrics_port: int
+    check_server: bool
+    standalone_storage: bool
+    client_server_address: str
+
+    @staticmethod
+    def from_file() -> "MooncakeStoreConfig":
+        """Load the config from a JSON file."""
+        if not envs.SGLANG_HICACHE_MOONCAKE_CONFIG_PATH.is_set():
+            raise RuntimeError(
+                f"Config file path not set. Please set {envs.SGLANG_HICACHE_MOONCAKE_CONFIG_PATH.name}"
+            )
+        file_path = envs.SGLANG_HICACHE_MOONCAKE_CONFIG_PATH.get()
+        try:
+            with open(file_path) as fin:
+                config = json.load(fin)
+        except Exception as e:
+            raise RuntimeError(f"Failed to load config from {file_path}: {str(e)}")
+
+        if (
+            "master_server_address" not in config
+            and "client_server_address" not in config
+        ):
+            raise ValueError(
+                "Either master_server_address or client_server_address is required in config file"
+            )
+
+        return MooncakeStoreConfig(
+            local_hostname=config.get(
+                "local_hostname", envs.MOONCAKE_LOCAL_HOSTNAME.default
+            ),
+            metadata_server=config.get(
+                "metadata_server", envs.MOONCAKE_TE_META_DATA_SERVER.default
+            ),
+            global_segment_size=_parse_global_segment_size(
+                config.get(
+                    "global_segment_size", envs.MOONCAKE_GLOBAL_SEGMENT_SIZE.default
+                )
+            ),
+            protocol=config.get("protocol", envs.MOONCAKE_PROTOCOL.default),
+            device_name=config.get("device_name", envs.MOONCAKE_DEVICE.default),
+            master_server_address=config.get(
+                "master_server_address", envs.MOONCAKE_MASTER.default
+            ),
+            master_metrics_port=config.get(
+                "master_metrics_port", envs.MOONCAKE_MASTER_METRICS_PORT.default
+            ),
+            check_server=config.get("check_server", envs.MOONCAKE_CHECK_SERVER.default),
+            standalone_storage=config.get(
+                "standalone_storage", envs.MOONCAKE_STANDALONE_STORAGE.default
+            ),
+            client_server_address=config.get(
+                "client_server_address", envs.MOONCAKE_CLIENT.default
+            ),
+        )
+
+    @staticmethod
+    def load_from_env() -> "MooncakeStoreConfig":
+        """Load config from a file specified in the environment variable.
+        export MOONCAKE_MASTER=10.13.3.232:50051
+        export MOONCAKE_PROTOCOL="rdma"
+        export MOONCAKE_DEVICE=""
+        export MOONCAKE_TE_META_DATA_SERVER="P2PHANDSHAKE"
+        """
+        # other required environment variables...
+        if not envs.MOONCAKE_MASTER.is_set() and not envs.MOONCAKE_CLIENT.is_set():
+            raise ValueError(
+                "Either the environment variable 'MOONCAKE_MASTER' or 'MOONCAKE_CLIENT' is not set."
+            )
+
+        # Special handling for local_hostname: try MOONCAKE_LOCAL_HOSTNAME first,
+        # then fall back to LOCAL_HOSTNAME if not set.
+        # This is for forward compatibility with the legacy LOCAL_HOSTNAME environment variable.
+        if envs.MOONCAKE_LOCAL_HOSTNAME.is_set():
+            local_hostname = envs.MOONCAKE_LOCAL_HOSTNAME.get()
+        else:
+            local_hostname = os.getenv(
+                "LOCAL_HOSTNAME", envs.MOONCAKE_LOCAL_HOSTNAME.default
+            )
+
+        return MooncakeStoreConfig(
+            local_hostname=local_hostname,
+            metadata_server=envs.MOONCAKE_TE_META_DATA_SERVER.get(),
+            global_segment_size=_parse_global_segment_size(
+                envs.MOONCAKE_GLOBAL_SEGMENT_SIZE.get()
+            ),
+            protocol=envs.MOONCAKE_PROTOCOL.get(),
+            device_name=envs.MOONCAKE_DEVICE.get(),
+            master_server_address=envs.MOONCAKE_MASTER.get(),
+            master_metrics_port=envs.MOONCAKE_MASTER_METRICS_PORT.get(),
+            check_server=envs.MOONCAKE_CHECK_SERVER.get(),
+            standalone_storage=envs.MOONCAKE_STANDALONE_STORAGE.get(),
+            client_server_address=envs.MOONCAKE_CLIENT.get(),
+        )
+
+    @staticmethod
+    def load_from_extra_config(extra_config: dict) -> "MooncakeStoreConfig":
+        """Load config from extra_config dictionary."""
+        if (
+            "master_server_address" not in extra_config
+            and "client_server_address" not in extra_config
+        ):
+            raise ValueError(
+                "Either master_server_address or client_server_address is required in extra_config"
+            )
+
+        return MooncakeStoreConfig(
+            local_hostname=extra_config.get(
+                "local_hostname", envs.MOONCAKE_LOCAL_HOSTNAME.default
+            ),
+            metadata_server=extra_config.get(
+                "metadata_server", envs.MOONCAKE_TE_META_DATA_SERVER.default
+            ),
+            global_segment_size=_parse_global_segment_size(
+                extra_config.get(
+                    "global_segment_size", envs.MOONCAKE_GLOBAL_SEGMENT_SIZE.default
+                )
+            ),
+            protocol=extra_config.get("protocol", envs.MOONCAKE_PROTOCOL.default),
+            device_name=extra_config.get("device_name", envs.MOONCAKE_DEVICE.default),
+            master_server_address=extra_config.get(
+                "master_server_address", envs.MOONCAKE_MASTER.default
+            ),
+            master_metrics_port=extra_config.get(
+                "master_metrics_port", envs.MOONCAKE_MASTER_METRICS_PORT.default
+            ),
+            check_server=extra_config.get(
+                "check_server", envs.MOONCAKE_CHECK_SERVER.default
+            ),
+            standalone_storage=extra_config.get(
+                "standalone_storage", envs.MOONCAKE_STANDALONE_STORAGE.default
+            ),
+            client_server_address=extra_config.get(
+                "client_server_address", envs.MOONCAKE_CLIENT.default
+            ),
+        )
+
+
+class MooncakeBaseStore:
+    def __init__(self):
+        self.store = None
+        self.config = None
+
+    def _import_mooncake_store(self):
+        try:
+            from mooncake.store import MooncakeDistributedStore
+
+            return MooncakeDistributedStore
+        except ImportError as e:
+            raise ImportError(
+                "Please install mooncake by following the instructions at "
+                "https://kvcache-ai.github.io/Mooncake/getting_started/build.html "
+                "to run SGLang with MooncakeConnector."
+            ) from e
+
+    def _load_config(self, storage_config: Any = None):
+        extra_config = (
+            getattr(storage_config, "extra_config", None) if storage_config else None
+        )
+
+        if extra_config and (
+            extra_config.get("master_server_address") is not None
+            or extra_config.get("client_server_address") is not None
+        ):
+            config = MooncakeStoreConfig.load_from_extra_config(extra_config)
+            logger.info("Mooncake Configuration loaded from extra_config successfully.")
+
+        elif envs.SGLANG_HICACHE_MOONCAKE_CONFIG_PATH.is_set():
+            config = MooncakeStoreConfig.from_file()
+            logger.info("Mooncake Configuration loaded from file successfully.")
+
+        else:
+            config = MooncakeStoreConfig.load_from_env()
+            logger.info("Mooncake Configuration loaded from env successfully.")
+
+        return config
+
+    def register_buffer(self, tensor: torch.Tensor):
+        if self.store is None:
+            raise RuntimeError("Mooncake store is not initialized.")
+        ptr = tensor.data_ptr()
+        size = tensor.numel() * tensor.element_size()
+        ret_code = self.store.register_buffer(ptr, size)
+        if ret_code != 0:
+            logger.error(f"Failed to register buffer, error code: {ret_code}")
+            raise RuntimeError(
+                f"Failed to register buffer to Mooncake Store, error code: {ret_code}"
+            )
+
+
+class MooncakeStore(HiCacheStorage, MooncakeBaseStore):
+
+    def __init__(
+        self, storage_config: HiCacheStorageConfig = None, mem_pool: HostKVCache = None
+    ):
+        MooncakeBaseStore.__init__(self)
+        MooncakeDistributedStore = self._import_mooncake_store()
+        try:
+            self.store = MooncakeDistributedStore()
+
+            self.config = self._load_config(storage_config)
+            extra_config = (
+                getattr(storage_config, "extra_config", None)
+                if storage_config
+                else None
+            )
+            tp_scale_factor = 1 if storage_config is None else storage_config.tp_size
+
+            per_tp_global_segment_size = (
+                self.config.global_segment_size // tp_scale_factor
+            )
+
+            # Check if extra_backend_tag should be passed to MooncakeDistributedStore
+            self.extra_backend_tag = None
+            if extra_config and "extra_backend_tag" in extra_config:
+                self.extra_backend_tag = extra_config["extra_backend_tag"]
+                logger.info(f"Using extra_backend_tag: {self.extra_backend_tag}")
+
+            # Check server status
+            if self.config.check_server:
+                self.check_server()
+
+            # Handle JSON device_name configuration
+            device_name = self.config.device_name
+            if device_name and device_name.strip().startswith("{"):
+                try:
+                    device_config = json.loads(device_name)
+                    if storage_config and hasattr(storage_config, "tp_rank"):
+                        tp_rank = storage_config.tp_rank
+                        # Try both integer and string keys since JSON parsing may convert keys
+                        device_name = device_config.get(tp_rank, "")
+                        if not device_name:
+                            device_name = device_config.get(str(tp_rank), "")
+                    else:
+                        device_name = ""
+                except (json.JSONDecodeError, AttributeError):
+                    logger.warning(
+                        f"Failed to parse device_name as JSON: {device_name}"
+                    )
+                    device_name = ""
+            if self.config.standalone_storage:
+                if not isinstance(mem_pool.allocator, MooncakeHostTensorAllocator):
+                    raise RuntimeError(
+                        "MooncakeStore with standalone_storage=True requires MooncakeHostTensorAllocator. "
+                        "Please set standalone_storage=False "
+                        "or upgrade Mooncake by 'pip install mooncake --upgrade'."
+                    )
+                ret_code = self.store.setup_dummy(
+                    mem_pool.size * mem_pool.size_per_token,
+                    DEFAULT_LOCAL_BUFFER_SIZE,  # Zero copy interface does not need local buffer
+                    self.config.client_server_address,
+                )
+            else:
+                try:
+                    from sglang.srt.distributed.parallel_state import (
+                        get_mooncake_transfer_engine,
+                    )
+
+                    self._shared_mooncake_transfer_engine = (
+                        get_mooncake_transfer_engine()
+                    )
+                except Exception:
+                    self._shared_mooncake_transfer_engine = None
+                    logger.debug("Failed to reuse initialized mooncake transfer engine")
+
+                # Only reuse the shared MooncakeTransferEngine when its
+                # configuration matches the one used by MooncakeStore.
+                if (
+                    self._shared_mooncake_transfer_engine is not None
+                    and device_name
+                    == self._shared_mooncake_transfer_engine.get_ib_device()
+                    and self.config.metadata_server == "P2PHANDSHAKE"
+                    and self.config.protocol == "rdma"
+                ):
+                    client_hostname = (
+                        self._shared_mooncake_transfer_engine.get_session_id()
+                    )
+                    transfer_engine = self._shared_mooncake_transfer_engine.get_engine()
+                    logger.info(
+                        f"Reuse initialized mooncake transfer engine: {self._shared_mooncake_transfer_engine}"
+                    )
+                else:
+                    client_hostname = self.config.local_hostname
+                    transfer_engine = None
+
+                ret_code = self.store.setup(
+                    client_hostname,
+                    self.config.metadata_server,
+                    per_tp_global_segment_size,
+                    DEFAULT_LOCAL_BUFFER_SIZE,  # Zero copy interface does not need local buffer
+                    self.config.protocol,
+                    device_name,
+                    self.config.master_server_address,
+                    transfer_engine,
+                )
+            if ret_code:
+                raise RuntimeError(
+                    f"Failed to setup Mooncake store, error code: {ret_code}"
+                )
+            logger.info("Mooncake store setup successfully.")
+
+            self.warmup()
+            logger.info("Mooncake store warmup successfully.")
+
+            if storage_config is not None:
+                self.is_mla_backend = storage_config.is_mla_model
+                self.local_rank = storage_config.tp_rank
+                self.pp_rank = storage_config.pp_rank
+                self.pp_size = storage_config.pp_size
+            else:
+                self.is_mla_backend = False
+                self.local_rank = 0
+                self.pp_rank = 0
+                self.pp_size = 1
+
+            self.enable_pp = self.pp_size > 1
+            if self.enable_pp:
+                self.mha_suffix = f"{self.local_rank}_{self.pp_rank}"
+                self.mla_suffix = f"{self.pp_rank}"
+            else:
+                self.mha_suffix = f"{self.local_rank}"
+                self.mla_suffix = ""
+
+            self.storage_config = storage_config
+            self.split_factor = 0
+            if self.storage_config.should_split_heads:
+                self.split_factor = (
+                    self.storage_config.tp_lcm_size // self.storage_config.tp_size
+                )
+                base_rank = self.local_rank * self.split_factor
+                target_ranks = [base_rank + i for i in range(self.split_factor)]
+                if self.enable_pp:
+                    self.mha_suffix = [
+                        f"{rank}_{self.pp_rank}" for rank in target_ranks
+                    ]
+                else:
+                    self.mha_suffix = [f"{rank}" for rank in target_ranks]
+
+            self.gb_per_page = None
+            self.prefetch_pgs = []
+            self.backup_pgs = []
+            self.prefetch_bandwidth = []
+            self.backup_bandwidth = []
+
+        except ValueError as e:
+            logger.error("Configuration loading failed: %s", e)
+            raise
+        except Exception as exc:
+            logger.error("An error occurred while loading the configuration: %s", exc)
+            raise
+
+    def check_server(self):
+        master_server_ip = self.config.master_server_address.split(":")[0]
+        segments_url = f"http://{master_server_ip}:{self.config.master_metrics_port}/get_all_segments"
+        start_time = time.perf_counter()
+
+        check_result = False
+        while time.perf_counter() - start_time < SETUP_TIMEOUT:
+            try:
+                check_segments_resp = requests.get(segments_url, timeout=3)
+            except Exception:
+                logger.info(
+                    "waiting mooncake store server started, cost_time: %.2f seconds.",
+                    time.perf_counter() - start_time,
+                )
+                time.sleep(3)
+                continue
+
+            if check_segments_resp.text == "":
+                logger.info(
+                    "waiting mooncake store server started, cost_time: %.2f seconds.",
+                    time.perf_counter() - start_time,
+                )
+                time.sleep(3)
+                continue
+
+            logger.info("Mooncake store server started successfully.")
+            check_result = True
+            break
+
+        if not check_result:
+            logger.error("Launch mooncake store server timeout")
+            raise ValueError("Launch mooncake store server timeout")
+
+    def warmup(self):
+        warmup_key = "sglang_mooncake_store_warmup_key" + uuid.uuid4().hex
+        warmup_value = bytes(4 * 1024)  # 4 KB
+        assert self.store.put(warmup_key, warmup_value) == 0
+        assert self.store.is_exist(warmup_key) == 1
+        assert self.store.get(warmup_key) == warmup_value
+
+    def register_mem_pool_host(self, mem_pool_host: HostKVCache):
+        super().register_mem_pool_host(mem_pool_host)
+        assert self.mem_pool_host.layout in [
+            "page_first",
+            "page_first_direct",
+            "page_head",
+        ], "mooncake store storage backend only support page first or page first direct layout"
+        buffer = self.mem_pool_host.kv_buffer
+        try:
+            super().register_buffer(buffer)
+        except TypeError as err:
+            logger.error("Failed to register buffer to Mooncake Store: %s", err)
+            raise TypeError("Mooncake Store Register Buffer Error.") from err
+
+        bytes_per_page = mem_pool_host.get_ksize_per_token() * mem_pool_host.page_size
+        self.gb_per_page = bytes_per_page / (1 << 30)
+
+    def _get_mha_split_heads_buffer_meta(self, keys, indices):
+        ptr_list, element_size_list = (
+            self.mem_pool_host.get_split_heads_page_buffer_meta(
+                indices, self.split_factor
+            )
+        )
+        key_list = []
+        for key_ in keys:
+            for suffix in self.mha_suffix:
+                key_list.append(f"{key_}_{suffix}_k")
+                key_list.append(f"{key_}_{suffix}_v")
+        assert len(key_list) == len(ptr_list)
+        return key_list, ptr_list, element_size_list
+
+    def _get_mha_buffer_meta(self, keys, indices):
+        ptr_list, element_size_list = self.mem_pool_host.get_page_buffer_meta(indices)
+        key_list = []
+        for key_ in keys:
+            key_list.append(f"{key_}_{self.mha_suffix}_k")
+            key_list.append(f"{key_}_{self.mha_suffix}_v")
+        assert len(key_list) == len(ptr_list)
+        return key_list, ptr_list, element_size_list
+
+    def _get_mla_buffer_meta(self, keys, indices):
+        ptr_list, element_size_list = self.mem_pool_host.get_page_buffer_meta(indices)
+        key_list = []
+        for key_ in keys:
+            key_list.append(f"{key_}_{self.mla_suffix}_k")
+        assert len(key_list) == len(ptr_list)
+        return key_list, ptr_list, element_size_list
+
+    def _batch_preprocess(self, keys, host_indices):
+        assert len(keys) > 0
+        assert len(keys) == len(host_indices) // self.mem_pool_host.page_size
+        if self.is_mla_backend:
+            return self._get_mla_buffer_meta(keys, host_indices)
+        else:
+            if self.storage_config.should_split_heads:
+                return self._get_mha_split_heads_buffer_meta(keys, host_indices)
+            else:
+                return self._get_mha_buffer_meta(keys, host_indices)
+
+    def _batch_postprocess(self, results: List[int], is_set_operate=False):
+        """
+        refer to https://github.com/kvcache-ai/Mooncake/blob/main/mooncake-store/include/pybind_client.h
+        for batch_get_into, results is Vector of integers,
+            where each element is the number of bytes read on success, or a negative value on error
+        for batch_put_from, results is Vector of integers,
+            where each element is 0 on success, or a negative value on error
+        """
+        if self.is_mla_backend:
+            return [k_res == 0 if is_set_operate else k_res > 0 for k_res in results]
+        else:
+            if self.storage_config.should_split_heads:
+                kv_groups = [
+                    results[i : i + self.split_factor * 2]
+                    for i in range(0, len(results), self.split_factor * 2)
+                ]
+                return [
+                    (
+                        all(res == 0 for res in kv_group)
+                        if is_set_operate
+                        else all(res > 0 for res in kv_group)
+                    )
+                    for kv_group in kv_groups
+                ]
+            else:
+                kv_pairs = zip(results[::2], results[1::2])
+                return [
+                    (
+                        (k_res == 0 and v_res == 0)
+                        if is_set_operate
+                        else (k_res > 0 and v_res > 0)
+                    )
+                    for k_res, v_res in kv_pairs
+                ]
+
+    def batch_get_v1(
+        self,
+        keys: List[str],
+        host_indices: torch.Tensor,
+        extra_info: Optional[HiCacheStorageExtraInfo] = None,
+    ) -> List[bool]:
+        # Apply extra_backend_tag prefix if available
+        if self.extra_backend_tag is not None:
+            prefix = self.extra_backend_tag
+            keys = [f"{prefix}_{key}" for key in keys]
+
+        key_strs, buffer_ptrs, buffer_sizes = self._batch_preprocess(keys, host_indices)
+        get_results = self._get_batch_zero_copy_impl(
+            key_strs, buffer_ptrs, buffer_sizes
+        )
+        return self._batch_postprocess(get_results, is_set_operate=False)
+
+    def batch_set_v1(
+        self,
+        keys: List[str],
+        host_indices: torch.Tensor,
+        extra_info: Optional[HiCacheStorageExtraInfo] = None,
+    ) -> List[bool]:
+        # Apply extra_backend_tag prefix if available
+        if self.extra_backend_tag is not None:
+            prefix = self.extra_backend_tag
+            keys = [f"{prefix}_{key}" for key in keys]
+
+        key_strs, buffer_ptrs, buffer_sizes = self._batch_preprocess(keys, host_indices)
+        exist_result = self._batch_exist(key_strs)
+
+        set_keys = []
+        set_buffer_ptrs = []
+        set_buffer_sizes = []
+        set_indices = []
+        set_results = [-1] * len(key_strs)
+        for i in range(len(key_strs)):
+            if exist_result[i] != 1:
+                set_keys.append(key_strs[i])
+                set_buffer_ptrs.append(buffer_ptrs[i])
+                set_buffer_sizes.append(buffer_sizes[i])
+                set_indices.append(i)
+            else:
+                set_results[i] = 0
+
+        # Only set non-existing keys to storage
+        if len(set_keys) > 0:
+            put_results = self._put_batch_zero_copy_impl(
+                set_keys, set_buffer_ptrs, set_buffer_sizes
+            )
+            for i in range(len(set_indices)):
+                set_results[set_indices[i]] = put_results[i]
+
+        return self._batch_postprocess(set_results, is_set_operate=True)
+
+    def set(
+        self,
+        key,
+        value: Optional[Any] = None,
+        target_location: Optional[List[int]] = None,
+        target_sizes: Optional[List[int]] = None,
+    ) -> bool:
+        # Only support zero copy set for now
+        assert target_location is not None and target_sizes is not None
+        exist_result = self._batch_exist([key])
+        if exist_result[0] == 1:
+            return True
+        put_result = self._put_batch_zero_copy_impl(
+            [key], [target_location], [target_sizes]
+        )
+        return put_result[0] == 0
+
+    def batch_set(
+        self,
+        keys: List[str],
+        values: Optional[List[torch.Tensor]] = None,
+        target_locations: Optional[List[int]] = None,
+        target_sizes: Optional[List[int]] = None,
+    ) -> bool:
+        # Only support zero copy set for now
+        assert target_locations is not None and target_sizes is not None
+        assert len(keys) == len(target_locations) == len(target_sizes)
+
+        if len(keys) == 0:
+            return False
+
+        for i in range(len(keys)):
+            if (
+                keys[i] is None
+                or target_locations[i] is None
+                or target_sizes[i] is None
+            ):
+                return False
+
+        exist_result = self._batch_exist(keys)
+        set_keys = []
+        set_target_locations = []
+        set_target_sizes = []
+        set_indices = []
+        for i in range(len(keys)):
+            if exist_result[i] != 1:
+                set_keys.append(keys[i])
+                set_target_locations.append(target_locations[i])
+                set_target_sizes.append(target_sizes[i])
+                set_indices.append(i)
+        # Only set non-existing keys to storage
+        start_time = time.perf_counter()
+        put_result = self._put_batch_zero_copy_impl(
+            set_keys, set_target_locations, set_target_sizes
+        )
+        end_time = time.perf_counter()
+
+        self.backup_pgs.append(len(keys))
+        self.backup_bandwidth.append(
+            len(keys) / (end_time - start_time) * self.gb_per_page
+        )
+
+        for i in range(len(set_indices)):
+            if put_result[i] == 0:
+                exist_result[set_indices[i]] = 1
+
+        success_count = 0
+        for i in range(len(keys)):
+            if exist_result[i] == 0:
+                break
+            success_count += 1
+        # TODO: return the number of consecutive successful operations from the start.
+        return success_count == len(keys)
+
+    def get(
+        self,
+        key,
+        target_location: Optional[Any] = None,
+        target_sizes: Optional[Any] = None,
+    ) -> bool:
+        assert target_location is not None and target_sizes is not None
+        get_result = self._get_batch_zero_copy_impl(
+            [key], [target_location], [target_sizes]
+        )
+        return get_result[0] >= 0
+
+    def batch_get(
+        self,
+        keys: List[str],
+        target_locations: Optional[Any] = None,
+        target_sizes: Optional[Any] = None,
+    ) -> int:
+        assert len(keys) == len(target_locations) == len(target_sizes)
+        if len(keys) == 0:
+            return 0
+
+        start_time = time.perf_counter()
+        get_result = self._get_batch_zero_copy_impl(
+            keys, target_locations, target_sizes
+        )
+        end_time = time.perf_counter()
+
+        if self.is_mla_backend:
+            key_multiplier = 1
+        else:
+            key_multiplier = 2
+
+        self.prefetch_pgs.append(len(keys))
+        self.prefetch_bandwidth.append(
+            len(keys) / (end_time - start_time) * self.gb_per_page
+        )
+
+        for i in range(len(keys)):
+            if get_result[i] < 0:
+                return i // key_multiplier
+        return len(keys) // key_multiplier
+
+    def exists(self, key) -> bool:
+        exist_result = self._batch_exist([key])
+        return exist_result[0] == 1
+
+    def batch_exists(
+        self, keys, extra_info: Optional[HiCacheStorageExtraInfo] = None
+    ) -> int:
+        # Apply extra_backend_tag prefix if available
+        if self.extra_backend_tag is not None:
+            prefix = self.extra_backend_tag
+            keys = [f"{prefix}_{key}" for key in keys]
+
+        if self.is_mla_backend:
+            query_keys = [f"{key}_{self.mla_suffix}_k" for key in keys]
+            key_multiplier = 1
+        else:
+            query_keys = []
+            if self.storage_config.should_split_heads:
+                for key in keys:
+                    for suffix in self.mha_suffix:
+                        query_keys.append(f"{key}_{suffix}_k")
+                        query_keys.append(f"{key}_{suffix}_v")
+                key_multiplier = 2 * self.split_factor
+            else:
+                for key in keys:
+                    query_keys.append(f"{key}_{self.mha_suffix}_k")
+                    query_keys.append(f"{key}_{self.mha_suffix}_v")
+                key_multiplier = 2
+
+        exist_result = self._batch_exist(query_keys)
+        for i in range(len(query_keys)):
+            if exist_result[i] != 1:
+                return i // key_multiplier
+        return len(query_keys) // key_multiplier
+
+    def close(self):
+        # MooncakeDistributedStore will automatically call the destructor, so
+        # it is unnecessary to close it manually.
+        pass
+
+    def clear(self) -> None:
+        self.store.remove_all()
+
+    def _put_batch_zero_copy_impl(
+        self, key_strs: List[str], buffer_ptrs: List[int], buffer_sizes: List[int]
+    ) -> List[int]:
+        return self.store.batch_put_from(key_strs, buffer_ptrs, buffer_sizes)
+
+    def _get_batch_zero_copy_impl(
+        self, key_strs: List[str], buffer_ptrs: List[int], buffer_sizes: List[int]
+    ) -> List[int]:
+        return self.store.batch_get_into(key_strs, buffer_ptrs, buffer_sizes)
+
+    def _batch_exist(self, key_strs: List[str]) -> List[int]:
+        return self.store.batch_is_exist(key_strs)
+
+    def get_stats(self):
+        storage_metrics = StorageMetrics()
+        storage_metrics.prefetch_pgs.extend(self.prefetch_pgs)
+        storage_metrics.backup_pgs.extend(self.backup_pgs)
+        storage_metrics.prefetch_bandwidth.extend(self.prefetch_bandwidth)
+        storage_metrics.backup_bandwidth.extend(self.backup_bandwidth)
+        self.prefetch_pgs.clear()
+        self.backup_pgs.clear()
+        self.prefetch_bandwidth.clear()
+        self.backup_bandwidth.clear()
+        return storage_metrics
diff --git a/sglang/python/sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py b/sglang/python/sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae4788cb5dd4da3334afcd36bf85dd144d351f1e
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py
@@ -0,0 +1,189 @@
+import logging
+import uuid
+
+import torch
+from mooncake_store import MooncakeStore
+
+from sglang.srt.mem_cache.hicache_storage import HiCacheStorageConfig
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
+
+
+def generate_batch_query_keys(kv_num: int, config: HiCacheStorageConfig):
+    keys = []
+    for _ in range(kv_num):
+        key = "test_" + str(uuid.uuid4())
+        keys.append(key)
+    set_keys = []
+    for key in keys:
+        if config.is_mla_model:
+            set_keys.append(key + "_k")
+        else:
+            set_keys.append(key + f"_{config.tp_rank}_k")
+            set_keys.append(key + f"_{config.tp_rank}_v")
+    get_keys = set_keys
+    exist_keys = keys
+    return set_keys, get_keys, exist_keys
+
+
+def create_mock_host_kv_cache(buffer_size, dtype=torch.float32):
+    """Create a mock HostKVCache-like object for testing."""
+    buffer = torch.randn(buffer_size, dtype=dtype)
+
+    class MockHostKVCache:
+        def __init__(self, buffer):
+            self.kv_buffer = buffer
+            self.layout = "page_first"
+            self.page_size = 1  # Simple page size for testing
+
+        def get_page_buffer_meta(self, indices):
+            """Mock implementation of get_page_buffer_meta."""
+            ptr_list = []
+            element_size_list = []
+            for idx in indices:
+                # Create mock pointers and sizes for each page
+                ptr_list.append(idx * self.page_size * self.kv_buffer.element_size())
+                element_size_list.append(self.page_size * self.kv_buffer.element_size())
+            return ptr_list, element_size_list
+
+    return MockHostKVCache(buffer), buffer
+
+
+def test_single_operation():
+    """Test the set API with a single key-value pair."""
+    print("=" * 100)
+    print("Testing single operation")
+
+    buffer_size = 1024 * 1024 * 16  # 16MB
+    value_elements = 1024
+    store = MooncakeStore()
+    mock_host_kv_cache, buffer = create_mock_host_kv_cache(buffer_size)
+
+    # Register the memory pool host - this is the proper workflow
+    store.register_mem_pool_host(mock_host_kv_cache)
+
+    value_size = value_elements * buffer.element_size()
+
+    key = str(uuid.uuid4())
+    set_slice = buffer[:value_elements]
+    get_slice = buffer[value_elements : 2 * value_elements]
+    set_location = set_slice.data_ptr()
+    get_location = get_slice.data_ptr()
+
+    # Test set operation
+    result = store.set(key, target_location=set_location, target_sizes=value_size)
+    assert result is True, f"❌set operation failed for key: {key}"
+
+    # Test exists operation
+    assert store.exists(key), f"❌key {key} should exist after set operation"
+
+    # Test get operation
+    result = store.get(key, target_location=get_location, target_sizes=value_size)
+    assert result is True, f"❌get operation failed for key: {key}"
+
+    # Compare the data using proper tensor indices
+    assert torch.allclose(
+        set_slice, get_slice, atol=1e-6
+    ), f"❌get operation failed for key: {key}"
+
+    logger.info(f"✅ Single operation passed")
+
+
+def test_batch_operation(config: HiCacheStorageConfig):
+    """Test the batch set/get APIs with multiple key-value pairs."""
+    print("=" * 100)
+    print(f"Testing batch operation with config: {config}")
+
+    buffer_size = 1024 * 1024 * 16  # 16MB
+    value_elements = 256
+    kv_num = 13
+    store = MooncakeStore(config)
+    mock_host_kv_cache, buffer = create_mock_host_kv_cache(buffer_size)
+
+    store.register_mem_pool_host(mock_host_kv_cache)
+
+    value_size = value_elements * buffer.element_size()
+
+    set_keys, get_keys, exist_keys = generate_batch_query_keys(kv_num, config)
+    set_slices = [
+        buffer[i * value_elements : (i + 1) * value_elements]
+        for i in range(len(set_keys))
+    ]
+    set_locations = [set_slice.data_ptr() for set_slice in set_slices]
+    target_sizes = [value_size for _ in range(len(set_keys))]
+
+    # Test batch set operation
+    result = store.batch_set(
+        set_keys, target_locations=set_locations, target_sizes=target_sizes
+    )
+    assert result is True, f"❌batch set operation failed"
+
+    # Test batch exists operation
+    assert store.batch_exists(
+        exist_keys
+    ), f"❌keys should exist after batch set operation"
+
+    # Test batch get operation
+    get_slices = [
+        buffer[
+            (len(set_keys) + i)
+            * value_elements : (len(set_keys) + i + 1)
+            * value_elements
+        ]
+        for i in range(len(get_keys))
+    ]
+    get_locations = [get_slice.data_ptr() for get_slice in get_slices]
+    result = store.batch_get(
+        get_keys, target_locations=get_locations, target_sizes=target_sizes
+    )
+    assert result == kv_num, f"❌batch get operation failed"
+    for i in range(len(get_keys)):
+        assert torch.allclose(
+            set_slices[i], get_slices[i], atol=1e-6
+        ), f"❌batch get operation failed for key: {get_keys[i]}"
+
+    logger.info(f"✅ Batch operation passed")
+
+
+if __name__ == "__main__":
+    test_single_operation()
+    test_batch_operation(
+        HiCacheStorageConfig(
+            is_mla_model=False,
+            tp_rank=0,
+            tp_size=1,
+            model_name=None,
+            is_page_first_layout=True,
+        )
+    )
+    test_batch_operation(
+        HiCacheStorageConfig(
+            is_mla_model=True,
+            tp_rank=0,
+            tp_size=1,
+            model_name=None,
+            is_page_first_layout=True,
+        )
+    )
+    test_batch_operation(
+        HiCacheStorageConfig(
+            is_mla_model=False,
+            tp_rank=1,
+            tp_size=4,
+            model_name=None,
+            is_page_first_layout=True,
+        )
+    )
+    test_batch_operation(
+        HiCacheStorageConfig(
+            is_mla_model=True,
+            tp_rank=3,
+            tp_size=8,
+            model_name=None,
+            is_page_first_layout=True,
+        )
+    )
+    logger.info(f"✅ All tests passed")
diff --git a/sglang/python/sglang/srt/mem_cache/storage/nixl/README.md b/sglang/python/sglang/srt/mem_cache/storage/nixl/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..277f7ec2d5369340d9a2d46dd6adb9b9998d573f
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/storage/nixl/README.md
@@ -0,0 +1,470 @@
+# NIXL Integration for HiCache
+
+This directory contains the **NIXL (NVIDIA Inference Xfer Library)** integration for **HiCache**, enabling high-performance storage across multiple backends.
+
+NIXL provides a unified API for accessing various storage plugins, including but not limited to:
+
+- **Deepseek's 3FS APIs** for high-throughput file operations
+- **GPU Direct Storage (GDS)** for direct data movement between storage and GPU memory, bypassing CPU memory copies
+- **Amazon S3-compatible object storage** for key-value access patterns
+
+Additional backend integrations are planned for future releases.
+
+## NIXL Resources
+
+- **Project Repository**: [NIXL on GitHub](https://github.com/ai-dynamo/nixl)
+- **Documentation**: [NIXL Documentation](https://github.com/ai-dynamo/nixl/tree/main/docs)
+
+## Overview
+
+The NIXL integration consists of two main files:
+
+- **`hicache_nixl.py`** - Main HiCache storage connector using NIXL
+- **`nixl_utils.py`** - Utility classes for backend selection, registration, and file management
+
+## Components
+
+### HiCacheNixl
+The main storage connector that provides:
+- Single and batch tensor set/get operations
+- Automatic backend selection (3FS > POSIX > GDS_MT > GDS > OBJ)
+- High-performance file-based (or) object based storage access using NIXL
+
+### NixlUtils
+Consolidated utility classes:
+- **NixlBackendSelection** - Handles backend selection and creation
+- **NixlBackendConfig** - Handles backend configuration
+- **NixlRegistration** - Manages memory registration for tensors, files and objects
+- **NixlFileManager** - Handles file system operations and NIXL tuple creation
+
+## Using NIXL as the HiCache Storage Backend
+
+### 1. How Backend Plugin Selection Works
+
+The NIXL backend can support **multiple storage plugins** (e.g., POSIX, GDS, GDS_MT, 3FS, object store, etc).
+
+* Each plugin has its own configuration section in the TOML file.
+* A plugin is considered **usable** if:
+
+  * Its required library is available on the system (POSIX, GDS, GDS_MT are natively supported by NIXL).
+  * Its configuration is valid.
+  * It is marked as `active = true` in the configuration file (if applicable).
+* Some plugins (e.g., 3FS, GDS) require additional system libraries or hardware support.
+* NIXL selects the backend based on **internal priority and availability**, if neither a config file nor an in-command-line config string is provided.
+
+If a plugin is configured but its dependencies are missing, it will be skipped.
+
+
+### 2. Setting the Storage Directory (Optional)
+
+For POSIX / GDS / GDS_MT file-based backends, the default storage location is `/tmp/hicache_storage`. However, you can customize where cached data is stored:
+
+```bash
+export SGLANG_HICACHE_NIXL_BACKEND_STORAGE_DIR=/path/to/storage/dir
+```
+
+### 3. How to Provide Configuration for Backends
+
+There are three ways to specify configurations for the backends: default config, file based config, and command-line (JSON string based) config.
+
+#### 1. Using Default Configuration
+
+To enable HiCache with the NIXL backend, start the SGLang server with:
+
+```bash
+python3 -m sglang.launch_server \
+  --model-path <model> \
+  --host <ip> \
+  --port <port> \
+  --page-size 64 \
+  --enable-hierarchical-cache \
+  --hicache-ratio 2 \
+  --hicache-size 64 \
+  --hicache-write-policy write_through \
+  --hicache-storage-backend nixl
+```
+
+By default, NIXL will use its internal backend selection logic to choose an available storage plugin (and use default configs for the selected storage plugin).
+
+
+#### 2. Using a Configuration File (Recommended)
+
+For non-trivial setups with complex configurations, it is recommended to use a **TOML configuration file** to define which backend plugin to use and its configurations, via `--hicache-storage-backend-extra-config`:
+
+Below is an example command (note: detailed configs are defined in the config file):
+
+```bash
+python3 -m sglang.launch_server \
+  --model-path <model> \
+  --host <ip> \
+  --port <port> \
+  --page-size 64 \
+  --enable-hierarchical-cache \
+  --hicache-ratio 2 \
+  --hicache-size 64 \
+  --hicache-write-policy write_through \
+  --hicache-storage-backend nixl \
+  --hicache-storage-backend-extra-config "@config.nixl.toml"
+```
+
+> **Important**
+>
+> * The `@` prefix tells SGLang to load the configuration from a file.
+> * The file can be in **TOML format** (other formats, JSON / YAML, are also supported).
+> * This is the preferred way to configure NIXL storage backends.
+
+The structure of the config file is described in further details in [Configuration File Spec](#Configuration-File-Specification).
+
+
+#### 3. Using Command-line JSON String
+
+For debugging or quick testing, you may pass a **JSON-style string** directly via `--hicache-storage-backend-extra-config`.
+
+This requires explicitly specifying the plugin type via an environment variable, and this method can be applicable to **only a few** plugins (e.g., POSIX, GDS, GDS_MT)
+
+The below example shows how to use command-line string to use the POSIX plugin where URING is enabled for async POSIX storage.
+
+```bash
+export SGLANG_HICACHE_NIXL_BACKEND_PLUGIN_TYPE=POSIX
+
+python3 -m sglang.launch_server \
+  --model-path <model> \
+  --host <ip> \
+  --port <port> \
+  --page-size 64 \
+  --enable-hierarchical-cache \
+  --hicache-ratio 2 \
+  --hicache-size 64 \
+  --hicache-write-policy write_through \
+  --hicache-storage-backend nixl \
+  --hicache-storage-backend-extra-config "{'use_uring': 'true'}"
+```
+
+⚠️ **Note**:
+This method is convenient for testing / experimenting. For production or multi-plugin setups, it is always recommended to use the config file based approach.
+
+
+## Running Unit Tests
+
+### Prerequisites
+- NIXL library installed and available (latest main required for supporting object query)
+- PyTorch installed
+- Python 3.8+
+
+### Unit tests from current directory
+From the current directory run:
+
+#### Run all NIXL tests:
+```bash
+PYTHONPATH=. python -m pytest test_hicache_nixl_storage.py -o asyncio_mode=strict
+```
+
+#### Run with verbose output:
+```bash
+PYTHONPATH=. python -m pytest test_hicache_nixl_storage.py -v -o asyncio_mode=strict
+```
+
+Note: The `-v` flag provides more detailed output, showing each test case name and its result.
+
+#### Run a specific test:
+```bash
+PYTHONPATH=. python -m pytest test_hicache_nixl_storage.py -v -k test_single_set_get -o asyncio_mode=strict
+```
+
+Note: The `-o asyncio_mode=strict` flag is added to suppress warnings about asyncio configuration. This is not required for test functionality but provides cleaner output.
+
+## Test Coverage
+
+Tests for this integration, a test suite can be found at `test_hicache_nixl_storage.py` which covers:
+
+### HiCache Integration Tests (4 tests)
+- Single tensor set/get operations
+- Batch tensor set/get operations
+- Mixed single and batch operations
+- Data integrity for various tensor types
+
+### File Management Tests (5 tests)
+- Basic file operations
+- NIXL tuple creation
+- Error handling in file operations
+
+### Registration Tests (2 tests)
+- Tensor registration with memory type detection
+- File registration using NIXL tuples
+
+## Expected Output
+
+When tests run successfully, you should see:
+- NIXL agent initialization messages
+- Backend selection messages (e.g., "Backend POSIX was instantiated")
+- Test results with "ok" for passed tests
+- Summary showing "Ran X tests in Y seconds" and "OK"
+
+## Troubleshooting
+
+### Import Errors
+If you encounter `ModuleNotFoundError`, ensure:
+- You're running from the correct directory
+- `PYTHONPATH` is set correctly
+- NIXL library is properly installed
+
+### NIXL Errors
+If NIXL operations fail:
+- Check that NIXL is properly installed
+- Verify that required plugins are available
+- Ensure file permissions are correct for test directories
+
+## File Structure
+
+```
+python/sglang/srt/mem_cache/nixl/
+├── hicache_nixl.py          # Main HiCache storage connector
+├── nixl_utils.py            # All NIXL utility classes
+├── README.md                # This file
+└── tests/
+    └── test_nixl_unified.py # All tests in one file
+```
+
+## Dependencies
+
+- **NIXL**: NVIDIA Inference Xfer Library (version 0.4 or later)
+  - Required plugins: POSIX (minimum), 3FS/GDS (optional for better performance)
+  - See [NIXL Installation Guide](https://github.com/ai-dynamo/nixl/blob/main/README.md#installation)
+- **PyTorch**: For tensor operations (version 1.8 or later)
+- **Python 3.8+**: For type hints and modern features
+
+## Supported Features
+
+### Memory Types
+- **Tensor side**: multi-dimensional tensors of all numeric types (int32, int64, float32, float64) are supported.
+  - Tensors can be on CPU or GPU (as long as a GPU capable backend such as GDS_MT is available).
+  - Currently each tensor is mapped to a file or key, but it can be extended to support multiple keys per file or key.
+
+- **Storage side**: file and object are supported through their relevant backends (e.g., 3FS or OBJ).
+
+### Backend Priority
+
+The NIXL backend selection follows this priority order:
+1. **3FS** - Highest performance (if available)
+    - Best for high-throughput file operations using Deepseek 3FS APIs
+2. **POSIX** - Standard file I/O (fallback)
+    - Universal compatibility
+    - Good for development and testing - Leverages both libaio/liburing
+3. **GDS_MT** - Multi-threaded GDS (if available)
+    - Optimized for concurrent operations
+    - Supports GPU Direct storage with multiple light weight threads
+4. **GDS** - GPU Direct Storage (if available)
+    - Direct GPU-storage data path
+    - Best for filesystems benefiting from batch operations and smaller IOs.
+5. **OBJ** - Amazon S3 based Object Storage
+    - Key-value based storage
+The system automatically selects the best available backend, with POSIX as the default fallback.
+
+
+
+## Configuration File Specification
+
+This section defines the structure, supported sections, configuration keys, data types, defaults, and semantics for the NIXL HiCache backend configuration file (`config.nixl.toml`).
+
+The configuration file is written in **TOML** and consists of multiple **plugin-specific sections** under the `plugin.*` namespace. Each section configures one storage backend plugin. Only one plugin should be enabled via setting `active = true` in the corresponding plugin-specific section.
+
+An example of the configuration is provided in [`nixl.config.toml.sample`](./nixl.config.toml.sample).
+
+### 1. General Structure
+
+```toml
+[plugin.<backend_name>]
+<key> = <value>
+```
+
+* `<backend_name>` identifies the storage backend plugin.
+* Each plugin is configured independently.
+* Plugins are selected at runtime based on:
+
+  * Availability of required libraries/hardware
+  * Plugin configuration validity
+  * Internal backend priority rules
+* Unless otherwise stated, all configuration keys are **optional** and have sensible defaults.
+
+
+### 2. POSIX File System Backend (`plugin.posix`)
+
+#### Section
+
+```toml
+[plugin.posix]
+```
+
+#### Description
+
+Configures the POSIX file-system-based backend.
+This backend supports multiple asynchronous I/O mechanisms and automatically selects the most performant option supported by the system.
+
+**Backend priority (highest to lowest):**
+
+1. Linux AIO
+2. `io_uring`
+3. POSIX AIO
+
+
+#### Configuration Keys
+
+| Key             | Type    | Default   | Description                                                                                              |
+| --------------- | ------- | --------- | -------------------------------------------------------------------------------------------------------- |
+| `use_uring`     | string  | `"false"` | Enables Linux `io_uring` for asynchronous I/O when set to `"true"`. Recommended on modern Linux kernels. |
+| `use_posix_aio` | string  | `"false"` | Enables POSIX AIO as an alternative async I/O mechanism.                                                 |
+| `use_aio`       | string  | `"false"` | Enables generic Linux AIO.                                                                               |
+| `active`        | boolean |    N/A    | Controls whether this plugin is eligible for backend selection.                                          |
+
+**Notes**
+
+* Boolean-like options use **string values** (`"true"` / `"false"`) for compatibility.
+* **Only one backend** (i.e., only one of `use_uring`, `use_aio`, `use_posix_aio`) should be included in the config.
+
+
+### 3. NVIDIA GPUDirect Storage Backend (`plugin.gds`)
+
+#### Section
+
+```toml
+[plugin.gds]
+```
+
+#### Description
+
+Configures NVIDIA GPUDirect Storage (GDS) backend.
+This backend enables direct data transfers between storage and GPU memory.
+
+**Requirements**
+
+* NVIDIA GPU with GDS support
+* Compatible NVIDIA driver and CUDA runtime
+* Supported filesystem
+
+
+#### Configuration Keys
+
+| Key                | Type    | Default            | Description                                            |
+| ------------------ | ------- | ------------------ | ------------------------------------------------------ |
+| `batch_pool_size`  | integer | `128`              | Number of I/O requests maintained in the request pool. |
+| `batch_limit`      | integer | `128`              | Maximum number of requests issued in a single batch.   |
+| `max_request_size` | integer | `16777216` (16 MB) | Maximum size (in bytes) of a single I/O request.       |
+| `active`           | boolean |         N/A        | Controls whether this plugin is eligible for backend selection.|
+
+
+### 4. Multi-Threaded GDS Backend (`plugin.gds_mt`)
+
+#### Section
+
+```toml
+[plugin.gds_mt]
+```
+
+#### Description
+
+Configures the multi-threaded variant of the NVIDIA GDS backend, allowing parallel request processing using multiple CPU threads.
+
+
+
+#### Configuration Keys
+
+| Key            | Type    | Default | Description                                           |
+| -------------- | ------- | ------- | ----------------------------------------------------- |
+| `thread_count` | integer | `4`     | Number of worker threads used to submit GDS requests. |
+| `active`       | boolean |    N/A  | Controls whether this plugin is eligible for backend selection. |
+
+
+### 5. 3FS Backend (`plugin.3fs`)
+
+#### Section
+
+```toml
+[plugin.3fs]
+```
+
+#### Description
+
+Configures the 3FS (third-party filesystem) backend.
+
+**Requirements**
+
+* 3FS client library installed
+* Filesystem mounted and accessible on the host
+
+
+#### Configuration Keys
+
+| Key           | Type    | Default  | Description                        |
+| ------------- | ------- | -------- | ---------------------------------- |
+| `mount_point` | string  | *none*   | Mount point of the 3FS filesystem. |
+| `mem_config`  | string  | `"dram"` | Memory configuration mode.         |
+| `iopool_size` | integer | `64`     | Size of the I/O pool.              |
+| `active`      | boolean |   N/A    | Controls whether this plugin is eligible for backend selection.                                          |
+
+##### `mem_config` Valid Values
+
+| Value     | Description                                         |
+| --------- | --------------------------------------------------- |
+| `dram`    | Use DRAM for buffering                              |
+| `dram_zc` | Use DRAM with zero-copy support                     |
+| `auto`    | Automatically select based on platform capabilities |
+
+##### `iopool_size` Constraints
+
+* Valid range: **[2⁶, 2²⁰]**
+* Values outside this range may cause initialization failure.
+
+
+### 6. Object Storage Backend (`plugin.obj`)
+
+#### Section
+
+```toml
+[plugin.obj]
+```
+
+#### Description
+
+Configures an object storage backend compatible with S3 APIs (e.g., AWS S3, MinIO, Ceph).
+
+
+#### Configuration Keys
+
+| Key                      | Type    | Default      | Description                                    |
+| ------------------------ | ------- | ------------ | ---------------------------------------------- |
+| `num_threads`            | integer | `4`          | Number of client worker threads.               |
+| `endpoint_override`      | string  | `""`         | Custom endpoint URL (for non-AWS S3 services). |
+| `scheme`                 | string  | `"http"`     | Connection scheme (`http` or `https`).         |
+| `region`                 | string  | `""`         | Cloud region (if applicable).                  |
+| `req_checksum`           | string  | `"required"` | Request checksum behavior.                     |
+| `ca_bundle`              | string  | `""`         | Path to a custom CA bundle.                    |
+| `access_key`             | string  | `""`         | Access key credential.                         |
+| `secrete_key`            | string  | `""`         | Secret key credential.                         |
+| `session_token`          | string  | `""`         | Session token (optional).                      |
+| `use_virtual_addressing` | string  | `"true"`     | Enables virtual-hosted-style addressing.       |
+| `bucket`                 | string  | `""`         | Default bucket name.                           |
+| `active`                 | boolean |      N/A     | Controls whether this plugin is eligible for backend selection.                                          |
+
+##### `req_checksum` Valid Values
+
+| Value       | Description                                    |
+| ----------- | ---------------------------------------------- |
+| `required`  | Always include a checksum                      |
+| `supported` | Include checksum when supported by the backend |
+
+
+### 7. Notes and Best Practices
+
+* All plugin sections are optional.
+* Multiple plugins may be configured in a single file. However, it is recommended that **only one plugin** is configured `active = true`.
+* Plugins whose dependencies are unavailable will be skipped.
+* Use a TOML configuration file instead of inline JSON for:
+
+  * Multi-plugin setups
+  * Production deployments
+  * Clear validation and maintainability
+
+
+## Note
+
+This is v0 of the NIXL connector. Future versions will focus on further performance optimizations such as memory pre-registration (pre-allocating and registering memory buffers to reduce registration overhead during transfers) and block merging (combining related blocks as offsets within the same file to reduce file operations and improve throughput). These optimizations require changes at a higher layer, as the current HiCache API doesn't expose information like block relationships or hash patterns that would enable these optimizations.
diff --git a/sglang/python/sglang/srt/mem_cache/storage/nixl/hicache_nixl.py b/sglang/python/sglang/srt/mem_cache/storage/nixl/hicache_nixl.py
new file mode 100644
index 0000000000000000000000000000000000000000..78addf514e00b38eef0ee63ca5c42fc16ea012d2
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/storage/nixl/hicache_nixl.py
@@ -0,0 +1,622 @@
+import logging
+import time
+import uuid
+from typing import Any, List, Optional, Union
+
+import torch
+
+from sglang.srt.environ import envs
+from sglang.srt.mem_cache.hicache_storage import (
+    HiCacheStorage,
+    HiCacheStorageConfig,
+    HiCacheStorageExtraInfo,
+)
+from sglang.srt.mem_cache.memory_pool_host import HostKVCache
+
+from .nixl_utils import (
+    NixlBackendConfig,
+    NixlBackendSelection,
+    NixlFileManager,
+    NixlRegistration,
+)
+
+try:
+    from nixl._api import nixl_agent, nixl_agent_config
+except ImportError as e:
+    raise ImportError(
+        "Please install NIXL by following the instructions at "
+        "https://github.com/ai-dynamo/nixl/blob/main/README.md "
+        "to use HiCacheNixl storage backend."
+    ) from e
+
+logger = logging.getLogger(__name__)
+
+
+class HiCacheNixl(HiCacheStorage):
+    """HiCacheNixl provides high-performance storage using NIXL plugins."""
+
+    def __init__(
+        self,
+        storage_config: HiCacheStorageConfig,
+        file_path: str = "/tmp/hicache_storage",
+    ):
+        """Initialize NIXL storage connector."""
+
+        # create nixlconfig from the --hicache-storage-backend-extra-config
+        nixlconfig = NixlBackendConfig(storage_config.extra_config)
+
+        # select the NIXL backend plugin from extra_config or environment variable
+        plugin = nixlconfig.get_specified_plugin()
+
+        # Might be better to be unified across HiCache backends and moved to HiCacheController
+        file_path = envs.SGLANG_HICACHE_NIXL_BACKEND_STORAGE_DIR.get() or file_path
+        self.file_manager = (
+            NixlFileManager(file_path)
+            if plugin not in NixlBackendSelection.OBJ_PLUGINS
+            else None
+        )
+
+        # Initialize suffix based on storage config
+        tp_rank, tp_size, model_name = (
+            storage_config.tp_rank,
+            storage_config.tp_size,
+            storage_config.model_name,
+        )
+
+        self.is_mla_model = storage_config.is_mla_model
+
+        model_name = "-".join(model_name.split("/")) if model_name else ""
+
+        if self.is_mla_model:
+            self.config_suffix = f"_{model_name}"
+        else:
+            self.config_suffix = f"_{model_name}_{tp_rank}_{tp_size}"
+
+        agent_config = nixl_agent_config(backends=[])
+        self.agent_name = f"hicache_nixl_{str(uuid.uuid4())}"
+        self.agent = nixl_agent(self.agent_name, agent_config)
+
+        self.backend_selector = NixlBackendSelection(plugin, nixlconfig)
+        if not self.backend_selector.create_backend(self.agent):
+            raise RuntimeError("Failed to create NIXL backend")
+
+        self.registration = NixlRegistration(self.agent)
+
+    def _get_suffixed_key(self, key: str) -> str:
+        return key + self.config_suffix
+
+    def register_buffers(
+        self, buffers: Union[torch.Tensor, List[torch.Tensor], List[tuple]]
+    ) -> Optional[Any]:
+        """Register tensor(s) or target locations in host memory (list of addr,len tuples) with NIXL."""
+        if isinstance(buffers[0], tuple):
+            tuples = [(x[0], x[1], 0, "") for x in buffers]
+            return self.registration._register_memory(tuples, "DRAM")
+        else:
+            return self.registration._register_memory(buffers)
+
+    def register_files(
+        self, file_paths: List[str], open_file: Optional[bool] = True
+    ) -> Optional[Any]:
+        """Register files with NIXL."""
+        tuples = self.file_manager.files_to_nixl_tuples(file_paths)
+        return self.registration._register_memory(tuples, "FILE")
+
+    def register_objects(
+        self, keys: List[str], sizes: Optional[List[int]] = None
+    ) -> Optional[Any]:
+        """Register objects with NIXL."""
+        if not keys:
+            return None
+        tuples = [(0, 0, key, "") for key in keys]
+        return self.registration._register_memory(tuples, "OBJ")
+
+    def _execute_transfer(
+        self,
+        buffers: Optional[List[torch.Tensor | tuple]],
+        keys: List[str],
+        direction: str,
+    ) -> bool:
+        if len(buffers) != len(keys):
+            logger.error("Mismatch between number of tensors/buffers and files/objects")
+            return False
+
+        # Registering file and object keys per transfer, to be updated when
+        # pre-registration for file and object is added to HiCache.
+        if self.backend_selector.mem_type == "FILE":
+            tuples = self.file_manager.files_to_nixl_tuples(keys)
+            if not tuples or not self.registration._register_memory(tuples, "FILE"):
+                logger.error("Failed to prepare files for transfer")
+                return False
+        else:  # mem_type == "OBJ"
+            tuples = [(0, 0, key, "") for key in keys]
+            if not tuples or not self.registration._register_memory(tuples, "OBJ"):
+                logger.error("Failed to register objects")
+                return False
+
+        # Prepare transfer descriptors
+        if isinstance(buffers[0], torch.Tensor):
+            tensor_sizes = [
+                tensor.element_size() * tensor.numel() for tensor in buffers
+            ]
+            storage_tuples = [(x[0], s, x[2]) for x, s in zip(tuples, tensor_sizes)]
+            host_descs = self.agent.get_xfer_descs(buffers)
+
+            if direction in ("READ", "WRITE"):
+                # register buffer to avoid calling initialize_xfer twice due to missing registration
+                self.register_buffers(buffers)
+
+        elif isinstance(buffers[0], tuple):
+            storage_tuples = [(x[0], y[1], x[2]) for x, y in zip(tuples, buffers)]
+            host_descs = self.agent.get_xfer_descs(
+                [(x[0], x[1], 0) for x in buffers], "DRAM"
+            )
+
+            if direction in ("READ", "WRITE"):
+                # register buffer to avoid calling initialize_xfer twice due to missing registration
+                self.register_buffers(buffers)
+
+        else:
+            return False
+
+        storage_descs = self.agent.get_xfer_descs(
+            storage_tuples, self.backend_selector.mem_type
+        )
+
+        if (host_descs is None) or (storage_descs is None):
+            logger.error("Failed to get transfer descriptors")
+            return False
+
+        # Initialize transfer, default assumption that tensor was registered
+
+        try:
+            xfer_req = self.agent.initialize_xfer(
+                direction, host_descs, storage_descs, self.agent_name
+            )
+        except Exception:
+            # Check if it was due to missing pre-registration
+            if not self.register_buffers(buffers):
+                logger.error("Failed to register tensors/buffers")
+                return False
+
+            try:
+                xfer_req = self.agent.initialize_xfer(
+                    direction, host_descs, storage_descs, self.agent_name
+                )
+            except Exception as e:
+                logger.error(f"Failed to create transfer request: {e}")
+                return False
+
+        # Execute transfer and wait for its completion
+        try:
+            state = self.agent.transfer(xfer_req)
+            while state != "DONE":
+                state = self.agent.check_xfer_state(xfer_req)
+                if state == "ERR":
+                    self.agent.release_xfer_handle(xfer_req)
+                    logger.error("Transfer failed")
+                    return False
+                time.sleep(0.0001)  # Can be changed to os.sched_yield() or parametrized
+
+            self.agent.release_xfer_handle(xfer_req)
+            return True
+
+        except Exception as e:
+            logger.error(f"Failed to execute transfer: {e}")
+            import traceback
+
+            logger.error(f"Traceback: {traceback.format_exc()}")
+            return False
+
+    def get(
+        self,
+        key: str,
+        target_location: Optional[torch.Tensor | int] = None,
+        target_sizes: Optional[int] = None,
+    ) -> torch.Tensor | None:
+        # To be removed, being compatible with the current API
+        if target_location is None:
+            return None
+        if target_sizes:
+            result = self.batch_get([key], [target_location], [target_sizes])
+        else:
+            result = self.batch_get([key], [target_location])
+        return result[0] if result else None
+
+    def batch_get(
+        self,
+        keys: List[str],
+        target_locations: Optional[List[torch.Tensor | int]] = None,
+        target_sizes: Optional[List[int]] = None,
+    ) -> List[torch.Tensor | None]:
+        if not keys:
+            return []
+
+        # To be removed, being compatible with the current API
+        if not target_locations:
+            return [None] * len(keys)
+
+        if target_sizes and (len(target_sizes) != len(target_locations)):
+            logger.error("Mismatch between number of target_locations and target_sizes")
+            return [None] * len(keys)
+
+        if target_sizes:
+            dest = list(zip(target_locations, target_sizes))
+        else:
+            dest = target_locations
+
+        # Add suffix to keys
+        suffixed_keys = [self._get_suffixed_key(key) for key in keys]
+
+        if self.backend_selector.mem_type == "FILE":
+            file_paths = [self.file_manager.get_file_path(key) for key in suffixed_keys]
+            success = self._execute_transfer(dest, file_paths, "READ")
+        else:
+            success = self._execute_transfer(dest, suffixed_keys, "READ")
+        return target_locations if success and not target_sizes else [None] * len(keys)
+
+    def set(
+        self,
+        key: str,
+        value: Optional[torch.Tensor] = None,
+        target_location: Optional[int] = None,
+        target_sizes: Optional[int] = None,
+    ) -> bool:
+        if target_location and target_sizes:
+            return self.batch_set([key], None, [target_location], [target_sizes])
+        else:
+            return self.batch_set([key], [value])
+
+    def batch_set(
+        self,
+        keys: List[str],
+        values: Optional[List[torch.Tensor]] = None,
+        target_locations: Optional[List[int]] = None,
+        target_sizes: Optional[List[int]] = None,
+    ) -> bool:
+        if not keys or (not values and (not target_locations or not target_sizes)):
+            logger.error("Keys or values were not passed")
+            return False
+
+        if not values:
+            values = list(zip(target_locations, target_sizes))
+
+        # Add suffix to keys
+        suffixed_keys = [self._get_suffixed_key(key) for key in keys]
+
+        if self.backend_selector.mem_type == "FILE":
+            file_paths = []
+            for key in suffixed_keys:
+                file_path = self.file_manager.get_file_path(key)
+                # New file per set, to be updated when partial writes is added to HiCache
+                if not self.file_manager.create_file(file_path):
+                    logger.error(f"Failed to create file {file_path}")
+                    return False
+                file_paths.append(file_path)
+            return self._execute_transfer(values, file_paths, "WRITE")
+        else:  # mem_type == "OBJ"
+            return self._execute_transfer(values, suffixed_keys, "WRITE")
+
+    ############################################################################
+    # batch_*_v1 functions
+    # zero copy + non-zero-copy version for get, set, exists, batch_exists
+    ############################################################################
+
+    def clear(self) -> None:
+        self.file_manager.clear()
+
+    def register_mem_pool_host(self, mem_pool_host: HostKVCache):
+        super().register_mem_pool_host(mem_pool_host)
+
+        # enable zero-copy automatically if mem layout is page_first or page_first_direct
+        self.is_zero_copy = self.mem_pool_host.layout in [
+            "page_first",
+            "page_first_direct",
+        ]
+
+        logger.info(
+            f"HiCacheNixl: Registered mem_pool_host with layout {self.mem_pool_host.layout}, zero_copy set to {self.is_zero_copy}"
+        )
+
+    def exists(self, key: str) -> bool:
+        results = self.batch_exists([key])
+        return results > 0
+
+    def batch_exists(
+        self,
+        keys: List[str],
+        extra_info: Optional[HiCacheStorageExtraInfo] = None,
+    ) -> int:
+        # Add suffix to key
+
+        if self.is_zero_copy:
+            key_list = self._get_key_list_from_meta(keys)
+            key_denominator = (
+                1 if not self.is_mla_model else 2
+            )  # MLA model only has k buffer, no separate v buffer
+        else:
+            key_list = [self._get_suffixed_key(key) for key in keys]
+            key_denominator = 1
+
+        # obtain list of tuples by calling self.registration.create_query_tuples()
+        tuples = []
+        for key in key_list:
+            tuples += self.registration.create_query_tuples(
+                key,
+                self.backend_selector.mem_type,
+                self.file_manager if self.backend_selector.mem_type == "FILE" else None,
+            )
+
+        query_res = self.agent.query_memory(
+            tuples,
+            self.backend_selector.backend_name,
+            mem_type=self.backend_selector.mem_type,
+        )
+
+        for i in range(len(query_res)):
+            if query_res[i] is None:
+                return i // key_denominator
+        return len(query_res) // key_denominator
+
+    def _get_key_list_from_meta(self, keys: List[str]) -> List[str]:
+        # construct the key list for NIXL transfer based on the keys and the suffix, for each key, we will have one suffixed key for k buffer and one suffixed key for v buffer if it's not an MLA model, and only one suffixed key for k buffer if it's an MLA model, since MLA model only has k/v interleaved buffer
+        key_list = []
+
+        for key_ in keys:
+            suffixed_key = self._get_suffixed_key(key_)
+            if self.is_mla_model:
+                key_list.append(f"{suffixed_key}_k")
+            else:
+                key_list.append(f"{suffixed_key}_k")
+                key_list.append(f"{suffixed_key}_v")
+
+        return key_list
+
+    def _get_location_and_size_list_from_meta(
+        self, keys: List[str], host_indices: torch.Tensor
+    ):
+        # zero copy: mem_pool_host.get_data_page() does not work due to non-contiguous tensors, causing issues for NIXL transfer
+        ptr_list, element_size_list = self.mem_pool_host.get_page_buffer_meta(
+            host_indices
+        )
+        key_list = self._get_key_list_from_meta(keys)
+
+        if len(key_list) != len(ptr_list):
+            logger.error(
+                f"HiCacheNixl: mismatch between number of keys and number of buffer meta entries, keys: {len(keys)}, key_list: {len(key_list)}, buffer meta entries: {len(ptr_list)}"
+            )
+            return [], [], [], []
+
+        return key_list, [], ptr_list, element_size_list
+
+    def _batch_get_preprocess(self, keys: List[str], host_indices: torch.Tensor):
+        page_num = len(host_indices) // self.mem_pool_host.page_size
+
+        if len(keys) == 0 or len(keys) != page_num:
+            logger.warning(
+                f"HiCacheNixl: empty keys or mismatch in keys and host_indices lengths. keys: {len(keys)}, host_indices: {len(host_indices)}, page_size: {self.mem_pool_host.page_size}"
+            )
+            return [], [], [], []
+
+        if self.is_zero_copy:
+            key_list, _, ptr_list, element_size_list = (
+                self._get_location_and_size_list_from_meta(keys, host_indices)
+            )
+            return key_list, [], ptr_list, element_size_list
+        else:
+            # non zero copy: create contiguous, temporary tensors
+            target_tensors = [
+                self.mem_pool_host.get_dummy_flat_data_page() for i in range(page_num)
+            ]
+
+            key_list = [self._get_suffixed_key(key) for key in keys]
+            ptr_list = [tensor.data_ptr() for tensor in target_tensors]
+            element_size_list = [
+                tensor.numel() * tensor.element_size() for tensor in target_tensors
+            ]
+
+            return key_list, target_tensors, ptr_list, element_size_list
+
+    def _batch_get_zero_copy_impl(
+        self,
+        keys: List[str],
+        key_strs: List[str],
+        target_tensors: List[torch.Tensor],
+        target_locations: List[int],
+        target_sizes: List[int],
+    ) -> List[int]:
+
+        if not key_strs or not target_locations or not target_sizes:
+            return [False] * len(keys)
+
+        if (len(key_strs) != len(target_locations)) or (
+            len(target_sizes) != len(target_locations)
+        ):
+            logger.error(
+                "Mismatch between number of key_strs, target_locations and target_sizes"
+            )
+            return [False] * len(keys)
+
+        if self.is_zero_copy:
+            dest = list(zip(target_locations, target_sizes))
+        else:
+            dest = target_tensors
+
+        if self.backend_selector.mem_type == "FILE":
+            file_paths = [self.file_manager.get_file_path(key) for key in key_strs]
+            success = self._execute_transfer(dest, file_paths, "READ")
+        else:
+            success = self._execute_transfer(dest, key_strs, "READ")
+
+        return [True] * len(key_strs) if success else [False] * len(key_strs)
+
+    def _batch_get_postprocess(
+        self,
+        host_indices: torch.Tensor,
+        target_tensors: List[torch.Tensor],
+        results: List[bool],
+    ) -> List[bool]:
+
+        page_num = len(host_indices) // self.mem_pool_host.page_size
+
+        if self.is_zero_copy:
+            # zero copy: update final results based on the boolean results from NIXL transfer
+            if self.is_mla_model:
+                return results
+            else:
+                results = [
+                    (results[2 * i] and results[2 * i + 1]) for i in range(page_num)
+                ]
+                return results
+        else:
+            # non zero copy: copy data from temporary tensors to mem_pool_host page by page
+            for i in range(page_num):
+                if not results[i]:
+                    break
+                self.mem_pool_host.set_from_flat_data_page(
+                    host_indices[i * self.mem_pool_host.page_size], target_tensors[i]
+                )
+
+            return results
+
+    def batch_get_v1(
+        self,
+        keys: List[str],
+        host_indices: torch.Tensor,
+        extra_info: Optional[HiCacheStorageExtraInfo] = None,
+    ) -> List[bool]:
+
+        key_strs, target_tensors, buffer_ptrs, buffer_sizes = (
+            self._batch_get_preprocess(keys, host_indices)
+        )
+
+        if not key_strs or not buffer_ptrs or not buffer_sizes:
+            logger.error(
+                "HiCacheNixl batch_get_v1: preprocessing failed, empty key_strs, buffer_ptrs or buffer_sizes"
+            )
+            return [False] * len(keys)
+
+        start_time = time.perf_counter()
+
+        results_get = self._batch_get_zero_copy_impl(
+            keys, key_strs, target_tensors, buffer_ptrs, buffer_sizes
+        )
+
+        end_time = time.perf_counter()
+        elapsed_time_ms = (end_time - start_time) * 1000
+        total_bytes = sum(s for s in buffer_sizes if s is not None)
+
+        logger.debug(
+            f"HiCacheNixl batch_get_v1 transferred: {len(keys)} keys (pages), {host_indices.numel()} host_indices, {total_bytes} bytes, total time: {elapsed_time_ms:.3f} ms, effective bandwidth: {total_bytes / (elapsed_time_ms / 1000) / (1024 * 1024):.2f} MB/s"
+        )
+
+        return self._batch_get_postprocess(host_indices, target_tensors, results_get)
+
+    def _batch_set_preprocess(self, keys: List[str], host_indices: torch.Tensor):
+
+        page_num = len(host_indices) // self.mem_pool_host.page_size
+
+        if len(keys) == 0 or len(keys) != page_num:
+            logger.warning(
+                f"HiCacheNixl: empty keys or mismatch in keys and host_indices lengths. keys: {len(keys)}, host_indices: {len(host_indices)}, page_size: {self.mem_pool_host.page_size}"
+            )
+            return [], [], [], []
+
+        if self.is_zero_copy:
+            key_list, _, ptr_list, element_size_list = (
+                self._get_location_and_size_list_from_meta(keys, host_indices)
+            )
+            return key_list, [], ptr_list, element_size_list
+        else:
+            # non zero copy: NIXL still requires contiguous tensors for transfer
+            target_tensors = [
+                self.mem_pool_host.get_data_page(
+                    host_indices[i * self.mem_pool_host.page_size], flat=False
+                ).contiguous()
+                for i in range(page_num)
+            ]
+
+            key_list = [self._get_suffixed_key(key) for key in keys]
+            ptr_list = [tensor.data_ptr() for tensor in target_tensors]
+            element_size_list = [
+                tensor.numel() * tensor.element_size() for tensor in target_tensors
+            ]
+
+            return key_list, target_tensors, ptr_list, element_size_list
+
+    def _batch_set_zero_copy_impl(
+        self,
+        keys: List[str],
+        key_strs: List[str],
+        target_tensors: List[torch.Tensor],
+        target_locations: List[int],
+        target_sizes: List[int],
+    ) -> List[bool]:
+
+        if not key_strs or not target_locations or not target_sizes:
+            return [False] * len(keys)
+
+        if (len(key_strs) != len(target_locations)) or (
+            len(target_sizes) != len(target_locations)
+        ):
+            logger.error(
+                "Mismatch between number of key_strs, target_locations and target_sizes"
+            )
+            return [False] * len(keys)
+
+        if self.is_zero_copy:
+            src = list(zip(target_locations, target_sizes))
+        else:
+            src = target_tensors
+
+        if self.backend_selector.mem_type == "FILE":
+            file_paths = []
+            for key in key_strs:
+                file_path = self.file_manager.get_file_path(key)
+                # New file per set, to be updated when partial writes is added to HiCache
+                if not self.file_manager.create_file(file_path):
+                    logger.error(
+                        f"******** Failed to create file {file_path} *********"
+                    )
+                    return [False] * len(keys)
+                file_paths.append(file_path)
+            success = self._execute_transfer(src, file_paths, "WRITE")
+        else:  # mem_type == "OBJ"
+            success = self._execute_transfer(src, key_strs, "WRITE")
+
+        return [True] * len(keys) if success else [False] * len(keys)
+
+    def batch_set_v1(
+        self,
+        keys: List[str],
+        host_indices: torch.Tensor,
+        extra_info: Optional[HiCacheStorageExtraInfo] = None,
+    ) -> List[bool]:
+
+        if len(keys) == 0:
+            return []
+
+        key_strs, target_tensors, buffer_ptrs, buffer_sizes = (
+            self._batch_set_preprocess(keys, host_indices)
+        )
+
+        if not key_strs or not buffer_ptrs or not buffer_sizes:
+            logger.error(
+                "HiCacheNixl batch_set_v1: preprocessing failed, empty key_strs, buffer_ptrs or buffer_sizes"
+            )
+            return [False] * len(keys)
+
+        start_time = time.perf_counter()
+
+        results_set = self._batch_set_zero_copy_impl(
+            keys, key_strs, target_tensors, buffer_ptrs, buffer_sizes
+        )
+
+        end_time = time.perf_counter()
+        elapsed_time_ms = (end_time - start_time) * 1000
+        total_bytes = sum(s for s in buffer_sizes if s is not None)
+        logger.debug(
+            f"HiCacheNixl batch_set_v1 transferred: {len(keys)} keys (pages), {host_indices.numel()} host_indices, {total_bytes} bytes, total time: {elapsed_time_ms:.3f} ms, effective bandwidth: {total_bytes / (elapsed_time_ms / 1000) / (1024 * 1024):.2f} MB/s"
+        )
+
+        return results_set
diff --git a/sglang/python/sglang/srt/mem_cache/storage/nixl/nixl.config.toml.sample b/sglang/python/sglang/srt/mem_cache/storage/nixl/nixl.config.toml.sample
new file mode 100644
index 0000000000000000000000000000000000000000..1999882012b5dadb5f1479809b9690e85e383c90
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/storage/nixl/nixl.config.toml.sample
@@ -0,0 +1,117 @@
+################################################################################
+# IMPORTANT
+# 1. to enable a plugin, add "active = true" in the corresponding section
+# 2. the configs inside plugin.posix (i.e., use_aio, use_uring, use_posix_aio)
+#    are mutually exclusive
+################################################################################
+
+
+########################################
+# POSIX FILE SYSTEM BACKEND
+########################################
+[plugin.posix]
+# Configuration for the POSIX file-based backend.
+#
+# The supported backends include:
+# 1. AIO (`use_aio = "true"`)
+# 2. io_uring (`use_uring = "true"`)
+# 3. POSIX AIO (`use_posix_aio = "true"`)
+#
+# If not specified, NIXL will automatically detect and use available backends based on the following default priority: AIO > io_uring > POSIX AIO
+
+# Enable Linux io_uring for async I/O (recommended if supported)
+use_uring = "true"
+
+# Enable POSIX AIO (alternative async I/O mechanism)
+# use_posix_aio = "true"
+
+# Enable generic AIO
+# use_aio = "true"
+
+# Whether this plugin is eligible for selection
+active = true
+
+
+########################################
+# NVIDIA GDS (GPUDirect Storage) BACKEND
+########################################
+[plugin.gds]
+# Configuration for NVIDIA GPUDirect Storage
+# Requires compatible GPU, driver, and filesystem support
+
+# Number of requests per batch, default: 128
+batch_pool_size = 128
+
+# Maximum number of requests issued at once, default: 128
+batch_limit = 128
+
+# Maximum size of a single request (bytes), default: 16MB
+max_request_size = 16777216  # 16 MB
+
+
+########################################
+# MULTI-THREADED GDS BACKEND
+########################################
+[plugin.gds_mt]
+# Multi-threaded variant of the GDS backend
+
+# Number of worker threads, default: 4
+thread_count = 4
+
+
+########################################
+# 3FS (THIRD-PARTY FILE SYSTEM) BACKEND
+########################################
+[plugin.3fs]
+# Configuration for 3FS backend
+# Requires the 3FS library to be installed and mounted
+
+# Mount point of the 3FS filesystem
+mount_point = "/mnt/3fs"
+
+# Memory configuration mode:
+#   dram     - use DRAM
+#   dram_zc  - DRAM with zero-copy
+#   auto     - let backend decide
+mem_config = "dram"
+
+# Size of the I/O pool
+# Valid range: [2^6, 2^20]
+iopool_size = 64
+
+
+########################################
+# OBJECT STORAGE BACKEND (S3 / COMPATIBLE)
+########################################
+[plugin.obj]
+# Object storage backend (e.g., S3-compatible services)
+
+# Number of client worker threads, default: 4
+num_threads = 4
+
+# Override endpoint (useful for non-AWS S3 services)
+endpoint_override = ""
+
+# Connection scheme: http or https, default: http
+scheme = "http"
+
+# Cloud region (if applicable)
+region = ""
+
+# Request checksum behavior:
+#   required, supported
+req_checksum = "required"
+
+# Custom CA bundle path (if needed)
+ca_bundle = ""
+
+# Credentials
+access_key = ""
+secrete_key = ""
+session_token = ""
+
+# Use virtual-hosted-style addressing (true/false), default: true
+use_virtual_addressing = "true"
+
+# Default bucket name
+bucket = ""
diff --git a/sglang/python/sglang/srt/mem_cache/storage/nixl/nixl_utils.py b/sglang/python/sglang/srt/mem_cache/storage/nixl/nixl_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..9742cf3f396e9ac75f21f6bcb53f009653de3894
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/storage/nixl/nixl_utils.py
@@ -0,0 +1,299 @@
+import logging
+import os
+from typing import Any, List, Optional, Tuple, Union
+
+import torch
+
+logger = logging.getLogger(__name__)
+
+
+class NixlBackendConfig:
+    """Handles NIXL backend configurations"""
+
+    def __init__(self, config: Optional[dict[str, str]] = None):
+        """Initialize backend configuration.
+        Args:
+            config: configurations in a dictionary. This config comes from --hicache-storage-backend-extra-config
+
+            config can be in two forms:
+            1. fully qualified form (for all plugins, some of them are enabled, others not):
+                {'plugin': { 'posix': {...}, 'gds': {...}, ...}}
+            2. flat form (for a specific selected plugin), assuming all params apply to a selected plugin
+                {'param1': 'value1', 'param2': 'value2', ...}
+        """
+        self.config = config or {}
+
+    def get_specified_plugin(self) -> str:
+        """decide which plugin to use: either config or SGLANG_HICACHE_NIXL_BACKEND_PLUGIN specifies the plugin, if not, use "auto" """
+
+        if "plugin" in self.config:
+            # fully qualified form: {'plugin': { 'posix': {...}, 'gds': {...}, ...}}
+            # choose the FIRST active plugin
+            for key, item in self.config["plugin"].items():
+                if item.get("active", False) in [True, "true", "True"]:
+                    plugin = key.upper()
+                    break
+        else:
+            # config is empty, or in flat form {'param1': 'value1', 'param2': 'value2', ...}
+            plugin = os.getenv("SGLANG_HICACHE_NIXL_BACKEND_PLUGIN", "auto")
+
+        return plugin
+
+    def get_backend_initparams(self, backend_name) -> dict:
+        """Get initialization parameters from config of NIXL backend for backend creation.
+        Args:
+            backend_name: a specific backend's name (already converted "auto" into a specific backend name)
+
+        """
+
+        initparams = {}
+
+        # config can be in two forms:
+        if "plugin" in self.config:
+            # fully qualified form: {'plugin': { 'posix': {...}, 'gds': {...}, ...}}
+            if backend_name.lower() in self.config["plugin"]:
+                config_data = self.config["plugin"][backend_name.lower()]
+            else:
+                logger.debug(
+                    f"No specific config found for plugin {backend_name} in extra_config. Use default init params."
+                )
+                config_data = {}
+        else:
+            # flat form {'param1': 'value1', 'param2': 'value2', ...}
+            config_data = self.config
+
+        for key, value in config_data.items():
+            initparams[key] = str(value)
+
+        return initparams
+
+
+class NixlBackendSelection:
+    """Handles NIXL backend selection and creation."""
+
+    # Priority order for File-based plugins in case of auto selection
+    FILE_PLUGINS = ["3FS", "POSIX", "GDS_MT", "GDS"]
+    # Priority order for File-based plugins in case of auto selection (add more as needed)
+    OBJ_PLUGINS = ["OBJ"]  # Based on Amazon S3 SDK
+
+    def __init__(
+        self, plugin: str = "auto", nixlconfig: Optional[NixlBackendConfig] = None
+    ):
+        """Initialize backend selection.
+        Args:
+            plugin: Plugin to use (default "auto" selects best available).
+                   Can be a file plugin (3FS, POSIX, GDS, GDS_MT) or
+                   an object plugin (OBJ).
+        """
+        self.plugin = plugin
+        self.backend_name = None
+        self.mem_type = None
+        self.nixlconfig = nixlconfig
+
+    def set_bucket(self, bucket_name: str) -> None:
+        """Set AWS bucket name in environment variable."""
+        os.environ["AWS_DEFAULT_BUCKET"] = bucket_name
+        logger.debug(f"Set AWS bucket name to: {bucket_name}")
+
+    def create_backend(self, agent) -> bool:
+        """Create the appropriate NIXL backend based on configuration."""
+        try:
+            plugin_list = agent.get_plugin_list()
+            logger.debug(f"Available NIXL plugins: {plugin_list}")
+
+            # Handle explicit plugin selection or auto priority
+            if self.plugin == "auto":
+                # Try all file plugins first
+                for plugin in self.FILE_PLUGINS:
+                    if plugin in plugin_list:
+                        self.backend_name = plugin
+                        break
+                # If no file plugin found, try object plugins
+                if not self.backend_name:
+                    for plugin in self.OBJ_PLUGINS:
+                        if plugin in plugin_list:
+                            self.backend_name = plugin
+                            break
+            else:
+                # Use explicitly requested plugin
+                self.backend_name = self.plugin
+
+            if self.backend_name not in plugin_list:
+                logger.error(
+                    f"Backend {self.backend_name} not available in plugins: {plugin_list}"
+                )
+                return False
+
+            # obtain initparams for the backend from the NIXL config
+            initparams = (
+                self.nixlconfig.get_backend_initparams(self.backend_name)
+                if self.nixlconfig
+                else {}
+            )
+
+            # Create backend and set memory type
+            if self.backend_name in self.OBJ_PLUGINS and "bucket" not in initparams:
+                bucket = os.environ.get("AWS_DEFAULT_BUCKET")
+                if not bucket:
+                    logger.error(
+                        "AWS_DEFAULT_BUCKET environment variable must be set for object storage"
+                    )
+                    return False
+
+                initparams["bucket"] = bucket
+
+            # create backend using initialization parameters
+            agent.create_backend(self.backend_name, initparams)
+
+            logger.info(
+                f"NixlBackendSelection.create_backend: backend_name {self.backend_name} initparams {initparams} customParams {agent.get_backend_params(self.backend_name)} supported plugins {plugin_list}"
+            )
+
+            self.mem_type = "OBJ" if self.backend_name in self.OBJ_PLUGINS else "FILE"
+            logger.debug(
+                f"Created NIXL backend: {self.backend_name} with memory type: {self.mem_type}"
+            )
+            return True
+
+        except Exception as e:
+            logger.error(
+                f"Failed to create NIXL backend: {e}, backend_name {self.backend_name}, supported plugins {plugin_list} initparams {initparams}"
+            )
+            return False
+
+
+class NixlRegistration:
+    """Handles NIXL memory registration."""
+
+    def __init__(self, agent):
+        self.agent = agent
+
+    def create_query_tuples(
+        self, key: str, mem_type: str, file_manager=None
+    ) -> List[Tuple]:
+        """Create NIXL tuples for querying memory.
+        Args:
+            key: Key to query (file path for FILE or object key for OBJ)
+            mem_type: Memory type ("FILE" or "OBJ")
+            file_manager: Optional NixlFileManager for FILE memory type
+        Returns:
+            List of NIXL tuples for querying
+        """
+        if mem_type == "FILE":
+            if file_manager is None:
+                logger.error("file_manager required for FILE memory type")
+                return []
+            return [(0, 0, 0, file_manager.get_file_path(key))]
+        else:  # OBJ
+            return [(0, 0, 0, key)]
+
+    def _register_memory(
+        self,
+        items: Union[List[tuple], torch.Tensor, List[torch.Tensor]],
+        mem_type: Optional[str] = None,
+    ) -> Optional[Any]:
+        """Common registration logic for files, objects, and buffers.
+        Args:
+            items: List of tuples or tensors to register
+            mem_type: Memory type ("FILE", "OBJ") or None for tensor or list of tensors
+        """
+        if isinstance(items, list) and not items:
+            return None
+
+        reg_descs = self.agent.get_reg_descs(items, mem_type)
+        if reg_descs is None:
+            logger.error("Failed to create registration descriptors")
+            return None
+
+        try:
+            registered_memory = self.agent.register_memory(reg_descs)
+            return registered_memory  # Could be None in case of error
+        except Exception as e:
+            if not mem_type:
+                logger.error(f"Failed to register Tensors with NIXL: {e}")
+            else:
+                logger.error(
+                    f"Failed to register memory of type {mem_type} with NIXL: {e}"
+                )
+            return None
+
+
+class NixlFileManager:
+    """Handles file system operations for NIXL."""
+
+    def __init__(self, base_dir: str):
+        """
+        Initialize file manager.
+        Args:
+            base_dir: Base directory for storing tensor files
+        """
+        self.base_dir = base_dir
+        if base_dir == "":
+            logger.debug(f"Initialized file manager without a base directory")
+        else:
+            os.makedirs(base_dir, exist_ok=True)
+            logger.debug(f"Initialized file manager with base directory: {base_dir}")
+
+    def clear(self) -> None:
+        """Clear all files in the base directory."""
+        if self.base_dir == "":
+            logger.warning("Base directory is empty, skipping clear operation")
+            return
+
+        try:
+            for root, dirs, files in os.walk(self.base_dir):
+                for file in files:
+                    os.remove(os.path.join(root, file))
+            logger.debug(f"Cleared all files in base directory: {self.base_dir}")
+        except Exception as e:
+            logger.error(
+                f"Failed to clear files in base directory {self.base_dir}: {e}"
+            )
+
+    def get_file_path(self, key: str) -> str:
+        """Get full file path for a given key."""
+        return os.path.join(self.base_dir, key)
+
+    def create_file(self, file_path: str) -> bool:
+        """Create a file if it doesn't exist."""
+        try:
+            os.makedirs(os.path.dirname(file_path), exist_ok=True)
+            if not os.path.exists(file_path):
+                with open(file_path, "wb") as f:
+                    pass  # Create empty file
+            return True
+        except Exception as e:
+            logger.error(f"Failed to create file {file_path}: {e}")
+            return False
+
+    def open_file(self, file_path: str) -> Optional[int]:
+        """Open a file and return its file descriptor."""
+        try:
+            fd = os.open(file_path, os.O_RDWR)
+            return fd
+        except Exception as e:
+            logger.error(f"Failed to open file {file_path}: {e}")
+            return None
+
+    def close_file(self, fd: int) -> bool:
+        """Close a file descriptor."""
+        try:
+            os.close(fd)
+            return True
+        except Exception as e:
+            logger.error(f"Failed to close file descriptor {fd}: {e}")
+            return False
+
+    def files_to_nixl_tuples(
+        self, file_paths: List[str]
+    ) -> List[Tuple[int, int, int, str]]:
+        """Create NIXL tuples (offset, length, fd, file_path) for given files."""
+        tuples = []
+        for path in file_paths:
+            if (fd := self.open_file(path)) is None:
+                # Clean up on failure
+                for t in tuples:
+                    self.close_file(t[2])
+                return []
+            tuples.append((0, 0, fd, path))
+        return tuples
diff --git a/sglang/python/sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py b/sglang/python/sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad1796f0abd96766a23d1f136301e6480755bcb2
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py
@@ -0,0 +1,270 @@
+#!/usr/bin/env python3
+
+import os
+import unittest
+from typing import List
+from unittest.mock import MagicMock
+
+import torch
+
+from sglang.srt.mem_cache.hicache_storage import HiCacheStorageConfig
+from sglang.srt.mem_cache.storage.nixl.hicache_nixl import HiCacheNixl
+from sglang.srt.mem_cache.storage.nixl.nixl_utils import (
+    NixlFileManager,
+    NixlRegistration,
+)
+
+
+class TestNixlUnified(unittest.TestCase):
+    """Unified test suite for all NIXL components."""
+
+    def setUp(self):
+        """Set up test environment."""
+        # Create test directories
+        self.test_dir = "/tmp/test_nixl_unified"
+        os.makedirs(self.test_dir, exist_ok=True)
+
+        # Mock NIXL agent for registration tests
+        self.mock_agent = MagicMock()
+        self.mock_agent.get_reg_descs.return_value = "mock_reg_descs"
+        self.mock_agent.register_memory.return_value = "mock_registered_memory"
+
+        # Create instances
+        self.file_manager = NixlFileManager(self.test_dir)
+        self.registration = NixlRegistration(self.mock_agent)
+
+        # Create storage config for testing
+        self.storage_config = HiCacheStorageConfig(
+            tp_rank=0,
+            tp_size=2,
+            is_mla_model=False,
+            is_page_first_layout=False,
+            model_name="test_model",
+        )
+
+        try:
+            self.hicache = HiCacheNixl(
+                storage_config=self.storage_config,
+                file_path=self.test_dir,
+                plugin="POSIX",
+            )
+        except ImportError:
+            self.skipTest("NIXL not available, skipping NIXL storage tests")
+
+    def tearDown(self):
+        """Clean up test directories."""
+        if os.path.exists(self.test_dir):
+            import shutil
+
+            shutil.rmtree(self.test_dir)
+
+    def delete_test_file(self, file_path: str) -> bool:
+        """Helper method to delete a test file.
+
+        Args:
+            file_path: Path to the file to delete
+
+        Returns:
+            bool: True if file was deleted or didn't exist, False on error
+        """
+        try:
+            if os.path.exists(file_path):
+                os.remove(file_path)
+            return True
+        except Exception as e:
+            return False
+
+    def verify_tensors_equal(self, expected: torch.Tensor, actual: torch.Tensor):
+        """Helper to verify tensor equality."""
+        self.assertIsNotNone(actual, "Retrieved tensor is None")
+        self.assertTrue(
+            torch.allclose(expected, actual, atol=1e-6),
+            f"Tensors not equal:\nExpected: {expected}\nActual: {actual}",
+        )
+
+    def verify_tensor_lists_equal(
+        self, expected: List[torch.Tensor], actual: List[torch.Tensor]
+    ):
+        """Helper to verify lists of tensors are equal."""
+        self.assertEqual(len(expected), len(actual), "Lists have different lengths")
+        for exp, act in zip(expected, actual):
+            self.verify_tensors_equal(exp, act)
+
+    # ============================================================================
+    # HiCache Integration Tests
+    # ============================================================================
+
+    def test_single_set_get(self):
+        """Test single tensor set/get operations."""
+        key = "test_key"
+        value = torch.randn(10, 10, device="cpu")
+        dst_tensor = torch.zeros_like(value, device="cpu")
+
+        # Test set
+        self.assertTrue(self.hicache.set(key, value))
+        self.assertTrue(self.hicache.exists(key))
+
+        # Test get
+        retrieved = self.hicache.get(key, dst_tensor)
+        self.verify_tensors_equal(value, dst_tensor)
+        self.verify_tensors_equal(value, retrieved)
+
+        # Same test in addr,len mode with another key and dst_tensor
+        key2 = "test_key2"
+        dst_tensor2 = torch.zeros_like(value, device="cpu")
+        src_addr, src_len = value.data_ptr(), value.numel() * value.element_size()
+        dst_addr, dst_len = (
+            dst_tensor2.data_ptr(),
+            dst_tensor2.numel() * dst_tensor2.element_size(),
+        )
+
+        # Test set
+        self.assertTrue(self.hicache.set(key, None, src_addr, src_len))
+        self.assertTrue(self.hicache.exists(key))
+
+        # Test get
+        retrieved2 = self.hicache.get(key, dst_addr, dst_len)
+        self.assertTrue(retrieved2 is None)
+        self.verify_tensors_equal(value, dst_tensor2)
+
+    def test_batch_set_get(self):
+        """Test batch tensor set/get operations."""
+        keys = ["key1", "key2", "key3"]
+        values = [
+            torch.randn(5, 5, device="cpu"),
+            torch.randn(3, 3, device="cpu"),
+            torch.randn(7, 7, device="cpu"),
+        ]
+        dst_tensors = [torch.zeros_like(v, device="cpu") for v in values]
+
+        # Test batch set
+        self.assertTrue(self.hicache.batch_set(keys, values))
+        self.assertTrue(all(self.hicache.exists(key) for key in keys))
+
+        # Test batch get
+        retrieved = self.hicache.batch_get(keys, dst_tensors)
+        self.verify_tensor_lists_equal(values, retrieved)
+
+        # Same test in addr,len mode with another key and dst_tensor
+        keys2 = ["key4", "key5", "key6"]
+        dst_tensors2 = [torch.zeros_like(v, device="cpu") for v in values]
+        src_addrs = [v.data_ptr() for v in values]
+        src_lens = [v.numel() * v.element_size() for v in values]
+        dst_addrs = [dt.data_ptr() for dt in dst_tensors2]
+        dst_lens = [dt.numel() * dt.element_size() for dt in dst_tensors2]
+
+        # Test batch set
+        self.assertTrue(self.hicache.batch_set(keys2, None, src_addrs, src_lens))
+        self.assertTrue(all(self.hicache.exists(key) for key in keys2))
+
+        # Test batch get
+        retrieved2 = self.hicache.batch_get(keys, dst_addrs, dst_lens)
+        self.assertTrue(all(ret is None for ret in retrieved2))
+        self.verify_tensor_lists_equal(values, dst_tensors2)
+
+    def test_mixed_operations(self):
+        """Test mixing single and batch operations."""
+        # Test interleaved set/get operations
+        key1, key2 = "key1", "key2"
+        value1 = torch.randn(4, 4, device="cpu")
+        value2 = torch.randn(6, 6, device="cpu")
+        dst1 = torch.zeros_like(value1)
+        dst2 = torch.zeros_like(value2)
+
+        # Single set/get
+        self.assertTrue(self.hicache.set(key1, value1))
+        retrieved1 = self.hicache.get(key1, dst1)
+        self.verify_tensors_equal(value1, retrieved1)
+
+        # Batch set/get
+        self.assertTrue(self.hicache.batch_set([key2], [value2]))
+        retrieved2 = self.hicache.batch_get([key2], [dst2])
+        self.verify_tensors_equal(value2, retrieved2[0])
+
+    def test_data_integrity(self):
+        """Test data integrity across operations."""
+        # Test with various tensor types and sizes
+        test_cases = [
+            ("float32", torch.randn(10, 10, dtype=torch.float32)),
+            ("float64", torch.randn(5, 5, dtype=torch.float64)),
+            ("int32", torch.randint(-100, 100, (8, 8), dtype=torch.int32)),
+            ("int64", torch.randint(-100, 100, (6, 6), dtype=torch.int64)),
+            ("bool", torch.randint(0, 2, (4, 4)).bool()),
+        ]
+
+        for name, tensor in test_cases:
+            with self.subTest(tensor_type=name):
+                key = f"test_{name}"
+                dst_tensor = torch.zeros_like(tensor)
+
+                # Set and immediately get
+                self.assertTrue(self.hicache.set(key, tensor))
+                retrieved1 = self.hicache.get(key, dst_tensor)
+                self.verify_tensors_equal(tensor, retrieved1)
+
+                # Get again to verify persistence
+                dst_tensor.zero_()
+                retrieved2 = self.hicache.get(key, dst_tensor)
+                self.verify_tensors_equal(tensor, retrieved2)
+
+    def test_basic_file_operations(self):
+        """Test basic file operations."""
+        test_file = os.path.join(self.test_dir, "test_file.bin")
+        self.file_manager.create_file(test_file)
+        self.assertTrue(os.path.exists(test_file))
+        self.assertEqual(os.path.getsize(test_file), 0)  # Empty file
+
+        # Test file deletion
+        self.assertTrue(self.delete_test_file(test_file))
+        self.assertFalse(os.path.exists(test_file))
+
+    def test_create_nixl_tuples(self):
+        """Test creation of NIXL tuples."""
+        test_file = os.path.join(self.test_dir, "test_file.bin")
+        self.file_manager.create_file(test_file)
+
+        # Test tuple creation
+        tuples = self.file_manager.files_to_nixl_tuples([test_file])
+        self.assertIsNotNone(tuples)
+        self.assertTrue(len(tuples) > 0)
+
+    def test_error_handling(self):
+        """Test error handling in file operations."""
+        # Test non-existent file
+        self.assertTrue(
+            self.delete_test_file("nonexistent_file.bin")
+        )  # Returns True if file doesn't exist
+
+        # Test invalid file path
+        self.assertFalse(self.file_manager.create_file(""))  # Empty path should fail
+
+    def test_register_buffers(self):
+        """Test registration of memory buffers."""
+        # Create test tensor
+        tensor = torch.randn(10, 10)
+
+        # Test buffer registration
+        self.assertIsNotNone(self.hicache.register_buffers(tensor))
+
+        # Test batch registration
+        tensors = [torch.randn(5, 5) for _ in range(3)]
+        self.assertIsNotNone(self.hicache.register_buffers(tensors))
+
+    def test_register_files_with_tuples(self):
+        """Test registration of files using NIXL tuples."""
+        files = [os.path.join(self.test_dir, f"test_file_{i}.bin") for i in range(3)]
+        for file in files:
+            self.file_manager.create_file(file)
+
+        # Create tuples and register
+        tuples = self.file_manager.files_to_nixl_tuples(files)
+        self.hicache.register_files(tuples)
+
+        # Verify tuples
+        self.assertEqual(len(tuples), len(files))
+        for t, f in zip(tuples, files):
+            self.assertEqual(t[3], f)  # Check file path
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/sglang/python/sglang/srt/mem_cache/swa_memory_pool.py b/sglang/python/sglang/srt/mem_cache/swa_memory_pool.py
new file mode 100644
index 0000000000000000000000000000000000000000..0faf201cbd4832fd1c21a780f10b3dec7cdd242f
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/swa_memory_pool.py
@@ -0,0 +1,461 @@
+import logging
+import weakref
+from typing import Dict, List, Optional, Tuple
+
+import torch
+
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.mem_cache.allocator import (
+    BaseTokenToKVPoolAllocator,
+    PagedTokenToKVPoolAllocator,
+    TokenToKVPoolAllocator,
+)
+from sglang.srt.mem_cache.memory_pool import KVCache, MHATokenToKVPool
+from sglang.srt.mem_cache.utils import maybe_init_custom_mem_pool
+
+logger = logging.getLogger(__name__)
+GB = 1024 * 1024 * 1024
+
+
+class SWAKVPool(KVCache):
+    """KV cache with separate pools for full and SWA attention layers."""
+
+    def __init__(
+        self,
+        size: int,
+        size_swa: int,
+        page_size: int,
+        dtype: torch.dtype,
+        head_num: int,
+        head_dim: int,
+        swa_attention_layer_ids: List[int],
+        full_attention_layer_ids: List[int],
+        enable_kvcache_transpose: bool,
+        device: str,
+        token_to_kv_pool_class: KVCache = MHATokenToKVPool,
+        **kwargs,
+    ):
+        self.size = size
+        self.size_swa = size_swa
+        self.dtype = dtype
+        self.head_num = head_num
+        self.head_dim = head_dim
+        self.device = device
+        self.swa_layer_nums = len(swa_attention_layer_ids)
+        self.full_layer_nums = len(full_attention_layer_ids)
+        self.start_layer = 0
+        self.page_size = page_size
+        self.swa_loc = None
+
+        kwargs["page_size"] = page_size
+        kwargs["enable_memory_saver"] = False
+        kwargs["head_num"] = head_num
+        kwargs["head_dim"] = head_dim
+        kwargs["device"] = device
+        # TODO MHATransposedTokenToKVPool if enable_kvcache_transpose is True
+        assert not enable_kvcache_transpose
+
+        # for disagg with nvlink
+        self.enable_custom_mem_pool, self.custom_mem_pool, _ = (
+            maybe_init_custom_mem_pool(device=self.device)
+        )
+
+        self.swa_kv_pool = token_to_kv_pool_class(
+            size=size_swa,
+            dtype=dtype,
+            layer_num=self.swa_layer_nums,
+            **kwargs,
+        )
+        kwargs.pop("swa_head_num", None)
+        kwargs.pop("swa_head_dim", None)
+        kwargs.pop("swa_v_head_dim", None)
+        self.full_kv_pool = token_to_kv_pool_class(
+            size=size,
+            dtype=dtype,
+            layer_num=self.full_layer_nums,
+            **kwargs,
+        )
+        # {layer_id: (index, is_swa_layer)}
+        self.layers_mapping: Dict[int, Tuple[int, bool]] = {}
+        for full_attn_layer_id, global_layer_id in enumerate(full_attention_layer_ids):
+            self.layers_mapping[global_layer_id] = (full_attn_layer_id, False)
+        for swa_layer_id, global_layer_id in enumerate(swa_attention_layer_ids):
+            self.layers_mapping[global_layer_id] = (swa_layer_id, True)
+        self.full_to_swa_index_mapping: Optional[torch.Tensor] = None
+
+        k_size, v_size = self.get_kv_size_bytes()
+        self.mem_usage = (k_size + v_size) / GB
+        logger.info(
+            f"SWAKVPool mem usage: {self.mem_usage:.2f} GB, swa size: {self.size_swa}, full size: {self.size}"
+        )
+
+    def register_mapping(self, full_to_swa_index_mapping: torch.Tensor):
+        self.full_to_swa_index_mapping = full_to_swa_index_mapping
+
+    def get_kv_size_bytes(self):
+        k_size, v_size = self.full_kv_pool.get_kv_size_bytes()
+        k_size_swa, v_size_swa = self.swa_kv_pool.get_kv_size_bytes()
+        return k_size + k_size_swa, v_size + v_size_swa
+
+    def get_contiguous_buf_infos(self):
+        full_kv_data_ptrs, full_kv_data_lens, full_kv_item_lens = (
+            self.full_kv_pool.get_contiguous_buf_infos()
+        )
+        return (
+            full_kv_data_ptrs,
+            full_kv_data_lens,
+            full_kv_item_lens,
+        )
+
+    def get_state_buf_infos(self):
+        swa_kv_data_ptrs, swa_kv_data_lens, swa_kv_item_lens = (
+            self.swa_kv_pool.get_contiguous_buf_infos()
+        )
+
+        return swa_kv_data_ptrs, swa_kv_data_lens, swa_kv_item_lens
+
+    def get_key_buffer(self, layer_id: int):
+        layer_id_pool, is_swa_layer = self.layers_mapping[layer_id]
+        if is_swa_layer:
+            return self.swa_kv_pool.get_key_buffer(layer_id_pool)
+        else:
+            return self.full_kv_pool.get_key_buffer(layer_id_pool)
+
+    def get_value_buffer(self, layer_id: int):
+        layer_id_pool, is_swa_layer = self.layers_mapping[layer_id]
+        if is_swa_layer:
+            return self.swa_kv_pool.get_value_buffer(layer_id_pool)
+        else:
+            return self.full_kv_pool.get_value_buffer(layer_id_pool)
+
+    def get_kv_buffer(self, layer_id: int):
+        layer_id_pool, is_swa_layer = self.layers_mapping[layer_id]
+        if is_swa_layer:
+            return self.swa_kv_pool.get_kv_buffer(layer_id_pool)
+        else:
+            return self.full_kv_pool.get_kv_buffer(layer_id_pool)
+
+    def set_swa_loc(self, loc: torch.Tensor):
+        self.swa_loc = loc
+
+    def translate_loc_from_full_to_swa(self, kv_indices: torch.Tensor):
+        assert self.full_to_swa_index_mapping is not None
+
+        # Note: kv_indices could have -1 values (from alloc_extend), which will be mapped to -1
+        # since the last item of full_to_swa_index_mapping is -1.
+        return self.full_to_swa_index_mapping[kv_indices].to(torch.int32)
+
+    def set_kv_buffer(
+        self,
+        layer: RadixAttention,
+        loc: torch.Tensor,
+        cache_k: torch.Tensor,
+        cache_v: torch.Tensor,
+        k_scale: float = 1.0,
+        v_scale: float = 1.0,
+    ):
+
+        layer_id = layer.layer_id
+        layer_id_pool, is_swa_layer = self.layers_mapping[layer_id]
+        if is_swa_layer:
+            if self.swa_loc is not None:
+                loc = self.swa_loc
+            else:
+                if self.full_to_swa_index_mapping is not None:
+                    loc = self.translate_loc_from_full_to_swa(loc)
+
+            self.swa_kv_pool.set_kv_buffer(
+                None,
+                loc,
+                cache_k,
+                cache_v,
+                k_scale,
+                v_scale,
+                layer_id_override=layer_id_pool,
+            )
+        else:
+            self.full_kv_pool.set_kv_buffer(
+                None,
+                loc,
+                cache_k,
+                cache_v,
+                k_scale,
+                v_scale,
+                layer_id_override=layer_id_pool,
+            )
+
+    def move_kv_cache(self, tgt_loc: torch.Tensor, src_loc: torch.Tensor):
+        self.full_kv_pool.move_kv_cache(tgt_loc, src_loc)
+        tgt_loc_swa = self.translate_loc_from_full_to_swa(tgt_loc)
+        src_loc_swa = self.translate_loc_from_full_to_swa(src_loc)
+        self.swa_kv_pool.move_kv_cache(tgt_loc_swa, src_loc_swa)
+
+    def get_cpu_copy(self, indices):
+        # For SWA, we need to copy KV cache from both full and SWA pools
+        # The indices are for the full pool, and we use mapping to get SWA indices
+        full_kv_cpu = self.full_kv_pool.get_cpu_copy(indices)
+
+        # Get SWA indices through the mapping
+        # Note: SWA allocation always creates 1:1 mapping, so no need to filter
+        if self.full_to_swa_index_mapping is not None:
+            swa_indices = self.full_to_swa_index_mapping[indices]
+            swa_kv_cpu = self.swa_kv_pool.get_cpu_copy(swa_indices)
+        else:
+            swa_kv_cpu = None
+
+        return {"full": full_kv_cpu, "swa": swa_kv_cpu}
+
+    def load_cpu_copy(self, kv_cache_cpu, indices):
+        # Load KV cache back from CPU to both full and SWA pools
+        # Note: indices here are NEW indices (newly allocated), different from get_cpu_copy indices
+        full_kv_cpu = kv_cache_cpu["full"]
+        swa_kv_cpu = kv_cache_cpu["swa"]
+
+        # Load full KV cache to the new indices
+        self.full_kv_pool.load_cpu_copy(full_kv_cpu, indices)
+
+        # Load SWA KV cache if it exists
+        if swa_kv_cpu is not None and self.full_to_swa_index_mapping is not None:
+            swa_indices = self.full_to_swa_index_mapping[indices]
+            self.swa_kv_pool.load_cpu_copy(swa_kv_cpu, swa_indices)
+
+
+class SWATokenToKVPoolAllocator(BaseTokenToKVPoolAllocator):
+    """Allocator for SWA hybrid KV cache."""
+
+    def __init__(
+        self,
+        size: int,
+        size_swa: int,
+        page_size: int,
+        dtype: torch.dtype,
+        device: str,
+        kvcache: SWAKVPool,
+        need_sort: bool,
+    ):
+        assert isinstance(kvcache, SWAKVPool)
+        self._size_full = size
+        self._size_swa = size_swa
+        self.dtype = dtype
+        self.device = device
+        self.page_size = page_size
+
+        if page_size == 1:
+            self.full_attn_allocator = TokenToKVPoolAllocator(
+                size,
+                dtype,
+                device,
+                kvcache.full_kv_pool,
+                need_sort,
+            )
+            self.swa_attn_allocator = TokenToKVPoolAllocator(
+                size_swa,
+                dtype,
+                device,
+                kvcache.swa_kv_pool,
+                need_sort,
+            )
+        else:
+            self.full_attn_allocator = PagedTokenToKVPoolAllocator(
+                size,
+                page_size,
+                dtype,
+                device,
+                kvcache.full_kv_pool,
+                need_sort,
+            )
+            self.swa_attn_allocator = PagedTokenToKVPoolAllocator(
+                size_swa,
+                page_size,
+                dtype,
+                device,
+                kvcache.swa_kv_pool,
+                need_sort,
+            )
+        # Note: append one more item of value -1 in the end so -1 maps to -1.
+        # It is needed for the last_loc in alloc_extend, where the first full_last_loc
+        # is -1, and we need to map it to swa_last_loc -1 as well.
+        self.full_to_swa_index_mapping = torch.cat(
+            [
+                torch.zeros(
+                    size + self.page_size,
+                    dtype=torch.int64,
+                    device=device,
+                ),
+                torch.tensor([-1], dtype=torch.int64, device=device),
+            ]
+        )
+
+        self.need_sort = need_sort
+        self.free_pages = None
+        self.release_pages = None
+        self.is_not_in_free_group = True
+        self.free_group = []
+
+        self.clear()
+        self._kvcache = kvcache
+        self._kvcache.register_mapping(weakref.proxy(self.full_to_swa_index_mapping))
+
+    def available_size(self):
+        return min(
+            self.full_attn_allocator.available_size(),
+            self.swa_attn_allocator.available_size(),
+        )
+
+    def full_available_size(self):
+        return self.full_attn_allocator.available_size()
+
+    def swa_available_size(self):
+        return self.swa_attn_allocator.available_size()
+
+    @property
+    def size(self):
+        return min(self._size_full, self._size_swa)
+
+    @property
+    def size_swa(self):
+        return self._size_swa
+
+    @property
+    def size_full(self):
+        return self._size_full
+
+    def debug_print(self) -> str:
+        msg = ""
+        msg += f"#swa-available-size: {self.swa_attn_allocator.available_size()}, "
+        msg += (
+            f"#full-attn-available-size: {self.full_attn_allocator.available_size()}, "
+        )
+        return msg
+
+    def get_kvcache(self):
+        return self._kvcache
+
+    def translate_loc_from_full_to_swa(self, kv_indices: torch.Tensor):
+        assert self._kvcache.full_to_swa_index_mapping is not None
+        return self._kvcache.translate_loc_from_full_to_swa(kv_indices)
+
+    def alloc(self, need_size: int):
+        assert self.page_size == 1
+        if need_size > self.full_attn_allocator.available_size():
+            return None
+        if need_size > self.swa_attn_allocator.available_size():
+            return None
+
+        alloc_full_indices = self.full_attn_allocator.alloc(need_size)
+        alloc_swa_indices = self.swa_attn_allocator.alloc(need_size)
+        assert alloc_full_indices is not None
+        assert alloc_swa_indices is not None
+
+        self.full_to_swa_index_mapping[alloc_full_indices] = alloc_swa_indices
+        return alloc_full_indices
+
+    def alloc_extend(
+        self,
+        prefix_lens: torch.Tensor,
+        prefix_lens_cpu: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_cpu: torch.Tensor,
+        last_loc: torch.Tensor,  # last_loc for full layers
+        extend_num_tokens: int,
+    ):
+        assert self.page_size > 1
+        num_tokens = extend_num_tokens + len(seq_lens) * self.page_size
+        if num_tokens > self.full_attn_allocator.available_size():
+            return None
+        if num_tokens > self.swa_attn_allocator.available_size():
+            return None
+
+        swa_last_loc = self.translate_loc_from_full_to_swa(last_loc)
+
+        alloc_full_indices = self.full_attn_allocator.alloc_extend(
+            prefix_lens,
+            prefix_lens_cpu,
+            seq_lens,
+            seq_lens_cpu,
+            last_loc,
+            extend_num_tokens,
+        )
+        alloc_swa_indices = self.swa_attn_allocator.alloc_extend(
+            prefix_lens,
+            prefix_lens_cpu,
+            seq_lens,
+            seq_lens_cpu,
+            swa_last_loc,
+            extend_num_tokens,
+        )
+        assert alloc_full_indices is not None
+        assert alloc_swa_indices is not None
+
+        self.full_to_swa_index_mapping[alloc_full_indices] = alloc_swa_indices
+
+        return alloc_full_indices
+
+    def alloc_decode(
+        self,
+        seq_lens: torch.Tensor,
+        seq_lens_cpu: torch.Tensor,
+        last_loc: torch.Tensor,  # last_loc for full layers
+    ):
+        assert self.page_size > 1
+        swa_last_loc = self.translate_loc_from_full_to_swa(last_loc)
+
+        alloc_full_indices = self.full_attn_allocator.alloc_decode(
+            seq_lens, seq_lens_cpu, last_loc
+        )
+        alloc_swa_indices = self.swa_attn_allocator.alloc_decode(
+            seq_lens, seq_lens_cpu, swa_last_loc
+        )
+
+        if alloc_full_indices is None or alloc_swa_indices is None:
+            return None
+
+        self.full_to_swa_index_mapping[alloc_full_indices] = alloc_swa_indices
+
+        return alloc_full_indices
+
+    def free(self, free_index: torch.Tensor):
+        if free_index.numel() == 0:
+            return
+
+        # NOTE: the API is not idempotent.
+        if self.is_not_in_free_group:
+            self.full_attn_allocator.free(free_index)
+            self.free_swa(free_index)
+        else:
+            self.free_group.append(free_index)
+        assert (
+            self.full_attn_allocator.available_size() <= self.full_attn_allocator.size
+        )
+        assert self.swa_attn_allocator.available_size() <= self.swa_attn_allocator.size
+
+    def free_swa(self, free_index: torch.Tensor):
+        swa_indices = self.full_to_swa_index_mapping[free_index]
+        swa_indices = swa_indices[swa_indices > 0]
+        self.swa_attn_allocator.free(swa_indices)
+        self.full_to_swa_index_mapping[free_index] = 0
+
+    def backup_state(self):
+        return [
+            self.full_attn_allocator.backup_state(),
+            self.swa_attn_allocator.backup_state(),
+        ]
+
+    def restore_state(self, state):
+        assert len(state) == 2
+        self.full_attn_allocator.restore_state(state[0])
+        self.swa_attn_allocator.restore_state(state[1])
+
+    def clear(self):
+        self.swa_attn_allocator.clear()
+        self.full_attn_allocator.clear()
+        # Note: the last item is -1, we don't clear it, see the comment in __init__
+        self.full_to_swa_index_mapping[:-1].fill_(0)
+        self.is_not_in_free_group = True
+        self.free_group = []
+
+    def get_cpu_copy(self, indices):
+        return self._kvcache.get_cpu_copy(indices)
+
+    def load_cpu_copy(self, kv_cache_cpu, indices):
+        return self._kvcache.load_cpu_copy(kv_cache_cpu, indices)
diff --git a/sglang/python/sglang/srt/mem_cache/swa_radix_cache.py b/sglang/python/sglang/srt/mem_cache/swa_radix_cache.py
new file mode 100644
index 0000000000000000000000000000000000000000..50b24a313b4358bc23d7cdb55c07def464b2a6dc
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/swa_radix_cache.py
@@ -0,0 +1,1146 @@
+from __future__ import annotations
+
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""
+The radix tree data structure for managing the hybrid (full and SWA) KV cache.
+"""
+
+import heapq
+import time
+from collections import defaultdict
+from functools import partial
+from typing import TYPE_CHECKING, List, Optional, Tuple
+
+import torch
+from numpy import float64
+
+from sglang.srt.mem_cache.base_prefix_cache import (
+    BasePrefixCache,
+    EvictParams,
+    EvictResult,
+    InsertParams,
+    InsertResult,
+    MatchPrefixParams,
+    MatchResult,
+)
+from sglang.srt.mem_cache.cache_init_params import CacheInitParams
+from sglang.srt.mem_cache.radix_cache import (
+    RadixKey,
+    _key_match_page_size1,
+    _key_match_paged,
+    get_child_key,
+    maybe_bigram_convert,
+    page_align_keys,
+)
+from sglang.srt.mem_cache.swa_memory_pool import SWATokenToKVPoolAllocator
+from sglang.srt.mem_cache.utils import convert_to_bigram_key
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.schedule_batch import Req
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class TreeNode:
+
+    counter = 0
+    swa_uuid_counter = 1
+    last_access_time_counter_float = float64(1.0)
+
+    def __init__(self, id: Optional[int] = None):
+        self.children = defaultdict(TreeNode)
+        self.parent: TreeNode = None
+        self.key: RadixKey = None
+        self.value: Optional[torch.Tensor] = None
+        # swa_tombstone is used to indicate the kv indices have been freed for swa layers
+        self.swa_tombstone = False
+        # invariant: for any node, if swa_lock_ref is locked, full_lock_ref must be locked;
+        # if full_lock_ref is locked, swa_lock_ref doesn't need to be locked. So,
+        # full_lock_ref is always >= swa_lock_ref.
+        self.full_lock_ref = 0
+        self.swa_lock_ref = 0
+        # last access time is only used for sanity check. LRU is maintained by the lru list.
+        self.last_access_time = get_last_access_time()
+
+        self.hit_count = 0
+        # store the host indices of KV cache
+        self.host_value = None
+
+        # for lru list, invariant:
+        # 1. prev has greater last_access_time
+        # 2. next has smaller last_access_time
+        self.prev = None
+        self.next = None
+        self.swa_prev = None
+        self.swa_next = None
+
+        self.id = TreeNode.counter if id is None else id
+        TreeNode.counter += 1
+        self.swa_uuid = None
+
+    @property
+    def evicted(self):
+        return self.value is None
+
+    @property
+    def backuped(self):
+        return self.host_value is not None
+
+    def __lt__(self, other: "TreeNode"):
+        return self.last_access_time < other.last_access_time
+
+
+def gen_swa_uuid() -> int:
+    TreeNode.swa_uuid_counter += 1
+    return TreeNode.swa_uuid_counter
+
+
+def get_last_access_time() -> float64:
+    ret = TreeNode.last_access_time_counter_float
+    TreeNode.last_access_time_counter_float += 1.0
+    return ret
+
+
+class LRUList:
+    def __init__(self, is_swa_list: bool = False):
+        self.is_swa_list = is_swa_list
+        if self.is_swa_list:
+            self.prv = "swa_prev"
+            self.nxt = "swa_next"
+            self.lock_ref = "swa_lock_ref"
+        else:
+            self.prv = "prev"
+            self.nxt = "next"
+            self.lock_ref = "full_lock_ref"
+        # Initialize dummy head and tail nodes
+        self.head = TreeNode()  # Most recently used side
+        self.tail = TreeNode()  # Least recently used side
+        setattr(self.head, self.nxt, self.tail)  # self.head.next = self.tail
+        setattr(self.tail, self.prv, self.head)  # self.tail.prev = self.head
+        self.cache = {}
+
+    def _add_node(self, node):
+        """Helper to add node right after head (most recently used)"""
+        self._add_node_after(self.head, node)
+
+    def _add_node_after(self, old_node, new_node):
+        """Helper to add node right after old_node"""
+        setattr(new_node, self.prv, old_node)  # new_node.prev = old_node
+        setattr(
+            new_node, self.nxt, getattr(old_node, self.nxt)
+        )  # new_node.next = old_node.next
+        setattr(
+            getattr(old_node, self.nxt), self.prv, new_node
+        )  # old_node.next.prev = new_node
+        setattr(old_node, self.nxt, new_node)  # old_node.next = new_node
+
+    def _remove_node(self, node):
+        """Helper to remove node from linked list"""
+        setattr(
+            getattr(node, self.prv), self.nxt, getattr(node, self.nxt)
+        )  # node.prev.next = node.next
+        setattr(
+            getattr(node, self.nxt), self.prv, getattr(node, self.prv)
+        )  # node.next.prev = node.prev
+
+    def _get_lru(self) -> Optional[TreeNode]:
+        """
+        Get the least recently used node
+        """
+        if len(self.cache) == 0:
+            return None
+        return getattr(self.tail, self.prv)
+
+    def reset_node_mru(self, node):
+        """
+        Move a (existing) node to most recently used position
+        """
+        assert node.id in self.cache, f"Resetting node {node.id=} not in lru list"
+        assert (
+            not self.is_swa_list or not node.swa_tombstone
+        ), f"Resetting swa tombstone node in swa lru list: {node.id=}"
+        self._remove_node(node)
+        self._add_node(node)
+
+    def reset_node_and_parents_mru(self, node, root_node):
+        """
+        Move an (existing) node and its parents to most recently used position. Child node is
+        more recently used than parent node.
+        """
+        prev_node = self.head
+        while node != root_node:
+            # for swa lru list, only reset non-tombstone nodes
+            if not self.is_swa_list or not node.swa_tombstone:
+                assert (
+                    node.id in self.cache
+                ), f"Resetting node {node.id=} not in lru list when resetting node and parents mru"
+                self._remove_node(node)
+                self._add_node_after(prev_node, node)
+                prev_node = node
+            node = node.parent
+
+    def insert_mru(self, node):
+        """
+        Insert a (new) node as most recently used
+        """
+        assert (
+            not self.is_swa_list or not node.swa_tombstone
+        ), f"Inserting swa tombstone node in swa lru list: {node.id=}"
+        assert (
+            node.id not in self.cache
+        ), f"Inserting node {node.id=} already in lru list, existing node: {self.cache[node.id].id=}"
+        self.cache[node.id] = node
+        self._add_node(node)
+
+    def remove_node(self, node: TreeNode):
+        """
+        Remove node from lru list
+        """
+        assert node.id in self.cache, f"Removing node {node.id=} not in lru list"
+        assert (
+            not self.is_swa_list or not node.swa_tombstone
+        ), f"Removing swa tombstone node from swa lru list: {node.id=}"
+        del self.cache[node.id]
+        self._remove_node(node)
+
+    def get_lru_no_lock(self) -> Optional[TreeNode]:
+        """
+        Get the least recently used node that is not locked
+        """
+        return self.get_prev_no_lock(self.tail, check_id=False)
+
+    def get_leaf_lru_no_lock(self) -> Optional[TreeNode]:
+        """
+        Get the least recently used leaf node that is not locked
+        """
+        return self.get_prev_leaf_no_lock(self.tail, check_id=False)
+
+    def get_prev_no_lock(
+        self, node: TreeNode, check_id: bool = True
+    ) -> Optional[TreeNode]:
+        """
+        Get the previous (i.e. more recently used) node that is not locked
+        """
+        if check_id:
+            assert (
+                node.id in self.cache
+            ), f"Getting prev of node {node.id=} not in lru list"
+        x = getattr(node, self.prv)  # x = node.prev
+        while getattr(x, self.lock_ref) > 0:
+            x = getattr(x, self.prv)  # x = x.prev
+        # if x is the head, it means there is no node in the lru list without lock
+        if x == self.head:
+            return None
+        return x
+
+    def get_prev_leaf_no_lock(self, node: TreeNode, check_id: bool = True):
+        """
+        Get the previous (i.e. more recently used) leaf node that is not locked
+        """
+        if check_id:
+            assert (
+                node.id in self.cache
+            ), f"Getting prev of node {node.id=} not in lru list"
+        x = getattr(node, self.prv)  # x = node.prev
+        while getattr(x, self.lock_ref) > 0 or len(x.children) > 0:
+            x = getattr(x, self.prv)  # x = x.prev
+        # if x is the head, it means there is no leaf node in the lru list without lock
+        if x == self.head:
+            return None
+        return x
+
+    def in_list(self, node: Optional[TreeNode]):
+        """
+        Check if the node is in the lru list
+        """
+        if not node:
+            return False
+        return node.id in self.cache
+
+    # Note: this is expensive, only use for debug
+    def sanity_check_evictable_size(self):
+        """
+        Check the evictable size (i.e. the size of the nodes that are not locked)
+        """
+        node = self.get_lru_no_lock()
+        evictable_size = 0
+        while self.in_list(node):
+            evictable_size += len(node.value)
+            node = self.get_prev_no_lock(node)
+        return evictable_size
+
+    # Note: this is expensive, only use for debug or idle check
+    def sanity_check(self, tree_cache: "SWARadixCache"):
+        """
+        Check if the lru list is valid by rebuilding the lru list from the tree, heapifying it, and
+        checking if the lru list is valid.
+        """
+        try:
+            if self.is_swa_list:
+                nodes = tree_cache._collect_nontombstone_nodes()
+            else:
+                nodes = tree_cache._collect_all_nodes()
+            total_nodes = len(nodes)
+            total_lru_plus_1 = len(self.cache) + 1
+            # heapify based on last_access_time
+            heapq.heapify(nodes)
+            # the root node is not in the lru list
+            assert (
+                len(nodes) == len(self.cache) + 1
+            ), f"len(nodes): {len(nodes)} != len(self.cache) + 1: {len(self.cache) + 1}"
+
+            x_lru = self._get_lru()
+            while len(nodes):
+                x = heapq.heappop(nodes)
+                if x == tree_cache.root_node:
+                    # root node is not in the lru list
+                    continue
+                assert (
+                    x == x_lru
+                ), f"Incorrect LRU list, {self.is_swa_list=}, x: {x.id=} != x_lru: {x_lru.id=}"
+                assert (
+                    x_lru.full_lock_ref == 0
+                ), f"x_lru should not be locked when idle, {x_lru.full_lock_ref=}, {x_lru.swa_uuid=}, {x_lru.id=}"
+                assert (
+                    x_lru.swa_lock_ref == 0
+                ), f"x_lru should not be locked when idle, {x_lru.swa_lock_ref=}, {x_lru.swa_uuid=}, {x_lru.id=}"
+                x_lru = getattr(x, self.prv)
+
+            if self.is_swa_list:
+                evictable_size = tree_cache.swa_evictable_size()
+                lru_list_evictable_size = self.sanity_check_evictable_size()
+            else:
+                evictable_size = tree_cache.full_evictable_size()
+                lru_list_evictable_size = self.sanity_check_evictable_size()
+
+            assert (
+                evictable_size == lru_list_evictable_size
+            ), f"{self.is_swa_list=}, total nodes: {total_nodes}, total lru plus 1: {total_lru_plus_1}, evictable size: {evictable_size} != lru list evictable size: {lru_list_evictable_size}"
+        except Exception as e:
+            msg = f"SWA Radix tree sanity check failed, ping @hanming-lu: {e}"
+            logger.error(msg)
+            raise Exception(msg)
+
+
+class SWARadixCache(BasePrefixCache):
+    def __init__(self, params: CacheInitParams):
+        assert isinstance(params.token_to_kv_pool_allocator, SWATokenToKVPoolAllocator)
+        self.req_to_token_pool = params.req_to_token_pool
+        self.token_to_kv_pool_allocator = params.token_to_kv_pool_allocator
+        self.page_size = params.page_size
+        self.disable = params.disable
+        self.is_eagle = params.is_eagle
+
+        if self.token_to_kv_pool_allocator:
+            self.device = self.token_to_kv_pool_allocator.device
+        else:
+            self.device = torch.device("cpu")
+
+        if self.page_size == 1:
+            self.key_match_fn = _key_match_page_size1
+            self.get_child_key_fn = get_child_key
+        else:
+            self.key_match_fn = partial(_key_match_paged, page_size=self.page_size)
+            self.get_child_key_fn = partial(get_child_key, page_size=self.page_size)
+
+        if self.is_eagle:
+            self.key_convert_fn = convert_to_bigram_key
+        else:
+            self.key_convert_fn = lambda key: key
+
+        if params.enable_metrics:
+            self.init_metrics_collector()
+
+        self.sliding_window_size = params.sliding_window_size
+        self.reset()
+
+    ##### Public API #####
+
+    def supports_swa(self) -> bool:
+        assert (
+            self.sliding_window_size is not None
+        ), "sliding_window_size must be set for SWARadixCache"
+        return True
+
+    def reset(self) -> None:
+        self.root_node = TreeNode()
+        self.root_node.key = []
+        self.root_node.value = []
+        self.root_node.full_lock_ref = 1
+        self.root_node.swa_lock_ref = 1
+        self.full_evictable_size_ = 0
+        self.swa_evictable_size_ = 0
+        self.full_protected_size_ = 0
+        self.swa_protected_size_ = 0
+        # LRU lists are used to maintain the order of eviction of the nodes in the tree
+        self.full_lru_list = LRUList(is_swa_list=False)
+        self.swa_lru_list = LRUList(is_swa_list=True)
+
+    def match_prefix(self, params: MatchPrefixParams) -> MatchResult:
+        """Find the matching prefix from the radix tree.
+        Args:
+            params: MatchPrefixParams containing key.
+        Returns:
+            A tuple of a tensor of matching prefix token IDs and
+            the last node that contains the prefix values. Note that
+            this API can modify the internal state of the Radix tree.
+            The last node create a new child if the prefix is shorter
+            than the last node's value.
+        """
+
+        key = self._match_pre_processor(params)
+        if key is None:
+            return MatchResult(
+                device_indices=torch.empty(
+                    (0,),
+                    dtype=torch.int64,
+                    device=self.device,
+                ),
+                last_device_node=self.root_node,
+                last_host_node=self.root_node,
+            )
+
+        value, last_node, best_value_len = self._match_prefix_helper(key)
+        return self._match_post_processor(params, value, last_node, best_value_len)
+
+    def insert(self, params: InsertParams) -> InsertResult:
+        if self.disable:
+            return InsertResult(prefix_len=0)
+
+        key = params.key
+        value = params.value
+        prev_prefix_len = params.prev_prefix_len
+        swa_evicted_seqlen = params.swa_evicted_seqlen
+
+        if value is None:
+            value = torch.tensor([x for x in key.token_ids], dtype=torch.int64)
+
+        key, value = maybe_bigram_convert(self.is_eagle, key, value)
+
+        prefix_len = self._insert_helper(
+            self.root_node, key, value, prev_prefix_len, swa_evicted_seqlen
+        )
+        return InsertResult(prefix_len=prefix_len)
+
+    def cache_finished_req(self, req: Req, is_insert: bool = True) -> None:
+        """Cache request when it finishes."""
+        kv_committed_len = req.pop_committed_kv_cache()
+        if self.disable:
+            kv_indices = self.req_to_token_pool.req_to_token[
+                req.req_pool_idx, :kv_committed_len
+            ]
+            self.token_to_kv_pool_allocator.free(kv_indices)
+            return
+
+        token_ids = (req.origin_input_ids + req.output_ids)[:kv_committed_len]
+        kv_indices = self.req_to_token_pool.req_to_token[
+            req.req_pool_idx, :kv_committed_len
+        ]
+
+        # Maybe convert to bigram keys for EAGLE
+        keys = self.key_convert_fn(token_ids)
+        keys = page_align_keys(keys, self.page_size)
+        page_aligned_len = len(keys)
+        values = kv_indices[:page_aligned_len].to(dtype=torch.int64, copy=True)
+        radix_key = RadixKey(
+            keys[:page_aligned_len],
+            req.extra_key,
+            is_bigram=self.is_eagle,
+        )
+        old_prefix_len = req.cache_protected_len
+
+        # Radix Cache takes one ref in memory pool
+        # Note: the insert function already frees the overlapped kv_indices
+        if is_insert:
+            self.insert(
+                InsertParams(
+                    key=radix_key,
+                    value=values,
+                    prev_prefix_len=old_prefix_len,
+                    swa_evicted_seqlen=req.swa_evicted_seqlen,
+                )
+            )
+        else:
+            self.token_to_kv_pool_allocator.free(
+                kv_indices[old_prefix_len:page_aligned_len]
+            )
+
+        # free the unaligned tail
+        self.token_to_kv_pool_allocator.free(kv_indices[page_aligned_len:])
+
+        # Remove req slot release the cache lock
+        self.dec_lock_ref(req.last_node, req.swa_uuid_for_lock)
+
+    def cache_unfinished_req(self, req: Req, chunked=False) -> None:
+        """Cache request when it is unfinished."""
+        if self.disable:
+            kv_indices = self.req_to_token_pool.req_to_token[
+                req.req_pool_idx, : len(req.fill_ids)
+            ]
+
+            # `req.prefix_indices` will be used in `PrefillAdder::add_chunked_req` later
+            req.prefix_indices = kv_indices
+            return
+
+        token_ids = req.fill_ids
+        kv_indices = self.req_to_token_pool.req_to_token[
+            req.req_pool_idx, : len(token_ids)
+        ]
+
+        keys = self.key_convert_fn(token_ids)
+        keys = page_align_keys(keys, self.page_size)
+        values = kv_indices[: len(keys)].to(dtype=torch.int64, copy=True)
+        radix_key = RadixKey(keys, req.extra_key, is_bigram=self.is_eagle)
+        old_prefix_len = req.cache_protected_len
+
+        # Radix Cache takes one ref in memory pool
+        # Note: the insert function already frees the overlapped kv_indices
+        result = self.insert(
+            InsertParams(
+                key=radix_key,
+                value=values,
+                prev_prefix_len=old_prefix_len,
+            )
+        )
+        new_prefix_len = result.prefix_len
+
+        # The prefix indices could be updated, reuse it
+        match_result = self.match_prefix(MatchPrefixParams(key=radix_key))
+        new_indices, new_last_node = (
+            match_result.device_indices,
+            match_result.last_device_node,
+        )
+
+        assert old_prefix_len <= len(new_indices), f"{old_prefix_len=}, {new_indices=}"
+        assert new_prefix_len <= len(new_indices), f"{new_prefix_len=}, {new_indices=}"
+        self.req_to_token_pool.write(
+            (req.req_pool_idx, slice(old_prefix_len, len(new_indices))),
+            new_indices[old_prefix_len:],
+        )
+
+        req.cache_protected_len = len(new_indices)
+
+        self.dec_lock_ref(req.last_node, req.swa_uuid_for_lock)
+        swa_uuid_for_lock = self.inc_lock_ref(new_last_node)
+
+        # `req.prefix_indices` will be used in `PrefillAdder::add_chunked_req` later
+        if len(new_indices) < len(kv_indices):
+            req.prefix_indices = torch.cat(
+                [new_indices, kv_indices[len(new_indices) :]]
+            )
+        else:
+            req.prefix_indices = new_indices
+        req.last_node = new_last_node
+        req.swa_uuid_for_lock = swa_uuid_for_lock
+
+    def pretty_print(self) -> None:
+        self._print_helper(self.root_node, 0)
+        total_size, total_swa_size = self._total_size_helper()
+        print(f"#full_tokens: {total_size}, #swa_tokens: {total_swa_size}")
+
+    def total_size(self) -> Tuple[int, int]:
+        return self._total_size_helper()
+
+    def evict(self, params: EvictParams) -> EvictResult:
+        if self.disable:
+            return EvictResult()
+        start_time = time.perf_counter()
+        full_num_tokens = params.num_tokens
+        swa_num_tokens = params.swa_num_tokens
+        full_num_evicted = 0
+        swa_num_evicted = 0
+        if full_num_tokens > 0:
+            # get the least recently used leaf node that is not locked
+            x = self.full_lru_list.get_leaf_lru_no_lock()
+
+            while full_num_evicted < full_num_tokens and self.full_lru_list.in_list(x):
+                assert (
+                    x != self.root_node
+                ), f"root node should not exist in full lru list, {x.id=}"
+                assert x.full_lock_ref == 0, f"node is in use, {x.id=}"
+
+                # 1. free node kv indices, evict full and swa tokens
+                self.token_to_kv_pool_allocator.free(x.value)
+                full_num_evicted += len(x.value)
+                swa_num_evicted += len(x.value)
+
+                # 2. get the next leaf, update the lru lists
+                x_next = self.full_lru_list.get_prev_leaf_no_lock(x)
+                self.full_lru_list.remove_node(x)
+                self.swa_lru_list.remove_node(x)
+
+                # 3. delete the leaf node
+                self._delete_leaf(x)
+
+                # 4. Iteratively delete tombstone leaves to maintain invariant that leaf nodes are not tombstone
+                x, leaf_full_num_evicted = self._iteratively_delete_tombstone_leaf(x)
+                full_num_evicted += leaf_full_num_evicted
+
+                # 5. if parent has no more children, it is a leaf. It is possible that this node is lru, so
+                # we need to get the first leaf node in the lru list
+                if len(x.parent.children) == 0:
+                    x_next = self.full_lru_list.get_leaf_lru_no_lock()
+
+                x = x_next
+
+        if swa_num_evicted < swa_num_tokens:
+            # get the least recently used node that is not locked, doesn't have to be a leaf
+            x = self.swa_lru_list.get_lru_no_lock()
+
+            # evict lru leaf nodes until swa_num_tokens is reached
+            while swa_num_evicted < swa_num_tokens and (self.swa_lru_list.in_list(x)):
+                assert not x.swa_tombstone, f"duplicate swa tombstone node, {x.id=}"
+                assert x != self.root_node, f"root node is not evictable, {x.id=}"
+                assert x.swa_lock_ref == 0, f"node is in use by swa kv indices, {x.id=}"
+
+                if len(x.children) > 0:
+                    # 1. an internal node, free swa tokens.
+                    self.token_to_kv_pool_allocator.free_swa(x.value)
+                    swa_num_evicted += len(x.value)
+
+                    # 2. get the next node, update the lru lists
+                    x_next = self.swa_lru_list.get_prev_no_lock(x)
+                    self.swa_lru_list.remove_node(x)
+
+                    # 3. tombstone the node
+                    self._tombstone_internal_node(x)
+                else:
+                    assert (
+                        x.full_lock_ref == 0
+                    ), f"leaf node with full lock must also have swa lock, {x.id=}"
+                    # 1. a leaf node, free full and swa tokens
+                    self.token_to_kv_pool_allocator.free(x.value)
+                    full_num_evicted += len(x.value)
+                    swa_num_evicted += len(x.value)
+
+                    # 2. get the next node, update the lru lists
+                    x_next = self.swa_lru_list.get_prev_no_lock(x)
+                    self.full_lru_list.remove_node(x)
+                    self.swa_lru_list.remove_node(x)
+
+                    # 3. delete the leaf node
+                    self._delete_leaf(x)
+
+                    # 4. Iteratively delete tombstone leaves to maintain invariant that leaf nodes are not tombstone
+                    self._iteratively_delete_tombstone_leaf(x)
+
+                x = x_next
+
+        self.update_eviction_metrics(full_num_evicted + swa_num_evicted, start_time)
+        return EvictResult(
+            num_tokens_evicted=full_num_evicted, swa_num_tokens_evicted=swa_num_evicted
+        )
+
+    def inc_lock_ref(self, node: TreeNode) -> Optional[int]:
+        """
+        Increment the lock reference count for the node. Returns the swa_uuid_for_lock, which needs
+        to be passed to dec_lock_ref.
+        It locks the full_lock_ref for nodes between the [last node, root), exclusive.
+        It locks the swa_lock_ref for nodes between the [last node, swa_uuid_for_lock], inclusive.
+        """
+        if self.disable:
+            return None
+
+        swa_lock_size = 0
+        swa_uuid_for_lock = None
+        while node != self.root_node:
+            # lock full from node to root
+            assert (
+                node.full_lock_ref >= 0
+            ), f"inc_lock_ref on node with {node.full_lock_ref=}, {node.id=}"
+            if node.full_lock_ref == 0:
+                self.full_evictable_size_ -= len(node.value)
+                self.full_protected_size_ += len(node.value)
+            node.full_lock_ref += 1
+
+            # lock swa if we have not reached the sliding window size.
+            # When we reach the sliding window size, we will set the swa_uuid_for_lock.
+            # caller needs to pass the swa_uuid_for_lock to dec_lock_ref
+            if swa_lock_size < self.sliding_window_size:
+                assert (
+                    not node.swa_tombstone
+                ), f"inc_lock_swa on swa_tombstone node, {node.id=}"
+                if node.swa_lock_ref == 0:
+                    self.swa_evictable_size_ -= len(node.value)
+                    self.swa_protected_size_ += len(node.value)
+                node.swa_lock_ref += 1
+                swa_lock_size += len(node.value)
+                if swa_lock_size >= self.sliding_window_size:
+                    if node.swa_uuid is None:
+                        node.swa_uuid = gen_swa_uuid()
+                    swa_uuid_for_lock = node.swa_uuid
+            node = node.parent
+        return swa_uuid_for_lock
+
+    def dec_lock_ref(self, node: TreeNode, swa_uuid_for_lock: Optional[int] = None):
+        """
+        Decrement the lock reference count for the node.
+        It unlocks the full_lock_ref for nodes between the [last node, root), exclusive.
+        It unlocks the swa_lock_ref for nodes between the [last node, swa_uuid_for_lock], inclusive.
+        If swa_uuid_for_lock is None, it unlocks to the root, exclusive.
+        """
+        if self.disable:
+            return
+
+        dec_lock_swa = True
+        while node != self.root_node:
+            assert (
+                node.full_lock_ref > 0
+            ), f"dec_lock_ref on node with {node.full_lock_ref=}, {node.id=}"
+            if node.full_lock_ref == 1:
+                self.full_evictable_size_ += len(node.value)
+                self.full_protected_size_ -= len(node.value)
+            node.full_lock_ref -= 1
+
+            if dec_lock_swa:
+                assert (
+                    not node.swa_tombstone
+                ), f"dec_lock_ref on swa_tombstone node, {node.id=}"
+                assert (
+                    node.swa_lock_ref > 0
+                ), f"dec_lock_ref on node with {node.swa_lock_ref=}, {node.id=}"
+
+                if node.swa_lock_ref == 1:
+                    self.swa_evictable_size_ += len(node.value)
+                    self.swa_protected_size_ -= len(node.value)
+                node.swa_lock_ref -= 1
+                if swa_uuid_for_lock and node.swa_uuid == swa_uuid_for_lock:
+                    dec_lock_swa = False
+
+            node = node.parent
+
+    def sanity_check(self):
+        self.full_lru_list.sanity_check(self)
+        self.swa_lru_list.sanity_check(self)
+
+    def evictable_size(self) -> Tuple[int, int]:
+        # Note: use full_evictable_size() and swa_evictable_size() instead.
+        raise NotImplementedError
+
+    def full_evictable_size(self) -> int:
+        return self.full_evictable_size_
+
+    def swa_evictable_size(self) -> int:
+        return self.swa_evictable_size_
+
+    def protected_size(self) -> Tuple[int, int]:
+        # Note: use full_protected_size() and swa_protected_size() instead.
+        raise NotImplementedError
+
+    def full_protected_size(self) -> int:
+        # protected size refers to the size of the full cache that is locked
+        return self.full_protected_size_
+
+    def swa_protected_size(self) -> int:
+        # protected size refers to the size of the swa cache that is locked
+        return self.swa_protected_size_
+
+    def all_values_flatten(self) -> torch.Tensor:
+        values = []
+
+        def _dfs_helper(node: TreeNode):
+            for _, child in node.children.items():
+                values.append(child.value)
+                _dfs_helper(child)
+
+        _dfs_helper(self.root_node)
+        return torch.cat(values)
+
+    def available_and_evictable_str(self) -> str:
+        full_available_size = self.token_to_kv_pool_allocator.full_available_size()
+        swa_available_size = self.token_to_kv_pool_allocator.swa_available_size()
+        full_evictable_size = self.full_evictable_size()
+        swa_evictable_size = self.swa_evictable_size()
+        return (
+            f"Available full tokens: {full_available_size + full_evictable_size} ({full_available_size=} + {full_evictable_size=})\n"
+            f"Available swa tokens: {swa_available_size + swa_evictable_size} ({swa_available_size=} + {swa_evictable_size=})\n"
+            f"Full LRU list evictable size: {self.full_lru_list.sanity_check_evictable_size()}\n"
+            f"SWA LRU list evictable size: {self.swa_lru_list.sanity_check_evictable_size()}\n"
+        )
+
+    ##### Internal Helper Functions #####
+
+    def _match_prefix_helper(
+        self, key: RadixKey
+    ) -> Tuple[List[torch.Tensor], TreeNode, int]:
+        """
+        SWA prefix matching helper. It factors in the sliding window size such that
+        the matched node is guaranteed to either 1. connected to root without swa tombstone,
+        or 2. the number of matching tokens from the matched node to the last swa tombstone
+        node is greater than or equal to the sliding window size.
+        """
+        node = self.root_node
+        child_key = self.get_child_key_fn(key)
+
+        value = []
+        # for path connected to root without tombstone, always match, so set to inf
+        match_len_since_tombstone = float("inf")
+        best_value_len = 0
+        best_last_node = node
+        while len(key) > 0 and child_key in node.children.keys():
+            child = node.children[child_key]
+
+            if child.swa_tombstone:
+                # update best_value_len and best_last_node if needed
+                if match_len_since_tombstone >= self.sliding_window_size:
+                    best_value_len = len(value)
+                    best_last_node = node
+                # reset match_len_since_tombstone if we hit a tombstone node
+                match_len_since_tombstone = 0
+
+            prefix_len = self.key_match_fn(child.key, key)
+            if prefix_len < len(child.key):
+                new_node = self._split_node(child.key, child, prefix_len)
+                value.append(new_node.value)
+                if not new_node.swa_tombstone:
+                    match_len_since_tombstone += len(new_node.value)
+                node = new_node
+                break
+            else:
+                value.append(child.value)
+                if not child.swa_tombstone:
+                    match_len_since_tombstone += len(child.value)
+                node = child
+                key = key[prefix_len:]
+
+                if len(key):
+                    child_key = self.get_child_key_fn(key)
+
+        # handle best_value_len and best_last_node, for the case that last node is fully matched
+        if match_len_since_tombstone >= self.sliding_window_size:
+            best_value_len = len(value)
+            best_last_node = node
+
+        return value, best_last_node, best_value_len
+
+    def _match_pre_processor(self, params: MatchPrefixParams) -> Optional[RadixKey]:
+        """Preprocess the key before matching."""
+        key = params.key
+        key, _ = maybe_bigram_convert(self.is_eagle, key)
+
+        if self.disable or len(key) == 0:
+            return None
+
+        if self.page_size != 1:
+            page_aligned_len = len(key) // self.page_size * self.page_size
+            key = key[:page_aligned_len]
+
+        return key
+
+    def _match_post_processor(
+        self,
+        params: MatchPrefixParams,
+        value: List[torch.Tensor],
+        last_node: TreeNode,
+        best_value_len: int,
+    ) -> MatchResult:
+        """Post-process the matched result."""
+        node_update = last_node
+        # update time for matched nodes, and make nodes closer to root to be least recently used
+        # this allows swa to evict nodes closer to root first
+        self.full_lru_list.reset_node_and_parents_mru(node_update, self.root_node)
+        self.swa_lru_list.reset_node_and_parents_mru(node_update, self.root_node)
+
+        # This last_access_time is for sanity check, can be deleted after validation in production
+        cur_time = get_last_access_time()
+        while node_update:
+            node_update.last_access_time = cur_time
+            cur_time -= (
+                0.00001  # assuming less than 100000 nodes in a branch of the tree
+            )
+            node_update = node_update.parent
+
+        value = value[:best_value_len]
+        if value:
+            value = torch.cat(value)
+        else:
+            value = torch.empty((0,), dtype=torch.int64, device=self.device)
+
+        return MatchResult(
+            device_indices=value,
+            last_device_node=last_node,
+            last_host_node=last_node,
+        )
+
+    def _split_node(self, key: RadixKey, child: TreeNode, split_len: int) -> TreeNode:
+        # new_node -> child
+        new_node = TreeNode()
+        new_node.children = {self.get_child_key_fn(key[split_len:]): child}
+        new_node.parent = child.parent
+        new_node.swa_tombstone = child.swa_tombstone
+        new_node.full_lock_ref = child.full_lock_ref
+        new_node.swa_lock_ref = child.swa_lock_ref
+        new_node.key = child.key[:split_len]
+        assert len(new_node.key) > 0, f"new_node.key should not be empty"
+        new_node.value = child.value[:split_len].clone()
+        # parent inherits the swa_uuid from child for swa lock ref
+        new_node.swa_uuid = child.swa_uuid
+        child.swa_uuid = None
+        # child time should be later than parent's time for swa tombstone
+        child.last_access_time = get_last_access_time()
+
+        # remove the child from the lru lists because it is being split
+        self.full_lru_list.remove_node(child)
+        if not new_node.swa_tombstone:
+            self.swa_lru_list.remove_node(child)
+        child.parent = new_node
+        child.key = child.key[split_len:]
+        assert len(child.key) > 0, f"child.key should not be empty"
+        child.value = child.value[split_len:].clone()
+        new_node.parent.children[self.get_child_key_fn(key)] = new_node
+
+        # insert the new node and child into the lru lists, insert
+        # parent first so that parent is after child in the lru list
+        self.full_lru_list.insert_mru(new_node)
+        self.full_lru_list.insert_mru(child)
+        if not new_node.swa_tombstone:
+            self.swa_lru_list.insert_mru(new_node)
+            self.swa_lru_list.insert_mru(child)
+        return new_node
+
+    def _insert_helper(
+        self,
+        node: TreeNode,
+        key: RadixKey,
+        value,
+        update_kv_after_len: int,
+        swa_evicted_seqlen: int = 0,
+    ) -> int:
+        # Update the last access time from root to leaf, so that
+        # swa will tombstone the node closer to root first
+        node.last_access_time = get_last_access_time()
+        if node != self.root_node:
+            self.full_lru_list.reset_node_mru(node)
+            if not node.swa_tombstone:
+                self.swa_lru_list.reset_node_mru(node)
+        if len(key) == 0:
+            return 0
+
+        child_key = self.get_child_key_fn(key)
+
+        total_prefix_length = 0
+        while len(key) > 0 and child_key in node.children.keys():
+            node = node.children[child_key]
+            node.last_access_time = get_last_access_time()
+            self.full_lru_list.reset_node_mru(node)
+            if not node.swa_tombstone:
+                self.swa_lru_list.reset_node_mru(node)
+            prefix_len = self.key_match_fn(node.key, key)
+
+            if prefix_len < len(node.key):
+                new_node = self._split_node(node.key, node, prefix_len)
+                node = new_node
+
+            # if tombstone after update_kv_after_len, update node.value to be the input value.
+            # This is needed because it is possible that the last sliding window size tokens
+            # contains tombstone. If this is the case and we don't update the kv value, then
+            # the prefill prefix matching will stuck.
+            if update_kv_after_len < total_prefix_length + prefix_len:
+                # For page_size > 1 and chunked prefill case, update_kv_after_len may be not page-aligned due to a trailing partial page
+                # (kept in the request but not inserted into the radix tree) appended to prefix_indices.
+                if node.swa_tombstone:
+                    assert (
+                        node.swa_lock_ref == 0
+                    ), f"tombstone swa_lock_ref should always be 0, {node.full_lock_ref=}, {node.swa_lock_ref=}, {node.id=}"
+                    assert (
+                        swa_evicted_seqlen % self.page_size == 0
+                    ), f"swa_evicted_seqlen must be page aligned, {swa_evicted_seqlen=}, {self.page_size=}"
+                    if swa_evicted_seqlen <= total_prefix_length:
+                        # Branch 1: all swa tokens of value[:prefix_len] are not evicted, so we can insert it to the tree directly.
+                        # Free full tokens in the original tree node.
+                        self.token_to_kv_pool_allocator.free(node.value[:prefix_len])
+                        # Overwrite the new value in request to the tree node.
+                        node.value = value[:prefix_len].clone()
+                        node.swa_tombstone = False
+                        self.swa_lru_list.insert_mru(node)
+                        self.swa_evictable_size_ += len(node.value)
+                    elif swa_evicted_seqlen < total_prefix_length + prefix_len:
+                        # Branch 2: part of swa tokens of value[:prefix_len] are evicted, so we need to split the node and insert the value to new node.
+                        start_update_idx = swa_evicted_seqlen - total_prefix_length
+                        self.token_to_kv_pool_allocator.free(
+                            node.value[start_update_idx:prefix_len]
+                        )
+                        self._split_node(node.key, node, start_update_idx)
+                        # Here node is the new node after split, so we can overwrite the value to the new node.
+                        # The old node is still swa tombstone and the full token is not freed.
+                        node.value = value[start_update_idx:prefix_len].clone()
+                        self.token_to_kv_pool_allocator.free(value[:start_update_idx])
+                        node.swa_tombstone = False
+                        self.swa_lru_list.insert_mru(node)
+                        self.swa_evictable_size_ += len(node.value)
+                    else:
+                        # Branch 3: all swa tokens of value[:prefix_len] are evicted, so we don't need to update the node.
+                        self.token_to_kv_pool_allocator.free(value[:prefix_len])
+                else:
+                    # The node is not tombstone, so we don't need to update the node.
+                    self.token_to_kv_pool_allocator.free(value[:prefix_len])
+
+            total_prefix_length += prefix_len
+            key = key[prefix_len:]
+            value = value[prefix_len:]
+
+            if len(key):
+                child_key = self.get_child_key_fn(key)
+
+        if len(key):
+            if (
+                swa_evicted_seqlen > total_prefix_length
+                and swa_evicted_seqlen < total_prefix_length + len(key)
+            ):
+                swa_tombstone_len = swa_evicted_seqlen - total_prefix_length
+                node = self._add_new_node(
+                    node,
+                    key[:swa_tombstone_len],
+                    value[:swa_tombstone_len],
+                    swa_tombstone=True,
+                )
+                key = key[swa_tombstone_len:]
+                value = value[swa_tombstone_len:]
+
+            self._add_new_node(node, key, value, swa_tombstone=False)
+        return total_prefix_length
+
+    def _add_new_node(
+        self,
+        parent: TreeNode,
+        key: RadixKey,
+        value: torch.Tensor,
+        swa_tombstone: bool = False,
+    ) -> TreeNode:
+        assert len(key) > 0, f"key should not be empty"
+        new_node = TreeNode()
+        new_node.parent = parent
+        new_node.key = key
+        new_node.value = value.clone()
+        new_node.swa_tombstone = swa_tombstone
+        parent.children[self.get_child_key_fn(key)] = new_node
+        self.full_lru_list.insert_mru(new_node)
+        self.full_evictable_size_ += len(value)
+        if not swa_tombstone:
+            self.swa_lru_list.insert_mru(new_node)
+            self.swa_evictable_size_ += len(value)
+        return new_node
+
+    def _iteratively_delete_tombstone_leaf(
+        self, node: TreeNode
+    ) -> Tuple[TreeNode, int]:
+        full_num_evicted = 0
+        while node.parent.swa_tombstone and len(node.parent.children) == 0:
+            # root node is not evictable
+            if node.parent == self.root_node:
+                break
+            # if locked, means node is in use, skip
+            if node.parent.full_lock_ref > 0:
+                break
+            assert (
+                node.parent.swa_lock_ref == 0
+            ), f"tombstone swa_lock_ref should always be 0, {node.parent.full_lock_ref=}, {node.parent.swa_lock_ref=}, {node.parent.id=}"
+            # delete tombstone node evicts full tokens
+            self.token_to_kv_pool_allocator.free(node.parent.value)
+            full_num_evicted += len(node.parent.value)
+            self.full_lru_list.remove_node(node.parent)
+            self._delete_tombstone_leaf(node.parent)
+            node = node.parent
+
+        return node, full_num_evicted
+
+    def _delete_leaf(self, node: TreeNode) -> None:
+        assert (
+            not node.swa_tombstone
+        ), f"Invariant violated: leaf node is a tombstone, {node.id=}"
+        assert len(node.children) == 0, f"leaf node has children, {node.id=}"
+        key = self.get_child_key_fn(node.key)
+        v = node.parent.children.pop(key, None)
+        assert v == node, f"parent does not have child key, {key}"
+        self.full_evictable_size_ -= len(node.key)
+        self.swa_evictable_size_ -= len(node.key)
+
+    def _tombstone_internal_node(self, node: TreeNode) -> None:
+        assert len(node.children) != 0, f"Cannot tombstone a leaf node, {node.id=}"
+        node.swa_tombstone = True
+        self.swa_evictable_size_ -= len(node.key)
+
+    def _delete_tombstone_leaf(self, node: TreeNode) -> None:
+        assert (
+            node.swa_tombstone
+        ), f"Deleting a unexpected non-tombstone leaf node, {node.id=}"
+        assert len(node.children) == 0, f"leaf node has children, {node.id=}"
+        key = self.get_child_key_fn(node.key)
+        v = node.parent.children.pop(key, None)
+        assert v == node, f"parent does not have child key, {key}"
+
+        self.full_evictable_size_ -= len(node.key)
+
+    def _collect_nontombstone_nodes(self) -> List[TreeNode]:
+        ret_list = []
+        stack = [self.root_node]
+
+        while stack:
+            cur_node = stack.pop()
+            if not cur_node.swa_tombstone:
+                ret_list.append(cur_node)
+            stack.extend(cur_node.children.values())
+
+        return ret_list
+
+    def _collect_all_nodes(self) -> List[TreeNode]:
+        ret_list = []
+        stack = [self.root_node]
+        while stack:
+            cur_node = stack.pop()
+            ret_list.append(cur_node)
+            stack.extend(cur_node.children.values())
+        return ret_list
+
+    def _print_helper(self, node: TreeNode, indent: int) -> None:
+        """Prints the radix tree in a human-readable format."""
+        stack = [(node, indent)]
+        while stack:
+            current_node, current_indent = stack.pop()
+            print(
+                " " * current_indent,
+                current_node.id,
+                len(current_node.key),
+                f"fr={current_node.full_lock_ref}",
+                f"sr={current_node.swa_lock_ref}",
+                f"fll={self.full_lru_list.in_list(current_node)}",
+                f"sll={self.swa_lru_list.in_list(current_node)}",
+                f"ts={current_node.swa_tombstone}",
+            )
+            for key, child in current_node.children.items():
+                stack.append((child, current_indent + 2))
+
+                assert key == self.get_child_key_fn(
+                    child.key
+                ), f"{key=}, {self.get_child_key_fn(child.key)=}"
+
+    def _total_size_helper(self) -> Tuple[int, int]:
+        total_size = 0
+        total_swa_size = 0
+        stack = [self.root_node]
+        while stack:
+            current_node = stack.pop()
+            total_size += len(current_node.value)
+            if not current_node.swa_tombstone:
+                total_swa_size += len(current_node.value)
+            for child in current_node.children.values():
+                if child.evicted:
+                    continue
+                stack.append(child)
+        return total_size, total_swa_size
diff --git a/sglang/python/sglang/srt/mem_cache/utils.py b/sglang/python/sglang/srt/mem_cache/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..3aec3dd89b51d64f5a73cc57866953bd38326a18
--- /dev/null
+++ b/sglang/python/sglang/srt/mem_cache/utils.py
@@ -0,0 +1,274 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Common utilities."""
+
+from typing import Any, List, Optional, Tuple
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.environ import envs
+
+
+@triton.jit
+def set_mla_kv_buffer_kernel(
+    kv_buffer_ptr,
+    cache_k_nope_ptr,
+    cache_k_rope_ptr,
+    loc_ptr,
+    buffer_stride: tl.constexpr,
+    nope_stride: tl.constexpr,
+    rope_stride: tl.constexpr,
+    nope_dim: tl.constexpr,
+    rope_dim: tl.constexpr,
+    BLOCK: tl.constexpr,
+):
+    pid_loc = tl.program_id(0)
+    pid_blk = tl.program_id(1)
+
+    base = pid_blk * BLOCK
+    offs = base + tl.arange(0, BLOCK)
+    total_dim = nope_dim + rope_dim
+    mask = offs < total_dim
+
+    loc = tl.load(loc_ptr + pid_loc).to(tl.int64)
+    dst_ptr = kv_buffer_ptr + loc * buffer_stride + offs
+
+    # Three-way branch to handle boundary correctly while preserving fast path
+    if base + BLOCK <= nope_dim:
+        # Fast path: entire block is in nope region
+        src = tl.load(
+            cache_k_nope_ptr + pid_loc * nope_stride + offs,
+            mask=mask,
+        )
+    elif base >= nope_dim:
+        # Fast path: entire block is in rope region
+        offs_rope = offs - nope_dim
+        src = tl.load(
+            cache_k_rope_ptr + pid_loc * rope_stride + offs_rope,
+            mask=mask,
+        )
+    else:
+        # Boundary case: block spans nope/rope boundary (e.g., FP8 with nope_dim=528)
+        # Handle each offset individually to avoid negative indexing
+        is_nope = offs < nope_dim
+        is_rope = (offs >= nope_dim) & (offs < (nope_dim + rope_dim))
+
+        src_nope = tl.load(
+            cache_k_nope_ptr + pid_loc * nope_stride + offs,
+            mask=mask & is_nope,
+            other=0,
+        )
+        src_rope = tl.load(
+            cache_k_rope_ptr + pid_loc * rope_stride + (offs - nope_dim),
+            mask=mask & is_rope,
+            other=0,
+        )
+
+        src = tl.where(is_nope, src_nope, src_rope)
+
+    tl.store(dst_ptr, src, mask=mask)
+
+
+def set_mla_kv_buffer_triton(
+    kv_buffer: torch.Tensor,
+    loc: torch.Tensor,
+    cache_k_nope: torch.Tensor,
+    cache_k_rope: torch.Tensor,
+):
+    nope_dim = cache_k_nope.shape[-1]
+    rope_dim = cache_k_rope.shape[-1]
+    total_dim = nope_dim + rope_dim
+    BLOCK = 128
+    n_loc = loc.numel()
+    grid = (n_loc, triton.cdiv(total_dim, BLOCK))
+
+    set_mla_kv_buffer_kernel[grid](
+        kv_buffer,
+        cache_k_nope,
+        cache_k_rope,
+        loc,
+        kv_buffer.stride(0),
+        cache_k_nope.stride(0),
+        cache_k_rope.stride(0),
+        nope_dim,
+        rope_dim,
+        BLOCK=BLOCK,
+    )
+
+
+@triton.jit
+def set_mla_kv_scale_buffer_kernel(
+    kv_buffer_ptr,
+    cache_k_nope_ptr,
+    cache_k_rope_ptr,
+    loc_ptr,
+    buffer_stride: tl.constexpr,
+    nope_stride: tl.constexpr,
+    rope_stride: tl.constexpr,
+    nope_dim: tl.constexpr,
+    rope_dim: tl.constexpr,
+    BLOCK: tl.constexpr,
+):
+    pid_loc = tl.program_id(0)
+    pid_blk = tl.program_id(1)
+
+    base = pid_blk * BLOCK
+    offs = base + tl.arange(0, BLOCK)
+    total_dim = nope_dim + rope_dim
+    mask = offs < total_dim  # Make sure don't cross the boundary
+
+    loc = tl.load(loc_ptr + pid_loc)
+    dst_ptr = kv_buffer_ptr + loc * buffer_stride + offs
+
+    # Check each offs should read 'nope' or 'rope'
+    is_nope = offs < nope_dim
+    src_nope = tl.load(
+        cache_k_nope_ptr + pid_loc * nope_stride + offs, mask=mask & is_nope, other=0.0
+    )
+    src_rope = tl.load(
+        cache_k_rope_ptr + pid_loc * rope_stride + (offs - nope_dim),
+        mask=mask & ~is_nope,
+        other=0.0,
+    )
+
+    # Combine nope + rope
+    src = src_nope + src_rope
+    tl.store(dst_ptr, src, mask=mask)
+
+
+def set_mla_kv_scale_buffer_triton(
+    kv_buffer: torch.Tensor,
+    loc: torch.Tensor,
+    cache_k_nope: torch.Tensor,
+    cache_k_rope: torch.Tensor,
+):
+    nope_dim = cache_k_nope.shape[-1]
+    rope_dim = cache_k_rope.shape[-1]
+    total_dim = nope_dim + rope_dim
+    BLOCK = 128  # Keep origin, works for smaller total_dim as well.
+    n_loc = loc.numel()
+    grid = (n_loc, triton.cdiv(total_dim, BLOCK))
+
+    set_mla_kv_scale_buffer_kernel[grid](
+        kv_buffer,
+        cache_k_nope,
+        cache_k_rope,
+        loc,
+        kv_buffer.stride(0),
+        cache_k_nope.stride(0),
+        cache_k_rope.stride(0),
+        nope_dim,
+        rope_dim,
+        BLOCK=BLOCK,
+    )
+
+
+@triton.jit
+def get_mla_kv_buffer_kernel(
+    kv_buffer_ptr,
+    cache_k_nope_ptr,
+    cache_k_rope_ptr,
+    loc_ptr,
+    buffer_stride: tl.constexpr,
+    nope_stride: tl.constexpr,
+    rope_stride: tl.constexpr,
+    nope_dim: tl.constexpr,
+    rope_dim: tl.constexpr,
+):
+    pid_loc = tl.program_id(0)
+    loc = tl.load(loc_ptr + pid_loc).to(tl.int64)
+    loc_src_ptr = kv_buffer_ptr + loc * buffer_stride
+
+    nope_offs = tl.arange(0, nope_dim)
+    nope_src_ptr = loc_src_ptr + nope_offs
+    nope_src = tl.load(nope_src_ptr)
+
+    tl.store(
+        cache_k_nope_ptr + pid_loc * nope_stride + nope_offs,
+        nope_src,
+    )
+
+    rope_offs = tl.arange(0, rope_dim)
+    rope_src_ptr = loc_src_ptr + nope_dim + rope_offs
+    rope_src = tl.load(rope_src_ptr)
+    tl.store(
+        cache_k_rope_ptr + pid_loc * rope_stride + rope_offs,
+        rope_src,
+    )
+
+
+def get_mla_kv_buffer_triton(
+    kv_buffer: torch.Tensor,
+    loc: torch.Tensor,
+    cache_k_nope: torch.Tensor,
+    cache_k_rope: torch.Tensor,
+):
+    # The source data type will be implicitly converted to the target data type.
+    nope_dim = cache_k_nope.shape[-1]  # 512
+    rope_dim = cache_k_rope.shape[-1]  # 64
+    n_loc = loc.numel()
+    grid = (n_loc,)
+
+    get_mla_kv_buffer_kernel[grid](
+        kv_buffer,
+        cache_k_nope,
+        cache_k_rope,
+        loc,
+        kv_buffer.stride(0),
+        cache_k_nope.stride(0),
+        cache_k_rope.stride(0),
+        nope_dim,
+        rope_dim,
+    )
+
+
+def maybe_init_custom_mem_pool(
+    device: str,
+) -> Tuple[bool, Optional[Any], Optional[str]]:
+    """
+    Initialize custom memory pool based on environment variable.
+
+    This function can be modified to support more features that require a custom memory pool.
+
+    Args:
+        device: The device to allocate memory on
+
+    Returns:
+        Tuple of (enable_custom_mem_pool, custom_mem_pool, custom_mem_pool_type)
+    """
+    enable_custom_mem_pool = (
+        True if envs.SGLANG_MOONCAKE_CUSTOM_MEM_POOL.get() is not None else False
+    )
+
+    if enable_custom_mem_pool:
+        # Currently, only mooncake requires a custom mem pool for MNNVL/Barex PD disaggregation
+        from sglang.srt.disaggregation.mooncake.utils import (
+            init_mooncake_custom_mem_pool,
+        )
+
+        return init_mooncake_custom_mem_pool(device)
+    else:
+        return False, None, None
+
+
+def convert_to_bigram_key(tokens: List[int]) -> List[Tuple[int, int]]:
+    # EAGLE uses bigram keys in the radix tree since draft sequence is the one-token-shifted version of target
+    # [1, 2, 3, 4] -> [(1,2), (2,3), (3,4)]
+    if len(tokens) and isinstance(tokens[0], tuple):
+        return tokens
+    if len(tokens) < 2:
+        return []
+    return [(tokens[i], tokens[i + 1]) for i in range(len(tokens) - 1)]
diff --git a/sglang/python/sglang/srt/model_executor/__pycache__/cpu_graph_runner.cpython-311.pyc b/sglang/python/sglang/srt/model_executor/__pycache__/cpu_graph_runner.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d727f79a951518e802ac346885d15e6f58814189
Binary files /dev/null and b/sglang/python/sglang/srt/model_executor/__pycache__/cpu_graph_runner.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/model_executor/__pycache__/cuda_graph_runner.cpython-311.pyc b/sglang/python/sglang/srt/model_executor/__pycache__/cuda_graph_runner.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2b7bb104892415b7a0cc8af3e0a31bca1f76ec24
Binary files /dev/null and b/sglang/python/sglang/srt/model_executor/__pycache__/cuda_graph_runner.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/model_executor/__pycache__/forward_batch_deepseek_mha_mixin.cpython-311.pyc b/sglang/python/sglang/srt/model_executor/__pycache__/forward_batch_deepseek_mha_mixin.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..772161cb67aac0415d651c0f5bfffe28058be28d
Binary files /dev/null and b/sglang/python/sglang/srt/model_executor/__pycache__/forward_batch_deepseek_mha_mixin.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/model_executor/__pycache__/forward_batch_info.cpython-311.pyc b/sglang/python/sglang/srt/model_executor/__pycache__/forward_batch_info.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8d68acc11270b82cd659144cabd723f306fcbdaa
Binary files /dev/null and b/sglang/python/sglang/srt/model_executor/__pycache__/forward_batch_info.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/model_executor/__pycache__/hook_manager.cpython-311.pyc b/sglang/python/sglang/srt/model_executor/__pycache__/hook_manager.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e68b15451f7c819f22a75fa712437d08e0e04368
Binary files /dev/null and b/sglang/python/sglang/srt/model_executor/__pycache__/hook_manager.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/model_executor/__pycache__/input_buffers.cpython-311.pyc b/sglang/python/sglang/srt/model_executor/__pycache__/input_buffers.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..07d158039ce73635fc9f11fab10e347d999dcb13
Binary files /dev/null and b/sglang/python/sglang/srt/model_executor/__pycache__/input_buffers.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/model_executor/__pycache__/model_runner_kv_cache_mixin.cpython-311.pyc b/sglang/python/sglang/srt/model_executor/__pycache__/model_runner_kv_cache_mixin.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2f1b18428e47451d63a01a68f6f1c36b0e517548
Binary files /dev/null and b/sglang/python/sglang/srt/model_executor/__pycache__/model_runner_kv_cache_mixin.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/model_executor/__pycache__/piecewise_cuda_graph_runner.cpython-311.pyc b/sglang/python/sglang/srt/model_executor/__pycache__/piecewise_cuda_graph_runner.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..afc536691e1f525f88f9b1d74bd0633280c8e684
Binary files /dev/null and b/sglang/python/sglang/srt/model_executor/__pycache__/piecewise_cuda_graph_runner.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/model_executor/cpu_graph_runner.py b/sglang/python/sglang/srt/model_executor/cpu_graph_runner.py
new file mode 100644
index 0000000000000000000000000000000000000000..f71e83ec42d7b71636b18d5a3408525fa4cf52d3
--- /dev/null
+++ b/sglang/python/sglang/srt/model_executor/cpu_graph_runner.py
@@ -0,0 +1,783 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Run the model with cpu torch compile."""
+
+# The implementation of CPUGraphRunner follows the CudaGraphRunner
+
+from __future__ import annotations
+
+import logging
+from contextlib import contextmanager
+from typing import TYPE_CHECKING, Callable, Optional, Union
+
+import psutil
+import torch
+import tqdm
+
+from sglang.srt.distributed import get_tensor_model_parallel_rank
+from sglang.srt.distributed.parallel_state import GroupCoordinator
+from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.model_executor.forward_batch_info import (
+    CaptureHiddenMode,
+    ForwardBatch,
+    ForwardMode,
+    PPProxyTensors,
+)
+from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
+from sglang.srt.utils import (
+    log_info_on_rank0,
+    require_attn_tp_gather,
+    require_gathered_buffer,
+    require_mlp_sync,
+    require_mlp_tp_gather,
+)
+from sglang.srt.utils.patch_torch import monkey_patch_torch_compile
+
+logger = logging.getLogger(__name__)
+
+if TYPE_CHECKING:
+    from sglang.srt.model_executor.model_runner import ModelRunner
+
+
+@contextmanager
+def patch_model(
+    model: torch.nn.Module,
+    enable_compile: bool,
+    num_tokens: int,
+    tp_group: GroupCoordinator,
+):
+    """Patch the model to make it compatible with torch.compile"""
+    backup_ca_comm = None
+
+    try:
+        if enable_compile:
+            backup_ca_comm = tp_group.ca_comm
+            # Use custom-allreduce here.
+            # We found the custom allreduce is much faster than the built-in allreduce in torch,
+            # even with ENABLE_INTRA_NODE_COMM=1.
+            # tp_group.ca_comm = None
+            yield torch.compile(
+                torch.no_grad()(model.forward),
+                dynamic=False,
+            )
+        else:
+            yield model.forward
+    finally:
+        if enable_compile:
+            tp_group.ca_comm = backup_ca_comm
+
+
+def set_torch_compile_config():
+    import torch._dynamo.config
+    import torch._inductor.config
+
+    torch._inductor.config.fx_graph_cache = True  # Experimental feature to reduce compilation times, will be on by default in future
+    torch._inductor.config.freezing = True
+    torch._dynamo.config.accumulated_cache_size_limit = 1024
+    if hasattr(torch._dynamo.config, "cache_size_limit"):
+        torch._dynamo.config.cache_size_limit = 1024
+    monkey_patch_torch_compile()
+
+
+def get_batch_sizes_to_capture(model_runner: ModelRunner):
+    server_args = model_runner.server_args
+    # cpu torch compile only speeds up decoding by
+    # reducing python overhead when bs is small
+    capture_bs = list(range(1, 17))
+    capture_bs = [bs for bs in capture_bs if bs <= server_args.torch_compile_max_bs]
+    capture_bs = [bs for bs in capture_bs if bs <= model_runner.req_to_token_pool.size]
+    capture_bs = list(sorted(set(capture_bs)))
+    assert len(capture_bs) > 0 and capture_bs[0] > 0, f"{capture_bs=}"
+    return capture_bs
+
+
+def register_fake_ops():
+    """
+    Registers fake/meta implementations for all custom sgl_kernel CPU operators
+    using torch.library.register_fake to support torch.compile
+    """
+
+    none_return_ops = [
+        "shm_allreduce",
+        "bmm_cpu",
+        "fused_add_rmsnorm_cpu",
+        "decode_attention_cpu",
+        "extend_attention_cpu",
+        "gemma_fused_add_rmsnorm_cpu",
+        "layernorm_cpu",
+        "fused_add_layernorm_cpu",
+    ]
+    for op in none_return_ops:
+
+        @torch.library.register_fake(f"sgl_kernel::{op}")
+        def _(*args, **kwargs):
+            return
+
+    for op in [
+        "rmsnorm_cpu",
+        "l2norm_cpu",
+        "fused_experts_cpu",
+        "fused_rmsnorm_gated_cpu",
+        "shared_expert_cpu",
+        "causal_conv1d_update_cpu",
+        "causal_conv1d_fwd_cpu",
+        "gemma_rmsnorm_cpu",
+        "gemma3_rmsnorm_cpu",
+    ]:
+
+        @torch.library.register_fake(f"sgl_kernel::{op}")
+        def _(input, *args, **kwargs):
+            return torch.empty_like(input)
+
+    @torch.library.register_fake("sgl_kernel::qkv_proj_with_rope")
+    def _(
+        hidden_states,
+        q_a_proj_weight,
+        q_b_proj_weight,
+        kv_a_proj_weight,
+        w_kc,
+        q_a_layernorm_weight,
+        kv_a_layernorm_weight,
+        positions,
+        cos_sin_cache,
+        eps,
+        use_int8_w8a8,
+        use_fp8_w8a16,
+        q_a_proj_scale,
+        q_b_proj_scale,
+        kv_a_proj_scale,
+        is_vnni,
+        block_size,
+    ):
+        num_seqs = hidden_states.shape[0]
+        num_heads = w_kc.shape[0]
+        kv_lora_rank = w_kc.shape[1]
+        qk_rope_head_dim = kv_a_proj_weight.shape[0] - kv_lora_rank
+        q_input = torch.empty(
+            num_seqs,
+            num_heads,
+            kv_lora_rank + qk_rope_head_dim,
+            dtype=hidden_states.dtype,
+            device=hidden_states.device,
+        )
+        k_input = torch.empty(
+            num_seqs,
+            1,
+            kv_lora_rank + qk_rope_head_dim,
+            dtype=hidden_states.dtype,
+            device=hidden_states.device,
+        )
+        v_input = k_input.narrow(-1, 0, kv_lora_rank)
+        return q_input, k_input, v_input
+
+    @torch.library.register_fake("sgl_kernel::rotary_embedding_cpu")
+    def _(positions, query, key, head_size, cos_sin_cache, is_neox):
+        if query.ndim == 2:
+            return query, key
+        else:
+            return torch.empty_like(query), torch.empty_like(key)
+
+    @torch.library.register_fake("sgl_kernel::qkv_proj_with_rope_fused_weight")
+    def _(
+        hidden_states,
+        q_a_proj_weight,
+        q_b_proj_weight,
+        w_kc,
+        q_a_layernorm_weight,
+        kv_a_layernorm_weight,
+        positions,
+        cos_sin_cache,
+        eps,
+        use_int8_w8a8,
+        use_fp8_w8a16,
+        qkv_a_proj_scale,
+        q_b_proj_scale,
+        is_vnni,
+        block_size,
+        q_lora_rank,
+        kv_lora_rank,
+        qk_rope_head_dim,
+    ):
+        num_seqs = hidden_states.shape[0]
+        num_heads = w_kc.shape[0]
+        kv_lora_rank = w_kc.shape[1]
+        weight_chunks = torch.split(
+            q_a_proj_weight, [q_lora_rank, kv_lora_rank + qk_rope_head_dim], dim=0
+        )
+        qk_rope_head_dim = weight_chunks[1].shape[0] - kv_lora_rank
+        q_input = torch.empty(
+            num_seqs,
+            num_heads,
+            kv_lora_rank + qk_rope_head_dim,
+            dtype=hidden_states.dtype,
+            device=hidden_states.device,
+        )
+        k_input = torch.empty(
+            num_seqs,
+            1,
+            kv_lora_rank + qk_rope_head_dim,
+            dtype=hidden_states.dtype,
+            device=hidden_states.device,
+        )
+        v_input = k_input.narrow(-1, 0, kv_lora_rank)
+        return q_input, k_input, v_input
+
+    def get_n_size(mat2, is_vnni):
+        tile_n = 16
+        if mat2.dtype == torch.float32:
+            return mat2.shape[1]
+        if not is_vnni and mat2.dim() == 2 and mat2.shape[0] < tile_n:
+            return mat2.shape[1]
+        return mat2.shape[0]
+
+    @torch.library.register_fake("sgl_kernel::weight_packed_linear")
+    def _(mat1, mat2, bias, is_vnni):
+        M = mat1.shape[0]
+        N = get_n_size(mat2, is_vnni)
+        return mat1.new_empty(M, N)
+
+    @torch.library.register_fake("sgl_kernel::per_token_quant_int8_cpu")
+    def _(input):
+        M = input.shape[0]
+        K = input.shape[1]
+        Aq = input.new_empty(M, K, dtype=torch.int8)
+        As = input.new_empty(M, dtype=torch.float32)
+        return Aq, As
+
+    @torch.library.register_fake("sgl_kernel::int8_scaled_mm_cpu")
+    def _(mat1, mat2, scales1, scales2, bias, out_dtype, is_vnni):
+        M = mat1.shape[0]
+        N = mat2.shape[0]
+        out = mat1.new_empty(M, N, dtype=out_dtype)
+        return out
+
+    @torch.library.register_fake("sgl_kernel::grouped_topk_cpu")
+    def _(
+        hidden_states,
+        gating_output,
+        topk,
+        renormalize,
+        num_expert_group,
+        topk_group,
+        num_fused_shared_experts,
+        routed_scaling_factor,
+        num_token_non_padded,
+    ):
+        num_tokens = hidden_states.shape[0]
+        shape = (num_tokens, topk)
+        device = hidden_states.device
+        topk_weights = torch.empty(shape, device=device, dtype=torch.float32)
+        topk_ids = torch.empty(shape, device=device, dtype=torch.int)
+        return topk_weights, topk_ids
+
+    @torch.library.register_fake("sgl_kernel::biased_grouped_topk_cpu")
+    def _(
+        hidden_states,
+        gating_output,
+        correction_bias,
+        topk,
+        renormalize,
+        num_expert_group,
+        topk_group,
+        num_fused_shared_experts,
+        routed_scaling_factor,
+        num_token_non_padded,
+    ):
+        num_tokens = hidden_states.shape[0]
+        shape = (num_tokens, topk)
+        device = hidden_states.device
+        topk_weights = torch.empty(shape, device=device, dtype=torch.float32)
+        topk_ids = torch.empty(shape, device=device, dtype=torch.int)
+        return topk_weights, topk_ids
+
+    @torch.library.register_fake("sgl_kernel::topk_sigmoid_cpu")
+    def _(hidden_states, gating_output, topk, renormalize):
+        num_tokens = hidden_states.shape[0]
+        shape = (num_tokens, topk)
+        return (
+            torch.empty(shape, device=hidden_states.device, dtype=torch.float),
+            torch.empty(shape, device=hidden_states.device, dtype=torch.int),
+        )
+
+    @torch.library.register_fake("sgl_kernel::topk_softmax_cpu")
+    def _(
+        hidden_states,
+        gating_output,
+        topk,
+        renormalize,
+    ):
+        num_tokens = hidden_states.shape[0]
+        shape = (num_tokens, topk)
+        return (
+            torch.empty(shape, device=hidden_states.device, dtype=torch.float),
+            torch.empty(shape, device=hidden_states.device, dtype=torch.int),
+        )
+
+    for act_op in [
+        "silu_and_mul_cpu",
+        "gelu_tanh_and_mul_cpu",
+        "gelu_and_mul_cpu",
+    ]:
+
+        @torch.library.register_fake(f"sgl_kernel::{act_op}")
+        def _(input):
+            sizes = list(input.shape)
+            last_dim = input.dim() - 1
+            d = sizes[last_dim] // 2
+            sizes[last_dim] = d
+            return input.new_empty(sizes)
+
+    @torch.library.register_fake("sgl_kernel::int8_scaled_mm_with_quant")
+    def _(
+        mat1,
+        mat2,
+        scales2,
+        bias,
+        out_dtype,
+        is_vnni,
+    ):
+        M = mat1.shape[0]
+        N = mat2.shape[0]
+        return mat1.new_empty(M, N, dtype=out_dtype)
+
+    @torch.library.register_fake("sgl_kernel::fp8_scaled_mm_cpu")
+    def _(
+        mat1,
+        mat2,
+        scales2,
+        block_size,
+        bias,
+        out_dtype,
+        is_vnni,
+    ):
+        M = mat1.shape[0]
+        N = mat2.shape[0]
+        return mat1.new_empty(M, N, dtype=out_dtype)
+
+    @torch.library.register_fake("sgl_kernel::fused_linear_sigmoid_mul")
+    def _(
+        mat1,
+        mat2,
+        bias,
+        is_vnni,
+        post_mul_mat,
+    ):
+        M = mat1.shape[0]
+        N = post_mul_mat.shape[1]
+        return mat1.new_empty(M, N)
+
+    @torch.library.register_fake("sgl_kernel::fused_qkvzba_split_reshape_cat_cpu")
+    def _(mixed_qkvz, mixed_ba, num_heads_qk, num_heads_v, head_qk, head_v):
+        batch = mixed_qkvz.shape[0]
+        qkv_dim = num_heads_qk * head_qk * 2 + num_heads_v * head_v
+        mixed_qkv = mixed_qkvz.new_empty(batch, qkv_dim)
+        z = mixed_qkvz.new_empty(batch, num_heads_v, head_v)
+        b = mixed_ba.new_empty(batch, num_heads_v)
+        a = mixed_ba.new_empty(batch, num_heads_v)
+        return mixed_qkv, z, b, a
+
+    @torch.library.register_fake(
+        "sgl_kernel::fused_sigmoid_gating_delta_rule_update_cpu"
+    )
+    def _(
+        A_log,
+        dt_bias,
+        q,
+        k,
+        v,
+        a,
+        b,
+        initial_state_source,
+        initial_state_indices,
+        cu_seqlens,
+        use_qk_l2norm_in_kernel,
+        softplus_beta=1.0,
+        softplus_threshold=20.0,
+    ):
+        assert q.dim() == 4
+        assert v.dim() == 4
+        batch_size = q.shape[1]
+        seq_len = q.shape[0]
+        v_num_heads = v.shape[2]
+        v_head_dim = v.shape[3]
+        return q.new_empty(batch_size, seq_len, v_num_heads, v_head_dim)
+
+    @torch.library.register_fake("sgl_kernel::fused_gdn_gating_cpu")
+    def _(A_log, a, b, dt_bias):
+        batch = a.shape[0]
+        num_heads = a.shape[1]
+        out = a.new_empty(1, batch, num_heads, dtype=torch.float)
+        beta = b.new_empty(1, batch, num_heads)
+        return out, beta
+
+    @torch.library.register_fake("sgl_kernel::chunk_gated_delta_rule_cpu")
+    def _(
+        query,
+        key,
+        value,
+        g,
+        beta,
+        initial_state,
+        output_final_state,
+        cu_seqlens,
+        head_first,
+        use_qk_l2norm_in_kernel,
+        eps,
+    ):
+        output = torch.empty_like(value)
+        assert initial_state is not None
+        final_state = initial_state.to(torch.float32)
+
+        return output, final_state
+
+
+# TODO Remove unnecessary settings for CPUGraphRunner.
+# Re-abstract the graph runner and restructure CPUGraphRunner to reuse the same logic.
+class CPUGraphRunner:
+    """A CPUGraphRunner runs the forward pass of a model with cpu torch.compile."""
+
+    def __init__(self, model_runner: ModelRunner):
+        # Parse args
+        self.model_runner = model_runner
+        self.device = model_runner.device
+        self.graphs = {}
+        self.output_buffers = {}
+        self.enable_torch_compile = model_runner.server_args.enable_torch_compile
+        self.disable_padding = model_runner.server_args.disable_cuda_graph_padding
+        self.is_encoder_decoder = model_runner.model_config.is_encoder_decoder
+        self.require_gathered_buffer = require_gathered_buffer(model_runner.server_args)
+        self.require_mlp_tp_gather = require_mlp_tp_gather(model_runner.server_args)
+        self.require_mlp_sync = require_mlp_sync(model_runner.server_args)
+        self.require_attn_tp_gather = require_attn_tp_gather(model_runner.server_args)
+        self.enable_two_batch_overlap = (
+            model_runner.server_args.enable_two_batch_overlap
+        )
+        self.speculative_algorithm = model_runner.server_args.speculative_algorithm
+        self.enable_profile_cuda_graph = (
+            model_runner.server_args.enable_profile_cuda_graph
+        )
+        self.tp_size = model_runner.server_args.tp_size
+        self.dp_size = model_runner.server_args.dp_size
+        self.pp_size = model_runner.server_args.pp_size
+
+        self.capture_forward_mode = ForwardMode.DECODE
+        self.capture_hidden_mode = CaptureHiddenMode.NULL
+        self.num_tokens_per_bs = 1
+
+        # If returning hidden states is enabled, set initial capture hidden mode to full to avoid double-capture on startup
+        if model_runner.server_args.enable_return_hidden_states:
+            self.capture_hidden_mode = CaptureHiddenMode.FULL
+
+        assert (
+            not self.model_runner.server_args.enable_lora
+        ), "CPUGraphRunner does not support LoRA yet."
+        assert (
+            not self.enable_two_batch_overlap
+        ), "CPUGraphRunner does not support two batch overlap yet."
+        assert (
+            not self.require_mlp_tp_gather
+        ), "CPUGraphRunner does not support MLP TP gather yet."
+        assert (
+            not self.require_mlp_sync
+        ), "CPUGraphRunner does not support MLP sync yet."
+        assert (
+            not self.require_gathered_buffer
+        ), "CPUGraphRunner does not support gathered buffer yet."
+        assert (
+            model_runner.spec_algorithm == SpeculativeAlgorithm.NONE
+        ), "CPUGraphRunner does not support speculative inference yet."
+        # TODO add compile support for encoder-decoder models
+        assert (
+            not self.is_encoder_decoder
+        ), "CPUGraphRunner does not support encoder-decoder models yet."
+        assert self.dp_size == 1, "CPUGraphRunner does not support DP yet."
+        assert self.pp_size == 1, "CPUGraphRunner does not support PP yet."
+
+        # Batch sizes to capture
+        self.capture_bs = get_batch_sizes_to_capture(model_runner)
+        log_info_on_rank0(logger, f"Capture cpu graph bs {self.capture_bs}")
+        # Attention backend
+        self.max_bs = max(self.capture_bs)
+        self.max_num_token = self.max_bs * self.num_tokens_per_bs
+        self.model_runner.attn_backend.init_cpu_graph_state(
+            self.max_bs, self.max_num_token
+        )
+
+        self.seq_len_fill_value = (
+            self.model_runner.attn_backend.get_cpu_graph_seq_len_fill_value()
+        )
+
+        if self.enable_torch_compile:
+            register_fake_ops()
+            set_torch_compile_config()
+
+        # Graph inputs
+        with torch.device(self.device):
+            self.input_ids = torch.zeros((self.max_num_token,), dtype=torch.int64)
+            self.req_pool_indices = torch.zeros((self.max_bs,), dtype=torch.int64)
+            self.seq_lens = torch.full(
+                (self.max_bs,), self.seq_len_fill_value, dtype=torch.int64
+            )
+            self.out_cache_loc = torch.zeros((self.max_num_token,), dtype=torch.int64)
+            self.positions = torch.zeros((self.max_num_token,), dtype=torch.int64)
+            self.mrope_positions = torch.zeros((3, self.max_bs), dtype=torch.int64)
+            self.num_token_non_padded = torch.zeros((1,), dtype=torch.int64)
+            self.custom_mask = torch.ones(
+                (
+                    (self.seq_lens.sum().item() + self.max_num_token)
+                    * self.num_tokens_per_bs
+                ),
+                dtype=torch.bool,
+                device=self.device,
+            )
+
+        # Capture
+        try:
+            self.capture()
+        except RuntimeError as e:
+            raise Exception(
+                f"Capture CPU graph failed: {e}\n{CPU_GRAPH_CAPTURE_FAILED_MSG}"
+            )
+
+    def can_run(self, forward_batch: ForwardBatch):
+        is_bs_supported = forward_batch.batch_size in self.graphs
+
+        requested_capture_hidden_mode = max(
+            forward_batch.capture_hidden_mode,
+            (
+                forward_batch.spec_info.capture_hidden_mode
+                if getattr(forward_batch.spec_info, "capture_hidden_mode", None)
+                is not None
+                else CaptureHiddenMode.NULL
+            ),
+        )
+        capture_hidden_mode_matches = (
+            requested_capture_hidden_mode == CaptureHiddenMode.NULL
+            or requested_capture_hidden_mode == self.capture_hidden_mode
+        )
+
+        return is_bs_supported and capture_hidden_mode_matches
+
+    def capture(self) -> None:
+        capture_range = (
+            tqdm.tqdm(list(reversed(self.capture_bs)))
+            if get_tensor_model_parallel_rank() == 0
+            else reversed(self.capture_bs)
+        )
+        for bs in capture_range:
+            if get_tensor_model_parallel_rank() == 0:
+                avail_mem = psutil.virtual_memory().available / (1 << 30)
+                capture_range.set_description(
+                    f"Capturing batches ({bs=} {avail_mem=:.2f} GB)"
+                )
+
+            with patch_model(
+                self.model_runner.model,
+                bs in self.capture_bs,
+                num_tokens=bs * self.num_tokens_per_bs,
+                tp_group=self.model_runner.tp_group,
+            ) as forward:
+                (
+                    graph,
+                    output_buffers,
+                ) = self.capture_one_batch_size(bs, forward)
+                self.graphs[bs] = graph
+                self.output_buffers[bs] = output_buffers
+
+        # Re-init states for qwen3-next as
+        # torch.compile may change the states
+        self._reset_mamba_cache_if_needed()
+
+    def _reset_mamba_cache_if_needed(self) -> None:
+
+        mamba_pool = getattr(self.model_runner.req_to_token_pool, "mamba_pool", None)
+        if mamba_pool is None:
+            return
+        mamba_cache = getattr(mamba_pool, "mamba_cache", None)
+        if mamba_cache is None:
+            return
+
+        def _zero_nested(obj):
+            if isinstance(obj, torch.Tensor):
+                obj.zero_()
+            elif isinstance(obj, (list, tuple)):
+                for it in obj:
+                    _zero_nested(it)
+
+        for v in vars(mamba_cache).values():
+            _zero_nested(v)
+
+    def capture_one_batch_size(self, bs: int, forward: Callable):
+        num_tokens = bs * self.num_tokens_per_bs
+
+        # Graph inputs
+        input_ids = self.input_ids[:num_tokens]
+        req_pool_indices = self.req_pool_indices[:bs]
+        seq_lens = self.seq_lens[:bs]
+        out_cache_loc = self.out_cache_loc[:num_tokens]
+        positions = self.positions[:num_tokens]
+        mrope_positions = self.mrope_positions[:, :num_tokens]
+        self.num_token_non_padded[...] = num_tokens
+
+        spec_info = self.get_spec_info(num_tokens)
+        if self.capture_hidden_mode != CaptureHiddenMode.FULL:
+            self.capture_hidden_mode = (
+                spec_info.capture_hidden_mode if spec_info else CaptureHiddenMode.NULL
+            )
+
+        forward_batch = ForwardBatch(
+            forward_mode=self.capture_forward_mode,
+            batch_size=bs,
+            input_ids=input_ids,
+            req_pool_indices=req_pool_indices,
+            seq_lens=seq_lens,
+            req_to_token_pool=self.model_runner.req_to_token_pool,
+            token_to_kv_pool=self.model_runner.token_to_kv_pool,
+            attn_backend=self.model_runner.attn_backend,
+            out_cache_loc=out_cache_loc,
+            seq_lens_sum=seq_lens.sum().item(),
+            return_logprob=False,
+            positions=positions,
+            mrope_positions=mrope_positions,
+            spec_algorithm=self.model_runner.spec_algorithm,
+            spec_info=spec_info,
+            capture_hidden_mode=self.capture_hidden_mode,
+            num_token_non_padded=self.num_token_non_padded,
+            global_forward_mode=self.capture_forward_mode,
+        )
+
+        # Attention backend
+        self.model_runner.attn_backend.init_forward_metadata_capture_cpu_graph(
+            bs,
+            num_tokens,
+            req_pool_indices,
+            seq_lens,
+            None,
+            forward_batch.forward_mode,
+            forward_batch.spec_info,
+        )
+        # Do infernence to avoid setting attr at runtime, e.g.,
+        # self.attn_mha.kv_b_proj = self.kv_b_proj for full graph compile on CPU
+        with torch.no_grad():
+            self.model_runner.tp_group.barrier()
+            self.model_runner.model.forward(
+                forward_batch.input_ids,
+                forward_batch.positions,
+                forward_batch,
+            )
+
+        # Run and capture
+        def run_once():
+            # Clean intermediate result cache for DP attention
+            forward_batch.dp_local_start_pos = forward_batch.dp_local_num_tokens = None
+            logits_output_or_pp_proxy_tensors = forward(
+                forward_batch.input_ids,
+                forward_batch.positions,
+                forward_batch,
+            )
+            return logits_output_or_pp_proxy_tensors
+
+        with torch.no_grad():
+            for _ in range(2):
+                self.model_runner.tp_group.barrier()
+                out = run_once()
+            return forward, out
+
+    def recapture_if_needed(self, forward_batch: ForwardBatch):
+
+        # If the required capture_hidden_mode changes, we need to recapture the graph
+
+        # These are the different factors that can influence the capture_hidden_mode
+        capture_hidden_mode_required_by_forward_batch = (
+            forward_batch.capture_hidden_mode
+        )
+        capture_hidden_mode_required_by_spec_info = getattr(
+            forward_batch.spec_info, "capture_hidden_mode", CaptureHiddenMode.NULL
+        )
+        capture_hidden_mode_required_for_returning_hidden_states = (
+            CaptureHiddenMode.FULL
+            if self.model_runner.server_args.enable_return_hidden_states
+            else CaptureHiddenMode.NULL
+        )
+
+        # Determine the highest capture_hidden_mode required
+        # (If we have FULL, we can emulate LAST or NULL)
+        # (If we have LAST, we can emulate NULL)
+        required_capture_hidden_mode = max(
+            capture_hidden_mode_required_by_forward_batch,
+            capture_hidden_mode_required_by_spec_info,
+            capture_hidden_mode_required_for_returning_hidden_states,
+        )
+
+        # If the current hidden mode is no longer aligned with the required hidden mode, we need to set it to what is required and re-capture
+        if self.capture_hidden_mode != required_capture_hidden_mode:
+            self.capture_hidden_mode = required_capture_hidden_mode
+            self.capture()
+
+    # TODO add padding support for CPUGraphRunner
+    def replay(
+        self,
+        forward_batch: ForwardBatch,
+        skip_attn_backend_init: bool = False,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[LogitsProcessorOutput, PPProxyTensors]:
+        assert (
+            pp_proxy_tensors is None
+        ), "PPProxyTensors is not supported in CPUGraphRunner yet."
+        self.recapture_if_needed(forward_batch)
+        self.model_runner.attn_backend.init_forward_metadata(forward_batch)
+        output = self.graphs[forward_batch.batch_size](
+            forward_batch.input_ids,
+            forward_batch.positions,
+            forward_batch,
+        )
+        return output
+
+    def get_spec_info(self, num_tokens: int):
+        spec_info = None
+        if (
+            self.model_runner.spec_algorithm.is_eagle()
+            or self.model_runner.spec_algorithm.is_standalone()
+        ):
+            from sglang.srt.speculative.eagle_info import EagleVerifyInput
+
+            if self.model_runner.is_draft_worker:
+                raise RuntimeError("This should not happen.")
+            else:
+                spec_info = EagleVerifyInput(
+                    draft_token=None,
+                    custom_mask=self.custom_mask,
+                    positions=None,
+                    retrive_index=None,
+                    retrive_next_token=None,
+                    retrive_next_sibling=None,
+                    retrive_cum_len=None,
+                    spec_steps=self.model_runner.server_args.speculative_num_steps,
+                    topk=self.model_runner.server_args.speculative_eagle_topk,
+                    draft_token_num=self.model_runner.server_args.speculative_num_draft_tokens,
+                    capture_hidden_mode=CaptureHiddenMode.FULL,
+                    seq_lens_sum=None,
+                    seq_lens_cpu=None,
+                )
+
+        return spec_info
+
+
+CPU_GRAPH_CAPTURE_FAILED_MSG = (
+    "Possible solutions:\n"
+    "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
+    "2. set --torch-compile-max-bs to a smaller value (e.g., 8)\n"
+    "3. disable torch compile by not using --enable-torch-compile\n"
+    "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
+)
diff --git a/sglang/python/sglang/srt/model_executor/cuda_graph_runner.py b/sglang/python/sglang/srt/model_executor/cuda_graph_runner.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ff800a9f65d4f66edcd413fedaabbafab279e03
--- /dev/null
+++ b/sglang/python/sglang/srt/model_executor/cuda_graph_runner.py
@@ -0,0 +1,1159 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Run the model with cuda graph and torch.compile."""
+
+from __future__ import annotations
+
+import bisect
+import gc
+import inspect
+import logging
+import os
+from contextlib import contextmanager
+from dataclasses import dataclass
+from functools import partial
+from typing import TYPE_CHECKING, Callable, Dict, Optional, Union
+
+import torch
+import tqdm
+from torch.profiler import ProfilerActivity, profile
+
+from sglang.srt.batch_overlap.two_batch_overlap import TboCudaGraphRunnerPlugin
+from sglang.srt.constants import GPU_MEMORY_TYPE_CUDA_GRAPH
+from sglang.srt.distributed import get_tensor_model_parallel_rank
+from sglang.srt.distributed.device_communicators.pynccl_allocator import (
+    set_graph_pool_id,
+)
+from sglang.srt.distributed.parallel_state import (
+    GroupCoordinator,
+    graph_capture,
+    set_pdmux_status,
+)
+from sglang.srt.dllm.config import DllmConfig
+from sglang.srt.layers.attention.nsa.utils import is_nsa_enable_prefill_cp
+from sglang.srt.layers.dp_attention import (
+    DpPaddingMode,
+    get_attention_cp_size,
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    set_dp_buffer_len,
+    set_is_extend_in_batch,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.layers.moe.token_dispatcher.deepep import DeepEPBuffer
+from sglang.srt.layers.moe.utils import get_deepep_mode, get_moe_a2a_backend
+from sglang.srt.layers.utils import MultiPlatformOp
+from sglang.srt.model_executor.forward_batch_info import (
+    CaptureHiddenMode,
+    ForwardBatch,
+    ForwardMode,
+    PPProxyTensors,
+    compute_local_num_token_non_padded,
+    enable_num_token_non_padded,
+)
+from sglang.srt.model_executor.input_buffers import ForwardInputBuffers
+from sglang.srt.multiplex.pdmux_context import get_current_stream_idx, get_stream_groups
+from sglang.srt.utils import (
+    empty_context,
+    get_available_gpu_memory,
+    get_bool_env_var,
+    is_hip,
+    log_info_on_rank0,
+    require_attn_tp_gather,
+    require_gathered_buffer,
+    require_mlp_sync,
+    require_mlp_tp_gather,
+)
+from sglang.srt.utils.patch_torch import monkey_patch_torch_compile
+from sglang.srt.utils.torch_memory_saver_adapter import TorchMemorySaverAdapter
+
+try:
+    from kt_kernel import KTMoEWrapper
+
+    KTRANSFORMERS_AVAILABLE = True
+except ImportError:
+    KTRANSFORMERS_AVAILABLE = False
+
+_is_hip = is_hip()
+
+logger = logging.getLogger(__name__)
+
+if TYPE_CHECKING:
+    from sglang.srt.model_executor.model_runner import ModelRunner
+
+
+@dataclass
+class DecodeInputBuffers(ForwardInputBuffers):
+
+    input_ids: torch.Tensor
+    input_embeds: torch.Tensor
+    req_pool_indices: torch.Tensor
+    seq_lens: torch.Tensor
+    seq_lens_cpu: torch.Tensor
+    out_cache_loc: torch.Tensor
+    positions: torch.Tensor
+    mrope_positions: torch.Tensor
+    num_token_non_padded: torch.Tensor
+    custom_mask: torch.Tensor
+    next_token_logits_buffer: torch.Tensor
+    mamba_track_indices: Optional[torch.Tensor]
+    mamba_track_mask: Optional[torch.Tensor]
+    global_num_tokens_gpu: torch.Tensor
+    global_num_tokens_for_logprob_gpu: torch.Tensor
+    encoder_lens: Optional[torch.Tensor]
+    pp_proxy_tensors: Optional[Dict[str, torch.Tensor]]
+
+    @classmethod
+    def create(
+        cls,
+        *,
+        device: torch.device,
+        max_bs: int,
+        max_num_token: int,
+        hidden_size: int,
+        vocab_size: int,
+        dtype: torch.dtype,
+        dp_size: int,
+        pp_size: int,
+        is_encoder_decoder: bool,
+        require_mlp_tp_gather: bool,
+        seq_len_fill_value: int,
+        encoder_len_fill_value: int,
+        num_tokens_per_bs: int,
+        cache_loc_dtype: torch.dtype,
+        enable_mamba_track: bool,
+    ) -> "DecodeInputBuffers":
+        with torch.device(device):
+            input_ids = torch.zeros((max_num_token,), dtype=torch.int64)
+            input_embeds = torch.zeros((max_num_token, hidden_size), dtype=dtype)
+            req_pool_indices = torch.zeros((max_bs,), dtype=torch.int32)
+            seq_lens = torch.full((max_bs,), seq_len_fill_value, dtype=torch.int32)
+            out_cache_loc = torch.zeros((max_num_token,), dtype=cache_loc_dtype)
+            positions = torch.zeros((max_num_token,), dtype=torch.int64)
+            mrope_positions = torch.zeros((3, max_num_token), dtype=torch.int64)
+            num_token_non_padded = torch.zeros((1,), dtype=torch.int32)
+            custom_mask = torch.ones(
+                (max_bs * seq_len_fill_value + max_num_token) * num_tokens_per_bs,
+                dtype=torch.bool,
+            )
+            next_token_logits_buffer = torch.zeros(
+                (max_num_token, vocab_size),
+                dtype=torch.float,
+            )
+            mamba_track_indices = (
+                torch.zeros((max_bs,), dtype=torch.int64)
+                if enable_mamba_track
+                else None
+            )
+            mamba_track_mask = (
+                torch.zeros((max_bs,), dtype=torch.bool) if enable_mamba_track else None
+            )
+
+            if pp_size > 1:
+                pp_proxy_tensors = {
+                    "hidden_states": torch.zeros((max_bs, hidden_size), dtype=dtype),
+                    "residual": torch.zeros((max_bs, hidden_size), dtype=dtype),
+                }
+            else:
+                pp_proxy_tensors = None
+
+            if is_encoder_decoder:
+                encoder_lens = torch.full(
+                    (max_bs,), encoder_len_fill_value, dtype=torch.int32
+                )
+            else:
+                encoder_lens = None
+
+            if require_mlp_tp_gather:
+                global_num_tokens_gpu = torch.zeros((dp_size,), dtype=torch.int32)
+                global_num_tokens_for_logprob_gpu = torch.zeros(
+                    (dp_size,), dtype=torch.int32
+                )
+            else:
+                global_num_tokens_gpu = torch.zeros((1,), dtype=torch.int32)
+                global_num_tokens_for_logprob_gpu = torch.zeros((1,), dtype=torch.int32)
+
+        # Keep seq_lens_cpu as a true CPU tensor, like the old implementation.
+        seq_lens_cpu = torch.full(
+            (max_bs,),
+            seq_len_fill_value,
+            dtype=torch.int32,
+            device="cpu",
+        )
+
+        return cls(
+            input_ids=input_ids,
+            input_embeds=input_embeds,
+            req_pool_indices=req_pool_indices,
+            seq_lens=seq_lens,
+            seq_lens_cpu=seq_lens_cpu,
+            out_cache_loc=out_cache_loc,
+            positions=positions,
+            mrope_positions=mrope_positions,
+            num_token_non_padded=num_token_non_padded,
+            custom_mask=custom_mask,
+            next_token_logits_buffer=next_token_logits_buffer,
+            mamba_track_indices=mamba_track_indices,
+            mamba_track_mask=mamba_track_mask,
+            encoder_lens=encoder_lens,
+            global_num_tokens_gpu=global_num_tokens_gpu,
+            global_num_tokens_for_logprob_gpu=global_num_tokens_for_logprob_gpu,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+
+    def populate_from_forward_batch(
+        self,
+        *,
+        forward_batch: ForwardBatch,
+        raw_bs: int,
+        raw_num_token: int,
+        bs: int,
+        seq_len_fill_value: int,
+        require_gathered_buffer: bool,
+        num_tokens_per_bs: int,
+        nsa_enable_prefill_cp: bool,
+        enable_num_token_non_padded_flag: bool,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ):
+        if bs != raw_bs:
+            self.seq_lens.fill_(seq_len_fill_value)
+            self.out_cache_loc.zero_()
+            if self.mamba_track_indices is not None:
+                self.mamba_track_indices.zero_()
+            if self.mamba_track_mask is not None:
+                self.mamba_track_mask.fill_(False)
+
+        # Common inputs
+        self.input_ids[:raw_num_token].copy_(forward_batch.input_ids)
+        self.req_pool_indices[:raw_bs].copy_(forward_batch.req_pool_indices)
+        self.seq_lens[:raw_bs].copy_(forward_batch.seq_lens)
+        self.out_cache_loc[:raw_num_token].copy_(forward_batch.out_cache_loc)
+        self.positions[:raw_num_token].copy_(forward_batch.positions)
+
+        if (
+            self.mamba_track_indices is not None
+            and forward_batch.mamba_track_indices is not None
+        ):
+            self.mamba_track_indices[:raw_bs].copy_(forward_batch.mamba_track_indices)
+        if (
+            self.mamba_track_mask is not None
+            and forward_batch.mamba_track_mask is not None
+        ):
+            self.mamba_track_mask[:raw_bs].copy_(forward_batch.mamba_track_mask)
+
+        if forward_batch.seq_lens_cpu is not None:
+            if bs != raw_bs:
+                self.seq_lens_cpu.fill_(seq_len_fill_value)
+            self.seq_lens_cpu[:raw_bs].copy_(forward_batch.seq_lens_cpu)
+
+        if self.encoder_lens is not None and forward_batch.encoder_lens is not None:
+            self.encoder_lens[:raw_bs].copy_(forward_batch.encoder_lens)
+
+        if forward_batch.mrope_positions is not None:
+            self.mrope_positions[:, :raw_num_token].copy_(forward_batch.mrope_positions)
+
+        if require_gathered_buffer:
+            self.global_num_tokens_gpu.fill_(bs * num_tokens_per_bs)
+            self.global_num_tokens_for_logprob_gpu.fill_(bs * num_tokens_per_bs)
+
+        if enable_num_token_non_padded_flag:
+            if require_gathered_buffer and not nsa_enable_prefill_cp:
+                num_tokens_per_dp = bs * num_tokens_per_bs
+                local = compute_local_num_token_non_padded(
+                    global_num_token_non_padded=forward_batch.num_token_non_padded,
+                    num_tokens_per_dp=num_tokens_per_dp,
+                )
+                self.num_token_non_padded.copy_(local)
+            else:
+                self.num_token_non_padded.copy_(forward_batch.num_token_non_padded)
+
+        # Pipeline-parallel proxy tensors.
+        if pp_proxy_tensors is not None and self.pp_proxy_tensors is not None:
+            for key, buf in self.pp_proxy_tensors.items():
+                src = pp_proxy_tensors.tensors[key]
+                dim = src.shape[0]
+                buf[:dim].copy_(src)
+
+
+# Detect whether the current forward pass is in capture mode
+is_capture_mode = False
+
+
+def get_is_capture_mode():
+    return is_capture_mode
+
+
+@contextmanager
+def model_capture_mode():
+    global is_capture_mode
+    is_capture_mode = True
+
+    yield
+
+    is_capture_mode = False
+
+
+@contextmanager
+def freeze_gc(enable_cudagraph_gc: bool):
+    """
+    Optimize garbage collection during CUDA graph capture.
+    Clean up, then freeze all remaining objects from being included
+    in future collections if GC is disabled during capture.
+    """
+    gc.collect()
+    should_freeze = not enable_cudagraph_gc
+    if should_freeze:
+        gc.freeze()
+    try:
+        yield
+    finally:
+        if should_freeze:
+            gc.unfreeze()
+            gc.collect()
+
+
+def _to_torch(model: torch.nn.Module, reverse: bool, num_tokens: int):
+    for sub in model._modules.values():
+        if isinstance(sub, MultiPlatformOp):
+            if reverse:
+                sub.leave_torch_compile()
+            else:
+                sub.enter_torch_compile(num_tokens=num_tokens)
+        if isinstance(sub, torch.nn.Module):
+            _to_torch(sub, reverse, num_tokens)
+
+
+@contextmanager
+def patch_model(
+    model: torch.nn.Module,
+    enable_compile: bool,
+    num_tokens: int,
+    tp_group: GroupCoordinator,
+):
+    """Patch the model to make it compatible with with torch.compile"""
+    backup_ca_comm = None
+
+    try:
+        if enable_compile:
+            _to_torch(model, reverse=False, num_tokens=num_tokens)
+            backup_ca_comm = tp_group.ca_comm
+            # Use custom-allreduce here.
+            # We found the custom allreduce is much faster than the built-in allreduce in torch,
+            # even with ENABLE_INTRA_NODE_COMM=1.
+            # tp_group.ca_comm = None
+            yield torch.compile(
+                torch.no_grad()(model.forward),
+                mode=os.environ.get(
+                    "SGLANG_TORCH_COMPILE_MODE", "max-autotune-no-cudagraphs"
+                ),
+                dynamic=_is_hip and get_bool_env_var("SGLANG_TORCH_DYNAMIC_SHAPE"),
+            )
+        else:
+            yield model.forward
+    finally:
+        if enable_compile:
+            _to_torch(model, reverse=True, num_tokens=num_tokens)
+            tp_group.ca_comm = backup_ca_comm
+
+
+def set_torch_compile_config():
+    import torch._dynamo.config
+    import torch._inductor.config
+
+    torch._inductor.config.coordinate_descent_tuning = True
+    torch._inductor.config.triton.unique_kernel_names = True
+    torch._inductor.config.fx_graph_cache = True  # Experimental feature to reduce compilation times, will be on by default in future
+
+    # FIXME: tmp workaround
+    torch._dynamo.config.accumulated_cache_size_limit = 1024
+    if hasattr(torch._dynamo.config, "cache_size_limit"):
+        torch._dynamo.config.cache_size_limit = 1024
+
+    monkey_patch_torch_compile()
+
+
+def get_batch_sizes_to_capture(model_runner: ModelRunner, num_tokens_per_bs=1):
+    server_args = model_runner.server_args
+    capture_bs = server_args.cuda_graph_bs
+
+    if max(capture_bs) > model_runner.req_to_token_pool.size:
+        # In some cases (e.g., with a small GPU or --max-running-requests), the #max-running-requests
+        # is very small. We add more values here to make sure we capture the maximum bs.
+        capture_bs += [model_runner.req_to_token_pool.size]
+
+    mul_base = 1
+
+    if server_args.enable_two_batch_overlap:
+        mul_base *= 2
+        num_tokens_per_bs = 1  # tbo not test, set num_tokens_per_bs to 1
+
+    if require_gathered_buffer(server_args):
+        mul_base *= get_attention_tp_size()
+
+    if mul_base % get_attention_cp_size() != 0:
+        mul_base *= get_attention_cp_size()
+
+    # Model input token count = bs * num_tokens_per_bs; must be a multiple of attn_tp_size.
+    capture_bs = [bs for bs in capture_bs if bs * num_tokens_per_bs % mul_base == 0]
+
+    capture_bs = [bs for bs in capture_bs if bs <= model_runner.req_to_token_pool.size]
+    capture_bs = list(sorted(set(capture_bs)))
+    assert len(capture_bs) > 0 and capture_bs[0] > 0, f"{capture_bs=}"
+    compile_bs = (
+        [bs for bs in capture_bs if bs <= server_args.torch_compile_max_bs]
+        if server_args.enable_torch_compile
+        else []
+    )
+    return capture_bs, compile_bs
+
+
+# Reuse this memory pool across all cuda graph runners.
+global_graph_memory_pool = None
+
+
+def get_global_graph_memory_pool():
+    return global_graph_memory_pool
+
+
+def set_global_graph_memory_pool(val):
+    global global_graph_memory_pool
+    global_graph_memory_pool = val
+
+
+class CudaGraphRunner:
+    """A CudaGraphRunner runs the forward pass of a model with cuda graph and torch.compile."""
+
+    def __init__(self, model_runner: ModelRunner):
+        # Parse args
+        self.model_runner = model_runner
+        self.device = model_runner.device
+        self.device_module = torch.get_device_module(self.device)
+        self.graphs = {}
+        self.output_buffers = {}
+        self.enable_torch_compile = model_runner.server_args.enable_torch_compile
+        self.disable_padding = model_runner.server_args.disable_cuda_graph_padding
+        self.is_encoder_decoder = model_runner.model_config.is_encoder_decoder
+        self.require_gathered_buffer = require_gathered_buffer(model_runner.server_args)
+        self.require_mlp_tp_gather = require_mlp_tp_gather(model_runner.server_args)
+        self.require_mlp_sync = require_mlp_sync(model_runner.server_args)
+        self.require_attn_tp_gather = require_attn_tp_gather(model_runner.server_args)
+        self.enable_two_batch_overlap = (
+            model_runner.server_args.enable_two_batch_overlap
+        )
+        self.speculative_algorithm = model_runner.server_args.speculative_algorithm
+        self.enable_profile_cuda_graph = (
+            model_runner.server_args.enable_profile_cuda_graph
+        )
+        self.tp_size = model_runner.server_args.tp_size
+        self.dp_size = model_runner.server_args.dp_size
+        self.pp_size = model_runner.server_args.pp_size
+        self.enable_pdmux = model_runner.server_args.enable_pdmux
+
+        self.attn_tp_size = get_attention_tp_size()
+        self.attn_tp_rank = get_attention_tp_rank()
+        self.nsa_enable_prefill_cp = is_nsa_enable_prefill_cp()
+
+        self.deepep_adapter = DeepEPCudaGraphRunnerAdapter()
+
+        self.dllm_config = DllmConfig.from_server_args(model_runner.server_args)
+        self.is_dllm = self.dllm_config is not None
+
+        self.capture_forward_mode = ForwardMode.DECODE
+        self.capture_hidden_mode = CaptureHiddenMode.NULL
+        self.num_tokens_per_bs = 1
+        if (
+            model_runner.spec_algorithm.is_eagle()
+            or model_runner.spec_algorithm.is_standalone()
+            or model_runner.spec_algorithm.is_ngram()
+        ):
+            if self.model_runner.is_draft_worker:
+                raise RuntimeError("This should not happen")
+            else:
+                self.capture_forward_mode = ForwardMode.TARGET_VERIFY
+                self.num_tokens_per_bs = (
+                    self.model_runner.server_args.speculative_num_draft_tokens
+                )
+        elif self.is_dllm:
+            self.capture_forward_mode = ForwardMode.DLLM_EXTEND
+            self.num_tokens_per_bs = self.dllm_config.block_size
+
+        # Batch sizes to capture
+        self.capture_bs, self.compile_bs = get_batch_sizes_to_capture(
+            model_runner, self.num_tokens_per_bs
+        )
+        log_info_on_rank0(logger, f"Capture cuda graph bs {self.capture_bs}")
+        if KTRANSFORMERS_AVAILABLE:
+            KTMoEWrapper.set_capture_batch_sizes(self.capture_bs)
+
+        # If returning hidden states is enabled, set initial capture hidden mode to full to avoid double-capture on startup
+        if model_runner.server_args.enable_return_hidden_states:
+            self.capture_hidden_mode = CaptureHiddenMode.FULL
+
+        # Attention backend
+        self.max_bs = max(self.capture_bs)
+        self.max_num_token = self.max_bs * self.num_tokens_per_bs
+        self.model_runner.attn_backend.init_cuda_graph_state(
+            self.max_bs, self.max_num_token
+        )
+
+        # Init PDMux if needed
+        self.maybe_init_pdmux()
+        self.seq_len_fill_value = (
+            self.model_runner.attn_backend.get_cuda_graph_seq_len_fill_value()
+            if self.dllm_config is None
+            else self.dllm_config.block_size
+        )
+
+        self.encoder_len_fill_value = 0
+
+        if self.enable_torch_compile:
+            set_torch_compile_config()
+
+        if self.model_runner.server_args.enable_lora:
+            self.model_runner.lora_manager.init_cuda_graph_batch_info(
+                max_bs_in_cuda_graph=self.max_bs,
+                num_tokens_per_bs=self.num_tokens_per_bs,
+            )
+
+        enable_mamba_track = (
+            self.model_runner.server_args.enable_mamba_extra_buffer()
+            and self.model_runner.spec_algorithm.is_none()
+        )
+
+        if self.require_gathered_buffer:
+            assert self.require_mlp_tp_gather or self.require_attn_tp_gather
+        self.buffers: DecodeInputBuffers = DecodeInputBuffers.create(
+            device=self.device,
+            max_bs=self.max_bs,
+            max_num_token=self.max_num_token,
+            hidden_size=self.model_runner.model_config.hidden_size,
+            vocab_size=self.model_runner.model_config.vocab_size,
+            dtype=self.model_runner.model_config.dtype,
+            dp_size=self.dp_size,
+            pp_size=self.pp_size,
+            is_encoder_decoder=self.is_encoder_decoder,
+            require_mlp_tp_gather=self.require_mlp_tp_gather,
+            seq_len_fill_value=self.seq_len_fill_value,
+            encoder_len_fill_value=self.encoder_len_fill_value,
+            num_tokens_per_bs=self.num_tokens_per_bs,
+            cache_loc_dtype=self._cache_loc_dtype(),
+            enable_mamba_track=enable_mamba_track,
+        )
+        self.buffers.share_buffers()
+
+        self.tbo_plugin = TboCudaGraphRunnerPlugin()
+
+        # Speculative_inference
+        if (
+            model_runner.spec_algorithm.is_eagle3()
+            and model_runner.eagle_use_aux_hidden_state
+        ):
+            self.model_runner.model.set_eagle3_layers_to_capture()
+
+        # Capture
+        try:
+            with model_capture_mode():
+                self.capture()
+        except RuntimeError as e:
+            raise Exception(
+                f"Capture cuda graph failed: {e}\n{CUDA_GRAPH_CAPTURE_FAILED_MSG}"
+            )
+
+    def maybe_init_pdmux(self):
+        if self.enable_pdmux:
+            self.stream_groups = get_stream_groups()
+            for attn_backend in self.model_runner.decode_attn_backend_group:
+                attn_backend.init_cuda_graph_state(self.max_bs, self.max_num_token)
+
+    def _cache_loc_dtype(self):
+        return torch.int64
+
+    def can_run(self, forward_batch: ForwardBatch):
+        if self.require_mlp_tp_gather:
+            cuda_graph_bs = (
+                max(forward_batch.global_num_tokens_cpu) // self.num_tokens_per_bs
+                if self.model_runner.spec_algorithm.is_eagle()
+                or self.model_runner.spec_algorithm.is_standalone()
+                else max(forward_batch.global_num_tokens_cpu)
+            )
+        else:
+            cuda_graph_bs = forward_batch.batch_size
+
+        graph_key = cuda_graph_bs
+        if self.enable_pdmux:
+            graph_key = f"{get_current_stream_idx()}_{cuda_graph_bs}"
+
+        is_bs_supported = (
+            graph_key in self.graphs
+            if self.disable_padding
+            else cuda_graph_bs <= self.max_bs
+        )
+
+        if self.require_mlp_sync:
+            is_bs_supported = is_bs_supported and forward_batch.can_run_dp_cuda_graph
+
+        # NOTE: cuda graph cannot handle mixed batch (encoder_len = 0)
+        # If mixed batch cannot be supported, then encoder_lens can be removed in cuda graph
+        # because the full_text_row_masked_out_mask tensor will always be ones
+        is_encoder_lens_supported = (
+            torch.all(forward_batch.encoder_lens > 0)
+            if self.is_encoder_decoder
+            else True
+        )
+
+        requested_capture_hidden_mode = max(
+            forward_batch.capture_hidden_mode,
+            (
+                forward_batch.spec_info.capture_hidden_mode
+                if getattr(forward_batch.spec_info, "capture_hidden_mode", None)
+                is not None
+                else CaptureHiddenMode.NULL
+            ),
+        )
+        capture_hidden_mode_matches = (
+            requested_capture_hidden_mode == CaptureHiddenMode.NULL
+            or requested_capture_hidden_mode == self.capture_hidden_mode
+        )
+        is_tbo_supported = (
+            forward_batch.can_run_tbo if self.enable_two_batch_overlap else True
+        )
+
+        is_ngram_supported = (
+            (
+                forward_batch.batch_size * self.num_tokens_per_bs
+                == forward_batch.input_ids.numel()
+            )
+            if self.model_runner.spec_algorithm.is_ngram()
+            else True
+        )
+
+        return (
+            is_bs_supported
+            and is_encoder_lens_supported
+            and is_tbo_supported
+            and capture_hidden_mode_matches
+            and is_ngram_supported
+        )
+
+    def _init_profile_context_and_memory_record(self):
+        profile_context = profile(
+            activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+            record_shapes=True,
+        )
+        torch.cuda.memory._record_memory_history()
+        return profile_context
+
+    def _post_process_after_profile(self, prof_context):
+        torch.cuda.memory._dump_snapshot(f"cuda_graph_runner_memory_usage.pickle")
+        torch.cuda.memory._record_memory_history(enabled=None)
+        log_message = (
+            "Sorted by CUDA Time:\n"
+            + prof_context.key_averages(group_by_input_shape=True).table(
+                sort_by="cuda_time_total", row_limit=10
+            )
+            + "\n\nSorted by CPU Time:\n"
+            + prof_context.key_averages(group_by_input_shape=True).table(
+                sort_by="cpu_time_total", row_limit=10
+            )
+            + "\n\nMemory Usage is saved to cuda_graph_runner_memory_usage.pickle\n"
+        )
+        logger.info(log_message)
+
+    def capture(self) -> None:
+        profile_context = empty_context()
+        if self.enable_profile_cuda_graph:
+            profile_context = self._init_profile_context_and_memory_record()
+
+        def _capture_one_stream(stream_idx: Optional[int] = None):
+            avail_mem = get_available_gpu_memory(
+                self.model_runner.device,
+                self.model_runner.gpu_id,
+                empty_cache=False,
+            )
+            # Reverse the order to enable better memory sharing across cuda graphs.
+            capture_range = (
+                tqdm.tqdm(list(reversed(self.capture_bs)))
+                if get_tensor_model_parallel_rank() == 0
+                else reversed(self.capture_bs)
+            )
+            for i, bs in enumerate(capture_range):
+                if get_tensor_model_parallel_rank() == 0:
+                    avail_mem = get_available_gpu_memory(
+                        self.model_runner.device,
+                        self.model_runner.gpu_id,
+                        empty_cache=False,
+                    )
+                    capture_range.set_description(
+                        f"Capturing batches ({bs=} {avail_mem=:.2f} GB)"
+                    )
+
+                with patch_model(
+                    self.model_runner.model,
+                    bs in self.compile_bs,
+                    num_tokens=bs * self.num_tokens_per_bs,
+                    tp_group=self.model_runner.tp_group,
+                ) as forward:
+                    (
+                        graph,
+                        output_buffers,
+                    ) = self.capture_one_batch_size(bs, forward, stream_idx)
+                    # For pd_multiplexing, we need to save the graph and output buffers
+                    key = bs if stream_idx is None else f"{stream_idx}_{bs}"
+                    self.graphs[key] = graph
+                    self.output_buffers[key] = output_buffers
+
+        # Trigger CUDA graph capture for specific shapes.
+        # Capture the large shapes first so that the smaller shapes
+        # can reuse the memory pool allocated for the large shapes.
+        with freeze_gc(self.model_runner.server_args.enable_cudagraph_gc):
+            if not self.enable_pdmux:
+                with graph_capture() as graph_capture_context, profile_context as prof:
+                    self.stream = graph_capture_context.stream
+                    _capture_one_stream()
+            else:
+                set_pdmux_status(False)
+                for i, sg in enumerate(self.stream_groups):
+                    with graph_capture(
+                        stream=sg[1]
+                    ) as graph_capture_context, profile_context as prof:
+                        self.stream = graph_capture_context.stream
+                        _capture_one_stream(i)
+
+        if self.enable_profile_cuda_graph:
+            self._post_process_after_profile(prof)
+
+    def _capture_graph(self, graph, pool, stream, run_once_fn):
+        memory_saver_adapter = TorchMemorySaverAdapter.create(
+            enable=self.model_runner.server_args.enable_memory_saver
+            and get_bool_env_var("SGLANG_MEMORY_SAVER_CUDA_GRAPH")
+        )
+        graph_fn = (
+            partial(memory_saver_adapter.cuda_graph, tag=GPU_MEMORY_TYPE_CUDA_GRAPH)
+            if memory_saver_adapter.enabled
+            else self.device_module.graph
+        )
+        with graph_fn(cuda_graph=graph, pool=pool, stream=stream):
+            out = run_once_fn()
+        return out
+
+    def _create_device_graph(self):
+        return torch.cuda.CUDAGraph()
+
+    def capture_one_batch_size(
+        self, bs: int, forward: Callable, stream_idx: Optional[int] = None
+    ):
+        buffers: DecodeInputBuffers = self.buffers
+        graph = self._create_device_graph()
+        stream = self.stream
+        num_tokens = bs * self.num_tokens_per_bs
+
+        # Graph inputs
+        input_ids = buffers.input_ids[:num_tokens]
+        req_pool_indices = buffers.req_pool_indices[:bs]
+        seq_lens = buffers.seq_lens[:bs]
+        seq_lens_cpu = buffers.seq_lens_cpu[:bs]
+        out_cache_loc = buffers.out_cache_loc[:num_tokens]
+        positions = buffers.positions[:num_tokens]
+        if self.is_encoder_decoder:
+            encoder_lens = buffers.encoder_lens[:bs]
+        else:
+            encoder_lens = None
+        mrope_positions = buffers.mrope_positions[:, :num_tokens]
+        next_token_logits_buffer = buffers.next_token_logits_buffer[:num_tokens]
+        buffers.num_token_non_padded[...] = num_tokens
+
+        # pipeline parallelism
+        if self.pp_size > 1:
+            pp_proxy_tensors = PPProxyTensors(
+                {k: v[:num_tokens] for k, v in buffers.pp_proxy_tensors.items()}
+            )
+
+        if self.require_mlp_tp_gather:
+            buffers.global_num_tokens_gpu.copy_(
+                torch.tensor(
+                    [num_tokens] * self.dp_size,
+                    dtype=torch.int32,
+                    device=input_ids.device,
+                )
+            )
+            buffers.global_num_tokens_for_logprob_gpu.copy_(
+                torch.tensor(
+                    [num_tokens] * self.dp_size,
+                    dtype=torch.int32,
+                    device=input_ids.device,
+                )
+            )
+            global_dp_buffer_len = num_tokens * self.dp_size
+        elif self.require_attn_tp_gather:
+            buffers.global_num_tokens_gpu.copy_(
+                torch.tensor(
+                    [num_tokens],
+                    dtype=torch.int32,
+                    device=input_ids.device,
+                )
+            )
+            buffers.global_num_tokens_for_logprob_gpu.copy_(
+                torch.tensor(
+                    [num_tokens],
+                    dtype=torch.int32,
+                    device=input_ids.device,
+                )
+            )
+            global_dp_buffer_len = num_tokens
+        else:
+            global_dp_buffer_len = None
+
+        spec_info = self.get_spec_info(num_tokens)
+        if self.capture_hidden_mode != CaptureHiddenMode.FULL:
+            self.capture_hidden_mode = (
+                spec_info.capture_hidden_mode if spec_info else CaptureHiddenMode.NULL
+            )
+
+        if self.model_runner.server_args.enable_lora:
+            # It is safe to capture CUDA graph using empty LoRA id, as the LoRA kernels will always be launched whenever
+            # `--enable-lora` is set to True (and return immediately if the LoRA id is empty for perf optimization).
+            lora_ids = [None] * bs
+        else:
+            lora_ids = None
+
+        # mamba state tracking
+        mamba_track_indices = (
+            buffers.mamba_track_indices[:bs]
+            if buffers.mamba_track_indices is not None
+            else None
+        )
+        mamba_track_mask = (
+            buffers.mamba_track_mask[:bs]
+            if buffers.mamba_track_mask is not None
+            else None
+        )
+
+        if stream_idx is None:
+            attn_backend = self.model_runner.attn_backend
+        else:
+            assert self.enable_pdmux
+            attn_backend = self.model_runner.decode_attn_backend_group[stream_idx]
+
+        forward_batch = ForwardBatch(
+            forward_mode=self.capture_forward_mode,
+            batch_size=bs,
+            input_ids=input_ids,
+            req_pool_indices=req_pool_indices,
+            seq_lens=seq_lens,
+            seq_lens_cpu=seq_lens_cpu,
+            next_token_logits_buffer=next_token_logits_buffer,
+            orig_seq_lens=seq_lens,
+            req_to_token_pool=self.model_runner.req_to_token_pool,
+            token_to_kv_pool=self.model_runner.token_to_kv_pool,
+            attn_backend=attn_backend,
+            out_cache_loc=out_cache_loc,
+            seq_lens_sum=seq_lens.sum().item(),
+            mamba_track_indices=mamba_track_indices,
+            mamba_track_mask=mamba_track_mask,
+            mamba_track_seqlens=None,  # Prefill only
+            encoder_lens=encoder_lens,
+            return_logprob=False,
+            positions=positions,
+            global_num_tokens_gpu=buffers.global_num_tokens_gpu,
+            global_num_tokens_for_logprob_gpu=buffers.global_num_tokens_for_logprob_gpu,
+            dp_padding_mode=DpPaddingMode.get_default_mode_in_cuda_graph(),
+            global_dp_buffer_len=global_dp_buffer_len,
+            mrope_positions=mrope_positions,
+            spec_algorithm=self.model_runner.spec_algorithm,
+            spec_info=spec_info,
+            capture_hidden_mode=self.capture_hidden_mode,
+            num_token_non_padded=buffers.num_token_non_padded,
+            global_forward_mode=self.capture_forward_mode,
+            lora_ids=lora_ids,
+        )
+        self.tbo_plugin.capture_one_batch_size(forward_batch, num_tokens=num_tokens)
+
+        if lora_ids is not None:
+            self.model_runner.lora_manager.prepare_lora_batch(forward_batch)
+
+        # Attention backend
+        attn_backend.init_forward_metadata_capture_cuda_graph(
+            bs,
+            num_tokens,
+            req_pool_indices,
+            seq_lens,
+            encoder_lens,
+            forward_batch.forward_mode,
+            forward_batch.spec_info,
+        )
+
+        # Run and capture
+        def run_once():
+            # Clean intermediate result cache for DP attention
+            forward_batch.dp_local_start_pos = forward_batch.dp_local_num_tokens = None
+            set_dp_buffer_len(
+                global_dp_buffer_len,
+                num_tokens,
+                forward_batch.dp_padding_mode.is_max_len(),
+            )
+            set_is_extend_in_batch(False)
+
+            kwargs = {}
+            if (
+                self.pp_size > 1
+                and "pp_proxy_tensors" in inspect.signature(forward).parameters
+            ):
+                kwargs["pp_proxy_tensors"] = PPProxyTensors(
+                    {k: v.clone() for k, v in pp_proxy_tensors.tensors.items()}
+                )
+
+            logits_output_or_pp_proxy_tensors = forward(
+                input_ids,
+                forward_batch.positions,
+                forward_batch,
+                **kwargs,
+            )
+            return logits_output_or_pp_proxy_tensors
+
+        self.deepep_adapter.capture(is_extend_in_batch=False)
+
+        for _ in range(2):
+            self.device_module.synchronize()
+            self.model_runner.tp_group.barrier()
+            run_once()
+
+        if get_global_graph_memory_pool() is None:
+            set_global_graph_memory_pool(self.device_module.graph_pool_handle())
+        # Set graph pool id globally to be able to use symmetric memory
+        set_graph_pool_id(get_global_graph_memory_pool())
+        out = self._capture_graph(
+            graph, get_global_graph_memory_pool(), stream, run_once
+        )
+
+        return graph, out
+
+    def recapture_if_needed(self, forward_batch: ForwardBatch):
+
+        # If the required capture_hidden_mode changes, we need to recapture the graph
+
+        # These are the different factors that can influence the capture_hidden_mode
+        capture_hidden_mode_required_by_forward_batch = (
+            forward_batch.capture_hidden_mode
+        )
+        capture_hidden_mode_required_by_spec_info = (
+            getattr(forward_batch.spec_info, "capture_hidden_mode", None)
+            or CaptureHiddenMode.NULL
+        )
+        capture_hidden_mode_required_for_returning_hidden_states = (
+            CaptureHiddenMode.FULL
+            if self.model_runner.server_args.enable_return_hidden_states
+            else CaptureHiddenMode.NULL
+        )
+
+        # Determine the highest capture_hidden_mode required
+        # (If we have FULL, we can emulate LAST or NULL)
+        # (If we have LAST, we can emulate NULL)
+        required_capture_hidden_mode = max(
+            capture_hidden_mode_required_by_forward_batch,
+            capture_hidden_mode_required_by_spec_info,
+            capture_hidden_mode_required_for_returning_hidden_states,
+        )
+
+        # If the current hidden mode is no longer aligned with the required hidden mode, we need to set it to what is required and re-capture
+        if self.capture_hidden_mode != required_capture_hidden_mode:
+            self.capture_hidden_mode = required_capture_hidden_mode
+            self.capture()
+
+    def replay_prepare(
+        self,
+        forward_batch: ForwardBatch,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ):
+        buffers = self.buffers
+        self.recapture_if_needed(forward_batch)
+
+        raw_bs = forward_batch.batch_size
+        raw_num_token = raw_bs * self.num_tokens_per_bs
+
+        # Pad
+        if self.require_mlp_tp_gather:
+            max_num_tokens = max(forward_batch.global_num_tokens_cpu)
+            max_batch_size = (
+                max_num_tokens / self.num_tokens_per_bs
+                if self.model_runner.spec_algorithm.is_eagle()
+                or self.model_runner.spec_algorithm.is_standalone()
+                else max_num_tokens
+            )
+            index = bisect.bisect_left(self.capture_bs, max_batch_size)
+        else:
+            index = bisect.bisect_left(self.capture_bs, raw_bs)
+        bs = self.capture_bs[index]
+
+        buffers.populate_from_forward_batch(
+            forward_batch=forward_batch,
+            raw_bs=raw_bs,
+            raw_num_token=raw_num_token,
+            bs=bs,
+            seq_len_fill_value=self.seq_len_fill_value,
+            require_gathered_buffer=self.require_gathered_buffer,
+            num_tokens_per_bs=self.num_tokens_per_bs,
+            nsa_enable_prefill_cp=self.nsa_enable_prefill_cp,
+            enable_num_token_non_padded_flag=enable_num_token_non_padded(
+                self.model_runner.server_args
+            ),
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+        if self.enable_two_batch_overlap:
+            self.tbo_plugin.replay_prepare(
+                forward_mode=self.capture_forward_mode,
+                bs=bs,
+                num_token_non_padded=len(forward_batch.input_ids),
+                spec_info=forward_batch.spec_info,
+            )
+        if forward_batch.forward_mode.is_idle() and forward_batch.spec_info is not None:
+            forward_batch.spec_info.custom_mask = buffers.custom_mask
+        # Attention backend
+        if self.enable_pdmux:
+            stream_idx = get_current_stream_idx()
+            attn_backend = self.model_runner.decode_attn_backend_group[stream_idx]
+        else:
+            attn_backend = self.model_runner.attn_backend
+        attn_backend.init_forward_metadata_replay_cuda_graph(
+            bs,
+            buffers.req_pool_indices[:bs],
+            buffers.seq_lens[:bs],
+            forward_batch.seq_lens_sum + (bs - raw_bs) * self.seq_len_fill_value,
+            buffers.encoder_lens[:bs] if self.is_encoder_decoder else None,
+            self.capture_forward_mode,
+            forward_batch.spec_info,
+            seq_lens_cpu=buffers.seq_lens_cpu[:bs],
+        )
+
+        # Store fields
+        self.raw_bs = raw_bs
+        self.raw_num_token = raw_num_token
+        self.bs = bs
+
+    def replay(
+        self,
+        forward_batch: ForwardBatch,
+        skip_attn_backend_init: bool = False,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[LogitsProcessorOutput, PPProxyTensors]:
+        self.deepep_adapter.replay()
+
+        if not skip_attn_backend_init:
+            self.replay_prepare(forward_batch, pp_proxy_tensors)
+        else:
+            # In speculative decoding, these two fields are still needed.
+            self.buffers.input_ids[: self.raw_num_token].copy_(forward_batch.input_ids)
+            self.buffers.positions[: self.raw_num_token].copy_(forward_batch.positions)
+
+        # Replay
+        if self.enable_pdmux:
+            graph_key = f"{get_current_stream_idx()}_{self.bs}"
+        else:
+            graph_key = self.bs
+        self.graphs[graph_key].replay()
+        output = self.output_buffers[graph_key]
+
+        if isinstance(output, LogitsProcessorOutput):
+            if self.is_dllm:
+                next_token_logits = None
+                full_logits = output.full_logits[: self.raw_num_token]
+            else:
+                full_logits = None
+                next_token_logits = output.next_token_logits[: self.raw_num_token]
+
+            return LogitsProcessorOutput(
+                next_token_logits=next_token_logits,
+                full_logits=full_logits,
+                hidden_states=(
+                    output.hidden_states[: self.raw_num_token]
+                    if output.hidden_states is not None
+                    else None
+                ),
+                customized_info=output.customized_info,
+            )
+        else:
+            assert isinstance(output, PPProxyTensors)
+            return PPProxyTensors({k: v[: self.bs] for k, v in output.tensors.items()})
+
+    def get_spec_info(self, num_tokens: int):
+        spec_info = None
+        if (
+            self.model_runner.spec_algorithm.is_eagle()
+            or self.model_runner.spec_algorithm.is_standalone()
+        ):
+            from sglang.srt.speculative.eagle_info import EagleVerifyInput
+
+            if self.model_runner.is_draft_worker:
+                raise RuntimeError("This should not happen.")
+            else:
+                spec_info = EagleVerifyInput(
+                    draft_token=None,
+                    custom_mask=self.buffers.custom_mask,
+                    positions=None,
+                    retrive_index=None,
+                    retrive_next_token=None,
+                    retrive_next_sibling=None,
+                    retrive_cum_len=None,
+                    spec_steps=self.model_runner.server_args.speculative_num_steps,
+                    topk=self.model_runner.server_args.speculative_eagle_topk,
+                    draft_token_num=self.model_runner.server_args.speculative_num_draft_tokens,
+                    capture_hidden_mode=CaptureHiddenMode.FULL,
+                    seq_lens_sum=None,
+                    seq_lens_cpu=None,
+                )
+
+        elif self.model_runner.spec_algorithm.is_ngram():
+            from sglang.srt.speculative.ngram_info import NgramVerifyInput
+
+            spec_info = NgramVerifyInput(
+                draft_token=None,
+                tree_mask=self.buffers.custom_mask,
+                positions=None,
+                retrive_index=None,
+                retrive_next_token=None,
+                retrive_next_sibling=None,
+                draft_token_num=self.num_tokens_per_bs,
+            )
+            spec_info.capture_hidden_mode = CaptureHiddenMode.NULL
+
+        return spec_info
+
+
+CUDA_GRAPH_CAPTURE_FAILED_MSG = (
+    "Possible solutions:\n"
+    "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
+    "2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
+    "3. disable torch compile by not using --enable-torch-compile\n"
+    "4. disable CUDA graph by --disable-cuda-graph. (Not recommended. Huge performance loss)\n"
+    "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
+)
+
+
+class DeepEPCudaGraphRunnerAdapter:
+    def __init__(self):
+        # Record DeepEP mode used during capture to ensure replay consistency
+        self._captured_deepep_mode = None
+
+    def capture(self, is_extend_in_batch: bool):
+        if not get_moe_a2a_backend().is_deepep():
+            return
+        self._captured_deepep_mode = get_deepep_mode().resolve(
+            is_extend_in_batch=is_extend_in_batch
+        )
+        DeepEPBuffer.set_dispatch_mode(self._captured_deepep_mode)
+
+    def replay(self):
+        if not get_moe_a2a_backend().is_deepep():
+            return
+        assert self._captured_deepep_mode is not None
+        DeepEPBuffer.set_dispatch_mode(self._captured_deepep_mode)
diff --git a/sglang/python/sglang/srt/model_executor/forward_batch_deepseek_mha_mixin.py b/sglang/python/sglang/srt/model_executor/forward_batch_deepseek_mha_mixin.py
new file mode 100644
index 0000000000000000000000000000000000000000..26a59a6ccc05fead54adec6c70517891fbfd17bf
--- /dev/null
+++ b/sglang/python/sglang/srt/model_executor/forward_batch_deepseek_mha_mixin.py
@@ -0,0 +1,232 @@
+# Mixin class for metadata management of Deepseek MHA forward (chunked prefix cache)
+# More details can be found in python/sglang/srt/models/deepseek_common/attention_forward_methods/forward_mha.py
+
+from typing import List, Optional
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
+
+
+class ForwardBatchDeepSeekMHAMixin:
+    # For MLA chunked prefix cache used in chunked prefill
+    # Tell attention backend whether the kv cache needs to be attended in current pass
+    attn_attend_prefix_cache: Optional[bool] = None
+    # Number of prefix cache chunks
+    num_prefix_chunks: Optional[int] = None
+    # Index of current chunk, used by attention backend
+    prefix_chunk_idx: Optional[int] = None
+    # Maximum number of tokens in each chunk per sequence. Computed from maximum chunk capacity
+    prefix_chunk_len: Optional[int] = None
+    # Start positions of prefix cache for each chunk, (num_prefix_chunks, batch_size)
+    prefix_chunk_starts: Optional[torch.Tensor] = None
+    # Lengths of prefix cache for each chunk, (num_prefix_chunks, batch_size)
+    prefix_chunk_seq_lens: Optional[torch.Tensor] = None
+    # Accumulated lengths of prefix cache for each chunk, (num_prefix_chunks, batch_size + 1)
+    prefix_chunk_cu_seq_lens: Optional[torch.Tensor] = None
+    # Max lengths of prefix cache for each chunk, (num_prefix_chunks,)
+    prefix_chunk_max_seq_lens: Optional[List[int]] = None
+    # Number of tokens in each prefix cache chunk, (num_prefix_chunks,)
+    prefix_chunk_num_tokens: Optional[List[int]] = None
+    # KV Indices for each chunk
+    prefix_chunk_kv_indices: Optional[List[torch.Tensor]] = None
+    # For MLA chunked prefix cache used in chunked prefill
+    # Tell attention backend whether lse needs to be returned
+    mha_return_lse: Optional[bool] = None
+    # Whether to apply MHA_ONE_SHOT forward method
+    mha_one_shot: Optional[bool] = None
+    # KV Indices for MHA_ONE_SHOT forward method
+    mha_one_shot_kv_indices: Optional[torch.Tensor] = None
+
+    def get_max_chunk_capacity(self):
+        # Maximum number of tokens in each chunk
+        # TODO: Should be changed to a better value, maybe passed through server args
+        return 128 * 1024
+
+    def set_prefix_chunk_idx(self, idx: int):
+        self.prefix_chunk_idx = idx
+
+    def set_attn_attend_prefix_cache(self, attn_attend_prefix_cache: bool):
+        self.attn_attend_prefix_cache = attn_attend_prefix_cache
+
+    def prepare_chunked_kv_indices(self, device: torch.device):
+        self.prefix_chunk_kv_indices = []
+        for idx in range(self.num_prefix_chunks):
+            chunk_starts = self.prefix_chunk_starts[idx]
+            chunk_seq_lens = self.prefix_chunk_seq_lens[idx]
+            chunk_cu_seq_lens = self.prefix_chunk_cu_seq_lens[idx]
+            num_chunk_tokens = self.prefix_chunk_num_tokens[idx]
+
+            chunk_kv_indices = torch.empty(
+                num_chunk_tokens, dtype=torch.int32, device=device
+            )
+
+            create_chunked_prefix_cache_kv_indices[(self.batch_size,)](
+                self.req_to_token_pool.req_to_token,
+                self.req_pool_indices,
+                chunk_starts,
+                chunk_seq_lens,
+                chunk_cu_seq_lens,
+                chunk_kv_indices,
+                self.req_to_token_pool.req_to_token.shape[1],
+            )
+            self.prefix_chunk_kv_indices.append(chunk_kv_indices)
+
+    # Here we suppose the length of each chunk is equal
+    # For example, if we have 4 sequences with prefix length [256, 512, 768, 1024], prefix_chunk_len = 256
+    # num_prefix_chunks = cdiv(1024, 256) = 4
+    # prefix_chunk_starts = [[0, 0, 0, 0], [256, 256, 256, 256], [512, 512, 512, 512], [768, 768, 768, 768]]
+    # prefix_chunk_ends = [[256, 256, 256, 256], [256, 512, 512, 512], [256, 512, 768, 768], [256, 512, 768, 1024]]
+    # prefix_chunk_seq_lens = [[256, 256, 256, 256], [0, 256, 256, 256], [0, 0, 256, 256], [0, 0, 0, 256]]
+    # TODO: Implement a better way to allocate chunk lengths that uses memory spaces more efficiently.
+    def get_prefix_chunk_seq_lens(
+        self, prefix_lens: torch.Tensor, num_prefix_chunks: int, prefix_chunk_len: int
+    ):
+        device = prefix_lens.device
+        prefix_chunk_starts = (
+            torch.arange(num_prefix_chunks, device=device, dtype=torch.int32)
+            .unsqueeze(1)
+            .expand(-1, self.batch_size)
+            * prefix_chunk_len
+        )
+        prefix_chunk_ends = torch.min(
+            prefix_lens.unsqueeze(0),
+            prefix_chunk_starts + prefix_chunk_len,
+        ).to(torch.int32)
+
+        prefix_chunk_seq_lens = (
+            (prefix_chunk_ends - prefix_chunk_starts).clamp(min=0).to(torch.int32)
+        )
+
+        return prefix_chunk_starts, prefix_chunk_seq_lens
+
+    # Called before each attention module if using chunked kv cache for prefill
+    # Some of the codes are adapted from https://github.com/vllm-project/vllm/blob/main/vllm/v1/attention/backends/mla/common.py
+    def prepare_chunked_prefix_cache_info(self, device: torch.device):
+
+        from sglang.srt.mem_cache.memory_pool import MLATokenToKVPool
+
+        assert isinstance(
+            self.token_to_kv_pool, MLATokenToKVPool
+        ), "Currently chunked prefix cache can only be used by Deepseek models"
+
+        if not any(self.extend_prefix_lens_cpu):
+            self.num_prefix_chunks = 0
+            return
+
+        if self.prefix_chunk_len is not None:
+            # Chunked kv cache info already prepared by prior modules
+            return
+
+        self.prefix_chunk_idx = -1
+
+        # chunk_capacity is the maximum number of tokens in each chunk
+        chunk_capacity = self.get_max_chunk_capacity()
+        self.prefix_chunk_len = chunk_capacity // self.batch_size
+
+        self.num_prefix_chunks = (
+            max(self.extend_prefix_lens_cpu) + self.prefix_chunk_len - 1
+        ) // self.prefix_chunk_len
+
+        # Here we compute chunk lens twice to avoid stream sync, once on gpu and once on cpu.
+        prefix_chunk_starts_cuda, prefix_chunk_seq_lens_cuda = (
+            self.get_prefix_chunk_seq_lens(
+                self.extend_prefix_lens,
+                self.num_prefix_chunks,
+                self.prefix_chunk_len,
+            )
+        )
+        _, prefix_chunk_seq_lens_cpu = self.get_prefix_chunk_seq_lens(
+            torch.tensor(self.extend_prefix_lens_cpu),
+            self.num_prefix_chunks,
+            self.prefix_chunk_len,
+        )
+        self.prefix_chunk_starts = prefix_chunk_starts_cuda
+        self.prefix_chunk_seq_lens = prefix_chunk_seq_lens_cuda
+
+        # Metadata for attention backend
+        self.prefix_chunk_cu_seq_lens = torch.zeros(
+            self.num_prefix_chunks,
+            self.batch_size + 1,
+            device=device,
+            dtype=torch.int32,
+        )
+        self.prefix_chunk_cu_seq_lens[:, 1:] = prefix_chunk_seq_lens_cuda.cumsum(
+            dim=1
+        ).to(torch.int32)
+        self.prefix_chunk_max_seq_lens = prefix_chunk_seq_lens_cpu.max(
+            dim=1
+        ).values.tolist()
+
+        self.prefix_chunk_num_tokens = prefix_chunk_seq_lens_cpu.sum(dim=1).tolist()
+        assert max(self.prefix_chunk_num_tokens) <= self.get_max_chunk_capacity()
+
+        # Precompute the kv indices for each chunk
+        self.prepare_chunked_kv_indices(device)
+
+    def fetch_mha_one_shot_kv_indices(self):
+        if self.mha_one_shot_kv_indices is not None:
+            return self.mha_one_shot_kv_indices
+        batch_size = self.batch_size
+        paged_kernel_lens_sum = sum(self.seq_lens_cpu)
+        kv_indices = torch.empty(
+            paged_kernel_lens_sum,
+            dtype=torch.int32,
+            device=self.req_pool_indices.device,
+        )
+        kv_indptr = torch.zeros(
+            batch_size + 1,
+            dtype=torch.int32,
+            device=self.req_pool_indices.device,
+        )
+        kv_indptr[1:] = torch.cumsum(self.seq_lens, dim=0)
+        create_flashinfer_kv_indices_triton[(self.batch_size,)](
+            self.req_to_token_pool.req_to_token,
+            self.req_pool_indices,
+            self.seq_lens,
+            kv_indptr,
+            None,
+            kv_indices,
+            self.req_to_token_pool.req_to_token.shape[1],
+        )
+        self.mha_one_shot_kv_indices = kv_indices
+        return kv_indices
+
+
+@triton.jit
+def create_chunked_prefix_cache_kv_indices(
+    req_to_token_ptr,  # (max_batch, max_context_len,)
+    req_pool_indices_ptr,  # (batch_size,)
+    chunk_start_idx_ptr,  # (batch_size,)
+    chunk_seq_lens_ptr,  # (batch_size,)
+    chunk_cu_seq_lens_ptr,  # (batch_size + 1,)
+    chunk_kv_indices_ptr,  # (num_chunk_tokens,)
+    req_to_token_ptr_stride: tl.constexpr,
+):
+    BLOCK_SIZE: tl.constexpr = 512
+    pid = tl.program_id(axis=0)
+
+    # find the req pool idx, this is for batch to token
+    req_pool_index = tl.load(req_pool_indices_ptr + pid)
+    chunk_kv_indices_offset = tl.load(chunk_cu_seq_lens_ptr + pid)
+
+    # get the token positions of current chunk
+    chunk_start_pos = tl.load(chunk_start_idx_ptr + pid).to(tl.int32)
+    chunk_seq_len = tl.load(chunk_seq_lens_ptr + pid).to(tl.int32)
+
+    num_loop = tl.cdiv(chunk_seq_len, BLOCK_SIZE)
+    for i in range(num_loop):
+        offset = tl.arange(0, BLOCK_SIZE) + i * BLOCK_SIZE
+        mask = offset < chunk_seq_len
+        data = tl.load(
+            req_to_token_ptr
+            + req_pool_index * req_to_token_ptr_stride
+            + chunk_start_pos
+            + offset,
+            mask=mask,
+        )
+        tl.store(
+            chunk_kv_indices_ptr + chunk_kv_indices_offset + offset, data, mask=mask
+        )
diff --git a/sglang/python/sglang/srt/model_executor/forward_batch_info.py b/sglang/python/sglang/srt/model_executor/forward_batch_info.py
new file mode 100644
index 0000000000000000000000000000000000000000..aaeb4fa7f41f5a91c515e0333e2ed4b67c0e2c8f
--- /dev/null
+++ b/sglang/python/sglang/srt/model_executor/forward_batch_info.py
@@ -0,0 +1,1105 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+Store information about a forward batch.
+
+The following is the flow of data structures for a batch:
+
+ScheduleBatch -> ModelWorkerBatch -> ForwardBatch
+
+- ScheduleBatch is managed by `scheduler.py::Scheduler`.
+  It contains high-level scheduling data. Most of the data is on the CPU.
+- ModelWorkerBatch is managed by `tp_worker.py::TpModelWorker`.
+  It is a subset of `ScheduleBatch` that only contains data related to the model forward on GPU.
+  It will be transformed from CPU scheduler to GPU model runner.
+- ForwardBatch is managed by `model_runner.py::ModelRunner`.
+  It contains low-level tensor data. Most of the data consists of GPU tensors.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from enum import IntEnum, auto
+from functools import total_ordering
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.distributed.parallel_state import (
+    get_moe_expert_parallel_world_size,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.layers.attention.nsa.utils import NSAContextParallelMetadata
+from sglang.srt.layers.dp_attention import (
+    DpPaddingMode,
+    get_attention_cp_size,
+    get_attention_dp_rank,
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    set_dp_buffer_len,
+    set_is_extend_in_batch,
+)
+from sglang.srt.model_executor.forward_batch_deepseek_mha_mixin import (
+    ForwardBatchDeepSeekMHAMixin,
+)
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import get_compiler_backend, is_hip, is_npu, support_triton
+from sglang.srt.utils.common import ceil_align
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
+    from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+    from sglang.srt.managers.schedule_batch import ModelWorkerBatch, MultimodalInputs
+    from sglang.srt.mem_cache.memory_pool import KVCache, ReqToTokenPool
+    from sglang.srt.model_executor.model_runner import ModelRunner
+    from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
+    from sglang.srt.speculative.spec_info import SpecInput, SpeculativeAlgorithm
+
+_is_npu = is_npu()
+
+
+class ForwardMode(IntEnum):
+    # Extend a sequence. The KV cache of the beginning part of the sequence is already computed (e.g., system prompt).
+    # It is also called "prefill" in common terminology.
+    EXTEND = auto()
+    # Decode one token.
+    DECODE = auto()
+    # Contains both EXTEND and DECODE when doing chunked prefill.
+    MIXED = auto()
+    # No sequence to forward. For data parallel attention, some workers will be IDLE if no sequence are allocated.
+    IDLE = auto()
+
+    # Used in speculative decoding: verify a batch in the target model.
+    TARGET_VERIFY = auto()
+    # Used in speculative decoding: extend a batch in the draft model.
+    DRAFT_EXTEND = auto()
+
+    DRAFT_EXTEND_V2 = auto()
+
+    # Used in disaggregated decode worker
+    # Represent a batch of requests having their KV cache ready to start decoding
+    PREBUILT = auto()
+
+    # Split Prefill for PD multiplexing
+    SPLIT_PREFILL = auto()
+
+    # Used in dLLM
+    DLLM_EXTEND = auto()
+
+    def is_prefill(self):
+        return self.is_extend()
+
+    def is_extend(self, include_draft_extend_v2: bool = False):
+        return (
+            self == ForwardMode.EXTEND
+            or self == ForwardMode.MIXED
+            or self == ForwardMode.DRAFT_EXTEND
+            or (include_draft_extend_v2 and self == ForwardMode.DRAFT_EXTEND_V2)
+            or self == ForwardMode.TARGET_VERIFY
+            or self == ForwardMode.SPLIT_PREFILL
+            or self == ForwardMode.DLLM_EXTEND
+        )
+
+    def is_context_parallel_extend(self, include_draft_extend_v2: bool = False):
+        return (
+            self == ForwardMode.EXTEND
+            or self == ForwardMode.MIXED
+            or (
+                self == ForwardMode.DRAFT_EXTEND_V2
+                if include_draft_extend_v2
+                else False
+            )
+        )
+
+    def is_decode(self):
+        return self == ForwardMode.DECODE
+
+    def is_mixed(self):
+        return self == ForwardMode.MIXED
+
+    def is_idle(self):
+        return self == ForwardMode.IDLE
+
+    def is_decode_or_idle(self):
+        return self == ForwardMode.DECODE or self == ForwardMode.IDLE
+
+    def is_target_verify(self):
+        return self == ForwardMode.TARGET_VERIFY
+
+    def is_draft_extend(self, include_v2: bool = False):
+        return self == ForwardMode.DRAFT_EXTEND or (
+            include_v2 and self == ForwardMode.DRAFT_EXTEND_V2
+        )
+
+    def is_draft_extend_v2(self):
+        # For fixed shape logits output in eagle v2 worker
+        return self == ForwardMode.DRAFT_EXTEND_V2
+
+    def is_extend_or_draft_extend_or_mixed(self, include_draft_extend_v2: bool = False):
+        return (
+            self == ForwardMode.EXTEND
+            or self == ForwardMode.DRAFT_EXTEND
+            or self == ForwardMode.MIXED
+            or self == ForwardMode.SPLIT_PREFILL
+            or (include_draft_extend_v2 and self == ForwardMode.DRAFT_EXTEND_V2)
+        )
+
+    def is_cuda_graph(self):
+        return (
+            self == ForwardMode.DECODE
+            or self == ForwardMode.TARGET_VERIFY
+            or self == ForwardMode.IDLE
+            or self == ForwardMode.DLLM_EXTEND
+        )
+
+    def is_cpu_graph(self):
+        return self == ForwardMode.DECODE
+
+    def is_split_prefill(self):
+        return self == ForwardMode.SPLIT_PREFILL
+
+    def is_extend_without_speculative(self):
+        return (
+            self.is_extend()
+            and not self.is_target_verify()
+            and not self.is_draft_extend()
+        )
+
+    def is_prebuilt(self):
+        return self == ForwardMode.PREBUILT
+
+    def is_dllm_extend(self):
+        return self == ForwardMode.DLLM_EXTEND
+
+
+@total_ordering
+class CaptureHiddenMode(IntEnum):
+    # Do not capture anything.
+    NULL = 0
+    # Capture a hidden state of the last token.
+    LAST = 1
+    # Capture hidden states of all tokens.
+    FULL = 2
+
+    def need_capture(self):
+        return self != CaptureHiddenMode.NULL
+
+    def is_full(self):
+        return self == CaptureHiddenMode.FULL
+
+    def is_last(self):
+        return self == CaptureHiddenMode.LAST
+
+    def __lt__(self, other):
+        return self.value < other.value
+
+
+def compute_local_num_token_non_padded(
+    global_num_token_non_padded: torch.Tensor,
+    num_tokens_per_dp: int,
+) -> torch.Tensor:
+    """Compute local non-padded token count for this attention-TP rank.
+
+    Converts a global count (across all TP ranks) to a local count for this rank.
+    The "global" scope is within the current DP rank; DP is handled via num_tokens_per_dp.
+    """
+    attn_tp_rank = get_attention_tp_rank()
+    attn_tp_size = get_attention_tp_size()
+    tokens_per_rank = num_tokens_per_dp // attn_tp_size
+
+    return torch.clamp(
+        global_num_token_non_padded - tokens_per_rank * attn_tp_rank,
+        0,
+        tokens_per_rank,
+    )
+
+
+@dataclass
+class ForwardBatch(ForwardBatchDeepSeekMHAMixin):
+    """Store all inputs of a forward pass."""
+
+    # The forward mode
+    forward_mode: ForwardMode
+    # The batch size
+    batch_size: int
+    # The input ids
+    input_ids: torch.Tensor
+    # The indices of requests in the req_to_token_pool
+    req_pool_indices: torch.Tensor
+    # The sequence length
+    seq_lens: torch.Tensor
+    # The indices of output tokens in the token_to_kv_pool
+    out_cache_loc: torch.Tensor
+
+    # The sum of all sequence lengths
+    seq_lens_sum: int
+
+    # The original sequence length without being chunked. Qwen-1M related.
+    orig_seq_lens: Optional[torch.Tensor] = None
+
+    # The indices of output tokens in the token_to_kv_pool_swa
+    # TODO(shiyang, biao): integrate out_cache_loc_swa into multiple attention backends
+    out_cache_loc_swa: Optional[torch.Tensor] = None
+    # The indices to track mamba state with
+    mamba_track_indices: Optional[torch.Tensor] = None  # shape: [b], int64
+    # The mask to track mamba state if needed
+    mamba_track_mask: Optional[torch.Tensor] = None  # shape: [b], bool
+    # The seqlens to track mamba state if masked, prefill only.
+    mamba_track_seqlens: Optional[torch.Tensor] = None  # shape: [b], int64
+
+    # Optional seq_lens on cpu
+    seq_lens_cpu: Optional[torch.Tensor] = None
+
+    # For logprob
+    return_logprob: bool = False
+    top_logprobs_nums: Optional[List[int]] = None
+    token_ids_logprobs: Optional[List[List[int]]] = None
+
+    # For logits and logprobs post processing
+    next_token_logits_buffer: torch.Tensor = None
+    temp_scaled_logprobs: bool = False
+    temperature: torch.Tensor = None
+    top_p_normalized_logprobs: bool = False
+    top_p: torch.Tensor = None
+
+    # Position information
+    positions: torch.Tensor = None
+
+    # For extend
+    extend_num_tokens: Optional[int] = None
+    extend_seq_lens: Optional[torch.Tensor] = None
+    extend_prefix_lens: Optional[torch.Tensor] = None
+    extend_start_loc: Optional[torch.Tensor] = None
+    extend_prefix_lens_cpu: Optional[List[int]] = None
+    extend_seq_lens_cpu: Optional[List[int]] = None
+    extend_logprob_start_lens_cpu: Optional[List[int]] = None
+    extend_input_logprob_token_ids_gpu: Optional[torch.Tensor] = None
+
+    # For split prefill
+    # intermediate values for split prefill
+    hidden_states: torch.Tensor = None
+    residual: torch.Tensor = None
+    model_specific_states: Dict[str, any] = None
+    # current split index of layer
+    split_index: int = 0
+
+    # For multimodal
+    mm_inputs: Optional[List[MultimodalInputs]] = None
+
+    # Encoder-decoder
+    encoder_cached: Optional[List[bool]] = None
+    encoder_lens: Optional[torch.Tensor] = None
+    encoder_lens_cpu: Optional[List[int]] = None
+    encoder_out_cache_loc: Optional[torch.Tensor] = None
+
+    # For LoRA
+    lora_ids: Optional[List[str]] = None
+
+    # For input embeddings
+    input_embeds: Optional[torch.Tensor] = None
+
+    # For cross-encoder model
+    token_type_ids: Optional[torch.Tensor] = None
+
+    # Sampling info
+    sampling_info: SamplingBatchInfo = None
+
+    # Attention backend
+    req_to_token_pool: ReqToTokenPool = None
+    token_to_kv_pool: KVCache = None
+    attn_backend: AttentionBackend = None
+
+    # For DP attention
+    original_global_num_tokens_cpu: Optional[List[int]] = None
+    global_num_tokens_cpu: Optional[List[int]] = None
+    global_num_tokens_gpu: Optional[torch.Tensor] = None
+    # Has to be None when cuda graph is captured.
+    global_num_tokens_for_logprob_cpu: Optional[List[int]] = None
+    global_num_tokens_for_logprob_gpu: Optional[torch.Tensor] = None
+    # The padding mode for DP attention
+    dp_padding_mode: Optional[DpPaddingMode] = None
+    # for extend, local start pos and num tokens is different in logits processor
+    # this will be computed in get_dp_local_info
+    # this will be recomputed in LogitsMetadata.from_forward_batch
+    dp_local_start_pos: Optional[torch.Tensor] = None  # cached info at runtime
+    dp_local_num_tokens: Optional[torch.Tensor] = None  # cached info at runtime
+    global_dp_buffer_len: Optional[int] = None
+    is_extend_in_batch: bool = False
+    all_extend_in_batch: bool = False
+    can_run_dp_cuda_graph: bool = False
+    global_forward_mode: Optional[ForwardMode] = None
+
+    # Whether this batch is prefill-only (no token generation needed)
+    is_prefill_only: bool = False
+
+    # Speculative decoding
+    spec_info: Optional[SpecInput] = None
+    spec_algorithm: SpeculativeAlgorithm = None
+    mm_input_embeds: Optional[torch.Tensor] = None
+    capture_hidden_mode: CaptureHiddenMode = None
+
+    # For padding
+    padded_static_len: int = -1  # -1 if not padded
+    num_token_non_padded: Optional[torch.Tensor] = None  # scalar tensor
+    num_token_non_padded_cpu: int = None
+
+    # For Qwen2-VL
+    mrope_positions: torch.Tensor = None
+
+    # For two-batch overlap
+    tbo_split_seq_index: Optional[int] = None
+    tbo_parent_token_range: Optional[Tuple[int, int]] = None
+    tbo_padded_len: Optional[int] = None
+    tbo_children: Optional[List[ForwardBatch]] = None
+
+    # For matryoshka embeddings
+    dimensions: Optional[list[int]] = None
+
+    # Record the split metadata of the sequence number of NSA context parallels.
+    nsa_cp_metadata: Optional[NSAContextParallelMetadata] = None
+
+    # For hidden states before normal
+    return_hidden_states_before_norm: bool = False
+
+    # For dumper: request IDs for cross-step sequence tracking
+    rids: Optional[List[str]] = None
+
+    @classmethod
+    def init_new(
+        cls,
+        batch: ModelWorkerBatch,
+        model_runner: ModelRunner,
+    ):
+        ret = cls(
+            forward_mode=batch.forward_mode,
+            batch_size=len(batch.seq_lens),
+            input_ids=batch.input_ids,
+            req_pool_indices=batch.req_pool_indices,
+            seq_lens=batch.seq_lens,
+            out_cache_loc=batch.out_cache_loc,
+            mamba_track_indices=batch.mamba_track_indices,
+            mamba_track_mask=batch.mamba_track_mask,
+            mamba_track_seqlens=batch.mamba_track_seqlens,
+            mm_inputs=batch.multimodal_inputs,
+            encoder_cached=batch.encoder_cached,
+            encoder_lens=batch.encoder_lens,
+            encoder_lens_cpu=batch.encoder_lens_cpu,
+            encoder_out_cache_loc=batch.encoder_out_cache_loc,
+            seq_lens_sum=batch.seq_lens_sum,
+            seq_lens_cpu=batch.seq_lens_cpu,
+            orig_seq_lens=batch.orig_seq_lens,
+            return_logprob=batch.return_logprob,
+            top_logprobs_nums=batch.top_logprobs_nums,
+            token_ids_logprobs=batch.token_ids_logprobs,
+            is_extend_in_batch=batch.is_extend_in_batch,
+            all_extend_in_batch=batch.all_extend_in_batch,
+            can_run_dp_cuda_graph=batch.can_run_dp_cuda_graph,
+            global_forward_mode=batch.global_forward_mode,
+            is_prefill_only=batch.is_prefill_only,
+            lora_ids=batch.lora_ids,
+            sampling_info=batch.sampling_info,
+            req_to_token_pool=model_runner.req_to_token_pool,
+            token_to_kv_pool=model_runner.token_to_kv_pool,
+            attn_backend=model_runner.attn_backend,
+            spec_algorithm=batch.spec_algorithm,
+            spec_info=batch.spec_info,
+            capture_hidden_mode=batch.capture_hidden_mode,
+            input_embeds=batch.input_embeds,
+            token_type_ids=batch.token_type_ids,
+            tbo_split_seq_index=batch.tbo_split_seq_index,
+            dimensions=batch.dimensions,
+            return_hidden_states_before_norm=batch.return_hidden_states_before_norm,
+            rids=[req.rid for req in batch.reqs],
+        )
+        device = model_runner.device
+
+        if batch.extend_input_logprob_token_ids is not None:
+            ret.extend_input_logprob_token_ids_gpu = (
+                batch.extend_input_logprob_token_ids.to(device, non_blocking=True)
+            )
+
+        if enable_num_token_non_padded(model_runner.server_args):
+            ret.num_token_non_padded = torch.tensor(
+                len(batch.input_ids), dtype=torch.int32
+            ).to(device, non_blocking=True)
+        ret.num_token_non_padded_cpu = len(batch.input_ids)
+
+        # For MLP sync
+        if batch.global_num_tokens is not None:
+            assert batch.global_num_tokens_for_logprob is not None
+
+            # process global_num_tokens and global_num_tokens_for_logprob
+            if batch.spec_info is not None:
+                spec_info: SpecInput = batch.spec_info
+                global_num_tokens, global_num_tokens_for_logprob = (
+                    spec_info.get_spec_adjusted_global_num_tokens(batch)
+                )
+            else:
+                global_num_tokens = batch.global_num_tokens
+                global_num_tokens_for_logprob = batch.global_num_tokens_for_logprob
+
+            ret.original_global_num_tokens_cpu = batch.global_num_tokens
+            ret.global_num_tokens_cpu = global_num_tokens
+            ret.global_num_tokens_gpu = torch.tensor(
+                global_num_tokens, dtype=torch.int64
+            ).to(device, non_blocking=True)
+
+            ret.global_num_tokens_for_logprob_cpu = global_num_tokens_for_logprob
+            ret.global_num_tokens_for_logprob_gpu = torch.tensor(
+                global_num_tokens_for_logprob, dtype=torch.int64
+            ).to(device, non_blocking=True)
+
+        if ret.forward_mode.is_idle():
+            ret.positions = torch.empty((0,), dtype=torch.int64, device=device)
+            return ret
+
+        # Override the positions with diffusion LLM or spec_info
+        if batch.dllm_config is not None:
+            block_size = batch.dllm_config.block_size
+            # Use int64 for AMD rotary embedding kernel compatibility
+            positions_dtype = torch.int64 if is_hip() else torch.int32
+            ret.positions = torch.tensor(
+                [
+                    i
+                    for block_offset in batch.dllm_block_offsets
+                    for i in range(block_offset, block_offset + block_size)
+                ],
+                dtype=positions_dtype,
+            ).to(device, non_blocking=True)
+        elif (
+            ret.spec_info is not None
+            and getattr(ret.spec_info, "positions", None) is not None
+        ):
+            ret.positions = ret.spec_info.positions
+
+        # Init position information
+        if ret.forward_mode.is_decode() or ret.forward_mode.is_target_verify():
+            if ret.positions is None:
+                ret.positions = clamp_position(batch.seq_lens)
+        else:
+            assert isinstance(batch.extend_seq_lens, list)
+            assert isinstance(batch.extend_prefix_lens, list)
+            ret.extend_seq_lens = torch.tensor(
+                batch.extend_seq_lens, dtype=torch.int32
+            ).to(device, non_blocking=True)
+            ret.extend_prefix_lens = torch.tensor(
+                batch.extend_prefix_lens, dtype=torch.int32
+            ).to(device, non_blocking=True)
+            ret.extend_num_tokens = batch.extend_num_tokens
+            positions, ret.extend_start_loc = compute_position(
+                model_runner.server_args.attention_backend,
+                ret.extend_prefix_lens,
+                ret.extend_seq_lens,
+                ret.extend_num_tokens,
+            )
+            if ret.positions is None:
+                ret.positions = positions
+            ret.extend_prefix_lens_cpu = batch.extend_prefix_lens
+            ret.extend_seq_lens_cpu = batch.extend_seq_lens
+            ret.extend_logprob_start_lens_cpu = batch.extend_logprob_start_lens
+
+        if model_runner.model_is_mrope:
+            if (
+                ret.spec_info is not None
+                and getattr(ret.spec_info, "positions", None) is not None
+            ):
+                ret._compute_spec_mrope_positions(model_runner, batch)
+            else:
+                ret._compute_mrope_positions(model_runner, batch)
+
+        # Init lora information
+        if model_runner.server_args.enable_lora:
+            # In the non-LoRA overlap loading case, we fetch LoRA adapters into the memory pool
+            # as a batch, right before running the batch
+            if not model_runner.server_args.enable_lora_overlap_loading:
+                model_runner.lora_manager.fetch_new_loras(set(ret.lora_ids))
+
+            model_runner.lora_manager.prepare_lora_batch(ret)
+
+        return ret
+
+    def adjust_num_token_non_padded_for_attn_tp(self, server_args) -> None:
+        """Make num_token_non_padded local to this attention-TP rank."""
+        from sglang.srt.utils.common import require_mlp_tp_gather
+
+        dp_rank = get_attention_dp_rank()
+        assert self.global_num_tokens_cpu is not None
+
+        if require_mlp_tp_gather(server_args):
+            num_tokens_per_dp = self.global_num_tokens_cpu[dp_rank]
+        else:
+            num_tokens_per_dp = self.global_num_tokens_cpu[0]
+
+        self.num_token_non_padded = compute_local_num_token_non_padded(
+            global_num_token_non_padded=self.num_token_non_padded,
+            num_tokens_per_dp=num_tokens_per_dp,
+        )
+
+    def merge_mm_inputs(self) -> Optional[MultimodalInputs]:
+        """
+        Merge all multimodal inputs in the batch into a single MultiModalInputs object.
+
+        Returns:
+            if none, current batch contains no multimodal input
+
+        """
+        if not self.mm_inputs or all(x is None for x in self.mm_inputs):
+            return None
+        # Filter out None values
+        valid_inputs = [x for x in self.mm_inputs if x is not None]
+
+        # TODO: is it expensive?
+        # a workaround to avoid importing `MultimodalInputs`
+        merged = valid_inputs[0].__class__(mm_items=[])
+
+        # Merge remaining inputs
+        for mm_input in valid_inputs:
+            merged.merge(mm_input)
+
+        return merged
+
+    def contains_image_inputs(self) -> bool:
+        if self.mm_inputs is None:
+            return False
+        return any(
+            mm_input is not None and mm_input.contains_image_inputs()
+            for mm_input in self.mm_inputs
+        )
+
+    def contains_audio_inputs(self) -> bool:
+        if self.mm_inputs is None:
+            return False
+        return any(
+            mm_input is not None and mm_input.contains_audio_inputs()
+            for mm_input in self.mm_inputs
+        )
+
+    def contains_video_inputs(self) -> bool:
+        if self.mm_inputs is None:
+            return False
+        return any(
+            mm_input is not None and mm_input.contains_video_inputs()
+            for mm_input in self.mm_inputs
+        )
+
+    def contains_mm_inputs(self) -> bool:
+        return (
+            self.contains_audio_inputs()
+            or self.contains_video_inputs()
+            or self.contains_image_inputs()
+        )
+
+    def _compute_spec_mrope_positions(
+        self, model_runner: ModelRunner, batch: ModelWorkerBatch
+    ):
+        # TODO support batched deltas
+        batch_size = self.seq_lens.shape[0]
+        device = model_runner.device
+        mm_inputs = batch.multimodal_inputs
+
+        if batch.forward_mode.is_draft_extend():  # draft_extend_after_decode
+            mrope_deltas = []
+            extend_lens = []
+            for batch_idx in range(batch_size):
+                extend_seq_len = batch.extend_seq_lens[batch_idx]
+                extend_lens.append(extend_seq_len)
+                mrope_delta = (
+                    torch.zeros(1, dtype=torch.int64)
+                    if mm_inputs[batch_idx] is None
+                    else mm_inputs[batch_idx].mrope_position_delta.squeeze(0)
+                )
+                mrope_deltas.append(mrope_delta.to(device=device))
+            position_chunks = torch.split(batch.spec_info.positions, extend_lens)
+            mrope_positions_list = [
+                pos_chunk + delta
+                for pos_chunk, delta in zip(position_chunks, mrope_deltas)
+            ]
+            next_input_positions = (
+                torch.cat(mrope_positions_list, dim=0).unsqueeze(0).repeat(3, 1)
+            )
+
+        else:  # target_verify or draft_decode
+            seq_positions = batch.spec_info.positions.view(batch_size, -1)
+            mrope_deltas = [
+                (
+                    torch.tensor([0], dtype=torch.int64)
+                    if mm_inputs[i] is None
+                    else mm_inputs[i].mrope_position_delta.squeeze(0)
+                )
+                for i in range(batch_size)
+            ]
+            mrope_delta_tensor = torch.stack(mrope_deltas, dim=0).to(device=device)
+            next_input_positions = (
+                (seq_positions + mrope_delta_tensor).flatten().unsqueeze(0).repeat(3, 1)
+            )
+
+        self.mrope_positions = next_input_positions
+
+    def _expand_mrope_from_input(
+        self,
+        mm_input: MultimodalInputs,
+        seq_len: int,
+    ) -> torch.Tensor:
+        # doing below compute on cpu to avoid frequent small kernels
+        if mm_input.mrope_position_delta_repeated_cache is None:
+            mm_input.mrope_position_delta_repeated_cache = (
+                (mm_input.mrope_position_delta - 1).flatten().unsqueeze(0).repeat(3, 1)
+            )
+        mrope_positions = mm_input.mrope_position_delta_repeated_cache + seq_len
+        return mrope_positions
+
+    def _compute_mrope_positions(
+        self, model_runner: ModelRunner, batch: ModelWorkerBatch
+    ):
+        # batch_size * [3 * seq_len]
+        batch_size = self.seq_lens_cpu.shape[0]
+        mrope_positions_list = [[]] * batch_size
+        for batch_idx in range(batch_size):
+            mm_input = batch.multimodal_inputs[batch_idx]
+            if self.forward_mode.is_decode():
+                # 3 * N
+                if (
+                    mm_input is None
+                    or get_global_server_args().rl_on_policy_target is not None
+                ):
+                    mrope_positions_list[batch_idx] = torch.full(
+                        (3, 1),
+                        self.seq_lens_cpu[batch_idx] - 1,
+                        dtype=torch.int64,
+                    )
+                else:
+                    mrope_positions = self._expand_mrope_from_input(
+                        mm_input, self.seq_lens_cpu[batch_idx]
+                    )
+                    mrope_positions_list[batch_idx] = mrope_positions
+            elif self.forward_mode.is_extend(include_draft_extend_v2=True):
+                extend_seq_len, extend_prefix_len = (
+                    batch.extend_seq_lens[batch_idx],
+                    batch.extend_prefix_lens[batch_idx],
+                )
+                if (
+                    mm_input is None
+                    or get_global_server_args().rl_on_policy_target is not None
+                ):
+                    # text only
+                    mrope_positions = torch.tensor(
+                        [
+                            [
+                                pos
+                                for pos in range(
+                                    extend_prefix_len,
+                                    extend_prefix_len + extend_seq_len,
+                                )
+                            ]
+                        ]
+                        * 3
+                    )
+                else:
+                    mrope_positions = mm_input.mrope_positions[
+                        :,
+                        extend_prefix_len : extend_prefix_len + extend_seq_len,
+                    ]
+                    if mrope_positions.numel() == 0:
+                        mrope_positions = self._expand_mrope_from_input(
+                            mm_input, self.seq_lens_cpu[batch_idx]
+                        )
+                mrope_positions_list[batch_idx] = mrope_positions
+
+        self.mrope_positions = torch.cat(
+            [pos for pos in mrope_positions_list],
+            dim=1,
+        ).to(dtype=torch.int64, device=model_runner.device, non_blocking=True)
+
+    def _pad_tensor_to_size(self, tensor: torch.Tensor, size: int, *, value: int = 0):
+        if value == 0:
+            return torch.cat(
+                [tensor, tensor.new_zeros(size - tensor.shape[0], *tensor.shape[1:])],
+                dim=0,
+            )
+        else:
+            return torch.cat(
+                [
+                    tensor,
+                    tensor.new_full((size - tensor.shape[0], *tensor.shape[1:]), value),
+                ],
+                dim=0,
+            )
+
+    def prepare_mlp_sync_batch(self, model_runner: ModelRunner):
+        from sglang.srt.batch_overlap.two_batch_overlap import TboForwardBatchPreparer
+
+        assert self.global_num_tokens_cpu is not None
+        assert self.global_num_tokens_for_logprob_cpu is not None
+
+        global_num_tokens = self.global_num_tokens_cpu
+        sync_group_size = len(global_num_tokens)
+        attn_tp_size = get_attention_tp_size()
+
+        for i in range(sync_group_size):
+            # make sure that the padded length is divisible by attn_tp_size because we may need reduce-scatter across attn_tp dim.
+            # there is no reduce-scatter in LM logprob, so we do not need to adjust the padded length for logprob
+            global_num_tokens[i] = ceil_align(global_num_tokens[i], attn_tp_size)
+
+        # make sure that each rank has the same number of tokens to do collective communication.
+        attn_cp_size = get_attention_cp_size()
+        for i in range(sync_group_size):
+            global_num_tokens[i] = ceil_align(global_num_tokens[i], attn_cp_size)
+
+        dp_padding_mode = DpPaddingMode.get_dp_padding_mode(
+            self.is_extend_in_batch, global_num_tokens
+        )
+        self.dp_padding_mode = dp_padding_mode
+
+        if dp_padding_mode.is_max_len():
+            # when DP gather mode is all gather, we will use
+            # all_gather_into_tensor to gather hidden states, where transferred
+            # tokens should be padded to the same length. We will also use
+            # reduce-scatter instead of all-reduce after MLP.
+            max_num_tokens = max(global_num_tokens)
+            global_num_tokens = [max_num_tokens] * sync_group_size
+            buffer_len = max_num_tokens * sync_group_size
+        else:
+            buffer_len = sum(global_num_tokens)
+
+        if len(global_num_tokens) > 1:
+            num_tokens = global_num_tokens[get_attention_dp_rank()]
+        else:
+            num_tokens = global_num_tokens[0]
+
+        self.global_dp_buffer_len = buffer_len
+        set_dp_buffer_len(
+            buffer_len, num_tokens, dp_padding_mode.is_max_len(), global_num_tokens
+        )
+        set_is_extend_in_batch(self.is_extend_in_batch)
+
+        bs = self.batch_size
+
+        if (
+            self.forward_mode.is_decode()
+            or self.forward_mode.is_target_verify()
+            or self.forward_mode.is_draft_extend(include_v2=True)
+            or self.forward_mode.is_idle()
+        ):
+            if self.is_extend_in_batch and dp_padding_mode.is_max_len():
+                setattr(self, "_original_forward_mode", self.forward_mode)
+                self.forward_mode = ForwardMode.EXTEND
+                self.extend_num_tokens = bs
+                self.extend_seq_lens = torch.full_like(self.seq_lens, 1)
+                self.extend_prefix_lens = self.seq_lens - 1
+                self.extend_start_loc = torch.arange(
+                    bs, dtype=torch.int32, device=self.seq_lens.device
+                )
+                self.extend_prefix_lens_cpu = self.extend_prefix_lens.cpu()
+                self.extend_seq_lens_cpu = self.extend_seq_lens.cpu()
+                self.extend_logprob_start_lens_cpu = self.extend_prefix_lens_cpu
+            else:
+                setattr(self, "_original_batch_size", self.batch_size)
+                if self.spec_info is not None:
+                    bs = self.batch_size = (
+                        num_tokens // self.spec_info.num_tokens_per_req
+                    )
+                else:
+                    bs = self.batch_size = num_tokens
+        elif self.forward_mode.is_extend():
+            self.extend_num_tokens = num_tokens
+
+        # padding
+        self._pad_inputs_to_size(model_runner, num_tokens, bs)
+        self.global_num_tokens_cpu = global_num_tokens
+        global_num_tokens_pinned = torch.tensor(global_num_tokens, pin_memory=True)
+        self.global_num_tokens_gpu.copy_(global_num_tokens_pinned, non_blocking=True)
+
+        TboForwardBatchPreparer.prepare(
+            batch=self, is_draft_worker=model_runner.is_draft_worker
+        )
+        # TODO: The following is added to make sure sub-batch input_ids are padded
+        # to the multiple of attn_tp_size. It can likely be removed after this
+        # function is refactored and merged into the Scheduler.
+        if self.tbo_children:
+            for child in self.tbo_children:
+                child._pad_inputs_to_size(
+                    model_runner, child.tbo_padded_len, child.batch_size
+                )
+
+    def _pad_inputs_to_size(self, model_runner: ModelRunner, num_tokens, bs):
+        # padding
+        self.input_ids = self._pad_tensor_to_size(self.input_ids, num_tokens)
+        self.req_pool_indices = self._pad_tensor_to_size(self.req_pool_indices, bs)
+        self.lora_ids.extend((bs - len(self.lora_ids)) * [None])
+
+        seq_len_fill_value = (
+            model_runner.attn_backend.get_cuda_graph_seq_len_fill_value()
+        )
+        self.seq_lens_sum = self.seq_lens_sum + seq_len_fill_value * (
+            bs - self.seq_lens.shape[0]
+        )
+        self.seq_lens = self._pad_tensor_to_size(
+            self.seq_lens, bs, value=seq_len_fill_value
+        )
+        if self.seq_lens_cpu is not None:
+            self.seq_lens_cpu = self._pad_tensor_to_size(
+                self.seq_lens_cpu, bs, value=seq_len_fill_value
+            )
+
+        self.out_cache_loc = self._pad_tensor_to_size(self.out_cache_loc, num_tokens)
+        if self.encoder_lens is not None:
+            self.encoder_lens = self._pad_tensor_to_size(self.encoder_lens, bs)
+        self.positions = self._pad_tensor_to_size(self.positions, num_tokens)
+        if self.mamba_track_indices is not None:
+            self.mamba_track_indices = self._pad_tensor_to_size(
+                self.mamba_track_indices, bs
+            )
+        if self.mamba_track_mask is not None:
+            self.mamba_track_mask = self._pad_tensor_to_size(self.mamba_track_mask, bs)
+        if self.mamba_track_seqlens is not None:
+            self.mamba_track_seqlens = self._pad_tensor_to_size(
+                self.mamba_track_seqlens, bs
+            )
+
+        if self.mrope_positions is not None:
+            self.mrope_positions = torch.cat(
+                [
+                    self.mrope_positions,
+                    self.mrope_positions.new_zeros(
+                        3, num_tokens - self.mrope_positions.shape[1]
+                    ),
+                ],
+                dim=1,
+            )
+
+        # TODO: check if we need to pad other tensors
+        if self.extend_seq_lens is not None:
+            self.extend_seq_lens = self._pad_tensor_to_size(self.extend_seq_lens, bs)
+
+        if self.spec_info is not None and self.spec_info.is_draft_input():
+            # FIXME(lsyin): remove this isinstance logic
+            spec_info = self.spec_info
+            self.output_cache_loc_backup = self.out_cache_loc
+            self.hidden_states_backup = spec_info.hidden_states
+            if spec_info.topk_p is not None:
+                spec_info.topk_p = self._pad_tensor_to_size(spec_info.topk_p, bs)
+            if spec_info.topk_index is not None:
+                spec_info.topk_index = self._pad_tensor_to_size(
+                    spec_info.topk_index, bs
+                )
+            if spec_info.accept_length is not None:
+                spec_info.accept_length = self._pad_tensor_to_size(
+                    spec_info.accept_length, bs
+                )
+            spec_info.hidden_states = self._pad_tensor_to_size(
+                spec_info.hidden_states, num_tokens
+            )
+
+    def prepare_attn_tp_scatter_input(self, model_runner: ModelRunner):
+        from sglang.srt.layers.communicator import get_attn_tp_context
+
+        attn_tp_context = get_attn_tp_context()
+        input_scattered = attn_tp_context.use_input_scattered(self)
+        if not input_scattered:
+            return
+        assert self.forward_mode.is_extend()
+        tokens = self.input_ids.shape[0]
+        rank_size = get_tensor_model_parallel_world_size()
+        tokens_padded = (tokens + rank_size - 1) // rank_size * rank_size
+        self._pad_inputs_to_size(model_runner, tokens_padded, self.batch_size)
+
+    def post_forward_mlp_sync_batch(self, logits_output: LogitsProcessorOutput):
+
+        self.forward_mode = getattr(self, "_original_forward_mode", self.forward_mode)
+        self.batch_size = getattr(self, "_original_batch_size", self.batch_size)
+        bs = self.batch_size
+
+        if self.spec_info is not None:
+            if self.forward_mode.is_decode():  # draft
+                num_tokens = self.hidden_states_backup.shape[0]
+                self.positions = self.positions[:num_tokens]
+                self.seq_lens = self.seq_lens[:bs]
+                self.req_pool_indices = self.req_pool_indices[:bs]
+                if self.seq_lens_cpu is not None:
+                    self.seq_lens_cpu = self.seq_lens_cpu[:bs]
+                logits_output.next_token_logits = logits_output.next_token_logits[
+                    :num_tokens
+                ]
+                logits_output.hidden_states = logits_output.hidden_states[:num_tokens]
+            elif self.forward_mode.is_target_verify():  # verify
+                num_tokens = bs * self.spec_info.draft_token_num
+                logits_output.next_token_logits = logits_output.next_token_logits[
+                    :num_tokens
+                ]
+                logits_output.hidden_states = logits_output.hidden_states[:num_tokens]
+            elif self.forward_mode.is_draft_extend():  # draft extend
+                self.spec_info.accept_length = self.spec_info.accept_length[:bs]
+                logits_output.next_token_logits = logits_output.next_token_logits[:bs]
+                logits_output.hidden_states = logits_output.hidden_states[:bs]
+            elif self.forward_mode.is_draft_extend_v2():  # draft extend_v2
+                bs = bs * self.spec_info.num_tokens_per_req
+                logits_output.next_token_logits = logits_output.next_token_logits[:bs]
+                logits_output.hidden_states = logits_output.hidden_states[:bs]
+            elif self.forward_mode.is_extend() or self.forward_mode.is_idle():
+                logits_output.next_token_logits = logits_output.next_token_logits[:bs]
+                logits_output.hidden_states = logits_output.hidden_states[:bs]
+
+            if hasattr(self, "hidden_states_backup"):
+                self.spec_info.hidden_states = self.hidden_states_backup
+            if hasattr(self, "output_cache_loc_backup"):
+                self.out_cache_loc = self.output_cache_loc_backup
+
+        elif self.forward_mode.is_decode() or self.forward_mode.is_idle():
+            logits_output.next_token_logits = logits_output.next_token_logits[:bs]
+            if logits_output.hidden_states is not None:
+                logits_output.hidden_states = logits_output.hidden_states[:bs]
+        elif self.forward_mode.is_extend():
+            num_tokens = self.seq_lens_sum
+            logits_output.next_token_logits = logits_output.next_token_logits[
+                :num_tokens
+            ]
+            if logits_output.hidden_states is not None:
+                logits_output.hidden_states = logits_output.hidden_states[:num_tokens]
+
+    @property
+    def can_run_tbo(self):
+        return self.tbo_split_seq_index is not None
+
+
+def enable_num_token_non_padded(server_args):
+    return get_moe_expert_parallel_world_size() > 1
+
+
+class PPProxyTensors:
+    # adapted from https://github.com/vllm-project/vllm/blob/d14e98d924724b284dc5eaf8070d935e214e50c0/vllm/sequence.py#L1103
+    tensors: Dict[str, torch.Tensor]
+
+    def __init__(self, tensors):
+        # manually define this function, so that
+        # Dynamo knows `IntermediateTensors()` comes from this file.
+        # Otherwise, dataclass will generate this function by evaluating
+        # a string, and we will lose the information about the source file.
+        self.tensors = tensors
+
+    def __getitem__(self, key: Union[str, slice]):
+        if isinstance(key, str):
+            return self.tensors[key]
+        elif isinstance(key, slice):
+            return self.__class__({k: v[key] for k, v in self.tensors.items()})
+
+    def __setitem__(self, key: str, value: torch.Tensor):
+        self.tensors[key] = value
+
+    def __len__(self):
+        return len(self.tensors)
+
+    def __eq__(self, other: object):
+        return isinstance(other, self.__class__) and self
+
+    def __repr__(self) -> str:
+        return f"PPProxyTensors(tensors={self.tensors})"
+
+
+def compute_position(
+    attn_backend: str,
+    extend_prefix_lens: torch.Tensor,
+    extend_seq_lens: torch.Tensor,
+    extend_seq_lens_sum: int,
+):
+    if support_triton(attn_backend):
+        positions, extend_start_loc = compute_position_triton(
+            extend_prefix_lens,
+            extend_seq_lens,
+            extend_seq_lens_sum,
+        )
+    else:
+        positions, extend_start_loc = compute_position_torch(
+            extend_prefix_lens, extend_seq_lens
+        )
+    return positions, extend_start_loc
+
+
+def compute_position_triton(
+    extend_prefix_lens: torch.Tensor, extend_seq_lens: torch.Tensor, extend_seq_lens_sum
+):
+    """Compute positions. It is a fused version of `compute_position_torch`."""
+    batch_size = extend_seq_lens.shape[0]
+    has_prefix = extend_prefix_lens.shape[0] == batch_size
+
+    positions = torch.empty(
+        extend_seq_lens_sum, dtype=torch.int64, device=extend_seq_lens.device
+    )
+    extend_start_loc = torch.empty(
+        batch_size, dtype=torch.int32, device=extend_seq_lens.device
+    )
+
+    # Launch kernel
+    compute_position_kernel[(batch_size,)](
+        positions,
+        extend_start_loc,
+        extend_prefix_lens,
+        extend_seq_lens,
+        has_prefix,
+    )
+
+    return positions, extend_start_loc
+
+
+@triton.jit
+def compute_position_kernel(
+    positions,
+    extend_start_loc,
+    extend_prefix_lens,
+    extend_seq_lens,
+    has_prefix: tl.constexpr,
+):
+    BLOCK_SIZE: tl.constexpr = 512
+    pid = tl.program_id(0).to(tl.int64)
+
+    prefix_len = tl.load(extend_prefix_lens + pid) if has_prefix else 0
+    seq_len = tl.load(extend_seq_lens + pid)
+
+    # NOTE: This can be slow for large bs
+    cumsum_start = tl.cast(0, tl.int64)
+    for i in range(pid):
+        cumsum_start += tl.load(extend_seq_lens + i)
+
+    num_loop = tl.cdiv(seq_len, BLOCK_SIZE)
+    for i in range(num_loop):
+        offset = tl.arange(0, BLOCK_SIZE) + i * BLOCK_SIZE
+        tl.store(
+            positions + cumsum_start + offset,
+            prefix_len + offset,
+            mask=offset < seq_len,
+        )
+    tl.store(extend_start_loc + pid, cumsum_start)
+
+
+def compute_position_torch(
+    extend_prefix_lens: torch.Tensor, extend_seq_lens: torch.Tensor
+):
+    positions = torch.cat(
+        [
+            torch.arange(
+                prefix_len, prefix_len + extend_len, device=extend_prefix_lens.device
+            )
+            for prefix_len, extend_len in zip(extend_prefix_lens, extend_seq_lens)
+        ],
+        axis=0,
+    )
+    extend_start_loc = torch.zeros_like(extend_seq_lens)
+    extend_start_loc[1:] = torch.cumsum(extend_seq_lens[:-1], dim=0)
+    return positions.to(torch.int64), extend_start_loc
+
+
+@torch.compile(dynamic=True, backend=get_compiler_backend(), disable=_is_npu)
+def clamp_position(seq_lens):
+    return torch.clamp((seq_lens - 1), min=0).to(torch.int64)
diff --git a/sglang/python/sglang/srt/model_executor/hook_manager.py b/sglang/python/sglang/srt/model_executor/hook_manager.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f6d78a5102b3d3128560edd362ba67915c8c5d9
--- /dev/null
+++ b/sglang/python/sglang/srt/model_executor/hook_manager.py
@@ -0,0 +1,82 @@
+import fnmatch
+import importlib
+import logging
+from typing import Any, Callable, List, Optional
+
+import torch.nn as nn
+
+logger = logging.getLogger(__name__)
+
+
+def register_forward_hooks(model: nn.Module, hook_specs: List[dict[str, Any]]) -> None:
+    """
+    hook_specs is a list of dicts from server_args.forward_hooks.
+    Attaches forward hooks to the matching modules.
+    """
+    name_to_module = dict(model.named_modules())
+
+    for spec in hook_specs:
+        spec_name = spec.get("name", "")
+        target_patterns = spec.get("target_modules", [])
+        if not target_patterns:
+            logger.warning(f"Hook spec '{spec_name}' has no 'target_modules', skipping")
+            continue
+
+        hook_factory_path = spec.get("hook_factory")
+        if not hook_factory_path:
+            logger.warning(f"Hook spec '{spec_name}' has no 'hook_factory', skipping")
+            continue
+
+        config = spec.get("config") or {}
+        hook_factory = resolve_callable(hook_factory_path)
+
+        hook = hook_factory(config) if hook_factory else None
+        if hook is None:
+            logger.warning(
+                f"Hook factory '{hook_factory_path}' for spec '{spec_name}' "
+                "returned None, not registering any hook"
+            )
+            continue
+
+        # Resolve patterns like "model.layers.*.mlp"
+        matched = []
+        for name, module in name_to_module.items():
+            if any(fnmatch.fnmatch(name, pattern) for pattern in target_patterns):
+                matched.append((name, module))
+
+        if not matched:
+            logger.warning(
+                f"No modules matched hook spec '{spec_name}' "
+                f"patterns={target_patterns}"
+            )
+            continue
+
+        for module_name, module in matched:
+            _ = module.register_forward_hook(hook)
+            logger.info(f"Registered forward hook '{spec_name}' " f"on {module_name}")
+
+
+def resolve_callable(path: Optional[str]) -> Optional[Callable]:
+    if path is None:
+        return None
+
+    if ":" in path:
+        module_name, fn_name = path.split(":", 1)
+    else:
+        parts = path.split(".")
+        if len(parts) < 2:
+            raise ValueError(
+                f"Invalid hook callable path '{path}'. "
+                "Expected 'module.submodule:factory' or 'module.submodule.factory'."
+            )
+        *mod_parts, fn_name = parts
+        module_name = ".".join(mod_parts)
+
+    module = importlib.import_module(module_name)
+    try:
+        return getattr(module, fn_name)
+    except AttributeError as e:
+        raise AttributeError(
+            f"Module '{module_name}' has no attribute '{fn_name}' "
+            f"(from hook path '{path}')"
+        ) from e
diff --git a/sglang/python/sglang/srt/model_executor/input_buffers.py b/sglang/python/sglang/srt/model_executor/input_buffers.py
new file mode 100644
index 0000000000000000000000000000000000000000..240ccef3c95c83fbb313c0bf6357f102ff57101c
--- /dev/null
+++ b/sglang/python/sglang/srt/model_executor/input_buffers.py
@@ -0,0 +1,55 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, fields
+from typing import Dict
+
+import torch
+
+_forward_input_buffer_pool: Dict[str, torch.Tensor] = {}
+
+
+@dataclass
+class ForwardInputBuffers:
+
+    def _share_one_buffer(self, name: str, new_buffer: torch.Tensor) -> torch.Tensor:
+
+        buffer_size = new_buffer.size()
+        buffer_stride = new_buffer.stride()
+
+        old_buffer = _forward_input_buffer_pool.get(name, None)
+        if old_buffer is not None:
+            assert (
+                new_buffer.dtype == old_buffer.dtype
+            ), f"Buffer {name} has different dtype than before."
+            assert (
+                new_buffer.device == old_buffer.device
+            ), f"Buffer {name} has different device than before."
+            if old_buffer.numel() > new_buffer.numel():
+                new_buffer = old_buffer
+
+        _forward_input_buffer_pool[name] = new_buffer
+        return new_buffer.as_strided(buffer_size, buffer_stride)
+
+    def share_buffers(self):
+
+        for f in fields(self):
+            name = f.name
+            buffer = getattr(self, name)
+
+            if buffer is None:
+                continue
+            elif isinstance(buffer, dict):
+                for sub_name, sub_buffer in buffer.items():
+                    assert isinstance(
+                        sub_buffer, torch.Tensor
+                    ), f"Field {name}.{sub_name} is expected to be a torch.Tensor, but got {type(sub_buffer)}."
+                    new_buffer = self._share_one_buffer(
+                        f"{name}.{sub_name}", sub_buffer
+                    )
+                    buffer[sub_name] = new_buffer
+            else:
+                assert isinstance(
+                    buffer, torch.Tensor
+                ), f"Field {name} is expected to be a torch.Tensor or a dict of torch.Tensor, but got {type(buffer)}."
+                new_buffer = self._share_one_buffer(name, buffer)
+                setattr(self, name, new_buffer)
diff --git a/sglang/python/sglang/srt/model_executor/mindspore_runner.py b/sglang/python/sglang/srt/model_executor/mindspore_runner.py
new file mode 100644
index 0000000000000000000000000000000000000000..4cdcaed505b6c1cf570c8667de43aab214776e96
--- /dev/null
+++ b/sglang/python/sglang/srt/model_executor/mindspore_runner.py
@@ -0,0 +1,120 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the SGLang project
+"""ms_runner launch MindSpore distributed modules."""
+
+import logging
+import multiprocessing as mp
+import os
+import sys
+from pathlib import Path
+
+import mindspore as ms
+import torch
+from mindspore._c_expression import GroupOptions
+from mindspore.communication import create_group
+
+from sglang.srt.distributed.parallel_state import _groups
+
+logger = logging.getLogger(__name__)
+
+
+class _Tmp:
+    def __init__(self):
+        self.sched_p = None
+
+    def set_sched_process(self, p):
+        self.sched_p = p
+
+    def __del__(self):
+        if self.sched_p:
+            self.sched_p.kill()
+
+
+_tmp = _Tmp()
+
+
+def _get_host_and_ip(distributed_init_method):
+    try:
+        _, ip_str, port_str = distributed_init_method.split(":")
+        ip = ip_str.split("/")[-1]
+        port = int(port_str)
+    except Exception as e:
+        raise RuntimeError(
+            "Cannot get host and port information from %s, error: %s!"
+            % (distributed_init_method, str(e))
+        )
+
+    return ip, port
+
+
+def run_scheduler_init(rank, local_rank, world_size, master_addr, master_port):
+    with open(str(Path() / "schedule.log"), "w") as scheduler_f:
+        # For Python outputs.
+        sys.stdout = scheduler_f
+        sys.stderr = scheduler_f
+        # For C++ outputs.
+        os.dup2(scheduler_f.fileno(), 1)
+        os.dup2(scheduler_f.fileno(), 2)
+        os.environ["DEVICE_ID"] = str(local_rank)
+        os.environ["MS_WORKER_NUM"] = str(world_size)
+        os.environ["MS_ROLE"] = "MS_SCHED"
+        os.environ["MS_NODE_ID"] = str(rank)
+        os.environ["MS_SCHED_HOST"] = str(master_addr)
+        os.environ["MS_SCHED_PORT"] = str(master_port)
+        # This function is blocked until the whole cluster exits.
+        ms.communication.init()
+
+
+def set_ms_parallel_env(rank, local_rank, world_size, init_method):
+    master_addr, master_port = _get_host_and_ip(init_method)
+    # change port avoiding port conflicts with torch
+    master_port = master_port + 35 if master_port < 65500 else master_port - 35
+    if not os.getenv("MS_ROLE"):
+        if rank == 0:
+            # Create a subprocess for scheduler of MindSpore, just for internal collaboration, not for collective communication
+            sched_p = mp.Process(
+                target=run_scheduler_init,
+                args=(rank, local_rank, world_size, master_addr, master_port),
+            )
+            sched_p.start()
+            global _tmp
+            _tmp.set_sched_process(sched_p)
+
+        os.environ["DEVICE_ID"] = str(local_rank)
+        os.environ["MS_WORKER_NUM"] = str(world_size)
+        os.environ["MS_ROLE"] = "MS_WORKER"
+        os.environ["MS_NODE_ID"] = str(rank)
+        os.environ["MS_SCHED_HOST"] = str(master_addr)
+        os.environ["MS_SCHED_PORT"] = str(master_port)
+
+
+def reuse_hccl_comm():
+    for group_name, group in _groups.items():
+        # Torch ProcessGroupHccl
+        device_group = group().device_group
+        hccl_comm_handle = device_group._get_backend(torch.device("npu")).get_hccl_comm(
+            group().local_rank
+        )
+        logger.info(
+            f"MindSpore reuse torch group: {device_group}, group_name: {group_name}, local rank: {group().local_rank},"
+            f"hccl communicator handle: {hex(hccl_comm_handle)}",
+        )
+        # Create MS communication group by hccl comm handle to reuse Torch group.
+        group_options = GroupOptions()
+        group_options.hccl_config = {"hccl_comm": hccl_comm_handle}
+        create_group(group_name, group().ranks, group_options)
+
+
+def init_ms_distributed(world_size, rank, local_rank, server_args, port):
+    if server_args.dist_init_addr:
+        dist_init_method = f"tcp://{server_args.dist_init_addr}"
+    else:
+        dist_init_method = f"tcp://{server_args.host}:{port}"
+    set_ms_parallel_env(rank, local_rank, world_size, dist_init_method)
+
+    ms.set_context(infer_boost="on", jit_level="O0")
+    ms.set_context(mode=ms.context.PYNATIVE_MODE)
+    ms.set_device("Ascend", local_rank)
+    ms.communication.init("hccl")
+    # After distributed job is initialized, reuse hccl comms for MindSpore.
+    reuse_hccl_comm()
diff --git a/sglang/python/sglang/srt/model_executor/model_runner.py b/sglang/python/sglang/srt/model_executor/model_runner.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b615cce84998281b829c6b7f7570255e319da55
--- /dev/null
+++ b/sglang/python/sglang/srt/model_executor/model_runner.py
@@ -0,0 +1,2707 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""ModelRunner runs the forward passes of the models."""
+
+import datetime
+import gc
+import inspect
+import json
+import logging
+import os
+import socket
+import threading
+import time
+from collections import defaultdict
+from dataclasses import dataclass
+from typing import Callable, List, Optional, Tuple, Union
+
+import torch
+import torch.distributed as dist
+from torch import nn
+
+from sglang.srt.configs import (
+    BailingHybridConfig,
+    FalconH1Config,
+    GraniteMoeHybridConfig,
+    JetNemotronConfig,
+    JetVLMConfig,
+    KimiLinearConfig,
+    Lfm2Config,
+    Lfm2MoeConfig,
+    NemotronH_Nano_VL_V2_Config,
+    NemotronHConfig,
+    Qwen3_5Config,
+    Qwen3_5MoeConfig,
+    Qwen3NextConfig,
+)
+from sglang.srt.configs.device_config import DeviceConfig
+from sglang.srt.configs.load_config import LoadConfig, LoadFormat
+from sglang.srt.configs.model_config import AttentionArch, ModelConfig, ModelImpl
+from sglang.srt.configs.update_config import adjust_config_with_unaligned_cpu_tp
+from sglang.srt.constants import GPU_MEMORY_TYPE_WEIGHTS
+from sglang.srt.debug_utils.dumper import dumper
+from sglang.srt.debug_utils.tensor_dump_forward_hook import (
+    register_forward_hook_for_model,
+)
+from sglang.srt.distributed import (
+    get_default_distributed_backend,
+    get_pp_group,
+    get_tp_group,
+    get_world_group,
+    init_distributed_environment,
+    initialize_model_parallel,
+    set_custom_all_reduce,
+    set_mscclpp_all_reduce,
+    set_torch_symm_mem_all_reduce,
+)
+from sglang.srt.distributed.device_communicators.pynccl_allocator import (
+    use_symmetric_memory,
+)
+from sglang.srt.distributed.parallel_state import monkey_patch_vllm_parallel_state
+from sglang.srt.elastic_ep.elastic_ep import ElasticEPStateManager
+from sglang.srt.elastic_ep.expert_backup_client import ExpertBackupClient
+from sglang.srt.environ import envs
+from sglang.srt.eplb.eplb_manager import EPLBManager
+from sglang.srt.eplb.expert_distribution import (
+    ExpertDistributionMetrics,
+    ExpertDistributionRecorder,
+    get_global_expert_distribution_recorder,
+    set_global_expert_distribution_recorder,
+)
+from sglang.srt.eplb.expert_location import (
+    ExpertLocationMetadata,
+    compute_initial_expert_location_metadata,
+    get_global_expert_location_metadata,
+    set_global_expert_location_metadata,
+)
+from sglang.srt.eplb.expert_location_updater import ExpertLocationUpdater
+from sglang.srt.hardware_backend.npu.graph_runner.npu_graph_runner import NPUGraphRunner
+from sglang.srt.layers import deep_gemm_wrapper
+from sglang.srt.layers.attention.attention_registry import (
+    ATTENTION_BACKENDS,
+    attn_backend_wrapper,
+)
+from sglang.srt.layers.attention.nsa.utils import is_nsa_enable_prefill_cp
+from sglang.srt.layers.attention.tbo_backend import TboAttnBackend
+from sglang.srt.layers.dp_attention import (
+    DpPaddingMode,
+    get_attention_tp_group,
+    get_attention_tp_size,
+    initialize_dp_attention,
+    set_dp_buffer_len,
+    set_is_extend_in_batch,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.layers.moe.routed_experts_capturer import (
+    RoutedExpertsCapturer,
+    get_global_experts_capturer,
+    set_global_experts_capturer,
+)
+from sglang.srt.layers.pooler import EmbeddingPoolerOutput
+from sglang.srt.layers.quantization.fp8_kernel import fp8_dtype
+from sglang.srt.layers.sampler import create_sampler
+from sglang.srt.layers.torchao_utils import apply_torchao_config_to_model
+from sglang.srt.lora.lora_manager import LoRAManager
+from sglang.srt.lora.lora_registry import LoRARef
+from sglang.srt.managers.schedule_batch import sanity_check_mm_pad_shift_value
+from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
+from sglang.srt.mem_cache.memory_pool import ReqToTokenPool
+from sglang.srt.model_executor.cpu_graph_runner import CPUGraphRunner
+from sglang.srt.model_executor.cuda_graph_runner import (
+    CudaGraphRunner,
+    DecodeInputBuffers,
+    set_torch_compile_config,
+)
+from sglang.srt.model_executor.forward_batch_info import (
+    CaptureHiddenMode,
+    ForwardBatch,
+    ForwardMode,
+    PPProxyTensors,
+)
+from sglang.srt.model_executor.hook_manager import register_forward_hooks
+from sglang.srt.model_executor.model_runner_kv_cache_mixin import (
+    ModelRunnerKVCacheMixin,
+)
+from sglang.srt.model_executor.piecewise_cuda_graph_runner import (
+    PiecewiseCudaGraphRunner,
+)
+from sglang.srt.model_loader.loader import DefaultModelLoader, get_model_loader
+from sglang.srt.model_loader.remote_instance_weight_loader_utils import (
+    RemoteInstanceWeightLoaderBackend,
+    register_memory_region,
+    trigger_init_weights_send_group_for_remote_instance_request,
+)
+from sglang.srt.model_loader.utils import set_default_torch_dtype
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
+from sglang.srt.server_args import (
+    ServerArgs,
+    get_global_server_args,
+    set_global_server_args_for_scheduler,
+)
+from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
+from sglang.srt.utils import (
+    MultiprocessingSerializer,
+    cpu_has_amx_support,
+    dynamic_import,
+    empty_context,
+    enable_show_time_cost,
+    get_available_gpu_memory,
+    get_cpu_ids_by_node,
+    get_local_ip_auto,
+    init_custom_process_group,
+    is_hip,
+    is_host_cpu_arm64,
+    is_npu,
+    log_info_on_rank0,
+    monkey_patch_p2p_access_check,
+    require_attn_tp_gather,
+    require_gathered_buffer,
+    require_mlp_tp_gather,
+    reserve_rope_cache_for_long_sequences,
+    set_cuda_arch,
+    slow_rank_detector,
+)
+from sglang.srt.utils.nvtx_pytorch_hooks import PytHooks
+from sglang.srt.utils.offloader import (
+    create_offloader_from_server_args,
+    get_offloader,
+    set_offloader,
+)
+from sglang.srt.utils.patch_torch import (
+    monkey_patch_torch_reductions,
+    register_sgl_tp_rank,
+)
+from sglang.srt.utils.torch_memory_saver_adapter import TorchMemorySaverAdapter
+from sglang.srt.utils.weight_checker import WeightChecker
+from sglang.srt.weight_sync.tensor_bucket import (
+    FlattenedTensorBucket,
+    FlattenedTensorMetadata,
+)
+
+_is_hip = is_hip()
+_is_npu = is_npu()
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu_arm64 = is_host_cpu_arm64()
+
+if _is_npu:
+    from sglang.srt.hardware_backend.npu.utils import init_npu_backend
+
+    init_npu_backend()
+
+MLA_ATTENTION_BACKENDS = [
+    "aiter",
+    "flashinfer",
+    "fa3",
+    "fa4",
+    "triton",
+    "flashmla",
+    "cutlass_mla",
+    "trtllm_mla",
+    "ascend",
+    "nsa",
+]
+
+CHUNKED_PREFIX_CACHE_SUPPORTED_ATTENTION_BACKENDS = [
+    "flashinfer",
+    "fa3",
+    "fa4",
+    "flashmla",
+    "cutlass_mla",
+    "trtllm_mla",
+]
+
+TORCH_DTYPE_TO_KV_CACHE_STR = {
+    torch.float8_e4m3fn: "fp8_e4m3",
+    torch.float8_e4m3fnuz: "fp8_e4m3",
+    torch.float8_e5m2: "fp8_e5m2",
+    torch.bfloat16: "bf16",
+}
+
+
+def add_mla_attention_backend(backend_name):
+    if backend_name not in MLA_ATTENTION_BACKENDS:
+        MLA_ATTENTION_BACKENDS.append(backend_name)
+        logger.info(f"Added {backend_name} to MLA_ATTENTION_BACKENDS.")
+
+
+def add_chunked_prefix_cache_attention_backend(backend_name):
+    if backend_name not in CHUNKED_PREFIX_CACHE_SUPPORTED_ATTENTION_BACKENDS:
+        CHUNKED_PREFIX_CACHE_SUPPORTED_ATTENTION_BACKENDS.append(backend_name)
+        logger.info(
+            f"Added {backend_name} to CHUNKED_PREFIX_CACHE_SUPPORTED_ATTENTION_BACKENDS."
+        )
+
+
+# Detect stragger ranks in model loading
+UNBALANCED_MODEL_LOADING_TIMEOUT_S = 480  # leave more time for post data processing
+
+
+logger = logging.getLogger(__name__)
+
+
+def resolve_language_model(model: nn.Module) -> nn.Module:
+    model_cls_name = model.__class__.__name__
+    if model_cls_name == "Qwen3OmniMoeForConditionalGeneration":
+        return model.thinker.model
+    return model.model
+
+
+class RankZeroFilter(logging.Filter):
+    """Filter that only allows INFO level logs from rank 0, but allows all other levels from any rank."""
+
+    def __init__(self, is_rank_zero):
+        super().__init__()
+        self.is_rank_zero = is_rank_zero
+
+    def filter(self, record):
+        if record.levelno == logging.INFO:
+            return self.is_rank_zero
+        return True
+
+
+@dataclass
+class ModelRunnerOutput:
+    logits_output: Union[LogitsProcessorOutput, PPProxyTensors]
+    can_run_graph: bool
+    expert_distribution_metrics: Optional[ExpertDistributionMetrics] = None
+
+
+class ModelRunner(ModelRunnerKVCacheMixin):
+    """ModelRunner runs the forward passes of the models."""
+
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        mem_fraction_static: float,
+        gpu_id: int,
+        tp_rank: int,
+        tp_size: int,
+        moe_ep_rank: int,
+        moe_ep_size: int,
+        pp_rank: int,
+        pp_size: int,
+        nccl_port: int,
+        server_args: ServerArgs,
+        dp_rank: Optional[int] = None,
+        attn_cp_rank: Optional[int] = None,
+        moe_dp_rank: Optional[int] = None,
+        is_draft_worker: bool = False,
+        req_to_token_pool: Optional[ReqToTokenPool] = None,
+        token_to_kv_pool_allocator: Optional[BaseTokenToKVPoolAllocator] = None,
+        draft_model_idx: Optional[int] = None,
+    ):
+        # Parse args
+        self.mem_fraction_static = mem_fraction_static
+        self.device = server_args.device
+        self.gpu_id = gpu_id
+        self.tp_rank = tp_rank
+        self.tp_size = tp_size
+        self.moe_ep_rank = moe_ep_rank
+        self.moe_ep_size = moe_ep_size
+        self.dp_size = server_args.dp_size if server_args.enable_dp_attention else 1
+        self.pp_rank = pp_rank
+        self.pp_size = pp_size
+        self.attn_cp_rank = attn_cp_rank
+        self.attn_cp_size = server_args.attn_cp_size
+        self.moe_dp_rank = moe_dp_rank
+        self.moe_dp_size = server_args.moe_dp_size
+        self.model_config = model_config
+        self.dist_port = nccl_port
+        self.server_args = server_args
+        self.is_draft_worker = is_draft_worker
+        self.is_generation = model_config.is_generation
+        self.is_multimodal = model_config.is_multimodal
+        self.is_multimodal_chunked_prefill_supported = (
+            model_config.is_multimodal_chunked_prefill_supported
+        )
+        self.spec_algorithm = SpeculativeAlgorithm.from_string(
+            server_args.speculative_algorithm
+        )
+        self.page_size = server_args.page_size
+        self.req_to_token_pool = req_to_token_pool
+        self.token_to_kv_pool_allocator = token_to_kv_pool_allocator
+        self.is_hybrid_swa = model_config.is_hybrid_swa
+        self.is_hybrid_swa_compress = model_config.is_hybrid_swa_compress
+        self.use_mla_backend = self.model_config.attention_arch == AttentionArch.MLA
+        self.attention_chunk_size = model_config.attention_chunk_size
+        self.forward_pass_id = 0
+        self.init_new_workspace = False
+        self.draft_model_idx = draft_model_idx
+
+        self.remote_instance_transfer_engine = None
+        self.remote_instance_transfer_engine_session_id = ""
+        self.remote_instance_transfer_engine_weight_info = None
+        # auxiliary hidden capture mode. TODO: expose this to server args?
+        self.eagle_use_aux_hidden_state = False
+        if self.spec_algorithm.is_eagle3() and not self.is_draft_worker:
+            # load draft config
+            draft_model_config = ModelConfig.from_server_args(
+                server_args,
+                model_path=(server_args.speculative_draft_model_path),
+                model_revision=server_args.speculative_draft_model_revision,
+                is_draft_model=True,
+            )
+            self.eagle_use_aux_hidden_state = True
+
+            try:
+                # get the aux layer from draft model config
+                eagle_config = getattr(
+                    draft_model_config.hf_config, "eagle_config", None
+                )
+                self.eagle_use_aux_hidden_state = eagle_config.get(
+                    "use_aux_hidden_state", True
+                )
+                self.eagle_aux_hidden_state_layer_ids = eagle_config[
+                    "eagle_aux_hidden_state_layer_ids"
+                ]
+            except:
+                # if there is no aux layer, set to None
+                self.eagle_aux_hidden_state_layer_ids = None
+
+        # Apply the rank zero filter to logger
+        if server_args.show_time_cost:
+            enable_show_time_cost()
+
+        # Model-specific adjustment
+        self.model_specific_adjustment()
+
+        # Set the global server_args in the scheduler process
+        set_global_server_args_for_scheduler(server_args)
+        global_server_args = get_global_server_args()
+
+        # FIXME: hacky set `use_mla_backend`
+        global_server_args.use_mla_backend = self.use_mla_backend
+
+        # Init OpenMP threads binding for CPU
+        if self.device == "cpu":
+            self.init_threads_binding()
+
+        # Initialize MooncakeTransferEngine
+        self.init_shared_mooncake_transfer_engine()
+
+        # Get available memory before model loading
+        pre_model_load_memory = self.init_torch_distributed()
+
+        # Init forward stream for overlap schedule
+        self.forward_stream = torch.get_device_module(self.device).Stream()
+
+        # CPU offload
+        set_offloader(create_offloader_from_server_args(server_args, dp_rank=dp_rank))
+
+        self._weight_checker = WeightChecker(model_runner=self)
+
+        if envs.SGLANG_DETECT_SLOW_RANK.get():
+            slow_rank_detector.execute()
+
+        # Init mindspore running environment when model impl is "mindspore"
+        self.init_mindspore_runner()
+
+        # Update deep gemm configure
+        if deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM:
+            deep_gemm_wrapper.update_deep_gemm_config(gpu_id, server_args)
+
+        # Initialize the model runner
+        self.initialize(pre_model_load_memory)
+        self.check_quantized_moe_compatibility()
+
+        if self.is_multimodal:
+            sanity_check_mm_pad_shift_value(self.model_config.vocab_size)
+
+        # Temporary cached values
+        self.support_pp = (
+            "pp_proxy_tensors" in inspect.signature(self.model.forward).parameters
+        )
+
+        if self.pp_size > 1:
+            assert (
+                self.support_pp
+            ), "Pipeline Parallel is not compatible with this model."
+
+        # For weight updates
+        self._model_update_group = {}
+        self._weights_send_group = {}
+
+    def init_mindspore_runner(self):
+        # Init the mindspore runner
+        # for now, there is only some communication initialization work
+        if self.server_args.model_impl.lower() == ModelImpl.MINDSPORE and _is_npu:
+            from sglang.srt.model_executor.mindspore_runner import init_ms_distributed
+
+            init_ms_distributed(
+                world_size=self.tp_size * self.pp_size,
+                rank=self.tp_size * self.pp_rank + self.tp_rank,
+                local_rank=self.gpu_id,
+                server_args=self.server_args,
+                port=self.dist_port,
+            )
+
+    def initialize(self, pre_model_load_memory: float):
+        server_args = self.server_args
+
+        self.memory_saver_adapter = TorchMemorySaverAdapter.create(
+            enable=self.server_args.enable_memory_saver
+        )
+
+        if self.server_args.remote_instance_weight_loader_use_transfer_engine():
+            self.remote_instance_init_transfer_engine()
+
+        if not self.is_draft_worker:
+            set_global_expert_location_metadata(
+                compute_initial_expert_location_metadata(
+                    server_args=server_args,
+                    model_config=self.model_config,
+                    moe_ep_rank=self.moe_ep_rank,
+                )
+            )
+            if self.tp_rank == 0 and envs.SGLANG_LOG_EXPERT_LOCATION_METADATA.get():
+                logger.info(
+                    f"Initial expert_location_metadata: {get_global_expert_location_metadata()}"
+                )
+
+            set_global_expert_distribution_recorder(
+                ExpertDistributionRecorder.init_new(
+                    server_args,
+                    get_global_expert_location_metadata(),
+                    rank=self.tp_rank,
+                )
+            )
+
+        # Expert parallelism
+        self.eplb_manager = (
+            EPLBManager(self)
+            if self.server_args.enable_eplb and (not self.is_draft_worker)
+            else None
+        )
+        self.expert_location_updater = ExpertLocationUpdater()
+
+        (
+            ElasticEPStateManager.init(self.server_args)
+            if self.server_args.elastic_ep_backend
+            else None
+        )
+        # Load the model
+        self.sampler = create_sampler()
+        self.load_model()
+
+        # Load the expert backup client
+        self.expert_backup_client = (
+            ExpertBackupClient(self.server_args, self)
+            if (
+                self.server_args.enable_elastic_expert_backup
+                and self.server_args.elastic_ep_backend is not None
+            )
+            else None
+        )
+
+        if (
+            self.server_args.remote_instance_weight_loader_use_transfer_engine()
+            and self.remote_instance_transfer_engine is not None
+            and self.remote_instance_transfer_engine_weight_info is None
+        ):
+            self.remote_instance_transfer_engine_weight_info = register_memory_region(
+                self.model, self.remote_instance_transfer_engine
+            )
+
+        # For MTP models like DeepSeek-V3 or GLM-4.5, the MTP layer(s) are used separately as draft
+        # models for speculative decoding. In those cases, `num_nextn_predict_layers` is used to
+        # determine the number of layers.
+        model_has_mtp_layers = self.model_config.num_nextn_predict_layers is not None
+        model_num_layers = (
+            self.model_config.num_nextn_predict_layers
+            if self.is_draft_worker and model_has_mtp_layers
+            else max(
+                self.model_config.num_hidden_layers,
+                self.model_config.num_attention_layers,
+            )
+        )
+        if self.model_config.hf_config.architectures[0] == "MiMoV2MTP":
+            model_num_layers = 1
+        elif self.model_config.hf_config.architectures[0] == "Step3p5MTP":
+            model_num_layers = 1
+        self.start_layer = getattr(self.model, "start_layer", 0)
+        self.end_layer = getattr(self.model, "end_layer", model_num_layers)
+        self.num_effective_layers = self.end_layer - self.start_layer
+
+        # For LoopCoder models, each loop has its own layer_id, so we need to multiply by loop_num
+        loop_num = getattr(self.model_config.hf_config, "loop_num", 1)
+        if loop_num > 1:
+            self.num_effective_layers = self.num_effective_layers * loop_num
+
+        assert (
+            (not model_has_mtp_layers)
+            or (self.spec_algorithm.is_none())
+            or (
+                (not self.spec_algorithm.is_none())
+                and (self.num_effective_layers == model_num_layers)
+            )
+        ), "PP is not compatible with MTP models."
+
+        # Consider PP, so use start_layer and end_layer.
+        full_attention_layer_ids = [
+            layer_idx
+            for layer_idx in range(self.start_layer, self.end_layer + 1)
+            if hasattr(self.model_config, "full_attention_layer_ids")
+            and layer_idx in self.model_config.full_attention_layer_ids
+        ]
+        swa_attention_layer_ids = [
+            layer_idx
+            for layer_idx in range(self.start_layer, self.end_layer + 1)
+            if hasattr(self.model_config, "swa_attention_layer_ids")
+            and layer_idx in self.model_config.swa_attention_layer_ids
+        ]
+        # Update back to model_config.
+        self.model_config.swa_attention_layer_ids = swa_attention_layer_ids
+        self.model_config.full_attention_layer_ids = full_attention_layer_ids
+
+        # Apply torchao quantization
+        torchao_applied = getattr(self.model, "torchao_applied", False)
+        # In layered loading, torchao may have been applied
+        if not torchao_applied:
+            apply_torchao_config_to_model(
+                self.model, get_global_server_args().torchao_config
+            )
+
+        # Apply torch TP if the model supports it
+        supports_torch_tp = getattr(self.model, "supports_torch_tp", False)
+        if self.tp_size > 1 and supports_torch_tp:
+            self.apply_torch_tp()
+
+        # Init lora
+        if server_args.enable_lora:
+            self.init_lora_manager()
+
+        # Init Double Sparsity
+        if server_args.enable_double_sparsity:
+            if server_args.ds_heavy_channel_type is None:
+                raise ValueError(
+                    "Please specify the heavy channel type for double sparsity optimization."
+                )
+            self.init_double_sparsity_channel_config(server_args.ds_heavy_channel_type)
+
+        # Enable batch invariant mode
+        if server_args.enable_deterministic_inference:
+            from sglang.srt.batch_invariant_ops import enable_batch_invariant_mode
+
+            enable_batch_invariant_mode()
+
+        # Deduce KV cache dtype
+        self.configure_kv_cache_dtype()
+
+        # Init memory pool and attention backends
+        self.init_memory_pool(pre_model_load_memory)
+
+        # Init max running requests
+        self.max_running_requests = min(
+            (
+                self.max_total_num_tokens // 2
+                if server_args.max_running_requests is None
+                else server_args.max_running_requests // self.dp_size
+            ),
+            self.req_to_token_pool.size,
+        )
+
+        # Init routed experts capturer
+        self.init_routed_experts_capturer()
+
+        if self.device == "cuda" or self.device == "musa":
+            self.init_cublas()
+            self.init_attention_backend()
+            self.kernel_warmup()
+            self.init_device_graphs()
+        elif self.device in ["npu", "cpu"]:
+            self.init_attention_backend()
+            self.init_device_graphs()
+        else:
+            self.graph_runner = None
+            self.graph_mem_usage = 0
+            self.init_attention_backend()
+
+        if server_args.forward_hooks:
+            register_forward_hooks(self.model, server_args.forward_hooks)
+
+        if self.eagle_use_aux_hidden_state:
+            self.model.set_eagle3_layers_to_capture(
+                self.eagle_aux_hidden_state_layer_ids
+            )
+
+        # Initialize piecewise CUDA graph
+        self.init_piecewise_cuda_graphs()
+
+        self.prealloc_symmetric_memory_pool()
+
+    def init_routed_experts_capturer(self):
+        if not self.server_args.disable_shared_experts_fusion and hasattr(
+            self.model, "num_fused_shared_experts"
+        ):
+            num_fused_shared_experts = self.model.num_fused_shared_experts
+        else:
+            num_fused_shared_experts = 0
+
+        set_global_experts_capturer(
+            RoutedExpertsCapturer.create(
+                enable=get_global_server_args().enable_return_routed_experts,
+                model_config=self.model_config,
+                num_fused_shared_experts=num_fused_shared_experts,
+                num_tokens=self.max_total_num_tokens + self.page_size,
+                max_running_requests=self.max_running_requests,
+                device=self.device,
+            )
+        )
+
+    def remote_instance_init_transfer_engine(self):
+        try:
+            from mooncake.engine import TransferEngine
+        except ImportError as e:
+            logger.warning(
+                "Please install mooncake for using remote instance transfer engine: pip install mooncake"
+            )
+            return
+        self.remote_instance_transfer_engine = TransferEngine()
+        local_ip = get_local_ip_auto()
+        self.remote_instance_transfer_engine.initialize(
+            local_ip, "P2PHANDSHAKE", "rdma", envs.MOONCAKE_DEVICE.get()
+        )
+        self.remote_instance_transfer_engine_session_id = (
+            f"{local_ip}:{self.remote_instance_transfer_engine.get_rpc_port()}"
+        )
+
+    def model_specific_adjustment(self):
+        server_args = self.server_args
+
+        if server_args.enable_double_sparsity:
+            logger.info(
+                "Double sparsity optimization is turned on. Use triton backend without CUDA graph."
+            )
+            server_args.attention_backend = "triton"
+            server_args.disable_cuda_graph = True
+
+        if self.is_multimodal:
+            if not self.is_multimodal_chunked_prefill_supported:
+                server_args.chunked_prefill_size = -1
+                logger.info(
+                    f"Automatically turn off --chunked-prefill-size as it is not supported for "
+                    f"{self.model_config.hf_config.model_type}"
+                )
+
+        if (
+            not self.use_mla_backend
+            or server_args.attention_backend
+            not in CHUNKED_PREFIX_CACHE_SUPPORTED_ATTENTION_BACKENDS
+        ):
+            server_args.disable_chunked_prefix_cache = True
+
+        if not server_args.disable_chunked_prefix_cache:
+            log_info_on_rank0(logger, "Chunked prefix cache is turned on.")
+
+    def check_quantized_moe_compatibility(self):
+        if (
+            quantization_config := getattr(
+                self.model_config.hf_config, "quantization_config", None
+            )
+        ) is not None and (
+            weight_block_size := quantization_config.get("weight_block_size", None)
+        ) is not None:
+            weight_block_size_n = weight_block_size[0]
+
+            if self.tp_size % self.moe_ep_size != 0:
+                raise ValueError(
+                    f"tp_size {self.tp_size} must be divisible by ep_size {self.moe_ep_size}"
+                )
+            moe_tp_size = self.tp_size // self.moe_ep_size
+
+            moe_intermediate_size = getattr(
+                self.model_config.hf_text_config, "moe_intermediate_size", None
+            )
+            if moe_intermediate_size is None:
+                return
+
+            if moe_intermediate_size % moe_tp_size != 0:
+                raise ValueError(
+                    f"moe_intermediate_size {moe_intermediate_size} must be divisible by moe_tp_size ({moe_tp_size}) which is tp_size ({self.tp_size}) divided by moe_ep_size ({self.moe_ep_size})."
+                )
+
+            if (moe_intermediate_size // moe_tp_size) % weight_block_size_n != 0:
+                raise ValueError(
+                    f"For quantized MoE models, please make sure ({moe_intermediate_size=} / {moe_tp_size=}) % {weight_block_size_n=} == 0 "
+                    f"where moe_tp_size is equal to tp_size ({self.tp_size}) divided by ep_size ({self.moe_ep_size}). "
+                    f"You can fix this by setting arguments `--tp` and `--ep` correctly."
+                )
+
+    def init_torch_distributed(self):
+        tic = time.perf_counter()
+        logger.info("Init torch distributed begin.")
+
+        try:
+            torch.get_device_module(self.device).set_device(self.gpu_id)
+        except Exception:
+            logger.warning(
+                f"Context: {self.device=} {self.gpu_id=} {os.environ.get('CUDA_VISIBLE_DEVICES')=} {self.tp_rank=} {self.tp_size=}"
+            )
+            raise
+
+        backend = get_default_distributed_backend(self.device)
+        if self.device == "cuda" and self.server_args.elastic_ep_backend == "mooncake":
+            backend = "mooncake"
+            if self.server_args.mooncake_ib_device:
+                mooncake_ib_device = self.server_args.mooncake_ib_device.split(",")
+                try:
+                    from mooncake import ep as mooncake_ep
+
+                    mooncake_ep.set_device_filter(mooncake_ib_device)
+                except:
+                    pass  # A warning will be raised in `init_distributed_environment`
+
+        before_avail_memory = get_available_gpu_memory(self.device, self.gpu_id)
+        if not self.server_args.enable_p2p_check:
+            monkey_patch_p2p_access_check()
+
+        # Allow external orchestrators (e.g. trainpi) to override the distributed
+        # init method.  When set to "env://", torch uses MASTER_ADDR/MASTER_PORT
+        # env-vars and an externally-created TCPStore, completely avoiding port
+        # conflicts with intra-host collocation.
+        dist_init_method_override = envs.SGLANG_DISTRIBUTED_INIT_METHOD_OVERRIDE.get()
+        if dist_init_method_override:
+            dist_init_method = dist_init_method_override
+        elif self.server_args.dist_init_addr:
+            dist_init_method = f"tcp://{self.server_args.dist_init_addr}"
+        else:
+            dist_init_method = f"tcp://127.0.0.1:{self.dist_port}"
+        set_custom_all_reduce(not self.server_args.disable_custom_all_reduce)
+        set_mscclpp_all_reduce(self.server_args.enable_mscclpp)
+        set_torch_symm_mem_all_reduce(self.server_args.enable_torch_symm_mem)
+
+        if not self.is_draft_worker:
+            if self.device == "cpu":
+                if _is_cpu_amx_available or _is_cpu_arm64:
+                    # Bind OpenMP threads to CPU cores
+                    torch.ops.sgl_kernel.init_cpu_threads_env(self.local_omp_cpuid)
+
+                    # Set local size to hint SGLang to use shared memory based AllReduce
+                    os.environ["LOCAL_SIZE"] = str(self.tp_size)
+                    torch.ops.sgl_kernel.initialize(self.tp_size, self.tp_rank)
+
+                    @torch.library.register_fake("sgl_kernel::shm_allgather")
+                    def _(data, dim):
+                        return torch.cat([data] * self.tp_size, dim=dim)
+
+                else:
+                    logger.warning(
+                        "init_cpu_threads_env and shared memory based AllReduce is disabled, only intel amx backend and arm64 are supported"
+                    )
+
+            # Only initialize the distributed environment on the target model worker.
+            init_distributed_environment(
+                backend=backend,
+                world_size=self.tp_size * self.pp_size,
+                rank=self.tp_size * self.pp_rank + self.tp_rank,
+                local_rank=self.gpu_id,
+                distributed_init_method=dist_init_method,
+                timeout=self.server_args.dist_timeout,
+            )
+            initialize_model_parallel(
+                tensor_model_parallel_size=self.tp_size,
+                attention_data_parallel_size=self.dp_size,
+                pipeline_model_parallel_size=self.pp_size,
+                expert_model_parallel_size=self.moe_ep_size,
+                attention_context_model_parallel_size=self.attn_cp_size,
+                moe_data_model_parallel_size=self.moe_dp_size,
+                duplicate_tp_group=self.server_args.enable_pdmux,
+            )
+            initialize_dp_attention(
+                server_args=self.server_args,
+                model_config=self.model_config,
+            )
+            if is_npu():
+                register_sgl_tp_rank(self.gpu_id)
+
+        pre_model_load_memory = get_available_gpu_memory(
+            self.device,
+            self.gpu_id,
+            distributed=get_world_group().world_size > 1,
+            cpu_group=get_world_group().cpu_group,
+        )
+        self.tp_group = get_tp_group()
+        self.pp_group = get_pp_group()
+        self.attention_tp_group = get_attention_tp_group()
+
+        # Check memory for tensor parallelism
+        local_gpu_memory = get_available_gpu_memory(self.device, self.gpu_id)
+        if self.tp_size > 1 and not self.is_draft_worker:
+            if pre_model_load_memory < local_gpu_memory * 0.9:
+                msg = "The memory capacity is unbalanced. Some GPUs may be occupied by other processes. "
+                msg += f"{pre_model_load_memory=}, {local_gpu_memory=}, {local_gpu_memory * 0.9=}"
+                if envs.SGLANG_ENABLE_TP_MEMORY_INBALANCE_CHECK.get():
+                    raise RuntimeError(msg)
+                else:
+                    logger.warning(msg)
+
+        logger.info(
+            f"Init torch distributed ends. elapsed={time.perf_counter() - tic:.2f} s, "
+            f"mem usage={(before_avail_memory - local_gpu_memory):.2f} GB"
+        )
+        return pre_model_load_memory
+
+    def init_shared_mooncake_transfer_engine(self):
+        """
+        Need MooncakeTransferEngine when:
+        1) PD disaggregation uses mooncake for KV transfer (prefill/decode)
+        2) HiCache uses mooncake storage backend
+        3) Encoder disaggregation uses mooncake
+        """
+        use_mooncake_te = (
+            (
+                self.server_args.disaggregation_mode != "null"
+                and self.server_args.disaggregation_transfer_backend == "mooncake"
+            )
+            or (
+                self.server_args.enable_hierarchical_cache
+                and self.server_args.hicache_storage_backend == "mooncake"
+                and envs.SGLANG_HICACHE_MOONCAKE_REUSE_TE.get()
+            )
+            or (
+                self.server_args.encoder_only
+                and self.server_args.encoder_transfer_backend == "mooncake"
+            )
+            or (
+                self.server_args.language_only
+                and self.server_args.encoder_transfer_backend == "mooncake"
+            )
+            or (
+                self.server_args.enable_elastic_expert_backup
+                and self.server_args.elastic_ep_backend is not None
+            )
+        )
+
+        if use_mooncake_te:
+            from sglang.srt.distributed.device_communicators.mooncake_transfer_engine import (
+                init_mooncake_transfer_engine,
+            )
+
+            init_mooncake_transfer_engine(
+                hostname=get_local_ip_auto(),
+                gpu_id=self.gpu_id,
+                ib_device=(
+                    self.server_args.disaggregation_ib_device
+                    or self.server_args.mooncake_ib_device
+                ),
+            )
+
+    def load_model(self):
+        tic_total = time.perf_counter()
+        before_avail_memory = get_available_gpu_memory(self.device, self.gpu_id)
+        logger.info(
+            f"Load weight begin. avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
+        )
+
+        # This can reduce thread conflicts and speed up weight loading.
+        if self.device != "cpu":
+            torch.set_num_threads(1)
+        if self.device == "cuda":
+            if torch.cuda.get_device_capability()[0] < 8:
+                logger.info(
+                    "Compute capability below sm80. Use float16 due to lack of bfloat16 support."
+                )
+                self.server_args.dtype = "float16"
+                self.model_config.dtype = torch.float16
+                if torch.cuda.get_device_capability()[1] < 5:
+                    raise RuntimeError("SGLang only supports sm75 and above.")
+
+        set_cuda_arch()
+
+        # Prepare the model config
+        from sglang.srt.configs.modelopt_config import ModelOptConfig
+
+        modelopt_config = ModelOptConfig(
+            quant=self.server_args.modelopt_quant,
+            checkpoint_restore_path=self.server_args.modelopt_checkpoint_restore_path,
+            checkpoint_save_path=self.server_args.modelopt_checkpoint_save_path,
+            export_path=self.server_args.modelopt_export_path,
+            quantize_and_serve=self.server_args.quantize_and_serve,
+        )
+
+        self.load_config = LoadConfig(
+            load_format=self.server_args.load_format,
+            download_dir=self.server_args.download_dir,
+            model_loader_extra_config=self.server_args.model_loader_extra_config,
+            tp_rank=self.tp_rank,
+            remote_instance_weight_loader_seed_instance_ip=self.server_args.remote_instance_weight_loader_seed_instance_ip,
+            remote_instance_weight_loader_seed_instance_service_port=self.server_args.remote_instance_weight_loader_seed_instance_service_port,
+            remote_instance_weight_loader_send_weights_group_ports=self.server_args.remote_instance_weight_loader_send_weights_group_ports,
+            remote_instance_weight_loader_backend=self.server_args.remote_instance_weight_loader_backend,
+            remote_instance_weight_loader_transfer_engine=self.remote_instance_transfer_engine,
+            modelopt_config=modelopt_config,
+            rl_quant_profile=self.server_args.rl_quant_profile,
+            draft_model_idx=self.draft_model_idx,
+        )
+        if self.device == "cpu":
+            self.model_config = adjust_config_with_unaligned_cpu_tp(
+                self.model_config, self.load_config, self.tp_size
+            )
+
+        if (
+            self.server_args.load_format == LoadFormat.REMOTE_INSTANCE
+            and self.server_args.remote_instance_weight_loader_backend
+            == RemoteInstanceWeightLoaderBackend.NCCL
+        ):
+            if self.tp_rank == 0:
+                instance_ip = socket.gethostbyname(socket.gethostname())
+                t = threading.Thread(
+                    target=trigger_init_weights_send_group_for_remote_instance_request,
+                    args=(
+                        self.server_args.remote_instance_weight_loader_seed_instance_ip,
+                        self.server_args.remote_instance_weight_loader_seed_instance_service_port,
+                        self.server_args.remote_instance_weight_loader_send_weights_group_ports,
+                        instance_ip,
+                    ),
+                )
+                t.start()
+
+        # Load the model
+        # Remove monkey_patch when linear.py quant remove dependencies with vllm
+        monkey_patch_vllm_parallel_state()
+
+        enable_cpu_backup = self.server_args.enable_weights_cpu_backup or (
+            self.is_draft_worker and self.server_args.enable_draft_weights_cpu_backup
+        )
+        with self.memory_saver_adapter.region(
+            GPU_MEMORY_TYPE_WEIGHTS,
+            enable_cpu_backup=enable_cpu_backup,
+        ):
+            self.loader = get_model_loader(
+                load_config=self.load_config,
+                model_config=self.model_config,
+            )
+            self.model = self.loader.load_model(
+                model_config=self.model_config,
+                device_config=DeviceConfig(self.device, self.gpu_id),
+            )
+            if hasattr(self.loader, "remote_instance_transfer_engine_weight_info"):
+                self.remote_instance_transfer_engine_weight_info = (
+                    self.loader.remote_instance_transfer_engine_weight_info
+                )
+        monkey_patch_vllm_parallel_state(reverse=True)
+
+        get_offloader().post_init()
+
+        # Register model for layerwise NVTX profiling if enabled
+        if self.server_args.enable_layerwise_nvtx_marker:
+            self.pyt_hooks = PytHooks()
+            self.pyt_hooks.register_hooks(self.model, module_prefix="model")
+
+        if self.server_args.kv_cache_dtype == "fp8_e4m3":
+            if self.server_args.quantization_param_path is not None:
+                if callable(getattr(self.model, "load_kv_cache_scales", None)):
+                    self.model.load_kv_cache_scales(
+                        self.server_args.quantization_param_path
+                    )
+                    logger.info(
+                        "Loaded KV cache scaling factors from %s",
+                        self.server_args.quantization_param_path,
+                    )
+                else:
+                    raise RuntimeError(
+                        "Using FP8 KV cache and scaling factors provided but "
+                        "model %s does not support loading scaling factors.",
+                        self.model.__class__,
+                    )
+            else:
+                logger.warning(
+                    "Using FP8 KV cache but no scaling factors "
+                    "provided. Defaulting to scaling factors of 1.0. "
+                    "This may lead to less accurate results!"
+                )
+
+        # Parse other args
+        self.sliding_window_size = None
+        if hasattr(self.model, "get_attention_sliding_window_size"):
+            self.sliding_window_size = self.model.get_attention_sliding_window_size()
+        elif (
+            self.model_config.is_hybrid_swa
+            and self.model_config.sliding_window_size is not None
+        ):
+            # sliding window field in model config may have different meaning for different kinds of models (e.g., dllm), here we only consider the sliding window in SWA model
+            self.sliding_window_size = self.model_config.sliding_window_size
+        elif self.model_config.attention_chunk_size is not None:
+            self.sliding_window_size = self.model_config.attention_chunk_size
+            logger.info(
+                f"Setting sliding_window_size to be attention_chunk_size: {self.sliding_window_size}"
+            )
+
+        self.dtype = self.model_config.dtype
+
+        after_avail_memory = get_available_gpu_memory(self.device, self.gpu_id)
+        self.weight_load_mem_usage = before_avail_memory - after_avail_memory
+        # Get quantization config from ModelConfig
+        # This handles both config.json (standard) and hf_quant_config.json (ModelOpt)
+        quant_str = self.model_config.get_quantization_config_log_str()
+
+        logger.info(
+            f"Load weight end. "
+            f"elapsed={time.perf_counter() - tic_total:.2f} s, "
+            f"type={type(self.model).__name__}, "
+            f"{quant_str + ', ' if quant_str else ''}"
+            f"avail mem={after_avail_memory:.2f} GB, "
+            f"mem usage={self.weight_load_mem_usage:.2f} GB."
+        )
+        if self.server_args.debug_tensor_dump_output_folder is not None:
+            register_forward_hook_for_model(
+                self.model,
+                self.server_args.debug_tensor_dump_output_folder,
+                self.server_args.debug_tensor_dump_layers,
+                self.tp_size,
+                self.tp_rank,
+                self.pp_rank,
+            )
+
+        if dumper.may_enable:
+            dumper.apply_source_patches()
+            dumper.register_non_intrusive_dumper(self.model)
+
+        # Pre-expand RoPE cache before CUDA Graph capture
+        reserve_rope_cache_for_long_sequences(
+            self.model,
+            self.server_args,
+            self.model_config,
+            logger,
+        )
+
+        if self.server_args.elastic_ep_backend == "mooncake":
+            # Mooncake does not support `monitored_barrier`
+            dist.barrier(group=get_tp_group().cpu_group)
+        else:
+            # Handle the case where some ranks do not finish loading.
+            try:
+                dist.monitored_barrier(
+                    group=get_tp_group().cpu_group,
+                    timeout=datetime.timedelta(
+                        seconds=UNBALANCED_MODEL_LOADING_TIMEOUT_S
+                    ),
+                    wait_all_ranks=True,
+                )
+            except RuntimeError:
+                raise ValueError(
+                    f"TP rank {self.tp_rank} could finish the model loading, but there are other ranks that didn't finish loading. It is likely due to unexpected failures (e.g., OOM) or a slow node."
+                ) from None
+
+    def update_expert_location(
+        self,
+        new_expert_location_metadata: ExpertLocationMetadata,
+        update_layer_ids: List[int],
+    ):
+        if ElasticEPStateManager.instance() is not None:
+            # TODO: refactor the weights update when elastic ep
+            old_expert_location_metadata = get_global_expert_location_metadata()
+            assert old_expert_location_metadata is not None
+            old_expert_location_metadata.update(
+                new_expert_location_metadata,
+                update_layer_ids=update_layer_ids,
+            )
+            if (
+                self.expert_backup_client is not None
+                and self.expert_backup_client.use_backup
+            ):
+                self.expert_backup_client.update_weights()
+                return
+
+            self.update_weights_from_disk(
+                self.server_args.model_path,
+                self.server_args.load_format,
+                lambda name: "mlp.experts" in name and "mlp.shared_experts" not in name,
+            )
+        else:
+            self.expert_location_updater.update(
+                self.model.routed_experts_weights_of_layer,
+                new_expert_location_metadata,
+                update_layer_ids=update_layer_ids,
+                nnodes=self.server_args.nnodes,
+                rank=self.tp_rank,
+            )
+
+    def update_weights_from_disk(
+        self,
+        model_path: str,
+        load_format: str,
+        weight_name_filter: Optional[Callable[[str], bool]] = None,
+        recapture_cuda_graph: bool = False,
+    ) -> tuple[bool, str]:
+        """Update engine weights in-place from the disk."""
+        logger.info(
+            f"Update engine weights online from disk begin. "
+            f"avail mem={get_available_gpu_memory(self.device, self.gpu_id, empty_cache=False):.2f} GB"
+        )
+
+        target_device = torch.device(self.device)
+        self.model_config.model_path = model_path
+        load_config = LoadConfig(load_format=load_format)
+
+        # Only support DefaultModelLoader for now
+        loader = get_model_loader(load_config, self.model_config)
+        if not isinstance(loader, DefaultModelLoader):
+            message = f"Failed to get model loader: {loader}."
+            return False, message
+
+        def get_weight_iter(config):
+            iter = loader._get_weights_iterator(
+                DefaultModelLoader.Source.init_new(config, self.model)
+            )
+            if weight_name_filter is not None:
+                iter = (
+                    (name, weight) for name, weight in iter if weight_name_filter(name)
+                )
+
+            return iter
+
+        def model_load_weights(model, iter):
+            loader.load_weights_and_postprocess(model, iter, target_device)
+            return model
+
+        with set_default_torch_dtype(self.model_config.dtype):
+            try:
+                iter = get_weight_iter(self.model_config)
+            except Exception as e:
+                message = f"Failed to get weights iterator: {e}."
+                return False, message
+            try:
+                model = model_load_weights(self.model, iter)
+            except Exception as e:
+                message = (
+                    f"Failed to update weights: {e}.\nRolling back to original weights."
+                )
+                del iter
+                gc.collect()
+                iter = get_weight_iter(self.model_config)
+                self.model = model_load_weights(self.model, iter)
+                return False, message
+
+        self.model = model
+        self.server_args.model_path = model_path
+        self.server_args.load_format = load_format
+        self.load_config = load_config
+
+        if recapture_cuda_graph and (self.device == "cuda" or self.device == "musa"):
+            self.init_device_graphs()
+
+        logger.info("Update weights end.")
+        return True, "Succeeded to update model weights."
+
+    def init_weights_send_group_for_remote_instance(
+        self,
+        master_address,
+        ports,
+        group_rank,
+        world_size,
+        group_name,
+        backend="nccl",
+    ):
+        assert (
+            torch.distributed.is_initialized()
+        ), "Default torch process group must be initialized"
+        assert group_name != "", "Group name cannot be empty"
+
+        ports_list = ports.split(",")
+        assert (
+            len(ports_list) == self.tp_size
+        ), f"Expected {self.tp_size} ports, but got {len(ports_list)} ports."
+        group_port = ports_list[self.tp_rank]
+        group_name = f"{group_name}_{group_port}_{self.tp_rank}"
+
+        logger.info(
+            f"init custom process group: tp_rank={self.tp_rank}, gpu_id={self.gpu_id}, master_address={master_address}, master_port={group_port}, "
+            f"group_rank={group_rank}, world_size={world_size}, group_name={group_name}, backend={backend}"
+        )
+
+        torch.cuda.empty_cache()
+        success = False
+        message = ""
+        try:
+            self._weights_send_group[group_name] = init_custom_process_group(
+                backend=backend,
+                init_method=f"tcp://{master_address}:{group_port}",
+                world_size=world_size,
+                rank=group_rank,
+                group_name=group_name,
+                device_id=torch.device("cuda", self.gpu_id),
+            )
+            dist.barrier(group=self._weights_send_group[group_name])
+            success = True
+            message = (
+                f"Succeeded to init group through {master_address}:{group_port} group."
+            )
+        except Exception as e:
+            message = f"Failed to init group: {e}."
+            logger.error(message)
+
+        torch.cuda.empty_cache()
+        return success, message
+
+    def send_weights_to_remote_instance(
+        self,
+        master_address,
+        ports,
+        group_name,
+    ):
+        assert (
+            torch.distributed.is_initialized()
+        ), "Default torch process group must be initialized"
+        assert group_name != "", "Group name cannot be empty"
+
+        ports_list = ports.split(",")
+        assert (
+            len(ports_list) == self.tp_size
+        ), f"Expected {self.tp_size} ports, but got {len(ports_list)} ports."
+        group_port = ports_list[self.tp_rank]
+        group_name = f"{group_name}_{group_port}_{self.tp_rank}"
+
+        if self._weights_send_group[group_name] is not None:
+            send_group = self._weights_send_group[group_name]
+        else:
+            message = f"Group {group_name} not in _weights_send_group list. Please call `init_weights_send_group_for_remote_instance` first."
+            logger.error(message)
+            return False, message
+
+        torch.cuda.empty_cache()
+        success = False
+        message = ""
+        try:
+            for _, weights in self.model.named_parameters():
+                torch.distributed.broadcast(
+                    weights,
+                    src=0,
+                    group=send_group,
+                )
+            success = True
+            message = f"Succeeded to send weights through {master_address}:{group_port} {group_name}."
+        except Exception as e:
+            message = f"Failed to send weights: {e}."
+            logger.error(message)
+
+        # destroy the process group after sending weights
+        del self._weights_send_group[group_name]
+        torch.distributed.distributed_c10d.destroy_process_group(send_group)
+        torch.cuda.empty_cache()
+        return success, message
+
+    def init_weights_update_group(
+        self,
+        master_address,
+        master_port,
+        rank_offset,
+        world_size,
+        group_name,
+        backend="nccl",
+    ):
+        """Initialize the Torch process group for model parameter updates.
+
+        `_model_update_group` is used in the RLHF workflow, where rank
+        0 is the actor model in the training engine, and the other ranks are
+        the inference engine, which is used for rollout.
+
+        In the RLHF workflow, the training engine updates the model
+        weights/parameters online, and broadcasts them to the inference
+        engine through the `_model_update_group` process group.
+        """
+        assert (
+            torch.distributed.is_initialized()
+        ), "Default torch process group must be initialized"
+        assert group_name != "", "Group name cannot be empty"
+
+        rank = rank_offset + self.tp_rank
+
+        logger.info(
+            f"init custom process group: master_address={master_address}, master_port={master_port}, "
+            f"rank_offset={rank_offset}, rank={rank}, world_size={world_size}, group_name={group_name}, backend={backend}"
+        )
+
+        try:
+            self._model_update_group[group_name] = init_custom_process_group(
+                backend=backend,
+                init_method=f"tcp://{master_address}:{master_port}",
+                world_size=world_size,
+                rank=rank,
+                group_name=group_name,
+            )
+            return True, "Succeeded to initialize custom process group."
+        except Exception as e:
+            message = f"Failed to initialize custom process group: {e}."
+            logger.error(message)
+            return False, message
+
+    def destroy_weights_update_group(self, group_name):
+        try:
+            if group_name in self._model_update_group:
+                pg = self._model_update_group.pop(group_name)
+                torch.distributed.destroy_process_group(pg)
+                return True, "Succeeded to destroy custom process group."
+            else:
+                return False, "The group to be destroyed does not exist."
+        except Exception as e:
+            message = f"Failed to destroy custom process group: {e}."
+            logger.error(message)
+            return False, message
+
+    def update_weights_from_distributed(
+        self,
+        names,
+        dtypes,
+        shapes,
+        group_name,
+        load_format: Optional[str] = None,
+    ):
+        """
+        Update specific parameter in the model weights online
+        through `_model_update_group` process group.
+
+        Args:
+            name: the name of the parameter to be updated.
+            dtype: the data type of the parameter to be updated.
+            shape: the shape of the parameter to be updated.
+        """
+
+        assert group_name in self._model_update_group, (
+            f"Group {group_name} not in {list(self._model_update_group.keys())}. "
+            "Please call `init_weights_update_group` first."
+        )
+
+        if load_format == "flattened_bucket":
+            return self._update_bucketed_weights_from_distributed(
+                names, dtypes, shapes, group_name
+            )
+        try:
+            weights = []
+            handles = []
+            for name, dtype, shape in zip(names, dtypes, shapes):
+                target_dtype = (
+                    dtype if isinstance(dtype, torch.dtype) else getattr(torch, dtype)
+                )
+                weight = torch.empty(shape, dtype=target_dtype, device=self.device)
+                handles.append(
+                    torch.distributed.broadcast(
+                        weight,
+                        src=0,
+                        group=self._model_update_group[group_name],
+                        async_op=True,
+                    )
+                )
+                weights.append((name, weight))
+            for handle in handles:
+                handle.wait()
+
+            self.model.load_weights(weights)
+            return True, "Succeeded to update parameter online."
+
+        except Exception as e:
+            error_msg = (
+                f"Failed to update parameter online: {e}. "
+                f"The full weights of the ModelRunner are partially updated. "
+                f"Please discard the whole weights."
+            )
+            logger.error(error_msg)
+            return False, error_msg
+
+    def _update_bucketed_weights_from_distributed(
+        self, names, dtypes, shapes, group_name
+    ):
+        try:
+            named_tensors = []
+            for name, dtype, shape in zip(names, dtypes, shapes):
+                target_dtype = (
+                    dtype if isinstance(dtype, torch.dtype) else getattr(torch, dtype)
+                )
+                named_tensors.append(
+                    (name, torch.empty(shape, dtype=target_dtype, device=self.device))
+                )
+            bucket = FlattenedTensorBucket(named_tensors=named_tensors)
+            flattened_tensor = bucket.get_flattened_tensor()
+            torch.distributed.broadcast(
+                flattened_tensor,
+                src=0,
+                group=self._model_update_group[group_name],
+            )
+            reconstructed_tensors = bucket.reconstruct_tensors()
+            self.model.load_weights(reconstructed_tensors)
+            return True, f"Succeeded to update parameter online."
+        except Exception as e:
+            error_msg = (
+                f"Failed to update parameter online: {e}. "
+                f"The full weights of the ModelRunner are partially updated. "
+                f"Please discard the whole weights."
+            )
+            logger.error(error_msg)
+            return False, error_msg
+
+    def update_weights_from_tensor(
+        self,
+        named_tensors: List[Tuple[str, Union[torch.Tensor, "LocalSerializedTensor"]]],
+        load_format: Optional[str] = None,
+    ):
+        monkey_patch_torch_reductions()
+        if load_format == "flattened_bucket":
+            # Handle flattened bucket format
+            return self._update_weights_from_flattened_bucket(
+                flattened_tensor_bucket_dict=named_tensors
+            )
+
+        # We need to get device after patch otherwise the device would be wrong
+        self.device_module = torch.get_device_module(self.device)
+        infered_device = self.device_module.current_device()
+
+        named_tensors = [
+            (name, _unwrap_tensor(tensor, tp_rank=self.tp_rank, device=infered_device))
+            for name, tensor in named_tensors
+        ]
+        if load_format == "direct":
+            _model_load_weights_direct(self.model, named_tensors)
+        elif load_format in self.server_args.custom_weight_loader:
+            custom_loader = dynamic_import(load_format)
+            custom_loader(self.model, named_tensors)
+        elif load_format is None:
+            self.model.load_weights(named_tensors)
+        else:
+            raise NotImplementedError(f"Unknown load_format={load_format}")
+        return True, "Success"
+
+    def _update_weights_from_flattened_bucket(
+        self,
+        flattened_tensor_bucket_dict,
+    ):
+        """Handle flattened bucket format for weight updates"""
+        flattened_tensor = flattened_tensor_bucket_dict["flattened_tensor"]
+        metadata = flattened_tensor_bucket_dict["metadata"]
+
+        # Convert metadata dict to our format
+        converted_metadata = []
+        for meta in metadata:
+            converted_meta = FlattenedTensorMetadata(
+                name=meta.name,
+                shape=meta.shape,
+                dtype=meta.dtype,
+                start_idx=meta.start_idx,
+                end_idx=meta.end_idx,
+                numel=meta.numel,
+            )
+            converted_metadata.append(converted_meta)
+
+        # Create bucket and reconstruct tensors
+        bucket = FlattenedTensorBucket(
+            flattened_tensor=flattened_tensor, metadata=converted_metadata
+        )
+        reconstructed_tensors = bucket.reconstruct_tensors()
+
+        # Load the reconstructed tensors using the standard method
+        self.model.load_weights(reconstructed_tensors)
+
+        return True, "Success"
+
+    def get_weights_by_name(
+        self, name: str, truncate_size: int = 100
+    ) -> Optional[torch.Tensor]:
+        """Get the weights of the parameter by its name. Similar to `get_parameter` in Hugging Face.
+
+        Only used for unit test with an unoptimized performance.
+        For optimized performance, please use torch.save and torch.load.
+        """
+        # TODO: (chenyang) Add support for Qwen models.
+        try:
+            return self.model.get_weights_by_name(
+                name, truncate_size, tp_size=self.tp_size
+            )
+        except Exception as e:
+            logger.error(f"Error when getting parameter {name}: {e}")
+            return None
+
+    def init_lora_manager(self):
+        self.lora_manager = LoRAManager(
+            base_model=self.model,
+            base_hf_config=self.model_config.hf_config,
+            max_loras_per_batch=self.server_args.max_loras_per_batch,
+            load_config=self.load_config,
+            dtype=self.dtype,
+            server_args=self.server_args,
+            lora_backend=self.server_args.lora_backend,
+            tp_size=self.tp_size,
+            tp_rank=self.tp_rank,
+            max_lora_rank=self.server_args.max_lora_rank,
+            target_modules=self.server_args.lora_target_modules,
+            lora_paths=self.server_args.lora_paths,
+        )
+
+    def load_lora_adapter(self, lora_ref: LoRARef):
+        """Load a new lora adapter from disk or huggingface."""
+
+        logger.info(
+            f"LoRA adapter loading starts: {lora_ref}. "
+            f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
+        )
+
+        result = self.lora_manager.load_lora_adapter(lora_ref)
+
+        logger.info(
+            f"LoRA adapter loading completes: {lora_ref}. "
+            f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
+        )
+
+        return result
+
+    def load_lora_adapter_from_tensors(
+        self, lora_ref: LoRARef, tensors, config_dict, added_tokens_config=None
+    ):
+        logger.info(f"LoRA adapter loading from tensors starts: {lora_ref}.")
+        result = self.lora_manager.load_lora_adapter_from_tensors(
+            lora_ref, tensors, config_dict, added_tokens_config
+        )
+        logger.info(f"LoRA adapter loading from tensors completes: {lora_ref}.")
+        return result
+
+    def unload_lora_adapter(self, lora_ref: LoRARef):
+        """Unload a lora adapter that was previously loaded during initialization or dynamic loading."""
+
+        logger.info(
+            f"LoRA adapter unloading starts: {lora_ref}. "
+            f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
+        )
+
+        result = self.lora_manager.unload_lora_adapter(lora_ref)
+
+        logger.info(
+            f"LoRA adapter unloading completes: {lora_ref}. "
+            f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
+        )
+
+        return result
+
+    @property
+    def qwen3_next_config(self):
+        config = self.model_config.hf_config
+        if isinstance(config, Qwen3NextConfig):
+            return config
+        return None
+
+    @property
+    def hybrid_lightning_config(self):
+        config = self.model_config.hf_config
+        if isinstance(config, BailingHybridConfig):
+            return config
+        return None
+
+    @property
+    def hybrid_gdn_config(self):
+        config = self.model_config.hf_config.get_text_config()
+        if isinstance(
+            config,
+            Qwen3NextConfig
+            | Qwen3_5Config
+            | Qwen3_5MoeConfig
+            | JetNemotronConfig
+            | JetVLMConfig,
+        ):
+            return config
+        return None
+
+    @property
+    def mamba2_config(self):
+        config = self.model_config.hf_config
+        if isinstance(config, NemotronHConfig) and self.is_draft_worker:
+            # NemotronH MTP draft models have no Mamba layers (pattern like "*E")
+            # so they shouldn't use HybridLinearAttnBackend
+            pattern = getattr(config, "mtp_hybrid_override_pattern", None)
+            if pattern is not None and "M" not in pattern:
+                return None
+        if isinstance(
+            config, FalconH1Config | NemotronHConfig | Lfm2Config | Lfm2MoeConfig
+        ):
+            return config
+        if isinstance(config, NemotronH_Nano_VL_V2_Config):
+            return config.llm_config
+
+        if isinstance(config, GraniteMoeHybridConfig):
+            has_mamba = any(
+                layer_type == "mamba"
+                for layer_type in getattr(config, "layer_types", [])
+            )
+            if not has_mamba:
+                return None
+            else:
+                return config
+
+        return None
+
+    @property
+    def max_token_pool_size(self):
+        """Return the max token pool size considering hybrid swa settings."""
+        if self.is_hybrid_swa:
+            return min(self.swa_max_total_num_tokens, self.max_total_num_tokens)
+        else:
+            return self.max_total_num_tokens
+
+    @property
+    def kimi_linear_config(self):
+        config = self.model_config.hf_config
+        if isinstance(config, KimiLinearConfig):
+            return config
+        return None
+
+    @property
+    def mambaish_config(self):
+        return (
+            self.mamba2_config
+            or self.hybrid_gdn_config
+            or self.kimi_linear_config
+            or self.hybrid_lightning_config
+        )
+
+    def configure_kv_cache_dtype(self):
+        if self.server_args.kv_cache_dtype == "auto":
+            quant_config = getattr(self.model, "quant_config", None)
+            kv_cache_quant_algo = getattr(quant_config, "kv_cache_quant_algo", None)
+            if (
+                isinstance(kv_cache_quant_algo, str)
+                and kv_cache_quant_algo.upper() == "FP8"
+            ):
+                if _is_hip:
+                    self.kv_cache_dtype = fp8_dtype
+                    self.server_args.kv_cache_dtype = TORCH_DTYPE_TO_KV_CACHE_STR[
+                        self.kv_cache_dtype
+                    ]
+                else:
+                    self.kv_cache_dtype = torch.float8_e4m3fn
+                    self.server_args.kv_cache_dtype = TORCH_DTYPE_TO_KV_CACHE_STR[
+                        self.kv_cache_dtype
+                    ]
+            else:
+                self.kv_cache_dtype = self.dtype
+        elif self.server_args.kv_cache_dtype == "fp8_e5m2":
+            if _is_hip:  # Using natively supported format
+                self.kv_cache_dtype = fp8_dtype
+            else:
+                self.kv_cache_dtype = torch.float8_e5m2
+        elif self.server_args.kv_cache_dtype == "fp8_e4m3":
+            if _is_hip:  # Using natively supported format
+                self.kv_cache_dtype = fp8_dtype
+            else:
+                self.kv_cache_dtype = torch.float8_e4m3fn
+        elif self.server_args.kv_cache_dtype in ("bf16", "bfloat16"):
+            self.kv_cache_dtype = torch.bfloat16
+        elif self.server_args.kv_cache_dtype == "fp4_e2m1":
+            if hasattr(torch, "float4_e2m1fn_x2"):
+                self.kv_cache_dtype = torch.float4_e2m1fn_x2
+                logger.warning(f"FP4 (E2M1) KV Cache might lead to a accuracy drop!")
+            else:
+                logger.warning(
+                    f"--kv-cache-dtype falls back to 'auto' because this torch version does not support torch.float4_e2m1fn_x2"
+                )
+                self.kv_cache_dtype = self.dtype
+        else:
+            raise ValueError(
+                f"Unsupported kv_cache_dtype: {self.server_args.kv_cache_dtype}."
+            )
+
+        log_info_on_rank0(logger, f"Using KV cache dtype: {self.kv_cache_dtype}")
+
+    def init_cublas(self):
+        """We need to run a small matmul to init cublas. Otherwise, it will raise some errors later."""
+        dtype = torch.float16
+        device = "cuda"
+        a = torch.ones((16, 16), dtype=dtype, device=device)
+        b = torch.ones((16, 16), dtype=dtype, device=device)
+        c = a @ b
+        return c
+
+    def init_attention_backend(self):
+        """Init attention kernel backend."""
+        if self.server_args.enable_pdmux:
+            self.attn_backend = self._get_attention_backend(init_new_workspace=True)
+            self.decode_attn_backend_group = []
+            for _ in range(self.server_args.sm_group_num):
+                self.decode_attn_backend_group.append(self._get_attention_backend())
+            self.decode_attn_backend = self.decode_attn_backend_group[0]
+        elif self.server_args.enable_two_batch_overlap and not self.is_draft_worker:
+            self.attn_backend = TboAttnBackend.init_new(self._get_attention_backend)
+        else:
+            self.attn_backend = self._get_attention_backend()
+
+    def _get_attention_backend(self, init_new_workspace: bool = False):
+        """Init attention kernel backend."""
+        draft_attn_backend = self.server_args.speculative_draft_attention_backend
+        if self.is_draft_worker and draft_attn_backend:
+            logger.warning(
+                f"Overriding draft attention backend to {draft_attn_backend}."
+            )
+            return self._get_attention_backend_from_str(
+                draft_attn_backend,
+                init_new_workspace=init_new_workspace,
+            )
+
+        (
+            self.prefill_attention_backend_str,
+            self.decode_attention_backend_str,
+        ) = self.server_args.get_attention_backends()
+
+        if self.decode_attention_backend_str != self.prefill_attention_backend_str:
+            from sglang.srt.layers.attention.hybrid_attn_backend import (
+                HybridAttnBackend,
+            )
+
+            attn_backend = HybridAttnBackend(
+                self,
+                decode_backend=self._get_attention_backend_from_str(
+                    self.decode_attention_backend_str,
+                    init_new_workspace=init_new_workspace,
+                ),
+                prefill_backend=self._get_attention_backend_from_str(
+                    self.prefill_attention_backend_str,
+                    init_new_workspace=init_new_workspace,
+                ),
+            )
+            logger.info(
+                f"Using hybrid attention backend for decode and prefill: "
+                f"decode_backend={self.decode_attention_backend_str}, "
+                f"prefill_backend={self.prefill_attention_backend_str}."
+            )
+            logger.warning(
+                "Warning: Attention backend specified by --attention-backend or default backend might be overridden."
+                "The feature of hybrid attention backend is experimental and unstable. Please raise an issue if you encounter any problem."
+            )
+        else:
+            attn_backend = self._get_attention_backend_from_str(
+                self.server_args.attention_backend,
+                init_new_workspace=init_new_workspace,
+            )
+
+        (
+            get_global_server_args().prefill_attention_backend,
+            get_global_server_args().decode_attention_backend,
+        ) = (self.prefill_attention_backend_str, self.decode_attention_backend_str)
+        return attn_backend
+
+    def _get_attention_backend_from_str(
+        self, backend_str: str, init_new_workspace: bool = False
+    ):
+        if backend_str not in ATTENTION_BACKENDS:
+            raise ValueError(f"Invalid attention backend: {backend_str}")
+        self.init_new_workspace = init_new_workspace
+        full_attention_backend = ATTENTION_BACKENDS[backend_str](self)
+        return attn_backend_wrapper(self, full_attention_backend)
+
+    def init_double_sparsity_channel_config(self, selected_channel):
+        selected_channel = "." + selected_channel + "_proj"
+        self.sorted_channels = []
+        # load channel config
+        with open(self.server_args.ds_channel_config_path, "r") as f:
+            channel_config = json.load(f)
+
+        for i in range(self.start_layer, self.end_layer):
+            key = "model.layers." + str(i) + ".self_attn" + selected_channel
+            self.sorted_channels.append(
+                torch.tensor(channel_config[key])[
+                    :, : self.server_args.ds_heavy_channel_num
+                ]
+                .contiguous()
+                .cuda()
+            )
+
+    def kernel_warmup(self):
+        """
+        Warmup and tune kernels before cuda graph capture.
+        Currently only doing FlashInfer autotune.
+        """
+        if self.device != "cuda":
+            return
+
+        if self._should_run_flashinfer_autotune():
+            self._flashinfer_autotune()
+
+    def _should_run_flashinfer_autotune(self) -> bool:
+        """Check if flashinfer autotune should be run."""
+        if self.server_args.disable_flashinfer_autotune:
+            return False
+
+        backend_str = self.server_args.moe_runner_backend
+
+        # TODO smor- support other cases for flashinfer autotune, such as, mamba backend
+
+        if backend_str not in [
+            "flashinfer_trtllm",
+            "flashinfer_mxfp4",
+            # TODO: flashinfer_cutlass will cause some flashinfer compilation errors. To be fixed.
+            # "flashinfer_cutlass",
+        ]:
+            return False
+
+        major, _ = torch.cuda.get_device_capability()
+        if major < 9:
+            return False
+
+        if (
+            self.spec_algorithm.is_eagle()
+            or self.spec_algorithm.is_standalone()
+            or self.spec_algorithm.is_ngram()
+        ):
+            return not self.is_draft_worker
+
+        return True
+
+    def _flashinfer_autotune(self):
+        """Run flashinfer autotune."""
+        from flashinfer.autotuner import autotune
+
+        logger.info("Running FlashInfer autotune...")
+
+        # Run warmup on the non-default stream to avoid NCCL 2.29+ cudaMemcpyBatchAsync
+        # calls on default stream (unsupported by CUDA) when --enable-symm-mem is used.
+        self.forward_stream.wait_stream(torch.cuda.current_stream())
+        with torch.get_device_module(self.device).stream(self.forward_stream):
+            with torch.inference_mode(), autotune():
+                self._dummy_run(
+                    batch_size=self.req_to_token_pool.size, run_ctx=autotune()
+                )
+        torch.cuda.current_stream().wait_stream(self.forward_stream)
+        logger.info("FlashInfer autotune completed.")
+
+    def _dummy_run(self, batch_size: int, run_ctx=None):
+        """Run a dummy forward pass for warmup/profiling."""
+        if self.is_generation:
+            capture_forward_mode = ForwardMode.DECODE
+        else:
+            capture_forward_mode = ForwardMode.EXTEND
+        capture_hidden_mode = CaptureHiddenMode.NULL
+        num_tokens_per_bs = 1
+        if (
+            self.spec_algorithm.is_eagle()
+            or self.spec_algorithm.is_standalone()
+            or self.spec_algorithm.is_ngram()
+        ):
+            if self.is_draft_worker:
+                raise RuntimeError("This should not happen")
+            else:
+                capture_forward_mode = ForwardMode.TARGET_VERIFY
+                num_tokens_per_bs = self.server_args.speculative_num_draft_tokens
+
+        if self.server_args.enable_return_hidden_states:
+            capture_hidden_mode = CaptureHiddenMode.FULL
+
+        num_tokens = batch_size * num_tokens_per_bs
+
+        if require_gathered_buffer(self.server_args):
+            attn_tp_size = get_attention_tp_size()
+            if attn_tp_size > 1 and num_tokens % attn_tp_size != 0:
+                num_tokens = num_tokens // attn_tp_size * attn_tp_size
+                batch_size = num_tokens // num_tokens_per_bs
+
+        seq_len_fill_value = self.attn_backend.get_cuda_graph_seq_len_fill_value()
+
+        if self.server_args.enable_torch_compile:
+            set_torch_compile_config()
+
+        if self.eagle_use_aux_hidden_state:
+            self.model.set_eagle3_layers_to_capture()
+
+        require_mlp_tp_gather_ = require_mlp_tp_gather(self.server_args)
+        if require_gathered_buffer(self.server_args):
+            assert require_mlp_tp_gather_ or require_attn_tp_gather(self.server_args)
+
+        buffers: DecodeInputBuffers = DecodeInputBuffers.create(
+            device=self.device,
+            max_bs=batch_size,
+            max_num_token=num_tokens,
+            hidden_size=self.model_config.hidden_size,
+            vocab_size=self.model_config.vocab_size,
+            dtype=self.model_config.dtype,
+            dp_size=self.server_args.dp_size,
+            pp_size=self.server_args.pp_size,
+            is_encoder_decoder=self.model_config.is_encoder_decoder,
+            require_mlp_tp_gather=require_mlp_tp_gather_,
+            seq_len_fill_value=seq_len_fill_value,
+            encoder_len_fill_value=0,
+            num_tokens_per_bs=num_tokens_per_bs,
+            cache_loc_dtype=torch.int64,
+            enable_mamba_track=False,
+        )
+        buffers.num_token_non_padded[...] = num_tokens
+
+        # For extend mode
+        if not self.is_generation:
+            extend_prefix_lens_cpu = [0] * batch_size
+            extend_seq_lens_cpu = [seq_len_fill_value] * batch_size
+            extend_num_tokens = num_tokens
+            extend_seq_lens = torch.full(
+                (batch_size,), seq_len_fill_value, dtype=torch.int32, device=self.device
+            )
+            extend_prefix_lens = torch.zeros(
+                (batch_size,), dtype=torch.int32, device=self.device
+            )
+            extend_start_loc = torch.arange(
+                0, num_tokens, num_tokens_per_bs, dtype=torch.int32, device=self.device
+            )
+        else:
+            extend_prefix_lens_cpu = None
+            extend_seq_lens_cpu = None
+            extend_num_tokens = None
+            extend_seq_lens = None
+            extend_prefix_lens = None
+            extend_start_loc = None
+
+        if self.server_args.pp_size > 1:
+            pp_proxy_tensors = PPProxyTensors(
+                {k: v[:num_tokens] for k, v in buffers.pp_proxy_tensors.items()}
+            )
+
+        if require_mlp_tp_gather_:
+            buffers.global_num_tokens_gpu.copy_(
+                torch.tensor(
+                    [num_tokens] * self.server_args.dp_size,
+                    dtype=torch.int32,
+                    device=self.device,
+                )
+            )
+            buffers.global_num_tokens_for_logprob_gpu.copy_(
+                torch.tensor(
+                    [num_tokens] * self.server_args.dp_size,
+                    dtype=torch.int32,
+                    device=self.device,
+                )
+            )
+            global_dp_buffer_len = num_tokens * self.server_args.dp_size
+        elif require_attn_tp_gather(self.server_args):
+            buffers.global_num_tokens_gpu.copy_(
+                torch.tensor(
+                    [num_tokens],
+                    dtype=torch.int32,
+                    device=self.device,
+                )
+            )
+            buffers.global_num_tokens_for_logprob_gpu.copy_(
+                torch.tensor(
+                    [num_tokens],
+                    dtype=torch.int32,
+                    device=self.device,
+                )
+            )
+            global_dp_buffer_len = num_tokens
+        else:
+            global_dp_buffer_len = None
+
+        def get_spec_info():
+            spec_info = None
+            if self.spec_algorithm.is_eagle() or self.spec_algorithm.is_standalone():
+                from sglang.srt.speculative.eagle_info import EagleVerifyInput
+
+                if self.is_draft_worker:
+                    raise RuntimeError("This should not happen.")
+                else:
+                    spec_info = EagleVerifyInput(
+                        draft_token=None,
+                        custom_mask=buffers.custom_mask,
+                        positions=None,
+                        retrive_index=None,
+                        retrive_next_token=None,
+                        retrive_next_sibling=None,
+                        retrive_cum_len=None,
+                        spec_steps=self.server_args.speculative_num_steps,
+                        topk=self.server_args.speculative_eagle_topk,
+                        draft_token_num=self.server_args.speculative_num_draft_tokens,
+                        capture_hidden_mode=CaptureHiddenMode.FULL,
+                        seq_lens_sum=None,
+                        seq_lens_cpu=None,
+                    )
+
+            elif self.spec_algorithm.is_ngram():
+                from sglang.srt.speculative.ngram_info import NgramVerifyInput
+
+                spec_info = NgramVerifyInput(
+                    draft_token=None,
+                    tree_mask=buffers.custom_mask,
+                    positions=None,
+                    retrive_index=None,
+                    retrive_next_token=None,
+                    retrive_next_sibling=None,
+                    draft_token_num=num_tokens_per_bs,
+                )
+                spec_info.capture_hidden_mode = CaptureHiddenMode.NULL
+
+            return spec_info
+
+        spec_info = get_spec_info()
+        if capture_hidden_mode != CaptureHiddenMode.FULL:
+            capture_hidden_mode = (
+                spec_info.capture_hidden_mode if spec_info else CaptureHiddenMode.NULL
+            )
+
+        if self.server_args.enable_lora:
+            lora_ids = [None] * batch_size
+        else:
+            lora_ids = None
+
+        forward_batch = ForwardBatch(
+            forward_mode=capture_forward_mode,
+            batch_size=batch_size,
+            input_ids=buffers.input_ids,
+            req_pool_indices=buffers.req_pool_indices,
+            seq_lens=buffers.seq_lens,
+            seq_lens_cpu=buffers.seq_lens_cpu,
+            next_token_logits_buffer=buffers.next_token_logits_buffer,
+            orig_seq_lens=buffers.seq_lens,
+            req_to_token_pool=self.req_to_token_pool,
+            token_to_kv_pool=self.token_to_kv_pool,
+            attn_backend=self.attn_backend,
+            out_cache_loc=buffers.out_cache_loc,
+            seq_lens_sum=buffers.seq_lens.sum().item(),
+            encoder_lens=buffers.encoder_lens,
+            return_logprob=False,
+            positions=buffers.positions,
+            extend_num_tokens=extend_num_tokens,
+            extend_seq_lens=extend_seq_lens,
+            extend_prefix_lens=extend_prefix_lens,
+            extend_start_loc=extend_start_loc,
+            extend_prefix_lens_cpu=extend_prefix_lens_cpu,
+            extend_seq_lens_cpu=extend_seq_lens_cpu,
+            global_num_tokens_gpu=buffers.global_num_tokens_gpu,
+            global_num_tokens_for_logprob_gpu=buffers.global_num_tokens_for_logprob_gpu,
+            dp_padding_mode=DpPaddingMode.get_default_mode_in_cuda_graph(),
+            global_dp_buffer_len=global_dp_buffer_len,
+            mrope_positions=buffers.mrope_positions,
+            spec_algorithm=self.spec_algorithm,
+            spec_info=spec_info,
+            capture_hidden_mode=capture_hidden_mode,
+            num_token_non_padded=buffers.num_token_non_padded,
+            global_forward_mode=capture_forward_mode,
+            lora_ids=lora_ids,
+        )
+
+        if lora_ids is not None:
+            self.lora_manager.prepare_lora_batch(forward_batch)
+
+        self.attn_backend.init_forward_metadata(forward_batch)
+
+        def run_once():
+            forward_batch.dp_local_start_pos = forward_batch.dp_local_num_tokens = None
+            set_dp_buffer_len(
+                global_dp_buffer_len,
+                num_tokens,
+                forward_batch.dp_padding_mode.is_max_len(),
+            )
+            set_is_extend_in_batch(False)
+
+            kwargs = {}
+            if (
+                self.server_args.pp_size > 1
+                and "pp_proxy_tensors"
+                in inspect.signature(self.model.forward).parameters
+            ):
+                kwargs["pp_proxy_tensors"] = PPProxyTensors(
+                    {k: v.clone() for k, v in pp_proxy_tensors.tensors.items()}
+                )
+            if not self.is_generation:
+                kwargs["get_embedding"] = True
+
+            logits_output_or_pp_proxy_tensors = self.model.forward(
+                buffers.input_ids,
+                forward_batch.positions,
+                forward_batch,
+                **kwargs,
+            )
+            return logits_output_or_pp_proxy_tensors
+
+        torch.get_device_module(self.device).synchronize()
+        self.tp_group.barrier()
+        with torch.inference_mode(), run_ctx or empty_context():
+            run_once()
+
+    def init_device_graphs(self):
+        """Capture device graphs."""
+        self.graph_runner = None
+        self.graph_mem_usage = 0
+
+        if not self.is_generation:
+            # TODO: Currently, cuda graph only captures decode steps, which only exists for generation models
+            return
+
+        if self.server_args.model_impl.lower() == ModelImpl.MINDSPORE:
+            return
+
+        if self.device != "cpu" and self.server_args.disable_cuda_graph:
+            return
+
+        if self.device == "cpu" and not self.server_args.enable_torch_compile:
+            return
+
+        tic = time.perf_counter()
+        before_mem = get_available_gpu_memory(self.device, self.gpu_id)
+        graph_backend = defaultdict(
+            lambda: "cuda graph",
+            {
+                "cpu": "cpu graph",
+                "npu": "npu graph",
+            },
+        )
+        logger.info(
+            f"Capture {graph_backend[self.device]} begin. This can take up to several minutes. avail mem={before_mem:.2f} GB"
+        )
+        graph_runners = defaultdict(
+            lambda: CudaGraphRunner,
+            {
+                "cpu": CPUGraphRunner,
+                "npu": NPUGraphRunner,
+            },
+        )
+        self.graph_runner = graph_runners[self.device](self)
+
+        after_mem = get_available_gpu_memory(self.device, self.gpu_id)
+        self.graph_mem_usage = before_mem - after_mem
+        logger.info(
+            f"Capture {graph_backend[self.device]} end. Time elapsed: {time.perf_counter() - tic:.2f} s. "
+            f"mem usage={self.graph_mem_usage:.2f} GB. avail mem={after_mem:.2f} GB."
+        )
+
+    def init_piecewise_cuda_graphs(self):
+        """Initialize piecewise CUDA graph runner."""
+        self.piecewise_cuda_graph_runner = None
+
+        if self.server_args.disable_piecewise_cuda_graph:
+            logger.info(
+                "Disable piecewise CUDA graph because --disable-piecewise-cuda-graph is set"
+            )
+            return
+
+        # Disable piecewise CUDA graph for non-language models
+        if not hasattr(self.model, "model"):
+            logger.warning(
+                "Disable piecewise CUDA graph because the model is not a language model"
+            )
+            return
+
+        # Disable piecewise CUDA graph for non capture size
+        if not self.server_args.piecewise_cuda_graph_tokens:
+            logger.warning(
+                "Disable piecewise CUDA graph because the capture size is not set"
+            )
+            return
+
+        # Collect attention layers and moe layers from the model
+        self.model.model = resolve_language_model(self.model)
+        language_model = getattr(self.model, "language_model", self.model)
+        self.attention_layers = []
+        self.moe_layers = []
+        self.moe_fusions = []
+        for layer in language_model.model.layers:
+            if hasattr(layer, "self_attn"):
+                if hasattr(layer.self_attn, "attn"):
+                    self.attention_layers.append(layer.self_attn.attn)
+                elif hasattr(layer.self_attn, "attn_mqa"):
+                    # For DeepSeek model
+                    self.attention_layers.append(layer.self_attn.attn_mqa)
+            # For hybrid model
+            elif hasattr(layer, "attn"):
+                self.attention_layers.append(layer.attn)
+            elif hasattr(layer, "linear_attn"):
+                if hasattr(layer.linear_attn, "attn"):
+                    self.attention_layers.append(layer.linear_attn.attn)
+                else:
+                    self.attention_layers.append(layer.linear_attn)
+            # For InternVL model
+            elif hasattr(layer, "attention"):
+                if hasattr(layer.attention, "attn"):
+                    self.attention_layers.append(layer.attention.attn)
+
+            moe_block = None
+            moe_fusion = None
+            if hasattr(layer, "mlp") and hasattr(layer.mlp, "experts"):
+                moe_block = layer.mlp.experts
+                moe_fusion = layer.mlp
+            if hasattr(layer, "block_sparse_moe") and hasattr(
+                layer.block_sparse_moe, "experts"
+            ):
+                moe_block = layer.block_sparse_moe.experts
+                moe_fusion = layer.block_sparse_moe
+            if hasattr(layer, "moe") and hasattr(layer.moe, "experts"):
+                moe_block = layer.moe.experts
+                moe_fusion = layer.moe
+            self.moe_layers.append(moe_block)
+            self.moe_fusions.append(moe_fusion)
+
+        if len(self.attention_layers) < self.model_config.num_hidden_layers:
+            # TODO(yuwei): support Non-Standard GQA
+            log_info_on_rank0(
+                logger,
+                "Disable piecewise CUDA graph because some layers do not apply Standard GQA",
+            )
+            return
+
+        tic = time.perf_counter()
+        before_mem = get_available_gpu_memory(self.device, self.gpu_id)
+        logger.info(
+            f"Capture piecewise CUDA graph begin. avail mem={before_mem:.2f} GB"
+        )
+
+        self.piecewise_cuda_graph_runner = PiecewiseCudaGraphRunner(self)
+
+        after_mem = get_available_gpu_memory(self.device, self.gpu_id)
+        mem_usage = before_mem - after_mem
+        logger.info(
+            f"Capture piecewise CUDA graph end. Time elapsed: {time.perf_counter() - tic:.2f} s. "
+            f"mem usage={mem_usage:.2f} GB. avail mem={after_mem:.2f} GB."
+        )
+
+    def init_threads_binding(self):
+        omp_cpuids = os.environ.get("SGLANG_CPU_OMP_THREADS_BIND", "all")
+        cpu_ids_by_node = get_cpu_ids_by_node()
+        n_numa_node = len(cpu_ids_by_node)
+        if omp_cpuids == "all":
+            assert self.tp_size <= n_numa_node, (
+                f"SGLANG_CPU_OMP_THREADS_BIND is not set, in this case, "
+                f"tp_size {self.tp_size} should be smaller than or equal to number of numa node on the machine {n_numa_node}. "
+                f"If you need tp_size to be larger than number of numa node, please set the CPU cores for each tp rank via SGLANG_CPU_OMP_THREADS_BIND explicitly. "
+                f"For example, on a machine with 2 numa nodes, where core 0-31 are on numa node 0 and core 32-63 are on numa node 1, "
+                f"it is suggested to use -tp 2 and bind tp rank 0 to core 0-31 and tp rank 1 to core 32-63. "
+                f"This is the default behavior if SGLANG_CPU_OMP_THREADS_BIND is not set and it is the same as setting SGLANG_CPU_OMP_THREADS_BIND=0-31|32-63. "
+                f"If you do need tp_size to be larger than the number of numa nodes, you could set SGLANG_CPU_OMP_THREADS_BIND explicitly for example SGLANG_CPU_OMP_THREADS_BIND=0-15|16-31|32-47|48-63 and run with -tp 4. "
+                f"If you don't want each tp rank to use all the cores on one numa node, you could set for example SGLANG_CPU_OMP_THREADS_BIND=0-15|32-47 and run with -tp 2."
+            )
+            if self.tp_size < n_numa_node:
+                logger.warning(
+                    f"Detected the current machine has {n_numa_node} numa nodes available, but tp_size is set to {self.tp_size}, so only {self.tp_size} numa nodes are used."
+                )
+            self.local_omp_cpuid = cpu_ids_by_node[self.tp_rank]
+        else:
+            threads_bind_list = omp_cpuids.split("|")
+            assert self.tp_size == len(threads_bind_list), (
+                f"SGLANG_CPU_OMP_THREADS_BIND setting must be aligned with TP size parameter ({self.tp_size}). "
+                f"Please double check your settings."
+            )
+            self.local_omp_cpuid = threads_bind_list[self.tp_rank]
+            if self.tp_size > n_numa_node:
+                logger.warning(
+                    f"TP size ({self.tp_size})is larger than numa node number ({n_numa_node}), "
+                    f"in this case the available memory amount of each rank cannot be determined in prior. "
+                    f"Please set proper `--max-total-tokens` to avoid the out-of-memory error."
+                )
+
+    def apply_torch_tp(self):
+        logger.info(f"Enabling torch tensor parallelism on {self.tp_size} devices.")
+        from sglang.srt.layers.model_parallel import tensor_parallel
+
+        device_mesh = torch.distributed.init_device_mesh(self.device, (self.tp_size,))
+        tensor_parallel(self.model, device_mesh)
+
+    def update_decode_attn_backend(self, stream_idx: int):
+        self.decode_attn_backend = self.decode_attn_backend_group[stream_idx]
+
+    def forward_decode(
+        self,
+        forward_batch: ForwardBatch,
+        skip_attn_backend_init: bool = False,
+        pp_proxy_tensors=None,
+    ) -> Union[LogitsProcessorOutput, PPProxyTensors]:
+        if not skip_attn_backend_init:
+            if self.server_args.enable_pdmux:
+                self.decode_attn_backend.init_forward_metadata(forward_batch)
+                forward_batch.attn_backend = self.decode_attn_backend
+            else:
+                self.attn_backend.init_forward_metadata(forward_batch)
+        # FIXME: add pp_proxy_tensors arg to all models
+        kwargs = {}
+        if self.support_pp:
+            kwargs["pp_proxy_tensors"] = pp_proxy_tensors
+        return self.model.forward(
+            forward_batch.input_ids,
+            forward_batch.positions,
+            forward_batch,
+            **kwargs,
+        )
+
+    def forward_extend(
+        self,
+        forward_batch: ForwardBatch,
+        skip_attn_backend_init: bool = False,
+        pp_proxy_tensors=None,
+    ) -> Tuple[
+        Union[LogitsProcessorOutput, PPProxyTensors, EmbeddingPoolerOutput], bool
+    ]:
+        kwargs = {}
+        if self.support_pp:
+            kwargs["pp_proxy_tensors"] = pp_proxy_tensors
+        if forward_batch.input_embeds is not None:
+            kwargs["input_embeds"] = forward_batch.input_embeds.bfloat16()
+        if not self.is_generation:
+            kwargs["get_embedding"] = True
+
+        can_run_graph = (
+            self.piecewise_cuda_graph_runner is not None
+            and self.piecewise_cuda_graph_runner.can_run(forward_batch)
+        )
+
+        if can_run_graph:
+            return (
+                self.piecewise_cuda_graph_runner.replay(forward_batch, **kwargs),
+                can_run_graph,
+            )
+
+        if not skip_attn_backend_init:
+            self.attn_backend.init_forward_metadata(forward_batch)
+
+        return (
+            self.model.forward(
+                forward_batch.input_ids,
+                forward_batch.positions,
+                forward_batch,
+                **kwargs,
+            ),
+            can_run_graph,
+        )
+
+    def forward_idle(
+        self, forward_batch: ForwardBatch, pp_proxy_tensors=None
+    ) -> Union[LogitsProcessorOutput, PPProxyTensors]:
+        # In DP Attention, IDLE batches are padded (batch_size > 0) for MLP sync.
+        # in this case, we need to reinit the forward metadata, otherwise the stale
+        # metadata causes batch_size mismatch in attention kernel(e.g. NSA Indexer).
+        if forward_batch.batch_size > 0:
+            self.attn_backend.init_forward_metadata(forward_batch)
+
+        kwargs = {}
+        if self.support_pp:
+            kwargs["pp_proxy_tensors"] = pp_proxy_tensors
+        return self.model.forward(
+            forward_batch.input_ids,
+            forward_batch.positions,
+            forward_batch,
+            **kwargs,
+        )
+
+    def forward_split_prefill(
+        self,
+        forward_batch: ForwardBatch,
+        reinit_attn_backend: bool = False,
+        forward_count: int = 1,
+    ) -> LogitsProcessorOutput:
+        if forward_batch.split_index == 0 or reinit_attn_backend:
+            self.attn_backend.init_forward_metadata(forward_batch)
+        next_split_index = min(
+            forward_batch.split_index + forward_count,
+            self.model_config.num_hidden_layers,
+        )
+        ret = self.model.forward_split_prefill(
+            forward_batch.input_ids,
+            forward_batch.positions,
+            forward_batch,
+            (forward_batch.split_index, next_split_index),
+        )
+        forward_batch.split_index = next_split_index
+        return ret
+
+    def forward(
+        self,
+        forward_batch: ForwardBatch,
+        skip_attn_backend_init: bool = False,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+        reinit_attn_backend: bool = False,
+        split_forward_count: int = 1,
+    ) -> ModelRunnerOutput:
+        self.forward_pass_id += 1
+
+        with get_global_expert_distribution_recorder().with_forward_pass(
+            self.forward_pass_id,
+            forward_batch,
+        ) as recorder_outputs:
+            output = self._forward_raw(
+                forward_batch,
+                skip_attn_backend_init,
+                pp_proxy_tensors,
+                reinit_attn_backend,
+                split_forward_count,
+            )
+            elastic_ep_state = ElasticEPStateManager.instance()
+            if (
+                elastic_ep_state is not None
+                and not elastic_ep_state.is_active_equal_last()
+            ):
+                elastic_ep_state.snapshot_active_to_last()
+                elastic_ep_state.sync_active_to_cpu()
+                logging.info("EPLB due to rank faults")
+                gen = self.eplb_manager.rebalance()
+                while True:
+                    try:
+                        next(gen)
+                    except StopIteration:
+                        break
+                output = self._forward_raw(
+                    forward_batch,
+                    skip_attn_backend_init,
+                    pp_proxy_tensors,
+                    reinit_attn_backend,
+                    split_forward_count,
+                )
+        output.expert_distribution_metrics = recorder_outputs.get("metrics")
+
+        # Copy cached routing experts' buffers back to CPU cache
+        get_global_experts_capturer().on_forward_end(
+            forward_batch=forward_batch,
+            can_run_graph=output.can_run_graph,
+            cuda_graph_batch=getattr(self.graph_runner, "bs", None),
+        )
+
+        if self.eplb_manager is not None:
+            self.eplb_manager.on_forward_pass_end()
+
+        if dumper.may_enable:
+            dumper.step()
+
+        return output
+
+    def _forward_raw(
+        self,
+        forward_batch: ForwardBatch,
+        skip_attn_backend_init: bool,
+        pp_proxy_tensors: Optional[PPProxyTensors],
+        reinit_attn_backend: bool = False,
+        split_forward_count: int = 1,
+    ) -> ModelRunnerOutput:
+        mode_check = (
+            forward_batch.forward_mode.is_cpu_graph
+            if self.device == "cpu"
+            else forward_batch.forward_mode.is_cuda_graph
+        )
+        can_run_graph = bool(
+            mode_check()
+            and self.graph_runner
+            and self.graph_runner.can_run(forward_batch)
+        )
+
+        if can_run_graph:
+            ret = self.graph_runner.replay(
+                forward_batch,
+                skip_attn_backend_init=skip_attn_backend_init,
+                pp_proxy_tensors=pp_proxy_tensors,
+            )
+            return ModelRunnerOutput(logits_output=ret, can_run_graph=can_run_graph)
+
+        # For MLP sync
+        if forward_batch.global_num_tokens_cpu is not None:
+            forward_batch.prepare_mlp_sync_batch(self)
+        else:
+            forward_batch.prepare_attn_tp_scatter_input(self)
+
+        # Normalize num_token_non_padded to be local to this attention TP rank if needed.
+        if (
+            forward_batch.num_token_non_padded is not None
+            and forward_batch.global_num_tokens_gpu is not None
+            and require_gathered_buffer(self.server_args)
+            and not is_nsa_enable_prefill_cp()
+        ):
+            forward_batch.adjust_num_token_non_padded_for_attn_tp(
+                server_args=self.server_args,
+            )
+
+        if forward_batch.forward_mode.is_decode():
+            ret = self.forward_decode(
+                forward_batch,
+                skip_attn_backend_init=skip_attn_backend_init,
+                pp_proxy_tensors=pp_proxy_tensors,
+            )
+        elif forward_batch.forward_mode.is_split_prefill():
+            ret = self.forward_split_prefill(
+                forward_batch,
+                reinit_attn_backend=reinit_attn_backend,
+                forward_count=split_forward_count,
+            )
+        elif forward_batch.forward_mode.is_extend(include_draft_extend_v2=True):
+            ret, can_run_graph = self.forward_extend(
+                forward_batch,
+                skip_attn_backend_init=skip_attn_backend_init,
+                pp_proxy_tensors=pp_proxy_tensors,
+            )
+        elif forward_batch.forward_mode.is_idle():
+            ret = self.forward_idle(forward_batch, pp_proxy_tensors=pp_proxy_tensors)
+        else:
+            raise ValueError(f"Invalid forward mode: {forward_batch.forward_mode}")
+
+        if (
+            forward_batch.global_num_tokens_cpu is not None
+            and self.pp_group.is_last_rank
+        ):
+            forward_batch.post_forward_mlp_sync_batch(ret)
+
+        return ModelRunnerOutput(logits_output=ret, can_run_graph=can_run_graph)
+
+    def _preprocess_logits(
+        self, logits_output: LogitsProcessorOutput, sampling_info: SamplingBatchInfo
+    ):
+        # NOTE: In overlap mode, the function update_regex_vocab_mask (in sample)
+        #       was executed after we processed last batch's results.
+
+        # Calculate logits bias and apply it to next_token_logits.
+        sampling_info.update_regex_vocab_mask()
+        sampling_info.apply_logits_bias(logits_output.next_token_logits)
+
+    def sample(
+        self,
+        logits_output: LogitsProcessorOutput,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        """Sample and compute logprobs and update logits_output.
+
+        Args:
+            logits_output: The logits output from the model forward
+            forward_batch: The forward batch that generates logits_output
+
+        Returns:
+            A list of next_token_ids
+        """
+        # For duplex models with multiple output streams.
+        if isinstance(logits_output, tuple):
+            return torch.stack(
+                [self.sample(values, forward_batch) for values in logits_output],
+                axis=-1,
+            )
+
+        self._preprocess_logits(logits_output, forward_batch.sampling_info)
+        # Sample the next tokens
+        next_token_ids = self.sampler(
+            logits_output,
+            forward_batch.sampling_info,
+            forward_batch.return_logprob,
+            forward_batch.top_logprobs_nums,
+            forward_batch.token_ids_logprobs,
+            # For prefill, we only use the position of the last token.
+            (
+                forward_batch.positions
+                if forward_batch.forward_mode.is_decode()
+                else forward_batch.seq_lens - 1
+            ),
+        )
+        return next_token_ids
+
+    def compute_logprobs_only(
+        self,
+        logits_output: LogitsProcessorOutput,
+        forward_batch: ForwardBatch,
+    ) -> None:
+        """
+        Compute token_ids_logprobs without performing sampling.
+
+        Optimized path for prefill-only requests that need token_ids_logprobs but don't
+        require next token generation. Skips expensive sampling operations
+        while still providing requested probability information.
+
+        Args:
+            logits_output: The logits output from the model forward
+            forward_batch: The forward batch that generates logits_output
+        """
+        if not forward_batch.token_ids_logprobs:
+            return
+
+        # Preprocess logits (same as in sample method)
+        self._preprocess_logits(logits_output, forward_batch.sampling_info)
+
+        # Delegate to sampler for logprob-only computation
+        # This populates logits_output with requested token probabilities
+        self.sampler.compute_logprobs_only(
+            logits_output,
+            forward_batch.sampling_info,
+            forward_batch.return_logprob,
+            forward_batch.top_logprobs_nums,
+            forward_batch.token_ids_logprobs,
+        )
+
+    @property
+    def model_is_mrope(self) -> bool:
+        """Detect if the model has "mrope" rope_scaling type.
+        mrope requires keep "rope_deltas" between prompt and decoding phases."""
+        rope_scaling = getattr(
+            self.model_config.hf_text_config, "rope_parameters", None
+        ) or getattr(self.model_config.hf_text_config, "rope_scaling", {})
+        if rope_scaling is None:
+            return False
+        is_mrope_enabled = "mrope_section" in rope_scaling
+        return is_mrope_enabled
+
+    def save_remote_model(self, url: str):
+        from sglang.srt.model_loader.loader import RemoteModelLoader
+
+        logger.info(f"Saving model to {url}")
+        RemoteModelLoader.save_model(self.model, self.model_config.model_path, url)
+
+    def save_sharded_model(
+        self, path: str, pattern: Optional[str] = None, max_size: Optional[int] = None
+    ):
+        from sglang.srt.model_loader.loader import ShardedStateLoader
+
+        logger.info(
+            f"Save sharded model to {path} with pattern {pattern} and max_size {max_size}"
+        )
+        ShardedStateLoader.save_model(self.model, path, pattern, max_size)
+
+    def check_weights(self, action: str):
+        self._weight_checker.handle(action=action)
+
+    def update_weights_from_ipc(self, recv_req):
+        """Update weights from IPC for checkpoint-engine integration."""
+        try:
+            from sglang.srt.checkpoint_engine.checkpoint_engine_worker import (
+                SGLangCheckpointEngineWorkerExtensionImpl,
+            )
+
+            # Create a worker extension that integrates with SGLang's model
+            worker = SGLangCheckpointEngineWorkerExtensionImpl(self)
+            worker.update_weights_from_ipc(recv_req.zmq_handles)
+            return True, "IPC weight update completed successfully"
+        except ImportError as e:
+            return False, f"IPC weight update failed: ImportError {e}"
+        except Exception as e:
+            logger.error(f"IPC weight update failed: {e}")
+            return False, str(e)
+
+    def prealloc_symmetric_memory_pool(self):
+        # PyTorch mempools never de-fragment memory in OOM scenarios, so we need to pre-allocate a large chunk of memory to limit fragmentation.
+        if (
+            self.is_draft_worker
+            or not self.server_args.enable_symm_mem
+            or envs.SGLANG_SYMM_MEM_PREALLOC_GB_SIZE.get() <= 0
+        ):
+            return
+
+        # Memory allocation is tied to a cuda stream, use the forward stream
+        with torch.get_device_module(self.device).stream(self.forward_stream):
+            logger.info(
+                f"Pre-allocating symmetric memory pool with {envs.SGLANG_SYMM_MEM_PREALLOC_GB_SIZE.get()} GiB"
+            )
+            with use_symmetric_memory(get_tp_group()):
+                torch.empty(
+                    (envs.SGLANG_SYMM_MEM_PREALLOC_GB_SIZE.get() * 1024 * 1024 * 1024,),
+                    dtype=torch.uint8,
+                    device=self.device,
+                )
+
+
+def _model_load_weights_direct(model, named_tensors: List[Tuple[str, torch.Tensor]]):
+    params_dict = dict(model.named_parameters())
+    for name, tensor in named_tensors:
+        default_weight_loader(params_dict[name], tensor)
+
+
+def _unwrap_tensor(tensor, tp_rank, device):
+    if isinstance(tensor, LocalSerializedTensor):
+        tensor = tensor.get(tp_rank)
+    return tensor.to(device)
+
+
+@dataclass
+class LocalSerializedTensor:
+    """torch.Tensor that gets serialized by MultiprocessingSerializer (which only serializes a pointer and not the data).
+    The i-th element in the list corresponds to i-th rank's GPU."""
+
+    values: List[bytes]
+
+    def get(self, rank: int):
+        return MultiprocessingSerializer.deserialize(self.values[rank])
diff --git a/sglang/python/sglang/srt/model_executor/model_runner_kv_cache_mixin.py b/sglang/python/sglang/srt/model_executor/model_runner_kv_cache_mixin.py
new file mode 100644
index 0000000000000000000000000000000000000000..c46ff0a388c790cb50c5f613511e96d78d018375
--- /dev/null
+++ b/sglang/python/sglang/srt/model_executor/model_runner_kv_cache_mixin.py
@@ -0,0 +1,748 @@
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING
+
+import torch
+
+from sglang.srt.configs.model_config import get_nsa_index_head_dim, is_deepseek_nsa
+from sglang.srt.distributed.parallel_state import get_world_group
+from sglang.srt.layers.dp_attention import get_attention_tp_size
+from sglang.srt.mem_cache.allocator import (
+    PagedTokenToKVPoolAllocator,
+    TokenToKVPoolAllocator,
+)
+from sglang.srt.mem_cache.memory_pool import (
+    DoubleSparseTokenToKVPool,
+    HybridLinearKVPool,
+    HybridReqToTokenPool,
+    MHATokenToKVPool,
+    MHATokenToKVPoolFP4,
+    MLATokenToKVPool,
+    MLATokenToKVPoolFP4,
+    NSATokenToKVPool,
+    ReqToTokenPool,
+)
+from sglang.srt.mem_cache.swa_memory_pool import SWAKVPool, SWATokenToKVPoolAllocator
+from sglang.srt.utils.common import (
+    get_available_gpu_memory,
+    is_float4_e2m1fn_x2,
+    is_npu,
+)
+
+if TYPE_CHECKING:
+    from sglang.srt.model_executor.model_runner import ModelRunner
+
+# the ratio of mamba cache pool size to max_running_requests
+MAMBA_CACHE_SIZE_MAX_RUNNING_REQUESTS_RATIO = 3
+MAMBA_CACHE_V2_ADDITIONAL_RATIO_OVERLAP = 2
+MAMBA_CACHE_V2_ADDITIONAL_RATIO_NO_OVERLAP = 1
+
+logger = logging.getLogger(__name__)
+
+_is_npu = is_npu()
+
+
+class ModelRunnerKVCacheMixin:
+    def get_cell_size_per_token(self: ModelRunner, num_layers: int) -> int:
+        kv_size = torch._utils._element_size(self.kv_cache_dtype)
+        if self.use_mla_backend:
+            cell_size = (
+                (self.model_config.kv_lora_rank + self.model_config.qk_rope_head_dim)
+                * num_layers
+                * kv_size
+            )
+            if is_float4_e2m1fn_x2(self.kv_cache_dtype):
+                # kv_scale_buffer
+                scale_block_size = 16
+                cell_size = (cell_size // 2) + (
+                    (
+                        (
+                            self.model_config.kv_lora_rank
+                            + self.model_config.qk_rope_head_dim
+                        )
+                        // scale_block_size
+                    )
+                    * num_layers
+                    * kv_size
+                )
+
+            # Add indexer KV cache overhead for NSA models (DeepSeek V3.2)
+            if is_deepseek_nsa(self.model_config.hf_config):
+                index_head_dim = get_nsa_index_head_dim(self.model_config.hf_config)
+                indexer_size_per_token = (
+                    index_head_dim
+                    + index_head_dim // NSATokenToKVPool.quant_block_size * 4
+                )
+                element_size = torch._utils._element_size(
+                    NSATokenToKVPool.index_k_with_scale_buffer_dtype
+                )
+                cell_size += indexer_size_per_token * num_layers * element_size
+        else:
+            if self.model_config.is_hybrid_swa:
+                full_layers_num = len(self.model_config.full_attention_layer_ids)
+                swa_layers_num = len(self.model_config.swa_attention_layer_ids)
+
+                full_per_token = self.model_config.get_num_kv_heads(
+                    get_attention_tp_size()
+                ) * (self.model_config.head_dim + self.model_config.v_head_dim)
+
+                swa_per_token = self.model_config.get_swa_num_kv_heads(
+                    get_attention_tp_size()
+                ) * (self.model_config.swa_head_dim + self.model_config.swa_v_head_dim)
+
+                cell_size = (
+                    full_per_token * full_layers_num + swa_per_token * swa_layers_num
+                ) * kv_size
+            else:
+                cell_size = (
+                    self.model_config.get_num_kv_heads(get_attention_tp_size())
+                    * (self.model_config.head_dim + self.model_config.v_head_dim)
+                    * num_layers
+                    * kv_size
+                )
+
+            if is_float4_e2m1fn_x2(self.kv_cache_dtype):
+                # kv_scale_buffer
+                scale_block_size = 16
+
+                n = self.model_config.get_num_kv_heads(get_attention_tp_size())
+                k = self.model_config.head_dim
+                cell_size = (cell_size // 2) + (
+                    (n * k * num_layers * 2 * kv_size) // scale_block_size
+                )
+        return cell_size
+
+    def profile_max_num_token(self: ModelRunner, pre_model_load_memory: int):
+        post_model_load_memory = get_available_gpu_memory(
+            self.device,
+            self.gpu_id,
+            distributed=get_world_group().world_size > 1,
+            cpu_group=get_world_group().cpu_group,
+        )
+
+        # Get the number of layers used for KV cache calculation
+        if self.is_draft_worker:
+            num_layers = getattr(
+                self.model_config.hf_config,
+                "num_nextn_predict_layers",
+                self.num_effective_layers,
+            )
+        elif mambaish := self.mambaish_config:
+            effective_layer_ids = [
+                i
+                for i in mambaish.full_attention_layer_ids
+                if self.start_layer <= i < self.end_layer
+            ]
+            num_layers = len(effective_layer_ids)
+        else:
+            num_layers = self.num_effective_layers
+
+        cell_size = self.get_cell_size_per_token(num_layers)
+
+        rest_memory = post_model_load_memory - pre_model_load_memory * (
+            1 - self.mem_fraction_static
+        )
+        if self.mambaish_config is not None:
+            rest_memory = self.handle_max_mamba_cache(rest_memory)
+
+        return int(rest_memory * (1 << 30)) // cell_size
+
+    def handle_max_mamba_cache(self: ModelRunner, total_rest_memory):
+        config = self.mambaish_config
+        server_args = self.server_args
+        assert config is not None
+
+        # reserve the memory for the intermediate mamba states used for spec dec
+        if not self.spec_algorithm.is_none():
+            assert server_args.speculative_num_draft_tokens is not None
+            assert server_args.max_running_requests is not None
+
+            max_running_requests = server_args.max_running_requests // (
+                self.dp_size if server_args.enable_dp_attention else 1
+            )
+            mamba_state_intermediate_size = (
+                config.mamba2_cache_params.mamba_cache_per_req
+                * max_running_requests
+                * server_args.speculative_num_draft_tokens
+            )
+            total_rest_memory = total_rest_memory - (
+                mamba_state_intermediate_size / (1 << 30)
+            )
+
+        if server_args.max_mamba_cache_size is not None:
+            # Use explicitly set max_mamba_cache_size
+            server_args.max_mamba_cache_size = server_args.max_mamba_cache_size // (
+                server_args.dp_size if server_args.enable_dp_attention else 1
+            )
+        elif (
+            server_args.disable_radix_cache
+            and server_args.max_running_requests is not None
+        ):
+            # Use explicitly set max_running_requests when radix cache is disabled
+            server_args.max_mamba_cache_size = server_args.max_running_requests // (
+                server_args.dp_size if server_args.enable_dp_attention else 1
+            )
+        else:
+            # Use ratio-based calculation to auto-fit available memory
+            assert config.mamba2_cache_params.mamba_cache_per_req > 0
+
+            # allocate the memory based on the ratio between mamba state memory vs. full kv cache memory
+            # solve the equations:
+            # 1. mamba_state_memory + full_kv_cache_memory == total_rest_memory
+            # 2. mamba_state_memory / full_kv_cache_memory == server_args.mamba_full_memory_ratio
+            mamba_state_memory_raw = (
+                total_rest_memory
+                * server_args.mamba_full_memory_ratio
+                / (1 + server_args.mamba_full_memory_ratio)
+            )
+            # calculate the max_mamba_cache_size based on the given total mamba memory
+            server_args.max_mamba_cache_size = int(
+                (mamba_state_memory_raw * (1 << 30))
+                // config.mamba2_cache_params.mamba_cache_per_req
+            )
+
+        mamba_state_memory = (
+            server_args.max_mamba_cache_size
+            * config.mamba2_cache_params.mamba_cache_per_req
+            / (1 << 30)
+        )
+        return total_rest_memory - mamba_state_memory
+
+    def calculate_mla_kv_cache_dim(self: ModelRunner) -> int:
+        is_nsa_model = is_deepseek_nsa(self.model_config.hf_config)
+        kv_cache_dtype = self.kv_cache_dtype
+        kv_lora_rank = self.model_config.kv_lora_rank
+        qk_rope_head_dim = self.model_config.qk_rope_head_dim
+        kv_cache_dim = kv_lora_rank + qk_rope_head_dim  # default mla kv cache dim
+
+        # For non-NSA models, MLA kv cache dim is simply kv_lora_rank + qk_rope_head_dim
+        if not is_nsa_model:
+            return kv_cache_dim
+
+        # TRTLLM backend does not override kv_cache_dim for MLA kv cache
+        # Assuming nsa prefill and decode backends are the same when using trtllm MLA backend,
+        # since it is not compatible for trtllm and other mla attn backend due to the different
+        # kv cache layout.
+        if (
+            self.server_args.nsa_prefill_backend == "trtllm"
+            or self.server_args.nsa_decode_backend == "trtllm"
+        ):
+            return kv_cache_dim
+
+        quant_block_size = NSATokenToKVPool.quant_block_size
+        rope_storage_dtype = NSATokenToKVPool.rope_storage_dtype
+        # Calculate override_kv_cache_dim for FP8 storage for non-trtllm attention backends:
+        # kv_lora_rank + scale storage (kv_lora_rank // quant_block_size * 4 bytes) + rope dimension storage
+        # Note: rope dimension is stored in original dtype (bf16), not quantized to fp8
+        if kv_cache_dtype == torch.float8_e4m3fn:
+            assert (
+                kv_lora_rank % quant_block_size == 0
+            ), f"kv_lora_rank {kv_lora_rank} must be multiple of quant_block_size {quant_block_size}"
+
+            return (
+                kv_lora_rank
+                + kv_lora_rank // quant_block_size * 4
+                + qk_rope_head_dim * rope_storage_dtype.itemsize
+            )
+
+        return kv_cache_dim
+
+    def set_num_tokens_hybrid_swa(self: ModelRunner):
+        page_size = self.server_args.page_size
+
+        assert self.sliding_window_size is not None and self.sliding_window_size > 0
+        full_layers_num = len(self.model_config.full_attention_layer_ids)
+        swa_layers_num = len(self.model_config.swa_attention_layer_ids)
+
+        assert swa_layers_num > 0, "Hybrid SWA model must have at least one SWA layer"
+
+        def align_page_size(x: int) -> int:
+            return (x // page_size) * page_size
+
+        if full_layers_num == 0:
+            # all layers are SWA
+            self.swa_max_total_num_tokens = align_page_size(self.max_total_num_tokens)
+            self.full_max_total_num_tokens = 0
+            self.max_total_num_tokens = self.swa_max_total_num_tokens
+            logger.info(
+                f"Use sliding window memory pool (all SWA). swa_layer_tokens={self.swa_max_total_num_tokens}"
+            )
+            return
+
+        swa_full_tokens_ratio = self.server_args.swa_full_tokens_ratio
+
+        # Use unified memory-based allocation for all hybrid SWA models.
+        #
+        # Let:
+        #   F = Full layer per-token memory
+        #   S = SWA layer per-token memory (may differ from F)
+        #   r = swa_full_tokens_ratio = swa_tokens / full_tokens
+        #
+        # The profile phase computed:
+        #   cell_size = F * n_full + S * n_swa
+        #   max_total_num_tokens = rest_memory / cell_size
+        #   => total_memory = max_total_num_tokens * (F * n_full + S * n_swa)
+        #
+        # We need to solve:
+        #   full_tokens * F * n_full + swa_tokens * S * n_swa = total_memory
+        #   swa_tokens = full_tokens * r
+        #
+        # Solution:
+        #   full_tokens = total_memory / (F * n_full + r * S * n_swa)
+        #               = max_total_num_tokens * (F * n_full + S * n_swa) / (F * n_full + r * S * n_swa)
+
+        kv_size = torch._utils._element_size(self.kv_cache_dtype)
+
+        # Full layer per-token memory
+        full_per_token = (
+            self.model_config.get_num_kv_heads(get_attention_tp_size())
+            * (self.model_config.head_dim + self.model_config.v_head_dim)
+            * kv_size
+        )
+
+        # SWA layer per-token memory
+        swa_per_token = (
+            self.model_config.get_swa_num_kv_heads(get_attention_tp_size())
+            * (self.model_config.swa_head_dim + self.model_config.swa_v_head_dim)
+            * kv_size
+        )
+
+        # Total memory available from profile
+        total_memory = self.max_total_num_tokens * (
+            full_per_token * full_layers_num + swa_per_token * swa_layers_num
+        )
+
+        # Solve the equations
+        denominator = (
+            full_per_token * full_layers_num
+            + swa_full_tokens_ratio * swa_per_token * swa_layers_num
+        )
+        assert (
+            denominator > 0
+        ), f"Invalid denominator={denominator} for memory-based allocation. full_per_token={full_per_token}, full_layers_num={full_layers_num}, swa_per_token={swa_per_token}, swa_layers_num={swa_layers_num}, swa_full_tokens_ratio={swa_full_tokens_ratio}"
+
+        self.full_max_total_num_tokens = int(total_memory / denominator)
+        self.swa_max_total_num_tokens = int(
+            self.full_max_total_num_tokens * swa_full_tokens_ratio
+        )
+
+        self.full_max_total_num_tokens = align_page_size(self.full_max_total_num_tokens)
+        self.swa_max_total_num_tokens = align_page_size(self.swa_max_total_num_tokens)
+
+        self.max_total_num_tokens = self.full_max_total_num_tokens
+
+        logger.info(
+            f"Use sliding window memory pool. full_layer_tokens={self.full_max_total_num_tokens}, swa_layer_tokens={self.swa_max_total_num_tokens}"
+        )
+
+    def _calculate_mamba_ratio(self: ModelRunner) -> int:
+        if self.server_args.disable_radix_cache:
+            return 1
+
+        additional_ratio = 0
+        if self.server_args.enable_mamba_extra_buffer():
+            if not self.spec_algorithm.is_none():
+                additional_ratio = MAMBA_CACHE_V2_ADDITIONAL_RATIO_NO_OVERLAP
+            else:
+                additional_ratio = MAMBA_CACHE_V2_ADDITIONAL_RATIO_OVERLAP
+
+        return MAMBA_CACHE_SIZE_MAX_RUNNING_REQUESTS_RATIO + additional_ratio
+
+    def _init_pools(self: ModelRunner, max_num_reqs: int):
+        # Initialize req_to_token_pool
+        if self.req_to_token_pool is None:
+            # FIXME(lsyin): this is the temporary fix for the context length issue when using speculative decoding
+            extra_max_context_len = 4
+            if self.server_args.speculative_num_draft_tokens is not None:
+                extra_max_context_len += self.server_args.speculative_num_draft_tokens
+
+            if self.server_args.disaggregation_mode == "decode":
+                from sglang.srt.disaggregation.decode import (
+                    DecodeReqToTokenPool,
+                    HybridMambaDecodeReqToTokenPool,
+                )
+
+                # subscribe memory for pre-allocated requests
+                # if max_num_reqs <= 32, we pre-allocate 2x requests
+                pre_alloc_size = max_num_reqs * 2 if max_num_reqs <= 32 else 0
+                if config := self.mambaish_config:
+                    self.req_to_token_pool = HybridMambaDecodeReqToTokenPool(
+                        size=max_num_reqs,
+                        max_context_len=self.model_config.context_len
+                        + extra_max_context_len,
+                        device=self.device,
+                        enable_memory_saver=self.server_args.enable_memory_saver,
+                        cache_params=config.mamba2_cache_params,
+                        speculative_num_draft_tokens=self.server_args.speculative_num_draft_tokens,
+                        enable_mamba_extra_buffer=self.server_args.enable_mamba_extra_buffer(),
+                        pre_alloc_size=pre_alloc_size,
+                        enable_overlap_schedule=not self.server_args.disable_overlap_schedule,
+                        mamba_size=self.server_args.max_mamba_cache_size,
+                    )
+                else:
+                    self.req_to_token_pool = DecodeReqToTokenPool(
+                        size=max_num_reqs,
+                        max_context_len=self.model_config.context_len
+                        + extra_max_context_len,
+                        device=self.device,
+                        enable_memory_saver=self.server_args.enable_memory_saver,
+                        pre_alloc_size=pre_alloc_size,
+                    )
+            elif config := self.mambaish_config:
+                self.req_to_token_pool = HybridReqToTokenPool(
+                    size=max_num_reqs,
+                    mamba_size=self.server_args.max_mamba_cache_size,
+                    mamba_spec_state_size=max_num_reqs,
+                    max_context_len=self.model_config.context_len
+                    + extra_max_context_len,
+                    device=self.device,
+                    enable_memory_saver=self.server_args.enable_memory_saver,
+                    cache_params=config.mamba2_cache_params,
+                    enable_mamba_extra_buffer=self.server_args.enable_mamba_extra_buffer(),
+                    speculative_num_draft_tokens=self.server_args.speculative_num_draft_tokens,
+                    enable_overlap_schedule=not self.server_args.disable_overlap_schedule,
+                )
+            else:
+                self.req_to_token_pool = ReqToTokenPool(
+                    size=max_num_reqs,
+                    max_context_len=self.model_config.context_len
+                    + extra_max_context_len,
+                    device=self.device,
+                    enable_memory_saver=self.server_args.enable_memory_saver,
+                )
+        else:
+            # Draft worker shares req_to_token_pool with the target worker.
+            assert self.is_draft_worker
+
+        # Initialize token_to_kv_pool
+        is_nsa_model = is_deepseek_nsa(self.model_config.hf_config)
+        if self.server_args.attention_backend == "ascend":
+            if self.use_mla_backend:
+                from sglang.srt.hardware_backend.npu.memory_pool_npu import (
+                    NPUMLATokenToKVPool,
+                )
+
+                self.token_to_kv_pool = NPUMLATokenToKVPool(
+                    self.max_total_num_tokens,
+                    page_size=self.page_size,
+                    dtype=self.kv_cache_dtype,
+                    kv_lora_rank=self.model_config.kv_lora_rank,
+                    qk_rope_head_dim=self.model_config.qk_rope_head_dim,
+                    index_head_dim=(
+                        self.model_config.index_head_dim if is_nsa_model else None
+                    ),
+                    layer_num=self.num_effective_layers,
+                    device=self.device,
+                    enable_memory_saver=self.server_args.enable_memory_saver,
+                    start_layer=self.start_layer,
+                    end_layer=self.end_layer,
+                )
+            else:
+                from sglang.srt.hardware_backend.npu.memory_pool_npu import (
+                    NPUMHATokenToKVPool,
+                )
+
+                self.token_to_kv_pool = NPUMHATokenToKVPool(
+                    self.max_total_num_tokens,
+                    page_size=self.page_size,
+                    dtype=self.kv_cache_dtype,
+                    head_num=self.model_config.get_num_kv_heads(
+                        get_attention_tp_size()
+                    ),
+                    head_dim=self.model_config.head_dim,
+                    layer_num=self.num_effective_layers,
+                    device=self.device,
+                    enable_memory_saver=self.server_args.enable_memory_saver,
+                    start_layer=self.start_layer,
+                    end_layer=self.end_layer,
+                )
+        elif self.use_mla_backend and is_nsa_model:
+            self.token_to_kv_pool = NSATokenToKVPool(
+                self.max_total_num_tokens,
+                page_size=self.page_size,
+                dtype=self.kv_cache_dtype,
+                kv_lora_rank=self.model_config.kv_lora_rank,
+                qk_rope_head_dim=self.model_config.qk_rope_head_dim,
+                layer_num=self.num_effective_layers,
+                device=self.device,
+                kv_cache_dim=self.calculate_mla_kv_cache_dim(),
+                enable_memory_saver=self.server_args.enable_memory_saver,
+                start_layer=self.start_layer,
+                end_layer=self.end_layer,
+                index_head_dim=get_nsa_index_head_dim(self.model_config.hf_config),
+            )
+        elif self.use_mla_backend and not self.mambaish_config:
+            assert not is_nsa_model
+            if is_float4_e2m1fn_x2(self.kv_cache_dtype):
+                self.token_to_kv_pool = MLATokenToKVPoolFP4(
+                    self.max_total_num_tokens,
+                    page_size=self.page_size,
+                    dtype=self.kv_cache_dtype,
+                    kv_lora_rank=self.model_config.kv_lora_rank,
+                    qk_rope_head_dim=self.model_config.qk_rope_head_dim,
+                    layer_num=self.num_effective_layers,
+                    device=self.device,
+                    enable_memory_saver=self.server_args.enable_memory_saver,
+                    start_layer=self.start_layer,
+                    end_layer=self.end_layer,
+                )
+            else:
+                self.token_to_kv_pool = MLATokenToKVPool(
+                    self.max_total_num_tokens,
+                    page_size=self.page_size,
+                    dtype=self.kv_cache_dtype,
+                    kv_lora_rank=self.model_config.kv_lora_rank,
+                    qk_rope_head_dim=self.model_config.qk_rope_head_dim,
+                    layer_num=self.num_effective_layers,
+                    device=self.device,
+                    enable_memory_saver=self.server_args.enable_memory_saver,
+                    start_layer=self.start_layer,
+                    end_layer=self.end_layer,
+                )
+        elif self.server_args.enable_double_sparsity:
+            self.token_to_kv_pool = DoubleSparseTokenToKVPool(
+                self.max_total_num_tokens,
+                page_size=self.page_size,
+                dtype=self.kv_cache_dtype,
+                head_num=self.model_config.get_num_kv_heads(get_attention_tp_size()),
+                head_dim=self.model_config.head_dim,
+                layer_num=self.num_effective_layers,
+                device=self.device,
+                heavy_channel_num=self.server_args.ds_heavy_channel_num,
+                enable_memory_saver=self.server_args.enable_memory_saver,
+                start_layer=self.start_layer,
+                end_layer=self.end_layer,
+            )
+        else:
+            if self.is_hybrid_swa:
+                kwargs = {}
+                if self.is_hybrid_swa_compress:
+                    kwargs = {
+                        "swa_head_num": max(
+                            1,
+                            self.model_config.hf_text_config.swa_num_key_value_heads
+                            // get_attention_tp_size(),
+                        ),
+                        "swa_head_dim": self.model_config.hf_text_config.swa_head_dim,
+                        "swa_v_head_dim": self.model_config.hf_text_config.swa_v_head_dim,
+                        "v_head_dim": self.model_config.hf_text_config.v_head_dim,
+                    }
+                self.token_to_kv_pool = SWAKVPool(
+                    size=self.full_max_total_num_tokens,
+                    size_swa=self.swa_max_total_num_tokens,
+                    page_size=self.page_size,
+                    dtype=self.kv_cache_dtype,
+                    head_num=self.model_config.get_num_kv_heads(
+                        get_attention_tp_size()
+                    ),
+                    head_dim=self.model_config.head_dim,
+                    swa_attention_layer_ids=self.model_config.swa_attention_layer_ids,
+                    full_attention_layer_ids=self.model_config.full_attention_layer_ids,
+                    enable_kvcache_transpose=False,
+                    device=self.device,
+                    **kwargs,
+                )
+            elif config := self.mambaish_config:
+                extra_args = {}
+                if self.use_mla_backend:
+                    extra_args = {
+                        "kv_lora_rank": self.model_config.kv_lora_rank,
+                        "qk_rope_head_dim": self.model_config.qk_rope_head_dim,
+                    }
+                self.token_to_kv_pool = HybridLinearKVPool(
+                    page_size=self.page_size,
+                    size=self.max_total_num_tokens,
+                    dtype=self.kv_cache_dtype,
+                    head_num=self.model_config.get_num_kv_heads(
+                        get_attention_tp_size()
+                    ),
+                    head_dim=self.model_config.head_dim,
+                    # if draft worker, we only need 1 attention layer's kv pool
+                    full_attention_layer_ids=(
+                        [0]
+                        if self.is_draft_worker
+                        else [
+                            i
+                            for i in config.full_attention_layer_ids
+                            if self.start_layer <= i < self.end_layer
+                        ]
+                    ),
+                    enable_kvcache_transpose=False,
+                    device=self.device,
+                    mamba_pool=self.req_to_token_pool.mamba_pool,
+                    enable_memory_saver=self.server_args.enable_memory_saver,
+                    use_mla=self.use_mla_backend,
+                    **extra_args,
+                )
+            else:
+                if is_float4_e2m1fn_x2(self.kv_cache_dtype):
+                    self.token_to_kv_pool = MHATokenToKVPoolFP4(
+                        self.max_total_num_tokens,
+                        page_size=self.page_size,
+                        dtype=self.kv_cache_dtype,
+                        head_num=self.model_config.get_num_kv_heads(
+                            get_attention_tp_size()
+                        ),
+                        head_dim=self.model_config.head_dim,
+                        layer_num=self.num_effective_layers,
+                        device=self.device,
+                        enable_memory_saver=self.server_args.enable_memory_saver,
+                        start_layer=self.start_layer,
+                        end_layer=self.end_layer,
+                        enable_alt_stream=not self.server_args.enable_pdmux,
+                        enable_kv_cache_copy=(
+                            self.server_args.speculative_algorithm is not None
+                        ),
+                    )
+                else:
+                    self.token_to_kv_pool = MHATokenToKVPool(
+                        self.max_total_num_tokens,
+                        page_size=self.page_size,
+                        dtype=self.kv_cache_dtype,
+                        head_num=self.model_config.get_num_kv_heads(
+                            get_attention_tp_size()
+                        ),
+                        head_dim=self.model_config.head_dim,
+                        layer_num=self.num_effective_layers,
+                        device=self.device,
+                        enable_memory_saver=self.server_args.enable_memory_saver,
+                        start_layer=self.start_layer,
+                        end_layer=self.end_layer,
+                        enable_alt_stream=not self.server_args.enable_pdmux,
+                        enable_kv_cache_copy=(
+                            self.server_args.speculative_algorithm is not None
+                        ),
+                    )
+
+        # Initialize token_to_kv_pool_allocator
+        need_sort = self.server_args.disaggregation_mode in ("decode", "prefill")
+        if self.token_to_kv_pool_allocator is None:
+            if _is_npu and (
+                self.server_args.attention_backend == "ascend"
+                or self.hybrid_gdn_config is not None
+            ):
+                from sglang.srt.hardware_backend.npu.allocator_npu import (
+                    NPUPagedTokenToKVPoolAllocator,
+                )
+
+                self.token_to_kv_pool_allocator = NPUPagedTokenToKVPoolAllocator(
+                    self.max_total_num_tokens,
+                    page_size=self.page_size,
+                    dtype=self.kv_cache_dtype,
+                    device=self.device,
+                    kvcache=self.token_to_kv_pool,
+                    need_sort=need_sort,
+                )
+            else:
+                if self.is_hybrid_swa:
+                    self.token_to_kv_pool_allocator = SWATokenToKVPoolAllocator(
+                        self.full_max_total_num_tokens,
+                        self.swa_max_total_num_tokens,
+                        page_size=self.page_size,
+                        dtype=self.kv_cache_dtype,
+                        device=self.device,
+                        kvcache=self.token_to_kv_pool,
+                        need_sort=need_sort,
+                    )
+                else:
+                    if self.page_size == 1:
+                        self.token_to_kv_pool_allocator = TokenToKVPoolAllocator(
+                            self.max_total_num_tokens,
+                            dtype=self.kv_cache_dtype,
+                            device=self.device,
+                            kvcache=self.token_to_kv_pool,
+                            need_sort=need_sort,
+                        )
+                    else:
+                        self.token_to_kv_pool_allocator = PagedTokenToKVPoolAllocator(
+                            self.max_total_num_tokens,
+                            page_size=self.page_size,
+                            dtype=self.kv_cache_dtype,
+                            device=self.device,
+                            kvcache=self.token_to_kv_pool,
+                            need_sort=need_sort,
+                        )
+
+        else:
+            assert self.is_draft_worker
+            if self.is_hybrid_swa:
+                assert (
+                    self.token_to_kv_pool_allocator.__class__
+                    == SWATokenToKVPoolAllocator
+                )
+                self.token_to_kv_pool.full_to_swa_index_mapping = (
+                    self.token_to_kv_pool_allocator.full_to_swa_index_mapping
+                )
+
+    def init_memory_pool(self: ModelRunner, pre_model_load_memory: int):
+        max_total_tokens = self.server_args.max_total_tokens
+        self.max_total_num_tokens = self.profile_max_num_token(pre_model_load_memory)
+
+        # Resolve max_num_reqs (per dp worker)
+        max_num_reqs = self.server_args.max_running_requests
+        if max_num_reqs is not None:
+            max_num_reqs = max_num_reqs // self.dp_size
+        else:
+            estimated = int(
+                self.max_total_num_tokens / self.model_config.context_len * 512
+            )
+            max_num_reqs = max(min(estimated, 4096), 2048)
+
+        if self.mambaish_config is not None:
+            ratio = self._calculate_mamba_ratio()
+
+            # Constrain the max_num_reqs by the mamba cache size
+            max_num_reqs = min(
+                max_num_reqs, self.server_args.max_mamba_cache_size // ratio
+            )
+
+        if max_total_tokens is not None:
+            if max_total_tokens > self.max_total_num_tokens:
+                logging.warning(
+                    f"max_total_tokens={max_total_tokens} is larger than the profiled value "
+                    f"{self.max_total_num_tokens}. "
+                    f"Use the profiled value instead."
+                )
+            self.max_total_num_tokens = min(self.max_total_num_tokens, max_total_tokens)
+
+        self.max_total_num_tokens = (
+            self.max_total_num_tokens
+            // self.server_args.page_size
+            * self.server_args.page_size
+        )
+        # different pp rank may have different num of layers, so we need to reduce the max_total_num_tokens
+        if self.pp_size > 1:
+            tensor = torch.tensor(self.max_total_num_tokens, dtype=torch.int64)
+            torch.distributed.all_reduce(
+                tensor,
+                op=torch.distributed.ReduceOp.MIN,
+                group=get_world_group().cpu_group,
+            )
+            self.max_total_num_tokens = tensor.item()
+
+        if not self.spec_algorithm.is_none() and self.is_draft_worker:
+            self.max_total_num_tokens = self.server_args.draft_runner_cache_size
+            max_num_reqs = self.server_args.max_num_reqs
+
+        # create token size for hybrid cache
+        if self.is_hybrid_swa:
+            self.set_num_tokens_hybrid_swa()
+
+        if not self.spec_algorithm.is_none() and not self.is_draft_worker:
+            # Draft worker should use SWA adjusted max_total_num_tokens for cache size, otherwise it may cause oob in kv cache store
+            self.server_args.draft_runner_cache_size = self.max_total_num_tokens
+            self.server_args.max_num_reqs = max_num_reqs
+
+        if self.max_total_num_tokens <= 0:
+            raise RuntimeError(
+                f"Not enough memory. Please try to increase --mem-fraction-static. "
+                f"Current value: {self.server_args.mem_fraction_static=}"
+            )
+
+        self._init_pools(max_num_reqs)
+
+        logger.info(
+            f"Memory pool end. "
+            f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
+        )
diff --git a/sglang/python/sglang/srt/model_executor/piecewise_cuda_graph_runner.py b/sglang/python/sglang/srt/model_executor/piecewise_cuda_graph_runner.py
new file mode 100644
index 0000000000000000000000000000000000000000..88f840929f86067d36fd66dea7148b167a453cb1
--- /dev/null
+++ b/sglang/python/sglang/srt/model_executor/piecewise_cuda_graph_runner.py
@@ -0,0 +1,804 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Run the model with cuda graph and torch.compile."""
+
+from __future__ import annotations
+
+import bisect
+import gc
+import logging
+from contextlib import contextmanager
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Optional, Union
+
+import torch
+import tqdm
+
+from sglang.srt.batch_overlap.two_batch_overlap import TboCudaGraphRunnerPlugin
+from sglang.srt.compilation.compilation_config import CompilationConfig
+from sglang.srt.compilation.compile import install_torch_compiled
+from sglang.srt.compilation.piecewise_context_manager import (
+    enable_piecewise_cuda_graph,
+    enable_piecewise_cuda_graph_compile,
+    set_forward_context,
+    set_pcg_capture_stream,
+)
+from sglang.srt.distributed import get_tensor_model_parallel_rank
+from sglang.srt.distributed.device_communicators.pynccl_allocator import (
+    set_graph_pool_id,
+)
+from sglang.srt.distributed.parallel_state import graph_capture
+from sglang.srt.layers.dp_attention import (
+    DpPaddingMode,
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    set_dp_buffer_len,
+    set_is_extend_in_batch,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.layers.moe.utils import get_moe_a2a_backend
+from sglang.srt.layers.pooler import EmbeddingPoolerOutput
+from sglang.srt.layers.utils import MultiPlatformOp
+from sglang.srt.model_executor.forward_batch_info import (
+    CaptureHiddenMode,
+    ForwardBatch,
+    ForwardMode,
+    PPProxyTensors,
+)
+from sglang.srt.model_executor.input_buffers import ForwardInputBuffers
+from sglang.srt.utils import get_available_gpu_memory, is_npu, log_info_on_rank0
+
+logger = logging.getLogger(__name__)
+
+if TYPE_CHECKING:
+    from sglang.srt.model_executor.model_runner import ModelRunner
+
+
+@dataclass
+class PrefillInputBuffers(ForwardInputBuffers):
+    input_ids: torch.Tensor
+    out_cache_loc: torch.Tensor
+    out_cache_loc_swa: Optional[torch.Tensor]
+    mamba_track_indices: Optional[torch.Tensor]
+    mamba_track_mask: Optional[torch.Tensor]
+    mamba_track_seqlens: Optional[torch.Tensor]
+    positions: torch.Tensor
+    input_embeds: Optional[torch.Tensor]
+    mrope_positions: Optional[torch.Tensor]
+
+
+@contextmanager
+def freeze_gc(enable_cudagraph_gc: bool):
+    """
+    Optimize garbage collection during CUDA graph capture.
+    Clean up, then freeze all remaining objects from being included
+    in future collections if GC is disabled during capture.
+    """
+    gc.collect()
+    should_freeze = not enable_cudagraph_gc
+    if should_freeze:
+        gc.freeze()
+    try:
+        yield
+    finally:
+        if should_freeze:
+            gc.unfreeze()
+
+
+def _to_torch(model: torch.nn.Module, reverse: bool, num_tokens: int):
+    for sub in model._modules.values():
+        if isinstance(sub, MultiPlatformOp):
+            if reverse:
+                sub.leave_torch_compile()
+            else:
+                sub.enter_torch_compile(num_tokens=num_tokens)
+        if isinstance(sub, torch.nn.Module):
+            _to_torch(sub, reverse, num_tokens)
+
+
+@contextmanager
+def patch_model(model: torch.nn.Module, compiler: str):
+    try:
+        if compiler != "eager":
+            _to_torch(model, reverse=False, num_tokens=16)
+        yield model
+    finally:
+        _to_torch(model, reverse=True, num_tokens=16)
+
+
+# Reuse this memory pool across all cuda graph runners.
+global_graph_memory_pool = None
+
+
+def get_global_graph_memory_pool():
+    return global_graph_memory_pool
+
+
+def set_global_graph_memory_pool(val):
+    global global_graph_memory_pool
+    global_graph_memory_pool = val
+
+
+def set_torch_compile_config():
+    import torch._dynamo.config
+
+    # Resolve torch._dynamo.exc.FailOnRecompileLimitHit
+    torch._dynamo.config.accumulated_cache_size_limit = 1024
+    if hasattr(torch._dynamo.config, "cache_size_limit"):
+        torch._dynamo.config.cache_size_limit = 1024
+
+
+class PiecewiseCudaGraphRunner:
+    """A PiecewiseCudaGraphRunner runs the forward pass of a model with cuda graph and torch.compile."""
+
+    def is_mamba_track_enabled(self):
+        return (
+            self.model_runner.server_args.enable_mamba_extra_buffer()
+            and not self.model_runner.server_args.disable_radix_cache
+            and self.model_runner.spec_algorithm.is_none()
+        )
+
+    def __init__(self, model_runner: ModelRunner):
+        # Parse args
+        self.model_runner = model_runner
+        self.device = model_runner.device
+        self.device_module = torch.get_device_module(self.device)
+        self.graphs = {}
+        self.output_buffers = {}
+        self.tp_size = model_runner.server_args.tp_size
+        self.dp_size = model_runner.server_args.dp_size
+        self.pp_size = model_runner.server_args.pp_size
+
+        self.attn_tp_size = get_attention_tp_size()
+        self.attn_tp_rank = get_attention_tp_rank()
+
+        set_torch_compile_config()
+
+        assert (
+            self.model_runner.server_args.piecewise_cuda_graph_tokens is not None
+        ), "piecewise_cuda_graph_tokens is not set"
+        assert self.model_runner.server_args.piecewise_cuda_graph_compiler in [
+            "eager",
+            "inductor",
+        ], "By now, only eager and inductor are supported for piecewise cuda graph compiler."
+        self.compile_config = CompilationConfig(
+            self.model_runner.server_args.piecewise_cuda_graph_tokens,
+            self.model_runner.server_args.piecewise_cuda_graph_compiler,
+            self.model_runner.server_args.enable_torch_compile_debug_mode,
+        )
+        if get_moe_a2a_backend().is_deepep() or get_moe_a2a_backend().is_mooncake():
+            self.compile_config.add_split_op(
+                "sglang.moe_forward_piecewise_cuda_graph_impl"
+            )
+
+        self.quant_config = getattr(self.model_runner.model, "quant_config", None)
+
+        # Batch sizes to capture
+        self.capture_num_tokens = self.compile_config.get_capture_sizes()
+        log_info_on_rank0(
+            logger, f"Capture cuda graph num tokens {self.capture_num_tokens}"
+        )
+        self.capture_forward_mode = ForwardMode.EXTEND
+        self.capture_hidden_mode = CaptureHiddenMode.NULL
+
+        # If returning hidden states is enabled, set initial capture hidden mode to full to avoid double-capture on startup
+        if model_runner.server_args.enable_return_hidden_states:
+            self.capture_hidden_mode = CaptureHiddenMode.FULL
+
+        self.max_num_tokens = (
+            max(self.capture_num_tokens) if self.capture_num_tokens else 8192
+        )
+        self.max_bs = model_runner.req_to_token_pool.size
+
+        self.is_multimodal = model_runner.is_multimodal
+        self.mamba_track_enabled = self.is_mamba_track_enabled()
+
+        # Graph inputs
+        with torch.device(self.device):
+            input_ids = torch.zeros((self.max_num_tokens,), dtype=torch.int64)
+            out_cache_loc = torch.zeros(
+                (self.max_num_tokens,), dtype=self._cache_loc_dtype()
+            )
+            out_cache_loc_swa = (
+                torch.zeros((self.max_num_tokens,), dtype=torch.int64)
+                if model_runner.is_hybrid_swa
+                else None
+            )
+            mamba_track_indices = (
+                torch.zeros((self.max_bs,), dtype=torch.int64)
+                if self.mamba_track_enabled
+                else None
+            )
+            mamba_track_mask = (
+                torch.zeros((self.max_bs,), dtype=torch.bool)
+                if self.mamba_track_enabled
+                else None
+            )
+            mamba_track_seqlens = (
+                torch.zeros((self.max_bs,), dtype=torch.int32)
+                if self.mamba_track_enabled
+                else None
+            )
+            positions = torch.zeros((self.max_num_tokens,), dtype=torch.int64)
+
+            self.tbo_plugin = TboCudaGraphRunnerPlugin()
+
+            if (
+                self.is_multimodal
+            ):  # Only create input_embeds and mrope_positions for multimodal model to save memory
+                # 1. In multimodal, we only compile and capture the language model part.
+                # 2. The embedder is outside of the graph, but cuda graph requires the input embeds to have a fixed memory address.
+                # 3. Input embeds is a pre-allocated buffer. In model.forward, we copy the embed output to this buffer.
+                input_embeds = torch.zeros(
+                    (self.max_num_tokens, self.model_runner.model_config.hidden_size),
+                    dtype=self.model_runner.dtype,
+                )
+                mrope_positions = torch.zeros(
+                    (3, self.max_num_tokens), dtype=torch.int64
+                )
+            else:
+                input_embeds = None
+                mrope_positions = None
+
+        self.buffers = PrefillInputBuffers(
+            input_ids=input_ids,
+            out_cache_loc=out_cache_loc,
+            out_cache_loc_swa=out_cache_loc_swa,
+            mamba_track_indices=mamba_track_indices,
+            mamba_track_mask=mamba_track_mask,
+            mamba_track_seqlens=mamba_track_seqlens,
+            positions=positions,
+            input_embeds=input_embeds,
+            mrope_positions=mrope_positions,
+        )
+        self.buffers.share_buffers()
+
+        self.attention_layers = self.model_runner.attention_layers
+        self.moe_layers = self.model_runner.moe_layers
+        self.moe_fusions = self.model_runner.moe_fusions
+
+        if get_global_graph_memory_pool() is None:
+            set_global_graph_memory_pool(self.device_module.graph_pool_handle())
+        # Set graph pool id globally to be able to use symmetric memory
+        set_graph_pool_id(get_global_graph_memory_pool())
+
+        with enable_piecewise_cuda_graph():
+            language_model = getattr(
+                self.model_runner.model, "language_model", self.model_runner.model
+            )
+            with patch_model(
+                language_model.model, self.compile_config.compiler
+            ) as patched_model:
+
+                # Dummy warmup for jit kernel
+                self.warmup_compile(num_tokens=self.capture_num_tokens[0])
+
+                install_torch_compiled(
+                    patched_model,
+                    fullgraph=True,
+                    dynamic_arg_dims=None,
+                    compile_config=self.compile_config,
+                    graph_pool=get_global_graph_memory_pool(),
+                )
+
+                with enable_piecewise_cuda_graph_compile():
+                    compile_range = (
+                        tqdm.tqdm(list(reversed(self.capture_num_tokens)))
+                        if get_tensor_model_parallel_rank() == 0
+                        else reversed(self.capture_num_tokens)
+                    )
+                    for _, num_tokens in enumerate(compile_range):
+                        if get_tensor_model_parallel_rank() == 0:
+                            compile_range.set_description(
+                                f"Compiling num tokens ({num_tokens=})"
+                            )
+                        self.warmup_compile(num_tokens=num_tokens)
+
+                set_global_graph_memory_pool(self.device_module.graph_pool_handle())
+                set_graph_pool_id(get_global_graph_memory_pool())
+
+                self.device_module.synchronize()
+                self.model_runner.tp_group.barrier()
+                # Capture
+                self.capture()
+
+        self.raw_num_tokens = 0
+
+    def warmup_compile(self, num_tokens: int):
+        """Warmup the model with a simple forward pass before CUDA graph capture."""
+        buffers = self.buffers
+        input_ids = buffers.input_ids[:num_tokens]
+        input_embeds = buffers.input_embeds[:num_tokens] if self.is_multimodal else None
+        positions = buffers.positions[:num_tokens]
+        mrope_positions = (
+            buffers.mrope_positions[:, :num_tokens] if self.is_multimodal else None
+        )
+        out_cache_loc = buffers.out_cache_loc[:num_tokens]
+        out_cache_loc_swa = (
+            buffers.out_cache_loc_swa[:num_tokens]
+            if buffers.out_cache_loc_swa is not None
+            else None
+        )
+        mamba_track_indices = (
+            buffers.mamba_track_indices[:1]
+            if buffers.mamba_track_indices is not None
+            else None
+        )
+        mamba_track_mask = (
+            buffers.mamba_track_mask[:1]
+            if buffers.mamba_track_mask is not None
+            else None
+        )
+        mamba_track_seqlens = (
+            buffers.mamba_track_seqlens[:1]
+            if buffers.mamba_track_seqlens is not None
+            else None
+        )
+        with torch.device(self.device):
+            forward_batch = ForwardBatch(
+                forward_mode=ForwardMode.EXTEND,
+                batch_size=1,
+                input_ids=input_ids,
+                input_embeds=input_embeds,
+                req_pool_indices=torch.arange(1, device=self.device),
+                seq_lens=torch.tensor([num_tokens], device=self.device),
+                next_token_logits_buffer=None,
+                orig_seq_lens=torch.tensor([num_tokens], device=self.device),
+                seq_lens_cpu=torch.tensor([num_tokens], device="cpu"),
+                req_to_token_pool=self.model_runner.req_to_token_pool,
+                token_to_kv_pool=self.model_runner.token_to_kv_pool,
+                attn_backend=self.model_runner.attn_backend,
+                out_cache_loc=out_cache_loc,
+                out_cache_loc_swa=out_cache_loc_swa,
+                seq_lens_sum=num_tokens,
+                mamba_track_indices=mamba_track_indices,
+                mamba_track_mask=mamba_track_mask,
+                mamba_track_seqlens=mamba_track_seqlens,
+                encoder_lens=None,
+                return_logprob=False,
+                extend_num_tokens=num_tokens,
+                extend_seq_lens=torch.tensor([num_tokens], device=self.device),
+                extend_prefix_lens=torch.tensor([0], device=self.device),
+                extend_start_loc=torch.tensor([0], device=self.device),
+                extend_prefix_lens_cpu=torch.tensor([0], device="cpu"),
+                extend_seq_lens_cpu=torch.tensor([num_tokens], device="cpu"),
+                extend_logprob_start_lens_cpu=torch.tensor([num_tokens], device="cpu"),
+                positions=positions,
+                global_num_tokens_gpu=None,
+                global_num_tokens_for_logprob_gpu=None,
+                dp_padding_mode=DpPaddingMode.get_default_mode_in_cuda_graph(),
+                global_dp_buffer_len=None,
+                mrope_positions=mrope_positions,
+                spec_algorithm=None,
+                spec_info=None,
+                capture_hidden_mode=CaptureHiddenMode.NULL,
+                num_token_non_padded=None,
+                global_forward_mode=ForwardMode.EXTEND,
+                lora_ids=None,
+            )
+
+        # Attention backend
+        self.model_runner.attn_backend.init_forward_metadata(forward_batch)
+        forward_batch.dp_local_start_pos = forward_batch.dp_local_num_tokens = None
+        set_dp_buffer_len(None, num_tokens, forward_batch.dp_padding_mode.is_max_len())
+        set_is_extend_in_batch(False)
+        with set_forward_context(
+            forward_batch,
+            self.attention_layers,
+            self.quant_config,
+            self.moe_layers,
+            self.moe_fusions,
+        ):
+            _ = self.model_runner.model.forward(
+                forward_batch.input_ids,
+                forward_batch.positions,
+                forward_batch,
+            )
+
+    def _cache_loc_dtype(self):
+        return torch.int64 if not is_npu() else torch.int32
+
+    def can_run(self, forward_batch: ForwardBatch):
+        # Disable piecewise cuda graph for input embeddings
+        # TODO(yuwei): fix it
+        if forward_batch.input_embeds is not None:
+            return False
+        num_tokens = len(forward_batch.input_ids)
+        if forward_batch.return_logprob:
+            for start_len, seq_len in zip(
+                forward_batch.extend_logprob_start_lens_cpu,
+                forward_batch.extend_seq_lens_cpu,
+            ):
+                if start_len is not None and start_len < seq_len:
+                    return False
+        if num_tokens <= self.max_num_tokens:
+            return True
+        return False
+
+    def capture(self) -> None:
+        # Trigger CUDA graph capture for specific shapes.
+        # Capture the large shapes first so that the smaller shapes
+        # can reuse the memory pool allocated for the large shapes.
+        with freeze_gc(
+            self.model_runner.server_args.enable_cudagraph_gc
+        ), graph_capture() as graph_capture_context:
+            stream = graph_capture_context.stream
+            with set_pcg_capture_stream(stream):
+                avail_mem = get_available_gpu_memory(
+                    self.model_runner.device,
+                    self.model_runner.gpu_id,
+                    empty_cache=False,
+                )
+                # Reverse the order to enable better memory sharing across cuda graphs.
+                capture_range = (
+                    tqdm.tqdm(list(reversed(self.capture_num_tokens)))
+                    if get_tensor_model_parallel_rank() == 0
+                    else reversed(self.capture_num_tokens)
+                )
+                for i, num_tokens in enumerate(capture_range):
+                    if get_tensor_model_parallel_rank() == 0:
+                        avail_mem = get_available_gpu_memory(
+                            self.model_runner.device,
+                            self.model_runner.gpu_id,
+                            empty_cache=False,
+                        )
+                        capture_range.set_description(
+                            f"Capturing num tokens ({num_tokens=} {avail_mem=:.2f} GB)"
+                        )
+
+                    self.capture_one_batch_size(num_tokens)
+
+    def capture_one_batch_size(self, num_tokens: int):
+        buffers = self.buffers
+        bs = 1
+
+        # Graph inputs
+        input_ids = buffers.input_ids[:num_tokens]
+        input_embeds = buffers.input_embeds[:num_tokens] if self.is_multimodal else None
+
+        out_cache_loc = buffers.out_cache_loc[:num_tokens]
+        out_cache_loc_swa = (
+            buffers.out_cache_loc_swa[:num_tokens]
+            if buffers.out_cache_loc_swa is not None
+            else None
+        )
+        mamba_track_indices = (
+            buffers.mamba_track_indices[:bs]
+            if buffers.mamba_track_indices is not None
+            else None
+        )
+        mamba_track_mask = (
+            buffers.mamba_track_mask[:bs]
+            if buffers.mamba_track_mask is not None
+            else None
+        )
+        mamba_track_seqlens = (
+            buffers.mamba_track_seqlens[:bs]
+            if buffers.mamba_track_seqlens is not None
+            else None
+        )
+        positions = buffers.positions[:num_tokens]
+        mrope_positions = (
+            buffers.mrope_positions[:, :num_tokens] if self.is_multimodal else None
+        )
+
+        global_dp_buffer_len = None
+
+        if self.model_runner.server_args.enable_lora:
+            # It is safe to capture CUDA graph using empty LoRA id, as the LoRA kernels will always be launched whenever
+            # `--enable-lora` is set to True (and return immediately if the LoRA id is empty for perf optimization).
+            lora_ids = [None] * bs
+        else:
+            lora_ids = None
+
+        with torch.device(self.device):
+            forward_batch = ForwardBatch(
+                forward_mode=ForwardMode.EXTEND,
+                batch_size=bs,
+                input_ids=input_ids,
+                input_embeds=input_embeds,
+                req_pool_indices=torch.arange(bs, device=self.device),
+                seq_lens=torch.tensor([num_tokens], device=self.device),
+                next_token_logits_buffer=None,
+                orig_seq_lens=torch.tensor([num_tokens], device=self.device),
+                seq_lens_cpu=torch.tensor([num_tokens], device="cpu"),
+                req_to_token_pool=self.model_runner.req_to_token_pool,
+                token_to_kv_pool=self.model_runner.token_to_kv_pool,
+                attn_backend=self.model_runner.attn_backend,
+                out_cache_loc=out_cache_loc,
+                out_cache_loc_swa=out_cache_loc_swa,
+                seq_lens_sum=num_tokens,
+                mamba_track_indices=mamba_track_indices,
+                mamba_track_mask=mamba_track_mask,
+                mamba_track_seqlens=mamba_track_seqlens,
+                encoder_lens=None,
+                return_logprob=False,
+                extend_num_tokens=num_tokens,
+                extend_seq_lens=torch.tensor([num_tokens], device=self.device),
+                extend_prefix_lens=torch.tensor([0], device=self.device),
+                extend_start_loc=torch.tensor([0], device=self.device),
+                extend_prefix_lens_cpu=torch.tensor([0], device="cpu"),
+                extend_seq_lens_cpu=torch.tensor([num_tokens], device="cpu"),
+                extend_logprob_start_lens_cpu=torch.tensor([num_tokens], device="cpu"),
+                positions=positions,
+                global_num_tokens_gpu=None,
+                global_num_tokens_for_logprob_gpu=None,
+                dp_padding_mode=DpPaddingMode.get_default_mode_in_cuda_graph(),
+                global_dp_buffer_len=None,
+                mrope_positions=mrope_positions,
+                spec_algorithm=None,
+                spec_info=None,
+                capture_hidden_mode=CaptureHiddenMode.NULL,
+                num_token_non_padded=None,
+                global_forward_mode=ForwardMode.EXTEND,
+                lora_ids=None,
+            )
+            self.tbo_plugin.capture_one_batch_size(forward_batch, num_tokens=num_tokens)
+
+        if lora_ids is not None:
+            self.model_runner.lora_manager.prepare_lora_batch(forward_batch)
+
+        self.model_runner.attn_backend.init_forward_metadata(forward_batch)
+
+        # Run and capture
+        def run_once():
+            # Clean intermediate result cache for DP attention
+            forward_batch.dp_local_start_pos = forward_batch.dp_local_num_tokens = None
+            set_dp_buffer_len(
+                global_dp_buffer_len,
+                num_tokens,
+                forward_batch.dp_padding_mode.is_max_len(),
+            )
+            # FIXME: the implementation is hacky. `is_extend_in_batch`` is for determining the deepep mode.
+            # It is True in this context but we need to set it to use low latency deepep mode.
+            set_is_extend_in_batch(False)
+
+            kwargs = {}
+            with set_forward_context(
+                forward_batch,
+                self.attention_layers,
+                self.quant_config,
+                self.moe_layers,
+                self.moe_fusions,
+            ):
+                self.model_runner.model.forward(
+                    forward_batch.input_ids,
+                    forward_batch.positions,
+                    forward_batch,
+                    **kwargs,
+                )
+            return
+
+        # run twice for warmup at the first time and cuda graph capture at the second time
+        # detail lies in sglang/python/sglang/srt/compilation/cuda_piecewise_backend.py
+        for _ in range(2):
+            self.device_module.synchronize()
+            self.model_runner.tp_group.barrier()
+            run_once()
+
+        return
+
+    def replay_prepare(
+        self,
+        forward_batch: ForwardBatch,
+        **kwargs,
+    ):
+        buffers = self.buffers
+        num_tokens = len(forward_batch.input_ids)
+        index = bisect.bisect_left(self.capture_num_tokens, num_tokens)
+        static_num_tokens = self.capture_num_tokens[index]
+        self.raw_num_tokens = num_tokens
+        if static_num_tokens != num_tokens:
+            buffers.out_cache_loc.zero_()
+            if buffers.out_cache_loc_swa is not None:
+                buffers.out_cache_loc_swa.zero_()
+            buffers.input_ids[num_tokens:static_num_tokens].zero_()
+            buffers.positions[num_tokens:static_num_tokens].zero_()
+            if self.is_multimodal:
+                buffers.input_embeds[:, num_tokens:static_num_tokens].zero_()
+            if forward_batch.mrope_positions is not None:
+                buffers.mrope_positions[:, num_tokens:static_num_tokens].zero_()
+
+        bs = forward_batch.batch_size
+
+        buffers.input_ids[:num_tokens].copy_(forward_batch.input_ids)
+        buffers.positions[:num_tokens].copy_(forward_batch.positions)
+        buffers.out_cache_loc[:num_tokens].copy_(forward_batch.out_cache_loc)
+        if buffers.out_cache_loc_swa is not None:
+            buffers.out_cache_loc_swa[: self.raw_num_tokens].copy_(
+                self.model_runner.token_to_kv_pool_allocator.translate_loc_from_full_to_swa(
+                    forward_batch.out_cache_loc
+                )
+            )
+
+        if (
+            buffers.mamba_track_indices is not None
+            and forward_batch.mamba_track_indices is not None
+        ):
+            buffers.mamba_track_indices[:bs].copy_(forward_batch.mamba_track_indices)
+        if (
+            buffers.mamba_track_mask is not None
+            and forward_batch.mamba_track_mask is not None
+        ):
+            buffers.mamba_track_mask[:bs].copy_(forward_batch.mamba_track_mask)
+        if (
+            buffers.mamba_track_seqlens is not None
+            and forward_batch.mamba_track_seqlens is not None
+        ):
+            buffers.mamba_track_seqlens[:bs].copy_(forward_batch.mamba_track_seqlens)
+
+        input_ids = buffers.input_ids[:static_num_tokens]
+        positions = buffers.positions[:static_num_tokens]
+        out_cache_loc = buffers.out_cache_loc[:static_num_tokens]
+
+        out_cache_loc_swa = (
+            buffers.out_cache_loc_swa[:static_num_tokens]
+            if forward_batch.out_cache_loc_swa is not None
+            else None
+        )
+
+        mamba_track_indices = (
+            buffers.mamba_track_indices[:bs]
+            if buffers.mamba_track_indices is not None
+            else None
+        )
+        mamba_track_mask = (
+            buffers.mamba_track_mask[:bs]
+            if buffers.mamba_track_mask is not None
+            else None
+        )
+        mamba_track_seqlens = (
+            buffers.mamba_track_seqlens[:bs]
+            if buffers.mamba_track_seqlens is not None
+            else None
+        )
+        if forward_batch.mrope_positions is not None:
+            buffers.mrope_positions[:, :num_tokens].copy_(forward_batch.mrope_positions)
+
+        input_ids = buffers.input_ids[:static_num_tokens]
+        input_embeds = (
+            buffers.input_embeds[:static_num_tokens] if self.is_multimodal else None
+        )
+
+        mrope_positions = (
+            buffers.mrope_positions[:, :static_num_tokens]
+            if forward_batch.mrope_positions is not None
+            else None
+        )
+
+        next_token_logits_buffer = None
+
+        static_forward_batch = ForwardBatch(
+            forward_mode=forward_batch.forward_mode,
+            batch_size=bs,
+            input_ids=input_ids,
+            input_embeds=input_embeds,
+            req_pool_indices=forward_batch.req_pool_indices,
+            seq_lens=forward_batch.seq_lens,
+            next_token_logits_buffer=next_token_logits_buffer,
+            orig_seq_lens=forward_batch.orig_seq_lens,
+            seq_lens_cpu=forward_batch.seq_lens_cpu,
+            req_to_token_pool=self.model_runner.req_to_token_pool,
+            token_to_kv_pool=self.model_runner.token_to_kv_pool,
+            attn_backend=self.model_runner.attn_backend,
+            out_cache_loc=out_cache_loc,
+            out_cache_loc_swa=out_cache_loc_swa,
+            seq_lens_sum=forward_batch.seq_lens_sum,
+            mamba_track_indices=mamba_track_indices,
+            mamba_track_mask=mamba_track_mask,
+            mamba_track_seqlens=mamba_track_seqlens,
+            encoder_lens=forward_batch.encoder_lens,
+            return_logprob=False,
+            extend_seq_lens=forward_batch.extend_seq_lens,
+            extend_prefix_lens=forward_batch.extend_prefix_lens,
+            extend_start_loc=forward_batch.extend_start_loc,
+            extend_prefix_lens_cpu=forward_batch.extend_prefix_lens_cpu,
+            extend_seq_lens_cpu=forward_batch.extend_seq_lens_cpu,
+            extend_logprob_start_lens_cpu=forward_batch.extend_logprob_start_lens_cpu,
+            extend_num_tokens=forward_batch.extend_num_tokens,
+            extend_input_logprob_token_ids_gpu=forward_batch.extend_input_logprob_token_ids_gpu,
+            positions=positions,
+            global_num_tokens_gpu=forward_batch.global_num_tokens_gpu,
+            global_num_tokens_for_logprob_gpu=forward_batch.global_num_tokens_for_logprob_gpu,
+            dp_padding_mode=forward_batch.dp_padding_mode,
+            global_dp_buffer_len=forward_batch.global_dp_buffer_len,
+            mrope_positions=mrope_positions,
+            spec_algorithm=forward_batch.spec_algorithm,
+            spec_info=forward_batch.spec_info,
+            capture_hidden_mode=forward_batch.capture_hidden_mode,
+            num_token_non_padded=forward_batch.num_token_non_padded,
+            global_forward_mode=forward_batch.global_forward_mode,
+            lora_ids=forward_batch.lora_ids,
+            sampling_info=forward_batch.sampling_info,
+            mm_inputs=forward_batch.mm_inputs,
+            temp_scaled_logprobs=forward_batch.temp_scaled_logprobs,
+            temperature=forward_batch.temperature,
+            top_p_normalized_logprobs=forward_batch.top_p_normalized_logprobs,
+            top_p=forward_batch.top_p,
+            dimensions=forward_batch.dimensions,
+        )
+
+        return static_forward_batch
+
+    def replay(
+        self,
+        forward_batch: ForwardBatch,
+        **kwargs,
+    ) -> Union[LogitsProcessorOutput, PPProxyTensors, EmbeddingPoolerOutput]:
+        with enable_piecewise_cuda_graph():
+            # Due to the dispatch kernel for MLA model, we init the metadata with original forward_batch
+            self.model_runner.attn_backend.init_forward_metadata(forward_batch)
+            static_forward_batch = self.replay_prepare(forward_batch, **kwargs)
+            # Replay
+            with set_forward_context(
+                static_forward_batch,
+                self.attention_layers,
+                self.quant_config,
+                self.moe_layers,
+                self.moe_fusions,
+            ):
+                output = self.model_runner.model.forward(
+                    static_forward_batch.input_ids,
+                    static_forward_batch.positions,
+                    static_forward_batch,
+                    **kwargs,
+                )
+                if isinstance(output, LogitsProcessorOutput):
+                    return LogitsProcessorOutput(
+                        next_token_logits=output.next_token_logits[
+                            : self.raw_num_tokens
+                        ],
+                        hidden_states=(
+                            output.hidden_states[: self.raw_num_tokens]
+                            if output.hidden_states is not None
+                            else None
+                        ),
+                    )
+                elif isinstance(output, EmbeddingPoolerOutput):
+                    return output
+                else:
+                    assert isinstance(output, PPProxyTensors)
+                    # TODO(Yuwei): support PP Support
+                    raise NotImplementedError(
+                        "PPProxyTensors is not supported in PiecewiseCudaGraphRunner yet."
+                    )
+
+    def get_spec_info(self, num_tokens: int):
+        spec_info = None
+        if (
+            self.model_runner.spec_algorithm.is_eagle()
+            or self.model_runner.spec_algorithm.is_standalone()
+        ):
+            from sglang.srt.speculative.eagle_utils import EagleVerifyInput
+
+            if self.model_runner.is_draft_worker:
+                raise RuntimeError("This should not happen.")
+            else:
+                spec_info = EagleVerifyInput(
+                    draft_token=None,
+                    custom_mask=self.custom_mask,
+                    positions=None,
+                    retrive_index=None,
+                    retrive_next_token=None,
+                    retrive_next_sibling=None,
+                    retrive_cum_len=None,
+                    spec_steps=self.model_runner.server_args.speculative_num_steps,
+                    topk=self.model_runner.server_args.speculative_eagle_topk,
+                    draft_token_num=self.model_runner.server_args.speculative_num_draft_tokens,
+                    capture_hidden_mode=CaptureHiddenMode.FULL,
+                    seq_lens_sum=None,
+                    seq_lens_cpu=None,
+                )
+
+        return spec_info
diff --git a/sglang/python/sglang/srt/model_loader/__init__.py b/sglang/python/sglang/srt/model_loader/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..87ccb33a4d4fb1f60d8118529dfd4cb574e74f33
--- /dev/null
+++ b/sglang/python/sglang/srt/model_loader/__init__.py
@@ -0,0 +1,40 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/model_loader/__init__.py
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from torch import nn
+
+from sglang.srt.model_loader.loader import BaseModelLoader, get_model_loader
+from sglang.srt.model_loader.utils import (
+    get_architecture_class_name,
+    get_model_architecture,
+)
+
+if TYPE_CHECKING:
+    from sglang.srt.configs.device_config import DeviceConfig
+    from sglang.srt.configs.load_config import LoadConfig
+    from sglang.srt.configs.model_config import ModelConfig
+
+
+def get_model(
+    *,
+    model_config: ModelConfig,
+    load_config: LoadConfig,
+    device_config: DeviceConfig,
+) -> nn.Module:
+    loader = get_model_loader(load_config, model_config)
+    return loader.load_model(
+        model_config=model_config,
+        device_config=device_config,
+    )
+
+
+__all__ = [
+    "get_model",
+    "get_model_loader",
+    "BaseModelLoader",
+    "get_architecture_class_name",
+    "get_model_architecture",
+]
diff --git a/sglang/python/sglang/srt/model_loader/__pycache__/__init__.cpython-311.pyc b/sglang/python/sglang/srt/model_loader/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..13d2a70ef264d7504ccdab8b66a299bee2bd3878
Binary files /dev/null and b/sglang/python/sglang/srt/model_loader/__pycache__/__init__.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/model_loader/__pycache__/ci_weight_validation.cpython-311.pyc b/sglang/python/sglang/srt/model_loader/__pycache__/ci_weight_validation.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..be18d45198d41378d7eb7e68f705154f6f72d0fe
Binary files /dev/null and b/sglang/python/sglang/srt/model_loader/__pycache__/ci_weight_validation.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/model_loader/__pycache__/remote_instance_weight_loader_utils.cpython-311.pyc b/sglang/python/sglang/srt/model_loader/__pycache__/remote_instance_weight_loader_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0b5834915bd03f5527d8abba58e7650957d48678
Binary files /dev/null and b/sglang/python/sglang/srt/model_loader/__pycache__/remote_instance_weight_loader_utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/model_loader/__pycache__/utils.cpython-311.pyc b/sglang/python/sglang/srt/model_loader/__pycache__/utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c3a1ba5bf512355661467b97cef1839d12b3b13c
Binary files /dev/null and b/sglang/python/sglang/srt/model_loader/__pycache__/utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/model_loader/__pycache__/weight_utils.cpython-311.pyc b/sglang/python/sglang/srt/model_loader/__pycache__/weight_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4050d85744b740d84b6251ee9af3a22e509c07d9
Binary files /dev/null and b/sglang/python/sglang/srt/model_loader/__pycache__/weight_utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/model_loader/ci_weight_validation.py b/sglang/python/sglang/srt/model_loader/ci_weight_validation.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5220e922f75dea926f69003f5d2699d9c00591f
--- /dev/null
+++ b/sglang/python/sglang/srt/model_loader/ci_weight_validation.py
@@ -0,0 +1,2096 @@
+"""
+CI-specific weight validation and cache cleanup utilities.
+
+This module contains validation and cleanup logic that is ONLY used in CI environments.
+These functions handle:
+- Validating safetensors files for corruption
+- Checking for missing shards in sharded models
+- Cleaning up corrupted files (selective or full cache deletion)
+- Automatic retry logic for corrupted downloads
+- Validating config/tokenizer files completeness to enable offline mode
+
+For regular users, weight_utils.py provides simple download functionality without
+the overhead of validation and automatic cleanup. The CI-specific behavior is
+gated by is_in_ci() checks in weight_utils.py.
+"""
+
+import glob as glob_module
+import hashlib
+import json
+import logging
+import os
+import re
+import shutil
+import tempfile
+import time
+from typing import List, Optional, Tuple
+
+import safetensors
+
+from sglang.srt.utils import log_info_on_rank0
+
+logger = logging.getLogger(__name__)
+
+# Validation marker version - increment when validation logic changes
+# v2: Added trust_remote_code module validation (modeling_*.py must exist in snapshot)
+# v3: Added remote file existence checks for hf_quant_config.json
+# v5: Invalidate all previous markers to force fresh validation
+VALIDATION_MARKER_VERSION = "5"
+
+
+def _remote_file_exists(
+    repo_id: str, filename: str, revision: Optional[str], allow_remote_check: bool
+) -> Optional[bool]:
+    """
+    Check if a file exists on Hugging Face Hub for a specific revision.
+
+    Args:
+        repo_id: Repository ID (e.g., "meta-llama/Llama-2-7b-hf")
+        filename: File name to check (e.g., "hf_quant_config.json")
+        revision: Git revision (commit hash, branch, or tag). None means default branch.
+        allow_remote_check: Whether remote checks are allowed (e.g., CI validation phase)
+
+    Returns:
+        True if file exists on hub, False if it doesn't exist, None if we cannot determine
+        (network error or remote check not allowed - be conservative and assume incomplete)
+    """
+    if not allow_remote_check:
+        logger.debug(
+            "Remote check disabled for %s/%s, returning None (unknown)",
+            repo_id,
+            filename,
+        )
+        return None
+
+    try:
+        from huggingface_hub import HfApi
+
+        api = HfApi()
+        exists = api.file_exists(repo_id=repo_id, filename=filename, revision=revision)
+        logger.debug(
+            "Remote file check: %s/%s (revision=%s) exists=%s",
+            repo_id,
+            filename,
+            revision or "default",
+            exists,
+        )
+        return exists
+    except Exception as e:
+        # Network errors, auth issues, repo not found, etc.
+        # Return None (unknown) - caller will treat as optional
+        logger.debug(
+            "Failed to check remote file existence for %s/%s (revision=%s): %s. "
+            "Will treat as optional.",
+            repo_id,
+            filename,
+            revision or "default",
+            e,
+        )
+        return None
+
+
+def _get_validation_marker_path(snapshot_dir: str) -> Optional[str]:
+    """
+    Get the path to validation marker file for a snapshot.
+
+    Marker is stored in /tmp to avoid permission issues with HF cache directory.
+    Marker key is sha256(snapshot_dir) to avoid any collisions regardless of
+    model_name_or_path format.
+
+    Args:
+        snapshot_dir: Path to snapshot directory
+
+    Returns:
+        Path to marker file or None if snapshot_dir is invalid
+    """
+    if not snapshot_dir or not os.path.isdir(snapshot_dir):
+        return None
+
+    # Normalize path to avoid marker misses due to trailing slashes or symlinks
+    # realpath resolves symlinks, rstrip removes trailing slashes
+    normalized_dir = os.path.realpath(snapshot_dir).rstrip("/")
+
+    # Use sha256 of normalized snapshot_dir path as unique key
+    # This avoids any collision issues with repo naming or snapshot hash reuse
+    dir_hash = hashlib.sha256(normalized_dir.encode("utf-8")).hexdigest()[:12]
+
+    # Store in /tmp with directory hash
+    return f"/tmp/sglang_hf_validation_{dir_hash}.json"
+
+
+def _get_per_run_marker_dir() -> str:
+    """
+    Get the directory for per-run validation markers.
+
+    These markers are specific to the current CI run and are not shared across
+    runners. They are stored in a temporary directory that is cleaned up after
+    the run completes.
+
+    Returns:
+        Path to per-run marker directory
+    """
+    # Prefer RUNNER_TEMP (GitHub Actions) or TMPDIR, fallback to /tmp
+    base_dir = os.environ.get("RUNNER_TEMP", os.environ.get("TMPDIR", "/tmp"))
+    marker_dir = os.path.join(base_dir, "sglang_ci_offline_markers")
+    os.makedirs(marker_dir, exist_ok=True)
+    return marker_dir
+
+
+def _get_per_run_marker_path(snapshot_dir: str) -> Optional[str]:
+    """
+    Get the path to per-run validation marker file for a snapshot.
+
+    Per-run markers are specific to the current CI run and are not shared
+    across runners. This prevents cross-runner cache state pollution.
+
+    Args:
+        snapshot_dir: Path to snapshot directory
+
+    Returns:
+        Path to per-run marker file or None if snapshot_dir is invalid
+    """
+    if not snapshot_dir or not os.path.isdir(snapshot_dir):
+        return None
+
+    normalized_dir = os.path.realpath(snapshot_dir).rstrip("/")
+    dir_hash = hashlib.sha256(normalized_dir.encode("utf-8")).hexdigest()[:12]
+
+    marker_dir = _get_per_run_marker_dir()
+    return os.path.join(marker_dir, f"{dir_hash}.json")
+
+
+def _read_per_run_marker(snapshot_dir: str) -> Optional[dict]:
+    """
+    Read per-run validation marker for a snapshot.
+
+    Args:
+        snapshot_dir: Path to snapshot directory
+
+    Returns:
+        Marker dict if exists and valid, None otherwise
+    """
+    marker_path = _get_per_run_marker_path(snapshot_dir)
+    if not marker_path or not os.path.exists(marker_path):
+        return None
+
+    try:
+        with open(marker_path, "r", encoding="utf-8") as f:
+            marker = json.load(f)
+
+        # Validate marker structure
+        if not isinstance(marker, dict):
+            return None
+
+        required_keys = ["timestamp", "model_id", "snapshot_hash", "validation_passed"]
+        if not all(k in marker for k in required_keys):
+            return None
+
+        if marker.get("validation_passed") is not True:
+            return None
+
+        return marker
+
+    except Exception as e:
+        logger.debug("Failed to read per-run marker from %s: %s", marker_path, e)
+        return None
+
+
+def _write_per_run_marker(
+    snapshot_dir: str, model_id: str, required_files: Optional[list] = None
+) -> None:
+    """
+    Write per-run validation marker for a snapshot.
+
+    Args:
+        snapshot_dir: Path to snapshot directory
+        model_id: Model identifier
+        required_files: List of required files that were validated
+    """
+    marker_path = _get_per_run_marker_path(snapshot_dir)
+    if not marker_path:
+        logger.debug("Cannot write per-run marker: invalid snapshot_dir")
+        return
+
+    from datetime import datetime
+
+    snapshot_hash = os.path.basename(snapshot_dir)
+
+    marker = {
+        "timestamp": datetime.utcnow().isoformat() + "Z",
+        "model_id": model_id,
+        "snapshot_hash": snapshot_hash,
+        "validation_passed": True,
+        "required_files": required_files or [],
+    }
+
+    try:
+        marker_dir = os.path.dirname(marker_path)
+        os.makedirs(marker_dir, exist_ok=True)
+
+        with tempfile.NamedTemporaryFile(
+            mode="w",
+            encoding="utf-8",
+            dir=marker_dir,
+            delete=False,
+            suffix=".tmp",
+        ) as f:
+            temp_path = f.name
+            json.dump(marker, f, indent=2)
+
+        os.replace(temp_path, marker_path)
+        logger.debug("Wrote per-run marker to %s", marker_path)
+    except Exception as e:
+        logger.warning("Failed to write per-run marker to %s: %s", marker_path, e)
+        try:
+            if "temp_path" in locals() and os.path.exists(temp_path):
+                os.remove(temp_path)
+        except Exception:
+            pass
+
+
+def _remove_per_run_marker(snapshot_dir: str) -> None:
+    """
+    Remove per-run validation marker for a snapshot.
+
+    Args:
+        snapshot_dir: Path to snapshot directory
+    """
+    marker_path = _get_per_run_marker_path(snapshot_dir)
+    if marker_path and os.path.exists(marker_path):
+        try:
+            os.remove(marker_path)
+            logger.debug("Removed per-run marker: %s", marker_path)
+        except Exception as e:
+            logger.warning("Failed to remove per-run marker %s: %s", marker_path, e)
+
+
+def _read_validation_marker(snapshot_dir: str) -> Optional[dict]:
+    """
+    Read validation marker for a snapshot.
+
+    Args:
+        snapshot_dir: Path to snapshot directory
+
+    Returns:
+        Marker dict with keys: version, validated_at, validation_passed
+        None if marker doesn't exist or is invalid or validation_passed is not True
+    """
+    marker_path = _get_validation_marker_path(snapshot_dir)
+    if not marker_path:
+        return None
+
+    if not os.path.exists(marker_path):
+        return None
+
+    try:
+        with open(marker_path, "r", encoding="utf-8") as f:
+            marker = json.load(f)
+
+        # Validate marker structure
+        if not isinstance(marker, dict):
+            return None
+
+        required_keys = ["version", "validated_at", "validation_passed"]
+        if not all(key in marker for key in required_keys):
+            return None
+
+        # Check version match
+        if marker["version"] != VALIDATION_MARKER_VERSION:
+            logger.debug(
+                "Validation marker version mismatch: %s != %s, will re-validate",
+                marker["version"],
+                VALIDATION_MARKER_VERSION,
+            )
+            return None
+
+        # Explicitly check validation_passed is True (defensive check)
+        # Even though we only write markers on success, this guards against
+        # manual edits or future code changes
+        if marker.get("validation_passed") is not True:
+            logger.debug(
+                "Validation marker has validation_passed=%s, treating as invalid",
+                marker.get("validation_passed"),
+            )
+            return None
+
+        return marker
+    except (json.JSONDecodeError, OSError) as e:
+        logger.debug("Failed to read validation marker at %s: %s", marker_path, e)
+        return None
+
+
+def _write_validation_marker(snapshot_dir: str, passed: bool) -> None:
+    """
+    Write validation marker for a snapshot (atomic write).
+
+    IMPORTANT: We only cache successful validations. Failed validations are NOT
+    cached to allow retry after files are downloaded.
+
+    Args:
+        snapshot_dir: Path to snapshot directory
+        passed: Whether validation passed
+    """
+    if not passed:
+        # Don't cache failures - allow retry on next launch
+        return
+
+    marker_path = _get_validation_marker_path(snapshot_dir)
+    if not marker_path:
+        logger.debug("Cannot write marker: invalid snapshot_dir")
+        return
+
+    from datetime import datetime
+
+    marker = {
+        "version": VALIDATION_MARKER_VERSION,
+        "validated_at": datetime.utcnow().isoformat() + "Z",
+        "validation_passed": passed,
+    }
+
+    try:
+        # Atomic write: write to temp file then os.replace
+        marker_dir = os.path.dirname(marker_path)
+        os.makedirs(marker_dir, exist_ok=True)
+
+        with tempfile.NamedTemporaryFile(
+            mode="w",
+            encoding="utf-8",
+            dir=marker_dir,
+            delete=False,
+            suffix=".tmp",
+        ) as f:
+            temp_path = f.name
+            json.dump(marker, f, indent=2)
+
+        # Atomic replace (overwrites existing file if any)
+        os.replace(temp_path, marker_path)
+        logger.debug("Wrote validation marker to %s (passed=%s)", marker_path, passed)
+    except Exception as e:
+        logger.warning("Failed to write validation marker to %s: %s", marker_path, e)
+        # Clean up temp file if it exists
+        try:
+            if "temp_path" in locals() and os.path.exists(temp_path):
+                os.remove(temp_path)
+        except Exception:
+            pass
+
+
+def _validate_json_file(file_path: str, file_name: str) -> bool:
+    """
+    Validate that a JSON file exists, is non-empty, and can be parsed.
+
+    Args:
+        file_path: Path to the JSON file
+        file_name: Name of the file (for logging)
+
+    Returns:
+        True if the file is valid, False otherwise
+    """
+    if not os.path.exists(file_path):
+        logger.debug("CI cache validation: %s not found at %s", file_name, file_path)
+        return False
+
+    if not os.path.isfile(file_path):
+        logger.warning(
+            "CI cache validation: %s is not a file: %s", file_name, file_path
+        )
+        return False
+
+    # Check if file is non-empty
+    try:
+        file_size = os.path.getsize(file_path)
+        if file_size == 0:
+            logger.warning("CI cache validation: %s is empty: %s", file_name, file_path)
+            return False
+    except OSError as e:
+        logger.warning("CI cache validation: Cannot get size of %s: %s", file_name, e)
+        return False
+
+    # Try to parse JSON
+    try:
+        with open(file_path, "r", encoding="utf-8") as f:
+            json.load(f)
+        return True
+    except json.JSONDecodeError as e:
+        logger.warning(
+            "CI cache validation: %s is not valid JSON: %s - %s",
+            file_name,
+            file_path,
+            e,
+        )
+        return False
+    except Exception as e:
+        logger.warning(
+            "CI cache validation: Failed to read %s: %s - %s",
+            file_name,
+            file_path,
+            e,
+        )
+        return False
+
+
+def _validate_config_and_tokenizer_files(
+    snapshot_dir: str,
+    model_id: Optional[str] = None,
+    revision: Optional[str] = None,
+    allow_remote_check: bool = False,
+) -> Tuple[bool, List[str]]:
+    """
+    Validate that critical config and tokenizer files exist and are valid.
+
+    This checks for:
+    - config.json (required)
+    - tokenizer_config.json (required)
+    - generation_config.json (optional but validated if present)
+    - hf_quant_config.json (conditionally required based on Hub) - for FP4/FP8/ModelOpt
+    - quantize_config.json / quant_config.json (optional but validated if present) - for AWQ/GPTQ
+    - params.json (optional but validated if present) - for Mistral native format
+    - preprocessor_config.json (optional but validated if present) - for vision models
+    - trust_remote_code dynamic modules (required if auto_map present in config.json)
+    - At least one tokenizer file: tokenizer.json, tokenizer.model, or tiktoken.model
+
+    Args:
+        snapshot_dir: Path to the model snapshot directory
+        model_id: Model repository ID (e.g., "meta-llama/Llama-2-7b-hf"), used for remote checks
+        revision: Git revision (commit hash), used for remote checks
+        allow_remote_check: Whether to check Hub for file existence to determine requirements
+
+    Returns:
+        Tuple of (is_valid, missing_files)
+        - is_valid: True if all required files are present and valid
+        - missing_files: List of missing or invalid file names
+    """
+    missing_files = []
+
+    # Check required config files
+    required_files = [
+        "config.json",
+        "tokenizer_config.json",
+    ]
+
+    for file_name in required_files:
+        file_path = os.path.join(snapshot_dir, file_name)
+        if not _validate_json_file(file_path, file_name):
+            missing_files.append(file_name)
+
+    # Check optional generation_config.json (validate if exists)
+    generation_config_path = os.path.join(snapshot_dir, "generation_config.json")
+    if os.path.exists(generation_config_path):
+        if not _validate_json_file(generation_config_path, "generation_config.json"):
+            missing_files.append("generation_config.json (exists but invalid)")
+
+    # Check hf_quant_config.json with remote existence check
+    # This file is needed for quantized models (FP4/FP8/ModelOpt)
+    # Example: nvidia/Llama-3.1-8B-Instruct-FP8, nvidia/DeepSeek-V3-0324-FP4
+    hf_quant_config_path = os.path.join(snapshot_dir, "hf_quant_config.json")
+    local_hf_quant_exists = os.path.exists(hf_quant_config_path)
+
+    # Check if file exists on Hub for this revision
+    # Only do remote check if model_id looks like a HF repo_id (org/model format)
+    # Skip if it's a local path (absolute path or doesn't contain '/')
+    remote_hf_quant_exists = None
+    is_hf_repo = (
+        model_id is not None
+        and "/" in model_id
+        and not os.path.isabs(model_id)
+        and not model_id.startswith("/")
+    )
+    if is_hf_repo and allow_remote_check:
+        remote_hf_quant_exists = _remote_file_exists(
+            repo_id=model_id,
+            filename="hf_quant_config.json",
+            revision=revision,
+            allow_remote_check=allow_remote_check,
+        )
+
+    # Apply conditional requirement logic
+    if remote_hf_quant_exists is True:
+        # Hub has this file for this revision - it's REQUIRED
+        if not local_hf_quant_exists:
+            missing_files.append(
+                f"hf_quant_config.json (required: exists on Hub for revision {revision or 'default'} but missing locally)"
+            )
+            log_info_on_rank0(
+                logger,
+                f"Hub has hf_quant_config.json for {model_id} revision {revision or 'default'} "
+                f"but local snapshot missing it. Cache incomplete, will not write marker.",
+            )
+        elif not _validate_json_file(hf_quant_config_path, "hf_quant_config.json"):
+            missing_files.append("hf_quant_config.json (exists but invalid)")
+    elif remote_hf_quant_exists is False:
+        # Hub doesn't have this file - it's OPTIONAL
+        # Only validate if it happens to exist locally
+        if local_hf_quant_exists:
+            if not _validate_json_file(hf_quant_config_path, "hf_quant_config.json"):
+                missing_files.append("hf_quant_config.json (exists but invalid)")
+    else:
+        # remote_hf_quant_exists is None - unknown (network error or remote check disabled)
+        # Treat as OPTIONAL - only enforce when we can positively confirm Hub has it
+        if local_hf_quant_exists:
+            # Local file exists - validate it
+            if not _validate_json_file(hf_quant_config_path, "hf_quant_config.json"):
+                missing_files.append("hf_quant_config.json (exists but invalid)")
+        # If local file missing and remote unknown, just log it - don't block marker
+        logger.debug(
+            "Cannot verify hf_quant_config.json on Hub for %s (revision=%s), "
+            "treating as optional since remote status unknown",
+            model_id or "unknown",
+            revision or "default",
+        )
+
+    # Check optional quantize_config.json / quant_config.json (validate if exists)
+    # These files are needed for AWQ/GPTQ/AutoRound quantized models
+    # Example: TheBloke/Llama-2-7B-AWQ, casperhansen/vicuna-7b-v1.5-awq
+    for quant_config_name in ["quantize_config.json", "quant_config.json"]:
+        quant_config_path = os.path.join(snapshot_dir, quant_config_name)
+        if os.path.exists(quant_config_path):
+            if not _validate_json_file(quant_config_path, quant_config_name):
+                missing_files.append(f"{quant_config_name} (exists but invalid)")
+            break  # Only need to check one of these
+
+    # Check optional params.json (validate if exists)
+    # This file is needed for Mistral native format models
+    # Example: mistralai/Mistral-7B-v0.1
+    params_json_path = os.path.join(snapshot_dir, "params.json")
+    if os.path.exists(params_json_path):
+        if not _validate_json_file(params_json_path, "params.json"):
+            missing_files.append("params.json (exists but invalid)")
+
+    # Check optional preprocessor_config.json (validate if exists)
+    # This file is needed for vision/multimodal models
+    # Example: llava-hf/llava-1.5-7b-hf, Qwen/Qwen2-VL-7B-Instruct
+    preprocessor_config_path = os.path.join(snapshot_dir, "preprocessor_config.json")
+    if os.path.exists(preprocessor_config_path):
+        if not _validate_json_file(
+            preprocessor_config_path, "preprocessor_config.json"
+        ):
+            missing_files.append("preprocessor_config.json (exists but invalid)")
+
+    # Check for trust_remote_code dynamic module files if needed
+    # When auto_map exists in config.json, the model requires custom Python files
+    # These files must be present for offline mode to work
+    config_path = os.path.join(snapshot_dir, "config.json")
+    if os.path.exists(config_path):
+        try:
+            with open(config_path, "r", encoding="utf-8") as f:
+                config = json.load(f)
+
+            auto_map = config.get("auto_map", {})
+            if auto_map and isinstance(auto_map, dict):
+                # Extract Python module files from auto_map
+                # auto_map format: {"AutoConfig": "configuration_xxx.ConfigClass", ...}
+                # We need to check if the .py files exist
+                custom_files = set()
+                for key, value in auto_map.items():
+                    if isinstance(value, str) and "." in value:
+                        # Extract module name (e.g., "configuration_xxx" from "configuration_xxx.ConfigClass")
+                        module_name = value.split(".")[0]
+                        custom_files.add(f"{module_name}.py")
+
+                # Check if all custom files exist in snapshot directory
+                # NOTE: Some models (like nvidia/DeepSeek-V3-0324-FP4) have auto_map
+                # but don't include modeling_*.py in their repo, relying on transformers
+                # to fetch it from the base model. We MUST mark these as missing to
+                # prevent offline mode, which would fail to load the dynamic modules.
+                for custom_file in custom_files:
+                    custom_file_path = os.path.join(snapshot_dir, custom_file)
+                    if not os.path.exists(custom_file_path):
+                        missing_files.append(
+                            f"{custom_file} (required for trust_remote_code)"
+                        )
+                        logger.debug(
+                            f"Custom module file not in snapshot: {custom_file} for {snapshot_dir}"
+                        )
+                    elif not os.path.isfile(custom_file_path):
+                        missing_files.append(f"{custom_file} (exists but not a file)")
+        except (json.JSONDecodeError, OSError, KeyError) as e:
+            # If we can't read config.json, it will be caught by earlier validation
+            logger.debug("Failed to check auto_map in config.json: %s", e)
+
+    # Check for at least one tokenizer file
+    tokenizer_files = [
+        "tokenizer.json",
+        "tokenizer.model",
+        "tiktoken.model",
+    ]
+
+    tokenizer_found = False
+    for tokenizer_file in tokenizer_files:
+        tokenizer_path = os.path.join(snapshot_dir, tokenizer_file)
+        if os.path.exists(tokenizer_path) and os.path.isfile(tokenizer_path):
+            # For tokenizer.json, validate it's proper JSON
+            if tokenizer_file == "tokenizer.json":
+                if _validate_json_file(tokenizer_path, tokenizer_file):
+                    tokenizer_found = True
+                    break
+            else:
+                # For .model files, just check they're non-empty
+                try:
+                    if os.path.getsize(tokenizer_path) > 0:
+                        tokenizer_found = True
+                        break
+                except OSError:
+                    pass
+
+    if not tokenizer_found:
+        missing_files.append("tokenizer file")
+
+    is_valid = len(missing_files) == 0
+    return is_valid, missing_files
+
+
+def ci_validate_cache_and_enable_offline_if_complete(
+    snapshot_dir: str,
+    weight_files: List[str],
+    model_name_or_path: str,
+) -> bool:
+    """
+    Validate local cache completeness (config/tokenizer/weights) and determine
+    if offline mode can be safely enabled.
+
+    This function uses a snapshot-level marker to cache validation results,
+    so the heavy validation is done at most once per snapshot per runner.
+
+    This function checks:
+    1. Validation marker (if exists and version matches, skip re-validation)
+    2. Config and tokenizer files (config.json, tokenizer_config.json, etc.)
+    3. Weight files (safetensors shards, index files, corruption check)
+
+    If all are present and valid, it returns True to signal that offline
+    mode can be safely enabled.
+
+    IMPORTANT: This should be called BEFORE any HF operations, and if it
+    returns True, the caller should set HF_HUB_OFFLINE=1 for the server
+    subprocess env ONLY (not global environment).
+
+    Args:
+        snapshot_dir: Path to the model snapshot directory
+        weight_files: List of weight file paths to validate (must be non-empty)
+        model_name_or_path: Model identifier for logging
+
+    Returns:
+        True if cache is complete and offline mode can be enabled, False otherwise
+    """
+    # Guard: weight_files is required
+    if not weight_files:
+        log_info_on_rank0(
+            logger,
+            f"CI_OFFLINE: No weight files provided, skip offline, keep online allowed - {model_name_or_path}",
+        )
+        return False
+
+    # Fast-path: Check if validation marker exists and is valid
+    # We only cache successful validations, so if marker exists, it means cache is complete
+    marker = _read_validation_marker(snapshot_dir)
+    if marker is not None:
+        marker_path = _get_validation_marker_path(snapshot_dir)
+        marker_name = os.path.basename(marker_path) if marker_path else "unknown"
+        log_info_on_rank0(
+            logger,
+            f"CI_OFFLINE: Marker hit (marker={marker_name}), skip re-validation, offline mode will be enabled - {model_name_or_path}",
+        )
+        return True
+
+    # No marker - perform full validation
+    # (Failures are not cached, so we'll retry validation each time until success)
+
+    # Extract revision (snapshot hash) from snapshot_dir path
+    # snapshot_dir format: /path/to/cache/models--org--model/snapshots/<commit_hash>
+    revision = os.path.basename(snapshot_dir)
+
+    # Only allow remote checks if we're not in offline mode
+    # This avoids unnecessary API calls and warnings in offline CI environments
+    import huggingface_hub.constants
+
+    allow_remote_check = not huggingface_hub.constants.HF_HUB_OFFLINE
+
+    log_info_on_rank0(
+        logger,
+        f"CI_OFFLINE: No marker found, performing full validation "
+        f"(snapshot={revision}, allow_remote_check={allow_remote_check}) - {model_name_or_path}",
+    )
+
+    # Validate config and tokenizer files with remote existence checks
+    config_valid, missing_config_files = _validate_config_and_tokenizer_files(
+        snapshot_dir=snapshot_dir,
+        model_id=model_name_or_path,
+        revision=revision,
+        allow_remote_check=allow_remote_check,
+    )
+
+    if not config_valid:
+        log_info_on_rank0(
+            logger,
+            f"CI_OFFLINE: Missing config/tokenizer files {missing_config_files}, skip offline, keep online allowed - {model_name_or_path}",
+        )
+        # Don't write marker for failures - allow retry after download
+        return False
+
+    # Validate weight files using existing validation from PR #15216
+    # This checks for missing shards, corrupted safetensors, etc.
+    weights_valid, error_msg, _ = _validate_sharded_model(snapshot_dir, weight_files)
+    if not weights_valid:
+        log_info_on_rank0(
+            logger,
+            f"CI_OFFLINE: Weight validation failed ({error_msg}), skip offline, keep online allowed - {model_name_or_path}",
+        )
+        # Don't write marker for failures - allow retry after download
+        return False
+
+    log_info_on_rank0(
+        logger,
+        f"CI_OFFLINE: Cache validation PASSED, offline mode will be enabled - {model_name_or_path}",
+    )
+
+    # Write marker with passed=True for future reuse
+    # (Failures are not cached, so this only happens on success)
+    _write_validation_marker(snapshot_dir, passed=True)
+    return True
+
+
+def _infer_component_type(component_name: str, component_info: list) -> str:
+    """
+    Infer component type from component name and info.
+
+    Args:
+        component_name: Name of the component (e.g., "scheduler", "tokenizer")
+        component_info: Component info from model_index.json (e.g., ["diffusers", "SchedulerClass"])
+
+    Returns:
+        Component type string for validation rules
+    """
+    # Normalize component name for type detection
+    name_lower = component_name.lower()
+
+    # Infer type based on name
+    if "scheduler" in name_lower:
+        return "scheduler"
+    elif "tokenizer" in name_lower:
+        return "tokenizer"
+    elif "image_processor" in name_lower:
+        return "image_processor"
+    elif "feature_extractor" in name_lower:
+        return "feature_extractor"
+    elif "processor" in name_lower:
+        return "processor"
+    else:
+        # Default to model component (needs config.json + weights)
+        return "model"
+
+
+def _check_component_config(
+    component_dir: str, component_type: str
+) -> Tuple[bool, List[str]]:
+    """
+    Check if component has required config files based on type.
+
+    Args:
+        component_dir: Path to component directory
+        component_type: Type of component (scheduler, tokenizer, processor, model, etc.)
+
+    Returns:
+        Tuple of (has_valid_config, list_of_candidates_tried)
+    """
+    if component_type == "scheduler":
+        # Scheduler: scheduler_config.json or config.json
+        candidates = ["scheduler_config.json", "config.json"]
+        for candidate in candidates:
+            candidate_path = os.path.join(component_dir, candidate)
+            if _validate_json_file(candidate_path, candidate):
+                return True, candidates
+        return False, candidates
+
+    elif component_type == "tokenizer":
+        # Tokenizer must have actual tokenizer files (not just tokenizer_config.json)
+        # Valid combinations:
+        # - tokenizer.json
+        # - tokenizer.model
+        # - vocab.json + merges.txt
+        candidates = [
+            "tokenizer.json",
+            "tokenizer.model",
+            "vocab.json+merges.txt",
+        ]
+
+        # Check tokenizer.json (validate as JSON)
+        tokenizer_json_path = os.path.join(component_dir, "tokenizer.json")
+        if _validate_json_file(tokenizer_json_path, "tokenizer.json"):
+            return True, candidates
+
+        # Check tokenizer.model (non-empty file)
+        tokenizer_model_path = os.path.join(component_dir, "tokenizer.model")
+        if os.path.exists(tokenizer_model_path) and os.path.isfile(
+            tokenizer_model_path
+        ):
+            try:
+                if os.path.getsize(tokenizer_model_path) > 0:
+                    return True, candidates
+            except OSError:
+                pass
+
+        # Check vocab.json + merges.txt pair
+        vocab_path = os.path.join(component_dir, "vocab.json")
+        merges_path = os.path.join(component_dir, "merges.txt")
+        if _validate_json_file(vocab_path, "vocab.json") and os.path.exists(
+            merges_path
+        ):
+            return True, candidates
+
+        return False, candidates
+
+    elif component_type in ["processor", "feature_extractor", "image_processor"]:
+        # Processor/feature_extractor/image_processor: preprocessor_config.json or config.json
+        candidates = ["preprocessor_config.json", "config.json"]
+        for candidate in candidates:
+            candidate_path = os.path.join(component_dir, candidate)
+            if _validate_json_file(candidate_path, candidate):
+                return True, candidates
+        return False, candidates
+
+    else:
+        # Default model components: config.json
+        candidates = ["config.json"]
+        config_path = os.path.join(component_dir, "config.json")
+        if _validate_json_file(config_path, "config.json"):
+            return True, candidates
+        return False, candidates
+
+
+def _check_component_weights(component_dir: str) -> bool:
+    """
+    Check if component directory has weight files.
+
+    Args:
+        component_dir: Path to component directory
+
+    Returns:
+        True if weight files found, False otherwise
+    """
+    weight_patterns = ["*.safetensors", "*.bin", "*.pt", "*.pth"]
+
+    for pattern in weight_patterns:
+        weight_files = glob_module.glob(os.path.join(component_dir, pattern))
+        if weight_files:
+            return True
+
+    return False
+
+
+def _format_component_list(components: List[str], max_show: int = 5) -> str:
+    """
+    Format component list with truncation.
+
+    Args:
+        components: List of component names
+        max_show: Maximum number to show before truncating
+
+    Returns:
+        Formatted string like "comp1, comp2, comp3" or "comp1, comp2, +3 more"
+    """
+    if len(components) <= max_show:
+        return ", ".join(components)
+    else:
+        shown = components[:max_show]
+        remaining = len(components) - max_show
+        return f"{', '.join(shown)}, +{remaining} more"
+
+
+def _validate_diffusion_model(
+    snapshot_dir: str,
+) -> Tuple[bool, Optional[str]]:
+    """
+    Validate diffusion model (diffusers pipeline) cache completeness.
+
+    This validation is based on model_index.json as the single source of truth.
+    Error reporting uses coarse-grained error codes unless verbose mode is enabled.
+
+    Error codes:
+    - DIFFUSERS_INVALID_INDEX: model_index.json missing or corrupted
+    - DIFFUSERS_INVALID_COMPONENTS: model_index.json has no valid components
+    - DIFFUSERS_MISSING_COMPONENT: component directory or config missing
+    - DIFFUSERS_MISSING_WEIGHTS: component weights missing
+
+    Args:
+        snapshot_dir: Path to the model snapshot directory
+
+    Returns:
+        Tuple of (is_valid, error_message)
+        - (True, None) if validation passed
+        - (False, error_code_with_components) if validation failed
+    """
+    # Check verbose mode from environment
+    verbose = os.environ.get("SGLANG_CI_VALIDATE_VERBOSE") == "1"
+
+    # 1. Check for model_index.json (required for diffusers models)
+    model_index_path = os.path.join(snapshot_dir, "model_index.json")
+    if not os.path.exists(model_index_path):
+        return False, "DIFFUSERS_INVALID_INDEX: model_index.json not found"
+
+    # Parse model_index.json
+    try:
+        with open(model_index_path, "r", encoding="utf-8") as f:
+            model_index = json.load(f)
+    except (json.JSONDecodeError, OSError) as e:
+        if verbose:
+            return False, f"DIFFUSERS_INVALID_INDEX: model_index.json parse error - {e}"
+        return False, "DIFFUSERS_INVALID_INDEX: model_index.json corrupted"
+
+    # 2. Extract components (non-underscore keys with list values)
+    components = {
+        k: v
+        for k, v in model_index.items()
+        if not k.startswith("_") and isinstance(v, list)
+    }
+
+    if not components:
+        return False, "DIFFUSERS_INVALID_COMPONENTS: no valid components defined"
+
+    # Categorize errors by type
+    missing_dirs = []
+    missing_configs = []
+    missing_configs_verbose = []
+    missing_weights = []
+
+    # 3. Validate each component
+    for component_name, component_info in components.items():
+        component_dir = os.path.join(snapshot_dir, component_name)
+
+        # Component directory must exist
+        if not os.path.isdir(component_dir):
+            missing_dirs.append(component_name)
+            continue
+
+        # Infer component type for validation rules
+        component_type = _infer_component_type(component_name, component_info)
+
+        # Check for required config files based on component type
+        has_valid_config, config_candidates = _check_component_config(
+            component_dir, component_type
+        )
+
+        if not has_valid_config:
+            missing_configs.append(component_name)
+            if verbose:
+                candidates_str = ", ".join(config_candidates)
+                missing_configs_verbose.append(
+                    f"{component_name} (tried: {candidates_str})"
+                )
+            continue
+
+        # 4. Check for weights if component needs them
+        # These components don't require weight files (config-only)
+        needs_weights = component_type not in [
+            "scheduler",
+            "tokenizer",
+            "processor",
+            "feature_extractor",
+            "image_processor",
+        ]
+
+        if needs_weights:
+            has_weights = _check_component_weights(component_dir)
+            if not has_weights:
+                missing_weights.append(component_name)
+
+    # 5. Build error message based on categorized errors
+    if missing_dirs or missing_configs or missing_weights:
+        errors = []
+
+        if missing_dirs:
+            dir_str = _format_component_list(missing_dirs)
+            if verbose:
+                errors.append(f"DIFFUSERS_MISSING_COMPONENT (dirs): {dir_str}")
+            else:
+                errors.append(f"DIFFUSERS_MISSING_COMPONENT(dir): {dir_str}")
+
+        if missing_configs:
+            if verbose:
+                config_str = "; ".join(missing_configs_verbose)
+                errors.append(f"DIFFUSERS_MISSING_COMPONENT (configs): {config_str}")
+            else:
+                config_str = _format_component_list(missing_configs)
+                errors.append(f"DIFFUSERS_MISSING_COMPONENT(cfg): {config_str}")
+
+        if missing_weights:
+            weight_str = _format_component_list(missing_weights)
+            errors.append(f"DIFFUSERS_MISSING_WEIGHTS: {weight_str}")
+
+        return False, " | ".join(errors)
+
+    return True, None
+
+
+def validate_cache_with_detailed_reason(
+    snapshot_dir: str, weight_files: List[str], model_name_or_path: str
+) -> Tuple[bool, Optional[str]]:
+    """
+    Validate cache and return detailed reason for failure.
+
+    This function performs validation without relying on shared validation markers.
+    Used by prevalidate_cached_models.py to provide detailed feedback.
+
+    Args:
+        snapshot_dir: Path to the model snapshot directory
+        weight_files: List of weight file paths to validate
+        model_name_or_path: Model identifier for logging
+
+    Returns:
+        Tuple of (success, reason):
+        - (True, None) if validation passed
+        - (False, reason_str) if validation failed with specific reason
+    """
+    # Guard: weight_files is required
+    if not weight_files:
+        return False, "No weight files provided"
+
+    # Perform full validation and capture failure reasons
+    revision = os.path.basename(snapshot_dir)
+
+    # Read from environment variable instead of huggingface_hub.constants
+    allow_remote_check = os.environ.get("HF_HUB_OFFLINE") != "1"
+
+    # Validate config and tokenizer files
+    config_valid, missing_config_files = _validate_config_and_tokenizer_files(
+        snapshot_dir=snapshot_dir,
+        model_id=model_name_or_path,
+        revision=revision,
+        allow_remote_check=allow_remote_check,
+    )
+
+    if not config_valid:
+        missing_files_str = ", ".join(missing_config_files)
+        return False, f"Missing config/tokenizer files: {missing_files_str}"
+
+    # Validate weight files
+    weights_valid, error_msg, _ = _validate_sharded_model(snapshot_dir, weight_files)
+    if not weights_valid:
+        return False, f"Weight validation failed: {error_msg}"
+
+    # All validations passed
+    return True, None
+
+
+def validate_cache_lightweight(
+    snapshot_dir: str, requires_hf_quant_config: bool = False
+) -> bool:
+    """
+    Lightweight runtime validation for cache completeness.
+
+    This is used during test runs to ensure the current runner's cache
+    is complete before enabling offline mode. Much faster than full validation
+    as it only checks file existence, not corruption.
+
+    Args:
+        snapshot_dir: Path to the model snapshot directory
+        requires_hf_quant_config: If True, hf_quant_config.json must exist
+                                  (required for modelopt quantization)
+
+    Returns:
+        True if cache is complete, False otherwise
+    """
+    # Check required config files
+    required_files = [
+        "config.json",
+        "tokenizer_config.json",
+    ]
+
+    for fname in required_files:
+        if not os.path.exists(os.path.join(snapshot_dir, fname)):
+            return False
+
+    # Check tokenizer files (at least one must exist)
+    tokenizer_files = [
+        "tokenizer.json",
+        "tokenizer.model",
+        "tiktoken.model",
+    ]
+
+    has_tokenizer = any(
+        os.path.exists(os.path.join(snapshot_dir, fname)) for fname in tokenizer_files
+    )
+    if not has_tokenizer:
+        return False
+
+    # Check for trust_remote_code dynamic module files if needed
+    # When auto_map exists in config.json, the model requires custom Python files
+    # These files must be present for offline mode to work
+    config_path = os.path.join(snapshot_dir, "config.json")
+    if os.path.exists(config_path):
+        try:
+            with open(config_path, "r", encoding="utf-8") as f:
+                config = json.load(f)
+
+            auto_map = config.get("auto_map", {})
+            if auto_map and isinstance(auto_map, dict):
+                # Extract Python module files from auto_map
+                # auto_map format: {"AutoConfig": "configuration_xxx.ConfigClass", ...}
+                # We need to check if the .py files exist
+                custom_files = set()
+                for key, value in auto_map.items():
+                    if isinstance(value, str) and "." in value:
+                        # Extract module name (e.g., "configuration_xxx" from "configuration_xxx.ConfigClass")
+                        module_name = value.split(".")[0]
+                        custom_files.add(f"{module_name}.py")
+
+                # Check if all custom files exist in snapshot directory
+                for custom_file in custom_files:
+                    custom_file_path = os.path.join(snapshot_dir, custom_file)
+                    if not os.path.exists(custom_file_path):
+                        logger.debug(
+                            "Custom module file not in snapshot: %s for %s",
+                            custom_file,
+                            snapshot_dir,
+                        )
+                        return False
+                    elif not os.path.isfile(custom_file_path):
+                        logger.debug(
+                            "Custom module path exists but not a file: %s",
+                            custom_file_path,
+                        )
+                        return False
+        except (json.JSONDecodeError, OSError, KeyError) as e:
+            # If we can't read config.json, it will be caught by earlier validation
+            logger.debug("Failed to check auto_map in config.json: %s", e)
+
+    # Check for weight files with index self-consistency
+    index_path = os.path.join(snapshot_dir, "model.safetensors.index.json")
+    has_index = os.path.exists(index_path)
+
+    if has_index:
+        # If index exists, validate that all shards listed in it exist
+        try:
+            with open(index_path, "r", encoding="utf-8") as f:
+                index_data = json.load(f)
+            weight_map = index_data.get("weight_map", {})
+            if weight_map:
+                # Check that all shard files referenced in index exist
+                required_shards = set(weight_map.values())
+                for shard_name in required_shards:
+                    shard_path = os.path.join(snapshot_dir, shard_name)
+                    if not os.path.exists(shard_path):
+                        logger.debug(
+                            "Index validation failed: missing shard %s in %s",
+                            shard_name,
+                            snapshot_dir,
+                        )
+                        return False
+        except (json.JSONDecodeError, OSError, KeyError) as e:
+            logger.debug("Failed to validate index file %s: %s", index_path, e)
+            return False
+    else:
+        # No index file - check for weight files and validate shard completeness
+        safetensors_files = glob_module.glob(
+            os.path.join(snapshot_dir, "*.safetensors")
+        )
+        if not safetensors_files:
+            return False
+
+        # Check shard completeness for sharded models (e.g., model-00001-of-00047.safetensors)
+        # Pattern: prefix-NNNNN-of-NNNNN.safetensors
+        shard_pattern = re.compile(r"(.*?)-(\d+)-of-(\d+)\.safetensors$")
+        shard_groups = {}
+
+        for f in safetensors_files:
+            base_name = os.path.basename(f)
+            match = shard_pattern.match(base_name)
+            if match:
+                prefix = match.group(1)
+                shard_id = int(match.group(2))
+                total_shards = int(match.group(3))
+                group_key = f"{prefix}-of-{total_shards}"
+
+                if group_key not in shard_groups:
+                    shard_groups[group_key] = {
+                        "total": total_shards,
+                        "found_shards": set(),
+                    }
+                shard_groups[group_key]["found_shards"].add(shard_id)
+
+        # Validate each shard group has all expected shards
+        for group_key, group_info in shard_groups.items():
+            total_shards = group_info["total"]
+            found_shards = group_info["found_shards"]
+            expected_shards = set(range(1, total_shards + 1))
+            missing_shards = expected_shards - found_shards
+
+            if missing_shards:
+                logger.debug(
+                    "Shard validation failed: missing shards %s in %s for %s",
+                    sorted(missing_shards),
+                    group_key,
+                    snapshot_dir,
+                )
+                return False
+
+    # Check hf_quant_config.json if required (for modelopt quantization)
+    if requires_hf_quant_config:
+        hf_quant_path = os.path.join(snapshot_dir, "hf_quant_config.json")
+        if not os.path.exists(hf_quant_path):
+            return False
+
+    return True
+
+
+def _validate_safetensors_file(file_path: str) -> bool:
+    """
+    Validate that a safetensors file is readable and not corrupted.
+
+    Args:
+        file_path: Path to the safetensors file
+
+    Returns:
+        True if the file is valid, False if corrupted
+    """
+    try:
+        # Attempt to open and read the header
+        # This will fail if the file is corrupted or incomplete
+        with safetensors.safe_open(file_path, framework="pt", device="cpu") as f:
+            # Just accessing the keys validates the header is readable
+            _ = list(f.keys())
+        return True
+    except Exception as e:
+        logger.warning(
+            "Corrupted safetensors file detected: %s - %s: %s",
+            file_path,
+            type(e).__name__,
+            str(e),
+        )
+        return False
+
+
+def _validate_pytorch_bin_file(file_path: str) -> bool:
+    """
+    Validate that a PyTorch .bin file is readable and not corrupted.
+
+    This catches corruption issues like truncated downloads or invalid archives
+    that would cause errors like:
+    "RuntimeError: PytorchStreamReader failed reading file data/X: invalid header
+    or archive is corrupted"
+
+    Args:
+        file_path: Path to the .bin file
+
+    Returns:
+        True if the file is valid, False if corrupted
+    """
+    try:
+        import torch
+
+        # Use weights_only=True for security and to avoid executing arbitrary code
+        # mmap=False to fully read the file and catch all corruption
+        torch.load(file_path, map_location="cpu", weights_only=True, mmap=False)
+        return True
+    except Exception as e:
+        logger.warning(
+            "Corrupted PyTorch bin file detected: %s - %s: %s",
+            file_path,
+            type(e).__name__,
+            str(e),
+        )
+        return False
+
+
+def _check_index_files_exist(snapshot_dir: str) -> Tuple[bool, Optional[str]]:
+    """
+    Check if all files listed in safetensors index files actually exist on disk.
+
+    This catches cases where the snapshot directory exists but files are missing
+    (e.g., due to incomplete downloads or corrupted cache).
+
+    Args:
+        snapshot_dir: Path to the model snapshot directory
+
+    Returns:
+        Tuple of (all_exist, error_message)
+    """
+    # Find all safetensors index files
+    index_files = [
+        f for f in os.listdir(snapshot_dir) if f.endswith(".safetensors.index.json")
+    ]
+
+    if not index_files:
+        # No index files means it's not a sharded model, skip this check
+        return True, None
+
+    for index_file in index_files:
+        index_path = os.path.join(snapshot_dir, index_file)
+
+        # Check if index file is a broken symlink (exists in listing but blob missing)
+        if os.path.islink(index_path) and not os.path.exists(index_path):
+            # Broken symlink - clean it up so download can proceed
+            try:
+                blob_path = os.path.realpath(index_path)
+                os.remove(index_path)
+                logger.warning(
+                    "Removed broken index symlink: %s (blob missing)", index_file
+                )
+                # Also try to remove dangling blob reference if it somehow exists
+                if os.path.exists(blob_path):
+                    os.remove(blob_path)
+            except Exception as e:
+                logger.error("Failed to remove broken symlink %s: %s", index_file, e)
+            return (
+                False,
+                f"Broken index file symlink: {index_file} (cleaned up, will re-download)",
+            )
+
+        try:
+            with open(index_path) as f:
+                index_data = json.load(f)
+
+            weight_map = index_data.get("weight_map", {})
+            if not weight_map:
+                continue
+
+            # Check that all files in weight_map exist
+            required_files = set(weight_map.values())
+            missing_files = []
+
+            for file_name in required_files:
+                file_path = os.path.join(snapshot_dir, file_name)
+                # Check both existence and that it's not a broken symlink
+                if not os.path.exists(file_path):
+                    missing_files.append(file_name)
+
+            if missing_files:
+                return (
+                    False,
+                    f"Missing {len(missing_files)} file(s) from index {index_file}: {missing_files[:3]}{'...' if len(missing_files) > 3 else ''}",
+                )
+
+        except FileNotFoundError as e:
+            # Index file was listed but can't be read - could be race condition or broken state
+            logger.warning("Failed to read index file %s: %s", index_file, e)
+            return (
+                False,
+                f"Index file {index_file} unreadable (will re-download)",
+            )
+        except Exception as e:
+            logger.warning("Failed to read index file %s: %s", index_file, e)
+            continue
+
+    return True, None
+
+
+def _validate_sharded_model(
+    snapshot_dir: str, weight_files: List[str]
+) -> Tuple[bool, Optional[str], List[str]]:
+    """
+    Validate that all model shards are present and not corrupted.
+
+    Args:
+        snapshot_dir: Path to the model snapshot directory
+        weight_files: List of weight file paths
+
+    Returns:
+        Tuple of (is_valid, error_message, corrupted_files)
+        - corrupted_files: List of file paths that are corrupted (for selective cleanup)
+    """
+    # First, check if all files from the index actually exist
+    # This catches missing files that wouldn't be found by glob
+    index_check_valid, index_error = _check_index_files_exist(snapshot_dir)
+    if not index_check_valid:
+        return False, index_error, []
+
+    # Pattern for sharded files: model-00001-of-00009.safetensors
+    shard_pattern = re.compile(r"(.*?)-(\d+)-of-(\d+)\.(safetensors|bin)")
+
+    # Group files by shard pattern (prefix-*-of-N)
+    shard_groups = {}
+    for f in weight_files:
+        base_name = os.path.basename(f)
+        match = shard_pattern.match(base_name)
+        if match:
+            prefix = match.group(1)
+            total_shards_str = match.group(3)
+            suffix = match.group(4)
+
+            group_key = f"{prefix}-of-{total_shards_str}.{suffix}"
+            if group_key not in shard_groups:
+                shard_groups[group_key] = {
+                    "prefix": prefix,
+                    "total": int(total_shards_str),
+                    "suffix": suffix,
+                    "found_shards": [],
+                    "files": [],
+                }
+
+            shard_id = int(match.group(2))
+            shard_groups[group_key]["found_shards"].append(shard_id)
+            shard_groups[group_key]["files"].append(f)
+
+    # Track corrupted files for selective cleanup
+    corrupted_files = []
+
+    # Validate each shard group
+    for group_key, group_info in shard_groups.items():
+        total_shards = group_info["total"]
+        found_shards = set(group_info["found_shards"])
+        expected_shards = set(range(1, total_shards + 1))
+
+        # Check for missing shards
+        missing_shards = expected_shards - found_shards
+        if missing_shards:
+            return (
+                False,
+                f"Missing shards in {group_key}: {sorted(missing_shards)}",
+                [],
+            )
+
+        # Validate weight files for corruption
+        if group_info["suffix"] == "safetensors":
+            for f in group_info["files"]:
+                if not _validate_safetensors_file(f):
+                    corrupted_files.append(f)
+        elif group_info["suffix"] == "bin":
+            for f in group_info["files"]:
+                if not _validate_pytorch_bin_file(f):
+                    corrupted_files.append(f)
+
+        # Check for required index file for safetensors shards
+        if group_info["suffix"] == "safetensors":
+            index_file = os.path.join(
+                snapshot_dir, f"{group_info['prefix']}.safetensors.index.json"
+            )
+            if not os.path.exists(index_file):
+                return (
+                    False,
+                    f"Missing index file: {os.path.basename(index_file)}",
+                    [],
+                )
+
+    if corrupted_files:
+        return (
+            False,
+            f"Corrupted shard files: {[os.path.basename(f) for f in corrupted_files]}",
+            corrupted_files,
+        )
+
+    return True, None, []
+
+
+def _cleanup_corrupted_files_selective(
+    model_name_or_path: str, corrupted_files: List[str]
+) -> int:
+    """
+    Selectively remove corrupted files and their blobs to force re-download.
+
+    This is more efficient than removing the entire model cache as it only
+    re-downloads corrupted files rather than the entire model.
+
+    Args:
+        model_name_or_path: Model identifier
+        corrupted_files: List of corrupted file paths (symlinks in snapshot)
+
+    Returns:
+        Number of files successfully cleaned up
+    """
+    cleaned_count = 0
+
+    for file_path in corrupted_files:
+        try:
+            # Resolve symlink to get blob path before deleting symlink
+            if os.path.islink(file_path):
+                blob_path = os.path.realpath(file_path)
+
+                # Delete the symlink
+                os.remove(file_path)
+                logger.info(
+                    "Removed corrupted symlink: %s", os.path.basename(file_path)
+                )
+
+                # Delete the blob (the actual corrupted data)
+                if os.path.exists(blob_path):
+                    os.remove(blob_path)
+                    logger.info(
+                        "Removed corrupted blob: %s", os.path.basename(blob_path)
+                    )
+
+                cleaned_count += 1
+            elif os.path.exists(file_path):
+                # Not a symlink, just delete the file
+                os.remove(file_path)
+                logger.info("Removed corrupted file: %s", os.path.basename(file_path))
+                cleaned_count += 1
+
+        except Exception as e:
+            logger.error(
+                "Failed to remove corrupted file %s: %s",
+                os.path.basename(file_path),
+                e,
+            )
+
+    if cleaned_count > 0:
+        logger.warning(
+            "Removed %d corrupted file(s) for %s. "
+            "These will be re-downloaded on next load.",
+            cleaned_count,
+            model_name_or_path,
+        )
+
+    return cleaned_count
+
+
+def _cleanup_corrupted_model_cache(
+    model_name_or_path: str, snapshot_dir: str, reason: str
+) -> None:
+    """
+    Remove entire corrupted model cache directory to force a clean re-download.
+
+    This is used when we cannot selectively clean (e.g., missing shards, incomplete
+    downloads with unknown affected files).
+
+    Args:
+        model_name_or_path: Model identifier
+        snapshot_dir: Path to the snapshot directory
+        reason: Reason for cleanup
+    """
+    # Navigate up to the model root directory: snapshots/hash -> snapshots -> model_root
+    repo_folder = os.path.abspath(os.path.join(snapshot_dir, "..", ".."))
+
+    try:
+        logger.warning(
+            "Removing entire cache for %s at %s. Reason: %s",
+            model_name_or_path,
+            repo_folder,
+            reason,
+        )
+        shutil.rmtree(repo_folder)
+        logger.info("Successfully removed corrupted cache directory")
+    except Exception as e:
+        logger.error(
+            "Failed to remove corrupted cache directory %s: %s. "
+            "Manual cleanup may be required.",
+            repo_folder,
+            e,
+        )
+
+
+def ci_validate_and_cleanup_local_snapshot(
+    model_name_or_path: str,
+    found_local_snapshot_dir: str,
+    local_weight_files: List[str],
+) -> bool:
+    """
+    CI-specific validation and cleanup for local model snapshots.
+
+    This function validates the local snapshot and performs automatic cleanup
+    if corruption or missing files are detected. This behavior is only appropriate
+    for CI environments where we want automatic recovery.
+
+    Args:
+        model_name_or_path: Model identifier for logging
+        found_local_snapshot_dir: Path to the local snapshot directory
+        local_weight_files: List of weight file paths found in the snapshot
+
+    Returns:
+        True if the snapshot is valid and can be used, False if it was invalid
+        and cleanup was performed (caller should re-download)
+    """
+    # Check for incomplete files and clean up if found
+    repo_folder = os.path.abspath(os.path.join(found_local_snapshot_dir, "..", ".."))
+    blobs_dir = os.path.join(repo_folder, "blobs")
+
+    # Check for incomplete download markers
+    incomplete_files = []
+    if os.path.isdir(blobs_dir):
+        incomplete_files = glob_module.glob(os.path.join(blobs_dir, "*.incomplete"))
+
+    if incomplete_files:
+        log_info_on_rank0(
+            logger,
+            f"Found {len(incomplete_files)} .incomplete files in {blobs_dir} for "
+            f"{model_name_or_path}. Will clean up and re-download.",
+        )
+        _cleanup_corrupted_model_cache(
+            model_name_or_path,
+            found_local_snapshot_dir,
+            f"Incomplete download detected ({len(incomplete_files)} incomplete files)",
+        )
+        return False
+
+    # Validate sharded models and check for corruption
+    if local_weight_files:
+        is_valid, error_msg, corrupted_files = _validate_sharded_model(
+            found_local_snapshot_dir, local_weight_files
+        )
+        if not is_valid:
+            if corrupted_files:
+                # Selective cleanup: only remove corrupted files
+                log_info_on_rank0(
+                    logger,
+                    f"Found {len(corrupted_files)} corrupted file(s) for "
+                    f"{model_name_or_path}: {error_msg}. "
+                    "Will selectively clean and re-download only these files.",
+                )
+                _cleanup_corrupted_files_selective(model_name_or_path, corrupted_files)
+                return False
+            else:
+                # Missing shards (not corruption) - let snapshot_download handle it.
+                # IMPORTANT: Do NOT delete the entire cache here, as other processes
+                # (TP/EP ranks) may already be loading weights from these files.
+                log_info_on_rank0(
+                    logger,
+                    f"Validation failed for {model_name_or_path}: {error_msg}. "
+                    "Will attempt to download missing files.",
+                )
+                return False
+
+        # Also validate single (non-sharded) weight files
+        for f in local_weight_files:
+            base_name = os.path.basename(f)
+            # Check if this is a single model file (not sharded)
+            # Include adapter_model.safetensors for LoRA adapters
+            if base_name in [
+                "model.safetensors",
+                "pytorch_model.safetensors",
+                "adapter_model.safetensors",
+            ]:
+                if not _validate_safetensors_file(f):
+                    log_info_on_rank0(
+                        logger,
+                        f"Corrupted model file {base_name} for {model_name_or_path}. "
+                        "Will selectively clean and re-download this file.",
+                    )
+                    # Selective cleanup for single file
+                    _cleanup_corrupted_files_selective(model_name_or_path, [f])
+                    return False
+            # Also validate single PyTorch .bin files
+            elif base_name in [
+                "pytorch_model.bin",
+                "model.bin",
+                "adapter_model.bin",
+            ]:
+                if not _validate_pytorch_bin_file(f):
+                    log_info_on_rank0(
+                        logger,
+                        f"Corrupted model file {base_name} for {model_name_or_path}. "
+                        "Will selectively clean and re-download this file.",
+                    )
+                    # Selective cleanup for single file
+                    _cleanup_corrupted_files_selective(model_name_or_path, [f])
+                    return False
+
+    return True
+
+
+def _validate_weights_after_download(
+    hf_folder: str,
+    allow_patterns: List[str],
+    model_name_or_path: str,
+) -> bool:
+    """
+    Validate downloaded weight files to catch corruption early.
+
+    This function validates safetensors files after download to catch
+    corruption issues (truncated downloads, network errors, etc.) before
+    model loading fails with cryptic errors. If corruption is found,
+    the corrupted files are automatically cleaned up.
+
+    Args:
+        hf_folder: Path to the downloaded model folder
+        allow_patterns: Patterns used to match weight files
+        model_name_or_path: Model identifier for error messages
+
+    Returns:
+        True if all files are valid, False if corrupted files were found and cleaned up
+    """
+    # Find all weight files that were downloaded
+    weight_files: List[str] = []
+    for pattern in allow_patterns:
+        weight_files.extend(glob_module.glob(os.path.join(hf_folder, pattern)))
+
+    if not weight_files:
+        return True  # No weight files to validate
+
+    # Validate weight files (safetensors and .bin)
+    corrupted_files = []
+    for f in weight_files:
+        if f.endswith(".safetensors") and os.path.exists(f):
+            if not _validate_safetensors_file(f):
+                corrupted_files.append(os.path.basename(f))
+        elif f.endswith(".bin") and os.path.exists(f):
+            if not _validate_pytorch_bin_file(f):
+                corrupted_files.append(os.path.basename(f))
+
+    if corrupted_files:
+        # Clean up corrupted files so next attempt re-downloads them
+        _cleanup_corrupted_files_selective(
+            model_name_or_path,
+            [os.path.join(hf_folder, f) for f in corrupted_files],
+        )
+        log_info_on_rank0(
+            logger,
+            f"Downloaded model files are corrupted for {model_name_or_path}: "
+            f"{corrupted_files}. The corrupted files have been removed. "
+            "Will retry download.",
+        )
+        return False
+
+    return True
+
+
+def _get_lock_file_path(
+    model_name_or_path: str, cache_dir: Optional[str] = None
+) -> str:
+    """
+    Generate a unique lock file path for download coordination.
+
+    In CI environments where multiple containers share an NFS-mounted HF cache,
+    the lock file is placed on the shared cache directory so ALL containers
+    coordinate on the same lock. This prevents cross-container .incomplete
+    file race conditions.
+
+    Falls back to /dev/shm (container-local) for non-CI or when the cache
+    dir is not accessible.
+
+    Args:
+        model_name_or_path: Model identifier
+        cache_dir: HF cache directory (None to use default)
+
+    Returns:
+        Path to the lock file
+    """
+    key_hash = hashlib.sha256(model_name_or_path.encode()).hexdigest()[:16]
+
+    # In CI, place lock on the shared HF cache directory so that ALL containers
+    # sharing the same NFS-mounted cache coordinate downloads.
+    # /dev/shm is container-local and doesn't prevent cross-container races.
+    try:
+        import huggingface_hub.constants
+
+        effective_cache_dir = cache_dir or huggingface_hub.constants.HF_HUB_CACHE
+        if os.path.isdir(effective_cache_dir):
+            lock_dir = os.path.join(effective_cache_dir, ".sglang_locks")
+            os.makedirs(lock_dir, exist_ok=True)
+            return os.path.join(lock_dir, f"download_{key_hash}.lock")
+    except Exception:
+        pass
+
+    # Fallback to container-local lock
+    if os.path.isdir("/dev/shm"):
+        return f"/dev/shm/sglang_download_lock_{key_hash}"
+    return f"/tmp/sglang_download_lock_{key_hash}"
+
+
+def _cleanup_incomplete_blobs(model_name_or_path: str, cache_dir: Optional[str]) -> int:
+    """
+    Remove stale .incomplete files from the model's blobs directory.
+
+    This is lighter than _cleanup_corrupted_model_cache (which deletes the
+    entire cache). We only remove .incomplete files so snapshot_download
+    starts fresh on retry, preserving any successfully downloaded blobs.
+
+    Args:
+        model_name_or_path: Model identifier (e.g., "meta-llama/Llama-2-7b-hf")
+        cache_dir: HF cache directory (None to use default)
+
+    Returns:
+        Number of .incomplete files removed
+    """
+    try:
+        import huggingface_hub.constants
+
+        effective_cache_dir = cache_dir or huggingface_hub.constants.HF_HUB_CACHE
+        repo_folder_name = huggingface_hub.constants.REPO_ID_SEPARATOR.join(
+            ["models", *model_name_or_path.split("/")]
+        )
+        blobs_dir = os.path.join(effective_cache_dir, repo_folder_name, "blobs")
+
+        if not os.path.isdir(blobs_dir):
+            return 0
+
+        incomplete_files = glob_module.glob(os.path.join(blobs_dir, "*.incomplete"))
+        removed = 0
+        for f in incomplete_files:
+            try:
+                os.remove(f)
+                removed += 1
+                logger.debug("Removed incomplete blob: %s", os.path.basename(f))
+            except OSError as e:
+                logger.debug(
+                    "Failed to remove incomplete blob %s: %s", os.path.basename(f), e
+                )
+
+        if removed > 0:
+            logger.warning(
+                "Cleaned up %d .incomplete blob(s) for %s in %s",
+                removed,
+                model_name_or_path,
+                blobs_dir,
+            )
+        return removed
+
+    except Exception as e:
+        logger.debug("Failed to clean up incomplete blobs: %s", e)
+        return 0
+
+
+def ci_download_with_validation_and_retry(
+    model_name_or_path: str,
+    allow_patterns: List[str],
+    ignore_patterns,
+    cache_dir: Optional[str],
+    revision: Optional[str],
+    max_retries: int = 3,
+) -> str:
+    """
+    CI-specific download with validation and automatic retry on corruption.
+
+    This function handles the download of model weights in CI environments,
+    with automatic validation and retry logic for handling corrupted downloads.
+
+    Uses filelock.FileLock on the shared HF cache directory to coordinate
+    downloads across all processes AND all containers sharing the same
+    NFS-mounted cache. Only one process downloads at a time; others wait
+    for the lock then use the cached result.
+
+    Args:
+        model_name_or_path: The model name or path
+        allow_patterns: The allowed patterns for weight files
+        ignore_patterns: The patterns to filter out weight files
+        cache_dir: The cache directory to store model weights
+        revision: The revision of the model
+        max_retries: Maximum number of download retries if corruption is detected
+
+    Returns:
+        str: The path to the downloaded model weights
+
+    Raises:
+        RuntimeError: If download fails after max_retries attempts
+    """
+    import filelock
+    import huggingface_hub.constants
+    from huggingface_hub import snapshot_download
+    from tqdm.auto import tqdm
+
+    class DisabledTqdm(tqdm):
+        def __init__(self, *args, **kwargs):
+            kwargs["disable"] = True
+            super().__init__(*args, **kwargs)
+
+    # Use filelock on the shared HF cache directory to coordinate downloads
+    # across all processes AND all containers sharing the same NFS mount.
+    # This prevents cross-container .incomplete file race conditions.
+    lock_file_path = _get_lock_file_path(model_name_or_path, cache_dir)
+
+    logger.info(
+        "[CI Download] Process %d using lock file: %s",
+        os.getpid(),
+        lock_file_path,
+    )
+
+    # filelock.FileLock handles creation, acquisition, and release cleanly.
+    # timeout=-1 means wait indefinitely (another container may be downloading
+    # a large model for 30+ minutes).
+    lock = filelock.FileLock(lock_file_path, timeout=-1, mode=0o666)
+
+    logger.info(
+        "[CI Download] Process %d waiting to acquire lock for %s",
+        os.getpid(),
+        model_name_or_path,
+    )
+
+    with lock:
+        logger.info(
+            "[CI Download] Process %d ACQUIRED lock for %s",
+            os.getpid(),
+            model_name_or_path,
+        )
+
+        # Re-check if another container already downloaded the model while
+        # we were waiting for the lock. This avoids redundant downloads.
+        try:
+            from sglang.srt.model_loader.weight_utils import (
+                _find_local_hf_snapshot_dir_unlocked,
+            )
+
+            cached_path = _find_local_hf_snapshot_dir_unlocked(
+                model_name_or_path, cache_dir, allow_patterns, revision
+            )
+            if cached_path is not None:
+                logger.info(
+                    "[CI Download] Process %d found cached model after "
+                    "acquiring lock (downloaded by another container): %s",
+                    os.getpid(),
+                    cached_path,
+                )
+                return cached_path
+        except Exception as e:
+            logger.debug(
+                "[CI Download] Re-check for cached model failed (non-fatal): %s", e
+            )
+
+        # Clean up stale .incomplete files from previous failed downloads
+        # before starting. Only do this once before the first attempt.
+        cleaned = _cleanup_incomplete_blobs(model_name_or_path, cache_dir)
+        if cleaned > 0:
+            logger.info(
+                "[CI Download] Pre-download cleanup: removed %d stale "
+                ".incomplete file(s) for %s",
+                cleaned,
+                model_name_or_path,
+            )
+
+        hf_folder = None
+        for attempt in range(max_retries):
+            try:
+                hf_folder = snapshot_download(
+                    model_name_or_path,
+                    allow_patterns=allow_patterns,
+                    ignore_patterns=ignore_patterns,
+                    cache_dir=cache_dir,
+                    tqdm_class=DisabledTqdm,
+                    revision=revision,
+                    local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+                    # Force single-threaded downloads to prevent race conditions
+                    # on NFS. HF hub defaults to max_workers=8, which can cause
+                    # .incomplete file conflicts when multiple threads operate
+                    # on the same files
+                    max_workers=1,
+                )
+            except (FileNotFoundError, OSError) as e:
+                # Race condition: .incomplete file was moved/deleted by another
+                # process. With NFS-level locking this should be rare, but can
+                # still happen if lock acquisition fails on some NFS setups.
+                logger.warning(
+                    "[CI Download] Process %d hit download error "
+                    "(attempt %d/%d) for %s: %s: %s",
+                    os.getpid(),
+                    attempt + 1,
+                    max_retries,
+                    model_name_or_path,
+                    type(e).__name__,
+                    e,
+                )
+                if attempt < max_retries - 1:
+                    # Backoff: 10s, 20s, 40s. Clean only the stale
+                    # .incomplete files (not active ones from other processes).
+                    backoff = 10 * (2**attempt)
+                    logger.info(
+                        "[CI Download] Cleaning up .incomplete files and "
+                        "retrying in %ds...",
+                        backoff,
+                    )
+                    _cleanup_incomplete_blobs(model_name_or_path, cache_dir)
+                    time.sleep(backoff)
+                    continue
+                raise RuntimeError(
+                    f"Download failed for {model_name_or_path} after "
+                    f"{max_retries} attempts due to download errors. "
+                    f"Last error: {type(e).__name__}: {e}"
+                ) from e
+
+            # Validate downloaded files to catch corruption early
+            is_valid = _validate_weights_after_download(
+                hf_folder, allow_patterns, model_name_or_path
+            )
+
+            if is_valid:
+                return hf_folder
+
+            # Validation failed, corrupted files were cleaned up
+            if attempt < max_retries - 1:
+                log_info_on_rank0(
+                    logger,
+                    f"Retrying download for {model_name_or_path} "
+                    f"(attempt {attempt + 2}/{max_retries})...",
+                )
+            else:
+                raise RuntimeError(
+                    f"Downloaded model files are still corrupted for "
+                    f"{model_name_or_path} after {max_retries} attempts. "
+                    "This may indicate a persistent issue with the model files "
+                    "on Hugging Face Hub or network problems."
+                )
+
+        # Should never reach here, but return hf_folder just in case
+        return hf_folder
+
+
+def ci_validate_and_clean_hf_cache(model_path: str) -> None:
+    """
+    Validate and clean corrupted safetensors files in HF cache before loading.
+
+    This function is needed because HFRunner (used in tests) calls transformers'
+    from_pretrained() directly, which bypasses SGLang's weight validation.
+    Corrupted cached files can cause cryptic errors like "EOF while parsing"
+    from safetensors.
+
+    Only runs in CI to avoid overhead for regular users.
+
+    Args:
+        model_path: Model identifier (e.g., "meta-llama/Llama-2-7b")
+    """
+    from sglang.utils import is_in_ci
+
+    if not is_in_ci():
+        return
+
+    # Skip for local paths
+    if os.path.isdir(model_path):
+        return
+
+    try:
+        import huggingface_hub.constants
+
+        # Find the HF cache directory for this model
+        cache_dir = huggingface_hub.constants.HF_HUB_CACHE
+        repo_folder = os.path.join(
+            cache_dir,
+            huggingface_hub.constants.REPO_ID_SEPARATOR.join(
+                ["models", *model_path.split("/")]
+            ),
+        )
+
+        if not os.path.isdir(repo_folder):
+            return
+
+        # Find snapshot directories
+        snapshots_dir = os.path.join(repo_folder, "snapshots")
+        if not os.path.isdir(snapshots_dir):
+            return
+
+        # Check each snapshot for corrupted files
+        corrupted_files = []
+        for snapshot_hash in os.listdir(snapshots_dir):
+            snapshot_dir = os.path.join(snapshots_dir, snapshot_hash)
+            if not os.path.isdir(snapshot_dir):
+                continue
+
+            # Find all safetensors files
+            safetensors_files = glob_module.glob(
+                os.path.join(snapshot_dir, "*.safetensors")
+            )
+
+            for sf_file in safetensors_files:
+                # Skip broken symlinks (os.path.exists returns False for them)
+                if not os.path.exists(sf_file):
+                    continue
+
+                if not _validate_safetensors_file(sf_file):
+                    corrupted_files.append(sf_file)
+
+            # Also find and validate PyTorch .bin files
+            bin_files = glob_module.glob(os.path.join(snapshot_dir, "*.bin"))
+
+            for bin_file in bin_files:
+                # Skip broken symlinks (os.path.exists returns False for them)
+                if not os.path.exists(bin_file):
+                    continue
+
+                if not _validate_pytorch_bin_file(bin_file):
+                    corrupted_files.append(bin_file)
+
+        if corrupted_files:
+            logger.warning(
+                "HFRunner: Found %d corrupted weight file(s) for %s. "
+                "Removing to force re-download.",
+                len(corrupted_files),
+                model_path,
+            )
+            _cleanup_corrupted_files_selective(model_path, corrupted_files)
+
+    except Exception as e:
+        # Don't fail if validation itself fails - let HF handle it
+        logger.debug("HF cache validation failed (non-fatal): %s", e)
diff --git a/sglang/python/sglang/srt/model_loader/loader.py b/sglang/python/sglang/srt/model_loader/loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..da5c315afb8423a58870cfb0dfb55e1891d01023
--- /dev/null
+++ b/sglang/python/sglang/srt/model_loader/loader.py
@@ -0,0 +1,2800 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.3.post1/vllm/model_executor/model_loader/loader.py
+
+from __future__ import annotations
+
+# ruff: noqa: SIM117
+import collections
+import dataclasses
+import fnmatch
+import gc
+import glob
+import json
+import logging
+import math
+import os
+import re
+import socket
+import threading
+import time
+from abc import ABC, abstractmethod
+from contextlib import contextmanager, suppress
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Dict,
+    Generator,
+    Iterable,
+    List,
+    Optional,
+    Tuple,
+    Union,
+    cast,
+)
+
+import huggingface_hub
+import numpy as np
+import torch
+
+from sglang.srt.model_loader.remote_instance_weight_loader_utils import (
+    RemoteInstanceWeightLoaderBackend,
+    get_remote_instance_transfer_engine_info_per_rank,
+    register_memory_region,
+)
+from sglang.srt.server_args import get_global_server_args
+
+# Try to import accelerate (optional dependency)
+try:
+    from accelerate import infer_auto_device_map, init_empty_weights
+    from accelerate.utils import get_max_memory
+
+    HAS_ACCELERATE = True
+except ImportError:
+    HAS_ACCELERATE = False
+    infer_auto_device_map = None
+    init_empty_weights = None
+    get_max_memory = None
+
+from huggingface_hub import HfApi, hf_hub_download
+from torch import nn
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+from transformers.utils import SAFE_WEIGHTS_INDEX_NAME
+
+from sglang.srt.configs.load_config import LoadConfig, LoadFormat
+from sglang.srt.connector import (
+    ConnectorType,
+    create_remote_connector,
+    get_connector_type,
+)
+from sglang.srt.connector.utils import parse_model_name
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    model_parallel_is_initialized,
+)
+from sglang.srt.layers.modelopt_utils import QUANT_CFG_CHOICES
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.model_loader.remote_instance_weight_loader_utils import (
+    trigger_transferring_weights_request,
+)
+from sglang.srt.model_loader.utils import (
+    get_model_architecture,
+    post_load_weights,
+    set_default_torch_dtype,
+)
+
+# Constants for memory management
+DEFAULT_GPU_MEMORY_FRACTION_FOR_CALIBRATION = (
+    0.8  # Reserve 20% GPU memory headroom for ModelOpt calibration
+)
+from sglang.srt.environ import envs
+from sglang.srt.model_loader.weight_utils import (
+    buffered_multi_thread_safetensors_weights_iterator,
+    download_safetensors_index_file_from_hf,
+    download_weights_from_hf,
+    fastsafetensors_weights_iterator,
+    filter_duplicate_safetensors_files,
+    filter_files_not_needed_for_inference,
+    get_gguf_extra_tensor_names,
+    get_quant_config,
+    gguf_quant_weights_iterator,
+    initialize_dummy_weights,
+    maybe_add_mtp_safetensors,
+    multi_thread_pt_weights_iterator,
+    np_cache_weights_iterator,
+    pt_weights_iterator,
+    safetensors_weights_iterator,
+    set_runai_streamer_env,
+)
+from sglang.srt.utils import (
+    get_bool_env_var,
+    get_device_capability,
+    is_npu,
+    is_pin_memory_available,
+    rank0_log,
+    set_weight_attrs,
+)
+
+if TYPE_CHECKING:
+    from sglang.srt.configs.device_config import DeviceConfig
+    from sglang.srt.configs.model_config import ModelConfig
+    from sglang.srt.layers.quantization.base_config import QuantizationConfig
+
+_is_npu = is_npu()
+# ModelOpt: QUANT_CFG_CHOICES is imported from modelopt_utils.py
+# which contains the complete mapping of quantization config choices
+
+logger = logging.getLogger(__name__)
+
+
+@contextmanager
+def device_loading_context(module: torch.nn.Module, target_device: torch.device):
+    if target_device.type == "cpu":
+        # If target is CPU, no need to move anything
+        yield module
+        return
+
+    original_infos: Dict[str, Dict] = {}
+
+    # Store original device states and move parameters to GPU if they're on CPU
+    for name, p in module.named_parameters():
+        if p.device.type == "cpu":
+            original_data = p.data
+            device_data = p.data.to(target_device)
+            original_infos[name] = dict(
+                device=p.device,
+                original_data=original_data,
+                device_data=device_data,
+            )
+            p.data = device_data
+        # Parameters already on target device are not touched
+
+    try:
+        yield module
+
+    finally:
+        # Restore parameters to their original devices, ignoring new parameters
+        pin_memory = is_pin_memory_available()
+        for name, p in module.named_parameters():
+            if name in original_infos:
+                original_info = original_infos[name]
+                device_data = original_info["device_data"]
+                original_data = original_info["original_data"]
+                original_device: torch.device = original_info["device"]
+
+                if (
+                    (device_data.device == p.data.device)
+                    and (device_data.data_ptr() == p.data.data_ptr())
+                    and (device_data.shape == p.data.shape)
+                    and (device_data.dtype == p.data.dtype)
+                ):
+                    original_data.copy_(p.data.to(original_data.device))
+                    p.data = original_data
+                elif original_device.type == "cpu":
+                    # `torch.empty_like` does not support `pin_memory` argument
+                    cpu_data = torch.empty_strided(
+                        size=p.data.size(),
+                        stride=p.data.stride(),
+                        dtype=p.data.dtype,
+                        layout=p.data.layout,
+                        device="cpu",
+                        pin_memory=pin_memory,
+                    )
+                    cpu_data.copy_(p.data)
+                    p.data = cpu_data
+                else:
+                    p.data = p.data.to(original_device)
+        # New parameters or parameters already on target device are untouched
+
+
+logger = logging.getLogger(__name__)
+
+
+def _get_quantization_config(
+    model_config: ModelConfig,
+    load_config: LoadConfig,
+) -> Optional[QuantizationConfig]:
+    """Get the quantization config."""
+    model_class, _ = get_model_architecture(model_config)
+    packed_modules_mapping = getattr(model_class, "packed_modules_mapping", {})
+    remap_prefix = getattr(model_class, "remap_prefix", None)
+    if _is_npu:
+        packed_modules_mapping.update(
+            {
+                "visual": {
+                    "qkv_proj": ["qkv"],
+                    "gate_up_proj": ["gate_proj", "up_proj"],
+                },
+                "vision_model": {
+                    "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+                    "proj": ["out_proj"],
+                },
+                "model": {
+                    "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+                    "gate_up_proj": ["gate_proj", "up_proj"],
+                    "fused_qkv_a_proj_with_mqa": [
+                        "q_a_proj",
+                        "kv_a_proj_with_mqa",
+                    ],
+                },
+            }
+        )
+
+    if model_config.quantization is not None:
+        quant_config = get_quant_config(
+            model_config, load_config, packed_modules_mapping, remap_prefix
+        )
+        # (yizhang2077) workaround for nvidia/Llama-4-Maverick-17B-128E-Eagle3
+        if quant_config is None:
+            return None
+        if not _is_npu:
+            major, minor = get_device_capability()
+
+            if major is not None and minor is not None:
+                assert 0 <= minor < 10
+                capability = major * 10 + minor
+                if capability < quant_config.get_min_capability():
+                    raise ValueError(
+                        f"The quantization method {model_config.quantization} "
+                        "is not supported for the current GPU. "
+                        f"Minimum capability: {quant_config.get_min_capability()}. "
+                        f"Current capability: {capability}."
+                    )
+        supported_dtypes = quant_config.get_supported_act_dtypes()
+        if model_config.dtype not in supported_dtypes:
+            raise ValueError(
+                f"{model_config.dtype} is not supported for quantization "
+                f"method {model_config.quantization}. Supported dtypes: "
+                f"{supported_dtypes}"
+            )
+        hf_to_sglang_mapper = getattr(model_class, "hf_to_sglang_mapper", None)
+        # pass mappings by reference to quant_config
+        if hf_to_sglang_mapper is not None and quant_config is not None:
+            quant_config.apply_weight_name_mapper(hf_to_sglang_mapper)
+        return quant_config
+    return None
+
+
+def _initialize_model(
+    model_config: ModelConfig,
+    load_config: LoadConfig,
+    quant_config: Optional[QuantizationConfig] = None,
+) -> nn.Module:
+    """Initialize a model with the given configurations."""
+    model_class, _ = get_model_architecture(model_config)
+    kwargs = {
+        "config": model_config.hf_config,
+        "quant_config": quant_config,
+    }
+
+    # Only add sparse head kwargs if envs.SGLANG_EMBEDDINGS_SPARSE_HEAD.is_set()
+    if envs.SGLANG_EMBEDDINGS_SPARSE_HEAD.is_set():
+        kwargs["sparse_head"] = envs.SGLANG_EMBEDDINGS_SPARSE_HEAD.get()
+        kwargs["model_path"] = model_config.model_path
+
+    if load_config.draft_model_idx is not None:
+        kwargs["draft_model_idx"] = load_config.draft_model_idx
+
+    return model_class(**kwargs)
+
+
+class BaseModelLoader(ABC):
+    """Base class for model loaders."""
+
+    def __init__(self, load_config: LoadConfig):
+        self.load_config = load_config
+
+    @abstractmethod
+    def download_model(self, model_config: ModelConfig) -> None:
+        """Download a model so that it can be immediately loaded."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def load_model(
+        self,
+        *,
+        model_config: ModelConfig,
+        device_config: DeviceConfig,
+    ) -> nn.Module:
+        """Load a model with the given configurations."""
+        raise NotImplementedError
+
+
+class DefaultModelLoader(BaseModelLoader):
+    """Model loader that can load different file types from disk."""
+
+    # default number of thread when enable multithread weight loading
+    DEFAULT_NUM_THREADS = 8
+
+    _MTP_PATTERN = re.compile(r"model\.mtp\.layers\.(\d+)\.")
+
+    @dataclasses.dataclass
+    class Source:
+        """A source for weights."""
+
+        model_or_path: str
+        """The model ID or path."""
+
+        revision: Optional[str]
+        """The optional model revision."""
+
+        prefix: str = ""
+        """A prefix to prepend to all weights."""
+
+        fall_back_to_pt: bool = True
+        """Whether .pt weights can be used."""
+
+        model_config: Optional["ModelConfig"] = None
+        """The model configuration (for checking architecture, etc)."""
+
+        @classmethod
+        def init_new(cls, model_config: ModelConfig, model):
+            return cls(
+                model_config.model_path,
+                model_config.revision,
+                prefix="",
+                fall_back_to_pt=getattr(model, "fall_back_to_pt_during_load", True),
+                model_config=model_config,
+            )
+
+    counter_before_loading_weights: float = 0.0
+    counter_after_loading_weights: float = 0.0
+
+    def __init__(self, load_config: LoadConfig):
+        super().__init__(load_config)
+        extra_config = load_config.model_loader_extra_config
+        allowed_keys = {"enable_multithread_load", "num_threads"}
+        unexpected_keys = set(extra_config.keys()) - allowed_keys
+
+        if unexpected_keys:
+            raise ValueError(
+                f"Unexpected extra config keys for load format "
+                f"{load_config.load_format}: "
+                f"{unexpected_keys}"
+            )
+
+    def _maybe_download_from_modelscope(
+        self, model: str, revision: Optional[str]
+    ) -> str:
+        """Download model from ModelScope hub if SGLANG_USE_MODELSCOPE is True.
+
+        Returns the path to the downloaded model, or the original model path if
+        not downloaded from ModelScope."""
+        if get_bool_env_var("SGLANG_USE_MODELSCOPE"):
+            # download model from ModelScope hub,
+            # lazy import so that modelscope is not required for normal use.
+            # pylint: disable=C.
+            from modelscope.hub.snapshot_download import snapshot_download
+
+            if not os.path.exists(model):
+                model_path = snapshot_download(
+                    model_id=model,
+                    cache_dir=self.load_config.download_dir,
+                    local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+                    revision=revision,
+                    ignore_file_pattern=self.load_config.ignore_patterns,
+                )
+            else:
+                model_path = model
+            return model_path
+        return model
+
+    def _prepare_weights(
+        self, model_name_or_path: str, revision: Optional[str], fall_back_to_pt: bool
+    ) -> Tuple[str, List[str], bool]:
+        """Prepare weights for the model.
+
+        If the model is not local, it will be downloaded."""
+        model_name_or_path = self._maybe_download_from_modelscope(
+            model_name_or_path, revision
+        )
+
+        is_local = os.path.isdir(model_name_or_path)
+        load_format = self.load_config.load_format
+        use_safetensors = False
+        index_file = SAFE_WEIGHTS_INDEX_NAME
+        # Some quantized models use .pt files for storing the weights.
+        if load_format == LoadFormat.AUTO:
+            allow_patterns = ["*.safetensors", "*.bin"]
+        elif (
+            load_format == LoadFormat.SAFETENSORS
+            or load_format == LoadFormat.FASTSAFETENSORS
+        ):
+            use_safetensors = True
+            allow_patterns = ["*.safetensors"]
+        elif load_format == LoadFormat.MISTRAL:
+            use_safetensors = True
+            allow_patterns = ["consolidated*.safetensors"]
+            index_file = "consolidated.safetensors.index.json"
+        elif load_format == LoadFormat.PT:
+            allow_patterns = ["*.pt"]
+        elif load_format == LoadFormat.NPCACHE:
+            allow_patterns = ["*.bin"]
+        elif load_format == LoadFormat.DUMMY:
+            raise ValueError(
+                f"DUMMY load_format should use DummyModelLoader and not call _prepare_weights"
+            )
+        else:
+            raise ValueError(f"Unknown load_format: {load_format}")
+
+        if fall_back_to_pt:
+            allow_patterns += ["*.pt"]
+
+        if not is_local:
+            hf_folder = download_weights_from_hf(
+                model_name_or_path,
+                self.load_config.download_dir,
+                allow_patterns,
+                revision,
+                ignore_patterns=self.load_config.ignore_patterns,
+            )
+        else:
+            hf_folder = model_name_or_path
+
+        server_args = get_global_server_args()
+        if server_args and server_args.model_checksum is not None:
+            from sglang.srt.utils.model_file_verifier import verify
+
+            checksums_source = server_args.model_checksum or model_name_or_path
+            verify(model_path=hf_folder, checksums_source=checksums_source)
+
+        hf_weights_files: List[str] = []
+        for pattern in allow_patterns:
+            hf_weights_files += glob.glob(os.path.join(hf_folder, pattern))
+            if len(hf_weights_files) > 0:
+                if pattern == "*.safetensors":
+                    use_safetensors = True
+                break
+
+        if use_safetensors:
+            # For models like Mistral-7B-Instruct-v0.3
+            # there are both sharded safetensors files and a consolidated
+            # safetensors file. Using both breaks.
+            # Here, we download the `model.safetensors.index.json` and filter
+            # any files not found in the index.
+            if not is_local:
+                download_safetensors_index_file_from_hf(
+                    model_name_or_path,
+                    index_file,
+                    self.load_config.download_dir,
+                    revision,
+                )
+            hf_weights_files = filter_duplicate_safetensors_files(
+                hf_weights_files, hf_folder, index_file
+            )
+        else:
+            hf_weights_files = filter_files_not_needed_for_inference(hf_weights_files)
+
+        if len(hf_weights_files) == 0:
+            raise RuntimeError(
+                f"Cannot find any model weights with `{model_name_or_path}`"
+            )
+
+        return hf_folder, hf_weights_files, use_safetensors
+
+    def _get_weights_iterator(
+        self, source: "Source"
+    ) -> Generator[Tuple[str, torch.Tensor], None, None]:
+        """Get an iterator for the model weights based on the load format."""
+        extra_config = self.load_config.model_loader_extra_config
+        use_multithread = extra_config.get("enable_multithread_load", False)
+        hf_folder, hf_weights_files, use_safetensors = self._prepare_weights(
+            source.model_or_path, source.revision, source.fall_back_to_pt
+        )
+
+        if use_safetensors and source.model_config is not None:
+            hf_weights_files = maybe_add_mtp_safetensors(
+                hf_weights_files,
+                hf_folder,
+                "model.safetensors.index.json",
+                source.model_config.hf_config,
+            )
+
+        if self.load_config.load_format == LoadFormat.NPCACHE:
+            # Currently np_cache only support *.bin checkpoints
+            assert use_safetensors is False
+            weights_iterator = np_cache_weights_iterator(
+                source.model_or_path,
+                self.load_config.download_dir,
+                hf_folder,
+                hf_weights_files,
+            )
+        elif use_safetensors:
+            weight_loader_disable_mmap = (
+                get_global_server_args().weight_loader_disable_mmap
+            )
+
+            if self.load_config.load_format == LoadFormat.FASTSAFETENSORS:
+                weights_iterator = fastsafetensors_weights_iterator(
+                    hf_weights_files,
+                )
+            elif use_multithread:
+                weights_iterator = buffered_multi_thread_safetensors_weights_iterator(
+                    hf_weights_files,
+                    max_workers=extra_config.get(
+                        "num_threads", self.DEFAULT_NUM_THREADS
+                    ),
+                    disable_mmap=weight_loader_disable_mmap,
+                )
+            else:
+                weights_iterator = safetensors_weights_iterator(
+                    hf_weights_files, disable_mmap=weight_loader_disable_mmap
+                )
+
+        else:
+            if use_multithread:
+                weights_iterator = multi_thread_pt_weights_iterator(
+                    hf_weights_files,
+                    max_workers=extra_config.get(
+                        "num_threads", self.DEFAULT_NUM_THREADS
+                    ),
+                )
+            else:
+                weights_iterator = pt_weights_iterator(hf_weights_files)
+
+        if self.load_config.draft_model_idx is not None:
+            return self._filter_mtp_weights(
+                weights_iterator, source.prefix, self.load_config.draft_model_idx
+            )
+
+        if self.counter_before_loading_weights == 0.0:
+            self.counter_before_loading_weights = time.perf_counter()
+        # Apply the prefix.
+        return ((source.prefix + name, tensor) for (name, tensor) in weights_iterator)
+
+    @classmethod
+    def _filter_mtp_weights(
+        cls, weights_iterator, prefix: str, draft_model_idx: int
+    ) -> Tuple[Tuple[str, torch.Tensor], ...]:
+        """Filter MTP (Multi-Token Prediction) weights to keep only the
+        specified draft model layer and remap it to layer 0."""
+        filtered_weights = []
+        for name, tensor in weights_iterator:
+            match = cls._MTP_PATTERN.match(name)
+            if match is not None:
+                idx = int(match.group(1))
+                if idx != draft_model_idx:
+                    continue
+                new_name = name.replace(match.group(), "model.mtp.layers.0.")
+            else:
+                new_name = name
+            filtered_weights.append((prefix + new_name, tensor))
+        return tuple(filtered_weights)
+
+    def _get_all_weights(
+        self,
+        model_config: ModelConfig,
+        model: nn.Module,
+    ) -> Generator[Tuple[str, torch.Tensor], None, None]:
+
+        primary_weights = DefaultModelLoader.Source.init_new(model_config, model)
+        yield from self._get_weights_iterator(primary_weights)
+
+        secondary_weights = cast(
+            Iterable[DefaultModelLoader.Source], getattr(model, "secondary_weights", ())
+        )
+        for source in secondary_weights:
+            yield from self._get_weights_iterator(source)
+
+    def download_model(self, model_config: ModelConfig) -> None:
+        self._prepare_weights(
+            model_config.model_path, model_config.revision, fall_back_to_pt=True
+        )
+
+    def _load_modelopt_base_model(self, model_config: ModelConfig) -> nn.Module:
+        """Load and prepare the base model for ModelOpt quantization.
+
+        This method handles the common model loading logic shared between
+        DefaultModelLoader (conditional) and ModelOptModelLoader (dedicated).
+        """
+        if not HAS_ACCELERATE:
+            raise ImportError(
+                "accelerate is required for ModelOpt quantization. "
+                "Please install it with: pip install accelerate"
+            )
+
+        hf_config = AutoConfig.from_pretrained(
+            model_config.model_path,
+            trust_remote_code=True,
+            local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+        )
+        with init_empty_weights():
+            torch_dtype = getattr(hf_config, "torch_dtype", torch.float16)
+            model = AutoModelForCausalLM.from_config(
+                hf_config, torch_dtype=torch_dtype, trust_remote_code=True
+            )
+        max_memory = get_max_memory()
+        inferred_device_map = infer_auto_device_map(model, max_memory=max_memory)
+
+        on_cpu = "cpu" in inferred_device_map.values()
+        model_kwargs = {"torch_dtype": "auto"}
+        device_map = "auto"
+
+        if on_cpu:
+            for device in max_memory.keys():
+                if isinstance(device, int):
+                    max_memory[device] *= DEFAULT_GPU_MEMORY_FRACTION_FOR_CALIBRATION
+
+            logger.warning(
+                "Model does not fit to the GPU mem. "
+                f"We apply the following memory limit for calibration: \n{max_memory}\n"
+                f"If you hit GPU OOM issue, please adjust the memory fraction "
+                f"(currently {DEFAULT_GPU_MEMORY_FRACTION_FOR_CALIBRATION}) or "
+                "reduce the calibration `batch_size` manually."
+            )
+            model_kwargs["max_memory"] = max_memory
+
+        model = AutoModelForCausalLM.from_pretrained(
+            model_config.model_path,
+            device_map=device_map,
+            **model_kwargs,
+            trust_remote_code=True,
+            local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+        )
+        # Handle both legacy modelopt_quant and unified quantization flags
+        if hasattr(model_config, "modelopt_quant") and model_config.modelopt_quant:
+            # Legacy approach
+            quant_choice_str = model_config.modelopt_quant
+            rank0_log(f"ModelOpt quantization requested (legacy): {quant_choice_str}")
+        else:
+            # Unified approach - extract quantization type
+            quant_choice_str = model_config._get_modelopt_quant_type()
+            rank0_log(
+                f"ModelOpt quantization requested (unified): {model_config.quantization} -> {quant_choice_str}"
+            )
+
+        if not isinstance(quant_choice_str, str):
+            raise TypeError(
+                f"Quantization type must be a string (e.g., 'fp8'), "
+                f"got {type(quant_choice_str)}"
+            )
+
+        return model
+
+    def load_model(
+        self,
+        *,
+        model_config: ModelConfig,
+        device_config: DeviceConfig,
+    ) -> nn.Module:
+
+        if hasattr(model_config, "modelopt_quant") and model_config.modelopt_quant:
+            # Load base model using shared method
+            model = self._load_modelopt_base_model(model_config)
+            # Note: DefaultModelLoader doesn't do additional quantization processing
+            # For full ModelOpt quantization, use ModelOptModelLoader
+            return model.eval()
+
+        target_device = torch.device(device_config.device)
+        quant_config = _get_quantization_config(model_config, self.load_config)
+        with set_default_torch_dtype(model_config.dtype):
+            with target_device:
+                model = _initialize_model(
+                    model_config,
+                    self.load_config,
+                    quant_config,
+                )
+
+            self.load_weights_and_postprocess(
+                model, self._get_all_weights(model_config, model), target_device
+            )
+
+        self.counter_after_loading_weights = time.perf_counter()
+        return model.eval()
+
+    @staticmethod
+    def load_weights_and_postprocess(model, weights, target_device):
+        model.load_weights(weights)
+
+        for _, module in model.named_modules():
+            quant_method = getattr(module, "quant_method", None)
+            if quant_method is not None:
+                # When quant methods need to process weights after loading
+                # (for repacking, quantizing, etc), they expect parameters
+                # to be on the global target device. This scope is for the
+                # case where cpu offloading is used, where we will move the
+                # parameters onto device for processing and back off after.
+                with device_loading_context(module, target_device):
+                    quant_method.process_weights_after_loading(module)
+                if _is_npu:
+                    torch.npu.empty_cache()
+
+
+class LayeredModelLoader(DefaultModelLoader):
+    """Model loader that loads weights layer by layer so that one can quantize a
+    layer before loading another to make the peak memory envelope smaller."""
+
+    def __init__(self, load_config: LoadConfig):
+        # Back to the default load format
+        load_config.load_format = LoadFormat.AUTO
+        super().__init__(load_config)
+
+    def load_model(
+        self,
+        *,
+        model_config: ModelConfig,
+        device_config: DeviceConfig,
+    ) -> nn.Module:
+        from sglang.srt.layers.torchao_utils import apply_torchao_config_to_model
+        from sglang.srt.server_args import get_global_server_args
+
+        torchao_config = get_global_server_args().torchao_config
+        target_device = torch.device(device_config.device)
+        quant_config = _get_quantization_config(model_config, self.load_config)
+
+        with set_default_torch_dtype(model_config.dtype):
+            # Create model on meta device
+            with torch.device("meta"):
+                model = _initialize_model(
+                    model_config,
+                    self.load_config,
+                    quant_config,
+                )
+
+            # Check model's layered load support
+            if not hasattr(model, "load_weights_to_module"):
+                raise ValueError(
+                    "LayeredModelLoader requires the model to have a "
+                    "`load_weights_to_module` method. "
+                    f"{model_config.model_path} does not support it."
+                )
+
+            # Get all weights from disk
+            weights = self._get_all_weights(model_config, model)
+
+            # Helper function to recursively fill the weights of a module
+            def fill_module(module, fqn: List[str], weights):
+                """
+                fqn: list of strings representing the fully qualified name of `module`.
+                """
+                # Layer by layer
+                for name, submod in module.named_children():
+                    fill_module(submod, fqn + [name], weights)
+
+                # First materialize on target device
+                module.to_empty(device=target_device, recurse=False)
+                fqn_path = ".".join(fqn)
+                # Fill weights
+                model.load_weights_to_module(
+                    fqn_path,
+                    weights,
+                )
+                # Quantize weights if applicable
+                if torchao_config and "proj" in fqn_path:
+                    # Note: `None` here is needed to indicate no filter, see
+                    # `apply_torchao_config_to_model` for details.
+                    apply_torchao_config_to_model(module, torchao_config, None)
+
+            # Start calling on root module
+            fill_module(model, [], weights)
+
+        if torchao_config:
+            model.torchao_applied = True
+
+        return model.eval()
+
+
+class QuantizedRLModelLoader(DefaultModelLoader):
+    """
+    Model loader for RL training with FP8 quantization (profile-free, native SGLang).
+
+    Workflow:
+      1. Initial load: Load base model → Record state → Apply FP8 quantization
+      2. Training Actor in full precision
+      3. Reload: Trainer sends full precision weights → Quantize to FP8 → Copy to original memory
+      4. Use torch.as_strided to preserve memory locations across reloads
+
+    Usage:
+      --model-path Qwen/Qwen2.5-7B --quantization fp8 --load-format flash_rl
+    """
+
+    # Parameter attributes to record for weight reloading
+    RECORDED_LOADER_KEYS = [
+        "weight_loader",
+        "load_qkv_weight",
+        "load_column_parallel_weight",
+        "load_row_parallel_weight",
+        "load_merged_column_weight",
+        "output_dim",
+        "input_dim",
+        "_assert_and_load",
+    ]
+
+    # Parameters to skip during FP8 quantization (matches FlashRL's exclude_list)
+    SKIP_QUANTIZATION_PARAMS = [
+        "weight_scale",
+        "input_scale",
+        "output_scale",
+        ".bias",
+        "lm_head.weight",
+        "model.norm.weight",
+        "embed_tokens",  # BF16 params
+        "rotary_emb.inv_freq",
+        "rotary_emb.cos_cached",
+        "rotary_emb.sin_cached",
+        "projector",
+        "input_layernorm.weight",
+        "post_attention_layernorm.weight",  # LayerNorms
+    ]
+
+    # Stacked parameters (Qwen2): shards loaded separately, then combined
+    STACKED_PARAMS_MAPPING = [
+        ("qkv_proj", ["q_proj", "k_proj", "v_proj"]),
+        ("gate_up_proj", ["gate_proj", "up_proj"]),
+    ]
+    _QKV_SHARD_ALIASES = {
+        "q_proj": "q",
+        "k_proj": "k",
+        "v_proj": "v",
+    }
+
+    def __init__(self, load_config: LoadConfig):
+        super().__init__(load_config)
+        logger.info("[QuantizedRL] Profile-free FP8 quantization enabled")
+        self._initial_load_complete = False
+
+    def _prepare_weights(
+        self, model_name_or_path: str, revision: Optional[str], fall_back_to_pt: bool
+    ):
+        """Standard weight preparation using base model path."""
+        logger.info(f"[QuantizedRL] Loading from base model: {model_name_or_path}")
+        temp_config = LoadConfig(load_format=LoadFormat.AUTO)
+        temp_loader = DefaultModelLoader(temp_config)
+        return temp_loader._prepare_weights(
+            model_name_or_path, revision, fall_back_to_pt
+        )
+
+    @staticmethod
+    def _bind_method_to_cls(func, obj):
+        """Bind function to object instance (for weight_loader methods)."""
+        import types
+
+        if hasattr(func, "__self__") or not callable(func):
+            return func
+        return types.MethodType(func, obj)
+
+    def load_weights_and_postprocess(self, model, weights, target_device):
+        """
+        Initial load: Load BF16 → Record state → Apply FP8 quantization.
+        Called ONCE during model initialization.
+        """
+        logger.info("[QuantizedRL] Initial load with FP8 quantization")
+
+        original_load_weights = model.load_weights
+
+        def load_weights_proxy(weights):
+            if QuantizedRLModelLoader.is_reload_scenario(model):
+                logger.info("[QuantizedRL] Using fast path reload in load_weights")
+                QuantizedRLModelLoader.rebinding_and_load_weights(
+                    model, original_load_weights, weights
+                )
+            else:
+                original_load_weights(weights)
+
+        model.load_weights = load_weights_proxy
+
+        model.load_weights(weights)
+        original_weights = dict(model.named_parameters())
+
+        # Record pre-quantization state (shape/stride) for torch.as_strided reset
+
+        model.original_weights_rebuild_keys = {}
+        for name, p in original_weights.items():
+            model.original_weights_rebuild_keys[name] = {
+                "shape": p.shape,
+                "stride": p.stride(),
+                "dtype": p.dtype,
+                "nbytes": p.untyped_storage().nbytes(),
+            }
+
+        # Record parameter attributes (weight_loader, etc.) before quantization
+        recorded_loader = {
+            k: dict() for k in QuantizedRLModelLoader.RECORDED_LOADER_KEYS
+        }
+        for name, p in original_weights.items():
+            for key in QuantizedRLModelLoader.RECORDED_LOADER_KEYS:
+                if hasattr(p, key):
+                    attr = getattr(p, key)
+                    if not callable(attr):
+                        recorded_loader[key][name] = attr
+                    elif hasattr(attr, "__self__") and p is attr.__self__:
+                        recorded_loader[key][name] = attr.__func__  # Store unbound
+                    else:
+                        recorded_loader[key][name] = attr
+        model.recorded_loader = recorded_loader
+
+        # Apply FP8 quantization (creates new Parameters, loses attributes)
+        for _, module in model.named_modules():
+            quant_method = getattr(module, "quant_method", None)
+            if quant_method is not None:
+                with device_loading_context(module, target_device):
+                    quant_method.process_weights_after_loading(module)
+
+        model.flash_rl_initial_load_complete = True
+        self._initial_load_complete = True
+        logger.info("[QuantizedRL] Initial load complete")
+
+    @staticmethod
+    def is_reload_scenario(model):
+        """Check if model is ready for reloading (initial load completed)."""
+        return (
+            hasattr(model, "original_weights_rebuild_keys")
+            and hasattr(model, "recorded_loader")
+            and getattr(model, "flash_rl_initial_load_complete", False)
+        )
+
+    @staticmethod
+    def _is_stacked_param(name):
+        """Check if parameter is stacked (qkv_proj, gate_up_proj)."""
+        for stacked_name, _ in QuantizedRLModelLoader.STACKED_PARAMS_MAPPING:
+            if stacked_name in name:
+                return True
+        return False
+
+    @staticmethod
+    def _resolve_stacked_info(name: str) -> Tuple[str, Optional[str], Optional[Any]]:
+        for target, shard_names in QuantizedRLModelLoader.STACKED_PARAMS_MAPPING:
+            for idx, shard in enumerate(shard_names):
+                if shard in name:
+                    shard_id = (
+                        QuantizedRLModelLoader._QKV_SHARD_ALIASES.get(shard, shard)
+                        if target == "qkv_proj"
+                        else idx
+                    )
+                    return name.replace(shard, target), target, shard_id
+        return name, None, None
+
+    @staticmethod
+    def _store_quantized_scale(
+        scale_store: Dict[str, Union[torch.Tensor, Dict[Any, torch.Tensor]]],
+        name: str,
+        scale: torch.Tensor,
+    ) -> None:
+        param_name, stacked_key, shard_id = (
+            QuantizedRLModelLoader._resolve_stacked_info(name)
+        )
+        if stacked_key is None:
+            scale_store[param_name] = scale
+        else:
+            shard_dict = scale_store.setdefault(param_name, {})
+            assert isinstance(shard_dict, dict)
+            shard_dict[shard_id] = scale
+
+    @staticmethod
+    def _apply_scale_update(
+        all_params: Dict[str, torch.nn.Parameter],
+        param_name: str,
+        scale_info: Union[torch.Tensor, Dict[Any, torch.Tensor], None],
+    ) -> None:
+        if scale_info is None:
+            return
+        # Get tp rank and size
+        tp_rank = get_tensor_model_parallel_rank()
+        tp_size = get_tensor_model_parallel_world_size()
+
+        def _get_tp_sharded_scale(full_scale_tensor):
+            """Get tp sharded scale from full scale tensor"""
+            if tp_size == 1:
+                return full_scale_tensor
+
+            full_dim = full_scale_tensor.shape[0]
+            shard_dim = full_dim // tp_size
+            start_idx = tp_rank * shard_dim
+            end_idx = start_idx + shard_dim
+            return full_scale_tensor[start_idx:end_idx]
+
+        if param_name.endswith(".weight"):
+            scale_param_name = f"{param_name[:-7]}.weight_scale"
+        else:
+            scale_param_name = f"{param_name}.weight_scale"
+
+        scale_param = all_params.get(scale_param_name)
+        if scale_param is None:
+            logger.warning(
+                "[QuantizedRL] Scale parameter not found: %s", scale_param_name
+            )
+            return
+        if isinstance(scale_info, torch.Tensor):
+            new_scale = scale_info.t().contiguous()
+            if scale_param.data.shape == new_scale.shape:
+                scale_param.data.copy_(new_scale)
+            else:
+                logger.warning(
+                    "[QuantizedRL] Scale shape mismatch for %s: expected %s, got %s",
+                    scale_param_name,
+                    scale_param.data.shape,
+                    new_scale.shape,
+                )
+        else:
+            stacked_key = next(
+                (
+                    target
+                    for target, _ in QuantizedRLModelLoader.STACKED_PARAMS_MAPPING
+                    if target in param_name
+                ),
+                None,
+            )
+            shard_names = next(
+                (
+                    names
+                    for target, names in QuantizedRLModelLoader.STACKED_PARAMS_MAPPING
+                    if target == stacked_key
+                ),
+                [],
+            )
+            rows_per_shard = scale_param.data.shape[-1] // max(len(shard_names), 1)
+            if rows_per_shard * len(shard_names) != scale_param.data.shape[-1]:
+                logger.warning(
+                    f"Scale param shape {scale_param.data.shape[-1]} not divisible by {len(shard_names)}"
+                )
+            offset = 0
+            for idx, shard in enumerate(shard_names):
+                shard_id = (
+                    QuantizedRLModelLoader._QKV_SHARD_ALIASES.get(shard, shard)
+                    if stacked_key == "qkv_proj"
+                    else idx
+                )
+                shard_scale = scale_info.get(shard_id)
+                shard_scale = _get_tp_sharded_scale(shard_scale)
+                if shard_scale is None:
+                    offset += rows_per_shard
+                    continue
+                shard_rows = shard_scale.shape[0]
+                start = offset
+                end = start + shard_rows
+                scale_param.data[..., start:end] = shard_scale.t().contiguous()
+                offset = end
+
+    @staticmethod
+    def rebinding_and_load_weights(model, first_time_load_weights, weights):
+        """
+        Reload: VERL sends BF16 → Quantize to FP8 → Copy to original memory.
+
+        Flow: Reset params → Restore attributes → Quantize in iterator → Load → Copy back
+        """
+        logger.info("[QuantizedRL] Reload: Updating weights with FP8 quantization")
+
+        weights_list = list(weights)
+        updated_param_names, is_last_update = (
+            QuantizedRLModelLoader._get_updated_params(weights_list, model)
+        )
+
+        # Save current FP8 parameter data pointers
+        existing_params = dict(model.named_parameters())
+        current_param_data = {}
+        for name in updated_param_names:
+            if name in existing_params:
+                current_param_data[name] = existing_params[name].data
+
+        # Reset to pre-quantization shape using torch.as_strided
+        # Keeps same storage, just changes view - critical for memory preservation
+        for name, rebuild_info in model.original_weights_rebuild_keys.items():
+            if name in updated_param_names and name in existing_params:
+                existing_params[name].data = torch.as_strided(
+                    # Note: avoid clone here
+                    existing_params[name].data.clone(),
+                    rebuild_info["shape"],
+                    rebuild_info["stride"],
+                )
+
+        # Restore weight loader attributes (only if missing)
+        for k, loader_dict in model.recorded_loader.items():
+            for param_name, loader in loader_dict.items():
+                if param_name in updated_param_names and param_name in existing_params:
+                    param = existing_params[param_name]
+                    if not hasattr(param, k):
+                        if callable(loader):
+                            if hasattr(loader, "__self__"):
+                                setattr(param, k, loader)
+                            else:
+                                setattr(
+                                    param,
+                                    k,
+                                    QuantizedRLModelLoader._bind_method_to_cls(
+                                        loader, param
+                                    ),
+                                )
+                        else:
+                            setattr(param, k, loader)
+
+        del existing_params
+
+        # Quantize BF16 weights to FP8 in iterator (before weight_loader)
+        # Store scales for later update
+        quantized_scales: Dict[str, Union[torch.Tensor, Dict[Any, torch.Tensor]]] = {}
+
+        def quantize_weights_iterator(weights_iter):
+            """Quantize individual shards before weight_loader stacks them."""
+            from sglang.srt.layers.quantization.fp8_kernel import (
+                per_token_group_quant_fp8,
+            )
+
+            for name, weight in weights_iter:
+                if any(
+                    skip in name
+                    for skip in QuantizedRLModelLoader.SKIP_QUANTIZATION_PARAMS
+                ):
+                    logger.info(f"[QuantizedRL] Skip: {name} ({weight.dtype})")
+                    yield (name, weight)
+                elif weight.dtype in [torch.bfloat16, torch.float32, torch.float16]:
+                    qweight, scale = per_token_group_quant_fp8(weight, weight.shape[-1])
+                    logger.info(f"[QuantizedRL] Quantize: {name} {weight.dtype}→FP8")
+                    QuantizedRLModelLoader._store_quantized_scale(
+                        quantized_scales, name, scale
+                    )
+                    yield (name, qweight)
+                else:
+                    logger.info(f"[QuantizedRL] Keep: {name} ({weight.dtype})")
+                    yield (name, weight)
+
+        # Load quantized weights (weight_loader stacks FP8 shards)
+        first_time_load_weights(quantize_weights_iterator(iter(weights_list)))
+
+        # Copy back to original FP8 memory locations and update scales
+        all_params = dict(model.named_parameters())
+
+        for name in updated_param_names:
+            if name not in all_params or name not in current_param_data:
+                continue
+            if any(
+                skip in name for skip in QuantizedRLModelLoader.SKIP_QUANTIZATION_PARAMS
+            ):
+                continue
+
+            new_param = all_params[name]
+            old_fp8_data = current_param_data[name]
+
+            # Handle embeddings/lm_head (BF16) and quantized weights (FP8)
+            if "embed_tokens" in name or "lm_head" in name:
+                old_fp8_data.copy_(new_param.data)
+                new_param.data = old_fp8_data
+            elif (
+                new_param.dtype == torch.float8_e4m3fn
+                and old_fp8_data.dtype == torch.float8_e4m3fn
+            ):
+                # FP8: Use strided view for transposed storage
+                strided_data = torch.as_strided(
+                    new_param.data, old_fp8_data.shape, old_fp8_data.stride()
+                )
+                old_fp8_data.copy_(strided_data)
+                new_param.data = old_fp8_data
+                QuantizedRLModelLoader._apply_scale_update(
+                    all_params,
+                    name,
+                    quantized_scales.get(name),
+                )
+            elif new_param.dtype == old_fp8_data.dtype:
+                # Same dtype (LayerNorm, etc.): Direct copy
+                old_fp8_data.copy_(new_param.data)
+                new_param.data = old_fp8_data
+            else:
+                raise RuntimeError(
+                    f"Unexpected dtype mismatch for {name}: "
+                    f"new={new_param.dtype}, old={old_fp8_data.dtype}"
+                )
+
+        # Cleanup
+        del current_param_data
+        if is_last_update:
+            gc.collect()
+            torch.cuda.empty_cache()
+
+        logger.info("[QuantizedRL] Reload complete")
+        return updated_param_names, is_last_update
+
+    @staticmethod
+    def _get_updated_params(weights_list, model):
+        """Identify which parameters need updating from incoming weights."""
+        stacked_params_mapping = [
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        params_dict = dict(model.named_parameters())
+        updated_params = set()
+        is_last_update = False
+
+        for name, _ in weights_list:
+            if name == "lm_head.weight":
+                is_last_update = True
+
+            if any(
+                skip in name for skip in QuantizedRLModelLoader.SKIP_QUANTIZATION_PARAMS
+            ):
+                continue
+
+            from sglang.srt.layers.utils import get_layer_id
+
+            # Skip params outside layer range (for pipeline parallelism)
+            layer_id = get_layer_id(name)
+            if (
+                layer_id is not None
+                and hasattr(model, "start_layer")
+                and (layer_id < model.start_layer or layer_id >= model.end_layer)
+            ):
+                continue
+
+            # Skip tied embeddings and vision tower params
+            if (
+                hasattr(model, "config")
+                and model.config.tie_word_embeddings
+                and "lm_head.weight" in name
+            ):
+                continue
+            if name.startswith("model.vision_tower") and name not in params_dict:
+                continue
+
+            # Map stacked param shards (q/k/v_proj → qkv_proj)
+            mapped = False
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name in name:
+                    name = name.replace(weight_name, param_name)
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    updated_params.add(name)
+                    mapped = True
+                    break
+
+            if not mapped:
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name in params_dict:
+                    updated_params.add(name)
+
+        return list(updated_params), is_last_update
+
+
+class DummyModelLoader(BaseModelLoader):
+    """Model loader that will set model weights to random values."""
+
+    def __init__(self, load_config: LoadConfig):
+        super().__init__(load_config)
+        if load_config.model_loader_extra_config:
+            raise ValueError(
+                f"Model loader extra config is not supported for "
+                f"load format {load_config.load_format}"
+            )
+
+    def download_model(self, model_config: ModelConfig) -> None:
+        pass  # Nothing to download
+
+    def load_model(
+        self,
+        *,
+        model_config: ModelConfig,
+        device_config: DeviceConfig,
+    ) -> nn.Module:
+
+        if get_bool_env_var("SGL_CPU_QUANTIZATION"):
+            return load_model_with_cpu_quantization(
+                self, model_config=model_config, device_config=device_config
+            )
+
+        quant_config = _get_quantization_config(model_config, self.load_config)
+
+        with set_default_torch_dtype(model_config.dtype):
+            with torch.device(device_config.device):
+                model = _initialize_model(
+                    model_config,
+                    self.load_config,
+                    quant_config,
+                )
+
+            for _, module in model.named_modules():
+                quant_method = getattr(module, "quant_method", None)
+                if quant_method is not None:
+                    # Skip FusedMoE layers already quantized during init (FP8 or FP4)
+                    if (
+                        hasattr(module, "is_weights_quantized")
+                        and module.is_weights_quantized()
+                    ):
+                        continue
+                    quant_method.process_weights_after_loading(module)
+
+            # NOTE(woosuk): For accurate performance evaluation, we assign
+            # random values to the weights.
+            initialize_dummy_weights(model)
+
+            post_load_weights(model, model_config)
+
+        return model.eval()
+
+
+class ShardedStateLoader(BaseModelLoader):
+    """
+    Model loader that directly loads each worker's model state dict, which
+    enables a fast load path for large tensor-parallel models where each worker
+    only needs to read its own shard rather than the entire checkpoint. See
+    `examples/runtime/engine/save_sharded_state.py` for creating a sharded checkpoint.
+    """
+
+    DEFAULT_PATTERN = "model-rank-{rank}-part-{part}.safetensors"
+
+    def __init__(self, load_config: LoadConfig):
+        super().__init__(load_config)
+        extra_config = (
+            {}
+            if load_config.model_loader_extra_config is None
+            else load_config.model_loader_extra_config.copy()
+        )
+        self.pattern = extra_config.pop("pattern", self.DEFAULT_PATTERN)
+        if extra_config:
+            raise ValueError(
+                f"Unexpected extra config keys for load format "
+                f"{load_config.load_format}: "
+                f"{load_config.model_loader_extra_config.keys()}"
+            )
+
+    @staticmethod
+    def _filter_subtensors(tensors: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+        """
+        Filter out all tensors that share the same memory or a subset of the
+        memory of another tensor.
+        """
+        same_storage_groups: Dict[Any, List[Tuple[str, torch.Tensor]]] = (
+            collections.defaultdict(list)
+        )
+        for key, tensor in tensors.items():
+            if tensor.numel():
+                ptr = tensor.untyped_storage().data_ptr()
+                same_storage_groups[tensor.device, ptr].append((key, tensor))
+
+        def get_end_ptr(tensor: torch.Tensor) -> int:
+            return tensor.view(-1)[-1].data_ptr() + tensor.element_size()
+
+        result: Dict[str, torch.Tensor] = {}
+        for group in same_storage_groups.values():
+            for k, t in group:
+                a, b = t.data_ptr(), get_end_ptr(t)
+                for k2, t2 in group:
+                    if not t2.is_contiguous():
+                        continue
+                    a2, b2 = t2.data_ptr(), get_end_ptr(t2)
+                    if a < a2 or b2 < b:
+                        continue
+                    if a2 < a or b < b2 or not t.is_contiguous():
+                        break  # t2 covers strictly more memory than t.
+                    if k2 < k:
+                        # Same tensors, keep the one with the smaller key.
+                        break
+                else:
+                    result[k] = t
+        return result
+
+    def _prepare_weights(self, model_name_or_path: str, revision: Optional[str]):
+        if os.path.isdir(model_name_or_path):
+            return model_name_or_path
+        else:
+            allow_patterns = ["*.safetensors"]
+            return download_weights_from_hf(
+                model_name_or_path,
+                self.load_config.download_dir,
+                allow_patterns,
+                revision,
+                ignore_patterns=self.load_config.ignore_patterns,
+            )
+
+    def download_model(self, model_config: ModelConfig) -> None:
+        self._prepare_weights(model_config.model_path, model_config.revision)
+
+    def load_model(
+        self,
+        *,
+        model_config: ModelConfig,
+        device_config: DeviceConfig,
+    ) -> nn.Module:
+        from safetensors.torch import safe_open
+
+        from sglang.srt.distributed import get_tensor_model_parallel_rank
+
+        local_model_path = self._prepare_weights(
+            model_config.model_path, model_config.revision
+        )
+
+        quant_config = _get_quantization_config(model_config, self.load_config)
+
+        with set_default_torch_dtype(model_config.dtype):
+            with torch.device(device_config.device):
+                model = _initialize_model(model_config, self.load_config, quant_config)
+                for _, module in model.named_modules():
+                    quant_method = getattr(module, "quant_method", None)
+                    if quant_method is not None:
+                        quant_method.process_weights_after_loading(module)
+            rank = get_tensor_model_parallel_rank()
+            pattern = os.path.join(
+                local_model_path,
+                self.pattern.format(rank=rank, part="*"),
+            )
+            filepaths = glob.glob(pattern)
+            if not filepaths:
+                # TODO: support un-sharded checkpoints too
+                raise ValueError(
+                    f"Could not find checkpoint files '{pattern}', only "
+                    f"pre-sharded checkpoints are currently supported!"
+                )
+            state_dict = self._filter_subtensors(model.state_dict())
+            for path in filepaths:
+                with safe_open(path, framework="pt") as f:
+                    for key in f.keys():  # noqa: SIM118
+                        tensor = f.get_tensor(key)
+                        # If loading with LoRA enabled, additional padding may
+                        # be added to certain parameters. We only load into a
+                        # narrowed view of the parameter data.
+                        param_data = state_dict[key].data
+                        param_shape = state_dict[key].shape
+                        for dim, size in enumerate(tensor.shape):
+                            if size < param_shape[dim]:
+                                param_data = param_data.narrow(dim, 0, size)
+                        if tensor.shape != param_shape:
+                            logger.warning(
+                                "loading tensor of shape %s into "
+                                "parameter '%s' of shape %s",
+                                tensor.shape,
+                                key,
+                                param_shape,
+                            )
+                        param_data.copy_(tensor)
+                        state_dict.pop(key)
+            if state_dict:
+                raise ValueError(f"Missing keys {tuple(state_dict)} in loaded state!")
+
+            post_load_weights(model, model_config)
+
+        return model.eval()
+
+    @staticmethod
+    def save_model(
+        model: torch.nn.Module,
+        path: str,
+        pattern: Optional[str] = None,
+        max_size: Optional[int] = None,
+    ) -> None:
+        from safetensors.torch import save_file
+
+        from sglang.srt.distributed import get_tensor_model_parallel_rank
+
+        if pattern is None:
+            pattern = ShardedStateLoader.DEFAULT_PATTERN
+        rank = get_tensor_model_parallel_rank()
+        part_idx = 0
+        total_size = 0
+        state_dict = ShardedStateLoader._filter_subtensors(model.state_dict())
+        state_dict_part: Dict[str, torch.Tensor] = {}
+        for key, tensor in state_dict.items():
+            param_size = tensor.nelement() * tensor.element_size()
+            if max_size is not None and total_size + param_size > max_size:
+                filename = pattern.format(rank=rank, part=part_idx)
+                save_file(
+                    state_dict_part,
+                    os.path.join(path, filename),
+                )
+                part_idx += 1
+                total_size = 0
+                state_dict_part = {}
+            state_dict_part[key] = tensor
+            total_size += param_size
+        if len(state_dict_part) > 0:
+            filename = pattern.format(rank=rank, part=part_idx)
+            save_file(
+                state_dict_part,
+                os.path.join(path, filename),
+            )
+
+
+class BitsAndBytesModelLoader(BaseModelLoader):
+    """Model loader to load model weights with BitAndBytes quantization."""
+
+    possible_config_file_names = ["adapter_config.json"]
+
+    default_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+        ".fc1.",
+        ".fc2.",
+        ".dense.",
+        ".query_key_value.",
+        ".qkv_proj.",
+        ".dense_h_to_4h.",
+        ".dense_4h_to_h.",
+        ".out_proj.",
+    ]
+
+    def __init__(self, load_config: LoadConfig):
+        super().__init__(load_config)
+
+        # we don't need to quantize the whole model, only the target modules
+        # that are specified in the adapter config file. If the adapter config
+        # file is not provided, we will quantize the default modules.
+        if (
+            not load_config.model_loader_extra_config
+            or "qlora_adapter_name_or_path" not in load_config.model_loader_extra_config
+        ):
+            self.target_modules = []
+            return
+
+        qlora_adapter = load_config.model_loader_extra_config[
+            "qlora_adapter_name_or_path"
+        ]
+
+        config_file_path = self._get_config_file(qlora_adapter)
+
+        with open(config_file_path, "r") as f:
+            config = json.load(f)
+            self.target_modules = config["target_modules"]
+
+    def _get_config_file(self, qlora_adapter: str) -> str:
+        is_local = os.path.isdir(qlora_adapter)
+        config_file_path = None
+        if is_local:
+            for file in self.possible_config_file_names:
+                config_file_path = os.path.join(qlora_adapter, file)
+                if os.path.exists(config_file_path):
+                    break
+        else:
+            hf_api = HfApi()
+            repo_files = hf_api.list_repo_files(repo_id=qlora_adapter)
+            for file in self.possible_config_file_names:
+                if file in repo_files:
+                    config_file_path = hf_hub_download(
+                        repo_id=qlora_adapter, filename=file
+                    )
+                    break
+
+        if not config_file_path:
+            raise ValueError(f"Cannot find adapter config file in {qlora_adapter}")
+
+        return config_file_path
+
+    def _get_weight_files(
+        self,
+        model_name_or_path: str,
+        allowed_patterns: List[str],
+        revision: Optional[str] = None,
+    ) -> Tuple[List[str], str]:
+        """Retrieve weight files. Download the files if necessary.
+
+        Return the weight files and the file pattern."""
+        is_local = os.path.isdir(model_name_or_path)
+
+        if is_local:
+            for pattern in allowed_patterns:
+                weight_files = glob.glob(os.path.join(model_name_or_path, pattern))
+                if weight_files:
+                    return weight_files, pattern
+        else:
+            hf_api = HfApi()
+            repo_files = hf_api.list_repo_files(repo_id=model_name_or_path)
+            for pattern in allowed_patterns:
+                matching_files = fnmatch.filter(repo_files, pattern)
+                if matching_files:
+                    hf_folder = download_weights_from_hf(
+                        model_name_or_path,
+                        self.load_config.download_dir,
+                        [pattern],
+                        revision,
+                        ignore_patterns=self.load_config.ignore_patterns,
+                    )
+                    return glob.glob(os.path.join(hf_folder, pattern)), pattern
+
+        raise RuntimeError(f"No model weights found in: `{model_name_or_path}`")
+
+    def _prepare_weights(
+        self, model_name_or_path: str, revision: Optional[str]
+    ) -> Tuple[List[str], bool]:
+        """Prepare weight files for the model."""
+
+        allowed_patterns = ["*.safetensors", "*.bin", "*.pt"]
+
+        hf_weights_files, matched_pattern = self._get_weight_files(
+            model_name_or_path, allowed_patterns, revision
+        )
+
+        if matched_pattern != "*.safetensors":
+            hf_weights_files = filter_files_not_needed_for_inference(hf_weights_files)
+
+        if len(hf_weights_files) == 0:
+            raise RuntimeError(
+                f"Cannot find any model weights with `{model_name_or_path}`"
+            )
+
+        return hf_weights_files, matched_pattern == "*.safetensors"
+
+    def _hf_weight_iter(self, hf_weights_files, use_safetensors: bool):
+        if use_safetensors:
+            return safetensors_weights_iterator(hf_weights_files)
+        else:
+            return pt_weights_iterator(hf_weights_files)
+
+    def _get_quantized_weights_iterator(
+        self,
+        model_name_or_path: str,
+        revision: Optional[str],
+        pre_quant: bool,
+        load_8bit: bool,
+    ) -> Tuple[Generator[Tuple[str, torch.Tensor], None, None], Dict[str, Any]]:
+        """Get an iterator to the model weights with bitsandbytes quantization,
+        as well as the quantization state dictionary."""
+
+        # only load the bitsandbytes module when needed
+        try:
+            import bitsandbytes
+
+            if bitsandbytes.__version__ < "0.44.0":
+                raise ImportError(
+                    "bitsandbytes version is wrong. Please "
+                    "install bitsandbytes>=0.44.0."
+                )
+        except ImportError as err:
+            raise ImportError(
+                "Please install bitsandbytes>=0.44.0 via "
+                "`pip install bitsandbytes>=0.44.0` to use "
+                "bitsandbytes quantizer."
+            ) from err
+
+        hf_weights_files, use_safetensors = self._prepare_weights(
+            model_name_or_path, revision
+        )
+
+        quant_state_dict: Dict[str, Any] = {}
+
+        if pre_quant:
+            if load_8bit:
+                return (
+                    self._quantized_8bit_generator(
+                        hf_weights_files, use_safetensors, quant_state_dict
+                    ),
+                    quant_state_dict,
+                )
+            else:
+                return (
+                    self._quantized_4bit_generator(
+                        hf_weights_files, use_safetensors, quant_state_dict
+                    ),
+                    quant_state_dict,
+                )
+
+        return (
+            self._unquantized_generator(
+                hf_weights_files, use_safetensors, quant_state_dict
+            ),
+            quant_state_dict,
+        )
+
+    def _is_8bit_weight_name(self, weight_name: str):
+        quantized_suffix = {".scb", ".weight_format"}
+        return any(weight_name.lower().endswith(suffix) for suffix in quantized_suffix)
+
+    def _is_4bit_weight_name(self, weight_name: str):
+        quantized_suffix = {
+            "absmax",
+            "quant_map",
+            "nested_absmax",
+            "nested_quant_map",
+            "bitsandbytes",
+        }
+        suffix = weight_name.split(".")[-1]
+        return any(q_suffix in suffix for q_suffix in quantized_suffix)
+
+    def _quantized_8bit_generator(
+        self, hf_weights_files, use_safetensors, quant_state_dict
+    ) -> Generator:
+        for weight_name, weight_tensor in self._hf_weight_iter(
+            hf_weights_files, use_safetensors
+        ):
+            if not weight_name.lower().endswith(".scb"):
+                continue
+
+            weight_key = weight_name.lower().replace(".scb", ".weight")
+            quant_state_dict[weight_key] = weight_tensor
+
+        for weight_name, weight_tensor in self._hf_weight_iter(
+            hf_weights_files, use_safetensors
+        ):
+            if self._is_8bit_weight_name(weight_name):
+                continue
+
+            if weight_name in quant_state_dict:
+                set_weight_attrs(weight_tensor, {"load_in_8bit": True})
+                yield weight_name, weight_tensor
+            else:
+                yield weight_name, weight_tensor
+
+    def _quantized_4bit_generator(
+        self, hf_weights_files, use_safetensors, quant_state_dict
+    ) -> Generator:
+        from bitsandbytes.functional import QuantState
+
+        # First iterate over all quant state weights
+        weight_iterator = self._hf_weight_iter(hf_weights_files, use_safetensors)
+        temp_state_dict = {}
+        for weight_name, weight_tensor in weight_iterator:
+            if not self._is_4bit_weight_name(weight_name):
+                continue
+            # bitsandbytes library requires
+            # weight.quant_state.bitsandbytes__* in CPU
+            if "quant_state.bitsandbytes" in weight_name:
+                temp_state_dict[weight_name] = weight_tensor.cpu().data
+            else:
+                temp_state_dict[weight_name] = weight_tensor
+
+        # Closure to parse quant_state for each prequant weight
+        def _parse_quant_state(param_name: str, temp_state_dict: Dict) -> QuantState:
+            quant_state = {}
+            for k in temp_state_dict:
+                if param_name + "." in k:
+                    quant_state[k] = temp_state_dict[k]
+
+            return QuantState.from_dict(quant_state, device="cuda")
+
+        # Second iterate over all prequant and normal weights
+        # pre quantized weights would have a quant_state
+        for weight_name, weight_tensor in self._hf_weight_iter(
+            hf_weights_files, use_safetensors
+        ):
+
+            if self._is_4bit_weight_name(weight_name):
+                continue
+
+            if (f"{weight_name}.quant_state.bitsandbytes__nf4" in temp_state_dict) or (
+                f"{weight_name}.quant_state.bitsandbytes__fp4" in temp_state_dict
+            ):
+                quant_state = _parse_quant_state(weight_name, temp_state_dict)
+                quant_state_dict[weight_name] = quant_state
+                yield weight_name, weight_tensor
+            else:
+                yield weight_name, weight_tensor
+
+    def _unquantized_generator(
+        self, hf_weights_files, use_safetensors, quant_state_dict
+    ) -> Generator:
+        from bitsandbytes.functional import quantize_4bit
+
+        tp_size = get_tensor_model_parallel_world_size()
+        tp_rank = get_tensor_model_parallel_rank()
+
+        for weight_name, weight_tensor in self._hf_weight_iter(
+            hf_weights_files, use_safetensors
+        ):
+
+            if any(
+                target_module in weight_name for target_module in self.target_modules
+            ) and weight_name.endswith(".weight"):
+                weight_name = weight_name.replace(".weight", ".qweight")
+
+                if any(
+                    module in weight_name
+                    for module in self.column_parallel_weights_modules
+                ):
+
+                    total_size = weight_tensor.size(-1)
+                    start_index = total_size // tp_size * tp_rank
+                    end_index = total_size // tp_size * (tp_rank + 1)
+                    weight_sub_tensor = weight_tensor[..., start_index:end_index]
+
+                else:
+                    total_size = weight_tensor.size(0)
+                    start_index = total_size // tp_size * tp_rank
+                    end_index = total_size // tp_size * (tp_rank + 1)
+                    weight_sub_tensor = weight_tensor[start_index:end_index, ...]
+
+                # bitsandbytes requires data in GPU
+                if weight_sub_tensor.is_cuda:
+                    loaded_weight = weight_sub_tensor
+                else:
+                    loaded_weight = weight_sub_tensor.cuda()
+
+                # remove the following after the issue is fixed:
+                # https://github.com/bitsandbytes-foundation/bitsandbytes/issues/1342
+                if loaded_weight.is_contiguous() is False:
+                    loaded_weight = loaded_weight.contiguous()
+
+                with set_default_torch_dtype(torch.float32):
+                    processed_weight, quant_state = quantize_4bit(
+                        loaded_weight, compress_statistics=True, quant_type="nf4"
+                    )
+
+                quant_state_dict[weight_name] = quant_state
+            else:
+                processed_weight = weight_tensor
+
+            yield weight_name, processed_weight
+
+    def _load_weights(self, model_config: ModelConfig, model: nn.Module) -> None:
+        if not hasattr(model, "load_weights"):
+            raise AttributeError(
+                "The required method 'load_weights' is not defined in class"
+                f" {type(model).__name__}."
+            )
+
+        if not hasattr(model, "bitsandbytes_stacked_params_mapping"):
+            raise AttributeError(
+                f"Model {type(model).__name__} does not support BitsAndBytes "
+                "quantization yet."
+            )
+
+        if len(self.target_modules) == 0:
+            if hasattr(model, "default_bitsandbytes_target_modules"):
+                self.target_modules = model.default_bitsandbytes_target_modules
+            else:
+                self.target_modules = self.default_target_modules
+
+        if hasattr(model, "column_parallel_weights_modules"):
+            self.column_parallel_weights_modules = model.column_parallel_weights_modules
+        else:
+            self.column_parallel_weights_modules = []
+
+        self.model_type = type(model).__name__
+
+        logger.info(
+            "Loading weights with BitsAndBytes quantization. " " May take a while ..."
+        )
+
+        quant_config = getattr(model_config.hf_config, "quantization_config", None)
+
+        pre_quant = False
+        if quant_config is not None:
+            quant_method = quant_config.get("quant_method")
+            if quant_method == "bitsandbytes":
+                pre_quant = True
+            else:
+                raise ValueError(
+                    f"BitsAndBytes loader does not support {quant_method} "
+                    "quantization"
+                )
+
+        # The quant_states in pre_quantized models cannot work with a split
+        # weight tensor. So TP does not work with pre_quantized bnb models.
+        if pre_quant and get_tensor_model_parallel_world_size() > 1:
+            raise ValueError(
+                "Prequant BitsAndBytes models with TP is not supported."
+                "Please try with PP."
+            )
+
+        load_8bit = False
+        if pre_quant:
+            load_8bit = quant_config.get("load_in_8bit", False)
+
+        qweight_iterator, quant_state_dict = self._get_quantized_weights_iterator(
+            model_config.model_path, model_config.revision, pre_quant, load_8bit
+        )
+
+        model.load_weights(qweight_iterator)
+
+        torch.cuda.empty_cache()
+
+        param_dict = dict(model.named_parameters())
+        stacked_quant_state_dict: Dict[str, Dict[int, Any]] = {}
+        model_type = model_config.hf_config.model_type
+        for quant_param_name in quant_state_dict:
+            non_stacked_param_name = quant_param_name
+            if model_type == "mllama" and "vision_model" in quant_param_name:
+                # adapt to VisionAttention
+                quant_param_name = quant_param_name.replace(
+                    "self_attn.o_proj", "self_attn.proj"
+                )
+            shard_index = 0
+            for shard_name, (
+                weight_name,
+                index,
+            ) in model.bitsandbytes_stacked_params_mapping.items():
+                if (
+                    model_type in ["qwen2_vl", "qwen2_5_vl"]
+                    and "visual" in quant_param_name
+                ):
+                    break
+                if shard_name in quant_param_name:
+                    shard_index = index
+                    quant_param_name = quant_param_name.replace(shard_name, weight_name)
+                    break
+
+            if (
+                model_type in ["qwen2_vl", "qwen2_5_vl"]
+                and "visual" in quant_param_name
+            ):
+                quant_param_name = quant_param_name.replace(
+                    r"attn.qkv.", r"attn.qkv_proj."
+                )
+
+            if quant_param_name not in param_dict:
+                raise ValueError(
+                    f"Parameter {quant_param_name} not found in the model."
+                )
+
+            if quant_param_name not in stacked_quant_state_dict:
+                stacked_quant_state_dict[quant_param_name] = {}
+
+            stacked_quant_state_dict[quant_param_name][shard_index] = quant_state_dict[
+                non_stacked_param_name
+            ]
+
+        # save quant_states and offsets as the attributes of the parameters
+        for param_name, param in param_dict.items():
+            if param_name in stacked_quant_state_dict:
+                quant_states = stacked_quant_state_dict[param_name]
+                set_weight_attrs(param, {"bnb_quant_state": quant_states})
+
+                pack_ratio = getattr(param, "pack_factor", -1)
+                if pack_ratio == -1:
+                    raise ValueError(f"pack_factor not set for parameter {param_name}.")
+
+                num_elements = [0] * len(quant_states)
+                for seq, quant_state in quant_states.items():
+                    num_elements[seq] = math.prod(quant_state.shape) // pack_ratio
+
+                offsets = np.concatenate(([0], np.cumsum(num_elements)))
+                # Make torch infer_schema happy(Compatible with vLLM)
+                offsets = torch.tensor(offsets).cpu()
+                set_weight_attrs(param, {"bnb_shard_offsets": offsets})
+
+                if load_8bit:
+                    set_weight_attrs(
+                        param, {"matmul_state": [None] * len(quant_states)}
+                    )
+
+    def download_model(self, model_config: ModelConfig) -> None:
+        self._prepare_weights(model_config.model_path, model_config.revision)
+
+    def load_model(
+        self,
+        *,
+        model_config: ModelConfig,
+        device_config: DeviceConfig,
+    ) -> nn.Module:
+        quant_config = _get_quantization_config(model_config, self.load_config)
+        with set_default_torch_dtype(model_config.dtype):
+            with torch.device(device_config.device):
+                model = _initialize_model(
+                    model_config,
+                    self.load_config,
+                    quant_config,
+                )
+
+                self._load_weights(model_config, model)
+
+        return model.eval()
+
+
+class GGUFModelLoader(BaseModelLoader):
+    """
+    Model loader that can load GGUF files. This is useful for loading models
+    that are quantized with GGUF and saved in the GGUF format. This loader
+    supports loading both full models and sharded models.
+    """
+
+    def __init__(self, load_config: LoadConfig):
+        super().__init__(load_config)
+        if load_config.model_loader_extra_config:
+            raise ValueError(
+                f"Model loader extra config is not supported for "
+                f"load format {load_config.load_format}"
+            )
+
+    def _prepare_weights(self, model_name_or_path: str):
+        if os.path.isfile(model_name_or_path):
+            return model_name_or_path
+        else:
+            raise ValueError(f"{model_name_or_path} is not a file.")
+
+    def _get_gguf_weights_map(self, model_config: ModelConfig):
+        """
+        GGUF uses this naming convention for their tensors from HF checkpoint:
+        `blk.N.BB.weight` and `blk.N.BB.bias`
+        where N signifies the block number of a layer, and BB signifies the
+        attention/mlp layer components.
+        See "Standardized tensor names" in
+        https://github.com/ggerganov/ggml/blob/master/docs/gguf.md for details.
+        """
+
+        # only load the gguf module when needed
+        try:
+            import gguf
+
+            # FIXME: add version check for gguf
+        except ImportError as err:
+            raise ImportError(
+                "Please install gguf via `pip install gguf` to use gguf quantizer."
+            ) from err
+
+        config = model_config.hf_config
+        model_type = config.model_type
+        # hack: ggufs have a different name than transformers
+        if model_type == "cohere":
+            model_type = "command-r"
+        arch = None
+        for key, value in gguf.MODEL_ARCH_NAMES.items():
+            if value == model_type:
+                arch = key
+                break
+        if arch is None:
+            raise RuntimeError(f"Unknown gguf model_type: {model_type}")
+        num_layers = config.num_hidden_layers
+        name_map = gguf.get_tensor_name_map(arch, num_layers)
+        with torch.device("meta"):
+            dummy_model = AutoModelForCausalLM.from_config(config)
+        state_dict = dummy_model.state_dict()
+
+        gguf_to_hf_name_map = {}
+        for hf_name in state_dict:
+            name, suffix = hf_name.rsplit(".", 1)
+            gguf_name = name_map.get_name(name)
+            gguf_to_hf_name_map[f"{gguf_name}.{suffix}"] = hf_name
+        return gguf_to_hf_name_map
+
+    def _get_weights_iterator(
+        self, model_name_or_path: str, gguf_to_hf_name_map: Dict[str, str]
+    ) -> Generator[Tuple[str, torch.Tensor], None, None]:
+        return gguf_quant_weights_iterator(model_name_or_path, gguf_to_hf_name_map)
+
+    def download_model(self, model_config: ModelConfig) -> None:
+        self._prepare_weights(model_config.model_path)
+
+    def load_model(
+        self,
+        *,
+        model_config: ModelConfig,
+        device_config: DeviceConfig,
+    ) -> nn.Module:
+
+        local_model_path = self._prepare_weights(model_config.model_path)
+        gguf_weights_map = self._get_gguf_weights_map(model_config)
+        # we can only know if tie word embeddings after mapping weights
+        if "lm_head.weight" in get_gguf_extra_tensor_names(
+            local_model_path, gguf_weights_map
+        ):
+            model_config.hf_config.update({"tie_word_embeddings": True})
+
+        target_device = torch.device(device_config.device)
+        quant_config = _get_quantization_config(model_config, self.load_config)
+        with set_default_torch_dtype(model_config.dtype):
+            with target_device:
+                model = _initialize_model(model_config, self.load_config, quant_config)
+            model.load_weights(
+                self._get_weights_iterator(local_model_path, gguf_weights_map)
+            )
+
+            for _, module in model.named_modules():
+                quant_method = getattr(module, "quant_method", None)
+                if quant_method is not None:
+                    with device_loading_context(module, target_device):
+                        quant_method.process_weights_after_loading(module)
+        return model
+
+
+class RemoteInstanceModelLoader(BaseModelLoader):
+    """Model loader that can load Tensors from remote sglang instance."""
+
+    def __init__(self, load_config: LoadConfig):
+        super().__init__(load_config)
+        if load_config.model_loader_extra_config:
+            raise ValueError(
+                f"Model loader extra config is not supported for "
+                f"load format {load_config.load_format}"
+            )
+        self.remote_instance_transfer_engine_weight_info = None
+
+    def download_model(self, model_config: ModelConfig) -> None:
+        raise NotImplementedError
+
+    def load_model(
+        self,
+        *,
+        model_config: ModelConfig,
+        device_config: DeviceConfig,
+    ) -> nn.Module:
+        logger.info("Loading weights from remote instance ...")
+        load_config = self.load_config
+
+        assert load_config.load_format == LoadFormat.REMOTE_INSTANCE, (
+            f"Model loader {self.load_config.load_format} is not supported for "
+            f"load format {load_config.load_format}"
+        )
+
+        quant_config = _get_quantization_config(model_config, self.load_config)
+        with set_default_torch_dtype(model_config.dtype):
+            with torch.device(device_config.device):
+                model = _initialize_model(model_config, self.load_config, quant_config)
+
+        if (
+            load_config.remote_instance_weight_loader_backend
+            == RemoteInstanceWeightLoaderBackend.NCCL
+        ):
+            model_weights = f"instance://{load_config.remote_instance_weight_loader_seed_instance_ip}:{load_config.remote_instance_weight_loader_send_weights_group_ports[load_config.tp_rank]}"
+            with create_remote_connector(model_weights, device_config.device) as client:
+                connector_type = get_connector_type(client)
+                if connector_type == ConnectorType.INSTANCE:
+                    self.load_model_from_remote_instance_by_nccl(
+                        model, client, model_config, device_config
+                    )
+                else:
+                    raise ValueError(
+                        f"Unsupported connector type {connector_type} for "
+                        f"remote tensor model loading."
+                    )
+        elif (
+            load_config.remote_instance_weight_loader_backend
+            == RemoteInstanceWeightLoaderBackend.TRANSFER_ENGINE
+        ):
+            if load_config.remote_instance_weight_loader_transfer_engine is None:
+                raise RuntimeError(
+                    "Transfer engine is not initialized for remote instance "
+                    "model loader with `transfer_engine` backend. "
+                )
+            logger.info(
+                "TransferEngine registering memory regions (this may take a few seconds)..."
+            )
+            # register memory region
+            self.remote_instance_transfer_engine_weight_info = register_memory_region(
+                model, load_config.remote_instance_weight_loader_transfer_engine
+            )
+            logger.info(
+                "TransferEngine memory regions have been successfully registered."
+            )
+
+            # transfer weights
+            success = self.load_model_from_remote_instance_by_transfer_engine(
+                model,
+                load_config.remote_instance_weight_loader_transfer_engine,
+                f"http://{load_config.remote_instance_weight_loader_seed_instance_ip}:{load_config.remote_instance_weight_loader_seed_instance_service_port}",
+                load_config.tp_rank,
+            )
+            if not success:
+                raise RuntimeError(
+                    "Failed to load weights from remote instance via transfer engine."
+                )
+        else:
+            raise ValueError("Invalid remote instance weight loader backend.")
+
+        return model.eval()
+
+    def load_model_from_remote_instance_by_nccl(
+        self, model, client, model_config: ModelConfig, device_config: DeviceConfig
+    ) -> nn.Module:
+        load_config = self.load_config
+        instance_ip = socket.gethostbyname(socket.gethostname())
+        start_build_group_tic = time.time()
+        client.build_group(
+            gpu_id=device_config.gpu_id,
+            tp_rank=load_config.tp_rank,
+            instance_ip=instance_ip,
+        )
+        torch.cuda.synchronize()
+        end_build_group_tic = time.time()
+        logger.debug(
+            f"finish building group for remote instance, time used: {(end_build_group_tic - start_build_group_tic):.4f}s"
+        )
+
+        if load_config.tp_rank == 0:
+            t = threading.Thread(
+                target=trigger_transferring_weights_request,
+                args=(
+                    load_config.remote_instance_weight_loader_seed_instance_ip,
+                    load_config.remote_instance_weight_loader_seed_instance_service_port,
+                    load_config.remote_instance_weight_loader_send_weights_group_ports,
+                    instance_ip,
+                ),
+            )
+            t.start()
+
+        start_get_weights_tic = time.time()
+        with set_default_torch_dtype(model_config.dtype):
+            for _, tensor in model.named_parameters():
+                torch.distributed.broadcast(
+                    tensor.data,
+                    src=0,
+                    group=client._model_update_group,
+                )
+            torch.cuda.synchronize()
+
+            if hasattr(model, "post_load_weights"):
+                model.post_load_weights()
+        end_get_weights_tic = time.time()
+        logger.debug(
+            f"finish getting all weights from remote instance, time used: {(end_get_weights_tic - start_get_weights_tic):.4f}s"
+        )
+        # destroy the process group after loading weights
+        torch.distributed.distributed_c10d.destroy_process_group(
+            client._model_update_group
+        )
+        torch.cuda.empty_cache()
+
+    def load_model_from_remote_instance_by_transfer_engine(
+        self, model, transfer_engine, seed_url, tp_rank
+    ) -> bool:
+        # get remote weights metadata from source instance
+        seed_transfer_engine_session_id, seed_transfer_engine_weight_info = (
+            get_remote_instance_transfer_engine_info_per_rank(seed_url, tp_rank)
+        )
+        if (
+            seed_transfer_engine_session_id is None
+            or seed_transfer_engine_weight_info is None
+        ):
+            logger.error("Cannot get transfer engine session or weight info.")
+            return False
+
+        # prepare local/remote RDMA keys
+        seed_ptr_list = []
+        client_ptr_list = []
+        client_len_list = []
+        for name, tensor in model.named_parameters():
+            weight_info = seed_transfer_engine_weight_info.get(name, None)
+            if weight_info is None:
+                logger.error(f"Cannot find weight info for {name}.")
+                return False
+
+            seed_ptr, seed_numel, seed_element_size = weight_info
+            if (
+                seed_numel != tensor.numel()
+                or seed_element_size != tensor.element_size()
+            ):
+                logger.error(
+                    f"Weight info does not match for {name}, "
+                    f"expected ({seed_numel}, {seed_element_size}), "
+                    f"got ({tensor.numel()}, {tensor.element_size()})"
+                )
+                return False
+            client_ptr = tensor.data_ptr()
+            client_len = tensor.numel() * tensor.element_size()
+            seed_ptr_list.append(seed_ptr)
+            client_ptr_list.append(client_ptr)
+            client_len_list.append(client_len)
+
+        # load weights from source instance through TransferEngine
+        ret = transfer_engine.batch_transfer_sync_read(
+            seed_transfer_engine_session_id,
+            client_ptr_list,
+            seed_ptr_list,
+            client_len_list,
+        )
+        if ret < 0:
+            logger.error(f"batch transfer failed, error: {ret}")
+            return False
+
+        if hasattr(model, "post_load_weights"):
+            model.post_load_weights()
+
+        return True
+
+
+class RemoteModelLoader(BaseModelLoader):
+    """Model loader that can load Tensors from remote database."""
+
+    def __init__(self, load_config: LoadConfig):
+        super().__init__(load_config)
+        # TODO @DellCurry: move to s3 connector only
+        set_runai_streamer_env(load_config)
+
+    def _get_weights_iterator_kv(
+        self,
+        client,
+    ) -> Generator[Tuple[str, torch.Tensor], None, None]:
+        """Get an iterator for the model weights from remote storage."""
+        assert get_connector_type(client) == ConnectorType.KV
+        rank = get_tensor_model_parallel_rank()
+        return client.weight_iterator(rank)
+
+    def _get_weights_iterator_fs(
+        self,
+        client,
+    ) -> Generator[Tuple[str, torch.Tensor], None, None]:
+        """Get an iterator for the model weights from remote storage."""
+        assert get_connector_type(client) == ConnectorType.FS
+        return client.weight_iterator()
+
+    def download_model(self, model_config: ModelConfig) -> None:
+        pass
+
+    @staticmethod
+    def save_model(
+        model: torch.nn.Module,
+        model_path: str,
+        url: str,
+    ) -> None:
+        with create_remote_connector(url) as client:
+            assert get_connector_type(client) == ConnectorType.KV
+            model_name = parse_model_name(url)
+            rank = get_tensor_model_parallel_rank()
+            state_dict = ShardedStateLoader._filter_subtensors(model.state_dict())
+            for key, tensor in state_dict.items():
+                r_key = f"{model_name}/keys/rank_{rank}/{key}"
+                client.set(r_key, tensor)
+
+            for root, _, files in os.walk(model_path):
+                for file_name in files:
+                    # ignore hidden files
+                    if file_name.startswith("."):
+                        continue
+                    if os.path.splitext(file_name)[1] in (".json", ".py"):
+                        file_path = os.path.join(root, file_name)
+                        with open(file_path, encoding="utf-8") as file:
+                            file_content = file.read()
+                            f_key = f"{model_name}/files/{file_name}"
+                            client.setstr(f_key, file_content)
+
+    def _load_model_from_remote_kv(
+        self, model: nn.Module, model_config: ModelConfig, client
+    ):
+        for _, module in model.named_modules():
+            quant_method = getattr(module, "quant_method", None)
+            if quant_method is not None:
+                quant_method.process_weights_after_loading(module)
+        weights_iterator = self._get_weights_iterator_kv(client)
+        state_dict = ShardedStateLoader._filter_subtensors(model.state_dict())
+        for key, tensor in weights_iterator:
+            # If loading with LoRA enabled, additional padding may
+            # be added to certain parameters. We only load into a
+            # narrowed view of the parameter data.
+            param_data = state_dict[key].data
+            param_shape = state_dict[key].shape
+            for dim, size in enumerate(tensor.shape):
+                if size < param_shape[dim]:
+                    param_data = param_data.narrow(dim, 0, size)
+            if tensor.shape != param_shape:
+                logger.warning(
+                    "loading tensor of shape %s into " "parameter '%s' of shape %s",
+                    tensor.shape,
+                    key,
+                    param_shape,
+                )
+            param_data.copy_(tensor)
+            state_dict.pop(key)
+        if state_dict:
+            raise ValueError(f"Missing keys {tuple(state_dict)} in loaded state!")
+
+        post_load_weights(model, model_config)
+
+    def _load_model_from_remote_fs(
+        self, model, client, model_config: ModelConfig, device_config: DeviceConfig
+    ) -> nn.Module:
+
+        target_device = torch.device(device_config.device)
+        with set_default_torch_dtype(model_config.dtype):
+            model.load_weights(self._get_weights_iterator_fs(client))
+
+            for _, module in model.named_modules():
+                quant_method = getattr(module, "quant_method", None)
+                if quant_method is not None:
+                    # When quant methods need to process weights after loading
+                    # (for repacking, quantizing, etc), they expect parameters
+                    # to be on the global target device. This scope is for the
+                    # case where cpu offloading is used, where we will move the
+                    # parameters onto device for processing and back off after.
+                    with device_loading_context(module, target_device):
+                        quant_method.process_weights_after_loading(module)
+
+    def load_model(
+        self,
+        *,
+        model_config: ModelConfig,
+        device_config: DeviceConfig,
+    ) -> nn.Module:
+        logger.info("Loading weights from remote storage ...")
+        start = time.perf_counter()
+        load_config = self.load_config
+
+        assert load_config.load_format == LoadFormat.REMOTE, (
+            f"Model loader {self.load_config.load_format} is not supported for "
+            f"load format {load_config.load_format}"
+        )
+
+        model_weights = model_config.model_path
+        if hasattr(model_config, "model_weights"):
+            model_weights = model_config.model_weights
+
+        quant_config = _get_quantization_config(model_config, self.load_config)
+
+        with set_default_torch_dtype(model_config.dtype):
+            with torch.device(device_config.device):
+                model = _initialize_model(model_config, self.load_config, quant_config)
+
+            with create_remote_connector(
+                model_weights, device=device_config.device
+            ) as client:
+                connector_type = get_connector_type(client)
+                if connector_type == ConnectorType.KV:
+                    self._load_model_from_remote_kv(model, model_config, client)
+                elif connector_type == ConnectorType.FS:
+                    self._load_model_from_remote_fs(
+                        model, client, model_config, device_config
+                    )
+
+        end = time.perf_counter()
+        logger.info("Loaded weights from remote storage in %.2f seconds.", end - start)
+        return model.eval()
+
+
+def load_model_with_cpu_quantization(
+    self,
+    *,
+    model_config: ModelConfig,
+    device_config: DeviceConfig,
+) -> nn.Module:
+    target_device = torch.device(device_config.device)
+    quant_config = _get_quantization_config(model_config, self.load_config)
+    with set_default_torch_dtype(model_config.dtype):
+        model = _initialize_model(
+            model_config,
+            self.load_config,
+            quant_config,
+        )
+
+        if not isinstance(self, DummyModelLoader):
+            model.load_weights(self._get_all_weights(model_config, model))
+
+        for _, module in model.named_modules():
+            quant_method = getattr(module, "quant_method", None)
+            if quant_method is not None:
+                # When quant methods need to process weights after loading
+                # (for repacking, quantizing, etc), they expect parameters
+                # to be on the global target device. This scope is for the
+                # case where cpu offloading is used, where we will move the
+                # parameters onto device for processing and back off after.
+                with device_loading_context(module, target_device):
+                    quant_method.process_weights_after_loading(module)
+
+        model.to(target_device)
+
+    return model.eval()
+
+
+class ModelOptModelLoader(DefaultModelLoader):
+    """
+    Model loader that applies NVIDIA Model Optimizer quantization
+    """
+
+    def __init__(self, load_config: LoadConfig):
+        super().__init__(load_config)
+        # Any ModelOpt specific initialization if needed
+
+    def _setup_modelopt_quantization(
+        self,
+        model,
+        tokenizer,
+        quant_cfg,
+        quantized_ckpt_restore_path: str | None = None,
+        quantized_ckpt_save_path: str | None = None,
+        export_path: str | None = None,
+    ) -> None:
+        """
+        Set up ModelOpt quantization for the given model.
+
+        Args:
+            model: The model to quantize
+            tokenizer: The tokenizer associated with the model
+            quant_cfg: The quantization configuration
+            quantized_ckpt_restore_path: Path to restore quantized checkpoint from
+            quantized_ckpt_save_path: Path to save quantized checkpoint to
+            export_path: Path to export the quantized model in HuggingFace format
+
+        Raises:
+            ImportError: If ModelOpt is not available
+            Exception: If quantization setup fails
+        """
+        try:
+            import modelopt.torch.opt as mto
+            import modelopt.torch.quantization as mtq
+            from modelopt.torch.quantization.utils import is_quantized
+        except ImportError as e:
+            raise ImportError(
+                "ModelOpt is not available. Please install modelopt."
+            ) from e
+
+        if is_quantized(model):
+            rank0_log("Model is already quantized, skipping quantization setup.")
+            return
+        # Restore from checkpoint if provided
+        if quantized_ckpt_restore_path:
+            try:
+                mto.restore(model, quantized_ckpt_restore_path)
+                rank0_log(
+                    f"Restored quantized model from {quantized_ckpt_restore_path}"
+                )
+
+                # Export model if path provided (even when restoring from checkpoint)
+                self._maybe_export_modelopt(model, export_path)
+                return
+            except Exception as e:
+                logger.warning(
+                    f"Failed to restore from {quantized_ckpt_restore_path}: {e}"
+                )
+                rank0_log("Proceeding with calibration-based quantization...")
+
+        # Set up calibration-based quantization
+        try:
+            # Left padding tends to work better for batched generation with decoder-only LMs
+            with suppress(Exception):
+                tokenizer.padding_side = "left"
+
+            from modelopt.torch.utils.dataset_utils import (
+                create_forward_loop,
+                get_dataset_dataloader,
+            )
+
+            # Create calibration dataloader
+            calib_dataloader = get_dataset_dataloader(
+                dataset_name="cnn_dailymail",  # TODO: Consider making this configurable
+                tokenizer=tokenizer,
+                batch_size=36,  # TODO: Consider making this configurable
+                num_samples=512,  # TODO: Consider making this configurable
+                device=model.device,
+                include_labels=False,
+            )
+
+            calibrate_loop = create_forward_loop(dataloader=calib_dataloader)
+
+            # Apply quantization
+            mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop)
+
+            if (
+                not model_parallel_is_initialized()
+                or get_tensor_model_parallel_rank() == 0
+            ):
+                mtq.print_quant_summary(model)
+
+            # Save checkpoint if path provided
+            if quantized_ckpt_save_path:
+                try:
+                    mto.save(model, quantized_ckpt_save_path)
+                    rank0_log(f"Quantized model saved to {quantized_ckpt_save_path}")
+                except Exception as e:
+                    logger.warning(
+                        f"Failed to save quantized checkpoint to {quantized_ckpt_save_path}: {e}"
+                    )
+
+            # Export model if path provided
+            self._maybe_export_modelopt(model, export_path)
+
+        except Exception as e:
+            raise Exception(f"Failed to set up ModelOpt quantization: {e}") from e
+
+    def _maybe_export_modelopt(self, model, export_path: str | None) -> None:
+        """Export model to HuggingFace format if export_path is provided."""
+        if export_path:
+            try:
+                # Get the original model path from the model config
+                original_model_path = getattr(self, "_original_model_path", None)
+                self._export_modelopt_checkpoint(
+                    model, export_path, original_model_path
+                )
+                rank0_log(
+                    f"Quantized model exported to HuggingFace format at {export_path}"
+                )
+            except Exception as e:
+                rank0_log(
+                    f"Warning: Failed to export quantized model to {export_path}: {e}"
+                )
+
+    def _export_modelopt_checkpoint(
+        self,
+        model,
+        export_path: str,
+        model_path: str = None,
+        trust_remote_code: bool = True,
+    ) -> None:
+        """
+        Export the quantized model to HuggingFace format using ModelOpt export API.
+
+        Args:
+            model: The quantized model to export
+            export_path: Directory path to export the model to
+            model_path: Path to the original model (for tokenizer export)
+            trust_remote_code: Whether to trust remote code for tokenizer loading
+
+        Raises:
+            ImportError: If ModelOpt export functionality is not available
+            Exception: If export fails
+        """
+        try:
+            from modelopt.torch.export import export_hf_checkpoint
+            from transformers import AutoTokenizer
+        except ImportError as e:
+            raise ImportError(
+                "ModelOpt export functionality is not available. "
+                "Please ensure you have the latest version of modelopt installed."
+            ) from e
+
+        # Create export directory if it doesn't exist
+        os.makedirs(export_path, exist_ok=True)
+
+        # Export the quantized model
+        export_hf_checkpoint(model, export_dir=export_path)
+
+        # Export the tokenizer if model_path is provided
+        if model_path:
+            try:
+                tokenizer = AutoTokenizer.from_pretrained(
+                    model_path, trust_remote_code=trust_remote_code
+                )
+                tokenizer.save_pretrained(export_path)
+                rank0_log(f"Tokenizer exported to {export_path}")
+            except Exception as e:
+                rank0_log(f"Warning: Failed to export tokenizer: {e}")
+
+    def load_model(
+        self,
+        *,
+        model_config: ModelConfig,
+        device_config: DeviceConfig,
+    ) -> nn.Module:
+
+        logger.info("ModelOptModelLoader: Loading base model...")
+
+        # Store the original model path for tokenizer export
+        self._original_model_path = model_config.model_path
+
+        # Check if model is already quantized
+        if model_config._is_already_quantized():
+            logger.info("Model is already quantized, loading directly...")
+            # Use default loading for pre-quantized models
+            return super().load_model(
+                model_config=model_config, device_config=device_config
+            )
+
+        # TODO: Quantize-and-serve mode has been disabled at the ModelConfig level
+        # All quantization now uses the standard workflow (quantize + export/save)
+        logger.info("Standard quantization mode: Will quantize and export/save")
+        return self._standard_quantization_workflow(model_config, device_config)
+
+    def _standard_quantization_workflow(
+        self, model_config: ModelConfig, device_config: DeviceConfig
+    ) -> nn.Module:
+        """Standard quantization workflow: quantize, save checkpoint, export, then return model."""
+        # Use shared method from parent class to load base model for quantization
+        model = self._load_modelopt_base_model(model_config)
+
+        # Import ModelOpt modules
+        try:
+            import modelopt.torch.quantization as mtq
+        except ImportError:
+            logger.error(
+                "NVIDIA Model Optimizer (modelopt) library not found. "
+                "Please install it to use ModelOpt quantization."
+            )
+            raise
+
+        # Handle both old modelopt_quant and new unified quantization flags
+        if hasattr(model_config, "modelopt_quant") and model_config.modelopt_quant:
+            # Legacy modelopt_quant flag
+            quant_choice_str = model_config.modelopt_quant
+        else:
+            # Unified quantization flag - extract the type (fp8/fp4)
+            quant_choice_str = model_config._get_modelopt_quant_type()
+
+        quant_cfg_name = QUANT_CFG_CHOICES.get(quant_choice_str)
+        if not quant_cfg_name:
+            raise ValueError(
+                f"Invalid quantization choice: '{quant_choice_str}'. "
+                f"Available choices: {list(QUANT_CFG_CHOICES.keys())}"
+            )
+
+        try:
+            # getattr will fetch the config object, e.g., mtq.FP8_DEFAULT_CFG
+            quant_cfg = getattr(mtq, quant_cfg_name)
+        except AttributeError:
+            raise AttributeError(
+                f"ModelOpt quantization config '{quant_cfg_name}' not found. "
+                "Please verify the ModelOpt library installation."
+            )
+
+        logger.info(
+            f"Quantizing model with ModelOpt using config: mtq.{quant_cfg_name}"
+        )
+
+        # Get ModelOpt configuration from LoadConfig
+        modelopt_config = self.load_config.modelopt_config
+        quantized_ckpt_restore_path = (
+            modelopt_config.checkpoint_restore_path if modelopt_config else None
+        )
+        quantized_ckpt_save_path = (
+            modelopt_config.checkpoint_save_path if modelopt_config else None
+        )
+        export_path = modelopt_config.export_path if modelopt_config else None
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_config.model_path, use_fast=True
+        )
+
+        try:
+            self._setup_modelopt_quantization(
+                model,
+                tokenizer,
+                quant_cfg,
+                quantized_ckpt_restore_path=quantized_ckpt_restore_path,
+                quantized_ckpt_save_path=quantized_ckpt_save_path,
+                export_path=export_path,
+            )
+        except Exception as e:
+            logger.warning(f"ModelOpt quantization failed: {e}")
+            rank0_log("Proceeding without quantization...")
+
+        return model.eval()
+
+
+def get_model_loader(
+    load_config: LoadConfig, model_config: Optional[ModelConfig] = None
+) -> BaseModelLoader:
+    """Get a model loader based on the load format."""
+
+    if load_config.load_format == LoadFormat.DUMMY:
+        return DummyModelLoader(load_config)
+
+    if model_config and (
+        (hasattr(model_config, "modelopt_quant") and model_config.modelopt_quant)
+        or model_config.quantization in ["modelopt_fp8", "modelopt_fp4", "modelopt"]
+    ):
+        logger.info("Using ModelOptModelLoader due to ModelOpt quantization config.")
+        return ModelOptModelLoader(load_config)
+
+    # Use ModelOptModelLoader for unified quantization flags
+    if (
+        model_config
+        and hasattr(model_config, "quantization")
+        and model_config.quantization in ["modelopt_fp8", "modelopt_fp4"]
+    ):
+        if model_config._is_already_quantized():
+            logger.info(
+                f"Using ModelOptModelLoader for pre-quantized model: {model_config.quantization}"
+            )
+        else:
+            logger.info(
+                f"Using ModelOptModelLoader for quantization: {model_config.quantization}"
+            )
+        return ModelOptModelLoader(load_config)
+
+    if isinstance(load_config.load_format, type):
+        return load_config.load_format(load_config)
+
+    if load_config.load_format == LoadFormat.SHARDED_STATE:
+        return ShardedStateLoader(load_config)
+
+    if load_config.load_format == LoadFormat.BITSANDBYTES:
+        return BitsAndBytesModelLoader(load_config)
+
+    if load_config.load_format == LoadFormat.GGUF:
+        return GGUFModelLoader(load_config)
+
+    if load_config.load_format == LoadFormat.LAYERED:
+        return LayeredModelLoader(load_config)
+
+    # Check for FLASH_RL format early
+    # FP8 approach: BF16/FP16 model with native FP8 quantization
+    if load_config.load_format == LoadFormat.FLASH_RL:
+        logger.info(
+            "Using QuantizedRLModelLoader for RL training with native FP8 quantization."
+        )
+        logger.info(
+            "FP8 approach: Model loads with native SGLang FP8 quantization. "
+            "Same model path for both training and inference."
+        )
+
+        # Set quantization to FP8 for native SGLang support
+        if model_config and not model_config.quantization:
+            logger.info(
+                "QuantizedRL: Setting quantization to fp8 (native SGLang support). "
+                "Model will be loaded with FP8 infrastructure"
+            )
+            model_config.quantization = "fp8"
+
+        return QuantizedRLModelLoader(load_config)
+
+    if load_config.load_format == LoadFormat.REMOTE:
+        return RemoteModelLoader(load_config)
+
+    if load_config.load_format == LoadFormat.REMOTE_INSTANCE:
+        return RemoteInstanceModelLoader(load_config)
+
+    if load_config.load_format == LoadFormat.PRIVATE:
+        import importlib
+
+        try:
+            module = importlib.import_module("sglang.private.private_model_loader")
+            return module.PrivateModelLoader(load_config)
+        except ImportError:
+            raise ValueError("Failed to import sglang.private.private_model_loader")
+
+    return DefaultModelLoader(load_config)
diff --git a/sglang/python/sglang/srt/model_loader/remote_instance_weight_loader_utils.py b/sglang/python/sglang/srt/model_loader/remote_instance_weight_loader_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..c063ea342d6b2deb9028a103ac2133a0905be270
--- /dev/null
+++ b/sglang/python/sglang/srt/model_loader/remote_instance_weight_loader_utils.py
@@ -0,0 +1,208 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import enum
+import importlib
+import importlib.util
+import logging
+import time
+from typing import List
+
+import requests
+
+logger = logging.getLogger(__name__)
+
+
+class RemoteInstanceWeightLoaderBackend(str, enum.Enum):
+    NCCL = "nccl"
+    TRANSFER_ENGINE = "transfer_engine"
+
+
+def trigger_init_weights_send_group_for_remote_instance_request(
+    remote_instance_weight_loader_seed_instance_ip: str,
+    remote_instance_weight_loader_seed_instance_service_port: int,
+    remote_instance_weight_loader_send_weights_group_ports: List[int],
+    remote_instance_weight_loader_client_id: str,
+):
+    seed_instance_service_url = f"http://{remote_instance_weight_loader_seed_instance_ip}:{remote_instance_weight_loader_seed_instance_service_port}"
+    # Only support loading weights from instance with same parallelism strategy.
+    # Per TP rank pair between seed and dst instances will build a communication group for sending weights.
+    # i.e. seed TP 0 <-> dst TP 0, seed TP 1 <-> dst TP 1, etc.
+    # Each communication group will have a world size 2.
+    try:
+        requests.post(
+            f"{seed_instance_service_url}/init_weights_send_group_for_remote_instance",
+            json={
+                "master_address": remote_instance_weight_loader_seed_instance_ip,
+                "ports": (
+                    ",".join(
+                        str(p)
+                        for p in remote_instance_weight_loader_send_weights_group_ports
+                    )
+                ),
+                "group_rank": 0,
+                "world_size": 2,
+                "group_name": f"send_weights_{remote_instance_weight_loader_client_id}",
+                "backend": "nccl",
+            },
+        )
+    except Exception as e:
+        logger.error(
+            f"Failed to trigger init_weights_send_group_for_remote_instance_request to seed instance {seed_instance_service_url}: {e}."
+        )
+        raise
+
+
+def trigger_transferring_weights_request(
+    remote_instance_weight_loader_seed_instance_ip: str,
+    remote_instance_weight_loader_seed_instance_service_port: int,
+    remote_instance_weight_loader_send_weights_group_ports: List[int],
+    remote_instance_weight_loader_client_id: str,
+):
+    seed_instance_service_url = f"http://{remote_instance_weight_loader_seed_instance_ip}:{remote_instance_weight_loader_seed_instance_service_port}"
+    try:
+        requests.post(
+            f"{seed_instance_service_url}/send_weights_to_remote_instance",
+            json={
+                "master_address": remote_instance_weight_loader_seed_instance_ip,
+                "ports": (
+                    ",".join(
+                        str(p)
+                        for p in remote_instance_weight_loader_send_weights_group_ports
+                    )
+                ),
+                "group_name": f"send_weights_{remote_instance_weight_loader_client_id}",
+            },
+        )
+    except Exception as e:
+        logger.error(f"Failed to trigger send weights to remote instance request: {e}")
+        raise
+
+
+def get_remote_instance_transfer_engine_info_per_rank(seed_url: str, rank: int):
+    try:
+        response = requests.get(
+            f"{seed_url}/get_remote_instance_transfer_engine_info",
+            params={
+                "rank": rank,
+            },
+        )
+
+        if response.status_code == 200:
+            data = response.json()
+
+            if "remote_instance_transfer_engine_info" in data:
+                return data["remote_instance_transfer_engine_info"]
+            else:
+                logger.error(
+                    "Failed to get `remote_instance_transfer_engine_info` in response."
+                )
+                return None, None
+        else:
+            logger.error(f"request.get failed: {response.status_code}")
+            return None, None
+    except Exception as e:
+        logger.error(f"Exception: {e}")
+        return None, None
+
+
+def parse_remote_instance_transfer_engine_info_from_scheduler_infos(scheduler_infos):
+    remote_instance_transfer_engine_info = {}
+    for data in scheduler_infos:
+        if (
+            "tp_rank" in data
+            and "remote_instance_transfer_engine_session_id" in data
+            and "remote_instance_transfer_engine_weights_info_dict" in data
+        ):
+            remote_instance_transfer_engine_info[data["tp_rank"]] = (
+                data["remote_instance_transfer_engine_session_id"],
+                data["remote_instance_transfer_engine_weights_info_dict"],
+            )
+    return remote_instance_transfer_engine_info
+
+
+def register_memory_region(model, transfer_engine):
+    if importlib.util.find_spec("torch") is None:
+        return register_memory_region_v1(model, transfer_engine)
+    else:
+        return register_memory_region_v2(model, transfer_engine)
+
+
+def register_memory_region_v1(model, transfer_engine):
+    start_tic = time.time()
+
+    weight_mr_dict = {}
+    for name, weight in model.named_parameters():
+        ret = transfer_engine.register_memory(
+            weight.data_ptr(), weight.numel() * weight.element_size()
+        )
+        if ret != 0:
+            raise RuntimeError(
+                f"register memory failed for weight {name}, error: {ret}"
+            )
+        weight_mr_dict[name] = (
+            weight.data_ptr(),
+            weight.numel(),
+            weight.element_size(),
+        )
+
+    end_tic = time.time()
+    logger.debug(f"Register memory region time: {(end_tic - start_tic):.4f}s")
+    return weight_mr_dict
+
+
+def register_memory_region_v2(model, transfer_engine):
+    start_tic = time.time()
+
+    weight_mr_dict = {}
+    weight_addr_set = set()
+    for name, weight in model.named_parameters():
+        weight_mr_dict[name] = (
+            weight.data_ptr(),
+            weight.numel(),
+            weight.element_size(),
+        )
+        weight_addr_set.add(weight.data_ptr())
+
+    import torch
+
+    memory_snapshot = torch.cuda.memory.memory_snapshot()
+    weight_blocks_for_reg_mr = []
+    # Blocks in each segment have continuous physical addresses,
+    # so they can be merged for memory registration.
+    for segment in memory_snapshot:
+        current_weight_block = None
+        blocks = segment.get("blocks", [])
+        for block in blocks:
+            address = block.get("address", -1)
+            size = block.get("size", -1)
+            state = block.get("state", "")
+            if address < 0 or size < 0 or state == "":
+                continue
+            # Only register active allocated memory blocks that hold weights.
+            if state == "active_allocated":
+                if address in weight_addr_set:
+                    if current_weight_block is None:
+                        current_weight_block = (address, size)
+                    elif current_weight_block[0] + current_weight_block[1] == address:
+                        current_weight_block = (
+                            current_weight_block[0],
+                            current_weight_block[1] + size,
+                        )
+                    else:
+                        weight_blocks_for_reg_mr.append(current_weight_block)
+                        current_weight_block = (address, size)
+        if current_weight_block is not None:
+            weight_blocks_for_reg_mr.append(current_weight_block)
+
+    # Register merged memory blocks that hold weights.
+    for weight_block in weight_blocks_for_reg_mr:
+        address, size = weight_block
+        ret = transfer_engine.register_memory(address, size)
+        if ret != 0:
+            raise RuntimeError(
+                f"register memory failed for weight block at address {address} with size {size}, error: {ret}"
+            )
+
+    end_tic = time.time()
+    logger.debug(f"Register memory region v2 time: {(end_tic - start_tic):.4f}s")
+    return weight_mr_dict
diff --git a/sglang/python/sglang/srt/model_loader/utils.py b/sglang/python/sglang/srt/model_loader/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..18739ed6954e23f67ac0f852971cb02d38e5ecc9
--- /dev/null
+++ b/sglang/python/sglang/srt/model_loader/utils.py
@@ -0,0 +1,182 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/model_loader/utils.py
+
+"""Utilities for selecting and loading models."""
+
+import concurrent.futures
+import contextlib
+import logging
+from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Type
+
+import torch
+import transformers
+from torch import nn
+from transformers.dynamic_module_utils import get_class_from_dynamic_module
+
+from sglang.srt.configs.model_config import ModelConfig, ModelImpl
+from sglang.srt.layers import deep_gemm_wrapper
+
+logger = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def set_default_torch_dtype(dtype: torch.dtype):
+    """Sets the default torch dtype to the given dtype."""
+    old_dtype = torch.get_default_dtype()
+    torch.set_default_dtype(dtype)
+    yield
+    torch.set_default_dtype(old_dtype)
+
+
+def resolve_transformers_arch(model_config: ModelConfig, architectures: list[str]):
+    for i, arch in enumerate(architectures):
+        if arch == "TransformersForCausalLM":
+            continue
+        auto_map: dict[str, str] = (
+            getattr(model_config.hf_config, "auto_map", None) or dict()
+        )
+        # Make sure that config class is always initialized before model class,
+        # otherwise the model class won't be able to access the config class,
+        # the expected auto_map should have correct order like:
+        # "auto_map": {
+        #     "AutoConfig": "<your-repo-name>--<config-name>",
+        #     "AutoModel": "<your-repo-name>--<config-name>",
+        #     "AutoModelFor<Task>": "<your-repo-name>--<config-name>",
+        # },
+        auto_modules = {
+            name: get_class_from_dynamic_module(
+                module, model_config.model_path, revision=model_config.revision
+            )
+            for name, module in sorted(auto_map.items(), key=lambda x: x[0])
+        }
+        model_module = getattr(transformers, arch, None)
+        if model_module is None:
+            if "AutoModel" not in auto_map:
+                raise ValueError(
+                    f"Cannot find model module. '{arch}' is not a registered "
+                    "model in the Transformers library (only relevant if the "
+                    "model is meant to be in Transformers) and 'AutoModel' is "
+                    "not present in the model config's 'auto_map' (relevant "
+                    "if the model is custom)."
+                )
+            model_module = auto_modules["AutoModel"]
+        if model_config.model_impl == ModelImpl.TRANSFORMERS:
+            if hasattr(model_module, "is_backend_compatible") and (
+                not model_module.is_backend_compatible()
+            ):
+                raise ValueError(
+                    f"The Transformers implementation of {arch} is not "
+                    "compatible with SGLang."
+                )
+            architectures[i] = "TransformersForCausalLM"
+        if model_config.model_impl == ModelImpl.AUTO:
+            if hasattr(model_module, "is_backend_compatible") and (
+                not model_module.is_backend_compatible()
+            ):
+                raise ValueError(
+                    f"{arch} has no SGlang implementation and the Transformers "
+                    "implementation is not compatible with SGLang."
+                )
+            logger.warning(
+                "%s has no SGLang implementation, falling back to Transformers "
+                "implementation. Some features may not be supported and "
+                "performance may not be optimal.",
+                arch,
+            )
+            architectures[i] = "TransformersForCausalLM"
+    return architectures
+
+
+def get_model_architecture(model_config: ModelConfig) -> Tuple[Type[nn.Module], str]:
+    from sglang.srt.models.registry import ModelRegistry
+
+    architectures = getattr(model_config.hf_config, "architectures", [])
+    # Special handling for quantized Mixtral.
+    # FIXME(woosuk): This is a temporary hack.
+    mixtral_supported = [
+        "fp8",
+        "compressed-tensors",
+        "gptq_marlin",
+        "awq_marlin",
+        "quark_int4fp8_moe",
+    ]
+
+    if (
+        model_config.quantization is not None
+        and model_config.quantization not in mixtral_supported
+        and "MixtralForCausalLM" in architectures
+    ):
+        architectures = ["QuantMixtralForCausalLM"]
+
+    supported_archs = ModelRegistry.get_supported_archs()
+    is_native_supported = any(arch in supported_archs for arch in architectures)
+
+    if model_config.model_impl == ModelImpl.MINDSPORE:
+        architectures = ["MindSporeForCausalLM"]
+    elif not is_native_supported or model_config.model_impl == ModelImpl.TRANSFORMERS:
+        architectures = resolve_transformers_arch(model_config, architectures)
+    return ModelRegistry.resolve_model_cls(architectures)
+
+
+def get_architecture_class_name(model_config: ModelConfig) -> str:
+    return get_model_architecture(model_config)[1]
+
+
+def post_load_weights(model: nn.Module, model_config: ModelConfig):
+    # Model weight loading consists of two stages:
+    # 1. Initial weight loading.
+    # 2. Post-processing of weights, including assigning specific member variables.
+    # For `dummy_init`, only the second stage is required.
+    if hasattr(model, "post_load_weights"):
+        if model_config.hf_config.architectures[0] == "DeepseekV3ForCausalLMNextN":
+            model.post_load_weights(is_nextn=True)
+        else:
+            model.post_load_weights()
+
+
+def should_deepgemm_weight_requant_ue8m0(weight_block_size):
+    """Should we requant fp8 weights into UE8M0 format when loading the model"""
+    return (
+        deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM
+        and deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0
+        and weight_block_size is not None
+    )
+
+
+def should_async_load(weight: torch.Tensor) -> bool:
+    """Return True if we should load the given weight asynchronously.
+
+    For host (CPU) tensors, using a threadpool can overlap H2D copies
+    and improve throughput. For device tensors, threading often adds overhead
+    (e.g., GIL contention) without benefit, so we do it synchronously.
+    """
+    device = getattr(weight, "device", None)
+    if device is None:
+        return False
+    return device.type == "cpu"
+
+
+def maybe_executor_submit(
+    *,
+    executor: concurrent.futures.ThreadPoolExecutor,
+    futures: List[concurrent.futures.Future],
+    use_async: bool,
+    func: Callable[..., Any],
+    func_args: Iterable[Any] = (),
+    func_kwargs: Optional[Dict[str, Any]] = None,
+) -> None:
+    """Submit a task to the executor if async loading is enabled.
+
+    Parameters (keyword-only):
+    - executor: ThreadPoolExecutor used to submit background tasks
+    - futures: a list collecting the submitted Future objects
+    - use_async: whether to submit to executor or run inline
+    - func: the callable to run
+    - func_args: positional args for the callable (defaults to empty tuple)
+    - func_kwargs: keyword args for the callable (defaults to empty dict)
+    """
+    if func_kwargs is None:
+        func_kwargs = {}
+    if use_async:
+        futures.append(executor.submit(func, *func_args, **func_kwargs))
+    else:
+        func(*func_args, **func_kwargs)
diff --git a/sglang/python/sglang/srt/model_loader/weight_utils.py b/sglang/python/sglang/srt/model_loader/weight_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..4be11541a92417e34be5a55a3eb739e7a1145a61
--- /dev/null
+++ b/sglang/python/sglang/srt/model_loader/weight_utils.py
@@ -0,0 +1,1429 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/model_loader/weight_utils.py
+
+"""Utilities for downloading and initializing model weights."""
+
+import collections
+import concurrent.futures
+import fnmatch
+import glob
+import hashlib
+import itertools
+import json
+import logging
+import os
+import tempfile
+from collections import defaultdict
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Generator,
+    Iterable,
+    List,
+    Optional,
+    Tuple,
+    Union,
+)
+
+import filelock
+import huggingface_hub.constants
+import numpy as np
+import safetensors.torch
+import torch
+from huggingface_hub import HfFileSystem, hf_hub_download, snapshot_download
+from pydantic import BaseModel, ConfigDict, ValidationInfo, model_validator
+from tqdm.auto import tqdm
+
+from sglang.srt.configs.load_config import LoadConfig
+from sglang.srt.configs.model_config import ModelConfig
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.layers.dp_attention import get_attention_tp_rank
+from sglang.srt.layers.quantization import QuantizationConfig, get_quantization_config
+from sglang.srt.layers.quantization.fp8 import Fp8Config
+from sglang.srt.layers.quantization.modelopt_quant import (
+    ModelOptFp4Config,
+    ModelOptFp8Config,
+)
+from sglang.srt.model_loader.ci_weight_validation import (
+    ci_download_with_validation_and_retry,
+    ci_validate_and_cleanup_local_snapshot,
+)
+from sglang.srt.utils import (
+    BAR_FORMAT,
+    find_local_repo_dir,
+    is_cpu,
+    log_info_on_rank0,
+    print_warning_once,
+)
+from sglang.utils import is_in_ci
+
+try:
+    from fastsafetensors import SafeTensorsFileLoader, SingleGroup
+except ImportError as e:
+    SafeTensorsFileLoader = SingleGroup = None
+
+logger = logging.getLogger(__name__)
+
+
+def enable_hf_transfer():
+    """automatically activates hf_transfer"""
+    if "HF_HUB_ENABLE_HF_TRANSFER" not in os.environ:
+        try:
+            # enable hf hub transfer if available
+            import hf_transfer  # type: ignore # noqa
+
+            huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER = True
+        except ImportError:
+            pass
+
+
+enable_hf_transfer()
+
+
+# use system-level temp directory for file locks, so that multiple users
+# can share the same lock without error.
+# lock files in the temp directory will be automatically deleted when the
+# system reboots, so users will not complain about annoying lock files
+temp_dir = tempfile.gettempdir()
+
+
+def get_lock(
+    model_name_or_path: str, cache_dir: Optional[str] = None, suffix: str = ""
+):
+    lock_dir = cache_dir or temp_dir
+    os.makedirs(os.path.dirname(lock_dir), exist_ok=True)
+    model_name = model_name_or_path.replace("/", "-")
+    hash_name = hashlib.sha256(model_name.encode()).hexdigest()
+    # add hash to avoid conflict with old users' lock files
+    lock_file_name = hash_name + model_name + suffix + ".lock"
+    # mode 0o666 is required for the filelock to be shared across users
+    lock = filelock.FileLock(os.path.join(lock_dir, lock_file_name), mode=0o666)
+    return lock
+
+
+def _shared_pointers(tensors):
+    ptrs = defaultdict(list)
+    for k, v in tensors.items():
+        ptrs[v.data_ptr()].append(k)
+    failing = []
+    for _, names in ptrs.items():
+        if len(names) > 1:
+            failing.append(names)
+    return failing
+
+
+def convert_bin_to_safetensor_file(
+    pt_filename: str,
+    sf_filename: str,
+) -> None:
+    loaded = torch.load(pt_filename, map_location="cpu", weights_only=True)
+    if "state_dict" in loaded:
+        loaded = loaded["state_dict"]
+    shared = _shared_pointers(loaded)
+    for shared_weights in shared:
+        for name in shared_weights[1:]:
+            loaded.pop(name)
+
+    # For tensors to be contiguous
+    loaded = {k: v.contiguous() for k, v in loaded.items()}
+
+    dirname = os.path.dirname(sf_filename)
+    os.makedirs(dirname, exist_ok=True)
+
+    from safetensors.torch import save_file
+
+    save_file(loaded, sf_filename, metadata={"format": "pt"})
+
+    # check file size
+    sf_size = os.stat(sf_filename).st_size
+    pt_size = os.stat(pt_filename).st_size
+    if (sf_size - pt_size) / pt_size > 0.01:
+        raise RuntimeError(f"""The file size different is more than 1%:
+         - {sf_filename}: {sf_size}
+         - {pt_filename}: {pt_size}
+         """)
+
+    # check if the tensors are the same
+    reloaded = safetensors.torch.load_file(sf_filename)
+    for k in loaded:
+        pt_tensor = loaded[k]
+        sf_tensor = reloaded[k]
+        if not torch.equal(pt_tensor, sf_tensor):
+            raise RuntimeError(f"The output tensors do not match for key {k}")
+
+
+def replace_prefix(key: str, prefix_mapping: dict[str, str]) -> str:
+    for prefix, new_prefix in prefix_mapping.items():
+        if key.startswith(prefix):
+            key = key.replace(prefix, new_prefix, 1)
+    return key
+
+
+def replace_substrings(key: str, substring_mapping: dict[str, str]) -> str:
+    for substr, new_substr in substring_mapping.items():
+        if substr in key:
+            key = key.replace(substr, new_substr)
+    return key
+
+
+class DisabledTqdm(tqdm):
+    def __init__(self, *args, **kwargs):
+        kwargs["disable"] = True
+        super().__init__(*args, **kwargs)
+
+
+# TODO(woosuk): Move this to other place.
+def get_quant_config(
+    model_config: ModelConfig,
+    load_config: LoadConfig,
+    packed_modules_mapping: Dict[str, List[str]],
+    remap_prefix: Dict[str, str] | None = None,
+) -> QuantizationConfig:
+    quant_cls = get_quantization_config(model_config.quantization)
+
+    # GGUF doesn't have config file
+    if model_config.quantization == "gguf":
+        return quant_cls.from_config({})
+
+    # Read the quantization config from the HF model config, if available.
+    hf_quant_config = getattr(model_config.hf_config, "quantization_config", None)
+    # some vision model may keep quantization_config in their text_config
+    hf_text_config = getattr(model_config.hf_config, "text_config", None)
+    if hf_quant_config is None and hf_text_config is not None:
+        hf_quant_config = getattr(hf_text_config, "quantization_config", None)
+    if hf_quant_config is None:
+        # compressed-tensors uses a compressions_config
+        hf_quant_config = getattr(model_config.hf_config, "compression_config", None)
+    if hf_quant_config is not None:
+        if not isinstance(hf_quant_config, dict):
+            hf_quant_config = hf_quant_config.to_dict()
+        hf_quant_config["packed_modules_mapping"] = packed_modules_mapping
+        return quant_cls.from_config(hf_quant_config)
+
+    # In case of bitsandbytes/QLoRA, get quant config from the adapter model.
+    if model_config.quantization == "bitsandbytes":
+        if (
+            not load_config.model_loader_extra_config
+            or "qlora_adapter_name_or_path" not in load_config.model_loader_extra_config
+        ):
+            return quant_cls.from_config({"adapter_name_or_path": ""})
+        model_name_or_path = load_config.model_loader_extra_config[
+            "qlora_adapter_name_or_path"
+        ]
+    else:
+        model_name_or_path = model_config.model_path
+
+    is_local = os.path.isdir(model_name_or_path)
+    if not is_local:
+        # Download the config files.
+        with get_lock(model_name_or_path, load_config.download_dir):
+            hf_folder = snapshot_download(
+                model_name_or_path,
+                revision=model_config.revision,
+                allow_patterns="*.json",
+                cache_dir=load_config.download_dir,
+                local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+                tqdm_class=DisabledTqdm,
+            )
+    else:
+        hf_folder = model_name_or_path
+
+    possible_config_filenames = quant_cls.get_config_filenames()
+
+    # If the quantization config is not found, use the default config.
+    if not possible_config_filenames:
+        if model_config.quantization == "mxfp8":
+            return Fp8Config(use_mxfp8=True, is_checkpoint_fp8_serialized=False)
+        return quant_cls()
+
+    config_files = glob.glob(os.path.join(hf_folder, "*.json"))
+
+    quant_config_files = [
+        f for f in config_files if any(f.endswith(x) for x in possible_config_filenames)
+    ]
+    if len(quant_config_files) == 0:
+        raise ValueError(f"Cannot find the config file for {model_config.quantization}")
+    if len(quant_config_files) > 1:
+        raise ValueError(
+            f"Found multiple config files for {model_config.quantization}: "
+            f"{quant_config_files}"
+        )
+
+    quant_config_file = quant_config_files[0]
+    with open(quant_config_file) as f:
+        config = json.load(f)
+        if remap_prefix is not None:
+            exclude_modules = [
+                replace_prefix(key, remap_prefix)
+                for key in config["quantization"]["exclude_modules"]
+            ]
+            config["quantization"]["exclude_modules"] = exclude_modules
+        config["packed_modules_mapping"] = packed_modules_mapping
+
+        if model_config.quantization == "bitsandbytes":
+            config["adapter_name_or_path"] = model_name_or_path
+        elif model_config.quantization.startswith("modelopt") and (
+            config.get("producer", {}).get("name", "").startswith("modelopt")
+        ):
+            quant_algo = config["quantization"]["quant_algo"]
+            if quant_algo is None:
+                # (yizhang2077) workaround for nvidia/Llama-4-Maverick-17B-128E-Eagle3
+                if model_config.hf_config.architectures[0] != "LlamaForCausalLMEagle3":
+                    raise ValueError(
+                        f"Invalid quant_config, quantization method: {model_config.quantization},"
+                        f"hf architectures: {model_config.hf_config.architectures[0]}. "
+                    )
+                return None
+            elif quant_algo == "FP8" or model_config.quantization == "modelopt_fp8":
+                return ModelOptFp8Config.from_config(config)
+            elif "FP4" in quant_algo:
+                return ModelOptFp4Config.from_config(config)
+        return quant_cls.from_config(config)
+
+
+def _check_index_files_exist(snapshot_dir: str) -> Tuple[bool, Optional[str]]:
+    """
+    Check if all files listed in safetensors index files actually exist on disk.
+
+    This catches cases where the snapshot directory exists but files are missing
+    (e.g., due to incomplete downloads or corrupted cache).
+
+    Args:
+        snapshot_dir: Path to the model snapshot directory
+
+    Returns:
+        Tuple of (all_exist, error_message)
+    """
+    index_files = [
+        f for f in os.listdir(snapshot_dir) if f.endswith(".safetensors.index.json")
+    ]
+
+    if not index_files:
+        return True, None  # Not a sharded model
+
+    for index_file in index_files:
+        index_path = os.path.join(snapshot_dir, index_file)
+        if not os.path.exists(index_path):
+            continue
+        try:
+            with open(index_path) as f:
+                weight_map = json.load(f).get("weight_map", {})
+            if not weight_map:
+                continue
+            required_files = set(weight_map.values())
+            missing_files = [
+                fn
+                for fn in required_files
+                if not os.path.exists(os.path.join(snapshot_dir, fn))
+            ]
+            if missing_files:
+                return (
+                    False,
+                    f"Missing {len(missing_files)} file(s) from index {index_file}: "
+                    f"{missing_files[:3]}{'...' if len(missing_files) > 3 else ''}",
+                )
+        except Exception as e:
+            logger.warning("Failed to read index file %s: %s", index_file, e)
+            continue
+
+    return True, None
+
+
+def _find_local_hf_snapshot_dir_unlocked(
+    model_name_or_path: str,
+    cache_dir: Optional[str],
+    allow_patterns: List[str],
+    revision: Optional[str] = None,
+) -> Optional[str]:
+    """Find local HF snapshot directory without locking.
+
+    IMPORTANT: Caller MUST hold the model lock before calling this function
+    to prevent race conditions during validation and cleanup.
+
+    If the weights are already local, skip downloading and returns the path.
+    """
+    if os.path.isdir(model_name_or_path):
+        return None
+
+    found_local_snapshot_dir = None
+
+    # Check custom cache_dir (if provided)
+    if cache_dir:
+        try:
+            repo_folder = os.path.join(
+                cache_dir,
+                huggingface_hub.constants.REPO_ID_SEPARATOR.join(
+                    ["models", *model_name_or_path.split("/")]
+                ),
+            )
+            rev_to_use = revision
+            if not rev_to_use:
+                ref_main = os.path.join(repo_folder, "refs", "main")
+                if os.path.isfile(ref_main):
+                    with open(ref_main) as f:
+                        rev_to_use = f.read().strip()
+            if rev_to_use:
+                rev_dir = os.path.join(repo_folder, "snapshots", rev_to_use)
+                if os.path.isdir(rev_dir):
+                    found_local_snapshot_dir = rev_dir
+        except Exception as e:
+            logger.warning(
+                "Failed to find local snapshot in custom cache_dir %s: %s",
+                cache_dir,
+                e,
+            )
+
+    # Check default HF cache as well
+    if not found_local_snapshot_dir:
+        try:
+            rev_dir = find_local_repo_dir(model_name_or_path, revision)
+            if rev_dir and os.path.isdir(rev_dir):
+                found_local_snapshot_dir = rev_dir
+        except Exception as e:
+            logger.warning("Failed to find local snapshot in default HF cache: %s", e)
+
+    # if local snapshot exists, validate it contains at least one weight file
+    # matching allow_patterns before skipping download.
+    if found_local_snapshot_dir is None:
+        return None
+
+    # Check if snapshot dir exists (might have been cleaned by another process
+    # before we acquired the lock)
+    if not os.path.isdir(found_local_snapshot_dir):
+        return None
+
+    local_weight_files: List[str] = []
+    try:
+        for pattern in allow_patterns:
+            matched_files = glob.glob(os.path.join(found_local_snapshot_dir, pattern))
+            for f in matched_files:
+                # os.path.exists returns False for broken symlinks.
+                if not os.path.exists(f):
+                    continue
+                local_weight_files.append(f)
+    except Exception as e:
+        logger.warning(
+            "Failed to scan local snapshot %s with patterns %s: %s",
+            found_local_snapshot_dir,
+            allow_patterns,
+            e,
+        )
+        local_weight_files = []
+
+    # Check for missing files from index (lightweight, for all users)
+    # This catches incomplete downloads before they cause cryptic load errors
+    if local_weight_files:
+        is_complete, error_msg = _check_index_files_exist(found_local_snapshot_dir)
+        if not is_complete:
+            log_info_on_rank0(
+                logger,
+                f"Local snapshot incomplete for {model_name_or_path}: {error_msg}. "
+                f"Will download missing files.",
+            )
+            return None  # Triggers snapshot_download() which handles partial downloads
+
+    # Only perform cache validation and cleanup in CI to avoid
+    # unnecessary overhead for regular users
+    if is_in_ci() and local_weight_files:
+        is_valid = ci_validate_and_cleanup_local_snapshot(
+            model_name_or_path, found_local_snapshot_dir, local_weight_files
+        )
+        if not is_valid:
+            return None
+
+    if len(local_weight_files) > 0:
+        log_info_on_rank0(
+            logger,
+            f"Found local HF snapshot for {model_name_or_path} at "
+            f"{found_local_snapshot_dir}; skipping download.",
+        )
+        return found_local_snapshot_dir
+    else:
+        log_info_on_rank0(
+            logger,
+            f"Local HF snapshot at {found_local_snapshot_dir} has no files matching "
+            f"{allow_patterns}; will attempt download.",
+        )
+        return None
+
+
+def download_weights_from_hf(
+    model_name_or_path: str,
+    cache_dir: Optional[str],
+    allow_patterns: List[str],
+    revision: Optional[str] = None,
+    ignore_patterns: Optional[Union[str, List[str]]] = None,
+    max_retries: int = 3,
+) -> str:
+    """Download model weights from Hugging Face Hub.
+
+    Args:
+        model_name_or_path (str): The model name or path.
+        cache_dir (Optional[str]): The cache directory to store the model
+            weights. If None, will use HF defaults.
+        allow_patterns (List[str]): The allowed patterns for the
+            weight files. Files matched by any of the patterns will be
+            downloaded.
+        revision (Optional[str]): The revision of the model.
+        ignore_patterns (Optional[Union[str, List[str]]]): The patterns to
+            filter out the weight files. Files matched by any of the patterns
+            will be ignored.
+        max_retries (int): Maximum number of download retries if corruption
+            is detected. Defaults to 3.
+
+    Returns:
+        str: The path to the downloaded model weights.
+    """
+    # For local paths, no HF operations needed
+    if os.path.isdir(model_name_or_path):
+        return model_name_or_path
+
+    # Use a SINGLE lock for the entire operation (validation + cleanup + download)
+    # to prevent race conditions where:
+    # 1. Process A validates, finds corruption, deletes corrupted file
+    # 2. Process B validates, sees missing file, deletes ENTIRE cache
+    # 3. Process A tries to download but cache is gone
+    # By using one lock, validation/cleanup and download are atomic.
+    with get_lock(model_name_or_path, cache_dir):
+        # Check for valid local cache first (validates and cleans up if needed)
+        path = _find_local_hf_snapshot_dir_unlocked(
+            model_name_or_path, cache_dir, allow_patterns, revision
+        )
+        if path is not None:
+            # Valid local cache found, skip download
+            return path
+
+        # In CI, skip HF API calls if we're in offline mode or want to avoid rate limits
+        # But we already checked for local cache above, so if we're here we need to download
+        if not huggingface_hub.constants.HF_HUB_OFFLINE:
+            # Before we download we look at what is available:
+            fs = HfFileSystem()
+            file_list = fs.ls(model_name_or_path, detail=False, revision=revision)
+
+            # depending on what is available we download different things
+            for pattern in allow_patterns:
+                matching = fnmatch.filter(file_list, pattern)
+                if len(matching) > 0:
+                    allow_patterns = [pattern]
+                    break
+
+        log_info_on_rank0(logger, f"Using model weights format {allow_patterns}")
+
+        if not is_in_ci():
+            # Simple download without validation for non-CI environments
+            hf_folder = snapshot_download(
+                model_name_or_path,
+                allow_patterns=allow_patterns,
+                ignore_patterns=ignore_patterns,
+                cache_dir=cache_dir,
+                tqdm_class=DisabledTqdm,
+                revision=revision,
+                local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+            )
+            return hf_folder
+        else:
+            # Only perform validation and retry in CI to avoid overhead for regular users
+            return ci_download_with_validation_and_retry(
+                model_name_or_path=model_name_or_path,
+                allow_patterns=allow_patterns,
+                ignore_patterns=ignore_patterns,
+                cache_dir=cache_dir,
+                revision=revision,
+                max_retries=max_retries,
+            )
+
+
+def download_safetensors_index_file_from_hf(
+    model_name_or_path: str,
+    index_file: str,
+    cache_dir: Optional[str],
+    revision: Optional[str] = None,
+) -> None:
+    """Download hf safetensors index file from Hugging Face Hub.
+
+    Args:
+        model_name_or_path (str): The model name or path.
+        cache_dir (Optional[str]): The cache directory to store the model
+            weights. If None, will use HF defaults.
+        revision (Optional[str]): The revision of the model.
+    """
+    # Use file lock to prevent multiple processes from
+    # downloading the same model weights at the same time.
+    with get_lock(model_name_or_path, cache_dir):
+        try:
+            # Download the safetensors index file.
+            hf_hub_download(
+                repo_id=model_name_or_path,
+                filename=index_file,
+                cache_dir=cache_dir,
+                revision=revision,
+                local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+            )
+        # If file not found on remote or locally, we should not fail since
+        # only some models will have index_file.
+        except huggingface_hub.utils.EntryNotFoundError:
+            logger.info("No %s found in remote.", index_file)
+        except huggingface_hub.utils.LocalEntryNotFoundError:
+            logger.info("No %s found in local cache.", index_file)
+
+
+# For models like Mistral-7B-v0.3, there are both sharded
+# safetensors files and a consolidated safetensors file.
+# Passing both of these to the weight loader functionality breaks.
+# So, we use the index_file to
+# look up which safetensors files should be used.
+def filter_duplicate_safetensors_files(
+    hf_weights_files: List[str], hf_folder: str, index_file: str
+) -> List[str]:
+    # model.safetensors.index.json is a mapping from keys in the
+    # torch state_dict to safetensors file holding that weight.
+    index_file_name = os.path.join(hf_folder, index_file)
+    if not os.path.isfile(index_file_name):
+        # NOTE: this is a trick of handling mistral model
+        # skip the unsupported consolidated.safetensors file
+        if len(hf_weights_files) == 2:
+            hf_weights_files.sort()
+            if hf_weights_files[0].endswith(
+                "consolidated.safetensors"
+            ) and hf_weights_files[1].endswith("model.safetensors"):
+                return [hf_weights_files[1]]
+        return hf_weights_files
+
+    # Iterate through the weight_map (weight_name: safetensors files)
+    # to identify weights that we should use.
+    with open(index_file_name) as f:
+        weight_map = json.load(f)["weight_map"]
+    weight_files_in_index = set()
+    for weight_name in weight_map:
+        weight_files_in_index.add(os.path.join(hf_folder, weight_map[weight_name]))
+    # Filter out any fields that are not found in the index file.
+    hf_weights_files = [f for f in hf_weights_files if f in weight_files_in_index]
+    return hf_weights_files
+
+
+def maybe_add_mtp_safetensors(
+    hf_weights_files: List[str], hf_folder: str, index_file: str, hf_config
+) -> List[str]:
+    """
+    Auto-detect and add mtp.safetensors for GLM4Moe MTP/NextN models if:
+    1. mtp.safetensors exists in the model directory
+    2. mtp.safetensors is NOT in the index (checkpoint packaging bug)
+    3. Model architecture is Glm4MoeForCausalLM with num_nextn_predict_layers > 0
+
+    This works around incorrectly packaged FP4 checkpoints like
+    baseten-admin/glm-4.7-fp4 where mtp.safetensors exists but
+    isn't referenced in model.safetensors.index.json.
+    """
+    # Only apply for GLM4Moe architecture with nextn layers
+    arch = getattr(hf_config, "architectures", [None])[0]
+    num_nextn_layers = getattr(hf_config, "num_nextn_predict_layers", 0)
+    if not (
+        arch in ["Glm4MoeForCausalLM", "Glm4MoeForCausalLMNextN"]
+        and num_nextn_layers > 0
+    ):
+        return hf_weights_files
+
+    # Check if mtp.safetensors exists and is not already in the file list
+    mtp_path = os.path.join(hf_folder, "mtp.safetensors")
+    if not os.path.isfile(mtp_path) or mtp_path in hf_weights_files:
+        return hf_weights_files
+
+    # mtp.safetensors exists but not in index - this is a bug
+    logger.warning(
+        f"Found mtp.safetensors but it's not referenced in {index_file}. "
+        f"This is a checkpoint packaging bug. Auto-adding it for loading. "
+        f"Please report this to the checkpoint provider."
+    )
+
+    # Add it to the files list
+    return hf_weights_files + [mtp_path]
+
+
+def filter_files_not_needed_for_inference(hf_weights_files: List[str]) -> List[str]:
+    """
+    Exclude files that are not needed for inference.
+
+    See https://github.com/huggingface/transformers/blob/v4.34.0/src/transformers/trainer.py#L227-L233
+    """
+    blacklist = [
+        "training_args.bin",
+        "optimizer.bin",
+        "optimizer.pt",
+        "scheduler.pt",
+        "scaler.pt",
+    ]
+    hf_weights_files = [
+        f for f in hf_weights_files if not any(f.endswith(x) for x in blacklist)
+    ]
+    return hf_weights_files
+
+
+def np_cache_weights_iterator(
+    model_name_or_path: str,
+    cache_dir: Optional[str],
+    hf_folder: str,
+    hf_weights_files: List[str],
+) -> Generator[Tuple[str, torch.Tensor], None, None]:
+    """Iterate over the weights in the model np files.
+
+    Will dump the model weights to numpy files if they are not already dumped.
+    """
+    enable_tqdm = (
+        not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0
+    )
+    # Convert the model weights from torch tensors to numpy arrays for
+    # faster loading.
+    np_folder = os.path.join(hf_folder, "np")
+    os.makedirs(np_folder, exist_ok=True)
+    weight_names_file = os.path.join(np_folder, "weight_names.json")
+    # Use file lock to prevent multiple processes from
+    # dumping the same model weights to numpy at the same time.
+    with get_lock(model_name_or_path, cache_dir):
+        if not os.path.exists(weight_names_file):
+            weight_names: List[str] = []
+            for bin_file in tqdm(
+                hf_weights_files,
+                desc="Loading np_cache checkpoint shards",
+                disable=not enable_tqdm,
+                bar_format=BAR_FORMAT,
+                position=tqdm._get_free_pos(),
+            ):
+                state = torch.load(bin_file, map_location="cpu", weights_only=True)
+                for name, param in state.items():
+                    param_path = os.path.join(np_folder, name)
+                    with open(param_path, "wb") as f:
+                        np.save(f, param.cpu().detach().numpy())
+                    weight_names.append(name)
+            with open(weight_names_file, "w") as f:
+                json.dump(weight_names, f)
+
+    with open(weight_names_file) as f:
+        weight_names = json.load(f)
+
+    for name in weight_names:
+        param_path = os.path.join(np_folder, name)
+        with open(param_path, "rb") as f:
+            param = np.load(f)
+        yield name, torch.from_numpy(param)
+
+
+def safetensors_weights_iterator(
+    hf_weights_files: List[str],
+    disable_mmap: bool = False,
+) -> Generator[Tuple[str, torch.Tensor], None, None]:
+    """Iterate over the weights in the model safetensor files."""
+    enable_tqdm = (
+        not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0
+    )
+    for st_file in tqdm(
+        hf_weights_files,
+        desc="Loading safetensors checkpoint shards",
+        disable=not enable_tqdm,
+        bar_format=BAR_FORMAT,
+        position=tqdm._get_free_pos(),
+    ):
+        if disable_mmap:
+            with open(st_file, "rb") as f:
+                result = safetensors.torch.load(f.read())
+                for name in sorted(result.keys()):
+                    yield name, result[name]
+        else:
+            with safetensors.safe_open(st_file, framework="pt", device="cpu") as f:
+                for name in f.keys():
+                    yield name, f.get_tensor(name)
+
+
+def fastsafetensors_weights_iterator(
+    hf_weights_files: List[str],
+) -> Generator[Tuple[str, torch.Tensor], None, None]:
+    """
+    Iterate over the weights in the model safetensor files
+    using fastsafetensor library to accelerate loading via GPU Direct Storage (if available).
+    """
+    if SafeTensorsFileLoader is None:
+        raise ImportError(
+            "Please install fastsafetensors via `pip install fastsafetensors`"
+        )
+
+    if torch.distributed.is_initialized():
+        pg = torch.distributed.group.WORLD
+    else:
+        pg = SingleGroup()
+
+    try:
+        rank = pg.rank()
+    except Exception:
+        rank = 0
+
+    device = torch.device(f"cuda:{rank}")
+
+    weight_files_sub_lists = [
+        hf_weights_files[i : i + pg.size()]
+        for i in range(0, len(hf_weights_files), pg.size())
+    ]
+
+    _BAR_FORMAT = (
+        "{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]"
+    )
+
+    for f_list in tqdm(
+        weight_files_sub_lists,
+        desc="Loading safetensors using Fastsafetensor loader",
+        disable=False,
+        bar_format=_BAR_FORMAT,
+    ):
+        loader = SafeTensorsFileLoader(pg, device)
+        rank_file_map = {i: [f] for i, f in enumerate(f_list)}
+        loader.add_filenames(rank_file_map)
+        try:
+            fb = loader.copy_files_to_device()
+            try:
+                keys = list(fb.key_to_rank_lidx.keys())
+                for k in keys:
+                    t = fb.get_tensor(k)
+                    yield k, t
+            finally:
+                pass
+        finally:
+            loader.close()
+
+
+def multi_thread_safetensors_weights_iterator(
+    hf_weights_files: List[str],
+    max_workers: int,
+    disable_mmap: bool = False,
+) -> Generator[Tuple[str, torch.Tensor], None, None]:
+    """Multi-Thread iterate over the weights in the model safetensor files."""
+    enable_tqdm = (
+        not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0
+    )
+
+    def _load_file(st_file: str):
+        if disable_mmap:
+            with open(st_file, "rb") as f:
+                return safetensors.torch.load(f.read())
+        return safetensors.torch.load_file(st_file, device="cpu")
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+        futures = [executor.submit(_load_file, st_file) for st_file in hf_weights_files]
+
+        if enable_tqdm:
+            futures_iter = tqdm(
+                concurrent.futures.as_completed(futures),
+                total=len(hf_weights_files),
+                desc="Multi-thread loading shards",
+                disable=not enable_tqdm,
+                bar_format=BAR_FORMAT,
+            )
+        else:
+            futures_iter = concurrent.futures.as_completed(futures)
+
+        for future in futures_iter:
+            state_dict = future.result()
+            for name, param in state_dict.items():
+                yield name, param
+
+
+def buffered_multi_thread_safetensors_weights_iterator(
+    hf_weights_files: List[str],
+    max_workers: int,
+    disable_mmap: bool = False,
+) -> Generator[Tuple[str, torch.Tensor], None, None]:
+    """Multi-threaded safetensor loader with bounded memory via a sliding window.
+
+    At most (max_workers + 1) shard files are in-flight at any time:
+    max_workers loading concurrently + 1 prefetched and ready to yield.
+    Peak CPU RAM ≈ (max_workers + 2) × shard_file_size.
+    """
+    enable_tqdm = (
+        not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0
+    )
+
+    def _load_file(st_file: str):
+        if disable_mmap:
+            with open(st_file, "rb") as f:
+                result = safetensors.torch.load(f.read())
+        else:
+            with safetensors.safe_open(st_file, framework="pt", device="cpu") as f:
+                result = {k: f.get_tensor(k) for k in f.keys()}
+        return result
+
+    # Sliding window: max_workers loading + 1 prefetched.
+    buffer_size = max_workers + 1
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+        file_iter = iter(hf_weights_files)
+        pending: collections.deque = collections.deque()
+
+        # Seed the buffer.
+        for st_file in itertools.islice(file_iter, buffer_size):
+            pending.append(executor.submit(_load_file, st_file))
+
+        with tqdm(
+            total=len(hf_weights_files),
+            desc="Multi-thread loading shards",
+            disable=not enable_tqdm,
+            bar_format=BAR_FORMAT,
+            position=tqdm._get_free_pos(),
+        ) as pbar:
+            while pending:
+                future = pending.popleft()
+                state_dict = future.result()
+                del future  # let GC reclaim the Future's internal result
+
+                # Replenish: submit the next file to keep the buffer full.
+                next_file = next(file_iter, None)
+                if next_file is not None:
+                    pending.append(executor.submit(_load_file, next_file))
+
+                for name in sorted(state_dict.keys()):
+                    yield name, state_dict[name]
+                del state_dict
+                pbar.update(1)
+
+
+def _load_pt_file(bin_file: str) -> dict:
+    """Load a PyTorch checkpoint file, handling legacy tar format.
+
+    PyTorch 2.6 changed the default of weights_only from False to True.
+    Legacy tar format files cannot be loaded with weights_only=True.
+    This function tries weights_only=True first, then falls back to False
+    for legacy tar format files from trusted sources (HuggingFace Hub).
+    """
+    try:
+        return torch.load(bin_file, map_location="cpu", weights_only=True)
+    except RuntimeError as e:
+        if "legacy .tar format" in str(e):
+            logger.warning(
+                "Loading %s with weights_only=False (legacy tar format)",
+                os.path.basename(bin_file),
+            )
+            return torch.load(bin_file, map_location="cpu", weights_only=False)
+        raise
+
+
+def pt_weights_iterator(
+    hf_weights_files: List[str],
+) -> Generator[Tuple[str, torch.Tensor], None, None]:
+    """Iterate over the weights in the model bin/pt files."""
+    enable_tqdm = (
+        not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0
+    )
+    for bin_file in tqdm(
+        hf_weights_files,
+        desc="Loading pt checkpoint shards",
+        disable=not enable_tqdm,
+        bar_format=BAR_FORMAT,
+        position=tqdm._get_free_pos(),
+    ):
+        state = _load_pt_file(bin_file)
+        yield from state.items()
+        del state
+
+
+def multi_thread_pt_weights_iterator(
+    hf_weights_files: List[str],
+    max_workers: int,
+) -> Generator[Tuple[str, torch.Tensor], None, None]:
+    """Multi-Thread iterate over the weights in the model bin/pt files."""
+    enable_tqdm = (
+        not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0
+    )
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+        futures = [
+            executor.submit(_load_pt_file, bin_file) for bin_file in hf_weights_files
+        ]
+
+        if enable_tqdm:
+            futures_iter = tqdm(
+                concurrent.futures.as_completed(futures),
+                total=len(hf_weights_files),
+                desc="Multi-thread loading pt checkpoint shards",
+                disable=not enable_tqdm,
+                bar_format=BAR_FORMAT,
+            )
+        else:
+            futures_iter = concurrent.futures.as_completed(futures)
+
+        for future in futures_iter:
+            state = future.result()
+            yield from state.items()
+
+
+def get_gguf_extra_tensor_names(
+    gguf_file: str, gguf_to_hf_name_map: Dict[str, str]
+) -> List[str]:
+    import gguf
+
+    reader = gguf.GGUFReader(gguf_file)
+    expected_gguf_keys = set(gguf_to_hf_name_map.keys())
+    exact_gguf_keys = set([tensor.name for tensor in reader.tensors])
+    extra_keys = expected_gguf_keys - exact_gguf_keys
+    return [gguf_to_hf_name_map[key] for key in extra_keys]
+
+
+def gguf_quant_weights_iterator(
+    gguf_file: str, gguf_to_hf_name_map: Dict[str, str]
+) -> Generator[Tuple[str, torch.Tensor], None, None]:
+    """
+    Iterate over the quant weights in the model gguf files and convert
+    them to torch tensors
+    """
+
+    import gguf
+
+    reader = gguf.GGUFReader(gguf_file)
+
+    for tensor in reader.tensors:
+        if tensor.name in gguf_to_hf_name_map:
+            weight_type = tensor.tensor_type
+            name = gguf_to_hf_name_map[tensor.name]
+
+            if weight_type.name != "F32":
+                weight_type_name = name.replace("weight", "qweight_type")
+                weight_type = torch.tensor(weight_type)
+                yield weight_type_name, weight_type
+
+    for tensor in reader.tensors:
+        if tensor.name in gguf_to_hf_name_map:
+            weight = tensor.data
+            weight_type = tensor.tensor_type
+            name = gguf_to_hf_name_map[tensor.name]
+
+            if weight_type.name != "F32":
+                name = name.replace("weight", "qweight")
+            param = torch.tensor(weight)
+            yield name, param
+
+
+def convert_pyslice_to_tensor(x: Any) -> torch.Tensor:
+    """convert PySafeSlice object from safetensors to torch.Tensor
+
+    PySafeSlice object supports indexing, which is done before loading the
+    actual tensor and can reduce the amount of memory being read into the
+    memory. However, it does not support more advanced functionalities
+    like `.view()` or `.t()`. Therefore, if we need to modify the loaded
+    tensor with these more complicated operators, we need to convert to
+    tensor first.
+    """
+    if not isinstance(x, torch.Tensor):
+        x = x[:]
+    return x
+
+
+def default_weight_loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
+    """Default weight loader."""
+    try:
+        if param.numel() == 1 and loaded_weight.numel() == 1:
+            # Sometimes scalar values aren't considered tensors with shapes
+            # so if both param and loaded_weight are a scalar,
+            # "broadcast" instead of copy
+            param.data.fill_(loaded_weight.item())
+        else:
+            assert param.size() == loaded_weight.size(), (
+                f"Attempted to load weight ({loaded_weight.size()}) "
+                f"into parameter ({param.size()})"
+            )
+
+            param.data.copy_(loaded_weight)
+    except Exception:
+        # NOTE: This exception is added for the purpose of setting breakpoint to
+        # debug weight loading issues.
+        raise
+
+
+def row_parallel_weight_loader(
+    param: torch.Tensor, loaded_weight: torch.Tensor
+) -> None:
+    """Load weights that are row-parallelized."""
+    tp_rank = get_tensor_model_parallel_rank()
+    shard_dim = 0 if param.dim() != 1 else None
+
+    if shard_dim is not None:
+        shard_size = param.data.shape[shard_dim]
+        start_idx = tp_rank * shard_size
+        loaded_weight = loaded_weight.narrow(shard_dim, start_idx, shard_size)
+
+    return default_weight_loader(param, loaded_weight)
+
+
+LoaderFunction = Callable[[torch.Tensor, torch.Tensor], torch.Tensor]
+
+
+def sharded_weight_loader(shard_axis: int) -> LoaderFunction:
+    """Create a weight loader that shards the weights along the given axis"""
+
+    def loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
+        tp_rank = get_attention_tp_rank()
+
+        shard_size = param.data.shape[shard_axis]
+        start_idx = tp_rank * shard_size
+
+        if (
+            is_cpu()
+            and loaded_weight.size(0) % get_tensor_model_parallel_world_size() != 0
+            and loaded_weight.dim() == 1
+        ):
+            param_data = param.data  # view copy on param for uneven padding
+            param_data, loaded_weight = narrow_padded_param_and_loaded_weight(
+                param_data,
+                loaded_weight,
+                0,  # param_data_start
+                start_idx,
+                shard_axis,
+                shard_size,
+            )
+            return default_weight_loader(param_data, loaded_weight)
+        else:
+            loaded_weight = loaded_weight.narrow(shard_axis, start_idx, shard_size)
+            return default_weight_loader(param, loaded_weight)
+
+    return loader
+
+
+def composed_weight_loader(
+    loader: LoaderFunction, fn: Callable[[torch.Tensor], torch.Tensor]
+) -> LoaderFunction:
+    """Create a weight loader that post-processes the weights after loading"""
+
+    def composed_loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
+        loader(param, loaded_weight)
+        param.data.copy_(fn(param))
+        return
+
+    return composed_loader
+
+
+def runai_safetensors_weights_iterator(
+    hf_weights_files: List[str],
+) -> Generator[Tuple[str, torch.Tensor], None, None]:
+    """Iterate over the weights in the model safetensor files."""
+    from runai_model_streamer import SafetensorsStreamer
+
+    enable_tqdm = (
+        not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0
+    )
+
+    with SafetensorsStreamer() as streamer:
+        for st_file in tqdm(
+            hf_weights_files,
+            desc="Loading safetensors using Runai Model Streamer",
+            disable=not enable_tqdm,
+            bar_format=BAR_FORMAT,
+            position=tqdm._get_free_pos(),
+        ):
+            streamer.stream_file(st_file)
+            yield from streamer.get_tensors()
+
+
+def set_runai_streamer_env(load_config: LoadConfig):
+    if load_config.model_loader_extra_config:
+        extra_config = load_config.model_loader_extra_config
+
+        if "concurrency" in extra_config and isinstance(
+            extra_config.get("concurrency"), int
+        ):
+            os.environ["RUNAI_STREAMER_CONCURRENCY"] = str(
+                extra_config.get("concurrency")
+            )
+
+        if "memory_limit" in extra_config and isinstance(
+            extra_config.get("memory_limit"), int
+        ):
+            os.environ["RUNAI_STREAMER_MEMORY_LIMIT"] = str(
+                extra_config.get("memory_limit")
+            )
+
+    runai_streamer_s3_endpoint = os.getenv("RUNAI_STREAMER_S3_ENDPOINT")
+    aws_endpoint_url = os.getenv("AWS_ENDPOINT_URL")
+    if runai_streamer_s3_endpoint is None and aws_endpoint_url is not None:
+        os.environ["RUNAI_STREAMER_S3_ENDPOINT"] = aws_endpoint_url
+
+
+def initialize_dummy_weights(
+    model: torch.nn.Module,
+    low: float = -1e-3,
+    high: float = 1e-3,
+    seed: int = 1234,
+) -> None:
+    """Initialize model weights with random values.
+
+    The model weights must be randomly initialized for accurate performance
+    measurements. Additionally, the model weights should not cause NaNs in the
+    forward pass. We empirically found that initializing the weights with
+    values between -1e-3 and 1e-3 works well for most models.
+
+    We use per-parameter random seed, so that dummy weights are consistent,
+    even if the model is partitioned across multiple devices. When the seed
+    is fixed, the random values generated by this function only depends on
+    the parameter's number of elements and its data type.
+    """
+    for param in model.state_dict().values():
+        if torch.is_floating_point(param):
+            generator = torch.Generator(device=param.data.device)
+            generator.manual_seed(seed)
+            if torch.finfo(param.data.dtype).bits < 16:
+                # uniform_ doesn't support < 16-bit datatypes (FP8)
+                dtype = param.data.dtype
+                tmp_param = param.data.to(torch.float16)
+                tmp_param = tmp_param.uniform_(low, high, generator=generator).to(dtype)
+                param.data.copy_(tmp_param)
+            else:
+                param.uniform_(low, high, generator=generator)
+
+
+def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]:
+    """Remap the name of FP8 k/v_scale parameters.
+
+    This function handles the remapping of FP8 k/v_scale parameter names.
+    It detects if the given name ends with a suffix and attempts to remap
+    it to the expected name format in the model. If the remapped name is not
+    found in the params_dict, a warning is printed and None is returned.
+
+    Args:
+        name (str): The original loaded checkpoint parameter name.
+        params_dict (dict): Dictionary containing the model's named parameters.
+
+    Returns:
+        str: The remapped parameter name if successful, or the original name
+             if no remapping is needed.
+        None: If the remapped name is not found in params_dict.
+    """
+    if name.endswith(".kv_scale"):
+        print_warning_once(
+            "DEPRECATED. Found kv_scale in the checkpoint. "
+            "This format is deprecated in favor of separate k_scale and "
+            "v_scale tensors and will be removed in a future release. "
+            "Functionally, we will remap kv_scale to k_scale and duplicate "
+            "k_scale to v_scale"
+        )
+        # NOTE: we remap the deprecated kv_scale to k_scale
+        remapped_name = name.replace(".kv_scale", ".attn.k_scale")
+        if remapped_name not in params_dict:
+            print_warning_once(
+                f"Found kv_scale in the checkpoint (e.g. {name}), "
+                "but not found the expected name in the model "
+                f"(e.g. {remapped_name}). kv_scale is "
+                "not loaded."
+            )
+            return None
+        return remapped_name
+
+    possible_scale_names = [".k_scale", ".v_scale"]
+    # Patterns where modelopt stores scales under k_proj/v_proj
+    # but the model expects them under attn (RadixAttention)
+    modelopt_attn_prefixes = [".self_attn.", ".mixer."]
+    for scale_name in possible_scale_names:
+        if name.endswith(scale_name):
+            # Check if this is a modelopt-style scale under k_proj/v_proj
+            matched_prefix = None
+            for attn_prefix in modelopt_attn_prefixes:
+                if f"{attn_prefix}{scale_name[1]}_proj{scale_name}" in name:
+                    matched_prefix = attn_prefix
+                    break
+
+            if matched_prefix is not None:
+                remapped_name = name.replace(
+                    f"{matched_prefix}{scale_name[1]}_proj{scale_name}",
+                    f"{matched_prefix}attn{scale_name}",
+                )
+            else:
+                remapped_name = name.replace(scale_name, f".attn{scale_name}")
+            if remapped_name not in params_dict:
+                print_warning_once(
+                    f"Found {scale_name} in the checkpoint (e.g. {name}), "
+                    "but not found the expected name in the model "
+                    f"(e.g. {remapped_name}). {scale_name} is "
+                    "not loaded."
+                )
+                return None
+            return remapped_name
+
+    quark_scale_names = {
+        ".q_proj.output_scale": ".attn.q_scale",
+        ".k_proj.output_scale": ".attn.k_scale",
+        ".v_proj.output_scale": ".attn.v_scale",
+        "self_attn.prob_output_scale": ".attn.prob_scale",
+    }
+    for quark_scale_name, sglang_scale_name in quark_scale_names.items():
+        if name.endswith(quark_scale_name):
+            return name.replace(quark_scale_name, sglang_scale_name)
+
+    # If there were no matches, return the untouched param name
+    return name
+
+
+# Adapted from https://github.com/vllm-project/vllm/blob/68ad4e3a8d8a66fb2a43be57471ee13a8bec4ec0/vllm/model_executor/layers/quantization/schema.py
+class KVCacheQuantSchema(BaseModel):
+    dtype: str
+    # Each key is a TP rank. Each value is a dictionary mapping a TP rank's
+    # layer indices to their per-tensor KV cache scaling factor.
+    # TODO: Consider pulling this and its validation methods out into its
+    # own schema class (tricky as its members are variable)
+    scaling_factor: Dict[int, Dict[int, float]]
+
+    @model_validator(mode="after")
+    def check_is_fp8(self) -> "KVCacheQuantSchema":
+        assert self.dtype == "float8_e4m3fn", (
+            "Loaded scaling factors intended for KV cache dtype = "
+            f"{self.dtype} rather than float8_e4m3fn!"
+        )
+        return self
+
+    @model_validator(mode="after")
+    def check_tp_ranks(self, info: ValidationInfo) -> "KVCacheQuantSchema":
+        context = info.context
+        if context:
+            tp_size = context["tp_size"]
+            num_hidden_layers = context["num_hidden_layers"]
+            assert len(self.scaling_factor) == tp_size, (
+                f"Loaded dictionary has TP size {len(self.scaling_factor)} "
+                f"but LLM engine is currently running with TP size {tp_size}."
+            )
+            for tp_rank, layer_maps in self.scaling_factor.items():
+                assert len(layer_maps) == num_hidden_layers, (
+                    f"KV cache scales map for TP rank {tp_rank} is malformed. "
+                    f"Expected {num_hidden_layers} layers, got "
+                    f"{len(layer_maps)}."
+                )
+            for i in range(tp_size):
+                assert (
+                    i in self.scaling_factor
+                ), f"KV cache scales map for TP rank {i} not found."
+        return self
+
+    @model_validator(mode="after")
+    def check_current_rank(self, info: ValidationInfo) -> "KVCacheQuantSchema":
+        context = info.context
+        if context:
+            tp_rank = context["tp_rank"]
+            num_hidden_layers = context["num_hidden_layers"]
+            layer_scales_map = self.scaling_factor[tp_rank]
+            for i in range(num_hidden_layers):
+                assert i in layer_scales_map, (
+                    f"Could not find KV cache scales for layer {i} in "
+                    f"TP rank {tp_rank}."
+                )
+        return self
+
+
+class QuantParamSchema(BaseModel):
+    # TODO: Generalize and extend with more fields
+    # (e.g. weights/activations params) once functionality is enabled
+    model_config = ConfigDict(protected_namespaces=())
+    model_type: Optional[str]
+    kv_cache: KVCacheQuantSchema
+
+    @model_validator(mode="after")
+    def check_model_type(self, info: ValidationInfo) -> "QuantParamSchema":
+        context = info.context
+        if context:
+            model_type = context.get("model_type", None)
+            if model_type is not None:
+                assert model_type == self.model_type, (
+                    f"Model type is {model_type} but loaded "
+                    f"scaling factors belonging to different "
+                    f"model type {self.model_type}!"
+                )
+        return self
+
+
+def kv_cache_scales_loader(
+    filename: str,
+    tp_rank: int,
+    tp_size: int,
+    num_hidden_layers: int,
+    model_type: Optional[str],
+) -> Iterable[Tuple[int, float]]:
+    """
+    A simple utility to read in KV cache scaling factors that have been
+    previously serialized to disk. Used by the model to populate the appropriate
+    KV cache scaling factors. The serialization should represent a dictionary
+    whose keys are the TP ranks and values are another dictionary mapping layers
+    to their KV cache scaling factors.
+    """
+    try:
+        with open(filename) as f:
+            context = {
+                "model_type": model_type,
+                "num_hidden_layers": num_hidden_layers,
+                "tp_rank": tp_rank,
+                "tp_size": tp_size,
+            }
+            schema_dct = json.load(f)
+            schema = QuantParamSchema.model_validate(schema_dct, context=context)
+            layer_scales_map = schema.kv_cache.scaling_factor[tp_rank]
+            return layer_scales_map.items()
+    except FileNotFoundError:
+        logger.error("File or directory '%s' not found.", filename)
+    except json.JSONDecodeError:
+        logger.error("Error decoding JSON in file '%s'.", filename)
+    except Exception:
+        logger.error("An error occurred while reading '%s'.", filename)
+    # This section is reached if and only if any of the excepts are hit
+    # Return an empty iterable (list) => no KV cache scales are loaded
+    # which ultimately defaults to 1.0 scales
+    logger.warning(
+        "Defaulting to KV cache scaling factors = 1.0 for all "
+        "layers in TP rank %d as an error occurred during loading.",
+        tp_rank,
+    )
+    return []
+
+
+def get_actual_shard_size(shard_size, weight_start, weight_end):
+    if weight_end < weight_start:
+        return 0
+
+    return min(shard_size, weight_end - weight_start)
+
+
+def reset_param_data_if_needed(param_data, dim, start, length):
+    if length == 0:
+        return
+
+    assert length > 0, f"Length should be positive, but got {length}"
+
+    param_data.narrow(dim, start, length).zero_()
+    return
+
+
+def narrow_padded_param_and_loaded_weight(
+    param_data,
+    loaded_weight,
+    param_data_start,
+    weight_start,
+    dim,
+    shard_size,
+    narrow_weight=True,
+):
+    actual_shard_size = get_actual_shard_size(
+        shard_size, weight_start, loaded_weight.size(dim)
+    )
+
+    if narrow_weight:
+        if actual_shard_size > 0:
+            loaded_weight = loaded_weight.narrow(dim, weight_start, actual_shard_size)
+        else:
+            # No real data to load; create a dummy tensor filled with zeros
+            loaded_weight = torch.zeros_like(
+                param_data.narrow(dim, param_data_start, actual_shard_size)
+            )
+
+    # [Note] Reset padded weights to zero.
+    # If the actual shard size is less than the shard size, we need to reset
+    # the padded param_data to zero and then copy the loaded_weight into it.
+    reset_param_data_if_needed(
+        param_data,
+        dim,
+        param_data_start + actual_shard_size,
+        shard_size - actual_shard_size,
+    )
+
+    param_data = param_data.narrow(dim, param_data_start, actual_shard_size)
+
+    return param_data, loaded_weight
diff --git a/sglang/python/sglang/srt/models/afmoe.py b/sglang/python/sglang/srt/models/afmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..92a11b09af032682f5c84f424955efa26bbb733e
--- /dev/null
+++ b/sglang/python/sglang/srt/models/afmoe.py
@@ -0,0 +1,633 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Inference-only AfMoE model compatible with HuggingFace weights.
+
+AfMoE is a Mixture-of-Experts model with:
+- Gated attention with sigmoid gating
+- Q/K normalization with RMSNorm
+- Dual normalization (pre/post for both attention and MLP)
+- Sliding window attention for local layers
+- muP (maximal update parameterization) scaling support
+"""
+
+from __future__ import annotations
+
+import functools
+from typing import Iterable, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe.fused_moe_triton import fused_moe
+from sglang.srt.layers.moe.moe_runner import MoeRunnerConfig
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix
+
+
+def get_attention_sliding_window_size(config: PretrainedConfig) -> Optional[int]:
+    sliding_window = getattr(config, "sliding_window", None)
+    if sliding_window is None:
+        return None
+    if sliding_window <= 0:
+        return None
+    # Align with other local attention implementations (see gpt_oss).
+    return sliding_window - 1
+
+
+class AfmoeMLP(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        reduce_results: bool = True,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            reduce_results=reduce_results,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class AfmoeMoE(nn.Module):
+
+    @staticmethod
+    def _custom_routing_function(
+        hidden_states: torch.Tensor,
+        gating_output: torch.Tensor,
+        topk: int,
+        renormalize: bool,
+        *,
+        score_func: str,
+        expert_bias: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        logits = gating_output.to(torch.float32)
+        if score_func == "sigmoid":
+            scores = torch.sigmoid(logits)
+            if expert_bias is not None:
+                bias = expert_bias.to(scores.device, dtype=scores.dtype)
+                scores_for_choice = scores + bias
+                topk_ids = torch.topk(scores_for_choice, k=topk, dim=-1)[1]
+                topk_weights = scores.gather(dim=-1, index=topk_ids)
+            else:
+                topk_weights, topk_ids = torch.topk(scores, k=topk, dim=-1)
+        else:
+            if expert_bias is not None:
+                logits = logits + expert_bias.to(logits.device, dtype=logits.dtype)
+            probs = F.softmax(logits, dim=-1)
+            topk_weights, topk_ids = torch.topk(probs, k=topk, dim=-1)
+
+        if renormalize:
+            denom = topk_weights.sum(dim=-1, keepdim=True).clamp(min=1e-20)
+            topk_weights = topk_weights / denom
+
+        return topk_weights.to(torch.float32), topk_ids.to(torch.int32)
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.rank = get_tensor_model_parallel_rank()
+        self.tp_size = get_tensor_model_parallel_world_size()
+
+        self.n_routed_experts = getattr(config, "num_experts", None)
+        if self.n_routed_experts is None:
+            raise ValueError("AfmoeConfig must define `num_experts`.")
+        self.top_k = config.num_experts_per_tok
+        if self.tp_size > self.n_routed_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {self.n_routed_experts}."
+            )
+
+        self.score_func = getattr(config, "score_func", "softmax")
+        self.route_norm = getattr(config, "route_norm", True)
+        self.route_scale = float(getattr(config, "route_scale", 1.0))
+        self.n_group = getattr(config, "n_group", 1)
+        self.topk_group = getattr(config, "topk_group", 1)
+        self.use_grouped_topk = self.n_group is not None and self.n_group > 1
+        self.num_shared_experts = getattr(config, "num_shared_experts", 0)
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            self.n_routed_experts,
+            bias=False,
+            quant_config=None,
+            prefix=add_prefix("gate", prefix),
+        )
+
+        self.expert_bias = nn.Parameter(
+            torch.zeros(self.n_routed_experts, dtype=torch.float32),
+            requires_grad=False,
+        )
+
+        self.experts = nn.ModuleList(
+            [
+                AfmoeMLP(
+                    hidden_size=config.hidden_size,
+                    intermediate_size=config.moe_intermediate_size,
+                    hidden_act=config.hidden_act,
+                    quant_config=quant_config,
+                    reduce_results=False,
+                    prefix=add_prefix(f"experts.{idx}", prefix),
+                )
+                for idx in range(self.n_routed_experts)
+            ]
+        )
+        self.pack_params()
+
+        if self.num_shared_experts:
+            intermediate_size = config.moe_intermediate_size * self.num_shared_experts
+            self.shared_experts = AfmoeMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                reduce_results=False,
+                prefix=add_prefix("shared_experts", prefix),
+            )
+        else:
+            self.shared_experts = None
+
+        custom_routing_fn = None
+        correction_bias = None
+        if self.use_grouped_topk:
+            correction_bias = self.expert_bias
+        elif self.score_func == "sigmoid":
+            custom_routing_fn = functools.partial(
+                AfmoeMoE._custom_routing_function,
+                score_func=self.score_func,
+                expert_bias=self.expert_bias,
+            )
+
+        renormalize = self.route_norm if self.score_func == "sigmoid" else False
+        self.topk = TopK(
+            top_k=self.top_k,
+            renormalize=renormalize,
+            use_grouped_topk=self.use_grouped_topk,
+            num_expert_group=self.n_group if self.use_grouped_topk else None,
+            topk_group=self.topk_group if self.use_grouped_topk else None,
+            custom_routing_function=custom_routing_fn,
+            correction_bias=correction_bias,
+            routed_scaling_factor=self.route_scale,
+        )
+
+    def pack_params(self) -> None:
+        w1: list[torch.Tensor] = []
+        w2: list[torch.Tensor] = []
+        for expert in self.experts:
+            w1.append(expert.gate_up_proj.weight)
+            w2.append(expert.down_proj.weight)
+        self.w1 = torch._utils._flatten_dense_tensors(w1)
+        w1s = torch._utils._unflatten_dense_tensors(self.w1, w1)
+        for data, param in zip(w1s, w1):
+            param.data = data
+        self.w1 = self.w1.view(len(w1), *w1s[0].shape)
+
+        self.w2 = torch._utils._flatten_dense_tensors(w2)
+        w2s = torch._utils._unflatten_dense_tensors(self.w2, w2)
+        for data, param in zip(w2s, w2):
+            param.data = data
+        self.w2 = self.w2.view(len(w2), *w2s[0].shape)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        num_tokens, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+
+        shared_output = None
+        if self.shared_experts is not None:
+            shared_output = self.shared_experts(hidden_states)
+
+        router_logits, _ = self.gate(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        final_hidden_states = fused_moe.fused_moe(
+            hidden_states,
+            w1=self.w1,
+            w2=self.w2,
+            topk_output=topk_output,
+            moe_runner_config=MoeRunnerConfig(
+                inplace=True,
+                routed_scaling_factor=self.route_scale,
+            ),
+        )
+
+        if shared_output is not None:
+            final_hidden_states = final_hidden_states + shared_output
+
+        final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+        return final_hidden_states.view(num_tokens, hidden_dim)
+
+
+class AfmoeAttention(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+
+        self.head_dim = getattr(config, "head_dim", hidden_size // self.total_num_heads)
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
+        self.rotary_dim = int(self.head_dim * partial_rotary_factor)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+
+        layer_types = getattr(config, "layer_types", None)
+        self.is_local_attention = (
+            layer_types is not None and layer_types[layer_id] == "sliding_attention"
+        )
+        sliding_window = (
+            get_attention_sliding_window_size(config) if self.is_local_attention else -1
+        )
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+        self.gate_proj = ColumnParallelLinear(
+            hidden_size,
+            self.total_num_heads * self.head_dim,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.rotary_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            is_neox_style=True,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            sliding_window_size=sliding_window,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+        eps = getattr(config, "rms_norm_eps", 1e-5)
+        self.q_norm = RMSNorm(self.head_dim, eps=eps)
+        self.k_norm = RMSNorm(self.head_dim, eps=eps)
+        self.sliding_window = sliding_window
+
+    def _apply_qk_norm(
+        self, q: torch.Tensor, k: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        q_heads = self.q_norm(q.reshape(-1, self.head_dim))
+        k_heads = self.k_norm(k.reshape(-1, self.head_dim))
+        q = q_heads.view(q.shape)
+        k = k_heads.view(k.shape)
+        return q, k
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self._apply_qk_norm(q, k)
+
+        if self.is_local_attention:
+            q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+
+        gate_vals, _ = self.gate_proj(hidden_states)
+        attn_output = attn_output * torch.sigmoid(gate_vals)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class AfmoeDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.layer_id = layer_id
+
+        self.self_attn = AfmoeAttention(
+            config=config,
+            hidden_size=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+
+        use_moe = False
+        if hasattr(config, "num_dense_layers"):
+            use_moe = layer_id >= config.num_dense_layers
+        elif (
+            getattr(config, "num_experts", None) is not None
+            and hasattr(config, "first_k_dense_replace")
+            and hasattr(config, "moe_layer_freq")
+        ):
+            base = config.first_k_dense_replace
+            freq = config.moe_layer_freq
+            use_moe = layer_id >= base and (layer_id - base) % freq == 0
+
+        if use_moe:
+            self.mlp = AfmoeMoE(
+                config=config,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+            )
+        else:
+            self.mlp = AfmoeMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+            )
+
+        eps = getattr(config, "rms_norm_eps", 1e-5)
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=eps)
+        self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=eps)
+        self.pre_mlp_layernorm = RMSNorm(config.hidden_size, eps=eps)
+        self.post_mlp_layernorm = RMSNorm(config.hidden_size, eps=eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        attn_residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states = self.self_attn(positions, hidden_states, forward_batch)
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = attn_residual + hidden_states
+
+        mlp_residual = hidden_states
+        hidden_states = self.pre_mlp_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.post_mlp_layernorm(hidden_states)
+        hidden_states = mlp_residual + hidden_states
+
+        return hidden_states
+
+
+class AfmoeModel(nn.Module):
+
+    fall_back_to_pt_during_load = False
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+        )
+        self.layers = nn.ModuleList(
+            [
+                AfmoeDecoderLayer(
+                    config,
+                    layer_id,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{layer_id}", prefix),
+                )
+                for layer_id in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+
+        if getattr(self.config, "mup_enabled", False):
+            hidden_states = hidden_states * (self.config.hidden_size**0.5)
+
+        for layer in self.layers:
+            hidden_states = layer(positions, hidden_states, forward_batch)
+        hidden_states = self.norm(hidden_states)
+        return hidden_states
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.embed_tokens
+
+
+class AfmoeForCausalLM(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = AfmoeModel(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("lm_head", prefix),
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.embed_tokens
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def get_attention_sliding_window_size(self) -> Optional[int]:
+        return get_attention_sliding_window_size(self.config)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> None:
+        stacked_params_mapping = [
+            # (param_name, weight_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            # Skip rotary embedding inverse frequencies
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            # Remap router gate weights: HF uses .mlp.router.gate., SGLang uses .mlp.gate.
+            if ".mlp.router.gate." in name:
+                name = name.replace(".mlp.router.gate.", ".mlp.gate.")
+
+            # Handle stacked params (qkv_proj, gate_up_proj)
+            handled = False
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                # Skip gate_proj/up_proj stacking for self_attn (attention uses separate gate_proj)
+                if ".self_attn." in name and weight_name in {"gate_proj", "up_proj"}:
+                    continue
+
+                new_name = name.replace(weight_name, param_name)
+                # Skip if parameter doesn't exist (e.g., bias for layers without bias)
+                if new_name not in params_dict:
+                    handled = True
+                    break
+
+                param = params_dict[new_name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight, shard_id)
+                handled = True
+                break
+
+            if handled:
+                continue
+
+            # Load remaining weights directly
+            if name in params_dict:
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = AfmoeForCausalLM
diff --git a/sglang/python/sglang/srt/models/apertus.py b/sglang/python/sglang/srt/models/apertus.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca84264b9362036020f6690163cecb19b63738df
--- /dev/null
+++ b/sglang/python/sglang/srt/models/apertus.py
@@ -0,0 +1,685 @@
+# Copyright 2025 The SwissAI Initiative
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/llama.py#L1
+"""Inference-only Apertus model compatible with HuggingFace weights."""
+
+import logging
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from transformers import ApertusConfig
+
+from sglang.srt.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.layers.activation import XIELU
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer, get_layer_id
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    kv_cache_scales_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import add_prefix, make_layers
+
+logger = logging.getLogger(__name__)
+
+
+class ApertusMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        bias: bool = False,
+        prefix: str = "",
+        reduce_results: bool = True,
+    ) -> None:
+        super().__init__()
+        self.up_proj = ColumnParallelLinear(
+            hidden_size,
+            intermediate_size,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+            reduce_results=reduce_results,
+        )
+        if hidden_act != "xielu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only xIELU is supported for now."
+            )
+        self.act_fn = XIELU()
+
+    def forward(
+        self,
+        x,
+        forward_batch=None,
+        use_reduce_scatter: bool = False,
+    ):
+        # note: with xielu, there's no gate_proj
+        x, _ = self.up_proj(x)
+        x = self.act_fn(x)
+        x, _ = self.down_proj(
+            x,
+            skip_all_reduce=use_reduce_scatter,
+        )
+        return x
+
+
+class ApertusAttention(nn.Module):
+    def __init__(
+        self,
+        config: ApertusConfig,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        rope_is_neox_style: bool = True,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        bias: bool = False,
+        bias_o_proj: bool = False,
+    ) -> None:
+        super().__init__()
+        self.layer_id = layer_id
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        # MistralConfig has an optional head_dim introduced by Mistral-Nemo
+        self.head_dim = getattr(
+            config, "head_dim", self.hidden_size // self.total_num_heads
+        )
+        partial_rotary_factor = getattr(config, "partial_rotary_factor", 1)
+        self.rotary_dim = int(partial_rotary_factor * self.head_dim)
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=bias_o_proj,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.rotary_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            is_neox_style=rope_is_neox_style,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+        self.q_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+        self.k_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q = self.q_norm(q.contiguous().view(-1, self.head_dim)).view_as(q)
+        k = self.k_norm(k.contiguous().view(-1, self.head_dim)).view_as(k)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class ApertusDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: ApertusConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        if rope_scaling is not None and getattr(
+            config, "original_max_position_embeddings", None
+        ):
+            rope_scaling["original_max_position_embeddings"] = (
+                config.original_max_position_embeddings
+            )
+        rope_is_neox_style = getattr(config, "rope_is_neox_style", True)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        # Support llamafy/Qwen-Qwen2.5-7B-Instruct-llamafied with attention_bias
+        # Support internlm/internlm-7b with bias
+        attention_bias = getattr(config, "attention_bias", False) or getattr(
+            config, "bias", False
+        )
+        bias_o_proj = attention_bias
+        # support internlm/internlm3-8b with qkv_bias
+        if hasattr(config, "qkv_bias"):
+            attention_bias = config.qkv_bias
+        self.self_attn = ApertusAttention(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            rope_is_neox_style=rope_is_neox_style,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+            bias=attention_bias,
+            bias_o_proj=bias_o_proj,
+        )
+        self.mlp = ApertusMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            bias=getattr(config, "mlp_bias", False),
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.feedforward_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.attention_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.attention_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.feedforward_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+class ApertusModel(nn.Module):
+    def __init__(
+        self,
+        config: ApertusConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.quant_config = quant_config
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.org_vocab_size = config.vocab_size
+        self.pp_group = get_pp_group()
+        if self.pp_group.is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("embed_tokens", prefix),
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: ApertusDecoderLayer(
+                config=config, quant_config=quant_config, layer_id=idx, prefix=prefix
+            ),
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix="model.layers",
+        )
+
+        if self.pp_group.is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer(return_tuple=True)
+        self.layers_to_capture = []
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, List[torch.Tensor]], PPProxyTensors]:
+        if self.pp_group.is_first_rank:
+            if input_embeds is None:
+                hidden_states = self.embed_tokens(input_ids)
+            else:
+                hidden_states = input_embeds
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+            # FIXME(@ying): reduce the number of proxy tensors by not fusing layer norms
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+            deferred_norm = None
+
+        aux_hidden_states = []
+        for i in range(self.start_layer, self.end_layer):
+            if i in self.layers_to_capture:
+                aux_hidden_states.append(hidden_states + residual)
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors(
+                {
+                    "hidden_states": hidden_states,
+                    "residual": residual,
+                }
+            )
+        else:
+            hidden_states, _ = self.norm(hidden_states, residual)
+
+        if len(aux_hidden_states) == 0:
+            return hidden_states
+
+        return hidden_states, aux_hidden_states
+
+    # If this function is called, it should always initialize KV cache scale
+    # factors (or else raise an exception). Thus, handled exceptions should
+    # make sure to leave KV cache scale factors in a known good (dummy) state
+    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+        tp_size = get_tensor_model_parallel_world_size()
+        tp_rank = get_tensor_model_parallel_rank()
+        for layer_idx, scaling_factor in kv_cache_scales_loader(
+            quantization_param_path,
+            tp_rank,
+            tp_size,
+            self.config.num_hidden_layers,
+            self.config.__class__.model_type,
+        ):
+            if not isinstance(self.layers[layer_idx], nn.Identity):
+                layer_self_attn = self.layers[layer_idx].self_attn
+
+            if hasattr(layer_self_attn.attn, "k_scale"):
+                layer_self_attn.attn.k_scale = scaling_factor
+                layer_self_attn.attn.v_scale = scaling_factor
+            else:
+                raise RuntimeError(
+                    "Self attention has no KV cache scaling " "factor attribute!"
+                )
+
+
+class ApertusForCausalLM(nn.Module):
+    # LoRA specific attributes
+    embedding_modules = {
+        "embed_tokens": "input_embeddings",
+        "lm_head": "output_embeddings",
+    }
+    embedding_padding_modules = ["lm_head"]
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    # in TP, these weights are partitioned along the column dimension (dim=-1)
+    column_parallel_weights_modules = [".down_proj.", ".o_proj."]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        ".q_proj": (".qkv_proj", 0),
+        ".k_proj": (".qkv_proj", 1),
+        ".v_proj": (".qkv_proj", 2),
+    }
+
+    def __init__(
+        self,
+        config: ApertusConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = self._init_model(config, quant_config, add_prefix("model", prefix))
+        if self.config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+                use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
+            )
+        self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+        self.stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+        ]
+
+        self.capture_aux_hidden_states = False
+
+    def _init_model(
+        self,
+        config: ApertusConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        return ApertusModel(config, quant_config=quant_config, prefix=prefix)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = False,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> LogitsProcessorOutput:
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            input_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+
+        aux_hidden_states = None
+        if self.capture_aux_hidden_states:
+            hidden_states, aux_hidden_states = hidden_states
+
+        if self.pp_group.is_last_rank:
+            if not get_embedding:
+                return self.logits_processor(
+                    input_ids,
+                    hidden_states,
+                    self.lm_head,
+                    forward_batch,
+                    aux_hidden_states,
+                )
+            else:
+                return self.pooler(hidden_states, forward_batch)
+        else:
+            return hidden_states
+
+    @torch.no_grad()
+    def forward_split_prefill(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        split_interval: Tuple[int, int],  # [start, end) 0-based
+        input_embeds: torch.Tensor = None,
+    ) -> Optional[LogitsProcessorOutput]:
+        start, end = split_interval
+        # embed
+        if start == 0:
+            if input_embeds is None:
+                forward_batch.hidden_states = self.model.embed_tokens(input_ids)
+            else:
+                forward_batch.hidden_states = input_embeds
+        # decoder layer
+        for i in range(start, end):
+            layer = self.model.layers[i]
+            forward_batch.hidden_states, forward_batch.residual = layer(
+                positions,
+                forward_batch.hidden_states,
+                forward_batch,
+                forward_batch.residual,
+            )
+
+        if end == self.model.config.num_hidden_layers:
+            # norm
+            hidden_states, _ = self.model.norm(
+                forward_batch.hidden_states, forward_batch.residual
+            )
+            forward_batch.hidden_states = hidden_states
+            # logits process
+            result = self.logits_processor(
+                input_ids, forward_batch.hidden_states, self.lm_head, forward_batch
+            )
+        else:
+            result = None
+
+        return result
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.embed_tokens
+
+    def get_module_name_from_weight_name(self, name):
+        for param_name, weight_name, shard_id, num_shard in self.stacked_params_mapping:
+            if weight_name in name:
+                return (
+                    name.replace(weight_name, param_name)[: -len(".weight")],
+                    num_shard,
+                )
+        return name[: -len(".weight")], 1
+
+    def get_num_params(self):
+        params_dict = dict(self.named_parameters())
+        return len(params_dict)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+        ]
+
+        params_dict = dict(self.named_parameters())
+
+        for name, buffer in self.named_buffers():
+            if name.endswith(".beta") or name.endswith(".eps"):
+                params_dict[name] = buffer
+
+        for name, loaded_weight in weights:
+            layer_id = get_layer_id(name)
+            if (
+                layer_id is not None
+                and hasattr(self.model, "start_layer")
+                and (
+                    layer_id < self.model.start_layer
+                    or layer_id >= self.model.end_layer
+                )
+            ):
+                continue
+            if "rotary_emb.inv_freq" in name or "projector" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            if name.startswith("model.vision_tower") and name not in params_dict:
+                continue
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+            # Handle FP8 kv-scale remapping
+            if "scale" in name:
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Skip loading kv_scale from ckpts towards new design.
+                if name.endswith(".kv_scale") and name not in params_dict:
+                    continue
+                if name in params_dict.keys():
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+                else:
+                    logger.warning(f"Parameter {name} not found in params_dict")
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        del self.lm_head.weight
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    def get_embed(self):
+        return self.model.embed_tokens.weight
+
+    def set_embed(self, embed):
+        # NOTE: If draft hidden size != target hidden size, the embed weight cannot be shared for EAGLE3
+        if (
+            hasattr(self.config, "target_hidden_size")
+            and self.config.target_hidden_size != self.config.hidden_size
+        ):
+            return
+        del self.model.embed_tokens.weight
+        self.model.embed_tokens.weight = embed
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+        self.model.load_kv_cache_scales(quantization_param_path)
+
+    def set_eagle3_layers_to_capture(self, layer_ids: Optional[List[int]] = None):
+        if not self.pp_group.is_last_rank:
+            return
+
+        if layer_ids is None:
+            self.capture_aux_hidden_states = True
+            num_layers = self.config.num_hidden_layers
+            self.model.layers_to_capture = [2, num_layers // 2, num_layers - 3]
+        else:
+            self.capture_aux_hidden_states = True
+            # we plus 1 here because in sglang, for the ith layer, it takes the output
+            # of the (i-1)th layer as aux hidden state
+            self.model.layers_to_capture = [val + 1 for val in layer_ids]
+
+
+EntryClass = [ApertusForCausalLM]
diff --git a/sglang/python/sglang/srt/models/arcee.py b/sglang/python/sglang/srt/models/arcee.py
new file mode 100644
index 0000000000000000000000000000000000000000..5afd5f34f5dd3337ff49222f3739c8aa194d86ae
--- /dev/null
+++ b/sglang/python/sglang/srt/models/arcee.py
@@ -0,0 +1,532 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Inference-only Arcee Foundational Model (AFM) compatible with HuggingFace weights."""
+
+import logging
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from transformers import LlamaConfig
+
+from sglang.srt.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.layers.activation import get_act_fn
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer, get_layer_id
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    kv_cache_scales_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import add_prefix, make_layers
+
+logger = logging.getLogger(__name__)
+
+
+class ArceeMLP(nn.Module):
+    """
+    MLP block for the Arcee model, using a ReLU-squared activation function.
+    This differs from the Llama SwiGLU activation.
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        reduce_results: bool = True,
+    ) -> None:
+        super().__init__()
+        # Arcee uses a single up-projection, not a merged gate/up projection.
+        self.up_proj = ColumnParallelLinear(
+            hidden_size,
+            intermediate_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+            reduce_results=reduce_results,
+        )
+        if hidden_act != "relu2":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Arcee model in SGLang only supports 'relu2'."
+            )
+        # The activation function is relu(x)^2
+        self.act_fn = get_act_fn("relu2")
+
+    def forward(self, x, forward_batch=None):
+        x, _ = self.up_proj(x)
+        x = self.act_fn(x)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class ArceeAttention(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        rope_is_neox_style: bool = True,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        bias: bool = False,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = getattr(config, "head_dim", None)
+        if self.head_dim is None:
+            self.head_dim = self.hidden_size // self.total_num_heads
+        self.partial_rotary_factor = getattr(config, "partial_rotary_factor", 1)
+        self.rotary_dim = int(self.partial_rotary_factor * self.head_dim)
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.rotary_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            is_neox_style=rope_is_neox_style,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class ArceeDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        if rope_scaling is not None and getattr(
+            config, "original_max_position_embeddings", None
+        ):
+            rope_scaling["original_max_position_embeddings"] = (
+                config.original_max_position_embeddings
+            )
+        rope_is_neox_style = getattr(config, "rope_is_neox_style", True)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        attention_bias = getattr(config, "attention_bias", False) or getattr(
+            config, "bias", False
+        )
+        self.self_attn = ArceeAttention(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            rope_is_neox_style=rope_is_neox_style,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+            bias=attention_bias,
+        )
+        self.mlp = ArceeMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+class ArceeModel(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.pp_group = get_pp_group()
+        if self.pp_group.is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("embed_tokens", prefix),
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: ArceeDecoderLayer(
+                config=config, quant_config=quant_config, layer_id=idx, prefix=prefix
+            ),
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix="model.layers",
+        )
+
+        if self.pp_group.is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer(return_tuple=True)
+        self.layers_to_capture = []
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, List[torch.Tensor]], PPProxyTensors]:
+        if self.pp_group.is_first_rank:
+            if input_embeds is None:
+                hidden_states = self.embed_tokens(input_ids)
+            else:
+                hidden_states = input_embeds
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+
+        aux_hidden_states = []
+        for i in range(self.start_layer, self.end_layer):
+            if i in self.layers_to_capture:
+                aux_hidden_states.append(hidden_states + residual)
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors(
+                {
+                    "hidden_states": hidden_states,
+                    "residual": residual,
+                }
+            )
+        else:
+            hidden_states, _ = self.norm(hidden_states, residual)
+
+        if len(aux_hidden_states) == 0:
+            return hidden_states
+
+        return hidden_states, aux_hidden_states
+
+    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+        tp_size = get_tensor_model_parallel_world_size()
+        tp_rank = get_tensor_model_parallel_rank()
+        for layer_idx, scaling_factor in kv_cache_scales_loader(
+            quantization_param_path,
+            tp_rank,
+            tp_size,
+            self.config.num_hidden_layers,
+            self.config.__class__.model_type,
+        ):
+            if not isinstance(self.layers[layer_idx], nn.Identity):
+                layer_self_attn = self.layers[layer_idx].self_attn
+
+            if hasattr(layer_self_attn.attn, "k_scale"):
+                layer_self_attn.attn.k_scale = scaling_factor
+                layer_self_attn.attn.v_scale = scaling_factor
+            else:
+                raise RuntimeError(
+                    "Self attention has no KV cache scaling factor attribute!"
+                )
+
+
+class ArceeForCausalLM(nn.Module):
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        # Note: gate_proj is removed compared to Llama
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    # in TP, these weights are partitioned along the column dimension (dim=-1)
+    column_parallel_weights_modules = [".down_proj.", ".o_proj."]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        # Note: gate_proj and up_proj are removed as they are not stacked in ArceeMLP
+        ".q_proj": (".qkv_proj", 0),
+        ".k_proj": (".qkv_proj", 1),
+        ".v_proj": (".qkv_proj", 2),
+    }
+
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = self._init_model(config, quant_config, add_prefix("model", prefix))
+        # Arcee does not tie word embeddings
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("lm_head", prefix),
+            use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
+        )
+        self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+        # Parameters that are stacked in a single tensor in this model
+        self.stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+        ]
+        self.capture_aux_hidden_states = False
+
+    def _init_model(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        return ArceeModel(config, quant_config=quant_config, prefix=prefix)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = False,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> LogitsProcessorOutput:
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            input_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+
+        aux_hidden_states = None
+        if self.capture_aux_hidden_states:
+            hidden_states, aux_hidden_states = hidden_states
+
+        if self.pp_group.is_last_rank:
+            if not get_embedding:
+                return self.logits_processor(
+                    input_ids,
+                    hidden_states,
+                    self.lm_head,
+                    forward_batch,
+                    aux_hidden_states,
+                )
+            else:
+                return self.pooler(hidden_states, forward_batch)
+        else:
+            return hidden_states
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.embed_tokens
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        params_dict = dict(self.named_parameters())
+
+        for name, loaded_weight in weights:
+            layer_id = get_layer_id(name)
+            if (
+                layer_id is not None
+                and hasattr(self.model, "start_layer")
+                and (
+                    layer_id < self.model.start_layer
+                    or layer_id >= self.model.end_layer
+                )
+            ):
+                continue
+            if "rotary_emb.inv_freq" in name or "projector" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                continue
+
+            # Handle FP8 kv-scale remapping
+            if "scale" in name:
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+
+            is_stacked = False
+            for param_name, weight_name, shard_id in self.stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+
+                name = name.replace(weight_name, param_name)
+                if name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                is_stacked = True
+                break
+
+            if not is_stacked:
+                if name in params_dict:
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+                else:
+                    logger.warning(f"Parameter {name} not found in model.")
+
+    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+        self.model.load_kv_cache_scales(quantization_param_path)
+
+
+EntryClass = [ArceeForCausalLM]
diff --git a/sglang/python/sglang/srt/models/baichuan.py b/sglang/python/sglang/srt/models/baichuan.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d060b88182ec0e2cb2ec001710f7c0bc8efa8ec
--- /dev/null
+++ b/sglang/python/sglang/srt/models/baichuan.py
@@ -0,0 +1,448 @@
+# coding=utf-8
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only BaiChuan model compatible with HuggingFace weights."""
+
+import math
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix, is_npu
+
+_is_npu = is_npu()
+
+
+def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
+    closest_power_of_2 = 2 ** math.floor(math.log2(total_num_heads))
+    base = torch.tensor(
+        2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))),
+        dtype=torch.float32,
+    )
+    powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32)
+    slopes = torch.pow(base, powers)
+
+    if closest_power_of_2 != total_num_heads:
+        extra_base = torch.tensor(
+            2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3))),
+            dtype=torch.float32,
+        )
+        num_remaining_heads = min(
+            closest_power_of_2, total_num_heads - closest_power_of_2
+        )
+        extra_powers = torch.arange(
+            start=1, end=1 + 2 * num_remaining_heads, step=2, dtype=torch.int32
+        )
+        slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0)
+    return slopes
+
+
+class BaiChuanMLP(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class BaiChuanAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        position_embedding: str,
+        rope_theta: float = 10000,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        layer_id: int = 0,
+        dtype: Optional[torch.dtype] = torch.bfloat16,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        self.total_num_kv_heads = self.total_num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.head_dim = hidden_size // self.total_num_heads
+        self.position_embedding = position_embedding
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.num_heads = self.num_kv_heads
+
+        # pylint: disable=invalid-name
+        self.W_pack = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("W_pack", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+        self.scaling = self.head_dim**-0.5
+
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+        # Create the alibi slopes and slice them.
+        if self.position_embedding == "ALIBI":
+            tp_rank = get_tensor_model_parallel_rank()
+            head_start = tp_rank * self.num_heads
+            head_end = (tp_rank + 1) * self.num_heads
+            alibi_slopes = _get_alibi_slopes(self.total_num_heads)
+            alibi_slopes = alibi_slopes[head_start:head_end]
+            self.alibi_slopes = torch.tensor(
+                alibi_slopes, dtype=dtype, device="npu" if _is_npu else "cuda"
+            )
+        else:
+            self.rotary_emb = get_rope(
+                self.head_dim,
+                rotary_dim=self.head_dim,
+                max_position=self.max_position_embeddings,
+                base=self.rope_theta,
+            )
+
+        self.attn_kwargs = {}
+        if self.position_embedding == "ALIBI" and _is_npu:
+            self.attn_kwargs["slopes"] = self.alibi_slopes
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.W_pack(hidden_states)
+        q, k, v = qkv.chunk(chunks=3, dim=-1)
+        if self.position_embedding != "ALIBI":
+            q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch, **self.attn_kwargs)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class BaiChuanDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        position_embedding: str,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        self.self_attn = BaiChuanAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            position_embedding=position_embedding,
+            rope_theta=rope_theta,
+            layer_id=layer_id,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.mlp = BaiChuanMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+class BaiChuanModel(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        position_embedding: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+        self.layers = nn.ModuleList(
+            [
+                BaiChuanDecoderLayer(
+                    config,
+                    layer_id=i,
+                    position_embedding=position_embedding,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{i}", prefix),
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.embed_tokens(input_ids)
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class BaiChuanBaseForCausalLM(nn.Module):
+    packed_modules_mapping = {
+        "W_pack": ["W_pack"],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "W_pack",
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+    ]
+    embedding_modules = {
+        "embed_tokens": ["embed_tokens"],
+    }
+    embedding_padding_modules = []
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        position_embedding: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.config = config
+
+        self.quant_config = quant_config
+        self.model = BaiChuanModel(
+            config, position_embedding, quant_config, prefix=add_prefix("model", prefix)
+        )
+        if self.config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+            )
+        self.logits_processor = LogitsProcessor(config)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if name == "lm_head.weight":
+                # Unlike Baichuan, Baichuan2 normalizes the head weights.
+                # Refer to:
+                # https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/84603cde5ebffb6084e476cfaeceaf0b8b91fe54/modeling_baichuan.py#L508
+                # Distinguish between Baichuan and Baichuan2 by checking the
+                # vocab size. This is suggested by
+                # https://github.com/vllm-project/vllm/pull/1022#discussion_r1325652704
+                is_baichuan2 = self.config.vocab_size == 125696
+                if is_baichuan2:
+                    loaded_weight = torch.nn.functional.normalize(loaded_weight)
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+class BaichuanForCausalLM(BaiChuanBaseForCausalLM):
+    """Baichuan 13B and Baichuan2 7B/13B."""
+
+    def __init__(
+        self,
+        config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        if config.hidden_size == 4096:  # baichuan2 7b
+            super().__init__(config, "ROPE", quant_config, prefix=prefix)
+        else:  # baichuan 13b, baichuan2 13b
+            super().__init__(config, "ALIBI", quant_config, prefix=prefix)
+
+
+EntryClass = [BaichuanForCausalLM]
diff --git a/sglang/python/sglang/srt/models/bailing_moe.py b/sglang/python/sglang/srt/models/bailing_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..38eb17aba6733a0fa04a33f70dc067caa03758c6
--- /dev/null
+++ b/sglang/python/sglang/srt/models/bailing_moe.py
@@ -0,0 +1,1056 @@
+# coding=utf-8
+# Copyright 2023 Antgroup and The HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""SGLang BailingMoE model."""
+
+import logging
+from typing import Iterable, List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_world_size,
+    parallel_state,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation
+from sglang.srt.eplb.expert_location_dispatch import ExpertLocationDispatchInfo
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.communicator import (
+    LayerCommunicator,
+    LayerScatterModes,
+    enable_moe_dense_fully_dp,
+)
+from sglang.srt.layers.dp_attention import (
+    get_attention_dp_size,
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe import (
+    get_deepep_mode,
+    get_moe_a2a_backend,
+    should_use_flashinfer_cutlass_moe_fp4_allgather,
+)
+from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
+from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+from sglang.srt.layers.moe.token_dispatcher import DeepEPDispatcher
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.moe.utils import filter_moe_weight_param_global_expert
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.utils import (
+    apply_qk_norm,
+    create_fused_set_kv_buffer_arg,
+    enable_fused_set_kv_buffer,
+)
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import add_prefix, is_cuda, is_non_idle_and_non_empty, make_layers
+
+LoraConfig = None
+logger = logging.getLogger(__name__)
+_is_cuda = is_cuda()
+
+
+class BailingMoEMLP(nn.Module):
+    def __init__(
+        self,
+        intermediate_size: int,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        reduce_results: Optional[bool] = True,
+        prefix: str = "",
+        tp_rank: Optional[int] = None,
+        tp_size: Optional[int] = None,
+    ) -> None:
+        super().__init__()
+        self.tp_size = tp_size
+
+        self.gate_up_proj = MergedColumnParallelLinear(
+            config.hidden_size,
+            [intermediate_size] * 2,
+            bias=config.use_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+            tp_rank=tp_rank,
+            tp_size=tp_size,
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            config.hidden_size,
+            bias=config.use_bias,
+            reduce_results=reduce_results,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+            tp_rank=tp_rank,
+            tp_size=tp_size,
+        )
+
+        if config.hidden_act != "silu":
+            raise ValueError("Unsupported activation. Only silu is supported for now.")
+        self.act_fn = SiluAndMul()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: Optional[ForwardBatch] = None,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+    ) -> torch.Tensor:
+        if (self.tp_size == 1) and hidden_states.shape[0] == 0:
+            return hidden_states
+
+        gate_up, _ = self.gate_up_proj(hidden_states)
+        hidden_states = self.act_fn(gate_up)
+        hidden_states, _ = self.down_proj(
+            hidden_states, skip_all_reduce=should_allreduce_fusion or use_reduce_scatter
+        )
+        return hidden_states
+
+
+class BailingMoEGate(nn.Module):
+    def __init__(
+        self,
+        config,
+        params_dtype: Optional[torch.dtype] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+        self.params_dtype = params_dtype
+        self.weight = nn.Parameter(
+            torch.empty(
+                (config.num_experts, config.hidden_size),
+                dtype=self.params_dtype,
+            ),
+        )
+        if getattr(config, "moe_router_enable_expert_bias", False):
+            self.expert_bias = nn.Parameter(
+                torch.empty((config.num_experts,), dtype=torch.float32),
+            )
+        else:
+            self.expert_bias = None
+
+    def forward(self, hidden_states):
+        logits = F.linear(hidden_states.to(self.weight.dtype), self.weight, None).to(
+            hidden_states.dtype
+        )
+        return logits
+
+
+class BailingMoESparseMoeBlock(nn.Module):
+    def __init__(
+        self,
+        layer_id: int,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        alt_stream: Optional[torch.cuda.Stream] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.layer_id = layer_id
+        self.alt_stream = alt_stream
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.top_k = config.num_experts_per_tok
+        self.norm_topk_prob = config.norm_topk_prob
+        self.hidden_size = config.hidden_size
+        self.num_shared_experts = config.num_shared_experts
+        self.routed_scaling_factor = getattr(config, "routed_scaling_factor", 1.0)
+        self.score_function = getattr(config, "score_function", None)
+
+        if config.hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {config.hidden_act}. "
+                "Only silu is supported for now."
+            )
+
+        # Gate always runs at half / full precision for now.
+        router_dtype = getattr(config, "router_dtype", None)
+        if router_dtype is None:
+            self.router_dtype = None
+        elif router_dtype == "fp32":
+            self.router_dtype = torch.float32
+        else:
+            self.router_dtype = torch.bfloat16
+
+        # TODO global_server_args.ep_num_redundant_experts is used for eplb, not supported now
+        assert get_global_server_args().ep_num_redundant_experts == 0
+        # check group topk
+        self.num_expert_group = getattr(config, "n_group", 0)
+        self.topk_group = getattr(config, "topk_group", 0)
+        if self.num_expert_group > 0 or self.topk_group > 0:
+            assert (
+                self.num_expert_group > 0
+                and 0 < self.topk_group <= self.num_expert_group
+            )
+            self.use_grouped_topk = True
+        else:
+            self.num_expert_group = self.topk_group = None
+            self.use_grouped_topk = False
+
+        self.num_experts = (
+            config.num_experts + get_global_server_args().ep_num_redundant_experts
+        )
+
+        self.gate = BailingMoEGate(
+            config=config,
+            params_dtype=self.router_dtype,
+            prefix=add_prefix("gate", prefix),
+        )
+        self.correction_bias = (
+            self.gate.expert_bias.data if self.gate.expert_bias is not None else None
+        )
+
+        if self.score_function is not None:
+            assert (
+                self.score_function == "softmax" and self.correction_bias is None
+            ) or (
+                self.score_function == "sigmoid" and self.correction_bias is not None
+            ), "score_function and correction_bias should be in 2 combination (softmax, None) or (sigmoid, not None)"
+
+        self.topk = TopK(
+            top_k=self.top_k,
+            renormalize=self.norm_topk_prob,
+            use_grouped_topk=self.use_grouped_topk,
+            num_expert_group=self.num_expert_group,
+            # num_fused_shared_experts=self.num_fused_shared_experts,
+            topk_group=self.topk_group,
+            correction_bias=self.correction_bias,
+            routed_scaling_factor=self.routed_scaling_factor,
+        )
+
+        self.experts = get_moe_impl_class(quant_config)(
+            num_experts=self.num_experts,
+            top_k=self.top_k,
+            layer_id=self.layer_id,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            quant_config=quant_config,
+            routed_scaling_factor=self.routed_scaling_factor,
+            prefix=add_prefix("experts", prefix),
+        )
+        # shared expert
+        if config.num_shared_experts is not None:
+            if hasattr(config, "moe_shared_expert_intermediate_size"):
+                intermediate_size = config.moe_shared_expert_intermediate_size
+            else:
+                intermediate_size = config.moe_intermediate_size
+            intermediate_size *= config.num_shared_experts
+            # disable tp for shared experts when enable deepep moe
+            self.shared_experts = BailingMoEMLP(
+                intermediate_size=intermediate_size,
+                config=config,
+                quant_config=quant_config,
+                reduce_results=False,
+                prefix=add_prefix("shared_experts", prefix),
+                **(
+                    dict(tp_rank=0, tp_size=1)
+                    if get_moe_a2a_backend().is_deepep()
+                    else {}
+                ),
+            )
+        # dispatcher
+        if get_moe_a2a_backend().is_deepep():
+            # TODO: we will support tp < ep in the future
+            self.ep_size = get_tensor_model_parallel_world_size()
+
+            self.deepep_dispatcher = DeepEPDispatcher(
+                group=parallel_state.get_tp_group().device_group,
+                router_topk=self.top_k,
+                permute_fusion=True,
+                num_experts=self.num_experts,
+                num_local_experts=config.num_experts // self.tp_size,
+                hidden_size=config.hidden_size,
+                params_dtype=config.torch_dtype,
+                deepep_mode=get_deepep_mode(),
+                async_finish=True,  # TODO
+                return_recv_hook=True,
+            )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: Optional[ForwardBatch] = None,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+    ) -> torch.Tensor:
+        if not get_moe_a2a_backend().is_deepep():
+            return self.forward_normal(
+                hidden_states,
+                should_allreduce_fusion,
+                use_reduce_scatter,
+            )
+        else:
+            return self.forward_deepep(hidden_states, forward_batch)
+
+    def get_moe_weights(self):
+        return [
+            x.data
+            for name, x in self.experts.named_parameters()
+            if name not in ["correction_bias"]
+            and filter_moe_weight_param_global_expert(
+                name, x, self.experts.num_local_experts
+            )
+        ]
+
+    def _forward_shared_experts(self, hidden_states: torch.Tensor):
+        shared_output = None
+        if self.num_shared_experts > 0:
+            shared_output = self.shared_experts(hidden_states)
+        return shared_output
+
+    def _forward_router_experts(self, hidden_states: torch.Tensor):
+        # router_logits: (num_tokens, n_experts)
+        router_logits = self.gate(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        return self.experts(hidden_states, topk_output)
+
+    def forward_normal_dual_stream(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        current_stream = torch.cuda.current_stream()
+        self.alt_stream.wait_stream(current_stream)
+        shared_output = self._forward_shared_experts(hidden_states.clone())
+
+        with torch.cuda.stream(self.alt_stream):
+            router_output = self._forward_router_experts(hidden_states)
+        current_stream.wait_stream(self.alt_stream)
+
+        return router_output, shared_output
+
+    def forward_normal(
+        self,
+        hidden_states: torch.Tensor,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+    ) -> torch.Tensor:
+        num_tokens, hidden_size = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_size)
+
+        if (
+            self.alt_stream is not None
+            and hidden_states.shape[0] > 0
+            and get_is_capture_mode()
+        ):
+            final_hidden_states, shared_output = self.forward_normal_dual_stream(
+                hidden_states
+            )
+        else:
+            shared_output = self._forward_shared_experts(hidden_states)
+            final_hidden_states = self._forward_router_experts(hidden_states)
+
+        if self.num_shared_experts > 0:
+            final_hidden_states = final_hidden_states + shared_output
+
+        if (
+            self.tp_size > 1
+            and not should_allreduce_fusion
+            and not use_reduce_scatter
+            and not should_use_flashinfer_cutlass_moe_fp4_allgather()
+        ):
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+        return final_hidden_states.view(num_tokens, hidden_size)
+
+    def forward_deepep(
+        self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
+    ) -> torch.Tensor:
+        shared_output = None
+        forward_mode = forward_batch.forward_mode
+        if is_non_idle_and_non_empty(forward_mode, hidden_states):
+            router_logits = self.gate(hidden_states)
+            if self.num_shared_experts > 0:
+                shared_output = self.shared_experts(hidden_states)
+
+            topk_output = self.topk(
+                hidden_states,
+                router_logits,
+                num_token_non_padded=forward_batch.num_token_non_padded,
+                expert_location_dispatch_info=ExpertLocationDispatchInfo.init_new(
+                    layer_id=self.layer_id,
+                ),
+            )
+        else:
+            topk_output = self.topk.empty_topk_output(hidden_states.device)
+
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states,
+            topk_output=topk_output,
+        )
+
+        if shared_output is not None:
+            final_hidden_states += shared_output
+        return final_hidden_states
+
+
+class BailingMoEAttention(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        reduce_results: bool = True,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.total_num_heads = config.num_attention_heads
+        self.total_kv_heads = config.num_key_value_heads
+        self.dp_size = get_attention_dp_size()
+        attn_tp_rank = get_attention_tp_rank()
+        attn_tp_size = get_attention_tp_size()
+
+        assert self.total_num_heads % attn_tp_size == 0
+        if self.total_kv_heads >= attn_tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_kv_heads % attn_tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert attn_tp_size % self.total_kv_heads == 0
+        assert self.total_num_heads >= self.total_kv_heads
+
+        self.num_heads = self.total_num_heads // attn_tp_size
+        self.head_dim = config.head_dim or (self.hidden_size // self.total_num_heads)
+        self.q_size = self.head_dim * self.num_heads
+
+        self.num_kv_heads = max(1, self.total_kv_heads // attn_tp_size)
+        self.kv_size = max(1, self.num_kv_heads * self.head_dim)
+
+        self.scale = self.head_dim**-0.5
+
+        self.use_qk_norm = getattr(config, "use_qk_norm", False)
+
+        self.query_key_value = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_kv_heads,
+            bias=(config.use_bias or config.use_qkv_bias),
+            quant_config=quant_config,
+            prefix=add_prefix("query_key_value", prefix),
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+        )
+
+        if self.use_qk_norm:
+            self.query_layernorm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+            self.key_layernorm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+        self.dense = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            self.hidden_size,
+            bias=config.use_bias,
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+            prefix=add_prefix("dense", prefix),
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+        )
+
+        if hasattr(config, "partial_rotary_factor"):
+            self.rotary_dim = int(self.head_dim * config.partial_rotary_factor)
+        elif hasattr(config, "rotary_dim"):
+            self.rotary_dim = config.rotary_dim
+        else:
+            self.rotary_dim = self.head_dim
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.rotary_dim,
+            max_position=config.max_position_embeddings,
+            base=config.rope_theta,
+            rope_scaling=config.rope_scaling,
+        )
+
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scale,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            prefix=add_prefix("attn", prefix),
+        )
+
+        self.alt_stream = alt_stream
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        if hidden_states.shape[0] == 0:
+            return hidden_states
+        qkv, _ = self.query_key_value(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        if self.use_qk_norm:
+            q, k = apply_qk_norm(
+                q=q,
+                k=k,
+                q_norm=self.query_layernorm,
+                k_norm=self.key_layernorm,
+                head_dim=self.head_dim,
+                alt_stream=self.alt_stream,
+            )
+        q, k = self.rotary_emb(
+            positions,
+            q,
+            k,
+            fused_set_kv_buffer_arg=(
+                create_fused_set_kv_buffer_arg(
+                    value=v,
+                    layer=self.attn,
+                    forward_batch=forward_batch,
+                )
+                if enable_fused_set_kv_buffer(forward_batch)
+                else None
+            ),
+        )
+        context_layer = self.attn(
+            q,
+            k,
+            v,
+            forward_batch,
+            save_kv_cache=not enable_fused_set_kv_buffer(forward_batch),
+        )
+        attn_output, _ = self.dense(context_layer)
+        return attn_output
+
+
+class BailingMoEBlock(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ):
+        super().__init__()
+        self.config = config
+        hidden_size = config.hidden_size
+
+        self.input_layernorm = RMSNorm(hidden_size, eps=config.rms_norm_eps)
+        self.dp_size = get_attention_dp_size()
+        self.attention = BailingMoEAttention(
+            config,
+            layer_id,
+            quant_config,
+            reduce_results=False,
+            prefix=add_prefix("attention", prefix),
+            alt_stream=alt_stream,
+        )
+        self.layer_id = layer_id
+        self.attn_tp_size = get_attention_tp_size()
+        self.attn_tp_rank = get_attention_tp_rank()
+
+        self.is_layer_sparse = self._is_layer_sparse(
+            config, layer_id=layer_id, is_nextn=False
+        )
+        is_previous_layer_sparse = self._is_layer_sparse(
+            config, layer_id=layer_id - 1, is_nextn=False
+        )
+        is_next_layer_sparse = self._is_layer_sparse(
+            config, layer_id=layer_id + 1, is_nextn=False
+        )
+
+        self.layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=layer_id,
+            num_layers=config.num_hidden_layers,
+            is_layer_sparse=self.is_layer_sparse,
+            is_previous_layer_sparse=is_previous_layer_sparse,
+            is_next_layer_sparse=is_next_layer_sparse,
+        )
+
+        self.is_last_layer = self.layer_id == config.num_hidden_layers - 1
+
+        if self.is_layer_sparse:
+            self.mlp = BailingMoESparseMoeBlock(
+                layer_id=layer_id,
+                config=config,
+                quant_config=quant_config,
+                alt_stream=alt_stream,
+                prefix=add_prefix("mlp", prefix),
+            )
+        else:
+            if enable_moe_dense_fully_dp():
+                mlp_tp_rank, mlp_tp_size = 0, 1
+            else:
+                mlp_tp_rank, mlp_tp_size = None, None
+            self.mlp = BailingMoEMLP(
+                intermediate_size=config.intermediate_size,
+                config=config,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+                tp_rank=mlp_tp_rank,
+                tp_size=mlp_tp_size,
+            )
+
+        self.post_attention_layernorm = RMSNorm(hidden_size, eps=config.rms_norm_eps)
+
+        self.layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.layer_scatter_modes,
+            input_layernorm=self.input_layernorm,
+            post_attention_layernorm=self.post_attention_layernorm,
+            allow_reduce_scatter=True,
+            is_last_layer=(self.layer_id == self.config.num_hidden_layers - 1),
+        )
+
+    def _is_layer_sparse(
+        self, config: PretrainedConfig, layer_id: int, is_nextn: bool
+    ) -> bool:
+        return is_nextn or (
+            config.num_experts is not None and layer_id >= config.first_k_dense_replace
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+        captured_last_layer_outputs: Optional[List[torch.Tensor]] = None,
+    ) -> torch.Tensor:
+        hidden_states, residual = (
+            self.layer_communicator.prepare_attn_and_capture_last_layer_outputs(
+                hidden_states,
+                residual,
+                forward_batch,
+                captured_last_layer_outputs=captured_last_layer_outputs,
+            )
+        )
+
+        if hidden_states.shape[0] != 0:
+            hidden_states = self.attention(
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+            )
+
+        hidden_states, residual = self.layer_communicator.prepare_mlp(
+            hidden_states=hidden_states,
+            residual=residual,
+            forward_batch=forward_batch,
+        )
+
+        should_allreduce_fusion = (
+            self.layer_communicator.should_fuse_mlp_allreduce_with_next_layer(
+                forward_batch
+            )
+        )
+
+        # For DP with padding, reduce scatter can be used instead of all-reduce.
+        use_reduce_scatter = self.layer_communicator.should_use_reduce_scatter(
+            forward_batch
+        )
+
+        hidden_states = self.mlp(
+            hidden_states, forward_batch, should_allreduce_fusion, use_reduce_scatter
+        )
+
+        if should_allreduce_fusion:
+            hidden_states._sglang_needs_allreduce_fusion = True
+        else:
+            hidden_states, residual = self.layer_communicator.postprocess_layer(
+                hidden_states, residual, forward_batch
+            )
+
+        return hidden_states, residual
+
+
+class BailingMoEModel(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        alt_stream: Optional[torch.cuda.Stream] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.vocab_size = config.vocab_size
+        self.embed_dim = config.hidden_size
+        if self.pp_group.is_first_rank:
+            self.word_embeddings = VocabParallelEmbedding(
+                self.vocab_size,
+                self.embed_dim,
+                quant_config=quant_config,
+                prefix=add_prefix("word_embeddings", prefix),
+                use_attn_tp_group=is_dp_attention_enabled(),
+            )
+        else:
+            self.word_embeddings = PPMissingLayer()
+
+        self.embedding_dropout = torch.nn.Dropout(config.embedding_dropout)
+
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: BailingMoEBlock(
+                layer_id=idx,
+                config=config,
+                quant_config=quant_config,
+                prefix=prefix,
+                alt_stream=alt_stream,
+            ),
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix=add_prefix("layers", prefix),
+        )
+        if self.pp_group.is_last_rank:
+            self.norm = RMSNorm(self.embed_dim, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer(return_tuple=True)
+
+        self.layers_to_capture = []
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[torch.Tensor, PPProxyTensors]:
+        if self.pp_group.is_first_rank:
+            if input_embeds is None:
+                hidden_states = self.word_embeddings(input_ids)
+            else:
+                hidden_states = input_embeds
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+
+        aux_hidden_states = []
+        for i in range(self.start_layer, self.end_layer):
+            with get_global_expert_distribution_recorder().with_current_layer(i):
+                if i in self.layers_to_capture:
+                    aux_hidden_states.append(
+                        hidden_states if residual is None else hidden_states + residual
+                    )
+                layer = self.layers[i]
+                hidden_states, residual = layer(
+                    positions,
+                    hidden_states,
+                    forward_batch,
+                    residual,
+                    captured_last_layer_outputs=(
+                        aux_hidden_states
+                        if getattr(layer, "_is_layer_to_capture", False)
+                        else None
+                    ),
+                )
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors(
+                {
+                    "hidden_states": hidden_states,
+                    "residual": residual,
+                }
+            )
+        else:
+            if not forward_batch.forward_mode.is_idle():
+                if residual is None:
+                    hidden_states = self.norm(hidden_states)
+                else:
+                    hidden_states, _ = self.norm(hidden_states, residual)
+
+        if len(aux_hidden_states) == 0:
+            return hidden_states
+        return hidden_states, aux_hidden_states
+
+
+class BailingMoEForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.quant_config = quant_config
+        alt_stream = torch.cuda.Stream() if _is_cuda else None
+
+        self.model = BailingMoEModel(
+            config,
+            quant_config,
+            alt_stream=alt_stream,
+            prefix=add_prefix("model", ""),
+        )
+
+        # tie_word_embeddings为true，复用tie_word_embeddings，反之是独立的
+        if config.tie_word_embeddings:
+            self.lm_head = self.model.word_embeddings
+        else:
+            # TODO something wrong with ParallelLMHead with DP attention enabled
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+                use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
+            )
+        self.logits_processor = LogitsProcessor(config)
+
+        self.capture_aux_hidden_states = False
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer
+
+    def get_embed_and_head(self):
+        """Used by the eagle_worker."""
+        return self.model.word_embeddings.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        """Used by the eagle_worker."""
+        del self.model.word_embeddings.weight
+        del self.lm_head.weight
+        self.model.word_embeddings.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            input_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+
+        aux_hidden_states = None
+        if self.capture_aux_hidden_states:
+            hidden_states, aux_hidden_states = hidden_states
+
+        if self.pp_group.is_last_rank:
+            return self.logits_processor(
+                input_ids, hidden_states, self.lm_head, forward_batch, aux_hidden_states
+            )
+        else:
+            return hidden_states
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]], is_nextn=False):
+        if is_nextn:
+            if hasattr(self.config, "num_nextn_predict_layers"):
+                num_nextn_layers = self.config.num_nextn_predict_layers
+                assert num_nextn_layers == 1, "Only 1 nextn layer is supported"
+                # compatible with old design
+                nextn_layer_id = (
+                    0
+                    if self.config.num_hidden_layers == 1
+                    else self.config.num_hidden_layers
+                )
+            else:
+                raise ValueError("num_nextn_predict_layers is not in the config")
+
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        if is_nextn:
+            nextn_layer_prefix = f"model.layers.{nextn_layer_id}"
+            nextn_spec_weight_names = [
+                "final_layernorm",
+                "eh_proj",
+                "enorm",
+                "hnorm",
+            ]
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.num_experts,
+        )
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if (
+                ("v_head" in name)
+                or ("inv_freq" in name)
+                or (self.config.tie_word_embeddings and "lm_head" in name)
+            ):
+                continue
+
+            if (
+                hasattr(self.config, "norm_head")
+                and self.config.norm_head
+                and "lm_head.weight" in name
+            ):
+                import torch.nn.functional as F
+
+                loaded_weight = F.normalize(loaded_weight, dim=0, p=2, eps=1e-7)
+
+            if is_nextn:
+                if not name.startswith(nextn_layer_prefix):
+                    continue
+
+                # Use shared head and embed weights from target model
+                if "shared_head.head" in name or "embed_tokens" in name:
+                    continue
+
+                is_decoder = True
+                # For nextn specific weights
+                for weight_name in nextn_spec_weight_names:
+                    if weight_name in name:
+                        name = name.replace(nextn_layer_prefix, "model")
+                        is_decoder = False
+                        break
+                # For decoder layer weights
+                if is_decoder:
+                    name = name.replace(nextn_layer_prefix, "model.decoder")
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if "mlp.experts" in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    if name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    if name not in params_dict:
+                        continue
+
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+
+        if not is_nextn:
+            self.routed_experts_weights_of_layer = {
+                layer_id: layer.mlp.get_moe_weights()
+                for layer_id, layer in enumerate(self.model.layers)
+                if not isinstance(layer, PPMissingLayer)
+                and isinstance(layer.mlp, BailingMoESparseMoeBlock)
+            }
+
+    @classmethod
+    def get_model_config_for_expert_location(cls, config):
+        num_groups = getattr(config, "n_group", 0)
+        return ModelConfigForExpertLocation(
+            num_layers=config.num_hidden_layers,
+            num_logical_experts=config.num_experts,
+            num_groups=None if num_groups == 0 else num_groups,
+        )
+
+    def set_eagle3_layers_to_capture(self, layer_ids: Optional[List[int]] = None):
+        if not self.pp_group.is_last_rank:
+            return
+
+        self.capture_aux_hidden_states = True
+        if layer_ids is None:
+            num_layers = self.config.num_hidden_layers
+            self.model.layers_to_capture = [2, num_layers // 2, num_layers - 3]
+        else:
+            # Add +1 because in SGLang, for the i-th layer, the auxiliary hidden state
+            # corresponds to the output of layer (i - 1).
+            self.model.layers_to_capture = [val + 1 for val in layer_ids]
+
+
+class BailingMoeForCausalLM(BailingMoEForCausalLM):
+    pass
+
+
+class BailingMoeV2ForCausalLM(BailingMoEForCausalLM):
+    pass
+
+
+EntryClass = [BailingMoEForCausalLM, BailingMoeForCausalLM, BailingMoeV2ForCausalLM]
diff --git a/sglang/python/sglang/srt/models/bailing_moe_linear.py b/sglang/python/sglang/srt/models/bailing_moe_linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..71c98e881d754e1080e3dd8d53844ed3602bb97c
--- /dev/null
+++ b/sglang/python/sglang/srt/models/bailing_moe_linear.py
@@ -0,0 +1,1569 @@
+# coding=utf-8
+# Copyright 2023 Antgroup and The HuggingFace Inc. team. All rights reserved.
+import copy
+import logging
+from typing import Callable, Iterable, Optional, Set, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.layers import deep_gemm_wrapper
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.attention.fla.layernorm_gated import RMSNorm as RMSNormGated
+from sglang.srt.layers.attention.fla.layernorm_gated import layernorm_fn
+from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe.ep_moe.layer import DeepEPMoE, get_moe_impl_class
+from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
+from sglang.srt.layers.quantization.fp8_utils import (
+    block_quant_dequant,
+    block_quant_to_tensor_quant,
+    channel_quant_to_tensor_quant,
+    normalize_e4m3fn_to_e4m3fnuz,
+    requant_weight_ue8m0_inplace,
+)
+from sglang.srt.layers.quantization.int8_utils import (
+    block_dequant as int8_block_dequant,
+)
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope_wrapper
+from sglang.srt.layers.utils import PPMissingLayer
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.deepseek_v2 import DeepseekV2AttentionMLA, DeepseekV2MLP, _is_hip
+from sglang.srt.models.utils import WeightsMapper
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import (
+    BumpAllocator,
+    add_prefix,
+    bind_or_assign,
+    cpu_has_amx_support,
+    get_bool_env_var,
+    get_device_sm,
+    is_cpu,
+    is_cuda,
+    is_flashinfer_available,
+    is_gfx95_supported,
+    is_hip,
+    is_npu,
+    is_sm100_supported,
+    make_layers,
+)
+from sglang.srt.utils.common import rank0_log
+
+_is_hip = is_hip()
+_is_cuda = is_cuda()
+_is_npu = is_npu()
+_is_fp8_fnuz = is_fp8_fnuz()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
+_device_sm = get_device_sm()
+_is_gfx95_supported = is_gfx95_supported()
+
+_use_aiter_gfx95 = _use_aiter and _is_gfx95_supported
+
+if _use_aiter_gfx95:
+    pass
+
+if _is_cuda:
+    from sgl_kernel import awq_dequantize
+elif _is_cpu and _is_cpu_amx_available:
+    pass
+elif _is_hip:
+    from sglang.srt.layers.quantization.awq_triton import (
+        awq_dequantize_triton as awq_dequantize,
+    )
+else:
+    from vllm._custom_ops import awq_dequantize
+
+if _is_hip:
+    pass
+
+_is_flashinfer_available = is_flashinfer_available()
+_is_sm100_supported = is_cuda() and is_sm100_supported()
+
+
+class DsV3MLA(DeepseekV2AttentionMLA):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        if kwargs["rope_scaling"]:
+            self.rotary_emb.forward = self.rotary_emb.forward_cuda
+
+
+LoraConfig = None
+logger = logging.getLogger(__name__)
+_is_cpu = is_cpu()
+
+
+def is_linear_layer(layer_idx, layer_group_size):
+    if layer_idx is None:
+        return False
+    if layer_group_size > 0:
+        return (layer_idx + 1) % layer_group_size != 0
+    else:
+        return False
+
+
+def is_pp_missing_parameter(
+    name: str,
+    model: torch.nn.Module,
+) -> bool:
+    if isinstance(model, PPMissingLayer):
+        return True
+    return False
+
+
+def weight_loader_with_alias(alias: str):
+    def wrapper(func: Callable):
+        def inner_func(
+            param: torch.Tensor,
+            loaded_weight: torch.Tensor,
+            *args,
+            prefix: str = None,
+            **kwargs,
+        ):
+            # pf = "[vLLM][load]" + " " if prefix is None else f"[{prefix}] "
+            value = func(param, loaded_weight, *args, **kwargs)
+            return value
+
+        return inner_func
+
+    return wrapper
+
+
+class BailingMLP(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        reduce_results=True,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+            prefix=f"{prefix}.down_proj",
+        )
+        self.act_fn = SiluAndMul()
+
+    def forward(
+        self,
+        x,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+    ):
+        x, _ = self.gate_up_proj(x)
+        x = self.act_fn(x)
+        x, _ = self.down_proj(
+            x,
+            skip_all_reduce=use_reduce_scatter or should_allreduce_fusion,
+        )
+        return x
+
+
+class BailingMoEGate(nn.Module):
+    def __init__(
+        self,
+        config,
+        params_dtype: Optional[torch.dtype] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+        self.params_dtype = params_dtype
+        self.weight = nn.Parameter(
+            torch.empty(
+                (config.num_experts, config.hidden_size),
+                dtype=self.params_dtype,
+            ),
+        )
+        if getattr(config, "moe_router_enable_expert_bias", False):
+            self.expert_bias = nn.Parameter(
+                torch.empty((config.num_experts,), dtype=torch.float32),
+            )
+        else:
+            self.expert_bias = None
+
+    def forward(self, hidden_states):
+        logits = F.linear(hidden_states.to(self.weight.dtype), self.weight, None).to(
+            hidden_states.dtype
+        )
+        return logits
+
+
+class BailingMoE(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        layer_id: int = 0,
+        prefix: str = "moe",
+    ):
+        super().__init__()
+
+        self.layer_id = layer_id
+
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank = get_tensor_model_parallel_rank()
+
+        self.top_k = config.num_experts_per_tok
+        self.norm_expert_prob = getattr(config, "norm_topk_prob", False)
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.moe_intermediate_size
+        self.num_shared_experts = getattr(config, "num_shared_experts", 0)
+        self.routed_scaling_factor = getattr(config, "routed_scaling_factor", 1.0)
+        self.score_function = getattr(config, "score_function", None)
+
+        # Gate always runs at half / full precision for now.
+        router_dtype = getattr(config, "router_dtype", None)
+        if router_dtype is None:
+            self.router_dtype = torch.float32
+        elif router_dtype == "fp32":
+            self.router_dtype = torch.float32
+        else:
+            self.router_dtype = torch.bfloat16
+
+        # check group topk
+        self.num_expert_group = getattr(config, "n_group", 0)
+        self.topk_group = getattr(config, "topk_group", 0)
+        if self.num_expert_group > 0 or self.topk_group > 0:
+            assert (
+                self.num_expert_group > 0
+                and 0 < self.topk_group <= self.num_expert_group
+            )
+            self.use_grouped_topk = True
+        else:
+            self.num_expert_group = self.topk_group = None
+            self.use_grouped_topk = False
+
+        self.num_experts = config.num_experts
+
+        self.gate = BailingMoEGate(
+            config=config,
+            params_dtype=self.router_dtype,
+            prefix=add_prefix("gate", prefix),
+        )
+        self.correction_bias = (
+            self.gate.expert_bias.data if self.gate.expert_bias is not None else None
+        )
+
+        if self.score_function is not None:
+            assert (
+                self.score_function == "softmax" and self.correction_bias is None
+            ) or (
+                self.score_function == "sigmoid" and self.correction_bias is not None
+            ), "score_function and correction_bias should be in 2 combination (softmax, None) or (sigmoid, not None)"
+
+        self.topk = TopK(
+            top_k=self.top_k,
+            use_grouped_topk=self.use_grouped_topk,
+            renormalize=self.norm_expert_prob,
+            num_expert_group=self.num_expert_group,
+            topk_group=self.topk_group,
+            correction_bias=self.correction_bias,
+            routed_scaling_factor=self.routed_scaling_factor,
+        )
+        moe_cls = get_moe_impl_class(quant_config)
+        self.experts = moe_cls(
+            num_experts=self.num_experts,
+            top_k=self.top_k,
+            layer_id=self.layer_id,
+            hidden_size=self.hidden_size,
+            intermediate_size=self.intermediate_size,
+            quant_config=quant_config,
+            routed_scaling_factor=self.routed_scaling_factor,
+            prefix=f"{prefix}.experts",
+        )
+
+        if self.num_shared_experts > 0:
+            intermediate_size = self.intermediate_size * self.num_shared_experts
+            self.shared_experts = BailingMLP(
+                hidden_size=self.hidden_size,
+                intermediate_size=intermediate_size,
+                reduce_results=False,
+                prefix=f"{prefix}.shared_experts",
+                quant_config=quant_config,
+            )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+    ) -> torch.Tensor:
+        num_tokens, hidden_size = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_size)
+        if self.num_shared_experts > 0:
+            shared_output = self.shared_experts(hidden_states)
+
+        router_logits = self.gate(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        final_hidden_states = self.experts(hidden_states, topk_output)
+
+        if self.num_shared_experts > 0:
+            final_hidden_states = final_hidden_states + shared_output
+
+        if self.tp_size > 1 and not use_reduce_scatter and not should_allreduce_fusion:
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+        return final_hidden_states
+
+
+class BailingGroupRMSNormGate(RMSNormGated):
+    def __init__(
+        self,
+        hidden_size,
+        eps=1e-5,
+        group_size=None,
+        norm_before_gate=True,
+        device=None,
+        dtype=None,
+    ):
+        super().__init__(
+            hidden_size,
+            eps=eps,
+            group_size=group_size,
+            norm_before_gate=norm_before_gate,
+            device=device,
+            dtype=dtype,
+            activation="sigmoid",
+        )
+        self.weight.weight_loader = self.weight_loader
+
+    @staticmethod
+    def weight_loader(
+        param: torch.nn.Parameter,
+        loaded_weight: torch.Tensor,
+    ) -> None:
+        tp_size = get_attention_tp_size()
+        tp_rank = get_attention_tp_rank()
+        shard_size = loaded_weight.shape[0] // tp_size
+        shard = slice(tp_rank * shard_size, (tp_rank + 1) * shard_size)
+        param.data.copy_(loaded_weight[shard].contiguous())
+        return
+
+
+class BailingMoELinearAttention(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        layer_id: int = 0,
+        prefix: str = "linear_attn",
+    ):
+        super().__init__()
+
+        self.layer_id = layer_id
+        self.hidden_size = config.hidden_size
+        self.total_num_heads = config.num_attention_heads
+        self.total_kv_heads = config.num_attention_heads  # MHA
+
+        self.head_dim = getattr(config, "head_dim", None)
+        if self.head_dim is None:
+            self.head_dim = config.hidden_size // self.total_num_heads
+
+        self.hidden_inner_size = self.head_dim * self.total_num_heads
+        self.scaling = self.head_dim**-0.5
+        self.tp_size = get_attention_tp_size()
+        self.tp_rank = get_attention_tp_rank()
+
+        assert self.total_num_heads % self.tp_size == 0
+        self.tp_heads = self.total_num_heads // self.tp_size
+
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = getattr(config, "rope_theta", 600000)
+
+        self.tp_kv_heads = self.total_kv_heads // self.tp_size
+        self.q_size_per_rank = self.head_dim * self.tp_heads
+        self.kv_size_per_rank = self.head_dim * self.tp_kv_heads
+
+        self.use_qk_norm = getattr(config, "use_qk_norm", False)
+        # minimax / seg_la / fla
+        # TODO support fla
+        self.linear_backend = getattr(config, "linear_backend", "seg_la")
+        logger.debug(f"linear_backend in bailing_moe_linear: {self.linear_backend}")
+        self.linear_scale = True if self.linear_backend == "minimax" else False
+        self.linear_rope = getattr(config, "linear_rope", True)
+        if hasattr(config, "use_linear_silu"):
+            self.linear_silu = config.use_linear_silu
+        elif hasattr(config, "linear_silu"):
+            self.linear_silu = config.linear_silu
+        else:
+            self.linear_silu = False
+
+        self.query_key_value = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_kv_heads,
+            bias=(config.use_bias or config.use_qkv_bias),
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+            tp_rank=self.tp_rank,
+            tp_size=self.tp_size,
+        )
+
+        if self.use_qk_norm:
+            self.query_layernorm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+            self.key_layernorm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+        self.g_proj = ColumnParallelLinear(
+            self.hidden_size,
+            self.hidden_inner_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.output_gate",
+            tp_rank=self.tp_rank,
+            tp_size=self.tp_size,
+        )
+        self.dense = RowParallelLinear(
+            self.hidden_inner_size,
+            self.hidden_size,
+            bias=config.use_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.out_proj",
+            tp_rank=self.tp_rank,
+            tp_size=self.tp_size,
+            reduce_results=False,
+        )
+        self.attn = RadixAttention(
+            self.tp_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.tp_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
+
+        self.group_norm_size = getattr(config, "group_norm_size", 1)
+        self.rms_norm_eps = float(getattr(config, "rms_norm_eps", 1e-5))
+        assert (
+            self.tp_size <= self.group_norm_size
+        ), "tp_size must be less than or equal to group_norm_size that can use local rms norm"
+        assert (
+            self.group_norm_size % self.tp_size == 0
+        ), "group_norm_size must be divisible by tp_size"
+        self.g_norm = BailingGroupRMSNormGate(
+            hidden_size=self.hidden_inner_size // self.tp_size,
+            eps=self.rms_norm_eps,
+            group_size=self.hidden_inner_size // self.group_norm_size,
+        )
+        # use fp32 rotary embedding
+        if hasattr(config, "rotary_dim"):
+            rotary_dim = config.rotary_dim
+        elif hasattr(config, "partial_rotary_factor"):
+            rotary_dim = int(self.head_dim * config.partial_rotary_factor)
+        else:
+            rotary_dim = self.head_dim
+
+        self.rotary_emb = get_rope_wrapper(
+            self.head_dim,
+            rotary_dim=rotary_dim,
+            max_position=self.max_position_embeddings,
+            base=self.rope_theta,
+            rope_scaling=config.rope_scaling,
+            is_neox_style=True,
+            device=get_global_server_args().device,
+            dtype=torch.float32,
+        )
+
+    @staticmethod
+    def weight_direct_load(param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
+        assert param.size() == loaded_weight.size()
+        param.data.copy_(loaded_weight)
+        return
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        **kwargs,
+    ) -> torch.Tensor:
+        qkv, _ = self.query_key_value(hidden_states)
+        # logger.warning(f"===={self.layer_id=}, 1-1 {qkv.shape=}")
+        # use rotary_emb support fp32
+        qkv = qkv.to(torch.float32)
+        if self.linear_silu:
+            qkv = F.silu(qkv)
+
+        q, k, v = torch.split(
+            qkv,
+            [self.q_size_per_rank, self.kv_size_per_rank, self.kv_size_per_rank],
+            dim=-1,
+        )
+        if self.use_qk_norm:
+            q = q.reshape(-1, self.tp_heads, self.head_dim)
+            k = k.reshape(-1, self.tp_kv_heads, self.head_dim)
+            q = layernorm_fn(
+                q,
+                self.query_layernorm.weight.data,
+                bias=None,
+                eps=self.rms_norm_eps,
+                is_rms_norm=True,
+            )
+            k = layernorm_fn(
+                k,
+                self.key_layernorm.weight.data,
+                bias=None,
+                eps=self.rms_norm_eps,
+                is_rms_norm=True,
+            )
+            q = q.reshape(-1, self.q_size_per_rank)
+            k = k.reshape(-1, self.kv_size_per_rank)
+
+        if self.linear_rope:
+            q, k = self.rotary_emb(positions, q, k)
+
+        q = q.view((qkv.shape[0], self.tp_heads, self.head_dim))
+        k = k.view((qkv.shape[0], self.tp_kv_heads, self.head_dim))
+        v = v.view((qkv.shape[0], self.tp_kv_heads, self.head_dim))
+        # logger.warning(f"===={self.layer_id=}, 1-2 {q.shape=}, {k.shape=}, {v.shape=}")
+
+        if self.linear_scale:
+            q = q * self.scaling
+        # q = q.to(torch.float32)
+        # k = k.to(torch.float32)
+        # v = v.to(torch.float32)
+        hidden = self.attn(q, k, v, forward_batch).to(hidden_states.dtype)
+        gate, _ = self.g_proj(hidden_states)
+        # logger.warning(
+        #     f"===={self.layer_id=}, 1-3 {gate.shape=}, {hidden.shape=}, {gate.dtype=}, {hidden_states.dtype=}, {hidden.dtype=}"
+        # )
+        if self.group_norm_size > 1:
+            hidden = self.g_norm(hidden, gate)
+        else:
+            hidden = self.g_norm(hidden)
+            hidden = F.sigmoid(gate) * hidden
+        # logger.warning(f"===={self.layer_id=}, 1-4 {hidden.shape=}")
+        hidden = hidden.data.to(hidden_states.dtype)
+        hidden, _ = self.dense(hidden)
+        # logger.warning(f"===={self.layer_id=}, 1-5 {hidden.shape=}")
+        return hidden
+
+
+class BailingMoEAttention(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        layer_id: int = None,
+        prefix: str = "mha",
+    ) -> None:
+        super().__init__()
+        self.layer_id = layer_id
+
+        self.hidden_size = config.hidden_size
+        tp_size = get_attention_tp_size()
+        self.total_num_heads = config.num_attention_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = config.num_key_value_heads
+        if self.total_num_kv_heads >= tp_size:
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = getattr(config, "head_dim", None)
+        if self.head_dim is None:
+            self.head_dim = self.hidden_size // self.total_num_heads
+
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+
+        self.split_qkv = getattr(config, "using_split_qkv_in_self_attention", False)
+        assert not self.split_qkv, "split_qkv is not supported for now"
+        self.use_qk_norm = getattr(config, "use_qk_norm", False)
+
+        self.query_key_value = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=(config.use_bias or config.use_qkv_bias),
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+        if self.use_qk_norm:
+            self.query_layernorm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+            self.key_layernorm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+        self.dense = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            self.hidden_size,
+            bias=config.use_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+        if hasattr(config, "rotary_dim"):
+            self.rotary_dim = config.rotary_dim
+        elif hasattr(config, "partial_rotary_factor"):
+            self.rotary_dim = int(self.head_dim * config.partial_rotary_factor)
+        else:
+            self.rotary_dim = self.head_dim
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = getattr(config, "rope_theta", 600000)
+        self.rotary_emb = get_rope_wrapper(
+            self.head_dim,
+            rotary_dim=self.rotary_dim,
+            max_position=self.max_position_embeddings,
+            base=self.rope_theta,
+            rope_scaling=config.rope_scaling,
+            device=get_global_server_args().device,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
+
+    def _apply_qk_norm(
+        self, q: torch.Tensor, k: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        q_by_head = q.reshape(-1, self.head_dim)
+        q_by_head = self.query_layernorm(q_by_head)
+        q = q_by_head.view(q.shape)
+        k_by_head = k.reshape(-1, self.head_dim)
+        k_by_head = self.key_layernorm(k_by_head)
+        k = k_by_head.view(k.shape)
+        return q, k
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        **kwargs,
+    ) -> torch.Tensor:
+        qkv, _ = self.query_key_value(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        if self.use_qk_norm:
+            q, k = self._apply_qk_norm(q, k)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.dense(attn_output)
+        return output
+
+
+class BailingMoELinearDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        layer_id: int = 0,
+        prefix: str = "layer",
+        is_nextn: bool = False,
+    ) -> None:
+        super().__init__()
+        self.layer_id = layer_id
+        self.use_mla = getattr(config, "full_attention_type", "mla") == "mla"
+        alt_stream = None  # tptest
+        # todo nextn
+
+        if config.attention_type == 0:  # Linear layer
+            self.attention = BailingMoELinearAttention(
+                config,
+                quant_config=quant_config,
+                layer_id=self.layer_id,
+                prefix=prefix + ".attention",
+            )
+        elif config.attention_type == 1:  # softmax layer
+            if self.use_mla:
+                self.attention = DsV3MLA(
+                    config=config,
+                    hidden_size=config.hidden_size,
+                    num_heads=config.num_attention_heads,
+                    qk_nope_head_dim=config.qk_nope_head_dim,
+                    qk_rope_head_dim=config.qk_rope_head_dim,
+                    v_head_dim=config.v_head_dim,
+                    q_lora_rank=(
+                        config.q_lora_rank if hasattr(config, "q_lora_rank") else None
+                    ),
+                    kv_lora_rank=config.kv_lora_rank,
+                    rope_theta=getattr(config, "rope_theta", 600000),
+                    rope_scaling=config.rope_scaling,
+                    max_position_embeddings=262144,
+                    quant_config=quant_config,
+                    layer_id=layer_id,
+                    reduce_results=False,
+                    prefix=add_prefix("attention", prefix),
+                    alt_stream=alt_stream,
+                )
+            else:
+                logger.debug(f"layer {layer_id} use gqa")
+                self.attention = BailingMoEAttention(
+                    config,
+                    quant_config=quant_config,
+                    layer_id=self.layer_id,
+                    prefix=prefix + ".attention",
+                )
+        else:
+            raise ValueError(f"Unsupported attention type: {config.attention_type}")
+
+        self.expert_num = config.num_experts
+        self.hidden_size = config.hidden_size
+        is_moe_layer = self._is_layer_sparse(config, self.layer_id)
+        is_previous_moe_layer = self._is_layer_sparse(config, self.layer_id - 1)
+        is_next_layer_moe_layer = self._is_layer_sparse(config, self.layer_id + 1)
+        if self.expert_num == 1:
+            self.mlp = BailingMLP(
+                hidden_size=self.hidden_size,
+                intermediate_size=config.intermediate_size,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+            )
+        else:
+            if is_nextn or self.layer_id >= config.first_k_dense_replace:
+                # MoE layer
+                self.mlp = BailingMoE(
+                    config,
+                    quant_config=quant_config,
+                    layer_id=self.layer_id,
+                    prefix=add_prefix("mlp", prefix),
+                )
+            else:
+                # dense layer
+                self.mlp = BailingMLP(
+                    hidden_size=self.hidden_size,
+                    intermediate_size=config.intermediate_size,
+                    quant_config=quant_config,
+                    prefix=add_prefix("mlp", prefix),
+                )
+        rms_norm_eps = float(getattr(config, "rms_norm_eps", 1e-5))
+        self.input_layernorm = RMSNorm(self.hidden_size, eps=rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(self.hidden_size, eps=rms_norm_eps)
+
+        self.layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=layer_id,
+            num_layers=config.num_hidden_layers,
+            is_layer_sparse=is_moe_layer,
+            is_previous_layer_sparse=is_previous_moe_layer,
+            is_next_layer_sparse=is_next_layer_moe_layer,
+        )
+
+        qkv_latent_func = (
+            self.attention.prepare_qkv_latent
+            if config.attention_type == 1 and self.use_mla
+            else None
+        )
+        self.layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.layer_scatter_modes,
+            input_layernorm=self.input_layernorm,
+            post_attention_layernorm=self.post_attention_layernorm,
+            allow_reduce_scatter=False,
+            qkv_latent_func=qkv_latent_func,
+        )
+
+    def _is_layer_sparse(
+        self, config: PretrainedConfig, layer_id: int, is_nextn: bool = False
+    ) -> bool:
+        return is_nextn or (
+            config.num_experts is not None and layer_id >= config.first_k_dense_replace
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+        zero_allocator: BumpAllocator,
+        **kwargs,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        hidden_states, residual = self.layer_communicator.prepare_attn(
+            hidden_states, residual, forward_batch
+        )
+        # logger.warning(
+        #     f"===={self.layer_id=}, 1 shape= {hidden_states.shape}, {residual.shape}"
+        # )
+        if not forward_batch.forward_mode.is_idle():
+            if self.use_mla:
+                hidden_states = self.attention(
+                    positions=positions,
+                    hidden_states=hidden_states,
+                    forward_batch=forward_batch,
+                    zero_allocator=zero_allocator,
+                )
+            else:
+                hidden_states = self.attention(
+                    hidden_states=hidden_states,
+                    positions=positions,
+                    forward_batch=forward_batch,
+                )
+        # logger.warning(
+        #     f"===={self.layer_id=}, 2 shape= {hidden_states.shape}, {residual.shape}"
+        # )
+        hidden_states, residual = self.layer_communicator.prepare_mlp(
+            hidden_states, residual, forward_batch
+        )
+        # logger.warning(
+        #     f"===={self.layer_id=}, 3 shape= {hidden_states.shape}, {residual.shape}"
+        # )
+        should_allreduce_fusion = (
+            self.layer_communicator.should_fuse_mlp_allreduce_with_next_layer(
+                forward_batch
+            )
+        )
+        use_reduce_scatter = self.layer_communicator.should_use_reduce_scatter(
+            forward_batch
+        )
+        hidden_states = self.mlp(
+            hidden_states, should_allreduce_fusion, use_reduce_scatter
+        )
+        hidden_states, residual = self.layer_communicator.postprocess_layer(
+            hidden_states, residual, forward_batch
+        )
+        return hidden_states, residual
+
+    @staticmethod
+    def shared_moe_coefficient_loader(
+        param: torch.Tensor, loaded_weight: torch.Tensor
+    ) -> None:
+        assert param.size() == loaded_weight.size()
+
+        param.data.copy_(loaded_weight.to(torch.float32))
+        return
+
+
+class BailingMoELinearModel(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.vocab_size = config.vocab_size
+        self.embed_dim = config.hidden_size
+        self.num_layers = config.num_hidden_layers
+
+        self.layer_group_size = getattr(config, "layer_group_size", 1)
+        self.decoder_attention_types = [
+            0 if is_linear_layer(i, self.layer_group_size) else 1
+            for i in range(self.num_layers)
+        ]
+        num_linear = sum(1 for t in self.decoder_attention_types if t == 0)
+        num_full = sum(1 for t in self.decoder_attention_types if t == 1)
+        rank0_log(
+            f"Layer config: {num_linear} linear attention layers, {num_full} full attention layers"
+        )
+
+        assert (
+            self.num_layers % self.layer_group_size == 0
+        ), f"num_layers={self.num_layers} must be divided by layer_group_size={self.layer_group_size}"
+
+        if self.pp_group.is_first_rank:
+            self.word_embeddings = VocabParallelEmbedding(
+                self.vocab_size,
+                self.embed_dim,
+                enable_tp=not is_dp_attention_enabled(),
+                org_num_embeddings=self.vocab_size,
+            )
+        else:
+            self.word_embeddings = PPMissingLayer()
+
+        def layer_fn(idx, prefix):
+            layer_idx = idx
+            layer_config = copy.deepcopy(config)
+            layer_config.attention_type = self.decoder_attention_types[layer_idx]
+
+            decoder_kwargs = {"quant_config": quant_config, "layer_id": layer_idx}
+            return BailingMoELinearDecoderLayer(
+                layer_config, **decoder_kwargs, prefix=prefix
+            )
+
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            self.num_layers,
+            layer_fn,
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix=f"{prefix}.layers",
+        )
+
+        norm_kwargs = {}
+        if hasattr(config, "rms_norm_eps"):
+            norm_kwargs["eps"] = config.rms_norm_eps
+        if self.pp_group.is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, **norm_kwargs)
+        else:
+            self.norm = PPMissingLayer()
+        self.embed_scale = 1.0
+        return
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor],
+        positions: torch.Tensor,
+        forward_batch: Optional[ForwardBatch] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[torch.Tensor, PPProxyTensors]:
+        if self.pp_group.is_first_rank:
+            if inputs_embeds is None:
+                hidden_states = self.word_embeddings(input_ids)
+            else:
+                hidden_states = inputs_embeds
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+
+        total_num_layers = self.end_layer - self.start_layer
+        device = inputs_embeds.device if inputs_embeds is not None else input_ids.device
+        zero_allocator = BumpAllocator(
+            buffer_size=total_num_layers * 2 * (2 if forward_batch.can_run_tbo else 1),
+            dtype=torch.float32,
+            device=device,
+        )
+
+        for i in range(self.start_layer, self.end_layer):
+            with get_global_expert_distribution_recorder().with_current_layer(i):
+                layer = self.layers[i]
+                hidden_states, residual = layer(
+                    hidden_states=hidden_states,
+                    positions=positions,
+                    forward_batch=forward_batch,
+                    residual=residual,
+                    zero_allocator=zero_allocator,
+                )
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
+        else:
+            if not forward_batch.forward_mode.is_idle():
+                if residual is None:
+                    hidden_states = self.norm(hidden_states)
+                else:
+                    hidden_states, _ = self.norm(hidden_states, residual)
+            return hidden_states
+
+
+class BailingMoELinearForCausalLM(nn.Module):
+
+    packed_modules_mapping = {
+        "fused_qkv_a_proj_with_mqa": ["q_a_proj", "kv_a_proj_with_mqa"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
+    # To ensure correct weight loading and mapping.
+    hf_to_sglang_mapper = WeightsMapper(
+        orig_to_new_substr={
+            "attention.dense": "attention.out_proj",
+            "layers.7.attention.out_proj": "layers.7.attention.o_proj",
+            "layers.15.attention.out_proj": "layers.15.attention.o_proj",
+            "layers.23.attention.out_proj": "layers.23.attention.o_proj",
+            "layers.31.attention.out_proj": "layers.31.attention.o_proj",
+            "layers.39.attention.out_proj": "layers.39.attention.o_proj",
+            "layers.47.attention.out_proj": "layers.47.attention.o_proj",
+            "layers.55.attention.out_proj": "layers.55.attention.o_proj",
+            "layers.63.attention.out_proj": "layers.63.attention.o_proj",
+            "layers.71.attention.out_proj": "layers.71.attention.o_proj",
+            "layers.79.attention.out_proj": "layers.79.attention.o_proj",
+            "attention.query_key_value": "attention.qkv_proj",
+            "attention.g_proj": "attention.output_gate",
+        },
+    )
+
+    def __init__(
+        self,
+        *,
+        config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = BailingMoELinearModel(
+            self.config, quant_config, prefix=add_prefix("model", prefix)
+        )
+
+        if self.pp_group.is_last_rank:
+            self.lm_head = (
+                self.word_embeddings
+                if config.tie_word_embeddings
+                else ParallelLMHead(
+                    config.vocab_size,
+                    config.hidden_size,
+                    params_dtype=torch.float32,
+                    quant_config=quant_config,
+                    use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
+                )
+            )
+            self.logits_processor = LogitsProcessor(config)
+        else:
+            self.lm_head = PPMissingLayer()
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer
+
+    def get_embed_and_head(self):
+        """Used by the eagle_worker."""
+        return self.model.word_embeddings.weight, self.lm_head.weight
+
+    def post_load_weights(self, is_nextn=False, weight_names=None):
+
+        # Perform post-processing after loading weights
+        if is_nextn:
+            layer_ids = [self.config.num_hidden_layers]
+        else:
+            if weight_names is None:
+                layer_ids = range(self.model.start_layer, self.model.end_layer)
+            else:
+                layer_ids = set()
+                for name in weight_names:
+                    if "kv_b_proj" in name:
+                        layer_id = int(name.split(".")[2])
+                        if (
+                            layer_id < self.model.end_layer
+                            and layer_id >= self.model.start_layer
+                        ):
+                            layer_ids.add(layer_id)
+        logger.debug(f"weight loading layer_ids: {layer_ids}")
+
+        for layer_id in layer_ids:
+            self_attn = (
+                self.model.layers[layer_id].attention
+                if not is_nextn
+                else self.model.decoder.attention
+            )
+            if not hasattr(self_attn, "kv_b_proj"):
+                continue
+            if hasattr(self_attn.kv_b_proj, "qweight"):
+                # AWQ compatible
+                if _is_cuda or _is_hip:
+                    w = awq_dequantize(
+                        self_attn.kv_b_proj.qweight,
+                        self_attn.kv_b_proj.scales,
+                        self_attn.kv_b_proj.qzeros,
+                    ).T
+                else:
+                    w = awq_dequantize(
+                        self_attn.kv_b_proj.qweight,
+                        self_attn.kv_b_proj.scales,
+                        self_attn.kv_b_proj.qzeros,
+                        0,
+                        0,
+                        0,
+                    ).T
+            else:
+                w = self_attn.kv_b_proj.weight
+            # NOTE(HandH1998): Since `bmm_fp8` only supports per-tensor scale, we have to requantize `self_attn.kv_b_proj`.
+            # This may affect the accuracy of fp8 model.
+            # Fix deepseek v3 blockwise bmm by using deep_gemm
+            use_deep_gemm_bmm = False
+
+            if w.dtype in (
+                torch.float8_e4m3fn,
+                torch.float8_e4m3fnuz,
+            ):
+                if (
+                    hasattr(self.quant_config, "weight_block_size")
+                    and self.quant_config.weight_block_size is not None
+                ):
+                    weight_block_size = self.quant_config.weight_block_size
+                    assert hasattr(self_attn.kv_b_proj, "weight_scale_inv")
+                    if _is_fp8_fnuz:
+                        weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                            weight=w,
+                            weight_scale=self_attn.kv_b_proj.weight_scale_inv,
+                            input_scale=None,
+                        )
+                    else:
+                        weight = w
+                        weight_scale = self_attn.kv_b_proj.weight_scale_inv
+
+                    if (
+                        _is_cuda
+                        and weight_block_size[0] == 128
+                        and weight_block_size[1] == 128
+                    ):
+                        if (
+                            deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM
+                            and not deep_gemm_wrapper.DEEPGEMM_BLACKWELL
+                            and get_bool_env_var("SGL_USE_DEEPGEMM_BMM", "false")
+                        ):
+                            block_scale = weight_scale
+                            use_deep_gemm_bmm = True
+                        else:
+                            w = block_quant_dequant(
+                                weight,
+                                weight_scale,
+                                weight_block_size,
+                                torch.bfloat16,
+                            )
+                    else:
+                        w, scale = block_quant_to_tensor_quant(
+                            weight, weight_scale, weight_block_size
+                        )
+                        self_attn.w_scale = scale
+                else:
+                    if _is_fp8_fnuz:
+                        weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                            weight=w,
+                            weight_scale=self_attn.kv_b_proj.weight_scale,
+                            input_scale=None,
+                        )
+                    else:
+                        weight = w
+                        weight_scale = self_attn.kv_b_proj.weight_scale
+
+                    w, scale = channel_quant_to_tensor_quant(weight, weight_scale)
+                    self_attn.w_scale = scale
+
+            if w.dtype == torch.int8:
+                if hasattr(self.quant_config, "weight_block_size"):
+                    # block-wise int8 need it
+                    weight_block_size = self.quant_config.weight_block_size
+                    if weight_block_size is not None:
+                        assert hasattr(self_attn.kv_b_proj, "weight_scale_inv")
+                        weight = w
+                        weight_scale = self_attn.kv_b_proj.weight_scale_inv
+                        w = int8_block_dequant(
+                            weight, weight_scale, weight_block_size
+                        ).to(torch.bfloat16)
+                else:
+                    # channel-wise int8 need it
+                    w = w.to(torch.bfloat16) * self_attn.kv_b_proj.weight_scale.to(
+                        torch.bfloat16
+                    )
+
+            w_kc, w_vc = w.unflatten(
+                0, (-1, self_attn.qk_nope_head_dim + self_attn.v_head_dim)
+            ).split([self_attn.qk_nope_head_dim, self_attn.v_head_dim], dim=1)
+            if not use_deep_gemm_bmm:
+                self_attn.w_kc = bind_or_assign(
+                    self_attn.w_kc, w_kc.transpose(1, 2).contiguous().transpose(1, 2)
+                )
+                self_attn.w_vc = bind_or_assign(
+                    self_attn.w_vc, w_vc.contiguous().transpose(1, 2)
+                )
+                if (
+                    hasattr(self_attn.kv_b_proj, "weight_scale")
+                    and self_attn.w_scale is None
+                ):
+                    self_attn.w_scale = bind_or_assign(
+                        self_attn.w_scale, self_attn.kv_b_proj.weight_scale
+                    )
+                    if _is_hip:
+                        self_attn.w_scale *= 2.0
+                # TODO: remove this after adding FP8 support in bmm cpu kernel
+                if _is_cpu and _is_cpu_amx_available and w.dtype == torch.float8_e4m3fn:
+                    self_attn.w_kc = (
+                        self_attn.w_kc.to(torch.bfloat16) * self_attn.w_scale
+                    )
+                    self_attn.w_vc = (
+                        self_attn.w_vc.to(torch.bfloat16) * self_attn.w_scale
+                    )
+            else:
+                num_tiles_k = self_attn.qk_nope_head_dim // weight_block_size[1]
+                num_tiles_n = self_attn.v_head_dim // weight_block_size[0]
+                ws_kc, ws_vc = block_scale.unflatten(
+                    0, (-1, (num_tiles_k + num_tiles_n))
+                ).split([num_tiles_k, num_tiles_n], dim=1)
+                self_attn.w_scale_k = bind_or_assign(
+                    self_attn.w_scale_k, ws_kc.transpose(1, 2).contiguous()
+                )
+                self_attn.w_scale_v = bind_or_assign(
+                    self_attn.w_scale_v, ws_vc.contiguous()
+                )
+                self_attn.w_kc = bind_or_assign(
+                    self_attn.w_kc, w_kc.transpose(1, 2).contiguous()
+                )
+                self_attn.w_vc = bind_or_assign(self_attn.w_vc, w_vc.contiguous())
+                self_attn.use_deep_gemm_bmm = True
+
+        if (
+            deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM
+            and deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0
+            and hasattr(self.quant_config, "weight_block_size")
+            and self.quant_config.weight_block_size is not None
+        ):
+            self._weight_requant_ue8m0(is_nextn)
+
+    @classmethod
+    def get_model_config_for_expert_location(cls, config):
+        num_groups = getattr(config, "n_group", 0)
+        from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation
+
+        return ModelConfigForExpertLocation(
+            num_layers=config.num_hidden_layers,
+            num_logical_experts=config.num_experts,
+            num_groups=None if num_groups == 0 else num_groups,
+        )
+
+    def _weight_requant_ue8m0(self, is_nextn=False):
+        weight_block_size = self.quant_config.weight_block_size
+
+        moe_layers = list(
+            range(
+                self.config.first_k_dense_replace,
+                self.config.num_hidden_layers,
+                self.config.moe_layer_freq,
+            )
+        )
+
+        num_hidden_layers = 1 if is_nextn else self.config.num_hidden_layers
+
+        for layer_id in range(num_hidden_layers):
+            if is_nextn:
+                layer = self.model.decoder
+            else:
+                layer = self.model.layers[layer_id]
+
+            module_list = [
+                layer.self_attn.kv_b_proj,
+                layer.self_attn.o_proj,
+            ]
+
+            if self.config.q_lora_rank is not None:
+                module_list.append(layer.self_attn.fused_qkv_a_proj_with_mqa)
+                module_list.append(layer.self_attn.q_b_proj)
+            else:
+                module_list.append(layer.self_attn.kv_a_proj_with_mqa)
+                module_list.append(layer.self_attn.q_proj)
+
+            for module in module_list:
+                requant_weight_ue8m0_inplace(
+                    module.weight, module.weight_scale_inv, weight_block_size
+                )
+
+            if layer_id in moe_layers or is_nextn:
+                shared_experts = getattr(layer.mlp, "shared_experts", None)
+                if shared_experts is not None:
+                    for module in [
+                        shared_experts.gate_up_proj,
+                        shared_experts.down_proj,
+                    ]:
+                        requant_weight_ue8m0_inplace(
+                            module.weight, module.weight_scale_inv, weight_block_size
+                        )
+
+                experts = layer.mlp.experts
+                if isinstance(experts, DeepEPMoE):
+                    for w in [
+                        experts.w13_weight_fp8,
+                        experts.w2_weight_fp8,
+                    ]:
+                        requant_weight_ue8m0_inplace(w[0], w[1], weight_block_size)
+            else:
+                mlp = layer.mlp
+                assert isinstance(mlp, DeepseekV2MLP)
+                for module in [
+                    mlp.gate_up_proj,
+                    mlp.down_proj,
+                ]:
+                    requant_weight_ue8m0_inplace(
+                        module.weight, module.weight_scale_inv, weight_block_size
+                    )
+
+    def get_decoder_attention_types(self):
+        return self.model.decoder_attention_types
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[torch.Tensor, PPProxyTensors]:
+        hidden_states = self.model(
+            input_ids=input_ids,
+            positions=positions,
+            inputs_embeds=inputs_embeds,
+            forward_batch=forward_batch,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+        if self.pp_group.is_last_rank:
+            return self.logits_processor(
+                input_ids, hidden_states.float(), self.lm_head, forward_batch
+            )
+        else:
+            return hidden_states
+
+    def load_weights(
+        self, weights: Iterable[Tuple[str, torch.Tensor]], is_nextn=False
+    ) -> Set[str]:
+        def load_linear_attn_weight(
+            name: str, loaded_weight: torch.Tensor, self
+        ) -> None:
+            if is_pp_missing_parameter(name, self):
+                return
+            param = params_dict[name]
+            weight_loader = getattr(
+                param, "weight_loader", BailingMoELinearAttention.weight_direct_load
+            )
+            weight_loader = weight_loader_with_alias(name)(weight_loader)
+            weight_loader(param, loaded_weight)
+            return
+
+        if is_nextn:
+            if hasattr(self.config, "num_nextn_predict_layers"):
+                num_nextn_layers = self.config.num_nextn_predict_layers
+                assert num_nextn_layers == 1, "Only 1 nextn layer is supported"
+                # compatible with old design
+                nextn_layer_id = (
+                    0
+                    if self.config.num_hidden_layers == 1
+                    else self.config.num_hidden_layers
+                )
+            else:
+                raise ValueError("num nextn_predict_layers is not in the config")
+
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.num_experts,
+        )
+
+        if is_nextn:
+            nextn_layer_prefix = f"model.layers.{nextn_layer_id}"
+            nextn_spec_weight_names = [
+                "final_layernorm",
+                "eh_proj",
+                "enorm",
+                "hnorm",
+            ]
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: Set[str] = set()
+        weight_names = []
+        fuse_qkv_a_proj = hasattr(self.config, "q_lora_rank") and (
+            self.config.q_lora_rank is not None
+        )
+        cached_a_proj = {} if fuse_qkv_a_proj else None
+
+        for name, loaded_weight in weights:
+            if name.startswith("model.mtp"):
+                continue
+            layer_idx = None
+            if "model.layers." in name:
+                layer_idx = int(name.split(".")[2])
+            if (
+                ("v_head" in name)
+                or ("inv_freq" in name)
+                or (self.config.tie_word_embeddings and "lm_head" in name)
+            ):
+                continue
+
+            weight_names.append(name)
+
+            if is_nextn:
+                if not name.startswith(nextn_layer_prefix):
+                    continue
+
+                    # Use shared head and embed weights from target model
+                if "shared_head.head" in name or "embed_tokens" in name:
+                    continue
+
+                is_decoder = True
+                # For nextn specific weights
+                for weight_name in nextn_spec_weight_names:
+                    if weight_name in name:
+                        name = name.replace(nextn_layer_prefix, "model")
+                        is_decoder = False
+                        break
+                # For decoder layer weights
+                if is_decoder:
+                    name = name.replace(nextn_layer_prefix, "model.decoder")
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                if "mlp.experts" in name:
+                    continue
+
+                name = name.replace(weight_name, param_name)
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    continue
+                if is_pp_missing_parameter(name, self):
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+
+                    if name not in params_dict:
+                        continue
+                    if is_pp_missing_parameter(name, self):
+                        continue
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    if "slope" in name:
+                        continue
+
+                    if fuse_qkv_a_proj and (
+                        "q_a_proj" in name or "kv_a_proj_with_mqa" in name
+                    ):
+                        cached_a_proj[name] = loaded_weight
+                        q_a_proj_name = (
+                            name
+                            if "q_a_proj" in name
+                            else name.replace("kv_a_proj_with_mqa", "q_a_proj")
+                        )
+                        kv_a_proj_name = (
+                            name
+                            if "kv_a_proj_with_mqa" in name
+                            else name.replace("q_a_proj", "kv_a_proj_with_mqa")
+                        )
+
+                        # When both q_a_proj and kv_a_proj_with_mqa has been cached, load the fused weight to parameter
+                        if (
+                            q_a_proj_name in cached_a_proj
+                            and kv_a_proj_name in cached_a_proj
+                        ):
+                            q_a_proj_weight = cached_a_proj[q_a_proj_name]
+                            kv_a_proj_weight = cached_a_proj[kv_a_proj_name]
+                            cat_dim = 0
+                            if self.quant_config is not None and (
+                                self.quant_config.get_name() == "awq"
+                                or self.quant_config.get_name() == "awq_marlin"
+                                or self.quant_config.get_name() == "moe_wna16"
+                            ):
+                                cat_dim = 1
+                            fused_weight = torch.cat(
+                                [q_a_proj_weight, kv_a_proj_weight], dim=cat_dim
+                            )
+                            param_name = (
+                                name.replace("q_a_proj", "fused_qkv_a_proj_with_mqa")
+                                if "q_a_proj" in name
+                                else name.replace(
+                                    "kv_a_proj_with_mqa",
+                                    "fused_qkv_a_proj_with_mqa",
+                                )
+                            )
+                            if param_name not in params_dict:
+                                continue
+                            param = params_dict[param_name]
+                            weight_loader = getattr(
+                                param, "weight_loader", default_weight_loader
+                            )
+
+                            weight_loader(param, fused_weight)
+                            cached_a_proj.pop(q_a_proj_name)
+                            cached_a_proj.pop(kv_a_proj_name)
+                    else:
+
+                        if name not in params_dict:
+                            name = name.replace(".dense.", ".o_proj.")
+                            if name not in params_dict:
+                                continue
+                        if is_pp_missing_parameter(name, self):
+                            continue
+                        if (
+                            "attention" in name
+                            and "slope" not in name
+                            and is_linear_layer(layer_idx, self.model.layer_group_size)
+                        ):
+                            load_linear_attn_weight(name, loaded_weight, self)
+                            loaded_params.add(name)
+                            continue
+
+                        param = params_dict[name]
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        self.post_load_weights(is_nextn=is_nextn, weight_names=weight_names)
+
+        return loaded_params
+
+
+class BailingMoeV2_5ForCausalLM(BailingMoELinearForCausalLM):
+    pass
+
+
+EntryClass = [
+    BailingMoeV2_5ForCausalLM,
+]
diff --git a/sglang/python/sglang/srt/models/bailing_moe_nextn.py b/sglang/python/sglang/srt/models/bailing_moe_nextn.py
new file mode 100644
index 0000000000000000000000000000000000000000..fcbae6bbad4de16f45e47a2c615c3ab92c1b811f
--- /dev/null
+++ b/sglang/python/sglang/srt/models/bailing_moe_nextn.py
@@ -0,0 +1,250 @@
+# coding=utf-8
+# Copyright 2023 Antgroup and The HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""SGLang BailingMoENextN model."""
+
+import logging
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.dp_attention import is_dp_attention_enabled
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import ReplicatedLinear
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.bailing_moe import BailingMoEBlock, BailingMoEForCausalLM
+from sglang.srt.models.bailing_moe_linear import (
+    BailingMoELinearDecoderLayer,
+    BailingMoeV2_5ForCausalLM,
+)
+from sglang.srt.models.utils import WeightsMapper
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import BumpAllocator, add_prefix
+
+LoraConfig = None
+logger = logging.getLogger(__name__)
+
+
+class BailingMoEModelNextN(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.layer_group_size = 1
+        self.start_layer = 0
+        self.end_layer = 1
+        self.total_num_layers = 1
+        self.vocab_size = config.vocab_size
+        config.for_nextn_model = True
+
+        if quant_config is not None and quant_config.get_name() == "modelopt_fp4":
+            logger.warning(
+                "Overriding DeepseekV3ForCausalLMNextN quant config for modelopt_fp4 Deepseek model."
+            )
+            quant_config = None
+
+        self.vocab_size = config.vocab_size
+
+        self.word_embeddings = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            enable_tp=not is_dp_attention_enabled(),
+            prefix=add_prefix("word_embeddings", prefix),
+        )
+
+        self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.hnorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        self.eh_proj = ReplicatedLinear(
+            2 * config.hidden_size,
+            config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix(f"layers.{config.num_hidden_layers}.eh_proj", prefix),
+        )
+
+        self.is_hybrid = (
+            hasattr(config, "model_type") and config.model_type == "bailing_hybrid"
+        )
+        if self.is_hybrid:
+            config.attention_type = 1
+            self.decoder = BailingMoELinearDecoderLayer(
+                config,
+                quant_config=quant_config,
+                layer_id=0,
+                is_nextn=True,
+                prefix=add_prefix(f"layers.{config.num_hidden_layers}", prefix),
+            )
+        else:
+            self.decoder = BailingMoEBlock(
+                config,
+                0,
+                quant_config=quant_config,
+                # is_nextn=True,
+                prefix=add_prefix("decoder", prefix),
+            )
+
+        self.shared_head = nn.Module()
+        self.final_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+
+        if input_embeds is None:
+            hidden_states = self.word_embeddings(input_ids)
+        else:
+            hidden_states = input_embeds
+
+        if hidden_states.shape[0] > 0:
+            hidden_states, _ = self.eh_proj(
+                torch.cat(
+                    (
+                        self.enorm(hidden_states),
+                        self.hnorm(
+                            forward_batch.spec_info.hidden_states.to(
+                                self.hnorm.weight.dtype
+                            )
+                        ),
+                    ),
+                    dim=-1,
+                )
+            )
+
+        residual = None
+        if self.is_hybrid:
+            device = input_ids.device
+            zero_allocator = BumpAllocator(
+                buffer_size=self.total_num_layers
+                * 2
+                * (2 if forward_batch.can_run_tbo else 1),
+                dtype=torch.float32,
+                device=device,
+            )
+            hidden_states, residual = self.decoder(
+                hidden_states=hidden_states,
+                positions=positions,
+                forward_batch=forward_batch,
+                residual=residual,
+                zero_allocator=zero_allocator,
+            )
+        else:
+            hidden_states, residual = self.decoder(
+                positions, hidden_states, forward_batch, residual
+            )
+
+        if not forward_batch.forward_mode.is_idle():
+            if residual is not None:
+                hidden_states, _ = self.final_layernorm(hidden_states, residual)
+            else:
+                hidden_states = self.final_layernorm(hidden_states)
+
+        return hidden_states
+
+
+class BailingMoeForCausalLMNextN(nn.Module):
+
+    packed_modules_mapping = {
+        "fused_qkv_a_proj_with_mqa": ["q_a_proj", "kv_a_proj_with_mqa"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
+    # To ensure correct weight loading and mapping.
+    hf_to_sglang_mapper = WeightsMapper(
+        orig_to_new_substr={
+            "attention.dense": "attention.o_proj",
+        },
+    )
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        nn.Module.__init__(self)
+        self.config = config
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.quant_config = quant_config
+        if hasattr(self, "determine_num_fused_shared_experts"):
+            # Asystem has determine_num_fused_shared_experts but theta does not.
+            self.determine_num_fused_shared_experts("BailingMoeForCausalLMNextN")
+
+        self.model = BailingMoEModelNextN(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("model.shared_head.head", prefix),
+            use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
+        )
+        self.logits_processor = LogitsProcessor(config)
+        if hasattr(self.config, "model_type") and config.model_type == "bailing_hybrid":
+            self.base_load_weights_func = BailingMoeV2_5ForCausalLM.load_weights
+            self.post_load_weights_func = BailingMoeV2_5ForCausalLM.post_load_weights
+        else:
+            self.base_load_weights_func = BailingMoEForCausalLM.load_weights
+            self.post_load_weights_func = BailingMoEForCausalLM.post_load_weights
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def set_embed_and_head(self, embed, head):
+        """Used by the eagle_worker."""
+        del self.model.word_embeddings.weight
+        del self.lm_head.weight
+        self.model.word_embeddings.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        self.base_load_weights_func(self, weights, is_nextn=True)
+
+    def post_load_weights(self, is_nextn=False, weight_names=None):
+        self.post_load_weights_func(self, is_nextn=is_nextn, weight_names=weight_names)
+
+
+EntryClass = [BailingMoeForCausalLMNextN]
diff --git a/sglang/python/sglang/srt/models/bert.py b/sglang/python/sglang/srt/models/bert.py
new file mode 100644
index 0000000000000000000000000000000000000000..976b69ab89552e609ba7d749d5d604a1485f1f58
--- /dev/null
+++ b/sglang/python/sglang/srt/models/bert.py
@@ -0,0 +1,504 @@
+# SPDX-License-Identifier: Apache-2.0
+from typing import Iterable, Optional, Set, Tuple
+
+import torch
+from torch import nn
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import get_act_fn
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.pooler import CrossEncodingPooler, Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import AttentionType, RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import add_prefix
+
+BertConfig = None
+
+
+class BertEmbedding(nn.Module):
+
+    def __init__(self, config: BertConfig):
+
+        super().__init__()
+        self.size = config.hidden_size
+        self.word_embeddings = VocabParallelEmbedding(
+            config.vocab_size, config.hidden_size
+        )
+        self.position_embeddings = VocabParallelEmbedding(
+            config.max_position_embeddings, config.hidden_size
+        )
+        self.token_type_embeddings = VocabParallelEmbedding(
+            config.type_vocab_size, config.hidden_size
+        )
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.position_ids = nn.Parameter(
+            torch.empty((1, config.max_position_embeddings)),
+        )
+
+        self.position_embedding_type = config.position_embedding_type
+        if self.position_embedding_type != "absolute":
+            raise ValueError(
+                "Only 'absolute' position_embedding_type" + " is supported"
+            )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        input_shape = input_ids.size()
+
+        # Input embeddings.
+        inputs_embeds = self.word_embeddings(input_ids)
+
+        # Position embeddings.
+        position_embeddings = self.position_embeddings(positions)
+
+        token_type_ids = forward_batch.token_type_ids
+
+        if token_type_ids is None:
+            token_type_ids = torch.zeros(
+                input_shape, dtype=torch.long, device=inputs_embeds.device
+            )
+
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+
+        embeddings = inputs_embeds + token_type_embeddings + position_embeddings
+        embeddings = self.LayerNorm(embeddings)
+        return embeddings
+
+
+class BertPooler(nn.Module):
+
+    def __init__(self, config: BertConfig):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(
+        self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
+    ) -> torch.Tensor:
+        # simply taking the hidden state corresponding
+        first_token_tensor = hidden_states[0, :]
+
+        pooled_output = self.dense(first_token_tensor)
+        pooled_output = self.activation(pooled_output)
+
+        return pooled_output
+
+
+class BertEncoder(nn.Module):
+
+    def __init__(
+        self,
+        config: BertConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.layer = nn.ModuleList(
+            [
+                BertLayer(
+                    config=config,
+                    layer_id=layer_idx,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.layer.{layer_idx}",
+                )
+                for layer_idx in range(config.num_hidden_layers)
+            ]
+        )
+
+    def forward(
+        self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
+    ) -> torch.Tensor:
+        for layer in self.layer:
+            hidden_states = layer(hidden_states, forward_batch)
+        return hidden_states
+
+
+class BertLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: BertConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.layer_id = layer_id
+
+        self.attention = BertAttention(
+            hidden_size=config.hidden_size,
+            num_attention_heads=config.num_attention_heads,
+            layer_id=layer_id,
+            layer_norm_eps=config.layer_norm_eps,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attention",
+        )
+
+        self.intermediate = BertIntermediate(
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=f"{prefix}.intermediate",
+        )
+
+        self.output = BertOutput(
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            layer_norm_eps=config.layer_norm_eps,
+            quant_config=quant_config,
+            prefix=f"{prefix}.output",
+        )
+
+    def forward(self, hidden_states: torch.Tensor, forward_batch: ForwardBatch):
+        attn_output = self.attention(hidden_states, forward_batch)
+        intermediate_output = self.intermediate(attn_output)
+        output = self.output(intermediate_output, attn_output)
+
+        return output
+
+
+class BertAttention(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_attention_heads: int,
+        layer_norm_eps: float,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.self_attn = BertSelfAttention(
+            hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=f"{prefix}.output",
+        )
+
+        self.output = BertSelfOutput(
+            hidden_size=hidden_size,
+            layer_norm_eps=layer_norm_eps,
+            quant_config=quant_config,
+            prefix=f"{prefix}.output",
+        )
+
+    def forward(
+        self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
+    ) -> torch.Tensor:
+        self_output = self.self_attn(hidden_states, forward_batch)
+        return self.output(self_output, hidden_states)
+
+
+class BertSelfAttention(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_attention_heads: int,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+
+        self.total_num_heads = num_attention_heads
+        assert self.total_num_heads % tp_size == 0
+
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = self.total_num_heads
+        self.head_dim = self.hidden_size // self.total_num_heads
+        assert self.head_dim * self.total_num_heads == self.hidden_size
+
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size=self.hidden_size,
+            head_size=self.head_dim,
+            total_num_heads=self.total_num_heads,
+            total_num_kv_heads=self.total_num_kv_heads,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+
+        self.attn = RadixAttention(
+            num_heads=self.num_heads,
+            head_dim=self.head_dim,
+            scaling=self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            prefix=f"{prefix}.attn",
+            attn_type=AttentionType.ENCODER_ONLY,
+        )
+
+    def forward(
+        self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        output = self.attn(q, k, v, forward_batch)
+        return output
+
+
+class BertSelfOutput(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        layer_norm_eps: float,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.dense = RowParallelLinear(
+            input_size=hidden_size,
+            output_size=hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.dense",
+        )
+        self.LayerNorm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)
+
+    def forward(
+        self, hidden_states: torch.Tensor, input_tensor: torch.Tensor
+    ) -> torch.Tensor:
+        hidden_states, _ = self.dense(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+class BertIntermediate(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.dense = ColumnParallelLinear(
+            input_size=hidden_size,
+            output_size=intermediate_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.dense",
+        )
+        self.intermediate_act_fn = get_act_fn(hidden_act)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states, _ = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+
+
+class BertOutput(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        layer_norm_eps: float,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.dense = RowParallelLinear(
+            input_size=intermediate_size,
+            output_size=hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.dense",
+        )
+
+        self.LayerNorm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)
+
+    def forward(
+        self, hidden_states: torch.Tensor, input_tensor: torch.Tensor
+    ) -> torch.Tensor:
+        hidden_states, _ = self.dense(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+class BertModel(nn.Module):
+
+    def __init__(
+        self,
+        *,
+        config: BertConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        use_bert_pooler: bool = False,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.use_bert_pooler = use_bert_pooler
+        self.config = config
+        self.embeddings = BertEmbedding(config)
+        self.encoder = BertEncoder(
+            config=config,
+            quant_config=quant_config,
+            prefix=add_prefix("encoder", prefix),
+        )
+        pooling_type = (
+            PoolingType.CLS
+            if get_global_server_args().is_embedding
+            else PoolingType.LAST
+        )
+        self.pooler = (
+            BertPooler(config)
+            if self.use_bert_pooler
+            else Pooler(pooling_type=pooling_type, normalize=True)
+        )
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = False,
+    ) -> torch.Tensor:
+        assert get_embedding == True
+        # Your tokenized IDs
+
+        hidden_states = self.embeddings(
+            input_ids=input_ids,
+            positions=positions,
+            forward_batch=forward_batch,
+        )
+
+        hidden_states = self.encoder(hidden_states, forward_batch=forward_batch)
+
+        if not self.use_bert_pooler:
+            hidden_states = self.pooler(hidden_states, forward_batch)
+
+        return hidden_states
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "query", "q"),
+            ("qkv_proj", "key", "k"),
+            ("qkv_proj", "value", "v"),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            name = name.replace("self", "self_attn")
+            if not self.use_bert_pooler and "pooler" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+class Contriever(BertModel):
+    pass
+
+
+class BertForSequenceClassification(nn.Module):
+
+    def __init__(
+        self,
+        *,
+        config: BertConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.num_labels = config.num_labels
+        self.bert = BertModel(
+            config=config,
+            quant_config=quant_config,
+            use_bert_pooler=True,
+            prefix=add_prefix("bert", prefix),
+        )
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+        self.pooler = CrossEncodingPooler(config, self.classifier, self.bert.pooler)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        self_weights = []
+
+        def weight_filter():
+            for name, weight in weights:
+                if name.startswith("bert."):
+                    yield (name[len("bert.") :], weight)
+                else:
+                    self_weights.append((name, weight))
+
+        self.bert.load_weights(weight_filter())
+
+        params_dict = dict(self.named_parameters())
+
+        for name, loaded_weight in self_weights:
+            if name.startswith("classifier"):
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = False,
+    ) -> torch.Tensor:
+        assert get_embedding == True
+
+        hidden_states = self.bert(
+            input_ids=input_ids,
+            positions=positions,
+            forward_batch=forward_batch,
+            input_embeds=input_embeds,
+            get_embedding=get_embedding,
+        )
+        return self.pooler(hidden_states, forward_batch)
+
+
+EntryClass = [BertModel, Contriever, BertForSequenceClassification]
diff --git a/sglang/python/sglang/srt/models/chatglm.py b/sglang/python/sglang/srt/models/chatglm.py
new file mode 100644
index 0000000000000000000000000000000000000000..9cf585a02bde6f7f4682c8c6abdc1e861572c156
--- /dev/null
+++ b/sglang/python/sglang/srt/models/chatglm.py
@@ -0,0 +1,427 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from
+# https://github.com/THUDM/ChatGLM2-6B
+"""Inference-only ChatGLM model compatible with THUDM weights."""
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from torch.nn import LayerNorm
+
+from sglang.srt.configs import ChatGLMConfig
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix
+
+LoraConfig = None
+
+
+class GLMAttention(nn.Module):
+    def __init__(
+        self,
+        config,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = config.num_attention_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.multi_query_attention = config.multi_query_attention
+        self.total_num_kv_heads = (
+            config.multi_query_group_num
+            if config.multi_query_attention
+            else config.num_attention_heads
+        )
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = config.hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+
+        self.query_key_value = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=config.add_bias_linear or config.add_qkv_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("query_key_value", prefix),
+        )
+        self.dense = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            config.hidden_size,
+            bias=config.add_bias_linear,
+            quant_config=quant_config,
+            prefix=add_prefix("dense", prefix),
+        )
+
+        # https://huggingface.co/THUDM/chatglm3-6b-32k/blob/e210410255278dd9d74463cf396ba559c0ef801c/modeling_chatglm.py#L141
+        rope_ratio = getattr(config, "rope_ratio", 1.0)
+        max_positions = getattr(config, "seq_length", 8192)
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim // 2,
+            max_position=max_positions,
+            base=10000 * rope_ratio,
+            is_neox_style=False,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_ids: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.query_key_value(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(position_ids, q, k)
+        context_layer = self.attn(
+            q,
+            k,
+            v,
+            forward_batch,
+        )
+        attn_output, _ = self.dense(context_layer)
+        return attn_output
+
+
+class GLMMLP(nn.Module):
+    """MLP.
+
+    MLP will take the input with h hidden state, project it to 4*h
+    hidden dimension, perform nonlinear transformation, and project the
+    state back into h hidden dimension.
+    """
+
+    def __init__(
+        self,
+        config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.add_bias = config.add_bias_linear
+
+        # Project to 4h.
+        self.dense_h_to_4h = MergedColumnParallelLinear(
+            config.hidden_size,
+            [config.ffn_hidden_size] * 2,
+            bias=config.add_bias_linear,
+            quant_config=quant_config,
+            prefix=add_prefix("dense_h_to_4h", prefix),
+        )
+
+        self.activation_func = SiluAndMul()
+
+        # Project back to h.
+        self.dense_4h_to_h = RowParallelLinear(
+            config.ffn_hidden_size,
+            config.hidden_size,
+            bias=config.add_bias_linear,
+            quant_config=quant_config,
+            prefix=add_prefix("dense_4h_to_h", prefix),
+        )
+
+    def forward(self, hidden_states):
+        # [s, b, 4hp]
+        intermediate_parallel, _ = self.dense_h_to_4h(hidden_states)
+        intermediate_parallel = self.activation_func(intermediate_parallel)
+        # [s, b, h]
+        output, _ = self.dense_4h_to_h(intermediate_parallel)
+        return output
+
+
+class GLMBlock(nn.Module):
+    """A single transformer layer.
+
+    Transformer layer takes input with size [s, b, h] and returns an
+    output of the same size.
+    """
+
+    def __init__(
+        self,
+        config,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.apply_residual_connection_post_layernorm = (
+            config.apply_residual_connection_post_layernorm
+        )
+
+        self.fp32_residual_connection = config.fp32_residual_connection
+
+        layer_norm_func = RMSNorm if config.rmsnorm else LayerNorm
+        # Layernorm on the input data.
+        self.input_layernorm = layer_norm_func(
+            config.hidden_size, eps=config.layernorm_epsilon
+        )
+
+        # Self attention.
+        self.self_attention = GLMAttention(
+            config, layer_id, quant_config, prefix=add_prefix("self_attention", prefix)
+        )
+        self.hidden_dropout = config.hidden_dropout
+
+        # Layernorm on the attention output
+        self.post_attention_layernorm = layer_norm_func(
+            config.hidden_size, eps=config.layernorm_epsilon
+        )
+
+        # MLP
+        self.mlp = GLMMLP(config, quant_config, prefix=add_prefix("mlp", prefix))
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_ids: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        # hidden_states: [num_tokens, h]
+        # Layer norm at the beginning of the transformer layer.
+        layernorm_output = self.input_layernorm(hidden_states)
+        # Self attention.
+        attention_output = self.self_attention(
+            hidden_states=layernorm_output,
+            position_ids=position_ids,
+            forward_batch=forward_batch,
+        )
+
+        # Residual connection.
+        if self.apply_residual_connection_post_layernorm:
+            residual = layernorm_output
+        else:
+            residual = hidden_states
+
+        layernorm_input = residual + attention_output
+
+        # Layer norm post the self attention.
+        layernorm_output = self.post_attention_layernorm(layernorm_input)
+
+        # Second residual connection.
+        if self.apply_residual_connection_post_layernorm:
+            residual = layernorm_output
+        else:
+            residual = layernorm_input
+
+        output = self.mlp(layernorm_output) + residual
+
+        return output
+
+
+class GLMTransformer(nn.Module):
+    """Transformer class."""
+
+    def __init__(
+        self,
+        config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.post_layer_norm = config.post_layer_norm
+
+        # Number of layers.
+        self.num_layers = config.num_layers
+
+        # Transformer layers.
+        self.layers = nn.ModuleList(
+            [
+                GLMBlock(
+                    config,
+                    i,
+                    quant_config,
+                    prefix=add_prefix(f"layers.{i}", prefix),
+                )
+                for i in range(self.num_layers)
+            ]
+        )
+
+        if self.post_layer_norm:
+            layer_norm_func = RMSNorm if config.rmsnorm else LayerNorm
+            # Final layer norm before output.
+            self.final_layernorm = layer_norm_func(
+                config.hidden_size, eps=config.layernorm_epsilon
+            )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_ids: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        for i in range(self.num_layers):
+            layer = self.layers[i]
+            hidden_states = layer(
+                hidden_states=hidden_states,
+                position_ids=position_ids,
+                forward_batch=forward_batch,
+            )
+        # Final layer norm.
+        if self.post_layer_norm:
+            hidden_states = self.final_layernorm(hidden_states)
+
+        return hidden_states
+
+
+class ChatGLMM(nn.Module):
+    def __init__(
+        self,
+        config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.embedding = VocabParallelEmbedding(
+            config.padded_vocab_size,
+            config.hidden_size,
+            prefix=add_prefix("embedding", prefix),
+        )
+
+        self.num_layers = config.num_layers
+        self.multi_query_group_num = config.multi_query_group_num
+        self.kv_channels = config.kv_channels
+        self.encoder = GLMTransformer(
+            config, quant_config, add_prefix("encoder", prefix)
+        )
+
+        self.output_layer = ParallelLMHead(
+            config.padded_vocab_size,
+            config.hidden_size,
+            prefix=add_prefix("output_layer", prefix),
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        inputs_embeds = self.embedding(input_ids)
+
+        # Run encoder.
+        hidden_states = self.encoder(
+            hidden_states=inputs_embeds,
+            position_ids=position_ids,
+            forward_batch=forward_batch,
+        )
+        return hidden_states
+
+
+class ChatGLMForCausalLM(nn.Module):
+    packed_modules_mapping = {
+        "query_key_value": ["query_key_value"],
+        "dense_h_to_4h": ["dense_h_to_4h"],
+    }
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "query_key_value",
+        "dense",
+        "dense_h_to_4h",
+        "dense_4h_to_h",
+    ]
+    embedding_modules = {}
+    embedding_padding_modules = []
+
+    def __init__(
+        self,
+        config: ChatGLMConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config: ChatGLMConfig = config
+        self.quant_config = quant_config
+        self.max_position_embeddings = getattr(config, "max_sequence_length", 8192)
+        self.transformer = ChatGLMM(
+            config, quant_config, prefix=add_prefix("transformer", prefix)
+        )
+        self.lm_head = self.transformer.output_layer
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.transformer(input_ids, positions, forward_batch)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in weights:
+            if "rotary_pos_emb.inv_freq" in name:
+                continue
+            if "word_embeddings" in name:
+                name = name.replace(".word_embeddings", "")
+            # Skip loading extra bias for GPTQ models.
+            if name.endswith(".bias") and name not in params_dict:
+                continue
+            param = params_dict[name]
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            weight_loader(param, loaded_weight)
+
+
+class ChatGLMModel(ChatGLMForCausalLM):
+    pass
+
+
+EntryClass = [ChatGLMModel]
diff --git a/sglang/python/sglang/srt/models/clip.py b/sglang/python/sglang/srt/models/clip.py
new file mode 100644
index 0000000000000000000000000000000000000000..9294e6f8807fe1bc61ca09ee0fde20e3e80756f9
--- /dev/null
+++ b/sglang/python/sglang/srt/models/clip.py
@@ -0,0 +1,559 @@
+# Adapted from
+# https://github.com/huggingface/transformers/blob/af9b2eaa54c150741f298d6db939af6328e1dc38/src/transformers/models/clip/modeling_clip.py
+
+from functools import partial
+from typing import Iterable, List, Optional, Tuple, Type, Union
+
+import torch
+import torch.nn as nn
+from transformers import CLIPConfig, CLIPTextConfig, CLIPVisionConfig
+from transformers.modeling_attn_mask_utils import _create_4d_causal_attention_mask
+
+from sglang.srt.layers.activation import QuickGELU
+from sglang.srt.layers.attention.vision import VisionAttention
+from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
+from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.schedule_batch import MultimodalInputs
+from sglang.srt.model_executor.model_runner import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix, flatten_nested_list
+
+
+class CLIPVisionEmbeddings(nn.Module):
+
+    def __init__(self, config: CLIPVisionConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+        assert self.image_size % self.patch_size == 0
+
+        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))
+
+        self.patch_embedding = nn.Conv2d(
+            in_channels=config.num_channels,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+            bias=False,
+        )
+
+        self.num_patches = (self.image_size // self.patch_size) ** 2
+        self.num_positions = self.num_patches + 1
+        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+        self.register_buffer(
+            "position_ids",
+            torch.arange(self.num_positions).expand((1, -1)),
+            persistent=False,
+        )
+
+    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        batch_size = pixel_values.shape[0]
+        target_dtype = self.patch_embedding.weight.dtype
+        patch_embeds = self.patch_embedding(
+            pixel_values.to(dtype=target_dtype)
+        )  # shape = [*, width, grid, grid]
+        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
+
+        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
+        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
+        embeddings = embeddings + self.position_embedding(self.position_ids)
+
+        return embeddings
+
+
+class CLIPTextEmbeddings(nn.Module):
+    def __init__(self, config: CLIPTextConfig):
+        super().__init__()
+        embed_dim = config.hidden_size
+
+        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
+        self.position_embedding = nn.Embedding(
+            config.max_position_embeddings, embed_dim
+        )
+
+        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+        self.register_buffer(
+            "position_ids",
+            torch.arange(config.max_position_embeddings).expand((1, -1)),
+            persistent=False,
+        )
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+    ) -> torch.Tensor:
+        seq_length = (
+            input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
+        )
+
+        if position_ids is None:
+            position_ids = self.position_ids[:, :seq_length]
+
+        if inputs_embeds is None:
+            inputs_embeds = self.token_embedding(input_ids)
+
+        position_embeddings = self.position_embedding(position_ids)
+        embeddings = inputs_embeds + position_embeddings
+
+        return embeddings
+
+
+class CLIPMLP(nn.Module):
+
+    def __init__(
+        self,
+        config,
+        act_layer: Type[nn.Module] = QuickGELU,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.fc1 = ColumnParallelLinear(
+            config.hidden_size,
+            config.intermediate_size,
+            quant_config=quant_config,
+            prefix=add_prefix("fc1", prefix),
+        )
+        self.act = act_layer()
+        self.fc2 = RowParallelLinear(
+            config.intermediate_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("fc2", prefix),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x_parallel, _ = self.fc1(x)
+        x_parallel = self.act(x_parallel)
+        x, _ = self.fc2(x_parallel)
+        return x
+
+
+class CLIPEncoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: CLIPVisionConfig,
+        act_layer: Type[nn.Module] = QuickGELU,
+        norm_layer: Type[nn.Module] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        if norm_layer is None:
+            norm_layer = partial(nn.LayerNorm, eps=config.layer_norm_eps)
+        self.layer_norm1 = norm_layer(config.hidden_size)
+        self.layer_norm2 = norm_layer(config.hidden_size)
+        self.self_attn = VisionAttention(
+            embed_dim=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            projection_size=config.hidden_size,
+            use_qkv_parallel=True,
+            flatten_batch=True,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.mlp = CLIPMLP(
+            config,
+            act_layer=act_layer,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        causal_attention_mask: torch.Tensor,
+    ) -> torch.Tensor:
+
+        residual = hidden_states
+        hidden_states = self.layer_norm1(hidden_states)
+        # CLIP text model uses both `causal_attention_mask` and `attention_mask`
+        if attention_mask is not None and causal_attention_mask is not None:
+            attn_mask = attention_mask + causal_attention_mask
+        elif causal_attention_mask is not None:
+            attn_mask = causal_attention_mask
+        else:
+            attn_mask = attention_mask
+        hidden_states = self.self_attn(
+            hidden_states,
+            attention_mask=attn_mask,
+            # causal_attention_mask=causal_attention_mask,
+        )
+
+        hidden_states = residual + hidden_states
+        residual = hidden_states
+        hidden_states = self.layer_norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+class CLIPEncoder(nn.Module):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self
+    attention layers. Each layer is a [`CLIPEncoderLayer`].
+
+    Args:
+        config: CLIPConfig
+    """
+
+    def __init__(
+        self,
+        config: CLIPVisionConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+
+        num_hidden_layers = config.num_hidden_layers
+        norm_layer = partial(nn.LayerNorm, eps=config.layer_norm_eps)
+        self.layers = nn.ModuleList(
+            [
+                CLIPEncoderLayer(
+                    config=config,
+                    norm_layer=norm_layer,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{layer_idx}", prefix),
+                )
+                for layer_idx in range(num_hidden_layers)
+            ]
+        )
+
+    def forward(
+        self,
+        inputs_embeds: torch.Tensor,
+        attention_mask: torch.Tensor = None,
+        causal_attention_mask: torch.Tensor = None,
+        return_all_hidden_states: bool = False,
+    ) -> Union[torch.Tensor, list[torch.Tensor]]:
+        hidden_states_pool = [inputs_embeds]
+        hidden_states = inputs_embeds
+
+        for encoder_layer in self.layers:
+            hidden_states = encoder_layer(
+                hidden_states, attention_mask, causal_attention_mask
+            )
+            if return_all_hidden_states:
+                hidden_states_pool.append(hidden_states)
+        if return_all_hidden_states:
+            return hidden_states_pool
+        return hidden_states
+
+
+class CLIPTextTransformer(nn.Module):
+    def __init__(
+        self,
+        config: CLIPTextConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        embed_dim = config.hidden_size
+        self.embeddings = CLIPTextEmbeddings(config)
+        self.encoder = CLIPEncoder(
+            config=config,
+            quant_config=quant_config,
+            prefix=add_prefix("encoder", prefix),
+        )
+        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+    @property
+    def device(self) -> torch.device:
+        return self.encoder.layers[0].layer_norm1.weight.device
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+    ):
+        input_shape = input_ids.size()
+        input_ids = input_ids.view(-1, input_shape[-1])
+        hidden_states = self.embeddings(input_ids, position_ids)
+        causal_attention_mask = _create_4d_causal_attention_mask(
+            input_ids.shape, hidden_states.dtype, device=hidden_states.device
+        )
+        encoder_outputs = self.encoder(
+            hidden_states, attention_mask, causal_attention_mask
+        )
+        last_hidden_state = self.final_layer_norm(encoder_outputs)
+        return last_hidden_state
+
+
+class CLIPTextModel(nn.Module):
+    def __init__(
+        self,
+        config: CLIPTextConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.text_model = CLIPTextTransformer(
+            config=config,
+            quant_config=quant_config,
+            prefix=add_prefix("text_model", prefix),
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+    ):
+        return self.text_model(input_ids, position_ids)
+
+
+class CLIPVisionTransformer(nn.Module):
+
+    def __init__(
+        self,
+        config: CLIPVisionConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        embed_dim = config.hidden_size
+
+        self.embeddings = CLIPVisionEmbeddings(config)
+
+        # NOTE: This typo of "layrnorm" is not fixed on purpose to match
+        # the original transformers code and name of the model weights.
+        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+        self.encoder = CLIPEncoder(
+            config=config,
+            quant_config=quant_config,
+            prefix=add_prefix("encoder", prefix),
+        )
+
+        num_hidden_layers = config.num_hidden_layers
+        if len(self.encoder.layers) > config.num_hidden_layers:
+            raise ValueError(
+                f"The original encoder only has {num_hidden_layers} "
+                f"layers, but you requested {len(self.encoder.layers)} layers."
+            )
+
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+    @property
+    def device(self) -> torch.device:
+        return self.encoder.layers[0].layer_norm1.weight.device
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+    ) -> torch.Tensor:
+        hidden_states = self.embeddings(pixel_values.to(self.device))
+        hidden_states = self.pre_layrnorm(hidden_states)
+
+        return_all_hidden_states = False
+
+        last_hidden_state = self.encoder(
+            inputs_embeds=hidden_states,
+            return_all_hidden_states=return_all_hidden_states,
+        )
+
+        last_hidden_state = self.post_layernorm(last_hidden_state)
+
+        return last_hidden_state
+
+
+class CLIPVisionModel(nn.Module):
+    def __init__(
+        self,
+        config: CLIPVisionConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.vision_model = CLIPVisionTransformer(
+            config, quant_config, prefix=add_prefix("vision_model", prefix)
+        )
+
+    @property
+    def device(self) -> torch.device:
+        return self.vision_model.device
+
+    def forward(self, pixel_values: torch.Tensor):
+        return self.vision_model(pixel_values)
+
+
+class CLIPModel(nn.Module):
+    def __init__(
+        self,
+        config: CLIPConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        if not isinstance(config.text_config, CLIPTextConfig):
+            raise TypeError(
+                "config.text_config is expected to be of type CLIPTextConfig but is of type"
+                f" {type(config.text_config)}."
+            )
+
+        if not isinstance(config.vision_config, CLIPVisionConfig):
+            raise TypeError(
+                "config.vision_config is expected to be of type CLIPVisionConfig but is of type"
+                f" {type(config.vision_config)}."
+            )
+
+        text_config = config.text_config
+        vision_config = config.vision_config
+
+        self.projection_dim = config.projection_dim
+        self.text_embed_dim = text_config.hidden_size
+        self.vision_embed_dim = vision_config.hidden_size
+        self.visual_projection = nn.Linear(
+            self.vision_embed_dim, self.projection_dim, bias=False
+        )
+        self.text_projection = nn.Linear(
+            self.text_embed_dim, self.projection_dim, bias=False
+        )
+        self.logit_scale = nn.Parameter(
+            torch.tensor(self.config.logit_scale_init_value)
+        )
+
+        text_model = CLIPTextModel(
+            text_config, quant_config, prefix=add_prefix("text_model", prefix)
+        )
+        vision_model = CLIPVisionModel(
+            vision_config, quant_config, prefix=add_prefix("vision_model", prefix)
+        )
+        self.text_model = text_model.text_model
+        self.vision_model = vision_model.vision_model
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+        monkey_patch_weight_loader()
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        get_embedding: bool = True,
+    ):
+        assert get_embedding, "CLIPEmbeddingModel is only used for embedding"
+        mm_inputs = []
+        if forward_batch.mm_inputs is not None:
+            mm_inputs = forward_batch.mm_inputs
+        pixel_values_list = [
+            item.feature
+            for item in flatten_nested_list(
+                [mm_input.mm_items for mm_input in mm_inputs if mm_input is not None]
+            )
+        ]
+        if len(pixel_values_list) != 0:
+            pixel_values = torch.concat(pixel_values_list)
+            vision_outputs = self.vision_model(pixel_values)
+            pooled_output = vision_outputs[:, 0, :]
+            image_embeds = self.visual_projection(pooled_output)
+            image_embeds = nn.functional.normalize(image_embeds, p=2, dim=1)
+            return EmbeddingPoolerOutput(embeddings=image_embeds)
+
+        else:
+            text_outputs = self.text_model(input_ids, position_ids=positions)
+            pooled_output = self.pooler(text_outputs[0], forward_batch)
+            return EmbeddingPoolerOutput(
+                embeddings=self.text_projection(pooled_output.embeddings)
+            )
+
+    def pad_input_ids(self, input_ids: List[int], image_inputs: MultimodalInputs):
+        # Clip embeddings models handle text/image separately, so we don't need to pad input ids
+        return input_ids
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "position_ids" in name:
+                continue
+            if "out_proj" in name:
+                name = name.replace("out_proj", "proj")
+            for param_name, shard_name, shard_id in stacked_params_mapping:
+                if shard_name not in name:
+                    continue
+                name = name.replace(shard_name, param_name)
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+# monkey patch weight loader to remove open_clip file
+def monkey_patch_weight_loader():
+    import glob
+    import os
+
+    from sglang.srt.model_loader.loader import DefaultModelLoader
+    from sglang.srt.model_loader.weight_utils import (
+        download_weights_from_hf,
+        filter_files_not_needed_for_inference,
+    )
+
+    def prepare_weights(
+        self, model_name_or_path: str, revision: Optional[str], fall_back_to_pt: bool
+    ) -> Tuple[str, List[str], bool]:
+        model_name_or_path = (
+            self._maybe_download_from_modelscope(model_name_or_path, revision)
+            or model_name_or_path
+        )
+
+        is_local = os.path.isdir(model_name_or_path)
+        use_safetensors = False
+        allow_patterns = ["*.bin"]
+
+        if not is_local:
+            hf_folder = download_weights_from_hf(
+                model_name_or_path,
+                self.load_config.download_dir,
+                allow_patterns,
+                revision,
+                ignore_patterns=self.load_config.ignore_patterns,
+            )
+        else:
+            hf_folder = model_name_or_path
+
+        hf_weights_files: List[str] = []
+        for pattern in allow_patterns:
+            hf_weights_files += glob.glob(os.path.join(hf_folder, pattern))
+
+        hf_weights_files = filter_files_not_needed_for_inference(hf_weights_files)
+
+        # remove open_clip file
+        hf_weights_files = [
+            file for file in hf_weights_files if "open_clip" not in file
+        ]
+
+        if len(hf_weights_files) == 0:
+            raise RuntimeError(
+                f"Cannot find any model weights with `{model_name_or_path}`"
+            )
+
+        return hf_folder, hf_weights_files, use_safetensors
+
+    setattr(DefaultModelLoader, "_prepare_weights", prepare_weights)
+
+
+EntryClass = CLIPModel
diff --git a/sglang/python/sglang/srt/models/commandr.py b/sglang/python/sglang/srt/models/commandr.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c799f5f84000ad055fca63a226b92b5bc548021
--- /dev/null
+++ b/sglang/python/sglang/srt/models/commandr.py
@@ -0,0 +1,432 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# Copyright 2024 Cohere and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/commandr.py#L1
+
+# This file is based on the LLama model definition file in transformers
+"""PyTorch Cohere model."""
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn.parameter import Parameter
+from transformers import Cohere2Config, CohereConfig, PretrainedConfig
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.utils import add_prefix, get_compiler_backend, set_weight_attrs
+
+
+@torch.compile(backend=get_compiler_backend())
+def layer_norm_func(hidden_states, weight, variance_epsilon):
+    input_dtype = hidden_states.dtype
+    hidden_states = hidden_states.to(torch.float32)
+    mean = hidden_states.mean(-1, keepdim=True)
+    variance = (hidden_states - mean).pow(2).mean(-1, keepdim=True)
+    hidden_states = (hidden_states - mean) * torch.rsqrt(variance + variance_epsilon)
+    hidden_states = weight.to(torch.float32) * hidden_states
+    return hidden_states.to(input_dtype)
+
+
+class LayerNorm(nn.Module):
+    def __init__(self, param_shape=None, eps=1e-5):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(param_shape))
+        self.variance_epsilon = eps
+        set_weight_attrs(self.weight, {"weight_loader": self.weight_loader})
+
+    def forward(self, hidden_states, residuals=None):
+        hidden_states = layer_norm_func(
+            hidden_states, self.weight, self.variance_epsilon
+        )
+        return hidden_states, residuals
+
+    def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
+        tp_rank = get_tensor_model_parallel_rank()
+        shard_dim = 0 if param.dim() != 1 else None
+        param_data = param.data
+        if shard_dim is not None:
+            shard_size = param_data.shape[shard_dim]
+            start_idx = tp_rank * shard_size
+            loaded_weight = loaded_weight.narrow(shard_dim, start_idx, shard_size)
+        assert param_data.shape == loaded_weight.shape
+        param_data.copy_(loaded_weight)
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaMLP Llama->Cohere
+class CohereMLP(nn.Module):
+    def __init__(
+        self,
+        config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_up_proj = MergedColumnParallelLinear(
+            self.hidden_size,
+            [self.intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            self.intermediate_size,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class CohereAttention(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        tp_size = get_tensor_model_parallel_world_size()
+        self.config = config
+        self.attention_dropout = config.attention_dropout
+        self.hidden_size = config.hidden_size
+        self.total_num_heads = config.num_attention_heads
+        self.num_heads = self.total_num_heads // tp_size
+        self.head_dim = self.hidden_size // self.total_num_heads
+        self.total_num_kv_heads = config.num_key_value_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.max_position_embeddings = getattr(
+            config, "model_max_length", None
+        ) or getattr(config, "max_position_embeddings", 8192)
+        self.rope_theta = config.rope_theta
+        self.rope_scaling = getattr(config, "rope_scaling", None)
+        self.use_qk_norm = getattr(config, "use_qk_norm", False)
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=self.max_position_embeddings,
+            base=self.rope_theta,
+            rope_scaling=self.rope_scaling,
+            is_neox_style=False,
+        )
+
+        self.v1 = isinstance(config, CohereConfig)
+        self.v2 = isinstance(config, Cohere2Config)
+
+        # Model v2 has interleaved sliding windows, v1 does not
+        if self.v2 and config.layer_types[layer_id] == "sliding_attention":
+            self.sliding_window_size = config.sliding_window
+        else:
+            self.sliding_window_size = -1
+
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            sliding_window_size=self.sliding_window_size,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+        if self.use_qk_norm:
+            self.q_norm = LayerNorm(
+                param_shape=(self.num_heads, self.head_dim), eps=config.layer_norm_eps
+            )
+            self.k_norm = LayerNorm(
+                param_shape=(self.num_kv_heads, self.head_dim),
+                eps=config.layer_norm_eps,
+            )
+
+    def _apply_qk_norm(self, q, k):
+        q = q.view(*q.shape[:-1], -1, self.head_dim)
+        k = k.view(*k.shape[:-1], -1, self.head_dim)
+        q, _ = self.q_norm(q)
+        k, _ = self.k_norm(k)
+        q = q.view(*q.shape[:-2], -1)
+        k = k.view(*k.shape[:-2], -1)
+        return q, k
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        if self.use_qk_norm:
+            q, k = self._apply_qk_norm(q, k)
+        # Model v1 uses RoPE throughout, Model v2 uses RoPE only for SWA layers
+        if self.v1 or self.sliding_window_size > 0:
+            q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class CohereDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        self.self_attn = CohereAttention(
+            config,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+
+        self.mlp = CohereMLP(
+            config,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.input_layernorm = LayerNorm(
+            param_shape=(config.hidden_size), eps=config.layer_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        residual = hidden_states
+        hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states_attention = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        hidden_states_mlp = self.mlp(hidden_states)
+        # Add everything together
+        hidden_states = residual + hidden_states_attention + hidden_states_mlp
+
+        return hidden_states, residual
+
+
+class CohereModel(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size, config.hidden_size
+        )
+        self.layers = nn.ModuleList(
+            [
+                CohereDecoderLayer(
+                    config,
+                    i,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{i}", prefix),
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = LayerNorm(
+            param_shape=(config.hidden_size), eps=config.layer_norm_eps
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.embed_tokens(input_ids)
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class CohereForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.logit_scale = getattr(config, "logit_scale", None)
+        self.logits_processor = LogitsProcessor(config, logit_scale=self.logit_scale)
+        self.model = CohereModel(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+        )
+        return self.logits_processor(
+            input_ids, hidden_states, self.model.embed_tokens, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        loaded_params = set()
+        for name, loaded_weight in weights:
+            for param_name, shard_name, shard_id in stacked_params_mapping:
+                if shard_name not in name:
+                    continue
+                name = name.replace(shard_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # lm_head is not used in vllm as it is tied with embed_token.
+                # To prevent errors, skip loading lm_head.weight.
+                if "lm_head.weight" in name:
+                    continue
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Remapping the name of FP8 kv-scale.
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+
+
+class Cohere2ForCausalLM(CohereForCausalLM):
+    pass
+
+
+EntryClass = [CohereForCausalLM, Cohere2ForCausalLM]
diff --git a/sglang/python/sglang/srt/models/dbrx.py b/sglang/python/sglang/srt/models/dbrx.py
new file mode 100644
index 0000000000000000000000000000000000000000..74de384b33957e7d899b9048898d4863430346ee
--- /dev/null
+++ b/sglang/python/sglang/srt/models/dbrx.py
@@ -0,0 +1,460 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from:
+# https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/dbrx.py#L1
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+import torch.nn as nn
+
+from sglang.srt.configs import DbrxConfig
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe
+from sglang.srt.layers.moe.moe_runner import MoeRunnerConfig
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    DEFAULT_VOCAB_PADDING_SIZE,
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.utils import add_prefix, set_weight_attrs
+
+
+class DbrxRouter(nn.Module):
+    """A Router implementation for DBRX that returns logits for each expert
+    per token.
+    """
+
+    def __init__(
+        self,
+        config: DbrxConfig,
+        params_dtype: Optional[torch.dtype] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.num_total_experts = config.ffn_config.moe_num_experts
+        self.d_model = config.d_model
+        self.layer = ReplicatedLinear(
+            self.d_model,
+            self.num_total_experts,
+            bias=False,
+            params_dtype=params_dtype,
+            quant_config=None,
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        router_logits, _ = self.layer(hidden_states)
+        return router_logits
+
+
+class DbrxExperts(nn.Module):
+    """A tensor-parallel MoE implementation for DBRX.
+
+    Each expert's weights are sharded across all ranks and a fused MoE
+    kernel is used for the forward pass, and finally we reduce the outputs
+    across ranks.
+    """
+
+    def __init__(
+        self,
+        config: DbrxConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        params_dtype: Optional[torch.dtype] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.num_total_experts = config.ffn_config.moe_num_experts
+        self.top_k = config.ffn_config.moe_top_k
+        self.d_model = config.d_model
+        self.intermediate_size = config.ffn_config.ffn_hidden_size // self.tp_size
+
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+        self.params_dtype = params_dtype
+
+        self.router = DbrxRouter(config, self.params_dtype)
+        self.topk = TopK(
+            self.top_k,
+            renormalize=True,
+        )
+        self.moe_runner_config = MoeRunnerConfig(inplace=True)
+        self.ws = nn.Parameter(
+            torch.empty(
+                self.num_total_experts,
+                2 * self.intermediate_size,
+                self.d_model,
+                device="cuda",
+                dtype=self.params_dtype,
+            )
+        )
+        self.w2s = nn.Parameter(
+            torch.empty(
+                self.num_total_experts,
+                self.d_model,
+                self.intermediate_size,
+                device="cuda",
+                dtype=self.params_dtype,
+            )
+        )
+
+        set_weight_attrs(
+            self.ws,
+            {
+                "weight_loader": self.weight_loader,
+            },
+        )
+        set_weight_attrs(
+            self.w2s,
+            {
+                "weight_loader": self.weight_loader,
+            },
+        )
+
+    def weight_loader(
+        self, param: nn.Parameter, loaded_weight: torch.Tensor, weight_name: str
+    ):
+        tp_rank = get_tensor_model_parallel_rank()
+        param_data = param.data
+        shard_size = self.intermediate_size
+        shard = slice(tp_rank * shard_size, (tp_rank + 1) * shard_size)
+        # DBRX uses GLU for each experts.
+        # GLU has 3 linear layers: w1, v1 and w2.
+        if weight_name.endswith("w1"):
+            loaded_weight = torch.reshape(
+                loaded_weight,
+                [-1, self.intermediate_size * self.tp_size, self.d_model],
+            )
+            param_data[:, 0:shard_size, :] = loaded_weight[:, shard, :]
+        if weight_name.endswith("v1"):
+            loaded_weight = torch.reshape(
+                loaded_weight,
+                [-1, self.intermediate_size * self.tp_size, self.d_model],
+            )
+            param_data[:, shard_size : 2 * shard_size, :] = loaded_weight[:, shard, :]
+        if weight_name.endswith("w2"):
+            loaded_weight = torch.reshape(
+                loaded_weight,
+                [-1, self.intermediate_size * self.tp_size, self.d_model],
+            ).transpose(1, 2)
+            param_data[:] = loaded_weight[:, :, shard]
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        num_tokens, hidden_size = hidden_states.shape
+        hidden_states = hidden_states.view(-1, self.d_model)
+        # router_logits: (num_tokens, n_experts)
+        router_logits = self.router(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        final_hidden_states = fused_moe(
+            hidden_states,
+            self.ws,
+            self.w2s,
+            topk_output,
+            self.moe_runner_config,
+        )
+
+        if self.tp_size > 1:
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+
+        return final_hidden_states.view(num_tokens, hidden_size)
+
+
+class DbrxAttention(nn.Module):
+    def __init__(
+        self,
+        config: DbrxConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.d_model = config.d_model
+        self.total_num_heads = config.n_heads
+        self.head_dim = self.d_model // self.total_num_heads
+        self.total_num_kv_heads = config.attn_config.kv_n_heads
+        self.clip_qkv = config.attn_config.clip_qkv
+        self.rope_theta = config.attn_config.rope_theta
+        self.max_position = config.max_seq_len
+
+        # pylint: disable=invalid-name
+        self.Wqkv = QKVParallelLinear(
+            self.d_model,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("Wqkv", prefix),
+        )
+        self.out_proj = RowParallelLinear(
+            self.d_model,
+            self.d_model,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("out_proj", prefix),
+        )
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=self.max_position,
+            base=int(self.rope_theta),
+            is_neox_style=True,
+        )
+
+        tp_world_size = get_tensor_model_parallel_world_size()
+        self.tp_size = tp_world_size
+        assert self.total_num_heads % tp_world_size == 0
+        self.num_heads = self.total_num_heads // tp_world_size
+        if self.total_num_kv_heads >= tp_world_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_world_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_world_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_world_size)
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        position_ids: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.Wqkv(hidden_states)
+        if self.clip_qkv is not None:
+            qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(position_ids, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        hidden_states, _ = self.out_proj(attn_output)
+        return hidden_states
+
+
+class DbrxFusedNormAttention(nn.Module):
+    def __init__(
+        self,
+        config: DbrxConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.d_model = config.d_model
+        self.attn = DbrxAttention(
+            config,
+            layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+        self.norm_1 = nn.LayerNorm(self.d_model)
+        self.norm_2 = nn.LayerNorm(self.d_model)
+
+    def forward(
+        self,
+        position_ids: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        residual = hidden_states
+        hidden_states = self.norm_1(hidden_states)
+        x = self.attn(
+            position_ids=position_ids,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        hidden_states = residual + x
+        residual = hidden_states
+        hidden_states = self.norm_2(hidden_states)
+        return hidden_states, residual
+
+
+class DbrxBlock(nn.Module):
+    def __init__(
+        self,
+        config: DbrxConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.norm_attn_norm = DbrxFusedNormAttention(
+            config,
+            layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("norm_attn_norm", prefix),
+        )
+        self.ffn = DbrxExperts(config, quant_config=quant_config)
+
+    def forward(
+        self,
+        position_ids: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states, residual = self.norm_attn_norm(
+            position_ids=position_ids,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        hidden_states = self.ffn(hidden_states)
+        hidden_states = hidden_states + residual
+        return hidden_states
+
+
+class DbrxModel(nn.Module):
+    def __init__(
+        self,
+        config: DbrxConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.wte = VocabParallelEmbedding(
+            config.vocab_size,
+            config.d_model,
+        )
+        self.blocks = nn.ModuleList(
+            [
+                DbrxBlock(
+                    config,
+                    i,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"blocks.{i}", prefix),
+                )
+                for i in range(config.n_layers)
+            ]
+        )
+        self.norm_f = nn.LayerNorm(config.d_model, eps=1e-5)
+        for module in self.modules():
+            if hasattr(module, "bias") and isinstance(module.bias, nn.Parameter):
+                # Remove the bias term in Linear and LayerNorm.
+                module.register_parameter("bias", None)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.wte(input_ids)
+        else:
+            hidden_states = input_embeds
+        for i in range(len(self.blocks)):
+            block = self.blocks[i]
+            hidden_states = block(position_ids, hidden_states, forward_batch)
+        hidden_states = self.norm_f(hidden_states)
+        return hidden_states
+
+
+class DbrxForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config: DbrxConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.unpadded_vocab_size = config.vocab_size
+        self.transformer = DbrxModel(
+            config, quant_config=quant_config, prefix=add_prefix("transformer", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.d_model,
+            org_num_embeddings=config.vocab_size,
+            padding_size=DEFAULT_VOCAB_PADDING_SIZE,
+            prefix=add_prefix("lm_head", prefix),
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.transformer(input_ids, positions, forward_batch)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        expert_params_mapping = [
+            (
+                "ws" if weight_name in ["w1", "v1"] else "w2s",
+                f"experts.mlp.{weight_name}",
+            )
+            for weight_name in ["w1", "v1", "w2"]
+        ]
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in weights:
+            for param_name, weight_name in expert_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, weight_name)
+                break
+            else:
+                # Remapping the name of FP8 kv-scale.
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = DbrxForCausalLM
diff --git a/sglang/python/sglang/srt/models/deepseek.py b/sglang/python/sglang/srt/models/deepseek.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef431e00d4603e7dd85b5a94135a84705619eec6
--- /dev/null
+++ b/sglang/python/sglang/srt/models/deepseek.py
@@ -0,0 +1,489 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from:
+# https://github.com/vllm-project/vllm/blob/14f91fe67c2342f2fe859dc6a5c40810df0e1c61/vllm/model_executor/models/deepseek.py
+"""Inference-only Deepseek model."""
+
+from typing import Any, Dict, Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe.fused_moe_triton import fused_moe
+from sglang.srt.layers.moe.moe_runner import MoeRunnerConfig
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix
+
+
+class DeepseekMLP(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        reduce_results: bool = True,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class DeepseekMoE(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.rank = get_tensor_model_parallel_rank()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.n_routed_experts = config.n_routed_experts
+        self.top_k = config.num_experts_per_tok
+        if self.tp_size > self.n_routed_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {self.n_routed_experts}."
+            )
+        self.topk = TopK(
+            top_k=self.top_k,
+            renormalize=config.norm_topk_prob,
+        )
+        self.experts = nn.ModuleList(
+            [
+                DeepseekMLP(
+                    hidden_size=config.hidden_size,
+                    intermediate_size=config.moe_intermediate_size,
+                    hidden_act=config.hidden_act,
+                    quant_config=quant_config,
+                    reduce_results=False,
+                    prefix=add_prefix(f"{idx}.experts", prefix),
+                )
+                for idx in range(self.n_routed_experts)
+            ]
+        )
+        self.pack_params()
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            self.n_routed_experts,
+            bias=False,
+            quant_config=None,
+            prefix=add_prefix("gate", prefix),
+        )
+
+        if config.n_shared_experts is not None:
+            intermediate_size = config.moe_intermediate_size * config.n_shared_experts
+            self.shared_experts = DeepseekMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                reduce_results=False,
+                prefix=add_prefix("shared_experts", prefix),
+            )
+
+    def pack_params(self):
+        w1 = []
+        w2 = []
+        for expert in self.experts:
+            w1.append(expert.gate_up_proj.weight)
+            w2.append(expert.down_proj.weight)
+        self.w1 = torch._utils._flatten_dense_tensors(w1)
+        w1s = torch._utils._unflatten_dense_tensors(self.w1, w1)
+        for data, param in zip(w1s, w1):
+            param.data = data
+        self.w1 = self.w1.view(len(w1), *w1s[0].shape)
+
+        self.w2 = torch._utils._flatten_dense_tensors(w2)
+        w2s = torch._utils._unflatten_dense_tensors(self.w2, w2)
+        for data, param in zip(w2s, w2):
+            param.data = data
+
+        self.w2 = self.w2.view(len(w2), *w2s[0].shape)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        num_tokens, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+        if self.config.n_shared_experts is not None:
+            shared_output = self.shared_experts(hidden_states)
+        # router_logits: (num_tokens, n_experts)
+        router_logits, _ = self.gate(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        final_hidden_states = fused_moe.fused_moe(
+            hidden_states,
+            w1=self.w1,
+            w2=self.w2,
+            topk_output=topk_output,
+            moe_runner_config=MoeRunnerConfig(inplace=True),
+        )
+
+        if self.config.n_shared_experts is not None:
+            final_hidden_states = final_hidden_states + shared_output
+        final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+
+        return final_hidden_states.view(num_tokens, hidden_dim)
+
+
+class DeepseekAttention(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class DeepseekDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        self.self_attn = DeepseekAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        if (
+            config.n_routed_experts is not None
+            and layer_id >= config.first_k_dense_replace
+            and layer_id % config.moe_layer_freq == 0
+        ):
+            self.mlp = DeepseekMoE(
+                config=config,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+            )
+        else:
+            self.mlp = DeepseekMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+            )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+class DeepseekModel(nn.Module):
+
+    fall_back_to_pt_during_load = False
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+        )
+        self.layers = nn.ModuleList(
+            [
+                DeepseekDecoderLayer(
+                    config,
+                    layer_id,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{layer_id}", prefix),
+                )
+                for layer_id in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions, hidden_states, forward_batch, residual
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class DeepseekForCausalLM(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = DeepseekModel(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("lm_head", prefix),
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.embed_tokens
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Skip experts that are not assigned to this worker.
+                if (
+                    "mlp.experts." in name or "mlp.shared_experts." in name
+                ) and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Skip experts that are not assigned to this worker.
+                if (
+                    "mlp.experts." in name or "mlp.shared_experts." in name
+                ) and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = DeepseekForCausalLM
diff --git a/sglang/python/sglang/srt/models/deepseek_janus_pro.py b/sglang/python/sglang/srt/models/deepseek_janus_pro.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8298e61f7aa08e8a25e46ad3ee0e5dcc4fdf4ae
--- /dev/null
+++ b/sglang/python/sglang/srt/models/deepseek_janus_pro.py
@@ -0,0 +1,2063 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Copied and Adapted from:
+# https://github.com/deepseek-ai/Janus
+
+
+import collections
+import math
+import os
+from dataclasses import field
+from enum import Enum
+from functools import partial
+from itertools import repeat
+from typing import (
+    Callable,
+    Final,
+    Iterable,
+    Literal,
+    Optional,
+    Sequence,
+    Set,
+    Tuple,
+    Type,
+    Union,
+)
+
+import torch
+import torch.nn.functional as F
+from einops import rearrange
+from torch import Tensor, _assert, nn
+from torch.nn.init import trunc_normal_
+from transformers import AutoModel, PreTrainedModel
+
+from sglang.srt.configs.janus_pro import *
+from sglang.srt.layers.attention.vision import VisionAttention
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization import QuantizationConfig
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternTokenPairs,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInputs
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.llama import LlamaForCausalLM
+from sglang.utils import logger
+
+#################################################################################
+#                              VQ Model Configs                                 #
+#################################################################################
+
+
+# Copied from:
+# https://github.com/deepseek-ai/Janus/tree/main/janus/models/vq_model.py
+@dataclass
+class ModelArgs:
+    codebook_size: int = 16384
+    codebook_embed_dim: int = 8
+    codebook_l2_norm: bool = True
+    codebook_show_usage: bool = True
+    commit_loss_beta: float = 0.25
+    entropy_loss_ratio: float = 0.0
+
+    encoder_ch_mult: List[int] = field(default_factory=lambda: [1, 1, 2, 2, 4])
+    decoder_ch_mult: List[int] = field(default_factory=lambda: [1, 1, 2, 2, 4])
+    z_channels: int = 256
+    dropout_p: float = 0.0
+
+
+def named_apply(
+    fn: Callable,
+    module: nn.Module,
+    name="",
+    depth_first: bool = True,
+    include_root: bool = False,
+) -> nn.Module:
+    if not depth_first and include_root:
+        fn(module=module, name=name)
+    for child_name, child_module in module.named_children():
+        child_name = ".".join((name, child_name)) if name else child_name
+        named_apply(
+            fn=fn,
+            module=child_module,
+            name=child_name,
+            depth_first=depth_first,
+            include_root=True,
+        )
+    if depth_first and include_root:
+        fn(module=module, name=name)
+    return module
+
+
+def VQ_16(**kwargs):
+    return VQModel(
+        ModelArgs(
+            encoder_ch_mult=[1, 1, 2, 2, 4], decoder_ch_mult=[1, 1, 2, 2, 4], **kwargs
+        )
+    )
+
+
+VQ_models = {"VQ-16": VQ_16}
+
+import collections.abc
+
+
+# From PyTorch internals
+def _ntuple(n):
+    def parse(x):
+        if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
+            return tuple(x)
+        return tuple(repeat(x, n))
+
+    return parse
+
+
+def _trunc_normal_(tensor, mean, std, a, b):
+    # Cut & paste from PyTorch official master until it's in a few official releases - RW
+    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
+    def norm_cdf(x):
+        # Computes standard normal cumulative distribution function
+        return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
+
+    if (mean < a - 2 * std) or (mean > b + 2 * std):
+        logger.warn(
+            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
+            "The distribution of values may be incorrect.",
+            stacklevel=2,
+        )
+
+    # Values are generated by using a truncated uniform distribution and
+    # then using the inverse CDF for the normal distribution.
+    # Get upper and lower cdf values
+    l = norm_cdf((a - mean) / std)
+    u = norm_cdf((b - mean) / std)
+
+    # Uniformly fill tensor with values from [l, u], then translate to
+    # [2l-1, 2u-1].
+    tensor.uniform_(2 * l - 1, 2 * u - 1)
+
+    # Use inverse cdf transform for normal distribution to get truncated
+    # standard normal
+    if tensor.dtype in [torch.float16, torch.bfloat16]:
+        # The `erfinv_` op is not (yet?) defined in float16+cpu, bfloat16+gpu
+        og_dtype = tensor.dtype
+        tensor = tensor.to(torch.float32)
+        tensor.erfinv_()
+        tensor = tensor.to(og_dtype)
+    else:
+        tensor.erfinv_()
+
+    # Transform to proper mean, std
+    tensor.mul_(std * math.sqrt(2.0))
+    tensor.add_(mean)
+
+    # Clamp to ensure it's in the proper range
+    if tensor.dtype == torch.float16:
+        # The `clamp_` op is not (yet?) defined in float16+cpu
+        tensor = tensor.to(torch.float32)
+        tensor.clamp_(min=a, max=b)
+    else:
+        tensor.clamp_(min=a, max=b)
+
+
+def trunc_normal_tf_(
+    tensor: torch.Tensor,
+    mean: float = 0.0,
+    std: float = 1.0,
+    a: float = -2.0,
+    b: float = 2.0,
+):
+    """Fills the input Tensor with values drawn from a truncated
+    normal distribution. The values are effectively drawn from the
+    normal distribution :math:`\\mathcal{N}(\text{mean}, \text{std}^2)`
+    with values outside :math:`[a, b]` redrawn until they are within
+    the bounds. The method used for generating the random values works
+    best when :math:`a \\leq \text{mean} \\leq b`.
+    NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
+    bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
+    and the result is subsequently scaled and shifted by the mean and std args.
+    Args:
+        tensor: an n-dimensional `torch.Tensor`
+        mean: the mean of the normal distribution
+        std: the standard deviation of the normal distribution
+        a: the minimum cutoff value
+        b: the maximum cutoff value
+    """
+    with torch.no_grad():
+        _trunc_normal_(tensor, 0, 1.0, a, b)
+        tensor.mul_(std).add_(mean)
+
+
+to_2tuple = _ntuple(2)
+
+
+class Format(str, Enum):
+    NCHW = "NCHW"
+    NHWC = "NHWC"
+    NCL = "NCL"
+    NLC = "NLC"
+
+
+def nchw_to(x: torch.Tensor, fmt: Format):
+    if fmt == Format.NHWC:
+        x = x.permute(0, 2, 3, 1)
+    elif fmt == Format.NLC:
+        x = x.flatten(2).transpose(1, 2)
+    elif fmt == Format.NCL:
+        x = x.flatten(2)
+    return x
+
+
+def resample_patch_embed(
+    patch_embed,
+    new_size: List[int],
+    interpolation: str = "bicubic",
+    antialias: bool = True,
+    verbose: bool = False,
+):
+    """Resample the weights of the patch embedding kernel to target resolution.
+    We resample the patch embedding kernel by approximately inverting the effect
+    of patch resizing.
+
+    Code based on:
+      https://github.com/google-research/big_vision/blob/b00544b81f8694488d5f36295aeb7972f3755ffe/big_vision/models/proj/flexi/vit.py
+
+    With this resizing, we can for example load a B/8 filter into a B/16 model
+    and, on 2x larger input image, the result will match.
+
+    Args:
+        patch_embed: original parameter to be resized.
+        new_size (tuple(int, int): target shape (height, width)-only.
+        interpolation (str): interpolation for resize
+        antialias (bool): use anti-aliasing filter in resize
+        verbose (bool): log operation
+    Returns:
+        Resized patch embedding kernel.
+    """
+    import numpy as np
+
+    try:
+        from torch import vmap
+    except ImportError:
+        from torch.func import vmap
+
+    assert len(patch_embed.shape) == 4, "Four dimensions expected"
+    assert len(new_size) == 2, "New shape should only be hw"
+    old_size = patch_embed.shape[-2:]
+    if tuple(old_size) == tuple(new_size):
+        return patch_embed
+
+    if verbose:
+        logger.info(
+            f"Resize patch embedding {patch_embed.shape} to {new_size}, w/ {interpolation} interpolation."
+        )
+
+    def resize(x_np, _new_size):
+        x_tf = torch.Tensor(x_np)[None, None, ...]
+        x_upsampled = F.interpolate(
+            x_tf, size=_new_size, mode=interpolation, antialias=antialias
+        )[0, 0, ...].numpy()
+        return x_upsampled
+
+    def get_resize_mat(_old_size, _new_size):
+        mat = []
+        for i in range(np.prod(_old_size)):
+            basis_vec = np.zeros(_old_size)
+            basis_vec[np.unravel_index(i, _old_size)] = 1.0
+            mat.append(resize(basis_vec, _new_size).reshape(-1))
+        return np.stack(mat).T
+
+    resize_mat = get_resize_mat(old_size, new_size)
+    resize_mat_pinv = torch.tensor(
+        np.linalg.pinv(resize_mat.T), device=patch_embed.device
+    )
+
+    def resample_kernel(kernel):
+        resampled_kernel = resize_mat_pinv @ kernel.reshape(-1)
+        return resampled_kernel.reshape(new_size)
+
+    v_resample_kernel = vmap(vmap(resample_kernel, 0, 0), 1, 1)
+    orig_dtype = patch_embed.dtype
+    patch_embed = patch_embed.float()
+    patch_embed = v_resample_kernel(patch_embed)
+    patch_embed = patch_embed.to(orig_dtype)
+    return patch_embed
+
+
+# Copied from:
+# https://github.com/deepseek-ai/Janus/tree/main/janus/models/siglip_vit.py
+class PatchEmbed(nn.Module):
+    """2D Image to Patch Embedding"""
+
+    output_fmt: Format
+    dynamic_img_pad: torch.jit.Final[bool]
+
+    def __init__(
+        self,
+        img_size: Optional[int] = 224,
+        patch_size: int = 16,
+        in_chans: int = 3,
+        embed_dim: int = 768,
+        norm_layer: Optional[Callable] = None,
+        flatten: bool = True,
+        output_fmt: Optional[str] = None,
+        bias: bool = True,
+        strict_img_size: bool = True,
+        dynamic_img_pad: bool = False,
+    ):
+        super().__init__()
+        self.patch_size = tuple(to_2tuple(patch_size))
+        self.img_size, self.grid_size, self.num_patches = self._init_img_size(img_size)
+
+        if output_fmt is not None:
+            self.flatten = False
+            self.output_fmt = Format(output_fmt)
+        else:
+            # flatten spatial dim and transpose to channels last, kept for bwd compat
+            self.flatten = flatten
+            self.output_fmt = Format.NCHW
+        self.strict_img_size = strict_img_size
+        self.dynamic_img_pad = dynamic_img_pad
+
+        self.proj = nn.Conv2d(
+            in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias
+        )
+        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
+
+    def _init_img_size(self, img_size: Union[int, Tuple[int, int]]):
+        assert self.patch_size
+        if img_size is None:
+            return None, None, None
+        img_size = to_2tuple(img_size)
+        grid_size = tuple([s // p for s, p in zip(img_size, self.patch_size)])
+        num_patches = grid_size[0] * grid_size[1]
+        return img_size, grid_size, num_patches
+
+    def set_input_size(
+        self,
+        img_size: Optional[Union[int, Tuple[int, int]]] = None,
+        patch_size: Optional[Union[int, Tuple[int, int]]] = None,
+    ):
+        new_patch_size = None
+        if patch_size is not None:
+            new_patch_size = to_2tuple(patch_size)
+        if new_patch_size is not None and new_patch_size != self.patch_size:
+            with torch.no_grad():
+                new_proj = nn.Conv2d(
+                    self.proj.in_channels,
+                    self.proj.out_channels,
+                    kernel_size=new_patch_size,
+                    stride=new_patch_size,
+                    bias=self.proj.bias is not None,
+                )
+                new_proj.weight.copy_(
+                    resample_patch_embed(self.proj.weight, new_patch_size, verbose=True)
+                )
+                if self.proj.bias is not None:
+                    new_proj.bias.copy_(self.proj.bias)
+                self.proj = new_proj
+            self.patch_size = new_patch_size
+        img_size = img_size or self.img_size
+        if img_size != self.img_size or new_patch_size is not None:
+            self.img_size, self.grid_size, self.num_patches = self._init_img_size(
+                img_size
+            )
+
+    def feat_ratio(self, as_scalar=True) -> Union[Tuple[int, int], int]:
+        if as_scalar:
+            return max(self.patch_size)
+        else:
+            return self.patch_size
+
+    def dynamic_feat_size(self, img_size: Tuple[int, int]) -> Tuple[int, int]:
+        """Get grid (feature) size for given image size taking account of dynamic padding.
+        NOTE: must be torchscript compatible so using fixed tuple indexing
+        """
+        if self.dynamic_img_pad:
+            return math.ceil(img_size[0] / self.patch_size[0]), math.ceil(
+                img_size[1] / self.patch_size[1]
+            )
+        else:
+            return img_size[0] // self.patch_size[0], img_size[1] // self.patch_size[1]
+
+    def forward(self, x):
+        B, C, H, W = x.shape
+        if self.img_size is not None:
+            if self.strict_img_size:
+                _assert(
+                    H == self.img_size[0],
+                    f"Input height ({H}) doesn't match model ({self.img_size[0]}).",
+                )
+                _assert(
+                    W == self.img_size[1],
+                    f"Input width ({W}) doesn't match model ({self.img_size[1]}).",
+                )
+            elif not self.dynamic_img_pad:
+                _assert(
+                    H % self.patch_size[0] == 0,
+                    f"Input height ({H}) should be divisible by patch size ({self.patch_size[0]}).",
+                )
+                _assert(
+                    W % self.patch_size[1] == 0,
+                    f"Input width ({W}) should be divisible by patch size ({self.patch_size[1]}).",
+                )
+        if self.dynamic_img_pad:
+            pad_h = (self.patch_size[0] - H % self.patch_size[0]) % self.patch_size[0]
+            pad_w = (self.patch_size[1] - W % self.patch_size[1]) % self.patch_size[1]
+            x = F.pad(x, (0, pad_w, 0, pad_h))
+        x = self.proj(x)
+        if self.flatten:
+            x = x.flatten(2).transpose(1, 2)  # NCHW -> NLC
+        elif self.output_fmt != Format.NCHW:
+            x = nchw_to(x, self.output_fmt)
+        x = self.norm(x)
+        return x
+
+
+class Mlp(nn.Module):
+    """MLP as used in Vision Transformer, MLP-Mixer and related networks
+
+    NOTE: When use_conv=True, expects 2D NCHW tensors, otherwise N*C expected.
+    """
+
+    def __init__(
+        self,
+        in_features,
+        hidden_features=None,
+        out_features=None,
+        act_layer=nn.GELU,
+        norm_layer=None,
+        bias=True,
+        drop=0.0,
+        use_conv=False,
+    ):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        bias = to_2tuple(bias)
+        drop_probs = to_2tuple(drop)
+        linear_layer = partial(nn.Conv2d, kernel_size=1) if use_conv else nn.Linear
+
+        self.fc1 = linear_layer(in_features, hidden_features, bias=bias[0])
+        self.act = act_layer()
+        self.drop1 = nn.Dropout(drop_probs[0])
+        self.norm = (
+            norm_layer(hidden_features) if norm_layer is not None else nn.Identity()
+        )
+        self.fc2 = linear_layer(hidden_features, out_features, bias=bias[1])
+        self.drop2 = nn.Dropout(drop_probs[1])
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop1(x)
+        x = self.norm(x)
+        x = self.fc2(x)
+        x = self.drop2(x)
+        return x
+
+
+def drop_path(
+    x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
+):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+    This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
+    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
+    changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
+    'survival rate' as the argument.
+
+    """
+    if drop_prob == 0.0 or not training:
+        return x
+    keep_prob = 1 - drop_prob
+    shape = (x.shape[0],) + (1,) * (
+        x.ndim - 1
+    )  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
+    if keep_prob > 0.0 and scale_by_keep:
+        random_tensor.div_(keep_prob)
+    return x * random_tensor
+
+
+class DropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample  (when applied in main path of residual blocks)."""
+
+    def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+        self.scale_by_keep = scale_by_keep
+
+    def forward(self, x):
+        return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
+
+    def extra_repr(self):
+        return f"drop_prob={round(self.drop_prob, 3):0.3f}"
+
+
+class VisionTransformerBlock(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        qkv_bias: bool = False,
+        qk_norm: bool = False,
+        proj_drop: float = 0.0,
+        attn_drop: float = 0.0,
+        init_values: Optional[float] = None,
+        drop_path: float = 0.0,
+        act_layer: nn.Module = nn.GELU,
+        norm_layer: nn.Module = nn.LayerNorm,
+        mlp_layer: nn.Module = Mlp,
+    ) -> None:
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = VisionAttention(
+            embed_dim=dim,
+            num_heads=num_heads,
+            projection_size=dim,
+            use_qkv_parallel=True,
+            dropout=attn_drop,
+        )
+
+        self.ls1 = (
+            LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+        )
+        self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+
+        self.norm2 = norm_layer(dim)
+        self.mlp = mlp_layer(
+            in_features=dim,
+            hidden_features=int(dim * mlp_ratio),
+            act_layer=act_layer,
+            drop=proj_drop,
+        )
+        self.ls2 = (
+            LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+        )
+        self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x))))
+        x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x))))
+        return x
+
+
+LayerType = Union[str, Callable, Type[torch.nn.Module]]
+
+
+class PatchDropout(nn.Module):
+    """
+    https://arxiv.org/abs/2212.00794 and https://arxiv.org/pdf/2208.07220
+    """
+
+    return_indices: torch.jit.Final[bool]
+
+    def __init__(
+        self,
+        prob: float = 0.5,
+        num_prefix_tokens: int = 1,
+        ordered: bool = False,
+        return_indices: bool = False,
+    ):
+        super().__init__()
+        assert 0 <= prob < 1.0
+        self.prob = prob
+        self.num_prefix_tokens = (
+            num_prefix_tokens  # exclude CLS token (or other prefix tokens)
+        )
+        self.ordered = ordered
+        self.return_indices = return_indices
+
+    def forward(
+        self, x
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, Optional[torch.Tensor]]]:
+        if not self.training or self.prob == 0.0:
+            if self.return_indices:
+                return x, None
+            return x
+
+        if self.num_prefix_tokens:
+            prefix_tokens, x = (
+                x[:, : self.num_prefix_tokens],
+                x[:, self.num_prefix_tokens :],
+            )
+        else:
+            prefix_tokens = None
+
+        B = x.shape[0]
+        L = x.shape[1]
+        num_keep = max(1, int(L * (1.0 - self.prob)))
+        keep_indices = torch.argsort(torch.randn(B, L, device=x.device), dim=-1)[
+            :, :num_keep
+        ]
+        if self.ordered:
+            # NOTE does not need to maintain patch order in typical transformer use,
+            # but possibly useful for debug / visualization
+            keep_indices = keep_indices.sort(dim=-1)[0]
+        x = x.gather(1, keep_indices.unsqueeze(-1).expand((-1, -1) + x.shape[2:]))
+
+        if prefix_tokens is not None:
+            x = torch.cat((prefix_tokens, x), dim=1)
+
+        if self.return_indices:
+            return x, keep_indices
+        return x
+
+
+def resample_abs_pos_embed(
+    posemb: torch.Tensor,
+    new_size: List[int],
+    old_size: Optional[List[int]] = None,
+    num_prefix_tokens: int = 1,
+    interpolation: str = "bicubic",
+    antialias: bool = True,
+    verbose: bool = False,
+):
+    # sort out sizes, assume square if old size not provided
+    num_pos_tokens = posemb.shape[1]
+    num_new_tokens = new_size[0] * new_size[1] + num_prefix_tokens
+    if num_new_tokens == num_pos_tokens and new_size[0] == new_size[1]:
+        return posemb
+
+    if old_size is None:
+        hw = int(math.sqrt(num_pos_tokens - num_prefix_tokens))
+        old_size = hw, hw
+
+    if num_prefix_tokens:
+        posemb_prefix, posemb = (
+            posemb[:, :num_prefix_tokens],
+            posemb[:, num_prefix_tokens:],
+        )
+    else:
+        posemb_prefix, posemb = None, posemb
+
+    # do the interpolation
+    embed_dim = posemb.shape[-1]
+    orig_dtype = posemb.dtype
+    posemb = posemb.float()  # interpolate needs float32
+    posemb = posemb.reshape(1, old_size[0], old_size[1], -1).permute(0, 3, 1, 2)
+    posemb = F.interpolate(
+        posemb, size=new_size, mode=interpolation, antialias=antialias
+    )
+    posemb = posemb.permute(0, 2, 3, 1).reshape(1, -1, embed_dim)
+    posemb = posemb.to(orig_dtype)
+
+    # add back extra (class, etc) prefix tokens
+    if posemb_prefix is not None:
+        posemb = torch.cat([posemb_prefix, posemb], dim=1)
+
+    if not torch.jit.is_scripting() and verbose:
+        logger.info(f"Resized position embedding: {old_size} to {new_size}.")
+
+    return posemb
+
+
+def init_weights(self):
+    if self.pos_embed is not None:
+        trunc_normal_(self.pos_embed, std=self.pos_embed.shape[1] ** -0.5)
+    trunc_normal_(self.latent, std=self.latent_dim**-0.5)
+
+
+def init_weights_vit_timm(module: nn.Module, name: str = "") -> None:
+    """ViT weight initialization, original timm impl (for reproducibility)"""
+    if isinstance(module, nn.Linear):
+        trunc_normal_(module.weight, std=0.02)
+        if module.bias is not None:
+            nn.init.zeros_(module.bias)
+    elif hasattr(module, "init_weights"):
+        module.init_weights()
+
+
+class VisionTransformer(nn.Module):
+    """Vision Transformer
+
+    A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale`
+        - https://arxiv.org/abs/2010.11929
+    """
+
+    dynamic_img_size: Final[bool]
+
+    def __init__(
+        self,
+        img_size: Union[int, Tuple[int, int]] = 224,
+        patch_size: Union[int, Tuple[int, int]] = 16,
+        in_chans: int = 3,
+        num_classes: int = 1000,
+        global_pool: Literal["", "avg", "token", "map"] = "token",
+        embed_dim: int = 768,
+        depth: int = 12,
+        num_heads: int = 12,
+        mlp_ratio: float = 4.0,
+        qkv_bias: bool = True,
+        qk_norm: bool = False,
+        init_values: Optional[float] = None,
+        class_token: bool = True,
+        no_embed_class: bool = False,
+        reg_tokens: int = 0,
+        pre_norm: bool = False,
+        fc_norm: Optional[bool] = None,
+        dynamic_img_size: bool = False,
+        dynamic_img_pad: bool = False,
+        drop_rate: float = 0.0,
+        pos_drop_rate: float = 0.0,
+        patch_drop_rate: float = 0.0,
+        proj_drop_rate: float = 0.0,
+        attn_drop_rate: float = 0.0,
+        drop_path_rate: float = 0.0,
+        weight_init: Literal["skip", "jax", "jax_nlhb", "moco", ""] = "",
+        embed_layer: Callable = PatchEmbed,
+        _norm_layer: Optional[LayerType] = None,
+        _act_layer: Optional[LayerType] = None,
+        block_fn: Type[nn.Module] = VisionTransformerBlock,
+        mlp_layer: Type[nn.Module] = Mlp,
+        ignore_head: bool = False,
+    ) -> None:
+        """
+        Args:
+            img_size: Input image size.
+            patch_size: Patch size.
+            in_chans: Number of image input channels.
+            num_classes: Number of classes for classification head.
+            global_pool: Type of global pooling for final sequence (default: 'token').
+            embed_dim: Transformer embedding dimension.
+            depth: Depth of transformer.
+            num_heads: Number of attention heads.
+            mlp_ratio: Ratio of mlp hidden dim to embedding dim.
+            qkv_bias: Enable bias for qkv projections if True.
+            init_values: Layer-scale init values (layer-scale enabled if not None).
+            class_token: Use class token.
+            no_embed_class: Don't include position embeddings for class (or reg) tokens.
+            reg_tokens: Number of register tokens.
+            fc_norm: Pre head norm after pool (instead of before), if None, enabled when global_pool == 'avg'.
+            drop_rate: Head dropout rate.
+            pos_drop_rate: Position embedding dropout rate.
+            attn_drop_rate: Attention dropout rate.
+            drop_path_rate: Stochastic depth rate.
+            weight_init: Weight initialization scheme.
+            embed_layer: Patch embedding layer.
+            _norm_layer: Normalization layer.
+            _act_layer: MLP activation layer.
+            block_fn: Transformer block layer.
+        """
+        super().__init__()
+        assert global_pool in ("", "avg", "token", "map")
+        assert class_token or global_pool != "token"
+        use_fc_norm = global_pool == "avg" if fc_norm is None else fc_norm
+        # norm_layer = get_norm_layer(norm_layer) or partial(nn.LayerNorm, eps=1e-6)
+        # act_layer = get_act_layer(act_layer) or nn.GELU
+        norm_layer = partial(nn.LayerNorm, eps=1e-6)
+        act_layer = nn.GELU
+
+        self.num_classes = num_classes
+        self.global_pool = global_pool
+        self.num_features = self.embed_dim = (
+            embed_dim  # num_features for consistency with other models
+        )
+        self.num_prefix_tokens = 1 if class_token else 0
+        self.num_prefix_tokens += reg_tokens
+        self.num_reg_tokens = reg_tokens
+        self.has_class_token = class_token
+        self.no_embed_class = (
+            no_embed_class  # don't embed prefix positions (includes reg)
+        )
+        self.dynamic_img_size = dynamic_img_size
+        self.grad_checkpointing = False
+        self.ignore_head = ignore_head
+
+        embed_args = {}
+        if dynamic_img_size:
+            # flatten deferred until after pos embed
+            embed_args.update(dict(strict_img_size=False, output_fmt="NHWC"))
+        self.patch_embed = embed_layer(
+            img_size=img_size,
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+            bias=not pre_norm,  # disable bias if pre-norm is used (e.g. CLIP)
+            dynamic_img_pad=dynamic_img_pad,
+            **embed_args,
+        )
+        num_patches = self.patch_embed.num_patches
+
+        self.cls_token = (
+            nn.Parameter(torch.zeros(1, 1, embed_dim)) if class_token else None
+        )
+        self.reg_token = (
+            nn.Parameter(torch.zeros(1, reg_tokens, embed_dim)) if reg_tokens else None
+        )
+        embed_len = (
+            num_patches if no_embed_class else num_patches + self.num_prefix_tokens
+        )
+        self.pos_embed = nn.Parameter(torch.randn(1, embed_len, embed_dim) * 0.02)
+        self.pos_drop = nn.Dropout(p=pos_drop_rate)
+        if patch_drop_rate > 0:
+            self.patch_drop = PatchDropout(
+                patch_drop_rate,
+                num_prefix_tokens=self.num_prefix_tokens,
+            )
+        else:
+            self.patch_drop = nn.Identity()
+        self.norm_pre = norm_layer(embed_dim) if pre_norm else nn.Identity()
+
+        dpr = [
+            x.item() for x in torch.linspace(0, drop_path_rate, depth)
+        ]  # stochastic depth decay rule
+        self.blocks = nn.Sequential(
+            *[
+                block_fn(
+                    dim=embed_dim,
+                    num_heads=num_heads,
+                    mlp_ratio=mlp_ratio,
+                    qkv_bias=qkv_bias,
+                    qk_norm=qk_norm,
+                    init_values=init_values,
+                    proj_drop=proj_drop_rate,
+                    attn_drop=attn_drop_rate,
+                    drop_path=dpr[i],
+                    norm_layer=norm_layer,
+                    act_layer=act_layer,
+                    mlp_layer=mlp_layer,
+                )
+                for i in range(depth)
+            ]
+        )
+        self.norm = norm_layer(embed_dim) if not use_fc_norm else nn.Identity()
+
+        # Classifier Head
+        if global_pool == "map":
+            AttentionPoolLatent.init_weights = init_weights
+            self.attn_pool = AttentionPoolLatent(
+                self.embed_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                norm_layer=norm_layer,
+            )
+        else:
+            self.attn_pool = None
+        self.fc_norm = norm_layer(embed_dim) if use_fc_norm else nn.Identity()
+        self.head_drop = nn.Dropout(drop_rate)
+        self.head = (
+            nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+        )
+
+        if weight_init != "skip":
+            self.init_weights(weight_init)
+
+    def init_weights(self, mode: Literal["jax", "jax_nlhb", "moco", ""] = "") -> None:
+        assert mode in ("jax", "jax_nlhb", "moco", "")
+        # head_bias = -math.log(self.num_classes) if "nlhb" in mode else 0.0
+        trunc_normal_(self.pos_embed, std=0.02)
+        if self.cls_token is not None:
+            nn.init.normal_(self.cls_token, std=1e-6)
+        named_apply(init_weights_vit_timm, self)
+
+    @torch.jit.ignore
+    def no_weight_decay(self) -> Set:
+        return {"pos_embed", "cls_token", "dist_token"}
+
+    @torch.jit.ignore
+    def group_matcher(self, coarse: bool = False) -> Dict:
+        return dict(
+            stem=r"^cls_token|pos_embed|patch_embed",  # stem and embed
+            blocks=[(r"^blocks\.(\d+)", None), (r"^norm", (99999,))],
+        )
+
+    @torch.jit.ignore
+    def get_classifier(self) -> nn.Module:
+        return self.head
+
+    def reset_classifier(self, num_classes: int, global_pool=None) -> None:
+        self.num_classes = num_classes
+        if global_pool is not None:
+            assert global_pool in ("", "avg", "token", "map")
+            if global_pool == "map" and self.attn_pool is None:
+                assert (
+                    False
+                ), "Cannot currently add attention pooling in reset_classifier()."
+            elif global_pool != "map " and self.attn_pool is not None:
+                self.attn_pool = None  # remove attention pooling
+            self.global_pool = global_pool
+        self.head = (
+            nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+        )
+
+    def _pos_embed(self, x: torch.Tensor) -> torch.Tensor:
+        if self.dynamic_img_size:
+            B, H, W, C = x.shape
+            pos_embed = resample_abs_pos_embed(
+                self.pos_embed,
+                [H, W],
+                num_prefix_tokens=0 if self.no_embed_class else self.num_prefix_tokens,
+            )
+            x = x.view(B, -1, C)
+        else:
+            pos_embed = self.pos_embed
+
+        to_cat = []
+        if self.cls_token is not None:
+            to_cat.append(self.cls_token.expand(x.shape[0], -1, -1))
+        if self.reg_token is not None:
+            to_cat.append(self.reg_token.expand(x.shape[0], -1, -1))
+
+        if self.no_embed_class:
+            # deit-3, updated JAX (big vision)
+            # position embedding does not overlap with class token, add then concat
+            x = x + pos_embed
+            if to_cat:
+                x = torch.cat(to_cat + [x], dim=1)
+        else:
+            # original timm, JAX, and deit vit impl
+            # pos_embed has entry for class token, concat then add
+            if to_cat:
+                x = torch.cat(to_cat + [x], dim=1)
+            x = x + pos_embed
+
+        return self.pos_drop(x)
+
+    def _intermediate_layers(
+        self,
+        x: torch.Tensor,
+        n: Union[int, Sequence] = 1,
+    ) -> List[torch.Tensor]:
+        outputs, num_blocks = [], len(self.blocks)
+        take_indices = set(
+            range(num_blocks - n, num_blocks) if isinstance(n, int) else n
+        )
+
+        # forward pass
+        x = self.patch_embed(x)
+        x = self._pos_embed(x)
+        x = self.patch_drop(x)
+        x = self.norm_pre(x)
+        for i, blk in enumerate(self.blocks):
+            x = blk(x)
+            if i in take_indices:
+                outputs.append(x)
+
+        return outputs
+
+    def forward_features(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.patch_embed(x)
+        x = self._pos_embed(x)
+        x = self.patch_drop(x)
+        x = self.norm_pre(x)
+        x = self.blocks(x)
+        x = self.norm(x)
+        return x
+
+    def forward_head(self, x: torch.Tensor, pre_logits: bool = False) -> torch.Tensor:
+        if self.attn_pool is not None:
+            x = self.attn_pool(x)
+        elif self.global_pool == "avg":
+            x = x[:, self.num_prefix_tokens :].mean(dim=1)
+        elif self.global_pool:
+            x = x[:, 0]  # class token
+        x = self.fc_norm(x)
+        x = self.head_drop(x)
+        return x if pre_logits else self.head(x)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.forward_features(x)
+        if not self.ignore_head:
+            x = self.forward_head(x)
+        return x
+
+
+def model_name_to_cls(cls_name):
+    if "MlpProjector" in cls_name:
+        cls = MlpProjector
+
+    elif "CLIPVisionTower" in cls_name:
+        cls = CLIPVisionTower
+
+    elif "VQ" in cls_name:
+
+        cls = VQ_models[cls_name]
+    elif "vision_head" in cls_name:
+        cls = vision_head
+    else:
+        raise ValueError(f"class_name {cls_name} is invalid.")
+
+    return cls
+
+
+class vision_head(torch.nn.Module):
+    def __init__(self, params):
+        super().__init__()
+        self.output_mlp_projector = torch.nn.Linear(
+            params["n_embed"], params["image_token_embed"]
+        )
+        self.vision_activation = torch.nn.GELU()
+        self.vision_head = torch.nn.Linear(
+            params["image_token_embed"], params["image_token_size"]
+        )
+
+    def forward(self, x):
+        x = self.output_mlp_projector(x)
+        x = self.vision_activation(x)
+        x = self.vision_head(x)
+        return x
+
+
+SigLIP_MODEL_CONFIG = {
+    "siglip_so400m_patch14_384": {
+        "image_size": 336,
+        "patch_size": 14,
+        "width": 1152,
+        "layers": 27,
+        "heads": 16,
+        "mlp_ratio": 3.7362,
+        "global_pool": "map",
+        "use_checkpoint": False,
+    },
+    "siglip_so400m_patch14_224": {
+        "image_size": 224,
+        "patch_size": 14,
+        "width": 1152,
+        "layers": 27,
+        "heads": 16,
+        "mlp_ratio": 3.7362,
+        "global_pool": "map",
+        "use_checkpoint": False,
+    },
+    "siglip_large_patch16_384": {
+        "image_size": 384,
+        "patch_size": 16,
+        "width": 1024,
+        "layers": 24,
+        "heads": 16,
+        "mlp_ratio": 4,
+        "global_pool": "map",
+        "use_checkpoint": False,
+    },
+}
+
+
+def create_siglip_vit(
+    model_name: str = "siglip_so400m_patch14_384",
+    image_size: int = 384,
+    select_layer: int = -1,
+    ckpt_path: str = "",
+    **kwargs,
+):
+    assert (
+        model_name in SigLIP_MODEL_CONFIG.keys()
+    ), f"model name should be in {SigLIP_MODEL_CONFIG.keys()}"
+
+    vision_cfg = SigLIPVisionCfg(**SigLIP_MODEL_CONFIG[model_name])
+
+    if select_layer <= 0:
+        layers = min(vision_cfg.layers, vision_cfg.layers + select_layer + 1)
+    else:
+        layers = min(vision_cfg.layers, select_layer)
+
+    model = VisionTransformer(
+        img_size=image_size,
+        patch_size=vision_cfg.patch_size,
+        embed_dim=vision_cfg.width,
+        depth=layers,
+        num_heads=vision_cfg.heads,
+        mlp_ratio=vision_cfg.mlp_ratio,
+        class_token=vision_cfg.class_token,
+        global_pool=vision_cfg.global_pool,
+        ignore_head=kwargs.get("ignore_head", True),
+        weight_init=kwargs.get("weight_init", "skip"),
+        num_classes=0,
+    )
+
+    if ckpt_path:
+        state_dict = torch.load(ckpt_path, map_location="cpu", weights_only=True)
+
+        incompatible_keys = model.load_state_dict(state_dict, strict=False)
+        print(
+            f"SigLIP-ViT restores from {ckpt_path},\n"
+            f"\tincompatible_keys:', {incompatible_keys}."
+        )
+
+    return model
+
+
+class Normalize(torch.nn.Module):
+    """Normalize a tensor image with mean and standard deviation.
+    This transform does not support PIL Image.
+    Given mean: ``(mean[1],...,mean[n])`` and std: ``(std[1],..,std[n])`` for ``n``
+    channels, this transform will normalize each channel of the input
+    ``torch.*Tensor`` i.e.,
+    ``output[channel] = (input[channel] - mean[channel]) / std[channel]``
+
+    .. note::
+        This transform acts out of place, i.e., it does not mutate the input tensor.
+
+    Args:
+        mean (sequence): Sequence of means for each channel.
+        std (sequence): Sequence of standard deviations for each channel.
+        inplace(bool,optional): Bool to make this operation in-place.
+
+    """
+
+    def __init__(self, mean, std, inplace=False):
+        super().__init__()
+        # _log_api_usage_once(self)
+        self.mean = mean
+        self.std = std
+        self.inplace = inplace
+
+    def forward(self, tensor: Tensor) -> Tensor:
+        """
+        Args:
+            tensor (Tensor): Tensor image to be normalized.
+
+        Returns:
+            Tensor: Normalized Tensor image.
+        """
+        return F.normalize(tensor, self.mean, self.std, self.inplace)
+
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}(mean={self.mean}, std={self.std})"
+
+
+class CLIPVisionTower(nn.Module):
+    def __init__(
+        self,
+        model_name: str = "siglip_large_patch16_384",
+        image_size: Union[Tuple[int, int], int] = 336,
+        select_feature: str = "patch",
+        select_layer: int = -2,
+        select_layers: list = None,
+        ckpt_path: str = "",
+        pixel_mean: Optional[List[float]] = None,
+        pixel_std: Optional[List[float]] = None,
+        **kwargs,
+    ):
+        super().__init__()
+
+        self.model_name = model_name
+        self.select_feature = select_feature
+        self.select_layer = select_layer
+        self.select_layers = select_layers
+
+        vision_tower_params = {
+            "model_name": model_name,
+            "image_size": image_size,
+            "ckpt_path": ckpt_path,
+            "select_layer": select_layer,
+        }
+        vision_tower_params.update(kwargs)
+        self.vision_tower, self.forward_kwargs = self.build_vision_tower(
+            vision_tower_params
+        )
+
+        if pixel_mean is not None and pixel_std is not None:
+            image_norm = Normalize(mean=pixel_mean, std=pixel_std)
+        else:
+            image_norm = None
+
+        self.image_norm = image_norm
+
+    @property
+    def device(self) -> torch.device:
+        return next(self.vision_tower.parameters()).device
+
+    @property
+    def dtype(self):
+        return next(self.vision_tower.parameters()).dtype
+
+    def build_vision_tower(self, vision_tower_params):
+        if self.model_name.startswith("siglip"):
+            self.select_feature = "same"
+            vision_tower = create_siglip_vit(**vision_tower_params)
+            forward_kwargs = dict()
+
+        elif self.model_name.startswith("sam"):
+            # vision_tower = create_sam_vit(**vision_tower_params)
+            forward_kwargs = dict()
+
+        else:  # huggingface
+            from transformers import CLIPVisionModel
+
+            vision_tower = CLIPVisionModel.from_pretrained(**vision_tower_params)
+            forward_kwargs = dict(output_hidden_states=True)
+
+        return vision_tower, forward_kwargs
+
+    def feature_select(self, image_forward_outs):
+        if isinstance(image_forward_outs, torch.Tensor):
+            # the output has been the self.select_layer"s features
+            image_features = image_forward_outs
+        else:
+            image_features = image_forward_outs.hidden_states[self.select_layer]
+
+        if self.select_feature == "patch":
+            # if the output has cls_token
+            image_features = image_features[:, 1:]
+        elif self.select_feature == "cls_patch":
+            image_features = image_features
+        elif self.select_feature == "same":
+            image_features = image_features
+
+        else:
+            raise ValueError(f"Unexpected select feature: {self.select_feature}")
+        return image_features
+
+    def forward(self, images):
+        """
+
+        Args:
+            images (torch.Tensor): [b, 3, H, W]
+
+        Returns:
+            image_features (torch.Tensor): [b, n_patch, d]
+        """
+
+        if self.image_norm is not None:
+            images = self.image_norm(images)
+
+        image_forward_outs = self.vision_tower(images, **self.forward_kwargs)
+        image_features = self.feature_select(image_forward_outs)
+        return image_features
+
+
+class MlpProjector(nn.Module):
+    def __init__(self, cfg):
+        super().__init__()
+
+        self.cfg = cfg
+
+        if cfg["projector_type"] == "identity":
+            modules = nn.Identity()
+
+        elif cfg["projector_type"] == "linear":
+            modules = nn.Linear(cfg["input_dim"], cfg["n_embed"])
+
+        elif cfg["projector_type"] == "mlp_gelu":
+            mlp_depth = cfg.get("depth", 1)
+            modules = [nn.Linear(cfg["input_dim"], cfg["n_embed"])]
+            for _ in range(1, mlp_depth):
+                modules.append(nn.GELU())
+                modules.append(nn.Linear(cfg["n_embed"], cfg["n_embed"]))
+            modules = nn.Sequential(*modules)
+
+        elif cfg["projector_type"] == "low_high_hybrid_split_mlp_gelu":
+            mlp_depth = cfg.get("depth", 1)
+            self.high_up_proj = nn.Linear(cfg["input_dim"], cfg["n_embed"] // 2)
+            self.low_up_proj = nn.Linear(cfg["input_dim"], cfg["n_embed"] // 2)
+
+            modules = []
+            for _ in range(1, mlp_depth):
+                modules.append(nn.GELU())
+                modules.append(nn.Linear(cfg["n_embed"], cfg["n_embed"]))
+            modules = nn.Sequential(*modules)
+
+        else:
+            raise ValueError(f"Unknown projector type: {cfg['projector_type']}")
+
+        self.layers = modules
+
+    def forward(
+        self, x_or_tuple: Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]
+    ):
+        """
+
+        Args:
+            x_or_tuple (Union[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]:  if it is a tuple of torch.Tensor,
+                then it comes from the hybrid vision encoder, and x = high_res_x, low_res_x);
+                otherwise it is the feature from the single vision encoder.
+
+        Returns:
+            x (torch.Tensor): [b, s, c]
+        """
+
+        if isinstance(x_or_tuple, tuple):
+            # self.cfg.projector_type == "low_high_hybrid_split_mlp_gelu":
+            high_x, low_x = x_or_tuple
+            high_x = self.high_up_proj(high_x)
+            low_x = self.low_up_proj(low_x)
+            x = torch.cat([high_x, low_x], dim=-1)
+        else:
+            x = x_or_tuple
+
+        return self.layers(x)
+
+
+class LayerScale(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        init_values: float = 1e-5,
+        inplace: bool = False,
+    ) -> None:
+        super().__init__()
+        self.inplace = inplace
+        self.gamma = nn.Parameter(init_values * torch.ones(dim))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x.mul_(self.gamma) if self.inplace else x * self.gamma
+
+
+# use torch.scaled_dot_product_attention where possible
+_HAS_FUSED_ATTN = hasattr(torch.nn.functional, "scaled_dot_product_attention")
+if "TIMM_FUSED_ATTN" in os.environ:
+    _USE_FUSED_ATTN = int(os.environ["TIMM_FUSED_ATTN"])
+else:
+    _USE_FUSED_ATTN = (
+        1  # 0 == off, 1 == on (for tested use), 2 == on (for experimental use)
+    )
+
+# Set to True if exporting a model with Same padding via ONNX
+_EXPORTABLE = False
+
+
+def use_fused_attn(experimental: bool = False) -> bool:
+    # NOTE: ONNX export cannot handle F.scaled_dot_product_attention as of pytorch 2.0
+    if not _HAS_FUSED_ATTN or _EXPORTABLE:
+        return False
+    if experimental:
+        return _USE_FUSED_ATTN > 1
+    return _USE_FUSED_ATTN > 0
+
+
+class AttentionPoolLatent(nn.Module):
+    """Attention pooling w/ latent query"""
+
+    fused_attn: torch.jit.Final[bool]
+
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int = None,
+        embed_dim: int = None,
+        num_heads: int = 8,
+        feat_size: Optional[int] = None,
+        mlp_ratio: float = 4.0,
+        qkv_bias: bool = True,
+        qk_norm: bool = False,
+        latent_len: int = 1,
+        latent_dim: int = None,
+        pos_embed: str = "",
+        pool_type: str = "token",
+        norm_layer: Optional[nn.Module] = None,
+        drop: float = 0.0,
+    ):
+        super().__init__()
+        embed_dim = embed_dim or in_features
+        out_features = out_features or in_features
+        assert embed_dim % num_heads == 0
+        self.num_heads = num_heads
+        self.head_dim = embed_dim // num_heads
+        self.feat_size = feat_size
+        self.scale = self.head_dim**-0.5
+        self.pool = pool_type
+        self.fused_attn = use_fused_attn()
+
+        if pos_embed == "abs":
+            assert feat_size is not None
+            self.pos_embed = nn.Parameter(torch.zeros(feat_size, in_features))
+        else:
+            self.pos_embed = None
+
+        self.latent_dim = latent_dim or embed_dim
+        self.latent_len = latent_len
+        self.latent = nn.Parameter(torch.zeros(1, self.latent_len, embed_dim))
+
+        self.q = nn.Linear(embed_dim, embed_dim, bias=qkv_bias)
+        self.kv = nn.Linear(embed_dim, embed_dim * 2, bias=qkv_bias)
+        self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
+        self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
+        self.proj = nn.Linear(embed_dim, embed_dim)
+        self.proj_drop = nn.Dropout(drop)
+
+        self.norm = (
+            norm_layer(out_features) if norm_layer is not None else nn.Identity()
+        )
+        self.mlp = Mlp(embed_dim, int(embed_dim * mlp_ratio))
+
+        self.init_weights()
+
+    def init_weights(self):
+        if self.pos_embed is not None:
+            trunc_normal_tf_(self.pos_embed, std=self.pos_embed.shape[1] ** -0.5)
+        trunc_normal_tf_(self.latent, std=self.latent_dim**-0.5)
+
+    def forward(self, x):
+        B, N, C = x.shape
+
+        if self.pos_embed is not None:
+            # FIXME interpolate
+            x = x + self.pos_embed.unsqueeze(0).to(x.dtype)
+
+        q_latent = self.latent.expand(B, -1, -1)
+        q = (
+            self.q(q_latent)
+            .reshape(B, self.latent_len, self.num_heads, self.head_dim)
+            .transpose(1, 2)
+        )
+
+        kv = (
+            self.kv(x)
+            .reshape(B, N, 2, self.num_heads, self.head_dim)
+            .permute(2, 0, 3, 1, 4)
+        )
+        k, v = kv.unbind(0)
+
+        q, k = self.q_norm(q), self.k_norm(k)
+
+        if self.fused_attn:
+            x = F.scaled_dot_product_attention(q, k, v)
+        else:
+            q = q * self.scale
+            attn = q @ k.transpose(-2, -1)
+            attn = attn.softmax(dim=-1)
+            x = attn @ v
+        x = x.transpose(1, 2).reshape(B, self.latent_len, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+
+        x = x + self.mlp(self.norm(x))
+
+        # optional pool if latent seq_len > 1 and pooled output is desired
+        if self.pool == "token":
+            x = x[:, 0]
+        elif self.pool == "avg":
+            x = x.mean(1)
+
+
+class Encoder(nn.Module):
+    def __init__(
+        self,
+        in_channels=3,
+        ch=128,
+        ch_mult=(1, 1, 2, 2, 4),
+        num_res_blocks=2,
+        norm_type="group",
+        dropout=0.0,
+        resamp_with_conv=True,
+        z_channels=256,
+    ):
+        super().__init__()
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        self.conv_in = nn.Conv2d(in_channels, ch, kernel_size=3, stride=1, padding=1)
+
+        # downsampling
+        in_ch_mult = (1,) + tuple(ch_mult)
+        self.conv_blocks = nn.ModuleList()
+        for i_level in range(self.num_resolutions):
+            conv_block = nn.Module()
+            # res & attn
+            res_block = nn.ModuleList()
+            attn_block = nn.ModuleList()
+            block_in = ch * in_ch_mult[i_level]
+            block_out = ch * ch_mult[i_level]
+            for _ in range(self.num_res_blocks):
+                res_block.append(
+                    ResnetBlock(
+                        block_in, block_out, dropout=dropout, norm_type=norm_type
+                    )
+                )
+                block_in = block_out
+                if i_level == self.num_resolutions - 1:
+                    attn_block.append(AttnBlock(block_in, norm_type))
+            conv_block.res = res_block
+            conv_block.attn = attn_block
+            # downsample
+            if i_level != self.num_resolutions - 1:
+                conv_block.downsample = Downsample(block_in, resamp_with_conv)
+            self.conv_blocks.append(conv_block)
+
+        # middle
+        self.mid = nn.ModuleList()
+        self.mid.append(
+            ResnetBlock(block_in, block_in, dropout=dropout, norm_type=norm_type)
+        )
+        self.mid.append(AttnBlock(block_in, norm_type=norm_type))
+        self.mid.append(
+            ResnetBlock(block_in, block_in, dropout=dropout, norm_type=norm_type)
+        )
+
+        # end
+        self.norm_out = Normalize(block_in, norm_type)
+        self.conv_out = nn.Conv2d(
+            block_in, z_channels, kernel_size=3, stride=1, padding=1
+        )
+
+    def forward(self, x):
+        h = self.conv_in(x)
+        # downsampling
+        for i_level, block in enumerate(self.conv_blocks):
+            for i_block in range(self.num_res_blocks):
+                h = block.res[i_block](h)
+                if len(block.attn) > 0:
+                    h = block.attn[i_block](h)
+            if i_level != self.num_resolutions - 1:
+                h = block.downsample(h)
+
+        # middle
+        for mid_block in self.mid:
+            h = mid_block(h)
+
+        # end
+        h = self.norm_out(h)
+        h = nonlinearity(h)
+        h = self.conv_out(h)
+        return h
+
+
+class Decoder(nn.Module):
+    def __init__(
+        self,
+        z_channels=256,
+        ch=128,
+        ch_mult=(1, 1, 2, 2, 4),
+        num_res_blocks=2,
+        norm_type="group",
+        dropout=0.0,
+        resamp_with_conv=True,
+        out_channels=3,
+    ):
+        super().__init__()
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+
+        block_in = ch * ch_mult[self.num_resolutions - 1]
+        # z to block_in
+        self.conv_in = nn.Conv2d(
+            z_channels, block_in, kernel_size=3, stride=1, padding=1
+        )
+
+        # middle
+        self.mid = nn.ModuleList()
+        self.mid.append(
+            ResnetBlock(block_in, block_in, dropout=dropout, norm_type=norm_type)
+        )
+        self.mid.append(AttnBlock(block_in, norm_type=norm_type))
+        self.mid.append(
+            ResnetBlock(block_in, block_in, dropout=dropout, norm_type=norm_type)
+        )
+
+        # upsampling
+        self.conv_blocks = nn.ModuleList()
+        for i_level in reversed(range(self.num_resolutions)):
+            conv_block = nn.Module()
+            # res & attn
+            res_block = nn.ModuleList()
+            attn_block = nn.ModuleList()
+            block_out = ch * ch_mult[i_level]
+            for _ in range(self.num_res_blocks + 1):
+                res_block.append(
+                    ResnetBlock(
+                        block_in, block_out, dropout=dropout, norm_type=norm_type
+                    )
+                )
+                block_in = block_out
+                if i_level == self.num_resolutions - 1:
+                    attn_block.append(AttnBlock(block_in, norm_type))
+            conv_block.res = res_block
+            conv_block.attn = attn_block
+            # downsample
+            if i_level != 0:
+                conv_block.upsample = Upsample(block_in, resamp_with_conv)
+            self.conv_blocks.append(conv_block)
+
+        # end
+        self.norm_out = Normalize(block_in, norm_type)
+        self.conv_out = nn.Conv2d(
+            block_in, out_channels, kernel_size=3, stride=1, padding=1
+        )
+
+    @property
+    def last_layer(self):
+        return self.conv_out.weight
+
+    def forward(self, z):
+        # z to block_in
+        h = self.conv_in(z)
+
+        # middle
+        for mid_block in self.mid:
+            h = mid_block(h)
+
+        # upsampling
+        for i_level, block in enumerate(self.conv_blocks):
+            for i_block in range(self.num_res_blocks + 1):
+                h = block.res[i_block](h)
+                if len(block.attn) > 0:
+                    h = block.attn[i_block](h)
+            if i_level != self.num_resolutions - 1:
+                h = block.upsample(h)
+
+        # end
+        h = self.norm_out(h)
+        h = nonlinearity(h)
+        h = self.conv_out(h)
+        return h
+
+
+class VectorQuantizer(nn.Module):
+    def __init__(self, n_e, e_dim, beta, entropy_loss_ratio, l2_norm, show_usage):
+        super().__init__()
+        self.n_e = n_e
+        self.e_dim = e_dim
+        self.beta = beta
+        self.entropy_loss_ratio = entropy_loss_ratio
+        self.l2_norm = l2_norm
+        self.show_usage = show_usage
+
+        self.embedding = nn.Embedding(self.n_e, self.e_dim)
+        self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e)
+        if self.l2_norm:
+            self.embedding.weight.data = F.normalize(
+                self.embedding.weight.data, p=2, dim=-1
+            )
+        if self.show_usage:
+            # self.register_buffer("codebook_used", nn.Parameter(torch.zeros(65536)))
+            self.codebook_used = nn.Parameter(torch.zeros(65536))
+
+    def forward(self, z):
+        # reshape z -> (batch, height, width, channel) and flatten
+        z = torch.einsum("b c h w -> b h w c", z).contiguous()
+        z_flattened = z.view(-1, self.e_dim)
+        # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
+
+        if self.l2_norm:
+            z = F.normalize(z, p=2, dim=-1)
+            z_flattened = F.normalize(z_flattened, p=2, dim=-1)
+            embedding = F.normalize(self.embedding.weight, p=2, dim=-1)
+        else:
+            embedding = self.embedding.weight
+
+        d = (
+            torch.sum(z_flattened**2, dim=1, keepdim=True)
+            + torch.sum(embedding**2, dim=1)
+            - 2
+            * torch.einsum(
+                "bd,dn->bn", z_flattened, torch.einsum("n d -> d n", embedding)
+            )
+        )
+
+        min_encoding_indices = torch.argmin(d, dim=1)
+        z_q = embedding[min_encoding_indices].view(z.shape)
+        perplexity = None
+        min_encodings = None
+        vq_loss = None
+        commit_loss = None
+        entropy_loss = None
+
+        # compute loss for embedding
+        if self.training:
+            vq_loss = torch.mean((z_q - z.detach()) ** 2)
+            commit_loss = self.beta * torch.mean((z_q.detach() - z) ** 2)
+            entropy_loss = self.entropy_loss_ratio * compute_entropy_loss(-d)
+
+        # preserve gradients
+        z_q = z + (z_q - z).detach()
+
+        # reshape back to match original input shape
+        z_q = torch.einsum("b h w c -> b c h w", z_q)
+
+        return (
+            z_q,
+            (vq_loss, commit_loss, entropy_loss),
+            (perplexity, min_encodings, min_encoding_indices),
+        )
+
+    def get_codebook_entry(self, indices, shape=None, channel_first=True):
+        # shape = (batch, channel, height, width) if channel_first else (batch, height, width, channel)
+        if self.l2_norm:
+            embedding = F.normalize(self.embedding.weight, p=2, dim=-1)
+        else:
+            embedding = self.embedding.weight
+        z_q = embedding[indices]  # (b*h*w, c)
+
+        if shape is not None:
+            if channel_first:
+                z_q = z_q.reshape(shape[0], shape[2], shape[3], shape[1])
+                # reshape back to match original input shape
+                z_q = z_q.permute(0, 3, 1, 2).contiguous()
+            else:
+                z_q = z_q.view(shape)
+        return z_q
+
+
+class ResnetBlock(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels=None,
+        conv_shortcut=False,
+        dropout=0.0,
+        norm_type="group",
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+        self.use_conv_shortcut = conv_shortcut
+
+        self.norm1 = Normalize(in_channels, norm_type)
+        self.conv1 = nn.Conv2d(
+            in_channels, out_channels, kernel_size=3, stride=1, padding=1
+        )
+        self.norm2 = Normalize(out_channels, norm_type)
+        self.dropout = nn.Dropout(dropout)
+        self.conv2 = nn.Conv2d(
+            out_channels, out_channels, kernel_size=3, stride=1, padding=1
+        )
+
+        if self.in_channels != self.out_channels:
+            if self.use_conv_shortcut:
+                self.conv_shortcut = nn.Conv2d(
+                    in_channels, out_channels, kernel_size=3, stride=1, padding=1
+                )
+            else:
+                self.nin_shortcut = nn.Conv2d(
+                    in_channels, out_channels, kernel_size=1, stride=1, padding=0
+                )
+
+    def forward(self, x):
+        h = x
+        h = self.norm1(h)
+        h = nonlinearity(h)
+        h = self.conv1(h)
+        h = self.norm2(h)
+        h = nonlinearity(h)
+        h = self.dropout(h)
+        h = self.conv2(h)
+
+        if self.in_channels != self.out_channels:
+            if self.use_conv_shortcut:
+                x = self.conv_shortcut(x)
+            else:
+                x = self.nin_shortcut(x)
+        return x + h
+
+
+class AttnBlock(nn.Module):
+    def __init__(self, in_channels, norm_type="group"):
+        super().__init__()
+        self.norm = Normalize(in_channels, norm_type)
+        self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.proj_out = nn.Conv2d(
+            in_channels, in_channels, kernel_size=1, stride=1, padding=0
+        )
+
+    def forward(self, x):
+        h_ = x
+        h_ = self.norm(h_)
+        q = self.q(h_)
+        k = self.k(h_)
+        v = self.v(h_)
+
+        # compute attention
+        b, c, h, w = q.shape
+        q = q.reshape(b, c, h * w)
+        q = q.permute(0, 2, 1)  # b,hw,c
+        k = k.reshape(b, c, h * w)  # b,c,hw
+        w_ = torch.bmm(q, k)  # b,hw,hw    w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
+        w_ = w_ * (int(c) ** (-0.5))
+        w_ = F.softmax(w_, dim=2)
+
+        # attend to values
+        v = v.reshape(b, c, h * w)
+        w_ = w_.permute(0, 2, 1)  # b,hw,hw (first hw of k, second of q)
+        h_ = torch.bmm(v, w_)  # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
+        h_ = h_.reshape(b, c, h, w)
+
+        h_ = self.proj_out(h_)
+
+        return x + h_
+
+
+def nonlinearity(x):
+    # swish
+    return x * torch.sigmoid(x)
+
+
+def Normalize(in_channels, norm_type="group"):
+    assert norm_type in ["group", "batch"]
+    if norm_type == "group":
+        return nn.GroupNorm(
+            num_groups=32, num_channels=in_channels, eps=1e-6, affine=True
+        )
+    elif norm_type == "batch":
+        return nn.SyncBatchNorm(in_channels)
+
+
+class Upsample(nn.Module):
+    def __init__(self, in_channels, with_conv):
+        super().__init__()
+        self.with_conv = with_conv
+        if self.with_conv:
+            self.conv = nn.Conv2d(
+                in_channels, in_channels, kernel_size=3, stride=1, padding=1
+            )
+
+    def forward(self, x):
+        if x.dtype != torch.float32:
+            x = F.interpolate(x.to(torch.float), scale_factor=2.0, mode="nearest").to(
+                torch.bfloat16
+            )
+        else:
+            x = F.interpolate(x, scale_factor=2.0, mode="nearest")
+
+        if self.with_conv:
+            x = self.conv(x)
+        return x
+
+
+class Downsample(nn.Module):
+    def __init__(self, in_channels, with_conv):
+        super().__init__()
+        self.with_conv = with_conv
+        if self.with_conv:
+            # no asymmetric padding in torch conv, must do it ourselves
+            self.conv = nn.Conv2d(
+                in_channels, in_channels, kernel_size=3, stride=2, padding=0
+            )
+
+    def forward(self, x):
+        if self.with_conv:
+            pad = (0, 1, 0, 1)
+            x = F.pad(x, pad, mode="constant", value=0)
+            x = self.conv(x)
+        else:
+            x = F.avg_pool2d(x, kernel_size=2, stride=2)
+        return x
+
+
+def compute_entropy_loss(affinity, loss_type="softmax", temperature=0.01):
+    flat_affinity = affinity.reshape(-1, affinity.shape[-1])
+    flat_affinity /= temperature
+    probs = F.softmax(flat_affinity, dim=-1)
+    log_probs = F.log_softmax(flat_affinity + 1e-5, dim=-1)
+    if loss_type == "softmax":
+        target_probs = probs
+    else:
+        raise ValueError("Entropy loss {} not supported".format(loss_type))
+    avg_probs = torch.mean(target_probs, dim=0)
+    avg_entropy = -torch.sum(avg_probs * torch.log(avg_probs + 1e-5))
+    sample_entropy = -torch.mean(torch.sum(target_probs * log_probs, dim=-1))
+    loss = sample_entropy - avg_entropy
+    return loss
+
+
+class VQModel(nn.Module):
+    def __init__(self, config: ModelArgs):
+        super().__init__()
+        self.config = config
+        self.encoder = Encoder(
+            ch_mult=config.encoder_ch_mult,
+            z_channels=config.z_channels,
+            dropout=config.dropout_p,
+        )
+        self.decoder = Decoder(
+            ch_mult=config.decoder_ch_mult,
+            z_channels=config.z_channels,
+            dropout=config.dropout_p,
+        )
+
+        self.quantize = VectorQuantizer(
+            config.codebook_size,
+            config.codebook_embed_dim,
+            config.commit_loss_beta,
+            config.entropy_loss_ratio,
+            config.codebook_l2_norm,
+            config.codebook_show_usage,
+        )
+        self.quant_conv = nn.Conv2d(config.z_channels, config.codebook_embed_dim, 1)
+        self.post_quant_conv = nn.Conv2d(
+            config.codebook_embed_dim, config.z_channels, 1
+        )
+
+    def encode(self, x):
+        h = self.encoder(x)
+        h = self.quant_conv(h)
+        quant, emb_loss, info = self.quantize(h)
+        return quant, emb_loss, info
+
+    def decode(self, quant):
+        quant = self.post_quant_conv(quant)
+        dec = self.decoder(quant)
+        return dec
+
+    def decode_code(self, code_b, shape=None, channel_first=True):
+        quant_b = self.quantize.get_codebook_entry(code_b, shape, channel_first)
+        dec = self.decode(quant_b)
+        return dec
+
+    def forward(self, input):
+        quant, diff, _ = self.encode(input)
+        dec = self.decode(quant)
+        return dec, diff
+
+
+class MultiModalityPreTrainedModel(PreTrainedModel):
+    config_class = MultiModalityConfig
+    base_model_prefix = "multi_modality"
+    _no_split_modules = []
+    _skip_keys_device_placement = "past_key_values"
+
+
+# Copied and adapted from:
+# https://github.com/deepseek-ai/Janus/tree/main/janus/models/modeling_vlm.py
+class MultiModalityCausalLM(MultiModalityPreTrainedModel):
+
+    def __init__(
+        self,
+        config: MultiModalityConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+    ):
+        super().__init__(config)
+
+        vision_config = config.vision_config
+        vision_cls = model_name_to_cls(vision_config.cls)
+        self.vision_model = vision_cls(**vision_config.params)
+
+        aligner_config = config.aligner_config
+        aligner_cls = model_name_to_cls(aligner_config.cls)
+        self.aligner = aligner_cls(aligner_config.params)
+
+        gen_vision_config = config.gen_vision_config
+        gen_vision_cls = model_name_to_cls(gen_vision_config.cls)
+        self.gen_vision_model = gen_vision_cls()
+
+        gen_aligner_config = config.gen_aligner_config
+        gen_aligner_cls = model_name_to_cls(gen_aligner_config.cls)
+        self.gen_aligner = gen_aligner_cls(gen_aligner_config.params)
+
+        gen_head_config = config.gen_head_config
+        gen_head_cls = model_name_to_cls(gen_head_config.cls)
+        self.gen_head = gen_head_cls(gen_head_config.params)
+
+        self.gen_embed = torch.nn.Embedding(
+            gen_vision_config.params["image_token_size"],
+            gen_vision_config.params["n_embed"],
+        )
+
+        language_config = config.language_config
+        self.language_model = LlamaForCausalLM(
+            language_config, quant_config=quant_config
+        )
+        self.logits_processor = LogitsProcessor(language_config)
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        pixel_values = torch.concat([item.feature for item in items], dim=0)
+        bs, n = pixel_values.shape[0:2]
+        pixel_values = pixel_values.to(
+            device=self.vision_model.device, dtype=self.vision_model.dtype
+        )
+        images = rearrange(pixel_values, "b n c h w -> (b n) c h w")
+
+        # [b x n, T2, D]
+        images_embeds = self.aligner(self.vision_model(images))
+
+        # [b x n, T2, D] -> [b, n x T2, D]
+        images_embeds = rearrange(images_embeds, "(b n) t d -> b (n t) d", b=bs, n=n)
+
+        return images_embeds
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.language_model.get_input_embeddings()
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        get_embedding: bool = False,
+    ) -> torch.Tensor:
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            multimodal_model=self,
+            language_model=self.language_model,
+            positions=positions,
+        )
+
+        return hidden_states
+
+    def prepare_gen_img_embeds(self, image_ids: torch.LongTensor):
+        return self.gen_aligner(self.gen_embed(image_ids))
+
+    def pad_input_ids(self, input_ids: List[int], image_inputs: MultimodalInputs):
+        im_start_id = image_inputs.im_start_id
+        im_end_id = image_inputs.im_end_id
+        media_token_pairs = [(im_start_id, im_end_id)]
+
+        helper = MultiModalityDataPaddingPatternTokenPairs(media_token_pairs)
+
+        return helper.pad_input_tokens(input_ids, image_inputs)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq~" in name or "projector" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            if name.startswith("model.vision_tower") and name not in params_dict:
+                continue
+
+            # skip generation sub model
+            if "gen" in name:
+                continue
+
+            # adapt to VisionAttention
+            name = name.replace(r"self_attn.out_proj", r"self_attn.proj")
+            if "vision_model.vision_tower" in name:
+                name = name.replace("attn.qkv", "attn.qkv_proj")
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                # replace the name and load with customized loader
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+
+                # # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", None)
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+AutoModel.register(config_class=MultiModalityConfig, model_class=MultiModalityCausalLM)
+EntryClass = [MultiModalityCausalLM]
diff --git a/sglang/python/sglang/srt/models/deepseek_nextn.py b/sglang/python/sglang/srt/models/deepseek_nextn.py
new file mode 100644
index 0000000000000000000000000000000000000000..d57eb882296c86a9dd5be57cc6c5fd717ef039d7
--- /dev/null
+++ b/sglang/python/sglang/srt/models/deepseek_nextn.py
@@ -0,0 +1,265 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Inference-only DeepSeek NextN Speculative Decoding."""
+
+import logging
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.configs.model_config import is_deepseek_nsa
+from sglang.srt.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from sglang.srt.environ import envs
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.layers.attention.nsa.utils import (
+    can_cp_split,
+    cp_all_gather_rerange_output,
+    cp_split_and_rebuild_data,
+    cp_split_and_rebuild_position,
+    is_nsa_enable_prefill_cp,
+    nsa_use_prefill_cp,
+    prepare_input_dp_with_cp_dsa,
+)
+from sglang.srt.layers.dp_attention import (
+    get_attention_cp_rank,
+    get_attention_cp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization import Fp8Config
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.deepseek_common.utils import enable_nextn_moe_bf16_cast_to_fp8
+from sglang.srt.models.deepseek_v2 import DeepseekV2DecoderLayer, DeepseekV3ForCausalLM
+from sglang.srt.models.utils import WeightsMapper
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import BumpAllocator, add_prefix, is_cuda, is_npu
+
+logger = logging.getLogger(__name__)
+
+
+_is_cuda = is_cuda()
+_is_npu = is_npu()
+
+
+class DeepseekModelNextN(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        if enable_nextn_moe_bf16_cast_to_fp8(quant_config):
+            # refer to real DeepSeek V3 quant config
+            moe_quant_config_override = Fp8Config(
+                is_checkpoint_fp8_serialized=True,
+                weight_block_size=[128, 128],
+            )
+        else:
+            moe_quant_config_override = None
+
+        if quant_config is not None and quant_config.get_name() == "modelopt_fp4":
+            logger.warning(
+                "Overriding DeepseekV3ForCausalLMNextN quant config for modelopt_fp4 Deepseek model."
+            )
+            quant_config = None
+
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            use_attn_tp_group=is_dp_attention_enabled(),
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+
+        self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.hnorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        self.eh_proj = nn.Linear(2 * config.hidden_size, config.hidden_size, bias=False)
+
+        self.alt_stream = (
+            torch.cuda.Stream()
+            if _is_cuda or envs.SGLANG_NPU_USE_MULTI_STREAM.get()
+            else None
+        )
+
+        layer_name = "decoder"
+        if _is_npu and (
+            get_global_server_args().speculative_draft_model_path
+            == get_global_server_args().model_path
+        ):
+            layer_name = "layers." + str(config.num_hidden_layers)
+
+        self.decoder = DeepseekV2DecoderLayer(
+            config,
+            0,
+            quant_config=quant_config,
+            moe_quant_config_override=moe_quant_config_override,
+            is_nextn=True,
+            prefix=add_prefix(layer_name, prefix),
+            alt_stream=self.alt_stream,
+        )
+
+        self.shared_head = nn.Module()
+        self.shared_head.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.nsa_enable_prefill_cp = is_nsa_enable_prefill_cp()
+        if self.nsa_enable_prefill_cp:
+            self.cp_size = get_attention_cp_size()
+        else:
+            self.cp_size = None
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        zero_allocator = BumpAllocator(
+            buffer_size=2,
+            dtype=torch.float32,
+            device=(
+                input_embeds.device if input_embeds is not None else input_ids.device
+            ),
+        )
+
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+
+        if hidden_states.shape[0] > 0:
+            hidden_states = self.eh_proj(
+                torch.cat(
+                    (
+                        self.enorm(hidden_states),
+                        self.hnorm(forward_batch.spec_info.hidden_states),
+                    ),
+                    dim=-1,
+                )
+            )
+
+        if nsa_use_prefill_cp(forward_batch, self.nsa_enable_prefill_cp):
+            hidden_states = cp_split_and_rebuild_data(forward_batch, hidden_states)
+            positions = cp_split_and_rebuild_position(forward_batch, positions)
+        residual = None
+        with get_global_expert_distribution_recorder().disable_this_region():
+            hidden_states, residual = self.decoder(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+                zero_allocator,
+            )
+
+        if not forward_batch.forward_mode.is_idle():
+            if residual is not None:
+                hidden_states, _ = self.shared_head.norm(hidden_states, residual)
+            else:
+                hidden_states = self.shared_head.norm(hidden_states)
+
+            if nsa_use_prefill_cp(forward_batch, self.nsa_enable_prefill_cp):
+                # allgather + rerrange
+                hidden_states = cp_all_gather_rerange_output(
+                    hidden_states,
+                    self.cp_size,
+                    forward_batch,
+                    torch.cuda.current_stream(),
+                )
+
+        return hidden_states
+
+
+class DeepseekV3ForCausalLMNextN(DeepseekV3ForCausalLM):
+
+    # Support amd/DeepSeek-R1-0528-MXFP4 renaming: model.layers.61*.
+    # Ref: HF config.json for amd/DeepSeek-R1-0528-MXFP4
+    # https://huggingface.co/amd/DeepSeek-R1-0528-MXFP4/blob/main/config.json
+    hf_to_sglang_mapper = WeightsMapper(
+        orig_to_new_substr={
+            "model.layers.61": "model.decoder",
+        },
+    )
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        nn.Module.__init__(self)
+        self.config = config
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.quant_config = quant_config
+        # if not set, model load will be broken in DeepseekV3ForCausalLM load_weights()
+        self.pp_group = get_pp_group()
+        self.determine_num_fused_shared_experts("DeepseekV3ForCausalLMNextN")
+        self.use_nsa = is_deepseek_nsa(config)
+        self.nsa_enable_prefill_cp = is_nsa_enable_prefill_cp()
+        if self.nsa_enable_prefill_cp:
+            self.cp_rank = get_attention_cp_rank()
+            self.cp_size = get_attention_cp_size()
+        else:
+            self.cp_rank = None
+            self.cp_size = None
+
+        self.model = DeepseekModelNextN(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("model.shared_head.head", prefix),
+            use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        # TODO current just support prefill batch=1 and len(input_ids) > self.cp_size * 2
+        if self.nsa_enable_prefill_cp:
+            if can_cp_split(len(input_ids), self.cp_size, self.use_nsa, forward_batch):
+                forward_batch.nsa_cp_metadata = prepare_input_dp_with_cp_dsa(
+                    len(input_ids),
+                    self.cp_rank,
+                    self.cp_size,
+                    forward_batch.seq_lens_cpu.tolist(),
+                )
+        hidden_states = self.model(input_ids, positions, forward_batch)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        super().load_weights(weights, is_nextn=True)
+
+
+EntryClass = [DeepseekV3ForCausalLMNextN]
diff --git a/sglang/python/sglang/srt/models/deepseek_ocr.py b/sglang/python/sglang/srt/models/deepseek_ocr.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa334af7b0c96b775e4de8122ce8572b49537fbe
--- /dev/null
+++ b/sglang/python/sglang/srt/models/deepseek_ocr.py
@@ -0,0 +1,1856 @@
+# Copyright 2025 The SwissAI Initiative
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/llama.py#L1
+"""Inference-only Apertus model compatible with HuggingFace weights."""
+
+import copy
+import logging
+import math
+from functools import partial
+from typing import Iterable, List, Optional, Set, Tuple, Type, TypeAlias, Union
+
+import torch
+import torch.nn.functional as F
+import transformers
+from torch import Tensor, nn
+from transformers.models.vitdet.modeling_vitdet import get_rel_pos
+
+from sglang.srt.configs.deepseek_ocr import DeepseekVLV2Config
+from sglang.srt.layers.quantization import QuantizationConfig
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternMultimodalTokens,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInputs
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.deepseek import DeepseekForCausalLM
+from sglang.srt.models.deepseek_v2 import DeepseekV2ForCausalLM, DeepseekV3ForCausalLM
+from sglang.srt.models.transformers import maybe_prefix
+
+NestedTensors: TypeAlias = Union[
+    list["NestedTensors"],
+    list["torch.Tensor"],
+    "torch.Tensor",
+    tuple["torch.Tensor", ...],
+]
+
+MultiModalEmbeddings: TypeAlias = list[Tensor] | Tensor | tuple[Tensor, ...]
+
+logger = logging.getLogger(__name__)
+
+
+def _flatten_embeddings(embeddings: NestedTensors) -> torch.Tensor:
+    """
+    Recursively flattens and concatenates NestedTensors on all but the last
+    dimension.
+    """
+
+    if isinstance(embeddings, torch.Tensor):
+        # Flatten all but the last dimension.
+        return embeddings.flatten(0, -2)
+
+    return torch.cat(tuple(_flatten_embeddings(t) for t in embeddings))
+
+
+def _embedding_count_expression(embeddings: NestedTensors) -> str:
+    """
+    Constructs a debugging representation of the number of embeddings in the
+    NestedTensors.
+    """
+
+    if isinstance(embeddings, torch.Tensor):
+        return " x ".join([str(dim) for dim in embeddings.shape[:-1]])
+
+    return " + ".join(_embedding_count_expression(inner) for inner in embeddings)
+
+
+def _merge_multimodal_embeddings(
+    inputs_embeds: torch.Tensor,
+    multimodal_embeddings: NestedTensors,
+    is_multimodal: torch.Tensor,
+) -> torch.Tensor:
+    """
+    Merge `multimodal_embeddings` into `inputs_embeds` by overwriting the
+    positions in `inputs_embeds` corresponding to placeholder tokens in
+    `input_ids`.
+
+    Note:
+        This updates `inputs_embeds` in place.
+    """
+    if len(multimodal_embeddings) == 0:
+        return inputs_embeds
+
+    mm_embeds_flat = _flatten_embeddings(multimodal_embeddings)
+    input_dtype = inputs_embeds.dtype
+
+    try:
+        # NOTE: This can avoid D2H sync (#22105), but fails to
+        # raise an error if is_multimodal.sum() < len(mm_embeds_flat)
+        inputs_embeds.masked_scatter_(
+            is_multimodal.unsqueeze(-1), mm_embeds_flat.to(dtype=input_dtype)
+        )
+    except RuntimeError as e:
+        num_actual_tokens = len(mm_embeds_flat)
+        num_expected_tokens = is_multimodal.sum().item()
+
+        if num_actual_tokens != num_expected_tokens:
+            expr = _embedding_count_expression(multimodal_embeddings)
+
+            raise ValueError(
+                f"Attempted to assign {expr} = {num_actual_tokens} "
+                f"multimodal tokens to {num_expected_tokens} placeholders"
+            ) from e
+
+        raise ValueError("Error during masked scatter operation") from e
+
+    return inputs_embeds
+
+
+def isin_list(
+    elements: torch.Tensor,
+    test_elements_list: list[int],
+) -> torch.Tensor:
+    test_elements = torch.tensor(test_elements_list, pin_memory=True).to(
+        device=elements.device, non_blocking=True
+    )
+
+    return torch.isin(elements, test_elements)
+
+
+def merge_multimodal_embeddings(
+    input_ids: torch.Tensor,
+    inputs_embeds: torch.Tensor,
+    multimodal_embeddings: NestedTensors,
+    placeholder_token_id: int | list[int],
+) -> torch.Tensor:
+    """
+    Merge `multimodal_embeddings` into `inputs_embeds` by overwriting the
+    positions in `inputs_embeds` corresponding to placeholder tokens in
+    `input_ids`.
+
+    `placeholder_token_id` can be a list of token ids (e.g, token ids
+    of img_start, img_break, and img_end tokens) when needed: This means
+    the order of these tokens in the `input_ids` MUST MATCH the order of
+    their embeddings in `multimodal_embeddings` since we need to
+    slice-merge instead of individually scattering.
+
+    For example, if input_ids is "TTTTTSIIIBIIIBIIIETTT", where
+    - T is text token
+    - S is image start token
+    - I is image embedding token
+    - B is image break token
+    - E is image end token.
+
+    Then the image embeddings (that correspond to I's) from vision encoder
+    must be padded with embeddings of S, B, and E in the same order of
+    input_ids for a correct embedding merge.
+
+    Note:
+        This updates `inputs_embeds` in place.
+    """
+    if isinstance(placeholder_token_id, list):
+        is_multimodal = isin_list(input_ids, placeholder_token_id)
+    else:
+        is_multimodal = input_ids == placeholder_token_id
+
+    return _merge_multimodal_embeddings(
+        inputs_embeds,
+        multimodal_embeddings=multimodal_embeddings,
+        is_multimodal=is_multimodal,
+    )
+
+
+class MlpProjector(nn.Module):
+
+    def __init__(
+        self,
+        projector_type,
+        input_dim,
+        n_embed,
+        depth=1,
+        mlp_ratio=1,
+        downsample_ratio=4,
+    ):
+        self.projector_type = projector_type
+        self.input_dim = input_dim
+        self.n_embed = n_embed
+        self.depth = depth
+        self.token_pooling = False
+        self.conv_fusion_high_low_features = False
+
+        super().__init__()
+
+        if projector_type == "identity":
+            modules = nn.Identity()
+
+        elif projector_type == "linear":
+            modules = nn.Linear(input_dim, n_embed)
+
+        elif projector_type == "mlp_gelu":
+            mlp_depth = depth
+            modules = [nn.Linear(input_dim, n_embed)]
+            for _ in range(1, mlp_depth):
+                modules.append(nn.GELU())
+                modules.append(nn.Linear(n_embed, n_embed))
+            modules = nn.Sequential(*modules)
+
+        elif projector_type == "normlayer_downsample_mlp_gelu":
+            mlp_depth = depth
+            mlp_ratio = mlp_ratio
+            modules = [
+                nn.LayerNorm(input_dim * downsample_ratio * downsample_ratio),
+                nn.Linear(
+                    input_dim * downsample_ratio * downsample_ratio,
+                    n_embed * mlp_ratio,
+                ),
+            ]
+            for _ in range(1, mlp_depth - 1):
+                modules.append(nn.GELU())
+                modules.append(nn.Linear(n_embed * mlp_ratio, n_embed * mlp_ratio))
+            modules.append(nn.GELU())
+            modules.append(nn.Linear(n_embed * mlp_ratio, n_embed))
+            modules = nn.Sequential(*modules)
+
+        elif projector_type == "downsample_mlp_gelu":
+            mlp_depth = depth
+            mlp_ratio = mlp_ratio
+            modules = [
+                nn.Linear(
+                    input_dim * downsample_ratio * downsample_ratio,
+                    n_embed * mlp_ratio,
+                )
+            ]
+            for _ in range(1, mlp_depth - 1):
+                modules.append(nn.GELU())
+                modules.append(nn.Linear(n_embed * mlp_ratio, n_embed * mlp_ratio))
+            modules.append(nn.GELU())
+            modules.append(nn.Linear(n_embed * mlp_ratio, n_embed))
+            modules = nn.Sequential(*modules)
+
+        elif projector_type == "low_high_hybrid_split_mlp_gelu":
+            mlp_depth = depth
+            self.high_up_proj = nn.Linear(input_dim, n_embed // 2)
+            self.low_up_proj = nn.Linear(input_dim, n_embed // 2)
+
+            modules = []
+            for _ in range(1, mlp_depth):
+                modules.append(nn.GELU())
+                modules.append(nn.Linear(n_embed, n_embed))
+            modules = nn.Sequential(*modules)
+
+        elif projector_type == "hybrid_split_feature_mlp_gelu":
+            mlp_depth = depth
+            channel_div = 0.5
+            self.high_up_proj = nn.Linear(input_dim[0], int(n_embed * channel_div))
+            self.low_up_proj = nn.Linear(
+                input_dim[1], n_embed - int(n_embed * channel_div)
+            )
+
+            modules = []
+            for _ in range(1, mlp_depth):
+                modules.append(nn.GELU())
+                modules.append(nn.Linear(n_embed, n_embed))
+            modules = nn.Sequential(*modules)
+
+        elif projector_type == "low_high_split_mlp_gelu":
+            mlp_depth = depth
+            modules = []
+            for _ in range(1, mlp_depth):
+                modules.append(nn.GELU())
+                modules.append(nn.Linear(n_embed // 2, n_embed // 2))
+            modules = nn.Sequential(*modules)
+            self.high_layers = nn.Sequential(*modules)
+            self.low_layers = copy.deepcopy(modules)
+
+        else:
+            raise ValueError(f"Unknown projector type: {projector_type}")
+
+        self.layers = modules
+
+    def forward(self, x):
+        if self.token_pooling:
+            batch_size, wxh, channels = x.shape
+            w = h = int(wxh**0.5)
+            x = x.view(batch_size, w, h, channels)
+            x = x.permute(0, 3, 1, 2)
+            patches = x.unfold(2, 2, 2).unfold(3, 2, 2)
+            batch_size, channels, h_patches, w_patches, _, _ = patches.size()
+            # Concatenate on channel dimension
+            patches = patches.contiguous().view(
+                batch_size, channels, h_patches * w_patches, -1
+            )
+
+            # Pass through linear layer
+            patches = patches.permute(0, 2, 1, 3).contiguous()
+            patches = patches.view(batch_size, h_patches * w_patches, channels * 4)
+
+            x = self.token_pooling_layer(patches)
+
+        if self.conv_fusion_high_low_features:
+            x = self.fusion_layer(x[:, 0]) + x[:, 1]
+
+        if self.projector_type == "low_high_hybrid_split_mlp_gelu":
+            high_x, low_x = x[0], x[1]
+            high_x = self.high_up_proj(high_x)
+            low_x = self.low_up_proj(low_x)
+            x = torch.concat([high_x, low_x], dim=-1)
+
+        if self.projector_type == "hybrid_split_feature_mlp_gelu":
+            high_x = x[..., : self.input_dim[0]]
+            low_x = x[..., self.input_dim[0] :]
+            high_x = self.high_up_proj(high_x)
+            low_x = self.low_up_proj(low_x)
+            x = torch.concat([high_x, low_x], dim=-1)
+
+        if self.projector_type == "low_high_split_mlp_gelu":
+            high_x, low_x = x[0], x[1]
+            high_x = self.high_layers(high_x)
+            low_x = self.low_layers(low_x)
+            x = torch.concat([high_x, low_x], dim=-1)
+            return x
+
+        if (
+            self.projector_type == "downsample_mlp_gelu"
+            or self.projector_type == "normlayer_downsample_mlp_gelu"
+        ):
+            bs, hw, input_dim = x.shape
+            h = w = int((hw) ** 0.5)
+
+            """compute padding"""
+            if h % self.downsample_ratio:
+                pad = self.downsample_ratio - h % self.downsample_ratio
+            else:
+                pad = 0
+            x = x.reshape(bs, h, w, input_dim)
+            if pad > 0:
+                x = F.pad(x, (0, 0, 0, pad, 0, pad), "constant", 0)
+
+            """4 to 1 concat"""
+            x = x.permute(0, 3, 1, 2)  # B, C, H, W
+            x = F.unfold(
+                x,
+                kernel_size=self.downsample_ratio,
+                stride=self.downsample_ratio,
+                padding=0,
+            )  # B, C*4, HW // 4
+            x = x.permute(0, 2, 1)
+
+        return self.layers(x)
+
+
+class LayerNorm2d(nn.Module):
+    def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(num_channels))
+        self.bias = nn.Parameter(torch.zeros(num_channels))
+        self.eps = eps
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        u = x.mean(1, keepdim=True)
+        s = (x - u).pow(2).mean(1, keepdim=True)
+        x = (x - u) / torch.sqrt(s + self.eps)
+        x = self.weight[:, None, None] * x + self.bias[:, None, None]
+        return x
+
+
+class MLPBlock(nn.Module):
+    def __init__(
+        self,
+        embedding_dim: int,
+        mlp_dim: int,
+        act: Type[nn.Module] = nn.GELU,
+    ) -> None:
+        super().__init__()
+        self.lin1 = nn.Linear(embedding_dim, mlp_dim)
+        self.lin2 = nn.Linear(mlp_dim, embedding_dim)
+        self.act = act()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.lin2(self.act(self.lin1(x)))
+
+
+def add_decomposed_rel_pos(
+    q: torch.Tensor,
+    rel_pos_h: torch.Tensor,
+    rel_pos_w: torch.Tensor,
+    q_size: Tuple[int, int],
+    k_size: Tuple[int, int],
+) -> torch.Tensor:
+    """
+    Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
+    https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py   # noqa B950
+    Args:
+        q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
+        rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
+        rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
+        q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
+        k_size (Tuple): spatial sequence size of key k with (k_h, k_w).
+    Returns:
+        attn (Tensor): attention map with added relative positional embeddings.
+    """
+    q_h, q_w = q_size
+    k_h, k_w = k_size
+    Rh = get_rel_pos(q_h, k_h, rel_pos_h)
+    Rw = get_rel_pos(q_w, k_w, rel_pos_w)
+
+    B, _, dim = q.shape
+    r_q = q.reshape(B, q_h, q_w, dim)
+    rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh)
+    rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw)
+    rel_h = rel_h.unsqueeze(-1)
+    rel_w = rel_w.unsqueeze(-2)
+    rel_h = rel_h.reshape(B, q_h * q_w, k_h, 1)
+    rel_w = rel_w.reshape(B, q_h * q_w, 1, k_w)
+
+    return rel_h, rel_w
+
+
+class Attention(nn.Module):
+    """Multi-head Attention block with relative position embeddings."""
+
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 8,
+        qkv_bias: bool = True,
+        use_rel_pos: bool = False,
+        rel_pos_zero_init: bool = True,
+        input_size: Optional[Tuple[int, int]] = None,
+    ) -> None:
+        """
+        Args:
+            dim (int): Number of input channels.
+            num_heads (int): Number of attention heads.
+            qkv_bias (bool):  If True, add a learnable bias to query, key, value.
+            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
+            input_size (tuple(int, int) or None): Input resolution for calculating the relative
+                positional parameter size.
+        """
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = head_dim**-0.5
+
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.proj = nn.Linear(dim, dim)
+
+        self.use_rel_pos = use_rel_pos
+        if self.use_rel_pos:
+            assert (
+                input_size is not None
+            ), "Input size must be provided if using relative positional encoding."
+            # initialize relative positional embeddings
+            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
+            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        B, H, W, _ = x.shape
+        # qkv with shape (3, B, nHead, H * W, C)
+        qkv = (
+            self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+        )
+        # q, k, v with shape (B * nHead, H * W, C)
+        q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(0)
+
+        rel_h, rel_w = None, None
+        if self.use_rel_pos:
+            rel_h, rel_w = add_decomposed_rel_pos(
+                q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W)
+            )
+
+        q = q.view(B, self.num_heads, H * W, -1)
+        k = k.view(B, self.num_heads, H * W, -1)
+        v = v.view(B, self.num_heads, H * W, -1)
+
+        if self.use_rel_pos:
+            rel_h = rel_h.view(
+                B, self.num_heads, rel_h.size(1), rel_h.size(2), rel_h.size(3)
+            )
+            rel_w = rel_w.view(
+                B, self.num_heads, rel_w.size(1), rel_w.size(2), rel_w.size(3)
+            )
+            attn_bias = (rel_h + rel_w).view(
+                B, self.num_heads, rel_h.size(2), rel_h.size(3) * rel_w.size(4)
+            )
+            x = torch.nn.functional.scaled_dot_product_attention(
+                q, k, v, attn_mask=attn_bias
+            )
+            # x = _attention_rel_h_rel_w(q, k, v, rel_h, rel_w)
+        else:
+            x = torch.nn.functional.scaled_dot_product_attention(q, k, v)
+
+        x = (
+            x.view(B, self.num_heads, H, W, -1)
+            .permute(0, 2, 3, 1, 4)
+            .reshape(B, H, W, -1)
+        )
+
+        x = self.proj(x)
+
+        return x
+
+
+def window_partition(
+    x: torch.Tensor, window_size: int
+) -> Tuple[torch.Tensor, Tuple[int, int]]:
+    """
+    Partition into non-overlapping windows with padding if needed.
+    Args:
+        x (tensor): input tokens with [B, H, W, C].
+        window_size (int): window size.
+    Returns:
+        windows: windows after partition with [B * num_windows, window_size, window_size, C].
+        (Hp, Wp): padded height and width before partition
+    """
+    B, H, W, C = x.shape
+
+    pad_h = (window_size - H % window_size) % window_size
+    pad_w = (window_size - W % window_size) % window_size
+    if pad_h > 0 or pad_w > 0:
+        x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
+    Hp, Wp = H + pad_h, W + pad_w
+
+    x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C)
+    windows = (
+        x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
+    )
+    return windows, (Hp, Wp)
+
+
+def window_unpartition(
+    windows: torch.Tensor,
+    window_size: int,
+    pad_hw: Tuple[int, int],
+    hw: Tuple[int, int],
+) -> torch.Tensor:
+    """
+    Window unpartition into original sequences and removing padding.
+    Args:
+        windows (tensor): input tokens with [B * num_windows, window_size, window_size, C].
+        window_size (int): window size.
+        pad_hw (Tuple): padded height and width (Hp, Wp).
+        hw (Tuple): original height and width (H, W) before padding.
+    Returns:
+        x: unpartitioned sequences with [B, H, W, C].
+    """
+    Hp, Wp = pad_hw
+    H, W = hw
+    B = windows.shape[0] // (Hp * Wp // window_size // window_size)
+    x = windows.view(
+        B, Hp // window_size, Wp // window_size, window_size, window_size, -1
+    )
+    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)
+
+    if Hp > H or Wp > W:
+        x = x[:, :H, :W, :].contiguous()
+    return x
+
+
+class Block(nn.Module):
+    """Transformer blocks with support of window attention and residual propagation blocks"""
+
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        qkv_bias: bool = True,
+        norm_layer: Type[nn.Module] = nn.LayerNorm,
+        act_layer: Type[nn.Module] = nn.GELU,
+        use_rel_pos: bool = False,
+        rel_pos_zero_init: bool = True,
+        window_size: int = 0,
+        input_size: Optional[Tuple[int, int]] = None,
+    ) -> None:
+        """
+        Args:
+            dim (int): Number of input channels.
+            num_heads (int): Number of attention heads in each ViT block.
+            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+            qkv_bias (bool): If True, add a learnable bias to query, key, value.
+            norm_layer (nn.Module): Normalization layer.
+            act_layer (nn.Module): Activation layer.
+            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
+            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
+            window_size (int): Window size for window attention blocks. If it equals 0, then
+                use global attention.
+            input_size (tuple(int, int) or None): Input resolution for calculating the relative
+                positional parameter size.
+        """
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = Attention(
+            dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            use_rel_pos=use_rel_pos,
+            rel_pos_zero_init=rel_pos_zero_init,
+            input_size=input_size if window_size == 0 else (window_size, window_size),
+        )
+
+        self.norm2 = norm_layer(dim)
+        self.mlp = MLPBlock(
+            embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer
+        )
+
+        self.window_size = window_size
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        shortcut = x
+        x = self.norm1(x)
+        # Window partition
+        if self.window_size > 0:
+            H, W = x.shape[1], x.shape[2]
+            x, pad_hw = window_partition(x, self.window_size)
+
+        x = self.attn(x)
+        # Reverse window partition
+        if self.window_size > 0:
+            x = window_unpartition(x, self.window_size, pad_hw, (H, W))
+
+        x = shortcut + x
+        x = x + self.mlp(self.norm2(x))
+
+        return x
+
+
+class PatchEmbed(nn.Module):
+    """
+    Image to Patch Embedding.
+    """
+
+    def __init__(
+        self,
+        kernel_size: Tuple[int, int] = (16, 16),
+        stride: Tuple[int, int] = (16, 16),
+        padding: Tuple[int, int] = (0, 0),
+        in_chans: int = 3,
+        embed_dim: int = 768,
+    ) -> None:
+        """
+        Args:
+            kernel_size (Tuple): kernel size of the projection layer.
+            stride (Tuple): stride of the projection layer.
+            padding (Tuple): padding size of the projection layer.
+            in_chans (int): Number of input image channels.
+            embed_dim (int): Patch embedding dimension.
+        """
+        super().__init__()
+
+        self.proj = nn.Conv2d(
+            in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.proj(x)
+        # B C H W -> B H W C
+        x = x.permute(0, 2, 3, 1)
+        return x
+
+
+def get_abs_pos_sam(abs_pos, tgt_size):
+    dtype = abs_pos.dtype
+
+    src_size = abs_pos.size(1)
+
+    if src_size != tgt_size:
+        old_pos_embed = abs_pos.permute(0, 3, 1, 2)
+        old_pos_embed = old_pos_embed.to(torch.float32)
+        new_pos_embed = F.interpolate(
+            old_pos_embed,
+            size=(tgt_size, tgt_size),
+            mode="bicubic",
+            antialias=True,
+            align_corners=False,
+        ).to(dtype)
+        new_pos_embed = new_pos_embed.permute(0, 2, 3, 1)
+        return new_pos_embed
+    else:
+        return abs_pos
+
+
+# This class and its supporting functions below lightly adapted from the ViTDet backbone available at: https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/vit.py # noqa
+class ImageEncoderViT(nn.Module):
+    def __init__(
+        self,
+        img_size: int = 1024,
+        patch_size: int = 16,
+        in_chans: int = 3,
+        embed_dim: int = 768,
+        depth: int = 12,
+        num_heads: int = 12,
+        mlp_ratio: float = 4.0,
+        out_chans: int = 256,
+        qkv_bias: bool = True,
+        norm_layer: Type[nn.Module] = nn.LayerNorm,
+        act_layer: Type[nn.Module] = nn.GELU,
+        use_abs_pos: bool = True,
+        use_rel_pos: bool = False,
+        rel_pos_zero_init: bool = True,
+        window_size: int = 0,
+        global_attn_indexes: Tuple[int, ...] = (),
+        net_3_out_channels: int = 1024,
+    ) -> None:
+        """
+        Args:
+            img_size (int): Input image size.
+            patch_size (int): Patch size.
+            in_chans (int): Number of input image channels.
+            embed_dim (int): Patch embedding dimension.
+            depth (int): Depth of ViT.
+            num_heads (int): Number of attention heads in each ViT block.
+            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+            qkv_bias (bool): If True, add a learnable bias to query, key, value.
+            norm_layer (nn.Module): Normalization layer.
+            act_layer (nn.Module): Activation layer.
+            use_abs_pos (bool): If True, use absolute positional embeddings.
+            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
+            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
+            window_size (int): Window size for window attention blocks.
+            global_attn_indexes (list): Indexes for blocks using global attention.
+        """
+        super().__init__()
+        self.img_size = img_size
+
+        self.patch_embed = PatchEmbed(
+            kernel_size=(patch_size, patch_size),
+            stride=(patch_size, patch_size),
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+        )
+
+        self.pos_embed: Optional[nn.Parameter] = None
+        if use_abs_pos:
+            # Initialize absolute positional embedding with pretrain image size.
+            self.pos_embed = nn.Parameter(
+                torch.zeros(
+                    1, img_size // patch_size, img_size // patch_size, embed_dim
+                )
+            )
+
+        self.blocks = nn.ModuleList()
+        for i in range(depth):
+            block = Block(
+                dim=embed_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                norm_layer=norm_layer,
+                act_layer=act_layer,
+                use_rel_pos=use_rel_pos,
+                rel_pos_zero_init=rel_pos_zero_init,
+                window_size=window_size if i not in global_attn_indexes else 0,
+                input_size=(img_size // patch_size, img_size // patch_size),
+            )
+            self.blocks.append(block)
+
+        self.neck = nn.Sequential(
+            nn.Conv2d(
+                embed_dim,
+                out_chans,
+                kernel_size=1,
+                bias=False,
+            ),
+            LayerNorm2d(out_chans),
+            nn.Conv2d(
+                out_chans,
+                out_chans,
+                kernel_size=3,
+                padding=1,
+                bias=False,
+            ),
+            LayerNorm2d(out_chans),
+        )
+
+        self.net_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1, bias=False)
+        self.net_3 = nn.Conv2d(
+            512, net_3_out_channels, kernel_size=3, stride=2, padding=1, bias=False
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.patch_embed(x)
+        if self.pos_embed is not None:
+            x = x + get_abs_pos_sam(self.pos_embed, x.size(1))
+
+        for blk in self.blocks:
+            x = blk(x)
+
+        x = self.neck(x.permute(0, 3, 1, 2))
+        x2 = self.net_2(x)
+        x3 = self.net_3(x2.clone())
+
+        return x3
+
+
+def _build_sam(
+    encoder_embed_dim,
+    encoder_depth,
+    encoder_num_heads,
+    encoder_global_attn_indexes,
+    checkpoint=None,
+    net_3_out_channels: int = 1024,
+):
+    prompt_embed_dim = 256
+    image_size = 1024
+    vit_patch_size = 16
+    image_encoder = ImageEncoderViT(
+        depth=encoder_depth,
+        embed_dim=encoder_embed_dim,
+        img_size=image_size,
+        mlp_ratio=4,
+        norm_layer=partial(torch.nn.LayerNorm, eps=1e-6),
+        num_heads=encoder_num_heads,
+        patch_size=vit_patch_size,
+        qkv_bias=True,
+        use_rel_pos=True,
+        global_attn_indexes=encoder_global_attn_indexes,
+        window_size=14,
+        out_chans=prompt_embed_dim,
+        net_3_out_channels=net_3_out_channels,
+    )
+    image_encoder.eval()
+    if checkpoint is not None:
+        state_dict = torch.load(checkpoint)
+        image_encoder.load_state_dict(
+            {k[30:]: v for k, v in state_dict.items() if "vision_tower_high" in k},
+            strict=True,
+        )
+    return image_encoder
+
+
+def build_sam_vit_b(checkpoint=None, net_3_out_channels: int = 1024):
+    return _build_sam(
+        encoder_embed_dim=768,
+        encoder_depth=12,
+        encoder_num_heads=12,
+        encoder_global_attn_indexes=[2, 5, 8, 11],
+        checkpoint=checkpoint,
+        net_3_out_channels=net_3_out_channels,
+    )
+
+
+def get_abs_pos(abs_pos, tgt_size):
+    # abs_pos: L, C
+    # tgt_size: M
+    # return: M, C
+    dim = abs_pos.size(-1)
+    abs_pos_new = abs_pos.squeeze(0)
+    cls_token, old_pos_embed = abs_pos_new[:1], abs_pos_new[1:]
+
+    src_size = int(math.sqrt(abs_pos_new.shape[0] - 1))
+    tgt_size = int(math.sqrt(tgt_size))
+    dtype = abs_pos.dtype
+
+    if src_size != tgt_size:
+        old_pos_embed = (
+            old_pos_embed.view(1, src_size, src_size, dim)
+            .permute(0, 3, 1, 2)
+            .contiguous()
+        )
+        old_pos_embed = old_pos_embed.to(torch.float32)
+        new_pos_embed = F.interpolate(
+            old_pos_embed,
+            size=(tgt_size, tgt_size),
+            mode="bicubic",
+            antialias=True,
+            align_corners=False,
+        ).to(dtype)
+        new_pos_embed = new_pos_embed.permute(0, 2, 3, 1)
+        new_pos_embed = new_pos_embed.view(tgt_size * tgt_size, dim)
+        vision_pos_embed = torch.cat([cls_token, new_pos_embed], dim=0)
+        vision_pos_embed = vision_pos_embed.view(1, tgt_size * tgt_size + 1, dim)
+        return vision_pos_embed
+    else:
+        return abs_pos
+
+
+class CLIPVisionEmbeddings(nn.Module):
+    def __init__(self, hidden_size=1024, image_size=224, patch_size=14, num_channels=3):
+        super().__init__()
+        self.embed_dim = hidden_size
+        self.image_size = image_size
+        self.patch_size = patch_size
+
+        self.class_embedding = torch.nn.Parameter(torch.randn(self.embed_dim))
+
+        self.patch_embedding = torch.nn.Conv2d(
+            in_channels=num_channels,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+            bias=False,
+        )
+
+        self.num_patches = (self.image_size // self.patch_size) ** 2
+        self.num_positions = self.num_patches + 1
+        self.position_embedding = torch.nn.Embedding(self.num_positions, self.embed_dim)
+        self.register_buffer(
+            "position_ids", torch.arange(self.num_positions).expand((1, -1))
+        )
+
+    def forward(self, pixel_values, patch_embeds):
+        batch_size = pixel_values.shape[0]
+
+        if patch_embeds is not None:
+            patch_embeds = patch_embeds
+        else:
+            patch_embeds = self.patch_embedding(pixel_values)
+
+        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
+
+        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
+        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
+
+        embeddings = embeddings + get_abs_pos(
+            self.position_embedding(self.position_ids), embeddings.size(1)
+        )
+        return embeddings
+
+
+class NoTPAttention(torch.nn.Module):
+    def __init__(self, cfg):
+        super().__init__()
+        self.num_heads = cfg["num_attention_heads"]
+        self.n_local_heads = cfg["num_attention_heads"]
+        self.head_dim = cfg["hidden_size"] // cfg["num_attention_heads"]
+        self.max_seq_len = cfg["seq_length"]
+        self.use_flash_attention = cfg["use_flash_attn"]
+
+        self.qkv_proj = torch.nn.Linear(
+            cfg["hidden_size"], cfg["hidden_size"] * 3, bias=True
+        )
+        self.out_proj = torch.nn.Linear(
+            cfg["hidden_size"], cfg["hidden_size"], bias=True
+        )
+
+        # self.core_attention = CoreAttention(cfg, AttnType.self_attn)
+
+        self.attn_drop = cfg["attention_dropout"]
+
+    def forward(
+        self,
+        x: torch.Tensor,
+    ):
+        bsz, seqlen, _ = x.shape
+        xqkv = self.qkv_proj(x)
+        xqkv = xqkv.view(bsz, seqlen, 3, self.num_heads, self.head_dim)
+
+        if self.use_flash_attention:
+
+            xq, xk, xv = torch.split(xqkv, 1, dim=2)
+            xq = xq.squeeze(2)
+            xk = xk.squeeze(2)
+            xv = xv.squeeze(2)
+            # xq, xk, xv = xqkv[:, :, 0, ...], xqkv[:, :, 1, ...], xqkv[:, :, 2, ...]
+
+            # （B, num_head, S, head_size)
+            xq = xq.permute(0, 2, 1, 3)
+            xk = xk.permute(0, 2, 1, 3)
+            xv = xv.permute(0, 2, 1, 3)
+            output = torch.nn.functional.scaled_dot_product_attention(
+                xq, xk, xv, attn_mask=None
+            )
+            output = output.permute(0, 2, 1, 3).reshape(bsz, seqlen, -1)
+        else:
+            xq, xk, xv = torch.split(xqkv, 1, dim=2)
+            xq = xq.squeeze(2)
+            xk = xk.squeeze(2)
+            xv = xv.squeeze(2)
+
+            xq = xq.permute(0, 2, 1, 3)
+            xk = xk.permute(0, 2, 1, 3)
+            xv = xv.permute(0, 2, 1, 3)
+            output = torch.nn.functional.scaled_dot_product_attention(
+                xq, xk, xv, attn_mask=None
+            )
+            output = output.permute(0, 2, 1, 3).reshape(bsz, seqlen, -1)
+        output = self.out_proj(output)
+        return output
+
+
+@torch.jit.script
+def quick_gelu(x):
+    return x * torch.sigmoid(1.702 * x)
+
+
+class NoTPFeedForward(nn.Module):
+    def __init__(
+        self,
+        cfg,
+        dim: int,
+        hidden_dim: int,
+    ):
+        super().__init__()
+
+        self.fc1 = torch.nn.Linear(dim, hidden_dim, bias=True)
+        self.fc2 = torch.nn.Linear(hidden_dim, dim, bias=True)
+
+    def forward(self, x):
+        output = self.fc2(quick_gelu(self.fc1(x)))
+        return output
+
+
+class LayerNormfp32(torch.nn.LayerNorm):
+    """Subclass torch's LayerNorm to handle fp16."""
+
+    def forward(self, x: torch.Tensor):
+        orig_type = x.dtype
+        ret = super().forward(x.type(torch.float32))
+        return ret.type(orig_type)
+
+
+class NoTPTransformerBlock(nn.Module):
+    def __init__(self, cfg, layer_id: int, multiple_of=256):
+        super().__init__()
+
+        self.n_heads = cfg["num_attention_heads"]
+        self.dim = cfg["hidden_size"]
+        self.head_dim = cfg["hidden_size"] // cfg["num_attention_heads"]
+        self.self_attn = NoTPAttention(cfg)
+        self.mlp = NoTPFeedForward(
+            cfg, dim=cfg["hidden_size"], hidden_dim=cfg["ffn_hidden_size"]
+        )
+        self.layer_id = layer_id
+        self.layer_norm1 = torch.nn.LayerNorm(
+            cfg["hidden_size"], eps=cfg["layernorm_epsilon"]
+        )
+        self.layer_norm2 = torch.nn.LayerNorm(
+            cfg["hidden_size"], eps=cfg["layernorm_epsilon"]
+        )
+
+    def forward(self, x: torch.Tensor):
+        residual = self.self_attn.forward(self.layer_norm1(x))
+        h = x + residual
+        out = h + self.mlp.forward(self.layer_norm2(h))
+        return out
+
+
+class NoTPTransformer(nn.Module):
+    def __init__(self, cfg):
+        super().__init__()
+
+        self.cfg = cfg
+        self.num_layers = cfg["num_layers"]
+
+        self.layers = torch.nn.ModuleList()
+        for layer_id in range(self.num_layers):
+            self.layers.append(
+                NoTPTransformerBlock(
+                    cfg,
+                    layer_id + 1,
+                )
+            )
+
+    def forward(
+        self,
+        hidden_states,
+    ):
+
+        for layer in self.layers:
+            hidden_states = layer(hidden_states)
+
+        return hidden_states
+
+
+class VitModel(nn.Module):
+    def __init__(self, cfg, freeze_embed=False, freeze_pre_norm=False) -> None:
+        super().__init__()
+
+        self.embeddings = CLIPVisionEmbeddings(
+            hidden_size=cfg["hidden_size"],
+            image_size=cfg["image_size"],
+            patch_size=cfg["patch_size"],
+        )
+
+        if freeze_embed:
+            for _, param in self.embeddings.named_parameters():
+                param.requires_grad = False
+
+        self.transformer = NoTPTransformer(cfg=cfg)
+
+        if cfg.get("fp32norm", False):
+            logger.info("Load fp32 layernorm for ViT.")
+            self.pre_layrnorm = LayerNormfp32(
+                cfg["hidden_size"],
+                eps=cfg.get("pre_layernorm_epsilon", 1e-5),
+            )
+        else:
+            self.pre_layrnorm = torch.nn.LayerNorm(
+                cfg["hidden_size"],
+                eps=cfg.get("pre_layernorm_epsilon", 1e-5),
+            )
+
+        if freeze_pre_norm:
+            for _, param in self.pre_layrnorm.named_parameters():
+                param.requires_grad = False
+
+        for p in self.parameters():
+            p.micro_dp = True
+
+    @property
+    def dtype(self):
+        return next(self.parameters()).dtype
+
+    def set_input_tensor(self, input_tensor):
+        if not isinstance(input_tensor, list):
+            input_tensor = [input_tensor]
+        self.transformer.set_input_tensor(input_tensor[0])
+
+    def __str__(self) -> str:
+        return "open_clip"
+
+    def forward(self, x, patch_embeds):
+        x = self.embeddings(x, patch_embeds)
+        hidden_states = self.pre_layrnorm(x)
+
+        output = self.transformer(hidden_states)
+
+        return output
+
+
+vit_model_cfg = dict(
+    num_layers=24,
+    hidden_size=1024,
+    num_heads=16,
+    num_attention_heads=16,
+    ffn_hidden_size=4096,
+    seq_length=256,
+    max_position_embeddings=256,
+    use_flash_attn=False,
+    understand_projector_stride=2,
+    hidden_dropout=0.0,
+    attention_dropout=0.0,
+    no_persist_layer_norm=False,
+    layernorm_epsilon=1e-5,
+    pre_layernorm_epsilon=1e-5,
+    image_size=224,
+    patch_size=14,
+    recompute_list=[],
+)
+
+
+def build_clip_l():
+    return VitModel(
+        cfg=vit_model_cfg,
+        freeze_embed=False,
+        freeze_pre_norm=False,
+    )
+
+
+class CustomQwen2Decoder(nn.Module):
+    """Qwen2 decoder with mixed causal masking for OCR2 vision encoder."""
+
+    def __init__(
+        self,
+        decoder_layer: int = 24,
+        max_position_embeddings: int = 131072,
+        hidden_dimension: int = 896,
+        num_attention_heads: int = 14,
+        num_key_value_heads: int = 2,
+        intermediate_size: int = 4864,
+        vocab_size: int = 151936,
+        attn_implementation: str = "sdpa",
+        rms_norm_eps: float = 1e-6,
+        rope_theta: float = 1000000.0,
+        attention_dropout: float = 0.0,
+        hidden_act: str = "silu",
+        initializer_range: float = 0.02,
+    ):
+        super().__init__()
+        if attn_implementation == "flash_attention_2":
+            raise ValueError(
+                "CustomQwen2Decoder does not support flash_attention_2; "
+                "use sdpa or eager."
+            )
+
+        Qwen2Model = getattr(transformers.models.qwen2.modeling_qwen2, "Qwen2Model")
+        Qwen2Config = getattr(transformers, "Qwen2Config")
+
+        config = Qwen2Config(
+            hidden_size=hidden_dimension,
+            num_hidden_layers=decoder_layer,
+            num_attention_heads=num_attention_heads,
+            num_key_value_heads=num_key_value_heads,
+            intermediate_size=intermediate_size,
+            max_position_embeddings=max_position_embeddings,
+            vocab_size=vocab_size,
+            rms_norm_eps=rms_norm_eps,
+            rope_theta=rope_theta,
+            attention_dropout=attention_dropout,
+            hidden_act=hidden_act,
+            initializer_range=initializer_range,
+            _attn_implementation=attn_implementation,
+        )
+
+        self.model = self._create_custom_model(Qwen2Model, config)
+        del self.model.embed_tokens
+
+    def _create_custom_model(self, Qwen2Model, config):
+        class CustomQwen2ModelInner(Qwen2Model):
+            def forward(
+                self,
+                input_ids=None,
+                attention_mask=None,
+                position_ids=None,
+                past_key_values=None,
+                inputs_embeds=None,
+                token_type_ids=None,
+                use_cache=None,
+                output_attentions=None,
+                output_hidden_states=None,
+                return_dict=None,
+                cache_position=None,
+            ):
+                self._current_token_type_ids = token_type_ids
+                causal_mask_mapping = {
+                    "full_attention": self._update_causal_mask(
+                        attention_mask,
+                        inputs_embeds,
+                        cache_position,
+                        past_key_values,
+                        output_attentions,
+                    )
+                }
+                return super().forward(
+                    input_ids=input_ids,
+                    attention_mask=causal_mask_mapping,
+                    position_ids=position_ids,
+                    past_key_values=past_key_values,
+                    inputs_embeds=inputs_embeds,
+                    use_cache=use_cache,
+                    output_attentions=output_attentions,
+                    output_hidden_states=output_hidden_states,
+                    return_dict=return_dict,
+                    cache_position=cache_position,
+                )
+
+            def _update_causal_mask(
+                self,
+                attention_mask,
+                input_tensor,
+                cache_position,
+                past_key_values,
+                output_attentions,
+            ):
+                dtype, device = input_tensor.dtype, input_tensor.device
+                min_dtype = torch.finfo(dtype).min
+                batch_size, sequence_length = (
+                    input_tensor.shape[0],
+                    input_tensor.shape[1],
+                )
+
+                token_type_ids = getattr(self, "_current_token_type_ids", None)
+                if token_type_ids is None:
+                    return super()._update_causal_mask(
+                        attention_mask,
+                        input_tensor,
+                        cache_position,
+                        past_key_values,
+                        output_attentions,
+                    )
+
+                causal_mask = self._create_custom_4d_mask(
+                    sequence_length=sequence_length,
+                    dtype=dtype,
+                    device=device,
+                    batch_size=batch_size,
+                    token_type_ids=token_type_ids,
+                )
+
+                if attention_mask is not None and attention_mask.dim() == 2:
+                    padding_mask = attention_mask[:, None, None, :].to(dtype=dtype)
+                    padding_mask = (1.0 - padding_mask) * min_dtype
+                    causal_mask = causal_mask + padding_mask
+
+                return causal_mask
+
+            def _create_custom_4d_mask(
+                self,
+                sequence_length,
+                dtype,
+                device,
+                batch_size,
+                token_type_ids,
+            ):
+                min_dtype = torch.finfo(dtype).min
+                masks = []
+                for b in range(batch_size):
+                    mask = torch.full(
+                        (sequence_length, sequence_length),
+                        fill_value=min_dtype,
+                        dtype=dtype,
+                        device=device,
+                    )
+
+                    type_ids = token_type_ids[b]
+                    image_positions = (type_ids == 0).nonzero(as_tuple=True)[0]
+                    text_positions = (type_ids == 1).nonzero(as_tuple=True)[0]
+
+                    if len(image_positions) > 0:
+                        mask[image_positions[:, None], image_positions] = 0.0
+
+                    for i, text_pos in enumerate(text_positions):
+                        if len(image_positions) > 0:
+                            mask[text_pos, image_positions] = 0.0
+                        mask[text_pos, text_positions[: i + 1]] = 0.0
+
+                    masks.append(mask)
+
+                mask = torch.stack(masks, dim=0).unsqueeze(1)
+                return mask
+
+        return CustomQwen2ModelInner(config)
+
+    def forward(self, inputs_embeds, token_type_ids, attention_mask=None, **kwargs):
+        return self.model(
+            inputs_embeds=inputs_embeds,
+            token_type_ids=token_type_ids,
+            attention_mask=attention_mask,
+            **kwargs,
+        )
+
+
+class Qwen2Decoder2Encoder(nn.Module):
+    """Decoder-as-encoder for OCR2 vision tokens."""
+
+    def __init__(
+        self,
+        decoder_layer: int,
+        hidden_dimension: int,
+        num_attention_heads: int,
+        num_key_value_heads: int,
+        intermediate_size: int,
+        max_query: int,
+    ):
+        super().__init__()
+        self.model = CustomQwen2Decoder(
+            decoder_layer=decoder_layer,
+            hidden_dimension=hidden_dimension,
+            num_attention_heads=num_attention_heads,
+            num_key_value_heads=num_key_value_heads,
+            intermediate_size=intermediate_size,
+            attn_implementation="sdpa",
+        )
+
+        self.query_768 = nn.Embedding(144, hidden_dimension)
+        self.query_1024 = nn.Embedding(256, hidden_dimension)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = x.flatten(2).transpose(1, 2)
+        bs, n_query, _ = x.shape
+
+        if n_query == 144:
+            param_img = self.query_768.weight
+        elif n_query == 256:
+            param_img = self.query_1024.weight
+        else:
+            base = (
+                self.query_1024.weight
+                if n_query > self.query_768.num_embeddings
+                else self.query_768.weight
+            )
+            param_img = (
+                F.interpolate(
+                    base.T.unsqueeze(0),
+                    size=n_query,
+                    mode="linear",
+                    align_corners=False,
+                )
+                .squeeze(0)
+                .T
+            )
+
+        batch_query_imgs = param_img.unsqueeze(0).expand(bs, -1, -1)
+        x_combined = torch.cat([x, batch_query_imgs], dim=1)
+        token_type_ids = torch.cat(
+            [
+                torch.zeros(bs, n_query, dtype=torch.long, device=x.device),
+                torch.ones(bs, n_query, dtype=torch.long, device=x.device),
+            ],
+            dim=1,
+        )
+        y = self.model(x_combined, token_type_ids)[0]
+        y = y[:, n_query:, :]
+        return y
+
+
+def build_qwen2_decoder_as_encoder(
+    decoder_layer: int = 24,
+    hidden_dimension: int = 896,
+    num_attention_heads: int = 14,
+    num_key_value_heads: int = 2,
+    intermediate_size: int = 4864,
+    max_query: int = 400,
+    checkpoint=None,
+):
+    decoder_as_encoder = Qwen2Decoder2Encoder(
+        decoder_layer=decoder_layer,
+        hidden_dimension=hidden_dimension,
+        num_attention_heads=num_attention_heads,
+        num_key_value_heads=num_key_value_heads,
+        intermediate_size=intermediate_size,
+        max_query=max_query,
+    )
+    if checkpoint is not None:
+        state_dict = torch.load(checkpoint)
+        decoder_as_encoder.load_state_dict(state_dict, strict=True)
+    return decoder_as_encoder
+
+
+class DeepseekOCRForCausalLM(nn.Module):
+    def __init__(
+        self,
+        *,
+        config: DeepseekVLV2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.config = config
+
+        self.vision_config = config.vision_config
+        self.projector_config = config.projector_config
+        self.text_config = config.text_config
+        self.is_ocr2 = (
+            str(getattr(self.vision_config, "model_name", "")).lower()
+            == "deepencoderv2"
+            or getattr(self.projector_config, "input_dim", None) == 896
+        )
+        n_embed = getattr(self.projector_config, "n_embed", 1280)
+
+        self.tile_tag = config.tile_tag
+        self.global_view_pos = config.global_view_pos
+
+        # special token for image token sequence format
+        embed_std = 1 / torch.sqrt(torch.tensor(n_embed, dtype=torch.float32))
+        if self.tile_tag == "2D":
+            # <|view_separator|>, <|\n|>
+            self.view_seperator = nn.Parameter(torch.randn(n_embed) * embed_std)
+            if not self.is_ocr2:
+                self.image_newline = nn.Parameter(torch.randn(n_embed) * embed_std)
+        else:
+            raise ValueError(
+                f"Only 2D tile_tag is supported currently, got: {self.tile_tag}"
+            )
+
+        if not self.is_ocr2:
+            if self.text_config.topk_method == "noaux_tc":
+                self.model = DeepseekV3ForCausalLM(
+                    config=config.text_config,
+                    quant_config=quant_config,
+                    prefix=maybe_prefix(prefix, "language"),
+                )
+            elif not self.text_config.use_mla:
+                self.model = DeepseekForCausalLM(
+                    config=config.text_config,
+                    quant_config=quant_config,
+                    prefix=maybe_prefix(prefix, "language"),
+                )
+            else:
+                self.model = DeepseekV2ForCausalLM(
+                    config=config.text_config,
+                    quant_config=quant_config,
+                    prefix=maybe_prefix(prefix, "language"),
+                )
+        else:
+            # OCR2 language_config uses non-MLA attention (qk_* dims are 0).
+            # Use the non-MLA Deepseek model to avoid MLA-specific assumptions.
+            self.model = DeepseekForCausalLM(
+                config=config.text_config,
+                quant_config=quant_config,
+                prefix=maybe_prefix(prefix, "language"),
+            )
+
+        if not self.is_ocr2:
+            self.sam_model = build_sam_vit_b()
+            self.vision_model = build_clip_l()
+        else:
+            projector_input_dim = getattr(self.projector_config, "input_dim", 896)
+            self.sam_model = build_sam_vit_b(net_3_out_channels=projector_input_dim)
+            self.qwen2_model = build_qwen2_decoder_as_encoder(
+                hidden_dimension=projector_input_dim
+            )
+
+        self.projector = MlpProjector(
+            projector_type=self.projector_config.projector_type,
+            input_dim=self.projector_config.input_dim,
+            n_embed=n_embed,
+            depth=self.projector_config.depth,
+            mlp_ratio=self.projector_config.mlp_ratio,
+            downsample_ratio=self.projector_config.downsample_ratio,
+        )
+
+    @staticmethod
+    def _collect_mm_flag(
+        items: List[MultimodalDataItem], flag_name: str
+    ) -> Optional[List[bool]]:
+        values = []
+        for item in items:
+            value = getattr(item, flag_name, None)
+            if value is None:
+                return None
+            values.append(bool(value))
+        return values
+
+    def _encode_ocr2_features(self, images: torch.Tensor) -> torch.Tensor:
+        features = self.sam_model(images)
+        features = self.qwen2_model(features)
+        features = self.projector(features)
+        return features.view(-1, features.shape[-1])
+
+    def _encode_ocr1_features(self, images: torch.Tensor) -> torch.Tensor:
+        features_1 = self.sam_model(images)
+        features_2 = self.vision_model(images, features_1)
+        features = torch.cat(
+            (
+                features_2[:, 1:],
+                features_1.flatten(2).permute(0, 2, 1),
+            ),
+            dim=-1,
+        )
+        return self.projector(features)
+
+    def _format_ocr1_global_features(self, features: torch.Tensor) -> torch.Tensor:
+        _, hw, n_dim = features.shape
+        h = w = int(hw**0.5)
+        features = features.view(h, w, n_dim)
+        features = torch.cat(
+            [features, self.image_newline[None, None, :].expand(h, 1, n_dim)],
+            dim=1,
+        )
+        return features.view(-1, n_dim)
+
+    def _format_ocr1_local_features(
+        self, features: torch.Tensor, crop_shape: torch.Tensor
+    ) -> torch.Tensor:
+        _, hw2, n_dim2 = features.shape
+        h2 = w2 = int(hw2**0.5)
+        width_crop_num, height_crop_num = int(crop_shape[0]), int(crop_shape[1])
+        features = (
+            features.view(height_crop_num, width_crop_num, h2, w2, n_dim2)
+            .permute(0, 2, 1, 3, 4)
+            .reshape(height_crop_num * h2, width_crop_num * w2, n_dim2)
+        )
+        features = torch.cat(
+            [
+                features,
+                self.image_newline[None, None, :].expand(
+                    height_crop_num * h2, 1, n_dim2
+                ),
+            ],
+            dim=1,
+        )
+        return features.view(-1, n_dim2)
+
+    def _parse_and_validate_image_input(self, **kwargs: object):
+
+        pixel_values = kwargs.pop("pixel_values", None)
+        images_spatial_crop = kwargs.pop("images_spatial_crop", None)
+        images_crop = kwargs.pop("images_crop", None)
+        has_images = kwargs.pop("has_images", None)
+
+        if pixel_values is None:
+            return None
+        if has_images is not None:
+            if not has_images:
+                return None
+        elif torch.sum(pixel_values).item() == 0:
+            return None
+
+        if pixel_values is not None:
+            if not isinstance(pixel_values, (torch.Tensor, list)):
+                raise ValueError(
+                    "Incorrect type of pixel values. " f"Got type: {type(pixel_values)}"
+                )
+
+            if not isinstance(images_spatial_crop, (torch.Tensor, list)):
+                raise ValueError(
+                    "Incorrect type of image sizes. "
+                    f"Got type: {type(images_spatial_crop)}"
+                )
+
+            if not isinstance(images_crop, (torch.Tensor, list)):
+                raise ValueError(
+                    "Incorrect type of image crop. " f"Got type: {type(images_crop)}"
+                )
+
+            return [pixel_values, images_crop, images_spatial_crop]
+
+        raise AssertionError("This line should be unreachable.")
+
+    def _pixel_values_to_embedding(
+        self,
+        pixel_values: torch.Tensor,
+        images_crop: torch.Tensor,
+        images_spatial_crop: torch.Tensor,
+        has_local_crops: Optional[List[bool]] = None,
+    ) -> NestedTensors:
+
+        # Pixel_values (global view): [n_image, batch_size, 3, height, width]
+        # images_spatial_crop: [n_image, batch_size, [num_tiles_w, num_tiles_h]]
+        # images_crop (local view): [n_image, batch_size, num_pathes, 3, h, w]
+        # split the pixel and image_crop, all batch_size = 1
+
+        images_in_this_batch = []
+
+        if not self.is_ocr2:
+            with torch.no_grad():
+                for jdx in range(images_spatial_crop.size(0)):
+                    patches = images_crop[jdx][0].to(torch.bfloat16)
+                    image_ori = pixel_values[jdx]
+                    crop_shape = images_spatial_crop[jdx][0]
+                    use_local_crops = (
+                        has_local_crops[jdx]
+                        if has_local_crops is not None
+                        else torch.sum(patches).item() != 0
+                    )
+
+                    global_features = self._encode_ocr1_features(image_ori)
+                    global_features = self._format_ocr1_global_features(global_features)
+
+                    if use_local_crops:
+                        local_features = self._encode_ocr1_features(patches)
+                        local_features = self._format_ocr1_local_features(
+                            local_features, crop_shape
+                        )
+                        global_local_features = torch.cat(
+                            [
+                                local_features,
+                                global_features,
+                                self.view_seperator[None, :],
+                            ],
+                            dim=0,
+                        )
+                    else:
+                        global_local_features = torch.cat(
+                            [global_features, self.view_seperator[None, :]], dim=0
+                        )
+
+                    images_in_this_batch.append(global_local_features)
+
+            return images_in_this_batch
+
+        with torch.no_grad():
+            for jdx in range(images_spatial_crop.size(0)):
+                patches = images_crop[jdx][0].to(torch.bfloat16)
+                image_ori = pixel_values[jdx]
+                use_local_crops = (
+                    has_local_crops[jdx]
+                    if has_local_crops is not None
+                    else torch.sum(patches).item() != 0
+                )
+
+                global_features = self._encode_ocr2_features(image_ori)
+                if use_local_crops:
+                    local_features = self._encode_ocr2_features(patches)
+                    global_local_features = torch.cat(
+                        [local_features, global_features, self.view_seperator[None, :]],
+                        dim=0,
+                    )
+                else:
+                    global_local_features = torch.cat(
+                        [global_features, self.view_seperator[None, :]], dim=0
+                    )
+
+                images_in_this_batch.append(global_local_features)
+
+        return images_in_this_batch
+
+    def _process_image_input(self, mm_items: List[MultimodalDataItem]) -> torch.Tensor:
+        target_dtype = (
+            next(self.sam_model.parameters()).dtype
+            if self.is_ocr2
+            else self.vision_model.dtype
+        )
+        has_local_crops = self._collect_mm_flag(mm_items, "has_local_crops")
+        pixel_values = torch.stack([item.feature for item in mm_items], dim=0).type(
+            target_dtype
+        )
+
+        images_crop = (
+            torch.stack([item.images_crop for item in mm_items], dim=0)
+            .type(torch.long)
+            .to(device=pixel_values.device)
+        )
+        images_spatial_crop = (
+            torch.cat([item.images_spatial_crop for item in mm_items], dim=0)
+            .type(torch.long)
+            .to(device=pixel_values.device)
+        )
+
+        assert images_crop.dim() == 6
+        assert images_spatial_crop.dim() == 3
+
+        vision_feature_lists = self._pixel_values_to_embedding(
+            pixel_values=pixel_values,
+            images_crop=images_crop,
+            images_spatial_crop=images_spatial_crop,
+            has_local_crops=has_local_crops,
+        )
+        vision_features = torch.cat(vision_feature_lists, dim=0).type(target_dtype)
+
+        return vision_features
+
+    def get_language_model(self) -> torch.nn.Module:
+        return self.model
+
+    def get_multimodal_embeddings(
+        self, **kwargs: object
+    ) -> Optional[MultiModalEmbeddings]:
+        image_input = self._parse_and_validate_image_input(**kwargs)
+        if image_input is None:
+            return None
+        vision_embeddings = self._process_image_input(image_input)
+        return vision_embeddings
+
+    def get_input_embeddings(
+        self,
+        input_ids: torch.Tensor,
+        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+    ) -> torch.Tensor:
+
+        inputs_embeds = self.model.get_input_embeddings(input_ids)
+
+        if multimodal_embeddings is not None:
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids, inputs_embeds, multimodal_embeddings, self.image_token_id
+            )
+
+        return inputs_embeds
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+        return pattern.pad_input_tokens(input_ids, mm_inputs)
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        vision_embeddings = self._process_image_input(items)
+        return vision_embeddings
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        **kwargs: object,
+    ):
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.model,
+            multimodal_model=self,
+            positions=positions,
+        )
+
+        return hidden_states
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: Set[str] = set()
+
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            is_qwen2_weight = "qwen2_model." in name
+            if name == "lm_head.weight":
+                name = "model.lm_head.weight"
+            elif name.startswith("model."):
+                if (
+                    "image_newline" in name
+                    or ".projector" in name
+                    or "vision_model" in name
+                    or "qwen2_model" in name
+                    or "sam_model" in name
+                    or "view_seperator" in name
+                ):
+                    name = name[len("model.") :]
+                elif not (
+                    ".projector" in name
+                    or "vision_model" in name
+                    or "qwen2_model" in name
+                    or "sam_model" in name
+                    or "image_newline" in name
+                ):
+                    name = name.replace("model.", "model.model.")
+
+            if is_qwen2_weight:
+                target_name = name
+                if target_name not in params_dict:
+                    if ".model.model." in target_name:
+                        alt_name = target_name.replace(".model.model.", ".model.")
+                    else:
+                        alt_name = target_name.replace(".model.", ".model.model.", 1)
+                    if alt_name in params_dict:
+                        target_name = alt_name
+                if target_name.endswith(".bias") and target_name not in params_dict:
+                    continue
+                if target_name in params_dict:
+                    param = params_dict[target_name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+                    loaded_params.add(target_name)
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Skip experts that are not assigned to this worker.
+                if (
+                    "mlp.experts." in name or "mlp.shared_experts." in name
+                ) and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Skip experts that are not assigned to this worker.
+                if (
+                    "mlp.experts." in name or "mlp.shared_experts." in name
+                ) and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        unloaded_params = params_dict.keys() - loaded_params
+        if unloaded_params:
+            raise RuntimeError(
+                f"Some weights are not initialized from checkpoints: {unloaded_params}"
+            )
+
+
+EntryClass = [DeepseekOCRForCausalLM]
diff --git a/sglang/python/sglang/srt/models/deepseek_v2.py b/sglang/python/sglang/srt/models/deepseek_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9a4f511bfbb314678807330a4fddeb57e9f663c
--- /dev/null
+++ b/sglang/python/sglang/srt/models/deepseek_v2.py
@@ -0,0 +1,2242 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from:
+# https://github.com/vllm-project/vllm/blob/fb6af8bc086328ca6659e72d11ffd4309ce4de22/vllm/model_executor/models/deepseek_v2.py
+"""Inference-only DeepseekV2 model."""
+
+from __future__ import annotations
+
+import logging
+from contextlib import nullcontext
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.batch_overlap.single_batch_overlap import SboFlags, compute_overlap_args
+from sglang.srt.batch_overlap.two_batch_overlap import (
+    MaybeTboDeepEPDispatcher,
+    model_forward_maybe_tbo,
+)
+from sglang.srt.configs.model_config import (
+    get_nsa_index_head_dim,
+    get_nsa_index_n_heads,
+    get_nsa_index_topk,
+    is_deepseek_nsa,
+)
+from sglang.srt.distributed import (
+    divide,
+    get_moe_expert_parallel_world_size,
+    get_pp_group,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_gather,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.environ import envs
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation
+from sglang.srt.eplb.expert_location_dispatch import ExpertLocationDispatchInfo
+from sglang.srt.layers import deep_gemm_wrapper
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.amx_utils import PackWeightMethod
+from sglang.srt.layers.attention.nsa.nsa_indexer import Indexer
+from sglang.srt.layers.attention.nsa.utils import (
+    can_cp_split,
+    cp_all_gather_rerange_output,
+    cp_split_and_rebuild_data,
+    cp_split_and_rebuild_position,
+    is_nsa_enable_prefill_cp,
+    nsa_use_prefill_cp,
+    prepare_input_dp_with_cp_dsa,
+)
+from sglang.srt.layers.communicator import (
+    LayerCommunicator,
+    LayerScatterModes,
+    enable_moe_dense_fully_dp,
+    get_attn_tp_context,
+)
+from sglang.srt.layers.communicator_nsa_cp import NSACPLayerCommunicator
+from sglang.srt.layers.dp_attention import (
+    get_attention_cp_rank,
+    get_attention_cp_size,
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe import (
+    get_moe_a2a_backend,
+    get_moe_runner_backend,
+    should_use_flashinfer_cutlass_moe_fp4_allgather,
+)
+from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
+from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+from sglang.srt.layers.moe.kt_ep_wrapper import KTEPWrapperMethod
+from sglang.srt.layers.moe.token_dispatcher.base import (
+    BaseDispatcher,
+    CombineInput,
+    DispatchOutput,
+)
+from sglang.srt.layers.moe.topk import TopK, TopKOutputFormat
+from sglang.srt.layers.moe.utils import (
+    RoutingMethodType,
+    filter_moe_weight_param_global_expert,
+)
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.quantization.fp8 import Fp8Config
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope_wrapper
+from sglang.srt.layers.utils import PPMissingLayer
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.models.deepseek_common.attention_backend_handler import (
+    AttentionBackendRegistry,
+)
+from sglang.srt.models.deepseek_common.attention_forward_methods import (
+    AttnForwardMethod,
+    DeepseekMHAForwardMixin,
+    DeepseekMLACpuForwardMixin,
+    DeepseekMLAForwardMixin,
+    DeepseekMLARocmForwardMixin,
+)
+from sglang.srt.models.deepseek_common.deepseek_weight_loader import (
+    DeepseekV2WeightLoaderMixin,
+)
+from sglang.srt.models.deepseek_common.utils import (
+    _device_sm,
+    _get_llama_4_scaling,
+    _is_cpu,
+    _is_cpu_amx_available,
+    _is_cuda,
+    _is_gfx95_supported,
+    _is_hip,
+    _is_npu,
+    _use_aiter,
+    _use_aiter_gfx95,
+    yarn_get_mscale,
+)
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
+from sglang.srt.utils import (
+    BumpAllocator,
+    LazyValue,
+    add_prefix,
+    is_non_idle_and_non_empty,
+    log_info_on_rank0,
+    make_layers,
+    use_intel_amx_backend,
+)
+
+if _use_aiter_gfx95:
+    from sglang.srt.layers.rocm_linear_utils import (
+        aiter_dsv3_router_gemm,
+        get_dsv3_gemm_output_zero_allocator_size,
+    )
+
+if _use_aiter:
+    pass
+
+if _is_cuda:
+    from sgl_kernel import dsv3_fused_a_gemm, dsv3_router_gemm
+elif _is_npu:
+    from sglang.srt.hardware_backend.npu.modules.deepseek_v2_attention_mla_npu import (
+        forward_dsa_core_npu,
+        forward_dsa_prepare_npu,
+        forward_mha_core_npu,
+        forward_mha_prepare_npu,
+        forward_mla_core_npu,
+        forward_mla_prepare_npu,
+    )
+else:
+    pass
+
+logger = logging.getLogger(__name__)
+
+
+class DeepseekV2MLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        reduce_results: bool = True,
+        prefix: str = "",
+        tp_rank: Optional[int] = None,
+        tp_size: Optional[int] = None,
+    ) -> None:
+        super().__init__()
+        self.tp_size = tp_size
+
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+            tp_rank=tp_rank,
+            tp_size=tp_size,
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+            prefix=add_prefix("down_proj", prefix),
+            tp_rank=tp_rank,
+            tp_size=tp_size,
+        )
+        if not hasattr(self.gate_up_proj, "weight") and hasattr(
+            self.gate_up_proj, "weight_packed"
+        ):
+            self.gate_up_proj.weight = self.gate_up_proj.weight_packed
+        if not hasattr(self.down_proj, "weight") and hasattr(
+            self.down_proj, "weight_packed"
+        ):
+            self.down_proj.weight = self.down_proj.weight_packed
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(
+        self,
+        x,
+        forward_batch=None,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+        gemm_output_zero_allocator: BumpAllocator = None,
+    ):
+        if (self.tp_size == 1) and x.shape[0] == 0:
+            return x
+
+        if (
+            gemm_output_zero_allocator is not None
+            and x.shape[0] <= 256
+            and self.gate_up_proj.weight.dtype == torch.uint8
+        ):
+            y = gemm_output_zero_allocator.allocate(
+                x.shape[0] * self.gate_up_proj.output_size_per_partition
+            ).view(x.shape[0], self.gate_up_proj.output_size_per_partition)
+            x = (x, None, y)
+
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(
+            x,
+            skip_all_reduce=should_allreduce_fusion or use_reduce_scatter,
+        )
+        return x
+
+
+class MoEGate(nn.Module):
+    def __init__(
+        self,
+        config,
+        quant_config,
+        prefix: str = "",
+        is_nextn: bool = False,
+    ):
+        super().__init__()
+        self.is_nextn = is_nextn
+        self.weight = nn.Parameter(
+            torch.empty((config.n_routed_experts, config.hidden_size))
+        )
+        if config.topk_method == "noaux_tc":
+            correction_bias_dtype = torch.float32
+            if quant_config is not None:
+                if (
+                    quant_config.get_name() == "modelopt_fp4"
+                    and get_moe_runner_backend().is_flashinfer_trtllm()
+                ):
+                    correction_bias_dtype = torch.bfloat16
+                elif _use_aiter and quant_config.get_name() in (
+                    "fp8",
+                    "compressed_tensors",
+                ):
+                    correction_bias_dtype = torch.bfloat16
+            self.e_score_correction_bias = nn.Parameter(
+                torch.empty((config.n_routed_experts), dtype=correction_bias_dtype)
+            )
+        else:
+            self.e_score_correction_bias = None
+        if _is_cpu and _is_cpu_amx_available:
+            self.quant_method = PackWeightMethod(weight_names=["weight"])
+        self.nsa_enable_prefill_cp = is_nsa_enable_prefill_cp()
+
+    def forward(
+        self,
+        hidden_states,
+        gemm_output_zero_allocator: BumpAllocator = None,
+        forward_batch: ForwardBatch = None,
+    ):
+        if use_intel_amx_backend(self):
+            return torch.ops.sgl_kernel.weight_packed_linear(
+                hidden_states,
+                self.weight,
+                None,  # bias
+                True,  # is_vnni
+            )
+
+        if get_global_server_args().enable_deterministic_inference:
+            return F.linear(hidden_states, self.weight, None)
+
+        if forward_batch is not None and nsa_use_prefill_cp(forward_batch):
+            logits = F.linear(hidden_states, self.weight, None)
+        else:
+            # NOTE: For some unknown reason, router_gemm seems degrade accept length.
+            if (
+                _is_cuda
+                and hidden_states.shape[0] <= 16
+                and hidden_states.shape[1] == 7168
+                and (self.weight.shape[0] == 256 or self.weight.shape[0] == 384)
+                and _device_sm >= 90
+            ):
+
+                # router gemm output float32
+                logits = dsv3_router_gemm(
+                    hidden_states, self.weight, out_dtype=torch.float32
+                )
+            elif _use_aiter_gfx95 and hidden_states.shape[0] <= 256:
+                logits = aiter_dsv3_router_gemm(
+                    hidden_states, self.weight, gemm_output_zero_allocator
+                )
+            else:
+                logits = F.linear(hidden_states, self.weight, None)
+
+        return logits
+
+
+class DeepseekV2MoE(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+        is_nextn: bool = False,
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.moe_ep_size = get_moe_expert_parallel_world_size()
+        self.routed_scaling_factor = config.routed_scaling_factor
+        self.n_shared_experts = config.n_shared_experts
+        self.num_fused_shared_experts = (
+            0
+            if get_global_server_args().disable_shared_experts_fusion
+            else config.n_shared_experts
+        )
+        self.config = config
+        self.layer_id = layer_id
+        self.alt_stream = alt_stream
+        self.is_nextn = is_nextn
+
+        if self.tp_size > config.n_routed_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {config.n_routed_experts}."
+            )
+
+        if config.hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {config.hidden_act}. "
+                "Only silu is supported for now."
+            )
+
+        self.gate = MoEGate(
+            config=config,
+            quant_config=quant_config,
+            prefix=add_prefix("gate", prefix),
+            is_nextn=is_nextn,
+        )
+
+        # scaling factor for fused shared experts on AMD-platform.
+        fused_shared_experts_scaling_factor = None
+        if self.moe_ep_size > 1 and self.num_fused_shared_experts > 0:
+            # if enable_ep_moe tp_szie == ep_size, every gpu get shared experts gemm output
+            # so we scale with 1 / self.moe_ep_size in ep mode which will make it equalation as in tp mode
+            # with fused_shared_experts
+            fused_shared_experts_scaling_factor = 1.0 / float(self.moe_ep_size)
+
+        self.experts = get_moe_impl_class(quant_config)(
+            num_experts=config.n_routed_experts
+            + self.num_fused_shared_experts
+            + get_global_server_args().ep_num_redundant_experts,
+            num_fused_shared_experts=self.num_fused_shared_experts,
+            top_k=config.num_experts_per_tok + self.num_fused_shared_experts,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            layer_id=self.layer_id,
+            quant_config=quant_config,
+            routed_scaling_factor=self.routed_scaling_factor,
+            routing_method_type=getattr(
+                config, "routing_method_type", RoutingMethodType.DeepSeekV3
+            ),
+            prefix=add_prefix("experts", prefix),
+        )
+
+        self.topk = TopK(
+            top_k=config.num_experts_per_tok + self.num_fused_shared_experts,
+            layer_id=self.layer_id,
+            renormalize=config.norm_topk_prob,
+            use_grouped_topk=True,
+            num_expert_group=config.n_group,
+            num_fused_shared_experts=self.num_fused_shared_experts,
+            topk_group=config.topk_group,
+            correction_bias=self.gate.e_score_correction_bias,
+            quant_config=quant_config,
+            routed_scaling_factor=self.routed_scaling_factor,
+            apply_routed_scaling_factor_on_output=self.experts.should_fuse_routed_scaling_factor_in_topk,
+            fused_shared_experts_scaling_factor=fused_shared_experts_scaling_factor,
+            # Some Fp4 MoE backends require the output format to be bypassed but the MTP layers are unquantized
+            # and requires the output format to be standard (except trtllm). We use quant_config to determine the output format.
+            output_format=(
+                TopKOutputFormat.STANDARD
+                if (quant_config is None)
+                and (not get_moe_runner_backend().is_flashinfer_trtllm())
+                else None
+            ),
+        )
+
+        self.shared_experts_is_int8 = False
+        self.shared_experts_is_fp8 = False
+        self.shared_experts_weight_block_size = None
+        if config.n_shared_experts is not None and self.num_fused_shared_experts == 0:
+            intermediate_size = config.moe_intermediate_size * config.n_shared_experts
+            # disable tp for shared experts when enable deepep moe, or with fp4 allgather
+            self.shared_experts = DeepseekV2MLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                reduce_results=False,
+                prefix=add_prefix("shared_experts", prefix),
+                **(
+                    dict(tp_rank=0, tp_size=1)
+                    if get_moe_a2a_backend().is_deepep()
+                    or get_moe_a2a_backend().is_mooncake()
+                    or get_moe_a2a_backend().is_mori()
+                    or get_moe_a2a_backend().is_ascend_fuseep()
+                    or get_moe_a2a_backend().is_flashinfer()
+                    or should_use_flashinfer_cutlass_moe_fp4_allgather()
+                    else {}
+                ),
+            )
+            is_packed_weight = hasattr(
+                self.shared_experts.gate_up_proj.quant_method, "quant_config"
+            ) and self.shared_experts.gate_up_proj.quant_method.quant_config.get_name() in {
+                "awq",
+                "awq_marlin",
+                "moe_wna16",
+            }
+            self.shared_experts_is_int8 = (
+                not is_packed_weight
+                and self.shared_experts.gate_up_proj.weight.dtype == torch.int8
+            )
+            self.shared_experts_is_fp8 = (
+                not is_packed_weight
+                and self.shared_experts.gate_up_proj.weight.dtype == torch.float8_e4m3fn
+            )
+            if self.shared_experts_is_fp8:
+                if (
+                    _use_aiter
+                    and config.quantization_config.get("quant_method")
+                    == "compressed-tensors"
+                ):
+                    # For compressed-tensors ptpc model, don't need to check the weight_block_size
+                    pass
+                else:
+                    assert (
+                        self.shared_experts.gate_up_proj.quant_method.quant_config.weight_block_size
+                        == self.shared_experts.down_proj.quant_method.quant_config.weight_block_size
+                    )
+                    self.shared_experts_weight_block_size = (
+                        self.shared_experts.gate_up_proj.quant_method.quant_config.weight_block_size
+                    )
+
+        self.top_k = config.num_experts_per_tok
+
+        if (
+            get_moe_a2a_backend().is_deepep()
+            or get_moe_a2a_backend().is_mooncake()
+            or get_moe_a2a_backend().is_mori()
+            or get_moe_a2a_backend().is_ascend_fuseep()
+        ):
+            # TODO: we will support tp < ep in the future
+            self.ep_size = get_moe_expert_parallel_world_size()
+            self.num_experts = (
+                config.n_routed_experts
+                + get_global_server_args().ep_num_redundant_experts
+            )
+            self.renormalize = config.norm_topk_prob
+            self.topk_group = config.topk_group
+            self.num_expert_group = config.n_group
+            self.correction_bias = (
+                self.gate.e_score_correction_bias.data
+                if self.gate.e_score_correction_bias is not None
+                else None
+            )
+
+        self._enable_a2a_moe = (
+            get_moe_a2a_backend().is_deepep()
+            or get_moe_a2a_backend().is_mooncake()
+            or get_moe_a2a_backend().is_mori()
+            or get_moe_a2a_backend().is_ascend_fuseep()
+            or get_moe_a2a_backend().is_flashinfer()
+        )
+        self._fuse_shared_experts_inside_sbo = SboFlags.fuse_shared_experts_inside_sbo()
+
+    def get_moe_weights(self):
+        return [
+            x.data
+            for name, x in self.experts.named_parameters()
+            if name not in ["correction_bias"]
+            and filter_moe_weight_param_global_expert(
+                name, x, self.experts.num_local_experts
+            )
+        ]
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: Optional[ForwardBatch] = None,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+        gemm_output_zero_allocator: BumpAllocator = None,
+    ) -> torch.Tensor:
+        if not self._enable_a2a_moe:
+            if (
+                self.alt_stream is not None
+                and self.num_fused_shared_experts == 0
+                and hidden_states.shape[0] > 0
+                and get_is_capture_mode()
+            ):
+                return self.forward_normal_dual_stream(
+                    hidden_states,
+                    should_allreduce_fusion,
+                    use_reduce_scatter,
+                    gemm_output_zero_allocator,
+                )
+            else:
+                return self.forward_normal(
+                    hidden_states,
+                    should_allreduce_fusion,
+                    use_reduce_scatter,
+                    gemm_output_zero_allocator,
+                )
+        else:
+            return self.forward_deepep(hidden_states, forward_batch)
+
+    def forward_normal_dual_stream(
+        self,
+        hidden_states: torch.Tensor,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+        gemm_output_zero_allocator: BumpAllocator = None,
+    ) -> torch.Tensor:
+
+        current_stream = torch.cuda.current_stream()
+        self.alt_stream.wait_stream(current_stream)
+        shared_output = self._forward_shared_experts(
+            hidden_states, gemm_output_zero_allocator
+        )
+
+        with torch.cuda.stream(self.alt_stream):
+            # router_logits: (num_tokens, n_experts)
+            router_logits = self.gate(hidden_states, gemm_output_zero_allocator)
+            topk_output = self.topk(hidden_states, router_logits)
+            final_hidden_states = self.experts(hidden_states, topk_output)
+            if not _is_cuda or isinstance(self.experts.quant_method, KTEPWrapperMethod):
+                final_hidden_states *= self.routed_scaling_factor
+
+        current_stream.wait_stream(self.alt_stream)
+        final_hidden_states += shared_output
+        if (
+            self.tp_size > 1
+            and not should_allreduce_fusion
+            and not use_reduce_scatter
+            and not should_use_flashinfer_cutlass_moe_fp4_allgather()
+        ):
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+        return final_hidden_states
+
+    def forward_normal(
+        self,
+        hidden_states: torch.Tensor,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+        gemm_output_zero_allocator: BumpAllocator = None,
+    ) -> torch.Tensor:
+        if hasattr(self, "shared_experts") and use_intel_amx_backend(
+            self.shared_experts.gate_up_proj
+        ):
+            return self.forward_cpu(hidden_states, should_allreduce_fusion)
+
+        if hidden_states.shape[0] > 0:
+            if (
+                not self._fuse_shared_experts_inside_sbo
+            ):  # TODO: check if it supports mtp
+                shared_output = self._forward_shared_experts(
+                    hidden_states, gemm_output_zero_allocator
+                )
+            # router_logits: (num_tokens, n_experts)
+            router_logits = self.gate(hidden_states, gemm_output_zero_allocator)
+            topk_output = self.topk(hidden_states, router_logits)
+        else:
+            shared_output = None
+            topk_output = self.topk.empty_topk_output(hidden_states.device)
+
+        if self._fuse_shared_experts_inside_sbo:
+            shared_output = None
+
+            def _pre_combine_hook(
+                dispatcher: BaseDispatcher, combine_input: CombineInput
+            ):
+
+                nonlocal shared_output
+                self.alt_stream.wait_stream(torch.cuda.current_stream())
+                with torch.cuda.stream(self.alt_stream):
+                    shared_output = self._forward_shared_experts(
+                        hidden_states, gemm_output_zero_allocator
+                    )
+
+                pre_combine_hook_handle.remove()
+
+            def _post_combine_hook(
+                dispatcher: BaseDispatcher, hidden_states: torch.Tensor
+            ):
+                nonlocal shared_output
+                torch.cuda.current_stream().wait_stream(self.alt_stream)
+                post_combine_hook_handle.remove()
+
+            pre_combine_hook_handle = self.experts.dispatcher.register_pre_combine_hook(
+                _pre_combine_hook
+            )
+            post_combine_hook_handle = (
+                self.experts.dispatcher.register_post_combine_hook(_post_combine_hook)
+            )
+
+        final_hidden_states = self.experts(
+            hidden_states,
+            topk_output,
+        )
+        if (
+            not _is_cuda
+            and not _use_aiter
+            or isinstance(self.experts.quant_method, KTEPWrapperMethod)
+        ):
+            # fused in biased_grouped_topk so we can skip here
+            final_hidden_states *= self.routed_scaling_factor
+        if shared_output is not None:
+            final_hidden_states += shared_output
+        if (
+            self.tp_size > 1
+            and not should_allreduce_fusion
+            and not use_reduce_scatter
+            and not should_use_flashinfer_cutlass_moe_fp4_allgather()
+        ):
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+        return final_hidden_states
+
+    def forward_cpu(
+        self,
+        hidden_states: torch.Tensor,
+        should_allreduce_fusion: bool = False,
+    ) -> torch.Tensor:
+        # router_logits: (num_tokens, n_experts)
+        router_logits = self.gate(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        fused_experts_out = self.experts(
+            hidden_states=hidden_states, topk_output=topk_output
+        )
+
+        assert use_intel_amx_backend(
+            self.shared_experts.gate_up_proj
+        ) == use_intel_amx_backend(self.shared_experts.down_proj)
+        # [Note] inplace should be False in fused_experts.
+        # If inplace is True in fused_experts (self.experts), hidden_states will be changed after fused_experts
+        # While hidden_states is still needed in shared_expert.
+        final_hidden_states = torch.ops.sgl_kernel.shared_expert_cpu(
+            hidden_states,
+            self.shared_experts.gate_up_proj.weight,
+            self.shared_experts.down_proj.weight,
+            fused_experts_out,
+            self.routed_scaling_factor,
+            True,  # inplace
+            self.shared_experts_is_int8,  # use_int8_w8a8
+            self.shared_experts_is_fp8,  # use_fp8_w8a16
+            (
+                self.shared_experts.gate_up_proj.weight_scale
+                if self.shared_experts_is_int8
+                else (
+                    self.shared_experts.gate_up_proj.weight_scale_inv
+                    if self.shared_experts_is_fp8
+                    else None
+                )
+            ),  # w1_scale
+            (
+                self.shared_experts.down_proj.weight_scale
+                if self.shared_experts_is_int8
+                else (
+                    self.shared_experts.down_proj.weight_scale_inv
+                    if self.shared_experts_is_fp8
+                    else None
+                )
+            ),  # w2_scale
+            (
+                self.shared_experts_weight_block_size
+                if self.shared_experts_is_fp8
+                else None
+            ),  # block_size
+            True,  # is_vnni
+        )
+        if self.tp_size > 1 and not should_allreduce_fusion:
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+        return final_hidden_states
+
+    def forward_deepep(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        shared_output = None
+        sbo_enabled_flag = self._fuse_shared_experts_inside_sbo and not self.is_nextn
+        sbo_overlap_dispatch_flag = (
+            sbo_enabled_flag and SboFlags.enable_dispatch_shared_one_stream_overlap()
+        )
+        sbo_overlap_combine_flag = (
+            sbo_enabled_flag and SboFlags.enable_combine_shared_two_stream_overlap()
+        )
+
+        if hidden_states.shape[0] > 0:
+            # router_logits: (num_tokens, n_experts)
+            router_logits = self.gate(hidden_states, forward_batch=forward_batch)
+            if not sbo_enabled_flag:
+                if self.alt_stream is not None:
+                    self.alt_stream.wait_stream(torch.cuda.current_stream())
+                    with torch.cuda.stream(self.alt_stream):
+                        shared_output = self._forward_shared_experts(hidden_states)
+                        shared_output.record_stream(self.alt_stream)
+                        shared_event = self.alt_stream.record_event()
+                else:
+                    shared_output = self._forward_shared_experts(hidden_states)
+            topk_output = self.topk(
+                hidden_states,
+                router_logits,
+                num_token_non_padded=forward_batch.num_token_non_padded,
+                expert_location_dispatch_info=ExpertLocationDispatchInfo.init_new(
+                    layer_id=self.layer_id,
+                ),
+            )
+        else:
+            topk_output = self.topk.empty_topk_output(hidden_states.device)
+
+        if sbo_overlap_dispatch_flag:
+            shared_output = None
+
+            def _deepep_dispatch_hook(dispatcher: BaseDispatcher):
+                nonlocal shared_output
+                shared_output = self._forward_shared_experts(hidden_states)
+                for handle in deepep_dispatch_hook_handle:
+                    handle.remove()
+
+            def _post_dispatch_hook(
+                dispatcher: BaseDispatcher, dispatch_output: DispatchOutput
+            ):
+                combine_overlap_args, down_gemm_overlap_args, meta_overlap_args = (
+                    compute_overlap_args(dispatch_output, self.alt_stream)
+                )
+                dispatcher.set_overlap_args(
+                    combine_overlap_args=combine_overlap_args,
+                    meta_overlap_args=meta_overlap_args,
+                )
+                self.experts.set_overlap_args(
+                    down_gemm_overlap_args=down_gemm_overlap_args,
+                    meta_overlap_args=meta_overlap_args,
+                )
+                post_dispatch_hook_handle.remove()
+
+            def _post_combine_hook(
+                dispatcher: BaseDispatcher, hidden_states: torch.Tensor
+            ):
+                dispatcher.clear_overlap_args()
+                self.experts.clear_overlap_args()
+                post_combine_hook_handle.remove()
+
+            assert isinstance(self.experts.dispatcher, MaybeTboDeepEPDispatcher)
+            deepep_dispatch_hook_handle = (
+                self.experts.dispatcher.register_deepep_dispatch_hook(
+                    _deepep_dispatch_hook
+                )
+            )
+            post_dispatch_hook_handle = (
+                self.experts.dispatcher.register_post_dispatch_hook(_post_dispatch_hook)
+            )
+            post_combine_hook_handle = (
+                self.experts.dispatcher.register_post_combine_hook(_post_combine_hook)
+            )
+
+        elif sbo_overlap_combine_flag:
+            shared_output = None
+
+            def _post_dispatch_hook(
+                dispatcher: BaseDispatcher, dispatch_output: DispatchOutput
+            ):
+
+                combine_overlap_args, down_gemm_overlap_args, meta_overlap_args = (
+                    compute_overlap_args(dispatch_output, self.alt_stream)
+                )
+                dispatcher.set_overlap_args(
+                    combine_overlap_args=combine_overlap_args,
+                    meta_overlap_args=meta_overlap_args,
+                )
+                self.experts.set_overlap_args(
+                    down_gemm_overlap_args=down_gemm_overlap_args,
+                    meta_overlap_args=meta_overlap_args,
+                )
+
+                post_dispatch_hook_handle.remove()
+
+            def _pre_combine_hook(
+                dispatcher: BaseDispatcher, combine_input: CombineInput
+            ):
+
+                nonlocal shared_output
+
+                if (
+                    e := dispatcher.meta_overlap_args.get("record_event_after_down")
+                ) is not None:
+                    e.record()
+
+                # TODO reduce sm for non-deepgemm
+                with deep_gemm_wrapper.configure_deep_gemm_num_sms(
+                    dispatcher.meta_overlap_args["compute_num_sms"]
+                ):
+                    shared_output = self._forward_shared_experts(hidden_states)
+
+                pre_combine_hook_handle.remove()
+
+            def _post_combine_hook(
+                dispatcher: BaseDispatcher, hidden_states: torch.Tensor
+            ):
+                dispatcher.clear_overlap_args()
+                self.experts.clear_overlap_args()
+                post_combine_hook_handle.remove()
+
+            post_dispatch_hook_handle = (
+                self.experts.dispatcher.register_post_dispatch_hook(_post_dispatch_hook)
+            )
+            pre_combine_hook_handle = self.experts.dispatcher.register_pre_combine_hook(
+                _pre_combine_hook
+            )
+            post_combine_hook_handle = (
+                self.experts.dispatcher.register_post_combine_hook(_post_combine_hook)
+            )
+        elif envs.SGLANG_BLACKWELL_OVERLAP_SHARED_EXPERTS_OUTSIDE_SBO.get():
+            # On GB200: Shared experts overlapped on alt_stream, down gemm overlapped with DeepEP Combine
+
+            def _post_dispatch_hook(
+                dispatcher: BaseDispatcher, dispatch_output: DispatchOutput
+            ):
+
+                combine_overlap_args, down_gemm_overlap_args, meta_overlap_args = (
+                    compute_overlap_args(dispatch_output, self.alt_stream)
+                )
+                dispatcher.set_overlap_args(
+                    combine_overlap_args=combine_overlap_args,
+                    meta_overlap_args=meta_overlap_args,
+                )
+                self.experts.set_overlap_args(
+                    down_gemm_overlap_args=down_gemm_overlap_args,
+                    meta_overlap_args=meta_overlap_args,
+                )
+
+                post_dispatch_hook_handle.remove()
+
+            def _pre_combine_hook(
+                dispatcher: BaseDispatcher, combine_input: CombineInput
+            ):
+                if (
+                    e := dispatcher.meta_overlap_args.get("record_event_after_down")
+                ) is not None:
+                    e.record()
+                pre_combine_hook_handle.remove()
+
+            def _post_combine_hook(
+                dispatcher: BaseDispatcher, hidden_states: torch.Tensor
+            ):
+                dispatcher.clear_overlap_args()
+                self.experts.clear_overlap_args()
+                post_combine_hook_handle.remove()
+
+            post_dispatch_hook_handle = (
+                self.experts.dispatcher.register_post_dispatch_hook(_post_dispatch_hook)
+            )
+            pre_combine_hook_handle = self.experts.dispatcher.register_pre_combine_hook(
+                _pre_combine_hook
+            )
+            post_combine_hook_handle = (
+                self.experts.dispatcher.register_post_combine_hook(_post_combine_hook)
+            )
+
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states,
+            topk_output=topk_output,
+        )
+
+        if (
+            hidden_states.shape[0] > 0
+            and not sbo_enabled_flag
+            and self.alt_stream is not None
+        ):
+            torch.cuda.current_stream().wait_event(shared_event)
+
+        if shared_output is not None:
+            x = shared_output
+            # aiter moe call will handle routed_scaling_factor in the function
+            # so add _use_aiter condition to eliminate to use self.routed_scaling_factor in add_ call
+            if self.experts.should_fuse_routed_scaling_factor_in_topk or _use_aiter:
+                x.add_(final_hidden_states)
+            else:
+                x.add_(final_hidden_states, alpha=self.routed_scaling_factor)
+            final_hidden_states = x
+        else:
+            if not (
+                self.experts.should_fuse_routed_scaling_factor_in_topk or _use_aiter
+            ):
+                final_hidden_states *= self.routed_scaling_factor
+
+        return final_hidden_states
+
+    def _forward_shared_experts(
+        self, hidden_states, gemm_output_zero_allocator: BumpAllocator = None
+    ):
+        if (hidden_states.shape[0] > 0) and (self.num_fused_shared_experts == 0):
+            return self.shared_experts(
+                hidden_states, gemm_output_zero_allocator=gemm_output_zero_allocator
+            )
+        else:
+            return None
+
+    def op_gate(self, state):
+        if is_non_idle_and_non_empty(
+            state.forward_batch.forward_mode, state.hidden_states_mlp_input
+        ):
+            # router_logits: (num_tokens, n_experts)
+            state.router_logits = self.gate(state.hidden_states_mlp_input)
+        else:
+            state.router_logits = None
+
+    def op_shared_experts(self, state):
+        hidden_states_mlp_input = state.pop("hidden_states_mlp_input")
+        if (self.num_fused_shared_experts == 0) and is_non_idle_and_non_empty(
+            state.forward_batch.forward_mode, hidden_states_mlp_input
+        ):
+            state.shared_output = self.shared_experts(hidden_states_mlp_input)
+        else:
+            state.shared_output = None
+
+    def op_select_experts(self, state):
+        router_logits = state.pop("router_logits")
+        hidden_states = state.hidden_states_mlp_input
+
+        if router_logits is not None:
+            with get_global_expert_distribution_recorder().with_current_layer(
+                self.layer_id
+            ):
+                state.topk_output = self.topk(
+                    hidden_states=hidden_states,
+                    router_logits=router_logits,
+                    num_token_non_padded=state.forward_batch.num_token_non_padded,
+                    expert_location_dispatch_info=ExpertLocationDispatchInfo.init_new(
+                        layer_id=self.layer_id,
+                    ),
+                )
+        else:
+            state.topk_output = self.topk.empty_topk_output(hidden_states.device)
+
+    def op_dispatch_a(self, state):
+        if self.ep_size > 1:
+            self.experts.dispatcher.dispatch_a(
+                hidden_states=state.hidden_states_mlp_input,
+                topk_output=state.pop("topk_output"),
+                tbo_subbatch_index=state.get("tbo_subbatch_index"),
+            )
+
+    def op_dispatch_b(self, state):
+        if self.ep_size > 1:
+            with get_global_expert_distribution_recorder().with_current_layer(
+                self.layer_id
+            ):
+                state.dispatch_output = self.experts.dispatcher.dispatch_b(
+                    tbo_subbatch_index=state.get("tbo_subbatch_index"),
+                )
+
+    def op_experts(self, state):
+        state.combine_input = self.experts.run_moe_core(
+            dispatch_output=state.dispatch_output,
+        )
+
+    def op_combine_a(self, state):
+        if self.ep_size > 1:
+            self.experts.dispatcher.combine_a(
+                combine_input=state.pop("combine_input"),
+                tbo_subbatch_index=state.get("tbo_subbatch_index"),
+            )
+            state.pop("dispatch_output")
+
+    def op_combine_b(self, state):
+        if self.ep_size > 1:
+            state.hidden_states_after_combine = self.experts.dispatcher.combine_b(
+                tbo_subbatch_index=state.get("tbo_subbatch_index"),
+            )
+
+    def op_output(self, state):
+        final_hidden_states = state.pop("hidden_states_after_combine")
+
+        if get_moe_a2a_backend().is_mori():
+            num_tokens = state.pop("num_tokens")
+            final_hidden_states = final_hidden_states[:num_tokens]
+
+        if (shared_output := state.pop("shared_output")) is not None:
+            x = shared_output
+            if _use_aiter:
+                x.add_(final_hidden_states)
+            else:
+                x.add_(final_hidden_states, alpha=self.routed_scaling_factor)
+            final_hidden_states = x
+        elif _use_aiter:
+            # fused in aiter_biased_grouped_topk so we can skip here
+            pass
+        else:
+            final_hidden_states *= self.routed_scaling_factor
+
+        state.hidden_states_mlp_output = final_hidden_states
+
+
+class DeepseekV2AttentionMLA(
+    nn.Module,
+    DeepseekMHAForwardMixin,
+    DeepseekMLAForwardMixin,
+    DeepseekMLARocmForwardMixin,
+    DeepseekMLACpuForwardMixin,
+):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        hidden_size: int,
+        num_heads: int,
+        qk_nope_head_dim: int,
+        qk_rope_head_dim: int,
+        v_head_dim: int,
+        q_lora_rank: int,
+        kv_lora_rank: int,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        reduce_results: bool = True,
+        layer_id: int = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+        skip_rope: bool = False,
+    ) -> None:
+        super().__init__()
+        self.layer_id = layer_id
+        self.hidden_size = hidden_size
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.q_lora_rank = q_lora_rank
+        self.kv_lora_rank = kv_lora_rank
+        self.quant_config = quant_config
+        attn_tp_rank = get_attention_tp_rank()
+        attn_tp_size = get_attention_tp_size()
+        self.use_nsa = is_deepseek_nsa(config)
+        self.nsa_enable_prefill_cp = is_nsa_enable_prefill_cp()
+        if self.nsa_enable_prefill_cp:
+            assert self.use_nsa, "CP currently only supports deepseek v3.2 model"
+        # cp reuse the attn_tp comm group but need to duplicate the weights
+        if self.nsa_enable_prefill_cp and self.use_nsa:
+            self.cp_size = get_attention_cp_size()
+        self.num_heads = num_heads
+        assert num_heads % attn_tp_size == 0
+        self.num_local_heads = num_heads // attn_tp_size
+        self.scaling = self.qk_head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+        self.kv_cache_dtype = get_global_server_args().kv_cache_dtype
+
+        # NOTE modification to rope_scaling must be done early enough, b/c e.g. Indexer needs it
+        if rope_scaling:
+            rope_scaling["rope_type"] = "deepseek_yarn"
+
+        # For tensor parallel attention
+        if self.q_lora_rank is not None:
+            self.fused_qkv_a_proj_with_mqa = ReplicatedLinear(
+                self.hidden_size,
+                self.q_lora_rank + self.kv_lora_rank + self.qk_rope_head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=add_prefix("fused_qkv_a_proj_with_mqa", prefix),
+            )
+            self.q_a_layernorm = RMSNorm(self.q_lora_rank, eps=config.rms_norm_eps)
+            self.q_b_proj = ColumnParallelLinear(
+                q_lora_rank,
+                self.num_heads * self.qk_head_dim,
+                bias=False,
+                quant_config=self._get_q_b_proj_quant_config(quant_config),
+                prefix=add_prefix("q_b_proj", prefix),
+                tp_rank=attn_tp_rank,
+                tp_size=attn_tp_size,
+            )
+        else:
+            self.q_proj = ColumnParallelLinear(
+                self.hidden_size,
+                self.num_heads * self.qk_head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=add_prefix("q_proj", prefix),
+                tp_rank=attn_tp_rank,
+                tp_size=attn_tp_size,
+            )
+            self.kv_a_proj_with_mqa = ReplicatedLinear(
+                self.hidden_size,
+                self.kv_lora_rank + self.qk_rope_head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=add_prefix("kv_a_proj_with_mqa", prefix),
+            )
+
+        if self.use_nsa:
+            is_neox_style = not getattr(config, "indexer_rope_interleave", False)
+            self.indexer = Indexer(
+                hidden_size=hidden_size,
+                index_n_heads=get_nsa_index_n_heads(config),
+                index_head_dim=get_nsa_index_head_dim(config),
+                rope_head_dim=qk_rope_head_dim,
+                index_topk=get_nsa_index_topk(config),
+                q_lora_rank=q_lora_rank,
+                max_position_embeddings=max_position_embeddings,
+                rope_theta=rope_theta,
+                scale_fmt="ue8m0",
+                block_size=128,
+                rope_scaling=rope_scaling,
+                is_neox_style=is_neox_style,
+                prefix=add_prefix("indexer", prefix),
+                quant_config=quant_config,
+                layer_id=layer_id,
+                alt_stream=alt_stream,
+            )
+
+        self.kv_b_proj = ColumnParallelLinear(
+            self.kv_lora_rank,
+            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("kv_b_proj", prefix),
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+        )
+        # O projection.
+        self.o_proj = RowParallelLinear(
+            self.num_heads * self.v_head_dim,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+            prefix=add_prefix("o_proj", prefix),
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+        )
+        self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, eps=config.rms_norm_eps)
+
+        if not skip_rope:
+            is_neox_style = not getattr(config, "rope_interleave", True)
+            self.rotary_emb = get_rope_wrapper(
+                qk_rope_head_dim,
+                rotary_dim=qk_rope_head_dim,
+                max_position=max_position_embeddings,
+                base=rope_theta,
+                rope_scaling=rope_scaling,
+                is_neox_style=is_neox_style,
+                device=get_global_server_args().device,
+            )
+
+            if rope_scaling:
+                mscale_all_dim = rope_scaling.get("mscale_all_dim", False)
+                scaling_factor = rope_scaling["factor"]
+                mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
+                self.scaling = self.scaling * mscale * mscale
+        else:
+            self.rotary_emb = None
+        self.use_deepseek_yarn_rope = rope_scaling is not None
+
+        self.attn_mqa = RadixAttention(
+            self.num_local_heads,
+            self.kv_lora_rank + self.qk_rope_head_dim,
+            self.scaling,
+            num_kv_heads=1,
+            layer_id=layer_id,
+            v_head_dim=self.kv_lora_rank,
+            quant_config=quant_config,
+            prefix=add_prefix("attn_mqa", prefix),
+        )
+
+        self.attn_mha = RadixAttention(
+            self.num_local_heads,
+            self.qk_nope_head_dim + self.qk_rope_head_dim,
+            self.scaling,
+            num_kv_heads=self.num_local_heads,
+            layer_id=layer_id,
+            v_head_dim=self.v_head_dim,
+            quant_config=quant_config,
+            prefix=add_prefix("attn_mha", prefix),
+        )
+
+        self.alt_stream = alt_stream
+        self.attn_mha.kv_b_proj = None
+
+        self.w_kc = None
+        self.w_vc = None
+        self.w_scale = 1.0
+
+        self.w_scale_k = None
+        self.w_scale_v = None
+        self.use_deep_gemm_bmm = False
+
+        self.current_attention_backend = (
+            None  # Attention backend used by current forward batch
+        )
+
+        self.has_fused_proj = hasattr(self, "fused_qkv_a_proj_with_mqa")
+        self.is_packed_weight = (
+            self.has_fused_proj
+            and hasattr(self.fused_qkv_a_proj_with_mqa.quant_method, "quant_config")
+            and self.fused_qkv_a_proj_with_mqa.quant_method.quant_config.get_name()
+            in {"awq", "awq_marlin", "moe_wna16"}
+        )
+        self.use_min_latency_fused_a_gemm = (
+            self.has_fused_proj
+            and not self.is_packed_weight
+            and self.fused_qkv_a_proj_with_mqa.weight.dtype == torch.bfloat16
+            and self.fused_qkv_a_proj_with_mqa.weight.shape[0] == 2112
+            and self.fused_qkv_a_proj_with_mqa.weight.shape[1] == 7168
+            and _is_cuda
+            and 90 <= _device_sm < 120
+        )
+
+        self.init_mha_forward()
+        self.init_mla_forward()
+        self.init_mla_fused_rope_rocm_forward()
+        self.init_mla_fused_rope_cpu_forward()
+
+    def dispatch_attn_forward_method(
+        self, forward_batch: ForwardBatch
+    ) -> AttnForwardMethod:
+        # Determine attention backend used by current forward batch
+        if forward_batch.forward_mode.is_decode_or_idle():
+            attention_backend = get_global_server_args().decode_attention_backend
+        elif (
+            forward_batch.forward_mode.is_target_verify()
+            or forward_batch.forward_mode.is_draft_extend()
+        ):
+            # Use the specified backend for speculative operations (both verify and draft extend)
+            if get_global_server_args().speculative_attention_mode == "decode":
+                attention_backend = get_global_server_args().decode_attention_backend
+            else:  # default to prefill
+                attention_backend = get_global_server_args().prefill_attention_backend
+        else:
+            attention_backend = get_global_server_args().prefill_attention_backend
+        self.current_attention_backend = attention_backend
+
+        handler = AttentionBackendRegistry.get_handler(attention_backend)
+        return handler(self, forward_batch)
+
+    def op_prepare(self, state):
+        state.attn_intermediate_state = self.forward_prepare(
+            positions=state.positions,
+            hidden_states=state.pop("hidden_states_after_comm_pre_attn"),
+            forward_batch=state.forward_batch,
+            zero_allocator=state.zero_allocator,
+        )
+
+    def op_core(self, state):
+        state.hidden_states_after_attn = self.forward_core(
+            state.pop("attn_intermediate_state")
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        zero_allocator: BumpAllocator,
+        layer_scatter_modes: LayerScatterModes = None,
+        llama_4_scaling: Optional[torch.Tensor] = None,
+    ):
+        s = self.forward_prepare(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+            zero_allocator=zero_allocator,
+            layer_scatter_modes=layer_scatter_modes,
+            llama_4_scaling=llama_4_scaling,
+        )
+        return self.forward_core(s)
+
+    def forward_prepare(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        zero_allocator: BumpAllocator,
+        layer_scatter_modes: LayerScatterModes = None,
+        llama_4_scaling: Optional[torch.Tensor] = None,
+    ):
+        if self.attn_mha.kv_b_proj is None:
+            self.attn_mha.kv_b_proj = self.kv_b_proj
+
+        # when hidden_states is a tuple of tensors, the tuple will include quantized weight and scale tensor
+        if isinstance(hidden_states, tuple):
+            if (
+                not get_attn_tp_context().input_scattered
+                and hidden_states[0].shape[0] == 0
+            ):
+                assert (
+                    not self.o_proj.reduce_results
+                ), "short-circuiting allreduce will lead to hangs"
+                return hidden_states[0]
+        else:
+            if (
+                not get_attn_tp_context().input_scattered
+                and hidden_states.shape[0] == 0
+            ):
+                assert (
+                    not self.o_proj.reduce_results
+                ), "short-circuiting allreduce will lead to hangs"
+                return hidden_states, None, forward_batch, None
+
+        attn_forward_method = self.dispatch_attn_forward_method(forward_batch)
+        if attn_forward_method == AttnForwardMethod.MHA:
+            inner_state = self.forward_normal_prepare(
+                positions, hidden_states, forward_batch, zero_allocator
+            )
+        elif attn_forward_method == AttnForwardMethod.MHA_CHUNKED_KV:
+            inner_state = self.forward_normal_chunked_kv_prepare(
+                positions, hidden_states, forward_batch, zero_allocator
+            )
+        elif attn_forward_method == AttnForwardMethod.MHA_ONE_SHOT:
+            inner_state = self.forward_normal_one_shot_prepare(
+                positions, hidden_states, forward_batch, zero_allocator
+            )
+        elif attn_forward_method == AttnForwardMethod.MLA:
+            inner_state = self.forward_absorb_prepare(
+                positions, hidden_states, forward_batch, zero_allocator, llama_4_scaling
+            )
+        elif attn_forward_method == AttnForwardMethod.MLA_FUSED_ROPE_ROCM:
+            inner_state = self.forward_absorb_fused_mla_rope_prepare(
+                positions, hidden_states, forward_batch, zero_allocator
+            )
+        elif attn_forward_method == AttnForwardMethod.MLA_FUSED_ROPE_CPU:
+            inner_state = self.forward_absorb_fused_mla_rope_cpu_prepare(
+                positions, hidden_states, forward_batch, zero_allocator
+            )
+        elif attn_forward_method == AttnForwardMethod.MHA_NPU:
+            inner_state = forward_mha_prepare_npu(
+                self,
+                positions,
+                hidden_states,
+                forward_batch,
+                zero_allocator,
+                layer_scatter_modes,
+            )
+        elif attn_forward_method == AttnForwardMethod.MLA_NPU:
+            inner_state = forward_mla_prepare_npu(
+                self,
+                positions,
+                hidden_states,
+                forward_batch,
+                zero_allocator,
+                layer_scatter_modes,
+            )
+        elif attn_forward_method == AttnForwardMethod.DSA_NPU:
+            inner_state = forward_dsa_prepare_npu(
+                self,
+                positions,
+                hidden_states,
+                forward_batch,
+                zero_allocator,
+                layer_scatter_modes,
+            )
+        else:
+            raise NotImplementedError
+        return None, attn_forward_method, forward_batch, inner_state
+
+    def forward_core(self, intermediate_state):
+        hidden_states, attn_forward_method, forward_batch, inner_state = (
+            intermediate_state
+        )
+        if inner_state is None:
+            return hidden_states
+
+        if attn_forward_method == AttnForwardMethod.MHA:
+            return self.forward_normal_core(*inner_state)
+        elif attn_forward_method == AttnForwardMethod.MHA_CHUNKED_KV:
+            return self.forward_normal_chunked_kv_core(*inner_state)
+        elif attn_forward_method == AttnForwardMethod.MHA_ONE_SHOT:
+            return self.forward_normal_one_shot_core(*inner_state)
+        elif attn_forward_method == AttnForwardMethod.MLA:
+            return self.forward_absorb_core(*inner_state)
+        elif attn_forward_method == AttnForwardMethod.MLA_FUSED_ROPE_ROCM:
+            return self.forward_absorb_fused_mla_rope_core(*inner_state)
+        elif attn_forward_method == AttnForwardMethod.MLA_FUSED_ROPE_CPU:
+            return self.forward_absorb_fused_mla_rope_cpu_core(*inner_state)
+        elif attn_forward_method == AttnForwardMethod.MHA_NPU:
+            return forward_mha_core_npu(self, *inner_state)
+        elif attn_forward_method == AttnForwardMethod.MLA_NPU:
+            return forward_mla_core_npu(self, *inner_state)
+        elif attn_forward_method == AttnForwardMethod.DSA_NPU:
+            return forward_dsa_core_npu(self, *inner_state)
+        else:
+            raise NotImplementedError
+
+    def prepare_qkv_latent(
+        self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
+    ):
+        assert self.q_lora_rank is not None
+        if (
+            (not isinstance(hidden_states, tuple))
+            and hidden_states.shape[0] >= 1
+            and hidden_states.shape[0] <= 16
+            and self.use_min_latency_fused_a_gemm
+        ):
+            qkv_latent = dsv3_fused_a_gemm(
+                hidden_states, self.fused_qkv_a_proj_with_mqa.weight.T
+            )
+        else:
+            qkv_latent = self.fused_qkv_a_proj_with_mqa(hidden_states)[0]
+        return qkv_latent
+
+    def rebuild_cp_kv_cache(self, latent_cache, forward_batch, k_nope, k_pe):
+        # support allgather+rerrange
+        latent_cache[..., : self.kv_lora_rank] = k_nope.squeeze(1)
+        latent_cache[..., self.kv_lora_rank :] = k_pe.squeeze(1)
+        latent_cache_output = cp_all_gather_rerange_output(
+            latent_cache.contiguous(),
+            self.cp_size,
+            forward_batch,
+            torch.cuda.current_stream(),
+        )
+        k_nope = latent_cache_output[..., : self.kv_lora_rank].unsqueeze(1)
+        k_pe = latent_cache_output[..., self.kv_lora_rank :].unsqueeze(1)
+        return k_nope, k_pe
+
+    @staticmethod
+    def _get_q_b_proj_quant_config(quant_config):
+        if envs.SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN.get():
+            # refer to real DeepSeek V3 quant config
+            return Fp8Config(
+                is_checkpoint_fp8_serialized=True,
+                weight_block_size=[128, 128],
+            )
+        else:
+            return quant_config
+
+
+class DeepseekV2DecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        moe_quant_config_override: Optional[QuantizationConfig] = None,
+        is_nextn: bool = False,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.config = config
+        if hasattr(config, "rope_parameters"):
+            rope_theta = config.rope_parameters.get("rope_theta")
+            assert rope_theta is not None, f"rope_theta not found in config: {config}"
+            rope_type = config.rope_parameters.get("rope_type")
+            rope_scaling = config.rope_parameters if rope_type != "default" else None
+        else:
+            rope_theta = config.rope_theta
+            rope_scaling = config.rope_scaling
+        max_position_embeddings = config.max_position_embeddings
+        self.speculative_algorithm = SpeculativeAlgorithm.from_string(
+            get_global_server_args().speculative_algorithm
+        )
+        self.nsa_enable_prefill_cp = is_nsa_enable_prefill_cp()
+        self.layer_id = layer_id
+        self.is_nextn = is_nextn
+        self.self_attn = DeepseekV2AttentionMLA(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            qk_nope_head_dim=config.qk_nope_head_dim,
+            qk_rope_head_dim=config.qk_rope_head_dim,
+            v_head_dim=config.v_head_dim,
+            q_lora_rank=(
+                config.q_lora_rank if hasattr(config, "q_lora_rank") else None
+            ),
+            kv_lora_rank=config.kv_lora_rank,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            layer_id=layer_id,
+            reduce_results=False,
+            prefix=add_prefix("self_attn", prefix),
+            alt_stream=alt_stream,
+        )
+        if not hasattr(config, "q_lora_rank") and envs.SGLANG_USE_AG_AFTER_QLORA.get():
+            raise ValueError(
+                "SGLANG_USE_AG_AFTER_QLORA only supports the model with q_lora_rank"
+            )
+
+        self.is_layer_sparse = self._is_layer_sparse(layer_id, is_nextn=is_nextn)
+        is_previous_layer_sparse = self._is_layer_sparse(layer_id - 1, is_nextn=False)
+        is_next_layer_sparse = self._is_layer_sparse(layer_id + 1, is_nextn=False)
+
+        self.layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=layer_id,
+            num_layers=1 if is_nextn else config.num_hidden_layers,
+            is_layer_sparse=self.is_layer_sparse,
+            is_previous_layer_sparse=is_previous_layer_sparse,
+            is_next_layer_sparse=is_next_layer_sparse,
+        )
+
+        if self.is_layer_sparse:
+            self.mlp = DeepseekV2MoE(
+                config=config,
+                quant_config=moe_quant_config_override or quant_config,
+                prefix=add_prefix("mlp", prefix),
+                layer_id=self.layer_id,
+                alt_stream=alt_stream,
+                is_nextn=is_nextn,
+            )
+        else:
+            if enable_moe_dense_fully_dp():
+                mlp_tp_rank, mlp_tp_size = 0, 1
+            else:
+                mlp_tp_rank, mlp_tp_size = None, None
+            self.mlp = DeepseekV2MLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+                tp_rank=mlp_tp_rank,
+                tp_size=mlp_tp_size,
+            )
+
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+        if self.nsa_enable_prefill_cp:
+            self.layer_communicator = NSACPLayerCommunicator(
+                layer_scatter_modes=self.layer_scatter_modes,
+                input_layernorm=self.input_layernorm,
+                post_attention_layernorm=self.post_attention_layernorm,
+                allow_reduce_scatter=True,
+                is_last_layer=(
+                    is_nextn or (self.layer_id == self.config.num_hidden_layers - 1)
+                ),
+                qkv_latent_func=self.self_attn.prepare_qkv_latent,
+            )
+        else:
+            self.layer_communicator = LayerCommunicator(
+                layer_scatter_modes=self.layer_scatter_modes,
+                input_layernorm=self.input_layernorm,
+                post_attention_layernorm=self.post_attention_layernorm,
+                allow_reduce_scatter=True,
+                is_last_layer=(
+                    is_nextn or (self.layer_id == self.config.num_hidden_layers - 1)
+                ),
+                qkv_latent_func=self.self_attn.prepare_qkv_latent,
+            )
+
+    def _is_layer_sparse(self, layer_id: int, is_nextn: bool) -> bool:
+        return is_nextn or (
+            self.config.n_routed_experts is not None
+            and layer_id >= self.config.first_k_dense_replace
+            and layer_id % self.config.moe_layer_freq == 0
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+        zero_allocator: BumpAllocator,
+        gemm_output_zero_allocator: BumpAllocator = None,
+        llama_4_scaling: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        quant_format = (
+            "mxfp4"
+            if (
+                _is_gfx95_supported
+                and getattr(self.self_attn, "fused_qkv_a_proj_with_mqa", None)
+                is not None
+                and getattr(self.self_attn.fused_qkv_a_proj_with_mqa, "weight", None)
+                is not None
+                and self.self_attn.fused_qkv_a_proj_with_mqa.weight.dtype == torch.uint8
+            )
+            else (
+                "fp8"
+                if (
+                    _is_gfx95_supported
+                    and getattr(self.self_attn, "fused_qkv_a_proj_with_mqa", None)
+                    is not None
+                    and getattr(
+                        self.self_attn.fused_qkv_a_proj_with_mqa, "weight", None
+                    )
+                    is not None
+                    and self.self_attn.fused_qkv_a_proj_with_mqa.weight.dtype
+                    == getattr(torch, "float8_e4m3fn", None)
+                )
+                else ""
+            )
+        )
+
+        hidden_states, residual = self.layer_communicator.prepare_attn(
+            hidden_states,
+            residual,
+            forward_batch,
+            quant_format,
+        )
+
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+            zero_allocator=zero_allocator,
+            llama_4_scaling=llama_4_scaling,
+            layer_scatter_modes=self.layer_scatter_modes,
+        )
+
+        hidden_states, residual = self.layer_communicator.prepare_mlp(
+            hidden_states, residual, forward_batch
+        )
+
+        should_allreduce_fusion = (
+            self.layer_communicator.should_fuse_mlp_allreduce_with_next_layer(
+                forward_batch
+            )
+        )
+
+        # For DP with padding, reduce scatter can be used instead of all-reduce.
+        use_reduce_scatter = self.layer_communicator.should_use_reduce_scatter(
+            forward_batch
+        )
+
+        if isinstance(self.mlp, DeepseekV2MLP):
+            gemm_output_zero_allocator = None
+
+        hidden_states = self.mlp(
+            hidden_states,
+            forward_batch,
+            should_allreduce_fusion,
+            use_reduce_scatter,
+            gemm_output_zero_allocator,
+        )
+
+        if not self.nsa_enable_prefill_cp and should_allreduce_fusion:
+            hidden_states._sglang_needs_allreduce_fusion = True
+
+        if not should_allreduce_fusion:
+            hidden_states, residual = self.layer_communicator.postprocess_layer(
+                hidden_states, residual, forward_batch
+            )
+
+        return hidden_states, residual
+
+    def op_comm_prepare_attn(
+        self,
+        state,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+        zero_allocator: BumpAllocator,
+        tbo_subbatch_index: Optional[int] = None,
+    ):
+        state.hidden_states_after_comm_pre_attn, state.residual_after_input_ln = (
+            self.layer_communicator.prepare_attn(hidden_states, residual, forward_batch)
+        )
+        if get_moe_a2a_backend().is_mori():
+            state.num_tokens = hidden_states.shape[0]
+        state.update(
+            dict(
+                forward_batch=forward_batch,
+                positions=positions,
+                zero_allocator=zero_allocator,
+                tbo_subbatch_index=tbo_subbatch_index,
+            )
+        )
+
+    def op_comm_prepare_mlp(self, state):
+        state.hidden_states_mlp_input, state.residual_after_comm_pre_mlp = (
+            self.layer_communicator.prepare_mlp(
+                state.pop("hidden_states_after_attn"),
+                state.pop("residual_after_input_ln"),
+                state.forward_batch,
+            )
+        )
+
+    def op_mlp(self, state):
+        hidden_states = state.pop("hidden_states_mlp_input")
+        if not (
+            enable_moe_dense_fully_dp()
+            and (not self.is_layer_sparse)
+            and hidden_states.shape[0] == 0
+        ):
+            state.hidden_states_mlp_output = self.mlp(
+                hidden_states, state.forward_batch
+            )
+        else:
+            state.hidden_states_mlp_output = hidden_states
+
+    def op_comm_postprocess_layer(self, state):
+        hidden_states, residual = self.layer_communicator.postprocess_layer(
+            state.pop("hidden_states_mlp_output"),
+            state.pop("residual_after_comm_pre_mlp"),
+            state.forward_batch,
+        )
+
+        output = dict(
+            positions=state.positions,
+            hidden_states=hidden_states,
+            residual=residual,
+            forward_batch=state.forward_batch,
+            zero_allocator=state.zero_allocator,
+            tbo_subbatch_index=state.tbo_subbatch_index,
+        )
+
+        state.clear(
+            expect_keys={
+                "positions",
+                "forward_batch",
+                "zero_allocator",
+                "tbo_subbatch_index",
+            }
+        )
+        return output
+
+
+class DeepseekV2Model(nn.Module):
+    fall_back_to_pt_during_load = False
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.padding_id = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.first_k_dense_replace = config.first_k_dense_replace
+        self.pp_group = get_pp_group()
+        self.nsa_enable_prefill_cp = is_nsa_enable_prefill_cp()
+        if self.nsa_enable_prefill_cp:
+            self.cp_size = get_attention_cp_size()
+        else:
+            self.cp_size = None
+
+        if self.pp_group.is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                use_attn_tp_group=is_dp_attention_enabled(),
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        self.alt_stream = (
+            torch.cuda.Stream()
+            if _is_cuda or envs.SGLANG_NPU_USE_MULTI_STREAM.get()
+            else None
+        )
+
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: DeepseekV2DecoderLayer(
+                config=config,
+                layer_id=idx,
+                quant_config=quant_config,
+                prefix=prefix,
+                alt_stream=self.alt_stream,
+            ),
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix=add_prefix("layers", prefix),
+            offloader_kwargs=dict(
+                submodule_accessor=lambda layer: (
+                    layer.mlp.experts
+                    if isinstance(layer.mlp, DeepseekV2MoE)
+                    else layer.mlp
+                ),
+                whitelist_param_names_creator=lambda module: (
+                    [
+                        "w13_weight",
+                        "w2_weight",
+                        # only for nvfp4
+                        *(
+                            [
+                                "w13_blockscale_swizzled",
+                                "w2_blockscale_swizzled",
+                            ]
+                            if hasattr(module, "w13_blockscale_swizzled")
+                            else []
+                        ),
+                    ]
+                    if isinstance(module, FusedMoE)
+                    else []
+                ),
+            ),
+        )
+        if self.pp_group.is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer(return_tuple=True)
+
+        self.gemm_output_zero_allocator_size = 0
+        if (
+            _use_aiter_gfx95
+            and config.n_routed_experts == 256
+            and self.embed_tokens.embedding_dim == 7168
+        ):
+            num_moe_layers = sum(
+                [
+                    1
+                    for i in range(len(self.layers))
+                    if isinstance(self.layers[i].mlp, DeepseekV2MoE)
+                ]
+            )
+
+            allocate_size = 0
+            for i in range(len(self.layers)):
+                if isinstance(self.layers[i].mlp, DeepseekV2MoE):
+                    # tp_size = get_tensor_model_parallel_world_size()
+                    a2a_backend = get_moe_a2a_backend()
+                    is_a2a_moe = (
+                        a2a_backend.is_deepep()
+                        or a2a_backend.is_mori()
+                        or a2a_backend.is_mooncake()
+                    )
+                    tp_size = (
+                        1 if is_a2a_moe else get_tensor_model_parallel_world_size()
+                    )
+                    intermediate_size = (
+                        config.moe_intermediate_size * config.n_shared_experts
+                    )
+                    share_expert_output_size_per_partition = divide(
+                        intermediate_size * 2, tp_size
+                    )
+                    allocate_size = share_expert_output_size_per_partition
+                    break
+
+            self.gemm_output_zero_allocator_size = (
+                get_dsv3_gemm_output_zero_allocator_size(
+                    config.n_routed_experts,
+                    num_moe_layers,
+                    allocate_size,
+                    self.embed_tokens.embedding_dim,
+                )
+            )
+        self.layers_to_capture = []
+        if get_moe_a2a_backend().is_deepep() or get_moe_a2a_backend().is_mooncake():
+            self.enable_a2a_moe = True
+        else:
+            self.enable_a2a_moe = False
+
+        # llama_4_scaling: for supporting Mistral-Large-3 model
+        self.llama_4_scaling_config = getattr(config, "llama_4_scaling", None)
+
+    def get_input_embeddings(self) -> torch.Tensor:
+        return self.embed_tokens
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[torch.Tensor, PPProxyTensors]:
+        total_num_layers = self.end_layer - self.start_layer
+        if self.pp_group.is_first_rank:
+            if input_embeds is None:
+                hidden_states = self.embed_tokens(input_ids)
+            else:
+                hidden_states = input_embeds
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+        device = hidden_states.device
+        zero_allocator = BumpAllocator(
+            buffer_size=total_num_layers * 2 * (2 if forward_batch.can_run_tbo else 1),
+            dtype=torch.float32,
+            device=device,
+        )
+
+        has_gemm_output_zero_allocator = hasattr(
+            self, "gemm_output_zero_allocator_size"
+        )
+
+        gemm_output_zero_allocator = (
+            BumpAllocator(
+                buffer_size=self.gemm_output_zero_allocator_size,
+                dtype=torch.float32,
+                device=device,
+            )
+            if has_gemm_output_zero_allocator
+            and self.gemm_output_zero_allocator_size > 0
+            else None
+        )
+
+        if nsa_use_prefill_cp(forward_batch):
+            if self.pp_group.is_first_rank:
+                hidden_states = cp_split_and_rebuild_data(forward_batch, hidden_states)
+            positions = cp_split_and_rebuild_position(forward_batch, positions)
+
+        # llama_4_scaling: for supporting Mistral-Large-3 model
+        # Compute llama 4 scaling once per forward pass if enabled
+        llama_4_scaling: Optional[torch.Tensor] = None
+        if self.llama_4_scaling_config is not None:
+            llama_4_scaling = _get_llama_4_scaling(
+                original_max_position_embeddings=self.llama_4_scaling_config[
+                    "original_max_position_embeddings"
+                ],
+                scaling_beta=self.llama_4_scaling_config["beta"],
+                positions=positions,
+            )
+
+        normal_start_layer = self.start_layer
+        normal_end_layer = self.end_layer
+        if forward_batch.can_run_tbo:
+            if (
+                self.first_k_dense_replace > normal_start_layer
+                and self.first_k_dense_replace < normal_end_layer
+            ):
+                normal_end_layer = self.first_k_dense_replace
+            elif self.first_k_dense_replace < normal_start_layer:
+                normal_end_layer = normal_start_layer = 0
+        aux_hidden_states = []
+        for i in range(normal_start_layer, normal_end_layer):
+            # NOTE: torch dynamo does not support graph break in context manager
+            ctx = (
+                nullcontext()
+                if not get_global_server_args().disable_piecewise_cuda_graph
+                else get_global_expert_distribution_recorder().with_current_layer(i)
+            )
+            with ctx:
+                if i in self.layers_to_capture:
+                    if self.enable_a2a_moe and i > self.first_k_dense_replace:
+                        aux_hidden_state = tensor_model_parallel_all_gather(
+                            hidden_states + residual, dim=0
+                        )
+                        aux_hidden_states.append(aux_hidden_state)
+                    else:
+                        aux_hidden_states.append(hidden_states + residual)
+                layer = self.layers[i]
+                hidden_states, residual = layer(
+                    positions,
+                    hidden_states,
+                    forward_batch,
+                    residual,
+                    zero_allocator,
+                    gemm_output_zero_allocator,
+                    llama_4_scaling,
+                )
+
+        if normal_end_layer != self.end_layer:
+            hidden_states, residual = model_forward_maybe_tbo(
+                layers=self.layers[normal_end_layer : self.end_layer],
+                enable_tbo=True,
+                positions=positions,
+                forward_batch=forward_batch,
+                hidden_states=hidden_states,
+                residual=residual,
+                input_data_scatter_mode=self.layers[
+                    normal_end_layer - 1
+                ].layer_scatter_modes.layer_output_mode,
+                zero_allocator=zero_allocator,
+            )
+
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors(
+                {
+                    "hidden_states": hidden_states,
+                    "residual": residual,
+                }
+            )
+        else:
+            if not forward_batch.forward_mode.is_idle():
+                if residual is None:
+                    hidden_states = self.norm(hidden_states)
+                else:
+                    hidden_states, _ = self.norm(hidden_states, residual)
+
+        if self.pp_group.is_last_rank and nsa_use_prefill_cp(forward_batch):
+            # allgather + rerrange
+            hidden_states = cp_all_gather_rerange_output(
+                hidden_states,
+                self.cp_size,
+                forward_batch,
+                torch.cuda.current_stream(),
+            )
+        if len(aux_hidden_states) == 0:
+            return hidden_states
+        return hidden_states, aux_hidden_states
+
+
+class DeepseekV2ForCausalLM(nn.Module, DeepseekV2WeightLoaderMixin):
+    # for quark model load
+    packed_modules_mapping = {}
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        # for quark model load
+        # Fuse q_a_proj and kv_a_proj_with_mqa along output dimension when q_lora_rank is not None
+        self.fuse_qkv_a_proj = (
+            hasattr(config, "q_lora_rank") and config.q_lora_rank is not None
+        )
+        if self.fuse_qkv_a_proj:
+            self.packed_modules_mapping["fused_qkv_a_proj_with_mqa"] = [
+                "q_a_proj",
+                "kv_a_proj_with_mqa",
+            ]
+
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.quant_config = quant_config
+        self.determine_num_fused_shared_experts()
+        self.use_nsa = is_deepseek_nsa(config)
+        self.model = DeepseekV2Model(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+
+        if self.pp_group.is_last_rank:
+            if self.pp_group.world_size == 1 and config.tie_word_embeddings:
+                self.lm_head = self.model.embed_tokens
+            else:
+                self.lm_head = ParallelLMHead(
+                    config.vocab_size,
+                    config.hidden_size,
+                    quant_config=quant_config,
+                    prefix=add_prefix("lm_head", prefix),
+                    use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
+                )
+        else:
+            # ranks other than the last rank will have a placeholder layer
+            self.lm_head = PPMissingLayer()
+        self.logits_processor = LogitsProcessor(config)
+
+        self._routed_experts_weights_of_layer = LazyValue(
+            lambda: {
+                layer_id: layer.mlp.get_moe_weights()
+                for layer_id, layer in enumerate(self.model.layers)
+                if isinstance(layer.mlp, DeepseekV2MoE)
+            }
+        )
+        self.capture_aux_hidden_states = False
+
+        self.nsa_enable_prefill_cp = is_nsa_enable_prefill_cp()
+        if self.nsa_enable_prefill_cp:
+            self.cp_rank = get_attention_cp_rank()
+            self.cp_size = get_attention_cp_size()
+        else:
+            self.cp_rank = self.cp_size = None
+
+        q_lora_rank = config.q_lora_rank if hasattr(config, "q_lora_rank") else None
+        get_attn_tp_context().init_context(q_lora_rank, is_deepseek_nsa(config))
+
+    @property
+    def routed_experts_weights_of_layer(self):
+        return self._routed_experts_weights_of_layer.value
+
+    def determine_num_fused_shared_experts(
+        self, architecture: str = "DeepseekV3ForCausalLM"
+    ):
+        self.num_fused_shared_experts = 0
+        if get_global_server_args().disable_shared_experts_fusion:
+            return
+
+        # Only Deepseek V3/R1 can use shared experts fusion optimization now.
+        disable_reason = None
+        if (
+            self.config.architectures[0] != architecture
+            or self.config.n_routed_experts != 256
+            or self.config.n_shared_experts != 1
+        ):
+            disable_reason = "Config does not support fused shared expert(s)."
+        elif (not _is_cuda or torch.cuda.get_device_capability("cuda") < (8, 0)) and (
+            not _is_hip or torch.cuda.get_device_capability("cuda") < (9, 4)
+        ):
+            disable_reason = (
+                "Only Deepseek V3/R1 on NV-platform with capability >= 80 "
+                "or AMD-platform with capability >= gfx942(MI30x) can use shared experts fusion optimization."
+            )
+        elif get_moe_expert_parallel_world_size() > 1 and (
+            not _is_hip or torch.cuda.get_device_capability("cuda") < (9, 4)
+        ):
+            disable_reason = "Only Deepseek V3/R1 on AMD-platform with capability >= gfx942(MI30x) can use shared experts fusion optimization under expert parallelism."
+        elif disable_reason is None and (
+            get_moe_a2a_backend().is_deepep() or get_moe_a2a_backend().is_mori()
+        ):
+            disable_reason = "Deepseek V3/R1 cannot use shared experts fusion optimization under deepep expert parallelism."
+        elif self.quant_config and self.quant_config.get_name() == "w4afp8":
+            disable_reason = "Deepseek V3/R1 W4AFP8 model uses different quant method for routed experts and shared experts."
+
+        if disable_reason is not None:
+            get_global_server_args().disable_shared_experts_fusion = True
+            self.num_fused_shared_experts = 0
+            log_info_on_rank0(
+                logger,
+                f"{disable_reason} Shared experts fusion optimization is disabled.",
+            )
+            return
+
+        self.num_fused_shared_experts = self.config.n_shared_experts
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.embed_tokens
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> torch.Tensor:
+        if self.nsa_enable_prefill_cp:
+            if can_cp_split(len(input_ids), self.cp_size, self.use_nsa, forward_batch):
+                forward_batch.nsa_cp_metadata = prepare_input_dp_with_cp_dsa(
+                    len(input_ids),
+                    self.cp_rank,
+                    self.cp_size,
+                    forward_batch.seq_lens_cpu.tolist(),
+                )
+
+        with get_attn_tp_context().maybe_input_scattered(forward_batch):
+            hidden_states = self.model(
+                input_ids, positions, forward_batch, input_embeds, pp_proxy_tensors
+            )
+        aux_hidden_states = None
+        if self.capture_aux_hidden_states:
+            hidden_states, aux_hidden_states = hidden_states
+
+        if self.pp_group.is_last_rank:
+            return self.logits_processor(
+                input_ids, hidden_states, self.lm_head, forward_batch, aux_hidden_states
+            )
+        else:
+            return hidden_states
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]], is_nextn=False):
+        self.do_load_weights(weights, is_nextn)
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        del self.lm_head.weight
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    @classmethod
+    def get_model_config_for_expert_location(cls, config):
+        return ModelConfigForExpertLocation(
+            num_layers=config.num_hidden_layers,
+            num_logical_experts=config.n_routed_experts,
+            num_groups=config.n_group,
+        )
+
+    def set_eagle3_layers_to_capture(self, layer_ids: Optional[List[int]] = None):
+        if not self.pp_group.is_last_rank:
+            return
+
+        if layer_ids is None:
+            self.capture_aux_hidden_states = True
+            num_layers = self.config.num_hidden_layers
+            self.model.layers_to_capture = [2, num_layers // 2, num_layers - 3]
+        else:
+            self.capture_aux_hidden_states = True
+            # we plus 1 here because in sglang, for the ith layer, it takes the output
+            # of the (i-1)th layer as aux hidden state
+            self.model.layers_to_capture = [val + 1 for val in layer_ids]
+
+
+class DeepseekV3ForCausalLM(DeepseekV2ForCausalLM):
+    pass
+
+
+class DeepseekV32ForCausalLM(DeepseekV2ForCausalLM):
+    pass
+
+
+EntryClass = [DeepseekV2ForCausalLM, DeepseekV3ForCausalLM, DeepseekV32ForCausalLM]
diff --git a/sglang/python/sglang/srt/models/deepseek_vl2.py b/sglang/python/sglang/srt/models/deepseek_vl2.py
new file mode 100644
index 0000000000000000000000000000000000000000..3fba37008b64e8872ba4b74418c7dd5b6d9fa80e
--- /dev/null
+++ b/sglang/python/sglang/srt/models/deepseek_vl2.py
@@ -0,0 +1,362 @@
+from typing import Iterable, List, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from einops import rearrange, repeat
+from torch import nn
+
+from sglang.srt.configs.deepseekvl2 import (
+    DeepseekVL2Config,
+    DeepseekVL2MlpProjectorConfig,
+)
+from sglang.srt.layers.linear import ReplicatedLinear
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternMultimodalTokens,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInputs
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.deepseek import DeepseekForCausalLM
+from sglang.srt.models.deepseek_v2 import DeepseekV2ForCausalLM
+
+
+class DeepseekVL2MlpProjector(nn.Module):
+    def __init__(
+        self,
+        config: DeepseekVL2MlpProjectorConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+    ):
+
+        super().__init__()
+
+        self.config = config
+
+        if config.projector_type == "identity":
+            modules = nn.Identity()
+
+        elif config.projector_type == "linear":
+            self.layers = nn.ModuleList(
+                [
+                    ReplicatedLinear(
+                        config.input_dim,
+                        config.n_embed,
+                        quant_config=quant_config,
+                    )
+                ]
+            )
+
+        elif config.projector_type == "mlp_gelu":
+            mlp_depth = config.depth
+            self.layers = nn.ModuleList(
+                [
+                    ReplicatedLinear(
+                        config.input_dim,
+                        config.n_embed,
+                        quant_config=quant_config,
+                    )
+                ]
+            )
+            for _ in range(1, mlp_depth):
+                self.layers.append(nn.GELU())
+                self.layers.append(
+                    ReplicatedLinear(
+                        config.n_embed,
+                        config.n_embed,
+                        quant_config=quant_config,
+                    )
+                )
+
+        elif config.projector_type == "downsample_mlp_gelu":
+            mlp_depth = config.depth
+            mlp_ratio = config.mlp_ratio
+            self.layers = nn.ModuleList(
+                [
+                    ReplicatedLinear(
+                        config.input_dim
+                        * config.downsample_ratio
+                        * config.downsample_ratio,
+                        config.n_embed * mlp_ratio,
+                        quant_config=quant_config,
+                    )
+                ]
+            )
+            for _ in range(1, mlp_depth - 1):
+                self.layers.append(nn.GELU())
+                self.layers.append(
+                    ReplicatedLinear(
+                        config.n_embed * mlp_ratio,
+                        config.n_embed * mlp_ratio,
+                        quant_config=quant_config,
+                    )
+                )
+            self.layers.append(nn.GELU())
+            self.layers.append(
+                ReplicatedLinear(
+                    config.n_embed * mlp_ratio,
+                    config.n_embed,
+                    quant_config=quant_config,
+                )
+            )
+
+        else:
+            raise ValueError(f"Unknown projector type: {config.projector_type}")
+
+        if config.token_pooling:
+            self.token_pooling_layer = ReplicatedLinear(
+                config.input_dim * 4, config.input_dim, quant_config=quant_config
+            )
+
+    def forward(self, x):
+        if self.config.token_pooling:
+            batch_size, wxh, channels = x.shape
+            w = h = int(wxh**0.5)
+            x = x.view(batch_size, w, h, channels)
+            x = x.permute(0, 3, 1, 2)
+
+            patches = x.unfold(2, 2, 2).unfold(3, 2, 2)
+            batch_size, channels, h_patches, w_patches, _, _ = patches.size()
+            patches = patches.contiguous().view(
+                batch_size, channels, h_patches * w_patches, -1
+            )
+            patches = patches.permute(0, 2, 1, 3).contiguous()
+            patches = patches.view(batch_size, h_patches * w_patches, channels * 4)
+
+            x = self.token_pooling_layer(patches)[0]
+
+        elif self.config.projector_type == "downsample_mlp_gelu":
+            bs, hw, input_dim = x.shape
+            h = w = int((hw) ** 0.5)
+
+            """compute padding"""
+            if h % self.config.downsample_ratio:
+                pad = self.config.downsample_ratio - h % self.config.downsample_ratio
+            else:
+                pad = 0
+            x = x.reshape(bs, h, w, input_dim)
+            if pad > 0:
+                x = F.pad(x, (0, 0, 0, pad, 0, pad), "constant", 0)
+
+            """4 to 1 concat"""
+            x = x.permute(0, 3, 1, 2)  # B, C, H, W
+            x = F.unfold(
+                x,
+                kernel_size=self.config.downsample_ratio,
+                stride=self.config.downsample_ratio,
+                padding=0,
+            )  # B, C*4, HW // 4
+            x = x.permute(0, 2, 1)
+
+        for layer in self.layers:
+            x = layer(x)
+            if isinstance(x, tuple):
+                x = x[0]
+        return x
+
+
+class DeepseekVL2ForCausalLM(nn.Module):
+
+    def __init__(
+        self,
+        config: DeepseekVL2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+    ):
+        super().__init__()
+
+        # ----------- vision encoder ------------
+        vision_config = config.vision_config
+        self.vision = self._init_vision_module(vision_config, quant_config)
+
+        # ----------- vl projector ------------
+        projector_config = config.projector_config
+        self.projector = DeepseekVL2MlpProjector(projector_config, quant_config)
+
+        self.tile_tag = config.tile_tag
+        self.global_view_pos = config.global_view_pos
+
+        embed_std = 1 / torch.sqrt(
+            torch.tensor(projector_config.n_embed, dtype=torch.float32)
+        )
+        if self.tile_tag == "2D":
+            self.image_newline = nn.Parameter(
+                torch.randn(projector_config.n_embed) * embed_std
+            )
+            self.view_seperator = nn.Parameter(
+                torch.randn(projector_config.n_embed) * embed_std
+            )
+        else:
+            raise ValueError(f"tile tag should be 2D, but got {self.tile_tag}")
+
+        # ----------- language model ------------
+        language_config = config.language_config
+        if language_config.use_mla:
+            self.language_model = DeepseekV2ForCausalLM(language_config)
+        else:
+            # deepseek-vl2-tiny forbids mla
+            self.language_model = DeepseekForCausalLM(language_config)
+
+    def _init_vision_module(
+        self, vision_config, quant_config: Optional[QuantizationConfig]
+    ) -> nn.Module:
+        # TODO: refactor vision model through timm wrapper from transformers
+        try:
+            import timm
+        except ImportError:
+            raise ImportError("Please install timm") from ImportError
+
+        model = timm.create_model(
+            "vit_so400m_patch14_siglip_384.webli",
+            pretrained=False,
+            num_classes=0,
+            dynamic_img_size=True,
+            dynamic_img_pad=True,
+        )
+
+        model = model.to(dtype=torch.get_default_dtype())
+        return model
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        **kwargs: object,
+    ):
+        hs = general_mm_embed_routine(
+            input_ids=input_ids,
+            positions=positions,
+            forward_batch=forward_batch,
+            multimodal_model=self,
+            language_model=self.language_model,
+        )
+
+        return hs
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "up_proj", 1),
+            ("gate_up_proj", "gate_proj", 0),
+        ]
+        params_dict = dict(self.named_parameters())
+        weights = list(weights)
+        for name, loaded_weight in weights:
+            if "language" in name:
+                name = name.replace("language.", "")
+                self.language_model.load_weights([(name, loaded_weight)])
+            else:
+                param = params_dict[name]
+                weights_loader = getattr(param, "weight_loader", default_weight_loader)
+                weights_loader(param, loaded_weight)
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+        return pattern.pad_input_tokens(input_ids, mm_inputs)
+
+    def get_image_feature(self, items: List[MultimodalDataItem]):
+
+        images_spatial_crop = torch.cat(
+            [item.images_spatial_crop for item in items], dim=0
+        )
+
+        assert images_spatial_crop.dim() == 3
+
+        # TODO: can it be batched ?
+        images_in_this_batch = []
+        for item in items:
+            assert item.feature.dim() == 4
+            image_feature = self.vision.forward_features(
+                item.feature.type(next(self.vision.parameters()).dtype).to(
+                    device=next(self.vision.parameters()).device
+                )
+            )
+            images_embeds = self.projector(image_feature)
+            _, hw, n_dim = images_embeds.shape
+            h = w = int(hw**0.5)
+            tile_index = 0
+            for jdx in range(item.images_spatial_crop.shape[1]):
+                num_width_tiles, num_height_tiles = item.images_spatial_crop[0, jdx]
+                if num_width_tiles == 0 or num_height_tiles == 0:
+                    break
+                num_tiles_in_image = num_width_tiles * num_height_tiles
+
+                # [hw, D]
+                global_features = images_embeds[tile_index]
+
+                # [num_height_tiles * num_width_tiles, hw, D]
+                local_features = images_embeds[
+                    tile_index + 1 : tile_index + 1 + num_tiles_in_image
+                ]
+                tile_index += num_tiles_in_image + 1
+
+                # format global and local features
+                # ----------------- global view add newline -----------------
+                # [hw, D] -> [h, w, D]
+                global_features = global_features.view(h, w, n_dim)
+
+                # [D]     -> [h, 1, D]
+                new_lines_in_global = repeat(self.image_newline, "d -> h 1 d", h=h)
+
+                # cat([h, w, D], [h, 1, D], dim=1) -> [h, w + 1, D]
+                global_features = torch.cat(
+                    [global_features, new_lines_in_global], dim=1
+                )
+
+                # [h, w + 1, D] -> [h * (w + 1), D]
+                global_features = global_features.view(-1, n_dim)
+
+                # ----------------- local view add newline -----------------
+                # [num_height_tiles * num_width_tiles, h * w, D] ->
+                # [num_height_tiles * h, num_width_tiles * w, D]
+                local_features = rearrange(
+                    local_features,
+                    "(th tw) (h w) d -> (th h) (tw w) d",
+                    th=num_height_tiles,
+                    tw=num_width_tiles,
+                    h=h,
+                    w=w,
+                )
+
+                # [D] -> [num_height_tiles * h, 1, D]
+                new_lines_in_local = repeat(
+                    self.image_newline,
+                    "d -> (th h) 1 d",
+                    th=num_height_tiles,
+                    h=h,
+                )
+
+                # [num_height_tiles * h, num_width_tiles * w + 1, D]
+                local_features = torch.cat([local_features, new_lines_in_local], dim=1)
+
+                # [num_height_tiles * h, num_width_tiles * w + 1, D]
+                #   --> [(num_height_tiles * h) * (num_width_tiles * w + 1), D]
+                local_features = local_features.view(-1, n_dim)
+
+                # merge global and local tiles
+                if self.global_view_pos == "head":
+                    global_local_features = torch.cat(
+                        [
+                            global_features,
+                            self.view_seperator[None, :],
+                            local_features,
+                        ]
+                    )
+                else:
+                    global_local_features = torch.cat(
+                        [
+                            local_features,
+                            self.view_seperator[None, :],
+                            global_features,
+                        ]
+                    )
+
+                images_in_this_batch.append(global_local_features)
+
+        return torch.cat(images_in_this_batch, dim=0)
+
+
+EntryClass = DeepseekVL2ForCausalLM
diff --git a/sglang/python/sglang/srt/models/dots_ocr.py b/sglang/python/sglang/srt/models/dots_ocr.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1f60feccb5e92bd486649ba991f41588a14810b
--- /dev/null
+++ b/sglang/python/sglang/srt/models/dots_ocr.py
@@ -0,0 +1,171 @@
+# coding=utf-8
+# Adapted from Qwen2.5-VL SGLang implementation
+
+import logging
+from typing import Iterable, List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+
+from sglang.srt.configs import DotsOCRConfig
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternMultimodalTokens,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInputs
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.dots_vlm_vit import DotsVisionTransformer
+from sglang.srt.models.qwen2 import Qwen2ForCausalLM
+from sglang.srt.utils import add_prefix
+
+logger = logging.getLogger(__name__)
+
+
+class DotsOCRForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config: DotsOCRConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+
+        # Initialize vision transformer
+        self.visual = DotsVisionTransformer(
+            config.vision_config,
+        )
+
+        # Initialize language model
+        self.model = Qwen2ForCausalLM(config, quant_config)
+
+        # Initialize LM head
+        if config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+            )
+
+        self.logits_processor = LogitsProcessor(config)
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+        return pattern.pad_input_tokens(input_ids, mm_inputs)
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        # Extract pixel values and grid information (following reference pattern)
+        pixel_values = torch.cat([item.feature for item in items], dim=0).type(
+            self.visual.dtype
+        )
+        image_grid_thw = torch.concat(
+            [item.image_grid_thw for item in items], dim=0
+        ).to(self.visual.device)
+
+        # Add dimension checks like in reference code
+        assert pixel_values.dim() == 2, f"{pixel_values.dim()=}"
+        assert image_grid_thw.dim() == 2, f"{image_grid_thw.dim()=}"
+
+        # Process through vision tower
+        image_embeds = self.visual(pixel_values, image_grid_thw)
+
+        # Ensure consistent dtype for FlashInfer compatibility
+        # Force bfloat16 to match model's expected dtype
+        if hasattr(self.model, "embed_tokens"):
+            target_dtype = self.model.embed_tokens.weight.dtype
+            if image_embeds.dtype != target_dtype:
+                image_embeds = image_embeds.to(target_dtype)
+
+        return image_embeds
+
+    def _pad_vit_attn_dummy_heads(self, name: str, loaded_weight: torch.Tensor):
+        """pad attn qkv weights for dummy heads"""
+        num_dummy_heads = self.config.vision_config.num_dummy_heads
+        if num_dummy_heads == 0:
+            return loaded_weight
+        head_dim = self.config.vision_config.head_dim
+
+        if "attn.qkv_proj" in name:
+            wq, wk, wv = loaded_weight.chunk(3, dim=0)
+            if name.endswith(".weight"):
+                dummy_shape = [num_dummy_heads, head_dim, wq.shape[-1]]
+            elif name.endswith(".bias"):
+                dummy_shape = [num_dummy_heads, head_dim]
+            else:
+                raise RuntimeError(f"Unsupported weight with name={name}")
+            pad_func = lambda x: torch.cat(
+                [x.unflatten(0, (-1, head_dim)), x.new_zeros(dummy_shape)], dim=0
+            ).flatten(0, 1)
+            wq, wk, wv = pad_func(wq), pad_func(wk), pad_func(wv)
+            loaded_weight = torch.cat([wq, wk, wv], dim=0)
+        if "attn.proj.weight" in name:
+            padded_weight = loaded_weight.new_zeros(
+                loaded_weight.shape[0], head_dim * num_dummy_heads
+            )
+            loaded_weight = torch.cat([loaded_weight, padded_weight], dim=-1)
+        if "attn.q_norm.weight" in name or "attn.k_norm.weight" in name:
+            padded_weight = loaded_weight.new_zeros(head_dim * num_dummy_heads)
+            loaded_weight = torch.cat([loaded_weight, padded_weight], dim=0)
+        return loaded_weight
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        **kwargs: object,
+    ) -> torch.Tensor:
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
+            positions=positions,
+            forward_batch=forward_batch,
+            multimodal_model=self,
+            language_model=self.model,
+        )
+        return hidden_states
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        """Load weights for the model, separating vision and language weights"""
+        weights = list(weights)
+
+        # Separate vision tower weights and language model weights
+        vision_weights = []
+        language_weights = []
+
+        for name, loaded_weight in weights:
+            if name.startswith("vision_tower."):
+                vision_name = name.replace(r"attn.qkv.", r"attn.qkv_proj.")
+
+                vision_weights.append((vision_name, loaded_weight))
+            else:
+                # All other weights go to language model
+                language_weights.append((name, loaded_weight))
+
+        # Load vision tower weights
+        vision_state_dict = dict(vision_weights)
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+
+        for name, loaded_weight in vision_state_dict.items():
+            name = name.replace("vision_tower", "visual")
+            if name not in params_dict:
+                raise ValueError(f"Weight {name} not found in params_dict")
+            param = params_dict[name]
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            loaded_weight = self._pad_vit_attn_dummy_heads(name, loaded_weight)
+            weight_loader(param, loaded_weight)
+
+        if language_weights:
+            self.model.load_weights(language_weights)
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+
+EntryClass = [DotsOCRForCausalLM]
diff --git a/sglang/python/sglang/srt/models/dots_vlm.py b/sglang/python/sglang/srt/models/dots_vlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea113b60c54d7d4bcce0b0241869ce565b4043ea
--- /dev/null
+++ b/sglang/python/sglang/srt/models/dots_vlm.py
@@ -0,0 +1,187 @@
+# Copyright 2025 The RedNote HiLab team.
+# Copyright 2025 The SGLang team.
+#
+# This code is based on the DeepseekVL2ForCausalLM and DotsVisionTransformer
+# implementation in this library.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only Dots-VL model compatible with HuggingFace weights."""
+
+from typing import Iterable, List, Optional, Tuple
+
+import torch
+from torch import nn
+
+from sglang.srt.configs.dots_vlm import DotsVLMConfig
+from sglang.srt.distributed import get_pp_group
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternMultimodalTokens,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInputs
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.deepseek_v2 import DeepseekV2ForCausalLM
+
+from .dots_vlm_vit import DotsVisionTransformer
+
+
+class DotsVLMForCausalLM(nn.Module):
+    """DotsVLM model for sglang inference"""
+
+    def __init__(
+        self, config: DotsVLMConfig, quant_config: Optional[QuantizationConfig] = None
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        self.image_token_id = config.im_span_id
+        self.video_token_id = config.video_span_id
+        self.pp_group = get_pp_group()
+
+        if not config.encoder_only:
+            self.language_model = DeepseekV2ForCausalLM(
+                config.language_config, quant_config
+            )
+
+        # Initialize vision tower (matching transformers naming for weight compatibility)
+        self.vision_tower = DotsVisionTransformer(config.vision_config)
+
+    def _pad_vit_attn_dummy_heads(self, name: str, loaded_weight: torch.Tensor):
+        """pad attn qkv weights for dummy heads"""
+        num_dummy_heads = self.config.vision_config.num_dummy_heads
+        if num_dummy_heads == 0:
+            return loaded_weight
+        head_dim = self.config.vision_config.head_dim
+
+        if "attn.qkv_proj" in name:
+            wq, wk, wv = loaded_weight.chunk(3, dim=0)
+            if name.endswith(".weight"):
+                dummy_shape = [num_dummy_heads, head_dim, wq.shape[-1]]
+            elif name.endswith(".bias"):
+                dummy_shape = [num_dummy_heads, head_dim]
+            else:
+                raise RuntimeError(f"Unsupported weight with name={name}")
+            pad_func = lambda x: torch.cat(
+                [x.unflatten(0, (-1, head_dim)), x.new_zeros(dummy_shape)], dim=0
+            ).flatten(0, 1)
+            wq, wk, wv = pad_func(wq), pad_func(wk), pad_func(wv)
+            loaded_weight = torch.cat([wq, wk, wv], dim=0)
+        if "attn.proj.weight" in name:
+            padded_weight = loaded_weight.new_zeros(
+                loaded_weight.shape[0], head_dim * num_dummy_heads
+            )
+            loaded_weight = torch.cat([loaded_weight, padded_weight], dim=-1)
+        if "attn.q_norm.weight" in name or "attn.k_norm.weight" in name:
+            padded_weight = loaded_weight.new_zeros(head_dim * num_dummy_heads)
+            loaded_weight = torch.cat([loaded_weight, padded_weight], dim=0)
+        return loaded_weight
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        """Load weights for the model, separating vision and language weights"""
+        weights = list(weights)
+
+        # Separate vision tower weights and language model weights
+        vision_weights = []
+        language_weights = []
+
+        for name, loaded_weight in weights:
+            if name.startswith("vision_tower."):
+                vision_name = name.replace(r"attn.qkv.", r"attn.qkv_proj.")
+                vision_weights.append((vision_name, loaded_weight))
+            else:
+                # All other weights go to language model
+                language_weights.append((name, loaded_weight))
+
+        # Load vision tower weights
+        if not self.config.language_only:
+            vision_state_dict = dict(vision_weights)
+            params_dict = dict(self.named_parameters(remove_duplicate=False))
+            for name, loaded_weight in vision_state_dict.items():
+                if name not in params_dict:
+                    raise ValueError(f"Weight {name} not found in params_dict")
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                loaded_weight = self._pad_vit_attn_dummy_heads(name, loaded_weight)
+                weight_loader(param, loaded_weight)
+
+        # Load language model weights
+        if not self.config.encoder_only and language_weights:
+            self.language_model.load_weights(language_weights)
+
+    @classmethod
+    def get_model_config_for_expert_location(cls, config):
+        return DeepseekV2ForCausalLM.get_model_config_for_expert_location(config)
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        """Pad input_ids with multimodal tokens"""
+        # Get image token ID for padding pattern
+        pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+        padded_input_ids = pattern.pad_input_tokens(input_ids, mm_inputs)
+        return padded_input_ids
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        # Extract pixel values and grid information (following reference pattern)
+        pixel_values = torch.cat([item.feature for item in items], dim=0).type(
+            self.vision_tower.dtype
+        )
+        image_grid_thw = torch.concat(
+            [item.image_grid_thw for item in items], dim=0
+        ).to(self.vision_tower.device)
+
+        # Add dimension checks like in reference code
+        assert pixel_values.dim() == 2, f"{pixel_values.dim()=}"
+        assert image_grid_thw.dim() == 2, f"{image_grid_thw.dim()=}"
+
+        # Process through vision tower
+        image_embeds = self.vision_tower(pixel_values, image_grid_thw)
+
+        # Ensure consistent dtype for FlashInfer compatibility
+        # Force bfloat16 to match model's expected dtype
+        if image_embeds.dtype != torch.bfloat16 and hasattr(
+            self.language_model.model, "embed_tokens"
+        ):
+            target_dtype = self.language_model.model.embed_tokens.weight.dtype
+            image_embeds = image_embeds.to(target_dtype)
+
+        return image_embeds
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> torch.Tensor:
+        if self.pp_group.is_first_rank:
+            hidden_states = general_mm_embed_routine(
+                input_ids=input_ids,
+                positions=positions,
+                forward_batch=forward_batch,
+                multimodal_model=self,
+                language_model=self.language_model,
+            )
+
+        else:
+            hidden_states = self.language_model(
+                input_ids=input_ids,
+                positions=positions,
+                forward_batch=forward_batch,
+                pp_proxy_tensors=pp_proxy_tensors,
+            )
+
+        return hidden_states
+
+
+EntryClass = [DotsVLMForCausalLM]
diff --git a/sglang/python/sglang/srt/models/dots_vlm_vit.py b/sglang/python/sglang/srt/models/dots_vlm_vit.py
new file mode 100644
index 0000000000000000000000000000000000000000..873994e0b769d95c4f76293224ac6d3128403adc
--- /dev/null
+++ b/sglang/python/sglang/srt/models/dots_vlm_vit.py
@@ -0,0 +1,331 @@
+import logging
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch.nn import LayerNorm
+from transformers.modeling_utils import PreTrainedModel
+
+from sglang.srt.configs.dots_vlm import DotsVisionConfig
+from sglang.srt.distributed import parallel_state
+from sglang.srt.layers.attention.vision import VisionAttention
+from sglang.srt.layers.quantization import QuantizationConfig
+from sglang.srt.utils import add_prefix, is_npu
+
+logger = logging.getLogger(__name__)
+
+
+class VisionRotaryEmbedding(nn.Module):
+    def __init__(self, dim: int, theta: float = 10000.0) -> None:
+        super().__init__()
+        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+    def forward(self, seqlen: int) -> torch.Tensor:
+        seq = torch.arange(
+            seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype
+        )
+        freqs = torch.outer(seq, self.inv_freq)
+        return freqs
+
+
+class PatchMerger(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        context_dim: int,
+        spatial_merge_size: int = 2,
+        pre_norm="layernorm",
+        init_merger_std=None,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = context_dim * (spatial_merge_size**2)
+        self.pre_norm = pre_norm
+        if self.pre_norm == "layernorm":
+            self.ln_q = LayerNorm(context_dim, eps=1e-6)
+        elif self.pre_norm == "rmsnorm":
+            self.ln_q = RMSNorm(context_dim, eps=1e-6)
+        else:
+            logger.warning(f"no norm in patch merger: {self.pre_norm}")
+
+        self.mlp = nn.Sequential(
+            nn.Linear(self.hidden_size, self.hidden_size),
+            nn.GELU(),
+            nn.Linear(self.hidden_size, dim),
+        )
+
+        if init_merger_std is not None:
+            nn.init.normal_(self.mlp[0].weight, mean=0.0, std=init_merger_std)
+            nn.init.zeros_(self.mlp[0].bias)
+            nn.init.normal_(self.mlp[2].weight, mean=0.0, std=init_merger_std)
+            nn.init.zeros_(self.mlp[2].bias)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if self.pre_norm:
+            x = self.mlp(self.ln_q(x).view(-1, self.hidden_size))
+        else:
+            x = self.mlp(x.view(-1, self.hidden_size))
+        return x
+
+
+class RMSNorm(nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(dim))
+        self.eps = eps
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        output = self._norm(x.float()).type_as(x)
+        return output * self.weight
+
+    def extra_repr(self) -> str:
+        return f"{tuple(self.weight.shape)}, eps={self.eps}"
+
+    def _norm(self, x: torch.Tensor) -> torch.Tensor:
+        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+
+
+class DotsSwiGLUFFN(nn.Module):
+    def __init__(self, config, quant_config: Optional[QuantizationConfig] = None):
+        super().__init__()
+        hidden_features = config.intermediate_size
+        in_features = config.embed_dim
+        bias = config.use_bias
+
+        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
+        self.fc2 = nn.Linear(hidden_features, in_features, bias=bias)
+        self.fc3 = nn.Linear(in_features, hidden_features, bias=bias)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = F.silu(self.fc1(x)) * self.fc3(x)
+        x = self.fc2(x)
+        return x
+
+
+class DotsPatchEmbed(nn.Module):
+    def __init__(self, config, quant_config: Optional[QuantizationConfig] = None):
+        super().__init__()
+        self.num_channels = config.num_channels
+        self.patch_size = config.patch_size
+        self.temporal_patch_size = config.temporal_patch_size
+        self.embed_dim = config.embed_dim
+        self.config = config
+        self.proj = nn.Conv2d(
+            config.num_channels,
+            config.embed_dim,
+            kernel_size=(config.patch_size, config.patch_size),
+            stride=(config.patch_size, config.patch_size),
+        )
+        self.norm = RMSNorm(config.embed_dim, eps=config.rms_norm_eps)
+
+    def forward(self, x: torch.Tensor, grid_thw=None) -> torch.Tensor:
+        x = x.view(
+            -1,
+            self.num_channels,
+            self.temporal_patch_size,
+            self.patch_size,
+            self.patch_size,
+        )[:, :, 0]
+        x = self.proj(x).view(-1, self.embed_dim)
+        x = self.norm(x)
+        return x
+
+
+class DotsViTPreprocessor(nn.Module):
+    def __init__(self, config, quant_config: Optional[QuantizationConfig] = None):
+        super().__init__()
+        self.patch_h = config.patch_size
+        self.patch_w = config.patch_size
+        self.embed_dim = config.embed_dim
+        self.config = config
+        self.patchifier = DotsPatchEmbed(config, quant_config)
+
+    def forward(self, x: torch.Tensor, grid_thw=None) -> torch.Tensor:
+        tokens = self.patchifier(x, grid_thw)
+        return tokens
+
+
+class DotsVisionBlock(nn.Module):
+    def __init__(
+        self,
+        config: DotsVisionConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.attn = VisionAttention(
+            embed_dim=config.embed_dim,
+            num_heads=config.num_attention_heads,
+            projection_size=config.embed_dim,
+            use_qkv_parallel=True,
+            flatten_batch=True,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+            num_dummy_heads=config.num_dummy_heads,
+            qkv_bias=config.use_bias,
+            proj_bias=config.use_bias,
+        )
+        self.norm1 = RMSNorm(config.embed_dim, eps=config.rms_norm_eps)
+        self.mlp = DotsSwiGLUFFN(config, quant_config)
+        self.norm2 = RMSNorm(config.embed_dim, eps=config.rms_norm_eps)
+
+    def forward(self, hidden_states, cu_seqlens, rotary_pos_emb) -> torch.Tensor:
+        hidden_states = hidden_states + self.attn(
+            self.norm1(hidden_states),
+            cu_seqlens=cu_seqlens,
+            position_embeddings=rotary_pos_emb,
+        )
+        hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
+        return hidden_states
+
+
+class DotsVisionTransformer(PreTrainedModel):
+    def __init__(
+        self,
+        config: DotsVisionConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__(config)
+        self.config = config
+        self._update_vision_config()
+        self.spatial_merge_size = config.spatial_merge_size
+
+        self.patch_embed = DotsViTPreprocessor(config, quant_config)
+        self._init_weights(self.patch_embed.patchifier.proj)
+
+        head_dim = config.embed_dim // config.num_attention_heads
+
+        self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2)
+
+        _num_hidden_layers = config.num_hidden_layers
+        self.blocks = nn.ModuleList(
+            [
+                DotsVisionBlock(config, quant_config, f"blocks.{i}")
+                for i in range(_num_hidden_layers)
+            ]
+        )
+
+        if self.config.post_norm:
+            self.post_trunk_norm = RMSNorm(config.embed_dim, eps=config.rms_norm_eps)
+
+        self.merger = PatchMerger(
+            dim=config.hidden_size,
+            context_dim=config.embed_dim,
+            spatial_merge_size=config.spatial_merge_size,
+            init_merger_std=self.config.init_merger_std,
+            quant_config=quant_config,
+        )
+
+        self.gradient_checkpointing = False
+
+    def _update_vision_config(self):
+        """update vision config to support tp"""
+        world_size = parallel_state.get_tensor_model_parallel_world_size()
+        num_heads = self.config.num_attention_heads
+        head_dim = self.config.embed_dim // num_heads
+        num_dummy_heads = 0
+
+        if num_heads % world_size != 0:
+            num_dummy_heads = (
+                (num_heads + world_size) // world_size
+            ) * world_size - num_heads
+
+        setattr(self.config, "head_dim", head_dim)
+        setattr(self.config, "num_dummy_heads", num_dummy_heads)
+
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return self.blocks[0].mlp.fc2.weight.dtype
+
+    @property
+    def device(self) -> torch.device:
+        return self.blocks[0].mlp.fc2.weight.device
+
+    def get_pos_ids_by_grid(self, grid_thw):
+        pos_ids = []
+        for t, h, w in grid_thw:
+            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
+            hpos_ids = hpos_ids.reshape(
+                h // self.spatial_merge_size,
+                self.spatial_merge_size,
+                w // self.spatial_merge_size,
+                self.spatial_merge_size,
+            )
+            hpos_ids = hpos_ids.permute(0, 2, 1, 3)
+            hpos_ids = hpos_ids.flatten()
+
+            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
+            wpos_ids = wpos_ids.reshape(
+                h // self.spatial_merge_size,
+                self.spatial_merge_size,
+                w // self.spatial_merge_size,
+                self.spatial_merge_size,
+            )
+            wpos_ids = wpos_ids.permute(0, 2, 1, 3)
+            wpos_ids = wpos_ids.flatten()
+            pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
+
+        return pos_ids
+
+    def rot_pos_emb(self, grid_thw):
+        pos_ids = self.get_pos_ids_by_grid(grid_thw)
+        pos_ids = torch.cat(pos_ids, dim=0)
+        max_grid_size = grid_thw[:, 1:].max()
+        rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
+        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
+        return rotary_pos_emb
+
+    def calc_cos_sin(self, rotary_pos_emb):
+        cos = rotary_pos_emb.cos()
+        sin = rotary_pos_emb.sin()
+        cos = cos.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float()
+        sin = sin.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float()
+        rotary_pos_emb = (cos, sin)
+        return rotary_pos_emb
+
+    def forward(
+        self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, bf16=True
+    ) -> torch.Tensor:
+        hidden_states = hidden_states.to(self.device)
+        if bf16:
+            hidden_states = hidden_states.bfloat16()
+        hidden_states = self.patch_embed(hidden_states, grid_thw)
+
+        rotary_pos_emb = self.rot_pos_emb(grid_thw)
+        rotary_pos_emb = self.calc_cos_sin(rotary_pos_emb)
+
+        cu_seqlens = torch.repeat_interleave(
+            grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
+        ).cumsum(
+            dim=0,
+            dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
+        )
+        cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens])
+        # cu_seqlens must be on cpu because of npu_flash_attention_unpad operator restriction
+        if is_npu():
+            cu_seqlens = cu_seqlens.to("cpu")
+
+        for blk in self.blocks:
+            hidden_states = blk(
+                hidden_states, cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb
+            )
+
+        if self.config.post_norm:
+            hidden_states = self.post_trunk_norm(hidden_states)
+
+        hidden_states = self.merger(hidden_states)
+        return hidden_states
diff --git a/sglang/python/sglang/srt/models/ernie4.py b/sglang/python/sglang/srt/models/ernie4.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a61d8fdce0ce7a1e320ccfaa4c19793d7470d3c
--- /dev/null
+++ b/sglang/python/sglang/srt/models/ernie4.py
@@ -0,0 +1,427 @@
+# Copyright 2023-2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Inference-only Ernie4.5 model compatible with baidu/ERNIE-4.5-*-PT weights."""
+
+from typing import Iterable, List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from transformers.models.ernie4_5_moe.configuration_ernie4_5_moe import (
+    Ernie4_5_MoeConfig,
+)
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.layers.communicator import enable_moe_dense_fully_dp
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
+from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.deepseek_v2 import DeepseekV2MLP as Ernie4MLP
+from sglang.srt.models.llama import LlamaAttention as Ernie4Attention
+from sglang.srt.utils import add_prefix, make_layers
+
+
+class MoEGate(nn.Module):
+    def __init__(
+        self,
+        config,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.weight = nn.Parameter(
+            torch.empty((config.moe_num_experts, config.hidden_size))
+        )
+        self.e_score_correction_bias = nn.Parameter(
+            torch.empty((1, config.moe_num_experts))
+        )
+
+    def forward(self, hidden_states):
+        logits = F.linear(hidden_states, self.weight, None)
+        return logits
+
+
+class Ernie4Moe(nn.Module):
+    def __init__(
+        self,
+        config: Ernie4_5_MoeConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.layer_id = layer_id
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.moe_num_shared_experts = getattr(config, "moe_num_shared_experts", 0)
+
+        if config.hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {config.hidden_act}. "
+                "Only silu is supported for now."
+            )
+
+        self.gate = MoEGate(config=config, prefix=add_prefix("gate", prefix))
+
+        self.topk = TopK(
+            top_k=config.moe_k,
+            layer_id=layer_id,
+            renormalize=True,
+            use_grouped_topk=False,
+            correction_bias=self.gate.e_score_correction_bias,
+        )
+
+        self.experts = get_moe_impl_class(quant_config)(
+            num_experts=config.moe_num_experts,
+            top_k=config.moe_k,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            layer_id=self.layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("experts", prefix),
+        )
+
+        if self.moe_num_shared_experts > 0:
+            intermediate_size = (
+                config.moe_intermediate_size * config.moe_num_shared_experts
+            )
+            # disable tp for shared experts when enable deepep moe
+            self.shared_experts = Ernie4MLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                reduce_results=False,
+                prefix=add_prefix("shared_experts", prefix),
+            )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        return self.forward_normal(hidden_states)
+
+    def forward_normal(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        shared_output = (
+            self.shared_experts(hidden_states)
+            if self.moe_num_shared_experts > 0
+            else None
+        )
+        # router_logits: (num_tokens, n_experts)
+        router_logits = self.gate(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states, topk_output=topk_output
+        )
+        if shared_output is not None:
+            final_hidden_states = final_hidden_states + shared_output
+        if self.tp_size > 1:
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+        return final_hidden_states
+
+
+class Ernie4DecoderLayer(nn.Module):
+    """A single transformer layer.
+
+    Transformer layer takes input with size [s, b, h] and returns an
+    output of the same size.
+    """
+
+    def __init__(
+        self,
+        config,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        is_mtp: bool = False,
+    ):
+        super().__init__()
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        rope_is_neox_style = getattr(config, "rope_is_neox_style", False)
+        # Self attention.
+        self.self_attn = Ernie4Attention(
+            config=config,
+            hidden_size=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            rope_is_neox_style=rope_is_neox_style,
+            max_position_embeddings=config.max_position_embeddings,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+            bias=config.use_bias,
+        )
+        moe_layer_start_index = getattr(
+            config, "moe_layer_start_index", config.num_hidden_layers
+        )
+        moe_layer_end_index = getattr(
+            config, "moe_layer_end_index", config.num_hidden_layers - 1
+        )
+        # MLP
+        if (not is_mtp) and (
+            moe_layer_start_index <= layer_id <= moe_layer_end_index
+            and (layer_id - moe_layer_start_index) % config.moe_layer_interval == 0
+        ):
+            self.mlp = Ernie4Moe(
+                config=config,
+                layer_id=layer_id,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+            )
+        else:
+            if enable_moe_dense_fully_dp():
+                mlp_tp_rank, mlp_tp_size = 0, 1
+            else:
+                mlp_tp_rank, mlp_tp_size = None, None
+            self.mlp = Ernie4MLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+                tp_rank=mlp_tp_rank,
+                tp_size=mlp_tp_size,
+            )
+
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+
+        return hidden_states, residual
+
+
+class Ernie4Model(nn.Module):
+    def __init__(
+        self,
+        config: Ernie4_5_MoeConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+        self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: Ernie4DecoderLayer(
+                config=config, layer_id=idx, quant_config=quant_config, prefix=prefix
+            ),
+            prefix="model.layers",
+        )
+
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, List[torch.Tensor]]]:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+        residual = None
+        for layer in self.layers:
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+
+        return hidden_states
+
+
+class Ernie4_5_ForCausalLM(nn.Module):
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
+    stacked_params_mapping = [
+        # (param_name, weight_name, shard_id)
+        (".qkv_proj", ".q_proj", "q"),
+        (".qkv_proj", ".k_proj", "k"),
+        (".qkv_proj", ".v_proj", "v"),
+        (".gate_up_proj", ".gate_proj", 0),
+        (".gate_up_proj", ".up_proj", 1),
+    ]
+
+    def __init__(
+        self,
+        config: Ernie4_5_MoeConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config: Ernie4_5_MoeConfig = config
+        self.quant_config = quant_config
+        self.model = Ernie4Model(config, quant_config, add_prefix("model", prefix))
+        if config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix="lm_head",
+            )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+            for param_name, weight_name, shard_id in self.stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if name in params_dict.keys():
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+                else:
+                    raise KeyError(f"Parameter '{name}' not found in model.")
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+
+class Ernie4_5_MoeForCausalLM(Ernie4_5_ForCausalLM):
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.moe_num_experts,
+        )
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+            if name.startswith("model.mtp_"):
+                continue
+            if "moe_statics.e_score_correction_bias" in name:
+                name = name.replace("moe_statics", "gate")
+            for param_name, weight_name, shard_id in self.stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if ("mlp.experts." in name) and name not in params_dict:
+                    continue
+                name = name.replace(weight_name, param_name)
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    if name in params_dict.keys():
+                        param = params_dict[name]
+                        weight_loader = param.weight_loader
+                        weight_loader(
+                            param,
+                            loaded_weight,
+                            name,
+                            shard_id=shard_id,
+                            expert_id=expert_id,
+                        )
+                    else:
+                        raise KeyError(
+                            f"Parameter '{name}'(replaced) not found in model."
+                        )
+                    break
+                else:
+                    if name in params_dict.keys():
+                        param = params_dict[name]
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        weight_loader(param, loaded_weight)
+                    else:
+                        raise KeyError(f"Parameter '{name}' not found in model.")
+
+
+EntryClass = [Ernie4_5_MoeForCausalLM, Ernie4_5_ForCausalLM]
diff --git a/sglang/python/sglang/srt/models/ernie45_moe_vl.py b/sglang/python/sglang/srt/models/ernie45_moe_vl.py
new file mode 100644
index 0000000000000000000000000000000000000000..24b37ce2b6661f5638ae40f6929f55f6c274a9c7
--- /dev/null
+++ b/sglang/python/sglang/srt/models/ernie45_moe_vl.py
@@ -0,0 +1,552 @@
+# Copyright 2023-2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Inference-only Ernie4.5 VL model compatible with baidu/ERNIE-4.5-VL-*-PT weights."""
+
+import logging
+from itertools import islice
+from typing import Any, Dict, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.layers.dp_attention import is_dp_attention_enabled
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import Ernie4_5_VLRotaryEmbedding
+from sglang.srt.layers.utils import PPMissingLayer
+from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.models.deepseek_v2 import DeepseekV2MLP as Ernie4_5_VLMoeMLP
+from sglang.srt.utils import add_prefix, make_layers
+
+logger = logging.getLogger(__name__)
+
+
+class Ernie4_5_VLMoeAttention(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        rope_is_neox_style: bool = True,
+        freq_allocation: int = 20,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        bias: bool = False,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        # MistralConfig has an optional head_dim introduced by Mistral-Nemo
+        self.head_dim = getattr(
+            config, "head_dim", self.hidden_size // self.total_num_heads
+        )
+        partial_rotary_factor = getattr(config, "partial_rotary_factor", 1)
+        self.rotary_dim = int(partial_rotary_factor * self.head_dim)
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        # 3D rope
+        t_rope = freq_allocation
+        h_rope = (self.head_dim // 2 - freq_allocation) // 2
+        w_rope = (self.head_dim // 2 - freq_allocation) // 2
+
+        self.rotary_emb = Ernie4_5_VLRotaryEmbedding(
+            head_size=self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position_embeddings=max_position_embeddings,
+            base=rope_theta,
+            is_neox_style=False,
+            dtype=torch.get_default_dtype(),
+            mrope_section=[h_rope, w_rope, t_rope],
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class Ernie4_5_VLMoeMoE(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.layer_id = layer_id
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.moe_num_shared_experts = getattr(config, "moe_num_shared_experts", 0)
+        self.hidden_size = config.hidden_size
+
+        moe_num_experts = config.moe_num_experts
+        max_moe_num_experts = max(moe_num_experts)
+
+        if self.tp_size > max_moe_num_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {moe_num_experts}."
+            )
+
+        moe_layer_start_index = config.moe_layer_start_index
+        text_moe_layer_start_index = moe_layer_start_index[0]
+        vision_moe_layer_start_index = moe_layer_start_index[1]
+        moe_layer_end_index = config.moe_layer_end_index
+        moe_layer_end_index = getattr(
+            config,
+            "moe_layer_end_index",
+            [config.num_hidden_layers - 1, config.num_hidden_layers - 1],
+        )
+        text_moe_layer_end_index = moe_layer_end_index[0]
+        vision_moe_layer_end_index = moe_layer_end_index[1]
+
+        assert config.moe_num_experts[0] == config.moe_num_experts[1]
+        self.e_score_correction_bias = nn.Parameter(
+            torch.empty(2, config.moe_num_experts[0], dtype=torch.float32)
+        )
+
+        assert text_moe_layer_start_index <= text_moe_layer_end_index
+
+        if (
+            layer_id >= text_moe_layer_start_index
+            and layer_id <= text_moe_layer_end_index
+        ):
+            self.text_experts_gate = ReplicatedLinear(
+                config.hidden_size,
+                config.moe_num_experts[0],
+                bias=False,
+                params_dtype=torch.float32,
+                quant_config=quant_config,
+                prefix=add_prefix("text_experts_gate", prefix),
+            )
+
+            self.text_experts_topk = TopK(
+                top_k=config.moe_k,
+                renormalize=True,
+                use_grouped_topk=False,
+                correction_bias=self.e_score_correction_bias[0],
+            )
+
+            self.text_experts = get_moe_impl_class(quant_config)(
+                num_experts=config.moe_num_experts[0],
+                top_k=config.moe_k,
+                hidden_size=config.hidden_size,
+                intermediate_size=config.moe_intermediate_size[0],
+                layer_id=self.layer_id,
+                quant_config=quant_config,
+                prefix=add_prefix("text_experts", prefix),
+            )
+
+        assert vision_moe_layer_start_index <= vision_moe_layer_end_index
+        if (
+            layer_id >= vision_moe_layer_start_index
+            and layer_id <= vision_moe_layer_end_index
+        ):
+
+            self.vision_experts_gate = ReplicatedLinear(
+                config.hidden_size,
+                config.moe_num_experts[1],
+                bias=False,
+                params_dtype=torch.float32,
+                quant_config=quant_config,
+                prefix=add_prefix("vision_experts_gate", prefix),
+            )
+
+            self.vision_experts_topk = TopK(
+                top_k=config.moe_k,
+                renormalize=True,
+                use_grouped_topk=False,
+                correction_bias=self.e_score_correction_bias[1],
+            )
+
+            self.vision_experts = get_moe_impl_class(quant_config)(
+                num_experts=config.moe_num_experts[1],
+                top_k=config.moe_k,
+                hidden_size=config.hidden_size,
+                intermediate_size=config.moe_intermediate_size[1],
+                layer_id=self.layer_id,
+                quant_config=quant_config,
+                prefix=add_prefix("vision_experts", prefix),
+            )
+
+        if self.moe_num_shared_experts > 0:
+            intermediate_size = (
+                config.moe_intermediate_size[0] * config.moe_num_shared_experts
+            )
+            self.shared_experts = Ernie4_5_VLMoeMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                reduce_results=False,
+                prefix=add_prefix("shared_experts", prefix),
+            )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        visual_token_mask: torch.Tensor,
+        **kwargs: object,
+    ) -> torch.Tensor:
+        shared_output = (
+            self.shared_experts(hidden_states)
+            if self.moe_num_shared_experts > 0
+            else None
+        )
+
+        orig_shape = hidden_states.shape
+        hidden_dim = hidden_states.shape[-1]
+        hidden_states = hidden_states.view(-1, hidden_dim)
+
+        capturing = torch.cuda.is_current_stream_capturing()
+
+        if visual_token_mask is not None and not capturing:
+            all_visual = visual_token_mask.all()
+            any_visual = visual_token_mask.any()
+        else:
+            # During CUDA Graph capture, all set false
+            all_visual = False
+            any_visual = False
+
+        if all_visual:
+            # vision modal input processing directly
+            vision_router_logits, _ = self.vision_experts_gate(
+                hidden_states.to(dtype=torch.float32)
+            )
+            vision_topk_output = self.vision_experts_topk(
+                hidden_states, vision_router_logits
+            )
+            final_hidden_states = self.vision_experts(
+                hidden_states=hidden_states, topk_output=vision_topk_output
+            )
+        elif any_visual:
+            visual_token_mask = visual_token_mask.repeat(1, self.hidden_size).bool()
+            text_token_mask = ~visual_token_mask
+            final_hidden_states = torch.zeros_like(hidden_states)
+
+            text_hidden_states = hidden_states[text_token_mask].reshape(
+                -1, self.hidden_size
+            )
+            vision_hidden_states = hidden_states[visual_token_mask].reshape(
+                -1, self.hidden_size
+            )
+
+            text_router_logits, _ = self.text_experts_gate(
+                text_hidden_states.to(dtype=torch.float32)
+            )
+            text_topk_output = self.text_experts_topk(
+                text_hidden_states, text_router_logits
+            )
+            final_hidden_states[text_token_mask] = self.text_experts(
+                hidden_states=text_hidden_states, topk_output=text_topk_output
+            ).flatten()
+
+            vision_router_logits, _ = self.vision_experts_gate(
+                vision_hidden_states.to(dtype=torch.float32)
+            )
+            vision_topk_output = self.vision_experts_topk(
+                vision_hidden_states, vision_router_logits
+            )
+            final_hidden_states[visual_token_mask] = self.vision_experts(
+                hidden_states=vision_hidden_states, topk_output=vision_topk_output
+            ).flatten()
+
+        else:
+            # text modal input processing directly
+            text_router_logits, _ = self.text_experts_gate(
+                hidden_states.to(dtype=torch.float32)
+            )
+            topk_output = self.text_experts_topk(hidden_states, text_router_logits)
+            final_hidden_states = self.text_experts(
+                hidden_states=hidden_states, topk_output=topk_output
+            )
+
+        if shared_output is not None:
+            final_hidden_states = final_hidden_states + shared_output
+
+        if self.tp_size > 1:
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+
+        return final_hidden_states.view(orig_shape)
+
+
+class Ernie4_5_VLMoeDecoderLayer(nn.Module):
+    """A single transformer layer.
+
+    Transformer layer takes input with size [s, b, h] and returns an
+    output of the same size.
+    """
+
+    def __init__(
+        self,
+        config,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        rope_theta = getattr(config, "rope_theta", 500000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        rope_is_neox_style = getattr(config, "rope_is_neox_style", False)
+        freq_allocation = getattr(config, "freq_allocation", 20)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 131072)
+        # Self attention.
+        self.self_attn = Ernie4_5_VLMoeAttention(
+            config=config,
+            hidden_size=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            rope_is_neox_style=rope_is_neox_style,
+            freq_allocation=freq_allocation,
+            max_position_embeddings=config.max_position_embeddings,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+            bias=config.use_bias,
+        )
+
+        # MoE
+        moe_layer_start_index = config.moe_layer_start_index
+        min_moe_layer_start_index = min(moe_layer_start_index)
+        moe_layer_end_index = getattr(
+            config,
+            "moe_layer_end_index",
+            [config.num_hidden_layers - 1, config.num_hidden_layers - 1],
+        )
+        max_moe_layer_end_index = max(moe_layer_end_index)
+        assert min_moe_layer_start_index <= max_moe_layer_end_index
+        moe_num_experts = config.moe_num_experts
+        max_moe_num_experts = max(moe_num_experts)
+        moe_layer_interval = getattr(config, "moe_layer_interval", 1)
+        use_moe = getattr(config, "use_moe", max_moe_num_experts > 0)
+        # MLP
+        if (
+            use_moe
+            and ((layer_id + 1) % moe_layer_interval == 0)
+            and layer_id >= min_moe_layer_start_index
+            and layer_id <= max_moe_layer_end_index
+        ):
+            self.mlp = Ernie4_5_VLMoeMoE(
+                config=config,
+                layer_id=layer_id,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+            )
+        else:
+            self.mlp = Ernie4_5_VLMoeMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+            )
+
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+        visual_token_mask: torch.Tensor | None,
+        **kwargs: object,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        if isinstance(self.mlp, Ernie4_5_VLMoeMoE):
+            hidden_states = self.mlp(hidden_states, visual_token_mask, **kwargs)
+        else:
+            hidden_states = self.mlp(hidden_states)
+
+        return hidden_states, residual
+
+
+# only used as text backbone for ernie4.5 vl
+class Ernie4_5_VLMoeModel(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.pp_group = get_pp_group()
+
+        if self.pp_group.is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                enable_tp=not is_dp_attention_enabled(),
+                prefix=add_prefix("embed_tokens", prefix),
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: Ernie4_5_VLMoeDecoderLayer(
+                layer_id=idx,
+                config=config,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix=add_prefix("layers", prefix),
+        )
+        if self.pp_group.is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer(return_tuple=True)
+
+    def get_input_embeddings(self) -> torch.Tensor:
+        return self.embed_tokens
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+        visual_token_mask: torch.Tensor | None = None,
+    ) -> Union[torch.Tensor, PPProxyTensors]:
+
+        if self.pp_group.is_first_rank:
+            if input_embeds is None:
+                hidden_states = self.embed_tokens(input_ids)
+            else:
+                hidden_states = input_embeds
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+
+        for layer in islice(self.layers, self.start_layer, self.end_layer):
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+                visual_token_mask,
+            )
+
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors(
+                {
+                    "hidden_states": hidden_states,
+                    "residual": residual,
+                }
+            )
+
+        if hidden_states.shape[0] != 0:
+            if residual is None:
+                hidden_states = self.norm(hidden_states)
+            else:
+                hidden_states, _ = self.norm(hidden_states, residual)
+
+        return hidden_states
diff --git a/sglang/python/sglang/srt/models/ernie45_vl.py b/sglang/python/sglang/srt/models/ernie45_vl.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2071f416667c1daa685064c16f5158111824c60
--- /dev/null
+++ b/sglang/python/sglang/srt/models/ernie45_vl.py
@@ -0,0 +1,868 @@
+# Copyright 2023-2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only Ernie45-VL model compatible with HuggingFace weights."""
+
+import logging
+from functools import lru_cache, partial
+from typing import Iterable, List, Optional, Tuple, Type
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+from transformers import PretrainedConfig
+
+from sglang.srt.layers.activation import QuickGELU
+from sglang.srt.layers.attention.vision import VisionAttention
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternMultimodalTokens,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInputs
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.ernie45_moe_vl import Ernie4_5_VLMoeModel
+from sglang.srt.utils import add_prefix
+from sglang.srt.utils.hf_transformers_utils import get_processor
+
+logger = logging.getLogger(__name__)
+
+
+# === Vision Encoder === #
+
+
+class Ernie4_5_VisionMLP(nn.Module):
+
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: int = None,
+        act_layer: Type[nn.Module] = QuickGELU,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.fc1 = ColumnParallelLinear(
+            in_features,
+            hidden_features,
+            quant_config=quant_config,
+            prefix=add_prefix("fc1", prefix),
+        )
+        self.act = act_layer()
+        self.fc2 = RowParallelLinear(
+            hidden_features,
+            in_features,
+            quant_config=quant_config,
+            prefix=add_prefix("fc2", prefix),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x_parallel, _ = self.fc1(x)
+        x_parallel = self.act(x_parallel)
+        x, _ = self.fc2(x_parallel)
+        return x
+
+
+class Ernie4_5_VisionBlock(nn.Module):
+
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        mlp_ratio: float,
+        act_layer: Type[nn.Module] = QuickGELU,
+        norm_layer: Type[nn.Module] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        if norm_layer is None:
+            norm_layer = partial(nn.LayerNorm, eps=1e-6)
+        self.norm1 = norm_layer(dim)
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+
+        self.attn = VisionAttention(
+            embed_dim=dim,
+            num_heads=num_heads,
+            projection_size=dim,
+            use_qkv_parallel=True,
+            flatten_batch=True,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+        self.mlp = Ernie4_5_VisionMLP(
+            dim,
+            mlp_hidden_dim,
+            act_layer=act_layer,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb_cos: torch.Tensor,
+        rotary_pos_emb_sin: torch.Tensor,
+    ) -> torch.Tensor:
+        hidden_states = self.norm1(x)
+        hidden_states = rearrange(hidden_states, "s b ... -> b s ...")
+        attn = self.attn(
+            hidden_states,
+            cu_seqlens=cu_seqlens,
+            rotary_pos_emb_cos=rotary_pos_emb_cos,
+            rotary_pos_emb_sin=rotary_pos_emb_sin,
+        )
+        attn = rearrange(attn, "b s ... -> s b ...")
+        x = x + attn
+        x = x + self.mlp(self.norm2(x))
+        return x
+
+
+class Ernie4_5_VisionPatchEmbed(nn.Module):
+
+    def __init__(
+        self,
+        patch_size: int = 14,
+        in_chans: int = 3,
+        embed_dim: int = 1280,
+    ) -> None:
+        super().__init__()
+        self.patch_size = patch_size
+        self.in_channels = in_chans
+        self.embed_dim = embed_dim
+
+        self.proj = nn.Linear(in_chans * patch_size * patch_size, embed_dim, bias=False)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        target_dtype = self.proj.weight.dtype
+        hidden_states = hidden_states.to(target_dtype)
+        hidden_states = self.proj(hidden_states)
+
+        return hidden_states
+
+
+class VariableResolutionResamplerModel(nn.Module):
+    def __init__(
+        self,
+        in_dim,
+        out_dim,
+        spatial_conv_size,
+        temporal_conv_size,
+        config,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.in_dim = in_dim
+        self.out_dim = out_dim
+        self.config = config
+        self.spatial_conv_size = spatial_conv_size
+        self.temporal_conv_size = temporal_conv_size
+        self.use_temporal_conv = config.use_temporal_conv
+
+        # compress 2d conv(picture) to 1d
+        self.spatial_dim = self.in_dim * self.spatial_conv_size * self.spatial_conv_size
+        # compress 3d conv(video) to 1d
+        self.temporal_dim = (
+            self.in_dim
+            * self.spatial_conv_size
+            * self.spatial_conv_size
+            * self.temporal_conv_size
+        )
+
+        self.spatial_linear1 = ColumnParallelLinear(
+            self.spatial_dim,
+            self.spatial_dim,
+            bias=True,
+            gather_output=True,
+            quant_config=getattr(config, "quant_config", None),
+            prefix=f"{prefix}.spatial_linear1",
+        )
+
+        self.spatial_gelu = nn.GELU()
+
+        self.spatial_linear2 = ColumnParallelLinear(
+            self.spatial_dim,
+            self.spatial_dim,
+            bias=True,
+            gather_output=True,
+            quant_config=getattr(config, "quant_config", None),
+            prefix=f"{prefix}.spatial_linear2",
+        )
+
+        self.spatial_norm = nn.LayerNorm(self.spatial_dim, eps=1e-6)
+
+        if self.use_temporal_conv:
+            self.temporal_linear1 = ColumnParallelLinear(
+                self.temporal_dim,
+                self.spatial_dim,
+                bias=True,
+                gather_output=True,
+                quant_config=getattr(config, "quant_config", None),
+                prefix=f"{prefix}.temporal_linear1",
+            )
+
+            self.temporal_gelu = nn.GELU()
+
+            self.temporal_linear2 = ColumnParallelLinear(
+                self.spatial_dim,
+                self.spatial_dim,
+                bias=True,
+                gather_output=True,
+                quant_config=getattr(config, "quant_config", None),
+                prefix=f"{prefix}.temporal_linear2",
+            )
+
+            self.temporal_norm = nn.LayerNorm(self.spatial_dim, eps=1e-6)
+
+        self.mlp = ColumnParallelLinear(
+            self.spatial_dim,
+            self.out_dim,
+            bias=True,
+            gather_output=True,
+            quant_config=getattr(config, "quant_config", None),
+            prefix=f"{prefix}.mlp",
+        )
+
+        self.after_norm = RMSNorm(
+            hidden_size=out_dim, eps=getattr(config, "rms_norm_eps", 1e-6)
+        )
+
+    def spatial_conv_reshape(self, x, spatial_conv_size):
+        S, C = x.shape
+        x = x.reshape([-1, C * (spatial_conv_size**2)])
+        return x
+
+    def forward(self, x, grid_thw):
+        def fwd_spatial(x):
+            x = self.spatial_conv_reshape(x, self.spatial_conv_size)
+
+            x, _ = self.spatial_linear1(x)
+            x = self.spatial_gelu(x)
+            x, _ = self.spatial_linear2(x)
+            x = self.spatial_norm(x)
+
+            return x
+
+        def fwd_placeholder(x, grid_thw, to_tensor=False):
+            grid_thw_cpu = grid_thw.cpu().numpy()
+            grid_t, grid_hw = grid_thw_cpu[:, 0], grid_thw_cpu[:, 1:]
+            grid_hw_after_conv = grid_hw.prod(-1) // (self.spatial_conv_size**2)
+
+            tokens_per_img_or_vid = grid_thw_cpu.prod(-1) // (self.spatial_conv_size**2)
+            batch_offset = np.empty(
+                tokens_per_img_or_vid.size, dtype=tokens_per_img_or_vid.dtype
+            )
+            batch_offset[0] = 0
+            batch_offset[1:] = tokens_per_img_or_vid.cumsum()[:-1]
+
+            slice_offsets = []
+            for temporoal_size, spatial_size, b_offset in zip(
+                grid_t, grid_hw_after_conv, batch_offset
+            ):
+                for temp_offset in range(0, temporoal_size, 2):
+                    slice_offsets.append(
+                        np.arange(
+                            b_offset + (temp_offset) * spatial_size,
+                            b_offset + (temp_offset + 1) * spatial_size,
+                        )
+                    )
+            slice_offsets = torch.tensor(np.concatenate(slice_offsets, axis=-1)).to(
+                x.device
+            )
+
+            slice_offsets2 = []
+            for temporoal_size, spatial_size, b_offset in zip(
+                grid_t, grid_hw_after_conv, batch_offset
+            ):
+                for temp_offset in range(
+                    1 if temporoal_size > 1 else 0, temporoal_size, 2
+                ):
+                    slice_offsets2.append(
+                        np.arange(
+                            b_offset + (temp_offset) * spatial_size,
+                            b_offset + (temp_offset + 1) * spatial_size,
+                        )
+                    )
+            slice_offsets2 = torch.tensor(np.concatenate(slice_offsets2, axis=-1)).to(
+                x.device
+            )
+
+            x_timestep_1 = torch.index_select(x, dim=0, index=slice_offsets)
+            x_timestep_2 = torch.index_select(x, dim=0, index=slice_offsets2)
+            x = torch.concat([x_timestep_1, x_timestep_2], dim=-1)
+            return x
+
+        def fwd_temporal(x):
+            x, _ = self.temporal_linear1(x)
+            x = self.temporal_gelu(x)
+            x, _ = self.temporal_linear2(x)
+            x = self.temporal_norm(x)
+            return x
+
+        def fwd_mlp(x):
+            x, _ = self.mlp(x)
+            x = self.after_norm(x)
+            return x
+
+        x = fwd_spatial(x)
+        if self.use_temporal_conv:
+            x = fwd_placeholder(x, grid_thw)
+            x = fwd_temporal(x)
+        x = fwd_mlp(x)
+        return x
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        loaded_params: set[str] = set()
+
+        for name, loaded_weight in weights:
+            if name not in params_dict:
+                continue
+            param = params_dict[name]
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+
+class Ernie4_5_VisionRotaryEmbedding(nn.Module):
+
+    def __init__(self, dim: int, theta: float = 10000.0) -> None:
+        super().__init__()
+        self.inv_freq = 1.0 / theta ** (
+            torch.arange(start=0, end=dim, step=2, dtype=torch.float32) / dim
+        )
+
+    def forward(self, seqlen: int) -> torch.Tensor:
+        seq = torch.arange(
+            seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype
+        )
+        freqs = torch.outer(input=seq, vec2=self.inv_freq)
+        return freqs
+
+
+class Ernie4_5_VisionTransformer(nn.Module):
+
+    def __init__(
+        self,
+        vision_config: PretrainedConfig,
+        norm_eps: float = 1e-6,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        patch_size: int = vision_config.patch_size
+        spatial_merge_size: int = vision_config.spatial_merge_size
+        in_chans: int = vision_config.in_chans
+        hidden_size: int = vision_config.hidden_size
+        embed_dim: int = vision_config.embed_dim
+        depth: int = vision_config.depth
+        num_heads: int = vision_config.num_heads
+        mlp_ratio: float = vision_config.mlp_ratio
+
+        self.spatial_merge_size = spatial_merge_size
+
+        self.patch_embed = Ernie4_5_VisionPatchEmbed(
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+        )
+
+        norm_layer = partial(nn.LayerNorm, eps=norm_eps)
+        head_dim = embed_dim // num_heads
+        self.rotary_pos_emb = get_rope(
+            head_size=head_dim,
+            rotary_dim=head_dim // 2,
+            max_position=8192,
+            base=10000.0,
+            is_neox_style=True,
+        )
+        self.blocks = nn.ModuleList(
+            [
+                Ernie4_5_VisionBlock(
+                    dim=embed_dim,
+                    num_heads=num_heads,
+                    mlp_ratio=mlp_ratio,
+                    norm_layer=norm_layer,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"blocks.{i}", prefix),
+                )
+                for i in range(depth)
+            ]
+        )
+
+        self.ln = nn.LayerNorm(hidden_size, eps=1e-6)
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return self.patch_embed.proj.weight.dtype
+
+    @property
+    def device(self) -> torch.device:
+        return self.blocks[0].mlp.fc2.weight.device
+
+    def rot_pos_emb(
+        self, grid_thw: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        pos_ids = []
+        for i in range(grid_thw.size(0)):
+            t, h, w = grid_thw[i].tolist()
+            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
+            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
+            hpos_ids = (
+                hpos_ids.reshape(
+                    h // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                    w // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                )
+                .permute(0, 2, 1, 3)
+                .flatten()
+            )
+            wpos_ids = (
+                wpos_ids.reshape(
+                    h // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                    w // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                )
+                .permute(0, 2, 1, 3)
+                .flatten()
+            )
+            pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
+        pos_ids = torch.cat(pos_ids, dim=0).to(self.device, non_blocking=True)
+        max_grid_size = grid_thw[:, 1:].max()
+
+        # Use pre-computed cos_sin_cache from RotaryEmbedding
+        cos, sin = self.rotary_pos_emb.get_cos_sin(max_grid_size)
+
+        cos_combined = cos[pos_ids].flatten(1)
+        sin_combined = sin[pos_ids].flatten(1)
+        return cos_combined, sin_combined, pos_ids
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        grid_thw: torch.Tensor,
+    ) -> torch.Tensor:
+        # patchify
+        x = x.to(device=self.device, dtype=self.dtype)
+        x = self.patch_embed(x)
+
+        # compute position embedding
+        rotary_pos_emb_cos, rotary_pos_emb_sin, image_type_ids = self.rot_pos_emb(
+            grid_thw
+        )
+        rotary_pos_emb_cos = torch.cat([rotary_pos_emb_cos, rotary_pos_emb_cos], dim=-1)
+        rotary_pos_emb_sin = torch.cat([rotary_pos_emb_sin, rotary_pos_emb_sin], dim=-1)
+        # compute cu_seqlens
+        cu_seqlens = torch.repeat_interleave(
+            grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
+        ).cumsum(dim=0, dtype=torch.int32)
+        cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens])
+
+        # transformers
+        x = x.unsqueeze(1)
+        for blk in self.blocks:
+            x = blk(
+                x,
+                cu_seqlens=cu_seqlens,
+                rotary_pos_emb_cos=rotary_pos_emb_cos,
+                rotary_pos_emb_sin=rotary_pos_emb_sin,
+            )
+
+        final_output = self.ln(x)
+
+        if final_output.ndim == 3:
+            final_output = final_output.squeeze(dim=1)
+
+        return final_output
+
+
+cached_get_processor = lru_cache(get_processor)
+
+
+class Ernie4_5_VLMoeForConditionalGeneration(nn.Module):
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        self.vision_model = Ernie4_5_VisionTransformer(
+            config.vision_config,
+            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+            quant_config=quant_config,
+            prefix=add_prefix("vision_model", prefix),
+        )
+
+        self.model = Ernie4_5_VLMoeModel(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+
+        self.resampler_model = VariableResolutionResamplerModel(
+            self.config.pixel_hidden_size,
+            self.config.hidden_size,
+            self.config.spatial_conv_size,
+            self.config.temporal_conv_size,
+            config=self.config,
+            prefix=add_prefix("resampler_model", prefix),
+        )
+
+        if config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+            )
+
+        self.is_mrope_enabled = "mrope_section" in self.config.rope_scaling
+        self.logits_processor = LogitsProcessor(config)
+
+        if getattr(self.config, "im_patch_id", None):
+            visual_token_ids = [
+                token_id
+                for token_id in [
+                    self.config.im_patch_id,
+                    getattr(self.config, "image_start_token_id", None),
+                    getattr(self.config, "image_end_token_id", None),
+                    getattr(self.config, "video_start_token_id", None),
+                    getattr(self.config, "video_end_token_id", None),
+                ]
+                if token_id is not None
+            ]
+            self._visual_token_ids_tensor_cache = torch.tensor(
+                visual_token_ids, dtype=torch.long
+            )
+        else:
+            self._visual_token_ids_tensor_cache = None
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+        return pattern.pad_input_tokens(input_ids, mm_inputs)
+
+    def _vision_forward(
+        self,
+        pixel_values: torch.Tensor,
+        grid_thw: torch.Tensor,
+    ) -> torch.Tensor:
+        if grid_thw is not None:
+            grid_thw = grid_thw[grid_thw > 0]
+            if grid_thw.numel() % 3 != 0:
+                raise ValueError(
+                    f"grid_thw has {grid_thw.numel()} elements after filtering,"
+                    "which is not divisible by 3."
+                )
+            grid_thw = grid_thw.reshape(-1, 3)
+            # example: [[1,64,64],[2,80,80]] -> [[1,64,64],[1,80,80],[1,80,80]]
+            grid_thw = F.pad(
+                torch.repeat_interleave(grid_thw[:, 1:], grid_thw[:, 0], 0),
+                [1, 0, 0, 0],
+                value=1,
+            )
+        image_features = self.vision_model(pixel_values, grid_thw)
+        return image_features
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        # in qwen-vl, last dim is the same
+        pixel_values = torch.cat([item.feature for item in items], dim=0).type(
+            self.vision_model.dtype
+        )
+        image_grid_thw = torch.concat([item.image_grid_thw for item in items], dim=0)
+        assert pixel_values.dim() == 2, pixel_values.dim()
+        assert image_grid_thw.dim() == 2, image_grid_thw.dim()
+        image_feature = self._vision_forward(pixel_values, grid_thw=image_grid_thw)
+        image_embeds = self.resampler_model(image_feature, image_grid_thw)
+        return image_embeds
+
+    def get_video_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        # in qwen-vl, last dim is the same
+        pixel_values = torch.cat([item.feature for item in items], dim=0).type(
+            self.vision_model.dtype
+        )
+        video_grid_thw = torch.concat([item.video_grid_thw for item in items], dim=0)
+        assert pixel_values.dim() == 2, pixel_values.dim()
+        assert video_grid_thw.dim() == 2, video_grid_thw.dim()
+        video_feature = self._vision_forward(pixel_values, grid_thw=video_grid_thw)
+        video_embeds = self.resampler_model(video_feature, video_grid_thw)
+        return video_embeds
+
+    def _set_visual_token_mask(
+        self, input_ids: torch.Tensor, forward_batch: ForwardBatch
+    ) -> None:
+        """Set mask for visual tokens (image/video patches and delimiters)."""
+        if self._visual_token_ids_tensor_cache is None:
+            self.visual_token_mask = None
+            return
+        # Create tensor on the correct device
+        visual_token_ids_tensor = self._visual_token_ids_tensor_cache.to(
+            device=input_ids.device,
+            dtype=input_ids.dtype,
+        )
+
+        pad_values = []
+        if hasattr(forward_batch, "mm_inputs") and forward_batch.mm_inputs is not None:
+            for mm_input in forward_batch.mm_inputs:
+                if mm_input is None:
+                    continue
+                for item in mm_input.mm_items:
+                    pad_values.append(item.pad_value)
+        placeholder_tensor = torch.as_tensor(
+            pad_values,
+            device=input_ids.device,
+        )
+        pad_visual_token_ids_tensor = torch.cat(
+            [visual_token_ids_tensor, placeholder_tensor], dim=0
+        )
+        self.visual_token_mask = torch.isin(
+            input_ids, pad_visual_token_ids_tensor
+        ).reshape(-1, 1)
+
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+
+    def should_apply_lora(self, module_name: str) -> bool:
+        # skip vision_model
+        return not module_name.startswith("vision_model")
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        get_embedding: bool = False,
+    ):
+        """Run forward pass for Ernie45-VL.
+
+        Args:
+            input_ids: Flattened (concatenated) input_ids corresponding to a
+                batch.
+            positions: Flattened (concatenated) position ids corresponding to a
+                batch.
+                **NOTE**: If mrope is enabled (default setting for Qwen2-VL
+                opensource models), the shape will be `(3, seq_len)`,
+                otherwise it will be `(seq_len,).
+                (Use input_metadata.mrope_positions to replace it)
+        """
+        if self.is_mrope_enabled:
+            positions = forward_batch.mrope_positions
+
+        if not (
+            forward_batch.forward_mode.is_decode()
+            or not forward_batch.contains_image_inputs()
+        ):
+            if self.is_mrope_enabled:
+                assert positions.ndim == 2 and positions.size(0) == 3, (
+                    "multimodal section rotary embedding requires "
+                    f"(3, seq_len) positions, but got {positions.size()}"
+                )
+
+        self._set_visual_token_mask(input_ids, forward_batch)
+
+        assert (
+            input_ids.numel() == positions.shape[-1]
+        ), f"input_ids {input_ids.shape} and position_ids {positions.shape} should have the same length"
+
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.model,
+            multimodal_model=self,
+            positions=positions,
+            visual_token_mask=self.visual_token_mask,
+        )
+
+        self.visual_token_mask = None
+
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "up_proj", 1),
+            ("gate_up_proj", "gate_proj", 0),
+        ]
+
+        # resampler_weight_mappings
+        resampler_weight_mapping = {
+            "spatial_linear.0.": "spatial_linear1.",
+            "spatial_linear.2.": "spatial_linear2.",
+            "spatial_linear.3.": "spatial_norm.",
+            "temporal_linear.0.": "temporal_linear1.",
+            "temporal_linear.2.": "temporal_linear2.",
+            "temporal_linear.3.": "temporal_norm.",
+        }
+
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=max(self.config.moe_num_experts),
+        )
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+
+                if ("mlp.experts." in name) and name not in params_dict:
+                    continue
+                name = name.replace(weight_name, param_name)
+
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if "vision_model" in name:
+                    # adapt to VisionAttention
+                    name = name.replace(r"attn.qkv.", r"attn.qkv_proj.")
+                if name.startswith("model.resampler_model"):
+                    name = name.replace("model.resampler_model", "resampler_model")
+
+                for (
+                    old_weight_name,
+                    new_weight_name,
+                ) in resampler_weight_mapping.items():
+                    if old_weight_name in name:
+                        name = name.replace(old_weight_name, new_weight_name, 1)
+                        break
+
+                # Distinguish between vision experts and text experts
+                if "mlp.experts" in name:
+                    moe_offset = int(name.split(".")[-3])
+                    vision_expert_start_idx = self.config.moe_num_experts[0]
+                    is_text_expert = moe_offset <= vision_expert_start_idx - 1
+                    if is_text_expert:
+                        name = name.replace(".experts.", ".text_experts.")
+                    else:
+                        name = name.replace(
+                            f".experts.{moe_offset}",
+                            f".vision_experts.{moe_offset - vision_expert_start_idx}",
+                        )
+
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+
+                    # Distinguish between vision experts and text experts
+                    moe_offset = int(name.split(".")[-3])
+                    is_text_expert = moe_offset <= self.config.moe_num_experts[0] - 1
+
+                    name = name.replace(weight_name, param_name)
+                    if is_text_expert:
+                        name = name.replace(".experts.", ".text_experts.")
+                    else:
+                        name = name.replace(".experts.", ".vision_experts.")
+
+                    # Skip loading extra bias for GPTQ models.
+                    if (
+                        name.endswith(".bias") or name.endswith("_bias")
+                    ) and name not in params_dict:
+                        continue
+
+                    if name in params_dict.keys():
+                        param = params_dict[name]
+                        weight_loader = param.weight_loader
+                        weight_loader(
+                            param,
+                            loaded_weight,
+                            name,
+                            shard_id=shard_id,
+                            expert_id=expert_id,
+                        )
+                    else:
+                        logger.warning(f"Parameter {name} not found in params_dict")
+                    break
+                else:
+                    # Distinguish between vision expert gate
+                    # and text expert gate
+                    if name.endswith("mlp.gate.weight"):
+                        name = name.replace("gate.weight", "text_experts_gate.weight")
+                        loaded_weight = loaded_weight.T
+                    elif name.endswith("mlp.gate.weight_1"):
+                        name = name.replace(
+                            "gate.weight_1", "vision_experts_gate.weight"
+                        )
+                        loaded_weight = loaded_weight.T
+
+                    if "e_score_correction_bias" in name:
+                        name = name.replace(".moe_statics.", ".")
+
+                    # Skip loading extra bias for GPTQ models.
+                    if (
+                        name.endswith(".bias") or name.endswith("_bias")
+                    ) and name not in params_dict:
+                        continue
+
+                    if name in params_dict.keys():
+                        param = params_dict[name]
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        weight_loader(param, loaded_weight)
+                    else:
+                        logger.warning(f"Parameter {name} not found in params_dict")
+
+
+EntryClass = [Ernie4_5_VLMoeForConditionalGeneration]
diff --git a/sglang/python/sglang/srt/models/ernie4_eagle.py b/sglang/python/sglang/srt/models/ernie4_eagle.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b9a96fa90bf911215ee45f2ac7e701dad6d3702
--- /dev/null
+++ b/sglang/python/sglang/srt/models/ernie4_eagle.py
@@ -0,0 +1,203 @@
+# Copyright 2023-2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Ernie4.5 MTP model compatible with baidu/ERNIE-4.5-*-PT weights."""
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers.models.ernie4_5_moe.configuration_ernie4_5_moe import (
+    Ernie4_5_MoeConfig,
+)
+
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.ernie4 import Ernie4_5_ForCausalLM, Ernie4DecoderLayer
+from sglang.srt.utils import add_prefix
+
+
+class Ernie4ModelMTP(nn.Module):
+    def __init__(
+        self,
+        config: Ernie4_5_MoeConfig,
+        layer_id: int,
+        prefix: str,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+        self.mtp_emb_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.mtp_hidden_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.mtp_linear_proj = nn.Linear(
+            config.hidden_size * 2, config.hidden_size, bias=config.use_bias
+        )
+        self.mtp_block = Ernie4DecoderLayer(
+            config=config,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("mtp_block", prefix),
+            is_mtp=True,
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+        # masking inputs at position 0, as not needed by MTP
+        hidden_states[positions == 0] = 0
+
+        hidden_states = self.mtp_linear_proj(
+            torch.cat(
+                (
+                    self.mtp_emb_norm(hidden_states),
+                    self.mtp_hidden_norm(forward_batch.spec_info.hidden_states),
+                ),
+                dim=-1,
+            )
+        )
+        residual = None
+        hidden_states, residual = self.mtp_block(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+            residual=residual,
+        )
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+class Ernie4_5_MoeForCausalLMMTP(nn.Module):
+    def __init__(
+        self,
+        config: Ernie4_5_MoeConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        mtp_layer_id: int = 0,
+    ) -> None:
+        nn.Module.__init__(self)
+        self.config = config
+        self.mtp_layer_id = mtp_layer_id
+
+        self.model = Ernie4ModelMTP(
+            config=config,
+            layer_id=self.mtp_layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("model", prefix),
+        )
+
+        if config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix="lm_head",
+            )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        mtp_layer_found = False
+        mtp_weight_patterns = [
+            f"mtp_block.{self.mtp_layer_id}",
+            f"mtp_emb_norm.{self.mtp_layer_id}",
+            f"mtp_hidden_norm.{self.mtp_layer_id}",
+            f"mtp_linear_proj.{self.mtp_layer_id}",
+        ]
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            # Only name matched patterns should be loaded
+            for layer_pattern in mtp_weight_patterns:
+                if layer_pattern in name:
+                    mtp_layer_found = True
+                    break
+            else:
+                continue
+            # But strip mtp_layer_id before loading, because each MTP layer is a MTP model.
+            name = name.replace(f".{self.mtp_layer_id}.", ".")
+            for (
+                param_name,
+                weight_name,
+                shard_id,
+            ) in Ernie4_5_ForCausalLM.stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if name in params_dict.keys():
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+                else:
+                    raise KeyError(f"Parameter '{name}' not found in MTP model.")
+        if not mtp_layer_found:
+            raise KeyError(
+                f"MTP layers 'mtp_*.{self.mtp_layer_id}.*' not found in weights."
+            )
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        self.model.embed_tokens.weight = embed
+        if self.config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            del self.lm_head.weight
+            self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+
+EntryClass = [Ernie4_5_MoeForCausalLMMTP]
diff --git a/sglang/python/sglang/srt/models/exaone.py b/sglang/python/sglang/srt/models/exaone.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e4dfb3df21727d0884b067fca8912662b03add8
--- /dev/null
+++ b/sglang/python/sglang/srt/models/exaone.py
@@ -0,0 +1,377 @@
+# Copyright 2024 The LGcns AI Engineering Team
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from llama2.py
+"""Inference-only Exaone model compatible with HuggingFace weights."""
+
+from typing import Any, Dict, Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix
+
+
+class ExaoneGatedMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.c_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("c_proj", prefix),
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.c_proj(x)
+        return x
+
+
+class ExaoneAttention(nn.Module):
+    def __init__(
+        self,
+        config,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 500000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        rope_is_neox_style: bool = True,
+        max_position_embeddings: int = 4096,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        # MistralConfig has an optional head_dim introduced by Mistral-Nemo
+        self.head_dim = getattr(
+            config, "head_dim", self.hidden_size // self.total_num_heads
+        )
+        self.rotary_dim = int(
+            self.head_dim * getattr(config, "partial_rotary_factor", 1)
+        )
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.out_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("out_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.rotary_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            is_neox_style=rope_is_neox_style,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.out_proj(attn_output)
+        return output
+
+
+class ExaoneDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 500000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        if rope_scaling is not None and getattr(
+            config, "original_max_position_embeddings", None
+        ):
+            rope_scaling["original_max_position_embeddings"] = (
+                config.original_max_position_embeddings
+            )
+        rope_is_neox_style = getattr(config, "rope_is_neox_style", True)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 4096)
+        self.self_attn = ExaoneAttention(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            rope_is_neox_style=rope_is_neox_style,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.mlp = ExaoneGatedMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.activation_function,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        rms_norm_eps = config.layer_norm_epsilon
+        self.ln_1 = RMSNorm(config.hidden_size, eps=rms_norm_eps)
+        self.ln_2 = RMSNorm(config.hidden_size, eps=rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.ln_1(hidden_states)
+        else:
+            hidden_states, residual = self.ln_1(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.ln_2(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+class ExaoneModel(nn.Module):
+    def __init__(
+        self,
+        config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.wte = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+        )
+        self.h = nn.ModuleList(
+            [
+                ExaoneDecoderLayer(
+                    config,
+                    i,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"h.{i}", prefix),
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        rms_norm_eps = config.layer_norm_epsilon
+        self.ln_f = RMSNorm(config.hidden_size, eps=rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.wte(input_ids)
+        else:
+            hidden_states = input_embeds
+        residual = None
+        for i in range(len(self.h)):
+            layer = self.h[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        hidden_states, _ = self.ln_f(hidden_states, residual)
+        return hidden_states
+
+
+class ExaoneForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.transformer = ExaoneModel(
+            config, quant_config=quant_config, prefix=add_prefix("transformer", prefix)
+        )
+        if self.config.tie_word_embeddings:
+            self.lm_head = self.transformer.wte
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                prefix=add_prefix("lm_head", prefix),
+            )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> LogitsProcessorOutput:
+        hidden_states = self.transformer(
+            input_ids, positions, forward_batch, input_embeds
+        )
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "c_fc_0", 0),
+            ("gate_up_proj", "c_fc_1", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name or "projector" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            if name.startswith("model.vision_tower") and name not in params_dict:
+                continue
+
+            name = name.replace("attn.attention", "self_attn")
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = ExaoneForCausalLM
diff --git a/sglang/python/sglang/srt/models/exaone4.py b/sglang/python/sglang/srt/models/exaone4.py
new file mode 100644
index 0000000000000000000000000000000000000000..76d5998ecf8ac95525042ae54cc7d3a354784625
--- /dev/null
+++ b/sglang/python/sglang/srt/models/exaone4.py
@@ -0,0 +1,719 @@
+from collections.abc import Iterable
+from typing import Any, List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from transformers import Exaone4Config
+
+from sglang.srt.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    get_local_attention_dp_size,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer, get_layer_id
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import add_prefix, make_layers
+from sglang.utils import get_exception_traceback, logger
+
+
+# Aligned with HF's implementation, using sliding window inclusive with the last token
+# SGLang assumes exclusive
+def get_attention_sliding_window_size(config):
+    if getattr(config, "sliding_window", None) is not None:
+        return config.sliding_window - 1
+    else:
+        return None
+
+
+class Exaone4GatedMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        bias: bool = False,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class Exaone4Attention(nn.Module):
+    def __init__(
+        self,
+        config,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        head_dim: Optional[int] = None,
+        rms_norm_eps: float = 1e-06,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        bias: bool = False,
+        bias_o_proj: bool = False,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+
+        attn_tp_rank = get_attention_tp_rank()
+        attn_tp_size = get_attention_tp_size()
+
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+
+        self.head_dim = head_dim or hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size=hidden_size,
+            head_size=self.head_dim,
+            total_num_heads=self.total_num_heads,
+            total_num_kv_heads=self.total_num_kv_heads,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+        )
+
+        self.o_proj = RowParallelLinear(
+            input_size=self.total_num_heads * self.head_dim,
+            output_size=hidden_size,
+            bias=bias_o_proj,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+        )
+
+        is_neox_style = True
+        if quant_config is not None and quant_config.get_name() == "gguf":
+            is_neox_style = False
+
+        interleaved_sliding_window = get_attention_sliding_window_size(config)
+        self.sliding_window_pattern = getattr(config, "sliding_window_pattern", None)
+
+        self.is_sliding = False
+        if self.sliding_window_pattern:
+            if (layer_id + 1) % len(self.sliding_window_pattern) != 0:
+                self.is_sliding = True
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            is_neox_style=is_neox_style,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            sliding_window_size=(
+                interleaved_sliding_window if self.is_sliding else None
+            ),
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+        self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
+        self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+
+        # Add qk-norm
+        q_shape = q.shape
+        q = q.reshape(-1, self.head_dim)
+        q = self.q_norm(q)
+        q = q.reshape(q_shape)
+
+        k_shape = k.shape
+        k = k.reshape(-1, self.head_dim)
+        k = self.k_norm(k)
+        k = k.reshape(k_shape)
+
+        if not self.sliding_window_pattern or self.is_sliding:
+            q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class Exaone4DecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: Exaone4Config,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.layer_id = layer_id
+        self.hidden_size = config.hidden_size
+
+        rope_theta = getattr(config, "rope_theta", 1000000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        if rope_scaling is not None and getattr(
+            config, "original_max_position_embeddings", None
+        ):
+            rope_scaling["original_max_position_embeddings"] = (
+                config.original_max_position_embeddings
+            )
+
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+
+        self.local_dp_size = get_local_attention_dp_size()
+        self.attn_tp_size = get_attention_tp_size()
+        self.attn_tp_rank = get_attention_tp_rank()
+
+        self.self_attn = Exaone4Attention(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=getattr(
+                config, "num_key_value_heads", config.num_key_value_heads
+            ),
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.mlp = Exaone4GatedMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.post_attention_layernorm = RMSNorm(
+            self.hidden_size, eps=config.rms_norm_eps
+        )
+        self.post_feedforward_layernorm = RMSNorm(
+            self.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+
+        if residual is None:
+            residual = hidden_states
+
+        # Self Attention
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        # Use post-LN
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = hidden_states + residual
+        residual = hidden_states
+
+        # Fully Connected
+        hidden_states = self.mlp(hidden_states)
+
+        # Use post-LN
+        hidden_states = self.post_feedforward_layernorm(hidden_states)
+        hidden_states = hidden_states + residual
+        residual = hidden_states
+
+        return hidden_states, residual
+
+
+class Exaone4Model(nn.Module):
+    def __init__(
+        self,
+        config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.vocab_size = config.vocab_size
+        self.pp_group = get_pp_group()
+        if self.pp_group.is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("embed_tokens", prefix),
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: Exaone4DecoderLayer(
+                config=config,
+                quant_config=quant_config,
+                layer_id=idx,
+                prefix=prefix,
+            ),
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix=add_prefix("layers", prefix),
+        )
+        if self.pp_group.is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer(return_tuple=True)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: Optional[torch.Tensor] = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, List[torch.Tensor]], PPProxyTensors]:
+        if self.pp_group.is_first_rank:
+            if input_embeds is None:
+                hidden_states = self.get_input_embeddings(input_ids)
+            else:
+                hidden_states = input_embeds
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors(
+                {
+                    "hidden_states": hidden_states,
+                    "residual": residual,
+                }
+            )
+        else:
+            hidden_states = self.norm(hidden_states)
+        return hidden_states
+
+
+class Exaone4ForCausalLM(nn.Module):
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+    base_model_prefix = "language_model"
+
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    bitsandbytes_stacked_params_mapping = {
+        ".q_proj": (".qkv_proj", 0),
+        ".k_proj": (".qkv_proj", 1),
+        ".v_proj": (".qkv_proj", 2),
+        ".gate_proj": (".gate_up_proj", 0),
+        ".up_proj": (".gate_up_proj", 1),
+    }
+
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    def __init__(
+        self,
+        config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.quant_config = quant_config
+
+        self.model = self._init_model(config, quant_config, add_prefix("model", prefix))
+        # Exaone-4.0 32B set tie_word_embeddins to False
+        # Exaone-4.0 1.2B set tie_word_embeddins to True
+        if config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+                use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
+            )
+
+        self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+
+    def _init_model(
+        self,
+        config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        return Exaone4Model(config, quant_config=quant_config, prefix=prefix)
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.embed_tokens
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: Optional[torch.Tensor] = None,
+        get_embedding: bool = False,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> LogitsProcessorOutput:
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            input_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+
+        if self.pp_group.is_last_rank:
+            if not get_embedding:
+                return self.logits_processor(
+                    input_ids,
+                    hidden_states,
+                    self.lm_head,
+                    forward_batch,
+                )
+            else:
+                return self.pooler(hidden_states, forward_batch)
+        else:
+            return hidden_states
+
+    @torch.no_grad()
+    def forward_split_prefill(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        split_interval: Tuple[int, int],  # [start, end) 0-based
+        input_embeds: torch.Tensor = None,
+    ):
+        start, end = split_interval
+        # embed
+        if start == 0:
+            if input_embeds is None:
+                forward_batch.hidden_states = self.model.embed_tokens(input_ids)
+            else:
+                forward_batch.hidden_states = input_embeds
+        # decoder layer
+        for i in range(start, end):
+            layer = self.model.layers[i]
+            forward_batch.hidden_states, forward_batch.residual = layer(
+                positions,
+                forward_batch.hidden_states,
+                forward_batch,
+                forward_batch.residual,
+            )
+
+        if end == self.model.config.num_hidden_layers:
+            # norm
+            hidden_states, _ = self.model.norm(
+                forward_batch.hidden_states, forward_batch.residual
+            )
+            forward_batch.hidden_states = hidden_states
+            # logits process
+            result = self.logits_processor(
+                input_ids, forward_batch.hidden_states, self.lm_head, forward_batch
+            )
+        else:
+            result = None
+
+        return result
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer
+
+    def get_attention_sliding_window_size(self):
+        return get_attention_sliding_window_size(self.config)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+
+        for name, loaded_weight in weights:
+            layer_id = get_layer_id(name)
+            if (
+                layer_id is not None
+                and hasattr(self.model, "start_layer")
+                and (
+                    layer_id < self.model.start_layer
+                    or layer_id >= self.model.end_layer
+                )
+            ):
+                continue
+            if "rotary_emb.inv_freq" in name or "projector" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            if name.startswith("model.vision_tower") and name not in params_dict:
+                continue
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+            # Handle FP8 kv-scale remapping
+            if "scale" in name:
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Skip loading kv_scale from ckpts towards new design.
+                if name.endswith(".kv_scale") and name not in params_dict:
+                    continue
+                if name in params_dict.keys():
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+                else:
+                    logger.warning(f"Parameter {name} not found in params_dict")
+
+    def get_weights_by_name(
+        self, name: str, truncate_size: int = 100, tp_size: int = 1
+    ) -> Optional[torch.Tensor]:
+        """Get the weights of the parameter by its name. Similar to `get_parameter` in Hugging Face.
+
+        Only used for unit test with an unoptimized performance.
+        For optimized performance, please use torch.save and torch.load.
+        """
+        try:
+            if name == "lm_head.weight" and self.config.tie_word_embeddings:
+                logger.info(
+                    "word embedding is tied for this model, return embed_tokens.weight as lm_head.weight."
+                )
+                return (
+                    self.model.embed_tokens.weight.cpu()
+                    .to(torch.float32)
+                    .numpy()
+                    .tolist()[:truncate_size]
+                )
+
+            mapped_name = name
+            mapped_shard_id = None
+            for param_name, weight_name, shard_id in self.stacked_params_mapping:
+                if weight_name in name:
+                    mapped_name = name.replace(weight_name, param_name)
+                    mapped_shard_id = shard_id
+                    break
+            params_dict = dict(self.named_parameters())
+            param = params_dict[mapped_name]
+            if mapped_shard_id is not None:
+                if mapped_shard_id in ["q", "k", "v"]:
+                    num_heads = self.config.num_attention_heads // tp_size
+                    num_kv_heads = self.config.num_key_value_heads // tp_size
+                    head_dim = (
+                        self.config.hidden_size // self.config.num_attention_heads
+                    )
+                    if mapped_shard_id == "q":
+                        offset = 0
+                        size = num_heads * head_dim
+                    elif mapped_shard_id == "k":
+                        offset = num_heads * head_dim
+                        size = num_kv_heads * head_dim
+                    elif mapped_shard_id == "v":
+                        offset = (num_heads + num_kv_heads) * head_dim
+                        size = num_kv_heads * head_dim
+                    weight = param.data.narrow(0, offset, size)
+                elif mapped_shard_id in [0, 1]:
+                    intermediate_size = self.config.intermediate_size
+                    slice_size = intermediate_size // tp_size
+                    if mapped_shard_id == 0:  # gate_proj
+                        offset = 0
+                        size = slice_size
+                    elif mapped_shard_id == 1:  # up_proj
+                        offset = slice_size
+                        size = slice_size
+
+                    weight = param.data.narrow(0, offset, size)
+                else:
+                    weight = param.data
+            else:
+                weight = param.data
+            if tp_size > 1 and ("o_proj" in name or "down_proj" in name):
+                gathered_weights = [torch.zeros_like(weight) for _ in range(tp_size)]
+                torch.distributed.all_gather(gathered_weights, weight)
+                weight = torch.cat(gathered_weights, dim=1)
+            return weight.cpu().to(torch.float32).numpy().tolist()[:truncate_size]
+
+        except Exception:
+            logger.error(
+                f"Error getting weights by name {name} in Exaone4ForCausalLM: {get_exception_traceback()}"
+            )
+            return None
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        del self.lm_head.weight
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    def get_embed(self):
+        return self.model.embed_tokens.weight
+
+    def set_embed(self, embed):
+        # NOTE: If draft hidden size != target hidden size, the embed weight cannot be shared for EAGLE3
+        if (
+            hasattr(self.config, "target_hidden_size")
+            and self.config.target_hidden_size != self.config.hidden_size
+        ):
+            return
+        del self.model.embed_tokens.weight
+        self.model.embed_tokens.weight = embed
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+        self.model.load_kv_cache_scales(quantization_param_path)
+
+
+EntryClass = Exaone4ForCausalLM
diff --git a/sglang/python/sglang/srt/models/exaone_moe.py b/sglang/python/sglang/srt/models/exaone_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab31c0a597241dea78455dcc0747e2e1ff603010
--- /dev/null
+++ b/sglang/python/sglang/srt/models/exaone_moe.py
@@ -0,0 +1,881 @@
+# Copyright 2025 The LG AI Research Team
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from the vLLM version of EXAONE-MoE model
+"""Inference-only ExaoneMoE model compatible with HuggingFace weights."""
+
+import logging
+from collections.abc import Iterable
+from typing import Any, Dict, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import (
+    get_moe_expert_parallel_world_size,
+    get_pp_group,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation
+from sglang.srt.eplb.expert_location_dispatch import ExpertLocationDispatchInfo
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.moe import get_moe_a2a_backend
+from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
+from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.moe.utils import RoutingMethodType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import LazyValue, add_prefix, is_cuda, make_layers
+
+logger = logging.getLogger(__name__)
+
+_is_cuda = is_cuda()
+
+
+class ExaoneMoEMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        reduce_results: bool = True,
+        prefix: str = "",
+        tp_rank: Optional[int] = None,
+        tp_size: Optional[int] = None,
+    ) -> None:
+        super().__init__()
+        gateup_quant_config = quant_config
+        down_quant_config = quant_config
+        if quant_config and hasattr(quant_config, "ignore") and quant_config.ignore:
+            if add_prefix("gate_proj", prefix) in quant_config.ignore:
+                gateup_quant_config = None
+            if add_prefix("down_proj", prefix) in quant_config.ignore:
+                down_quant_config = None
+
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=gateup_quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+            tp_rank=tp_rank,
+            tp_size=tp_size,
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=down_quant_config,
+            reduce_results=reduce_results,
+            prefix=add_prefix("down_proj", prefix),
+            tp_rank=tp_rank,
+            tp_size=tp_size,
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(
+        self,
+        x,
+        forward_batch=None,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+    ):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(
+            x,
+            skip_all_reduce=should_allreduce_fusion or use_reduce_scatter,
+        )
+        return x
+
+
+class ExaoneMoESparseMoEBlock(nn.Module):
+    def __init__(
+        self,
+        layer_id: int,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        alt_stream: Optional[torch.cuda.Stream] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.moe_ep_size = get_moe_expert_parallel_world_size()
+        self.layer_id = layer_id
+        self.routed_scaling_factor = config.routed_scaling_factor
+        self.alt_stream = alt_stream
+
+        self.n_routed_experts = config.num_experts
+
+        if self.tp_size > config.num_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {config.num_experts}."
+            )
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            config.num_experts,
+            bias=False,
+            quant_config=None,
+            prefix=add_prefix("gate", prefix),
+        )
+
+        self.e_score_correction_bias = nn.Parameter(
+            torch.empty(config.num_experts, dtype=torch.float32)
+        )
+
+        self.experts = get_moe_impl_class(quant_config)(
+            num_experts=config.num_experts
+            + get_global_server_args().ep_num_redundant_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            layer_id=self.layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("experts", prefix),
+            routing_method_type=RoutingMethodType.RenormalizeNaive,
+        )
+
+        self.topk = TopK(
+            top_k=config.num_experts_per_tok,
+            renormalize=config.norm_topk_prob,
+            use_grouped_topk=True,
+            num_expert_group=config.n_group,
+            topk_group=config.topk_group,
+            correction_bias=self.e_score_correction_bias,
+            routed_scaling_factor=self.routed_scaling_factor,
+            apply_routed_scaling_factor_on_output=True,
+            scoring_func="sigmoid",
+        )
+
+        if config.num_shared_experts is not None:
+            intermediate_size = config.moe_intermediate_size * config.num_shared_experts
+            self.shared_experts = ExaoneMoEMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                reduce_results=False,
+                prefix=add_prefix("shared_experts", prefix),
+                **(
+                    dict(tp_rank=0, tp_size=1)
+                    if get_moe_a2a_backend().is_deepep()
+                    else {}
+                ),
+            )
+
+        if get_moe_a2a_backend().is_deepep():
+            self.ep_size = get_moe_expert_parallel_world_size()
+            self.num_experts = (
+                config.num_experts + get_global_server_args().ep_num_redundant_experts
+            )
+            self.top_k = config.num_experts_per_tok
+
+    def get_moe_weights(self):
+        return [
+            x.data
+            for name, x in self.experts.named_parameters()
+            if name not in ["correction_bias"]
+        ]
+
+    def _forward_shared_experts(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        shared_output = self.shared_experts(hidden_states)
+        return shared_output
+
+    def _forward_deepep(self, hidden_states: torch.Tensor, forward_batch: ForwardBatch):
+        shared_output = None
+        if hidden_states.shape[0] > 0:
+            router_logits, _ = self.gate(hidden_states)
+            shared_output = self._forward_shared_experts(hidden_states)
+            topk_output = self.topk(
+                hidden_states,
+                router_logits,
+                num_token_non_padded=forward_batch.num_token_non_padded,
+                expert_location_dispatch_info=ExpertLocationDispatchInfo.init_new(
+                    layer_id=self.layer_id,
+                ),
+            )
+        else:
+            topk_output = self.topk.empty_topk_output(hidden_states.device)
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states,
+            topk_output=topk_output,
+        )
+
+        if shared_output is not None:
+            final_hidden_states.add_(shared_output)
+
+        return final_hidden_states
+
+    def _forward_router_experts(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        router_logits, _ = self.gate(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        return self.experts(hidden_states, topk_output)
+
+    def forward_normal_dual_stream(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        current_stream = torch.cuda.current_stream()
+        self.alt_stream.wait_stream(current_stream)
+
+        shared_output = self._forward_shared_experts(hidden_states.clone())
+
+        with torch.cuda.stream(self.alt_stream):
+            router_output = self._forward_router_experts(hidden_states)
+
+        current_stream.wait_stream(self.alt_stream)
+
+        return router_output, shared_output
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: Optional[ForwardBatch] = None,
+        use_reduce_scatter: bool = False,
+    ) -> torch.Tensor:
+        num_tokens, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+
+        if get_moe_a2a_backend().is_deepep():
+            return self._forward_deepep(hidden_states, forward_batch)
+
+        if (
+            self.alt_stream is not None
+            and hidden_states.shape[0] > 0
+            and get_is_capture_mode()
+        ):
+            final_hidden_states, shared_output = self.forward_normal_dual_stream(
+                hidden_states
+            )
+        else:
+            shared_output = self._forward_shared_experts(hidden_states)
+            final_hidden_states = self._forward_router_experts(hidden_states)
+
+        if shared_output is not None:
+            final_hidden_states = final_hidden_states + shared_output
+        if self.tp_size > 1 and not use_reduce_scatter:
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+
+        return final_hidden_states.view(num_tokens, hidden_dim)
+
+
+class ExaoneMoEAttention(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 1000000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        rope_is_neox_style: bool = True,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        bias: bool = False,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        attn_tp_rank = get_attention_tp_rank()
+        attn_tp_size = get_attention_tp_size()
+
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % attn_tp_size == 0
+        self.num_heads = self.total_num_heads // attn_tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= attn_tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % attn_tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert attn_tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // attn_tp_size)
+        # MistralConfig has an optional head_dim introduced by Mistral-Nemo
+        self.head_dim = getattr(
+            config, "head_dim", self.hidden_size // self.total_num_heads
+        )
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.max_position_embeddings = max_position_embeddings
+
+        qkv_quant_config = quant_config
+        o_quant_config = quant_config
+        if quant_config and hasattr(quant_config, "ignore") and quant_config.ignore:
+            if add_prefix("q_proj", prefix) in quant_config.ignore:
+                qkv_quant_config = None
+            if add_prefix("o_proj", prefix) in quant_config.ignore:
+                o_quant_config = None
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=bias,
+            quant_config=qkv_quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=bias,
+            quant_config=o_quant_config,
+            prefix=add_prefix("o_proj", prefix),
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+        )
+
+        self.q_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+        self.k_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+        if quant_config is not None and quant_config.get_name() == "gguf":
+            rope_is_neox_style = False
+
+        self.sliding_window = config.layer_types[layer_id] == "sliding_attention"
+
+        # apply rotary embeddings to every layer in full attention models
+        self.apply_rope_all_layers = "sliding_attention" not in config.layer_types
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            is_neox_style=rope_is_neox_style,
+        )
+
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            prefix=add_prefix("attn", prefix),
+            sliding_window_size=(
+                config.sliding_window if self.sliding_window else None
+            ),
+        )
+        self.layer_id = layer_id
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+
+        q = q.reshape(-1, self.head_dim)
+        q = self.q_norm(q)
+        q = q.reshape(-1, self.num_heads * self.head_dim)
+
+        k = k.reshape(-1, self.head_dim)
+        k = self.k_norm(k)
+        k = k.reshape(-1, self.num_kv_heads * self.head_dim)
+
+        if self.sliding_window or self.apply_rope_all_layers:
+            q, k = self.rotary_emb(positions, q, k)
+
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+
+        return output
+
+
+class ExaoneMoEDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.config = config
+        rope_theta = getattr(config, "rope_theta", 1000000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        if rope_scaling is not None and getattr(
+            config, "original_max_position_embeddings", None
+        ):
+            rope_scaling["original_max_position_embeddings"] = (
+                config.original_max_position_embeddings
+            )
+        rope_is_neox_style = getattr(config, "rope_is_neox_style", True)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 131072)
+
+        attention_bias = getattr(config, "attention_bias", False) or getattr(
+            config, "bias", False
+        )
+        self.attn_tp_size = get_attention_tp_size()
+        self.attn_tp_rank = get_attention_tp_rank()
+
+        self.self_attn = ExaoneMoEAttention(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            rope_is_neox_style=rope_is_neox_style,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            bias=attention_bias,
+            prefix=add_prefix("self_attn", prefix),
+        )
+
+        if config.is_moe_layer[layer_id]:
+            self.mlp = ExaoneMoESparseMoEBlock(
+                layer_id=layer_id,
+                config=config,
+                quant_config=quant_config,
+                alt_stream=alt_stream,
+                prefix=add_prefix("mlp", prefix),
+            )
+        else:
+            self.mlp = ExaoneMoEMLP(
+                hidden_size=self.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+            )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+
+        # Self Attention
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        # Fully Connected
+        hidden_states = self.mlp(hidden_states)
+
+        return hidden_states, residual
+
+
+class ExaoneMoEModel(nn.Module):
+    fall_back_to_pt_during_load = False
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.pp_group = get_pp_group()
+
+        if self.pp_group.is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                enable_tp=not is_dp_attention_enabled(),
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: ExaoneMoEDecoderLayer(
+                layer_id=idx,
+                config=config,
+                quant_config=quant_config,
+                prefix=prefix,
+                alt_stream=alt_stream,
+            ),
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix=add_prefix("layers", prefix),
+        )
+
+        if self.pp_group.is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer(return_tuple=True)
+
+        # for EAGLE3 support
+        self.layers_to_capture = []
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[torch.Tensor, PPProxyTensors]:
+        if self.pp_group.is_first_rank:
+            if input_embeds is None:
+                hidden_states = self.embed_tokens(input_ids)
+            else:
+                hidden_states = input_embeds
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+
+        aux_hidden_states = []
+        for i in range(self.start_layer, self.end_layer):
+            with get_global_expert_distribution_recorder().with_current_layer(i):
+                if i in self.layers_to_capture:
+                    aux_hidden_states.append(hidden_states + residual)
+                layer = self.layers[i]
+                hidden_states, residual = layer(
+                    positions, hidden_states, forward_batch, residual
+                )
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors(
+                {
+                    "hidden_states": hidden_states,
+                    "residual": residual,
+                }
+            )
+        else:
+            if hidden_states.shape[0] != 0:
+                if residual is None:
+                    hidden_states = self.norm(hidden_states)
+                else:
+                    hidden_states, _ = self.norm(hidden_states, residual)
+        if len(aux_hidden_states) == 0:
+            return hidden_states
+
+        return hidden_states, aux_hidden_states
+
+
+class ExaoneMoEForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.quant_config = quant_config
+        alt_stream = torch.cuda.Stream() if _is_cuda else None
+        self.model = ExaoneMoEModel(
+            config,
+            quant_config=quant_config,
+            prefix=add_prefix("model", prefix),
+            alt_stream=alt_stream,
+        )
+        if self.config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+                use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
+            )
+        self.logits_processor = LogitsProcessor(config)
+        # For EAGLE3 support
+        self.capture_aux_hidden_states = False
+
+        self._routed_experts_weights_of_layer = LazyValue(
+            lambda: {
+                layer_id: self.model.layers[layer_id].mlp.get_moe_weights()
+                for layer_id in range(self.start_layer, self.end_layer)
+                if isinstance(self.model.layers[layer_id].mlp, ExaoneMoESparseMoEBlock)
+            }
+        )
+
+    @property
+    def routed_experts_weights_of_layer(self):
+        return self._routed_experts_weights_of_layer.value
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> LogitsProcessorOutput:
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            input_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+
+        aux_hidden_states = None
+        if self.capture_aux_hidden_states:
+            hidden_states, aux_hidden_states = hidden_states
+
+        if self.pp_group.is_last_rank:
+            return self.logits_processor(
+                input_ids,
+                hidden_states,
+                self.lm_head,
+                forward_batch,
+                aux_hidden_states,
+            )
+        else:
+            return hidden_states
+
+    @torch.no_grad()
+    def forward_split_prefill(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        split_interval: Tuple[int, int],  # [start, end) 0-based
+        input_embeds: torch.Tensor = None,
+    ):
+        start, end = split_interval
+        # embed
+        if start == 0:
+            if input_embeds is None:
+                forward_batch.hidden_states = self.model.embed_tokens(input_ids)
+            else:
+                forward_batch.hidden_states = input_embeds
+        # decoder layer
+        for i in range(start, end):
+            layer = self.model.layers[i]
+            forward_batch.hidden_states, forward_batch.residual = layer(
+                positions,
+                forward_batch.hidden_states,
+                forward_batch,
+                forward_batch.residual,
+            )
+
+        if end == self.model.config.num_hidden_layers:
+            # norm
+            hidden_states, _ = self.model.norm(
+                forward_batch.hidden_states, forward_batch.residual
+            )
+            forward_batch.hidden_states = hidden_states
+            # logits process
+            result = self.logits_processor(
+                input_ids, forward_batch.hidden_states, self.lm_head, forward_batch
+            )
+        else:
+            result = None
+
+        return result
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        del self.lm_head.weight
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    def load_weights(
+        self, weights: Iterable[Tuple[str, torch.Tensor]], is_mtp: bool = False
+    ):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.num_experts,
+        )
+
+        params_dict = dict(self.named_parameters())
+
+        for name, loaded_weight in weights:
+            if is_mtp:
+                if "mtp" not in name:
+                    continue
+                if name in [
+                    "mtp.fc.weight",
+                    "mtp.pre_fc_norm_embedding.weight",
+                    "mtp.pre_fc_norm_hidden.weight",
+                ]:
+                    name = name.replace("mtp.", "")
+                else:
+                    name = name.replace("mtp", "model")
+
+            if not is_mtp and "mtp" in name:
+                continue
+
+            if "rotary_emb.inv_freq" in name or "projector" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            if name.startswith("model.vision_tower") and name not in params_dict:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                if "mlp.experts" in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        expert_id=expert_id,
+                        shard_id=shard_id,
+                    )
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+
+                    if name not in params_dict:
+                        continue
+
+                    if name in params_dict.keys():
+                        param = params_dict[name]
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        weight_loader(param, loaded_weight)
+                    else:
+                        logger.warning(f"Parameter {name} not found in params_dict")
+
+    @classmethod
+    def get_model_config_for_expert_location(cls, config):
+        return ModelConfigForExpertLocation(
+            num_layers=config.num_hidden_layers,
+            num_logical_experts=config.num_experts,
+            num_groups=None,
+        )
+
+    def set_eagle3_layers_to_capture(self, layer_ids: Optional[list[int]] = None):
+        if not get_pp_group().is_last_rank:
+            return
+
+        self.capture_aux_hidden_states = True
+        if layer_ids is None:
+            num_layers = self.config.num_hidden_layers
+            self.model.layers_to_capture = [
+                2,
+                num_layers // 2,
+                num_layers - 3,
+            ]  # Specific layers for EAGLE3 support
+        else:
+            self.model.layers_to_capture = [val + 1 for val in layer_ids]
+
+
+EntryClass = ExaoneMoEForCausalLM
diff --git a/sglang/python/sglang/srt/models/exaone_moe_mtp.py b/sglang/python/sglang/srt/models/exaone_moe_mtp.py
new file mode 100644
index 0000000000000000000000000000000000000000..05e63dcae992615006e3a7c016c450d267823c20
--- /dev/null
+++ b/sglang/python/sglang/srt/models/exaone_moe_mtp.py
@@ -0,0 +1,106 @@
+# Copyright 2025 The LG AI Research Team
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from the vLLM version of EXAONE-MoE MTP
+"""Inference-only ExaoneMoE MTP Speculative Decoding."""
+
+import logging
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.exaone_moe import ExaoneMoEForCausalLM, ExaoneMoEModel
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import add_prefix
+
+logger = logging.getLogger(__name__)
+
+
+class ExaoneMoEForCausalLMMTP(ExaoneMoEForCausalLM):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        nn.Module.__init__(self)
+        self.config = config
+        config.num_hidden_layers = 1
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.quant_config = quant_config
+        self.pp_group = get_pp_group()
+
+        self.fc = nn.Linear(2 * config.hidden_size, config.hidden_size, bias=False)
+        self.pre_fc_norm_embedding = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.pre_fc_norm_hidden = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.model = ExaoneMoEModel(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("lm_head", prefix),
+            use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: Optional[torch.Tensor] = None,
+        **kwargs,
+    ):
+        if input_embeds is None:
+            input_embeds = self.model.embed_tokens(input_ids)
+
+        hidden_states = forward_batch.spec_info.hidden_states
+
+        if not forward_batch.forward_mode.is_idle():
+            input_embeds = self.pre_fc_norm_embedding(input_embeds)
+            hidden_states = self.pre_fc_norm_hidden(hidden_states)
+        hidden_states = self.fc(torch.cat((input_embeds, hidden_states), dim=-1))
+
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            hidden_states,
+        )
+
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(
+        self, weights: Iterable[Tuple[str, torch.Tensor]], is_mtp: bool = False
+    ):
+        super().load_weights(weights, is_mtp=True)
+
+
+EntryClass = ExaoneMoEForCausalLMMTP
diff --git a/sglang/python/sglang/srt/models/falcon_h1.py b/sglang/python/sglang/srt/models/falcon_h1.py
new file mode 100644
index 0000000000000000000000000000000000000000..628f99c6e46e54ff2e35d168c8c990561ace1797
--- /dev/null
+++ b/sglang/python/sglang/srt/models/falcon_h1.py
@@ -0,0 +1,572 @@
+import logging
+from typing import Any, Iterable, List, Optional, Set, Tuple
+
+import torch
+from torch import nn
+
+from sglang.srt.configs.falcon_h1 import FalconH1Config
+from sglang.srt.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.attention.hybrid_linear_attn_backend import (
+    HybridLinearAttnBackend,
+    Mamba2AttnBackend,
+)
+from sglang.srt.layers.attention.mamba.mamba import MambaMixer2
+from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import add_prefix, is_cuda, make_layers
+
+logger = logging.getLogger(__name__)
+_is_cuda = is_cuda()
+
+
+class FalconH1MLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        layer_id: int,
+        mlp_multipliers: List[float],
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        reduce_results: bool = True,
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+            reduce_results=reduce_results,
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+        self.layer_id = layer_id
+
+        self.intermediate_size = intermediate_size
+        self.tp_size = get_tensor_model_parallel_world_size()
+
+        self.gate_multiplier, self.down_multiplier = mlp_multipliers
+
+    def forward(
+        self,
+        x,
+        forward_batch=None,
+        use_reduce_scatter: bool = False,
+    ):
+        gate_up, _ = self.gate_up_proj(x)
+        gate_up[:, : self.intermediate_size // self.tp_size] *= self.gate_multiplier
+
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(
+            x,
+            skip_all_reduce=use_reduce_scatter,
+        )
+        x = x * self.down_multiplier
+        return x
+
+
+class FalconH1HybridAttentionDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: FalconH1Config,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.attn_tp_rank = get_attention_tp_rank()
+        self.attn_tp_size = get_attention_tp_size()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = config.num_attention_heads
+        assert self.total_num_heads % self.attn_tp_size == 0
+        self.num_heads = self.total_num_heads // self.attn_tp_size
+        self.total_num_kv_heads = config.num_key_value_heads
+        if self.total_num_kv_heads >= self.attn_tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % self.attn_tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.attn_tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // self.attn_tp_size)
+        self.head_dim = config.head_dim or (self.hidden_size // self.num_heads)
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = getattr(config, "rope_theta", 10000)
+        self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        self.rope_scaling = getattr(config, "rope_scaling", None)
+        self.partial_rotary_factor = getattr(config, "partial_rotary_factor", 1)
+        self.layer_id = layer_id
+
+        self.rotary_emb = get_rope(
+            head_size=self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=self.max_position_embeddings,
+            rope_scaling=self.rope_scaling,
+            base=self.rope_theta,
+            partial_rotary_factor=self.partial_rotary_factor,
+            is_neox_style=True,
+            dtype=torch.get_default_dtype(),  # see impl of get_rope
+        )
+
+        self.qkv_proj = QKVParallelLinear(
+            config.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            tp_rank=self.attn_tp_rank,
+            tp_size=self.attn_tp_size,
+        )
+
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            reduce_results=False,
+            tp_rank=self.attn_tp_rank,
+            tp_size=self.attn_tp_size,
+        )
+
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            prefix=f"{prefix}.attn",
+        )
+
+        self.d_ssm = (
+            int(config.mamba_expand * config.hidden_size)
+            if config.mamba_d_ssm is None
+            else config.mamba_d_ssm
+        )
+
+        self.mamba = MambaMixer2(
+            cache_params=config.mamba2_cache_params,
+            hidden_size=config.hidden_size,
+            use_conv_bias=config.mamba_conv_bias,
+            use_bias=config.mamba_proj_bias,
+            n_groups=config.mamba_n_groups,
+            rms_norm_eps=config.rms_norm_eps,
+            activation=config.hidden_act,
+            use_rms_norm=config.mamba_rms_norm,
+            prefix=f"{prefix}.mixer",
+        )
+
+        # FalconH1 all layers are dense and have no nextn now
+        self.is_layer_sparse = False
+        is_previous_layer_sparse = False
+        is_next_layer_sparse = False
+
+        self.layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=layer_id,
+            num_layers=config.num_hidden_layers,
+            is_layer_sparse=self.is_layer_sparse,
+            is_previous_layer_sparse=is_previous_layer_sparse,
+            is_next_layer_sparse=is_next_layer_sparse,
+        )
+
+        self.feed_forward = FalconH1MLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            layer_id=layer_id,
+            mlp_multipliers=config.mlp_multipliers,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.pre_ff_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        self.q_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+        self.k_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+        self.layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.layer_scatter_modes,
+            input_layernorm=self.input_layernorm,
+            post_attention_layernorm=self.pre_ff_layernorm,
+            allow_reduce_scatter=True,
+        )
+
+        self.alt_stream = alt_stream
+        self.key_multiplier = config.key_multiplier
+
+        self.ssm_out_multiplier = config.ssm_out_multiplier
+        self.ssm_in_multiplier = config.ssm_in_multiplier
+
+        self.attention_in_multiplier = config.attention_in_multiplier
+        self.attn_out_multiplier = config.attention_out_multiplier
+
+        self.groups_time_state_size = self.mamba.n_groups * config.mamba_d_state
+        self.zxbcdt_multipliers = config.ssm_multipliers
+        self._init_mup_vector()
+
+    def _init_mup_vector(self):
+        """
+        Non learnable per-block scaling vector composed of element-wise
+        multipliersapplied to each separate contiguous block of the output
+        of the linear projection (in_proj) before further processing
+        (gating, convolution, SSM):
+
+            - Z block:  [0 : d_ssm]                      → zxbcdt_multipliers[0]
+            - X block:  [d_ssm : 2 * d_ssm]              → zxbcdt_multipliers[1]
+            - B block:  [2 * d_ssm : 2 * d_ssm + G * S]  → zxbcdt_multipliers[2]
+            - C block:  [2 * d_ssm + G * S : 2 * d_ssm + 2 * G * S]
+                        → zxbcdt_multipliers[3]
+            - dt block: [2 * d_ssm + 2 * G * S : end]    → zxbcdt_multipliers[4]
+
+        where:
+            - d_ssm:     Dimension of state-space model latent
+            - G:         Number of groups (n_groups)
+            - S:         SSM state size per group
+            - All indices are divided by tp_size to support tensor parallelism
+        """
+        vector_shape = (
+            2 * self.d_ssm + 2 * self.groups_time_state_size + self.config.mamba_n_heads
+        ) // self.tp_size
+        mup_vector = torch.ones(1, vector_shape)
+        # Z vector 0 -> d_ssm
+        mup_vector[:, : self.d_ssm // self.tp_size] *= self.zxbcdt_multipliers[0]
+        # X vector d_ssm -> 2 * d_ssm
+        mup_vector[
+            :, (self.d_ssm // self.tp_size) : (2 * self.d_ssm // self.tp_size)
+        ] *= self.zxbcdt_multipliers[1]
+        # B vector 2 * d_ssm -> 2 * d_ssm + (n_group * d_state)
+        mup_vector[
+            :,
+            (2 * self.d_ssm)
+            // self.tp_size : (2 * self.d_ssm + self.groups_time_state_size)
+            // self.tp_size,
+        ] *= self.zxbcdt_multipliers[2]
+        # C vector 2 * d_ssm + (n_group * d_state)
+        # -> 2 * d_ssm + 2 * (n_group * d_state)
+        mup_vector[
+            :,
+            (2 * self.d_ssm + self.groups_time_state_size)
+            // self.tp_size : (2 * self.d_ssm + 2 * self.groups_time_state_size)
+            // self.tp_size,
+        ] *= self.zxbcdt_multipliers[3]
+        # dt vector 2 * d_ssm + 2 * (n_group * d_state)
+        # -> 2 * d_ssm + 2 * (n_group * d_state) + n_heads
+        mup_vector[
+            :,
+            (2 * self.d_ssm + 2 * self.groups_time_state_size) // self.tp_size :,
+        ] *= self.zxbcdt_multipliers[4]
+
+        self.register_buffer("mup_vector", mup_vector, persistent=False)
+
+    def self_attention(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        k = k * self.key_multiplier
+        q, k = self.rotary_emb(positions, q, k)
+
+        attn_output = self.attn(q, k, v, forward_batch)
+
+        output, _ = self.o_proj(attn_output)
+        return output
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: Optional[torch.Tensor],
+        forward_batch: ForwardBatch,
+        **kwargs: Any,
+    ):
+        hidden_states, residual = self.layer_communicator.prepare_attn(
+            hidden_states, residual, forward_batch
+        )
+
+        if not forward_batch.forward_mode.is_idle():
+            # Attention block
+            attention_hidden_states = self.self_attention(
+                positions=positions,
+                hidden_states=hidden_states * self.attention_in_multiplier,
+                forward_batch=forward_batch,
+            )
+            attention_hidden_states = attention_hidden_states * self.attn_out_multiplier
+
+            attn_backend = forward_batch.attn_backend
+            assert isinstance(attn_backend, HybridLinearAttnBackend)
+            assert isinstance(attn_backend.linear_attn_backend, Mamba2AttnBackend)
+            # Mamba block
+            mamba_hidden_states = torch.empty_like(hidden_states)
+            attn_backend.linear_attn_backend.forward(
+                self.mamba,
+                hidden_states * self.ssm_in_multiplier,
+                mamba_hidden_states,
+                layer_id=self.layer_id,
+                mup_vector=self.mup_vector,
+            )
+            mamba_hidden_states = mamba_hidden_states * self.ssm_out_multiplier
+
+            hidden_states = attention_hidden_states + mamba_hidden_states
+
+        # Fully Connected
+        hidden_states, residual = self.layer_communicator.prepare_mlp(
+            hidden_states, residual, forward_batch
+        )
+        use_reduce_scatter = self.layer_communicator.should_use_reduce_scatter(
+            forward_batch
+        )
+        hidden_states = self.feed_forward(
+            hidden_states, forward_batch, use_reduce_scatter
+        )
+
+        hidden_states, residual = self.layer_communicator.postprocess_layer(
+            hidden_states, residual, forward_batch
+        )
+
+        return hidden_states, residual
+
+
+ALL_DECODER_LAYER_TYPES = {
+    "falcon_h1": FalconH1HybridAttentionDecoderLayer,
+}
+
+
+class FalconH1Model(nn.Module):
+    def __init__(
+        self,
+        config: FalconH1Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+
+        alt_stream = torch.cuda.Stream() if _is_cuda else None
+        self.embedding_multiplier = config.embedding_multiplier
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+            use_attn_tp_group=is_dp_attention_enabled(),
+        )
+
+        def get_layer(idx: int, prefix: str):
+            layer_class = ALL_DECODER_LAYER_TYPES[config.layers_block_type[idx]]
+            return layer_class(
+                config,
+                idx,
+                quant_config=quant_config,
+                prefix=prefix,
+                alt_stream=alt_stream,
+            )
+
+        self.layers = make_layers(
+            config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers"
+        )
+
+        self.final_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.infer_count = 0
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        # mamba_cache_params: MambaCacheParams,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+
+        # pass a sequence index tensor, that is required for
+        # proper continuous batching computation including
+        # chunked prefill
+        if inputs_embeds is not None:
+            hidden_states = inputs_embeds * self.embedding_multiplier
+        else:
+            hidden_states = self.embed_tokens(input_ids) * self.embedding_multiplier
+
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                layer_id=i,
+                positions=positions,
+                hidden_states=hidden_states,
+                residual=residual,
+                forward_batch=forward_batch,
+            )
+
+        if not forward_batch.forward_mode.is_idle():
+            if residual is None:
+                hidden_states = self.final_layernorm(hidden_states)
+            else:
+                hidden_states, _ = self.final_layernorm(hidden_states, residual)
+
+        return hidden_states
+
+
+class FalconH1ForCausalLM(nn.Module):
+    fall_back_to_pt_during_load = False
+
+    def __init__(
+        self,
+        config: FalconH1Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.pp_group = get_pp_group()
+        assert self.pp_group.is_first_rank and self.pp_group.is_last_rank
+        self.quant_config = quant_config
+        self.model = FalconH1Model(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        if config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                org_num_embeddings=config.vocab_size,
+                prefix=add_prefix("lm_head", prefix),
+                use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
+            )
+        self.lm_head = self.lm_head.float()
+        self.lm_head_multiplier = config.lm_head_multiplier
+        self.logits_processor = LogitsProcessor(
+            config, logit_scale=self.lm_head_multiplier
+        )
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        **kwargs,
+    ):
+        hidden_states = self.model(input_ids, positions, forward_batch, inputs_embeds)
+
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        del self.lm_head.weight
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    def load_weights(
+        self, weights: Iterable[Tuple[str, torch.Tensor]], is_mtp: bool = False
+    ) -> Set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: Set[str] = set()
+        for name, loaded_weight in weights:
+
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            if ".self_attn." in name:
+                name = name.replace(".self_attn", "")
+
+            if "A_log" in name:
+                name = name.replace("A_log", "A")
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Skip layers on other devices.
+                # if is_pp_missing_parameter(name, self):
+                #     continue
+                if name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader")
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # if is_pp_missing_parameter(name, self):
+                #     continue
+
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+
+                weight_loader(param, loaded_weight)
+
+            loaded_params.add(name)
+        return loaded_params
+
+
+EntryClass = FalconH1ForCausalLM
diff --git a/sglang/python/sglang/srt/models/gemma.py b/sglang/python/sglang/srt/models/gemma.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ecb5011f71ce0d170f7e1efab776728664a271e
--- /dev/null
+++ b/sglang/python/sglang/srt/models/gemma.py
@@ -0,0 +1,410 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from:
+# https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/gemma.py#L1
+"""Inference-only Gemma model compatible with HuggingFace weights."""
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import GeluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix
+
+
+class GemmaMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        self.act_fn = GeluAndMul("none")
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class GemmaAttention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        head_dim: int,
+        layer_id: int = 0,
+        max_position_embeddings: int = 8192,
+        rope_theta: float = 10000,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = head_dim
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=self.rope_theta,
+            is_neox_style=True,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class GemmaDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = GemmaAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            head_dim=config.head_dim,
+            layer_id=layer_id,
+            max_position_embeddings=config.max_position_embeddings,
+            rope_theta=config.rope_theta,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.mlp = GemmaMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+class GemmaModel(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+        )
+        self.layers = nn.ModuleList(
+            [
+                GemmaDecoderLayer(
+                    config,
+                    i,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{i}", prefix),
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+
+        # Normalize the embedding by sqrt(hidden_size)
+        hidden_states *= self.config.hidden_size**0.5
+
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class GemmaForCausalLM(nn.Module):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "qkv_proj",
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+    ]
+    # Gemma does not apply LoRA to the embedding layer.
+    embedding_modules = {}
+    embedding_padding_modules = []
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = GemmaModel(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        return self.logits_processor(
+            input_ids, hidden_states, self.model.embed_tokens, forward_batch
+        )
+
+    @torch.no_grad()
+    def forward_split_prefill(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        split_interval: Tuple[int, int],  # [start, end) 0-based
+        input_embeds: torch.Tensor = None,
+    ):
+        start, end = split_interval
+        # embed
+        if start == 0:
+            if input_embeds is None:
+                forward_batch.hidden_states = self.model.embed_tokens(input_ids)
+            else:
+                forward_batch.hidden_states = input_embeds
+
+            # Normalize the embedding by sqrt(hidden_size)
+            forward_batch.hidden_states *= self.model.config.hidden_size**0.5
+
+        # decoder layer
+        for i in range(start, end):
+            layer = self.model.layers[i]
+            forward_batch.hidden_states, forward_batch.residual = layer(
+                positions,
+                forward_batch.hidden_states,
+                forward_batch,
+                forward_batch.residual,
+            )
+
+        if end == self.model.config.num_hidden_layers:
+            # norm
+            forward_batch.hidden_states, _ = self.model.norm(
+                forward_batch.hidden_states, forward_batch.residual
+            )
+
+            # logits process
+            result = self.logits_processor(
+                input_ids,
+                forward_batch.hidden_states,
+                self.model.embed_tokens,
+                forward_batch,
+            )
+        else:
+            result = None
+
+        return result
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        loaded_params = set()
+        for name, loaded_weight in weights:
+            for param_name, shard_name, shard_id in stacked_params_mapping:
+                if shard_name not in name:
+                    continue
+                name = name.replace(shard_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # lm_head is not used in vllm as it is tied with embed_token.
+                # To prevent errors, skip loading lm_head.weight.
+                if "lm_head.weight" in name:
+                    continue
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # GemmaRMSNorm is different from Llama's in that it multiplies
+                # (1 + weight) to the output, instead of just weight.
+                if "norm.weight" in name:
+                    loaded_weight += 1.0
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+
+
+EntryClass = GemmaForCausalLM
diff --git a/sglang/python/sglang/srt/models/gemma2.py b/sglang/python/sglang/srt/models/gemma2.py
new file mode 100644
index 0000000000000000000000000000000000000000..883eec81fe684de68e3938e504d353949b1c3f23
--- /dev/null
+++ b/sglang/python/sglang/srt/models/gemma2.py
@@ -0,0 +1,499 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from:
+# https://github.com/vllm-project/vllm/blob/56b325e977435af744f8b3dca7af0ca209663558/vllm/model_executor/models/gemma2.py
+
+from typing import Iterable, Optional, Set, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import GeluAndMul
+from sglang.srt.layers.layernorm import GemmaRMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.utils import add_prefix, is_npu, make_layers
+
+_is_npu = is_npu()
+
+
+# Aligned with HF's implementation, using sliding window inclusive with the last token
+# SGLang assumes exclusive
+def get_attention_sliding_window_size(config):
+    return config.sliding_window - 1
+
+
+class Gemma2MLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        hidden_activation: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        if not (hidden_act == hidden_activation == "gelu_pytorch_tanh"):
+            raise ValueError(
+                "Gemma2 uses `gelu_pytorch_tanh` as the hidden activation "
+                "function. Please set `hidden_act` and `hidden_activation` to "
+                "`gelu_pytorch_tanh`."
+            )
+        self.act_fn = GeluAndMul()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class Gemma2Attention(nn.Module):
+    def __init__(
+        self,
+        layer_id: int,
+        config: PretrainedConfig,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        head_dim: int,
+        max_position_embeddings: int,
+        rope_theta: float,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.layer_id = layer_id
+        self.config = config
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = head_dim
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = config.query_pre_attn_scalar**-0.5
+        self.rope_theta = rope_theta
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=config.attention_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=config.attention_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+        if (
+            not _is_npu
+            or "Gemma2ForSequenceClassification" not in self.config.architectures
+        ):
+            self.rotary_emb = get_rope(
+                self.head_dim,
+                rotary_dim=self.head_dim,
+                max_position=max_position_embeddings,
+                base=self.rope_theta,
+                is_neox_style=True,
+            )
+            logit_cap = self.config.attn_logit_softcapping
+        else:
+            self.rotary_emb = get_rope(
+                self.head_dim,
+                rotary_dim=self.head_dim,
+                max_position=max_position_embeddings,
+                base=self.rope_theta,
+                is_neox_style=True,
+                dtype=torch.float32,
+            )
+            logit_cap = 0.0
+
+        use_sliding_window = layer_id % 2 == 0 and hasattr(config, "sliding_window")
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            logit_cap=logit_cap,
+            sliding_window_size=(
+                get_attention_sliding_window_size(config)
+                if use_sliding_window
+                else None
+            ),
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class Gemma2DecoderLayer(nn.Module):
+    def __init__(
+        self,
+        layer_id: int,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.layer_id = layer_id
+        self.hidden_size = config.hidden_size
+        self.self_attn = Gemma2Attention(
+            layer_id=layer_id,
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            head_dim=config.head_dim,
+            max_position_embeddings=config.max_position_embeddings,
+            rope_theta=config.rope_theta,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.hidden_size = config.hidden_size
+        self.mlp = Gemma2MLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            hidden_activation=config.hidden_activation,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.input_layernorm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = GemmaRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.pre_feedforward_layernorm = GemmaRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.post_feedforward_layernorm = GemmaRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        hidden_states = self.post_attention_layernorm(hidden_states)
+
+        hidden_states, residual = self.pre_feedforward_layernorm(
+            hidden_states, residual
+        )
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.post_feedforward_layernorm(hidden_states)
+        return hidden_states, residual
+
+
+class Gemma2Model(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+        )
+        self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: Gemma2DecoderLayer(
+                layer_id=idx,
+                config=config,
+                quant_config=quant_config,
+            ),
+            prefix=add_prefix("layers", prefix),
+        )
+        self.norm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        # Normalize the embedding by sqrt(hidden_size)
+        # The normalizer's data type should be downcasted to the model's
+        # data type such as bfloat16, not float32.
+        # See https://github.com/huggingface/transformers/pull/29402
+        normalizer = self.config.hidden_size**0.5
+        self.register_buffer("normalizer", torch.tensor(normalizer))
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+        normalizer = torch.tensor(
+            self.config.hidden_size**0.5, dtype=hidden_states.dtype
+        )
+        hidden_states *= normalizer
+
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class Gemma2ForCausalLM(nn.Module):
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "qkv_proj",
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+    ]
+    # Gemma does not apply LoRA to the embedding layer.
+    embedding_modules = {}
+    embedding_padding_modules = []
+    supports_lora = True
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = Gemma2Model(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        return self.logits_processor(
+            input_ids, hidden_states, self.model.embed_tokens, forward_batch
+        )
+
+    @torch.no_grad()
+    def forward_split_prefill(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        split_interval: Tuple[int, int],  # [start, end) 0-based
+        input_embeds: torch.Tensor = None,
+    ):
+        start, end = split_interval
+        # embed
+        if start == 0:
+            if input_embeds is None:
+                forward_batch.hidden_states = self.model.embed_tokens(input_ids)
+            else:
+                forward_batch.hidden_states = input_embeds
+
+            # Normalize
+            normalizer = torch.tensor(
+                self.model.config.hidden_size**0.5, dtype=torch.float16
+            )
+            forward_batch.hidden_states *= normalizer
+
+        # decoder layer
+        for i in range(start, end):
+            layer = self.model.layers[i]
+            forward_batch.hidden_states, forward_batch.residual = layer(
+                positions,
+                forward_batch.hidden_states,
+                forward_batch,
+                forward_batch.residual,
+            )
+
+        if end == self.model.config.num_hidden_layers:
+            # norm
+            forward_batch.hidden_states, _ = self.model.norm(
+                forward_batch.hidden_states, forward_batch.residual
+            )
+
+            # logits process
+            result = self.logits_processor(
+                input_ids,
+                forward_batch.hidden_states,
+                self.model.embed_tokens,
+                forward_batch,
+            )
+        else:
+            result = None
+
+        return result
+
+    def get_attention_sliding_window_size(self):
+        return get_attention_sliding_window_size(self.config)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        loaded_params: Set[str] = set()
+        for name, loaded_weight in weights:
+            for param_name, shard_name, shard_id in stacked_params_mapping:
+                if shard_name not in name:
+                    continue
+                name = name.replace(shard_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # lm_head is not used in vllm as it is tied with embed_token.
+                # To prevent errors, skip loading lm_head.weight.
+                if "lm_head.weight" in name:
+                    continue
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Remapping the name of FP8 kv-scale.
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+
+
+EntryClass = Gemma2ForCausalLM
diff --git a/sglang/python/sglang/srt/models/gemma2_reward.py b/sglang/python/sglang/srt/models/gemma2_reward.py
new file mode 100644
index 0000000000000000000000000000000000000000..03bea4d1085568c09293f9aed2ff3fae614c1530
--- /dev/null
+++ b/sglang/python/sglang/srt/models/gemma2_reward.py
@@ -0,0 +1,70 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import Gemma2Config
+
+from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.gemma2 import Gemma2ForCausalLM, Gemma2Model
+from sglang.srt.utils import add_prefix
+
+
+class Gemma2ForSequenceClassification(nn.Module):
+    def __init__(
+        self,
+        config: Gemma2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.num_labels = config.num_labels
+        self.model = Gemma2Model(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=False)
+
+        self.eos_token_id = config.eos_token_id
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = True,
+    ) -> EmbeddingPoolerOutput:
+        assert (
+            get_embedding
+        ), "Gemma2ForSequenceClassification is only used for embedding"
+
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        last_token_hidden = self.pooler(hidden_states, forward_batch).embeddings
+        scores = self.score(last_token_hidden)
+
+        return EmbeddingPoolerOutput(scores)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        Gemma2ForCausalLM.load_weights(self, weights)
+
+
+EntryClass = [Gemma2ForSequenceClassification]
diff --git a/sglang/python/sglang/srt/models/gemma3_causal.py b/sglang/python/sglang/srt/models/gemma3_causal.py
new file mode 100644
index 0000000000000000000000000000000000000000..17c535d73d3fb57bf700ae9629a7a04f409158e3
--- /dev/null
+++ b/sglang/python/sglang/srt/models/gemma3_causal.py
@@ -0,0 +1,722 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+import copy
+from typing import Iterable, Optional, Set, Tuple
+
+import einops
+import torch
+from torch import nn
+from transformers import (
+    ROPE_INIT_FUNCTIONS,
+    Gemma3TextConfig,
+    PretrainedConfig,
+    PreTrainedModel,
+)
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import GeluAndMul
+from sglang.srt.layers.layernorm import Gemma3RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import AttentionType, RadixAttention
+from sglang.srt.layers.rotary_embedding import apply_rotary_pos_emb
+from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.utils import add_prefix, make_layers
+
+
+# Aligned with HF's implementation, using sliding window inclusive with the last token
+# SGLang assumes exclusive
+def get_attention_sliding_window_size(config):
+    return config.sliding_window - 1
+
+
+# Adapted from:
+# https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/gemma3.py
+def extract_layer_index(prefix: str) -> int:
+    """Extract the layer index from a prefix string."""
+    parts = prefix.split(".")
+    for part in parts:
+        if part.startswith("layers."):
+            layer_str = part.split(".")[-1]
+            try:
+                return int(layer_str)
+            except ValueError:
+                continue
+    return -1
+
+
+class Gemma3MLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_activation: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        if hidden_activation != "gelu_pytorch_tanh":
+            raise ValueError(
+                "Gemma3 uses `gelu_pytorch_tanh` as the hidden activation "
+                "function. Please set `hidden_activation` to "
+                "`gelu_pytorch_tanh`."
+            )
+        self.act_fn = GeluAndMul()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class Gemma3Attention(nn.Module):
+    def __init__(
+        self,
+        layer_id: int,
+        config: Gemma3TextConfig,
+        max_position_embeddings: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.layer_id = layer_id
+        self.config = config
+        tp_size = get_tensor_model_parallel_world_size()
+
+        self.total_num_heads = config.num_attention_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = config.num_key_value_heads
+
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+
+        hidden_size = config.hidden_size
+
+        head_dim = getattr(
+            config, "head_dim", hidden_size // config.num_attention_heads
+        )
+        self.head_dim = head_dim
+
+        self.q_size = self.num_heads * self.head_dim
+
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = config.query_pre_attn_scalar**-0.5
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=config.attention_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=config.attention_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.is_sliding = config.layer_types[layer_id] == "sliding_attention"
+
+        # Initialize the rotary embedding.
+        if self.is_sliding:
+            # Local attention. Override the values in config.json.
+            self.rope_theta = config.rope_local_base_freq
+            self.rope_scaling = {"rope_type": "default"}
+            # FIXME(mick): idk why vllm does this
+            # self.sliding_window = config.interleaved_sliding_window
+            self.sliding_window = get_attention_sliding_window_size(config)
+        else:
+            # Global attention. Use the values in config.json.
+            self.rope_theta = config.rope_theta
+            self.rope_scaling = config.rope_scaling
+            self.sliding_window = None
+
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            logit_cap=0.0,
+            # Module must also define `get_attention_sliding_window_size` to correctly initialize
+            # attention backend in `ForwardBatch`.
+            sliding_window_size=self.sliding_window,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+            attn_type=AttentionType.DECODER_BIDIRECTIONAL,
+        )
+
+        # Gemma3 adds normalization for q and k
+        self.q_norm = Gemma3RMSNorm(dim=config.head_dim, eps=config.rms_norm_eps)
+        self.k_norm = Gemma3RMSNorm(dim=config.head_dim, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
+        forward_batch: ForwardBatch,
+        **kwargs,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        # [s, h * head_dim]
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+
+        # [s, h, head_dim]
+        q = q.unflatten(-1, (self.num_heads, self.head_dim))
+        # -> [h, s, head_dim]
+        q = q.transpose(0, 1).unsqueeze(0)
+        q = self.q_norm(q)
+        k = k.unflatten(-1, (self.num_kv_heads, self.head_dim))
+        # -> [h, s, head_dim]
+        k = k.transpose(0, 1).unsqueeze(0)
+        k = self.k_norm(k)
+
+        # q, k = self.rotary_emb(positions, q, k)
+        cos, sin = position_embeddings
+        q, k = apply_rotary_pos_emb(q, k, cos, sin)
+
+        # [b, h, s, head_dim] ->  [b, s, h, head_dim]
+        q = q.permute(0, 2, 1, 3)
+        k = k.permute(0, 2, 1, 3)
+
+        attn_output = self.attn(q, k, v, forward_batch=forward_batch)
+
+        # Compatible with triton backend which returns [1, s, h, head_dim]
+        if attn_output.dim() == 4 and attn_output.shape[0] == 1:
+            attn_output = attn_output.squeeze(0)
+            attn_output = attn_output.flatten(-2, -1)
+        # [s, h * head_dim]
+
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class Gemma3DecoderLayer(nn.Module):
+    def __init__(
+        self,
+        layer_id: int,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = Gemma3Attention(
+            layer_id=layer_id,
+            config=config,
+            max_position_embeddings=config.max_position_embeddings,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.hidden_size = config.hidden_size
+        self.mlp = Gemma3MLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_activation=config.hidden_activation,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.input_layernorm = Gemma3RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.post_attention_layernorm = Gemma3RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.pre_feedforward_layernorm = Gemma3RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.post_feedforward_layernorm = Gemma3RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.is_sliding = self.self_attn.is_sliding
+        self.layer_id = layer_id
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        position_embeddings_global: torch.Tensor,
+        position_embeddings_local: torch.Tensor,
+        forward_batch: ForwardBatch,
+        **kwargs,
+    ) -> tuple[
+        torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]
+    ]:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # apply global RoPE to non-sliding layer only
+        if self.self_attn.is_sliding:
+            position_embeddings = position_embeddings_local
+        else:
+            position_embeddings = position_embeddings_global
+
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            position_embeddings=position_embeddings,
+            forward_batch=forward_batch,
+            **kwargs,
+        )
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.pre_feedforward_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.post_feedforward_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+
+        return outputs
+
+
+class Gemma3RotaryEmbedding(nn.Module):
+    def __init__(self, config: Gemma3TextConfig, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
+            self.rope_type = config.rope_scaling.get(
+                "rope_type", config.rope_scaling.get("type", "default")
+            )
+
+        else:
+            self.rope_type = "default"
+
+        if self.rope_type is None:
+            self.rope_type = "default"
+
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    def _dynamic_frequency_update(self, position_ids, device):
+        """
+        dynamic RoPE layers should recompute `inv_freq` in the following situations:
+        1 - growing beyond the cached sequence length (allow scaling)
+        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
+        """
+        seq_len = torch.max(position_ids) + 1
+        if seq_len > self.max_seq_len_cached:  # growth
+            inv_freq, self.attention_scaling = self.rope_init_fn(
+                self.config, device, seq_len=seq_len
+            )
+            self.register_buffer(
+                "inv_freq", inv_freq, persistent=False
+            )  # TODO joao: may break with compilation
+            self.max_seq_len_cached = seq_len
+
+        if (
+            seq_len < self.original_max_seq_len
+            and self.max_seq_len_cached > self.original_max_seq_len
+        ):  # reset
+            # This .to() is needed if the model has been moved to a device after being initialized (because
+            # the buffer is automatically moved, but not the original copy)
+            self.original_inv_freq = self.original_inv_freq.to(device)
+            self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+            self.max_seq_len_cached = self.original_max_seq_len
+
+    @torch.no_grad()
+    def forward(self, x, position_ids):
+        if "dynamic" in self.rope_type:
+            self._dynamic_frequency_update(position_ids, device=x.device)
+
+        # Core RoPE block
+        inv_freq_expanded = (
+            self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+        )
+        position_ids_expanded = position_ids[:, None, :].float()
+        # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
+        device_type = x.device.type
+        device_type = (
+            device_type
+            if isinstance(device_type, str) and device_type != "mps"
+            else "cpu"
+        )
+        with torch.autocast(device_type=device_type, enabled=False):
+            freqs = (
+                inv_freq_expanded.float().to(x.device) @ position_ids_expanded.float()
+            ).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos()
+            sin = emb.sin()
+
+        # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
+        cos = cos * self.attention_scaling
+        sin = sin * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+class Gemma3TextScaledWordEmbedding(nn.Embedding):
+    """
+    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
+    """
+
+    def __init__(
+        self,
+        num_embeddings: int,
+        embedding_dim: int,
+        padding_idx: int,
+        embed_scale: Optional[float] = 1.0,
+    ):
+        super().__init__(num_embeddings, embedding_dim, padding_idx)
+        self.embed_scale = embed_scale
+
+    def forward(self, input_ids: torch.Tensor):
+        return super().forward(input_ids) * self.embed_scale
+
+
+class Gemma3TextModel(PreTrainedModel):
+    def __init__(
+        self,
+        config: Gemma3TextConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(config=config)
+        self.config = config
+        self.quant_config = quant_config
+
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        # Gemma3 downcasts the below to float16, causing sqrt(3072)=55.4256 to become 55.5. See https://github.com/huggingface/transformers/pull/29402
+        self.embed_tokens = Gemma3TextScaledWordEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            self.padding_idx,
+            embed_scale=self.config.hidden_size**0.5,
+        )
+
+        self.norm = Gemma3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = Gemma3RotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+
+        # when we want to create a local RoPE layer. Config defaults should hold values for global RoPE
+        config = copy.deepcopy(config)
+        config.rope_theta = config.rope_local_base_freq
+        config.rope_scaling = {"rope_type": "default"}
+        self.rotary_emb_local = Gemma3RotaryEmbedding(config=config)
+
+        self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: Gemma3DecoderLayer(
+                layer_id=idx,
+                config=config,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
+            prefix=add_prefix("layers", prefix),
+        )
+        self.norm = Gemma3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_init()
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+
+        if positions.dim() == 1:
+            positions = einops.rearrange(positions, "s -> 1 s")
+
+        position_embeddings_global = self.rotary_emb(hidden_states, positions)
+        position_embeddings_local = self.rotary_emb_local(hidden_states, positions)
+        for layer in self.layers:
+            layer_outputs = layer(
+                positions=positions,
+                position_embeddings_global=position_embeddings_global,
+                position_embeddings_local=position_embeddings_local,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+                **kwargs,
+            )
+            hidden_states = layer_outputs[0]
+
+        hidden_states = self.norm(hidden_states)
+
+        return hidden_states
+
+
+class Gemma3ForCausalLM(PreTrainedModel):
+    config_class = Gemma3TextConfig
+
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+    config_class = Gemma3TextConfig
+    base_model_prefix = "language_model"
+
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "qkv_proj",
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+    ]
+    # Gemma does not apply LoRA to the embedding layer.
+    embedding_modules = {}
+    embedding_padding_modules = []
+    supports_lora = True
+
+    def __init__(
+        self,
+        config: Gemma3TextConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(config=config)
+        self.config = config
+        self.quant_config = quant_config
+        self.model = Gemma3TextModel(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+        if self.config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+            )
+        self.post_init()
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.embed_tokens
+
+    def get_attention_sliding_window_size(self):
+        return get_attention_sliding_window_size(self.config)
+
+    def dtype(self) -> torch.dtype:
+        return next(self.parameters()).dtype
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        **kwargs,
+    ) -> LogitsProcessor:
+        hidden_states = self.model(
+            input_ids, positions, forward_batch, input_embeds, **kwargs
+        )
+
+        return self.logits_processor(
+            input_ids, hidden_states, self.model.embed_tokens, forward_batch
+        )
+
+    @torch.no_grad()
+    def forward_split_prefill(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        split_interval: Tuple[int, int],  # [start, end) 0-based
+        input_embeds: torch.Tensor = None,
+    ):
+        start, end = split_interval
+        # embed
+        if start == 0:
+            if input_embeds is None:
+                hidden_states = self.model.embed_tokens(input_ids)
+            else:
+                hidden_states = input_embeds
+
+            if positions.dim() == 1:
+                positions = einops.rearrange(positions, "s -> 1 s")
+            position_embeddings_global = self.model.rotary_emb(hidden_states, positions)
+            position_embeddings_local = self.model.rotary_emb_local(
+                hidden_states, positions
+            )
+
+            forward_batch.hidden_states = hidden_states
+            forward_batch.model_specific_states = {
+                "positions": positions,
+                "position_embeddings_global": position_embeddings_global,
+                "position_embeddings_local": position_embeddings_local,
+            }
+
+        # decoder layer
+        for i in range(start, end):
+            layer = self.model.layers[i]
+            layer_output = layer(
+                positions=forward_batch.model_specific_states["positions"],
+                position_embeddings_global=forward_batch.model_specific_states[
+                    "position_embeddings_global"
+                ],
+                position_embeddings_local=forward_batch.model_specific_states[
+                    "position_embeddings_local"
+                ],
+                hidden_states=forward_batch.hidden_states,
+                forward_batch=forward_batch,
+            )
+            forward_batch.hidden_states = layer_output[0]
+
+        if end == self.model.config.num_hidden_layers:
+            # norm
+            forward_batch.hidden_states = self.model.norm(forward_batch.hidden_states)
+
+            # logits process
+            result = self.logits_processor(
+                input_ids,
+                forward_batch.hidden_states,
+                self.model.embed_tokens,
+                forward_batch,
+            )
+        else:
+            result = None
+
+        return result
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        loaded_params: Set[str] = set()
+        for name, loaded_weight in weights:
+            for param_name, shard_name, shard_id in stacked_params_mapping:
+                # if param_name in name:
+                # print(f"{param_name} is already in {name}")
+                if shard_name not in name:
+                    continue
+                name = name.replace(shard_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # lm_head is not used in vllm as it is tied with embed_token.
+                # To prevent errors, skip loading lm_head.weight.
+                if "lm_head.weight" in name:
+                    continue
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Remapping the name of FP8 kv-scale.
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        # unloaded_params = params_dict.keys() - loaded_params
+        # if unloaded_params:
+        #     logger.warning(
+        #         "Some weights are not initialized from checkpoints: %s", unloaded_params
+        #     )
+        return loaded_params
+
+
+EntryClass = Gemma3ForCausalLM
diff --git a/sglang/python/sglang/srt/models/gemma3_mm.py b/sglang/python/sglang/srt/models/gemma3_mm.py
new file mode 100644
index 0000000000000000000000000000000000000000..317cc71a3104b0b13b1b9132aec972d02d57285e
--- /dev/null
+++ b/sglang/python/sglang/srt/models/gemma3_mm.py
@@ -0,0 +1,484 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from:
+# https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/gemma3_mm.py
+
+import logging
+import re
+from functools import lru_cache
+from typing import Iterable, List, Optional, Set, Tuple, TypedDict
+
+import torch
+from torch import nn
+from transformers import Gemma3Config, PreTrainedModel
+
+from sglang.srt.layers.attention.triton_backend import TritonAttnBackend
+from sglang.srt.layers.layernorm import Gemma3RMSNorm
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternTokenPairs,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import (
+    MultimodalDataItem,
+    MultimodalInputs,
+    flatten_nested_list,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.models.gemma3_causal import Gemma3ForCausalLM
+from sglang.srt.models.siglip import SiglipVisionModel
+from sglang.srt.utils import add_prefix
+from sglang.srt.utils.hf_transformers_utils import get_processor
+
+logger = logging.getLogger(__name__)
+
+cached_get_processor = lru_cache(get_processor)
+
+
+class Gemma3ImagePixelInputs(TypedDict):
+    pixel_values: torch.Tensor
+    """Shape: `(batch_size * num_images, num_channels, height, width)`"""
+
+
+class Gemma3MultiModalProjector(nn.Module):
+    """Projector for Gemma3 multimodal."""
+
+    def __init__(self, config: Gemma3Config):
+        super().__init__()
+
+        self.mm_input_projection_weight = nn.Parameter(
+            torch.zeros(
+                config.vision_config.hidden_size, config.text_config.hidden_size
+            )
+        )
+
+        self.mm_soft_emb_norm = Gemma3RMSNorm(
+            config.vision_config.hidden_size, eps=config.vision_config.layer_norm_eps
+        )
+
+        self.patches_per_image = int(
+            config.vision_config.image_size // config.vision_config.patch_size
+        )
+        self.tokens_per_side = int(config.mm_tokens_per_image**0.5)
+        self.kernel_size = self.patches_per_image // self.tokens_per_side
+        self.avg_pool = nn.AvgPool2d(
+            kernel_size=self.kernel_size, stride=self.kernel_size
+        )
+
+    def forward(self, vision_outputs: torch.Tensor) -> torch.Tensor:
+        batch_size, seq_length, hidden_size = vision_outputs.shape
+
+        # Reshape for pooling
+        reshaped_vision_outputs = vision_outputs.transpose(1, 2)
+        reshaped_vision_outputs = reshaped_vision_outputs.reshape(
+            batch_size, hidden_size, self.patches_per_image, self.patches_per_image
+        )
+        reshaped_vision_outputs = reshaped_vision_outputs.contiguous()
+
+        # Apply pooling
+        pooled_vision_outputs = self.avg_pool(reshaped_vision_outputs)
+        pooled_vision_outputs = pooled_vision_outputs.flatten(2)
+        pooled_vision_outputs = pooled_vision_outputs.transpose(1, 2)
+
+        # Apply normalization
+        normed_vision_outputs = self.mm_soft_emb_norm(pooled_vision_outputs)
+
+        # Project to text embedding space
+        projected_vision_outputs = torch.matmul(
+            normed_vision_outputs, self.mm_input_projection_weight
+        )
+
+        return projected_vision_outputs.type_as(vision_outputs)
+
+
+class Gemma3ForConditionalGeneration(PreTrainedModel):
+    config_class = Gemma3Config
+    """Gemma3 multimodal model for conditional generation."""
+
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+        ".out_proj.",
+    ]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+        "out_proj": ("proj", 0),
+    }
+
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "qkv_proj",
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+    ]
+    # Gemma does not apply LoRA to the embedding layer.
+    embedding_modules = {}
+    embedding_padding_modules = []
+    supports_lora = True
+    # Pattern to match language model layers only (skip vision_tower and multi_modal_projector)
+    lora_pattern = re.compile(
+        r"^language_model\.model\.layers\.(\d+)\.(?:self_attn|mlp)\.(?:qkv_proj|o_proj|down_proj|gate_up_proj)"
+    )
+
+    def __init__(
+        self,
+        config: Gemma3Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(config=config)
+        self.config = config
+        self.quant_config = quant_config
+
+        # For LoRA compatibility: expose text_config attributes at top level
+        # This allows LoRA code to work without special multimodal handling
+        if not hasattr(config, "num_hidden_layers"):
+            config.num_hidden_layers = config.text_config.num_hidden_layers
+        if not hasattr(config, "hidden_size"):
+            config.hidden_size = config.text_config.hidden_size
+
+        self.vision_tower = SiglipVisionModel(
+            config=config.vision_config,
+            quant_config=quant_config,
+            prefix=add_prefix("vision_tower", prefix),
+        )
+
+        self.multi_modal_projector = Gemma3MultiModalProjector(config)
+        self.vocab_size = config.text_config.vocab_size
+
+        # Text model
+        self.language_model = Gemma3ForCausalLM(
+            config.text_config,
+            quant_config,
+            prefix=add_prefix("language_model", prefix),
+        )
+        if self.language_model.logits_processor.logit_scale:
+            logit_scale = getattr(config, "logit_scale", 1.0)
+            self.language_model.logits_processor.logit_scale *= logit_scale
+        self.post_init()
+
+    def pad_input_ids(
+        self, input_ids: List[int], image_inputs: MultimodalInputs
+    ) -> List[int]:
+        """Pad input IDs with image tokens."""
+        # Get special token IDs
+        im_start_id: int = image_inputs.im_start_id
+        im_end_id: int = image_inputs.im_end_id
+
+        media_token_pairs = [(im_start_id, im_end_id)]
+        pattern = MultiModalityDataPaddingPatternTokenPairs(media_token_pairs)
+        ids = pattern.pad_input_tokens(input_ids, image_inputs)
+        return ids
+
+    def prepare_attn_masks(
+        self,
+        forward_batch: ForwardBatch,
+        input_ids: torch.Tensor,
+        mask_dtype: torch.dtype,
+    ):
+        """Prepare attention masks for multimodal inputs."""
+        if isinstance(forward_batch.attn_backend, TritonAttnBackend):
+            assert forward_batch.forward_mode == ForwardMode.EXTEND
+            bidirectional_attn_masks_list = []
+            bidirectional_attn_mask_indptr = torch.zeros(
+                forward_batch.batch_size + 1, dtype=torch.int32, device=input_ids.device
+            )
+
+            for i in range(forward_batch.batch_size):
+                bidirectional_attn_mask = torch.empty(
+                    forward_batch.extend_seq_lens[i],
+                    forward_batch.extend_seq_lens[i]
+                    + forward_batch.extend_prefix_lens[i],
+                    dtype=mask_dtype,
+                    device=input_ids.device,
+                )
+                bidirectional_attn_mask.fill_(1)
+                bidirectional_attn_mask = bidirectional_attn_mask.tril(
+                    diagonal=forward_batch.extend_prefix_lens[i]
+                )
+
+                # Consider bidirectional attention between image tokens
+                mm_inputs = forward_batch.mm_inputs[i]
+                for mm_item in mm_inputs.mm_items:
+                    if mm_item.is_image():
+                        for im_begin, im_end in mm_item.offsets:
+                            if (
+                                im_begin >= forward_batch.extend_prefix_lens[i]
+                            ):  # compatible with radix cache
+                                bidirectional_attn_mask[
+                                    im_begin
+                                    - forward_batch.extend_prefix_lens[i] : im_end
+                                    + 1
+                                    - forward_batch.extend_prefix_lens[i],
+                                    im_begin : im_end + 1,
+                                ] = 1
+                bidirectional_attn_masks_list.append(bidirectional_attn_mask.flatten())
+                bidirectional_attn_mask_indptr[i + 1] = (
+                    bidirectional_attn_mask_indptr[i]
+                    + bidirectional_attn_mask.nelement()
+                )
+
+            if bidirectional_attn_masks_list:
+                bidirectional_attn_masks = torch.cat(
+                    bidirectional_attn_masks_list, dim=0
+                )
+                forward_batch.attn_backend.forward_metadata.mask_indptr = (
+                    bidirectional_attn_mask_indptr
+                )
+                forward_batch.attn_backend.forward_metadata.custom_mask = (
+                    bidirectional_attn_masks
+                )
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.language_model.get_input_embeddings()
+
+    def get_attention_sliding_window_size(self):
+        """
+        This value is used to initialize attention backends in `ForwardBatch`.
+        """
+        return self.language_model.get_attention_sliding_window_size()
+
+    def get_image_feature(self, items: List[MultimodalDataItem]):
+        """
+        Projects the last hidden state from the vision model into language model space.
+        Supports both raw image pixel values and precomputed embeddings.
+
+        Returns:
+            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
+        """
+        # Process images one by one to handle flatten_batch=True constraint in vision_tower
+        all_pixel_values = flatten_nested_list([item.feature for item in items])
+
+        final_features_list = []
+
+        for pixel_values_batch in all_pixel_values:
+            if (
+                pixel_values_batch.dim() == 3
+                and pixel_values_batch.shape[-1] == self.config.text_config.hidden_size
+            ):
+                final_features_list.append(
+                    pixel_values_batch.to(self.language_model.device)
+                )
+                continue
+
+            # Normalize input shape to [batch_size, channels, height, width]
+            if pixel_values_batch.dim() == 5:
+                pixel_values_batch = pixel_values_batch.squeeze(0)
+            elif pixel_values_batch.dim() == 3:
+                pixel_values_batch = pixel_values_batch.unsqueeze(0)
+            elif pixel_values_batch.dim() != 4:
+                raise ValueError(
+                    f"Unexpected pixel_values shape: {pixel_values_batch.shape}"
+                )
+
+            # Process each image in the batch through Vision Tower
+            batch_vision_outputs = []
+            batch_size = pixel_values_batch.shape[0]
+
+            for i in range(batch_size):
+                pixel_value = pixel_values_batch[i : i + 1]  # Keep batch dimension as 1
+                pixel_value = pixel_value.to(
+                    device=self.vision_tower.device, dtype=self.language_model.dtype()
+                )
+                vision_output = self.vision_tower(pixel_values=pixel_value)
+                batch_vision_outputs.append(vision_output)
+
+            if batch_vision_outputs:
+                vision_outputs_cat = torch.cat(batch_vision_outputs, dim=0)
+
+                projected_features = self.multi_modal_projector(vision_outputs_cat)
+                final_features_list.append(projected_features)
+
+        # Concatenate all features (all are now in text space)
+        if final_features_list:
+            return torch.cat(final_features_list, dim=0)
+        else:
+            return torch.tensor([], device=self.language_model.device)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        **kwargs: object,
+    ) -> LogitsProcessor:
+        r"""
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.
+
+            logits_to_keep (`int` or `torch.Tensor`, *optional*):
+                If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
+                `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
+                token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
+                If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
+                This is useful when using packed tensor format (single dimension for batch and sequence length).
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration
+
+        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/Gemma3-test-224px-hf")
+        >>> processor = AutoProcessor.from_pretrained("google/Gemma3-test-224px-hf")
+
+        >>> prompt = "answer en Where is the cow standing?"
+        >>> url = "https://huggingface.co/gv-hf/Gemma3-test-224px-hf/resolve/main/cow_beach_1.png"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(**inputs, max_length=30)
+        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "answer en Where is the cow standing?\nbeach"
+        ```"""
+
+        # Important: position_ids in Gemma3 are 1-indexed
+        # This really does cost me sometime
+        positions += 1
+
+        # Replace image id with PAD if the image token if OOV, to avoid index-errors
+        if input_ids is not None and self.config.image_token_index >= self.vocab_size:
+            special_image_mask = input_ids == self.config.image_token_index
+            llm_input_ids = input_ids.clone()
+            llm_input_ids[special_image_mask] = 0
+        else:
+            llm_input_ids = input_ids
+
+        # NOTE: As described in https://huggingface.co/blog/gemma3#multimodality, in the prefill stage of Gemma-3, image tokens use bidirectional attention. Currently, only the TritonAttnBackend supports bidirectional attention; other backends have not yet implemented this. Bidirectional attention is incompatible with CUDA Graph and chunked prefill.
+        if (
+            forward_batch.forward_mode
+            == ForwardMode.EXTEND  # only Extend mode is supported for now
+            and forward_batch.contains_image_inputs()  # Gemma-3 only supports image as mm inputs
+        ):
+            self.prepare_attn_masks(
+                forward_batch,
+                llm_input_ids,
+                mask_dtype=torch.bool,
+            )
+
+        hs = general_mm_embed_routine(
+            input_ids=llm_input_ids,
+            forward_batch=forward_batch,
+            language_model=self.language_model,
+            multimodal_model=self,
+            positions=positions,
+        )
+
+        return hs
+
+    def should_apply_lora(self, module_name: str) -> bool:
+        """Skip vision tower and multi_modal_projector for LoRA."""
+        return bool(self.lora_pattern.match(module_name))
+
+    def tie_weights(self):
+        return self.language_model.tie_weights()
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            ("gate_up_proj", "up_proj", 1),
+            ("gate_up_proj", "gate_proj", 0),
+        ]
+        """Load weights for the model."""
+        params_dict = dict(self.named_parameters())
+        loaded_params: Set[str] = set()
+
+        for name, loaded_weight in weights:
+            if "language_model" in name:
+                # Gemma3ForCausalLM.load_weights(self, [(name.replace("language_model.", ""), loaded_weight)])
+                causal_loaded_params = Gemma3ForCausalLM.load_weights(
+                    self, [(name, loaded_weight)]
+                )
+                loaded_params.update(causal_loaded_params)
+                continue
+            else:
+                for param_name, weight_name, shard_id in stacked_params_mapping:
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(param, loaded_weight, shard_id)
+                    break
+                else:
+                    if "vision_model" in name:
+                        # adapt to VisionAttention
+                        name = name.replace(".self_attn.out_proj", ".self_attn.proj")
+                    # Skip loading extra bias for GPTQ models
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    # Remapping the name of FP8 kv-scale
+                    name = maybe_remap_kv_scale_name(name, params_dict)
+                    if name is None:
+                        continue
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+                loaded_params.add(name)
+        unloaded_params = params_dict.keys() - loaded_params
+        if unloaded_params:
+            pass
+            # raise RuntimeError(
+            #     f"Some weights are not initialized from checkpoints: {unloaded_params}")
+        return loaded_params
+
+
+EntryClass = Gemma3ForConditionalGeneration
diff --git a/sglang/python/sglang/srt/models/gemma3n_audio.py b/sglang/python/sglang/srt/models/gemma3n_audio.py
new file mode 100644
index 0000000000000000000000000000000000000000..57a743c9f534de7585b873d2c837568c88ef804f
--- /dev/null
+++ b/sglang/python/sglang/srt/models/gemma3n_audio.py
@@ -0,0 +1,949 @@
+import math
+from typing import Optional, Sequence, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import Gemma3nAudioConfig, PreTrainedModel
+
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.models.gemma3n_causal import Gemma3nRMSNorm
+from sglang.srt.utils import add_prefix, make_layers
+
+
+class Gemma3nCumulativeGroupNorm(nn.Module):
+    """Applies Group Normalization cumulatively over the time dimension.
+
+    This layer normalizes the input by calculating the mean and variance
+    cumulatively over the time dimension (dim 1). The statistics are computed
+    over all feature dimensions (specified by `feature_dims` and `num_channels`)
+    for elements marked as valid by the optional `mask`.
+
+    If a `mask` is provided (True for valid, False for invalid/padded),
+    invalid time steps do not contribute to the statistics calculation, and
+    their corresponding output values are zeroed out.
+
+    Scale and bias, if enabled, are applied per-channel (last dimension).
+    This behavior is similar to JAX's `GroupNormalization` with `num_groups=1`
+    and `cumulative=True`.
+    """
+
+    def __init__(
+        self,
+        num_channels: int,  # Number of channels (size of the last dimension)
+        feature_dims: Sequence[
+            int
+        ],  # Sizes of non-channel feature dimensions, e.g., (H, W) for input [B,T,H,W,C]
+        eps: float = 1e-3,
+    ):
+        super().__init__()
+        self.num_channels = num_channels
+        self.feature_dims = tuple(feature_dims)
+        self.eps = eps
+
+        # Scale parameter depends only on the channel dimension
+        self.weight = nn.Parameter(torch.ones(num_channels))
+
+        # Axes for normalization: all dimensions except Batch (0) and Time (1).
+        # For input [B, T, *feature_dims, C], these are dims from 2 onwards.
+        self.reduction_axes = tuple(range(2, 2 + len(self.feature_dims) + 1))
+
+    def forward(
+        self, x: torch.Tensor, mask: Optional[torch.Tensor] = None
+    ) -> torch.Tensor:
+        """Applies cumulative group norm, optionally using a mask.
+
+        Args:
+          x: Input tensor, shape [B, T, *feature_dims, C].
+          mask: Optional boolean mask, shape [B, T]. True indicates a valid
+            (non-padded) time step. If None, all time steps are considered valid.
+
+        Returns:
+          Normalized tensor with the same shape as x.
+        """
+        expected_input_suffix = self.feature_dims + (self.num_channels,)
+        if x.shape[2:] != expected_input_suffix:
+            raise ValueError(
+                f"Input tensor shape suffix {x.shape[2:]} does not match expected"
+                f" suffix (feature_dims + num_channels) {expected_input_suffix}"
+            )
+
+        input_dtype = x.dtype
+        # Calculations are performed in float32 for numerical stability.
+        calc_dtype = torch.float32
+        x_calc = x.to(calc_dtype)
+
+        # Prepare a broadcastable mask (`mask_calc`).
+        # If no mask is provided, treat all elements as valid
+        # (mask_calc is all ones).
+        # Otherwise, expand the [B, T] mask to [B, T, 1, ..., 1] for broadcasting.
+        mask_calc = torch.ones_like(x_calc, dtype=calc_dtype)
+
+        # Cumulative Statistics Calculation
+        # 1. Sum of values over reduction axes at each time step.
+        sum_values_at_t = torch.sum(x_calc, dim=self.reduction_axes, keepdim=True)
+        # 2. Cumulative sum of values over time.
+        cum_sum_values = torch.cumsum(sum_values_at_t, dim=1)
+
+        # 3. Count of valid elements in the normalization group at each time step.
+        #    (A "group" here consists of all features at a given Batch, Time).
+        elements_in_group_at_t = torch.sum(
+            mask_calc, dim=self.reduction_axes, keepdim=True
+        )
+        # 4. Cumulative count of valid elements over time.
+        cum_count_elements = torch.cumsum(elements_in_group_at_t, dim=1)
+        # Avoid division by zero if all preceding elements were masked.
+        safe_cum_count_elements = torch.clamp(cum_count_elements, min=1.0)
+
+        # 5. Cumulative mean.
+        cum_mean = cum_sum_values / safe_cum_count_elements
+
+        # 6. Sum of squared differences from the cumulative mean.
+        #    Only sum for valid elements: (x_calc - cum_mean)^2 * mask_calc.
+        #    Using x_calc here for the difference, as cum_mean already accounts for masking.
+        squared_diff_from_mean = (x_calc - cum_mean).pow(2)
+        sum_sq_diff_at_t = torch.sum(
+            squared_diff_from_mean, dim=self.reduction_axes, keepdim=True
+        )
+
+        # 7. Cumulative sum of squared differences over time.
+        cum_sum_sq_diff = torch.cumsum(sum_sq_diff_at_t, dim=1)
+
+        # 8. Cumulative variance.
+        cum_variance = cum_sum_sq_diff / safe_cum_count_elements
+
+        # Normalize the input using the calculated cumulative statistics:
+        # (x - E[x]) / sqrt(Var[x] + eps)
+        normalized_x = (x_calc - cum_mean) * torch.rsqrt(cum_variance + self.eps)
+
+        # Apply affine transformation (scale and bias) if enabled.
+        # Scale and bias are applied per-channel (last dimension).
+        scale = self.weight.to(calc_dtype)
+        # Reshape for broadcasting: [C] -> [1, ..., 1, C]
+        scale_view_shape = [1] * (x.dim() - 1) + [self.num_channels]
+        normalized_x = normalized_x * scale.view(scale_view_shape)
+
+        # Zero out outputs for time steps that were originally masked (where mask_calc is 0).
+        # This ensures padded/invalid positions in the input result in zero output.
+        final_output = normalized_x * mask_calc
+
+        return final_output.to(input_dtype)
+
+
+class Gemma3nAudioRelativePositionEmbedding(nn.Module):
+    def __init__(
+        self,
+        config: Gemma3nAudioConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+
+        self.num_heads = self.config.conf_num_attention_heads
+        self.channels = self.config.hidden_size
+        self.head_dim = self.channels // self.num_heads
+        self.max_backward = max(0, self.config.conf_attention_context_left - 1)
+        self.max_forward = self.config.conf_attention_context_right
+
+        self.pos_proj = ColumnParallelLinear(
+            self.channels,
+            self.num_heads * self.head_dim,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("pos_proj", prefix),
+        )
+
+        min_timescale = 1.0
+        max_timescale = 1.0e4
+        num_timescales = self.channels // 2
+        log_timescale_increment = math.log(
+            float(max_timescale) / float(min_timescale)
+        ) / max(num_timescales - 1, 1)
+        inv_timescales = min_timescale * torch.exp(
+            torch.arange(num_timescales) * -log_timescale_increment
+        )
+        self.register_buffer(
+            "inv_timescales",
+            inv_timescales.float().unsqueeze(0).unsqueeze(0),
+            persistent=False,
+        )
+
+    def _get_timing_signal_1d_pos(
+        self, position: torch.Tensor, dtype: torch.dtype
+    ) -> torch.Tensor:
+        assert position.ndim == 2
+        position = position.float().unsqueeze(-1)
+        scaled_time = position * self.inv_timescales.to(
+            device=position.device, dtype=torch.float32
+        )
+        timing_signal = torch.cat(
+            [torch.sin(scaled_time), torch.cos(scaled_time)], dim=-1
+        )
+        return timing_signal.type(dtype)
+
+    def _relative_shift(
+        self,
+        term_bd_before_shift: torch.Tensor,
+        batch_size: int,
+        num_heads: int,
+        num_query_blocks: int,
+        query_block_size: int,
+        key_context_size: int,
+        max_span_plus_1: int,
+    ) -> torch.Tensor:
+        """Performs the relative shift."""
+        pad_amount_last_dim = (key_context_size + 1) - max_span_plus_1
+        padding_tuple = (0, pad_amount_last_dim)
+
+        term_bd_padded = F.pad(term_bd_before_shift, padding_tuple)
+        term_bd_reshaped = term_bd_padded.reshape(
+            (
+                batch_size,
+                num_heads,
+                num_query_blocks,
+                query_block_size * (key_context_size + 1),
+            )
+        )
+        term_bd_sliced = term_bd_reshaped[
+            :, :, :, : query_block_size * key_context_size
+        ]
+        term_bd_shifted = term_bd_sliced.reshape(
+            (
+                batch_size,
+                num_heads,
+                num_query_blocks,
+                query_block_size,
+                key_context_size,
+            )
+        )
+        return term_bd_shifted
+
+    def forward(self, queries: torch.Tensor, keys: torch.Tensor) -> torch.Tensor:
+        batch_size, num_query_blocks, query_block_size, num_heads, head_dim = (
+            queries.shape
+        )
+        _, _, key_context_size, _, _ = keys.shape
+
+        pos_indices = torch.arange(
+            self.max_backward, -self.max_forward - 1, -1, device=queries.device
+        ).unsqueeze(0)
+        max_span_plus_1 = pos_indices.shape[1]
+
+        sin_emb_timing_signal = self._get_timing_signal_1d_pos(
+            pos_indices, dtype=queries.dtype
+        )
+        projected_sin_emb, _ = self.pos_proj(sin_emb_timing_signal)
+        sin_emb = projected_sin_emb.reshape(
+            1, max_span_plus_1, self.num_heads, self.head_dim
+        ).squeeze(0)
+
+        queries_p = queries.permute(0, 3, 1, 2, 4)
+        keys_p_t = keys.permute(0, 3, 1, 4, 2)
+        term_ac = torch.matmul(queries_p, keys_p_t)
+
+        q_permuted = queries.permute(0, 3, 1, 2, 4)
+        s_permuted = sin_emb.permute(1, 2, 0)
+        q_reshaped = q_permuted.reshape(
+            batch_size, num_heads, num_query_blocks * query_block_size, head_dim
+        )
+        term_bd_unshifed_matmul = torch.matmul(q_reshaped, s_permuted)
+        term_bd_unshifed = term_bd_unshifed_matmul.reshape(
+            batch_size,
+            num_heads,
+            num_query_blocks,
+            query_block_size,
+            max_span_plus_1,
+        )
+
+        term_bd_shifted = self._relative_shift(
+            term_bd_unshifed,
+            batch_size,
+            num_heads,
+            num_query_blocks,
+            query_block_size,
+            key_context_size,
+            max_span_plus_1,
+        )
+
+        return term_ac + term_bd_shifted
+
+
+class Gemma3nAudioAttention(nn.Module):
+    """Local dot product self-attention for audio."""
+
+    def __init__(
+        self,
+        config: Gemma3nAudioConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+
+        self.num_heads = self.config.conf_num_attention_heads
+        self.hidden_size = self.config.hidden_size
+        self.head_dim = self.hidden_size // self.num_heads
+
+        self.chunk_size = self.config.conf_attention_chunk_size
+        self.max_future_horizon = self.config.conf_attention_context_right
+        self.max_past_horizon = max(0, self.config.conf_attention_context_left - 1)
+        self.attention_logits_soft_cap = self.config.conf_attention_logit_cap
+        self.context_size = (
+            self.chunk_size + self.max_past_horizon + self.max_future_horizon
+        )
+
+        self.relative_position_embedding = Gemma3nAudioRelativePositionEmbedding(
+            config,
+            quant_config,
+            prefix=add_prefix("relative_position_embedding", prefix),
+        )
+        self.per_dim_scale = nn.Parameter(torch.zeros((self.head_dim,)))
+
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.num_heads,
+            self.num_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+
+        q_scale = self.head_dim**-0.5
+        r_softplus_0 = 1.0 / F.softplus(torch.tensor(0.0))
+        self.register_buffer(
+            "q_scale", (q_scale * r_softplus_0).clone().detach(), persistent=False
+        )
+
+        # Create local causal mask
+        lower_causal_mask = torch.tril(
+            torch.ones((self.context_size, self.chunk_size), dtype=torch.bool),
+            diagonal=0,
+        ).T
+        upper_causal_mask = torch.tril(
+            torch.ones((self.chunk_size, self.context_size), dtype=torch.bool),
+            diagonal=self.max_past_horizon + self.max_future_horizon,
+        )
+        local_causal_valid_mask = torch.ones(
+            (self.chunk_size, self.context_size), dtype=torch.bool
+        )
+        local_causal_valid_mask = (
+            local_causal_valid_mask * lower_causal_mask * upper_causal_mask
+        )
+        self.register_buffer(
+            "local_causal_valid_mask", local_causal_valid_mask, persistent=False
+        )
+
+        self.register_buffer(
+            "softcap",
+            torch.tensor(self.attention_logits_soft_cap).float(),
+            persistent=False,
+        )
+
+    def _pad_dim1(
+        self, x: torch.Tensor, dim10_val: int, dim11_val: int
+    ) -> torch.Tensor:
+        padding_tuple = [0] * x.ndim * 2
+        dim_idx_from_end = x.ndim - 2
+        start_idx_for_dim = 2 * dim_idx_from_end
+        padding_tuple[start_idx_for_dim] = dim10_val
+        padding_tuple[start_idx_for_dim + 1] = dim11_val
+        return F.pad(x, tuple(padding_tuple))
+
+    def _convert_to_block(self, x: torch.Tensor) -> torch.Tensor:
+        """Turns a sequence to non overlapping blocks."""
+        shape = x.shape
+        b, t = shape[:2]
+        num_blocks = (t + self.chunk_size - 1) // self.chunk_size
+
+        if (padding_len := num_blocks * self.chunk_size - t) > 0:
+            x = self._pad_dim1(x, 0, padding_len)
+
+        permute_dims = (b, num_blocks, self.chunk_size) + shape[2:]
+        x = x.reshape(permute_dims).contiguous()
+        return x
+
+    def _extract_block_context(self, x: torch.Tensor) -> torch.Tensor:
+        """Extracts temporal context for every block."""
+        pad_left = self.max_past_horizon
+        pad_right = self.max_future_horizon + self.chunk_size - 1
+        x = self._pad_dim1(x, pad_left, pad_right)
+
+        frame_len = self.context_size
+        frame_step = self.chunk_size
+
+        x_unfolded = x.unfold(dimension=1, size=frame_len, step=frame_step)
+
+        if x.ndim > 2 and x_unfolded.ndim > 3:
+            x_unfolded = torch.movedim(x_unfolded, source=-1, destination=2)
+
+        return x_unfolded.contiguous()
+
+    def forward(self, x: torch.Tensor, mask: torch.BoolTensor) -> torch.Tensor:
+        # Project to Q, K, V
+        qkv, _ = self.qkv_proj(x)
+        query_states, key_states, value_states = qkv.chunk(chunks=3, dim=-1)
+
+        # Reshape
+        query_states = query_states.reshape(
+            *x.shape[:-1], self.num_heads, self.head_dim
+        ).contiguous()
+        key_states = key_states.reshape(
+            *x.shape[:-1], self.num_heads, self.head_dim
+        ).contiguous()
+        value_states = value_states.reshape(
+            *x.shape[:-1], self.num_heads, self.head_dim
+        ).contiguous()
+
+        # Apply per-dim scale
+        per_dim_scale_sp = F.softplus(self.per_dim_scale)
+        broadcast_shape = (1, 1, 1, self.head_dim)
+        per_dim_scale_sp_broadcast = per_dim_scale_sp.view(broadcast_shape)
+        query_states = query_states * self.q_scale * per_dim_scale_sp_broadcast
+
+        batch_size, q_time = query_states.shape[:2]
+
+        # Convert to blocks
+        query_blocks = self._convert_to_block(query_states)
+        key_blocks = self._extract_block_context(key_states)
+        value_blocks = self._extract_block_context(value_states)
+        num_query_blocks = query_blocks.shape[1]
+
+        # Create mask for valid positions
+        original_valid_mask = ~mask
+        extracted_valid_mask_blocks = self._extract_block_context(original_valid_mask)
+
+        if (
+            extracted_valid_mask_blocks.ndim == 4
+            and extracted_valid_mask_blocks.shape[0] == batch_size
+            and extracted_valid_mask_blocks.shape[1] == num_query_blocks
+            and extracted_valid_mask_blocks.shape[2]
+            * extracted_valid_mask_blocks.shape[3]
+            == self.context_size
+        ):
+            extracted_valid_mask_blocks = extracted_valid_mask_blocks.reshape(
+                batch_size, num_query_blocks, self.context_size
+            )
+
+        condition_from_input_validity = extracted_valid_mask_blocks.unsqueeze(
+            1
+        ).unsqueeze(-2)
+        condition_from_causality = (
+            self.local_causal_valid_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0)
+        )
+
+        final_condition_for_where = torch.logical_and(
+            condition_from_input_validity,
+            condition_from_causality.to(condition_from_input_validity.device),
+        )
+
+        # Compute attention scores
+        logits = self.relative_position_embedding(query_blocks, key_blocks)
+
+        # Apply attention logit softcap
+        softcap_val = self.softcap.to(logits.device)
+        logits = logits / softcap_val
+        logits = torch.tanh(logits)
+        logits = logits * softcap_val
+
+        # Apply the combined mask.
+        # final_condition_for_where will broadcast with logits [B,N,U,W,C]
+        logits = torch.where(
+            final_condition_for_where, logits, torch.finfo(logits.dtype).min
+        )
+
+        probabilities = F.softmax(logits, dim=-1, dtype=torch.float32).to(
+            dtype=value_blocks.dtype
+        )
+
+        # context_vectors is adapted from jax.numpy.einsum("BNuwc,BucNH->BuwNH", ...)
+        b_dim, n_dim, u_dim, w_dim, c_dim = probabilities.shape
+        h_dim = value_blocks.shape[-1]
+        prob_bun = probabilities.permute(0, 2, 1, 3, 4).reshape(-1, w_dim, c_dim)
+        v_bun = value_blocks.permute(0, 1, 3, 2, 4).reshape(-1, c_dim, h_dim)
+        result_bmm = torch.bmm(prob_bun, v_bun)
+        context_vectors = result_bmm.reshape(b_dim, u_dim, n_dim, w_dim, h_dim).permute(
+            0, 1, 3, 2, 4
+        )
+        context_vectors = context_vectors.reshape(
+            (
+                batch_size,
+                num_query_blocks * self.chunk_size,
+                self.num_heads,
+                self.head_dim,
+            )
+        )
+        context_vectors = context_vectors[:, :q_time]
+
+        return context_vectors
+
+
+class Gemma3nAudioSSCPConvBlock(nn.Module):
+    """A single convolution block for the SubSampleConvProjection."""
+
+    def __init__(
+        self,
+        config: Gemma3nAudioConfig,
+        idx: int,
+        input_freq_dim: int,
+        manual_padding: Tuple[int, int, int, int] = (0, 0, 0, 0),
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.manual_padding = manual_padding
+
+        in_channels = 1 if idx == 0 else self.config.sscp_conv_channel_size[idx - 1]
+        out_channels = self.config.sscp_conv_channel_size[idx]
+        kernel_h, kernel_w = self.config.sscp_conv_kernel_size[idx]
+        stride_h, stride_w = self.config.sscp_conv_stride_size[idx]
+
+        self.conv = nn.Conv2d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=(kernel_h, kernel_w),
+            stride=(stride_h, stride_w),
+            padding=(0, 0),  # Manual padding is used
+            bias=False,
+        )
+
+        f_in_padded = input_freq_dim + self.manual_padding[0] + self.manual_padding[1]
+        f_out_conv = (f_in_padded - kernel_w) // stride_w + 1
+
+        self.norm = Gemma3nCumulativeGroupNorm(
+            num_channels=out_channels,
+            feature_dims=(f_out_conv,),
+            eps=self.config.sscp_conv_group_norm_eps,
+        )
+
+        self.activation = nn.ReLU()
+
+    def forward(self, audio_encodings: torch.Tensor) -> torch.Tensor:
+        audio_encodings_padded = F.pad(
+            audio_encodings, self.manual_padding, mode="constant", value=0.0
+        )
+        audio_encodings_conv = self.conv(audio_encodings_padded)
+        x_for_norm = audio_encodings_conv.permute(0, 2, 3, 1).contiguous()
+        x_normed = self.norm(x_for_norm)
+        audio_encodings_normed = x_normed.permute(0, 3, 1, 2).contiguous()
+        return self.activation(audio_encodings_normed)
+
+
+class Gemma3nAudioSubSampleConvProjection(nn.Module):
+    def __init__(
+        self,
+        config: Gemma3nAudioConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+
+        current_f_for_block_input = config.input_feat_size
+        calculated_block_padding = []
+        calculated_f_out_dims = []
+
+        for i in range(2):  # Assuming 2 conv layers
+            kernel_h, kernel_w = config.sscp_conv_kernel_size[i]
+            stride_h, stride_w = config.sscp_conv_stride_size[i]
+
+            # Padding for Time (Height for Conv2d) - REVERSE_CAUSAL like
+            pad_t_top = 0
+            pad_t_bottom = kernel_h - 1
+
+            # Frequency Padding (Width for Conv2d)
+            pad_f_left = 1
+            pad_f_right = 1
+
+            manual_padding_tuple = (pad_f_left, pad_f_right, pad_t_top, pad_t_bottom)
+            calculated_block_padding.append(manual_padding_tuple)
+
+            f_in_padded = current_f_for_block_input + pad_f_left + pad_f_right
+            f_out_after_conv = (f_in_padded - kernel_w) // stride_w + 1
+            calculated_f_out_dims.append(f_out_after_conv)
+            current_f_for_block_input = f_out_after_conv
+
+        self.conv_0 = Gemma3nAudioSSCPConvBlock(
+            idx=0,
+            input_freq_dim=config.input_feat_size,
+            config=config,
+            manual_padding=calculated_block_padding[0],
+            quant_config=quant_config,
+            prefix=add_prefix("conv_0", prefix),
+        )
+        self.conv_1 = Gemma3nAudioSSCPConvBlock(
+            idx=1,
+            input_freq_dim=calculated_f_out_dims[0],
+            config=config,
+            manual_padding=calculated_block_padding[1],
+            quant_config=quant_config,
+            prefix=add_prefix("conv_1", prefix),
+        )
+
+        final_c_out = config.sscp_conv_channel_size[-1]
+        final_f_out = calculated_f_out_dims[-1]
+        self.input_proj_in_features = final_c_out * final_f_out
+
+        self.input_proj_linear = RowParallelLinear(
+            self.input_proj_in_features,
+            self.config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("input_proj_linear", prefix),
+        )
+
+    def forward(self, audio_encodings: torch.Tensor) -> torch.Tensor:
+        audio_encodings_reshaped = audio_encodings.unsqueeze(1)
+        x = self.conv_0(audio_encodings_reshaped)
+        x = self.conv_1(x)
+        b, c_out, t_out, f_out = x.shape
+        x_permuted = x.permute(0, 2, 3, 1).contiguous()
+        output_flattened = x_permuted.view(b, t_out, f_out * c_out)
+        output, _ = self.input_proj_linear(output_flattened)
+        return output
+
+
+class Gemma3nAudioConformerAttention(nn.Module):
+    def __init__(
+        self,
+        config: Gemma3nAudioConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+
+        head_dim = self.config.hidden_size // self.config.conf_num_attention_heads
+        self.post_in_shape = (self.config.conf_num_attention_heads, head_dim)
+        self.post_in_features = self.config.hidden_size
+
+        self.register_buffer(
+            "gradient_clipping",
+            torch.tensor(self.config.gradient_clipping),
+            persistent=False,
+        )
+
+        self.pre_attn_norm = Gemma3nRMSNorm(self.config.hidden_size)
+        self.attn = Gemma3nAudioAttention(
+            config, quant_config, prefix=add_prefix("attn", prefix)
+        )
+        self.post = RowParallelLinear(
+            self.post_in_features,
+            self.config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("post", prefix),
+        )
+        self.post_norm = Gemma3nRMSNorm(self.config.hidden_size)
+
+    def forward(
+        self, audio_encodings: torch.Tensor, audio_mel_mask: torch.BoolTensor
+    ) -> torch.Tensor:
+        audio_encodings_input_to_attn = audio_encodings
+        audio_encodings = torch.clamp(
+            audio_encodings, -self.gradient_clipping, self.gradient_clipping
+        )
+        audio_encodings_norm = self.pre_attn_norm(audio_encodings)
+        audio_encodings_attn_out = self.attn(audio_encodings_norm, audio_mel_mask)
+
+        b, t, num_heads, head_dim = audio_encodings_attn_out.shape
+        audio_encodings_reshaped = audio_encodings_attn_out.reshape(
+            b, t, num_heads * head_dim
+        )
+
+        audio_encodings, _ = self.post(audio_encodings_reshaped)
+        audio_encodings = torch.clamp(
+            audio_encodings, -self.gradient_clipping, self.gradient_clipping
+        )
+        return audio_encodings_input_to_attn + self.post_norm(audio_encodings)
+
+
+class Gemma3nAudioConformerFeedForward(nn.Module):
+    def __init__(
+        self,
+        config: Gemma3nAudioConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+
+        self.register_buffer(
+            "gradient_clipping",
+            torch.tensor(self.config.gradient_clipping),
+            persistent=False,
+        )
+
+        self.pre_layer_norm = Gemma3nRMSNorm(self.config.hidden_size)
+        self.ffw_layer_1 = ColumnParallelLinear(
+            self.config.hidden_size,
+            self.config.hidden_size * 4,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("ffw_layer_1", prefix),
+        )
+        self.ffw_layer_2 = RowParallelLinear(
+            self.config.hidden_size * 4,
+            self.config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("ffw_layer_2", prefix),
+        )
+        self.post_layer_norm = Gemma3nRMSNorm(self.config.hidden_size)
+        self.post_layer_scale = torch.tensor(self.config.conf_residual_weight)
+
+    def forward(self, audio_encodings: torch.Tensor) -> torch.Tensor:
+        residual = audio_encodings
+        audio_encodings = torch.clamp(
+            audio_encodings, -self.gradient_clipping, self.gradient_clipping
+        )
+        audio_encodings = self.pre_layer_norm(audio_encodings)
+        audio_encodings, _ = self.ffw_layer_1(audio_encodings)
+        audio_encodings = F.silu(audio_encodings)
+        audio_encodings, _ = self.ffw_layer_2(audio_encodings)
+        audio_encodings = torch.clamp(
+            audio_encodings, -self.gradient_clipping, self.gradient_clipping
+        )
+        audio_encodings = self.post_layer_norm(audio_encodings)
+        return residual + (audio_encodings * self.post_layer_scale)
+
+
+class Gemma3nAudioConformerLightConv1d(nn.Module):
+    def __init__(
+        self,
+        config: Gemma3nAudioConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+
+        self.pre_layer_norm = Gemma3nRMSNorm(
+            self.config.hidden_size, eps=self.config.rms_norm_eps
+        )
+        self.linear_start = ColumnParallelLinear(
+            self.config.hidden_size,
+            self.config.hidden_size * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("linear_start", prefix),
+        )
+
+        self.depthwise_conv1d = nn.Conv1d(
+            in_channels=self.config.hidden_size,
+            out_channels=self.config.hidden_size,
+            kernel_size=self.config.conf_conv_kernel_size,
+            stride=1,
+            padding=0,  # Manual causal padding
+            groups=self.config.hidden_size,  # Depthwise
+            bias=False,
+        )
+        self.register_buffer(
+            "gradient_clipping",
+            torch.tensor(self.config.gradient_clipping),
+            persistent=False,
+        )
+        self.conv_norm = Gemma3nRMSNorm(
+            self.config.hidden_size, eps=self.config.rms_norm_eps
+        )
+        self.linear_end = RowParallelLinear(
+            self.config.hidden_size,
+            self.config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("linear_end", prefix),
+        )
+
+        self.causal_padding = self.config.conf_conv_kernel_size - 1
+
+    def forward(self, audio_encodings: torch.Tensor) -> torch.Tensor:
+        audio_encodings_residual = audio_encodings  # Save for residual connection
+
+        audio_encodings = self.pre_layer_norm(audio_encodings)
+        audio_encodings, _ = self.linear_start(audio_encodings)
+        audio_encodings = F.glu(audio_encodings, dim=-1)
+
+        # Permute for Conv1d: [B, T, D] -> [B, D, T]
+        audio_encodings_permuted = audio_encodings.permute(0, 2, 1)
+        # Apply manual causal padding
+        audio_encodings_permuted_padded = F.pad(
+            audio_encodings_permuted, (self.causal_padding, 0)
+        )
+        audio_encodings = self.depthwise_conv1d(audio_encodings_permuted_padded)
+        # Permute back: [B, D, T_out] -> [B, T_out, D]
+        audio_encodings = audio_encodings.permute(0, 2, 1)
+        audio_encodings = torch.clamp(
+            audio_encodings, -self.gradient_clipping, self.gradient_clipping
+        )
+        audio_encodings = self.conv_norm(audio_encodings)
+        audio_encodings = F.silu(audio_encodings)
+        audio_encodings, _ = self.linear_end(audio_encodings)
+        output = audio_encodings + audio_encodings_residual
+        return output
+
+
+class Gemma3nAudioConformerBlock(nn.Module):
+    def __init__(
+        self,
+        config: Gemma3nAudioConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+
+        self.ffw_layer_start = Gemma3nAudioConformerFeedForward(
+            config, quant_config, prefix=add_prefix("ffw_layer_start", prefix)
+        )
+        self.attention = Gemma3nAudioConformerAttention(
+            config, quant_config, prefix=add_prefix("attention", prefix)
+        )
+        self.lconv1d = Gemma3nAudioConformerLightConv1d(
+            config, quant_config, prefix=add_prefix("lconv1d", prefix)
+        )
+        self.ffw_layer_end = Gemma3nAudioConformerFeedForward(
+            config, quant_config, prefix=add_prefix("ffw_layer_end", prefix)
+        )
+        self.register_buffer(
+            "gradient_clipping",
+            torch.tensor(self.config.gradient_clipping),
+            persistent=False,
+        )
+        self.norm = Gemma3nRMSNorm(self.config.hidden_size)
+
+    def forward(
+        self, audio_encodings: torch.Tensor, audio_mel_mask: torch.BoolTensor
+    ) -> torch.Tensor:
+        audio_encodings = self.ffw_layer_start(audio_encodings)
+        audio_encodings = self.attention(audio_encodings, audio_mel_mask)
+        validity_mask_for_lconv = ~audio_mel_mask  # True for valid
+        audio_encodings_for_lconv_input = (
+            audio_encodings
+            * validity_mask_for_lconv.unsqueeze(-1).to(audio_encodings.dtype)
+        )
+        audio_encodings = self.lconv1d(audio_encodings_for_lconv_input)
+
+        audio_encodings = self.ffw_layer_end(audio_encodings)
+        audio_encodings = torch.clamp(
+            audio_encodings, -self.gradient_clipping, self.gradient_clipping
+        )
+        output = self.norm(audio_encodings)
+        return output
+
+
+class Gemma3nAudioEncoder(PreTrainedModel):
+    """A Universal Speech Encoder -- https://arxiv.org/abs/2303.01037"""
+
+    config_class = Gemma3nAudioConfig
+
+    def __init__(
+        self,
+        config: Gemma3nAudioConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__(config)
+        self.config = config
+
+        self.subsample_conv_projection = Gemma3nAudioSubSampleConvProjection(
+            config, quant_config, prefix=add_prefix("subsample_conv_projection", prefix)
+        )
+        self.conformer = make_layers(
+            config.conf_num_hidden_layers,
+            lambda idx, prefix: Gemma3nAudioConformerBlock(
+                config=config,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
+            prefix=add_prefix("conformer", prefix),
+        )
+
+    def forward(
+        self, audio_mel: torch.Tensor, audio_mel_mask: torch.BoolTensor
+    ) -> Tuple[torch.Tensor, torch.BoolTensor]:
+        """Encodes a batch of MELs.
+
+        Args:
+            audio_mel: a torch.Tensor of shape [batch, num_frames, mel_bins].
+            audio_mel_mask: a torch.BoolTensor of shape [batch, num_frames].
+
+        Returns:
+            audio_encodings: a torch.Tensor of shape
+                `[batch_size, reduced_time_frames, hidden_size]`
+            audio_mel_mask: a torch.BoolTensor of shape [batch, reduced_time_frames].
+        """
+        audio_encodings = self.subsample_conv_projection(
+            audio_mel
+        )  # audio_encodings: [B, T_sub, D]
+
+        # Subsample the input audio_mel_mask to match the time dimension of audio_encodings (T_sub)
+        t_sub = audio_encodings.shape[1]
+
+        time_stride_product = 1
+        for stride_pair_idx in range(len(self.config.sscp_conv_stride_size)):
+            time_stride_product *= self.config.sscp_conv_stride_size[stride_pair_idx][0]
+
+        # Create indices for gathering from the original mask.
+        # These indices map to original time steps corresponding to the start of each
+        # receptive field in the subsampled output.
+        indices = (
+            torch.arange(t_sub, device=audio_mel_mask.device) * time_stride_product
+        )
+        indices = torch.clamp(indices, max=audio_mel_mask.shape[1] - 1)
+
+        # Expand indices for batch compatibility if B > 1 and indices is 1D.
+        if audio_mel_mask.ndim > 1 and indices.ndim == 1:
+            indices = indices.unsqueeze(0).expand(
+                audio_mel_mask.shape[0], -1
+            )  # [B, T_sub]
+        elif (
+            audio_mel_mask.ndim == indices.ndim
+            and audio_mel_mask.shape[0] == 1
+            and indices.shape[0] != 1
+            and t_sub == indices.shape[0]
+        ):
+            # Handle case where B=1 but indices became [T_sub] instead of [1, T_sub]
+            indices = indices.unsqueeze(0)
+
+        current_mask = torch.gather(audio_mel_mask, 1, indices)  # [B, T_sub]
+
+        # Fallback: Ensure mask length matches feature length after gather.
+        if current_mask.shape[1] != t_sub:
+            if current_mask.shape[1] > t_sub:
+                current_mask = current_mask[:, :t_sub]
+            else:  # current_mask.shape[1] < t_sub
+                padding_needed = t_sub - current_mask.shape[1]
+                current_mask = F.pad(
+                    current_mask, (0, padding_needed), value=True
+                )  # Pad with True (masked)
+
+        for i, block in enumerate(self.conformer):
+            audio_encodings = block(
+                audio_encodings, current_mask
+            )  # Pass the processed mask
+
+        if self.config.conf_reduction_factor > 1:
+            audio_encodings = audio_encodings[:, :: self.config.conf_reduction_factor]
+            # Reduce the mask as well
+            current_mask = current_mask[:, :: self.config.conf_reduction_factor]
+
+        # Final masking of audio_encodings based on the final current_mask
+        # Ensure current_mask length matches the finally reduced audio_encodings length
+        if current_mask.shape[1] != audio_encodings.shape[1]:
+            target_len = audio_encodings.shape[1]
+            mask_current_len = current_mask.shape[1]
+            if target_len > mask_current_len:
+                padding_needed = target_len - mask_current_len
+                current_mask = F.pad(current_mask, (0, padding_needed), value=True)
+            elif mask_current_len > target_len:  # mask is longer
+                current_mask = current_mask[:, :target_len]
+
+        audio_encodings = audio_encodings.masked_fill(current_mask.unsqueeze(-1), 0.0)
+        return audio_encodings, current_mask
diff --git a/sglang/python/sglang/srt/models/gemma3n_causal.py b/sglang/python/sglang/srt/models/gemma3n_causal.py
new file mode 100644
index 0000000000000000000000000000000000000000..c92f70971498e5389fbfe8e2aaa7f3a3265cdd8a
--- /dev/null
+++ b/sglang/python/sglang/srt/models/gemma3n_causal.py
@@ -0,0 +1,1011 @@
+from typing import Iterable, Optional, Set, Tuple
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from transformers import AutoModel, Gemma3nTextConfig, PretrainedConfig, PreTrainedModel
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import GeluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.models.gemma3_causal import Gemma3TextScaledWordEmbedding
+from sglang.srt.utils import add_prefix, make_layers
+
+
+# Aligned with HF's implementation, using sliding window inclusive with the last token
+# SGLang assumes exclusive
+def get_attention_sliding_window_size(config):
+    return config.sliding_window - 1
+
+
+class Gemma3nRMSNorm(RMSNorm):
+    def __init__(
+        self,
+        dim: int,
+        eps: float = 1e-6,
+        with_scale: bool = True,
+    ) -> None:
+        super().__init__(dim, eps=eps)
+        if not with_scale:
+            del self.weight
+            self.register_buffer(
+                "weight",
+                torch.ones(dim, dtype=torch.get_default_dtype()),
+                persistent=False,
+            )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        original_shape = x.shape
+        x_2d = x.contiguous().reshape(-1, original_shape[-1])
+        x_2d = super().forward(x_2d)
+        x = x_2d.reshape(original_shape)
+        return x
+
+
+class Gemma3nTextScaledWordEmbedding(Gemma3TextScaledWordEmbedding):
+    pass
+
+
+class Gemma3nTextMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_activation: str,
+        activation_sparsity: float = 0.0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        if hidden_activation != "gelu_pytorch_tanh":
+            raise ValueError(
+                "Gemma3n uses `gelu_pytorch_tanh` as the hidden activation "
+                "function. Please set `hidden_activation` to "
+                "`gelu_pytorch_tanh`."
+            )
+        # Use proper GELU with tanh approximation as specified
+        self.act_fn = GeluAndMul()
+        self.activation_sparsity = activation_sparsity
+        self.register_buffer(
+            "target_sparsity_tensor",
+            torch.tensor(self.activation_sparsity, dtype=torch.float32),
+            persistent=False,
+        )  # moved from _gaussian_topk for cuda graph
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        gate_up, _ = self.gate_up_proj(x)
+
+        # Split gate and up projections
+        gate_proj, up_proj = gate_up.chunk(2, dim=-1)
+
+        # Apply activation sparsity if needed
+        if self.activation_sparsity > 0.0:
+            gate_proj = self._gaussian_topk(gate_proj)
+
+        gate_up = torch.cat([gate_proj, up_proj], dim=-1)
+
+        # Apply GELU activation to gate projection and multiply with up projection
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+    def _gaussian_topk(self, inputs: torch.Tensor) -> torch.Tensor:
+        normal_dist = torch.distributions.normal.Normal(0, 1)
+        std_multiplier = normal_dist.icdf(self.target_sparsity_tensor)
+        std_multiplier = std_multiplier.type(inputs.dtype)
+        inputs_mean = torch.mean(inputs, dim=-1, keepdim=True)
+        inputs_std = torch.std(inputs, dim=-1, keepdim=True, unbiased=False)
+        cutoff_x = inputs_mean + inputs_std * std_multiplier
+        return F.relu(inputs - cutoff_x)
+
+
+class Gemma3nLaurelBlock(nn.Module):
+    """Learned Augmented Residual Layer"""
+
+    def __init__(
+        self,
+        config: Gemma3nTextConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+
+        self.linear_left = ColumnParallelLinear(
+            config.hidden_size,
+            config.laurel_rank,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("linear_left", prefix),
+        )
+        self.linear_right = RowParallelLinear(
+            config.laurel_rank,
+            config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("linear_right", prefix),
+        )
+        self.post_laurel_norm = Gemma3nRMSNorm(
+            dim=config.hidden_size,
+            eps=config.rms_norm_eps,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # [num_tokens, hidden_size]
+        laurel_x, _ = self.linear_left(x)
+        laurel_x, _ = self.linear_right(laurel_x)
+        normed_laurel_x = self.post_laurel_norm(laurel_x)
+        return x + normed_laurel_x
+
+
+class Gemma3nAltUp(nn.Module):
+    """Alternating Updates (AltUp)"""
+
+    def __init__(
+        self,
+        config: Gemma3nTextConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+
+        self.correct_output_scale = nn.Parameter(
+            torch.zeros(config.hidden_size, dtype=torch.float32)
+        )
+        self.correction_coefs = ReplicatedLinear(
+            config.altup_num_inputs,
+            config.altup_num_inputs,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("correction_coefs", prefix),
+        )
+        self.prediction_coefs = ReplicatedLinear(
+            config.altup_num_inputs,
+            config.altup_num_inputs**2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("prediction_coefs", prefix),
+        )
+        self.modality_router = ReplicatedLinear(
+            config.hidden_size,
+            config.altup_num_inputs,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("modality_router", prefix),
+        )
+
+        self.router_norm = Gemma3nRMSNorm(
+            dim=config.hidden_size,
+            eps=config.rms_norm_eps,
+        )
+
+        self.register_buffer(
+            "router_input_scale",
+            torch.tensor(config.hidden_size**-1.0),
+            persistent=False,
+        )
+
+    def compute_router_modalities(self, x: torch.Tensor) -> torch.Tensor:
+        # x  : [num_tokens, hidden_size]
+        router_inputs = self.router_norm(x) * self.router_input_scale.to(
+            self.router_norm.weight.dtype
+        )
+        # router_inputs : [num_tokens, hidden_size]
+        routed, _ = self.modality_router(router_inputs)
+
+        # routed : [num_tokens, altup_num_inputs]
+        return torch.tanh(routed.float()).type_as(routed)
+
+    def predict(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Predicts the output of a layer using a trainable map.
+        hidden_states: [num_altup_inputs, num_tokens, hidden_size]
+        """
+        modalities = self.compute_router_modalities(
+            hidden_states[self.config.altup_active_idx]
+        )  # (n_tokens, altup_num_inputs)
+        # TODO: CHECK DO WE NEED THIS: self.prediction_coefs.float()  # Force computation in float32, in-place operation
+
+        if self.config.altup_coef_clip is not None:
+            self.prediction_coefs.weight.data.clamp_(
+                -self.config.altup_coef_clip, self.config.altup_coef_clip
+            )
+
+        all_coefs, _ = self.prediction_coefs(
+            modalities
+        )  # (n_tokens, altup_num_inputs) -> (n_tokens, altup_num_inputs**2)
+
+        all_coefs = all_coefs.reshape(
+            *modalities.shape[:-1],
+            self.config.altup_num_inputs,
+            self.config.altup_num_inputs,
+        ).permute(0, 2, 1)
+
+        # permute hidden_states from [num_altup_inputs, num_tokens, hidden_size] to [num_tokens, hidden_size, altup_num_inputs]
+        predictions = torch.matmul(hidden_states.permute(1, 2, 0), all_coefs)
+        predictions = predictions.permute(2, 0, 1)  # undo the permute
+        predictions += hidden_states  # add the original input
+        return predictions.contiguous().type_as(
+            hidden_states
+        )  # [num_altup_inputs, num_tokens, hidden_size]
+
+    def correct(
+        self, predictions: torch.Tensor, activated: torch.Tensor
+    ) -> torch.Tensor:
+        """Corrects the predictions relative to the activated inputs."""
+        # prediction : [num_altup_inputs, num_tokens, hidden_size]
+        # activated  : [num_tokens, hidden_size]
+        modalities = self.compute_router_modalities(
+            activated
+        )  # [num_tokens, altup_num_inputs]
+        innovation = (
+            activated - predictions[self.config.altup_active_idx]
+        )  # [num_tokens, hidden_size]
+        innovation = innovation.repeat(
+            self.config.altup_num_inputs, 1, 1
+        )  # (self.config.altup_num_inputs, num_tokens, hidden_size)
+
+        if self.config.altup_coef_clip is not None:
+            self.correction_coefs.weight.data.clamp_(
+                -self.config.altup_coef_clip, self.config.altup_coef_clip
+            )
+
+        all_coefs, _ = self.correction_coefs(
+            modalities
+        )  # [num_tokens, altup_num_inputs]
+        all_coefs = (all_coefs + 1.0).permute(1, 0).unsqueeze(-1)
+        # # [num_tokens, altup_num_inputs, 1]
+
+        corrected = torch.mul(innovation, all_coefs)
+        corrected += predictions
+        return corrected.contiguous().type_as(activated)
+
+    def scale_corrected_output(self, corrected: torch.Tensor) -> torch.Tensor:
+        """Scales the provided 3D tensor."""
+        return corrected * self.correct_output_scale.to(corrected.dtype)
+
+    def forward(
+        self, hidden_states: torch.Tensor, activated: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Predicts, correct, and optionally scales the output of a layer using trainable maps.
+
+        hidden_states: [num_altup_inputs, num_tokens, hidden_size]
+        """
+
+        predictions = self.predict(hidden_states)
+        corrected = self.correct(predictions=predictions, activated=activated)
+        output = corrected[self.config.altup_active_idx]
+        if self.config.altup_correct_scale:
+            output = self.scale_corrected_output(output)
+        return corrected, output
+
+
+class Gemma3nAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(
+        self,
+        layer_id: int,
+        config: Gemma3nTextConfig,
+        max_position_embeddings: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.layer_id = layer_id
+        self.config = config
+        tp_size = get_tensor_model_parallel_world_size()
+
+        self.total_num_heads = config.num_attention_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = config.num_key_value_heads
+
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+
+        if self.total_num_kv_heads >= tp_size:
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            assert tp_size % self.total_num_kv_heads == 0
+
+        hidden_size = config.hidden_size
+        head_dim = getattr(
+            config, "head_dim", hidden_size // config.num_attention_heads
+        )
+        self.head_dim = head_dim
+
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        # self.scaling = config.query_rescale_scalar / config.query_pre_attn_scalar
+        self.scaling = 1.0
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=config.attention_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=config.attention_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        # Determine if layer uses sliding window based on pattern
+        self.is_sliding = config.layer_types[layer_id] == "sliding_attention"
+
+        # Check if this is a KV shared layer
+        first_kv_shared_layer_idx = (
+            config.num_hidden_layers - config.num_kv_shared_layers
+        )
+        self.is_kv_shared_layer = layer_id >= first_kv_shared_layer_idx
+
+        # Compute the layer index from which shared KV cache values will be retrieved
+        if not self.is_kv_shared_layer:
+            self.kv_shared_layer_index = None
+        elif self.is_sliding:
+            self.kv_shared_layer_index = first_kv_shared_layer_idx - 2
+        else:
+            self.kv_shared_layer_index = first_kv_shared_layer_idx - 1
+
+        if self.is_sliding:
+            self.rotary_emb = get_rope(
+                self.head_dim,
+                rotary_dim=self.head_dim,
+                max_position=config.max_position_embeddings,
+                base=config.rope_local_base_freq,
+                rope_scaling={"rope_type": "default"},
+            )
+        else:
+            self.rotary_emb = get_rope(
+                self.head_dim,
+                rotary_dim=self.head_dim,
+                max_position=config.max_position_embeddings,
+                base=config.rope_theta,
+                rope_scaling=config.rope_scaling,
+            )
+
+        self.sliding_window = config.sliding_window if self.is_sliding else None
+
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=(
+                layer_id if not self.is_kv_shared_layer else self.kv_shared_layer_index
+            ),
+            logit_cap=0.0,
+            sliding_window_size=self.sliding_window,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+        # Gemma3n adds normalization for q, k, v
+        self.q_norm = Gemma3nRMSNorm(
+            dim=config.head_dim,
+            eps=config.rms_norm_eps,
+        )
+        self.k_norm = Gemma3nRMSNorm(
+            dim=config.head_dim,
+            eps=config.rms_norm_eps,
+        )
+        self.v_norm = Gemma3nRMSNorm(
+            dim=config.head_dim,
+            eps=config.rms_norm_eps,
+            with_scale=False,
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        positions: Tuple[torch.Tensor, torch.Tensor],
+        forward_batch: ForwardBatch,
+        **kwargs,
+    ) -> torch.Tensor:
+
+        qkv, _ = self.qkv_proj(hidden_states)
+        # TODO: for first 20 layers, we use QKVParallelLinear
+        #       for others, we only calc Q.
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+
+        # Apply normalization to q, k, v
+        q = q.unflatten(-1, (self.num_heads, self.head_dim))
+        q = self.q_norm(q)
+
+        # Check if we should use shared KV cache
+        if self.is_kv_shared_layer and self.kv_shared_layer_index is not None:
+            # For KV shared layers, we skip K/V computation and normalization
+            # The RadixAttention will handle retrieving shared KV from cache
+            k = None
+            v = None
+        else:
+            k = k.unflatten(-1, (self.num_kv_heads, self.head_dim))
+            k = self.k_norm(k)
+
+            v = v.unflatten(-1, (self.num_kv_heads, self.head_dim))
+            v = self.v_norm(v)
+
+        # Flatten back for rotary embedding
+        q = q.flatten(-2, -1)
+
+        # Apply rotary embedding
+        if k is not None:
+            k = k.flatten(-2, -1)
+            q, k = self.rotary_emb(positions, q, k)
+            # Reshape k back to head format for attention
+            k = k.unflatten(-1, (self.num_kv_heads, self.head_dim))
+        else:
+            # For shared KV layers, create a dummy key for rotary embedding and discard it
+            dummy_k = torch.zeros_like(
+                q[:, : self.kv_size]
+            )  # Create dummy key with same shape as needed
+            q, _ = self.rotary_emb(positions, q, dummy_k)
+
+        # Reshape q back to head format for attention
+        q = q.unflatten(-1, (self.num_heads, self.head_dim))
+
+        attn_output = self.attn(
+            q,
+            k,
+            v,
+            forward_batch=forward_batch,
+            save_kv_cache=not self.is_kv_shared_layer,
+        )
+
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class Gemma3nDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        layer_id: int,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.layer_id = layer_id
+        self.attention_type = config.layer_types[layer_id]
+        self.config = config
+
+        self.self_attn = Gemma3nAttention(
+            layer_id=layer_id,
+            config=config,
+            max_position_embeddings=config.max_position_embeddings,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+
+        intermediate_size = config.intermediate_size[layer_id]
+        activation_sparsity = config.activation_sparsity_pattern[layer_id]
+        self.mlp = Gemma3nTextMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=intermediate_size,
+            hidden_activation=config.hidden_activation,
+            activation_sparsity=activation_sparsity,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+
+        self.input_layernorm = Gemma3nRMSNorm(self.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = Gemma3nRMSNorm(
+            self.hidden_size, eps=config.rms_norm_eps
+        )
+        self.pre_feedforward_layernorm = Gemma3nRMSNorm(
+            self.hidden_size, eps=config.rms_norm_eps
+        )
+        self.post_feedforward_layernorm = Gemma3nRMSNorm(
+            self.hidden_size, eps=config.rms_norm_eps
+        )
+
+        self.hidden_size_per_layer_input = config.hidden_size_per_layer_input
+
+        self.altup = Gemma3nAltUp(
+            config, quant_config, prefix=add_prefix("altup", prefix)
+        )
+        self.laurel = Gemma3nLaurelBlock(
+            config, quant_config, prefix=add_prefix("laurel", prefix)
+        )
+
+        self.per_layer_input_gate = ReplicatedLinear(
+            self.hidden_size,
+            self.hidden_size_per_layer_input,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("per_layer_input_gate", prefix),
+        )
+        self.per_layer_projection = ReplicatedLinear(
+            self.hidden_size_per_layer_input,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("per_layer_projection", prefix),
+        )
+        self.post_per_layer_input_norm = Gemma3nRMSNorm(
+            self.hidden_size, eps=config.rms_norm_eps
+        )
+        self.is_sliding = self.self_attn.is_sliding
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        per_layer_input: torch.Tensor,
+        forward_batch: ForwardBatch,
+        **kwargs,
+    ) -> torch.Tensor:
+        predictions = self.altup.predict(
+            hidden_states
+        )  # [num_altup_inputs, num_tokens, hidden_size]
+        active_prediction = predictions[self.config.altup_active_idx]
+
+        active_prediction_normed = self.input_layernorm(active_prediction)
+        laurel_output = self.laurel(
+            active_prediction_normed
+        )  # laurel_output: [num_tokens, hidden_size]
+        # active_prediction: [num_tokens, hidden_size]
+
+        attn = self.self_attn(
+            positions=positions,
+            hidden_states=active_prediction_normed,
+            forward_batch=forward_batch,
+            **kwargs,
+        )
+        attn = self.post_attention_layernorm(attn)  # [num_tokens, hidden_size]
+
+        attn_gated = active_prediction + attn  # [num_tokens, hidden_size]
+        attn_laurel = (attn_gated + laurel_output) / torch.sqrt(torch.tensor(2.0))
+
+        attn_norm = self.pre_feedforward_layernorm(
+            attn_laurel
+        )  # [num_tokens, hidden_size]
+        attn_ffw = self.mlp(attn_norm)  # [num_tokens, hidden_size]
+        attn_ffw_norm = self.post_feedforward_layernorm(
+            attn_ffw
+        )  # [num_tokens, hidden_size]
+        attn_ffw_laurel_gated = attn_laurel + attn_ffw_norm  # [num_tokens, hidden_size]
+        corrected_predictions = self.altup.correct(
+            predictions, attn_ffw_laurel_gated
+        )  # prediction : [num_altup_inputs, num_tokens, hidden_size]
+        # attn_ffw_laurel_gated: [num_tokens, hidden_size]
+        first_prediction = corrected_predictions[self.config.altup_active_idx]
+
+        if self.config.altup_correct_scale:
+            first_prediction = self.altup.scale_corrected_output(first_prediction)
+
+        # per_layer_input_gate
+        first_prediction = first_prediction.to(self.per_layer_input_gate.weight.dtype)
+        first_prediction, _ = self.per_layer_input_gate(first_prediction)
+        first_prediction = F.gelu(first_prediction, approximate="tanh")
+        first_prediction = torch.multiply(first_prediction, per_layer_input)
+
+        # per_layer_projection
+        first_prediction, _ = self.per_layer_projection(first_prediction)
+        first_prediction = self.post_per_layer_input_norm(first_prediction)
+        corrected_predictions[1:] += first_prediction
+
+        return corrected_predictions
+
+
+class Gemma3nTextModel(PreTrainedModel):
+    def __init__(
+        self,
+        config: Gemma3nTextConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(config=config)
+        self.config = config
+        self.quant_config = quant_config
+        self.vocab_size = config.vocab_size
+        self.padding_idx = config.pad_token_id
+
+        # Gemma3n downcasts the below to float16, causing sqrt(3072)=55.4256 to become 55.5
+        self.embed_tokens = Gemma3nTextScaledWordEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            self.padding_idx,
+            embed_scale=self.config.hidden_size**0.5,
+        )
+
+        self.norm = Gemma3nRMSNorm(
+            config.hidden_size,
+            eps=config.rms_norm_eps,
+        )
+
+        self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: Gemma3nDecoderLayer(
+                layer_id=idx,
+                config=config,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
+            prefix=add_prefix("layers", prefix),
+        )
+
+        # Per-layer input embeddings
+        self.hidden_size = config.hidden_size
+        self.hidden_size_per_layer_input = config.hidden_size_per_layer_input
+
+        self.embed_tokens_per_layer = Gemma3nTextScaledWordEmbedding(
+            config.vocab_size_per_layer_input,
+            config.num_hidden_layers * config.hidden_size_per_layer_input,
+            self.padding_idx,
+            embed_scale=self.config.hidden_size_per_layer_input**0.5,
+        )
+
+        self.per_layer_model_projection = ColumnParallelLinear(
+            self.hidden_size,
+            config.num_hidden_layers * config.hidden_size_per_layer_input,
+            bias=False,
+            gather_output=True,
+            quant_config=quant_config,
+            prefix=add_prefix("per_layer_model_projection", prefix),
+        )
+
+        self.per_layer_projection_norm = Gemma3nRMSNorm(
+            dim=config.hidden_size_per_layer_input,
+            eps=config.rms_norm_eps,
+        )
+
+        self.altup_projections = make_layers(
+            self.config.altup_num_inputs - 1,
+            lambda idx, prefix: ColumnParallelLinear(
+                self.hidden_size,
+                self.hidden_size,
+                bias=False,
+                gather_output=True,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
+            prefix=add_prefix("altup_projections", prefix),
+        )
+
+        self.altup_unembed_projections = make_layers(
+            self.config.altup_num_inputs - 1,
+            lambda idx, prefix: ColumnParallelLinear(
+                self.hidden_size,
+                self.hidden_size,
+                bias=False,
+                gather_output=True,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
+            prefix=add_prefix("altup_unembed_projections", prefix),
+        )
+
+        self.register_buffer(
+            "per_layer_projection_scale",
+            torch.tensor(self.hidden_size**-0.5),
+            persistent=False,
+        )
+        self.register_buffer(
+            "per_layer_input_scale", torch.rsqrt(torch.tensor(2.0)), persistent=False
+        )
+
+        self.post_init()
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.embed_tokens
+
+    def dtype(self) -> torch.dtype:
+        return next(self.parameters()).dtype
+
+    def get_per_layer_inputs(self, input_ids: torch.LongTensor) -> torch.Tensor:
+        embeddings = self.embed_tokens_per_layer(input_ids)
+        return embeddings.reshape(
+            *input_ids.shape,
+            self.config.num_hidden_layers,
+            self.hidden_size_per_layer_input,
+        )
+
+    def project_per_layer_inputs(
+        self,
+        inputs_embeds: torch.Tensor,
+        per_layer_inputs: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        per_layer_projection, _ = self.per_layer_model_projection(inputs_embeds)
+        per_layer_projection *= self.per_layer_projection_scale.type(
+            inputs_embeds.dtype
+        )
+        per_layer_projection = per_layer_projection.reshape(
+            *inputs_embeds.shape[:-1],
+            self.config.num_hidden_layers,
+            self.hidden_size_per_layer_input,
+        )
+        per_layer_projection = self.per_layer_projection_norm(per_layer_projection)
+
+        if per_layer_inputs is None:
+            return per_layer_projection
+
+        if per_layer_projection.shape != per_layer_inputs.shape:
+            # per-layer inputs are sometimes padded with zeros, slice the relevant embeddings
+            per_layer_inputs = per_layer_inputs[..., : self.config.num_hidden_layers, :]
+
+        return (
+            per_layer_projection + per_layer_inputs
+        ) * self.per_layer_input_scale.type(inputs_embeds.dtype)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        per_layer_inputs: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        if (input_ids is None) ^ (input_embeds is not None):
+            raise ValueError(
+                "You must specify exactly one of input_ids or inputs_embeds"
+            )
+
+        if input_ids is not None:
+            input_embeds = self.embed_tokens(input_ids)
+            per_layer_inputs = self.get_per_layer_inputs(input_ids)
+
+        per_layer_inputs = self.project_per_layer_inputs(input_embeds, per_layer_inputs)
+
+        # Expand hidden_states to support per-layer inputs
+        target_magnitude = torch.mean(input_embeds**2, dim=-1, keepdim=True) ** 0.5
+        epsilon_tensor = torch.tensor(torch.finfo(input_embeds.dtype).min)
+
+        # embed positions
+        hidden_states_0 = input_embeds
+        temp_hidden_states = [hidden_states_0]
+
+        for i in range(1, self.config.altup_num_inputs):
+            altup_proj, _ = self.altup_projections[i - 1](hidden_states_0)
+            current_hidden_state = altup_proj.type(hidden_states_0.dtype)
+            new_magnitude = (
+                torch.mean(current_hidden_state**2, dim=-1, keepdim=True) ** 0.5
+            )
+            current_hidden_state = current_hidden_state * (
+                target_magnitude / torch.maximum(new_magnitude, epsilon_tensor)
+            )
+            temp_hidden_states.append(current_hidden_state)
+
+        hidden_states = torch.stack(
+            temp_hidden_states, dim=0
+        )  # [num_altup_inputs, n_tokens, hidden_size]
+
+        for layer_idx, layer in enumerate(self.layers):
+            per_layer_input = per_layer_inputs[:, layer_idx, :]
+            hidden_states = layer(
+                positions=positions,
+                per_layer_input=per_layer_input,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+                **kwargs,
+            )
+
+        # Per-layer inputs to single output
+        target_magnitude = (
+            torch.mean(hidden_states[0] ** 2, dim=-1, keepdim=True) ** 0.5
+        )
+
+        temp_hidden_states = [hidden_states[0]]
+
+        for i in range(1, self.config.altup_num_inputs):
+            # altup_unembed_projections adapted from jax.numpy.einsum("btp,pd->btd", ...)
+            altup_unemb_proj, _ = self.altup_unembed_projections[i - 1](
+                hidden_states[i]
+            )
+            current_hidden_state = altup_unemb_proj.type(hidden_states_0.dtype)
+            new_magnitude = (
+                torch.mean(current_hidden_state**2, dim=-1, keepdim=True) ** 0.5
+            )
+            current_hidden_state = current_hidden_state * (
+                target_magnitude / torch.maximum(new_magnitude, epsilon_tensor)
+            )
+            temp_hidden_states.append(current_hidden_state)
+
+        hidden_states = torch.stack(temp_hidden_states)
+        hidden_states = torch.mean(hidden_states, dim=0)
+        hidden_states = self.norm(hidden_states)
+
+        return hidden_states
+
+
+class Gemma3nForCausalLM(PreTrainedModel):
+    config_class = Gemma3nTextConfig
+
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+    config_class = Gemma3nTextConfig
+    base_model_prefix = "language_model"
+
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    bitsandbytes_stacked_params_mapping = {
+        ".q_proj": (".qkv_proj", 0),
+        ".k_proj": (".qkv_proj", 1),
+        ".v_proj": (".qkv_proj", 2),
+        ".gate_proj": (".gate_up_proj", 0),
+        ".up_proj": (".gate_up_proj", 1),
+    }
+
+    packed_modules_mapping = {
+        ".qkv_proj": [
+            ".q_proj",
+            ".k_proj",
+            ".v_proj",
+        ],
+        ".gate_up_proj": [
+            ".gate_proj",
+            ".up_proj",
+        ],
+    }
+
+    # LoRA specific attributes
+    supported_lora_modules = [
+        ".qkv_proj",
+        ".o_proj",
+        ".gate_up_proj",
+        ".down_proj",
+    ]
+    # Gemma does not apply LoRA to the embedding layer
+    embedding_modules = {}
+    embedding_padding_modules = []
+    supports_lora = True
+
+    def __init__(
+        self,
+        config: Gemma3nTextConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(config=config)
+        self.config = config
+        self.quant_config = quant_config
+        self.model = Gemma3nTextModel(
+            config=config,
+            quant_config=quant_config,
+            prefix=add_prefix("model", prefix),
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+        if self.config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+            )
+        self.post_init()
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.embed_tokens
+
+    def get_attention_sliding_window_size(self):
+        return get_attention_sliding_window_size(self.config)
+
+    def dtype(self) -> torch.dtype:
+        return next(self.parameters()).dtype
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        per_layer_inputs: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> LogitsProcessor:
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            input_embeds,
+            per_layer_inputs,
+            **kwargs,
+        )
+
+        return self.logits_processor(
+            input_ids, hidden_states, self.model.embed_tokens, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        loaded_params: Set[str] = set()
+
+        for name, loaded_weight in weights:
+            name = name.replace("model.language_model.", "model.")
+            for param_name, shard_name, shard_id in stacked_params_mapping:
+                if shard_name not in name:
+                    continue
+                name = name.replace(shard_name, param_name)
+                # Skip loading extra bias for GPTQ models
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    # Skip loading weights that are not in the model
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # lm_head is not used in vllm as it is tied with embed_token
+                if "lm_head.weight" in name:
+                    continue
+                # Skip loading extra bias for GPTQ models
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Remapping the name of FP8 kv-scale
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+                if name not in params_dict:
+                    # Skip loading weights that are not in the model
+                    continue
+
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+
+EntryClass = Gemma3nForCausalLM
+AutoModel.register(Gemma3nTextConfig, Gemma3nForCausalLM, exist_ok=True)
diff --git a/sglang/python/sglang/srt/models/gemma3n_mm.py b/sglang/python/sglang/srt/models/gemma3n_mm.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2dfe99ccfa881759f23d13df979a0be7b0580be
--- /dev/null
+++ b/sglang/python/sglang/srt/models/gemma3n_mm.py
@@ -0,0 +1,533 @@
+import logging
+import re
+from functools import lru_cache
+from typing import Iterable, List, Optional, Set, Tuple, TypedDict, Union
+
+import torch
+from torch import nn
+from transformers import (
+    Gemma3nAudioConfig,
+    Gemma3nConfig,
+    Gemma3nTextConfig,
+    Gemma3nVisionConfig,
+    PreTrainedModel,
+)
+from transformers.models.auto.modeling_auto import AutoModel
+
+from sglang.srt.layers.linear import ReplicatedLinear
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternMultimodalTokens,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+    flatten_nested_list,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.models.gemma3n_audio import Gemma3nAudioEncoder
+from sglang.srt.models.gemma3n_causal import Gemma3nRMSNorm, Gemma3nTextModel
+from sglang.srt.utils import add_prefix
+from sglang.srt.utils.hf_transformers_utils import get_processor
+
+logger = logging.getLogger(__name__)
+
+cached_get_processor = lru_cache(get_processor)
+
+
+class Gemma3nImagePixelInputs(TypedDict):
+    pixel_values: torch.Tensor
+    """Shape: `(batch_size * num_images, num_channels, height, width)`"""
+
+
+class Gemma3nAudioInputs(TypedDict):
+    input_features: torch.Tensor
+    """Shape: `(batch_size * num_audio, seq_length, num_features)`"""
+    input_features_mask: torch.Tensor
+    """Shape: `(batch_size * num_audio, seq_length)`"""
+
+
+class Gemma3nMultimodalEmbedder(nn.Module):
+    """Embeds token ids or soft tokens for multimodal content into language model space."""
+
+    def __init__(
+        self,
+        multimodal_config: Union[Gemma3nAudioConfig, Gemma3nVisionConfig],
+        text_config: Gemma3nTextConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.multimodal_hidden_size = multimodal_config.hidden_size
+        self.eps = multimodal_config.rms_norm_eps
+        self.vocab_offset = multimodal_config.vocab_offset
+        self.vocab_size = multimodal_config.vocab_size
+        self.text_hidden_size = text_config.hidden_size
+
+        self.embedding = VocabParallelEmbedding(
+            self.vocab_size,
+            self.multimodal_hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("embedding", prefix),
+        )
+
+        self.hard_embedding_norm = Gemma3nRMSNorm(
+            self.multimodal_hidden_size,
+            eps=self.eps,
+        )
+
+        self.soft_embedding_norm = Gemma3nRMSNorm(
+            self.multimodal_hidden_size,
+            eps=self.eps,
+        )
+
+        self.embedding_projection = ReplicatedLinear(
+            self.multimodal_hidden_size,
+            self.text_hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("embedding_projection", prefix),
+        )
+
+        self.embedding_post_projection_norm = Gemma3nRMSNorm(
+            self.text_hidden_size,
+            eps=self.eps,
+            with_scale=False,
+        )
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Embeds token ids or soft tokens for multimodal content into language model space.
+
+        Args:
+            input_ids: A torch.LongTensor containing the token ids to embed. Values should be in the range
+                `[vocab_offset, vocab_offset + vocab_size)`.
+            inputs_embeds: A torch.Tensor containing the soft tokens to embed.
+
+        Returns:
+            A torch.Tensor of embeddings with  shape `[batch_size, seq_len, self.config.text_config.hidden_size]`.
+        """
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError(
+                "You must specify exactly one of input_ids or inputs_embeds"
+            )
+
+        if inputs_embeds is not None:
+            emb_norm = self.soft_embedding_norm(inputs_embeds)
+        else:
+            # Handle out of vocab ids to prevent CUDA assertion failures
+            out_of_vocab_id = self.vocab_size - 1
+            adjusted_ids = input_ids - self.vocab_offset
+            adjusted_ids = torch.where(adjusted_ids < 0, out_of_vocab_id, adjusted_ids)
+            adjusted_ids = torch.where(
+                adjusted_ids >= self.vocab_size, out_of_vocab_id, adjusted_ids
+            )
+            hard_emb = self.embedding(adjusted_ids)
+            emb_norm = self.hard_embedding_norm(hard_emb)
+
+        emb_norm_proj, _ = self.embedding_projection(emb_norm)
+        return self.embedding_post_projection_norm(emb_norm_proj)
+
+
+class Gemma3nForConditionalGeneration(PreTrainedModel):
+    config_class = Gemma3nConfig
+    """Gemma3n multimodal model for conditional generation."""
+
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+        ".out_proj.",
+    ]
+    bitsandbytes_stacked_params_mapping = {
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+        "out_proj": ("proj", 0),
+    }
+
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "qkv_proj",
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+    ]
+    # Gemma does not apply LoRA to the embedding layer
+    embedding_modules = {}
+    embedding_padding_modules = []
+    supports_lora = True
+
+    def __init__(
+        self,
+        config: Gemma3nConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(config=config)
+        self.config = config
+        self.quant_config = quant_config
+
+        prefix = add_prefix("model", prefix)
+
+        # Vision components
+        # TODO: Use sglang's vision model
+        self.vision_tower = AutoModel.from_config(config=config.vision_config)
+
+        self.embed_vision = Gemma3nMultimodalEmbedder(
+            config.vision_config,
+            config.text_config,
+            quant_config=quant_config,
+            prefix=add_prefix("embed_vision", prefix),
+        )
+
+        # Audio components
+        self.embed_audio = Gemma3nMultimodalEmbedder(
+            config.audio_config,
+            config.text_config,
+            quant_config=quant_config,
+            prefix=add_prefix("embed_audio", prefix),
+        )
+
+        self.audio_tower = Gemma3nAudioEncoder(
+            config.audio_config,
+            quant_config=quant_config,
+            prefix=add_prefix("audio_tower", prefix),
+        )
+
+        self.vocab_size = config.text_config.vocab_size
+        self.vocab_size_per_layer_input = config.text_config.vocab_size_per_layer_input
+
+        # Text model
+        self.language_model = Gemma3nTextModel(
+            config.text_config,
+            quant_config,
+            prefix=add_prefix("language_model", prefix),
+        )
+
+        # Create logits processor for the multimodal model
+        self.logits_processor = LogitsProcessor(config.text_config)
+
+        self.post_init()
+
+    def pad_input_ids(
+        self,
+        input_ids: List[int],
+        mm_inputs: MultimodalInputs,
+    ) -> List[int]:
+        """Pad input IDs with image and audio tokens."""
+        pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+        return pattern.pad_input_tokens(input_ids, mm_inputs)
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.language_model.get_input_embeddings()
+
+    def get_attention_sliding_window_size(self):
+        return self.config.text_config.sliding_window - 1
+
+    def get_image_feature(self, items: List[MultimodalDataItem]):
+        """
+        Projects the last hidden state from the vision model into language model space.
+
+        Returns:
+            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
+        """
+        # Process images one by one to handle flatten_batch=True constraint in vision_tower
+        all_pixel_values = flatten_nested_list([item.feature for item in items])
+        vision_outputs_list = []
+
+        for pixel_values_batch in all_pixel_values:
+            # Normalize input shape to [batch_size, channels, height, width]
+            if pixel_values_batch.dim() == 5:
+                pixel_values_batch = pixel_values_batch.squeeze(0)
+            elif pixel_values_batch.dim() == 3:
+                pixel_values_batch = pixel_values_batch.unsqueeze(0)
+            elif pixel_values_batch.dim() != 4:
+                raise ValueError(
+                    f"Unexpected pixel_values shape: {pixel_values_batch.shape}"
+                )
+
+            # Process each image in the batch
+            batch_size = pixel_values_batch.shape[0]
+            for i in range(batch_size):
+                pixel_value = pixel_values_batch[i : i + 1]  # Keep batch dimension as 1
+                pixel_value = pixel_value.to(
+                    device=self.vision_tower.device, dtype=self.language_model.dtype()
+                )
+                vision_outputs = self.vision_tower(
+                    pixel_values=pixel_value, do_pooling=False, return_dict=True
+                ).last_hidden_state
+                vision_outputs_list.append(vision_outputs)
+
+        # Concatenate all vision outputs
+        vision_outputs = torch.cat(vision_outputs_list, dim=0)
+
+        # Convert from (batch, channels, height, width) to (batch, height * width, channels)
+        vision_outputs = vision_outputs.reshape(
+            vision_outputs.shape[0],
+            self.config.vision_config.hidden_size,
+            self.config.vision_soft_tokens_per_image,
+        ).permute(0, 2, 1)
+
+        # Normalize and embed the soft tokens into language model space
+        vision_outputs *= self.config.vision_config.hidden_size**0.5
+        return self.embed_vision(inputs_embeds=vision_outputs)
+
+    def get_audio_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        """
+        Projects the last hidden state from the audio encoder into language model space.
+
+        Args:
+            items: List of multimodal data items containing audio data.
+
+        Returns:
+            audio_features (`torch.Tensor`): Audio feature tensor of shape `(num_audios, audio_length, embed_dim)`).
+        """
+        # Extract audio features and masks from items
+        all_input_features = flatten_nested_list([item.feature for item in items])
+        all_input_features_mask = flatten_nested_list(
+            [~item.input_features_mask for item in items]
+        )  # Note(Xinyuan): reverse the mask according to the HF implementation
+
+        # Process audio features one by one
+        audio_features_list = []
+
+        for input_features, input_features_mask in zip(
+            all_input_features, all_input_features_mask
+        ):
+            # Ensure proper tensor format
+            if input_features.dim() == 2:
+                input_features = input_features.unsqueeze(0)
+            if input_features_mask.dim() == 1:
+                input_features_mask = input_features_mask.unsqueeze(0)
+
+            # Move to device and dtype
+            input_features = input_features.to(
+                device=next(self.audio_tower.parameters()).device,
+                dtype=self.language_model.dtype(),
+            )
+            input_features_mask = input_features_mask.to(device=input_features.device)
+
+            # Process through audio tower
+            audio_outputs, audio_mask = self.audio_tower(
+                input_features, input_features_mask
+            )
+
+            # Embed the audio outputs
+            audio_embeds = self.embed_audio(inputs_embeds=audio_outputs)
+            audio_features_list.append(audio_embeds)
+
+        # Concatenate all audio features
+        if audio_features_list:
+            audio_features = torch.cat(audio_features_list, dim=0)
+
+            # The Gemma3nProcessor expects all audio will be 30s in length and inserts 188 audio soft tokens into the
+            # text to account for this. However, the audio preprocessing and encoder do not gurarantee they will
+            # produce 188 soft tokens; they will produce at most that many tokens, but they may produce fewer tokens
+            # depending on the length of the longest audio input in the batch. When we encounter this situation, we pad
+            # the audio feature out to 188 soft tokens with the emebedding of the last token in the embed_audio vocab.
+            audio_padding_toks = torch.tensor(
+                [[self.vocab_size - 1]], dtype=torch.long, device=audio_features.device
+            )
+            audio_padding_embs = self.embed_audio(input_ids=audio_padding_toks)
+            audio_features = torch.where(
+                audio_mask.unsqueeze(-1), audio_padding_embs, audio_features
+            )
+
+            audio_batch_size, audio_seq_len, audio_embed_dim = audio_features.shape
+            extra_padding_tokens = (
+                self.config.audio_soft_tokens_per_image - audio_seq_len
+            )
+            extra_padding_features = audio_padding_embs.expand(
+                audio_batch_size, extra_padding_tokens, audio_embed_dim
+            )
+
+            audio_features = torch.cat((audio_features, extra_padding_features), dim=1)
+            return audio_features
+        else:
+            return torch.empty(
+                0,
+                0,
+                self.language_model.config.hidden_size,
+                device=next(self.parameters()).device,
+                dtype=self.language_model.dtype(),
+            )
+
+    def get_per_layer_inputs(
+        self, input_ids: torch.LongTensor
+    ) -> Optional[torch.Tensor]:
+        return self.language_model.get_per_layer_inputs(input_ids)
+
+    def project_per_layer_inputs(
+        self,
+        inputs_embeds: torch.Tensor,
+        per_layer_inputs: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        return self.language_model.project_per_layer_inputs(
+            inputs_embeds, per_layer_inputs
+        )
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        **kwargs: object,
+    ) -> LogitsProcessor:
+        """Forward pass for multimodal Gemma3n."""
+        if (input_ids is None) ^ (input_embeds is not None):
+            raise ValueError(
+                "You must specify exactly one of input_ids or inputs_embeds"
+            )
+
+        positions += 1
+        if input_ids is not None:
+            # Prepare per-layer inputs from inputs_ids
+            per_layer_inputs_mask = torch.logical_and(
+                input_ids >= 0, input_ids < self.vocab_size_per_layer_input
+            )
+            per_layer_inputs_tokens = torch.where(
+                per_layer_inputs_mask, input_ids, torch.zeros_like(input_ids)
+            )
+            per_layer_inputs = self.language_model.get_per_layer_inputs(
+                per_layer_inputs_tokens
+            )
+
+        # Use general_mm_embed_routine for handling multimodal data
+        # This will automatically handle text, image, and audio embeddings
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.language_model,
+            data_embedding_funcs={
+                Modality.IMAGE: self.get_image_feature,
+                Modality.AUDIO: self.get_audio_feature,
+            },
+            positions=positions,
+            per_layer_inputs=per_layer_inputs,
+        )
+
+        # Process hidden states through logits processor
+        return self.logits_processor(
+            input_ids, hidden_states, self.language_model.embed_tokens, forward_batch
+        )
+
+    def tie_weights(self):
+        return self.language_model.tie_weights()
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".up_proj", 1),
+            (".gate_up_proj", ".gate_proj", 0),
+        ]
+        """Load weights for the model."""
+        params_dict = dict(self.named_parameters())
+        loaded_params: Set[str] = set()
+
+        for name, loaded_weight in weights:
+            name = re.sub(r"^model\.", "", name)
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if "vision_model" in name:
+                    # adapt to VisionAttention
+                    name = name.replace(".self_attn.out_proj", ".self_attn.proj")
+                # Skip loading extra bias for GPTQ models
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Remapping the name of FP8 kv-scale
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+    lora_pattern = re.compile(
+        r"^language_model\.layers\.(\d+)\.(?:self_attn|mlp)\.(?:qkv_proj|o_proj|down_proj|gate_up_proj)"
+    )
+
+    def should_apply_lora(self, module_name: str) -> bool:
+        return bool(self.lora_pattern.match(module_name))
+
+    def get_hidden_dim(self, module_name, layer_idx):
+        # return input_dim, output_dim
+        if module_name == "qkv_proj":
+            return (
+                self.config.hidden_size,
+                self.config.head_dim
+                * (
+                    self.config.num_attention_heads
+                    + self.config.num_key_value_heads * 2
+                ),
+            )
+        elif module_name == "o_proj":
+            return (
+                self.config.head_dim * self.config.num_attention_heads,
+                self.config.hidden_size,
+            )
+        elif module_name == "gate_up_proj":
+            assert len(set(self.config.intermediate_size)) == 1, (
+                "Currently SGLang requires uniform intermediate size for all layers. "
+                "Please file an issue if you need support for non-uniform intermediate sizes."
+            )
+            return self.config.hidden_size, self.config.intermediate_size[0] * 2
+        elif module_name == "down_proj":
+            assert len(set(self.config.intermediate_size)) == 1, (
+                "Currently SGLang requires uniform intermediate size for all layers. "
+                "Please file an issue if you need support for non-uniform intermediate sizes."
+            )
+            return self.config.intermediate_size[0], self.config.hidden_size
+        else:
+            raise NotImplementedError()
+
+
+EntryClass = Gemma3nForConditionalGeneration
diff --git a/sglang/python/sglang/srt/models/glm4.py b/sglang/python/sglang/srt/models/glm4.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba40a1f7446a008e0b60b31765461b22eac6dc9f
--- /dev/null
+++ b/sglang/python/sglang/srt/models/glm4.py
@@ -0,0 +1,646 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Modeling from:
+# ./llama.py and
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/glm4/modular_glm4.py
+"""Inference-only GLM-4-0414 model compatible with HuggingFace weights."""
+
+import logging
+from typing import Any, Dict, Iterable, Optional, Tuple, Union
+
+import torch
+from torch import nn
+
+from sglang.srt.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.dp_attention import is_dp_attention_enabled
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer, get_layer_id
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    kv_cache_scales_loader,
+)
+from sglang.srt.utils import add_prefix, make_layers
+
+Glm4Config = None
+
+logger = logging.getLogger(__name__)
+
+
+class Glm4MLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        reduce_results: bool = True,
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+            reduce_results=reduce_results,
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(
+        self,
+        x,
+        forward_batch=None,
+        use_reduce_scatter: bool = False,
+    ):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(
+            x,
+            skip_all_reduce=use_reduce_scatter,
+        )
+        return x
+
+
+class Glm4Attention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        head_dim: Optional[int] = None,
+        layer_id: int = 0,
+        rope_theta: float = 1000000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 131072,
+        quant_config: Optional[QuantizationConfig] = None,
+        dual_chunk_attention_config: Optional[dict[str, Any]] = None,
+        partial_rotary_factor: float = 0.5,
+        bias: bool = True,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        if head_dim is not None:
+            self.head_dim = head_dim
+        else:
+            self.head_dim = hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+        self.partial_rotary_factor = partial_rotary_factor
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            dual_chunk_attention_config=dual_chunk_attention_config,
+            partial_rotary_factor=partial_rotary_factor,
+            is_neox_style=False,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class Glm4DecoderLayer(nn.Module):
+    """A single transformer layer.
+
+    Transformer layer takes input with size [s, b, h] and returns an
+    output of the same size.
+    """
+
+    def __init__(
+        self,
+        config: Glm4Config,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        rp = getattr(config, "rope_parameters", None)
+        if isinstance(rp, dict):
+            rope_theta = rp.get("rope_theta", getattr(config, "rope_theta", 1000000))
+            partial_rotary_factor = rp.get(
+                "partial_rotary_factor",
+                getattr(config, "partial_rotary_factor", 0.5),
+            )
+            rope_scaling = getattr(config, "rope_scaling", None)
+        else:
+            rope_theta = getattr(config, "rope_theta", 1000000)
+            rope_scaling = getattr(config, "rope_scaling", None)
+            partial_rotary_factor = getattr(config, "partial_rotary_factor", 0.5)
+
+        bias = getattr(config, "attention_bias", True)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 32768)
+        head_dim = getattr(config, "head_dim", None)
+        dual_chunk_attention_config = getattr(
+            config, "dual_chunk_attention_config", None
+        )
+        self.self_attn = Glm4Attention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            head_dim=head_dim,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            dual_chunk_attention_config=dual_chunk_attention_config,
+            partial_rotary_factor=partial_rotary_factor,
+            bias=bias,
+            prefix=add_prefix("self_attn", prefix),
+        )
+
+        # MLP
+        self.mlp = Glm4MLP(
+            config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.post_self_attn_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.post_mlp_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        hidden_states = self.post_self_attn_layernorm(hidden_states)
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.post_mlp_layernorm(hidden_states)
+
+        return hidden_states, residual
+
+
+class Glm4Model(nn.Module):
+    def __init__(
+        self,
+        config: Glm4Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        decoder_layer_type: type[nn.Module] = Glm4DecoderLayer,
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.pp_group = get_pp_group()
+
+        if self.pp_group.is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                use_attn_tp_group=is_dp_attention_enabled(),
+                prefix=add_prefix("embed_tokens", prefix),
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        # Use the provided decoder layer type or default to Glm4DecoderLayer
+        decoder_layer_type = decoder_layer_type or Glm4DecoderLayer
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: decoder_layer_type(
+                layer_id=idx,
+                config=config,
+                quant_config=quant_config,
+                prefix=prefix,
+                alt_stream=alt_stream,
+            ),
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix=add_prefix("layers", prefix),
+        )
+        if self.pp_group.is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer(return_tuple=True)
+
+        # For EAGLE3 support
+        self.layers_to_capture = []
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.embed_tokens
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[torch.Tensor, PPProxyTensors]:
+        if self.pp_group.is_first_rank:
+            if input_embeds is None:
+                hidden_states = self.embed_tokens(input_ids)
+            else:
+                hidden_states = input_embeds
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+
+        aux_hidden_states = []
+        for i in range(self.start_layer, self.end_layer):
+            if i in self.layers_to_capture:
+                aux_hidden_states.append(
+                    hidden_states + residual if residual is not None else hidden_states
+                )
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors(
+                {
+                    "hidden_states": hidden_states,
+                    "residual": residual,
+                }
+            )
+        else:
+            if hidden_states.shape[0] != 0:
+                if residual is None:
+                    hidden_states = self.norm(hidden_states)
+                else:
+                    hidden_states, _ = self.norm(hidden_states, residual)
+
+        if len(aux_hidden_states) == 0:
+            return hidden_states
+
+        return hidden_states, aux_hidden_states
+
+    # If this function is called, it should always initialize KV cache scale
+    # factors (or else raise an exception). Thus, handled exceptions should
+    # make sure to leave KV cache scale factors in a known good (dummy) state
+    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+        tp_size = get_tensor_model_parallel_world_size()
+        tp_rank = get_tensor_model_parallel_rank()
+        for layer_idx, scaling_factor in kv_cache_scales_loader(
+            quantization_param_path,
+            tp_rank,
+            tp_size,
+            self.config.num_hidden_layers,
+            self.config.__class__.model_type,
+        ):
+            if not isinstance(self.layers[layer_idx], nn.Identity):
+                layer_self_attn = self.layers[layer_idx].self_attn
+            if hasattr(layer_self_attn.attn, "k_scale"):
+                layer_self_attn.attn.k_scale = scaling_factor
+                layer_self_attn.attn.v_scale = scaling_factor
+            else:
+                raise RuntimeError(
+                    "Self attention has no KV cache scaling factor attribute!"
+                )
+
+
+class Glm4ForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config: Glm4Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = Glm4Model(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+
+        # handle the lm head on different pp ranks
+        if self.pp_group.is_last_rank:
+            if self.pp_group.world_size == 1 and config.tie_word_embeddings:
+                self.lm_head = self.model.embed_tokens
+            else:
+                self.lm_head = ParallelLMHead(
+                    config.vocab_size,
+                    config.hidden_size,
+                    quant_config=quant_config,
+                    prefix=add_prefix("lm_head", prefix),
+                )
+        else:
+            # ranks other than the last rank will have a placeholder layer
+            self.lm_head = PPMissingLayer()
+
+        # perform weight tying for PP
+        if self.pp_group.world_size > 1 and config.tie_word_embeddings:
+            if self.pp_group.is_first_rank:
+                self.pp_group.send(
+                    self.model.embed_tokens.weight, dst=self.pp_group.last_rank
+                )
+            else:
+                emb_token_weight = self.pp_group.recv(
+                    size=(config.vocab_size, config.hidden_size),
+                    dtype=next(self.model.parameters()).dtype,
+                    src=self.pp_group.first_rank,
+                )
+                self.lm_head.weight.copy_(emb_token_weight)
+
+        self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+        # For EAGLE3 support
+        self.capture_aux_hidden_states = False
+
+    def get_input_embedding(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embedding(input_ids)
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.embed_tokens
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = False,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            input_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+        aux_hidden_states = None
+        if self.capture_aux_hidden_states:
+            hidden_states, aux_hidden_states = hidden_states
+
+        if self.pp_group.is_last_rank:
+            if not get_embedding:
+                return self.logits_processor(
+                    input_ids,
+                    hidden_states,
+                    self.lm_head,
+                    forward_batch,
+                    aux_hidden_states,
+                )
+            else:
+                return self.pooler(hidden_states, forward_batch)
+        else:
+            return hidden_states
+
+    @torch.no_grad()
+    def forward_split_prefill(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        split_interval: Tuple[int, int],  # [start, end) 0-based
+        input_embeds: torch.Tensor = None,
+    ):
+        start, end = split_interval
+        # embed
+        if start == 0:
+            if input_embeds is None:
+                forward_batch.hidden_states = self.model.embed_tokens(input_ids)
+            else:
+                forward_batch.hidden_states = input_embeds
+        # decoder layer
+        for i in range(start, end):
+            layer = self.model.layers[i]
+            forward_batch.hidden_states, forward_batch.residual = layer(
+                positions,
+                forward_batch.hidden_states,
+                forward_batch,
+                forward_batch.residual,
+            )
+
+        if end == self.model.config.num_hidden_layers:
+            # norm
+            hidden_states, _ = self.model.norm(
+                forward_batch.hidden_states, forward_batch.residual
+            )
+            forward_batch.hidden_states = hidden_states
+            # logits process
+            result = self.logits_processor(
+                input_ids, forward_batch.hidden_states, self.lm_head, forward_batch
+            )
+        else:
+            result = None
+
+        return result
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".up_proj", 1),
+            (".gate_up_proj", ".gate_proj", 0),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            layer_id = get_layer_id(name)
+            if (
+                layer_id is not None
+                and hasattr(self.model, "start_layer")
+                and (
+                    layer_id < self.model.start_layer
+                    or layer_id >= self.model.end_layer
+                )
+            ):
+                continue
+
+            if "rotary_emb.inv_freq" in name or "projector" in name:
+                continue
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                if self.pp_group.world_size > 1 and self.pp_group.is_last_rank:
+                    # Handle pp weight tying here
+                    # find the embed_tokens.weight in the weights
+                    embed_token_weights = next(
+                        filter(lambda x: x[0] == "model.embed_tokens.weight", weights)
+                    )[1]
+                    loaded_weight = embed_token_weights
+                else:
+                    continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                if name in params_dict.keys():
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+                else:
+                    logger.warning(f"Parameter {name} not found in params_dict")
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        del self.lm_head.weight
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+        self.model.load_kv_cache_scales(quantization_param_path)
+
+
+EntryClass = [Glm4ForCausalLM]
diff --git a/sglang/python/sglang/srt/models/glm4_moe.py b/sglang/python/sglang/srt/models/glm4_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..db8c1c7ce7e7da41710ae1734b35a4c1fcf1ff34
--- /dev/null
+++ b/sglang/python/sglang/srt/models/glm4_moe.py
@@ -0,0 +1,1288 @@
+# Copyright 2025-2026 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Inference-only GLM-4.5, GLM-4.6 and GLM-4.7 model compatible with HuggingFace weights"""
+
+import logging
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.batch_overlap.two_batch_overlap import model_forward_maybe_tbo
+from sglang.srt.distributed import (
+    get_moe_expert_parallel_world_size,
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    parallel_state,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.distributed.device_communicators.pynccl_allocator import (
+    use_symmetric_memory,
+)
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation
+from sglang.srt.eplb.expert_location_dispatch import ExpertLocationDispatchInfo
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.communicator import (
+    LayerCommunicator,
+    LayerScatterModes,
+    enable_moe_dense_fully_dp,
+)
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    is_allocation_symmetric,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe import (
+    get_moe_a2a_backend,
+    should_use_flashinfer_cutlass_moe_fp4_allgather,
+)
+from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
+from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.moe.utils import (
+    RoutingMethodType,
+    filter_moe_weight_param_global_expert,
+)
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.deepseek_v2 import DeepseekV2ForCausalLM
+from sglang.srt.models.utils import apply_qk_norm
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import (
+    add_prefix,
+    cpu_has_amx_support,
+    get_bool_env_var,
+    get_device_sm,
+    is_cpu,
+    is_cuda,
+    is_hip,
+    is_non_idle_and_non_empty,
+    log_info_on_rank0,
+    make_layers,
+)
+
+_is_hip = is_hip()
+_is_cuda = is_cuda()
+_is_fp8_fnuz = is_fp8_fnuz()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
+_device_sm = get_device_sm()
+
+logger = logging.getLogger(__name__)
+
+
+class Glm4MoeMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        reduce_results: bool = True,
+        prefix: str = "",
+        tp_rank: Optional[int] = None,
+        tp_size: Optional[int] = None,
+    ) -> None:
+        super().__init__()
+        self.tp_size = tp_size
+
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+            tp_rank=tp_rank,
+            tp_size=tp_size,
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+            prefix=add_prefix("down_proj", prefix),
+            tp_rank=tp_rank,
+            tp_size=tp_size,
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(
+        self,
+        x,
+        forward_batch=None,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+    ):
+        if (self.tp_size == 1) and x.shape[0] == 0:
+            return x
+
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(
+            x, skip_all_reduce=should_allreduce_fusion or use_reduce_scatter
+        )
+        return x
+
+
+class Glm4MoeAttention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 10000,
+        partial_rotary_factor: float = 0.5,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        head_dim: Optional[int] = None,
+        rms_norm_eps: float = 1e-05,
+        attention_bias: bool = True,
+        quant_config: Optional[QuantizationConfig] = None,
+        use_qk_norm: bool = False,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+
+        attn_tp_rank = get_attention_tp_rank()
+        attn_tp_size = get_attention_tp_size()
+
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % attn_tp_size == 0
+        self.num_heads = self.total_num_heads // attn_tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= attn_tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % attn_tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert attn_tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // attn_tp_size)
+        self.head_dim = head_dim or hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.use_qk_norm = use_qk_norm
+        self.max_position_embeddings = max_position_embeddings
+        self.tp_rank = get_tensor_model_parallel_rank()
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=attention_bias,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            reduce_results=False,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            partial_rotary_factor=partial_rotary_factor,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            prefix=add_prefix("attn", prefix),
+        )
+
+        if self.use_qk_norm:
+            self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
+            self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
+        self.alt_stream = alt_stream
+
+    def op_prepare(self, state):
+        state.attn_intermediate_state = self.forward_prepare(
+            positions=state.positions,
+            hidden_states=state.pop("hidden_states_after_comm_pre_attn"),
+            forward_batch=state.forward_batch,
+        )
+
+    def op_core(self, state):
+        state.hidden_states_after_attn = self.forward_core(
+            state.pop("attn_intermediate_state")
+        )
+
+    def forward_prepare(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ):
+        if hidden_states.shape[0] == 0:
+            return hidden_states, forward_batch, None
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        if self.use_qk_norm:
+            q, k = apply_qk_norm(
+                q=q,
+                k=k,
+                q_norm=self.q_norm,
+                k_norm=self.k_norm,
+                head_dim=self.head_dim,
+                alt_stream=self.alt_stream,
+            )
+        q, k = self.rotary_emb(positions, q, k)
+        inner_state = q, k, v, forward_batch
+        return None, forward_batch, inner_state
+
+    def forward_core(self, intermediate_state):
+        hidden_states, forward_batch, inner_state = intermediate_state
+        if inner_state is None:
+            return hidden_states
+        attn_output = self.attn(*inner_state)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        s = self.forward_prepare(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        return self.forward_core(s)
+
+
+class Glm4MoeGate(nn.Module):
+    def __init__(
+        self,
+        config,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.weight = nn.Parameter(
+            torch.empty((config.n_routed_experts, config.hidden_size))
+        )
+        self.e_score_correction_bias = nn.Parameter(
+            torch.empty((config.n_routed_experts), dtype=torch.float32)
+        )
+
+    def forward(self, hidden_states):
+        logits = F.linear(hidden_states, self.weight, None)
+        return logits
+
+
+class Glm4MoeSparseMoeBlock(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ):
+        nn.Module.__init__(self)
+        self.top_k = config.num_experts_per_tok
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.moe_ep_size = get_moe_expert_parallel_world_size()
+        self.routed_scaling_factor = config.routed_scaling_factor
+        self.n_shared_experts = config.n_shared_experts
+        self.num_fused_shared_experts = (
+            0
+            if get_global_server_args().disable_shared_experts_fusion
+            else config.n_shared_experts
+        )
+
+        self.config = config
+        self.layer_id = layer_id
+        self.alt_stream = alt_stream
+
+        if self.tp_size > config.n_routed_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {config.n_routed_experts}."
+            )
+
+        if config.hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {config.hidden_act}. "
+                "Only silu is supported for now."
+            )
+
+        self.gate = Glm4MoeGate(config=config, prefix=add_prefix("gate", prefix))
+
+        self.experts = get_moe_impl_class(quant_config)(
+            num_experts=config.n_routed_experts + self.num_fused_shared_experts,
+            num_fused_shared_experts=self.num_fused_shared_experts,
+            top_k=self.top_k + self.num_fused_shared_experts,
+            layer_id=self.layer_id,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            quant_config=quant_config,
+            routed_scaling_factor=self.routed_scaling_factor,
+            routing_method_type=RoutingMethodType.DeepSeekV3,
+            prefix=add_prefix("experts", prefix),
+        )
+
+        self.topk = TopK(
+            top_k=self.top_k + self.num_fused_shared_experts,
+            layer_id=self.layer_id,
+            renormalize=config.norm_topk_prob,
+            use_grouped_topk=True,
+            num_expert_group=config.n_group,
+            topk_group=config.topk_group,
+            correction_bias=self.gate.e_score_correction_bias,
+            routed_scaling_factor=self.routed_scaling_factor,
+            num_fused_shared_experts=self.num_fused_shared_experts,
+            apply_routed_scaling_factor_on_output=getattr(
+                self.experts, "should_fuse_routed_scaling_factor_in_topk", False
+            ),
+            fused_shared_experts_scaling_factor=1,
+        )
+
+        # shared expert
+        if config.n_shared_experts is not None and self.num_fused_shared_experts == 0:
+            intermediate_size = config.moe_intermediate_size * config.n_shared_experts
+            self.shared_experts = Glm4MoeMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                reduce_results=False,
+                prefix=add_prefix("shared_experts", prefix),
+                **(
+                    dict(tp_rank=0, tp_size=1)
+                    if get_moe_a2a_backend().is_deepep()
+                    or get_moe_a2a_backend().is_mooncake()
+                    or get_moe_a2a_backend().is_flashinfer()
+                    or should_use_flashinfer_cutlass_moe_fp4_allgather()
+                    else {}
+                ),
+            )
+
+        if get_moe_a2a_backend().is_deepep() or get_moe_a2a_backend().is_mooncake():
+            # TODO: we will support tp < ep in the future
+            self.ep_size = get_moe_expert_parallel_world_size()
+            self.num_experts = (
+                config.n_routed_experts
+                + get_global_server_args().ep_num_redundant_experts
+            )
+            self.renormalize = config.norm_topk_prob
+            self.topk_group = config.topk_group
+            self.num_expert_group = config.n_group
+            self.correction_bias = (
+                self.gate.e_score_correction_bias.data
+                if self.gate.e_score_correction_bias is not None
+                else None
+            )
+
+        self._enable_a2a_moe = (
+            get_moe_a2a_backend().is_deepep() or get_moe_a2a_backend().is_mooncake()
+        )
+
+    def get_moe_weights(self):
+        return [
+            x.data
+            for name, x in self.experts.named_parameters()
+            if name not in ["correction_bias"]
+            and filter_moe_weight_param_global_expert(
+                name, x, self.experts.num_local_experts
+            )
+        ]
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: Optional[ForwardBatch] = None,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+    ) -> torch.Tensor:
+
+        if not get_moe_a2a_backend().is_deepep():
+            if (
+                self.alt_stream is not None
+                and self.num_fused_shared_experts == 0
+                and hidden_states.shape[0] > 0
+                and get_is_capture_mode()
+            ):
+                return self.forward_normal_dual_stream(
+                    hidden_states, should_allreduce_fusion, use_reduce_scatter
+                )
+            else:
+                return self.forward_normal(
+                    hidden_states, should_allreduce_fusion, use_reduce_scatter
+                )
+        else:
+            return self.forward_deepep(hidden_states, forward_batch)
+
+    def forward_normal_dual_stream(
+        self,
+        hidden_states: torch.Tensor,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+    ) -> torch.Tensor:
+        current_stream = torch.cuda.current_stream()
+        self.alt_stream.wait_stream(current_stream)
+        shared_output = self._forward_shared_experts(hidden_states)
+
+        with torch.cuda.stream(self.alt_stream):
+            # router_logits: (num_tokens, n_experts)
+            router_logits = self.gate(hidden_states)
+            topk_output = self.topk(hidden_states, router_logits)
+
+            final_hidden_states = self.experts(hidden_states, topk_output)
+            if not _is_cuda and not _use_aiter:
+                # fused in biased_grouped_topk so we can skip here
+                final_hidden_states *= self.routed_scaling_factor
+
+        current_stream.wait_stream(self.alt_stream)
+
+        with use_symmetric_memory(
+            parallel_state.get_tp_group(), disabled=not is_allocation_symmetric()
+        ):
+            final_hidden_states_out = torch.empty_like(final_hidden_states)
+        torch.add(final_hidden_states, shared_output, out=final_hidden_states_out)
+        final_hidden_states = final_hidden_states_out
+        if (
+            self.tp_size > 1
+            and not should_allreduce_fusion
+            and not use_reduce_scatter
+            and not should_use_flashinfer_cutlass_moe_fp4_allgather()
+        ):
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+        return final_hidden_states
+
+    def forward_normal(
+        self,
+        hidden_states: torch.Tensor,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+    ) -> torch.Tensor:
+        if hidden_states.shape[0] > 0:
+            shared_output = self._forward_shared_experts(hidden_states)
+            # router_logits: (num_tokens, n_experts)
+            router_logits = self.gate(hidden_states)
+            topk_output = self.topk(hidden_states, router_logits)
+        else:
+            shared_output = None
+            topk_output = self.topk.empty_topk_output(hidden_states.device)
+
+        final_hidden_states = self.experts(hidden_states, topk_output)
+        if not _is_cuda and not _use_aiter:
+            final_hidden_states *= self.routed_scaling_factor
+        if shared_output is not None:
+            with use_symmetric_memory(
+                parallel_state.get_tp_group(), disabled=not is_allocation_symmetric()
+            ):
+                final_hidden_states_out = torch.empty_like(final_hidden_states)
+            torch.add(final_hidden_states, shared_output, out=final_hidden_states_out)
+            final_hidden_states = final_hidden_states_out
+        if (
+            self.tp_size > 1
+            and not should_allreduce_fusion
+            and not use_reduce_scatter
+            and not should_use_flashinfer_cutlass_moe_fp4_allgather()
+        ):
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+        return final_hidden_states
+
+    def forward_deepep(
+        self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
+    ) -> torch.Tensor:
+        shared_output = None
+        if hidden_states.shape[0] > 0:
+            # router_logits: (num_tokens, n_experts)
+            router_logits = self.gate(hidden_states)
+            shared_output = self._forward_shared_experts(hidden_states)
+            topk_output = self.topk(
+                hidden_states,
+                router_logits,
+                num_token_non_padded=forward_batch.num_token_non_padded,
+                expert_location_dispatch_info=ExpertLocationDispatchInfo.init_new(
+                    layer_id=self.layer_id,
+                ),
+            )
+        else:
+            topk_output = self.topk.empty_topk_output(hidden_states.device)
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states,
+            topk_output=topk_output,
+        )
+
+        if shared_output is not None:
+            x = shared_output
+            if self.experts.should_fuse_routed_scaling_factor_in_topk:
+                x.add_(final_hidden_states)
+            else:
+                x.add_(final_hidden_states, alpha=self.routed_scaling_factor)
+            final_hidden_states = x
+        else:
+            if not self.experts.should_fuse_routed_scaling_factor_in_topk:
+                final_hidden_states *= self.routed_scaling_factor
+
+        return final_hidden_states
+
+    def _forward_shared_experts(self, hidden_states: torch.Tensor):
+        if (hidden_states.shape[0] > 0) and (self.num_fused_shared_experts == 0):
+            return self.shared_experts(hidden_states)
+        else:
+            return None
+
+    def op_gate(self, state):
+        if is_non_idle_and_non_empty(
+            state.forward_batch.forward_mode, state.hidden_states_mlp_input
+        ):
+            # router_logits: (num_tokens, n_experts)
+            state.router_logits = self.gate(state.hidden_states_mlp_input)
+        else:
+            state.router_logits = None
+
+    def op_select_experts(self, state):
+        router_logits = state.pop("router_logits")
+        hidden_states = state.hidden_states_mlp_input
+
+        if router_logits is not None:
+            with get_global_expert_distribution_recorder().with_current_layer(
+                self.layer_id
+            ):
+                state.topk_output = self.topk(
+                    hidden_states=hidden_states,
+                    router_logits=router_logits,
+                    num_token_non_padded=state.forward_batch.num_token_non_padded,
+                    expert_location_dispatch_info=ExpertLocationDispatchInfo.init_new(
+                        layer_id=self.layer_id,
+                    ),
+                )
+        else:
+            state.topk_output = self.topk.empty_topk_output(hidden_states.device)
+
+    def op_dispatch_a(self, state):
+        if self.ep_size > 1:
+            self.experts.dispatcher.dispatch_a(
+                hidden_states=state.hidden_states_mlp_input,
+                topk_output=state.pop("topk_output"),
+                tbo_subbatch_index=state.get("tbo_subbatch_index"),
+            )
+
+    def op_dispatch_b(self, state):
+        if self.ep_size > 1:
+            with get_global_expert_distribution_recorder().with_current_layer(
+                self.layer_id
+            ):
+                state.dispatch_output = self.experts.dispatcher.dispatch_b(
+                    tbo_subbatch_index=state.get("tbo_subbatch_index"),
+                )
+
+    def op_experts(self, state):
+        state.combine_input = self.experts.run_moe_core(
+            dispatch_output=state.dispatch_output,
+        )
+
+    def op_combine_a(self, state):
+        if self.ep_size > 1:
+            self.experts.dispatcher.combine_a(
+                combine_input=state.pop("combine_input"),
+                tbo_subbatch_index=state.get("tbo_subbatch_index"),
+            )
+            state.pop("dispatch_output")
+
+    def op_combine_b(self, state):
+        if self.ep_size > 1:
+            state.hidden_states_after_combine = self.experts.dispatcher.combine_b(
+                tbo_subbatch_index=state.get("tbo_subbatch_index"),
+            )
+
+    def op_output(self, state):
+        final_hidden_states = state.pop("hidden_states_after_combine")
+
+        if (shared_output := state.pop("shared_output")) is not None:
+            x = shared_output
+            x.add_(final_hidden_states, alpha=self.routed_scaling_factor)
+            final_hidden_states = x
+        else:
+            final_hidden_states *= self.routed_scaling_factor
+
+        state.hidden_states_mlp_output = final_hidden_states
+
+
+class Glm4MoeDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        is_nextn: bool = False,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        nn.Module.__init__(self)
+        self.hidden_size = config.hidden_size
+        self.config = config
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        partial_rotary_factor = getattr(
+            getattr(config, "rope_parameters", None), "partial_rotary_factor", None
+        ) or getattr(config, "partial_rotary_factor", 0.5)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        head_dim = getattr(
+            config, "head_dim", config.hidden_size // config.num_attention_heads
+        )
+        rms_norm_eps = config.rms_norm_eps
+        attention_bias = config.attention_bias
+        self.layer_id = layer_id
+
+        use_qk_norm = config.use_qk_norm if hasattr(config, "use_qk_norm") else False
+
+        self.self_attn = Glm4MoeAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            partial_rotary_factor=partial_rotary_factor,
+            max_position_embeddings=max_position_embeddings,
+            head_dim=head_dim,
+            rms_norm_eps=rms_norm_eps,
+            attention_bias=attention_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+            use_qk_norm=use_qk_norm,
+            alt_stream=alt_stream,
+        )
+
+        self.is_layer_sparse = self._is_layer_sparse(layer_id, is_nextn=is_nextn)
+        is_previous_layer_sparse = self._is_layer_sparse(layer_id - 1, is_nextn=False)
+        is_next_layer_sparse = self._is_layer_sparse(layer_id + 1, is_nextn=False)
+
+        self.layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=layer_id,
+            num_layers=1 if is_nextn else config.num_hidden_layers,
+            is_layer_sparse=self.is_layer_sparse,
+            is_previous_layer_sparse=is_previous_layer_sparse,
+            is_next_layer_sparse=is_next_layer_sparse,
+        )
+
+        if self.is_layer_sparse:
+            self.mlp = Glm4MoeSparseMoeBlock(
+                config=config,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+                layer_id=self.layer_id,
+                alt_stream=alt_stream,
+            )
+        else:
+            if enable_moe_dense_fully_dp():
+                mlp_tp_rank, mlp_tp_size = 0, 1
+            else:
+                mlp_tp_rank, mlp_tp_size = None, None
+            self.mlp = Glm4MoeMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+                tp_rank=mlp_tp_rank,
+                tp_size=mlp_tp_size,
+            )
+
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+        self.layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.layer_scatter_modes,
+            input_layernorm=self.input_layernorm,
+            post_attention_layernorm=self.post_attention_layernorm,
+            allow_reduce_scatter=True,
+            is_last_layer=(
+                is_nextn or (self.layer_id == self.config.num_hidden_layers - 1)
+            ),
+        )
+
+    def _is_layer_sparse(self, layer_id: int, is_nextn: bool) -> bool:
+        return is_nextn or (
+            self.config.n_routed_experts is not None
+            and layer_id >= self.config.first_k_dense_replace
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+
+        hidden_states, residual = self.layer_communicator.prepare_attn(
+            hidden_states, residual, forward_batch
+        )
+
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        hidden_states, residual = self.layer_communicator.prepare_mlp(
+            hidden_states, residual, forward_batch
+        )
+
+        should_allreduce_fusion = (
+            self.layer_communicator.should_fuse_mlp_allreduce_with_next_layer(
+                forward_batch
+            )
+        )
+
+        # For DP with padding, reduce scatter can be used instead of all-reduce.
+        use_reduce_scatter = self.layer_communicator.should_use_reduce_scatter(
+            forward_batch
+        )
+
+        hidden_states = self.mlp(
+            hidden_states, forward_batch, should_allreduce_fusion, use_reduce_scatter
+        )
+
+        if should_allreduce_fusion:
+            hidden_states._sglang_needs_allreduce_fusion = True
+        else:
+            hidden_states, residual = self.layer_communicator.postprocess_layer(
+                hidden_states, residual, forward_batch
+            )
+
+        return hidden_states, residual
+
+    def op_comm_prepare_attn(
+        self,
+        state,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+        tbo_subbatch_index: Optional[int] = None,
+    ):
+        state.hidden_states_after_comm_pre_attn, state.residual_after_input_ln = (
+            self.layer_communicator.prepare_attn(hidden_states, residual, forward_batch)
+        )
+        state.update(
+            dict(
+                forward_batch=forward_batch,
+                positions=positions,
+                tbo_subbatch_index=tbo_subbatch_index,
+            )
+        )
+
+    def op_comm_prepare_mlp(self, state):
+        state.hidden_states_mlp_input, state.residual_after_comm_pre_mlp = (
+            self.layer_communicator.prepare_mlp(
+                state.pop("hidden_states_after_attn"),
+                state.pop("residual_after_input_ln"),
+                state.forward_batch,
+            )
+        )
+
+    def op_mlp(self, state):
+        hidden_states = state.pop("hidden_states_mlp_input")
+        if not (
+            enable_moe_dense_fully_dp()
+            and (not self.is_layer_sparse)
+            and hidden_states.shape[0] == 0
+        ):
+            state.hidden_states_mlp_output = self.mlp(
+                hidden_states, state.forward_batch
+            )
+        else:
+            state.hidden_states_mlp_output = hidden_states
+
+    def op_comm_postprocess_layer(self, state):
+        hidden_states, residual = self.layer_communicator.postprocess_layer(
+            state.pop("hidden_states_mlp_output"),
+            state.pop("residual_after_comm_pre_mlp"),
+            state.forward_batch,
+        )
+
+        output = dict(
+            positions=state.positions,
+            hidden_states=hidden_states,
+            residual=residual,
+            forward_batch=state.forward_batch,
+            tbo_subbatch_index=state.tbo_subbatch_index,
+        )
+
+        state.clear(
+            expect_keys={
+                "positions",
+                "forward_batch",
+                "tbo_subbatch_index",
+            }
+        )
+        return output
+
+
+class Glm4MoeModel(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.vocab_size = config.vocab_size
+        self.first_k_dense_replace = config.first_k_dense_replace
+        self.embed_dim = config.hidden_size
+        if self.pp_group.is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                use_attn_tp_group=is_dp_attention_enabled(),
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        self.alt_stream = torch.cuda.Stream() if _is_cuda else None
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: Glm4MoeDecoderLayer(
+                layer_id=idx,
+                config=config,
+                quant_config=quant_config,
+                prefix=prefix,
+                alt_stream=self.alt_stream,
+            ),
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix=add_prefix("layers", prefix),
+        )
+        if self.pp_group.is_last_rank:
+            self.norm = RMSNorm(self.embed_dim, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer(return_tuple=True)
+
+        self.layers_to_capture = []
+
+    def get_input_embeddings(self) -> torch.Tensor:
+        return self.embed_tokens
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[torch.Tensor, PPProxyTensors]:
+        if self.pp_group.is_first_rank:
+            if input_embeds is None:
+                hidden_states = self.embed_tokens(input_ids)
+            else:
+                hidden_states = input_embeds
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+
+        normal_start_layer = self.start_layer
+        normal_end_layer = self.end_layer
+        if forward_batch.can_run_tbo:
+            if (
+                self.first_k_dense_replace > normal_start_layer
+                and self.first_k_dense_replace < normal_end_layer
+            ):
+                normal_end_layer = self.first_k_dense_replace
+            elif self.first_k_dense_replace < normal_start_layer:
+                normal_end_layer = normal_start_layer = 0
+
+        aux_hidden_states = []
+        for i in range(normal_start_layer, normal_end_layer):
+            with get_global_expert_distribution_recorder().with_current_layer(i):
+                if i in self.layers_to_capture:
+                    aux_hidden_states.append(hidden_states + residual)
+                layer = self.layers[i]
+                hidden_states, residual = layer(
+                    positions,
+                    hidden_states,
+                    forward_batch,
+                    residual,
+                )
+
+        if normal_end_layer != self.end_layer:
+            hidden_states, residual = model_forward_maybe_tbo(
+                layers=self.layers[normal_end_layer : self.end_layer],
+                enable_tbo=True,
+                positions=positions,
+                forward_batch=forward_batch,
+                hidden_states=hidden_states,
+                residual=residual,
+                input_data_scatter_mode=self.layers[
+                    normal_end_layer - 1
+                ].layer_scatter_modes.layer_output_mode,
+            )
+
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors(
+                {
+                    "hidden_states": hidden_states,
+                    "residual": residual,
+                }
+            )
+        else:
+            if not forward_batch.forward_mode.is_idle():
+                if residual is None:
+                    hidden_states = self.norm(hidden_states)
+                else:
+                    hidden_states, _ = self.norm(hidden_states, residual)
+        if len(aux_hidden_states) == 0:
+            return hidden_states
+        return hidden_states, aux_hidden_states
+
+
+class Glm4MoeForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        nn.Module.__init__(self)
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.quant_config = quant_config
+        self.num_fused_shared_experts = 0
+        self.determine_num_fused_shared_experts()
+        self.model = Glm4MoeModel(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("lm_head", prefix),
+            use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+        # For EAGLE3 support
+        self.capture_aux_hidden_states = False
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.embed_tokens
+
+    def determine_num_fused_shared_experts(self):
+        if get_global_server_args().disable_shared_experts_fusion:
+            return
+
+        disable_reason = None
+        if not getattr(self.config, "n_shared_experts", None):
+            disable_reason = "No shared experts are defined in the config."
+        elif not _is_cuda:
+            disable_reason = "Shared experts fusion currently requires CUDA devices."
+        elif _is_cuda and (_device_sm is not None) and (_device_sm < 80):
+            disable_reason = "Shared experts fusion requires SM80 or newer GPUs."
+        elif get_moe_expert_parallel_world_size() > 1:
+            disable_reason = "Shared experts fusion is not supported together with expert parallelism yet."
+        elif get_moe_a2a_backend().is_deepep():
+            disable_reason = "Shared experts fusion is not supported when Deepep MoE backend is enabled."
+
+        if disable_reason is not None:
+            get_global_server_args().disable_shared_experts_fusion = True
+            log_info_on_rank0(
+                logger,
+                f"{disable_reason} Shared experts fusion optimization is disabled.",
+            )
+            return
+
+        self.num_fused_shared_experts = self.config.n_shared_experts
+        assert (
+            self.num_fused_shared_experts == 1
+        ), "Only 1 fused shared expert is supported for Glm4MoeForCausalLM"
+        log_info_on_rank0(logger, "Shared experts fusion optimization enabled.")
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids, positions, forward_batch, input_embeds, pp_proxy_tensors
+        )
+        aux_hidden_states = None
+        if self.capture_aux_hidden_states:
+            hidden_states, aux_hidden_states = hidden_states
+
+        if self.pp_group.is_last_rank:
+            return self.logits_processor(
+                input_ids, hidden_states, self.lm_head, forward_batch, aux_hidden_states
+            )
+        else:
+            return hidden_states
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]], is_nextn=False):
+        if is_nextn:
+            if hasattr(self.config, "num_nextn_predict_layers"):
+                num_nextn_layers = self.config.num_nextn_predict_layers
+                assert num_nextn_layers == 1, "Only 1 nextn layer is supported"
+                # compatible with old design
+                nextn_layer_id = (
+                    0
+                    if self.config.num_hidden_layers == 1
+                    else self.config.num_hidden_layers
+                )
+            else:
+                raise ValueError("num_nextn_predict_layers is not in the config")
+
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.n_routed_experts + self.num_fused_shared_experts,
+        )
+
+        if is_nextn:
+            nextn_layer_prefix = f"model.layers.{nextn_layer_id}"
+            nextn_spec_weight_names = [
+                "shared_head.norm",
+                "eh_proj",
+                "enorm",
+                "hnorm",
+            ]
+
+        params_dict = dict(self.named_parameters())
+        weight_names = []
+        for name, loaded_weight in weights:
+            weight_names.append(name)
+
+            if self.num_fused_shared_experts > 0 and "mlp.shared_experts" in name:
+                # Map shared expert weights to the last expert slot
+                # Shared expert becomes expert ID = n_routed_experts
+                name = name.replace(
+                    "mlp.shared_experts",
+                    f"mlp.experts.{self.config.n_routed_experts}",
+                )
+
+            if not is_nextn:
+                if hasattr(self.config, "num_nextn_predict_layers"):
+                    num_nextn_layers = self.config.num_nextn_predict_layers
+                    if num_nextn_layers > 0 and name.startswith("model.layers"):
+                        name_list = name.split(".")
+                        if (
+                            len(name_list) >= 3
+                            and int(name_list[2]) >= self.config.num_hidden_layers
+                        ):
+                            continue
+            else:
+                if not name.startswith(nextn_layer_prefix):
+                    continue
+
+                # Use shared head and embed weights from target model
+                if "shared_head.head" in name or "embed_tokens" in name:
+                    continue
+
+                is_decoder = True
+                # For nextn specific weights
+                for weight_name in nextn_spec_weight_names:
+                    if weight_name in name:
+                        name = name.replace(nextn_layer_prefix, "model")
+                        is_decoder = False
+                        break
+                # For decoder layer weights
+                if is_decoder:
+                    name = name.replace(nextn_layer_prefix, "model.decoder")
+
+            if "rotary_emb.inv_freq" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                # Skip non-stacked layers and experts (experts handled below).
+                if weight_name not in name:
+                    continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if "mlp.experts" in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Track if this is an expert weight to enable early skipping
+                is_expert_weight = False
+
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+
+                    # Mark as expert weight regardless of whether we can process it
+                    is_expert_weight = True
+
+                    name = name.replace(weight_name, param_name)
+                    if name not in params_dict:
+                        # Expert weight not on this rank, will be skipped below
+                        continue
+
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    if is_expert_weight:
+                        # This is an expert weight but not mapped to this rank, skip all remaining processing
+                        continue
+
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    if name not in params_dict:
+                        continue
+
+                    if name in params_dict.keys():
+                        param = params_dict[name]
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        weight_loader(param, loaded_weight)
+                    else:
+                        logger.warning(f"Parameter {name} not found in params_dict")
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        del self.lm_head.weight
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    @classmethod
+    def get_model_config_for_expert_location(cls, config):
+        return ModelConfigForExpertLocation(
+            num_layers=config.num_hidden_layers,
+            num_logical_experts=config.n_routed_experts,
+            num_groups=config.n_group,
+        )
+
+    def set_eagle3_layers_to_capture(self, layer_ids: Optional[List[int]] = None):
+        if not self.pp_group.is_last_rank:
+            return
+
+        if layer_ids is None:
+            self.capture_aux_hidden_states = True
+            num_layers = self.config.num_hidden_layers
+            self.model.layers_to_capture = [2, num_layers // 2, num_layers - 3]
+        else:
+            self.capture_aux_hidden_states = True
+            # we plus 1 here because in sglang, for the ith layer, it takes the output
+            # of the (i-1)th layer as aux hidden state
+            self.model.layers_to_capture = [val + 1 for val in layer_ids]
+
+
+class GlmMoeDsaForCausalLM(DeepseekV2ForCausalLM):
+    def determine_num_fused_shared_experts(self):
+        super().determine_num_fused_shared_experts("GlmMoeDsaForCausalLM")
+
+
+EntryClass = [Glm4MoeForCausalLM, GlmMoeDsaForCausalLM]
diff --git a/sglang/python/sglang/srt/models/glm4_moe_lite.py b/sglang/python/sglang/srt/models/glm4_moe_lite.py
new file mode 100644
index 0000000000000000000000000000000000000000..ffd9acc4ed0d153c8b2565a8e12ae4a9ac4a05e5
--- /dev/null
+++ b/sglang/python/sglang/srt/models/glm4_moe_lite.py
@@ -0,0 +1,808 @@
+# Copyright 2025-2026 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Inference-only GLM-Lite model compatible with HuggingFace weights"""
+
+import logging
+from typing import Iterable, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.batch_overlap.single_batch_overlap import SboFlags
+from sglang.srt.distributed import (
+    get_moe_expert_parallel_world_size,
+    get_pp_group,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.communicator import (
+    LayerCommunicator,
+    LayerScatterModes,
+    enable_moe_dense_fully_dp,
+)
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import MergedColumnParallelLinear, RowParallelLinear
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe import (
+    get_moe_a2a_backend,
+    should_use_flashinfer_cutlass_moe_fp4_allgather,
+)
+from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
+from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+from sglang.srt.layers.moe.topk import TopK, TopKOutputFormat
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.utils import PPMissingLayer
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.deepseek_v2 import (
+    DeepseekV2AttentionMLA,
+    DeepseekV2DecoderLayer,
+    DeepseekV2ForCausalLM,
+    DeepseekV2Model,
+    DeepseekV2MoE,
+)
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import (
+    BumpAllocator,
+    LazyValue,
+    add_prefix,
+    get_device_sm,
+    is_cuda,
+    log_info_on_rank0,
+    make_layers,
+)
+
+_is_cuda = is_cuda()
+_device_sm = get_device_sm()
+
+logger = logging.getLogger(__name__)
+
+
+class Glm4MoeLiteMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        reduce_results: bool = True,
+        prefix: str = "",
+        tp_rank: Optional[int] = None,
+        tp_size: Optional[int] = None,
+    ) -> None:
+        super().__init__()
+        self.tp_size = tp_size
+
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+            tp_rank=tp_rank,
+            tp_size=tp_size,
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+            prefix=add_prefix("down_proj", prefix),
+            tp_rank=tp_rank,
+            tp_size=tp_size,
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(
+        self,
+        x,
+        forward_batch=None,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+        gemm_output_zero_allocator: BumpAllocator = None,
+    ):
+        # Keep parity with DeepseekV2MLP.forward signature since DeepseekV2DecoderLayer
+        # invokes MLP modules with these extra arguments.
+        if (self.tp_size == 1) and x.shape[0] == 0:
+            return x
+
+        # Some quantization wrappers store the underlying parameter as `weight_packed`.
+        if not hasattr(self.gate_up_proj, "weight"):
+            self.gate_up_proj.weight = getattr(self.gate_up_proj, "weight_packed")
+        if not hasattr(self.down_proj, "weight"):
+            self.down_proj.weight = getattr(self.down_proj, "weight_packed")
+
+        if (
+            gemm_output_zero_allocator is not None
+            and x.shape[0] <= 256
+            and self.gate_up_proj.weight.dtype == torch.uint8
+        ):
+            y = gemm_output_zero_allocator.allocate(
+                x.shape[0] * self.gate_up_proj.output_size_per_partition
+            ).view(x.shape[0], self.gate_up_proj.output_size_per_partition)
+            x = (x, None, y)
+
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(
+            x,
+            skip_all_reduce=should_allreduce_fusion or use_reduce_scatter,
+        )
+        return x
+
+
+class Glm4MoeLiteGate(nn.Module):
+    def __init__(
+        self,
+        config,
+        prefix: str = "",
+        is_nextn: bool = False,
+    ):
+        super().__init__()
+        self.is_nextn = is_nextn
+        self.weight = nn.Parameter(
+            torch.empty((config.n_routed_experts, config.hidden_size))
+        )
+        self.e_score_correction_bias = nn.Parameter(
+            torch.empty((config.n_routed_experts), dtype=torch.float32)
+        )
+
+    def forward(self, hidden_states, gemm_output_zero_allocator: BumpAllocator = None):
+        # NOTE: For some unknown reason, router_gemm seems degrade accept length.
+        if (
+            _is_cuda
+            and not self.is_nextn
+            and hidden_states.shape[0] < 4
+            and hidden_states.shape[1] == 7168
+            and self.weight.shape[0] == 256
+            and _device_sm >= 90
+        ):
+            from sgl_kernel import dsv3_router_gemm
+
+            logits = dsv3_router_gemm(hidden_states, self.weight).to(
+                hidden_states.dtype
+            )
+        else:
+            logits = F.linear(hidden_states, self.weight, None)
+
+        return logits
+
+
+class Glm4MoeLiteSparseMoeBlock(DeepseekV2MoE):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+        is_nextn: bool = False,
+    ):
+        nn.Module.__init__(self)
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.routed_scaling_factor = config.routed_scaling_factor
+        self.n_shared_experts = config.n_shared_experts
+        self.num_fused_shared_experts = (
+            0
+            if get_global_server_args().disable_shared_experts_fusion
+            else config.n_shared_experts
+        )
+        self.config = config
+        self.layer_id = layer_id
+        self.alt_stream = alt_stream
+        self.is_nextn = is_nextn
+
+        if self.tp_size > config.n_routed_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {config.n_routed_experts}."
+            )
+
+        if config.hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {config.hidden_act}. "
+                "Only silu is supported for now."
+            )
+
+        self.gate = Glm4MoeLiteGate(
+            config=config, prefix=add_prefix("gate", prefix), is_nextn=is_nextn
+        )
+
+        self.experts = get_moe_impl_class(quant_config)(
+            num_experts=config.n_routed_experts
+            + self.num_fused_shared_experts
+            + get_global_server_args().ep_num_redundant_experts,
+            num_fused_shared_experts=self.num_fused_shared_experts,
+            top_k=config.num_experts_per_tok + self.num_fused_shared_experts,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            layer_id=self.layer_id,
+            quant_config=quant_config,
+            routed_scaling_factor=self.routed_scaling_factor,
+            prefix=add_prefix("experts", prefix),
+        )
+
+        self.topk = TopK(
+            top_k=config.num_experts_per_tok + self.num_fused_shared_experts,
+            layer_id=self.layer_id,
+            renormalize=config.norm_topk_prob,
+            use_grouped_topk=True,
+            num_expert_group=config.n_group,
+            num_fused_shared_experts=self.num_fused_shared_experts,
+            topk_group=config.topk_group,
+            correction_bias=self.gate.e_score_correction_bias,
+            quant_config=quant_config,
+            routed_scaling_factor=self.routed_scaling_factor,
+            apply_routed_scaling_factor_on_output=self.experts.should_fuse_routed_scaling_factor_in_topk,
+            # Some Fp4 MoE backends require the output format to be bypassed but the MTP layers are unquantized
+            # and requires the output format to be standard. We use quant_config to determine the output format.
+            output_format=TopKOutputFormat.STANDARD if quant_config is None else None,
+        )
+
+        self.shared_experts_is_int8 = False
+        self.shared_experts_is_fp8 = False
+        # self.shared_experts_weight_block_size = None
+        if config.n_shared_experts is not None and self.num_fused_shared_experts == 0:
+            intermediate_size = config.moe_intermediate_size * config.n_shared_experts
+            # disable tp for shared experts when enable deepep moe, or with fp4 allgather
+            self.shared_experts = Glm4MoeLiteMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                reduce_results=False,
+                prefix=add_prefix("shared_experts", prefix),
+                **(
+                    dict(tp_rank=0, tp_size=1)
+                    if get_moe_a2a_backend().is_deepep()
+                    or get_moe_a2a_backend().is_mooncake()
+                    or should_use_flashinfer_cutlass_moe_fp4_allgather()
+                    else {}
+                ),
+            )
+            is_packed_weight = hasattr(
+                self.shared_experts.gate_up_proj.quant_method, "quant_config"
+            )
+            self.shared_experts_is_int8 = (
+                not is_packed_weight
+                and self.shared_experts.gate_up_proj.weight.dtype == torch.int8
+            )
+            self.shared_experts_is_fp8 = (
+                not is_packed_weight
+                and self.shared_experts.gate_up_proj.weight.dtype == torch.float8_e4m3fn
+            )
+
+        self.top_k = config.num_experts_per_tok
+
+        if get_moe_a2a_backend().is_deepep() or get_moe_a2a_backend().is_mooncake():
+            # TODO: we will support tp < ep in the future
+            self.ep_size = get_moe_expert_parallel_world_size()
+            self.num_experts = (
+                config.n_routed_experts
+                + get_global_server_args().ep_num_redundant_experts
+            )
+            self.renormalize = config.norm_topk_prob
+            self.topk_group = config.topk_group
+            self.num_expert_group = config.n_group
+            self.correction_bias = (
+                self.gate.e_score_correction_bias.data
+                if self.gate.e_score_correction_bias is not None
+                else None
+            )
+
+        self._enable_a2a_moe = (
+            get_moe_a2a_backend().is_deepep() or get_moe_a2a_backend().is_mooncake()
+        )
+        self._fuse_shared_experts_inside_sbo = SboFlags.fuse_shared_experts_inside_sbo()
+
+
+class Glm4MoeLiteDecoderLayer(DeepseekV2DecoderLayer):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        is_nextn: bool = False,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        nn.Module.__init__(self)
+        self.hidden_size = config.hidden_size
+        self.config = config
+
+        from sglang.srt.layers.attention.nsa.utils import is_nsa_enable_prefill_cp
+
+        self.nsa_enable_prefill_cp = is_nsa_enable_prefill_cp()
+        rope_theta = 1000000
+        rope_scaling = None
+        max_position_embeddings = getattr(config, "max_position_embeddings", 202752)
+        self.layer_id = layer_id
+
+        self.self_attn = DeepseekV2AttentionMLA(
+            config=config,
+            hidden_size=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            qk_nope_head_dim=config.qk_nope_head_dim,
+            qk_rope_head_dim=config.qk_rope_head_dim,
+            v_head_dim=config.v_head_dim,
+            q_lora_rank=config.q_lora_rank,
+            kv_lora_rank=config.kv_lora_rank,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            reduce_results=False,
+            layer_id=layer_id,
+            prefix=add_prefix("self_attn", prefix),
+        )
+
+        self.is_layer_sparse = self._is_layer_sparse(layer_id, is_nextn=is_nextn)
+        is_previous_layer_sparse = self._is_layer_sparse(layer_id - 1, is_nextn=False)
+        is_next_layer_sparse = self._is_layer_sparse(layer_id + 1, is_nextn=False)
+
+        self.layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=layer_id,
+            num_layers=1 if is_nextn else config.num_hidden_layers,
+            is_layer_sparse=self.is_layer_sparse,
+            is_previous_layer_sparse=is_previous_layer_sparse,
+            is_next_layer_sparse=is_next_layer_sparse,
+        )
+
+        if self.is_layer_sparse:
+            self.mlp = Glm4MoeLiteSparseMoeBlock(
+                config=config,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+                layer_id=self.layer_id,
+                alt_stream=alt_stream,
+                is_nextn=is_nextn,
+            )
+        else:
+            if enable_moe_dense_fully_dp():
+                mlp_tp_rank, mlp_tp_size = 0, 1
+            else:
+                mlp_tp_rank, mlp_tp_size = None, None
+            self.mlp = Glm4MoeLiteMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+                tp_rank=mlp_tp_rank,
+                tp_size=mlp_tp_size,
+            )
+
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+        self.layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.layer_scatter_modes,
+            input_layernorm=self.input_layernorm,
+            post_attention_layernorm=self.post_attention_layernorm,
+            allow_reduce_scatter=True,
+            is_last_layer=(
+                is_nextn or (self.layer_id == self.config.num_hidden_layers - 1)
+            ),
+            qkv_latent_func=self.self_attn.prepare_qkv_latent,
+        )
+
+
+class Glm4MoeLiteModel(DeepseekV2Model):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        nn.Module.__init__(self)
+        self.padding_id = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.first_k_dense_replace = config.first_k_dense_replace
+        self.pp_group = get_pp_group()
+
+        # DeepseekV2Model.forward expects these attributes to exist.
+        from sglang.srt.layers.attention.nsa.utils import is_nsa_enable_prefill_cp
+
+        self.nsa_enable_prefill_cp = is_nsa_enable_prefill_cp()
+        self.cp_size = get_attention_tp_size() if self.nsa_enable_prefill_cp else None
+        self.gemm_output_zero_allocator_size = 0
+        self.llama_4_scaling_config = getattr(config, "llama_4_scaling", None)
+
+        if self.pp_group.is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                use_attn_tp_group=is_dp_attention_enabled(),
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        self.alt_stream = torch.cuda.Stream() if _is_cuda else None
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: Glm4MoeLiteDecoderLayer(
+                config=config,
+                layer_id=idx,
+                quant_config=quant_config,
+                prefix=prefix,
+                alt_stream=self.alt_stream,
+            ),
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix=add_prefix("layers", prefix),
+        )
+        if self.pp_group.is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer(return_tuple=True)
+        self.layers_to_capture = []
+
+
+class Glm4MoeLiteForCausalLM(DeepseekV2ForCausalLM):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        nn.Module.__init__(self)
+        config.moe_layer_freq = 1
+        self.config = config
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.quant_config = quant_config
+        self.pp_group = get_pp_group()
+        self.determine_num_fused_shared_experts("Glm4MoeLiteForCausalLM")
+        self.model = Glm4MoeLiteModel(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("lm_head", prefix),
+            use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+        self._routed_experts_weights_of_layer = LazyValue(
+            lambda: {
+                layer_id: layer.mlp.get_moe_weights()
+                for layer_id, layer in enumerate(self.model.layers)
+                if isinstance(layer.mlp, Glm4MoeLiteSparseMoeBlock)
+            }
+        )
+        self.capture_aux_hidden_states = False
+
+        from sglang.srt.layers.attention.nsa.utils import is_nsa_enable_prefill_cp
+
+        self.nsa_enable_prefill_cp = is_nsa_enable_prefill_cp()
+        if self.nsa_enable_prefill_cp:
+            from sglang.srt.layers.dp_attention import (
+                get_attention_tp_rank,
+                get_attention_tp_size,
+            )
+
+            self.cp_rank = get_attention_tp_rank()
+            self.cp_size = get_attention_tp_size()
+        else:
+            self.cp_rank = self.cp_size = None
+
+    def determine_num_fused_shared_experts(
+        self, architecture: str = "Glm4MoeLiteForCausalLM"
+    ):
+        self.num_fused_shared_experts = 0
+        if get_global_server_args().disable_shared_experts_fusion:
+            return
+
+        disable_reason = None
+        if (
+            not _is_cuda
+            or torch.cuda.get_device_capability("cuda") < (8, 0)
+            or self.config.architectures[0] != architecture
+            or self.config.n_shared_experts != 1
+        ):
+            disable_reason = "Only GLM-4.5 or GLM-4.6 on NV-platform with capability >= 80 can use shared experts fusion optimization."
+        elif get_moe_expert_parallel_world_size() > 1:
+            disable_reason = "GLM-4.5 or GLM-4.6 cannot use shared experts fusion optimization under expert parallelism."
+
+        if disable_reason is not None:
+            get_global_server_args().disable_shared_experts_fusion = True
+            self.num_fused_shared_experts = 0
+            log_info_on_rank0(
+                logger,
+                f"{disable_reason} Shared experts fusion optimization is disabled.",
+            )
+            return
+
+        self.num_fused_shared_experts = self.config.n_shared_experts
+
+    def load_weights(
+        self,
+        weights: Iterable[Tuple[str, torch.Tensor]],
+        is_nextn=False,
+        params_dict=None,
+        is_eagle=False,
+    ):
+        if is_nextn:
+            if hasattr(self.config, "num_nextn_predict_layers"):
+                num_nextn_layers = self.config.num_nextn_predict_layers
+                assert num_nextn_layers == 1, "Only 1 nextn layer is supported"
+                # compatible with old design
+                nextn_layer_id = (
+                    0
+                    if self.config.num_hidden_layers == 1
+                    else self.config.num_hidden_layers
+                )
+            else:
+                raise ValueError("num_nextn_predict_layers is not in the config")
+
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        if self.num_fused_shared_experts > 0:
+            assert self.num_fused_shared_experts == 1
+
+            def iter_weights_with_fused_shared_experts(
+                weights: Iterable[Tuple[str, torch.Tensor]],
+            ) -> Iterable[Tuple[str, torch.Tensor]]:
+                import re
+
+                pattern = re.compile(
+                    r"^model\.layers\.(\d+)\.mlp\.shared_experts\.(.+)$"
+                )
+                for name, weight in weights:
+                    match = pattern.match(name)
+                    if match:
+                        layer_id = int(match.group(1))
+                        suffix = match.group(2)
+                        name = f"model.layers.{layer_id}.mlp.experts.{self.config.n_routed_experts}.{suffix}"
+                    yield name, weight
+
+            weights = iter_weights_with_fused_shared_experts(weights)
+
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.n_routed_experts + self.num_fused_shared_experts,
+        )
+
+        # Fuse q_a_proj and kv_a_proj_with_mqa along output dimension when q_lora_rank is not None
+        fuse_qkv_a_proj = hasattr(self.config, "q_lora_rank") and (
+            self.config.q_lora_rank is not None
+        )
+        cached_a_proj = {} if fuse_qkv_a_proj else None
+
+        if is_nextn:
+            nextn_layer_prefix = f"model.layers.{nextn_layer_id}"
+            nextn_spec_weight_names = [
+                "shared_head.norm",
+                "eh_proj",
+                "enorm",
+                "hnorm",
+            ]
+        else:
+            nextn_layer_prefix = None
+            nextn_spec_weight_names = []
+
+        eagle_ignore_weight_names = []
+        if is_eagle:
+            eagle_ignore_weight_names = [
+                "eagle_draft_tokens_map",
+                "eagle_lm_head.weight",
+            ]
+
+        if params_dict is None:
+            params_dict = dict(self.named_parameters())
+
+        weight_names = []
+        for name, loaded_weight in weights:
+            weight_names.append(name)
+
+            if not is_nextn and not is_eagle:
+                if hasattr(self.config, "num_nextn_predict_layers"):
+                    num_nextn_layers = self.config.num_nextn_predict_layers
+                    if num_nextn_layers > 0 and name.startswith("model.layers"):
+                        name_list = name.split(".")
+                        if (
+                            len(name_list) >= 3
+                            and int(name_list[2]) >= self.config.num_hidden_layers
+                        ):
+                            continue
+            else:
+                if nextn_layer_prefix and not name.startswith(nextn_layer_prefix):
+                    continue
+
+                if nextn_layer_prefix is not None:  # mtp
+                    # Use shared head and embed weights from target model
+                    if "shared_head.head" in name or "embed_tokens" in name:
+                        continue
+
+                    is_decoder = True
+                    # For nextn specific weights
+                    for weight_name in nextn_spec_weight_names:
+                        if weight_name in name:
+                            name = name.replace(nextn_layer_prefix, "model")
+                            is_decoder = False
+                            break
+                    # For decoder layer weights
+                    if is_decoder:
+                        name = name.replace(nextn_layer_prefix, "model.decoder")
+
+            if "rotary_emb.inv_freq" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                # Skip non-stacked layers and experts (experts handled below).
+                if weight_name not in name:
+                    continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if "mlp.experts" in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Track if this is an expert weight to enable early skipping
+                is_expert_weight = False
+
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+
+                    # Mark as expert weight regardless of whether we can process it
+                    is_expert_weight = True
+
+                    name = name.replace(weight_name, param_name)
+                    if name not in params_dict:
+                        # Expert weight not on this rank, will be skipped below
+                        continue
+
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    if is_expert_weight:
+                        # This is an expert weight but not mapped to this rank, skip all remaining processing
+                        continue
+
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    if name in eagle_ignore_weight_names:
+                        continue
+
+                    # GLM NOTE: for MLA
+                    if fuse_qkv_a_proj and (
+                        "q_a_proj" in name or "kv_a_proj_with_mqa" in name
+                    ):
+                        cached_a_proj[name] = loaded_weight
+                        q_a_proj_name = (
+                            name
+                            if "q_a_proj" in name
+                            else name.replace("kv_a_proj_with_mqa", "q_a_proj")
+                        )
+                        kv_a_proj_name = (
+                            name
+                            if "kv_a_proj_with_mqa" in name
+                            else name.replace("q_a_proj", "kv_a_proj_with_mqa")
+                        )
+
+                        # When both q_a_proj and kv_a_proj_with_mqa has been cached, load the fused weight to parameter
+                        if (
+                            q_a_proj_name in cached_a_proj
+                            and kv_a_proj_name in cached_a_proj
+                        ):
+                            q_a_proj_weight = cached_a_proj[q_a_proj_name]
+                            kv_a_proj_weight = cached_a_proj[kv_a_proj_name]
+                            fused_weight = torch.cat(
+                                [q_a_proj_weight, kv_a_proj_weight], dim=0
+                            )
+                            param_name = (
+                                name.replace("q_a_proj", "fused_qkv_a_proj_with_mqa")
+                                if "q_a_proj" in name
+                                else name.replace(
+                                    "kv_a_proj_with_mqa", "fused_qkv_a_proj_with_mqa"
+                                )
+                            )
+                            if param_name not in params_dict:
+                                continue
+                            param = params_dict[param_name]
+
+                            weight_loader = getattr(
+                                param, "weight_loader", default_weight_loader
+                            )
+                            weight_loader(param, fused_weight)
+                            cached_a_proj.pop(q_a_proj_name)
+                            cached_a_proj.pop(kv_a_proj_name)
+                    else:
+                        if (
+                            "k_scale" in name or "v_scale" in name
+                        ) and name not in params_dict:
+                            # modelopt attn kv scale is named differently
+                            if any(scale in name for scale in ["k_scale", "v_scale"]):
+                                name = name.replace("_proj", "attn_mqa")
+                            else:
+                                logger.warning(
+                                    f"Unknown scale found in checkpoint: {name}"
+                                )
+
+                    if name not in params_dict:
+                        continue
+
+                    if name in params_dict.keys():
+                        param = params_dict[name]
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        weight_loader(param, loaded_weight)
+                    else:
+                        logger.warning(f"Parameter {name} not found in params_dict")
+
+        # DeepseekV2AttentionMLA.forward_* expects post_load_weights() to populate
+        # per-layer packed weights like `w_kc`/`w_vc` (used during CUDA graph capture).
+        # GLM-Lite configs may not set `config.mla`, but this model always uses
+        # DeepseekV2AttentionMLA, so we must run the post-load processing.
+        # Use weight_names=None to ensure we always process all layers. Some checkpoints /
+        # naming schemes may not include "kv_b_proj" in `weight_names`, but `w_kc`/`w_vc`
+        # are still required by DeepseekV2AttentionMLA at runtime.
+        self.post_load_weights(is_nextn=is_nextn, weight_names=None)
+
+
+EntryClass = [Glm4MoeLiteForCausalLM]
diff --git a/sglang/python/sglang/srt/models/glm4_moe_nextn.py b/sglang/python/sglang/srt/models/glm4_moe_nextn.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f6e753646cb6af1d44043e51cc07be09a184da4
--- /dev/null
+++ b/sglang/python/sglang/srt/models/glm4_moe_nextn.py
@@ -0,0 +1,162 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Inference-only GLM-4.5, GLM-4.6 Speculative Decoding."""
+
+import logging
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.layers.dp_attention import is_dp_attention_enabled
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.glm4_moe import Glm4MoeDecoderLayer, Glm4MoeForCausalLM
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import add_prefix
+
+logger = logging.getLogger(__name__)
+
+
+class Glm4MoeModelNextN(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        if quant_config is not None and quant_config.get_name() == "modelopt_fp4":
+            logger.warning(
+                "Overriding Glm4MoeForCausalLMNextN quant config for modelopt_fp4 GLM-4.5 / GLM-4.6 model."
+            )
+            quant_config = None
+
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            use_attn_tp_group=is_dp_attention_enabled(),
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+
+        self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.hnorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        self.eh_proj = nn.Linear(2 * config.hidden_size, config.hidden_size, bias=False)
+
+        self.decoder = Glm4MoeDecoderLayer(
+            config,
+            0,
+            quant_config=quant_config,
+            is_nextn=True,
+            prefix=add_prefix("decoder", prefix),
+        )
+
+        self.shared_head = nn.Module()
+        self.shared_head.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+
+        if hidden_states.shape[0] > 0:
+            hidden_states = self.eh_proj(
+                torch.cat(
+                    (
+                        self.enorm(hidden_states),
+                        self.hnorm(forward_batch.spec_info.hidden_states),
+                    ),
+                    dim=-1,
+                )
+            )
+
+        residual = None
+        with get_global_expert_distribution_recorder().disable_this_region():
+            hidden_states, residual = self.decoder(
+                positions, hidden_states, forward_batch, residual
+            )
+
+        if not forward_batch.forward_mode.is_idle():
+            if residual is not None:
+                hidden_states, _ = self.shared_head.norm(hidden_states, residual)
+            else:
+                hidden_states = self.shared_head.norm(hidden_states)
+
+        return hidden_states
+
+
+class Glm4MoeForCausalLMNextN(Glm4MoeForCausalLM):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        nn.Module.__init__(self)
+        self.config = config
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.quant_config = quant_config
+        self.model = Glm4MoeModelNextN(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("model.shared_head.head", prefix),
+            use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+        self.num_fused_shared_experts = (
+            0 if get_global_server_args().disable_shared_experts_fusion else 1
+        )
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        super().load_weights(weights, is_nextn=True)
+
+
+EntryClass = [Glm4MoeForCausalLMNextN]
diff --git a/sglang/python/sglang/srt/models/glm4v.py b/sglang/python/sglang/srt/models/glm4v.py
new file mode 100644
index 0000000000000000000000000000000000000000..44f9837a958d8d5c412506f6959754edf76aacb8
--- /dev/null
+++ b/sglang/python/sglang/srt/models/glm4v.py
@@ -0,0 +1,818 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Modeling from:
+# ./llama.py and
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/glm4v/modular_glm4v.py
+"""Inference-only GLM-4.1V model compatible with HuggingFace weights."""
+
+import logging
+from functools import lru_cache
+from typing import Iterable, List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+from transformers.models.glm4v.configuration_glm4v import Glm4vConfig, Glm4vVisionConfig
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.distributed.parallel_state import get_pp_group
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.attention import vision_utils
+from sglang.srt.layers.attention.vision import VisionAttention
+from sglang.srt.layers.layernorm import LayerNorm, RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer
+from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternMultimodalTokens,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInputs
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.glm4 import Glm4Model
+from sglang.srt.multimodal.mm_utils import run_dp_sharded_mrope_vision_model
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import add_prefix, is_npu
+from sglang.srt.utils.hf_transformers_utils import get_processor
+
+logger = logging.getLogger(__name__)
+
+cached_get_processor = lru_cache(get_processor)
+
+
+class Glm4vRMSNorm(RMSNorm):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        original_shape = x.shape
+        x_2d = x.contiguous().reshape(-1, original_shape[-1])
+        x_2d = super().forward(x_2d)
+        x = x_2d.reshape(original_shape)
+        return x
+
+
+class Glm4vVisionMLP(nn.Module):
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: int,
+        bias: bool = False,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        use_data_parallel: bool = False,
+    ):
+        super().__init__()
+        self.tp_size = (
+            1 if use_data_parallel else get_tensor_model_parallel_world_size()
+        )
+        self.tp_rank = 0 if use_data_parallel else get_tensor_model_parallel_rank()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            input_size=in_features,
+            output_sizes=[hidden_features] * 2,  # [gate_proj, up_proj]
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+            tp_size=self.tp_size,
+            tp_rank=self.tp_rank,
+        )
+        self.down_proj = RowParallelLinear(
+            hidden_features,
+            in_features,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+            tp_size=self.tp_size,
+            tp_rank=self.tp_rank,
+        )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x: torch.Tensor):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class Glm4vVisionBlock(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        intermediate_dim: int,
+        num_heads: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        attn_qkv_bias: bool = True,
+        num_dummy_heads: int = 0,
+        rms_norm_eps: float = 1e-5,
+        use_data_parallel: bool = False,
+    ) -> None:
+        super().__init__()
+        self.norm1 = RMSNorm(dim, eps=rms_norm_eps)
+        self.norm2 = RMSNorm(dim, eps=rms_norm_eps)
+
+        self.attn = VisionAttention(
+            embed_dim=dim,
+            num_heads=num_heads,
+            projection_size=dim,
+            use_qkv_parallel=True,
+            proj_bias=False,
+            qkv_bias=attn_qkv_bias,
+            flatten_batch=True,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+            num_dummy_heads=num_dummy_heads,
+            use_data_parallel=use_data_parallel,
+        )
+        self.mlp = Glm4vVisionMLP(
+            dim,
+            intermediate_dim,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+            use_data_parallel=use_data_parallel,
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb_cos: torch.Tensor,
+        rotary_pos_emb_sin: torch.Tensor,
+    ) -> torch.Tensor:
+        S, B, H = x.shape
+        # norm1: flatten to 2D -> [S*B, H], then reshape back
+        x2d = x.reshape(-1, H)
+        hidden_states = self.norm1(x2d).reshape(S, B, H)
+
+        # Attention expects [B, S, H]
+        hidden_states = rearrange(hidden_states, "s b h -> b s h")
+        attn = self.attn(
+            hidden_states,
+            cu_seqlens=cu_seqlens,
+            rotary_pos_emb_cos=rotary_pos_emb_cos,
+            rotary_pos_emb_sin=rotary_pos_emb_sin,
+        )
+        attn = rearrange(attn, "b s h -> s b h")
+
+        # norm2 with fused residual-add: also 2D
+        attn2d = attn.reshape(-1, H)
+        x_norm_2d, x_after_add_2d = self.norm2(x2d, residual=attn2d)
+        x_norm = x_norm_2d.reshape(S, B, H)
+        x_after_add = x_after_add_2d.reshape(S, B, H)
+
+        # MLP and final residual
+        mlp_out = self.mlp(x_norm)
+        x = x_after_add + mlp_out
+        return x
+
+
+class Glm4vVisionPatchEmbed(nn.Module):
+    def __init__(
+        self,
+        patch_size: int = 14,
+        temporal_patch_size: int = 2,
+        in_channels: int = 3,
+        hidden_size: int = 1536,
+    ) -> None:
+        super().__init__()
+        self.patch_size = patch_size
+        self.temporal_patch_size = temporal_patch_size
+        self.hidden_size = hidden_size
+        self.in_channels = in_channels
+
+        kernel_size = (temporal_patch_size, patch_size, patch_size)
+        self.proj = nn.Conv3d(
+            in_channels,
+            hidden_size,
+            kernel_size=kernel_size,
+            stride=kernel_size,
+            bias=True,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = x.view(
+            -1,
+            self.in_channels,
+            self.temporal_patch_size,
+            self.patch_size,
+            self.patch_size,
+        )
+        x = self.proj(x).view(-1, self.hidden_size)
+        return x
+
+
+class Glm4vPatchMerger(nn.Module):
+    def __init__(
+        self,
+        d_model: int,
+        context_dim: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        bias: bool = False,
+        prefix: str = "",
+        use_data_parallel: bool = False,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = d_model
+        tp_size = 1 if use_data_parallel else get_tensor_model_parallel_world_size()
+        tp_rank = 0 if use_data_parallel else get_tensor_model_parallel_rank()
+        self.proj = ReplicatedLinear(
+            self.hidden_size,
+            self.hidden_size,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("proj", prefix),
+        )
+        self.post_projection_norm = LayerNorm(self.hidden_size)
+        self.gate_up_proj = MergedColumnParallelLinear(
+            input_size=self.hidden_size,
+            output_sizes=[context_dim] * 2,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+            tp_size=tp_size,
+            tp_rank=tp_rank,
+        )
+        self.down_proj = RowParallelLinear(
+            context_dim,
+            self.hidden_size,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+            tp_size=tp_size,
+            tp_rank=tp_rank,
+        )
+        self.extra_activation_func = nn.GELU()
+
+    def forward(self, x: torch.Tensor):
+        x, _ = self.proj(x)
+        x = self.extra_activation_func(self.post_projection_norm(x))
+        gate_up, _ = self.gate_up_proj(x)
+        gate, up = gate_up.chunk(2, dim=-1)
+        x = F.silu(gate) * up
+        x, _ = self.down_proj(x)
+        return x
+
+
+class Glm4vVisionEmbeddings(nn.Module):
+    def __init__(self, config: Glm4vVisionConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+
+        self.num_patches = (self.image_size // self.patch_size) ** 2
+        self.num_positions = self.num_patches
+        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+        self.register_buffer(
+            "position_ids",
+            torch.arange(self.num_positions).expand((1, -1)),
+            persistent=False,
+        )
+
+    def forward(
+        self, embeddings, lengths, image_shapes, h_coords, w_coords
+    ) -> torch.Tensor:
+        pos_embed_weight = self.position_embedding.weight
+        hidden_size = pos_embed_weight.shape[1]
+        total_seq = h_coords.shape[0]
+        device = pos_embed_weight.device
+
+        # Move coordinates to correct device
+        h_coords, w_coords = h_coords.to(device), w_coords.to(device)
+
+        # Handle empty sequence case
+        if total_seq == 0:
+            adapted_pos_embed = torch.empty(
+                0, hidden_size, device=device, dtype=pos_embed_weight.dtype
+            )
+        else:
+            # Convert inputs to tensors if needed
+            if isinstance(lengths, list):
+                lengths = torch.tensor(lengths, device=device, dtype=torch.long)
+            if not isinstance(image_shapes, torch.Tensor):
+                image_shapes = torch.tensor(
+                    image_shapes, device=device, dtype=torch.long
+                )
+
+            # Prepare 2D position embedding
+            orig_size_sq = pos_embed_weight.shape[0]
+            orig_size = int(orig_size_sq**0.5)
+            pos_embed_2d = (
+                pos_embed_weight.view(orig_size, orig_size, hidden_size)
+                .permute(2, 0, 1)
+                .unsqueeze(0)
+                .to(device=device, dtype=torch.float32)
+            )
+
+            # Calculate target dimensions for each patch
+            target_h = torch.cat(
+                [image_shapes[i, 1].repeat(lengths[i]) for i in range(len(lengths))]
+            ).to(device=device, dtype=torch.float32)
+            target_w = torch.cat(
+                [image_shapes[i, 2].repeat(lengths[i]) for i in range(len(lengths))]
+            ).to(device=device, dtype=torch.float32)
+
+            # Normalize coordinates to [-1, 1] range for grid_sample
+            h_coords = h_coords.to(device=device, dtype=torch.float32)
+            w_coords = w_coords.to(device=device, dtype=torch.float32)
+            norm_w = ((w_coords + 0.5) / target_w) * 2 - 1
+            norm_h = ((h_coords + 0.5) / target_h) * 2 - 1
+
+            # Create sampling grid
+            grid = torch.stack((norm_w, norm_h), dim=-1).unsqueeze(0).unsqueeze(2)
+
+            # Perform bicubic interpolation
+            interpolated_embed_fp32 = F.grid_sample(
+                pos_embed_2d,
+                grid,
+                mode="bicubic",
+                align_corners=False,
+                padding_mode="border",
+            )
+
+            # Reshape and convert back to original dtype
+            adapted_pos_embed_fp32 = (
+                interpolated_embed_fp32.squeeze(0).squeeze(-1).permute(1, 0)
+            )
+            adapted_pos_embed = adapted_pos_embed_fp32.to(pos_embed_weight.dtype).to(
+                embeddings.device
+            )
+
+        # Add adapted position encoding to embeddings
+        embeddings = embeddings + adapted_pos_embed
+        return embeddings
+
+
+class Glm4vVisionModel(nn.Module):
+    def __init__(
+        self,
+        vision_config: Glm4vVisionConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        use_data_parallel: bool = False,
+    ) -> None:
+        super().__init__()
+
+        patch_size = vision_config.patch_size
+        temporal_patch_size = vision_config.temporal_patch_size
+        in_channels = vision_config.in_channels
+        depth = vision_config.depth
+        self.hidden_size = vision_config.hidden_size
+        self.num_heads = vision_config.num_heads
+
+        self.patch_size = vision_config.patch_size
+        self.spatial_merge_size = vision_config.spatial_merge_size
+        self.out_hidden_size = vision_config.out_hidden_size
+        self.use_data_parallel = use_data_parallel
+
+        self.patch_embed = Glm4vVisionPatchEmbed(
+            patch_size=patch_size,
+            temporal_patch_size=temporal_patch_size,
+            in_channels=in_channels,
+            hidden_size=self.hidden_size,
+        )
+
+        head_dim = self.hidden_size // self.num_heads
+        self.rotary_pos_emb = get_rope(
+            head_size=head_dim,
+            rotary_dim=head_dim // 2,
+            max_position=8192,
+            base=10000.0,
+            is_neox_style=True,
+        )
+
+        self.blocks = nn.ModuleList(
+            [
+                Glm4vVisionBlock(
+                    dim=self.hidden_size,
+                    intermediate_dim=self.out_hidden_size,
+                    num_heads=self.num_heads,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"blocks.{layer_idx}", prefix),
+                    rms_norm_eps=vision_config.rms_norm_eps,
+                    attn_qkv_bias=vision_config.attention_bias,
+                    use_data_parallel=use_data_parallel,
+                )
+                for layer_idx in range(depth)
+            ]
+        )
+
+        self.merger = Glm4vPatchMerger(
+            d_model=vision_config.out_hidden_size,
+            context_dim=vision_config.intermediate_size,
+            quant_config=quant_config,
+            bias=False,
+            prefix=add_prefix("merger", prefix),
+            use_data_parallel=use_data_parallel,
+        )
+
+        self.embeddings = Glm4vVisionEmbeddings(vision_config)
+
+        self.post_conv_layernorm = Glm4vRMSNorm(
+            vision_config.hidden_size, eps=vision_config.rms_norm_eps
+        )
+        self.downsample = nn.Conv2d(
+            in_channels=vision_config.hidden_size,
+            out_channels=vision_config.out_hidden_size,
+            kernel_size=vision_config.spatial_merge_size,
+            stride=vision_config.spatial_merge_size,
+        )
+        self.post_layernorm = Glm4vRMSNorm(
+            vision_config.hidden_size, eps=vision_config.rms_norm_eps
+        )
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return self.patch_embed.proj.weight.dtype
+
+    @property
+    def device(self) -> torch.device:
+        return self.patch_embed.proj.weight.device
+
+    def rot_pos_emb(
+        self, grid_thw: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        pos_ids = []
+        for t, h, w in grid_thw:
+            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
+            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
+            hpos_ids = (
+                hpos_ids.reshape(
+                    h // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                    w // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                )
+                .permute(0, 2, 1, 3)
+                .flatten()
+            )
+            wpos_ids = (
+                wpos_ids.reshape(
+                    h // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                    w // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                )
+                .permute(0, 2, 1, 3)
+                .flatten()
+            )
+            pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
+        pos_ids = torch.cat(pos_ids, dim=0).to(self.device, non_blocking=True)
+        max_grid_size = grid_thw[:, 1:].max()
+
+        # Use pre-computed cos_sin_cache from RotaryEmbedding
+        cos, sin = self.rotary_pos_emb.get_cos_sin(max_grid_size)
+
+        cos_combined = cos[pos_ids].flatten(1)
+        sin_combined = sin[pos_ids].flatten(1)
+        return cos_combined, sin_combined, pos_ids
+
+    def forward(self, x: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor:
+        # patchify
+        x = x.to(device=self.device, dtype=self.dtype)
+        x = self.patch_embed(x)
+        x = self.post_conv_layernorm(x)
+
+        # compute position embedding
+        rotary_pos_emb_cos, rotary_pos_emb_sin, image_type_ids = self.rot_pos_emb(
+            grid_thw
+        )
+        # compute cu_seqlens
+        cu_seqlens = torch.repeat_interleave(
+            grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
+        ).cumsum(dim=0, dtype=torch.int32)
+        cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens])
+
+        seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
+        x = self.embeddings(
+            x, seqlens, grid_thw, image_type_ids[:, 0], image_type_ids[:, 1]
+        )
+
+        rotary_pos_emb_cos = torch.cat([rotary_pos_emb_cos, rotary_pos_emb_cos], dim=-1)
+        rotary_pos_emb_sin = torch.cat([rotary_pos_emb_sin, rotary_pos_emb_sin], dim=-1)
+
+        # cu_seqlens must be on cpu because of npu_flash_attention_unpad operator restriction
+        if is_npu():
+            cu_seqlens = cu_seqlens.to("cpu")
+
+        # x.shape: (s, b, d) where b=1 for vision processing
+        # transformers
+        x = x.unsqueeze(1)
+        for blk in self.blocks:
+            x = blk(
+                x,
+                cu_seqlens=cu_seqlens,
+                rotary_pos_emb_cos=rotary_pos_emb_cos,
+                rotary_pos_emb_sin=rotary_pos_emb_sin,
+            )
+
+        # adapter
+        x = self.post_layernorm(x)
+        x = x.view(-1, self.spatial_merge_size, self.spatial_merge_size, x.shape[-1])
+        x = x.permute(0, 3, 1, 2)
+        x = self.downsample(x).view(-1, self.out_hidden_size)
+        x = self.merger(x)
+
+        return x
+
+
+class Glm4vForConditionalGeneration(nn.Module):
+    def __init__(
+        self,
+        config: Glm4vConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.use_data_parallel = get_global_server_args().mm_enable_dp_encoder
+        self.visual = Glm4vVisionModel(
+            config.vision_config,
+            quant_config=quant_config,
+            prefix=add_prefix("visual", prefix),
+            use_data_parallel=self.use_data_parallel,
+        )
+
+        vision_utils.update_vit_attn_dummy_heads_config(self.config)
+
+        self.model = Glm4Model(
+            config,
+            quant_config=quant_config,
+            prefix=add_prefix("model", prefix),
+        )
+
+        if self.pp_group.is_last_rank:
+            if self.pp_group.world_size == 1 and self.config.tie_word_embeddings:
+                self.lm_head = self.model.embed_tokens
+            else:
+                self.lm_head = ParallelLMHead(
+                    self.config.vocab_size,
+                    self.config.hidden_size,
+                    quant_config=quant_config,
+                    prefix=add_prefix("lm_head", prefix),
+                )
+        else:
+            # ranks other than the last rank will have a placeholder layer
+            self.lm_head = PPMissingLayer()
+
+        self.is_mrope_enabled = "mrope_section" in self.config.rope_scaling
+
+        self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+
+        # For EAGLE3 support
+        self.capture_aux_hidden_states = False
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+        return pattern.pad_input_tokens(input_ids, mm_inputs)
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        # in GLM-V, last dim is the same
+        pixel_values = torch.cat([item.feature for item in items], dim=0).type(
+            self.visual.dtype
+        )
+        image_grid_thw = torch.concat([item.image_grid_thw for item in items], dim=0)
+        assert pixel_values.dim() == 2, pixel_values.dim()
+        assert image_grid_thw.dim() == 2, image_grid_thw.dim()
+        if self.use_data_parallel:
+            return run_dp_sharded_mrope_vision_model(
+                self.visual, pixel_values, image_grid_thw.tolist(), rope_type="rope_3d"
+            )
+        else:
+            image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
+        return image_embeds
+
+    def get_video_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        # in GLM-V, last dim is the same
+        pixel_values = torch.cat([item.feature for item in items], dim=0).type(
+            self.visual.dtype
+        )
+        video_grid_thw = torch.concat([item.video_grid_thw for item in items], dim=0)
+
+        # reshape video_grid_thw -> [b, 3] -> [1, h, w] * frames
+        temp_frames_hw = []
+        for t, h, w in video_grid_thw:
+            repeated_row = (
+                torch.tensor([1, h.item(), w.item()]).unsqueeze(0).repeat(t, 1)
+            )
+            temp_frames_hw.append(repeated_row)
+        flattened_video_grid_thw = torch.cat(temp_frames_hw, dim=0)
+
+        assert pixel_values.dim() == 2, pixel_values.dim()
+        assert video_grid_thw.dim() == 2, video_grid_thw.dim()
+        if self.use_data_parallel:
+            return run_dp_sharded_mrope_vision_model(
+                self.visual,
+                pixel_values,
+                flattened_video_grid_thw.tolist(),
+                rope_type="rope_3d",
+            )
+        else:
+            video_embeds = self.visual(pixel_values, grid_thw=flattened_video_grid_thw)
+        return video_embeds
+
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        get_embedding: bool = False,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ):
+        """Run forward pass for GLM-4.1V.
+
+        Args:
+            input_ids: Flattened (concatenated) input_ids corresponding to a
+                batch.
+            positions: Flattened (concatenated) position ids corresponding to a
+                batch.
+                **NOTE**: If mrope is enabled (default setting for GLM-4.1V
+                opensource models), the shape will be `(3, seq_len)`,
+                otherwise it will be `(seq_len,).
+                (Use input_metadata.mrope_positions to replace it)
+        """
+        if self.is_mrope_enabled:
+            positions = forward_batch.mrope_positions
+
+        if not (
+            forward_batch.forward_mode.is_decode()
+            or not forward_batch.contains_image_inputs()
+        ):
+            if self.is_mrope_enabled:
+                assert positions.ndim == 2 and positions.size(0) == 3, (
+                    "multimodal section rotary embedding requires "
+                    f"(3, seq_len) positions, but got {positions.size()}"
+                )
+
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.model,
+            multimodal_model=self,
+            positions=positions,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+
+        aux_hidden_states = None
+        if self.capture_aux_hidden_states:
+            hidden_states, aux_hidden_states = hidden_states
+
+        if self.pp_group.is_last_rank:
+            if not get_embedding:
+                return self.logits_processor(
+                    input_ids,
+                    hidden_states,
+                    self.lm_head,
+                    forward_batch,
+                )
+            else:
+                return self.pooler(hidden_states, forward_batch)
+        else:
+            return hidden_states
+
+    def _pad_vit_attn_dummy_heads(self, name: str, loaded_weight: torch.Tensor):
+        """pad attn qkv weights for dummy heads"""
+        num_dummy_heads = self.config.vision_config.num_dummy_heads
+        if num_dummy_heads == 0:
+            return loaded_weight
+        head_dim = self.config.vision_config.head_dim
+
+        if "attn.qkv_proj" in name:
+            wq, wk, wv = loaded_weight.chunk(3, dim=0)
+            if name.endswith(".weight"):
+                dummy_shape = [num_dummy_heads, head_dim, wq.shape[-1]]
+            elif name.endswith(".bias"):
+                dummy_shape = [num_dummy_heads, head_dim]
+            else:
+                raise RuntimeError(f"Unsupported weight with name={name}")
+            pad_func = lambda x: torch.cat(
+                [x.unflatten(0, (-1, head_dim)), x.new_zeros(dummy_shape)], dim=0
+            ).flatten(0, 1)
+            wq, wk, wv = pad_func(wq), pad_func(wk), pad_func(wv)
+            loaded_weight = torch.cat([wq, wk, wv], dim=0)
+        elif "attn.proj.weight" in name:
+            padded_weight = loaded_weight.new_zeros(
+                loaded_weight.shape[0], head_dim * num_dummy_heads
+            )
+            loaded_weight = torch.cat([loaded_weight, padded_weight], dim=-1)
+        elif "attn.q_norm.weight" in name or "attn.k_norm.weight" in name:
+            padded_weight = loaded_weight.new_zeros(head_dim * num_dummy_heads)
+            loaded_weight = torch.cat([loaded_weight, padded_weight], dim=0)
+        return loaded_weight
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".up_proj", 1),
+            (".gate_up_proj", ".gate_proj", 0),
+        ]
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+
+        # For the PP case, we add special handling for lm_head.weight,
+        # - On non–last ranks: we continue, because this stage is supposed to
+        #   be just an empty PPMissingLayer shell.
+        # - On the last rank: params_dict is expected to contain lm_head.weight,
+        #   so it will never hit the branch "if name not in params_dict".
+        #
+        # For all other parameters, such like
+        # "model.visual.blocks.20.mlp.gate_proj.weight", the unified rule is:
+        # If this name does not exist in the current rank’s params_dict,
+        # it does not belong to this pipeline stage, thus we simply continue.
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if "language_model" in name:
+                name = name.replace(r"model.language_model.", r"model.")
+            if "model.visual." in name:
+                name = name.replace("model.visual.", "visual.")
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                if name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if "visual" in name:
+                    # adapt to VisionAttention
+                    name = name.replace(r"attn.qkv.", r"attn.qkv_proj.")
+
+                try:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+
+                    if name not in params_dict:
+                        continue
+
+                    param = params_dict[name]
+                except KeyError:
+                    print(params_dict.keys())
+                    raise
+
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                if "visual" in name:
+                    loaded_weight = vision_utils.pad_vit_attn_dummy_heads(
+                        self.config, name, loaded_weight
+                    )
+                weight_loader(param, loaded_weight)
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        self.model.embed_tokens.weight = embed
+        if self.config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            del self.lm_head.weight
+            self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+
+EntryClass = [Glm4vForConditionalGeneration]
diff --git a/sglang/python/sglang/srt/models/glm4v_moe.py b/sglang/python/sglang/srt/models/glm4v_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..324de18b49b7c20385d705683373b70f9d4b66aa
--- /dev/null
+++ b/sglang/python/sglang/srt/models/glm4v_moe.py
@@ -0,0 +1,285 @@
+import logging
+from functools import lru_cache
+from typing import Iterable, Optional, Tuple
+
+import torch
+import torch.nn as nn
+from transformers.models.glm4v_moe.configuration_glm4v_moe import Glm4vMoeConfig
+
+from sglang.srt.distributed import (
+    get_moe_expert_parallel_world_size,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.distributed.parallel_state import get_pp_group
+from sglang.srt.layers.attention import vision_utils
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe import get_moe_a2a_backend
+from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.utils import PPMissingLayer
+from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.glm4_moe import Glm4MoeModel
+from sglang.srt.models.glm4v import Glm4vForConditionalGeneration, Glm4vVisionModel
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import add_prefix, get_device_sm, is_cuda, log_info_on_rank0
+from sglang.srt.utils.hf_transformers_utils import get_processor
+
+_is_cuda = is_cuda()
+_device_sm = get_device_sm()
+
+logger = logging.getLogger(__name__)
+
+cached_get_processor = lru_cache(get_processor)
+
+
+class Glm4vMoeForConditionalGeneration(Glm4vForConditionalGeneration):
+    def __init__(
+        self,
+        config: Glm4vMoeConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        nn.Module.__init__(self)
+
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.use_data_parallel = get_global_server_args().mm_enable_dp_encoder
+        vision_utils.update_vit_attn_dummy_heads_config(self.config)
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.quant_config = quant_config
+        self.num_fused_shared_experts = 0
+        self.determine_num_fused_shared_experts()
+
+        self.model = Glm4MoeModel(
+            config,
+            quant_config,
+            prefix=add_prefix("language_model", prefix),
+        )
+        self.visual = Glm4vVisionModel(
+            config.vision_config,
+            quant_config=quant_config,
+            prefix=add_prefix("visual", prefix),
+            use_data_parallel=self.use_data_parallel,
+        )
+
+        if self.pp_group.is_last_rank:
+            if self.pp_group.world_size == 1 and self.config.tie_word_embeddings:
+                self.lm_head = self.model.embed_tokens
+            else:
+                self.lm_head = ParallelLMHead(
+                    config.vocab_size,
+                    config.hidden_size,
+                    quant_config=quant_config,
+                    prefix=add_prefix("lm_head", prefix),
+                    use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
+                )
+        else:
+            # ranks other than the last rank will have a placeholder layer
+            self.lm_head = PPMissingLayer()
+
+        self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+        self.is_mrope_enabled = "mrope_section" in self.config.rope_scaling
+
+        # For EAGLE3 support
+        self.capture_aux_hidden_states = False
+
+    def determine_num_fused_shared_experts(self):
+        if get_global_server_args().disable_shared_experts_fusion:
+            return
+
+        disable_reason = None
+        if not getattr(self.config, "n_shared_experts", None):
+            disable_reason = "No shared experts are defined in the config."
+        elif not _is_cuda:
+            disable_reason = "Shared experts fusion currently requires CUDA devices."
+        elif _is_cuda and (_device_sm is not None) and (_device_sm < 80):
+            disable_reason = "Shared experts fusion requires SM80 or newer GPUs."
+        elif get_moe_expert_parallel_world_size() > 1:
+            disable_reason = "Shared experts fusion is not supported together with expert parallelism yet."
+        elif get_moe_a2a_backend().is_deepep():
+            disable_reason = "Shared experts fusion is not supported when Deepep MoE backend is enabled."
+
+        if disable_reason is not None:
+            get_global_server_args().disable_shared_experts_fusion = True
+            log_info_on_rank0(
+                logger,
+                f"{disable_reason} Shared experts fusion optimization is disabled.",
+            )
+            return
+
+        self.num_fused_shared_experts = self.config.n_shared_experts
+        assert (
+            self.num_fused_shared_experts == 1
+        ), "Only 1 fused shared expert is supported for Glm4vMoeForConditionalGeneration"
+        log_info_on_rank0(logger, "Shared experts fusion optimization enabled.")
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]], is_nextn=False):
+        if is_nextn:
+            if hasattr(self.config, "num_nextn_predict_layers"):
+                num_nextn_layers = self.config.num_nextn_predict_layers
+                assert num_nextn_layers == 1, "Only 1 nextn layer is supported"
+                # compatible with old design
+                nextn_layer_id = (
+                    0
+                    if self.config.num_hidden_layers == 1
+                    else self.config.num_hidden_layers
+                )
+            else:
+                raise ValueError("num_nextn_predict_layers is not in the config")
+
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.n_routed_experts + self.num_fused_shared_experts,
+        )
+
+        if is_nextn:
+            nextn_layer_prefix = f"model.layers.{nextn_layer_id}"
+            nextn_spec_weight_names = [
+                "shared_head.norm",
+                "eh_proj",
+                "enorm",
+                "hnorm",
+            ]
+
+        params_dict = dict(self.named_parameters())
+        weight_names = []
+        for name, loaded_weight in weights:
+            weight_names.append(name)
+
+            if self.num_fused_shared_experts > 0 and "mlp.shared_experts" in name:
+                # Shared expert becomes expert ID = n_routed_experts
+                name = name.replace(
+                    "mlp.shared_experts",
+                    f"mlp.experts.{self.config.n_routed_experts}",
+                )
+
+            if not is_nextn:
+                if hasattr(self.config, "num_nextn_predict_layers"):
+                    num_nextn_layers = self.config.num_nextn_predict_layers
+                    if num_nextn_layers > 0 and name.startswith("model.layers"):
+                        name_list = name.split(".")
+                        if (
+                            len(name_list) >= 3
+                            and int(name_list[2]) >= self.config.num_hidden_layers
+                        ):
+                            continue
+            else:
+                if not name.startswith(nextn_layer_prefix):
+                    continue
+
+                # Use shared head and embed weights from target model
+                if "shared_head.head" in name or "embed_tokens" in name:
+                    continue
+
+                is_decoder = True
+                # For nextn specific weights
+                for weight_name in nextn_spec_weight_names:
+                    if weight_name in name:
+                        name = name.replace(nextn_layer_prefix, "model")
+                        is_decoder = False
+                        break
+                # For decoder layer weights
+                if is_decoder:
+                    name = name.replace(nextn_layer_prefix, "model.decoder")
+
+            if "language_model." in name:
+                name = name.replace("language_model.", "")
+            if "model.visual." in name:
+                name = name.replace("model.visual.", "visual.")
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                # Skip non-stacked layers and experts (experts handled below).
+                if weight_name not in name:
+                    continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if "mlp.experts" in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Track if this is an expert weight to enable early skipping
+                is_expert_weight = False
+
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+
+                    # Mark as expert weight regardless of whether we can process it
+                    is_expert_weight = True
+
+                    name = name.replace(weight_name, param_name)
+                    if name not in params_dict:
+                        # Expert weight not on this rank, will be skipped below
+                        continue
+
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    if is_expert_weight:
+                        # This is an expert weight but not mapped to this rank, skip all remaining processing
+                        continue
+
+                    if "visual" in name:
+                        # adapt to VisionAttention for GLM-V
+                        name = name.replace(r"attn.qkv.", r"attn.qkv_proj.")
+
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    if name not in params_dict:
+                        continue
+
+                    if name in params_dict.keys():
+                        param = params_dict[name]
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        if "visual" in name:
+                            loaded_weight = vision_utils.pad_vit_attn_dummy_heads(
+                                self.config, name, loaded_weight
+                            )
+                        weight_loader(param, loaded_weight)
+                    else:
+                        logger.warning(f"Parameter {name} not found in params_dict")
+
+
+EntryClass = [Glm4vMoeForConditionalGeneration]
diff --git a/sglang/python/sglang/srt/models/glm_ocr.py b/sglang/python/sglang/srt/models/glm_ocr.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2f4adc4c566b2db01dfdfb6d7442b0fcad90b38
--- /dev/null
+++ b/sglang/python/sglang/srt/models/glm_ocr.py
@@ -0,0 +1,435 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Modeling from:
+# ./llama.py and
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/GlmOcr/modular_GlmOcr.py
+"""Inference-only GLM-OCR model compatible with HuggingFace weights."""
+
+import logging
+from functools import lru_cache
+from typing import Iterable, Optional, Tuple
+
+import torch
+import torch.nn as nn
+from einops import rearrange
+from transformers.models.glm_ocr.configuration_glm_ocr import (
+    GlmOcrConfig,
+    GlmOcrVisionConfig,
+)
+
+from sglang.srt.distributed.parallel_state import get_pp_group
+from sglang.srt.layers.attention import vision_utils
+from sglang.srt.layers.attention.vision import VisionAttention
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer
+from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.glm4 import Glm4Model
+from sglang.srt.models.glm4v import (
+    Glm4vForConditionalGeneration,
+    Glm4vPatchMerger,
+    Glm4vRMSNorm,
+    Glm4vVisionMLP,
+    Glm4vVisionModel,
+    Glm4vVisionPatchEmbed,
+)
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import add_prefix
+from sglang.srt.utils.hf_transformers_utils import get_processor
+
+logger = logging.getLogger(__name__)
+
+cached_get_processor = lru_cache(get_processor)
+
+
+class GlmOcrRMSNorm(Glm4vRMSNorm):
+    pass
+
+
+class GlmOcrVisionMLP(Glm4vVisionMLP):
+    pass
+
+
+class GlmOcrVisionBlock(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        intermediate_dim: int,
+        num_heads: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        attn_qkv_bias: bool = True,
+        num_dummy_heads: int = 0,
+        rms_norm_eps: float = 1e-5,
+        use_data_parallel: bool = False,
+    ) -> None:
+        super().__init__()
+        self.norm1 = RMSNorm(dim, eps=rms_norm_eps)
+        self.norm2 = RMSNorm(dim, eps=rms_norm_eps)
+        self.attn = VisionAttention(
+            embed_dim=dim,
+            num_heads=num_heads,
+            projection_size=dim,
+            use_qkv_parallel=True,
+            qkv_bias=attn_qkv_bias,
+            proj_bias=True,
+            qk_normalization_by_head_size=True,
+            flatten_batch=True,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+            num_dummy_heads=num_dummy_heads,
+            use_data_parallel=use_data_parallel,
+        )
+        self.mlp = GlmOcrVisionMLP(
+            dim,
+            intermediate_dim,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+            use_data_parallel=use_data_parallel,
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb_cos: torch.Tensor,
+        rotary_pos_emb_sin: torch.Tensor,
+    ) -> torch.Tensor:
+        S, B, H = x.shape
+        # norm1: flatten to 2D -> [S*B, H], then reshape back
+        x2d = x.reshape(-1, H)
+        hidden_states = self.norm1(x2d).reshape(S, B, H)
+
+        # Attention expects [B, S, H]
+        hidden_states = rearrange(hidden_states, "s b h -> b s h")
+        attn = self.attn(
+            hidden_states,
+            cu_seqlens=cu_seqlens,
+            rotary_pos_emb_cos=rotary_pos_emb_cos,
+            rotary_pos_emb_sin=rotary_pos_emb_sin,
+        )
+        attn = rearrange(attn, "b s h -> s b h")
+
+        # norm2 with fused residual-add: also 2D
+        attn2d = attn.reshape(-1, H)
+        x_norm_2d, x_after_add_2d = self.norm2(x2d, residual=attn2d)
+        x_norm = x_norm_2d.reshape(S, B, H)
+        x_after_add = x_after_add_2d.reshape(S, B, H)
+
+        # MLP and final residual
+        mlp_out = self.mlp(x_norm)
+        x = x_after_add + mlp_out
+        return x
+
+
+class GlmOcrVisionPatchEmbed(Glm4vVisionPatchEmbed):
+    pass
+
+
+class GlmOcrVisionPatchMerger(Glm4vPatchMerger):
+    pass
+
+
+class GlmOcrVisionModel(Glm4vVisionModel):
+    def __init__(
+        self,
+        vision_config: GlmOcrVisionConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        use_data_parallel: bool = False,
+    ) -> None:
+        super().__init__(vision_config, quant_config, prefix, use_data_parallel)
+
+        patch_size = vision_config.patch_size
+        temporal_patch_size = vision_config.temporal_patch_size
+        in_channels = vision_config.in_channels
+        depth = vision_config.depth
+        self.hidden_size = vision_config.hidden_size
+        self.num_heads = vision_config.num_heads
+
+        self.patch_size = vision_config.patch_size
+        self.spatial_merge_size = vision_config.spatial_merge_size
+        self.out_hidden_size = vision_config.out_hidden_size
+        self.intermediate_size = vision_config.intermediate_size
+        self.use_data_parallel = use_data_parallel
+
+        self.patch_embed = GlmOcrVisionPatchEmbed(
+            patch_size=patch_size,
+            temporal_patch_size=temporal_patch_size,
+            in_channels=in_channels,
+            hidden_size=self.hidden_size,
+        )
+
+        head_dim = self.hidden_size // self.num_heads
+        self.rotary_pos_emb = get_rope(
+            head_size=head_dim,
+            rotary_dim=head_dim // 2,
+            max_position=8192,
+            base=10000.0,
+            is_neox_style=True,
+        )
+
+        self.blocks = nn.ModuleList(
+            [
+                GlmOcrVisionBlock(
+                    dim=self.hidden_size,
+                    intermediate_dim=self.intermediate_size,
+                    num_heads=self.num_heads,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"blocks.{layer_idx}", prefix),
+                    rms_norm_eps=vision_config.rms_norm_eps,
+                    attn_qkv_bias=vision_config.attention_bias,
+                    use_data_parallel=use_data_parallel,
+                )
+                for layer_idx in range(depth)
+            ]
+        )
+        self.merger = GlmOcrVisionPatchMerger(
+            d_model=vision_config.out_hidden_size,
+            context_dim=vision_config.out_hidden_size * vision_config.in_channels,
+            quant_config=quant_config,
+            bias=False,
+            prefix=add_prefix("merger", prefix),
+            use_data_parallel=use_data_parallel,
+        )
+
+        self.downsample = nn.Conv2d(
+            in_channels=vision_config.hidden_size,
+            out_channels=vision_config.out_hidden_size,
+            kernel_size=vision_config.spatial_merge_size,
+            stride=vision_config.spatial_merge_size,
+        )
+        self.post_layernorm = GlmOcrRMSNorm(
+            vision_config.hidden_size, eps=vision_config.rms_norm_eps
+        )
+
+    def forward(self, x: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor:
+        # patchify
+        x = x.to(device=self.device, dtype=self.dtype)
+        x = self.patch_embed(x)
+
+        # compute position embedding
+        rotary_pos_emb_cos, rotary_pos_emb_sin, image_type_ids = self.rot_pos_emb(
+            grid_thw
+        )
+        # compute cu_seqlens
+        cu_seqlens = torch.repeat_interleave(
+            grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
+        ).cumsum(dim=0, dtype=torch.int32)
+        cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens])
+
+        rotary_pos_emb_cos = torch.cat([rotary_pos_emb_cos, rotary_pos_emb_cos], dim=-1)
+        rotary_pos_emb_sin = torch.cat([rotary_pos_emb_sin, rotary_pos_emb_sin], dim=-1)
+
+        # x.shape: (s, b, d) where b=1 for vision processing
+        # transformers
+        x = x.unsqueeze(1)
+        for blk in self.blocks:
+            x = blk(
+                x,
+                cu_seqlens=cu_seqlens,
+                rotary_pos_emb_cos=rotary_pos_emb_cos,
+                rotary_pos_emb_sin=rotary_pos_emb_sin,
+            )
+
+        # adapter
+        x = self.post_layernorm(x)
+        x = x.view(-1, self.spatial_merge_size, self.spatial_merge_size, x.shape[-1])
+        x = x.permute(0, 3, 1, 2)
+        x = self.downsample(x).view(-1, self.out_hidden_size)
+        x = self.merger(x)
+
+        return x
+
+
+class GlmOcrForConditionalGeneration(Glm4vForConditionalGeneration):
+    def __init__(
+        self,
+        config: GlmOcrConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(config, quant_config, prefix)
+
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.use_data_parallel = get_global_server_args().mm_enable_dp_encoder
+        self.visual = GlmOcrVisionModel(
+            vision_config=config.vision_config,
+            quant_config=quant_config,
+            prefix=add_prefix("visual", prefix),
+            use_data_parallel=self.use_data_parallel,
+        )
+
+        vision_utils.update_vit_attn_dummy_heads_config(self.config)
+
+        self.model = Glm4Model(
+            config,
+            quant_config=quant_config,
+            prefix=add_prefix("model", prefix),
+        )
+
+        if self.pp_group.is_last_rank:
+            if self.pp_group.world_size == 1 and self.config.tie_word_embeddings:
+                self.lm_head = self.model.embed_tokens
+            else:
+                self.lm_head = ParallelLMHead(
+                    self.config.vocab_size,
+                    self.config.hidden_size,
+                    quant_config=quant_config,
+                    prefix=add_prefix("lm_head", prefix),
+                )
+        else:
+            # ranks other than the last rank will have a placeholder layer
+            self.lm_head = PPMissingLayer()
+
+        self.is_mrope_enabled = "mrope_section" in self.config.rope_scaling
+
+        self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+
+        # For EAGLE3 support
+        self.capture_aux_hidden_states = False
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]], is_nextn=False):
+        if is_nextn:
+            if hasattr(self.config, "num_nextn_predict_layers"):
+                num_nextn_layers = self.config.num_nextn_predict_layers
+                assert num_nextn_layers == 1, "Only 1 nextn layer is supported"
+                # compatible with old design
+                nextn_layer_id = (
+                    0
+                    if self.config.num_hidden_layers == 1
+                    else self.config.num_hidden_layers
+                )
+            else:
+                raise ValueError("num_nextn_predict_layers is not in the config")
+
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".up_proj", 1),
+            (".gate_up_proj", ".gate_proj", 0),
+        ]
+
+        if is_nextn:
+            nextn_layer_prefix = f"model.layers.{nextn_layer_id}"
+            nextn_spec_weight_names = [
+                "shared_head.norm",
+                "eh_proj",
+                "enorm",
+                "hnorm",
+            ]
+
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+
+        # For the PP case, we add special handling for lm_head.weight,
+        # - On non–last ranks: we continue, because this stage is supposed to
+        #   be just an empty PPMissingLayer shell.
+        # - On the last rank: params_dict is expected to contain lm_head.weight,
+        #   so it will never hit the branch "if name not in params_dict".
+        #
+        # For all other parameters, such like
+        # "model.visual.blocks.20.mlp.gate_proj.weight", the unified rule is:
+        # If this name does not exist in the current rank’s params_dict,
+        # it does not belong to this pipeline stage, thus we simply continue.
+
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if "language_model" in name:
+                name = name.replace(r"model.language_model.", r"model.")
+            if "model.visual." in name:
+                name = name.replace("model.visual.", "visual.")
+
+            if not is_nextn:
+                if hasattr(self.config, "num_nextn_predict_layers"):
+                    num_nextn_layers = self.config.num_nextn_predict_layers
+                    if num_nextn_layers > 0 and name.startswith("model.layers"):
+                        name_list = name.split(".")
+                        if (
+                            len(name_list) >= 3
+                            and int(name_list[2]) >= self.config.num_hidden_layers
+                        ):
+                            continue
+            else:
+                if not name.startswith(nextn_layer_prefix):
+                    continue
+
+                # Use shared head and embed weights from target model
+                if "shared_head.head" in name or "embed_tokens" in name:
+                    continue
+
+                is_decoder = True
+                # For nextn specific weights
+                for weight_name in nextn_spec_weight_names:
+                    if weight_name in name:
+                        name = name.replace(nextn_layer_prefix, "model")
+                        is_decoder = False
+                        break
+                # For decoder layer weights
+                if is_decoder:
+                    name = name.replace(nextn_layer_prefix, "model.decoder")
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                if name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if "visual" in name:
+                    # adapt to VisionAttention
+                    name = name.replace(r"attn.qkv.", r"attn.qkv_proj.")
+
+                try:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+
+                    if name not in params_dict:
+                        continue
+
+                    param = params_dict[name]
+                except KeyError:
+                    print(params_dict.keys())
+                    raise
+
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                if "visual" in name:
+                    loaded_weight = vision_utils.pad_vit_attn_dummy_heads(
+                        self.config, name, loaded_weight
+                    )
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = [GlmOcrForConditionalGeneration]
diff --git a/sglang/python/sglang/srt/models/glm_ocr_nextn.py b/sglang/python/sglang/srt/models/glm_ocr_nextn.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae771af53b318ee669c857b8d59f7db3ed7d5024
--- /dev/null
+++ b/sglang/python/sglang/srt/models/glm_ocr_nextn.py
@@ -0,0 +1,162 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Inference-only GLM-OCR Speculative Decoding."""
+
+import logging
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.layers.dp_attention import is_dp_attention_enabled
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.glm4 import Glm4DecoderLayer
+from sglang.srt.models.glm_ocr import GlmOcrForConditionalGeneration
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import add_prefix
+
+logger = logging.getLogger(__name__)
+
+
+class GlmOcrModelNextN(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        if quant_config is not None and quant_config.get_name() == "modelopt_fp4":
+            logger.warning(
+                "Overriding GlmOcrModelNextN quant config for modelopt_fp4 GLM-OCR model."
+            )
+            quant_config = None
+
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            enable_tp=not is_dp_attention_enabled(),
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+
+        self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.hnorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        self.eh_proj = nn.Linear(2 * config.hidden_size, config.hidden_size, bias=False)
+
+        self.decoder = Glm4DecoderLayer(
+            config,
+            0,
+            quant_config=quant_config,
+            prefix=add_prefix("decoder", prefix),
+        )
+
+        self.shared_head = nn.Module()
+        self.shared_head.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+
+        if hidden_states.shape[0] > 0:
+            hidden_states = self.eh_proj(
+                torch.cat(
+                    (
+                        self.enorm(hidden_states),
+                        self.hnorm(forward_batch.spec_info.hidden_states),
+                    ),
+                    dim=-1,
+                )
+            )
+
+        residual = None
+        with get_global_expert_distribution_recorder().disable_this_region():
+            hidden_states, residual = self.decoder(
+                positions, hidden_states, forward_batch, residual
+            )
+
+        if not forward_batch.forward_mode.is_idle():
+            if residual is not None:
+                hidden_states, _ = self.shared_head.norm(hidden_states, residual)
+            else:
+                hidden_states = self.shared_head.norm(hidden_states)
+
+        return hidden_states
+
+
+class GlmOcrForConditionalGenerationNextN(GlmOcrForConditionalGeneration):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        nn.Module.__init__(self)
+        self.config = config
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.quant_config = quant_config
+        self.model = GlmOcrModelNextN(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("model.shared_head.head", prefix),
+            use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+        self.num_fused_shared_experts = (
+            0 if get_global_server_args().disable_shared_experts_fusion else 1
+        )
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        super().load_weights(weights, is_nextn=True)
+
+
+EntryClass = [GlmOcrForConditionalGenerationNextN]
diff --git a/sglang/python/sglang/srt/models/glmasr.py b/sglang/python/sglang/srt/models/glmasr.py
new file mode 100644
index 0000000000000000000000000000000000000000..d19f80f92d9dbb793767f2180a3c9b18fa07840c
--- /dev/null
+++ b/sglang/python/sglang/srt/models/glmasr.py
@@ -0,0 +1,171 @@
+# Copyright 2023-2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Modeling from:
+# ./llama.py and
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/glmasr/modular_glmasr.py
+"""Inference-only GLM-ASR-HF model compatible with HuggingFace weights."""
+
+import logging
+from typing import Any, Iterable, List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+from transformers import GlmAsrConfig, GlmAsrEncoderConfig
+from transformers.models.glmasr.modeling_glmasr import (
+    GlmAsrEncoder,
+    GlmAsrMultiModalProjector,
+)
+
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternMultimodalTokens,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.llama import LlamaForCausalLM
+from sglang.srt.utils import add_prefix
+
+logger = logging.getLogger(__name__)
+
+
+class GlmAsrForConditionalGeneration(nn.Module):
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    def __init__(
+        self,
+        config: GlmAsrConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+
+        if getattr(self.config, "audio_config", None) is None:
+            self.config.audio_config = GlmAsrEncoderConfig(self.config._name_or_path)
+
+        self.audio_tower = GlmAsrEncoder(
+            config.audio_config,
+        )
+        self.multi_modal_projector = GlmAsrMultiModalProjector(config)
+        self.language_model = LlamaForCausalLM(
+            config.text_config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        return self.pattern.pad_input_tokens(input_ids, mm_inputs)
+
+    def get_audio_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        # Extract audio features from input items
+        input_features = torch.cat([item.feature for item in items], dim=0).type(
+            self.audio_tower.dtype
+        )
+
+        audio_embeds = self.audio_tower(input_features).last_hidden_state
+        audio_embeds = audio_embeds.reshape(
+            -1, self.config.audio_config.intermediate_size
+        )
+        audio_embeds = self.multi_modal_projector(audio_embeds)
+
+        return audio_embeds
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        **kwargs: Any,
+    ) -> torch.Tensor:
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.language_model,
+            data_embedding_funcs={
+                Modality.AUDIO: self.get_audio_feature,
+            },
+            positions=positions,
+        )
+
+        return hidden_states
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            if self.config.text_config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name or "audio_tower" in name:
+                    continue
+                name_tmp = name.replace(weight_name, param_name)
+
+                # Skip loading extra bias for GPTQ models.
+                if name_tmp.endswith(".bias") and name_tmp not in params_dict:
+                    continue
+                param = params_dict[name_tmp]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                try:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                except KeyError:
+                    print(params_dict.keys())
+                    raise
+
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = GlmAsrForConditionalGeneration
diff --git a/sglang/python/sglang/srt/models/gpt2.py b/sglang/python/sglang/srt/models/gpt2.py
new file mode 100644
index 0000000000000000000000000000000000000000..6dac103e2ad9d1f722d2b074a3f7e68e63adb650
--- /dev/null
+++ b/sglang/python/sglang/srt/models/gpt2.py
@@ -0,0 +1,289 @@
+# coding=utf-8
+# Adapted from
+# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py
+# Copyright 2023 The vLLM team.
+# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only GPT-2 model compatible with HuggingFace weights."""
+
+from typing import Iterable, Optional, Tuple, Type
+
+import torch
+from torch import nn
+from transformers import GPT2Config
+
+from sglang.srt.distributed.parallel_state import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import NewGELU
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix
+
+
+class GPT2Attention(nn.Module):
+
+    def __init__(
+        self,
+        layer_id: int,
+        config: GPT2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        total_num_heads = config.num_attention_heads
+        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
+        assert total_num_heads % tensor_model_parallel_world_size == 0
+        self.num_heads = total_num_heads // tensor_model_parallel_world_size
+        self.head_dim = self.hidden_size // total_num_heads
+        self.scale = self.head_dim**-0.5
+
+        self.c_attn = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            total_num_heads,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("c_attn", prefix),
+        )
+        self.c_proj = RowParallelLinear(
+            self.hidden_size,
+            self.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("c_proj", prefix),
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            scaling=self.scale,
+            num_kv_heads=total_num_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.c_attn(hidden_states)
+        q, k, v = qkv.chunk(chunks=3, dim=-1)
+        attn_output = self.attn(q, k, v, forward_batch)
+        attn_output, _ = self.c_proj(attn_output)
+        return attn_output
+
+
+class GPT2MLP(nn.Module):
+
+    def __init__(
+        self,
+        intermediate_size: int,
+        config: GPT2Config,
+        act_layer: Type[nn.Module] = NewGELU,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        hidden_size = config.hidden_size
+        self.c_fc = ColumnParallelLinear(
+            hidden_size,
+            intermediate_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("c_fc", prefix),
+        )
+        self.c_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("c_proj", prefix),
+        )
+        self.act = act_layer()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        hidden_states, _ = self.c_fc(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states, _ = self.c_proj(hidden_states)
+        return hidden_states
+
+
+class GPT2Block(nn.Module):
+
+    def __init__(
+        self,
+        layer_id: int,
+        config: GPT2Config,
+        act_layer: Type[nn.Module] = NewGELU,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        hidden_size = config.hidden_size
+        inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
+
+        self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+        self.attn = GPT2Attention(
+            layer_id, config, quant_config, prefix=add_prefix("attn", prefix)
+        )
+        self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+        self.mlp = GPT2MLP(
+            inner_dim,
+            config,
+            act_layer=act_layer,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.ln_1(hidden_states)
+        attn_output = self.attn(
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        # residual connection
+        hidden_states = attn_output + residual
+
+        residual = hidden_states
+        hidden_states = self.ln_2(hidden_states)
+        feed_forward_hidden_states = self.mlp(hidden_states)
+        # residual connection
+        hidden_states = residual + feed_forward_hidden_states
+        return hidden_states
+
+
+class GPT2Model(nn.Module):
+
+    def __init__(
+        self,
+        config: GPT2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        assert not config.add_cross_attention
+        assert not config.scale_attn_by_inverse_layer_idx
+        assert not config.reorder_and_upcast_attn
+        self.embed_dim = config.hidden_size
+        self.wte = VocabParallelEmbedding(config.vocab_size, self.embed_dim)
+        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
+        self.h = nn.ModuleList(
+            [
+                GPT2Block(
+                    i,
+                    config,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"h.{i}", prefix),
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+
+        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        inputs_embeds = self.wte(input_ids)
+        position_embeds = self.wpe(position_ids)
+        hidden_states = inputs_embeds + position_embeds
+
+        for i in range(len(self.h)):
+            layer = self.h[i]
+            hidden_states = layer(hidden_states, forward_batch)
+
+        hidden_states = self.ln_f(hidden_states)
+        return hidden_states
+
+
+class GPT2LMHeadModel(nn.Module):
+
+    def __init__(
+        self,
+        config: GPT2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.transformer = GPT2Model(
+            config, quant_config, prefix=add_prefix("transformer", prefix)
+        )
+        self.lm_head = self.transformer.wte
+
+        self.logits_processor = LogitsProcessor(config)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.transformer(input_ids, positions, forward_batch)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in weights:
+            if "lm_head.weight" in name:
+                # GPT-2 ties the weights of the embedding layer and the final
+                # linear layer.
+                continue
+            if ".attn.bias" in name or ".attn.masked_bias" in name:
+                # Skip attention mask.
+                # NOTE: "c_attn.bias" should not be skipped.
+                continue
+            if not name.startswith("transformer."):
+                name = "transformer." + name
+
+            param = params_dict[name]
+            # The HF's GPT-2 implementation uses Conv1D instead of Linear.
+            # Because of this, we need to transpose the weights.
+            # Note(zhuohan): the logic below might break quantized models.
+            for conv1d_weight_name in ["c_attn", "c_proj", "c_fc"]:
+                if conv1d_weight_name not in name:
+                    continue
+                if not name.endswith(".weight"):
+                    continue
+                loaded_weight = loaded_weight.t()
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            weight_loader(param, loaded_weight)
+
+
+EntryClass = GPT2LMHeadModel
diff --git a/sglang/python/sglang/srt/models/gpt_bigcode.py b/sglang/python/sglang/srt/models/gpt_bigcode.py
new file mode 100644
index 0000000000000000000000000000000000000000..f49a5d304198233adc1077f36dc0446d4880b5eb
--- /dev/null
+++ b/sglang/python/sglang/srt/models/gpt_bigcode.py
@@ -0,0 +1,304 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from:
+# https://github.com/vllm-project/vllm/blob/07eb6f19f3b0ee9f7adf6eb689607028aa40bfd5/vllm/model_executor/models/gpt_bigcode.py
+"""Inference-only GPTBigCode model compatible with HuggingFace weights."""
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import GPTBigCodeConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import get_act_fn
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix
+
+
+class GPTBigCodeAttention(nn.Module):
+
+    def __init__(
+        self,
+        layer_id: int,
+        config: GPTBigCodeConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        total_num_heads = config.num_attention_heads
+        self.tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
+        assert total_num_heads % self.tensor_model_parallel_world_size == 0
+        self.num_heads = total_num_heads // self.tensor_model_parallel_world_size
+        self.head_dim = self.hidden_size // total_num_heads
+        self.scale = self.head_dim**-0.5
+
+        self.multi_query = config.multi_query
+        if self.multi_query:
+            total_num_kv_heads = 1
+            self.num_kv_heads = 1
+        else:
+            total_num_kv_heads = total_num_heads
+            self.num_kv_heads = self.num_heads
+        self.kv_dim = self.head_dim * self.num_kv_heads
+        self.c_attn = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            total_num_heads,
+            total_num_kv_heads,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("c_attn", prefix),
+        )
+
+        self.c_proj = RowParallelLinear(
+            self.hidden_size,
+            self.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("c_proj", prefix),
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            scaling=self.scale,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.c_attn(hidden_states)
+        q, k, v = qkv.split(
+            [
+                self.hidden_size // self.tensor_model_parallel_world_size,
+                self.kv_dim,
+                self.kv_dim,
+            ],
+            dim=-1,
+        )
+        attn_output = self.attn(q, k, v, forward_batch)
+        attn_output, _ = self.c_proj(attn_output)
+        return attn_output
+
+
+class GPTBigMLP(nn.Module):
+
+    def __init__(
+        self,
+        intermediate_size: int,
+        config: GPTBigCodeConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        hidden_size = config.hidden_size
+        self.c_fc = ColumnParallelLinear(
+            hidden_size,
+            intermediate_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("c_fc", prefix),
+        )
+        self.c_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("c_proj", prefix),
+        )
+        self.act = get_act_fn(
+            config.activation_function, quant_config, intermediate_size
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states, _ = self.c_fc(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states, _ = self.c_proj(hidden_states)
+        return hidden_states
+
+
+class GPTBigCodeBlock(nn.Module):
+
+    def __init__(
+        self,
+        layer_id: int,
+        config: GPTBigCodeConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        hidden_size = config.hidden_size
+        inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
+
+        self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+        self.attn = GPTBigCodeAttention(
+            layer_id, config, quant_config, prefix=add_prefix("attn", prefix)
+        )
+        self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+        self.mlp = GPTBigMLP(
+            inner_dim, config, quant_config, prefix=add_prefix("mlp", prefix)
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.ln_1(hidden_states)
+        attn_output = self.attn(
+            hidden_states=hidden_states, forward_batch=forward_batch
+        )
+        # residual connection
+        hidden_states = attn_output + residual
+
+        residual = hidden_states
+        hidden_states = self.ln_2(hidden_states)
+        feed_forward_hidden_states = self.mlp(hidden_states)
+        # residual connection
+        hidden_states = residual + feed_forward_hidden_states
+        return hidden_states
+
+
+class GPTBigCodeModel(nn.Module):
+
+    def __init__(
+        self,
+        config: GPTBigCodeConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        assert not config.add_cross_attention
+
+        self.embed_dim = config.hidden_size
+        lora_vocab = 0
+        self.vocab_size = config.vocab_size + lora_vocab
+        self.wte = VocabParallelEmbedding(
+            self.vocab_size,
+            self.embed_dim,
+            org_num_embeddings=config.vocab_size,
+            prefix=add_prefix("wte", prefix),
+        )
+        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
+        self.h = nn.ModuleList(
+            [
+                GPTBigCodeBlock(
+                    i, config, quant_config, prefix=add_prefix(f"h.{i}", prefix)
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        inputs_embeds = self.wte(input_ids)
+        position_embeds = self.wpe(position_ids)
+        hidden_states = inputs_embeds + position_embeds
+
+        for i in range(len(self.h)):
+            layer = self.h[i]
+            hidden_states = layer(hidden_states, forward_batch)
+
+        hidden_states = self.ln_f(hidden_states)
+        return hidden_states
+
+
+class GPTBigCodeForCausalLM(nn.Module):
+    packed_modules_mapping = {"c_attn": ["c_attn"]}
+
+    supported_lora_modules = ["c_fc", "c_proj", "wte", "c_attn"]
+
+    embedding_modules = {
+        "wte": "input_embeddings",
+        "lm_head": "output_embeddings",
+    }
+
+    embedding_padding_modules = []
+
+    def __init__(
+        self,
+        config: GPTBigCodeConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.config = config
+
+        self.quant_config = quant_config
+        self.transformer = GPTBigCodeModel(
+            config, quant_config, prefix=add_prefix("transformer", prefix)
+        )
+        self.lm_head = self.transformer.wte
+        self.unpadded_vocab_size = config.vocab_size
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.transformer(input_ids, positions, forward_batch)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in weights:
+            if "lm_head.weight" in name:
+                continue
+            if ".attn.bias" in name:
+                # Skip attention mask.
+                # NOTE: "c_attn.bias" should not be skipped.
+                continue
+            param = params_dict[name]
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            # TODO (@robertgshaw2-neuralmagic): move to fp8 linear method
+            if "c_attn.input_scale" in name or "c_attn.weight_scale" in name:
+                weight_loader(param, loaded_weight, "q")
+                weight_loader(param, loaded_weight, "k")
+                weight_loader(param, loaded_weight, "v")
+            else:
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = GPTBigCodeForCausalLM
diff --git a/sglang/python/sglang/srt/models/gpt_j.py b/sglang/python/sglang/srt/models/gpt_j.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4fc61eaa1ef0d2bb208b7b0e6183d2db6794035
--- /dev/null
+++ b/sglang/python/sglang/srt/models/gpt_j.py
@@ -0,0 +1,326 @@
+# Copyright 2023-2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/gpt_j.py
+"""Inference-only GPT-J model compatible with HuggingFace weights."""
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import GPTJConfig
+
+from sglang.srt.distributed.parallel_state import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import get_act_fn
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.utils import add_prefix
+
+
+class GPTJAttention(nn.Module):
+
+    def __init__(
+        self,
+        layer_id: int,
+        config: GPTJConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        total_num_heads = config.num_attention_heads
+        hidden_size = config.hidden_size
+        head_dim = hidden_size // total_num_heads
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            head_dim,
+            total_num_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.out_proj = RowParallelLinear(
+            hidden_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("out_proj", prefix),
+        )
+
+        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
+        assert total_num_heads % tensor_model_parallel_world_size == 0
+        num_heads = total_num_heads // tensor_model_parallel_world_size
+
+        scaling = head_dim**-0.5
+        assert getattr(config, "rotary", True)
+        assert config.rotary_dim % 2 == 0
+        rope_theta = getattr(config, "rope_theta", 10000)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        self.rotary_emb = get_rope(
+            head_dim,
+            rotary_dim=config.rotary_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            is_neox_style=False,
+        )
+        self.attn = RadixAttention(
+            num_heads,
+            head_dim,
+            scaling=scaling,
+            num_kv_heads=num_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.chunk(chunks=3, dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        attn_output, _ = self.out_proj(attn_output)
+        return attn_output
+
+
+class GPTJMLP(nn.Module):
+
+    def __init__(
+        self,
+        intermediate_size: int,
+        config: GPTJConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        hidden_size = config.n_embd
+        self.fc_in = ColumnParallelLinear(
+            hidden_size,
+            intermediate_size,
+            quant_config=quant_config,
+            prefix=add_prefix("fc_in", prefix),
+        )
+        self.fc_out = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("fc_out", prefix),
+        )
+
+        self.act = get_act_fn(config.activation_function)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states, _ = self.fc_in(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states, _ = self.fc_out(hidden_states)
+        return hidden_states
+
+
+class GPTJBlock(nn.Module):
+
+    def __init__(
+        self,
+        layer_id: int,
+        config: GPTJConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        inner_dim = 4 * config.n_embd if config.n_inner is None else config.n_inner
+        self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
+        self.attn = GPTJAttention(
+            layer_id,
+            config,
+            quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+        self.mlp = GPTJMLP(
+            inner_dim,
+            config,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.ln_1(hidden_states)
+        attn_output = self.attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        mlp_output = self.mlp(hidden_states)
+        hidden_states = attn_output + mlp_output + residual
+        return hidden_states
+
+
+class GPTJModel(nn.Module):
+
+    def __init__(
+        self,
+        config: GPTJConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        embed_dim = config.n_embd
+        self.wte = VocabParallelEmbedding(
+            config.vocab_size,
+            embed_dim,
+        )
+        self.h = nn.ModuleList(
+            [
+                GPTJBlock(
+                    i,
+                    config,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"h.{i}", prefix),
+                )
+                for i in range(config.n_layer)
+            ]
+        )
+        self.ln_f = nn.LayerNorm(embed_dim, eps=config.layer_norm_epsilon)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.wte(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if inputs_embeds is not None:
+            hidden_states = inputs_embeds
+        else:
+            hidden_states = self.get_input_embeddings(input_ids)
+
+        for layer in self.h:
+            hidden_states = layer(positions, hidden_states, forward_batch)
+        hidden_states = self.ln_f(hidden_states)
+        return hidden_states
+
+
+class GPTJForCausalLM(nn.Module):
+
+    def __init__(
+        self,
+        config: GPTJConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        assert not config.tie_word_embeddings
+        self.quant_config = quant_config
+        self.transformer = GPTJModel(
+            config,
+            quant_config,
+            prefix=add_prefix("transformer", prefix),
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.n_embd,
+            bias=True,
+            quant_config=quant_config,
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        hidden_states = self.transformer(
+            input_ids, positions, forward_batch, inputs_embeds
+        )
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "attn.bias" in name or "attn.masked_bias" in name:
+                continue
+
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(name)
+            ):
+                # Loading kv cache quantization scales
+                param = params_dict[scale_name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
+                weight_loader(param, loaded_weight)
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = GPTJForCausalLM
diff --git a/sglang/python/sglang/srt/models/gpt_oss.py b/sglang/python/sglang/srt/models/gpt_oss.py
new file mode 100644
index 0000000000000000000000000000000000000000..96caaa65b57c57633872f586c2033bd92326875f
--- /dev/null
+++ b/sglang/python/sglang/srt/models/gpt_oss.py
@@ -0,0 +1,1196 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+
+"""Inference-only GptOss model compatible with HuggingFace weights."""
+
+import logging
+import math
+from collections.abc import Iterable
+from functools import partial
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.compilation.piecewise_context_manager import (
+    get_forward_context,
+    is_in_piecewise_cuda_graph,
+)
+from sglang.srt.distributed import (
+    get_moe_expert_parallel_rank,
+    get_moe_expert_parallel_world_size,
+    get_moe_tensor_parallel_rank,
+    get_moe_tensor_parallel_world_size,
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation
+from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe import get_moe_a2a_backend
+from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
+from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.moe.utils import filter_moe_weight_param_global_expert
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.quantization.fp8_utils import dequant_mxfp4
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer, get_layer_id
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.utils import (
+    create_fused_set_kv_buffer_arg,
+    enable_fused_set_kv_buffer,
+)
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import LazyValue, add_prefix, is_npu, make_layers
+from sglang.srt.utils.custom_op import register_custom_op
+
+_is_npu = is_npu()
+
+
+class GptOssConfig(PretrainedConfig):
+    model_type = "gpt_oss"
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+
+logger = logging.getLogger(__name__)
+
+
+# Aligned with HF's implementation, using sliding window inclusive with the last token
+# SGLang assumes exclusive
+def get_attention_sliding_window_size(config):
+    return config.sliding_window - 1
+
+
+class GptOssSparseMoeBlock(nn.Module):
+    def __init__(
+        self,
+        layer_id: int,
+        config: GptOssConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.layer_id = layer_id
+        self.activation = config.hidden_act
+        self.gemm1_alpha = getattr(config, "hidden_act_alpha", 1.702)
+        self.gemm1_clamp_limit = config.swiglu_limit
+
+        self.topk = TopK(
+            top_k=config.num_experts_per_tok,
+            renormalize=True,
+            layer_id=layer_id,
+        )
+
+        self.top_k = config.num_experts_per_tok
+        experts_type = get_moe_impl_class(quant_config)
+        extra_kwargs = {}
+        if experts_type.__name__ == "FusedMoE":
+            quant_config_name = (
+                quant_config.get_name() if quant_config is not None else None
+            )
+            extra_kwargs = {
+                # for moe gate_up_proj and down_proj and their bias loading
+                "use_weight_loader_fused": quant_config_name
+                != "mxfp4"
+            }
+
+        self.experts = experts_type(
+            num_experts=config.num_local_experts
+            + get_global_server_args().ep_num_redundant_experts,
+            top_k=config.num_experts_per_tok,
+            layer_id=layer_id,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            quant_config=quant_config,
+            activation=self.activation,
+            gemm1_alpha=self.gemm1_alpha,
+            gemm1_clamp_limit=self.gemm1_clamp_limit,
+            with_bias=True,
+            prefix=add_prefix("experts", prefix),
+            **extra_kwargs,
+        )
+
+        self.router = ReplicatedLinear(
+            config.hidden_size,
+            config.num_local_experts,
+            bias=True,
+            quant_config=None,
+            prefix=add_prefix("gate", prefix),
+            params_dtype=config.torch_dtype,
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: Optional[ForwardBatch] = None,
+        should_allreduce_fusion: bool = False,
+    ) -> torch.Tensor:
+        if not get_moe_a2a_backend().is_deepep():
+            return self.forward_normal(hidden_states, should_allreduce_fusion)
+        else:
+            raise Exception("forward_deepep branch not implemented yet")
+
+    def get_moe_weights(self):
+        return [
+            x.data
+            for name, x in self.experts.named_parameters()
+            if name not in ["correction_bias"]
+            and filter_moe_weight_param_global_expert(
+                name, x, self.experts.num_local_experts
+            )
+        ]
+
+    def forward_normal(
+        self,
+        hidden_states: torch.Tensor,
+        should_allreduce_fusion: bool = False,
+    ) -> torch.Tensor:
+        num_tokens, hidden_dim = hidden_states.shape
+        if is_in_piecewise_cuda_graph():
+            final_hidden_states = moe_impl(self.layer_id, hidden_states)
+        else:
+            router_logits, _ = self.router(hidden_states)
+            topk_output = self.topk(hidden_states, router_logits)
+            final_hidden_states = self.experts(hidden_states, topk_output)
+
+        if self.tp_size > 1 and not should_allreduce_fusion:
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+
+        ans = final_hidden_states.view(num_tokens, hidden_dim)
+        return ans
+
+
+@register_custom_op(out_shape="hidden_states")
+def moe_impl(layer_id: int, hidden_states: torch.Tensor) -> torch.Tensor:
+    forward_context = get_forward_context()
+    moe_fusion = forward_context.moe_fusions[layer_id]
+    router_logits, _ = moe_fusion.router(hidden_states)
+    topk_output = moe_fusion.topk(hidden_states, router_logits)
+    final_hidden_states = moe_fusion.experts(hidden_states, topk_output)
+    return final_hidden_states
+
+
+class GptOssAttention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        head_dim: Optional[int] = None,
+        rms_norm_eps: float = 1e-06,
+        attention_bias: bool = False,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        sliding_window_size: int = -1,  # if -1, normal attention, else, window attention.
+        layer_type: str = "",
+        params_dtype: torch.dtype = torch.bfloat16,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.sliding_window_size = sliding_window_size
+
+        attn_tp_rank = get_attention_tp_rank()
+        attn_tp_size = get_attention_tp_size()
+
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % attn_tp_size == 0
+        self.num_heads = self.total_num_heads // attn_tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= attn_tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % attn_tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert attn_tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // attn_tp_size)
+        self.head_dim = head_dim or hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+        self.tp_rank = get_tensor_model_parallel_rank()
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=attention_bias,
+            params_dtype=params_dtype,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+
+        # Choose dtype of sinks based on attention backend: trtllm_mha requires float32,
+        # others can use bfloat16
+        attn_backend = get_global_server_args().attention_backend
+        sinks_dtype = torch.float32 if attn_backend == "trtllm_mha" else torch.bfloat16
+        self.sinks = nn.Parameter(
+            torch.empty(self.num_heads, dtype=sinks_dtype), requires_grad=False
+        )
+
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=attention_bias,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            reduce_results=False,
+            params_dtype=params_dtype,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
+
+        assert layer_type in {"sliding_attention", "full_attention"}
+        use_sliding_window = layer_type == "sliding_attention"
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            prefix=add_prefix("attn", prefix),
+            sliding_window_size=(sliding_window_size if use_sliding_window else -1),
+        )
+        self.layer_id = layer_id
+
+    def forward_prepare(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ):
+        if hidden_states.shape[0] == 0:
+            return hidden_states, forward_batch, None
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+
+        extra_args = {}
+        if not _is_npu:
+            extra_args = {
+                "fused_set_kv_buffer_arg": (
+                    create_fused_set_kv_buffer_arg(
+                        value=v,
+                        layer=self.attn,
+                        forward_batch=forward_batch,
+                    )
+                    if enable_fused_set_kv_buffer(forward_batch)
+                    else None
+                ),
+            }
+        q, k = self.rotary_emb(positions, q, k, **extra_args)
+        inner_state = q, k, v, forward_batch
+        return None, forward_batch, inner_state
+
+    def forward_core(self, intermediate_state):
+        hidden_states, forward_batch, inner_state = intermediate_state
+        if inner_state is None:
+            return hidden_states
+        attn_output = self.attn(
+            *inner_state,
+            sinks=self.sinks,
+            save_kv_cache=not enable_fused_set_kv_buffer(forward_batch),
+        )
+        output, _ = self.o_proj(attn_output)
+        return output
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        s = self.forward_prepare(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        return self.forward_core(s)
+
+
+class GptOssDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: GptOssConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        sliding_window_size: int | None = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        head_dim = getattr(
+            config, "head_dim", config.hidden_size // config.num_attention_heads
+        )
+        rms_norm_eps = config.rms_norm_eps
+        attention_bias = config.attention_bias
+
+        if sliding_window_size is None:
+            self.sliding_window_size = get_attention_sliding_window_size(self.config)
+        else:
+            self.sliding_window_size = sliding_window_size
+
+        self.self_attn = GptOssAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            head_dim=head_dim,
+            rms_norm_eps=rms_norm_eps,
+            attention_bias=attention_bias,
+            prefix=add_prefix("self_attn", prefix),
+            sliding_window_size=self.sliding_window_size,
+            layer_type=config.layer_types[layer_id],
+            params_dtype=config.torch_dtype,
+        )
+
+        self.layer_id = layer_id
+
+        self.attn_tp_size = get_attention_tp_size()
+        self.attn_tp_rank = get_attention_tp_rank()
+
+        # GptOss all layers are sparse and have no nextn now
+        self.is_layer_sparse = True
+        self.is_nextn = False
+        is_previous_layer_sparse = True
+        is_next_layer_sparse = True
+
+        self.layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=layer_id,
+            num_layers=config.num_hidden_layers,
+            is_layer_sparse=self.is_layer_sparse,
+            is_previous_layer_sparse=is_previous_layer_sparse,
+            is_next_layer_sparse=is_next_layer_sparse,
+        )
+
+        if self.is_layer_sparse:
+            self.mlp = GptOssSparseMoeBlock(
+                layer_id=self.layer_id,
+                config=config,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+            )
+        else:
+            raise NotImplementedError(
+                "Dense MLP is not implemented for GptOssDecoderLayer. "
+                "Please use GptOssSparseMoeBlock instead."
+            )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+        self.layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.layer_scatter_modes,
+            input_layernorm=self.input_layernorm,
+            post_attention_layernorm=self.post_attention_layernorm,
+            is_last_layer=(
+                self.is_nextn or (self.layer_id == self.config.num_hidden_layers - 1)
+            ),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        hidden_states, residual = self.layer_communicator.prepare_attn(
+            hidden_states, residual, forward_batch
+        )
+
+        if hidden_states.shape[0] != 0:
+            hidden_states = self.self_attn(
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+            )
+
+        hidden_states, residual = self.layer_communicator.prepare_mlp(
+            hidden_states, residual, forward_batch
+        )
+
+        should_allreduce_fusion = (
+            self.layer_communicator.should_fuse_mlp_allreduce_with_next_layer(
+                forward_batch
+            )
+        )
+
+        hidden_states = self.mlp(hidden_states, forward_batch, should_allreduce_fusion)
+
+        if should_allreduce_fusion:
+            hidden_states._sglang_needs_allreduce_fusion = True
+
+        if not should_allreduce_fusion:
+            hidden_states, residual = self.layer_communicator.postprocess_layer(
+                hidden_states, residual, forward_batch
+            )
+
+        return hidden_states, residual
+
+
+class GptOssModel(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        decoder_layer_type: type[nn.Module] = GptOssDecoderLayer,
+    ) -> None:
+        super().__init__()
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.pp_group = get_pp_group()
+
+        if _is_npu:
+            config.hidden_act = "npu_swiglu_oai"
+
+        if self.pp_group.is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                use_attn_tp_group=is_dp_attention_enabled(),
+                prefix=add_prefix("embed_tokens", prefix),
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        # Use the provided decoder layer type or default to GptOssDecoderLayer
+        decoder_layer_type = decoder_layer_type or GptOssDecoderLayer
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: decoder_layer_type(
+                layer_id=idx,
+                config=config,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix=add_prefix("layers", prefix),
+        )
+        if self.pp_group.is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer(return_tuple=True)
+
+        self.layers_to_capture = []
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[torch.Tensor, PPProxyTensors]:
+        if self.pp_group.is_first_rank:
+            if input_embeds is None:
+                hidden_states = self.embed_tokens(input_ids)
+            else:
+                hidden_states = input_embeds
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+
+        aux_hidden_states = []
+        for i in range(self.start_layer, self.end_layer):
+            with get_global_expert_distribution_recorder().with_current_layer(i):
+                if i in self.layers_to_capture:
+                    aux_hidden_states.append(hidden_states + residual)
+                layer = self.layers[i]
+                hidden_states, residual = layer(
+                    positions, hidden_states, forward_batch, residual
+                )
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors(
+                {
+                    "hidden_states": hidden_states,
+                    "residual": residual,
+                }
+            )
+        else:
+            if hidden_states.shape[0] != 0:
+                if residual is None:
+                    hidden_states = self.norm(hidden_states)
+                else:
+                    hidden_states, _ = self.norm(hidden_states, residual)
+        if len(aux_hidden_states) == 0:
+            return hidden_states
+
+        return hidden_states, aux_hidden_states
+
+
+class GptOssForCausalLM(nn.Module):
+    fall_back_to_pt_during_load = False
+
+    def __init__(
+        self,
+        config: GptOssConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = GptOssModel(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            # quant_config=quant_config,
+            prefix=add_prefix("lm_head", prefix),
+            use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
+        )
+        self.logits_processor = LogitsProcessor(config)
+        self.capture_aux_hidden_states = False
+
+        self._routed_experts_weights_of_layer = LazyValue(
+            lambda: {
+                layer_id: self.model.layers[layer_id].mlp.get_moe_weights()
+                for layer_id in range(self.start_layer, self.end_layer)
+                if isinstance(self.model.layers[layer_id].mlp, GptOssSparseMoeBlock)
+            }
+        )
+
+    @property
+    def routed_experts_weights_of_layer(self):
+        return self._routed_experts_weights_of_layer.value
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            input_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+
+        aux_hidden_states = None
+        if self.capture_aux_hidden_states:
+            hidden_states, aux_hidden_states = hidden_states
+
+        if self.pp_group.is_last_rank:
+            return self.logits_processor(
+                input_ids,
+                hidden_states,
+                self.lm_head,
+                forward_batch,
+                aux_hidden_states,
+            )
+        else:
+            return hidden_states
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer
+
+    def _get_default_weight_mapping(self):
+        """Generate default weight name mapping for GptOss safetensors."""
+        weight_mapping = {}
+
+        # Map router weights to gate
+        weight_mapping["embedding.weight"] = "model.embed_tokens.weight"
+        weight_mapping["unembedding.weight"] = "lm_head.weight"
+        weight_mapping["norm.scale"] = "model.norm.weight"
+        for layer_id in range(self.config.num_hidden_layers):
+            weight_mapping[f"block.{layer_id}.attn.q_proj.weight"] = (
+                f"model.layers.{layer_id}.self_attn.q_proj.weight"
+            )
+            weight_mapping[f"block.{layer_id}.attn.q_proj.bias"] = (
+                f"model.layers.{layer_id}.self_attn.q_proj.bias"
+            )
+
+            weight_mapping[f"block.{layer_id}.attn.k_proj.weight"] = (
+                f"model.layers.{layer_id}.self_attn.k_proj.weight"
+            )
+            weight_mapping[f"block.{layer_id}.attn.k_proj.bias"] = (
+                f"model.layers.{layer_id}.self_attn.k_proj.bias"
+            )
+
+            weight_mapping[f"block.{layer_id}.attn.v_proj.weight"] = (
+                f"model.layers.{layer_id}.self_attn.v_proj.weight"
+            )
+            weight_mapping[f"block.{layer_id}.attn.v_proj.bias"] = (
+                f"model.layers.{layer_id}.self_attn.v_proj.bias"
+            )
+
+            weight_mapping[f"block.{layer_id}.attn.out.weight"] = (
+                f"model.layers.{layer_id}.self_attn.o_proj.weight"
+            )
+            weight_mapping[f"block.{layer_id}.attn.out.bias"] = (
+                f"model.layers.{layer_id}.self_attn.o_proj.bias"
+            )
+            weight_mapping[f"block.{layer_id}.attn.sinks"] = (
+                f"model.layers.{layer_id}.self_attn.sinks"
+            )
+            weight_mapping[f"block.{layer_id}.attn.norm.scale"] = (
+                f"model.layers.{layer_id}.input_layernorm.weight"
+            )
+
+            weight_mapping[f"block.{layer_id}.mlp.gate.weight"] = (
+                f"model.layers.{layer_id}.mlp.router.weight"
+            )
+            weight_mapping[f"block.{layer_id}.mlp.gate.bias"] = (
+                f"model.layers.{layer_id}.mlp.router.bias"
+            )
+            weight_mapping[f"block.{layer_id}.mlp.norm.scale"] = (
+                f"model.layers.{layer_id}.post_attention_layernorm.weight"
+            )
+            weight_mapping[f"block.{layer_id}.mlp.experts.gate_up_proj"] = (
+                f"model.layers.{layer_id}.mlp.experts.gate_up_proj"
+            )
+            weight_mapping[f"block.{layer_id}.mlp.gate_up_proj_bias"] = (
+                f"model.layers.{layer_id}.mlp.experts.gate_up_proj_bias"
+            )
+            weight_mapping[f"block.{layer_id}.mlp.down_proj"] = (
+                f"model.layers.{layer_id}.mlp.experts.mlp2_weight"
+            )
+            weight_mapping[f"block.{layer_id}.mlp.down_proj_bias"] = (
+                f"model.layers.{layer_id}.mlp.experts.mlp2_bias"
+            )
+
+        return weight_mapping
+
+    # TODO beautify code
+    def load_weights(
+        self,
+        weights: Iterable[Tuple[str, torch.Tensor]],
+        is_nextn: bool = False,
+        weight_name_mapping: dict = None,
+    ):
+        quant_config_name = (
+            self.quant_config.get_name() if self.quant_config is not None else None
+        )
+        if quant_config_name != "mxfp4":
+            self._load_normal_weights(
+                weights, is_nextn=is_nextn, weight_name_mapping=weight_name_mapping
+            )
+        else:
+            self._load_weights_mxfp4(
+                weights, is_nextn=is_nextn, weight_name_mapping=weight_name_mapping
+            )
+
+    def _load_weights_mxfp4(self, weights, is_nextn, weight_name_mapping):
+        mxfp4_weights = []
+        normal_weights = []
+
+        for name, weight in weights:
+            if (
+                ".experts" in name
+                and self.quant_config is not None
+                and self.quant_config.get_name() == "mxfp4"
+            ):
+                mxfp4_weights.append((name, weight))
+            else:
+                normal_weights.append((name, weight))
+
+        mxfp4_loaded_params = self._load_mxfp4_experts_weights(mxfp4_weights)
+        self._load_normal_weights(
+            normal_weights,
+            is_nextn=is_nextn,
+            weight_name_mapping=weight_name_mapping,
+            other_loaded_param_names=mxfp4_loaded_params,
+        )
+
+    def _load_mxfp4_experts_weights(self, weights):
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        mxfp4_block = 32
+
+        moe_tp_rank = get_moe_tensor_parallel_rank()
+        moe_tp_size = get_moe_tensor_parallel_world_size()
+        moe_ep_rank = get_moe_expert_parallel_rank()
+        moe_ep_size = get_moe_expert_parallel_world_size()
+
+        intermediate_size = self.config.intermediate_size
+        assert (
+            intermediate_size % mxfp4_block == 0
+        ), f"{intermediate_size=} must be divisible by {mxfp4_block=}"
+        intermediate_size_block = intermediate_size // mxfp4_block
+
+        per_rank_intermediate_size_block = math.ceil(
+            intermediate_size_block / moe_tp_size
+        )
+
+        per_rank_intermediate_size = per_rank_intermediate_size_block * mxfp4_block
+
+        # Calculate common slicing bounds for current rank
+        assert self.config.num_local_experts % moe_ep_size == 0
+        moe_num_global_experts = self.config.num_local_experts
+        moe_num_local_experts = self.config.num_local_experts // moe_ep_size
+
+        moe_tp_rank_start = moe_tp_rank * per_rank_intermediate_size
+        moe_tp_rank_end = min(
+            (moe_tp_rank + 1) * per_rank_intermediate_size, intermediate_size
+        )
+
+        moe_ep_rank_start = moe_ep_rank * moe_num_local_experts
+        moe_ep_rank_end = (moe_ep_rank + 1) * moe_num_local_experts
+
+        for name, weight in weights:
+            weight = weight.cuda()
+
+            if "gate_up_proj_blocks" in name:
+                # Handle MLP gate and up projection weights
+                new_name = name.replace("gate_up_proj_blocks", "w13_weight")
+
+                # flat weight from (E, 2 * N, block_size, entry_per_block)
+                # to (E, 2 * N, -1), shouldn't trigger copy for contiguous
+                weight = weight.view(
+                    moe_num_global_experts, 2 * intermediate_size, -1
+                ).contiguous()
+
+                narrow_weight = weight[
+                    moe_ep_rank_start:moe_ep_rank_end,
+                    2 * moe_tp_rank_start : 2 * moe_tp_rank_end,
+                    ...,
+                ]
+
+                param = params_dict[new_name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(
+                    param,
+                    narrow_weight,
+                    weight_name=new_name,
+                    shard_id=None,
+                    expert_id=None,
+                )
+                loaded_params.add(new_name)
+
+            elif "down_proj_blocks" in name:
+                # Handle MLP down projection weights
+                new_name = name.replace("down_proj_blocks", "w2_weight")
+                # same flatten here, but since 2 mx4 value are packed in 1
+                # uint8, divide by 2
+                weight = weight.view(
+                    moe_num_global_experts, -1, intermediate_size // 2
+                ).contiguous()
+                narrow_weight = weight[
+                    moe_ep_rank_start:moe_ep_rank_end,
+                    ...,
+                    moe_tp_rank_start // 2 : moe_tp_rank_end // 2,
+                ]
+
+                param = params_dict[new_name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(
+                    param,
+                    narrow_weight,
+                    weight_name=new_name,
+                    shard_id=None,
+                    expert_id=None,
+                )
+                loaded_params.add(new_name)
+
+            elif "gate_up_proj_scales" in name:
+                # Handle MLP gate and up projection weights scale
+                new_name = name.replace("gate_up_proj_scales", "w13_weight_scale")
+                narrow_weight = weight[
+                    moe_ep_rank_start:moe_ep_rank_end,
+                    2 * moe_tp_rank_start : 2 * moe_tp_rank_end,
+                    ...,
+                ]
+
+                param = params_dict[new_name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(
+                    param,
+                    narrow_weight,
+                    weight_name=new_name,
+                    shard_id=None,
+                    expert_id=None,
+                )
+                loaded_params.add(new_name)
+
+            elif "down_proj_scales" in name:
+                # Handle MLP down projection weights
+                new_name = name.replace("down_proj_scales", "w2_weight_scale")
+                narrow_weight = weight[
+                    moe_ep_rank_start:moe_ep_rank_end,
+                    ...,
+                    moe_tp_rank_start // mxfp4_block : moe_tp_rank_end // mxfp4_block,
+                ]
+
+                param = params_dict[new_name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(
+                    param,
+                    narrow_weight,
+                    weight_name=new_name,
+                    shard_id=None,
+                    expert_id=None,
+                )
+                loaded_params.add(new_name)
+            elif "gate_up_proj_bias" in name:
+                # Handle MLP gate and up projection biases
+                new_name = name.replace("gate_up_proj_bias", "w13_weight_bias")
+
+                narrow_weight = weight[
+                    moe_ep_rank_start:moe_ep_rank_end,
+                    2 * moe_tp_rank_start : 2 * moe_tp_rank_end,
+                ]
+
+                param = params_dict[new_name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(
+                    param,
+                    narrow_weight,
+                    weight_name=new_name,
+                    shard_id=None,
+                    expert_id=None,
+                )
+                loaded_params.add(new_name)
+
+            elif "down_proj_bias" in name:
+                narrow_weight = weight[moe_ep_rank_start:moe_ep_rank_end, ...]
+                if moe_tp_rank != 0:
+                    narrow_weight = torch.zeros_like(narrow_weight)
+
+                # Handle MLP down projection bias
+                new_name = name.replace("down_proj_bias", "w2_weight_bias")
+                param = params_dict[new_name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(
+                    param,
+                    narrow_weight,
+                    weight_name=new_name,
+                    shard_id=None,
+                    expert_id=None,
+                )
+                loaded_params.add(new_name)
+
+        return loaded_params
+
+    def _load_normal_weights(
+        self,
+        weights,
+        is_nextn: bool,
+        weight_name_mapping: dict,
+        other_loaded_param_names=[],
+    ):
+        tp_rank = get_tensor_model_parallel_rank()
+        if is_nextn:
+            logging.warning(
+                "Loading weights for nextn is currently not supported in GptOssForCausalLM. "
+            )
+            return
+        weights = _canonicalize_weights(self.config, weights)
+        weights = sorted(weights, key=lambda x: x[0])  # Sort by name for consistency
+
+        new_weights = []
+        for name, p in weights:
+            if "qkv.weight" in name:
+                q_proj, k_proj, v_proj = p.split(
+                    [
+                        self.config.num_attention_heads * self.config.head_dim,
+                        self.config.num_key_value_heads * self.config.head_dim,
+                        self.config.num_key_value_heads * self.config.head_dim,
+                    ],
+                    dim=0,
+                )
+                new_weights.append(
+                    (f"{name.replace('qkv.weight', 'q_proj.weight')}", q_proj)
+                )
+                new_weights.append(
+                    (f"{name.replace('qkv.weight', 'k_proj.weight')}", k_proj)
+                )
+                new_weights.append(
+                    (f"{name.replace('qkv.weight', 'v_proj.weight')}", v_proj)
+                )
+            elif "qkv.bias" in name:
+                q_bias, k_bias, v_bias = p.split(
+                    [
+                        self.config.num_attention_heads * self.config.head_dim,
+                        self.config.num_key_value_heads * self.config.head_dim,
+                        self.config.num_key_value_heads * self.config.head_dim,
+                    ],
+                    dim=0,
+                )
+                new_weights.append(
+                    (f"{name.replace('qkv.bias', 'q_proj.bias')}", q_bias)
+                )
+                new_weights.append(
+                    (f"{name.replace('qkv.bias', 'k_proj.bias')}", k_bias)
+                )
+                new_weights.append(
+                    (f"{name.replace('qkv.bias', 'v_proj.bias')}", v_bias)
+                )
+            else:
+                new_weights.append((name, p))
+        weights = new_weights
+
+        # Use provided weight name mapping if available, otherwise use default
+        if weight_name_mapping is None:
+            weight_name_mapping = self._get_default_weight_mapping()
+        else:
+            # Merge with default mapping
+            default_mapping = self._get_default_weight_mapping()
+            default_mapping.update(weight_name_mapping)
+            weight_name_mapping = default_mapping
+
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+        expert_params_mapping = FusedMoE.make_expert_params_mapping_fused(
+            ckpt_gate_up_proj_name="gate_up_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_gate_up_proj_bias_name="gate_up_proj_bias",
+            ckpt_down_proj_bias_name="down_proj_bias",
+        )
+
+        params_dict = dict(self.named_parameters())
+
+        for name, loaded_weight in weights:
+            loaded_weight = _WeightCreator.maybe_materialize(loaded_weight)
+
+            # Apply weight name mapping if provided
+            if weight_name_mapping and name in weight_name_mapping:
+                name = weight_name_mapping[name]
+
+            layer_id = get_layer_id(name)
+            if (
+                layer_id is not None
+                and hasattr(self.model, "start_layer")
+                and (
+                    layer_id < self.model.start_layer
+                    or layer_id >= self.model.end_layer
+                )
+            ):
+                continue
+
+            if "rotary_emb.inv_freq" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                if "mlp.experts" in name:
+                    continue
+
+                name = name.replace(weight_name, param_name)
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    if name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    if "bias" not in name:
+                        loaded_weight = loaded_weight.transpose(-2, -1)
+                    if "w2_weight_bias" in name and get_moe_tensor_parallel_rank() != 0:
+                        loaded_weight = loaded_weight.zero_()
+
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                    )
+                    break
+                else:
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    if name not in params_dict:
+                        continue
+                    if name in params_dict.keys():
+                        param = params_dict[name]
+                        if "sinks" in name:
+                            start = get_attention_tp_rank() * param.numel()
+                            param.data.copy_(
+                                loaded_weight[start : start + param.numel()]
+                            )
+                        else:
+                            weight_loader = getattr(
+                                param, "weight_loader", default_weight_loader
+                            )
+                            weight_loader(param, loaded_weight)
+                    else:
+                        logger.warning(f"Parameter {name} not found in params_dict")
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        del self.lm_head.weight
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    def set_eagle3_layers_to_capture(self, layer_ids: Optional[List[int]] = None):
+        if not self.pp_group.is_last_rank:
+            return
+
+        if layer_ids is None:
+            self.capture_aux_hidden_states = True
+            num_layers = self.config.num_hidden_layers
+            self.model.layers_to_capture = [2, num_layers // 2, num_layers - 3]
+        else:
+            self.capture_aux_hidden_states = True
+            # we plus 1 here because in sglang, for the ith layer, it takes the output
+            # of the (i-1)th layer as aux hidden state
+            self.model.layers_to_capture = [val + 1 for val in layer_ids]
+
+    @classmethod
+    def get_model_config_for_expert_location(cls, config):
+        return ModelConfigForExpertLocation(
+            num_layers=config.num_hidden_layers,
+            num_logical_experts=config.num_local_experts,
+            num_groups=None,
+        )
+
+    def get_attention_sliding_window_size(self):
+        return get_attention_sliding_window_size(self.config)
+
+
+def _canonicalize_weights(config, weights_in: Iterable[Tuple[str, torch.Tensor]]):
+    weights_out_dict = dict(weights_in)
+
+    for layer_id in range(config.num_hidden_layers):
+        for name_chunk in ["mlp1_weight", "mlp2_weight"]:
+            name_prefix = f"block.{layer_id}.mlp.{name_chunk}"
+            w_blocks = weights_out_dict.pop(f"{name_prefix}.blocks", None)
+            w_scales = weights_out_dict.pop(f"{name_prefix}.scales", None)
+            if w_blocks is not None:
+                weights_out_dict[name_prefix] = _WeightCreator(
+                    partial(
+                        _dequant_mlp_weight,
+                        debug_name=name_prefix,
+                        w_blocks=w_blocks,
+                        w_scales=w_scales,
+                    )
+                )
+
+    return list(weights_out_dict.items())
+
+
+def _dequant_mlp_weight(debug_name, w_blocks, w_scales):
+    if get_tensor_model_parallel_rank() == 0:
+        logger.info(f"Dequantize {debug_name} start")
+
+    original_device = w_blocks.device
+
+    w_blocks = w_blocks.cuda()
+    w_scales = w_scales.cuda()
+
+    w_bf16 = dequant_mxfp4(w_block=w_blocks, w_scale=w_scales, out_dtype=torch.bfloat16)
+    w_bf16 = w_bf16.transpose(-2, -1).contiguous()
+
+    if get_tensor_model_parallel_rank() == 0:
+        logger.info(
+            f"Dequantize {debug_name} end {w_blocks.shape=} {w_scales.shape=} {w_bf16.shape=}"
+        )
+
+    return w_bf16.to(original_device)
+
+
+class _WeightCreator:
+    def __init__(self, fn):
+        self._fn = fn
+
+    @staticmethod
+    def maybe_materialize(obj):
+        if isinstance(obj, _WeightCreator):
+            output = obj._fn()
+            obj._fn = None
+            return output
+
+        return obj
+
+
+EntryClass = GptOssForCausalLM
diff --git a/sglang/python/sglang/srt/models/granite.py b/sglang/python/sglang/srt/models/granite.py
new file mode 100644
index 0000000000000000000000000000000000000000..19252dc8db62eb31a6d188ad88c035339dbfda16
--- /dev/null
+++ b/sglang/python/sglang/srt/models/granite.py
@@ -0,0 +1,505 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/llama.py#L1
+"""Inference-only Granite model compatible with HuggingFace weights."""
+
+import logging
+from typing import Any, Dict, Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import GraniteConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix
+from sglang.utils import get_exception_traceback
+
+logger = logging.getLogger(__name__)
+
+
+class GraniteMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class GraniteAttention(nn.Module):
+    def __init__(
+        self,
+        config: GraniteConfig,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        rope_is_neox_style: bool = True,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        # MistralConfig has an optional head_dim introduced by Mistral-Nemo
+        self.head_dim = getattr(
+            config, "head_dim", self.hidden_size // self.total_num_heads
+        )
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = config.attention_multiplier
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            is_neox_style=rope_is_neox_style,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class GraniteDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: GraniteConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.residual_multiplier = config.residual_multiplier
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        if rope_scaling is not None and getattr(
+            config, "original_max_position_embeddings", None
+        ):
+            rope_scaling["original_max_position_embeddings"] = (
+                config.original_max_position_embeddings
+            )
+        rope_is_neox_style = getattr(config, "rope_is_neox_style", True)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        self.self_attn = GraniteAttention(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            rope_is_neox_style=rope_is_neox_style,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.mlp = GraniteMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = (
+            self.self_attn(
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+            )
+            * self.residual_multiplier
+        )  # multiplier for Maximal Update Parameterization
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states) * self.residual_multiplier
+        return hidden_states, residual
+
+
+class GraniteModel(nn.Module):
+    def __init__(
+        self,
+        config: GraniteConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size, config.hidden_size
+        )
+        self.layers = nn.ModuleList(
+            [
+                GraniteDecoderLayer(
+                    config,
+                    i,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{i}", prefix),
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+        residual = None
+        hidden_states *= self.config.embedding_multiplier
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class GraniteForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config: GraniteConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = GraniteModel(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        # If tie_word_embeddings == True, then input and output embeddings are
+        # the same tensor. Enforce during object creation so that weights will
+        # load correctly even if the LM head weights don't have a separate entry
+        # in the state dict.
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("lm_head", prefix),
+        )
+        if self.config.tie_word_embeddings:
+            self.lm_head.tie_weights(self.model.embed_tokens)
+
+        # Granite logit scaling factors are applied via division, but
+        # LogitsProcessor expects a multiplicative factor.
+        if hasattr(config, "logits_scaling"):
+            logit_scale = 1.0 / config.logits_scaling
+        else:
+            logit_scale = None
+        self.logits_processor = LogitsProcessor(config, logit_scale=logit_scale)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+        self.stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = False,
+    ) -> LogitsProcessorOutput:
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        if not get_embedding:
+            logits_processor_output: LogitsProcessorOutput = self.logits_processor(
+                input_ids, hidden_states, self.lm_head, forward_batch
+            )
+            return logits_processor_output
+        else:
+            return self.pooler(hidden_states, forward_batch)
+
+    def get_module_name_from_weight_name(self, name):
+        for param_name, weight_name, shard_id, num_shard in self.stacked_params_mapping:
+            if weight_name in name:
+                return (
+                    name.replace(weight_name, param_name)[: -len(".weight")],
+                    num_shard,
+                )
+        return name[: -len(".weight")], 1
+
+    def get_num_params(self):
+        params_dict = dict(self.named_parameters())
+        return len(params_dict)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name or "projector" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            if name.startswith("model.vision_tower") and name not in params_dict:
+                continue
+            if "lm_head.weight" in name and self.config.tie_word_embeddings:
+                # Input and output embeddings are tied, so the output embeddings
+                # may not be present in the checkpoint. We assume that the input
+                # embeddings are always present in the checkpoint.
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # This block only runs if the preceding for loop doesn't find
+                # a match for `name` in `stacked_params_mapping`.
+
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Skip loading kv_scale from ckpts towards new design.
+                if name.endswith(".kv_scale") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+    def get_weights_by_name(
+        self, name: str, truncate_size: int = 100, tp_size: int = 1
+    ) -> Optional[torch.Tensor]:
+        """Get the weights of the parameter by its name. Similar to `get_parameter` in Hugging Face.
+
+        Only used for unit test with an unoptimized performance.
+        For optimized performance, please use torch.save and torch.load.
+        """
+        try:
+            if name == "lm_head.weight" and self.config.tie_word_embeddings:
+                logger.info(
+                    "word embedding is tied for this model, return embed_tokens.weight as lm_head.weight."
+                )
+                return (
+                    self.model.embed_tokens.weight.cpu()
+                    .to(torch.float32)
+                    .numpy()
+                    .tolist()[:truncate_size]
+                )
+
+            mapped_name = name
+            mapped_shard_id = None
+            for param_name, weight_name, shard_id in self.stacked_params_mapping:
+                if weight_name in name:
+                    mapped_name = name.replace(weight_name, param_name)
+                    mapped_shard_id = shard_id
+                    break
+            params_dict = dict(self.named_parameters())
+            param = params_dict[mapped_name]
+            if mapped_shard_id is not None:
+                if mapped_shard_id in ["q", "k", "v"]:
+                    num_heads = self.config.num_attention_heads // tp_size
+                    num_kv_heads = self.config.num_key_value_heads // tp_size
+                    head_dim = (
+                        self.config.hidden_size // self.config.num_attention_heads
+                    )
+                    if mapped_shard_id == "q":
+                        offset = 0
+                        size = num_heads * head_dim
+                    elif mapped_shard_id == "k":
+                        offset = num_heads * head_dim
+                        size = num_kv_heads * head_dim
+                    elif mapped_shard_id == "v":
+                        offset = (num_heads + num_kv_heads) * head_dim
+                        size = num_kv_heads * head_dim
+                    weight = param.data.narrow(0, offset, size)
+                elif mapped_shard_id in [0, 1]:
+                    intermediate_size = self.config.intermediate_size
+                    slice_size = intermediate_size // tp_size
+                    if mapped_shard_id == 0:  # gate_proj
+                        offset = 0
+                        size = slice_size
+                    elif mapped_shard_id == 1:  # up_proj
+                        offset = slice_size
+                        size = slice_size
+
+                    weight = param.data.narrow(0, offset, size)
+                else:
+                    weight = param.data
+            else:
+                weight = param.data
+            if tp_size > 1 and ("o_proj" in name or "down_proj" in name):
+                gathered_weights = [torch.zeros_like(weight) for _ in range(tp_size)]
+                torch.distributed.all_gather(gathered_weights, weight)
+                weight = torch.cat(gathered_weights, dim=1)
+            return weight.cpu().to(torch.float32).numpy().tolist()[:truncate_size]
+
+        except Exception:
+            logger.error(
+                f"Error getting weights by name {name} in GraniteForCausalLM: {get_exception_traceback()}"
+            )
+            return None
+
+
+EntryClass = [GraniteForCausalLM]
diff --git a/sglang/python/sglang/srt/models/granitemoe.py b/sglang/python/sglang/srt/models/granitemoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..d65b9ec06d312aed38fa602579d3e7b4d34ffe26
--- /dev/null
+++ b/sglang/python/sglang/srt/models/granitemoe.py
@@ -0,0 +1,387 @@
+"""Inference-only GraniteMoe model."""
+
+from typing import Iterable, Optional
+
+import torch
+from torch import nn
+from transformers import GraniteConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models import mixtral
+from sglang.srt.utils import add_prefix
+
+
+class GraniteMoeMoE(nn.Module):
+    """A tensor-parallel MoE implementation for GraniteMoe that shards each
+    expert across all ranks.
+    Each expert's weights are sharded across all ranks and a fused MoE
+    kernel is used for the forward pass, and finally we reduce the outputs
+    across ranks.
+    """
+
+    def __init__(
+        self,
+        num_experts: int,
+        top_k: int,
+        hidden_size: int,
+        intermediate_size: int,
+        layer_id: int,
+        params_dtype: Optional[torch.dtype] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        tp_size: Optional[int] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = hidden_size
+
+        # Gate always runs at half / full precision for now.
+        self.gate = ReplicatedLinear(
+            hidden_size,
+            num_experts,
+            bias=False,
+            params_dtype=params_dtype,
+            quant_config=None,
+            prefix=f"{prefix}.gate",
+        )
+
+        self.topk = TopK(
+            top_k=top_k,
+            renormalize=True,
+        )
+
+        self.experts = FusedMoE(
+            num_experts=num_experts,
+            top_k=top_k,
+            hidden_size=hidden_size,
+            intermediate_size=intermediate_size,
+            layer_id=layer_id,
+            params_dtype=params_dtype,
+            reduce_results=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.experts",
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # NOTE: hidden_states can have either 1D or 2D shape.
+        orig_shape = hidden_states.shape
+        hidden_states = hidden_states.view(-1, self.hidden_size)
+        router_logits, _ = self.gate(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        final_hidden_states = self.experts(hidden_states, topk_output)
+        return final_hidden_states.view(orig_shape)
+
+
+class GraniteMoeAttention(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        max_position: int = 4096 * 32,
+        layer_id: int = 0,
+        rope_theta: float = 10000,
+        quant_config: Optional[QuantizationConfig] = None,
+        attention_multiplier: Optional[float] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = (
+            attention_multiplier
+            if attention_multiplier is not None
+            else self.head_dim**-1
+        )
+        self.rope_theta = rope_theta
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position,
+            base=int(self.rope_theta),
+            is_neox_style=True,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class GraniteMoeDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: GraniteConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        self.self_attn = GraniteMoeAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            max_position=config.max_position_embeddings,
+            num_kv_heads=config.num_key_value_heads,
+            rope_theta=rope_theta,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
+            attention_multiplier=config.attention_multiplier,
+        )
+        self.block_sparse_moe = GraniteMoeMoE(
+            num_experts=config.num_local_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=f"{prefix}.block_sparse_moe",
+        )
+
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+        self.residual_multiplier = config.residual_multiplier
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        hidden_states = residual + hidden_states * self.residual_multiplier
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.block_sparse_moe(hidden_states)
+        hidden_states = residual + hidden_states * self.residual_multiplier
+
+        return hidden_states
+
+
+class GraniteMoeModel(nn.Module):
+
+    def __init__(
+        self,
+        config: GraniteConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+        )
+        self.embedding_multiplier = config.embedding_multiplier
+
+        self.layers = nn.ModuleList(
+            [
+                GraniteMoeDecoderLayer(
+                    config,
+                    i,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{i}", prefix),
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if inputs_embeds is not None:
+            hidden_states = inputs_embeds
+        else:
+            hidden_states = self.get_input_embeddings(input_ids)
+        hidden_states *= self.embedding_multiplier
+
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+            )
+        hidden_states = self.norm(hidden_states)
+        return hidden_states
+
+
+class GraniteMoeForCausalLM(nn.Module):
+
+    def __init__(
+        self,
+        config: GraniteConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+
+        self.model = GraniteMoeModel(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("lm_head", prefix),
+        )
+        if config.tie_word_embeddings:
+            self.lm_head.weight = self.model.embed_tokens.weight
+        # Granite logit scaling factors are applied via division, but
+        # LogitsProcessor expects a multiplicative factor.
+        if hasattr(config, "logits_scaling"):
+            logit_scale = 1.0 / config.logits_scaling
+        else:
+            logit_scale = None
+        self.logits_processor = LogitsProcessor(config, logit_scale=logit_scale)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = False,
+    ) -> LogitsProcessorOutput:
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        if not get_embedding:
+            logits_processor_output: LogitsProcessorOutput = self.logits_processor(
+                input_ids, hidden_states, self.lm_head, forward_batch
+            )
+            return logits_processor_output
+        else:
+            return self.pooler(hidden_states, forward_batch)
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        new_weights = {}
+        for n, p in weights:
+            if n.endswith(".block_sparse_moe.input_linear.weight"):
+                for e in range(p.size(0)):
+                    w1_name = n.replace(
+                        ".block_sparse_moe.input_linear.weight",
+                        f".block_sparse_moe.experts.{e}.w1.weight",
+                    )
+                    w3_name = n.replace(
+                        ".block_sparse_moe.input_linear.weight",
+                        f".block_sparse_moe.experts.{e}.w3.weight",
+                    )
+                    w1_param, w3_param = p[e].chunk(2, dim=0)
+                    assert w1_name not in new_weights
+                    assert w3_name not in new_weights
+                    new_weights[w1_name] = w1_param
+                    new_weights[w3_name] = w3_param
+            elif n.endswith(".block_sparse_moe.output_linear.weight"):
+                for e in range(p.size(0)):
+                    w2_name = n.replace(
+                        ".block_sparse_moe.output_linear.weight",
+                        f".block_sparse_moe.experts.{e}.w2.weight",
+                    )
+                    w2_param = p[e]
+                    assert w2_name not in new_weights
+                    new_weights[w2_name] = w2_param
+            elif n.endswith(".block_sparse_moe.router.layer.weight"):
+                gate_name = n.replace(
+                    ".block_sparse_moe.router.layer.weight",
+                    ".block_sparse_moe.gate.weight",
+                )
+                assert gate_name not in new_weights
+                new_weights[gate_name] = p
+            else:
+                new_weights[n] = p
+        mixtral.MixtralForCausalLM.load_weights(self, new_weights.items())
+
+
+EntryClass = [GraniteMoeForCausalLM]
diff --git a/sglang/python/sglang/srt/models/granitemoehybrid.py b/sglang/python/sglang/srt/models/granitemoehybrid.py
new file mode 100644
index 0000000000000000000000000000000000000000..e18aeb466a9cf562197404d487c88cc7a11d4766
--- /dev/null
+++ b/sglang/python/sglang/srt/models/granitemoehybrid.py
@@ -0,0 +1,737 @@
+from typing import Iterable, Optional
+
+import torch
+from torch import nn
+from transformers.models.granitemoeshared import GraniteMoeSharedConfig
+
+from sglang.srt.configs.granitemoehybrid import GraniteMoeHybridConfig
+from sglang.srt.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.attention.hybrid_linear_attn_backend import (
+    HybridLinearAttnBackend,
+    Mamba2AttnBackend,
+)
+from sglang.srt.layers.attention.mamba.mamba import MambaMixer2
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.transformers import maybe_prefix
+from sglang.srt.utils import make_layers
+
+from .granitemoe import GraniteMoeMoE
+
+
+# in vLLM this is in a separate file, but keeping it here for decoupling
+class GraniteMoeSharedMLP(nn.Module):
+    def __init__(
+        self,
+        config: GraniteMoeSharedConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.input_size = config.hidden_size
+        self.hidden_size = config.shared_intermediate_size
+        self.input_linear = MergedColumnParallelLinear(
+            input_size=self.input_size,
+            output_sizes=[self.hidden_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.input_linear",
+        )
+        self.output_linear = RowParallelLinear(
+            self.hidden_size,
+            self.input_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.output_linear",
+        )
+        if config.hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {config.hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        gate_up, _ = self.input_linear(hidden_states)
+        x = self.act_fn(gate_up)
+        x, _ = self.output_linear(x)
+        return x
+
+
+class GraniteMoeHybridMambaDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: GraniteMoeHybridConfig,
+        layer_idx: int,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.hidden_size = config.hidden_size
+        self.residual_multiplier = config.residual_multiplier
+
+        self.mamba = MambaMixer2(
+            cache_params=config.mamba2_cache_params,
+            hidden_size=config.hidden_size,
+            use_conv_bias=config.mamba_conv_bias,
+            use_bias=config.mamba_proj_bias,
+            n_groups=config.mamba_n_groups,
+            rms_norm_eps=config.rms_norm_eps,
+            activation=config.hidden_act,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mixer",
+        )
+
+        self.block_sparse_moe = None
+        if getattr(config, "num_local_experts", 0) > 0:
+            self.block_sparse_moe = GraniteMoeMoE(
+                num_experts=config.num_local_experts,
+                top_k=config.num_experts_per_tok,
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                layer_id=layer_idx,
+                quant_config=quant_config,
+                tp_size=get_tensor_model_parallel_world_size(),
+                prefix=f"{prefix}.block_sparse_moe",
+            )
+
+        self.shared_mlp = (
+            None
+            if getattr(config, "shared_intermediate_size", 0) == 0
+            else GraniteMoeSharedMLP(
+                config, quant_config=quant_config, prefix=f"{prefix}.shared_mlp"
+            )
+        )
+
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor | None,
+        forward_batch: ForwardBatch,
+    ):
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+
+        output = torch.empty_like(hidden_states)
+        attn_backend = forward_batch.attn_backend
+        assert isinstance(attn_backend, HybridLinearAttnBackend)
+        assert isinstance(attn_backend.linear_attn_backend, Mamba2AttnBackend)
+        attn_backend.linear_attn_backend.forward(
+            mixer=self.mamba,
+            layer_id=self.layer_idx,
+            hidden_states=hidden_states,
+            output=output,
+            use_triton_causal_conv=True,
+        )
+
+        hidden_states = residual + output * self.residual_multiplier
+
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        if self.shared_mlp is None:
+            if self.block_sparse_moe is not None:
+                hidden_states = self.block_sparse_moe(hidden_states)
+            # else: skip
+        else:
+            # create a copy since block_sparse_moe modifies in-place
+            if self.block_sparse_moe is not None:
+                moe_hidden_states = hidden_states.clone()
+                moe_hidden_states = self.block_sparse_moe(moe_hidden_states)
+                hidden_states = moe_hidden_states + self.shared_mlp(hidden_states)
+                del moe_hidden_states
+            else:
+                hidden_states = self.shared_mlp(hidden_states)
+        hidden_states = residual + hidden_states * self.residual_multiplier
+
+        return hidden_states, residual
+
+
+class GraniteMoeHybridAttention(nn.Module):
+    def __init__(
+        self,
+        config: GraniteMoeHybridConfig,
+        layer_id: int,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.causal = True
+        self.hidden_size = config.hidden_size
+        self.attention_bias = config.attention_bias
+        self.attention_multiplier = config.attention_multiplier
+        self.total_num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.total_num_heads
+        self.total_num_kv_heads = config.num_key_value_heads
+
+        # TensorParallel logic
+        tp_size = get_tensor_model_parallel_world_size()
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_key_value_heads = max(1, self.total_num_kv_heads // tp_size)
+
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=self.attention_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+
+        self.o_proj = RowParallelLinear(
+            self.hidden_size,
+            self.hidden_size,
+            bias=self.attention_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+
+        if config.position_embedding_type == "rope":
+
+            self.rotary_emb = get_rope(
+                head_size=self.head_dim,
+                rotary_dim=self.head_dim,  # its not in the config
+                max_position=config.max_position_embeddings,
+                base=config.rope_theta,
+                rope_scaling=config.rope_scaling,
+            )
+        else:
+            self.rotary_emb = None
+
+        self.attn = RadixAttention(
+            num_heads=self.num_heads,
+            head_dim=self.head_dim,
+            scaling=self.attention_multiplier,
+            num_kv_heads=self.num_key_value_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch | None = None,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        query, key, value = qkv.split(
+            [
+                self.num_heads * self.head_dim,
+                self.num_key_value_heads * self.head_dim,
+                self.num_key_value_heads * self.head_dim,
+            ],
+            dim=-1,
+        )
+
+        if self.rotary_emb is not None:
+            query, key = self.rotary_emb(positions, query, key)
+
+        hidden_states = self.attn(query, key, value, forward_batch=forward_batch)
+        del query, key, value
+
+        hidden_states = self.o_proj(hidden_states)[0]
+        return hidden_states
+
+
+class GraniteMoeHybridAttentionDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: GraniteMoeHybridConfig,
+        layer_idx: int,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.residual_multiplier = config.residual_multiplier
+
+        self.self_attn = GraniteMoeHybridAttention(
+            config,
+            layer_id=layer_idx,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
+        )
+
+        self.block_sparse_moe = None
+        if getattr(config, "num_local_experts", 0) > 0:
+            self.block_sparse_moe = GraniteMoeMoE(
+                num_experts=config.num_local_experts,
+                top_k=config.num_experts_per_tok,
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                layer_id=layer_idx,
+                quant_config=quant_config,
+                tp_size=get_tensor_model_parallel_world_size(),
+                prefix=f"{prefix}.block_sparse_moe",
+            )
+
+        self.shared_mlp = (
+            None
+            if getattr(config, "shared_intermediate_size", 0) == 0
+            else GraniteMoeSharedMLP(
+                config, quant_config=quant_config, prefix=f"{prefix}.shared_mlp"
+            )
+        )
+
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor | None,
+        forward_batch: ForwardBatch | None = None,
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        hidden_states = residual + hidden_states * self.residual_multiplier
+
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        if self.shared_mlp is None:
+            if self.block_sparse_moe is not None:
+                hidden_states = self.block_sparse_moe(hidden_states)
+            # else: skip
+        else:
+            # create a copy since block_sparse_moe modifies in-place
+            if self.block_sparse_moe is not None:
+                moe_hidden_states = hidden_states.clone()
+                moe_hidden_states = self.block_sparse_moe(moe_hidden_states)
+                hidden_states = moe_hidden_states + self.shared_mlp(hidden_states)
+                del moe_hidden_states
+            else:
+                hidden_states = self.shared_mlp(hidden_states)
+        hidden_states = residual + hidden_states * self.residual_multiplier
+
+        return hidden_states, residual
+
+
+ALL_DECODER_LAYER_TYPES = {
+    "attention": GraniteMoeHybridAttentionDecoderLayer,
+    "mamba": GraniteMoeHybridMambaDecoderLayer,
+}
+
+
+class GraniteMoeHybridModel(nn.Module):
+    def __init__(
+        self,
+        config: GraniteMoeHybridConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.config = config
+        self.quant_config = quant_config
+
+        self.vocab_size = config.vocab_size
+
+        self.pp_group = get_pp_group()
+
+        if self.pp_group.is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                self.vocab_size,
+                config.hidden_size,
+                org_num_embeddings=config.vocab_size,
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        self.embedding_multiplier = config.embedding_multiplier
+
+        def get_layer(idx: int, prefix: str):
+            layer_idx = int(prefix.rsplit(".", 1)[1])
+            layer_class = ALL_DECODER_LAYER_TYPES[config.layer_types[layer_idx]]
+            return layer_class(
+                config,
+                layer_idx,
+                quant_config=quant_config,
+                prefix=prefix,
+            )
+
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            get_layer,
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix=f"{prefix}.layers",
+        )
+
+        if self.pp_group.is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer(return_tuple=True)
+        self.layers_to_capture = []
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        """Get input embeddings from the model."""
+        return self.embed_tokens
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch | None = None,
+        inputs_embeds: torch.Tensor | None = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> torch.Tensor:
+        if self.pp_group.is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds
+            else:
+                hidden_states = self.embed_tokens(input_ids)
+                hidden_states = hidden_states * self.embedding_multiplier
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+
+        aux_hidden_states = []
+        for i in range(self.start_layer, self.end_layer):
+            if i in self.layers_to_capture:
+                aux_hidden_states.append(hidden_states + residual)
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                residual,
+                forward_batch,
+            )
+
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors(
+                {
+                    "hidden_states": hidden_states,
+                    "residual": residual,
+                }
+            )
+        else:
+            hidden_states, _ = self.norm(hidden_states, residual)
+
+        if len(aux_hidden_states) == 0:
+            return hidden_states
+
+        return hidden_states, aux_hidden_states
+
+
+class GraniteMoeHybridForCausalLM(
+    nn.Module,
+):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "conv1d": ["conv1d"],
+        "in_proj": ["in_proj"],
+        "input_linear": ["input_linear"],
+    }
+    embedding_modules = {
+        "embed_tokens": "input_embeddings",
+        "lm_head": "output_embeddings",
+    }
+
+    def __init__(
+        self,
+        config: GraniteMoeHybridConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.capture_aux_hidden_states = False
+        self.pp_group = get_pp_group()
+
+        self.quant_config = quant_config
+        self.config = config
+        self.model = GraniteMoeHybridModel(
+            config=config,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "model"),
+        )
+
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=self.quant_config,
+            prefix=maybe_prefix(prefix, "lm_head"),
+        )
+
+        if config.tie_word_embeddings:
+            self.lm_head.weight = self.model.embed_tokens.weight
+
+        self.logits_processor = LogitsProcessor(
+            config,
+            logit_scale=1 / self.config.logits_scaling,
+        )
+
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.embed_tokens
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = False,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ):
+        hidden_states = self.model(
+            input_ids, positions, forward_batch, input_embeds, pp_proxy_tensors
+        )
+
+        aux_hidden_states = None
+        if self.capture_aux_hidden_states:
+            hidden_states, aux_hidden_states = hidden_states
+
+        if self.pp_group.is_last_rank:
+            if not get_embedding:
+                return self.logits_processor(
+                    input_ids,
+                    hidden_states,
+                    self.lm_head,
+                    forward_batch,
+                    aux_hidden_states,
+                )
+            else:
+                return self.pooler(hidden_states, forward_batch)
+        else:
+            return hidden_states
+
+    def get_expert_mapping(self) -> list[tuple[str, str, int, str]]:
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        # layers.0.block_sparse_moe.expert_0.input_linear.input_scale
+        ckpt_gate_proj_name = "gate_proj"
+        ckpt_down_proj_name = "down_proj"
+        ckpt_up_proj_name = "up_proj"
+        num_experts = self.config.num_local_experts
+
+        return [
+            # (param_name, weight_name, expert_id, shard_id)
+            (
+                (
+                    "block_sparse_moe.experts.w13_"
+                    if weight_name in [ckpt_gate_proj_name, ckpt_up_proj_name]
+                    else "block_sparse_moe.experts.w2_"
+                ),
+                f"block_sparse_moe.experts.{expert_id}.{weight_name}.",
+                expert_id,
+                shard_id,
+            )
+            for expert_id in range(num_experts)
+            for shard_id, weight_name in [
+                ("w1", ckpt_gate_proj_name),
+                ("w2", ckpt_down_proj_name),
+                ("w3", ckpt_up_proj_name),
+            ]
+        ]
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+        ]
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        expert_params_mapping = self.get_expert_mapping()
+
+        def _load(n, p):
+            param = params_dict[n]
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            weight_loader(param, p)
+            loaded_params.add(n)
+
+        def _load_shard(n, p, shard_id):
+            # Skip layers on other devices.
+            param = params_dict[n]
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            weight_loader(param, p, shard_id)
+            loaded_params.add(n)
+
+        def _load_expert(n, p, name, shard_id, expert_id):
+            param = params_dict[n]
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            weight_loader(param, p, name, shard_id=shard_id, expert_id=expert_id)
+            loaded_params.add(n)
+
+        def _load_quant_expert(name, loaded_weight):
+            for mapping in expert_params_mapping:
+                param_name, weight_name, expert_id, shard_id = mapping
+
+                if weight_name not in name:
+                    continue
+
+                name_mapped = name.replace(weight_name, param_name)
+
+                # Skip layers on other devices.
+                # if is_pp_missing_parameter(name_mapped, self):
+                #     continue
+
+                param = params_dict[name_mapped]
+                weight_loader = param.weight_loader
+                success = False
+
+                if weight_loader is not None:
+                    success = weight_loader(
+                        param,
+                        loaded_weight,
+                        name_mapped,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                        return_success=True,
+                    )
+
+                if success:
+                    return name_mapped
+            return None
+
+        for n, p in weights:
+            if "A_log" in n:
+                n = n.replace("A_log", "A")
+
+            if self.quant_config is not None and (
+                scale_name := self.quant_config.get_cache_scale(n)
+            ):
+                # Loading kv cache quantization scales
+                loaded_weight = p
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
+                _load(scale_name, loaded_weight)
+                loaded_params.add(scale_name)
+                continue
+
+            if _load_quant_expert(n, p):
+                continue
+
+            # Logic analogous to: https://github.com/vllm-project/vllm/blob/f49e5aff11c986ed4d45202b1716c5d74786efa9/vllm/model_executor/models/granitemoeshared.py#L215
+            # Mapping different experts' layout:
+            #  from HF (input_linear, output_linear, router)
+            #  to vLLM (experts_w13({e}.w1, {e}.w2), experts_w3({e}.w3), gate)
+            # The renaming and parameter loading logic is the same for weight
+            # and weight_scale tensors so we can reuse them without issues.
+            if n.endswith(".block_sparse_moe.input_linear.weight") or n.endswith(
+                ".block_sparse_moe.input_linear.weight_scale"
+            ):
+                for e in range(p.size(0)):
+                    w1_name = n.replace(
+                        ".block_sparse_moe.input_linear.weight",
+                        f".block_sparse_moe.experts.{e}.w1.weight",
+                    )
+                    w3_name = n.replace(
+                        ".block_sparse_moe.input_linear.weight",
+                        f".block_sparse_moe.experts.{e}.w3.weight",
+                    )
+                    w1_param, w3_param = p[e].chunk(2, dim=0)
+                    _load_expert(
+                        n.replace(".input_linear.", ".experts.w13_"),
+                        w1_param,
+                        w1_name,
+                        shard_id="w1",
+                        expert_id=e,
+                    )
+                    _load_expert(
+                        n.replace(".input_linear.", ".experts.w13_"),
+                        w3_param,
+                        w3_name,
+                        shard_id="w3",
+                        expert_id=e,
+                    )
+            elif n.endswith(".block_sparse_moe.output_linear.weight") or n.endswith(
+                ".block_sparse_moe.output_linear.weight_scale"
+            ):
+                for e in range(p.size(0)):
+                    w2_name = n.replace(
+                        ".block_sparse_moe.output_linear.weight",
+                        f".block_sparse_moe.experts.{e}.w2.weight",
+                    )
+                    w2_param = p[e]
+                    _load_expert(
+                        n.replace(".output_linear.", ".experts.w2_"),
+                        w2_param,
+                        w2_name,
+                        shard_id="w2",
+                        expert_id=e,
+                    )
+            elif n.endswith(".block_sparse_moe.router.layer.weight"):
+                gate_name = n.replace(
+                    ".block_sparse_moe.router.layer.weight",
+                    ".block_sparse_moe.gate.weight",
+                )
+                _load(gate_name, p)
+            else:
+                loaded = False
+                for param_name, weight_name, shard_id in stacked_params_mapping:
+                    if weight_name in n:
+                        _load_shard(
+                            n.replace(weight_name, param_name), p, shard_id=shard_id
+                        )
+                        loaded = True
+                if not loaded:
+                    _load(n, p)
+
+        return loaded_params
+
+
+EntryClass = [GraniteMoeHybridForCausalLM]
diff --git a/sglang/python/sglang/srt/models/grok.py b/sglang/python/sglang/srt/models/grok.py
new file mode 100644
index 0000000000000000000000000000000000000000..f82642777447e3fc3642069197a5f5e39effc7dc
--- /dev/null
+++ b/sglang/python/sglang/srt/models/grok.py
@@ -0,0 +1,997 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import functools
+import logging
+import math
+from typing import Iterable, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.layers.activation import GeluAndMul
+from sglang.srt.layers.elementwise import (
+    fused_dual_residual_rmsnorm,
+    fused_rmsnorm,
+    gelu_and_mul_triton,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+from sglang.srt.layers.moe.router import fused_moe_router_shim
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import (
+    RotaryEmbedding,
+    _yarn_find_correction_range,
+    _yarn_get_mscale,
+    get_rope,
+)
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.loader import DefaultModelLoader
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix, is_npu
+
+_is_npu = is_npu()
+
+logger = logging.getLogger(__name__)
+
+
+class Grok1MLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        reduce_results=True,
+        use_presharded_weights: bool = False,
+        split_gate_up: bool = False,
+    ) -> None:
+        super().__init__()
+
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+            use_presharded_weights=use_presharded_weights,
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+            reduce_results=reduce_results,
+            use_presharded_weights=use_presharded_weights,
+        )
+        self.act_fn = GeluAndMul(approximate="tanh")
+        self.layer_id = layer_id
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x, _ = gelu_and_mul_triton(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class Grok1MoE(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        num_experts: int,
+        top_k: int,
+        hidden_size: int,
+        intermediate_size: int,
+        params_dtype: Optional[torch.dtype] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        tp_size: Optional[int] = None,
+        reduce_results: bool = True,
+        use_presharded_weights: bool = False,
+        inplace: bool = True,
+        no_combine: bool = False,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = hidden_size
+
+        self.gate = ReplicatedLinear(
+            hidden_size,
+            num_experts,
+            bias=False,
+            params_dtype=torch.float32,
+            quant_config=None,
+        )
+
+        self.router_logit_softcapping = 30.0
+        custom_routing_function = functools.partial(
+            fused_moe_router_shim, self.router_logit_softcapping
+        )
+
+        self.topk = TopK(
+            top_k=top_k,
+            renormalize=False,
+            layer_id=layer_id,
+            custom_routing_function=None if _is_npu else custom_routing_function,
+        )
+
+        self.experts = FusedMoE(
+            num_experts=num_experts,
+            top_k=top_k,
+            layer_id=layer_id,
+            hidden_size=hidden_size,
+            intermediate_size=intermediate_size,
+            params_dtype=params_dtype,
+            quant_config=quant_config,
+            activation="gelu",
+            reduce_results=reduce_results,
+            use_presharded_weights=use_presharded_weights,
+            inplace=inplace,
+            no_combine=no_combine,
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        if not _is_npu:
+            topk_output = self.topk(hidden_states, self.gate.weight)
+            return self.experts(hidden_states, topk_output)
+        else:
+            orig_shape = hidden_states.shape
+            hidden_states = hidden_states.view(-1, self.hidden_size)
+
+            router_logits, _ = self.gate(hidden_states)
+            router_logits = self.router_logit_softcapping * F.tanh(
+                router_logits / self.router_logit_softcapping
+            )
+            topk_output = self.topk(hidden_states, router_logits)
+
+            final_hidden_states = self.experts(hidden_states, topk_output)
+            return final_hidden_states.view(orig_shape)
+
+
+def _yarn_linear_ramp_mask(
+    low: float, high: float, dim: int, dtype: torch.dtype
+) -> torch.Tensor:
+    if low == high:
+        low -= 0.001  # Prevent singularity
+
+    linear_func = (torch.arange(dim, dtype=dtype) - low) / (high - low)
+    ramp_func = torch.clamp(linear_func, 0, 1)
+    return ramp_func
+
+
+def get_rope_scaling(config):
+    rope_type = getattr(config, "rope_type", None)
+    if rope_type:
+        original_max_position_embeddings = getattr(
+            config, "original_max_position_embeddings", None
+        )
+        scaling_factor = getattr(config, "scaling_factor", None)
+        extrapolation_factor = getattr(config, "extrapolation_factor", 1.0)
+        attn_factor = getattr(config, "attn_factor", 1.0)
+        beta_fast = getattr(config, "beta_fast", 32)
+        beta_slow = getattr(config, "beta_slow", 1)
+        rope_scaling = {
+            "extra_method": rope_type,
+            "max_position_embeddings": original_max_position_embeddings,
+            "scaling_factor": scaling_factor,
+            "extrapolation_factor": extrapolation_factor,
+            "attn_factor": attn_factor,
+            "beta_fast": beta_fast,
+            "beta_slow": beta_slow,
+            "dtype": torch.bfloat16,
+        }
+        return rope_scaling
+    else:
+        return None
+
+
+class ScalingRotaryEmbedding(RotaryEmbedding):
+    """Scale the RotaryEmbedding in a way similar to YaRN method. https://arxiv.org/pdf/2309.00071."""
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        scaling_factor: float,
+        dtype: torch.dtype,
+        *,
+        extra_method: str = "yarn_log",
+        extrapolation_factor: float = 1,
+        attn_factor: float = 1,
+        beta_fast: int = 32,
+        beta_slow: int = 1,
+    ) -> None:
+        self.scaling_factor = scaling_factor
+        self.extra_method = extra_method
+        self.extrapolation_factor = extrapolation_factor
+        self.attn_factor = attn_factor
+        self.beta_fast = beta_fast
+        self.beta_slow = beta_slow
+        if _is_npu:
+            dtype = torch.float32
+        # Get n-d magnitude scaling corrected for interpolation
+        self.mscale = float(_yarn_get_mscale(self.scaling_factor) * attn_factor)
+        super().__init__(
+            head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
+        )
+
+    def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor:
+        pos_freqs = self.base ** (
+            torch.arange(0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim
+        )
+        inv_freq_extrapolation = 1.0 / pos_freqs
+        inv_freq_interpolation = 1.0 / (scaling_factor * pos_freqs)
+
+        low, high = _yarn_find_correction_range(
+            self.beta_fast,
+            self.beta_slow,
+            self.rotary_dim,
+            self.base,
+            self.max_position_embeddings,
+        )
+        # Get n-d rotational scaling corrected for extrapolation
+        inv_freq_mask = (
+            1
+            - _yarn_linear_ramp_mask(low, high, self.rotary_dim // 2, dtype=torch.float)
+        ) * self.extrapolation_factor
+        if self.extra_method in ["original"]:
+            inv_freq = inv_freq_extrapolation
+        elif self.extra_method in ["yarn", "yarn_linear"]:
+            inv_freq = (
+                inv_freq_interpolation * (1 - inv_freq_mask)
+                + inv_freq_extrapolation * inv_freq_mask
+            )
+        elif self.extra_method == "yarn_log":
+            inv_freq = torch.exp(
+                torch.log(inv_freq_extrapolation) * inv_freq_mask
+                + torch.log(inv_freq_interpolation) * (1.0 - inv_freq_mask)
+            )
+        elif self.extra_method == "theta_scale":
+            exponents = torch.arange(0, self.rotary_dim, 2, dtype=torch.float)
+            theta_scale_exponent = self.base ** (
+                math.log(
+                    self.max_position_embeddings * self.scaling_factor / (2 * math.pi)
+                )
+                / math.log(self.max_position_embeddings / (2 * math.pi))
+            )
+            inv_freq = torch.tensor(
+                1.0 / (theta_scale_exponent ** (exponents / self.rotary_dim)),
+                dtype=torch.float32,
+            )
+        else:
+            raise ValueError(f"Unknown extrapolation method: {self.extra_method}")
+        return inv_freq
+
+    def _compute_cos_sin_cache(self) -> torch.Tensor:
+        inv_freq = self._compute_inv_freq(self.scaling_factor)
+        t = torch.arange(
+            self.max_position_embeddings * self.scaling_factor, dtype=torch.float32
+        )
+        freqs = torch.einsum("i,j -> ij", t, inv_freq)
+        # cos = freqs.cos() * self.mscale
+        # sin = freqs.sin() * self.mscale
+        cos = freqs.cos()
+        sin = freqs.sin()
+        cache = torch.cat((cos, sin), dim=-1)
+        return cache
+
+
+class Grok1Attention(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        max_position: int = 4096 * 32,
+        rope_theta: float = 10000,
+        quant_config: Optional[QuantizationConfig] = None,
+        reduce_results: bool = True,
+        alt_stream: Optional[torch.cuda.Stream] = None,
+        load_presharded_attn: bool = False,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.layer_id = layer_id
+        self.hidden_size = hidden_size
+        attn_tp_rank = get_tensor_model_parallel_rank()
+        attn_tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % attn_tp_size == 0
+        self.num_heads = self.total_num_heads // attn_tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= attn_tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % attn_tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert attn_tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // attn_tp_size)
+        self.head_dim = getattr(config, "head_dim", 128)
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        rope_scaling = get_rope_scaling(config)
+        self.load_presharded_attn = load_presharded_attn
+        self.alt_stream = alt_stream or torch.cuda.Stream()
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            load_presharded_attn=self.load_presharded_attn,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            use_presharded_weights=self.load_presharded_attn,
+            prefix=add_prefix("o_proj", prefix),
+        )
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position,
+            base=int(self.rope_theta),
+            is_neox_style=True,
+        )
+
+        self.rope_rotate_half_dims = getattr(config, "rope_rotate_half_dims", False)
+
+        if rope_scaling is not None:
+            self.rotary_emb = ScalingRotaryEmbedding(
+                self.head_dim,
+                rotary_dim=(
+                    self.head_dim
+                    if not self.rope_rotate_half_dims
+                    else self.head_dim // 2
+                ),
+                base=int(self.rope_theta),
+                is_neox_style=True,
+                **rope_scaling,
+            )
+            pos_encoding_mode = "NONE"
+        else:
+            self.rotary_emb = get_rope(
+                self.head_dim,
+                rotary_dim=(
+                    self.head_dim
+                    if not self.rope_rotate_half_dims
+                    else self.head_dim // 2
+                ),
+                max_position=max_position,
+                base=int(self.rope_theta),
+                is_neox_style=True,
+                dtype=torch.float32 if _is_npu else None,
+            )
+            pos_encoding_mode = "NONE"
+
+        logit_cap = max(getattr(config, "attn_logit_softcapping", 30.0), 0.0)
+        logit_capping_method = getattr(config, "attn_logit_softcapping_method", "tanh")
+
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            logit_cap=logit_cap,
+            quant_config=quant_config,
+            pos_encoding_mode=pos_encoding_mode,
+            logit_capping_method=logit_capping_method,
+            prefix=add_prefix("attn", prefix),
+        )
+        self.attn.xai_temperature_len = getattr(self.config, "attn_temperature_len", -1)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        if not _is_npu:
+            q, k = self.rotary_emb(positions, q, k)
+        else:
+            odtype = q.dtype
+            q, k = self.rotary_emb(positions, q.to(torch.float32), k.to(torch.float32))
+            q, k = q.to(odtype), k.to(odtype)
+
+        attn_output = self.attn(q, k, v, forward_batch)
+
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class Grok1DecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        load_presharded_moe: bool = False,
+        load_presharded_attn: bool = False,
+        load_presharded_mlp: bool = False,
+        alt_stream: Optional[torch.cuda.Stream] = None,
+        skip_moe: bool = False,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.num_experts = config.num_local_experts
+        self.hidden_size = config.hidden_size
+        self.residual_moe = getattr(config, "residual_moe", False)
+        self.layer_id = layer_id
+        self.alt_stream = alt_stream or torch.cuda.Stream()
+
+        rope_theta = getattr(config, "rope_theta", 10000)
+        self.self_attn = Grok1Attention(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            max_position=(
+                config.context_len
+                if hasattr(config, "context_len")
+                else config.max_position_embeddings
+            ),
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            quant_config=quant_config,
+            reduce_results=False,
+            alt_stream=self.alt_stream,
+            load_presharded_attn=load_presharded_attn,
+            prefix=add_prefix("attn", prefix),
+        )
+
+        split_gate_up = not getattr(config, "merge_gate_up", True)
+        if self.num_experts > 0:
+            self.block_sparse_moe = Grok1MoE(
+                config=config,
+                layer_id=layer_id,
+                num_experts=config.num_local_experts,
+                top_k=config.num_experts_per_tok,
+                hidden_size=config.hidden_size,
+                intermediate_size=getattr(
+                    config,
+                    "moe_intermediate_size",
+                    getattr(config, "intermediate_size", None),
+                ),
+                quant_config=quant_config,
+                reduce_results=not self.residual_moe,
+                use_presharded_weights=load_presharded_moe,
+                inplace=False,  # not self.residual_moe,
+                no_combine=False,  # self.residual_moe,  # just a suggestion to not combine topk
+                prefix=add_prefix("block_sparse_moe", prefix),
+            )
+            if self.residual_moe:
+                self.mlp = Grok1MLP(
+                    hidden_size=config.hidden_size,
+                    intermediate_size=config.intermediate_size,
+                    quant_config=quant_config,
+                    reduce_results=False,
+                    use_presharded_weights=load_presharded_mlp,
+                    layer_id=layer_id,
+                    split_gate_up=split_gate_up,
+                )
+        else:
+            raise NotImplementedError()
+
+        self.pre_attn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.pre_moe_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_moe_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        if self.num_experts > 0:
+            if self.residual_moe:
+                # NOTE: self.block_sparse_moe modifies the input in-place,
+                # so we have to call it later. Be aware of any possible related errors.
+                if get_tensor_model_parallel_world_size() > 1:
+                    self.ffn = lambda x: tensor_model_parallel_all_reduce(
+                        self.moe_with_rmoe(x)
+                    )
+                else:
+                    self.ffn = self.moe_with_rmoe
+            else:
+                self.ffn = self.block_sparse_moe
+        else:
+            raise NotImplementedError()
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor] = None,
+        deferred_norm: Optional[RMSNorm] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor, RMSNorm]:
+
+        hidden_states_original = hidden_states
+        residual_original = residual
+
+        # Self Attention
+        if deferred_norm is not None:
+            assert residual is not None
+            # here hidden_states is output of ffn, residual is residual from after previous attn layer
+            hidden_states, residual = fused_dual_residual_rmsnorm(
+                hidden_states,
+                residual,
+                deferred_norm.weight,
+                self.pre_attn_norm.weight,
+                deferred_norm.variance_epsilon,
+            )
+        else:
+            # here hidden_states is the residual
+            hidden_states, residual = (
+                fused_rmsnorm(
+                    hidden_states,
+                    self.pre_attn_norm.weight,
+                    self.pre_attn_norm.variance_epsilon,
+                ),
+                hidden_states,
+            )
+
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        if get_tensor_model_parallel_world_size() > 1:
+            hidden_states = tensor_model_parallel_all_reduce(hidden_states)
+
+        hidden_states, residual = fused_dual_residual_rmsnorm(
+            hidden_states,
+            residual,
+            self.post_attn_norm.weight,
+            self.pre_moe_norm.weight,
+            self.post_attn_norm.variance_epsilon,
+        )
+
+        # Fully Connected
+        hidden_states = self.ffn(hidden_states)
+        return hidden_states, residual, self.post_moe_norm  # defer layernorm
+
+    def moe_with_rmoe(self, x):
+        if self.alt_stream is not None and get_is_capture_mode():
+            current_stream = torch.cuda.current_stream()
+            self.alt_stream.wait_stream(current_stream)
+            mlp_result = self.mlp(x)
+            with torch.cuda.stream(self.alt_stream):
+                moe_result = self.block_sparse_moe(x)
+            current_stream.wait_stream(self.alt_stream)
+        else:
+            mlp_result = self.mlp(x)
+            moe_result = self.block_sparse_moe(x)
+        return (mlp_result + moe_result) / 1.4142135623730951
+
+
+class Grok1Model(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        load_presharded_moe: bool = False,
+        load_presharded_embedding: bool = False,
+        load_presharded_attn: bool = False,
+        load_presharded_mlp: bool = False,
+        replicate_embedding: bool = False,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            use_presharded_weights=load_presharded_embedding,
+            enable_tp=not replicate_embedding,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+
+        self.alt_stream = torch.cuda.Stream()
+        self.layers = nn.ModuleList(
+            [
+                Grok1DecoderLayer(
+                    config,
+                    i,
+                    quant_config=quant_config,
+                    load_presharded_moe=load_presharded_moe,
+                    load_presharded_attn=load_presharded_attn,
+                    load_presharded_mlp=load_presharded_mlp,
+                    alt_stream=self.alt_stream,
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+            hidden_states.mul_(self.config.embedding_multiplier_scale)
+        else:
+            hidden_states = input_embeds
+
+        residual, deferred_norm = None, None
+        for i in range(len(self.layers)):
+            hidden_states, residual, deferred_norm = self.layers[i](
+                positions, hidden_states, forward_batch, residual, deferred_norm
+            )
+
+        hidden_states, _ = fused_dual_residual_rmsnorm(
+            hidden_states,
+            residual,
+            deferred_norm.weight,
+            self.norm.weight,
+            deferred_norm.variance_epsilon,
+        )
+
+        return hidden_states
+
+
+class Grok1ForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+
+        # Get presharded weights.
+        self.load_presharded_mlp = getattr(config, "load_presharded_mlp", False)
+        self.load_presharded_moe = (
+            getattr(config, "load_presharded_moe", True)
+            and self.config.num_local_experts > 0
+            and get_tensor_model_parallel_world_size() > 1
+        )
+        self.load_presharded_attn = getattr(config, "load_presharded_attn", False)
+        self.load_presharded_embedding = getattr(
+            config, "load_presharded_embedding", False
+        )
+
+        default_replicate_lm_head = False
+        self.replicate_lm_head = getattr(
+            config, "replicate_lm_head", default_replicate_lm_head
+        )
+
+        if get_tensor_model_parallel_world_size() > 1:
+            setattr(DefaultModelLoader, "_prepare_weights", _prepare_presharded_weights)
+
+        self.replicate_embedding = getattr(config, "replicate_embedding", False)
+
+        self.model = Grok1Model(
+            config,
+            quant_config=quant_config,
+            load_presharded_moe=self.load_presharded_moe,
+            load_presharded_embedding=self.load_presharded_embedding,
+            load_presharded_attn=self.load_presharded_attn,
+            load_presharded_mlp=self.load_presharded_mlp,
+            replicate_embedding=self.replicate_embedding,
+            prefix=add_prefix("model", prefix),
+        )
+
+        lm_head_params_dtype = None
+        if self.replicate_lm_head:
+            self.lm_head = ReplicatedLinear(
+                config.hidden_size,
+                config.vocab_size,
+                bias=False,
+                params_dtype=lm_head_params_dtype,
+                prefix=add_prefix("lm_head", prefix),
+            )
+            self.logits_processor = LogitsProcessor(config, skip_all_gather=True)
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                use_presharded_weights=self.load_presharded_embedding,
+                params_dtype=lm_head_params_dtype,
+                prefix=add_prefix("lm_head", prefix),
+            )
+            self.logits_processor = LogitsProcessor(config)
+
+        self.loaded_param_names = set()
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(
+        self,
+        weights: Iterable[Tuple[str, torch.Tensor]],
+        ignore_parent_name: bool = False,
+        check_hit_names: bool = True,
+        model_config: PretrainedConfig | None = None,
+    ) -> dict[str, torch.Tensor]:
+        if model_config is None:
+            model_config = self.config
+
+        stacked_params_mapping = []
+        stacked_params_mapping += [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+        stacked_params_mapping += [
+            # (param_name, shard_name, shard_id)
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        num_experts = model_config.num_local_experts
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="w1",
+            ckpt_down_proj_name="w2",
+            ckpt_up_proj_name="w3",
+            num_experts=num_experts,
+        )
+
+        params_dict = dict(self.named_parameters())
+        all_names = set(params_dict.keys())
+        hit_names = set()
+
+        def load_weight_wrapper(
+            name: str, loaded_weight: torch.Tensor, *args, **kwargs
+        ):
+            # Fuse constant multipliers into the weights
+            if "lm_head" in name:
+                loaded_weight = (
+                    loaded_weight.to(torch.float32)
+                    * model_config.output_multiplier_scale
+                )
+
+            original_name = name
+            if ignore_parent_name:
+                name = name.split(".")[-1]
+
+            if name not in params_dict:
+                logger.info(f"Skipping {name=} in load_weights_wrapper")
+                return
+
+            param = params_dict[name]
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            weight_loader(param, loaded_weight, *args, **kwargs)
+            hit_names.add(name)
+            self.loaded_param_names.add(original_name)
+
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                load_weight_wrapper(name, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+
+                    load_weight_wrapper(
+                        name,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    if name is None:
+                        continue
+
+                    load_weight_wrapper(name=name, loaded_weight=loaded_weight)
+
+        if check_hit_names:
+            if len(hit_names) > 5:
+                missing = all_names - hit_names
+                missing_exclude_scales = {x for x in missing if "scale" not in x}
+                logger.info(
+                    f"#all_names: {len(all_names)}, #hit_names: {len(hit_names)}, #missing_exclude_scales: {len(missing_exclude_scales)}",
+                )
+                if len(missing_exclude_scales) > 0:
+                    raise ValueError(
+                        f"load_weights failed because some weights are missing: {missing_exclude_scales=}."
+                    )
+
+            elif len(hit_names) == 0:
+                raise ValueError(
+                    f"load_weights failed because it did not hit any names. {all_names=} {hit_names=}"
+                )
+
+        return hit_names
+
+    def get_num_params_analytical(self):
+        cfg = self.config
+        moe_intermediate_size = getattr(
+            cfg,
+            "moe_intermediate_size",
+            getattr(cfg, "intermediate_size", None),
+        )
+        residual_moe = getattr(cfg, "residual_moe", False)
+        if cfg.num_local_experts > 0:
+            num_experts = cfg.num_local_experts + (1 if residual_moe else 0)
+        else:
+            num_experts = 1
+
+        wq = (
+            cfg.num_hidden_layers
+            * cfg.hidden_size
+            * cfg.num_attention_heads
+            * cfg.head_dim
+        )
+        wkv = (
+            cfg.num_hidden_layers
+            * cfg.hidden_size
+            * cfg.num_key_value_heads
+            * cfg.head_dim
+            * 2
+        )
+        out = (
+            cfg.num_hidden_layers
+            * cfg.hidden_size
+            * cfg.num_attention_heads
+            * cfg.head_dim
+        )
+        ffn1 = (
+            cfg.num_hidden_layers
+            * num_experts
+            * cfg.hidden_size
+            * moe_intermediate_size
+            * 2
+        )
+        ffn2 = (
+            cfg.num_hidden_layers
+            * num_experts
+            * cfg.hidden_size
+            * moe_intermediate_size
+        )
+        embed = cfg.hidden_size * cfg.vocab_size * 2
+        return wq + wkv + out + ffn1 + ffn2 + embed
+
+    def get_num_params_torch(self):
+        return (
+            sum(p.numel() for p in self.parameters())
+            * get_tensor_model_parallel_world_size()
+        )
+
+
+old_prepare_weights = getattr(DefaultModelLoader, "_prepare_weights")
+
+
+def _prepare_presharded_weights(
+    self, model_name_or_path: str, revision: Optional[str], fall_back_to_pt: bool
+) -> Tuple[str, list[str], bool]:
+    import glob
+    import os
+
+    if get_tensor_model_parallel_world_size() == 1:
+        return old_prepare_weights(self, model_name_or_path, revision, fall_back_to_pt)
+
+    if not os.path.isdir(model_name_or_path):
+        from sglang.srt.model_loader.weight_utils import download_weights_from_hf
+
+        allow_patterns = ["*.safetensors", "*.bin"]
+        hf_folder = download_weights_from_hf(
+            model_name_or_path,
+            self.load_config.download_dir,
+            allow_patterns,
+            revision,
+            ignore_patterns=self.load_config.ignore_patterns,
+        )
+    else:
+        hf_folder = model_name_or_path
+
+    tp_rank = get_tensor_model_parallel_rank()
+
+    # The old format
+    allow_patterns = [f"*-{tp_rank:03d}.bin"]
+
+    # The new format
+    allow_patterns += [f"*-TP-{tp_rank:03d}.safetensors", "*-TP-common.safetensors"]
+
+    hf_weights_files = []
+    for pattern in allow_patterns:
+        hf_weights_files += glob.glob(os.path.join(hf_folder, pattern))
+
+    if hf_weights_files[0].endswith("safetensors"):
+        use_safetensors = True
+    else:
+        use_safetensors = False
+
+    return hf_folder, hf_weights_files, use_safetensors
+
+
+class Grok1ModelForCausalLM(Grok1ForCausalLM):
+    """An alias for backward-compatbility."""
+
+    pass
+
+
+EntryClass = [Grok1ForCausalLM, Grok1ModelForCausalLM]
diff --git a/sglang/python/sglang/srt/models/hunyuan.py b/sglang/python/sglang/srt/models/hunyuan.py
new file mode 100644
index 0000000000000000000000000000000000000000..0128d009564dfdc310217102c8ba2541bb49247d
--- /dev/null
+++ b/sglang/python/sglang/srt/models/hunyuan.py
@@ -0,0 +1,816 @@
+# coding=utf-8
+# Copyright 2024 The HunYuan team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only HunYuan model compatible with HuggingFace weights."""
+
+import re
+from typing import Any, Dict, Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.eplb.expert_distribution import ExpertDistributionRecorder
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.sampler import create_sampler
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    kv_cache_scales_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.utils import is_hip
+
+expert_distribution_recorder = ExpertDistributionRecorder()
+
+
+def _is_moe(config: PretrainedConfig) -> bool:
+    if getattr(config, "num_experts", None) and (
+        (isinstance(config.num_experts, int) and config.num_experts > 1)
+        or (isinstance(config.num_experts, list) and max(config.num_experts) > 1)
+    ):
+        return True
+    else:
+        return False
+
+
+def _get_cla_factor(config: PretrainedConfig) -> int:
+    if not getattr(config, "use_cla", False):
+        return 1
+    return getattr(config, "cla_share_factor", 1)
+
+
+class HunYuanMLP(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        bias: bool = False,
+        prefix: str = "",
+        reduce_results: bool = True,
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            input_size=hidden_size,
+            output_sizes=[intermediate_size] * 2,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            input_size=intermediate_size,
+            output_size=hidden_size,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
+            reduce_results=reduce_results,
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class HunYuanSparseMoeBlock(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        layer_id: int = -1,
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+
+        if self.tp_size > config.num_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {config.num_experts}."
+            )
+
+        # Get layer_id topk if config.moe_topk is a list
+        if isinstance(config.moe_topk, list):
+            assert layer_id >= 0
+            assert len(config.moe_topk) > layer_id
+            top_k = config.moe_topk[layer_id]
+        else:
+            top_k = config.moe_topk
+
+        # If it is moe, moe_intermediate_size is preferred
+        intermediate_size = config.intermediate_size
+        if config.moe_intermediate_size is not None:
+            intermediate_size = (
+                config.moe_intermediate_size
+                if isinstance(config.moe_intermediate_size, int)
+                else config.moe_intermediate_size[layer_id]
+            )
+
+        self.topk = TopK(
+            top_k=top_k,
+            layer_id=layer_id,
+            renormalize=True if top_k > 1 else False,
+        )
+
+        self.experts = FusedMoE(
+            num_experts=config.num_experts,
+            hidden_size=config.hidden_size,
+            intermediate_size=intermediate_size,
+            reduce_results=False,
+            layer_id=layer_id,
+            quant_config=quant_config,
+        )
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size, config.num_experts, bias=False, quant_config=None
+        )
+        if config.use_mixed_mlp_moe > 0:
+            # Get layer_id num_shared_expert if config.num_shared_expert is a list
+            if isinstance(config.num_shared_expert, list):
+                assert layer_id >= 0
+                assert len(config.num_shared_expert) > layer_id
+                num_shared_expert = config.num_shared_expert[layer_id]
+            else:
+                num_shared_expert = config.num_shared_expert
+
+            self.shared_mlp = HunYuanMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size * num_shared_expert,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                reduce_results=False,
+            )
+        else:
+            self.shared_mlp = None
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # NOTE: hidden_states can have either 1D or 2D shape.
+        orig_shape = hidden_states.shape
+        hidden_dim = hidden_states.shape[-1]
+        hidden_states = hidden_states.view(-1, hidden_dim)
+        shared_output = None
+        if self.shared_mlp is not None:
+            shared_output = self.shared_mlp(hidden_states)
+
+        # router_logits: (num_tokens, n_experts)
+        router_logits, _ = self.gate(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        final_hidden_states = self.experts(hidden_states, topk_output)
+        if shared_output is not None:
+            final_hidden_states = final_hidden_states + shared_output
+        if self.tp_size > 1:
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+
+        return final_hidden_states.view(orig_shape)
+
+
+def get_head_dim(config):
+    if hasattr(config, "head_dim"):
+        return int(config.head_dim)
+    if hasattr(config, "attention_head_dim"):
+        return int(config.attention_head_dim)
+
+    # since some hunyuan model don't follow the self.hidden_size // self.total_num_heads rule
+    # wrong setting may cause runtime error, just throw error if this field is missing.
+    raise ValueError("Missing head dim config, try set head_dim in config.json")
+
+
+def check_head_dim(config):
+    # Some models may lack `head_dim` and use `attention_head_dim` instead.
+    # This attribute is also used by flashinfer_backend.py, so we check for
+    # consistency and raise an error if it's not met to avoid silent failures.
+    # Although we could adapt the HunYuan model to use `attention_head_dim`,
+    # flashinfer expects `head_dim`, so we enforce its presence for correctness.
+    calc_head_dim = config.hidden_size // config.num_attention_heads
+
+    if hasattr(config, "attention_head_dim"):
+        if calc_head_dim != config.attention_head_dim and not hasattr(
+            config, "head_dim"
+        ):
+            # in this case, flash infer(and other components may calculate wrong value.)
+            raise ValueError(
+                f"HunYuan model config error: calculated head_dim {calc_head_dim} != attention_head_dim {config.attention_head_dim}"
+                + f"\nPlease Add head_dim:{config.attention_head_dim} in config.json to make sure correctly inference."
+            )
+
+        if hasattr(config, "head_dim") and config.attention_head_dim != config.head_dim:
+            raise ValueError(
+                f"HunYuan model config error: head_dim({config.head_dim}) != attention_head_dim({config.attention_head_dim})"
+                + f"\nPlease change head_dim:{config.attention_head_dim} in config.json to make sure correctly inference."
+            )
+
+
+class HunYuanAttention(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        bias: bool = False,
+        prefix: str = "",
+        attention_type: str = "self",
+        layer_id: int = -1,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        # MistralConfig has an optional head_dim introduced by Mistral-Nemo
+        # Prioritize `head_dim` but fall back to `attention_head_dim` for Hunyuan models.
+        self.head_dim = get_head_dim(config)
+
+        check_head_dim(config)
+
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+        self.use_qk_norm = getattr(config, "use_qk_norm", False)
+        self.attention_type = attention_type
+        self.layer_id = layer_id
+
+        if attention_type == "self":
+            self.qkv_proj = QKVParallelLinear(
+                hidden_size=hidden_size,
+                head_size=self.head_dim,
+                total_num_heads=self.total_num_heads,
+                total_num_kv_heads=self.total_num_kv_heads,
+                bias=bias,
+                quant_config=quant_config,
+                prefix=f"{prefix}.qkv_proj",
+            )
+        elif attention_type == "cross":
+            self.q_proj = ColumnParallelLinear(
+                hidden_size,
+                hidden_size,
+                bias=bias,
+                quant_config=quant_config,
+                prefix=f"{prefix}.q_proj",
+            )
+        else:
+            raise RuntimeError("Not support attnention type")
+
+        self.o_proj = RowParallelLinear(
+            input_size=self.total_num_heads * self.head_dim,
+            output_size=hidden_size,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+
+        is_neox_style = True
+        if quant_config is not None and quant_config.get_name() == "gguf":
+            is_neox_style = False
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            is_neox_style=is_neox_style,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            prefix=f"{prefix}.attn",
+        )
+
+        if self.use_qk_norm:
+            self.query_layernorm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+            self.key_layernorm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        kv_states: Optional[Tuple[torch.Tensor]] = None,
+    ) -> torch.Tensor:
+        if self.attention_type == "self":
+            qkv, _ = self.qkv_proj(hidden_states)
+            q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+            q, k = self.rotary_emb(positions, q, k)
+            ori_k = k
+            if self.use_qk_norm:
+                # q = self.query_layernorm(q.view(-1, self.num_heads, self.head_dim).contiguous())
+                # k = self.key_layernorm(k.view(-1, self.num_kv_heads, self.head_dim).contiguous())
+                q = self.query_layernorm(q.reshape(-1, self.head_dim).contiguous())
+                k = self.key_layernorm(k.reshape(-1, self.head_dim).contiguous())
+        elif self.attention_type == "cross":
+            assert kv_states is not None
+            ori_k, v = kv_states  # use last layer kv,
+            k = ori_k
+            q, _ = self.q_proj(hidden_states)
+            k_tmp = torch.empty_like(k)  # Todo: reduant rotary embedding
+            q, _ = self.rotary_emb(positions, q, k_tmp)
+            if self.use_qk_norm:
+                q = self.query_layernorm(
+                    q.view(-1, self.num_heads, self.head_dim).contiguous()
+                )
+                k = self.key_layernorm(
+                    k.view(-1, self.num_kv_heads, self.head_dim).contiguous()
+                )
+        else:
+            raise RuntimeError("Not support attnention type")
+
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output, (ori_k, v)
+
+
+class HunYuanDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        layer_id: int = -1,
+    ) -> None:
+        super().__init__()
+        assert layer_id >= 0
+        self.layer_id = layer_id
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = (
+            config.intermediate_size
+            if isinstance(config.intermediate_size, int)
+            else config.intermediate_size[layer_id]
+        )
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        if rope_scaling is not None and getattr(
+            config, "original_max_position_embeddings", None
+        ):
+            rope_scaling["original_max_position_embeddings"] = (
+                config.original_max_position_embeddings
+            )
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        # Support abacusai/Smaug-72B-v0.1 with attention_bias
+        # Support internlm/internlm-7b with bias
+        attention_bias = getattr(config, "attention_bias", False) or getattr(
+            config, "bias", False
+        )
+        cla_factor = _get_cla_factor(config)
+        attention_type = (
+            "cross" if layer_id >= 0 and layer_id % cla_factor != 0 else "self"
+        )
+        self.self_attn = HunYuanAttention(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=getattr(
+                config, "num_key_value_heads", config.num_attention_heads
+            ),
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            bias=attention_bias,
+            prefix=f"{prefix}.self_attn",
+            attention_type=attention_type,
+            layer_id=layer_id,
+        )
+        if _is_moe(config):
+            self.mlp = HunYuanSparseMoeBlock(
+                config=config,
+                quant_config=quant_config,
+                layer_id=layer_id,
+            )
+        else:
+            self.mlp = HunYuanMLP(
+                hidden_size=self.hidden_size,
+                intermediate_size=self.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                bias=getattr(config, "mlp_bias", False),
+                prefix=f"{prefix}.mlp",
+            )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+        kv_states: Optional[Tuple[torch.Tensor]] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states, ori_kv_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+            kv_states=kv_states,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual, ori_kv_states
+
+
+class HunYuanModel(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.org_vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            self.vocab_size,
+            config.hidden_size,
+        )
+
+        self.layers = nn.ModuleList(
+            [
+                HunYuanDecoderLayer(
+                    config=config,
+                    layer_id=layer_id,
+                    quant_config=quant_config,
+                    # prefix=prefix
+                )
+                for layer_id in range(config.num_hidden_layers)
+            ]
+        )
+
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor],
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if input_embeds is not None:
+            hidden_states = input_embeds
+        else:
+            hidden_states = self.get_input_embeddings(input_ids)
+        residual = None
+
+        prev_kv_states = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual, kv_states = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+                prev_kv_states,
+            )
+
+            if False:  # (i - self.start_layer) % cla_factor == 0:
+                prev_kv_states = kv_states
+            else:
+                prev_kv_states = None
+
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class HunYuanMoEV1ForCausalLM(nn.Module):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    embedding_modules = {
+        "embed_tokens": "input_embeddings",
+        "lm_head": "output_embeddings",
+    }
+    embedding_padding_modules = ["lm_head"]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+
+        self.model = HunYuanModel(config, quant_config, prefix="model")
+        self.unpadded_vocab_size = config.vocab_size
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+        )
+        if config.tie_word_embeddings:
+            self.lm_head.weight = self.model.embed_tokens.weight
+
+        self.hidden_size = config.hidden_size
+        self.head_dim = get_head_dim(config)
+
+        check_head_dim(config)
+
+        logit_scale = getattr(config, "logit_scale", 1.0)
+        self.logits_processor = LogitsProcessor(config, logit_scale=logit_scale)
+        self.sampler = create_sampler()
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def _split_qkv_weight(self, qkv: torch.Tensor):
+        num_attention_heads = self.config.num_attention_heads
+        num_kv_heads = getattr(
+            self.config, "num_key_value_heads", self.config.num_attention_heads
+        )
+        num_key_value_groups = num_attention_heads // num_kv_heads
+
+        qkv = qkv.reshape(
+            num_kv_heads, num_key_value_groups + 2, self.head_dim, self.hidden_size
+        )
+        q, k, v = torch.split(qkv, (num_key_value_groups, 1, 1), dim=1)
+        q = q.reshape(-1, self.hidden_size)
+        k = k.reshape(-1, self.hidden_size)
+        v = v.reshape(-1, self.hidden_size)
+        return torch.concat((q, k, v))
+        # return qkv.reshape((num_kv_heads, num_key_value_groups+2 , attention_head_dim, hidden_size)).permute((1,0,2,3)).reshape((-1, hidden_size)),
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        cla_factor = _get_cla_factor(self.config)
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+
+        num_attention_heads = self.config.num_attention_heads
+        num_kv_heads = getattr(
+            self.config, "num_key_value_heads", self.config.num_attention_heads
+        )
+        split_params_mapping = [
+            (".gate_up_proj", ".gate_and_up_proj", 2, [(1, 1), (0, 1)], None),
+            (
+                ".qkv_proj",
+                ".qkv_proj",
+                num_attention_heads + num_kv_heads * 2,
+                [("q", num_attention_heads), ("k", num_kv_heads), ("v", num_kv_heads)],
+                self._split_qkv_weight,
+            ),
+        ]
+
+        if _is_moe(self.config):
+            # Params for weights, fp8 weight scales, fp8 activation scales
+            # (param_name, weight_name, expert_id, shard_id)
+            expert_params_mapping = FusedMoE.make_expert_params_mapping(
+                ckpt_gate_proj_name="gate_proj",
+                ckpt_down_proj_name="down_proj",
+                ckpt_up_proj_name="up_proj",
+                num_experts=self.config.num_experts,
+            )
+        else:
+            expert_params_mapping = {}
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if "gate_proj_bias" in name:
+                name = name.replace("gate_proj_bias", "gate_proj.bias")
+            if "up_proj_bias" in name:
+                name = name.replace("up_proj_bias", "up_proj.bias")
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            # With tie_word_embeddings, we can skip lm_head.weight
+            # The weight might appear unnecessarily in the files if the model is
+            # processed with quantization, LoRA, fine-tuning, etc.
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+
+            is_found = False
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                if "mlp.experts" in name:
+                    continue
+                # cross layer only have q_proj, skip qkv pack
+                if weight_name == ".q_proj":
+                    match = re.search(r"layers\.\d+", name)
+                    if match:
+                        layer_id = int(match.group(0).split(".")[-1])
+                        if cla_factor > 1 and layer_id % cla_factor != 0:
+                            continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+
+                is_found = True
+                break
+            if is_found:
+                continue
+
+            for param_name, weight_name, den, split_param, func in split_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                assert loaded_weight.shape[0] % den == 0
+                units = loaded_weight.shape[0] // den
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                offset = 0
+                for shard_id, num in split_param:
+                    new_offset = offset + num * units
+                    if func:
+                        weight_loader(
+                            param, func(loaded_weight)[offset:new_offset], shard_id
+                        )
+                    else:
+                        weight_loader(param, loaded_weight[offset:new_offset], shard_id)
+                    offset = new_offset
+
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    # Skip layers on other devices.
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    # Remapping the name of FP8 kv-scale.
+                    name = maybe_remap_kv_scale_name(name, params_dict)
+                    if name is None:
+                        continue
+
+                    if "mlp.gate.wg." in name:
+                        name = name.replace("wg.", "")
+
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+
+    # If this function is called, it should always initialize KV cache scale
+    # factors (or else raise an exception). Thus, handled exceptions should
+    # make sure to leave KV cache scale factors in a known good (dummy) state
+    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+        tp_size = get_tensor_model_parallel_world_size()
+        tp_rank = get_tensor_model_parallel_rank()
+        for layer_idx, scaling_factor in kv_cache_scales_loader(
+            quantization_param_path,
+            tp_rank,
+            tp_size,
+            self.config.num_hidden_layers,
+            self.config.__class__.model_type,
+        ):
+            if not isinstance(self.model.layers[layer_idx], nn.Identity):
+                layer_self_attn = self.model.layers[layer_idx].self_attn
+
+            if is_hip():
+                # The scaling factor convention we are assuming is
+                # quantized_value * scaling_factor ~= true_value
+                # which is consistent with the practice of setting
+                # scaling_factor = tensor_amax / FPtype_max
+                scaling_factor *= 2
+            if hasattr(layer_self_attn, "kv_scale"):
+                layer_self_attn.attn._kv_scale = scaling_factor
+            else:
+                raise RuntimeError(
+                    "Self attention has no KV cache scaling " "factor attribute!"
+                )
+
+
+class HunYuanDenseV1ForCausalLM(HunYuanMoEV1ForCausalLM):
+    pass
+
+
+EntryClass = [HunYuanMoEV1ForCausalLM, HunYuanDenseV1ForCausalLM]
diff --git a/sglang/python/sglang/srt/models/idefics2.py b/sglang/python/sglang/srt/models/idefics2.py
new file mode 100644
index 0000000000000000000000000000000000000000..c16c86d1073a46271ddd83515427991a2c443168
--- /dev/null
+++ b/sglang/python/sglang/srt/models/idefics2.py
@@ -0,0 +1,344 @@
+# Copyright 2023 The SGLang team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.layers.activation import get_act_fn
+from sglang.srt.layers.attention.vision import VisionAttention
+from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.utils import add_prefix, is_npu
+
+
+class Idefics2VisionMLP(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.activation_fn = get_act_fn(config.hidden_act)
+        self.fc1 = ColumnParallelLinear(
+            config.hidden_size,
+            config.intermediate_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("fc1", prefix),
+        )
+        self.fc2 = RowParallelLinear(
+            config.intermediate_size,
+            config.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("fc2", prefix),
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states, _ = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states, _ = self.fc2(hidden_states)
+        return hidden_states
+
+
+class Idefics2EncoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.self_attn = VisionAttention(
+            embed_dim=config.hidden_size,
+            num_heads=self.num_heads,
+            projection_size=config.intermediate_size,
+            use_qkv_parallel=True,
+            quant_config=quant_config,
+            dropout=config.attention_dropout,
+            softmax_in_single_precision=True,
+            flatten_batch=False,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.mlp = Idefics2VisionMLP(
+            config,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`):
+                Input to the layer of shape `(batch, seq_len, embed_dim)`.
+
+        """
+        residual = hidden_states
+        hidden_states = self.layer_norm1(hidden_states)
+        hidden_states = self.self_attn(hidden_states, cu_seqlens=cu_seqlens)
+
+        hidden_states = residual + hidden_states
+        residual = hidden_states
+        hidden_states = self.layer_norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+class Idefics2Encoder(nn.Module):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self attention
+    layers. Each layer is a
+    [`Idefics2EncoderLayer`].
+
+    Args:
+        config: Idefics2Config
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        self.layers = nn.ModuleList(
+            [
+                Idefics2EncoderLayer(
+                    config,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{i}", prefix),
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+
+    def forward(
+        self,
+        inputs_embeds: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+    ) -> torch.Tensor:
+        r"""
+        Args:
+            inputs_embeds (torch.Tensor):
+                Optionally, instead of passing `input_ids` you can choose to
+                directly pass an embedded representation.
+                This is useful if you want more control over how to convert
+                `input_ids` indices into associated vectorsthan the model's
+                internal embedding lookup matrix.
+        """
+        # cu_seqlens must be on cpu because of npu_flash_attention_unpad operator restriction
+        if is_npu():
+            cu_seqlens = cu_seqlens.to("cpu")
+        hidden_states = inputs_embeds
+        for encoder_layer in self.layers:
+            layer_outputs = encoder_layer(
+                hidden_states,
+                cu_seqlens=cu_seqlens,
+            )
+            hidden_states = layer_outputs
+        return hidden_states
+
+
+class Idefics2VisionEmbeddings(nn.Module):
+    """
+    This is a modified version of `siglip.modelign_siglip.SiglipVisionEmbeddings
+    ` to enable images of variable
+    resolution.
+
+    The modifications are adapted from [Patch n' Pack: NaViT, a Vision
+    Transformer for any Aspect Ratio and Resolution](https://arxiv.org/abs/2307.06304)
+    which allows treating images in their native aspect ratio and without the
+    need to resize them to the same fixed size. In particular, we start from the
+    original pre-trained SigLIP model(which uses images of fixed-size square
+    images) and adapt it by training on images of variable resolutions.
+    """
+
+    def __init__(self, config: PretrainedConfig):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+        self.patch_embedding = nn.Conv2d(
+            in_channels=config.num_channels,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+            padding="valid",
+        )
+        self.num_patches_per_side = self.image_size // self.patch_size
+        self.num_patches = self.num_patches_per_side**2
+        self.num_positions = self.num_patches
+        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+
+    def get_position_ids(
+        self,
+        pixel_values: torch.FloatTensor,
+        patch_attention_mask: torch.BoolTensor,
+        tgt_sizes: Optional[torch.IntTensor] = None,
+    ):
+        batch_size, _, max_im_h, max_im_w = pixel_values.shape
+
+        max_nb_patches_h, max_nb_patches_w = (
+            max_im_h // self.patch_size,
+            max_im_w // self.patch_size,
+        )
+        boundaries = torch.arange(
+            1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side
+        )
+        position_ids = torch.full(
+            size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0
+        )
+
+        for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
+
+            if tgt_sizes is not None:
+                nb_patches_h = tgt_sizes[batch_idx][0]
+                nb_patches_w = tgt_sizes[batch_idx][1]
+            else:
+                nb_patches_h = p_attn_mask[:, 0].sum()
+                nb_patches_w = p_attn_mask[0].sum()
+            fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h)
+            fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w)
+            bucket_coords_h = torch.bucketize(
+                fractional_coords_h, boundaries, right=True
+            )
+            bucket_coords_w = torch.bucketize(
+                fractional_coords_w, boundaries, right=True
+            )
+            pos_ids = (
+                bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w
+            ).flatten()
+            position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids
+        position_ids = position_ids.to(self.position_embedding.weight.device)
+        return position_ids
+
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        patch_attention_mask: torch.BoolTensor,
+        tgt_sizes: Optional[torch.IntTensor] = None,
+    ) -> torch.Tensor:
+        target_dtype = self.patch_embedding.weight.dtype
+        pixel_values = pixel_values.to(
+            device=self.patch_embedding.weight.device, dtype=target_dtype
+        )
+        patch_embeds = self.patch_embedding(pixel_values)
+        embeddings = patch_embeds.flatten(2).transpose(1, 2)
+        position_ids = self.get_position_ids(
+            pixel_values, patch_attention_mask, tgt_sizes
+        )
+
+        embeddings = embeddings + self.position_embedding(position_ids)
+        return embeddings
+
+
+class Idefics2VisionTransformer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        require_post_norm: bool = True,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        embed_dim = config.hidden_size
+        self.config = config
+        self.embeddings = Idefics2VisionEmbeddings(config)
+        self.encoder = Idefics2Encoder(
+            config=config,
+            quant_config=quant_config,
+            prefix=add_prefix("encoder", prefix),
+        )
+        self.post_layernorm = (
+            nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+            if require_post_norm
+            else nn.Identity()
+        )
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.embeddings
+
+    def compute_cu_seqlens(
+        self,
+        tgt_sizes: Optional[torch.Tensor] = None,
+        input_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        # shape: (batch_size,)
+        if tgt_sizes is not None:
+            seqlen = tgt_sizes[:, 0] * tgt_sizes[:, 1]
+        elif input_embeds is not None:
+            seqlen = torch.full(
+                size=(input_embeds.shape[0],),
+                fill_value=input_embeds.shape[1],
+                dtype=torch.int32,
+                device=input_embeds.device,
+            )
+        else:
+            raise ValueError(
+                "Either `tgt_sizes` or `input_embeds` must be provided to compute cu_seqlens."
+            )
+
+        cu_seqlens = torch.cat(
+            [
+                torch.tensor([0], device=seqlen.device, dtype=torch.int32),
+                torch.cumsum(seqlen, dim=0, dtype=torch.int32),
+            ],
+            dim=0,
+        ).to(seqlen.device)
+        return cu_seqlens
+
+    def forward(
+        self,
+        pixel_values,
+        patch_attention_mask: Optional[torch.BoolTensor] = None,
+        tgt_sizes: Optional[torch.IntTensor] = None,
+    ) -> torch.Tensor:
+        hidden_states = self.embeddings(
+            pixel_values=pixel_values,
+            patch_attention_mask=patch_attention_mask,
+            tgt_sizes=tgt_sizes,
+        )
+        cu_seqlens = self.compute_cu_seqlens(tgt_sizes, hidden_states)
+        encoder_outputs = self.encoder(
+            hidden_states,
+            cu_seqlens=cu_seqlens,
+        )
+        last_hidden_state = self.post_layernorm(encoder_outputs)
+        return last_hidden_state
diff --git a/sglang/python/sglang/srt/models/internlm2.py b/sglang/python/sglang/srt/models/internlm2.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e0956ff090fc064a683f688b99b1c8d7805ae08
--- /dev/null
+++ b/sglang/python/sglang/srt/models/internlm2.py
@@ -0,0 +1,357 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from https://raw.githubusercontent.com/vllm-project/vllm/7f62077af5159c625fe3ad1c812e6c1a2b93ba3b/vllm/model_executor/models/internlm2.py
+
+from typing import Any, Dict, Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix
+
+
+class InternLM2MLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.w2 = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("w2", prefix),
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.w2(x)
+        return x
+
+
+class InternLM2Attention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.wqkv = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("wqkv", prefix),
+        )
+        self.wo = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("wo", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            self.num_kv_heads,
+            layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.wqkv(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.wo(attn_output)
+        return output
+
+
+class InternLMDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        self.attention = InternLM2Attention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attention", prefix),
+        )
+        self.feed_forward = InternLM2MLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=add_prefix("feed_forward", prefix),
+        )
+        self.attention_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.ffn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.attention_norm(hidden_states)
+        else:
+            hidden_states, residual = self.attention_norm(hidden_states, residual)
+        hidden_states = self.attention(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.ffn_norm(hidden_states, residual)
+        hidden_states = self.feed_forward(hidden_states)
+        return hidden_states, residual
+
+
+class InternLM2Model(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.tok_embeddings = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            prefix=add_prefix("tok_embeddings", prefix),
+        )
+        self.layers = nn.ModuleList(
+            [
+                InternLMDecoderLayer(
+                    config, i, quant_config, prefix=add_prefix(f"layers.{i}", prefix)
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.tok_embeddings(input_ids)
+        else:
+            hidden_states = input_embeds
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class InternLM2ForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = InternLM2Model(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.output = ParallelLMHead(
+            config.vocab_size, config.hidden_size, prefix=add_prefix("output", prefix)
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.tok_embeddings
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        return self.logits_processor(
+            input_ids, hidden_states, self.output, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("gate_up_proj", "w1", 0),
+            ("gate_up_proj", "w3", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                if "wqkv" in name:
+                    config = self.config
+                    kv_groups = config.num_attention_heads // config.num_key_value_heads
+                    head_dim = config.hidden_size // config.num_attention_heads
+                    loaded_weight = loaded_weight.view(
+                        -1, 2 + kv_groups, head_dim, loaded_weight.shape[-1]
+                    )
+                    wq, wk, wv = torch.split(loaded_weight, [kv_groups, 1, 1], dim=1)
+                    wq = wq.reshape(-1, wq.shape[-1])
+                    wk = wk.reshape(-1, wk.shape[-1])
+                    wv = wv.reshape(-1, wv.shape[-1])
+                    weight_loader = param.weight_loader
+                    weight_loader(param, wq, "q")
+                    weight_loader(param, wk, "k")
+                    weight_loader(param, wv, "v")
+                else:
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+
+
+EntryClass = InternLM2ForCausalLM
diff --git a/sglang/python/sglang/srt/models/internlm2_reward.py b/sglang/python/sglang/srt/models/internlm2_reward.py
new file mode 100644
index 0000000000000000000000000000000000000000..68be8d001e71ff6b887f708b4f8fd8a1c2bc570a
--- /dev/null
+++ b/sglang/python/sglang/srt/models/internlm2_reward.py
@@ -0,0 +1,64 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.internlm2 import InternLM2ForCausalLM, InternLM2Model
+from sglang.srt.utils import add_prefix
+
+
+class InternLM2ForRewardModel(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.vocab_size = config.vocab_size
+        self.model = InternLM2Model(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.v_head = nn.Linear(config.hidden_size, 1, bias=False)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=False)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = True,
+    ) -> EmbeddingPoolerOutput:
+        assert get_embedding, "InternLM2ForRewardModel is only used for embedding"
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        last_token_hidden = self.pooler(hidden_states, forward_batch).embeddings
+        scores = self.v_head(last_token_hidden)
+        return EmbeddingPoolerOutput(scores)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        return InternLM2ForCausalLM.load_weights(self, weights)
+
+
+EntryClass = InternLM2ForRewardModel
diff --git a/sglang/python/sglang/srt/models/interns1.py b/sglang/python/sglang/srt/models/interns1.py
new file mode 100644
index 0000000000000000000000000000000000000000..e30906245075f6ac5fc8819f2206f187af451e52
--- /dev/null
+++ b/sglang/python/sglang/srt/models/interns1.py
@@ -0,0 +1,274 @@
+from typing import Iterable, List, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.layers.attention import vision_utils
+from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternTokenPairs,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.internvl import InternVisionModel
+from sglang.srt.models.qwen2 import Qwen2ForCausalLM
+from sglang.srt.models.qwen3 import Qwen3ForCausalLM
+from sglang.srt.models.qwen3_moe import Qwen3MoeForCausalLM
+from sglang.utils import logger
+
+
+class InternS1ForConditionalGeneration(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        use_flash_attn=True,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        vision_utils.update_vit_attn_dummy_heads_config(self.config)
+        image_size = (
+            getattr(config, "force_image_size", None) or config.vision_config.image_size
+        )
+        patch_size = config.vision_config.patch_size
+        if isinstance(image_size, list):
+            image_size = image_size[0]
+        if isinstance(patch_size, list):
+            patch_size = patch_size[0]
+        self.patch_size = patch_size
+        self.select_layer = config.vision_feature_layer
+        self.num_image_token = int(
+            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
+        )
+        self.downsample_ratio = config.downsample_ratio
+
+        config.vision_config.use_flash_attn = True if use_flash_attn else False
+        config.text_config._attn_implementation = (
+            "flash_attention_2" if use_flash_attn else "eager"
+        )
+
+        logger.info(f"num_image_token: {self.num_image_token}")
+
+        self.vision_model = InternVisionModel(config.vision_config)
+        if config.text_config.architectures[0] == "Qwen2ForCausalLM":
+            self.language_model = Qwen2ForCausalLM(
+                config=config.text_config, quant_config=quant_config
+            )
+        elif config.text_config.architectures[0] == "Qwen3MoeForCausalLM":
+            self.language_model = Qwen3MoeForCausalLM(
+                config=config.text_config, quant_config=quant_config
+            )
+        elif config.text_config.architectures[0] == "Qwen3ForCausalLM":
+            self.language_model = Qwen3ForCausalLM(
+                config=config.text_config, quant_config=quant_config
+            )
+        else:
+            raise NotImplementedError(
+                f"{config.text_config.architectures[0]} is not implemented."
+            )
+
+        vit_hidden_size = config.vision_config.hidden_size
+        llm_hidden_size = config.text_config.hidden_size
+
+        self.mlp1 = nn.Sequential(
+            nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio) ** 2),
+            nn.Linear(
+                vit_hidden_size * int(1 / self.downsample_ratio) ** 2, llm_hidden_size
+            ),
+            nn.GELU(),
+            nn.Linear(llm_hidden_size, llm_hidden_size),
+        )
+
+    def pixel_shuffle(self, x, scale_factor=0.5):
+        n, w, h, c = x.size()
+        # N, W, H, C --> N, W, H * scale, C // scale
+        x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
+        # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
+        x = x.permute(0, 2, 1, 3).contiguous()
+        # N, H * scale, W, C // scale --> N, H * scale, W * scale, C // (scale ** 2)
+        x = x.view(
+            n,
+            int(h * scale_factor),
+            int(w * scale_factor),
+            int(c / (scale_factor * scale_factor)),
+        )
+        x = x.permute(0, 2, 1, 3).contiguous()
+        return x
+
+    def extract_feature(self, pixel_values):
+        if self.select_layer == -1:
+            vit_embeds = self.vision_model(
+                pixel_values=pixel_values, output_hidden_states=False, return_dict=True
+            ).last_hidden_state
+        else:
+            vit_embeds = self.vision_model(
+                pixel_values=pixel_values, output_hidden_states=True, return_dict=True
+            ).hidden_states[self.select_layer]
+        vit_embeds = vit_embeds[:, 1:, :]
+
+        h = w = int(vit_embeds.shape[1] ** 0.5)
+        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
+        vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
+        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
+        vit_embeds = self.mlp1(vit_embeds)
+        return vit_embeds
+
+    def get_image_feature(self, items: List[MultimodalDataItem]):
+        """
+        Projects the last hidden state from the vision model into language model space.
+
+        Returns:
+            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
+        """
+        pixel_values = torch.cat([item.feature for item in items])
+        image_features = self.extract_feature(pixel_values)
+        return image_features
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+
+        hs = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.language_model,
+            data_embedding_funcs={
+                Modality.IMAGE: self.get_image_feature,
+            },
+            positions=positions,
+        )
+
+        return hs
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        # Get all special token IDs
+        im_start_id: int = mm_inputs.im_start_id
+        im_end_id: int = mm_inputs.im_end_id
+
+        media_token_pairs = [(im_start_id, im_end_id)]
+        helper = MultiModalityDataPaddingPatternTokenPairs(media_token_pairs)
+
+        return helper.pad_input_tokens(input_ids, mm_inputs)
+
+    def _mapping_interns1_name(self, name):
+        names_map = {
+            "lm_head.weight": "language_model.lm_head.weight",
+            "model.multi_modal_projector.layer_norm.bias": "mlp1.0.bias",
+            "model.multi_modal_projector.layer_norm.weight": "mlp1.0.weight",
+            "model.multi_modal_projector.linear_1.bias": "mlp1.1.bias",
+            "model.multi_modal_projector.linear_1.weight": "mlp1.1.weight",
+            "model.multi_modal_projector.linear_2.bias": "mlp1.3.bias",
+            "model.multi_modal_projector.linear_2.weight": "mlp1.3.weight",
+            "model.vision_tower.embeddings.cls_token": "vision_model.embeddings.class_embedding",
+            "model.vision_tower.embeddings.patch_embeddings.projection.bias": "vision_model.embeddings.patch_embedding.bias",
+            "model.vision_tower.embeddings.patch_embeddings.projection.weight": "vision_model.embeddings.patch_embedding.weight",
+            "model.vision_tower.embeddings.position_embeddings": "vision_model.embeddings.position_embedding",
+        }
+        if name in names_map:
+            name = names_map[name]
+        elif name.startswith("model.language_model."):
+            name = "language_model.model." + name[len("model.language_model.") :]
+        elif name.startswith("model.vision_tower."):
+            name = "vision_model." + name[len("model.vision_tower.") :]
+
+        if name.startswith("vision_model.encoder.layer"):
+
+            name = name.replace(r".layer.", r".layers.")
+            name = name.replace(r".attention.", r".attn.attn.")
+            name = name.replace(r".projection_layer.", r".proj.")
+            name = name.replace(r".lambda_1", r".ls1")
+            name = name.replace(r".lambda_2", r".ls2")
+            name = name.replace(r".layernorm_before.", r".norm1.")
+            name = name.replace(r".layernorm_after.", r".norm2.")
+        return name
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        expert_params_mapping = []
+        if "Qwen3MoeForCausalLM" in self.config.text_config.architectures:
+            expert_params_mapping = FusedMoE.make_expert_params_mapping(
+                ckpt_gate_proj_name="gate_proj",
+                ckpt_down_proj_name="down_proj",
+                ckpt_up_proj_name="up_proj",
+                num_experts=self.config.num_experts,
+            )
+
+        params_dict = dict(self.named_parameters())
+
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            name = self._mapping_interns1_name(name)
+            if "vision_model" in name:
+                loaded_weight = vision_utils.pad_vit_attn_dummy_heads(
+                    self.config, name, loaded_weight
+                )
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if "mlp.experts" in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+
+
+EntryClass = InternS1ForConditionalGeneration
diff --git a/sglang/python/sglang/srt/models/interns1pro.py b/sglang/python/sglang/srt/models/interns1pro.py
new file mode 100644
index 0000000000000000000000000000000000000000..b22ff20ec5339f0e021fe22af351495107d7a520
--- /dev/null
+++ b/sglang/python/sglang/srt/models/interns1pro.py
@@ -0,0 +1,252 @@
+import functools
+import logging
+from typing import Any, Dict, Iterable, Optional, Tuple
+
+import torch
+from transformers import PretrainedConfig
+
+from sglang.srt.layers.dp_attention import get_attention_tp_rank, get_attention_tp_size
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.qwen3_moe import Qwen3MoeAttention, Qwen3MoeDecoderLayer
+from sglang.srt.models.qwen3_vl_moe import (
+    Qwen3MoeLLMModel,
+    Qwen3VLMoeForConditionalGeneration,
+)
+from sglang.srt.utils import add_prefix
+
+logger = logging.getLogger(__name__)
+
+
+class InternS1ProTextAttention(Qwen3MoeAttention):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 1000000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 32768,
+        **kwargs,
+    ) -> None:
+        super().__init__(
+            hidden_size,
+            num_heads,
+            num_kv_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            **kwargs,
+        )
+        # for fope
+        fope_keys = {"fope_init_factor", "fope_sep_head", "num_inv_freq"}
+        use_fope = any(rope_scaling.get(key) is not None for key in fope_keys)
+        if use_fope:
+            rope_scaling["use_fope"] = True
+            rope_scaling["num_kv_heads"] = self.num_kv_heads
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
+        self.compatible_with_fused_kv_buffer = False
+        self.use_fused_qk_norm_rope = False
+        self._used_fused_qk_norm_rope_last_call = False
+
+    def forward_prepare_npu(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ):
+        raise NotImplementedError()
+
+
+class InternS1ProTextDecoderLayer(Qwen3MoeDecoderLayer):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__(
+            config,
+            layer_id,
+            quant_config=quant_config,
+            prefix=prefix,
+            alt_stream=alt_stream,
+        )
+
+        rope_theta = getattr(config, "rope_theta", 1000000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 32768)
+        head_dim = getattr(
+            config, "head_dim", config.hidden_size // config.num_attention_heads
+        )
+        rms_norm_eps = config.rms_norm_eps
+        attention_bias = config.attention_bias
+
+        self.self_attn = InternS1ProTextAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            head_dim=head_dim,
+            rms_norm_eps=rms_norm_eps,
+            attention_bias=attention_bias,
+            config=config,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+            alt_stream=alt_stream,
+        )
+        # update with group router
+        self.router_n_groups = getattr(config, "router_n_groups", -1)
+        if self.router_n_groups > 0:
+            assert (
+                config.num_experts_per_tok % self.router_n_groups == 0
+            ), f"{config.num_experts_per_tok} cannot be divided by {self.router_n_groups}"
+            self.mlp.topk = TopK(
+                top_k=config.num_experts_per_tok,
+                renormalize=config.norm_topk_prob,
+                use_grouped_topk=False,
+                layer_id=layer_id,
+                custom_routing_function=self._custom_routing_function,
+            )
+
+    @staticmethod
+    @functools.lru_cache
+    def get_group_offsets(router_n_groups: int, group_size: int, device: str):
+        group_offsets = (
+            torch.arange(router_n_groups, device=device) * group_size
+        ).view(
+            1, -1, 1
+        )  # [1, n_groups, 1]
+        return group_offsets
+
+    def _custom_routing_function(
+        self,
+        hidden_states: torch.Tensor,
+        gating_output: torch.Tensor,
+        topk: int,
+        renormalize: bool,
+    ) -> torch.Tensor:
+        """Group router"""
+        routing_weights = torch.softmax(gating_output, dim=-1, dtype=torch.float32)
+        if self.router_n_groups > 0:
+            assert (
+                routing_weights.shape[-1] % self.router_n_groups == 0
+            ), f"{routing_weights.shape[-1]} cannot be divided by {self.router_n_groups}"
+            per_group_top_k = topk // self.router_n_groups
+            group_size = routing_weights.shape[-1] // self.router_n_groups
+            group_offsets = self.get_group_offsets(
+                self.router_n_groups, group_size, routing_weights.device
+            )
+            routing_weights = routing_weights.unflatten(
+                -1, (self.router_n_groups, group_size)
+            )
+            topk_weights, topk_ids = torch.topk(
+                routing_weights, per_group_top_k, dim=-1
+            )
+            topk_ids = (topk_ids + group_offsets).flatten(-2, -1)
+            topk_weights = topk_weights.flatten(-2, -1)
+        else:
+            topk_weights, topk_ids = torch.topk(routing_weights, topk, dim=-1)
+
+        if renormalize:
+            topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
+
+        return topk_weights, topk_ids
+
+
+class InternS1ProTextModel(Qwen3MoeLLMModel):
+    def __init__(
+        self,
+        *,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        decoder_layer_type=InternS1ProTextDecoderLayer,
+        prefix: str = "",
+    ):
+        super().__init__(
+            config=config,
+            quant_config=quant_config,
+            prefix=prefix,
+            decoder_layer_type=decoder_layer_type,
+        )
+
+
+class InternS1ProForConditionalGeneration(Qwen3VLMoeForConditionalGeneration):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        language_model_cls=InternS1ProTextModel,
+    ) -> None:
+        # deal with no deepstack
+        if not hasattr(config.vision_config, "deepstack_visual_indexes"):
+            config.vision_config.deepstack_visual_indexes = []
+
+        super().__init__(
+            config,
+            quant_config=quant_config,
+            prefix=prefix,
+            language_model_cls=language_model_cls,
+        )
+
+        # disable deepstack
+        if len(config.vision_config.deepstack_visual_indexes) == 0:
+            self.use_deepstack = {}
+
+    def _load_fope_weights(self, name: str, loaded_weight: torch.Tensor, params_dict):
+        """load fope weights"""
+        attn_tp_size = get_attention_tp_size()
+        attn_tp_rank = get_attention_tp_rank()
+
+        num_key_value_heads = loaded_weight.size(0)
+        # replicate head if necessary
+        if num_key_value_heads < attn_tp_size:
+            n_replicate = attn_tp_size // num_key_value_heads
+            attn_tp_size = num_key_value_heads
+            attn_tp_rank = attn_tp_rank // n_replicate
+        loaded_weight = loaded_weight.chunk(attn_tp_size, dim=0)[attn_tp_rank]
+
+        # rotary_emb is shared cross layers
+        param_name = name.replace(".rotary_emb.", ".layers.0.self_attn.rotary_emb.")
+        assert param_name in params_dict
+        param = params_dict[param_name]
+        weight_loader = getattr(param, "weight_loader", default_weight_loader)
+        weight_loader(param, loaded_weight)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        """load weights"""
+        # Cache params_dict to avoid repeated expensive traversal of model parameters
+        if not hasattr(self, "_cached_params_dict"):
+            self._cached_params_dict = dict(self.named_parameters())
+        params_dict = self._cached_params_dict
+        other_weights = dict()
+        for name, loaded_weight in weights:
+            if "sin_coef" in name or "cos_coef" in name:
+                name = name.replace(r"model.language_model.", r"model.")
+                self._load_fope_weights(name, loaded_weight, params_dict)
+            else:
+                other_weights[name] = loaded_weight
+
+        super().load_weights(other_weights.items())
+
+
+EntryClass = InternS1ProForConditionalGeneration
diff --git a/sglang/python/sglang/srt/models/internvl.py b/sglang/python/sglang/srt/models/internvl.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5d6a5b70a7826922765f17ca5f723bb8ee9a8fe
--- /dev/null
+++ b/sglang/python/sglang/srt/models/internvl.py
@@ -0,0 +1,790 @@
+from typing import Iterable, List, Optional, Tuple, Union
+
+import torch
+
+# Adapted from https://raw.githubusercontent.com/vllm-project/vllm/7f62077af5159c625fe3ad1c812e6c1a2b93ba3b/vllm/model_executor/models/internlm2.py
+# Adapted from https://raw.githubusercontent.com/hehesangsj/sglang/refs/heads/internvl/python/sglang/srt/models/internvl.py
+import torch.nn.functional as F
+from torch import nn
+from transformers import PretrainedConfig, PreTrainedModel
+from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.environ import envs
+from sglang.srt.layers.activation import get_act_fn
+from sglang.srt.layers.attention import vision_utils
+from sglang.srt.layers.attention.vision import SingletonCache, VisionAttention
+from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
+from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternTokenPairs,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.deepseek_janus_pro import DropPath
+from sglang.srt.models.gpt_oss import GptOssForCausalLM
+from sglang.srt.models.internlm2 import InternLM2ForCausalLM
+from sglang.srt.models.qwen2 import Qwen2ForCausalLM
+from sglang.srt.models.qwen3 import Qwen3ForCausalLM
+from sglang.srt.models.qwen3_moe import Qwen3MoeForCausalLM
+from sglang.srt.multimodal.internvl_vit_cuda_graph_runner import (
+    InternViTCudaGraphRunner,
+)
+from sglang.srt.multimodal.mm_utils import run_dp_sharded_vision_model
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import is_cuda
+from sglang.utils import logger
+
+_is_cuda = is_cuda()
+
+
+class InternAttention(nn.Module):
+    def __init__(
+        self,
+        config,
+        quant_config: QuantizationConfig = None,
+        use_data_parallel: bool = False,
+        aux_stream: Optional[torch.cuda.Stream] = None,
+    ):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        self.scale = self.head_dim**-0.5
+
+        self.attn = VisionAttention(
+            embed_dim=self.embed_dim,
+            num_heads=self.num_heads,
+            projection_size=self.embed_dim,
+            use_qkv_parallel=True,
+            quant_config=quant_config,
+            dropout=getattr(config, "dropout", 0.0),
+            qkv_bias=getattr(config, "qkv_bias", False)
+            or getattr(config, "attention_bias", False),
+            num_dummy_heads=getattr(config, "num_dummy_heads", 0),
+            qk_normalization=getattr(config, "qk_normalization", False)
+            or getattr(config, "use_qk_norm", False),
+            flatten_batch=False,
+            use_data_parallel=use_data_parallel,
+            aux_stream=aux_stream,
+        )
+
+        self.proj_drop = nn.Dropout(config.dropout)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        output_ws: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        out = self.attn(hidden_states, cu_seqlens=cu_seqlens, output_ws=output_ws)
+        outs = self.proj_drop(out)
+        return outs
+
+
+class InternVisionEmbeddings(nn.Module):
+    def __init__(self, config: PretrainedConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = (
+            config.image_size
+            if isinstance(config.image_size, int)
+            else config.image_size[0]
+        )
+        self.patch_size = (
+            config.patch_size
+            if isinstance(config.patch_size, int)
+            else config.patch_size[0]
+        )
+
+        self.class_embedding = nn.Parameter(
+            torch.randn(1, 1, self.embed_dim),
+        )
+
+        self.patch_embedding = nn.Conv2d(
+            in_channels=3,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+        )
+
+        self.num_patches = (self.image_size // self.patch_size) ** 2
+        self.num_positions = self.num_patches + 1
+
+        self.position_embedding = nn.Parameter(
+            torch.randn(1, self.num_positions, self.embed_dim)
+        )
+
+    def _get_pos_embed(self, pos_embed, H, W):
+        target_dtype = pos_embed.dtype
+        pos_embed = (
+            pos_embed.float()
+            .reshape(
+                1,
+                self.image_size // self.patch_size,
+                self.image_size // self.patch_size,
+                -1,
+            )
+            .permute(0, 3, 1, 2)
+        )
+        pos_embed = (
+            F.interpolate(pos_embed, size=(H, W), mode="bicubic", align_corners=False)
+            .reshape(1, -1, H * W)
+            .permute(0, 2, 1)
+            .to(target_dtype)
+        )
+        return pos_embed
+
+    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
+        target_dtype = self.patch_embedding.weight.dtype
+        patch_embeds = self.patch_embedding(
+            pixel_values
+        )  # shape = [*, channel, width, height]
+        batch_size, _, height, width = patch_embeds.shape
+        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
+        class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
+        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
+        position_embedding = torch.cat(
+            [
+                self.position_embedding[:, :1, :],
+                self._get_pos_embed(self.position_embedding[:, 1:, :], height, width),
+            ],
+            dim=1,
+        )
+        embeddings = embeddings + position_embedding.to(target_dtype)
+        return embeddings
+
+
+class InternRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+
+class InternMLP(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        use_data_parallel: bool = False,
+    ):
+        super().__init__()
+        self.tp_size = (
+            1 if use_data_parallel else get_tensor_model_parallel_world_size()
+        )
+        self.tp_rank = 0 if use_data_parallel else get_tensor_model_parallel_rank()
+        self.config = config
+        self.act = get_act_fn(config.hidden_act)
+        self.fc1 = ColumnParallelLinear(
+            config.hidden_size,
+            config.intermediate_size,
+            bias=True,
+            quant_config=None,
+            tp_size=self.tp_size,
+            tp_rank=self.tp_rank,
+        )
+        self.fc2 = RowParallelLinear(
+            config.intermediate_size,
+            config.hidden_size,
+            bias=True,
+            quant_config=None,
+            tp_size=self.tp_size,
+            tp_rank=self.tp_rank,
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states, _ = self.fc1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states, _ = self.fc2(hidden_states)
+        return hidden_states
+
+
+NORM2FN = {
+    "rms_norm": InternRMSNorm,
+    "layer_norm": nn.LayerNorm,
+}
+
+
+class InternVisionEncoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        drop_path_rate: float,
+        quant_config: QuantizationConfig = None,
+        use_data_parallel: bool = False,
+        aux_stream: Optional[torch.cuda.Stream] = None,
+    ):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.norm_type = config.norm_type
+        self.attn = InternAttention(
+            config=config,
+            quant_config=quant_config,
+            use_data_parallel=use_data_parallel,
+            aux_stream=aux_stream,
+        )
+        self.mlp = InternMLP(config, use_data_parallel)
+        self.norm1 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
+        self.norm2 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
+
+        self.ls1 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
+        self.ls2 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
+        self.drop_path1 = (
+            DropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
+        )
+        self.drop_path2 = (
+            DropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        output_ws: Optional[torch.Tensor] = None,
+    ) -> Tuple[
+        torch.FloatTensor,
+        Optional[torch.FloatTensor],
+        Optional[Tuple[torch.FloatTensor]],
+    ]:
+        """
+        Args:
+            hidden_states (`Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]`): input to the layer of shape `(batch, seq_len, embed_dim)`
+        """
+
+        hidden_states = hidden_states + self.drop_path1(
+            self.attn(
+                self.norm1(hidden_states).to(hidden_states.dtype),
+                cu_seqlens=cu_seqlens,
+                output_ws=output_ws,
+            )
+            * self.ls1
+        )
+
+        hidden_states = hidden_states + self.drop_path2(
+            self.mlp(self.norm2(hidden_states).to(hidden_states.dtype)) * self.ls2
+        )
+
+        return hidden_states
+
+
+class InternVisionEncoder(nn.Module):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+    [`InternEncoderLayer`].
+
+    Args:
+        config (`InternConfig`):
+            The corresponding vision configuration for the `InternEncoder`.
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        use_data_parallel: bool = False,
+    ):
+        super().__init__()
+        self.config = config
+        # stochastic depth decay rule
+        dpr = [
+            x.item()
+            for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)
+        ]
+
+        self.enable_cg = _is_cuda and envs.SGLANG_VIT_ENABLE_CUDA_GRAPH.get()
+        aux_stream = (
+            None if self.enable_cg else (torch.cuda.Stream() if _is_cuda else None)
+        )
+        self.layers = nn.ModuleList(
+            [
+                InternVisionEncoderLayer(
+                    config, dpr[idx], quant_config, use_data_parallel, aux_stream
+                )
+                for idx in range(config.num_hidden_layers)
+            ]
+        )
+
+        self.cuda_graph_runner: Optional[InternViTCudaGraphRunner] = None
+        if self.enable_cg:
+            self.cuda_graph_runner = InternViTCudaGraphRunner(self)
+
+    def forward(
+        self,
+        inputs_embeds,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutput]:
+        r"""
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Embedded representation of the inputs. Should be float, not int tokens.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        if self.enable_cg and (not output_hidden_states):
+            # graph path only returns last_hidden_state
+            hidden_states = inputs_embeds.to(device=inputs_embeds.device).contiguous()
+            hidden_states = self.cuda_graph_runner.run(hidden_states)
+            if not return_dict:
+                return (hidden_states,)
+            return BaseModelOutput(last_hidden_state=hidden_states, hidden_states=None)
+
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+
+        encoder_states = () if output_hidden_states else None
+        hidden_states = inputs_embeds
+
+        cu_seqlens = SingletonCache()
+
+        for idx, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            layer_outputs = encoder_layer(hidden_states, cu_seqlens=cu_seqlens)
+            hidden_states = layer_outputs
+
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states
+        )
+
+
+class InternVisionModel(PreTrainedModel):
+    main_input_name = "pixel_values"
+    _supports_flash_attn_2 = True
+    config_class = PretrainedConfig
+    _no_split_modules = ["InternVisionEncoderLayer"]
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        use_data_parallel: bool = False,
+    ):
+        super().__init__(config)
+
+        self.config = config
+        self.use_data_parallel = use_data_parallel
+        self.embeddings = InternVisionEmbeddings(
+            config,
+        )
+        self.encoder = InternVisionEncoder(config, quant_config, use_data_parallel)
+
+    def resize_pos_embeddings(self, old_size, new_size, patch_size):
+        pos_emb = self.embeddings.position_embedding
+        _, num_positions, embed_dim = pos_emb.shape
+        cls_emb = pos_emb[:, :1, :]
+        pos_emb = (
+            pos_emb[:, 1:, :]
+            .reshape(1, old_size // patch_size, old_size // patch_size, -1)
+            .permute(0, 3, 1, 2)
+        )
+        pos_emb = F.interpolate(
+            pos_emb.float(),
+            size=new_size // patch_size,
+            mode="bicubic",
+            align_corners=False,
+        )
+        pos_emb = pos_emb.to(cls_emb.dtype).reshape(1, embed_dim, -1).permute(0, 2, 1)
+        pos_emb = torch.cat([cls_emb, pos_emb], dim=1)
+        self.embeddings.position_embedding = nn.Parameter(pos_emb)
+        self.embeddings.image_size = new_size
+        logger.info(
+            "Resized position embeddings from {} to {}".format(old_size, new_size)
+        )
+
+    def get_input_embeddings(self):
+        return self.embeddings
+
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        pixel_embeds: Optional[torch.FloatTensor] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPooling]:
+        pixel_values = pixel_values.to(device=self.device, dtype=self.dtype)
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+
+        if pixel_values is None and pixel_embeds is None:
+            raise ValueError("You have to specify pixel_values or pixel_embeds")
+
+        if pixel_embeds is not None:
+            hidden_states = pixel_embeds
+        else:
+            if len(pixel_values.shape) == 4:
+                hidden_states = self.embeddings(pixel_values)
+            else:
+                raise ValueError(f"wrong pixel_values size: {pixel_values.shape}")
+
+        if self.use_data_parallel:
+            encoder_outputs = run_dp_sharded_vision_model(hidden_states, self.encoder)
+            last_hidden_state = encoder_outputs
+        else:
+            encoder_outputs = self.encoder(
+                inputs_embeds=hidden_states,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+            last_hidden_state = encoder_outputs.last_hidden_state
+        pooled_output = last_hidden_state[:, 0, :]
+
+        if not return_dict:
+            return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+        if self.use_data_parallel:
+            return BaseModelOutputWithPooling(
+                last_hidden_state=last_hidden_state,
+                pooler_output=pooled_output,
+                hidden_states=None,
+                attentions=None,
+            )
+        else:
+            return BaseModelOutputWithPooling(
+                last_hidden_state=last_hidden_state,
+                pooler_output=pooled_output,
+                hidden_states=encoder_outputs.hidden_states,
+                attentions=encoder_outputs.attentions,
+            )
+
+
+class InternVLChatModel(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        use_flash_attn=True,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.use_data_parallel = get_global_server_args().mm_enable_dp_encoder
+        self.quant_config = quant_config
+        vision_utils.update_vit_attn_dummy_heads_config(self.config)
+        image_size = config.force_image_size or config.vision_config.image_size
+        patch_size = config.vision_config.patch_size
+        self.patch_size = patch_size
+        self.select_layer = config.select_layer
+        self.template = config.template
+        self.num_image_token = int(
+            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
+        )
+        self.downsample_ratio = config.downsample_ratio
+        self.ps_version = config.ps_version
+
+        config.vision_config.use_flash_attn = True if use_flash_attn else False
+        config.llm_config._attn_implementation = (
+            "flash_attention_2" if use_flash_attn else "eager"
+        )
+
+        logger.info(f"num_image_token: {self.num_image_token}")
+        logger.info(f"ps_version: {self.ps_version}")
+
+        self.vision_model = InternVisionModel(
+            config.vision_config,
+            use_data_parallel=self.use_data_parallel,
+        )
+        if config.llm_config.architectures[0] == "Qwen2ForCausalLM":
+            self.language_model = Qwen2ForCausalLM(
+                config=config.llm_config, quant_config=quant_config
+            )
+        elif config.llm_config.architectures[0] == "InternLM2ForCausalLM":
+            self.language_model = InternLM2ForCausalLM(
+                config=config.llm_config, quant_config=quant_config
+            )
+        elif config.llm_config.architectures[0] == "Qwen3MoeForCausalLM":
+            self.language_model = Qwen3MoeForCausalLM(
+                config=config.llm_config, quant_config=quant_config
+            )
+        elif config.llm_config.architectures[0] == "GptOssForCausalLM":
+            self.language_model = GptOssForCausalLM(
+                config=config.llm_config, quant_config=quant_config
+            )
+        elif config.llm_config.architectures[0] == "Qwen3ForCausalLM":
+            self.language_model = Qwen3ForCausalLM(
+                config=config.llm_config, quant_config=quant_config
+            )
+        else:
+            raise NotImplementedError(
+                f"{config.llm_config.architectures[0]} is not implemented."
+            )
+
+        vit_hidden_size = config.vision_config.hidden_size
+        llm_hidden_size = config.llm_config.hidden_size
+
+        self.mlp1 = nn.Sequential(
+            nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio) ** 2),
+            nn.Linear(
+                vit_hidden_size * int(1 / self.downsample_ratio) ** 2, llm_hidden_size
+            ),
+            nn.GELU(),
+            nn.Linear(llm_hidden_size, llm_hidden_size),
+        )
+
+        self.external_mm_data_embedding_funcs = {
+            Modality.IMAGE: self.get_image_feature,
+            Modality.VIDEO: self.get_video_feature,
+        }
+
+        self.model = self.language_model.model
+
+    def pixel_shuffle(self, x, scale_factor=0.5):
+        n, w, h, c = x.size()
+        # N, W, H, C --> N, W, H * scale, C // scale
+        x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
+        # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
+        x = x.permute(0, 2, 1, 3).contiguous()
+        # N, H * scale, W, C // scale --> N, H * scale, W * scale, C // (scale ** 2)
+        x = x.view(
+            n,
+            int(h * scale_factor),
+            int(w * scale_factor),
+            int(c / (scale_factor * scale_factor)),
+        )
+        if self.ps_version == "v1":
+            logger.warn(
+                "In ps_version 'v1', the height and width have not been swapped back, "
+                "which results in a transposed image."
+            )
+        else:
+            x = x.permute(0, 2, 1, 3).contiguous()
+        return x
+
+    def extract_feature(self, pixel_values):
+        if self.select_layer == -1:
+            vit_embeds = self.vision_model(
+                pixel_values=pixel_values, output_hidden_states=False, return_dict=True
+            ).last_hidden_state
+        else:
+            vit_embeds = self.vision_model(
+                pixel_values=pixel_values, output_hidden_states=True, return_dict=True
+            ).hidden_states[self.select_layer]
+        vit_embeds = vit_embeds[:, 1:, :]
+
+        h = w = int(vit_embeds.shape[1] ** 0.5)
+        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
+        vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
+        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
+        vit_embeds = self.mlp1(vit_embeds)
+        return vit_embeds
+
+    def get_image_feature(self, items: List[MultimodalDataItem]):
+        """
+        Projects the last hidden state from the vision model into language model space.
+
+        Returns:
+            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
+        """
+        pixel_values = torch.cat([item.feature for item in items])
+        # If already precomputed embeddings (not raw pixel values), skip vision encoder.
+        # Normal pixel_values are 4D [N, C, H, W]; precomputed embeddings are 2D or 3D.
+        if pixel_values.dim() != 4:
+            return pixel_values
+        image_features = self.extract_feature(pixel_values)
+        return image_features
+
+    def get_video_feature(self, items: List[MultimodalDataItem]):
+        # items: each item corresponds to one video (recommended)
+        # item.feature shape: [num_frames, 3, 448, 448]  (or [num_tiles, 3, 448, 448])
+        pixel_values = torch.cat([item.feature for item in items], dim=0)
+        # If already precomputed embeddings, skip vision encoder.
+        if pixel_values.dim() != 4:
+            return pixel_values
+        video_features = self.extract_feature(pixel_values)
+        return video_features
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.language_model,
+            multimodal_model=self,
+            data_embedding_funcs=self.external_mm_data_embedding_funcs,
+            positions=positions,
+        )
+
+        return hidden_states
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        # Get all special token IDs
+        im_start_id: int = mm_inputs.im_start_id
+        im_end_id: int = mm_inputs.im_end_id
+
+        media_token_pairs = [(im_start_id, im_end_id)]
+        helper = MultiModalityDataPaddingPatternTokenPairs(media_token_pairs)
+
+        return helper.pad_input_tokens(input_ids, mm_inputs)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        expert_params_mapping = []
+        if "InternLM2ForCausalLM" in self.config.llm_config.architectures:
+            stacked_params_mapping = [
+                # (param_name, shard_name, shard_id)
+                ("gate_up_proj", "w1", 0),
+                ("gate_up_proj", "w3", 1),
+            ]
+        elif "Qwen2ForCausalLM" in self.config.llm_config.architectures:
+            stacked_params_mapping = [
+                # (param_name, shard_name, shard_id)
+                ("qkv_proj", "q_proj", "q"),
+                ("qkv_proj", "k_proj", "k"),
+                ("qkv_proj", "v_proj", "v"),
+                ("gate_up_proj", "gate_proj", 0),
+                ("gate_up_proj", "up_proj", 1),
+            ]
+        elif "Qwen3MoeForCausalLM" in self.config.llm_config.architectures:
+            stacked_params_mapping = [
+                # (param_name, shard_name, shard_id)
+                ("qkv_proj", "q_proj", "q"),
+                ("qkv_proj", "k_proj", "k"),
+                ("qkv_proj", "v_proj", "v"),
+                ("gate_up_proj", "gate_proj", 0),
+                ("gate_up_proj", "up_proj", 1),
+            ]
+
+            expert_params_mapping = FusedMoE.make_expert_params_mapping(
+                ckpt_gate_proj_name="gate_proj",
+                ckpt_down_proj_name="down_proj",
+                ckpt_up_proj_name="up_proj",
+                num_experts=self.config.num_experts,
+            )
+        elif "Qwen3ForCausalLM" in self.config.llm_config.architectures:
+            stacked_params_mapping = [
+                # (param_name, shard_name, shard_id)
+                ("qkv_proj", "q_proj", "q"),
+                ("qkv_proj", "k_proj", "k"),
+                ("qkv_proj", "v_proj", "v"),
+                ("gate_up_proj", "gate_proj", 0),
+                ("gate_up_proj", "up_proj", 1),
+            ]
+
+        params_dict = dict(self.named_parameters())
+
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if "mlp.experts" in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if "vision_model" in name:
+                    # adapt to VisionAttention
+                    name = name.replace(r"attn.", r"attn.attn.")
+                    name = name.replace(r"qkv.", r"qkv_proj.")
+
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                    if "wqkv" in name:
+                        config = self.config
+                        kv_groups = (
+                            config.num_attention_heads // config.num_key_value_heads
+                        )
+                        head_dim = config.hidden_size // config.num_attention_heads
+                        loaded_weight = loaded_weight.view(
+                            -1, 2 + kv_groups, head_dim, loaded_weight.shape[-1]
+                        )
+                        wq, wk, wv = torch.split(
+                            loaded_weight, [kv_groups, 1, 1], dim=1
+                        )
+                        wq = wq.reshape(-1, wq.shape[-1])
+                        wk = wk.reshape(-1, wk.shape[-1])
+                        wv = wv.reshape(-1, wv.shape[-1])
+                        weight_loader = param.weight_loader
+                        weight_loader(param, wq, "q")
+                        weight_loader(param, wk, "k")
+                        weight_loader(param, wv, "v")
+                    else:
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        if "vision_model" in name:
+                            loaded_weight = vision_utils.pad_vit_attn_dummy_heads(
+                                self.config, name, loaded_weight
+                            )
+                        weight_loader(param, loaded_weight)
+
+
+EntryClass = InternVLChatModel
diff --git a/sglang/python/sglang/srt/models/iquest_loopcoder.py b/sglang/python/sglang/srt/models/iquest_loopcoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..240aa5306a29e859746a2bfda3fdf613b4bcead9
--- /dev/null
+++ b/sglang/python/sglang/srt/models/iquest_loopcoder.py
@@ -0,0 +1,498 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Inference-only LoopCoder model compatible with HuggingFace weights."""
+
+import logging
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.llama import LlamaMLP as LoopCoderMLP
+from sglang.srt.utils import add_prefix, make_layers
+
+logger = logging.getLogger(__name__)
+
+
+class LoopGateProjection(nn.Module):
+    """Gate projection for mixed attention in Loop 2+.
+
+    Computes: g = sigmoid(linear(Q)) for each head independently.
+    This gate determines how much to use Loop1's KV (global) vs current loop's KV (local).
+
+    Supports tensor parallelism: each GPU handles a subset of heads.
+    The weight matrix has shape [num_heads, head_dim] and is split along the head dimension.
+    """
+
+    def __init__(
+        self,
+        total_num_heads: int,
+        head_dim: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.total_num_heads = total_num_heads
+        self.head_dim = head_dim
+        tp_size = get_tensor_model_parallel_world_size()
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+
+        self.gate_proj = ColumnParallelLinear(
+            head_dim,
+            self.total_num_heads,
+            bias=True,
+            gather_output=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_proj", prefix),
+        )
+
+    def forward(self, query: torch.Tensor) -> torch.Tensor:
+        """Compute gate values from query tensor.
+
+        Args:
+            query: [num_heads, num_tokens, head_dim]
+                where num_heads is the number of heads on this TP rank
+                and num_tokens = batch * seq_len
+
+        Returns:
+            gate: [num_tokens, num_heads * head_dim] (flattened format matching q shape)
+        """
+        num_heads, num_tokens, head_dim = query.shape
+
+        assert (
+            num_heads == self.num_heads
+        ), f"Expected {self.num_heads} heads, got {num_heads}"
+
+        query_flat = query.reshape(-1, head_dim)
+
+        gate_logits_flat, _ = self.gate_proj(query_flat)
+
+        gate_logits = gate_logits_flat.reshape(num_heads, num_tokens, self.num_heads)
+
+        # Extract diagonal: each head h's query should use output column h
+        gate_logits = torch.diagonal(gate_logits, dim1=0, dim2=2)
+        gate_logits = gate_logits.transpose(0, 1)
+        gate_logits = gate_logits.unsqueeze(-1)
+
+        # Apply sigmoid
+        gate = torch.sigmoid(gate_logits)
+
+        # Expand and reshape to match q shape: [num_tokens, num_heads * head_dim]
+        gate = gate.transpose(0, 1)
+        gate = gate.expand(-1, -1, head_dim)
+        gate = gate.reshape(num_tokens, num_heads * head_dim)
+
+        return gate
+
+
+class LoopCoderAttention(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        max_position: int = 4096 * 32,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.layer_id = layer_id
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+
+        # Get loop_num from config, default to 2 if not specified
+        self.loop_num = getattr(config, "loop_num", 2)
+        self.loop_window_size = getattr(config, "loop_window_size", 64)
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(
+            config, "max_position_embeddings", max_position
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
+
+        # Create attention instances for each loop
+        # Loop 0: global attention without sliding window for full context
+        # Loop 1+: local attention with sliding window for recent tokens
+        # Each loop needs a unique layer_id to avoid KV cache conflicts
+        self.attn = nn.ModuleList()
+        total_layers = getattr(config, "num_hidden_layers", 24)
+        for loop_idx in range(self.loop_num):
+            sliding_window = -1 if loop_idx == 0 else self.loop_window_size
+            # Use unique layer_id for each loop: loop_idx * total_layers + layer_id
+            # This ensures each loop has its own KV cache space
+            unique_layer_id = loop_idx * total_layers + layer_id
+
+            self.attn.append(
+                RadixAttention(
+                    self.num_heads,
+                    self.head_dim,
+                    self.scaling,
+                    num_kv_heads=self.num_kv_heads,
+                    layer_id=unique_layer_id,  # Unique layer_id for each loop
+                    sliding_window_size=sliding_window,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"attn.{loop_idx}", prefix),
+                )
+            )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        loop_idx: int,
+        gate_proj: Optional[LoopGateProjection] = None,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+
+        if loop_idx == 0:
+            # First loop: standard global attention, save KV to cache
+            attn_output = self.attn[0](q, k, v, forward_batch)
+        else:
+            # Loop 2+: mixed attention with learned gating
+            # Global attention: read from Loop 0's KV cache without updating (save_kv_cache=False)
+            # This provides full context information
+            # Pass k=None, v=None to read from KV cache instead of recomputing
+            global_attn_output = self.attn[0](
+                q, None, None, forward_batch, save_kv_cache=False
+            )
+
+            # Local attention: use current loop's KV with sliding window
+            # This focuses on recent tokens within the window
+            local_attn_output = self.attn[loop_idx](q, k, v, forward_batch)
+
+            # Compute gating weights using query-dependent projection
+            assert gate_proj is not None, "gate_proj must be provided for loop_idx > 0"
+            num_tokens = q.shape[0]
+            q_reshaped = q.view(num_tokens, self.num_heads, self.head_dim).transpose(
+                0, 1
+            )
+            gate = gate_proj(q_reshaped)
+
+            # Mix global and local attention outputs with learned gate
+            # gate controls the balance between global context and local focus
+            attn_output = global_attn_output * gate + local_attn_output * (1 - gate)
+
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class LoopCoderDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.layer_id = layer_id
+
+        self.self_attn = LoopCoderAttention(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            max_position=getattr(config, "max_position_embeddings", 4096 * 32),
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.mlp = LoopCoderMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        loop_idx: int,
+        gate_proj: Optional[LoopGateProjection] = None,
+    ) -> torch.Tensor:
+        # Self Attention
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+            loop_idx=loop_idx,
+            gate_proj=gate_proj,
+        )
+        hidden_states = hidden_states + residual
+
+        # MLP
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = hidden_states + residual
+
+        return hidden_states
+
+
+class IQuestLoopCoderModel(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+
+        self.loop_num = getattr(self.config, "loop_num", 2)
+        self.window_size = getattr(self.config, "loop_window_size", 64)
+
+        # Gate projections for Loop 2+ (one per layer)
+        head_dim = config.hidden_size // config.num_attention_heads
+        gate_projections = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: LoopGateProjection(
+                total_num_heads=config.num_attention_heads,
+                head_dim=head_dim,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
+            prefix=add_prefix("gate_projections", prefix),
+        )
+        if isinstance(gate_projections, tuple):
+            self.start_layer, self.end_layer, self.gate_projections = gate_projections
+        else:
+            self.start_layer, self.end_layer = 0, config.num_hidden_layers
+            self.gate_projections = gate_projections
+
+        layers = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: LoopCoderDecoderLayer(
+                config=config,
+                layer_id=idx,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
+            prefix=add_prefix("layers", prefix),
+        )
+        if isinstance(layers, tuple):
+            self.start_layer, self.end_layer, self.layers = layers
+        else:
+            self.start_layer, self.end_layer = 0, config.num_hidden_layers
+            self.layers = layers
+
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is not None:
+            hidden_states = input_embeds
+        else:
+            hidden_states = self.embed_tokens(input_ids)
+
+        # Multi-loop forward pass
+        for loop_idx in range(self.loop_num):
+            for layer_idx in range(self.start_layer, self.end_layer):
+                layer = self.layers[layer_idx]
+                # Get gate_proj for this layer (only for loop_idx > 0)
+                gate_proj = self.gate_projections[layer_idx] if loop_idx > 0 else None
+                hidden_states = layer(
+                    positions, hidden_states, forward_batch, loop_idx, gate_proj
+                )
+
+        hidden_states = self.norm(hidden_states)
+        return hidden_states
+
+
+class IQuestLoopCoderForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+
+        self.model = IQuestLoopCoderModel(
+            config=config,
+            quant_config=quant_config,
+            prefix=add_prefix("model", prefix),
+        )
+
+        if config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+            )
+
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ):
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            # Handle gate_projections weights
+            if name.startswith("gate_projections."):
+                if name.endswith(".weight"):
+                    sglang_name = name.replace(".weight", ".gate_proj.weight")
+                elif name.endswith(".bias"):
+                    sglang_name = name.replace(".bias", ".gate_proj.bias")
+                else:
+                    continue
+
+                if sglang_name in params_dict:
+                    param = params_dict[sglang_name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+                continue
+
+            # Handle stacked parameters
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name in params_dict:
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Handle regular parameters
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name in params_dict:
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+
+
+# Entry class for model registration
+EntryClass = IQuestLoopCoderForCausalLM
diff --git a/sglang/python/sglang/srt/models/jet_nemotron.py b/sglang/python/sglang/srt/models/jet_nemotron.py
new file mode 100644
index 0000000000000000000000000000000000000000..513f2ce3759a41ae4e58b2993c2c66720b78f966
--- /dev/null
+++ b/sglang/python/sglang/srt/models/jet_nemotron.py
@@ -0,0 +1,599 @@
+from collections.abc import Iterable
+from typing import cast
+
+import einops
+import torch
+import torch.nn as nn
+
+from sglang.srt.configs.jet_nemotron import JetBlockConfig, JetNemotronConfig
+from sglang.srt.layers.attention.fla.fused_recurrent import (
+    fused_recurrent_gated_delta_rule_update,
+)
+from sglang.srt.layers.attention.fla.layernorm_gated import RMSNorm as RMSNormGated
+from sglang.srt.layers.attention.hybrid_linear_attn_backend import (
+    HybridLinearAttnBackend,
+    MambaAttnBackendBase,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.qwen2 import Qwen2MLP, Qwen2Model
+from sglang.srt.utils import add_prefix
+
+
+class DynamicShortConvolutionKernelGenerator(nn.Module):
+    def __init__(
+        self,
+        input_size: int,
+        hidden_size: int,
+        output_size: int,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.w1 = ColumnParallelLinear(
+            input_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("w1", prefix),
+        )
+
+        self.act = nn.SiLU()
+
+        self.w2 = ColumnParallelLinear(
+            hidden_size,
+            output_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("w2", prefix),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x, _ = self.w1(x)
+        x = self.act(x)
+        x, _ = self.w2(x)
+        return x
+
+
+class DynamicShortConvolution(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        kernel_size: int,
+        generator_input_size: int,
+        generator_reduction: int,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        generator_hidden_size = hidden_size // generator_reduction
+
+        self.kernel_generator = DynamicShortConvolutionKernelGenerator(
+            input_size=generator_input_size,
+            hidden_size=generator_hidden_size,
+            output_size=hidden_size * kernel_size,
+            quant_config=quant_config,
+            prefix=add_prefix("kernel_generator", prefix),
+        )
+
+        self.hidden_size = hidden_size
+        self.kernel_size = kernel_size
+
+    def forward(
+        self,
+        x: torch.Tensor,  # (cu_seq_len, hidden_size)
+        *,
+        conv_state: torch.Tensor,  # (batch_size, hidden_size, kernel_size - 1)
+        generator_input: torch.Tensor,  # (cu_seq_len, generator_input_size)
+        seq_lens: torch.Tensor,  # (batch_size,)
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Args:
+            x: (cu_seq_len, hidden_size)
+            conv_state: (batch_size, hidden_size, kernel_size - 1)
+            generator_input: (cu_seq_len, generator_input_size)
+            seq_lens: (batch_size,)
+
+        Returns:
+            out: (cu_seq_len, hidden_size)
+            conv_state: (batch_size, hidden_size, kernel_size - 1)
+        """
+
+        x_seqs = self._continuous_to_seqs(x, seq_lens=seq_lens)
+        conv_state = einops.rearrange(conv_state, "b d k -> b k d")
+        x_seqs = [torch.cat([conv_state[i], x_seqs[i]]) for i in range(len(x_seqs))]
+        x = self._seqs_to_batch(
+            x_seqs
+        )  # (batch_size, max_seq_len + kernel_size - 1, hidden_size)
+
+        x = einops.rearrange(x, "b l d -> b d l")
+
+        new_conv_state = x[
+            :, :, -(self.kernel_size - 1) :
+        ]  # (batch_size, hidden_size, kernel_size - 1)
+
+        x = x.unfold(
+            dimension=-1, size=self.kernel_size, step=1
+        )  # (batch_size, hidden_size, max_seq_len, kernel_size)
+        x = einops.rearrange(x, "b d l k -> b l d k")
+
+        kernels = self.kernel_generator(
+            generator_input
+        )  # (cu_seq_len, hidden_size * kernel_size)
+        kernels = einops.rearrange(
+            kernels,
+            "l (d k) -> l d k",
+            d=self.hidden_size,
+            k=self.kernel_size,
+        )
+        kernels = self._seqs_to_batch(
+            self._continuous_to_seqs(kernels, seq_lens=seq_lens)
+        )  # (batch_size, max_seq_len, hidden_size, kernel_size)
+
+        out = (x * kernels).sum(dim=-1)  # (batch_size, max_seq_len, hidden_size)
+
+        out = self._batch_to_continuous(
+            out, seq_lens=seq_lens
+        )  # (cu_seq_len, hidden_size)
+
+        out = nn.functional.silu(out)
+
+        return out, new_conv_state
+
+    def _batch_to_continuous(
+        self,
+        x: torch.Tensor,
+        *,
+        seq_lens: torch.Tensor,
+    ) -> torch.Tensor:
+        return torch.cat([x[i, -seq_lens[i] :] for i in range(seq_lens.size(0))])
+
+    def _continuous_to_seqs(
+        self,
+        x: torch.Tensor,
+        *,
+        seq_lens: torch.Tensor,
+    ) -> list[torch.Tensor]:
+        return [
+            x[(seq_lens[:i].sum()) : (seq_lens[: i + 1].sum())]
+            for i in range(seq_lens.size(0))
+        ]
+
+    def _seqs_to_batch(
+        self,
+        seqs: list[torch.Tensor],
+    ) -> torch.Tensor:
+        return nn.utils.rnn.pad_sequence(
+            seqs,
+            batch_first=True,
+            padding_side="left",
+        )
+
+
+class JetBlock(nn.Module):
+    def __init__(
+        self,
+        config: JetNemotronConfig,
+        layer_id: int,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+
+        jet_block_config = JetBlockConfig(
+            **self.config.efficient_attention_config[self.config.layer_types[layer_id]]
+        )
+
+        hidden_size = self.config.hidden_size
+        num_heads = jet_block_config.num_heads
+        head_k_dim = jet_block_config.head_dim
+        total_k_dim = num_heads * head_k_dim
+        head_v_dim = int(head_k_dim * jet_block_config.expand_v)
+        total_v_dim = num_heads * head_v_dim
+        conv_size = jet_block_config.conv_size
+
+        self.qkvabz_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [
+                total_k_dim,
+                total_k_dim,
+                total_v_dim,
+                num_heads,
+                num_heads,
+                total_v_dim,
+            ],
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("qkvabz_proj", prefix),
+        )
+
+        self.o_proj = RowParallelLinear(total_v_dim, hidden_size, bias=False)
+
+        self.A_log = nn.Parameter(torch.empty(num_heads, dtype=torch.float32))
+        self.dt_bias = nn.Parameter(torch.empty(num_heads))
+
+        self.dynamic_conv1d = DynamicShortConvolution(
+            quant_config=quant_config,
+            prefix=add_prefix("dynamic_conv1d", prefix),
+            hidden_size=total_v_dim,
+            kernel_size=conv_size,
+            generator_input_size=hidden_size,
+            generator_reduction=jet_block_config.dconv_generator_reduction,
+        )
+
+        self.o_norm = RMSNormGated(
+            head_v_dim,
+            eps=float(jet_block_config.norm_eps),
+        )
+
+        # Attributes.
+        self.conv_size = conv_size
+        self.head_k_dim = head_k_dim
+        self.head_v_dim = head_v_dim
+        self.layer_id = layer_id
+        self.num_heads = num_heads
+        self.total_k_dim = total_k_dim
+        self.total_v_dim = total_v_dim
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        assert isinstance(forward_batch.attn_backend, HybridLinearAttnBackend)
+        assert isinstance(
+            forward_batch.attn_backend.linear_attn_backend, MambaAttnBackendBase
+        )
+        linear_attn_backend = forward_batch.attn_backend.linear_attn_backend
+        forward_metadata = linear_attn_backend.forward_metadata
+        layer_cache = linear_attn_backend.req_to_token_pool.mamba2_layer_cache(
+            self.layer_id
+        )
+
+        qkvabz, _ = self.qkvabz_proj(hidden_states)
+        q, k, v, a, beta, z = qkvabz.split(
+            [
+                self.total_k_dim,
+                self.total_k_dim,
+                self.total_v_dim,
+                self.num_heads,
+                self.num_heads,
+                self.total_v_dim,
+            ],
+            dim=-1,
+        )
+
+        q = nn.functional.silu(q)
+        q = einops.rearrange(q, "l (h d) -> l h d", h=self.num_heads, d=self.head_k_dim)
+
+        k = nn.functional.silu(k)
+        k = einops.rearrange(k, "l (h d) -> l h d", h=self.num_heads, d=self.head_k_dim)
+
+        conv_cache = layer_cache.conv
+        assert isinstance(conv_cache, torch.Tensor)
+        v, new_conv_state = self.dynamic_conv1d(
+            v,
+            conv_state=conv_cache[
+                forward_metadata.mamba_cache_indices, -self.total_v_dim :, :
+            ],
+            generator_input=hidden_states,
+            seq_lens=(
+                forward_batch.extend_seq_lens
+                if forward_batch.extend_seq_lens is not None
+                else torch.ones(
+                    (forward_batch.batch_size,),
+                    dtype=torch.long,
+                )
+            ),
+        )
+        conv_cache[forward_metadata.mamba_cache_indices, -self.total_v_dim :, :] = (
+            new_conv_state
+        )
+        v = einops.rearrange(v, "l (h d) -> l h d", h=self.num_heads, d=self.head_v_dim)
+
+        g = -self.A_log.float().exp() * nn.functional.softplus(a.float() + self.dt_bias)
+
+        beta = nn.functional.sigmoid(beta)
+
+        o = fused_recurrent_gated_delta_rule_update(
+            q=q.unsqueeze(0),
+            k=k.unsqueeze(0),
+            v=v.unsqueeze(0),
+            g=g.unsqueeze(0),
+            beta=beta.unsqueeze(0),
+            initial_state_source=layer_cache.temporal,
+            initial_state_indices=forward_metadata.mamba_cache_indices,
+            cu_seqlens=cast(torch.LongTensor, forward_metadata.query_start_loc),
+            use_qk_l2norm_in_kernel=True,
+        ).squeeze(0)
+
+        z = einops.rearrange(z, "l (h d) -> l h d", h=self.num_heads)
+
+        o = self.o_norm(o, z)
+
+        o = einops.rearrange(o, "l h d -> l (h d)")
+
+        o, _ = self.o_proj(o)
+
+        return o
+
+
+class JetNemotronAttention(nn.Module):
+    def __init__(
+        self,
+        config: JetNemotronConfig,
+        layer_id: int,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+
+        self.head_dim = self.config.hidden_size // self.config.num_attention_heads
+
+        self.q_size = self.config.num_attention_heads * self.head_dim
+        self.kv_size = self.config.num_key_value_heads * self.head_dim
+
+        self.qkv_proj = QKVParallelLinear(
+            self.config.hidden_size,
+            self.head_dim,
+            self.config.num_attention_heads,
+            self.config.num_key_value_heads,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.config.num_attention_heads * self.head_dim,
+            self.config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=self.config.max_position_embeddings,
+            base=int(self.config.rope_theta),
+            rope_scaling=self.config.rope_scaling,
+        )
+
+        match self.config.layer_types[layer_id]:
+            case "attn":
+                sliding_window_size = -1
+
+            case "swa":
+                sliding_window_size = self.config.efficient_attention_config["swa"][
+                    "window_size"
+                ]
+
+            case _:
+                raise NotImplementedError
+
+        self.attn = RadixAttention(
+            self.config.num_attention_heads,
+            self.head_dim,
+            self.head_dim**-0.5,
+            num_kv_heads=self.config.num_key_value_heads,
+            layer_id=layer_id,
+            sliding_window_size=sliding_window_size,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class JetNemotronDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: JetNemotronConfig,
+        alt_stream: torch.cuda.Stream | None = None,
+        layer_id: int = 0,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        match config.layer_types[layer_id]:
+            case "attn" | "swa":
+                self.self_attn = JetNemotronAttention(
+                    config,
+                    quant_config=quant_config,
+                    prefix=add_prefix("self_attn", prefix),
+                    layer_id=layer_id,
+                )
+
+            case "jet":
+                self.self_attn = JetBlock(
+                    config,
+                    quant_config=quant_config,
+                    prefix=add_prefix("self_attn", prefix),
+                    layer_id=layer_id,
+                )
+
+            case _:
+                raise NotImplementedError
+
+        self.mlp = Qwen2MLP(
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: torch.Tensor | None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        # Self Attention
+        residual = hidden_states
+
+        hidden_states = self.input_layernorm(hidden_states)
+
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+
+        hidden_states = self.post_attention_layernorm(hidden_states)
+
+        hidden_states = self.mlp(hidden_states)
+
+        hidden_states = residual + hidden_states
+
+        return hidden_states, None
+
+
+class JetNemotronForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config: JetNemotronConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        self.quant_config = quant_config
+
+        self.model = Qwen2Model(
+            config,
+            quant_config=quant_config,
+            prefix=add_prefix("model", prefix),
+            decoder_layer_type=JetNemotronDecoderLayer,
+        )
+
+        if config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+            )
+
+        self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(PoolingType.LAST, normalize=True)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor | None = None,
+        get_embedding: bool = False,
+    ) -> EmbeddingPoolerOutput | LogitsProcessorOutput:
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            input_embeds,
+        )
+
+        if not get_embedding:
+            return self.logits_processor(
+                input_ids, hidden_states, self.lm_head, forward_batch
+            )
+        else:
+            return self.pooler(hidden_states, forward_batch)
+
+    def get_input_embeddings(self) -> nn.Module:
+        return self.model.embed_tokens
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
+        stacked_params_mapping: list[tuple[str, str, str | int]] = [
+            # (param_name, shard_weight_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+            ("qkvabz_proj", "q_proj", 0),
+            ("qkvabz_proj", "k_proj", 1),
+            ("qkvabz_proj", "v_proj", 2),
+            ("qkvabz_proj", "a_proj", 3),
+            ("qkvabz_proj", "b_proj", 4),
+            ("qkvabz_proj", "g_proj", 5),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        for weight_name, loaded_weight in weights:
+            # Handle stacked parameters first.
+            for (
+                param_name_part,
+                shard_weight_name_part,
+                shard_id,
+            ) in stacked_params_mapping:
+                if shard_weight_name_part not in weight_name.split("."):
+                    continue
+
+                param_name = weight_name.replace(
+                    shard_weight_name_part, param_name_part
+                )
+
+                if param_name not in params_dict:
+                    # Fall back to direct match if no such stacked parameter.
+                    continue
+
+                param = params_dict[param_name]
+                weight_loader = getattr(param, "weight_loader")
+                weight_loader(param, loaded_weight, shard_id)
+                break
+
+            else:
+                param_name = weight_name
+
+                param = params_dict[param_name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = JetNemotronForCausalLM
diff --git a/sglang/python/sglang/srt/models/jet_vlm.py b/sglang/python/sglang/srt/models/jet_vlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc7c6b6c13b43c7ad164178653e001552e490b0f
--- /dev/null
+++ b/sglang/python/sglang/srt/models/jet_vlm.py
@@ -0,0 +1,143 @@
+import math
+from collections.abc import Iterable
+
+import einops
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+from transformers.modeling_outputs import BaseModelOutputWithPooling
+from transformers.models.siglip import SiglipVisionModel
+
+import sglang.srt.managers.mm_utils as mm_utils
+import sglang.srt.model_loader.weight_utils as weight_utils
+import sglang.srt.utils as utils
+from sglang.srt.configs.jet_vlm import JetVLMConfig
+from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.mm_utils import MultiModalityDataPaddingPatternMultimodalTokens
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.jet_nemotron import JetNemotronForCausalLM
+
+MM_HIDDEN_SIZE = 1152
+
+
+class JetVLMDownSample2x2BlockFix(nn.Module):
+    def forward(self, x: Tensor) -> Tensor:
+        _, seq_len, _ = x.shape
+
+        feat_size = math.isqrt(seq_len)
+
+        features = einops.rearrange(x, "b (h w) d -> b h w d", h=feat_size, w=feat_size)
+
+        if feat_size % 2 == 1:
+            features = F.pad(features, (0, 0, 0, 1, 0, 1))
+
+        features = einops.rearrange(
+            features, "b (h p1) (w p2) d -> b (h w) (p1 p2 d)", p1=2, p2=2
+        )
+
+        return features
+
+
+class JetVLMMultiModalProjector(nn.Module):
+    def __init__(self, config: JetVLMConfig) -> None:
+        super().__init__()
+
+        self.layers = nn.Sequential(
+            JetVLMDownSample2x2BlockFix(),
+            nn.LayerNorm(MM_HIDDEN_SIZE * 4),
+            nn.Linear(MM_HIDDEN_SIZE * 4, config.text_config.hidden_size),
+            nn.GELU(),
+            nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size),
+        )
+
+    def forward(self, x: Tensor) -> Tensor:
+        return self.layers(x)
+
+
+class JetVLMForConditionalGeneration(nn.Module):
+    def __init__(
+        self,
+        config: JetVLMConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+
+        self.vision_tower = SiglipVisionModel(config.vision_config)
+        self.mm_projector = JetVLMMultiModalProjector(config)
+        self.llm = JetNemotronForCausalLM(
+            config=config.text_config,
+            quant_config=quant_config,
+            prefix=utils.add_prefix("llm", prefix),
+        )
+
+    def forward(
+        self,
+        input_ids: Tensor,
+        positions: Tensor,
+        forward_batch: ForwardBatch,
+        get_embedding: bool = False,
+    ) -> LogitsProcessorOutput:
+        output = mm_utils.general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.llm,
+            data_embedding_funcs={
+                Modality.IMAGE: self.get_image_feature,
+                Modality.VIDEO: self.get_image_feature,
+            },
+            get_embedding=get_embedding,
+            positions=positions,
+        )
+
+        assert isinstance(output, LogitsProcessorOutput)
+
+        return output
+
+    def get_image_feature(self, mm_input: list[MultimodalDataItem]) -> Tensor:
+        pixel_values = torch.cat([torch.tensor(x.feature) for x in mm_input], dim=0)
+
+        vision_tower_output: BaseModelOutputWithPooling = self.vision_tower(
+            pixel_values,
+            output_hidden_states=True,
+        )
+        assert vision_tower_output.hidden_states is not None
+
+        vision_features = vision_tower_output.hidden_states[-2]
+
+        vision_features = self.mm_projector(vision_features)
+
+        vision_features = einops.rearrange(vision_features, "n p d -> (n p) d")
+
+        return vision_features
+
+    def load_weights(self, weights: Iterable[tuple[str, Tensor]]) -> None:
+        params_dict = dict(self.named_parameters())
+
+        for name, loaded_weight in weights:
+            if name.startswith("llm."):
+                self.llm.load_weights([(name[len("llm.") :], loaded_weight)])
+            else:
+                param = params_dict[name]
+                weight_loader = getattr(
+                    param, "weight_loader", weight_utils.default_weight_loader
+                )
+                weight_loader(param, loaded_weight)
+
+    def pad_input_ids(
+        self, input_ids: list[int], mm_inputs: MultimodalInputs
+    ) -> list[int]:
+        pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+        return pattern.pad_input_tokens(input_ids, mm_inputs)
+
+
+EntryClass = [JetVLMForConditionalGeneration]
diff --git a/sglang/python/sglang/srt/models/kimi_k25.py b/sglang/python/sglang/srt/models/kimi_k25.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf931d5cc45ee74cac0e4333eebe2f007dd9ad2e
--- /dev/null
+++ b/sglang/python/sglang/srt/models/kimi_k25.py
@@ -0,0 +1,827 @@
+import logging
+from copy import deepcopy
+from typing import Iterable, List, Optional, Sequence, Tuple
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+from transformers import activations
+
+from sglang.srt.configs.kimi_k25 import KimiK25Config, KimiK25VisionConfig
+from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternMultimodalTokens,
+    general_mm_embed_routine,
+)
+
+try:
+    from transformers.activations import PytorchGELUTanh
+except ImportError:
+    from transformers.activations import GELUTanh
+
+    activations.PytorchGELUTanh = GELUTanh
+    PytorchGELUTanh = GELUTanh
+
+from sglang.srt.layers.attention.vision import VisionAttention
+from sglang.srt.layers.linear import ReplicatedLinear
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.deepseek_v2 import DeepseekV3ForCausalLM
+from sglang.srt.models.kimi_vl_moonvit import MLP2
+from sglang.srt.models.utils import WeightsMapper
+from sglang.srt.multimodal.mm_utils import run_dp_sharded_mrope_vision_model
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import add_prefix, is_npu
+
+KIMIV_VT_INFER_MAX_PATCH_NUM = 16328
+logger = logging.getLogger(__name__)
+
+from sglang.srt.layers.dp_attention import is_dp_attention_enabled
+
+_is_npu = is_npu()
+
+
+def apply_rope(
+    xq: torch.Tensor, xk: torch.Tensor, freqs_cis: torch.Tensor, x_shape=None
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Args: (The leading dimensions of all inputs should be the same)
+        xq: query, tensor of shape (..., num_heads, head_dim)
+        xk: key, tensor of shape (..., num_heads, head_dim)
+        freqs_cis: tensor of shape (..., head_dim/2), dtype=torch.complex64. It contains the precomputed cis(freqs) for each position in the 2D grid.
+    Returns:
+        xq_out, xk_out: tensors of shape (..., num_heads, head_dim)
+    """
+
+    freqs_cis = freqs_cis.unsqueeze(-2)  # ..., 1, head_dim/2
+    # ..., num_heads, head_dim/2
+    xq_ = torch.view_as_complex(xq.float().view(*xq.shape[:-1], -1, 2))
+    xk_ = torch.view_as_complex(xk.float().view(*xq.shape[:-1], -1, 2))
+    xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(-2)  # ..., num_heads, head_dim
+    xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(-2)  # ..., num_heads, head_dim
+    return xq_out.type_as(xq), xk_out.type_as(xk)
+
+
+def tpool_patch_merger(
+    x: torch.Tensor,
+    grid_thws: torch.Tensor,
+    merge_kernel_size: tuple[int, int] = (2, 2),
+) -> list[torch.Tensor]:
+    d_model = x.size(-1)
+
+    outputs = []
+    pre_sum = 0
+    for t, h, w in grid_thws.tolist():
+        # Get the current sequence
+        seq = x[pre_sum : pre_sum + t * h * w]
+        # Reshape along self.merge_kernel_size and concat to the last dimension
+        kernel_height, kernel_width = merge_kernel_size
+        new_height, new_width = h // kernel_height, w // kernel_width
+        reshaped_seq = seq.view(
+            t, new_height, kernel_height, new_width, kernel_width, d_model
+        )
+        reshaped_seq = (
+            reshaped_seq.permute(0, 1, 3, 2, 4, 5).contiguous().mean(dim=0)
+        )  # temporal pooling
+        padded_seq = reshaped_seq.view(
+            new_height * new_width, kernel_height * kernel_width, -1
+        )
+        outputs.append(padded_seq)
+        pre_sum += t * h * w
+
+    return outputs
+
+
+class MoonViTEncoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        num_heads: int,
+        hidden_dim: int,
+        mlp_dim: int,
+        *,
+        activation=F.gelu,
+        attn_bias: bool = False,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        use_data_parallel: bool = False,
+    ):
+        super().__init__()
+        self.num_heads = num_heads
+        self.hidden_dim = hidden_dim
+        self.hidden_size_per_attention_head = self.hidden_dim // self.num_heads
+
+        self.norm0 = nn.LayerNorm(hidden_dim)
+        self.norm1 = nn.LayerNorm(hidden_dim)
+        self.mlp = MLP2([hidden_dim, mlp_dim, hidden_dim], activation)
+
+        self.attn = VisionAttention(
+            embed_dim=hidden_dim,
+            num_heads=num_heads,
+            projection_size=hidden_dim,
+            use_qkv_parallel=True,
+            qkv_bias=attn_bias,
+            proj_bias=attn_bias,
+            flatten_batch=True,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+            use_data_parallel=use_data_parallel,
+            customized_position_embedding_applier=apply_rope,
+            use_dp_attention_reduce=is_dp_attention_enabled(),
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        max_seqlen: int,
+        rope_freqs_cis: torch.Tensor | None = None,
+    ):
+        residual = hidden_states
+        hidden_states = self.norm0(hidden_states)
+
+        hidden_states = self.attn(
+            hidden_states,
+            cu_seqlens=cu_seqlens,
+            position_embeddings=rope_freqs_cis,
+        )
+
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.norm1(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        return hidden_states
+
+
+def get_rope_shape_decorate(func):
+    _get_rope_shape_first_call_flag = set()
+
+    def wrapper(org, interpolation_mode, shape):
+        key = (org.requires_grad, torch.is_grad_enabled(), interpolation_mode)
+        if key not in _get_rope_shape_first_call_flag:
+            _get_rope_shape_first_call_flag.add(key)
+            _ = func(org, interpolation_mode, shape=(64, 64))
+        return func(org, interpolation_mode, shape)
+
+    return wrapper
+
+
+def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
+    """
+    From:
+    https://github.com/OpenGVLab/InternVideo/blob/421f6d2361fc8f61a3394244571f2601a4e99e29/InternVideo2/multi_modality/models/backbones/internvideo2/pos_embed.py#L86
+    embed_dim: output dimension for each position
+    pos: a list of positions to be encoded: size (M,)
+    out: (M, D)
+    """
+    assert embed_dim % 2 == 0
+    omega = np.arange(embed_dim // 2, dtype=np.float32)
+    omega /= embed_dim / 2.0
+    omega = 1.0 / 10000**omega  # (D/2,)
+
+    pos = pos.reshape(-1)  # (M,)
+    out = np.einsum("m,d->md", pos, omega)  # (M, D/2), outer product
+
+    emb_sin = np.sin(out)  # (M, D/2)
+    emb_cos = np.cos(out)  # (M, D/2)
+
+    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
+    return emb
+
+
+@get_rope_shape_decorate
+@torch.compile(dynamic=True, disable=_is_npu)
+def get_rope_shape(org, interpolation_mode, shape):
+    return (
+        F.interpolate(
+            org.permute((2, 0, 1)).unsqueeze(0),
+            size=shape,
+            mode=interpolation_mode,
+        )
+        .squeeze(0)
+        .permute((1, 2, 0))
+        .flatten(end_dim=1)
+    )
+
+
+def get_1d_sincos_pos_embed(embed_dim, t_size, cls_token=False):
+    """
+    t_size: int of the temporal size
+    return:
+    pos_embed: [t_size, embed_dim] or [1+t_size, embed_dim] (w/ or w/o cls_token)
+    """
+    grid_t = np.arange(t_size, dtype=np.float32)
+    pos_embed = get_1d_sincos_pos_embed_from_grid(embed_dim, grid_t)
+    if cls_token:
+        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
+    return pos_embed
+
+
+class Learnable2DInterpPosEmbDivided_fixed(nn.Module):
+
+    def __init__(
+        self,
+        height: int,
+        width: int,
+        num_frames: int,
+        dim: int,
+        interpolation_mode: str = "bicubic",
+    ) -> None:
+        super().__init__()
+        self.height = height
+        self.width = width
+        self.num_frames = num_frames
+        self.dim = dim
+        self.interpolation_mode = interpolation_mode
+        self.weight = nn.Parameter(torch.empty(height, width, dim))
+        self.register_buffer(
+            "time_weight",
+            torch.from_numpy(get_1d_sincos_pos_embed(self.dim, self.num_frames))
+            .float()
+            .unsqueeze(1),
+            persistent=False,
+        )
+
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        nn.init.normal_(self.weight)
+
+    def forward(self, x: torch.Tensor, grid_thws: torch.Tensor) -> torch.Tensor:
+        pos_embs = []
+        for t, h, w in grid_thws.tolist():
+            assert t <= self.num_frames, f"t:{t} > self.num_frames:{self.num_frames}"
+            if (h, w) == self.weight.shape[:-1]:
+                pos_emb_2d = self.weight.flatten(end_dim=1)
+            else:
+                pos_emb_2d = get_rope_shape(
+                    self.weight,
+                    interpolation_mode=self.interpolation_mode,
+                    shape=(h, w),
+                )
+
+            if t == 1:
+                pos_emb_3d = pos_emb_2d
+            else:
+                pos_emb_3d = (
+                    pos_emb_2d.unsqueeze(0).repeat(t, 1, 1) + self.time_weight[0:t]
+                )
+
+            pos_embs.append(pos_emb_3d.reshape(-1, pos_emb_3d.shape[-1]))
+
+        out = x + torch.cat(pos_embs)
+        return out
+
+
+class Rope2DPosEmbRepeated(nn.Module):
+    """2D rotary position embedding with multi-resolution support.
+    This class is intended to be used in the following way:
+    1. Before training, create an instance of Rope2DPosEmb. This instance will hold the precomputed cis.
+    2. Before each forward pass, call `get_freqs_cis_by_*` to get the `freqs_cis` tensor for this iteration.
+    3. During the forward pass, pass the `freqs_cis` tensor to each attention layer, and call `apply` just before each attention operation.
+        The rope is shared across all attention layers and all heads.
+    Refs:
+    - RoFormer: https://arxiv.org/abs/2104.09864
+    - VisionLLaMA: https://arxiv.org/abs/2403.00522
+    - https://github.com/Meituan-AutoML/VisionLLaMA/blob/main/dit/models.py
+    Args:
+        dim (int): usually the multi-head attention dimension, should be divisible by 4 (TODO: relax this constraint if needed)
+        max_height (int): the maximum height of the 2D grid
+        max_width (int): the maximum width of the 2D grid
+        theta_base (float): the base of the theta
+    """
+
+    def __init__(self, dim: int, max_height: int, max_width: int, theta_base=10000):
+        super().__init__()
+        self.dim = dim
+        assert self.dim % 4 == 0, "dim must be divisible by 4"
+        self.max_height = max_height
+        self.max_width = max_width
+        self.theta_base = theta_base
+
+    def extra_repr(self):
+        return f"dim={self.dim}, max_height={self.max_height}, max_width={self.max_width}, theta_base={self.theta_base}"
+
+    def _precompute_freqs_cis(self, device: torch.device) -> torch.Tensor:
+        """Calculate the cis(freqs) for each position in the 2D grid.
+        Return: complex tensor of shape (max_height, max_width, dim//2) and value:
+            height axis: ret[h, w, 2*i] = cis(h * theta_base**(-4*i/dim))
+            weight axis: ret[h, w, 2*i+1] = cis(w * theta_base**(-4*i/dim))   with (i in [0, dim//4))
+            note: `cis` is a mathematical notation defined by cis x = cos x + i sin x,
+        """
+        N = self.max_height * self.max_width
+        flat_pos = torch.arange(0, N).float().to(device)
+        x_pos = flat_pos % self.max_width
+        y_pos = flat_pos // self.max_width
+        dim_range = (
+            torch.arange(0, self.dim, 4)[: (self.dim // 4)].float().to(device)
+        )  # C/4
+        freqs = 1.0 / (self.theta_base ** (dim_range / self.dim))
+        x_freqs = torch.outer(x_pos, freqs).float()  # N, C/4
+        y_freqs = torch.outer(y_pos, freqs).float()  # N, C/4
+        x_cis = torch.polar(torch.ones_like(x_freqs), x_freqs)  # N, C/4
+        y_cis = torch.polar(torch.ones_like(y_freqs), y_freqs)  # N, C/4
+        # N, C/4, 2
+        freqs_cis = torch.cat(
+            [x_cis.unsqueeze(dim=-1), y_cis.unsqueeze(dim=-1)], dim=-1
+        )
+        # max_height, max_width, C/2
+        freqs_cis = freqs_cis.reshape(self.max_height, self.max_width, -1)
+        return freqs_cis
+
+    def get_freqs_cis(
+        self, grid_thws: torch.Tensor, device: torch.device
+    ) -> torch.Tensor:
+        """
+        Args:
+            grid_thws (torch.Tensor): grid time, height and width
+        Returns:
+            freqs_cis: tensor of shape (sum(t * height * width), dim//2)
+        """
+        if not hasattr(self, "freqs_cis"):
+            self.register_buffer(
+                "freqs_cis", self._precompute_freqs_cis(device), persistent=False
+            )
+
+        shapes = grid_thws.tolist()
+        assert all(
+            1 <= h <= self.max_height and 1 <= w <= self.max_width for t, h, w in shapes
+        ), (
+            shapes,
+            self.max_height,
+            self.max_width,
+        )
+        freqs_cis = torch.cat(
+            [
+                self.freqs_cis[:h, :w].reshape(-1, self.dim // 2).repeat(t, 1)
+                for t, h, w in shapes
+            ],
+            dim=0,
+        )
+        return freqs_cis
+
+
+class MoonVision3dPatchEmbed(nn.Module):
+
+    def __init__(
+        self,
+        out_dim: int,
+        in_dim: int = 3,
+        patch_size: int | tuple[int, int] = (14, 14),
+        pos_emb_height: int = 14,
+        pos_emb_width: int = 14,
+        pos_emb_time: int = 4,
+        pos_emb_type: str = "divided_fixed",
+    ):
+        super().__init__()
+        assert isinstance(
+            patch_size, int | Sequence
+        ), f"Invalid patch_size type: {type(patch_size)}"
+        if isinstance(patch_size, int):
+            patch_size = (patch_size, patch_size)
+        assert (
+            len(patch_size) == 2
+        ), f"Expected patch_size to be a tuple of 2, got {patch_size}"
+        self.patch_size = patch_size
+
+        self.proj = nn.Conv2d(
+            in_dim, out_dim, kernel_size=patch_size, stride=patch_size
+        )
+
+        if pos_emb_type == "divided_fixed":
+            self.pos_emb = Learnable2DInterpPosEmbDivided_fixed(
+                height=pos_emb_height,
+                width=pos_emb_width,
+                num_frames=pos_emb_time,
+                dim=out_dim,
+            )
+        else:
+            raise NotImplementedError(f"Not support pos_emb_type: {pos_emb_type}")
+
+    def forward(self, x: torch.Tensor, grid_thws: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            x (L, Channels): input tensor
+            grid_hws (N, 3): temporal, height and width
+        Returns:
+            (L, Cout) tensor
+        """
+        x = self.proj(x).view(x.size(0), -1)
+        # apply positional embedding
+        x = self.pos_emb(x, grid_thws)
+        return x
+
+
+class MoonViT3dEncoder(nn.Module):
+
+    def __init__(
+        self,
+        hidden_dim: int,
+        num_layers: int,
+        block_cfg: dict,
+        video_attn_type: str = "spatial_temporal",
+    ) -> None:
+        super().__init__()
+
+        assert (
+            video_attn_type == "spatial_temporal"
+        ), f'video_attn_type must be "spatial_temporal", got {video_attn_type}'
+        self.video_attn_type = video_attn_type
+        self.rope_2d = Rope2DPosEmbRepeated(
+            block_cfg["hidden_dim"] // block_cfg["num_heads"], 512, 512
+        )
+        self.blocks = nn.ModuleList(
+            [MoonViTEncoderLayer(**block_cfg) for _ in range(num_layers)]
+        )
+        self.final_layernorm = nn.LayerNorm(hidden_dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        grid_thws: torch.Tensor,
+    ) -> torch.Tensor:
+        rope_freqs_cis = self.rope_2d.get_freqs_cis(
+            grid_thws=grid_thws, device=hidden_states.device
+        )
+
+        lengths = torch.cat(
+            (
+                torch.zeros(1, dtype=grid_thws.dtype, device=grid_thws.device),
+                grid_thws[:, 0] * grid_thws[:, 1] * grid_thws[:, 2],
+            )
+        )
+
+        max_seqlen = lengths.max()
+        cu_seqlens = lengths.to(hidden_states.device).cumsum(dim=0, dtype=torch.int32)
+
+        for block in self.blocks:
+            hidden_states = block(
+                hidden_states, cu_seqlens, max_seqlen, rope_freqs_cis=rope_freqs_cis
+            )
+
+        hidden_states = self.final_layernorm(hidden_states)
+
+        return hidden_states
+
+
+class MoonViT3dPretrainedModel(nn.Module):
+    model_type = "moonvit3d"
+    _no_split_modules = ["PackingTransformer"]
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
+
+    def __init__(self, config, *inputs, use_data_parallel: bool = False, **kwargs):
+        super().__init__()
+        config = deepcopy(config)
+        self.config = config
+        self.merge_kernel_size = config.merge_kernel_size
+        self.patch_size = config.patch_size
+        self.merge_type = config.merge_type
+
+        self.patch_embed = MoonVision3dPatchEmbed(
+            out_dim=config.hidden_size,
+            patch_size=config.patch_size,
+            pos_emb_height=config.init_pos_emb_height,
+            pos_emb_width=config.init_pos_emb_width,
+            pos_emb_time=config.init_pos_emb_time,
+            pos_emb_type=config.pos_emb_type,
+        )
+
+        self.encoder = MoonViT3dEncoder(
+            hidden_dim=config.hidden_size,
+            num_layers=config.num_hidden_layers,
+            block_cfg={
+                "num_heads": config.num_attention_heads,
+                "hidden_dim": config.hidden_size,
+                "mlp_dim": config.intermediate_size,
+                "activation": PytorchGELUTanh(),
+                "attn_bias": True,
+                "use_data_parallel": use_data_parallel,
+            },
+            video_attn_type=config.video_attn_type,
+        )
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return self.patch_embed.proj.weight.dtype
+
+    @property
+    def device(self) -> torch.device:
+        return self.patch_embed.proj.weight.device
+
+    def forward(
+        self, pixel_values: torch.Tensor, grid_thws: torch.Tensor
+    ) -> torch.Tensor:
+        """
+        Args:
+            pixel_values (torch.Tensor): The input pixel values.
+            grid_thws (torch.Tensor): Temporal, height and width.
+        Returns:
+            torch.Tensor: The output tokens.
+        """
+        assert grid_thws.ndim == 2, f"grid_thws should be 2D, got {grid_thws.ndim}"
+        assert grid_thws.size(1) == 3, f"No support for _thw: {grid_thws}"
+        hidden_states = self.patch_embed(pixel_values, grid_thws)
+        hidden_states = self.encoder(hidden_states, grid_thws)
+        hidden_states = hidden_states.squeeze(0)
+        # spatial downsampling 2x with temporal pooling all
+        hidden_states = tpool_patch_merger(
+            hidden_states, grid_thws, merge_kernel_size=self.merge_kernel_size
+        )
+
+        return hidden_states
+
+
+class K2VLMultiModalProjector(nn.Module):
+    """Multi-modal projector with patch merging for K2-VL."""
+
+    def __init__(
+        self,
+        config: KimiK25VisionConfig,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        # Hidden size after patch merging
+        merge_h, merge_w = config.merge_kernel_size
+        self.hidden_size = config.vt_hidden_size * merge_h * merge_w
+
+        self.pre_norm = torch.nn.LayerNorm(config.vt_hidden_size, eps=1e-5)
+        self.linear_1 = ReplicatedLinear(
+            self.hidden_size,
+            self.hidden_size,
+            bias=True,
+            prefix=add_prefix(prefix, "linear_1"),
+        )
+        self.linear_2 = ReplicatedLinear(
+            self.hidden_size,
+            config.text_hidden_size,
+            bias=True,
+            prefix=add_prefix(prefix, "linear_2"),
+        )
+        self.act = nn.GELU()
+
+    def forward(self, image_features: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.pre_norm(image_features).view(-1, self.hidden_size)
+        hidden_states, _ = self.linear_1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states, _ = self.linear_2(hidden_states)
+        return hidden_states
+
+
+@torch.inference_mode()
+def mm_projection_auto(
+    mm_projector: torch.nn.Module | None, vt_output: list[torch.Tensor]
+):
+    """Apply MM projector to vision tower outputs."""
+    if mm_projector is None:
+        return vt_output
+
+    num_embedding_list = [x.shape[0] for x in vt_output]
+    batched = torch.cat(vt_output, dim=0)
+    proj_out = mm_projector(batched) if mm_projector else batched
+    proj_out = proj_out.reshape(-1, proj_out.shape[-1])
+    proj_out = torch.split(proj_out, num_embedding_list)
+    return proj_out
+
+
+@torch.inference_mode()
+def vision_tower_forward_auto(
+    vision_tower: torch.nn.Module,
+    pixel_values: torch.Tensor,
+    grid_thw: torch.Tensor,
+    mm_projector: torch.nn.Module | None = None,
+) -> list[torch.Tensor]:
+    """Auto-batched vision tower forward."""
+    assert isinstance(
+        pixel_values, torch.Tensor
+    ), "expect pixel_values to be a tensor, get {}".format(type(pixel_values))
+    n = grid_thw.shape[0]
+    n_patches_each_media = grid_thw.prod(-1)
+    max_infer_batch = max(n_patches_each_media.max(), KIMIV_VT_INFER_MAX_PATCH_NUM)
+    logger.debug(
+        "vt max_infer_batch: %s, KIMIV_VT_INFER_MAX_PATCH_NUM: %s",
+        max_infer_batch,
+        KIMIV_VT_INFER_MAX_PATCH_NUM,
+    )
+    tensors = []
+    pre_sum = 0
+    current_group_start = 0
+    current_group_patches = 0
+
+    for i in range(n):
+        current_media_patches = n_patches_each_media[i].item()
+        if current_group_patches + current_media_patches <= max_infer_batch:
+            current_group_patches += current_media_patches
+        else:
+            if current_group_start < i:
+                group_grid_thw = grid_thw[current_group_start:i]
+                group_n_patches = n_patches_each_media[current_group_start:i].sum()
+                group_input = pixel_values[pre_sum : pre_sum + group_n_patches]
+                group_output = vision_tower(group_input, group_grid_thw)
+                proj_out = mm_projection_auto(mm_projector, group_output)
+                tensors.extend(proj_out)
+                pre_sum += group_n_patches
+
+            current_group_start = i
+            current_group_patches = current_media_patches
+
+    # Process the last group
+    if current_group_start < n:
+        group_grid_thw = grid_thw[current_group_start:n]
+        group_n_patches = n_patches_each_media[current_group_start:n].sum()
+        group_input = pixel_values[pre_sum : pre_sum + group_n_patches]
+        group_output = vision_tower(group_input, group_grid_thw)
+        proj_out = mm_projection_auto(mm_projector, group_output)
+        tensors.extend(proj_out)
+
+    return tensors
+
+
+class KimiK25ForConditionalGeneration(nn.Module):
+    # Support nvidia/Kimi-K2.5-NVFP4 naming: language_model.layers.*.
+    # Ref: HF config.json for nvidia/Kimi-K2.5-NVFP4
+    # https://huggingface.co/nvidia/Kimi-K2.5-NVFP4/blob/main/config.json
+    hf_to_sglang_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            "language_model.layers.": "language_model.model.layers.",
+        }
+    )
+
+    def __init__(
+        self,
+        config: KimiK25Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        **kwargs,  # fix init_tts argument error
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.use_data_parallel = get_global_server_args().mm_enable_dp_encoder
+        # Create vision tower
+        self.vision_tower = MoonViT3dPretrainedModel(
+            config.vision_config, use_data_parallel=self.use_data_parallel
+        )
+        # Create mm projector
+        self.mm_projector = K2VLMultiModalProjector(config.vision_config)
+
+        self.language_model = DeepseekV3ForCausalLM(config.text_config, quant_config)
+
+        # Ensure that the dtype of the vision_tower and mm_projector matches that of the language_model.
+        # This solves the dtype mismatch issue when using device_map="auto" and torch_dtype.
+        if hasattr(self.language_model, "dtype"):
+            target_dtype = self.language_model.dtype
+            self.vision_tower = self.vision_tower.to(dtype=target_dtype)
+            self.mm_projector = self.mm_projector.to(dtype=target_dtype)
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        pixel_values = torch.cat([item.feature for item in items], dim=0).type(
+            self.vision_tower.dtype
+        )
+        grid_thws = torch.concat([item.grid_thws for item in items], dim=0).to(
+            self.vision_tower.device
+        )
+
+        target_dtype = self.vision_tower.patch_embed.proj.weight.dtype
+        pixel_values = pixel_values.to(target_dtype)
+
+        if self.use_data_parallel:
+            image_embeds = run_dp_sharded_mrope_vision_model(
+                self.vision_tower,
+                pixel_values,
+                grid_thws.tolist(),
+                rope_type="rope_2d",
+            )
+            image_features = self.mm_projector(image_embeds)
+            return image_features
+
+        image_features = vision_tower_forward_auto(
+            self.vision_tower,
+            pixel_values,
+            grid_thws,
+            mm_projector=self.mm_projector,
+        )
+        image_features = torch.cat(image_features, dim=0)
+        return image_features
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+        return pattern.pad_input_tokens(input_ids, mm_inputs)
+
+    @property
+    def start_layer(self) -> int:
+        return self.language_model.start_layer
+
+    @property
+    def end_layer(self) -> int:
+        return self.language_model.end_layer
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        get_embedding: bool = False,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ):
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.language_model,
+            data_embedding_funcs={
+                Modality.IMAGE: self.get_image_feature,
+            },
+            positions=positions,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+
+        return hidden_states
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        """Load weights for the model, separating vision and language weights"""
+        mapper = getattr(self, "hf_to_sglang_mapper", None)
+        if mapper is not None:
+            weights = mapper.apply(weights)
+
+        # Separate vision tower weights and language model weights
+        vision_weights = []
+        language_weights = []
+
+        for name, loaded_weight in weights:
+            if "vision_tower" in name or "mm_projector" in name:
+                name = name.replace(r"wqkv.", r"attn.qkv_proj.")
+                name = name.replace(r"wo.", r"attn.proj.")
+                name = name.replace("mm_projector.proj.0", "mm_projector.linear_1")
+                name = name.replace("mm_projector.proj.2", "mm_projector.linear_2")
+                vision_weights.append((name, loaded_weight))
+            else:
+                name = name.replace("language_model.", "")
+                # All other weights go to language model
+                language_weights.append((name, loaded_weight))
+
+        # Load vision tower weights
+        vision_state_dict = dict(vision_weights)
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in vision_state_dict.items():
+            if name not in params_dict:
+                raise ValueError(f"Weight {name} not found in params_dict")
+            param = params_dict[name]
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            # loaded_weight = self._pad_vit_attn_dummy_heads(name, loaded_weight)
+            weight_loader(param, loaded_weight)
+
+        # Load language model weights
+        if language_weights:
+            self.language_model.load_weights(language_weights)
+
+    @classmethod
+    def get_model_config_for_expert_location(cls, config: KimiK25Config):
+        text_config = config.text_config
+        return ModelConfigForExpertLocation(
+            num_layers=text_config.num_hidden_layers,
+            num_logical_experts=text_config.n_routed_experts,
+            num_groups=text_config.n_group,
+        )
+
+    def set_eagle3_layers_to_capture(
+        self, layer_ids: Optional[List[int]] = None
+    ) -> None:
+        """Set the layers to capture for EAGLE3 speculative decoding."""
+        if not hasattr(self.language_model, "set_eagle3_layers_to_capture"):
+            raise AttributeError(
+                "language_model does not support EAGLE3 speculative decoding."
+            )
+
+        self.language_model.set_eagle3_layers_to_capture(layer_ids)
+
+    def get_embed_and_head(self) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Get embedding and LM head weights for speculative decoding."""
+        if not hasattr(self.language_model, "get_embed_and_head"):
+            raise AttributeError(
+                "language_model does not support get_embed_and_head()."
+            )
+
+        return self.language_model.get_embed_and_head()
+
+    def set_embed_and_head(self, embed: torch.Tensor, head: torch.Tensor) -> None:
+        """Set embedding and LM head weights for speculative decoding."""
+        if not hasattr(self.language_model, "set_embed_and_head"):
+            raise AttributeError(
+                "language_model does not support set_embed_and_head()."
+            )
+
+        self.language_model.set_embed_and_head(embed, head)
+
+
+EntryClass = [KimiK25ForConditionalGeneration]
diff --git a/sglang/python/sglang/srt/models/kimi_linear.py b/sglang/python/sglang/srt/models/kimi_linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ba637c11a97b7a83ba303a84fd8a8e6840d9b45
--- /dev/null
+++ b/sglang/python/sglang/srt/models/kimi_linear.py
@@ -0,0 +1,794 @@
+# Adapted from: https://github.com/vllm-project/vllm/blob/0384aa7150c4c9778efca041ffd1beb3ad2bd694/vllm/model_executor/models/kimi_linear.py
+
+from collections.abc import Iterable
+from typing import Optional
+
+import torch
+from einops import rearrange
+from torch import nn
+
+from sglang.srt.configs.kimi_linear import KimiLinearConfig
+from sglang.srt.distributed import (
+    divide,
+    get_pp_group,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.layers.attention.fla.fused_norm_gate import FusedRMSNormGated
+from sglang.srt.layers.attention.fla.kda import fused_kda_gate
+from sglang.srt.layers.dp_attention import get_attention_tp_rank, get_attention_tp_size
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    ColumnParallelBatchedLinear,
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    MergedColumnParallelRepeatedLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
+from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+from sglang.srt.layers.moe.topk import TopK, TopKOutputFormat
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_linear_attention import RadixLinearAttention
+from sglang.srt.layers.utils import PPMissingLayer
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+    sharded_weight_loader,
+)
+from sglang.srt.models.deepseek_v2 import DeepseekV2AttentionMLA as KimiMLAAttention
+from sglang.srt.models.llama import LlamaMLP as KimiMLP
+from sglang.srt.models.transformers import maybe_prefix
+from sglang.srt.utils import make_layers
+from sglang.srt.utils.common import BumpAllocator, add_prefix, set_weight_attrs
+
+
+class KimiMoE(nn.Module):
+    def __init__(
+        self,
+        config: KimiLinearConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        layer_idx: int = 0,
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ):
+        super().__init__()
+        hidden_size = config.hidden_size
+        intermediate_size = config.intermediate_size
+        moe_intermediate_size = config.moe_intermediate_size
+        num_experts = config.num_experts
+        moe_renormalize = config.moe_renormalize
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.routed_scaling_factor = config.routed_scaling_factor
+        self.num_shared_experts = config.num_shared_experts
+        self.layer_idx = layer_idx
+        self.alt_stream = alt_stream
+
+        if config.hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {config.hidden_act}. "
+                "Only silu is supported for now."
+            )
+
+        # Gate always runs at half / full precision for now.
+        self.gate = ReplicatedLinear(
+            hidden_size,
+            num_experts,
+            bias=False,
+            quant_config=None,
+            prefix=f"{prefix}.gate",
+        )
+
+        self.gate.e_score_correction_bias = nn.Parameter(torch.empty(num_experts))
+
+        self.experts = get_moe_impl_class(quant_config)(
+            num_experts=config.n_routed_experts,
+            top_k=config.num_experts_per_token,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            layer_id=self.layer_idx,
+            quant_config=quant_config,
+            routed_scaling_factor=self.routed_scaling_factor,
+            prefix=add_prefix("experts", prefix),
+        )
+
+        self.topk = TopK(
+            top_k=config.num_experts_per_token,
+            renormalize=moe_renormalize,
+            use_grouped_topk=True,
+            num_expert_group=config.num_expert_group,
+            topk_group=config.topk_group,
+            correction_bias=self.gate.e_score_correction_bias,
+            quant_config=quant_config,
+            routed_scaling_factor=self.routed_scaling_factor,
+            apply_routed_scaling_factor_on_output=self.experts.should_fuse_routed_scaling_factor_in_topk,
+            # Some Fp4 MoE backends require the output format to be bypassed but the MTP layers are unquantized
+            # and requires the output format to be standard. We use quant_config to determine the output format.
+            output_format=TopKOutputFormat.STANDARD if quant_config is None else None,
+        )
+
+        if self.num_shared_experts is not None:
+            intermediate_size = moe_intermediate_size * self.num_shared_experts
+            self.shared_experts = KimiMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                reduce_results=False,
+            )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        num_tokens, hidden_size = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_size)
+
+        shared_output = None
+
+        if (
+            self.alt_stream is not None
+            and self.num_shared_experts is not None
+            and hidden_states.shape[0] > 0
+            and get_is_capture_mode()
+        ):
+            current_stream = torch.cuda.current_stream()
+            self.alt_stream.wait_stream(current_stream)
+
+            shared_output = self.shared_experts(hidden_states.clone())
+
+            with torch.cuda.stream(self.alt_stream):
+                router_logits, _ = self.gate(hidden_states)
+                topk_output = self.topk(hidden_states, router_logits)
+                final_hidden_states = self.experts(hidden_states, topk_output)
+
+            current_stream.wait_stream(self.alt_stream)
+        else:
+            if self.num_shared_experts is not None and hidden_states.shape[0] > 0:
+                shared_output = self.shared_experts(hidden_states)
+            router_logits, _ = self.gate(hidden_states)
+            topk_output = self.topk(hidden_states, router_logits)
+            final_hidden_states = self.experts(hidden_states, topk_output)
+
+        if shared_output is not None:
+            final_hidden_states = final_hidden_states + shared_output
+
+        if self.tp_size > 1:
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+        return final_hidden_states.view(num_tokens, hidden_size)
+
+
+class KimiDeltaAttention(nn.Module):
+    def __init__(
+        self,
+        layer_idx: int,
+        hidden_size: int,
+        config: KimiLinearConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        rms_norm_eps: float = 1e-5,
+        prefix: str = "",
+        **kwargs,
+    ) -> None:
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.attn_tp_size = get_attention_tp_size()
+        self.hidden_size = hidden_size
+        self.config = config
+        self.head_dim = config.linear_attn_config["head_dim"]
+        self.num_heads = config.linear_attn_config["num_heads"]
+        self.num_k_heads = config.linear_attn_config["num_heads"]
+        self.num_v_heads = config.linear_attn_config["num_heads"]
+        self.head_k_dim = config.linear_attn_config["head_dim"]
+        self.head_v_dim = config.v_head_dim
+        self.layer_idx = layer_idx
+        self.prefix = prefix
+        assert self.num_heads % self.tp_size == 0
+        self.local_num_heads = divide(self.num_heads, self.tp_size)
+
+        projection_size = self.head_dim * self.num_heads
+        self.conv_size = config.linear_attn_config["short_conv_kernel_size"]
+
+        # TODO: support fusion with quant
+        self.do_fuse_qkvbfg = quant_config is None
+
+        if self.do_fuse_qkvbfg:
+            # Fuse: q, k, v, beta (column parallel) + f_a, g_a (replicated)
+            self.qkvb_sizes = [
+                projection_size,
+                projection_size,
+                projection_size,
+                self.num_heads,
+            ]
+            self.fg_sizes = [self.head_dim, self.head_dim]
+
+            self.fused_qkvbfg_a_proj = MergedColumnParallelRepeatedLinear(
+                self.hidden_size,
+                self.qkvb_sizes,  # Column parallel
+                self.fg_sizes,  # Replicated: f_a, g_a
+                quant_config=quant_config,
+                prefix=f"{prefix}.fused_qkvbfg_a_proj",
+            )
+            self.split_sizes = [
+                3 * projection_size // self.tp_size,  # qkv
+                self.num_heads // self.tp_size,  # beta
+                2 * self.head_dim,  # f_a, g_a
+            ]
+            self.fused_fg_b_proj = ColumnParallelBatchedLinear(
+                2, self.head_dim, projection_size, dtype=config.dtype
+            )
+        else:
+            # Unfused path: separate QKVParallelLinear
+            attn_tp_rank = get_attention_tp_rank()
+            self.qkv_proj = QKVParallelLinear(
+                self.hidden_size,
+                self.head_dim,
+                self.num_heads,
+                self.num_k_heads,
+                bias=False,
+                quant_config=quant_config,
+                tp_rank=attn_tp_rank,
+                tp_size=self.attn_tp_size,
+                v_head_size=self.head_v_dim,
+                prefix=f"{prefix}.qkv_proj",
+            )
+
+            self.f_a_proj = ReplicatedLinear(
+                self.hidden_size,
+                self.head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.f_a_proj",
+            )
+
+            self.f_b_proj = ColumnParallelLinear(
+                self.head_dim,
+                projection_size,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.f_b_proj",
+            )
+
+            self.b_proj = ColumnParallelLinear(
+                self.hidden_size,
+                self.num_heads,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.b_proj",
+            )
+
+            self.g_a_proj = ReplicatedLinear(
+                self.hidden_size,
+                self.head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.g_a_proj",
+            )
+            self.g_b_proj = ColumnParallelLinear(
+                self.head_dim,
+                projection_size,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.g_b_proj",
+            )
+
+        self.dt_bias = nn.Parameter(
+            torch.empty(divide(projection_size, self.tp_size), dtype=torch.float32)
+        )
+
+        set_weight_attrs(self.dt_bias, {"weight_loader": sharded_weight_loader(0)})
+
+        self.qkv_conv1d = MergedColumnParallelLinear(
+            input_size=self.conv_size,
+            output_sizes=[projection_size, projection_size, projection_size],
+            bias=False,
+            params_dtype=torch.float32,
+            prefix=f"{prefix}.qkv_conv1d",
+        )
+        # unsqueeze to fit conv1d weights shape into the linear weights shape.
+        # Can't do this in `weight_loader` since it already exists in
+        # `ColumnParallelLinear` and `set_weight_attrs`
+        # doesn't allow to override it
+        self.qkv_conv1d.weight.data = self.qkv_conv1d.weight.data.unsqueeze(1)
+
+        self.A_log = nn.Parameter(
+            torch.empty(1, 1, self.local_num_heads, 1, dtype=torch.float32)
+        )
+        set_weight_attrs(self.A_log, {"weight_loader": sharded_weight_loader(2)})
+
+        self.o_norm = FusedRMSNormGated(
+            self.head_dim, eps=rms_norm_eps, activation="sigmoid"
+        )
+        self.o_proj = RowParallelLinear(
+            projection_size,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+
+        conv_weights = self.qkv_conv1d.weight.squeeze(1)
+        bias = self.qkv_conv1d.bias
+
+        self.attn = RadixLinearAttention(
+            layer_id=self.layer_idx,
+            num_q_heads=self.num_k_heads // self.attn_tp_size,
+            num_k_heads=self.num_k_heads // self.attn_tp_size,
+            num_v_heads=self.num_v_heads // self.attn_tp_size,
+            head_q_dim=self.head_k_dim,
+            head_k_dim=self.head_k_dim,
+            head_v_dim=self.head_v_dim,
+            conv_weights=conv_weights,
+            bias=bias,
+            A_log=self.A_log,
+            dt_bias=self.dt_bias,
+        )
+
+    def forward_qkvbfg(self, hidden_states: torch.Tensor):
+        qkv, _ = self.qkv_proj(hidden_states)
+
+        # Compute beta, forget_gate, and g_proj_states
+        beta = self.b_proj(hidden_states)[0]
+        forget_gate = self.f_b_proj(self.f_a_proj(hidden_states)[0])[0]
+        g_proj_states = self.g_b_proj(self.g_a_proj(hidden_states)[0])[0]
+
+        return (
+            qkv,
+            beta,
+            forget_gate,
+            g_proj_states,
+        )
+
+    def forward_qkvbfg_fused(self, hidden_states: torch.Tensor):
+        # Single fused projection for all: qkv + beta + f_a + g_a
+        fused_states = self.fused_qkvbfg_a_proj(hidden_states)
+
+        qkv, beta, fg_a_states = torch.split(
+            fused_states,
+            self.split_sizes,
+            dim=-1,
+        )
+
+        # use batch matmul to calculate forget_gate and g_proj_states
+        forget_gate, g_proj_states = self.fused_fg_b_proj(
+            fg_a_states.view(-1, 2, self.head_dim).transpose(0, 1)
+        )
+
+        return (
+            qkv,
+            beta,
+            forget_gate,
+            g_proj_states,
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        zero_allocator: BumpAllocator,
+    ) -> None:
+        if self.do_fuse_qkvbfg:
+            mixed_qkv, beta, forget_gate, g_proj_states = self.forward_qkvbfg_fused(
+                hidden_states
+            )
+        else:
+            mixed_qkv, beta, forget_gate, g_proj_states = self.forward_qkvbfg(
+                hidden_states
+            )
+
+        # fused_kda_gate is fused to KimiLinearAttentionBackend with decode
+        if not forward_batch.forward_mode.is_decode():
+            forget_gate = fused_kda_gate(
+                forget_gate, self.A_log, self.head_dim, g_bias=self.dt_bias
+            )
+            beta = beta.float().sigmoid()
+            forget_gate = forget_gate.unsqueeze(0)
+        beta = beta.unsqueeze(0)
+
+        core_attn_out = self.attn(
+            forward_batch,
+            mixed_qkv=mixed_qkv,
+            a=forget_gate,
+            b=beta,
+        )
+
+        norm_gate = rearrange(g_proj_states, "... (h d) -> ... h d", d=self.head_dim)
+        core_attn_out = self.o_norm(core_attn_out, norm_gate)
+        core_attn_out = rearrange(core_attn_out, "1 n h d -> n (h d)")
+
+        return self.o_proj(core_attn_out)[0]
+
+
+class KimiDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: KimiLinearConfig,
+        layer_idx: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.alt_stream = alt_stream
+
+        self.is_moe = config.is_moe
+
+        if config.is_kda_layer(layer_idx):
+            self.self_attn = KimiDeltaAttention(
+                layer_idx=layer_idx,
+                hidden_size=config.hidden_size,
+                config=config,
+                quant_config=quant_config,
+                prefix=f"{prefix}.self_attn",
+            )
+        else:
+            self.self_attn = KimiMLAAttention(
+                layer_id=layer_idx,
+                hidden_size=self.hidden_size,
+                num_heads=config.num_attention_heads,
+                quant_config=quant_config,
+                prefix=f"{prefix}.self_attn",
+                config=config,
+                qk_nope_head_dim=config.qk_nope_head_dim,
+                qk_rope_head_dim=config.qk_rope_head_dim,
+                v_head_dim=config.v_head_dim,
+                q_lora_rank=config.q_lora_rank,
+                kv_lora_rank=config.kv_lora_rank,
+                skip_rope=True,
+            )
+
+        if (
+            self.is_moe
+            and config.num_experts is not None
+            and layer_idx >= config.first_k_dense_replace
+            and layer_idx % config.moe_layer_freq == 0
+        ):
+            self.block_sparse_moe = KimiMoE(
+                config=config,
+                quant_config=quant_config,
+                layer_idx=layer_idx,
+                prefix=f"{prefix}.mlp",
+                alt_stream=self.alt_stream,
+            )
+            self.mlp = self.block_sparse_moe
+        else:
+            self.mlp = KimiMLP(
+                hidden_size=self.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=f"{prefix}.mlp",
+            )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+        zero_allocator: BumpAllocator,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+
+        hidden_states = self.self_attn(
+            hidden_states=hidden_states,
+            positions=positions,
+            forward_batch=forward_batch,
+            zero_allocator=zero_allocator,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+class KimiLinearModel(nn.Module):
+    def __init__(
+        self,
+        config: KimiLinearConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.config = config
+
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.pp_group = get_pp_group()
+
+        if self.pp_group.is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                prefix=f"{prefix}.embed_tokens",
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        self.alt_stream = torch.cuda.Stream()
+
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: KimiDecoderLayer(
+                layer_idx=idx,
+                config=config,
+                quant_config=quant_config,
+                prefix=prefix,
+                alt_stream=self.alt_stream,
+            ),
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix=f"{prefix}.layers",
+        )
+
+        if self.pp_group.is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer()
+
+        world_size = get_tensor_model_parallel_world_size()
+        assert (
+            config.num_attention_heads % world_size == 0
+        ), "num_attention_heads must be divisible by world_size"
+
+    def forward(
+        self,
+        input_ids: torch.Tensor | None,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        inputs_embeds: torch.Tensor | None = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> torch.Tensor:
+        if get_pp_group().is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds
+            else:
+                hidden_states = self.embed_tokens(input_ids)
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+
+        total_num_layers = self.end_layer - self.start_layer
+        device = hidden_states.device
+        zero_allocator = BumpAllocator(
+            buffer_size=total_num_layers * 2,
+            dtype=torch.float32,
+            device=device,
+        )
+        # TODO: capture aux hidden states
+        aux_hidden_states = []
+        for i in range(self.start_layer, self.end_layer):
+            ctx = get_global_expert_distribution_recorder().with_current_layer(i)
+            with ctx:
+                layer = self.layers[i]
+                hidden_states, residual = layer(
+                    positions=positions,
+                    hidden_states=hidden_states,
+                    forward_batch=forward_batch,
+                    residual=residual,
+                    zero_allocator=zero_allocator,
+                )
+
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors(
+                {
+                    "hidden_states": hidden_states,
+                    "residual": residual,
+                }
+            )
+        else:
+            if hidden_states.shape[0] != 0:
+                if residual is None:
+                    hidden_states = self.norm(hidden_states)
+                else:
+                    hidden_states, _ = self.norm(hidden_states, residual)
+
+        if len(aux_hidden_states) == 0:
+            return hidden_states
+
+        return hidden_states, aux_hidden_states
+
+
+class KimiLinearForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config: KimiLinearConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = KimiLinearModel(
+            config, quant_config, prefix=maybe_prefix(prefix, "model")
+        )
+        self.pp_group = get_pp_group()
+        if self.pp_group.is_last_rank:
+            self.lm_head = ParallelLMHead(
+                self.config.vocab_size,
+                self.config.hidden_size,
+                quant_config=quant_config,
+                prefix=maybe_prefix(prefix, "lm_head"),
+            )
+        else:
+            self.lm_head = PPMissingLayer()
+        logit_scale = getattr(self.config, "logit_scale", 1.0)
+        self.logits_processor = LogitsProcessor(config=config, logit_scale=logit_scale)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            inputs_embeds,
+            pp_proxy_tensors,
+        )
+        if self.pp_group.is_last_rank:
+            return self.logits_processor(
+                input_ids, hidden_states, self.lm_head, forward_batch
+            )
+        else:
+            return hidden_states
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+            # Fused path
+            (".fused_qkvbfg_a_proj", ".q_proj", 0),
+            (".fused_qkvbfg_a_proj", ".k_proj", 1),
+            (".fused_qkvbfg_a_proj", ".v_proj", 2),
+            (".fused_qkvbfg_a_proj", ".b_proj", 3),
+            (".fused_qkvbfg_a_proj", ".f_a_proj", 4),
+            (".fused_qkvbfg_a_proj", ".g_a_proj", 5),
+            (".fused_fg_b_proj", ".f_b_proj", 0),
+            (".fused_fg_b_proj", ".g_b_proj", 1),
+            # Unfused path: separate qkv_proj (when do_fuse_qkvbfg=False)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            # qkv conv fuse
+            (".qkv_conv1d", ".q_conv1d", 0),
+            (".qkv_conv1d", ".k_conv1d", 1),
+            (".qkv_conv1d", ".v_conv1d", 2),
+        ]
+        if self.config.is_moe:
+            # Params for weights, fp8 weight scales, fp8 activation scales
+            # (param_name, weight_name, expert_id, shard_id)
+            expert_params_mapping = FusedMoE.make_expert_params_mapping(
+                ckpt_gate_proj_name="w1",
+                ckpt_down_proj_name="w2",
+                ckpt_up_proj_name="w3",
+                num_experts=self.config.num_experts,
+            )
+        else:
+            expert_params_mapping = []
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        for args in weights:
+            name, loaded_weight = args[:2]
+            kwargs = args[2] if len(args) > 2 else {}
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if ("mlp.experts." in name) and name not in params_dict:
+                    continue
+                # Check if this mapping targets a fused projection (only apply fusion check to fused params)
+                if param_name in {".fused_qkvbfg_a_proj", ".fused_fg_b_proj"}:
+                    layer_id = int(name.split(".")[2])
+                    if not self.config.is_kda_layer(layer_id):
+                        continue
+                    layer = self.model.layers[layer_id].self_attn
+                    # Only load to fused projection if fusion is enabled
+                    if not getattr(layer, "do_fuse_qkvbfg", False):
+                        continue
+                if weight_name in {".q_proj", ".k_proj", ".v_proj"}:
+                    layer_id = int(name.split(".")[2])
+                    if not self.config.is_kda_layer(layer_id):
+                        continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # if is_pp_missing_parameter(name, self):
+                #     continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for idx, (param_name, weight_name, expert_id, shard_id) in enumerate(
+                    expert_params_mapping
+                ):
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    # if is_pp_missing_parameter(name, self):
+                    #     continue
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        expert_id=expert_id,
+                        shard_id=shard_id,
+                    )
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if (
+                        name.endswith(".bias")
+                        and name not in params_dict
+                        and not self.config.is_linear_attn
+                    ):  # noqa: E501
+                        continue
+                    # Remapping the name of FP8 kv-scale.
+                    name = maybe_remap_kv_scale_name(name, params_dict)
+                    if name is None:
+                        continue
+                    # if is_pp_missing_parameter(name, self):
+                    #     continue
+
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight, **kwargs)
+            loaded_params.add(name)
+
+        for layer_id in self.config.full_attention_layer_ids:
+            self_attn = self.model.layers[layer_id].self_attn
+            w_kc, w_vc = self_attn.kv_b_proj.weight.unflatten(
+                0, (-1, self_attn.qk_nope_head_dim + self_attn.v_head_dim)
+            ).split([self_attn.qk_nope_head_dim, self_attn.v_head_dim], dim=1)
+            self_attn.w_kc = w_kc.transpose(1, 2).contiguous().transpose(1, 2)
+            self_attn.w_vc = w_vc.contiguous().transpose(1, 2)
+            if hasattr(self_attn.kv_b_proj, "weight_scale"):
+                self_attn.w_scale = self_attn.kv_b_proj.weight_scale
+
+
+EntryClass = KimiLinearForCausalLM
diff --git a/sglang/python/sglang/srt/models/kimi_vl.py b/sglang/python/sglang/srt/models/kimi_vl.py
new file mode 100644
index 0000000000000000000000000000000000000000..e54ec7f382279c2f8866cbdff00585b277fa0f30
--- /dev/null
+++ b/sglang/python/sglang/srt/models/kimi_vl.py
@@ -0,0 +1,314 @@
+# SPDX-License-Identifier: Apache-2.0
+# ruff: noqa: E501
+# Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/modeling_kimi_vl.py
+# Copyright 2025 The Moonshot AI Team, DeepSeek-AI, and HuggingFace Inc. team. All rights reserved.
+#
+# The code is based on llava (llava/modeling_llava.py) and DeepSeek-V3 (DeepSeek-V3/modeling_deepseek.py), but modified for KimiVL.
+#
+# Licensing Information:
+# - Code derived from llava (llava/modeling_llava.py) and DeepSeek-V3 (DeepSeek-V3/modeling_deepseek.py) is licensed under the Apache License, Version 2.0.
+# - Other parts of the code are licensed under the MIT License.
+#
+# Apache License, Version 2.0:
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# MIT License:
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import copy
+import logging
+from dataclasses import dataclass
+from typing import Iterable, List, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers.activations import GELUActivation
+
+from sglang.srt.configs import KimiVLConfig
+from sglang.srt.configs.deepseekvl2 import DeepseekV2Config
+from sglang.srt.configs.kimi_vl import KimiVLConfig
+from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
+from sglang.srt.layers.activation import QuickGELU
+from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternMultimodalTokens,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.models.deepseek_v2 import DeepseekV2ForCausalLM
+from sglang.srt.models.kimi_vl_moonvit import MoonVitPretrainedModel
+from sglang.srt.utils import add_prefix
+
+logger = logging.getLogger(__name__)
+
+
+# For dummy input only
+@dataclass
+class MaxImageTokenMeta:
+    width: int = 1024
+    height: int = 1024
+
+
+class KimiVLMultiModalProjector(nn.Module):
+
+    def __init__(self, config: KimiVLConfig):
+        super().__init__()
+
+        self.hidden_size = (
+            config.vision_config.hidden_size
+            * config.vision_config.merge_kernel_size[0]
+            * config.vision_config.merge_kernel_size[1]
+        )
+
+        self.pre_norm = torch.nn.LayerNorm(config.vision_config.hidden_size, eps=1e-5)
+        self.linear_1 = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
+        self.act = GELUActivation()
+        self.act = QuickGELU()
+        self.linear_2 = nn.Linear(
+            self.hidden_size, config.text_config.hidden_size, bias=True
+        )
+
+    def forward(self, image_features: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.pre_norm(image_features).view(-1, self.hidden_size)
+        hidden_states = self.linear_1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+        return hidden_states
+
+
+class KimiVLForConditionalGeneration(nn.Module):
+    def __init__(
+        self,
+        config: KimiVLConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        **kwargs,  # fix init_tts argument error
+    ) -> None:
+        super().__init__()
+        self.config = config
+        assert isinstance(config.vision_config, MoonViTConfig)
+
+        self.vision_tower = MoonVitPretrainedModel(config.vision_config)
+
+        self.multi_modal_projector = KimiVLMultiModalProjector(config=config)
+        self.quant_config = quant_config
+        text_config = copy.deepcopy(config.text_config)
+        text_config.architectures = ["DeepseekV2ForCausalLM"]
+        self.language_model = DeepseekV2ForCausalLM(
+            config=text_config,
+            quant_config=quant_config,
+            prefix=add_prefix("language_model", prefix),
+        )
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        pixel_values = (
+            torch.cat([item.feature for item in items], dim=0)
+            .type(self.vision_tower.dtype)
+            .to(self.vision_tower.device)
+        )
+
+        if (
+            pixel_values.dim() == 2
+            and pixel_values.shape[-1] == self.config.text_config.hidden_size
+        ):
+            return pixel_values
+
+        image_grid_hws = torch.cat([item.image_grid_hws for item in items], dim=0).to(
+            self.vision_tower.device
+        )
+        image_features = self.vision_tower(pixel_values, image_grid_hws)
+        assert isinstance(image_features, list)
+        # lengths = [x.shape[0] for x in image_features]
+        res = self.multi_modal_projector(torch.cat(image_features))  # .split(lengths)
+        return res
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+        return pattern.pad_input_tokens(input_ids, mm_inputs)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        get_embedding: bool = False,
+    ):
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.language_model,
+            data_embedding_funcs={
+                Modality.IMAGE: self.get_image_feature,
+            },
+            positions=positions,
+        )
+
+        return hidden_states
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        config = self.config.text_config
+        _KEYS_TO_MODIFY_MAPPING = {
+            # "language_model.lm_head": "lm_head",
+            # "language_model.model": "language_model",
+        }
+        # only doing this for language model part for now.
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+        if not config.use_mla:
+            stacked_params_mapping += [
+                (".qkv_proj", ".q_proj", "q"),
+                (".qkv_proj", ".k_proj", "k"),
+                (".qkv_proj", ".v_proj", "v"),
+            ]
+        if getattr(config, "n_routed_experts", None):
+            # Params for weights, fp8 weight scales, fp8 activation scales
+            # (param_name, weight_name, expert_id, shard_id)
+            expert_params_mapping = FusedMoE.make_expert_params_mapping(
+                ckpt_gate_proj_name="gate_proj",
+                ckpt_down_proj_name="down_proj",
+                ckpt_up_proj_name="up_proj",
+                num_experts=config.n_routed_experts,
+            )
+        else:
+            expert_params_mapping = []
+
+        params_dict = dict(self.named_parameters())
+        for args in weights:
+            name, loaded_weight = args[:2]
+            kwargs = args[2] if len(args) > 2 else {}
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            spec_layer = get_spec_layer_idx_from_weight_name(config, name)
+            if spec_layer is not None:
+                continue  # skip spec decode layers for main model
+
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items():
+                if key_to_modify in name:
+                    name = name.replace(key_to_modify, new_key)
+            use_default_weight_loading = False
+            if "vision" in name:
+                if self.vision_tower is not None:
+                    # We only do sharding for language model and
+                    # not vision model for now.
+                    use_default_weight_loading = True
+            else:
+                for param_name, weight_name, shard_id in stacked_params_mapping:
+                    if weight_name not in name:
+                        continue
+                    # We have mlp.experts[0].gate_proj in the checkpoint.
+                    # Since we handle the experts below in expert_params_mapping,
+                    # we need to skip here BEFORE we update the name, otherwise
+                    # name will be updated to mlp.experts[0].gate_up_proj, which
+                    # will then be updated below in expert_params_mapping
+                    # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                    if ("mlp.experts." in name) and name not in params_dict:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(param, loaded_weight, shard_id, **kwargs)
+                    break
+                else:
+                    for idx, (
+                        param_name,
+                        weight_name,
+                        expert_id,
+                        shard_id,
+                    ) in enumerate(expert_params_mapping):
+                        if weight_name not in name:
+                            continue
+                        name = name.replace(weight_name, param_name)
+
+                        param = params_dict[name]
+                        weight_loader = param.weight_loader
+                        weight_loader(
+                            param,
+                            loaded_weight,
+                            name,
+                            expert_id=expert_id,
+                            shard_id=shard_id,
+                            **kwargs,
+                        )
+                        break
+                    else:
+                        use_default_weight_loading = True
+            if use_default_weight_loading:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Remapping the name of FP8 kv-scale.
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+
+                # if is_pp_missing_parameter(name, self):
+                #     continue
+
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight, **kwargs)
+        self.language_model.post_load_weights()
+
+
+def get_spec_layer_idx_from_weight_name(
+    config: DeepseekV2Config, weight_name: str
+) -> Optional[int]:
+    if hasattr(config, "num_nextn_predict_layers") and (
+        config.num_nextn_predict_layers > 0
+    ):
+        layer_idx = config.num_hidden_layers
+        for i in range(config.num_nextn_predict_layers):
+            if weight_name.startswith(f"model.layers.{layer_idx+i}."):
+                return layer_idx + i
+    return None
+
+
+EntryClass = [KimiVLForConditionalGeneration]
diff --git a/sglang/python/sglang/srt/models/kimi_vl_moonvit.py b/sglang/python/sglang/srt/models/kimi_vl_moonvit.py
new file mode 100644
index 0000000000000000000000000000000000000000..286e857722d24d0ca2905a3da6ab934567864ef6
--- /dev/null
+++ b/sglang/python/sglang/srt/models/kimi_vl_moonvit.py
@@ -0,0 +1,641 @@
+# SPDX-License-Identifier: Apache-2.0
+# ruff: noqa: E501
+# Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/modeling_kimi_vl.py
+# This file is meant to be used in kimi_vl.py only
+# Copyright 2025 The Moonshot AI Team, DeepSeek-AI, and HuggingFace Inc. team. All rights reserved.
+#
+# The code is based on llava (llava/modeling_llava.py) and DeepSeek-V3 (DeepSeek-V3/modeling_deepseek.py), but modified for KimiVL.
+#
+# Licensing Information:
+# - Code derived from llava (llava/modeling_llava.py) and DeepSeek-V3 (DeepSeek-V3/modeling_deepseek.py) is licensed under the Apache License, Version 2.0.
+# - Other parts of the code are licensed under the MIT License.
+#
+# Apache License, Version 2.0:
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# MIT License:
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+import math
+from copy import deepcopy
+from functools import cached_property
+from typing import List, Optional, Sequence, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers.activations import ACT2FN
+from transformers.modeling_utils import PreTrainedModel
+
+try:
+    from flash_attn.flash_attn_interface import flash_attn_varlen_func
+except ImportError:
+    flash_attn_varlen_func = None
+
+from sglang.srt.configs import MoonViTConfig
+
+
+def multihead_attention(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    q_cu_seqlens: Optional[torch.Tensor] = None,
+    k_cu_seqlens: Optional[torch.Tensor] = None,
+):
+    """Multi-head attention using flash attention 2.
+    This function is used to handle the case where the query, key, and value are packed.
+    Args:
+        q, k, v: tensor of shape (tot_seqlens, num_heads, head_dim).
+        q_cu_seqlens (torch.Tensor): cumulative sequence lengths of q.
+            The first element should be 0 and the last element should be q.shape[0].
+        k_cu_seqlens (torch.Tensor): cumulative sequence lengths of k.
+            The first element should be 0 and the last element should be k.shape[0].
+
+    Returns:
+        output: shape (batch_size, seqlen, dim) or (tot_seqlens, dim) if packing,
+            where dim = num_heads * head_dim
+    """
+    if flash_attn_varlen_func is None:
+        raise ImportError(
+            "flash_attn is not installed, this function needs flash_attn_varlen_func from flash_attn"
+        )
+    # Unified format legal check
+    assert q.dim() == k.dim() == v.dim() == 3, "q, k, v must have 3 dims"
+    assert q_cu_seqlens[-1] == q.shape[0], "q_cu_seqlens must sum to q.shape[0]"
+    assert (
+        k_cu_seqlens[-1] == k.shape[0] == v.shape[0]
+    ), "k_cu_seqlens must sum to k.shape[0]"
+    assert q.dtype in [
+        torch.bfloat16,
+        torch.float16,
+    ], f"unsupported dtype {q.dtype} for multihead attn"
+
+    max_seqlen_q = (q_cu_seqlens[1:] - q_cu_seqlens[:-1]).max().item()
+    max_seqlen_k = (k_cu_seqlens[1:] - k_cu_seqlens[:-1]).max().item()
+    attn_out = flash_attn_varlen_func(
+        q,
+        k,
+        v,
+        q_cu_seqlens,
+        k_cu_seqlens,
+        max_seqlen_q,
+        max_seqlen_k,
+        causal=False,
+    )
+    attn_out = attn_out.flatten(start_dim=-2)
+
+    return attn_out
+
+
+def sdpa_attention(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    q_cu_seqlens: Optional[torch.Tensor] = None,
+    k_cu_seqlens: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    """Multi-head attention using torch scaled dot product attention.
+    This function is used to handle the case where the query, key, and value are packed.
+    Args:
+        q, k, v: tensor of shape (tot_seqlens, num_heads, head_dim).
+        q_cu_seqlens (torch.Tensor): cumulative sequence lengths of q.
+            The first element should be 0 and the last element should be q.shape[0].
+        k_cu_seqlens (torch.Tensor): cumulative sequence lengths of k.
+            The first element should be 0 and the last element should be k.shape[0].
+
+    Returns:
+        output: shape (batch_size, seqlen, dim) or (tot_seqlens, dim) if packing,
+            where dim = num_heads * head_dim
+    """
+    # Unified format legal check
+    assert q.dim() == k.dim() == v.dim() == 3, "q, k, v must have 3 dims"
+    assert q_cu_seqlens[-1] == q.shape[0], "q_cu_seqlens must sum to q.shape[0]"
+    seq_length = q.shape[0]
+    attention_mask = torch.zeros(
+        [1, seq_length, seq_length], device=q.device, dtype=torch.bool
+    )
+    for i in range(1, len(q_cu_seqlens)):
+        attention_mask[
+            ...,
+            q_cu_seqlens[i - 1] : q_cu_seqlens[i],
+            q_cu_seqlens[i - 1] : q_cu_seqlens[i],
+        ] = True
+    q = q.transpose(0, 1)
+    k = k.transpose(0, 1)
+    v = v.transpose(0, 1)
+    attn_output = F.scaled_dot_product_attention(q, k, v, attention_mask, dropout_p=0.0)
+    attn_output = attn_output.transpose(0, 1)
+    attn_output = attn_output.reshape(seq_length, -1)
+    return attn_output
+
+
+VL_VISION_ATTENTION_FUNCTIONS = {
+    "flash_attention_2": multihead_attention,
+    "sdpa": sdpa_attention,
+}
+
+
+def _apply_rope_input_validation(x, freqs_cis):
+    assert x.ndim == freqs_cis.ndim + 1, (x.shape, freqs_cis.shape)
+    assert x.shape[:-2] == freqs_cis.shape[:-1], (x.shape, freqs_cis.shape)
+    assert x.shape[-1] == 2 * freqs_cis.shape[-1], (x.shape, freqs_cis.shape)
+    assert freqs_cis.dtype == torch.complex64, freqs_cis.dtype
+
+
+def apply_rope(
+    xq: torch.Tensor, xk: torch.Tensor, freqs_cis: torch.Tensor
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Args: (The leading dimensions of all inputs should be the same)
+        xq: query, tensor of shape (..., num_heads, head_dim)
+        xk: key, tensor of shape (..., num_heads, head_dim)
+        freqs_cis: tensor of shape (..., head_dim/2), dtype=torch.complex64. It contains the precomputed cis(freqs) for each position in the 2D grid.
+    Returns:
+        xq_out, xk_out: tensors of shape (..., num_heads, head_dim)
+    """
+    _apply_rope_input_validation(xq, freqs_cis)
+    _apply_rope_input_validation(xk, freqs_cis)
+
+    freqs_cis = freqs_cis.unsqueeze(-2)  # ..., 1, head_dim/2
+    # ..., num_heads, head_dim/2
+    xq_ = torch.view_as_complex(xq.float().view(*xq.shape[:-1], -1, 2))
+    xk_ = torch.view_as_complex(xk.float().view(*xq.shape[:-1], -1, 2))
+    xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(-2)  # ..., num_heads, head_dim
+    xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(-2)  # ..., num_heads, head_dim
+    return xq_out.type_as(xq), xk_out.type_as(xk)
+
+
+class Learnable2DInterpPosEmb(nn.Module):
+
+    def __init__(
+        self, height: int, width: int, dim: int, interpolation_mode: str = "bicubic"
+    ) -> None:
+        super().__init__()
+        self.height = height
+        self.width = width
+        self.interpolation_mode = interpolation_mode
+        self.weight = nn.Parameter(torch.empty(height, width, dim))
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        nn.init.normal_(self.weight)
+
+    def forward(self, x: torch.Tensor, grid_hws: torch.Tensor) -> torch.Tensor:
+        pos_embs = []
+        for shape in grid_hws.tolist():
+            if shape == self.weight.shape[:-1]:
+                pos_embs.append(self.weight.flatten(end_dim=1))
+            else:
+                pos_embs.append(
+                    F.interpolate(
+                        self.weight.permute((2, 0, 1)).unsqueeze(0),
+                        size=shape,
+                        mode=self.interpolation_mode,
+                    )
+                    .squeeze(0)
+                    .permute((1, 2, 0))
+                    .flatten(end_dim=1)
+                )
+        out = x + torch.cat(pos_embs)
+        return out
+
+
+class MoonVisionPatchEmbed(nn.Module):
+
+    def __init__(
+        self,
+        out_dim: int,
+        in_dim: int = 3,
+        patch_size: Union[int, Tuple[int, int]] = (14, 14),
+        pos_emb_height: int = 14,
+        pos_emb_width: int = 14,
+    ):
+        super().__init__()
+        assert isinstance(
+            patch_size, (int, Sequence)
+        ), f"Invalid patch_size type: {type(patch_size)}"
+        if isinstance(patch_size, int):
+            patch_size = (patch_size, patch_size)
+        assert (
+            len(patch_size) == 2
+        ), f"Expected patch_size to be a tuple of 2, got {patch_size}"
+        self.patch_size = patch_size
+
+        self.proj = nn.Conv2d(
+            in_dim, out_dim, kernel_size=patch_size, stride=patch_size
+        )
+
+        self.pos_emb = Learnable2DInterpPosEmb(
+            height=pos_emb_height, width=pos_emb_width, dim=out_dim
+        )
+
+    def forward(self, x: torch.Tensor, grid_hw: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            x (L, Channels): input tensor
+            grid_hw (N, 2): grid height and width
+
+        Returns:
+            (L, Cout) tensor
+        """
+        x = self.proj(x).view(x.size(0), -1)
+        # apply positional embedding
+        x = self.pos_emb(x, grid_hw)
+        return x
+
+
+class Rope2DPosEmb(nn.Module):
+    """2D rotary position embedding with multi-resolution support.
+
+    This class is intended to be used in the following way:
+    1. Before training, create an instance of Rope2DPosEmb. This instance will hold the precomputed cis.
+    2. Before each forward pass, call `get_freqs_cis_by_*` to get the `freqs_cis` tensor for this iteration.
+    3. During the forward pass, pass the `freqs_cis` tensor to each attention layer, and call `apply` just before each attention operation.
+        The rope is shared across all attention layers and all heads.
+
+    Refs:
+    - RoFormer: https://arxiv.org/abs/2104.09864
+    - VisionLLaMA: https://arxiv.org/abs/2403.00522
+    - https://github.com/Meituan-AutoML/VisionLLaMA/blob/main/dit/models.py
+
+    Args:
+        dim (int): usually the multi-head attention dimension, should be divisible by 4 (TODO: relax this constraint if needed)
+        max_height (int): the maximum height of the 2D grid
+        max_width (int): the maximum width of the 2D grid
+        theta_base (float): the base of the theta
+        device (str): the device to store the precomputed cis
+    """
+
+    def __init__(
+        self, dim: int, max_height: int, max_width: int, theta_base=10000, device="cuda"
+    ):
+        super().__init__()
+        self.dim = dim
+        assert self.dim % 4 == 0, "dim must be divisible by 4"
+        self.max_height = max_height
+        self.max_width = max_width
+        self.theta_base = theta_base
+        self.device = device
+
+    def extra_repr(self):
+        return f"dim={self.dim}, max_height={self.max_height}, max_width={self.max_width}, theta_base={self.theta_base}"
+
+    @cached_property
+    def precomputed_freqs_cis(self) -> torch.Tensor:
+        """Calculate the cis(freqs) for each position in the 2D grid.
+
+        Return: complex tensor of shape (max_height, max_width, dim//2) and value:
+            height axis: ret[h, w, 2*i] = cis(h * theta_base**(-4*i/dim))
+            weight axis: ret[h, w, 2*i+1] = cis(w * theta_base**(-4*i/dim))   with (i in [0, dim//4))
+            note: `cis` is a mathematical notation defined by cis x = cos x + i sin x,
+        """
+        N = self.max_height * self.max_width
+        flat_pos = torch.arange(0, N).float().to(self.device)
+        x_pos = flat_pos % self.max_width
+        y_pos = flat_pos // self.max_width
+        dim_range = (
+            torch.arange(0, self.dim, 4)[: (self.dim // 4)].float().to(self.device)
+        )  # C/4
+        freqs = 1.0 / (self.theta_base ** (dim_range / self.dim))
+        x_freqs = torch.outer(x_pos, freqs).float()  # N, C/4
+        y_freqs = torch.outer(y_pos, freqs).float()  # N, C/4
+        x_cis = torch.polar(torch.ones_like(x_freqs), x_freqs)  # N, C/4
+        y_cis = torch.polar(torch.ones_like(y_freqs), y_freqs)  # N, C/4
+        # N, C/4, 2
+        freqs_cis = torch.cat(
+            [x_cis.unsqueeze(dim=-1), y_cis.unsqueeze(dim=-1)], dim=-1
+        )
+        # max_height, max_width, C/2
+        freqs_cis = freqs_cis.reshape(self.max_height, self.max_width, -1)
+        return freqs_cis
+
+    def get_freqs_cis_by_seqlens(self, grid_hws: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            grid_hws (torch.Tensor): containing list of (height, width) or (t, height, width) tuples.
+        Returns:
+            freqs_cis: tensor of shape (sum(t * height * width), dim//2)
+        """
+        shapes = grid_hws.tolist()
+        assert all(
+            1 <= h <= self.max_height and 1 <= w <= self.max_width for h, w in shapes
+        ), (
+            shapes,
+            self.max_height,
+            self.max_width,
+        )
+        freqs_cis = torch.cat(
+            [
+                self.precomputed_freqs_cis[:h, :w].reshape(-1, self.dim // 2)
+                for h, w in shapes
+            ],
+            dim=0,
+        )
+        return freqs_cis
+
+    def get_freqs_cis_by_idx(
+        self, pos_idx: torch.Tensor, pos_idx_mask: torch.Tensor
+    ) -> torch.Tensor:
+        """
+        Args:
+            pos_idx: tensor of shape (..., 2), It contains the (h, w) position indices of each 2D token.
+            pos_idx_mask: a mask of shape (...), the leading dimensions should be the same as pos_idx.
+                Rope will only be applied to the tokens with True mask. `freqs_cis` for the tokens with False mask with be ones.
+        Return:
+            freqs_cis: tensor of shape (..., dim//2)
+        """
+        assert (
+            pos_idx.shape[:-1] == pos_idx_mask.shape
+            and pos_idx.shape[-1] == 2
+            and pos_idx.ndim == pos_idx_mask.ndim + 1
+        ), (pos_idx.shape, pos_idx_mask.shape)
+        assert pos_idx_mask.dtype == torch.bool, pos_idx_mask.dtype
+
+        shp = pos_idx_mask.shape + (self.dim // 2,)  # ..., head_dim/2
+        freqs_cis = torch.ones(
+            shp, dtype=torch.complex64, device=self.device
+        )  # ..., head_dim/2
+        freqs_cis[pos_idx_mask] = self.precomputed_freqs_cis[
+            pos_idx[..., 0][pos_idx_mask], pos_idx[..., 1][pos_idx_mask]
+        ]
+        return freqs_cis
+
+
+class MLP2(nn.Module):
+    """
+    Args:
+        dims: [in_dim, hidden_dim, out_dim]
+        bias: whether to use bias in linear layer.
+    """
+
+    def __init__(self, dims: list[int], activation, bias=True):
+        super().__init__()
+        assert len(dims) == 3
+        self.fc0 = nn.Linear(dims[0], dims[1], bias=bias)
+        self.fc1 = nn.Linear(dims[1], dims[2], bias=bias)
+        self.activation = activation
+        for m in [self.fc0, self.fc1]:
+            nn.init.trunc_normal_(m.weight, std=math.sqrt(2 / m.in_features))
+            if m.bias is not None:
+                nn.init.zeros_(m.bias)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.fc0(x)
+        x = self.activation(x)
+        return self.fc1(x)
+
+
+class MoonVitEncoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        num_heads: int,
+        hidden_dim: int,
+        mlp_dim: int,
+        *,
+        attn_implementation: str = "flash_attention_2",  # use fa2 in sglang by default
+        activation=F.gelu,
+        attn_bias: bool = False,
+    ):
+        super().__init__()
+        self.num_heads = num_heads
+        self.hidden_dim = hidden_dim
+        self.hidden_size_per_attention_head = self.hidden_dim // self.num_heads
+        self.attn_implementation = attn_implementation
+
+        self.norm0 = nn.LayerNorm(hidden_dim)
+        self.norm1 = nn.LayerNorm(hidden_dim)
+        self.mlp = MLP2([hidden_dim, mlp_dim, hidden_dim], activation)
+        self.wqkv = nn.Linear(hidden_dim, hidden_dim * 3, bias=attn_bias)
+        self.wo = nn.Linear(hidden_dim, hidden_dim, bias=attn_bias)
+
+    def attention_qkvpacked(
+        self,
+        x: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rope_freqs_cis: Optional[torch.Tensor] = None,
+    ):
+        """
+        Args:
+            x (torch.Tensor): (batch_size, seqlen, hidden_dim)
+            cu_seqlens (torch.Tensor):
+        """
+        xqkv = self.wqkv(x)
+
+        qkv_shape = xqkv.size()[:-1] + (
+            3,
+            self.num_heads,
+            self.hidden_size_per_attention_head,
+        )
+        # xqkv: (batch_size, seqlen, 3, nheads, headdim)
+        xqkv = xqkv.view(*qkv_shape)
+        xq, xk, xv = torch.unbind(xqkv, dim=-3)
+
+        xq, xk = apply_rope(xq, xk, rope_freqs_cis)
+
+        attn_func = VL_VISION_ATTENTION_FUNCTIONS[self.attn_implementation]
+        attn_out = attn_func(
+            xq, xk, xv, q_cu_seqlens=cu_seqlens, k_cu_seqlens=cu_seqlens
+        )
+
+        attn_out = self.wo(attn_out)
+        return attn_out
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rope_freqs_cis: Union[torch.Tensor, None] = None,
+    ) -> torch.Tensor:
+        """
+        Args:
+            hidden_states: non-packed (B, N, D) or packed (L, D). if non-packed, seqlens should be None, if packed, seqlens should be set
+
+        Returns:
+            output: same shape of input, non-packed (B, N, D) for non-packed input, (L, D) for packed input
+        """
+        residual = hidden_states
+        hidden_states = self.norm0(hidden_states)
+        attn_out = self.attention_qkvpacked(
+            hidden_states, cu_seqlens, rope_freqs_cis=rope_freqs_cis
+        )
+        hidden_states = residual + attn_out
+
+        residual = hidden_states
+        hidden_states = self.mlp(self.norm1(hidden_states))
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+class MoonVitEncoder(nn.Module):
+
+    def __init__(
+        self,
+        hidden_dim: int,
+        num_layers: int,
+        block_cfg: dict,
+    ) -> None:
+        super().__init__()
+
+        self.rope_2d = Rope2DPosEmb(
+            block_cfg["hidden_dim"] // block_cfg["num_heads"], 512, 512
+        )
+        self.blocks = nn.ModuleList(
+            [MoonVitEncoderLayer(**block_cfg) for _ in range(num_layers)]
+        )
+        self.final_layernorm = nn.LayerNorm(hidden_dim)
+
+    def forward(
+        self, hidden_states: torch.Tensor, grid_hw: torch.Tensor
+    ) -> torch.Tensor:
+        rope_freqs_cis = self.rope_2d.get_freqs_cis_by_seqlens(grid_hws=grid_hw)
+
+        lengths = torch.cat(
+            (
+                torch.zeros(1, device=hidden_states.device, dtype=grid_hw.dtype),
+                grid_hw[:, 0] * grid_hw[:, 1],
+            )
+        )
+        cu_seqlens = lengths.cumsum(dim=0, dtype=torch.int32)
+
+        for _, block in enumerate(self.blocks):
+            hidden_states = block(
+                hidden_states, cu_seqlens, rope_freqs_cis=rope_freqs_cis
+            )
+
+        hidden_states = self.final_layernorm(hidden_states)
+
+        return hidden_states
+
+
+def patch_merger(
+    x: torch.Tensor,
+    grid_hw: torch.Tensor,
+    merge_kernel_size: list[int, int] = (2, 2),
+) -> List[torch.Tensor]:
+    d_model = x.size(-1)
+
+    outputs = []
+    pre_sum = 0
+    for x_shape in grid_hw.tolist():
+        height, width = x_shape[0], x_shape[1]
+        # Get the current sequence
+        seq = x[pre_sum : pre_sum + height * width]
+        # Reshape along self.merge_kernel_size and concat to the last dimension
+        kernel_height, kernel_width = merge_kernel_size
+        new_height, new_width = height // kernel_height, width // kernel_width
+        reshaped_seq = seq.view(
+            new_height, kernel_height, new_width, kernel_width, d_model
+        )
+        reshaped_seq = reshaped_seq.permute(0, 2, 1, 3, 4).contiguous()
+        padded_seq = reshaped_seq.view(
+            new_height * new_width, kernel_height * kernel_width, -1
+        )
+        outputs.append(padded_seq)
+        pre_sum += height * width
+
+    return outputs
+
+
+class MoonVitVLProjector(nn.Module):
+
+    def __init__(
+        self,
+        in_channels: int,
+        merge_kernel_size: list[int, int],
+        hidden_act: str = "gelu",
+        ln_eps: float = 1e-5,
+        out_dim: int = 4096,
+    ):
+        super().__init__()
+        self.hidden_size = in_channels * merge_kernel_size[0] * merge_kernel_size[1]
+
+        self.pre_norm = nn.nn.LayerNorm(in_channels, eps=ln_eps)
+        self.linear_1 = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
+        self.act = ACT2FN[hidden_act]
+        self.linear_2 = nn.Linear(self.hidden_size, out_dim, bias=True)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.pre_norm(hidden_states).view(-1, self.hidden_size)
+        hidden_states = self.linear_1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+        return hidden_states
+
+
+class MoonVitPretrainedModel(PreTrainedModel):
+    config_class = MoonViTConfig
+    model_type = "moonvit"
+    _no_split_modules = ["PackingTransformer"]
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
+
+    def __init__(self, config: MoonViTConfig, *inputs, **kwargs):
+        from transformers.activations import GELUTanh
+
+        super().__init__(config, *inputs, **kwargs)
+        config = deepcopy(config)
+        self.merge_kernel_size = config.merge_kernel_size
+        self.patch_size = config.patch_size
+        self.patch_embed = MoonVisionPatchEmbed(
+            out_dim=config.hidden_size,
+            patch_size=config.patch_size,
+            pos_emb_height=config.init_pos_emb_height,
+            pos_emb_width=config.init_pos_emb_width,
+        )
+
+        self.encoder = MoonVitEncoder(
+            hidden_dim=config.hidden_size,
+            num_layers=config.num_hidden_layers,
+            block_cfg={
+                "num_heads": config.num_attention_heads,
+                "hidden_dim": config.hidden_size,
+                "mlp_dim": config.intermediate_size,
+                "activation": GELUTanh(),
+                "attn_bias": True,
+                "attn_implementation": config._attn_implementation,
+            },
+        )
+
+    def forward(
+        self, pixel_values: torch.Tensor, grid_hw: torch.Tensor
+    ) -> torch.Tensor:
+        """
+        Args:
+            pixel_values (torch.Tensor): The input pixel values.
+            grid_hw (torch.Tensor): The grid height and width.
+
+        Returns:
+            torch.Tensor: The output tokens.
+        """
+        hidden_states = self.patch_embed(pixel_values, grid_hw)
+        hidden_states = self.encoder(hidden_states, grid_hw)
+        hidden_states = patch_merger(
+            hidden_states, grid_hw, merge_kernel_size=self.merge_kernel_size
+        )
+        return hidden_states
diff --git a/sglang/python/sglang/srt/models/lfm2.py b/sglang/python/sglang/srt/models/lfm2.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a271c33606ed4869253d563664b86377e246428
--- /dev/null
+++ b/sglang/python/sglang/srt/models/lfm2.py
@@ -0,0 +1,554 @@
+"""
+LFM2 (Liquid Foundation Model 2) implementation for SGLang.
+
+This is a hybrid architecture with both attention and short conv layers.
+- Attention layers use standard KV cache (RadixAttention)
+- Conv layers use MambaPool for state caching (via HybridReqToTokenPool)
+
+The model uses a gated 1D causal convolution (kernel=3) instead of attention
+in some layers, providing linear memory complexity for those layers.
+
+Uses optimized causal_conv1d kernels from the mamba package for fast inference.
+"""
+
+import logging
+from typing import Iterable, Optional, Set, Tuple
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from sglang.srt.configs.lfm2 import Lfm2Config
+from sglang.srt.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from sglang.srt.layers.attention.mamba.causal_conv1d import (
+    causal_conv1d_fn,
+    causal_conv1d_update,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    sharded_weight_loader,
+)
+from sglang.srt.utils import add_prefix, make_layers, set_weight_attrs
+
+logger = logging.getLogger(__name__)
+
+
+class Lfm2MLP(nn.Module):
+    """MLP with SwiGLU activation."""
+
+    def __init__(
+        self,
+        config: Lfm2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        intermediate_size = config.intermediate_size
+
+        if config.block_auto_adjust_ff_dim:
+            intermediate_size = int(2 * intermediate_size / 3)
+            if config.block_ffn_dim_multiplier is not None:
+                intermediate_size = int(
+                    config.block_ffn_dim_multiplier * intermediate_size
+                )
+                intermediate_size = config.block_multiple_of * (
+                    (intermediate_size + config.block_multiple_of - 1)
+                    // config.block_multiple_of
+                )
+
+        self.w1 = ColumnParallelLinear(
+            config.hidden_size,
+            intermediate_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("w1", prefix),
+        )
+        self.w3 = ColumnParallelLinear(
+            config.hidden_size,
+            intermediate_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("w3", prefix),
+        )
+        self.w2 = RowParallelLinear(
+            intermediate_size,
+            config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("w2", prefix),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        gate, _ = self.w1(x)
+        up, _ = self.w3(x)
+        out, _ = self.w2(F.silu(gate) * up)
+        return out
+
+
+class Lfm2Attention(nn.Module):
+    """Grouped-query attention with RoPE and Q/K layernorm."""
+
+    def __init__(
+        self,
+        config: Lfm2Config,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.total_num_heads = config.num_attention_heads
+        self.total_num_kv_heads = config.num_key_value_heads
+        self.head_dim = getattr(config, "head_dim", None) or (
+            self.hidden_size // self.total_num_heads
+        )
+        self.scaling = self.head_dim**-0.5
+
+        rope_parameters = getattr(config, "rope_parameters", None)
+        if rope_parameters is not None and "rope_theta" in rope_parameters:
+            rope_theta = rope_parameters["rope_theta"]
+        else:
+            rope_theta = getattr(config, "rope_theta", 10000)
+
+        self.rotary_emb = get_rope(
+            head_size=self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=getattr(config, "max_position_embeddings", 8192),
+            rope_scaling=getattr(config, "rope_scaling", None),
+            base=rope_theta,
+            is_neox_style=True,
+            dtype=torch.get_default_dtype(),
+        )
+
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.out_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("out_proj", prefix),
+        )
+
+        self.q_layernorm = RMSNorm(self.head_dim, eps=config.norm_eps)
+        self.k_layernorm = RMSNorm(self.head_dim, eps=config.norm_eps)
+
+        self.num_local_q_heads = self.qkv_proj.num_heads
+        self.num_local_kv_heads = self.qkv_proj.num_kv_heads
+
+        self.attn = RadixAttention(
+            num_heads=self.num_local_q_heads,
+            head_dim=self.head_dim,
+            scaling=self.scaling,
+            num_kv_heads=self.num_local_kv_heads,
+            layer_id=layer_id,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        T = hidden_states.shape[0]
+        qkv, _ = self.qkv_proj(hidden_states)
+
+        q_size = self.num_local_q_heads * self.head_dim
+        kv_size = self.num_local_kv_heads * self.head_dim
+        q, k, v = torch.split(qkv, [q_size, kv_size, kv_size], dim=-1)
+
+        q = q.reshape(T, self.num_local_q_heads, self.head_dim)
+        k = k.reshape(T, self.num_local_kv_heads, self.head_dim)
+
+        q = self.q_layernorm(q.reshape(-1, self.head_dim)).reshape(
+            T, self.num_local_q_heads, self.head_dim
+        )
+        k = self.k_layernorm(k.reshape(-1, self.head_dim)).reshape(
+            T, self.num_local_kv_heads, self.head_dim
+        )
+
+        q, k = self.rotary_emb(positions, q, k)
+
+        attn_out = self.attn(q.reshape(T, -1), k.reshape(T, -1), v, forward_batch)
+        out, _ = self.out_proj(attn_out)
+        return out
+
+
+class Lfm2ShortConv(nn.Module):
+    """
+    Gated short convolution layer using optimized causal_conv1d kernels.
+
+    Architecture: in_proj -> split(B, C, x) -> Bx -> conv1d -> C*conv_out -> out_proj
+    - Uses double gating: B (before conv) and C (after conv)
+    - Fixed-size cache: stores last (kernel_size - 1) tokens
+    - Uses causal_conv1d_fn for prefill and causal_conv1d_update for decode
+    - Supports tensor parallelism: hidden dimension is sharded across TP ranks
+    """
+
+    def __init__(
+        self,
+        config: Lfm2Config,
+        layer_idx: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.layer_idx = layer_idx
+        self.conv_kernel = int(config.conv_L_cache)
+        self.use_bias = bool(config.conv_bias)
+        self.hidden_size = config.hidden_size
+
+        tp_size = get_tensor_model_parallel_world_size()
+        self.hidden_size_per_partition = self.hidden_size // tp_size
+
+        # Use MergedColumnParallelLinear so each output (B, C, x) is sharded separately
+        self.in_proj = MergedColumnParallelLinear(
+            config.hidden_size,
+            [config.hidden_size] * 3,  # B, C, x each get hidden_size
+            bias=self.use_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.in_proj",
+        )
+        self.out_proj = RowParallelLinear(
+            config.hidden_size,
+            config.hidden_size,
+            bias=self.use_bias,
+            input_is_parallel=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.out_proj",
+        )
+
+        # Conv weights sharded along hidden dimension: (hidden_size/tp, kernel_size)
+        self.conv_weight = nn.Parameter(
+            torch.empty(self.hidden_size_per_partition, self.conv_kernel)
+        )
+        set_weight_attrs(self.conv_weight, {"weight_loader": sharded_weight_loader(0)})
+        if self.use_bias:
+            self.conv_bias = nn.Parameter(torch.empty(self.hidden_size_per_partition))
+            set_weight_attrs(
+                self.conv_bias, {"weight_loader": sharded_weight_loader(0)}
+            )
+        else:
+            self.register_parameter("conv_bias", None)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        if forward_batch.forward_mode.is_idle():
+            return hidden_states
+
+        layer_cache = forward_batch.req_to_token_pool.mamba2_layer_cache(self.layer_idx)
+        conv_state = layer_cache.conv[0]
+        req_pool_indices = forward_batch.req_pool_indices
+
+        # Project and split into gates: B (pre-conv), C (post-conv), x (input)
+        proj, _ = self.in_proj(hidden_states)
+        B_gate, C_gate, x = proj.chunk(3, dim=-1)
+        Bx = B_gate * x
+
+        if forward_batch.forward_mode.is_decode():
+            # Decode: single token per request, use optimized update kernel
+            conv_out = causal_conv1d_update(
+                Bx,
+                conv_state,
+                self.conv_weight,
+                self.conv_bias,
+                activation=None,
+                conv_state_indices=req_pool_indices.to(torch.int32),
+            )
+        else:
+            # Prefill: multiple tokens, use varlen kernel
+            T = hidden_states.shape[0]
+            Bx_t = Bx.transpose(0, 1).contiguous()
+
+            # Build query_start_loc: [0, cumsum(seq_lens)...]
+            extend_start_loc = forward_batch.extend_start_loc
+            if extend_start_loc is not None and len(extend_start_loc) > 1:
+                query_start_loc = torch.cat(
+                    [
+                        extend_start_loc,
+                        torch.tensor(
+                            [T], dtype=torch.int32, device=hidden_states.device
+                        ),
+                    ]
+                )
+                cache_indices = req_pool_indices.to(torch.int32)
+            else:
+                query_start_loc = torch.tensor(
+                    [0, T], dtype=torch.int32, device=hidden_states.device
+                )
+                cache_indices = req_pool_indices[:1].to(torch.int32)
+
+            conv_out = causal_conv1d_fn(
+                Bx_t,
+                self.conv_weight,
+                self.conv_bias,
+                query_start_loc=query_start_loc,
+                cache_indices=cache_indices,
+                has_initial_state=None,
+                conv_states=conv_state,
+                activation=None,
+            ).transpose(0, 1)
+
+        output, _ = self.out_proj(C_gate * conv_out)
+        return output
+
+
+class Lfm2DecoderLayer(nn.Module):
+    """Decoder layer - either attention or conv based on config."""
+
+    def __init__(
+        self,
+        config: Lfm2Config,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.layer_type = config.layer_types[layer_id]
+        self.is_attention_layer = self.layer_type == "full_attention"
+
+        self.operator_norm = RMSNorm(config.hidden_size, eps=config.norm_eps)
+        self.ffn_norm = RMSNorm(config.hidden_size, eps=config.norm_eps)
+
+        if self.is_attention_layer:
+            self.self_attn = Lfm2Attention(
+                config=config,
+                layer_id=layer_id,
+                quant_config=quant_config,
+                prefix=add_prefix("self_attn", prefix),
+            )
+        else:
+            self.conv = Lfm2ShortConv(
+                config=config,
+                layer_idx=layer_id,
+                quant_config=quant_config,
+                prefix=add_prefix("conv", prefix),
+            )
+
+        self.feed_forward = Lfm2MLP(
+            config=config,
+            quant_config=quant_config,
+            prefix=add_prefix("feed_forward", prefix),
+        )
+
+    def forward(
+        self,
+        layer_id: int,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: Optional[torch.Tensor],
+        forward_batch: ForwardBatch,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if not forward_batch.forward_mode.is_idle():
+            residual = hidden_states
+            normed = self.operator_norm(hidden_states)
+
+            if self.is_attention_layer:
+                hidden_states = self.self_attn(positions, normed, forward_batch)
+            else:
+                hidden_states = self.conv(normed, forward_batch)
+
+            hidden_states = hidden_states + residual
+            hidden_states = hidden_states + self.feed_forward(
+                self.ffn_norm(hidden_states)
+            )
+
+        return hidden_states, residual
+
+
+class Lfm2Model(nn.Module):
+    def __init__(
+        self,
+        config: Lfm2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+
+        # Count attention layers for KV cache sizing
+        self.num_attention_layers = sum(
+            1 for lt in config.layer_types if lt == "full_attention"
+        )
+
+        def get_layer(idx: int, prefix: str, **kwargs):
+            return Lfm2DecoderLayer(
+                config=config,
+                layer_id=idx,
+                quant_config=quant_config,
+                prefix=prefix,
+            )
+
+        self.layers = make_layers(
+            config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers"
+        )
+        self.embedding_norm = RMSNorm(config.hidden_size, eps=config.norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        hidden_states = (
+            inputs_embeds if inputs_embeds is not None else self.embed_tokens(input_ids)
+        )
+
+        residual = None
+        for i in range(len(self.layers)):
+            hidden_states, residual = self.layers[i](
+                layer_id=i,
+                positions=positions,
+                hidden_states=hidden_states,
+                residual=residual,
+                forward_batch=forward_batch,
+            )
+
+        return self.embedding_norm(hidden_states)
+
+
+class Lfm2ForCausalLM(nn.Module):
+    """LFM2 for causal language modeling with hybrid attention/conv architecture."""
+
+    fall_back_to_pt_during_load = False
+
+    def __init__(
+        self,
+        config: Lfm2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.pp_group = get_pp_group()
+        assert self.pp_group.is_first_rank and self.pp_group.is_last_rank
+
+        self.quant_config = quant_config
+        self.model = Lfm2Model(config, quant_config, prefix=add_prefix("model", prefix))
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            org_num_embeddings=config.vocab_size,
+            prefix=add_prefix("lm_head", prefix),
+        )
+        self.logits_processor = LogitsProcessor(config)
+        self.num_attention_layers = self.model.num_attention_layers
+
+    def get_num_kv_cache_layers(self) -> int:
+        return self.num_attention_layers
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        **kwargs,
+    ):
+        hidden_states = self.model(input_ids, positions, forward_batch, inputs_embeds)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(
+        self, weights: Iterable[Tuple[str, torch.Tensor]], is_mtp: bool = False
+    ) -> Set[str]:
+        stacked_params_mapping = [
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: Set[str] = set()
+        embed_tokens_weight = None
+
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            if "embed_tokens.weight" in name:
+                embed_tokens_weight = loaded_weight
+
+            # Handle conv weight/bias naming: HF uses conv.conv, we use conv_weight/conv_bias
+            if ".conv.conv.weight" in name:
+                name = name.replace(".conv.conv.weight", ".conv.conv_weight")
+                loaded_weight = loaded_weight.squeeze(1)  # (D, 1, K) -> (D, K)
+            if ".conv.conv.bias" in name:
+                name = name.replace(".conv.conv.bias", ".conv.conv_bias")
+
+            # Handle QKV stacking
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                if name.endswith(".bias") and name not in params_dict:
+                    break
+                if name not in params_dict:
+                    break
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader")
+                weight_loader(param, loaded_weight, shard_id)
+                loaded_params.add(name)
+                break
+            else:
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+                loaded_params.add(name)
+
+        # Handle tied lm_head weight
+        if "lm_head.weight" not in loaded_params and "lm_head.weight" in params_dict:
+            if embed_tokens_weight is not None:
+                param = params_dict["lm_head.weight"]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, embed_tokens_weight)
+                loaded_params.add("lm_head.weight")
+
+        return loaded_params
+
+
+EntryClass = [Lfm2ForCausalLM]
diff --git a/sglang/python/sglang/srt/models/lfm2_moe.py b/sglang/python/sglang/srt/models/lfm2_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e164591eb35e6b74d08235075f5e131f21a5ca7
--- /dev/null
+++ b/sglang/python/sglang/srt/models/lfm2_moe.py
@@ -0,0 +1,679 @@
+"""
+LFM2-MoE (Liquid Foundation Model 2 - Mixture of Experts) implementation for SGLang.
+
+This is a hybrid architecture with attention, ShortConv, and MoE layers:
+- Attention layers use standard KV cache (RadixAttention)
+- Conv layers use MambaPool for state caching (via HybridReqToTokenPool)
+- First `num_dense_layers` use dense MLP, rest use MoE with sigmoid routing
+
+Key MoE characteristics:
+- Sigmoid routing (not softmax) - auxiliary-loss-free style
+- Expert bias (fp32) affects selection but not weighting
+- Post-hoc normalization of top-k weights
+"""
+
+from typing import Iterable, Optional, Set, Tuple
+
+import torch
+from torch import nn
+
+from sglang.srt.configs.lfm2_moe import Lfm2MoeConfig
+from sglang.srt.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.attention.mamba.causal_conv1d import (
+    causal_conv1d_fn,
+    causal_conv1d_update,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    sharded_weight_loader,
+)
+from sglang.srt.utils import add_prefix, make_layers, set_weight_attrs
+
+
+class Lfm2MoeMLP(nn.Module):
+    """Dense MLP for first N layers (before MoE kicks in)."""
+
+    def __init__(
+        self,
+        config: Lfm2MoeConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        # Use MergedColumnParallelLinear for w1/w3 (gate/up projections)
+        self.gate_up_proj = MergedColumnParallelLinear(
+            config.hidden_size,
+            [config.intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            config.intermediate_size,
+            config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        out, _ = self.down_proj(x)
+        return out
+
+
+class Lfm2MoeSparseMoeBlock(nn.Module):
+    """
+    Sparse MoE block with sigmoid routing using optimized FusedMoE.
+
+    Key features:
+    - Sigmoid scoring (not softmax) - auxiliary-loss-free style
+    - Expert bias (fp32) for load balancing
+    - Bias affects selection only, not weighting
+    - Uses FusedMoE for efficient batched expert computation
+    """
+
+    def __init__(
+        self,
+        config: Lfm2MoeConfig,
+        layer_idx: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.routed_scaling_factor = config.routed_scaling_factor
+
+        if self.tp_size > config.num_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {config.num_experts}."
+            )
+
+        # Gate (router) - outputs logits for each expert
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            config.num_experts,
+            bias=False,
+            quant_config=None,
+            prefix=add_prefix("gate", prefix),
+        )
+
+        # Expert bias (fp32) - affects selection but not weighting
+        if config.use_expert_bias:
+            self.expert_bias = nn.Parameter(
+                torch.zeros(config.num_experts, dtype=torch.float32)
+            )
+        else:
+            self.register_parameter("expert_bias", None)
+
+        # TopK selector with sigmoid scoring
+        self.topk = TopK(
+            top_k=config.num_experts_per_tok,
+            layer_id=layer_idx,
+            renormalize=config.norm_topk_prob,
+            scoring_func="sigmoid",
+            correction_bias=self.expert_bias if config.use_expert_bias else None,
+        )
+
+        # FusedMoE for efficient batched expert computation
+        # Note: We intentionally do NOT pass routed_scaling_factor to FusedMoE.
+        # While FusedMoE supports it, passing it there increases numerical
+        # differences vs HuggingFace (likely due to different code paths in the
+        # Triton runner when scaling_factor != None). We apply it manually below.
+        self.experts = FusedMoE(
+            num_experts=config.num_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            layer_id=layer_idx,
+            reduce_results=True,
+            quant_config=quant_config,
+            prefix=add_prefix("experts", prefix),
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Optimized expert forward pass using FusedMoE."""
+        # Get router logits
+        router_logits, _ = self.gate(hidden_states)
+
+        # Select top-k experts with sigmoid scoring
+        topk_output = self.topk(hidden_states, router_logits)
+
+        # Run fused expert computation
+        final_hidden_states = self.experts(hidden_states, topk_output)
+
+        # Apply routed scaling factor (see __init__ comment for why not in FusedMoE)
+        return final_hidden_states * self.routed_scaling_factor
+
+
+class Lfm2MoeAttention(nn.Module):
+    """Grouped-query attention with RoPE and Q/K layernorm."""
+
+    def __init__(
+        self,
+        config: Lfm2MoeConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.total_num_heads = config.num_attention_heads
+        self.total_num_kv_heads = config.num_key_value_heads
+        self.head_dim = self.hidden_size // self.total_num_heads
+        self.scaling = self.head_dim**-0.5
+
+        rope_parameters = getattr(config, "rope_parameters", None)
+        if rope_parameters is not None and "rope_theta" in rope_parameters:
+            rope_theta = rope_parameters["rope_theta"]
+        else:
+            rope_theta = getattr(config, "rope_theta", 1000000.0)
+
+        self.rotary_emb = get_rope(
+            head_size=self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=getattr(config, "max_position_embeddings", 128000),
+            rope_scaling=getattr(config, "rope_scaling", None),
+            base=rope_theta,
+            is_neox_style=True,
+            dtype=torch.get_default_dtype(),
+        )
+
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.out_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("out_proj", prefix),
+        )
+
+        self.q_layernorm = RMSNorm(self.head_dim, eps=config.norm_eps)
+        self.k_layernorm = RMSNorm(self.head_dim, eps=config.norm_eps)
+
+        self.num_local_q_heads = self.qkv_proj.num_heads
+        self.num_local_kv_heads = self.qkv_proj.num_kv_heads
+
+        self.attn = RadixAttention(
+            num_heads=self.num_local_q_heads,
+            head_dim=self.head_dim,
+            scaling=self.scaling,
+            num_kv_heads=self.num_local_kv_heads,
+            layer_id=layer_id,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        T = hidden_states.shape[0]
+        qkv, _ = self.qkv_proj(hidden_states)
+
+        q_size = self.num_local_q_heads * self.head_dim
+        kv_size = self.num_local_kv_heads * self.head_dim
+        q, k, v = torch.split(qkv, [q_size, kv_size, kv_size], dim=-1)
+
+        q = q.reshape(T, self.num_local_q_heads, self.head_dim)
+        k = k.reshape(T, self.num_local_kv_heads, self.head_dim)
+
+        q = self.q_layernorm(q.reshape(-1, self.head_dim)).reshape(
+            T, self.num_local_q_heads, self.head_dim
+        )
+        k = self.k_layernorm(k.reshape(-1, self.head_dim)).reshape(
+            T, self.num_local_kv_heads, self.head_dim
+        )
+
+        q, k = self.rotary_emb(positions, q, k)
+
+        attn_out = self.attn(q.reshape(T, -1), k.reshape(T, -1), v, forward_batch)
+        out, _ = self.out_proj(attn_out)
+        return out
+
+
+class Lfm2MoeShortConv(nn.Module):
+    """
+    Gated short convolution layer using optimized causal_conv1d kernels.
+
+    Architecture: in_proj -> split(B, C, x) -> Bx -> conv1d -> C*conv_out -> out_proj
+    - Supports tensor parallelism: hidden dimension is sharded across TP ranks
+    """
+
+    def __init__(
+        self,
+        config: Lfm2MoeConfig,
+        layer_idx: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.layer_idx = layer_idx
+        self.conv_kernel = int(config.conv_L_cache)
+        self.use_bias = bool(config.conv_bias)
+        self.hidden_size = config.hidden_size
+
+        # Get tensor parallel size for sharding
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.hidden_size_per_partition = self.hidden_size // self.tp_size
+
+        # Use MergedColumnParallelLinear so each output (B, C, x) is sharded separately
+        self.in_proj = MergedColumnParallelLinear(
+            config.hidden_size,
+            [config.hidden_size] * 3,  # B, C, x each get hidden_size
+            bias=self.use_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.in_proj",
+        )
+        self.out_proj = RowParallelLinear(
+            config.hidden_size,
+            config.hidden_size,
+            bias=self.use_bias,
+            input_is_parallel=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.out_proj",
+        )
+
+        # Conv weights sharded along hidden dimension: (hidden_size/tp, kernel_size)
+        self.conv_weight = nn.Parameter(
+            torch.empty(self.hidden_size_per_partition, self.conv_kernel)
+        )
+        set_weight_attrs(self.conv_weight, {"weight_loader": sharded_weight_loader(0)})
+        if self.use_bias:
+            self.conv_bias = nn.Parameter(torch.empty(self.hidden_size_per_partition))
+            set_weight_attrs(
+                self.conv_bias, {"weight_loader": sharded_weight_loader(0)}
+            )
+        else:
+            self.register_parameter("conv_bias", None)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        if forward_batch.forward_mode.is_idle():
+            return hidden_states
+
+        layer_cache = forward_batch.req_to_token_pool.mamba2_layer_cache(self.layer_idx)
+        conv_state = layer_cache.conv[0]
+        req_pool_indices = forward_batch.req_pool_indices
+
+        proj, _ = self.in_proj(hidden_states)
+        B_gate, C_gate, x = proj.chunk(3, dim=-1)
+        Bx = B_gate * x
+
+        if forward_batch.forward_mode.is_decode():
+            conv_out = causal_conv1d_update(
+                Bx,
+                conv_state,
+                self.conv_weight,
+                self.conv_bias,
+                activation=None,
+                conv_state_indices=req_pool_indices.to(torch.int32),
+            )
+        else:
+            T = hidden_states.shape[0]
+            Bx_t = Bx.transpose(0, 1).contiguous()
+
+            # Build query_start_loc for variable-length sequences
+            # causal_conv1d_fn expects [start0, start1, ..., startN, T]
+            extend_start_loc = forward_batch.extend_start_loc
+            if extend_start_loc is not None and len(extend_start_loc) > 1:
+                # Multiple sequences: append T to extend_start_loc
+                # Allocate and fill to avoid torch.cat overhead
+                query_start_loc = extend_start_loc.new_empty(len(extend_start_loc) + 1)
+                query_start_loc[:-1] = extend_start_loc
+                query_start_loc[-1] = T
+                cache_indices = req_pool_indices.to(torch.int32)
+            else:
+                # Single sequence: [0, T]
+                query_start_loc = hidden_states.new_tensor([0, T], dtype=torch.int32)
+                cache_indices = req_pool_indices[:1].to(torch.int32)
+
+            conv_out = causal_conv1d_fn(
+                Bx_t,
+                self.conv_weight,
+                self.conv_bias,
+                query_start_loc=query_start_loc,
+                cache_indices=cache_indices,
+                has_initial_state=None,
+                conv_states=conv_state,
+                activation=None,
+            ).transpose(0, 1)
+
+        output, _ = self.out_proj(C_gate * conv_out)
+        return output
+
+
+class Lfm2MoeDecoderLayer(nn.Module):
+    """
+    Decoder layer with attention/conv and dense MLP or MoE.
+
+    - Layers 0 to num_dense_layers-1: use Lfm2MoeMLP (dense)
+    - Layers num_dense_layers+: use Lfm2MoeSparseMoeBlock (MoE)
+    """
+
+    def __init__(
+        self,
+        config: Lfm2MoeConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.layer_type = config.layer_types[layer_id]
+        self.is_attention_layer = self.layer_type == "full_attention"
+
+        self.operator_norm = RMSNorm(config.hidden_size, eps=config.norm_eps)
+        self.ffn_norm = RMSNorm(config.hidden_size, eps=config.norm_eps)
+
+        # Attention or Conv
+        if self.is_attention_layer:
+            self.self_attn = Lfm2MoeAttention(
+                config=config,
+                layer_id=layer_id,
+                quant_config=quant_config,
+                prefix=add_prefix("self_attn", prefix),
+            )
+        else:
+            self.conv = Lfm2MoeShortConv(
+                config=config,
+                layer_idx=layer_id,
+                quant_config=quant_config,
+                prefix=add_prefix("conv", prefix),
+            )
+
+        # Dense MLP or MoE
+        if layer_id < config.num_dense_layers:
+            self.feed_forward = Lfm2MoeMLP(
+                config=config,
+                quant_config=quant_config,
+                prefix=add_prefix("feed_forward", prefix),
+            )
+        else:
+            self.feed_forward = Lfm2MoeSparseMoeBlock(
+                config=config,
+                layer_idx=layer_id,
+                quant_config=quant_config,
+                prefix=add_prefix("feed_forward", prefix),
+            )
+
+    def forward(
+        self,
+        layer_id: int,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: Optional[torch.Tensor],
+        forward_batch: ForwardBatch,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if not forward_batch.forward_mode.is_idle():
+            residual = hidden_states
+            normed = self.operator_norm(hidden_states)
+
+            if self.is_attention_layer:
+                hidden_states = self.self_attn(positions, normed, forward_batch)
+            else:
+                hidden_states = self.conv(normed, forward_batch)
+
+            hidden_states = hidden_states + residual
+            hidden_states = hidden_states + self.feed_forward(
+                self.ffn_norm(hidden_states)
+            )
+
+        return hidden_states, residual
+
+
+class Lfm2MoeModel(nn.Module):
+    def __init__(
+        self,
+        config: Lfm2MoeConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+
+        # Count attention layers for KV cache sizing
+        self.num_attention_layers = sum(
+            1 for lt in config.layer_types if lt == "full_attention"
+        )
+
+        def get_layer(idx: int, prefix: str, **kwargs):
+            return Lfm2MoeDecoderLayer(
+                config=config,
+                layer_id=idx,
+                quant_config=quant_config,
+                prefix=prefix,
+            )
+
+        self.layers = make_layers(
+            config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers"
+        )
+        self.embedding_norm = RMSNorm(config.hidden_size, eps=config.norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        hidden_states = (
+            inputs_embeds if inputs_embeds is not None else self.embed_tokens(input_ids)
+        )
+
+        residual = None
+        for i in range(len(self.layers)):
+            hidden_states, residual = self.layers[i](
+                layer_id=i,
+                positions=positions,
+                hidden_states=hidden_states,
+                residual=residual,
+                forward_batch=forward_batch,
+            )
+
+        return self.embedding_norm(hidden_states)
+
+
+class Lfm2MoeForCausalLM(nn.Module):
+    """LFM2-MoE for causal language modeling."""
+
+    fall_back_to_pt_during_load = False
+
+    def __init__(
+        self,
+        config: Lfm2MoeConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.pp_group = get_pp_group()
+        assert self.pp_group.is_first_rank and self.pp_group.is_last_rank
+
+        self.quant_config = quant_config
+        self.model = Lfm2MoeModel(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            org_num_embeddings=config.vocab_size,
+            prefix=add_prefix("lm_head", prefix),
+        )
+        self.logits_processor = LogitsProcessor(config)
+        self.num_attention_layers = self.model.num_attention_layers
+
+    def get_num_kv_cache_layers(self) -> int:
+        return self.num_attention_layers
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        **kwargs,
+    ):
+        hidden_states = self.model(input_ids, positions, forward_batch, inputs_embeds)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(
+        self, weights: Iterable[Tuple[str, torch.Tensor]], is_mtp: bool = False
+    ) -> Set[str]:
+        """Load weights with FusedMoE expert format."""
+        stacked_params_mapping = [
+            # (param_name, weight_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            # Dense MLP w1/w3 -> gate_up_proj
+            ("gate_up_proj", "w1", 0),
+            ("gate_up_proj", "w3", 1),
+        ]
+
+        # FusedMoE expert params mapping
+        # HF format: experts.{expert_id}.w{1,2,3}.weight
+        # FusedMoE format: experts.w13_weight, experts.w2_weight
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="w1",
+            ckpt_down_proj_name="w2",
+            ckpt_up_proj_name="w3",
+            num_experts=self.config.num_experts,
+        )
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: Set[str] = set()
+        embed_tokens_weight = None
+
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            if "embed_tokens.weight" in name:
+                embed_tokens_weight = loaded_weight
+
+            # Handle conv weight/bias naming: HF uses conv.conv, we use conv_weight/conv_bias
+            if ".conv.conv.weight" in name:
+                name = name.replace(".conv.conv.weight", ".conv.conv_weight")
+                loaded_weight = loaded_weight.squeeze(1)  # (D, 1, K) -> (D, K)
+            if ".conv.conv.bias" in name:
+                name = name.replace(".conv.conv.bias", ".conv.conv_bias")
+
+            # Handle dense MLP w2 -> down_proj
+            if "feed_forward.w2" in name and "experts" not in name:
+                name = name.replace("feed_forward.w2", "feed_forward.down_proj")
+
+            # Handle stacked params (QKV, dense MLP gate_up)
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                # Skip expert weights (handled below)
+                if "experts" in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                if name.endswith(".bias") and name not in params_dict:
+                    break
+                if name not in params_dict:
+                    break
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader")
+                weight_loader(param, loaded_weight, shard_id)
+                loaded_params.add(name)
+                break
+            else:
+                # Handle MoE expert weights using FusedMoE format
+                # HF format: model.layers.X.feed_forward.experts.Y.wZ.weight
+                # FusedMoE format: model.layers.X.feed_forward.experts.w13_weight/w2_weight
+                for (
+                    param_name,
+                    weight_name,
+                    expert_id,
+                    shard_id,
+                ) in expert_params_mapping:
+                    if weight_name not in name:
+                        continue
+                    # Build our parameter name
+                    name = name.replace(weight_name, param_name)
+                    if name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    loaded_params.add(name)
+                    break
+                else:
+                    # Handle regular weights
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    if name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+                    loaded_params.add(name)
+
+        # Handle tied lm_head weight
+        if "lm_head.weight" not in loaded_params and "lm_head.weight" in params_dict:
+            if embed_tokens_weight is not None:
+                param = params_dict["lm_head.weight"]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, embed_tokens_weight)
+                loaded_params.add("lm_head.weight")
+
+        return loaded_params
+
+
+EntryClass = [Lfm2MoeForCausalLM]
diff --git a/sglang/python/sglang/srt/models/lightonocr.py b/sglang/python/sglang/srt/models/lightonocr.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe854f6036257c1bbd30053a99e7bc656b95ac21
--- /dev/null
+++ b/sglang/python/sglang/srt/models/lightonocr.py
@@ -0,0 +1,298 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""
+Support for lightonai/LightOnOCR-2-1B.
+
+LightOnOCR is a vision-language OCR model that combines:
+- Pixtral vision encoder (24 layers, 1024 hidden dim)
+- Spatial merge projection with RMSNorm + PatchMerger (2x2 = 4x token reduction)
+- Qwen3 language decoder (28 layers, 1024 hidden dim)
+
+Key differences from PixtralForConditionalGeneration:
+- Uses Qwen3ForCausalLM instead of MistralLarge3ForCausalLM as the language model
+- Has an RMSNorm applied to vision encoder output before patch merging
+- Does not use image break/end tokens (single contiguous image token range)
+- HuggingFace checkpoint uses a vision_projection namespace for norm, patch_merger,
+  and adapter weights
+
+References:
+- https://huggingface.co/lightonai/LightOnOCR-2-1B
+"""
+
+from dataclasses import fields
+from typing import Iterable, List, Tuple
+
+import torch
+import torch.nn as nn
+
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternMultimodalTokens,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInputs
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.pixtral import (
+    PATCH_MERGE,
+    PatchMerger,
+    PixtralHFVisionModel,
+    VisionEncoderArgs,
+    VisionLanguageAdapter,
+)
+from sglang.srt.models.qwen3 import Qwen3ForCausalLM
+
+
+class LightOnOCRForConditionalGeneration(nn.Module):
+    """
+    LightOnOCR model for SGLang inference.
+
+    Architecture:
+    - Pixtral-based vision encoder (PixtralHFVisionModel, 24 layers)
+    - RMSNorm on vision encoder output
+    - Spatial merge via PatchMerger (2x2 = 4x token reduction)
+    - VisionLanguageAdapter projection to text hidden size
+    - Qwen3-based decoder (28 layers) with QK norms
+    """
+
+    merge_by_field_config = True
+
+    @classmethod
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
+        if modality.startswith("image"):
+            return None
+        raise ValueError("Only image modality is supported")
+
+    def __init__(self, *, config, prefix: str = "", **kwargs):
+        super().__init__()
+        self.config = config
+        quant_config = kwargs.get("quant_config")
+
+        # Build VisionEncoderArgs from config
+        vision_config = config.vision_config
+        dataclass_fields = {field.name for field in fields(VisionEncoderArgs)}
+        vision_args = {
+            key: value
+            for key, value in vision_config.to_dict().items()
+            if key in dataclass_fields
+        }
+        # LightOnOCR stores these at the top-level config
+        if "image_token_id" not in vision_args:
+            vision_args["image_token_id"] = getattr(config, "image_token_id", 151655)
+        if "spatial_merge_size" not in vision_args:
+            vision_args["spatial_merge_size"] = getattr(config, "spatial_merge_size", 2)
+        if "adapter_bias" not in vision_args:
+            vision_args["adapter_bias"] = getattr(
+                config, "multimodal_projector_bias", True
+            )
+        # LightOnOCR uses patch merging for spatial merge
+        vision_args["mm_projector_id"] = PATCH_MERGE
+        self.vision_args = VisionEncoderArgs(**vision_args)
+
+        # Vision encoder (Pixtral HF variant with SGLang parallel layers)
+        self.vision_encoder = PixtralHFVisionModel(vision_config, quant_config=None)
+
+        # RMSNorm applied to vision encoder output before patch merging
+        self.vision_projection_norm = RMSNorm(self.vision_args.hidden_size, eps=1e-5)
+
+        # Patch merger for spatial token reduction
+        self.patch_merger = PatchMerger(
+            vision_encoder_dim=self.vision_args.hidden_size,
+            spatial_merge_size=self.vision_args.spatial_merge_size,
+        )
+
+        # Vision-to-language projection adapter
+        self.vision_language_adapter = VisionLanguageAdapter(
+            self.vision_args, dim=config.text_config.hidden_size
+        )
+
+        # Language model
+        self.language_model = Qwen3ForCausalLM(
+            config=config.text_config,
+            quant_config=quant_config,
+        )
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+        return pattern.pad_input_tokens(input_ids, mm_inputs)
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        """Process images through vision encoder and projection pipeline."""
+        images = [item.feature for item in items]
+
+        # Extract image sizes from model-specific data or infer from tensor shape
+        image_sizes_list = []
+        for item in items:
+            if item.model_specific_data and "image_sizes" in item.model_specific_data:
+                sizes_tensor = item.model_specific_data["image_sizes"]
+                for size in sizes_tensor:
+                    image_sizes_list.append((int(size[0]), int(size[1])))
+            else:
+                img = item.feature
+                for _ in range(img.shape[0]):
+                    image_sizes_list.append((img.shape[-2], img.shape[-1]))
+
+        # Stack pixel values
+        if len(images) > 1:
+            pixel_values = torch.cat(images, dim=0)
+        else:
+            pixel_values = images[0]
+
+        # Vision encoder forward
+        image_features = self.vision_encoder(pixel_values, image_sizes=image_sizes_list)
+        image_features = image_features.view(-1, image_features.shape[-1])
+
+        # Norm before patch merge (matches HF Mistral3MultiModalProjector order)
+        image_features = self.vision_projection_norm(image_features)
+
+        # Spatial merge via patch merger — use actual image sizes (not padded tensor
+        # shape) because PixtralHFVisionModel crops embeddings to real dimensions.
+        patch_size = self.vision_args.patch_size
+        img_patch_dims = [
+            (h // patch_size, w // patch_size) for (h, w) in image_sizes_list
+        ]
+        image_features = self.patch_merger(image_features, image_sizes=img_patch_dims)
+
+        # Project to language model dimension
+        image_embeds = self.vision_language_adapter(image_features)
+        return image_embeds
+
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ):
+        return general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.language_model,
+            multimodal_model=self,
+            positions=positions,
+        )
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor | None:
+        return self.language_model.compute_logits(hidden_states)
+
+    def get_embed_and_head(self):
+        return self.language_model.get_embed_and_head()
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        """Load weights from HuggingFace checkpoint.
+
+        HF checkpoint weight layout (after stripping ``model.`` prefix):
+        - ``vision_encoder.*`` -> self.vision_encoder
+        - ``vision_projection.norm.*`` -> self.vision_projection_norm
+        - ``vision_projection.patch_merger.*`` -> self.patch_merger
+        - ``vision_projection.linear_1.*`` -> self.vision_language_adapter.w_in
+        - ``vision_projection.linear_2.*`` -> self.vision_language_adapter.w_out
+        - ``language_model.*`` -> self.language_model (Qwen3ForCausalLM)
+        """
+        vision_encoder_dict = dict(self.vision_encoder.named_parameters())
+        patch_merger_dict = dict(self.patch_merger.named_parameters())
+        norm_dict = dict(self.vision_projection_norm.named_parameters())
+        adapter_dict = dict(self.vision_language_adapter.named_parameters())
+
+        # PixtralHFVisionModel uses SGLang parallel layers with stacked params
+        stacked_params_mapping = [
+            (".attention.qkv_proj", ".attention.q_proj", "q"),
+            (".attention.qkv_proj", ".attention.k_proj", "k"),
+            (".attention.qkv_proj", ".attention.v_proj", "v"),
+            (".feed_forward.gate_up_proj", ".feed_forward.gate_proj", 0),
+            (".feed_forward.gate_up_proj", ".feed_forward.up_proj", 1),
+        ]
+
+        def llm_weights_generator():
+            for name, w in weights:
+                # HF checkpoint prefixes all weights with model.
+                if name.startswith("model."):
+                    name = name[len("model.") :]
+
+                if name.startswith("vision_encoder."):
+                    trimmed = name[len("vision_encoder.") :]
+
+                    # Handle stacked params (QKV, gate/up)
+                    loaded = False
+                    for param_name, weight_name, shard_id in stacked_params_mapping:
+                        if weight_name in trimmed:
+                            transformed = trimmed.replace(weight_name, param_name)
+                            if transformed in vision_encoder_dict:
+                                param = vision_encoder_dict[transformed]
+                                weight_loader = getattr(
+                                    param, "weight_loader", default_weight_loader
+                                )
+                                with torch.no_grad():
+                                    weight_loader(param, w, shard_id)
+                                loaded = True
+                                break
+
+                    if not loaded:
+                        # Handle o_proj -> proj rename
+                        if ".attention.o_proj" in trimmed:
+                            trimmed = trimmed.replace(
+                                ".attention.o_proj", ".attention.proj"
+                            )
+                        if trimmed in vision_encoder_dict:
+                            param = vision_encoder_dict[trimmed]
+                            weight_loader = getattr(
+                                param, "weight_loader", default_weight_loader
+                            )
+                            with torch.no_grad():
+                                weight_loader(param, w)
+
+                elif name.startswith("vision_projection."):
+                    remaining = name[len("vision_projection.") :]
+
+                    if remaining.startswith("patch_merger."):
+                        trimmed = remaining[len("patch_merger.") :]
+                        if trimmed in patch_merger_dict:
+                            param = patch_merger_dict[trimmed]
+                            with torch.no_grad():
+                                default_weight_loader(param, w)
+
+                    elif remaining.startswith("norm."):
+                        trimmed = remaining[len("norm.") :]
+                        if trimmed in norm_dict:
+                            param = norm_dict[trimmed]
+                            with torch.no_grad():
+                                default_weight_loader(param, w)
+
+                    else:
+                        # linear_1 -> w_in, linear_2 -> w_out
+                        trimmed = remaining.replace("linear_1.", "w_in.").replace(
+                            "linear_2.", "w_out."
+                        )
+                        if trimmed in adapter_dict:
+                            param = adapter_dict[trimmed]
+                            with torch.no_grad():
+                                default_weight_loader(param, w)
+
+                else:
+                    # Language model weights and any other weights
+                    if name.startswith("language_model."):
+                        # Qwen3ForCausalLM expects model.* prefix
+                        name = "model." + name[len("language_model.") :]
+                    yield (name, w)
+
+        self.language_model.load_weights(llm_weights_generator())
+
+
+EntryClass = LightOnOCRForConditionalGeneration
diff --git a/sglang/python/sglang/srt/models/llada2.py b/sglang/python/sglang/srt/models/llada2.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f5cc438ca8d3ad0236925f40b4a4e2002b6b696
--- /dev/null
+++ b/sglang/python/sglang/srt/models/llada2.py
@@ -0,0 +1,932 @@
+# coding=utf-8
+# Copyright 2023 Antgroup and The HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""SGLang LLaDA2MoeModelLM model."""
+
+import logging
+from typing import Iterable, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_world_size,
+    parallel_state,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation
+from sglang.srt.eplb.expert_location_dispatch import ExpertLocationDispatchInfo
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.communicator import (
+    LayerCommunicator,
+    LayerScatterModes,
+    enable_moe_dense_fully_dp,
+)
+from sglang.srt.layers.dp_attention import (
+    get_attention_dp_size,
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe import get_deepep_mode, get_moe_a2a_backend
+from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
+from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+from sglang.srt.layers.moe.token_dispatcher import DeepEPDispatcher
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import AttentionType, RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.utils import (
+    apply_qk_norm,
+    create_fused_set_kv_buffer_arg,
+    enable_fused_set_kv_buffer,
+)
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import add_prefix, is_cuda, is_non_idle_and_non_empty, make_layers
+
+LoraConfig = None
+logger = logging.getLogger(__name__)
+_is_cuda = is_cuda()
+
+
+class LLaDA2MoeMLP(nn.Module):
+    def __init__(
+        self,
+        intermediate_size: int,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        reduce_results: Optional[bool] = True,
+        prefix: str = "",
+        tp_rank: Optional[int] = None,
+        tp_size: Optional[int] = None,
+    ) -> None:
+        super().__init__()
+        self.tp_size = tp_size
+
+        self.gate_up_proj = MergedColumnParallelLinear(
+            config.hidden_size,
+            [intermediate_size] * 2,
+            bias=config.use_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+            tp_rank=tp_rank,
+            tp_size=tp_size,
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            config.hidden_size,
+            bias=config.use_bias,
+            reduce_results=reduce_results,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+            tp_rank=tp_rank,
+            tp_size=tp_size,
+        )
+
+        if config.hidden_act != "silu":
+            raise ValueError("Unsupported activation. Only silu is supported for now.")
+        self.act_fn = SiluAndMul()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: Optional[ForwardBatch] = None,
+        use_reduce_scatter: bool = False,
+    ) -> torch.Tensor:
+        if (self.tp_size == 1) and hidden_states.shape[0] == 0:
+            return hidden_states
+
+        gate_up, _ = self.gate_up_proj(hidden_states)
+        hidden_states = self.act_fn(gate_up)
+        hidden_states, _ = self.down_proj(
+            hidden_states, skip_all_reduce=use_reduce_scatter
+        )
+        return hidden_states
+
+
+class LLaDA2MoeGate(nn.Module):
+    def __init__(
+        self,
+        config,
+        params_dtype: Optional[torch.dtype] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+        self.params_dtype = params_dtype
+        self.weight = nn.Parameter(
+            torch.empty(
+                (config.num_experts, config.hidden_size),
+                dtype=self.params_dtype,
+            ),
+        )
+        if getattr(config, "moe_router_enable_expert_bias", False):
+            self.expert_bias = nn.Parameter(
+                torch.empty((config.num_experts,), dtype=torch.float32),
+            )
+        else:
+            self.expert_bias = None
+
+    def forward(self, hidden_states):
+        logits = F.linear(hidden_states.to(self.weight.dtype), self.weight, None).to(
+            hidden_states.dtype
+        )
+        return logits
+
+
+class LLaDA2MoeSparseMoeBlock(nn.Module):
+    def __init__(
+        self,
+        layer_id: int,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        alt_stream: Optional[torch.cuda.Stream] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.layer_id = layer_id
+        self.alt_stream = alt_stream
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.top_k = config.num_experts_per_tok
+        self.norm_topk_prob = config.norm_topk_prob
+        self.hidden_size = config.hidden_size
+        self.num_shared_experts = config.num_shared_experts
+        self.routed_scaling_factor = getattr(config, "routed_scaling_factor", 1.0)
+        self.score_function = getattr(config, "score_function", None)
+
+        if config.hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {config.hidden_act}. "
+                "Only silu is supported for now."
+            )
+
+        # Gate always runs at half / full precision for now.
+        router_dtype = getattr(config, "router_dtype", None)
+        if router_dtype is None:
+            self.router_dtype = None
+        elif router_dtype == "fp32":
+            self.router_dtype = torch.float32
+        else:
+            self.router_dtype = torch.bfloat16
+
+        # TODO global_server_args.ep_num_redundant_experts is used for eplb, not supported now
+        assert get_global_server_args().ep_num_redundant_experts == 0
+        # check group topk
+        self.num_expert_group = getattr(config, "n_group", 0)
+        self.topk_group = getattr(config, "topk_group", 0)
+        if self.num_expert_group > 0 or self.topk_group > 0:
+            assert (
+                self.num_expert_group > 0
+                and 0 < self.topk_group <= self.num_expert_group
+            )
+            self.use_grouped_topk = True
+        else:
+            self.num_expert_group = self.topk_group = None
+            self.use_grouped_topk = False
+
+        self.num_experts = (
+            config.num_experts + get_global_server_args().ep_num_redundant_experts
+        )
+
+        self.gate = LLaDA2MoeGate(
+            config=config,
+            params_dtype=self.router_dtype,
+            prefix=add_prefix("gate", prefix),
+        )
+        self.correction_bias = (
+            self.gate.expert_bias.data if self.gate.expert_bias is not None else None
+        )
+
+        if self.score_function is not None:
+            assert (
+                self.score_function == "softmax" and self.correction_bias is None
+            ) or (
+                self.score_function == "sigmoid" and self.correction_bias is not None
+            ), "score_function and correction_bias should be in 2 combination (softmax, None) or (sigmoid, not None)"
+
+        self.topk = TopK(
+            top_k=self.top_k,
+            renormalize=self.norm_topk_prob,
+            use_grouped_topk=self.use_grouped_topk,
+            num_expert_group=self.num_expert_group,
+            # num_fused_shared_experts=self.num_fused_shared_experts,
+            topk_group=self.topk_group,
+            correction_bias=self.correction_bias,
+            routed_scaling_factor=self.routed_scaling_factor,
+        )
+
+        self.experts = get_moe_impl_class(quant_config)(
+            num_experts=self.num_experts,
+            top_k=self.top_k,
+            layer_id=self.layer_id,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            quant_config=quant_config,
+            routed_scaling_factor=self.routed_scaling_factor,
+            prefix=add_prefix("experts", prefix),
+        )
+        # shared expert
+        if config.num_shared_experts is not None:
+            if hasattr(config, "moe_shared_expert_intermediate_size"):
+                intermediate_size = config.moe_shared_expert_intermediate_size
+            else:
+                intermediate_size = config.moe_intermediate_size
+            intermediate_size *= config.num_shared_experts
+            # disable tp for shared experts when enable deepep moe
+            self.shared_experts = LLaDA2MoeMLP(
+                intermediate_size=intermediate_size,
+                config=config,
+                quant_config=quant_config,
+                reduce_results=False,
+                prefix=add_prefix("shared_experts", prefix),
+                **(
+                    dict(tp_rank=0, tp_size=1)
+                    if get_moe_a2a_backend().is_deepep()
+                    else {}
+                ),
+            )
+        # dispatcher
+        if get_moe_a2a_backend().is_deepep():
+            # TODO: we will support tp < ep in the future
+            self.ep_size = get_tensor_model_parallel_world_size()
+
+            self.deepep_dispatcher = DeepEPDispatcher(
+                group=parallel_state.get_tp_group().device_group,
+                router_topk=self.top_k,
+                permute_fusion=True,
+                num_experts=self.num_experts,
+                num_local_experts=config.num_experts // self.tp_size,
+                hidden_size=config.hidden_size,
+                params_dtype=config.torch_dtype,
+                deepep_mode=get_deepep_mode(),
+                async_finish=True,  # TODO
+                return_recv_hook=True,
+            )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: Optional[ForwardBatch] = None,
+        use_reduce_scatter: bool = False,
+    ) -> torch.Tensor:
+        if not get_moe_a2a_backend().is_deepep():
+            return self.forward_normal(hidden_states, use_reduce_scatter)
+        else:
+            return self.forward_deepep(hidden_states, forward_batch)
+
+    def get_moe_weights(self):
+        return [
+            x.data
+            for name, x in self.experts.named_parameters()
+            if name not in ["correction_bias"]
+        ]
+
+    def _forward_shared_experts(self, hidden_states: torch.Tensor):
+        shared_output = None
+        if self.num_shared_experts > 0:
+            shared_output = self.shared_experts(hidden_states)
+        return shared_output
+
+    def _forward_router_experts(self, hidden_states: torch.Tensor):
+        # router_logits: (num_tokens, n_experts)
+        router_logits = self.gate(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        return self.experts(hidden_states, topk_output)
+
+    def forward_normal_dual_stream(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        current_stream = torch.cuda.current_stream()
+        self.alt_stream.wait_stream(current_stream)
+        shared_output = self._forward_shared_experts(hidden_states.clone())
+
+        with torch.cuda.stream(self.alt_stream):
+            router_output = self._forward_router_experts(hidden_states)
+        current_stream.wait_stream(self.alt_stream)
+
+        return router_output, shared_output
+
+    def forward_normal(
+        self,
+        hidden_states: torch.Tensor,
+        use_reduce_scatter: bool = False,
+    ) -> torch.Tensor:
+        num_tokens, hidden_size = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_size)
+
+        if (
+            self.alt_stream is not None
+            and hidden_states.shape[0] > 0
+            and get_is_capture_mode()
+        ):
+            final_hidden_states, shared_output = self.forward_normal_dual_stream(
+                hidden_states
+            )
+        else:
+            shared_output = self._forward_shared_experts(hidden_states)
+            final_hidden_states = self._forward_router_experts(hidden_states)
+
+        if self.num_shared_experts > 0:
+            final_hidden_states = final_hidden_states + shared_output
+
+        if self.tp_size > 1 and not use_reduce_scatter:
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+        return final_hidden_states.view(num_tokens, hidden_size)
+
+    def forward_deepep(
+        self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
+    ) -> torch.Tensor:
+        shared_output = None
+        forward_mode = forward_batch.forward_mode
+        if is_non_idle_and_non_empty(forward_mode, hidden_states):
+            router_logits = self.gate(hidden_states)
+            if self.num_shared_experts > 0:
+                shared_output = self.shared_experts(hidden_states)
+
+            topk_output = self.topk(
+                hidden_states,
+                router_logits,
+                num_token_non_padded=forward_batch.num_token_non_padded,
+                expert_location_dispatch_info=ExpertLocationDispatchInfo.init_new(
+                    layer_id=self.layer_id,
+                ),
+            )
+        else:
+            topk_output = self.topk.empty_topk_output(hidden_states.device)
+
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states,
+            topk_output=topk_output,
+        )
+
+        if shared_output is not None:
+            final_hidden_states += shared_output
+        return final_hidden_states
+
+
+class LLaDA2MoeAttention(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        reduce_results: bool = True,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.total_num_heads = config.num_attention_heads
+        self.total_kv_heads = config.num_key_value_heads
+        self.dp_size = get_attention_dp_size()
+        attn_tp_rank = get_attention_tp_rank()
+        attn_tp_size = get_attention_tp_size()
+
+        assert self.total_num_heads % attn_tp_size == 0
+        if self.total_kv_heads >= attn_tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_kv_heads % attn_tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert attn_tp_size % self.total_kv_heads == 0
+        assert self.total_num_heads >= self.total_kv_heads
+
+        self.num_heads = self.total_num_heads // attn_tp_size
+        self.head_dim = config.head_dim or (self.hidden_size // self.total_num_heads)
+        self.q_size = self.head_dim * self.num_heads
+
+        self.num_kv_heads = max(1, self.total_kv_heads // attn_tp_size)
+        self.kv_size = max(1, self.num_kv_heads * self.head_dim)
+
+        self.scale = self.head_dim**-0.5
+
+        self.use_qk_norm = getattr(config, "use_qk_norm", True)
+
+        self.query_key_value = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_kv_heads,
+            bias=(config.use_bias or config.use_qkv_bias),
+            quant_config=quant_config,
+            prefix=add_prefix("query_key_value", prefix),
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+        )
+
+        if self.use_qk_norm:
+            self.query_layernorm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+            self.key_layernorm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+        self.dense = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            self.hidden_size,
+            bias=config.use_bias,
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+            prefix=add_prefix("dense", prefix),
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+        )
+
+        if hasattr(config, "partial_rotary_factor"):
+            self.rotary_dim = int(self.head_dim * config.partial_rotary_factor)
+        elif hasattr(config, "rotary_dim"):
+            self.rotary_dim = config.rotary_dim
+        else:
+            self.rotary_dim = self.head_dim
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.rotary_dim,
+            max_position=config.max_position_embeddings,
+            base=config.rope_theta,
+            rope_scaling=config.rope_scaling,
+        )
+
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scale,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            attn_type=AttentionType.ENCODER_ONLY,
+            prefix=add_prefix("attn", prefix),
+        )
+
+        self.alt_stream = alt_stream
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        if hidden_states.shape[0] == 0:
+            return hidden_states
+        qkv, _ = self.query_key_value(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        if self.use_qk_norm:
+            q, k = apply_qk_norm(
+                q=q,
+                k=k,
+                q_norm=self.query_layernorm,
+                k_norm=self.key_layernorm,
+                head_dim=self.head_dim,
+                alt_stream=self.alt_stream,
+            )
+        can_fuse_set_kv = (
+            self.head_dim == self.rotary_emb.rotary_dim
+            and enable_fused_set_kv_buffer(forward_batch)
+        )
+        q, k = self.rotary_emb(
+            positions,
+            q,
+            k,
+            fused_set_kv_buffer_arg=(
+                create_fused_set_kv_buffer_arg(
+                    value=v,
+                    layer=self.attn,
+                    forward_batch=forward_batch,
+                )
+                if can_fuse_set_kv
+                else None
+            ),
+        )
+        context_layer = self.attn(
+            q,
+            k,
+            v,
+            forward_batch,
+            save_kv_cache=not can_fuse_set_kv,
+        )
+        attn_output, _ = self.dense(context_layer)
+        return attn_output
+
+
+class LLaDA2MoeBlock(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ):
+        super().__init__()
+        hidden_size = config.hidden_size
+
+        self.input_layernorm = RMSNorm(hidden_size, eps=config.rms_norm_eps)
+        self.dp_size = get_attention_dp_size()
+        self.attention = LLaDA2MoeAttention(
+            config,
+            layer_id,
+            quant_config,
+            reduce_results=False,
+            prefix=add_prefix("attention", prefix),
+            alt_stream=alt_stream,
+        )
+        self.layer_id = layer_id
+        self.attn_tp_size = get_attention_tp_size()
+        self.attn_tp_rank = get_attention_tp_rank()
+
+        self.is_layer_sparse = self._is_layer_sparse(config, layer_id=layer_id)
+        is_previous_layer_sparse = self._is_layer_sparse(config, layer_id=layer_id - 1)
+        is_next_layer_sparse = self._is_layer_sparse(config, layer_id=layer_id + 1)
+
+        self.layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=layer_id,
+            num_layers=config.num_hidden_layers,
+            is_layer_sparse=self.is_layer_sparse,
+            is_previous_layer_sparse=is_previous_layer_sparse,
+            is_next_layer_sparse=is_next_layer_sparse,
+        )
+
+        self.is_last_layer = self.layer_id == config.num_hidden_layers - 1
+
+        if self.is_layer_sparse:
+            self.mlp = LLaDA2MoeSparseMoeBlock(
+                layer_id=layer_id,
+                config=config,
+                quant_config=quant_config,
+                alt_stream=alt_stream,
+                prefix=add_prefix("mlp", prefix),
+            )
+        else:
+            if enable_moe_dense_fully_dp():
+                mlp_tp_rank, mlp_tp_size = 0, 1
+            else:
+                mlp_tp_rank, mlp_tp_size = None, None
+            self.mlp = LLaDA2MoeMLP(
+                intermediate_size=config.intermediate_size,
+                config=config,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+                tp_rank=mlp_tp_rank,
+                tp_size=mlp_tp_size,
+            )
+
+        self.post_attention_layernorm = RMSNorm(hidden_size, eps=config.rms_norm_eps)
+
+        self.layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.layer_scatter_modes,
+            input_layernorm=self.input_layernorm,
+            post_attention_layernorm=self.post_attention_layernorm,
+            allow_reduce_scatter=True,
+        )
+
+    def _is_layer_sparse(self, config: PretrainedConfig, layer_id: int) -> bool:
+        return (
+            config.num_experts is not None and layer_id >= config.first_k_dense_replace
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        hidden_states, residual = self.layer_communicator.prepare_attn(
+            hidden_states=hidden_states,
+            residual=residual,
+            forward_batch=forward_batch,
+        )
+
+        hidden_states = self.attention(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        hidden_states, residual = self.layer_communicator.prepare_mlp(
+            hidden_states=hidden_states,
+            residual=residual,
+            forward_batch=forward_batch,
+        )
+
+        # For DP with padding, reduce scatter can be used instead of all-reduce.
+        use_reduce_scatter = self.layer_communicator.should_use_reduce_scatter(
+            forward_batch
+        )
+
+        hidden_states = self.mlp(hidden_states, forward_batch, use_reduce_scatter)
+
+        hidden_states, residual = self.layer_communicator.postprocess_layer(
+            hidden_states=hidden_states,
+            residual=residual,
+            forward_batch=forward_batch,
+        )
+
+        return hidden_states, residual
+
+
+class LLaDA2MoeModel(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        alt_stream: Optional[torch.cuda.Stream] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.vocab_size = config.vocab_size
+        self.embed_dim = config.hidden_size
+        if self.pp_group.is_first_rank:
+            self.word_embeddings = VocabParallelEmbedding(
+                self.vocab_size,
+                self.embed_dim,
+                quant_config=quant_config,
+                prefix=add_prefix("word_embeddings", prefix),
+                use_attn_tp_group=is_dp_attention_enabled(),
+            )
+        else:
+            self.word_embeddings = PPMissingLayer()
+
+        self.embedding_dropout = torch.nn.Dropout(config.embedding_dropout)
+
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: LLaDA2MoeBlock(
+                layer_id=idx,
+                config=config,
+                quant_config=quant_config,
+                prefix=prefix,
+                alt_stream=alt_stream,
+            ),
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix=add_prefix("layers", prefix),
+        )
+        if self.pp_group.is_last_rank:
+            self.norm = RMSNorm(self.embed_dim, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer(return_tuple=True)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[torch.Tensor, PPProxyTensors]:
+        if self.pp_group.is_first_rank:
+            if input_embeds is None:
+                hidden_states = self.word_embeddings(input_ids)
+            else:
+                hidden_states = input_embeds
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+
+        for i in range(self.start_layer, self.end_layer):
+            with get_global_expert_distribution_recorder().with_current_layer(i):
+                layer = self.layers[i]
+                hidden_states, residual = layer(
+                    positions,
+                    hidden_states,
+                    forward_batch,
+                    residual,
+                )
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors(
+                {
+                    "hidden_states": hidden_states,
+                    "residual": residual,
+                }
+            )
+        else:
+            if not forward_batch.forward_mode.is_idle():
+                if residual is None:
+                    hidden_states = self.norm(hidden_states)
+                else:
+                    hidden_states, _ = self.norm(hidden_states, residual)
+            return hidden_states
+
+
+class LLaDA2MoeModelLM(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.quant_config = quant_config
+        alt_stream = torch.cuda.Stream() if _is_cuda else None
+
+        self.model = LLaDA2MoeModel(
+            config,
+            quant_config,
+            alt_stream=alt_stream,
+            prefix=add_prefix("model", ""),
+        )
+
+        if config.tie_word_embeddings:
+            self.lm_head = self.model.word_embeddings
+        else:
+            # TODO something wrong with ParallelLMHead with DP attention enabled
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+                use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
+            )
+        self.logits_processor = LogitsProcessor(config, return_full_logits=True)
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer
+
+    def get_embed_and_head(self):
+        """Used by the eagle_worker."""
+        return self.model.word_embeddings.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        """Used by the eagle_worker."""
+        del self.model.word_embeddings.weight
+        del self.lm_head.weight
+        self.model.word_embeddings.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            input_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+        if self.pp_group.is_last_rank:
+            return self.logits_processor(
+                input_ids, hidden_states, self.lm_head, forward_batch
+            )
+        else:
+            return hidden_states
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.num_experts,
+        )
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if (
+                ("v_head" in name)
+                or ("inv_freq" in name)
+                or (self.config.tie_word_embeddings and "lm_head" in name)
+            ):
+                continue
+
+            if (
+                hasattr(self.config, "norm_head")
+                and self.config.norm_head
+                and "lm_head.weight" in name
+            ):
+                import torch.nn.functional as F
+
+                loaded_weight = F.normalize(loaded_weight, dim=0, p=2, eps=1e-7)
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if "mlp.experts" in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    if name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    if name not in params_dict:
+                        continue
+
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+
+        self.routed_experts_weights_of_layer = {
+            layer_id: layer.mlp.get_moe_weights()
+            for layer_id, layer in enumerate(self.model.layers)
+            if not isinstance(layer, PPMissingLayer)
+            and isinstance(layer.mlp, LLaDA2MoeSparseMoeBlock)
+        }
+
+    @classmethod
+    def get_model_config_for_expert_location(cls, config):
+        num_groups = getattr(config, "n_group", 0)
+        return ModelConfigForExpertLocation(
+            num_layers=config.num_hidden_layers,
+            num_logical_experts=config.num_experts,
+            num_groups=None if num_groups == 0 else num_groups,
+        )
+
+
+EntryClass = LLaDA2MoeModelLM
diff --git a/sglang/python/sglang/srt/models/llama.py b/sglang/python/sglang/srt/models/llama.py
new file mode 100644
index 0000000000000000000000000000000000000000..01e934dcc096c8c277ff69d6541d6f80e4ca24d7
--- /dev/null
+++ b/sglang/python/sglang/srt/models/llama.py
@@ -0,0 +1,810 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/llama.py#L1
+"""Inference-only LLaMA model compatible with HuggingFace weights."""
+
+import logging
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from transformers import LlamaConfig
+
+from sglang.srt.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer, get_layer_id
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    kv_cache_scales_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import add_prefix, is_npu, make_layers
+from sglang.utils import get_exception_traceback
+
+logger = logging.getLogger(__name__)
+_is_npu = is_npu()
+
+if _is_npu:
+    from sgl_kernel_npu.norm.split_qkv_rmsnorm_rope import split_qkv_rmsnorm_rope
+
+
+class LlamaMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        reduce_results: bool = True,
+        tp_rank: Optional[int] = None,
+        tp_size: Optional[int] = None,
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+            tp_rank=tp_rank,
+            tp_size=tp_size,
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+            reduce_results=reduce_results,
+            tp_rank=tp_rank,
+            tp_size=tp_size,
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(
+        self,
+        x,
+        forward_batch=None,
+        use_reduce_scatter: bool = False,
+    ):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(
+            x,
+            skip_all_reduce=use_reduce_scatter,
+        )
+        return x
+
+
+class LlamaAttention(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        rope_is_neox_style: bool = True,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        bias: bool = False,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        # MistralConfig has an optional head_dim introduced by Mistral-Nemo
+        self.head_dim = getattr(
+            config, "head_dim", self.hidden_size // self.total_num_heads
+        )
+        partial_rotary_factor = getattr(config, "partial_rotary_factor", 1)
+        self.rotary_dim = int(partial_rotary_factor * self.head_dim)
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.rotary_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            is_neox_style=rope_is_neox_style,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward_prepare_native(self, positions, hidden_states):
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        return q, k, v
+
+    def forward_prepare_npu(self, positions, hidden_states, forward_batch):
+        qkv, _ = self.qkv_proj(hidden_states)
+        if self.attn.layer_id == forward_batch.token_to_kv_pool.start_layer:
+            self.rotary_emb.get_cos_sin_with_position(positions)
+        q, k, v = split_qkv_rmsnorm_rope(
+            qkv,
+            self.rotary_emb.position_sin,
+            self.rotary_emb.position_cos,
+            self.q_size,
+            self.kv_size,
+            self.head_dim,
+        )
+        return q, k, v
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        if (
+            not _is_npu
+            or not hasattr(self.rotary_emb, "get_cos_sin_with_position")
+            or forward_batch.forward_mode.is_extend()
+        ):
+            q, k, v = self.forward_prepare_native(
+                positions=positions,
+                hidden_states=hidden_states,
+            )
+        else:
+            q, k, v = self.forward_prepare_npu(
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+            )
+
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class LlamaDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        if rope_scaling is not None and getattr(
+            config, "original_max_position_embeddings", None
+        ):
+            rope_scaling["original_max_position_embeddings"] = (
+                config.original_max_position_embeddings
+            )
+        rope_is_neox_style = getattr(config, "rope_is_neox_style", True)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        # Support llamafy/Qwen-Qwen2.5-7B-Instruct-llamafied with attention_bias
+        # Support internlm/internlm-7b with bias
+        attention_bias = getattr(config, "attention_bias", False) or getattr(
+            config, "bias", False
+        )
+        self.self_attn = LlamaAttention(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            rope_is_neox_style=rope_is_neox_style,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+            bias=attention_bias,
+        )
+        self.mlp = LlamaMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+class LlamaModel(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.pp_group = get_pp_group()
+        if self.pp_group.is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("embed_tokens", prefix),
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: LlamaDecoderLayer(
+                config=config, quant_config=quant_config, layer_id=idx, prefix=prefix
+            ),
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix="model.layers",
+        )
+
+        if self.pp_group.is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer(return_tuple=True)
+        self.layers_to_capture = []
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, List[torch.Tensor]], PPProxyTensors]:
+        if self.pp_group.is_first_rank:
+            if input_embeds is None:
+                hidden_states = self.embed_tokens(input_ids)
+            else:
+                hidden_states = input_embeds
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+            # FIXME(@ying): reduce the number of proxy tensors by not fusing layer norms
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+            deferred_norm = None
+
+        aux_hidden_states = []
+        for i in range(self.start_layer, self.end_layer):
+            if i in self.layers_to_capture:
+                aux_hidden_states.append(hidden_states + residual)
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors(
+                {
+                    "hidden_states": hidden_states,
+                    "residual": residual,
+                }
+            )
+        else:
+            hidden_states, _ = self.norm(hidden_states, residual)
+
+        if len(aux_hidden_states) == 0:
+            return hidden_states
+
+        return hidden_states, aux_hidden_states
+
+    # If this function is called, it should always initialize KV cache scale
+    # factors (or else raise an exception). Thus, handled exceptions should
+    # make sure to leave KV cache scale factors in a known good (dummy) state
+    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+        tp_size = get_tensor_model_parallel_world_size()
+        tp_rank = get_tensor_model_parallel_rank()
+        for layer_idx, scaling_factor in kv_cache_scales_loader(
+            quantization_param_path,
+            tp_rank,
+            tp_size,
+            self.config.num_hidden_layers,
+            self.config.__class__.model_type,
+        ):
+            if not isinstance(self.layers[layer_idx], nn.Identity):
+                layer_self_attn = self.layers[layer_idx].self_attn
+
+            if hasattr(layer_self_attn.attn, "k_scale"):
+                layer_self_attn.attn.k_scale = scaling_factor
+                layer_self_attn.attn.v_scale = scaling_factor
+            else:
+                raise RuntimeError(
+                    "Self attention has no KV cache scaling " "factor attribute!"
+                )
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        """Get input embeddings from the model."""
+        return self.embed_tokens
+
+
+class LlamaForCausalLM(nn.Module):
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    # in TP, these weights are partitioned along the column dimension (dim=-1)
+    column_parallel_weights_modules = [".down_proj.", ".o_proj."]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        ".q_proj": (".qkv_proj", 0),
+        ".k_proj": (".qkv_proj", 1),
+        ".v_proj": (".qkv_proj", 2),
+        ".gate_proj": (".gate_up_proj", 0),
+        ".up_proj": (".gate_up_proj", 1),
+    }
+
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = self._init_model(config, quant_config, add_prefix("model", prefix))
+        # Llama 3.2 1B Instruct set tie_word_embeddings to True
+        # Llama 3.1 8B Instruct set tie_word_embeddings to False
+        if self.config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+                use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
+            )
+        self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+        self.stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+
+        self.capture_aux_hidden_states = False
+
+    def _init_model(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        return LlamaModel(config, quant_config=quant_config, prefix=prefix)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = False,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> LogitsProcessorOutput:
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            input_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+
+        aux_hidden_states = None
+        if self.capture_aux_hidden_states:
+            hidden_states, aux_hidden_states = hidden_states
+
+        if self.pp_group.is_last_rank:
+            if not get_embedding:
+                return self.logits_processor(
+                    input_ids,
+                    hidden_states,
+                    self.lm_head,
+                    forward_batch,
+                    aux_hidden_states,
+                )
+            else:
+                return self.pooler(hidden_states, forward_batch)
+        else:
+            return hidden_states
+
+    @torch.no_grad()
+    def forward_split_prefill(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        split_interval: Tuple[int, int],  # [start, end) 0-based
+        input_embeds: torch.Tensor = None,
+    ) -> Optional[LogitsProcessorOutput]:
+        start, end = split_interval
+        # embed
+        if start == 0:
+            if input_embeds is None:
+                forward_batch.hidden_states = self.model.embed_tokens(input_ids)
+            else:
+                forward_batch.hidden_states = input_embeds
+        # decoder layer
+        for i in range(start, end):
+            layer = self.model.layers[i]
+            forward_batch.hidden_states, forward_batch.residual = layer(
+                positions,
+                forward_batch.hidden_states,
+                forward_batch,
+                forward_batch.residual,
+            )
+
+        if end == self.model.config.num_hidden_layers:
+            # norm
+            hidden_states, _ = self.model.norm(
+                forward_batch.hidden_states, forward_batch.residual
+            )
+            forward_batch.hidden_states = hidden_states
+            # logits process
+            result = self.logits_processor(
+                input_ids, forward_batch.hidden_states, self.lm_head, forward_batch
+            )
+        else:
+            result = None
+
+        return result
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.embed_tokens
+
+    def get_module_name_from_weight_name(self, name):
+        for param_name, weight_name, shard_id, num_shard in self.stacked_params_mapping:
+            if weight_name in name:
+                return (
+                    name.replace(weight_name, param_name)[: -len(".weight")],
+                    num_shard,
+                )
+        return name[: -len(".weight")], 1
+
+    def get_num_params(self):
+        params_dict = dict(self.named_parameters())
+        return len(params_dict)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+
+        for name, loaded_weight in weights:
+            if name.endswith(".activation_scale"):
+                name = name.replace(".activation_scale", ".input_scale")
+            if name.endswith(".weight_scale_inv"):
+                name = name.replace(".weight_scale_inv", ".weight_scale")
+
+            layer_id = get_layer_id(name)
+            if (
+                layer_id is not None
+                and hasattr(self.model, "start_layer")
+                and (
+                    layer_id < self.model.start_layer
+                    or layer_id >= self.model.end_layer
+                )
+            ):
+                continue
+            if "rotary_emb.inv_freq" in name or "projector" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            if name.startswith("model.vision_tower") and name not in params_dict:
+                continue
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+            # Handle FP8 kv-scale remapping
+            if "scale" in name:
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Skip loading kv_scale from ckpts towards new design.
+                if name.endswith(".kv_scale") and name not in params_dict:
+                    continue
+                if name in params_dict.keys():
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+                else:
+                    logger.warning(f"Parameter {name} not found in params_dict")
+
+    def get_weights_by_name(
+        self, name: str, truncate_size: int = 100, tp_size: int = 1
+    ) -> Optional[torch.Tensor]:
+        """Get the weights of the parameter by its name. Similar to `get_parameter` in Hugging Face.
+
+        Only used for unit test with an unoptimized performance.
+        For optimized performance, please use torch.save and torch.load.
+        """
+        try:
+            if name == "lm_head.weight" and self.config.tie_word_embeddings:
+                logger.info(
+                    "word embedding is tied for this model, return embed_tokens.weight as lm_head.weight."
+                )
+                return (
+                    self.model.embed_tokens.weight.cpu()
+                    .to(torch.float32)
+                    .numpy()
+                    .tolist()[:truncate_size]
+                )
+
+            mapped_name = name
+            mapped_shard_id = None
+            for param_name, weight_name, shard_id in self.stacked_params_mapping:
+                if weight_name in name:
+                    mapped_name = name.replace(weight_name, param_name)
+                    mapped_shard_id = shard_id
+                    break
+            params_dict = dict(self.named_parameters())
+            param = params_dict[mapped_name]
+            if mapped_shard_id is not None:
+                if mapped_shard_id in ["q", "k", "v"]:
+                    num_heads = self.config.num_attention_heads // tp_size
+                    num_kv_heads = self.config.num_key_value_heads // tp_size
+                    head_dim = (
+                        self.config.hidden_size // self.config.num_attention_heads
+                    )
+                    if mapped_shard_id == "q":
+                        offset = 0
+                        size = num_heads * head_dim
+                    elif mapped_shard_id == "k":
+                        offset = num_heads * head_dim
+                        size = num_kv_heads * head_dim
+                    elif mapped_shard_id == "v":
+                        offset = (num_heads + num_kv_heads) * head_dim
+                        size = num_kv_heads * head_dim
+                    weight = param.data.narrow(0, offset, size)
+                elif mapped_shard_id in [0, 1]:
+                    intermediate_size = self.config.intermediate_size
+                    slice_size = intermediate_size // tp_size
+                    if mapped_shard_id == 0:  # gate_proj
+                        offset = 0
+                        size = slice_size
+                    elif mapped_shard_id == 1:  # up_proj
+                        offset = slice_size
+                        size = slice_size
+
+                    weight = param.data.narrow(0, offset, size)
+                else:
+                    weight = param.data
+            else:
+                weight = param.data
+            if tp_size > 1 and ("o_proj" in name or "down_proj" in name):
+                gathered_weights = [torch.zeros_like(weight) for _ in range(tp_size)]
+                torch.distributed.all_gather(gathered_weights, weight)
+                weight = torch.cat(gathered_weights, dim=1)
+            return weight.cpu().to(torch.float32).numpy().tolist()[:truncate_size]
+
+        except Exception:
+            logger.error(
+                f"Error getting weights by name {name} in LlamaForCausalLM: {get_exception_traceback()}"
+            )
+            return None
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        del self.lm_head.weight
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    def get_embed(self):
+        return self.model.embed_tokens.weight
+
+    def set_embed(self, embed):
+        # NOTE: If draft hidden size != target hidden size, the embed weight cannot be shared for EAGLE3
+        if (
+            hasattr(self.config, "target_hidden_size")
+            and self.config.target_hidden_size != self.config.hidden_size
+        ):
+            return
+        del self.model.embed_tokens.weight
+        self.model.embed_tokens.weight = embed
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+        self.model.load_kv_cache_scales(quantization_param_path)
+
+    def set_eagle3_layers_to_capture(self, layer_ids: Optional[List[int]] = None):
+        if not self.pp_group.is_last_rank:
+            return
+
+        if layer_ids is None:
+            self.capture_aux_hidden_states = True
+            num_layers = self.config.num_hidden_layers
+            self.model.layers_to_capture = [2, num_layers // 2, num_layers - 3]
+        else:
+            self.capture_aux_hidden_states = True
+            # we plus 1 here because in sglang, for the ith layer, it takes the output
+            # of the (i-1)th layer as aux hidden state
+            self.model.layers_to_capture = [val + 1 for val in layer_ids]
+
+
+class Phi3ForCausalLM(LlamaForCausalLM):
+    pass
+
+
+class InternLM3ForCausalLM(LlamaForCausalLM):
+    pass
+
+
+class IQuestCoderForCausalLM(LlamaForCausalLM):
+    pass
+
+
+EntryClass = [
+    LlamaForCausalLM,
+    Phi3ForCausalLM,
+    InternLM3ForCausalLM,
+    IQuestCoderForCausalLM,
+]
diff --git a/sglang/python/sglang/srt/models/llama4.py b/sglang/python/sglang/srt/models/llama4.py
new file mode 100644
index 0000000000000000000000000000000000000000..22749a16333a78f49497ea89a3192c6d31aba783
--- /dev/null
+++ b/sglang/python/sglang/srt/models/llama4.py
@@ -0,0 +1,569 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/v0.8.3/vllm/model_executor/models/llama4.py
+"""Inference-only LLaMA model compatible with HuggingFace weights."""
+
+import logging
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from transformers import Llama4TextConfig
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from sglang.srt.model_executor.forward_batch_info import (
+    ForwardBatch,
+    ForwardMode,
+    PPProxyTensors,
+)
+from sglang.srt.models.llama import LlamaForCausalLM, LlamaMLP
+from sglang.srt.utils import (
+    add_prefix,
+    fast_topk,
+    get_compiler_backend,
+    is_cuda,
+    is_npu,
+    make_layers,
+)
+from sglang.srt.utils.common import get_current_device_stream_fast
+
+_is_cuda = is_cuda()
+_is_npu = is_npu()
+
+logger = logging.getLogger(__name__)
+
+
+class Llama4MoE(nn.Module):
+
+    @torch.compile(dynamic=True, backend=get_compiler_backend())
+    @staticmethod
+    def custom_routing_function(
+        hidden_states: torch.Tensor,
+        gating_output: torch.Tensor,
+        topk: int,
+        renormalize: bool,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        router_scores_aK, router_indices_aK = fast_topk(gating_output, topk, dim=-1)
+        router_scores_aK = torch.sigmoid(router_scores_aK.float()).to(
+            hidden_states.dtype
+        )
+        return (
+            router_scores_aK.view(-1).reshape(router_scores_aK.shape),
+            router_indices_aK.to(torch.int32),
+        )
+
+    def __init__(
+        self,
+        config: Llama4TextConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.top_k = config.num_experts_per_tok
+        self.device_module = torch.get_device_module()
+
+        intermediate_size_moe = config.intermediate_size
+        self.router = ReplicatedLinear(
+            config.hidden_size,
+            config.num_local_experts,
+            bias=False,
+            quant_config=None,
+            prefix=add_prefix("router", prefix),
+        )
+
+        self.topk = TopK(
+            top_k=self.top_k,
+            renormalize=False,
+            custom_routing_function=Llama4MoE.custom_routing_function,
+        )
+
+        self.experts = FusedMoE(
+            num_experts=config.num_local_experts,
+            hidden_size=config.hidden_size,
+            intermediate_size=intermediate_size_moe,
+            layer_id=layer_id,
+            reduce_results=False,
+            quant_config=quant_config,
+            apply_router_weight_on_input=True,
+            prefix=add_prefix("experts", prefix),
+        )
+
+        self.shared_expert = LlamaMLP(
+            hidden_size=config.hidden_size,
+            intermediate_size=intermediate_size_moe,
+            hidden_act="silu",
+            quant_config=quant_config,
+            prefix=add_prefix("shared_expert", prefix),
+            reduce_results=False,  # We need to do scatter before reduce
+        )
+
+    def forward(
+        self,
+        hidden_states,
+        forward_batch: ForwardBatch,
+        use_reduce_scatter: bool = False,
+    ):
+        shared_out, routed_out = self._forward_core(
+            hidden_states, forward_batch.forward_mode
+        )
+
+        out_aD = routed_out + shared_out
+
+        if self.tp_size > 1 and not use_reduce_scatter:
+            out_aD = tensor_model_parallel_all_reduce(out_aD)
+
+        return out_aD
+
+    def _forward_core(self, hidden_states, forward_mode: ForwardMode):
+        if _is_cuda:
+            return self._forward_core_shared_routed_overlap(hidden_states)
+        else:
+            return self._forward_core_normal(hidden_states)
+
+    def _forward_core_normal(self, hidden_states):
+        # router_scores: [num_tokens, num_experts]
+        router_logits, _ = self.router(hidden_states)
+        shared_out = self.shared_expert(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        routed_out = self.experts(hidden_states, topk_output)
+        return shared_out, routed_out
+
+    def _forward_core_shared_routed_overlap(self, hidden_states):
+        alt_stream = _get_or_create_alt_stream(self.device_module)
+
+        alt_stream.wait_stream(get_current_device_stream_fast())
+
+        shared_out = self.shared_expert(hidden_states)
+
+        with self.device_module.stream(alt_stream):
+            # router_scores: [num_tokens, num_experts]
+            router_logits, _ = self.router(hidden_states)
+            topk_output = self.topk(hidden_states, router_logits)
+            routed_out = self.experts(hidden_states, topk_output)
+        get_current_device_stream_fast().wait_stream(alt_stream)
+
+        return shared_out, routed_out
+
+
+_alt_stream = None
+
+
+def _get_or_create_alt_stream(device_module):
+    global _alt_stream
+    if _alt_stream is None:
+        _alt_stream = device_module.Stream()
+    return _alt_stream
+
+
+class Llama4Attention(nn.Module):
+
+    def __init__(
+        self,
+        config: Llama4TextConfig,
+        layer_id: int,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        bias: bool = False,
+        bias_o_proj: bool = False,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.layer_id = layer_id
+        self.hidden_size = hidden_size
+        self.use_rope = (layer_id + 1) % 4 != 0
+        self.use_qk_norm = config.use_qk_norm and self.use_rope
+
+        attn_tp_rank = get_attention_tp_rank()
+        attn_tp_size = get_attention_tp_size()
+
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % attn_tp_size == 0
+        self.num_heads = self.total_num_heads // attn_tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= attn_tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % attn_tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert attn_tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // attn_tp_size)
+        self.head_dim = config.head_dim
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.attn_temperature_tuning = config.attn_temperature_tuning
+        self.floor_scale = config.floor_scale
+        self.attn_scale = config.attn_scale
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+        self.n_rep = self.num_heads // self.num_kv_heads
+        self.qk_norm = (
+            RMSNorm(
+                hidden_size=self.head_dim,
+                eps=config.rms_norm_eps,
+                has_weight=False,
+            )
+            if self.use_qk_norm
+            else None
+        )
+
+        qkv_quant_config = quant_config
+        o_quant_config = quant_config
+        if quant_config and hasattr(quant_config, "ignore") and quant_config.ignore:
+            if add_prefix("q_proj", prefix) in quant_config.ignore:
+                qkv_quant_config = None
+            if add_prefix("o_proj", prefix) in quant_config.ignore:
+                o_quant_config = None
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size=hidden_size,
+            head_size=self.head_dim,
+            total_num_heads=self.total_num_heads,
+            total_num_kv_heads=self.total_num_kv_heads,
+            bias=bias,
+            quant_config=qkv_quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+        )
+
+        self.o_proj = RowParallelLinear(
+            input_size=self.total_num_heads * self.head_dim,
+            output_size=hidden_size,
+            bias=bias_o_proj,
+            quant_config=o_quant_config,
+            prefix=add_prefix("o_proj", prefix),
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            reduce_results=False,
+        )
+        is_neox_style = True
+        is_gguf = quant_config and quant_config.get_name() == "gguf"
+        if is_gguf and config.model_type in ["llama", "llama4"]:
+            is_neox_style = False
+
+        self.rotary_emb = (
+            get_rope(
+                self.head_dim,
+                rotary_dim=self.head_dim,
+                max_position=max_position_embeddings,
+                base=int(rope_theta),
+                rope_scaling=rope_scaling if rope_scaling != "default" else None,
+                is_neox_style=is_neox_style,
+            )
+            if self.use_rope
+            else None
+        )
+
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            prefix=add_prefix("attn", prefix),
+            use_irope=self.use_rope,
+        )
+
+    def _get_attn_scale(self, positions: torch.Tensor) -> torch.Tensor:
+        floor = torch.floor((positions + 1.0) / self.floor_scale)
+        attn_scale = torch.log(floor + 1.0) * self.attn_scale + 1.0
+        return attn_scale.unsqueeze(-1)
+
+    @torch.compile(dynamic=True, backend=get_compiler_backend())
+    def _mul_attn_scale(self, positions, q):
+        attn_scale = self._get_attn_scale(positions)
+        return (q * attn_scale).to(q.dtype)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+
+        qk, v = qkv.split([self.q_size + self.kv_size, self.kv_size], dim=-1)
+
+        if self.rotary_emb is not None:
+            q_view, k_view = qk.split([self.q_size, self.kv_size], dim=-1)
+            q_out_unused, k_out_unused = self.rotary_emb(positions, q_view, k_view)
+            if _is_npu:
+                qk = torch.cat([q_out_unused, k_out_unused], dim=-1)
+            del q_view, k_view, q_out_unused, k_out_unused
+
+        if self.qk_norm is not None:
+            # TODO there are still 2 redundant direct_copy_kernel_cuda for this `reshape` and (in attn backend) q.contiguous(), maybe we can fuse them later
+            qk = qk.reshape(-1, self.head_dim).contiguous().bfloat16()
+            qk = self.qk_norm(qk).to(torch.bfloat16)
+            qk = qk.reshape(-1, self.q_size + self.kv_size)
+
+        q, k = qk.split([self.q_size, self.kv_size], dim=-1)
+
+        # We are applying temperature tuning (https://arxiv.org/abs/2501.19399) to NoPE layers, where
+        # the inference-time temperature tuning function is customized to not affect short context
+        # while working at very long context
+        # https://arxiv.org/abs/2501.19399
+        if self.attn_temperature_tuning and not self.use_rope:
+            q = self._mul_attn_scale(positions=positions, q=q)
+
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class Llama4DecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: Llama4TextConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.layer_id = layer_id
+        self.hidden_size = config.hidden_size
+        rope_theta = config.rope_theta
+        rope_scaling = config.rope_scaling
+        max_position_embeddings = config.max_position_embeddings
+        self.attn_tp_size = get_attention_tp_size()
+        self.attn_tp_rank = get_attention_tp_rank()
+
+        self.self_attn = Llama4Attention(
+            config=config,
+            layer_id=layer_id,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            bias=False,
+            bias_o_proj=False,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.config = config
+        is_moe_layer = self._is_moe_layer(layer_id)
+        is_previous_moe_layer = self._is_moe_layer(layer_id - 1)
+        is_next_moe_layer = self._is_moe_layer(layer_id + 1)
+
+        if is_moe_layer:
+            self.feed_forward = Llama4MoE(
+                config=config,
+                layer_id=layer_id,
+                quant_config=quant_config,
+                prefix=add_prefix("feed_forward", prefix),
+            )
+        else:
+            self.feed_forward = LlamaMLP(
+                hidden_size=self.hidden_size,
+                intermediate_size=config.intermediate_size_mlp,
+                hidden_act="silu",
+                quant_config=quant_config,
+                prefix=add_prefix("feed_forward", prefix),
+            )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+        self.layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=layer_id,
+            num_layers=config.num_hidden_layers,
+            is_layer_sparse=is_moe_layer,
+            is_previous_layer_sparse=is_previous_moe_layer,
+            is_next_layer_sparse=is_next_moe_layer,
+        )
+
+        self.layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.layer_scatter_modes,
+            input_layernorm=self.input_layernorm,
+            post_attention_layernorm=self.post_attention_layernorm,
+            allow_reduce_scatter=True,
+        )
+
+    def _is_moe_layer(self, layer_id: int) -> bool:
+        if self.config.interleave_moe_layer_step == 0:
+            return self.config.num_local_experts > 0
+        return (layer_id + 1) % self.config.interleave_moe_layer_step == 0
+
+    def get_intermediate_size(self) -> int:
+        if isinstance(self.feed_forward, Llama4MoE):
+            return self.config.intermediate_size
+        else:
+            return self.config.intermediate_size_mlp
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        hidden_states, residual = self.layer_communicator.prepare_attn(
+            hidden_states, residual, forward_batch
+        )
+
+        if hidden_states.shape[0] != 0:
+            hidden_states = self.self_attn(
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+            )
+
+        hidden_states, residual = self.layer_communicator.prepare_mlp(
+            hidden_states, residual, forward_batch
+        )
+
+        # For DP with padding, reduce scatter can be used instead of all-reduce.
+        use_reduce_scatter = self.layer_communicator.should_use_reduce_scatter(
+            forward_batch
+        )
+
+        # Fully Connected
+        hidden_states = self.feed_forward(
+            hidden_states, forward_batch, use_reduce_scatter
+        )
+        hidden_states, residual = self.layer_communicator.postprocess_layer(
+            hidden_states, residual, forward_batch
+        )
+
+        return hidden_states, residual
+
+
+class Llama4Model(nn.Module):
+    def __init__(
+        self,
+        config: Llama4TextConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("embed_tokens", prefix),
+            use_attn_tp_group=is_dp_attention_enabled(),
+        )
+        self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: Llama4DecoderLayer(
+                config=config, layer_id=idx, quant_config=quant_config, prefix=prefix
+            ),
+            prefix=add_prefix("layers", prefix),
+        )
+
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.layers_to_capture = []
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, List[torch.Tensor]]]:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+        residual = None
+        aux_hidden_states = []
+        for i in range(len(self.layers)):
+            if i in self.layers_to_capture:
+                aux_hidden_states.append(hidden_states + residual)
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        if not forward_batch.forward_mode.is_idle():
+            hidden_states, _ = self.norm(hidden_states, residual)
+
+        if len(aux_hidden_states) == 0:
+            return hidden_states
+
+        return hidden_states, aux_hidden_states
+
+
+class Llama4ForCausalLM(LlamaForCausalLM):
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
+
+    def __init__(
+        self,
+        config: Llama4TextConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__(config, quant_config, prefix)
+
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+
+    def get_layers(self):
+        return self.model.layers
+
+    def _init_model(
+        self,
+        config: Llama4TextConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        return Llama4Model(config, quant_config=quant_config, prefix=prefix)
+
+
+EntryClass = [Llama4ForCausalLM]
diff --git a/sglang/python/sglang/srt/models/llama_classification.py b/sglang/python/sglang/srt/models/llama_classification.py
new file mode 100644
index 0000000000000000000000000000000000000000..8387d20300b4264b97763857a3024799a2f10bcb
--- /dev/null
+++ b/sglang/python/sglang/srt/models/llama_classification.py
@@ -0,0 +1,81 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import LlamaConfig
+
+from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.llama import LlamaForCausalLM, LlamaModel
+from sglang.srt.utils import add_prefix
+
+
+class LlamaForClassification(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = LlamaModel(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+
+        self.classification_head = nn.Linear(
+            config.hidden_size, config.classification_out_size, bias=False
+        )
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=False)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = True,
+    ) -> EmbeddingPoolerOutput:
+        assert (
+            get_embedding
+        ), "LlamaForClassification is only used for embedding. Please add --is-embedding when you launch the server."
+
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        last_token_hidden = self.pooler(hidden_states, forward_batch).embeddings
+        scores = self.classification_head(last_token_hidden)
+
+        return EmbeddingPoolerOutput(scores)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        params_dict = dict(self.named_parameters())
+
+        for name, loaded_weight in weights:
+            if "classification_head" in name:
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            elif "lm_head" in name:
+                continue
+            else:
+                LlamaForCausalLM.load_weights(self, [(name, loaded_weight)])
+
+
+EntryClass = LlamaForClassification
diff --git a/sglang/python/sglang/srt/models/llama_eagle.py b/sglang/python/sglang/srt/models/llama_eagle.py
new file mode 100644
index 0000000000000000000000000000000000000000..881731fdf2f3f135aea24d76c0193ab3cf687606
--- /dev/null
+++ b/sglang/python/sglang/srt/models/llama_eagle.py
@@ -0,0 +1,149 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from sglang.srt.utils import add_prefix
+
+# Adapted from
+# https://github.com/SafeAILab/EAGLE/blob/main/eagle/model/cnets.py
+"""Inference-only LLaMA-EAGLE model compatible with HuggingFace weights."""
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import LlamaConfig
+
+from sglang.srt.distributed import get_pp_group
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.models.llama import LlamaDecoderLayer, LlamaForCausalLM
+
+
+class LlamaDecoderLayer(LlamaDecoderLayer):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(config, layer_id, quant_config, prefix)
+
+        # Skip the input_layernorm
+        # https://github.com/SafeAILab/EAGLE/blob/35c78f6cdc19a73e05cf5c330b4c358dad970c6a/eagle/model/cnets.py#L427
+        if layer_id == 0:
+            del self.input_layernorm
+            setattr(self, "input_layernorm", lambda x: x)
+
+
+class LlamaModel(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+        self.layers = nn.ModuleList(
+            [
+                LlamaDecoderLayer(
+                    config,
+                    i,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{i}", prefix),
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.fc = torch.nn.Linear(config.hidden_size * 2, config.hidden_size)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+
+        hidden_states = self.fc(
+            torch.cat((hidden_states, forward_batch.spec_info.hidden_states), dim=-1)
+        )
+
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        return hidden_states + residual
+
+
+class LlamaForCausalLMEagle(LlamaForCausalLM):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        nn.Module.__init__(self)
+        self.config = config
+        self.quant_config = quant_config
+        self.pp_group = get_pp_group()
+        self.model = LlamaModel(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        # Llama 3.2 1B Instruct set tie_word_embeddings to True
+        # Llama 3.1 8B Instruct set tie_word_embeddings to False
+        if self.config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                getattr(config, "hot_vocab_size", config.vocab_size),
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+            )
+
+        self.logits_processor = LogitsProcessor(config)
+        self.capture_aux_hidden_states = False
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        for name, loaded_weight in weights:
+            if "lm_head" not in name:
+                name = "model." + name
+                super().load_weights([(name, loaded_weight)])
+
+
+EntryClass = [LlamaForCausalLMEagle]
diff --git a/sglang/python/sglang/srt/models/llama_eagle3.py b/sglang/python/sglang/srt/models/llama_eagle3.py
new file mode 100644
index 0000000000000000000000000000000000000000..49f938a1c5fe188aafe63218422c2d526652bdaa
--- /dev/null
+++ b/sglang/python/sglang/srt/models/llama_eagle3.py
@@ -0,0 +1,276 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from sglang.srt.utils import add_prefix
+
+# Adapted from
+# https://github.com/SafeAILab/EAGLE/blob/main/eagle/model/cnets.py
+"""Inference-only LLaMA-EAGLE model compatible with HuggingFace weights."""
+
+import copy
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import LlamaConfig
+
+from sglang.srt.distributed import get_pp_group
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import QKVParallelLinear
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.llama import LlamaDecoderLayer, LlamaForCausalLM, LlamaMLP
+
+
+class LlamaDecoderLayer(LlamaDecoderLayer):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(config, layer_id, quant_config, prefix)
+
+        # override qkv
+        self.self_attn.qkv_proj = QKVParallelLinear(
+            2 * self.hidden_size,
+            self.self_attn.head_dim,
+            self.self_attn.total_num_heads,
+            self.self_attn.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+
+        if config.model_type == "llama4_text":
+            inter_size = config.intermediate_size_mlp
+        else:
+            inter_size = config.intermediate_size
+
+        self.mlp = LlamaMLP(
+            config.hidden_size, inter_size, config.hidden_act, quant_config, prefix
+        )
+
+        self.hidden_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        embeds: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+
+        residual = hidden_states
+        embeds = self.input_layernorm(embeds)
+        hidden_states = self.hidden_norm(hidden_states)
+
+        hidden_states = torch.cat([embeds, hidden_states], dim=-1)
+        # Self Attention
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+
+        # Fully Connected
+        hidden_states = self.mlp(hidden_states)
+
+        return hidden_states, residual
+
+
+class LlamaModel(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+
+        self.is_mrope_enabled = (
+            hasattr(config, "rope_scaling")
+            and config.rope_scaling is not None
+            and "mrope_section" in config.rope_scaling
+        )
+        # fix rope_scaling for qwen2.5-vl
+        if self.is_mrope_enabled:
+            config.rope_scaling["rope_type"] = "default"
+
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+
+        if hasattr(config, "target_hidden_size"):
+            self.hidden_size_in = config.target_hidden_size
+        else:
+            self.hidden_size_in = config.hidden_size
+
+        self.fc = torch.nn.Linear(
+            self.hidden_size_in * 3,
+            config.hidden_size,
+            bias=getattr(config, "bias", False),
+        )
+
+        self.midlayer = LlamaDecoderLayer(config, 0, quant_config, prefix)
+
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            embeds = self.embed_tokens(input_ids)
+        else:
+            embeds = input_embeds
+
+        if self.is_mrope_enabled:
+            positions = forward_batch.mrope_positions
+
+        hidden_states = forward_batch.spec_info.hidden_states
+        if hidden_states.shape[-1] != embeds.shape[-1]:
+            hidden_states = self.fc(hidden_states)
+
+        # idle batch
+        if hidden_states.shape[0] == 0:
+            return hidden_states, [hidden_states]
+
+        residual = None
+        hidden_states, residual = self.midlayer(
+            positions,
+            embeds,
+            hidden_states,
+            forward_batch,
+            residual,
+        )
+
+        hidden_states_to_logits, hidden_states_to_aux = self.norm(
+            hidden_states, residual
+        )
+
+        # For draft decode, we capture the hidden state before norm
+        return hidden_states_to_logits, [hidden_states_to_aux]
+
+
+class LlamaForCausalLMEagle3(LlamaForCausalLM):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        nn.Module.__init__(self)
+        self.config = config
+        self.quant_config = quant_config
+        self.pp_group = get_pp_group()
+
+        if self.config.num_hidden_layers != 1:
+            raise ValueError("EAGLE3 currently only supports 1 layer")
+
+        self.model = LlamaModel(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        # Llama 3.2 1B Instruct set tie_word_embeddings to True
+        # Llama 3.1 8B Instruct set tie_word_embeddings to False
+        self.load_lm_head_from_target = False
+        if self.config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            if config.draft_vocab_size is None:
+                self.load_lm_head_from_target = True
+                config.draft_vocab_size = config.vocab_size
+            self.lm_head = ParallelLMHead(
+                config.draft_vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+            )
+
+        config_ = copy.deepcopy(config)
+        config_.vocab_size = (
+            config_.draft_vocab_size
+        )  # draft logits processor has it's own vocab size
+        self.logits_processor = LogitsProcessor(config_)
+
+        self.capture_aux_hidden_states = True
+        self.hot_token_id = None
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> None:
+        params_dict = dict(self.named_parameters())
+        # Define the parameter mapping for stacked parameters
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+
+        for name, loaded_weight in weights:
+            if "d2t" in name:
+                # d2t stores diffs between draft id and target id
+                self.hot_token_id = loaded_weight + torch.arange(loaded_weight.shape[0])
+                continue
+
+            if "t2d" in name:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                param_name = f"model.{name}" if name not in params_dict else name
+                if param_name in params_dict:
+                    param = params_dict[param_name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Handle regular parameters
+                param_name = name if name in params_dict else f"model.{name}"
+                if param_name in params_dict:
+                    param = params_dict[param_name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+
+    def get_hot_token_id(self):
+        return self.hot_token_id
+
+
+EntryClass = [LlamaForCausalLMEagle3]
diff --git a/sglang/python/sglang/srt/models/llama_embedding.py b/sglang/python/sglang/srt/models/llama_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba448f7fcd3bf615330b461c9ae922dfe771395c
--- /dev/null
+++ b/sglang/python/sglang/srt/models/llama_embedding.py
@@ -0,0 +1,87 @@
+from typing import Iterable, Tuple
+
+import torch
+from torch import nn
+from transformers import LlamaConfig
+
+from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType
+from sglang.srt.model_executor.model_runner import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.llama import LlamaModel
+from sglang.srt.utils import add_prefix
+
+
+class LlamaEmbeddingModel(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config=None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.model = LlamaModel(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = True,
+    ) -> EmbeddingPoolerOutput:
+        assert (
+            get_embedding
+        ), "LlamaEmbeddingModel / MistralModel is only used for embedding"
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        return self.pooler(hidden_states, forward_batch)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.model.named_parameters())
+
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name or "projector" in name:
+                return
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                return
+            if name.startswith("model.vision_tower") and name not in params_dict:
+                return
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    return
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+class MistralModel(LlamaEmbeddingModel):
+    pass
+
+
+EntryClass = [LlamaEmbeddingModel, MistralModel]
diff --git a/sglang/python/sglang/srt/models/llama_reward.py b/sglang/python/sglang/srt/models/llama_reward.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f78dfa1bb3b48a9f0be4298e2fd15d25d281684
--- /dev/null
+++ b/sglang/python/sglang/srt/models/llama_reward.py
@@ -0,0 +1,126 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import LlamaConfig
+
+from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.llama import LlamaForCausalLM, LlamaModel
+from sglang.srt.utils import add_prefix
+
+
+class LlamaForSequenceClassification(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.num_labels = config.num_labels
+        self.model = LlamaModel(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=False)
+
+        self.eos_token_id = config.eos_token_id
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = True,
+    ) -> EmbeddingPoolerOutput:
+        assert (
+            get_embedding
+        ), "LlamaForSequenceClassification is only used for embedding"
+
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        last_token_hidden = self.pooler(hidden_states, forward_batch).embeddings
+        scores = self.score(last_token_hidden)
+
+        return EmbeddingPoolerOutput(scores)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        return LlamaForCausalLM.load_weights(self, weights)
+
+
+class LlamaForSequenceClassificationWithNormal_Weights(LlamaForSequenceClassification):
+    class Weights(torch.nn.Module):
+        def __init__(self, hidden_size, num_label):
+            super().__init__()
+            self.fc = torch.nn.Sequential(
+                torch.nn.Linear(hidden_size, hidden_size, dtype=torch.float16),
+                torch.nn.SELU(),
+                torch.nn.Linear(hidden_size, hidden_size, dtype=torch.float16),
+                torch.nn.SELU(),
+                torch.nn.Linear(hidden_size, num_label // 2, dtype=torch.float16),
+            )
+
+        def forward(self, x):
+            return self.fc(x.to(torch.float16))
+
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(config, quant_config, prefix=prefix)
+        self.weights = self.Weights(config.hidden_size, self.num_labels)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = True,
+    ) -> EmbeddingPoolerOutput:
+        assert (
+            get_embedding
+        ), "LlamaForSequenceClassification is only used for embedding"
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        logits = self.score(hidden_states)
+        weights = self.weights(hidden_states)
+
+        pooled_logits = self.pooler(logits, forward_batch).embeddings
+        pooled_weights = self.pooler(weights, forward_batch).embeddings
+
+        rews = pooled_logits.view(-1, self.num_labels // 2, 2)[:, :, 0].view(
+            -1, self.num_labels // 2
+        )
+        scores = (rews * pooled_weights).sum(dim=-1).view(-1, 1)
+        return EmbeddingPoolerOutput(scores)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        return super().load_weights(weights)
+
+
+EntryClass = [
+    LlamaForSequenceClassification,
+    LlamaForSequenceClassificationWithNormal_Weights,
+]
diff --git a/sglang/python/sglang/srt/models/llava.py b/sglang/python/sglang/srt/models/llava.py
new file mode 100644
index 0000000000000000000000000000000000000000..c76312c0a833afb997f738e19c3743a85f543416
--- /dev/null
+++ b/sglang/python/sglang/srt/models/llava.py
@@ -0,0 +1,845 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Inference-only LLaVa model compatible with HuggingFace weights."""
+
+import math
+import re
+from functools import lru_cache
+from typing import Dict, Iterable, List, Optional, Tuple, Type, Union
+
+import numpy as np
+import torch
+from torch import nn
+from transformers import (
+    CLIPVisionConfig,
+    CLIPVisionModel,
+    LlavaConfig,
+    MistralConfig,
+    Qwen2Config,
+    SiglipVisionModel,
+)
+from transformers.models.auto.modeling_auto import AutoModel, AutoModelForCausalLM
+from transformers.models.llava.modeling_llava import LlavaMultiModalProjector
+
+# leave till last and symbol only in case circular import
+import sglang.srt.models as sgl_models
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.mm_utils import general_mm_embed_routine
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.llama import LlamaForCausalLM
+from sglang.srt.models.mistral import MistralForCausalLM
+from sglang.srt.models.qwen2 import Qwen2ForCausalLM
+from sglang.srt.multimodal.mm_utils import (
+    get_anyres_image_grid_shape,
+    unpad_image,
+    unpad_image_shape,
+)
+from sglang.srt.utils import add_prefix, flatten_nested_list, logger
+
+
+class LlavaBaseForCausalLM(nn.Module):
+    def pad_input_ids(self, input_ids: List[int], image_inputs: MultimodalInputs):
+        image_sizes = flatten_nested_list(
+            [item.image_sizes for item in image_inputs.mm_items]
+        )
+
+        pad_values = [item.pad_value for item in image_inputs.mm_items]
+
+        # hardcode for spatial_unpad + anyres
+        if any(
+            item.modality == Modality.MULTI_IMAGES or item.modality == Modality.VIDEO
+            for item in image_inputs.mm_items
+        ):
+            image_aspect_ratio = "pad"
+        else:
+            image_aspect_ratio = "anyres"
+        offset_list = []
+        image_inputs.image_pad_len = []
+        for image_idx, image_s in enumerate(image_sizes):
+            if len(image_sizes) > 16:
+                # 2x2 pooling with stride 2
+                new_image_feature_len = (
+                    math.ceil(self.image_size / self.patch_size / 2) ** 2
+                )
+            else:
+                new_image_feature_len = self.image_feature_len  # multi-image
+
+            height = width = self.num_patches_per_side
+            if "anyres" in image_aspect_ratio:
+                num_patch_width, num_patch_height = get_anyres_image_grid_shape(
+                    image_s,
+                    self.image_grid_pinpoints,
+                    self.vision_tower.config.image_size,
+                )
+                h = num_patch_height * height
+                w = num_patch_width * width
+                new_h, new_w = unpad_image_shape(h, w, image_s)
+
+                if "anyres_max" in self.config.image_aspect_ratio:
+                    matched_anyres_max_num_patches = re.match(
+                        r"anyres_max_(\d+)", self.config.image_aspect_ratio
+                    )
+                    if matched_anyres_max_num_patches:
+                        max_num_patches = int(matched_anyres_max_num_patches.group(1))
+                    # times = math.sqrt(h * w / (max_num_patches * unit**2))
+                    times = math.sqrt(
+                        new_h * new_w / (max_num_patches * self.image_feature_len)
+                    )
+                    if times > 1.1:
+                        new_h = int(new_h // times)
+                        new_w = int(new_w // times)
+                new_image_feature_len += new_h * (new_w + 1)
+
+            try:
+                offset = input_ids.index(self.config.image_token_index)
+            except ValueError:
+                offset = 0
+            # old_len + pad_len - 1, because we need to remove image_token_id
+            input_ids = (
+                input_ids[:offset]
+                + [pad_values[image_idx % len(pad_values)]] * new_image_feature_len
+                + input_ids[offset + 1 :]
+            )
+            offset_list.append(offset)
+            image_inputs.image_pad_len.append(new_image_feature_len)
+
+        image_inputs.image_offsets = offset_list
+        return input_ids
+
+    def encode_images(
+        self, pixel_values: Union[torch.Tensor, List[torch.Tensor]]
+    ) -> torch.Tensor:
+        """
+        encode images by vision tower and multimodal projector
+        Args:
+            pixel_values: torch.Tensor or List[torch.Tensor]: each tensor for an input image
+        Returns:
+            torch.Tensor: encoded image features from the input image; if multiple, flattened by seq_len axis
+        """
+        image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
+        # NOTE: This is not memory efficient. (output_hidden_states=True) will save all the hidden stated.
+        selected_image_feature = image_outputs.hidden_states[self.vision_feature_layer]
+        if self.vision_feature_select_strategy in ["default", "patch"]:
+            selected_image_feature = selected_image_feature[:, 1:]
+        elif self.vision_feature_select_strategy == "full":
+            selected_image_feature = selected_image_feature
+        else:
+            raise ValueError(
+                f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}"
+            )
+        image_features = self.multi_modal_projector(selected_image_feature)
+        return image_features
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        image_inputs = forward_batch.mm_inputs
+
+        if forward_batch.forward_mode.is_extend():
+            # Clamp input ids. This is because the input_ids for the image tokens are
+            # filled with the hash values of the image for the prefix matching in the radix attention.
+            # There values are useless because their embeddings will be replaced by vision embeddings anyway.
+            input_ids.clamp_(min=0, max=self.config.vocab_size - 1)
+
+            # Embed text inputs
+            input_embeds = self.language_model.model.embed_tokens(input_ids)
+
+            # Got List[List[str]] extend it to List[str]
+            # The length of the List should be equal to batch size
+            modalities_list = []
+            max_image_offset = []
+            for im in image_inputs:
+                if im:
+                    modalities_list.extend([item.modality for item in im.mm_items])
+                if im and im.image_offsets:
+                    max_image_offset.append(
+                        np.max(np.array(im.image_offsets) + np.array(im.image_pad_len))
+                    )
+                else:
+                    max_image_offset.append(-1)
+
+            start_positions = positions[forward_batch.extend_start_loc].cpu().numpy()
+            need_vision = start_positions <= np.array(max_image_offset)
+
+            if need_vision.any():
+                bs = forward_batch.batch_size
+                pixel_values = flatten_nested_list(
+                    [
+                        [item.feature for item in image_inputs[i].mm_items]
+                        for i in range(bs)
+                        if need_vision[i]
+                    ]
+                )
+                image_sizes = [
+                    flatten_nested_list(
+                        [item.image_sizes for item in image_inputs[i].mm_items]
+                    )
+                    for i in range(bs)
+                    if need_vision[i]
+                ]
+
+                ########## Encode Image ########
+
+                if pixel_values[0].ndim == 4:
+                    # llava-hd: BS, num_patch, C=3, H=336, W=336, num_patch obtained from process_images
+                    np.concatenate(pixel_values, axis=0)
+                    # ndim=4
+                    concat_images = torch.tensor(
+                        np.concatenate(pixel_values, axis=0),
+                        device=self.vision_tower.device,
+                    )
+                    image_features = self.encode_images(concat_images)
+                    split_sizes = [image.shape[0] for image in pixel_values]
+                    image_features = torch.split(image_features, split_sizes, dim=0)
+                    # hd image_features: BS, num_patch, 576, 4096
+                else:
+                    # normal pixel: BS, C=3, H=336, W=336
+                    pixel_values = torch.tensor(
+                        np.array(pixel_values), device=self.vision_tower.device
+                    )
+                    image_features = self.encode_images(pixel_values)
+                    # image_features: BS, 576, 4096
+
+                if self.mm_patch_merge_type.startswith("spatial"):
+                    new_image_features = []
+                    height = width = self.num_patches_per_side
+                    for image_idx, image_feature in enumerate(image_features):
+                        if modalities_list[image_idx] == Modality.IMAGE:
+                            image_aspect_ratio = (
+                                self.config.image_aspect_ratio
+                            )  # single image
+                        elif (
+                            modalities_list[image_idx] == Modality.MULTI_IMAGES
+                            or modalities_list[image_idx] == Modality.VIDEO
+                        ):
+                            image_aspect_ratio = "pad"  # multi image
+                        # image_aspect_ratio = (
+                        #     "anyres" if len(image_sizes[image_idx]) == 1 else "pad"
+                        # )
+                        if (
+                            image_feature.shape[0] > 1
+                            and "anyres" in image_aspect_ratio
+                            and modalities_list[image_idx] == Modality.IMAGE
+                        ):
+                            base_image_feature = image_feature[0]
+                            image_feature = image_feature[1:]
+                            assert height * width == base_image_feature.shape[0]
+
+                            if "anyres_max" in image_aspect_ratio:
+                                matched_anyres_max_num_patches = re.match(
+                                    r"anyres_max_(\d+)", image_aspect_ratio
+                                )
+                                if matched_anyres_max_num_patches:
+                                    max_num_patches = int(
+                                        matched_anyres_max_num_patches.group(1)
+                                    )
+
+                            if (
+                                image_aspect_ratio == "anyres"
+                                or "anyres_max" in image_aspect_ratio
+                            ):
+                                vision_tower_image_size = self.image_size
+                                try:
+                                    num_patch_width, num_patch_height = (
+                                        get_anyres_image_grid_shape(
+                                            image_sizes[image_idx][0],
+                                            self.config.image_grid_pinpoints,
+                                            vision_tower_image_size,
+                                        )
+                                    )
+                                except Exception as e:
+                                    print(f"Error: {e}")
+                                    num_patch_width, num_patch_height = 2, 2
+                                image_feature = image_feature.view(
+                                    num_patch_height, num_patch_width, height, width, -1
+                                )
+                            else:
+                                image_feature = image_feature.view(
+                                    2, 2, height, width, -1
+                                )
+
+                            # (
+                            #     num_patch_width,
+                            #     num_patch_height,
+                            # ) = get_anyres_image_grid_shape(
+                            #     image_sizes[image_idx][0],
+                            #     self.image_grid_pinpoints,
+                            #     self.vision_tower.config.image_size,
+                            # )
+
+                            # image_feature = image_feature.view(
+                            #     num_patch_height, num_patch_width, height, width, -1
+                            # )
+
+                            if "unpad" in self.mm_patch_merge_type:
+                                unit = image_feature.shape[2]
+                                image_feature = image_feature.permute(
+                                    4, 0, 2, 1, 3
+                                ).contiguous()
+                                image_feature = image_feature.flatten(1, 2).flatten(
+                                    2, 3
+                                )
+                                image_feature = unpad_image(
+                                    image_feature, image_sizes[image_idx][0]
+                                )
+                                if (
+                                    "anyres_max" in image_aspect_ratio
+                                    and matched_anyres_max_num_patches
+                                ):
+                                    c, h, w = image_feature.shape
+                                    times = math.sqrt(
+                                        h * w / (max_num_patches * unit**2)
+                                    )
+                                    if times > 1.1:
+                                        image_feature = image_feature[None]
+                                        image_feature = nn.functional.interpolate(
+                                            image_feature,
+                                            [int(h // times), int(w // times)],
+                                            mode="bilinear",
+                                        )[0]
+                                image_feature = torch.cat(
+                                    (
+                                        image_feature,
+                                        self.language_model.model.image_newline[
+                                            :, None, None
+                                        ].expand(*image_feature.shape[:-1], 1),
+                                    ),
+                                    dim=-1,
+                                )
+                                image_feature = image_feature.flatten(1, 2).transpose(
+                                    0, 1
+                                )
+                            else:
+                                image_feature = image_feature.permute(
+                                    0, 2, 1, 3, 4
+                                ).contiguous()
+                                image_feature = image_feature.flatten(0, 3)
+                            image_feature = torch.cat(
+                                (base_image_feature, image_feature), dim=0
+                            )
+                            image_feature = image_feature.unsqueeze(0)
+                        else:
+                            if modalities_list[image_idx] == Modality.VIDEO:  # video
+                                # 2x2 pooling
+                                num_of_frames = image_feature.shape[0]
+                                image_feature = image_feature.view(
+                                    num_of_frames, height, width, -1
+                                )
+                                image_feature = image_feature.permute(
+                                    0, 3, 1, 2
+                                ).contiguous()  # N, C, H, W
+                                height, weight = image_feature.shape[2:]
+                                scaled_shape = [
+                                    math.ceil(height / 2),
+                                    math.ceil(weight / 2),
+                                ]
+                                image_feature = nn.functional.interpolate(
+                                    image_feature, size=scaled_shape, mode="bilinear"
+                                )
+                                image_feature = (
+                                    image_feature.flatten(2)
+                                    .transpose(1, 2)
+                                    .contiguous()
+                                )  # N, C, H*W
+                            if "unpad" in self.mm_patch_merge_type:
+                                image_feature = torch.cat(
+                                    (
+                                        image_feature,
+                                        # Expand to (bs, 1, hidden_dim) and concat at the end of the image tokens
+                                        self.language_model.model.image_newline[
+                                            None, None
+                                        ].expand(
+                                            image_feature.shape[0],
+                                            1,
+                                            image_feature.shape[-1],
+                                        ),
+                                    ),
+                                    dim=1,
+                                )
+
+                        new_image_features.append(image_feature)
+                    image_features = new_image_features
+
+                # Fill in the placeholder for the image
+                extend_start_loc_cpu = forward_batch.extend_start_loc.cpu().numpy()
+                extend_seq_lens = forward_batch.extend_seq_lens.cpu().numpy()
+                prefix_lens_cpu = forward_batch.extend_prefix_lens_cpu
+                pt = 0
+                for i in range(bs):
+                    if not need_vision[i]:
+                        continue
+
+                    start_idx = extend_start_loc_cpu[i]
+                    seq_len = extend_seq_lens[i]
+                    prefix_len = prefix_lens_cpu[i]
+
+                    # Multiple images
+                    for image_idx, image_offset in enumerate(
+                        image_inputs[i].image_offsets
+                    ):
+                        if (
+                            image_offset + image_inputs[i].image_pad_len[image_idx]
+                            <= prefix_len
+                        ):
+                            continue
+                        if image_offset >= prefix_len + seq_len:
+                            break
+
+                        tmp_image_feature = image_features[pt][image_idx]
+                        pad_len = tmp_image_feature.shape[0]
+
+                        input_offset = image_offset - prefix_len
+                        left_idx = start_idx + input_offset
+                        right_idx = left_idx + pad_len
+                        assert right_idx > start_idx
+                        if input_offset < 0:
+                            left_idx = start_idx
+                            tmp_image_feature = tmp_image_feature[-input_offset:]
+                        if right_idx > start_idx + seq_len:
+                            tmp_image_feature = tmp_image_feature[
+                                : start_idx + seq_len - right_idx
+                            ]
+                            right_idx = start_idx + seq_len
+                        try:
+                            input_embeds[left_idx:right_idx] = tmp_image_feature
+                        except RuntimeError as e:
+                            print(f"RuntimeError in image encoding: {e}")
+                            print(f"{input_embeds.shape=}, {tmp_image_feature.shape=}")
+                            print(
+                                f"{start_idx=}, {image_offset=}, {prefix_len=}, {pad_len=}"
+                            )
+                    pt += 1
+
+            return self.language_model(
+                input_ids, positions, forward_batch, input_embeds=input_embeds
+            )
+        elif forward_batch.forward_mode.is_decode():
+            return self.language_model(input_ids, positions, forward_batch)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        # Load clip vision model by cfg['mm_vision_tower']:
+        # huggingface_name or path_of_clip_relative_to_llava_model_dir
+        # We put the initialization here instead of __init__ to allow it being reused by other subclasses.
+        vision_path = self.config.mm_vision_tower
+        if "clip" in vision_path:
+            self.vision_tower = CLIPVisionModel.from_pretrained(
+                vision_path, torch_dtype=torch.float16
+            ).cuda()
+        elif "siglip" in vision_path:
+            self.vision_tower = SiglipVisionModel.from_pretrained(
+                vision_path, torch_dtype=torch.float16
+            ).cuda()
+            # Siglip needs all feature tokens
+            self.config.mm_vision_select_feature = "full"
+        self.vision_tower.eval()
+
+        self.vision_feature_layer = self.config.mm_vision_select_layer
+        self.vision_feature_select_strategy = self.config.mm_vision_select_feature
+        self.image_size = self.vision_tower.config.image_size
+        self.patch_size = self.vision_tower.config.patch_size
+
+        self.mm_patch_merge_type = getattr(self.config, "mm_patch_merge_type", "flat")
+        self.image_aspect_ratio = getattr(self.config, "image_aspect_ratio", "square")
+        self.image_grid_pinpoints = getattr(self.config, "image_grid_pinpoints", None)
+
+        self.image_feature_len = int((self.image_size // self.patch_size) ** 2)
+        if (
+            self.vision_feature_select_strategy == "patch"
+            or self.vision_feature_select_strategy == "full"
+        ):
+            pass
+        elif self.vision_feature_select_strategy == "cls_patch":
+            self.image_feature_len += 1
+        else:
+            raise ValueError(f"Unexpected select feature: {self.select_feature}")
+
+        # load mm_projector
+        projector_weights = {
+            "model.mm_projector.0": "multi_modal_projector.linear_1",
+            "model.mm_projector.2": "multi_modal_projector.linear_2",
+            "model.vision_tower.vision_tower": "vision_tower",
+            # Update the vision tower weights if we find them in the checkpoint (it may be finetuned).
+            "model.image_newline": "language_model.model.image_newline",
+        }
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "projector" in name or "vision_tower" in name or "image_newline" in name:
+                for weight_name, param_name in projector_weights.items():
+                    if weight_name in name:
+                        name = name.replace(weight_name, param_name)
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            else:
+                self.language_model.load_weights([(name, loaded_weight)])
+
+    @property
+    def num_patches_per_side(self):
+        return self.image_size // self.patch_size
+
+
+class LlavaLlamaForCausalLM(LlavaBaseForCausalLM):
+    def __init__(
+        self,
+        config: LlavaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        self.vision_tower = None
+        self.config.vision_config.hidden_size = config.mm_hidden_size
+        self.config.text_config.hidden_size = config.hidden_size
+
+        self.multi_modal_projector = LlavaMultiModalProjector(config)
+        self.language_model = LlamaForCausalLM(
+            config,
+            quant_config=quant_config,
+            prefix=add_prefix("language_model", prefix),
+        )
+        if "unpad" in getattr(config, "mm_patch_merge_type", ""):
+            self.language_model.model.image_newline = nn.Parameter(
+                torch.empty(config.text_config.hidden_size, dtype=torch.float16)
+            )
+
+
+class LlavaQwenForCausalLM(LlavaBaseForCausalLM):
+    def __init__(
+        self,
+        config: LlavaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        self.vision_tower = None
+
+        if getattr(self.config, "vision_config", None) is None:
+            self.config.vision_config = CLIPVisionConfig(self.config.mm_vision_tower)
+        if getattr(self.config, "text_config", None) is None:
+            self.config.text_config = Qwen2Config(self.config._name_or_path)
+
+        self.config.vision_config.hidden_size = config.mm_hidden_size
+        self.config.text_config.hidden_size = config.hidden_size
+
+        if getattr(self.config, "projector_hidden_act", None) is None:
+            self.config.projector_hidden_act = "gelu"
+        if getattr(self.config, "image_token_index", None) is None:
+            self.config.image_token_index = 151646
+
+        self.multi_modal_projector = LlavaMultiModalProjector(config)
+        self.language_model = Qwen2ForCausalLM(
+            config,
+            quant_config=quant_config,
+            prefix=add_prefix("language_model", prefix),
+        )
+        if "unpad" in getattr(config, "mm_patch_merge_type", ""):
+            self.language_model.model.image_newline = nn.Parameter(
+                torch.empty(config.text_config.hidden_size, dtype=torch.float16)
+            )
+
+
+class LlavaMistralForCausalLM(LlavaBaseForCausalLM):
+    def __init__(
+        self,
+        config: LlavaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        self.vision_tower = None
+
+        if getattr(self.config, "vision_config", None) is None:
+            self.config.vision_config = CLIPVisionConfig(self.config.mm_vision_tower)
+        if getattr(self.config, "text_config", None) is None:
+            self.config.text_config = MistralConfig(self.config._name_or_path)
+
+        self.config.vision_config.hidden_size = config.mm_hidden_size
+        self.config.text_config.hidden_size = config.hidden_size
+
+        if getattr(self.config, "projector_hidden_act", None) is None:
+            self.config.projector_hidden_act = "gelu"
+        if getattr(self.config, "image_token_index", None) is None:
+            self.config.image_token_index = 32000
+
+        self.multi_modal_projector = LlavaMultiModalProjector(config)
+        self.language_model = MistralForCausalLM(
+            config,
+            quant_config=quant_config,
+            prefix=add_prefix("language_model", prefix),
+        )
+        if "unpad" in getattr(config, "mm_patch_merge_type", ""):
+            self.language_model.model.image_newline = nn.Parameter(
+                torch.empty(config.text_config.hidden_size, dtype=torch.float16)
+            )
+
+
+class LlavaForConditionalGeneration(LlavaBaseForCausalLM):
+    """
+    An adaptor class to enable support for multiple mmlm such as mistral-community/pixtral-12b
+    It follows the structure of (vision_tower, multi_modal_projector, language_model)
+
+    Once a model config is loaded, text_config and vision_config will be extracted, and
+    LlavaForConditionalGeneration will load the language_model and vision_tower models
+    according to config.
+    """
+
+    MULTIMODAL_PROJECTOR_TYPE = LlavaMultiModalProjector
+
+    @property
+    def dtype(self):
+        return self.torch_dtype
+
+    def pad_input_ids(self, input_ids: List[int], image_inputs: MultimodalInputs):
+        if hasattr(self.vision_tower, "pad_input_ids"):
+            return self.vision_tower.pad_input_ids(input_ids, image_inputs)
+        else:
+            return super().pad_input_ids(input_ids, image_inputs)
+
+    def _get_sgl_model_cls(self, config, auto_model_type: Type[AutoModel] = AutoModel):
+        """
+        Get the SGLang model implementation class according to config.
+
+        Args:
+            config: The config object of the model.
+            auto_model_type: The type of the auto model.
+
+        Returns:
+            The SGLang model implementation class.
+        """
+        config_cls_name = config.__class__.__name__
+        arch_name_mapping = self._config_cls_name_to_arch_name_mapping(auto_model_type)
+        if arch := arch_name_mapping.get(config_cls_name):
+            if isinstance(arch, tuple):
+                arch = arch[0]
+                logger.warning(
+                    f"Multiple {auto_model_type.__name__} models found for submodule config `{config_cls_name}`, defaulting to [0]: {arch.__name__}"
+                )
+            try:
+                return sgl_models.registry.ModelRegistry.resolve_model_cls(arch)[0]
+            except Exception as e:
+                raise ValueError(
+                    f"{auto_model_type.__name__} found a corresponding model `{arch}` for config class `{config_cls_name}`, but failed to load it from SGLang ModelRegistry. \n{e}"
+                )
+        else:
+            raise ValueError(
+                f"{auto_model_type.__name__} cannot find a corresponding model for config class `{config_cls_name}`"
+            )
+
+    @lru_cache
+    def _config_cls_name_to_arch_name_mapping(
+        self, auto_model_type: Type[AutoModel]
+    ) -> Dict[str, str]:
+        mapping = {}
+        for config_cls in auto_model_type._model_mapping.keys():
+            archs = auto_model_type._model_mapping.get(config_cls, None)
+            if archs is not None:
+                if isinstance(archs, tuple):
+                    mapping[config_cls.__name__] = tuple(
+                        arch.__name__ for arch in archs
+                    )
+                else:
+                    mapping[config_cls.__name__] = archs.__name__
+        return mapping
+
+    def __init__(
+        self,
+        config: LlavaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        assert hasattr(config, "text_config")
+        assert hasattr(config, "vision_config")
+        self.config = config
+        self.text_config = self.config.text_config
+        self.vision_config = self.config.vision_config
+        self.torch_dtype = getattr(self.config, "torch_dtype")
+
+        if not getattr(self.text_config, "torch_dtype"):
+            self.text_config.torch_dtype = self.torch_dtype
+        if not getattr(self.vision_config, "torch_dtype"):
+            self.vision_config.torch_dtype = self.torch_dtype
+
+        if not hasattr(self.config, "vocab_size"):
+            self.config.vocab_size = self.text_config.vocab_size
+        if not hasattr(self.config, "image_aspect_ratio"):
+            self.config.image_aspect_ratio = "anyres"
+        if not hasattr(self.config, "image_grid_pinpoints"):
+            # from transformers.models.llava_onevision.configuration_llava_onevision import LlavaOnevisionConfig
+            # self.config.image_grid_pinpoints = LlavaOnevisionConfig().image_grid_pinpoints
+            self.config.image_grid_pinpoints = [
+                [96, 96],
+                [224, 224],
+                [384, 384],
+                [512, 512],
+                [768, 768],
+                [1024, 1024],
+            ]
+        if not hasattr(self.config, "mm_patch_merge_type"):
+            self.config.mm_patch_merge_type = "flat"
+        if not hasattr(self.config, "image_token_index"):
+            self.config.image_token_index = 10
+        if not hasattr(self.config, "projector_hidden_act"):
+            self.config.projector_hidden_act = "gelu"
+
+        self.vision_feature_layer = getattr(self.config, "vision_feature_layer", -1)
+        self.vision_feature_select_strategy = getattr(
+            self.config, "vision_feature_select_strategy", "full"
+        )
+        self.image_size = self.vision_config.image_size
+        self.patch_size = self.vision_config.patch_size
+
+        self.mm_patch_merge_type = self.config.mm_patch_merge_type
+        self.image_aspect_ratio = self.config.image_aspect_ratio
+        self.image_grid_pinpoints = self.config.image_grid_pinpoints
+
+        self.image_feature_len = int((self.image_size // self.patch_size) ** 2)
+
+        self.multi_modal_projector = self.MULTIMODAL_PROJECTOR_TYPE(config)
+
+        language_model_cls = self._get_sgl_model_cls(
+            self.text_config, AutoModelForCausalLM
+        )
+        vision_model_cls = self._get_sgl_model_cls(self.vision_config, AutoModel)
+        self.language_model = language_model_cls(
+            self.text_config,
+            quant_config=quant_config,
+            prefix=add_prefix("language_model", prefix),
+        )
+        self.vision_tower = vision_model_cls(
+            self.vision_config,
+            quant_config=quant_config,
+            prefix=add_prefix("vision_tower", prefix),
+        )
+
+        if "unpad" in getattr(self.config, "mm_patch_merge_type", ""):
+            self.language_model.model.image_newline = nn.Parameter(
+                torch.empty(self.text_config.hidden_size, dtype=self.torch_dtype)
+            )
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        """Extract features from image inputs.
+
+        Args:
+            items: List of MultimodalDataItem objects containing image data
+                Note that an item can be either "image" or "multi-images"
+
+        Returns:
+            torch.Tensor: features from image inputs, concatenated
+        """
+        features = []
+        for item in items:
+            # in each item, we assume pixel_values is always batched
+            pixel_values, image_sizes = item.feature, item.image_sizes
+            image_outputs = self.vision_tower(
+                pixel_values, image_sizes, output_hidden_states=True
+            )
+            selected_image_feature = image_outputs.hidden_states[
+                self.vision_feature_layer
+            ]
+
+            if self.vision_feature_select_strategy in ["default", "patch"]:
+                selected_image_feature = selected_image_feature[:, 1:]
+            elif self.vision_feature_select_strategy == "full":
+                selected_image_feature = selected_image_feature
+            else:
+                raise ValueError(
+                    f"Unexpected select feature: {self.vision_feature_select_strategy}"
+                )
+            features.append(
+                self.multi_modal_projector(selected_image_feature.squeeze(0))
+            )
+        ret = torch.cat(features, dim=0)
+        return ret
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        get_embedding: bool = False,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ):
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            get_embedding=get_embedding,
+            language_model=self.language_model,
+            data_embedding_funcs={
+                Modality.IMAGE: self.get_image_feature,
+            },
+            placeholder_tokens=None,  # using mm_item.pad_value
+            positions=positions,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+
+        return hidden_states
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        """Load weights for LlavaForConditionalGeneration.
+
+        Unlike the base class implementation, this one doesn't need to handle
+        weight name remapping as the weights are already properly structured with
+        'language_model' and 'vision_tower' prefixes in the safetensors files.
+        """
+        if (
+            self.vision_feature_select_strategy == "patch"
+            or self.vision_feature_select_strategy == "full"
+        ):
+            pass
+        elif self.vision_feature_select_strategy == "cls_patch":
+            self.image_feature_len += 1
+        else:
+            raise ValueError(
+                f"Unexpected select feature: {self.vision_feature_select_strategy}"
+            )
+
+        # Create dictionaries for direct parameter loading
+        params_dict = dict(self.named_parameters())
+
+        # Load weights directly without remapping
+        for name, loaded_weight in weights:
+            for part in ("language_model", "vision_tower"):
+                if name.startswith(part):
+                    name = name[len(part + ".") :]
+                    getattr(self, part).load_weights([(name, loaded_weight)])
+                    break
+            else:
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = [
+    LlavaLlamaForCausalLM,
+    LlavaQwenForCausalLM,
+    LlavaMistralForCausalLM,
+    LlavaForConditionalGeneration,
+]
diff --git a/sglang/python/sglang/srt/models/llavavid.py b/sglang/python/sglang/srt/models/llavavid.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5d6aa72ba9aa716e4435d0f6fb516191397df1a
--- /dev/null
+++ b/sglang/python/sglang/srt/models/llavavid.py
@@ -0,0 +1,283 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Inference-only LLaVa video model compatible with HuggingFace weights."""
+
+from typing import Iterable, List, Optional, Tuple
+
+import numpy as np
+import torch
+from torch import nn
+from transformers import CLIPVisionModel, LlavaConfig
+from transformers.models.llava.modeling_llava import LlavaMultiModalProjector
+
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.schedule_batch import MultimodalInputs, flatten_nested_list
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.llama import LlamaForCausalLM
+from sglang.srt.utils import add_prefix
+
+
+class LlavaVidForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config: LlavaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.vision_tower = None
+        self.config.vision_config.hidden_size = config.mm_hidden_size
+        self.config.text_config.hidden_size = config.hidden_size
+        self.multi_modal_projector = LlavaMultiModalProjector(config)
+        self.mm_spatial_pool_stride = getattr(self.config, "mm_spatial_pool_stride", 2)
+        self.resampler = nn.AvgPool2d(
+            kernel_size=self.mm_spatial_pool_stride, stride=self.mm_spatial_pool_stride
+        )
+        self.language_model = LlamaForCausalLM(
+            config,
+            quant_config=quant_config,
+            prefix=add_prefix("language_model", prefix),
+        )
+        self.num_frames = getattr(self.config, "num_frames", 16)
+        if "unpad" in getattr(config, "mm_patch_merge_type", ""):
+            self.language_model.model.image_newline = nn.Parameter(
+                torch.empty(config.text_config.hidden_size, dtype=torch.float16)
+            )
+
+    def pad_input_ids(self, input_ids: List[int], image_inputs: MultimodalInputs):
+        pad_values = [item.pad_value for item in image_inputs.mm_items]
+        new_image_feature_len = self.image_feature_len
+
+        pad_ids = pad_values * (
+            (new_image_feature_len + len(pad_values)) // len(pad_values)
+        )
+        offset = input_ids.index(self.config.image_token_index)
+        # old_len + pad_len - 1, because we need to remove image_token_id
+        new_input_ids = (
+            input_ids[:offset]
+            + pad_ids[:new_image_feature_len]
+            + input_ids[offset + 1 :]
+        )
+        image_inputs.image_offsets = [offset]
+        return new_input_ids
+
+    def encode_images(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
+        # NOTE: This is not memory efficient. (output_hidden_states=True) will save all the hidden stated.
+
+        selected_image_feature = image_outputs.hidden_states[self.vision_feature_layer]
+        if self.vision_feature_select_strategy in ["default", "patch"]:
+            selected_image_feature = selected_image_feature[:, 1:]
+        elif self.vision_feature_select_strategy == "full":
+            selected_image_feature = selected_image_feature
+        else:
+            raise ValueError(
+                f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}"
+            )
+
+        height = width = self.num_patches_per_side
+        num_of_frames = selected_image_feature.shape[0]
+        selected_image_feature = selected_image_feature.view(
+            num_of_frames, height, width, -1
+        )
+        selected_image_feature = selected_image_feature.permute(0, 3, 1, 2).contiguous()
+        selected_image_feature = (
+            self.resampler(selected_image_feature)
+            .flatten(2)
+            .transpose(1, 2)
+            .contiguous()
+        )
+
+        image_features = self.multi_modal_projector(selected_image_feature)
+
+        return image_features
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        image_inputs = forward_batch.mm_inputs
+        if forward_batch.forward_mode.is_extend():
+            bs = forward_batch.batch_size
+
+            # Clamp input ids. See llava.py for more details
+            input_ids = input_ids.clamp_(min=0, max=self.config.vocab_size - 1)
+
+            # Embed text inputs
+            input_embeds = self.language_model.model.embed_tokens(input_ids)
+
+            # Whether the requests need vision inputs
+            max_image_offset = []
+            for im in image_inputs:
+                if im and im.image_offsets:
+                    max_image_offset.append(max(im.image_offsets))
+                else:
+                    max_image_offset.append(-1)
+            start_positions = positions[forward_batch.extend_start_loc].cpu().numpy()
+            need_vision = start_positions <= np.array(max_image_offset)
+
+            if need_vision.any():
+                pixel_values = flatten_nested_list(
+                    [
+                        [item.feature for item in image_inputs[i].mm_items]
+                        for i in range(bs)
+                        if need_vision[i]
+                    ]
+                )
+                image_offsets = [
+                    flatten_nested_list(
+                        [item.offsets for item in image_inputs[i].mm_items]
+                    )
+                    for i in range(bs)
+                    if need_vision[i]
+                ]
+
+                ########## Encode Image ########
+
+                if pixel_values[0].ndim == 4:
+                    # llava-hd: BS, num_patch, C=3, H=336, W=336, num_patch obtained from process_images
+                    np.concatenate(pixel_values, axis=0)
+                    # ndim=4
+                    concat_images = torch.tensor(
+                        np.concatenate(pixel_values, axis=0),
+                        device=self.vision_tower.device,
+                    )
+                    # image_features = self.encode_images(concat_images)
+                    # split_sizes = [image.shape[0] for image in pixel_values]
+                    # image_features = torch.split(image_features, split_sizes, dim=0)
+                    image_features = self.encode_images(
+                        concat_images
+                    )  # , prompts)#, image_counts, long_video=long_video)
+                    split_sizes = [image.shape[0] for image in pixel_values]
+                    image_features = torch.split(image_features, split_sizes, dim=0)
+
+                    # hd image_features: BS, num_patch, 576, 4096
+                else:
+                    # normal pixel: BS, C=3, H=336, W=336
+                    pixel_values = torch.tensor(
+                        np.array(pixel_values), device=self.vision_tower.device
+                    )
+                    image_features = self.encode_images(pixel_values)
+                    # image_features: BS, 576, 4096
+
+                new_image_features = []
+                for image_idx, image_feature in enumerate(image_features):
+                    new_image_features.append(image_feature.flatten(0, 1))
+                image_features = new_image_features
+
+                # Fill in the placeholder for the image
+                extend_start_loc_cpu = forward_batch.extend_start_loc.cpu().numpy()
+                prefix_lens_cpu = forward_batch.extend_prefix_lens_cpu
+                pt = 0
+                for i in range(bs):
+                    if not need_vision[i]:
+                        continue
+
+                    start_idx = extend_start_loc_cpu[i]
+                    prefix_len = prefix_lens_cpu[i]
+
+                    # Multiple images
+                    for image_offset in image_offsets[i]:
+                        if image_offset < prefix_len:
+                            continue
+
+                        tmp_image_feature = image_features[pt]
+                        pad_len = tmp_image_feature.shape[0]
+
+                        left_idx = start_idx + (image_offset - prefix_len)
+                        right_idx = start_idx + (image_offset - prefix_len) + pad_len
+                        try:
+                            input_embeds[left_idx:right_idx] = tmp_image_feature
+                        except RuntimeError as e:
+                            print(f"RuntimeError in image encoding: {e}")
+                            print(f"{input_embeds.shape=}, {tmp_image_feature.shape=}")
+                            print(
+                                f"{start_idx=}, {image_offset=}, {prefix_len=}, {pad_len=}"
+                            )
+                        pt += 1
+
+            return self.language_model(
+                input_ids, positions, forward_batch, input_embeds=input_embeds
+            )
+        elif forward_batch.forward_mode.is_decode():
+            return self.language_model(input_ids, positions, forward_batch)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        # Load clip vision model by cfg['mm_vision_tower']:
+        # huggingface_name or path_of_clip_relative_to_llava_model_dir
+        # We put the initialization here instead of __init__ to allow it being reused by other subclasses.
+        vision_path = self.config.mm_vision_tower
+        self.vision_tower = CLIPVisionModel.from_pretrained(
+            vision_path, torch_dtype=torch.float16
+        ).cuda()
+        self.vision_tower.eval()
+
+        self.vision_feature_layer = self.config.mm_vision_select_layer
+        self.vision_feature_select_strategy = self.config.mm_vision_select_feature
+        self.image_size = self.vision_tower.config.image_size
+        self.patch_size = self.vision_tower.config.patch_size
+
+        self.mm_patch_merge_type = getattr(self.config, "mm_patch_merge_type", "flat")
+        self.image_aspect_ratio = getattr(self.config, "image_aspect_ratio", "square")
+        self.image_grid_pinpoints = getattr(self.config, "image_grid_pinpoints", None)
+
+        print(f"target_frames: {self.num_frames}")
+        self.image_feature_len = self.num_frames * int(
+            (self.image_size / self.patch_size / self.mm_spatial_pool_stride) ** 2
+        )
+        if self.vision_feature_select_strategy == "patch":
+            pass
+        elif self.vision_feature_select_strategy == "cls_patch":
+            self.image_feature_len += 1
+        else:
+            raise ValueError(f"Unexpected select feature: {self.select_feature}")
+
+        # load mm_projector
+        projector_weights = {
+            "model.mm_projector.0": "multi_modal_projector.linear_1",
+            "model.mm_projector.2": "multi_modal_projector.linear_2",
+            "model.vision_resampler.mm_projector.0": "multi_modal_projector.linear_1",
+            "model.vision_resampler.mm_projector.2": "multi_modal_projector.linear_2",
+            "model.vision_tower.vision_tower": "vision_tower",
+            # Update the vision tower weights if we find them in the checkpoint (it may be finetuned).
+            "model.image_newline": "language_model.model.image_newline",
+        }
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            # FIXME: why projector weights read two times?
+            if "projector" in name or "vision_tower" in name or "image_newline" in name:
+                for weight_name, param_name in projector_weights.items():
+                    if weight_name in name:
+                        name = name.replace(weight_name, param_name)
+                if name in params_dict:
+                    param = params_dict[name]
+                else:
+                    print(f"Warning: {name} not found in the model")
+                    continue
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            else:
+                self.language_model.load_weights([(name, loaded_weight)])
+
+    @property
+    def num_patches_per_side(self):
+        return self.image_size // self.patch_size
+
+
+EntryClass = LlavaVidForCausalLM
diff --git a/sglang/python/sglang/srt/models/longcat_flash.py b/sglang/python/sglang/srt/models/longcat_flash.py
new file mode 100644
index 0000000000000000000000000000000000000000..c16a937973090ecd4d15dbc535b3c706358d8741
--- /dev/null
+++ b/sglang/python/sglang/srt/models/longcat_flash.py
@@ -0,0 +1,1058 @@
+# Apache License, Version 2.0:
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# MIT License:
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import concurrent.futures
+import logging
+from typing import Iterable, List, Optional, Tuple
+
+import torch
+from torch import nn
+
+from sglang.srt.configs import LongcatFlashConfig
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation
+from sglang.srt.layers import deep_gemm_wrapper
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe.ep_moe.kernels import zero_experts_compute_triton
+from sglang.srt.layers.moe.ep_moe.layer import DeepEPMoE, get_moe_impl_class
+from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+from sglang.srt.layers.moe.topk import StandardTopKOutput, TopK
+from sglang.srt.layers.moe.utils import filter_moe_weight_param_global_expert
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
+from sglang.srt.layers.quantization.fp8_utils import (
+    block_quant_dequant,
+    block_quant_to_tensor_quant,
+    channel_quant_to_tensor_quant,
+    normalize_e4m3fn_to_e4m3fnuz,
+    requant_weight_ue8m0_inplace,
+)
+from sglang.srt.layers.quantization.int8_utils import (
+    block_dequant as int8_block_dequant,
+)
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.utils import (
+    maybe_executor_submit,
+    should_async_load,
+    should_deepgemm_weight_requant_ue8m0,
+)
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.deepseek_v2 import DeepseekV2AttentionMLA
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import (
+    BumpAllocator,
+    add_prefix,
+    bind_or_assign,
+    cpu_has_amx_support,
+    get_bool_env_var,
+    get_device_sm,
+    is_cpu,
+    is_cuda,
+    is_hip,
+    is_npu,
+)
+
+_is_hip = is_hip()
+_is_cuda = is_cuda()
+_is_npu = is_npu()
+_is_fp8_fnuz = is_fp8_fnuz()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
+_device_sm = get_device_sm()
+
+if _is_cuda:
+    from sgl_kernel import awq_dequantize
+elif _is_cpu and _is_cpu_amx_available:
+    pass
+elif _is_hip:
+    from sglang.srt.layers.quantization.awq_triton import (
+        awq_dequantize_triton as awq_dequantize,
+    )
+else:
+    pass
+
+logger = logging.getLogger(__name__)
+
+
+class LongcatFlashMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        reduce_results: bool = False,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(
+        self,
+        x,
+    ):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class LongcatFlashRouter(nn.Module):
+    def __init__(
+        self,
+        config,
+        zero_expert_num=0,
+        rounter_params_dtype=torch.float32,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.n_routed_experts = config.n_routed_experts
+        self.n_routed_experts = self.n_routed_experts + zero_expert_num
+        self.rounter_params_dtype = rounter_params_dtype
+        self.classifier = ReplicatedLinear(
+            config.hidden_size,
+            self.n_routed_experts,
+            bias=config.router_bias,
+            params_dtype=rounter_params_dtype,
+            quant_config=None,
+            prefix=add_prefix("classifier", prefix),
+        )
+        self.e_score_correction_bias = nn.Parameter(
+            torch.zeros((self.n_routed_experts), dtype=rounter_params_dtype)
+        )
+
+    def forward(self, hidden_states):
+        logits, _ = self.classifier(hidden_states.to(self.rounter_params_dtype))
+        return logits
+
+
+class LongcatFlashMoE(nn.Module):
+
+    def __init__(
+        self,
+        config: LongcatFlashConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.layer_id = layer_id
+        self.routed_scaling_factor = config.routed_scaling_factor
+        self.num_experts = config.n_routed_experts
+        self.top_k = config.moe_topk
+        self.zero_expert_num = config.zero_expert_num
+        self.zero_expert_type = config.zero_expert_type
+
+        if config.rounter_params_dtype == "float32":
+            self.rounter_params_dtype = torch.float32
+        else:
+            self.rounter_params_dtype = torch.bfloat16
+
+        self.tp_size = get_tensor_model_parallel_world_size()
+
+        if self.tp_size > config.n_routed_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {config.n_routed_experts}."
+            )
+
+        if config.hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {config.hidden_act}. "
+                "Only silu is supported for now."
+            )
+
+        self.router = LongcatFlashRouter(
+            config=self.config,
+            zero_expert_num=self.zero_expert_num,
+            rounter_params_dtype=self.rounter_params_dtype,
+            prefix=add_prefix("router", prefix),
+        )
+
+        self.topk = TopK(
+            top_k=self.top_k,
+            renormalize=False,
+            use_grouped_topk=False,
+            correction_bias=self.router.e_score_correction_bias.data,
+            layer_id=layer_id,
+        )
+        self.topk.forward = self.topk.forward_native
+
+        self.experts = get_moe_impl_class(quant_config)(
+            num_experts=self.num_experts,
+            top_k=self.top_k,
+            layer_id=self.layer_id,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            quant_config=quant_config,
+            prefix=add_prefix("experts", prefix),
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        num_tokens, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+
+        # router_logits: (num_tokens, n_experts)
+        router_logits = self.router(hidden_states)
+        topk_weights, topk_idx, _ = self.topk(
+            hidden_states,
+            router_logits,
+        )
+        if self.zero_expert_type is not None:
+            zero_expert_result = zero_experts_compute_triton(
+                expert_indices=topk_idx,
+                expert_scales=topk_weights,
+                num_experts=self.num_experts,
+                zero_expert_type=self.zero_expert_type,
+                hidden_states=hidden_states,
+            )
+        topk_output = StandardTopKOutput(topk_weights, topk_idx, _)
+
+        final_hidden_states = self.experts(hidden_states, topk_output)
+        final_hidden_states *= self.routed_scaling_factor
+
+        if self.zero_expert_type is not None and hidden_states.shape[0] > 0:
+            final_hidden_states += zero_expert_result.to(final_hidden_states.device)
+
+        if self.tp_size > 1:
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+
+        return final_hidden_states.view(num_tokens, hidden_dim)
+
+    def get_moe_weights(self):
+        return [
+            x.data
+            for name, x in self.experts.named_parameters()
+            if name not in ["correction_bias"]
+            and filter_moe_weight_param_global_expert(
+                name, x, self.experts.num_local_experts
+            )
+        ]
+
+
+class LongcatFlashDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: LongcatFlashConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.layer_id = layer_id
+        self.alt_stream = alt_stream
+        self.self_attn = nn.ModuleList(
+            [
+                DeepseekV2AttentionMLA(
+                    config=config,
+                    hidden_size=config.hidden_size,
+                    num_heads=config.num_attention_heads,
+                    qk_nope_head_dim=config.qk_nope_head_dim,
+                    qk_rope_head_dim=config.qk_rope_head_dim,
+                    v_head_dim=config.v_head_dim,
+                    q_lora_rank=config.q_lora_rank,
+                    kv_lora_rank=config.kv_lora_rank,
+                    rope_theta=config.rope_theta,
+                    rope_scaling=None,
+                    max_position_embeddings=config.max_position_embeddings,
+                    quant_config=(
+                        None
+                        if "self_attn" in getattr(config, "disable_quant_module", [])
+                        else quant_config
+                    ),
+                    layer_id=layer_id * 2 + i,
+                    reduce_results=False,
+                    prefix=add_prefix(f"self_attn.{i}", prefix),
+                    alt_stream=self.alt_stream,
+                )
+                for i in range(2)
+            ]
+        )
+
+        self.input_layernorm = nn.ModuleList(
+            [RMSNorm(config.hidden_size, eps=config.rms_norm_eps) for i in range(2)]
+        )
+        self.post_attention_layernorm = nn.ModuleList(
+            [RMSNorm(config.hidden_size, eps=config.rms_norm_eps) for i in range(2)]
+        )
+
+        self.mlps = nn.ModuleList(
+            [
+                LongcatFlashMLP(
+                    hidden_size=config.hidden_size,
+                    intermediate_size=config.intermediate_size,
+                    hidden_act=config.hidden_act,
+                    quant_config=(
+                        None
+                        if "mlps" in getattr(config, "disable_quant_module", [])
+                        else quant_config
+                    ),
+                    prefix=add_prefix(f"mlps.{i}", prefix),
+                )
+                for i in range(2)
+            ]
+        )
+
+        self.mlp = LongcatFlashMoE(
+            layer_id=self.layer_id,
+            config=config,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+
+        self.attn_tp_size = get_attention_tp_size()
+        self.attn_tp_rank = get_attention_tp_rank()
+
+        self.mlp_layer_scatter_modes = [
+            LayerScatterModes.init_new(
+                layer_id=self.layer_id * 2 + i,
+                num_layers=config.num_hidden_layers,
+                is_layer_sparse=False,
+                is_previous_layer_sparse=False,
+                # TODO: Check if the following is correct.
+                is_next_layer_sparse=False,
+            )
+            for i in range(2)
+        ]
+        self.mlp_layer_communicator = [
+            LayerCommunicator(
+                layer_scatter_modes=self.mlp_layer_scatter_modes[i],
+                input_layernorm=self.input_layernorm[i],
+                post_attention_layernorm=self.post_attention_layernorm[i],
+                qkv_latent_func=self.self_attn[i].prepare_qkv_latent,
+            )
+            for i in range(2)
+        ]
+
+        self.moe_layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=self.layer_id,
+            num_layers=config.num_hidden_layers,
+            is_layer_sparse=True,
+            is_previous_layer_sparse=True,
+            # TODO: Check if the following is correct.
+            is_next_layer_sparse=True,
+        )
+        self.moe_layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.moe_layer_scatter_modes,
+            input_layernorm=self.input_layernorm[0],
+            post_attention_layernorm=self.post_attention_layernorm[0],
+            qkv_latent_func=self.self_attn[0].prepare_qkv_latent,
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+        zero_allocator: BumpAllocator,
+    ) -> torch.Tensor:
+        # first_attn
+        hidden_states, residual = self.moe_layer_communicator.prepare_attn(
+            hidden_states, residual, forward_batch
+        )
+        if hidden_states.shape[0] != 0:
+            hidden_states = self.self_attn[0](
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+                zero_allocator=zero_allocator,
+            )
+
+        # moe
+        hidden_states, residual = self.moe_layer_communicator.prepare_mlp(
+            hidden_states, residual, forward_batch
+        )
+        moe_hidden_states = hidden_states.clone()
+        moe_residual = residual.clone()
+        moe_hidden_states = self.mlp(moe_hidden_states)
+        moe_hidden_states, moe_residual = self.moe_layer_communicator.postprocess_layer(
+            moe_hidden_states, moe_residual, forward_batch
+        )
+
+        hidden_states, residual = self.forward_mlp(
+            hidden_states, positions, residual, forward_batch, zero_allocator
+        )
+
+        hidden_states = moe_hidden_states + hidden_states
+        return hidden_states, residual
+
+    def forward_mlp(
+        self, hidden_states, positions, residual, forward_batch, zero_allocator
+    ):
+        # first_mlp
+        hidden_states = self.mlps[0](hidden_states)
+        # TP all_reduce
+        hidden_states = tensor_model_parallel_all_reduce(hidden_states)
+
+        # second_attn
+        hidden_states, residual = self.mlp_layer_communicator[1].prepare_attn(
+            hidden_states, residual, forward_batch
+        )
+        if hidden_states.shape[0] != 0:
+            hidden_states = self.self_attn[1](
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+                zero_allocator=zero_allocator,
+            )
+
+        # second_mlp
+        hidden_states, residual = self.mlp_layer_communicator[1].prepare_mlp(
+            hidden_states, residual, forward_batch
+        )
+        hidden_states = self.mlps[1](hidden_states)
+        # TP all_reduce
+        hidden_states = tensor_model_parallel_all_reduce(hidden_states)
+
+        hidden_states, residual = self.mlp_layer_communicator[1].postprocess_layer(
+            hidden_states, residual, forward_batch
+        )
+
+        return hidden_states, residual
+
+
+class LongcatFlashModel(nn.Module):
+    fall_back_to_pt_during_load = False
+
+    def __init__(
+        self,
+        config: LongcatFlashConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            use_attn_tp_group=is_dp_attention_enabled(),
+        )
+
+        self.alt_stream = torch.cuda.Stream()
+        self.layers = nn.ModuleList(
+            [
+                LongcatFlashDecoderLayer(
+                    config,
+                    layer_id,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{layer_id}", prefix),
+                    alt_stream=self.alt_stream,
+                )
+                for layer_id in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.layers_to_capture = []
+
+    def get_input_embeddings(self) -> torch.Tensor:
+        return self.embed_tokens
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        total_num_layers = len(self.layers)
+        device = input_embeds.device if input_embeds is not None else input_ids.device
+        zero_allocator = BumpAllocator(
+            buffer_size=total_num_layers * 2 * (2 if forward_batch.can_run_tbo else 1),
+            dtype=torch.float32,
+            device=device,
+        )
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+
+        residual = None
+
+        aux_hidden_states = []
+        for i in range(total_num_layers):
+            if i in self.layers_to_capture:
+                aux_hidden_states.append(hidden_states + residual)
+            with get_global_expert_distribution_recorder().with_current_layer(i):
+                layer = self.layers[i]
+                hidden_states, residual = layer(
+                    positions, hidden_states, forward_batch, residual, zero_allocator
+                )
+
+        if hidden_states.shape[0] != 0:
+            if residual is None:
+                hidden_states = self.norm(hidden_states)
+            else:
+                hidden_states, _ = self.norm(hidden_states, residual)
+
+        if len(aux_hidden_states) == 0:
+            return hidden_states
+
+        return hidden_states, aux_hidden_states
+
+
+class LongcatFlashForCausalLM(nn.Module):
+    # for quark model load
+    packed_modules_mapping = {}
+
+    def __init__(
+        self,
+        config: LongcatFlashConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        # for quark model load
+        # Fuse q_a_proj and kv_a_proj_with_mqa along output dimension when q_lora_rank is not None
+        self.fuse_qkv_a_proj = (
+            hasattr(config, "q_lora_rank") and config.q_lora_rank is not None
+        )
+        if self.fuse_qkv_a_proj:
+            self.packed_modules_mapping["fused_qkv_a_proj_with_mqa"] = [
+                "q_a_proj",
+                "kv_a_proj_with_mqa",
+            ]
+
+        self.config = config
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.quant_config = quant_config
+        self.model = LongcatFlashModel(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("lm_head", prefix),
+            use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
+        )
+        self.logits_processor = LogitsProcessor(config)
+        self.capture_aux_hidden_states = False
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.embed_tokens
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+
+        aux_hidden_states = None
+        if self.capture_aux_hidden_states:
+            hidden_states, aux_hidden_states = hidden_states
+
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch, aux_hidden_states
+        )
+
+    def post_load_weights(self, weight_names=None):
+
+        # Perform post-processing after loading weights
+        if weight_names is None:
+            layer_ids = range(self.config.num_hidden_layers)
+        else:
+            layer_ids = set()
+            for name in weight_names:
+                if "kv_b_proj" in name:
+                    layer_id = int(name.split(".")[2])
+                    if layer_id < self.config.num_hidden_layers:
+                        layer_ids.add(layer_id)
+
+        for layer_id in layer_ids:
+            for i in range(2):
+                self_attn = self.model.layers[layer_id].self_attn[i]
+                if hasattr(self_attn.kv_b_proj, "qweight"):
+                    # AWQ compatible
+                    if _is_cuda or _is_hip:
+                        w = awq_dequantize(
+                            self_attn.kv_b_proj.qweight,
+                            self_attn.kv_b_proj.scales,
+                            self_attn.kv_b_proj.qzeros,
+                        ).T
+                    else:
+                        w = awq_dequantize(
+                            self_attn.kv_b_proj.qweight,
+                            self_attn.kv_b_proj.scales,
+                            self_attn.kv_b_proj.qzeros,
+                            0,
+                            0,
+                            0,
+                        ).T
+                else:
+                    w = self_attn.kv_b_proj.weight
+                use_deep_gemm_bmm = False
+
+                if w.dtype in (
+                    torch.float8_e4m3fn,
+                    torch.float8_e4m3fnuz,
+                ):
+                    if (
+                        hasattr(self.quant_config, "weight_block_size")
+                        and self.quant_config.weight_block_size is not None
+                    ):
+                        weight_block_size = self.quant_config.weight_block_size
+                        assert hasattr(self_attn.kv_b_proj, "weight_scale_inv")
+                        if _is_fp8_fnuz:
+                            weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                                weight=w,
+                                weight_scale=self_attn.kv_b_proj.weight_scale_inv,
+                                input_scale=None,
+                            )
+                        else:
+                            weight = w
+                            weight_scale = self_attn.kv_b_proj.weight_scale_inv
+
+                        if (
+                            _is_cuda
+                            and weight_block_size[0] == 128
+                            and weight_block_size[1] == 128
+                        ):
+                            if (
+                                deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM
+                                and not deep_gemm_wrapper.DEEPGEMM_BLACKWELL
+                                and get_bool_env_var("SGL_USE_DEEPGEMM_BMM", "false")
+                            ):
+                                block_scale = weight_scale
+                                use_deep_gemm_bmm = True
+                            else:
+                                w = block_quant_dequant(
+                                    weight,
+                                    weight_scale,
+                                    weight_block_size,
+                                    torch.bfloat16,
+                                )
+                        else:
+                            w, scale = block_quant_to_tensor_quant(
+                                weight, weight_scale, weight_block_size
+                            )
+                            self_attn.w_scale = scale
+                    else:
+                        if _is_fp8_fnuz:
+                            weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                                weight=w,
+                                weight_scale=self_attn.kv_b_proj.weight_scale,
+                                input_scale=None,
+                            )
+                        else:
+                            weight = w
+                            weight_scale = self_attn.kv_b_proj.weight_scale
+
+                        w, scale = channel_quant_to_tensor_quant(weight, weight_scale)
+                        self_attn.w_scale = scale
+
+                if w.dtype == torch.int8:
+                    if hasattr(self.quant_config, "weight_block_size"):
+                        # block-wise int8 need it
+                        weight_block_size = self.quant_config.weight_block_size
+                        if weight_block_size is not None:
+                            assert hasattr(self_attn.kv_b_proj, "weight_scale_inv")
+                            weight = w
+                            weight_scale = self_attn.kv_b_proj.weight_scale_inv
+                            w = int8_block_dequant(
+                                weight, weight_scale, weight_block_size
+                            ).to(torch.bfloat16)
+                    else:
+                        # channel-wise int8 need it
+                        w = w.to(torch.bfloat16) * self_attn.kv_b_proj.weight_scale.to(
+                            torch.bfloat16
+                        )
+
+                w_kc, w_vc = w.unflatten(
+                    0, (-1, self_attn.qk_nope_head_dim + self_attn.v_head_dim)
+                ).split([self_attn.qk_nope_head_dim, self_attn.v_head_dim], dim=1)
+                if not use_deep_gemm_bmm:
+                    self_attn.w_kc = bind_or_assign(
+                        self_attn.w_kc,
+                        w_kc.transpose(1, 2).contiguous().transpose(1, 2),
+                    )
+                    self_attn.w_vc = bind_or_assign(
+                        self_attn.w_vc, w_vc.contiguous().transpose(1, 2)
+                    )
+                    if (
+                        hasattr(self_attn.kv_b_proj, "weight_scale")
+                        and self_attn.w_scale is None
+                    ):
+                        self_attn.w_scale = bind_or_assign(
+                            self_attn.w_scale, self_attn.kv_b_proj.weight_scale
+                        )
+                        if _is_hip:
+                            self_attn.w_scale *= 2.0
+                    # TODO: remove this after adding FP8 support in bmm cpu kernel
+                    if (
+                        _is_cpu
+                        and _is_cpu_amx_available
+                        and w.dtype == torch.float8_e4m3fn
+                    ):
+                        self_attn.w_kc = (
+                            self_attn.w_kc.to(torch.bfloat16) * self_attn.w_scale
+                        )
+                        self_attn.w_vc = (
+                            self_attn.w_vc.to(torch.bfloat16) * self_attn.w_scale
+                        )
+                else:
+                    num_tiles_k = self_attn.qk_nope_head_dim // weight_block_size[1]
+                    num_tiles_n = self_attn.v_head_dim // weight_block_size[0]
+                    ws_kc, ws_vc = block_scale.unflatten(
+                        0, (-1, (num_tiles_k + num_tiles_n))
+                    ).split([num_tiles_k, num_tiles_n], dim=1)
+                    self_attn.w_scale_k = bind_or_assign(
+                        self_attn.w_scale_k, ws_kc.transpose(1, 2).contiguous()
+                    )
+                    self_attn.w_scale_v = bind_or_assign(
+                        self_attn.w_scale_v, ws_vc.contiguous()
+                    )
+                    self_attn.w_kc = bind_or_assign(
+                        self_attn.w_kc, w_kc.transpose(1, 2).contiguous()
+                    )
+                    self_attn.w_vc = bind_or_assign(self_attn.w_vc, w_vc.contiguous())
+                    self_attn.use_deep_gemm_bmm = True
+
+                if self.config.mla_scale_q_lora:
+                    self_attn.q_a_layernorm.weight.data *= (
+                        self.config.hidden_size / self.config.q_lora_rank
+                    ) ** 0.5
+                if self.config.mla_scale_kv_lora:
+                    self_attn.kv_a_layernorm.weight.data *= (
+                        self.config.hidden_size / self.config.kv_lora_rank
+                    ) ** 0.5
+
+        # TODO(linguoyuan) EPMoE not support DEEPGEMM_BLACKWELL, DeepEP needs to be supported in the future
+        deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0 = False
+
+        if should_deepgemm_weight_requant_ue8m0(
+            weight_block_size=getattr(self.quant_config, "weight_block_size", None)
+        ):
+            self._weight_requant_ue8m0()
+
+    def _weight_requant_ue8m0(self):
+        weight_block_size = self.quant_config.weight_block_size
+
+        for layer_id in range(self.config.num_hidden_layers):
+            layer = self.model.layers[layer_id]
+            for i in range(2):
+                self_attn = layer.self_attn[i]
+                module_list = [
+                    self_attn.kv_b_proj,
+                    self_attn.o_proj,
+                ]
+
+                if self.config.q_lora_rank is not None:
+                    module_list.append(self_attn.fused_qkv_a_proj_with_mqa)
+                    module_list.append(self_attn.q_b_proj)
+                else:
+                    module_list.append(self_attn.kv_a_proj_with_mqa)
+                    module_list.append(self_attn.q_proj)
+
+                for module in module_list:
+                    if hasattr(module, "weight_scale_inv"):
+                        requant_weight_ue8m0_inplace(
+                            module.weight, module.weight_scale_inv, weight_block_size
+                        )
+
+                mlp = layer.mlps[i]
+                assert isinstance(mlp, LongcatFlashMLP)
+                for module in [
+                    mlp.gate_up_proj,
+                    mlp.down_proj,
+                ]:
+                    if hasattr(module, "weight_scale_inv"):
+                        requant_weight_ue8m0_inplace(
+                            module.weight, module.weight_scale_inv, weight_block_size
+                        )
+
+        for layer_id in range(self.config.num_hidden_layers):
+            experts = layer.mlp.experts
+            if isinstance(experts, DeepEPMoE):
+                for w in [
+                    (experts.w13_weight, experts.w13_weight_scale_inv),
+                    (experts.w2_weight, experts.w2_weight_scale_inv),
+                ]:
+                    requant_weight_ue8m0_inplace(w[0], w[1], weight_block_size)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.n_routed_experts,
+        )
+
+        # Fuse q_a_proj and kv_a_proj_with_mqa along output dimension when q_lora_rank is not None
+        fuse_qkv_a_proj = hasattr(self.config, "q_lora_rank") and (
+            self.config.q_lora_rank is not None
+        )
+        cached_a_proj = {} if fuse_qkv_a_proj else None
+
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            futures = []
+            params_dict = dict(self.named_parameters())
+            weight_names = []
+            for name, loaded_weight in weights:
+                use_async_loading = should_async_load(loaded_weight)
+                if "mtp" in name:
+                    continue
+                weight_names.append(name)
+                if "rotary_emb.inv_freq" in name:
+                    continue
+                for param_name, weight_name, shard_id in stacked_params_mapping:
+                    # Skip non-stacked layers and experts (experts handled below).
+                    if weight_name not in name:
+                        continue
+                    # We have mlp.experts[0].gate_proj in the checkpoint.
+                    # Since we handle the experts below in expert_params_mapping,
+                    # we need to skip here BEFORE we update the name, otherwise
+                    # name will be updated to mlp.experts[0].gate_up_proj, which
+                    # will then be updated below in expert_params_mapping
+                    # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                    if ("mlp.experts." in name) and name not in params_dict:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    maybe_executor_submit(
+                        executor=executor,
+                        futures=futures,
+                        use_async=use_async_loading,
+                        func=weight_loader,
+                        func_args=(param, loaded_weight, shard_id),
+                    )
+                    break
+                else:
+                    for mapping in expert_params_mapping:
+                        param_name, weight_name, expert_id, shard_id = mapping
+                        if weight_name not in name:
+                            continue
+                        name = name.replace(weight_name, param_name)
+                        param = params_dict[name]
+                        weight_loader = param.weight_loader
+                        maybe_executor_submit(
+                            executor=executor,
+                            futures=futures,
+                            use_async=use_async_loading,
+                            func=weight_loader,
+                            func_args=(param, loaded_weight, name),
+                            func_kwargs={
+                                "shard_id": shard_id,
+                                "expert_id": expert_id,
+                            },
+                        )
+                        break
+                    else:
+                        # Skip loading extra bias for GPTQ models.
+                        if name.endswith(".bias") and name not in params_dict:
+                            continue
+                        if fuse_qkv_a_proj and (
+                            "q_a_proj" in name or "kv_a_proj_with_mqa" in name
+                        ):
+                            cached_a_proj[name] = loaded_weight
+                            q_a_proj_name = (
+                                name
+                                if "q_a_proj" in name
+                                else name.replace("kv_a_proj_with_mqa", "q_a_proj")
+                            )
+                            kv_a_proj_name = (
+                                name
+                                if "kv_a_proj_with_mqa" in name
+                                else name.replace("q_a_proj", "kv_a_proj_with_mqa")
+                            )
+
+                            # When both q_a_proj and kv_a_proj_with_mqa has been cached, load the fused weight to parameter
+                            if (
+                                q_a_proj_name in cached_a_proj
+                                and kv_a_proj_name in cached_a_proj
+                            ):
+                                q_a_proj_weight = cached_a_proj[q_a_proj_name]
+                                kv_a_proj_weight = cached_a_proj[kv_a_proj_name]
+                                cat_dim = 0
+                                if self.quant_config is not None and (
+                                    self.quant_config.get_name() == "awq"
+                                    or self.quant_config.get_name() == "awq_marlin"
+                                    or self.quant_config.get_name() == "moe_wna16"
+                                ):
+                                    cat_dim = 1
+                                fused_weight = torch.cat(
+                                    [q_a_proj_weight, kv_a_proj_weight], dim=cat_dim
+                                )
+                                param_name = (
+                                    name.replace(
+                                        "q_a_proj", "fused_qkv_a_proj_with_mqa"
+                                    )
+                                    if "q_a_proj" in name
+                                    else name.replace(
+                                        "kv_a_proj_with_mqa",
+                                        "fused_qkv_a_proj_with_mqa",
+                                    )
+                                )
+                                param = params_dict[param_name]
+
+                                weight_loader = getattr(
+                                    param, "weight_loader", default_weight_loader
+                                )
+                                maybe_executor_submit(
+                                    executor=executor,
+                                    futures=futures,
+                                    use_async=use_async_loading,
+                                    func=weight_loader,
+                                    func_args=(param, fused_weight),
+                                )
+                                cached_a_proj.pop(q_a_proj_name)
+                                cached_a_proj.pop(kv_a_proj_name)
+                        else:
+                            if (
+                                "k_scale" in name or "v_scale" in name
+                            ) and name not in params_dict:
+                                # modelopt attn kv scale is named differently
+                                for scale in ["k_scale", "v_scale"]:
+                                    if scale in name:
+                                        name = name.replace(
+                                            f"{scale[0]}_proj", "attn_mqa"
+                                        )
+                                        break
+                            if name not in params_dict:
+                                # modelopt ckpt contains not needed weights for MTP module:
+                                # model.decoder.self_attn.attn_mqa.v_scale and
+                                # model.decoder.self_attn.attn_mqa.k_scale
+                                logger.warning(f"{name} not found in params_dict.")
+                                continue
+                            param = params_dict[name]
+                            weight_loader = getattr(
+                                param, "weight_loader", default_weight_loader
+                            )
+                            maybe_executor_submit(
+                                executor=executor,
+                                futures=futures,
+                                use_async=use_async_loading,
+                                func=weight_loader,
+                                func_args=(param, loaded_weight),
+                            )
+
+            # Wait for all tasks to complete and raise any exceptions.
+            for future in concurrent.futures.as_completed(futures):
+                future.result()
+
+        self.post_load_weights(weight_names=weight_names)
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        del self.lm_head.weight
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    @classmethod
+    def get_model_config_for_expert_location(cls, config):
+        return ModelConfigForExpertLocation(
+            num_layers=config.num_hidden_layers,
+            num_logical_experts=config.n_routed_experts,
+        )
+
+    def set_eagle3_layers_to_capture(self, layer_ids: Optional[List[int]] = None):
+        if layer_ids is None:
+            self.capture_aux_hidden_states = True
+            num_layers = self.config.num_hidden_layers
+            self.model.layers_to_capture = [2, num_layers // 2, num_layers - 3]
+        else:
+            self.capture_aux_hidden_states = True
+            self.model.layers_to_capture = [val + 1 for val in layer_ids]
+
+
+EntryClass = [LongcatFlashForCausalLM]
diff --git a/sglang/python/sglang/srt/models/longcat_flash_nextn.py b/sglang/python/sglang/srt/models/longcat_flash_nextn.py
new file mode 100644
index 0000000000000000000000000000000000000000..12c9cb13fae9dc604f717de17255d4941c48b98f
--- /dev/null
+++ b/sglang/python/sglang/srt/models/longcat_flash_nextn.py
@@ -0,0 +1,687 @@
+# Apache License, Version 2.0:
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# MIT License:
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import concurrent.futures
+import logging
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+
+from sglang.srt.configs import LongcatFlashConfig
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.layers import deep_gemm_wrapper
+from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import ReplicatedLinear
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.quantization.fp8_kernel import is_fp8_fnuz
+from sglang.srt.layers.quantization.fp8_utils import (
+    block_quant_dequant,
+    block_quant_to_tensor_quant,
+    channel_quant_to_tensor_quant,
+    normalize_e4m3fn_to_e4m3fnuz,
+    requant_weight_ue8m0_inplace,
+)
+from sglang.srt.layers.quantization.int8_utils import (
+    block_dequant as int8_block_dequant,
+)
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.utils import should_deepgemm_weight_requant_ue8m0
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.deepseek_v2 import DeepseekV2AttentionMLA
+from sglang.srt.models.longcat_flash import LongcatFlashForCausalLM, LongcatFlashMLP
+from sglang.srt.utils import (
+    BumpAllocator,
+    add_prefix,
+    bind_or_assign,
+    cpu_has_amx_support,
+    get_bool_env_var,
+    get_device_sm,
+    is_cpu,
+    is_cuda,
+    is_hip,
+    is_npu,
+)
+
+_is_hip = is_hip()
+_is_cuda = is_cuda()
+_is_npu = is_npu()
+_is_fp8_fnuz = is_fp8_fnuz()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
+_device_sm = get_device_sm()
+
+if _is_cuda:
+    from sgl_kernel import awq_dequantize
+elif _is_cpu and _is_cpu_amx_available:
+    pass
+elif _is_hip:
+    from sglang.srt.layers.quantization.awq_triton import (
+        awq_dequantize_triton as awq_dequantize,
+    )
+else:
+    pass
+
+
+logger = logging.getLogger(__name__)
+
+
+class LongcatFlashDenseDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: LongcatFlashConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.layer_id = layer_id
+        self.alt_stream = alt_stream
+
+        self.self_attn = DeepseekV2AttentionMLA(
+            config=config,
+            hidden_size=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            qk_nope_head_dim=config.qk_nope_head_dim,
+            qk_rope_head_dim=config.qk_rope_head_dim,
+            v_head_dim=config.v_head_dim,
+            q_lora_rank=config.q_lora_rank,
+            kv_lora_rank=config.kv_lora_rank,
+            rope_theta=config.rope_theta,
+            rope_scaling=None,
+            max_position_embeddings=config.max_position_embeddings,
+            quant_config=quant_config,
+            layer_id=layer_id,
+            reduce_results=False,
+            prefix=add_prefix(f"self_attn", prefix),
+            alt_stream=self.alt_stream,
+        )
+
+        self.mlp = LongcatFlashMLP(
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=add_prefix(f"mlps", prefix),
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+        self.attn_tp_size = get_attention_tp_size()
+        self.attn_tp_rank = get_attention_tp_rank()
+        self.layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=self.layer_id,
+            num_layers=config.num_hidden_layers,
+            is_layer_sparse=False,
+            is_previous_layer_sparse=False,
+            is_next_layer_sparse=False,
+        )
+        self.layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.layer_scatter_modes,
+            input_layernorm=self.input_layernorm,
+            post_attention_layernorm=self.post_attention_layernorm,
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+        zero_allocator: BumpAllocator,
+    ) -> torch.Tensor:
+
+        hidden_states, residual = self.layer_communicator.prepare_attn(
+            hidden_states, residual, forward_batch
+        )
+        if hidden_states.shape[0] != 0:
+            hidden_states = self.self_attn(
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+                zero_allocator=zero_allocator,
+            )
+
+        hidden_states, residual = self.layer_communicator.prepare_mlp(
+            hidden_states, residual, forward_batch
+        )
+        hidden_states = self.mlp(hidden_states)
+        hidden_states, residual = self.layer_communicator.postprocess_layer(
+            hidden_states, residual, forward_batch
+        )
+        return hidden_states, residual
+
+
+class LongcatFlashModelNextN(nn.Module):
+    def __init__(
+        self,
+        config: LongcatFlashConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.vocab_size = config.vocab_size
+        self.alt_stream = torch.cuda.Stream()
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            use_attn_tp_group=is_dp_attention_enabled(),
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+
+        self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.hnorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        self.eh_proj = ReplicatedLinear(
+            2 * config.hidden_size,
+            config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("eh_proj", ""),
+        )
+        self.decoder = LongcatFlashDenseDecoderLayer(
+            config, 0, quant_config=quant_config, alt_stream=self.alt_stream
+        )
+
+        self.final_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def get_input_embeddings(self) -> torch.Tensor:
+        return self.embed_tokens
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        total_num_layers = 1
+        device = input_embeds.device if input_embeds is not None else input_ids.device
+        zero_allocator = BumpAllocator(
+            buffer_size=total_num_layers * 2 * (2 if forward_batch.can_run_tbo else 1),
+            dtype=torch.float32,
+            device=device,
+        )
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+
+        if hidden_states.shape[0] > 0:
+            hidden_states, _ = self.eh_proj(
+                torch.cat(
+                    (
+                        self.enorm(hidden_states),
+                        self.hnorm(forward_batch.spec_info.hidden_states),
+                    ),
+                    dim=-1,
+                )
+            )
+
+        residual = None
+        with get_global_expert_distribution_recorder().disable_this_region():
+            hidden_states, residual = self.decoder(
+                positions, hidden_states, forward_batch, residual, zero_allocator
+            )
+
+        if not forward_batch.forward_mode.is_idle():
+            if residual is not None:
+                hidden_states, _ = self.final_layernorm(hidden_states, residual)
+            else:
+                hidden_states = self.final_layernorm(hidden_states)
+        return hidden_states
+
+
+class LongcatFlashForCausalLMNextN(LongcatFlashForCausalLM):
+
+    def __init__(
+        self,
+        config: LongcatFlashConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        nn.Module.__init__(self)
+        self.config = config
+        self.quant_config = (
+            None
+            if "mtp" in getattr(config, "disable_quant_module", [])
+            else quant_config
+        )
+        self.model = LongcatFlashModelNextN(config, self.quant_config)
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=self.quant_config,
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def post_load_weights(self):
+        self_attn = self.model.decoder.self_attn
+        if hasattr(self_attn.kv_b_proj, "qweight"):
+            # AWQ compatible
+            if _is_cuda or _is_hip:
+                w = awq_dequantize(
+                    self_attn.kv_b_proj.qweight,
+                    self_attn.kv_b_proj.scales,
+                    self_attn.kv_b_proj.qzeros,
+                ).T
+            else:
+                w = awq_dequantize(
+                    self_attn.kv_b_proj.qweight,
+                    self_attn.kv_b_proj.scales,
+                    self_attn.kv_b_proj.qzeros,
+                    0,
+                    0,
+                    0,
+                ).T
+        else:
+            w = self_attn.kv_b_proj.weight
+        use_deep_gemm_bmm = False
+        if w.dtype in (
+            torch.float8_e4m3fn,
+            torch.float8_e4m3fnuz,
+        ):
+            if (
+                hasattr(self.quant_config, "weight_block_size")
+                and self.quant_config.weight_block_size is not None
+            ):
+                weight_block_size = self.quant_config.weight_block_size
+                assert hasattr(self_attn.kv_b_proj, "weight_scale_inv")
+                if _is_fp8_fnuz:
+                    weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                        weight=w,
+                        weight_scale=self_attn.kv_b_proj.weight_scale_inv,
+                        input_scale=None,
+                    )
+                else:
+                    weight = w
+                    weight_scale = self_attn.kv_b_proj.weight_scale_inv
+                if (
+                    _is_cuda
+                    and weight_block_size[0] == 128
+                    and weight_block_size[1] == 128
+                ):
+                    if (
+                        deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM
+                        and not deep_gemm_wrapper.DEEPGEMM_BLACKWELL
+                        and get_bool_env_var("SGL_USE_DEEPGEMM_BMM", "false")
+                    ):
+                        block_scale = weight_scale
+                        use_deep_gemm_bmm = True
+                    else:
+                        w = block_quant_dequant(
+                            weight,
+                            weight_scale,
+                            weight_block_size,
+                            torch.bfloat16,
+                        )
+                else:
+                    w, scale = block_quant_to_tensor_quant(
+                        weight, weight_scale, weight_block_size
+                    )
+                    self_attn.w_scale = scale
+            else:
+                if _is_fp8_fnuz:
+                    weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                        weight=w,
+                        weight_scale=self_attn.kv_b_proj.weight_scale,
+                        input_scale=None,
+                    )
+                else:
+                    weight = w
+                    weight_scale = self_attn.kv_b_proj.weight_scale
+                w, scale = channel_quant_to_tensor_quant(weight, weight_scale)
+                self_attn.w_scale = scale
+        if w.dtype == torch.int8:
+            if hasattr(self.quant_config, "weight_block_size"):
+                # block-wise int8 need it
+                weight_block_size = self.quant_config.weight_block_size
+                if weight_block_size is not None:
+                    assert hasattr(self_attn.kv_b_proj, "weight_scale_inv")
+                    weight = w
+                    weight_scale = self_attn.kv_b_proj.weight_scale_inv
+                    w = int8_block_dequant(weight, weight_scale, weight_block_size).to(
+                        torch.bfloat16
+                    )
+            else:
+                # channel-wise int8 need it
+                w = w.to(torch.bfloat16) * self_attn.kv_b_proj.weight_scale.to(
+                    torch.bfloat16
+                )
+        w_kc, w_vc = w.unflatten(
+            0, (-1, self_attn.qk_nope_head_dim + self_attn.v_head_dim)
+        ).split([self_attn.qk_nope_head_dim, self_attn.v_head_dim], dim=1)
+        if not use_deep_gemm_bmm:
+            self_attn.w_kc = bind_or_assign(
+                self_attn.w_kc, w_kc.transpose(1, 2).contiguous().transpose(1, 2)
+            )
+            self_attn.w_vc = bind_or_assign(
+                self_attn.w_vc, w_vc.contiguous().transpose(1, 2)
+            )
+            if (
+                hasattr(self_attn.kv_b_proj, "weight_scale")
+                and self_attn.w_scale is None
+            ):
+                self_attn.w_scale = bind_or_assign(
+                    self_attn.w_scale, self_attn.kv_b_proj.weight_scale
+                )
+                if _is_hip:
+                    self_attn.w_scale *= 2.0
+            # TODO: remove this after adding FP8 support in bmm cpu kernel
+            if _is_cpu and _is_cpu_amx_available and w.dtype == torch.float8_e4m3fn:
+                self_attn.w_kc = self_attn.w_kc.to(torch.bfloat16) * self_attn.w_scale
+                self_attn.w_vc = self_attn.w_vc.to(torch.bfloat16) * self_attn.w_scale
+        else:
+            num_tiles_k = self_attn.qk_nope_head_dim // weight_block_size[1]
+            num_tiles_n = self_attn.v_head_dim // weight_block_size[0]
+            ws_kc, ws_vc = block_scale.unflatten(
+                0, (-1, (num_tiles_k + num_tiles_n))
+            ).split([num_tiles_k, num_tiles_n], dim=1)
+            self_attn.w_scale_k = bind_or_assign(
+                self_attn.w_scale_k, ws_kc.transpose(1, 2).contiguous()
+            )
+            self_attn.w_scale_v = bind_or_assign(
+                self_attn.w_scale_v, ws_vc.contiguous()
+            )
+            self_attn.w_kc = bind_or_assign(
+                self_attn.w_kc, w_kc.transpose(1, 2).contiguous()
+            )
+            self_attn.w_vc = bind_or_assign(self_attn.w_vc, w_vc.contiguous())
+            self_attn.use_deep_gemm_bmm = True
+
+        if self.config.mla_scale_q_lora:
+            self_attn.q_a_layernorm.weight.data *= (
+                self.config.hidden_size / self.config.q_lora_rank
+            ) ** 0.5
+        if self.config.mla_scale_kv_lora:
+            self_attn.kv_a_layernorm.weight.data *= (
+                self.config.hidden_size / self.config.kv_lora_rank
+            ) ** 0.5
+
+        if should_deepgemm_weight_requant_ue8m0(
+            weight_block_size=getattr(self.quant_config, "weight_block_size", None)
+        ):
+            self._weight_requant_ue8m0()
+
+    def _weight_requant_ue8m0(self):
+        weight_block_size = self.quant_config.weight_block_size
+        layer = self.model.decoder
+        self_attn = layer.self_attn
+        module_list = [
+            self_attn.kv_b_proj,
+            self_attn.o_proj,
+        ]
+
+        if self.config.q_lora_rank is not None:
+            module_list.append(self_attn.fused_qkv_a_proj_with_mqa)
+            module_list.append(self_attn.q_b_proj)
+        else:
+            module_list.append(self_attn.kv_a_proj_with_mqa)
+            module_list.append(self_attn.q_proj)
+
+        for module in module_list:
+            if hasattr(module, "weight_scale_inv"):
+                requant_weight_ue8m0_inplace(
+                    module.weight, module.weight_scale_inv, weight_block_size
+                )
+
+        mlp = layer.mlps
+        assert isinstance(mlp, LongcatFlashMLP)
+        for module in [
+            mlp.gate_up_proj,
+            mlp.down_proj,
+        ]:
+            if hasattr(module, "weight_scale_inv"):
+                requant_weight_ue8m0_inplace(
+                    module.weight, module.weight_scale_inv, weight_block_size
+                )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        # Fuse q_a_proj and kv_a_proj_with_mqa along output dimension when q_lora_rank is not None
+        fuse_qkv_a_proj = hasattr(self.config, "q_lora_rank") and (
+            self.config.q_lora_rank is not None
+        )
+        cached_a_proj = {} if fuse_qkv_a_proj else None
+
+        nextn_layer_prefix = "model.layers.0"
+        nextn_spec_weight_names = [
+            "shared_head.norm",
+            "eh_proj",
+            "enorm",
+            "hnorm",
+            "final_layernorm",
+        ]
+
+        weight_names_mapping = {
+            "model.mtp.embed_tokens.weight": "embed_tokens.weight",
+            "model.mtp.layers.0.eh_proj.weight": "eh_proj.weight",
+            "model.mtp.layers.0.eh_proj.weight_scale_inv": "eh_proj.weight_scale_inv",
+            "model.mtp.layers.0.enorm.m.weight": "enorm.weight",
+            "model.mtp.layers.0.hnorm.m.weight": "hnorm.weight",
+            "model.mtp.layers.0.input_layernorm.weight": "layers.0.input_layernorm.weight",
+            "model.mtp.layers.0.post_attention_layernorm.weight": "layers.0.post_attention_layernorm.weight",
+            "model.mtp.layers.0.self_attn.kv_a_layernorm.weight": "layers.0.self_attn.kv_a_layernorm.weight",
+            "model.mtp.layers.0.self_attn.kv_a_proj_with_mqa.weight": "layers.0.self_attn.kv_a_proj_with_mqa.weight",
+            "model.mtp.layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_inv": "layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_inv",
+            "model.mtp.layers.0.self_attn.kv_b_proj.weight": "layers.0.self_attn.kv_b_proj.weight",
+            "model.mtp.layers.0.self_attn.kv_b_proj.weight_scale_inv": "layers.0.self_attn.kv_b_proj.weight_scale_inv",
+            "model.mtp.layers.0.self_attn.o_proj.weight": "layers.0.self_attn.o_proj.weight",
+            "model.mtp.layers.0.self_attn.o_proj.weight_scale_inv": "layers.0.self_attn.o_proj.weight_scale_inv",
+            "model.mtp.layers.0.self_attn.q_a_layernorm.weight": "layers.0.self_attn.q_a_layernorm.weight",
+            "model.mtp.layers.0.self_attn.q_a_proj.weight": "layers.0.self_attn.q_a_proj.weight",
+            "model.mtp.layers.0.self_attn.q_a_proj.weight_scale_inv": "layers.0.self_attn.q_a_proj.weight_scale_inv",
+            "model.mtp.layers.0.self_attn.q_b_proj.weight": "layers.0.self_attn.q_b_proj.weight",
+            "model.mtp.layers.0.self_attn.q_b_proj.weight_scale_inv": "layers.0.self_attn.q_b_proj.weight_scale_inv",
+            "model.mtp.layers.0.transformer_layer.mlp.down_proj.weight": "layers.0.mlp.down_proj.weight",
+            "model.mtp.layers.0.transformer_layer.mlp.down_proj.weight_scale_inv": "layers.0.mlp.down_proj.weight_scale_inv",
+            "model.mtp.layers.0.transformer_layer.mlp.gate_proj.weight": "layers.0.mlp.gate_proj.weight",
+            "model.mtp.layers.0.transformer_layer.mlp.gate_proj.weight_scale_inv": "layers.0.mlp.gate_proj.weight_scale_inv",
+            "model.mtp.layers.0.transformer_layer.mlp.up_proj.weight": "layers.0.mlp.up_proj.weight",
+            "model.mtp.layers.0.transformer_layer.mlp.up_proj.weight_scale_inv": "layers.0.mlp.up_proj.weight_scale_inv",
+            "model.mtp.norm.weight": "layers.0.final_layernorm.weight",
+        }
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            futures = []
+            params_dict = dict(self.named_parameters())
+            weight_names = []
+            for name, loaded_weight in weights:
+                if ".mtp." not in name:
+                    continue
+                if name in weight_names_mapping:
+                    name = weight_names_mapping[name]
+                if name.startswith("layers.0"):
+                    name = "model." + name
+                if (
+                    name.startswith("enorm")
+                    or name.startswith("hnorm")
+                    or name.startswith("eh_proj")
+                ):
+                    name = nextn_layer_prefix + "." + name
+                if not name.startswith(nextn_layer_prefix):
+                    continue
+
+                # Use shared head and embed weights from target model
+                if "shared_head.head" in name or "embed_tokens" in name:
+                    continue
+
+                is_decoder = True
+                # For nextn specific weights
+                for weight_name in nextn_spec_weight_names:
+                    if weight_name in name:
+                        name = name.replace(nextn_layer_prefix, "model")
+                        is_decoder = False
+                        break
+                # For decoder layer weights
+                if is_decoder:
+                    name = name.replace(nextn_layer_prefix, "model.decoder")
+
+                weight_names.append(name)
+                if "rotary_emb.inv_freq" in name:
+                    continue
+                for param_name, weight_name, shard_id in stacked_params_mapping:
+                    # Skip non-stacked layers and experts (experts handled below).
+                    if weight_name not in name:
+                        continue
+                    # We have mlp.experts[0].gate_proj in the checkpoint.
+                    # Since we handle the experts below in expert_params_mapping,
+                    # we need to skip here BEFORE we update the name, otherwise
+                    # name will be updated to mlp.experts[0].gate_up_proj, which
+                    # will then be updated below in expert_params_mapping
+                    # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                    if ("mlp.experts." in name) and name not in params_dict:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    futures.append(
+                        executor.submit(weight_loader, param, loaded_weight, shard_id)
+                    )
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    if fuse_qkv_a_proj and (
+                        "q_a_proj" in name or "kv_a_proj_with_mqa" in name
+                    ):
+                        cached_a_proj[name] = loaded_weight
+                        q_a_proj_name = (
+                            name
+                            if "q_a_proj" in name
+                            else name.replace("kv_a_proj_with_mqa", "q_a_proj")
+                        )
+                        kv_a_proj_name = (
+                            name
+                            if "kv_a_proj_with_mqa" in name
+                            else name.replace("q_a_proj", "kv_a_proj_with_mqa")
+                        )
+
+                        # When both q_a_proj and kv_a_proj_with_mqa has been cached, load the fused weight to parameter
+                        if (
+                            q_a_proj_name in cached_a_proj
+                            and kv_a_proj_name in cached_a_proj
+                        ):
+                            q_a_proj_weight = cached_a_proj[q_a_proj_name]
+                            kv_a_proj_weight = cached_a_proj[kv_a_proj_name]
+                            cat_dim = 0
+                            if self.quant_config is not None and (
+                                self.quant_config.get_name() == "awq"
+                                or self.quant_config.get_name() == "awq_marlin"
+                                or self.quant_config.get_name() == "moe_wna16"
+                            ):
+                                cat_dim = 1
+                            fused_weight = torch.cat(
+                                [q_a_proj_weight, kv_a_proj_weight], dim=cat_dim
+                            )
+                            param_name = (
+                                name.replace("q_a_proj", "fused_qkv_a_proj_with_mqa")
+                                if "q_a_proj" in name
+                                else name.replace(
+                                    "kv_a_proj_with_mqa",
+                                    "fused_qkv_a_proj_with_mqa",
+                                )
+                            )
+                            param = params_dict[param_name]
+
+                            weight_loader = getattr(
+                                param, "weight_loader", default_weight_loader
+                            )
+                            futures.append(
+                                executor.submit(weight_loader, param, fused_weight)
+                            )
+                            cached_a_proj.pop(q_a_proj_name)
+                            cached_a_proj.pop(kv_a_proj_name)
+                    else:
+                        if (
+                            "k_scale" in name or "v_scale" in name
+                        ) and name not in params_dict:
+                            # modelopt attn kv scale is named differently
+                            for scale in ["k_scale", "v_scale"]:
+                                if scale in name:
+                                    name = name.replace(f"{scale[0]}_proj", "attn_mqa")
+                                    break
+                        if name not in params_dict:
+                            # modelopt ckpt contains not needed weights for MTP module:
+                            # model.decoder.self_attn.attn_mqa.v_scale and
+                            # model.decoder.self_attn.attn_mqa.k_scale
+                            logger.warning(f"{name} not found in params_dict.")
+                            continue
+                        param = params_dict[name]
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        futures.append(
+                            executor.submit(weight_loader, param, loaded_weight)
+                        )
+        self.post_load_weights()
+
+
+EntryClass = [LongcatFlashForCausalLMNextN]
diff --git a/sglang/python/sglang/srt/models/midashenglm.py b/sglang/python/sglang/srt/models/midashenglm.py
new file mode 100644
index 0000000000000000000000000000000000000000..2698fd724edcc28264aaa54a7c2054e843b52c59
--- /dev/null
+++ b/sglang/python/sglang/srt/models/midashenglm.py
@@ -0,0 +1,706 @@
+import collections
+import collections.abc
+import logging
+from collections.abc import Callable, Sequence
+from typing import Iterable, List, Optional, Tuple, TypeAlias, cast
+
+import torch
+import torch.nn as nn
+import torchaudio.functional as F
+from transformers import PretrainedConfig
+
+from sglang.srt.layers.attention.vision import VisionAttention
+from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternMultimodalTokens,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.qwen2 import Qwen2ForCausalLM
+from sglang.srt.utils import add_prefix
+
+logger = logging.getLogger(__name__)
+_Tuple2: TypeAlias = int | tuple[int, int] | Sequence[int]
+
+
+def _resolve_tuple2(x: _Tuple2) -> tuple[int, int]:
+    if isinstance(x, collections.abc.Sequence):
+        assert (
+            len(x) == 2
+        ), f"Expected a sequence of length 2, got {x} with length {len(x)}"
+        return cast(tuple[int, int], tuple(x))
+    return (x, x)
+
+
+def calculate_mel_frames_dasheng(
+    audio_length_samples: int,
+    n_fft: int = 512,
+    hop_size: int = 160,
+    dasheng_subsampling: int = 4,
+    center=True,
+    model_subsampling: int = 5,
+) -> int:
+    """Calculate the number of Mel-spectrogram frames."""
+    if center:
+        audio_length_samples = audio_length_samples + n_fft
+
+    return (
+        int(1 + ((audio_length_samples - n_fft) / hop_size))
+        // dasheng_subsampling
+        // model_subsampling
+    )
+
+
+class AudioPatchEmbed(nn.Module):
+    def __init__(
+        self,
+        input_size: _Tuple2 = 64,
+        patch_size: _Tuple2 = 16,
+        patch_stride: _Tuple2 = 16,
+        in_chans: int = 1,
+        embed_dim: int = 768,
+        norm_layer: Callable | None = None,
+        flatten: bool = False,
+    ):
+        super().__init__()
+        self.input_size = _resolve_tuple2(input_size)
+        self.patch_size = _resolve_tuple2(patch_size)
+        self.patch_stride = _resolve_tuple2(patch_stride)
+        self.grid_size = (
+            self.input_size[0] // self.patch_stride[0],
+            self.input_size[1] // self.patch_stride[1],
+        )
+        self.num_patches = self.grid_size[0] * self.grid_size[1]
+        self.flatten = flatten
+        self.proj = nn.Conv2d(
+            in_chans,
+            embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_stride,
+        )
+        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.proj(x)
+        if self.flatten:
+            x = torch.permute(torch.flatten(x, 2, 3), (0, 2, 1))
+        x = self.norm(x)
+        return x
+
+
+class LayerScale(nn.Module):
+    def __init__(self, dim, init_values=1e-5, inplace=False):
+        super().__init__()
+        self.inplace = inplace
+        self.gamma = nn.Parameter(init_values * torch.ones(dim))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x.mul_(self.gamma) if self.inplace else x * self.gamma
+
+
+class DashengMlp(nn.Module):
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: int | None = None,
+        out_features: int | None = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = ColumnParallelLinear(
+            input_size=in_features,
+            output_size=hidden_features,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("fc1", prefix),
+        )
+        self.act = nn.GELU()
+        self.fc2 = RowParallelLinear(
+            input_size=hidden_features,
+            output_size=out_features,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("fc2", prefix),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x, _ = self.fc1(x)
+        x = self.act(x)
+        x, _ = self.fc2(x)
+        return x
+
+
+class DashengAttention(nn.Module):
+    """Audio encoder attention using VisionAttention for compatibility."""
+
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 8,
+        qkv_bias: bool = False,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        assert dim % num_heads == 0, "dim should be divisible by num_heads"
+        self.embed_dim = dim
+        self.num_heads = num_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        self.scale = self.head_dim**-0.5
+
+        self.attn = VisionAttention(
+            embed_dim=dim,
+            num_heads=num_heads,
+            projection_size=dim,
+            use_qkv_parallel=True,
+            proj_bias=True,
+            qkv_bias=qkv_bias,
+            qkv_backend="sdpa",
+            softmax_in_single_precision=False,
+            flatten_batch=False,
+            quant_config=quant_config,
+            prefix=prefix,
+        )
+
+    def forward(self, x: torch.Tensor, mask: torch.Tensor | None = None):
+        """
+        Args:
+            x: [B, N, C] tensor
+            mask: [B, N] boolean mask
+        """
+        attn_mask = None
+        if mask is not None:
+            attn_mask = mask.unsqueeze(1).unsqueeze(2)  # [B, 1, 1, N]
+            attn_mask = attn_mask.float()
+            attn_mask = (1.0 - attn_mask) * -10000.0
+
+        x = self.attn(x, attn_mask=attn_mask)
+        return x
+
+
+class DashengBlock(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        qkv_bias: bool = False,
+        init_values: float | None = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.norm1 = nn.LayerNorm(dim, eps=1e-6)
+        self.attn = DashengAttention(
+            dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+        self.ls1 = (
+            LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+        )
+        self.norm2 = nn.LayerNorm(dim, eps=1e-6)
+        self.mlp = DashengMlp(
+            in_features=dim,
+            hidden_features=int(dim * mlp_ratio),
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.ls2 = (
+            LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        mask: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        x = x + self.ls1(self.attn(self.norm1(x), mask))
+        x = x + self.ls2(self.mlp(self.norm2(x)))
+        return x
+
+
+class DashengFrontend(nn.Module):
+    """Audio frontend that converts waveforms to log mel-spectrograms."""
+
+    def __init__(self, config: PretrainedConfig):
+        super().__init__()
+        self.n_fft = config.n_fft
+        self.hop_length = config.hop_length
+        self.win_length = config.win_length
+        self.center = config.center
+        spectrogram_window = torch.hann_window(config.win_length)
+        self.register_buffer(
+            "spectrogram_window",
+            spectrogram_window,
+            persistent=False,
+        )
+        self.spectrogram_window: torch.Tensor
+        melscale_fbanks = F.melscale_fbanks(
+            n_freqs=config.n_fft // 2 + 1,
+            f_min=config.f_min,
+            f_max=config.f_max,
+            n_mels=config.n_mels,
+            sample_rate=config.sample_rate,
+        )
+        self.register_buffer("melscale_fbanks", melscale_fbanks, persistent=False)
+        self.melscale_fbanks: torch.Tensor
+
+    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
+        """Convert waveform to log mel-spectrogram.
+
+        Args:
+            waveform: [B, T] tensor of audio samples
+
+        Returns:
+            log_mel_spectrogram: [B, n_mels, time] tensor
+        """
+        spectrogram = F.spectrogram(
+            waveform=waveform.to(torch.float32),
+            pad=0,
+            window=self.spectrogram_window,
+            n_fft=self.n_fft,
+            hop_length=self.hop_length,
+            win_length=self.win_length,
+            power=2,
+            normalized=False,
+            center=self.center,
+        )
+        mel_spectrogram = (spectrogram.mT @ self.melscale_fbanks.to(torch.float32)).mT
+        log_mel_spectrogram = F.amplitude_to_DB(
+            mel_spectrogram.unsqueeze(1),
+            multiplier=10,
+            amin=1e-10,
+            db_multiplier=0,
+            top_db=120,
+        ).squeeze(1)
+        return log_mel_spectrogram.to(waveform.dtype)
+
+
+class DashengAudioTransformer(nn.Module):
+    """Audio encoder transformer."""
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.target_length = config.target_length
+        self.hop_length = config.hop_length
+        self.front_end = DashengFrontend(config)
+        self.init_bn = nn.BatchNorm2d(config.n_mels, momentum=0.01)
+        self.patch_embed = AudioPatchEmbed(
+            input_size=(config.n_mels, config.target_length),
+            embed_dim=config.embed_dim,
+            in_chans=config.input_channels,
+            patch_size=config.patch_size,
+            flatten=False,
+            patch_stride=config.patch_stride,
+        )
+        self.time_pos_embed = nn.Parameter(
+            torch.empty(1, config.embed_dim, 1, self.patch_embed.grid_size[1])
+        )
+        self.freq_pos_embed = nn.Parameter(
+            torch.empty(1, config.embed_dim, self.patch_embed.grid_size[0], 1)
+        )
+        self.blocks = nn.ModuleList(
+            DashengBlock(
+                dim=config.embed_dim,
+                num_heads=config.num_heads,
+                mlp_ratio=config.mlp_ratio,
+                qkv_bias=config.qkv_bias,
+                init_values=config.init_values,
+                quant_config=quant_config,
+                prefix=add_prefix(f"blocks.{i}", prefix),
+            )
+            for i in range(config.depth)
+        )
+        self.norm = nn.LayerNorm(config.embed_dim, eps=1e-6)
+
+    def forward_features(
+        self,
+        x: torch.Tensor,
+        mask: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        t = x.shape[-1]
+        x = x + self.time_pos_embed[:, :, :, :t]
+        x = x + self.freq_pos_embed[:, :, :, :]
+        x = torch.permute(torch.flatten(x, 2, 3), (0, 2, 1))
+        for block in self.blocks:
+            x = block(x, mask)
+        x = self.norm(x)
+        return x
+
+    def _to_mask(self, lengths: torch.Tensor, max_length: int) -> torch.Tensor:
+        batch_size = len(lengths)
+        idx = torch.arange(max_length, device=lengths.device)
+        idx = idx.repeat(batch_size).view(batch_size, max_length)
+        mask = (idx < lengths.unsqueeze(-1)).bool()
+        return mask
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        x_length: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        """
+        Args:
+            x: [B, T] audio waveform tensor
+            x_length: [B] tensor of audio lengths
+
+        Returns:
+            x: [B, seq_len, embed_dim] encoded features
+            mask: [B, seq_len] mask tensor
+        """
+        x = self.front_end(x)
+        x = x.to(self.time_pos_embed.dtype)
+        target_length_in_patches = self.target_length // 4
+        x = x.unsqueeze(1)
+        x = torch.permute(x, (0, 2, 1, 3))
+        x = self.init_bn(x)
+        x = torch.permute(x, (0, 2, 1, 3))
+        x = self.patch_embed(x)
+        t = x.shape[-1]
+        input_splits = x.split(target_length_in_patches, dim=-1)
+        if x_length is not None:
+            assert len(x_length) == len(
+                x
+            ), "batchsizes of input x and x_length need to be same"
+            assert x_length.ndim == 1, "Lengths are of size (B,)"
+            scaled_lengths = (x_length / (self.hop_length * 4)).long()
+            mask = self._to_mask(max_length=t, lengths=scaled_lengths)
+            split_masks = mask.split(target_length_in_patches, dim=-1)
+        else:
+            mask = None
+            split_masks = [None] * len(input_splits)
+        outputs = []
+        for split_x, split_mask in zip(input_splits, split_masks):
+            forward_kwargs = {}
+            forward_kwargs["mask"] = split_mask
+            split_x = self.forward_features(split_x, **forward_kwargs)
+            outputs.append(split_x)
+        x = torch.cat(outputs, dim=1)
+        return x, mask
+
+
+class AudioProjectorSubsample(nn.Module):
+    """Audio projector with subsampling."""
+
+    def __init__(
+        self,
+        in_dim: int,
+        out_dim: int,
+        downsample_rate=5,
+        dtype: torch.dtype | None = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.k = downsample_rate
+        self.fc1 = ColumnParallelLinear(
+            input_size=in_dim * self.k,
+            output_size=out_dim,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("net.0", prefix),
+        )
+        self.act = nn.GELU()
+        self.fc2 = RowParallelLinear(
+            input_size=out_dim,
+            output_size=out_dim,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("net.2", prefix),
+        )
+
+    def forward(self, x, mask=None):
+        batch_size, seq_len, dim = x.shape
+        num_frames_to_discard = seq_len % self.k
+        if num_frames_to_discard > 0:
+            x = x[:, :-num_frames_to_discard, :]
+            if mask is not None:
+                mask = mask[:, :-num_frames_to_discard]
+        if mask is None:
+            mask = torch.ones(x.shape[:-1], dtype=torch.long, device=x.device)
+        x = x.reshape(batch_size, -1, self.k * dim)
+        x, _ = self.fc1(x)
+        x = self.act(x)
+        x, _ = self.fc2(x)
+        mask = mask.reshape(batch_size, -1, self.k)
+        mask = mask.any(dim=-1).long()
+        return x, mask
+
+
+class MiDashengLMModel(nn.Module):
+    """MiDashengLM model for audio-language processing."""
+
+    default_bitsandbytes_target_modules = [
+        ".fc1.",
+        ".fc2.",
+        ".gate_up_proj.",
+        ".down_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+
+    bitsandbytes_stacked_params_mapping = {
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        if (
+            hasattr(config.text_config, "rope_scaling")
+            and config.text_config.rope_scaling
+        ):
+            if "mrope_section" in config.text_config.rope_scaling:
+
+                new_rope_scaling = {
+                    k: v
+                    for k, v in config.text_config.rope_scaling.items()
+                    if k != "mrope_section"
+                }
+                config.text_config.rope_scaling = (
+                    new_rope_scaling if new_rope_scaling else None
+                )
+        self.audio_encoder = DashengAudioTransformer(
+            config.audio_encoder_config,
+            quant_config=quant_config,
+            prefix=add_prefix("audio_encoder", prefix),
+        )
+        self.audio_projector = AudioProjectorSubsample(
+            in_dim=config.audio_encoder_config.embed_dim,
+            out_dim=config.text_config.hidden_size,
+            downsample_rate=config.subsample_factor,
+            quant_config=quant_config,
+            prefix=add_prefix("audio_projector", prefix),
+        )
+        self.language_model = Qwen2ForCausalLM(
+            config.text_config,
+            quant_config=quant_config,
+            prefix=add_prefix("decoder", prefix),
+        )
+        self.logits_processor = self.language_model.logits_processor
+        self.quant_config = quant_config
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        """Pad input IDs with multimodal tokens."""
+        pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+        return pattern.pad_input_tokens(input_ids, mm_inputs)
+
+    def get_audio_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        """Process audio inputs and return embeddings.
+
+        Args:
+            items: List of multimodal data items containing audio features
+
+        Returns:
+            audio_embeddings: Concatenated audio embeddings
+        """
+        logger.debug("=" * 80)
+        logger.debug(f"get_audio_feature called with {len(items)} items")
+        logger.debug("=" * 80)
+        for i, item in enumerate(items):
+            logger.debug(f"Item {i} feature shape: {item.feature.shape}")
+            logger.debug(
+                f"Item {i} audio_length: {getattr(item, 'audio_length', 'NOT SET')}"
+            )
+            logger.debug(f"Item {i} pad_value: {getattr(item, 'pad_value', 'NOT SET')}")
+            logger.debug(f"Item {i} hash: {getattr(item, 'hash', 'NOT SET')}")
+        input_values = torch.cat([item.feature for item in items], dim=0)
+        logger.debug(f"Concatenated input_values shape: {input_values.shape}")
+        audio_lengths = []
+        for item in items:
+            if hasattr(item, "audio_length") and item.audio_length is not None:
+                audio_lengths.append(item.audio_length)
+            else:
+                audio_lengths.append(item.feature.shape[-1])
+        audio_length = torch.tensor(audio_lengths, device=input_values.device)
+        logger.debug(f"audio_length: {audio_length}")
+        encoder_out, encoder_atts = self.audio_encoder(input_values, audio_length)
+        logger.debug(f"Encoder output shape: {encoder_out.shape}")
+        audio_embeddings, _ = self.audio_projector(encoder_out, encoder_atts)
+        audio_embeddings = audio_embeddings.to(input_values.dtype)
+        logger.debug(f"Projector output shape: {audio_embeddings.shape}")
+        batch_size, max_audio_tokens, embed_dim = audio_embeddings.shape
+        logger.debug(f"Using all {max_audio_tokens} audio tokens from projector output")
+        masked_audio_features = audio_embeddings.reshape(-1, embed_dim)
+        logger.debug(f"Final output shape: {masked_audio_features.shape}")
+        logger.debug(
+            f"Stats: min={masked_audio_features.min().item():.4f}, max={masked_audio_features.max().item():.4f}"
+        )
+        logger.debug(
+            f"Audio embeddings dtype: {masked_audio_features.dtype}, device: {masked_audio_features.device}"
+        )
+        logger.debug(
+            f"First 5 values of first audio token: {masked_audio_features[0, :5].tolist()}"
+        )
+        logger.debug("=" * 80)
+        return masked_audio_features
+
+    def get_input_embeddings(self):
+        return self.language_model.model.embed_tokens
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        **kwargs,
+    ):
+        """Run forward pass for MiDashengLM.
+
+        Args:
+            input_ids: Flattened (concatenated) input_ids corresponding to a batch.
+            positions: Flattened (concatenated) position ids corresponding to a batch.
+            forward_batch: Forward batch information including multimodal data.
+        """
+        if forward_batch.contains_mm_inputs():
+            logger.debug("=" * 80)
+            logger.debug(f"input_ids shape: {input_ids.shape}")
+            logger.debug(f"input_ids first 20: {input_ids[:20].tolist()}")
+            logger.debug(
+                f"input_ids unique values count: {len(torch.unique(input_ids))}"
+            )
+            if forward_batch.mm_inputs and len(forward_batch.mm_inputs) > 0:
+                mm_input = forward_batch.mm_inputs[0]
+                if mm_input and len(mm_input.mm_items) > 0:
+                    pad_value = mm_input.mm_items[0].pad_value
+                    logger.debug(f"Expected pad_value: {pad_value}")
+                    logger.debug(
+                        f"Count of pad_value in input_ids: {(input_ids == pad_value).sum().item()}"
+                    )
+                    if hasattr(mm_input, "audio_token_id") and mm_input.audio_token_id:
+                        logger.debug(f"audio_token_id: {mm_input.audio_token_id}")
+                        logger.debug(
+                            f"Count of audio_token_id in input_ids: {(input_ids == mm_input.audio_token_id).sum().item()}"
+                        )
+            logger.debug("=" * 80)
+
+        return general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.language_model,
+            positions=positions,
+            data_embedding_funcs={Modality.AUDIO: self.get_audio_feature},
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        """Load model weights."""
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        buffers_dict = dict(self.named_buffers())
+        audio_encoder_loaded = []
+        audio_projector_loaded = []
+        skipped_weights = []
+        decoder_weights = []
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                continue
+            if name.startswith("decoder"):
+                decoder_weights.append((name, loaded_weight))
+                continue
+            original_name = name
+            if "audio_encoder.front_end" in name:
+                if ".mel_scale.fb" in name:
+                    name = name.replace(".mel_scale.fb", ".melscale_fbanks")
+                elif ".spectrogram.window" in name:
+                    name = name.replace(".spectrogram.window", ".spectrogram_window")
+            if "audio_encoder" in name and ".attn.qkv." in name:
+                name = name.replace(".attn.qkv.", ".attn.attn.qkv_proj.")
+            if "audio_encoder" in name and ".attn.proj." in name:
+                name = name.replace(".attn.proj.", ".attn.attn.proj.")
+            if "audio_projector" in name:
+                name = name.replace(".net.0.", ".fc1.")
+                name = name.replace(".net.2.", ".fc2.")
+            if (
+                name.endswith(".bias")
+                and name not in params_dict
+                and name not in buffers_dict
+            ):
+                skipped_weights.append(f"{original_name} (bias not in params/buffers)")
+                continue
+            if name in params_dict:
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            elif name in buffers_dict:
+                buffers_dict[name].copy_(loaded_weight)
+            else:
+                if "audio_projector" in original_name:
+                    skipped_weights.append(f"{original_name} -> {name} (NOT IN MODEL)")
+                else:
+                    skipped_weights.append(f"{original_name} (not in model)")
+                continue
+
+            if "audio_encoder" in original_name:
+                audio_encoder_loaded.append(original_name)
+            elif "audio_projector" in original_name:
+                audio_projector_loaded.append(original_name)
+        if decoder_weights:
+            logger.debug(
+                f"Passing {len(decoder_weights)} decoder weights to language_model.load_weights()"
+            )
+            decoder_weights_stripped = [
+                (name.replace("decoder.", "", 1), weight)
+                for name, weight in decoder_weights
+            ]
+            self.language_model.load_weights(decoder_weights_stripped)
+        logger.debug("=" * 80)
+        logger.debug(f"Audio encoder weights loaded: {len(audio_encoder_loaded)}")
+        logger.debug(f"Audio projector weights loaded: {len(audio_projector_loaded)}")
+        logger.debug(
+            f"Decoder weights passed to language_model: {len(decoder_weights)}"
+        )
+        logger.debug(f"Skipped weights: {len(skipped_weights)}")
+        encoder_skipped = [s for s in skipped_weights if "audio_encoder" in s]
+        projector_skipped = [s for s in skipped_weights if "audio_projector" in s]
+        if projector_skipped:
+            logger.debug("Skipped audio_projector weights:")
+            for s in projector_skipped:
+                logger.debug(f"  {s}")
+        if encoder_skipped:
+            logger.debug(f"Skipped audio_encoder weights: {len(encoder_skipped)}")
+            non_bias_skipped = [s for s in encoder_skipped if "bias" not in s]
+            if non_bias_skipped:
+                logger.debug("  First 10 non-bias skipped:")
+                for s in non_bias_skipped[:10]:
+                    logger.debug(f"    {s}")
+        logger.debug("=" * 80)
+
+    def get_embed_and_head(self):
+        return (
+            self.language_model.model.embed_tokens.weight,
+            self.language_model.lm_head.weight,
+        )
+
+
+EntryClass = [MiDashengLMModel]
diff --git a/sglang/python/sglang/srt/models/mimo.py b/sglang/python/sglang/srt/models/mimo.py
new file mode 100644
index 0000000000000000000000000000000000000000..15aad8f41c5dac9c5ff73a5433e428077d7c0eff
--- /dev/null
+++ b/sglang/python/sglang/srt/models/mimo.py
@@ -0,0 +1,160 @@
+# Adapted from qwen2.py
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.qwen2 import Qwen2DecoderLayer, Qwen2Model
+from sglang.srt.utils import add_prefix
+
+MiMoConfig = None
+
+
+class MiMoModel(Qwen2Model):
+    def __init__(
+        self,
+        config: MiMoConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(
+            config=config,
+            quant_config=quant_config,
+            prefix=prefix,
+            decoder_layer_type=Qwen2DecoderLayer,
+        )
+
+
+class MiMoForCausalLM(nn.Module):
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    def __init__(
+        self,
+        config: MiMoConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = MiMoModel(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        if config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+            )
+        self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = False,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        if not get_embedding:
+            return self.logits_processor(
+                input_ids, hidden_states, self.lm_head, forward_batch
+            )
+        else:
+            return self.pooler(hidden_states, forward_batch)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if (
+                "rotary_emb.inv_freq" in name
+                or "projector" in name
+                or "mtp_layers" in name
+            ):
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+            if name.startswith("model.vision_tower") and name not in params_dict:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        del self.lm_head.weight
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+        self.model.load_kv_cache_scales(quantization_param_path)
+
+
+EntryClass = MiMoForCausalLM
diff --git a/sglang/python/sglang/srt/models/mimo_mtp.py b/sglang/python/sglang/srt/models/mimo_mtp.py
new file mode 100644
index 0000000000000000000000000000000000000000..2702a637d46fae0073e77f30b734e6c539cd4dc9
--- /dev/null
+++ b/sglang/python/sglang/srt/models/mimo_mtp.py
@@ -0,0 +1,203 @@
+# Adapted from https://github.com/vllm-project/vllm/pull/17433/files  and deepseek_nextn.py
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.qwen2 import Qwen2DecoderLayer
+
+
+class MiMoMultiTokenPredictorLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        prefix: str,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+        )
+        self.token_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.hidden_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.input_proj = nn.Linear(
+            config.hidden_size * 2, config.hidden_size, bias=False
+        )
+        self.mtp_block = Qwen2DecoderLayer(
+            config=config, quant_config=quant_config, prefix=prefix
+        )
+        self.final_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+        # masking inputs at position 0, as not needed by MTP
+        hidden_states[positions == 0] = 0
+
+        hidden_states = self.input_proj(
+            torch.cat(
+                (
+                    self.hidden_layernorm(forward_batch.spec_info.hidden_states),
+                    self.token_layernorm(hidden_states),
+                ),
+                dim=-1,
+            )
+        )
+
+        hidden_states, residual = self.mtp_block(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+            residual=None,
+        )
+        hidden_states = residual + hidden_states
+        hidden_states = self.final_layernorm(hidden_states)
+        return hidden_states
+
+
+class MiMoMTP(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        nn.Module.__init__(self)
+        self.config = config
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.quant_config = quant_config
+
+        self.model = MiMoMultiTokenPredictorLayer(
+            config,
+            prefix,
+            quant_config,
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name or "projector" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+            if name.startswith("model.vision_tower") and name not in params_dict:
+                continue
+            name = self.map_model_name_to_mtp_param_name(name)
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                if "mtp_block" not in name:
+                    break
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if "mtp_block" not in name and (
+                    "embed_tokens" not in name
+                    and "lm_head" not in name
+                    and "token_layernorm" not in name
+                    and "hidden_layernorm" not in name
+                    and "input_proj" not in name
+                    and "final_layernorm" not in name
+                ):
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+    def map_model_name_to_mtp_param_name(self, name: str) -> str:
+        import re
+
+        name_without_prefix = [
+            "token_layernorm",
+            "hidden_layernorm",
+            "input_proj",
+            "final_layernorm",
+        ]
+        pattern = r"model.mtp_layers.(\d+)."
+        group = re.match(pattern, name)
+        if group is not None:
+            for sub_name in name_without_prefix:
+                if sub_name in name:
+                    name = name.replace(group.group(), "model.")
+                    return name
+            name = name.replace(group.group(), "model.mtp_block.")
+        return name
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        del self.lm_head.weight
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+
+EntryClass = MiMoMTP
diff --git a/sglang/python/sglang/srt/models/mimo_v2_flash.py b/sglang/python/sglang/srt/models/mimo_v2_flash.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6f5eb07f4de6a1061fd6aebaba47e7c8ef6a235
--- /dev/null
+++ b/sglang/python/sglang/srt/models/mimo_v2_flash.py
@@ -0,0 +1,1169 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import logging
+from typing import Any, Dict, Iterable, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from sglang.srt.batch_overlap.two_batch_overlap import model_forward_maybe_tbo
+from sglang.srt.distributed import (
+    get_moe_expert_parallel_world_size,
+    get_pp_group,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation
+from sglang.srt.eplb.expert_location_dispatch import ExpertLocationDispatchInfo
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.communicator import (
+    LayerCommunicator,
+    LayerScatterModes,
+    ScatterMode,
+    enable_moe_dense_fully_dp,
+)
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe import (
+    get_moe_a2a_backend,
+    get_moe_runner_backend,
+    should_use_flashinfer_cutlass_moe_fp4_allgather,
+)
+from sglang.srt.layers.moe.ep_moe.layer import DeepEPMoE, get_moe_impl_class
+from sglang.srt.layers.moe.topk import TopK, TopKOutputFormat
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer, get_layer_id
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    kv_cache_scales_loader,
+)
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import (
+    LazyValue,
+    add_prefix,
+    is_non_idle_and_non_empty,
+    make_layers,
+)
+
+MiMoV2FlashConfig = None
+
+logger = logging.getLogger(__name__)
+
+
+class MiMoV2MLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        reduce_results: bool = True,
+        prefix: str = "",
+        tp_rank: Optional[int] = None,
+        tp_size: Optional[int] = None,
+    ) -> None:
+        super().__init__()
+        self.tp_size = tp_size
+
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+            tp_rank=tp_rank,
+            tp_size=tp_size,
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+            prefix=add_prefix("down_proj", prefix),
+            tp_rank=tp_rank,
+            tp_size=tp_size,
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(
+        self,
+        x,
+        forward_batch: ForwardBatch = None,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+    ):
+        if (self.tp_size == 1) and x.shape[0] == 0:
+            return x
+
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(
+            x, skip_all_reduce=should_allreduce_fusion or use_reduce_scatter
+        )
+        return x
+
+
+class MoEGate(nn.Module):
+    def __init__(
+        self,
+        config,
+        quant_config,
+        prefix: str = "",
+        is_nextn: bool = False,
+    ):
+        super().__init__()
+        self.is_nextn = is_nextn
+        self.dtype = torch.float32
+        self.weight = nn.Parameter(
+            torch.empty((config.n_routed_experts, config.hidden_size), dtype=self.dtype)
+        )
+        if config.topk_method == "noaux_tc":
+            correction_bias_dtype = (
+                torch.bfloat16
+                if quant_config is not None
+                and quant_config.get_name() == "modelopt_fp4"
+                and get_moe_runner_backend().is_flashinfer_trtllm()
+                else self.dtype
+            )
+            self.e_score_correction_bias = nn.Parameter(
+                torch.empty((config.n_routed_experts), dtype=correction_bias_dtype)
+            )
+        else:
+            self.e_score_correction_bias = None
+
+    def forward(self, hidden_states):
+        logits = F.linear(hidden_states.to(self.dtype), self.weight, None)
+
+        return logits
+
+
+class MiMoV2MoE(nn.Module):
+
+    def __init__(
+        self,
+        config: MiMoV2FlashConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        is_nextn: bool = False,
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+
+        self.config = config
+        self.layer_id = layer_id
+
+        if self.tp_size > config.n_routed_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {config.n_routed_experts}."
+            )
+
+        if config.hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {config.hidden_act}. "
+                "Only silu is supported for now."
+            )
+
+        self.gate = MoEGate(
+            config=config,
+            quant_config=quant_config,
+            prefix=add_prefix("gate", prefix),
+            is_nextn=is_nextn,
+        )
+
+        experts_type = get_moe_impl_class(quant_config)
+        self.experts = experts_type(
+            num_experts=config.n_routed_experts
+            + get_global_server_args().ep_num_redundant_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            layer_id=self.layer_id,
+            quant_config=quant_config,
+            routed_scaling_factor=1.0,
+            prefix=add_prefix("experts", prefix),
+        )
+
+        self.topk = TopK(
+            top_k=config.num_experts_per_tok,
+            renormalize=config.norm_topk_prob,
+            use_grouped_topk=True,
+            num_expert_group=config.n_group,
+            topk_group=config.topk_group,
+            correction_bias=self.gate.e_score_correction_bias,
+            quant_config=quant_config,
+            routed_scaling_factor=1.0,
+            apply_routed_scaling_factor_on_output=self.experts.should_fuse_routed_scaling_factor_in_topk,
+            # Some Fp4 MoE backends require the output format to be bypassed but the MTP layers are unquantized
+            # and requires the output format to be standard. We use quant_config to determine the output format.
+            output_format=TopKOutputFormat.STANDARD if quant_config is None else None,
+        )
+
+        # todo : implement tbo forward needed
+        if get_moe_a2a_backend().is_deepep() or get_moe_a2a_backend().is_mooncake():
+            # TODO: we will support tp < ep in the future
+            self.ep_size = get_moe_expert_parallel_world_size()
+            self.num_experts = (
+                config.n_routed_experts
+                + get_global_server_args().ep_num_redundant_experts
+            )
+            self.renormalize = config.norm_topk_prob
+            self.topk_group = config.topk_group
+            self.num_expert_group = config.n_group
+            self.correction_bias = (
+                self.gate.e_score_correction_bias.data
+                if self.gate.e_score_correction_bias is not None
+                else None
+            )
+
+        self._enable_a2a_moe = (
+            get_moe_a2a_backend().is_deepep() or get_moe_a2a_backend().is_mooncake()
+        )
+
+    def get_moe_weights(self):
+        return [
+            x.data
+            for name, x in self.experts.named_parameters()
+            if name not in ["correction_bias"]
+        ]
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: Optional[ForwardBatch] = None,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+    ) -> torch.Tensor:
+        if not self._enable_a2a_moe:
+            return self.forward_normal(
+                hidden_states,
+                should_allreduce_fusion,
+                use_reduce_scatter,
+            )
+        else:
+            return self.forward_deepep(hidden_states, forward_batch)
+
+    def forward_normal(
+        self,
+        hidden_states: torch.Tensor,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+    ) -> torch.Tensor:
+
+        if hidden_states.shape[0] > 0:
+            # router_logits: (num_tokens, n_experts)
+            router_logits = self.gate(hidden_states)
+            topk_output = self.topk(hidden_states, router_logits)
+        else:
+            topk_output = self.topk.empty_topk_output(hidden_states.device)
+
+        final_hidden_states = self.experts(hidden_states, topk_output)
+
+        if (
+            self.tp_size > 1
+            and not should_allreduce_fusion
+            and not use_reduce_scatter
+            and not should_use_flashinfer_cutlass_moe_fp4_allgather()
+        ):
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+
+        return final_hidden_states
+
+    def forward_deepep(
+        self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
+    ) -> torch.Tensor:
+        if hidden_states.shape[0] > 0:
+            # router_logits: (num_tokens, n_experts)
+            router_logits = self.gate(hidden_states)
+            topk_output = self.topk(
+                hidden_states,
+                router_logits,
+                num_token_non_padded=forward_batch.num_token_non_padded,
+                expert_location_dispatch_info=ExpertLocationDispatchInfo.init_new(
+                    layer_id=self.layer_id,
+                ),
+            )
+        else:
+            topk_output = self.topk.empty_topk_output(hidden_states.device)
+
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states, topk_output=topk_output
+        )
+
+        return final_hidden_states
+
+    def op_gate(self, state):
+        if is_non_idle_and_non_empty(
+            state.forward_batch.forward_mode, state.hidden_states_mlp_input
+        ):
+            # router_logits: (num_tokens, n_experts)
+            state.router_logits = self.gate(state.hidden_states_mlp_input)
+        else:
+            state.router_logits = None
+
+    def op_select_experts(self, state):
+        router_logits = state.pop("router_logits")
+        hidden_states = state.hidden_states_mlp_input
+        if router_logits is not None:
+            with get_global_expert_distribution_recorder().with_current_layer(
+                self.layer_id
+            ):
+                state.topk_output = self.topk(
+                    hidden_states=hidden_states,
+                    router_logits=router_logits,
+                    num_token_non_padded=state.forward_batch.num_token_non_padded,
+                    expert_location_dispatch_info=ExpertLocationDispatchInfo.init_new(
+                        layer_id=self.layer_id,
+                    ),
+                )
+        else:
+            state.topk_output = self.topk.empty_topk_output(hidden_states.device)
+
+    def op_dispatch_a(self, state):
+        if self.ep_size > 1:
+            self.experts.dispatcher.dispatch_a(
+                hidden_states=state.pop("hidden_states_mlp_input"),
+                topk_output=state.pop("topk_output"),
+                tbo_subbatch_index=state.get("tbo_subbatch_index"),
+            )
+
+    def op_dispatch_b(self, state):
+        if self.ep_size > 1:
+            with get_global_expert_distribution_recorder().with_current_layer(
+                self.layer_id
+            ):
+                state.dispatch_output = self.experts.dispatcher.dispatch_b(
+                    tbo_subbatch_index=state.get("tbo_subbatch_index"),
+                )
+
+    def op_experts(self, state):
+        state.combine_input = self.experts.run_moe_core(
+            dispatch_output=state.dispatch_output,
+        )
+
+    def op_combine_a(self, state):
+        if self.ep_size > 1:
+            self.experts.dispatcher.combine_a(
+                combine_input=state.pop("combine_input"),
+                tbo_subbatch_index=state.get("tbo_subbatch_index"),
+            )
+            state.pop("dispatch_output")
+
+    def op_combine_b(self, state):
+        if self.ep_size > 1:
+            state.hidden_states_after_combine = self.experts.dispatcher.combine_b(
+                tbo_subbatch_index=state.get("tbo_subbatch_index"),
+            )
+
+    def op_output(self, state):
+        state.hidden_states_mlp_output = state.pop("hidden_states_after_combine")
+
+
+class MiMoV2Attention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        head_dim: Optional[int] = None,
+        v_head_dim: Optional[int] = None,
+        v_scale: Optional[float] = None,
+        sliding_window_size: int = -1,  # if is -1 ,normal attention,else ,window attention
+        attention_bias: bool = False,
+        attention_sink_bias: bool = False,
+        layer_id: int = 0,
+        rope_theta: float = 1000000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 32768,
+        quant_config: Optional[QuantizationConfig] = None,
+        partial_rotary_factor: float = 1.0,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+
+        attn_tp_rank = get_attention_tp_rank()
+        attn_tp_size = get_attention_tp_size()
+
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % attn_tp_size == 0
+        self.num_heads = self.total_num_heads // attn_tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= attn_tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % attn_tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert attn_tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // attn_tp_size)
+        self.head_dim = head_dim
+        self.v_head_dim = v_head_dim if v_head_dim is not None else head_dim
+
+        self.q_size = self.num_heads * self.head_dim
+        self.k_size = self.num_kv_heads * self.head_dim
+        self.v_size = self.num_kv_heads * self.v_head_dim
+
+        self.v_scale = v_scale
+
+        self.scaling = self.head_dim**-0.5
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            v_head_size=self.v_head_dim,
+            bias=attention_bias,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            prefix=add_prefix("qkv_proj", prefix),
+            skip_block_quant_check=True,
+        )
+
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.v_head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            reduce_results=False,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            partial_rotary_factor=partial_rotary_factor,
+        )
+
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            v_head_dim=self.v_head_dim,
+            sliding_window_size=sliding_window_size,  # if is -1 ,normal attention,else ,window attention
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+        self.attention_sink_bias = (
+            torch.nn.Parameter(torch.empty(self.num_heads), requires_grad=False)
+            if attention_sink_bias
+            else None
+        )
+
+    def op_prepare(self, state):
+        state.attn_intermediate_state = self.forward_prepare(
+            positions=state.positions,
+            hidden_states=state.pop("hidden_states_after_comm_pre_attn"),
+            forward_batch=state.forward_batch,
+        )
+
+    def op_core(self, state):
+        state.hidden_states_after_attn = self.forward_core(
+            state.pop("attn_intermediate_state")
+        )
+
+    def forward_prepare(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ):
+        if hidden_states.shape[0] == 0:
+            return hidden_states, forward_batch, None
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.k_size, self.v_size], dim=-1)
+
+        q, k = self.rotary_emb(positions, q, k)
+        if self.v_scale is not None:
+            v = v * self.v_scale
+
+        inner_state = q, k, v, forward_batch
+        return None, forward_batch, inner_state
+
+    def forward_core(self, intermediate_state):
+        hidden_states, forward_batch, inner_state = intermediate_state
+        if inner_state is None:
+            return hidden_states
+        attn_output = self.attn(
+            *inner_state,
+            sinks=self.attention_sink_bias,
+        )
+        output, _ = self.o_proj(attn_output)
+        return output
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.k_size, self.v_size], dim=-1)
+
+        # [t, h, dr]
+        q, k = self.rotary_emb(positions, q, k)
+        # [t, h, d]
+
+        if self.v_scale is not None:
+            v = v * self.v_scale
+        attn_output = self.attn(q, k, v, forward_batch, sinks=self.attention_sink_bias)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class MiMoV2DecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: MiMoV2FlashConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.layer_id = layer_id
+
+        rope_theta = getattr(config, "rope_theta", 1000000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 32768)
+
+        if self.is_swa_layer():
+            self.self_attn = MiMoV2Attention(
+                hidden_size=self.hidden_size,
+                num_heads=config.swa_num_attention_heads,
+                num_kv_heads=config.swa_num_key_value_heads,
+                head_dim=config.swa_head_dim,
+                v_head_dim=getattr(config, "swa_v_head_dim", None),
+                v_scale=getattr(config, "attention_value_scale", None),
+                sliding_window_size=config.sliding_window_size,
+                attention_bias=config.attention_bias,
+                attention_sink_bias=getattr(
+                    config, "add_swa_attention_sink_bias", False
+                ),
+                layer_id=layer_id,
+                rope_theta=getattr(config, "swa_rope_theta", rope_theta),
+                rope_scaling=rope_scaling,
+                max_position_embeddings=max_position_embeddings,
+                quant_config=quant_config,
+                partial_rotary_factor=getattr(config, "partial_rotary_factor", 1.0),
+                prefix=add_prefix("self_attn", prefix),
+            )
+        else:
+            self.self_attn = MiMoV2Attention(
+                hidden_size=self.hidden_size,
+                num_heads=self.config.num_attention_heads,
+                num_kv_heads=config.num_key_value_heads,
+                head_dim=config.head_dim,
+                v_head_dim=getattr(config, "v_head_dim", None),
+                v_scale=getattr(config, "attention_value_scale", None),
+                sliding_window_size=-1,  # normal attention
+                attention_bias=config.attention_bias,
+                attention_sink_bias=getattr(
+                    config, "add_full_attention_sink_bias", False
+                ),
+                layer_id=layer_id,
+                rope_theta=rope_theta,
+                rope_scaling=rope_scaling,
+                max_position_embeddings=max_position_embeddings,
+                quant_config=quant_config,
+                partial_rotary_factor=getattr(config, "partial_rotary_factor", 1.0),
+                prefix=add_prefix("self_attn", prefix),
+            )
+
+        self.is_layer_sparse = self.is_moe_layer(layer_id)
+        is_previous_layer_sparse = self.is_moe_layer(layer_id - 1)
+        is_next_layer_sparse = self.is_moe_layer(layer_id + 1)
+
+        if self.is_layer_sparse:
+            self.mlp = MiMoV2MoE(
+                config=config,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+                layer_id=layer_id,
+            )
+        else:
+            if enable_moe_dense_fully_dp():
+                mlp_tp_rank, mlp_tp_size = 0, 1
+            else:
+                mlp_tp_rank, mlp_tp_size = None, None
+            self.mlp = MiMoV2MLP(
+                hidden_size=self.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+                tp_rank=mlp_tp_rank,
+                tp_size=mlp_tp_size,
+            )
+
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.layernorm_epsilon)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.layernorm_epsilon
+        )
+
+        self.layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=layer_id,
+            num_layers=config.num_hidden_layers,
+            is_layer_sparse=self.is_layer_sparse,
+            is_previous_layer_sparse=is_previous_layer_sparse,
+            is_next_layer_sparse=is_next_layer_sparse,
+        )
+        self.layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.layer_scatter_modes,
+            input_layernorm=self.input_layernorm,
+            post_attention_layernorm=self.post_attention_layernorm,
+            allow_reduce_scatter=True,
+            is_last_layer=(self.layer_id == self.config.num_hidden_layers - 1),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        hidden_states, residual = self.layer_communicator.prepare_attn(
+            hidden_states, residual, forward_batch
+        )
+
+        if hidden_states.shape[0] != 0:
+            hidden_states = self.self_attn(
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+            )
+
+        hidden_states, residual = self.layer_communicator.prepare_mlp(
+            hidden_states, residual, forward_batch
+        )
+
+        should_allreduce_fusion = (
+            self.layer_communicator.should_fuse_mlp_allreduce_with_next_layer(
+                forward_batch
+            )
+        )
+
+        # For DP with padding, reduce scatter can be used instead of all-reduce.
+        use_reduce_scatter = self.layer_communicator.should_use_reduce_scatter(
+            forward_batch
+        )
+
+        hidden_states = self.mlp(
+            hidden_states, forward_batch, should_allreduce_fusion, use_reduce_scatter
+        )
+
+        if should_allreduce_fusion:
+            hidden_states._sglang_needs_allreduce_fusion = True
+        else:
+            hidden_states, residual = self.layer_communicator.postprocess_layer(
+                hidden_states, residual, forward_batch
+            )
+
+        return hidden_states, residual
+
+    def is_moe_layer(self, layer_idx: int) -> bool:
+        return (
+            hasattr(self.config, "moe_layer_freq")
+            and 0 <= layer_idx < len(self.config.moe_layer_freq)
+            and not isinstance(self.config.moe_layer_freq, int)
+            and self.config.moe_layer_freq[layer_idx]
+        )
+
+    def is_swa_layer(self) -> bool:
+        return self.config.hybrid_layer_pattern[self.layer_id] == 1
+
+    def op_comm_prepare_attn(
+        self,
+        state,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+        tbo_subbatch_index: Optional[int] = None,
+    ):
+        state.hidden_states_after_comm_pre_attn, state.residual_after_input_ln = (
+            self.layer_communicator.prepare_attn(hidden_states, residual, forward_batch)
+        )
+        state.update(
+            dict(
+                forward_batch=forward_batch,
+                positions=positions,
+                tbo_subbatch_index=tbo_subbatch_index,
+            )
+        )
+
+    def op_comm_prepare_mlp(self, state):
+        state.hidden_states_mlp_input, state.residual_after_comm_pre_mlp = (
+            self.layer_communicator.prepare_mlp(
+                state.pop("hidden_states_after_attn"),
+                state.pop("residual_after_input_ln"),
+                state.forward_batch,
+            )
+        )
+
+    def op_mlp(self, state):
+        hidden_states = state.pop("hidden_states_mlp_input")
+        state.hidden_states_mlp_output = self.mlp(hidden_states, state.forward_batch)
+
+    def op_comm_postprocess_layer(self, state):
+        hidden_states, residual = self.layer_communicator.postprocess_layer(
+            state.pop("hidden_states_mlp_output"),
+            state.pop("residual_after_comm_pre_mlp"),
+            state.forward_batch,
+        )
+
+        output = dict(
+            positions=state.positions,
+            hidden_states=hidden_states,
+            residual=residual,
+            forward_batch=state.forward_batch,
+            tbo_subbatch_index=state.tbo_subbatch_index,
+        )
+
+        state.clear(
+            expect_keys={
+                "positions",
+                "forward_batch",
+                "tbo_subbatch_index",
+            }
+        )
+        return output
+
+
+class MiMoV2Model(nn.Module):
+    def __init__(
+        self,
+        config: MiMoV2FlashConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        decoder_layer_type: type[nn.Module] = MiMoV2DecoderLayer,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.pp_group = get_pp_group()
+
+        if self.pp_group.is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                use_attn_tp_group=is_dp_attention_enabled(),
+                prefix=add_prefix("embed_tokens", prefix),
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        # Use the provided decoder layer type or default to MiMoV2DecoderLayer
+        decoder_layer_type = decoder_layer_type or MiMoV2DecoderLayer
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            layer_fn=lambda idx, prefix: decoder_layer_type(
+                layer_id=idx,
+                config=config,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix=add_prefix("layers", prefix),
+        )
+        if self.pp_group.is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.layernorm_epsilon)
+        else:
+            self.norm = PPMissingLayer(return_tuple=True)
+
+    def get_input_embedding(self, input_ids: torch.Tensor) -> torch.Tensor:
+        if hasattr(self.config, "scale_emb"):
+            return self.get_input_embeddings()(input_ids) * self.config.scale_emb
+        else:
+            return self.get_input_embeddings()(input_ids)
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.embed_tokens
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[torch.Tensor, PPProxyTensors]:
+        if self.pp_group.is_first_rank:
+            if input_embeds is None:
+                hidden_states = self.embed_tokens(input_ids)
+            else:
+                hidden_states = input_embeds
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+
+        if forward_batch.can_run_tbo:
+            tbo_start_layer = self.start_layer
+            tbo_end_layer = self.end_layer
+
+            # skip first layer for TBO when starting from layer 0
+            if self.start_layer == 0:
+                layer = self.layers[0]
+                hidden_states, residual = layer(
+                    positions, hidden_states, forward_batch, residual
+                )
+                tbo_start_layer = tbo_start_layer + 1
+
+            hidden_states, residual = model_forward_maybe_tbo(
+                layers=self.layers[tbo_start_layer:tbo_end_layer],
+                enable_tbo=True,
+                input_data_scatter_mode=(
+                    ScatterMode.model_input_output()
+                    if tbo_start_layer == self.start_layer
+                    else self.layers[
+                        tbo_start_layer - 1
+                    ].layer_scatter_modes.layer_output_mode
+                ),
+                positions=positions,
+                forward_batch=forward_batch,
+                hidden_states=hidden_states,
+                residual=residual,
+            )
+        else:
+            for i in range(self.start_layer, self.end_layer):
+                layer = self.layers[i]
+                hidden_states, residual = layer(
+                    positions,
+                    hidden_states,
+                    forward_batch,
+                    residual,
+                )
+
+        hidden_states_before_norm = None
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors(
+                {
+                    "hidden_states": hidden_states,
+                    "residual": residual,
+                }
+            )
+        else:
+            if hidden_states.shape[0] > 0:
+                if forward_batch.return_hidden_states_before_norm:
+                    hidden_states_before_norm = (
+                        hidden_states if residual is None else hidden_states + residual
+                    )
+                if residual is None:
+                    hidden_states = self.norm(hidden_states)
+                else:
+                    hidden_states, _ = self.norm(hidden_states, residual)
+
+        return hidden_states, hidden_states_before_norm
+
+    # If this function is called, it should always initialize KV cache scale
+    # factors (or else raise an exception). Thus, handled exceptions should
+    # make sure to leave KV cache scale factors in a known good (dummy) state
+    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+        attn_tp_rank = get_attention_tp_rank()
+        attn_tp_size = get_attention_tp_size()
+        for layer_idx, scaling_factor in kv_cache_scales_loader(
+            quantization_param_path,
+            attn_tp_rank,
+            attn_tp_size,
+            self.config.num_hidden_layers,
+            self.config.__class__.model_type,
+        ):
+            if not isinstance(self.layers[layer_idx], nn.Identity):
+                layer_self_attn = self.layers[layer_idx].self_attn
+            if hasattr(layer_self_attn.attn, "k_scale"):
+                layer_self_attn.attn.k_scale = scaling_factor
+                layer_self_attn.attn.v_scale = scaling_factor
+            else:
+                raise RuntimeError(
+                    "Self attention has no KV cache scaling " "factor attribute!"
+                )
+
+
+class MiMoV2FlashForCausalLM(nn.Module):
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    def __init__(
+        self,
+        config: MiMoV2FlashConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = MiMoV2Model(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+
+        if self.pp_group.is_last_rank:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+                use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
+            )
+        else:
+            # ranks other than the last rank will have a placeholder layer
+            self.lm_head = PPMissingLayer()
+
+        self.logits_processor = LogitsProcessor(config)
+
+        self._routed_experts_weights_of_layer = LazyValue(
+            lambda: {
+                layer_id: layer.mlp.get_moe_weights()
+                for layer_id, layer in enumerate(self.model.layers)
+                if isinstance(layer.mlp, MiMoV2MoE)
+            }
+        )
+
+    @property
+    def routed_experts_weights_of_layer(self):
+        return self._routed_experts_weights_of_layer.value
+
+    def get_input_embedding(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embedding(input_ids)
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.embed_tokens
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> torch.Tensor:
+        hidden_states, hidden_states_before_norm = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            input_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+
+        if self.pp_group.is_last_rank:
+            return self.logits_processor(
+                input_ids,
+                hidden_states,
+                self.lm_head,
+                forward_batch,
+                hidden_states_before_norm=hidden_states_before_norm,
+            )
+        else:
+            return hidden_states
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        # (param_name, weight_name, expert_id, shard_id)
+        expert_params_mapping = DeepEPMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.n_routed_experts,
+        )
+
+        params_dict = dict(self.named_parameters())
+
+        for name, loaded_weight in weights:
+            layer_id = get_layer_id(name)
+            if (
+                layer_id is not None
+                and hasattr(self.model, "start_layer")
+                and (
+                    layer_id < self.model.start_layer
+                    or layer_id >= self.model.end_layer
+                )
+            ):
+                continue
+
+            if "rotary_emb.inv_freq" in name or "projector" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                if self.pp_group.world_size > 1 and self.pp_group.is_last_rank:
+                    # Handle pp weight tying here
+                    # find the embed_tokens.weight in the weights
+                    embed_token_weights = next(
+                        filter(lambda x: x[0] == "model.embed_tokens.weight", weights)
+                    )[1]
+                    loaded_weight = embed_token_weights
+                else:
+                    continue
+
+            # TODO: skip mtp weights for now, need to implement mtp
+            if "mtp" in name:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                if ("mlp.experts." in name) and name not in params_dict:
+                    continue
+
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+
+                    if name in params_dict.keys():
+                        param = params_dict[name]
+                        if "attention_sink_bias" in name:
+                            start = get_attention_tp_rank() * param.numel()
+                            param.data.copy_(
+                                loaded_weight[start : start + param.numel()]
+                            )
+                        else:
+                            weight_loader = getattr(
+                                param, "weight_loader", default_weight_loader
+                            )
+                            weight_loader(param, loaded_weight)
+                    else:
+                        logger.warning(f"Parameter {name} not found in params_dict")
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        del self.lm_head.weight
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+        self.model.load_kv_cache_scales(quantization_param_path)
+
+    @classmethod
+    def get_model_config_for_expert_location(cls, config):
+        return ModelConfigForExpertLocation(
+            num_layers=config.num_hidden_layers,
+            num_logical_experts=getattr(config, "n_routed_experts", 1),
+            num_groups=getattr(config, "n_group", None),
+        )
+
+
+EntryClass = MiMoV2FlashForCausalLM
diff --git a/sglang/python/sglang/srt/models/mimo_v2_flash_nextn.py b/sglang/python/sglang/srt/models/mimo_v2_flash_nextn.py
new file mode 100644
index 0000000000000000000000000000000000000000..18b5453953c06972552d0f679ca50a937fc7cc58
--- /dev/null
+++ b/sglang/python/sglang/srt/models/mimo_v2_flash_nextn.py
@@ -0,0 +1,369 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import logging
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.layers.communicator import (
+    LayerCommunicator,
+    LayerScatterModes,
+    enable_moe_dense_fully_dp,
+)
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_rank,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.mimo_v2_flash import (
+    MiMoV2Attention,
+    MiMoV2FlashForCausalLM,
+    MiMoV2MLP,
+)
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import add_prefix
+
+MiMoV2FlashConfig = None
+
+logger = logging.getLogger(__name__)
+
+
+class MiMoV2MTPLayer(nn.Module):
+    def __init__(
+        self,
+        config: MiMoV2FlashConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+
+        rope_theta = getattr(config, "rope_theta", 1000000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 32768)
+
+        self.self_attn = MiMoV2Attention(
+            hidden_size=self.hidden_size,
+            num_heads=config.swa_num_attention_heads,
+            num_kv_heads=config.swa_num_key_value_heads,
+            head_dim=config.swa_head_dim,
+            v_head_dim=getattr(config, "swa_v_head_dim", None),
+            v_scale=getattr(config, "attention_value_scale", None),
+            sliding_window_size=config.sliding_window_size,
+            attention_bias=config.attention_bias,
+            attention_sink_bias=getattr(config, "add_swa_attention_sink_bias", False),
+            layer_id=layer_id,
+            rope_theta=getattr(config, "swa_rope_theta", rope_theta),
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            partial_rotary_factor=getattr(config, "partial_rotary_factor", 1.0),
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.is_layer_sparse = False
+        is_previous_layer_sparse = True
+        is_next_layer_sparse = False
+
+        if enable_moe_dense_fully_dp():
+            mlp_tp_rank, mlp_tp_size = 0, 1
+        else:
+            mlp_tp_rank, mlp_tp_size = None, None
+        self.mlp = MiMoV2MLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+            tp_rank=mlp_tp_rank,
+            tp_size=mlp_tp_size,
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.layernorm_epsilon)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.layernorm_epsilon
+        )
+        self.layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=layer_id,
+            num_layers=1,
+            is_layer_sparse=self.is_layer_sparse,
+            is_previous_layer_sparse=is_previous_layer_sparse,
+            is_next_layer_sparse=is_next_layer_sparse,
+        )
+        self.layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.layer_scatter_modes,
+            input_layernorm=self.input_layernorm,
+            post_attention_layernorm=self.post_attention_layernorm,
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+
+        hidden_states, residual = self.layer_communicator.prepare_attn(
+            hidden_states, residual, forward_batch
+        )
+
+        if hidden_states.shape[0] != 0:
+            hidden_states = self.self_attn(
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+            )
+
+        hidden_states, residual = self.layer_communicator.prepare_mlp(
+            hidden_states, residual, forward_batch
+        )
+        with get_global_expert_distribution_recorder().disable_this_region():
+            hidden_states = self.mlp(hidden_states)
+        hidden_states, residual = self.layer_communicator.postprocess_layer(
+            hidden_states, residual, forward_batch
+        )
+
+        return hidden_states, residual
+
+
+class MiMoV2ModelNextN(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            use_attn_tp_group=is_dp_attention_enabled(),
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+
+        self.enorm = RMSNorm(config.hidden_size, eps=config.layernorm_epsilon)
+        self.hnorm = RMSNorm(config.hidden_size, eps=config.layernorm_epsilon)
+
+        self.eh_proj = nn.Linear(2 * config.hidden_size, config.hidden_size, bias=False)
+
+        self.mtp_block = MiMoV2MTPLayer(
+            config,
+            0,
+            quant_config=quant_config,
+            prefix=add_prefix("decoder", prefix),
+        )
+        self.final_layernorm = RMSNorm(config.hidden_size, eps=config.layernorm_epsilon)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+        if hidden_states.shape[0] > 0:
+            hidden_states = self.eh_proj(
+                torch.cat(
+                    (
+                        self.enorm(hidden_states),
+                        self.hnorm(forward_batch.spec_info.hidden_states),
+                    ),
+                    dim=-1,
+                )
+            )
+        hidden_states, residual = self.mtp_block(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+            residual=None,
+        )
+        hidden_states_before_norm = None
+        if not forward_batch.forward_mode.is_idle():
+            if forward_batch.return_hidden_states_before_norm:
+                hidden_states_before_norm = (
+                    hidden_states if residual is None else hidden_states + residual
+                )
+            if residual is not None:
+                hidden_states, _ = self.final_layernorm(hidden_states, residual)
+            else:
+                hidden_states = self.final_layernorm(hidden_states)
+
+        return hidden_states, hidden_states_before_norm
+
+
+class MiMoV2MTP(MiMoV2FlashForCausalLM):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        draft_model_idx: Optional[int] = None,
+        prefix: str = "",
+    ) -> None:
+        nn.Module.__init__(self)
+        self.config = config
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.quant_config = quant_config
+
+        self.model = MiMoV2ModelNextN(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("lm_head", prefix),
+            use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states, hidden_states_before_norm = self.model(
+            input_ids, positions, forward_batch
+        )
+        return self.logits_processor(
+            input_ids,
+            hidden_states,
+            self.lm_head,
+            forward_batch,
+            hidden_states_before_norm=hidden_states_before_norm,
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]], is_nextn=False):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name or "projector" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+            if name.startswith("model.vision_tower") and name not in params_dict:
+                continue
+            name = self.map_model_name_to_mtp_param_name(name)
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+
+                if weight_name not in name:
+                    continue
+                if "mtp_block" not in name:
+                    break
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                if "mtp_block" not in name and (
+                    "embed_tokens" not in name
+                    and "lm_head" not in name
+                    and "enorm" not in name
+                    and "hnorm" not in name
+                    and "eh_proj" not in name
+                    and "final_layernorm" not in name
+                ):
+                    continue
+                if name in params_dict.keys():
+                    param = params_dict[name]
+                    if "attention_sink_bias" in name:
+                        start = get_attention_tp_rank() * param.numel()
+                        param.data.copy_(loaded_weight[start : start + param.numel()])
+                    else:
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        weight_loader(param, loaded_weight)
+                else:
+                    logger.warning(f"Parameter {name} not found in params_dict")
+
+    def map_model_name_to_mtp_param_name(self, name: str) -> str:
+        import re
+
+        if "pre_mlp_layernorm" in name:
+            name = name.replace("pre_mlp_layernorm", "post_attention_layernorm")
+
+        name_without_prefix = [
+            "enorm",
+            "hnorm",
+            "eh_proj",
+            "final_layernorm",
+        ]
+        pattern = r"model.mtp.layers.(\d+)."
+        group = re.match(pattern, name)
+        if group is not None:
+            for sub_name in name_without_prefix:
+                if sub_name in name:
+                    name = name.replace(group.group(), "model.")
+                    return name
+            name = name.replace(group.group(), "model.mtp_block.")
+        return name
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        del self.lm_head.weight
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+
+EntryClass = MiMoV2MTP
diff --git a/sglang/python/sglang/srt/models/mindspore.py b/sglang/python/sglang/srt/models/mindspore.py
new file mode 100644
index 0000000000000000000000000000000000000000..0514756f8cffab5b2feef3fe7c99e524ab727fbf
--- /dev/null
+++ b/sglang/python/sglang/srt/models/mindspore.py
@@ -0,0 +1,317 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the SGLang project
+from __future__ import annotations
+
+import logging
+from typing import Any, Iterable, Optional, Tuple
+
+import torch
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.registry import import_model_classes
+from sglang.srt.utils import is_npu
+
+_is_npu = is_npu()
+
+if _is_npu:
+    import mindspore as ms
+    import numpy as np
+    import torch_npu
+    from mindspore import Tensor, mint, mutable
+
+logger = logging.getLogger(__name__)
+
+
+def _get_arch_from_config(config):
+    mindspore_models = import_model_classes("sgl_mindspore.models")
+    architectures = getattr(config, "architectures", [])
+    if isinstance(architectures, str):
+        architectures = [architectures]
+    if not architectures:
+        raise ValueError("No model architectures are specified")
+    for arch in architectures:
+        if arch in mindspore_models:
+            return mindspore_models[arch]
+    raise ValueError(f"Unsupported arch {architectures}")
+
+
+def tensor_torch2ms(x: torch.Tensor):
+    if x is None or not isinstance(x, torch.Tensor):
+        return x
+
+    # torch tensor -> dlpack -> mindspore tensor
+    pt_dlpack = torch.utils.dlpack.to_dlpack(x)
+    ms_tensor = ms.utils.dlpack.from_dlpack(pt_dlpack)
+    return ms_tensor
+
+
+def tensor_ms2torch(x: "ms.Tensor"):
+    if x is None or not isinstance(x, ms.Tensor):
+        return x
+
+    # ms tensor -> dlpack -> torch tensor
+    ms_dlpack = ms.utils.dlpack.to_dlpack(x)
+    torch_tensor = torch.utils.dlpack.from_dlpack(ms_dlpack)
+    torch_npu.npu.synchronize()
+    return torch_tensor
+
+
+# Adapt from: https://gitee.com/mindspore/vllm-mindspore/blob/master/vllm_mindspore/model_executor/models/attention_mask.py
+class LowerTriangularMask:
+    r"""
+    Provide Infer model attention mask.
+    Args:
+        dtype (ms dtype): The compute type of Infer model.
+        max_model_len (int): The max model length of Infer model.
+    """
+
+    def __init__(self, dtype, max_model_len, decode_mask_coeff=-10000.0):
+        self.dtype = dtype
+        self.max_model_len = max_model_len
+        self.cached_mask_len = 8 * 1024
+        self.decode_mask_coeff = decode_mask_coeff
+
+        prefill_mask_coeff = 1.0 if self.dtype == ms.bfloat16 else -10000.0
+        self.prefill_mask = Tensor(
+            np.triu(np.ones(shape=(128, 128), dtype=np.float16), k=1)
+            * prefill_mask_coeff,
+            dtype=self.dtype,
+        )
+
+        self.hard_mask = mint.zeros((1, 1), dtype=dtype)
+        self.decode_mask = (
+            Tensor(
+                np.triu(
+                    np.ones(
+                        shape=(self.cached_mask_len, self.cached_mask_len),
+                        dtype=np.int8,
+                    ),
+                    k=1,
+                ),
+                dtype=self.dtype,
+            )
+            * self.decode_mask_coeff
+        )
+
+    def create_mask(self, query_lens_np, seq_lens_np):
+        """
+        when query_lens_np = [3], seq_lens_np = [6], decode_mask_coeff = 1
+        init attention mask
+        0 0 0 0 0 0
+        0 0 0 0 0 0
+        0 0 0 0 0 0
+        """
+        max_seq_len = seq_lens_np.max().item()
+        total_q_len = query_lens_np.sum().item()
+        attention_mask = mint.zeros((total_q_len, max_seq_len), dtype=self.dtype)
+
+        req_num = query_lens_np.shape[0]
+        current_row = 0
+        for i in range(req_num):
+            q_len = query_lens_np[i].item()
+            current_row += q_len
+            # skip row when q_len <= 1, to decrease execute time
+            if q_len <= 1:
+                continue
+            seq_len = seq_lens_np[i].item()
+            context_len = seq_len - q_len
+            """
+            set the right half to 1
+            0 0 0 1 1 1
+            0 0 0 1 1 1
+            0 0 0 1 1 1
+            """
+            attention_mask[current_row - q_len : current_row, context_len:] = (
+                self.decode_mask_coeff
+            )
+            """
+            set the lower triangle of the right half to 0
+            0 0 0 0 1 1
+            0 0 0 0 0 1
+            0 0 0 0 0 0
+            """
+            right_tensor = attention_mask[
+                current_row - q_len : current_row, context_len:seq_len
+            ]
+
+            # use masked_fill_ to inplace modify attention_mask
+            right_tensor.masked_fill_(right_tensor.tril() == self.decode_mask_coeff, 0)
+
+        return attention_mask
+
+    def gen_attention_mask(
+        self,
+        is_prefill: bool,
+        position_ids: "ms.Tensor",
+        query_lens_np: np.ndarray,
+        seq_lens_np: np.ndarray,
+    ):
+        max_query_len = query_lens_np.max()
+        max_seq_len = seq_lens_np.max()
+        if is_prefill:
+            attention_mask = self.prefill_mask
+        elif max_query_len > 1:
+            if max_seq_len <= self.cached_mask_len:
+                attention_mask = mint.index_select(self.decode_mask, 0, position_ids)
+            else:
+                attention_mask = self.create_mask(query_lens_np, seq_lens_np)
+        else:
+            attention_mask = self.hard_mask
+        return attention_mask
+
+
+class MindSporeForCausalLM(torch.nn.Module):
+    def __init__(
+        self,
+        config: Any,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+
+        ms.set_context(graph_kernel_flags="--disable_pass=gather_pre_rms_norm_fusion")
+        ms.set_kernel_launch_capture(False)
+
+        logger.info(
+            "MindSporeForCausalLM tp size %d tp rank %d",
+            get_tensor_model_parallel_world_size(),
+            get_tensor_model_parallel_rank(),
+        )
+        if get_tensor_model_parallel_world_size() not in (1, 2, 4, 8):
+            # MatMulAllReduce only support tp size in (1, 2, 4, 8)
+            ms.set_context(graph_kernel_flags="--disable_pass=MatMulAllReduce")
+
+        arch = self.get_arch(self.config)
+        self.model = arch(config=config, quant_config=quant_config)
+
+        self.causal_mask = LowerTriangularMask(
+            self.config.param_dtype, self.config.max_position_embeddings
+        )
+        self.key_cache = []
+        self.value_cache = []
+
+    def get_arch(self, config):
+        return _get_arch_from_config(config)
+
+    @property
+    def use_mla(self):
+        return self.config.architectures[0] in ("DeepseekV3ForCausalLM")
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        self.model.load_weights(weights)
+        for _, cell in self.model.cells_and_names():
+            quant_method = getattr(cell, "quant_method", None)
+            if quant_method is not None:
+                quant_method.process_weights_after_loading(cell)
+
+    def get_kvcache(self, forward_batch: ForwardBatch):
+        def prepare_cache(cache_list, is_key_cache):
+            for i in range(self.config.num_hidden_layers):
+                if is_key_cache:
+                    cache = forward_batch.token_to_kv_pool.get_key_buffer(i)
+                else:
+                    cache = forward_batch.token_to_kv_pool.get_value_buffer(i)
+                cache_ms = tensor_torch2ms(cache)
+                if cache_ms.ndim == 3:
+                    cache_ms = mint.unsqueeze(cache_ms, 2)
+                cache_list.append(cache_ms)
+
+        if self.use_mla:
+            if not self.key_cache:
+                prepare_cache(self.key_cache, is_key_cache=True)
+            return mutable(self.key_cache)
+
+        if self.key_cache and self.value_cache:
+            return mutable(self.key_cache), mutable(self.value_cache)
+
+        prepare_cache(self.key_cache, is_key_cache=True)
+        prepare_cache(self.value_cache, is_key_cache=False)
+
+        return mutable(self.key_cache), mutable(self.value_cache)
+
+    def prepare_inputs(self, input_ids, positions, forward_batch):
+        if self.use_mla:
+            key_cache = self.get_kvcache(forward_batch)
+        else:
+            key_cache, value_cache = self.get_kvcache(forward_batch)
+
+        # Different processing for the mindspore attention operator
+        # Without any prefix cache => Use FlashAttentionScore
+        # With cache => Use PagedAttention, no matter the query length is 1 or not
+        is_prefill = forward_batch.forward_mode.is_extend()
+        is_prefill = is_prefill and forward_batch.extend_prefix_lens.sum().item() == 0
+
+        batch_valid_length = forward_batch.seq_lens.cpu().numpy()
+
+        if forward_batch.extend_seq_lens is not None:
+            q_seq_lens = forward_batch.extend_seq_lens.cpu().numpy()
+        else:
+            q_seq_lens = np.ones([forward_batch.batch_size], dtype=np.int32)
+
+        page_size = forward_batch.token_to_kv_pool.page_size
+        block_tables = tensor_torch2ms(
+            (
+                forward_batch.req_to_token_pool.req_to_token[
+                    forward_batch.req_pool_indices, : forward_batch.seq_lens.max()
+                ][:, ::page_size]
+                // page_size
+            )
+        ).to(ms.int32)
+
+        model_inputs = {}
+        model_inputs["input_ids"] = tensor_torch2ms(input_ids).to(ms.int32)
+        model_inputs["batch_valid_length"] = ms.Tensor(
+            batch_valid_length, dtype=ms.int32
+        )
+        model_inputs["position_ids"] = tensor_torch2ms(positions)
+        model_inputs["q_seq_lens"] = ms.Tensor(q_seq_lens, dtype=ms.int32)
+        model_inputs["attention_mask"] = self.causal_mask.gen_attention_mask(
+            is_prefill, model_inputs["position_ids"], q_seq_lens, batch_valid_length
+        ).contiguous()
+        model_inputs["out_cache_loc"] = tensor_torch2ms(forward_batch.out_cache_loc).to(
+            ms.int32
+        )
+        model_inputs["is_prefill"] = is_prefill
+        model_inputs["key_cache"] = key_cache
+        if not self.use_mla:
+            model_inputs["value_cache"] = value_cache
+        model_inputs["block_tables"] = block_tables
+        return model_inputs
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> "ms.Tensor":
+        # prepare base inputs
+        model_inputs = self.prepare_inputs(input_ids, positions, forward_batch)
+        # prepare model inputs
+        model_inputs = self.model.prepare_inputs(forward_batch, model_inputs)
+
+        logits = self.model(**model_inputs)
+
+        # TODO: npu tensor ms2torch error to be fix, remain issues of torch_npu to get tensor from dlpack
+        logits_result = LogitsProcessorOutput(next_token_logits=tensor_ms2torch(logits))
+        return logits_result
+
+    @classmethod
+    def get_model_config_for_expert_location(cls, config):
+        try:
+            arch_cls = _get_arch_from_config(config)
+            method = getattr(arch_cls, "get_model_config_for_expert_location", None)
+            if method is None:
+                return None
+            return method(config)
+        except Exception:
+            return None
+
+
+EntryClass = [MindSporeForCausalLM]
diff --git a/sglang/python/sglang/srt/models/minicpm.py b/sglang/python/sglang/srt/models/minicpm.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7c94c85d0b2e15cc5aa2a6d9eeae4d3f0662e6f
--- /dev/null
+++ b/sglang/python/sglang/srt/models/minicpm.py
@@ -0,0 +1,399 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Inference-only MiniCPM model compatible with HuggingFace weights."""
+
+import math
+from typing import Any, Dict, Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix
+
+
+class MiniCPMMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class MiniCPMAttention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        orig_dtype = q.dtype
+        q, k = q.float(), k.float()
+        q, k = self.rotary_emb(positions, q, k)
+        q, k = q.to(orig_dtype), k.to(orig_dtype)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class MiniCPMDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        self.self_attn = MiniCPMAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.mlp = MiniCPMMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        hidden_states = residual + hidden_states * (
+            self.config.scale_depth / math.sqrt(self.config.num_hidden_layers)
+        )
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states * (
+            self.config.scale_depth / math.sqrt(self.config.num_hidden_layers)
+        )
+
+        return hidden_states, None
+
+
+class MiniCPMModel(nn.Module):
+    def __init__(
+        self,
+        config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = VocabParallelEmbedding(
+            self.vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+        self.layers = nn.ModuleList(
+            [
+                MiniCPMDecoderLayer(
+                    config,
+                    i,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{i}", prefix),
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids) * self.config.scale_emb
+        else:
+            hidden_states = input_embeds
+        residual = None
+
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        hidden_states = self.norm(hidden_states)
+        return hidden_states
+
+
+class MiniCPMForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+
+        self.num_experts = getattr(self.config, "num_experts", 0)
+        self.quant_config = quant_config
+        self.model = MiniCPMModel(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        # self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
+        if not self.config.tie_word_embeddings:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                org_num_embeddings=config.vocab_size,
+                prefix=add_prefix("lm_head", prefix),
+            )
+
+        self.scale_width = self.config.hidden_size / self.config.dim_model_base
+
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is not None:
+            input_embeds = input_embeds * self.config.scale_emb
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        hidden_states = hidden_states / self.scale_width
+        if self.config.tie_word_embeddings:
+            lm_head = self.model.embed_tokens
+        else:
+            lm_head = self.lm_head
+        return self.logits_processor(input_ids, hidden_states, lm_head, forward_batch)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        expert_params_mapping = [
+            # (param_name, weight_name, expert_id)
+            (
+                "ws" if weight_name in ["w1", "w3"] else "w2s",
+                f"experts.{expert_id}.{weight_name}.weight",
+                expert_id,
+            )
+            for expert_id in range(self.num_experts)
+            for weight_name in ["w1", "w2", "w3"]
+        ]
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for param_name, weight_name, expert_id in expert_params_mapping:
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param, loaded_weight, weight_name, expert_id=expert_id
+                    )
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+
+
+EntryClass = MiniCPMForCausalLM
diff --git a/sglang/python/sglang/srt/models/minicpm3.py b/sglang/python/sglang/srt/models/minicpm3.py
new file mode 100644
index 0000000000000000000000000000000000000000..95dca19da00982b425a68b5609d0f33898c7810e
--- /dev/null
+++ b/sglang/python/sglang/srt/models/minicpm3.py
@@ -0,0 +1,544 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Inference-only MiniCPM3 model compatible with HuggingFace weights."""
+
+import math
+from typing import Any, Dict, Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix, is_cuda
+
+if is_cuda():
+    from sgl_kernel import bmm_fp8 as _raw_bmm_fp8
+
+    from sglang.srt.utils.custom_op import register_custom_op
+
+    # TODO(yuwei): remove this wrapper after sgl-kernel registers its own fake/meta impl
+    # Wrap bmm_fp8 as a custom op so torch.compile does not trace into
+    # torch.cuda.current_blas_handle() (which returns a non-Tensor).
+    @register_custom_op(mutates_args=["out"])
+    def _bmm_fp8_op(
+        A: torch.Tensor,
+        B: torch.Tensor,
+        out: torch.Tensor,
+        A_scale: torch.Tensor,
+        B_scale: torch.Tensor,
+    ) -> None:
+        _raw_bmm_fp8(A, B, A_scale, B_scale, out.dtype, out)
+
+    def bmm_fp8(A, B, A_scale, B_scale, dtype, out=None):
+        if out is None:
+            out = torch.empty(
+                (A.shape[0], A.shape[1], B.shape[2]),
+                device=A.device,
+                dtype=dtype,
+            )
+        _bmm_fp8_op(A, B, out, A_scale, B_scale)
+        return out
+
+
+class MiniCPM3MLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+def input_to_float8(x, dtype=torch.float8_e4m3fn):
+    finfo = torch.finfo(dtype)
+    min_val, max_val = x.aminmax()
+    amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12)
+    scale = finfo.max / amax
+    x_scl_sat = (x * scale).clamp(min=finfo.min, max=finfo.max)
+    return x_scl_sat.to(dtype).contiguous(), scale.float().reciprocal()
+
+
+class MiniCPM3AttentionMLA(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        hidden_size: int,
+        num_heads: int,
+        qk_nope_head_dim: int,
+        qk_rope_head_dim: int,
+        v_head_dim: int,
+        q_lora_rank: int,
+        kv_lora_rank: int,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        layer_id=None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.layer_id = layer_id
+        self.hidden_size = hidden_size
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.q_lora_rank = q_lora_rank
+        self.kv_lora_rank = kv_lora_rank
+        self.num_heads = num_heads
+        tp_size = get_tensor_model_parallel_world_size()
+        assert num_heads % tp_size == 0
+        self.num_local_heads = num_heads // tp_size
+        self.scaling = self.qk_head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        if self.q_lora_rank is not None:
+            self.q_a_proj = ReplicatedLinear(
+                self.hidden_size,
+                self.q_lora_rank,
+                bias=False,
+                quant_config=quant_config,
+                prefix=add_prefix("q_a_proj", prefix),
+            )
+            self.q_a_layernorm = RMSNorm(self.q_lora_rank, eps=config.rms_norm_eps)
+            self.q_b_proj = ColumnParallelLinear(
+                q_lora_rank,
+                self.num_heads * self.qk_head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=add_prefix("q_b_proj", prefix),
+            )
+        else:
+            self.q_proj = ColumnParallelLinear(
+                self.hidden_size,
+                self.num_heads * self.qk_head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=add_prefix("q_proj", prefix),
+            )
+
+        self.kv_a_proj_with_mqa = ReplicatedLinear(
+            self.hidden_size,
+            self.kv_lora_rank + self.qk_rope_head_dim,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("kv_a_proj_with_mqa", prefix),
+        )
+        self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, eps=config.rms_norm_eps)
+        self.kv_b_proj = ColumnParallelLinear(
+            self.kv_lora_rank,
+            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("kv_b_proj", prefix),
+        )
+        # O projection.
+        self.o_proj = RowParallelLinear(
+            self.num_heads * self.v_head_dim,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+        self.rotary_emb = get_rope(
+            qk_rope_head_dim,
+            rotary_dim=qk_rope_head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
+
+        self.attn = RadixAttention(
+            self.num_local_heads,
+            self.kv_lora_rank + self.qk_rope_head_dim,
+            self.scaling,
+            num_kv_heads=1,
+            layer_id=layer_id,
+            v_head_dim=self.kv_lora_rank,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+        self.w_kc = None
+        self.w_vc = None
+        self.w_scale = None
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        q_len = hidden_states.shape[0]
+        q_input = hidden_states.new_empty(
+            q_len, self.num_local_heads, self.kv_lora_rank + self.qk_rope_head_dim
+        )
+        if self.q_lora_rank is not None:
+            q = self.q_a_proj(hidden_states)[0]
+            q = self.q_a_layernorm(q)
+            q = self.q_b_proj(q)[0].view(-1, self.num_local_heads, self.qk_head_dim)
+        else:
+            q = self.q_proj(hidden_states)[0].view(
+                -1, self.num_local_heads, self.qk_head_dim
+            )
+        q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
+
+        if self.w_kc.dtype == torch.float8_e4m3fn:
+            q_nope_val, q_nope_scale = input_to_float8(
+                q_nope.transpose(0, 1), torch.float8_e4m3fn
+            )
+            q_nope_out = bmm_fp8(
+                q_nope_val, self.w_kc, q_nope_scale, self.w_scale, torch.bfloat16
+            )
+        else:
+            q_nope_out = torch.bmm(q_nope.transpose(0, 1), self.w_kc)
+        q_input[..., : self.kv_lora_rank] = q_nope_out.transpose(0, 1)
+
+        latent_cache = self.kv_a_proj_with_mqa(hidden_states)[0]
+        v_input = latent_cache[..., : self.kv_lora_rank]
+        v_input = self.kv_a_layernorm(v_input.contiguous()).unsqueeze(1)
+        k_input = latent_cache.unsqueeze(1)
+        k_input[..., : self.kv_lora_rank] = v_input
+        k_pe = k_input[..., self.kv_lora_rank :]
+
+        original_shapes = [q_pe.shape, k_pe.shape]
+        q_pe, k_pe = self.rotary_emb(
+            positions,
+            q_pe.reshape(-1, q_pe.shape[1] * q_pe.shape[2]),
+            k_pe.reshape(-1, k_pe.shape[1] * k_pe.shape[2]),
+        )
+        q_pe, k_pe = q_pe.view(original_shapes[0]), k_pe.view(original_shapes[1])
+        q_input[..., self.kv_lora_rank :] = q_pe
+        k_input[..., self.kv_lora_rank :] = k_pe
+
+        attn_output = self.attn(q_input, k_input, v_input, forward_batch)
+        attn_output = attn_output.view(-1, self.num_local_heads, self.kv_lora_rank)
+
+        if self.w_vc.dtype == torch.float8_e4m3fn:
+            attn_output_val, attn_output_scale = input_to_float8(
+                attn_output.transpose(0, 1), torch.float8_e4m3fn
+            )
+            attn_bmm_output = bmm_fp8(
+                attn_output_val,
+                self.w_vc,
+                attn_output_scale,
+                self.w_scale,
+                torch.bfloat16,
+            )
+        else:
+            attn_bmm_output = torch.bmm(attn_output.transpose(0, 1), self.w_vc)
+        attn_output = attn_bmm_output.transpose(0, 1).flatten(1, 2)
+        output, _ = self.o_proj(attn_output)
+
+        return output
+
+
+class MiniCPM3DecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        self.self_attn = MiniCPM3AttentionMLA(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            qk_nope_head_dim=config.qk_nope_head_dim,
+            qk_rope_head_dim=config.qk_rope_head_dim,
+            v_head_dim=self.hidden_size // config.num_attention_heads,
+            q_lora_rank=(
+                config.q_lora_rank if hasattr(config, "q_lora_rank") else None
+            ),
+            kv_lora_rank=config.kv_lora_rank,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            layer_id=layer_id,
+            prefix=add_prefix("self_attn", prefix),
+        )
+
+        self.mlp = MiniCPM3MLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        hidden_states = residual + hidden_states * (
+            self.config.scale_depth / math.sqrt(self.config.num_hidden_layers)
+        )
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states * (
+            self.config.scale_depth / math.sqrt(self.config.num_hidden_layers)
+        )
+
+        return hidden_states, None
+
+
+class MiniCPM3Model(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = VocabParallelEmbedding(
+            self.vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+        self.layers = nn.ModuleList(
+            [
+                MiniCPM3DecoderLayer(
+                    config,
+                    i,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{i}", prefix),
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids) * self.config.scale_emb
+        else:
+            hidden_states = input_embeds
+        residual = None
+
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        hidden_states = self.norm(hidden_states)
+        return hidden_states
+
+
+class MiniCPM3ForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+
+        self.num_experts = getattr(self.config, "num_experts", 0)
+        self.quant_config = quant_config
+        self.model = MiniCPM3Model(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        # self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
+        if not self.config.tie_word_embeddings:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                org_num_embeddings=config.vocab_size,
+                prefix=add_prefix("lm_head", prefix),
+            )
+
+        self.scale_width = self.config.hidden_size / self.config.dim_model_base
+
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is not None:
+            input_embeds = input_embeds * self.config.scale_emb
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        hidden_states = hidden_states / self.scale_width
+        if self.config.tie_word_embeddings:
+            lm_head = self.model.embed_tokens
+        else:
+            lm_head = self.lm_head
+        return self.logits_processor(input_ids, hidden_states, lm_head, forward_batch)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        expert_params_mapping = [
+            # (param_name, weight_name, expert_id)
+            (
+                "ws" if weight_name in ["w1", "w3"] else "w2s",
+                f"experts.{expert_id}.{weight_name}.weight",
+                expert_id,
+            )
+            for expert_id in range(self.num_experts)
+            for weight_name in ["w1", "w2", "w3"]
+        ]
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for param_name, weight_name, expert_id in expert_params_mapping:
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param, loaded_weight, weight_name, expert_id=expert_id
+                    )
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+
+        for layer_id in range(self.config.num_hidden_layers):
+            self_attn = self.model.layers[layer_id].self_attn
+            w_kc, w_vc = self_attn.kv_b_proj.weight.unflatten(
+                0, (-1, self_attn.qk_nope_head_dim + self_attn.v_head_dim)
+            ).split([self_attn.qk_nope_head_dim, self_attn.v_head_dim], dim=1)
+            self_attn.w_kc = w_kc.transpose(1, 2).contiguous().transpose(1, 2)
+            self_attn.w_vc = w_vc.contiguous().transpose(1, 2)
+            if hasattr(self_attn.kv_b_proj, "weight_scale"):
+                self_attn.w_scale = self_attn.kv_b_proj.weight_scale
+            del self_attn.kv_b_proj
+
+
+EntryClass = MiniCPM3ForCausalLM
diff --git a/sglang/python/sglang/srt/models/minicpmo.py b/sglang/python/sglang/srt/models/minicpmo.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc03e29bf5609593f8956dc3c751331dbb4bd49d
--- /dev/null
+++ b/sglang/python/sglang/srt/models/minicpmo.py
@@ -0,0 +1,1946 @@
+# Copied and adapted from: https://huggingface.co/openbmb/MiniCPM-o-2_6/blob/main/modeling_minicpmo.py
+
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Inference-only MiniCPM-o model compatible with HuggingFace weights."""
+
+import math
+from dataclasses import dataclass
+from typing import Any, Iterable, List, Literal, Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.nn.utils.parametrize as P
+import torch.types
+from torch import nn
+from torch.nn.utils import parametrizations
+from tqdm import tqdm
+from transformers import LlamaConfig, LlamaModel, PretrainedConfig, PreTrainedModel
+from transformers.activations import ACT2FN
+from transformers.cache_utils import DynamicCache, EncoderDecoderCache
+from transformers.modeling_outputs import BaseModelOutputWithPast, ModelOutput
+from transformers.models.whisper.modeling_whisper import (
+    WhisperAttention,
+    WhisperConfig,
+    WhisperEncoder,
+)
+
+from sglang.srt.layers.quantization import QuantizationConfig
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternTokenPairs,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import (
+    MultimodalDataItem,
+    MultimodalInputFormat,
+    MultimodalInputs,
+    flatten_nested_list,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.utils import set_default_torch_dtype
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.idefics2 import Idefics2VisionTransformer
+from sglang.srt.models.minicpmv import MiniCPMBaseModel, Resampler2_5
+from sglang.srt.models.qwen2 import Qwen2ForCausalLM
+from sglang.srt.utils import logger
+
+try:
+    from transformers import LogitsWarper
+    from vector_quantize_pytorch import GroupedResidualFSQ
+
+    _tts_deps = True
+except:
+    LogitsWarper = None
+    _tts_deps = False
+
+
+def apply_spk_emb(
+    input_ids: torch.Tensor = None,
+    spk_emb: torch.Tensor = None,
+    input_embeds: torch.Tensor = None,
+    spk_emb_token_id: int = 0,
+    num_spk_embs: int = 1,
+):
+    """
+    Replace consecutive `num_spk_embs` speaker embedding placeholders in input_embeds with pre-prepared speaker embeddings. This is an in-place replacement, no new tensor is created, so no value is returned.
+
+    Args:
+        input_ids (torch.Tensor): Input ID tensor, shape [batch_size, seq_len_max]
+        spk_emb (torch.Tensor): Speaker embedding tensor, shape [batch_size, num_spk_emb, hidden_dim]
+        input_embeds (torch.Tensor): Input embedding tensor, shape [batch_size, seq_len_max, hidden_dim]
+        spk_emb_token_id (int): ID of the speaker embedding token
+        num_spk_embs (int): Number of speaker embeddings
+
+    Returns:
+        None
+    """
+
+    batch_size = input_ids.shape[0]
+
+    for idx in range(batch_size):
+        input_ids_ = input_ids[idx]  # [seq_len_max]
+        spk_emb_ = spk_emb[idx]  # [num_spk_emb]
+        mask_ = input_ids_ == spk_emb_token_id  # [batch_size, seq_len_max]
+        nonzero_position_idx = mask_.nonzero(as_tuple=False)  # [num_spk_emb, 1]
+        assert nonzero_position_idx.shape[0] == num_spk_embs
+        begin_idx = nonzero_position_idx.min()
+        end_idx = nonzero_position_idx.max()
+        input_embeds[idx, begin_idx : end_idx + 1, :] = spk_emb_
+
+    return
+
+
+@dataclass
+class ConditionalChatTTSGenerationOutput(ModelOutput):
+    """
+    Output class for ConditionalChatTTS generation.
+
+    Args:
+        new_ids (torch.LongTensor): Newly generated audio code sequence, shape (batch_size, sequence_length, num_vq).
+        audio_input_ids (torch.LongTensor): Updated input IDs including condition and generated audio codes, shape (batch_size, full_sequence_length, num_vq).
+        past_key_values (Tuple[Tuple[torch.FloatTensor]]): Tuple containing pre-computed keys and values used for attention mechanism. Each element has shape (batch_size, num_heads, sequence_length, embed_size_per_head).
+        finished (bool): Boolean indicating whether generation is complete.
+
+    """
+
+    new_ids: torch.LongTensor = None
+    audio_input_ids: torch.LongTensor = None
+    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    finished: bool = None
+
+
+def make_streaming_chunk_mask_generation(
+    inputs_embeds: torch.Tensor,
+    past_seen_tokens: int,
+    streaming_tts_text_mask: torch.Tensor,
+    streaming_reserved_length: int = 300,
+    streaming_audio_chunk_size: int = 50,
+    streaming_text_chunk_size: int = 10,
+    num_spk_emb: int = 1,
+    use_spk_emb: bool = True,
+) -> torch.Tensor:
+    """
+    In streaming audio generation, determine which `text` positions the TTS model can attend to when generating each chunk of `audio` tokens.
+
+    This function creates a mask that allows the model to attend to a specific chunk of text
+    tokens when generating each chunk of audio tokens, enabling streaming TTS generation.
+
+    Args:
+        inputs_embeds (torch.Tensor): Input embeddings tensor.
+        past_seen_tokens (int): Number of tokens already seen by the model.
+        streaming_tts_text_mask (torch.Tensor): Mask for the text tokens.
+        streaming_reserved_length (int, optional): Number of reserved tokens for streaming. Defaults to 300.
+        streaming_text_chunk_size (int, optional): Size of each text chunk. Defaults to 7.
+
+    Returns:
+        torch.Tensor: Causal mask for streaming TTS generation, shape is [batch_size=1, 1, seq_len=1, past_seen_tokens+1]
+
+    Raises:
+        AssertionError: If the batch size is not 1 (only supports batch size of 1 for inference).
+    """
+    assert inputs_embeds.shape[0] == 1
+
+    dtype = inputs_embeds.dtype
+    device = inputs_embeds.device
+    min_dtype = torch.finfo(dtype).min
+
+    # Add `1` to the past seen tokens to account for new `tokens` during `generate`
+    causal_mask = torch.full(
+        (1, past_seen_tokens + inputs_embeds.shape[1]),
+        fill_value=0,
+        dtype=dtype,
+        device=device,
+    )
+
+    # Calculate the start of invisible text tokens
+    invisible_text_tokens_start = (
+        min(
+            math.ceil(
+                (past_seen_tokens - streaming_reserved_length)
+                / streaming_audio_chunk_size
+            )
+            * streaming_text_chunk_size,
+            streaming_reserved_length,
+        )
+        + 1
+        + num_spk_emb * use_spk_emb
+    )  # Add 1 for [Stts] and N for [spk_emb] tokens if `use_spk_emb` is True
+
+    invisible_text_tokens_end = (
+        streaming_reserved_length + 1 + num_spk_emb * use_spk_emb + 1
+    )  # Add 1 for [Ptts] (aka `audio_bos_token_id`)
+
+    # Set invisible text tokens to min_dtype (effectively -inf)
+    causal_mask[0, invisible_text_tokens_start:invisible_text_tokens_end] = min_dtype
+
+    # Mask padding positions in the text mask
+    causal_mask[
+        0, 0 : 1 + num_spk_emb * use_spk_emb + streaming_reserved_length + 1
+    ].masked_fill_(streaming_tts_text_mask == 0, min_dtype)
+
+    # Add extra dimensions for batch and heads
+    causal_mask = causal_mask.unsqueeze(0).unsqueeze(0)
+
+    return causal_mask
+
+
+# Borrowed from `https://github.com/2noise/ChatTTS/blob/main/ChatTTS/model/dvae.py`
+class ConvNeXtBlock(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        intermediate_dim: int,
+        kernel: int,
+        dilation: int,
+        layer_scale_init_value: float = 1e-6,
+    ):
+        # ConvNeXt Block copied from Vocos.
+        super().__init__()
+        self.dwconv = nn.Conv1d(
+            dim,
+            dim,
+            kernel_size=kernel,
+            padding=dilation * (kernel // 2),
+            dilation=dilation,
+            groups=dim,
+        )
+
+        self.norm = nn.LayerNorm(dim, eps=1e-6)
+        self.pwconv1 = nn.Linear(dim, intermediate_dim)
+        self.act = nn.GELU()
+        self.pwconv2 = nn.Linear(intermediate_dim, dim)
+        self.coef = (
+            nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True)
+            if layer_scale_init_value > 0
+            else None
+        )
+
+    def forward(self, x: torch.Tensor, cond=None) -> torch.Tensor:
+        residual = x
+
+        y = self.dwconv(x)
+        y.transpose_(1, 2)  # (B, C, T) -> (B, T, C)
+        x = self.norm(y)
+        del y
+        y = self.pwconv1(x)
+        del x
+        x = self.act(y)
+        del y
+        y = self.pwconv2(x)
+        del x
+        if self.coef is not None:
+            y *= self.coef
+        y.transpose_(1, 2)  # (B, T, C) -> (B, C, T)
+
+        x = y + residual
+        del y
+
+        return x
+
+
+# Borrowed from `https://github.com/2noise/ChatTTS/blob/main/ChatTTS/model/dvae.py`
+class DVAEDecoder(nn.Module):
+    def __init__(
+        self,
+        idim: int,
+        odim: int,
+        n_layer=12,
+        bn_dim=64,
+        hidden=256,
+        kernel=7,
+        dilation=2,
+        up=False,
+    ):
+        super().__init__()
+        self.up = up
+        self.conv_in = nn.Sequential(
+            nn.Conv1d(idim, bn_dim, 3, 1, 1),
+            nn.GELU(),
+            nn.Conv1d(bn_dim, hidden, 3, 1, 1),
+        )
+        self.decoder_block = nn.ModuleList(
+            [
+                ConvNeXtBlock(
+                    hidden,
+                    hidden * 4,
+                    kernel,
+                    dilation,
+                )
+                for _ in range(n_layer)
+            ]
+        )
+        self.conv_out = nn.Conv1d(hidden, odim, kernel_size=1, bias=False)
+
+    def forward(self, x: torch.Tensor, conditioning=None) -> torch.Tensor:
+        # B, C, T
+        y = self.conv_in(x)
+        del x
+        for f in self.decoder_block:
+            y = f(y, conditioning)
+
+        x = self.conv_out(y)
+        del y
+        return x
+
+
+# Borrowed from `https://github.com/2noise/ChatTTS/blob/main/ChatTTS/model/dvae.py`
+class GFSQ(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        levels: List[int],
+        G: int,
+        R: int,
+        eps=1e-5,
+        transpose=True,
+    ):
+        super(GFSQ, self).__init__()
+        self.quantizer = GroupedResidualFSQ(
+            dim=dim,
+            levels=list(levels),
+            num_quantizers=R,
+            groups=G,
+        )
+        self.n_ind = math.prod(levels)
+        self.eps = eps
+        self.transpose = transpose
+        self.G = G
+        self.R = R
+
+    def _embed(self, x: torch.Tensor):
+        if self.transpose:
+            x = x.transpose(1, 2)
+        x = x.view(x.size(0), x.size(1), self.G, self.R).permute(2, 0, 1, 3)
+        feat = self.quantizer.get_output_from_indices(x)
+        return feat.transpose_(1, 2) if self.transpose else feat
+
+    def __call__(self, x: torch.Tensor) -> torch.Tensor:
+        return super().__call__(x)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if self.transpose:
+            x.transpose_(1, 2)
+        _, ind = self.quantizer(x)
+        ind = ind.permute(1, 2, 0, 3).contiguous()
+        ind = ind.view(ind.size(0), ind.size(1), -1)
+        return ind.transpose_(1, 2) if self.transpose else ind
+
+
+# Borrowed from `https://github.com/2noise/ChatTTS/blob/main/ChatTTS/model/dvae.py`
+class DVAE(nn.Module):
+    def __init__(
+        self,
+    ):
+        super().__init__()
+
+        coef = torch.rand(100)
+        self.coef = nn.Parameter(coef.unsqueeze(0).unsqueeze_(2))
+
+        self.downsample_conv = nn.Sequential(
+            nn.Conv1d(100, 512, 3, 1, 1),
+            nn.GELU(),
+            nn.Conv1d(512, 512, 4, 2, 1),
+            nn.GELU(),
+        )
+
+        self.encoder = DVAEDecoder(
+            idim=512,
+            odim=1024,
+            hidden=256,
+            n_layer=12,
+            bn_dim=128,
+        )
+
+        self.decoder = DVAEDecoder(
+            idim=512,
+            odim=512,
+            hidden=256,
+            n_layer=12,
+            bn_dim=128,
+        )
+
+        self.out_conv = nn.Conv1d(512, 100, 3, 1, 1, bias=False)
+
+        self.vq_layer = GFSQ(
+            dim=1024,
+            levels=(5, 5, 5, 5),
+            G=2,
+            R=2,
+        )
+
+    @torch.inference_mode()
+    def forward(
+        self, inp: torch.Tensor, mode: Literal["encode", "decode"] = "decode"
+    ) -> torch.Tensor:
+        if mode == "encode" and hasattr(self, "encoder") and self.vq_layer is not None:
+            mel = inp.clone()
+            x: torch.Tensor = self.downsample_conv(
+                torch.div(mel, self.coef.view(100, 1).expand(mel.shape), out=mel),
+            ).unsqueeze_(0)
+            del mel
+            x = self.encoder(x)
+            ind = self.vq_layer(x)
+            del x
+            return ind
+
+        if self.vq_layer is not None:
+            vq_feats = self.vq_layer._embed(inp)
+        else:
+            vq_feats = inp
+
+        vq_feats = (
+            vq_feats.view(
+                (vq_feats.size(0), 2, vq_feats.size(1) // 2, vq_feats.size(2)),
+            )
+            .permute(0, 2, 3, 1)
+            .flatten(2)
+        )
+
+        dec_out = self.out_conv(
+            self.decoder(
+                x=vq_feats,
+            ),
+        )
+
+        del vq_feats
+
+        return torch.mul(dec_out, self.coef, out=dec_out)
+
+
+# Borrowed from `https://github.com/2noise/ChatTTS/blob/main/ChatTTS/model/processors.py`
+class CustomRepetitionPenaltyLogitsProcessorRepeat:
+    def __init__(self, penalty: float, max_input_ids: int, past_window: int):
+        if not isinstance(penalty, float) or not (penalty > 0):
+            raise ValueError(
+                f"`penalty` has to be a strictly positive float, but is {penalty}"
+            )
+
+        self.penalty = penalty
+        self.max_input_ids = max_input_ids
+        self.past_window = past_window
+
+    def __call__(
+        self, input_ids: torch.LongTensor, scores: torch.FloatTensor
+    ) -> torch.FloatTensor:
+        if input_ids.size(1) > self.past_window:
+            input_ids = input_ids.narrow(1, -self.past_window, self.past_window)
+        freq = F.one_hot(input_ids, scores.size(1)).sum(1)
+        if freq.size(0) > self.max_input_ids:
+            freq.narrow(
+                0, self.max_input_ids, freq.size(0) - self.max_input_ids
+            ).zero_()
+        alpha = torch.pow(self.penalty, freq)
+        scores = scores.contiguous()
+        inp = scores.multiply(alpha)
+        oth = scores.divide(alpha)
+        con = scores < 0
+        out = torch.where(con, inp, oth)
+        del inp, oth, scores, con, alpha
+        return out
+
+
+class ConditionalChatTTS(PreTrainedModel):
+    """A conditional text-to-speech model that can generate speech from text with speaker conditioning.
+
+    This model extends PreTrainedModel to provide text-to-speech capabilities with:
+    - LLM hidden state conditioning
+    - Streaming generation
+
+    The model uses a transformer architecture with LLM hidden states and can operate in both
+    streaming and non-streaming modes for flexible deployment.
+
+    The model process sequence in the following format:
+    | text bos token | LLM embedding projected to tts embedding space | text tokens (fixed length, reserved for future tokens) | audio bos token | audio tokens (audio token length is not fixed)| audio eos token |
+
+    The format is designed to support LLM-conditioned streaming audio generation.
+
+    Usage:
+    To support streaming generation, two global variables should be maintained outside of the model.
+        1. `audio_input_ids`: stores *discrete* audio codes. It is a tensor with shape [1, sequence length+1, num_vq].
+        2. `past_key_values`: stores the KV cache for both text tokens and audio codes. It is a list of tuples, each tuple contains two tensors with shape [1, num_attention_heads, sequence length, hidden_size // num_attention_heads]
+
+    where `num_vq` is the number of audio codebooks, in default setting, it is `4`.
+
+    1. Create an empty `past_key_values` with
+    ```python
+    initial_kv_cache_length = 1 + model.num_spk_embs + model.streaming_text_reserved_len # where `1` denotes the `bos` token
+    dtype = model.emb_text.weight.dtype
+    device = model.emb_text.weight.device
+    past_key_values = [
+        (
+            torch.zeros(1, model.config.num_attention_heads, initial_kv_cache_length, model.config.hidden_size // model.config.num_attention_heads, dtype=dtype, device=device),
+            torch.zeros(1, model.config.num_attention_heads, initial_kv_cache_length, model.config.hidden_size // model.config.num_attention_heads, dtype=dtype, device=device)
+        )
+        for _ in range(model.config.num_hidden_layers)
+    ]
+
+    2. At the same time, create an empty `audio_input_ids` with shape [1, sequence length, num_vq], `num_vq` denotes multiple layer audio codebooks. But here we also include text tokens in the sequence, but they will be zeros, and will not be used, just a placeholder.
+
+    ```python
+    initial_audio_input_ids_length = 1 + model.num_spk_embs + model.streaming_text_reserved_len + 1
+    # [bos token, speaker embeddings, text tokens, audio bos token]
+    audio_input_ids = torch.zeros(batch_size=1, initial_audio_input_ids_length, model.num_vq)
+    ```
+
+    2. Prefill some text tokens to TTS model (for example, 10 tokens) using `prefill_text` method.
+
+    ```python
+    outputs = llm.generate(**kwargs)
+    llm_tokens = some_function_to_extract_llm_tokens(outputs)
+    lm_spk_emb_last_hidden_states = some_function_to_extract_lm_spk_emb_last_hidden_states(outputs)
+    tts_text_input_ids = tts_tokenizer.encode(llm_tokenizer.decode(llm_tokens))
+    # here assume we are prefilling text token 0 to text token 9 (included), totally 10 tokens.
+    begin = 0
+    end = 9+1
+    position_ids = torch.arange(begin, end, dtype=torch.long, device=device)
+
+    past_key_values = model.prefill_text(
+        input_ids=tts_text_input_ids,
+        position_ids=position_ids,
+        past_key_values=past_key_values,
+        lm_spk_emb_last_hidden_states=lm_spk_emb_last_hidden_states,
+    )
+    ```
+
+    3. Make a `streaming_tts_text_mask` to denote which position contains valid text tokens, similar to `attention_mask` in standard causal attention.
+
+    ```python
+    streaming_tts_text_mask = torch.zeros(model.streaming_reserved_length)
+    streaming_tts_text_mask[0:end] = 1 # denotes these post
+    ```
+
+    3. Generate audio codes using `generate` method.
+
+    ```python
+    outputs = model.generate(
+        input_ids=audio_input_ids,
+        past_key_values=past_key_values,
+        streaming_tts_text_mask=streaming_tts_text_mask,
+        max_new_token=50,
+    )
+
+    # update past_key_values and input_ids
+    past_key_values = outputs.past_key_values
+    audio_input_ids = outputs.input_ids
+    ```
+
+    The `past_key_values` is extended by `max_new_token=50`, and `audio_input_ids` is also extended by `max_new_token=50` after `generate` calling.
+
+    4. Notice that after prefilling `10` text tokens, the model can generate up to `50` audio tokens, if you want to generate more audio tokens, you need to prefill next `10` text tokens. And it is okay to only generate `25` audio tokens for faster initial response.
+
+    5. Repeat steps `2,3,4` as needed in your streaming audio generation cases, but ensure usage complies with the following guidelines discussed above.
+    """
+
+    config_class = PretrainedConfig
+    _no_split_modules = []
+
+    def __init__(self, config: PretrainedConfig):
+        super().__init__(config)
+
+        self.use_speaker_embedding = config.use_speaker_embedding
+        self.use_llm_hidden_state = config.use_llm_hidden_state
+        self.num_spk_embs = config.num_spk_embs
+        self.spk_emb_token_id = config.spk_emb_token_id
+
+        self.use_text = config.use_text
+        self.streaming = config.streaming
+        self.streaming_text_chunk_size = config.streaming_text_chunk_size
+        self.streaming_audio_chunk_size = config.streaming_audio_chunk_size
+        self.streaming_text_reserved_len = config.streaming_text_reserved_len
+        self.audio_bos_token_id = config.audio_bos_token_id
+        self.num_mel_bins = config.num_mel_bins
+        self.num_vq = config.num_vq
+        self.num_audio_tokens = config.num_audio_tokens
+
+        self.top_p = config.top_p
+        self.top_k = config.top_k
+        self.repetition_penalty = config.repetition_penalty
+
+        if self.config.use_mlp:
+            self.projector = MultiModalProjector(config.llm_dim, config.hidden_size)
+        else:
+            self.projector = nn.Linear(config.llm_dim, config.hidden_size, bias=False)
+        self.emb_code = nn.ModuleList(
+            [
+                nn.Embedding(config.num_audio_tokens, config.hidden_size)
+                for _ in range(config.num_vq)
+            ]
+        )
+        self.emb_text = nn.Embedding(config.num_text_tokens, config.hidden_size)
+        self.head_code = nn.ModuleList(
+            [
+                parametrizations.weight_norm(
+                    nn.Linear(config.hidden_size, config.num_audio_tokens, bias=False),
+                    name="weight",
+                )
+                for _ in range(config.num_vq)
+            ]
+        )
+
+        dvae = DVAE()
+        self.dvae = dvae
+
+        model_config = LlamaConfig(
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            num_attention_heads=config.num_attention_heads,
+            num_hidden_layers=config.num_hidden_layers,
+            max_position_embeddings=config.max_position_embeddings,
+            attn_implementation=config.attn_implementation,
+        )
+
+        model = LlamaModel(model_config)
+        self.model = model
+
+    @torch.inference_mode()
+    def merge_inputs_embeds(
+        self,
+        input_ids: torch.Tensor,
+        lm_spk_emb_last_hidden_states: Optional[torch.Tensor] = None,
+    ):
+        """Merge `input_ids` and `lm_spk_emb_last_hidden_states` to `inputs_embeds`.
+
+        Args:
+            input_ids (torch.Tensor): Input token IDs.
+            lm_spk_emb_last_hidden_states (Optional[torch.Tensor], optional): Last hidden states of speaker embeddings from the language model. Defaults to None.
+
+        Raises:
+            NotImplementedError: If speaker embedding is not used and language model hidden states are not implemented.
+
+        Returns:
+            torch.Tensor: Prepared input embeddings for the model.
+        """
+        assert input_ids.shape[0] == 1
+
+        # Embed input_ids to input_embeds
+        inputs_embeds = self.emb_text(input_ids)
+
+        # Inject speaker embedding to input_embeds if it exists
+        if self.use_speaker_embedding:
+            spk_emb_mask = input_ids == self.spk_emb_token_id
+            if spk_emb_mask.any():
+                assert lm_spk_emb_last_hidden_states is not None
+                # Project spk emb to tts hidden size first, [batch_size, num_spk_emb, llm_dim] -> [batch_size, num_spk_emb, self.hidden_size]
+                lm_spk_emb_last_hidden_states = lm_spk_emb_last_hidden_states.to(
+                    self.projector.linear1.weight.dtype
+                )
+                projected_spk_emb = self.projector(lm_spk_emb_last_hidden_states)
+                projected_spk_emb = F.normalize(projected_spk_emb, p=2, dim=-1)
+                apply_spk_emb(
+                    input_ids=input_ids,
+                    spk_emb=projected_spk_emb,
+                    input_embeds=inputs_embeds,
+                    spk_emb_token_id=self.spk_emb_token_id,
+                    num_spk_embs=self.num_spk_embs,
+                )
+        else:
+            raise NotImplementedError
+
+        return inputs_embeds
+
+    @torch.inference_mode()
+    def prefill_text(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.LongTensor,
+        past_key_values: List[Tuple[torch.Tensor, torch.Tensor]],
+        lm_spk_emb_last_hidden_states: Optional[torch.Tensor] = None,
+    ):
+        """Prefill a chunk of new text tokens in streaming setting.
+        Specifically speaking, update `past_key_values` using new text tokens, then the model will read the new text tokens.
+
+        Args:
+            input_ids (Tensor): Tensor of shape [batch_size, seq_len]
+            position_ids (LongTensor): Tensor of shape [batch_size, seq_len]
+            past_key_values (List[Tuple[Tensor]]): KV Cache of all layers, each layer is a tuple (Tensor, Tensor) denoting keys and values. Each tensor is of seq_len = `self.streaming_text_reserved_len`. `past_key_values` will be updated.
+            lm_spk_emb_last_hidden_states (Tensor, optional): Tensor of shape [batch_size, num_spk_emb, llm_dim]. Defaults to None.
+
+        Note that all `batch_size` should be `1`.
+        """
+        assert input_ids.shape[0] == 1
+        assert past_key_values is not None
+
+        # Merge text and LLM embeddings
+        inputs_embeds = self.merge_inputs_embeds(
+            input_ids=input_ids,
+            lm_spk_emb_last_hidden_states=lm_spk_emb_last_hidden_states,
+        )
+
+        # Clone KV Cache
+        past_key_values_for_prefill = []
+        for i in range(len(past_key_values)):
+            past_key_values_for_prefill.append(
+                (
+                    past_key_values[i][0][:, :, : position_ids[:, 0], :].clone(),
+                    past_key_values[i][1][:, :, : position_ids[:, 0], :].clone(),
+                )
+            )
+
+        # ModelMiniCPMVBaseModel
+        outputs_prefill: BaseModelOutputWithPast = self.model(
+            attention_mask=None,  # because for text, it is standard causal attention mask, do nothing
+            position_ids=position_ids,  # position_ids denotes the position of new text tokens in the sequence
+            past_key_values=past_key_values_for_prefill,  # `past_key_values` will be updated by the model
+            inputs_embeds=inputs_embeds,  # contains text and language model embedding
+            use_cache=True,
+            output_attentions=False,
+            cache_position=position_ids,  # which new positions will use this cache, basically the same as position_ids
+        )
+
+        # Get model updated KV Cache
+        past_key_values_for_prefill_updated = outputs_prefill.past_key_values
+
+        # Update generated KV Cache to input `past_key_values`
+        for layer_idx in range(len(past_key_values)):
+            # Update keys
+            past_key_values[layer_idx][0][
+                :, :, position_ids[:, 0] : position_ids[:, -1] + 1, :
+            ] = past_key_values_for_prefill_updated[layer_idx][0][
+                :, :, position_ids[:, 0] : position_ids[:, -1] + 1
+            ].clone()
+            # Update values
+            past_key_values[layer_idx][1][
+                :, :, position_ids[:, 0] : position_ids[:, -1] + 1, :
+            ] = past_key_values_for_prefill_updated[layer_idx][1][
+                :, :, position_ids[:, 0] : position_ids[:, -1] + 1
+            ].clone()
+
+        # TODO: del past_key_values_for_prefill_updated recursively
+        # TODO: del outputs_prefill recursively
+
+        return past_key_values
+
+    @torch.inference_mode()
+    def prefill_audio_ids(
+        self,
+        input_ids: torch.Tensor,
+        past_key_values: List[Tuple[torch.Tensor, torch.Tensor]],
+        streaming_tts_text_mask=None,
+        add_audio_bos: bool = True,
+    ):
+        """Prefill a chunk of audio ids to the model. Used in sliding-window long audio generation.
+        Specifically, prefill many audio ids (typically from last window) to the model in the new window.
+
+        Args:
+            input_ids (torch.Tensor): (1, seq_len, num_vq) Audio input token ids.
+            past_key_values (List[Tuple[torch.Tensor, torch.Tensor]]): Past key values for attention mechanism.
+        """
+        assert input_ids.shape[0] == 1
+        assert past_key_values is not None
+
+        code_emb = [self.emb_code[i](input_ids[:, :, i]) for i in range(self.num_vq)]
+        inputs_embeds = torch.stack(code_emb, 3).sum(3)  # [1,seq_len,768]
+        input_len = input_ids.shape[1]
+
+        if add_audio_bos:
+            narrowed_input_ids = torch.tensor(
+                [[self.audio_bos_token_id]], dtype=torch.long, device=self.device
+            )
+            bos_inputs_embeds = self.emb_text(narrowed_input_ids)
+            inputs_embeds = torch.cat([bos_inputs_embeds, inputs_embeds], dim=1)
+            input_len += 1
+
+        past_key_values_length = past_key_values[0][0].shape[2]
+        position_ids = torch.arange(
+            past_key_values_length,
+            past_key_values_length + input_len,
+            dtype=torch.long,
+            device=self.device,
+        ).unsqueeze(0)
+
+        cache_position = position_ids.clone()
+        causal_mask = make_streaming_chunk_mask_generation(
+            inputs_embeds=inputs_embeds,
+            past_seen_tokens=past_key_values[0][0].shape[2],
+            streaming_tts_text_mask=streaming_tts_text_mask,
+            streaming_reserved_length=self.streaming_text_reserved_len,
+            streaming_text_chunk_size=self.streaming_text_chunk_size,
+        )  # [1, 1, 1, past_key_values_length + input_len]
+
+        # Model forward
+        outputs: BaseModelOutputWithPast = self.model(
+            attention_mask=causal_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=True,
+            output_attentions=False,
+            cache_position=cache_position,
+        )
+        past_key_values = outputs.past_key_values
+        return past_key_values
+
+    @torch.inference_mode()
+    def generate(
+        self,
+        input_ids: torch.Tensor,
+        past_key_values: List[Tuple[torch.Tensor, torch.Tensor]],
+        temperature: torch.Tensor,
+        eos_token: Union[int, torch.Tensor],
+        streaming_tts_text_mask=None,
+        force_no_stop=False,
+        min_new_token=10,
+        max_new_token=50,
+        logits_warpers: Optional[List[LogitsWarper]] = None,
+        logits_processors: Optional[
+            List[CustomRepetitionPenaltyLogitsProcessorRepeat]
+        ] = None,
+        show_tqdm=False,
+    ):
+        """Generate audio codes in streaming setting or non-streaming setting.
+        Specifically speaking, generate audio codes when not all text tokens are prefilled.
+
+        Always pass a valid `past_key_values` to the method. The method does not do `prefill` by itself. It relies on `prefill_text` method to provide valid `past_key_values`. Please refer to docstring of this class for more details.
+
+        In this method, we borrowed a lot of codes from `https://github.com/2noise/ChatTTS/blob/main/ChatTTS/model/gpt.py`.
+
+        Args:
+            input_ids (torch.Tensor): Input token ids.
+            past_key_values (List[Tuple[torch.Tensor, torch.Tensor]]): Past key values for attention mechanism.
+            temperature (torch.Tensor): Temperature for sampling.
+            eos_token (Union[int, torch.Tensor]): End of sequence token.
+            streaming_tts_text_mask (Optional[torch.Tensor], optional): Mask for streaming TTS text. Defaults to None.
+            max_new_token (int, optional): Maximum number of new tokens to generate. Defaults to 50.
+            logits_warpers (List[LogitsWarper], optional): List of logits warpers. Defaults to [].
+            logits_processors (List[CustomRepetitionPenaltyLogitsProcessorRepeat], optional): List of logits processors. Defaults to [].
+            show_tqdm (bool, optional): Whether to show progress bar. Defaults to True.
+
+        Returns:
+            GenerationOutputs: Generation outputs.
+        """
+
+        # We only support batch size `1` for now
+        assert input_ids.shape[0] == 1
+        assert past_key_values is not None
+
+        logits_warpers = logits_warpers or []
+        logits_processors = logits_processors or []
+
+        # fix: this should not be `input_ids.shape[1]`
+        # start_idx = input_ids.shape[1]
+        start_idx = (
+            1
+            + self.num_spk_embs * self.use_speaker_embedding
+            + self.streaming_text_reserved_len
+            + 1
+        )
+
+        finish = torch.zeros(input_ids.shape[0], device=input_ids.device).bool()
+
+        temperature = (
+            temperature.unsqueeze(0)
+            .expand(input_ids.shape[0], -1)
+            .contiguous()
+            .view(-1, 1)
+        )
+
+        progress = input_ids.shape[1]
+
+        # Pre-allocate input_ids, shape is [batch_size=1, max_possible_seq_len, self.num_vqs]
+        input_ids_buf = torch.zeros(
+            input_ids.shape[0],  # batch_size
+            progress
+            + max_new_token,  # max_possible_seq_len = input_ids.shape[1] + max_new_token
+            input_ids.shape[2],  # self.num_vqs
+            dtype=input_ids.dtype,
+            device=input_ids.device,
+        )
+
+        # Copy existing `input_ids` to `input_ids_buf`
+        input_ids_buf.narrow(1, 0, progress).copy_(input_ids)
+
+        del input_ids
+        input_ids = input_ids_buf.narrow(1, 0, progress)
+
+        pbar: Optional[tqdm] = None
+        if show_tqdm:
+            pbar = tqdm(
+                total=max_new_token,
+                desc="code",
+                bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt}(max) [{elapsed}, {rate_fmt}{postfix}]",
+            )
+
+        condition_length = (
+            1
+            + self.num_spk_embs * self.use_speaker_embedding
+            + self.streaming_text_reserved_len
+            + 1
+        )
+
+        for i in range(max_new_token):
+            # Prepare generation inputs
+            audio_bos = False
+
+            # If this is the first audio token, the case is SPECIAL
+            if progress == condition_length:
+                audio_bos = True
+
+            assert progress == (
+                past_key_values[0][0].shape[2] + 1
+            )  # If you are using according to the guidelines, this should be passed.
+
+            if audio_bos:
+                # Generate the first token, activate the model with `self.audio_bos_token_id`, the model will predict
+                # a new audio token. This is a special case because without the `audio bos token`, it is impossible
+                # to generate the first audio token in our streaming setting.
+                narrowed_input_ids = torch.tensor(
+                    [[self.audio_bos_token_id]], dtype=torch.long, device=self.device
+                )
+                inputs_embeds = self.emb_text(narrowed_input_ids)
+                del narrowed_input_ids
+            else:
+                # Generate the following audio tokens, it is applicable to all other cases, including second and the
+                # following calling of `generate`.
+                narrowed_input_ids = input_ids.narrow(
+                    dim=1, start=input_ids.shape[1] - 1, length=1
+                )
+                code_emb = [
+                    self.emb_code[i](narrowed_input_ids[:, :, i])
+                    for i in range(self.num_vq)
+                ]
+                inputs_embeds = torch.stack(code_emb, 3).sum(3)
+
+            position_ids = torch.tensor(
+                [past_key_values[0][0].shape[2]], dtype=torch.long, device=self.device
+            ).unsqueeze(0)
+
+            cache_position = position_ids.clone()
+
+            # Make causal mask
+            causal_mask = make_streaming_chunk_mask_generation(
+                inputs_embeds=inputs_embeds,
+                past_seen_tokens=past_key_values[0][0].shape[2],
+                streaming_tts_text_mask=streaming_tts_text_mask,
+                streaming_reserved_length=self.streaming_text_reserved_len,
+                streaming_text_chunk_size=self.streaming_text_chunk_size,
+            )
+
+            # Model forward
+            outputs: BaseModelOutputWithPast = self.model(
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                inputs_embeds=inputs_embeds,
+                use_cache=True,
+                output_attentions=False,
+                cache_position=cache_position,
+            )
+
+            del position_ids
+            del inputs_embeds
+            del cache_position
+            del causal_mask
+
+            hidden_states = outputs.last_hidden_state
+            past_key_values = outputs.past_key_values
+
+            with P.cached():
+                logits = torch.empty(
+                    hidden_states.size(0),
+                    hidden_states.size(1),
+                    self.num_audio_tokens,
+                    self.num_vq,
+                    dtype=torch.float,
+                    device=self.device,
+                )
+                for num_vq_iter in range(self.num_vq):
+                    x: torch.Tensor = self.head_code[num_vq_iter](hidden_states)
+                    logits[..., num_vq_iter] = x
+                    del x
+
+            del hidden_states
+
+            # logits = logits[:, -1].float()
+            logits = logits.narrow(1, -1, 1).squeeze_(1).float()
+
+            # logits = rearrange(logits, "b c n -> (b n) c")
+            logits = logits.permute(0, 2, 1)
+            logits = logits.reshape(-1, logits.size(2))
+            # logits_token = rearrange(input_ids[:, start_idx:], "b c n -> (b n) c")
+            input_ids_sliced = input_ids.narrow(
+                1,
+                start_idx,
+                input_ids.size(1) - start_idx,
+            ).permute(0, 2, 1)
+            logits_token = input_ids_sliced.reshape(
+                input_ids_sliced.size(0) * input_ids_sliced.size(1),
+                -1,
+            ).to(self.device)
+            del input_ids_sliced
+
+            logits /= temperature
+
+            if not audio_bos:
+                for logitsProcessors in logits_processors:
+                    logits = logitsProcessors(logits_token, logits)
+            if not audio_bos:
+                for logitsWarpers in logits_warpers:
+                    logits = logitsWarpers(logits_token, logits)
+
+            del logits_token
+
+            if i < min_new_token:
+                logits[:, eos_token] = -torch.inf
+
+            if force_no_stop:
+                logits[:, eos_token] = -torch.inf
+
+            scores = F.softmax(logits, dim=-1)
+
+            del logits
+            idx_next = torch.multinomial(scores, num_samples=1)  # .to(finish.device)
+
+            del scores
+
+            # idx_next = rearrange(idx_next, "(b n) 1 -> b n", n=self.num_vq)
+            idx_next = idx_next.view(-1, self.num_vq)
+            finish_or = idx_next.eq(eos_token).any(1)
+            finish.logical_or_(finish_or)
+
+            del finish_or
+            # Store new `token` into `input_ids_buf`
+            input_ids_buf.narrow(1, progress, 1).copy_(idx_next.unsqueeze_(1))
+
+            if i == 0 and finish.any():
+                # raise Exception
+                break
+
+            del idx_next
+            progress += 1
+            input_ids = input_ids_buf.narrow(1, 0, progress)
+
+            if finish.all():
+                break
+
+            if pbar is not None:
+                pbar.update(1)
+
+        if pbar is not None:
+            pbar.close()
+
+        if not finish.all():
+            if show_tqdm:
+                logger.info(f"incomplete result. hit max_new_token: {max_new_token}")
+
+        del input_ids_buf
+
+        if finish.all():
+            # the last may contains eos token
+            genrated_input_ids = input_ids[:, condition_length:-1, :]
+        else:
+            # there is no eos token
+            genrated_input_ids = input_ids[:, condition_length:, :]
+
+        return ConditionalChatTTSGenerationOutput(
+            new_ids=genrated_input_ids,
+            audio_input_ids=input_ids,  # for update purpose
+            past_key_values=past_key_values,  # for update purpose
+            finished=finish.all(),
+        )
+
+    @torch.inference_mode()
+    def decode_to_mel_specs(
+        self,
+        result_list: List[torch.Tensor],
+    ):
+        """Decode discrete audio codes to mel spectrograms.
+
+        Borrowed from `https://github.com/2noise/ChatTTS/blob/main/ChatTTS/core.py`
+
+        Args:
+            result_list (List[torch.Tensor]): Audio codes output from `generate`.
+
+        Returns:
+            torch.Tensor: Mel spectrograms.
+        """
+
+        decoder = self.dvae
+        max_x_len = -1
+        if len(result_list) == 0:
+            return np.array([], dtype=np.float32)
+        for result in result_list:
+            if result.size(0) > max_x_len:
+                max_x_len = result.size(0)
+        batch_result = torch.zeros(
+            (len(result_list), result_list[0].size(1), max_x_len),
+            dtype=result_list[0].dtype,
+            device=result_list[0].device,
+        )
+        for i in range(len(result_list)):
+            src = result_list[i]
+            batch_result[i].narrow(1, 0, src.size(0)).copy_(src.permute(1, 0))
+            del src
+
+        mel_specs = decoder(batch_result)
+        del batch_result
+        return mel_specs
+
+
+# Copied from transformers.models.whisper.modeling_whisper.WhisperEncoderLayer and add use_cache for streaming inference
+class MiniCPMWhisperEncoderLayer(nn.Module):
+    def __init__(self, config: WhisperConfig, layer_idx: int = None):
+        super().__init__()
+        self.embed_dim = config.d_model
+        self.self_attn = WhisperAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.encoder_attention_heads,
+            dropout=config.attention_dropout,
+            config=config,
+            layer_idx=layer_idx,
+        )
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
+        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        layer_head_mask: torch.Tensor,
+        output_attentions: bool = False,
+        past_key_values: Optional[EncoderDecoderCache] = None,
+        use_cache: Optional[bool] = False,
+    ) -> torch.Tensor:
+        r"""
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, embed_dim)`):
+                Hidden states to be fed into the encoder layer.
+            attention_mask (`torch.FloatTensor` of shape `(batch_size, 1, tgt_len, src_len)`):
+                Attention mask where padding elements are indicated by large negative values.
+            layer_head_mask (`torch.FloatTensor` of shape `(encoder_attention_heads,)`):
+                Mask to nullify selected heads of the attention modules.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attention weights.
+            past_key_values (`EncoderDecoderCache`, *optional*):
+                Past key-value pairs used for incremental decoding.
+            use_cache (`bool`, *optional*):
+                Whether or not to return updated `past_key_values` for caching.
+
+        Returns:
+            A tuple of shape `(hidden_states, optional(attn_weights), optional(past_key_values))`.
+        """
+        residual = hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+        # TODO (lifuhuang): confirmed with Mick that the logic for past_key_values is copied from minicpmo official code,
+        # currently we are not using past_key_values at all. We need to redesign the caching logic when we support streaming
+        # in the future.
+        hidden_states, attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+            past_key_value=past_key_values,
+        )
+        hidden_states = nn.functional.dropout(
+            hidden_states, p=self.dropout, training=False
+        )
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(
+            hidden_states, p=self.activation_dropout, training=False
+        )
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = nn.functional.dropout(
+            hidden_states, p=self.dropout, training=False
+        )
+        hidden_states = residual + hidden_states
+
+        if hidden_states.dtype == torch.float16 and (
+            torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
+        ):
+            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+            hidden_states = torch.clamp(
+                hidden_states, min=-clamp_value, max=clamp_value
+            )
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        if use_cache:
+            outputs += (past_key_values,)
+
+        return outputs
+
+
+# Copied from from transformers.models.whisper.modeling_whisper.WhisperEncoder and add use_cache for streaming inference
+class MiniCPMWhisperEncoder(WhisperEncoder):
+
+    def __init__(self, config: WhisperConfig):
+        super().__init__(config)
+        self.layers = nn.ModuleList(
+            [
+                MiniCPMWhisperEncoderLayer(config, layer_idx=i)
+                for i in range(config.encoder_layers)
+            ]
+        )
+
+    def forward(
+        self,
+        input_features,
+        attention_mask=None,
+        head_mask=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+        past_key_values: Optional[EncoderDecoderCache] = None,
+        use_cache: Optional[bool] = None,
+    ):
+        r"""
+        Forward pass of the Whisper encoder.
+
+        Args:
+            input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
+                Float values of log-mel features extracted from the raw audio waveform. Typically generated
+                by a feature extractor (e.g., `WhisperFeatureExtractor`) that processes `.flac` or `.wav`
+                files into padded 2D mel spectrogram frames. These features are projected via convolution layers
+                (`conv1` and `conv2`) and then transformed into embeddings for the encoder.
+
+            attention_mask (`torch.Tensor`, *optional*):
+                Not used by Whisper for masking `input_features`, but included for API compatibility with
+                other models. If provided, it is simply ignored within the model. By default, Whisper
+                effectively ignores silence in the input log-mel spectrogram.
+
+            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+                Mask to nullify selected attention heads. The elements should be either 1 or 0, where:
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked** (i.e., the attention head is dropped).
+
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attention tensors of all encoder layers. If set to `True`, the
+                returned tuple (or `BaseModelOutputWithPast`) will contain an additional element with
+                attention weights for each encoder layer.
+
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. If set to `True`, the returned
+                tuple (or `BaseModelOutputWithPast`) will contain a tuple of hidden states, including the
+                initial embedding output as well as the outputs of each layer.
+
+            return_dict (`bool`, *optional*):
+                Whether or not to return a `BaseModelOutputWithPast` (a subclass of `ModelOutput`) instead
+                of a plain tuple. If set to `True`, the output will be a `BaseModelOutputWithPast` object,
+                otherwise it will be a tuple.
+
+            past_key_values (`EncoderDecoderCache`, *optional*):
+                When using caching for faster inference, this is an object that stores the key-value pairs
+                for attention states. If provided, the model will append new states to the existing cache
+                and return the updated cache. This speeds up sequential decoding or chunked inference.
+
+                - If `past_key_values` is `None`, no past states are used or returned.
+                - If `past_key_values` is not `None` and `use_cache=True`, the model will use the provided
+                cache and return the updated cache (as `next_encoder_cache`).
+
+            use_cache (`bool`, *optional*):
+                Whether or not the model should use caching (`past_key_values`) to speed up processing
+                during inference. When set to `True`, the model will:
+                - Inspect and use `past_key_values` if provided.
+                - Return updated `past_key_values` (under the name `next_encoder_cache` in
+                    `BaseModelOutputWithPast`).
+
+        Returns:
+            `BaseModelOutputWithPast` or `tuple` (depending on `return_dict`):
+                If `return_dict=True`, a `BaseModelOutputWithPast` is returned, which contains:
+                - **last_hidden_state** (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                The output of the final encoder layer.
+                - **hidden_states** (`tuple(torch.FloatTensor)`, *optional*, returned if `output_hidden_states=True`):
+                Hidden states of the model at each layer (including the initial projection).
+                - **attentions** (`tuple(torch.FloatTensor)`, *optional*, returned if `output_attentions=True`):
+                Attention weights from each encoder layer.
+                - **past_key_values** (an object of type `EncoderDecoderCache` or `None`, *optional*):
+                Updated cache of key-value pairs if `use_cache=True`.
+
+                If `return_dict=False`, a tuple is returned, where the format is:
+                `(last_hidden_state, hidden_states, attentions)`, with `hidden_states` and `attentions`
+                only present if their respective `output_*` arguments are set to `True`.
+
+        """
+        output_attentions = (
+            output_attentions
+            if output_attentions is not None
+            else self.config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+
+        # Ignore copy
+        input_features = input_features.to(
+            dtype=self.conv1.weight.dtype, device=self.conv1.weight.device
+        )
+
+        inputs_embeds = nn.functional.gelu(self.conv1(input_features))
+        inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))
+
+        inputs_embeds = inputs_embeds.permute(0, 2, 1)
+
+        embed_pos = self.embed_positions.weight
+        past_key_values_length = 0
+        if use_cache:
+            if past_key_values is None:
+                past_key_values = EncoderDecoderCache(DynamicCache(), DynamicCache())
+            elif isinstance(past_key_values, list):
+                past_key_values = EncoderDecoderCache(
+                    DynamicCache.from_legacy_cache(past_key_values), DynamicCache()
+                )
+            elif isinstance(past_key_values, DynamicCache):
+                past_key_values = EncoderDecoderCache(past_key_values, DynamicCache())
+            else:
+                pass
+            past_key_values_length = (
+                past_key_values.self_attention_cache.get_usable_length(
+                    inputs_embeds.shape[1]
+                )
+            )
+            if inputs_embeds.shape[1] + past_key_values_length > embed_pos.shape[0]:
+                logger.warning(
+                    "seems the audio is longer than 30s. repeating the last part of the audio"
+                )
+                embed_pos_front = embed_pos[past_key_values_length:, :]
+                embed_pos = torch.cat(
+                    (
+                        embed_pos_front,
+                        torch.repeat_interleave(
+                            embed_pos[-1, :].unsqueeze(0),
+                            inputs_embeds.shape[1]
+                            - embed_pos.shape[0]
+                            + past_key_values_length,
+                            dim=0,
+                        ),
+                    )
+                )
+            else:
+                embed_pos = embed_pos[
+                    past_key_values_length : inputs_embeds.shape[1]
+                    + past_key_values_length,
+                    :,
+                ]
+        else:
+            embed_pos = embed_pos[: inputs_embeds.shape[1], :]
+
+        hidden_states = inputs_embeds + embed_pos
+        hidden_states = nn.functional.dropout(
+            hidden_states, p=self.dropout, training=False
+        )
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        # check if head_mask has a correct number of layers specified if desired
+        if head_mask is not None:
+            assert head_mask.size()[0] == (
+                len(self.layers)
+            ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+
+        for idx, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+            to_drop = False
+
+            # Ignore copy
+            if to_drop:
+                layer_outputs = (None, None)
+            else:
+                layer_outputs = encoder_layer(
+                    hidden_states,
+                    attention_mask,
+                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+                    output_attentions=output_attentions,
+                    past_key_values=past_key_values,
+                    use_cache=use_cache,
+                )
+
+                hidden_states = layer_outputs[0]
+
+            if use_cache:
+                next_encoder_cache = layer_outputs[2 if output_attentions else 1]
+            else:
+                next_encoder_cache = None
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        hidden_states = self.layer_norm(hidden_states)
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, encoder_states, all_attentions]
+                if v is not None
+            )
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            hidden_states=encoder_states,
+            attentions=all_attentions,
+            past_key_values=next_encoder_cache,
+        )
+
+
+class MultiModalProjector(nn.Module):
+    def __init__(self, in_dim, out_dim):
+        super().__init__()
+        self.linear1 = nn.Linear(in_features=in_dim, out_features=out_dim, bias=True)
+        self.relu = nn.ReLU()
+        self.linear2 = nn.Linear(in_features=out_dim, out_features=out_dim, bias=True)
+
+    def forward(self, audio_features):
+        hidden_states = self.relu(self.linear1(audio_features))
+        hidden_states = self.linear2(hidden_states)
+        return hidden_states
+
+
+class MiniCPMO(MiniCPMBaseModel):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__(config=config, quant_config=quant_config)
+
+        self.llm = self.init_llm(config=config, quant_config=quant_config)
+
+        self.embed_dim = self.llm.config.hidden_size
+
+        # init vision module
+        if self.config.init_vision:
+            # print("vision-understanding enabled")
+            self.vpm = self.init_vision_module(config=config, quant_config=quant_config)
+            self.vision_dim = self.vpm.embed_dim
+            self.resampler = self.init_resampler(self.embed_dim, self.vision_dim)
+
+        # init audio module
+        self.config.init_audio = True
+        if self.config.init_audio:
+            # print("audio-understanding enabled")
+            self.apm = self.init_audio_module()
+            audio_output_dim = int(self.apm.config.encoder_ffn_dim // 4)
+            self.audio_avg_pooler = nn.AvgPool1d(
+                self.config.audio_pool_step, stride=self.config.audio_pool_step
+            )
+            self.audio_projection_layer = MultiModalProjector(
+                in_dim=audio_output_dim, out_dim=self.embed_dim
+            )
+            self.audio_encoder_layer = -1
+
+        # init tts module
+        self.config.init_tts = False
+        logger.info("TTS is disabled for now")
+        if self.config.init_tts:
+            # print("tts enabled")
+            assert (
+                _tts_deps
+            ), "please make sure vector_quantize_pytorch and vocos are installed."
+            self.tts = self.init_tts_module()
+
+    def init_tts_module(self):
+        model = ConditionalChatTTS(self.config.tts_config)
+        return model
+
+    def init_audio_module(self):
+        model = MiniCPMWhisperEncoder(self.config.audio_config)
+        return model
+
+    def init_llm(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> nn.Module:
+        return Qwen2ForCausalLM(config=config, quant_config=quant_config, prefix=prefix)
+
+    def init_vision_module(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig],
+        prefix: str = "",
+    ):
+        if self.config._attn_implementation == "flash_attention_2":
+            self.config.vision_config._attn_implementation = "flash_attention_2"
+        else:
+            self.config.vision_config._attn_implementation = "eager"
+        model = Idefics2VisionTransformer(
+            config=config.vision_config, quant_config=quant_config, prefix=prefix
+        )
+        if self.config.drop_vision_last_layer:
+            model.encoder.layers = model.encoder.layers[:-1]
+
+        setattr(model, "embed_dim", model.embeddings.embed_dim)
+        setattr(model, "patch_size", model.embeddings.patch_size)
+
+        return model
+
+    def init_resampler(
+        self,
+        embed_dim: int,
+        vision_dim: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> nn.Module:
+        with set_default_torch_dtype(torch.float16):
+            # The resampler in 2.6 remains consistent with the one in 2.5.
+            resampler = Resampler2_5(
+                num_queries=self.config.query_num,
+                embed_dim=embed_dim,
+                num_heads=embed_dim // 128,
+                kv_dim=vision_dim,
+                quant_config=quant_config,
+                prefix=prefix,
+            )
+
+        return resampler.to(device="cuda", dtype=torch.get_default_dtype())
+
+    def pad_input_ids(self, input_ids: List[int], mm_input: MultimodalInputs):
+        # Get all special token IDs
+        im_start_id: int = mm_input.im_start_id
+        im_end_id: int = mm_input.im_end_id
+        slice_start_id: int = mm_input.slice_start_id
+        slice_end_id: int = mm_input.slice_end_id
+
+        data_token_pairs = [
+            (im_start_id, im_end_id),
+            (slice_start_id, slice_end_id),
+            (mm_input.audio_start_id, mm_input.audio_end_id),
+        ]
+        data_start_token_ids = [im_start_id, mm_input.audio_start_id]
+        pattern = MultiModalityDataPaddingPatternTokenPairs(
+            data_token_pairs=data_token_pairs, data_start_token_ids=data_start_token_ids
+        )
+
+        return pattern.pad_input_tokens(input_ids, mm_input)
+
+    def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
+        """
+        Computes the output length of the convolutional layers and the output length of the audio encoder
+        """
+        input_lengths_after_cnn = (input_lengths - 1) // 2 + 1
+        input_lengths_after_pooling = (
+            input_lengths_after_cnn - self.config.audio_pool_step
+        ) // self.config.audio_pool_step + 1
+        input_lengths_after_pooling = input_lengths_after_pooling.to(dtype=torch.int32)
+
+        return input_lengths_after_cnn, input_lengths_after_pooling
+
+    def get_audio_embedding_streaming(self, items: List[MultimodalDataItem]):
+        r"""
+        Extract audio embeddings in a streaming manner using cached key-value pairs.
+
+        This method processes incoming audio features incrementally and stores/updates `past_key_values`
+        for faster inference on subsequent audio frames. It only supports batch_size=1 and is intended
+        for streaming scenarios.
+
+        Returns:
+            List[List[torch.Tensor]]: audio embeddings
+        """
+        wavforms = flatten_nested_list([item.feature for item in items if item.feature])
+        # list, [[x1, x2], [y1], [z1]]
+        audio_feature_lens_raw = flatten_nested_list(
+            [item.audio_feature_lens for item in items if item.audio_feature_lens]
+        )
+
+        # exist audio
+        if len(wavforms) > 0:
+            audio_feature_lens = torch.hstack(audio_feature_lens_raw)
+            batch_size, _, max_mel_seq_len = wavforms.shape
+            assert batch_size == 1
+            max_seq_len = (max_mel_seq_len - 1) // 2 + 1
+
+            if self.audio_past_key_values is not None:
+                cache_length = self.audio_past_key_values[0][0].shape[2]
+                apm_max_len = self.apm.embed_positions.weight.shape[0]
+                if cache_length + max_seq_len >= apm_max_len:
+                    logger.warning(
+                        f"audio_past_key_values length {cache_length + max_seq_len} exceed {apm_max_len}, reset."
+                    )
+                    self.audio_past_key_values = None
+
+            audio_outputs = self.apm(
+                wavforms, past_key_values=self.audio_past_key_values, use_cache=True
+            )
+            audio_states = (
+                audio_outputs.last_hidden_state
+            )  # [:, :audio_feat_lengths, :]
+            self.audio_past_key_values = audio_outputs.past_key_values
+
+            audio_embeds = self.audio_projection_layer(audio_states)
+
+            audio_embeds = audio_embeds.transpose(1, 2)
+            audio_embeds = self.audio_avg_pooler(audio_embeds)
+            audio_embeds = audio_embeds.transpose(1, 2)
+
+            _, feature_lens_after_pooling = self._get_feat_extract_output_lengths(
+                audio_feature_lens
+            )
+
+            num_audio_tokens = feature_lens_after_pooling
+
+            final_audio_embeds = []
+            idx = 0
+            for i in range(len(audio_feature_lens_raw)):
+                target_audio_embeds = []
+                for _ in range(len(audio_feature_lens_raw[i])):
+                    target_audio_embeds.append(
+                        audio_embeds[idx, : num_audio_tokens[idx], :]
+                    )
+                    idx += 1
+                final_audio_embeds.append(target_audio_embeds)
+            return final_audio_embeds
+        else:
+            return []
+
+    def subsequent_chunk_mask(
+        self,
+        size: int,
+        chunk_size: int,
+        num_left_chunks: int = -1,
+        device: torch.device = torch.device("cpu"),
+        num_lookhead: int = 0,
+    ) -> torch.Tensor:
+        """Create mask for subsequent steps (size, size) with chunk size,
+        this is for streaming encoder
+
+        Args:
+            size (int): size of mask
+            chunk_size (int): size of chunk
+            num_left_chunks (int): number of left chunks
+                <0: use full chunk
+                >=0: use num_left_chunks
+            device (torch.device): "cpu" or "cuda" or torch.Tensor.device
+
+        Returns:
+            torch.Tensor: mask
+
+        """
+        ret = torch.zeros(size, size, device=device, dtype=torch.bool)
+        for i in range(size):
+            if num_left_chunks < 0:
+                start = 0
+            else:
+                start = max((i // chunk_size - num_left_chunks) * chunk_size, 0)
+            ending = min((i // chunk_size + 1) * chunk_size + num_lookhead, size)
+            ret[i, start:ending] = True
+        return ret
+
+    def get_audio_embedding(self, items: List[MultimodalDataItem], chunk_length=-1):
+        r"""
+        Extract full audio embeddings with optional chunk-based attention.
+
+        This method computes embeddings for all audio frames at once, either using full attention (when
+        `chunk_length` is -1) or chunk-based attention (when `chunk_length` is a positive number). It does
+        not use key-value caching and is suitable for non-streaming inference.
+
+        Args:
+            chunk_length (int, optional): Determines whether to use full attention (-1) or chunk-based
+                attention (>0) during embedding computation.
+
+        Returns:
+            List[List[torch.Tensor]]: audio embeddings
+        """
+        # (bs, 80, frames) or [], multi audios need filled in advance
+        wavforms = flatten_nested_list([item.feature for item in items if item.feature])
+        # list, [[x1, x2], [y1], [z1]]
+        audio_feature_lens_raw = flatten_nested_list(
+            [item.audio_feature_lens for item in items if item.audio_feature_lens]
+        )
+
+        # Ensure audio_feature_lens_raw is properly formatted as [[tensor], [tensor], ...]
+        if audio_feature_lens_raw:
+            if isinstance(audio_feature_lens_raw[0], torch.Tensor):
+                # Flat list of tensors, wrap each in a list
+                audio_feature_lens_raw = [[lens] for lens in audio_feature_lens_raw]
+            elif isinstance(audio_feature_lens_raw[0], list):
+                # Already nested, ensure all elements are properly formatted
+                # Flatten if needed
+                flattened = []
+                for item in audio_feature_lens_raw:
+                    if isinstance(item, list):
+                        flattened.extend(item)
+                    else:
+                        flattened.append(item)
+                audio_feature_lens_raw = [
+                    [item] if not isinstance(item, list) else item for item in flattened
+                ]
+
+        final_audio_embeds = []
+
+        assert isinstance(wavforms, list)
+        assert isinstance(wavforms[0], torch.Tensor)
+        # exist audio
+        for wavform in wavforms:
+            if len(wavform) > 0:
+                # Flatten audio_feature_lens_raw to get a list of tensors
+                flattened_lens = []
+                for item in audio_feature_lens_raw:
+                    if isinstance(item, list):
+                        flattened_lens.extend(item)
+                    else:
+                        flattened_lens.append(item)
+                audio_feature_lens = torch.hstack(flattened_lens)
+                batch_size, _, max_mel_seq_len = wavform.shape
+                max_seq_len = (max_mel_seq_len - 1) // 2 + 1
+
+                # Create a sequence tensor of shape (batch_size, max_seq_len)
+                seq_range = (
+                    torch.arange(
+                        0,
+                        max_seq_len,
+                        dtype=audio_feature_lens.dtype,
+                        device=audio_feature_lens.device,
+                    )
+                    .unsqueeze(0)
+                    .expand(batch_size, max_seq_len)
+                )
+                lengths_expand = audio_feature_lens.unsqueeze(1).expand(
+                    batch_size, max_seq_len
+                )
+                # Create mask
+                padding_mask = seq_range >= lengths_expand  # 1 for padded values
+
+                audio_attention_mask_ = padding_mask.view(
+                    batch_size, 1, 1, max_seq_len
+                ).expand(batch_size, 1, max_seq_len, max_seq_len)
+                audio_attention_mask = audio_attention_mask_.to(
+                    dtype=self.apm.conv1.weight.dtype,
+                    device=self.apm.conv1.weight.device,
+                )
+
+                if chunk_length > 0:
+                    chunk_num_frame = int(chunk_length * 50)
+                    chunk_mask = self.subsequent_chunk_mask(
+                        size=max_seq_len,
+                        chunk_size=chunk_num_frame,
+                        num_left_chunks=-1,
+                        device=audio_attention_mask_.device,
+                    )
+                    audio_attention_mask_ = torch.logical_or(
+                        audio_attention_mask_, torch.logical_not(chunk_mask)
+                    )
+
+                audio_attention_mask[audio_attention_mask_] = float("-inf")
+                audio_states = self.apm(
+                    wavform,
+                    output_hidden_states=True,
+                    attention_mask=audio_attention_mask,
+                ).hidden_states[self.audio_encoder_layer]
+                audio_embeds = self.audio_projection_layer(audio_states)
+
+                audio_embeds = audio_embeds.transpose(1, 2)
+                audio_embeds = self.audio_avg_pooler(audio_embeds)
+                audio_embeds = audio_embeds.transpose(1, 2)
+
+                _, feature_lens_after_pooling = self._get_feat_extract_output_lengths(
+                    audio_feature_lens
+                )
+
+                num_audio_tokens = feature_lens_after_pooling
+
+                idx = 0
+                for i in range(len(audio_feature_lens_raw)):
+                    target_audio_embeds = []
+                    for _ in range(len(audio_feature_lens_raw[i])):
+                        target_audio_embeds.append(
+                            audio_embeds[idx, : num_audio_tokens[idx], :]
+                        )
+                        idx += 1
+                    final_audio_embeds.append(target_audio_embeds)
+            return final_audio_embeds
+
+    def get_audio_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        embedding = self.get_omni_embedding(
+            items=items,
+            chunk_length=self.config.audio_chunk_length,
+            stream_input=False,
+        )
+        return embedding
+
+    def get_omni_embedding(
+        self,
+        items: List[MultimodalDataItem],
+        chunk_length=-1,
+        stream_input=False,
+    ):
+        """
+        Args:
+            chunk_length: whisper use full attention or chunk attention
+            stream_input: use streaming audio embedding
+        Returns:
+            final embeddings with audio feature
+        """
+
+        if stream_input:
+            audio_embeddings = self.get_audio_embedding_streaming(items)
+        else:
+            audio_embeddings = self.get_audio_embedding(items, chunk_length)
+        bs = len(audio_embeddings)
+        # batch size
+        audio_embs = torch.cat(flatten_nested_list(audio_embeddings), dim=0)
+
+        return audio_embs
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        if items and items[0].format == MultimodalInputFormat.PRECOMPUTED_EMBEDDING:
+            result = torch.cat([item.feature for item in items])
+            return result.reshape(-1, result.shape[-1])
+
+        # list of tensors
+        pixel_values = flatten_nested_list([item.feature for item in items])
+        tgt_sizes = torch.stack(
+            flatten_nested_list([item.tgt_size for item in items]), dim=0
+        )
+        assert len(pixel_values) == tgt_sizes.shape[0]
+
+        device = self.vpm.embeddings.position_embedding.weight.device
+        dtype = self.vpm.embeddings.position_embedding.weight.dtype
+        all_pixel_values_lst = [
+            i.flatten(end_dim=1).permute(1, 0) for i in pixel_values
+        ]
+
+        max_patches = (tgt_sizes[:, 0] * tgt_sizes[:, 1]).max().item()
+        assert isinstance(max_patches, int)
+        all_pixel_values = torch.nn.utils.rnn.pad_sequence(
+            all_pixel_values_lst, batch_first=True, padding_value=0.0
+        )
+
+        B, L, _ = all_pixel_values.shape
+        all_pixel_values = all_pixel_values.permute(0, 2, 1).reshape(B, 3, -1, L)
+        patch_attn_mask = torch.zeros(
+            (B, 1, max_patches), dtype=torch.bool, device=device
+        )
+
+        tgt_sizes_tensor = tgt_sizes.clone().to(device=patch_attn_mask.device)
+        mask_shapes = tgt_sizes_tensor[:, 0] * tgt_sizes_tensor[:, 1]
+        patch_attn_mask[:, 0, :] = torch.arange(
+            patch_attn_mask.size(2), device=patch_attn_mask.device
+        ).unsqueeze(0) < mask_shapes.unsqueeze(1)
+
+        vision_embedding = self.vpm(
+            all_pixel_values.type(dtype),
+            patch_attention_mask=patch_attn_mask,
+            tgt_sizes=tgt_sizes,
+        )
+        return self.resampler(vision_embedding, tgt_sizes)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        **kwargs: Any,
+    ) -> torch.Tensor:
+
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.llm,
+            multimodal_model=self,
+            positions=positions,
+        )
+        return hidden_states
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+
+            if "rotary_emb.inv_freq~" in name or "projector" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+
+            # For weight_norm parametrization, handle both old and new formats
+            if self.config.init_tts and "tts" in name:
+                # Handle loading from older checkpoints with weight_g/weight_v format
+                if ".weight_g" in name or ".weight_v" in name:
+                    name = name.replace(
+                        ".weight_g", ".parametrizations.weight.original0"
+                    )
+                    name = name.replace(
+                        ".weight_v", ".parametrizations.weight.original1"
+                    )
+                elif ".weight" in name and name not in params_dict:
+                    param_name = name.replace(
+                        ".weight", ".parametrizations.weight.original0"
+                    )
+                    if param_name in params_dict:
+                        name = param_name
+
+            # adapt to VisionAttention
+            if "vpm" in name:
+                name = name.replace(r"self_attn.out_proj", r"self_attn.proj")
+
+            if not self.config.init_tts and "tts" in name:
+                continue
+            if not self.config.init_audio and ("apm" in name or "audio" in name):
+                continue
+            if not self.config.init_vision and "vpm" in name:
+                continue
+
+            if (
+                "sampler" in name
+                or "apm" in name
+                or ("tts" in name and "self_attn" in name)
+                or ("tts.model.layers" in name and ".mlp" in name)
+            ):
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                # replace the name and load with customized loader
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = [MiniCPMO]
diff --git a/sglang/python/sglang/srt/models/minicpmv.py b/sglang/python/sglang/srt/models/minicpmv.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd4489152b8acb53838b36ae92f4083283c50cdf
--- /dev/null
+++ b/sglang/python/sglang/srt/models/minicpmv.py
@@ -0,0 +1,1446 @@
+# Adapted from
+# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
+# Copyright 2023 The SGLang team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only MiniCPM-V model compatible with HuggingFace weights."""
+
+import types
+from functools import partial
+from itertools import chain
+from typing import (
+    Any,
+    Callable,
+    Iterable,
+    List,
+    Literal,
+    Optional,
+    Tuple,
+    TypedDict,
+    Union,
+)
+
+import numpy as np
+import torch
+import torch.types
+from PIL import Image
+from torch import nn
+from torch.nn.init import trunc_normal_
+from transformers import PretrainedConfig
+
+from sglang.srt.layers.linear import ReplicatedLinear
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternTokenPairs,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import (
+    MultimodalDataItem,
+    MultimodalInputFormat,
+    MultimodalInputs,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.utils import set_default_torch_dtype
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.idefics2 import Idefics2VisionTransformer
+from sglang.srt.models.llama import LlamaConfig, LlamaForCausalLM
+from sglang.srt.models.qwen2 import Qwen2Config, Qwen2ForCausalLM
+from sglang.srt.models.qwen3 import Qwen3Config, Qwen3ForCausalLM
+from sglang.srt.utils import add_prefix, flatten_nested_list
+
+RawImageType = Union[Image.Image, torch.Tensor]
+
+
+# sin/cos positional embedding helpers are adapted from:
+# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
+def get_1d_sincos_pos_embed_from_grid(
+    embed_dim: int, pos: np.ndarray, version: Tuple[int, int] = (2, 0)
+) -> torch.Tensor:
+    """
+    embed_dim: output dimension for each position
+    pos: a list of positions to be encoded: size (M,) / (H, W)
+    out: (M, D) / (H, W, D)
+    """
+    assert embed_dim % 2 == 0
+    omega = np.arange(embed_dim // 2, dtype=np.float32)
+    omega /= embed_dim / 2.0
+    omega = 1.0 / 10000**omega  # (D/2,)
+
+    if version == (2, 0):
+        pos = pos.reshape(-1)  # (M,)
+        out = np.einsum("m,d->md", pos, omega)  # (M, D/2), outer product
+        emb_sin = np.sin(out)  # (M, D/2)
+        emb_cos = np.cos(out)  # (M, D/2)
+        emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
+    else:
+        out = np.einsum("hw,d->hwd", pos, omega)  # (H, W, D/2), outer product
+        emb_sin = np.sin(out)  # (H, W, D/2)
+        emb_cos = np.cos(out)  # (H, W, D/2)
+        emb = np.concatenate([emb_sin, emb_cos], axis=-1)  # (H, W, D)
+    return emb
+
+
+def get_2d_sincos_pos_embed_from_grid(
+    embed_dim: int, grid: np.ndarray, version: Tuple[int, int] = (2, 0)
+) -> torch.Tensor:
+    assert embed_dim % 2 == 0
+
+    # use half of dimensions to encode grid_h
+    emb_h = get_1d_sincos_pos_embed_from_grid(
+        embed_dim // 2, grid[0], version
+    )  # (H*W, D/2) or (H, W, D/2)
+    emb_w = get_1d_sincos_pos_embed_from_grid(
+        embed_dim // 2, grid[1], version
+    )  # (H*W, D/2) or (H, W, D/2)
+
+    if version == (2, 0):
+        emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
+    else:
+        emb = np.concatenate([emb_h, emb_w], axis=-1)  # (H, W, D)
+    return emb
+
+
+def get_2d_sincos_pos_embed(
+    embed_dim: int,
+    grid_size: Union[int, Tuple[int, int]],
+    cls_token: bool = False,
+    version: Tuple[int, int] = (2, 0),
+) -> torch.Tensor:
+    """
+    grid_size: int of the grid height and width
+    return:
+    pos_embed: [grid_size*grid_size, embed_dim] or
+                [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
+    """
+    if isinstance(grid_size, int):
+        grid_h_size, grid_w_size = grid_size, grid_size
+    else:
+        grid_h_size, grid_w_size = grid_size[0], grid_size[1]
+
+    grid_h = np.arange(grid_h_size, dtype=np.float32)
+    grid_w = np.arange(grid_w_size, dtype=np.float32)
+    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
+    grid = np.stack(grid, axis=0)
+    assert isinstance(grid, np.ndarray) and grid.shape == (2, grid_h_size, grid_w_size)
+
+    if version == (2, 0):
+        grid = grid.reshape([2, 1, grid_h_size, grid_w_size])
+        pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid, version)
+        if cls_token:
+            pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
+    else:
+        pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid, version)
+    return pos_embed
+
+
+class MiniCPMVImagePixelInputs(TypedDict):
+    type: Literal["pixel_values"]
+    data: List[torch.Tensor]
+    """
+    Shape: `(batch_size * num_images, num_channels, height, width)`
+
+    Note that the image size may vary, so we pass it as a list
+    instead of a batched tensor.
+    """
+
+    image_bounds: torch.Tensor
+    """
+    Shape: `(batch_size * num_images, 2)`
+
+    This should be in `(start, stop)` format.
+    """
+
+    tgt_sizes: torch.Tensor
+    """
+    Shape: `(batch_size * num_images, 2)`
+
+    This should be in `(height, width)` format.
+    """
+
+
+class MiniCPMVImageEmbeddingInputs(TypedDict):
+    type: Literal["image_embeds"]
+    data: torch.Tensor
+    """
+    Shape: `(batch_size * num_images, image_feature_size, hidden_size)`
+
+    `hidden_size` must match the hidden size of language model backbone.
+    instead of a batched tensor.
+    """
+
+    image_bounds: torch.Tensor
+    """
+    Shape: `(batch_size * num_images, 2)`
+
+    This should be in `(start, stop)` format.
+    """
+
+
+MiniCPMVImageInputs = Union[MiniCPMVImagePixelInputs, MiniCPMVImageEmbeddingInputs]
+
+DEFAULT_LN = partial(nn.LayerNorm, eps=1e-6)
+
+
+class BaseResampler(nn.Module):
+    """
+    A 2D perceiver-resampler network with one cross attention layers by
+        (grid_size**2) learnable queries and 2d sincos pos_emb.
+    Outputs:
+        A tensor with the shape of (grid_size**2, embed_dim)
+    """
+
+    def __init__(
+        self,
+        num_queries: int,
+        embed_dim: int,
+        num_heads: int,
+        kv_dim: Optional[int] = None,
+        norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN,
+        do_post_projection: bool = True,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.num_queries = num_queries
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+
+        self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim))
+        trunc_normal_(self.query, std=0.02)
+        if kv_dim is not None and kv_dim != embed_dim:
+            self.kv_proj = ReplicatedLinear(
+                kv_dim,
+                embed_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=add_prefix("kv_proj", prefix),
+            )
+        else:
+            # Maintain the same return value with ReplicatedLinear.forward
+            self.kv_proj = lambda *args, **kwargs: (  # type: ignore # noqa
+                nn.Identity()(*args, **kwargs),
+                None,
+            )
+        self.attn = nn.MultiheadAttention(embed_dim, num_heads)
+        self.ln_q = norm_layer(embed_dim)
+        self.ln_kv = norm_layer(embed_dim)
+        self.do_post_projection = do_post_projection
+        self.ln_post = norm_layer(embed_dim) if do_post_projection else None
+        self.proj = (
+            nn.Parameter((embed_dim**-0.5) * torch.randn(embed_dim, embed_dim))
+            if do_post_projection
+            else None
+        )
+
+    def _init_weights(self, m: nn.Module) -> None:
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=0.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+
+    def _repeat(self, query, N: int):
+        return query.unsqueeze(1).repeat(1, N, 1)
+
+
+class Resampler2_5(BaseResampler):
+
+    def __init__(
+        self,
+        num_queries: int,
+        embed_dim: int,
+        num_heads: int,
+        kv_dim: Optional[int] = None,
+        norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN,
+        max_size: Tuple[int, int] = (70, 70),
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(
+            num_queries,
+            embed_dim,
+            num_heads,
+            kv_dim,
+            norm_layer,
+            quant_config=quant_config,
+            prefix=prefix,
+        )
+
+        self.max_size = max_size
+        self._set_2d_pos_cache(self.max_size)
+
+        self.apply(self._init_weights)
+
+    def _set_2d_pos_cache(
+        self, max_size: Tuple[int, int], device: torch.types.Device = "cpu"
+    ) -> None:
+        pos_embed_arr = get_2d_sincos_pos_embed(
+            self.embed_dim, max_size, version=(2, 5)
+        )
+        pos_embed = torch.from_numpy(pos_embed_arr).float().to(device)
+        self.register_buffer("pos_embed", pos_embed, persistent=False)
+
+    def _adjust_pos_cache(
+        self, tgt_sizes: torch.Tensor, device: torch.types.Device
+    ) -> None:
+        max_h = tgt_sizes[:, 0].max().item()
+        max_w = tgt_sizes[:, 1].max().item()
+        assert isinstance(max_h, int) and isinstance(max_w, int)
+
+        if max_h > self.max_size[0] or max_w > self.max_size[1]:
+            self.max_size = (
+                max(max_h, self.max_size[0]),
+                max(max_w, self.max_size[1]),
+            )
+            self._set_2d_pos_cache(self.max_size, device)
+
+    def forward(self, x: torch.Tensor, tgt_sizes: torch.Tensor) -> torch.Tensor:
+        assert x.shape[0] == tgt_sizes.shape[0]
+        bs = x.shape[0]
+
+        device = x.device
+        dtype = x.dtype
+
+        patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1]
+
+        self._adjust_pos_cache(tgt_sizes, device=device)
+
+        max_patch_len = patch_len.max().item()
+        assert isinstance(max_patch_len, int)
+
+        key_padding_mask = torch.zeros(
+            (bs, max_patch_len), dtype=torch.bool, device=device
+        )
+
+        pos_embed = []
+        for i in range(bs):
+            tgt_h, tgt_w = tgt_sizes[i].tolist()
+            pos_embed.append(
+                self.pos_embed[:tgt_h, :tgt_w, :].reshape((tgt_h * tgt_w, -1)).to(dtype)
+            )  # patches * D
+            key_padding_mask[i, patch_len[i] :] = True
+        pos_embed = torch.nn.utils.rnn.pad_sequence(
+            pos_embed, batch_first=True, padding_value=0.0
+        ).permute(
+            1, 0, 2
+        )  # BLD => L * B * D
+        x, _ = self.kv_proj(x)  # B * L * D
+        x = self.ln_kv(x).permute(1, 0, 2)  # L * B * D
+
+        q = self.ln_q(self.query)  # Q * D
+
+        out = self.attn(
+            self._repeat(q, bs),  # Q * B * D
+            x + pos_embed,  # L * B * D +  L * B * D
+            x,
+            key_padding_mask=key_padding_mask,
+        )[0]
+        #  out: Q * B * D
+        x = out.permute(1, 0, 2)  # B * Q * D
+
+        x = self.ln_post(x)
+        x = x @ self.proj
+        return x
+
+
+class Resampler4_5(BaseResampler):
+
+    def __init__(
+        self,
+        num_queries: int,
+        embed_dim: int,
+        num_heads: int,
+        kv_dim: Optional[int] = None,
+        norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN,
+        max_size: tuple[int, int] = (70, 70),
+        max_temporal_size=36000,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(
+            num_queries,
+            embed_dim,
+            num_heads,
+            kv_dim,
+            norm_layer,
+            quant_config=quant_config,
+            prefix=prefix,
+        )
+
+        self.max_size = max_size
+        self.max_temporal_size = max_temporal_size
+
+        self._set_2d_pos_cache(self.max_size)
+        self._set_temporal_pos_cache(self.max_temporal_size)
+        self.apply(self._init_weights)
+
+    def get_1d_sincos_pos_embed_from_temporal_size(
+        self, embed_dim: int, pos: np.ndarray
+    ):
+        """
+        embed_dim: output dimension for each position
+        pos: a list of positions to be encoded: size (M,)
+        out: (M, D)
+        """
+        assert embed_dim % 2 == 0
+        omega = np.arange(embed_dim // 2, dtype=np.float32)
+        omega /= embed_dim / 2.0
+        omega = 1.0 / 10000**omega  # (D/2,)
+
+        pos = pos.reshape(-1)  # (M,)
+        out = np.einsum("m,d->md", pos, omega)  # (M, D/2), outer product
+
+        emb_sin = np.sin(out)  # (M, D/2)
+        emb_cos = np.cos(out)  # (M, D/2)
+
+        emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
+        return emb
+
+    def _set_2d_pos_cache(
+        self, max_size: tuple[int, int], device: torch.types.Device = "cpu"
+    ) -> None:
+        pos_embed_arr = get_2d_sincos_pos_embed(
+            self.embed_dim, max_size, version=(2, 5)
+        )
+        pos_embed = torch.from_numpy(pos_embed_arr).float().to(device)
+        self.register_buffer("pos_embed", pos_embed, persistent=False)
+
+    def _adjust_pos_cache(
+        self, tgt_sizes: torch.Tensor, device: torch.types.Device
+    ) -> None:
+        max_h = tgt_sizes[:, 0].max().item()
+        max_w = tgt_sizes[:, 1].max().item()
+        assert isinstance(max_h, int) and isinstance(max_w, int)
+
+        if max_h > self.max_size[0] or max_w > self.max_size[1]:
+            self.max_size = (
+                max(max_h, self.max_size[0]),
+                max(max_w, self.max_size[1]),
+            )
+            self._set_2d_pos_cache(self.max_size, device)
+
+    def _set_temporal_pos_cache(
+        self, max_temporal_size: int, device: torch.types.Device = "cpu"
+    ) -> None:
+        temporal_size = np.arange(max_temporal_size, dtype=np.float32)
+        pos_embed = (
+            torch.from_numpy(
+                self.get_1d_sincos_pos_embed_from_temporal_size(
+                    self.embed_dim, temporal_size
+                )
+            )
+            .float()
+            .to(device)
+        )
+        self.register_buffer("temporal_pos_embed", pos_embed, persistent=False)
+
+    def _adjust_temporal_pos_cache(
+        self, max_temporal_size: int, device: torch.types.Device = "cpu"
+    ):
+        if max_temporal_size > self.max_temporal_size:
+            self.max_temporal_size = max_temporal_size
+            self._set_temporal_pos_cache(self.max_temporal_size, device)
+
+    def forward(
+        self, x: torch.Tensor, tgt_sizes: torch.Tensor, temporal_ids=None
+    ) -> torch.Tensor:
+        assert x.shape[0] == tgt_sizes.shape[0]
+        bs = x.shape[0]
+
+        device = x.device
+        dtype = x.dtype
+
+        patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1]
+
+        self._adjust_pos_cache(tgt_sizes, device=device)
+
+        temporal_pos_emb = False
+        temporal_ids_flatten = None
+        if temporal_ids is not None:
+            # example: [[-1], [-1], [2, 6, 9]]
+            temporal_ids_flatten = list(chain.from_iterable(temporal_ids))
+            max_temporal_size = max(temporal_ids_flatten)
+            if max_temporal_size > -1:
+                temporal_pos_emb = True
+            if max_temporal_size > self.max_temporal_size:
+                self._adjust_temporal_pos_cache(max_temporal_size, device)
+
+        max_patch_len = patch_len.max().item()
+        assert isinstance(max_patch_len, int)
+
+        key_padding_mask = torch.zeros(
+            (bs, max_patch_len), dtype=torch.bool, device=device
+        )
+
+        x, _ = self.kv_proj(x)  # B * L * D
+        x = self.ln_kv(x).permute(1, 0, 2)  # L * B * D
+        q = self.ln_q(self.query)  # Q * D
+
+        pos_embed_2d = []
+        pos_embed_temporal = []
+        for i in range(bs):
+            tgt_h, tgt_w = tgt_sizes[i]
+            if temporal_pos_emb:
+                if temporal_ids_flatten[i] == -1:
+                    pos_embed_temporal.append(
+                        torch.zeros(self.embed_dim, dtype=dtype, device=device)
+                    )
+                else:
+                    pos_embed_temporal.append(
+                        self.temporal_pos_embed[temporal_ids_flatten[i]].to(dtype)
+                    )  # D
+
+            pos_embed_2d.append(
+                self.pos_embed[:tgt_h, :tgt_w, :].reshape((tgt_h * tgt_w, -1)).to(dtype)
+            )  # patches * D
+            key_padding_mask[i, patch_len[i] :] = True
+
+        pos_embed_2d = torch.nn.utils.rnn.pad_sequence(
+            pos_embed_2d, batch_first=True, padding_value=0.0
+        ).permute(
+            1, 0, 2
+        )  # BLD => L * B * D
+
+        k = x
+        v = x + pos_embed_2d
+
+        if pos_embed_temporal:
+            k += torch.stack(pos_embed_temporal, dim=0)
+            bs = len(temporal_ids)
+            merge_k = []
+            merge_v = []
+            merge_key_padding_mask = []
+
+            start = 0
+            for tp in temporal_ids:
+                end = start + len(tp)
+                # # L * (end-start) * D -> (end-start) * L * D -> 1 * L*(end-start) * D
+                merge_k.append(
+                    k[:, start:end, :].permute(1, 0, 2).reshape(-1, self.embed_dim)
+                )
+                merge_v.append(
+                    v[:, start:end, :].permute(1, 0, 2).reshape(-1, self.embed_dim)
+                )
+                merge_key_padding_mask.append(
+                    key_padding_mask[start:end, :].reshape(-1, 1)
+                )
+
+                start = end
+
+            k = torch.nn.utils.rnn.pad_sequence(
+                merge_k, batch_first=True, padding_value=0.0
+            ).permute(
+                1, 0, 2
+            )  # L*(end-start)
+            v = torch.nn.utils.rnn.pad_sequence(
+                merge_v, batch_first=True, padding_value=0.0
+            ).permute(
+                1, 0, 2
+            )  # L*(end-start)
+            key_padding_mask = torch.nn.utils.rnn.pad_sequence(
+                merge_key_padding_mask, batch_first=True, padding_value=True
+            ).squeeze(-1)
+
+        out = self.attn(
+            self._repeat(q, bs),  # Q * B * D
+            k,  # L * B * D +  L * B * D
+            v,
+            key_padding_mask=key_padding_mask,
+        )[0]
+        #  out: Q * B * D
+        x = out.permute(1, 0, 2)  # B * Q * D
+
+        x = self.ln_post(x)
+        x = x @ self.proj
+        return x
+
+
+def get_version_by_config(config: PretrainedConfig) -> Tuple[int, ...]:
+    version_float = getattr(config, "version", None)
+
+    # The old configs do not include version number
+    # TODO: Remove this after the HF repos are updated
+    if version_float is None:
+        if config.hidden_size == 2304 and config.query_num == 64:
+            return 2, 0
+        return 2, 5
+
+    version_str = str(version_float)
+    return tuple(int(x) for x in version_str.split("."))
+
+
+class MiniCPMBaseModel(nn.Module):
+    """
+    The abstract class of MiniCPMV can only be inherited, but cannot be
+    instantiated.
+    """
+
+    def __init__(
+        self,
+        *,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        # All MiniCPM-V models disable `tie_word_embeddings` but
+        # `PretrainedConfig.tie_word_embeddings` defaults to True; we cannot
+        # check `tie_word_embeddings` until SGLang integrate MiniCPM-V model
+        # and config class
+        self.config = config
+
+        self.version = get_version_by_config(self.config)
+        self.llm = self.init_llm(
+            config=config, quant_config=quant_config, prefix=add_prefix("llm", prefix)
+        )
+        self.vpm = self.init_vision_module(
+            config, quant_config, add_prefix("vpm", prefix)
+        )
+        self.vision_dim = (
+            self.vpm.embed_dim
+            if self.version == (2, 0)
+            else self.vpm.embeddings.embed_dim
+        )
+        self.embed_dim = self.config.hidden_size
+
+        self.resampler = self.init_resampler(
+            self.embed_dim,
+            self.vision_dim,
+            quant_config=quant_config,
+            prefix=add_prefix("resampler", prefix),
+        )
+
+        self.logits_processor = LogitsProcessor(config)
+
+    def _get_image_bounds(
+        self,
+        input_ids: torch.Tensor,
+        pad_values: List[int],
+        im_start_id: int,
+        im_end_id: int,
+        slice_start_id: Optional[int] = None,
+        slice_end_id: Optional[int] = None,
+    ) -> torch.Tensor:
+        """
+        Returns a tensor indicating the bounds (start and end token ids) of the images
+        """
+        # All the images in the batch should share the same special image
+        # bound token ids.
+        start_cond = input_ids == im_start_id
+        end_cond = input_ids == im_end_id
+        if slice_start_id is not None:
+            start_cond |= input_ids == slice_start_id
+            end_cond |= input_ids == slice_end_id
+
+        (image_start_tokens,) = torch.where(start_cond)
+        image_start_tokens += 1
+        (image_end_tokens,) = torch.where(end_cond)
+
+        # the im_start_id sometimes can be cached as prefix, but it is needed for the embedding of the images
+        if len(image_start_tokens) != len(image_end_tokens):
+            if (
+                len(image_start_tokens) + 1 == len(image_end_tokens)
+                and input_ids[0] in pad_values
+                and len(image_start_tokens) != 0
+                and len(image_end_tokens) != 0
+                and image_end_tokens[0] < image_start_tokens[0]
+            ):
+                image_start_tokens = torch.cat(
+                    [
+                        torch.tensor([0], device=image_start_tokens.device),
+                        image_start_tokens,
+                    ]
+                )
+        valid_image_nums = min(len(image_start_tokens), len(image_end_tokens))
+
+        if valid_image_nums == 0:
+            return torch.zeros((0, 2), device=input_ids.device)
+
+        # Filter out pairs where start_token >= end_token
+        valid_pairs = []
+        for i in range(valid_image_nums):
+            start_token = image_start_tokens[i]
+            end_token = image_end_tokens[i]
+            if start_token < end_token:
+                valid_pairs.append((start_token, end_token))
+
+        if not valid_pairs:
+            return torch.zeros((0, 2), device=input_ids.device)
+
+        # Convert valid pairs to tensor
+        valid_pairs_tensor = torch.tensor(valid_pairs, device=input_ids.device)
+        return valid_pairs_tensor
+
+    def _parse_and_validate_inputs(
+        self,
+        input_ids: torch.Tensor,
+        **kwargs: object,
+    ) -> Optional[MiniCPMVImageInputs]:
+        pixel_values = kwargs.pop("pixel_values", [])
+        tgt_sizes = kwargs.pop("tgt_sizes", [])
+        im_start_id = kwargs.pop("im_start_id", None)
+        im_end_id = kwargs.pop("im_end_id", None)
+        slice_start_id = kwargs.pop("slice_start_id", None)
+        slice_end_id = kwargs.pop("slice_end_id", None)
+        image_embeds = kwargs.pop("image_embeds", None)
+        pad_values = kwargs.pop("pad_values", None)
+
+        if image_embeds is not None:
+            image_bounds = self._get_image_bounds(
+                input_ids=input_ids,
+                pad_values=pad_values,
+                im_start_id=im_start_id,
+                im_end_id=im_end_id,
+                slice_start_id=slice_start_id,
+                slice_end_id=slice_end_id,
+            )
+            if not isinstance(image_embeds, (torch.Tensor, list)):
+                raise ValueError(
+                    f"Incorrect type of image embeds. "
+                    f"Got type: {type(image_embeds)}"
+                )
+
+            if isinstance(image_embeds, list):
+                image_embeds = torch.cat(image_embeds)
+
+            return MiniCPMVImageEmbeddingInputs(
+                image_bounds=image_bounds,
+                data=image_embeds,
+                type="image_embeds",
+            )
+
+        image_bounds = self._get_image_bounds(
+            input_ids=input_ids,
+            pad_values=pad_values,
+            im_start_id=im_start_id,
+            im_end_id=im_end_id,
+            slice_start_id=slice_start_id,
+            slice_end_id=slice_end_id,
+        )
+        return MiniCPMVImagePixelInputs(
+            image_bounds=image_bounds.to(device=input_ids.device),
+            data=pixel_values,
+            tgt_sizes=tgt_sizes,
+            type="pixel_values",
+        )
+
+    def get_embedding(
+        self,
+        input_ids: torch.Tensor,
+        image_inputs: Optional[MiniCPMVImageInputs],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        vlm_embedding: torch.Tensor = self.llm.get_input_embeddings(input_ids)
+
+        if image_inputs is None:  # No image
+            vision_hidden_states = torch.tensor([], device=input_ids.device)
+        else:
+            if image_inputs["type"] == "image_embeds":
+                vision_hidden_states = (
+                    image_inputs["data"]
+                    .type(vlm_embedding.dtype)
+                    .to(vlm_embedding.device)
+                )
+            else:
+                vision_hidden_states = self.get_vision_hidden_states(image_inputs)
+            # See NOTE in _parse_and_validate_inputs
+            image_bounds = image_inputs["image_bounds"]
+            if len(image_bounds) > 0:
+                image_indices = torch.stack(
+                    [
+                        torch.arange(start, end, dtype=torch.long)
+                        for start, end in image_bounds.tolist()
+                    ]
+                ).to(vlm_embedding.device)
+
+                vlm_embedding.scatter_(
+                    0,
+                    image_indices.view(-1, 1).repeat(1, vlm_embedding.shape[-1]),
+                    vision_hidden_states.view(-1, vision_hidden_states.shape[-1]),
+                )
+
+        return vlm_embedding, vision_hidden_states
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.llm.get_input_embeddings()
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        **kwargs: Any,
+    ) -> torch.Tensor:
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            multimodal_model=self,
+            language_model=self.llm,
+            positions=positions,
+        )
+        return hidden_states
+
+    def init_llm(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> nn.Module:
+        raise NotImplementedError
+
+    def init_vision_module(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig],
+        prefix: str = "",
+    ) -> nn.Module:
+        raise NotImplementedError
+
+    def init_resampler(
+        self,
+        embed_dim: int,
+        vision_dim: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> nn.Module:
+        raise NotImplementedError
+
+    def get_vision_embedding(
+        self,
+        pixel_values: List[torch.Tensor],
+        patch_attn_mask: Optional[torch.Tensor] = None,
+        tgt_sizes: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        raise NotImplementedError
+
+
+class MiniCPMV2_6(MiniCPMBaseModel):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+    # LoRA specific attributes
+    supported_lora_modules = [
+        # vision encoder
+        "fc1",
+        "fc2",
+        "out_proj",
+        # language model
+        "qkv_proj",  # same name with vision encoder
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+        # resampler
+        "kv_proj",
+    ]
+
+    # BitandBytes specific attributes
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    embedding_modules = {}
+    embedding_padding_modules = []
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__(config=config, quant_config=quant_config, prefix=prefix)
+        assert self.version == (2, 6)
+
+    def init_llm(
+        self,
+        config: Qwen2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> nn.Module:
+        return Qwen2ForCausalLM(config=config, quant_config=quant_config, prefix=prefix)
+
+    def init_vision_module(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig],
+        prefix: str = "",
+    ) -> nn.Module:
+        model = Idefics2VisionTransformer(
+            config=config.vision_config, quant_config=quant_config, prefix=prefix
+        )
+        if self.config.drop_vision_last_layer:
+            model.encoder.layers = model.encoder.layers[:-1]
+
+        setattr(model, "embed_dim", model.embeddings.embed_dim)
+        setattr(model, "patch_size", model.embeddings.patch_size)
+        return model
+
+    def init_resampler(
+        self,
+        embed_dim: int,
+        vision_dim: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> nn.Module:
+        with set_default_torch_dtype(torch.float16):
+            # The resampler in 2.6 remains consistent with the one in 2.5.
+            resampler = Resampler2_5(
+                num_queries=self.config.query_num,
+                embed_dim=embed_dim,
+                num_heads=embed_dim // 128,
+                kv_dim=vision_dim,
+                quant_config=quant_config,
+                prefix=prefix,
+            )
+
+        return resampler.to(device="cuda", dtype=torch.get_default_dtype())
+
+    def get_vision_embedding(
+        self,
+        pixel_values: List[torch.Tensor],
+        patch_attn_mask: Optional[torch.Tensor] = None,
+        tgt_sizes: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        vision_embedding = self.vpm(
+            pixel_values,
+            patch_attention_mask=patch_attn_mask,
+            tgt_sizes=tgt_sizes,
+        )
+        return vision_embedding
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        if items and items[0].format == MultimodalInputFormat.PRECOMPUTED_EMBEDDING:
+            result = torch.cat([item.feature for item in items])
+            return result.reshape(-1, result.shape[-1])
+
+        # list of tensors
+        pixel_values = flatten_nested_list([item.feature for item in items])
+        tgt_sizes = torch.stack(
+            flatten_nested_list([item.tgt_size for item in items]), dim=0
+        )
+        assert len(pixel_values) == tgt_sizes.shape[0]
+
+        device = self.vpm.embeddings.position_embedding.weight.device
+        dtype = self.vpm.embeddings.position_embedding.weight.dtype
+        all_pixel_values_lst = [
+            i.flatten(end_dim=1).permute(1, 0) for i in pixel_values
+        ]
+
+        max_patches = (tgt_sizes[:, 0] * tgt_sizes[:, 1]).max().item()
+        assert isinstance(max_patches, int)
+        all_pixel_values = torch.nn.utils.rnn.pad_sequence(
+            all_pixel_values_lst, batch_first=True, padding_value=0.0
+        )
+
+        B, L, _ = all_pixel_values.shape
+        all_pixel_values = all_pixel_values.permute(0, 2, 1).reshape(B, 3, -1, L)
+        patch_attn_mask = torch.zeros(
+            (B, 1, max_patches), dtype=torch.bool, device=device
+        )
+
+        tgt_sizes_tensor = tgt_sizes.clone().to(device=patch_attn_mask.device)
+        mask_shapes = tgt_sizes_tensor[:, 0] * tgt_sizes_tensor[:, 1]
+        patch_attn_mask[:, 0, :] = torch.arange(
+            patch_attn_mask.size(2), device=patch_attn_mask.device
+        ).unsqueeze(0) < mask_shapes.unsqueeze(1)
+
+        vision_embedding = self.vpm(
+            all_pixel_values.type(dtype),
+            patch_attention_mask=patch_attn_mask,
+            tgt_sizes=tgt_sizes,
+        )
+        return self.resampler(vision_embedding, tgt_sizes)
+
+    def pad_input_ids(self, input_ids: List[int], image_inputs: MultimodalInputs):
+        # Get all special token IDs
+        im_start_id: int = image_inputs.im_start_id
+        im_end_id: int = image_inputs.im_end_id
+        slice_start_id: int = image_inputs.slice_start_id
+        slice_end_id: int = image_inputs.slice_end_id
+
+        media_token_pairs = [(im_start_id, im_end_id), (slice_start_id, slice_end_id)]
+        pattern = MultiModalityDataPaddingPatternTokenPairs(media_token_pairs)
+
+        return pattern.pad_input_tokens(input_ids, image_inputs)
+
+
+class MiniCPMV4_0(MiniCPMBaseModel):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+    # LoRA specific attributes
+    supported_lora_modules = [
+        # vision encoder
+        "fc1",
+        "fc2",
+        "out_proj",
+        # language model
+        "qkv_proj",  # same name with vision encoder
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+        # resampler
+        "kv_proj",
+    ]
+
+    # BitandBytes specific attributes
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    embedding_modules = {}
+    embedding_padding_modules = []
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__(config=config, quant_config=quant_config, prefix=prefix)
+        assert self.version == (4, 0)
+
+    def init_llm(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> nn.Module:
+        return LlamaForCausalLM(config=config, quant_config=quant_config, prefix=prefix)
+
+    def init_vision_module(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig],
+        prefix: str = "",
+    ) -> nn.Module:
+        model = Idefics2VisionTransformer(
+            config=config.vision_config, quant_config=quant_config, prefix=prefix
+        )
+        if self.config.drop_vision_last_layer:
+            model.encoder.layers = model.encoder.layers[:-1]
+
+        setattr(model, "embed_dim", model.embeddings.embed_dim)
+        setattr(model, "patch_size", model.embeddings.patch_size)
+        return model
+
+    def init_resampler(
+        self,
+        embed_dim: int,
+        vision_dim: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> nn.Module:
+        with set_default_torch_dtype(torch.float16):
+            # The resampler in 2.6 remains consistent with the one in 2.5.
+            resampler = Resampler2_5(
+                num_queries=self.config.query_num,
+                embed_dim=embed_dim,
+                num_heads=embed_dim // 128,
+                kv_dim=vision_dim,
+                quant_config=quant_config,
+                prefix=prefix,
+            )
+
+        return resampler.to(device="cuda", dtype=torch.get_default_dtype())
+
+    def get_vision_embedding(
+        self,
+        pixel_values: List[torch.Tensor],
+        patch_attn_mask: Optional[torch.Tensor] = None,
+        tgt_sizes: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        vision_embedding = self.vpm(
+            pixel_values,
+            patch_attention_mask=patch_attn_mask,
+            tgt_sizes=tgt_sizes,
+        )
+        return vision_embedding
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        if items and items[0].format == MultimodalInputFormat.PRECOMPUTED_EMBEDDING:
+            result = torch.cat([item.feature for item in items])
+            return result.reshape(-1, result.shape[-1])
+
+        # list of tensors
+        pixel_values = flatten_nested_list([item.feature for item in items])
+        tgt_sizes = torch.stack(
+            flatten_nested_list([item.tgt_size for item in items]), dim=0
+        )
+        assert len(pixel_values) == tgt_sizes.shape[0]
+
+        device = self.vpm.embeddings.position_embedding.weight.device
+        dtype = self.vpm.embeddings.position_embedding.weight.dtype
+        all_pixel_values_lst = [
+            i.flatten(end_dim=1).permute(1, 0) for i in pixel_values
+        ]
+
+        max_patches = (tgt_sizes[:, 0] * tgt_sizes[:, 1]).max().item()
+        assert isinstance(max_patches, int)
+        all_pixel_values = torch.nn.utils.rnn.pad_sequence(
+            all_pixel_values_lst, batch_first=True, padding_value=0.0
+        )
+
+        B, L, _ = all_pixel_values.shape
+        all_pixel_values = all_pixel_values.permute(0, 2, 1).reshape(B, 3, -1, L)
+        patch_attn_mask = torch.zeros(
+            (B, 1, max_patches), dtype=torch.bool, device=device
+        )
+
+        tgt_sizes_tensor = tgt_sizes.clone().to(device=patch_attn_mask.device)
+        mask_shapes = tgt_sizes_tensor[:, 0] * tgt_sizes_tensor[:, 1]
+        patch_attn_mask[:, 0, :] = torch.arange(
+            patch_attn_mask.size(2), device=patch_attn_mask.device
+        ).unsqueeze(0) < mask_shapes.unsqueeze(1)
+
+        vision_embedding = self.vpm(
+            all_pixel_values.type(dtype),
+            patch_attention_mask=patch_attn_mask,
+            tgt_sizes=tgt_sizes,
+        )
+        return self.resampler(vision_embedding, tgt_sizes)
+
+    def pad_input_ids(self, input_ids: List[int], image_inputs: MultimodalInputs):
+        # Get all special token IDs
+        im_start_id: int = image_inputs.im_start_id
+        im_end_id: int = image_inputs.im_end_id
+        slice_start_id: int = image_inputs.slice_start_id
+        slice_end_id: int = image_inputs.slice_end_id
+
+        media_token_pairs = [(im_start_id, im_end_id), (slice_start_id, slice_end_id)]
+        pattern = MultiModalityDataPaddingPatternTokenPairs(media_token_pairs)
+
+        return pattern.pad_input_tokens(input_ids, image_inputs)
+
+
+class MiniCPMV4_5(MiniCPMBaseModel):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+    # LoRA specific attributes
+    supported_lora_modules = [
+        # vision encoder
+        "fc1",
+        "fc2",
+        "out_proj",
+        # language model
+        "qkv_proj",  # same name with vision encoder
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+        # resampler
+        "kv_proj",
+    ]
+
+    # BitandBytes specific attributes
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    embedding_modules = {}
+    embedding_padding_modules = []
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__(config=config, quant_config=quant_config, prefix=prefix)
+        assert self.version == (4, 5)
+
+    def init_llm(
+        self,
+        config: Qwen3Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> nn.Module:
+        llm = Qwen3ForCausalLM(config=config, quant_config=quant_config, prefix=prefix)
+        llm.get_input_embeddings = types.MethodType(
+            lambda self: self.model.get_input_embeddings(), llm
+        )
+        return llm
+
+    def init_vision_module(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig],
+        prefix: str = "",
+    ) -> nn.Module:
+        model = Idefics2VisionTransformer(
+            config=config.vision_config, quant_config=quant_config, prefix=prefix
+        )
+        if self.config.drop_vision_last_layer:
+            model.encoder.layers = model.encoder.layers[:-1]
+
+        setattr(model, "embed_dim", model.embeddings.embed_dim)
+        setattr(model, "patch_size", model.embeddings.patch_size)
+        return model
+
+    def init_resampler(
+        self,
+        embed_dim: int,
+        vision_dim: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> nn.Module:
+        with set_default_torch_dtype(torch.float16):
+            # The resampler in 2.6 remains consistent with the one in 2.5.
+            resampler = Resampler4_5(
+                num_queries=self.config.query_num,
+                embed_dim=embed_dim,
+                num_heads=embed_dim // 128,
+                kv_dim=vision_dim,
+                quant_config=quant_config,
+                prefix=prefix,
+            )
+
+        return resampler.to(device="cuda", dtype=torch.get_default_dtype())
+
+    def get_vision_embedding(
+        self,
+        pixel_values: List[torch.Tensor],
+        patch_attn_mask: Optional[torch.Tensor] = None,
+        tgt_sizes: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        vision_embedding = self.vpm(
+            pixel_values,
+            patch_attention_mask=patch_attn_mask,
+            tgt_sizes=tgt_sizes,
+        )
+        return vision_embedding
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        if items and items[0].format == MultimodalInputFormat.PRECOMPUTED_EMBEDDING:
+            result = torch.cat([item.feature for item in items])
+            return result.reshape(-1, result.shape[-1])
+
+        # list of tensors
+        pixel_values = flatten_nested_list([item.feature for item in items])
+        tgt_sizes = torch.stack(
+            flatten_nested_list([item.tgt_size for item in items]), dim=0
+        )
+        assert len(pixel_values) == tgt_sizes.shape[0]
+
+        device = self.vpm.embeddings.position_embedding.weight.device
+        dtype = self.vpm.embeddings.position_embedding.weight.dtype
+        all_pixel_values_lst = [
+            i.flatten(end_dim=1).permute(1, 0) for i in pixel_values
+        ]
+
+        max_patches = (tgt_sizes[:, 0] * tgt_sizes[:, 1]).max().item()
+        assert isinstance(max_patches, int)
+        all_pixel_values = torch.nn.utils.rnn.pad_sequence(
+            all_pixel_values_lst, batch_first=True, padding_value=0.0
+        )
+
+        B, L, _ = all_pixel_values.shape
+        all_pixel_values = all_pixel_values.permute(0, 2, 1).reshape(B, 3, -1, L)
+        patch_attn_mask = torch.zeros(
+            (B, 1, max_patches), dtype=torch.bool, device=device
+        )
+
+        tgt_sizes_tensor = tgt_sizes.clone().to(device=patch_attn_mask.device)
+        mask_shapes = tgt_sizes_tensor[:, 0] * tgt_sizes_tensor[:, 1]
+        patch_attn_mask[:, 0, :] = torch.arange(
+            patch_attn_mask.size(2), device=patch_attn_mask.device
+        ).unsqueeze(0) < mask_shapes.unsqueeze(1)
+
+        vision_embedding = self.vpm(
+            all_pixel_values.type(dtype),
+            patch_attention_mask=patch_attn_mask,
+            tgt_sizes=tgt_sizes,
+        )
+        return self.resampler(vision_embedding, tgt_sizes)
+
+    def pad_input_ids(self, input_ids: List[int], image_inputs: MultimodalInputs):
+        # Get all special token IDs
+        im_start_id: int = image_inputs.im_start_id
+        im_end_id: int = image_inputs.im_end_id
+        slice_start_id: int = image_inputs.slice_start_id
+        slice_end_id: int = image_inputs.slice_end_id
+
+        media_token_pairs = [(im_start_id, im_end_id), (slice_start_id, slice_end_id)]
+        pattern = MultiModalityDataPaddingPatternTokenPairs(media_token_pairs)
+
+        return pattern.pad_input_tokens(input_ids, image_inputs)
+
+    def eval(self):
+        super().eval()
+        return self
+
+
+_SUPPORT_VERSION = {(2, 6): MiniCPMV2_6, (4, 0): MiniCPMV4_0, (4, 5): MiniCPMV4_5}
+
+
+class MiniCPMV:
+    """
+    Different versions of MiniCPMV use different visual encoders and LLMs,
+    which is not conducive to the current integration logic of LoRA and
+    bitsandbytes in SGLang. Therefore, it is necessary to separate them.
+    """
+
+    # Ensure that the LoRA support check passes when the class is not
+    # initialized, but set all these attributes to empty.
+    packed_modules_mapping = {}
+    supported_lora_modules = []
+    embedding_modules = {}
+    embedding_padding_modules = []
+
+    minicpmv: nn.Module
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        if not hasattr(config, "version"):
+            version = (2, 6)
+        else:
+            version = str(config.version).split(".")
+            version = tuple([int(x) for x in version])
+        # Dispatch class based on version
+        instance_class = _SUPPORT_VERSION.get(version)
+        if instance_class is None:
+            supported_versions = ", ".join(
+                [f"{v[0]}.{v[1]}" for v in sorted(_SUPPORT_VERSION.keys())]
+            )
+            raise ValueError(
+                f"Currently, MiniCPMV only supports versions "
+                f"{supported_versions}. Got version: {version}"
+            )
+
+        try:
+            minicpmv = instance_class(
+                config=config, quant_config=quant_config, prefix=prefix
+            )
+            self.minicpmv = minicpmv
+        except Exception as e:
+            print(f"Failed to instantiate MiniCPMV: {e}")
+            raise e
+        self.config = config
+
+    def __getattr__(self, name):
+        if name == "minicpmv":
+            return None
+        return getattr(self.minicpmv, name)
+
+    def __call__(self, *args, **kwargs):
+        return self.minicpmv(*args, **kwargs)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        params_dict = dict(self.minicpmv.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq~" in name or "projector" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            if name.startswith("model.vision_tower") and name not in params_dict:
+                continue
+
+            # adapt to VisionAttention
+            name = name.replace(r"self_attn.out_proj", r"self_attn.proj")
+
+            if "sampler" in name:
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                # replace the name and load with customized loader
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = MiniCPMV
diff --git a/sglang/python/sglang/srt/models/minimax_m2.py b/sglang/python/sglang/srt/models/minimax_m2.py
new file mode 100644
index 0000000000000000000000000000000000000000..7aad87c640a0da95e58489d664972fa9b5778ae8
--- /dev/null
+++ b/sglang/python/sglang/srt/models/minimax_m2.py
@@ -0,0 +1,1152 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from DeepSeek and Mixtral implementation
+"""Inference-only MiniMax M2 model compatible with HuggingFace weights."""
+
+import logging
+from contextlib import nullcontext
+from typing import Iterable, Optional, Set, Tuple, Union
+
+import torch
+import triton
+import triton.language as tl
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.batch_overlap.two_batch_overlap import model_forward_maybe_tbo
+from sglang.srt.distributed import (
+    get_moe_expert_parallel_world_size,
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.eplb.expert_location_dispatch import ExpertLocationDispatchInfo
+from sglang.srt.layers.communicator import (
+    LayerCommunicator,
+    LayerScatterModes,
+    ScatterMode,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
+from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.moe.utils import get_moe_a2a_backend
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer, get_layer_id
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import (
+    BumpAllocator,
+    add_prefix,
+    get_compiler_backend,
+    is_non_idle_and_non_empty,
+    make_layers,
+)
+
+logger = logging.getLogger(__name__)
+
+
+@triton.jit
+def rmsnorm_sumsq_kernel_serial(
+    x1_ptr,  # T* [B, D]
+    x2_ptr,  # T* [B, D]
+    stride_x1,  # int
+    stride_x2,  # int
+    sum_sq_ptr,  # float* [B]
+    B,  # int
+    D1,  # int
+    D2,  # int
+    BLOCK_SIZE1: tl.constexpr,
+    BLOCK_SIZE2: tl.constexpr,
+):
+    row_id = tl.program_id(0)
+    x1_row = x1_ptr + row_id * stride_x1
+    x2_row = x2_ptr + row_id * stride_x2
+
+    offsets1 = tl.arange(0, BLOCK_SIZE1)
+    mask1 = offsets1 < D1
+    offsets2 = tl.arange(0, BLOCK_SIZE2)
+    mask2 = offsets2 < D2
+
+    x1 = tl.load(x1_row + offsets1, mask=mask1, other=0.0)
+    x2 = tl.load(x2_row + offsets2, mask=mask2, other=0.0)
+
+    x1_f32 = x1.to(tl.float32)
+    sum_sq1 = tl.sum(x1_f32 * x1_f32, axis=0)
+
+    x2_f32 = x2.to(tl.float32)
+    sum_sq2 = tl.sum(x2_f32 * x2_f32, axis=0)
+
+    tl.store(sum_sq_ptr + row_id, sum_sq1)
+    tl.store(sum_sq_ptr + row_id + B, sum_sq2)
+
+
+@triton.jit
+def rmsnorm_apply_kernel_serial(
+    x1_ptr,  # T* [B, D]
+    x2_ptr,  # T* [B, D]
+    w1_ptr,  # T* [D]
+    w2_ptr,  # T* [D]
+    sum_sq_ptr,  # float* [B]
+    out1_ptr,  # T* [B, D]
+    out2_ptr,  # T* [B, D]
+    B,  # int
+    D1,  # int
+    D2,  # int
+    stride_x1,  # int
+    stride_x2,  # int
+    tp_world,  # int
+    eps,  # float
+    BLOCK_SIZE1: tl.constexpr,
+    BLOCK_SIZE2: tl.constexpr,
+):
+    row_id = tl.program_id(0)
+    x1_row = x1_ptr + row_id * stride_x1
+    x2_row = x2_ptr + row_id * stride_x2
+    out1_row = out1_ptr + row_id * stride_x1
+    out2_row = out2_ptr + row_id * stride_x2
+
+    sum_sq1 = tl.load(sum_sq_ptr + row_id)
+    sum_sq2 = tl.load(sum_sq_ptr + row_id + B)
+    inv_rms1 = tl.rsqrt(sum_sq1 / D1 / tp_world + eps)
+    inv_rms2 = tl.rsqrt(sum_sq2 / D2 / tp_world + eps)
+
+    offsets1 = tl.arange(0, BLOCK_SIZE1)
+    offsets2 = tl.arange(0, BLOCK_SIZE2)
+
+    mask1 = offsets1 < D1
+    mask2 = offsets2 < D2
+
+    x1 = tl.load(x1_row + offsets1, mask=mask1, other=0.0)
+    w1 = tl.load(w1_ptr + offsets1, mask=mask1, other=1.0)
+    x2 = tl.load(x2_row + offsets2, mask=mask2, other=0.0)
+    w2 = tl.load(w2_ptr + offsets2, mask=mask2, other=1.0)
+
+    out1 = (x1.to(tl.float32) * inv_rms1 * w1.to(tl.float32)).to(x1.dtype)
+    out2 = (x2.to(tl.float32) * inv_rms2 * w2.to(tl.float32)).to(x2.dtype)
+    tl.store(out1_row + offsets1, out1, mask=mask1)
+    tl.store(out2_row + offsets2, out2, mask=mask2)
+
+
+def rms_sumsq_serial(x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor:
+    assert x1.is_cuda and x2.is_cuda
+    B, D1 = x1.shape
+    B2, D2 = x2.shape
+    assert B == B2
+
+    stride_x1 = x1.stride(0)
+    stride_x2 = x2.stride(0)
+
+    # We found that custom all-reduce `sglang::cross_device_reduce_1stage`
+    # is much faster than the nccl all-reduce in torch.
+    # However, `should_custom_ar` checks if the reduced buffer is 16-byte aligned.
+    # RMSNormTP reduces a [B, 2] fp32 tensor, so we pad the total element count to
+    # satisfy the alignment requirement.
+    B_padded = (B + B2 + 3) // 4 * 4
+
+    sum_sq = torch.empty(B_padded, device=x1.device, dtype=torch.float32)
+
+    BLOCK_SIZE1 = triton.next_power_of_2(D1)
+    BLOCK_SIZE2 = triton.next_power_of_2(D2)
+
+    grid = (B,)
+
+    rmsnorm_sumsq_kernel_serial[grid](
+        x1,
+        x2,
+        stride_x1,
+        stride_x2,
+        sum_sq,
+        B,
+        D1,
+        D2,
+        BLOCK_SIZE1,
+        BLOCK_SIZE2,
+    )
+    return sum_sq
+
+
+def rms_apply_serial(
+    x1: torch.Tensor,
+    x2: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    sum_sq: torch.Tensor,
+    tp_world: int = 1,
+    eps: float = 1e-5,
+) -> torch.Tensor:
+    assert x1.is_cuda and x2.is_cuda and w1.is_cuda and w2.is_cuda and sum_sq.is_cuda
+    B, D1 = x1.shape
+    B2, D2 = x2.shape
+    assert B == B2
+
+    stride_x1 = x1.stride(0)
+    stride_x2 = x2.stride(0)
+    out1 = torch.empty(B, D1, device=x1.device, dtype=x1.dtype)
+    out2 = torch.empty(B, D2, device=x2.device, dtype=x2.dtype)
+
+    BLOCK_SIZE1 = triton.next_power_of_2(D1)
+    BLOCK_SIZE2 = triton.next_power_of_2(D2)
+
+    grid = (B,)
+
+    rmsnorm_apply_kernel_serial[grid](
+        x1,
+        x2,
+        w1,
+        w2,
+        sum_sq,
+        out1,
+        out2,
+        B,
+        D1,
+        D2,
+        stride_x1,
+        stride_x2,
+        tp_world,
+        eps,
+        BLOCK_SIZE1,
+        BLOCK_SIZE2,
+    )
+    return out1, out2
+
+
+class MiniMaxM2RMSNormTP(nn.Module):
+    """RMSNorm with Tensor Parallel support for QK normalization."""
+
+    def __init__(self, hidden_size: int, eps: float = 1e-6) -> None:
+        super().__init__()
+        self.tp_world = get_tensor_model_parallel_world_size()
+        self.tp_rank = get_tensor_model_parallel_rank()
+
+        # Weight parameter is sharded across TP ranks
+        self.weight = nn.Parameter(torch.ones(int(hidden_size / self.tp_world)))
+        self.weight.weight_loader = self.weight_loader
+        self.variance_epsilon = eps
+
+    @staticmethod
+    def weight_loader(
+        param: nn.Parameter,
+        loaded_weight: torch.Tensor,
+    ) -> None:
+        """Custom weight loader that handles TP sharding."""
+        tp_world = get_tensor_model_parallel_world_size()
+        tp_rank = get_tensor_model_parallel_rank()
+
+        shard_size = loaded_weight.shape[0] // tp_world
+        shard = slice(tp_rank * shard_size, (tp_rank + 1) * shard_size)
+        param.data.copy_(loaded_weight[shard])
+
+    @torch.compile(dynamic=True, backend=get_compiler_backend())
+    def forward(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        """Forward pass with TP-aware variance computation."""
+        assert residual is None, "RMSNormTP does not support residual connection."
+
+        orig_dtype = x.dtype
+        x = x.to(torch.float32)
+
+        # Compute variance across the full dimension (not just local shard)
+        variance = x.pow(2).mean(dim=-1, keepdim=True, dtype=torch.float32)
+
+        if self.tp_world > 1:
+            # All-reduce variance across TP ranks to get global variance
+            variance = tensor_model_parallel_all_reduce(variance) / self.tp_world
+
+        # Normalize and apply local weight shard
+        x = x * torch.rsqrt(variance + self.variance_epsilon)
+        x = (x * self.weight).to(orig_dtype)
+
+        return x
+
+    @staticmethod
+    def forward_qk(
+        q_norm: "MiniMaxM2RMSNormTP",
+        k_norm: "MiniMaxM2RMSNormTP",
+        q: torch.Tensor,
+        k: torch.Tensor,
+    ) -> torch.Tensor:
+        sum_sq = rms_sumsq_serial(q, k)
+        if q_norm.tp_world > 1:
+            sum_sq = tensor_model_parallel_all_reduce(sum_sq)
+
+        q, k = rms_apply_serial(
+            q,
+            k,
+            q_norm.weight,
+            k_norm.weight,
+            sum_sq,
+            q_norm.tp_world,
+            q_norm.variance_epsilon,
+        )
+
+        return q, k
+
+
+class MiniMaxM2MoE(nn.Module):
+    """MiniMax MoE implementation using DeepEP for Expert Parallel support."""
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        if self.tp_size > config.num_local_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {config.num_local_experts}."
+            )
+        self.use_routing_bias = getattr(config, "use_routing_bias", False)
+        if self.use_routing_bias:
+            self.e_score_correction_bias = nn.Parameter(
+                torch.empty(config.num_local_experts, dtype=torch.float32)
+            )
+            self.e_score_correction_bias.weight_loader = (
+                MiniMaxM2MoE.ebias_weight_loader
+            )
+        else:
+            self.e_score_correction_bias = None
+
+        self.experts = get_moe_impl_class(quant_config)(
+            num_experts=config.num_local_experts
+            + get_global_server_args().ep_num_redundant_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("experts", prefix),
+        )
+        self.topk = TopK(
+            top_k=config.num_experts_per_tok,
+            renormalize=True,
+            scoring_func=config.scoring_func,
+            correction_bias=self.e_score_correction_bias,
+            routed_scaling_factor=1.0,
+        )
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            config.num_local_experts,
+            bias=False,
+            params_dtype=torch.float32,
+            quant_config=None,
+            prefix=add_prefix("gate", prefix),
+        )
+
+        self.layer_id = layer_id
+
+        if get_moe_a2a_backend().is_deepep():
+            self.ep_size = get_moe_expert_parallel_world_size()
+            self.top_k = config.num_experts_per_tok
+
+    @staticmethod
+    def ebias_weight_loader(param: nn.Parameter, loaded_weight: torch.Tensor) -> None:
+        assert param.size() == loaded_weight.size()
+        param.data.copy_(loaded_weight.to(torch.float32))
+
+    def forward(
+        self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
+    ) -> torch.Tensor:
+        if get_moe_a2a_backend().is_deepep():
+            return self.forward_deepep(hidden_states, forward_batch)
+        else:
+            return self.forward_normal(hidden_states)
+
+    def forward_normal(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        num_tokens, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+
+        # router_logits: (num_tokens, n_experts)
+        router_logits, _ = self.gate(hidden_states.to(torch.float32))
+        topk_output = self.topk(hidden_states, router_logits)
+
+        final_hidden_states = self.experts(hidden_states, topk_output)
+        if self.tp_size > 1:
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+
+        return final_hidden_states.view(num_tokens, hidden_dim)
+
+    def forward_deepep(
+        self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
+    ) -> torch.Tensor:
+        if hidden_states.shape[0] > 0:
+            # router_logits: (num_tokens, n_experts)
+            router_logits, _ = self.gate(hidden_states.to(torch.float32))
+            topk_output = self.topk(
+                hidden_states,
+                router_logits,
+                num_token_non_padded=forward_batch.num_token_non_padded,
+                expert_location_dispatch_info=ExpertLocationDispatchInfo.init_new(
+                    layer_id=self.layer_id,
+                ),
+            )
+        else:
+            topk_output = self.topk.empty_topk_output(device=hidden_states.device)
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states,
+            topk_output=topk_output,
+        )
+
+        return final_hidden_states
+
+    # TBO Operations for MiniMax MoE
+    def op_gate(self, state):
+        """Gate operation for TBO - compute router logits"""
+        if is_non_idle_and_non_empty(
+            state.forward_batch.forward_mode, state.hidden_states_mlp_input
+        ):  # router_logits: (num_tokens, num_experts)
+            state.router_logits, _ = self.gate(state.hidden_states_mlp_input)
+        else:
+            state.router_logits = None
+
+    def op_select_experts(self, state):
+        """Expert selection operation for TBO"""
+        router_logits = state.pop("router_logits")
+        hidden_states = state.hidden_states_mlp_input
+
+        if router_logits is not None:
+            ctx = (
+                nullcontext()
+                if not get_global_server_args().disable_piecewise_cuda_graph
+                else get_global_expert_distribution_recorder().with_current_layer(
+                    self.layer_id
+                )
+            )
+            with ctx:
+                state.topk_weights_local, state.topk_idx_local, _ = self.topk(
+                    hidden_states=hidden_states,
+                    router_logits=router_logits,
+                    num_token_non_padded=state.forward_batch.num_token_non_padded,
+                    expert_location_dispatch_info=ExpertLocationDispatchInfo.init_new(
+                        layer_id=self.layer_id,
+                    ),
+                )
+        else:
+            state.topk_idx_local = torch.full(
+                (0, self.top_k), -1, dtype=torch.int, device=hidden_states.device
+            )
+            state.topk_weights_local = torch.empty(
+                (0, self.top_k), dtype=torch.float32, device=hidden_states.device
+            )
+
+    def op_dispatch_a(self, state):
+        """Dispatch A operation for TBO - start async dispatch"""
+        if self.ep_size > 1:
+            self.experts.deepep_dispatcher.dispatch_a(
+                hidden_states=state.pop("hidden_states_mlp_input"),
+                topk_idx=state.pop("topk_idx_local"),
+                topk_weights=state.pop("topk_weights_local"),
+                forward_batch=state.forward_batch,
+                tbo_subbatch_index=state.get("tbo_subbatch_index"),
+            )
+
+    def op_dispatch_b(self, state):
+        """Dispatch B operation for TBO - complete async dispatch"""
+        if self.ep_size > 1:
+            ctx = (
+                nullcontext()
+                if not get_global_server_args().disable_piecewise_cuda_graph
+                else get_global_expert_distribution_recorder().with_current_layer(
+                    self.layer_id
+                )
+            )
+            with ctx:
+                state.dispatch_output = self.experts.deepep_dispatcher.dispatch_b(
+                    tbo_subbatch_index=state.get("tbo_subbatch_index"),
+                )
+
+    def op_experts(self, state):
+        """Expert computation for TBO"""
+        state.hidden_states_experts_output = self.experts.moe_impl(
+            dispatch_output=state.dispatch_output,
+        )
+
+    def op_combine_a(self, state):
+        """Combine A operation for TBO - start async combine"""
+        if self.ep_size > 1:
+            self.experts.deepep_dispatcher.combine_a(
+                hidden_states=state.pop("hidden_states_experts_output"),
+                topk_idx=state.dispatch_output.topk_idx,
+                topk_weights=state.dispatch_output.topk_weights,
+                forward_batch=state.forward_batch,
+                tbo_subbatch_index=state.get("tbo_subbatch_index"),
+            )
+            state.pop("dispatch_output")
+
+    def op_combine_b(self, state):
+        """Combine B operation for TBO - complete async combine"""
+        if self.ep_size > 1:
+            state.hidden_states_after_combine = (
+                self.experts.deepep_dispatcher.combine_b(
+                    tbo_subbatch_index=state.get("tbo_subbatch_index"),
+                )
+            )
+
+    def op_output(self, state):
+        """Output operation for TBO - final MLP output"""
+        final_hidden_states = state.pop("hidden_states_after_combine")
+        # MiniMax doesn't have shared experts like DeepSeek, so no need to add them
+        state.hidden_states_mlp_output = final_hidden_states
+
+
+class MiniMaxM2Attention(nn.Module):
+    """MiniMax Attention implementation with QK normalization and partial RoPE."""
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+
+        # Get dimensions from config
+        self.total_num_heads = config.num_attention_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = config.num_key_value_heads
+
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+
+        # Use head_dim from config if available, otherwise calculate
+        self.head_dim = getattr(
+            config, "head_dim", self.hidden_size // self.total_num_heads
+        )
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+
+        # RoPE settings - support partial RoPE
+        self.rope_theta = getattr(config, "rope_theta", 10000)
+        self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        self.rotary_dim = getattr(
+            config, "rotary_dim", self.head_dim
+        )  # MiniMax uses rotary_dim=64
+
+        # QK Normalization settings
+        self.use_qk_norm = getattr(config, "use_qk_norm", False)
+        self.qk_norm_type = getattr(config, "qk_norm_type", "per_layer")
+
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            self.hidden_size,
+            bias=False,
+            reduce_results=False,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        # Setup RoPE with partial rotary dimension
+        rope_scaling = getattr(config, "rope_scaling", None)
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.rotary_dim,  # Use partial rotary dimension
+            max_position=self.max_position_embeddings,
+            base=self.rope_theta,
+            rope_scaling=rope_scaling,
+        )
+
+        # QK Normalization layers
+        if self.use_qk_norm:
+            if self.qk_norm_type == "per_layer":
+                # Use RMSNormTP for proper tensor parallel support
+                # Use total dimensions (before TP sharding) for correct normalization
+                self.q_norm = MiniMaxM2RMSNormTP(
+                    self.total_num_heads * self.head_dim, eps=config.rms_norm_eps
+                )
+                self.k_norm = MiniMaxM2RMSNormTP(
+                    self.total_num_kv_heads * self.head_dim, eps=config.rms_norm_eps
+                )
+            else:
+                raise ValueError(f"Unsupported qk_norm_type: {self.qk_norm_type}")
+
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward_prepare(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ):
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        if self.use_qk_norm:
+            # q = self.q_norm(q.contiguous())
+            # k = self.k_norm(k.contiguous())
+            q, k = MiniMaxM2RMSNormTP.forward_qk(
+                self.q_norm, self.k_norm, q.contiguous(), k.contiguous()
+            )
+        else:
+            q, k = q.contiguous(), k.contiguous()
+        q, k = self.rotary_emb(positions, q, k)
+        inner_state = q, k, v, forward_batch
+        return None, forward_batch, inner_state
+
+    def forward_core(self, intermediate_state):
+        _, _, inner_state = intermediate_state
+        attn_output = self.attn(*inner_state)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        s = self.forward_prepare(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        return self.forward_core(s)
+
+    def op_prepare(self, state):
+        state.attn_intermediate_state = self.forward_prepare(
+            positions=state.positions,
+            hidden_states=state.pop("hidden_states_after_comm_pre_attn"),
+            forward_batch=state.forward_batch,
+        )
+
+    def op_core(self, state):
+        state.hidden_states_after_attn = self.forward_core(
+            state.pop("attn_intermediate_state")
+        )
+
+
+class MiniMaxM2DecoderLayer(nn.Module):
+    """MiniMax Decoder Layer implementation with MoE support."""
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.layer_id = layer_id
+
+        # TBO support: All MiniMax layers are sparse (MoE)
+        self.is_layer_sparse = True
+
+        self.self_attn = MiniMaxM2Attention(
+            config=config,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+
+        self.block_sparse_moe = MiniMaxM2MoE(
+            config=config,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+
+        self.input_layernorm = RMSNorm(
+            config.hidden_size, eps=getattr(config, "rms_norm_eps", 1e-6)
+        )
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=getattr(config, "rms_norm_eps", 1e-6)
+        )
+
+        is_previous_layer_sparse = True
+        is_next_layer_sparse = True
+        self.layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=layer_id,
+            num_layers=config.num_hidden_layers,
+            is_layer_sparse=self.is_layer_sparse,
+            is_previous_layer_sparse=is_previous_layer_sparse,
+            is_next_layer_sparse=is_next_layer_sparse,
+        )
+
+        self.layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.layer_scatter_modes,
+            input_layernorm=self.input_layernorm,
+            post_attention_layernorm=self.post_attention_layernorm,
+            allow_reduce_scatter=True,
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        # Self Attention
+        hidden_states, residual = self.layer_communicator.prepare_attn(
+            hidden_states, residual, forward_batch
+        )
+
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        # Fully Connected (MLP or MoE)
+
+        hidden_states, residual = self.layer_communicator.prepare_mlp(
+            hidden_states, residual, forward_batch
+        )
+
+        hidden_states = self.block_sparse_moe(hidden_states, forward_batch)
+
+        hidden_states, residual = self.layer_communicator.postprocess_layer(
+            hidden_states, residual, forward_batch
+        )
+
+        return hidden_states, residual
+
+    # TBO Operations for MiniMax Decoder Layer
+    def op_comm_prepare_attn(
+        self,
+        state,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+        zero_allocator: BumpAllocator,
+        tbo_subbatch_index: Optional[int] = None,
+    ):
+        """Communication prepare for attention - TBO operation"""
+        state.hidden_states_after_comm_pre_attn, state.residual_after_input_ln = (
+            self.layer_communicator.prepare_attn(hidden_states, residual, forward_batch)
+        )
+        state.update(
+            dict(
+                forward_batch=forward_batch,
+                positions=positions,
+                zero_allocator=zero_allocator,
+                tbo_subbatch_index=tbo_subbatch_index,
+            )
+        )
+
+    def op_comm_prepare_mlp(self, state):
+        """Communication prepare for MLP - TBO operation"""
+        state.hidden_states_mlp_input, state.residual_after_comm_pre_mlp = (
+            self.layer_communicator.prepare_mlp(
+                state.pop("hidden_states_after_attn"),
+                state.pop("residual_after_input_ln"),
+                state.forward_batch,
+            )
+        )
+
+    def op_mlp(self, state):
+        hidden_states = state.pop("hidden_states_mlp_input")
+        state.hidden_states_mlp_output = self.block_sparse_moe(
+            hidden_states, state.forward_batch
+        )
+
+    def op_comm_postprocess_layer(self, state):
+        """Communication postprocess for layer - TBO operation"""
+        hidden_states, residual = self.layer_communicator.postprocess_layer(
+            state.pop("hidden_states_mlp_output"),
+            state.pop("residual_after_comm_pre_mlp"),
+            state.forward_batch,
+        )
+
+        output = dict(
+            positions=state.positions,
+            hidden_states=hidden_states,
+            residual=residual,
+            forward_batch=state.forward_batch,
+            zero_allocator=state.zero_allocator,
+            tbo_subbatch_index=state.tbo_subbatch_index,
+        )
+        return output
+
+
+class MiniMaxM2Model(nn.Module):
+    """MiniMax Model implementation."""
+
+    fall_back_to_pt_during_load = False
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.padding_idx = getattr(config, "pad_token_id", 0)
+        self.vocab_size = config.vocab_size
+        self.pp_group = get_pp_group()
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+        )
+
+        def layer_fn(idx, prefix: str) -> nn.Module:
+            return MiniMaxM2DecoderLayer(
+                config=config,
+                layer_id=idx,
+                quant_config=quant_config,
+                prefix=prefix,
+            )
+
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            layer_fn,
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix=add_prefix("layers", prefix),
+        )
+        if self.pp_group.is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer(return_tuple=True)
+
+        # For EAGLE3 support
+        self.layers_to_capture = []
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[torch.Tensor, PPProxyTensors, Tuple[torch.Tensor, list[torch.Tensor]]]:
+        if self.pp_group.is_first_rank:
+            if input_embeds is None:
+                hidden_states = self.get_input_embeddings(input_ids)
+            else:
+                hidden_states = input_embeds
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+
+        aux_hidden_states = []
+        if forward_batch.can_run_tbo:
+            hidden_states, residual = model_forward_maybe_tbo(
+                layers=self.layers,
+                enable_tbo=True,
+                input_data_scatter_mode=ScatterMode.model_input_output(),
+                positions=positions,
+                forward_batch=forward_batch,
+                hidden_states=hidden_states,
+                residual=residual,
+            )
+        else:
+            for i in range(self.start_layer, self.end_layer):
+                ctx = (
+                    nullcontext()
+                    if not get_global_server_args().disable_piecewise_cuda_graph
+                    else get_global_expert_distribution_recorder().with_current_layer(i)
+                )
+                with ctx:
+                    if i in self.layers_to_capture:
+                        aux_hidden_states.append(hidden_states + residual)
+                    layer = self.layers[i]
+                    hidden_states, residual = layer(
+                        positions=positions,
+                        forward_batch=forward_batch,
+                        hidden_states=hidden_states,
+                        residual=residual,
+                    )
+
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
+
+        if residual is not None:
+            hidden_states, _ = self.norm(hidden_states, residual)
+        else:
+            hidden_states = self.norm(hidden_states)
+
+        if len(aux_hidden_states) == 0:
+            return hidden_states
+        return hidden_states, aux_hidden_states
+
+
+class MiniMaxM2ForCausalLM(nn.Module):
+    """MiniMax M2 model for causal language modeling."""
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        self.quant_config = quant_config
+
+        self.model = MiniMaxM2Model(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+
+        if get_pp_group().is_last_rank:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=None,
+                prefix=add_prefix("lm_head", prefix),
+            )
+        else:
+            self.lm_head = PPMissingLayer()
+
+        self.logits_processor = LogitsProcessor(config)
+        self.pp_group = get_pp_group()
+
+        # For EAGLE3
+        self.capture_aux_hidden_states = False
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    def set_eagle3_layers_to_capture(self, layer_ids: Optional[list[int]] = None):
+        if not get_pp_group().is_last_rank:
+            return
+
+        self.capture_aux_hidden_states = True
+        if layer_ids is None:
+            num_layers = self.config.num_hidden_layers
+            self.model.layers_to_capture = [
+                2,
+                num_layers // 2,
+                num_layers - 3,
+            ]  # Specific layers for EAGLE3 support
+        else:
+            self.model.layers_to_capture = [val + 1 for val in layer_ids]
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            input_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+
+        aux_hidden_states = None
+        if self.capture_aux_hidden_states:
+            hidden_states, aux_hidden_states = hidden_states
+
+        if self.pp_group.is_last_rank:
+            return self.logits_processor(
+                input_ids, hidden_states, self.lm_head, forward_batch, aux_hidden_states
+            )
+        else:
+            return hidden_states
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        """Load model weights with proper mapping for MiniMax architecture."""
+
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="w1",
+            ckpt_down_proj_name="w2",
+            ckpt_up_proj_name="w3",
+            num_experts=self.config.num_local_experts,
+        )
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: Set[str] = set()
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            layer_id = get_layer_id(name)
+            if (
+                layer_id is not None
+                and hasattr(self.model, "start_layer")
+                and (
+                    layer_id < self.model.start_layer
+                    or layer_id >= self.model.end_layer
+                )
+            ):
+                continue
+
+            spec_layer = get_spec_layer_idx_from_weight_name(self.config, name)
+            if spec_layer is not None:
+                continue  # skip spec decode layers for main model
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                # Skip non-stacked layers and experts (experts handled below).
+                if weight_name not in name:
+                    continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if ("mlp.experts." in name) and name not in params_dict:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name not in params_dict:
+                    continue
+
+                if name.endswith(".bias"):
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+
+                    if name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+
+                    # Remapping the name of FP8 kv-scale.
+                    name = maybe_remap_kv_scale_name(name, params_dict)
+                    if name is None:
+                        continue
+
+                    if name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+    @classmethod
+    def get_model_config_for_expert_location(cls, config):
+        from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation
+
+        return ModelConfigForExpertLocation(
+            num_layers=config.num_hidden_layers,
+            num_logical_experts=config.num_local_experts,
+            num_groups=None,
+        )
+
+
+def get_spec_layer_idx_from_weight_name(
+    config: PretrainedConfig, weight_name: str
+) -> Optional[int]:
+    if hasattr(config, "num_mtp_modules") and (config.num_mtp_modules > 0):
+        layer_idx = config.num_hidden_layers
+        for i in range(config.num_mtp_modules):
+            if weight_name.startswith(f"model.layers.{layer_idx + i}."):
+                return layer_idx + i
+    return None
+
+
+# Entry class for model registration
+EntryClass = MiniMaxM2ForCausalLM
diff --git a/sglang/python/sglang/srt/models/ministral3.py b/sglang/python/sglang/srt/models/ministral3.py
new file mode 100644
index 0000000000000000000000000000000000000000..460c7b30fb5ed984e5eefbcbcce9515342161c80
--- /dev/null
+++ b/sglang/python/sglang/srt/models/ministral3.py
@@ -0,0 +1,157 @@
+from typing import Any, Dict, Optional
+
+import torch
+from transformers import PretrainedConfig
+
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.llama import (
+    LlamaAttention,
+    LlamaDecoderLayer,
+    LlamaForCausalLM,
+    LlamaModel,
+)
+from sglang.srt.utils import add_prefix, make_layers
+
+
+def _get_llama_4_attn_scale(
+    positions_ids: torch.Tensor, beta: float, max_position_embeddings: int
+) -> torch.Tensor:
+    scaling = 1 + beta * torch.log(
+        1 + torch.floor(positions_ids / max_position_embeddings)
+    )
+    return scaling.unsqueeze(-1)
+
+
+class Ministral3Attention(LlamaAttention):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 1000000.0,
+        rope_scaling: Optional[Dict[str, Any]] = {},
+        rope_is_neox_style: bool = True,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        bias: bool = False,
+    ) -> None:
+        super().__init__(
+            config,
+            hidden_size,
+            num_heads,
+            num_kv_heads,
+            layer_id,
+            rope_theta,
+            rope_scaling,
+            rope_is_neox_style,
+            max_position_embeddings,
+            quant_config,
+            prefix,
+            bias,
+        )
+        # Ministral3 specific: llama 4 style scaling beta
+        self.llama_4_scaling_beta = None
+        if hasattr(config, "rope_parameters") and config.rope_parameters:
+            self.llama_4_scaling_beta = config.rope_parameters.get(
+                "llama_4_scaling_beta"
+            )
+
+        # sliding window
+        self.sliding_window = getattr(config, "sliding_window", None)
+        if self.sliding_window is not None:
+            # Update RadixAttention with sliding window if needed
+            # currently RadixAttention in sglang handles this mostly via logic in forward/flashinfer
+            pass
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+
+        # Apply RoPE
+        q, k = self.rotary_emb(positions, q, k)
+
+        # Ministral3 / Llama 4 scaling
+        if self.llama_4_scaling_beta is not None:
+            scale = _get_llama_4_attn_scale(
+                positions, self.llama_4_scaling_beta, self.max_position_embeddings
+            ).to(q.dtype)
+            # q shape is [batch_size * seq_len, num_heads * head_dim] or [batch_size * seq_len, num_heads, head_dim]
+            # positions is [batch_size * seq_len]
+            # scale is [batch_size * seq_len, 1]
+            # We need to reshape q to apply scale correctly if it's flattened
+            # Assuming q is (total_tokens, num_heads * head_dim)
+            q = q.view(-1, self.num_heads, self.head_dim)
+            q = q * scale.unsqueeze(1)  # Broadcast over heads
+            q = q.view(-1, self.num_heads * self.head_dim)
+
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class Ministral3DecoderLayer(LlamaDecoderLayer):
+    def __init__(self, config, layer_id=0, quant_config=None, prefix=""):
+        super().__init__(config, layer_id, quant_config, prefix)
+        self.self_attn = Ministral3Attention(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=getattr(config, "rope_parameters", {}).get(
+                "rope_theta", 1000000.0
+            ),
+            rope_scaling=getattr(
+                config, "rope_parameters", {}
+            ),  # rope_scaling is rope_parameters in Ministral3Config
+            max_position_embeddings=getattr(
+                config, "original_max_position_embeddings", 16384
+            ),
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+            bias=getattr(config, "attention_bias", False)
+            or getattr(config, "bias", False),
+        )
+
+
+class Ministral3Model(LlamaModel):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        # Override layer creation to use Ministral3Attention
+        super().__init__(config, quant_config, prefix)
+
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: Ministral3DecoderLayer(
+                config=config, quant_config=quant_config, layer_id=idx, prefix=prefix
+            ),
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix="model.layers",
+        )
+
+
+class Ministral3ForCausalLM(LlamaForCausalLM):
+    def _init_model(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        return Ministral3Model(config, quant_config, prefix=prefix)
+
+
+EntryClass = [Ministral3ForCausalLM]
diff --git a/sglang/python/sglang/srt/models/mistral.py b/sglang/python/sglang/srt/models/mistral.py
new file mode 100644
index 0000000000000000000000000000000000000000..632e857c280ba3114ff3ca6019b0f835ddbb2063
--- /dev/null
+++ b/sglang/python/sglang/srt/models/mistral.py
@@ -0,0 +1,93 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Inference-only Mistral model."""
+
+from typing import List
+
+import torch
+from transformers.models.mistral3.modeling_mistral3 import Mistral3MultiModalProjector
+
+from sglang.srt.managers.schedule_batch import MultimodalDataItem
+from sglang.srt.models.llama import LlamaForCausalLM
+
+
+class MistralForCausalLM(LlamaForCausalLM):
+    pass
+
+
+class Mistral3ForConditionalGeneration:
+    MULTIMODAL_PROJECTOR_TYPE = Mistral3MultiModalProjector
+
+    def __init__(self, **kwargs):
+        # lazy load inner class
+        # to bypass circular import
+        from sglang.srt.models.llava import LlavaForConditionalGeneration
+
+        # override config: mistral's projector adds patchmerger that doesn't require padding
+        kwargs["config"].vision_config.pad_image_border = False
+
+        self.inner = LlavaForConditionalGeneration(**kwargs)
+        self.inner.multi_modal_projector = self.MULTIMODAL_PROJECTOR_TYPE(
+            kwargs["config"]
+        )
+        self.inner.get_image_feature = self.get_image_feature
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        """Extract features from image inputs.
+
+        Args:
+            items: List of MultimodalDataItem objects containing image data
+                Note that an item can be either "image" or "multi-images"
+
+        Returns:
+            torch.Tensor: features from image inputs, concatenated
+        """
+        features = []
+        for item in items:
+            # in each item, we assume pixel_values is always batched
+            pixel_values, image_sizes = item.feature, item.image_sizes
+            image_outputs = self.vision_tower(
+                pixel_values, image_sizes, output_hidden_states=True
+            )
+            selected_image_feature = image_outputs.hidden_states[
+                self.vision_feature_layer
+            ]
+
+            if self.vision_feature_select_strategy in ["default", "patch"]:
+                selected_image_feature = selected_image_feature[:, 1:]
+            elif self.vision_feature_select_strategy == "full":
+                selected_image_feature = selected_image_feature
+            else:
+                raise ValueError(
+                    f"Unexpected select feature: {self.vision_feature_select_strategy}"
+                )
+            features.append(
+                self.multi_modal_projector(
+                    selected_image_feature.squeeze(0), image_sizes
+                )
+            )
+        ret = torch.cat(features, dim=0)
+        return ret
+
+    def __getattr__(self, name):
+        return getattr(self.inner, name)
+
+    def __hasattr__(self, name):
+        return hasattr(self.inner, name)
+
+    def __call__(self, *args, **kwargs):
+        return self.inner(*args, **kwargs)
+
+
+EntryClass = [MistralForCausalLM, Mistral3ForConditionalGeneration]
diff --git a/sglang/python/sglang/srt/models/mistral_large_3.py b/sglang/python/sglang/srt/models/mistral_large_3.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbbf893b6e9d34e6b9758a83e7140ffe6184b4ec
--- /dev/null
+++ b/sglang/python/sglang/srt/models/mistral_large_3.py
@@ -0,0 +1,78 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/mistral_large_3.py
+# SPDX-License-Identifier: Apache-2.0
+from collections.abc import Iterable
+
+import regex as re
+import torch
+
+from sglang.srt.models.deepseek_v2 import DeepseekV3ForCausalLM
+
+
+class MistralLarge3ForCausalLM(DeepseekV3ForCausalLM):
+    # fmt: off
+    remapping = {
+        r"layers\.(\d+)\.attention_norm\.weight": r"model.layers.\1.input_layernorm.weight",  # noqa: E501
+        r"layers\.(\d+)\.attention\.wq\.(\w+)": r"model.layers.\1.self_attn.q_proj.\2",  # noqa: E501
+        r"layers\.(\d+)\.attention\.wq_a\.(\w+)": r"model.layers.\1.self_attn.q_a_proj.\2",  # noqa: E501
+        r"layers\.(\d+)\.attention\.q_a_norm\.weight": r"model.layers.\1.self_attn.q_a_layernorm.weight",  # noqa: E501
+        r"layers\.(\d+)\.attention\.wq_b\.(\w+)": r"model.layers.\1.self_attn.q_b_proj.\2",  # noqa: E501
+        r"layers\.(\d+)\.attention\.wkv_a_with_mqa\.(\w+)": r"model.layers.\1.self_attn.kv_a_proj_with_mqa.\2",  # noqa: E501
+        r"layers\.(\d+)\.attention\.kv_a_norm\.weight": r"model.layers.\1.self_attn.kv_a_layernorm.weight",  # noqa: E501
+        r"layers\.(\d+)\.attention\.wkv_b\.(\w+)": r"model.layers.\1.self_attn.kv_b_proj.\2",  # noqa: E501
+        r"layers\.(\d+)\.attention\.wo\.(\w+)": r"model.layers.\1.self_attn.o_proj.\2",  # noqa: E501
+        # FP8 scales
+        r"layers\.(\d+)\.attention\.k_fake_quantizer\.qscale_act": r"model.layers.\1.self_attn.mla_attn.mla_attn.k_scale",  # noqa: E501
+        r"layers\.(\d+)\.attention\.q_fake_quantizer\.qscale_act": r"model.layers.\1.self_attn.mla_attn.mla_attn.q_scale",  # noqa: E501
+        r"layers\.(\d+)\.attention\.v_fake_quantizer\.qscale_act": r"model.layers.\1.self_attn.mla_attn.mla_attn.v_scale",  # noqa: E501
+        r"layers\.(\d+)\.ffn_norm\.weight": r"model.layers.\1.post_attention_layernorm.weight",  # noqa: E501
+        r"layers\.(\d+)\.feed_forward\.w1\.(\w+)": r"model.layers.\1.mlp.gate_proj.\2",  # noqa: E501
+        r"layers\.(\d+)\.feed_forward\.w2\.(\w+)": r"model.layers.\1.mlp.down_proj.\2",  # noqa: E501
+        r"layers\.(\d+)\.feed_forward\.w3\.(\w+)": r"model.layers.\1.mlp.up_proj.\2",  # noqa: E501
+        r"layers\.(\d+)\.gate\.weight": r"model.layers.\1.mlp.gate.weight",  # noqa: E501
+        r"layers\.(\d+)\.shared_experts\.w1\.(\w+)": r"model.layers.\1.mlp.shared_experts.gate_proj.\2",  # noqa: E501
+        r"layers\.(\d+)\.shared_experts\.w2\.(\w+)": r"model.layers.\1.mlp.shared_experts.down_proj.\2",  # noqa: E501
+        r"layers\.(\d+)\.shared_experts\.w3\.(\w+)": r"model.layers.\1.mlp.shared_experts.up_proj.\2",  # noqa: E501
+        r"layers\.(\d+)\.experts\.(\d+)\.w1\.(\w+)": r"model.layers.\1.mlp.experts.\2.gate_proj.\3",  # noqa: E501
+        r"layers\.(\d+)\.experts\.(\d+)\.w2\.(\w+)": r"model.layers.\1.mlp.experts.\2.down_proj.\3",  # noqa: E501
+        r"layers\.(\d+)\.experts\.(\d+)\.w3\.(\w+)": r"model.layers.\1.mlp.experts.\2.up_proj.\3",  # noqa: E501
+        r"layers\.(\d+)\.router_biases": r"model.layers.\1.mlp.gate.e_score_correction_bias",  # noqa: E501
+        r"norm\.weight": "model.norm.weight",  # noqa: E501
+        r"tok_embeddings\.weight": "model.embed_tokens.weight",  # noqa: E501
+        r"output\.weight": "lm_head.weight",  # noqa: E501
+    }
+    # fmt: on
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        return super().load_weights(self._iterable_remap_mistral_to_ds(weights))
+
+    def _iterable_remap_mistral_to_ds(
+        self, weights: Iterable[tuple[str, torch.Tensor]]
+    ) -> Iterable[tuple[str, torch.Tensor]]:
+        """Remap Mistral parameters to DeepseekV2 parameters."""
+        for name, loaded_weight in weights:
+            for k, v in self.remapping.items():
+                match = re.fullmatch(k, name)
+                if match:
+                    name = re.sub(k, v, name)
+                    break
+            else:
+                import logging
+
+                logging.warning(f"Unrecognized weight: {name}. Skipping.")
+                continue
+
+            # Note(Andy): Unlike Llama, this implementation uses
+            # is_neox_style=False for RoPE, which matches Mistral's implementation.
+            # Thus we don't need to permute the q/k weights (unlike Llama)
+
+            # Remapping scale names. We could do this in the regex above but it
+            # would triple the number of lines for most layers.
+            if name.endswith(".qscale_act"):
+                name = re.sub(r"\.qscale_act$", ".input_scale", name)
+            elif name.endswith(".qscale_weight"):
+                name = re.sub(r"\.qscale_weight$", ".weight_scale", name)
+
+            yield name, loaded_weight
+
+
+EntryClass = MistralLarge3ForCausalLM
diff --git a/sglang/python/sglang/srt/models/mistral_large_3_eagle.py b/sglang/python/sglang/srt/models/mistral_large_3_eagle.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5ce7b6aabb6e7ac5aa97689ed62bcaba9f0fd66
--- /dev/null
+++ b/sglang/python/sglang/srt/models/mistral_large_3_eagle.py
@@ -0,0 +1,107 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/mistral_large_3_eagle.py
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import get_pp_group
+from sglang.srt.layers.attention.nsa.utils import is_nsa_enable_prefill_cp
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import RowParallelLinear
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.models.deepseek_v2 import DeepseekV2DecoderLayer, DeepseekV2Model
+from sglang.srt.models.mistral_large_3 import MistralLarge3ForCausalLM
+from sglang.srt.utils import add_prefix
+
+
+class MistralLarge3Model(DeepseekV2Model):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        nn.Module.__init__(self)
+
+        self.config = config
+        self.vocab_size = config.vocab_size
+        assert get_pp_group().world_size == 1
+        self.pp_group = get_pp_group()
+        self.nsa_enable_prefill_cp = is_nsa_enable_prefill_cp()
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+
+        self.layers = nn.ModuleList(
+            [
+                DeepseekV2DecoderLayer(
+                    config=config,
+                    prefix=add_prefix(prefix, f"layers.{i}"),
+                    quant_config=quant_config,
+                    layer_id=i,
+                )
+                for i in range(self.config.num_hidden_layers)
+            ]
+        )
+        self.start_layer = 0
+        self.end_layer = self.config.num_hidden_layers
+
+        self.fc = RowParallelLinear(
+            self.config.hidden_size * 2,
+            self.config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix(prefix, "fc"),
+            input_is_parallel=False,
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.layers_to_capture = []
+        self.llama_4_scaling_config = getattr(config, "llama_4_scaling", None)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            input_embeds = self.embed_tokens(input_ids)
+        input_embeds, _ = self.fc(
+            torch.cat((input_embeds, forward_batch.spec_info.hidden_states), dim=-1)
+        )
+        output = super().forward(
+            input_ids, positions, forward_batch, input_embeds, pp_proxy_tensors
+        )
+        assert isinstance(output, torch.Tensor)
+        return output
+
+
+class MistralLarge3ForCausalLMEagle(MistralLarge3ForCausalLM):
+    remapping = MistralLarge3ForCausalLM.remapping | {
+        r"eagle_linear\.weight": r"model.fc.weight",
+        r"eagle_linear\.qscale_act": r"model.fc.input_scale",
+        r"eagle_linear\.qscale_weight": r"model.fc.weight_scale",
+    }
+
+    def __init__(
+        self,
+        *,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        config.quant_config = quant_config
+        self.model_cls = MistralLarge3Model
+        super().__init__(config=config, quant_config=quant_config, prefix=prefix)
+
+
+EntryClass = [MistralLarge3ForCausalLMEagle]
diff --git a/sglang/python/sglang/srt/models/mixtral.py b/sglang/python/sglang/srt/models/mixtral.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4f3e4c446f7562264237f8f55c785fc77ad1a1d
--- /dev/null
+++ b/sglang/python/sglang/srt/models/mixtral.py
@@ -0,0 +1,478 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/mixtral.py#L1
+"""Inference-only Mixtral model."""
+
+import logging
+from typing import Iterable, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from transformers import MixtralConfig
+
+from sglang.srt.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer, get_layer_id
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix, make_layers
+
+logger = logging.getLogger(__name__)
+
+
+class MixtralMoE(nn.Module):
+    """A tensor-parallel MoE implementation for Mixtral that shards each expert
+    across all ranks.
+
+    Each expert's weights are sharded across all ranks and a fused MoE
+    kernel is used for the forward pass, and finally we reduce the outputs
+    across ranks.
+    """
+
+    def __init__(
+        self,
+        num_experts: int,
+        top_k: int,
+        hidden_size: int,
+        intermediate_size: int,
+        layer_id: int,
+        params_dtype: Optional[torch.dtype] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        tp_size: Optional[int] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.hidden_size = hidden_size
+
+        # Gate always runs at half / full precision for now.
+        self.gate = ReplicatedLinear(
+            hidden_size,
+            num_experts,
+            bias=False,
+            params_dtype=params_dtype,
+            quant_config=None,
+            prefix=add_prefix("gate", prefix),
+        )
+
+        self.topk = TopK(
+            top_k=top_k,
+            renormalize=True,
+        )
+
+        self.experts = FusedMoE(
+            num_experts=num_experts,
+            top_k=top_k,
+            layer_id=layer_id,
+            hidden_size=hidden_size,
+            intermediate_size=intermediate_size,
+            params_dtype=params_dtype,
+            quant_config=quant_config,
+            prefix=add_prefix("experts", prefix),
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # NOTE: hidden_states can have either 1D or 2D shape.
+        orig_shape = hidden_states.shape
+        hidden_states = hidden_states.view(-1, self.hidden_size)
+        # router_logits: (num_tokens, n_experts)
+        router_logits, _ = self.gate(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        final_hidden_states = self.experts(hidden_states, topk_output)
+        if self.tp_size > 1:
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+        return final_hidden_states.view(orig_shape)
+
+
+class MixtralAttention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        max_position: int = 4096 * 32,
+        rope_theta: float = 10000,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position,
+            base=int(self.rope_theta),
+            is_neox_style=True,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class MixtralDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: MixtralConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        # Requires transformers > 4.32.0
+        rope_theta = getattr(config, "rope_theta", 10000)
+        self.self_attn = MixtralAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            max_position=config.max_position_embeddings,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.block_sparse_moe = MixtralMoE(
+            num_experts=config.num_local_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("block_sparse_moe", prefix),
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.block_sparse_moe(hidden_states)
+        return hidden_states, residual
+
+
+class MixtralModel(nn.Module):
+    def __init__(
+        self,
+        config: MixtralConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.pp_group = get_pp_group()
+
+        if self.pp_group.is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                prefix=add_prefix("embed_tokens", prefix),
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: MixtralDecoderLayer(
+                config=config, quant_config=quant_config, layer_id=idx, prefix=prefix
+            ),
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix="layers",
+            return_tuple=True,
+        )
+
+        if self.pp_group.is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer(return_tuple=True)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[torch.Tensor, PPProxyTensors]:
+        if self.pp_group.is_first_rank:
+            if input_embeds is None:
+                hidden_states = self.embed_tokens(input_ids)
+            else:
+                hidden_states = input_embeds
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+
+        for i in range(self.start_layer, self.end_layer):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions, hidden_states, forward_batch, residual
+            )
+
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors(
+                {
+                    "hidden_states": hidden_states,
+                    "residual": residual,
+                }
+            )
+        else:
+            hidden_states, _ = self.norm(hidden_states, residual)
+
+        return hidden_states
+
+
+class MixtralForCausalLM(nn.Module):
+
+    def __init__(
+        self,
+        config: MixtralConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = MixtralModel(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size, config.hidden_size, prefix=add_prefix("lm_head", prefix)
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            input_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+
+        if self.pp_group.is_last_rank:
+            return self.logits_processor(
+                input_ids, hidden_states, self.lm_head, forward_batch
+            )
+        else:
+            return hidden_states
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="w1",
+            ckpt_down_proj_name="w2",
+            ckpt_up_proj_name="w3",
+            num_experts=self.config.num_local_experts,
+        )
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            layer_id = get_layer_id(name)
+            if (
+                layer_id is not None
+                and hasattr(self.model, "start_layer")
+                and (
+                    layer_id < self.model.start_layer
+                    or layer_id >= self.model.end_layer
+                )
+            ):
+                continue
+
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if (
+                    name.endswith(".bias") or name.endswith("_bias")
+                ) and name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+
+                    if (
+                        name.endswith(".bias") or name.endswith("_bias")
+                    ) and name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if (
+                        name.endswith(".bias") or name.endswith("_bias")
+                    ) and name not in params_dict:
+                        continue
+                    # Skip loading kv_scale from ckpts towards new design.
+                    if name.endswith(".kv_scale") and name not in params_dict:
+                        continue
+                    if name is None:
+                        continue
+
+                    if name in params_dict.keys():
+                        param = params_dict[name]
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        weight_loader(param, loaded_weight)
+                    else:
+                        logger.warning(f"Parameter {name} not found in params_dict")
+
+
+EntryClass = MixtralForCausalLM
diff --git a/sglang/python/sglang/srt/models/mixtral_quant.py b/sglang/python/sglang/srt/models/mixtral_quant.py
new file mode 100644
index 0000000000000000000000000000000000000000..10bfa5068de215a45b10ddbe03dd8c9b907bc9c1
--- /dev/null
+++ b/sglang/python/sglang/srt/models/mixtral_quant.py
@@ -0,0 +1,430 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/mixtral_quant.py#L1
+"""Inference-only Mixtral model."""
+
+from typing import Iterable, Optional, Tuple
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+from transformers import MixtralConfig
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix
+
+
+class MixtralMLP(nn.Module):
+    def __init__(
+        self,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.num_experts = num_experts
+        self.ffn_dim = intermediate_size
+        self.hidden_dim = hidden_size
+
+        self.w1 = ReplicatedLinear(
+            self.hidden_dim,
+            self.ffn_dim,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("w1", prefix),
+        )
+        self.w2 = ReplicatedLinear(
+            self.ffn_dim,
+            self.hidden_dim,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("w2", prefix),
+        )
+        self.w3 = ReplicatedLinear(
+            self.hidden_dim,
+            self.ffn_dim,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("w3", prefix),
+        )
+
+        # TODO: Use vllm's SiluAndMul
+        self.act_fn = nn.SiLU()
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        w1_out, _ = self.w1(hidden_states)
+        w1_out = self.act_fn(w1_out)
+        w3_out, _ = self.w3(hidden_states)
+        current_hidden_states = w1_out * w3_out
+        current_hidden_states, _ = self.w2(current_hidden_states)
+        return current_hidden_states
+
+
+class MixtralMoE(nn.Module):
+    def __init__(
+        self,
+        config: MixtralConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.rank = get_tensor_model_parallel_rank()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.num_total_experts = config.num_local_experts
+        self.top_k = config.num_experts_per_tok
+        if self.tp_size > self.num_total_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {self.num_total_experts}."
+            )
+        # Split experts equally between ranks
+        self.expert_indices = np.array_split(
+            range(self.num_total_experts), self.tp_size
+        )[self.rank].tolist()
+        if not self.expert_indices:
+            raise ValueError(f"Rank {self.rank} has no experts assigned to it.")
+
+        self.experts = nn.ModuleList(
+            [
+                (
+                    MixtralMLP(
+                        self.num_total_experts,
+                        config.hidden_size,
+                        config.intermediate_size,
+                        quant_config=quant_config,
+                        prefix=add_prefix(f"experts.{idx}", prefix),
+                    )
+                    if idx in self.expert_indices
+                    else None
+                )
+                for idx in range(self.num_total_experts)
+            ]
+        )
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            self.num_total_experts,
+            bias=False,
+            quant_config=None,
+            prefix=add_prefix("gate", prefix),
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        router_logits, _ = self.gate(hidden_states)
+
+        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
+        routing_weights, selected_experts = torch.topk(
+            routing_weights, self.top_k, dim=-1
+        )
+        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
+
+        final_hidden_states = None
+        for expert_idx in self.expert_indices:
+            expert_layer = self.experts[expert_idx]
+            expert_mask = selected_experts == expert_idx
+            expert_weights = (routing_weights * expert_mask).sum(dim=-1, keepdim=True)
+
+            current_hidden_states = expert_layer(hidden_states).mul_(expert_weights)
+            if final_hidden_states is None:
+                final_hidden_states = current_hidden_states
+            else:
+                final_hidden_states.add_(current_hidden_states)
+
+        return tensor_model_parallel_all_reduce(final_hidden_states)
+
+
+class MixtralAttention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        max_position: int = 4096 * 32,
+        rope_theta: float = 10000,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position,
+            base=int(self.rope_theta),
+            is_neox_style=True,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class MixtralDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: MixtralConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        # Requires transformers > 4.32.0
+        rope_theta = getattr(config, "rope_theta", 10000)
+        self.self_attn = MixtralAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            max_position=config.max_position_embeddings,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.block_sparse_moe = MixtralMoE(
+            config=config,
+            quant_config=quant_config,
+            prefix=add_prefix("block_sparse_moe", prefix),
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.block_sparse_moe(hidden_states)
+        return hidden_states, residual
+
+
+class MixtralModel(nn.Module):
+    def __init__(
+        self,
+        config: MixtralConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+        self.layers = nn.ModuleList(
+            [
+                MixtralDecoderLayer(
+                    config,
+                    i,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{i}", prefix),
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions, hidden_states, forward_batch, residual
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class QuantMixtralForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config: MixtralConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = MixtralModel(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size, config.hidden_size, prefix=add_prefix("lm_head", prefix)
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Skip experts that are not assigned to this worker.
+                if "block_sparse_moe.experts." in name and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = QuantMixtralForCausalLM
diff --git a/sglang/python/sglang/srt/models/mllama.py b/sglang/python/sglang/srt/models/mllama.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9356692613e23b18cb35c16a23f6c23f8afb9ac
--- /dev/null
+++ b/sglang/python/sglang/srt/models/mllama.py
@@ -0,0 +1,1045 @@
+# Adapted from:
+# https://github.com/vllm-project/vllm/blob/7193774b1ff8603ad5bf4598e5efba0d9a39b436/vllm/model_executor/models/mllama.py
+"""PyTorch Mllama model."""
+
+import math
+from typing import Iterable, List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import transformers.models.mllama.configuration_mllama as config_mllama
+from torch import nn
+from transformers.modeling_outputs import BaseModelOutput, CausalLMOutputWithPast
+from transformers.models.mllama.modeling_mllama import (
+    _prepare_aspect_ratio_attention_mask,
+)
+
+import sglang.srt.distributed.parallel_state as ps
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import get_act_fn
+from sglang.srt.layers.attention.vision import VisionAttention
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import (
+    DEFAULT_VOCAB_PADDING_SIZE,
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.managers.schedule_batch import MultimodalInputs
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.llama import LlamaDecoderLayer, LlamaMLP
+from sglang.srt.utils import add_prefix
+
+
+class ColumnParallelConv2dPatch(torch.nn.Module):
+    """Conv2D Patching layer with model parallelism.
+    Column parallel over unfolded input.
+    Arguments:
+        in_channels: Input channels.
+        out_channels: Output channels.
+        kernel_size: Size of convolution kernel.
+        stride (default 1): Stride for convolution.
+        bias (default False): Use bias in Conv2d.
+    Input: (bsz, in_channels, width, height)
+    Output: (bsz, num_tokens, out_channels)
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: Union[int, Tuple[int, int]],
+        stride: Union[int, Tuple[int, int]],
+        bias: bool = False,
+    ) -> None:
+        super().__init__()
+        if isinstance(kernel_size, int):
+            kernel_size = (kernel_size, kernel_size)
+        self._unfold = torch.nn.Unfold(kernel_size=kernel_size, stride=stride)
+        self._linear = ColumnParallelLinear(
+            in_channels * kernel_size[0] * kernel_size[1],
+            out_channels,
+            bias=bias,
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self._unfold(x)
+        x = x.permute(0, 2, 1)
+        x, _ = self._linear(x)
+        return x
+
+
+class MllamaPrecomputedAspectRatioEmbedding(nn.Module):
+
+    def __init__(self, config: config_mllama.MllamaVisionConfig, is_gated: bool = True):
+        super().__init__()
+        self.max_num_tiles = config.max_num_tiles
+        self.hidden_size = config.hidden_size
+        self.max_aspect_ratio_id = config.max_aspect_ratio_id
+        self.is_gated = is_gated
+
+        self.embedding = nn.Embedding(
+            self.max_aspect_ratio_id + 1, self.max_num_tiles * self.hidden_size
+        )
+        if is_gated:
+            self.gate = nn.Parameter(torch.zeros(1))
+
+    def forward(
+        self, hidden_state: torch.Tensor, aspect_ratio_ids: torch.Tensor
+    ) -> torch.Tensor:
+        embeddings = self.embedding(aspect_ratio_ids)
+        embeddings = embeddings.reshape(-1, self.max_num_tiles, 1, self.hidden_size)
+
+        if self.is_gated:
+            embeddings = embeddings * self.gate.tanh()
+
+        hidden_state = hidden_state + embeddings
+        return hidden_state
+
+
+class MllamaPrecomputedPositionEmbedding(nn.Module):
+    def __init__(self, config: config_mllama.MllamaVisionConfig):
+        super().__init__()
+        self.max_num_tiles = config.max_num_tiles
+        self.max_aspect_ratio_id = config.max_aspect_ratio_id
+        self.num_patches = (config.image_size // config.patch_size) ** 2 + 1
+        self.hidden_size = config.hidden_size
+        self.scale = config.hidden_size**-0.5
+
+        self.gate = nn.Parameter(torch.zeros(1))
+
+        # position embedding
+        position_embedding = torch.randn(self.num_patches, self.hidden_size)
+        self.embedding = nn.Parameter(self.scale * position_embedding)
+
+        # tile position embedding
+        self.tile_embedding = nn.Embedding(
+            self.max_aspect_ratio_id + 1,
+            self.max_num_tiles * self.num_patches * self.hidden_size,
+        )
+
+    def forward(
+        self, hidden_state: torch.Tensor, aspect_ratio_ids: torch.Tensor
+    ) -> torch.Tensor:
+        # position embeddings
+        gated_position_embedding = (1 - self.gate.tanh()) * self.embedding
+        hidden_state = hidden_state + gated_position_embedding.view(
+            1, 1, self.num_patches, self.hidden_size
+        )
+
+        # precomputed tile position embeddings
+        tile_position_embedding = self.tile_embedding(aspect_ratio_ids)
+        batch_size = hidden_state.shape[0]
+        tile_position_embedding = tile_position_embedding.reshape(
+            batch_size, self.max_num_tiles, self.num_patches, self.hidden_size
+        )
+        gated_tile_position_embedding = self.gate.tanh() * tile_position_embedding
+        hidden_state = hidden_state + gated_tile_position_embedding
+
+        return hidden_state
+
+
+class MllamaVisionMLP(nn.Module):
+    def __init__(
+        self,
+        config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.activation_fn = get_act_fn(config.hidden_act)
+        self.fc1 = ColumnParallelLinear(
+            config.hidden_size,
+            config.intermediate_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("fc1", prefix),
+        )
+        self.fc2 = RowParallelLinear(
+            config.intermediate_size,
+            config.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("fc2", prefix),
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states, _ = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states, _ = self.fc2(hidden_states)
+
+        return hidden_states
+
+
+class MllamaVisionEncoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: config_mllama.MllamaVisionConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        is_gated: bool = False,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.hidden_size = config.hidden_size
+        self.num_attention_heads = config.attention_heads
+        self.is_gated = is_gated
+        self.intermediate_size = config.intermediate_size
+
+        self.self_attn = VisionAttention(
+            self.hidden_size,
+            self.num_attention_heads,
+            self.hidden_size,
+            use_qkv_parallel=True,
+            quant_config=quant_config,
+            flatten_batch=False,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.mlp = MllamaVisionMLP(
+            config, quant_config, prefix=add_prefix("mlp", prefix)
+        )
+
+        self.input_layernorm = nn.LayerNorm(self.hidden_size, eps=config.norm_eps)
+        self.post_attention_layernorm = nn.LayerNorm(
+            self.hidden_size, eps=config.norm_eps
+        )
+
+        # there used to be an if else here, no code path
+        if is_gated:
+            self.gate_attn = nn.Parameter(torch.ones(1) * math.pi / 4)
+            self.gate_ffn = nn.Parameter(torch.ones(1) * math.pi / 4)
+
+    def forward(
+        self,
+        hidden_state: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+    ):
+        # Self Attention
+        residual = hidden_state
+        hidden_state = self.input_layernorm(hidden_state)
+        hidden_state = self.self_attn(hidden_state, attention_mask=attention_mask)
+        gate_attn = 1 if not self.is_gated else self.gate_attn.tanh()
+        hidden_state = residual + gate_attn * hidden_state
+
+        # Feed forward
+        residual = hidden_state
+        hidden_state = self.post_attention_layernorm(hidden_state)
+        hidden_state = self.mlp(hidden_state)
+        gate_ffn = 1 if not self.is_gated else self.gate_ffn.tanh()
+        hidden_state = residual + gate_ffn * hidden_state
+
+        return hidden_state
+
+
+class MllamaVisionEncoder(nn.Module):
+    def __init__(
+        self,
+        config: config_mllama.MllamaVisionConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        num_layers=32,
+        is_gated=False,
+        output_hidden_states=None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.layers = nn.ModuleList(
+            [
+                MllamaVisionEncoderLayer(
+                    config,
+                    quant_config,
+                    is_gated,
+                    prefix=add_prefix(f"layers.{i}", prefix),
+                )
+                for i in range(num_layers)
+            ]
+        )
+        self.output_hidden_states = output_hidden_states or []
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> Union[Tuple, BaseModelOutput]:
+        encoder_states = ()
+
+        for i, encoder_layer in enumerate(self.layers):
+            if i in self.output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            hidden_states = encoder_layer(
+                hidden_states,
+                attention_mask,
+            )
+
+        if len(self.layers) - 1 in self.output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        return hidden_states, encoder_states
+
+
+class MllamaVisionModel(nn.Module):
+    def __init__(
+        self,
+        config: config_mllama.MllamaVisionConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+        self.max_num_tiles = config.max_num_tiles
+        self.hidden_size = config.hidden_size
+        self.in_channels = config.num_channels
+        self.intermediate_layers_indices = config.intermediate_layers_indices
+
+        self.num_patches = (self.image_size // self.patch_size) ** 2 + 1
+        self.scale = config.hidden_size**-0.5
+
+        self.patch_embedding = ColumnParallelConv2dPatch(
+            in_channels=config.num_channels,
+            out_channels=self.hidden_size,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+            bias=False,
+        )
+
+        self.class_embedding = nn.Parameter(self.scale * torch.randn(self.hidden_size))
+        self.gated_positional_embedding = MllamaPrecomputedPositionEmbedding(config)
+
+        self.pre_tile_positional_embedding = MllamaPrecomputedAspectRatioEmbedding(
+            config, is_gated=True
+        )
+        self.post_tile_positional_embedding = MllamaPrecomputedAspectRatioEmbedding(
+            config, is_gated=True
+        )
+
+        # layer norms
+        self.layernorm_pre = nn.LayerNorm(self.hidden_size)
+        self.layernorm_post = nn.LayerNorm(self.hidden_size)
+
+        # encoders
+        self.transformer = MllamaVisionEncoder(
+            config,
+            quant_config,
+            config.num_hidden_layers,
+            is_gated=False,
+            output_hidden_states=config.intermediate_layers_indices,
+            prefix=add_prefix("transformer", prefix),
+        )
+        self.global_transformer = MllamaVisionEncoder(
+            config,
+            quant_config,
+            config.num_global_layers,
+            is_gated=True,
+            prefix=add_prefix("global_transformer", prefix),
+        )
+
+    def apply_class_embedding(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        batch_size, _, hidden_size = hidden_state.shape
+        class_embedding = self.class_embedding.expand(batch_size, 1, hidden_size)
+        hidden_state = torch.cat([class_embedding, hidden_state], dim=1)
+        return hidden_state
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        aspect_ratio_ids: torch.Tensor,
+        aspect_ratio_mask: torch.Tensor,
+    ) -> torch.Tensor:
+        batch_size, num_concurrent_media, num_tiles, num_channels, height, width = (
+            pixel_values.shape
+        )
+
+        pixel_values = pixel_values.reshape(
+            batch_size * num_concurrent_media * num_tiles, num_channels, height, width
+        )
+        aspect_ratio_ids = aspect_ratio_ids.reshape(
+            batch_size * num_concurrent_media, -1
+        )
+
+        # patch embedding
+        patch_embeds = self.patch_embedding(
+            pixel_values.to(self.layernorm_pre.weight.dtype)
+        )
+        hidden_state = patch_embeds
+        hidden_state = ps.get_tp_group().all_gather(hidden_state)
+
+        # tile embeddings
+        _, num_patches, dim = hidden_state.shape
+        hidden_state = hidden_state.reshape(
+            batch_size * num_concurrent_media, num_tiles, -1, dim
+        )
+        hidden_state = self.pre_tile_positional_embedding(
+            hidden_state, aspect_ratio_ids
+        )
+
+        # apply cls token
+        hidden_state = hidden_state.reshape(
+            batch_size * num_concurrent_media * num_tiles, num_patches, dim
+        )
+        hidden_state = self.apply_class_embedding(hidden_state)
+        num_patches += 1
+
+        # apply position embeddings
+        hidden_state = hidden_state.reshape(
+            batch_size * num_concurrent_media, num_tiles, num_patches, dim
+        )
+        hidden_state = self.gated_positional_embedding(hidden_state, aspect_ratio_ids)
+
+        # apply encoder
+        hidden_state = self.layernorm_pre(hidden_state)
+
+        # Compute the number of tokens to pad
+        num_padding_patches = (8 - (hidden_state.shape[-2] % 8)) % 8
+        # Compute padding tuple for pad function
+        padding = (
+            0,
+            0,
+            0,
+            num_padding_patches,
+        )  # (pad_left, pad_right, pad_left for dim -2, pad_right for dim -2)
+        # Pad the tensor
+        hidden_state = F.pad(hidden_state, padding, mode="constant", value=0)
+        slice_index = -num_padding_patches if num_padding_patches > 0 else None
+
+        attention_mask = aspect_ratio_mask.reshape(
+            batch_size * num_concurrent_media, -1
+        )
+        attention_mask = _prepare_aspect_ratio_attention_mask(
+            aspect_ratio_mask=attention_mask,
+            num_patches=self.num_patches,
+            target_length=hidden_state.shape[2],
+            dtype=self.layernorm_pre.weight.dtype,
+        )
+
+        hidden_state = hidden_state.view(batch_size * num_concurrent_media, -1, dim)
+        output = self.transformer(
+            hidden_state,
+            attention_mask=attention_mask,
+        )
+        hidden_state, intermediate_hidden_states = output[0], output[1]
+        intermediate_hidden_states = torch.stack(intermediate_hidden_states, dim=-1)
+
+        # apply global encoder
+        hidden_state = self.layernorm_post(hidden_state)
+        hidden_state = hidden_state.reshape(
+            batch_size * num_concurrent_media,
+            num_tiles,
+            num_patches + num_padding_patches,
+            dim,
+        )
+        hidden_state = self.post_tile_positional_embedding(
+            hidden_state, aspect_ratio_ids
+        )
+        hidden_state = hidden_state.reshape(
+            batch_size * num_concurrent_media,
+            num_tiles * (num_patches + num_padding_patches),
+            dim,
+        )
+        hidden_state = self.global_transformer(
+            hidden_state, attention_mask=attention_mask
+        )[0]
+        hidden_state = hidden_state.reshape(
+            batch_size * num_concurrent_media,
+            num_tiles,
+            num_patches + num_padding_patches,
+            dim,
+        )
+        hidden_state = hidden_state[:, :, :slice_index]
+
+        # adding intermediate layer outputs
+        hidden_state = hidden_state.reshape(
+            batch_size, num_concurrent_media, num_tiles, num_patches, dim
+        )
+        intermediate_hidden_states = intermediate_hidden_states.reshape(
+            batch_size * num_concurrent_media,
+            num_tiles,
+            num_patches + num_padding_patches,
+            -1,
+        )
+        intermediate_hidden_states = intermediate_hidden_states[:, :, :slice_index]
+        intermediate_hidden_states = intermediate_hidden_states.reshape(
+            batch_size, num_concurrent_media, num_tiles, num_patches, -1
+        )
+        hidden_state = torch.cat([hidden_state, intermediate_hidden_states], dim=-1)
+        return hidden_state
+
+
+class MllamaTextCrossAttention(nn.Module):
+    def __init__(
+        self,
+        config: Optional[config_mllama.MllamaTextConfig] = None,
+        layer_id: Optional[int] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.model_parallel_size = get_tensor_model_parallel_world_size()
+        self.num_heads = self.config.num_attention_heads
+        self.num_local_heads = self.num_heads // self.model_parallel_size
+        self.num_key_value_heads = self.config.num_key_value_heads
+        self.num_local_key_value_heads = (
+            self.num_key_value_heads // self.model_parallel_size
+        )
+        self.dropout = config.dropout
+        self.hidden_size = config.hidden_size
+        self.head_dim = config.hidden_size // self.num_heads
+        self.layer_id = layer_id
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.q_local_size = self.num_local_heads * self.head_dim
+        self.kv_local_size = self.num_local_key_value_heads * self.head_dim
+
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.num_heads,
+            self.num_key_value_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.num_heads * self.head_dim,
+            self.hidden_size,
+            bias=False,
+            input_is_parallel=True,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+        self.q_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+        self.k_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+        self.scaling = self.head_dim**-0.5
+
+        self.attn = RadixAttention(
+            self.num_local_heads,
+            self.head_dim,
+            self.scaling,
+            self.num_local_key_value_heads,
+            layer_id=layer_id,
+            is_cross_attention=True,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor],
+        cross_attention_states: Optional[torch.Tensor],
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv_dec, _ = self.qkv_proj(hidden_states)
+        q, _, _ = qkv_dec.split(
+            [self.q_local_size, self.kv_local_size, self.kv_local_size], dim=-1
+        )
+        if cross_attention_states is None:
+            k = None
+            v = None
+        else:
+            qkv_enc, _ = self.qkv_proj(cross_attention_states)
+            _, k, v = qkv_enc.split(
+                [self.q_local_size, self.kv_local_size, self.kv_local_size], dim=-1
+            )
+            k = k.view(-1, self.num_local_key_value_heads, self.head_dim)
+            v = v.view(-1, self.num_local_key_value_heads, self.head_dim)
+            k = self.k_norm(k.reshape(-1, self.head_dim)).reshape(
+                -1, self.num_local_key_value_heads, self.head_dim
+            )
+        q = q.view(-1, self.num_local_heads, self.head_dim)
+        q = self.q_norm(q.reshape(-1, self.head_dim)).reshape(
+            -1, self.num_local_heads, self.head_dim
+        )
+
+        output = self.attn(q, k, v, forward_batch)
+        out, _ = self.o_proj(output)
+        return out
+
+
+class MllamaCrossAttentionDecoderLayer(torch.nn.Module):
+    """Cross-attention transformer block with tanh-gated attention
+    and feedforward."""
+
+    def __init__(
+        self,
+        config: config_mllama.MllamaTextConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig],
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.layer_id = layer_id
+        self.cross_attn = MllamaTextCrossAttention(
+            config=config,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("cross_attn", prefix),
+        )
+
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.cross_attn_attn_gate = torch.nn.Parameter(torch.zeros(1))
+
+        self.mlp = LlamaMLP(
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.cross_attn_mlp_gate = torch.nn.Parameter(torch.zeros(1))
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cross_attention_states: torch.Tensor,
+        cross_attention_mask: torch.Tensor,
+        full_text_row_masked_out_mask: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+
+        hidden_states = self.cross_attn(
+            hidden_states=hidden_states,
+            attention_mask=cross_attention_mask,
+            cross_attention_states=cross_attention_states,
+            forward_batch=forward_batch,
+        )
+        hidden_states = full_text_row_masked_out_mask * hidden_states
+        hidden_states = residual + self.cross_attn_attn_gate.tanh() * hidden_states
+
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = full_text_row_masked_out_mask * hidden_states
+        hidden_states = residual + self.cross_attn_mlp_gate.tanh() * hidden_states
+        return hidden_states
+
+
+class MllamaTextModel(nn.Module):
+    config_class = config_mllama.MllamaTextConfig
+    base_model_prefix = "model"
+
+    def __init__(
+        self,
+        config: config_mllama.MllamaTextConfig,
+        quant_config: Optional[QuantizationConfig],
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.padding_id = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size + 8,
+            config.hidden_size,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+        self.cross_attention_layers = config.cross_attention_layers
+
+        layers = []
+        for layer_id in range(config.num_hidden_layers):
+            if layer_id in self.cross_attention_layers:
+                layers.append(
+                    MllamaCrossAttentionDecoderLayer(
+                        config,
+                        layer_id,
+                        quant_config=quant_config,
+                        prefix=add_prefix(f"layers.{layer_id}", prefix),
+                    )
+                )
+            else:
+                # TODO: force LlamaDecoderLayer to config.attention_bias=False
+                layers.append(
+                    LlamaDecoderLayer(
+                        config,
+                        quant_config=quant_config,
+                        layer_id=layer_id,
+                        prefix=add_prefix(f"layers.{layer_id}", prefix),
+                    )
+                )
+
+        self.layers = nn.ModuleList(layers)
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        positions: Optional[torch.LongTensor],
+        cross_attention_states: Optional[torch.LongTensor],
+        cross_attention_mask: Optional[torch.LongTensor],
+        full_text_row_masked_out_mask: Optional[Tuple[torch.Tensor, torch.Tensor]],
+        forward_batch: ForwardBatch,
+        skip_cross_attention: bool,
+    ) -> torch.Tensor:
+        inputs_embeds = self.embed_tokens(input_ids)
+        hidden_states = inputs_embeds
+
+        for _, decoder_layer in enumerate(self.layers):
+            if isinstance(decoder_layer, MllamaCrossAttentionDecoderLayer):
+                if not skip_cross_attention:
+                    hidden_states = decoder_layer(
+                        hidden_states=hidden_states,
+                        cross_attention_states=cross_attention_states,
+                        cross_attention_mask=cross_attention_mask,
+                        full_text_row_masked_out_mask=full_text_row_masked_out_mask,
+                        forward_batch=forward_batch,
+                    )
+            elif isinstance(decoder_layer, LlamaDecoderLayer):
+                hidden_states, residual = decoder_layer(
+                    positions=positions,
+                    hidden_states=hidden_states,
+                    forward_batch=forward_batch,
+                    residual=None,
+                )
+                hidden_states = hidden_states + residual
+            else:
+                raise ValueError(f"Unknown decoder layer type {type(decoder_layer)}")
+        hidden_states = self.norm(hidden_states)
+        return hidden_states
+
+
+class MllamaForCausalLM(nn.Module):
+    config_class = config_mllama.MllamaTextConfig
+    base_model_prefix = "language_model"
+    _no_split_modules = [
+        "MllamaCrossAttentionDecoderLayer",
+        "MllamaSelfAttentionDecoderLayer",
+    ]
+
+    def __init__(
+        self,
+        config: config_mllama.MllamaTextConfig,
+        quant_config: Optional[QuantizationConfig],
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.vocab_size = config.vocab_size
+        self.model = MllamaTextModel(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+            padding_size=DEFAULT_VOCAB_PADDING_SIZE,
+            quant_config=quant_config,
+            prefix=add_prefix("lm_head", prefix),
+        )
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        positions: Optional[torch.LongTensor],
+        cross_attention_states: Optional[torch.LongTensor],
+        cross_attention_mask: Optional[torch.LongTensor],
+        full_text_row_masked_out_mask: Optional[Tuple[torch.Tensor, torch.Tensor]],
+        forward_batch: ForwardBatch,
+        skip_cross_attention: bool,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids=input_ids,
+            positions=positions,
+            cross_attention_states=cross_attention_states,
+            cross_attention_mask=cross_attention_mask,
+            full_text_row_masked_out_mask=full_text_row_masked_out_mask,
+            forward_batch=forward_batch,
+            skip_cross_attention=skip_cross_attention,
+        )
+        return hidden_states
+
+
+class MllamaForConditionalGeneration(nn.Module):
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    # in TP, these weights are partitioned along the column dimension (dim=-1)
+    column_parallel_weights_modules = [".down_proj.", ".o_proj."]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    def __init__(
+        self,
+        config: config_mllama.MllamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.quant_config = quant_config
+        self.vocab_size = config.text_config.vocab_size
+        self.hidden_size = config.text_config.hidden_size
+        self.max_num_tiles = config.vision_config.max_num_tiles
+        self.vision_output_dim = config.vision_config.vision_output_dim
+        self.pad_token_id = (
+            config.pad_token_id if config.pad_token_id is not None else -1
+        )
+        self.image_size = config.vision_config.image_size
+
+        self.vision_model = MllamaVisionModel(
+            config.vision_config,
+            quant_config=quant_config,
+            prefix=add_prefix("vision_model", prefix),
+        )
+        self.language_model = MllamaForCausalLM(
+            config.text_config,
+            quant_config=quant_config,
+            prefix=add_prefix("language_model", prefix),
+        )
+        self.multi_modal_projector = ReplicatedLinear(
+            config.vision_config.vision_output_dim,
+            config.text_config.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix="multi_modal_projector",
+        )
+        self.logits_processor = LogitsProcessor(config.text_config)
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        pixel_values = torch.cat([item.feature for item in mm_inputs.mm_items], dim=0)
+        pad_values = [item.pad_value for item in mm_inputs.mm_items]
+
+        num_concurrent_media, num_tiles = pixel_values.shape[1:3]
+        num_patches = self.vision_model.num_patches
+        image_len = num_concurrent_media * num_tiles * num_patches
+        mm_inputs.num_image_tokens = image_len
+
+        pad_ids = pad_values * ((image_len + len(pad_values)) // len(pad_values))
+
+        return pad_ids[:image_len] + input_ids
+
+    def _batch_image_inputs(self, forward_batch: ForwardBatch):
+        if forward_batch.forward_mode.is_decode() or all(forward_batch.encoder_cached):
+            return None, None, None, None
+
+        # pixel_values: shape (bs, num_image, num_tiles, 3, image_res, image_res)
+        max_num_images = max_num_tiles = bs = 0
+        for i, mm_input in enumerate(forward_batch.mm_inputs):
+
+            if not forward_batch.encoder_cached[i] and mm_input is not None:
+                pixel_values = torch.cat(
+                    [item.feature for item in mm_input.mm_items], dim=0
+                )
+                max_num_images = max(max_num_images, pixel_values.shape[1])
+
+                max_num_tiles = max(max_num_tiles, pixel_values.shape[2])
+                bs += 1
+
+        if max_num_images * max_num_tiles * bs == 0:
+            return None, None, None, None
+
+        with forward_batch.out_cache_loc.device:
+            batched_images = torch.zeros(
+                bs,
+                max_num_images,
+                max_num_tiles,
+                3,
+                self.image_size,
+                self.image_size,
+                dtype=torch.float32,
+            )
+            batched_ar_ids = torch.ones(
+                bs, max_num_images, dtype=torch.int64, device="cuda"
+            )
+            batched_ar_mask = torch.zeros(
+                bs, max_num_images, max_num_tiles, dtype=torch.int64
+            )
+            i = 0
+            encoder_lens_need = []
+
+            for k, mm_input in enumerate(forward_batch.mm_inputs):
+                if forward_batch.encoder_cached[k] or mm_input is None:
+                    continue
+
+                encoder_lens_need.append(forward_batch.encoder_lens[k])
+                pixel_values = torch.cat(
+                    [item.feature for item in mm_input.mm_items], dim=0
+                )
+                for j in range(pixel_values.shape[1]):
+                    img = pixel_values[0, j]
+                    num_tiles = img.shape[0]
+                    batched_images[i, j, :num_tiles] = img
+                    batched_ar_ids[i, j] = mm_input.mm_items[0].aspect_ratio_ids[0, j]
+
+                    batched_ar_mask[i, j, :num_tiles] = mm_input.mm_items[
+                        0
+                    ].aspect_ratio_mask[0, j]
+                i += 1
+
+        return batched_images, batched_ar_ids, batched_ar_mask, encoder_lens_need
+
+    def flat_encoder_result(
+        self, cross_attention_states: torch.Tensor, encoder_lens_need: List[int]
+    ):
+        # NOTE: not all encoders need computation, some are cached
+        head_dim = cross_attention_states.shape[-1]
+        total_encoder_len = sum(encoder_lens_need)
+        cross_attention_states_flat = torch.zeros(
+            total_encoder_len,
+            head_dim,
+            device=cross_attention_states.device,
+            dtype=cross_attention_states.dtype,
+        )
+
+        i = start_pos = 0
+        for encoder_len in encoder_lens_need:
+            if encoder_len == 0:
+                continue
+            end_pos = start_pos + encoder_len
+            cross_attention_states_flat[start_pos:end_pos] = cross_attention_states[i][
+                :encoder_len
+            ]
+            i += 1
+            start_pos += encoder_len
+
+        return cross_attention_states_flat
+
+    def get_full_text_row_masked_out_mask(self, forward_batch: ForwardBatch):
+        if forward_batch.forward_mode.is_decode():
+            full_text_row_masked_out_mask = forward_batch.encoder_lens != 0
+        else:
+            full_text_row_masked_out_mask = torch.ones(
+                forward_batch.extend_seq_lens.sum(), dtype=torch.bool
+            )
+            start_pos = 0
+
+            for seq_len, encoder_len in zip(
+                forward_batch.seq_lens.tolist(), forward_batch.encoder_lens_cpu
+            ):
+                if encoder_len == 0:
+                    full_text_row_masked_out_mask[start_pos : start_pos + seq_len] = (
+                        False
+                    )
+                start_pos += encoder_len
+
+            full_text_row_masked_out_mask = full_text_row_masked_out_mask.to(
+                forward_batch.seq_lens.device
+            )
+
+        return full_text_row_masked_out_mask.reshape(-1, 1)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
+
+        batched_images, batched_ar_ids, batched_ar_mask, encoder_lens_need = (
+            self._batch_image_inputs(forward_batch)
+        )
+
+        # TODO: support multi-image by this mask
+        cross_attention_mask = None
+        cross_attention_states = None
+
+        if get_is_capture_mode():
+            # NOTE: when doing cuda graph capture, we do not want to skip cross attention
+            # Make is a constant value to avoid cuda graph capture issue
+            skip_cross_attention = False
+        else:
+            # NOTE: we do not need image_inputs when prefill
+            assert len(forward_batch.encoder_lens) == len(forward_batch.seq_lens)
+            assert len(forward_batch.encoder_lens_cpu) == len(forward_batch.seq_lens)
+            skip_cross_attention = forward_batch.encoder_lens.max() == 0
+
+        if not skip_cross_attention:
+            full_text_row_masked_out_mask = self.get_full_text_row_masked_out_mask(
+                forward_batch
+            )
+        else:
+            full_text_row_masked_out_mask = None
+
+        if batched_images is not None:
+            # NOTE: llama's reference implementation runs vision model on CPU
+            cross_attention_states = self.vision_model(
+                batched_images, batched_ar_ids, batched_ar_mask
+            )
+            cross_attention_states, _ = self.multi_modal_projector(
+                cross_attention_states
+            )
+
+            bs, _, _, _, image_token_dim = cross_attention_states.shape
+            cross_attention_states = cross_attention_states.view(
+                bs, -1, image_token_dim
+            )
+
+            cross_attention_states = self.flat_encoder_result(
+                cross_attention_states, encoder_lens_need
+            )
+
+        hidden_states = self.language_model(
+            input_ids=input_ids,
+            positions=positions,
+            cross_attention_states=cross_attention_states,
+            cross_attention_mask=cross_attention_mask,
+            full_text_row_masked_out_mask=full_text_row_masked_out_mask,
+            forward_batch=forward_batch,
+            skip_cross_attention=skip_cross_attention,
+        )
+        return self.logits_processor(
+            input_ids, hidden_states, self.language_model.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        updated_params = set()
+        for name, loaded_weight in weights:
+            if "patch_embedding.weight" in name:
+                name = name.replace(
+                    "patch_embedding.weight", "patch_embedding._linear.weight"
+                )
+                loaded_weight = loaded_weight.view(loaded_weight.shape[0], -1)
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                param = params_dict[name]
+                updated_params.add(name)
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if "vision_model" in name:
+                    # adapt to VisionAttention
+                    name = name.replace("self_attn.o_proj", "self_attn.proj")
+                param = params_dict.pop(name)
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = MllamaForConditionalGeneration
diff --git a/sglang/python/sglang/srt/models/mllama4.py b/sglang/python/sglang/srt/models/mllama4.py
new file mode 100644
index 0000000000000000000000000000000000000000..0913f9adfd2d2cdde4482d8cecc8240c02c1d365
--- /dev/null
+++ b/sglang/python/sglang/srt/models/mllama4.py
@@ -0,0 +1,1008 @@
+import json as json_lib
+import logging
+import math
+import os
+import re
+from collections.abc import Iterable
+from typing import List, Optional, Set, Tuple
+
+import torch
+from torch import nn
+from transformers import Llama4Config, Llama4VisionConfig
+from transformers.models.llama4.modeling_llama4 import (
+    Llama4MultiModalProjector,
+    vision_apply_rotary_emb,
+)
+
+from sglang.srt.layers.attention.vision import VisionAttention
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+from sglang.srt.layers.quantization import QuantizationConfig
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternMultimodalTokens,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import is_cpu
+
+_is_cpu = is_cpu()
+
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.utils import add_prefix
+
+logger = logging.getLogger(__name__)
+
+
+class Llama4VisionMLP(nn.Module):
+
+    def __init__(
+        self,
+        input_size: int,
+        intermediate_size: int,
+        output_size: int,
+        bias: bool,
+        output_activation: bool,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        use_data_parallel: bool = False,
+    ):
+        super().__init__()
+        cls_fc1 = ReplicatedLinear if use_data_parallel else ColumnParallelLinear
+        self.fc1 = cls_fc1(
+            input_size=input_size,
+            output_size=intermediate_size,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc1",
+        )
+        cls_fc2 = ReplicatedLinear if use_data_parallel else RowParallelLinear
+        self.fc2 = cls_fc2(
+            input_size=intermediate_size,
+            output_size=output_size,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc2",
+        )
+        self.activation_fn = nn.GELU()
+        self.output_activation = output_activation
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states, _ = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states, _ = self.fc2(hidden_states)
+        if self.output_activation:
+            return self.activation_fn(hidden_states)
+        return hidden_states
+
+
+def pixel_shuffle(input_tensor, shuffle_ratio):
+    # input_tensor: [batch_size, num_patches, channels]
+    batch_size, num_patches, channels = input_tensor.shape
+    patch_size = int(math.sqrt(num_patches))
+
+    input_tensor = input_tensor.view(batch_size, patch_size, patch_size, -1)
+    batch_size, height, width, channels = input_tensor.size()
+
+    reshaped_tensor = input_tensor.view(
+        batch_size, height, int(width * shuffle_ratio), int(channels / shuffle_ratio)
+    )
+    reshaped_tensor = reshaped_tensor.permute(0, 2, 1, 3).contiguous()
+
+    reshaped_tensor = reshaped_tensor.view(
+        batch_size,
+        int(height * shuffle_ratio),
+        int(width * shuffle_ratio),
+        int(channels / (shuffle_ratio**2)),
+    )
+    reshaped_tensor = reshaped_tensor.permute(0, 2, 1, 3).contiguous()
+
+    output_tensor = reshaped_tensor.view(batch_size, -1, reshaped_tensor.shape[-1])
+    return output_tensor
+
+
+class Llama4VisionPixelShuffleMLP(nn.Module):
+
+    def __init__(
+        self,
+        config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        use_data_parallel: bool = False,
+    ):
+        super().__init__()
+        self.pixel_shuffle_ratio = config.pixel_shuffle_ratio
+        self.mlp = Llama4VisionMLP(
+            input_size=config.intermediate_size,
+            intermediate_size=config.projector_input_dim,
+            output_size=config.projector_output_dim,
+            bias=config.multi_modal_projector_bias,
+            output_activation=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+            use_data_parallel=use_data_parallel,
+        )
+
+    def forward(self, encoded_patches: torch.Tensor) -> torch.Tensor:
+        encoded_patches = pixel_shuffle(encoded_patches, self.pixel_shuffle_ratio)
+        return self.mlp(encoded_patches)
+
+
+def apply_position_embedding(q, k, freqs_ci, shape):
+    # [batch_size_times_num_tiles, num_channels]
+    input_shape = shape[:2]
+    # [batch_size_times_num_tiles, num_channels, num_heads, head_dim]
+    hidden_shape = (*input_shape, *q.shape[-2:])
+    q = q.view(hidden_shape)
+    k = k.view(hidden_shape)
+    q, k = vision_apply_rotary_emb(q, k, freqs_ci)
+    return q, k
+
+
+class Llama4VisionEncoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: Llama4VisionConfig,
+        quant_config: Optional[QuantizationConfig],
+        prefix: str = "",
+        use_data_parallel: bool = False,
+    ):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.num_attention_heads = config.num_attention_heads
+        self.intermediate_size = config.intermediate_size
+
+        self.self_attn = VisionAttention(
+            self.hidden_size,
+            self.num_attention_heads,
+            self.hidden_size,
+            use_qkv_parallel=True,
+            # vision_model is explicitly ignored in Maverick-17B-128E-Instruct-FP8
+            quant_config=None,
+            flatten_batch=False,
+            prefix=add_prefix("self_attn", prefix),
+            qkv_bias=True,
+            customized_position_embedding_applier=apply_position_embedding,
+        )
+        self.mlp = Llama4VisionMLP(
+            input_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            output_size=config.hidden_size,
+            bias=True,
+            output_activation=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+            use_data_parallel=use_data_parallel,
+        )
+
+        self.input_layernorm = nn.LayerNorm(config.hidden_size)
+        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size)
+
+    def forward(
+        self,
+        hidden_state: torch.Tensor,
+        freqs_ci: torch.Tensor,
+    ):
+        # Self Attention
+        residual = hidden_state
+        hidden_state = self.input_layernorm(hidden_state)
+        hidden_state = self.self_attn(hidden_state, position_embeddings=freqs_ci)
+        hidden_state = residual + hidden_state
+
+        # Feed forward
+        residual = hidden_state
+        hidden_state = self.post_attention_layernorm(hidden_state)
+        hidden_state = self.mlp(hidden_state)
+        hidden_state = residual + hidden_state
+
+        outputs = hidden_state
+        return outputs
+
+
+class Llama4VisionEncoder(nn.Module):
+
+    def __init__(
+        self,
+        config: Llama4VisionConfig,
+        quant_config: Optional[QuantizationConfig],
+        prefix: str = "",
+        use_data_parallel: bool = False,
+    ):
+        super().__init__()
+        self.config = config
+        self.layers = nn.ModuleList(
+            [
+                Llama4VisionEncoderLayer(
+                    config,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.layers.{layer_idx}",
+                    use_data_parallel=use_data_parallel,
+                )
+                for layer_idx in range(config.num_hidden_layers)
+            ]
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        freqs_ci: torch.Tensor,  # TODO: move this to an attribute instead of keeping it around
+    ) -> torch.Tensor:
+        r"""
+        Args:
+            hidden_states (`torch.FloatTensor` of shape
+                    `(batch_size, sequence_length, hidden_size)`):
+                Optionally, instead of passing `input_ids` you can choose to
+                directly pass an embedded representation. This is useful if you
+                want more control over how to convert `input_ids` indices into
+                associated vectors than the model's internal embedding
+                lookup matrix.
+        """
+
+        for encoder_layer in self.layers:
+            layer_outputs = encoder_layer(hidden_states, freqs_ci=freqs_ci)
+            hidden_states = layer_outputs
+
+        return hidden_states
+
+
+class Llama4UnfoldConvolution(nn.Module):
+
+    def __init__(
+        self,
+        config: Llama4VisionConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        use_data_parallel: bool = False,
+    ):
+        super().__init__()
+        kernel_size = config.patch_size
+        if isinstance(kernel_size, int):
+            kernel_size = (kernel_size, kernel_size)
+        self.unfold = torch.nn.Unfold(kernel_size=kernel_size, stride=config.patch_size)
+        params = {
+            "input_size": config.num_channels * kernel_size[0] * kernel_size[1],
+            "output_size": config.hidden_size,
+            "bias": False,
+            "quant_config": quant_config,
+            "prefix": f"{prefix}.linear",
+        }
+        if use_data_parallel:
+            cls = ReplicatedLinear
+        else:
+            cls = ColumnParallelLinear
+            params["gather_output"] = True
+        self.linear = cls(**params)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.unfold(hidden_states)
+        hidden_states = hidden_states.permute(0, 2, 1).contiguous()
+        hidden_states, _ = self.linear(hidden_states)
+        return hidden_states
+
+
+class Llama4VisionRotaryEmbedding(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        idx = config.image_size // config.patch_size
+        img_idx = torch.arange(idx**2, dtype=torch.int32).reshape(idx**2, 1)
+        img_idx = torch.cat([img_idx, img_idx[:1]], dim=0)
+        img_idx[-1, -1] = -2  # ID_CLS_TOKEN
+        frequencies_x = img_idx % idx  # get the coordinates of the 2d matrix along x
+        frequencies_y = img_idx // idx  # get the coordinates of the 2d matrix along y
+        freq_dim = config.hidden_size // config.num_attention_heads // 2
+        rope_freq = 1.0 / (
+            config.rope_theta
+            ** (torch.arange(0, freq_dim, 2)[: (freq_dim // 2)].float() / freq_dim)
+        )
+        freqs_x = (
+            (frequencies_x + 1)[..., None] * rope_freq[None, None, :]
+        ).repeat_interleave(2, dim=-1)
+        freqs_y = (
+            (frequencies_y + 1)[..., None] * rope_freq[None, None, :]
+        ).repeat_interleave(2, dim=-1)
+        freqs = torch.cat([freqs_x, freqs_y], dim=-1).float().contiguous()[..., ::2]
+        freqs = freqs.masked_fill(img_idx.reshape(-1, 1, 1) < 0, 0)
+        freq_cis = torch.view_as_complex(
+            torch.stack([torch.cos(freqs), torch.sin(freqs)], dim=-1)
+        )
+        self.freqs_ci = freq_cis  # idx**2, idx**2, idx * 2
+
+    def forward(self, hidden_states):
+        return self.freqs_ci.to(hidden_states.device)
+
+
+class Llama4VisionModel(nn.Module):
+
+    def __init__(
+        self,
+        config: Llama4VisionConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+        self.hidden_size = config.hidden_size
+        self.num_channels = config.num_channels
+
+        self.num_patches = (self.image_size // self.patch_size) ** 2 + 1
+        self.scale = config.hidden_size**-0.5
+
+        self.patch_embedding = Llama4UnfoldConvolution(
+            config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.patch_embedding",
+        )
+
+        self.class_embedding = nn.Parameter(self.scale * torch.randn(self.hidden_size))
+        self.positional_embedding_vlm = nn.Parameter(
+            self.scale * torch.randn(self.num_patches, self.hidden_size)
+        )
+
+        self.rotary_embedding = Llama4VisionRotaryEmbedding(config)
+
+        # layer norms
+        self.layernorm_pre = nn.LayerNorm(self.hidden_size, eps=1e-5)
+        self.layernorm_post = nn.LayerNorm(self.hidden_size, eps=1e-5)
+
+        # encoders
+        self.model = Llama4VisionEncoder(
+            config,
+            quant_config=quant_config,
+            prefix=f"{prefix}.model",
+        )
+        self.vision_adapter = Llama4VisionPixelShuffleMLP(
+            config,
+            quant_config,
+            prefix=f"{prefix}.vision_adapter",
+        )
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+    ) -> torch.Tensor:
+        # Patch embedding
+        hidden_state = self.patch_embedding(pixel_values)
+        num_tiles, num_patches, hidden_dim = hidden_state.shape
+
+        # Add cls token
+        class_embedding = self.class_embedding.expand(
+            hidden_state.shape[0], 1, hidden_state.shape[-1]
+        )
+        hidden_state = torch.cat([hidden_state, class_embedding], dim=1)
+        num_patches += 1
+
+        # Position embeddings
+        hidden_state = hidden_state.reshape(
+            num_tiles,
+            1,
+            num_patches,
+            hidden_dim,
+        )
+        positional_embedding = self.positional_embedding_vlm.to(
+            dtype=hidden_state.dtype, device=hidden_state.device
+        )
+        hidden_state = hidden_state + positional_embedding
+        hidden_state = self.layernorm_pre(hidden_state)
+        hidden_state = hidden_state.view(num_tiles, -1, hidden_dim)
+        freqs_ci = self.rotary_embedding(pixel_values)
+        # Apply encoder
+        hidden_state = self.model(hidden_state, freqs_ci=freqs_ci)
+        hidden_state = self.layernorm_post(hidden_state)
+
+        # Remove CLS token output
+        hidden_state = hidden_state[:, :-1, :]
+
+        # now, we use Llama4VisionPixelShuffle + mlp to project embeddings
+        hidden_state = self.vision_adapter(hidden_state)
+
+        return hidden_state
+
+
+class Llama4ForConditionalGeneration(nn.Module):
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
+
+    # Pattern to match language model layers only (skip vision_model and multi_modal_projector)
+    lora_pattern = re.compile(
+        r"^language_model\.model\.layers\.(\d+)\.(?:self_attn|mlp)\.(?:qkv_proj|o_proj|down_proj|gate_up_proj)"
+    )
+
+    def __init__(
+        self,
+        config: Llama4Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+
+        # Check if this is a text-only model (modelopt fp8 llama4 has no vision components)
+        self.has_vision_weights = self._has_vision_weights(config)
+        if not self.has_vision_weights:
+            logger.warning(
+                "No vision weights found in checkpoint. Model will run in text-only mode. "
+                "Multimodal capabilities (vision understanding) will be unavailable. "
+                "Please not that this warning might be inaccurate if the weights haven't been fully downloaded"
+            )
+
+        self.has_vision = (
+            self.has_vision_weights and get_global_server_args().enable_multimodal
+        )
+
+        if self.has_vision:
+            # TODO: make this more general
+            ignore_quant_layers = getattr(config, "quantization_config", {}).get(
+                "ignore", {}
+            )
+            if (
+                "model.layers.vision_model*" in ignore_quant_layers
+                and "model.layers.multi_modal_projector*" in ignore_quant_layers
+            ):
+                vision_quant_config = None
+            else:
+                vision_quant_config = quant_config
+            self.vision_model = Llama4VisionModel(
+                config.vision_config,
+                quant_config=vision_quant_config,
+                prefix=add_prefix("vision_model", prefix),
+            )
+
+            self.multi_modal_projector = Llama4MultiModalProjector(config)
+        else:
+            self.vision_model = None
+            self.multi_modal_projector = None
+
+        # Initialize the language model
+        from sglang.srt.models.llama4 import Llama4ForCausalLM
+
+        self.language_model = Llama4ForCausalLM(
+            config.text_config if hasattr(config, "text_config") else config,
+            quant_config=quant_config,
+            prefix=add_prefix("language_model", prefix),
+        )
+
+        self.logits_processor = LogitsProcessor(
+            config.text_config if hasattr(config, "text_config") else config
+        )
+        self.padding_pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+
+    def _has_vision_weights(self, config) -> bool:
+        """Check if the model has vision components by examining the checkpoint."""
+        model_path = getattr(config, "_name_or_path", None)
+        if not model_path:
+            return False
+
+        # Check if this is a local path first
+        if os.path.isdir(model_path):
+            index_file = os.path.join(model_path, "model.safetensors.index.json")
+            if os.path.exists(index_file):
+                return self._check_vision_weights_in_index(index_file)
+
+        # For HuggingFace models, we need to check the actual checkpoint
+        # The config might say it's multimodal, but the checkpoint might be text-only
+        try:
+            # Try to access the HuggingFace cache directory
+            from huggingface_hub import try_to_load_from_cache
+
+            # Check if index file exists in cache
+            index_file_path = try_to_load_from_cache(
+                repo_id=model_path,
+                filename="model.safetensors.index.json",
+                cache_dir=None,
+            )
+            if index_file_path and os.path.exists(index_file_path):
+                return self._check_vision_weights_in_index(index_file_path)
+
+        except Exception:
+            # If we can't access the cache, fall back to config-based detection
+            pass
+
+        # Fallback, assume text-only
+        return False
+
+    def _check_vision_weights_in_index(self, index_file: str) -> bool:
+        """Check if the model.safetensors.index.json contains vision weights."""
+        try:
+            with open(index_file, "r") as f:
+                index_data = json_lib.load(f)
+
+            vision_patterns = ["vision_model", "vision_tower", "multi_modal_projector"]
+            weight_names = index_data.get("weight_map", {}).keys()
+            return any(
+                pattern in weight_name
+                for weight_name in weight_names
+                for pattern in vision_patterns
+            )
+        except (OSError, json_lib.JSONDecodeError, KeyError):
+            return False
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        return self.padding_pattern.pad_input_tokens(input_ids, mm_inputs)
+
+    def get_image_feature(
+        self,
+        items: List[MultimodalDataItem],
+    ) -> torch.Tensor:
+        # For text-only models, return None or raise an error
+        if not self.has_vision or self.vision_model is None:
+            raise ValueError("Vision model not available for text-only checkpoint")
+        pixel_values = (
+            torch.concat([item.feature for item in items])
+            .to(next(self.vision_model.parameters()).device)
+            .type(next(self.vision_model.parameters()).dtype)
+        )
+        image_features = self.vision_model(pixel_values)
+
+        vision_flat = image_features.view(-1, image_features.size(-1))
+
+        projected_vision_flat = self.multi_modal_projector(vision_flat)
+
+        return projected_vision_flat
+
+    def should_apply_lora(self, module_name: str) -> bool:
+        """Skip vision model and multi_modal_projector for LoRA."""
+        return bool(self.lora_pattern.match(module_name))
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        **kwargs: object,
+    ) -> torch.Tensor:
+
+        # For text-only models, pass None for image_data_embedding_func
+        image_embedding_func = self.get_image_feature if self.has_vision else None
+
+        hs = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.language_model,
+            data_embedding_funcs={
+                Modality.IMAGE: image_embedding_func,
+            },
+            positions=positions,
+        )
+
+        return hs
+
+    def permute_qk_weight_for_rotary(
+        self,
+        name: str,
+        loaded_weight: torch.Tensor,
+    ) -> Tuple[str, torch.Tensor]:
+
+        def permute(w: torch.Tensor, n_heads: int):
+            attn_in = self.language_model.config.head_dim * n_heads
+            attn_out = self.language_model.config.hidden_size
+
+            return (
+                w.view(n_heads, attn_in // n_heads // 2, 2, attn_out)
+                .transpose(1, 2)
+                .reshape(attn_in, attn_out)
+            )
+
+        modules = name.split(".")
+
+        # rotary embeds should be sliced
+        if ("wk" in modules or "k_proj" in modules) and modules[-1] == "weight":
+            if _is_cpu:
+                dim = self.language_model.config.original_total_num_kv_heads
+            else:
+                dim = self.language_model.config.num_key_value_heads
+            loaded_weight = permute(loaded_weight, dim)
+        elif ("wq" in modules or "q_proj" in modules) and modules[-1] == "weight":
+            if _is_cpu:
+                dim = self.language_model.config.original_num_attention_heads
+            else:
+                dim = self.language_model.config.num_attention_heads
+            loaded_weight = permute(loaded_weight, dim)
+
+        return name, loaded_weight
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".self_attn.qkv_proj", ".self_attn.q_proj", "q"),
+            (".self_attn.qkv_proj", ".self_attn.k_proj", "k"),
+            (".self_attn.qkv_proj", ".self_attn.v_proj", "v"),
+            (".shared_expert.gate_up_proj", ".shared_expert.gate_proj", 0),
+            (".shared_expert.gate_up_proj", ".shared_expert.up_proj", 1),
+            (".feed_forward.gate_up_proj", ".feed_forward.gate_proj", 0),
+            (".feed_forward.gate_up_proj", ".feed_forward.up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        num_experts = (
+            self.config.text_config.num_local_experts
+            if hasattr(self.config, "text_config")
+            else self.config.num_local_experts
+        )
+
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=num_experts,
+        )
+
+        loaded_params = set()
+
+        for name, loaded_weight in weights:
+            if self._should_skip_weight(name):
+                continue
+
+            name = self._transform_weight_name(name)
+
+            if "vision" in name:
+                name = name.replace(".self_attn.o_proj", ".self_attn.proj")
+            else:
+                name, loaded_weight = self.permute_qk_weight_for_rotary(
+                    name, loaded_weight
+                )
+
+            if self._handle_scale_remapping(name, params_dict):
+                loaded_params.add(name)
+                continue
+
+            if self._handle_stacked_params(
+                name, loaded_weight, stacked_params_mapping, params_dict, loaded_params
+            ):
+                continue
+
+            if self._handle_expert_weights(
+                name,
+                loaded_weight,
+                expert_params_mapping,
+                params_dict,
+                num_experts,
+                loaded_params,
+            ):
+                continue
+
+            loaded_params.add(name)
+            self._handle_default_weight(name, loaded_weight, params_dict)
+        unloaded_params = params_dict.keys() - loaded_params
+        if unloaded_params:
+            logger.warning(
+                f"Some weights are not initialized from checkpoints {unloaded_params}"
+            )
+
+    def _should_skip_weight(self, name: str) -> bool:
+        """Check if we should skip loading this weight."""
+        return not self.has_vision and (
+            "vision" in name or "multi_modal_projector" in name
+        )
+
+    def _transform_weight_name(self, name: str) -> str:
+        """Transform weight name by adding language_model prefix if needed."""
+        if (
+            not name.startswith("language_model.")
+            and "vision" not in name
+            and "multi_modal_projector" not in name
+        ):
+            return f"language_model.{name}"
+        return name
+
+    def _handle_scale_remapping(self, name: str, params_dict: dict) -> bool:
+        """Handle scale parameter remapping. Returns True if handled."""
+        if "scale" in name and "expert" not in name:
+            remapped_name = maybe_remap_kv_scale_name(name, params_dict)
+            return remapped_name != name
+        return False
+
+    def _handle_stacked_params(
+        self,
+        name: str,
+        loaded_weight: torch.Tensor,
+        stacked_params_mapping: list,
+        params_dict: dict,
+        loaded_params: set,
+    ) -> bool:
+        """Handle stacked parameter loading. Returns True if handled."""
+        for param_name, weight_name, shard_id in stacked_params_mapping:
+            if weight_name in name:
+                transformed_name = name.replace(weight_name, param_name)
+                loaded_params.add(transformed_name)
+                param = params_dict[transformed_name]
+                param.weight_loader(param, loaded_weight, shard_id)
+                return True
+        return False
+
+    def _handle_expert_weights(
+        self,
+        name: str,
+        loaded_weight: torch.Tensor,
+        expert_params_mapping: list,
+        params_dict: dict,
+        num_experts: int,
+        loaded_params: set,
+    ) -> bool:
+        """Handle expert weight loading for MoE (Mixture of Experts) layers.
+
+        Args:
+            name: Parameter name from the checkpoint
+            loaded_weight: The weight tensor to be loaded
+            expert_params_mapping: Mapping of parameter names to expert configurations
+            params_dict: Dictionary of model parameters
+            num_experts: Total number of experts in the MoE layer
+
+        Returns:
+            bool: True if the parameter was handled (is an expert parameter), False otherwise
+        """
+        if ".experts" not in name:
+            return False
+
+        if "experts.gate_up_proj" not in name and "experts.down_proj" not in name:
+            return self._handle_other_expert_params(
+                name, loaded_weight, expert_params_mapping, params_dict, loaded_params
+            )
+
+        if "scale" in name:
+            return self._handle_expert_scale_params(
+                name, loaded_weight, params_dict, num_experts, loaded_params
+            )
+        else:
+            return self._handle_expert_weight_params(
+                name, loaded_weight, params_dict, num_experts, loaded_params
+            )
+
+    def _handle_other_expert_params(
+        self,
+        name: str,
+        loaded_weight: torch.Tensor,
+        expert_params_mapping: list,
+        params_dict: dict,
+        loaded_params: set,
+    ) -> bool:
+        """Handle expert parameters that are not gate_up_proj or down_proj weights.
+
+        Args:
+            name: Parameter name from the checkpoint
+            loaded_weight: The weight tensor to be loaded
+            expert_params_mapping: List of tuples mapping checkpoint names to model parameters
+            params_dict: Dictionary of model parameters
+            loaded_params: Set of loaded parameter names
+
+        Returns:
+            bool: True if parameter was found and handled, False otherwise
+        """
+        for param_name, weight_name, expert_id, shard_id in expert_params_mapping:
+            if weight_name in name:
+                transformed_name = name.replace(weight_name, param_name)
+                param = params_dict[transformed_name]
+                param.weight_loader(
+                    param, loaded_weight, name, shard_id=shard_id, expert_id=expert_id
+                )
+                loaded_params.add(transformed_name)
+                return True
+        return False
+
+    def _transform_expert_name(
+        self, name: str, is_weight: bool = False
+    ) -> Tuple[str, str, List[str]]:
+        """Transform expert parameter name and get shard information.
+
+        Args:
+            name: The original parameter name
+            is_weight: Whether this is a weight parameter (adds _weight suffix)
+
+        Returns:
+            Tuple of (transformed_name, shard_id, shard_id_list)
+        """
+        suffix = "_weight" if is_weight else ""
+
+        if ".gate_up_proj" in name:
+            transformed_name = name.replace(
+                ".experts.gate_up_proj", f".experts.w13{suffix}"
+            )
+            shard_id = "w13"
+            shard_id_list = ["w1", "w3"]
+        else:  # down_proj
+            transformed_name = name.replace(
+                ".experts.down_proj", f".experts.w2{suffix}"
+            )
+            shard_id = "w2"
+            shard_id_list = ["w2"]
+
+        return transformed_name, shard_id, shard_id_list
+
+    def _handle_expert_scale_params(
+        self,
+        name: str,
+        loaded_weight: torch.Tensor,
+        params_dict: dict,
+        num_experts: int,
+        loaded_params: set,
+    ) -> bool:
+        """Handle quantization scale parameters for expert weights.
+
+        Args:
+            name: Parameter name containing scale information
+            loaded_weight: Scale tensor to be loaded
+            params_dict: Dictionary of model parameters
+            num_experts: Total number of experts for broadcast operations
+            loaded_params: Set of loaded parameter names
+
+        Returns:
+            bool: True (always handles scale parameters)
+        """
+        import re
+
+        # Check if this matches the expert parameter pattern: experts.{expert_id}.{param_name}
+        expert_match = re.search(r"experts\.(\d+)\.", name)
+
+        # Transform name
+        transformed_name, _, _ = self._transform_expert_name(name)
+
+        if transformed_name not in params_dict:
+            return True
+
+        param = params_dict[transformed_name]
+
+        # Handle scale parameters
+        if expert_match:
+            # If we have a specific expert ID, only load for that expert
+            expert_id = int(expert_match.group(1))
+            # For scale parameters, we can directly set the value
+            param.data[expert_id] = loaded_weight
+        else:
+            # No expert ID found - this is a single scale for all experts
+            # Load the same scale for all experts
+            for expert_id in range(num_experts):
+                param.data[expert_id] = loaded_weight
+        loaded_params.add(transformed_name)
+
+        return True
+
+    def _handle_expert_weight_params(
+        self,
+        name: str,
+        loaded_weight: torch.Tensor,
+        params_dict: dict,
+        num_experts: int,
+        loaded_params: set,
+    ) -> bool:
+        """Handle actual weight tensors for expert layers (gate_up_proj and down_proj).
+
+        Args:
+            name: Parameter name (should contain gate_up_proj or down_proj)
+            loaded_weight: Weight tensor(s) to be loaded
+            params_dict: Dictionary of model parameters
+            num_experts: Total number of experts for tensor distribution
+            loaded_params: Set of loaded parameter names
+
+        Returns:
+            bool: True (always handles weight parameters)
+        """
+        # Transform name and get shard info
+        transformed_name, _, shard_id_list = self._transform_expert_name(
+            name, is_weight=True
+        )
+
+        if ".gate_up_proj" in name:
+            loaded_weight_list = loaded_weight.chunk(2, dim=-1)
+        else:  # down_proj
+            loaded_weight_list = [loaded_weight]
+
+        for param_name, weight_chunk, shard_id in zip(
+            [transformed_name] * len(shard_id_list), loaded_weight_list, shard_id_list
+        ):
+            if param_name not in params_dict:
+                continue
+
+            param = params_dict[param_name]
+            weight_loader = param.weight_loader
+            loaded_params.add(param_name)
+
+            # Handle the case where loaded_weight might be a single tensor for all experts
+            if weight_chunk.dim() == 2:
+                # Single tensor case - load for all experts
+                for expert_id in range(num_experts):
+                    weight_loader(
+                        param,
+                        weight_chunk.T,
+                        param_name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+            else:
+                # Multiple experts case - load each expert's weights
+                for expert_id in range(num_experts):
+                    weight_loader(
+                        param,
+                        weight_chunk[expert_id].T,
+                        param_name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+
+        return True
+
+    def _handle_default_weight(
+        self, name: str, loaded_weight: torch.Tensor, params_dict: dict
+    ):
+        """Handle default weight loading."""
+        # Skip loading extra bias for GPTQ models
+        if name.endswith(".bias") and name not in params_dict:
+            return
+
+        param = params_dict[name]
+        weight_loader = getattr(param, "weight_loader", default_weight_loader)
+        weight_loader(param, loaded_weight)
+
+    def set_eagle3_layers_to_capture(self, layer_ids: Optional[List[int]] = None):
+        if hasattr(self.language_model, "set_eagle3_layers_to_capture"):
+            self.language_model.set_eagle3_layers_to_capture(layer_ids)
+
+    def get_embed_and_head(self):
+        # For EAGLE3, we delegate to the language model which should have this method
+        # If the language model doesn't have lm_head (like EAGLE3), we return None for head
+        embed = self.language_model.get_embed()
+        if hasattr(self.language_model, "get_embed_and_head"):
+            return self.language_model.get_embed_and_head()
+        elif hasattr(self.language_model, "lm_head"):
+            return embed, self.language_model.lm_head.weight
+        else:
+            # For EAGLE3, head might not be needed
+            return embed, None
+
+    def set_embed_and_head(self, embed, head):
+        if hasattr(self.language_model, "set_embed_and_head"):
+            return self.language_model.set_embed_and_head(embed, head)
+        else:
+            # For EAGLE3, only set embed
+            return self.language_model.set_embed(embed)
+
+    def get_embed(self):
+        return self.language_model.get_embed()
+
+    def set_embed(self, embed):
+        return self.language_model.set_embed(embed)
+
+    def get_hidden_dim(self, module_name, layer_idx):
+        # return input_dim, output_dim
+        if module_name == "qkv_proj":
+            return (
+                self.config.hidden_size,
+                self.config.head_dim
+                * (
+                    self.config.num_attention_heads
+                    + self.config.num_key_value_heads * 2
+                ),
+            )
+        elif module_name == "o_proj":
+            return (
+                self.config.head_dim * self.config.num_attention_heads,
+                self.config.hidden_size,
+            )
+        elif module_name == "gate_up_proj":
+            return self.config.hidden_size, self.config.intermediate_size * 2
+        elif module_name == "down_proj":
+            decoder_layer = self.language_model.get_layers()[layer_idx]
+            intermediate_size = decoder_layer.get_intermediate_size()
+            return intermediate_size, self.config.hidden_size
+        else:
+            raise NotImplementedError()
+
+
+EntryClass = Llama4ForConditionalGeneration
diff --git a/sglang/python/sglang/srt/models/nano_nemotron_vl.py b/sglang/python/sglang/srt/models/nano_nemotron_vl.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc140a333636abe7760f8f6a60c9648398f4dce5
--- /dev/null
+++ b/sglang/python/sglang/srt/models/nano_nemotron_vl.py
@@ -0,0 +1,224 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/nano_nemotron_vl.py
+
+import logging
+from typing import Iterable
+
+import torch
+import torch.nn as nn
+
+from sglang.srt.configs.nano_nemotron_vl import NemotronH_Nano_VL_V2_Config
+from sglang.srt.layers.activation import ReLU2
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternTokenPairs,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.nemotron_h import NemotronHForCausalLM
+from sglang.srt.models.radio import RadioModel
+from sglang.srt.multimodal.evs import EVS, EVSConfig
+from sglang.srt.utils import add_prefix
+
+logger = logging.getLogger(__name__)
+
+
+class NemotronH_Nano_VL_V2(EVS):
+    @staticmethod
+    def create_evs_config(config: NemotronH_Nano_VL_V2_Config):
+        return EVSConfig(video_pruning_rate=config.video_pruning_rate)
+
+    def __init__(
+        self,
+        config: NemotronH_Nano_VL_V2_Config,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(config)
+
+        self.downsample_ratio = config.downsample_ratio
+        self.language_model = NemotronHForCausalLM(
+            config=config.llm_config,
+            quant_config=quant_config,
+            prefix=add_prefix("language_model", prefix),
+        )
+        self.vision_model = RadioModel(config=config.create_radio_config()).to(
+            self.language_model.config.dtype
+        )
+
+        vit_hidden_size = config.vit_hidden_size
+        self.rmsnorm_hidden_size = vit_hidden_size * int(1 / self.downsample_ratio) ** 2
+        vision_projection_hidden_size = config.projector_hidden_size
+        llm_hidden_size = config.llm_config.hidden_size
+
+        self.mlp1 = nn.Sequential(
+            RMSNorm(
+                hidden_size=self.rmsnorm_hidden_size,
+                eps=1e-5,
+            ),
+            nn.Linear(
+                self.rmsnorm_hidden_size,
+                vision_projection_hidden_size,
+                bias=False,
+            ),
+            ReLU2(),
+            nn.Linear(vision_projection_hidden_size, llm_hidden_size, bias=False),
+        ).to(self.language_model.config.torch_dtype)
+        self.config = config
+
+    def pad_input_ids(self, input_ids: list[int], mm_inputs: MultimodalInputs):
+        # Get all special token IDs
+        im_start_id: int = mm_inputs.im_start_id
+        im_end_id: int = mm_inputs.im_end_id
+
+        media_token_pairs = [(im_start_id, im_end_id)]
+        helper = MultiModalityDataPaddingPatternTokenPairs(media_token_pairs)
+
+        return helper.pad_input_tokens(input_ids, mm_inputs)
+
+    def pixel_shuffle(self, x: torch.Tensor, scale_factor: float = 0.5) -> torch.Tensor:
+        n, w, h, c = x.size()
+        # N, W, H, C --> N, W, H * scale, C // scale
+        x = x.view(
+            n,
+            w,
+            int(h * scale_factor),
+            int(c / scale_factor),
+        )
+        # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
+        x = x.permute(0, 2, 1, 3).contiguous()
+        # N, H * scale, W, C // scale -->
+        # N, H * scale, W * scale, C // (scale ** 2)
+        x = x.view(
+            n,
+            int(h * scale_factor),
+            int(w * scale_factor),
+            int(c / (scale_factor * scale_factor)),
+        )
+        if self.config.ps_version != "v1":
+            x = x.permute(0, 2, 1, 3).contiguous()
+        return x
+
+    def get_input_embeddings(self):
+        return self.language_model.get_input_embeddings()
+
+    def extract_feature(self, pixel_values):
+        # Process images in a micro-batch of at most 128 frames per call
+        # This is done on purpose to ensure peak GPU ram usage of huge batch
+        # (namely for really long videos with EVS ON) won't cause any problems
+        # as we don't support chunked prefill for video media
+        micro_batch_size = 128
+        n = pixel_values.shape[0]
+        vit_embeds_list = []
+        for i in range(0, n, micro_batch_size):
+            vit_embeds = self.vision_model(pixel_values[i : i + micro_batch_size])
+            vit_embeds = vit_embeds.to(dtype=torch.bfloat16)
+            h = w = int(vit_embeds.shape[1] ** 0.5)
+            vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
+            vit_embeds = self.pixel_shuffle(
+                vit_embeds, scale_factor=self.downsample_ratio
+            )
+            vit_embeds = vit_embeds.view(-1, self.rmsnorm_hidden_size)
+            vit_embeds = self.mlp1(vit_embeds)
+            vit_embeds = vit_embeds.view(n, -1, self.rmsnorm_hidden_size)
+            vit_embeds_list.append(vit_embeds)
+        vit_embeds = torch.cat(vit_embeds_list, dim=0)
+        return vit_embeds
+
+    def get_image_feature(self, items: list[MultimodalDataItem]):
+        """
+        Projects the last hidden state from the vision model into language model space.
+
+        Returns:
+            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
+        """
+        pixel_values = torch.cat([item.feature for item in items])
+        image_features = self.extract_feature(pixel_values)
+        return image_features
+
+    def get_video_feature(self, items: list[MultimodalDataItem]):
+        """
+        Projects the last hidden state from the video model into language model space.
+
+        Returns:
+            video_features (`torch.Tensor`): Video feature tensor of shape `(num_videos, video_length, embed_dim)`).
+        """
+        pixel_values = torch.cat([item.feature for item in items])
+        video_features = self.extract_feature(pixel_values)
+        return video_features
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        get_embedding: bool = False,
+    ):
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.language_model,
+            multimodal_model=self,
+            data_embedding_funcs={
+                Modality.IMAGE: self.get_image_feature,
+                Modality.VIDEO: self.get_video_feature,
+            },
+            positions=positions,
+        )
+        return hidden_states
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
+        adapter_dict = dict(self.mlp1.named_parameters())
+
+        def is_llm(name: str) -> bool:
+            return name.startswith("language_model")
+
+        def is_adapter_weights(weight: tuple[str, torch.Tensor]):
+            return weight[0].startswith("mlp1")
+
+        def is_vision_weights(name: str) -> bool:
+            return name.startswith("vision_model.radio_model.")
+
+        # Separate weights by component
+        llm_weights = []
+        vision_weights = []
+
+        for name, w in weights:
+            if is_llm(name):
+                # Strip 'language_model.' prefix for LLM weights
+                llm_weights.append((".".join(name.split(".")[1:]), w))
+            elif is_adapter_weights((name, w)):
+                # Load vision-language adapter weights directly
+                trimmed_name = ".".join(name.split(".")[1:])
+                param = adapter_dict[trimmed_name]
+                with torch.no_grad():
+                    default_weight_loader(param, w)
+            elif is_vision_weights(name):
+                # Convert: vision_model.radio_model.* → radio_model.*
+                hf_key = name[len("vision_model.") :]  # Remove "vision_model." prefix
+                vision_weights.append((hf_key, w))
+        self.language_model.load_weights(llm_weights)
+        self.vision_model.load_weights(vision_weights)
+
+
+EntryClass = [NemotronH_Nano_VL_V2]
diff --git a/sglang/python/sglang/srt/models/nemotron_h.py b/sglang/python/sglang/srt/models/nemotron_h.py
new file mode 100644
index 0000000000000000000000000000000000000000..77c21f0e5504b28ef96afaf644e3a4af637687ef
--- /dev/null
+++ b/sglang/python/sglang/srt/models/nemotron_h.py
@@ -0,0 +1,863 @@
+# Copyright 2023-2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/nemotron_h.py
+
+"""Inference-only NemotronH model."""
+
+from collections.abc import Iterable
+from typing import Optional, Union
+
+import torch
+from torch import nn
+
+from sglang.srt.configs import NemotronHConfig
+from sglang.srt.configs.nemotron_h import ATTENTION, MAMBA, MLP, MOE
+from sglang.srt.distributed import (
+    get_moe_ep_group,
+    get_pp_group,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.layers.activation import ReLU2
+from sglang.srt.layers.attention.hybrid_linear_attn_backend import (
+    HybridLinearAttnBackend,
+    Mamba2AttnBackend,
+)
+from sglang.srt.layers.attention.mamba.mamba import MambaMixer2
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
+from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.quantization import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.utils import PPMissingLayer, get_layer_id
+from sglang.srt.layers.vocab_parallel_embedding import (
+    DEFAULT_VOCAB_PADDING_SIZE,
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+    replace_prefix,
+    replace_substrings,
+)
+from sglang.srt.models.utils import WeightsMapper
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import (
+    add_prefix,
+    get_current_device_stream_fast,
+    is_cuda,
+    make_layers,
+)
+from sglang.utils import logger
+
+_is_cuda = is_cuda()
+
+
+class NemotronHMLP(nn.Module):
+    def __init__(
+        self,
+        config: NemotronHConfig,
+        intermediate_size: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        bias: bool = False,
+        reduce_results: bool = True,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.up_proj = ColumnParallelLinear(
+            input_size=config.hidden_size,
+            output_size=intermediate_size,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            input_size=intermediate_size,
+            output_size=config.hidden_size,
+            bias=bias,
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+            prefix=f"{prefix}.down_proj",
+        )
+        self.act_fn = ReLU2()
+
+    def forward(self, x: torch.Tensor):
+        x, _ = self.up_proj(x)
+        x = self.act_fn(x)
+        x, _ = self.down_proj(x)
+        return x
+
+
+_alt_stream = None
+
+
+def _get_or_create_alt_stream(device_module):
+    global _alt_stream
+    if _alt_stream is None:
+        _alt_stream = device_module.Stream()
+    return _alt_stream
+
+
+class NemotronHMoE(nn.Module):
+    def __init__(
+        self,
+        config: NemotronHConfig,
+        layer_idx: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.routed_scaling_factor = config.routed_scaling_factor
+        self.device_module = torch.get_device_module()
+
+        self.ep_group = get_moe_ep_group().device_group
+        self.ep_rank = self.ep_group.rank()
+        self.ep_size = self.ep_group.size()
+        self.n_routed_experts = config.n_routed_experts
+        self.n_shared_experts = config.n_shared_experts
+        self.use_latent_moe = getattr(config, "moe_latent_size", None) is not None
+        self.moe_hidden_size = (
+            config.moe_latent_size if self.use_latent_moe else config.hidden_size
+        )
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            config.n_routed_experts,
+            bias=False,
+            params_dtype=torch.float32,
+            quant_config=None,
+            prefix=f"{prefix}.gate",
+        )
+        self.gate.e_score_correction_bias = nn.Parameter(
+            torch.empty(config.n_routed_experts, dtype=torch.float32)
+        )
+
+        self.topk = TopK(
+            top_k=config.num_experts_per_tok,
+            use_grouped_topk=True,
+            topk_group=config.topk_group,
+            num_expert_group=config.n_group,
+            renormalize=config.norm_topk_prob,
+            scoring_func="sigmoid",
+            correction_bias=self.gate.e_score_correction_bias,
+            routed_scaling_factor=1.0,
+        )
+        self.experts = get_moe_impl_class(quant_config)(
+            num_experts=config.n_routed_experts
+            + get_global_server_args().ep_num_redundant_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=self.moe_hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            reduce_results=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.experts",
+            activation=config.mlp_hidden_act,
+            layer_id=layer_idx,
+            is_gated=False,
+        )
+        if config.n_shared_experts:
+            self.shared_experts = NemotronHMLP(
+                config,
+                intermediate_size=config.moe_shared_expert_intermediate_size
+                * config.n_shared_experts,
+                quant_config=quant_config,
+                reduce_results=False,
+                prefix=f"{prefix}.shared_experts",
+            )
+        else:
+            self.shared_experts = None
+
+        if self.use_latent_moe:
+            self.fc1_latent_proj = ReplicatedLinear(
+                input_size=config.hidden_size,
+                output_size=self.moe_hidden_size,
+                bias=config.mlp_bias,
+                quant_config=quant_config,
+                prefix=f"{prefix}.fc1_latent_proj",
+            )
+            self.fc2_latent_proj = ReplicatedLinear(
+                input_size=self.moe_hidden_size,
+                output_size=config.hidden_size,
+                bias=config.mlp_bias,
+                quant_config=quant_config,
+                prefix=f"{prefix}.fc2_latent_proj",
+            )
+        else:
+            self.fc1_latent_proj = None
+            self.fc2_latent_proj = None
+
+    def _forward_core(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        if _is_cuda:
+            return self._forward_core_shared_routed_overlap(hidden_states)
+        else:
+            return self._forward_core_normal(hidden_states)
+
+    def _forward_core_normal(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        # router_scores: [num_tokens, num_experts]
+        router_logits, _ = self.gate(hidden_states.to(dtype=torch.float32))
+        if self.shared_experts is not None:
+            shared_output = self.shared_experts(hidden_states)
+        else:
+            shared_output = None
+        topk_output = self.topk(hidden_states, router_logits)
+        if self.use_latent_moe:
+            hidden_states, _ = self.fc1_latent_proj(hidden_states)
+        final_hidden_states = self.experts(hidden_states, topk_output)
+        return final_hidden_states, shared_output
+
+    def _forward_core_shared_routed_overlap(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        alt_stream = _get_or_create_alt_stream(self.device_module)
+
+        alt_stream.wait_stream(get_current_device_stream_fast())
+
+        if self.shared_experts is not None:
+            shared_output = self.shared_experts(hidden_states)
+        else:
+            shared_output = None
+
+        with self.device_module.stream(alt_stream):
+            # router_scores: [num_tokens, num_experts]
+            router_logits, _ = self.gate(hidden_states.to(dtype=torch.float32))
+            topk_output = self.topk(hidden_states, router_logits)
+            if self.use_latent_moe:
+                hidden_states, _ = self.fc1_latent_proj(hidden_states)
+            final_hidden_states = self.experts(hidden_states, topk_output)
+        get_current_device_stream_fast().wait_stream(alt_stream)
+
+        return final_hidden_states, shared_output
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        num_tokens, hidden_dim = hidden_states.shape
+        final_hidden_states, shared_output = self._forward_core(hidden_states)
+
+        # Fix FP16 overflow
+        if hidden_states.dtype != torch.float16:
+            final_hidden_states *= self.routed_scaling_factor
+        elif self.shared_experts is not None:
+            assert shared_output is not None
+            shared_output *= 1.0 / self.routed_scaling_factor
+
+        if self.use_latent_moe:
+            final_hidden_states, _ = self.fc2_latent_proj(final_hidden_states)
+
+        if shared_output is not None:
+            final_hidden_states += shared_output
+
+        if self.tp_size > 1:
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+
+        return final_hidden_states.view(num_tokens, hidden_dim)
+
+
+class NemotronHMLPDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: NemotronHConfig,
+        layer_idx: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+
+        hybrid_override_pattern = config.hybrid_override_pattern
+        mlp_index = hybrid_override_pattern[: layer_idx + 1].count("-") - 1
+        if isinstance(config.intermediate_size, list):
+            if len(config.intermediate_size) == 1:
+                intermediate_size = config.intermediate_size[0]
+            else:
+                intermediate_size = config.intermediate_size[mlp_index]
+        else:
+            intermediate_size = config.intermediate_size
+
+        self.mixer = NemotronHMLP(
+            config,
+            intermediate_size=intermediate_size,
+            quant_config=quant_config,
+            bias=config.mlp_bias,
+            prefix=f"{prefix}.mixer",
+        )
+
+        self.norm = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
+
+    def forward(
+        self,
+        *,
+        hidden_states: torch.Tensor,
+        residual: Optional[torch.Tensor],
+        forward_batch: ForwardBatch,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.norm(hidden_states)
+        else:
+            hidden_states, residual = self.norm(hidden_states, residual)
+
+        hidden_states = self.mixer.forward(hidden_states)
+        return hidden_states, residual
+
+
+class NemotronHMoEDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: NemotronHConfig,
+        layer_idx: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.mixer = NemotronHMoE(
+            config,
+            layer_idx=layer_idx,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mixer",
+        )
+
+        self.norm = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
+
+    def forward(
+        self,
+        *,
+        hidden_states: torch.Tensor,
+        residual: Optional[torch.Tensor],
+        forward_batch: ForwardBatch,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.norm(hidden_states)
+        else:
+            hidden_states, residual = self.norm(hidden_states, residual)
+
+        hidden_states = self.mixer.forward(hidden_states)
+        return hidden_states, residual
+
+
+class NemotronHMambaDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: NemotronHConfig,
+        layer_idx: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.layer_id = layer_idx
+        self.mixer = MambaMixer2(
+            cache_params=config.mamba2_cache_params,
+            hidden_size=config.hidden_size,
+            use_conv_bias=config.use_conv_bias,
+            use_bias=config.use_bias,
+            n_groups=config.mamba_n_groups,
+            rms_norm_eps=config.layer_norm_epsilon,
+            activation=config.mamba_hidden_act,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mixer",
+        )
+
+        self.norm = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
+
+    def forward(
+        self,
+        *,
+        hidden_states: torch.Tensor,
+        residual: Optional[torch.Tensor],
+        forward_batch: ForwardBatch,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.norm(hidden_states)
+        else:
+            hidden_states, residual = self.norm(hidden_states, residual)
+
+        output = torch.empty_like(hidden_states)
+        attn_backend = forward_batch.attn_backend
+        assert isinstance(attn_backend, HybridLinearAttnBackend)
+        assert isinstance(attn_backend.linear_attn_backend, Mamba2AttnBackend)
+        attn_backend.linear_attn_backend.forward(
+            mixer=self.mixer,
+            layer_id=self.layer_id,
+            hidden_states=hidden_states,
+            output=output,
+            use_triton_causal_conv=True,  # TODO: investigate need of `use_triton_causal_conv`
+        )
+        return output, residual
+
+
+class NemotronHAttention(nn.Module):
+    def __init__(
+        self,
+        config: NemotronHConfig,
+        layer_idx: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = config.num_attention_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = config.num_key_value_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        if hasattr(config, "head_dim") and config.head_dim is not None:
+            self.head_dim = config.head_dim
+        else:
+            self.head_dim = config.hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+
+        self.qkv_proj = QKVParallelLinear(
+            config.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_idx,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        attn_output = self.attn.forward(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class NemotronHAttentionDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: NemotronHConfig,
+        layer_idx: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.mixer = NemotronHAttention(
+            config,
+            layer_idx,
+            quant_config,
+            prefix=f"{prefix}.mixer",
+        )
+
+        self.norm = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
+
+    def forward(
+        self,
+        *,
+        hidden_states: torch.Tensor,
+        residual: Optional[torch.Tensor],
+        forward_batch: ForwardBatch,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.norm(hidden_states)
+        else:
+            hidden_states, residual = self.norm(hidden_states, residual)
+
+        hidden_states = self.mixer.forward(
+            hidden_states=hidden_states, forward_batch=forward_batch
+        )
+        return hidden_states, residual
+
+
+Layers = (
+    NemotronHAttentionDecoderLayer
+    | NemotronHMLPDecoderLayer
+    | NemotronHMambaDecoderLayer
+    | NemotronHMoEDecoderLayer
+)
+ALL_DECODER_LAYER_TYPES: dict[str, type[Layers]] = {
+    ATTENTION: NemotronHAttentionDecoderLayer,
+    MLP: NemotronHMLPDecoderLayer,
+    MAMBA: NemotronHMambaDecoderLayer,
+    MOE: NemotronHMoEDecoderLayer,
+}
+
+
+class NemotronHModel(nn.Module):
+    def __init__(
+        self,
+        *,
+        config: NemotronHConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        lora_config = None
+        self.config = config
+        lora_vocab = (
+            (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1))
+            if lora_config
+            else 0
+        )
+        self.vocab_size = config.vocab_size + lora_vocab
+        self.org_vocab_size = config.vocab_size
+        self.pp_group = get_pp_group()
+
+        if self.pp_group.is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                self.vocab_size,
+                config.hidden_size,
+                org_num_embeddings=config.vocab_size,
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        def get_layer(idx: int, prefix: str):
+            layer_class = ALL_DECODER_LAYER_TYPES[config.hybrid_override_pattern[idx]]
+            return layer_class(config, idx, quant_config=quant_config, prefix=prefix)
+
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            len(config.hybrid_override_pattern),
+            get_layer,
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix=f"{prefix}.layers",
+        )
+        if self.pp_group.is_last_rank:
+            self.norm_f = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
+        else:
+            self.norm_f = PPMissingLayer(return_tuple=True)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, PPProxyTensors]:
+        if self.pp_group.is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds
+            else:
+                hidden_states = self.embed_tokens(input_ids)
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+
+        for i in range(self.start_layer, self.end_layer):
+            layer = self.layers[i]
+            if not isinstance(layer, Layers):
+                raise ValueError(f"Unknown layer type: {type(layer)}")
+            hidden_states, residual = layer.forward(
+                hidden_states=hidden_states,
+                residual=residual,
+                forward_batch=forward_batch,
+            )
+
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
+        hidden_states, _ = self.norm_f(hidden_states, residual)
+        return hidden_states
+
+
+class NemotronHForCausalLM(nn.Module):
+    stacked_params_mapping = [
+        # (param_name, shard_name, shard_id)
+        ("qkv_proj", "q_proj", "q"),
+        ("qkv_proj", "k_proj", "k"),
+        ("qkv_proj", "v_proj", "v"),
+    ]
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+    }
+
+    remap_prefix = {"backbone": "model"}
+    remap_substr = {
+        "A_log": "A",
+        "embeddings": "embed_tokens",
+        "k_proj.k_scale": "attn.k_scale",
+        "v_proj.v_scale": "attn.v_scale",
+    }
+
+    hf_to_sglang_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            "backbone.": "model.",
+        }
+    )
+
+    def __init__(
+        self,
+        *,
+        config: NemotronHConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        lora_config = None
+        self.config = config
+        self.quant_config = quant_config
+        self.model = self._init_model(
+            config=config, quant_config=quant_config, prefix=prefix
+        )
+        self.pp_group = get_pp_group()
+
+        if self.pp_group.is_last_rank:
+            if self.pp_group.world_size == 1 and self.config.tie_word_embeddings:
+                self.lm_head = self.model.embed_tokens
+            else:
+                self.unpadded_vocab_size = config.vocab_size
+                if lora_config:
+                    self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
+                self.lm_head = ParallelLMHead(
+                    self.unpadded_vocab_size,
+                    config.hidden_size,
+                    org_num_embeddings=config.vocab_size,
+                    padding_size=(
+                        DEFAULT_VOCAB_PADDING_SIZE
+                        # We need bigger padding if using lora for kernel
+                        # compatibility
+                        if not lora_config
+                        else lora_config.lora_vocab_padding_size
+                    ),
+                    quant_config=quant_config,
+                    prefix=add_prefix("lm_head", prefix),
+                )
+        else:
+            self.lm_head = PPMissingLayer()
+
+        if self.pp_group.world_size > 1 and self.config.tie_word_embeddings:
+            if self.pp_group.is_first_rank:
+                self.pp_group.send(
+                    self.model.embed_tokens.weight, dst=self.pp_group.last_rank
+                )
+            elif self.pp_group.is_last_rank:
+                emb_token_weight = self.pp_group.recv(
+                    size=self.lm_head.weight.shape,
+                    dtype=next(self.model.parameters()).dtype,
+                    src=self.pp_group.first_rank,
+                )
+                self.lm_head.weight.copy_(emb_token_weight)
+
+        self.logits_processor = LogitsProcessor(config)
+
+    def _init_model(
+        self,
+        config: NemotronHConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        return NemotronHModel(
+            config=config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+
+    def get_input_embeddings(self) -> VocabParallelEmbedding:
+        return self.model.embed_tokens
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: Optional[torch.Tensor] = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ):
+        hidden_states = self.model.forward(
+            input_ids, positions, forward_batch, pp_proxy_tensors, input_embeds
+        )
+        if self.pp_group.is_last_rank:
+            return self.logits_processor(
+                input_ids, hidden_states, self.lm_head, forward_batch
+            )
+        else:
+            return hidden_states
+
+    def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs):
+        return self.mamba_cache.copy_inputs_before_cuda_graphs(input_buffers, **kwargs)
+
+    def get_seqlen_agnostic_capture_inputs(self, batch_size: int):
+        return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size)
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        del self.lm_head.weight
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    def load_weights(
+        self, weights: Iterable[tuple[str, torch.Tensor]], is_mtp: bool = False
+    ) -> None:
+        updated_weights = []
+        for name, loaded_weight in weights:
+            name = replace_prefix(name, self.remap_prefix)
+            name = replace_substrings(name, self.remap_substr)
+            updated_weights.append((name, loaded_weight))
+
+        # - FusedMoe.w1 (aka gate_proj) should be up_proj since that's
+        #   what the activation is applied to
+        # - FusedMoe.w3 (aka up_proj) should be ignored since we're
+        #   using non-gated MoE
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="up_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="",
+            num_experts=self.config.n_routed_experts,
+        )
+
+        params_dict = dict(self.named_parameters())
+
+        for name, loaded_weight in updated_weights:
+            if is_mtp:
+                if "mtp" not in name:
+                    continue
+
+                name = name.replace("mtp.layers.", "model.layers.")
+
+                if "embeddings" in name:
+                    name = name.replace("embeddings", "model.embed_tokens")
+                    if name.startswith("backbone."):
+                        name = name.replace("backbone.", "")
+
+            if not is_mtp and "mtp" in name:
+                continue
+
+            if "scale" in name:
+                if name not in params_dict:
+                    name = maybe_remap_kv_scale_name(name, params_dict)
+                    if name is None:
+                        continue
+
+            layer_id = get_layer_id(name)
+            if (
+                layer_id is not None
+                and hasattr(self.model, "start_layer")
+                and (
+                    layer_id < self.model.start_layer
+                    or layer_id >= self.model.end_layer
+                )
+            ):
+                continue
+
+            if "embed_tokens" in name and not self.pp_group.is_first_rank:
+                continue
+
+            if (
+                "norm_f" in name or "lm_head" in name
+            ) and not self.pp_group.is_last_rank:
+                continue
+
+            for param_name, weight_name, shard_id in self.stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                is_expert_weight = False
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    is_expert_weight = True
+                    name_mapped = name.replace(weight_name, param_name)
+                    param = params_dict[name_mapped]
+                    param.weight_loader(
+                        param,
+                        loaded_weight,
+                        name_mapped,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    name = name_mapped
+                    break
+                else:
+                    if is_expert_weight:
+                        continue
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    if name in params_dict.keys():
+                        param = params_dict[name]
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        weight_loader(param, loaded_weight)
+                    else:
+                        logger.warning(f"Parameter {name} not found in params_dict")
+
+
+EntryClass = [NemotronHForCausalLM]
diff --git a/sglang/python/sglang/srt/models/nemotron_h_mtp.py b/sglang/python/sglang/srt/models/nemotron_h_mtp.py
new file mode 100644
index 0000000000000000000000000000000000000000..dabd4b4ae86a24cb31b63de5fe60af55506e8528
--- /dev/null
+++ b/sglang/python/sglang/srt/models/nemotron_h_mtp.py
@@ -0,0 +1,340 @@
+# Copyright 2023-2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from collections.abc import Iterable
+
+import torch
+from torch import nn
+
+from sglang.srt.configs import NemotronHConfig
+from sglang.srt.distributed import get_pp_group
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import ColumnParallelLinear
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.nemotron_h import (
+    NemotronHAttentionDecoderLayer,
+    NemotronHForCausalLM,
+    NemotronHMoEDecoderLayer,
+)
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import add_prefix
+
+
+class NemotronHMTPAttentionDecoderLayer(NemotronHAttentionDecoderLayer):
+    def __init__(
+        self,
+        config: NemotronHConfig,
+        layer_idx: int,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        has_start_projections: bool = False,
+        has_end_norm: bool = False,
+    ) -> None:
+        super().__init__(
+            config=config,
+            layer_idx=layer_idx,
+            quant_config=quant_config,
+            prefix=prefix,
+        )
+        self.has_start_projections = has_start_projections
+        self.has_end_norm = has_end_norm
+
+        if has_start_projections:
+            self.enorm = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
+            self.hnorm = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
+
+            # Fusion layer to combine embeddings with target hidden states
+            self.eh_proj = ColumnParallelLinear(
+                input_size=config.hidden_size * 2,
+                output_size=config.hidden_size,
+                bias=False,
+                gather_output=True,
+                params_dtype=(
+                    config.dtype if hasattr(config, "dtype") else torch.bfloat16
+                ),
+                quant_config=quant_config,
+                prefix=f"{prefix}.eh_proj",
+            )
+
+        if has_end_norm:
+            self.final_layernorm = RMSNorm(
+                config.hidden_size,
+                eps=getattr(config, "layer_norm_epsilon", 1e-5),
+            )
+
+    def forward(
+        self,
+        *,
+        inputs_embeds: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor | None = None,
+        forward_batch: ForwardBatch,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        if self.has_start_projections:
+            inputs_embeds_normed = self.enorm(inputs_embeds)
+            previous_hidden_states_normed = self.hnorm(hidden_states)
+
+            fused = torch.cat(
+                [inputs_embeds_normed, previous_hidden_states_normed], dim=-1
+            )
+            hidden_states, _ = self.eh_proj(fused)
+
+        hidden_states, residual = super().forward(
+            hidden_states=hidden_states,
+            residual=residual,
+            forward_batch=forward_batch,
+        )
+
+        if self.has_end_norm:
+            if residual is not None:
+                hidden_states = hidden_states + residual
+                residual = None
+
+            hidden_states = self.final_layernorm(hidden_states)
+
+        return hidden_states, residual
+
+
+class NemotronHMTPMoEDecoderLayer(NemotronHMoEDecoderLayer):
+    def __init__(
+        self,
+        config: NemotronHConfig,
+        layer_idx: int,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+        has_start_projections: bool = False,
+        has_end_norm: bool = False,
+    ) -> None:
+        super().__init__(
+            config=config,
+            layer_idx=layer_idx,
+            quant_config=quant_config,
+            prefix=prefix,
+        )
+        self.has_start_projections = has_start_projections
+        self.has_end_norm = has_end_norm
+
+        if has_start_projections:
+            self.enorm = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
+            self.hnorm = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
+
+            self.eh_proj = ColumnParallelLinear(
+                input_size=config.hidden_size * 2,
+                output_size=config.hidden_size,
+                bias=False,
+                gather_output=True,
+                params_dtype=(
+                    config.dtype if hasattr(config, "dtype") else torch.bfloat16
+                ),
+                quant_config=quant_config,
+                prefix=f"{prefix}.eh_proj",
+            )
+
+        if has_end_norm:
+            self.final_layernorm = RMSNorm(
+                config.hidden_size,
+                eps=getattr(config, "layer_norm_epsilon", 1e-5),
+            )
+
+    def forward(
+        self,
+        *,
+        inputs_embeds: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: torch.Tensor | None = None,
+        forward_batch: ForwardBatch,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        if self.has_start_projections:
+            inputs_embeds_normed = self.enorm(inputs_embeds)
+            previous_hidden_states_normed = self.hnorm(hidden_states)
+
+            fused = torch.cat(
+                [inputs_embeds_normed, previous_hidden_states_normed], dim=-1
+            )
+            hidden_states, _ = self.eh_proj(fused)
+
+        hidden_states, residual = super().forward(
+            hidden_states=hidden_states,
+            residual=residual,
+            forward_batch=forward_batch,
+        )
+
+        if self.has_end_norm:
+            if residual is not None:
+                hidden_states = hidden_states + residual
+                residual = None
+
+            hidden_states = self.final_layernorm(hidden_states)
+
+        return hidden_states, residual
+
+
+class NemotronHMultiTokenPredictor(nn.Module):
+    def __init__(
+        self,
+        config: NemotronHConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        self.vocab_size = config.vocab_size
+        self.org_vocab_size = config.vocab_size
+
+        self.mtp_start_layer_idx = config.num_hidden_layers
+        self.num_mtp_layers = getattr(config, "num_nextn_predict_layers", 1)
+        assert (
+            self.num_mtp_layers == 1
+        ), "Only one MTP layer is supported for NemotronH-MTP"
+
+        self.pattern_str = config.mtp_hybrid_override_pattern
+        self.pattern_len = len(self.pattern_str)
+        assert self.pattern_len > 0
+
+        self.embed_tokens = VocabParallelEmbedding(
+            self.vocab_size,
+            config.hidden_size,
+        )
+
+        # Build flat list of layers
+        self.layers = nn.ModuleDict()
+
+        # Total number of physical layers = num_steps * pattern_len
+        total_layers = self.num_mtp_layers * self.pattern_len
+        for i in range(total_layers):
+            step_rel_idx = i % self.pattern_len
+
+            char = self.pattern_str[step_rel_idx]
+
+            is_start_of_step = step_rel_idx == 0
+            is_end_of_step = step_rel_idx == self.pattern_len - 1
+
+            layer_prefix = f"{prefix}.layers.{i}"
+
+            common_kwargs = dict(
+                config=config,
+                layer_idx=i,
+                quant_config=quant_config,
+                prefix=layer_prefix,
+                has_start_projections=is_start_of_step,
+                has_end_norm=is_end_of_step,
+            )
+
+            if char == "*":
+                self.layers[str(i)] = NemotronHMTPAttentionDecoderLayer(**common_kwargs)
+            elif char == "E":
+                self.layers[str(i)] = NemotronHMTPMoEDecoderLayer(**common_kwargs)
+            else:
+                raise NotImplementedError(
+                    f"Pattern char '{char}' in {self.pattern_str} not implemented"
+                )
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        assert (
+            self.embed_tokens is not None
+        ), "embed_tokens not initialized - must be shared from target model"
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings(input_ids)
+
+        residual = None
+
+        for i in range(self.pattern_len):
+            hidden_states, residual = self.layers[str(i)](
+                inputs_embeds=inputs_embeds,
+                hidden_states=hidden_states,
+                residual=residual,
+                forward_batch=forward_batch,
+            )
+        return hidden_states
+
+
+class NemotronHForCausalLMMTP(NemotronHForCausalLM):
+    def __init__(
+        self,
+        config: NemotronHConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        nn.Module.__init__(self)
+        self.config = config
+        self.quant_config = quant_config
+        # Required for parent's load_weights
+        self.pp_group = get_pp_group()
+
+        # Override config for MTP pattern (which has no Mamba layers)
+        config.num_hidden_layers = len(config.mtp_hybrid_override_pattern)
+        # Set hybrid_override_pattern to MTP pattern so attention backend
+        # doesn't use Mamba2AttnBackend (MTP has no Mamba layers)
+        config.hybrid_override_pattern = config.mtp_hybrid_override_pattern
+
+        self.model = NemotronHMultiTokenPredictor(
+            config=config,
+            quant_config=quant_config,
+            prefix=add_prefix("mtp", prefix),
+        )
+
+        self.lm_head = ParallelLMHead(
+            self.config.vocab_size,
+            self.config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("lm_head", prefix),
+            use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
+        )
+
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor | None = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        hidden_states = forward_batch.spec_info.hidden_states
+
+        hidden_states = self.model(
+            input_ids,
+            hidden_states,
+            forward_batch,
+            input_embeds,
+        )
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(
+        self, weights: Iterable[tuple[str, torch.Tensor]], is_mtp: bool = False
+    ):
+        super().load_weights(weights, is_mtp=True)
+
+
+EntryClass = [NemotronHForCausalLMMTP]
diff --git a/sglang/python/sglang/srt/models/nemotron_nas.py b/sglang/python/sglang/srt/models/nemotron_nas.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac1ccd231dc3ffb7378ee4e30bfcba1b3bc61bf8
--- /dev/null
+++ b/sglang/python/sglang/srt/models/nemotron_nas.py
@@ -0,0 +1,436 @@
+# Copyright 2023-2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/nemotron_nas.py
+
+"""Inference-only deci model compatible with HuggingFace weights."""
+
+from typing import Iterable, Optional, Tuple, Type, Union
+
+import torch
+from torch import nn
+from transformers import LlamaConfig
+
+from sglang.srt.distributed import get_pp_group
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization import QuantizationConfig
+from sglang.srt.layers.utils import PPMissingLayer
+from sglang.srt.layers.vocab_parallel_embedding import (
+    DEFAULT_VOCAB_PADDING_SIZE,
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.models.llama import LlamaAttention, LlamaMLP
+from sglang.srt.utils import add_prefix, make_layers
+from sglang.utils import logger
+
+
+def _ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int:
+    # DeciLM-specific code
+    intermediate_size = int(2 * ffn_mult * n_embd / 3)
+    return _find_multiple(intermediate_size, 256)
+
+
+def _find_multiple(n: int, k: int) -> int:
+    # DeciLM-specific code
+    if n % k == 0:
+        return n
+    return n + k - (n % k)
+
+
+class DeciLMDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: LlamaConfig,
+        layer_idx: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        block_config = config.block_configs[layer_idx]
+        self._is_no_op_attention = block_config.attention.no_op
+        self._is_no_op_ffn = block_config.ffn.no_op
+
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        if rope_scaling is not None and getattr(
+            config, "original_max_position_embeddings", None
+        ):
+            rope_scaling["original_max_position_embeddings"] = (
+                config.original_max_position_embeddings
+            )
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        # Support abacusai/Smaug-72B-v0.1 with attention_bias
+        # Support internlm/internlm-7b with bias
+        rope_is_neox_style = getattr(config, "rope_is_neox_style", True)
+        attention_bias = getattr(config, "attention_bias", False) or getattr(
+            config, "bias", False
+        )
+        # support internlm/internlm3-8b with qkv_bias
+        if hasattr(config, "qkv_bias"):
+            attention_bias = config.qkv_bias
+
+        if not self._is_no_op_attention:
+            num_kv_heads = (
+                config.num_attention_heads // block_config.attention.n_heads_in_group
+            )
+            self.self_attn = LlamaAttention(
+                config=config,
+                hidden_size=self.hidden_size,
+                num_heads=config.num_attention_heads,
+                num_kv_heads=num_kv_heads,
+                layer_id=layer_idx,
+                rope_theta=rope_theta,
+                rope_scaling=rope_scaling,
+                rope_is_neox_style=rope_is_neox_style,
+                max_position_embeddings=max_position_embeddings,
+                quant_config=quant_config,
+                prefix=add_prefix("self_attn", prefix),
+                bias=attention_bias,
+            )
+            self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        if not self._is_no_op_ffn:
+            ffn_mult = block_config.ffn.ffn_mult
+            intermediate_size = _ffn_mult_to_intermediate_size(
+                ffn_mult, config.hidden_size
+            )
+            self.mlp = LlamaMLP(
+                hidden_size=self.hidden_size,
+                intermediate_size=intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+            )
+            self.post_attention_layernorm = RMSNorm(
+                config.hidden_size, eps=config.rms_norm_eps
+            )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+
+        if self._is_no_op_attention:
+            pass
+        else:
+            if residual is None:
+                residual = hidden_states
+                hidden_states = self.input_layernorm(hidden_states)
+            else:
+                hidden_states, residual = self.input_layernorm(hidden_states, residual)
+            hidden_states = self.self_attn(
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+            )
+
+        # Fully Connected
+        if not self._is_no_op_ffn:
+            hidden_states, residual = self.post_attention_layernorm(
+                hidden_states, residual
+            )
+            hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+class DeciModel(nn.Module):
+    def __init__(
+        self,
+        *,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        layer_type: Type[DeciLMDecoderLayer] = DeciLMDecoderLayer,
+    ):
+        super().__init__()
+
+        lora_config = None
+        self.config = config
+        self.quant_config = quant_config
+        self.padding_idx = config.pad_token_id
+        lora_vocab = (
+            (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1))
+            if lora_config
+            else 0
+        )
+        vocab_size = config.vocab_size + lora_vocab
+        if get_pp_group().is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                vocab_size,
+                config.hidden_size,
+                org_num_embeddings=config.vocab_size,
+                quant_config=quant_config,
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        def get_layer(idx: int, prefix: str):
+            return layer_type(
+                config,
+                layer_idx=idx,
+                quant_config=quant_config,
+                prefix=prefix,
+            )
+
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            get_layer,
+            pp_rank=get_pp_group().rank_in_group,
+            pp_size=get_pp_group().world_size,
+            prefix=add_prefix("layers", prefix),
+        )
+        if get_pp_group().is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer(return_tuple=True)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor],
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[torch.Tensor, PPProxyTensors]:
+        if get_pp_group().is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds
+            else:
+                hidden_states = self.get_input_embeddings(input_ids)
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+
+        kv_cache_index = 0
+        for i in range(self.start_layer, self.end_layer):
+            layer = self.layers[i]
+            if not layer._is_no_op_attention:
+                hidden_states, residual = layer(
+                    positions, hidden_states, forward_batch, residual
+                )
+                kv_cache_index += 1
+            else:
+                hidden_states, residual = layer(
+                    positions, hidden_states, forward_batch, residual
+                )
+
+        if not get_pp_group().is_last_rank:
+            return PPProxyTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
+
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class DeciLMForCausalLM(nn.Module):
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
+
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "qkv_proj",
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+        "embed_tokens",
+        "lm_head",
+    ]
+    embedding_modules = {
+        "embed_tokens": "input_embeddings",
+        "lm_head": "output_embeddings",
+    }
+    embedding_padding_modules = ["lm_head"]
+
+    # Mistral/Llama models can also be loaded with --load-format mistral
+    # from consolidated.safetensors checkpoints
+    mistral_mapping = {
+        "layers": "model.layers",
+        "attention": "self_attn",
+        "wq": "q_proj",
+        "wk": "k_proj",
+        "wv": "v_proj",
+        "wo": "o_proj",
+        "attention_norm": "input_layernorm",
+        "feed_forward": "mlp",
+        "w1": "gate_proj",
+        "w2": "down_proj",
+        "w3": "up_proj",
+        "ffn_norm": "post_attention_layernorm",
+        "tok_embeddings": "model.embed_tokens",
+        "output": "lm_head",
+        "norm": "model.norm",
+    }
+
+    def __init__(
+        self,
+        *,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        lora_config = None
+        self.config = config
+        self.lora_config = lora_config
+
+        self.model = self._init_model(
+            config=config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        if self.config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.unpadded_vocab_size = config.vocab_size
+            if lora_config:
+                self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
+            self.lm_head = ParallelLMHead(
+                self.unpadded_vocab_size,
+                config.hidden_size,
+                org_num_embeddings=config.vocab_size,
+                padding_size=(
+                    DEFAULT_VOCAB_PADDING_SIZE
+                    # We need bigger padding if using lora for kernel
+                    # compatibility
+                    if not lora_config
+                    else lora_config.lora_vocab_padding_size
+                ),
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+            )
+        self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+
+    def _init_model(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        return DeciModel(config=config, quant_config=quant_config, prefix=prefix)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        get_embedding: bool = False,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> LogitsProcessorOutput:
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            inputs_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+        if get_pp_group().is_last_rank:
+            if not get_embedding:
+                return self.logits_processor(
+                    input_ids, hidden_states, self.lm_head, forward_batch
+                )
+            else:
+                return self.pooler(hidden_states, forward_batch)
+        else:
+            return hidden_states
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> None:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+            if self.model.quant_config is not None and (
+                scale_name := self.model.quant_config.get_cache_scale(name)
+            ):
+                # Loading kv cache quantization scales
+                param = params_dict[scale_name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                loaded_weight = (
+                    loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+                )
+                weight_loader(param, loaded_weight)
+                continue
+            if "scale" in name:
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name in params_dict.keys():
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+                else:
+                    logger.warning(f"Parameter {name} not found in params_dict")
+
+
+EntryClass = [DeciLMForCausalLM]
diff --git a/sglang/python/sglang/srt/models/nvila.py b/sglang/python/sglang/srt/models/nvila.py
new file mode 100644
index 0000000000000000000000000000000000000000..964955b004db7dfb4315544ed0cbc5ff083c9c00
--- /dev/null
+++ b/sglang/python/sglang/srt/models/nvila.py
@@ -0,0 +1,355 @@
+import itertools
+import math
+from collections.abc import Iterable
+from typing import Any
+
+import einops
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_outputs import BaseModelOutputWithPooling
+from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
+from transformers.models.siglip import SiglipVisionConfig, SiglipVisionModel
+
+import sglang.srt.managers.mm_utils as mm_utils
+import sglang.srt.model_loader.weight_utils as weight_utils
+import sglang.srt.utils as utils
+from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.mm_utils import MultiModalityDataPaddingPatternMultimodalTokens
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.qwen2 import Qwen2ForCausalLM
+
+MM_HIDDEN_SIZE = 3456
+
+
+class NVILAConfig(PretrainedConfig):
+    model_type = "nvila"
+    sub_configs = {
+        "text_config": Qwen2Config,
+        "vision_config": SiglipVisionConfig,
+    }
+    _auto_class = "AutoConfig"
+
+    def __init__(
+        self,
+        *,
+        text_config: dict[str, Any] | None = None,
+        vision_config: dict[str, Any] | None = None,
+        image_token_id: int | None = None,
+        video_token_id: int | None = None,
+        **kwargs,
+    ):
+        self.text_config = (
+            Qwen2Config(**text_config) if text_config is not None else Qwen2Config()
+        )
+        self.vision_config = (
+            SiglipVisionConfig(**vision_config)
+            if vision_config is not None
+            else SiglipVisionConfig()
+        )
+
+        self.image_token_id = image_token_id if image_token_id is not None else -1
+        self.video_token_id = video_token_id if video_token_id is not None else -1
+
+        super().__init__(**kwargs)
+
+
+class NVILAMultiModalProjectorDownsampleBlock(nn.Module):
+    def forward(self, x: Tensor) -> Tensor:
+        batch_size, sequence_length, hidden_size = x.shape
+
+        feat_size = math.isqrt(sequence_length)
+
+        features = x.reshape(batch_size, feat_size, feat_size, hidden_size)
+
+        pad_after = feat_size % 2
+        if pad_after > 0:
+            features = F.pad(features, (0, 0, 0, pad_after, 0, pad_after))
+            feat_size = feat_size + pad_after
+
+        features = features.reshape(
+            batch_size, feat_size // 2, 2, feat_size // 2, 2, hidden_size
+        )
+        features = features.permute(0, 1, 3, 2, 4, 5).contiguous()
+        features = features.reshape(batch_size, -1, 4 * hidden_size)
+
+        return features
+
+
+class NVILAMultiModalProjector(nn.Module):
+    def __init__(self, config: NVILAConfig):
+        super().__init__()
+
+        self.layers = nn.Sequential(
+            NVILAMultiModalProjectorDownsampleBlock(),
+            nn.LayerNorm(MM_HIDDEN_SIZE * 4),
+            nn.Linear(MM_HIDDEN_SIZE * 4, config.text_config.hidden_size),
+            nn.GELU(),
+            nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size),
+        )
+
+    def forward(self, x: Tensor) -> Tensor:
+        return self.layers(x)
+
+
+class NVILAForConditionalGeneration(nn.Module):
+    def __init__(
+        self,
+        config: NVILAConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+
+        self.vision_tower = SiglipVisionModel(config.vision_config)
+        self.mm_projector = NVILAMultiModalProjector(config)
+        self.llm = Qwen2ForCausalLM(
+            config=config.text_config,
+            quant_config=quant_config,
+            prefix=utils.add_prefix("llm", prefix),
+        )
+
+    def forward(
+        self,
+        input_ids: Tensor,
+        positions: Tensor,
+        forward_batch: ForwardBatch,
+        get_embedding: bool = False,
+    ) -> LogitsProcessorOutput:
+        output = mm_utils.general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.llm,
+            data_embedding_funcs={
+                Modality.IMAGE: self.get_image_feature,
+                Modality.VIDEO: self.get_image_feature,
+            },
+            get_embedding=get_embedding,
+            positions=positions,
+        )
+
+        assert isinstance(output, LogitsProcessorOutput)
+
+        return output
+
+    def get_image_feature(self, mm_input: list[MultimodalDataItem]) -> Tensor:
+        block_sizes = (
+            list(
+                itertools.chain.from_iterable(
+                    x.block_sizes for x in mm_input if hasattr(x, "block_sizes")
+                )
+            )
+            or None
+        )
+        pixel_values = torch.cat([torch.tensor(x.feature) for x in mm_input], dim=0)
+
+        vision_tower_output: BaseModelOutputWithPooling = self.vision_tower(
+            pixel_values.to(
+                device=self.vision_tower.device, dtype=self.vision_tower.dtype
+            ),
+            output_hidden_states=True,
+        )
+        assert vision_tower_output.hidden_states is not None
+
+        vision_features: Tensor = vision_tower_output.hidden_states[-2]
+
+        vision_features_list, block_sizes = merge_features_for_dynamic_s2(
+            vision_features,
+            block_sizes=(
+                block_sizes
+                if block_sizes is not None
+                else [None] * vision_features.shape[0]
+            ),
+            resize_output_to_scale_idx=-1,
+            scales=[448, 896, 1344],
+        )
+
+        vision_features_list = [
+            split_chessboard(x, block_size[0], block_size[1])
+            for x, block_size in zip(vision_features_list, block_sizes)
+        ]
+
+        vision_features = torch.cat(
+            [einops.rearrange(x, "b c h w -> b (h w) c") for x in vision_features_list]
+        )
+
+        vision_features = self.mm_projector(vision_features)
+
+        vision_features_list = list(
+            vision_features.split(
+                [block_size[0] * block_size[1] for block_size in block_sizes], dim=0
+            )
+        )
+        vision_features_list = [
+            merge_chessboard(x, block_size[0], block_size[1])
+            for x, block_size in zip(vision_features_list, block_sizes)
+        ]
+
+        vision_features = torch.stack(
+            [einops.rearrange(x, "1 c h w -> (h w) c") for x in vision_features_list]
+        )
+
+        vision_features = einops.rearrange(vision_features, "n p d -> (n p) d")
+
+        return vision_features
+
+    def load_weights(self, weights: Iterable[tuple[str, Tensor]]) -> None:
+        params_dict = dict(self.named_parameters())
+
+        for name, loaded_weight in weights:
+            if name.startswith("llm."):
+                self.llm.load_weights([(name[len("llm.") :], loaded_weight)])
+            else:
+                param = params_dict[name]
+                weight_loader = getattr(
+                    param, "weight_loader", weight_utils.default_weight_loader
+                )
+                weight_loader(param, loaded_weight)
+
+    def pad_input_ids(
+        self, input_ids: list[int], mm_inputs: MultimodalInputs
+    ) -> list[int]:
+        pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+        return pattern.pad_input_tokens(input_ids, mm_inputs)
+
+
+def merge_chessboard(x, num_split_h, num_split_w):
+    """
+    x: b * n * c or b * h * w * c
+    out: b * c * h * w
+    Assuming x contains num_split**2 sub-squares concatenated along batch dimension, merge the sub-squares back to the original whole square.
+    """
+    B = x.shape[0]
+    if x.dim() == 3:
+        N = x.shape[1]
+        x = einops.rearrange(
+            x, "b (h w) c -> b c h w", h=math.isqrt(N), w=math.isqrt(N)
+        )
+
+    assert B % (num_split_h * num_split_w) == 0
+    b = B // (num_split_h * num_split_w)
+
+    x_merge = torch.cat(
+        [
+            torch.cat(
+                [
+                    x[(i * num_split_w + j) * b : (i * num_split_w + j + 1) * b]
+                    for j in range(num_split_w)
+                ],
+                dim=-1,
+            )
+            for i in range(num_split_h)
+        ],
+        dim=-2,
+    )
+
+    return x_merge
+
+
+def merge_features_for_dynamic_s2(
+    image_features, block_sizes, *, scales, resize_output_to_scale_idx
+):
+    image_features_each_image = []
+    new_block_sizes = []
+    block_cnt = 0
+    for block_size_each_image in block_sizes:
+        if block_size_each_image is None:
+            cur_features = image_features[block_cnt : block_cnt + 1]
+            cur_features = einops.rearrange(
+                cur_features,
+                "1 (h w) c -> 1 c h w",
+                h=math.isqrt(cur_features.shape[1]),
+            )
+            cur_features = cur_features.repeat(1, len(scales), 1, 1)
+            image_features_each_image.append(cur_features)
+            new_block_sizes.append((1, 1))
+            block_cnt += 1
+        else:
+            cur_features_each_scale = []
+            for scale in scales[:-1]:
+                num_blocks_this_scale = (scale // scales[0]) ** 2
+                cur_features_each_scale.append(
+                    merge_chessboard(
+                        image_features[block_cnt : block_cnt + num_blocks_this_scale],
+                        num_split_h=scale // scales[0],
+                        num_split_w=scale // scales[0],
+                    )
+                )  # 1 * C * H * W
+                block_cnt += num_blocks_this_scale
+            num_blocks_last_scale = block_size_each_image[0] * block_size_each_image[1]
+            cur_features_each_scale.append(
+                merge_chessboard(
+                    image_features[block_cnt : block_cnt + num_blocks_last_scale],
+                    num_split_h=block_size_each_image[0],
+                    num_split_w=block_size_each_image[1],
+                )
+            )  # 1 * C * H * W
+            block_cnt += num_blocks_last_scale
+
+            # resize and concat features from different scales
+            output_size = cur_features_each_scale[resize_output_to_scale_idx].shape[-2:]
+            cur_features = torch.cat(
+                [
+                    F.interpolate(
+                        cur_features_each_scale[i].to(torch.float32),
+                        size=output_size,
+                        mode="area",
+                    ).to(cur_features_each_scale[i].dtype)
+                    for i in range(len(cur_features_each_scale))
+                ],
+                dim=1,
+            )
+
+            image_features_each_image.append(cur_features)
+
+            if (
+                resize_output_to_scale_idx == len(scales) - 1
+                or resize_output_to_scale_idx == -1
+            ):
+                new_block_sizes.append(block_size_each_image)
+            else:
+                new_block_sizes.append(
+                    (
+                        scales[resize_output_to_scale_idx] // scales[0],
+                        scales[resize_output_to_scale_idx] // scales[0],
+                    )
+                )
+
+    assert block_cnt == len(
+        image_features
+    ), f"The number of blocks ({block_cnt}) does not match length of image_features ({len(image_features)})!"
+
+    return image_features_each_image, new_block_sizes
+
+
+def split_chessboard(x, num_split_h, num_split_w):
+    """
+    x: b * c * h * w
+    out: b * c * h * w
+    Deividing x into num_split**2 sub-squares, and concatenate all the sub-squares on the batch dimension
+    """
+    B, C, H, W = x.shape
+    assert H % num_split_h == 0 and W % num_split_w == 0
+    h, w = H // num_split_h, W // num_split_w
+    x_split = torch.cat(
+        [
+            x[:, :, i * h : (i + 1) * h, j * w : (j + 1) * w]
+            for i in range(num_split_h)
+            for j in range(num_split_w)
+        ],
+        dim=0,
+    )
+    return x_split
+
+
+EntryClass = [NVILAForConditionalGeneration]
diff --git a/sglang/python/sglang/srt/models/nvila_lite.py b/sglang/python/sglang/srt/models/nvila_lite.py
new file mode 100644
index 0000000000000000000000000000000000000000..1561250849a33ab516f94586909d30493f75c0a2
--- /dev/null
+++ b/sglang/python/sglang/srt/models/nvila_lite.py
@@ -0,0 +1,184 @@
+import math
+from collections.abc import Iterable
+from typing import Any
+
+import einops
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_outputs import BaseModelOutputWithPooling
+from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
+from transformers.models.siglip import SiglipVisionConfig, SiglipVisionModel
+
+import sglang.srt.managers.mm_utils as mm_utils
+import sglang.srt.model_loader.weight_utils as weight_utils
+import sglang.srt.utils as utils
+from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.mm_utils import MultiModalityDataPaddingPatternMultimodalTokens
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.qwen2 import Qwen2ForCausalLM
+
+MM_HIDDEN_SIZE = 1152
+
+
+class NVILALiteConfig(PretrainedConfig):
+    model_type = "nvila_lite"
+    sub_configs = {
+        "text_config": Qwen2Config,
+        "vision_config": SiglipVisionConfig,
+    }
+    _auto_class = "AutoConfig"
+
+    def __init__(
+        self,
+        *,
+        text_config: dict[str, Any] | None = None,
+        vision_config: dict[str, Any] | None = None,
+        image_token_id: int | None = None,
+        video_token_id: int | None = None,
+        **kwargs,
+    ):
+        self.text_config = (
+            Qwen2Config(**text_config) if text_config is not None else Qwen2Config()
+        )
+        self.vision_config = (
+            SiglipVisionConfig(**vision_config)
+            if vision_config is not None
+            else SiglipVisionConfig()
+        )
+
+        self.image_token_id = image_token_id if image_token_id is not None else -1
+        self.video_token_id = video_token_id if video_token_id is not None else -1
+
+        super().__init__(**kwargs)
+
+
+class NVILALiteMultiModalProjectorDownsampleBlock(nn.Module):
+    def forward(self, x: Tensor) -> Tensor:
+        batch_size, sequence_length, hidden_size = x.shape
+
+        feat_size = math.isqrt(sequence_length)
+
+        features = x.reshape(batch_size, feat_size, feat_size, hidden_size)
+
+        pad_after = (3 - feat_size % 3) % 3
+        if pad_after > 0:
+            features = F.pad(features, (0, 0, 0, pad_after, 0, pad_after))
+            feat_size = feat_size + pad_after
+
+        features = features.reshape(
+            batch_size, feat_size // 3, 3, feat_size // 3, 3, hidden_size
+        )
+        features = features.permute(0, 1, 3, 2, 4, 5).contiguous()
+        features = features.reshape(batch_size, -1, 9 * hidden_size)
+
+        return features
+
+
+class NVILALiteMultiModalProjector(nn.Module):
+    def __init__(self, config: NVILALiteConfig):
+        super().__init__()
+
+        self.layers = nn.Sequential(
+            NVILALiteMultiModalProjectorDownsampleBlock(),
+            nn.LayerNorm(MM_HIDDEN_SIZE * 9),
+            nn.Linear(MM_HIDDEN_SIZE * 9, MM_HIDDEN_SIZE * 3),
+            nn.GELU(),
+            nn.LayerNorm(MM_HIDDEN_SIZE * 3),
+            nn.Linear(MM_HIDDEN_SIZE * 3, config.text_config.hidden_size),
+            nn.GELU(),
+            nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size),
+        )
+
+    def forward(self, x: Tensor) -> Tensor:
+        return self.layers(x)
+
+
+class NVILALiteForConditionalGeneration(nn.Module):
+    def __init__(
+        self,
+        config: NVILALiteConfig,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+
+        self.vision_tower = SiglipVisionModel(config.vision_config)
+        self.mm_projector = NVILALiteMultiModalProjector(config)
+        self.llm = Qwen2ForCausalLM(
+            config=config.text_config,
+            quant_config=quant_config,
+            prefix=utils.add_prefix("llm", prefix),
+        )
+
+    def forward(
+        self,
+        input_ids: Tensor,
+        positions: Tensor,
+        forward_batch: ForwardBatch,
+        get_embedding: bool = False,
+    ) -> LogitsProcessorOutput:
+        output = mm_utils.general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.llm,
+            data_embedding_funcs={
+                Modality.IMAGE: self.get_image_feature,
+                Modality.VIDEO: self.get_image_feature,
+            },
+            get_embedding=get_embedding,
+            positions=positions,
+        )
+
+        assert isinstance(output, LogitsProcessorOutput)
+
+        return output
+
+    def get_image_feature(self, mm_input: list[MultimodalDataItem]) -> Tensor:
+        pixel_values = torch.cat([torch.tensor(x.feature) for x in mm_input], dim=0)
+
+        vision_tower_output: BaseModelOutputWithPooling = self.vision_tower(
+            pixel_values,
+            output_hidden_states=True,
+        )
+        assert vision_tower_output.hidden_states is not None
+
+        vision_features = vision_tower_output.hidden_states[-2]
+
+        vision_features = self.mm_projector(vision_features)
+
+        vision_features = einops.rearrange(vision_features, "n p d -> (n p) d")
+
+        return vision_features
+
+    def load_weights(self, weights: Iterable[tuple[str, Tensor]]) -> None:
+        params_dict = dict(self.named_parameters())
+
+        for name, loaded_weight in weights:
+            if name.startswith("llm."):
+                self.llm.load_weights([(name[len("llm.") :], loaded_weight)])
+            else:
+                param = params_dict[name]
+                weight_loader = getattr(
+                    param, "weight_loader", weight_utils.default_weight_loader
+                )
+                weight_loader(param, loaded_weight)
+
+    def pad_input_ids(
+        self, input_ids: list[int], mm_inputs: MultimodalInputs
+    ) -> list[int]:
+        pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+        return pattern.pad_input_tokens(input_ids, mm_inputs)
+
+
+EntryClass = [NVILALiteForConditionalGeneration]
diff --git a/sglang/python/sglang/srt/models/olmo.py b/sglang/python/sglang/srt/models/olmo.py
new file mode 100644
index 0000000000000000000000000000000000000000..091b08e8ac1cfead37bfdc9af5d56594225cc536
--- /dev/null
+++ b/sglang/python/sglang/srt/models/olmo.py
@@ -0,0 +1,379 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/olmo.py#L1
+"""Inference-only OLMo model compatible with HuggingFace weights."""
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import OlmoConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix, make_layers
+
+
+class OlmoAttention(nn.Module):
+    """
+    This is the attention block where the output is computed as
+    ``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))``
+    (plus another skip connection).
+    """
+
+    def __init__(
+        self,
+        config: OlmoConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = config.num_attention_heads
+
+        assert self.hidden_size % self.total_num_heads == 0
+        assert self.total_num_heads % tensor_model_parallel_world_size == 0
+
+        self.num_heads = self.total_num_heads // tensor_model_parallel_world_size
+        self.head_dim = self.hidden_size // self.total_num_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+        self.clip_qkv = config.clip_qkv
+
+        # Attention input projection. Projects x -> (q, k, v)
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            bias=config.attention_bias,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+
+        # Rotary embeddings.
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=self.max_position_embeddings,
+            base=self.rope_theta,
+        )
+        self.scaling = self.head_dim**-0.5
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+        # Attention output projection.
+        self.o_proj = RowParallelLinear(
+            self.hidden_size,
+            self.hidden_size,
+            bias=config.attention_bias,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        if self.clip_qkv is not None:
+            qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
+        q, k, v = qkv.chunk(chunks=3, dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class OlmoMLP(nn.Module):
+    """
+    This is the MLP block where the output is computed as
+    ``MLP(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))``
+    (plus another skip connection).
+    """
+
+    def __init__(
+        self,
+        config: OlmoConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+
+        # Feed-forward input projection.
+        self.gate_up_proj = MergedColumnParallelLinear(
+            self.hidden_size,
+            [self.intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+
+        # Activation function.
+        self.act_fn = SiluAndMul()
+
+        # Feed-forward output projection.
+        self.down_proj = RowParallelLinear(
+            self.intermediate_size,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+    ) -> torch.Tensor:
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class OlmoDecoderLayer(nn.Module):
+    """
+    This is a typical transformer block where the output is
+    computed as ``MLP(LN(x + Attention(LN(x))))``
+    (plus another skip connection).
+    """
+
+    def __init__(
+        self,
+        config: OlmoConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        # Attention block.
+        self.self_attn = OlmoAttention(
+            config,
+            layer_id,
+            quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+
+        # MLP block.
+        self.mlp = OlmoMLP(
+            config,
+            quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+
+        # LayerNorm
+        self.input_layernorm = nn.LayerNorm(
+            config.hidden_size, elementwise_affine=False, bias=False
+        )
+        self.post_attention_layernorm = nn.LayerNorm(
+            config.hidden_size, elementwise_affine=False, bias=False
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
+        # Attention block.
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states = self.self_attn(positions, hidden_states, forward_batch)
+        hidden_states = hidden_states + residual
+
+        # MLP block.
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+class OlmoModel(nn.Module):
+
+    def __init__(
+        self,
+        config: OlmoConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+        self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: OlmoDecoderLayer(
+                layer_id=idx,
+                config=config,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
+            prefix=add_prefix("layers", prefix),
+        )
+        self.norm = nn.LayerNorm(
+            config.hidden_size, elementwise_affine=False, bias=False
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        """
+        :param input_ids: A tensor of shape `(batch_size, seq_len)`.
+        """
+        # Get embeddings of input.
+        # shape: (batch_size, seq_len, d_model)
+
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+
+        # Apply blocks one-by-one.
+        for layer_id, decoder_layer in enumerate(self.layers):
+            # shape: (batch_size, seq_len, d_model)
+            hidden_states = decoder_layer(
+                positions,
+                hidden_states,
+                forward_batch,
+            )
+
+        # Apply final layer norm.
+        # shape: (batch_size, seq_len or 1, d_model)
+        hidden_states = self.norm(hidden_states)
+        return hidden_states
+
+
+class OlmoForCausalLM(nn.Module):
+    """
+    Extremely barebones HF model wrapper.
+    """
+
+    def __init__(
+        self,
+        config: OlmoConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.model = OlmoModel(config, quant_config, prefix=add_prefix("model", prefix))
+        if config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.unpadded_vocab_size = config.vocab_size
+            self.lm_head = ParallelLMHead(
+                self.unpadded_vocab_size,
+                config.hidden_size,
+                org_num_embeddings=config.vocab_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+            )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids=input_ids,
+            positions=positions,
+            forward_batch=forward_batch,
+            input_embeds=input_embeds,
+        )
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = OlmoForCausalLM
diff --git a/sglang/python/sglang/srt/models/olmo2.py b/sglang/python/sglang/srt/models/olmo2.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b1c6fa89fa0301e3fbcede89744cb249093ff2a
--- /dev/null
+++ b/sglang/python/sglang/srt/models/olmo2.py
@@ -0,0 +1,484 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/olmo2.py
+"""Inference-only OLMo2 model compatible with HuggingFace weights."""
+
+from functools import partial
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    split_tensor_along_last_dim,
+    tensor_model_parallel_all_gather,
+)
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix, is_cuda, make_layers
+
+_is_cuda = is_cuda()
+
+
+# Aligned with HF's implementation, using sliding window inclusive with the last token
+# SGLang assumes exclusive
+def get_attention_sliding_window_size(config):
+    return config.sliding_window - 1 if hasattr(config, "sliding_window") else None
+
+
+class Olmo2Attention(nn.Module):
+    """
+    This is the attention block where the output is computed as
+    ``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))``
+    (plus another skip connection).
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = config.num_attention_heads
+
+        assert self.hidden_size % self.total_num_heads == 0
+        assert self.total_num_heads % self.tp_size == 0
+
+        self.num_heads = self.total_num_heads // self.tp_size
+        self.total_num_kv_heads = self.config.num_key_value_heads
+
+        if self.total_num_kv_heads >= self.tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % self.tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // self.tp_size)
+
+        self.head_dim = self.hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+
+        # Attention input projection. Projects x -> (q, k, v)
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            total_num_kv_heads=self.total_num_kv_heads,
+            bias=config.attention_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.tp_rank = get_tensor_model_parallel_rank()
+        self.alt_stream = alt_stream
+
+        self.k_norm = RMSNorm(
+            self.total_num_kv_heads * self.head_dim,
+            eps=self.config.rms_norm_eps,
+        )
+        self.q_norm = RMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps)
+
+        sliding_window = None
+        if (
+            layer_types := getattr(self.config, "layer_types", None)
+        ) is not None and layer_types[layer_id] == "sliding_attention":
+            sliding_window = get_attention_sliding_window_size(self.config)
+
+        # Rotary embeddings. Rope scaling is only applied on full attention
+        # layers.
+        self.rope_scaling = (
+            self.config.rope_scaling
+            if sliding_window is None
+            else {"rope_type": "default"}
+        )
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=self.max_position_embeddings,
+            base=self.rope_theta,
+            rope_scaling=self.rope_scaling,
+        )
+        self.scaling = self.head_dim**-0.5
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            sliding_window_size=sliding_window,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+        # Attention output projection.
+        self.o_proj = RowParallelLinear(
+            self.head_dim * self.total_num_heads,
+            self.hidden_size,
+            bias=config.attention_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+    def _apply_qk_norm(
+        self, q: torch.Tensor, k: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if self.tp_size > 1:
+            q = tensor_model_parallel_all_gather(q.contiguous())
+            k = tensor_model_parallel_all_gather(k.contiguous())
+
+        if self.alt_stream is not None and get_is_capture_mode():
+            current_stream = torch.cuda.current_stream()
+            self.alt_stream.wait_stream(current_stream)
+
+            q_shape = q.shape
+            k_shape = k.shape
+
+            q_by_last = q.reshape(-1, q_shape[-1])
+            q_by_last = self.q_norm(q_by_last)
+
+            with torch.cuda.stream(self.alt_stream):
+                k_by_last = k.reshape(-1, k_shape[-1])
+                k_by_last = self.k_norm(k_by_last)
+
+            current_stream.wait_stream(self.alt_stream)
+
+            q = q_by_last.view(q_shape)
+            k = k_by_last.view(k_shape)
+        else:
+            q = self.q_norm.forward_native(q)
+            k = self.k_norm.forward_native(k)
+
+        if self.tp_size > 1:
+            splitter = partial(split_tensor_along_last_dim, num_partitions=self.tp_size)
+            q = splitter(q)[self.tp_rank]
+            k = splitter(k)[self.tp_rank]
+        return q, k
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self._apply_qk_norm(q, k)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class Olmo2MLP(nn.Module):
+    """
+    This is the MLP block where the output is computed as
+    ``MLP(x)`` in ``LN(MLP(x + LN(Attention(x))))``
+    (plus another skip connection).
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+
+        # Feed-forward input projection.
+        self.gate_up_proj = MergedColumnParallelLinear(
+            self.hidden_size,
+            [self.intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+
+        # Activation function.
+        self.act_fn = SiluAndMul()
+
+        # Feed-forward output projection.
+        self.down_proj = RowParallelLinear(
+            self.intermediate_size,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+    ) -> torch.Tensor:
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class Olmo2DecoderLayer(nn.Module):
+    """
+    This is a typical transformer block where the output is
+    computed as ``MLP(LN(x + Attention(LN(x))))``
+    (plus another skip connection).
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ):
+        super().__init__()
+        self.layer_id = layer_id
+        self.alt_stream = alt_stream
+        # Attention block.
+        self.self_attn = Olmo2Attention(
+            config,
+            layer_id,
+            quant_config,
+            prefix=add_prefix("self_attn", prefix),
+            alt_stream=alt_stream,
+        )
+
+        # MLP block.
+        self.mlp = Olmo2MLP(config, quant_config, prefix=add_prefix("mlp", prefix))
+
+        # RMSNorm
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+        self.post_feedforward_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        # Attention block.
+        residual = hidden_states
+        hidden_states = self.self_attn(positions, hidden_states, forward_batch)
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = hidden_states + residual
+
+        # MLP block.
+        residual = hidden_states
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.post_feedforward_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+class Olmo2Model(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ):
+        super().__init__()
+        self.config = config
+        if alt_stream is None and _is_cuda:
+            alt_stream = torch.cuda.Stream()
+        self.alt_stream = alt_stream
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+        self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: Olmo2DecoderLayer(
+                config=config,
+                layer_id=idx,
+                quant_config=quant_config,
+                prefix=prefix,
+                alt_stream=self.alt_stream,
+            ),
+            prefix=add_prefix("layers", prefix),
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """
+        :param input_ids: A tensor of shape `(batch_size, seq_len)`.
+        """
+        # Get embeddings of input.
+        # shape: (batch_size, seq_len, d_model)
+
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+
+        # Apply blocks one-by-one.
+        for layer_id, decoder_layer in enumerate(self.layers):
+            # shape: (batch_size, seq_len, d_model)
+            hidden_states = decoder_layer(
+                positions,
+                hidden_states,
+                forward_batch,
+            )
+
+        # Apply final layer norm.
+        # shape: (batch_size, seq_len or 1, d_model)
+        hidden_states = self.norm(hidden_states)
+        return hidden_states
+
+
+class Olmo2ForCausalLM(nn.Module):
+    """
+    Extremely barebones HF model wrapper.
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ):
+        super().__init__()
+        self.config = config
+        self.model = Olmo2Model(
+            config,
+            quant_config,
+            prefix=add_prefix("model", prefix),
+            alt_stream=alt_stream,
+        )
+        if config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.unpadded_vocab_size = config.vocab_size
+            self.lm_head = ParallelLMHead(
+                self.unpadded_vocab_size,
+                config.hidden_size,
+                org_num_embeddings=config.vocab_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+            )
+        self.logits_processor = LogitsProcessor(config)
+
+    def get_attention_sliding_window_size(self):
+        return get_attention_sliding_window_size(self.config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids=input_ids,
+            positions=positions,
+            forward_batch=forward_batch,
+            input_embeds=input_embeds,
+        )
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            # With tie_word_embeddings, we can skip lm_head.weight
+            # The weight might appear unnecessarily in the files if the model is
+            # processed with quantization, LoRA, fine-tuning, etc.
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = Olmo2ForCausalLM
diff --git a/sglang/python/sglang/srt/models/olmoe.py b/sglang/python/sglang/srt/models/olmoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..a74a2968daefc774867565bb2e5682f262cb6c6c
--- /dev/null
+++ b/sglang/python/sglang/srt/models/olmoe.py
@@ -0,0 +1,437 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from:
+# https://github.com/vllm-project/vllm/pull/7922
+
+"""Inference-only OLMoE model compatible with HuggingFace weights."""
+
+from typing import Any, Dict, Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix, make_layers, print_warning_once
+
+
+class OlmoeMoE(nn.Module):
+    """A tensor-parallel MoE implementation for Olmoe that shards each expert
+    across all ranks.
+
+    Each expert's weights are sharded across all ranks and a fused MoE
+    kernel is used for the forward pass, and finally we reduce the outputs
+    across ranks.
+    """
+
+    def __init__(
+        self,
+        num_experts: int,
+        top_k: int,
+        hidden_size: int,
+        intermediate_size: int,
+        params_dtype: Optional[torch.dtype] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        tp_size: Optional[int] = None,
+        layer_id: int = 0,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = hidden_size
+
+        # Gate always runs at half / full precision for now.
+        self.gate = ReplicatedLinear(
+            hidden_size,
+            num_experts,
+            bias=False,
+            quant_config=None,
+            prefix=add_prefix("gate", prefix),
+        )
+
+        self.topk = TopK(
+            top_k=top_k,
+            renormalize=False,
+        )
+
+        self.experts = FusedMoE(
+            num_experts=num_experts,
+            hidden_size=hidden_size,
+            intermediate_size=intermediate_size,
+            reduce_results=True,
+            quant_config=quant_config,
+            layer_id=layer_id,
+            prefix=add_prefix("experts", prefix),
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # NOTE: hidden_states can have either 1D or 2D shape.
+        orig_shape = hidden_states.shape
+        hidden_states = hidden_states.view(-1, self.hidden_size)
+        # router_logits: (num_tokens, n_experts)
+        router_logits, _ = self.gate(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        final_hidden_states = self.experts(hidden_states, topk_output)
+        return final_hidden_states.view(orig_shape)
+
+
+class OlmoeAttention(nn.Module):
+
+    def __init__(
+        self,
+        layer_id: int,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 4096,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.q_norm = RMSNorm(hidden_size, eps=1e-5)
+        self.k_norm = RMSNorm(hidden_size, eps=1e-5)
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            is_neox_style=True,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            layer_id=layer_id,
+            num_kv_heads=self.num_kv_heads,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.q_norm(q.contiguous()), self.k_norm(k.contiguous())
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class OlmoeDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 4096)
+
+        self.self_attn = OlmoeAttention(
+            layer_id,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+
+        self.mlp = OlmoeMoE(
+            num_experts=config.num_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=1e-5)
+        self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=1e-5)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+class OlmoeModel(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+        self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: OlmoeDecoderLayer(
+                config=config,
+                quant_config=quant_config,
+                layer_id=idx,
+                prefix=prefix,
+            ),
+            prefix=add_prefix("layers", prefix),
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=1e-5)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions, hidden_states, forward_batch, residual
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class OlmoeForCausalLM(nn.Module):
+
+    fall_back_to_pt_during_load = False
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = OlmoeModel(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("lm_head", prefix),
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.num_experts,
+        )
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                # Skip non-stacked layers and experts (experts handled below).
+                if weight_name not in name:
+                    continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if "mlp.experts" in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    # Remapping the name of FP8 kv-scale.
+                    if name.endswith("kv_scale"):
+                        remapped_kv_scale_name = name.replace(
+                            ".kv_scale", ".attn.kv_scale"
+                        )
+                        if remapped_kv_scale_name not in params_dict:
+                            print_warning_once(
+                                "Found kv scale in the checkpoint "
+                                f"(e.g. {name}), but not found the expected "
+                                f"name in the model "
+                                f"(e.g. {remapped_kv_scale_name}). "
+                                "kv-scale is not loaded."
+                            )
+                            continue
+                        else:
+                            name = remapped_kv_scale_name
+
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+
+
+EntryClass = OlmoeForCausalLM
diff --git a/sglang/python/sglang/srt/models/opt.py b/sglang/python/sglang/srt/models/opt.py
new file mode 100644
index 0000000000000000000000000000000000000000..7db9250af96e9fe5801ea59d2d71960e45e73869
--- /dev/null
+++ b/sglang/python/sglang/srt/models/opt.py
@@ -0,0 +1,638 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Inference-only OPT model compatible with HuggingFace weights."""
+
+import logging
+from collections.abc import Iterable
+from typing import Optional, Union
+
+import torch
+from torch import nn
+from transformers import OPTConfig
+
+from sglang.srt.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.utils import get_layer_id
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    kv_cache_scales_loader,
+)
+from sglang.srt.utils import add_prefix, make_layers
+from sglang.utils import get_exception_traceback
+
+logger = logging.getLogger(__name__)
+
+
+def get_activation(name="relu"):
+    """Select an activation function by name
+
+    Args:
+        name: str
+            activation function name,
+            one of ["relu", "gelu", "swish", "sigmoid"],
+            default "relu".
+    """
+    name = name.lower()
+    if name == "relu":
+        return nn.ReLU()
+    if name == "gelu":
+        return nn.GELU()
+    if name == "sigmoid":
+        return torch.nn.Sigmoid()
+    return nn.Identity()
+
+
+class OPTLearnedPositionalEmbedding(nn.Embedding):
+
+    def __init__(self, num_embeddings: int, embedding_dim: int):
+        # OPT is set up so that if padding_idx is specified then offset the
+        # embedding ids by 2 and adjust num_embeddings appropriately. Other
+        # models don't have this hack
+        self.offset = 2
+        super().__init__(num_embeddings + self.offset, embedding_dim)
+
+    def forward(self, positions: torch.Tensor):
+        return super().forward(positions + self.offset)
+
+
+class OPTAttention(nn.Module):
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        layer_id: int = 0,
+        bias: bool = True,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.embed_dim = embed_dim
+        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
+        total_num_heads = num_heads
+        assert num_heads % tensor_model_parallel_world_size == 0
+        self.num_heads = total_num_heads // tensor_model_parallel_world_size
+        self.head_dim = embed_dim // total_num_heads
+        self.scaling = self.head_dim**-0.5
+
+        self.qkv_proj = QKVParallelLinear(
+            embed_dim,
+            self.head_dim,
+            total_num_heads,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.out_proj = RowParallelLinear(
+            embed_dim,
+            embed_dim,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.chunk(chunks=3, dim=-1)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.out_proj(attn_output)
+        return output
+
+
+class OPTDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: OPTConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.self_attn = OPTAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.num_attention_heads,
+            layer_id=layer_id,
+            bias=config.enable_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.do_layer_norm_before = config.do_layer_norm_before
+
+        self.self_attn_layer_norm = nn.LayerNorm(
+            self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine
+        )
+        self.fc1 = ColumnParallelLinear(
+            self.embed_dim,
+            config.ffn_dim,
+            bias=config.enable_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("fc1", prefix),
+        )
+        self.activation_fn = get_activation(config.activation_function)
+        self.fc2 = RowParallelLinear(
+            config.ffn_dim,
+            self.embed_dim,
+            bias=config.enable_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("fc2", prefix),
+        )
+        self.final_layer_norm = nn.LayerNorm(
+            self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        # Self Attention
+        residual = hidden_states
+        # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention
+        if self.do_layer_norm_before:
+            hidden_states = self.self_attn_layer_norm(hidden_states)
+        hidden_states = self.self_attn(
+            hidden_states=hidden_states, forward_batch=forward_batch
+        )
+        hidden_states = residual + hidden_states
+        # 350m applies layer norm AFTER attention
+        if not self.do_layer_norm_before:
+            hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        # Fully Connected
+        residual = hidden_states
+        # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention
+        if self.do_layer_norm_before:
+            hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states, _ = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states, _ = self.fc2(hidden_states)
+        hidden_states = residual + hidden_states
+        # 350m applies layer norm AFTER attention
+        if not self.do_layer_norm_before:
+            hidden_states = self.final_layer_norm(hidden_states)
+        return hidden_states
+
+
+class OPTDecoder(nn.Module):
+
+    def __init__(
+        self,
+        config: OPTConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.max_target_positions = config.max_position_embeddings
+        self.vocab_size = config.vocab_size
+
+        self.pp_group = get_pp_group()
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.word_embed_proj_dim,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+        # Positional embeddings are replicated (not sharded).
+        self.embed_positions = OPTLearnedPositionalEmbedding(
+            config.max_position_embeddings, config.hidden_size
+        )
+
+        # Project out & in will be replicated if they exist.
+        if config.word_embed_proj_dim != config.hidden_size:
+            self.project_out = ReplicatedLinear(
+                config.hidden_size,
+                config.word_embed_proj_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=add_prefix("project_out", prefix),
+            )
+        else:
+            self.project_out = None
+
+        if config.word_embed_proj_dim != config.hidden_size:
+            self.project_in = ReplicatedLinear(
+                config.word_embed_proj_dim,
+                config.hidden_size,
+                bias=False,
+                quant_config=quant_config,
+                prefix=add_prefix("project_in", prefix),
+            )
+        else:
+            self.project_in = None
+
+        # Note that the only purpose of `config._remove_final_layer_norm` is to
+        # keep backward compatibility with checkpoints that have been fine-tuned
+        # before transformers v4.20.1
+        # see https://github.com/facebookresearch/metaseq/pull/164
+        if config.do_layer_norm_before and not config._remove_final_layer_norm:
+            self.final_layer_norm = nn.LayerNorm(
+                config.hidden_size,
+                elementwise_affine=config.layer_norm_elementwise_affine,
+            )
+        else:
+            self.final_layer_norm = None
+
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: OPTDecoderLayer(
+                config=config, layer_id=idx, quant_config=quant_config, prefix=prefix
+            ),
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix="model.layers",
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+        input_embeds: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, PPProxyTensors]:
+        if self.pp_group.is_first_rank:
+            if input_embeds is None:
+                input_embeds = self.embed_tokens(input_ids)
+            pos_embeds = self.embed_positions(positions)
+            if self.project_in is not None:
+                input_embeds, _ = self.project_in(input_embeds)
+            hidden_states = input_embeds + pos_embeds
+        else:
+            assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+
+        for layer in self.layers[self.start_layer : self.end_layer]:
+            hidden_states = layer(
+                hidden_states=hidden_states, forward_batch=forward_batch
+            )
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors({"hidden_states": hidden_states})
+        if self.final_layer_norm is not None:
+            hidden_states = self.final_layer_norm(hidden_states)
+            # 没有经过这里
+        if self.project_out is not None:
+            hidden_states, _ = self.project_out(hidden_states)
+        return hidden_states
+
+
+class OPTModel(nn.Module):
+
+    def __init__(
+        self,
+        config: OPTConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        # config = vllm_config.model_config.hf_config
+        # quant_config = vllm_config.quant_config
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.pp_group = get_pp_group()
+
+        self.decoder = OPTDecoder(
+            config=config,
+            quant_config=quant_config,
+            prefix=add_prefix("decoder", prefix),
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        pp_proxy_tensors: Optional[PPProxyTensors],
+        input_embeds: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, PPProxyTensors]:
+        return self.decoder(
+            input_ids,
+            positions,
+            pp_proxy_tensors=pp_proxy_tensors,
+            input_embeds=input_embeds,
+            forward_batch=forward_batch,
+        )
+
+    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+        tp_size = get_tensor_model_parallel_world_size()
+        tp_rank = get_tensor_model_parallel_rank()
+        for layer_idx, scaling_factor in kv_cache_scales_loader(
+            quantization_param_path,
+            tp_rank,
+            tp_size,
+            self.config.num_hidden_layers,
+            self.config.__class__.model_type,
+        ):
+            if not isinstance(self.decoder.layers[layer_idx], nn.Identity):
+                layer_self_attn = self.decoder.layers[layer_idx].self_attn
+
+            if hasattr(layer_self_attn.attn, "k_scale"):
+                layer_self_attn.attn.k_scale = scaling_factor
+                layer_self_attn.attn.v_scale = scaling_factor
+            else:
+                raise RuntimeError(
+                    "Self attention has no KV cache scaling " "factor attribute!"
+                )
+
+
+class OPTForCausalLM(nn.Module):
+    # BitandBytes specific attributes
+    # in TP, these weights are partitioned along the column dimension (dim=-1)
+    column_parallel_weights_modules = [".down_proj.", ".o_proj."]
+
+    def __init__(
+        self,
+        config: OPTConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+
+        self.model = OPTModel(
+            config=config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        if self.config.tie_word_embeddings:
+            self.lm_head = self.model.decoder.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.word_embed_proj_dim,
+                prefix=add_prefix("lm_head", prefix),
+            )
+        self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+        self.capture_aux_hidden_states = False
+        self.pp_group = get_pp_group()
+        self.stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+        ]
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+        input_embeds: Optional[torch.Tensor] = None,
+        get_embedding: bool = False,
+    ) -> LogitsProcessorOutput:
+        hidden_states = self.model(
+            input_ids=input_ids,
+            positions=positions,
+            forward_batch=forward_batch,
+            input_embeds=input_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+        aux_hidden_states = None
+        if self.capture_aux_hidden_states:
+            hidden_states, aux_hidden_states = hidden_states
+
+        if self.pp_group.is_last_rank:
+            if not get_embedding:
+                return self.logits_processor(
+                    input_ids,
+                    hidden_states,
+                    self.lm_head,
+                    forward_batch,
+                    aux_hidden_states=aux_hidden_states,
+                )
+            else:
+                return self.pooler(hidden_states, forward_batch)
+        else:
+            return hidden_states
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> None:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+
+        for name, loaded_weight in weights:
+            if name.startswith("decoder"):
+                name = name.replace("decoder.", "model.decoder.")
+            layer_id = get_layer_id(name)
+            if (
+                layer_id is not None
+                and hasattr(self.model, "start_layer")
+                and (
+                    layer_id < self.model.start_layer
+                    or layer_id >= self.model.end_layer
+                )
+            ):
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # if is_pp_missing_parameter(name, self):
+                #     continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # if is_pp_missing_parameter(name, self):
+                #     continue
+                if name not in params_dict:
+                    continue
+                if name in params_dict.keys():
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+                else:
+                    logger.warning(f"Parameter {name} not found in params_dict")
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.embed_tokens
+
+    def get_module_name_from_weight_name(self, name):
+        for param_name, weight_name, shard_id, num_shard in self.stacked_params_mapping:
+            if weight_name in name:
+                return (
+                    name.replace(weight_name, param_name)[: -len(".weight")],
+                    num_shard,
+                )
+        return name[: -len(".weight")], 1
+
+    def get_num_params(self):
+        params_dict = dict(self.named_parameters())
+        return len(params_dict)
+
+    def get_weights_by_name(
+        self, name: str, truncate_size: int = 100, tp_size: int = 1
+    ) -> Optional[torch.Tensor]:
+        """Get the weights of the parameter by its name. Similar to `get_parameter` in Hugging Face.
+
+        Only used for unit test with an unoptimized performance.
+        For optimized performance, please use torch.save and torch.load.
+        """
+        try:
+            if name == "lm_head.weight" and self.config.tie_word_embeddings:
+                logger.info(
+                    "word embedding is tied for this model, return embed_tokens.weight as lm_head.weight."
+                )
+                return (
+                    self.model.embed_tokens.weight.cpu()
+                    .to(torch.float32)
+                    .numpy()
+                    .tolist()[:truncate_size]
+                )
+
+            mapped_name = name
+            mapped_shard_id = None
+            for param_name, weight_name, shard_id in self.stacked_params_mapping:
+                if weight_name in name:
+                    mapped_name = name.replace(weight_name, param_name)
+                    mapped_shard_id = shard_id
+                    break
+            params_dict = dict(self.named_parameters())
+            param = params_dict[mapped_name]
+            if mapped_shard_id is not None:
+                if mapped_shard_id in ["q", "k", "v"]:
+                    num_heads = self.config.num_attention_heads // tp_size
+                    num_kv_heads = self.config.num_attention_heads // tp_size
+                    head_dim = (
+                        self.config.hidden_size // self.config.num_attention_heads
+                    )
+                    if mapped_shard_id == "q":
+                        offset = 0
+                        size = num_heads * head_dim
+                    elif mapped_shard_id == "k":
+                        offset = num_heads * head_dim
+                        size = num_kv_heads * head_dim
+                    elif mapped_shard_id == "v":
+                        offset = (num_heads + num_kv_heads) * head_dim
+                        size = num_kv_heads * head_dim
+                    weight = param.data.narrow(0, offset, size)
+                elif mapped_shard_id in [0, 1]:
+                    intermediate_size = self.config.ffn_dim
+                    slice_size = intermediate_size // tp_size
+                    if mapped_shard_id == 0:  # gate_proj
+                        offset = 0
+                        size = slice_size
+                    elif mapped_shard_id == 1:  # up_proj
+                        offset = slice_size
+                        size = slice_size
+
+                    weight = param.data.narrow(0, offset, size)
+                else:
+                    weight = param.data
+            else:
+                weight = param.data
+            if tp_size > 1 and ("o_proj" in name or "down_proj" in name):
+                gathered_weights = [torch.zeros_like(weight) for _ in range(tp_size)]
+                torch.distributed.all_gather(gathered_weights, weight)
+                weight = torch.cat(gathered_weights, dim=1)
+            return weight.cpu().to(torch.float32).numpy().tolist()[:truncate_size]
+
+        except Exception:
+            logger.error(
+                f"Error getting weights by name {name} in OPTForCausalLM: {get_exception_traceback()}"
+            )
+            return None
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        del self.lm_head.weight
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    def get_embed(self):
+        return self.model.embed_tokens.weight
+
+    def set_embed(self, embed):
+        # NOTE: If draft hidden size != target hidden size, the embed weight cannot be shared for EAGLE3
+        if (
+            hasattr(self.config, "target_hidden_size")
+            and self.config.target_hidden_size != self.config.hidden_size
+        ):
+            return
+        del self.model.embed_tokens.weight
+        self.model.embed_tokens.weight = embed
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+        self.model.load_kv_cache_scales(quantization_param_path)
+
+
+EntryClass = [OPTForCausalLM]
diff --git a/sglang/python/sglang/srt/models/orion.py b/sglang/python/sglang/srt/models/orion.py
new file mode 100644
index 0000000000000000000000000000000000000000..2aee4b9aaf4437ff478d822ae73cd1228e50e6f0
--- /dev/null
+++ b/sglang/python/sglang/srt/models/orion.py
@@ -0,0 +1,370 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Adapted from
+# https://huggingface.co/OrionStarAI/Orion-14B-Base/blob/main/modeling_orion.py
+# Copyright (c) OrionStar Inc.
+# LICENSE: https://huggingface.co/OrionStarAI/Orion-14B-Base/blob/main/LICENSE
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/orion.py
+"""Inference-only Orion-14B model compatible with HuggingFace weights."""
+
+from collections.abc import Iterable
+from typing import Any, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.distributed.parallel_state import get_pp_group
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.quantization import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix, make_layers
+
+
+class OrionMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class OrionAttention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        layer_id: int = 0,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch=forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class OrionDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        self.self_attn = OrionAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+            layer_id=layer_id,
+        )
+        self.mlp = OrionMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = nn.LayerNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        # Self Attention
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+class OrionModel(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.pp_group = get_pp_group()
+
+        if self.pp_group.is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size, config.hidden_size
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: OrionDecoderLayer(
+                config, layer_id=idx, quant_config=quant_config, prefix=prefix
+            ),
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix=add_prefix("layers", prefix),
+        )
+
+        if self.pp_group.is_last_rank:
+            self.norm = nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer()
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ):
+        if self.pp_group.is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds
+            else:
+                hidden_states = self.embed_tokens(input_ids)
+        else:
+            assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+
+        for i in range(self.start_layer, self.end_layer):
+            layer = self.layers[i]
+            hidden_states = layer(positions, hidden_states, forward_batch)
+
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors({"hidden_states": hidden_states})
+
+        hidden_states = self.norm(hidden_states)
+        return hidden_states
+
+
+class OrionForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.pp_group = get_pp_group()
+        self.model = OrionModel(
+            config=config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+
+        if self.pp_group.is_last_rank:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+            )
+            if self.config.tie_word_embeddings and self.pp_group.is_first_rank:
+                self.lm_head.weight = self.model.embed_tokens.weight
+            self.logits_processor = LogitsProcessor(config)
+        else:
+            self.lm_head = PPMissingLayer()
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> LogitsProcessorOutput:
+        hidden_states = self.model(
+            input_ids=input_ids,
+            positions=positions,
+            forward_batch=forward_batch,
+            inputs_embeds=inputs_embeds,
+        )
+
+        if self.pp_group.is_last_rank:
+            logits = self.logits_processor(
+                input_ids, hidden_states, self.lm_head, forward_batch
+            )
+            return logits
+        return hidden_states
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            is_packed = False
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                is_packed = True
+                break
+            if is_packed:
+                continue
+
+            # Skip loading extra bias for GPTQ models.
+            if name.endswith(".bias") and name not in params_dict:
+                continue
+            if name not in params_dict:
+                continue
+            param = params_dict[name]
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            weight_loader(param, loaded_weight)
+
+
+EntryClass = OrionForCausalLM
diff --git a/sglang/python/sglang/srt/models/paddleocr_vl.py b/sglang/python/sglang/srt/models/paddleocr_vl.py
new file mode 100644
index 0000000000000000000000000000000000000000..456fb19ab378316b4a1f9607c3db0fbf823a4088
--- /dev/null
+++ b/sglang/python/sglang/srt/models/paddleocr_vl.py
@@ -0,0 +1,729 @@
+# Reference: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleocr-genai-vllm-server:latest
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from collections.abc import Iterable
+from typing import List, Optional, Set, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+from einops import rearrange
+from transformers.activations import GELUActivation
+from transformers.utils import torch_int
+
+from sglang.srt.layers.activation import get_act_fn
+from sglang.srt.layers.attention.vision import VisionAttention
+from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternMultimodalTokens,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInputs
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.ernie4 import Ernie4_5_ForCausalLM
+from sglang.srt.utils import add_prefix, is_npu
+
+
+class Projector(nn.Module):
+
+    def __init__(
+        self,
+        text_config,
+        vision_config,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.text_config = text_config
+        self.vision_config = vision_config
+        self.merge_kernel_size = (2, 2)
+
+        self.hidden_size = (
+            self.vision_config.hidden_size
+            * self.merge_kernel_size[0]
+            * self.merge_kernel_size[1]
+        )
+
+        self.pre_norm = torch.nn.LayerNorm(self.vision_config.hidden_size, eps=1e-05)
+        self.linear_1 = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
+        self.act = GELUActivation()
+        self.linear_2 = nn.Linear(
+            self.hidden_size, self.text_config.hidden_size, bias=True
+        )
+
+    def forward(
+        self,
+        image_features: torch.Tensor,
+        image_grid_thw: List[Tuple[int, int, int]],
+    ) -> torch.Tensor:
+        m1, m2 = self.merge_kernel_size
+        if isinstance(image_features, (list, tuple)):
+            processed_features = list()
+            for image_feature, image_grid in zip(image_features, image_grid_thw):
+                image_feature = self.pre_norm(image_feature)
+                t, h, w = image_grid
+
+                image_feature = rearrange(
+                    image_feature,
+                    "(t h p1 w p2) d -> (t h w) (p1 p2 d)",
+                    t=t,
+                    h=h // m1,
+                    p1=m1,
+                    w=w // m2,
+                    p2=m2,
+                )
+                hidden_states = self.linear_1(image_feature)
+                hidden_states = self.act(hidden_states)
+                hidden_states = self.linear_2(hidden_states)
+                processed_features.append(hidden_states)
+
+            return processed_features
+
+        dims = image_features.shape[:-1]
+        dim = image_features.shape[-1]
+        image_features = image_features.view(np.prod(dims), dim)
+        hidden_states = self.pre_norm(image_features).view(-1, self.hidden_size)
+        hidden_states = self.linear_1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+
+        return hidden_states.view(*dims, -1)
+
+
+class SiglipVisionEmbeddings(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+
+        self.patch_embedding = nn.Conv2d(
+            in_channels=config.num_channels,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+            padding="valid",
+        )
+
+        self.num_patches = (self.image_size // self.patch_size) ** 2
+        self.num_positions = self.num_patches
+        self.cache_position_embedding = dict()
+        self.cache_position_count = dict()
+        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+        self.packing_position_embedding = nn.Embedding(32768, self.embed_dim)
+
+        self.register_buffer(
+            "position_ids",
+            torch.arange(self.num_positions).expand((1, -1)),
+            persistent=False,
+        )
+
+    def interpolate_pos_encoding(
+        self,
+        embeddings: torch.Tensor,
+        height: int,
+        width: int,
+        is_after_patchify: bool = False,
+    ) -> torch.Tensor:
+
+        num_positions = self.position_embedding.weight.shape[0]
+
+        patch_pos_embed = self.position_embedding.weight.unsqueeze(0)
+
+        dim = embeddings.shape[-1]
+
+        if is_after_patchify:
+            new_height = height
+            new_width = width
+        else:
+            new_height = height // self.patch_size
+            new_width = width // self.patch_size
+
+        sqrt_num_positions = torch_int(num_positions**0.5)
+        patch_pos_embed = patch_pos_embed.reshape(
+            1, sqrt_num_positions, sqrt_num_positions, dim
+        )
+        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+
+        patch_pos_embed = nn.functional.interpolate(
+            patch_pos_embed,
+            size=(new_height, new_width),
+            mode="bilinear",
+            align_corners=False,
+        )
+
+        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+        return patch_pos_embed
+
+    def fetch_position_embedding_lfu_cache(self, embeddings, h, w, max_cache: int = 20):
+        grid = (h, w)
+        if grid in self.cache_position_embedding:
+            self.cache_position_count[grid] += 1
+            return self.cache_position_embedding[grid]
+
+        if len(self.cache_position_embedding) >= max_cache:
+            min_hit_grid = min(
+                self.cache_position_count,
+                key=self.cache_position_count.get,
+            )
+            self.cache_position_count.pop(min_hit_grid)
+            self.cache_position_embedding.pop(min_hit_grid)
+
+        position_embedding = self.interpolate_pos_encoding(embeddings, h, w, True)
+        self.cache_position_count[grid] = 1
+        self.cache_position_embedding[grid] = position_embedding
+        return position_embedding
+
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        position_ids: Optional[torch.Tensor] = None,
+        image_grid_thw: Optional[
+            List[
+                Union[
+                    Tuple[int, int, int],
+                    List[Tuple[int, int, int]],
+                ]
+            ]
+        ] = None,
+        interpolate_pos_encoding=False,
+    ) -> torch.Tensor:
+        if pixel_values.dim() == 4:
+            pixel_values = pixel_values.unsqueeze(0)
+        if pixel_values.dim() == 5:
+            if position_ids is None:
+                raise ValueError(
+                    "position_ids cannot be None when pixel_values.dim() is 5."
+                )
+            (
+                batch_size,
+                squence_len,
+                channel,
+                height,
+                width,
+            ) = pixel_values.shape
+            target_dtype = self.patch_embedding.weight.dtype
+            pixel_values = rearrange(pixel_values, "b l c h w -> (b l) c h w")
+            patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))
+            embeddings = patch_embeds.flatten(-2).squeeze(-1)
+
+            if interpolate_pos_encoding and image_grid_thw is not None:
+                start = 0
+                tmp_embeddings = list()
+                for image_grid in image_grid_thw:
+                    t, h, w = image_grid
+                    end = start + t * h * w
+                    image_embeddings = embeddings[start:end, :]
+                    position_embedding = (
+                        self.interpolate_pos_encoding(image_embeddings, h, w, True)
+                        .squeeze(0)
+                        .repeat(t, 1)
+                    )
+                    image_embeddings = image_embeddings + position_embedding
+                    tmp_embeddings.append(image_embeddings)
+                    start = end
+                embeddings = torch.concat(tmp_embeddings, dim=0).unsqueeze(0)
+            else:
+                embeddings = embeddings + self.packing_position_embedding(position_ids)
+            return embeddings
+        else:
+            raise ValueError(
+                "Unsupported pixel_values dimension:"
+                f" {pixel_values.dim()}. Expected 4 or 5."
+            )
+
+
+class SigLIPRotaryEmbedding(nn.Module):
+
+    def __init__(self, dim: int, theta: float = 10000.0) -> None:
+        super().__init__()
+        self.dim = dim
+        self.theta = theta
+        self.rope_init()
+
+    def rope_init(self):
+        inv_freq = 1.0 / (
+            self.theta ** (torch.arange(0, self.dim, 2, dtype=torch.float) / self.dim)
+        )
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+    def forward(self, seqlen: int) -> torch.Tensor:
+        seq = torch.arange(
+            seqlen,
+            device=self.inv_freq.device,
+            dtype=self.inv_freq.dtype,
+        )
+        freqs = torch.outer(seq, self.inv_freq)
+        return freqs
+
+
+class SiglipMLP(nn.Module):
+
+    def __init__(
+        self,
+        config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        self.activation_fn = get_act_fn(config.hidden_act)
+        if quant_config and quant_config.get_name() in ["bitsandbytes", "torchao"]:
+            quantizable = True
+        else:
+            quantizable = (
+                config.hidden_size % 64 == 0 and config.intermediate_size % 64 == 0
+            )
+        self.fc1 = ColumnParallelLinear(
+            config.hidden_size,
+            config.intermediate_size,
+            quant_config=quant_config if quantizable else None,
+            prefix=add_prefix("fc1", prefix),
+        )
+        self.fc2 = RowParallelLinear(
+            config.intermediate_size,
+            config.hidden_size,
+            quant_config=quant_config if quantizable else None,
+            prefix=add_prefix("fc2", prefix),
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states, _ = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states, _ = self.fc2(hidden_states)
+        return hidden_states
+
+
+class SiglipEncoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+        self.self_attn = VisionAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.num_attention_heads,
+            projection_size=self.embed_dim,
+            use_qkv_parallel=True,
+            qkv_bias=True,
+            flatten_batch=True,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.mlp = SiglipMLP(
+            config, quant_config=quant_config, prefix=add_prefix("mlp", prefix)
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: Optional[List[torch.Tensor]] = None,
+        rope_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+    ) -> Tuple[torch.FloatTensor]:
+
+        residual = hidden_states
+
+        hidden_states = self.layer_norm1(hidden_states)
+
+        hidden_states = self.self_attn(
+            hidden_states,
+            cu_seqlens=cu_seqlens,
+            position_embeddings=rope_emb,
+        )
+
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.layer_norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+
+        hidden_states = residual + hidden_states
+
+        return hidden_states
+
+
+class SiglipEncoder(nn.Module):
+
+    def __init__(
+        self,
+        config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        embed_dim = config.hidden_size
+        num_heads = config.num_attention_heads
+        head_dim = embed_dim // num_heads
+        self.layers = nn.ModuleList(
+            [
+                SiglipEncoderLayer(
+                    config,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{layer_idx}", prefix),
+                )
+                for layer_idx in range(config.num_hidden_layers)
+            ]
+        )
+        self.rotary_pos_emb = SigLIPRotaryEmbedding(head_dim // 2)
+
+    @staticmethod
+    def flatten_list(image_grid_thw):
+        tmp_image_grid_thw = list()
+        for image_grid in image_grid_thw:
+            if isinstance(image_grid, list):
+                tmp_image_grid_thw.extend(image_grid)
+            else:
+                tmp_image_grid_thw.append(image_grid)
+        return tmp_image_grid_thw
+
+    def forward(
+        self,
+        inputs_embeds,
+        cu_seqlens: Optional[List[torch.Tensor]] = None,
+        image_grid_thw: Optional[
+            List[
+                Union[
+                    Tuple[int, int, int],
+                    List[Tuple[int, int, int]],
+                ]
+            ]
+        ] = None,
+        height_position_ids: Optional[torch.Tensor] = None,
+        width_position_ids: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        device = inputs_embeds.device
+        hidden_states = inputs_embeds
+        flatten_image_grid_thw = self.flatten_list(image_grid_thw)
+
+        if width_position_ids is None or height_position_ids is None:
+            split_hids = list()
+            split_wids = list()
+            for t, h, w in flatten_image_grid_thw:
+                image_pids = torch.arange(t * h * w, device=device) % (h * w)
+                sample_hids = image_pids // w
+                sample_wids = image_pids % w
+                split_hids.append(sample_hids)
+                split_wids.append(sample_wids)
+            width_position_ids = torch.concat(split_wids, dim=0)
+            height_position_ids = torch.concat(split_hids, dim=0)
+
+        pids = torch.stack(
+            [height_position_ids, width_position_ids],
+            dim=-1,
+        )
+        max_grid_size = pids.max() + 1
+        rope_emb_max_grid = self.rotary_pos_emb(max_grid_size)
+        rope_emb = rope_emb_max_grid[pids].flatten(1)
+        rope_emb = rope_emb.repeat(1, 2)
+        rope_emb = (rope_emb.cos(), rope_emb.sin())
+        # cu_seqlens must be on cpu because of npu_flash_attention_unpad operator restriction
+        if is_npu() and isinstance(cu_seqlens, torch.Tensor):
+            cu_seqlens = cu_seqlens.to("cpu")
+        attn_cu_seqlens = cu_seqlens
+        hidden_states = inputs_embeds
+
+        for encoder_layer in self.layers:
+            hidden_states = encoder_layer(
+                hidden_states,
+                cu_seqlens=attn_cu_seqlens,
+                rope_emb=rope_emb,
+            )
+        return hidden_states
+
+
+class SiglipVisionTransformer(nn.Module):
+
+    def __init__(
+        self,
+        config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        embed_dim = config.hidden_size
+
+        self.embeddings = SiglipVisionEmbeddings(config)
+        self.encoder = SiglipEncoder(
+            config,
+            quant_config=quant_config,
+            prefix=add_prefix("encoder", prefix),
+        )
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+    def forward(
+        self,
+        pixel_values,
+        interpolate_pos_encoding: Optional[bool] = False,
+        position_ids: Optional[torch.Tensor] = None,
+        height_position_ids: Optional[torch.Tensor] = None,
+        width_position_ids: Optional[torch.Tensor] = None,
+        cu_seqlens: Optional[List[torch.Tensor]] = None,
+        image_grid_thw: Optional[
+            List[
+                Union[
+                    Tuple[int, int, int],
+                    List[Tuple[int, int, int]],
+                ]
+            ]
+        ] = None,
+    ) -> list[torch.Tensor]:
+
+        hidden_states = self.embeddings(
+            pixel_values,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            position_ids=position_ids,
+            image_grid_thw=image_grid_thw,
+        )
+
+        last_hidden_state = self.encoder(
+            inputs_embeds=hidden_states,
+            cu_seqlens=cu_seqlens,
+            image_grid_thw=image_grid_thw,
+            height_position_ids=height_position_ids,
+            width_position_ids=width_position_ids,
+        )
+
+        last_hidden_state = self.post_layernorm(last_hidden_state)
+
+        sample_hidden_state = list()
+        if cu_seqlens is None:
+            raise ValueError(
+                "cu_seqlens cannot be None for "
+                "SiglipVisionTransformer output processing."
+            )
+        for i in range(cu_seqlens.shape[0] - 1):
+            start = cu_seqlens[i]
+            end = cu_seqlens[i + 1]
+            tensor = last_hidden_state[:, start:end, :].squeeze(0)
+            sample_hidden_state.append(tensor)
+
+        return sample_hidden_state
+
+
+class SiglipVisionModel(nn.Module):
+    config_class = "PaddleOCRVisionConfig"
+    main_input_name = "pixel_values"
+
+    def __init__(
+        self,
+        config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.vision_model = SiglipVisionTransformer(
+            config,
+            quant_config=quant_config,
+            prefix=add_prefix("vision_model", prefix),
+        )
+        self.quant_config = quant_config
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return self.vision_model.embeddings.patch_embedding.weight.dtype
+
+    @property
+    def device(self) -> torch.device:
+        return self.vision_model.embeddings.patch_embedding.weight.device
+
+    def get_input_embeddings(self) -> nn.Module:
+        return self.vision_model.embeddings.patch_embedding
+
+    def forward(
+        self,
+        pixel_values,
+        interpolate_pos_encoding: bool = False,
+        position_ids: Optional[torch.Tensor] = None,
+        image_grid_thw: Optional[
+            List[
+                Union[
+                    Tuple[int, int, int],
+                    List[Tuple[int, int, int]],
+                ]
+            ]
+        ] = None,
+        cu_seqlens: Optional[List[torch.Tensor]] = None,
+    ) -> list[torch.Tensor]:
+
+        return self.vision_model(
+            pixel_values=pixel_values,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            position_ids=position_ids,
+            image_grid_thw=image_grid_thw,
+            cu_seqlens=cu_seqlens,
+        )
+
+
+class PaddleOCRVLForConditionalGeneration(Ernie4_5_ForCausalLM):
+
+    def __init__(self, *, config, quant_config=None, prefix: str = ""):
+        super().__init__(config=config, prefix=prefix)
+        config = self.config
+
+        self.mlp_AR = Projector(
+            config, config.vision_config, prefix=add_prefix("mlp_AR", prefix)
+        )
+        self.visual = SiglipVisionModel(
+            config=config.vision_config, prefix=add_prefix("visual", prefix)
+        )
+        if not hasattr(self.model, "get_input_embeddings"):
+            import types
+
+            self.model.get_input_embeddings = types.MethodType(
+                get_input_embeddings, self.model
+            )
+        self.is_mrope_enabled = "mrope_section" in self.config.rope_scaling
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+        return pattern.pad_input_tokens(input_ids, mm_inputs)
+
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+
+    def encode_image(self, pixel_values, image_grid_thw):
+        pixel_values = pixel_values.type(self.visual.dtype)
+        siglip_position_ids = list()
+        image_grid_hws = list()
+        cu_seqlens = [0]
+
+        for idx, grid_thw in enumerate(image_grid_thw):
+            thw_tuple = tuple(grid_thw.detach().cpu().numpy().tolist())
+            numel = np.prod(thw_tuple)
+            image_grid_hws.append(thw_tuple)
+            image_position_ids = torch.arange(numel) % np.prod(thw_tuple[1:])
+            siglip_position_ids.append(image_position_ids)
+            cu_seqlens.append(cu_seqlens[-1] + numel)
+
+        siglip_position_ids = torch.concat(siglip_position_ids, dim=0).to(
+            pixel_values.device
+        )
+        cu_seqlens = torch.tensor(cu_seqlens, dtype=torch.int32).to(pixel_values.device)
+        vision_outputs = self.visual(
+            pixel_values=pixel_values,
+            image_grid_thw=image_grid_hws,
+            position_ids=siglip_position_ids,
+            interpolate_pos_encoding=True,
+            cu_seqlens=cu_seqlens,
+        )
+        image_embeds = self.mlp_AR(vision_outputs, image_grid_thw)
+
+        # image_embeds = torch.stack(image_embeds, dim=0)
+        image_embeds = torch.cat(image_embeds, dim=0)
+
+        return image_embeds
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        pixel_values = torch.cat([item.feature for item in items], dim=0).type(
+            self.visual.dtype
+        )
+        image_grid_thw = torch.concat([item.image_grid_thw for item in items], dim=0)
+        image_embeds = self.encode_image(pixel_values, image_grid_thw)
+
+        return image_embeds
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        get_embedding: bool = False,
+    ):
+        if self.is_mrope_enabled:
+            positions = forward_batch.mrope_positions
+        if not (
+            forward_batch.forward_mode.is_decode()
+            or not forward_batch.contains_image_inputs()
+        ):
+            if self.is_mrope_enabled:
+                assert positions.ndim == 2 and positions.size(0) == 3, (
+                    "multimodal section rotary embedding requires "
+                    f"(3, seq_len) positions, but got {positions.size()}"
+                )
+
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.model,
+            multimodal_model=self,
+            positions=positions,
+        )
+
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]:
+        stacked_params_mapping = [
+            # (param_name, weight_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if "head.attention" in name or "head.layernorm" in name:
+                continue
+            if "head.mlp" in name or "head.probe" in name:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if "vision_model" in name and "out_proj" in name:
+                    # adapt to VisionAttention
+                    name = name.replace(".self_attn.out_proj", ".self_attn.proj")
+                if name in params_dict.keys():
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+                else:
+                    raise KeyError(f"Parameter '{name}' not found in model.")
+
+
+# monkey patch
+def get_input_embeddings(self) -> nn.Embedding:
+    return self.embed_tokens
+
+
+EntryClass = [PaddleOCRVLForConditionalGeneration]
diff --git a/sglang/python/sglang/srt/models/persimmon.py b/sglang/python/sglang/srt/models/persimmon.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f8885e716e537378081b0f1484232f1177ccaad
--- /dev/null
+++ b/sglang/python/sglang/srt/models/persimmon.py
@@ -0,0 +1,330 @@
+from collections.abc import Iterable
+from typing import Optional
+
+import torch
+from torch import nn
+from transformers import PersimmonConfig
+
+from sglang.srt.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import get_act_fn
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.quantization import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix, make_layers
+
+
+class PersimmonMLP(nn.Module):
+
+    def __init__(
+        self, config: PersimmonConfig, quant_config: Optional[QuantizationConfig] = None
+    ):
+        super().__init__()
+        self.dense_h_to_4h = ColumnParallelLinear(
+            config.hidden_size, config.intermediate_size, quant_config=quant_config
+        )
+        self.dense_4h_to_h = RowParallelLinear(
+            config.intermediate_size, config.hidden_size, quant_config=quant_config
+        )
+        self.act = get_act_fn(config.hidden_act)
+
+    def forward(self, hidden_states) -> torch.Tensor:
+        hidden_states, _ = self.dense_h_to_4h(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states, _ = self.dense_4h_to_h(hidden_states)
+        return hidden_states
+
+
+class PersimmonAttention(nn.Module):
+
+    def __init__(
+        self,
+        config: PersimmonConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        layer_id: int = 0,
+    ):
+        super().__init__()
+        self.config = config
+        tensor_parallel_world_size = get_tensor_model_parallel_world_size()
+
+        self.hidden_size = config.hidden_size
+        self.total_num_heads = config.num_attention_heads
+        self.num_heads = self.total_num_heads // tensor_parallel_world_size
+        self.head_dim = self.hidden_size // self.total_num_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+        self.partial_rotary_factor = config.partial_rotary_factor
+        self.is_causal = True
+
+        assert (self.head_dim * self.total_num_heads) == self.hidden_size
+        assert self.total_num_heads % tensor_parallel_world_size == 0
+
+        self.query_key_value = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            bias=True,
+            quant_config=quant_config,
+        )
+        self.dense = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            self.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+        )
+        self.is_qk_layernorm = config.qk_layernorm
+
+        if self.is_qk_layernorm:
+            self.q_layernorm = nn.LayerNorm(self.head_dim)
+            self.k_layernorm = nn.LayerNorm(self.head_dim)
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=self.max_position_embeddings,
+            base=self.rope_theta,
+            partial_rotary_factor=self.partial_rotary_factor,
+        )
+        self.scaling = self.head_dim**-0.5
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def _split_heads(self, x: torch.Tensor) -> torch.Tensor:
+        seq_length = x.shape[0]
+        return x.view(seq_length, self.num_heads, self.head_dim)
+
+    def _merge_heads(self, x: torch.Tensor) -> torch.Tensor:
+        seq_length = x.shape[0]
+        return x.view(seq_length, self.num_heads * self.head_dim)
+
+    def forward(
+        self,
+        position_ids: torch.Tensor,
+        forward_batch: ForwardBatch,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        qkv, _ = self.query_key_value(hidden_states)
+        q, k, v = qkv.chunk(chunks=3, dim=-1)
+
+        if self.is_qk_layernorm:
+            q = self._split_heads(q)
+            k = self._split_heads(k)
+
+            q = self.q_layernorm(q)
+            k = self.k_layernorm(k)
+
+            q = self._merge_heads(q)
+            k = self._merge_heads(k)
+
+        q, k = self.rotary_emb(position_ids, q, k)
+        attn_output = self.attn(q, k, v, forward_batch=forward_batch)
+        output, _ = self.dense(attn_output)
+        return output
+
+
+class PersimmonDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PersimmonConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        idx: int = 0,
+    ):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = PersimmonAttention(
+            config=config,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+            layer_id=idx,
+        )
+        self.mlp = PersimmonMLP(config, quant_config=quant_config)
+        self.input_layernorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_eps
+        )
+        self.post_attention_layernorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_eps
+        )
+
+    def forward(
+        self,
+        position_ids: torch.Tensor,
+        forward_batch: ForwardBatch,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        residual = hidden_states
+
+        hidden_states = self.input_layernorm(hidden_states)
+
+        hidden_states = self.self_attn(
+            position_ids=position_ids,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+
+        hidden_states = hidden_states + residual
+
+        outputs = hidden_states
+        return outputs
+
+
+class PersimmonModel(nn.Module):
+
+    def __init__(
+        self,
+        config: PersimmonConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.pp_group = get_pp_group()
+
+        if self.pp_group.is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size, config.hidden_size
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: PersimmonDecoderLayer(
+                config, quant_config=quant_config, prefix=prefix, idx=idx
+            ),
+            prefix="model.layers",
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+        )
+
+        if self.pp_group.is_last_rank:
+            self.final_layernorm = nn.LayerNorm(
+                config.hidden_size, eps=config.layer_norm_eps
+            )
+        else:
+            self.final_layernorm = PPMissingLayer()
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        forward_batch: ForwardBatch,
+        positions: torch.Tensor,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if self.pp_group.is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds
+            else:
+                hidden_states = self.get_input_embeddings(input_ids)
+        else:
+            hidden_states = forward_batch.pp_input_hidden
+        for i in range(self.start_layer, self.end_layer):
+            layer = self.layers[i]
+            hidden_states = layer(
+                position_ids=positions,
+                forward_batch=forward_batch,
+                hidden_states=hidden_states,
+            )
+        return self.final_layernorm(hidden_states)
+
+
+class PersimmonForCausalLM(nn.Module):
+
+    def __init__(
+        self,
+        config: PersimmonConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = PersimmonModel(
+            config=config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> LogitsProcessorOutput:
+        hidden_states = self.model(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            positions=positions,
+            inputs_embeds=inputs_embeds,
+        )
+
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if name not in params_dict:
+                if name == "lm_head.weight":
+                    continue
+                print(f"Warning: weight {name} not found in model.")
+                continue
+            param = params_dict[name]
+            if "query_key_value" in name:
+                output_dim = getattr(param, "output_dim", None)
+                if output_dim is not None:
+                    loaded_weight_shape = loaded_weight.shape
+                    num_heads = self.config.num_attention_heads
+                    loaded_weight = loaded_weight.view(
+                        loaded_weight_shape[:output_dim]
+                        + (num_heads, 3, -1)
+                        + loaded_weight_shape[output_dim + 1 :]
+                    )
+                    loaded_weight = loaded_weight.transpose(output_dim, output_dim + 1)
+                    loaded_weight = loaded_weight.reshape(loaded_weight_shape)
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            weight_loader(param, loaded_weight)
+
+
+EntryClass = PersimmonForCausalLM
diff --git a/sglang/python/sglang/srt/models/phi.py b/sglang/python/sglang/srt/models/phi.py
new file mode 100644
index 0000000000000000000000000000000000000000..5679bc987812f38db282fc4ecdad0e9a20631ad0
--- /dev/null
+++ b/sglang/python/sglang/srt/models/phi.py
@@ -0,0 +1,321 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/phi.py
+from typing import Iterable, Optional
+
+import torch
+from torch import nn
+from transformers import PhiConfig
+
+from sglang.srt.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import get_act_fn
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix, make_layers
+
+
+class PhiAttention(nn.Module):
+
+    def __init__(
+        self,
+        config: PhiConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        layer_id: int = 0,
+    ):
+        super().__init__()
+        self.total_num_heads = config.num_attention_heads
+        self.hidden_size = config.hidden_size
+        self.head_size = self.hidden_size // self.total_num_heads
+
+        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
+        assert self.total_num_heads % tensor_model_parallel_world_size == 0
+        self.num_heads = self.total_num_heads // tensor_model_parallel_world_size
+
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_size,
+            self.total_num_heads,
+            bias=True,
+            quant_config=quant_config,
+        )
+        self.dense = RowParallelLinear(
+            self.hidden_size,
+            self.hidden_size,
+            quant_config=quant_config,
+        )
+
+        scaling = self.head_size**-0.5
+        rotary_dim = int(
+            config.partial_rotary_factor
+            * (config.hidden_size // config.num_attention_heads)
+        )
+        assert rotary_dim % 2 == 0
+
+        rope_theta = getattr(config, "rope_theta", 10000.0)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 2048)
+        self.rotary_emb = get_rope(
+            self.head_size,
+            rotary_dim=rotary_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_size,
+            scaling,
+            num_kv_heads=self.num_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        position_ids: torch.Tensor,
+        forward_batch: ForwardBatch,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.chunk(chunks=3, dim=-1)
+        q, k = self.rotary_emb(position_ids, q, k)
+        attn_output = self.attn(q, k, v, forward_batch=forward_batch)
+        output, _ = self.dense(attn_output)
+        return output
+
+
+class PhiMLP(nn.Module):
+
+    def __init__(
+        self, config: PhiConfig, quant_config: Optional[QuantizationConfig] = None
+    ):
+        super().__init__()
+
+        n_inner = getattr(config, "n_inner", None)
+        n_inner = n_inner if n_inner is not None else 4 * config.hidden_size
+
+        self.fc1 = ColumnParallelLinear(
+            config.hidden_size,
+            n_inner,
+            quant_config=quant_config,
+        )
+        self.fc2 = RowParallelLinear(
+            n_inner,
+            config.hidden_size,
+            quant_config=quant_config,
+        )
+        self.act = get_act_fn(config.hidden_act)
+
+    def forward(self, hidden_states):
+        hidden_states, _ = self.fc1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states, _ = self.fc2(hidden_states)
+        return hidden_states
+
+
+class PhiLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PhiConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        idx: int = 0,
+    ):
+        super().__init__()
+        self.input_layernorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_eps
+        )
+        self.self_attn = PhiAttention(
+            config,
+            quant_config,
+            prefix=add_prefix("self_attn", prefix),
+            layer_id=idx,
+        )
+        self.mlp = PhiMLP(config, quant_config)
+
+    def forward(
+        self,
+        position_ids: torch.Tensor,
+        forward_batch: ForwardBatch,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        attn_outputs = self.self_attn(
+            position_ids=position_ids,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        feed_forward_hidden_states = self.mlp(hidden_states)
+        hidden_states = attn_outputs + feed_forward_hidden_states + residual
+        return hidden_states
+
+
+class PhiModel(nn.Module):
+
+    def __init__(
+        self,
+        config: PhiConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size, config.hidden_size
+        )
+
+        pp_group = get_pp_group()
+        pp_size = pp_group.world_size
+        pp_rank = pp_group.rank
+
+        self.start_layer = pp_rank * config.num_hidden_layers // pp_size
+        self.end_layer = (pp_rank + 1) * config.num_hidden_layers // pp_size
+
+        self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: PhiLayer(
+                config, quant_config=quant_config, prefix=prefix, idx=idx
+            ),
+            prefix=add_prefix("layers", prefix),
+        )
+
+        self.final_layernorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_eps
+        )
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        forward_batch: ForwardBatch,
+        positions: torch.Tensor,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if inputs_embeds is not None:
+            hidden_states = inputs_embeds
+        else:
+            hidden_states = self.get_input_embeddings(input_ids)
+        for i in range(self.start_layer, self.end_layer):
+            layer = self.layers[i]
+
+            hidden_states = layer(
+                position_ids=positions,
+                forward_batch=forward_batch,
+                hidden_states=hidden_states,
+            )
+        hidden_states = self.final_layernorm(hidden_states)
+        return hidden_states
+
+
+class PhiForCausalLM(nn.Module):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ]
+    }
+
+    def __init__(
+        self,
+        config: PhiConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = PhiModel(
+            config=config,
+            quant_config=quant_config,
+            prefix=add_prefix("model", prefix),
+        )
+
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> LogitsProcessorOutput:
+
+        hidden_states = self.model(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            positions=positions,
+            inputs_embeds=inputs_embeds,
+        )
+
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
+        params_dict = dict(self.named_parameters())
+        weights = dict(weights)
+        loaded_keys = set()
+
+        for name, param in params_dict.items():
+            if name in loaded_keys:
+                continue
+
+            # Handle packed weights
+            is_packed = False
+            for packed_name, src_names in self.packed_modules_mapping.items():
+                if packed_name not in name:
+                    continue
+
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                for src_name in src_names:
+                    full_src_name = name.replace(packed_name, src_name)
+                    if full_src_name in weights:
+                        loaded_weight = weights[full_src_name]
+                        # The shard_id for QKVParallelLinear is 'q', 'k', 'v'.
+                        shard_id = src_name.split("_")[0]
+                        weight_loader(param, loaded_weight, shard_id)
+                        loaded_keys.add(full_src_name)
+
+                loaded_keys.add(name)
+                is_packed = True
+                break
+            if is_packed:
+                continue
+
+            # Handle non-packed weights
+            if name not in weights:
+                # Redundant with the check in the loop, but good for safety
+                continue
+
+            loaded_weight = weights[name]
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            weight_loader(param, loaded_weight)
+            loaded_keys.add(name)
+
+
+EntryClass = PhiForCausalLM
diff --git a/sglang/python/sglang/srt/models/phi3_small.py b/sglang/python/sglang/srt/models/phi3_small.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ac855c492f6980f9a2e7dccf408d0eb9f746553
--- /dev/null
+++ b/sglang/python/sglang/srt/models/phi3_small.py
@@ -0,0 +1,476 @@
+import math
+from typing import Iterable, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from transformers import Phi3Config
+from transformers.configuration_utils import PretrainedConfig
+
+from sglang.srt.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer
+from sglang.srt.layers.vocab_parallel_embedding import (
+    DEFAULT_VOCAB_PADDING_SIZE,
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix, make_layers
+
+
+@torch.jit.script
+def quick_gelu(x):
+    return x * torch.sigmoid(1.702 * x)
+
+
+@torch.jit.script
+def gegelu(input, limit: Optional[float] = None):
+    a_gelu, a_linear = input[..., ::2], input[..., 1::2]
+    if limit is not None:
+        a_gelu = torch.where(
+            torch.isinf(a_gelu), a_gelu, a_gelu.clamp(min=None, max=limit)
+        )
+        a_linear = torch.where(
+            torch.isinf(a_linear),
+            a_linear,
+            a_linear.clamp(min=-limit, max=limit),
+        )
+    out_gelu = quick_gelu(a_gelu)
+    return out_gelu * (a_linear + 1)
+
+
+class Phi3SmallMLP(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        assert (
+            self.config.hidden_act == "gegelu"
+        ), "Only `gegelu` is supported for the 4.7 series of models .."
+        self.hidden_size = config.hidden_size
+        self.gegelu_limit = config.gegelu_limit
+        self.intermediate_size = config.intermediate_size
+
+        self.up_proj = MergedColumnParallelLinear(
+            self.hidden_size,
+            2 * [self.intermediate_size],
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            self.intermediate_size,
+            self.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+
+    def forward(self, x):
+        gate_up, _ = self.up_proj(x)
+        x = gegelu(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class Phi3SmallSelfAttention(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.layer_id = layer_id
+        self.config = config
+        self.sparse_block_size = config.blocksparse_block_size
+        self.homo_heads = config.blocksparse_homo_head_pattern
+        self.local_blocks = config.blocksparse_num_local_blocks
+        self.vert_stride = config.blocksparse_vert_stride
+
+        assert (
+            config.blocksparse_block_size == config.blocksparse_triton_kernel_block_size
+        )
+
+        self.hidden_size = config.hidden_size
+        # Number of Query Heads
+        self.num_heads = config.num_attention_heads
+
+        self.head_dim = self.hidden_size // self.num_heads
+        self.tp_size = get_tensor_model_parallel_world_size()
+        # Number of total Key Value Heads before tensor parallel
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_q_per_kv = self.num_heads // self.num_key_value_heads
+        if self.tp_size > 1:
+            assert self.num_key_value_heads % self.tp_size == 0
+        self.num_kv_heads_per_partion = max(1, self.num_key_value_heads // self.tp_size)
+        self.num_heads_per_partition = self.num_heads // self.tp_size
+
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_embedding_base = config.rope_embedding_base
+        self.rope_position_scale = config.rope_position_scale
+        self.is_causal = True
+
+        norm_factor = None
+        if config.mup_use_scaling:
+            norm_factor = self.head_dim / config.mup_attn_multiplier
+        else:
+            norm_factor = math.sqrt(self.head_dim)
+        self.scale = 1 / norm_factor
+
+        self.query_key_value = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.num_heads,
+            self.num_key_value_heads,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+
+        self.dense = RowParallelLinear(
+            self.hidden_size,
+            self.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        if getattr(self.config, "rope_scaling", None) is not None:
+            rope_scaling = self.config.rope_scaling
+            for key in rope_scaling:
+                if isinstance(rope_scaling[key], list):
+                    rope_scaling[key] = tuple(rope_scaling[key])
+
+            if "factor" not in rope_scaling:
+                rope_scaling["factor"] = self.rope_position_scale
+        else:
+            rope_scaling = {
+                "rope_type": "linear",
+                "factor": self.rope_position_scale,
+            }
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=self.max_position_embeddings,
+            base=self.rope_embedding_base,
+            rope_scaling=rope_scaling,
+        )
+
+        # blocksparse params
+        self.blocksparse_block_size = config.blocksparse_block_size
+        self.blocksparse_num_local_blocks = config.blocksparse_num_local_blocks
+        self.blocksparse_vert_stride = config.blocksparse_vert_stride
+
+        use_dense_attn = (
+            getattr(self.config, "dense_attention_every_n_layers", None)
+            and (self.layer_id + 1) % self.config.dense_attention_every_n_layers == 0
+        )
+
+        bs_params = None
+        if not use_dense_attn:
+            bs_params = {
+                "max_seqlen": self.max_position_embeddings,
+                "num_heads": self.num_heads_per_partition,
+                "num_kv_heads": self.num_kv_heads_per_partion,
+                "block_size": self.sparse_block_size,
+                "local_blocks": self.local_blocks,
+                "vert_stride": self.vert_stride,
+                "homo_head": self.homo_heads,
+            }
+
+        self.attn = RadixAttention(
+            self.num_heads_per_partition,
+            self.head_dim,
+            self.scale,
+            num_kv_heads=self.num_kv_heads_per_partion,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        qkv, _ = self.query_key_value(hidden_states)
+
+        qkv = qkv.view(qkv.shape[:-1] + (-1, (self.num_q_per_kv + 2), self.head_dim))
+        q, k, v = qkv.split([self.num_q_per_kv, 1, 1], dim=-2)
+
+        # NOTE: this is required by RotaryEmbed, which indeed does not have to
+        # TODO: allow 3D QK for rotary forward
+        q = q.reshape(-1, self.head_dim * self.num_heads_per_partition)
+        k = k.reshape(-1, self.head_dim * self.num_kv_heads_per_partion)
+        v = v.reshape(-1, self.head_dim * self.num_kv_heads_per_partion)
+
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch=forward_batch)
+        output, _ = self.dense(attn_output)
+
+        return output
+
+
+class Phi3SmallDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = Phi3SmallSelfAttention(
+            config,
+            layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.mlp = Phi3SmallMLP(
+            config,
+            quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+
+        self.input_layernorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_epsilon
+        )
+        self.post_attention_layernorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_epsilon
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+class Phi3SmallModel(nn.Module):
+
+    def __init__(
+        self,
+        config: Phi3Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.config = config
+
+        self.pp_group = get_pp_group()
+        if self.pp_group.is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                prefix=add_prefix("embed_tokens", prefix),
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+        self.mup_embedding_multiplier = config.mup_embedding_multiplier
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: Phi3SmallDecoderLayer(
+                config,
+                int(prefix.split(".")[-1]),
+                quant_config,
+                prefix=prefix,
+            ),
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix=add_prefix("layers", prefix),
+        )
+
+        self.final_layernorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_epsilon
+        )
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        positions: Optional[torch.LongTensor],
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor],
+    ) -> Union[torch.Tensor]:
+
+        if inputs_embeds is not None:
+            hidden_states = inputs_embeds
+        else:
+            hidden_states = self.get_input_embeddings(input_ids)
+        if (
+            self.mup_embedding_multiplier is not None
+            and self.mup_embedding_multiplier > 0.0
+        ):
+            hidden_states = hidden_states * self.mup_embedding_multiplier
+
+        for i in range(self.start_layer, self.end_layer):
+            layer = self.layers[i]
+            hidden_states = layer(positions, hidden_states, forward_batch=forward_batch)
+
+        hidden_states = self.final_layernorm(hidden_states)
+        return hidden_states
+
+
+class Phi3SmallForCausalLM(nn.Module):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(
+        self,
+        config: Phi3Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+
+        super().__init__()
+
+        self.config = config
+        self.quant_config = quant_config
+        self.model = Phi3SmallModel(
+            config=config,
+            quant_config=quant_config,
+            prefix=add_prefix("model", prefix),
+        )
+        self.vocab_size = config.vocab_size
+        self.mup_width_multiplier = config.mup_width_multiplier
+        self.lm_head = ParallelLMHead(
+            self.vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+            padding_size=DEFAULT_VOCAB_PADDING_SIZE,
+            quant_config=quant_config,
+            prefix=add_prefix("lm_head", prefix),
+        )
+        if self.config.tie_word_embeddings:
+            self.lm_head.weight = self.model.embed_tokens.weight
+        self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+
+        # tokens in tiktoken but not used
+        if hasattr(config, "dummy_token_indices"):
+            device = self.lm_head.weight.device
+            self.register_buffer(
+                "dummy_token_indices",
+                torch.LongTensor(config.dummy_token_indices).to(device),
+                persistent=False,
+            )
+        else:
+            self.dummy_token_indices = None
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    def set_output_embeddings(self, value):
+        self.lm_head = value
+
+    def set_decoder(self, decoder):
+        self.model = decoder
+
+    def get_decoder(self):
+        return self.model
+
+    def compute_logits(
+        self,
+        input_ids: torch.LongTensor,
+        hidden_states: torch.Tensor,
+        sampling_metadata,
+    ) -> Optional[torch.Tensor]:
+        logits = self.logits_processor(
+            input_ids, self.lm_head, hidden_states, sampling_metadata
+        )
+        if self.dummy_token_indices is not None and logits is not None:
+            logits.index_fill_(-1, self.dummy_token_indices, -torch.inf)
+        return logits
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        positions: Optional[torch.LongTensor],
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        get_embedding: bool = False,
+    ) -> LogitsProcessorOutput:
+        hidden_states = self.model(
+            input_ids=input_ids,
+            positions=positions,
+            forward_batch=forward_batch,
+            inputs_embeds=inputs_embeds,
+        )
+
+        if not get_embedding:
+            return self.logits_processor(
+                input_ids, hidden_states, self.lm_head, forward_batch
+            )
+
+        else:
+            return self.pooler(hidden_states, forward_batch)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if name.endswith(".bias") and name not in params_dict:
+                continue
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+
+            param = params_dict[name]
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            weight_loader(param, loaded_weight)
+
+
+EntryClass = Phi3SmallForCausalLM
diff --git a/sglang/python/sglang/srt/models/phi4mm_audio.py b/sglang/python/sglang/srt/models/phi4mm_audio.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd199836e9a9c04b64ef1ced43677414ab74bd32
--- /dev/null
+++ b/sglang/python/sglang/srt/models/phi4mm_audio.py
@@ -0,0 +1,1260 @@
+# Copyright 2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#!/usr/bin/env python3
+import abc
+import math
+from typing import Literal, Optional
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
+    CheckpointWrapper,
+)
+from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel
+from transformers import PretrainedConfig
+
+from sglang.srt.models.phi4mm_utils import (
+    AbsolutePositionalEncoding,
+    ConvModule,
+    FeedForward,
+    MeanVarianceNormLayer,
+    MultiHeadedAttention,
+    MultiSequential,
+    NemoConvSubsampling,
+    T5RelativeAttentionLogitBias,
+    adaptive_enc_mask,
+    get_offset,
+    unfold_tensor,
+)
+
+_AUDIO_PLACEHOLDER_TOKEN_ID = 200011  # <|endoftext11|>
+
+
+class ConformerEncoderLayer(nn.Module):
+    """ConformerEncoder Layer module.
+    for more details see conformer paper:
+        https://arxiv.org/abs/2005.08100
+    This module implement the Conformer block layer.
+
+    Args:
+        d_model: int
+            attention dim.
+        ext_pw_out_channel: int
+            if > 0, ext_pw_out_channel is a dim channel size
+             for the last pointwise conv after swish activation.
+        depthwise_seperable_out_channel: int
+            if set different to 0, the number of
+             depthwise_seperable_out_channel will be used as a
+             channel_out of the second conv1d layer.
+             otherwise, it equal to 0, the second conv1d layer is skipped.
+        depthwise_multiplier: int
+            number of input_dim channels duplication. this value
+             will be used to compute the hidden channels of the Conv1D.
+        n_head: int
+            the number of heads for multihead attention module.
+        d_ffn: int
+            output size of the feed_forward blocks.
+        ext_pw_kernel_size: int
+            kernel size of the conv pointwise of the conformer.
+        kernel_size: int
+            kernel size.
+        dropout_rate: float
+            dropout rate.
+        causal: bool, optional
+            if set to True, convolution have no access
+             to future frames. default False.
+        batch_norm: bool, optional
+            if set to True, apply batchnorm before activation
+            in ConvModule layer of the conformer.
+            default False
+        activation: str, optional
+            activation function name,
+            one of ["relu", "swish", "sigmoid"],
+            sigmoid activation is only used with "glu_in_fnn=True",
+            default "relu".
+        chunk_se: int, optional
+            0 for offline SE.
+            1 for streaming SE, where mean is computed
+             by accumulated history until current chunk_se.
+            2 for streaming SE, where mean is computed
+             by only the current chunk.
+            default 0.
+        chunk_size: int, optional
+            chunk_size for cnn. default 18
+        conv_activation: str, optional
+            activation function used in ConvModule part
+            of the conformer, default "relu".
+        conv_glu_type: str, optional
+            activation function used for the glu inside
+            the ConvModule part of the conformer.
+            default: "sigmoid".
+        bias_in_glu: bool, optional
+            if set to True, use additive bias in the weight module
+             before GLU.
+        linear_glu_in_convm: bool, optional
+            if set to True, use GLULinear module,
+             otherwise, used GLUPointWiseConv module.
+              default to False.
+        attention_inner_dim: int, optional
+            if equal to -1, attention dim for linears k/q/v is
+            equal to d_model. otherwise attention_inner_dim is used.
+            default -1.
+        attention_glu_type: str, optional
+            activation function for glu used in the multihead attention,
+             default "swish".
+        activation_checkpointing: str, optional
+            a dictionarry of {"module","interval","offload"}, where
+                "module": str
+                    accept ["transformer", "attention"] to select
+                    which module should do activation checkpointing.
+                "interval": int, default 1,
+                    interval of applying activation checkpointing,
+                    interval = 1 means that we apply checkpointing
+                    on every layer (if activation), otherwise,
+                    we apply it every x interval.
+                "offload": bool, default False,
+                    if set to True, we offload activation to cpu and
+                    reload it during backward, otherwise,
+                    we recalculate activation in backward.
+            default "".
+        export: bool, optional
+            if set to True, it remove the padding from convolutional layers
+             and allow the onnx conversion for inference.
+              default False.
+        use_pt_scaled_dot_product_attention: bool, optional
+            if set to True, use pytorch's scaled dot product attention
+            implementation in training.
+        attn_group_sizes: int, optional
+            the number of groups to use for attention, default 1
+            (Multi-Head Attention),
+            1 = typical Multi-Head Attention,
+            1 < attn_group_sizes < attention_heads = Grouped-Query Attention
+            attn_group_sizes = attention_heads = Multi-Query Attention
+    """
+
+    def __init__(
+        self,
+        d_model=512,
+        ext_pw_out_channel=0,
+        depthwise_seperable_out_channel=256,
+        depthwise_multiplier=1,
+        n_head=4,
+        d_ffn=2048,
+        ext_pw_kernel_size=1,
+        kernel_size=3,
+        dropout_rate=0.1,
+        causal=False,
+        batch_norm=False,
+        activation="relu",
+        chunk_se=0,
+        chunk_size=18,
+        conv_activation="relu",
+        conv_glu_type="sigmoid",
+        bias_in_glu=True,
+        linear_glu_in_convm=False,
+        attention_inner_dim=-1,
+        attention_glu_type="swish",
+        activation_checkpointing="",
+        export=False,
+        use_pt_scaled_dot_product_attention=False,
+        attn_group_sizes: int = 1,
+    ):
+        super().__init__()
+
+        self.feed_forward_in = FeedForward(
+            d_model=d_model,
+            d_inner=d_ffn,
+            dropout_rate=dropout_rate,
+            activation=activation,
+            bias_in_glu=bias_in_glu,
+        )
+
+        self.self_attn = MultiHeadedAttention(
+            n_head,
+            d_model,
+            dropout_rate,
+            attention_inner_dim,
+            attention_glu_type,
+            bias_in_glu,
+            use_pt_scaled_dot_product_attention=use_pt_scaled_dot_product_attention,
+            group_size=attn_group_sizes,
+        )
+        self.conv = ConvModule(
+            d_model,
+            ext_pw_out_channel,
+            depthwise_seperable_out_channel,
+            ext_pw_kernel_size,
+            kernel_size,
+            depthwise_multiplier,
+            dropout_rate,
+            causal,
+            batch_norm,
+            chunk_se,
+            chunk_size,
+            conv_activation,
+            conv_glu_type,
+            bias_in_glu,
+            linear_glu_in_convm,
+            export=export,
+        )
+
+        self.feed_forward_out = FeedForward(
+            d_model=d_model,
+            d_inner=d_ffn,
+            dropout_rate=dropout_rate,
+            activation=activation,
+            bias_in_glu=bias_in_glu,
+        )
+
+        self.layer_norm_att = nn.LayerNorm(d_model)
+        self.layer_norm = nn.LayerNorm(d_model)
+
+    def forward(
+        self,
+        x,
+        pos_k,
+        pos_v,
+        mask,
+        relative_attention_bias: Optional[Tensor] = None,
+    ):
+        """ConformerEncoder forward.
+
+        Args:
+            x: torch.Tensor
+                input feature of shape (batch, max_time_in, size)
+            pos_k: torch.Tensor
+                positional key embedding.
+            mask: torch.Tensor
+                mask for x (batch, max_time_in)
+            relative_attention_bias: Optional[torch.Tensor]
+                bias added to attention logits w.r.t. relative positions
+                (1, n_head, time1, time2)
+        """
+        x = x + 0.5 * self.feed_forward_in(x)
+        norm_x = self.layer_norm_att(x)
+
+        x = x + self.self_attn(
+            norm_x,
+            norm_x,
+            norm_x,
+            pos_k,
+            pos_v,
+            mask,
+            relative_attention_bias=relative_attention_bias,
+        )
+        x = x + self.conv(x)
+        x = x + 0.5 * self.feed_forward_out(x)
+
+        out = self.layer_norm(x)
+
+        return out, pos_k, pos_v, mask
+
+
+class TransformerEncoderBase(abc.ABC, nn.Module):
+    """The Base class for Transformer based encoders
+
+    Please set causal = True in streaming model
+    Args:
+        input_size: int
+            input feature dimension.
+        chunk_size: int, list(int)
+            Number of frames for each chunk
+            This variable can take 2 forms:
+            int:  Used for inference, or single chunk size training
+            list(int) : Used only for variable chunk size training
+            Some examples for the 2 cases:
+            chunk_size = 12
+            chunk_size = [6, 8, 12, 24]
+        left_chunk: int, list(int)
+            Number of chunks used for masking in streaming mode.
+            This variable can take 2 forms:
+            int:  Used for inference, or single chunk size training
+            list(int) : Used only for variable chunk size training. When
+            chunk_size is a list, left_chunk must be a list with same length.
+            Some examples for the 2 cases:
+            left_chunk = 6
+            left_chunk = [12, 9, 6, 3]
+        attention_dim: int, optional
+            attention dimension. default 256.
+        attention_heads: int, optional
+            the number of heads. default 4
+        input_layer: str, optional
+            input layer type before Conformer,
+            one of ["linear", "conv2d", "custom", "vgg2l", "embed"],
+            default "conv2d"
+        cnn_out: int, optional
+            the number of CNN channels before Conformer.
+            default -1.
+        cnn_layer_norm: bool, optional
+            layer norm between Conformer and the first CNN.
+            default False.
+        time_reduction: int, optional
+            time reduction factor
+            default 4
+        dropout_rate: float, optional
+            dropout rate. default 0.1
+        padding_idx: int, optional
+            padding index for input_layer=embed
+            default -1
+        relative_attention_bias_args: dict, optional
+            use more efficient scalar bias-based relative multihead attention
+            (Q*K^T + B) implemented in cmb.basics.embedding.
+            [T5/ALiBi]RelativeAttentionLogitBias
+            usage: relative_attention_bias_args={"type": t5/alibi}
+            additional method-specific arguments can be provided (see
+            transformer_base.py)
+        positional_dropout_rate: float, optional
+            dropout rate after positional encoding. default 0.0
+        nemo_conv_settings: dict, optional
+            A dictionary of settings for NeMo Subsampling.
+            default None
+        conv2d_extra_padding: str, optional
+            Add extra padding in conv2d subsampling layers. Choices are
+            (feat, feat_time, none, True).
+            if True or feat_time, the extra padding is added into non full
+            supraframe utts in batch.
+            Default: none
+        attention_group_size: int, optional
+            the number of groups to use for attention, default 1
+            (Multi-Head Attention),
+            1 = typical Multi-Head Attention,
+            1 < attention_group_size < attention_heads = Grouped-Query
+            Attention
+            attention_group_size = attention_heads = Multi-Query Attention
+    """
+
+    def __init__(
+        self,
+        input_size,
+        chunk_size,
+        left_chunk,
+        attention_dim=256,
+        attention_heads=4,
+        input_layer="nemo_conv",
+        cnn_out=-1,
+        cnn_layer_norm=False,
+        time_reduction=4,
+        dropout_rate=0.0,
+        padding_idx=-1,
+        relative_attention_bias_args=None,
+        positional_dropout_rate=0.0,
+        nemo_conv_settings=None,
+        conv2d_extra_padding: Literal["feat", "feat_time", "none", True] = "none",
+        attention_group_size=1,
+        encoder_embedding_config=None,
+    ):
+        super().__init__()
+        self.input_size = input_size
+        self.input_layer = input_layer
+        self.chunk_size = chunk_size
+        self.left_chunk = left_chunk
+        self.attention_dim = attention_dim
+        self.num_heads = attention_heads
+        self.attention_group_size = attention_group_size
+        self.time_reduction = time_reduction
+        self.nemo_conv_settings = nemo_conv_settings
+        self.encoder_embedding_config = encoder_embedding_config
+
+        if self.input_layer == "nemo_conv":
+            default_nemo_conv_settings = {
+                "subsampling": "dw_striding",
+                "subsampling_factor": self.time_reduction,
+                "feat_in": input_size,
+                "feat_out": attention_dim,
+                "conv_channels": 256,
+                "subsampling_conv_chunking_factor": 1,
+                "activation": nn.ReLU(),
+                "is_causal": False,
+            }
+            # Override any of the defaults with the incoming, user settings
+            if nemo_conv_settings:
+                default_nemo_conv_settings.update(nemo_conv_settings)
+                for i in ["subsampling_factor", "feat_in", "feat_out"]:
+                    assert (
+                        i not in nemo_conv_settings
+                    ), "{i} should be specified outside of the NeMo dictionary"
+
+            self.embed = NemoConvSubsampling(
+                **default_nemo_conv_settings,
+            )
+        else:
+            raise ValueError("unknown input_layer: " + input_layer)
+
+        self.pos_emb = AbsolutePositionalEncoding(
+            attention_dim, positional_dropout_rate
+        )
+
+        self.relative_attention_bias_type = (
+            relative_attention_bias_args.get("type")
+            if relative_attention_bias_args
+            else None
+        )
+        if self.relative_attention_bias_type == "t5":
+            assert (
+                self.num_heads % self.attention_group_size == 0
+            ), "attention_group_size must divide n_head"
+            self.relative_attention_bias_layer = T5RelativeAttentionLogitBias(
+                self.num_heads // self.attention_group_size,
+                max_distance=relative_attention_bias_args.get(
+                    "t5_bias_max_distance", 1000
+                ),
+                symmetric=relative_attention_bias_args.get("t5_bias_symmetric", False),
+            )
+        else:
+            raise NotImplementedError
+
+        self.encoder_embedding = MeanVarianceNormLayer(
+            self.encoder_embedding_config["input_size"]
+        )
+
+    def compute_lens_change(self, feature_lens):
+        """feature_lens: int
+        return updated feature lens.
+
+        This used to return a different lambda function for each case that
+        computed the right thing.  That does not work within Torchscript.
+        If you really need this to be faster, create nn.Module()-s for all
+        the cases and return one of them.  Torchscript does support that.
+        """
+        if self.input_layer == "nemo_conv":
+            # Handle the special causal case
+            subsampling_causal_cond = self.nemo_conv_settings.get(
+                "subsampling", "dw_striding"
+            ) in [
+                "dw_striding",
+                "striding",
+                "striding_conv1d",
+            ]
+            is_causal = self.nemo_conv_settings.get("is_causal", False)
+            if is_causal and subsampling_causal_cond:
+                lens_change = (
+                    torch.ceil(feature_lens / self.time_reduction).long()
+                    if isinstance(feature_lens, Tensor)
+                    else math.ceil(feature_lens / self.time_reduction)
+                )
+                feature_lens_remainder = feature_lens % self.time_reduction
+                if isinstance(feature_lens, Tensor):
+                    lens_change[feature_lens_remainder != 1] += 1
+                elif feature_lens_remainder != 1:
+                    lens_change += 1
+                return lens_change
+            ceil_func = math.ceil if isinstance(feature_lens, int) else torch.ceil
+            return ceil_func(feature_lens / self.time_reduction)
+
+    @abc.abstractmethod
+    def forward(self):
+        """Abstract forward method implementation."""
+
+    def _chunk_size_selection(self, chunk_size=None, left_chunk=None):
+        """If chunk size is a list, we will randomly select a chunk size."""
+
+        if chunk_size is None:
+            chunk_size = self.chunk_size
+        if left_chunk is None:
+            left_chunk = self.left_chunk
+        if isinstance(chunk_size, list):
+            # Variable chunk size during training
+            chunk_size_index = int(
+                torch.randint(low=0, high=len(chunk_size), size=(1,))
+            )
+            chunk_size_train_eff = chunk_size[chunk_size_index]
+            if not isinstance(left_chunk, list):
+                raise ValueError(
+                    "Since chunk_size is a list, left_chunk must be a list"
+                )
+            if len(left_chunk) != len(chunk_size):
+                raise ValueError(
+                    "The length of left_chunk must be the same as length of "
+                    "chunk_size."
+                )
+            left_chunk_train_eff = left_chunk[chunk_size_index]
+        else:
+            chunk_size_train_eff = chunk_size
+            left_chunk_train_eff = left_chunk
+
+        return chunk_size_train_eff, left_chunk_train_eff
+
+    def _get_embed_class(self, embed):
+        # pylint: disable=protected-access
+        is_embed_using_act_chkpt = isinstance(embed, CheckpointWrapper)
+        is_embed_fsdp_wrapped = isinstance(embed, FullyShardedDataParallel)
+        embed_class = embed
+        if is_embed_using_act_chkpt:
+            embed_class = embed._checkpoint_wrapped_module
+        if is_embed_fsdp_wrapped:
+            embed_class = embed.module
+        return embed_class
+
+    def _forward_embeddings_core(self, input_tensor, masks):
+        embed_class = self._get_embed_class(self.embed)
+        assert isinstance(embed_class, NemoConvSubsampling)
+        input_tensor, masks = self.embed(input_tensor, masks)
+        return input_tensor, masks
+
+    def _position_embedding(self, input_tensor):
+        pos_k = None
+        pos_v = None
+        if self.relative_attention_bias_layer is None:
+            input_tensor = self.pos_emb(
+                input_tensor
+            )  # default to add abs sinusoid embedding
+        return pos_k, pos_v
+
+    def _streaming_mask(self, seq_len, batch_size, chunk_size, left_chunk):
+        chunk_size_train_eff, left_chunk_train_eff = self._chunk_size_selection(
+            chunk_size, left_chunk
+        )
+
+        # Create mask matrix for streaming
+        # S stores start index. if chunksize is 18, s is [0,18,36,....]
+        chunk_start_idx = np.arange(0, seq_len, chunk_size_train_eff)
+
+        enc_streaming_mask = (
+            adaptive_enc_mask(
+                seq_len, chunk_start_idx, left_window=left_chunk_train_eff
+            )
+            .unsqueeze(0)
+            .expand([batch_size, -1, -1])
+        )
+        return enc_streaming_mask
+
+    def forward_embeddings(self, xs_pad, masks, chunk_size_nc=None, left_chunk_nc=None):
+        """Forwarding the inputs through the top embedding layers
+
+        Args:
+            xs_pad: torch.Tensor
+                input tensor
+            masks: torch.Tensor
+                input mask
+            chunk_size_nc: (optional, default is None) chunk size for
+                            non-causal layers
+            left_chunk_nc: (optional, default is None) # of left chunks for
+                            non-causal layers
+        """
+        # pylint: disable=R0915
+        # get new lens.
+        seq_len = int(self.compute_lens_change(xs_pad.shape[1]))
+        if seq_len <= 0:
+            raise ValueError(
+                f"""The sequence length after time reduction is invalid:
+                {seq_len}. Your input feature is too short. Consider
+                filtering out the very short sentence from data
+                loader""",
+            )
+
+        batch_size = xs_pad.shape[0]
+
+        enc_streaming_mask = self._streaming_mask(
+            seq_len, batch_size, self.chunk_size, self.left_chunk
+        )
+
+        if xs_pad.is_cuda:
+            enc_streaming_mask = enc_streaming_mask.cuda()
+            xs_pad = xs_pad.cuda()
+
+        input_tensor = xs_pad
+        input_tensor, masks = self._forward_embeddings_core(input_tensor, masks)
+
+        streaming_mask = enc_streaming_mask
+        if streaming_mask is not None and masks is not None:
+            hs_mask = masks & streaming_mask
+        elif masks is not None:
+            hs_mask = masks
+        else:
+            hs_mask = streaming_mask
+
+        if chunk_size_nc is not None:
+            enc_streaming_mask_nc = self._streaming_mask(
+                seq_len, batch_size, chunk_size_nc, left_chunk_nc
+            )
+            if xs_pad.is_cuda:
+                enc_streaming_mask_nc = enc_streaming_mask_nc.cuda()
+            if masks is not None:
+                hs_mask_nc = masks & enc_streaming_mask_nc
+            else:
+                hs_mask_nc = enc_streaming_mask_nc
+        else:
+            hs_mask_nc = None
+
+        pos_k, pos_v = self._position_embedding(input_tensor)
+
+        if chunk_size_nc is None:
+            return input_tensor, pos_k, pos_v, hs_mask, masks
+        return input_tensor, pos_k, pos_v, hs_mask, masks, hs_mask_nc
+
+    def get_offset(self):
+        """Returns offset used when retaining inputs for decoding.
+
+        This is essentially, how many additional frames have to be added to
+        the front-end CNN input to ensure it can produce a single output.
+        So if the "padding" parameter is 0, typically offset will be > 0.
+        """
+        return get_offset(self.input_layer, self.time_reduction)
+
+
+class ConformerEncoder(TransformerEncoderBase):
+    """ConformerEncoder module.
+    see original paper for more details:
+        https://arxiv.org/abs/2005.08100
+
+    Please set causal = True in streaming model
+    Args:
+        input_size: int
+            input feature dimension.
+        chunk_size: int, list(int)
+            Number of frames for each chunk
+            This variable can take 2 forms:
+            int:  Used for inference, or single chunk size training
+            list(int) : Used only for variable chunk size training
+            Some examples for the 2 cases:
+            chunk_size = 12
+            chunk_size = [6, 8, 12, 24]
+        left_chunk: int, list(int)
+            Number of chunks used for masking in streaming mode.
+            This variable can take 2 forms:
+            int:  Used for inference, or single chunk size training
+            list(int) : Used only for variable chunk size training. When
+            chunk_size is a list, left_chunk must be a list with same length.
+            Some examples for the 2 cases:
+            left_chunk = 6
+            left_chunk = [12, 9, 6, 3]
+        left_chunk: int
+            number of chunks used for masking in streaming mode.
+        num_lang: int
+            This parameter is used to store the number of languages in the
+            lang_dict, only used for multiseed/multilingual models.
+            default None.
+        attention_dim: int, optional
+            attention dimension. default 256.
+        attention_heads: int, optional
+            the number of heads. default 4
+        linear_units:
+            the number of units of position-wise feed forward.
+            default 2048
+        num_block:
+            number of Transformer layer. default 6
+        dropout_rate: float, optional
+            dropout rate. default 0.1
+        input_layer: str, optional
+            input layer type before Conformer,
+            one of ["linear", "conv2d", "custom", "vgg2l", "embed"],
+            default "conv2d"
+        causal: bool, optional
+            if set to True, convolution have no access
+             to future frames. default False.
+        batch_norm: bool, optional
+            if set to True, apply batchnorm before activation
+            in ConvModule layer of the conformer.
+            default False
+        cnn_out: int, optional
+            the number of CNN channels before Conformer.
+            default -1.
+        cnn_layer_norm: bool, optional
+            layer norm between Conformer and the first CNN.
+            default False.
+        ext_pw_out_channel: int, optional
+            the number of channel for CNN
+            before depthwise_seperable_CNN.
+            If 0 then use linear. default 0.
+        ext_pw_kernel_size: int, optional
+            kernel size of N before depthwise_seperable_CNN.
+            only work for ext_pw_out_channel > 0.
+            default 1
+        depthwise_seperable_out_channel: int, optional
+            the number of channel for
+            depthwise_seperable_CNN.
+            default 256.
+        depthwise_multiplier: int, optional
+            the number of multiplier for
+            depthwise_seperable_CNN.
+            default 1.
+        chunk_se: int, optional
+            0 for offline SE.
+            1 for streaming SE, where mean is computed
+             by accumulated history until current chunk_se.
+            2 for streaming SE, where mean is computed
+             by only the current chunk.
+            default 0.
+        kernel_size: int, optional
+            the number of kernels for depthwise_seperable_CNN.
+            default 3.
+        activation: str, optional
+            FeedForward block activation.
+            one of ["relu", "swish", "sigmoid"]
+            default "relu".
+        conv_activation: str, optional
+            activation function used in ConvModule part
+            of the conformer, default "relu".
+        conv_glu_type: str, optional
+            activation used use glu in depthwise_seperable_CNN,
+            default "sigmoid"
+        bias_in_glu: bool, optional
+            if set to True, use additive bias in the weight module
+             before GLU. default True
+        linear_glu_in_convm: bool, optional
+            if set to True, use GLULinear module,
+             otherwise, used GLUPointWiseConv module.
+              default to False.
+        attention_glu_type: str
+            only work for glu_in_attention !=0
+            default "swish".
+        export: bool, optional
+            if set to True, it remove the padding from convolutional layers
+             and allow the onnx conversion for inference.
+              default False.
+        activation_checkpointing: str, optional
+            a dictionarry of {"module","interval","offload"}, where
+                "module": str
+                    accept ["transformer", "attention"] to select
+                    which module should do activation checkpointing.
+                "interval": int, default 1,
+                    interval of applying activation checkpointing,
+                    interval = 1 means that we apply checkpointing
+                    on every layer (if activation), otherwise,
+                    we apply it every x interval.
+                "offload": bool, default False,
+                    if set to True, we offload activation to cpu and
+                    reload it during backward, otherwise,
+                    we recalculate activation in backward.
+            default "".
+        extra_layer_output_idx: int
+            the layer index to be exposed.
+        relative_attention_bias_args: dict, optional
+            use more efficient scalar bias-based relative multihead attention
+            (Q*K^T + B) implemented in cmb.basics.embedding.
+            [T5/ALiBi]RelativeAttentionLogitBias
+            usage: relative_attention_bias_args={"type": t5/alibi}
+            additional method-specific arguments can be provided (see
+            transformer_base.py)
+        time_reduction: int optional
+            time reduction factor
+            default 4
+        use_pt_scaled_dot_product_attention: whether to use pytorch scaled
+            dot product attention in training.
+            Default: False
+        nemo_conv_settings: dict, optional
+            A dictionary of settings for NeMo Subsampling.
+            default: None
+            usage: nemo_conv_settings=
+                {
+                    "subsampling":
+                    dw_striding/striding/dw_striding_conv1d/striding_conv1d,
+                    "conv_channels": int,
+                    "subsampling_conv_chunking_factor": int,
+                    "is_causal": True/False
+                }
+        conv2d_extra_padding: str, optional
+            Add extra padding in conv2d subsampling layers. Choices are
+            (feat, feat_time, none, True)
+            Default: none
+        replication_pad_for_subsample_embedding:  For batched-streaming
+            decoding, use "replication" padding for the cache at start of
+            utterance.
+            Default: False
+        attention_group_size: int, optional
+            the number of groups to use for attention, default 1
+            (Multi-Head Attention),
+            1 = typical Multi-Head Attention,
+            1 < attention_group_size < attention_heads = Grouped-Query
+            Attention
+            attention_group_size = attention_heads = Multi-Query Attention
+    """
+
+    extra_multi_layer_output_idxs: list[int]
+
+    def __init__(  # pylint: disable-all
+        self,
+        input_size,
+        chunk_size,
+        left_chunk,
+        num_lang=None,
+        attention_dim=256,
+        attention_heads=4,
+        linear_units=2048,
+        num_blocks=6,
+        dropout_rate=0.1,
+        input_layer="nemo_conv",
+        causal=True,
+        batch_norm=False,
+        cnn_out=-1,
+        cnn_layer_norm=False,
+        ext_pw_out_channel=0,
+        ext_pw_kernel_size=1,
+        depthwise_seperable_out_channel=256,
+        depthwise_multiplier=1,
+        chunk_se=0,
+        kernel_size=3,
+        activation="relu",
+        conv_activation="relu",
+        conv_glu_type="sigmoid",
+        bias_in_glu=True,
+        linear_glu_in_convm=False,
+        attention_glu_type="swish",
+        export=False,
+        extra_layer_output_idx=-1,
+        extra_multi_layer_output_idxs=[],  # noqa
+        activation_checkpointing="",
+        relative_attention_bias_args=None,
+        time_reduction=4,
+        use_pt_scaled_dot_product_attention=False,
+        nemo_conv_settings=None,
+        conv2d_extra_padding: Literal["feat", "feat_time", "none", True] = "none",
+        replication_pad_for_subsample_embedding=False,
+        attention_group_size=1,
+        encoder_embedding_config=None,
+    ):
+        super().__init__(
+            input_size,
+            chunk_size,
+            left_chunk,
+            attention_dim,
+            attention_heads,
+            input_layer,
+            cnn_out,
+            cnn_layer_norm,
+            time_reduction,
+            dropout_rate=dropout_rate,
+            relative_attention_bias_args=relative_attention_bias_args,
+            positional_dropout_rate=0.0,
+            nemo_conv_settings=nemo_conv_settings,
+            conv2d_extra_padding=conv2d_extra_padding,
+            attention_group_size=attention_group_size,
+            encoder_embedding_config=encoder_embedding_config,
+        )
+        self.num_blocks = num_blocks
+        self.num_lang = num_lang
+        self.kernel_size = kernel_size
+        self.replication_pad_for_subsample_embedding: bool = (
+            replication_pad_for_subsample_embedding
+        )
+        assert (
+            self.num_heads % attention_group_size == 0
+        ), "attention_group_size must divide n_head"
+        self.num_heads_k = self.num_heads // attention_group_size
+
+        self.encoders = MultiSequential(
+            *[
+                ConformerEncoderLayer(
+                    d_model=attention_dim,
+                    ext_pw_out_channel=ext_pw_out_channel,
+                    depthwise_seperable_out_channel=depthwise_seperable_out_channel,
+                    depthwise_multiplier=depthwise_multiplier,
+                    n_head=attention_heads,
+                    d_ffn=linear_units,
+                    ext_pw_kernel_size=ext_pw_kernel_size,
+                    kernel_size=kernel_size,
+                    dropout_rate=dropout_rate,
+                    causal=causal,
+                    batch_norm=batch_norm,
+                    activation=activation,
+                    chunk_se=chunk_se,
+                    chunk_size=chunk_size,
+                    conv_activation=conv_activation,
+                    conv_glu_type=conv_glu_type,
+                    bias_in_glu=bias_in_glu,
+                    linear_glu_in_convm=linear_glu_in_convm,
+                    attention_glu_type=attention_glu_type,
+                    activation_checkpointing=activation_checkpointing,
+                    export=export,
+                    use_pt_scaled_dot_product_attention=use_pt_scaled_dot_product_attention,
+                    attn_group_sizes=attention_group_size,
+                )
+                for _ in range(num_blocks)
+            ]
+        )
+        self.extra_layer_output_idx = extra_layer_output_idx
+        self.extra_multi_layer_output_idxs = extra_multi_layer_output_idxs
+        # Make a zeros scalar we can use in get_initial_state to determine
+        # the device and the needed dtype:
+        self.register_buffer("dev_type", torch.zeros(()), persistent=False)
+
+    def init_relative_attention_bias(self, input_tensor):
+        if self.relative_attention_bias_layer:
+            return self.relative_attention_bias_layer(input_tensor)
+
+    def calculate_hs_mask(self, xs_pad, device, mask):
+        max_audio_length = xs_pad.shape[1]
+        batch_size = xs_pad.shape[0]
+        enc_streaming_mask = self._streaming_mask(
+            max_audio_length, batch_size, self.chunk_size, self.left_chunk
+        )
+        enc_streaming_mask = enc_streaming_mask.to(device)
+        if mask is None:
+            return enc_streaming_mask
+
+        feature_lens = mask.sum(1)
+        padding_length = feature_lens
+        pad_mask = torch.arange(0, max_audio_length, device=device).expand(
+            padding_length.size(0), -1
+        ) < padding_length.unsqueeze(1)
+        pad_mask = pad_mask.unsqueeze(1)
+        pad_mask = pad_mask & enc_streaming_mask
+        return pad_mask
+
+    @torch.jit.ignore
+    def forward(self, xs_pad, masks):
+        """Conformer Forward function
+
+        Args:
+            xs_pad: torch.Tensor
+                input tensor
+            masks: torch.Tensor
+                post-embedding input lengths
+        """
+        xs_pad = self.encoder_embedding(xs_pad)
+        input_tensor, pos_k, pos_v, hs_mask, masks = self.forward_embeddings(
+            xs_pad, masks
+        )
+
+        unfolded = False
+        ori_bz, seq_len, D = input_tensor.shape
+        max_seq_len = 500  # maximum position for absolute positional encoding
+        if seq_len > max_seq_len:
+            # audio sequence is longer than max_seq_len, unfold it into chunks
+            # of max_seq_len
+            unfolded = True
+            # the unfold op will drop residual frames, pad it to the multiple
+            # of max_seq_len
+            if seq_len % max_seq_len > 0:
+                chunk_pad_size = max_seq_len - (seq_len % max_seq_len)
+            else:
+                chunk_pad_size = 0
+            if chunk_pad_size > 0:
+                input_tensor_pad = F.pad(
+                    input_tensor, (0, 0, 0, chunk_pad_size), "constant", 0
+                )
+                input_tensor = input_tensor_pad.to(input_tensor.device)
+            input_tensor = unfold_tensor(input_tensor, max_seq_len)
+            if masks is not None:
+                # revise hs_mask here because the previous calculated hs_mask
+                # did not consider extra pad
+                subsampled_pad_mask = masks.squeeze(
+                    1
+                )  # [bz, subsampled_unmask_seq_len]
+                extra_padded_subsamlped_pad_mask = F.pad(
+                    subsampled_pad_mask, (0, chunk_pad_size), "constant", False
+                )  # extra padding to the pad mask
+                extra_padded_subsamlped_pad_mask = (
+                    extra_padded_subsamlped_pad_mask.unsqueeze(-1).float()
+                )
+                masks_unfold = unfold_tensor(
+                    extra_padded_subsamlped_pad_mask, max_seq_len
+                )  # unfold the pad mask like we did to the input tensor
+                masks_unfold = masks_unfold.squeeze(
+                    -1
+                ).bool()  # unfold op does not support bool tensor
+            else:
+                masks_unfold = None
+            hs_mask = self.calculate_hs_mask(
+                input_tensor, input_tensor.device, masks_unfold
+            )  # calculate hs_mask based on the unfolded pad mask
+
+        # layer_emb = None
+
+        relative_attention_bias = self.init_relative_attention_bias(input_tensor)
+
+        _simplified_path = (
+            self.extra_layer_output_idx == -1 and relative_attention_bias is None
+        )
+
+        if _simplified_path:
+            input_tensor, *_ = self.encoders(input_tensor, pos_k, pos_v, hs_mask)
+        else:
+            for i, layer in enumerate(self.encoders):
+                input_tensor, _, _, _ = layer(
+                    input_tensor,
+                    pos_k,
+                    pos_v,
+                    hs_mask,
+                    relative_attention_bias=relative_attention_bias,
+                )
+
+                # if i == self.extra_layer_output_idx:
+                #     layer_emb = input_tensor
+
+        if unfolded:
+            embed_dim = input_tensor.shape[-1]
+            input_tensor = input_tensor.reshape(ori_bz, -1, embed_dim)
+            # if we ever padded before unfolding, we need to remove the padding
+            if chunk_pad_size > 0:
+                input_tensor = input_tensor[:, :-chunk_pad_size, :]
+
+        return input_tensor, masks  # , layer_emb
+
+
+class WindowQformer(nn.Module):
+    """Window-level Qformer"""
+
+    def __init__(
+        self,
+        window_size: int = 8,
+        num_queries: int = 1,
+        num_blocks: int = 2,
+        attention_dim: int = 512,
+        attention_heads: int = 8,
+        linear_units: int = 2048,
+        dropout_rate: float = 0.0,
+        normalize_before: bool = True,
+    ):
+        super().__init__()
+
+        self.decoders = nn.ModuleList(
+            [
+                nn.TransformerDecoderLayer(
+                    d_model=attention_dim,
+                    nhead=attention_heads,
+                    dim_feedforward=linear_units,
+                    dropout=dropout_rate,
+                    activation="relu",
+                    batch_first=True,
+                    norm_first=normalize_before,  # TODO need to verify
+                )
+                for _ in range(num_blocks)
+            ]
+        )
+
+        self.queries = nn.Parameter(torch.zeros(1, num_queries, attention_dim))
+        self.after_norm = (
+            nn.LayerNorm(attention_dim, eps=1e-12) if normalize_before else None
+        )
+        self.window_size = window_size
+
+    def forward(self, audio_embed, mask, embed_len=None):
+        """forward decoder"""
+        # audio_embed: N x T x D => N x D x T
+
+        audio_embed = audio_embed.transpose(1, 2)
+        # audio_embed: N x D x 1 x T => N x DK x T'
+        padding = audio_embed.shape[-1] % self.window_size
+        if padding > 0:
+            audio_embed = F.pad(
+                audio_embed, (0, self.window_size - padding), "constant", 0
+            )
+
+        embed_chunk = F.unfold(
+            audio_embed[..., None, :],
+            kernel_size=(1, self.window_size),
+            stride=(1, self.window_size),
+        )
+        bsz, _, slen = embed_chunk.shape
+        # N x D x K x T'
+        embed_chunk = embed_chunk.view(bsz, -1, self.window_size, slen)
+        # N x T' x K x D
+        embed_chunk = embed_chunk.transpose(1, 3).contiguous()
+        # NT' x K x D
+        embed_chunk = embed_chunk.view(bsz * slen, self.window_size, -1)
+        # NT' x 1 x D
+        q = self.queries.expand(bsz * slen, -1, -1)
+        for layer in self.decoders:
+            q = layer(tgt=q, memory=embed_chunk, tgt_mask=None, memory_mask=mask)
+
+        if self.after_norm is not None:
+            q = self.after_norm(q)
+
+        if embed_len is not None:
+            embed_len = embed_len // self.window_size
+        # N x T' x D
+        out = q.view(bsz, slen, -1)
+
+        return out, embed_len
+
+
+class AudioEmbedding(nn.Module):
+    """Image embedding."""
+
+    def __init__(self, config: PretrainedConfig, **kwargs) -> None:
+        super().__init__()
+        self.config = config
+        # n_embed or hidden_size for text LM
+        hidden_size = config.n_embd if hasattr(config, "n_embd") else config.hidden_size
+
+        # self.wte = nn.Embedding(config.vocab_size, hidden_size)
+
+        audio_dim_out = (
+            None  # Set this variable according to the actual audio processor
+        )
+        self.layer_idx = -2
+
+        if (
+            isinstance(config.audio_processor, dict)
+            and config.audio_processor.get("name", None) == "cascades"
+        ):
+            encoder_config = config.audio_processor.get("config", None)
+            assert encoder_config is not None
+            self.encoder = ConformerEncoder(**encoder_config)
+
+            audio_dim_out = encoder_config["attention_dim"]
+            n_mels = encoder_config["input_size"]
+        else:
+            raise NotImplementedError("")
+
+        assert audio_dim_out is not None, "Remember to set values for audio_dim_out"
+        self.audio_dim_out = audio_dim_out
+        self.audio_dim_in = n_mels
+
+        self.freeze_audio_processor = kwargs.get("freeze_audio_processor", False)
+
+        self.downsample_rate = kwargs.get("downsample_rate", 1)
+
+        if kwargs.get("use_qformer", False):
+            qformer_config = kwargs.get("qformer_config", {})
+            qformer_config["attention_dim"] = audio_dim_out
+            self.qformer = WindowQformer(**qformer_config)
+        else:
+            self.qformer = None
+
+        if kwargs.get("use_conv_downsample", False):
+            assert (
+                self.qformer is None
+            ), "don't support use qformer and conv downsample together"
+            nemo_conv_settings = kwargs.get("nemo_conv_settings", {})
+            default_nemo_conv_settings = {
+                "subsampling": "dw_striding",
+                "subsampling_factor": self.downsample_rate,
+                "feat_in": audio_dim_out,
+                "feat_out": audio_dim_out,
+                "conv_channels": 256,
+                "subsampling_conv_chunking_factor": 1,
+                "activation": nn.ReLU(),
+                "is_causal": False,
+            }
+            # Override any of the defaults with the incoming, user settings
+            if nemo_conv_settings:
+                default_nemo_conv_settings.update(nemo_conv_settings)
+                for i in ["subsampling_factor", "feat_in", "feat_out"]:
+                    assert (
+                        i not in nemo_conv_settings
+                    ), "{i} should be specified outside of the NeMo dictionary"
+
+            self.conv_ds = NemoConvSubsampling(
+                **default_nemo_conv_settings,
+            )
+        else:
+            self.conv_ds = None
+
+        projection_cls = kwargs.get("projection_cls", "linear")
+        if projection_cls == "linear":
+            self.audio_projection = nn.Linear(audio_dim_out, hidden_size)
+        elif projection_cls == "mlp":
+            # follow llava-v1.5's implementation
+            # (do not use image_projection and image_proj_norm)
+            dim_projection = hidden_size
+            depth = 2
+            self.linear_downsample_rate = (
+                1 if (self.qformer or self.conv_ds) else self.downsample_rate
+            )
+            layers = [
+                nn.Linear(audio_dim_out * self.linear_downsample_rate, dim_projection)
+            ]
+            for _ in range(1, depth):
+                layers.extend([nn.GELU(), nn.Linear(dim_projection, dim_projection)])
+            self.audio_projection = nn.Sequential(*layers)
+            # NOTE vision-speech tasks use a separate projection layer
+            layers = [
+                nn.Linear(audio_dim_out * self.linear_downsample_rate, dim_projection)
+            ]
+            for _ in range(1, depth):
+                layers.extend([nn.GELU(), nn.Linear(dim_projection, dim_projection)])
+            self.audio_projection_for_vision = nn.Sequential(*layers)
+        else:
+            raise NotImplementedError(
+                f"projection_cls = {projection_cls}, not implemented"
+            )
+
+        # TODO: audio sequence compression - Qformer
+        self.vocab_size = config.vocab_size
+        self.input_embeds = None
+        self.audio_embed_sizes = None
+
+    def set_audio_embeds(self, input_embeds: torch.FloatTensor) -> None:
+        self.input_embeds = input_embeds
+
+    def set_audio_embed_sizes(self, audio_embed_sizes: torch.LongTensor) -> None:
+        self.audio_embed_sizes = audio_embed_sizes
+
+    def get_audio_features(
+        self,
+        input_embeds: torch.FloatTensor,
+        audio_attention_mask: torch.Tensor = None,
+        audio_projection_mode: str = "speech",
+    ) -> torch.FloatTensor:
+        """
+        arguments:
+            input_embeds: audio features (B, T, D)  B: num audios in a sequence
+        """
+        if self.freeze_audio_processor:
+            with torch.no_grad():
+                audio_features, masks = self.encoder(input_embeds, audio_attention_mask)
+        else:
+            audio_features, masks = self.encoder(input_embeds, audio_attention_mask)
+
+        if self.qformer is not None:
+            audio_features, _ = self.qformer(audio_features, mask=None)
+
+        if self.conv_ds is not None:
+            if masks is not None:
+                masks = masks.squeeze(1)
+
+            audio_features, masks = self.conv_ds(audio_features, mask=masks)
+
+        if self.linear_downsample_rate != 1:
+            bs, seq_len, feat_dim = audio_features.size()
+            padding = seq_len % self.linear_downsample_rate
+            if padding > 0:
+                audio_features = F.pad(
+                    audio_features,
+                    (0, 0, 0, self.linear_downsample_rate - padding),
+                    "constant",
+                    0,
+                )
+
+            seq_len = audio_features.size(1)
+            audio_features = audio_features.view(
+                bs,
+                seq_len // self.linear_downsample_rate,
+                feat_dim * self.linear_downsample_rate,
+            )
+
+        if audio_projection_mode == "speech":
+            audio_set_tensor = self.audio_projection(audio_features)
+        elif audio_projection_mode == "vision":
+            audio_set_tensor = self.audio_projection_for_vision(audio_features)
+        else:
+            raise ValueError(
+                f"audio_projection_mode = {audio_projection_mode} not " "implemented"
+            )
+
+        return audio_set_tensor
+
+    def forward(
+        self,
+        audio_features: torch.FloatTensor,
+        audio_attention_mask: torch.Tensor = None,
+        audio_projection_mode: str = "speech",
+    ) -> torch.FloatTensor:
+        """
+        arguments:
+            audio_features: audio features (num_audio_tokens, T, D)
+
+        returns:
+            audio_embeds: audio embeddings (num_audio_tokens, hidden_dim)
+        """
+        audio_embeds = self.get_audio_features(
+            audio_features,
+            audio_attention_mask=audio_attention_mask,
+            audio_projection_mode=audio_projection_mode,
+        )
+        return audio_embeds
diff --git a/sglang/python/sglang/srt/models/phi4mm_utils.py b/sglang/python/sglang/srt/models/phi4mm_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6bf35ebfc46ed8ee803c0d29560212bda07f5c3
--- /dev/null
+++ b/sglang/python/sglang/srt/models/phi4mm_utils.py
@@ -0,0 +1,1917 @@
+# Copyright 2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#!/usr/bin/env python3
+import math
+from typing import Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+
+
+class BlockBase(nn.Module):
+    """Block abstract module"""
+
+    def __init__(self, input_size, output_size):
+        super().__init__()
+        self.input_size = input_size
+        self.output_size = output_size
+
+
+def get_activation(name="relu"):
+    """Select an activation function by name
+
+    Args:
+        name: str
+            activation function name,
+            one of ["relu", "gelu", "swish", "sigmoid"],
+            default "relu".
+    """
+    name = name.lower()
+    if name == "relu":
+        return nn.ReLU(inplace=True)
+    if name == "gelu":
+        return nn.GELU()
+    if name == "swish":
+        return Swish()
+    if name == "sigmoid":
+        return torch.nn.Sigmoid()
+    return nn.Identity()
+
+
+def adaptive_enc_mask(x_len, chunk_start_idx, left_window=0, right_window=0):
+    """
+    The function is very important for Transformer Transducer Streaming mode
+    Args:
+        xs_len (int): sequence length
+        chunk_start_idx (list): first idx of each chunk, such as [0,18,36,48].
+        It also supports adaptive chunk size [0,10,15,45]
+        left_window (int): how many left chunks can be seen
+        right_window (int): how many right chunks can be seen. It is used for
+        chunk overlap model.
+        Returns:
+            mask (torch.Tensor): a mask tensor for streaming model
+            Torch 1.0.1
+            tensor([[1., 1., 0., 0.],
+                    [0., 1., 1., 0.],
+                    [0., 0., 1., 1.]])
+            Torch 1.4.1
+            tensor([[True., True., False., False.],
+                    [False., True., True., False.],
+                    [False., False., True., True.]])
+    """
+    chunk_start_idx = torch.Tensor(
+        chunk_start_idx
+    ).long()  # first idx of each chunk, such as [0,18,36,48].
+    start_pad = torch.nn.functional.pad(
+        chunk_start_idx, (1, 0)
+    )  # append 0 to the beginning, so it becomes [0, 0, 18, 36, 48]
+    end_pad = torch.nn.functional.pad(
+        chunk_start_idx, (0, 1), value=x_len
+    )  # append x_len to the end, so it becomes [0,18,36,48, x_len]
+    seq_range = torch.arange(0, x_len).unsqueeze(-1)  # seq_range size: [x_len, 1]
+    idx = ((seq_range < end_pad) & (seq_range >= start_pad)).nonzero()[
+        :, 1
+    ]  # idx size: [x_len]
+    # boundary = end_pad[idx]  # boundary size: [x_len]
+    seq_range_expand = (
+        torch.arange(0, x_len).unsqueeze(0).expand(x_len, -1)
+    )  # seq_range_expand size [x_len, x_len]
+    idx_left = idx - left_window
+    idx_left[idx_left < 0] = 0
+    boundary_left = start_pad[idx_left]
+    mask_left = seq_range_expand >= boundary_left.unsqueeze(-1)
+    idx_right = idx + right_window
+    idx_right[idx_right > len(chunk_start_idx)] = len(chunk_start_idx)
+    boundary_right = end_pad[idx_right]
+    mask_right = seq_range_expand < boundary_right.unsqueeze(-1)
+    return mask_left & mask_right
+
+
+class Swish(nn.Module):
+    """Implement Swish activation module.
+    From https://arxiv.org/pdf/2005.03191.pdf
+
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+        self.act_fn = nn.Sigmoid()
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Apply Swish function
+
+        Args:
+            x: torch.Tensor
+                Input.
+        """
+        return x * self.act_fn(x)
+
+
+class GLU(nn.Module):
+    """Implement Gated Linear Unit (GLU) module"""
+
+    def __init__(self, dim: int = -1, act_name: str = "sigmoid") -> None:
+        super().__init__()
+        self.dim = dim
+        self.act_name = act_name.lower()
+
+        if self.act_name == "relu":
+            self.act_fn = nn.ReLU(inplace=True)
+        elif self.act_name == "gelu":
+            self.act_fn = nn.GELU()
+        elif self.act_name == "swish":
+            self.act_fn = Swish()
+        elif self.act_name == "sigmoid":
+            self.act_fn = nn.Sigmoid()
+        else:
+            self.act_fn = nn.Identity()
+
+    def forward(self, x: Tensor) -> Tensor:
+        """GLU forward
+        Apply Swish function on the first half of input matrices
+        with sigmoid of the second half.
+
+        Args:
+            x: torch.Tensor
+                Input.
+
+        """
+        half_x, gate = x.chunk(2, dim=self.dim)
+        return half_x * self.act_fn(gate)
+
+
+# TODO: Abdel, this can be improved using GLU module
+class GLUPointWiseConv(nn.Module):
+    """GLUPointWiseConv module
+    used for conformer architecture,
+    for more details see:
+    https://arxiv.org/pdf/2005.08100v1.pdf
+
+    Args:
+        input_dim: int
+            input channel size.
+        output_dim: int
+            output channel size.
+        kernel_size: int
+            kernel size
+        glu_type: str, optional
+            activation function one of
+             ["sigmoid", "relu", "gelu"]
+              default "sigmoid".
+        bias_in_glu: bool, optional
+            use addtive bias in glu
+        causal: bool, optional
+            if set to True, padding is set to the half of
+             kernel size, ie, convolution can't see future frames.
+              default False.
+
+    """
+
+    def __init__(
+        self,
+        input_dim,
+        output_dim,
+        kernel_size,
+        glu_type="sigmoid",
+        bias_in_glu=True,
+        causal=False,
+    ):
+        super().__init__()
+
+        self.glu_type = glu_type
+        self.output_dim = output_dim
+        self.bias_in_glu = bias_in_glu
+        if causal:
+            self.ext_pw_conv_1d = nn.Conv1d(
+                input_dim,
+                output_dim * 2,
+                kernel_size,
+                1,
+                padding=(kernel_size - 1),
+            )
+        else:
+            self.ext_pw_conv_1d = nn.Conv1d(
+                input_dim,
+                output_dim * 2,
+                kernel_size,
+                1,
+                padding=(kernel_size - 1) // 2,
+            )
+
+        if glu_type == "sigmoid":
+            self.glu_act = nn.Sigmoid()
+        elif glu_type == "relu":
+            self.glu_act = nn.ReLU()
+        elif glu_type == "gelu":
+            self.glu_act = nn.GELU()
+        elif glu_type == "swish":
+            self.glu_act = Swish()
+        else:
+            raise ValueError(f"Unsupported activation type {self.glu_act}")
+
+        if bias_in_glu:
+            self.b1 = nn.Parameter(torch.zeros(1, output_dim, 1))
+            self.b2 = nn.Parameter(torch.zeros(1, output_dim, 1))
+
+    def forward(self, x):
+        """
+        Args:
+            x: torch.Tensor
+                input tensor
+        """
+        # to be consistent with GLULinear, we assume the input always has the
+        # #channel (#dim) in the last dimension of the tensor, so need to
+        # switch the dimension first for 1D-Conv case
+        x = x.permute([0, 2, 1])
+        x = self.ext_pw_conv_1d(x)
+        if self.glu_type == "bilinear":
+            if self.bias_in_glu:
+                x = (x[:, 0 : self.output_dim, :] + self.b1) * (
+                    x[:, self.output_dim : self.output_dim * 2, :] + self.b2
+                )
+            else:
+                x = (x[:, 0 : self.output_dim, :]) * (
+                    x[:, self.output_dim : self.output_dim * 2, :]
+                )
+        else:
+            if self.bias_in_glu:
+                x = (x[:, 0 : self.output_dim, :] + self.b1) * self.glu_act(
+                    x[:, self.output_dim : self.output_dim * 2, :] + self.b2
+                )
+            else:
+                x = (x[:, 0 : self.output_dim, :]) * self.glu_act(
+                    x[:, self.output_dim : self.output_dim * 2, :]
+                )
+
+        x = x.permute([0, 2, 1])
+        return x
+
+
+class DepthWiseSeperableConv1d(nn.Module):
+    """DepthWiseSeperableConv1d module used in Convnet module
+    for the conformer, for more details see:
+    https://arxiv.org/pdf/2005.08100v1.pdf
+
+    Args:
+        input_dim: int
+            input channel size.
+        depthwise_seperable_out_channel: int
+            if set different to 0, the number of
+             depthwise_seperable_out_channel will be used as a channel_out
+             of the second conv1d layer.
+             otherwise, it equal to 0, the second conv1d layer is skipped.
+        kernel_size: int
+            kernel_size
+        depthwise_multiplier: int
+            number of input_dim channels duplication. this value
+            will be used to compute the hidden channels of the Conv1D.
+        padding: int, optional
+            padding for the conv1d,
+             default: 0.
+
+    """
+
+    def __init__(
+        self,
+        input_dim,
+        depthwise_seperable_out_channel,
+        kernel_size,
+        depthwise_multiplier,
+        padding=0,
+    ):
+        super().__init__()
+
+        self.dw_conv = nn.Conv1d(
+            input_dim,
+            input_dim * depthwise_multiplier,
+            kernel_size,
+            1,
+            padding=padding,
+            groups=input_dim,
+        )
+
+        if depthwise_seperable_out_channel != 0:
+            self.pw_conv = nn.Conv1d(
+                input_dim * depthwise_multiplier,
+                depthwise_seperable_out_channel,
+                1,
+                1,
+                0,
+            )
+        else:
+            self.pw_conv = nn.Identity()
+        self.depthwise_seperable_out_channel = depthwise_seperable_out_channel
+
+    def forward(self, x):
+        """
+
+        Args:
+            x: torch.Tensor
+                input tensor
+        """
+        x = self.dw_conv(x)
+        if self.depthwise_seperable_out_channel != 0:
+            x = self.pw_conv(x)
+        return x
+
+
+class ConvModule(nn.Module):
+    """ConvModule Module for the conformer block.
+    for more details see:
+    https://arxiv.org/pdf/2005.08100v1.pdf
+
+    Args:
+        input_dim: int
+            input channel size.
+        ext_pw_out_channel: int
+            if > 0, ext_pw_out_channel is a dim channel size
+             for the last pointwise conv after swish activation.
+        depthwise_seperable_out_channel: int
+            if set different to 0, the number of
+             depthwise_seperable_out_channel
+             will be used as a channel_out of the second conv1d layer.
+             otherwise, it equal to 0, the second conv1d layer is skipped.
+        ext_pw_kernel_size: int
+            kernel size of the conv pointwise of the conformer.
+        kernel_size: int
+            kernel size.
+        depthwise_multiplier: int
+            number of input_dim channels duplication. this value
+             will be used to compute the hidden channels of the Conv1D.
+        dropout_rate: float
+            dropout rate.
+        causal: bool, optional
+            if set to True, convolution have no access
+             to future frames. default False.
+        batch_norm: bool, optional
+            if set to True, apply batchnorm before activation.
+            default False
+        chunk_se: int, optional
+            0 for offline SE.
+            1 for streaming SE, where mean is computed
+             by accumulated history until current chunk_se.
+            2 for streaming SE, where mean is computed
+             by only the current chunk.
+        chunk_size: int, optional
+            chunk size for cnn. default 18
+        activation: str, optional
+            activation function used in ConvModule,
+            default: "relu".
+        glu_type: str, optional
+            activation function used for the glu,
+            default: "sigmoid".
+        bias_in_glu: bool, optional
+            if set to True, use additive bias in the weight module
+             before GLU.
+        linear_glu_in_convm: bool, optional
+            if set to True, use GLULinear module,
+             otherwise, used GLUPointWiseConv module.
+              default to False.
+        export: bool, optional,
+            if set to True, padding is equal to 0.  This is for inference,
+             or onnx export.  Typically this is set by the export program or
+             the decoder program, and it isn't present in your config file.
+             default False
+    """
+
+    def __init__(
+        self,
+        input_dim,
+        ext_pw_out_channel,
+        depthwise_seperable_out_channel,
+        ext_pw_kernel_size,
+        kernel_size,
+        depthwise_multiplier,
+        dropout_rate,
+        causal=False,
+        batch_norm=False,
+        chunk_se=0,
+        chunk_size=18,
+        activation="relu",
+        glu_type="sigmoid",
+        bias_in_glu=True,
+        linear_glu_in_convm=False,
+        export=False,
+    ):
+        super().__init__()
+        self.layer_norm = nn.LayerNorm(input_dim)
+        self.input_dim = input_dim
+        self.ext_pw_out_channel = ext_pw_out_channel
+        self.ext_pw_kernel_size = ext_pw_kernel_size
+        self.depthwise_seperable_out_channel = depthwise_seperable_out_channel
+        self.glu_type = glu_type
+        self.bias_in_glu = bias_in_glu
+        self.linear_glu_in_convm = linear_glu_in_convm
+        self.causal = causal
+
+        self._add_ext_pw_layer()
+
+        self.batch_norm = batch_norm
+        self.kernel_size = kernel_size
+
+        if batch_norm:
+            self.bn_layer = nn.BatchNorm1d(input_dim)
+
+        self.act = get_activation(activation)
+        self.dropout = nn.Dropout(dropout_rate)
+        self.export = export
+
+        if causal:
+            padding = 0 if export else kernel_size - 1
+        else:
+            padding = (kernel_size - 1) // 2
+
+        self.dw_sep_conv_1d = DepthWiseSeperableConv1d(
+            input_dim,
+            depthwise_seperable_out_channel,
+            kernel_size,
+            depthwise_multiplier,
+            padding=padding,
+        )
+
+        if depthwise_seperable_out_channel != 0:
+            if input_dim != depthwise_seperable_out_channel:
+                self.ln2 = nn.Linear(depthwise_seperable_out_channel, input_dim)
+        else:
+            if depthwise_multiplier != 1:
+                self.ln2 = nn.Linear(input_dim * depthwise_multiplier, input_dim)
+
+    def _add_ext_pw_layer(self):
+        """
+        This function is an extension of __init__ function
+        and dedicated to the convolution module creation
+        of the conformer.
+        """
+        self.ln1 = self.glu = self.bn_layer = self.ext_pw_conv_1d = (
+            nn.Identity()
+        )  # jit hacks.
+        self.squeeze_excitation = nn.Identity()  # jit.
+        self.apply_ln1 = self.fix_len1 = False  # jit.
+
+        if self.ext_pw_out_channel != 0:
+            if self.causal:
+                self.ext_pw_conv_1d = nn.Conv1d(
+                    self.input_dim,
+                    self.ext_pw_out_channel,
+                    self.ext_pw_kernel_size,
+                    1,
+                    padding=(self.ext_pw_kernel_size - 1),
+                )
+                if self.ext_pw_kernel_size > 1:
+                    self.fix_len1 = True
+                else:
+                    self.fix_len1 = False
+            else:
+                self.ext_pw_conv_1d = nn.Conv1d(
+                    self.input_dim,
+                    self.ext_pw_out_channel,
+                    self.ext_pw_kernel_size,
+                    1,
+                    padding=(self.ext_pw_kernel_size - 1) // 2,
+                )
+                self.fix_len1 = False
+
+            if self.linear_glu_in_convm:
+                self.glu = GLULinear(
+                    self.input_dim,
+                    self.ext_pw_out_channel,
+                    self.glu_type,
+                    self.bias_in_glu,
+                )
+            else:
+                self.glu = GLUPointWiseConv(
+                    self.input_dim,
+                    self.ext_pw_out_channel,
+                    self.ext_pw_kernel_size,
+                    self.glu_type,
+                    self.bias_in_glu,
+                    self.causal,
+                )
+
+            if self.input_dim != self.ext_pw_out_channel:
+                self.apply_ln1 = True
+                self.ln1 = nn.Linear(self.ext_pw_out_channel, self.input_dim)
+            else:
+                self.apply_ln1 = False
+        else:
+            self.pw_conv_simplify_w = torch.nn.Parameter(torch.ones(3))
+            self.pw_conv_simplify_b = torch.nn.Parameter(torch.zeros(3))
+
+    def forward(self, x):
+        """ConvModule Forward.
+
+        Args:
+            x: torch.Tensor
+                input tensor.
+        """
+        x = self.layer_norm(x)
+
+        if self.ext_pw_out_channel != 0:
+            x = self.glu(x)
+            if self.causal and self.ext_pw_kernel_size > 1:
+                x = x[:, : -(self.ext_pw_kernel_size - 1), :]
+            if self.apply_ln1:
+                x = self.ln1(x)
+        else:
+            x_0 = x * self.pw_conv_simplify_w[0] + self.pw_conv_simplify_b[0]
+            x_1 = x * self.pw_conv_simplify_w[1] + self.pw_conv_simplify_b[1]
+            x = x_0 + x_1
+
+        x = x.permute([0, 2, 1])
+
+        x = self.dw_sep_conv_1d(x)
+        if self.causal and self.kernel_size > 1:
+            x = x[:, :, : -(self.kernel_size - 1)]
+        if hasattr(self, "ln2"):
+            x = x.permute([0, 2, 1])
+            x = self.ln2(x)
+            x = x.permute([0, 2, 1])
+        if self.batch_norm:
+            x = self.bn_layer(x)
+        x = self.act(x)
+
+        if self.ext_pw_out_channel != 0:
+            x = self.ext_pw_conv_1d(x)
+            if self.fix_len1:
+                x = x[:, :, : -(self.ext_pw_kernel_size - 1)]
+
+            if self.apply_ln1:
+                x = x.permute([0, 2, 1])
+                x = self.ln1(x)
+                x = x.permute([0, 2, 1])
+
+            x = x.permute([0, 2, 1])
+        else:
+            x = x.unsqueeze(1).permute([0, 1, 3, 2])
+            x = x * self.pw_conv_simplify_w[2] + self.pw_conv_simplify_b[2]
+            x = x.squeeze(1)
+
+        x = self.dropout(x)
+        return x
+
+
+class GLULinear(nn.Module):
+    """Linear + GLU module
+
+    Args:
+        input_dim: int
+            input size
+        output_dim: int
+            output size.
+        glu_type:
+            activation function name used in glu module.
+            default "sigmoid" (swish function).
+        bias_in_glu: bool, optional
+            If True, the addtive bias is added. Default False.
+    """
+
+    def __init__(
+        self,
+        input_dim,
+        output_dim,
+        glu_type="sigmoid",
+        bias_in_glu=True,
+    ):
+        super().__init__()
+        self.linear = nn.Linear(input_dim, output_dim * 2, bias_in_glu)
+        self.glu_act = GLU(-1, glu_type)
+
+    def forward(self, x):
+        """GLULinear forward
+
+        Args:
+            x: torch.Tensor
+                inpute tensor.
+        """
+        x = self.linear(x)
+        return self.glu_act(x)
+
+
+class FeedForward(nn.Module):
+    """FeedForward Module.
+    For more details see Conformer paper:
+        https://arxiv.org/pdf/2005.08100.pdf
+
+    Args:
+        d_model: int
+            input size.
+        d_inner: int
+            output size.
+        dropout_rate: float,
+            dropout rate.
+        activation: str,
+            activation function name,
+            one of ["relu", "swish", "sigmoid"],
+            sigmoid activation is only used with "glu_in_fnn=True",
+            default "sigmoid".
+        bias_in_glu: bool, optional
+    """
+
+    def __init__(
+        self,
+        d_model,
+        d_inner,
+        dropout_rate,
+        activation="sigmoid",
+        bias_in_glu=True,
+    ):
+        super().__init__()
+        self.d_model = d_model
+        self.d_inner = d_inner
+
+        self.layer_norm = nn.LayerNorm(d_model)
+        module = GLULinear(d_model, d_inner, activation, bias_in_glu)
+        self.net = nn.Sequential(
+            module,
+            nn.Dropout(dropout_rate),
+            nn.Linear(d_inner, d_model),
+            nn.Dropout(dropout_rate),
+        )
+
+    def forward(self, x):
+        """FeedForward forward function.
+
+        Args:
+            x: torch.Tensor
+                input tensor.
+        """
+        out = self.net(self.layer_norm(x))
+
+        return out
+
+
+#### positional encoding starts here
+def _pre_hook(
+    state_dict,
+    prefix,
+    local_metadata,
+    strict,
+    missing_keys,
+    unexpected_keys,
+    error_msgs,
+):
+    """Perform pre-hook in load_state_dict for backward compatibility.
+
+    Note:
+        We saved self.pe until v.0.5.2 but we have omitted it later.
+        Therefore, we remove the item "pe" from `state_dict` for backward
+        compatibility.
+
+    """
+    k = prefix + "pe"
+    if k in state_dict:
+        state_dict.pop(k)
+
+
+class T5RelativeAttentionLogitBias(nn.Module):
+    """
+    This module implements the relative position bias described in Section
+    2.1 of the T5 paper: https://arxiv.org/pdf/1910.10683.pdf
+
+    The Huggingface implementation is used as a reference
+    https://github.com/huggingface/transformers/blob/v4.30.0/src/
+    transformers/models/t5/modeling_t5.py#L435
+
+    Modifies attention as Q*K^T + B, where B is a learned scalar bias based
+    on relative position of the query and key. It is HxNxN, where H is the
+    number of heads, N is the sequence length.
+
+    I've made these modifications to the original T5 bias:
+    - Skipping of the bucketing step. Original T5 bias converted rel
+      position distances into logarithmically increasing buckets. This is
+      supposed to help with length generalization.
+    - I just directly use rel position index as bias values, as we don't
+      need length generalization (40s max is good enough for ASR encoder),
+      and it keeps ONNX export simple.
+    - I've also extended it so that biases can be asymmetric, the default
+      implementation treats L->R and R->L the same. Asymmetric was found to
+      yield better results in my experiments.
+
+    Args:
+        num_heads: int
+            Number of attention heads
+        num_buckets: int
+            Number of buckets to use for relative attention bias. This is the
+            size of the learnable bias parameter. Bucketing is not yet
+            supported, so this defaults to -1 which means no bucketing is
+            used (max_distance determines size of bias param).
+        max_distance: int
+            Maximum distance to use for relative attention bias. With
+            num_buckets=-1, this directly controls the max size of the bias
+            parameter. When num_buckets > 0 is supported, this will control
+            the maximum distance for logarithmic bucketing after which all
+            positions are in the same bucket.
+        symmetric: bool
+            Whether to use symmetric or asymmetric biases. symmetric=False uses
+            2x number of bias params to distinguish L->R from R->L. This was
+            found to be better for the encoder.
+    """
+
+    def __init__(self, num_heads, num_buckets=-1, max_distance=1000, symmetric=False):
+        super().__init__()
+        self.num_heads = num_heads
+        self.num_buckets = num_buckets
+        self.max_distance = max_distance
+        self.symmetric = symmetric
+        self._skip_bucketing = self.num_buckets < 0
+        if self._skip_bucketing:
+            self.num_buckets = max_distance
+        else:
+            raise NotImplementedError(
+                "T5 attention bias with bucketed positions is not yet tested"
+            )
+        if not self.symmetric:
+            self.num_buckets *= 2
+        self.bias_values = nn.Embedding(self.num_buckets, self.num_heads)
+
+    def forward(self, x):
+        # instantiate bias compatible with shape of x
+        maxpos = x.size(1)
+        context_position = torch.arange(maxpos, device=x.device, dtype=torch.long)[
+            :, None
+        ]
+        memory_position = torch.arange(maxpos, device=x.device, dtype=torch.long)[
+            None, :
+        ]
+        relative_position = memory_position - context_position
+        # clipping to a maximum distance using ops that play well with ONNX
+        # export
+        relative_position = relative_position.masked_fill(
+            relative_position < -self.max_distance, -self.max_distance
+        )
+        relative_position = relative_position.masked_fill(
+            relative_position > self.max_distance - 1, self.max_distance - 1
+        )
+
+        # mapping from relative position to index in the bias parameter
+        if self._skip_bucketing:
+            bias_idx = relative_position
+        else:
+            bias_idx = self._bucket_relative_position(relative_position)
+        if self.symmetric:
+            bias_idx = bias_idx.abs()
+        else:
+            bias_idx += self.num_buckets // 2
+
+        t5_rel_att_bias = self.bias_values(bias_idx)  # [L, L, H]
+        t5_rel_att_bias = t5_rel_att_bias.permute(2, 0, 1).unsqueeze(0)  # [1, H, L, L]
+
+        return t5_rel_att_bias
+
+    def _bucket_relative_position(self, relative_position):
+        # this is a placeholder (isn't tested, likely buggy) using HuggingFace
+        # implem as a reference this also needs to be extended to support
+        # asymmetric +/- ve positions
+        relative_buckets = 0
+        if not self.causal:
+            self.num_buckets //= 2
+            relative_buckets += (relative_position > 0).to(
+                torch.long
+            ) * self.num_buckets
+            relative_position = torch.abs(relative_position)
+        else:
+            relative_position = -torch.min(
+                relative_position, torch.zeros_like(relative_position)
+            )
+        # now relative_position is in the range [0, inf)
+
+        # half of the buckets are for exact increments in positions
+        max_exact = self.num_buckets // 2
+        is_small = relative_position < max_exact
+
+        # The other half of the buckets are for logarithmically bigger bins in
+        # positions up to max_distance
+        relative_position_if_large = max_exact + (
+            torch.log(relative_position.float() / max_exact)
+            / math.log(self.max_distance / max_exact)
+            * (self.num_buckets - max_exact)
+        ).to(torch.long)
+        relative_position_if_large = torch.min(
+            relative_position_if_large,
+            torch.full_like(relative_position_if_large, self.num_buckets - 1),
+        )
+
+        relative_buckets += torch.where(
+            is_small, relative_position, relative_position_if_large
+        )
+        return relative_buckets
+
+
+class AbsolutePositionalEncoding(nn.Module):
+    """Absolute Positional encoding module.
+    This module implement Absolute sinusoidal positional encoding
+    from: https://arxiv.org/pdf/1706.03762.pdf
+
+    Args:
+        d_model: int
+            Input embedding size.
+        dropout_rate: float
+            dropout rate
+        max_len: int, optional
+            Maximum input length sequence, Default 5000
+
+    """
+
+    def __init__(self, d_model, dropout_rate, max_len=5000):
+        """Construct an PositionalEncoding object."""
+        super().__init__()
+        self.d_model = d_model
+        self.xscale = math.sqrt(self.d_model)
+        self.dropout = torch.nn.Dropout(p=dropout_rate)
+        self.pe = None
+        self.extend_pe(torch.tensor(0.0).expand(1, max_len))
+        self._register_load_state_dict_pre_hook(_pre_hook)
+
+    def extend_pe(self, x):
+        """Reset the positional encodings.
+
+        Args:
+            x: torch.Tensor
+        """
+        if self.pe is not None and self.pe.size(1) >= x.size(1):
+            if self.pe.dtype != x.dtype or self.pe.device != x.device:
+                self.pe = self.pe.to(dtype=x.dtype, device=x.device)
+            return
+        pe = torch.zeros(x.size(1), self.d_model)
+        position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
+        div_term = torch.exp(
+            torch.arange(0, self.d_model, 2, dtype=torch.float32)
+            * -(math.log(10000.0) / self.d_model)
+        )
+        pe[:, 0::2] = torch.sin(position * div_term)
+        pe[:, 1::2] = torch.cos(position * div_term)
+        pe = pe.unsqueeze(0)
+        self.pe = pe.to(device=x.device, dtype=x.dtype)
+
+    def forward(self, x: torch.Tensor):
+        """Add positional encoding.
+
+        Args:
+            x: torch.Tensor
+                Input tensor. shape is (batch, time, ...)
+
+        Returns:
+            torch.Tensor: Encoded tensor. Its shape is (batch, time, ...)
+
+        """
+        self.extend_pe(x)
+        x = x * self.xscale + self.pe[:, : x.size(1)]
+        return self.dropout(x)
+
+
+#### forward embedding layers starts here
+class MeanVarianceNormLayer(nn.Module):
+    """Mean/variance normalization layer.
+
+    Will subtract mean and multiply input by inverted standard deviation.
+    Typically used as a very first layer in a model.
+
+    Args:
+        input_size: int
+            layer input size.
+    """
+
+    def __init__(self, input_size):
+        super().__init__()
+        self.input_size = input_size
+        self.global_mean = nn.Parameter(torch.zeros(input_size))
+        self.global_invstd = nn.Parameter(torch.ones(input_size))
+
+    def forward(self, input_: Tensor) -> Tensor:
+        """MeanVarianceNormLayer Forward
+
+        Args:
+            input_: torch.Tensor
+                input tensor.
+        """
+        return (input_ - self.global_mean) * self.global_invstd
+
+
+class CausalConv1D(nn.Conv1d):
+    """
+    A causal version of nn.Conv1d where each step would have limited access to
+    locations on its right or left
+    All arguments are the same as nn.Conv1d except padding.
+
+    If padding is set None, then paddings are set automatically to make it a
+    causal convolution where each location would not see any steps on its right.
+
+    If padding is set as a list (size of 2), then padding[0] would be used as
+    left padding and padding[1] as right padding.
+    It would make it possible to control the number of steps to be accessible
+    on the right and left.
+    This mode is not supported when stride > 1. padding[0]+padding[1] should
+    be equal to (kernel_size - 1).
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: Union[str, int] = 0,
+        dilation: int = 1,
+        groups: int = 1,
+        bias: bool = True,
+        padding_mode: str = "zeros",
+        device=None,
+        dtype=None,
+    ) -> None:
+        self.cache_drop_size = None
+        if padding is None:
+            self._left_padding = kernel_size - 1
+            self._right_padding = stride - 1
+        else:
+            if stride != 1 and padding != kernel_size - 1:
+                raise ValueError("No striding allowed for non-symmetric convolutions!")
+            if isinstance(padding, int):
+                self._left_padding = padding
+                self._right_padding = padding
+            elif (
+                isinstance(padding, list)
+                and len(padding) == 2
+                and padding[0] + padding[1] == kernel_size - 1
+            ):
+                self._left_padding = padding[0]
+                self._right_padding = padding[1]
+            else:
+                raise ValueError(f"Invalid padding param: {padding}!")
+
+        self._max_cache_len = self._left_padding
+
+        super().__init__(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=0,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+            padding_mode=padding_mode,
+            device=device,
+            dtype=dtype,
+        )
+
+    def update_cache(self, x, cache=None):
+        if cache is None:
+            new_x = F.pad(x, pad=(self._left_padding, self._right_padding))
+            next_cache = cache
+        else:
+            new_x = F.pad(x, pad=(0, self._right_padding))
+            new_x = torch.cat([cache, new_x], dim=-1)
+            if self.cache_drop_size > 0:
+                next_cache = new_x[:, :, : -self.cache_drop_size]
+            else:
+                next_cache = new_x
+            next_cache = next_cache[:, :, -cache.size(-1) :]
+        return new_x, next_cache
+
+    def forward(self, x, cache=None):
+        x, cache = self.update_cache(x, cache=cache)
+        x = super().forward(x)
+        if cache is None:
+            return x
+        else:
+            return x, cache
+
+
+class CausalConv2D(nn.Conv2d):
+    """
+    A causal version of nn.Conv2d where each location in the 2D matrix would
+    have no access to locations on its right or down
+    All arguments are the same as nn.Conv2d except padding which should be
+    set as None
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        padding: Union[str, int] = 0,
+        dilation: int = 1,
+        groups: int = 1,
+        bias: bool = True,
+        padding_mode: str = "zeros",
+        device=None,
+        dtype=None,
+    ) -> None:
+        if padding is not None:
+            raise ValueError("Argument padding should be set to None for CausalConv2D.")
+        self._left_padding = kernel_size - 1
+        self._right_padding = stride - 1
+
+        padding = 0
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding,
+            dilation,
+            groups,
+            bias,
+            padding_mode,
+            device,
+            dtype,
+        )
+
+    def forward(
+        self,
+        x,
+    ):
+        x = F.pad(
+            x,
+            pad=(self._left_padding, self._right_padding, 0, 0),
+        )
+        x = super().forward(x)
+        return x
+
+
+class NemoConvSubsampling(torch.nn.Module):
+    """Convlutional subsampling module, taken from NeMo ASR
+    (https://github.com/NVIDIA/NeMo/blob/b367413645d5c72db3c2c96e46e95a
+    34501479cf/nemo/collections/asr/parts/submodules/subsampling.py)
+
+    Striding Subsampling: "Speech-Transformer: A No-Recurrence
+    Sequence-to-Sequence Model for Speech Recognition" by Linhao Dong
+    et al. (https://ieeexplore.ieee.org/document/8462506)
+
+
+    Compared with the EncoderConv2D (`input_layer: custom`), this is a
+    much simplified approach, and uses no LayerNorm and far fewer Conv2Ds.
+    Moreover, depthwise convolutions are used to reduce FLOPs, but the first
+      layer is kept as a regular convolution so as not to degrade accuracy.
+
+    `Striding` and `dw_striding` are the same except that the latter uses
+    depthwise convolutions after the first layer, whereas the former does not.
+
+    Args:
+        subsampling_factor (int): Time reduction factor
+        feat_in (int): size of the input features
+        feat_out (int): size of the output features
+        subsampling (str): The subsampling technique, choose from
+            {"striding", "dw-striding", "striding_conv1d",
+            "dw_striding_conv1d"}
+        conv_channels (int): Number of channels for the convolution layers,
+                            default is 256.
+        subsampling_conv_chunking_factor (int): Input chunking factor which
+            can be -1 (no chunking) 1 (auto) or a power of 2. Default is 1
+        activation (Module): activation function, default is nn.ReLU()
+        is_causal (bool): whether to use causal Conv1/2D, where each step will
+            have limited access to locations on its right or left
+    """
+
+    def __init__(
+        self,
+        feat_in,
+        feat_out,
+        subsampling_factor=4,
+        subsampling="dw_striding",
+        conv_channels=256,
+        subsampling_conv_chunking_factor=1,
+        activation=nn.ReLU(),  # noqa: B008
+        is_causal=False,
+    ):
+        super().__init__()
+        self._subsampling = subsampling
+        self._conv_channels = conv_channels
+        self._feat_in = feat_in
+        self._feat_out = feat_out
+
+        if subsampling_factor % 2 != 0:
+            raise ValueError("Sampling factor should be a multiply of 2!")
+        self._sampling_num = int(math.log(subsampling_factor, 2))
+        self.subsampling_factor = subsampling_factor
+        self.is_causal = is_causal
+        self.subsampling_causal_cond = subsampling in (
+            "dw_striding",
+            "striding",
+            "striding_conv1d",
+        )
+
+        if (
+            subsampling_conv_chunking_factor != -1
+            and subsampling_conv_chunking_factor != 1
+            and subsampling_conv_chunking_factor % 2 != 0
+        ):
+            raise ValueError(
+                "subsampling_conv_chunking_factor should be -1, 1, or a " "power of 2"
+            )
+        self.subsampling_conv_chunking_factor = subsampling_conv_chunking_factor
+
+        in_channels = 1
+        layers = []
+
+        if subsampling == "dw_striding":
+            self._stride = 2
+            self._kernel_size = 3
+            self._ceil_mode = False
+
+            if self.is_causal:
+                self._left_padding = self._kernel_size - 1
+                self._right_padding = self._stride - 1
+                self._max_cache_len = subsampling_factor + 1
+            else:
+                self._left_padding = (self._kernel_size - 1) // 2
+                self._right_padding = (self._kernel_size - 1) // 2
+                self._max_cache_len = 0
+
+            # Layer 1
+            if self.is_causal:
+                layers.append(
+                    CausalConv2D(
+                        in_channels=in_channels,
+                        out_channels=conv_channels,
+                        kernel_size=self._kernel_size,
+                        stride=self._stride,
+                        padding=None,
+                    )
+                )
+            else:
+                layers.append(
+                    torch.nn.Conv2d(
+                        in_channels=in_channels,
+                        out_channels=conv_channels,
+                        kernel_size=self._kernel_size,
+                        stride=self._stride,
+                        padding=self._left_padding,
+                    )
+                )
+            in_channels = conv_channels
+            layers.append(activation)
+
+            for i in range(self._sampling_num - 1):
+                if self.is_causal:
+                    layers.append(
+                        CausalConv2D(
+                            in_channels=in_channels,
+                            out_channels=in_channels,
+                            kernel_size=self._kernel_size,
+                            stride=self._stride,
+                            padding=None,
+                            groups=in_channels,
+                        )
+                    )
+                else:
+                    layers.append(
+                        torch.nn.Conv2d(
+                            in_channels=in_channels,
+                            out_channels=in_channels,
+                            kernel_size=self._kernel_size,
+                            stride=self._stride,
+                            padding=self._left_padding,
+                            groups=in_channels,
+                        )
+                    )
+
+                layers.append(
+                    torch.nn.Conv2d(
+                        in_channels=in_channels,
+                        out_channels=conv_channels,
+                        kernel_size=1,
+                        stride=1,
+                        padding=0,
+                        groups=1,
+                    )
+                )
+                layers.append(activation)
+                in_channels = conv_channels
+
+        elif subsampling == "striding":
+            self._stride = 2
+            self._kernel_size = 3
+            self._ceil_mode = False
+
+            if self.is_causal:
+                self._left_padding = self._kernel_size - 1
+                self._right_padding = self._stride - 1
+                self._max_cache_len = subsampling_factor + 1
+            else:
+                self._left_padding = (self._kernel_size - 1) // 2
+                self._right_padding = (self._kernel_size - 1) // 2
+                self._max_cache_len = 0
+
+            for i in range(self._sampling_num):
+                if self.is_causal:
+                    layers.append(
+                        CausalConv2D(
+                            in_channels=in_channels,
+                            out_channels=conv_channels,
+                            kernel_size=self._kernel_size,
+                            stride=self._stride,
+                            padding=None,
+                        )
+                    )
+                else:
+                    layers.append(
+                        torch.nn.Conv2d(
+                            in_channels=in_channels,
+                            out_channels=conv_channels,
+                            kernel_size=self._kernel_size,
+                            stride=self._stride,
+                            padding=self._left_padding,
+                        )
+                    )
+                layers.append(activation)
+                in_channels = conv_channels
+
+        elif subsampling == "striding_conv1d":
+            in_channels = feat_in
+
+            self._stride = 2
+            self._kernel_size = 5
+            self._ceil_mode = False
+
+            if self.is_causal:
+                self._left_padding = self._kernel_size - 1
+                self._right_padding = self._stride - 1
+                self._max_cache_len = subsampling_factor + 1
+            else:
+                self._left_padding = (self._kernel_size - 1) // 2
+                self._right_padding = (self._kernel_size - 1) // 2
+                self._max_cache_len = 0
+
+            for i in range(self._sampling_num):
+                if self.is_causal:
+                    layers.append(
+                        CausalConv1D(
+                            in_channels=in_channels,
+                            out_channels=(
+                                feat_out
+                                if self._sampling_num == i + 1
+                                else conv_channels
+                            ),
+                            kernel_size=self._kernel_size,
+                            stride=self._stride,
+                            padding=None,
+                        )
+                    )
+                else:
+                    layers.append(
+                        torch.nn.Conv1d(
+                            in_channels=in_channels,
+                            out_channels=(
+                                feat_out
+                                if self._sampling_num == i + 1
+                                else conv_channels
+                            ),
+                            kernel_size=self._kernel_size,
+                            stride=self._stride,
+                            padding=self._left_padding,
+                        )
+                    )
+                layers.append(activation)
+                in_channels = conv_channels
+
+        elif subsampling == "dw_striding_conv1d":
+            in_channels = feat_in
+
+            self._stride = 2
+            self._kernel_size = 5
+            self._ceil_mode = False
+
+            self._left_padding = (self._kernel_size - 1) // 2
+            self._right_padding = (self._kernel_size - 1) // 2
+
+            # Layer 1
+            layers.extend(
+                [
+                    torch.nn.Conv1d(
+                        in_channels=in_channels,
+                        out_channels=in_channels,
+                        kernel_size=self._kernel_size,
+                        stride=self._stride,
+                        padding=self._left_padding,
+                        groups=in_channels,
+                    ),
+                    torch.nn.Conv1d(
+                        in_channels=in_channels,
+                        out_channels=(
+                            feat_out if self._sampling_num == 1 else conv_channels
+                        ),
+                        kernel_size=1,
+                        stride=1,
+                        padding=0,
+                        groups=1,
+                    ),
+                ]
+            )
+            in_channels = conv_channels
+            layers.append(activation)
+
+            for i in range(self._sampling_num - 1):
+                layers.extend(
+                    [
+                        torch.nn.Conv1d(
+                            in_channels=in_channels,
+                            out_channels=in_channels,
+                            kernel_size=self._kernel_size,
+                            stride=self._stride,
+                            padding=self._left_padding,
+                            groups=in_channels,
+                        ),
+                        torch.nn.Conv1d(
+                            in_channels=in_channels,
+                            out_channels=(
+                                feat_out
+                                if self._sampling_num == i + 2
+                                else conv_channels
+                            ),
+                            kernel_size=1,
+                            stride=1,
+                            padding=0,
+                            groups=1,
+                        ),
+                    ]
+                )
+                layers.append(activation)
+                in_channels = conv_channels
+
+        else:
+            raise ValueError(f"Not valid sub-sampling: {subsampling}!")
+
+        if subsampling in ["dw_striding", "striding"]:
+            in_length = torch.tensor(feat_in, dtype=torch.float)
+            out_length = calc_length(
+                lengths=in_length,
+                all_paddings=self._left_padding + self._right_padding,
+                kernel_size=self._kernel_size,
+                stride=self._stride,
+                ceil_mode=self._ceil_mode,
+                repeat_num=self._sampling_num,
+            )
+            self.out = torch.nn.Linear(conv_channels * int(out_length), feat_out)
+            self.conv2d_subsampling = True
+        elif subsampling in ["striding_conv1d", "dw_striding_conv1d"]:
+            self.out = None
+            self.conv2d_subsampling = False
+        else:
+            raise ValueError(f"Not valid sub-sampling: {subsampling}!")
+
+        self.conv = torch.nn.Sequential(*layers)
+
+    def get_sampling_frames(self):
+        return [1, self.subsampling_factor]
+
+    def get_streaming_cache_size(self):
+        return [0, self.subsampling_factor + 1]
+
+    def forward(self, x, mask):
+        """
+        Forward method for NeMo subsampling.
+
+        Args:
+            x[Batch, Time, Filters]: torch.Tensor
+                input tensor
+            x_mask: torch.Tensor
+                input mask
+
+        Returns:
+            x: torch.Tensor
+                Resulting tensor from subsampling (B, T //
+                time_reduction_factor, feat_out)
+            pad_mask: torch.Tensor
+                tensor of padded hidden state sequences (B, 1, T //
+                time_reduction_factor)
+        """
+        x = x.unsqueeze(1) if self.conv2d_subsampling else x.transpose(1, 2)
+
+        # split inputs if chunking_factor is set
+        if self.subsampling_conv_chunking_factor != -1 and self.conv2d_subsampling:
+            if self.subsampling_conv_chunking_factor == 1:
+                # if subsampling_conv_chunking_factor is 1, we split only
+                # if needed.
+                # avoiding a bug / feature limiting indexing of tensors
+                # to 2**31.
+                # see https://github.com/pytorch/pytorch/issues/80020
+                x_ceil = 2**31 / self._conv_channels * self._stride * self._stride
+                need_to_split = torch.numel(x) > x_ceil
+            else:
+                # if subsampling_conv_chunking_factor > 1 we always split
+                need_to_split = True
+
+            if need_to_split:
+                x, success = self.conv_split_by_batch(x)
+                if not success:  # if unable to split by batch, try by channel
+                    if self._subsampling == "dw_striding":
+                        x = self.conv_split_by_channel(x)
+                    else:
+                        x = self.conv(x)  # try anyway
+            else:
+                x = self.conv(x)
+        else:
+            x = self.conv(x)
+
+        # Flatten Channel and Frequency Axes
+        if self.conv2d_subsampling:
+            b, c, t, f = x.size()
+            x = self.out(x.transpose(1, 2).reshape(b, t, -1))
+        # Transpose to Channel Last mode
+        else:
+            x = x.transpose(1, 2)
+
+        if mask is None:
+            return x, None
+
+        max_audio_length = x.shape[1]
+        feature_lens = mask.sum(1)
+        padding_length = torch.ceil(feature_lens / self.subsampling_factor)
+        if self.is_causal and self.subsampling_causal_cond:
+            feature_lens_remainder = feature_lens % self.subsampling_factor
+            padding_length[feature_lens_remainder != 1] += 1
+        pad_mask = torch.arange(0, max_audio_length, device=x.device).expand(
+            padding_length.size(0), -1
+        ) < padding_length.unsqueeze(1)
+        return x, pad_mask.unsqueeze(1)
+
+    def reset_parameters(self):
+        # initialize weights
+        if self._subsampling == "dw_striding":
+            with torch.no_grad():
+                # init conv
+                scale = 1.0 / self._kernel_size
+                dw_max = (self._kernel_size**2) ** -0.5
+                pw_max = self._conv_channels**-0.5
+
+                torch.nn.init.uniform_(self.conv[0].weight, -scale, scale)
+                torch.nn.init.uniform_(self.conv[0].bias, -scale, scale)
+
+                for idx in range(2, len(self.conv), 3):
+                    torch.nn.init.uniform_(self.conv[idx].weight, -dw_max, dw_max)
+                    torch.nn.init.uniform_(self.conv[idx].bias, -dw_max, dw_max)
+                    torch.nn.init.uniform_(self.conv[idx + 1].weight, -pw_max, pw_max)
+                    torch.nn.init.uniform_(self.conv[idx + 1].bias, -pw_max, pw_max)
+
+                # init fc (80 * 64 = 5120 from https://github.com/kssteven418/
+                # Squeezeformer/blob/13c97d6cf92f2844d2cb3142b4c5bfa9ad1a8951/
+                # src/models/conformer_encoder.py#L487
+                fc_scale = (self._feat_out * self._feat_in / self._sampling_num) ** -0.5
+                torch.nn.init.uniform_(self.out.weight, -fc_scale, fc_scale)
+                torch.nn.init.uniform_(self.out.bias, -fc_scale, fc_scale)
+
+    def conv_split_by_batch(self, x):
+        """Tries to split input by batch, run conv and concat results"""
+        b, _, _, _ = x.size()
+        if b == 1:  # can't split if batch size is 1
+            return x, False
+
+        if self.subsampling_conv_chunking_factor > 1:
+            cf = self.subsampling_conv_chunking_factor
+        else:
+            # avoiding a bug / feature limiting indexing of tensors to 2**31
+            # see https://github.com/pytorch/pytorch/issues/80020
+            x_ceil = 2**31 / self._conv_channels * self._stride * self._stride
+            p = math.ceil(math.log(torch.numel(x) / x_ceil, 2))
+            cf = 2**p
+
+        new_batch_size = b // cf
+        if new_batch_size == 0:  # input is too big
+            return x, False
+
+        return (
+            torch.cat(
+                [self.conv(chunk) for chunk in torch.split(x, new_batch_size, 0)]
+            ),
+            True,
+        )
+
+    def conv_split_by_channel(self, x):
+        """For dw convs, tries to split input by time, run conv and concat
+        results"""
+        x = self.conv[0](x)  # full conv2D
+        x = self.conv[1](x)  # activation
+
+        for i in range(self._sampling_num - 1):
+            _, c, t, _ = x.size()
+
+            if self.subsampling_conv_chunking_factor > 1:
+                cf = self.subsampling_conv_chunking_factor
+            else:
+                # avoiding a bug / feature limiting indexing of tensors
+                # to 2**31
+                # see https://github.com/pytorch/pytorch/issues/80020
+                p = math.ceil(math.log(torch.numel(x) / 2**31, 2))
+                cf = 2**p
+
+            new_c = int(c // cf)
+            if new_c == 0:
+                new_c = 1
+
+            new_t = int(t // cf)
+            if new_t == 0:
+                new_t = 1
+
+            x = self.channel_chunked_conv(
+                self.conv[i * 3 + 2], new_c, x
+            )  # conv2D, depthwise
+
+            # splitting pointwise convs by time
+            x = torch.cat(
+                [self.conv[i * 3 + 3](chunk) for chunk in torch.split(x, new_t, 2)],
+                2,
+            )  # conv2D, pointwise
+            x = self.conv[i * 3 + 4](x)  # activation
+        return x
+
+    def channel_chunked_conv(self, conv, chunk_size, x):
+        """Performs channel chunked convolution"""
+
+        ind = 0
+        out_chunks = []
+        for chunk in torch.split(x, chunk_size, 1):
+            step = chunk.size()[1]
+
+            if self.is_causal:
+                chunk = nn.functional.pad(
+                    chunk,
+                    pad=(
+                        self._kernel_size - 1,
+                        self._stride - 1,
+                        self._kernel_size - 1,
+                        self._stride - 1,
+                    ),
+                )
+                ch_out = nn.functional.conv2d(
+                    chunk,
+                    conv.weight[ind : ind + step, :, :, :],
+                    bias=conv.bias[ind : ind + step],
+                    stride=self._stride,
+                    padding=0,
+                    groups=step,
+                )
+            else:
+                ch_out = nn.functional.conv2d(
+                    chunk,
+                    conv.weight[ind : ind + step, :, :, :],
+                    bias=conv.bias[ind : ind + step],
+                    stride=self._stride,
+                    padding=self._left_padding,
+                    groups=step,
+                )
+            out_chunks.append(ch_out)
+            ind += step
+
+        return torch.cat(out_chunks, 1)
+
+    def change_subsampling_conv_chunking_factor(
+        self, subsampling_conv_chunking_factor: int
+    ):
+        if (
+            subsampling_conv_chunking_factor != -1
+            and subsampling_conv_chunking_factor != 1
+            and subsampling_conv_chunking_factor % 2 != 0
+        ):
+            raise ValueError(
+                "subsampling_conv_chunking_factor should be -1, 1, or a " "power of 2"
+            )
+        self.subsampling_conv_chunking_factor = subsampling_conv_chunking_factor
+
+
+def calc_length(lengths, all_paddings, kernel_size, stride, ceil_mode, repeat_num=1):
+    """Calculates the output length of a Tensor passed through a convolution or
+    max pooling layer"""
+    add_pad: float = all_paddings - kernel_size
+    one: float = 1.0
+    for i in range(repeat_num):
+        lengths = torch.div(lengths.to(dtype=torch.float) + add_pad, stride) + one
+        lengths = torch.ceil(lengths) if ceil_mode else torch.floor(lengths)
+    return lengths.to(dtype=torch.int)
+
+
+####  multihead attention starts here
+class AttModule(nn.Module):
+    """Attention abstraction module"""
+
+    def __init__(self):
+        super().__init__()
+        self.export_mode = False
+
+    def set_export(self, mode=True):
+        """set the export mode"""
+        self.export_mode = mode
+
+    def forward(
+        self,
+        x: Tensor,
+        memory: Optional[Tensor] = None,
+        pos_emb: Optional[Tensor] = None,
+        att_mask: Optional[Tensor] = None,
+    ) -> tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]:
+        """AttModule forward
+
+        Args:
+            x: torch.Tensor
+                input tensor.
+            memory: torch.Tensor, optional
+                memory tensor.
+            pos_emb: torch.Tensor, optional
+                positional encoder embedding.
+            att_mask: torch.Tensor, optional
+                attention mask tensor.
+        """
+        return x, memory, pos_emb, att_mask
+
+
+class AttBlock(BlockBase, AttModule):
+    """Attention Block module to support both Attention and Block module."""
+
+    def memory_dims(self, max_len=False):
+        """memory dimensions"""
+        return (1, self.input_size)
+
+
+def masked_softmax(
+    scores,
+    mask: Optional[Tensor],
+):
+    if mask is not None:
+        mask = mask.unsqueeze(1).eq(0)  # (batch, 1, time1, time2)
+        scores = scores.masked_fill(mask, -torch.inf)
+        attn = torch.softmax(scores, dim=-1).masked_fill(
+            mask, 0.0
+        )  # (batch, head, time1, time2)
+    else:
+        attn = torch.softmax(scores, dim=-1)  # (batch, head, time1, time2)
+    return attn
+
+
+class MultiHeadedAttention(nn.Module):
+    """Multi-Head Attention layer with optional relative position embedding
+    and GLU.
+
+    Args:
+        n_head: int
+            the number of heads.
+        n_feat: int
+            input size features.
+        dropout_rate: float
+            dropout rate.
+        use_LN: bool
+            apply layer norm or not
+        dropout_at_output: bool
+            whether to apply dropout at output
+        attention_inner_dim: int, optional
+            the attention dimension used in the class,
+            it can be different from the input dimension n_feat.
+            default: -1 (equal to n_feat).
+        use_pt_scaled_dot_product_attention: bool, optional
+            if set True, use pytorch scaled dot product attention in training.
+            NOTE: this will NOT be used in ONNX decoding due to a lack of
+            support.  In that case, we use the original attention
+            implementation, which shows no regression.
+            default: False.
+        n_value: int, optional
+            if set to values other than -1, use a different dimension for
+            value. With the default value (i.e. -1), it is backward compatible.
+        group_size: int, optional. must divide `n_head`
+            if group_size > 1:       GQA
+            if group_size = 1:       MHA
+            if group_size = n_head:  MQA
+    """
+
+    inv_sqrt_d_k: torch.jit.Final[float]
+    h: torch.jit.Final[int]
+    h_k: torch.jit.Final[int]
+    g: torch.jit.Final[int]
+
+    def __init__(
+        self,
+        n_head,
+        n_feat,
+        dropout_rate,
+        attention_inner_dim=-1,
+        glu_type="swish",
+        bias_in_glu=True,
+        use_pt_scaled_dot_product_attention=False,
+        n_value=-1,
+        group_size: int = 1,
+    ):
+        super().__init__()
+        if n_value == -1:
+            n_value = n_feat
+        if attention_inner_dim == -1:
+            attention_inner_dim = n_feat
+        assert attention_inner_dim % n_head == 0
+
+        # We assume d_v always equals d_k
+        self.d_k = attention_inner_dim // n_head
+        self.inv_sqrt_d_k = 1.0 / math.sqrt(self.d_k)
+        self.h = n_head
+        assert n_head % group_size == 0, "group_size must divide n_head"
+        self.g = group_size
+        self.h_k = n_head // group_size
+
+        self.linear_q = nn.Linear(n_feat, attention_inner_dim)
+        self.linear_k = nn.Linear(n_feat, attention_inner_dim // group_size)
+        self.linear_v = nn.Linear(n_value, attention_inner_dim // group_size)
+        self.linear_out = nn.Linear(attention_inner_dim // group_size, n_value)
+
+        self.attn = torch.jit.Attribute(None, Optional[Tensor])
+        self.dropout = nn.Dropout(p=dropout_rate)
+        self.dropout_rate = dropout_rate
+        self.use_pt_scaled_dot_product_attention = use_pt_scaled_dot_product_attention
+
+        if use_pt_scaled_dot_product_attention and group_size > 1:
+            raise ValueError("Cannot use PT Scaled Attention with GQA")
+
+        # Torchscript eager quantization.  Note that these functions below are
+        # NOOPs and have very little impact on performance unless quantization
+        # is enabled.
+        self.quant_q = torch.ao.quantization.QuantStub()
+        self.quant_x = torch.ao.quantization.QuantStub()
+        self.dequant = torch.ao.quantization.DeQuantStub()
+        self.ffunc = torch.ao.nn.quantized.FloatFunctional()
+
+    def forward(
+        self,
+        query: Tensor,
+        key: Tensor,
+        value: Tensor,
+        pos_k: Tensor,
+        pos_v: Tensor,
+        mask: Optional[Tensor],
+        relative_attention_bias: Optional[Tensor] = None,
+    ):
+        """Compute 'Scaled Dot Product Attention'.
+
+        Args:
+            query: torch.Tensor
+                query tensor (batch, time1, size)
+            key: torch.Tensor
+                key tensor (batch, time2, size)
+            value: torch.Tensor
+                value tensor (batch, time1, size)
+            pos_k: torch.Tensor
+                key tensor used for relative positional embedding.
+            pos_v: torch.Tensor
+                value tensor used for relative positional embedding.
+            mask: torch.Tensor
+                mask tensor (batch, time1, time2)
+            relative_attention_bias: torch.Tensor
+                bias added to attention logits w.r.t. relative positions
+                (1, n_head, time1, time2)
+        """
+        n_batch = query.size(0)
+
+        q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)  # (b, t, d)
+        k = self.linear_k(key).view(n_batch, -1, self.h_k, self.d_k)  # (b, t, d)
+        v = self.linear_v(value).view(n_batch, -1, self.h_k, self.d_k)
+        q = (
+            q.transpose(1, 2)
+            if self.use_pt_scaled_dot_product_attention and not torch.jit.is_scripting()
+            else q.transpose(1, 2) * self.inv_sqrt_d_k
+        )
+        k = k.transpose(1, 2)  # (batch, head_k, time2, d_k)
+        v = v.transpose(1, 2)  # (batch, head_k, time2, d_k)
+
+        if self.use_pt_scaled_dot_product_attention and not torch.jit.is_scripting():
+            attn_mask = None
+            if mask is not None:
+                mask = mask.unsqueeze(1)
+                if relative_attention_bias is not None:
+                    attn_mask = mask + relative_attention_bias
+                else:
+                    attn_mask = mask
+                if mask.dtype != q.dtype:
+                    attn_mask = attn_mask.to(q.dtype)
+
+            with torch.nn.attention.sdpa_kernel(
+                [
+                    torch.nn.attention.SDPBackend.FLASH_ATTENTION,
+                    torch.nn.attention.SDPBackend.EFFICIENT_ATTENTION,
+                    torch.nn.attention.SDPBackend.MATH,
+                    torch.nn.attention.SDPBackend.CUDNN_ATTENTION,
+                ]
+            ):
+                x = torch.nn.functional.scaled_dot_product_attention(
+                    q,
+                    k,
+                    v,
+                    attn_mask=attn_mask,
+                    dropout_p=self.dropout_rate,
+                )
+        else:
+            if self.h != self.h_k:
+                q = q.reshape(n_batch, self.g, self.h_k, -1, self.d_k)
+                A = torch.einsum("b g h t d, b h s d -> b h t s", q, k)
+            else:
+                A = torch.matmul(q, k.transpose(-2, -1))
+            if pos_k is not None:
+                if self.h != self.h_k:
+                    B = torch.einsum("b g h t d, t s d -> b h t s", q, pos_k)
+                else:
+                    reshape_q = (
+                        q.contiguous()
+                        .view(n_batch * self.h, -1, self.d_k)
+                        .transpose(0, 1)
+                    )  # (t1,nh,dk)
+                    B = torch.matmul(
+                        reshape_q, pos_k.transpose(-2, -1)
+                    )  # pos_k: (t1,dk,t2)
+                    B = B.transpose(0, 1).view(
+                        n_batch, self.h, pos_k.size(0), pos_k.size(1)
+                    )
+                scores = A + B
+            else:
+                scores = A
+
+            if relative_attention_bias is not None:
+                scores = scores + relative_attention_bias
+
+            attn = masked_softmax(scores, mask)  # (batch, head, time1, time2)
+
+            self.attn = attn
+
+            p_attn = self.dropout(attn)
+            x = torch.matmul(p_attn.to(v.dtype), v)  # (batch, head, time1, d_k)
+            if pos_v is not None:
+                reshape_attn = (
+                    p_attn.contiguous()
+                    .view(n_batch * self.h, pos_v.size(0), pos_v.size(1))
+                    .transpose(0, 1)
+                )  # (t1, bh, t2)
+
+                attn_v = (
+                    torch.matmul(reshape_attn, pos_v)
+                    .transpose(0, 1)
+                    .contiguous()
+                    .view(n_batch, self.h, pos_v.size(0), self.d_k)
+                )
+                x = x + attn_v
+        x = (
+            x.transpose(1, 2).contiguous().view(n_batch, -1, self.h_k * self.d_k)
+        )  # (batch, time1, d_model)
+
+        return self.linear_out(x)  # (batch, time1, d_model)
+
+
+class MultiSequential(torch.nn.Sequential):
+    """Multi-input multi-output torch.nn.Sequential"""
+
+    @torch.jit.ignore
+    def forward(self, *args):
+        """Forward method implementation."""
+        for m in self:
+            args = m(*args)
+        return args
+
+
+def get_offset(input_layer: str, time_reduction: int):
+    """Get an offset. We will use the offset for determining #frames of a
+    subsampled feature.
+
+    Args:
+        input_layer (str): Type of an input layer
+        time_reduction (int): time reduction factor for downsampling a feature
+    Returns:
+        int: offset
+    """
+    if input_layer in ("conv2d", "nemo_conv") and time_reduction == 4:
+        return 3
+    if input_layer in ("conv2d",) and time_reduction == 6:
+        return 1
+    if input_layer in ("conv2d", "nemo_conv") and time_reduction == 8:
+        return 7
+    return 0
+
+
+def unfold_tensor(xs_pad, max_seq_len):
+    """
+    For a given tensor with shape of (N, T, D), if sequence length T is
+    longer than max_seq_len, this function unfold it to a
+    (NT', max_seq_len, D) where T' is T // max_seq_len.
+    Args:
+        xs_pad: N, T, D
+    """
+    _, _, D = xs_pad.shape
+    xs_pad = xs_pad.transpose(-1, -2)  # convert to N, D, T
+    # N x D x 1 x T => N x (D x max_seq_len) x T'
+    xs_pad = F.unfold(
+        xs_pad[..., None, :],
+        kernel_size=(1, max_seq_len),
+        stride=(1, max_seq_len),
+    )
+    new_bsz, _, slen = xs_pad.shape
+    # N x D x max_seq_len x T'
+    xs_pad = xs_pad.view(new_bsz, -1, max_seq_len, slen)
+    # N x T' x max_seq_len x D
+    xs_pad = xs_pad.permute(0, 3, 2, 1).contiguous()
+    # NT' x max_seq_len x D
+    xs_pad = xs_pad.view(-1, max_seq_len, D)
+    return xs_pad
diff --git a/sglang/python/sglang/srt/models/phimoe.py b/sglang/python/sglang/srt/models/phimoe.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d147c2b178339466818d4edce83faf53f78444a
--- /dev/null
+++ b/sglang/python/sglang/srt/models/phimoe.py
@@ -0,0 +1,559 @@
+from typing import Iterable, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from transformers.configuration_utils import PretrainedConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.dp_attention import get_attention_tp_rank, get_attention_tp_size
+from sglang.srt.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    DEFAULT_VOCAB_PADDING_SIZE,
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.utils import add_prefix, make_layers
+
+
+class PhiMoEConfig(PretrainedConfig):
+
+    model_type = "phimoe"
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=4096,
+        intermediate_size=14336,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=8,
+        head_dim=None,
+        hidden_act="silu",
+        max_position_embeddings=4096 * 32,
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        rope_theta=1e6,
+        sliding_window=None,
+        attention_dropout=0.0,
+        num_experts_per_tok=2,
+        num_local_experts=16,
+        output_router_logits=False,
+        router_aux_loss_coef=0.001,
+        router_jitter_noise=0.0,
+        attention_bias=False,
+        lm_head_bias=False,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.sliding_window = sliding_window
+        self.attention_bias = attention_bias
+        self.lm_head_bias = lm_head_bias
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        if head_dim is None:
+            head_dim = hidden_size // num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_dropout = attention_dropout
+
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_local_experts = num_local_experts
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef
+        self.router_jitter_noise = router_jitter_noise
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+def sparsemixer(scores, jitter_eps=0.01):
+    ################ Select first expert (topk=2) ################
+
+    # compute mask for sparsity
+    mask_logits_threshold, max_ind = scores.max(dim=-1, keepdim=True)
+    factor = scores.abs().clamp(min=mask_logits_threshold)
+    mask_logits_threshold = ((mask_logits_threshold - scores) / factor) > (
+        2 * jitter_eps
+    )
+
+    # apply mask
+    masked_gates = scores.masked_fill(mask_logits_threshold, float("-inf"))
+    selected_experts = max_ind
+
+    # compute scores for gradients
+    masked_gates = torch.softmax(masked_gates, dim=-1)
+    multiplier_o = masked_gates.gather(dim=-1, index=selected_experts)
+
+    multiplier = multiplier_o
+
+    # masked out first expert
+    masked_scores = torch.scatter(
+        scores,
+        -1,
+        selected_experts,
+        float("-inf"),
+    )
+
+    ################ Select second expert (topk=2) ################
+    # compute mask for sparsity
+    mask_logits_threshold, max_ind = masked_scores.max(dim=-1, keepdim=True)
+    factor = scores.abs().clamp(min=mask_logits_threshold)
+    mask_logits_threshold = ((mask_logits_threshold - scores) / factor) > (
+        2 * jitter_eps
+    )
+
+    # apply mask
+    masked_gates_top2 = masked_scores.masked_fill(mask_logits_threshold, float("-inf"))
+    selected_experts_top2 = max_ind
+    # compute scores for gradients
+    masked_gates_top2 = torch.softmax(masked_gates_top2, dim=-1)
+    multiplier_top2 = masked_gates_top2.gather(dim=-1, index=selected_experts_top2)
+
+    multiplier = torch.concat((multiplier, multiplier_top2), dim=-1)
+    selected_experts = torch.concat((selected_experts, selected_experts_top2), dim=-1)
+
+    return (
+        multiplier,
+        selected_experts,
+    )
+
+
+def phimoe_routing_function(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+):
+    assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
+    assert topk == 2, "Only top-2 routing is supported"
+    assert renormalize is False, "Renormalization is not supported"
+
+    topk_weights, topk_ids = sparsemixer(gating_output)
+    return topk_weights, topk_ids
+
+
+class PhiMoE(nn.Module):
+    """A tensor-parallel MoE implementation for PhiMoE that shards each expert
+    across all ranks.
+
+    Each expert's weights are sharded across all ranks and a fused MoE
+    kernel is used for the forward pass, and finally we reduce the outputs
+    across ranks.
+    """
+
+    def __init__(
+        self,
+        num_experts: int,
+        top_k: int,
+        hidden_size: int,
+        intermediate_size: int,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.tp_size = get_tensor_model_parallel_world_size()
+
+        # Gate always runs at half / full precision for now.
+        self.gate = ReplicatedLinear(
+            hidden_size,
+            num_experts,
+            bias=False,
+            quant_config=None,
+        )
+
+        self.topk = TopK(
+            top_k=top_k,
+            renormalize=False,
+            custom_routing_function=phimoe_routing_function,
+        )
+
+        self.experts = FusedMoE(
+            num_experts=num_experts,
+            top_k=top_k,
+            layer_id=layer_id,
+            hidden_size=hidden_size,
+            intermediate_size=intermediate_size,
+            reduce_results=True,
+            quant_config=quant_config,
+            prefix=add_prefix("experts", prefix),
+        )
+
+    def forward(
+        self, hidden_states: torch.Tensor, forward_batch: Optional[ForwardBatch] = None
+    ) -> torch.Tensor:
+        # NOTE: hidden_states can have either 1D or 2D shape.
+        orig_shape = hidden_states.shape
+        hidden_states = hidden_states.view(-1, self.hidden_size)
+        router_logits, _ = self.gate(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        final_hidden_states = self.experts(hidden_states, topk_output)
+        return final_hidden_states.view(orig_shape)
+
+
+class PhiMoEAttention(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        head_dim: Optional[int] = None,
+        max_position: int = 4096 * 32,
+        rope_theta: float = 10000,
+        layer_id: int = 0,
+        attention_bias: bool = False,
+        quant_config: Optional[QuantizationConfig] = None,
+        rope_scaling: Optional[dict] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+
+        attn_tp_rank = get_attention_tp_rank()
+        attn_tp_size = get_attention_tp_size()
+
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % attn_tp_size == 0
+        self.num_heads = self.total_num_heads // attn_tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= attn_tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % attn_tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert attn_tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // attn_tp_size)
+        if head_dim is None:
+            head_dim = hidden_size // num_heads
+        self.head_dim = head_dim
+
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=attention_bias,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=attention_bias,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            prefix=add_prefix("o_proj", prefix),
+        )
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position,
+            base=int(self.rope_theta),
+            rope_scaling=self.rope_scaling,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class PhiMoEDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PhiMoEConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        self.self_attn = PhiMoEAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            max_position=config.max_position_embeddings,
+            num_kv_heads=config.num_key_value_heads,
+            head_dim=getattr(
+                config, "head_dim", self.hidden_size // config.num_attention_heads
+            ),
+            rope_theta=rope_theta,
+            layer_id=layer_id,
+            attention_bias=config.attention_bias,
+            quant_config=quant_config,
+            rope_scaling=config.rope_scaling,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.block_sparse_moe = PhiMoE(
+            num_experts=config.num_local_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("block_sparse_moe", prefix),
+        )
+        self.input_layernorm = nn.LayerNorm(
+            config.hidden_size, eps=config.rms_norm_eps, elementwise_affine=True
+        )
+        self.post_attention_layernorm = nn.LayerNorm(
+            config.hidden_size, eps=config.rms_norm_eps, elementwise_affine=True
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: Optional[torch.Tensor],
+        forward_batch: ForwardBatch,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        residual = hidden_states
+
+        hidden_states = self.input_layernorm(hidden_states)
+
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        hidden_states = hidden_states + residual
+
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.block_sparse_moe(
+            hidden_states, forward_batch=forward_batch
+        )
+
+        hidden_states = hidden_states + residual
+        return hidden_states, residual
+
+
+class PhiMoEModel(nn.Module):
+
+    def __init__(
+        self,
+        config: PhiMoEConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.config = config
+        self.quant_config = quant_config
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+
+        self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: PhiMoEDecoderLayer(
+                config, int(prefix.split(".")[-1]), quant_config, prefix=prefix
+            ),
+            prefix=add_prefix("layers", prefix),
+        )
+        self.norm = nn.LayerNorm(
+            config.hidden_size, eps=config.rms_norm_eps, elementwise_affine=True
+        )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor]:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+        residual = None
+
+        for layer in self.layers:
+            hidden_states, residual = layer(
+                positions, hidden_states, residual, forward_batch=forward_batch
+            )
+
+        hidden_states = self.norm(hidden_states)
+        return hidden_states
+
+
+class PhiMoEForCausalLM(nn.Module):
+
+    def __init__(
+        self,
+        config: PhiMoEConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+
+        self.model = PhiMoEModel(
+            config=config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+            padding_size=DEFAULT_VOCAB_PADDING_SIZE,
+            quant_config=quant_config,
+            bias=True,
+            prefix=add_prefix("lm_head", prefix),
+        )
+        if self.config.tie_word_embeddings:
+            self.lm_head.weight = self.model.embed_tokens.weight
+        self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        get_embedding: bool = False,
+    ) -> LogitsProcessorOutput:
+        hidden_states = self.model(input_ids, positions, forward_batch, inputs_embeds)
+
+        if not get_embedding:
+            return self.logits_processor(
+                input_ids, hidden_states, self.lm_head, forward_batch
+            )
+
+        else:
+            return self.pooler(hidden_states, forward_batch)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="w1",
+            ckpt_down_proj_name="w2",
+            ckpt_up_proj_name="w3",
+            num_experts=self.config.num_local_experts,
+        )
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    # Remapping the name of FP8 kv-scale.
+                    name = maybe_remap_kv_scale_name(name, params_dict)
+                    if name is None:
+                        continue
+
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+
+
+EntryClass = PhiMoEForCausalLM
diff --git a/sglang/python/sglang/srt/models/pixtral.py b/sglang/python/sglang/srt/models/pixtral.py
new file mode 100644
index 0000000000000000000000000000000000000000..2658019014211376ffa42b996d24cb09ddd6b446
--- /dev/null
+++ b/sglang/python/sglang/srt/models/pixtral.py
@@ -0,0 +1,1027 @@
+# Copyright 2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""
+Using mistral-community/pixtral-12b as reference.
+"""
+
+from dataclasses import dataclass, fields
+from typing import Iterable, List, Optional, Set, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PixtralVisionConfig, PretrainedConfig
+from transformers.models.pixtral.modeling_pixtral import (
+    PixtralRotaryEmbedding,
+)
+from transformers.models.pixtral.modeling_pixtral import (
+    generate_block_attention_mask as _get_pixtral_attention_mask,
+)
+from transformers.models.pixtral.modeling_pixtral import (
+    position_ids_in_meshgrid,
+)
+
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.attention.vision import VisionAttention
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import MergedColumnParallelLinear, RowParallelLinear
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternMultimodalTokens,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInputs
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.mistral_large_3 import MistralLarge3ForCausalLM
+
+USE_XFORMERS_OPS = False
+PATCH_MERGE = "patch_merge"
+
+
+# Vision encoder
+@dataclass
+class VisionEncoderArgs:
+    hidden_size: int
+    num_channels: int
+    image_size: int
+    patch_size: int
+    intermediate_size: int
+    num_hidden_layers: int
+    num_attention_heads: int
+    rope_theta: float  # for rope-2D
+    image_token_id: int
+    adapter_bias: bool = True
+    spatial_merge_size: int = 1
+    add_pre_mm_projector_layer_norm: bool = False
+    mm_projector_id: str = ""
+
+
+class PixtralForConditionalGeneration(nn.Module):
+    merge_by_field_config = True
+
+    @classmethod
+    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
+        if modality.startswith("image"):
+            return None
+
+        raise ValueError("Only image modality is supported")
+
+    def __init__(self, *, config, prefix: str = "", **kwargs):
+        super().__init__()
+        self.config = config
+        dataclass_fields = {field.name for field in fields(VisionEncoderArgs)}
+        vision_args = {
+            key: value
+            for key, value in self.config.vision_config.to_dict().items()
+            if key in dataclass_fields
+        }
+
+        self.vision_args = VisionEncoderArgs(**vision_args)
+
+        self.language_model = MistralLarge3ForCausalLM(
+            config=self.config.text_config,
+            quant_config=kwargs.get("quant_config"),
+        )
+
+        self.vision_encoder = VisionTransformer(self.vision_args)
+
+        if self.vision_args.add_pre_mm_projector_layer_norm:
+            self.pre_mm_projector_norm = RMSNorm(self.vision_args.hidden_size, eps=1e-5)
+
+        if self.vision_args.mm_projector_id == PATCH_MERGE:
+            self.patch_merger = PatchMerger(
+                vision_encoder_dim=self.vision_args.hidden_size,
+                spatial_merge_size=self.vision_args.spatial_merge_size,
+                use_mlp_bias=False,
+            )
+
+        self.vision_language_adapter = VisionLanguageAdapter(
+            self.vision_args, dim=self.config.text_config.hidden_size
+        )
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+        return pattern.pad_input_tokens(input_ids, mm_inputs)
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
+        def is_vision_encoder_weights(weight: tuple[str, torch.Tensor]):
+            return weight[0].startswith("vision_encoder")
+
+        def is_vision_lang_adapter_weights(weight: tuple[str, torch.Tensor]):
+            return weight[0].startswith("vision_language_adapter")
+
+        def is_patch_merger(weight: tuple[str, torch.Tensor]):
+            return weight[0].startswith("patch_merger")
+
+        def is_pre_mm_projector_norm(weight: tuple[str, torch.Tensor]):
+            return weight[0].startswith("pre_mm_projector_norm")
+
+        # Get references to parameters for direct loading
+        vision_encoder_dict = dict(self.vision_encoder.named_parameters())
+        patch_merger_dict = (
+            dict(self.patch_merger.named_parameters())
+            if self.vision_args.mm_projector_id == PATCH_MERGE
+            else dict()
+        )
+        pre_mm_projector_norm_dict = (
+            dict(self.pre_mm_projector_norm.named_parameters())
+            if self.vision_args.add_pre_mm_projector_layer_norm
+            else dict()
+        )
+        vision_lang_adapter_dict = dict(self.vision_language_adapter.named_parameters())
+
+        def llm_weights_generator():
+            # Single pass over weights
+            for name, w in weights:
+                if is_vision_encoder_weights((name, w)):
+                    # Load vision encoder weights directly
+                    trimmed_name = ".".join(name.split(".")[1:])
+                    # NOTE: The current nvfp4 model has extra weights that we need to ignore, called
+                    # vision_encoder.transformer.layers.*.attention.{k,v}_fake_quantizer.qscale_act
+                    # TODO: Remove this if condition once the model is fixed
+                    if "fake_quantizer.qscale_act" in trimmed_name:
+                        continue
+                    param = vision_encoder_dict[trimmed_name]
+                    with torch.no_grad():
+                        default_weight_loader(param, w)
+                elif is_patch_merger((name, w)):
+                    # Load vision patch merger weights directly
+                    trimmed_name = ".".join(name.split(".")[1:])
+                    param = patch_merger_dict[trimmed_name]
+                    with torch.no_grad():
+                        default_weight_loader(param, w)
+                elif is_pre_mm_projector_norm((name, w)):
+                    # Load vision pre_mm_projector_norm weights directly
+                    trimmed_name = ".".join(name.split(".")[1:])
+                    param = pre_mm_projector_norm_dict[trimmed_name]
+                    with torch.no_grad():
+                        default_weight_loader(param, w)
+                elif is_vision_lang_adapter_weights((name, w)):
+                    # Load vision-language adapter weights directly
+                    trimmed_name = ".".join(name.split(".")[1:])
+                    param = vision_lang_adapter_dict[trimmed_name]
+                    with torch.no_grad():
+                        default_weight_loader(param, w)
+                else:
+                    # LLM weights: yield them to be loaded
+                    # by language_model.load_weights
+                    yield (name, w)
+
+        # Now we call the language model load with the generator
+        self.language_model.load_weights(llm_weights_generator())
+
+    def get_language_model(self) -> torch.nn.Module:
+        return self.language_model
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        images = [item.feature for item in items]
+        # Process images through vision encoder
+        image_features = self.vision_encoder(images)
+        if self.vision_args.add_pre_mm_projector_layer_norm:
+            image_features = image_features.view(-1, image_features.shape[-1])
+            image_features = self.pre_mm_projector_norm(image_features)
+        if self.vision_args.mm_projector_id == PATCH_MERGE:
+            patch_size = self.vision_args.patch_size
+            img_patch_dims = [
+                (img.shape[-2] // patch_size, img.shape[-1] // patch_size)
+                for img in images
+                for _ in range(img.shape[0])
+            ]
+            image_features = self.patch_merger(
+                image_features, image_sizes=img_patch_dims
+            )
+        image_embeds = self.vision_language_adapter(image_features)
+        return image_embeds
+
+    def forward(self, input_ids, positions, forward_batch):
+        return general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.language_model,
+            multimodal_model=self,
+            positions=positions,
+        )
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor | None:
+        return self.language_model.compute_logits(hidden_states)
+
+    def get_embed_and_head(self):
+        return self.language_model.get_embed_and_head()
+
+
+class PatchMerger(nn.Module):
+    """
+    Learned merging of spatial_merge_size ** 2 patches
+    """
+
+    def __init__(
+        self,
+        vision_encoder_dim: int,
+        spatial_merge_size: int,
+        use_mlp_bias: bool = False,
+    ) -> None:
+        super().__init__()
+
+        mlp_input_dim = vision_encoder_dim * (spatial_merge_size**2)
+
+        self.spatial_merge_size = spatial_merge_size
+        self.mlp_input_dim = mlp_input_dim
+
+        self.merging_layer = nn.Linear(
+            mlp_input_dim,
+            vision_encoder_dim,
+            bias=use_mlp_bias,
+        )
+
+    def forward(
+        self, x: torch.Tensor, image_sizes: list[tuple[int, int]]
+    ) -> torch.Tensor:
+        # image_sizes specified in tokens
+        assert sum([h * w for h, w in image_sizes]) == x.shape[-2]
+
+        # x is (N, vision_encoder_dim)
+        x = self.permute(x, image_sizes)
+
+        # x is (N / spatial_merge_size ** 2,
+        #       vision_encoder_dim * spatial_merge_size ** 2)
+        x = self.merging_layer(x)
+
+        # x is (N / spatial_merge_size ** 2, vision_encoder_dim)
+        return x
+
+    def permute(
+        self,
+        x: torch.Tensor,
+        image_sizes: list[tuple[int, int]],
+    ) -> torch.Tensor:
+        """
+        Args:
+            x: (N, D) where N is flattened and concatenated patch tokens
+                for all images
+            image_sizes: list of tuple of (height, width) in tokens for
+                each image
+        Returns:
+            image_features: reorders patch tokens so each grid of
+                (spatial_merge_size, spatial_merge_size) is contiguous.
+                now (N / spatial_merge_size ** 2, D * spatial_merge_size ** 2)
+        """
+
+        sub_grids = get_sub_grids(
+            x=x, image_sizes=image_sizes, spatial_merge_size=self.spatial_merge_size
+        )  # list of [d x sub_grid_size x sub_grid_size x n_patches]
+        permuted_tensor: list[torch.Tensor] = []
+        for grid in sub_grids:
+            n_patches = grid.shape[-1]
+            permuted_tensor.append(
+                grid.view(-1, n_patches).t()
+            )  # n_patches x d * sub_grid_size * sub_grid_size
+        return torch.cat(
+            permuted_tensor, dim=0
+        )  # (N / spatial_merge_size ** 2, d * spatial_merge_size ** 2)
+
+
+def get_sub_grids(
+    x: torch.Tensor,
+    image_sizes: list[tuple[int, int]],
+    spatial_merge_size: int,
+) -> list[torch.Tensor]:
+    # image_sizes specified in tokens
+    tokens_per_image = [h * w for h, w in image_sizes]
+    d = x.shape[-1]
+    all_img_sub_grids: list[torch.Tensor] = []
+    sub_grid_size = spatial_merge_size
+
+    for image_index, image_tokens in enumerate(x.split(tokens_per_image)):
+        # Reshape image_tokens into a 2D grid
+        h, w = image_sizes[image_index]
+        image_grid = image_tokens.view(h, w, d).permute(2, 0, 1)[
+            None, :, :, :
+        ]  # 1 x d x h x w
+        sub_grids = torch.nn.functional.unfold(
+            image_grid, kernel_size=sub_grid_size, stride=sub_grid_size
+        )
+        sub_grids = sub_grids.view(
+            1, d, sub_grid_size, sub_grid_size, -1
+        )  # 1 x d x sub_grid_size x sub_grid_size x n_patches
+
+        all_img_sub_grids.append(sub_grids[0])
+
+    return all_img_sub_grids
+
+
+class VisionTransformer(nn.Module):
+    def __init__(self, args: VisionEncoderArgs):
+        super().__init__()
+        self.args = args
+        self.patch_conv = nn.Conv2d(
+            in_channels=args.num_channels,
+            out_channels=args.hidden_size,
+            kernel_size=args.patch_size,
+            stride=args.patch_size,
+            bias=False,
+        )
+        self.ln_pre = RMSNorm(args.hidden_size, eps=1e-5)
+        self.transformer = Transformer(args)
+
+        head_dim = self.args.hidden_size // self.args.num_attention_heads
+        assert head_dim % 2 == 0, "ROPE requires even head_dim"
+        self._freqs_cis: torch.Tensor | None = None
+
+    @property
+    def max_patches_per_side(self) -> int:
+        return self.args.image_size // self.args.patch_size
+
+    @property
+    def device(self) -> torch.types.Device:
+        return next(self.parameters()).device
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return next(self.parameters()).dtype
+
+    @property
+    def freqs_cis(self) -> torch.Tensor:
+        if self._freqs_cis is None:
+            self._freqs_cis = precompute_freqs_cis_2d(
+                dim=self.args.hidden_size // self.args.num_attention_heads,
+                height=self.max_patches_per_side,
+                width=self.max_patches_per_side,
+                theta=self.args.rope_theta,
+            )
+
+        if self._freqs_cis.device != self.device:
+            self._freqs_cis = self._freqs_cis.to(device=self.device)
+
+        return self._freqs_cis
+
+    def forward(
+        self,
+        images: list[torch.Tensor],
+    ) -> torch.Tensor:
+        """
+        Args:
+            images: list of N_img images of variable sizes,
+                each of shape (B, C, H, W)
+        Returns:
+            image_features: tensor of token features for
+                all tokens of all images of shape (N_toks, D)
+        """
+        patch_embeds_list = [self.patch_conv(img.to(self.dtype)) for img in images]
+
+        patch_embeds = [p.flatten(2).permute(0, 2, 1) for p in patch_embeds_list]
+
+        patch_embeds = torch.cat(patch_embeds, dim=1)
+        patch_embeds_shape = patch_embeds.shape
+        patch_embeds = patch_embeds.view(-1, patch_embeds_shape[-1])
+        patch_embeds = self.ln_pre(patch_embeds)
+        patch_embeds = patch_embeds.view(patch_embeds_shape)
+
+        # positional embeddings
+        positions = position_meshgrid(patch_embeds_list).to(self.device)
+        freqs_cis = self.freqs_cis[positions[:, 0], positions[:, 1]]
+
+        # pass through Transformer with a block diagonal mask delimiting images
+        if USE_XFORMERS_OPS:
+            from xformers import ops as xops
+
+            mask = xops.fmha.attn_bias.BlockDiagonalMask.from_seqlens(
+                [p.shape[-2] * p.shape[-1] for p in patch_embeds_list],
+            )
+        else:
+            from transformers.models.pixtral.modeling_pixtral import (
+                generate_block_attention_mask,
+            )
+
+            mask = generate_block_attention_mask(
+                [p.shape[-2] * p.shape[-1] for p in patch_embeds_list], patch_embeds
+            )
+        return self.transformer(patch_embeds, mask=mask, freqs_cis=freqs_cis)
+
+
+def position_meshgrid(
+    patch_embeds_list: list[torch.Tensor],
+) -> torch.Tensor:
+    positions = torch.cat(
+        [
+            torch.stack(
+                torch.meshgrid(
+                    torch.arange(p.shape[-2]),
+                    torch.arange(p.shape[-1]),
+                    indexing="ij",
+                ),
+                dim=-1,
+            ).reshape(-1, 2)
+            for p in patch_embeds_list
+        ]
+    )
+    return positions
+
+
+class PixtralHFMLP(nn.Module):
+    """MLP for PixtralHFVisionModel using SGLang components."""
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        *,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        assert config.intermediate_size is not None
+
+        # Use MergedColumnParallelLinear for gate_up_proj to handle combined weights
+        self.gate_up_proj = MergedColumnParallelLinear(
+            input_size=config.hidden_size,
+            output_sizes=[config.intermediate_size, config.intermediate_size],
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
+        )
+
+        self.down_proj = RowParallelLinear(
+            input_size=config.intermediate_size,
+            output_size=config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
+        )
+
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        gate_up_output, _ = self.gate_up_proj(x)
+
+        # Apply SiLU activation and multiply
+        gate_up = self.act_fn(gate_up_output)
+
+        # Project back to hidden size
+        out, _ = self.down_proj(gate_up)
+        return out
+
+
+class VisionLanguageAdapter(nn.Module):
+    def __init__(self, args: VisionEncoderArgs, dim: int):
+        super().__init__()
+        assert isinstance(args, VisionEncoderArgs)
+        self.w_in = nn.Linear(
+            args.hidden_size,
+            dim,
+            bias=args.adapter_bias,
+        )
+        self.gelu = nn.GELU()
+        self.w_out = nn.Linear(dim, dim, bias=args.adapter_bias)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.w_out(self.gelu(self.w_in(x)))
+
+
+class PixtralHFTransformerBlock(nn.Module):
+    """Transformer block for PixtralHFVisionModel using SGLang components."""
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        *,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.layer_id = layer_id
+        self.attention_norm = RMSNorm(config.hidden_size, eps=1e-5)
+
+        # Use SGLang's VisionAttention instead of vLLM's PixtralHFAttention
+        self.attention = VisionAttention(
+            embed_dim=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            projection_size=config.hidden_size,
+            use_qkv_parallel=True,
+            quant_config=quant_config,
+            dropout=0.0,
+            use_context_forward=False,
+            flatten_batch=False,
+            qkv_bias=False,
+            proj_bias=False,
+            prefix=f"{prefix}.attention",
+        )
+
+        self.feed_forward = PixtralHFMLP(
+            config, quant_config=quant_config, prefix=f"{prefix}.feed_forward"
+        )
+
+        self.ffn_norm = RMSNorm(config.hidden_size, eps=1e-5)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor],
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]],
+    ) -> torch.Tensor:
+        # Ensure hidden_states has the batch dimension [batch, seq_len, hidden_dim]
+        batch_size, seq_len, hidden_dim = hidden_states.shape
+
+        # Apply attention norm - normalize along the last dimension
+        attn_normalized = self.attention_norm(hidden_states.view(-1, hidden_dim)).view(
+            batch_size, seq_len, hidden_dim
+        )
+
+        # Pass through attention layer
+        attention_output = self.attention(
+            attn_normalized,
+            attention_mask=attention_mask,
+            cu_seqlens=None,
+            position_embeddings=position_embeddings,
+        )
+
+        # Apply first residual connection
+        hidden_states = hidden_states + attention_output
+
+        # Apply feed-forward norm - normalize along the last dimension
+        ffn_normalized = self.ffn_norm(hidden_states.view(-1, hidden_dim)).view(
+            batch_size, seq_len, hidden_dim
+        )
+
+        # Pass through feed-forward layer
+        # First reshape to 2D for the feed-forward network, then reshape back
+        ffn_output = self.feed_forward(ffn_normalized)
+
+        # Apply second residual connection
+        output = hidden_states + ffn_output
+
+        return output
+
+
+def _reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
+    """
+    freqs_cis: complex - (seq_len, head_dim / 2)
+    x: complex - (bsz, seq_len, head_dim / 2)
+    """
+    ndim = x.ndim
+    assert ndim > 1
+    assert freqs_cis.shape == (x.shape[1], x.shape[-1]), (
+        freqs_cis.shape,
+        (x.shape[1], x.shape[-1]),
+    )
+    shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
+    return freqs_cis.view(*shape)
+
+
+def precompute_freqs_cis_2d(
+    dim: int,
+    height: int,
+    width: int,
+    theta: float,
+) -> torch.Tensor:
+    """
+    freqs_cis: 2D complex tensor of shape (height, width, dim // 2)
+        to be indexed by (height, width) position tuples
+    """
+    # (dim / 2) frequency bases
+    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
+
+    h = torch.arange(height, device=freqs.device)
+    w = torch.arange(width, device=freqs.device)
+
+    freqs_h = torch.outer(h, freqs[::2]).float()
+    freqs_w = torch.outer(w, freqs[1::2]).float()
+    freqs_2d = torch.cat(
+        [
+            freqs_h[:, None, :].repeat(1, width, 1),
+            freqs_w[None, :, :].repeat(height, 1, 1),
+        ],
+        dim=-1,
+    )
+    return torch.polar(torch.ones_like(freqs_2d), freqs_2d)
+
+
+def apply_rotary_emb_vit(
+    xq: torch.Tensor,
+    xk: torch.Tensor,
+    freqs_cis: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
+    xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
+    assert freqs_cis.dtype == torch.complex64
+    freqs_cis = _reshape_for_broadcast(freqs_cis, xq_)
+    xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
+    xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
+    return xq_out.type_as(xq), xk_out.type_as(xk)
+
+
+class FeedForward(nn.Module):
+    def __init__(self, args: VisionEncoderArgs):
+        super().__init__()
+        assert args.intermediate_size is not None
+        self.w1 = nn.Linear(args.hidden_size, args.intermediate_size, bias=False)
+        self.w2 = nn.Linear(args.intermediate_size, args.hidden_size, bias=False)
+        self.w3 = nn.Linear(args.hidden_size, args.intermediate_size, bias=False)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.w2(F.silu(self.w1(x)) * self.w3(x))
+
+
+class Attention(nn.Module):
+    def __init__(self, args: VisionEncoderArgs):
+        super().__init__()
+        self.args = args
+        assert not args.hidden_size % args.num_attention_heads
+        self.n_heads = args.num_attention_heads
+        self.head_dim = args.hidden_size // args.num_attention_heads
+
+        self.wq = nn.Linear(args.hidden_size, args.hidden_size, bias=False)
+        self.wk = nn.Linear(args.hidden_size, args.hidden_size, bias=False)
+        self.wv = nn.Linear(args.hidden_size, args.hidden_size, bias=False)
+        self.wo = nn.Linear(args.hidden_size, args.hidden_size, bias=False)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        mask: torch.Tensor,
+        freqs_cis: torch.Tensor,
+    ) -> torch.Tensor:
+        batch, patches, _ = x.shape
+
+        q, k, v = self.wq(x), self.wk(x), self.wv(x)
+        q = q.reshape(batch, patches, self.n_heads, self.head_dim)
+        k = k.reshape(batch, patches, self.n_heads, self.head_dim)
+        v = v.reshape(batch, patches, self.n_heads, self.head_dim)
+
+        q, k = apply_rotary_emb_vit(q, k, freqs_cis=freqs_cis)
+
+        if USE_XFORMERS_OPS:
+            from xformers import ops as xops
+
+            out = xops.memory_efficient_attention(q, k, v, attn_bias=mask)
+        else:
+            q = q.transpose(1, 2)
+            k = k.transpose(1, 2)
+            v = v.transpose(1, 2)
+            out = nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=mask)
+            out = out.transpose(1, 2)
+
+        out = out.reshape(batch, patches, self.n_heads * self.head_dim)
+        return self.wo(out)
+
+
+class TransformerBlock(nn.Module):
+    def __init__(self, args: VisionEncoderArgs):
+        super().__init__()
+        self.attention = Attention(args)
+        self.feed_forward = FeedForward(args)
+        self.attention_norm = RMSNorm(args.hidden_size, eps=1e-5)
+        self.ffn_norm = RMSNorm(args.hidden_size, eps=1e-5)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        mask: torch.Tensor,
+        freqs_cis: torch.Tensor,
+    ) -> torch.Tensor:
+        attention_norm_x = self.attention_norm(x.view(-1, x.shape[-1]))
+        attention_norm_x = attention_norm_x.view(x.shape)
+        r = self.attention.forward(attention_norm_x, mask=mask, freqs_cis=freqs_cis)
+        h = x + r
+        ffn_norm_h = self.ffn_norm(h.view(-1, h.shape[-1]))
+        ffn_norm_h = ffn_norm_h.view(h.shape)
+        r = self.feed_forward.forward(ffn_norm_h)
+        out = h + r
+        return out
+
+
+class Transformer(nn.Module):
+    def __init__(self, args: VisionEncoderArgs):
+        super().__init__()
+        self.layers = torch.nn.ModuleList()
+        for _ in range(args.num_hidden_layers):
+            self.layers.append(TransformerBlock(args))
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        mask: torch.Tensor,
+        freqs_cis: torch.Tensor | None,
+    ) -> torch.Tensor:
+        for layer in self.layers:
+            x = layer(x, mask=mask, freqs_cis=freqs_cis)
+        return x
+
+
+class PixtralHFTransformer(nn.Module):
+    """Transformer for PixtralHFVisionModel using SGLang components."""
+
+    def __init__(
+        self,
+        config: PixtralVisionConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        *,
+        num_hidden_layers_override: Optional[int] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        num_hidden_layers = config.num_hidden_layers
+        if num_hidden_layers_override is not None:
+            num_hidden_layers = num_hidden_layers_override
+
+        self.layers = nn.ModuleList(
+            [
+                PixtralHFTransformerBlock(
+                    config=config,
+                    layer_id=layer_idx,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.layers.{layer_idx}",
+                )
+                for layer_idx in range(num_hidden_layers)
+            ]
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        attention_mask: Optional[torch.Tensor],
+        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]],
+        return_all_hidden_states: bool = False,
+    ) -> Union[torch.Tensor, List[torch.Tensor]]:
+        """Forward pass through transformer layers.
+
+        Args:
+            x: Input tensor
+            attention_mask: Optional attention mask
+            position_embeddings: Optional position embeddings for rotary attention
+            return_all_hidden_states: Whether to return all hidden states
+
+        Returns:
+            Either the final hidden state, or a list of all hidden states if
+            return_all_hidden_states is True
+        """
+        # For HF model compatibility, always start with the input
+        hidden_states = x
+        all_hidden_states = [hidden_states] if return_all_hidden_states else None
+
+        for i, layer in enumerate(self.layers):
+            hidden_states = layer(hidden_states, attention_mask, position_embeddings)
+            if return_all_hidden_states:
+                all_hidden_states.append(hidden_states)
+
+        if return_all_hidden_states:
+            return all_hidden_states
+        return hidden_states
+
+
+def resolve_visual_encoder_outputs(
+    outputs: Union[torch.Tensor, List[torch.Tensor]],
+    feature_sample_layers: Optional[List[int]],
+    post_norm: Optional[nn.Module],
+    num_hidden_layers: int,
+) -> torch.Tensor:
+    """Resolve outputs from visual encoder based on feature_sample_layers."""
+    if feature_sample_layers is None:
+        # Just use the last layer's output
+        if isinstance(outputs, list):
+            outputs = outputs[-1]
+        if post_norm is not None:
+            outputs = post_norm(outputs)
+        return outputs
+
+    # Handle the case where we want to use specific layers
+    if not isinstance(outputs, list):
+        raise ValueError(
+            "Expected outputs to be a list when feature_sample_layers is provided"
+        )
+
+    # Validate layer indices
+    for layer_idx in feature_sample_layers:
+        if layer_idx < 0 or layer_idx > num_hidden_layers:
+            raise ValueError(
+                f"Feature sample layer index {layer_idx} is out of range "
+                f"[0, {num_hidden_layers}]"
+            )
+
+    # Collect outputs from specified layers
+    selected_outputs = [outputs[layer_idx] for layer_idx in feature_sample_layers]
+
+    # Combine the outputs
+    combined_outputs = torch.cat(selected_outputs, dim=-1)
+
+    if post_norm is not None:
+        combined_outputs = post_norm(combined_outputs)
+
+    return combined_outputs
+
+
+class PixtralHFVisionModel(nn.Module):
+    """Hugging Face Pixtral Vision Model implemented using SGLang components."""
+
+    DEFAULT_IMAGE_TOKEN_ID = 10
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        return self.input_padder.pad_input_tokens(input_ids, mm_inputs)
+
+    def __init__(
+        self,
+        config: PixtralVisionConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        *,
+        num_hidden_layers_override: Optional[int] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+
+        self.patch_conv = nn.Conv2d(
+            in_channels=config.num_channels,
+            out_channels=config.hidden_size,
+            kernel_size=config.patch_size,
+            stride=config.patch_size,
+            bias=False,
+        )
+
+        self.ln_pre = RMSNorm(config.hidden_size, eps=1e-5)
+
+        self.transformer = PixtralHFTransformer(
+            config,
+            quant_config,
+            num_hidden_layers_override=num_hidden_layers_override,
+            prefix=f"{prefix}.transformer",
+        )
+
+        # Check that num_hidden_layers is valid
+        num_hidden_layers = config.num_hidden_layers
+        if len(self.transformer.layers) > config.num_hidden_layers:
+            raise ValueError(
+                f"The original encoder only has {num_hidden_layers} "
+                f"layers, but you requested {len(self.transformer.layers)} "
+                "layers."
+            )
+
+        # Initialize patch position embedding
+        self.patch_positional_embedding = PixtralRotaryEmbedding(config)
+        self.input_padder = MultiModalityDataPaddingPatternMultimodalTokens()
+
+    @property
+    def dtype(self):
+        return next(self.parameters()).dtype
+
+    @property
+    def device(self):
+        return next(self.parameters()).device
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        image_sizes: list[tuple[int, int]],
+        output_hidden_states: bool = False,
+        feature_sample_layers: Optional[list[int]] = None,
+    ) -> Union[torch.Tensor, tuple]:
+        """
+        Args:
+            pixel_values: [batch_size, C, H, W], padded if multiple images
+            image_sizes: list of (H, W) for each image in the batch
+            output_hidden_states: Whether to return all hidden states.
+            feature_sample_layers: Layer indices whose features should be
+                concatenated and used as the visual encoder output. If none
+                are provided, the last layer is used.
+
+        Returns:
+            A tuple containing:
+              - hidden_states: Final model outputs (or selected layers if feature_sample_layers given)
+              - hidden_states tuple (optional): All hidden states if output_hidden_states=True
+        """
+        # batch patch images
+        embeds_orig = self.patch_conv(
+            pixel_values.to(device=self.device, dtype=self.dtype)
+        )
+        # crop the embeddings
+        embeds_2d = [
+            embed[..., : h // self.patch_size, : w // self.patch_size]
+            for embed, (h, w) in zip(embeds_orig, image_sizes)
+        ]
+
+        # flatten to sequence
+        embeds_1d = torch.cat([p.flatten(1).T for p in embeds_2d], dim=0)
+        embeds_featurized = self.ln_pre(embeds_1d).unsqueeze(0)
+
+        # positional embeddings
+        position_ids = position_ids_in_meshgrid(
+            embeds_2d,
+            max_width=self.image_size // self.patch_size,
+        ).to(self.device)
+
+        # The original PixtralRotaryEmbedding expects 2D input but returns a tuple of tensors (cos, sin)
+        # These tensors are used by apply_rotary_pos_emb in the transformer blocks
+        position_embedding = self.patch_positional_embedding(
+            embeds_featurized, position_ids
+        )
+        attention_mask = _get_pixtral_attention_mask(
+            [p.shape[-2] * p.shape[-1] for p in embeds_2d], embeds_featurized
+        )
+
+        return_all_hidden_states = (
+            output_hidden_states or feature_sample_layers is not None
+        )
+
+        transformer_outputs = self.transformer(
+            embeds_featurized,  # add batch dimension
+            attention_mask,
+            position_embedding,
+            return_all_hidden_states=return_all_hidden_states,
+        )
+
+        # Store all hidden states if requested
+        all_hidden_states = None
+        if isinstance(transformer_outputs, list):
+            all_hidden_states = transformer_outputs
+            # Use the last layer by default if feature_sample_layers is not specified
+            if feature_sample_layers is None:
+                out = transformer_outputs[-1]
+            else:
+                # Resolve outputs based on feature sample layers
+                out = resolve_visual_encoder_outputs(
+                    transformer_outputs,
+                    feature_sample_layers,
+                    None,
+                    self.config.num_hidden_layers,
+                )
+        else:
+            out = transformer_outputs
+
+        # Format return to be compatible with HuggingFace vision models
+        if output_hidden_states:
+            return type(
+                "VisualOutput",
+                (),
+                {
+                    "last_hidden_state": out,
+                    "hidden_states": all_hidden_states,
+                },
+            )
+        else:
+            return out
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]:
+        """Load weights from a HuggingFace checkpoint with proper parameter mapping."""
+        params_dict = dict(self.named_parameters())
+
+        # for (param, weight, shard_id): load weight into param as param's shard_id part
+        stacked_params_mapping = [
+            (".attention.qkv_proj", ".attention.q_proj", "q"),
+            (".attention.qkv_proj", ".attention.k_proj", "k"),
+            (".attention.qkv_proj", ".attention.v_proj", "v"),
+            (".feed_forward.gate_up_proj", ".feed_forward.gate_proj", 0),
+            (".feed_forward.gate_up_proj", ".feed_forward.up_proj", 1),
+        ]
+
+        # Process each weight
+        for name, loaded_weight in weights:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name in name:
+                    # Replace the weight name part with the combined parameter name
+                    transformed_name = name.replace(weight_name, param_name)
+                    if transformed_name in params_dict:
+                        param = params_dict[transformed_name]
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        weight_loader(param, loaded_weight, shard_id)
+                        break
+            else:
+                if ".attention.o_proj" in name:
+                    alt_name = name.replace(".attention.o_proj", ".attention.proj")
+                    if alt_name in params_dict:
+                        name = alt_name
+                if name in params_dict:
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+
+
+class PixtralVisionModel(PixtralHFVisionModel):
+    pass
+
+
+# Register the model classes for external access
+EntryClass = [PixtralForConditionalGeneration, PixtralVisionModel]
diff --git a/sglang/python/sglang/srt/models/points_v15_chat.py b/sglang/python/sglang/srt/models/points_v15_chat.py
new file mode 100644
index 0000000000000000000000000000000000000000..79a74ca2c885f591ff1e878abd85d7d8bc4cd72b
--- /dev/null
+++ b/sglang/python/sglang/srt/models/points_v15_chat.py
@@ -0,0 +1,186 @@
+import copy
+from typing import Iterable, List, Optional, Set, Tuple
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from sglang.srt.configs.points_v15_chat import POINTSV15ChatConfig
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternMultimodalTokens,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.qwen2 import Qwen2ForCausalLM
+from sglang.srt.models.qwen2_vl import Qwen2VisionPatchMerger, Qwen2VisionTransformer
+from sglang.srt.utils import add_prefix
+
+
+class Qwen2VisionTransformerForNavitPOINTS(Qwen2VisionTransformer):
+    def __init__(
+        self,
+        vision_config: POINTSV15ChatConfig,
+        norm_eps: float = 1e-6,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(
+            vision_config,
+            norm_eps=norm_eps,
+            quant_config=quant_config,
+            prefix=prefix,
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        grid_thw: torch.Tensor,
+    ) -> torch.Tensor:
+        # patchify
+        x = x.to(device=self.device, dtype=self.dtype)
+        x = self.patch_embed(x)
+
+        # compute position embedding
+        rotary_pos_emb = self.rot_pos_emb(grid_thw)
+        emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+        position_embeddings = (emb.cos(), emb.sin())
+
+        # compute cu_seqlens
+        cu_seqlens = torch.repeat_interleave(
+            grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
+        ).cumsum(dim=0, dtype=torch.int32)
+        cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0)
+
+        # transformers
+        x = x.unsqueeze(1)
+        for blk in self.blocks:
+            x = blk(x, cu_seqlens=cu_seqlens, position_embeddings=position_embeddings)
+
+        return x
+
+
+class POINTSV15ChatModel(nn.Module):
+    def __init__(
+        self,
+        config: POINTSV15ChatConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        **kwargs,
+    ) -> None:
+        super().__init__()
+        config.llm_config._attn_implementation = "flash_attention_2"
+        config._attn_implementation_autoset = False
+        self.config = config
+        self.quant_config = quant_config
+
+        llm_config = copy.deepcopy(config.llm_config)
+        llm_config.architectures = ["Qwen2ForCausalLM"]
+        self.llm = Qwen2ForCausalLM(
+            config=llm_config,
+            quant_config=quant_config,
+            prefix=add_prefix("llm", prefix),
+        )
+
+        self.vision_encoder = Qwen2VisionTransformerForNavitPOINTS(
+            config.vision_config,
+            quant_config=quant_config,
+            prefix=add_prefix("vision_encoder", prefix),
+        )
+
+        self.vision_projector = Qwen2VisionPatchMerger(
+            d_model=config.llm_config.hidden_size,
+            context_dim=1280,
+            quant_config=quant_config,
+            prefix=add_prefix("vision_projector", prefix),
+        )
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+        return pattern.pad_input_tokens(input_ids, mm_inputs)
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        pixel_values = torch.cat([item.feature for item in items], dim=0).type(
+            self.vision_encoder.dtype
+        )
+        image_grid_thw = torch.concat([item.image_grid_thw for item in items], dim=0)
+
+        assert pixel_values.dim() == 2, pixel_values.dim()
+        assert image_grid_thw.dim() == 2, image_grid_thw.dim()
+
+        image_features = self.vision_encoder(pixel_values, grid_thw=image_grid_thw)
+        image_features = self.vision_projector(image_features)
+        return image_features
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        get_embedding: bool = False,
+    ):
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.llm,
+            data_embedding_funcs={
+                Modality.IMAGE: self.get_image_feature,
+            },
+            positions=positions,
+        )
+
+        return hidden_states
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        loaded_params: Set[str] = set()
+
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if "vision_encoder" in name:
+                    # adapt to VisionAttention
+                    name = name.replace(r"attn.qkv.", r"attn.qkv_proj.")
+
+                try:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                except KeyError:
+                    print(params_dict.keys())
+                    raise
+
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = [POINTSV15ChatModel]
diff --git a/sglang/python/sglang/srt/models/qwen.py b/sglang/python/sglang/srt/models/qwen.py
new file mode 100644
index 0000000000000000000000000000000000000000..206908b49001598816b6bd46542d7eddf391d42e
--- /dev/null
+++ b/sglang/python/sglang/srt/models/qwen.py
@@ -0,0 +1,355 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/qwen.py#L1
+
+from typing import Any, Dict, Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix
+
+
+class QWenMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str = "silu",
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            2 * [intermediate_size],
+            bias=False,
+            gather_output=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.c_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            input_is_parallel=True,
+            quant_config=quant_config,
+            prefix=add_prefix("c_proj", prefix),
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.c_proj(x)
+        return x
+
+
+class QWenAttention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        max_position_embeddings: int,
+        layer_id: int = 0,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = hidden_size
+        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tensor_model_parallel_world_size == 0
+        self.num_heads = self.total_num_heads // tensor_model_parallel_world_size
+        self.head_dim = hidden_size // self.total_num_heads
+
+        # pylint: disable=invalid-name
+        self.c_attn = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("c_attn", prefix),
+        )
+        self.c_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            input_is_parallel=True,
+            quant_config=quant_config,
+            prefix=add_prefix("c_proj", prefix),
+        )
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
+        self.scaling = self.head_dim**-0.5
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.c_attn(hidden_states)
+        q, k, v = qkv.chunk(chunks=3, dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.c_proj(attn_output)
+        return output
+
+
+class QWenBlock(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.ln_1 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
+
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        self.attn = QWenAttention(
+            config.hidden_size,
+            config.num_attention_heads,
+            config.max_position_embeddings,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+        self.ln_2 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
+
+        self.mlp = QWenMLP(
+            config.hidden_size,
+            config.intermediate_size // 2,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        # Self Attention
+        residual = hidden_states
+        hidden_states = self.ln_1(hidden_states)
+        hidden_states = self.attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.ln_2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+class QWenModel(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.vocab_size = config.vocab_size
+
+        vocab_size = ((config.vocab_size + 63) // 64) * 64
+        self.wte = VocabParallelEmbedding(
+            vocab_size,
+            config.hidden_size,
+            prefix=add_prefix("wte", prefix),
+        )
+        self.h = nn.ModuleList(
+            [
+                QWenBlock(
+                    config,
+                    i,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"h.{i}", prefix),
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.ln_f = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.wte(input_ids)
+        for i in range(len(self.h)):
+            layer = self.h[i]
+            hidden_states = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+            )
+        hidden_states = self.ln_f(hidden_states)
+        return hidden_states
+
+
+class QWenLMHeadModel(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.transformer = QWenModel(
+            config, quant_config=quant_config, prefix=add_prefix("transformer", prefix)
+        )
+        vocab_size = ((config.vocab_size + 63) // 64) * 64
+        self.lm_head = ParallelLMHead(
+            vocab_size, config.hidden_size, prefix=add_prefix("lm_head", prefix)
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ):
+        hidden_states = self.transformer(input_ids, positions, forward_batch)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    @torch.no_grad()
+    def forward_split_prefill(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        split_interval: Tuple[int, int],  # [start, end) 0-based
+    ):
+        start, end = split_interval
+        # embed
+        if start == 0:
+            forward_batch.hidden_states = self.transformer.wte(input_ids)
+
+        # decoder layer
+        for i in range(start, end):
+            layer = self.transformer.h[i]
+            forward_batch.hidden_states = layer(
+                positions,
+                forward_batch.hidden_states,
+                forward_batch,
+            )
+
+        if end == self.transformer.config.num_hidden_layers:
+            # norm
+            forward_batch.hidden_states = self.transformer.ln_f(
+                forward_batch.hidden_states
+            )
+            # logits process
+            result = self.logits_processor(
+                input_ids, forward_batch.hidden_states, self.lm_head, forward_batch
+            )
+        else:
+            result = None
+
+        return result
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("gate_up_proj", "w2", 0),
+            ("gate_up_proj", "w1", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = QWenLMHeadModel
diff --git a/sglang/python/sglang/srt/models/qwen2.py b/sglang/python/sglang/srt/models/qwen2.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3cfde4d6301ad45899492b74816a2d9c72aa78a
--- /dev/null
+++ b/sglang/python/sglang/srt/models/qwen2.py
@@ -0,0 +1,654 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from llama2.py
+# Modify details for the adaptation of Qwen2 model.
+"""Inference-only Qwen2 model compatible with HuggingFace weights."""
+
+import logging
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+
+from sglang.srt.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.dp_attention import is_dp_attention_enabled
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer, get_layer_id
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    kv_cache_scales_loader,
+)
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import add_prefix, make_layers
+
+Qwen2Config = None
+
+
+logger = logging.getLogger(__name__)
+
+
+class Qwen2MLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        if get_global_server_args().rl_on_policy_target is not None:
+            x = x.bfloat16()
+
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class Qwen2Attention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        head_dim: Optional[int] = None,
+        layer_id: int = 0,
+        rope_theta: float = 1000000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 32768,
+        quant_config: Optional[QuantizationConfig] = None,
+        dual_chunk_attention_config: Optional[dict[str, Any]] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        if head_dim is not None:
+            self.head_dim = head_dim
+        else:
+            self.head_dim = hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            dual_chunk_attention_config=dual_chunk_attention_config,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class Qwen2DecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: Qwen2Config,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 1000000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 32768)
+        head_dim = getattr(config, "head_dim", None)
+        dual_chunk_attention_config = getattr(
+            config, "dual_chunk_attention_config", None
+        )
+        self.self_attn = Qwen2Attention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            head_dim=head_dim,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            dual_chunk_attention_config=dual_chunk_attention_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.mlp = Qwen2MLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+class Qwen2Model(nn.Module):
+    def __init__(
+        self,
+        config: Qwen2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        decoder_layer_type: type[nn.Module] = Qwen2DecoderLayer,
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.pp_group = get_pp_group()
+
+        if self.pp_group.is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                use_attn_tp_group=is_dp_attention_enabled(),
+                prefix=add_prefix("embed_tokens", prefix),
+                params_dtype=(
+                    torch.float32
+                    if get_global_server_args().rl_on_policy_target is not None
+                    else None
+                ),
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        # Use the provided decoder layer type or default to Qwen2DecoderLayer
+        decoder_layer_type = decoder_layer_type or Qwen2DecoderLayer
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: decoder_layer_type(
+                layer_id=idx,
+                config=config,
+                quant_config=quant_config,
+                prefix=prefix,
+                alt_stream=alt_stream,
+            ),
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix=add_prefix("layers", prefix),
+        )
+        if self.pp_group.is_last_rank:
+            norm_kwargs = (
+                dict(
+                    weight_dtype=torch.float32,
+                    cast_x_before_out_mul=True,
+                    override_orig_dtype=torch.float32,
+                    fp32_residual=True,
+                )
+                if get_global_server_args().rl_on_policy_target is not None
+                else {}
+            )
+            self.norm = RMSNorm(
+                config.hidden_size, eps=config.rms_norm_eps, **norm_kwargs
+            )
+        else:
+            self.norm = PPMissingLayer(return_tuple=True)
+
+        # For EAGLE3 support
+        self.layers_to_capture = []
+
+    def get_input_embedding(self, input_ids: torch.Tensor) -> torch.Tensor:
+        if hasattr(self.config, "scale_emb"):
+            return self.get_input_embeddings()(input_ids) * self.config.scale_emb
+        else:
+            return self.get_input_embeddings()(input_ids)
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.embed_tokens
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[torch.Tensor, PPProxyTensors]:
+
+        if self.pp_group.is_first_rank:
+            if input_embeds is None:
+                hidden_states = self.embed_tokens(input_ids)
+            else:
+                hidden_states = input_embeds
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+
+        aux_hidden_states = []
+        for i in range(self.start_layer, self.end_layer):
+            if i in self.layers_to_capture:
+                aux_hidden_states.append(
+                    hidden_states + residual if residual is not None else hidden_states
+                )
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors(
+                {
+                    "hidden_states": hidden_states,
+                    "residual": residual,
+                }
+            )
+        else:
+            if hidden_states.shape[0] != 0:
+                if residual is None:
+                    hidden_states = self.norm(hidden_states)
+                else:
+                    hidden_states, _ = self.norm(hidden_states, residual)
+
+        if len(aux_hidden_states) == 0:
+            return hidden_states
+
+        return hidden_states, aux_hidden_states
+
+    # If this function is called, it should always initialize KV cache scale
+    # factors (or else raise an exception). Thus, handled exceptions should
+    # make sure to leave KV cache scale factors in a known good (dummy) state
+    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+        tp_size = get_tensor_model_parallel_world_size()
+        tp_rank = get_tensor_model_parallel_rank()
+        for layer_idx, scaling_factor in kv_cache_scales_loader(
+            quantization_param_path,
+            tp_rank,
+            tp_size,
+            self.config.num_hidden_layers,
+            self.config.__class__.model_type,
+        ):
+            if not isinstance(self.layers[layer_idx], nn.Identity):
+                layer_self_attn = self.layers[layer_idx].self_attn
+            if hasattr(layer_self_attn.attn, "k_scale"):
+                layer_self_attn.attn.k_scale = scaling_factor
+                layer_self_attn.attn.v_scale = scaling_factor
+            else:
+                raise RuntimeError(
+                    "Self attention has no KV cache scaling " "factor attribute!"
+                )
+
+
+class Qwen2ForCausalLM(nn.Module):
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    def __init__(
+        self,
+        config: Qwen2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = Qwen2Model(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+
+        # handle the lm head on different pp ranks
+        if self.pp_group.is_last_rank:
+            if self.pp_group.world_size == 1 and config.tie_word_embeddings:
+                self.lm_head = self.model.embed_tokens
+            else:
+                self.lm_head = ParallelLMHead(
+                    config.vocab_size,
+                    config.hidden_size,
+                    quant_config=quant_config,
+                    prefix=add_prefix("lm_head", prefix),
+                )
+        else:
+            # ranks other than the last rank will have a placeholder layer
+            self.lm_head = PPMissingLayer()
+
+        self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+        # For EAGLE3 support
+        self.capture_aux_hidden_states = False
+
+    def get_input_embedding(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embedding(input_ids)
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.embed_tokens
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = False,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            input_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+        aux_hidden_states = None
+        if self.capture_aux_hidden_states:
+            hidden_states, aux_hidden_states = hidden_states
+
+        if self.pp_group.is_last_rank:
+            if not get_embedding:
+                return self.logits_processor(
+                    input_ids,
+                    hidden_states,
+                    self.lm_head,
+                    forward_batch,
+                    aux_hidden_states,
+                )
+            else:
+                return self.pooler(hidden_states, forward_batch)
+        else:
+            return hidden_states
+
+    @torch.no_grad()
+    def forward_split_prefill(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        split_interval: Tuple[int, int],  # [start, end) 0-based
+        input_embeds: torch.Tensor = None,
+    ):
+        start, end = split_interval
+        # embed
+        if start == 0:
+            if input_embeds is None:
+                forward_batch.hidden_states = self.model.embed_tokens(input_ids)
+            else:
+                forward_batch.hidden_states = input_embeds
+        # decoder layer
+        for i in range(start, end):
+            layer = self.model.layers[i]
+            forward_batch.hidden_states, forward_batch.residual = layer(
+                positions,
+                forward_batch.hidden_states,
+                forward_batch,
+                forward_batch.residual,
+            )
+
+        if end == self.model.config.num_hidden_layers:
+            # norm
+            hidden_states, _ = self.model.norm(
+                forward_batch.hidden_states, forward_batch.residual
+            )
+            forward_batch.hidden_states = hidden_states
+            # logits process
+            result = self.logits_processor(
+                input_ids, forward_batch.hidden_states, self.lm_head, forward_batch
+            )
+        else:
+            result = None
+
+        return result
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            layer_id = get_layer_id(name)
+            if (
+                layer_id is not None
+                and hasattr(self.model, "start_layer")
+                and (
+                    layer_id < self.model.start_layer
+                    or layer_id >= self.model.end_layer
+                )
+            ):
+                continue
+
+            if name == "model.embed_tokens.weight":
+                if self.pp_group.is_last_rank and self.config.tie_word_embeddings:
+                    if "lm_head.weight" in params_dict:
+                        param = params_dict["lm_head.weight"]
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        weight_loader(param, loaded_weight)
+
+            if "rotary_emb.inv_freq" in name or "projector" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            if name.startswith("model.vision_tower") and name not in params_dict:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                if name in params_dict.keys():
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+                else:
+                    logger.warning(f"Parameter {name} not found in params_dict")
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        del self.lm_head.weight
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+        self.model.load_kv_cache_scales(quantization_param_path)
+
+    def set_eagle3_layers_to_capture(self, layer_ids: Optional[List[int]] = None):
+        if not self.pp_group.is_last_rank:
+            return
+
+        self.capture_aux_hidden_states = True
+        if layer_ids is None:
+            num_layers = self.config.num_hidden_layers
+            self.model.layers_to_capture = [
+                2,
+                num_layers // 2,
+                num_layers - 3,
+            ]  # Specific layers for EAGLE3 support
+        else:
+            self.model.layers_to_capture = [val + 1 for val in layer_ids]
+
+
+EntryClass = Qwen2ForCausalLM
diff --git a/sglang/python/sglang/srt/models/qwen2_5_vl.py b/sglang/python/sglang/srt/models/qwen2_5_vl.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b6b2b17d59c7ae0247498e64c21876db5dcda77
--- /dev/null
+++ b/sglang/python/sglang/srt/models/qwen2_5_vl.py
@@ -0,0 +1,889 @@
+# coding=utf-8
+# Adapted from
+# https://github.com/huggingface/transformers/blob/19e6e80e10118f855137b90740936c0b11ac397f/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
+# Copyright 2024 The Qwen team.
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only Qwen2-VL model compatible with HuggingFace weights."""
+
+import logging
+import re
+from functools import partial
+from typing import Iterable, List, Optional, Tuple, Type
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+from transformers.activations import ACT2FN
+from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import (
+    Qwen2_5_VLConfig,
+    Qwen2_5_VLVisionConfig,
+)
+from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
+    Qwen2_5_VisionPatchEmbed,
+    Qwen2_5_VisionRotaryEmbedding,
+)
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.distributed.parallel_state import get_pp_group
+from sglang.srt.environ import envs
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.attention.vision import VisionAttention
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.utils import PPMissingLayer, get_layer_id
+from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternMultimodalTokens,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.qwen2 import Qwen2Model
+from sglang.srt.models.utils import RotaryPosMixin, WeightsMapper, permute_inv
+from sglang.srt.multimodal.mm_utils import run_dp_sharded_mrope_vision_model
+from sglang.srt.multimodal.vit_cuda_graph_runner import ViTCudaGraphRunner
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import add_prefix, is_cuda, is_npu
+
+_is_cuda = is_cuda()
+
+logger = logging.getLogger(__name__)
+
+
+class Qwen2_5_VLMLP(nn.Module):
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: int = None,
+        bias: bool = True,
+        hidden_act="silu",
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        use_data_parallel: bool = False,
+    ):
+        super().__init__()
+        self.tp_size = (
+            1 if use_data_parallel else get_tensor_model_parallel_world_size()
+        )
+        self.tp_rank = 0 if use_data_parallel else get_tensor_model_parallel_rank()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            input_size=in_features,
+            output_sizes=[hidden_features] * 2,  # [gate_proj, up_proj]
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+            tp_size=self.tp_size,
+            tp_rank=self.tp_rank,
+        )
+        self.down_proj = RowParallelLinear(
+            hidden_features,
+            in_features,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+            tp_size=self.tp_size,
+            tp_rank=self.tp_rank,
+        )
+        self.hidden_act = hidden_act
+        if self.hidden_act == "silu":
+            self.act = SiluAndMul()
+        else:
+            base_act = ACT2FN[self.hidden_act]
+
+            def _act_fn(x: torch.Tensor) -> torch.Tensor:
+                gate, up = x.chunk(2, dim=-1)
+                return base_act(gate) * up
+
+            self.act = _act_fn
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act(gate_up)
+        x_down, _ = self.down_proj(x)
+        return x_down
+
+
+class Qwen2_5_VisionBlock(nn.Module):
+
+    def __init__(
+        self,
+        dim: int,
+        intermediate_dim: int,
+        num_heads: int,
+        hidden_act="silu",
+        norm_layer: Type[nn.Module] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        num_dummy_heads: int = 0,
+        rms_norm_eps: float = 1e-6,
+        use_data_parallel: bool = False,
+    ) -> None:
+        super().__init__()
+        self.norm1 = RMSNorm(dim, eps=rms_norm_eps)
+        self.norm2 = RMSNorm(dim, eps=rms_norm_eps)
+
+        self.attn = VisionAttention(
+            embed_dim=dim,
+            num_heads=num_heads,
+            projection_size=dim,
+            use_qkv_parallel=True,
+            proj_bias=True,
+            flatten_batch=True,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+            num_dummy_heads=num_dummy_heads,
+            use_data_parallel=use_data_parallel,
+        )
+        self.mlp = Qwen2_5_VLMLP(
+            dim,
+            intermediate_dim,
+            hidden_act=hidden_act,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+            use_data_parallel=use_data_parallel,
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        position_embeddings: torch.Tensor,
+        output_ws=None,
+    ) -> torch.Tensor:
+        S, B, H = x.shape
+        # norm1: flatten to 2D -> [S*B, H], then reshape back
+        x2d = x.reshape(-1, H)
+        hidden_states = self.norm1(x2d).reshape(S, B, H)
+
+        # Attention expects [B, S, H]
+        hidden_states = rearrange(hidden_states, "s b h -> b s h")
+        attn = self.attn(
+            hidden_states,
+            cu_seqlens=cu_seqlens,
+            position_embeddings=position_embeddings,
+            output_ws=output_ws,
+        )
+        attn = rearrange(attn, "b s h -> s b h")
+
+        # norm2 with fused residual-add: also 2D
+        attn2d = attn.reshape(-1, H)
+        x_norm_2d, x_after_add_2d = self.norm2(x2d, residual=attn2d)
+        x_norm = x_norm_2d.reshape(S, B, H)
+        x_after_add = x_after_add_2d.reshape(S, B, H)
+
+        # MLP and final residual
+        mlp_out = self.mlp(x_norm)
+        x = x_after_add + mlp_out
+        return x
+
+
+class Qwen2_5_VisionPatchMerger(nn.Module):
+
+    def __init__(
+        self,
+        dim: int,
+        context_dim: int,
+        spatial_merge_size: int = 2,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        use_data_parallel: bool = False,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = context_dim * (spatial_merge_size**2)
+        self.ln_q = RMSNorm(context_dim, eps=1e-6)
+        tp_size = 1 if use_data_parallel else get_tensor_model_parallel_world_size()
+        tp_rank = 0 if use_data_parallel else get_tensor_model_parallel_rank()
+        self.mlp = nn.ModuleList(
+            [
+                ColumnParallelLinear(
+                    self.hidden_size,
+                    self.hidden_size,
+                    bias=True,
+                    quant_config=quant_config,
+                    prefix=add_prefix("mlp.0", prefix),
+                    tp_size=tp_size,
+                    tp_rank=tp_rank,
+                ),
+                nn.GELU(),
+                RowParallelLinear(
+                    self.hidden_size,
+                    dim,
+                    bias=True,
+                    quant_config=quant_config,
+                    prefix=add_prefix("mlp.2", prefix),
+                    tp_size=tp_size,
+                    tp_rank=tp_rank,
+                ),
+            ]
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # x expected shape: [S, B, context_dim]
+        S, B, D = x.shape
+        x2d = x.reshape(-1, D)
+        x2d = self.ln_q(x2d)  # RMSNorm expects 2D
+        x2d = x2d.view(-1, self.hidden_size)  # group into spatial_merge_unit
+        mlp_fc1, mlp_act, mlp_fc2 = self.mlp
+        x_parallel, _ = mlp_fc1(x2d)
+        x_parallel = mlp_act(x_parallel)
+        out, _ = mlp_fc2(x_parallel)
+        return out
+
+
+class Qwen2_5_VisionTransformer(nn.Module, RotaryPosMixin):
+
+    def __init__(
+        self,
+        vision_config: Qwen2_5_VLVisionConfig,
+        norm_eps: float = 1e-6,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        use_data_parallel: bool = False,
+        max_context_len: Optional[int] = None,
+    ) -> None:
+        super().__init__()
+
+        patch_size: int = vision_config.patch_size
+        temporal_patch_size: int = vision_config.temporal_patch_size
+        spatial_merge_size: int = vision_config.spatial_merge_size
+        self.spatial_merge_size = spatial_merge_size
+        self.spatial_merge_unit: int = spatial_merge_size * spatial_merge_size
+        in_channels: int = vision_config.in_channels
+        hidden_size: int = vision_config.hidden_size
+        depth: int = vision_config.depth
+        num_heads: int = vision_config.num_heads
+        self.fullatt_block_indexes = vision_config.fullatt_block_indexes
+        self.window_size = vision_config.window_size
+        self.patch_size = vision_config.patch_size
+        mlp_hidden_size: int = ((vision_config.intermediate_size + 7) // 8) * 8
+        self.use_data_parallel = use_data_parallel
+        self.out_hidden_size = vision_config.out_hidden_size
+        self.patch_embed = Qwen2_5_VisionPatchEmbed(
+            patch_size=patch_size,
+            temporal_patch_size=temporal_patch_size,
+            in_channels=in_channels,
+            embed_dim=hidden_size,
+        )
+
+        norm_layer = partial(nn.LayerNorm, eps=norm_eps)
+        head_dim = hidden_size // num_heads
+        self.rotary_pos_emb = Qwen2_5_VisionRotaryEmbedding(head_dim // 2)
+        self.blocks = nn.ModuleList(
+            [
+                Qwen2_5_VisionBlock(
+                    dim=hidden_size,
+                    intermediate_dim=mlp_hidden_size,
+                    num_heads=num_heads,
+                    hidden_act=vision_config.hidden_act,
+                    norm_layer=norm_layer,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"blocks.{i}", prefix),
+                    use_data_parallel=use_data_parallel,
+                )
+                for i in range(depth)
+            ]
+        )
+        self.merger = Qwen2_5_VisionPatchMerger(
+            dim=vision_config.out_hidden_size,
+            context_dim=hidden_size,
+            spatial_merge_size=spatial_merge_size,
+            quant_config=quant_config,
+            prefix=add_prefix("merger", prefix),
+            use_data_parallel=use_data_parallel,
+        )
+
+        # Resource prepared for vit cuda graph
+        self.tp_size = (
+            1 if use_data_parallel else get_tensor_model_parallel_world_size()
+        )
+        self.max_context_len = max_context_len
+        self.enable_cg = _is_cuda and envs.SGLANG_VIT_ENABLE_CUDA_GRAPH.get()
+
+        self.cuda_graph_runner: Optional[ViTCudaGraphRunner] = None
+        if self.enable_cg:
+            self.cuda_graph_runner = ViTCudaGraphRunner(self)
+
+    def get_window_index(self, grid_thw):
+        cu_window_seqlens: list = [0]
+        window_index_id = 0
+        vit_merger_window_size = (
+            self.window_size // self.spatial_merge_size // self.patch_size
+        )
+        window_index: list = []
+        for grid_t, grid_h, grid_w in grid_thw:
+            llm_grid_h, llm_grid_w = (
+                grid_h // self.spatial_merge_size,
+                grid_w // self.spatial_merge_size,
+            )
+            index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape(
+                grid_t, llm_grid_h, llm_grid_w
+            )
+            pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size
+            pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size
+            num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size
+            num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size
+            index_padded = F.pad(index, (0, pad_w, 0, pad_h), "constant", -100)
+            index_padded = index_padded.reshape(
+                grid_t,
+                num_windows_h,
+                vit_merger_window_size,
+                num_windows_w,
+                vit_merger_window_size,
+            )
+            index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape(
+                grid_t,
+                num_windows_h * num_windows_w,
+                vit_merger_window_size,
+                vit_merger_window_size,
+            )
+            seqlens = (index_padded != -100).sum([2, 3]).reshape(-1)
+            index_padded = index_padded.reshape(-1)
+            index_new = index_padded[index_padded != -100]
+            window_index.append(index_new + window_index_id)
+            cu_seqlens_tmp = (
+                seqlens.cumsum(0) * self.spatial_merge_unit + cu_window_seqlens[-1]
+            )
+            cu_window_seqlens.extend(cu_seqlens_tmp.tolist())
+            window_index_id += (grid_t * llm_grid_h * llm_grid_w).item()
+        window_index = torch.cat(window_index, dim=0)
+        return window_index, cu_window_seqlens
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return self.patch_embed.proj.weight.dtype
+
+    @property
+    def device(self) -> torch.device:
+        return self.patch_embed.proj.weight.device
+
+    def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor:
+        pos_ids = []
+        for t, h, w in grid_thw:
+            base = self.rot_pos_ids(h, w, self.spatial_merge_size)
+            pos_ids.append(base if t == 1 else base.repeat(t, 1))
+
+        pos_ids = torch.cat(pos_ids, dim=0)
+        max_grid_size = grid_thw[:, 1:].max()
+        rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
+        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
+        return rotary_pos_emb
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        grid_thw: torch.Tensor,
+    ) -> torch.Tensor:
+        if self.enable_cg:
+            return self.forward_with_cuda_graph(x, grid_thw)
+
+        # patchify
+        x = x.to(device=self.device, dtype=self.dtype)
+        x = self.patch_embed(x)
+
+        # compute position embedding
+        rotary_pos_emb = self.rot_pos_emb(grid_thw)
+
+        window_index, cu_window_seqlens = self.get_window_index(grid_thw)
+        cu_window_seqlens = torch.tensor(
+            cu_window_seqlens,
+            device=x.device,
+            dtype=torch.int32,
+        )
+        cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens)
+
+        # Move window_index to the same device as x before using it to index x
+        window_index = window_index.to(device=x.device)
+        reverse_indices = permute_inv(window_index)
+
+        # Ensure rotary_pos_emb is on the same device/dtype as x
+        rotary_pos_emb = rotary_pos_emb.to(device=x.device, dtype=x.dtype)
+
+        seq_len, _ = x.size()
+
+        x = x.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
+        x = x[window_index, :, :]
+        x = x.reshape(seq_len, -1)
+        rotary_pos_emb = rotary_pos_emb.reshape(
+            seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1
+        )
+        rotary_pos_emb = rotary_pos_emb[window_index, :, :]
+        rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1)
+        emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+        position_embeddings = (emb.cos(), emb.sin())
+        # After building position_embeddings, make sure both cos and sin are on the same device/dtype as the attention input
+        position_embeddings = (
+            position_embeddings[0].to(x.device, x.dtype),
+            position_embeddings[1].to(x.device, x.dtype),
+        )
+
+        # compute cu_seqlens - move cu_seqlens to GPU and make it int32
+        cu_seqlens = torch.cat(
+            [
+                torch.tensor([0], device=x.device, dtype=torch.int32),
+                (grid_thw[:, 0] * grid_thw[:, 1] * grid_thw[:, 2])
+                .cumsum(dim=0)
+                .to(device=x.device, dtype=torch.int32),
+            ]
+        )
+        cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens])
+        # cu_seqlens must be on cpu because of npu_flash_attention_unpad operator restriction
+        if is_npu():
+            cu_seqlens = cu_seqlens.to("cpu")
+        # transformers
+        x = x.unsqueeze(1)
+        for layer_num, blk in enumerate(self.blocks):
+            fullatt_indexes = self.fullatt_block_indexes
+            if isinstance(fullatt_indexes, torch.Tensor):
+                fullatt_indexes = fullatt_indexes.tolist()
+            if layer_num in fullatt_indexes:
+                cu_seqlens_now = cu_seqlens
+            else:
+                cu_seqlens_now = cu_window_seqlens
+            x = blk(
+                x, cu_seqlens=cu_seqlens_now, position_embeddings=position_embeddings
+            )
+
+        # adapter
+        x = self.merger(x)
+        x = x[reverse_indices, :]
+
+        return x
+
+    def forward_with_cuda_graph(
+        self,
+        x: torch.Tensor,
+        grid_thw: torch.Tensor,
+    ) -> torch.Tensor:
+        # patchify
+        x = x.to(device=self.device, dtype=self.dtype)
+        x = self.patch_embed(x)
+
+        # compute position embedding
+        rotary_pos_emb = self.rot_pos_emb(grid_thw)
+
+        window_index, cu_window_seqlens = self.get_window_index(grid_thw)
+        cu_window_seqlens = torch.tensor(
+            cu_window_seqlens,
+            device=x.device,
+            dtype=torch.int32,
+        )
+        cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens)
+
+        window_index = window_index.to(device=x.device)
+        reverse_indices = permute_inv(window_index)
+        rotary_pos_emb = rotary_pos_emb.to(device=x.device, dtype=x.dtype)
+
+        # patch token num
+        seq_len, _ = x.size()
+
+        # [G, M, hidden]
+        x = x.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
+        x = x[window_index, :, :]  # [G, M, hidden]
+        x = x.reshape(seq_len, -1)  # [seq_len, hidden]
+
+        rotary_pos_emb = rotary_pos_emb.reshape(
+            seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1
+        )
+        rotary_pos_emb = rotary_pos_emb[window_index, :, :]
+        rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1)
+
+        emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+        position_embeddings = (emb.cos(), emb.sin())
+        # After building position_embeddings, make sure both cos and sin are on
+        # the same device/dtype as the attention input
+        position_embeddings = (
+            position_embeddings[0].to(x.device, x.dtype),
+            position_embeddings[1].to(x.device, x.dtype),
+        )
+
+        # compute cu_seqlens - move cu_seqlens to GPU and make it int32
+        cu_seqlens = torch.cat(
+            [
+                torch.tensor([0], device=x.device, dtype=torch.int32),
+                (grid_thw[:, 0] * grid_thw[:, 1] * grid_thw[:, 2])
+                .cumsum(dim=0)
+                .to(device=x.device, dtype=torch.int32),
+            ]
+        )
+        cu_seqlens = torch.cat([cu_seqlens.new_zeros(1), cu_seqlens])
+
+        return self.cuda_graph_runner.run(
+            x=x,
+            position_embeddings=position_embeddings,
+            cu_seqlens=cu_seqlens,
+            cu_window_seqlens=cu_window_seqlens,
+            output_indices=reverse_indices,
+        )
+
+
+class Qwen2_5_VLForConditionalGeneration(nn.Module):
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_up_proj.",
+        ".down_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    packed_modules_mapping = {
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
+    # To ensure correct weight loading and mapping.
+    hf_to_sglang_mapper = WeightsMapper(
+        orig_to_new_substr={
+            "attn.qkv": "attn.qkv_proj",
+        },
+        orig_to_new_prefix={
+            # mapping for new names in checkpoint saved after transformers v4.52
+            "model.language_model.": "language_model.model.",
+            "model.visual.": "visual.",
+            # mapping for original checkpoint
+            "lm_head.": "language_model.lm_head.",
+            "model.": "language_model.model.",
+        },
+    )
+
+    def __init__(
+        self,
+        config: Qwen2_5_VLConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.use_data_parallel = get_global_server_args().mm_enable_dp_encoder
+
+        if not self.config.encoder_only:
+            self.model = Qwen2Model(
+                config,
+                quant_config,
+                prefix=add_prefix("model", prefix),
+            )
+
+            if self.pp_group.is_last_rank:
+                if self.pp_group.world_size == 1 and self.config.tie_word_embeddings:
+                    self.lm_head = self.model.embed_tokens
+                else:
+                    self.lm_head = ParallelLMHead(
+                        self.config.vocab_size,
+                        self.config.hidden_size,
+                        quant_config=quant_config,
+                        prefix=add_prefix("lm_head", prefix),
+                    )
+            else:
+                # ranks other than the last rank will have a placeholder layer
+                self.lm_head = PPMissingLayer()
+        else:
+            # encoder_only mode: no language model, so no lm_head needed
+            self.lm_head = None
+
+        self.visual = Qwen2_5_VisionTransformer(
+            config.vision_config,
+            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+            # NOTE: Qwen2_5-VL vision encoder currently supports BitsAndBytes 4-bit quantization.
+            # Other quantization methods (e.g., GPTQ, AWQ) are untested and may not be supported.
+            quant_config=quant_config,
+            prefix=add_prefix("visual", prefix),
+            use_data_parallel=self.use_data_parallel,
+            max_context_len=self.config.max_position_embeddings,
+        )
+
+        self.is_mrope_enabled = "mrope_section" in self.config.rope_scaling
+
+        self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+
+        # For EAGLE3 support
+        self.capture_aux_hidden_states = False
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+        return pattern.pad_input_tokens(input_ids, mm_inputs)
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        # in qwen-vl, last dim is the same
+        pixel_values = torch.cat([item.feature for item in items], dim=0).type(
+            self.visual.dtype
+        )
+        image_grid_thw = torch.concat([item.image_grid_thw for item in items], dim=0)
+
+        expected_dim = getattr(self.visual, "embed_dim", -1)
+
+        if expected_dim == -1:
+            vision_conf = self.config.vision_config
+            expected_dim = getattr(
+                vision_conf, "embed_dim", getattr(vision_conf, "hidden_size", -1)
+            )
+
+        raw_patch_dim = 1176
+
+        if pixel_values.dim() == 2:
+            current_dim = pixel_values.shape[-1]
+            if current_dim == expected_dim:
+                return pixel_values
+            if current_dim != raw_patch_dim:
+
+                return pixel_values
+
+        assert pixel_values.dim() == 2, pixel_values.dim()
+        assert image_grid_thw.dim() == 2, image_grid_thw.dim()
+        if self.use_data_parallel:
+            return run_dp_sharded_mrope_vision_model(
+                self.visual, pixel_values, image_grid_thw.tolist(), rope_type="rope_3d"
+            )
+        else:
+            image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
+        return image_embeds
+
+    _lora_pattern = re.compile(
+        r"^model\.layers\.(\d+)\.(?:self_attn|mlp)\.(?:qkv_proj|o_proj|down_proj|gate_up_proj)$"
+    )
+
+    def should_apply_lora(self, module_name: str) -> bool:
+        return bool(self._lora_pattern.match(module_name))
+
+    def get_video_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        # in qwen-vl, last dim is the same
+        pixel_values = torch.cat([item.feature for item in items], dim=0).type(
+            self.visual.dtype
+        )
+        video_grid_thw = torch.concat([item.video_grid_thw for item in items], dim=0)
+        assert pixel_values.dim() == 2, pixel_values.dim()
+        assert video_grid_thw.dim() == 2, video_grid_thw.dim()
+        if self.use_data_parallel:
+            return run_dp_sharded_mrope_vision_model(
+                self.visual, pixel_values, video_grid_thw.tolist(), rope_type="rope_3d"
+            )
+        else:
+            video_embeds = self.visual(pixel_values, grid_thw=video_grid_thw)
+        return video_embeds
+
+    def post_process(
+        self,
+        inputs_embeds,
+        modalities: List[Modality],
+        embeddings: List[torch.Tensor],
+        indices: List[torch.Tensor],
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        # Placeholder for post_process
+        new_embeddings = []
+        for i, (modality, embedding, index) in enumerate(
+            zip(modalities, embeddings, indices)
+        ):
+            if embedding is None or index is None:
+                continue
+
+            new_embeddings.append(embedding)
+        return new_embeddings, forward_batch
+
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds=None,
+        get_embedding: bool = False,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ):
+        """Run forward pass for Qwen2_5-VL.
+
+        Args:
+            input_ids: Flattened (concatenated) input_ids corresponding to a
+                batch.
+            positions: Flattened (concatenated) position ids corresponding to a
+                batch.
+                **NOTE**: If mrope is enabled (default setting for Qwen2-VL
+                opensource models), the shape will be `(3, seq_len)`,
+                otherwise it will be `(seq_len,).
+                (Use input_metadata.mrope_positions to replace it)
+        """
+        if self.is_mrope_enabled:
+            positions = forward_batch.mrope_positions
+
+        if not (
+            forward_batch.forward_mode.is_decode()
+            or not forward_batch.contains_image_inputs()
+        ):
+            if self.is_mrope_enabled:
+                assert positions.ndim == 2 and positions.size(0) == 3, (
+                    "multimodal section rotary embedding requires "
+                    f"(3, seq_len) positions, but got {positions.size()}"
+                )
+
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.model,
+            multimodal_model=self,
+            positions=positions,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+
+        aux_hidden_states = None
+        if self.capture_aux_hidden_states:
+            hidden_states, aux_hidden_states = hidden_states
+
+        if self.pp_group.is_last_rank:
+            if not get_embedding:
+                return self.logits_processor(
+                    input_ids,
+                    hidden_states,
+                    self.lm_head,
+                    forward_batch,
+                    aux_hidden_states,
+                )
+            else:
+                return self.pooler(hidden_states, forward_batch)
+        else:
+            return hidden_states
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            ("gate_up_proj", "up_proj", 1),
+            ("gate_up_proj", "gate_proj", 0),
+        ]
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            if (
+                self.config.tie_word_embeddings
+                and self.pp_group.is_last_rank
+                and "model.embed_tokens.weight" in name
+            ):
+                if "lm_head.weight" in params_dict:
+                    lm_head_param = params_dict["lm_head.weight"]
+                    weight_loader = getattr(
+                        lm_head_param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(lm_head_param, loaded_weight)
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                if (
+                    "visual" in name
+                    and "up_proj" not in name
+                    and "gate_proj" not in name
+                ):
+                    continue
+                name = name.replace(weight_name, param_name)
+                layer_id = get_layer_id(name)
+                if (
+                    layer_id is not None
+                    and hasattr(self, "model")
+                    and hasattr(self.model, "start_layer")
+                    and (
+                        layer_id < self.model.start_layer
+                        or layer_id >= self.model.end_layer
+                    )
+                ):
+                    continue
+
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Skip loading visual/language model weights
+                if (
+                    self.config.encoder_only or self.config.language_only
+                ) and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if "visual" in name:
+                    # adapt to VisionAttention
+                    name = name.replace(r"attn.qkv.", r"attn.qkv_proj.")
+
+                try:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    if name in params_dict.keys():
+                        param = params_dict[name]
+                    else:
+                        continue
+
+                except KeyError:
+                    print(params_dict.keys())
+                    raise
+
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_eagle3_layers_to_capture(self, layer_ids: Optional[List[int]] = None):
+        self.capture_aux_hidden_states = True
+        self.model.capture_aux_hidden_states = True
+        if layer_ids is None:
+            num_layers = self.config.num_hidden_layers
+            self.model.layers_to_capture = [
+                2,
+                num_layers // 2,
+                num_layers - 3,
+            ]  # Specific layers for EAGLE3 support
+        else:
+            self.model.layers_to_capture = [val + 1 for val in layer_ids]
+
+
+EntryClass = [Qwen2_5_VLForConditionalGeneration]
diff --git a/sglang/python/sglang/srt/models/qwen2_audio.py b/sglang/python/sglang/srt/models/qwen2_audio.py
new file mode 100644
index 0000000000000000000000000000000000000000..669db5a39c4989d4d0e21c5a030c7b9fc064cbc4
--- /dev/null
+++ b/sglang/python/sglang/srt/models/qwen2_audio.py
@@ -0,0 +1,189 @@
+# coding=utf-8
+# Adapted from
+# https://github.com/huggingface/transformers/blob/1d45d90e5d1552eccb6d8cc9b7bba283ccefb808/src/transformers/models/qwen2_audio/modeling_qwen2_audio.py
+# Copyright 2024 The Qwen team.
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only Qwen2-Audio model compatible with HuggingFace weights."""
+
+import logging
+from typing import Any, Iterable, List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+from transformers import Qwen2AudioEncoderConfig, Qwen2Config
+from transformers.models.qwen2_audio.configuration_qwen2_audio import Qwen2AudioConfig
+from transformers.models.qwen2_audio.modeling_qwen2_audio import (
+    Qwen2AudioEncoder,
+    Qwen2AudioMultiModalProjector,
+)
+
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternMultimodalTokens,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.qwen2 import Qwen2ForCausalLM
+from sglang.srt.utils import add_prefix
+
+logger = logging.getLogger(__name__)
+
+
+class Qwen2AudioForConditionalGeneration(nn.Module):
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    def __init__(
+        self,
+        config: Qwen2AudioConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+
+        if getattr(self.config, "audio_config", None) is None:
+            self.config.audio_config = Qwen2AudioEncoderConfig(
+                self.config._name_or_path
+            )
+
+        if getattr(self.config, "text_config", None) is None:
+            self.config.text_config = Qwen2Config(self.config._name_or_path)
+
+        self.audio_tower = Qwen2AudioEncoder(
+            config.audio_config,
+        )
+        self.multi_modal_projector = Qwen2AudioMultiModalProjector(config)
+        self.language_model = Qwen2ForCausalLM(
+            config.text_config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        return self.pattern.pad_input_tokens(input_ids, mm_inputs)
+
+    def get_audio_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        # Extract audio features from input items
+        input_features = torch.cat([item.feature for item in items], dim=0).type(
+            self.audio_tower.dtype
+        )
+
+        audio_embeds = self.audio_tower(input_features).last_hidden_state
+        audio_embeds = self.multi_modal_projector(audio_embeds)
+
+        audio_feature_lens = torch.cat([item.audio_feature_lens for item in items])
+        new_embeds = []
+        for i, d in zip(audio_feature_lens, audio_embeds):
+            new_embeds.append(d[: i.item()])
+
+        return torch.cat(new_embeds, dim=0)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        **kwargs: Any,
+    ) -> torch.Tensor:
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.language_model,
+            data_embedding_funcs={
+                Modality.AUDIO: self.get_audio_feature,
+            },
+            positions=positions,
+        )
+
+        return hidden_states
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+
+            if self.config.text_config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name or "audio_tower" in name:
+                    continue
+                name_tmp = name.replace(weight_name, param_name)
+
+                # Skip loading extra bias for GPTQ models.
+                if name_tmp.endswith(".bias") and name_tmp not in params_dict:
+                    continue
+                param = params_dict[name_tmp]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                try:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                except KeyError:
+                    print(params_dict.keys())
+                    raise
+
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = Qwen2AudioForConditionalGeneration
diff --git a/sglang/python/sglang/srt/models/qwen2_classification.py b/sglang/python/sglang/srt/models/qwen2_classification.py
new file mode 100644
index 0000000000000000000000000000000000000000..366213e70b8f4d0d6e2b8c02e273ae765f4ff491
--- /dev/null
+++ b/sglang/python/sglang/srt/models/qwen2_classification.py
@@ -0,0 +1,75 @@
+﻿# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import Qwen2Config
+
+from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.qwen2 import Qwen2ForCausalLM, Qwen2Model
+from sglang.srt.utils import add_prefix
+
+
+class Qwen2ForSequenceClassification(nn.Module):
+    def __init__(
+        self,
+        config: Qwen2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = Qwen2Model(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.score = nn.Linear(config.hidden_size, config.num_labels)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=False)
+
+        self.eos_token_id = config.eos_token_id
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = True,
+    ) -> EmbeddingPoolerOutput:
+        assert (
+            get_embedding
+        ), "Qwen2ForSequenceClassification is only used for embedding"
+
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        logits = self.score(hidden_states)
+        pooled_logits = self.pooler(logits, forward_batch).embeddings
+
+        return EmbeddingPoolerOutput(pooled_logits)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        # Filter out lm_head weights of Qwen2ForCausalLM
+        filtered_weights = [
+            (name, w) for name, w in weights if not name.startswith("lm_head")
+        ]
+        return Qwen2ForCausalLM.load_weights(self, filtered_weights)
+
+
+EntryClass = [
+    Qwen2ForSequenceClassification,
+]
diff --git a/sglang/python/sglang/srt/models/qwen2_eagle.py b/sglang/python/sglang/srt/models/qwen2_eagle.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b4c0ec41cdc79a5538ae2df7ff63018b818e9b2
--- /dev/null
+++ b/sglang/python/sglang/srt/models/qwen2_eagle.py
@@ -0,0 +1,146 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from sglang.srt.utils import add_prefix
+
+# Adapted from
+# https://github.com/SafeAILab/EAGLE/blob/main/eagle/model/cnets.py
+"""Inference-only LLaMA-EAGLE model compatible with HuggingFace weights."""
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+
+from sglang.srt.distributed import get_pp_group
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.models.qwen2 import Qwen2DecoderLayer, Qwen2ForCausalLM
+
+Qwen2Config = None
+
+
+class Qwen2DecoderLayer(Qwen2DecoderLayer):
+    def __init__(
+        self,
+        config: Qwen2Config,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(config, layer_id, quant_config, prefix=prefix)
+
+        # Skip the input_layernorm
+        # https://github.com/SafeAILab/EAGLE/blob/35c78f6cdc19a73e05cf5c330b4c358dad970c6a/eagle/model/cnets.py#L427
+        if layer_id == 0:
+            del self.input_layernorm
+            setattr(self, "input_layernorm", lambda x: x)
+
+
+class Qwen2Model(nn.Module):
+    def __init__(
+        self,
+        config: Qwen2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+        self.layers = nn.ModuleList(
+            [
+                Qwen2DecoderLayer(
+                    config,
+                    i,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{i}", prefix),
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.fc = torch.nn.Linear(config.hidden_size * 2, config.hidden_size)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+
+        hidden_states = self.fc(
+            torch.cat((hidden_states, forward_batch.spec_info.hidden_states), dim=-1)
+        )
+
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        return hidden_states + residual
+
+
+class Qwen2ForCausalLMEagle(Qwen2ForCausalLM):
+    def __init__(
+        self,
+        config: Qwen2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        nn.Module.__init__(self)
+        self.config = config
+        self.quant_config = quant_config
+        self.pp_group = get_pp_group()
+        self.model = Qwen2Model(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        if self.config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+            )
+        self.logits_processor = LogitsProcessor(config)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        for name, loaded_weight in weights:
+            if "lm_head" not in name:
+                name = "model." + name
+                super().load_weights([(name, loaded_weight)])
+
+
+EntryClass = [Qwen2ForCausalLMEagle]
diff --git a/sglang/python/sglang/srt/models/qwen2_moe.py b/sglang/python/sglang/srt/models/qwen2_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..4006de801b2be66ca2363c9568c68757be31b884
--- /dev/null
+++ b/sglang/python/sglang/srt/models/qwen2_moe.py
@@ -0,0 +1,901 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/qwen2_moe.py
+"""Inference-only Qwen2MoE model compatible with HuggingFace weights."""
+
+import logging
+from contextlib import nullcontext
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.batch_overlap.two_batch_overlap import model_forward_maybe_tbo
+from sglang.srt.distributed import (
+    get_moe_expert_parallel_world_size,
+    get_pp_group,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation
+from sglang.srt.eplb.expert_location_dispatch import ExpertLocationDispatchInfo
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.communicator import (
+    LayerCommunicator,
+    LayerScatterModes,
+    ScatterMode,
+)
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe import get_moe_a2a_backend
+from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
+from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.moe.utils import (
+    RoutingMethodType,
+    filter_moe_weight_param_global_expert,
+)
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer, get_layer_id
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import (
+    add_prefix,
+    cpu_has_amx_support,
+    is_cpu,
+    is_cuda,
+    make_layers,
+    use_intel_amx_backend,
+)
+
+logger = logging.getLogger(__name__)
+
+_is_cuda = is_cuda()
+_is_cpu = is_cpu()
+_is_cpu_amx_available = cpu_has_amx_support()
+
+
+class Qwen2MoeMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        reduce_results: bool = True,
+        prefix: str = "",
+        tp_rank: Optional[int] = None,
+        tp_size: Optional[int] = None,
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+            tp_rank=tp_rank,
+            tp_size=tp_size,
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+            prefix=add_prefix("down_proj", prefix),
+            tp_rank=tp_rank,
+            tp_size=tp_size,
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(
+        self,
+        x,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+    ):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(
+            x, skip_all_reduce=should_allreduce_fusion or use_reduce_scatter
+        )
+        return x
+
+
+class Qwen2MoeSparseMoeBlock(nn.Module):
+    def __init__(
+        self,
+        layer_id: int,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        alt_stream: Optional[torch.cuda.Stream] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.layer_id = layer_id
+        self.alt_stream = alt_stream
+        if self.tp_size > config.num_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {config.num_experts}."
+            )
+
+        self.topk = TopK(
+            top_k=config.num_experts_per_tok,
+            renormalize=config.norm_topk_prob,
+            layer_id=layer_id,
+        )
+
+        self.experts = get_moe_impl_class(quant_config)(
+            layer_id=self.layer_id,
+            top_k=config.num_experts_per_tok,
+            num_experts=config.num_experts
+            + get_global_server_args().ep_num_redundant_experts,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            quant_config=quant_config,
+            prefix=add_prefix("experts", prefix),
+            routing_method_type=RoutingMethodType.RenormalizeNaive,
+        )
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            config.num_experts,
+            bias=False,
+            quant_config=None,
+            prefix=add_prefix("gate", prefix),
+        )
+        if config.shared_expert_intermediate_size > 0:
+            self.shared_expert = Qwen2MoeMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.shared_expert_intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                reduce_results=False,
+                prefix=add_prefix("shared_expert", prefix),
+                **(
+                    dict(tp_rank=0, tp_size=1)
+                    if get_moe_a2a_backend().is_deepep()
+                    else {}
+                ),
+            )
+        else:
+            self.shared_expert = None
+        if _is_cpu and _is_cpu_amx_available:
+            self.shared_expert_gate = ReplicatedLinear(
+                config.hidden_size,
+                1,
+                bias=False,
+                quant_config=None,
+                prefix=add_prefix("shared_expert_gate", prefix),
+            )
+        else:
+            self.shared_expert_gate = torch.nn.Linear(config.hidden_size, 1, bias=False)
+
+        if get_moe_a2a_backend().is_deepep():
+            # TODO: we will support tp < ep in the future
+            self.ep_size = get_moe_expert_parallel_world_size()
+            self.num_experts = (
+                config.num_experts + get_global_server_args().ep_num_redundant_experts
+            )
+            self.top_k = config.num_experts_per_tok
+
+    def get_moe_weights(self):
+        return [
+            x.data
+            for name, x in self.experts.named_parameters()
+            if name not in ["correction_bias"]
+            and filter_moe_weight_param_global_expert(
+                name, x, self.experts.num_local_experts
+            )
+        ]
+
+    def _forward_shared_experts(self, hidden_states: torch.Tensor):
+        shared_output = None
+        if self.shared_expert is not None:
+            shared_output = self.shared_expert(hidden_states)
+            if self.shared_expert_gate is not None:
+                if use_intel_amx_backend(self.shared_expert_gate):
+                    shared_output = torch.ops.sgl_kernel.fused_linear_sigmoid_mul(
+                        hidden_states,
+                        self.shared_expert_gate.weight,
+                        self.shared_expert_gate.bias,
+                        True,
+                        shared_output,
+                    )
+                else:
+                    shared_output = (
+                        F.sigmoid(self.shared_expert_gate(hidden_states))
+                        * shared_output
+                    )
+
+        return shared_output
+
+    def _forward_deepep(self, hidden_states: torch.Tensor, forward_batch: ForwardBatch):
+        shared_output = None
+        if hidden_states.shape[0] > 0:
+            # router_logits: (num_tokens, n_experts)
+            router_logits, _ = self.gate(hidden_states)
+            shared_output = self._forward_shared_experts(hidden_states)
+            topk_output = self.topk(
+                hidden_states,
+                router_logits,
+                num_token_non_padded=forward_batch.num_token_non_padded,
+                expert_location_dispatch_info=ExpertLocationDispatchInfo.init_new(
+                    layer_id=self.layer_id,
+                ),
+            )
+        else:
+            topk_output = self.topk.empty_topk_output(hidden_states.device)
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states,
+            topk_output=topk_output,
+        )
+
+        if shared_output is not None:
+            final_hidden_states.add_(shared_output)
+
+        return final_hidden_states
+
+    def _forward_router_experts(self, hidden_states: torch.Tensor):
+        # router_logits: (num_tokens, n_experts)
+        router_logits, _ = self.gate(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        return self.experts(hidden_states, topk_output)
+
+    def forward_normal_dual_stream(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        current_stream = torch.cuda.current_stream()
+        self.alt_stream.wait_stream(current_stream)
+        shared_output = self._forward_shared_experts(hidden_states.clone())
+
+        with torch.cuda.stream(self.alt_stream):
+            router_output = self._forward_router_experts(hidden_states)
+
+        current_stream.wait_stream(self.alt_stream)
+
+        return router_output, shared_output
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: Optional[ForwardBatch] = None,
+        use_reduce_scatter: bool = False,
+    ) -> torch.Tensor:
+        num_tokens, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+
+        if get_moe_a2a_backend().is_deepep():
+            return self._forward_deepep(hidden_states, forward_batch)
+
+        if (
+            self.alt_stream is not None
+            and hidden_states.shape[0] > 0
+            and get_is_capture_mode()
+        ):
+            final_hidden_states, shared_output = self.forward_normal_dual_stream(
+                hidden_states
+            )
+        else:
+            shared_output = self._forward_shared_experts(hidden_states)
+            final_hidden_states = self._forward_router_experts(hidden_states)
+
+        if shared_output is not None:
+            # In-place add is required to keep final_hidden_states in the
+            # symmetric memory pool (when --enable-symm-mem is used).
+            # An out-of-place add would allocate a new tensor outside symm
+            # memory, breaking subsequent symmetric collective operations.
+            final_hidden_states += shared_output
+        if self.tp_size > 1 and not use_reduce_scatter:
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+
+        return final_hidden_states.view(num_tokens, hidden_dim)
+
+
+class Qwen2MoeAttention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        qkv_bias: int = True,
+        quant_config: Optional[QuantizationConfig] = None,
+        dual_chunk_attention_config: Optional[dict[str, Any]] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+
+        attn_tp_rank = get_attention_tp_rank()
+        attn_tp_size = get_attention_tp_size()
+
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % attn_tp_size == 0
+        self.num_heads = self.total_num_heads // attn_tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= attn_tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % attn_tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert attn_tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // attn_tp_size)
+        self.head_dim = hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=qkv_bias,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            reduce_results=False,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            dual_chunk_attention_config=dual_chunk_attention_config,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class Qwen2MoeDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        qkv_bias = getattr(config, "qkv_bias", True)
+        dual_chunk_attention_config = getattr(
+            config, "dual_chunk_attention_config", None
+        )
+        self.self_attn = Qwen2MoeAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            dual_chunk_attention_config=dual_chunk_attention_config,
+            qkv_bias=qkv_bias,
+            prefix=add_prefix("self_attn", prefix),
+        )
+
+        self.layer_id = layer_id
+
+        self.attn_tp_size = get_attention_tp_size()
+        self.attn_tp_rank = get_attention_tp_rank()
+
+        # Qwen2MoE all layers are sparse and have no nextn now
+        self.is_layer_sparse = True
+        is_previous_layer_sparse = True
+        is_next_layer_sparse = True
+
+        self.layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=layer_id,
+            num_layers=config.num_hidden_layers,
+            is_layer_sparse=self.is_layer_sparse,
+            is_previous_layer_sparse=is_previous_layer_sparse,
+            is_next_layer_sparse=is_next_layer_sparse,
+        )
+
+        if self.is_layer_sparse:
+            self.mlp = Qwen2MoeSparseMoeBlock(
+                layer_id=layer_id,
+                config=config,
+                quant_config=quant_config,
+                alt_stream=alt_stream,
+                prefix=add_prefix("mlp", prefix),
+            )
+        else:
+            self.mlp = Qwen2MoeMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+            )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.layer_scatter_modes,
+            input_layernorm=self.input_layernorm,
+            post_attention_layernorm=self.post_attention_layernorm,
+            allow_reduce_scatter=True,
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+        captured_last_layer_outputs: Optional[List[torch.Tensor]] = None,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+
+        hidden_states, residual = (
+            self.layer_communicator.prepare_attn_and_capture_last_layer_outputs(
+                hidden_states,
+                residual,
+                forward_batch,
+                captured_last_layer_outputs=captured_last_layer_outputs,
+                **kwargs,
+            )
+        )
+
+        if hidden_states.shape[0] != 0:
+            hidden_states = self.self_attn(
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+            )
+
+        hidden_states, residual = self.layer_communicator.prepare_mlp(
+            hidden_states, residual, forward_batch
+        )
+
+        # For DP with padding, reduce scatter can be used instead of all-reduce.
+        use_reduce_scatter = self.layer_communicator.should_use_reduce_scatter(
+            forward_batch
+        )
+
+        hidden_states = self.mlp(hidden_states, forward_batch, use_reduce_scatter)
+
+        hidden_states, residual = self.layer_communicator.postprocess_layer(
+            hidden_states, residual, forward_batch
+        )
+
+        return hidden_states, residual
+
+
+class Qwen2MoeModel(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        decoder_layer_type: type[nn.Module] = Qwen2MoeDecoderLayer,
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.pp_group = get_pp_group()
+
+        if self.pp_group.is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                use_attn_tp_group=is_dp_attention_enabled(),
+                prefix=add_prefix("embed_tokens", prefix),
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        # Use the provided decoder layer type or default to Qwen2MoeDecoderLayer
+        decoder_layer_type = decoder_layer_type or Qwen2MoeDecoderLayer
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: decoder_layer_type(
+                layer_id=idx,
+                config=config,
+                quant_config=quant_config,
+                prefix=prefix,
+                alt_stream=alt_stream,
+            ),
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix=add_prefix("layers", prefix),
+        )
+        if self.pp_group.is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer(return_tuple=True)
+
+        # For EAGLE3 support
+        self.layers_to_capture = []
+
+    def set_eagle3_layers_to_capture(self, layers_to_capture: List[int]):
+        self.layers_to_capture = layers_to_capture
+        for layer_id in self.layers_to_capture:
+            setattr(self.layers[layer_id], "_is_layer_to_capture", True)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[torch.Tensor, PPProxyTensors]:
+        if self.pp_group.is_first_rank:
+            if input_embeds is None:
+                hidden_states = self.embed_tokens(input_ids)
+            else:
+                hidden_states = input_embeds
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+
+        aux_hidden_states = []
+        if forward_batch.can_run_tbo:
+            hidden_states, residual = model_forward_maybe_tbo(
+                layers=self.layers,
+                enable_tbo=True,
+                input_data_scatter_mode=ScatterMode.model_input_output(),
+                positions=positions,
+                forward_batch=forward_batch,
+                hidden_states=hidden_states,
+                residual=residual,
+            )
+        else:
+            for i in range(self.start_layer, self.end_layer):
+                ctx = (
+                    nullcontext()
+                    if not get_global_server_args().disable_piecewise_cuda_graph
+                    else get_global_expert_distribution_recorder().with_current_layer(i)
+                )
+                with ctx:
+                    layer = self.layers[i]
+                    hidden_states, residual = layer(
+                        positions,
+                        hidden_states,
+                        forward_batch,
+                        residual,
+                        captured_last_layer_outputs=(
+                            aux_hidden_states
+                            if getattr(layer, "_is_layer_to_capture", False)
+                            else None
+                        ),
+                    )
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors(
+                {
+                    "hidden_states": hidden_states,
+                    "residual": residual,
+                }
+            )
+        else:
+            if hidden_states.shape[0] != 0:
+                if residual is None:
+                    hidden_states = self.norm(hidden_states)
+                else:
+                    hidden_states, _ = self.norm(hidden_states, residual)
+
+        if len(aux_hidden_states) == 0:
+            return hidden_states
+
+        return hidden_states, aux_hidden_states
+
+
+class Qwen2MoeForCausalLM(nn.Module):
+    fall_back_to_pt_during_load = False
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.quant_config = quant_config
+        alt_stream = torch.cuda.Stream() if _is_cuda else None
+        self.model = Qwen2MoeModel(
+            config,
+            quant_config,
+            prefix=add_prefix("model", prefix),
+            alt_stream=alt_stream,
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("lm_head", prefix),
+            use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
+        )
+        self.logits_processor = LogitsProcessor(config)
+        # For EAGLE3 support
+        self.capture_aux_hidden_states = False
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            input_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+        aux_hidden_states = None
+        if self.capture_aux_hidden_states:
+            hidden_states, aux_hidden_states = hidden_states
+        if self.pp_group.is_last_rank:
+            return self.logits_processor(
+                input_ids, hidden_states, self.lm_head, forward_batch, aux_hidden_states
+            )
+        else:
+            return hidden_states
+
+    @torch.no_grad()
+    def forward_split_prefill(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        split_interval: Tuple[int, int],  # [start, end) 0-based
+        input_embeds: torch.Tensor = None,
+    ):
+        start, end = split_interval
+        # embed
+        if start == 0:
+            if input_embeds is None:
+                forward_batch.hidden_states = self.model.embed_tokens(input_ids)
+            else:
+                forward_batch.hidden_states = input_embeds
+
+        # decoder layer
+        for i in range(start, end):
+            with get_global_expert_distribution_recorder().with_current_layer(i):
+                layer = self.model.layers[i]
+                forward_batch.hidden_states, forward_batch.residual = layer(
+                    positions,
+                    forward_batch.hidden_states,
+                    forward_batch,
+                    forward_batch.residual,
+                )
+
+        if end == self.model.config.num_hidden_layers:
+            # norm
+            hidden_states, _ = self.model.norm(
+                forward_batch.hidden_states, forward_batch.residual
+            )
+            forward_batch.hidden_states = hidden_states
+            # logits process
+            result = self.logits_processor(
+                input_ids, forward_batch.hidden_states, self.lm_head, forward_batch
+            )
+        else:
+            result = None
+
+        return result
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.num_experts,
+        )
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            layer_id = get_layer_id(name)
+            if (
+                layer_id is not None
+                and hasattr(self.model, "start_layer")
+                and (
+                    layer_id < self.model.start_layer
+                    or layer_id >= self.model.end_layer
+                )
+            ):
+                continue
+            if "rotary_emb.inv_freq" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                # Skip non-stacked layers and experts (experts handled below).
+                if weight_name not in name:
+                    continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if "mlp.experts" in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    if name not in params_dict:
+                        continue
+
+                    if name in params_dict.keys():
+                        param = params_dict[name]
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        weight_loader(param, loaded_weight)
+                    else:
+                        logger.warning(f"Parameter {name} not found in params_dict")
+
+    @classmethod
+    def get_model_config_for_expert_location(cls, config):
+        return ModelConfigForExpertLocation(
+            num_layers=config.num_hidden_layers,
+            num_logical_experts=config.num_experts,
+            num_groups=None,
+        )
+
+    def set_eagle3_layers_to_capture(self, layer_ids: Optional[List[int]] = None):
+        if not self.pp_group.is_last_rank:
+            return
+
+        self.capture_aux_hidden_states = True
+        if layer_ids is None:
+            num_layers = self.config.num_hidden_layers
+            self.model.set_eagle3_layers_to_capture(
+                [
+                    2,
+                    num_layers // 2,
+                    num_layers - 3,
+                ]
+            )  # Specific layers for EAGLE3 support
+        else:
+            self.model.set_eagle3_layers_to_capture([val + 1 for val in layer_ids])
+
+
+EntryClass = Qwen2MoeForCausalLM
diff --git a/sglang/python/sglang/srt/models/qwen2_rm.py b/sglang/python/sglang/srt/models/qwen2_rm.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5ed9eae23a3ee05f06265a88724cf7ac5043136
--- /dev/null
+++ b/sglang/python/sglang/srt/models/qwen2_rm.py
@@ -0,0 +1,78 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import Qwen2Config
+
+from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.qwen2 import Qwen2ForCausalLM, Qwen2Model
+from sglang.srt.utils import add_prefix
+
+
+class Qwen2ForRewardModel(nn.Module):
+    def __init__(
+        self,
+        config: Qwen2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.num_labels = 1
+        self.model = Qwen2Model(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.score = nn.Sequential(
+            nn.Linear(config.hidden_size, config.hidden_size),
+            nn.ReLU(),
+            nn.Linear(config.hidden_size, self.num_labels),
+        )
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=False)
+
+        self.eos_token_id = config.eos_token_id
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = True,
+    ) -> EmbeddingPoolerOutput:
+        assert get_embedding, "Qwen2ForRewardModel is only used for embedding"
+
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        logits = self.score(hidden_states)
+        pooled_logits = self.pooler(logits, forward_batch).embeddings
+
+        return EmbeddingPoolerOutput(pooled_logits)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        # Filter out lm_head weights of Qwen2ForCausalLM
+        filtered_weights = [
+            (name, w) for name, w in weights if not name.startswith("lm_head")
+        ]
+        return Qwen2ForCausalLM.load_weights(self, filtered_weights)
+
+
+EntryClass = [
+    Qwen2ForRewardModel,
+]
diff --git a/sglang/python/sglang/srt/models/qwen2_vl.py b/sglang/python/sglang/srt/models/qwen2_vl.py
new file mode 100644
index 0000000000000000000000000000000000000000..94e6a48bd76ab543a70c9538927a3fb2c8c34d38
--- /dev/null
+++ b/sglang/python/sglang/srt/models/qwen2_vl.py
@@ -0,0 +1,618 @@
+# coding=utf-8
+# Adapted from
+# https://github.com/huggingface/transformers/blob/19e6e80e10118f855137b90740936c0b11ac397f/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
+# Copyright 2024 The Qwen team.
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only Qwen2-VL model compatible with HuggingFace weights."""
+
+import logging
+from functools import lru_cache, partial
+from typing import Iterable, List, Optional, Tuple, Type, TypedDict
+
+import torch
+import torch.nn as nn
+from einops import rearrange
+from transformers import Qwen2VLConfig
+from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLVisionConfig
+
+from sglang.srt.layers.activation import QuickGELU
+from sglang.srt.layers.attention.vision import VisionAttention
+from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternMultimodalTokens,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInputs
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.qwen2 import Qwen2Model
+from sglang.srt.models.utils import WeightsMapper, compute_cu_seqlens_from_grid_numpy
+from sglang.srt.utils import add_prefix, is_npu
+from sglang.srt.utils.hf_transformers_utils import get_processor
+
+logger = logging.getLogger(__name__)
+
+
+# === Vision Inputs === #
+
+
+class Qwen2VLImageInputs(TypedDict):
+    pixel_values: torch.Tensor
+    """Shape:
+    `(num_patches, num_channels * patch_size * patch_size)`
+    """
+
+    image_grid_thw: torch.Tensor
+    """Shape: `(num_images, 3)`
+
+    This should be in `(grid_t, grid_h, grid_w)` format.
+    """
+
+
+class Qwen2VLVideoInputs(TypedDict):
+    pixel_values_videos: torch.Tensor
+    """Shape:
+    `(num_patches,
+      num_channels * temporal_patch_size * patch_size * patch_size)`
+    """
+
+    video_grid_thw: torch.Tensor
+    """Shape: `(num_videos, 3)`
+
+    This should be in `(grid_t, grid_h, grid_w)` format.
+    """
+
+
+# === Vision Encoder === #
+
+
+class Qwen2VisionMLP(nn.Module):
+
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: int = None,
+        act_layer: Type[nn.Module] = QuickGELU,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.fc1 = ColumnParallelLinear(
+            in_features,
+            hidden_features,
+            quant_config=quant_config,
+            prefix=add_prefix("fc1", prefix),
+        )
+        self.act = act_layer()
+        self.fc2 = RowParallelLinear(
+            hidden_features,
+            in_features,
+            quant_config=quant_config,
+            prefix=add_prefix("fc2", prefix),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x_parallel, _ = self.fc1(x)
+        x_parallel = self.act(x_parallel)
+        x, _ = self.fc2(x_parallel)
+        return x
+
+
+class Qwen2VisionBlock(nn.Module):
+
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        mlp_ratio: float,
+        act_layer: Type[nn.Module] = QuickGELU,
+        norm_layer: Type[nn.Module] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        if norm_layer is None:
+            norm_layer = partial(nn.LayerNorm, eps=1e-6)
+        self.norm1 = norm_layer(dim)
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+
+        self.attn = VisionAttention(
+            embed_dim=dim,
+            num_heads=num_heads,
+            projection_size=dim,
+            use_qkv_parallel=True,
+            flatten_batch=True,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+        self.mlp = Qwen2VisionMLP(
+            dim,
+            mlp_hidden_dim,
+            act_layer=act_layer,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        position_embeddings: torch.Tensor,
+    ) -> torch.Tensor:
+        hidden_states = self.norm1(x)
+        hidden_states = rearrange(hidden_states, "s b ... -> b s ...")
+        attn = self.attn(
+            hidden_states,
+            cu_seqlens=cu_seqlens,
+            position_embeddings=position_embeddings,
+        )
+        attn = rearrange(attn, "b s ... -> s b ...")
+        x = x + attn
+        x = x + self.mlp(self.norm2(x))
+        return x
+
+
+class Qwen2VisionPatchEmbed(nn.Module):
+
+    def __init__(
+        self,
+        patch_size: int = 14,
+        temporal_patch_size: int = 2,
+        in_chans: int = 3,
+        embed_dim: int = 1152,
+    ) -> None:
+        super().__init__()
+        self.patch_size = patch_size
+        self.temporal_patch_size = temporal_patch_size
+        self.embed_dim = embed_dim
+
+        kernel_size = [temporal_patch_size, patch_size, patch_size]
+        self.proj = nn.Conv3d(
+            in_chans, embed_dim, kernel_size=kernel_size, stride=kernel_size, bias=False
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        L, C = x.shape
+        x = x.view(L, -1, self.temporal_patch_size, self.patch_size, self.patch_size)
+        x = self.proj(x).view(L, self.embed_dim)
+        return x
+
+
+class Qwen2VisionPatchMerger(nn.Module):
+
+    def __init__(
+        self,
+        d_model: int,
+        context_dim: int,
+        norm_layer: Type[nn.Module] = None,
+        spatial_merge_size: int = 2,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = context_dim * (spatial_merge_size**2)
+        if norm_layer is None:
+            norm_layer = partial(nn.LayerNorm, eps=1e-6)
+        self.ln_q = norm_layer(context_dim)
+        self.mlp = nn.ModuleList(
+            [
+                ColumnParallelLinear(
+                    self.hidden_size,
+                    self.hidden_size,
+                    bias=True,
+                    quant_config=quant_config,
+                    prefix=add_prefix("mlp.0", prefix),
+                ),
+                nn.GELU(),
+                RowParallelLinear(
+                    self.hidden_size,
+                    d_model,
+                    bias=True,
+                    quant_config=quant_config,
+                    prefix=add_prefix("mlp.2", prefix),
+                ),
+            ]
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.ln_q(x)
+        x = x.view(-1, self.hidden_size)
+
+        mlp_fc1, mlp_act, mlp_fc2 = self.mlp
+        x_parallel, _ = mlp_fc1(x)
+        x_parallel = mlp_act(x_parallel)
+        out, _ = mlp_fc2(x_parallel)
+        return out
+
+
+class Qwen2VisionRotaryEmbedding(nn.Module):
+
+    def __init__(self, dim: int, theta: float = 10000.0) -> None:
+        super().__init__()
+        self.dim = dim
+        self.theta = theta
+        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self._seq_len_cached = 0
+        self._freqs_cached = None
+
+    def update_freqs_cache(self, seqlen: int) -> None:
+        if seqlen > self._seq_len_cached:
+            seqlen *= 2
+            self._seq_len_cached = seqlen
+            self.inv_freq = 1.0 / (
+                self.theta
+                ** (
+                    torch.arange(
+                        0, self.dim, 2, dtype=torch.float, device=self.inv_freq.device
+                    )
+                    / self.dim
+                )
+            )
+            seq = torch.arange(
+                seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype
+            )
+            freqs = torch.outer(seq, self.inv_freq)
+            self._freqs_cached = freqs
+
+    def forward(self, seqlen: int) -> torch.Tensor:
+        self.update_freqs_cache(seqlen)
+        return self._freqs_cached[:seqlen]
+
+
+class Qwen2VisionTransformer(nn.Module):
+
+    def __init__(
+        self,
+        vision_config: Qwen2VLVisionConfig,
+        norm_eps: float = 1e-6,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        patch_size: int = vision_config.patch_size
+        temporal_patch_size: int = vision_config.temporal_patch_size
+        spatial_merge_size: int = vision_config.spatial_merge_size
+        in_chans: int = vision_config.in_chans
+        hidden_size: int = vision_config.hidden_size
+        embed_dim: int = vision_config.embed_dim
+        depth: int = vision_config.depth
+        num_heads: int = vision_config.num_heads
+        mlp_ratio: float = vision_config.mlp_ratio
+
+        self.spatial_merge_size = spatial_merge_size
+
+        self.patch_embed = Qwen2VisionPatchEmbed(
+            patch_size=patch_size,
+            temporal_patch_size=temporal_patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+        )
+
+        norm_layer = partial(nn.LayerNorm, eps=norm_eps)
+        head_dim = embed_dim // num_heads
+        self.rotary_pos_emb = Qwen2VisionRotaryEmbedding(head_dim // 2)
+        self.blocks = nn.ModuleList(
+            [
+                Qwen2VisionBlock(
+                    dim=embed_dim,
+                    num_heads=num_heads,
+                    mlp_ratio=mlp_ratio,
+                    norm_layer=norm_layer,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"blocks.{i}", prefix),
+                )
+                for i in range(depth)
+            ]
+        )
+        self.merger = Qwen2VisionPatchMerger(
+            d_model=hidden_size,
+            context_dim=embed_dim,
+            norm_layer=norm_layer,
+            quant_config=quant_config,
+            prefix=add_prefix("merger", prefix),
+        )
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return self.patch_embed.proj.weight.dtype
+
+    @property
+    def device(self) -> torch.device:
+        return self.blocks[0].mlp.fc2.weight.device
+
+    def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor:
+        pos_ids = []
+        for i in range(grid_thw.size(0)):
+            t, h, w = grid_thw[i].tolist()
+            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
+            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
+            hpos_ids = (
+                hpos_ids.reshape(
+                    h // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                    w // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                )
+                .permute(0, 2, 1, 3)
+                .flatten()
+            )
+            wpos_ids = (
+                wpos_ids.reshape(
+                    h // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                    w // self.spatial_merge_size,
+                    self.spatial_merge_size,
+                )
+                .permute(0, 2, 1, 3)
+                .flatten()
+            )
+            pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
+        pos_ids = torch.cat(pos_ids, dim=0)
+        max_grid_size = grid_thw[:, 1:].max()
+        rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
+        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
+        return rotary_pos_emb
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        grid_thw: torch.Tensor,
+    ) -> torch.Tensor:
+        # patchify
+        x = x.to(device=self.device, dtype=self.dtype)
+        x = self.patch_embed(x)
+
+        # compute position embedding
+        rotary_pos_emb = self.rot_pos_emb(grid_thw)
+        emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+        position_embeddings = (emb.cos(), emb.sin())
+        # compute cu_seqlens
+        cu_seqlens = compute_cu_seqlens_from_grid_numpy(grid_thw)
+        # cu_seqlens must be on cpu because of npu_flash_attention_unpad operator restriction
+        if is_npu():
+            cu_seqlens = cu_seqlens.to("cpu")
+
+        # transformers
+        x = x.unsqueeze(1)
+        for blk in self.blocks:
+            x = blk(x, cu_seqlens=cu_seqlens, position_embeddings=position_embeddings)
+
+        # adapter
+        x = self.merger(x)
+        return x
+
+
+cached_get_processor = lru_cache(get_processor)
+
+
+class Qwen2VLForConditionalGeneration(nn.Module):
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    # To ensure correct weight loading and mapping.
+    hf_to_sglang_mapper = WeightsMapper(
+        orig_to_new_substr={
+            "attn.qkv": "attn.qkv_proj",
+        },
+        orig_to_new_prefix={
+            # mapping for new names in checkpoint saved after transformers v4.52
+            "model.language_model.": "language_model.model.",
+            "model.visual.": "visual.",
+            # mapping for original checkpoint
+            "lm_head.": "language_model.lm_head.",
+            "model.": "language_model.model.",
+        },
+    )
+
+    def __init__(
+        self,
+        config: Qwen2VLConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        self.visual = Qwen2VisionTransformer(
+            config.vision_config,
+            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+            # NOTE: Qwen2-VL vision encoder currently supports BitsAndBytes 4-bit quantization.
+            # Other quantization methods (e.g., GPTQ, AWQ) are untested and may not be supported.
+            quant_config=quant_config,
+            prefix=add_prefix("visual", prefix),
+        )
+
+        self.model = Qwen2Model(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+
+        if config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+            )
+
+        self.is_mrope_enabled = "mrope_section" in self.config.rope_scaling
+        self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+        return pattern.pad_input_tokens(input_ids, mm_inputs)
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        # in qwen-vl, last dim is the same
+        pixel_values = torch.cat([item.feature for item in items], dim=0).type(
+            self.visual.dtype
+        )
+        image_grid_thw = torch.concat([item.image_grid_thw for item in items], dim=0)
+        assert pixel_values.dim() == 2, pixel_values.dim()
+        assert image_grid_thw.dim() == 2, image_grid_thw.dim()
+        image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
+        return image_embeds
+
+    def get_video_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        # in qwen-vl, last dim is the same
+        pixel_values = torch.cat([item.feature for item in items], dim=0).type(
+            self.visual.dtype
+        )
+        video_grid_thw = torch.concat([item.video_grid_thw for item in items], dim=0)
+        assert pixel_values.dim() == 2, pixel_values.dim()
+        assert video_grid_thw.dim() == 2, video_grid_thw.dim()
+        video_embeds = self.visual(pixel_values, grid_thw=video_grid_thw)
+        return video_embeds
+
+    def _process_video_input(self, video_input: Qwen2VLVideoInputs) -> torch.Tensor:
+        pixel_values_videos = video_input["pixel_values_videos"].type(self.visual.dtype)
+        video_embeds = self.visual(
+            pixel_values_videos, grid_thw=video_input["video_grid_thw"]
+        )
+        return video_embeds
+
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+
+    def should_apply_lora(self, module_name: str) -> bool:
+        # skip visual tower
+        return not module_name.startswith("visual")
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds=None,
+        get_embedding: bool = False,
+    ):
+        """Run forward pass for Qwen2-VL.
+
+        Args:
+            input_ids: Flattened (concatenated) input_ids corresponding to a
+                batch.
+            positions: Flattened (concatenated) position ids corresponding to a
+                batch.
+                **NOTE**: If mrope is enabled (default setting for Qwen2-VL
+                opensource models), the shape will be `(3, seq_len)`,
+                otherwise it will be `(seq_len,).
+                (Use input_metadata.mrope_positions to replace it)
+        """
+        if self.is_mrope_enabled:
+            positions = forward_batch.mrope_positions
+
+        if not (
+            forward_batch.forward_mode.is_decode()
+            or not forward_batch.contains_image_inputs()
+        ):
+            if self.is_mrope_enabled:
+                assert positions.ndim == 2 and positions.size(0) == 3, (
+                    "multimodal section rotary embedding requires "
+                    f"(3, seq_len) positions, but got {positions.size()}"
+                )
+
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.model,
+            multimodal_model=self,
+            positions=positions,
+        )
+
+        if get_embedding:
+            return self.pooler(hidden_states, forward_batch)
+        else:
+            return self.logits_processor(
+                input_ids, hidden_states, self.lm_head, forward_batch
+            )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "up_proj", 1),
+            ("gate_up_proj", "gate_proj", 0),
+        ]
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if "visual" in name:
+                    # adapt to VisionAttention
+                    name = name.replace(r"attn.qkv.", r"attn.qkv_proj.")
+
+                try:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                except KeyError:
+                    print(params_dict.keys())
+                    raise
+
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = Qwen2VLForConditionalGeneration
diff --git a/sglang/python/sglang/srt/models/qwen3.py b/sglang/python/sglang/srt/models/qwen3.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6b955ae6c7c35067ef8254045a93618c8be769e
--- /dev/null
+++ b/sglang/python/sglang/srt/models/qwen3.py
@@ -0,0 +1,590 @@
+# Adapted from qwen2.py
+import logging
+from typing import Any, Dict, Iterable, List, Optional, Tuple
+
+import torch
+from torch import nn
+
+from sglang.srt.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
+from sglang.srt.layers.dp_attention import get_attention_tp_rank, get_attention_tp_size
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import QKVParallelLinear, RowParallelLinear
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer, get_layer_id
+from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.models.qwen2 import Qwen2MLP as Qwen3MLP
+from sglang.srt.models.qwen2 import Qwen2Model
+from sglang.srt.models.utils import apply_qk_norm
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import add_prefix, is_cuda, is_npu
+
+Qwen3Config = None
+
+logger = logging.getLogger(__name__)
+_is_cuda = is_cuda()
+_is_npu = is_npu()
+
+if _is_npu:
+    from sgl_kernel_npu.norm.split_qkv_rmsnorm_rope import split_qkv_rmsnorm_rope
+
+    from sglang.srt.hardware_backend.npu.cmo import get_cmo_stream, wait_cmo_stream
+
+
+class Qwen3Attention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 1000000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        head_dim: Optional[int] = None,
+        max_position_embeddings: int = 32768,
+        quant_config: Optional[QuantizationConfig] = None,
+        rms_norm_eps: float = None,
+        attention_bias: bool = False,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        attn_tp_rank = get_attention_tp_rank()
+        attn_tp_size = get_attention_tp_size()
+
+        assert self.total_num_heads % attn_tp_size == 0
+        self.num_heads = self.total_num_heads // attn_tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= attn_tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % attn_tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert attn_tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // attn_tp_size)
+        self.head_dim = head_dim or hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+        self.tp_rank = get_tensor_model_parallel_rank()
+
+        norm_kwargs = (
+            dict(
+                weight_dtype=torch.float32,
+                cast_x_before_out_mul=True,
+            )
+            if get_global_server_args().rl_on_policy_target is not None
+            else {}
+        )
+        self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps, **norm_kwargs)
+        self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps, **norm_kwargs)
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=attention_bias,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=attention_bias,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            reduce_results=False,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            prefix=add_prefix("attn", prefix),
+        )
+        self.alt_stream = alt_stream
+
+    def forward_prepare_native(self, positions, hidden_states):
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = apply_qk_norm(
+            q=q,
+            k=k,
+            q_norm=self.q_norm,
+            k_norm=self.k_norm,
+            head_dim=self.head_dim,
+            alt_stream=self.alt_stream,
+        )
+        q, k = self.rotary_emb(positions, q, k)
+        return q, k, v
+
+    def forward_prepare_npu(self, positions, hidden_states, forward_batch):
+        qkv, _ = self.qkv_proj(hidden_states)
+
+        if self.attn.layer_id == forward_batch.token_to_kv_pool.start_layer:
+            self.rotary_emb.get_cos_sin_with_position(positions)
+        q, k, v = split_qkv_rmsnorm_rope(
+            qkv,
+            self.rotary_emb.position_sin,
+            self.rotary_emb.position_cos,
+            self.q_size,
+            self.kv_size,
+            self.head_dim,
+            eps=self.q_norm.variance_epsilon,
+            q_weight=self.q_norm.weight,
+            k_weight=self.k_norm.weight,
+            q_bias=getattr(self.q_norm, "bias", None),
+            k_bias=getattr(self.k_norm, "bias", None),
+        )
+        return q, k, v
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        if get_global_server_args().rl_on_policy_target is not None:
+            hidden_states = hidden_states.bfloat16()
+
+        if (
+            not _is_npu
+            or forward_batch.forward_mode.is_extend_or_draft_extend_or_mixed()
+        ):
+            q, k, v = self.forward_prepare_native(
+                positions=positions,
+                hidden_states=hidden_states,
+            )
+        else:
+            q, k, v = self.forward_prepare_npu(
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+            )
+
+        if get_global_server_args().rl_on_policy_target is not None:
+            q = q.to(torch.bfloat16)
+            k = k.to(torch.bfloat16)
+
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class Qwen3DecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: Qwen3Config,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 1000000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 32768)
+        head_dim = getattr(config, "head_dim", None)
+        self.self_attn = Qwen3Attention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            head_dim=head_dim,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            rms_norm_eps=config.rms_norm_eps,
+            attention_bias=config.attention_bias,
+            prefix=add_prefix("self_attn", prefix),
+            alt_stream=alt_stream,
+        )
+        self.mlp = Qwen3MLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+
+        norm_kwargs = (
+            dict(
+                weight_dtype=torch.float32,
+                cast_x_before_out_mul=True,
+                override_orig_dtype=torch.float32,
+                fp32_residual=True,
+            )
+            if get_global_server_args().rl_on_policy_target is not None
+            else {}
+        )
+        self.input_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps, **norm_kwargs
+        )
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps, **norm_kwargs
+        )
+
+        self.layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=layer_id,
+            num_layers=config.num_hidden_layers,
+            is_layer_sparse=False,
+            is_previous_layer_sparse=False,
+            is_next_layer_sparse=False,
+        )
+        self.layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.layer_scatter_modes,
+            input_layernorm=self.input_layernorm,
+            post_attention_layernorm=self.post_attention_layernorm,
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+        post_residual_addition: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        hidden_states, residual = self.layer_communicator.prepare_attn(
+            hidden_states,
+            residual,
+            forward_batch,
+            post_residual_addition=post_residual_addition,
+        )
+        if hidden_states.shape[0] != 0:
+            hidden_states = self.self_attn(
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+            )
+
+        # Fully Connected
+        hidden_states, residual = self.layer_communicator.prepare_mlp(
+            hidden_states,
+            residual,
+            forward_batch,
+            cache=(
+                [self.mlp.gate_up_proj.weight, self.mlp.down_proj.weight]
+                if _is_npu
+                and not get_global_server_args().disable_piecewise_cuda_graph
+                and (
+                    hasattr(self.mlp.gate_up_proj, "weight")
+                    and hasattr(self.mlp.down_proj, "weight")
+                )
+                else None
+            ),
+        )
+        hidden_states = self.mlp(hidden_states)
+        if _is_npu and get_cmo_stream():
+            wait_cmo_stream()
+        hidden_states, residual = self.layer_communicator.postprocess_layer(
+            hidden_states, residual, forward_batch
+        )
+        return hidden_states, residual
+
+
+class Qwen3Model(Qwen2Model):
+    def __init__(
+        self,
+        config: Qwen3Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        alt_stream = torch.cuda.Stream() if _is_cuda else None
+        super().__init__(
+            config=config,
+            quant_config=quant_config,
+            prefix=prefix,
+            decoder_layer_type=Qwen3DecoderLayer,
+            alt_stream=alt_stream,
+        )
+
+
+class Qwen3ForCausalLM(nn.Module):
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    def __init__(
+        self,
+        config: Qwen3Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = Qwen3Model(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+
+        # handle the lm head on different pp ranks
+        if self.pp_group.is_last_rank:
+            if self.pp_group.world_size == 1 and config.tie_word_embeddings:
+                self.lm_head = self.model.embed_tokens
+            else:
+                self.lm_head = ParallelLMHead(
+                    config.vocab_size,
+                    config.hidden_size,
+                    quant_config=quant_config,
+                    use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
+                    prefix=add_prefix("lm_head", prefix),
+                )
+        else:
+            # ranks other than the last rank will have a placeholder layer
+            self.lm_head = PPMissingLayer()
+
+        self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+
+        # For EAGLE3 support
+        self.capture_aux_hidden_states = False
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.get_input_embeddings()
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = False,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            input_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+
+        aux_hidden_states = None
+        if self.capture_aux_hidden_states:
+            hidden_states, aux_hidden_states = hidden_states
+
+        if self.pp_group.is_last_rank:
+            if not get_embedding:
+                return self.logits_processor(
+                    input_ids,
+                    hidden_states,
+                    self.lm_head,
+                    forward_batch,
+                    aux_hidden_states,
+                )
+            else:
+                return self.pooler(hidden_states, forward_batch)
+        else:
+            return hidden_states
+
+    @torch.no_grad()
+    def forward_split_prefill(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        split_interval: Tuple[int, int],  # [start, end) 0-based
+        input_embeds: torch.Tensor = None,
+    ):
+        start, end = split_interval
+        # embed
+        if start == 0:
+            if input_embeds is None:
+                forward_batch.hidden_states = self.model.embed_tokens(input_ids)
+            else:
+                forward_batch.hidden_states = input_embeds
+        # decoder layer
+        for i in range(start, end):
+            layer = self.model.layers[i]
+            forward_batch.hidden_states, forward_batch.residual = layer(
+                positions,
+                forward_batch.hidden_states,
+                forward_batch,
+                forward_batch.residual,
+            )
+
+        if end == self.model.config.num_hidden_layers:
+            # norm
+            hidden_states, _ = self.model.norm(
+                forward_batch.hidden_states, forward_batch.residual
+            )
+            forward_batch.hidden_states = hidden_states
+            # logits process
+            result = self.logits_processor(
+                input_ids, forward_batch.hidden_states, self.lm_head, forward_batch
+            )
+        else:
+            result = None
+
+        return result
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if not name.startswith("model.") and (
+                name.startswith("layers.")
+                or name.startswith("embed_tokens.")
+                or name.startswith("norm.")
+            ):
+                name = add_prefix(name, "model")
+
+            if name == "model.embed_tokens.weight":
+                if self.pp_group.is_last_rank and self.config.tie_word_embeddings:
+                    if "lm_head.weight" in params_dict:
+                        param = params_dict["lm_head.weight"]
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        weight_loader(param, loaded_weight)
+
+            layer_id = get_layer_id(name)
+            if (
+                layer_id is not None
+                and hasattr(self.model, "start_layer")
+                and (
+                    layer_id < self.model.start_layer
+                    or layer_id >= self.model.end_layer
+                )
+            ):
+                continue
+
+            if "rotary_emb.inv_freq" in name or "projector" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            if name.startswith("model.vision_tower") and name not in params_dict:
+                continue
+            if "scale" in name:
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                if name in params_dict.keys():
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+                else:
+                    logger.warning(f"Parameter {name} not found in params_dict")
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        del self.lm_head.weight
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+        self.model.load_kv_cache_scales(quantization_param_path)
+
+    def set_eagle3_layers_to_capture(self, layer_ids: Optional[List[int]] = None):
+        if not self.pp_group.is_last_rank:
+            return
+
+        self.capture_aux_hidden_states = True
+        if layer_ids is None:
+            num_layers = self.config.num_hidden_layers
+            self.model.layers_to_capture = [
+                2,
+                num_layers // 2,
+                num_layers - 3,
+            ]  # Specific layers for EAGLE3 support
+        else:
+            self.model.layers_to_capture = [val + 1 for val in layer_ids]
+
+
+EntryClass = Qwen3ForCausalLM
diff --git a/sglang/python/sglang/srt/models/qwen3_5.py b/sglang/python/sglang/srt/models/qwen3_5.py
new file mode 100644
index 0000000000000000000000000000000000000000..306adbccb453e0e7149b9ed655eadf83afdfbb8a
--- /dev/null
+++ b/sglang/python/sglang/srt/models/qwen3_5.py
@@ -0,0 +1,1353 @@
+# Copyright 2025 Qwen Team
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Inference-only Qwen3.5 model and Qwen3.5 MoE model compatible with HuggingFace weights."""
+
+import logging
+from functools import lru_cache
+from typing import Iterable, Optional, Set, Tuple, Union
+
+import torch
+import torch.nn as nn
+from einops import rearrange
+
+# Configs
+from sglang.srt.configs.qwen3_5 import (
+    Qwen3_5Config,
+    Qwen3_5MoeConfig,
+    Qwen3_5TextConfig,
+)
+
+# Distributed
+from sglang.srt.distributed import get_pp_group
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation
+
+# Layers - Attention
+from sglang.srt.layers.attention.fla.layernorm_gated import RMSNorm as RMSNormGated
+from sglang.srt.layers.attention.mamba.mamba import mamba_v2_sharded_weight_loader
+from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    is_dp_attention_enabled,
+)
+
+# Layers - Others
+from sglang.srt.layers.layernorm import GemmaRMSNorm
+
+# Layers - Linear
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.radix_linear_attention import RadixLinearAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    sharded_weight_loader,
+)
+from sglang.srt.models.qwen2_moe import Qwen2MoeMLP, Qwen2MoeSparseMoeBlock
+
+# Models
+from sglang.srt.models.qwen3_vl import Qwen3VLForConditionalGeneration
+
+# Utils
+from sglang.srt.utils import add_prefix, is_cuda, is_npu, make_layers, set_weight_attrs
+from sglang.srt.utils.hf_transformers_utils import get_processor
+
+logger = logging.getLogger(__name__)
+_is_cuda = is_cuda()
+_is_npu = is_npu()
+
+cached_get_processor = lru_cache(get_processor)
+
+
+class Qwen3_5GatedDeltaNet(nn.Module):
+    def __init__(
+        self,
+        config: Qwen3_5TextConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        alt_stream: Optional[torch.cuda.Stream] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.attn_tp_rank = get_attention_tp_rank()
+        self.attn_tp_size = get_attention_tp_size()
+        self.hidden_size = config.hidden_size
+        self.num_v_heads = config.linear_num_value_heads
+        self.num_k_heads = config.linear_num_key_heads
+        self.head_k_dim = config.linear_key_head_dim
+        self.head_v_dim = config.linear_value_head_dim
+        self.key_dim = self.head_k_dim * self.num_k_heads
+        self.value_dim = self.head_v_dim * self.num_v_heads
+        self.alt_stream = alt_stream
+
+        self.conv_kernel_size = config.linear_conv_kernel_dim
+        self.layer_id = layer_id
+        self.activation = config.hidden_act
+        self.layer_norm_epsilon = config.rms_norm_eps
+
+        # Conv1d layer
+        self.conv_dim = self.key_dim * 2 + self.value_dim
+        self.conv1d = ColumnParallelLinear(
+            input_size=self.conv_kernel_size,
+            output_size=self.conv_dim,
+            bias=False,
+            quant_config=None,
+            tp_rank=self.attn_tp_rank,
+            tp_size=self.attn_tp_size,
+            prefix=add_prefix("conv1d", prefix),
+        )
+        self.conv1d.weight.data = self.conv1d.weight.data.unsqueeze(1)
+
+        # Split projection layers (following vLLM's implementation)
+        # Instead of fused in_proj_qkvz and in_proj_ba, use separate layers
+        self.in_proj_qkv = MergedColumnParallelLinear(
+            input_size=self.hidden_size,
+            output_sizes=[self.key_dim, self.key_dim, self.value_dim],
+            bias=False,
+            quant_config=quant_config,
+            tp_rank=self.attn_tp_rank,
+            tp_size=self.attn_tp_size,
+            prefix=add_prefix("in_proj_qkv", prefix),
+        )
+        self.in_proj_z = ColumnParallelLinear(
+            input_size=self.hidden_size,
+            output_size=self.value_dim,
+            bias=False,
+            quant_config=quant_config,
+            tp_rank=self.attn_tp_rank,
+            tp_size=self.attn_tp_size,
+            prefix=add_prefix("in_proj_z", prefix),
+        )
+        self.in_proj_b = ColumnParallelLinear(
+            input_size=self.hidden_size,
+            output_size=self.num_v_heads,
+            bias=False,
+            quant_config=quant_config,
+            tp_rank=self.attn_tp_rank,
+            tp_size=self.attn_tp_size,
+            prefix=add_prefix("in_proj_b", prefix),
+        )
+        self.in_proj_a = ColumnParallelLinear(
+            input_size=self.hidden_size,
+            output_size=self.num_v_heads,
+            bias=False,
+            quant_config=quant_config,
+            tp_rank=self.attn_tp_rank,
+            tp_size=self.attn_tp_size,
+            prefix=add_prefix("in_proj_a", prefix),
+        )
+
+        # Conv1d weight loader setup
+        query_key_settings = (self.key_dim, 0, False)
+        value_settings = (self.value_dim, 0, False)
+
+        delattr(self.conv1d.weight, "weight_loader")
+        set_weight_attrs(
+            self.conv1d.weight,
+            {
+                "weight_loader": mamba_v2_sharded_weight_loader(
+                    [
+                        query_key_settings,
+                        query_key_settings,
+                        value_settings,
+                    ],
+                    self.attn_tp_size,
+                    self.attn_tp_rank,
+                )
+            },
+        )
+
+        # State parameters
+        self.dt_bias = nn.Parameter(
+            torch.ones(self.num_v_heads // self.attn_tp_size),
+        )
+        self.A_log = nn.Parameter(
+            torch.empty(self.num_v_heads // self.attn_tp_size),
+        )
+
+        set_weight_attrs(self.A_log, {"weight_loader": sharded_weight_loader(0)})
+        set_weight_attrs(self.dt_bias, {"weight_loader": sharded_weight_loader(0)})
+
+        conv_weights = self.conv1d.weight.view(
+            self.conv1d.weight.size(0), self.conv1d.weight.size(2)
+        )
+        # RadixLinearAttention layer
+        self.attn = RadixLinearAttention(
+            layer_id=layer_id,
+            num_q_heads=self.num_k_heads // self.attn_tp_size,
+            num_k_heads=self.num_k_heads // self.attn_tp_size,
+            num_v_heads=self.num_v_heads // self.attn_tp_size,
+            head_q_dim=self.head_k_dim,
+            head_k_dim=self.head_k_dim,
+            head_v_dim=self.head_v_dim,
+            conv_weights=conv_weights,
+            bias=self.conv1d.bias,
+            activation=self.activation,
+            A_log=self.A_log,
+            dt_bias=self.dt_bias,
+        )
+
+        # Normalization layer
+        self.norm = RMSNormGated(
+            self.head_v_dim,
+            eps=self.layer_norm_epsilon,
+            group_size=None,
+            norm_before_gate=True,
+            device=torch.get_device_module().current_device(),
+            dtype=config.torch_dtype,
+        )
+
+        # Output projection
+        self.out_proj = RowParallelLinear(
+            self.value_dim,
+            self.hidden_size,
+            bias=False,
+            input_is_parallel=True,
+            reduce_results=False,
+            quant_config=quant_config,
+            tp_rank=self.attn_tp_rank,
+            tp_size=self.attn_tp_size,
+            prefix=add_prefix("out_proj", prefix),
+        )
+
+    def fix_query_key_value_ordering(
+        self,
+        mixed_qkv,
+        z,
+        b,
+        a,
+    ):
+        raise NotImplementedError(
+            "Qwen3.5 Series dont need to fix query key value ordering"
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ):
+        """
+        Forward pass with three parts:
+        1. Input projection
+        2. Core attention (custom op)
+        3. Output projection
+        """
+        seq_len, _ = hidden_states.shape
+
+        mixed_qkv, _ = self.in_proj_qkv(hidden_states)
+        z, _ = self.in_proj_z(hidden_states)
+        z = z.reshape(z.size(0), -1, self.head_v_dim)
+        b, _ = self.in_proj_b(hidden_states)
+        a, _ = self.in_proj_a(hidden_states)
+
+        b = b.contiguous()
+        a = a.contiguous()
+
+        core_attn_out = self.attn(
+            forward_batch=forward_batch,
+            mixed_qkv=mixed_qkv,
+            a=a,
+            b=b,
+        )
+
+        z_shape_og = z.shape
+        core_attn_out = core_attn_out.reshape(-1, core_attn_out.shape[-1])
+        z = z.reshape(-1, z.shape[-1])
+        core_attn_out = self.norm(core_attn_out, z)
+        core_attn_out = core_attn_out.reshape(z_shape_og)
+        core_attn_out = rearrange(core_attn_out, "... h d -> ... (h d)")
+        output, _ = self.out_proj(core_attn_out)
+        return output
+
+
+class Qwen3_5LinearDecoderLayer(nn.Module):
+    """Qwen3.5 Decoder Layer with Linear Attention (GatedDeltaNet)."""
+
+    def __init__(
+        self,
+        config: Qwen3_5TextConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.layer_id = layer_id
+
+        linear_attn_quant_config = (
+            None
+            if quant_config and quant_config.get_name() == "modelopt_fp4"
+            else quant_config
+        )
+        self.linear_attn = Qwen3_5GatedDeltaNet(
+            config, layer_id, linear_attn_quant_config, alt_stream, prefix
+        )
+
+        # NOTE: Determine the MLP type based on the model type
+        # Qwen3.5 use all layers for MLP / Qwen3.5-MoE use sparse MoE blocks
+        if config.model_type == "qwen3_5_moe_text":
+            self.mlp = Qwen2MoeSparseMoeBlock(
+                layer_id=layer_id,
+                config=config,
+                quant_config=quant_config,
+                alt_stream=alt_stream,
+                prefix=add_prefix("mlp", prefix.replace(".linear_attn", "")),
+            )
+            is_layer_sparse = True
+            is_previous_layer_sparse = True
+            is_next_layer_sparse = True
+        elif config.model_type == "qwen3_5_text":
+            self.mlp = Qwen2MoeMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix.replace(".linear_attn", "")),
+            )
+            is_layer_sparse = False
+            is_previous_layer_sparse = False
+            is_next_layer_sparse = False
+        else:
+            raise ValueError(f"Invalid model type: {config.model_type}")
+
+        self.layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=layer_id,
+            num_layers=config.num_hidden_layers,
+            is_layer_sparse=is_layer_sparse,
+            is_previous_layer_sparse=is_previous_layer_sparse,
+            is_next_layer_sparse=is_next_layer_sparse,
+        )
+
+        self.input_layernorm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = GemmaRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.layer_scatter_modes,
+            input_layernorm=self.input_layernorm,
+            post_attention_layernorm=self.post_attention_layernorm,
+            allow_reduce_scatter=True,
+            is_last_layer=(layer_id == config.num_hidden_layers - 1),
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        residual: Optional[torch.Tensor],
+        **kwargs,
+    ):
+        forward_batch = kwargs.get("forward_batch", None)
+
+        hidden_states, residual = self.layer_communicator.prepare_attn(
+            hidden_states, residual, forward_batch
+        )
+
+        if not forward_batch.forward_mode.is_idle():
+            hidden_states = self.linear_attn(
+                hidden_states,
+                forward_batch,
+            )
+
+        # Fully Connected
+        hidden_states, residual = self.layer_communicator.prepare_mlp(
+            hidden_states, residual, forward_batch
+        )
+
+        use_reduce_scatter = self.layer_communicator.should_use_reduce_scatter(
+            forward_batch
+        )
+
+        should_allreduce_fusion = (
+            self.layer_communicator.should_fuse_mlp_allreduce_with_next_layer(
+                forward_batch
+            )
+        )
+        if isinstance(self.mlp, Qwen2MoeSparseMoeBlock):
+            hidden_states = self.mlp(hidden_states, forward_batch, use_reduce_scatter)
+        else:
+            hidden_states = self.mlp(
+                hidden_states, should_allreduce_fusion, use_reduce_scatter
+            )
+        if should_allreduce_fusion:
+            hidden_states._sglang_needs_allreduce_fusion = True
+        else:
+            hidden_states, residual = self.layer_communicator.postprocess_layer(
+                hidden_states, residual, forward_batch
+            )
+
+        return hidden_states, residual
+
+
+class Qwen3_5AttentionDecoderLayer(nn.Module):
+    """Qwen3.5 Decoder Layer with Full Attention."""
+
+    def __init__(
+        self,
+        config: Qwen3_5TextConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.attn_tp_rank = get_attention_tp_rank()
+        self.attn_tp_size = get_attention_tp_size()
+        self.total_num_heads = config.num_attention_heads
+        assert self.total_num_heads % self.attn_tp_size == 0
+        self.num_heads = self.total_num_heads // self.attn_tp_size
+        self.total_num_kv_heads = config.num_key_value_heads
+        if self.total_num_kv_heads >= self.attn_tp_size:
+            assert self.total_num_kv_heads % self.attn_tp_size == 0
+        else:
+            assert self.attn_tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // self.attn_tp_size)
+        self.head_dim = config.head_dim or (self.hidden_size // self.num_heads)
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+
+        if hasattr(config, "rope_parameters"):
+            self.rope_scaling = getattr(config, "rope_parameters", None)
+        else:
+            self.rope_scaling = getattr(config, "rope_scaling", None)
+
+        self.rope_theta = self.rope_scaling.get("rope_theta", 10000)
+        self.partial_rotary_factor = self.rope_scaling.get("partial_rotary_factor", 1.0)
+        self.layer_id = layer_id
+
+        self.attn_output_gate = getattr(config, "attn_output_gate", True)
+        if self.attn_output_gate:
+            logger.warning_once("using attn output gate!")
+
+        self.rotary_emb = get_rope(
+            head_size=self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=self.max_position_embeddings,
+            rope_scaling=self.rope_scaling,
+            base=self.rope_theta,
+            partial_rotary_factor=self.partial_rotary_factor,
+            is_neox_style=True,
+            dtype=torch.get_default_dtype(),
+        )
+
+        attn_quant_config = (
+            None
+            if quant_config and quant_config.get_name() == "modelopt_fp4"
+            else quant_config
+        )
+
+        self.qkv_proj = QKVParallelLinear(
+            config.hidden_size,
+            self.head_dim,
+            self.total_num_heads * (1 + self.attn_output_gate),
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=attn_quant_config,
+            tp_rank=self.attn_tp_rank,
+            tp_size=self.attn_tp_size,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            config.hidden_size,
+            bias=False,
+            quant_config=attn_quant_config,
+            reduce_results=False,
+            tp_rank=self.attn_tp_rank,
+            tp_size=self.attn_tp_size,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            prefix=f"{prefix}.attn",
+        )
+
+        # Dense MLP for non-MoE variant
+        if config.model_type == "qwen3_5_text":
+            self.mlp = Qwen2MoeMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix.replace(".self_attn", "")),
+            )
+            is_layer_sparse = False
+            is_previous_layer_sparse = False
+            is_next_layer_sparse = False
+        elif config.model_type == "qwen3_5_moe_text":
+            self.mlp = Qwen2MoeSparseMoeBlock(
+                layer_id=layer_id,
+                config=config,
+                quant_config=quant_config,
+                alt_stream=alt_stream,
+                prefix=add_prefix("mlp", prefix.replace(".self_attn", "")),
+            )
+            is_layer_sparse = True
+            is_previous_layer_sparse = True
+            is_next_layer_sparse = True
+        else:
+            raise ValueError(f"Invalid model type: {config.model_type}")
+
+        self.layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=layer_id,
+            num_layers=config.num_hidden_layers,
+            is_layer_sparse=is_layer_sparse,
+            is_previous_layer_sparse=is_previous_layer_sparse,
+            is_next_layer_sparse=is_next_layer_sparse,
+        )
+
+        self.input_layernorm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = GemmaRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+        self.q_norm = GemmaRMSNorm(self.head_dim, eps=config.rms_norm_eps)
+        self.k_norm = GemmaRMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+        self.layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.layer_scatter_modes,
+            input_layernorm=self.input_layernorm,
+            post_attention_layernorm=self.post_attention_layernorm,
+            allow_reduce_scatter=True,
+            is_last_layer=(layer_id == config.num_hidden_layers - 1),
+        )
+
+        self.alt_stream = alt_stream
+
+    def _apply_qk_norm(
+        self, q: torch.Tensor, k: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Apply Q/K normalization with optional alt_stream overlap."""
+        if self.alt_stream is not None and get_is_capture_mode():
+            current_stream = torch.cuda.current_stream()
+            self.alt_stream.wait_stream(current_stream)
+            q_by_head = q.reshape(-1, self.head_dim)
+            q_by_head = self.q_norm(q_by_head)
+            with torch.cuda.stream(self.alt_stream):
+                k_by_head = k.reshape(-1, self.head_dim)
+                k_by_head = self.k_norm(k_by_head)
+            current_stream.wait_stream(self.alt_stream)
+        else:
+            q_by_head = q.reshape(-1, self.head_dim)
+            q_by_head = self.q_norm(q_by_head)
+            k_by_head = k.reshape(-1, self.head_dim)
+            k_by_head = self.k_norm(k_by_head)
+        q = q_by_head.view(q.shape)
+        k = k_by_head.view(k.shape)
+        return q, k
+
+    def self_attention(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        """Full attention forward pass."""
+        qkv, _ = self.qkv_proj(hidden_states)
+
+        if self.attn_output_gate:
+            q_gate, k, v = qkv.split(
+                [self.q_size * 2, self.kv_size, self.kv_size], dim=-1
+            )
+            orig_shape = q_gate.shape[:-1]
+            q_gate = q_gate.view(*orig_shape, self.num_heads, -1)
+            q, gate = torch.chunk(q_gate, 2, dim=-1)
+            q = q.reshape(*orig_shape, -1)
+            gate = gate.reshape(*orig_shape, -1)
+        else:
+            q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+
+        q, k = self._apply_qk_norm(q, k)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+
+        if self.attn_output_gate:
+            gate = torch.sigmoid(gate)
+            attn_output = attn_output * gate
+
+        output, _ = self.o_proj(attn_output)
+        return output
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: Optional[torch.Tensor],
+        forward_batch: ForwardBatch,
+        **kwargs,
+    ):
+        hidden_states, residual = self.layer_communicator.prepare_attn(
+            hidden_states, residual, forward_batch
+        )
+
+        if not forward_batch.forward_mode.is_idle():
+            hidden_states = self.self_attention(
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+            )
+
+        # Fully Connected
+        hidden_states, residual = self.layer_communicator.prepare_mlp(
+            hidden_states, residual, forward_batch
+        )
+        use_reduce_scatter = self.layer_communicator.should_use_reduce_scatter(
+            forward_batch
+        )
+
+        should_allreduce_fusion = (
+            self.layer_communicator.should_fuse_mlp_allreduce_with_next_layer(
+                forward_batch
+            )
+        )
+        if isinstance(self.mlp, Qwen2MoeSparseMoeBlock):
+            hidden_states = self.mlp(hidden_states, forward_batch, use_reduce_scatter)
+        else:
+            hidden_states = self.mlp(
+                hidden_states, should_allreduce_fusion, use_reduce_scatter
+            )
+        if should_allreduce_fusion:
+            hidden_states._sglang_needs_allreduce_fusion = True
+        else:
+            hidden_states, residual = self.layer_communicator.postprocess_layer(
+                hidden_states, residual, forward_batch
+            )
+
+        return hidden_states, residual
+
+
+ALL_DECODER_LAYER_TYPES = {
+    "attention": Qwen3_5AttentionDecoderLayer,
+    "linear_attention": Qwen3_5LinearDecoderLayer,
+}
+
+
+class Qwen3_5ForCausalLM(nn.Module):
+    """Qwen3.5 Model with support for dense variant."""
+
+    def __init__(
+        self,
+        config: Qwen3_5TextConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.pp_group = get_pp_group()
+
+        alt_stream = torch.cuda.Stream() if _is_cuda else None
+
+        # Embedding layer
+        if self.pp_group.is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                org_num_embeddings=config.vocab_size,
+                enable_tp=not is_dp_attention_enabled(),
+            )
+
+        # Decoder layers
+        def get_layer(idx: int, prefix: str):
+            layer_type = config.layers_block_type[idx]
+            layer_class = ALL_DECODER_LAYER_TYPES[layer_type]
+            if layer_type == "attention":
+                prefix = add_prefix("self_attn", prefix)
+            else:
+                prefix = add_prefix("linear_attn", prefix)
+            return layer_class(
+                config=config,
+                layer_id=idx,
+                quant_config=quant_config,
+                prefix=prefix,
+                alt_stream=alt_stream,
+            )
+
+        self.layers = make_layers(
+            config.num_hidden_layers,
+            get_layer,
+            prefix=f"{prefix}.layers",
+        )
+
+        # Final normalization
+        if self.pp_group.is_last_rank:
+            self.norm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.embed_tokens
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: Optional[torch.Tensor] = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+        input_deepstack_embeds: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, PPProxyTensors]:
+        # Initialize hidden states
+        if self.pp_group.is_first_rank:
+            if input_embeds is None:
+                hidden_states = self.embed_tokens(input_ids)
+            else:
+                hidden_states = input_embeds
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+
+        # Pass through decoder layers
+        for layer_idx in range(len(self.layers)):
+            layer = self.layers[layer_idx]
+            with get_global_expert_distribution_recorder().with_current_layer(
+                layer_idx
+            ):
+                hidden_states, residual = layer(
+                    positions=positions,
+                    hidden_states=hidden_states,
+                    residual=residual,
+                    forward_batch=forward_batch,
+                )
+
+            # Process deepstack embeddings if provided
+            if (
+                input_deepstack_embeds is not None
+                and input_deepstack_embeds.numel() > 0
+                and layer_idx < 3
+            ):
+                sep = self.hidden_size * layer_idx
+                hidden_states.add_(
+                    input_deepstack_embeds[:, sep : sep + self.hidden_size]
+                )
+
+        # Return intermediate tensors for pipeline parallelism
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors(
+                {
+                    "hidden_states": hidden_states,
+                    "residual": residual,
+                }
+            )
+
+        # Apply final normalization
+        if hidden_states.shape[0] != 0:
+            if residual is None:
+                hidden_states = self.norm(hidden_states)
+            else:
+                hidden_states, _ = self.norm(hidden_states, residual)
+
+        return hidden_states
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        loaded_params: Set[str] = set()
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if "mtp" in name:
+                continue
+            if "visual" in name:
+                continue
+            if "language_model" in name:
+                name = name.replace(r"model.language_model.", r"model.")
+            if ".self_attn." in name:
+                name = name.replace(".self_attn", "")
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+
+                if "mlp.experts" in name:
+                    continue
+
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Skip layers on other devices.
+                # if is_pp_missing_parameter(name, self):
+                #     continue
+                if name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader")
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    logger.warning(f"Parameter {name} not found in params_dict")
+                    continue
+                param = params_dict[name]
+
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+
+class Qwen3_5MoeForCausalLM(Qwen3_5ForCausalLM):
+    def __init__(
+        self,
+        config: Qwen3_5TextConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(config=config, quant_config=quant_config, prefix=prefix)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.num_experts,
+        )
+
+        # Skip loading extra parameters for GPTQ/modelopt models.
+        ignore_suffixes = (
+            ".bias",
+            "_bias",
+            ".k_scale",
+            "_k_scale",
+            ".v_scale",
+            "_v_scale",
+            ".weight_scale",
+            "_weight_scale",
+            ".input_scale",
+            "_input_scale",
+        )
+
+        is_fused_expert = False
+        fused_expert_params_mapping = [
+            ("experts.w13_weight", "experts.gate_up_proj", 0, "w1"),
+            ("experts.w2_weight", "experts.down_proj", 0, "w2"),
+        ]
+
+        num_experts = self.config.num_experts
+
+        def load_fused_expert_weights(
+            name: str,
+            params_dict: dict,
+            loaded_weight: torch.Tensor,
+            shard_id: str,
+            num_experts: int,
+        ):
+            param = params_dict[name]
+            weight_loader = param.weight_loader
+            # let ep moe layer to gracefully handle expert_ids that do not belong to local moe rank
+            for expert_id in range(num_experts):
+                curr_expert_weight = loaded_weight[expert_id]
+                weight_loader(
+                    param,
+                    curr_expert_weight,
+                    name,
+                    shard_id,
+                    expert_id,
+                )
+            return True
+
+        loaded_params: Set[str] = set()
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if "mtp" in name:
+                continue
+            if "visual" in name:
+                continue
+            if "language_model" in name:
+                name = name.replace(r"model.language_model.", r"model.")
+            if ".self_attn." in name:
+                name = name.replace(".self_attn", "")
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if "experts.gate_up_proj" in name or "experts.down_proj" in name:
+                    is_fused_expert = True
+                    expert_params_mapping = fused_expert_params_mapping
+
+                # Skip non-stacked layers and experts (experts handled below).
+                if weight_name not in name:
+                    continue
+
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if "mlp.experts" in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra parameters for GPTQ/modelopt models.
+                if name.endswith(ignore_suffixes) and name not in params_dict:
+                    continue
+
+                if name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Track if this is an expert weight to enable early skipping
+                is_expert_weight = False
+
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    # Anyway, this is an expert weight and should not be
+                    # attempted to load as other weights later
+                    is_expert_weight = True
+                    name_mapped = name.replace(weight_name, param_name)
+                    if is_fused_expert:
+                        if "experts.gate_up_proj" in name:
+                            loaded_weight = loaded_weight.chunk(2, dim=-2)
+                            load_fused_expert_weights(
+                                name_mapped,
+                                params_dict,
+                                loaded_weight[0],
+                                "w1",
+                                num_experts,
+                            )
+                            load_fused_expert_weights(
+                                name_mapped,
+                                params_dict,
+                                loaded_weight[1],
+                                "w3",
+                                num_experts,
+                            )
+                        else:
+                            load_fused_expert_weights(
+                                name_mapped,
+                                params_dict,
+                                loaded_weight,
+                                shard_id,
+                                num_experts,
+                            )
+                    else:
+                        # Skip loading extra parameters for GPTQ/modelopt models.
+                        if (
+                            name_mapped.endswith(ignore_suffixes)
+                            and name_mapped not in params_dict
+                        ):
+                            continue
+                        param = params_dict[name_mapped]
+                        # We should ask the weight loader to return success or
+                        # not here since otherwise we may skip experts with
+                        # # other available replicas.
+                        weight_loader = param.weight_loader
+                        weight_loader(
+                            param,
+                            loaded_weight,
+                            name_mapped,
+                            shard_id=shard_id,
+                            expert_id=expert_id,
+                        )
+                    name = name_mapped
+                    break
+                else:
+                    if is_expert_weight:
+                        # This is an expert weight but not mapped to this rank, skip all remaining processing
+                        continue
+
+                    # Skip loading extra parameters for GPTQ/modelopt models.
+                    if name.endswith(ignore_suffixes) and name not in params_dict:
+                        continue
+
+                    if name in params_dict.keys():
+                        param = params_dict[name]
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        weight_loader(param, loaded_weight)
+                    else:
+                        logger.warning(f"Parameter {name} not found in params_dict")
+            loaded_params.add(name)
+
+        return loaded_params
+
+
+class Qwen3_5ForConditionalGeneration(Qwen3VLForConditionalGeneration):
+    def __init__(
+        self,
+        config: Qwen3_5Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        language_model_cls=Qwen3_5ForCausalLM,
+    ):
+        super().__init__(config, quant_config, prefix, language_model_cls)
+
+        rope_config = getattr(self.config, "rope_parameters", None) or getattr(
+            self.config, "rope_scaling", {}
+        )
+        self.is_mrope_enabled = "mrope_section" in rope_config
+
+        self.deepstack_visual_indexes = self.visual.deepstack_visual_indexes
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        del self.lm_head.weight
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        loaded_params: Set[str] = set()
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if "mtp" in name:
+                continue
+            if "language_model" in name:
+                name = name.replace(r"model.language_model.", r"model.")
+            if ".self_attn." in name:
+                name = name.replace(".self_attn", "")
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+
+                if "visual" in name or "mlp.experts" in name:
+                    continue
+
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Skip layers on other devices.
+                # if is_pp_missing_parameter(name, self):
+                #     continue
+                if name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader")
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if "visual" in name:
+                    # adapt to VisionAttention
+                    name = name.replace(r"attn.qkv.", r"attn.qkv_proj.")
+                    name = name.replace(r"model.visual.", r"visual.")
+
+                # print(name, loaded_weight.shape)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    logger.warning(f"Parameter {name} not found in params_dict")
+                    continue
+                param = params_dict[name]
+
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+
+class Qwen3_5MoeForConditionalGeneration(Qwen3VLForConditionalGeneration):
+    """Qwen3.5 MoE Vision-Language Model."""
+
+    def __init__(
+        self,
+        config: Qwen3_5MoeConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        language_model_cls=Qwen3_5MoeForCausalLM,
+    ) -> None:
+        super().__init__(config, quant_config, prefix, language_model_cls)
+        rope_config = getattr(self.config, "rope_parameters", None) or getattr(
+            self.config, "rope_scaling", {}
+        )
+        self.is_mrope_enabled = "mrope_section" in rope_config
+
+        self.deepstack_visual_indexes = self.visual.deepstack_visual_indexes
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        del self.lm_head.weight
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.num_experts,
+        )
+
+        # Skip loading extra parameters for GPTQ/modelopt models.
+        ignore_suffixes = (
+            ".bias",
+            "_bias",
+            ".k_scale",
+            "_k_scale",
+            ".v_scale",
+            "_v_scale",
+            "_weight_scale",
+            "_input_scale",
+        )
+
+        is_fused_expert = False
+        fused_expert_params_mapping = [
+            ("experts.w13_weight", "experts.gate_up_proj", 0, "w1"),
+            ("experts.w2_weight", "experts.down_proj", 0, "w2"),
+        ]
+
+        num_experts = self.config.num_experts
+
+        def load_fused_expert_weights(
+            name: str,
+            params_dict: dict,
+            loaded_weight: torch.Tensor,
+            shard_id: str,
+            num_experts: int,
+        ):
+            param = params_dict[name]
+            weight_loader = param.weight_loader
+            # let ep moe layer to gracefully handle expert_ids that do not belong to local moe rank
+            for expert_id in range(num_experts):
+                curr_expert_weight = loaded_weight[expert_id]
+                weight_loader(
+                    param,
+                    curr_expert_weight,
+                    name,
+                    shard_id,
+                    expert_id,
+                )
+            return True
+
+        loaded_params: Set[str] = set()
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if "mtp" in name:
+                continue
+            if "language_model" in name:
+                name = name.replace(r"model.language_model.", r"model.")
+            if ".self_attn." in name:
+                name = name.replace(".self_attn", "")
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if name.endswith("experts.gate_up_proj") or name.endswith(
+                    "experts.down_proj"
+                ):
+                    is_fused_expert = True
+                    expert_params_mapping = fused_expert_params_mapping
+
+                # Skip non-stacked layers and experts (experts handled below).
+                if weight_name not in name:
+                    continue
+                if "visual" in name:
+                    continue
+
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if "mlp.experts" in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra parameters for GPTQ/modelopt models.
+                if name.endswith(ignore_suffixes) and name not in params_dict:
+                    continue
+
+                if name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Track if this is an expert weight to enable early skipping
+                is_expert_weight = False
+
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    if "visual" in name or self.config.encoder_only:
+                        continue
+                    # Anyway, this is an expert weight and should not be
+                    # attempted to load as other weights later
+                    is_expert_weight = True
+                    name_mapped = name.replace(weight_name, param_name)
+                    if is_fused_expert:
+                        if "experts.gate_up_proj" in name:
+                            loaded_weight = loaded_weight.chunk(2, dim=-2)
+                            load_fused_expert_weights(
+                                name_mapped,
+                                params_dict,
+                                loaded_weight[0],
+                                "w1",
+                                num_experts,
+                            )
+                            load_fused_expert_weights(
+                                name_mapped,
+                                params_dict,
+                                loaded_weight[1],
+                                "w3",
+                                num_experts,
+                            )
+                        else:
+                            load_fused_expert_weights(
+                                name_mapped,
+                                params_dict,
+                                loaded_weight,
+                                shard_id,
+                                num_experts,
+                            )
+                    else:
+                        # Skip loading extra parameters for GPTQ models.
+                        if (
+                            name_mapped.endswith(ignore_suffixes)
+                            and name_mapped not in params_dict
+                        ):
+                            continue
+                        param = params_dict[name_mapped]
+                        # We should ask the weight loader to return success or
+                        # not here since otherwise we may skip experts with
+                        # # other available replicas.
+                        weight_loader = param.weight_loader
+                        weight_loader(
+                            param,
+                            loaded_weight,
+                            name_mapped,
+                            shard_id=shard_id,
+                            expert_id=expert_id,
+                        )
+                    name = name_mapped
+                    break
+                else:
+                    if is_expert_weight:
+                        # This is an expert weight but not mapped to this rank, skip all remaining processing
+                        continue
+
+                    if "visual" in name:
+                        # adapt to VisionAttention
+                        name = name.replace(r"attn.qkv.", r"attn.qkv_proj.")
+                        name = name.replace(r"model.visual.", r"visual.")
+
+                    # Skip loading extra parameters for GPTQ/modelopt models.
+                    if name.endswith(ignore_suffixes) and name not in params_dict:
+                        continue
+
+                    if name in params_dict.keys():
+                        param = params_dict[name]
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        weight_loader(param, loaded_weight)
+                    else:
+                        logger.warning(f"Parameter {name} not found in params_dict")
+            loaded_params.add(name)
+
+        return loaded_params
+
+    @classmethod
+    def get_model_config_for_expert_location(cls, config):
+        text_config = getattr(config, "text_config", config)
+        return ModelConfigForExpertLocation(
+            num_layers=text_config.num_hidden_layers,
+            num_logical_experts=text_config.num_experts,
+            num_groups=None,
+        )
+
+
+EntryClass = [Qwen3_5MoeForConditionalGeneration, Qwen3_5ForConditionalGeneration]
diff --git a/sglang/python/sglang/srt/models/qwen3_5_mtp.py b/sglang/python/sglang/srt/models/qwen3_5_mtp.py
new file mode 100644
index 0000000000000000000000000000000000000000..97d02274b0d377a8007a8c14a249c92f638a97dd
--- /dev/null
+++ b/sglang/python/sglang/srt/models/qwen3_5_mtp.py
@@ -0,0 +1,347 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Inference-only Qwen3_5 MTP model."""
+
+import logging
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from sglang.srt.layers.layernorm import GemmaRMSNorm
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.qwen3_5 import Qwen3_5ForCausalLM
+from sglang.srt.utils import add_prefix
+
+logger = logging.getLogger(__name__)
+
+
+class Qwen3_5ForCausalLMMTP(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config=None,
+        prefix: str = "",
+    ) -> None:
+        nn.Module.__init__(self)
+
+        self.is_multimodal = hasattr(config, "text_config")
+        if self.is_multimodal:
+            config = config.text_config
+
+        # The MTP model is unquantized in the nvfp4 checkpoint.
+        if quant_config and quant_config.get_name() == "modelopt_fp4":
+            quant_config = None
+
+        self.config = config
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.quant_config = quant_config
+        self.pp_group = get_pp_group()
+
+        self.fc = nn.Linear(2 * config.hidden_size, config.hidden_size, bias=False)
+        RMSNorm_cls = GemmaRMSNorm
+        self.pre_fc_norm_embedding = RMSNorm_cls(
+            config.hidden_size, config.rms_norm_eps
+        )
+        self.pre_fc_norm_hidden = RMSNorm_cls(config.hidden_size, config.rms_norm_eps)
+        config.num_hidden_layers = 1
+        config.full_attention_interval = 1
+        self.model = Qwen3_5ForCausalLM(
+            config,
+            quant_config,
+            prefix=add_prefix("mtp", prefix),
+        )
+
+        if get_pp_group().is_last_rank:
+            if config.tie_word_embeddings:
+                self.lm_head = self.model.embed_tokens
+            else:
+                self.lm_head = ParallelLMHead(
+                    config.vocab_size,
+                    config.hidden_size,
+                    quant_config=quant_config,
+                    prefix=add_prefix("lm_head", prefix),
+                )
+
+        self.logits_processor = LogitsProcessor(config)
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        if not self.config.tie_word_embeddings:
+            del self.lm_head.weight
+
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: Optional[torch.Tensor] = None,
+        **kwargs,
+    ):
+        assert input_embeds is None
+        input_embeds = forward_batch.mm_input_embeds
+        if (
+            forward_batch.forward_mode.is_extend()
+            and forward_batch.contains_mm_inputs()
+            and not forward_batch.forward_mode.is_draft_extend(include_v2=True)
+        ):
+            assert input_embeds is not None
+            input_embeds = torch.cat(
+                [input_embeds[:-1], self.model.embed_tokens(input_ids[-1].unsqueeze(0))]
+            )
+
+        if input_embeds is None:
+            input_embeds = self.model.embed_tokens(input_ids)
+
+        hidden_states = forward_batch.spec_info.hidden_states
+
+        if not forward_batch.forward_mode.is_idle():
+            input_embeds = self.pre_fc_norm_embedding(input_embeds)
+            hidden_states = self.pre_fc_norm_hidden(hidden_states)
+        hidden_states = torch.cat([input_embeds, hidden_states], dim=-1)
+
+        hidden_states = self.fc(hidden_states)
+
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            hidden_states,
+        )
+
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(
+        self, weights: Iterable[Tuple[str, torch.Tensor]], is_mtp: bool = False
+    ):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        # Params for MoE experts (non-fused/fused)
+        num_experts = getattr(self.config, "num_experts", None)
+        if num_experts is not None:
+            expert_params_mapping = FusedMoE.make_expert_params_mapping(
+                ckpt_gate_proj_name="gate_proj",
+                ckpt_down_proj_name="down_proj",
+                ckpt_up_proj_name="up_proj",
+                num_experts=num_experts,
+            )
+        else:
+            expert_params_mapping = []
+
+        # Skip loading extra parameters for GPTQ/modelopt models.
+        ignore_suffixes = (
+            ".bias",
+            "_bias",
+            ".k_scale",
+            "_k_scale",
+            ".v_scale",
+            "_v_scale",
+            ".weight_scale",
+            "_weight_scale",
+            ".input_scale",
+            "_input_scale",
+        )
+
+        # fused experts: experts.w13_weight / experts.w2_weight
+        is_fused_expert = False
+        fused_expert_params_mapping = [
+            ("experts.w13_weight", "experts.gate_up_proj", 0, "w1"),
+            ("experts.w2_weight", "experts.down_proj", 0, "w2"),
+        ]
+
+        def load_fused_expert_weights(
+            name: str,
+            params_dict: dict,
+            loaded_weight: torch.Tensor,
+            shard_id: str,
+            num_experts: int,
+        ):
+            param = params_dict[name]
+            weight_loader = param.weight_loader
+            # Let EP MoE layer handle expert_ids that do not belong to local moe rank
+            for expert_id in range(num_experts):
+                curr_expert_weight = loaded_weight[expert_id]
+                weight_loader(
+                    param,
+                    curr_expert_weight,
+                    name,
+                    shard_id,
+                    expert_id,
+                )
+            return True
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            # Only process MTP branch weights
+            if "mtp" not in name:
+                continue
+
+            if name.startswith("mtp."):
+                # Remove the mtp. prefix for processing
+                name = name.replace("mtp.", "model.")
+
+                name = name.replace("model.fc", "fc")
+                name = name.replace("model.pre_fc", "pre_fc")
+
+            if ".self_attn." in name:
+                name = name.replace(".self_attn", "")
+
+            # 1) Process stacked parameters (q_proj/k_proj/v_proj & gate_proj/up_proj)
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                # Check if this is a fused expert weight
+                if "experts.gate_up_proj" in name or "experts.down_proj" in name:
+                    is_fused_expert = True
+                    expert_params_mapping = fused_expert_params_mapping
+
+                # Skip non-matching weights
+                if weight_name not in name:
+                    continue
+
+                # Skip MoE experts.* here, handled separately below
+                if "mlp.experts" in name:
+                    continue
+
+                name_mapped = name.replace(weight_name, param_name)
+
+                # Skip loading extra parameters for GPTQ/modelopt models.
+                if (
+                    name_mapped.endswith(ignore_suffixes)
+                    and name_mapped not in params_dict
+                ):
+                    continue
+
+                if name_mapped not in params_dict:
+                    continue
+
+                param = params_dict[name_mapped]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight, shard_id)
+                name = name_mapped
+                break
+            else:
+                # 2) Process MoE expert weights (including fused experts)
+                is_expert_weight = False
+
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+
+                    is_expert_weight = True
+                    name_mapped = name.replace(weight_name, param_name)
+
+                    # Fused experts: single checkpoint weight contains multiple experts
+                    if is_fused_expert and num_experts is not None:
+                        if "experts.gate_up_proj" in name:
+                            # gate_up_proj fused: split into w1 / w3
+                            loaded_w1, loaded_w3 = loaded_weight.chunk(2, dim=-2)
+                            load_fused_expert_weights(
+                                name_mapped,
+                                params_dict,
+                                loaded_w1,
+                                "w1",
+                                num_experts,
+                            )
+                            load_fused_expert_weights(
+                                name_mapped,
+                                params_dict,
+                                loaded_w3,
+                                "w3",
+                                num_experts,
+                            )
+                        else:
+                            # down_proj fused: distribute entire weight
+                            load_fused_expert_weights(
+                                name_mapped,
+                                params_dict,
+                                loaded_weight,
+                                shard_id,
+                                num_experts,
+                            )
+                    else:
+                        # Non-fused expert, load by expert_id/shard
+                        if (
+                            name_mapped.endswith(ignore_suffixes)
+                            and name_mapped not in params_dict
+                        ):
+                            continue
+                        if name_mapped not in params_dict:
+                            break
+                        param = params_dict[name_mapped]
+                        weight_loader = param.weight_loader
+                        weight_loader(
+                            param,
+                            loaded_weight,
+                            name_mapped,
+                            shard_id=shard_id,
+                            expert_id=expert_id,
+                        )
+                    name = name_mapped
+                    break
+                else:
+                    # Skip expert weight if not handled by current rank
+                    if is_expert_weight:
+                        continue
+
+                    # 3) Regular non-stacked / non-expert parameters, use default loader
+                    if name.endswith(ignore_suffixes) and name not in params_dict:
+                        continue
+
+                    if name in params_dict:
+                        param = params_dict[name]
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        weight_loader(param, loaded_weight)
+                    else:
+                        logger.warning_once(
+                            f"Parameter {name} not found in params_dict, skip loading"
+                        )
+
+            loaded_params.add(name)
+        return loaded_params
+
+
+EntryClass = [Qwen3_5ForCausalLMMTP]
diff --git a/sglang/python/sglang/srt/models/qwen3_classification.py b/sglang/python/sglang/srt/models/qwen3_classification.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6eff513de329d7d9cfff67bcbf6c55984e20d0c
--- /dev/null
+++ b/sglang/python/sglang/srt/models/qwen3_classification.py
@@ -0,0 +1,144 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import logging
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import Qwen2Config  # Qwen3 uses Qwen2Config
+
+from sglang.srt.layers.pooler import EmbeddingPoolerOutput, Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.qwen3 import Qwen3Model
+from sglang.srt.utils import add_prefix
+
+logger = logging.getLogger(__name__)
+
+
+class Qwen3ForPooledOutput(nn.Module):
+    """Base class for Qwen3 models that produce pooled output (classification, reward).
+
+    Subclasses should set self.score and self.pooler in their __init__.
+    """
+
+    def __init__(
+        self,
+        config: Qwen2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = Qwen3Model(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.eos_token_id = config.eos_token_id
+        # Subclasses must set self.score and self.pooler
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: Optional[torch.Tensor] = None,
+        get_embedding: bool = True,
+    ) -> EmbeddingPoolerOutput:
+        assert get_embedding, f"{self.__class__.__name__} is only used for embedding"
+
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        logits = self.score(hidden_states)
+        pooled_logits = self.pooler(logits, forward_batch).embeddings
+
+        return EmbeddingPoolerOutput(pooled_logits)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            # Skip lm_head weights (pooled output models don't have lm_head)
+            if name.startswith("lm_head"):
+                continue
+
+            # Skip rotary embeddings and other non-parameter tensors
+            if "rotary_emb.inv_freq" in name or "projector" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                continue
+
+            # Handle stacked parameters (qkv_proj, gate_up_proj)
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                if name in params_dict:
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+                else:
+                    logger.warning(f"Parameter {name} not found in params_dict")
+
+
+class Qwen3ForSequenceClassification(Qwen3ForPooledOutput):
+    def __init__(
+        self,
+        config: Qwen2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(config, quant_config, prefix)
+        self.score = nn.Linear(config.hidden_size, config.num_labels)
+        # Use normalize=True for qwen3 embedding based on official implementation
+        # Reference: https://github.com/QwenLM/Qwen3-Embedding/blob/main/examples/qwen3_embedding_transformers.py#L55
+        # Official code: output = F.normalize(output, p=2, dim=1)
+        normalize = True
+
+        # We don't want to normalize the embedding if we have a classification head
+        if config.id2label is not None or config.label2id is not None:
+            normalize = False
+
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=normalize)
+
+
+EntryClass = [
+    Qwen3ForSequenceClassification,
+]
diff --git a/sglang/python/sglang/srt/models/qwen3_moe.py b/sglang/python/sglang/srt/models/qwen3_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..845502fe2e88a55e88335e0c66131a6cfa8af3b9
--- /dev/null
+++ b/sglang/python/sglang/srt/models/qwen3_moe.py
@@ -0,0 +1,1154 @@
+# Adapted from qwen2_moe.py
+
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+
+"""Inference-only Qwen3MoE model compatible with HuggingFace weights."""
+
+import logging
+import math
+from typing import Any, Dict, Iterable, List, Optional, Tuple, TypeVar
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import (
+    get_moe_expert_parallel_world_size,
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation
+from sglang.srt.eplb.expert_location_dispatch import ExpertLocationDispatchInfo
+from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
+from sglang.srt.layers.dp_attention import get_attention_tp_rank, get_attention_tp_size
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe import (
+    get_moe_a2a_backend,
+    should_use_flashinfer_cutlass_moe_fp4_allgather,
+)
+from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
+from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.moe.utils import (
+    RoutingMethodType,
+    filter_moe_weight_param_global_expert,
+)
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import MRotaryEmbedding, get_rope
+from sglang.srt.layers.utils import get_layer_id
+from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.qwen2_moe import Qwen2MoeMLP as Qwen3MoeMLP
+from sglang.srt.models.qwen2_moe import Qwen2MoeModel
+from sglang.srt.models.utils import (
+    apply_qk_norm,
+    create_fused_set_kv_buffer_arg,
+    enable_fused_set_kv_buffer,
+)
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import (
+    LazyValue,
+    add_prefix,
+    is_cuda,
+    is_flashinfer_available,
+    is_non_idle_and_non_empty,
+    is_npu,
+)
+
+_is_cuda = is_cuda()
+
+if _is_cuda:
+    from sgl_kernel import fused_qk_norm_rope
+
+TConfig = TypeVar("TConfig", bound=PretrainedConfig)
+
+Qwen3MoeConfig = None
+
+_is_flashinfer_available = is_flashinfer_available()
+
+logger = logging.getLogger(__name__)
+_is_cuda = is_cuda()
+_is_npu = is_npu()
+
+if _is_npu:
+    from sgl_kernel_npu.norm.split_qkv_rmsnorm_rope import split_qkv_rmsnorm_rope
+
+
+def compute_yarn_parameters(
+    config: PretrainedConfig,
+) -> tuple[float, float, float, float]:
+    """
+    Refer to https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_rope_utils.py#L197C1-L288C1
+    Computes the inverse frequencies with NTK scaling. Please refer to the
+    [original paper](https://huggingface.co/papers/2309.00071)
+    Args:
+        config ([`~transformers.PretrainedConfig`]):
+            The model configuration.
+    Returns:
+        factor: float, the scaling factor for the RoPE embeddings
+        low: float, the lower bound of the dimension range
+        high: float, the upper bound of the dimension range
+        attention_factor: float, the post-processing scaling factor applied to the computed cos/sin
+    """
+
+    # The config does not contain rope_scaling, which means the model is not using yarn
+    rope_scaling = getattr(config, "rope_scaling", None)
+    if rope_scaling is None:
+        return 1.0, 0, 0, 1.0
+
+    base = config.rope_theta
+    partial_rotary_factor = (
+        config.partial_rotary_factor
+        if hasattr(config, "partial_rotary_factor")
+        else 1.0
+    )
+    head_dim = getattr(
+        config, "head_dim", config.hidden_size // config.num_attention_heads
+    )
+    dim = int(head_dim * partial_rotary_factor)
+    factor = getattr(rope_scaling, "factor", 1.0)
+    attention_factor = rope_scaling.get("attention_factor")
+    mscale = rope_scaling.get("mscale")
+    mscale_all_dim = rope_scaling.get("mscale_all_dim")
+
+    if "original_max_position_embeddings" in rope_scaling:
+        original_max_position_embeddings = rope_scaling[
+            "original_max_position_embeddings"
+        ]
+        factor = config.max_position_embeddings / original_max_position_embeddings
+    else:
+        original_max_position_embeddings = config.max_position_embeddings
+
+    def get_mscale(scale, mscale=1):
+        if scale <= 1:
+            return 1.0
+        return 0.1 * mscale * math.log(scale) + 1.0
+
+    # Sets the attention factor as suggested in the paper
+    if attention_factor is None:
+        if mscale and mscale_all_dim:
+            attention_factor = float(
+                get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dim)
+            )
+        else:
+            attention_factor = get_mscale(factor)
+
+    # Optional config options
+    # beta_fast/beta_slow: as suggested in the paper, default to 32/1 (correspondingly)
+    beta_fast = rope_scaling.get("beta_fast") or 32
+    beta_slow = rope_scaling.get("beta_slow") or 1
+
+    # Compute the inverse frequencies
+    def find_correction_dim(num_rotations, dim, base, max_position_embeddings):
+        """Inverse dimension formula to find the dimension based on the number of rotations"""
+        return (
+            dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))
+        ) / (2 * math.log(base))
+
+    def find_correction_range(
+        low_rot, high_rot, dim, base, max_position_embeddings, truncate
+    ):
+        """Find dimension range bounds based on rotations"""
+        low = find_correction_dim(low_rot, dim, base, max_position_embeddings)
+        high = find_correction_dim(high_rot, dim, base, max_position_embeddings)
+        if truncate:
+            low = math.floor(low)
+            high = math.ceil(high)
+        return max(low, 0), min(high, dim - 1)
+
+    truncate = rope_scaling.get("truncate", True)
+    low, high = find_correction_range(
+        beta_fast, beta_slow, dim, base, original_max_position_embeddings, truncate
+    )
+
+    # These parts are implemented in the fusedQKNormRopeKernel.cu
+    # # def linear_ramp_factor(min, max, dim):
+    # #     if min == max:
+    # #         max += 0.001  # Prevent singularity
+
+    # #     linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
+    # #     ramp_func = torch.clamp(linear_func, 0, 1)
+    # #     return ramp_func
+
+    # # Note on variable naming: "interpolation" comes from the original technique, where we interpolate the position IDs
+    # # to expand the possible context length. In other words, interpolation = apply scaling factor.
+    # # pos_freqs = base ** (torch.arange(0, dim, 2).to(device=device, dtype=torch.float) / dim)
+    # # inv_freq_extrapolation = 1.0 / pos_freqs
+    # # inv_freq_interpolation = 1.0 / (factor * pos_freqs)
+
+    # # # Get n-dimensional rotational scaling corrected for extrapolation
+    # # inv_freq_extrapolation_factor = 1 - linear_ramp_factor(low, high, dim // 2).to(device=device, dtype=torch.float)
+    # # inv_freq = (
+    # #     inv_freq_interpolation * (1 - inv_freq_extrapolation_factor)
+    # #     + inv_freq_extrapolation * inv_freq_extrapolation_factor
+    # # )
+    # # return inv_freq, attention_factor
+    return factor, low, high, attention_factor
+
+
+class Qwen3MoeSparseMoeBlock(nn.Module):
+    def __init__(
+        self,
+        layer_id: int,
+        config: Qwen3MoeConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.layer_id = layer_id
+        if self.tp_size > config.num_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {config.num_experts}."
+            )
+
+        self.topk = TopK(
+            top_k=config.num_experts_per_tok,
+            renormalize=config.norm_topk_prob,
+            use_grouped_topk=False,
+            layer_id=layer_id,
+        )
+
+        self.experts = get_moe_impl_class(quant_config)(
+            num_experts=config.num_experts
+            + get_global_server_args().ep_num_redundant_experts,
+            top_k=config.num_experts_per_tok,
+            layer_id=layer_id,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            quant_config=quant_config,
+            prefix=add_prefix("experts", prefix),
+            routing_method_type=RoutingMethodType.Renormalize,
+        )
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            config.num_experts,
+            bias=False,
+            quant_config=None,
+            prefix=add_prefix("gate", prefix),
+        )
+
+        if get_moe_a2a_backend().is_deepep():
+            # TODO: we will support tp < ep in the future
+            self.ep_size = get_moe_expert_parallel_world_size()
+            self.num_experts = (
+                config.num_experts + get_global_server_args().ep_num_redundant_experts
+            )
+            self.top_k = config.num_experts_per_tok
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: Optional[ForwardBatch] = None,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+    ) -> torch.Tensor:
+
+        if (
+            not get_moe_a2a_backend().is_deepep()
+            and not get_moe_a2a_backend().is_ascend_fuseep()
+        ):
+            return self.forward_normal(
+                hidden_states, should_allreduce_fusion, use_reduce_scatter
+            )
+        else:
+            return self.forward_deepep(hidden_states, forward_batch)
+
+    def get_moe_weights(self):
+        return [
+            x.data
+            for name, x in self.experts.named_parameters()
+            if name not in ["correction_bias"]
+            and filter_moe_weight_param_global_expert(
+                name, x, self.experts.num_local_experts
+            )
+        ]
+
+    def forward_normal(
+        self,
+        hidden_states: torch.Tensor,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+    ) -> torch.Tensor:
+        num_tokens, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+
+        # router_logits: (num_tokens, n_experts)
+        router_logits, _ = self.gate(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        final_hidden_states = self.experts(hidden_states, topk_output)
+        if (
+            self.tp_size > 1
+            and not should_allreduce_fusion
+            and not use_reduce_scatter
+            and not should_use_flashinfer_cutlass_moe_fp4_allgather()
+        ):
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+
+        return final_hidden_states.view(num_tokens, hidden_dim)
+
+    def forward_deepep(
+        self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
+    ) -> torch.Tensor:
+        if hidden_states.shape[0] > 0:
+            # router_logits: (num_tokens, n_experts)
+            router_logits, _ = self.gate(hidden_states)
+            topk_output = self.topk(
+                hidden_states,
+                router_logits,
+                num_token_non_padded=forward_batch.num_token_non_padded,
+                expert_location_dispatch_info=ExpertLocationDispatchInfo.init_new(
+                    layer_id=self.layer_id,
+                ),
+            )
+        else:
+            topk_output = self.topk.empty_topk_output(hidden_states.device)
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states,
+            topk_output=topk_output,
+        )
+        return final_hidden_states
+
+    def op_gate(self, state):
+        if is_non_idle_and_non_empty(
+            state.forward_batch.forward_mode, state.hidden_states_mlp_input
+        ):
+            # router_logits: (num_tokens, n_experts)
+            state.router_logits, _ = self.gate(state.hidden_states_mlp_input)
+        else:
+            state.router_logits = None
+
+    def op_select_experts(self, state):
+        router_logits = state.pop("router_logits")
+        hidden_states = state.hidden_states_mlp_input
+        if router_logits is not None:
+            with get_global_expert_distribution_recorder().with_current_layer(
+                self.layer_id
+            ):
+                state.topk_output = self.topk(
+                    hidden_states=hidden_states,
+                    router_logits=router_logits,
+                    num_token_non_padded=state.forward_batch.num_token_non_padded,
+                    expert_location_dispatch_info=ExpertLocationDispatchInfo.init_new(
+                        layer_id=self.layer_id,
+                    ),
+                )
+        else:
+            state.topk_output = self.topk.empty_topk_output(hidden_states.device)
+
+    def op_dispatch_a(self, state):
+        if self.ep_size > 1:
+            self.experts.dispatcher.dispatch_a(
+                hidden_states=state.pop("hidden_states_mlp_input"),
+                topk_output=state.pop("topk_output"),
+                tbo_subbatch_index=state.get("tbo_subbatch_index"),
+            )
+
+    def op_dispatch_b(self, state):
+        if self.ep_size > 1:
+            with get_global_expert_distribution_recorder().with_current_layer(
+                self.layer_id
+            ):
+                state.dispatch_output = self.experts.dispatcher.dispatch_b(
+                    tbo_subbatch_index=state.get("tbo_subbatch_index"),
+                )
+
+    def op_experts(self, state):
+        state.combine_input = self.experts.run_moe_core(
+            dispatch_output=state.dispatch_output,
+        )
+
+    def op_combine_a(self, state):
+        if self.ep_size > 1:
+            self.experts.dispatcher.combine_a(
+                combine_input=state.pop("combine_input"),
+                tbo_subbatch_index=state.get("tbo_subbatch_index"),
+            )
+            state.pop("dispatch_output")
+
+    def op_combine_b(self, state):
+        if self.ep_size > 1:
+            state.hidden_states_after_combine = self.experts.dispatcher.combine_b(
+                tbo_subbatch_index=state.get("tbo_subbatch_index"),
+            )
+
+    def op_output(self, state):
+        state.hidden_states_mlp_output = state.pop("hidden_states_after_combine")
+
+
+class Qwen3MoeAttention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        head_dim: Optional[int] = None,
+        rms_norm_eps: float = 1e-06,
+        attention_bias: bool = False,
+        config: Optional[TConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        dual_chunk_attention_config: Optional[dict[str, Any]] = None,
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+
+        attn_tp_rank = get_attention_tp_rank()
+        attn_tp_size = get_attention_tp_size()
+
+        self.config = config
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % attn_tp_size == 0
+        self.num_heads = self.total_num_heads // attn_tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= attn_tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % attn_tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert attn_tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // attn_tp_size)
+        self.head_dim = head_dim or hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+        self.tp_rank = get_tensor_model_parallel_rank()
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=attention_bias,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=attention_bias,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            reduce_results=False,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            dual_chunk_attention_config=dual_chunk_attention_config,
+        )
+        self.compatible_with_fused_kv_buffer = (
+            False if isinstance(self.rotary_emb, MRotaryEmbedding) else True
+        )
+        self.compatible_with_fused_qk_norm_rope = (
+            not isinstance(self.rotary_emb, MRotaryEmbedding)
+        ) and self.head_dim in (64, 128, 256)
+        self.use_fused_qk_norm_rope = (
+            get_global_server_args().enable_fused_qk_norm_rope
+            and self.compatible_with_fused_qk_norm_rope
+        )
+        self._used_fused_qk_norm_rope_last_call = False
+
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            prefix=add_prefix("attn", prefix),
+        )
+
+        self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
+        self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps)
+        self.alt_stream = alt_stream
+
+    def op_prepare(self, state):
+        state.attn_intermediate_state = self.forward_prepare(
+            positions=state.positions,
+            hidden_states=state.pop("hidden_states_after_comm_pre_attn"),
+            forward_batch=state.forward_batch,
+        )
+
+    def op_core(self, state):
+        state.hidden_states_after_attn = self.forward_core(
+            state.pop("attn_intermediate_state")
+        )
+
+    def forward_prepare_npu(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ):
+        qkv, _ = self.qkv_proj(hidden_states)
+        if self.attn.layer_id == forward_batch.token_to_kv_pool.start_layer:
+            self.rotary_emb.get_cos_sin_with_position(positions)
+        q, k, v = split_qkv_rmsnorm_rope(
+            qkv,
+            self.rotary_emb.position_sin,
+            self.rotary_emb.position_cos,
+            self.q_size,
+            self.kv_size,
+            self.head_dim,
+            eps=self.q_norm.variance_epsilon,
+            q_weight=self.q_norm.weight,
+            k_weight=self.k_norm.weight,
+            q_bias=getattr(self.q_norm, "bias", None),
+            k_bias=getattr(self.k_norm, "bias", None),
+        )
+
+        inner_state = q, k, v, forward_batch
+        return None, forward_batch, inner_state
+
+    def forward_prepare_native(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ):
+        qkv, _ = self.qkv_proj(hidden_states)
+
+        q, k, v = self.apply_qk_norm_rope(qkv, positions, forward_batch)
+
+        inner_state = q, k, v, forward_batch
+        return None, forward_batch, inner_state
+
+    def apply_qk_norm_rope(self, qkv, positions, forward_batch):
+        use_fused = self.use_fused_qk_norm_rope and qkv.dtype == torch.bfloat16
+        if use_fused:
+            theta = getattr(self.config, "rope_theta", 10000.0)
+            positions = (
+                positions.view(-1).to(dtype=torch.int32, device=qkv.device).contiguous()
+            )
+            factor, low, high, attention_factor = compute_yarn_parameters(self.config)
+            fused_qk_norm_rope(
+                qkv,
+                self.num_heads,
+                self.num_kv_heads,
+                self.num_kv_heads,
+                self.head_dim,
+                self.q_norm.variance_epsilon,
+                self.q_norm.weight,
+                self.k_norm.weight,
+                theta,
+                self.rotary_emb.is_neox_style,
+                positions,
+                factor,
+                low,
+                high,
+                attention_factor,
+            )
+            q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+            self._used_fused_qk_norm_rope_last_call = True
+        else:
+            # Fallback to non-fused QK Norm & RoPE implementation
+            q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+            q, k = apply_qk_norm(
+                q=q,
+                k=k,
+                q_norm=self.q_norm,
+                k_norm=self.k_norm,
+                head_dim=self.head_dim,
+                alt_stream=self.alt_stream,
+            )
+            q, k = self.rotary_emb(
+                positions,
+                q,
+                k,
+                fused_set_kv_buffer_arg=(
+                    create_fused_set_kv_buffer_arg(
+                        value=v,
+                        layer=self.attn,
+                        forward_batch=forward_batch,
+                    )
+                    if enable_fused_set_kv_buffer(forward_batch)
+                    and self.compatible_with_fused_kv_buffer
+                    else None
+                ),
+            )
+            self._used_fused_qk_norm_rope_last_call = False
+        return q, k, v
+
+    def forward_prepare(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ):
+        if hidden_states.shape[0] == 0:
+            return hidden_states, forward_batch, None
+        if (
+            not _is_npu
+            or forward_batch.forward_mode.is_extend_or_draft_extend_or_mixed()
+        ):
+            return self.forward_prepare_native(
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+            )
+        else:
+            return self.forward_prepare_npu(
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+            )
+
+    def forward_core(self, intermediate_state):
+        hidden_states, forward_batch, inner_state = intermediate_state
+        if inner_state is None:
+            return hidden_states
+
+        q, k, v, fb = inner_state
+
+        must_save_kv = self._used_fused_qk_norm_rope_last_call
+        save_kv_cache = must_save_kv or not (
+            enable_fused_set_kv_buffer(forward_batch)
+            and self.compatible_with_fused_kv_buffer
+        )
+        attn_output = self.attn(
+            q,
+            k,
+            v,
+            fb,
+            save_kv_cache=save_kv_cache,
+        )
+        output, _ = self.o_proj(attn_output)
+        return output
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        s = self.forward_prepare(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        return self.forward_core(s)
+
+
+class Qwen3MoeDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: Qwen3MoeConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        head_dim = getattr(
+            config, "head_dim", config.hidden_size // config.num_attention_heads
+        )
+        rms_norm_eps = config.rms_norm_eps
+        attention_bias = config.attention_bias
+        dual_chunk_attention_config = getattr(
+            config, "dual_chunk_attention_config", None
+        )
+        self.self_attn = Qwen3MoeAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            head_dim=head_dim,
+            rms_norm_eps=rms_norm_eps,
+            attention_bias=attention_bias,
+            config=config,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+            dual_chunk_attention_config=dual_chunk_attention_config,
+            alt_stream=alt_stream,
+        )
+
+        self.layer_id = layer_id
+
+        self.attn_tp_size = get_attention_tp_size()
+        self.attn_tp_rank = get_attention_tp_rank()
+
+        # Qwen3MoE all layers are sparse and have no nextn now
+        self.is_layer_sparse = True
+        is_previous_layer_sparse = True
+        is_next_layer_sparse = True
+
+        self.layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=layer_id,
+            num_layers=config.num_hidden_layers,
+            is_layer_sparse=self.is_layer_sparse,
+            is_previous_layer_sparse=is_previous_layer_sparse,
+            is_next_layer_sparse=is_next_layer_sparse,
+        )
+
+        if self.is_layer_sparse:
+            self.mlp = Qwen3MoeSparseMoeBlock(
+                layer_id=self.layer_id,
+                config=config,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+            )
+        else:
+            self.mlp = Qwen3MoeMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+            )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+        self.layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.layer_scatter_modes,
+            input_layernorm=self.input_layernorm,
+            post_attention_layernorm=self.post_attention_layernorm,
+            allow_reduce_scatter=True,
+            is_last_layer=(self.layer_id == self.config.num_hidden_layers - 1),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+        captured_last_layer_outputs: Optional[List[torch.Tensor]] = None,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+
+        hidden_states, residual = (
+            self.layer_communicator.prepare_attn_and_capture_last_layer_outputs(
+                hidden_states,
+                residual,
+                forward_batch,
+                captured_last_layer_outputs=captured_last_layer_outputs,
+                **kwargs,
+            )
+        )
+
+        if hidden_states.shape[0] != 0:
+            hidden_states = self.self_attn(
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+            )
+
+        hidden_states, residual = self.layer_communicator.prepare_mlp(
+            hidden_states, residual, forward_batch
+        )
+
+        should_allreduce_fusion = (
+            self.layer_communicator.should_fuse_mlp_allreduce_with_next_layer(
+                forward_batch
+            )
+        )
+
+        # For DP with padding, reduce scatter can be used instead of all-reduce.
+        use_reduce_scatter = self.layer_communicator.should_use_reduce_scatter(
+            forward_batch
+        )
+
+        hidden_states = self.mlp(
+            hidden_states, forward_batch, should_allreduce_fusion, use_reduce_scatter
+        )
+
+        if should_allreduce_fusion:
+            hidden_states._sglang_needs_allreduce_fusion = True
+        else:
+            hidden_states, residual = self.layer_communicator.postprocess_layer(
+                hidden_states, residual, forward_batch
+            )
+
+        return hidden_states, residual
+
+    def op_comm_prepare_attn(
+        self,
+        state,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+        tbo_subbatch_index: Optional[int] = None,
+    ):
+        state.hidden_states_after_comm_pre_attn, state.residual_after_input_ln = (
+            self.layer_communicator.prepare_attn(hidden_states, residual, forward_batch)
+        )
+        state.update(
+            dict(
+                forward_batch=forward_batch,
+                positions=positions,
+                tbo_subbatch_index=tbo_subbatch_index,
+            )
+        )
+
+    def op_comm_prepare_mlp(self, state):
+        state.hidden_states_mlp_input, state.residual_after_comm_pre_mlp = (
+            self.layer_communicator.prepare_mlp(
+                state.pop("hidden_states_after_attn"),
+                state.pop("residual_after_input_ln"),
+                state.forward_batch,
+            )
+        )
+
+    def op_mlp(self, state):
+        hidden_states = state.pop("hidden_states_mlp_input")
+        state.hidden_states_mlp_output = self.mlp(hidden_states, state.forward_batch)
+
+    def op_comm_postprocess_layer(self, state):
+        hidden_states, residual = self.layer_communicator.postprocess_layer(
+            state.pop("hidden_states_mlp_output"),
+            state.pop("residual_after_comm_pre_mlp"),
+            state.forward_batch,
+        )
+
+        output = dict(
+            positions=state.positions,
+            hidden_states=hidden_states,
+            residual=residual,
+            forward_batch=state.forward_batch,
+            tbo_subbatch_index=state.tbo_subbatch_index,
+        )
+
+        state.clear(
+            expect_keys={
+                "positions",
+                "forward_batch",
+                "tbo_subbatch_index",
+            }
+        )
+        return output
+
+
+class Qwen3MoeModel(Qwen2MoeModel):
+    def __init__(
+        self,
+        config: Qwen3MoeConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        decoder_layer_type=Qwen3MoeDecoderLayer,
+    ) -> None:
+        alt_stream = torch.cuda.Stream() if _is_cuda else None
+        super().__init__(
+            config=config,
+            quant_config=quant_config,
+            prefix=prefix,
+            decoder_layer_type=decoder_layer_type,
+            alt_stream=alt_stream,
+        )
+
+
+class Qwen3MoeForCausalLM(nn.Module):
+    fall_back_to_pt_during_load = False
+
+    # Mapping from fused module names to their component weight names.
+    # Required for quantization configs (e.g., ModelOpt FP4) to correctly identify
+    # which layers should be skipped based on the exclude_modules/ignore list.
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
+
+    def __init__(
+        self,
+        config: Qwen3MoeConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = Qwen3MoeModel(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("lm_head", prefix),
+            use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
+        )
+        self.logits_processor = LogitsProcessor(config)
+        self.capture_aux_hidden_states = False
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.embed_tokens
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            input_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+
+        aux_hidden_states = None
+        if self.capture_aux_hidden_states:
+            hidden_states, aux_hidden_states = hidden_states
+
+        if self.pp_group.is_last_rank:
+            return self.logits_processor(
+                input_ids, hidden_states, self.lm_head, forward_batch, aux_hidden_states
+            )
+        else:
+            return hidden_states
+
+    @torch.no_grad()
+    def forward_split_prefill(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        split_interval: Tuple[int, int],  # [start, end) 0-based
+        input_embeds: torch.Tensor = None,
+    ):
+        start, end = split_interval
+        # embed
+        if start == 0:
+            if input_embeds is None:
+                forward_batch.hidden_states = self.model.embed_tokens(input_ids)
+            else:
+                forward_batch.hidden_states = input_embeds
+
+        # decoder layer
+        for i in range(start, end):
+            with get_global_expert_distribution_recorder().with_current_layer(i):
+                layer = self.model.layers[i]
+                forward_batch.hidden_states, forward_batch.residual = layer(
+                    positions,
+                    forward_batch.hidden_states,
+                    forward_batch,
+                    forward_batch.residual,
+                )
+
+        if end == self.model.config.num_hidden_layers:
+            # norm
+            hidden_states, _ = self.model.norm(
+                forward_batch.hidden_states, forward_batch.residual
+            )
+            forward_batch.hidden_states = hidden_states
+            # logits process
+            result = self.logits_processor(
+                input_ids, forward_batch.hidden_states, self.lm_head, forward_batch
+            )
+        else:
+            result = None
+
+        return result
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_eagle3_layers_to_capture(self, layer_ids: Optional[List[int]] = None):
+        if not self.pp_group.is_last_rank:
+            return
+
+        self.capture_aux_hidden_states = True
+        if layer_ids is None:
+            num_layers = self.config.num_hidden_layers
+            self.model.set_eagle3_layers_to_capture(
+                [
+                    2,
+                    num_layers // 2,
+                    num_layers - 3,
+                ]
+            )  # Specific layers for EAGLE3 support
+        else:
+            self.model.set_eagle3_layers_to_capture([val + 1 for val in layer_ids])
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.num_experts,
+        )
+
+        # Cache params_dict to avoid repeated expensive traversal of model parameters
+        if not hasattr(self, "_cached_params_dict"):
+            self._cached_params_dict = dict(self.named_parameters())
+        params_dict = self._cached_params_dict
+        for name, loaded_weight in weights:
+            layer_id = get_layer_id(name)
+            if (
+                layer_id is not None
+                and hasattr(self.model, "start_layer")
+                and (
+                    layer_id < self.model.start_layer
+                    or layer_id >= self.model.end_layer
+                )
+            ):
+                continue
+
+            if "rotary_emb.inv_freq" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                # Skip non-stacked layers and experts (experts handled below).
+                if weight_name not in name:
+                    continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if "mlp.experts" in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Track if this is an expert weight to enable early skipping
+                is_expert_weight = False
+
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+
+                    # Mark as expert weight regardless of whether we can process it
+                    is_expert_weight = True
+
+                    name = name.replace(weight_name, param_name)
+                    if name not in params_dict:
+                        # Expert weight not on this rank, will be skipped below
+                        continue
+
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    if is_expert_weight:
+                        # This is an expert weight but not mapped to this rank, skip all remaining processing
+                        continue
+
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    if name not in params_dict:
+                        continue
+
+                    if name in params_dict.keys():
+                        param = params_dict[name]
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        weight_loader(param, loaded_weight)
+                    else:
+                        logger.warning(f"Parameter {name} not found in params_dict")
+
+        if not hasattr(self, "routed_experts_weights_of_layer"):
+            self.routed_experts_weights_of_layer = LazyValue(
+                lambda: {
+                    layer_id: self.model.layers[layer_id].mlp.get_moe_weights()
+                    for layer_id in range(self.start_layer, self.end_layer)
+                    if isinstance(
+                        self.model.layers[layer_id].mlp, Qwen3MoeSparseMoeBlock
+                    )
+                }
+            )
+
+    @classmethod
+    def get_model_config_for_expert_location(cls, config):
+        return ModelConfigForExpertLocation(
+            num_layers=config.num_hidden_layers,
+            num_logical_experts=config.num_experts,
+            num_groups=None,
+        )
+
+
+EntryClass = Qwen3MoeForCausalLM
diff --git a/sglang/python/sglang/srt/models/qwen3_next.py b/sglang/python/sglang/srt/models/qwen3_next.py
new file mode 100644
index 0000000000000000000000000000000000000000..42146e6057ea3df51f2e327b24a084654ade05dd
--- /dev/null
+++ b/sglang/python/sglang/srt/models/qwen3_next.py
@@ -0,0 +1,1175 @@
+import enum
+import logging
+from typing import Any, Iterable, Optional, Set, Tuple
+
+import torch
+from torch import nn
+
+from sglang.srt.configs.qwen3_next import Qwen3NextConfig
+from sglang.srt.distributed import get_pp_group
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation
+from sglang.srt.layers.attention.fla.layernorm_gated import RMSNorm as RMSNormGated
+from sglang.srt.layers.attention.mamba.mamba import mamba_v2_sharded_weight_loader
+from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.layernorm import GemmaRMSNorm
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.radix_linear_attention import RadixLinearAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    sharded_weight_loader,
+)
+from sglang.srt.models.qwen2_moe import Qwen2MoeMLP, Qwen2MoeSparseMoeBlock
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import (
+    LazyValue,
+    add_prefix,
+    cpu_has_amx_support,
+    is_cpu,
+    is_cuda,
+    is_npu,
+    make_layers,
+    set_weight_attrs,
+)
+
+logger = logging.getLogger(__name__)
+
+from sglang.srt.layers.attention.fla.fused_norm_gate import FusedRMSNormGated
+
+_is_cuda = is_cuda()
+_is_npu = is_npu()
+_is_cpu = is_cpu()
+_is_amx_available = cpu_has_amx_support()
+
+
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def fused_qkvzba_split_reshape_cat_kernel(
+    mixed_qkv,
+    z,
+    b,
+    a,
+    mixed_qkvz,
+    mixed_ba,
+    NUM_HEADS_QK: tl.constexpr,
+    NUM_HEADS_V: tl.constexpr,
+    HEAD_QK: tl.constexpr,
+    HEAD_V: tl.constexpr,
+):
+    i_bs, i_qk = tl.program_id(0), tl.program_id(1)
+    QKVZ_DIM_T: tl.constexpr = HEAD_QK * 2 + NUM_HEADS_V // NUM_HEADS_QK * HEAD_V * 2
+    BA_DIM_T: tl.constexpr = NUM_HEADS_V // NUM_HEADS_QK * 2
+    QKV_DIM_T: tl.constexpr = HEAD_QK * 2 + NUM_HEADS_V // NUM_HEADS_QK * HEAD_V
+    q_end: tl.constexpr = HEAD_QK
+    blk_q_ptr = (
+        mixed_qkvz
+        + i_bs * NUM_HEADS_QK * QKVZ_DIM_T
+        + i_qk * QKVZ_DIM_T
+        + tl.arange(0, q_end)
+    )
+    k_end: tl.constexpr = q_end + HEAD_QK
+    blk_k_ptr = (
+        mixed_qkvz
+        + i_bs * NUM_HEADS_QK * QKVZ_DIM_T
+        + i_qk * QKVZ_DIM_T
+        + tl.arange(q_end, k_end)
+    )
+    v_end: tl.constexpr = k_end + NUM_HEADS_V // NUM_HEADS_QK * HEAD_V
+    blk_v_ptr = (
+        mixed_qkvz
+        + i_bs * NUM_HEADS_QK * QKVZ_DIM_T
+        + i_qk * QKVZ_DIM_T
+        + tl.arange(k_end, v_end)
+    )
+    z_end: tl.constexpr = v_end + NUM_HEADS_V // NUM_HEADS_QK * HEAD_V
+    blk_z_ptr = (
+        mixed_qkvz
+        + i_bs * NUM_HEADS_QK * QKVZ_DIM_T
+        + i_qk * QKVZ_DIM_T
+        + tl.arange(v_end, z_end)
+    )
+    blk_q_st_ptr = (
+        mixed_qkv
+        + i_bs * NUM_HEADS_QK * QKV_DIM_T
+        + i_qk * HEAD_QK
+        + tl.arange(0, HEAD_QK)
+    )
+    blk_k_st_ptr = (
+        mixed_qkv
+        + i_bs * NUM_HEADS_QK * QKV_DIM_T
+        + NUM_HEADS_QK * HEAD_QK
+        + i_qk * HEAD_QK
+        + tl.arange(0, HEAD_QK)
+    )
+    blk_v_st_ptr = (
+        mixed_qkv
+        + i_bs * NUM_HEADS_QK * QKV_DIM_T
+        + NUM_HEADS_QK * HEAD_QK * 2
+        + i_qk * HEAD_V * NUM_HEADS_V // NUM_HEADS_QK
+        + tl.arange(0, HEAD_V * NUM_HEADS_V // NUM_HEADS_QK)
+    )
+    blk_z_st_ptr = (
+        z
+        + i_bs * NUM_HEADS_V * HEAD_V
+        + i_qk * HEAD_V * NUM_HEADS_V // NUM_HEADS_QK
+        + tl.arange(0, HEAD_V * NUM_HEADS_V // NUM_HEADS_QK)
+    )
+    tl.store(blk_q_st_ptr, tl.load(blk_q_ptr))
+    tl.store(blk_k_st_ptr, tl.load(blk_k_ptr))
+    tl.store(blk_v_st_ptr, tl.load(blk_v_ptr))
+    tl.store(blk_z_st_ptr, tl.load(blk_z_ptr))
+    b_end: tl.constexpr = NUM_HEADS_V // NUM_HEADS_QK
+    a_end: tl.constexpr = b_end + NUM_HEADS_V // NUM_HEADS_QK
+    for i in tl.static_range(b_end):
+        blk_b_ptr = mixed_ba + i_bs * NUM_HEADS_QK * BA_DIM_T + i_qk * BA_DIM_T + i
+        blk_b_st_ptr = b + i_bs * NUM_HEADS_V + i_qk * NUM_HEADS_V // NUM_HEADS_QK + i
+        tl.store(blk_b_st_ptr, tl.load(blk_b_ptr))
+    for i in tl.static_range(b_end, a_end):
+        blk_a_ptr = mixed_ba + i_bs * NUM_HEADS_QK * BA_DIM_T + i_qk * BA_DIM_T + i
+        blk_a_st_ptr = (
+            a + i_bs * NUM_HEADS_V + i_qk * NUM_HEADS_V // NUM_HEADS_QK + (i - b_end)
+        )
+        tl.store(blk_a_st_ptr, tl.load(blk_a_ptr))
+
+
+def fused_qkvzba_split_reshape_cat(
+    mixed_qkvz,
+    mixed_ba,
+    num_heads_qk,
+    num_heads_v,
+    head_qk,
+    head_v,
+):
+    batch, seq_len = mixed_qkvz.shape[0], 1
+    qkv_dim_t = num_heads_qk * head_qk * 2 + num_heads_v * head_v
+    mixed_qkv = torch.empty(
+        [batch * seq_len, qkv_dim_t],
+        dtype=mixed_qkvz.dtype,
+        device=mixed_qkvz.device,
+    )
+    z = torch.empty(
+        [batch * seq_len, num_heads_v, head_v],
+        dtype=mixed_qkvz.dtype,
+        device=mixed_qkvz.device,
+    )
+    b = torch.empty(
+        [batch * seq_len, num_heads_v],
+        dtype=mixed_ba.dtype,
+        device=mixed_ba.device,
+    )
+    a = torch.empty_like(b)
+    grid = (batch * seq_len, num_heads_qk)
+    fused_qkvzba_split_reshape_cat_kernel[grid](
+        mixed_qkv,
+        z,
+        b,
+        a,
+        mixed_qkvz,
+        mixed_ba,
+        num_heads_qk,
+        num_heads_v,
+        head_qk,
+        head_v,
+        num_warps=1,
+        num_stages=3,
+    )
+    return mixed_qkv, z, b, a
+
+
+class Qwen3GatedDeltaNet(nn.Module):
+    def __init__(
+        self,
+        config: Qwen3NextConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        alt_stream: Optional[torch.cuda.Stream] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.attn_tp_rank = get_attention_tp_rank()
+        self.attn_tp_size = get_attention_tp_size()
+        self.hidden_size = config.hidden_size
+        self.num_v_heads = (
+            config.linear_num_value_heads
+            if not _is_cpu
+            else config.linear_num_value_heads_cpu
+        )
+        self.num_k_heads = (
+            config.linear_num_key_heads
+            if not _is_cpu
+            else config.linear_num_key_heads_cpu
+        )
+        self.head_k_dim = config.linear_key_head_dim
+        self.head_v_dim = config.linear_value_head_dim
+        self.key_dim = self.head_k_dim * self.num_k_heads
+        self.value_dim = self.head_v_dim * self.num_v_heads
+        self.alt_stream = alt_stream
+
+        self.conv_kernel_size = config.linear_conv_kernel_dim
+        self.layer_id = layer_id
+        self.activation = config.hidden_act
+        self.layer_norm_epsilon = config.rms_norm_eps
+
+        self.conv_dim = self.key_dim * 2 + self.value_dim
+        self.conv1d = ColumnParallelLinear(
+            input_size=self.conv_kernel_size,
+            output_size=self.conv_dim,
+            bias=False,
+            quant_config=None,
+            tp_rank=self.attn_tp_rank,
+            tp_size=self.attn_tp_size,
+            prefix=add_prefix("conv1d", prefix),
+        )
+        self.conv1d.weight.data = self.conv1d.weight.data.unsqueeze(1)
+        projection_size_qkvz = self.key_dim * 2 + self.value_dim * 2
+        projection_size_ba = self.num_v_heads * 2
+
+        self.in_proj_qkvz = ColumnParallelLinear(
+            input_size=self.hidden_size,
+            output_size=projection_size_qkvz,
+            bias=False,
+            quant_config=quant_config,
+            tp_rank=self.attn_tp_rank,
+            tp_size=self.attn_tp_size,
+            prefix=add_prefix("in_proj_qkvz", prefix),
+        )
+        self.in_proj_ba = ColumnParallelLinear(
+            input_size=self.hidden_size,
+            output_size=projection_size_ba,
+            bias=False,
+            quant_config=quant_config,
+            tp_rank=self.attn_tp_rank,
+            tp_size=self.attn_tp_size,
+            prefix=add_prefix("in_proj_ba", prefix),
+        )
+
+        query_key_settings = (self.key_dim, 0, False)
+        value_settings = (self.value_dim, 0, False)
+
+        delattr(self.conv1d.weight, "weight_loader")
+        set_weight_attrs(
+            self.conv1d.weight,
+            {
+                "weight_loader": mamba_v2_sharded_weight_loader(
+                    [
+                        query_key_settings,
+                        query_key_settings,
+                        value_settings,
+                    ],
+                    self.attn_tp_size,
+                    self.attn_tp_rank,
+                )
+            },
+        )
+
+        self.dt_bias = nn.Parameter(torch.zeros(self.num_v_heads // self.attn_tp_size))
+
+        self.A_log = nn.Parameter(
+            torch.zeros(self.num_v_heads // self.attn_tp_size, dtype=torch.float32)
+        )
+
+        set_weight_attrs(self.A_log, {"weight_loader": sharded_weight_loader(0)})
+        set_weight_attrs(self.dt_bias, {"weight_loader": sharded_weight_loader(0)})
+        self.norm = (
+            RMSNormGated(
+                self.head_v_dim,
+                eps=self.layer_norm_epsilon,
+                group_size=None,
+                norm_before_gate=True,
+                device=torch.get_device_module().current_device(),
+                dtype=config.torch_dtype,
+            )
+            if not get_global_server_args().disable_piecewise_cuda_graph
+            else FusedRMSNormGated(
+                self.head_v_dim,
+                eps=self.layer_norm_epsilon,
+                activation=self.activation,
+                device=torch.get_device_module().current_device(),
+                dtype=config.torch_dtype,
+            )
+        )
+
+        self.out_proj = RowParallelLinear(
+            self.value_dim,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            input_is_parallel=True,
+            reduce_results=False,
+            tp_rank=self.attn_tp_rank,
+            tp_size=self.attn_tp_size,
+            prefix=add_prefix("out_proj", prefix),
+        )
+
+        self.attn = RadixLinearAttention(
+            layer_id=layer_id,
+            num_q_heads=self.num_k_heads // self.attn_tp_size,
+            num_k_heads=self.num_k_heads // self.attn_tp_size,
+            num_v_heads=self.num_v_heads // self.attn_tp_size,
+            head_q_dim=self.head_k_dim,
+            head_k_dim=self.head_k_dim,
+            head_v_dim=self.head_v_dim,
+            conv_weights=self.conv1d.weight.squeeze(1),
+            bias=self.conv1d.bias,
+            activation=self.activation,
+            A_log=self.A_log,
+            dt_bias=self.dt_bias,
+        )
+
+    def fix_query_key_value_ordering(self, mixed_qkvz, mixed_ba):
+        """
+        Derives `query`, `key` and `value` tensors from `mixed_qkvzba`.
+        """
+        new_tensor_shape_qkvz = mixed_qkvz.size()[:-1] + (
+            self.num_k_heads // self.attn_tp_size,
+            (
+                self.head_k_dim
+                + self.head_k_dim
+                + (self.head_v_dim + self.head_v_dim)
+                * self.num_v_heads
+                // self.num_k_heads
+            ),
+        )
+        new_tensor_shape_ba = mixed_ba.size()[:-1] + (
+            self.num_k_heads // self.attn_tp_size,
+            2 * self.num_v_heads // self.num_k_heads,
+        )
+
+        mixed_qkvz = mixed_qkvz.view(*new_tensor_shape_qkvz)
+        mixed_ba = mixed_ba.view(*new_tensor_shape_ba)
+
+        split_arg_list_qkvz = [
+            self.head_k_dim,
+            self.head_k_dim,
+            (self.num_v_heads // self.num_k_heads * self.head_v_dim),
+            (self.num_v_heads // self.num_k_heads * self.head_v_dim),
+        ]
+        split_arg_list_ba = [
+            self.num_v_heads // self.num_k_heads,
+            self.num_v_heads // self.num_k_heads,
+        ]
+
+        # [b, sq, ng, (hn + hn + np/ng * hn + np/ng + np/ng)]
+        # --> [b, sq, ng, hn], [b, sq, ng, hn], [b, sq, ng, np/ng * hn], [b, sq, ng, np/ng * hn], [b, sq, ng, np/ng], [b, sq, ng, np/ng]
+        query, key, value, z = torch.split(mixed_qkvz, split_arg_list_qkvz, dim=2)
+        b, a = torch.split(mixed_ba, split_arg_list_ba, dim=2)
+
+        # [b, sq, ng, np/ng * hn] -> [b, sq, np, hn]
+        value = value.reshape(value.size(0), -1, self.head_v_dim)
+        z = z.reshape(z.size(0), -1, self.head_v_dim)
+        b = b.reshape(b.size(0), self.num_v_heads // self.attn_tp_size)
+        a = a.reshape(a.size(0), self.num_v_heads // self.attn_tp_size)
+
+        return query, key, value, z, b, a
+
+    def _forward_input_proj(self, hidden_states: torch.Tensor):
+        if (
+            _is_cpu
+            or _is_npu
+            or not get_global_server_args().disable_piecewise_cuda_graph
+        ):
+            DUAL_STREAM_TOKEN_THRESHOLD = 0
+        else:
+            DUAL_STREAM_TOKEN_THRESHOLD = 1024
+
+        seq_len, _ = hidden_states.shape
+        if (
+            self.alt_stream is not None
+            and get_is_capture_mode()
+            and seq_len < DUAL_STREAM_TOKEN_THRESHOLD
+        ):
+            current_stream = torch.cuda.current_stream()
+            self.alt_stream.wait_stream(current_stream)
+            projected_states_qkvz, _ = self.in_proj_qkvz(hidden_states)
+            with torch.cuda.stream(self.alt_stream):
+                projected_states_ba, _ = self.in_proj_ba(hidden_states)
+            current_stream.wait_stream(self.alt_stream)
+        else:
+            projected_states_qkvz, _ = self.in_proj_qkvz(hidden_states)
+            projected_states_ba, _ = self.in_proj_ba(hidden_states)
+        return projected_states_qkvz, projected_states_ba
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ):
+        projected_states_qkvz, projected_states_ba = self._forward_input_proj(
+            hidden_states
+        )
+
+        if self.num_v_heads // self.num_k_heads in [1, 2, 4] and not _is_cpu:
+            mixed_qkv, z, b, a = fused_qkvzba_split_reshape_cat(
+                projected_states_qkvz,
+                projected_states_ba,
+                triton.cdiv(self.num_k_heads, self.attn_tp_size),
+                triton.cdiv(self.num_v_heads, self.attn_tp_size),
+                self.head_k_dim,
+                self.head_v_dim,
+            )
+        elif _is_cpu and _is_amx_available:
+            mixed_qkv, z, b, a = (
+                torch.ops.sgl_kernel.fused_qkvzba_split_reshape_cat_cpu(
+                    projected_states_qkvz,
+                    projected_states_ba,
+                    self.num_k_heads // self.attn_tp_size,
+                    self.num_v_heads // self.attn_tp_size,
+                    self.head_k_dim,
+                    self.head_v_dim,
+                )
+            )
+        else:
+            query, key, value, z, b, a = self.fix_query_key_value_ordering(
+                projected_states_qkvz, projected_states_ba
+            )
+            query, key, value = map(
+                lambda x: x.reshape(x.shape[0], -1), (query, key, value)
+            )
+            mixed_qkv = torch.cat((query, key, value), dim=-1)
+        core_attn_out = self.attn(
+            forward_batch,
+            mixed_qkv=mixed_qkv,
+            a=a,
+            b=b,
+        )
+
+        z_shape_og = z.shape
+        # reshape input data into 2D tensor
+        core_attn_out = core_attn_out.reshape(-1, core_attn_out.shape[-1])
+        z = z.reshape(-1, z.shape[-1])
+
+        # Add padding for DP-Attn
+        if core_attn_out.shape != z.shape:
+            core_attn_out_pad = torch.zeros_like(z)
+            core_attn_out_pad[: core_attn_out.shape[0], :] = core_attn_out
+            core_attn_out = core_attn_out_pad
+
+        core_attn_out = self.norm(core_attn_out, z)
+        core_attn_out = core_attn_out.reshape(z_shape_og)
+        core_attn_out = core_attn_out.reshape(*core_attn_out.shape[:-2], -1)
+
+        output, _ = self.out_proj(core_attn_out)
+        return output
+
+
+class Qwen3HybridLinearDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: Qwen3NextConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.linear_attn = Qwen3GatedDeltaNet(
+            config, layer_id, quant_config, alt_stream, prefix
+        )
+
+        # Qwen3Next all layers are sparse and have no nextn now
+        self.is_layer_sparse = True
+        is_previous_layer_sparse = True
+        is_next_layer_sparse = True
+        self.layer_id = layer_id
+
+        self.layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=layer_id,
+            num_layers=config.num_hidden_layers,
+            is_layer_sparse=self.is_layer_sparse,
+            is_previous_layer_sparse=is_previous_layer_sparse,
+            is_next_layer_sparse=is_next_layer_sparse,
+        )
+
+        if self.is_layer_sparse:
+            self.mlp = Qwen2MoeSparseMoeBlock(
+                layer_id=layer_id,
+                config=config,
+                quant_config=quant_config,
+                alt_stream=alt_stream,
+                prefix=add_prefix("mlp", prefix.replace(".linear_attn", "")),
+            )
+        else:
+            self.mlp = Qwen2MoeMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix.replace(".linear_attn", "")),
+            )
+        self.input_layernorm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = GemmaRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+        self.layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.layer_scatter_modes,
+            input_layernorm=self.input_layernorm,
+            post_attention_layernorm=self.post_attention_layernorm,
+            allow_reduce_scatter=True,
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        residual: Optional[torch.Tensor],
+        captured_last_layer_outputs: Optional[list[torch.Tensor]] = None,
+        **kwargs,
+    ):
+        forward_batch = kwargs.get("forward_batch", None)
+
+        hidden_states, residual = (
+            self.layer_communicator.prepare_attn_and_capture_last_layer_outputs(
+                hidden_states,
+                residual,
+                forward_batch,
+                captured_last_layer_outputs=captured_last_layer_outputs,
+            )
+        )
+
+        if not forward_batch.forward_mode.is_idle():
+            hidden_states = self.linear_attn(
+                hidden_states,
+                forward_batch,
+            )
+        # Fully Connected
+        hidden_states, residual = self.layer_communicator.prepare_mlp(
+            hidden_states, residual, forward_batch
+        )
+
+        use_reduce_scatter = self.layer_communicator.should_use_reduce_scatter(
+            forward_batch
+        )
+        hidden_states = self.mlp(hidden_states, forward_batch, use_reduce_scatter)
+
+        hidden_states, residual = self.layer_communicator.postprocess_layer(
+            hidden_states, residual, forward_batch
+        )
+
+        return hidden_states, residual
+
+
+class Qwen3HybridAttentionDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: Qwen3NextConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.attn_tp_rank = get_attention_tp_rank()
+        self.attn_tp_size = get_attention_tp_size()
+        self.total_num_heads = config.num_attention_heads
+        assert self.total_num_heads % self.attn_tp_size == 0
+        self.num_heads = self.total_num_heads // self.attn_tp_size
+        self.total_num_kv_heads = config.num_key_value_heads
+        if self.total_num_kv_heads >= self.attn_tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % self.attn_tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.attn_tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // self.attn_tp_size)
+        self.head_dim = config.head_dim or (self.hidden_size // self.num_heads)
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = getattr(config, "rope_theta", 10000)
+        self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        if "rope_parameters" in config:
+            self.rope_scaling = getattr(config, "rope_parameters", None)
+        else:
+            self.rope_scaling = getattr(config, "rope_scaling", None)
+        self.partial_rotary_factor = config.partial_rotary_factor
+        self.layer_id = layer_id
+
+        self.attn_output_gate = getattr(config, "attn_output_gate", True)
+        if self.attn_output_gate:
+            logger.warning_once("using attn output gate!")
+
+        self.rotary_emb = get_rope(
+            head_size=self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=self.max_position_embeddings,
+            rope_scaling=self.rope_scaling,
+            base=self.rope_theta,
+            partial_rotary_factor=self.partial_rotary_factor,
+            is_neox_style=True,
+            dtype=torch.get_default_dtype(),  # see impl of get_rope
+        )
+
+        # qkv_proj is not quantized for fp4
+        self.qkv_proj = QKVParallelLinear(
+            config.hidden_size,
+            self.head_dim,
+            self.total_num_heads * (1 + self.attn_output_gate),
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=(
+                quant_config
+                if quant_config is not None
+                and quant_config.get_name() != "modelopt_fp4"
+                else None
+            ),
+            tp_rank=self.attn_tp_rank,
+            tp_size=self.attn_tp_size,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            reduce_results=False,
+            tp_rank=self.attn_tp_rank,
+            tp_size=self.attn_tp_size,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
+
+        # Qwen3Next all layers are sparse and have no nextn now
+        self.is_layer_sparse = True
+        is_previous_layer_sparse = True
+        is_next_layer_sparse = True
+
+        self.layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=layer_id,
+            num_layers=config.num_hidden_layers,
+            is_layer_sparse=self.is_layer_sparse,
+            is_previous_layer_sparse=is_previous_layer_sparse,
+            is_next_layer_sparse=is_next_layer_sparse,
+        )
+
+        if self.is_layer_sparse:
+            self.mlp = Qwen2MoeSparseMoeBlock(
+                layer_id=layer_id,
+                config=config,
+                quant_config=quant_config,
+                alt_stream=alt_stream,
+                prefix=add_prefix("mlp", prefix.replace(".self_attn", "")),
+            )
+        else:
+            self.mlp = Qwen2MoeMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix.replace(".self_attn", "")),
+            )
+        self.input_layernorm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = GemmaRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+        self.q_norm = GemmaRMSNorm(self.head_dim, eps=config.rms_norm_eps)
+        self.k_norm = GemmaRMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+        self.layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.layer_scatter_modes,
+            input_layernorm=self.input_layernorm,
+            post_attention_layernorm=self.post_attention_layernorm,
+            allow_reduce_scatter=True,
+        )
+
+        self.alt_stream = alt_stream
+
+    def _apply_qk_norm(
+        self, q: torch.Tensor, k: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # overlap qk norm
+        if self.alt_stream is not None and get_is_capture_mode():
+            current_stream = torch.cuda.current_stream()
+            self.alt_stream.wait_stream(current_stream)
+            q_by_head = q.reshape(-1, self.head_dim)
+            q_by_head = self.q_norm(q_by_head)
+            with torch.cuda.stream(self.alt_stream):
+                k_by_head = k.reshape(-1, self.head_dim)
+                k_by_head = self.k_norm(k_by_head)
+            current_stream.wait_stream(self.alt_stream)
+        else:
+            q_by_head = q.reshape(-1, self.head_dim)
+            q_by_head = self.q_norm(q_by_head)
+            k_by_head = k.reshape(-1, self.head_dim)
+            k_by_head = self.k_norm(k_by_head)
+        q = q_by_head.view(q.shape)
+        k = k_by_head.view(k.shape)
+        return q, k
+
+    def self_attention(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+
+        if self.attn_output_gate:
+            q_gate, k, v = qkv.split(
+                [self.q_size * 2, self.kv_size, self.kv_size], dim=-1
+            )
+            orig_shape = q_gate.shape[:-1]
+            q_gate = q_gate.view(*orig_shape, self.num_heads, -1)
+            q, gate = torch.chunk(q_gate, 2, dim=-1)
+            q = q.reshape(*orig_shape, -1)
+            gate = gate.reshape(*orig_shape, -1)
+        else:
+            q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+
+        q, k = self._apply_qk_norm(q, k)
+
+        q, k = self.rotary_emb(positions, q, k)
+
+        attn_output = self.attn(q, k, v, forward_batch)
+
+        if self.attn_output_gate:
+            gate = torch.sigmoid(gate)
+            attn_output = attn_output * gate
+
+        output, _ = self.o_proj(attn_output)
+        return output
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: Optional[torch.Tensor],
+        forward_batch: ForwardBatch,
+        captured_last_layer_outputs: Optional[list[torch.Tensor]] = None,
+        **kwargs: Any,
+    ):
+        hidden_states, residual = (
+            self.layer_communicator.prepare_attn_and_capture_last_layer_outputs(
+                hidden_states,
+                residual,
+                forward_batch,
+                captured_last_layer_outputs=captured_last_layer_outputs,
+            )
+        )
+
+        if not forward_batch.forward_mode.is_idle():
+            hidden_states = self.self_attention(
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+            )
+
+        # Fully Connected
+        hidden_states, residual = self.layer_communicator.prepare_mlp(
+            hidden_states, residual, forward_batch
+        )
+        use_reduce_scatter = self.layer_communicator.should_use_reduce_scatter(
+            forward_batch
+        )
+        hidden_states = self.mlp(hidden_states, forward_batch, use_reduce_scatter)
+
+        hidden_states, residual = self.layer_communicator.postprocess_layer(
+            hidden_states, residual, forward_batch
+        )
+
+        return hidden_states, residual
+
+
+ALL_DECODER_LAYER_TYPES = {
+    "attention": Qwen3HybridAttentionDecoderLayer,
+    "linear_attention": Qwen3HybridLinearDecoderLayer,
+}
+
+
+class Qwen3NextModel(nn.Module):
+    def __init__(
+        self,
+        config: Qwen3NextConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+
+        alt_stream = torch.cuda.Stream() if _is_cuda else None
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+            use_attn_tp_group=is_dp_attention_enabled(),
+        )
+
+        def get_layer(idx: int, prefix: str):
+            layer_class = ALL_DECODER_LAYER_TYPES[config.layers_block_type[idx]]
+            if config.layers_block_type[idx] == "attention":
+                prefix = add_prefix("self_attn", prefix)
+            else:
+                prefix = add_prefix("linear_attn", prefix)
+            return layer_class(
+                config,
+                idx,
+                quant_config=quant_config,
+                prefix=prefix,
+                alt_stream=alt_stream,
+            )
+
+        self.layers = make_layers(
+            config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers"
+        )
+
+        self.norm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.infer_count = 0
+
+        # For EAGLE3 support
+        self.layers_to_capture = []
+
+    def set_eagle3_layers_to_capture(self, layers_to_capture: list[int]):
+        self.layers_to_capture = layers_to_capture
+        for layer_id in self.layers_to_capture:
+            setattr(self.layers[layer_id], "_is_layer_to_capture", True)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        # mamba_cache_params: MambaCacheParams,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+
+        # pass a sequence index tensor, that is required for
+        # proper continuous batching computation including
+        # chunked prefill
+        if inputs_embeds is not None:
+            hidden_states = inputs_embeds
+        else:
+            hidden_states = self.embed_tokens(input_ids)
+
+        residual = None
+        aux_hidden_states = []
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            with get_global_expert_distribution_recorder().with_current_layer(i):
+                hidden_states, residual = layer(
+                    layer_id=i,
+                    positions=positions,
+                    hidden_states=hidden_states,
+                    residual=residual,
+                    forward_batch=forward_batch,
+                    captured_last_layer_outputs=(
+                        aux_hidden_states
+                        if getattr(layer, "_is_layer_to_capture", False)
+                        else None
+                    ),
+                )
+
+        if not forward_batch.forward_mode.is_idle():
+            if residual is None:
+                hidden_states = self.norm(hidden_states)
+            else:
+                hidden_states, _ = self.norm(hidden_states, residual)
+
+        if len(aux_hidden_states) == 0:
+            return hidden_states
+
+        return hidden_states, aux_hidden_states
+
+
+class HybridLayerType(enum.Enum):
+    full_attention = "attention"
+    swa_attention = "swa_attention"
+    linear_attention = "linear_attention"
+    mamba2 = "mamba"
+
+
+class Qwen3NextForCausalLM(nn.Module):
+    fall_back_to_pt_during_load = False
+
+    # Map fused module names to their checkpoint (unfused) counterparts.
+    # This is needed so the quantization exclusion logic can match
+    # checkpoint-style names (e.g. "q_proj") against the fused sglang
+    # module names (e.g. "qkv_proj").
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
+
+    def __init__(
+        self,
+        config: Qwen3NextConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.pp_group = get_pp_group()
+        assert self.pp_group.is_first_rank and self.pp_group.is_last_rank
+
+        # The quant config's packed_modules_mapping may be None if it wasn't
+        # in the checkpoint config. The base class (QuantizationConfig) intends
+        # for models to set this. We need it so is_layer_skipped can unfuse
+        # "qkv_proj" into ["q_proj","k_proj","v_proj"] when checking exclusions.
+        if quant_config is not None and hasattr(quant_config, "packed_modules_mapping"):
+            quant_config.packed_modules_mapping = self.packed_modules_mapping
+
+        self.quant_config = quant_config
+        self.model = Qwen3NextModel(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            org_num_embeddings=config.vocab_size,
+            prefix=add_prefix("lm_head", prefix),
+            use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
+        )
+        self.logits_processor = LogitsProcessor(config)
+        # For EAGLE3 support
+        self.capture_aux_hidden_states = False
+
+        self._routed_experts_weights_of_layer = LazyValue(
+            lambda: {
+                layer_id: layer.mlp.get_moe_weights()
+                for layer_id, layer in enumerate(self.model.layers)
+                if isinstance(layer.mlp, Qwen2MoeSparseMoeBlock)
+            }
+        )
+
+    @property
+    def routed_experts_weights_of_layer(self):
+        return self._routed_experts_weights_of_layer.value
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        **kwargs,
+    ):
+        hidden_states = self.model(input_ids, positions, forward_batch, inputs_embeds)
+
+        aux_hidden_states = None
+        if self.capture_aux_hidden_states:
+            hidden_states, aux_hidden_states = hidden_states
+
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch, aux_hidden_states
+        )
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        del self.lm_head.weight
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    def get_embed(self):
+        return self.model.embed_tokens.weight
+
+    def set_embed(self, embed):
+        # NOTE: If draft hidden size != target hidden size, the embed weight cannot be shared for EAGLE3
+        if (
+            hasattr(self.config, "target_hidden_size")
+            and self.config.target_hidden_size != self.config.hidden_size
+        ):
+            return
+        del self.model.embed_tokens.weight
+        self.model.embed_tokens.weight = embed
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+    def load_weights(
+        self, weights: Iterable[Tuple[str, torch.Tensor]], is_mtp: bool = False
+    ) -> Set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        # Params for weights, fp8 weight scales, fp8 activation scales
+        # (param_name, weight_name, expert_id, shard_id)
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.num_experts,
+        )
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: Set[str] = set()
+        for name, loaded_weight in weights:
+
+            if is_mtp:
+
+                if "mtp" not in name:
+                    continue
+
+                if name in [
+                    "mtp.fc.weight",
+                    "mtp.pre_fc_norm_embedding.weight",
+                    "mtp.pre_fc_norm_hidden.weight",
+                ]:
+                    name = name.replace("mtp.", "")
+                else:
+                    name = name.replace("mtp", "model")
+
+            if not is_mtp and "mtp" in name:
+                continue
+
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            if ".self_attn." in name:
+                name = name.replace(".self_attn", "")
+
+            # Remap modelopt FP8 KV cache scale names:
+            # checkpoint: k_proj.k_scale / v_proj.v_scale
+            # model:      attn.k_scale   / attn.v_scale
+            if name.endswith(".k_proj.k_scale"):
+                name = name.replace(".k_proj.k_scale", ".attn.k_scale")
+            elif name.endswith(".v_proj.v_scale"):
+                name = name.replace(".v_proj.v_scale", ".attn.v_scale")
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+
+                # TODO(fix mtp loading)
+                if "mlp.experts" in name:
+                    continue
+
+                replaced_name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if replaced_name.endswith(".bias") and replaced_name not in params_dict:
+                    continue
+                # Skip layers on other devices.
+                # if is_pp_missing_parameter(name, self):
+                #     continue
+                if replaced_name not in params_dict:
+                    continue
+                name = replaced_name
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader")
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    replaced_name = name.replace(weight_name, param_name)
+                    # Skip layers on other devices.
+                    # if is_pp_missing_parameter(name, self):
+                    #     continue
+                    # Skip loading extra bias for GPTQ models.
+                    if (
+                        replaced_name.endswith(".bias")
+                        or replaced_name.endswith("_bias")
+                    ) and replaced_name not in params_dict:
+                        continue
+                    name = replaced_name
+                    param = params_dict[name]
+
+                    weight_loader = getattr(param, "weight_loader")
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    # if is_pp_missing_parameter(name, self):
+                    #     continue
+
+                    if name.endswith("_scale") and name not in params_dict:
+                        assert (
+                            abs(loaded_weight.item() - 1.0) < 1e-6
+                        ), f"Expected 1.0, got {loaded_weight.item()} in skipped {name}"
+                        continue
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+    @classmethod
+    def get_model_config_for_expert_location(cls, config):
+        return ModelConfigForExpertLocation(
+            num_layers=config.num_hidden_layers,
+            num_logical_experts=config.num_experts,
+            num_groups=None,
+        )
+
+    def set_eagle3_layers_to_capture(self, layer_ids: Optional[list[int]] = None):
+        if not self.pp_group.is_last_rank:
+            return
+
+        self.capture_aux_hidden_states = True
+        if layer_ids is None:
+            num_layers = self.config.num_hidden_layers
+            self.model.set_eagle3_layers_to_capture(
+                [
+                    2,
+                    num_layers // 2,
+                    num_layers - 3,
+                ]
+            )  # Specific layers for EAGLE3 support
+        else:
+            self.model.set_eagle3_layers_to_capture([val + 1 for val in layer_ids])
+
+
+EntryClass = Qwen3NextForCausalLM
diff --git a/sglang/python/sglang/srt/models/qwen3_next_mtp.py b/sglang/python/sglang/srt/models/qwen3_next_mtp.py
new file mode 100644
index 0000000000000000000000000000000000000000..9722f7e4a35e166420297da60b2c8b3a9ecc5fcb
--- /dev/null
+++ b/sglang/python/sglang/srt/models/qwen3_next_mtp.py
@@ -0,0 +1,112 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Inference-only Qwen3Next MTP Speculative Decoding."""
+
+import logging
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from sglang.srt.layers.layernorm import GemmaRMSNorm
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.qwen3_next import Qwen3NextForCausalLM, Qwen3NextModel
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import add_prefix
+
+logger = logging.getLogger(__name__)
+
+
+class Qwen3NextForCausalLMMTP(Qwen3NextForCausalLM):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        nn.Module.__init__(self)
+        self.config = config
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.quant_config = quant_config
+        # if not set, model load will be broken in Qwen3NextForCausalLM load_weights()
+        self.pp_group = get_pp_group()
+        # self.determine_num_fused_shared_experts("Qwen3NextForCausalLMMTP")
+
+        # currently based on the provided ckpt, we:
+        # (1) do not use_dedicated_mtp_embeddings provided in ckpt since not provided and directly use the target model embeddings
+        # (2) hardcode bias=False since not provided
+        self.fc = nn.Linear(2 * config.hidden_size, config.hidden_size, bias=False)
+        RMSNorm_cls = GemmaRMSNorm
+        self.pre_fc_norm_embedding = RMSNorm_cls(
+            config.hidden_size, config.rms_norm_eps
+        )
+        self.pre_fc_norm_hidden = RMSNorm_cls(config.hidden_size, config.rms_norm_eps)
+        config.num_hidden_layers = 1
+        config.full_attention_interval = 1
+        self.model = Qwen3NextModel(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("model.shared_head.head", prefix),
+            use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: Optional[torch.Tensor] = None,
+        **kwargs,
+    ):
+        if input_embeds is None:
+            input_embeds = self.model.embed_tokens(input_ids)
+
+        hidden_states = forward_batch.spec_info.hidden_states
+        # Some idle batch has 0 batch size. GemmaRMSNorm.forward would fail due to bs=0.
+        if not forward_batch.forward_mode.is_idle():
+            input_embeds = self.pre_fc_norm_embedding(input_embeds)
+            hidden_states = self.pre_fc_norm_hidden(hidden_states)
+        hidden_states = self.fc(torch.cat((input_embeds, hidden_states), dim=-1))
+
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            hidden_states,
+        )
+
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(
+        self, weights: Iterable[Tuple[str, torch.Tensor]], is_mtp: bool = False
+    ):
+        super().load_weights(weights, is_mtp=True)
+
+
+EntryClass = [Qwen3NextForCausalLMMTP]
diff --git a/sglang/python/sglang/srt/models/qwen3_omni_moe.py b/sglang/python/sglang/srt/models/qwen3_omni_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0e82d03ed97083ac12521b46989d7d129c9eead
--- /dev/null
+++ b/sglang/python/sglang/srt/models/qwen3_omni_moe.py
@@ -0,0 +1,664 @@
+# Copyright 2025 Qwen Team
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Inference-only Qwen3-VL model compatible with HuggingFace weights."""
+
+import math
+from typing import Iterable, List, Optional, Tuple
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PreTrainedModel
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import BaseModelOutput
+
+from sglang.srt.configs.qwen3_omni import (
+    Qwen3OmniMoeAudioEncoderConfig,
+    Qwen3OmniMoeThinkerConfig,
+    Qwen3OmniMoeVisionEncoderConfig,
+)
+from sglang.srt.configs.qwen3_vl import Qwen3VLMoeConfig
+from sglang.srt.layers.attention.vision import VisionAttention
+from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
+from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.schedule_batch import MultimodalDataItem
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.qwen3_vl import Qwen3VLMoeVisionModel
+from sglang.srt.models.qwen3_vl_moe import (
+    Qwen3MoeLLMModel,
+    Qwen3VLMoeForConditionalGeneration,
+    load_fused_expert_weights,
+)
+from sglang.srt.utils import add_prefix, is_npu, logger
+
+
+class Qwen3OmniMoeAudioEncoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: Qwen3OmniMoeAudioEncoderConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        embed_dim = config.d_model
+        self.embed_dim = config.d_model
+        self.self_attn = VisionAttention(
+            embed_dim=embed_dim,
+            num_heads=config.encoder_attention_heads,
+            projection_size=embed_dim,
+            use_qkv_parallel=True,
+            proj_bias=True,
+            flatten_batch=True,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
+        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+                `(encoder_attention_heads,)`.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+        hidden_states = self.self_attn(
+            x=hidden_states,
+            cu_seqlens=cu_seqlens,
+        )
+        hidden_states = residual + hidden_states
+        residual = hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = residual + hidden_states
+
+        if hidden_states.dtype == torch.float16:
+            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+            hidden_states = torch.clamp(
+                hidden_states, min=-clamp_value, max=clamp_value
+            )
+
+        outputs = (hidden_states,)
+
+        return outputs
+
+
+class SinusoidsPositionEmbedding(nn.Module):
+    def __init__(self, length, channels, max_timescale=10000):
+        super().__init__()
+        if channels % 2 != 0:
+            raise ValueError("SinusoidsPositionEmbedding needs even channels input")
+        log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
+        inv_timescales = torch.exp(
+            -log_timescale_increment * torch.arange(channels // 2).float()
+        )
+        scaled_time = (
+            torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
+        )
+        self.register_buffer(
+            "positional_embedding",
+            torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1),
+            persistent=False,
+        )
+
+    def forward(self, seqlen: int):
+        return self.positional_embedding[:seqlen, :]
+
+
+def _get_feat_extract_output_lengths(input_lengths):
+    """
+    Computes the output length of the convolutional layers and the output length of the audio encoder
+    """
+
+    input_lengths_leave = input_lengths % 100
+    feat_lengths = (input_lengths_leave - 1) // 2 + 1
+    output_lengths = (
+        ((feat_lengths - 1) // 2 + 1 - 1) // 2 + 1 + (input_lengths // 100) * 13
+    )
+    return output_lengths
+
+
+class Qwen3OmniMoeAudioEncoder(PreTrainedModel):
+    config: Qwen3OmniMoeAudioEncoderConfig
+
+    def __init__(self, config: Qwen3OmniMoeAudioEncoderConfig):
+        super().__init__(config)
+        self.dropout = config.dropout
+
+        embed_dim = config.d_model
+        self.num_mel_bins = config.num_mel_bins
+        self.max_source_positions = config.max_source_positions
+        self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
+        self.n_window = config.n_window
+        self.positional_embedding = SinusoidsPositionEmbedding(
+            self.max_source_positions, embed_dim
+        )
+        self.layers = nn.ModuleList(
+            [
+                Qwen3OmniMoeAudioEncoderLayer(config)
+                for _ in range(config.encoder_layers)
+            ]
+        )
+        self.ln_post = nn.LayerNorm(config.d_model)
+        self.gradient_checkpointing = False
+        self.conv2d1 = nn.Conv2d(1, config.downsample_hidden_size, 3, 2, padding=1)
+        self.conv2d2 = nn.Conv2d(
+            config.downsample_hidden_size,
+            config.downsample_hidden_size,
+            3,
+            2,
+            padding=1,
+        )
+        self.conv2d3 = nn.Conv2d(
+            config.downsample_hidden_size,
+            config.downsample_hidden_size,
+            3,
+            2,
+            padding=1,
+        )
+        self.conv_out = nn.Linear(
+            config.downsample_hidden_size
+            * ((((config.num_mel_bins + 1) // 2 + 1) // 2 + 1) // 2),
+            config.d_model,
+            bias=False,
+        )
+        self.proj1 = nn.Linear(config.d_model, config.d_model)
+        self.act = ACT2FN[config.activation_function]
+        self.proj2 = nn.Linear(config.d_model, config.output_dim)
+        self.n_window_infer = self.config.n_window_infer
+        self.conv_chunksize = self.config.conv_chunksize
+
+    def _freeze_parameters(self):
+        for param in self.parameters():
+            param.requires_grad = False
+        self._requires_grad = False
+
+    def get_input_embeddings(self) -> nn.Module:
+        return self.conv1
+
+    def set_input_embeddings(self, value: nn.Module):
+        self.conv1 = value
+
+    def forward(
+        self,
+        input_features,
+        feature_lens=None,
+        aftercnn_lens=None,
+    ):
+        r"""
+        feature_lens (`torch.LongTensor` of shape `(batch_size,)`):
+            mel length
+        aftercnn_lens (`torch.LongTensor` of shape `(batch_size,)`):
+            mel length after cnn
+        """
+        aftercnn_lens = _get_feat_extract_output_lengths(feature_lens)
+        chunk_num = torch.ceil(feature_lens / (self.n_window * 2)).long()
+
+        chunk_lengths = torch.tensor(
+            [self.n_window * 2] * chunk_num.sum(),
+            dtype=torch.long,
+            device=feature_lens.device,
+        )
+        tail_chunk_index = F.pad(chunk_num, (1, 0), value=-1).cumsum(0)[1:]
+        chunk_lengths[tail_chunk_index] = feature_lens % (self.n_window * 2)
+        chunk_lengths[chunk_lengths == 0] = self.n_window * 2
+
+        chunk_list = input_features.T.split(chunk_lengths.tolist(), dim=0)
+        padded_feature = nn.utils.rnn.pad_sequence(
+            chunk_list, batch_first=True
+        ).transpose(1, 2)
+        feature_lens_after_cnn = _get_feat_extract_output_lengths(chunk_lengths)
+        padded_mask_after_cnn = nn.utils.rnn.pad_sequence(
+            [
+                torch.ones(length, dtype=torch.bool, device=padded_feature.device)
+                for length in feature_lens_after_cnn
+            ],
+            batch_first=True,
+        )
+        padded_feature = padded_feature.unsqueeze(1)
+        # Split to chunk to avoid OOM during convolution
+        padded_embeds = []
+        for chunk in padded_feature.split(self.conv_chunksize, dim=0):
+            padded_embed = F.gelu(self.conv2d1(chunk))
+            padded_embed = F.gelu(self.conv2d2(padded_embed))
+            padded_embed = F.gelu(self.conv2d3(padded_embed))
+            padded_embeds.append(padded_embed)
+        padded_embed = torch.cat(padded_embeds, dim=0)
+        b, c, f, t = padded_embed.size()
+        padded_embed = self.conv_out(
+            padded_embed.permute(0, 3, 1, 2).contiguous().view(b, t, c * f)
+        )
+
+        positional_embedding = (
+            self.positional_embedding.positional_embedding[: padded_embed.shape[1], :]
+            .unsqueeze(0)
+            .to(padded_embed.dtype)
+        )
+        padded_embed = padded_embed + positional_embedding
+        hidden_states = padded_embed[padded_mask_after_cnn]
+        cu_chunk_lens = [0]
+        window_aftercnn = padded_mask_after_cnn.shape[-1] * (
+            self.n_window_infer // (self.n_window * 2)
+        )
+        for cnn_len in aftercnn_lens:
+            cu_chunk_lens += [window_aftercnn] * (cnn_len // window_aftercnn)
+            remainder = cnn_len % window_aftercnn
+            if remainder != 0:
+                cu_chunk_lens += [remainder]
+        cu_seqlens = torch.tensor(cu_chunk_lens, device=aftercnn_lens.device).cumsum(
+            -1, dtype=torch.int32
+        )
+        # cu_seqlens must be on cpu because of npu_flash_attention_unpad operator restriction
+        if is_npu():
+            cu_seqlens = cu_seqlens.to("cpu")
+
+        for encoder_layer in self.layers:
+            layer_outputs = encoder_layer(
+                hidden_states,
+                cu_seqlens,
+            )
+
+            hidden_states = layer_outputs[0]
+
+        hidden_states = self.ln_post(hidden_states)
+        hidden_states = self.proj1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.proj2(hidden_states)
+        return BaseModelOutput(last_hidden_state=hidden_states)
+
+    # Ignore copy
+    def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
+        """
+        Computes the output length of the convolutional layers and the output length of the audio encoder
+        """
+        input_lengths = (input_lengths - 1) // 2 + 1
+        output_lengths = (input_lengths - 2) // 2 + 1
+        return input_lengths, output_lengths
+
+
+class Qwen3OmniMoeVisionPatchMerger(nn.Module):
+
+    def __init__(
+        self,
+        dim: int,
+        context_dim: int,
+        spatial_merge_size: int = 2,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        use_postshuffle_norm=False,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = context_dim * (spatial_merge_size**2)
+        self.use_postshuffle_norm = use_postshuffle_norm
+        self.ln_q = nn.LayerNorm(
+            self.hidden_size if use_postshuffle_norm else context_dim, eps=1e-6
+        )
+        self.mlp = nn.ModuleList(
+            [
+                ColumnParallelLinear(
+                    self.hidden_size,
+                    self.hidden_size,
+                    bias=True,
+                    quant_config=quant_config,
+                    prefix=add_prefix("mlp.0", prefix),
+                ),
+                nn.GELU(),
+                RowParallelLinear(
+                    self.hidden_size,
+                    dim,
+                    bias=True,
+                    quant_config=quant_config,
+                    prefix=add_prefix("mlp.2", prefix),
+                ),
+            ]
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = (
+            x.view(-1, self.hidden_size)
+            if self.use_postshuffle_norm
+            else x.view(-1, x.shape[-1])
+        )
+        hidden = self.ln_q(x).view(-1, self.hidden_size)
+        for layer in self.mlp:
+            if isinstance(hidden, tuple):
+                hidden = hidden[0]
+            hidden = layer(hidden)
+
+        if isinstance(hidden, tuple):
+            hidden = hidden[0]
+
+        return hidden
+
+
+class Qwen3OmniMoeVisionEncoder(Qwen3VLMoeVisionModel):
+    config: Qwen3OmniMoeVisionEncoderConfig
+
+    def __init__(
+        self,
+        config: Qwen3OmniMoeVisionEncoderConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = None,
+        **kwargs,
+    ):
+        super().__init__(
+            vision_config=config,
+            quant_config=quant_config,
+            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+        )
+
+        self.merger = Qwen3OmniMoeVisionPatchMerger(
+            dim=config.out_hidden_size,
+            context_dim=config.hidden_size,
+            spatial_merge_size=config.spatial_merge_size,
+            quant_config=quant_config,
+            use_postshuffle_norm=False,
+            prefix=add_prefix("merger", prefix),
+        )
+        self.merger_list = nn.ModuleList(
+            [
+                Qwen3OmniMoeVisionPatchMerger(
+                    dim=config.out_hidden_size,
+                    context_dim=config.hidden_size,
+                    spatial_merge_size=config.spatial_merge_size,
+                    use_postshuffle_norm=True,
+                    quant_config=quant_config,
+                    prefix=add_prefix("merger_list", prefix),
+                )
+                for _ in range(len(config.deepstack_visual_indexes))
+            ]
+        )
+        del self.deepstack_merger_list
+
+    @property
+    def deepstack_merger_list(self):
+        return self.merger_list
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return self.patch_embed.proj.weight.dtype
+
+    @property
+    def device(self) -> torch.device:
+        return self.patch_embed.proj.weight.device
+
+
+class Qwen3OmniMoeThinkerForConditionalGeneration(Qwen3VLMoeForConditionalGeneration):
+    config: Qwen3OmniMoeThinkerConfig
+
+    def __init__(
+        self,
+        config: Qwen3OmniMoeThinkerConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__(
+            config, quant_config, prefix, language_model_cls=Qwen3MoeLLMModel
+        )
+        self.audio_tower = Qwen3OmniMoeAudioEncoder(config.audio_config)
+        self.visual = Qwen3OmniMoeVisionEncoder(
+            config.vision_config,
+            quant_config=quant_config,
+            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+            prefix=add_prefix("visual", prefix),
+        )
+        self.pad_token_id = (
+            self.config.pad_token_id if self.config.pad_token_id is not None else -1
+        )
+
+    def get_audio_feature(self, items: List[MultimodalDataItem]):
+        feature_attention_mask = torch.cat(
+            [item.feature_attention_mask for item in items], dim=0
+        ).type(torch.long)
+        input_features = (
+            torch.cat([item.feature for item in items])
+            .type(self.audio_tower.dtype)
+            .to(next(self.audio_tower.parameters()).device)
+        )
+        if feature_attention_mask is not None:
+            audio_feature_lengths = torch.sum(feature_attention_mask, dim=1)
+            input_features = input_features.permute(0, 2, 1)[
+                feature_attention_mask.bool()
+            ].permute(1, 0)
+        else:
+            audio_feature_lengths = None
+
+        feature_lens = (
+            audio_feature_lengths
+            if audio_feature_lengths is not None
+            else feature_attention_mask.sum(-1)
+        )
+        audio_outputs = self.audio_tower(
+            input_features,
+            feature_lens=feature_lens,
+        )
+        audio_features = audio_outputs.last_hidden_state
+
+        return audio_features
+
+
+class Qwen3OmniMoeForConditionalGeneration(PreTrainedModel):
+    def __init__(
+        self,
+        config: Qwen3VLMoeConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__(config)
+        self.config = config
+
+        self.thinker = Qwen3OmniMoeThinkerForConditionalGeneration(
+            config.thinker_config, quant_config=quant_config, prefix=prefix
+        )
+        self.enable_talker = False
+        self.pad_input_ids = self.thinker.pad_input_ids
+        self.forward = self.thinker.forward
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            ("gate_up_proj", "up_proj", 1),
+            ("gate_up_proj", "gate_proj", 0),
+        ]
+
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.num_experts,
+        )
+
+        # Skip loading extra parameters for GPTQ/modelopt models.
+        ignore_suffixes = (
+            ".bias",
+            "_bias",
+            ".k_scale",
+            "_k_scale",
+            ".v_scale",
+            "_v_scale",
+            ".weight_scale",
+            "_weight_scale",
+            ".input_scale",
+            "_input_scale",
+        )
+
+        is_fused_expert = False
+        fused_expert_params_mapping = [
+            ("experts.w13_weight", "experts.gate_up_proj", 0, "w1"),
+            ("experts.w2_weight", "experts.down_proj", 0, "w2"),
+        ]
+
+        num_experts = self.config.num_experts
+
+        # Cache params_dict to avoid repeated expensive traversal of model parameters
+        if not hasattr(self, "_cached_params_dict"):
+            self._cached_params_dict = dict(self.named_parameters())
+        params_dict = self._cached_params_dict
+
+        for name, loaded_weight in weights:
+            name = name.replace(r"model.language_model.", r"model.")
+
+            if ("talker" in name or "code2wav" in name) and not self.enable_talker:
+                continue
+
+            name = name.replace(".self_attn.out_proj", ".self_attn.proj")
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if "experts.gate_up_proj" in name or "experts.down_proj" in name:
+                    is_fused_expert = True
+                    expert_params_mapping = fused_expert_params_mapping
+
+                # Skip non-stacked layers and experts (experts handled below).
+                if weight_name not in name:
+                    continue
+                if "visual" in name:
+                    continue
+
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if "mlp.experts" in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra parameters for GPTQ/modelopt models.
+                if name.endswith(ignore_suffixes) and name not in params_dict:
+                    continue
+                # [TODO] Skip layers that are on other devices (check if sglang has a similar function)
+                # if is_pp_missing_parameter(name, self):
+                #     continue
+
+                if name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Track if this is an expert weight to enable early skipping
+                is_expert_weight = False
+
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    if "visual" in name or "audio_tower" in name:
+                        continue
+                    # Anyway, this is an expert weight and should not be
+                    # attempted to load as other weights later
+                    is_expert_weight = True
+                    name_mapped = name.replace(weight_name, param_name)
+                    if is_fused_expert:
+                        loaded_weight = loaded_weight.transpose(-1, -2)  # no bias
+                        if "experts.gate_up_proj" in name:
+                            loaded_weight = loaded_weight.chunk(2, dim=-2)
+                            load_fused_expert_weights(
+                                name_mapped,
+                                params_dict,
+                                loaded_weight[0],
+                                "w1",
+                                num_experts,
+                            )
+                            load_fused_expert_weights(
+                                name_mapped,
+                                params_dict,
+                                loaded_weight[1],
+                                "w3",
+                                num_experts,
+                            )
+                        else:
+                            load_fused_expert_weights(
+                                name_mapped,
+                                params_dict,
+                                loaded_weight,
+                                shard_id,
+                                num_experts,
+                            )
+                    else:
+                        # Skip loading extra parameters for GPTQ/modelopt models.
+                        if (
+                            name_mapped.endswith(ignore_suffixes)
+                            and name_mapped not in params_dict
+                        ):
+                            continue
+                        if name_mapped in params_dict.keys():
+                            param = params_dict[name_mapped]
+                        else:
+                            continue
+                        # We should ask the weight loader to return success or
+                        # not here since otherwise we may skip experts with
+                        # # other available replicas.
+                        weight_loader = param.weight_loader
+                        weight_loader(
+                            param,
+                            loaded_weight,
+                            name_mapped,
+                            shard_id=shard_id,
+                            expert_id=expert_id,
+                        )
+                    name = name_mapped
+                    break
+                else:
+                    if is_expert_weight:
+                        # This is an expert weight but not mapped to this rank, skip all remaining processing
+                        continue
+                    if "visual" in name or "audio_tower" in name:
+                        # adapt to VisionAttention
+                        name = name.replace(r"attn.qkv.", r"attn.qkv_proj.")
+                        name = name.replace(r"model.visual.", r"visual.")
+                        name = name.replace(r"attn.out_proj.", r"attn.proj.")
+
+                    # Skip loading extra parameters for GPTQ/modelopt models.
+                    if name.endswith(ignore_suffixes) and name not in params_dict:
+                        continue
+
+                    if name in params_dict.keys():
+                        param = params_dict[name]
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        weight_loader(param, loaded_weight)
+                    else:
+                        logger.warning(
+                            f"Loaded weight with {name=} not found in params_dict"
+                        )
+
+
+EntryClass = Qwen3OmniMoeForConditionalGeneration
diff --git a/sglang/python/sglang/srt/models/qwen3_rm.py b/sglang/python/sglang/srt/models/qwen3_rm.py
new file mode 100644
index 0000000000000000000000000000000000000000..eeed15421f216aa2d3aff8c316dae0c127a3d15e
--- /dev/null
+++ b/sglang/python/sglang/srt/models/qwen3_rm.py
@@ -0,0 +1,47 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Qwen3 Reward Model for RLHF and best-of-N sampling."""
+
+from typing import Optional
+
+from torch import nn
+from transformers import Qwen2Config  # Qwen3 uses Qwen2Config
+
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.models.qwen3_classification import Qwen3ForPooledOutput
+
+
+class Qwen3ForRewardModel(Qwen3ForPooledOutput):
+    """Qwen3 Reward Model with 2-layer MLP scoring head for RLHF."""
+
+    def __init__(
+        self,
+        config: Qwen2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(config, quant_config, prefix)
+        self.num_labels = 1
+        self.score = nn.Sequential(
+            nn.Linear(config.hidden_size, config.hidden_size),
+            nn.ReLU(),
+            nn.Linear(config.hidden_size, self.num_labels),
+        )
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=False)
+
+
+EntryClass = [
+    Qwen3ForRewardModel,
+]
diff --git a/sglang/python/sglang/srt/models/qwen3_vl.py b/sglang/python/sglang/srt/models/qwen3_vl.py
new file mode 100644
index 0000000000000000000000000000000000000000..34a645078c7153e541833cb3bc480f1500ee40f1
--- /dev/null
+++ b/sglang/python/sglang/srt/models/qwen3_vl.py
@@ -0,0 +1,1380 @@
+# Copyright 2025 Qwen Team
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Inference-only Qwen3-VL model compatible with HuggingFace weights."""
+
+import logging
+import math
+import re
+from functools import lru_cache, partial
+from typing import Callable, Iterable, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+from einops import rearrange
+from transformers.activations import ACT2FN
+
+from sglang.srt.configs.qwen3_vl import Qwen3VLConfig, Qwen3VLVisionConfig
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.distributed.parallel_state import get_pp_group
+from sglang.srt.environ import envs
+from sglang.srt.layers.attention.vision import (
+    BATCH_BUCKETS,
+    FLASHINFER_MAX_SEQLEN_BUCKETS,
+    FLASHINFER_WORKSPACE_SIZE_BYTES,
+    VisionAttention,
+)
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer, get_layer_id
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternMultimodalTokens,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.qwen3 import Qwen3Model
+from sglang.srt.models.utils import (
+    RotaryPosMixin,
+    WeightsMapper,
+    compute_cu_seqlens_from_grid_numpy,
+)
+from sglang.srt.multimodal.mm_utils import run_dp_sharded_mrope_vision_model
+from sglang.srt.multimodal.vit_cuda_graph_runner import ViTCudaGraphRunner
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import add_prefix, get_int_env_var, is_npu, round_up
+from sglang.srt.utils.hf_transformers_utils import get_processor
+
+logger = logging.getLogger(__name__)
+
+
+class Qwen3_VisionMLP(nn.Module):
+
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: int,
+        bias: bool = True,
+        hidden_act="silu",
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        use_data_parallel: bool = False,
+    ):
+        super().__init__()
+        self.tp_size = 1 if use_data_parallel else get_attention_tp_size()
+        self.tp_rank = 0 if use_data_parallel else get_attention_tp_rank()
+        self.linear_fc1 = ColumnParallelLinear(
+            in_features,
+            hidden_features,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("linear_fc1", prefix),
+            tp_size=self.tp_size,
+            tp_rank=self.tp_rank,
+        )
+        self.linear_fc2 = RowParallelLinear(
+            hidden_features,
+            in_features,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=add_prefix("linear_fc2", prefix),
+            tp_size=self.tp_size,
+            tp_rank=self.tp_rank,
+            use_dp_attention_reduce=is_dp_attention_enabled(),
+        )
+        self.act = ACT2FN[hidden_act]
+
+    def forward(self, x: torch.Tensor):
+        x_fc1, _ = self.linear_fc1(x)
+        mlp_output, _ = self.linear_fc2(self.act(x_fc1))
+        return mlp_output
+
+
+class Qwen3VLVisionPatchEmbed(nn.Module):
+    def __init__(self, config) -> None:
+        super().__init__()
+        self.patch_size = config.patch_size
+        self.temporal_patch_size = config.temporal_patch_size
+        self.in_channels = config.in_channels
+        self.embed_dim = config.hidden_size
+
+        kernel_size = [self.temporal_patch_size, self.patch_size, self.patch_size]
+        self.proj = nn.Conv3d(
+            self.in_channels,
+            self.embed_dim,
+            kernel_size=kernel_size,
+            stride=kernel_size,
+            bias=True,
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        target_dtype = self.proj.weight.dtype
+        hidden_states = hidden_states.view(
+            -1,
+            self.in_channels,
+            self.temporal_patch_size,
+            self.patch_size,
+            self.patch_size,
+        )
+        hidden_states = self.proj(hidden_states.to(dtype=target_dtype)).view(
+            -1, self.embed_dim
+        )
+        return hidden_states
+
+
+class Qwen3_VisionBlock(nn.Module):
+
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        intermediate_dim: int,
+        hidden_act="silu",
+        norm_layer: Optional[Callable[[int], nn.Module]] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        use_data_parallel: bool = False,
+        workspace_buffer: torch.Tensor | None = None,
+    ) -> None:
+        super().__init__()
+        if norm_layer is None:
+            norm_layer = partial(nn.LayerNorm, eps=1e-6)
+        self.norm1 = norm_layer(dim)
+        self.norm2 = norm_layer(dim)
+
+        self.attn = VisionAttention(
+            embed_dim=dim,
+            num_heads=num_heads,
+            projection_size=dim,
+            use_qkv_parallel=True,
+            proj_bias=True,
+            flatten_batch=True,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+            use_data_parallel=use_data_parallel,
+            use_dp_attention_reduce=is_dp_attention_enabled(),
+            workspace_buffer=workspace_buffer,
+        )
+        self.mlp = Qwen3_VisionMLP(
+            dim,
+            intermediate_dim,
+            hidden_act=hidden_act,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+            use_data_parallel=use_data_parallel,
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb_cos: torch.Tensor,
+        rotary_pos_emb_sin: torch.Tensor,
+        output_ws: Optional[torch.Tensor] = None,
+        max_seqlen: Optional[torch.Tensor] = None,
+        sequence_lengths: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        hidden_states = self.norm1(x)
+        hidden_states = rearrange(hidden_states, "s b ... -> b s ...")
+        attn = self.attn(
+            hidden_states,
+            cu_seqlens=cu_seqlens,
+            rotary_pos_emb_cos=rotary_pos_emb_cos,
+            rotary_pos_emb_sin=rotary_pos_emb_sin,
+            output_ws=output_ws,
+            max_seqlen=max_seqlen,
+            sequence_lengths=sequence_lengths,
+        )
+        attn = rearrange(attn, "b s ... -> s b ...")
+        x += attn
+        norm2 = self.norm2(x)
+        mlp = self.mlp(norm2)
+        x += mlp
+        return x
+
+
+class Qwen3VLMoeVisionPatchMerger(nn.Module):
+
+    def __init__(
+        self,
+        dim: int,
+        context_dim: int,
+        norm_layer: Optional[Callable[[int], nn.Module]] = None,
+        spatial_merge_size: int = 2,
+        use_postshuffle_norm: bool = False,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        use_data_parallel: bool = False,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = context_dim * (spatial_merge_size**2)
+
+        self.use_postshuffle_norm = use_postshuffle_norm
+
+        if norm_layer is None:
+            norm_layer = partial(nn.LayerNorm, eps=1e-6)
+        self.norm = norm_layer(
+            self.hidden_size if use_postshuffle_norm else context_dim
+        )
+        self.tp_size = 1 if use_data_parallel else get_attention_tp_size()
+        self.tp_rank = 0 if use_data_parallel else get_attention_tp_rank()
+        self.linear_fc1 = ColumnParallelLinear(
+            self.hidden_size,
+            self.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("linear_fc1", prefix),
+            tp_size=self.tp_size,
+            tp_rank=self.tp_rank,
+        )
+        self.act_fn = nn.GELU()
+        self.linear_fc2 = RowParallelLinear(
+            self.hidden_size,
+            dim,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("linear_fc2", prefix),
+            tp_size=self.tp_size,
+            tp_rank=self.tp_rank,
+            use_dp_attention_reduce=is_dp_attention_enabled(),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if self.use_postshuffle_norm:
+            x = self.norm(x.view(-1, self.hidden_size))
+        else:
+            x = self.norm(x).view(-1, self.hidden_size)
+
+        x_parallel, _ = self.linear_fc1(x)
+        x_parallel = self.act_fn(x_parallel)
+        out, _ = self.linear_fc2(x_parallel)
+        return out
+
+
+class Qwen3VLMoeVisionModel(nn.Module, RotaryPosMixin):
+
+    def __init__(
+        self,
+        vision_config: Qwen3VLVisionConfig,
+        norm_eps: float = 1e-6,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        use_data_parallel: bool = False,
+    ) -> None:
+        super().__init__()
+        self.pp_group = get_pp_group()
+        self.hidden_size = vision_config.hidden_size
+        self.num_heads = vision_config.num_heads
+        self.num_position_embeddings = vision_config.num_position_embeddings
+        self.num_grid_per_side = int(self.num_position_embeddings**0.5)
+        self.num_grid = self.num_grid_per_side * self.num_grid_per_side
+        self.align_corners = (
+            get_global_server_args().enable_precise_embedding_interpolation
+        )
+        self.patch_size = vision_config.patch_size
+        self.spatial_merge_size = vision_config.spatial_merge_size
+        self.spatial_merge_unit = self.spatial_merge_size**2
+        self.temporal_patch_size = vision_config.temporal_patch_size
+        self.use_data_parallel = use_data_parallel
+        # layer indexes of which layer's output should be deep-stacked
+        self.deepstack_visual_indexes = vision_config.deepstack_visual_indexes
+        self.out_hidden_size = vision_config.out_hidden_size * (
+            1 + len(self.deepstack_visual_indexes)
+        )
+        self.patch_embed = Qwen3VLVisionPatchEmbed(config=vision_config)
+        if self.pp_group.is_first_rank:
+            self.pos_embed = VocabParallelEmbedding(
+                self.num_position_embeddings,
+                self.hidden_size,
+                quant_config=quant_config,
+                use_attn_tp_group=is_dp_attention_enabled(),
+                prefix=add_prefix("pos_embed", prefix),
+            )
+        else:
+            self.pos_embed = PPMissingLayer()
+
+        norm_layer = partial(nn.LayerNorm, eps=norm_eps)
+        head_dim = self.hidden_size // self.num_heads
+        self.rotary_pos_emb = get_rope(
+            head_size=head_dim,
+            rotary_dim=head_dim // 2,
+            max_position=8192,
+            base=10000.0,
+            is_neox_style=True,
+        )
+
+        workspace_buffer = None
+        if get_global_server_args().mm_attention_backend == "flashinfer_cudnn":
+            if torch.cuda.is_available() and (not is_npu()):
+                ws_device = torch.device("cuda", torch.cuda.current_device())
+            else:
+                ws_device = self.device
+            workspace_buffer = torch.empty(
+                FLASHINFER_WORKSPACE_SIZE_BYTES,
+                dtype=torch.uint8,
+                device=ws_device,
+            )
+
+        self.blocks = nn.ModuleList(
+            [
+                Qwen3_VisionBlock(
+                    dim=self.hidden_size,
+                    num_heads=self.num_heads,
+                    intermediate_dim=vision_config.intermediate_size,
+                    hidden_act=vision_config.hidden_act,
+                    norm_layer=norm_layer,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"blocks.{layer_idx}", prefix),
+                    use_data_parallel=use_data_parallel,
+                    workspace_buffer=workspace_buffer,
+                )
+                for layer_idx in range(vision_config.depth)
+            ]
+        )
+        self.merger = Qwen3VLMoeVisionPatchMerger(
+            dim=vision_config.out_hidden_size,
+            context_dim=self.hidden_size,
+            norm_layer=norm_layer,
+            spatial_merge_size=self.spatial_merge_size,
+            quant_config=quant_config,
+            prefix=add_prefix("merger", prefix),
+            use_data_parallel=use_data_parallel,
+        )
+
+        self.deepstack_merger_list = nn.ModuleList(
+            [
+                Qwen3VLMoeVisionPatchMerger(
+                    dim=vision_config.out_hidden_size,
+                    context_dim=self.hidden_size,
+                    spatial_merge_size=self.spatial_merge_size,
+                    use_postshuffle_norm=True,
+                    norm_layer=norm_layer,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"deepstack_merger_list.{layer_idx}", prefix),
+                    use_data_parallel=use_data_parallel,
+                )
+                for layer_idx in range(len(self.deepstack_visual_indexes))
+            ]
+        )
+
+        self.tp_size = (
+            1 if use_data_parallel else get_tensor_model_parallel_world_size()
+        )
+        self.cuda_graph_runner: Optional[ViTCudaGraphRunner] = ViTCudaGraphRunner(self)
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return self.patch_embed.proj.weight.dtype
+
+    @property
+    def device(self) -> torch.device:
+        return self.patch_embed.proj.weight.device
+
+    def rot_pos_emb(
+        self, grid_thw: list[list[int]]
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        pos_ids = []
+        for t, h, w in grid_thw:
+            base = self.rot_pos_ids(h, w, self.spatial_merge_size)
+            pos_ids.append(base if t == 1 else base.repeat(t, 1))
+
+        pos_ids = torch.cat(pos_ids, dim=0).to(self.device, non_blocking=True)
+        max_grid_size = max(max(h, w) for _, h, w in grid_thw)
+
+        # Use pre-computed cos_sin_cache from RotaryEmbedding
+        cos, sin = self.rotary_pos_emb.get_cos_sin(max_grid_size)
+
+        cos_combined = cos[pos_ids].flatten(1)
+        sin_combined = sin[pos_ids].flatten(1)
+
+        return cos_combined, sin_combined
+
+    def _get_interpolation_indices(self, dim_size: int) -> torch.Tensor:
+        """
+        Compute continuous interpolation indices for a single dimension.
+
+        Returns continuous indices.
+        """
+        if self.align_corners:
+            indices = np.linspace(
+                0, self.num_grid_per_side - 1, dim_size, dtype=np.float32
+            )
+        else:
+            indices = (np.arange(dim_size, dtype=np.float32) + 0.5) * (
+                self.num_grid_per_side / dim_size
+            ) - 0.5
+            indices = np.clip(indices, 0, self.num_grid_per_side - 1)
+        return indices
+
+    def _calculate_indices_and_weights(self, h_idxs, w_idxs):
+        """
+        Compute bilinear interpolation indices and weights.
+
+        Returns tuple of (indices, weights), each as 4 numpy arrays for the 4 corner points.
+        """
+        h_f = np.floor(h_idxs).astype(np.int64)
+        h_c = np.clip(h_f + 1, 0, self.num_grid_per_side - 1)
+        dh = h_idxs - h_f
+
+        w_f = np.floor(w_idxs).astype(np.int64)
+        w_c = np.clip(w_f + 1, 0, self.num_grid_per_side - 1)
+        dw = w_idxs - w_f
+
+        side = self.num_grid_per_side
+
+        indices = [
+            (h_f[:, None] * side + w_f).flatten(),
+            (h_f[:, None] * side + w_c).flatten(),
+            (h_c[:, None] * side + w_f).flatten(),
+            (h_c[:, None] * side + w_c).flatten(),
+        ]
+        weights = [
+            ((1 - dh)[:, None] * (1 - dw)).flatten(),
+            ((1 - dh)[:, None] * dw).flatten(),
+            (dh[:, None] * (1 - dw)).flatten(),
+            (dh[:, None] * dw).flatten(),
+        ]
+        return indices, weights
+
+    def _get_position_embedding(self, patch_pos_embeds, grid_ts, grid_hs, grid_ws):
+        """
+        Tile and reorganize position embeddings to align with the token sequence.
+        """
+        result_parts = []
+        merge_size = self.spatial_merge_size
+
+        for pos_embed, t, h, w in zip(patch_pos_embeds, grid_ts, grid_hs, grid_ws):
+            pos_embed = pos_embed.repeat(t, 1)
+
+            h_merge = h // merge_size
+            w_merge = w // merge_size
+
+            pos_embed = (
+                pos_embed.view(t, h_merge, merge_size, w_merge, merge_size, -1)
+                .permute(0, 1, 3, 2, 4, 5)
+                .flatten(0, 4)
+            )
+
+            result_parts.append(pos_embed)
+
+        return torch.cat(result_parts, dim=0)
+
+    def _torch_interp_indices(
+        self, dim_size: int, device: torch.device
+    ) -> torch.Tensor:
+        side = self.num_grid_per_side
+        if self.align_corners:
+            # align_corners=True
+            return torch.linspace(
+                0, side - 1, dim_size, dtype=torch.float32, device=device
+            )
+        else:
+            # align_corners=False  (match _get_interpolation_indices)
+            idx = (torch.arange(dim_size, dtype=torch.float32, device=device) + 0.5) * (
+                side / dim_size
+            ) - 0.5
+            return idx.clamp_(0, side - 1)
+
+    def fast_pos_embed_interpolate_from_list(self, grid_thw):
+        num_grid_per_side = self.num_grid_per_side
+        m_size = self.spatial_merge_size
+        hidden_dim = self.pos_embed.embedding_dim
+
+        outputs = []
+        for t, h, w in grid_thw:
+            h_idxs = torch.linspace(
+                0, num_grid_per_side - 1, h, dtype=torch.float32, device=self.device
+            )
+            w_idxs = torch.linspace(
+                0, num_grid_per_side - 1, w, dtype=torch.float32, device=self.device
+            )
+
+            h_floor = h_idxs.to(torch.long)
+            w_floor = w_idxs.to(torch.long)
+            h_ceil = torch.clamp(h_floor + 1, max=num_grid_per_side - 1)
+            w_ceil = torch.clamp(w_floor + 1, max=num_grid_per_side - 1)
+
+            dh = h_idxs - h_floor
+            dw = w_idxs - w_floor
+
+            # Create meshgrid view for all h, w vars
+            dh_grid, dw_grid = torch.meshgrid(dh, dw, indexing="ij")
+            h_floor_grid, w_floor_grid = torch.meshgrid(h_floor, w_floor, indexing="ij")
+            h_ceil_grid, w_ceil_grid = torch.meshgrid(h_ceil, w_ceil, indexing="ij")
+
+            # original computation of weights
+            # w00 = (1 - dh_grid) * (1 - dw_grid)
+            # w01 = (1 - dh_grid) * dw_grid
+            # w10 = dh_grid * (1 - dw_grid)
+            # w11 = dh_grid * dw_grid
+            # we reuse w11 here to avoid duplicate
+            # dh_grid * dw_grid computation
+            w11 = dh_grid * dw_grid
+            w10 = dh_grid - w11
+            w01 = dw_grid - w11
+            w00 = 1 - dh_grid - w01
+
+            h_grid = torch.stack([h_floor_grid, h_floor_grid, h_ceil_grid, h_ceil_grid])
+            w_grid = torch.stack([w_floor_grid, w_ceil_grid, w_floor_grid, w_ceil_grid])
+            h_grid_idx = h_grid * num_grid_per_side
+
+            indices = (h_grid_idx + w_grid).reshape(4, -1)
+            weights = torch.stack([w00, w01, w10, w11], dim=0).reshape(4, -1, 1)
+            weights = weights.to(dtype=self.dtype)
+
+            embeds = self.pos_embed(indices)
+            embeds *= weights
+            combined = embeds.sum(dim=0)
+
+            combined = combined.reshape(
+                h // m_size, m_size, w // m_size, m_size, hidden_dim
+            )
+            combined = combined.permute(0, 2, 1, 3, 4).reshape(1, -1, hidden_dim)
+            repeated = combined.expand(t, -1, -1).reshape(-1, hidden_dim)
+            outputs.append(repeated)
+
+        return torch.cat(outputs, dim=0)
+
+    def add_padding_to_fi_seqlens(
+        self, seq: np.ndarray, batch_size: int, padding_value: int
+    ) -> np.ndarray:
+        batch_size_padded = next(
+            (b for b in BATCH_BUCKETS if b >= batch_size),
+            # For large batches (> max bucket), round up to a multiple of
+            # the base bucket size to avoid negative pad length.
+            round_up(batch_size, BATCH_BUCKETS[0]),
+        )
+        if batch_size_padded == batch_size:
+            return seq
+        return np.concatenate(
+            [
+                seq,
+                np.full(
+                    (batch_size_padded - batch_size,), padding_value, dtype=seq.dtype
+                ),
+            ]
+        )
+
+    def bucket_flashinfer_max_seqlen(self, real_max_seqlen: int) -> int:
+        if real_max_seqlen <= 0:
+            return FLASHINFER_MAX_SEQLEN_BUCKETS[0]
+        return next(
+            (s for s in FLASHINFER_MAX_SEQLEN_BUCKETS if s >= real_max_seqlen),
+            # For large sequences (> max bucket), round up to a multiple of
+            # the largest bucket to avoid under-estimation.
+            round_up(real_max_seqlen, FLASHINFER_MAX_SEQLEN_BUCKETS[-1]),
+        )
+
+    def fast_pos_embed_interpolate(self, grid_thw):
+        """Interpolate position embeddings for (batch, 3) size input dimensions.
+
+        Performs bilinear interpolation on spatial dimensions (height, width) and replicates
+        along temporal dimension. The result is reorganized according to spatial_merge_size.
+
+        Args:
+            grid_thw: Tensor of shape [batch_size, 3] with (temporal, height, width) dimensions
+                     in patches for each sample.
+
+        Returns:
+            Interpolated position embeddings tensor.
+        """
+        grid_thw_cpu = grid_thw.cpu().numpy()
+
+        # transfer data to CPU before loop
+        temporal_dims = grid_thw_cpu[:, 0].tolist()
+        height_dims = grid_thw_cpu[:, 1].tolist()
+        width_dims = grid_thw_cpu[:, 2].tolist()
+
+        device = self.pos_embed.weight.device
+        dtype = self.pos_embed.weight.dtype
+
+        patches_size = [h * w for h, w in zip(height_dims, width_dims)]
+        total_patches = sum(patches_size)
+        all_indices_np = np.zeros((4, total_patches), dtype=np.int64)
+        all_weights_np = np.zeros((4, total_patches), dtype=np.float32)
+
+        current_idx = 0
+
+        # calculate indices and weights on CPU
+        for t, h, w in zip(temporal_dims, height_dims, width_dims):
+            h_idxs = self._get_interpolation_indices(h)
+            w_idxs = self._get_interpolation_indices(w)
+
+            indices, weights = self._calculate_indices_and_weights(h_idxs, w_idxs)
+
+            end_idx = current_idx + h * w
+            for i in range(4):
+                all_indices_np[i, current_idx:end_idx] = indices[i]
+                all_weights_np[i, current_idx:end_idx] = weights[i]
+            current_idx = end_idx
+
+        idx_tensor = torch.from_numpy(all_indices_np).to(device)
+        weight_tensor = torch.from_numpy(all_weights_np).to(dtype=dtype, device=device)
+
+        # calculate interpolation
+        pos_embeds = self.pos_embed(idx_tensor.view(-1))
+        pos_embeds = pos_embeds.view(4, total_patches, -1)
+        patch_pos_embeds = (pos_embeds * weight_tensor.unsqueeze(-1)).sum(dim=0)
+        patch_pos_embeds = patch_pos_embeds.split(patches_size)
+        return self._get_position_embedding(
+            patch_pos_embeds, temporal_dims, height_dims, width_dims
+        )
+
+    def compute_flashinfer_batch_offsets_packed(
+        self,
+        token_cu_seqlens: np.ndarray,
+        *,
+        elem_per_token: int,
+    ) -> np.ndarray:
+        """
+        Build packed *element* indptrs for FlashInfer cuDNN prefill.
+
+        Input:
+        token_cu_seqlens: (B+1,) token indptr
+        elem_per_token: per-token element width on THIS TP rank
+                        (usually hidden_size / attn_tp_size)
+
+        Output:
+        packed_offsets: (3 * (B_padded + 1),) int32
+            [qk_indptr, v_indptr, o_indptr] concatenated,
+            each indptr is (B_padded + 1,) in element units.
+        """
+        assert token_cu_seqlens.ndim == 1 and token_cu_seqlens.size >= 2
+        B = int(token_cu_seqlens.size - 1)
+        B_padded = self.bucket_flashinfer_batch_size(B)
+
+        # token indptr -> pad to (B_padded+1,) by appending total_tokens for extra empty sequences
+        token_indptr = token_cu_seqlens.astype(np.int64, copy=False)  # (B+1,)
+        if B_padded != B:
+            pad = np.full((B_padded - B,), token_indptr[-1], dtype=token_indptr.dtype)
+            token_indptr = np.concatenate([token_indptr, pad], axis=0)  # (B_padded+1,)
+
+        # convert token indptr -> element indptr
+        elem_indptr = (token_indptr * int(elem_per_token)).astype(
+            np.int32
+        )  # (B_padded+1,)
+
+        # q/k/v/o in this ViT path share the same indptr
+        return np.concatenate([elem_indptr, elem_indptr, elem_indptr], axis=0)
+
+    def bucket_flashinfer_batch_size(self, batch_size: int) -> int:
+        """Bucketize batch size for cuDNN graph caching."""
+        return next(
+            (b for b in BATCH_BUCKETS if b >= batch_size),
+            round_up(batch_size, BATCH_BUCKETS[0]),
+        )
+
+    def compute_flashinfer_sequence_lengths_padded(
+        self,
+        token_cu_seqlens: np.ndarray,
+    ) -> np.ndarray:
+        """
+        token_cu_seqlens: (B+1,) token indptr
+        return: (B_padded,) token lengths (padded with 0)
+        """
+        assert token_cu_seqlens.ndim == 1 and token_cu_seqlens.size >= 2
+        B = int(token_cu_seqlens.size - 1)
+
+        seq_lens = (token_cu_seqlens[1:] - token_cu_seqlens[:-1]).astype(
+            np.int32
+        )  # (B,)
+
+        B_padded = self.bucket_flashinfer_batch_size(B)
+        if B_padded != B:
+            pad = np.zeros((B_padded - B,), dtype=np.int32)
+            seq_lens = np.concatenate([seq_lens, pad], axis=0)  # (B_padded,)
+        return seq_lens
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        grid_thw: torch.Tensor,
+    ) -> torch.Tensor:
+        if envs.SGLANG_VIT_ENABLE_CUDA_GRAPH.get():
+            return self.forward_with_cuda_graph(x, grid_thw)
+
+        x = x.to(device=self.device, dtype=self.dtype)
+        x = self.patch_embed(x)
+
+        if isinstance(grid_thw, list):
+            grid_thw_list = grid_thw
+            grid_thw = np.array(grid_thw, dtype=np.int32)
+        else:
+            grid_thw_list = grid_thw.tolist()
+            grid_thw = grid_thw.cpu().numpy()
+
+        pos_embeds = self.fast_pos_embed_interpolate_from_list(grid_thw_list)
+        x += pos_embeds
+
+        rotary_pos_emb_cos, rotary_pos_emb_sin = self.rot_pos_emb(grid_thw_list)
+
+        # ---- build token indptr (B+1,) ----
+        token_cu_seqlens = np.repeat(
+            grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]
+        ).cumsum(axis=0, dtype=np.int32)
+        token_cu_seqlens = np.concatenate(
+            [np.zeros(1, dtype=np.int32), token_cu_seqlens]
+        )
+
+        flashinfer_max_seqlen = 0
+        cu_seqlens = None
+        if get_global_server_args().mm_attention_backend == "flashinfer_cudnn":
+            # real token lens (B,)
+            real_seq_lens = token_cu_seqlens[1:] - token_cu_seqlens[:-1]
+            flashinfer_max_seqlen = self.bucket_flashinfer_max_seqlen(
+                int(real_seq_lens.max()) if real_seq_lens.size > 0 else 0
+            )
+
+            # (B_padded,) token lengths
+            seq_lens_padded = self.compute_flashinfer_sequence_lengths_padded(
+                token_cu_seqlens
+            )
+
+            # element-per-token width on THIS ATTENTION TP rank
+            # q/k/v in VisionAttention are sharded by attention TP
+            attn_tp_size = 1 if self.use_data_parallel else self.tp_size
+            elem_per_token = (
+                self.hidden_size // attn_tp_size
+            )  # == heads_per_rank * head_dim
+
+            # (3*(B_padded+1),) packed element indptrs
+            offsets_packed = self.compute_flashinfer_batch_offsets_packed(
+                token_cu_seqlens,
+                elem_per_token=elem_per_token,
+            )
+
+            sequence_lengths = (
+                torch.from_numpy(seq_lens_padded)
+                .to(device=self.device, dtype=torch.int32, non_blocking=True)
+                .view(-1, 1, 1, 1)
+            )  # match cuDNN test style
+
+            cu_seqlens = torch.from_numpy(offsets_packed).to(
+                device=self.device, dtype=torch.int32, non_blocking=True
+            )
+
+            max_seqlen = int(flashinfer_max_seqlen)
+            sequence_lengths = sequence_lengths.to(self.device, non_blocking=True)
+        else:
+            sequence_lengths = None
+            cu_seqlens = torch.from_numpy(token_cu_seqlens)
+            if not is_npu():
+                cu_seqlens = cu_seqlens.to(self.device, non_blocking=True)
+            else:
+                cu_seqlens = cu_seqlens.to("cpu")
+            max_seqlen = None
+
+        x = x.unsqueeze(1)
+
+        cu_seqlens = cu_seqlens.to(self.device, non_blocking=True)
+
+        deepstack_feature_lists = []
+        num_deepstack_captured = 0
+
+        for layer_num, blk in enumerate(self.blocks):
+            x = blk(
+                x,
+                cu_seqlens=cu_seqlens,
+                rotary_pos_emb_cos=rotary_pos_emb_cos,
+                rotary_pos_emb_sin=rotary_pos_emb_sin,
+                max_seqlen=max_seqlen,
+                sequence_lengths=sequence_lengths,
+            )
+
+            if layer_num in self.deepstack_visual_indexes:
+                deepstack_feature = self.deepstack_merger_list[num_deepstack_captured](
+                    x
+                )
+                deepstack_feature_lists.append(deepstack_feature)
+                num_deepstack_captured += 1
+        x = self.merger(x)
+        hidden_states = torch.cat(
+            [x] + deepstack_feature_lists, dim=1
+        )  # [seq_len, hidden_size * (1 + depth_of_deepstack)]
+        return hidden_states
+
+    def forward_with_cuda_graph(
+        self,
+        x: torch.Tensor,
+        grid_thw: torch.Tensor,
+    ) -> torch.Tensor:
+        # patchify
+        x = x.to(device=self.device, dtype=self.dtype)
+        x = self.patch_embed(x)
+
+        if isinstance(grid_thw, list):
+            grid_thw_list = grid_thw
+            grid_thw = torch.tensor(grid_thw, dtype=torch.int32)
+        else:
+            grid_thw_list = grid_thw.tolist()
+
+        pos_embeds = self.fast_pos_embed_interpolate(grid_thw)
+        x += pos_embeds
+
+        # rotary embedding -> (cos, sin)
+        rotary_pos_emb_cos, rotary_pos_emb_sin = self.rot_pos_emb(grid_thw_list)
+
+        # compute cu_seqlens
+        cu_seqlens = compute_cu_seqlens_from_grid_numpy(grid_thw)
+        if not isinstance(cu_seqlens, torch.Tensor):
+            cu_seqlens = torch.tensor(cu_seqlens, device=x.device, dtype=torch.int32)
+        else:
+            cu_seqlens = cu_seqlens.to(device=x.device, dtype=torch.int32)
+        cu_seqlens = cu_seqlens.contiguous()
+
+        # blocks + merger + deepstack(optional) via CUDA Graph Runner
+        return self.cuda_graph_runner.run(
+            x=x,
+            position_embeddings=None,
+            rotary_pos_emb_cos=rotary_pos_emb_cos,
+            rotary_pos_emb_sin=rotary_pos_emb_sin,
+            cu_seqlens=cu_seqlens,
+            cu_window_seqlens=None,
+            output_indices=None,
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("attn.qkv.", "attn.q.", "q"),
+            ("attn.qkv.", "attn.k.", "k"),
+            ("attn.qkv.", "attn.v.", "v"),
+        ]
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        loaded_params: set[str] = set()
+
+        for name, loaded_weight in weights:
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
+
+cached_get_processor = lru_cache(get_processor)
+
+
+class Qwen3LLMModel(Qwen3Model):
+
+    def __init__(
+        self,
+        *,
+        config: Qwen3VLConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__(config=config, quant_config=quant_config, prefix=prefix)
+        if not self.pp_group.is_first_rank:
+            assert self.start_layer >= len(
+                config.vision_config.deepstack_visual_indexes
+            ), "start_layer should be greater than or equal to len(deepstack_visual_indexes)"
+
+        self.hidden_size = config.hidden_size
+        self.deepstack_embed_to_decoder_layer = range(
+            len(config.vision_config.deepstack_visual_indexes)
+        )
+
+    def get_deepstack_embeds(
+        self, layer_idx: int, input_deepstack_embeds: Optional[torch.Tensor]
+    ) -> Optional[torch.Tensor]:
+        """Get deepstack embeddings for a given layer index, or None if not applicable."""
+        if (
+            input_deepstack_embeds is None
+            or layer_idx not in self.deepstack_embed_to_decoder_layer
+        ):
+            return None
+        sep = self.hidden_size * layer_idx
+        return input_deepstack_embeds[:, sep : sep + self.hidden_size]
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+        input_deepstack_embeds: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, PPProxyTensors]:
+
+        if self.pp_group.is_first_rank:
+            if input_embeds is None:
+                hidden_states = self.embed_tokens(input_ids)
+            else:
+                hidden_states = input_embeds
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+
+        aux_hidden_states = []
+        for layer_idx, layer in enumerate(
+            self.layers[self.start_layer : self.end_layer]
+        ):
+            layer_idx = layer_idx + self.start_layer
+            if layer_idx in self.layers_to_capture:
+                aux_hidden_states.append(
+                    hidden_states + residual if residual is not None else hidden_states
+                )
+
+            # SGLang applies residual at the START of the next layer, not at the END like HuggingFace.
+            # See: https://github.com/huggingface/transformers/blob/v5.0.0rc0/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py#L549
+            # To match HF behavior, deepstack must be added AFTER residual: (hidden_states + residual) + deepstack
+            # The order matters because addition with different tensors is not associative in practice.
+            # Deepstack for prev_layer is applied at the start of current layer via post_residual_addition.
+            deepstack_embeds = self.get_deepstack_embeds(
+                layer_idx - 1, input_deepstack_embeds
+            )
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+                post_residual_addition=deepstack_embeds,
+            )
+
+        # Handle deepstack for the last processed layer if it exists.
+        last_deepstack = self.get_deepstack_embeds(
+            self.end_layer - 1, input_deepstack_embeds
+        )
+
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors(
+                {
+                    "hidden_states": hidden_states,
+                    "residual": residual,
+                }
+            )
+        else:
+            if hidden_states.shape[0] != 0:
+                if residual is None:
+                    hidden_states = self.norm(hidden_states)
+                else:
+                    hidden_states, _ = self.norm(
+                        hidden_states, residual, post_residual_addition=last_deepstack
+                    )
+
+        if len(aux_hidden_states) == 0:
+            return hidden_states
+
+        return hidden_states, aux_hidden_states
+
+
+class Qwen3VLForConditionalGeneration(nn.Module):
+    # To ensure correct weight loading and mapping.
+    hf_to_sglang_mapper = WeightsMapper(
+        orig_to_new_substr={
+            "attn.qkv": "attn.qkv_proj",
+        },
+        orig_to_new_prefix={
+            # mapping for new names in checkpoint saved after transformers v4.52
+            "model.language_model.": "language_model.model.",
+            "model.visual.": "visual.",
+            # mapping for original checkpoint
+            "lm_head.": "language_model.lm_head.",
+            "model.": "language_model.model.",
+        },
+    )
+
+    def __init__(
+        self,
+        config: Qwen3VLConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        language_model_cls=Qwen3LLMModel,
+    ) -> None:
+        super().__init__()
+        self.pp_group = get_pp_group()
+        self.quant_config = quant_config
+
+        self.use_data_parallel = get_global_server_args().mm_enable_dp_encoder
+
+        self.visual = Qwen3VLMoeVisionModel(
+            config.vision_config,
+            # NOTE: Qwen3-VL vision encoder currently supports BitsAndBytes 4-bit quantization.
+            # Other quantization methods (e.g., GPTQ, AWQ) are untested and may not be supported.
+            quant_config=None,
+            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+            prefix=add_prefix("model.visual", prefix),
+            use_data_parallel=self.use_data_parallel,
+        )
+
+        # TODO: make it more elegant
+        if language_model_cls is Qwen3LLMModel:
+            self.config: Qwen3VLConfig = config  # for qwen3-vl
+        else:
+            self.config = config.text_config  # for qwen3-omni
+            self.config.encoder_only = getattr(config, "encoder_only", False)
+            self.config.language_only = getattr(config, "language_only", False)
+
+        if not hasattr(config, "encoder_only") or not config.encoder_only:
+            self.model = language_model_cls(
+                config=self.config,
+                quant_config=quant_config,
+                prefix=add_prefix("model.language_model", prefix),
+            )
+            if self.pp_group.is_last_rank:
+                if self.pp_group.world_size == 1 and self.config.tie_word_embeddings:
+                    self.lm_head = self.model.embed_tokens
+                else:
+                    self.lm_head = ParallelLMHead(
+                        self.config.vocab_size,
+                        self.config.hidden_size,
+                        quant_config=quant_config,
+                        use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
+                        prefix=add_prefix("lm_head", prefix),
+                    )
+            else:
+                self.lm_head = PPMissingLayer()
+        else:
+            # encoder_only mode: no language model, so no lm_head needed
+            self.lm_head = None
+
+        self.is_mrope_enabled = "mrope_section" in self.config.rope_scaling
+
+        self.logits_processor = LogitsProcessor(self.config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+        # like {8:0, 16:1, 24:2}, which stands for the captured deepstack features on
+        # 8, 16, 24 layer will be merged to 0, 1, 2 layer of decoder output hidden_states
+
+        # deepstack
+        self.deepstack_visual_indexes = config.vision_config.deepstack_visual_indexes
+        self.num_deepstack_embeddings = len(self.deepstack_visual_indexes)
+        self.use_deepstack = {Modality.IMAGE: True, Modality.VIDEO: True}
+
+    def separate_deepstack_embeds(self, embedding):
+        assert (
+            embedding.shape[-1] % (1 + self.num_deepstack_embeddings) == 0
+        ), f"hidden_state of {embedding.shape} should be divisible by ({1 + self.num_deepstack_embeddings})"
+
+        separate_index = self.config.hidden_size
+        input_embeds = embedding[:, :separate_index]
+        input_deepstack_embeds = embedding[:, separate_index:]
+        return input_embeds, input_deepstack_embeds
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+        return pattern.pad_input_tokens(input_ids, mm_inputs)
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        # in qwen-vl, last dim is the same
+        pixel_values = torch.cat([item.feature for item in items], dim=0).type(
+            self.visual.dtype
+        )
+        image_grid_thw = torch.concat([item.image_grid_thw for item in items], dim=0)
+        assert pixel_values.dim() == 2, pixel_values.dim()
+        assert image_grid_thw.dim() == 2, image_grid_thw.dim()
+
+        max_patches_per_call = get_int_env_var("SGLANG_VLM_MAX_PATCHES_PER_VIT", 0)
+        max_images_per_call = get_int_env_var("SGLANG_VLM_MAX_IMAGES_PER_VIT", 0)
+
+        if max_patches_per_call == 0 and max_images_per_call == 0:
+            if self.use_data_parallel:
+                return run_dp_sharded_mrope_vision_model(
+                    self.visual,
+                    pixel_values,
+                    image_grid_thw.tolist(),
+                    rope_type="rope_3d",
+                )
+            else:
+                return self.visual(pixel_values, grid_thw=image_grid_thw)
+
+        # compute the number of patches per image and the slice positions in pixel_values
+        grid_thw_list = (
+            image_grid_thw.tolist()
+        )  # List[List[int]], each is [T, H, W] or similar
+        patches_per_image = [int(math.prod(g)) for g in grid_thw_list]
+        num_images = len(patches_per_image)
+
+        # cumulative sum used to slice pixel_values along the image dimension
+        cum_patches = [0]
+        for p in patches_per_image:
+            cum_patches.append(cum_patches[-1] + p)
+        total_patches = cum_patches[-1]
+
+        assert pixel_values.size(0) == total_patches, (
+            f"pixel_values rows ({pixel_values.size(0)}) "
+            f"!= total patches ({total_patches})"
+        )
+
+        # split into chunks in image order, each chunk obeys the patch/image limits
+        all_chunk_embeds: List[torch.Tensor] = []
+        img_start = 0
+
+        while img_start < num_images:
+            img_end = img_start
+            patches_in_chunk = 0
+            images_in_chunk = 0
+
+            # try to pack more images into the current chunk until some limit would be exceeded
+            while img_end < num_images:
+                next_patches = patches_per_image[img_end]
+
+                # if adding this image would exceed the patch limit, stop
+                if (
+                    max_patches_per_call > 0
+                    and patches_in_chunk + next_patches > max_patches_per_call
+                ):
+                    break
+
+                # if adding this image would exceed the image-count limit, also stop
+                if (
+                    max_images_per_call > 0
+                    and images_in_chunk + 1 > max_images_per_call
+                ):
+                    break
+
+                patches_in_chunk += next_patches
+                images_in_chunk += 1
+                img_end += 1
+
+            # extreme case: the first image alone exceeds the patch limit -> at least ensure img_end > img_start
+            if img_end == img_start:
+                img_end = img_start + 1
+                patches_in_chunk = patches_per_image[img_start]
+                images_in_chunk = 1
+
+            # slice pixel_values and grid_thw according to [img_start:img_end]
+            patch_start = cum_patches[img_start]
+            patch_end = cum_patches[img_end]
+            pixel_chunk = pixel_values[patch_start:patch_end]
+            grid_chunk = image_grid_thw[img_start:img_end]
+
+            # run ViT once on this chunk without extra padding
+            if self.use_data_parallel:
+                chunk_embeds = run_dp_sharded_mrope_vision_model(
+                    self.visual,
+                    pixel_chunk,
+                    grid_chunk.tolist(),
+                    rope_type="rope_3d",
+                )
+            else:
+                chunk_embeds = self.visual(pixel_chunk, grid_thw=grid_chunk)
+
+            # chunk_embeds: (sum_patches_after_merge_this_chunk, hidden)
+            all_chunk_embeds.append(chunk_embeds)
+
+            # next batch
+            img_start = img_end
+
+        # concatenate back the full image embedding sequence
+        return torch.cat(all_chunk_embeds, dim=0)
+
+    def get_video_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        for item in items:
+            item.feature = item.feature.to(self.visual.device)
+        # in qwen-vl, last dim is the same
+        pixel_values = torch.cat([item.feature for item in items], dim=0).type(
+            self.visual.dtype
+        )
+        # Memory optimization for item.feature:
+        # 1. item.feature is released when request finished
+        # 2. High concurrency may cause device OOM due to delayed release
+        # 3. Fix: Offload item.feature to CPU, move to device only when needed
+        for item in items:
+            item.feature = item.feature.to("cpu")
+        video_grid_thw = torch.concat([item.video_grid_thw for item in items], dim=0)
+        assert pixel_values.dim() == 2, pixel_values.dim()
+        assert video_grid_thw.dim() == 2, video_grid_thw.dim()
+        if self.use_data_parallel:
+            return run_dp_sharded_mrope_vision_model(
+                self.visual, pixel_values, video_grid_thw.tolist(), rope_type="rope_3d"
+            )
+        else:
+            video_embeds = self.visual(pixel_values, grid_thw=video_grid_thw)
+        return video_embeds
+
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+
+    _lora_pattern = re.compile(
+        r"^model\.layers\.(\d+)\.(?:self_attn|mlp)\.(?:qkv_proj|o_proj|down_proj|gate_up_proj)$"
+    )
+
+    def should_apply_lora(self, module_name: str) -> bool:
+        return bool(self._lora_pattern.match(module_name))
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        get_embedding: bool = False,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ):
+        """Run forward pass for Qwen3-VL.
+
+        Args:
+            input_ids: Flattened (concatenated) input_ids corresponding to a
+                batch.
+            positions: Flattened (concatenated) position ids corresponding to a
+                batch.
+                **NOTE**: If mrope is enabled (default setting for Qwen2-VL
+                opensource models), the shape will be `(3, seq_len)`,
+                otherwise it will be `(seq_len,).
+                (Use input_metadata.mrope_positions to replace it)
+        """
+        if self.is_mrope_enabled:
+            positions = forward_batch.mrope_positions
+
+        if not (
+            forward_batch.forward_mode.is_decode()
+            or not forward_batch.contains_image_inputs()
+        ):
+            if self.is_mrope_enabled:
+                assert positions.ndim == 2 and positions.size(0) == 3, (
+                    "multimodal section rotary embedding requires "
+                    f"(3, seq_len) positions, but got {positions.size()}"
+                )
+
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.model,
+            multimodal_model=self,
+            positions=positions,
+            use_deepstack=self.use_deepstack,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+
+        if self.pp_group.is_last_rank:
+            if not get_embedding:
+                return self.logits_processor(
+                    input_ids,
+                    hidden_states,
+                    self.lm_head,
+                    forward_batch,
+                )
+            else:
+                return self.pooler(hidden_states, forward_batch)
+        else:
+            return hidden_states
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            ("gate_up_proj", "up_proj", 1),
+            ("gate_up_proj", "gate_proj", 0),
+        ]
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if "language_model" in name:
+                name = name.replace(r"model.language_model.", r"model.")
+            layer_id = get_layer_id(name)
+
+            # Only copy embed_tokens to lm_head when tie_word_embeddings=True
+            # For models with tie_word_embeddings=False (e.g. 8B), lm_head has independent weights
+            if (
+                self.pp_group.is_last_rank
+                and "model.embed_tokens.weight" in name
+                and self.config.tie_word_embeddings
+            ):
+                if "lm_head.weight" in params_dict:
+                    lm_head_param = params_dict["lm_head.weight"]
+                    weight_loader = getattr(
+                        lm_head_param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(lm_head_param, loaded_weight)
+
+            is_visual = "visual" in name
+            if (
+                not is_visual
+                and layer_id is not None
+                and hasattr(self, "model")
+                and hasattr(self.model, "start_layer")
+                and (
+                    layer_id < self.model.start_layer
+                    or layer_id >= self.model.end_layer
+                )
+            ):
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                if "visual" in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Skip loading visual/language model weights
+                if (
+                    self.config.encoder_only or self.config.language_only
+                ) and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if "visual" in name:
+                    # adapt to VisionAttention
+                    name = name.replace(r"attn.qkv.", r"attn.qkv_proj.")
+                    name = name.replace(r"model.visual.", r"visual.")
+
+                try:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    if name in params_dict.keys():
+                        param = params_dict[name]
+                    else:
+                        continue
+
+                except KeyError:
+                    print(params_dict.keys())
+                    raise
+
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = Qwen3VLForConditionalGeneration
diff --git a/sglang/python/sglang/srt/models/qwen3_vl_moe.py b/sglang/python/sglang/srt/models/qwen3_vl_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..f98460665d37fd36817214ef33ce87acecedda5c
--- /dev/null
+++ b/sglang/python/sglang/srt/models/qwen3_vl_moe.py
@@ -0,0 +1,377 @@
+# Copyright 2025 Qwen Team
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Inference-only Qwen3-VL model compatible with HuggingFace weights."""
+
+import logging
+import re
+from functools import lru_cache
+from typing import Iterable, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from sglang.srt.configs.qwen3_vl import Qwen3VLMoeConfig, Qwen3VLMoeTextConfig
+from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation
+from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.qwen3_moe import Qwen3MoeDecoderLayer, Qwen3MoeModel
+from sglang.srt.models.qwen3_vl import Qwen3VLForConditionalGeneration
+from sglang.srt.utils.hf_transformers_utils import get_processor
+
+logger = logging.getLogger(__name__)
+
+cached_get_processor = lru_cache(get_processor)
+
+
+class Qwen3MoeLLMModel(Qwen3MoeModel):
+    def __init__(
+        self,
+        *,
+        config: Qwen3VLMoeTextConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        decoder_layer_type=Qwen3MoeDecoderLayer,
+    ):
+        super().__init__(
+            config=config,
+            quant_config=quant_config,
+            prefix=prefix,
+            decoder_layer_type=decoder_layer_type,
+        )
+        self.hidden_size = config.hidden_size
+        # Currently, we use 3 as len(config.vision_config.deepstack_visual_indexes) is not directly accessible here.
+        # This approach follows the original implementation.
+        # TODO: make config of type Qwen3VLMoeConfig, so that we can directly obtain deepstack_visual_indexes.
+        self.deepstack_embed_to_decoder_layer = range(3)
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.embed_tokens
+
+    def get_deepstack_embeds(
+        self, layer_idx: int, input_deepstack_embeds: Optional[torch.Tensor]
+    ) -> Optional[torch.Tensor]:
+        """Get deepstack embeddings for a given layer index, or None if not applicable."""
+        if (
+            input_deepstack_embeds is None
+            or layer_idx not in self.deepstack_embed_to_decoder_layer
+        ):
+            return None
+        sep = self.hidden_size * layer_idx
+        return input_deepstack_embeds[:, sep : sep + self.hidden_size]
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+        input_deepstack_embeds: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, PPProxyTensors]:
+        if self.pp_group.is_first_rank:
+            if input_embeds is None:
+                hidden_states = self.embed_tokens(input_ids)
+            else:
+                hidden_states = input_embeds
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+
+        aux_hidden_states = []
+        for layer_idx, layer in enumerate(
+            self.layers[self.start_layer : self.end_layer]
+        ):
+            layer_idx += self.start_layer
+            if layer_idx in self.layers_to_capture:
+                aux_hidden_states.append(
+                    hidden_states + residual if residual is not None else hidden_states
+                )
+
+            # SGLang applies residual at the START of the next layer, not at the END like HuggingFace.
+            # See: https://github.com/huggingface/transformers/blob/v5.0.0rc0/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py#L549
+            # To match HF behavior, deepstack must be added AFTER residual: (hidden_states + residual) + deepstack
+            # The order matters because addition with different tensors is not associative in practice.
+            # Deepstack for prev_layer is applied at the start of current layer via post_residual_addition.
+            deepstack_embeds = self.get_deepstack_embeds(
+                layer_idx - 1, input_deepstack_embeds
+            )
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+                post_residual_addition=deepstack_embeds,
+            )
+
+        # Handle deepstack for the last processed layer if it exists.
+        last_deepstack = self.get_deepstack_embeds(
+            self.end_layer - 1, input_deepstack_embeds
+        )
+
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors(
+                {
+                    "hidden_states": hidden_states,
+                    "residual": residual,
+                }
+            )
+        else:
+            if hidden_states.shape[0] != 0:
+                if residual is None:
+                    hidden_states = self.norm(hidden_states)
+                else:
+                    hidden_states, _ = self.norm(
+                        hidden_states, residual, post_residual_addition=last_deepstack
+                    )
+
+        if len(aux_hidden_states) == 0:
+            return hidden_states
+
+        return hidden_states, aux_hidden_states
+
+
+def load_fused_expert_weights(
+    name: str,
+    params_dict: dict,
+    loaded_weight: torch.Tensor,
+    shard_id: str,
+    num_experts: int,
+):
+    param = params_dict[name]
+    # weight_loader = typing.cast(Callable[..., bool], param.weight_loader)
+    weight_loader = param.weight_loader
+    # let ep moe layer to gracefully handle expert_ids that do not belong to local moe rank
+    for expert_id in range(num_experts):
+        curr_expert_weight = loaded_weight[expert_id]
+        weight_loader(
+            param,
+            curr_expert_weight,
+            name,
+            shard_id,
+            expert_id,
+        )
+    return True
+
+
+class Qwen3VLMoeForConditionalGeneration(Qwen3VLForConditionalGeneration):
+    def __init__(
+        self,
+        config: Qwen3VLMoeConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        language_model_cls=Qwen3MoeLLMModel,
+    ):
+        super().__init__(config, quant_config, prefix, language_model_cls)
+
+    # Only allow LoRA on attention projections within text layers for MoE.
+    _lora_pattern_moe = re.compile(
+        r"^model\.layers\.(\d+)\.self_attn\.(?:qkv_proj|o_proj)$"
+    )
+
+    def should_apply_lora(self, module_name: str) -> bool:
+        return bool(self._lora_pattern_moe.match(module_name))
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            ("gate_up_proj", "up_proj", 1),
+            ("gate_up_proj", "gate_proj", 0),
+        ]
+
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.num_experts,
+        )
+
+        # Skip loading extra parameters for GPTQ/modelopt models.
+        ignore_suffixes = (
+            ".bias",
+            "_bias",
+            ".k_scale",
+            "_k_scale",
+            ".v_scale",
+            "_v_scale",
+            ".weight_scale",
+            "_weight_scale",
+            ".input_scale",
+            "_input_scale",
+        )
+
+        is_fused_expert = False
+        fused_expert_params_mapping = [
+            ("experts.w13_weight", "experts.gate_up_proj", 0, "w1"),
+            ("experts.w2_weight", "experts.down_proj", 0, "w2"),
+        ]
+
+        num_experts = self.config.num_experts
+
+        # Cache params_dict to avoid repeated expensive traversal of model parameters
+        if not hasattr(self, "_cached_params_dict"):
+            self._cached_params_dict = dict(self.named_parameters())
+        params_dict = self._cached_params_dict
+        for name, loaded_weight in weights:
+            name = name.replace(r"model.language_model.", r"model.")
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if "experts.gate_up_proj" in name or "experts.down_proj" in name:
+                    is_fused_expert = True
+                    expert_params_mapping = fused_expert_params_mapping
+
+                # Skip non-stacked layers and experts (experts handled below).
+                if weight_name not in name:
+                    continue
+                if "visual" in name:
+                    continue
+
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if "mlp.experts" in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra parameters for GPTQ/modelopt models.
+                if name.endswith(ignore_suffixes) and name not in params_dict:
+                    continue
+                # [TODO] Skip layers that are on other devices (check if sglang has a similar function)
+                # if is_pp_missing_parameter(name, self):
+                #     continue
+
+                if name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Track if this is an expert weight to enable early skipping
+                is_expert_weight = False
+
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    if "visual" in name or self.config.encoder_only:
+                        continue
+                    # Anyway, this is an expert weight and should not be
+                    # attempted to load as other weights later
+                    is_expert_weight = True
+                    name_mapped = name.replace(weight_name, param_name)
+                    if is_fused_expert:
+                        loaded_weight = loaded_weight.transpose(-1, -2)  # no bias
+                        if "experts.gate_up_proj" in name:
+                            loaded_weight = loaded_weight.chunk(2, dim=-2)
+                            load_fused_expert_weights(
+                                name_mapped,
+                                params_dict,
+                                loaded_weight[0],
+                                "w1",
+                                num_experts,
+                            )
+                            load_fused_expert_weights(
+                                name_mapped,
+                                params_dict,
+                                loaded_weight[1],
+                                "w3",
+                                num_experts,
+                            )
+                        else:
+                            load_fused_expert_weights(
+                                name_mapped,
+                                params_dict,
+                                loaded_weight,
+                                shard_id,
+                                num_experts,
+                            )
+                    else:
+                        # Skip loading extra parameters for GPTQ/modelopt models.
+                        if (
+                            name_mapped.endswith(ignore_suffixes)
+                            and name_mapped not in params_dict
+                        ):
+                            continue
+                        param = params_dict[name_mapped]
+                        # We should ask the weight loader to return success or
+                        # not here since otherwise we may skip experts with
+                        # # other available replicas.
+                        weight_loader = param.weight_loader
+                        weight_loader(
+                            param,
+                            loaded_weight,
+                            name_mapped,
+                            shard_id=shard_id,
+                            expert_id=expert_id,
+                        )
+                    name = name_mapped
+                    break
+                else:
+                    if is_expert_weight:
+                        # This is an expert weight but not mapped to this rank, skip all remaining processing
+                        continue
+                    if "visual" in name:
+                        # adapt to VisionAttention
+                        name = name.replace(r"attn.qkv.", r"attn.qkv_proj.")
+                        name = name.replace(r"model.visual.", r"visual.")
+
+                    # Skip loading extra parameters for GPTQ/modelopt models.
+                    if name.endswith(ignore_suffixes) and name not in params_dict:
+                        continue
+
+                    # Skip loading mm/language parameters
+                    if (
+                        self.config.encoder_only or self.config.language_only
+                    ) and name not in params_dict:
+                        continue
+
+                    if name in params_dict.keys():
+                        param = params_dict[name]
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        weight_loader(param, loaded_weight)
+                    else:
+                        logger.warning(f"Parameter {name} not found in params_dict")
+
+        # TODO mimic deepseek
+        # Lazy initialization of expert weights cache to avoid slowing down load_weights
+        # if not hasattr(self, "routed_experts_weights_of_layer"):
+        #     self.routed_experts_weights_of_layer = {
+        #         layer_id: self.model.layers[layer_id].mlp.get_moe_weights()
+        #         for layer_id in range(self.start_layer, self.end_layer)
+        #         if isinstance(self.model.layers[layer_id].mlp, Qwen3MoeSparseMoeBlock)
+        #     }
+
+    @classmethod
+    def get_model_config_for_expert_location(cls, config):
+        return ModelConfigForExpertLocation(
+            num_layers=config.text_config.num_hidden_layers,
+            num_logical_experts=config.text_config.num_experts,
+            num_groups=None,
+        )
+
+
+EntryClass = Qwen3VLMoeForConditionalGeneration
diff --git a/sglang/python/sglang/srt/models/radio.py b/sglang/python/sglang/srt/models/radio.py
new file mode 100644
index 0000000000000000000000000000000000000000..2cd233141c15ca6488c57769448f235b3b12732f
--- /dev/null
+++ b/sglang/python/sglang/srt/models/radio.py
@@ -0,0 +1,532 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/radio.py
+
+import math
+from collections.abc import Iterable
+from itertools import repeat
+from typing import TypeAlias
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+from transformers import PretrainedConfig
+from transformers.modeling_outputs import BaseModelOutput
+
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    replace_prefix,
+    replace_substrings,
+)
+from sglang.srt.models.internvl import InternVisionEncoder
+
+input_dim_t: TypeAlias = int | tuple[int, int]
+norm_t: TypeAlias = tuple[float, float, float] | torch.Tensor
+
+
+def _ntuple(n):
+    def parse(x):
+        if isinstance(x, Iterable) and not isinstance(x, str):
+            return tuple(x)
+        return tuple(repeat(x, n))
+
+    return parse
+
+
+to_1tuple = _ntuple(1)
+to_2tuple = _ntuple(2)
+to_3tuple = _ntuple(3)
+to_4tuple = _ntuple(4)
+to_ntuple = _ntuple
+
+
+class ClsToken(nn.Module):
+    def __init__(
+        self,
+        ndim: int,
+        num_tokens: int = 1,
+        enabled: bool = True,
+        register_multiple: int | None = None,
+        num_registers: int | None = None,
+    ):
+        super().__init__()
+
+        self.ndim = ndim
+        self.enabled = enabled
+        self.num_registers = 0
+        self.num_tokens = num_tokens
+        if enabled:
+            if num_registers:
+                self.num_registers = num_registers
+            elif register_multiple:
+                self.num_registers = register_multiple - (
+                    num_tokens % register_multiple
+                )
+
+            scale = ndim**-0.5
+            self.token = nn.Parameter(
+                torch.randn(num_tokens + self.num_registers, ndim) * scale
+            )
+
+        else:
+            self.token = None
+
+        self.num_patches = self.num_tokens + self.num_registers
+
+    def forward(self, x: torch.Tensor):
+        if self.token is None:
+            return x
+
+        token = self.token.unsqueeze(0).expand(x.shape[0], -1, -1)
+        x = torch.cat(
+            [
+                token,
+                x,
+            ],
+            dim=1,
+        )
+
+        return x
+
+
+class ViTPatchGenerator(nn.Module):
+    def __init__(
+        self,
+        #  config: PretrainedConfig,
+        patch_size: int,
+        embed_dim: int,
+        input_dims: input_dim_t,
+        abs_pos: bool = True,
+        normalize_patches: bool = False,
+        cls_token: bool = False,
+        max_input_dims: input_dim_t | None = None,
+        pos_dropout: float = 0.0,
+        return_pos_enc: bool = False,
+        num_cls_tokens: int = 1,
+        register_multiple: int | None = None,
+        num_registers: int | None = None,
+        patch_bias: bool = False,
+        device=None,
+        dtype=None,
+    ):
+        super().__init__()
+        if isinstance(input_dims, int):
+            input_dims = (input_dims, input_dims)
+
+        if max_input_dims is None:
+            max_input_dims = input_dims
+        if isinstance(max_input_dims, int):
+            max_input_dims = (max_input_dims, max_input_dims)
+
+        max_input_dims = tuple(
+            int(math.ceil(d / patch_size) * patch_size) for d in max_input_dims
+        )
+
+        self.cpe_mode = max_input_dims != input_dims
+        self.pos_dropout = pos_dropout
+        self.return_pos_enc = return_pos_enc
+
+        factory = dict(device=device, dtype=dtype)
+
+        self.patch_size = patch_size
+        self.abs_pos = abs_pos
+        self.embed_dim = embed_dim
+
+        self.num_rows = max_input_dims[0] // patch_size
+        self.num_cols = max_input_dims[1] // patch_size
+        self.input_dims = tuple(d // patch_size for d in input_dims)
+        self.num_patches = self.num_rows * self.num_cols
+        self.max_input_dims = max_input_dims
+
+        self.im_to_patches = Im2Patches(patch_size)
+        self.embedder = ViTPatchLinear(
+            patch_size, embed_dim, bias=patch_bias, **factory
+        )
+
+        if abs_pos:
+            scale = embed_dim**-0.5
+            self.pos_embed = nn.Parameter(
+                torch.randn(1, self.num_patches, embed_dim, **factory) * scale
+            )
+
+        self.cls_token = ClsToken(
+            embed_dim,
+            num_tokens=num_cls_tokens,
+            enabled=cls_token,
+            register_multiple=register_multiple,
+            num_registers=num_registers,
+        )
+
+        self.patch_normalizer = (
+            nn.LayerNorm(embed_dim) if normalize_patches else nn.Identity()
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        patches = self.embed_patches(x)
+        patches, pos_enc = self.apply_pos_enc(patches, input_size=x.shape[2:])
+        patches = self.cls_token(patches)
+        patches = self.patch_normalizer(patches)
+        if self.return_pos_enc:
+            return patches, pos_enc
+        return patches
+
+    @property
+    def apply_cls_token(self):
+        return self.cls_token.enabled
+
+    @property
+    def num_cls_tokens(self):
+        return self.cls_token.num_tokens
+
+    @property
+    def num_cls_patches(self):
+        return self.cls_token.num_patches
+
+    @property
+    def num_registers(self):
+        return self.cls_token.num_registers
+
+    @property
+    def num_skip(self):
+        return self.num_cls_tokens + self.num_registers
+
+    def _load_embed(self, src_embed: torch.Tensor, targ_embed: nn.Parameter):
+        if src_embed.shape != targ_embed.shape:
+            src_size = int(math.sqrt(src_embed.shape[1]))
+
+            assert (
+                src_size**2 == src_embed.shape[1]
+            ), "Unable to interpolate non-square embedding"
+
+            src_embed = rearrange(
+                src_embed, "b (h w) c -> b c h w", h=src_size, w=src_size
+            )
+            src_embed = F.interpolate(
+                src_embed,
+                size=(self.num_rows, self.num_cols),
+                mode="bicubic",
+                align_corners=True,
+                antialias=False,
+            )
+            src_embed = rearrange(src_embed, "b c h w -> b (h w) c")
+        targ_embed.data.copy_(src_embed)
+
+    def _load_projection(
+        self, src_proj_weight: torch.Tensor, targ_proj_weight: torch.Tensor
+    ):
+        if src_proj_weight.shape != targ_proj_weight.shape:
+            src_patch_size = int(math.sqrt(src_proj_weight.shape[1] // 3))
+
+            assert (src_patch_size**2) * 3 == src_proj_weight.shape[
+                1
+            ], "Unable to interpolate non-square patch size"
+
+            src_proj_weight = rearrange(
+                src_proj_weight,
+                "b (c h w) -> b c h w",
+                c=3,
+                h=src_patch_size,
+                w=src_patch_size,
+            )
+            src_proj_weight = F.interpolate(
+                src_proj_weight,
+                size=(self.patch_size, self.patch_size),
+                mode="bicubic",
+                align_corners=True,
+                antialias=False,
+            )
+            src_proj_weight = rearrange(src_proj_weight, "b c h w -> b (c h w)")
+        targ_proj_weight.data.copy_(src_proj_weight)
+
+    def embed_patches(self, x: torch.Tensor) -> torch.Tensor:
+        patches = self.im_to_patches(x)
+        patches = self.embedder(patches)
+        return patches
+
+    def apply_pos_enc(
+        self,
+        patches: torch.Tensor,
+        patch_idxs: torch.Tensor | None = None,
+        input_size: tuple[int, int] | None = None,
+    ) -> torch.Tensor:
+        if not self.abs_pos:
+            return patches
+
+        pos_enc = self.get_pos_enc(patches.shape[0], patch_idxs, input_size)
+
+        if self.training and self.pos_dropout > 0:
+            keeps = (
+                torch.rand(
+                    patches.shape[0], 1, 1, dtype=pos_enc.dtype, device=pos_enc.device
+                )
+                > self.pos_dropout
+            )
+            pos_enc_drop = torch.where(keeps, pos_enc, 0)
+        else:
+            pos_enc_drop = pos_enc
+
+        return patches + pos_enc_drop, pos_enc
+
+    def get_pos_enc(
+        self,
+        batch_size: int,
+        patch_idxs: torch.Tensor | None = None,
+        input_size: tuple[int, int] | None = None,
+    ) -> torch.Tensor:
+        if input_size is None:
+            input_dims = self.input_dims
+        else:
+            input_dims = tuple(d // self.patch_size for d in input_size)
+
+        pos_embed = self._get_pos_embeddings(batch_size, input_dims)
+
+        if patch_idxs is None:
+            return pos_embed
+
+        exp_patch_idxs = patch_idxs.unsqueeze(-1).expand(-1, -1, pos_embed.shape[-1])
+
+        pos_embed = torch.gather(
+            pos_embed.expand(patch_idxs.shape[0], -1, -1), dim=1, index=exp_patch_idxs
+        )
+        return pos_embed
+
+    def _get_pos_embeddings(self, batch_size: int, input_dims: tuple[int, int]):
+        if (self.num_rows, self.num_cols) == input_dims:
+            return self.pos_embed
+
+        pos_embed = self.pos_embed.reshape(1, self.num_rows, self.num_cols, -1).permute(
+            0, 3, 1, 2
+        )
+
+        def window_select(pos_embed):
+            if input_dims[0] < pos_embed.shape[-2]:
+                pos_embed = pos_embed[..., : input_dims[0], :]
+            if input_dims[1] < pos_embed.shape[-1]:
+                pos_embed = pos_embed[..., :, : input_dims[1]]
+            return pos_embed
+
+        if self.cpe_mode:
+            if self.training:
+                min_scale = math.sqrt(0.1)
+                scale = (
+                    torch.rand(batch_size, 1, 1, device=pos_embed.device)
+                    * (1 - min_scale)
+                    + min_scale
+                )
+                aspect_min = math.log(3 / 4)
+                aspect_max = -aspect_min
+                aspect = torch.exp(
+                    torch.rand(batch_size, 1, 1, device=pos_embed.device)
+                    * (aspect_max - aspect_min)
+                    + aspect_min
+                )
+
+                scale_x = scale * aspect
+                scale_y = scale * (1 / aspect)
+                scale_xy = torch.stack([scale_x, scale_y], dim=-1).clamp_(0, 1)
+
+                pos_xy = torch.rand(batch_size, 1, 1, 2, device=pos_embed.device) * (
+                    1 - scale_xy
+                )
+
+                lin_x = torch.linspace(
+                    0, 1, steps=input_dims[1], device=pos_embed.device
+                )[None, None].expand(batch_size, input_dims[0], -1)
+                lin_y = torch.linspace(
+                    0, 1, steps=input_dims[0], device=pos_embed.device
+                )[None, :, None].expand(batch_size, -1, input_dims[1])
+
+                lin_xy = torch.stack([lin_x, lin_y], dim=-1)
+
+                grid_xy = lin_xy * scale_xy + pos_xy
+
+                # Convert to [-1, 1] range
+                grid_xy.mul_(2).sub_(1)
+
+                pos_embed = F.grid_sample(
+                    pos_embed.float().expand(batch_size, -1, -1, -1),
+                    grid=grid_xy,
+                    mode="bilinear",
+                    padding_mode="zeros",
+                    align_corners=True,
+                ).to(pos_embed.dtype)
+            else:
+                max_dim = max(input_dims)
+                pos_embed = F.interpolate(
+                    pos_embed.float(),
+                    size=(max_dim, max_dim),
+                    align_corners=True,
+                    mode="bilinear",
+                ).to(pos_embed.dtype)
+
+                pos_embed = window_select(pos_embed)
+        else:
+            pos_embed = window_select(pos_embed)
+
+        if pos_embed.shape[-2:] != input_dims:
+            pos_embed = F.interpolate(
+                pos_embed.float(), size=input_dims, align_corners=True, mode="bilinear"
+            ).to(pos_embed.dtype)
+
+        pos_embed = pos_embed.flatten(2).permute(0, 2, 1)
+
+        return pos_embed
+
+
+class Im2Patches(nn.Module):
+    def __init__(self, patch_size: int):
+        super().__init__()
+        self.patch_size = patch_size
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if self.patch_size == 1:
+            patches = x.flatten(2)
+            patches = patches.permute(0, 2, 1)
+            return patches
+
+        py = x.shape[-2] // self.patch_size
+        px = x.shape[-1] // self.patch_size
+        patches = rearrange(
+            x,
+            "b c (py yy) (px xx) -> b (py px) (c yy xx)",
+            py=py,
+            yy=self.patch_size,
+            px=px,
+            xx=self.patch_size,
+        )
+        return patches
+
+
+class ViTPatchLinear(nn.Linear):
+    def __init__(self, patch_size: int, embed_dim: int, bias: bool = False, **factory):
+        super().__init__(3 * (patch_size**2), embed_dim, bias=bias, **factory)
+        self.patch_size = patch_size
+
+
+class RadioInternVisionModel(nn.Module):
+    packed_modules_mapping = {
+        "qkv": ["qkv"],
+    }
+
+    def __init__(
+        self,
+        config: PretrainedConfig = None,
+        quant_config: QuantizationConfig | None = None,
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        self.img_size, self.grid_size, self.num_patches = self._init_img_size(
+            to_2tuple(config.patch_size), config.image_size
+        )
+        max_img_size = int(
+            round(config.max_img_size / config.patch_size) * config.patch_size
+        )
+        self.patch_generator = ViTPatchGenerator(
+            config.patch_size,
+            config.hidden_size,
+            input_dims=self.img_size,
+            max_input_dims=max_img_size,
+            cls_token=True,
+            register_multiple=config.reg_tokens,
+        )
+
+        self.encoder = InternVisionEncoder(config=config, quant_config=quant_config)
+
+    def _init_img_size(self, patch_size, img_size: int | tuple[int, int]):
+        if img_size is None:
+            return None, None, None
+        img_size = to_2tuple(img_size)
+        grid_size = tuple([s // p for s, p in zip(img_size, patch_size)])
+        num_patches = grid_size[0] * grid_size[1]
+        return img_size, grid_size, num_patches
+
+    def get_input_embeddings(self):
+        return self.embeddings
+
+    def forward(self, x: torch.Tensor) -> torch.FloatTensor:
+        assert self.patch_generator is not None
+        hidden_states = self.patch_generator(x)
+        encoder_outputs = self.encoder.forward(inputs_embeds=hidden_states)
+        assert isinstance(encoder_outputs, BaseModelOutput)
+        return encoder_outputs.last_hidden_state
+
+
+class RadioModel(nn.Module):
+    packed_modules_mapping = {
+        "qkv": ["qkv"],
+    }
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: QuantizationConfig | None = None,
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        self.model = RadioInternVisionModel(
+            config=config,
+            quant_config=quant_config,
+        )
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor | None = None,
+        pixel_embeds: torch.Tensor | None = None,
+    ) -> torch.FloatTensor:
+        y = self.model(pixel_values)
+        return self._extract_final(y)
+
+    def load_weights(self, weights) -> set[str]:
+        remap_substrings = {
+            "attn": "attn.attn",
+            "qkv": "qkv_proj",
+            "blocks": "encoder.layers",
+        }
+        remap_prefixes = {
+            "radio_model.": "",
+        }
+
+        loaded_params: set[str] = set()
+        params_dict = dict(self.named_parameters())
+
+        if isinstance(weights, dict):
+            weights_list = list(weights.items())
+        else:
+            weights_list = list(weights)
+
+        for name, weight in weights_list:
+            if not name.startswith("radio_model."):
+                # Skip non-radio weights
+                continue
+            name = replace_substrings(name, remap_substrings)
+            name = replace_prefix(name, remap_prefixes)
+            if name and name in params_dict:
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, weight)
+                loaded_params.add(name)
+
+        return loaded_params
+
+    def _extract_final(self, y: torch.Tensor):
+        # Remove CLS + REGISTERS tokens
+        patch_gen = getattr(self.model, "patch_generator", None)
+        if patch_gen is not None:
+            all_feat = y[:, patch_gen.num_skip :]
+
+        return all_feat
diff --git a/sglang/python/sglang/srt/models/registry.py b/sglang/python/sglang/srt/models/registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..066be3dc44b3dfb281cf725814030ccb0ec01925
--- /dev/null
+++ b/sglang/python/sglang/srt/models/registry.py
@@ -0,0 +1,132 @@
+# Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/models/registry.py
+
+import importlib
+import logging
+import pkgutil
+from dataclasses import dataclass, field
+from functools import lru_cache
+from typing import AbstractSet, Dict, List, Optional, Tuple, Type, Union
+
+import torch.nn as nn
+
+from sglang.srt.environ import envs
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class _ModelRegistry:
+    # Keyed by model_arch
+    models: Dict[str, Union[Type[nn.Module], str]] = field(default_factory=dict)
+
+    def register(
+        self, package_name: str, overwrite: bool = False, strict: bool = False
+    ):
+        new_models = import_model_classes(package_name, strict=strict)
+        if overwrite:
+            self.models.update(new_models)
+        else:
+            for arch, cls in new_models.items():
+                if arch in self.models:
+                    raise ValueError(
+                        f"Model architecture {arch} already registered. Set overwrite=True to replace."
+                    )
+                self.models[arch] = cls
+
+    def get_supported_archs(self) -> AbstractSet[str]:
+        return self.models.keys()
+
+    def _raise_for_unsupported(self, architectures: List[str]):
+        all_supported_archs = self.get_supported_archs()
+
+        if any(arch in all_supported_archs for arch in architectures):
+            raise ValueError(
+                f"Model architectures {architectures} failed "
+                "to be inspected. Please check the logs for more details."
+            )
+
+        raise ValueError(
+            f"Model architectures {architectures} are not supported for now. "
+            f"Supported architectures: {all_supported_archs}"
+        )
+
+    def _try_load_model_cls(self, model_arch: str) -> Optional[Type[nn.Module]]:
+        if model_arch not in self.models:
+            return None
+
+        return self.models[model_arch]
+
+    def _normalize_archs(
+        self,
+        architectures: Union[str, List[str]],
+    ) -> List[str]:
+        if isinstance(architectures, str):
+            architectures = [architectures]
+        if not architectures:
+            logger.warning("No model architectures are specified")
+
+        # filter out support architectures
+        normalized_arch = list(
+            filter(lambda model: model in self.models, architectures)
+        )
+
+        # make sure Transformers backend is put at the last as a fallback
+        if len(normalized_arch) != len(architectures):
+            normalized_arch.append("TransformersForCausalLM")
+        return normalized_arch
+
+    def resolve_model_cls(
+        self,
+        architectures: Union[str, List[str]],
+    ) -> Tuple[Type[nn.Module], str]:
+        architectures = self._normalize_archs(architectures)
+
+        for arch in architectures:
+            model_cls = self._try_load_model_cls(arch)
+            if model_cls is not None:
+                return (model_cls, arch)
+
+        return self._raise_for_unsupported(architectures)
+
+
+@lru_cache()
+def import_model_classes(package_name: str, strict: bool = False):
+    model_arch_name_to_cls = {}
+    package = importlib.import_module(package_name)
+    for _, name, ispkg in pkgutil.iter_modules(package.__path__, package_name + "."):
+        if not ispkg:
+            if name.split(".")[-1] in envs.SGLANG_DISABLED_MODEL_ARCHS.get():
+                logger.debug(f"Skip loading {name} due to SGLANG_DISABLED_MODEL_ARCHS")
+                continue
+
+            try:
+                module = importlib.import_module(name)
+            except Exception as e:
+                if strict:
+                    raise
+                logger.warning(f"Ignore import error when loading {name}: {e}")
+                continue
+            if hasattr(module, "EntryClass"):
+                entry = module.EntryClass
+                if isinstance(
+                    entry, list
+                ):  # To support multiple model classes in one module
+                    for tmp in entry:
+                        assert (
+                            tmp.__name__ not in model_arch_name_to_cls
+                        ), f"Duplicated model implementation for {tmp.__name__}"
+                        model_arch_name_to_cls[tmp.__name__] = tmp
+                else:
+                    assert (
+                        entry.__name__ not in model_arch_name_to_cls
+                    ), f"Duplicated model implementation for {entry.__name__}"
+                    model_arch_name_to_cls[entry.__name__] = entry
+
+    return model_arch_name_to_cls
+
+
+ModelRegistry = _ModelRegistry()
+ModelRegistry.register("sglang.srt.models")
+
+if external_pkg := envs.SGLANG_EXTERNAL_MODEL_PACKAGE.get():
+    ModelRegistry.register(external_pkg, overwrite=True)
diff --git a/sglang/python/sglang/srt/models/roberta.py b/sglang/python/sglang/srt/models/roberta.py
new file mode 100644
index 0000000000000000000000000000000000000000..c81590320cc93778c780a42e006b803560797f26
--- /dev/null
+++ b/sglang/python/sglang/srt/models/roberta.py
@@ -0,0 +1,338 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+
+from sglang.srt.layers.pooler import CrossEncodingPooler, Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.sparse_pooler import SparsePooler
+from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.bert import BertEncoder
+from sglang.srt.utils.hf_transformers_utils import download_from_hf
+
+RobertaConfig = None
+
+
+# Adapted from transformers
+class RobertaClassificationHead(nn.Module):
+    """Head for sentence-level classification tasks."""
+
+    def __init__(self, config: RobertaConfig):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
+
+    def forward(self, features, **kwargs):
+        x = features[0, :]  # take <s> token (equiv. to [CLS])
+        x = self.dense(x)
+        x = torch.tanh(x)
+        x = self.out_proj(x)
+        return x
+
+
+class RobertaEmbedding(nn.Module):
+
+    def __init__(self, config: RobertaConfig):
+        super().__init__()
+        self.size = config.hidden_size
+        self.word_embeddings = VocabParallelEmbedding(
+            config.vocab_size, config.hidden_size
+        )
+        self.padding_idx = config.pad_token_id
+        self.position_embeddings = nn.Embedding(
+            config.max_position_embeddings,
+            config.hidden_size,
+            padding_idx=self.padding_idx,
+        )
+
+        self.token_type_embeddings = nn.Embedding(
+            config.type_vocab_size, config.hidden_size
+        )
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+        self.position_ids = nn.Parameter(
+            torch.empty((1, config.max_position_embeddings)),
+        )
+
+        self.position_embedding_type = config.position_embedding_type
+        if self.position_embedding_type != "absolute":
+            raise ValueError(
+                "Only 'absolute' position_embedding_type" + " is supported"
+            )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        seq_lens: torch.Tensor,
+        position_ids: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        input_shape = input_ids.size()
+        inputs_embeds = self.word_embeddings(input_ids)
+
+        # Adapted from vllm: https://github.com/vllm-project/vllm/commit/4a18fd14ba4a349291c798a16bf62fa8a9af0b6b/vllm/model_executor/models/roberta.py
+
+        pos_list = []
+        token_list = []
+        offset = 0
+        for seq_len in seq_lens:
+            pos_list.append(position_ids[offset : offset + seq_len])
+            token_list.append(input_ids[offset : offset + seq_len])
+            offset += seq_len
+
+        new_pos_list = []
+        for positions, tokens in zip(pos_list, token_list):
+            # Verify assumption that incoming position are
+            # always a sequence from 0 to N.
+            expected_pos = torch.arange(
+                positions.size()[0], dtype=torch.long, device=inputs_embeds.device
+            )
+            assert torch.equal(positions, expected_pos)
+            new_pos_list.append(
+                create_position_ids_from_input_ids(tokens, self.padding_idx)
+            )
+        position_ids = torch.cat(new_pos_list)
+
+        # Position embeddings.
+        position_embeddings = self.position_embeddings(position_ids)
+
+        token_type_ids = forward_batch.token_type_ids
+        if token_type_ids is None:
+            token_type_ids = torch.zeros(
+                input_shape, dtype=torch.long, device=inputs_embeds.device
+            )
+
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+        embeddings = inputs_embeds + token_type_embeddings + position_embeddings
+        embeddings = self.LayerNorm(embeddings)
+        return embeddings
+
+
+class XLMRobertaBaseModel(nn.Module):
+    def __init__(
+        self,
+        *,
+        config: RobertaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        add_pooling_layer: bool = False,
+    ):
+        super().__init__()
+
+        self.config = config
+        self.embeddings = RobertaEmbedding(config)
+        self.encoder = BertEncoder(config=config, quant_config=quant_config, prefix="")
+        self.pooler = (
+            Pooler(pooling_type=PoolingType.CLS, normalize=True)
+            if add_pooling_layer
+            else None
+        )
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = False,
+    ) -> torch.Tensor:
+        assert get_embedding == True
+        # Your tokenized IDs
+
+        hidden_states = self.embeddings(
+            input_ids=input_ids,
+            position_ids=positions,
+            seq_lens=forward_batch.seq_lens,
+            forward_batch=forward_batch,
+        )
+
+        hidden_states = self.encoder(hidden_states, forward_batch=forward_batch)
+
+        return hidden_states
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "query", "q"),
+            ("qkv_proj", "key", "k"),
+            ("qkv_proj", "value", "v"),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            name = name.replace("self", "self_attn")
+            if self.pooler is None and "pooler" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+# Adapted from transformers
+def create_position_ids_from_input_ids(
+    input_ids, padding_idx, past_key_values_length=0
+):
+    mask = input_ids.ne(padding_idx).int()
+    incremental_indices = (
+        torch.cumsum(mask, dim=0).type_as(mask) + past_key_values_length
+    ) * mask
+    return incremental_indices.long() + padding_idx
+
+
+class XLMRobertaModel(nn.Module):
+    def __init__(
+        self,
+        *,
+        config: RobertaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        sparse_head: Optional[str] = None,
+        model_path: Optional[str] = None,
+    ):
+        super().__init__()
+        self.roberta = XLMRobertaBaseModel(
+            config=config, quant_config=quant_config, prefix=prefix
+        )
+        if sparse_head is not None:
+            self._is_sparse = True
+            self._model_path = model_path
+            self._sparse_head = sparse_head
+            self.pooler = SparsePooler(config=config)
+            # Zero out special tokens
+            self._special_tokens = [
+                config.bos_token_id,
+                config.eos_token_id,
+                config.pad_token_id,
+                # self.config.unk_token_id # not available in the XLMRobertaConfig
+            ]
+            self._special_tokens = [t for t in self._special_tokens if t is not None]
+        else:
+            self._is_sparse = False
+            self.pooler = Pooler(pooling_type=PoolingType.CLS, normalize=True)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = False,
+    ) -> torch.Tensor:
+        hidden_states = self.roberta(
+            input_ids, positions, forward_batch, input_embeds, get_embedding
+        )
+        embeddings = self.pooler(hidden_states, forward_batch)
+
+        if self._is_sparse:
+            for token_id in self._special_tokens:
+                embeddings.embeddings[:, token_id] = 0.0
+            embeddings.embeddings = embeddings.embeddings.to_sparse()
+
+        return embeddings
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        self.roberta.load_weights(weights)
+
+        if self._is_sparse:
+            sparse_dict = XLMRobertaModel._load_sparse_linear(
+                self._model_path, self._sparse_head
+            )
+            self.pooler.load_weights(sparse_dict)
+
+    @staticmethod
+    def _load_sparse_linear(model_path_or_dir: str, sparse_head: str) -> dict:
+        """
+        Load sparse_head from local dir or HF Hub.
+        Returns a state_dict suitable for nn.Linear.load_state_dict().
+        """
+        if os.path.isdir(model_path_or_dir):
+            path = os.path.join(model_path_or_dir, sparse_head)
+            if not os.path.exists(path):
+                raise FileNotFoundError(
+                    f"'{sparse_head}' not found in {model_path_or_dir}"
+                )
+        else:
+            # remote → use SGLang HF utility
+            local_dir = download_from_hf(model_path_or_dir, allow_patterns=sparse_head)
+            path = os.path.join(local_dir, sparse_head)
+
+        state_dict = torch.load(path)
+        return state_dict
+
+
+class XLMRobertaForSequenceClassification(nn.Module):
+    def __init__(
+        self,
+        *,
+        config: RobertaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.roberta = XLMRobertaBaseModel(
+            config=config, quant_config=quant_config, prefix=prefix
+        )
+        self.classifier = RobertaClassificationHead(config)
+        self.pooler = CrossEncodingPooler(config, self.classifier, self.roberta.pooler)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = True,
+    ) -> torch.Tensor:
+        assert (
+            get_embedding
+        ), "XLMRobertaForSequenceClassification is only used for rerank"
+
+        hidden_states = self.roberta(
+            input_ids, positions, forward_batch, input_embeds, get_embedding
+        )
+        return self.pooler(hidden_states, forward_batch)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        self_weights = []
+
+        def weight_filter():
+            for name, weight in weights:
+                if name.startswith("roberta."):
+                    yield (name[len("roberta.") :], weight)
+                else:
+                    self_weights.append((name, weight))
+
+        self.roberta.load_weights(weight_filter())
+
+        params_dict = dict(self.named_parameters())
+
+        for name, loaded_weight in self_weights:
+            if name.startswith("classifier"):
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = [XLMRobertaModel, XLMRobertaForSequenceClassification]
diff --git a/sglang/python/sglang/srt/models/sarashina2_vision.py b/sglang/python/sglang/srt/models/sarashina2_vision.py
new file mode 100644
index 0000000000000000000000000000000000000000..f58908b5d15835c782a5bcb6ee244662aaf9fe59
--- /dev/null
+++ b/sglang/python/sglang/srt/models/sarashina2_vision.py
@@ -0,0 +1,268 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Inference-only Sarashina2Vision model compatible with HuggingFace weights."""
+
+import logging
+from typing import Iterable, List, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import LlamaConfig
+
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.mm_utils import (
+    MultimodalDataItem,
+    MultimodalInputs,
+    MultiModalityDataPaddingPatternMultimodalTokens,
+    general_mm_embed_routine,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.llama import LlamaForCausalLM
+from sglang.srt.models.qwen2_vl import Qwen2VisionTransformer
+from sglang.srt.utils import add_prefix
+
+logger = logging.getLogger(__name__)
+
+
+class Sarashina2VisionForCausalLM(nn.Module):
+    """
+    Sarashina2Vision model that combines:
+    - Llama text backbone (sbintuitions/sarashina2-7b)
+    - Qwen2VL vision encoder
+    """
+
+    def __init__(
+        self,
+        config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+
+        # Extract text and vision configurations
+        text_config = getattr(config, "text_config", config)
+        vision_config = getattr(config, "vision_config", None)
+
+        # Create vision transformer first (like original model)
+        if vision_config is not None:
+            self.visual = Qwen2VisionTransformer(
+                vision_config,
+                norm_eps=getattr(config, "rms_norm_eps", 1e-5),
+                quant_config=quant_config,
+                prefix=add_prefix("visual", prefix),
+            )
+        else:
+            self.visual = None
+
+        # Layer norm for vision outputs (matching original model)
+        self.norm = nn.LayerNorm(text_config.hidden_size)
+
+        # Create Llama text model (using 'llm' name to match original)
+        if hasattr(text_config, "model_type") and text_config.model_type == "llama":
+            llama_config = LlamaConfig(**text_config.__dict__)
+            # Set vocab_size from main config if available
+            if hasattr(config, "vocab_size"):
+                llama_config.vocab_size = config.vocab_size
+            self.llm = LlamaForCausalLM(
+                llama_config,
+                quant_config=quant_config,
+                prefix=add_prefix("llm", prefix),
+            )
+        else:
+            # Set vocab_size from main config if available
+            if hasattr(config, "vocab_size"):
+                config.vocab_size = config.vocab_size
+            self.llm = LlamaForCausalLM(
+                config,
+                quant_config=quant_config,
+                prefix=add_prefix("llm", prefix),
+            )
+
+        # Image token indices from config
+        self.image_token_index = getattr(config, "image_token_index", 14)
+        self.start_image_token_index = getattr(
+            config, "start_image_token_index", 102397
+        )
+        self.end_image_token_index = getattr(config, "end_image_token_index", 102398)
+
+        # Ensure vocabulary size matches
+        if hasattr(config, "vocab_size"):
+            self.llm.config.vocab_size = config.vocab_size
+
+        self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        """Pad input tokens with multimodal data hashes for RadixAttention."""
+        pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+        return pattern.pad_input_tokens(input_ids, mm_inputs)
+
+    def get_input_embeddings(self):
+        """Get input embeddings from the language model."""
+        return self.llm.get_input_embeddings()
+
+    def get_image_embeds(
+        self,
+        pixel_values: torch.Tensor,
+        image_grid_thw: torch.Tensor,
+    ) -> torch.Tensor:
+        """Extract image embeddings using the vision transformer."""
+        if self.visual is None:
+            raise ValueError("Visual encoder not initialized")
+
+        # Use the existing Qwen2VisionTransformer forward method
+        hidden_states = self.visual(pixel_values, image_grid_thw)
+
+        # Apply normalization layer
+        return self.norm(hidden_states)
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        """Extract image features for SGLang compatibility."""
+        if self.visual is None:
+            raise ValueError("Visual encoder not initialized")
+
+        # Concatenate pixel values and grid_thw from all items
+        pixel_values = torch.cat([item.feature for item in items], dim=0).type(
+            self.visual.dtype
+        )
+        image_grid_thw = torch.cat([item.image_grid_thw for item in items], dim=0)
+
+        assert pixel_values.dim() == 2, pixel_values.dim()
+        assert image_grid_thw.dim() == 2, image_grid_thw.dim()
+
+        # Use the get_image_embeds method
+        return self.get_image_embeds(pixel_values, image_grid_thw)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        get_embedding: bool = False,
+    ) -> torch.Tensor:
+        """Forward pass through the model."""
+        # Handles token-to-feature mapping for expanded tokens
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.llm.model,
+            multimodal_model=self,
+            positions=positions,
+        )
+
+        if get_embedding:
+            return self.pooler(hidden_states, forward_batch)
+        else:
+            return self.logits_processor(
+                input_ids, hidden_states, self.llm.lm_head, forward_batch
+            )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        """Load model weights."""
+        params_dict = dict(self.named_parameters())
+        loaded_params = set()
+
+        # Collect weights that need to be fused
+        qkv_weights = {}
+        gate_up_weights = {}
+
+        for name, loaded_weight in weights:
+            # Handle weight name mappings
+
+            # Map visual attention weights: qkv -> qkv_proj
+            if ".attn.qkv." in name:
+                mapped_name = name.replace(".attn.qkv.", ".attn.qkv_proj.")
+                if mapped_name in params_dict:
+                    param = params_dict[mapped_name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+                    loaded_params.add(mapped_name)
+                    continue
+
+            # Handle Llama attention weights - need to fuse q, k, v into qkv
+            if ".self_attn.q_proj.weight" in name:
+                base = name.replace(".q_proj.weight", "")
+                qkv_weights[base] = qkv_weights.get(base, {})
+                qkv_weights[base]["q"] = loaded_weight
+                continue
+            elif ".self_attn.k_proj.weight" in name:
+                base = name.replace(".k_proj.weight", "")
+                qkv_weights[base] = qkv_weights.get(base, {})
+                qkv_weights[base]["k"] = loaded_weight
+                continue
+            elif ".self_attn.v_proj.weight" in name:
+                base = name.replace(".v_proj.weight", "")
+                qkv_weights[base] = qkv_weights.get(base, {})
+                qkv_weights[base]["v"] = loaded_weight
+                continue
+
+            # Handle Llama MLP weights - need to fuse gate and up projections
+            if ".mlp.gate_proj.weight" in name:
+                base = name.replace(".gate_proj.weight", "")
+                gate_up_weights[base] = gate_up_weights.get(base, {})
+                gate_up_weights[base]["gate"] = loaded_weight
+                continue
+            elif ".mlp.up_proj.weight" in name:
+                base = name.replace(".up_proj.weight", "")
+                gate_up_weights[base] = gate_up_weights.get(base, {})
+                gate_up_weights[base]["up"] = loaded_weight
+                continue
+
+            # Direct mapping for other weights
+            if name in params_dict:
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+                loaded_params.add(name)
+
+        # Fuse QKV weights for Llama attention layers
+        for base, weights_dict in qkv_weights.items():
+            if "q" in weights_dict and "k" in weights_dict and "v" in weights_dict:
+                qkv_name = f"{base}.qkv_proj.weight"
+                if qkv_name in params_dict:
+                    # Concatenate q, k, v weights
+                    q, k, v = weights_dict["q"], weights_dict["k"], weights_dict["v"]
+                    qkv = torch.cat([q, k, v], dim=0)
+                    param = params_dict[qkv_name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, qkv)
+                    loaded_params.add(qkv_name)
+
+        # Fuse gate and up weights for Llama MLP layers
+        for base, weights_dict in gate_up_weights.items():
+            if "gate" in weights_dict and "up" in weights_dict:
+                gate_up_name = f"{base}.gate_up_proj.weight"
+                if gate_up_name in params_dict:
+                    # Concatenate gate and up weights
+                    gate, up = weights_dict["gate"], weights_dict["up"]
+                    gate_up = torch.cat([gate, up], dim=0)
+                    param = params_dict[gate_up_name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, gate_up)
+                    loaded_params.add(gate_up_name)
+
+
+# Register the model
+EntryClass = Sarashina2VisionForCausalLM
diff --git a/sglang/python/sglang/srt/models/sarvam_moe.py b/sglang/python/sglang/srt/models/sarvam_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c4179362fcd1c75c9830a8afb98da2bf15ca729
--- /dev/null
+++ b/sglang/python/sglang/srt/models/sarvam_moe.py
@@ -0,0 +1,1525 @@
+"""Inference-only Sarvam MoE models for SGLang.
+- SarvamMLAForCausalLM (105B)
+- SarvamMoEForCausalLM (30B)
+"""
+
+import math
+from enum import IntEnum, auto
+from typing import Any, Dict, Iterable, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import (
+    get_pp_group,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.attention.utils import concat_and_cast_mha_k_triton
+from sglang.srt.layers.communicator import (
+    LayerCommunicator,
+    LayerScatterModes,
+    enable_moe_dense_fully_dp,
+)
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.moe import should_use_flashinfer_cutlass_moe_fp4_allgather
+from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
+from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.moe.utils import RoutingMethodType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import get_layer_id
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.bailing_moe import BailingMoEForCausalLM
+from sglang.srt.models.deepseek_common.attention_forward_methods.forward_mha import (
+    DeepseekMHAForwardMixin,
+)
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import (
+    BumpAllocator,
+    add_prefix,
+    bind_or_assign,
+    is_cuda,
+    is_nvidia_cublas_version_ge_12_9,
+    make_layers,
+    next_power_of_2,
+)
+
+_is_cuda = is_cuda()
+_is_cublas_ge_129 = is_nvidia_cublas_version_ge_12_9()
+
+if _is_cuda:
+    try:
+        from sgl_kernel import bmm_fp8, concat_mla_k, merge_state_v2
+
+        from sglang.srt.layers.quantization.fp8_kernel import per_tensor_quant_mla_fp8
+
+        _has_fp8_support = True
+        _has_concat_mla_k = True
+    except ImportError:
+        _has_fp8_support = False
+        _has_concat_mla_k = False
+        bmm_fp8 = None
+        concat_mla_k = None
+        merge_state_v2 = None
+        per_tensor_quant_mla_fp8 = None
+else:
+    _has_fp8_support = False
+    _has_concat_mla_k = False
+    bmm_fp8 = None
+    concat_mla_k = None
+    merge_state_v2 = None
+    per_tensor_quant_mla_fp8 = None
+
+
+class AttnForwardMethod(IntEnum):
+    MLA_SEPARATE_ROPE = auto()
+    MLA_CONCAT_ROPE = auto()
+    MHA_PREFILL = auto()
+
+
+SEPARATE_ROPE_BACKENDS = frozenset(
+    ["fa3", "flashinfer", "nsa", "cutlass_mla", "trtllm_mla"]
+)
+CONCAT_ROPE_BACKENDS = frozenset(["flashmla", "triton"])
+
+
+class AttentionBackendRegistry:
+    _handlers = {}
+
+    @classmethod
+    def register(cls, backend_name: str, handler_func):
+        cls._handlers[backend_name] = handler_func
+
+    @classmethod
+    def get_handler(cls, backend_name: str):
+        return cls._handlers.get(backend_name, cls._default_handler)
+
+    @classmethod
+    def _default_handler(cls, attn, forward_batch) -> AttnForwardMethod:
+        return AttnForwardMethod.MLA_CONCAT_ROPE
+
+    @classmethod
+    def get_forward_method(
+        cls, backend_name: str, attn, forward_batch
+    ) -> AttnForwardMethod:
+        handler = cls.get_handler(backend_name)
+        return handler(attn, forward_batch)
+
+
+def _handle_separate_rope_backend(attn, forward_batch) -> AttnForwardMethod:
+    return AttnForwardMethod.MLA_SEPARATE_ROPE
+
+
+def _handle_concat_rope_backend(attn, forward_batch) -> AttnForwardMethod:
+    return AttnForwardMethod.MLA_CONCAT_ROPE
+
+
+for backend in SEPARATE_ROPE_BACKENDS:
+    AttentionBackendRegistry.register(backend, _handle_separate_rope_backend)
+for backend in CONCAT_ROPE_BACKENDS:
+    AttentionBackendRegistry.register(backend, _handle_concat_rope_backend)
+
+
+def get_attn_forward_method(server_args, forward_batch) -> AttnForwardMethod:
+    is_decode = forward_batch.forward_mode.is_decode_or_idle()
+    if is_decode:
+        backend = server_args.decode_attention_backend or server_args.attention_backend
+    else:
+        backend = server_args.prefill_attention_backend or server_args.attention_backend
+        if (
+            forward_batch.forward_mode.is_extend_without_speculative()
+            and backend == "fa3"
+        ):
+            return AttnForwardMethod.MHA_PREFILL
+    return AttentionBackendRegistry.get_forward_method(backend, None, forward_batch)
+
+
+class SarvamMoEMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        reduce_results: bool = True,
+        tp_rank: Optional[int] = None,
+        tp_size: Optional[int] = None,
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+            tp_rank=tp_rank,
+            tp_size=tp_size,
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+            reduce_results=reduce_results,
+            tp_rank=tp_rank,
+            tp_size=tp_size,
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. Only silu is supported."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(
+        self,
+        x,
+        forward_batch: ForwardBatch = None,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+    ):
+        if x.shape[0] == 0:
+            return x
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(
+            x, skip_all_reduce=should_allreduce_fusion or use_reduce_scatter
+        )
+        return x
+
+
+class SarvamMoESparseMoeBlock(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ):
+        super().__init__()
+        self.config = config
+        self.layer_id = layer_id
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.routed_scaling_factor = getattr(config, "routed_scaling_factor", 2.5)
+        self.score_function = getattr(config, "score_function", "sigmoid")
+        self.n_group = getattr(config, "n_group", None)
+        self.topk_group = getattr(config, "topk_group", None)
+        self.alt_stream = alt_stream
+
+        dtype_map = {
+            "fp32": torch.float32,
+            "bf16": torch.bfloat16,
+            "bfloat16": torch.bfloat16,
+        }
+        router_dtype_cfg = getattr(config, "router_dtype", "fp32")
+        self.router_dtype = dtype_map.get(router_dtype_cfg, None)
+
+        if self.tp_size > config.num_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {config.num_experts}."
+            )
+
+        self.e_score_correction_bias = nn.Parameter(
+            torch.zeros(config.num_experts, dtype=torch.float32),
+            requires_grad=False,
+        )
+
+        self.topk = TopK(
+            top_k=config.num_experts_per_tok,
+            use_grouped_topk=self.n_group is not None and self.topk_group is not None,
+            num_expert_group=self.n_group,
+            topk_group=self.topk_group,
+            renormalize=True,
+            routed_scaling_factor=None,
+            apply_routed_scaling_factor_on_output=False,
+            scoring_func=self.score_function,
+            correction_bias=self.e_score_correction_bias,
+            quant_config=quant_config,
+            layer_id=layer_id,
+        )
+
+        self.experts = get_moe_impl_class(quant_config)(
+            num_experts=config.num_experts
+            + get_global_server_args().ep_num_redundant_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("experts", prefix),
+            routing_method_type=RoutingMethodType.Renormalize,
+        )
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            config.num_experts,
+            bias=False,
+            quant_config=None,
+            prefix=add_prefix("gate", prefix),
+        )
+
+        if (
+            getattr(config, "num_shared_experts", None)
+            and config.num_shared_experts > 0
+        ):
+            intermediate_size = config.moe_intermediate_size * config.num_shared_experts
+            if enable_moe_dense_fully_dp():
+                shared_tp_rank, shared_tp_size = 0, 1
+            else:
+                shared_tp_rank, shared_tp_size = None, None
+            self.shared_experts = SarvamMoEMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=add_prefix("shared_experts", prefix),
+                reduce_results=False,
+                tp_rank=shared_tp_rank,
+                tp_size=shared_tp_size,
+            )
+        else:
+            self.shared_experts = None
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: Optional[ForwardBatch] = None,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+        gemm_output_zero_allocator: Optional[BumpAllocator] = None,
+    ) -> torch.Tensor:
+        del gemm_output_zero_allocator
+
+        if (
+            self.shared_experts is not None
+            and self.alt_stream is not None
+            and hidden_states.shape[0] > 0
+            and get_is_capture_mode()
+        ):
+            return self.forward_normal_dual_stream(
+                hidden_states, should_allreduce_fusion, use_reduce_scatter
+            )
+        else:
+            return self.forward_normal(
+                hidden_states, should_allreduce_fusion, use_reduce_scatter
+            )
+
+    def get_moe_weights(self):
+        return [
+            x.data
+            for name, x in self.experts.named_parameters()
+            if name not in ["correction_bias"]
+        ]
+
+    def _forward_shared_experts(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        return self.shared_experts(hidden_states)
+
+    def _forward_router_experts(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        if self.router_dtype is not None:
+            router_logits = F.linear(
+                hidden_states.to(self.router_dtype),
+                self.gate.weight.to(self.router_dtype),
+            )
+        else:
+            router_logits, _ = self.gate(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        return self.experts(hidden_states, topk_output)
+
+    def forward_normal_dual_stream(
+        self,
+        hidden_states: torch.Tensor,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+    ) -> torch.Tensor:
+        num_tokens, hidden_dim = hidden_states.shape
+        current_stream = torch.cuda.current_stream()
+        self.alt_stream.wait_stream(current_stream)
+        shared_out = self._forward_shared_experts(hidden_states)
+        with torch.cuda.stream(self.alt_stream):
+            final_hidden_states = self._forward_router_experts(hidden_states)
+            if self.routed_scaling_factor != 1.0:
+                final_hidden_states = final_hidden_states * self.routed_scaling_factor
+        current_stream.wait_stream(self.alt_stream)
+        final_hidden_states = final_hidden_states + shared_out
+        if (
+            self.tp_size > 1
+            and not should_allreduce_fusion
+            and not use_reduce_scatter
+            and not should_use_flashinfer_cutlass_moe_fp4_allgather()
+        ):
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+        return final_hidden_states.view(num_tokens, hidden_dim)
+
+    def forward_normal(
+        self,
+        hidden_states: torch.Tensor,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+    ) -> torch.Tensor:
+        if hidden_states.shape[0] == 0:
+            return hidden_states
+
+        num_tokens, hidden_dim = hidden_states.shape
+        identity = (
+            hidden_states.clone() if self.shared_experts is not None else hidden_states
+        )
+
+        if self.router_dtype is not None:
+            router_logits = F.linear(
+                hidden_states.to(self.router_dtype),
+                self.gate.weight.to(self.router_dtype),
+            )
+        else:
+            router_logits, _ = self.gate(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        final_hidden_states = self.experts(hidden_states, topk_output)
+
+        if self.shared_experts is not None:
+            shared_out = self.shared_experts(identity)
+            if self.routed_scaling_factor != 1.0:
+                shared_out.add_(final_hidden_states, alpha=self.routed_scaling_factor)
+            else:
+                shared_out.add_(final_hidden_states)
+            final_hidden_states = shared_out
+        elif self.routed_scaling_factor != 1.0:
+            final_hidden_states = final_hidden_states * self.routed_scaling_factor
+
+        if (
+            self.tp_size > 1
+            and not should_allreduce_fusion
+            and not use_reduce_scatter
+            and not should_use_flashinfer_cutlass_moe_fp4_allgather()
+        ):
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+
+        return final_hidden_states.view(num_tokens, hidden_dim)
+
+
+class SarvamMoEMLAAttention(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        hidden_size: int,
+        num_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = hidden_size
+        self.layer_id = layer_id
+        self.alt_stream = alt_stream
+        self.quant_config = quant_config
+
+        attn_tp_rank = get_attention_tp_rank()
+        attn_tp_size = get_attention_tp_size()
+
+        self.qk_nope_head_dim = config.qk_nope_head_dim
+        self.qk_rope_head_dim = config.qk_rope_head_dim
+        self.qk_head_dim = config.qk_nope_head_dim + config.qk_rope_head_dim
+        self.v_head_dim = config.v_head_dim
+        self.q_lora_rank = getattr(config, "q_lora_rank", None)
+        self.kv_lora_rank = config.kv_lora_rank
+
+        self.num_heads = num_heads
+        assert num_heads % attn_tp_size == 0
+        self.num_local_heads = num_heads // attn_tp_size
+
+        self.scaling = self.qk_head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+        self.kv_cache_dtype = get_global_server_args().kv_cache_dtype
+
+        self._server_args = None
+        self.current_attention_backend = None
+
+        if self.q_lora_rank is None:
+            self.q_proj = ColumnParallelLinear(
+                self.hidden_size,
+                self.num_heads * self.qk_head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=add_prefix("q_proj", prefix),
+                tp_rank=attn_tp_rank,
+                tp_size=attn_tp_size,
+            )
+            self.kv_a_proj_with_mqa = ReplicatedLinear(
+                self.hidden_size,
+                self.kv_lora_rank + self.qk_rope_head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=add_prefix("kv_a_proj_with_mqa", prefix),
+            )
+        else:
+            self.q_a_proj = ReplicatedLinear(
+                self.hidden_size,
+                self.q_lora_rank,
+                bias=False,
+                quant_config=quant_config,
+                prefix=add_prefix("q_a_proj", prefix),
+            )
+            self.q_a_layernorm = RMSNorm(self.q_lora_rank, eps=config.rms_norm_eps)
+            self.q_b_proj = ColumnParallelLinear(
+                self.q_lora_rank,
+                self.num_heads * self.qk_head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=add_prefix("q_b_proj", prefix),
+                tp_rank=attn_tp_rank,
+                tp_size=attn_tp_size,
+            )
+            self.kv_a_proj_with_mqa = ReplicatedLinear(
+                self.hidden_size,
+                self.kv_lora_rank + self.qk_rope_head_dim,
+                bias=False,
+                quant_config=quant_config,
+                prefix=add_prefix("kv_a_proj_with_mqa", prefix),
+            )
+
+        self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, eps=config.rms_norm_eps)
+        self.kv_b_proj = ColumnParallelLinear(
+            self.kv_lora_rank,
+            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("kv_b_proj", prefix),
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+        )
+
+        self.o_proj = RowParallelLinear(
+            self.num_heads * self.v_head_dim,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            reduce_results=False,
+        )
+
+        self.rotary_emb = get_rope(
+            self.qk_rope_head_dim,
+            rotary_dim=self.qk_rope_head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            is_neox_style=False,
+        )
+        if rope_scaling and rope_scaling["type"] == "deepseek_yarn":
+            mscale_all_dim = rope_scaling.get("mscale_all_dim", 1.0)
+            scaling_factor = rope_scaling.get("factor", 1.0)
+            mscale = self.yarn_get_mscale(scaling_factor, float(mscale_all_dim))
+            self.scaling = self.scaling * mscale * mscale
+
+        self.attn_mqa = RadixAttention(
+            self.num_local_heads,
+            self.kv_lora_rank + self.qk_rope_head_dim,
+            self.scaling,
+            num_kv_heads=1,
+            layer_id=layer_id,
+            v_head_dim=self.kv_lora_rank,
+            quant_config=quant_config,
+            prefix=add_prefix("attn_mqa", prefix),
+        )
+
+        self.attn_mha = RadixAttention(
+            self.num_local_heads,
+            self.qk_nope_head_dim + self.qk_rope_head_dim,
+            self.scaling,
+            num_kv_heads=self.num_local_heads,
+            layer_id=layer_id,
+            v_head_dim=self.v_head_dim,
+            quant_config=quant_config,
+            prefix=add_prefix("attn_mha", prefix),
+        )
+
+        self.w_kc = None
+        self.w_vc = None
+        self.w_scale = None
+
+    def yarn_get_mscale(self, scale: float = 1, mscale: float = 1) -> float:
+        if scale <= 1:
+            return 1.0
+        return 0.1 * mscale * math.log(scale) + 1.0
+
+    def _concat_and_cast_mha_k(
+        self,
+        k_nope: torch.Tensor,
+        k_pe: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        k_shape = (k_nope.shape[0], self.num_local_heads, self.qk_head_dim)
+
+        if (
+            _is_cuda
+            and _has_concat_mla_k
+            and (self.num_local_heads == 128)
+            and (self.qk_nope_head_dim == 128)
+            and (self.qk_rope_head_dim == 64)
+        ):
+            k = k_nope.new_empty(*k_shape)
+            concat_mla_k(k=k, k_nope=k_nope, k_rope=k_pe)
+            return k
+
+        if (
+            _is_cuda
+            and next_power_of_2(self.num_local_heads) == self.num_local_heads
+            and next_power_of_2(self.qk_nope_head_dim) == self.qk_nope_head_dim
+            and next_power_of_2(self.qk_rope_head_dim) == self.qk_rope_head_dim
+        ):
+            if (
+                self.current_attention_backend == "fa3"
+                and self.kv_cache_dtype != "auto"
+            ):
+                attn_dtype = forward_batch.token_to_kv_pool.dtype
+            else:
+                attn_dtype = k_nope.dtype
+            k = k_nope.new_empty(*k_shape, dtype=attn_dtype)
+            concat_and_cast_mha_k_triton(k, k_nope, k_pe)
+            return k
+
+        k = k_nope.new_empty(*k_shape)
+        k[..., : self.qk_nope_head_dim] = k_nope
+        k[..., self.qk_nope_head_dim :] = k_pe
+        return k
+
+    def _set_current_attention_backend(self, forward_batch: ForwardBatch) -> None:
+        if self._server_args is None:
+            self._server_args = get_global_server_args()
+        if forward_batch.forward_mode.is_decode_or_idle():
+            self.current_attention_backend = (
+                self._server_args.decode_attention_backend
+                or self._server_args.attention_backend
+            )
+        else:
+            self.current_attention_backend = (
+                self._server_args.prefill_attention_backend
+                or self._server_args.attention_backend
+            )
+
+    def _maybe_fp8_bmm(
+        self,
+        x_bmk: torch.Tensor,
+        w_bkn: torch.Tensor,
+        zero_allocator: Optional[BumpAllocator] = None,
+    ) -> torch.Tensor:
+        if (
+            _has_fp8_support
+            and w_bkn is not None
+            and w_bkn.dtype == torch.float8_e4m3fn
+        ):
+            x_val, x_scale = per_tensor_quant_mla_fp8(
+                x_bmk,
+                (
+                    torch.zeros((1,), dtype=torch.float32, device=x_bmk.device)
+                    if _is_cublas_ge_129
+                    else (
+                        zero_allocator.allocate(1)
+                        if zero_allocator
+                        else torch.zeros((1,), dtype=torch.float32, device=x_bmk.device)
+                    )
+                ),
+            )
+            w_scale = self.w_scale if self.w_scale is not None else 1.0
+            return bmm_fp8(x_val, w_bkn, x_scale, w_scale, torch.bfloat16)
+
+        return torch.bmm(x_bmk, w_bkn)
+
+    def _run_mha_prefill(
+        self,
+        positions: torch.Tensor,
+        q: torch.Tensor,
+        q_pe: torch.Tensor,
+        k_nope: torch.Tensor,
+        k_pe: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+
+        q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe)
+        q[..., self.qk_nope_head_dim :] = q_pe
+
+        forward_batch.token_to_kv_pool.set_mla_kv_buffer(
+            self.attn_mha,
+            forward_batch.out_cache_loc,
+            k_nope,
+            k_pe,
+        )
+
+        kv_a = k_nope.squeeze(1)
+        kv_expanded, _ = self.kv_b_proj(kv_a)
+        kv_expanded = kv_expanded.view(
+            -1, self.num_local_heads, self.qk_nope_head_dim + self.v_head_dim
+        )
+        k_nope_expanded = kv_expanded[..., : self.qk_nope_head_dim]
+        v = kv_expanded[..., self.qk_nope_head_dim :]
+
+        k = self._concat_and_cast_mha_k(k_nope_expanded, k_pe, forward_batch)
+
+        has_extend_prefix = forward_batch.extend_prefix_lens_cpu is not None and any(
+            forward_batch.extend_prefix_lens_cpu
+        )
+
+        self._set_current_attention_backend(forward_batch)
+        can_use_prefix_cache = not self._server_args.disable_radix_cache
+        do_prefix_merge = has_extend_prefix and can_use_prefix_cache
+
+        if do_prefix_merge and forward_batch.num_prefix_chunks is None:
+            if hasattr(forward_batch, "prepare_chunked_prefix_cache_info"):
+                forward_batch.prepare_chunked_prefix_cache_info(q.device)
+            else:
+                forward_batch.num_prefix_chunks = 0
+            if hasattr(forward_batch.attn_backend, "init_mha_chunk_metadata"):
+                forward_batch.attn_backend.init_mha_chunk_metadata(forward_batch)
+
+        forward_batch.set_attn_attend_prefix_cache(False)
+        forward_batch.mha_return_lse = do_prefix_merge
+        attn_output = self.attn_mha(q, k, v, forward_batch, save_kv_cache=False)
+
+        if do_prefix_merge and merge_state_v2 is not None:
+            attn_output, lse = attn_output
+            forward_batch.set_attn_attend_prefix_cache(True)
+            attn_output = self._chunked_prefix_attn_mha(
+                q=q,
+                accum_output=attn_output,
+                accum_lse=lse,
+                forward_batch=forward_batch,
+            )
+
+        forward_batch.set_attn_attend_prefix_cache(None)
+
+        attn_output = attn_output.reshape(-1, self.num_local_heads * self.v_head_dim)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+    def _chunked_prefix_attn_mha(
+        self,
+        q: torch.Tensor,
+        accum_output: torch.Tensor,
+        accum_lse: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        return DeepseekMHAForwardMixin._chunked_prefix_attn_mha(
+            self, q, accum_output, accum_lse, forward_batch
+        )
+
+    def _get_mla_kv_buffer(
+        self,
+        kv_indices: torch.Tensor,
+        dst_dtype: torch.dtype,
+        forward_batch: ForwardBatch,
+    ):
+        return DeepseekMHAForwardMixin._get_mla_kv_buffer(
+            self, kv_indices, dst_dtype, forward_batch
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        zero_allocator: Optional[BumpAllocator] = None,
+        llama_4_scaling: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        del llama_4_scaling
+        if hidden_states.shape[0] == 0:
+            return hidden_states
+
+        if self.q_lora_rank is None:
+            q, _ = self.q_proj(hidden_states)
+            latent_cache, _ = self.kv_a_proj_with_mqa(hidden_states)
+            k_nope = latent_cache[..., : self.kv_lora_rank]
+            k_nope = self.kv_a_layernorm(k_nope).unsqueeze(1)
+        else:
+            q_a, _ = self.q_a_proj(hidden_states)
+            q_a = self.q_a_layernorm(q_a)
+            q, _ = self.q_b_proj(q_a)
+            latent_cache, _ = self.kv_a_proj_with_mqa(hidden_states)
+            k_nope = latent_cache[..., : self.kv_lora_rank]
+            k_nope = self.kv_a_layernorm(k_nope).unsqueeze(1)
+
+        q = q.view(-1, self.num_local_heads, self.qk_head_dim)
+        q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
+        k_pe = latent_cache[..., self.kv_lora_rank :].unsqueeze(1)
+
+        if self._server_args is None:
+            self._server_args = get_global_server_args()
+        self._set_current_attention_backend(forward_batch)
+
+        forward_method = get_attn_forward_method(self._server_args, forward_batch)
+
+        if forward_method == AttnForwardMethod.MHA_PREFILL:
+            return self._run_mha_prefill(
+                positions=positions,
+                q=q,
+                q_pe=q_pe,
+                k_nope=k_nope,
+                k_pe=k_pe,
+                forward_batch=forward_batch,
+            )
+
+        if self.alt_stream is not None and get_is_capture_mode():
+            current_stream = torch.cuda.current_stream()
+            self.alt_stream.wait_stream(current_stream)
+
+            with torch.cuda.stream(self.alt_stream):
+                q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe)
+
+            q_nope_out = self._maybe_fp8_bmm(
+                q_nope.transpose(0, 1), self.w_kc, zero_allocator
+            )
+            q_nope_out = q_nope_out.transpose(0, 1)
+
+            current_stream.wait_stream(self.alt_stream)
+        else:
+            q_nope_out = self._maybe_fp8_bmm(
+                q_nope.transpose(0, 1), self.w_kc, zero_allocator
+            )
+            q_nope_out = q_nope_out.transpose(0, 1)
+
+            q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe)
+
+        if forward_method == AttnForwardMethod.MLA_SEPARATE_ROPE:
+            attn_output = self.attn_mqa(
+                q_nope_out,
+                k_nope,
+                k_nope,
+                forward_batch,
+                q_rope=q_pe,
+                k_rope=k_pe,
+            )
+        elif forward_method == AttnForwardMethod.MLA_CONCAT_ROPE:
+            q = torch.cat([q_nope_out, q_pe], dim=-1)
+            k = torch.cat([k_nope, k_pe], dim=-1)
+            attn_output = self.attn_mqa(
+                q,
+                k,
+                k_nope,
+                forward_batch,
+            )
+        else:
+            raise ValueError(f"Unknown forward method: {forward_method}")
+        attn_output = attn_output.view(-1, self.num_local_heads, self.kv_lora_rank)
+
+        attn_bmm_output = self._maybe_fp8_bmm(
+            attn_output.transpose(0, 1), self.w_vc, zero_allocator
+        )
+        attn_bmm_output = attn_bmm_output.transpose(0, 1).flatten(1, 2)
+
+        output, _ = self.o_proj(attn_bmm_output)
+        return output
+
+    def forward_prepare(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        zero_allocator: Optional[BumpAllocator] = None,
+        llama_4_scaling: Optional[torch.Tensor] = None,
+    ) -> Tuple[Optional[torch.Tensor], ForwardBatch, Optional[Tuple]]:
+        del llama_4_scaling
+        if hidden_states.shape[0] == 0:
+            return hidden_states, forward_batch, None
+
+        if self.q_lora_rank is None:
+            # Dual-stream parallel Q and KV projections
+            if self.alt_stream is not None and get_is_capture_mode():
+                current_stream = torch.cuda.current_stream()
+                self.alt_stream.wait_stream(current_stream)
+                with torch.cuda.stream(self.alt_stream):
+                    latent_cache, _ = self.kv_a_proj_with_mqa(hidden_states)
+                q, _ = self.q_proj(hidden_states)
+                current_stream.wait_stream(self.alt_stream)
+            else:
+                q, _ = self.q_proj(hidden_states)
+                latent_cache, _ = self.kv_a_proj_with_mqa(hidden_states)
+            k_nope = latent_cache[..., : self.kv_lora_rank]
+            k_nope = self.kv_a_layernorm(k_nope).unsqueeze(1)
+        else:
+            # For q_lora_rank path, overlap q_a_proj with kv_a_proj
+            if self.alt_stream is not None and get_is_capture_mode():
+                current_stream = torch.cuda.current_stream()
+                self.alt_stream.wait_stream(current_stream)
+                with torch.cuda.stream(self.alt_stream):
+                    latent_cache, _ = self.kv_a_proj_with_mqa(hidden_states)
+                q_a, _ = self.q_a_proj(hidden_states)
+                current_stream.wait_stream(self.alt_stream)
+            else:
+                q_a, _ = self.q_a_proj(hidden_states)
+                latent_cache, _ = self.kv_a_proj_with_mqa(hidden_states)
+            q_a = self.q_a_layernorm(q_a)
+            q, _ = self.q_b_proj(q_a)
+            k_nope = latent_cache[..., : self.kv_lora_rank]
+            k_nope = self.kv_a_layernorm(k_nope).unsqueeze(1)
+
+        q = q.view(-1, self.num_local_heads, self.qk_head_dim)
+        q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
+        k_pe = latent_cache[..., self.kv_lora_rank :].unsqueeze(1)
+
+        if self._server_args is None:
+            self._server_args = get_global_server_args()
+        self._set_current_attention_backend(forward_batch)
+        forward_method = get_attn_forward_method(self._server_args, forward_batch)
+
+        if forward_method == AttnForwardMethod.MHA_PREFILL:
+            output = self._run_mha_prefill(
+                positions=positions,
+                q=q,
+                q_pe=q_pe,
+                k_nope=k_nope,
+                k_pe=k_pe,
+                forward_batch=forward_batch,
+            )
+            return output, forward_batch, None
+
+        # Parallel Absorption + RoPE on separate streams
+        # - Stream 1 (main): Absorption (q_nope @ w_kc)
+        # - Stream 2 (alt): RoPE (q_pe, k_pe)
+        if self.alt_stream is not None and get_is_capture_mode():
+            current_stream = torch.cuda.current_stream()
+            self.alt_stream.wait_stream(current_stream)
+
+            # RoPE on alt stream
+            with torch.cuda.stream(self.alt_stream):
+                q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe)
+
+            # Absorption on main stream (runs in parallel with RoPE)
+            q_nope_out = self._maybe_fp8_bmm(
+                q_nope.transpose(0, 1), self.w_kc, zero_allocator
+            )
+            q_nope_out = q_nope_out.transpose(0, 1)
+
+            current_stream.wait_stream(self.alt_stream)
+        else:
+            q_nope_out = self._maybe_fp8_bmm(
+                q_nope.transpose(0, 1), self.w_kc, zero_allocator
+            )
+            q_nope_out = q_nope_out.transpose(0, 1)
+
+            q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe)
+
+        inner_state = (q_nope_out, k_nope, q_pe, k_pe, forward_batch, zero_allocator)
+        return None, forward_batch, inner_state
+
+    def forward_core(
+        self,
+        intermediate_state: Tuple[
+            Optional[torch.Tensor], ForwardBatch, Optional[Tuple]
+        ],
+    ) -> torch.Tensor:
+        hidden_states, forward_batch, inner_state = intermediate_state
+
+        if inner_state is None:
+            return hidden_states
+
+        q_nope_out, k_nope, q_pe, k_pe, forward_batch, zero_allocator = inner_state
+
+        if self._server_args is None:
+            self._server_args = get_global_server_args()
+        self._set_current_attention_backend(forward_batch)
+
+        forward_method = get_attn_forward_method(self._server_args, forward_batch)
+
+        if forward_method == AttnForwardMethod.MLA_SEPARATE_ROPE:
+            attn_output = self.attn_mqa(
+                q_nope_out,
+                k_nope,
+                k_nope,
+                forward_batch,
+                q_rope=q_pe,
+                k_rope=k_pe,
+            )
+        else:
+            q = torch.cat([q_nope_out, q_pe], dim=-1)
+            k = torch.cat([k_nope, k_pe], dim=-1)
+            attn_output = self.attn_mqa(
+                q,
+                k,
+                k_nope,
+                forward_batch,
+            )
+        attn_output = attn_output.view(-1, self.num_local_heads, self.kv_lora_rank)
+
+        attn_bmm_output = self._maybe_fp8_bmm(
+            attn_output.transpose(0, 1), self.w_vc, zero_allocator
+        )
+        attn_bmm_output = attn_bmm_output.transpose(0, 1).flatten(1, 2)
+
+        output, _ = self.o_proj(attn_bmm_output)
+        return output
+
+    def prepare_qkv_latent(
+        self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
+    ) -> torch.Tensor:
+        del forward_batch
+        latent_cache, _ = self.kv_a_proj_with_mqa(hidden_states)
+        return latent_cache
+
+
+class SarvamMoEMLADecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.config = config
+        self.layer_id = layer_id
+
+        if hasattr(config, "rope_parameters"):
+            rope_theta = config.rope_parameters.get("rope_theta")
+            rope_type = config.rope_parameters.get("rope_type")
+            rope_scaling = config.rope_parameters if rope_type != "default" else None
+        else:
+            rope_theta = getattr(config, "rope_theta", 10000)
+            rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+
+        self.self_attn = SarvamMoEMLAAttention(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+            alt_stream=alt_stream,
+        )
+
+        first_k_dense = getattr(config, "first_k_dense_replace", 1)
+        moe_layer_freq = getattr(config, "moe_layer_freq", 1)
+        has_moe = getattr(config, "num_experts", None) is not None
+        self.is_layer_sparse = (
+            has_moe
+            and layer_id >= first_k_dense
+            and (layer_id - first_k_dense) % moe_layer_freq == 0
+        )
+        is_previous_layer_sparse = (
+            has_moe
+            and layer_id > 0
+            and (layer_id - 1) >= first_k_dense
+            and (layer_id - 1 - first_k_dense) % moe_layer_freq == 0
+        )
+        is_next_layer_sparse = (
+            has_moe
+            and layer_id < config.num_hidden_layers - 1
+            and (layer_id + 1) >= first_k_dense
+            and (layer_id + 1 - first_k_dense) % moe_layer_freq == 0
+        )
+
+        if self.is_layer_sparse:
+            self.mlp = SarvamMoESparseMoeBlock(
+                config=config,
+                layer_id=layer_id,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+                alt_stream=alt_stream,
+            )
+        else:
+            if enable_moe_dense_fully_dp():
+                mlp_tp_rank, mlp_tp_size = 0, 1
+            else:
+                mlp_tp_rank, mlp_tp_size = None, None
+            self.mlp = SarvamMoEMLP(
+                hidden_size=self.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+                reduce_results=False,
+                tp_rank=mlp_tp_rank,
+                tp_size=mlp_tp_size,
+            )
+
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+        self.attn_tp_size = get_attention_tp_size()
+        self.layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=layer_id,
+            num_layers=config.num_hidden_layers,
+            is_layer_sparse=self.is_layer_sparse,
+            is_previous_layer_sparse=is_previous_layer_sparse,
+            is_next_layer_sparse=is_next_layer_sparse,
+        )
+        self.layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.layer_scatter_modes,
+            input_layernorm=self.input_layernorm,
+            post_attention_layernorm=self.post_attention_layernorm,
+            qkv_latent_func=self.self_attn.prepare_qkv_latent,
+            allow_reduce_scatter=True,
+            is_last_layer=(layer_id == config.num_hidden_layers - 1),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        hidden_states, residual = self.layer_communicator.prepare_attn(
+            hidden_states, residual, forward_batch
+        )
+        if hidden_states.shape[0] != 0:
+            hidden_states = self.self_attn(
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+            )
+        hidden_states, residual = self.layer_communicator.prepare_mlp(
+            hidden_states, residual, forward_batch
+        )
+        should_allreduce_fusion = (
+            self.layer_communicator.should_fuse_mlp_allreduce_with_next_layer(
+                forward_batch
+            )
+        )
+        use_reduce_scatter = self.layer_communicator.should_use_reduce_scatter(
+            forward_batch
+        )
+        hidden_states = self.mlp(
+            hidden_states, forward_batch, should_allreduce_fusion, use_reduce_scatter
+        )
+        if (
+            not self.is_layer_sparse
+            and self.attn_tp_size > 1
+            and not use_reduce_scatter
+            and not should_allreduce_fusion
+        ):
+            hidden_states = tensor_model_parallel_all_reduce(hidden_states)
+        if should_allreduce_fusion:
+            hidden_states._sglang_needs_allreduce_fusion = True
+        else:
+            hidden_states, residual = self.layer_communicator.postprocess_layer(
+                hidden_states, residual, forward_batch
+            )
+        return hidden_states, residual
+
+
+class SarvamMLAModel(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.pp_group = get_pp_group()
+        self.alt_stream = torch.cuda.Stream() if _is_cuda else None
+
+        if self.pp_group.is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("embed_tokens", prefix),
+                enable_tp=not is_dp_attention_enabled(),
+            )
+        else:
+            self.embed_tokens = nn.Identity()
+
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: SarvamMoEMLADecoderLayer(
+                config=config,
+                quant_config=quant_config,
+                layer_id=idx,
+                prefix=prefix,
+                alt_stream=self.alt_stream,
+            ),
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix="model.layers",
+        )
+
+        if self.pp_group.is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = nn.Identity()
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> torch.Tensor:
+        if self.pp_group.is_first_rank:
+            if input_embeds is None:
+                hidden_states = self.embed_tokens(input_ids)
+            else:
+                hidden_states = input_embeds
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+
+        for i in range(self.start_layer, self.end_layer):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions, hidden_states, forward_batch, residual
+            )
+
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
+
+        if hidden_states.shape[0] != 0:
+            if residual is None:
+                hidden_states = self.norm(hidden_states)
+            else:
+                hidden_states, _ = self.norm(hidden_states, residual)
+
+        return hidden_states
+
+
+class SarvamMLAForCausalLM(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self._remap_config(config)
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = SarvamMLAModel(config, quant_config, add_prefix("model", prefix))
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("lm_head", prefix),
+            use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    @staticmethod
+    def _remap_config(config: PretrainedConfig) -> None:
+        defaults = {
+            "first_k_dense_replace": 1,
+            "moe_layer_freq": 1,
+            "hidden_act": "silu",
+            "tie_word_embeddings": False,
+            "n_group": 1,
+            "topk_group": 1,
+            "router_dtype": "fp32",
+            "routed_scaling_factor": 2.5,
+            "score_function": "sigmoid",
+            "norm_topk_prob": True,
+            "topk_method": "noaux_tc",
+        }
+        for attr, default in defaults.items():
+            if not hasattr(config, attr):
+                setattr(config, attr, default)
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.embed_tokens
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> LogitsProcessorOutput:
+        hidden_states = self.model(
+            input_ids, positions, forward_batch, input_embeds, pp_proxy_tensors
+        )
+        if self.pp_group.is_last_rank:
+            return self.logits_processor(
+                input_ids, hidden_states, self.lm_head, forward_batch
+            )
+        return hidden_states
+
+    @torch.no_grad()
+    def forward_split_prefill(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        split_interval: Tuple[int, int],
+        input_embeds: torch.Tensor = None,
+    ) -> Optional[LogitsProcessorOutput]:
+        start, end = split_interval
+        if start == 0:
+            if input_embeds is None:
+                forward_batch.hidden_states = self.model.embed_tokens(input_ids)
+            else:
+                forward_batch.hidden_states = input_embeds
+            forward_batch.residual = None
+
+        for i in range(start, end):
+            with get_global_expert_distribution_recorder().with_current_layer(i):
+                layer = self.model.layers[i]
+                forward_batch.hidden_states, forward_batch.residual = layer(
+                    positions,
+                    forward_batch.hidden_states,
+                    forward_batch,
+                    forward_batch.residual,
+                )
+
+        if end == self.model.config.num_hidden_layers:
+            if forward_batch.residual is None:
+                hidden_states = self.model.norm(forward_batch.hidden_states)
+            else:
+                hidden_states, _ = self.model.norm(
+                    forward_batch.hidden_states, forward_batch.residual
+                )
+            forward_batch.hidden_states = hidden_states
+            return self.logits_processor(
+                input_ids, forward_batch.hidden_states, self.lm_head, forward_batch
+            )
+        return None
+
+    @classmethod
+    def get_model_config_for_expert_location(cls, config):
+        return ModelConfigForExpertLocation(
+            num_layers=config.num_hidden_layers,
+            num_logical_experts=config.num_experts,
+            num_groups=getattr(config, "n_group", None),
+        )
+
+    def load_weights(
+        self,
+        weights: Iterable[Tuple[str, torch.Tensor]],
+        is_nextn: bool = False,
+    ) -> None:
+        del is_nextn
+        stacked_params_mapping = [
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.num_experts,
+        )
+        params_dict = dict(self.named_parameters())
+
+        for name, loaded_weight in weights:
+            layer_id = get_layer_id(name)
+            if layer_id is not None and (
+                layer_id < self.start_layer or layer_id >= self.end_layer
+            ):
+                continue
+
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            if ".mlp.gate.e_score_correction_bias" in name:
+                name = name.replace(
+                    ".mlp.gate.e_score_correction_bias", ".mlp.e_score_correction_bias"
+                )
+
+            is_stacked = False
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name or "mlp.experts" in name:
+                    continue
+                mapped_name = name.replace(weight_name, param_name)
+                if mapped_name.endswith(".bias") and mapped_name not in params_dict:
+                    continue
+                if mapped_name not in params_dict:
+                    continue
+                param = params_dict[mapped_name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight, shard_id)
+                is_stacked = True
+                break
+            if is_stacked:
+                continue
+
+            is_expert = False
+            for param_name, weight_name, expert_id, shard_id in expert_params_mapping:
+                if weight_name not in name:
+                    continue
+                mapped_name = name.replace(weight_name, param_name)
+                if mapped_name not in params_dict:
+                    continue
+                param = params_dict[mapped_name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(
+                    param,
+                    loaded_weight,
+                    mapped_name,
+                    shard_id=shard_id,
+                    expert_id=expert_id,
+                )
+                is_expert = True
+                break
+            if is_expert:
+                continue
+
+            if name.endswith(".bias") and name not in params_dict:
+                continue
+            if name not in params_dict:
+                continue
+            param = params_dict[name]
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            weight_loader(param, loaded_weight)
+
+        self._set_mla_wkc_wvc()
+        if not hasattr(self, "routed_experts_weights_of_layer"):
+            self.routed_experts_weights_of_layer = {
+                layer_id: self.model.layers[layer_id].mlp.get_moe_weights()
+                for layer_id in range(self.start_layer, self.end_layer)
+                if isinstance(self.model.layers[layer_id].mlp, SarvamMoESparseMoeBlock)
+            }
+
+    def _set_mla_wkc_wvc(self) -> None:
+        for layer_id in range(self.start_layer, self.end_layer):
+            layer = self.model.layers[layer_id]
+            self_attn = layer.self_attn
+            if not hasattr(self_attn, "kv_b_proj") or self_attn.kv_b_proj is None:
+                continue
+
+            w = self_attn.kv_b_proj.weight.data
+            weight_scale = None
+            if w.dtype in (torch.float8_e4m3fn, torch.float8_e4m3fnuz):
+                if (
+                    hasattr(self_attn.kv_b_proj, "weight_scale")
+                    and self_attn.kv_b_proj.weight_scale is not None
+                ):
+                    weight_scale = self_attn.kv_b_proj.weight_scale
+                elif (
+                    hasattr(self_attn.kv_b_proj, "weight_scale_inv")
+                    and self_attn.kv_b_proj.weight_scale_inv is not None
+                ):
+                    weight_scale = self_attn.kv_b_proj.weight_scale_inv
+                elif (
+                    hasattr(self_attn.kv_b_proj, "scale")
+                    and self_attn.kv_b_proj.scale is not None
+                ):
+                    weight_scale = self_attn.kv_b_proj.scale
+
+            w_reshaped = w.unflatten(
+                0,
+                (
+                    self_attn.num_local_heads,
+                    self_attn.qk_nope_head_dim + self_attn.v_head_dim,
+                ),
+            )
+            w_kc, w_vc = w_reshaped.split(
+                [self_attn.qk_nope_head_dim, self_attn.v_head_dim], dim=1
+            )
+            self_attn.w_kc = bind_or_assign(
+                self_attn.w_kc, w_kc.transpose(1, 2).contiguous().transpose(1, 2)
+            )
+            self_attn.w_vc = bind_or_assign(
+                self_attn.w_vc, w_vc.contiguous().transpose(1, 2)
+            )
+            if weight_scale is not None:
+                self_attn.w_scale = weight_scale
+
+
+class SarvamMoEForCausalLM(BailingMoEForCausalLM):
+
+    @torch.no_grad()
+    def forward_split_prefill(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        split_interval: Tuple[int, int],
+        input_embeds: torch.Tensor = None,
+    ) -> Optional[LogitsProcessorOutput]:
+        start, end = split_interval
+
+        if start == 0:
+            if input_embeds is None:
+                forward_batch.hidden_states = self.model.word_embeddings(input_ids)
+            else:
+                forward_batch.hidden_states = input_embeds
+            forward_batch.residual = None
+
+        for i in range(start, end):
+            with get_global_expert_distribution_recorder().with_current_layer(i):
+                layer = self.model.layers[i]
+                forward_batch.hidden_states, forward_batch.residual = layer(
+                    positions,
+                    forward_batch.hidden_states,
+                    forward_batch,
+                    forward_batch.residual,
+                )
+
+        if end == self.model.config.num_hidden_layers:
+            if forward_batch.residual is None:
+                hidden_states = self.model.norm(forward_batch.hidden_states)
+            else:
+                hidden_states, _ = self.model.norm(
+                    forward_batch.hidden_states, forward_batch.residual
+                )
+            forward_batch.hidden_states = hidden_states
+
+            return self.logits_processor(
+                input_ids, forward_batch.hidden_states, self.lm_head, forward_batch
+            )
+
+        return None
+
+
+EntryClass = [SarvamMLAForCausalLM, SarvamMoEForCausalLM]
diff --git a/sglang/python/sglang/srt/models/sdar_moe.py b/sglang/python/sglang/srt/models/sdar_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1b20f67c1ba2629357061f93bdec7f36b9b50a0
--- /dev/null
+++ b/sglang/python/sglang/srt/models/sdar_moe.py
@@ -0,0 +1,746 @@
+# coding=utf-8
+"""
+SGLang SDARMoeModelLM (block diffusion / dLLM-style forward) with MoE MLP.
+"""
+
+import logging
+from typing import Iterable, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import (
+    get_moe_expert_parallel_world_size,
+    get_pp_group,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation
+from sglang.srt.eplb.expert_location_dispatch import ExpertLocationDispatchInfo
+from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe import (
+    get_moe_a2a_backend,
+    should_use_flashinfer_cutlass_moe_fp4_allgather,
+)
+from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
+from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.moe.utils import (
+    RoutingMethodType,
+    filter_moe_weight_param_global_expert,
+)
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import AttentionType, RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer, get_layer_id
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.models.utils import (
+    apply_qk_norm,
+    create_fused_set_kv_buffer_arg,
+    enable_fused_set_kv_buffer,
+)
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import LazyValue, add_prefix, is_cuda, make_layers
+
+logger = logging.getLogger(__name__)
+_is_cuda = is_cuda()
+
+
+class SDARMoeSparseMoeBlock(nn.Module):
+    """
+    Qwen3MoE-style sparse MoE block:
+      - gate: ReplicatedLinear(hidden, num_experts)
+      - topk routing: TopK
+      - experts: get_moe_impl_class(quant_config)(...)
+    """
+
+    def __init__(
+        self,
+        layer_id: int,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.layer_id = layer_id
+        self.tp_size = get_tensor_model_parallel_world_size()
+
+        if self.tp_size > config.num_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} > num_experts {config.num_experts}."
+            )
+
+        self.topk = TopK(
+            top_k=config.num_experts_per_tok,
+            renormalize=config.norm_topk_prob,
+            use_grouped_topk=False,
+            layer_id=layer_id,
+        )
+
+        self.experts = get_moe_impl_class(quant_config)(
+            num_experts=config.num_experts
+            + get_global_server_args().ep_num_redundant_experts,
+            top_k=config.num_experts_per_tok,
+            layer_id=layer_id,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            quant_config=quant_config,
+            prefix=add_prefix("experts", prefix),
+            routing_method_type=RoutingMethodType.Renormalize,
+        )
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            config.num_experts,
+            bias=False,
+            quant_config=None,
+            prefix=add_prefix("gate", prefix),
+        )
+
+        # Deepep / FuseEP support
+        if get_moe_a2a_backend().is_deepep():
+            self.ep_size = get_moe_expert_parallel_world_size()
+            self.num_experts = (
+                config.num_experts + get_global_server_args().ep_num_redundant_experts
+            )
+            self.top_k = config.num_experts_per_tok
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: Optional[ForwardBatch] = None,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+    ) -> torch.Tensor:
+        if (
+            not get_moe_a2a_backend().is_deepep()
+            and not get_moe_a2a_backend().is_ascend_fuseep()
+        ):
+            return self.forward_normal(
+                hidden_states,
+                should_allreduce_fusion=should_allreduce_fusion,
+                use_reduce_scatter=use_reduce_scatter,
+            )
+        else:
+            assert forward_batch is not None, "deepep/fuseep MoE needs forward_batch"
+            return self.forward_deepep(hidden_states, forward_batch)
+
+    def forward_normal(
+        self,
+        hidden_states: torch.Tensor,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+    ) -> torch.Tensor:
+        num_tokens, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+
+        router_logits, _ = self.gate(hidden_states)  # (T, E)
+        topk_output = self.topk(hidden_states, router_logits)
+        out = self.experts(hidden_states, topk_output)  # (T, H)
+
+        # TP all-reduce (unless fused / reduce_scatter / fp4 allgather path)
+        if (
+            self.tp_size > 1
+            and not should_allreduce_fusion
+            and not use_reduce_scatter
+            and not should_use_flashinfer_cutlass_moe_fp4_allgather()
+        ):
+            out = tensor_model_parallel_all_reduce(out)
+
+        return out.view(num_tokens, hidden_dim)
+
+    def forward_deepep(self, hidden_states: torch.Tensor, forward_batch: ForwardBatch):
+        if hidden_states.shape[0] > 0:
+            router_logits, _ = self.gate(hidden_states)
+            topk_output = self.topk(
+                hidden_states,
+                router_logits,
+                num_token_non_padded=forward_batch.num_token_non_padded,
+                expert_location_dispatch_info=ExpertLocationDispatchInfo.init_new(
+                    layer_id=self.layer_id
+                ),
+            )
+        else:
+            topk_output = self.topk.empty_topk_output(hidden_states.device)
+
+        out = self.experts(hidden_states=hidden_states, topk_output=topk_output)
+        return out
+
+    def get_moe_weights(self):
+        return [
+            p.data
+            for name, p in self.experts.named_parameters()
+            if name not in ["correction_bias"]
+            and filter_moe_weight_param_global_expert(
+                name, p, self.experts.num_local_experts
+            )
+        ]
+
+
+class SDARMoeAttention(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        reduce_results: bool = True,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ):
+        super().__init__()
+        self.layer_id = layer_id
+        self.hidden_size = config.hidden_size
+        self.total_num_heads = config.num_attention_heads
+
+        attn_tp_rank = get_attention_tp_rank()
+        attn_tp_size = get_attention_tp_size()
+
+        assert self.total_num_heads % attn_tp_size == 0
+        self.num_heads = self.total_num_heads // attn_tp_size
+
+        self.total_num_kv_heads = config.num_key_value_heads
+        if self.total_num_kv_heads >= attn_tp_size:
+            assert self.total_num_kv_heads % attn_tp_size == 0
+        else:
+            assert attn_tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // attn_tp_size)
+
+        self.head_dim = getattr(
+            config, "head_dim", self.hidden_size // self.total_num_heads
+        )
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scale = self.head_dim**-0.5
+
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=getattr(config, "attention_bias", False),
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            self.hidden_size,
+            bias=getattr(config, "attention_bias", False),
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.q_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+        self.k_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+        rope_theta = getattr(config, "rope_theta", 10000.0)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_pos = getattr(config, "max_position_embeddings", 32768)
+        self.rotary_dim = self.head_dim
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.rotary_dim,
+            max_position=max_pos,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
+
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scale,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            attn_type=AttentionType.ENCODER_ONLY,
+            prefix=add_prefix("attn", prefix),
+        )
+        self.alt_stream = alt_stream
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        if get_global_server_args().rl_on_policy_target is not None:
+            hidden_states = hidden_states.bfloat16()
+
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = apply_qk_norm(
+            q=q,
+            k=k,
+            q_norm=self.q_norm,
+            k_norm=self.k_norm,
+            head_dim=self.head_dim,
+            alt_stream=self.alt_stream,
+        )
+        q, k = self.rotary_emb(
+            positions,
+            q,
+            k,
+            fused_set_kv_buffer_arg=(
+                create_fused_set_kv_buffer_arg(
+                    value=v,
+                    layer=self.attn,
+                    forward_batch=forward_batch,
+                )
+                if enable_fused_set_kv_buffer(forward_batch)
+                else None
+            ),
+        )
+
+        if get_global_server_args().rl_on_policy_target is not None:
+            q = q.to(torch.bfloat16)
+            k = k.to(torch.bfloat16)
+
+        context = self.attn(
+            q,
+            k,
+            v,
+            forward_batch,
+            save_kv_cache=not enable_fused_set_kv_buffer(forward_batch),
+        )
+        out, _ = self.o_proj(context)
+        return out
+
+
+class SDARMoeBlock(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.layer_id = layer_id
+
+        norm_kwargs = (
+            dict(
+                weight_dtype=torch.float32,
+                cast_x_before_out_mul=True,
+                override_orig_dtype=torch.float32,
+                fp32_residual=True,
+            )
+            if get_global_server_args().rl_on_policy_target is not None
+            else {}
+        )
+        self.input_layernorm = RMSNorm(
+            self.hidden_size, eps=config.rms_norm_eps, **norm_kwargs
+        )
+        self.post_attention_layernorm = RMSNorm(
+            self.hidden_size, eps=config.rms_norm_eps, **norm_kwargs
+        )
+
+        self.self_attn = SDARMoeAttention(
+            config=config,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            reduce_results=False,
+            prefix=add_prefix("self_attn", prefix),
+            alt_stream=alt_stream,
+        )
+
+        self.mlp = SDARMoeSparseMoeBlock(
+            layer_id=layer_id,
+            config=config,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+
+        self.layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=layer_id,
+            num_layers=config.num_hidden_layers,
+            is_layer_sparse=True,
+            is_previous_layer_sparse=True,
+            is_next_layer_sparse=True,
+        )
+
+        self.layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.layer_scatter_modes,
+            input_layernorm=self.input_layernorm,
+            post_attention_layernorm=self.post_attention_layernorm,
+            allow_reduce_scatter=True,
+            is_last_layer=(layer_id == config.num_hidden_layers - 1),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+
+        hidden_states, residual = self.layer_communicator.prepare_attn(
+            hidden_states, residual, forward_batch
+        )
+
+        if hidden_states.shape[0] != 0:
+            hidden_states = self.self_attn(
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+            )
+
+        hidden_states, residual = self.layer_communicator.prepare_mlp(
+            hidden_states, residual, forward_batch
+        )
+
+        should_allreduce_fusion = (
+            self.layer_communicator.should_fuse_mlp_allreduce_with_next_layer(
+                forward_batch
+            )
+        )
+        use_reduce_scatter = self.layer_communicator.should_use_reduce_scatter(
+            forward_batch
+        )
+
+        hidden_states = self.mlp(
+            hidden_states,
+            forward_batch=forward_batch,
+            should_allreduce_fusion=should_allreduce_fusion,
+            use_reduce_scatter=use_reduce_scatter,
+        )
+
+        if should_allreduce_fusion:
+            hidden_states._sglang_needs_allreduce_fusion = True
+        else:
+            hidden_states, residual = self.layer_communicator.postprocess_layer(
+                hidden_states, residual, forward_batch
+            )
+
+        return hidden_states, residual
+
+
+class SDARMoeModel(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ):
+        super().__init__()
+        self.config = config
+        self.vocab_size = config.vocab_size
+        self.embed_dim = config.hidden_size
+        self.pp_group = get_pp_group()
+
+        if self.pp_group.is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                self.vocab_size,
+                self.embed_dim,
+                quant_config=quant_config,
+                use_attn_tp_group=is_dp_attention_enabled(),
+                prefix=add_prefix("embed_tokens", prefix),
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: SDARMoeBlock(
+                config=config,
+                layer_id=idx,
+                quant_config=quant_config,
+                prefix=prefix,
+                alt_stream=alt_stream,
+            ),
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix=add_prefix("layers", prefix),
+        )
+
+        if self.pp_group.is_last_rank:
+            norm_kwargs = (
+                dict(
+                    weight_dtype=torch.float32,
+                    cast_x_before_out_mul=True,
+                    override_orig_dtype=torch.float32,
+                    fp32_residual=True,
+                )
+                if get_global_server_args().rl_on_policy_target is not None
+                else {}
+            )
+            self.norm = RMSNorm(self.embed_dim, eps=config.rms_norm_eps, **norm_kwargs)
+        else:
+            self.norm = PPMissingLayer(return_tuple=True)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[torch.Tensor, PPProxyTensors]:
+        if self.pp_group.is_first_rank:
+            hidden_states = (
+                self.embed_tokens(input_ids) if input_embeds is None else input_embeds
+            )
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors.get("residual", None)
+
+        for i in range(self.start_layer, self.end_layer):
+            layer = self.layers[i]
+            with get_global_expert_distribution_recorder().with_current_layer(i):
+                hidden_states, residual = layer(
+                    positions, hidden_states, forward_batch, residual
+                )
+
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
+
+        if not forward_batch.forward_mode.is_idle():
+            hidden_states, residual = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class SDARMoeForCausalLM(nn.Module):
+    fall_back_to_pt_during_load = False
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.pp_group = get_pp_group()
+        assert self.pp_group.world_size == 1, (
+            f"SDARMoeForCausalLM does not support pipeline parallel (pp_size={self.pp_group.world_size}). "
+            "Please set pp_size=1."
+        )
+
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.quant_config = quant_config
+        alt_stream = torch.cuda.Stream() if _is_cuda else None
+
+        self.model = SDARMoeModel(
+            config,
+            quant_config=quant_config,
+            prefix=add_prefix("model", ""),
+            alt_stream=alt_stream,
+        )
+
+        if self.pp_group.is_last_rank:
+            tp_size = get_tensor_model_parallel_world_size()
+            if (
+                self.pp_group.world_size == 1
+                and getattr(config, "tie_word_embeddings", False)
+                and tp_size == 1
+            ):
+                self.lm_head = self.model.embed_tokens
+            else:
+                self.lm_head = ParallelLMHead(
+                    config.vocab_size,
+                    config.hidden_size,
+                    quant_config=quant_config,
+                    use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
+                    prefix=add_prefix("lm_head", prefix),
+                )
+        else:
+            self.lm_head = PPMissingLayer()
+
+        self.logits_processor = LogitsProcessor(config, return_full_logits=True)
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: Optional[torch.Tensor] = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids=input_ids,
+            positions=positions,
+            forward_batch=forward_batch,
+            input_embeds=input_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+        if self.pp_group.is_last_rank:
+            return self.logits_processor(
+                input_ids, hidden_states, self.lm_head, forward_batch
+            )
+        return hidden_states
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.num_experts,
+        )
+
+        if not hasattr(self, "_cached_params_dict"):
+            self._cached_params_dict = dict(self.named_parameters())
+        params_dict = self._cached_params_dict
+
+        for name, loaded_weight in weights:
+            if not name.startswith("model.") and (
+                name.startswith("layers.")
+                or name.startswith("embed_tokens.")
+                or name.startswith("norm.")
+            ):
+                name = add_prefix(name, "model")
+
+            if name == "model.embed_tokens.weight":
+                if self.pp_group.is_last_rank and getattr(
+                    self.config, "tie_word_embeddings", False
+                ):
+                    if "lm_head.weight" in params_dict:
+                        param = params_dict["lm_head.weight"]
+                        weight_loader = getattr(
+                            param, "weight_loader", default_weight_loader
+                        )
+                        weight_loader(param, loaded_weight)
+
+            layer_id = get_layer_id(name)
+            if (
+                layer_id is not None
+                and hasattr(self.model, "start_layer")
+                and (
+                    layer_id < self.model.start_layer
+                    or layer_id >= self.model.end_layer
+                )
+            ):
+                continue
+
+            if "rotary_emb.inv_freq" in name or "projector" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                continue
+
+            if "scale" in name:
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                if "mlp.experts" in name:
+                    continue
+
+                name2 = name.replace(weight_name, param_name)
+                if name2.endswith(".bias") and name2 not in params_dict:
+                    continue
+                if name2 not in params_dict:
+                    continue
+
+                param = params_dict[name2]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                is_expert_weight = False
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    is_expert_weight = True
+
+                    name2 = name.replace(weight_name, param_name)
+                    if name2 not in params_dict:
+                        continue
+
+                    param = params_dict[name2]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name2,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                else:
+                    if is_expert_weight:
+                        continue
+
+                    # 3) regular params
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    if name not in params_dict:
+                        continue
+
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+
+        if not hasattr(self, "routed_experts_weights_of_layer"):
+            self.routed_experts_weights_of_layer = LazyValue(
+                lambda: {
+                    lid: self.model.layers[lid].mlp.get_moe_weights()
+                    for lid in range(self.start_layer, self.end_layer)
+                    if isinstance(self.model.layers[lid].mlp, SDARMoeSparseMoeBlock)
+                }
+            )
+
+    @classmethod
+    def get_model_config_for_expert_location(cls, config):
+        return ModelConfigForExpertLocation(
+            num_layers=config.num_hidden_layers,
+            num_logical_experts=config.num_experts,
+            num_groups=None,
+        )
+
+
+EntryClass = SDARMoeForCausalLM
diff --git a/sglang/python/sglang/srt/models/siglip.py b/sglang/python/sglang/srt/models/siglip.py
new file mode 100644
index 0000000000000000000000000000000000000000..34afe07f8e4d918c0affe822f1268bf996a169b2
--- /dev/null
+++ b/sglang/python/sglang/srt/models/siglip.py
@@ -0,0 +1,281 @@
+# Adapted from
+# https://github.com/huggingface/transformers/blob/af9b2eaa54c150741f298d6db939af6328e1dc38/src/transformers/models/siglip/modeling_siglip.py
+
+from functools import partial
+from typing import Optional, Type, Union
+
+import torch
+import torch.nn as nn
+from transformers import SiglipVisionConfig
+
+from sglang.srt.layers.activation import QuickGELU
+from sglang.srt.layers.attention.vision import VisionAttention
+from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from sglang.srt.utils import add_prefix
+
+
+# Adapted from transformers.models.siglip.modeling_siglip.SiglipVisionTransformer
+class SiglipVisionEmbeddings(nn.Module):
+
+    def __init__(self, config: SiglipVisionConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+
+        self.patch_embedding = nn.Conv2d(
+            in_channels=config.num_channels,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+            padding="valid",
+        )
+
+        self.num_patches = (self.image_size // self.patch_size) ** 2
+        self.num_positions = self.num_patches
+        self.position_embedding = VocabParallelEmbedding(
+            self.num_positions, self.embed_dim
+        )
+        self.register_buffer(
+            "position_ids",
+            torch.arange(self.num_positions).expand((1, -1)),
+            persistent=False,
+        )
+
+    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        target_dtype = self.patch_embedding.weight.dtype
+        patch_embeds = self.patch_embedding(
+            pixel_values.to(dtype=target_dtype)
+        )  # shape = [*, width, grid, grid]
+        embeddings = patch_embeds.flatten(2).transpose(1, 2)
+        # interpolate_pos_encoding is never used in sglang
+        embeddings = embeddings + self.position_embedding(self.position_ids)
+
+        return embeddings
+
+
+# Copied from sglang.srt.models.clip.CLIPMLP
+class SiglipMLP(nn.Module):
+
+    def __init__(
+        self,
+        config,
+        act_layer: Type[nn.Module] = QuickGELU,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.fc1 = ColumnParallelLinear(
+            config.hidden_size,
+            config.intermediate_size,
+            quant_config=quant_config,
+            prefix=add_prefix("fc1", prefix),
+        )
+        self.act = act_layer()
+        self.fc2 = RowParallelLinear(
+            config.intermediate_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("fc2", prefix),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x_parallel, _ = self.fc1(x)
+        x_parallel = self.act(x_parallel)
+        x, _ = self.fc2(x_parallel)
+        return x
+
+
+# Copied from sglang.srt.models.clip.CLIPEncoderLayer
+class SiglipEncoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: SiglipVisionConfig,
+        act_layer: Type[nn.Module] = QuickGELU,
+        norm_layer: Type[nn.Module] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        if norm_layer is None:
+            norm_layer = partial(nn.LayerNorm, eps=config.layer_norm_eps)
+        self.layer_norm1 = norm_layer(config.hidden_size)
+        self.layer_norm2 = norm_layer(config.hidden_size)
+        self.self_attn = VisionAttention(
+            embed_dim=config.hidden_size,
+            num_heads=config.num_attention_heads,
+            projection_size=config.hidden_size,
+            use_qkv_parallel=True,
+            flatten_batch=True,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.mlp = SiglipMLP(
+            config,
+            act_layer=act_layer,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        causal_attention_mask: torch.Tensor,
+    ) -> torch.Tensor:
+
+        residual = hidden_states
+        hidden_states = self.layer_norm1(hidden_states)
+        # Siglip text model uses both `causal_attention_mask` and `attention_mask`
+        if attention_mask is not None and causal_attention_mask is not None:
+            attn_mask = attention_mask + causal_attention_mask
+        elif causal_attention_mask is not None:
+            attn_mask = causal_attention_mask
+        else:
+            attn_mask = attention_mask
+        hidden_states = self.self_attn(
+            hidden_states,
+            attention_mask=attn_mask,
+            # causal_attention_mask=causal_attention_mask,
+        )
+
+        hidden_states = residual + hidden_states
+        residual = hidden_states
+        hidden_states = self.layer_norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+# Copied from sglang.srt.models.clip.CLIPEncoder
+class SiglipEncoder(nn.Module):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self
+    attention layers. Each layer is a [`SiglipEncoderLayer`].
+
+    Args:
+        config: SiglipConfig
+    """
+
+    def __init__(
+        self,
+        config: SiglipVisionConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+
+        num_hidden_layers = config.num_hidden_layers
+        norm_layer = partial(nn.LayerNorm, eps=config.layer_norm_eps)
+        self.layers = nn.ModuleList(
+            [
+                SiglipEncoderLayer(
+                    config=config,
+                    norm_layer=norm_layer,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{layer_idx}", prefix),
+                )
+                for layer_idx in range(num_hidden_layers)
+            ]
+        )
+
+    def forward(
+        self,
+        inputs_embeds: torch.Tensor,
+        attention_mask: torch.Tensor = None,
+        causal_attention_mask: torch.Tensor = None,
+        return_all_hidden_states: bool = False,
+    ) -> Union[torch.Tensor, list[torch.Tensor]]:
+        hidden_states_pool = [inputs_embeds]
+        hidden_states = inputs_embeds
+
+        for encoder_layer in self.layers:
+            hidden_states = encoder_layer(
+                hidden_states, attention_mask, causal_attention_mask
+            )
+            if return_all_hidden_states:
+                hidden_states_pool.append(hidden_states)
+        if return_all_hidden_states:
+            return hidden_states_pool
+        return hidden_states
+
+
+# Adapted from transformers.models.siglip.modeling_siglip.SiglipVisionTransformer
+class SiglipVisionTransformer(nn.Module):
+
+    def __init__(
+        self,
+        config: SiglipVisionConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+
+        self.config = config
+        embed_dim = config.hidden_size
+
+        self.embeddings = SiglipVisionEmbeddings(config)
+
+        self.encoder = SiglipEncoder(
+            config=config,
+            quant_config=quant_config,
+            prefix=add_prefix("encoder", prefix),
+        )
+
+        num_hidden_layers = config.num_hidden_layers
+        if len(self.encoder.layers) > config.num_hidden_layers:
+            raise ValueError(
+                f"The original encoder only has {num_hidden_layers} "
+                f"layers, but you requested {len(self.encoder.layers)} layers."
+            )
+
+        # VisionAttention in SiglipEncoderLayer is multihead attention
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+    @property
+    def device(self) -> torch.device:
+        return self.encoder.layers[0].layer_norm1.weight.device
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+    ) -> torch.Tensor:
+        hidden_states = self.embeddings(pixel_values.to(self.device))
+
+        return_all_hidden_states = False
+
+        last_hidden_state = self.encoder(
+            inputs_embeds=hidden_states,
+            return_all_hidden_states=return_all_hidden_states,
+        )
+
+        last_hidden_state = self.post_layernorm(last_hidden_state)
+
+        return last_hidden_state
+
+
+# Copied from sglang.srt.models.clip.CLIPVisionModel
+class SiglipVisionModel(nn.Module):
+    def __init__(
+        self,
+        config: SiglipVisionConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.vision_model = SiglipVisionTransformer(
+            config, quant_config, prefix=add_prefix("vision_model", prefix)
+        )
+
+    @property
+    def device(self) -> torch.device:
+        return self.vision_model.device
+
+    def forward(self, pixel_values: torch.Tensor):
+        return self.vision_model(pixel_values)
diff --git a/sglang/python/sglang/srt/models/solar.py b/sglang/python/sglang/srt/models/solar.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f85ad587ab08b59716ac0410a17cc2d5daea53b
--- /dev/null
+++ b/sglang/python/sglang/srt/models/solar.py
@@ -0,0 +1,505 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Adapted from
+# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/solar.py
+from collections.abc import Iterable
+from typing import Any, List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from sglang.srt.distributed.parallel_state import get_tensor_model_parallel_rank
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.quantization import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer
+from sglang.srt.layers.vocab_parallel_embedding import (
+    DEFAULT_VOCAB_PADDING_SIZE,
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    kv_cache_scales_loader,
+)
+from sglang.srt.utils import add_prefix, make_layers
+
+
+class SolarMLP(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        bias: bool = False,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            input_size=hidden_size,
+            output_sizes=[intermediate_size] * 2,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.gate_up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            input_size=intermediate_size,
+            output_size=hidden_size,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.down_proj",
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class SolarAttention(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        bias: bool = False,
+        prefix: str = "",
+        layer_id: int = 0,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+
+        self.head_dim = getattr(config, "head_dim", None)
+        if self.head_dim is None:
+            self.head_dim = self.hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size=hidden_size,
+            head_size=self.head_dim,
+            total_num_heads=self.total_num_heads,
+            total_num_kv_heads=self.total_num_kv_heads,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+        self.o_proj = RowParallelLinear(
+            input_size=self.total_num_heads * self.head_dim,
+            output_size=hidden_size,
+            bias=bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch=forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class SolarDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+
+        if rope_scaling is not None and getattr(
+            config, "original_max_position_embeddings", None
+        ):
+            rope_scaling["original_max_position_embeddings"] = (
+                config.original_max_position_embeddings
+            )
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+
+        attention_bias = getattr(config, "attention_bias", False) or getattr(
+            config, "bias", False
+        )
+        self.self_attn = SolarAttention(
+            config=config,
+            layer_id=layer_id,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=getattr(
+                config, "num_key_value_heads", config.num_attention_heads
+            ),
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            bias=attention_bias,
+            prefix=f"{prefix}.self_attn",
+        )
+        self.mlp = SolarMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            bias=getattr(config, "mlp_bias", False),
+            prefix=f"{prefix}.mlp",
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+class SolarModel(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+
+        self.vocab_size = config.vocab_size
+        self.org_vocab_size = config.vocab_size
+        self.pp_group = get_pp_group()
+        if self.pp_group.is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("embed_tokens", prefix),
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+        self.start_layer, self.end_layer, self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: SolarDecoderLayer(
+                config=config,
+                quant_config=quant_config,
+                layer_id=idx,
+                prefix=prefix,
+            ),
+            prefix=f"{prefix}.layers",
+        )
+        if get_pp_group().is_last_rank:
+            self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer()
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor],
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, List[torch.Tensor]], PPProxyTensors]:
+        if self.pp_group().is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds
+            else:
+                hidden_states = self.get_input_embeddings(input_ids)
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+
+        # Depth up-scaling mechanism: caches hidden states and residuals from intermediate layers and interpolates them with the states of later layers.
+        # `bskcn` stands for "backbone skip connection".
+        bskcn_h_1 = None
+        bskcn_h_2 = None
+        bskcn_r_1 = None
+        bskcn_r_2 = None
+        bskcn_tv = self.config.bskcn_tv[0] if self.training else self.config.bskcn_tv[1]
+
+        for i in range(self.start_layer, self.end_layer):
+            if i in self.config.bskcn_1:
+                bskcn_h_1 = hidden_states.clone()
+                bskcn_r_1 = residual.clone() if residual is not None else None
+            if i in self.config.bskcn_2:
+                bskcn_h_2 = hidden_states.clone()
+                bskcn_r_2 = residual.clone() if residual is not None else None
+            if i in self.config.bskcn_3:
+                hidden_states = bskcn_h_1 * bskcn_tv + hidden_states * (1 - bskcn_tv)
+                if bskcn_r_1 is not None and residual is not None:
+                    residual = bskcn_r_1 * bskcn_tv + residual * (1 - bskcn_tv)
+            if i in self.config.bskcn_4:
+                hidden_states = bskcn_h_2 * bskcn_tv + hidden_states * (1 - bskcn_tv)
+                if bskcn_r_2 is not None and residual is not None:
+                    residual = bskcn_r_2 * bskcn_tv + residual * (1 - bskcn_tv)
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+                residual=residual,
+            )
+
+        if not self.pp_group().is_last_rank:
+            return PPProxyTensors(
+                {"hidden_states": hidden_states, "residual": residual}
+            )
+
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+    def load_kv_cache_scales(self, quantization_param_path: str) -> None:
+        tp_size = get_tensor_model_parallel_world_size()
+        tp_rank = get_tensor_model_parallel_rank()
+        for layer_idx, scaling_factor in kv_cache_scales_loader(
+            quantization_param_path,
+            tp_rank,
+            tp_size,
+            self.config.num_hidden_layers,
+            self.config.__class__.model_type,
+        ):
+            if not isinstance(self.layers[layer_idx], nn.Identity):
+                layer_self_attn = self.layers[layer_idx].self_attn
+
+            if hasattr(layer_self_attn.attn, "k_scale"):
+                layer_self_attn.attn.k_scale = scaling_factor
+                layer_self_attn.attn.v_scale = scaling_factor
+            else:
+                raise RuntimeError(
+                    "Self attention has no KV cache scaling " "factor attribute!"
+                )
+
+
+class SolarForCausalLM(nn.Module):
+
+    packed_modules_mapping = {
+        "qkv_proj": [
+            ("q_proj", "q"),
+            ("k_proj", "k"),
+            ("v_proj", "v"),
+        ],
+        "gate_up_proj": [
+            ("gate_proj", 0),
+            ("up_proj", 1),
+        ],
+    }
+
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    column_parallel_weights_modules = [".down_proj.", ".o_proj."]
+    bitsandbytes_stacked_params_mapping = {
+        ".q_proj": (".qkv_proj", 0),
+        ".k_proj": (".qkv_proj", 1),
+        ".v_proj": (".qkv_proj", 2),
+        ".gate_proj": (".gate_up_proj", 0),
+        ".up_proj": (".gate_up_proj", 1),
+    }
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = SolarModel(
+            config=config,
+            quant_config=self.quant_config,
+            prefix=add_prefix("model", prefix),
+        )
+
+        if self.pp_group.is_last_rank:
+            self.unpadded_vocab_size = config.vocab_size
+            self.lm_head = ParallelLMHead(
+                self.unpadded_vocab_size,
+                config.hidden_size,
+                org_num_embeddings=config.vocab_size,
+                padding_size=DEFAULT_VOCAB_PADDING_SIZE,
+                quant_config=quant_config,
+            )
+            if config.tie_word_embeddings and self.pp_group.is_first_rank:
+                self.lm_head.weight = self.model.embed_tokens.weight
+
+            logit_scale = getattr(config, "logit_scale", 1.0)
+            self.logits_processor = LogitsProcessor(
+                self.unpadded_vocab_size, config.vocab_size, logit_scale
+            )
+        else:
+            self.lm_head = PPMissingLayer()
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, LogitsProcessorOutput]:
+        hidden_states = self.model(
+            input_ids=input_ids,
+            positions=positions,
+            forward_batch=forward_batch,
+            inputs_embeds=inputs_embeds,
+        )
+
+        if self.pp_group().is_last_rank:
+            logits = self.logits_processor(self.lm_head, hidden_states, forward_batch)
+            return logits
+
+        return hidden_states
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+
+            is_packed = False
+            for packed_name, sources in self.packed_modules_mapping.items():
+                for src_name, shard_id in sources:
+                    if src_name in name:
+
+                        model_param_name = name.replace(src_name, packed_name)
+
+                        if model_param_name in params_dict:
+                            param = params_dict[model_param_name]
+                            weight_loader = getattr(
+                                param, "weight_loader", default_weight_loader
+                            )
+                            weight_loader(param, loaded_weight, shard_id)
+                            is_packed = True
+                            break
+                if is_packed:
+                    break
+
+            if is_packed:
+                continue
+
+            if name in params_dict:
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = SolarForCausalLM
diff --git a/sglang/python/sglang/srt/models/stablelm.py b/sglang/python/sglang/srt/models/stablelm.py
new file mode 100644
index 0000000000000000000000000000000000000000..2adcfe92ffc5e46114dd33abfd2c9306b9d1c5a5
--- /dev/null
+++ b/sglang/python/sglang/srt/models/stablelm.py
@@ -0,0 +1,347 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from:
+# https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/stablelm.py#L1
+"""
+Inference-only StableLM-2 (https://huggingface.co/stabilityai/stablelm-2-1_6b)
+model compatible with HuggingFace weights.
+"""
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix, is_npu
+
+_is_npu = is_npu()
+
+
+class StablelmMLP(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_up_proj = MergedColumnParallelLinear(
+            config.hidden_size,
+            [config.intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            config.intermediate_size,
+            config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class StablelmAttention(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = config.num_attention_heads
+        self.num_heads = self.total_num_heads // tp_size
+
+        self.total_num_key_value_heads = config.num_key_value_heads
+        if self.total_num_key_value_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_key_value_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_key_value_heads == 0
+        self.num_key_value_heads = max(1, self.total_num_key_value_heads // tp_size)
+        self.head_dim = self.hidden_size // self.total_num_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        rope_pct = getattr(
+            config, "rope_pct", getattr(config, "partial_rotary_factor", 1)
+        )
+        self.rotary_ndims = int(self.head_dim * rope_pct)
+        self.scaling = self.head_dim**-0.5
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_key_value_heads * self.head_dim
+        self.qkv_bias = getattr(config, "use_qkv_bias", False)
+        if (self.head_dim * self.num_heads * tp_size) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads "
+                f"(got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_key_value_heads,
+            self.qkv_bias,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            self.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+        if not _is_npu:
+            self.rotary_emb = get_rope(
+                self.head_dim,
+                rotary_dim=self.rotary_ndims,
+                max_position=self.config.max_position_embeddings,
+                base=self.config.rope_theta,
+            )
+        else:
+            self.rotary_emb = get_rope(
+                self.head_dim,
+                rotary_dim=self.rotary_ndims,
+                max_position=self.config.max_position_embeddings,
+                base=self.config.rope_theta,
+                dtype=torch.float32,
+            )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_key_value_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        if not _is_npu:
+            q, k = self.rotary_emb(positions, q, k)
+        else:
+            odtype = q.dtype
+            q, k = self.rotary_emb(positions, q.to(torch.float32), k.to(torch.float32))
+            q, k = q.to(odtype), k.to(odtype)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class StablelmDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.self_attn = StablelmAttention(
+            config, layer_id=layer_id, prefix=add_prefix("self_attn", prefix)
+        )
+        self.mlp = StablelmMLP(
+            config, quant_config=quant_config, prefix=add_prefix("mlp", prefix)
+        )
+        norm_eps = getattr(config, "norm_eps", getattr(config, "layer_norm_eps", 1e-05))
+        self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=norm_eps)
+        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        return hidden_states, residual
+
+
+class StableLMEpochModel(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+        self.layers = nn.ModuleList(
+            [
+                StablelmDecoderLayer(
+                    config,
+                    i,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{i}", prefix),
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        norm_eps = getattr(config, "norm_eps", getattr(config, "layer_norm_eps", 1e-05))
+        self.norm = nn.LayerNorm(config.hidden_size, eps=norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+            )
+        hidden_states = self.norm(hidden_states)
+        return hidden_states
+
+
+class StableLmForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = StableLMEpochModel(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size, config.hidden_size, prefix=add_prefix("lm_head", prefix)
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = StableLmForCausalLM
diff --git a/sglang/python/sglang/srt/models/starcoder2.py b/sglang/python/sglang/srt/models/starcoder2.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ad4351e9ecf2ec57a3f23ac0d09efb2c4e29610
--- /dev/null
+++ b/sglang/python/sglang/srt/models/starcoder2.py
@@ -0,0 +1,358 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Copyright 2024 BigCode and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/starcoder2.py
+"""PyTorch Starcoder2 model."""
+
+from collections.abc import Iterable
+from typing import Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import Starcoder2Config
+
+from sglang.srt.distributed import get_pp_group, get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import get_act_fn
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    DEFAULT_VOCAB_PADDING_SIZE,
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix, make_layers
+
+
+class Starcoder2Attention(nn.Module):
+
+    def __init__(
+        self,
+        config: Starcoder2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        layer_id: int = 0,
+    ):
+        super().__init__()
+        self.config = config
+
+        self.hidden_size = config.hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = config.num_attention_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = config.num_key_value_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = self.hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = config.rope_theta
+        self.max_position_embeddings = config.max_position_embeddings
+        self.use_bias = config.use_bias
+
+        self.qkv_proj = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=self.use_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            self.hidden_size,
+            bias=self.use_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=self.max_position_embeddings,
+            base=int(self.rope_theta),
+            is_neox_style=True,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=f"{prefix}.attn",
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class Starcoder2MLP(nn.Module):
+
+    def __init__(
+        self,
+        config: Starcoder2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.c_fc = ColumnParallelLinear(
+            config.hidden_size,
+            config.intermediate_size,
+            bias=config.use_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.c_fc",
+        )
+        self.c_proj = RowParallelLinear(
+            config.intermediate_size,
+            config.hidden_size,
+            bias=config.use_bias,
+            quant_config=quant_config,
+            prefix=f"{prefix}.c_proj",
+        )
+        self.act = get_act_fn(config.hidden_act)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
+        hidden_states, _ = self.c_fc(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states, _ = self.c_proj(hidden_states)
+        return hidden_states
+
+
+class Starcoder2DecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: Starcoder2Config,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = Starcoder2Attention(
+            config=config,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=f"{prefix}.self_attn",
+        )
+        self.mlp = Starcoder2MLP(
+            config, quant_config=quant_config, prefix=f"{prefix}.mlp"
+        )
+        self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon)
+        self.post_attention_layernorm = nn.LayerNorm(
+            config.hidden_size, eps=config.norm_epsilon
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        # Self Attention
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        return hidden_states
+
+
+class Starcoder2Model(nn.Module):
+
+    def __init__(
+        self,
+        config: Starcoder2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.config = config
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.embed_tokens",
+        )
+
+        pp_group = get_pp_group()
+        pp_size = pp_group.world_size
+        pp_rank = pp_group.rank
+        self.start_layer = pp_rank * config.num_hidden_layers // pp_size
+        self.end_layer = (pp_rank + 1) * config.num_hidden_layers // pp_size
+
+        self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: Starcoder2DecoderLayer(
+                config=config, quant_config=quant_config, layer_id=idx, prefix=prefix
+            ),
+            prefix=f"{prefix}.layers",
+        )
+        self.norm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if inputs_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = inputs_embeds
+        for i in range(self.start_layer, self.end_layer):
+            layer = self.layers[i]
+            hidden_states = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+            )
+        hidden_states = self.norm(hidden_states)
+        return hidden_states
+
+
+class Starcoder2ForCausalLM(nn.Module):
+
+    def __init__(
+        self,
+        config: Starcoder2Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.model = Starcoder2Model(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.vocab_size = config.vocab_size
+        self.unpadded_vocab_size = config.vocab_size
+        if config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.unpadded_vocab_size = config.vocab_size
+            self.lm_head = ParallelLMHead(
+                self.unpadded_vocab_size,
+                config.hidden_size,
+                org_num_embeddings=config.vocab_size,
+                padding_size=DEFAULT_VOCAB_PADDING_SIZE,
+                quant_config=quant_config,
+                prefix=f"{prefix}.lm_head",
+            )
+        self.logits_processor = LogitsProcessor(config=config)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(
+            input_ids=input_ids,
+            positions=positions,
+            forward_batch=forward_batch,
+            inputs_embeds=inputs_embeds,
+        )
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ]
+        params_dict = dict(self.named_parameters())
+
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freqs" in name:
+                continue
+
+            is_stacked = False
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name in name:
+                    name = name.replace(weight_name, param_name)
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight, shard_id)
+                    is_stacked = True
+                    break
+            if is_stacked:
+                continue
+
+            param = params_dict.get(name)
+            if param is None:
+                continue
+
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            weight_loader(param, loaded_weight)
+
+
+EntryClass = Starcoder2ForCausalLM
diff --git a/sglang/python/sglang/srt/models/step3_vl.py b/sglang/python/sglang/srt/models/step3_vl.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ac9528f94dd2d683718f640eae3967a2ed9f55c
--- /dev/null
+++ b/sglang/python/sglang/srt/models/step3_vl.py
@@ -0,0 +1,1003 @@
+import logging
+import math
+from math import sqrt
+from typing import Any, Dict, Iterable, List, Optional, Tuple
+
+import torch
+from torch import nn
+from torch.nn import LayerNorm
+from torch.nn import functional as F
+from transformers import PretrainedConfig
+from transformers.activations import ACT2FN
+
+from sglang.srt.configs.step3_vl import (
+    Step3TextConfig,
+    Step3VisionEncoderConfig,
+    Step3VLConfig,
+)
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.eplb.expert_location import ModelConfigForExpertLocation
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.attention.vision import VisionAttention
+from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe import get_moe_a2a_backend
+from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
+from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternMultimodalTokens,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix, log_info_on_rank0, make_layers
+
+logger = logging.getLogger(__name__)
+
+
+"""
+Text Model
+"""
+
+
+class Step3TextMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class Step3TextMoEMLP(nn.Module):
+    # Native
+    def __init__(
+        self,
+        layer_id: int,
+        config: Step3TextConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.layer_id = layer_id
+        if self.tp_size > config.moe_num_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {config.moe_num_experts}."
+            )
+
+        self.topk = TopK(
+            top_k=config.moe_top_k,
+            renormalize=config.norm_expert_weight,
+            use_grouped_topk=False,
+            layer_id=layer_id,
+        )
+
+        self.experts = get_moe_impl_class(quant_config)(
+            num_experts=config.moe_num_experts,
+            top_k=config.moe_top_k,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("experts", prefix),
+        )
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            output_size=config.moe_num_experts,
+            bias=False,
+            quant_config=None,
+            prefix=add_prefix("gate", prefix),
+        )
+
+        if get_moe_a2a_backend().is_deepep():
+            raise NotImplementedError("DeepEP MoE is not supported yet in Step3 model.")
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        num_tokens, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+
+        router_logits, _ = self.gate(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states, topk_output=topk_output
+        )
+
+        if self.tp_size > 1:
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+        return final_hidden_states.view(num_tokens, hidden_dim)
+
+
+class Step3TextAttention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        head_dim: int,
+        share_q_dim: int,
+        layer_id: int = 0,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        rms_norm_eps=None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+
+        attn_tp_rank = get_attention_tp_rank()
+        attn_tp_size = get_attention_tp_size()
+
+        self.all_tp_rank = get_tensor_model_parallel_rank()
+        self.total_num_heads = num_heads
+        self.attn_tp_rank = attn_tp_rank
+        self.layer_id = layer_id
+        assert self.total_num_heads % attn_tp_size == 0
+        self.num_heads = self.total_num_heads // attn_tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= attn_tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % attn_tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert attn_tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // attn_tp_size)
+        self.head_dim = head_dim
+        self.q_size = share_q_dim if share_q_dim else head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [self.q_size, self.kv_size, self.kv_size],
+            bias=False,
+            quant_config=quant_config,
+            tp_rank=0,  # In fact, we need a MergedReplicatedLinear
+            tp_size=1,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            reduce_results=False,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.inter_norm = RMSNorm(self.q_size, eps=rms_norm_eps)
+
+        self.wq = ColumnParallelLinear(
+            self.q_size,
+            self.head_dim * self.total_num_heads,
+            bias=False,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            prefix=add_prefix("wq", prefix),
+        )
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q = self.inter_norm(q.contiguous())
+        q, _ = self.wq(q)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class Step3TextDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: Step3TextConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        head_dim = getattr(
+            config, "head_dim", config.hidden_size // config.num_attention_heads
+        )
+        # TODO: support shared experts fusion
+        # self.n_shared_experts = 1
+        # self.num_fused_shared_experts = (
+        #     0
+        #     if global_server_args.disable_shared_experts_fusion
+        #     else self.n_shared_experts
+        # )
+        self.num_fused_shared_experts = 0
+        rms_norm_eps = config.rms_norm_eps
+        self.self_attn = Step3TextAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=1,
+            head_dim=head_dim,
+            share_q_dim=config.share_q_dim,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            rms_norm_eps=rms_norm_eps,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+
+        moe_layers_enum = getattr(config, "moe_layers_enum", None)
+        if moe_layers_enum is not None:
+            moe_layers_idx = [int(i) for i in moe_layers_enum.strip().split(",")]
+        else:
+            # Default to 1dense.
+            moe_layers_idx = [i for i in range(1, config.num_hidden_layers)]
+
+        self.use_moe = False
+
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+        self.layer_id = layer_id
+        self.is_layer_sparse = True if layer_id in moe_layers_idx else False
+        self.is_previous_layer_sparse = (
+            True if layer_id - 1 in moe_layers_idx else False
+        )
+        self.is_next_layer_sparse = True if layer_id + 1 in moe_layers_idx else False
+
+        self.layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=layer_id,
+            num_layers=config.num_hidden_layers,
+            is_layer_sparse=self.is_layer_sparse,
+            is_previous_layer_sparse=self.is_previous_layer_sparse,
+            is_next_layer_sparse=self.is_next_layer_sparse,
+        )
+
+        if not self.is_layer_sparse:
+            self.mlp = Step3TextMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act="silu",
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+            )
+        else:
+            self.use_moe = True
+            if self.num_fused_shared_experts == 0:
+                self.moe = Step3TextMoEMLP(
+                    layer_id=layer_id,
+                    config=config,
+                    quant_config=quant_config,
+                    prefix=add_prefix("mlp", prefix),
+                )
+                self.share_expert = Step3TextMLP(
+                    hidden_size=config.hidden_size,
+                    intermediate_size=config.share_expert_dim,
+                    hidden_act="silu",
+                    quant_config=quant_config,
+                    prefix=add_prefix("share_expert", prefix),
+                )
+            else:
+                self.moe = Step3TextMoEMLP(
+                    layer_id=layer_id,
+                    config=config,
+                    quant_config=quant_config,
+                    prefix=add_prefix("mlp", prefix),
+                )
+
+        self.layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.layer_scatter_modes,
+            input_layernorm=self.input_layernorm,
+            post_attention_layernorm=self.post_attention_layernorm,
+        )
+
+    def moe_mlp_forward(self, hidden_states):
+        if not self.num_fused_shared_experts:
+            h = hidden_states.clone()
+            hidden_states = self.moe(hidden_states)
+            hidden_states += self.share_expert(h)
+        else:
+            hidden_states = self.moe(hidden_states)
+        return hidden_states
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+
+        hidden_states, residual = self.layer_communicator.prepare_attn(
+            hidden_states, residual, forward_batch
+        )
+
+        if hidden_states.shape[0] != 0:
+            hidden_states = self.self_attn(
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+            )
+
+        hidden_states, residual = self.layer_communicator.prepare_mlp(
+            hidden_states, residual, forward_batch
+        )
+        if self.use_moe:
+            hidden_states = self.moe_mlp_forward(hidden_states)
+        else:
+            hidden_states = self.mlp(hidden_states)
+
+        hidden_states, residual = self.layer_communicator.postprocess_layer(
+            hidden_states, residual, forward_batch
+        )
+
+        return hidden_states, residual
+
+
+class Step3TextModel(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            use_attn_tp_group=is_dp_attention_enabled(),
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+
+        self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: Step3TextDecoderLayer(
+                layer_id=idx,
+                config=config,
+                quant_config=quant_config,
+                prefix=prefix,
+            ),
+            prefix=add_prefix("layers", prefix),
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def get_input_embeddings(self):
+        return self.embed_tokens
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions, hidden_states, forward_batch, residual
+            )
+
+        if hidden_states.shape[0] != 0:
+            if residual is None:
+                hidden_states = self.norm(hidden_states)
+            else:
+                hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+"""
+Vision Model
+"""
+
+
+def get_abs_pos(abs_pos, tgt_size):
+    dim = abs_pos.size(-1)
+    abs_pos_new = abs_pos.squeeze(0)
+    cls_token, old_pos_embed = abs_pos_new[:1], abs_pos_new[1:]
+
+    src_size = int(math.sqrt(abs_pos_new.shape[0] - 1))
+    tgt_size = int(math.sqrt(tgt_size))
+    dtype = abs_pos.dtype
+
+    if src_size != tgt_size:
+        old_pos_embed = (
+            old_pos_embed.view(1, src_size, src_size, dim)
+            .permute(0, 3, 1, 2)
+            .contiguous()
+        )
+        old_pos_embed = old_pos_embed.to(torch.float32)
+        new_pos_embed = F.interpolate(
+            old_pos_embed,
+            size=(tgt_size, tgt_size),
+            mode="bicubic",
+            antialias=True,
+            align_corners=False,
+        ).to(dtype)
+        new_pos_embed = new_pos_embed.permute(0, 2, 3, 1)
+        new_pos_embed = new_pos_embed.view(tgt_size * tgt_size, dim)
+        vision_pos_embed = torch.cat([cls_token, new_pos_embed], dim=0)
+        vision_pos_embed = vision_pos_embed.view(1, tgt_size * tgt_size + 1, dim)
+        return vision_pos_embed
+    else:
+        return abs_pos
+
+
+class Step3VisionMLP(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        intermediate_size: int,
+        bias: bool = True,
+        hidden_act="quick_gelu",
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        # Since this is a dense model,
+        # the MLP component likewise adopts a DP-MLP approach modeled after DP Attention.
+        # This choice may not represent the optimal solution and remains open to further deliberation.
+        attn_tp_rank = get_attention_tp_rank()
+        attn_tp_size = get_attention_tp_size()
+        self.fc1 = ColumnParallelLinear(
+            dim,
+            intermediate_size,
+            bias=bias,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            prefix=add_prefix("gate_proj", prefix),
+        )
+        self.act = ACT2FN[hidden_act]  # quick_gelu
+        self.fc2 = RowParallelLinear(
+            intermediate_size,
+            dim,
+            bias=bias,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            prefix=add_prefix("down_proj", prefix),
+        )
+
+    def forward(self, hidden_states) -> torch.Tensor:
+        hidden_states, _ = self.fc1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states, _ = self.fc2(hidden_states)
+        return hidden_states
+
+
+class Step3VisionAttention(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 16,
+        quant_config=None,
+        prefix: str = "",
+    ) -> None:
+
+        super().__init__()
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.out_proj = RowParallelLinear(
+            dim,
+            dim,
+            bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("out_proj", prefix),
+        )
+        self.scale = self.head_dim**-0.5
+
+        self.attn = VisionAttention(
+            embed_dim=dim,
+            num_heads=num_heads,
+            projection_size=dim,
+            use_qkv_parallel=True,
+            proj_bias=True,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        attn_output = self.attn(hidden_states)
+        return attn_output
+
+
+class Step3VisionEmbeddings(nn.Module):
+
+    def __init__(self, config: Step3VisionEncoderConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+
+        self.class_embedding = nn.Parameter(torch.randn(1, self.embed_dim))
+
+        self.patch_embedding = nn.Conv2d(
+            in_channels=config.num_channels,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+            bias=True,
+        )
+
+        self.num_patches = (self.image_size // self.patch_size) ** 2
+        self.pad_tp_size = 4  # hard code for padding
+        # To load the pretrained weights, we still use P+1 as the seqlen
+        self.position_embedding = torch.nn.Embedding(
+            self.num_patches + 1, self.embed_dim
+        )
+        self.register_buffer(
+            "position_ids",
+            torch.arange(self.num_patches + 1).expand((1, -1)),
+            persistent=False,
+        )
+
+    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        batch_size = pixel_values.shape[0]
+        patch_embeds = self.patch_embedding(
+            pixel_values
+        )  # shape = [*, width, grid, grid]
+        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
+
+        # pad
+        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
+        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
+        embeddings = embeddings + get_abs_pos(
+            self.position_embedding(self.position_ids), patch_embeds.size(1)
+        )
+        embeddings = torch.cat(
+            [
+                embeddings[:, 0, :].unsqueeze(1).repeat(1, self.pad_tp_size - 1, 1),
+                embeddings,
+            ],
+            dim=1,
+        )
+        return embeddings
+
+
+class Step3VisionEncoderLayer(nn.Module):
+    def __init__(self, config, attn_implementation: str = "sdpa") -> None:
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.layer_norm1 = LayerNorm(self.embed_dim, eps=1e-6)
+        self.layer_norm2 = LayerNorm(self.embed_dim, eps=1e-6)
+
+        self.self_attn = Step3VisionAttention(
+            self.embed_dim, num_heads=config.num_attention_heads
+        )
+        self.mlp = Step3VisionMLP(
+            dim=self.embed_dim,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+        )
+
+    def forward(self, hidden_states) -> torch.Tensor:
+        hidden_states = hidden_states + self.layer_norm1(self.self_attn(hidden_states))
+        hidden_states = hidden_states + self.layer_norm2(self.mlp(hidden_states))
+        return hidden_states
+
+
+class Step3VisionTransformer(nn.Module):
+    def __init__(self, config: Step3VisionEncoderConfig):
+        super().__init__()
+        self.config = config
+        self.image_size = config.image_size
+        self.embeddings = Step3VisionEmbeddings(config)
+        self.transformer = Step3VisionEncoder(config)
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return self.embeddings.patch_embedding.weight.dtype
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+    ):
+        hidden_states = self.embeddings(pixel_values)
+        hidden_states = self.transformer(inputs_embeds=hidden_states)
+        return hidden_states
+
+
+class Step3VisionEncoder(nn.Module):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+    [`Step3VisionEncoderLayer`].
+
+    Args:
+        config: StepVisionEncoderConfig
+    """
+
+    def __init__(self, config: Step3VisionEncoderConfig):
+        super().__init__()
+        self.config = config
+        self.layers = nn.ModuleList(
+            [Step3VisionEncoderLayer(config) for _ in range(config.num_hidden_layers)]
+        )
+
+    def forward(
+        self,
+        inputs_embeds,
+    ) -> torch.Tensor:
+
+        hidden_states = inputs_embeds
+        for encoder_layer in self.layers:
+            hidden_states = encoder_layer(
+                hidden_states,
+            )
+
+        return hidden_states
+
+
+class Step3VLForConditionalGeneration(nn.Module):
+
+    def __init__(
+        self,
+        config: Step3VLConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = Step3TextModel(
+            config.text_config, quant_config, prefix=add_prefix("model", prefix)
+        )
+
+        self.vision_model = Step3VisionTransformer(config.vision_config)
+
+        self.vit_downsampler = nn.Conv2d(
+            config.vision_config.hidden_size,
+            config.vision_config.output_hidden_size,
+            kernel_size=2,
+            stride=config.understand_projector_stride,
+        )
+        self.vit_downsampler2 = nn.Conv2d(
+            config.vision_config.output_hidden_size,
+            config.vision_config.output_hidden_size * 2,
+            kernel_size=3,
+            stride=2,
+            padding=1,
+        )
+        self.vit_large_projector = nn.Linear(
+            config.vision_config.output_hidden_size * 2,
+            config.hidden_size,
+            bias=config.projector_bias,
+        )
+
+        # TODO: support shared experts fusion
+        # self.n_shared_experts = 1
+        # self.num_fused_shared_experts = (
+        #     0
+        #     if global_server_args.disable_shared_experts_fusion
+        #     else self.n_shared_experts
+        # )
+        self.num_fused_shared_experts = 0
+        self.config.tie_word_embeddings = False
+        if getattr(self.config, "tie_word_embeddings", False):
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.text_config.vocab_size,
+                config.text_config.hidden_size,
+                quant_config=quant_config,
+                prefix=add_prefix("lm_head", prefix),
+            )
+        self.logits_processor = LogitsProcessor(config.text_config)
+
+    def _get_vision_model_output(self, input_tensor: torch.Tensor) -> torch.Tensor:
+        return self.vision_model(input_tensor)[:, 4:]
+
+    def _flatten_embeddings(self, embeddings) -> torch.Tensor:
+
+        if isinstance(embeddings, torch.Tensor):
+            # Flatten all but the last dimension.
+            return embeddings.flatten(0, -2)
+
+        return torch.cat(tuple(self._flatten_embeddings(t) for t in embeddings))
+
+    def _process_image_features(self, image_features: torch.Tensor) -> torch.Tensor:
+        B, P = image_features.shape[:2]
+        HW = int(sqrt(P))
+        image_features = image_features.permute(0, 2, 1).view(B, -1, HW, HW)
+        image_features = self.vit_downsampler(image_features)
+        image_features = self.vit_downsampler2(image_features)
+        n_dim = image_features.size(1)
+        image_features = image_features.view(B, n_dim, -1).permute(0, 2, 1)
+        image_features = self.vit_large_projector(image_features)
+        return image_features
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        assert len(items) == 1  # We only have images.
+
+        item = items[0]
+        pixel_values = item.feature.type(self.vision_model.dtype)
+        num_patches = item.model_specific_data.get("num_patches")
+        patch_pixel_values = item.model_specific_data.get("patch_pixel_values", None)
+        if patch_pixel_values is not None:
+            patch_pixel_values = patch_pixel_values.type(self.vision_model.dtype)
+
+        if patch_pixel_values is not None:
+            patch_pixel_values = patch_pixel_values.to("cuda")
+
+        image_features = self._get_vision_model_output(pixel_values)
+        patch_image_features = (
+            self._get_vision_model_output(patch_pixel_values)
+            if patch_pixel_values is not None
+            else None
+        )
+
+        image_features = self._process_image_features(image_features)
+        patch_image_features = (
+            self._process_image_features(patch_image_features)
+            if patch_image_features is not None
+            else None
+        )
+
+        merged_image_features = []
+        cur_patch_idx = 0
+        for i, num_patch in enumerate(num_patches):
+            cur_feature = []
+            if num_patch > 0:
+                patch_slice = patch_image_features[
+                    cur_patch_idx : cur_patch_idx + num_patch
+                ]
+                cur_feature.append(patch_slice.view(-1, patch_slice.shape[-1]))
+            cur_feature.append(image_features[i].view(-1, image_features.shape[-1]))
+            cur_patch_idx += num_patch
+            merged_image_features.append(
+                torch.cat(cur_feature) if len(cur_feature) > 1 else cur_feature[0]
+            )
+        return self._flatten_embeddings(merged_image_features)
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+        return pattern.pad_input_tokens(input_ids, mm_inputs)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.model,
+            data_embedding_funcs={
+                Modality.IMAGE: self.get_image_feature,
+            },
+            positions=positions,
+        )
+
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", 0),
+            (".qkv_proj", ".k_proj", 1),
+            (".qkv_proj", ".v_proj", 2),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+
+        if self.num_fused_shared_experts > 0:
+            assert self.num_fused_shared_experts == 1
+            log_info_on_rank0(logger, "Shared experts fusion optimization enabled.")
+
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.text_config.moe_num_experts
+            + self.num_fused_shared_experts,
+        )
+
+        params_dict = dict(self.named_parameters())
+        loaded_params = set()
+
+        def match_expert_and_shard_ids(name_path: str, weight_path: str) -> bool:
+            name_parts = name_path.split(".")
+            weight_parts = weight_path.split(".")
+            shard_id_matches = name_parts[4] == weight_parts[2]
+            return shard_id_matches
+
+        for name, loaded_weight in weights:
+            if "vision_model" in name:
+                name = name.replace("self_attn", "self_attn.attn")
+                name = name.replace("out_proj", "proj")
+
+            # TODO: support vision model
+            if self.num_fused_shared_experts > 0 and "share" in name:
+                # assert False
+                name = name.replace("share_expert", "moe")
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, expert_id, shard_id = mapping
+                    if (
+                        expert_id != self.config.text_config.moe_num_experts
+                        or not match_expert_and_shard_ids(name, weight_name)
+                    ):
+                        continue
+
+                    part_name = weight_name.split(".")[-2]
+                    fake_weight_name = name.replace(part_name, weight_name[:-1])
+                    actual_param_name = name.replace(part_name + ".", param_name)
+                    param = params_dict[actual_param_name]
+                    weight_loader = param.weight_loader
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
+                    break
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                if "gate." not in name and "moe" in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                loaded_params.add(name)
+                break
+            else:
+                if "moe" not in name:
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+                    loaded_params.add(name)
+                else:
+                    if "gate." in name:
+                        name = name.replace(weight_name, param_name)
+                        param = params_dict[name]
+                        weight_loader = param.weight_loader
+                        weight_loader(param, loaded_weight)
+                        loaded_params.add(name)
+                        continue
+
+                    for mapping in expert_params_mapping:
+                        param_name, weight_name, expert_id, shard_id = mapping
+                        if expert_id == self.config.text_config.moe_num_experts:
+                            continue
+                        if not match_expert_and_shard_ids(name, weight_name):
+                            continue
+                        part_name = weight_name.split(".")[-2]
+                        fake_weight_name = name.replace(part_name, weight_name[:-1])
+                        actual_param_name = name.replace(part_name + ".", param_name)
+                        param = params_dict[actual_param_name]
+                        weight_loader = param.weight_loader
+                        weight_loader(
+                            param,
+                            loaded_weight[expert_id],
+                            name,
+                            shard_id=shard_id,
+                            expert_id=expert_id,
+                        )
+                        loaded_params.add(actual_param_name)
+                        # Don't break here, because this 'loaded_weight' includes all the weights for this layer
+
+    @classmethod
+    def get_model_config_for_expert_location(cls, config: Step3VLConfig):
+        return ModelConfigForExpertLocation(
+            num_layers=config.text_config.num_hidden_layers,
+            num_logical_experts=config.text_config.moe_num_experts,
+            num_groups=None,
+        )
+
+
+EntryClass = Step3VLForConditionalGeneration
diff --git a/sglang/python/sglang/srt/models/step3_vl_10b.py b/sglang/python/sglang/srt/models/step3_vl_10b.py
new file mode 100644
index 0000000000000000000000000000000000000000..c043191ce9efbfaa2b1046071d3ef78664b7a171
--- /dev/null
+++ b/sglang/python/sglang/srt/models/step3_vl_10b.py
@@ -0,0 +1,582 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""This is basically a copy from perception_models/core/vision_encoder/pe.py"""
+
+from functools import partial
+from typing import Callable, Iterable, List, Optional, Tuple
+
+import torch
+from einops import rearrange, repeat
+from torch import nn
+from torch.nn import functional as F
+from transformers.activations import ACT2FN
+
+from sglang.srt.configs.step3_vl import Step3VLConfig
+from sglang.srt.layers.attention.vision import VisionAttention
+from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternMultimodalTokens,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.qwen3 import Qwen3ForCausalLM
+from sglang.srt.utils import add_prefix
+
+_DEFAULT_NORM_LAYER = partial(nn.LayerNorm, eps=1e-5)
+
+
+def rotate_half(x):
+    x = rearrange(x, "... (d r) -> ... d r", r=2)
+    x1, x2 = x.unbind(dim=-1)
+    x = torch.stack((-x2, x1), dim=-1)
+    return rearrange(x, "... d r -> ... (d r)")
+
+
+def apply_rotary_emb(freqs, t, start_index=0, scale=1.0, seq_dim=-2):
+    dtype = t.dtype
+
+    if t.ndim == 3:
+        seq_len = t.shape[seq_dim]
+        freqs = freqs[-seq_len:]
+
+    rot_dim = freqs.shape[-1]
+    end_index = start_index + rot_dim
+
+    assert rot_dim <= t.shape[-1], (
+        "feature dimension {} is not of sufficient size to rotate in all the "
+        "positions {}".format(t.shape[-1], rot_dim)
+    )
+
+    t_left, t, t_right = (
+        t[..., :start_index],
+        t[..., start_index:end_index],
+        t[..., end_index:],
+    )
+    t = (t * freqs.cos() * scale) + (rotate_half(t) * freqs.sin() * scale)
+    out = torch.cat((t_left, t, t_right), dim=-1)
+
+    return out.type(dtype)
+
+
+class PerceptionEncoderRope2D(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        max_grid_height: int,
+        max_grid_width: int,
+        use_cls_token: bool = False,
+        theta=10000,
+        max_freq=10,
+        num_freqs=1,
+        theta_rescale_factor=1.0,
+    ):
+        super().__init__()
+        self.dim = dim
+        self.max_grid_height = max_grid_height
+        self.max_grid_width = max_grid_width
+        self.use_cls_token = use_cls_token
+        self.theta = theta * theta_rescale_factor ** (dim / (dim - 2))
+        self.max_freq = max_freq
+        self.num_freqs = num_freqs
+        cache = self._compute_2d_freqs()
+        self.register_buffer("freqs_cache", cache, persistent=False)
+
+    def _compute_inv_freq(self, base: int | float, dim: int) -> torch.Tensor:
+        freqs = 1.0 / (base ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
+        return freqs
+
+    def _compute_freqs(self, t: torch.Tensor, inv_freq: torch.Tensor):
+        freqs = torch.einsum("..., f -> ... f", t.type(inv_freq.dtype), inv_freq)
+        freqs = repeat(freqs, "... n -> ... (n r)", r=2)
+        return freqs
+
+    def _compute_2d_freqs(self) -> torch.Tensor:
+        grid_h_range = torch.arange(self.max_grid_height, dtype=torch.float)
+        grid_w_range = torch.arange(self.max_grid_width, dtype=torch.float)
+        if self.use_cls_token:
+            grid_h_range += 1
+            grid_w_range += 1
+        inv_freq = self._compute_inv_freq(self.theta, self.dim // 2)
+        freqs_h = self._compute_freqs(grid_h_range, inv_freq)[:, None].expand(
+            self.max_grid_height, self.max_grid_width, -1
+        )
+        freqs_w = self._compute_freqs(grid_w_range, inv_freq)[None, :].expand(
+            self.max_grid_height, self.max_grid_width, -1
+        )
+        freqs = torch.cat([freqs_w, freqs_h], dim=-1).reshape(
+            self.max_grid_height * self.max_grid_width, -1
+        )
+        if self.use_cls_token:
+            freqs = torch.cat([torch.zeros(1, freqs.shape[-1]), freqs], dim=0)
+        freqs = freqs[None, None, ...]
+        return freqs
+
+    def forward(
+        self, q: torch.Tensor, k: torch.Tensor, grid_hw: tuple[int, int], x_shape
+    ):
+        if grid_hw[0] != self.max_grid_height or grid_hw[1] != self.max_grid_width:
+            rows = torch.arange(grid_hw[0], device=q.device).view(-1, 1)
+            cols = torch.arange(grid_hw[1], device=q.device).view(1, -1)
+            positions = (rows * self.max_grid_width + cols).reshape(-1).to(torch.long)
+            if self.use_cls_token:
+                positions = torch.cat(
+                    [torch.zeros(1, device=q.device), positions + 1], dim=0
+                )
+                positions = positions.to(torch.long)
+            freqs = self.freqs_cache.index_select(2, positions)
+        else:
+            freqs = self.freqs_cache
+        ori_shape = q.shape
+        bs, seq_len, _ = x_shape
+        q = q.view(bs, seq_len, -1, self.dim).permute(0, 2, 1, 3)
+        k = k.view(bs, seq_len, -1, self.dim).permute(0, 2, 1, 3)
+        q = apply_rotary_emb(freqs, q)
+        k = apply_rotary_emb(freqs, k)
+        q = q.permute(0, 2, 1, 3).reshape(ori_shape)
+        k = k.permute(0, 2, 1, 3).reshape(ori_shape)
+        return q, k
+
+
+class PerceptionEncoderLayerScale(nn.Module):
+    def __init__(self, dim, init_values=1e-5, inplace=False):
+        super().__init__()
+        self.inplace = inplace
+        self.gamma = nn.Parameter(init_values * torch.ones(dim))
+
+    def forward(self, x):
+        return x.mul_(self.gamma) if self.inplace else x * self.gamma
+
+
+class PerceptionEncoderMLP(nn.Module):
+    def __init__(
+        self,
+        input_dim: int,
+        hidden_dim: int,
+        act_layer: Callable[[], nn.Module],
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.fc1 = ColumnParallelLinear(
+            input_dim,
+            hidden_dim,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc1",
+        )
+        self.activation = act_layer
+        self.fc2 = RowParallelLinear(
+            hidden_dim,
+            input_dim,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.fc2",
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x, _ = self.fc1(x)
+        x = self.activation(x)
+        x, _ = self.fc2(x)
+        return x
+
+
+class PerceptionEncoderVisionBlock(nn.Module):
+    def __init__(
+        self,
+        d_model: int,
+        n_head: int,
+        max_grid_height: int,
+        max_grid_width: int,
+        mlp_ratio: float = 4.0,
+        ls_init_value: float = None,
+        act_layer: Callable = nn.GELU,
+        norm_layer: Callable = nn.LayerNorm,
+        use_cls_token: bool = False,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.head_dim = d_model // n_head
+        self.rope = PerceptionEncoderRope2D(
+            dim=self.head_dim,
+            max_grid_height=max_grid_height,
+            max_grid_width=max_grid_width,
+            use_cls_token=use_cls_token,
+        )
+        self.attn = VisionAttention(
+            embed_dim=d_model,
+            num_heads=n_head,
+            projection_size=d_model,
+            use_qkv_parallel=True,
+            proj_bias=True,
+            # flatten_batch=True,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+            customized_position_embedding_applier=self.rope,
+        )
+        self.ls_1 = (
+            PerceptionEncoderLayerScale(d_model, ls_init_value)
+            if ls_init_value is not None
+            else nn.Identity()
+        )
+        self.ls_2 = (
+            PerceptionEncoderLayerScale(d_model, ls_init_value)
+            if ls_init_value is not None
+            else nn.Identity()
+        )
+        self.ln_1 = norm_layer(d_model)
+        self.ln_2 = norm_layer(d_model)
+        hidden_dim = int(d_model * mlp_ratio)
+        self.mlp = PerceptionEncoderMLP(
+            d_model,
+            hidden_dim,
+            act_layer,
+            quant_config=quant_config,
+            prefix=f"{prefix}.mlp",
+        )
+
+    def forward(self, x: torch.Tensor, grid_hw: tuple[int, int]):
+        x = x + self.ls_1(self.attn(self.ln_1(x), position_embeddings=grid_hw))  # hacky
+        x = x + self.ls_2(self.mlp(self.ln_2(x)))
+        return x
+
+
+class PerceptionEncoderVisionTransformer(nn.Module):
+    def __init__(
+        self,
+        width: int,
+        layers: int,
+        heads: int,
+        max_grid_height: int,
+        max_grid_width: int,
+        mlp_ratio: float = 4.0,
+        ls_init_value: float = None,
+        act_layer: Callable = nn.GELU,
+        norm_layer: Callable = nn.LayerNorm,
+        use_cls_token: bool = False,
+        quant_config: QuantizationConfig | None = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.width = width
+        self.layers = layers
+        self.resblocks = nn.ModuleList(
+            [
+                PerceptionEncoderVisionBlock(
+                    d_model=width,
+                    n_head=heads,
+                    max_grid_height=max_grid_height,
+                    max_grid_width=max_grid_width,
+                    mlp_ratio=mlp_ratio,
+                    ls_init_value=ls_init_value,
+                    act_layer=act_layer,
+                    norm_layer=norm_layer,
+                    use_cls_token=use_cls_token,
+                    quant_config=quant_config,
+                    prefix=f"{prefix}.resblocks.{i}",
+                )
+                for i in range(layers)
+            ]
+        )
+
+    def forward(self, x: torch.Tensor, grid_hw: tuple[int, int]):
+        for block in self.resblocks:
+            x = block(x, grid_hw=grid_hw)
+        return x
+
+
+class PerceptionEncoder(nn.Module):
+    def __init__(
+        self,
+        config,
+        act_layer: Callable,
+        norm_layer: Callable = _DEFAULT_NORM_LAYER,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.patch_size = config.patch_size
+
+        self.output_dim = config.output_dim or config.width
+        self.heads = config.heads
+        self.width = config.width
+        self.layers = config.layers
+
+        self.use_abs_posemb = config.use_abs_posemb
+        self.use_cls_token = config.use_cls_token
+        self.use_rope2d = config.use_rope2d
+        if not self.use_rope2d:
+            raise ValueError("use_rope2d must be True")
+        self.image_size = config.image_size
+
+        self.conv1 = nn.Conv2d(
+            in_channels=3,
+            out_channels=config.width,
+            kernel_size=config.patch_size,
+            stride=config.patch_size,
+            bias=False,
+        )
+
+        self.ln_pre = norm_layer(config.width) if config.use_ln_pre else nn.Identity()
+        self.ln_post = norm_layer(self.width) if config.use_ln_post else nn.Identity()
+
+        self.transformer = PerceptionEncoderVisionTransformer(
+            config.width,
+            config.layers,
+            config.heads,
+            max_grid_height=self.image_size // self.patch_size,
+            max_grid_width=self.image_size // self.patch_size,
+            mlp_ratio=config.mlp_ratio,
+            ls_init_value=config.ls_init_value,
+            act_layer=act_layer,
+            norm_layer=norm_layer,
+            use_cls_token=self.use_cls_token,
+            quant_config=quant_config,
+            prefix=f"{prefix}.transformer",
+        )
+
+        self.vit_downsampler1 = nn.Conv2d(
+            config.width, config.width * 2, kernel_size=3, stride=2, padding=1
+        )
+        self.vit_downsampler2 = nn.Conv2d(
+            config.width * 2, config.width * 4, kernel_size=3, stride=2, padding=1
+        )
+
+        if self.use_cls_token:
+            self.class_embedding = nn.Parameter(
+                (self.width**-0.5) * torch.randn(self.width)
+            )
+
+        if self.use_abs_posemb:
+            self.posemb_grid_size = self.image_size // self.patch_size
+            self.positional_embedding = nn.Parameter(
+                (self.width**-0.5)
+                * torch.randn(
+                    int(self.use_cls_token) + self.posemb_grid_size**2,
+                    self.width,
+                )
+            )
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return self.conv1.weight.dtype
+
+    def sample_abs_posemb(self, grid_h: int, grid_w: int):
+        if self.posemb_grid_size == grid_h and self.posemb_grid_size == grid_w:
+            return self.positional_embedding[None, ...]
+
+        pos_embed = self.positional_embedding
+        if self.use_cls_token:
+            cls_token_embed, pos_embed = pos_embed[:1], pos_embed[1:]
+
+        pos_embed = (
+            pos_embed.reshape(1, self.posemb_grid_size, self.posemb_grid_size, -1)
+            .permute(0, 3, 1, 2)
+            .contiguous()
+        )
+        pos_embed = F.interpolate(
+            pos_embed, size=(grid_h, grid_w), mode="bilinear", align_corners=False
+        )
+        pos_embed = pos_embed.permute(0, 2, 3, 1).reshape(-1, self.width)
+
+        if self.use_cls_token:
+            pos_embed = torch.cat([cls_token_embed, pos_embed], dim=0)
+
+        return pos_embed[None, ...]
+
+    def forward_features(self, x: torch.Tensor):
+        batch, _, h, w = x.shape
+        grid_h, grid_w = h // self.patch_size, w // self.patch_size
+
+        x = self.conv1(x)
+        x = x.permute(0, 2, 3, 1).reshape(batch, -1, self.width)
+
+        if self.use_cls_token:
+            x = torch.cat(
+                [self.class_embedding.view(1, 1, -1).expand(batch, -1, -1), x], dim=1
+            )
+
+        if self.use_abs_posemb:
+            x = x + self.sample_abs_posemb(grid_h, grid_w)
+
+        x = self.ln_pre(x)
+        x = self.transformer(x, grid_hw=(grid_h, grid_w))
+        x = self.ln_post(x)
+
+        if self.use_cls_token:
+            x = x[:, 1:, :]
+
+        return x
+
+    def forward(self, x: torch.Tensor):
+        x = self.forward_features(x)
+        B, P, C = x.shape
+        T = int(P**0.5)
+        x = x.transpose(2, 1).contiguous()
+        x = x.view(B, C, T, T)
+
+        x = self.vit_downsampler1(x)
+        x = self.vit_downsampler2(x)
+
+        B, C, T, T = x.shape
+        return x.view(B, -1, T * T).transpose(1, 2)
+
+
+class StepVLForConditionalGeneration(nn.Module):
+    def __init__(
+        self,
+        config: Step3VLConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.config = config
+        self.vision_model = PerceptionEncoder(
+            config.vision_config,
+            ACT2FN[config.vision_config.hidden_act],
+            quant_config=quant_config,
+            prefix=add_prefix(prefix, "vision_model"),
+        )
+        self.vit_large_projector = ColumnParallelLinear(
+            config.vision_config.width * 4,
+            config.text_config.hidden_size,
+            bias=config.projector_bias,
+            gather_output=True,
+            quant_config=quant_config,
+            prefix=add_prefix(prefix, "vit_large_projector"),
+        )
+
+        self.language_model = Qwen3ForCausalLM(
+            config=config.text_config,
+            quant_config=quant_config,
+            prefix=add_prefix(prefix, "language_model"),
+        )
+
+    def _get_vision_model_output(self, input_tensor: torch.Tensor) -> torch.Tensor:
+        return self.vision_model(input_tensor)
+
+    @property
+    def device(self) -> torch.device:
+        return self.vit_large_projector.weight.device
+
+    def _flatten_embeddings(self, embeddings) -> torch.Tensor:
+
+        if isinstance(embeddings, torch.Tensor):
+            # Flatten all but the last dimension.
+            return embeddings.flatten(0, -2)
+
+        return torch.cat(tuple(self._flatten_embeddings(t) for t in embeddings))
+
+    def _process_image_features(self, image_features: torch.Tensor) -> torch.Tensor:
+        image_features, _ = self.vit_large_projector(image_features)
+        return image_features
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        assert len(items) == 1  # We only have images.
+
+        item = items[0]
+        pixel_values = item.feature.type(self.vision_model.dtype).to(self.device)
+        num_patches = item.model_specific_data.get("num_patches")
+        patch_pixel_values = item.model_specific_data.get("patch_pixel_values", None)
+        if patch_pixel_values is not None:
+            patch_pixel_values = patch_pixel_values.type(self.vision_model.dtype).to(
+                self.device
+            )
+
+        image_features = self._get_vision_model_output(pixel_values)
+        patch_image_features = (
+            self._get_vision_model_output(patch_pixel_values)
+            if patch_pixel_values is not None
+            else None
+        )
+        image_features = self._process_image_features(image_features)
+        patch_image_features = (
+            self._process_image_features(patch_image_features)
+            if patch_image_features is not None
+            else None
+        )
+        merged_image_features = []
+        cur_patch_idx = 0
+        for i, num_patch in enumerate(num_patches):
+            cur_feature = []
+            if num_patch > 0:
+                patch_slice = patch_image_features[
+                    cur_patch_idx : cur_patch_idx + num_patch
+                ]
+                cur_feature.append(patch_slice.view(-1, patch_slice.shape[-1]))
+            cur_feature.append(image_features[i].view(-1, image_features.shape[-1]))
+            cur_patch_idx += num_patch
+            merged_image_features.append(
+                torch.cat(cur_feature) if len(cur_feature) > 1 else cur_feature[0]
+            )
+        return self._flatten_embeddings(merged_image_features)
+
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+        return pattern.pad_input_tokens(input_ids, mm_inputs)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        get_embedding: bool = False,
+    ):
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
+            forward_batch=forward_batch,
+            language_model=self.language_model,
+            data_embedding_funcs={
+                Modality.IMAGE: self.get_image_feature,
+            },
+            positions=positions,
+        )
+
+        return hidden_states
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        """Load weights for the model, separating vision and language weights"""
+        weights = list(weights)
+
+        # Separate vision tower weights and language model weights
+        vision_weights = []
+        language_weights = []
+
+        for name, loaded_weight in weights:
+            if "vision_model" in name or "vit_large_projector" in name:
+                name = name.replace(r".attn.in_proj_weight", r".attn.qkv_proj.weight")
+                name = name.replace(r".attn.in_proj_bias", r".attn.qkv_proj.bias")
+                name = name.replace(r".attn.out_proj.bias", r".attn.proj.bias")
+                name = name.replace(r".attn.out_proj.weight", r".attn.proj.weight")
+                name = name.replace(".mlp.c_fc", ".mlp.fc1")
+                name = name.replace(".mlp.c_proj", ".mlp.fc2")
+                vision_weights.append((name, loaded_weight))
+            else:
+                # All other weights go to language model
+                language_weights.append((name, loaded_weight))
+
+        # Load vision tower weights
+        vision_state_dict = dict(vision_weights)
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        for name, loaded_weight in vision_state_dict.items():
+            if name not in params_dict:
+                raise ValueError(f"Weight {name} not found in params_dict")
+            param = params_dict[name]
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            # loaded_weight = self._pad_vit_attn_dummy_heads(name, loaded_weight)
+            weight_loader(param, loaded_weight)
+
+        # Load language model weights
+        if language_weights:
+            self.language_model.load_weights(language_weights)
+
+
+EntryClass = StepVLForConditionalGeneration
diff --git a/sglang/python/sglang/srt/models/step3p5.py b/sglang/python/sglang/srt/models/step3p5.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3f82b9169a003245499dbca1f797007b65346d0
--- /dev/null
+++ b/sglang/python/sglang/srt/models/step3p5.py
@@ -0,0 +1,1037 @@
+import logging
+import os
+from typing import Any, Dict, Iterable, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from sglang.srt.distributed import (
+    get_moe_expert_parallel_world_size,
+    get_pp_group,
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
+from sglang.srt.eplb.expert_location_dispatch import ExpertLocationDispatchInfo
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.layernorm import GemmaRMSNorm
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe import (
+    get_moe_a2a_backend,
+    should_use_flashinfer_cutlass_moe_fp4_allgather,
+)
+from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
+from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
+from sglang.srt.layers.moe.topk import StandardTopKOutput, TopK
+from sglang.srt.layers.moe.utils import (
+    RoutingMethodType,
+    filter_moe_weight_param_global_expert,
+)
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.utils import PPMissingLayer
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import add_prefix, is_cuda, is_non_idle_and_non_empty, make_layers
+
+Step3p5Config = None
+
+logger = logging.getLogger(__name__)
+_is_cuda = is_cuda()
+
+
+class Step3p5MLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        swiglu_limit: Optional[float] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        self.act_fn = SiluAndMul()
+        self.limit = swiglu_limit
+
+    def forward(self, x):
+        if self.limit is not None:
+            gate_up, _ = self.gate_up_proj(x)
+            gate, up = gate_up.chunk(2, dim=-1)
+            gate = F.silu(gate)
+            gate = gate.clamp(min=None, max=self.limit)
+            up = up.clamp(min=-self.limit, max=self.limit)
+            output, _ = self.down_proj(gate * up)
+        else:
+            gate_up, _ = self.gate_up_proj(x)
+            x = self.act_fn(gate_up)
+            output, _ = self.down_proj(x)
+        return output
+
+
+class Step3p5MoEMLP(nn.Module):
+    def __init__(
+        self,
+        config,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.layer_id = layer_id
+
+        self.need_fp32_gate = config.need_fp32_gate
+        self.routed_scaling_factor = config.moe_router_scaling_factor
+        self.use_moe_router_bias = config.use_moe_router_bias
+        if self.use_moe_router_bias:
+            self.router_bias = nn.Parameter(
+                torch.zeros(config.moe_num_experts, dtype=torch.float32),
+                requires_grad=False,
+            )
+
+        if self.tp_size > config.moe_num_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {config.moe_num_experts}."
+            )
+
+        self.limit = config.swiglu_limits[layer_id]
+        self.limit = self.limit if self.limit > 0 else None
+
+        self.topk = TopK(
+            top_k=config.moe_top_k,
+            renormalize=True,
+            use_grouped_topk=False,
+            scoring_func="sigmoid",
+            correction_bias=self.router_bias,
+            apply_routed_scaling_factor_on_output=False,
+            layer_id=layer_id,
+        )
+
+        self.experts = get_moe_impl_class(quant_config)(
+            num_experts=config.moe_num_experts
+            + get_global_server_args().ep_num_redundant_experts,
+            top_k=config.moe_top_k,
+            layer_id=layer_id,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            quant_config=quant_config,
+            prefix=add_prefix("experts", prefix),
+            routing_method_type=RoutingMethodType.Renormalize,
+            gemm1_clamp_limit=self.limit,
+        )
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size,
+            config.moe_num_experts,
+            bias=False,
+            quant_config=None,
+            prefix=add_prefix("gate", prefix),
+        )
+
+        if get_moe_a2a_backend().is_deepep():
+            # TODO: we will support tp < ep in the future
+            self.ep_size = get_moe_expert_parallel_world_size()
+            self.moe_num_experts = (
+                config.moe_num_experts
+                + get_global_server_args().ep_num_redundant_experts
+            )
+            self.top_k = config.moe_top_k
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: Optional[ForwardBatch] = None,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+    ) -> torch.Tensor:
+
+        if (
+            not get_moe_a2a_backend().is_deepep()
+            and not get_moe_a2a_backend().is_ascend_fuseep()
+        ):
+            return self.forward_normal(
+                hidden_states, should_allreduce_fusion, use_reduce_scatter
+            )
+        else:
+            return self.forward_deepep(hidden_states, forward_batch)
+
+    def get_moe_weights(self):
+        return [
+            x.data
+            for name, x in self.experts.named_parameters()
+            if name not in ["correction_bias"]
+            and filter_moe_weight_param_global_expert(
+                name, x, self.experts.num_local_experts
+            )
+        ]
+
+    def forward_normal(
+        self,
+        hidden_states: torch.Tensor,
+        should_allreduce_fusion: bool = False,
+        use_reduce_scatter: bool = False,
+    ) -> torch.Tensor:
+        num_tokens, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+        # router_logits: (num_tokens, n_experts)
+        if self.need_fp32_gate:
+            router_logits = torch.matmul(
+                hidden_states.to(torch.float32), self.gate.weight.t().to(torch.float32)
+            )
+        else:
+            # router_logits: (batch * sequence_length, n_experts)
+            router_logits, _ = self.gate(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        if self.routed_scaling_factor != 1.0:
+            topk_output = StandardTopKOutput(
+                topk_weights=topk_output.topk_weights * self.routed_scaling_factor,
+                topk_ids=topk_output.topk_ids,
+                router_logits=topk_output.router_logits,
+            )
+        final_hidden_states = self.experts(hidden_states, topk_output)
+        if (
+            self.tp_size > 1
+            and not should_allreduce_fusion
+            and not use_reduce_scatter
+            and not should_use_flashinfer_cutlass_moe_fp4_allgather()
+        ):
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+
+        return final_hidden_states.view(num_tokens, hidden_dim)
+
+    def forward_deepep(
+        self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
+    ) -> torch.Tensor:
+        if hidden_states.shape[0] > 0:
+            # router_logits: (num_tokens, n_experts)
+            router_logits, _ = self.gate(hidden_states)
+            topk_output = self.topk(
+                hidden_states,
+                router_logits,
+                num_token_non_padded=forward_batch.num_token_non_padded,
+                expert_location_dispatch_info=ExpertLocationDispatchInfo.init_new(
+                    layer_id=self.layer_id,
+                ),
+            )
+        else:
+            topk_output = self.topk.empty_topk_output(hidden_states.device)
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states,
+            topk_output=topk_output,
+        )
+        return final_hidden_states
+
+    def op_gate(self, state):
+        if is_non_idle_and_non_empty(
+            state.forward_batch.forward_mode, state.hidden_states_mlp_input
+        ):
+            # router_logits: (num_tokens, n_experts)
+            state.router_logits, _ = self.gate(state.hidden_states_mlp_input)
+        else:
+            state.router_logits = None
+
+    def op_select_experts(self, state):
+        router_logits = state.pop("router_logits")
+        hidden_states = state.hidden_states_mlp_input
+        if router_logits is not None:
+            with get_global_expert_distribution_recorder().with_current_layer(
+                self.layer_id
+            ):
+                state.topk_output = self.topk(
+                    hidden_states=hidden_states,
+                    router_logits=router_logits,
+                    num_token_non_padded=state.forward_batch.num_token_non_padded,
+                    expert_location_dispatch_info=ExpertLocationDispatchInfo.init_new(
+                        layer_id=self.layer_id,
+                    ),
+                )
+        else:
+            state.topk_output = self.topk.empty_topk_output(hidden_states.device)
+
+    def op_dispatch_a(self, state):
+        if self.ep_size > 1:
+            self.experts.dispatcher.dispatch_a(
+                hidden_states=state.pop("hidden_states_mlp_input"),
+                topk_output=state.pop("topk_output"),
+                tbo_subbatch_index=state.get("tbo_subbatch_index"),
+            )
+
+    def op_dispatch_b(self, state):
+        if self.ep_size > 1:
+            with get_global_expert_distribution_recorder().with_current_layer(
+                self.layer_id
+            ):
+                state.dispatch_output = self.experts.dispatcher.dispatch_b(
+                    tbo_subbatch_index=state.get("tbo_subbatch_index"),
+                )
+
+    def op_experts(self, state):
+        state.combine_input = self.experts.run_moe_core(
+            dispatch_output=state.dispatch_output,
+        )
+
+    def op_combine_a(self, state):
+        if self.ep_size > 1:
+            self.experts.dispatcher.combine_a(
+                combine_input=state.pop("combine_input"),
+                tbo_subbatch_index=state.get("tbo_subbatch_index"),
+            )
+            state.pop("dispatch_output")
+
+    def op_combine_b(self, state):
+        if self.ep_size > 1:
+            state.hidden_states_after_combine = self.experts.dispatcher.combine_b(
+                tbo_subbatch_index=state.get("tbo_subbatch_index"),
+            )
+
+    def op_output(self, state):
+        state.hidden_states_mlp_output = state.pop("hidden_states_after_combine")
+
+
+class Step3p5Attention(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 1000000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        head_dim: Optional[int] = None,
+        max_position_embeddings: int = 32768,
+        quant_config: Optional[QuantizationConfig] = None,
+        rms_norm_eps: float = None,
+        partial_rotary_factor: float = 1.0,
+        use_head_wise_attn_gate: bool = False,
+        sliding_window_size: int = -1,  # if is -1 ,normal attention,else ,window attention
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        attn_tp_rank = get_attention_tp_rank()
+        attn_tp_size = get_attention_tp_size()
+
+        assert self.total_num_heads % attn_tp_size == 0
+        self.num_heads = self.total_num_heads // attn_tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= attn_tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % attn_tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert attn_tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // attn_tp_size)
+        self.head_dim = head_dim or hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+        self.tp_rank = get_tensor_model_parallel_rank()
+        self.q_norm = GemmaRMSNorm(self.head_dim, eps=rms_norm_eps)
+        self.k_norm = GemmaRMSNorm(self.head_dim, eps=rms_norm_eps)
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            tp_rank=attn_tp_rank,
+            tp_size=attn_tp_size,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.use_head_wise_attn_gate = use_head_wise_attn_gate
+        if self.use_head_wise_attn_gate:
+            self.g_proj = ColumnParallelLinear(
+                hidden_size,
+                self.total_num_heads,
+                bias=False,
+                tp_rank=attn_tp_rank,
+                tp_size=attn_tp_size,
+                prefix=add_prefix("g_proj", prefix),
+            )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            partial_rotary_factor=partial_rotary_factor,
+            is_neox_style=True,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            sliding_window_size=sliding_window_size,  # if is -1 ,normal attention,else ,window attention
+            layer_id=layer_id,
+            prefix=add_prefix("attn", prefix),
+        )
+        self.alt_stream = alt_stream
+
+    def forward_prepare_native(self, positions, hidden_states):
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q_shape, k_shape = q.shape, k.shape
+        q = self.q_norm(q.reshape(-1, self.head_dim)).reshape(q_shape)
+        k = self.k_norm(k.reshape(-1, self.head_dim)).reshape(k_shape)
+        q, k = self.rotary_emb(positions, q, k)
+        return q, k, v
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+
+        q, k, v = self.forward_prepare_native(
+            positions=positions,
+            hidden_states=hidden_states,
+        )
+        if self.use_head_wise_attn_gate:
+            gate_states, _ = self.g_proj(hidden_states)
+        attn_output = self.attn(q, k, v, forward_batch)
+        if self.use_head_wise_attn_gate:
+            output = (
+                attn_output.view(
+                    attn_output.shape[0],
+                    self.num_heads,  # TODO: check if this is correct
+                    self.head_dim,
+                )
+                * gate_states.unsqueeze(-1).sigmoid()
+            )
+            attn_output = output.view(*attn_output.shape)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class Step3p5DecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: Step3p5Config,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+        alt_stream: Optional[torch.cuda.Stream] = None,
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        layer_types = config.layer_types
+        yarn_only_types = config.yarn_only_types
+        if layer_types[layer_id] not in yarn_only_types:
+            rope_scaling = None
+        else:
+            rope_scaling = config.rope_scaling
+        rope_theta = config.rope_theta
+        max_position_embeddings = config.max_position_embeddings
+        head_dim = config.head_dim
+        moe_layers_list = [int(x) for x in config.moe_layers_enum.split(",")]
+        self.num_attention_heads = config.num_attention_heads
+        self.num_key_value_heads = config.num_attention_groups
+        self.is_moe_layer = layer_id in moe_layers_list
+        num_hidden_layers = config.num_hidden_layers
+
+        if (
+            config.swiglu_limits_shared
+            and config.swiglu_limits_shared[layer_id] is not None
+            and config.swiglu_limits_shared[layer_id] != 0
+        ):
+            swiglu_limit_shared = config.swiglu_limits_shared[layer_id]
+        else:
+            swiglu_limit_shared = None
+
+        self.sliding_window = -1
+
+        enable_sliding_window = layer_types[layer_id] == "sliding_attention"
+
+        if enable_sliding_window:
+            self.sliding_window = config.sliding_window
+            self.num_attention_heads = config.attention_other_setting[
+                "num_attention_heads"
+            ]
+            self.num_key_value_heads = config.attention_other_setting[
+                "num_attention_groups"
+            ]
+
+        self.self_attn = Step3p5Attention(
+            hidden_size=self.hidden_size,
+            num_heads=self.num_attention_heads,
+            num_kv_heads=self.num_key_value_heads,
+            layer_id=(
+                layer_id
+                if layer_id < num_hidden_layers
+                else layer_id - num_hidden_layers
+            ),
+            rope_theta=rope_theta[layer_id],
+            rope_scaling=rope_scaling,
+            head_dim=head_dim,
+            max_position_embeddings=max_position_embeddings,
+            sliding_window_size=self.sliding_window,
+            partial_rotary_factor=config.partial_rotary_factors[layer_id],
+            quant_config=quant_config,
+            rms_norm_eps=config.rms_norm_eps,
+            use_head_wise_attn_gate=config.use_head_wise_attn_gate,
+            prefix=add_prefix("self_attn", prefix),
+            alt_stream=alt_stream,
+        )
+        self.use_moe = False
+        if self.is_moe_layer:
+            self.moe = Step3p5MoEMLP(
+                config,
+                layer_id=layer_id,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+            )
+            self.share_expert = Step3p5MLP(
+                hidden_size=self.hidden_size,
+                intermediate_size=config.share_expert_dim,
+                swiglu_limit=swiglu_limit_shared,
+                quant_config=quant_config,
+                prefix=add_prefix("share_expert", prefix),
+            )
+            self.use_moe = True
+        else:
+            self.mlp = Step3p5MLP(
+                hidden_size=self.hidden_size,
+                intermediate_size=config.intermediate_size,
+                swiglu_limit=swiglu_limit_shared,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+            )
+
+        self.input_layernorm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = GemmaRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+        self.layer_scatter_modes = LayerScatterModes.init_new(
+            layer_id=layer_id,
+            num_layers=(
+                config.num_hidden_layers if layer_id < config.num_hidden_layers else 1
+            ),  # 1 is for mtp
+            is_layer_sparse=False,
+            is_previous_layer_sparse=False,
+            is_next_layer_sparse=False,
+        )
+        self.layer_communicator = LayerCommunicator(
+            layer_scatter_modes=self.layer_scatter_modes,
+            input_layernorm=self.input_layernorm,
+            post_attention_layernorm=self.post_attention_layernorm,
+        )
+
+        self.layer_id = layer_id
+        self.dump_intermediate = (
+            os.environ.get("SGLANG_DUMP_STEP3P5_INTERMEDIATE") == "1"
+        )
+        self._dump_step = 0
+
+    def _dump_tensor(
+        self,
+        name: str,
+        tensor: Optional[torch.Tensor],
+        step_id: Optional[int] = None,
+    ) -> None:
+        if not self.dump_intermediate or tensor is None or not torch.is_tensor(tensor):
+            return
+        dump_dir = "/sgl-workspace/sgl"
+        try:
+            os.makedirs(dump_dir, exist_ok=True)
+            tp_rank = get_tensor_model_parallel_rank()
+            step_part = f"_step{step_id}" if step_id is not None else ""
+            path = os.path.join(
+                dump_dir,
+                f"step3p5_layer{self.layer_id}{step_part}_{name}_tp{tp_rank}.pt",
+            )
+            torch.save(tensor.detach().cpu(), path)
+        except Exception:
+            logger.exception(
+                "Failed to dump tensor %s for layer %s", name, self.layer_id
+            )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+        post_residual_addition: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        hidden_states, residual = self.layer_communicator.prepare_attn(
+            hidden_states,
+            residual,
+            forward_batch,
+            post_residual_addition=post_residual_addition,
+        )
+        dump_step = None
+        if self.dump_intermediate:
+            dump_step = self._dump_step
+            self._dump_step += 1
+            self._dump_tensor("attn_input", hidden_states, dump_step)
+        if hidden_states.shape[0] != 0:
+            hidden_states = self.self_attn(
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+            )
+        self._dump_tensor("attn_output", hidden_states, dump_step)
+        # Fully Connected
+        # hidden_states, residual = self.layer_communicator.prepare_mlp(
+        #     hidden_states,
+        #     residual,
+        #     forward_batch,
+        # )
+        hidden_states = residual + hidden_states
+        residual = hidden_states
+        self._dump_tensor("post_attn_residual", hidden_states, dump_step)
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        self._dump_tensor("mlp_input", hidden_states, dump_step)
+        if self.use_moe:
+            share_output = self.share_expert(hidden_states)
+            moe_output = self.moe(hidden_states)
+            hidden_states = moe_output + share_output
+        else:
+            hidden_states = self.mlp(hidden_states)
+        self._dump_tensor("mlp_output", hidden_states, dump_step)
+        hidden_states, residual = self.layer_communicator.postprocess_layer(
+            hidden_states, residual, forward_batch
+        )
+        self._dump_tensor("layer_output", hidden_states, dump_step)
+        return hidden_states, residual
+
+
+class Step3p5Model(nn.Module):
+    def __init__(
+        self,
+        config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.pp_group = get_pp_group()
+
+        alt_stream = torch.cuda.Stream() if _is_cuda else None
+
+        if self.pp_group.is_first_rank:
+            self.embed_tokens = VocabParallelEmbedding(
+                config.vocab_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                enable_tp=not is_dp_attention_enabled(),
+                prefix=add_prefix("embed_tokens", prefix),
+                params_dtype=(
+                    torch.float32
+                    if get_global_server_args().rl_on_policy_target is not None
+                    else None
+                ),
+            )
+        else:
+            self.embed_tokens = PPMissingLayer()
+
+        self.layers, self.start_layer, self.end_layer = make_layers(
+            config.num_hidden_layers,
+            # 1,
+            lambda idx, prefix: Step3p5DecoderLayer(
+                layer_id=idx,
+                config=config,
+                quant_config=quant_config,
+                prefix=prefix,
+                alt_stream=alt_stream,
+            ),
+            pp_rank=self.pp_group.rank_in_group,
+            pp_size=self.pp_group.world_size,
+            prefix=add_prefix("layers", prefix),
+        )
+        if self.pp_group.is_last_rank:
+            self.norm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        else:
+            self.norm = PPMissingLayer(return_tuple=True)
+
+    def get_input_embedding(self, input_ids: torch.Tensor) -> torch.Tensor:
+        if hasattr(self.config, "scale_emb"):
+            return self.get_input_embeddings()(input_ids) * self.config.scale_emb
+        else:
+            return self.get_input_embeddings()(input_ids)
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.embed_tokens
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[torch.Tensor, PPProxyTensors]:
+        if self.pp_group.is_first_rank:
+            if input_embeds is None:
+                hidden_states = self.embed_tokens(input_ids)
+            else:
+                hidden_states = input_embeds
+            residual = None
+        else:
+            assert pp_proxy_tensors is not None
+            hidden_states = pp_proxy_tensors["hidden_states"]
+            residual = pp_proxy_tensors["residual"]
+
+        for i in range(self.start_layer, self.end_layer):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+            # break
+        if not self.pp_group.is_last_rank:
+            return PPProxyTensors(
+                {
+                    "hidden_states": hidden_states,
+                    "residual": residual,
+                }
+            )
+        else:
+            hidden_states_before_norm = None
+            if not self.pp_group.is_last_rank:
+                return PPProxyTensors(
+                    {
+                        "hidden_states": hidden_states,
+                        "residual": residual,
+                    }
+                )
+            else:
+                if hidden_states.shape[0] > 0:
+                    # if forward_batch.return_hidden_states_before_norm:
+                    hidden_states_before_norm = (
+                        hidden_states if residual is None else hidden_states + residual
+                    )
+                    if residual is None:
+                        hidden_states = self.norm(hidden_states)
+                    else:
+                        hidden_states, _ = self.norm(hidden_states, residual)
+            return hidden_states, hidden_states_before_norm
+
+
+class Step3p5ForCausalLM(nn.Module):
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    def __init__(
+        self,
+        config: Step3p5Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.pp_group = get_pp_group()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = Step3p5Model(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+
+        self.tie_word_embeddings = False
+        self.num_fused_shared_experts = 0
+
+        # handle the lm head on different pp ranks
+        if self.pp_group.is_last_rank:
+            if self.pp_group.world_size == 1 and self.tie_word_embeddings:
+                self.lm_head = self.model.embed_tokens
+            else:
+                self.lm_head = ParallelLMHead(
+                    config.vocab_size,
+                    config.hidden_size,
+                    quant_config=quant_config,
+                    use_attn_tp_group=get_global_server_args().enable_dp_lm_head,
+                    prefix=add_prefix("lm_head", prefix),
+                )
+        else:
+            # ranks other than the last rank will have a placeholder layer
+            self.lm_head = PPMissingLayer()
+
+        # perform weight tying for PP
+        if self.pp_group.world_size > 1 and self.tie_word_embeddings:
+            if self.pp_group.is_first_rank:
+                self.pp_group.send(
+                    self.model.embed_tokens.weight, dst=self.pp_group.world_size - 1
+                )
+            elif self.pp_group.is_last_rank:
+                emb_token_weight = self.pp_group.recv(
+                    size=self.lm_head.weight.shape,
+                    dtype=next(self.model.parameters()).dtype,
+                    src=0,
+                )
+                self.lm_head.weight.copy_(emb_token_weight)
+
+        self.logits_processor = LogitsProcessor(config)
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.get_input_embeddings()
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> torch.Tensor:
+        hidden_states, hidden_states_before_norm = self.model(
+            input_ids,
+            positions,
+            forward_batch,
+            input_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+
+        if self.pp_group.is_last_rank:
+            return self.logits_processor(
+                input_ids,
+                hidden_states,
+                self.lm_head,
+                forward_batch,
+                hidden_states_before_norm=hidden_states_before_norm,
+            )
+        else:
+            return hidden_states
+
+    @property
+    def start_layer(self):
+        return self.model.start_layer
+
+    @property
+    def end_layer(self):
+        return self.model.end_layer
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]], is_nextn=False):
+        # NOTE:
+        # Step3p5 HF checkpoints (e.g. MTP/nextn variants) may include an extra
+        # "nextn predict layer" appended after the main decoder layers, such as:
+        #   model.layers.<num_hidden_layers>.(eh_proj|enorm|hnorm|transformer.shared_head.*)
+        # This implementation currently does NOT instantiate those nextn modules,
+        # so we must safely skip them (or load them only when a corresponding
+        # nextn model is implemented).
+
+        def _get_layer_id_from_weight_name(weight_name: str) -> Optional[int]:
+            # Expected format: "model.layers.<id>...."
+            parts = weight_name.split(".")
+            if len(parts) >= 3 and parts[0] == "model" and parts[1] == "layers":
+                try:
+                    return int(parts[2])
+                except ValueError:
+                    return None
+            return None
+
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+
+        if self.num_fused_shared_experts > 0:
+            assert self.num_fused_shared_experts == 1
+
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
+            ckpt_gate_proj_name="gate_proj",
+            ckpt_down_proj_name="down_proj",
+            ckpt_up_proj_name="up_proj",
+            num_experts=self.config.moe_num_experts + self.num_fused_shared_experts,
+        )
+
+        params_dict = dict(self.named_parameters())
+        loaded_params = set()
+
+        def match_expert_and_shard_ids(name_path: str, weight_path: str) -> bool:
+            name_parts = name_path.split(".")
+            weight_parts = weight_path.split(".")
+            # Be defensive: some unexpected weight names may not match the shape.
+            if len(name_parts) <= 4 or len(weight_parts) <= 2:
+                return False
+            shard_id_matches = name_parts[4] == weight_parts[2]
+            return shard_id_matches
+
+        for name, loaded_weight in weights:
+            # Filter nextn layer weights.
+            if hasattr(self.config, "num_nextn_predict_layers"):
+                num_nextn_layers = getattr(self.config, "num_nextn_predict_layers", 0)
+                if num_nextn_layers and name.startswith("model.layers."):
+                    layer_id = _get_layer_id_from_weight_name(name)
+                    if layer_id is not None:
+                        if not is_nextn:
+                            # Normal load: skip layers appended after the main decoder.
+                            if layer_id >= self.config.num_hidden_layers:
+                                continue
+                        else:
+                            # nextn load: only keep the appended nextn layer.
+                            # (Only 1 nextn layer is supported by current checkpoints.)
+                            if num_nextn_layers != 1:
+                                raise ValueError(
+                                    "Only 1 nextn layer is supported for Step3p5 checkpoints."
+                                )
+                            nextn_layer_id = (
+                                0
+                                if self.config.num_hidden_layers == 1
+                                else self.config.num_hidden_layers
+                            )
+                            if layer_id != nextn_layer_id:
+                                # # nextn/MTP load: only keep the appended nextn layers.
+                                # # Expected layer ids: [num_hidden_layers, num_hidden_layers + num_nextn_layers).
+                                # start = self.config.num_hidden_layers
+                                # end = self.config.num_hidden_layers + num_nextn_layers
+                                # if not (start <= layer_id < end):
+                                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                if "gate." not in name and "moe" in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                if name not in params_dict:
+                    # Extra / unsupported weights (e.g. nextn) should not crash loading.
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                loaded_params.add(name)
+                break
+            else:
+                if "moe" not in name or "router_bias" in name:
+                    if name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+                    loaded_params.add(name)
+                else:
+                    if "gate." in name:
+                        if name not in params_dict:
+                            continue
+                        param = params_dict[name]
+                        weight_loader = param.weight_loader
+                        weight_loader(param, loaded_weight)
+                        loaded_params.add(name)
+                        continue
+
+                    for mapping in expert_params_mapping:
+                        param_name, weight_name, expert_id, shard_id = mapping
+                        if expert_id == self.config.moe_num_experts:
+                            continue
+                        if not match_expert_and_shard_ids(name, weight_name):
+                            continue
+                        part_name = weight_name.split(".")[-2]
+                        fake_weight_name = name.replace(part_name, weight_name[:-1])
+                        actual_param_name = name.replace(part_name + ".", param_name)
+                        if actual_param_name not in params_dict:
+                            continue
+                        param = params_dict[actual_param_name]
+                        weight_loader = param.weight_loader
+                        weight_loader(
+                            param,
+                            loaded_weight[expert_id],
+                            name,
+                            shard_id=shard_id,
+                            expert_id=expert_id,
+                        )
+                        loaded_params.add(actual_param_name)
+
+        print_params = set(params_dict.keys()) - loaded_params
+        assert len(print_params) == 0, f"Some parameters are not loaded: {print_params}"
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        del self.lm_head.weight
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+
+EntryClass = Step3p5ForCausalLM
diff --git a/sglang/python/sglang/srt/models/step3p5_mtp.py b/sglang/python/sglang/srt/models/step3p5_mtp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c52c8384719eb01a69cdbc4fa5aa2caf04c2f102
--- /dev/null
+++ b/sglang/python/sglang/srt/models/step3p5_mtp.py
@@ -0,0 +1,336 @@
+import logging
+from collections.abc import Iterable
+from typing import Optional
+
+import torch
+import torch.nn as nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.layernorm import GemmaRMSNorm
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.step3p5 import Step3p5DecoderLayer, Step3p5ForCausalLM
+from sglang.srt.utils import add_prefix
+
+logger = logging.getLogger(__name__)
+
+
+def get_spec_layer_idx_from_weight_name(
+    config: PretrainedConfig, weight_name: str
+) -> Optional[int]:
+    """Return MTP/nextn layer index if this weight belongs to spec layers.
+
+    Step3p5 MTP/nextn checkpoints append extra layers after the main decoder:
+      model.layers.[num_hidden_layers ... num_hidden_layers + num_nextn_predict_layers)
+    """
+    if hasattr(config, "num_nextn_predict_layers") and (
+        getattr(config, "num_nextn_predict_layers", 0) > 0
+    ):
+        base = config.num_hidden_layers
+        for i in range(config.num_nextn_predict_layers):
+            if weight_name.startswith(f"model.layers.{base + i}."):
+                return base + i
+    return None
+
+
+class SharedHead(nn.Module):
+
+    def __init__(
+        self,
+        config,
+        quant_config=None,
+    ) -> None:
+        super().__init__()
+        self.norm = GemmaRMSNorm(config.hidden_size, config.rms_norm_eps)
+        self.head = ParallelLMHead(
+            config.vocab_size, config.hidden_size, quant_config=quant_config
+        )
+        self.lm_head = self.head
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        return self.norm(hidden_states)
+
+
+class Step3p5AMultiTokenPredictor(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+        )
+        self.mtp_start_layer_idx = config.num_hidden_layers
+        self.num_mtp_layers = config.num_nextn_predict_layers
+
+        layer_id = 45  # FIXME
+
+        self.enorm = GemmaRMSNorm(config.hidden_size, config.rms_norm_eps)
+        self.hnorm = GemmaRMSNorm(config.hidden_size, config.rms_norm_eps)
+        self.eh_proj = nn.Linear(config.hidden_size * 2, config.hidden_size, bias=False)
+        self.shared_head = SharedHead(config=config, quant_config=quant_config)
+        self.mtp_block = Step3p5DecoderLayer(
+            config=config, layer_id=layer_id, prefix=f"{prefix}.mtp_block"
+        )
+        self.lm_head = self.shared_head.head
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+
+        if hidden_states.shape[0] > 0:
+            hidden_states = self.eh_proj(
+                torch.cat(
+                    (
+                        self.enorm(hidden_states),
+                        self.hnorm(forward_batch.spec_info.hidden_states),
+                    ),
+                    dim=-1,
+                )
+            )
+        hidden_states, residual = self.mtp_block(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+            residual=None,
+        )
+        hidden_states_before_norm = None
+        if not forward_batch.forward_mode.is_idle():
+            # if forward_batch.return_hidden_states_before_norm:
+            hidden_states_before_norm = (
+                hidden_states if residual is None else hidden_states + residual
+            )
+            if residual is not None:
+                hidden_states, _ = self.shared_head.norm(hidden_states, residual)
+            else:
+                hidden_states = self.shared_head.norm(hidden_states)
+
+        return hidden_states, hidden_states_before_norm
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+
+# The current implementation differs slightly from the standard MTP implementation in Step3.5 Flash.
+# In the standard multi-layer MTP design of Step3.5 Flash,
+# the hidden states of each MTP layer are passed from the preceding MTP layer
+# (the hidden states of the initial (layer-0) MTP still being provided by the target model).
+# In contrast, the current SGL implementation obtains hidden states directly from the target model for all MTP layers.
+# Empirical evaluations indicate that the overall performance remains strong;
+# however, this design choice may lead to a slight reduction in acceptance rate in certain scenarios.
+# This behavior will be corrected shortly, and we expect to implement the standard multi-layer MTP design of Step3.5 Flash in the near future.
+# FIXME(yhyang201)
+class Step3p5MTP(Step3p5ForCausalLM):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        draft_model_idx: Optional[int] = None,
+        prefix: str = "",
+    ) -> None:
+        nn.Module.__init__(self)
+        self.config = config
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.quant_config = quant_config
+        self.draft_model_idx = draft_model_idx
+
+        self.model = Step3p5AMultiTokenPredictor(
+            config=config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.logits_processor = LogitsProcessor(config)
+        self.lm_head = self.model.lm_head
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states, hidden_states_before_norm = self.model(
+            input_ids, positions, forward_batch
+        )
+        return self.logits_processor(
+            input_ids,
+            hidden_states,
+            self.model.shared_head.head,
+            forward_batch,
+            hidden_states_before_norm=hidden_states_before_norm,
+        )
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.model.shared_head.head.weight
+
+    def set_embed_and_head(self, embed, head):
+        return
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        expert_params_mapping = [
+            (".moe.experts.w13_weight", ".moe.gate_proj.weight", "w1"),
+            (".moe.experts.w13_weight", ".moe.up_proj.weight", "w3"),
+            (".moe.experts.w2_weight", ".moe.down_proj.weight", "w2"),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            spec_layer = get_spec_layer_idx_from_weight_name(self.config, name)
+            if spec_layer is not None and spec_layer != (
+                self.config.num_hidden_layers + self.draft_model_idx
+            ):
+                continue
+            if "embed_tokens" not in name and spec_layer is None:
+                continue
+            name = self._rewrite_spec_layer_name(spec_layer, name)
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                # Skip non-stacked layers and experts (experts handled below).
+                if weight_name not in name:
+                    continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if ("mlp.experts." in name) and name not in params_dict:
+                    continue
+                if "experts" in name or "moe" in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                for mapping in expert_params_mapping:
+                    param_name, weight_name, shard_id = mapping
+                    if weight_name not in name:
+                        continue
+                    name = name.replace(weight_name, param_name)
+                    # Skip loading extra bias for GPTQ models.
+                    if (
+                        name.endswith(".bias") or name.endswith("_bias")
+                    ) and name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                    weight_loader = param.weight_loader
+                    for expert_id in range(loaded_weight.shape[0]):
+                        loaded_weight_expert = loaded_weight[expert_id]
+                        weight_loader(
+                            param,
+                            loaded_weight_expert,
+                            name,
+                            shard_id=shard_id,
+                            expert_id=expert_id,
+                        )
+                    loaded_params.add(name)
+                    break
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if (
+                        name.endswith(".bias")
+                        and name not in params_dict
+                        or "tok_embeddings" in name
+                    ):
+                        continue
+
+                    if "shared_head" in name:
+                        name = name.replace("shared_head.output", "shared_head.head")
+                    if "embed_tokens" in name:
+                        assert (
+                            hasattr(self.config, "num_nextn_predict_layers")
+                            and self.config.num_nextn_predict_layers > 0
+                        )
+                        name = "model.embed_tokens.weight"
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        params_need_to_load = set(params_dict.keys())
+        if params_need_to_load != loaded_params:
+            missing_params = list(params_need_to_load - loaded_params)
+            param_name_example = missing_params[0]
+            raise RuntimeError(
+                f"Some parameters like {param_name_example} are not in the checkpoint and will falsely use random initialization"
+            )
+        return loaded_params
+
+    def _rewrite_spec_layer_name(self, spec_layer: Optional[int], name: str) -> str:
+        """
+        Rewrite the weight name to match the format of the original model.
+        Add .mtp_block for modules in transformer layer block for spec layer
+        """
+        if spec_layer is None:
+            return name
+
+        # Some checkpoints place MTP weights under "model.layers.<id>.transformer.*".
+        # Our modules use "model.layers.<id>.*", so drop the ".transformer." segment.
+        transformer_prefix = f"model.layers.{spec_layer}.transformer."
+        if name.startswith(transformer_prefix):
+            name = name.replace(".transformer.", ".", 1)
+
+        spec_layer_weight_names = [
+            "embed_tokens",
+            "enorm",
+            "hnorm",
+            "eh_proj",
+            "shared_head",
+        ]
+        spec_layer_weight = False
+        for weight_name in spec_layer_weight_names:
+            if weight_name in name:
+                spec_layer_weight = True
+                break
+        if not spec_layer_weight:
+            # treat rest weights as weights for transformer layer block
+            name = name.replace(
+                f"model.layers.{spec_layer}.", f"model.layers.{spec_layer}.mtp_block."
+            )
+
+        # NEW: drop "layers.<idx>." from the rewritten name (minimal change).
+        layers_prefix = f"model.layers.{spec_layer}."
+        if name.startswith(layers_prefix):
+            name = name.replace(layers_prefix, "model.", 1)
+
+        return name
+
+
+EntryClass = [Step3p5MTP]
diff --git a/sglang/python/sglang/srt/models/teleflm.py b/sglang/python/sglang/srt/models/teleflm.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0e093cf98383cbc1936e2f4914088f8150f761c
--- /dev/null
+++ b/sglang/python/sglang/srt/models/teleflm.py
@@ -0,0 +1,104 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# Adapted from
+# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/teleflm.py
+
+from typing import List, Optional, Tuple, Union
+
+import torch
+from transformers import LlamaConfig
+
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
+from sglang.srt.models.llama import LlamaForCausalLM, LlamaModel
+
+
+class TeleFLMModel(LlamaModel):
+    """
+    This implementation is based on the µScaling paper presented at
+    the ICLR 2025 Workshop:
+    NanoLM: An Affordable LLM Study Benchmark \
+    via Accurate Loss Prediction across Scales
+    by Yiqun Yao et al.
+    Available at: https://openreview.net/forum?id=IwaPYg1SCA
+    arXiv preprint: https://arxiv.org/abs/2304.06875
+    """
+
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(config, quant_config=quant_config, prefix=prefix)
+        self.use_mup = getattr(self.config, "use_mup", False)
+        if self.use_mup:
+            self.input_mult = self.config.input_mult
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        pp_proxy_tensors: Optional[PPProxyTensors] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, List[torch.Tensor]], PPProxyTensors]:
+        if self.pp_group.is_first_rank and input_embeds is None:
+            input_embeds = self.embed_tokens(input_ids)
+            if self.use_mup:
+                input_embeds = input_embeds * self.input_mult
+
+        return super().forward(
+            input_ids=input_ids,
+            positions=positions,
+            forward_batch=forward_batch,
+            input_embeds=input_embeds,
+            pp_proxy_tensors=pp_proxy_tensors,
+        )
+
+
+class TeleFLMForCausalLM(LlamaForCausalLM):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__(config, quant_config=quant_config, prefix=prefix)
+        self.use_mup = getattr(self.config, "use_mup", False)
+        if self.use_mup:
+            self.mup_scale_factor = self.config.mup_scale_factor
+            self.output_mult = self.config.output_mult / self.mup_scale_factor
+            self.logits_processor.logit_scale = self.output_mult
+
+    def _init_model(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        return TeleFLMModel(config, quant_config=quant_config, prefix=prefix)
+
+
+EntryClass = TeleFLMForCausalLM
diff --git a/sglang/python/sglang/srt/models/torch_native_llama.py b/sglang/python/sglang/srt/models/torch_native_llama.py
new file mode 100644
index 0000000000000000000000000000000000000000..14b327bd1a2ce07ed33a1c93563c4983df755bb6
--- /dev/null
+++ b/sglang/python/sglang/srt/models/torch_native_llama.py
@@ -0,0 +1,507 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/llama.py#L1
+"""
+Inference-only LLaMA model compatible with HuggingFace weights.
+
+This model supports tensor parallelism (TP) using the PyTorch tensor parallel package.
+Reference: https://pytorch.org/docs/stable/distributed.tensor.parallel.html
+
+Here is a quick example to enable TP:
+```python
+from sglang.srt.layers.model_parallel import tensor_parallel
+
+device_mesh = torch.distributed.init_device_mesh("cuda", (tp_size,))
+tensor_parallel(model, device_mesh)
+```
+
+An end-to-end example can be found in `python/sglang/bench_one_batch.py`.
+You can run it with the following command:
+```bash
+$ python3 -m sglang.bench_one_batch --correct \
+  --model meta-llama/Meta-Llama-3-8B \
+  --json-model-override-args '{"architectures": ["TorchNativeLlamaForCausalLM"]}' \
+  --tensor-parallel-size 2 \
+  --disable-cuda-graph
+```
+We will enable CUDA Graph support soon.
+"""
+
+import types
+from typing import Any, Dict, Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from torch.nn.parameter import Parameter
+from transformers import LlamaConfig
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix
+
+tp_size: Optional[int] = None
+tp_rank: Optional[int] = None
+
+
+def gate_up_proj_weight_loader(
+    self,
+    param: Parameter,
+    loaded_weight: torch.Tensor,
+    loaded_shard_id: int,
+):
+    # shard_id: (shard_offset, shard_size)
+    gate_up_offsets = {}
+    current_shard_offset = 0
+    for i, output_size in enumerate(self.output_sizes):
+        # Everything shrinks by tp_size if TP enabled
+        output_size = output_size // tp_size
+        gate_up_offsets[i] = (current_shard_offset, output_size)
+        current_shard_offset += output_size
+    # Re-size the param to the size after TP
+    if current_shard_offset != param.shape[0]:
+        # The clone will free the original, full tensor
+        param.data = param.data.narrow(0, 0, current_shard_offset).clone()
+
+    # Now load gate or up
+    assert loaded_shard_id < len(self.output_sizes)
+    param_data = param.data
+    shard_offset, shard_size = gate_up_offsets[loaded_shard_id]
+    param_data = param_data.narrow(0, shard_offset, shard_size)
+    loaded_weight = loaded_weight.narrow(0, tp_rank * shard_size, shard_size)
+    assert param_data.shape == loaded_weight.shape
+    param_data.copy_(loaded_weight)
+
+
+class LlamaMLP(nn.Module):
+    _tp_plan = {
+        "gate_up_proj": "Colwise_Sharded",
+        "down_proj": "Rowwise",
+    }
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = torch.nn.Linear(
+            hidden_size,
+            intermediate_size * 2,
+            bias=False,
+        )
+        self.gate_up_proj.output_sizes = [intermediate_size] * 2
+        self.gate_up_proj.weight_loader = types.MethodType(
+            gate_up_proj_weight_loader, self.gate_up_proj
+        )
+        self.gate_up_proj.weight.weight_loader = self.gate_up_proj.weight_loader
+        self.down_proj = torch.nn.Linear(intermediate_size, hidden_size, bias=False)
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x = self.down_proj(x)
+        return x
+
+
+def qkv_proj_weight_loader(
+    self,
+    param: Parameter,
+    loaded_weight: torch.Tensor,
+    loaded_shard_id: str,
+):
+    num_heads = self.num_heads // tp_size
+    num_kv_heads = self.num_kv_heads // tp_size
+    # shard_id: (shard_offset, shard_size)
+    qkv_offsets = {
+        "q": (0, num_heads * self.head_size),
+        "k": (num_heads * self.head_size, num_kv_heads * self.head_size),
+        "v": (
+            (num_heads + num_kv_heads) * self.head_size,
+            num_kv_heads * self.head_size,
+        ),
+    }
+    total_size = qkv_offsets["v"][0] + qkv_offsets["v"][1]
+    # Re-size the param to the size after TP
+    if total_size != param.shape[0]:
+        # The clone will free the original, full tensor
+        param.data = param.data.narrow(0, 0, total_size).clone()
+
+    # Now load q, k or v
+    shard_offset, shard_size = qkv_offsets[loaded_shard_id]
+    param_data = param.data
+    param_data = param_data.narrow(0, shard_offset, shard_size)
+    loaded_weight = loaded_weight.narrow(0, tp_rank * shard_size, shard_size)
+    assert param_data.shape == loaded_weight.shape
+    param_data.copy_(loaded_weight)
+
+
+class LlamaAttention(nn.Module):
+    _tp_plan = {
+        "qkv_proj": "Colwise_Sharded",
+        "o_proj": "Rowwise",
+    }
+
+    def __init__(
+        self,
+        config: LlamaConfig,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        rope_is_neox_style: bool = True,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        # MistralConfig has an optional head_dim introduced by Mistral-Nemo
+        self.head_dim = getattr(
+            config, "head_dim", self.hidden_size // self.total_num_heads
+        )
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = torch.nn.Linear(
+            hidden_size,
+            (self.total_num_heads + 2 * self.total_num_kv_heads) * self.head_dim,
+            bias=False,
+        )
+        self.qkv_proj.head_size = self.head_dim
+        self.qkv_proj.num_heads = self.total_num_heads
+        self.qkv_proj.num_kv_heads = self.total_num_kv_heads
+        self.qkv_proj.weight_loader = types.MethodType(
+            qkv_proj_weight_loader, self.qkv_proj
+        )
+        self.qkv_proj.weight.weight_loader = self.qkv_proj.weight_loader
+        self.qkv_proj.weight.output_dim = 0
+        self.o_proj = torch.nn.Linear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+        )
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            is_neox_style=rope_is_neox_style,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output = self.o_proj(attn_output)
+        return output
+
+
+class LlamaDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        if rope_scaling is not None and getattr(
+            config, "original_max_position_embeddings", None
+        ):
+            rope_scaling["original_max_position_embeddings"] = (
+                config.original_max_position_embeddings
+            )
+        rope_is_neox_style = getattr(config, "rope_is_neox_style", True)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        self.self_attn = LlamaAttention(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            rope_is_neox_style=rope_is_neox_style,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.mlp = LlamaMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+class LlamaModel(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+
+        global tp_size, tp_rank
+        if tp_size is None:
+            tp_size = get_tensor_model_parallel_world_size()
+        if tp_rank is None:
+            tp_rank = get_tensor_model_parallel_rank()
+
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+        )
+        self.layers = nn.ModuleList(
+            [
+                LlamaDecoderLayer(
+                    config, i, quant_config=quant_config, prefix=f"model.layers.{i}"
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class TorchNativeLlamaForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.supports_torch_tp = True
+        self.model = LlamaModel(config, quant_config=quant_config)
+        if self.config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
+        self.logits_processor = LogitsProcessor(config)
+
+        # turning off autotune for fp8dq since it doesn't give speedup and
+        # increases compile time significantly
+        torch._inductor.config.max_autotune_gemm_backends = "ATEN"
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> LogitsProcessorOutput:
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def get_module_name_from_weight_name(self, name):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id, num_shard)
+            ("qkv_proj", "q_proj", "q", 3),
+            ("qkv_proj", "k_proj", "k", 3),
+            ("qkv_proj", "v_proj", "v", 3),
+            ("gate_up_proj", "gate_proj", 0, 2),
+            ("gate_up_proj", "up_proj", 1, 2),
+        ]
+        for param_name, weight_name, shard_id, num_shard in stacked_params_mapping:
+            if weight_name in name:
+                return (
+                    name.replace(weight_name, param_name)[: -len(".weight")],
+                    num_shard,
+                )
+        return name[: -len(".weight")], 1
+
+    def get_num_params(self):
+        params_dict = dict(self.named_parameters())
+        return len(params_dict)
+
+    def load_weights_to_module(
+        self,
+        fqn: str,
+        weights: Iterable[Tuple[str, torch.Tensor]],
+    ):
+        """Load weights onto submodule pointed by path `fqn`."""
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".qkv_proj", ".q_proj", "q"),
+            (".qkv_proj", ".k_proj", "k"),
+            (".qkv_proj", ".v_proj", "v"),
+            (".gate_up_proj", ".gate_proj", 0),
+            (".gate_up_proj", ".up_proj", 1),
+        ]
+        module = self.get_submodule(fqn)
+        params_dict = dict(module.named_parameters(prefix=fqn, recurse=False))
+
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name or "projector" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            if name.startswith("model.vision_tower") and name not in params_dict:
+                continue
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") or name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") or name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+    def load_weights(
+        self,
+        weights: Iterable[Tuple[str, torch.Tensor]],
+    ):
+        """Load weights onto the full model."""
+        self.load_weights_to_module("", weights)
+
+
+class TorchNativePhi3ForCausalLM(TorchNativeLlamaForCausalLM):
+    pass
+
+
+EntryClass = [TorchNativeLlamaForCausalLM, TorchNativePhi3ForCausalLM]
diff --git a/sglang/python/sglang/srt/models/transformers.py b/sglang/python/sglang/srt/models/transformers.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ea9da14a1beb8b991598c6eae4fe4de13337053
--- /dev/null
+++ b/sglang/python/sglang/srt/models/transformers.py
@@ -0,0 +1,289 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/a1a2aaadb9122f05667140e39cf67e5736c8b6d6/vllm/model_executor/models/transformers.py
+"""Wrapper around `transformers` models"""
+
+import logging
+import re
+from typing import Iterable, Literal, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from transformers import AutoModel, PretrainedConfig, PreTrainedModel
+from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
+
+from sglang.srt.distributed import divide, get_tensor_model_parallel_world_size
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+
+logger = logging.getLogger(__name__)
+
+
+def maybe_prefix(prefix: str, name: str) -> str:
+    """Add a prefix to a name if the prefix is non-empty.
+
+    Args:
+        prefix: The prefix to add. If empty, no prefix will be added.
+        name: The name to potentially prefix.
+
+    Returns:
+        The string "prefix.name" if prefix was non-empty, otherwise just "name".
+    """
+    return name if not prefix else f"{prefix}.{name}"
+
+
+def sglang_flash_attention_forward(
+    # Transformers args
+    module: torch.nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: torch.Tensor,
+    # sglang kwargs
+    forward_batch: ForwardBatch,
+    # Transformers kwargs
+    scaling: float = None,
+    attention_instances: list[RadixAttention] = None,
+    **kwargs,
+):
+    self_attn: RadixAttention = attention_instances[module.layer_idx]
+    if scaling is not None:
+        self_attn.scaling = float(scaling)
+    hidden = query.shape[-2]
+    query, key, value = (x.transpose(1, 2) for x in (query, key, value))
+    query, key, value = (x.reshape(hidden, -1) for x in (query, key, value))
+    return self_attn.forward(query, key, value, forward_batch=forward_batch), None
+
+
+ALL_ATTENTION_FUNCTIONS["sglang"] = sglang_flash_attention_forward
+
+
+class HFColumnParallelLinear(ColumnParallelLinear):
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        return super().forward(input)[0]
+
+
+class HFRowParallelLinear(RowParallelLinear):
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        return super().forward(input)[0]
+
+
+def replace_linear_class(
+    linear: nn.Linear,
+    style: Literal["colwise", "rowwise"],
+    quant_config: QuantizationConfig,
+) -> Union[ColumnParallelLinear, RowParallelLinear]:
+    """
+    Replace nn.Linear with one of vLLM's tensor parallel linear classes.
+
+    Args:
+        linear (nn.Linear): `nn.Linear` to be replaced.
+        style (str): Tensor parallel style of the new linear, e.g. "colwise".
+        quant_config (QuantConfig): Quantization config for the new linear.
+    Returns:
+        Union[ColumnParallelLinear, RowParallelLinear]: The new linear.
+    """
+
+    if not isinstance(style, str):
+        raise ValueError(f"Unsupported parallel style type {type(style)}, expected str")
+
+    sglang_linear_cls = {
+        "colwise": ColumnParallelLinear,
+        "rowwise": RowParallelLinear,
+    }.get(style, ReplicatedLinear)
+
+    class HFCompatibleLinear(sglang_linear_cls):
+        """
+        Wrapper class that removes `output_bias` from returned output.
+        """
+
+        @property
+        def parent_cls(self) -> type:
+            return sglang_linear_cls
+
+        def forward(self, input: torch.Tensor) -> torch.Tensor:
+            return super().forward(input)[0]
+
+    return HFCompatibleLinear(
+        input_size=linear.in_features,
+        output_size=linear.out_features,
+        bias=linear.bias is not None,
+        quant_config=quant_config,
+    )
+
+
+class TransformersForCausalLM(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        logger.info("Using Transformers backend.")
+
+        self.quant_config = quant_config
+        self.config = config
+        self.vocab_size = config.vocab_size
+        self.unpadded_vocab_size = config.vocab_size
+
+        # model is loaded under set_default_torch_dtype(model_config.dtype)
+        self.model: PreTrainedModel = AutoModel.from_config(
+            self.config,
+            torch_dtype=torch.get_default_dtype(),
+            attn_implementation="sglang",
+            trust_remote_code=True,
+        )
+
+        # Attention modifications (assumes 1 attention op per hidden layer)
+        tp_size = get_tensor_model_parallel_world_size()
+
+        # MLP modifications
+        self.tensor_parallel(tp_size)
+
+        head_dim = (
+            (config.hidden_size // config.num_attention_heads)
+            if not hasattr(config, "head_dim")
+            else config.head_dim
+        )
+        self.attention_instances = [
+            RadixAttention(
+                num_heads=divide(config.num_attention_heads, tp_size),
+                head_dim=head_dim,
+                # NOTE: We use Llama scale as default, if it's set by
+                # Transformers, it's updated in sglang_flash_attention_forward
+                scaling=head_dim**-0.5,
+                num_kv_heads=divide(config.num_key_value_heads, tp_size),
+                layer_id=i,
+                quant_config=self.quant_config,
+                prefix=f"{i}.attn",
+            )
+            for i in range(config.num_hidden_layers)
+        ]
+
+        # Model modifications
+        self.replace_vocab_embed_class(self.model)
+
+        # ForCausalLM modifications
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=self.quant_config,
+            prefix=maybe_prefix(prefix, "lm_head"),
+        )
+        if config.tie_word_embeddings:
+            self.lm_head.weight = self.model.get_input_embeddings().weight
+
+        self.logits_processor = LogitsProcessor(config)
+
+    def log_replacement(self, name: str, old_module: nn.Module, new_module: nn.Module):
+        logger.debug("%s: %s -> %s", name, old_module, new_module)
+
+    def tensor_parallel(self, tp_size: int):
+        """
+        Apply the model's tensor parallelization plan.
+        Currently only supports linear layers.
+        """
+        tp_plan = getattr(self.model.config, "base_model_tp_plan", None) or {}
+
+        if not tp_plan and tp_size > 1:
+            raise ValueError(
+                f"{type(self.model)} does not support tensor parallel yet!"
+            )
+
+        def _tensor_parallel(module: nn.Module, prefix: str = ""):
+            for child_name, child_module in module.named_children():
+                qual_name = maybe_prefix(prefix, child_name)
+                for pattern, style in tp_plan.items():
+                    if re.match(pattern, qual_name) and isinstance(
+                        child_module, nn.Linear
+                    ):
+                        new_module = replace_linear_class(
+                            child_module, style, self.quant_config
+                        )
+                        setattr(module, child_name, new_module)
+                        self.log_replacement(qual_name, child_module, new_module)
+                else:
+                    _tensor_parallel(child_module, prefix=qual_name)
+
+        _tensor_parallel(self.model)
+
+    def replace_vocab_embed_class(self, module: nn.Module):
+        # Use native set input embeddings
+        new_module = VocabParallelEmbedding(
+            self.vocab_size,
+            self.config.hidden_size,
+            org_num_embeddings=self.config.vocab_size,
+            quant_config=None,
+        )
+        self.log_replacement(
+            "input embedding", self.model.get_input_embeddings(), new_module
+        )
+        self.model.set_input_embeddings(new_module)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = False,
+    ) -> LogitsProcessorOutput:
+        assert get_embedding is False, "embedding is not supported yet"
+        aux_hidden_states = None
+        hidden_states = self.model(
+            input_ids[None, ...],
+            use_cache=False,
+            position_ids=positions[None, ...],
+            forward_batch=forward_batch,
+            attention_instances=self.attention_instances,
+            return_dict=False,
+        )[0][
+            0, ...
+        ]  # we remove batch dimension for now
+
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch, aux_hidden_states
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if name not in params_dict:
+                name = f"{self.model.base_model_prefix}.{name}"
+            if name in params_dict:
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = [TransformersForCausalLM]
diff --git a/sglang/python/sglang/srt/models/utils.py b/sglang/python/sglang/srt/models/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca3f6c09d4d584bab0509d7fa52d49f63d5dcbd3
--- /dev/null
+++ b/sglang/python/sglang/srt/models/utils.py
@@ -0,0 +1,268 @@
+# Copyright 2023-2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import annotations
+
+from collections.abc import Iterable, Mapping
+from dataclasses import dataclass, field
+from functools import lru_cache
+from typing import TYPE_CHECKING, Any, Optional, Tuple
+
+import numpy as np
+import torch
+
+from sglang.jit_kernel.norm import can_use_fused_inplace_qknorm, fused_inplace_qknorm
+from sglang.srt.environ import envs
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.mem_cache.swa_memory_pool import SWAKVPool
+from sglang.srt.model_executor.cuda_graph_runner import get_is_capture_mode
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.utils import get_current_device_stream_fast, is_cuda
+from sglang.srt.utils.custom_op import register_custom_op
+
+if TYPE_CHECKING:
+    from sglang.srt.layers.layernorm import RMSNorm
+
+_is_cuda = is_cuda()
+
+WeightsMapping = Mapping[str, Optional[str]]
+"""If a key maps to a value of `None`, the corresponding weight is ignored."""
+
+
+@dataclass
+class WeightsMapper:
+    """Maps the name of each weight if they match the following patterns."""
+
+    orig_to_new_substr: WeightsMapping = field(default_factory=dict)
+    orig_to_new_prefix: WeightsMapping = field(default_factory=dict)
+    orig_to_new_suffix: WeightsMapping = field(default_factory=dict)
+
+    def _map_name(self, key: str) -> Optional[str]:
+        for substr, new_key in sorted(
+            self.orig_to_new_substr.items(), key=lambda i: len(i[0]), reverse=True
+        ):
+            if substr in key:
+                if new_key is None:
+                    return None
+
+                key = key.replace(substr, new_key, 1)
+                break
+
+        for prefix, new_key in sorted(
+            self.orig_to_new_prefix.items(), key=lambda i: len(i[0]), reverse=True
+        ):
+            if key.startswith(prefix):
+                if new_key is None:
+                    return None
+
+                key = key.replace(prefix, new_key, 1)
+                break
+
+        for suffix, new_key in sorted(
+            self.orig_to_new_suffix.items(), key=lambda i: len(i[0]), reverse=True
+        ):
+            if key.endswith(suffix):
+                if new_key is None:
+                    return None
+
+                key = new_key.join(key.rsplit(suffix, 1))
+                break
+
+        return key
+
+    def apply(
+        self, weights: Iterable[tuple[str, torch.Tensor]]
+    ) -> Iterable[tuple[str, torch.Tensor]]:
+        return (
+            (out_name, data)
+            for name, data in weights
+            if (out_name := self._map_name(name)) is not None
+        )
+
+    def apply_list(self, values: list[str]) -> list[str]:
+        return [
+            out_name
+            for name in values
+            if (out_name := self._map_name(name)) is not None
+        ]
+
+    def apply_dict(self, values: dict[str, Any]) -> dict[str, Any]:
+        return {
+            out_name: value
+            for name, value in values.items()
+            if (out_name := self._map_name(name)) is not None
+        }
+
+
+def enable_fused_set_kv_buffer(forward_batch: ForwardBatch):
+    """Enable fused set_kv_buffer only on CUDA with bfloat16 KV cache."""
+    return (
+        _is_cuda
+        and hasattr(forward_batch.token_to_kv_pool, "dtype")
+        and forward_batch.token_to_kv_pool.dtype == torch.bfloat16
+        and not isinstance(forward_batch.token_to_kv_pool, SWAKVPool)
+    )
+
+
+def create_fused_set_kv_buffer_arg(
+    value: torch.Tensor,
+    layer: RadixAttention,
+    forward_batch: ForwardBatch,
+):
+    from sglang.jit_kernel.rope import FusedSetKVBufferArg
+
+    layer_id = layer.layer_id
+    token_to_kv_pool = forward_batch.token_to_kv_pool
+
+    k_buffer = token_to_kv_pool.get_key_buffer(layer_id)
+    v_buffer = token_to_kv_pool.get_value_buffer(layer_id)
+    assert layer.k_scale is None and layer.v_scale is None, "scale not supported"
+    return FusedSetKVBufferArg(
+        value=value,
+        k_buffer=k_buffer.view(k_buffer.shape[0], -1),
+        v_buffer=v_buffer.view(v_buffer.shape[0], -1),
+        cache_loc=forward_batch.out_cache_loc,
+    )
+
+
+def permute_inv(perm: torch.Tensor) -> torch.Tensor:
+    inv_perm = torch.empty_like(perm)
+    inv_perm[perm] = torch.arange(perm.numel(), device=perm.device, dtype=perm.dtype)
+    return inv_perm
+
+
+def compute_cu_seqlens_from_grid_numpy(grid_thw: torch.Tensor) -> torch.Tensor:
+    """
+    Compute cu_seqlens from grid_thw using NumPy.
+
+    grid_thw: [T, 3] int tensor on CPU.
+              columns: [repeat_count, H, W]
+    Returns:
+        cu_seqlens: 1D int32 tensor on CPU, shape [N + 1]
+    """
+    assert (
+        grid_thw.device.type == "cpu"
+    ), "compute_cu_seqlens_from_grid_numpy expects a CPU tensor"
+    arr = grid_thw.numpy()
+
+    cu_seqlens = np.repeat(arr[:, 1] * arr[:, 2], arr[:, 0]).cumsum(
+        axis=0, dtype=np.int32
+    )
+    cu_seqlens = np.concatenate([np.zeros(1, dtype=np.int32), cu_seqlens])
+    cu_seqlens = torch.from_numpy(cu_seqlens)
+    return cu_seqlens
+
+
+class RotaryPosMixin:
+
+    @staticmethod
+    @lru_cache(maxsize=1024)
+    def rot_pos_ids(h: int, w: int, spatial_merge_size: int) -> torch.Tensor:
+        if isinstance(h, torch.Tensor):
+            h = int(h.item())
+        if isinstance(w, torch.Tensor):
+            w = int(w.item())
+        if isinstance(spatial_merge_size, torch.Tensor):
+            spatial_merge_size = int(spatial_merge_size.item())
+        hpos_ids = np.broadcast_to(np.arange(h).reshape(h, 1), (h, w))
+        h_div = h // spatial_merge_size
+        w_div = w // spatial_merge_size
+        hpos_ids = hpos_ids.reshape(
+            h_div,
+            spatial_merge_size,
+            w_div,
+            spatial_merge_size,
+        )
+        hpos_ids = hpos_ids.transpose(0, 2, 1, 3)
+        hpos_ids = hpos_ids.flatten()
+
+        wpos_ids = np.broadcast_to(np.arange(w).reshape(1, w), (h, w))
+        wpos_ids = wpos_ids.reshape(
+            h_div,
+            spatial_merge_size,
+            w_div,
+            spatial_merge_size,
+        )
+        wpos_ids = wpos_ids.transpose(0, 2, 1, 3)
+        wpos_ids = wpos_ids.flatten()
+
+        return torch.from_numpy(np.stack([hpos_ids, wpos_ids], axis=-1))
+
+
+def apply_qk_norm(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    q_norm: RMSNorm,
+    k_norm: RMSNorm,
+    head_dim: int,
+    alt_stream: Optional[torch.cuda.Stream] = None,
+    allow_inplace: bool = True,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Apply QK normalization for query and key tensors.
+    If eligible, we will use JIT fused inplace QK normalization for better performance.
+
+    Args:
+        q: Query tensor of shape [batch_size, ...]
+        k: Key tensor of shape [batch_size, ...]
+        q_norm: RMSNorm layer for query normalization
+        k_norm: RMSNorm layer for key normalization
+        head_dim: Dimension of each attention head
+        alt_stream: Optional alternative CUDA stream for overlapping computation
+        allow_inplace: Whether to allow inplace normalization. (True for better performance)
+
+    Returns:
+        Tuple of normalized query and key tensors
+    """
+
+    batch_size = q.size(0)
+    q_eps = q_norm.variance_epsilon
+    k_eps = k_norm.variance_epsilon
+    if (
+        _is_cuda  # TODO(dark): have not tested on ROCm or other backends
+        and allow_inplace  # TODO(dark): this can be relaxed if needed
+        and (q_eps == k_eps)  # TODO(dark): this can also be relaxed
+        and not envs.SGLANG_ENABLE_DETERMINISTIC_INFERENCE.get()
+        and can_use_fused_inplace_qknorm(head_dim, q.dtype)
+    ):
+        fused_inplace_qknorm(
+            q=q.view(batch_size, -1, head_dim),
+            k=k.view(batch_size, -1, head_dim),
+            q_weight=q_norm.weight,
+            k_weight=k_norm.weight,
+            head_dim=head_dim,
+            eps=q_eps,
+        )
+        return q, k
+
+    if alt_stream is not None and get_is_capture_mode():
+        current_stream = get_current_device_stream_fast()
+        alt_stream.wait_stream(current_stream)
+        q_by_head = q.reshape(-1, head_dim)
+        q_by_head = q_norm(q_by_head)
+        with torch.cuda.stream(alt_stream):
+            k_by_head = k.reshape(-1, head_dim)
+            k_by_head = k_norm(k_by_head)
+        current_stream.wait_stream(alt_stream)
+    else:
+        q_by_head = q.reshape(-1, head_dim)
+        q_by_head = q_norm(q_by_head)
+        k_by_head = k.reshape(-1, head_dim)
+        k_by_head = k_norm(k_by_head)
+    q = q_by_head.view(q.shape)
+    k = k_by_head.view(k.shape)
+    return q, k
+
+
+# Register the inplace op
+fused_inplace_qknorm = register_custom_op(fused_inplace_qknorm, mutates_args=["q", "k"])
diff --git a/sglang/python/sglang/srt/models/whisper.py b/sglang/python/sglang/srt/models/whisper.py
new file mode 100644
index 0000000000000000000000000000000000000000..d69fb666d2d8d3042276f0910eb70f235eb53bb2
--- /dev/null
+++ b/sglang/python/sglang/srt/models/whisper.py
@@ -0,0 +1,543 @@
+from typing import Any, Iterable, List, Optional, Tuple
+
+import torch
+from transformers import WhisperConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import get_act_fn
+from sglang.srt.layers.linear import (
+    ColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.quantization import QuantizationConfig
+from sglang.srt.layers.radix_attention import AttentionType, RadixAttention
+from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
+from sglang.srt.managers.schedule_batch import MultimodalInputs
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+
+
+class WhisperAttention(torch.nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        bias: bool = True,
+        layer_id: Optional[int] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        is_cross_attention: bool = False,
+        is_encoder=False,
+    ):
+        super().__init__()
+        self.total_num_heads = num_heads
+        head_dim = embed_dim // num_heads
+        self.is_cross_attention = is_cross_attention
+        self.is_encoder = is_encoder
+
+        tp_size = get_tensor_model_parallel_world_size()
+        assert (
+            num_heads % tp_size == 0
+        ), f"num_heads ({num_heads}) must be divisible by tp_size ({tp_size})"
+        self.num_heads = num_heads // tp_size
+
+        if (head_dim * num_heads) != embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {embed_dim}"
+                f" and `num_heads`: {num_heads})."
+            )
+        self.scaling = head_dim**-0.5
+        self.head_dim = head_dim
+        self.kv_size = self.num_heads * head_dim
+
+        if is_cross_attention:
+            self.q_proj = ColumnParallelLinear(
+                embed_dim, embed_dim, quant_config=quant_config
+            )
+            self.kv_proj = QKVParallelLinear(
+                hidden_size=embed_dim,
+                head_size=head_dim,
+                total_num_heads=0,
+                total_num_kv_heads=num_heads,
+                bias=bias,
+                quant_config=quant_config,
+            )
+        else:
+            self.qkv_proj = QKVParallelLinear(
+                embed_dim, head_dim, num_heads, quant_config=quant_config
+            )
+        self.out_proj = RowParallelLinear(
+            embed_dim, embed_dim, bias=bias, quant_config=quant_config
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            head_dim,
+            scaling=1.0,
+            num_kv_heads=self.num_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            is_cross_attention=is_cross_attention,
+            attn_type=(
+                AttentionType.ENCODER_ONLY if is_encoder else AttentionType.DECODER
+            ),
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        cross_hidden_states: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Input shape: Batch x Time x Channel"""
+
+        if self.is_cross_attention:
+            q, _ = self.q_proj(hidden_states)
+            if cross_hidden_states is not None:
+                kv, _ = self.kv_proj(cross_hidden_states)
+                k, v = kv.split([self.kv_size, self.kv_size], dim=-1)
+            else:
+                k = torch.zeros_like(q)
+                v = torch.zeros_like(q)
+
+            q = q * self.scaling
+            num_heads = self.attn.tp_q_head_num
+            head_dim = self.attn.head_dim
+
+            q = q.view(-1, num_heads, head_dim)
+            k = k.view(-1, num_heads, head_dim)
+            v = v.view(-1, num_heads, head_dim)
+
+            q_len = q.shape[0]
+            kv_len = k.shape[0]
+
+            q = q.transpose(0, 1)
+            k = k.transpose(0, 1)
+            v = v.transpose(0, 1)
+
+            attn_weights = torch.bmm(q, k.transpose(1, 2))
+
+            # Apply block-diagonal mask for batched cross-attention
+            batch_size = forward_batch.batch_size if forward_batch else 1
+            if batch_size > 1 and kv_len > 0:
+                encoder_len_per_request = kv_len // batch_size
+                if encoder_len_per_request * batch_size == kv_len:
+                    is_decode = forward_batch.forward_mode.is_decode()
+                    if is_decode:
+                        mask = torch.zeros(
+                            (q_len, kv_len), device=q.device, dtype=torch.bool
+                        )
+                        for i in range(batch_size):
+                            enc_start = i * encoder_len_per_request
+                            enc_end = (i + 1) * encoder_len_per_request
+                            mask[i, enc_start:enc_end] = True
+                        attn_weights = attn_weights.masked_fill(
+                            ~mask.unsqueeze(0), float("-inf")
+                        )
+                    else:
+                        seq_lens = forward_batch.seq_lens
+                        if seq_lens is not None and len(seq_lens) == batch_size:
+                            seq_lens_list = seq_lens.tolist()
+                            mask = torch.zeros(
+                                (q_len, kv_len), device=q.device, dtype=torch.bool
+                            )
+                            q_start = 0
+                            for i, dec_len in enumerate(seq_lens_list):
+                                enc_start = i * encoder_len_per_request
+                                enc_end = (i + 1) * encoder_len_per_request
+                                q_end = q_start + dec_len
+                                mask[q_start:q_end, enc_start:enc_end] = True
+                                q_start = q_end
+                            attn_weights = attn_weights.masked_fill(
+                                ~mask.unsqueeze(0), float("-inf")
+                            )
+
+            attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1)
+            attn_output = torch.bmm(attn_weights, v)
+            attn_output = attn_output.transpose(0, 1)
+            attn_output = attn_output.reshape(q_len, num_heads * head_dim)
+        else:
+            qkv, _ = self.qkv_proj(hidden_states)
+            q, k, v = qkv.chunk(chunks=3, dim=-1)
+            q = q * self.scaling
+
+            if self.is_encoder:
+                num_heads = self.attn.tp_q_head_num
+                head_dim = self.attn.head_dim
+                batch_size, seq_len, _ = hidden_states.shape
+
+                q = q.view(batch_size, seq_len, num_heads, head_dim).permute(0, 2, 1, 3)
+                k = k.view(batch_size, seq_len, num_heads, head_dim).permute(0, 2, 1, 3)
+                v = v.view(batch_size, seq_len, num_heads, head_dim).permute(0, 2, 1, 3)
+
+                attn_output = torch.nn.functional.scaled_dot_product_attention(
+                    q, k, v, scale=1.0
+                )
+                attn_output = attn_output.permute(0, 2, 1, 3).reshape(
+                    batch_size, seq_len, num_heads * head_dim
+                )
+            else:
+                attn_output = self.attn(q, k, v, forward_batch, save_kv_cache=True)
+
+        attn_output, _ = self.out_proj(attn_output)
+
+        return attn_output
+
+
+class WhisperEncoderLayer(torch.nn.Module):
+    def __init__(
+        self,
+        config: WhisperConfig,
+        layer_id: Optional[int] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+    ):
+        super().__init__()
+        self.embed_dim = config.d_model
+
+        self.self_attn = WhisperAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.encoder_attention_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            is_encoder=True,
+        )
+        self.self_attn_layer_norm = torch.nn.LayerNorm(self.embed_dim)
+
+        self.activation_fn = get_act_fn(
+            config.activation_function, quant_config=quant_config
+        )
+
+        self.fc1 = ColumnParallelLinear(self.embed_dim, config.encoder_ffn_dim)
+        self.fc2 = RowParallelLinear(config.encoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = torch.nn.LayerNorm(self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+
+        residual = hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+        hidden_states = self.self_attn(hidden_states, forward_batch)
+
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states, _ = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+
+        hidden_states, _ = self.fc2(hidden_states)
+
+        hidden_states = residual + hidden_states
+
+        if hidden_states.dtype == torch.float16:
+            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+            hidden_states = torch.clamp(
+                hidden_states, min=-clamp_value, max=clamp_value
+            )
+        return hidden_states
+
+
+class WhisperDecoderLayer(torch.nn.Module):
+    def __init__(
+        self,
+        config: WhisperConfig,
+        layer_id: Optional[int] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+    ):
+        super().__init__()
+        self.embed_dim = config.d_model
+
+        # Offset decoder layer IDs to avoid overlap with encoder layers
+        decoder_self_attn_layer_id = config.encoder_layers + layer_id
+        decoder_cross_attn_layer_id = (
+            config.encoder_layers + config.decoder_layers + layer_id
+        )
+
+        self.self_attn = WhisperAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.decoder_attention_heads,
+            layer_id=decoder_self_attn_layer_id,
+            quant_config=quant_config,
+        )
+
+        self.activation_fn = get_act_fn(
+            config.activation_function, quant_config=quant_config
+        )
+
+        self.self_attn_layer_norm = torch.nn.LayerNorm(self.embed_dim)
+        self.encoder_attn = WhisperAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.decoder_attention_heads,
+            layer_id=decoder_cross_attn_layer_id,
+            quant_config=quant_config,
+            is_cross_attention=True,
+        )
+        self.encoder_attn_layer_norm = torch.nn.LayerNorm(self.embed_dim)
+        self.fc1 = ColumnParallelLinear(self.embed_dim, config.decoder_ffn_dim)
+        self.fc2 = RowParallelLinear(config.decoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = torch.nn.LayerNorm(self.embed_dim)
+
+    def forward(
+        self,
+        decoder_hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor],
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+
+        residual = decoder_hidden_states
+        decoder_hidden_states = self.self_attn_layer_norm(decoder_hidden_states)
+        decoder_hidden_states = self.self_attn(decoder_hidden_states, forward_batch)
+        decoder_hidden_states = residual + decoder_hidden_states
+
+        residual = decoder_hidden_states
+        decoder_hidden_states = self.encoder_attn_layer_norm(decoder_hidden_states)
+        decoder_hidden_states = self.encoder_attn(
+            decoder_hidden_states, forward_batch, encoder_hidden_states
+        )
+        decoder_hidden_states = residual + decoder_hidden_states
+
+        residual = decoder_hidden_states
+        decoder_hidden_states = self.final_layer_norm(decoder_hidden_states)
+        decoder_hidden_states, _ = self.fc1(decoder_hidden_states)
+        decoder_hidden_states = self.activation_fn(decoder_hidden_states)
+        decoder_hidden_states, _ = self.fc2(decoder_hidden_states)
+
+        decoder_hidden_states = residual + decoder_hidden_states
+
+        return decoder_hidden_states
+
+
+class WhisperEncoder(torch.nn.Module):
+
+    def __init__(
+        self, config: WhisperConfig, quant_config: Optional[QuantizationConfig] = None
+    ):
+        super().__init__()
+
+        embed_dim = config.d_model
+        self.embed_scale = embed_dim**-0.5 if config.scale_embedding else 1.0
+
+        self.conv1 = torch.nn.Conv1d(
+            config.num_mel_bins, embed_dim, kernel_size=3, padding=1
+        )
+        self.conv2 = torch.nn.Conv1d(
+            embed_dim, embed_dim, kernel_size=3, stride=2, padding=1
+        )
+        self.embed_positions = torch.nn.Embedding(
+            config.max_source_positions, embed_dim
+        )
+
+        self.layers = torch.nn.ModuleList(
+            [
+                WhisperEncoderLayer(config, id, quant_config)
+                for id in range(config.encoder_layers)
+            ]
+        )
+        self.layer_norm = torch.nn.LayerNorm(config.d_model)
+
+    def forward(
+        self,
+        input_features: torch.Tensor,
+        position_ids: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ):
+        inputs_embeds = torch.nn.functional.gelu(self.conv1(input_features))
+        inputs_embeds = torch.nn.functional.gelu(self.conv2(inputs_embeds))
+
+        inputs_embeds = inputs_embeds.mT
+
+        hidden_states = inputs_embeds + self.embed_positions(position_ids)
+
+        for encoder_layer in self.layers:
+            hidden_states = encoder_layer(hidden_states, forward_batch)
+
+        hidden_states = self.layer_norm(hidden_states)
+        return hidden_states
+
+
+class WhisperDecoder(torch.nn.Module):
+
+    def __init__(
+        self, config: WhisperConfig, quant_config: Optional[QuantizationConfig] = None
+    ):
+        super().__init__()
+        self.max_target_positions = config.max_target_positions
+        self.max_source_positions = config.max_source_positions
+        self.embed_scale = config.d_model**-0.5 if config.scale_embedding else 1.0
+
+        self.embed_tokens = torch.nn.Embedding(
+            config.vocab_size, config.d_model, padding_idx=config.pad_token_id
+        )
+        self.embed_positions = torch.nn.Embedding(
+            self.max_target_positions, config.d_model
+        )
+
+        self.layers = torch.nn.ModuleList(
+            [
+                WhisperDecoderLayer(config, layer_idx, quant_config)
+                for layer_idx in range(config.decoder_layers)
+            ]
+        )
+
+        self.layer_norm = torch.nn.LayerNorm(config.d_model)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor],
+        forward_batch: ForwardBatch,
+        position_ids=None,
+    ):
+        inputs_embeds = self.embed_tokens(input_ids)
+        positions = self.embed_positions(position_ids)
+        hidden_states = inputs_embeds + positions.to(inputs_embeds.device)
+
+        for decoder_layer in self.layers:
+            hidden_states = decoder_layer(
+                hidden_states, encoder_hidden_states, forward_batch
+            )
+
+        hidden_states = self.layer_norm(hidden_states)
+
+        return hidden_states
+
+
+class WhisperForConditionalGeneration(torch.nn.Module):
+
+    def __init__(
+        self, config: WhisperConfig, quant_config: Optional[QuantizationConfig] = None
+    ):
+        super().__init__()
+        self.encoder = WhisperEncoder(config, quant_config)
+        self.decoder = WhisperDecoder(config, quant_config)
+        self.proj_out = ParallelLMHead(
+            config.vocab_size, config.d_model, quant_config=quant_config
+        )
+        self.logits_processor = LogitsProcessor(config)
+        self.config = config
+        self._encoder_cache = {}
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            (".self_attn.qkv_proj", ".self_attn.q_proj", "q"),
+            (".self_attn.qkv_proj", ".self_attn.k_proj", "k"),
+            (".self_attn.qkv_proj", ".self_attn.v_proj", "v"),
+            (".encoder_attn.kv_proj", ".encoder_attn.k_proj", "k"),
+            (".encoder_attn.kv_proj", ".encoder_attn.v_proj", "v"),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        weights_dict = dict(weights)
+
+        # Whisper has no k_proj bias, create zeros
+        for layer_idx in range(self.config.decoder_layers):
+            layer_prefix = f"model.decoder.layers.{layer_idx}.encoder_attn."
+            k_proj_key = layer_prefix + "k_proj.weight"
+            if k_proj_key in weights_dict:
+                k_proj_weight = weights_dict[k_proj_key]
+                bias_key = layer_prefix + "k_proj.bias"
+                if bias_key not in weights_dict:
+                    weights_dict[bias_key] = torch.zeros(k_proj_weight.size(0))
+
+        weights_dict["proj_out.weight"] = weights_dict[
+            "model.decoder.embed_tokens.weight"
+        ]
+
+        for name, loaded_weight in weights_dict.items():
+            name = name.replace("model.", "")
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                if name not in params_dict:
+                    break
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+    def pad_input_ids(self, input_ids: List[int], _mm_inputs: MultimodalInputs):
+        return input_ids
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        **kwargs: Any,
+    ) -> LogitsProcessorOutput:
+        dtype = self.encoder.conv1.weight.dtype
+        is_decode = forward_batch.forward_mode.is_decode()
+
+        if is_decode:
+            encoder_outputs = None
+            if forward_batch.req_pool_indices is not None:
+                req_indices = forward_batch.req_pool_indices.tolist()
+                encoder_list = []
+                for req_idx in req_indices:
+                    if req_idx in self._encoder_cache:
+                        encoder_list.append(self._encoder_cache[req_idx])
+                if encoder_list:
+                    encoder_outputs = torch.cat(encoder_list, dim=0)
+        else:
+            encoder_list = []
+            mm_inputs_list = forward_batch.mm_inputs if forward_batch.mm_inputs else []
+            req_indices = (
+                forward_batch.req_pool_indices.tolist()
+                if forward_batch.req_pool_indices is not None
+                else []
+            )
+
+            for req_idx, mm_input in zip(req_indices, mm_inputs_list):
+                if mm_input is None or not mm_input.mm_items:
+                    continue
+
+                features = mm_input.mm_items[0].feature
+                if features.ndim == 2:
+                    features = features.unsqueeze(0)
+
+                encoder_len = features.shape[-1] // 2
+                encoder_position_ids = torch.arange(encoder_len).to(
+                    features.device, non_blocking=True
+                )
+
+                req_encoder_outputs = self.encoder(
+                    features.to(dtype), encoder_position_ids, forward_batch
+                )
+                req_encoder_outputs = req_encoder_outputs.squeeze(0)
+
+                self._encoder_cache[req_idx] = req_encoder_outputs
+                encoder_list.append(req_encoder_outputs)
+
+            if encoder_list:
+                encoder_outputs = torch.cat(encoder_list, dim=0)
+            else:
+                encoder_outputs = None
+
+        decoder_outputs = self.decoder(
+            input_ids, encoder_outputs, forward_batch, positions
+        )
+
+        logits = self.logits_processor(
+            input_ids=input_ids,
+            lm_head=self.proj_out,
+            hidden_states=decoder_outputs,
+            logits_metadata=forward_batch,
+        )
+
+        return logits
+
+
+EntryClass = [WhisperForConditionalGeneration]
diff --git a/sglang/python/sglang/srt/models/xverse.py b/sglang/python/sglang/srt/models/xverse.py
new file mode 100644
index 0000000000000000000000000000000000000000..f84755b03635aa2ce3c8a7c722605a26d5a8985e
--- /dev/null
+++ b/sglang/python/sglang/srt/models/xverse.py
@@ -0,0 +1,382 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/xverse.py#L1
+"""Inference-only XVERSE model compatible with HuggingFace weights."""
+
+from typing import Any, Dict, Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import LlamaConfig
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.model_runner import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix
+
+
+class XverseMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class XverseAttention(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        rope_is_neox_style: bool = True,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        # MistralConfig has an optional head_dim introduced by Mistral-Nemo
+        self.head_dim = getattr(
+            config, "head_dim", self.hidden_size // self.total_num_heads
+        )
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+            is_neox_style=rope_is_neox_style,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class XverseDecoderLayer(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        if rope_scaling is not None and getattr(
+            config, "original_max_position_embeddings", None
+        ):
+            rope_scaling["original_max_position_embeddings"] = (
+                config.original_max_position_embeddings
+            )
+        rope_is_neox_style = getattr(config, "rope_is_neox_style", True)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        num_kv_heads = getattr(
+            config, "num_key_value_heads", config.num_attention_heads
+        )
+        self.self_attn = XverseAttention(
+            config=config,
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=num_kv_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            rope_is_neox_style=rope_is_neox_style,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        self.mlp = XverseMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            quant_config=quant_config,
+            prefix=add_prefix("mlp", prefix),
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+class XverseModel(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+        self.layers = nn.ModuleList(
+            [
+                XverseDecoderLayer(
+                    config,
+                    i,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{i}", prefix),
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+            # print(f"layer[{i}].hidden_states: {hidden_states}")
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class XverseForCausalLM(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = XverseModel(
+            config, quant_config=quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size, config.hidden_size, prefix=add_prefix("lm_head", prefix)
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(
+        self, weights: Iterable[Tuple[str, torch.Tensor]], name=None, loaded_weight=None
+    ):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+
+        def load_weights_per_param(name, loaded_weight):
+            if "rotary_emb.inv_freq" in name or "projector" in name:
+                return
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                return
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if name.startswith("model.vision_tower") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    return
+                if name.startswith("model.vision_tower") and name not in params_dict:
+                    return
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+        if name is None or loaded_weight is None:
+            for name, loaded_weight in weights:
+                load_weights_per_param(name, loaded_weight)
+        else:
+            load_weights_per_param(name, loaded_weight)
+
+
+EntryClass = XverseForCausalLM
diff --git a/sglang/python/sglang/srt/models/xverse_moe.py b/sglang/python/sglang/srt/models/xverse_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4489055f6367af054cd6e8d21de94846ec9962c
--- /dev/null
+++ b/sglang/python/sglang/srt/models/xverse_moe.py
@@ -0,0 +1,482 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Inference-only XVERSE MoE model."""
+
+from typing import Any, Dict, Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
+from sglang.srt.hardware_backend.npu.quantization.fused_moe_method_npu import (
+    fused_moe_npu,
+)
+from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe
+from sglang.srt.layers.moe.moe_runner import MoeRunnerConfig
+from sglang.srt.layers.moe.topk import TopK
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix, is_npu
+
+
+class XverseMLP(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        quant_config: Optional[QuantizationConfig] = None,
+        reduce_results: bool = True,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.gate_up_proj = MergedColumnParallelLinear(
+            hidden_size,
+            [intermediate_size] * 2,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("gate_up_proj", prefix),
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+            prefix=add_prefix("down_proj", prefix),
+        )
+        if hidden_act != "silu":
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class XverseMoE(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.rank = get_tensor_model_parallel_rank()
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.n_routed_experts = config.num_experts
+        self.top_k = config.moe_top_k
+        if self.tp_size > self.n_routed_experts:
+            raise ValueError(
+                f"Tensor parallel size {self.tp_size} is greater than "
+                f"the number of experts {self.n_routed_experts}."
+            )
+
+        self.experts = nn.ModuleList(
+            [
+                XverseMLP(
+                    hidden_size=config.hidden_size,
+                    intermediate_size=config.intermediate_size,
+                    hidden_act=config.hidden_act,
+                    quant_config=quant_config,
+                    reduce_results=False,
+                    prefix=add_prefix(f"experts.{i}", prefix),
+                )
+                for i in range(self.n_routed_experts)
+            ]
+        )
+        self.pack_params()
+        self.moe_runner_config = MoeRunnerConfig(inplace=True)
+
+        self.router = ReplicatedLinear(
+            config.hidden_size,
+            self.n_routed_experts,
+            bias=False,
+            quant_config=None,
+            prefix=add_prefix("router", prefix),
+        )
+        self.topk = TopK(
+            top_k=self.top_k,
+            renormalize=getattr(self.config, "norm_topk_prob", False),
+        )
+
+        if config.num_shared_experts is not None:
+            intermediate_size = config.intermediate_size * config.num_shared_experts
+            self.shared_experts = XverseMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                reduce_results=False,
+                prefix=add_prefix("shared_experts", prefix),
+            )
+        self.fused_moe_method = fused_moe if not is_npu() else fused_moe_npu
+
+    def pack_params(self):
+        w1 = []
+        w2 = []
+        for expert in self.experts:
+            w1.append(expert.gate_up_proj.weight)
+            w2.append(expert.down_proj.weight)
+        self.w1 = torch._utils._flatten_dense_tensors(w1)
+        w1s = torch._utils._unflatten_dense_tensors(self.w1, w1)
+        for data, param in zip(w1s, w1):
+            param.data = data
+        self.w1 = self.w1.view(len(w1), *w1s[0].shape)
+
+        self.w2 = torch._utils._flatten_dense_tensors(w2)
+        w2s = torch._utils._unflatten_dense_tensors(self.w2, w2)
+        for data, param in zip(w2s, w2):
+            param.data = data
+
+        self.w2 = self.w2.view(len(w2), *w2s[0].shape)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        num_tokens, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+        if self.config.num_shared_experts is not None:
+            shared_output = self.shared_experts(hidden_states)
+        # router_logits: (num_tokens, n_experts)
+        router_logits, _ = self.router(hidden_states)
+        topk_output = self.topk(hidden_states, router_logits)
+        final_hidden_states = self.fused_moe_method(
+            hidden_states,
+            self.w1,
+            self.w2,
+            topk_output,
+            self.moe_runner_config,
+        )
+
+        if self.config.num_shared_experts is not None:
+            final_hidden_states = final_hidden_states + shared_output
+        final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+
+        return final_hidden_states.view(num_tokens, hidden_dim)
+
+
+class XverseAttention(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        layer_id: int = 0,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is greater than TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+
+        self.qkv_proj = QKVParallelLinear(
+            hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("qkv_proj", prefix),
+        )
+
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=add_prefix("o_proj", prefix),
+        )
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+            quant_config=quant_config,
+            prefix=add_prefix("attn", prefix),
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class XverseDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        rope_theta = getattr(config, "rope_theta", 10000)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        num_key_value_heads = getattr(
+            config, "num_key_value_heads", config.num_attention_heads
+        )
+        self.self_attn = XverseAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=num_key_value_heads,
+            layer_id=layer_id,
+            rope_theta=rope_theta,
+            rope_scaling=rope_scaling,
+            max_position_embeddings=max_position_embeddings,
+            quant_config=quant_config,
+            prefix=add_prefix("self_attn", prefix),
+        )
+        if config.num_experts is not None:
+            self.mlp = XverseMoE(
+                config=config,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+            )
+        else:
+            self.mlp = XverseMLP(
+                hidden_size=config.hidden_size,
+                intermediate_size=config.intermediate_size,
+                hidden_act=config.hidden_act,
+                quant_config=quant_config,
+                prefix=add_prefix("mlp", prefix),
+            )
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+        residual: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        # Self Attention
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+
+        # Fully Connected
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
+        hidden_states = self.mlp(hidden_states)
+        return hidden_states, residual
+
+
+class XverseModel(nn.Module):
+
+    fall_back_to_pt_during_load = False
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            prefix=add_prefix("embed_tokens", prefix),
+        )
+        self.layers = nn.ModuleList(
+            [
+                XverseDecoderLayer(
+                    config,
+                    layer_id,
+                    quant_config=quant_config,
+                    prefix=add_prefix(f"layers.{layer_id}", prefix),
+                )
+                for layer_id in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.embed_tokens(input_ids)
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions, hidden_states, forward_batch, residual
+            )
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+class XverseMoeForCausalLM(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+        self.model = XverseModel(
+            config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+            prefix=add_prefix("lm_head", prefix),
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Skip experts that are not assigned to this worker.
+                if (
+                    "mlp.experts." in name or "mlp.shared_experts." in name
+                ) and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Skip experts that are not assigned to this worker.
+                if (
+                    "mlp.experts." in name or "mlp.shared_experts." in name
+                ) and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+EntryClass = XverseMoeForCausalLM
diff --git a/sglang/python/sglang/srt/models/yivl.py b/sglang/python/sglang/srt/models/yivl.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c50b0d3c2b2bab1304493a2e0f70627676421ca
--- /dev/null
+++ b/sglang/python/sglang/srt/models/yivl.py
@@ -0,0 +1,115 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Inference-only Yi-VL model."""
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+import torch.nn as nn
+from transformers import CLIPVisionModel, LlavaConfig
+
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.llava import LlavaLlamaForCausalLM
+
+
+class YiVLForCausalLM(LlavaLlamaForCausalLM):
+    def __init__(
+        self,
+        config: LlavaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(config, quant_config, prefix=prefix)
+
+        self.multi_modal_projector = YiVLMultiModalProjector(self.config)
+        self.vision_tower_subfolder = self.config.mm_vision_tower.replace(
+            "./", ""
+        )  # Everything after "./"
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        # We have to use the subfolder of the main model directory (e.g. 01-ai/Yi-VL-6B)
+        self.vision_tower = CLIPVisionModel.from_pretrained(
+            self.config._name_or_path,
+            torch_dtype=torch.float16,
+            subfolder=self.vision_tower_subfolder,
+        ).to("cuda")
+
+        self.vision_tower.eval()
+
+        self.vision_feature_layer = self.config.mm_vision_select_layer
+        self.vision_feature_select_strategy = self.config.mm_vision_select_feature
+        self.image_size = self.vision_tower.config.image_size
+        self.patch_size = self.vision_tower.config.patch_size
+
+        self.mm_patch_merge_type = getattr(self.config, "mm_patch_merge_type", "flat")
+        self.image_aspect_ratio = getattr(self.config, "image_aspect_ratio", "square")
+        self.image_grid_pinpoints = getattr(self.config, "image_grid_pinpoints", None)
+
+        self.image_feature_len = int((self.image_size / self.patch_size) ** 2)
+        if self.vision_feature_select_strategy == "patch":
+            pass
+        elif self.vision_feature_select_strategy == "cls_patch":
+            self.image_feature_len += 1
+        else:
+            raise ValueError(f"Unexpected select feature: {self.select_feature}")
+
+        # load mm_projector
+        # TODO: support TP?
+        projector_weights = {
+            "model.mm_projector.0": "multi_modal_projector.linear_1",
+            "model.mm_projector.1": "multi_modal_projector.ln_1",
+            "model.mm_projector.3": "multi_modal_projector.linear_2",
+            "model.mm_projector.4": "multi_modal_projector.ln_2",
+            "model.vision_tower.vision_tower": "vision_tower",  # Update the vision tower weights if we find them in the checkpoint (it may be finetuned).
+        }
+        params_dict = dict(self.named_parameters())
+        weights = list(weights)
+        for name, loaded_weight in weights:
+            if "projector" in name or "vision_tower" in name:
+                for weight_name, param_name in projector_weights.items():
+                    if weight_name in name:
+                        name = name.replace(weight_name, param_name)
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+        # load language model
+        self.language_model.load_weights(weights)
+
+
+class YiVLMultiModalProjector(nn.Module):
+    def __init__(self, config: LlavaConfig):
+        super().__init__()
+
+        self.linear_1 = nn.Linear(
+            config.vision_config.hidden_size, config.text_config.hidden_size
+        )
+        self.ln_1 = nn.LayerNorm(config.text_config.hidden_size)
+        self.act = nn.GELU()
+        self.linear_2 = nn.Linear(
+            config.text_config.hidden_size, config.text_config.hidden_size
+        )
+        self.ln_2 = nn.LayerNorm(config.text_config.hidden_size)
+
+    def forward(self, image_features):
+        hidden_states = self.linear_1(image_features)
+        hidden_states = self.ln_1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+        hidden_states = self.ln_2(hidden_states)
+        return hidden_states
+
+
+EntryClass = YiVLForCausalLM
diff --git a/sglang/python/sglang/srt/multiplex/__pycache__/multiplexing_mixin.cpython-311.pyc b/sglang/python/sglang/srt/multiplex/__pycache__/multiplexing_mixin.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..553a34932d08a73f478520ba894863eb096eff30
Binary files /dev/null and b/sglang/python/sglang/srt/multiplex/__pycache__/multiplexing_mixin.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/multiplex/__pycache__/pdmux_context.cpython-311.pyc b/sglang/python/sglang/srt/multiplex/__pycache__/pdmux_context.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3ec7ced6fe4ff1568bb1557f118530db28c15bfe
Binary files /dev/null and b/sglang/python/sglang/srt/multiplex/__pycache__/pdmux_context.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/multiplex/multiplexing_mixin.py b/sglang/python/sglang/srt/multiplex/multiplexing_mixin.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e1e858aefb6ee5e69b6a06d025ad7d293b582dd
--- /dev/null
+++ b/sglang/python/sglang/srt/multiplex/multiplexing_mixin.py
@@ -0,0 +1,221 @@
+"""
+Mixin class providing multiplexing scheduling logic
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Optional
+
+import torch
+import torch.distributed as dist
+from torch.cuda.streams import ExternalStream
+
+from sglang.srt.distributed.parallel_state import set_pdmux_status
+from sglang.srt.model_executor.forward_batch_info import ForwardMode
+from sglang.srt.multiplex.pdmux_context import (
+    get_current_stream_idx,
+    get_sm_counts,
+    get_stream_groups,
+    initialize_stream_groups,
+    load_pdmux_config,
+    set_current_stream_idx,
+)
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.schedule_batch import ScheduleBatch
+    from sglang.srt.managers.scheduler import Scheduler
+
+logger = logging.getLogger(__name__)
+
+
+class SchedulerMultiplexMixin:
+
+    def init_pdmux(self: Scheduler):
+        # The current split prefill batch
+        self.split_prefill_batch: Optional[ScheduleBatch] = None
+
+        # for pd_multiplexing, Init stream_groups, exclude normal stream for prefill only and decode only
+        self.pdmux_config = load_pdmux_config(self.server_args.pdmux_config_path)
+        initialize_stream_groups(self.gpu_id, self.pdmux_config)
+        self.stream_groups = get_stream_groups()
+        self.sm_counts = get_sm_counts()
+        self.real_sm_group_num = len(self.stream_groups)
+        logger.info(
+            f"PD-Multiplexing enabled with {self.real_sm_group_num} stream groups, sm_counts (prefill_sm, decode_sm): {self.sm_counts}"
+        )
+
+    # TODO(jason-fxz): This is a temporary demo
+    def adjust_stream_groups(
+        self: Scheduler,
+    ) -> tuple[int, tuple[ExternalStream, ExternalStream]]:
+        if not self.running_batch.is_empty() and self.split_prefill_batch:
+            decode_bs = self.running_batch.batch_size()
+            manual_divisions = self.pdmux_config.manual_divisions
+            if manual_divisions:
+                for i in range(len(manual_divisions)):
+                    _, _, threshold = manual_divisions[i]
+                    if decode_bs >= threshold:
+                        stream_idx = i + 1
+            else:
+                stream_idx = max(
+                    1,
+                    min(
+                        self.real_sm_group_num - 2,
+                        decode_bs
+                        * (self.real_sm_group_num - 2)
+                        // self.pdmux_config.decode_bs_divisor,
+                    ),
+                )
+            set_current_stream_idx(stream_idx)
+        elif not self.running_batch.is_empty():
+            set_current_stream_idx(self.real_sm_group_num - 1)
+        else:
+            set_current_stream_idx(0)
+
+        stream_idx = get_current_stream_idx()
+
+        self.tp_worker.model_runner.update_decode_attn_backend(stream_idx)
+        return stream_idx, self.stream_groups[stream_idx]
+
+    def update_split_prefill_batch(self: Scheduler, sm_count: int) -> bool:
+        if self.split_prefill_batch:
+            return False
+
+        # add new request
+        batch = self.get_new_batch_prefill()
+        if batch and not batch.is_empty():
+            batch.forward_mode = (
+                ForwardMode.SPLIT_PREFILL
+            )  # Set forward mode for split prefill
+            self.split_prefill_batch = batch
+            return True
+        return False
+
+    @torch.inference_mode()
+    def event_loop_pdmux(self: Scheduler):
+        """A scheduler loop for pd multiplexing."""
+        decode_done = False
+        prefill_done = False
+        wait_prefill_kernel_done = False
+        adjust_stream_group = False
+        stream_idx = get_current_stream_idx()
+        stream_group = self.stream_groups[stream_idx]
+        prefill_stream = stream_group[0]
+        decode_stream = stream_group[1]
+        torch.cuda.empty_cache()
+
+        logger.debug("Starting event loop for pd multiplexing...")
+
+        while True:
+            with torch.cuda.stream(decode_stream):
+                set_pdmux_status(False)
+                recv_reqs = self.recv_requests()
+                self.process_input_requests(recv_reqs)
+
+            with torch.cuda.stream(prefill_stream):
+                set_pdmux_status(True)
+                sm_count = self.sm_counts[stream_idx][0]
+                if not wait_prefill_kernel_done:
+                    adjust_stream_group = (
+                        self.update_split_prefill_batch(sm_count) or adjust_stream_group
+                    )
+
+            with torch.cuda.stream(decode_stream):
+                set_pdmux_status(False)
+                self.running_batch = self.update_running_batch(self.running_batch)
+                adjust_stream_group = adjust_stream_group or (
+                    stream_idx > 0 and self.running_batch.is_empty()
+                )
+                if self.running_batch.is_empty() and self.split_prefill_batch is None:
+                    self.check_memory()
+                    self.check_tree_cache()
+                    self.new_token_ratio = self.init_new_token_ratio
+                    self.maybe_sleep_on_idle()
+
+            if adjust_stream_group:
+                prefill_stream.synchronize()
+                decode_stream.synchronize()
+                stream_idx, stream_group = self.adjust_stream_groups()
+                prefill_stream = stream_group[0]
+                decode_stream = stream_group[1]
+                adjust_stream_group = False
+                logger.debug(
+                    f"Adjusting stream groups: {stream_idx}, prefill sm: {self.sm_counts[stream_idx][0]}, decode sm: {self.sm_counts[stream_idx][1]}"
+                )
+
+            with torch.cuda.stream(decode_stream):
+                set_pdmux_status(False)
+                # process decode batch
+                if self.running_batch and not self.running_batch.is_empty():
+                    decode_result = self.run_batch(self.running_batch)
+                    decode_done = True
+                else:
+                    decode_done = False
+            with torch.cuda.stream(prefill_stream):
+                set_pdmux_status(True)
+                if (
+                    self.split_prefill_batch
+                    and not self.split_prefill_batch.is_empty()
+                    and not wait_prefill_kernel_done
+                ):
+                    prefill_done = True
+                    forward_count = (
+                        max(
+                            1,
+                            self.pdmux_config.split_forward_token_budget
+                            // self.split_prefill_batch.extend_num_tokens,
+                        )
+                        if self.split_prefill_batch.extend_num_tokens > 0
+                        else self.model_config.num_hidden_layers
+                    )
+                    next_split_index = min(
+                        self.split_prefill_batch.split_index + forward_count,
+                        self.model_config.num_hidden_layers,
+                    )
+                    forward_count = (
+                        next_split_index - self.split_prefill_batch.split_index
+                    )
+
+                    self.split_prefill_batch.split_forward_count = forward_count
+                    prefill_result = self.run_batch(self.split_prefill_batch)
+                    if next_split_index == self.model_config.num_hidden_layers:
+                        self.split_prefill_batch.split_prefill_finished = True
+                        prefill_exe_done = prefill_stream.record_event()
+                    self.split_prefill_batch.split_index = next_split_index
+
+                elif wait_prefill_kernel_done:
+                    prefill_done = True
+                else:
+                    prefill_done = False
+
+            with torch.cuda.stream(decode_stream):
+                set_pdmux_status(False)
+                decode_stream.synchronize()
+                if decode_done:
+                    self.process_batch_result(self.running_batch, decode_result)
+
+            with torch.cuda.stream(prefill_stream):
+                set_pdmux_status(True)
+                if prefill_done and self.split_prefill_batch.split_prefill_finished:
+                    wait_prefill_kernel_done = True
+                    prefill_exe_done_flag = prefill_exe_done.query()
+                    flags = (
+                        torch.ones(1, device="cpu", dtype=torch.int32)
+                        if prefill_exe_done_flag
+                        else torch.zeros(1, device="cpu", dtype=torch.int32)
+                    )
+
+                    self.tp_cpu_group.allreduce(flags, dist.ReduceOp.SUM).wait()
+                    if flags.item() == self.tp_size:
+                        self.process_batch_result(
+                            self.split_prefill_batch, prefill_result
+                        )
+                        if self.running_batch and not self.running_batch.is_empty():
+                            self.running_batch.merge_batch(self.split_prefill_batch)
+                        else:
+                            self.running_batch = self.split_prefill_batch
+
+                        self.split_prefill_batch = None
+                        wait_prefill_kernel_done = False
+                        adjust_stream_group = True
diff --git a/sglang/python/sglang/srt/multiplex/pdmux_context.py b/sglang/python/sglang/srt/multiplex/pdmux_context.py
new file mode 100644
index 0000000000000000000000000000000000000000..05cde13710a6a0da436d95ffd1171241bd6fc2da
--- /dev/null
+++ b/sglang/python/sglang/srt/multiplex/pdmux_context.py
@@ -0,0 +1,164 @@
+from dataclasses import dataclass, field
+from typing import List
+
+import torch
+import yaml
+
+STREAM_GROUPS = []
+SM_COUNTS = []
+SM_GROUP_NUM = 8  # Default number of SM groups
+CURRENT_STREAM_IDX = 0
+CURRENT_STREAM_GROUP = None
+
+
+@dataclass
+class PDMuxConfig:
+    sm_group_num: int = 8
+    manual_divisions: List[List[int]] = field(
+        default_factory=list
+    )  # [prefill_sm, decode_sm, decode_bs_threshold]
+    split_forward_token_budget: int = 65536
+    decode_bs_divisor: int = 36
+
+
+def load_pdmux_config(config_path: str) -> PDMuxConfig:
+    """Load pdmux configuration from YAML file into a dataclass."""
+    if not config_path:
+        return PDMuxConfig()
+
+    with open(config_path, "r") as f:
+        raw = yaml.safe_load(f)
+
+    if "sm_group_num" not in raw:
+        raise ValueError("Missing required field: sm_group_num")
+
+    if raw["sm_group_num"] < 3:
+        raise ValueError("sm_group_num must be >= 3")
+
+    manual_divisions = raw.get("manual_divisions", [])
+
+    expected = raw["sm_group_num"] - 2
+    if manual_divisions and len(manual_divisions) != expected:
+        raise ValueError(
+            f"manual_divisions must have {expected} entries, "
+            f"but got {len(manual_divisions)}"
+        )
+
+    return PDMuxConfig(
+        sm_group_num=raw["sm_group_num"],
+        manual_divisions=manual_divisions,
+        split_forward_token_budget=raw.get("split_forward_token_budget", 65536),
+        decode_bs_divisor=raw.get("decode_bs_divisor", 36),
+    )
+
+
+def get_arch_constraints(compute_capability):
+    major, minor = compute_capability
+    # green context constraints for different architectures
+    if major == 6:
+        return 1, 1  # min_per_part, multiple
+    elif major == 7:
+        return 2, 2
+    elif major == 8:
+        return 4, 2
+    elif major == 9 and minor >= 0:
+        return 8, 8
+    else:
+        raise ValueError(f"Unsupported compute capability: {major}.{minor}")
+
+
+def divide_sm(total_sms, compute_capability, groups):
+    """
+    :param total_sms: total sm count on a single GPU
+    :param compute_capability: (major, minor)
+    :return: SM partition group(prefill sm, decode sm)
+    """
+    min_per_part, multiple = get_arch_constraints(compute_capability)
+    possible_values = [
+        x
+        for x in range(min_per_part, total_sms - min_per_part + 1, multiple)
+        if x >= total_sms - x and total_sms - x >= 16
+    ]
+    if not possible_values:
+        raise ValueError(
+            f"No valid partitions found for total SMs {total_sms} "
+            f"with constraints (min per part: {min_per_part}, multiple: {multiple})"
+        )
+
+    if len(possible_values) >= groups:
+        step = max(1, len(possible_values) // groups)
+        selected_values = possible_values[::step][:groups]
+    else:
+        selected_values = possible_values
+
+    divisions = []
+    for part1 in selected_values:
+        part2 = total_sms - part1
+        divisions.append((part1, part2))
+
+    divisions.reverse()  # Reverse to have larger prefill SM first
+
+    return divisions
+
+
+def initialize_stream_groups(gpu_id: int, config: PDMuxConfig):
+    from sgl_kernel import spatial
+
+    global STREAM_GROUPS, SM_COUNTS, SM_GROUP_NUM, CURRENT_STREAM_IDX, CURRENT_STREAM_GROUP
+    # for pd_multiplexing, Init stream_groups
+    device = torch.cuda.current_device()
+    total_sm_count = spatial.get_sm_available(gpu_id)
+    # (prefill_sm_count, decode_sm_count)
+    if config.manual_divisions:
+        divisions = [
+            (prefill_sm, decode_sm)
+            for prefill_sm, decode_sm, _ in config.manual_divisions
+        ]
+    else:
+        divisions = divide_sm(
+            total_sm_count,
+            torch.cuda.get_device_capability(device),
+            config.sm_group_num - 2,
+        )
+
+    SM_COUNTS = []
+    SM_COUNTS.append((total_sm_count, 0))  # Normal stream for prefill
+    SM_COUNTS.extend(divisions)  # Add the divided SM counts
+    SM_COUNTS.append((0, total_sm_count))  # Normal stream for decode
+    STREAM_GROUPS = []
+    STREAM_GROUPS.append(
+        (torch.cuda.Stream(gpu_id), torch.cuda.Stream(gpu_id))
+    )  # Normal stream for prefill
+    for prefill_sm, decode_sm in divisions:
+        STREAM_GROUPS.append(
+            (spatial.create_greenctx_stream_by_value(prefill_sm, decode_sm, gpu_id))
+        )
+    STREAM_GROUPS.append(
+        (torch.cuda.Stream(gpu_id), torch.cuda.Stream(gpu_id))
+    )  # Normal stream for decode
+
+    CURRENT_STREAM_IDX = 0
+    CURRENT_STREAM_GROUP = STREAM_GROUPS[CURRENT_STREAM_IDX]
+
+
+def set_current_stream_idx(idx: int):
+    global CURRENT_STREAM_IDX, CURRENT_STREAM_GROUP
+    if idx < 0 or idx >= len(STREAM_GROUPS):
+        raise ValueError(f"Invalid stream index: {idx}")
+    CURRENT_STREAM_IDX = idx
+    CURRENT_STREAM_GROUP = STREAM_GROUPS[CURRENT_STREAM_IDX]
+
+
+def get_stream_groups() -> list[tuple[torch.cuda.Stream, torch.cuda.Stream]]:
+    """Get the stream groups."""
+    return STREAM_GROUPS
+
+
+def get_sm_counts() -> list[tuple[int, int]]:
+    """Get the SM counts."""
+    return SM_COUNTS
+
+
+def get_current_stream_idx() -> int:
+    """Get the current stream index."""
+    return CURRENT_STREAM_IDX
diff --git a/sglang/python/sglang/srt/parser/__pycache__/code_completion_parser.cpython-311.pyc b/sglang/python/sglang/srt/parser/__pycache__/code_completion_parser.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7978f681a79d1bc8963b647ec353f3159ea082d2
Binary files /dev/null and b/sglang/python/sglang/srt/parser/__pycache__/code_completion_parser.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/parser/__pycache__/conversation.cpython-311.pyc b/sglang/python/sglang/srt/parser/__pycache__/conversation.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2670a219d60a2e9bbfcf2901c4d0acd868d86547
Binary files /dev/null and b/sglang/python/sglang/srt/parser/__pycache__/conversation.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/parser/__pycache__/harmony_parser.cpython-311.pyc b/sglang/python/sglang/srt/parser/__pycache__/harmony_parser.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..157ac167648d3df38c517c2ca93e9f4dd19b9434
Binary files /dev/null and b/sglang/python/sglang/srt/parser/__pycache__/harmony_parser.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/parser/__pycache__/jinja_template_utils.cpython-311.pyc b/sglang/python/sglang/srt/parser/__pycache__/jinja_template_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..df839d384e47f8056c11b37a673568beb58ff73e
Binary files /dev/null and b/sglang/python/sglang/srt/parser/__pycache__/jinja_template_utils.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/parser/__pycache__/reasoning_parser.cpython-311.pyc b/sglang/python/sglang/srt/parser/__pycache__/reasoning_parser.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9d4492738b921edc9cc08352672ade21883bf907
Binary files /dev/null and b/sglang/python/sglang/srt/parser/__pycache__/reasoning_parser.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/parser/code_completion_parser.py b/sglang/python/sglang/srt/parser/code_completion_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..510f744685b474e6343708bc37b5cc3d35af5604
--- /dev/null
+++ b/sglang/python/sglang/srt/parser/code_completion_parser.py
@@ -0,0 +1,138 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Completion templates."""
+
+import dataclasses
+import logging
+from enum import Enum, auto
+from typing import Optional
+
+from sglang.srt.entrypoints.openai.protocol import CompletionRequest
+
+logger = logging.getLogger(__name__)
+completion_template_name: Optional[str] = None
+
+
+class FimPosition(Enum):
+    """Position of fim middle token."""
+
+    MIDDLE = auto()
+    END = auto()
+
+
+@dataclasses.dataclass
+class CompletionTemplate:
+    """A class that manages completion prompt templates. only for code completion currently."""
+
+    # The name of this template
+    name: str
+
+    # the fim begin token
+    fim_begin_token: str
+
+    # The fim middle token
+    fim_middle_token: str
+
+    # The fim end token
+    fim_end_token: str
+
+    # The position of the fim middle token
+    fim_position: FimPosition
+
+
+# A global registry for all completion templates
+completion_templates: dict[str, CompletionTemplate] = {}
+
+
+def register_completion_template(template: CompletionTemplate, override: bool = False):
+    """Register a new completion template."""
+    if not override:
+        assert (
+            template.name not in completion_templates
+        ), f"{template.name} has been registered."
+
+    completion_templates[template.name] = template
+
+
+def completion_template_exists(template_name: str) -> bool:
+    return template_name in completion_templates
+
+
+def set_completion_template(template_name: str) -> None:
+    global completion_template_name
+    if completion_template_name is None:
+        completion_template_name = template_name
+
+
+def is_completion_template_defined() -> bool:
+    global completion_template_name
+    return completion_template_name is not None
+
+
+def generate_completion_prompt_from_request(request: CompletionRequest) -> str:
+    global completion_template_name
+    if request.suffix == "":
+        return request.prompt
+
+    return generate_completion_prompt(
+        request.prompt, request.suffix, completion_template_name
+    )
+
+
+def generate_completion_prompt(prompt: str, suffix: str, template_name: str) -> str:
+
+    completion_template = completion_templates[template_name]
+    fim_begin_token = completion_template.fim_begin_token
+    fim_middle_token = completion_template.fim_middle_token
+    fim_end_token = completion_template.fim_end_token
+    fim_position = completion_template.fim_position
+
+    if fim_position == FimPosition.MIDDLE:
+        prompt = f"{fim_begin_token}{prompt}{fim_middle_token}{suffix}{fim_end_token}"
+    elif fim_position == FimPosition.END:
+        prompt = f"{fim_begin_token}{prompt}{fim_end_token}{suffix}{fim_middle_token}"
+
+    return prompt
+
+
+register_completion_template(
+    CompletionTemplate(
+        name="deepseek_coder",
+        fim_begin_token="<｜fim▁begin｜>",
+        fim_middle_token="<｜fim▁hole｜>",
+        fim_end_token="<｜fim▁end｜>",
+        fim_position=FimPosition.MIDDLE,
+    )
+)
+
+
+register_completion_template(
+    CompletionTemplate(
+        name="star_coder",
+        fim_begin_token="<fim_prefix>",
+        fim_middle_token="<fim_middle>",
+        fim_end_token="<fim_suffix>",
+        fim_position=FimPosition.END,
+    )
+)
+
+register_completion_template(
+    CompletionTemplate(
+        name="qwen_coder",
+        fim_begin_token="<|fim_prefix|>",
+        fim_middle_token="<|fim_middle|>",
+        fim_end_token="<|fim_suffix|>",
+        fim_position=FimPosition.END,
+    )
+)
diff --git a/sglang/python/sglang/srt/parser/conversation.py b/sglang/python/sglang/srt/parser/conversation.py
new file mode 100644
index 0000000000000000000000000000000000000000..954cb168ba34b1d066af099715b133ff18719730
--- /dev/null
+++ b/sglang/python/sglang/srt/parser/conversation.py
@@ -0,0 +1,1157 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Conversation chat templates.
+
+This module provides conversation template definitions, data structures, and utilities
+for managing chat templates across different model types in SGLang.
+
+Key components:
+- Conversation class: Defines the structure and behavior of chat templates
+- SeparatorStyle enum: Different conversation formatting styles
+- Template registry: Functions to register and retrieve templates by name or model path
+- Built-in templates: Pre-defined templates for popular models
+"""
+
+# Adapted from
+# https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
+import dataclasses
+import json
+import os
+import re
+from enum import IntEnum, auto
+from typing import Callable, Dict, List, Optional, Tuple, Union
+
+from typing_extensions import Literal
+
+from sglang.srt.entrypoints.openai.protocol import ChatCompletionRequest
+from sglang.srt.utils import ImageData, read_system_prompt_from_file
+
+
+class SeparatorStyle(IntEnum):
+    """Separator styles."""
+
+    ADD_COLON_SINGLE = auto()
+    ADD_COLON_TWO = auto()
+    ADD_COLON_SPACE_SINGLE = auto()
+    NO_COLON_SINGLE = auto()
+    NO_COLON_TWO = auto()
+    ADD_NEW_LINE_SINGLE = auto()
+    LLAMA2 = auto()
+    LLAMA3 = auto()
+    LLAMA4 = auto()
+    CHATGLM = auto()
+    CHATML = auto()
+    CHATINTERN = auto()
+    DOLLY = auto()
+    RWKV = auto()
+    PHOENIX = auto()
+    ROBIN = auto()
+    FALCON_CHAT = auto()
+    CHATGLM3 = auto()
+    DEEPSEEK_CHAT = auto()
+    METAMATH = auto()
+    DeepSeekVL2 = auto()
+    QWEN2_VL_EMBED = auto()
+    QWEN2_AUDIO = auto()
+    GEMMA3 = auto()
+    MPT = auto()
+    PADDLE_OCR = auto()
+
+
+@dataclasses.dataclass
+class Conversation:
+    """A class that manages prompt templates and keeps all conversation history."""
+
+    # The name of this template
+    name: str
+    # The template of the system prompt
+    system_template: str = "{system_message}"
+    # The system message
+    system_message: str = ""
+    # The names of two roles
+    roles: Tuple[str] = ("USER", "ASSISTANT")
+    # All messages. Each item is (role, message).
+    messages: List[List[str]] = ()
+    # The number of few shot examples
+    offset: int = 0
+    # The separator style and configurations
+    sep_style: SeparatorStyle = SeparatorStyle.ADD_COLON_SINGLE
+    sep: str = "\n"
+    sep2: str = None
+    # Stop criteria (the default one is EOS token)
+    stop_str: Union[str, List[str]] = None
+    # The string that represents an image token in the prompt
+    image_token: str = "<image>"
+    video_token: str = "<video>"
+    audio_token: str = "<audio>"
+
+    image_data: Optional[List[ImageData]] = None
+    video_data: Optional[List[str]] = None
+    modalities: Optional[List[str]] = None
+    stop_token_ids: Optional[int] = None
+
+    audio_data: Optional[List[str]] = None
+    image_token_at_prefix: bool = False
+
+    def get_prompt(self) -> str:
+        """Get the prompt for generation."""
+        system_prompt = self.system_template.format(system_message=self.system_message)
+        if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE:
+            ret = system_prompt + self.sep
+            for role, message in self.messages:
+                if message:
+                    ret += role + ": " + message + self.sep
+                else:
+                    ret += role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.ADD_COLON_TWO:
+            seps = [self.sep, self.sep2]
+            ret = system_prompt + seps[0]
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += role + ": " + message + seps[i % 2]
+                else:
+                    ret += role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE:
+            ret = system_prompt + self.sep
+            for role, message in self.messages:
+                if message:
+                    ret += role + ": " + message + self.sep
+                else:
+                    ret += role + ": "  # must be end with a space
+            return ret
+        elif self.sep_style == SeparatorStyle.ADD_NEW_LINE_SINGLE:
+            ret = "" if system_prompt == "" else system_prompt + self.sep
+            for role, message in self.messages:
+                if message:
+                    ret += role + "\n" + message + self.sep
+                else:
+                    ret += role + "\n"
+            return ret
+        elif self.sep_style == SeparatorStyle.QWEN2_VL_EMBED:
+            ret = "" if system_prompt == "" else system_prompt + self.sep
+            for role, message in self.messages:
+                if message:
+                    ret += role + "\n" + message + self.sep
+                else:
+                    ret += role + "\n"
+            ret += self.stop_str
+            return ret
+        elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE:
+            ret = system_prompt
+            for role, message in self.messages:
+                if message:
+                    ret += role + message + self.sep
+                else:
+                    ret += role
+            return ret
+        elif self.sep_style == SeparatorStyle.NO_COLON_TWO:
+            seps = [self.sep, self.sep2]
+            ret = system_prompt
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += role + message + seps[i % 2]
+                else:
+                    ret += role
+            return ret
+        elif self.sep_style == SeparatorStyle.RWKV:
+            ret = system_prompt
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += (
+                        role
+                        + ": "
+                        + message.replace("\r\n", "\n").replace("\n\n", "\n")
+                    )
+                    ret += "\n\n"
+                else:
+                    ret += role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.LLAMA4:
+            # begin_of_text is added by default
+            if self.system_message:
+                ret = system_prompt
+            else:
+                ret = ""
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += f"<|header_start|>{role}<|header_end|>\n\n"
+                    ret += f"{message.strip()}<|eot|>"
+                else:
+                    ret += f"<|header_start|>{role}<|header_end|>\n\n"
+            return ret
+        elif self.sep_style == SeparatorStyle.LLAMA3:
+            if self.system_message:
+                ret = system_prompt
+            else:
+                ret = ""
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += f"<|start_header_id|>{role}<|end_header_id|>\n\n"
+                    ret += f"{message.strip()}<|eot_id|>"
+                else:
+                    ret += f"<|start_header_id|>{role}<|end_header_id|>\n\n"
+            return ret
+        elif self.sep_style == SeparatorStyle.LLAMA2:
+            seps = [self.sep, self.sep2]
+            if self.system_message:
+                ret = system_prompt
+            else:
+                ret = "[INST] "
+            for i, (role, message) in enumerate(self.messages):
+                tag = self.roles[i % 2]
+                if message:
+                    if i == 0:
+                        ret += message + " "
+                    else:
+                        ret += tag + " " + message + seps[i % 2]
+                else:
+                    ret += tag
+            return ret
+        elif self.sep_style == SeparatorStyle.CHATGLM:
+            # source: https://huggingface.co/THUDM/chatglm-6b/blob/1d240ba371910e9282298d4592532d7f0f3e9f3e/modeling_chatglm.py#L1302-L1308
+            # source2: https://huggingface.co/THUDM/chatglm2-6b/blob/e186c891cf64310ac66ef10a87e6635fa6c2a579/modeling_chatglm.py#L926
+            round_add_n = 1 if self.name == "chatglm2" else 0
+            if system_prompt:
+                ret = system_prompt + self.sep
+            else:
+                ret = ""
+
+            for i, (role, message) in enumerate(self.messages):
+                if i % 2 == 0:
+                    ret += f"[Round {i // 2 + round_add_n}]{self.sep}"
+
+                if message:
+                    ret += f"{role}：{message}{self.sep}"
+                else:
+                    ret += f"{role}："
+            return ret
+        elif self.sep_style == SeparatorStyle.CHATML:
+            ret = "" if system_prompt == "" else system_prompt + self.sep + "\n"
+            for role, message in self.messages:
+                if message:
+                    ret += role + "\n" + message + self.sep + "\n"
+                else:
+                    ret += role + "\n"
+            return ret
+        elif self.sep_style == SeparatorStyle.CHATGLM3:
+            ret = ""
+            if self.system_message:
+                ret += system_prompt
+            for role, message in self.messages:
+                if message:
+                    ret += role + "\n" + message
+                else:
+                    ret += role
+            return ret
+        elif self.sep_style == SeparatorStyle.CHATINTERN:
+            # source: https://huggingface.co/internlm/internlm-chat-7b-8k/blob/bd546fa984b4b0b86958f56bf37f94aa75ab8831/modeling_internlm.py#L771
+            seps = [self.sep, self.sep2]
+            ret = system_prompt
+            for i, (role, message) in enumerate(self.messages):
+                if i % 2 == 0:
+                    ret += "<s>"
+                if message:
+                    ret += role + ":" + message + seps[i % 2] + "\n"
+                else:
+                    ret += role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.DOLLY:
+            seps = [self.sep, self.sep2]
+            ret = system_prompt
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += role + ":\n" + message + seps[i % 2]
+                    if i % 2 == 1:
+                        ret += "\n\n"
+                else:
+                    ret += role + ":\n"
+            return ret
+        elif self.sep_style == SeparatorStyle.PHOENIX:
+            ret = system_prompt
+            for role, message in self.messages:
+                if message:
+                    ret += role + ": " + "<s>" + message + "</s>"
+                else:
+                    ret += role + ": " + "<s>"
+            return ret
+        elif self.sep_style == SeparatorStyle.ROBIN:
+            ret = system_prompt + self.sep
+            for role, message in self.messages:
+                if message:
+                    ret += role + ":\n" + message + self.sep
+                else:
+                    ret += role + ":\n"
+            return ret
+        elif self.sep_style == SeparatorStyle.FALCON_CHAT:
+            ret = ""
+            if self.system_message:
+                ret += system_prompt + self.sep
+            for role, message in self.messages:
+                if message:
+                    ret += role + ": " + message + self.sep
+                else:
+                    ret += role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.METAMATH:
+            ret = "" if system_prompt == "" else system_prompt + self.sep
+            for i, (role, message) in enumerate(self.messages):
+                # For MetaMath, sep2 is used to prefix the message.
+                starting_sep = ":\n" if i % 2 == 0 else ": " + self.sep2
+                ending_sep = self.sep if i % 2 == 0 else ""
+                if message:
+                    ret += role + starting_sep + message + ending_sep
+                else:
+                    ret += role + starting_sep
+            return ret
+        elif self.sep_style == SeparatorStyle.DEEPSEEK_CHAT:
+            seps = [self.sep, self.sep2]
+            ret = system_prompt
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += role + ": " + message + seps[i % 2]
+                else:
+                    ret += role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.DeepSeekVL2:
+            seps = [self.sep, self.sep2]
+            if system_prompt == "" or system_prompt is None:
+                ret = ""
+            else:
+                ret = system_prompt + seps[0]
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += role + ": " + message + seps[i % 2]
+                else:
+                    ret += role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.GEMMA3:
+            ret = system_prompt
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    if i == 0:
+                        ret += message + self.sep
+                    else:
+                        ret += role + message + self.sep
+                else:
+                    ret += role
+            return ret
+
+        elif self.sep_style == SeparatorStyle.MPT:
+            ret = system_prompt + self.sep
+            for role, message in self.messages:
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    ret += role + message + self.sep
+                else:
+                    ret += role
+            return ret
+        elif self.sep_style == SeparatorStyle.QWEN2_AUDIO:
+            ret = "" if system_prompt == "" else system_prompt + self.sep
+
+            counter = 1
+            for role, message in self.messages:
+                if message:
+                    while self.audio_token in message:
+                        message = message.replace(
+                            self.audio_token, self.audio_token.format(idx=counter), 1
+                        )
+                        counter += 1
+
+                    ret += role + "\n" + message + self.sep
+                else:
+                    ret += role + "\n"
+
+            return ret
+        elif self.sep_style == SeparatorStyle.PADDLE_OCR:
+            ret = system_prompt
+            for role, message in self.messages:
+                if message:
+                    ret += role + ": "
+                    if role == self.roles[0]:
+                        if self.image_token in message:
+                            ret += message.replace(
+                                self.image_token + "\n", self.image_token
+                            )
+                        else:
+                            ret += message
+                        ret += "\n"
+                    else:
+                        ret += message + self.sep
+                else:
+                    ret += role + ": "  # must be end with a space
+            return ret
+        else:
+            raise ValueError(f"Invalid style: {self.sep_style}")
+
+    def set_system_message(self, system_message: str):
+        """Set the system message."""
+        self.system_message = system_message
+
+    def append_message(self, role: str, message: str):
+        """Append a new message."""
+        self.messages.append([role, message])
+
+    def append_image(self, image: str, detail: Literal["auto", "low", "high"]):
+        """Append a new image."""
+        self.image_data.append(ImageData(url=image, detail=detail))
+
+    def append_video(self, video: str):
+        """Append a new video."""
+        self.video_data.append(video)
+
+    def append_audio(self, audio: str):
+        """Append a new audio."""
+        self.audio_data.append(audio)
+
+    def update_last_message(self, message: str):
+        """Update the last output.
+
+        The last message is typically set to be None when constructing the prompt,
+        so we need to update it in-place after getting the response from a model.
+        """
+        self.messages[-1][1] = message
+
+    def to_gradio_chatbot(self):
+        """Convert the conversation to gradio chatbot format."""
+        ret = []
+        for i, (role, msg) in enumerate(self.messages[self.offset :]):
+            if i % 2 == 0:
+                ret.append([msg, None])
+            else:
+                ret[-1][-1] = msg
+        return ret
+
+    def to_openai_api_messages(self):
+        """Convert the conversation to OpenAI chat completion format."""
+        if self.system_message == "":
+            ret = []
+        else:
+            ret = [{"role": "system", "content": self.system_message}]
+
+        for i, (_, msg) in enumerate(self.messages[self.offset :]):
+            if i % 2 == 0:
+                ret.append({"role": "user", "content": msg})
+            else:
+                if msg is not None:
+                    ret.append({"role": "assistant", "content": msg})
+        return ret
+
+    def copy(self):
+        return Conversation(
+            name=self.name,
+            system_template=self.system_template,
+            system_message=self.system_message,
+            roles=self.roles,
+            messages=[[x, y] for x, y in self.messages],
+            offset=self.offset,
+            sep_style=self.sep_style,
+            sep=self.sep,
+            sep2=self.sep2,
+            stop_str=self.stop_str,
+            image_token=self.image_token,
+            video_token=self.video_token,
+            audio_token=self.audio_token,
+            image_token_at_prefix=self.image_token_at_prefix,
+        )
+
+    def dict(self):
+        return {
+            "template_name": self.name,
+            "system_message": self.system_message,
+            "roles": self.roles,
+            "messages": self.messages,
+            "offset": self.offset,
+        }
+
+
+# A global registry for all conversation templates
+chat_templates: Dict[str, Conversation] = {}
+matching_function_registry: List[Callable] = []
+
+
+def register_conv_template(template: Conversation, override: bool = False):
+    """Register a new conversation template."""
+    if not override:
+        assert (
+            template.name not in chat_templates
+        ), f"{template.name} has been registered."
+
+    chat_templates[template.name] = template
+
+
+def register_conv_template_matching_function(func):
+    matching_function_registry.append(func)
+
+
+def get_conv_template_by_model_path(model_path):
+    for matching_func in matching_function_registry:
+        conv_name = matching_func(model_path)
+        if conv_name is not None:
+            return conv_name
+    return None
+
+
+def chat_template_exists(template_name: str) -> bool:
+    return template_name in chat_templates
+
+
+def generate_embedding_convs(
+    texts: List[str], images: List[str], videos: List[str], template_name: str
+) -> List[Conversation]:
+    conv_template = chat_templates[template_name].copy()
+    convs = []
+    for text, image, video in zip(texts, images, videos):
+        conv = Conversation(
+            name=conv_template.name,
+            system_template=conv_template.system_template,
+            system_message=conv_template.system_message,
+            roles=conv_template.roles,
+            messages=list(conv_template.messages),  # prevent in-place modification
+            offset=conv_template.offset,
+            sep_style=SeparatorStyle(conv_template.sep_style),
+            sep=conv_template.sep,
+            sep2=conv_template.sep2,
+            stop_str=conv_template.stop_str,
+            image_data=[],
+            video_data=[],
+            audio_data=[],
+            modalities=[],
+            image_token=conv_template.image_token,
+            video_token=conv_template.video_token,
+            audio_token=conv_template.audio_token,
+            image_token_at_prefix=conv_template.image_token_at_prefix,
+        )
+        real_content = ""
+
+        if image is not None:
+            image_token = (
+                conv.image_token + "\n"
+                if conv.name != "gme-qwen2-vl"
+                else conv.image_token
+            )
+            real_content += image_token
+        if video is not None:
+            real_content += conv.video_token
+        if text is not None:
+            real_content += text
+        conv.append_message(conv.roles[0], real_content)
+        # Add a blank message for the assistant.
+        conv.append_message(conv.roles[1], None)
+        convs.append(conv)
+
+    return convs
+
+
+# Models in which system adds modality tokens at prompt start automatically
+# when media inputs exceed modality tokens in prompt (e.g. 3 images but 2 <image> tokens)
+_MODELS_REQUIRING_MODALITY_SUPPLEMENT = {"deepseek-vl2"}
+
+
+# adapted from https://github.com/vllm-project/vllm/blob/5124f5bf51b83e6f344c1bc6652e8c4d81313b34/vllm/entrypoints/chat_utils.py#L856
+def _get_full_multimodal_text_prompt(
+    modality_token: str, modality_count: int, text_prompt: str
+) -> str:
+    """Combine multimodal prompts for a multimodal language model."""
+
+    # For any existing placeholder in the text prompt, we leave it as is
+    left: int = modality_count - text_prompt.count(modality_token)
+    if left < 0:
+        raise ValueError(
+            f"Found more '{modality_token}' placeholders in input prompt than "
+            "actual multimodal data items."
+        )
+
+    # NOTE: For now we always add missing modality_token at the front of
+    # the prompt. This may change to be customizable in the future.
+    return "\n".join([modality_token] * left + [text_prompt])
+
+
+def generate_chat_conv(
+    request: ChatCompletionRequest, template_name: str
+) -> Conversation:
+    conv = chat_templates[template_name].copy()
+    conv = Conversation(
+        name=conv.name,
+        system_template=conv.system_template,
+        system_message=conv.system_message,
+        roles=conv.roles,
+        messages=list(conv.messages),  # prevent in-place modification
+        offset=conv.offset,
+        sep_style=SeparatorStyle(conv.sep_style),
+        sep=conv.sep,
+        sep2=conv.sep2,
+        stop_str=conv.stop_str,
+        image_data=[],
+        video_data=[],
+        audio_data=[],
+        modalities=[],
+        image_token=conv.image_token,
+        audio_token=conv.audio_token,
+        video_token=conv.video_token,
+        image_token_at_prefix=conv.image_token_at_prefix,
+    )
+
+    if isinstance(request.messages, str):
+        raise ValueError("The messages should be a list of dict.")
+    for message in request.messages:
+        msg_role = message.role
+        if msg_role == "system":
+            if isinstance(message.content, str):
+                conv.system_message = message.content
+            elif isinstance(message.content, list):
+                if (
+                    len(message.content) != 1
+                    or getattr(message.content[0], "type", None) != "text"
+                ):
+                    raise ValueError("The system message should be a single text.")
+                else:
+                    conv.system_message = getattr(message.content[0], "text", "")
+        elif msg_role == "user":
+            # Handle the various types of Chat Request content types here.
+            if isinstance(message.content, str):
+                conv.append_message(conv.roles[0], message.content)
+            else:
+                real_content = ""
+                # calculate number of image_url
+                num_image_url = 0
+                for content in message.content:
+                    if content.type == "image_url":
+                        num_image_url += 1
+                        conv.modalities.append(content.modalities)
+                image_token = (
+                    conv.image_token + "\n"
+                    if conv.name != "qwen2-vl"
+                    else conv.image_token
+                )
+                add_token_as_needed: bool = (
+                    conv.name in _MODELS_REQUIRING_MODALITY_SUPPLEMENT
+                )
+                if add_token_as_needed:
+                    image_token = ""
+
+                audio_token = conv.audio_token
+                video_token = conv.video_token
+                for content in message.content:
+                    if content.type == "text":
+                        if num_image_url > 16:
+                            real_content += "\n"  # for video
+                        real_content += content.text
+                    elif content.type == "image_url":
+                        # NOTE: works for llava and intervl2_5
+                        if conv.image_token_at_prefix:
+                            real_content = image_token + real_content
+                        else:
+                            real_content += image_token
+                        conv.append_image(
+                            content.image_url.url, content.image_url.detail
+                        )
+                    elif content.type == "video_url":
+                        real_content += video_token
+                        conv.append_video(content.video_url.url)
+                    elif content.type == "audio_url":
+                        real_content += audio_token
+                        conv.append_audio(content.audio_url.url)
+                if add_token_as_needed:
+                    real_content = _get_full_multimodal_text_prompt(
+                        conv.image_token, num_image_url, real_content
+                    )
+                conv.append_message(conv.roles[0], real_content)
+        elif msg_role == "assistant":
+            parsed_content = ""
+            if isinstance(message.content, str):
+                parsed_content = message.content
+            elif isinstance(message.content, list):
+                if (
+                    len(message.content) != 1
+                    or getattr(message.content[0], "type", None) != "text"
+                ):
+                    raise ValueError(
+                        "The assistant's response should be a single text."
+                    )
+                else:
+                    parsed_content = getattr(message.content[0], "text", "")
+            conv.append_message(conv.roles[1], parsed_content)
+        else:
+            raise ValueError(f"Unknown role: {msg_role}")
+
+    # Add a blank message for the assistant.
+    conv.append_message(conv.roles[1], None)
+    return conv
+
+
+# llama2 template
+# reference: https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
+# reference: https://github.com/facebookresearch/llama/blob/1a240688810f8036049e8da36b073f63d2ac552c/llama/generation.py#L212
+register_conv_template(
+    Conversation(
+        name="llama-2",
+        system_template="[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n",
+        roles=("[INST]", "[/INST]"),
+        sep_style=SeparatorStyle.LLAMA2,
+        sep=" ",
+        sep2=" </s><s>",
+        stop_str=["[INST]", "[/INST]", "<<SYS>>", "<</SYS>>"],
+    )
+)
+
+# reference: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/blob/main/chat_template.json
+register_conv_template(
+    Conversation(
+        name="mistral",
+        system_template="[SYSTEM_PROMPT]\n{system_message}\n[/SYSTEM_PROMPT]\n\n",
+        roles=("[INST]", "[/INST]"),
+        sep_style=SeparatorStyle.LLAMA2,
+        sep=" ",
+        sep2=" </s><s>",
+        stop_str=["[INST]", "[/INST]", "[SYSTEM_PROMPT]", "[/SYSTEM_PROMPT]"],
+        image_token="[IMG]",
+    )
+)
+
+register_conv_template(
+    Conversation(
+        name="devstral",
+        system_template="[SYSTEM_PROMPT]\n{system_message}\n[/SYSTEM_PROMPT]\n\n",
+        system_message=read_system_prompt_from_file("mistralai/Devstral-Small-2505"),
+        roles=("[INST]", "[/INST]"),
+        sep_style=SeparatorStyle.LLAMA2,
+        sep=" ",
+        sep2=" </s><s>",
+        stop_str=["[INST]", "[/INST]", "[SYSTEM_PROMPT]", "[/SYSTEM_PROMPT]"],
+        image_token="[IMG]",
+    )
+)
+
+# reference: https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct/blob/main/chat_template.json
+register_conv_template(
+    Conversation(
+        name="llama-4",
+        system_template="<|header_start|>system<|header_end|>\n\n{system_message}<|eot|>",
+        roles=("user", "assistant"),
+        sep_style=SeparatorStyle.LLAMA4,
+        sep="",
+        stop_str=["<|end_of_text|>", "<|eot|>", "<|eom|>"],
+        image_token="<|image|>",
+    )
+)
+
+# TODO (lifuhuang): Refactor BaseMultimodalProcessor to support the default image token "<|image_{index}|>" in the future.
+register_conv_template(
+    Conversation(
+        name="phi-4-mm",
+        system_message="",
+        system_template="{system_message}",
+        roles=("<|user|>", "<|assistant|>"),
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        sep="<|end|>",
+        stop_str="<|end|>",
+        image_token="<|endoftext10|>",
+        audio_token="<|endoftext11|>",
+    )
+)
+
+register_conv_template(
+    Conversation(
+        name="chatml",
+        system_template="<|im_start|>system\n{system_message}",
+        system_message="You are a helpful assistant.",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep_style=SeparatorStyle.CHATML,
+        sep="<|im_end|>",
+        stop_str=["<|endoftext|>", "<|im_end|>"],
+    )
+)
+
+register_conv_template(
+    Conversation(
+        name="chatml-llava",
+        system_template="<|im_start|>system\n{system_message}",
+        system_message="You are a helpful assistant.",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep_style=SeparatorStyle.CHATML,
+        sep="<|im_end|>",
+        stop_str=["<|endoftext|>", "<|im_end|>"],
+    )
+)
+
+register_conv_template(
+    Conversation(
+        name="vicuna_v1.1",
+        system_message="A chat between a curious user and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the user's questions.",
+        roles=("USER", "ASSISTANT"),
+        sep_style=SeparatorStyle.ADD_COLON_TWO,
+        sep=" ",
+        sep2="</s>",
+    )
+)
+
+register_conv_template(
+    Conversation(
+        name="llama_3_vision",
+        system_message="You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.",
+        system_template="<|start_header_id|>system<|end_header_id|>\n\n{system_message}<|eot_id|>",
+        roles=("user", "assistant"),
+        sep_style=SeparatorStyle.LLAMA3,
+        sep="",
+        stop_str=["<|end_of_text|>", "<|eot_id|>"],
+        image_token="<|image|>",
+    )
+)
+
+register_conv_template(
+    Conversation(
+        name="llava_llama_3",
+        system_message="You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.",
+        system_template="<|start_header_id|>system<|end_header_id|>\n\n{system_message}<|eot_id|>",
+        roles=("user", "assistant"),
+        sep_style=SeparatorStyle.LLAMA3,
+        sep="",
+        stop_str=["<|end_of_text|>", "<|eot_id|>"],
+    )
+)
+# Reference: https://github.com/InternLM/lmdeploy/blob/387bf54b4f124e72aab30ae9755f562e435d3d01/lmdeploy/model.py#L425-L442
+register_conv_template(
+    Conversation(
+        name="internlm2-chat",
+        system_template="<|im_start|>system\n{system_message}",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep="\n",
+        stop_str=["<|im_end|>", "<|action_end|>"],
+    )
+)
+
+register_conv_template(
+    Conversation(
+        name="internvl-2-5",
+        system_template="<|im_start|>system\n{system_message}",
+        system_message="你是书生·万象，英文名是InternVL，是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。",
+        roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+        sep_style=SeparatorStyle.MPT,
+        sep="<|im_end|>\n",
+        stop_str=["<|im_end|>", "<|action_end|>"],
+        image_token="<IMG_CONTEXT>",
+        image_token_at_prefix=True,
+    )
+)
+
+# Reference: https://huggingface.co/docs/transformers/main/model_doc/qwen2_vl#usage-example
+register_conv_template(
+    Conversation(
+        name="qwen2-vl",
+        system_message="You are a helpful assistant.",
+        system_template="<|im_start|>system\n{system_message}",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep="<|im_end|>\n",
+        sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
+        stop_str=["<|im_end|>"],
+        image_token="<|vision_start|><|image_pad|><|vision_end|>",
+        video_token="<|vision_start|><|video_pad|><|vision_end|>",
+    )
+)
+
+register_conv_template(
+    Conversation(
+        name="deepseek-ocr",
+        system_message="",
+        system_template="",
+        roles=("", ""),
+        sep="",
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        stop_str=["<｜end▁of▁sentence｜>"],
+        image_token="<image>",
+        image_token_at_prefix=True,
+    )
+)
+
+register_conv_template(
+    Conversation(
+        name="paddle-ocr",
+        system_message="",
+        system_template="<|begin_of_sentence|>{system_message}",
+        roles=("User", "Assistant"),
+        sep="<|end_of_sentence|>",
+        sep_style=SeparatorStyle.PADDLE_OCR,
+        stop_str=["<|end_of_sentence|>"],
+        image_token="<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>",
+    )
+)
+
+register_conv_template(
+    Conversation(
+        name="deepseek-vl2",
+        system_template="{system_message}",
+        # system_message="You are a helpful assistant. Please answer truthfully and write out your "
+        # "thinking step by step to be sure you get the right answer.",
+        system_message="",
+        roles=("<|User|>", "<|Assistant|>"),
+        messages=(),
+        offset=0,
+        sep_style=SeparatorStyle.DeepSeekVL2,
+        sep="\n\n",
+        sep2="<｜end▁of▁sentence｜>",
+        stop_str=["User:", "<｜end▁of▁sentence｜>"],
+    )
+)
+
+# Reference: https://huggingface.co/google/gemma-3-4b-it/blob/main/config.json
+register_conv_template(
+    Conversation(
+        name="gemma-it",
+        system_message="You are a helpful assistant.",
+        system_template="<start_of_turn>user\n{system_message}\n\n",
+        roles=("<start_of_turn>user\n", "<start_of_turn>model\n"),
+        sep="<end_of_turn>\n",
+        sep_style=SeparatorStyle.GEMMA3,
+        stop_str=["<end_of_turn>"],
+        image_token="<start_of_image>",
+        audio_token="<start_of_audio>",
+    )
+)
+
+# Reference: https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct#usage
+register_conv_template(
+    Conversation(
+        name="gme-qwen2-vl",
+        system_message="You are a helpful assistant.",
+        system_template="<|im_start|>system\n{system_message}",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep="<|im_end|>\n",
+        sep_style=SeparatorStyle.QWEN2_VL_EMBED,
+        stop_str="<|endoftext|>",
+        image_token="<|vision_start|><|image_pad|><|vision_end|>",
+    )
+)
+
+# Reference: https://huggingface.co/openbmb/MiniCPM-V-2_6#usage
+register_conv_template(
+    Conversation(
+        name="minicpmv",
+        system_message="You are a helpful assistant",
+        system_template="<|im_start|>system\n{system_message}.",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep="<|im_end|>\n",
+        sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
+        stop_str=("<|im_end|>", "<|endoftext|>"),
+        image_token="(<image>./</image>)",
+        video_token="(<video>./</video>)",
+    )
+)
+
+# Reference: https://github.com/deepseek-ai/Janus?tab=readme-ov-file#janus-pro
+register_conv_template(
+    Conversation(
+        name="janus-pro",
+        system_message="You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language",
+        system_template="{system_message}.",
+        roles=("User", "Assistant"),
+        sep="\n\n",
+        sep2="<｜end▁of▁sentence｜>",
+        sep_style=SeparatorStyle.ADD_COLON_TWO,
+        stop_str=["<|User|>", "<｜end▁of▁sentence｜>"],
+        image_token="<image_placeholder>",
+    )
+)
+
+# Reference: https://huggingface.co/openbmb/MiniCPM-o-2_6#usage
+register_conv_template(
+    Conversation(
+        name="minicpmo",
+        system_message="You are Qwen, created by Alibaba Cloud. You are a helpful assistant.",
+        system_template="<|im_start|>system\n{system_message}",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep="<|im_end|>\n",
+        sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
+        stop_str=("<|im_end|>", "<|endoftext|>"),
+        image_token="(<image>./</image>)",
+        audio_token="(<audio>./</audio>)",
+    )
+)
+
+# Reference: https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/chat_template.jinja
+register_conv_template(
+    Conversation(
+        name="kimi-vl",
+        system_message="You are a helpful assistant",
+        system_template="<|im_system|>system<|im_middle|>{system_message}",
+        roles=(
+            "<|im_user|>user<|im_middle|>",
+            "<|im_assistant|>assistant<|im_middle|>",
+        ),
+        messages=[],
+        sep="<|im_end|>",
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        stop_str="<|im_end|>",
+        image_token="<|media_start|>image<|media_content|><|media_pad|><|media_end|>",
+    )
+)
+
+register_conv_template(
+    Conversation(
+        name="qwen2-audio",
+        system_template="<|im_start|>system\n{system_message}",
+        system_message="You are a helpful assistant.",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep="<|im_end|>\n",
+        sep_style=SeparatorStyle.QWEN2_AUDIO,
+        stop_str=["<|im_end|>"],
+        audio_token="Audio {idx}: <|audio_bos|><|AUDIO|><|audio_eos|>\n",
+    )
+)
+
+register_conv_template(
+    Conversation(
+        name="points-v15-chat",
+        system_message="",
+        system_template="",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep="<|im_end|>\n",
+        sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
+        stop_str=["<|im_end|>"],
+        image_token="<|vision_start|><|image_pad|><|vision_end|>",
+        video_token="<|vision_start|><|video_pad|><|vision_end|>",
+    )
+)
+
+# Whisper speech-to-text template
+# Whisper uses special tokens: <|startoftranscript|>, <|en|>, <|transcribe|>, etc.
+# Audio features are processed by encoder separately, not inserted into text
+# The decoder start tokens (task, language) should be set via generation config
+register_conv_template(
+    Conversation(
+        name="whisper",
+        system_template="",
+        system_message="",
+        roles=("", ""),
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        sep="",
+        stop_str=["<|endoftext|>"],
+        audio_token="",  # Empty - audio is handled by encoder, not as text token
+    )
+)
+
+MODEL_TYPE_TO_TEMPLATE = {
+    "internvl_chat": "internvl-2-5",
+    "deepseek_vl_v2": "deepseek-vl2",
+    "multi_modality": "janus-pro",
+    "phi4mm": "phi-4-mm",
+    "minicpmv": "minicpmv",
+    "minicpmo": "minicpmo",
+    "deepseek-ocr": "deepseek-ocr",
+    "paddleocr_vl": "paddle-ocr",
+    "whisper": "whisper",
+}
+
+
+@register_conv_template_matching_function
+def match_points_v15_chat(model_path: str):
+    # reference: https://github.com/sgl-project/sglang/issues/12791
+    if re.search(r"\bpoints\b", model_path, re.IGNORECASE):
+        return "points-v15-chat"
+
+
+def get_model_type(model_path: str) -> Optional[str]:
+    config_path = os.path.join(model_path, "config.json")
+    if not os.path.exists(config_path):
+        return None
+    try:
+        with open(config_path, "r", encoding="utf-8") as f:
+            config = json.load(f)
+        return config.get("model_type")
+    except (IOError, json.JSONDecodeError):
+        return None
+
+
+@register_conv_template_matching_function
+def match_internvl(model_path: str):
+    if re.search(r"internvl", model_path, re.IGNORECASE):
+        return "internvl-2-5"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)
+
+
+@register_conv_template_matching_function
+def match_deepseek_janus_pro(model_path: str):
+    if re.search(r"janus", model_path, re.IGNORECASE):
+        return "janus-pro"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)
+
+
+@register_conv_template_matching_function
+def match_vicuna(model_path: str):
+    if re.search(r"vicuna|llava-v1\.5|llava-next-video-7b", model_path, re.IGNORECASE):
+        return "vicuna_v1.1"
+
+
+@register_conv_template_matching_function
+def match_deepseek_vl(model_path: str):
+    if re.search(r"deepseek.*vl2", model_path, re.IGNORECASE):
+        return "deepseek-vl2"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)
+
+
+@register_conv_template_matching_function
+def match_qwen_chat_ml(model_path: str):
+    if re.search(
+        r"llava-v1\.6-34b|llava-v1\.6-yi-34b|llava-next-video-34b|llava-onevision-qwen2",
+        model_path,
+        re.IGNORECASE,
+    ):
+        return "chatml-llava"
+
+
+@register_conv_template_matching_function
+def match_minicpm(model_path: str):
+    match = re.search(r"minicpm-(v|o)", model_path, re.IGNORECASE)
+    if match:
+        return f"minicpm{match.group(1).lower()}"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)
+
+
+@register_conv_template_matching_function
+def match_phi_4_mm(model_path: str):
+    if "phi-4-multimodal" in model_path.lower():
+        return "phi-4-mm"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)
+
+
+@register_conv_template_matching_function
+def match_deepseek_ocr(model_path: str):
+    if "deepseek-ocr" in model_path.lower():
+        return "deepseek-ocr"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)
+
+
+@register_conv_template_matching_function
+def match_paddle_ocr(model_path: str):
+    if "paddleocr" in model_path.lower():
+        return "paddle-ocr"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)
+
+
+@register_conv_template_matching_function
+def match_whisper(model_path: str):
+    if "whisper" in model_path.lower():
+        return "whisper"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)
diff --git a/sglang/python/sglang/srt/parser/harmony_parser.py b/sglang/python/sglang/srt/parser/harmony_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..ffc0be95ec714c3a0e6571860fd0cf8fddc3dea6
--- /dev/null
+++ b/sglang/python/sglang/srt/parser/harmony_parser.py
@@ -0,0 +1,588 @@
+import re
+from dataclasses import dataclass
+from typing import Iterator, List, Optional, Tuple
+
+
+@dataclass
+class Event:
+    """Represents a parsed event from the Harmony stream."""
+
+    event_type: str
+    content: str
+    raw_text: str = None  # Original text including structural markers
+
+
+@dataclass
+class Token:
+    """A structural token in the Harmony format."""
+
+    type: str
+    start: int
+    end: int
+
+
+def prefix_hold(text: str, tokens: List[str]) -> Tuple[str, str]:
+    """
+    Holds back the longest suffix of `text` that could be a prefix of any token.
+    Returns (emit_now, keep_for_later).
+    """
+    if not text:
+        return "", ""
+    max_hold = 0
+    for tok in tokens:
+        if not tok:
+            continue
+        # Check for prefixes of tok in the suffix of text
+        L = min(len(tok) - 1, len(text))
+        for k in range(L, 0, -1):
+            if tok.startswith(text[-k:]):
+                max_hold = max(max_hold, k)
+                break
+    if max_hold == 0:
+        return text, ""
+    return text[:-max_hold], text[-max_hold:]
+
+
+def iter_tokens(text: str, start_pos: int = 0) -> Iterator[Token]:
+    """Iterate over structural tokens in left-to-right order."""
+    TOKENS = {
+        "<|start|>": "START",
+        "<|channel|>": "CHANNEL",
+        "<|message|>": "MESSAGE",
+        "<|constrain|>": "CONSTRAIN",
+        "<|end|>": "END",
+        "<|call|>": "CALL",
+        "<|return|>": "RETURN",
+    }
+
+    pos = start_pos
+    has_unknown_tokens = False
+    while pos < len(text):
+        # Find next "<|"
+        marker_pos = text.find("<|", pos)
+        if marker_pos == -1:
+            break
+
+        # Emit any text before the marker
+        if marker_pos > pos:
+            yield Token("TEXT", pos, marker_pos)
+
+        # Check which token it is
+        found_token = False
+
+        for literal, token_type in TOKENS.items():
+            if text.startswith(literal, marker_pos):
+                yield Token(token_type, marker_pos, marker_pos + len(literal))
+                pos = marker_pos + len(literal)
+                found_token = True
+                break
+        if not found_token:
+            tail = text[marker_pos:]
+            is_partial = any(lit.startswith(tail) for lit in TOKENS)
+            if is_partial:
+                # Hold whole tail (partial token)
+                yield Token("TEXT", marker_pos, len(text))
+                pos = len(text)
+                break
+            else:
+                # Unknown token like <|weird|> ...
+                has_unknown_tokens = True
+                # Emit the "<|" as a TEXT token first
+                yield Token("TEXT", marker_pos, marker_pos + 2)
+
+                # Try to find a closing "|>" for this unknown token
+                close_pos = text.find("|>", marker_pos + 2)
+                if close_pos != -1:
+                    # Look ahead to the next structural token after the unknown close
+                    next_marker = text.find("<|", close_pos + 2)
+                    if next_marker != -1:
+                        # Emit the unknown body + any following plain text up to next marker
+                        yield Token("TEXT", marker_pos + 2, next_marker)
+                        pos = next_marker
+                    else:
+                        # Emit until the end
+                        yield Token("TEXT", marker_pos + 2, len(text))
+                        pos = len(text)
+                        break
+                else:
+                    # No closing; advance past "<|" and continue scanning
+                    pos = marker_pos + 2
+
+    # Emit any remaining text
+    if pos < len(text):
+        yield Token("TEXT", pos, len(text))
+    elif pos == len(text) and has_unknown_tokens:
+        # Add an empty trailing TEXT token only when we encountered unknown tokens
+        # and the text ends with a known structural token. This matches expected tests.
+        for literal in TOKENS.keys():
+            if text.endswith(literal):
+                yield Token("TEXT", pos, pos)
+                break
+
+
+class CanonicalStrategy:
+    """Parses the canonical Harmony format with channel markers."""
+
+    def __init__(self):
+        self.guard_tokens = [
+            "<|start|>",
+            "<|channel|>",
+            "<|message|>",
+            "<|constrain|>",
+            "<|end|>",
+            "<|call|>",
+            "<|return|>",
+        ]
+
+    def parse(self, text: str) -> Tuple[List[Event], str]:
+        events = []
+        tokens = list(iter_tokens(text))
+
+        if not tokens:
+            return events, ""
+
+        pos = 0
+        while pos < len(tokens):
+            token = tokens[pos]
+
+            if token.type == "TEXT":
+                # Check if this might be incomplete
+                if pos == len(tokens) - 1:  # Last token
+                    emit, hold = prefix_hold(
+                        text[token.start : token.end], self.guard_tokens
+                    )
+                    if emit:
+                        events.append(Event("normal", emit))
+                    return events, hold
+                else:
+                    # Check if this might be commentary filler between blocks
+                    if self._is_commentary_filler_between_blocks(text, tokens, pos):
+                        # Skip this filler text - don't emit as normal content
+                        pos += 1
+                    else:
+                        content = text[token.start : token.end]
+                        # Skip standalone structural tokens that shouldn't be emitted as normal text
+                        if not self._is_standalone_structural_token(content):
+                            events.append(Event("normal", content))
+                        pos += 1
+
+            elif token.type in ("START", "CHANNEL"):
+                # Parse a channel block starting here
+                block_result = self._parse_block(text, tokens, pos)
+                if block_result is None:
+                    # Incomplete block - check if we can emit partial reasoning content
+                    partial_result = self._parse_partial_analysis(text, tokens, pos)
+                    if partial_result:
+                        event, remaining_text = partial_result
+                        events.append(event)
+                        return events, remaining_text
+                    # No partial content, hold entire remaining text
+                    remaining_start = tokens[pos].start
+                    return events, text[remaining_start:]
+                event, new_pos = block_result
+                if event:
+                    events.append(event)
+                pos = new_pos
+
+            else:
+                # Check if this might be commentary filler between blocks
+                if self._is_commentary_filler_between_blocks(text, tokens, pos):
+                    # Skip this filler text - don't emit as normal content
+                    pos += 1
+                else:
+                    # Unexpected token - only emit as text if it's not a standalone structural token
+                    content = text[token.start : token.end]
+                    if not self._is_standalone_structural_token(content):
+                        events.append(Event("normal", content))
+                    pos += 1
+
+        return events, ""
+
+    def _parse_partial_analysis(
+        self, text: str, tokens: List[Token], start_pos: int
+    ) -> Optional[Tuple[Event, str]]:
+        """Try to parse partial analysis content for incremental streaming."""
+        pos = start_pos
+
+        # Skip <|start|> if present
+        if pos < len(tokens) and tokens[pos].type == "START":
+            pos += 1
+
+        # Look for <|channel|> followed by analysis
+        channel_pos = None
+        message_pos = None
+
+        for i in range(pos, len(tokens)):
+            if tokens[i].type == "CHANNEL" and channel_pos is None:
+                channel_pos = i
+            elif tokens[i].type == "MESSAGE":
+                message_pos = i
+                break
+
+        if channel_pos is None or message_pos is None:
+            return None
+
+        # Extract channel type
+        channel_start = (
+            tokens[channel_pos + 1].start
+            if channel_pos + 1 < len(tokens)
+            else tokens[channel_pos].end
+        )
+        channel_end = tokens[message_pos].start
+        channel_header = text[channel_start:channel_end]
+
+        channel_type = self._extract_channel_type(channel_header)
+        if channel_type != "analysis":
+            return None  # Only stream analysis content - tool calls wait for completion
+
+        # Extract partial content after <|message|>
+        content_start = tokens[message_pos].end
+        content = text[content_start:]
+
+        # Return partial reasoning content and preserve the channel structure for next parse
+        remaining_text = text[tokens[start_pos].start : content_start]
+        return Event("reasoning", content), remaining_text
+
+    def _extract_channel_type(self, header_text: str) -> Optional[str]:
+        """Extract channel type from header, ignoring other attributes like to=... or <|constrain|>..."""
+        # Look for channel type at the start of the header (case insensitive)
+        header_clean = header_text.strip()
+
+        if header_clean.lower().startswith("analysis"):
+            return "analysis"
+        elif header_clean.lower().startswith("commentary"):
+            return "commentary"
+        elif header_clean.lower().startswith("final"):
+            return "final"
+        else:
+            return None  # Unknown channel type
+
+    def _parse_block(
+        self, text: str, tokens: List[Token], start_pos: int
+    ) -> Optional[Tuple[Optional[Event], int]]:
+        """Parse a channel block. Returns (event, next_pos) or None if incomplete."""
+        pos = start_pos
+
+        # Skip <|start|> if present
+        if pos < len(tokens) and tokens[pos].type == "START":
+            pos += 1
+
+        # Look for <|channel|> or <|message|> (tool responses go direct to message)
+        channel_pos = None
+        message_pos = None
+
+        for i in range(pos, len(tokens)):
+            if tokens[i].type == "CHANNEL" and channel_pos is None:
+                channel_pos = i
+            elif tokens[i].type == "MESSAGE":
+                message_pos = i
+                break
+
+        if message_pos is None:
+            return None  # No message token found
+
+        # If no channel found, this is a tool response - treat as normal text
+        if channel_pos is None:
+            content_start = tokens[message_pos].end
+            # Find end token after message
+            end_token_pos = None
+            for i in range(message_pos + 1, len(tokens)):
+                if tokens[i].type in ("END", "CALL", "RETURN"):
+                    end_token_pos = i
+                    break
+            if end_token_pos is None:
+                return None  # Incomplete
+            content = text[content_start : tokens[end_token_pos].start]
+            return Event("normal", content), end_token_pos + 1
+
+        # Standard channel block processing - message_pos is already found above
+        pos = channel_pos + 1  # Skip CHANNEL token
+
+        # Extract channel type from header (ignoring other attributes like to=... or <|constrain|>...)
+        channel_start = tokens[pos].start if pos < len(tokens) else tokens[pos - 1].end
+        channel_end = tokens[message_pos].start
+        channel_header = text[channel_start:channel_end]
+
+        channel_type = self._extract_channel_type(channel_header)
+        if not channel_type:
+            return None  # Unknown or malformed channel
+
+        pos = message_pos + 1  # Skip MESSAGE token
+
+        # Find content and end token
+        content_start = tokens[message_pos].end
+        end_pos = pos
+
+        # Each channel type has specific valid end tokens
+        if channel_type == "final":
+            while end_pos < len(tokens) and tokens[end_pos].type != "RETURN":
+                end_pos += 1
+        elif channel_type == "analysis":
+            while end_pos < len(tokens) and tokens[end_pos].type not in ("END", "CALL"):
+                end_pos += 1
+        else:  # commentary
+            while end_pos < len(tokens) and tokens[end_pos].type not in ("END", "CALL"):
+                end_pos += 1
+
+        if end_pos >= len(tokens):
+            # No end token found
+            if channel_type == "final":
+                # Final blocks can end at end of input without requiring <|return|>
+                content = text[content_start:]
+                return Event("normal", content), end_pos
+            return None  # Analysis and commentary need proper end tokens
+
+        end_token = tokens[end_pos]
+        content = text[content_start : end_token.start]
+
+        # Create event based on channel and end token
+        if channel_type == "analysis":
+            if end_token.type == "CALL":
+                # Built-in tools (browser, python) use analysis channel with <|call|>
+                raw_text = text[tokens[start_pos].start : end_token.end]
+                return Event("tool_call", content.strip(), raw_text), end_pos + 1
+            else:
+                return Event("reasoning", content), end_pos + 1
+        elif channel_type == "commentary":
+            if end_token.type == "CALL":
+                raw_text = text[tokens[start_pos].start : end_token.end]
+                return Event("tool_call", content.strip(), raw_text), end_pos + 1
+            else:
+                return Event("normal", content), end_pos + 1
+        elif channel_type == "final":
+            # For final blocks, include any trailing TEXT immediately after <|return|>
+            final_content = content
+            if end_token.type == "RETURN" and end_pos + 1 < len(tokens):
+                next_token = tokens[end_pos + 1]
+                if next_token.type == "TEXT":
+                    final_content += text[next_token.start : next_token.end]
+                    return Event("normal", final_content), end_pos + 2
+            return Event("normal", final_content), end_pos + 1
+
+        return None, end_pos + 1
+
+    def _is_commentary_filler_between_blocks(
+        self, text: str, tokens: List[Token], pos: int
+    ) -> bool:
+        """Check if this is commentary filler text or problematic structural tokens in malformed sequences."""
+        current_token = tokens[pos]
+        current_text = text[current_token.start : current_token.end].strip()
+
+        # Check for commentary filler between CALL and CHANNEL
+        if pos > 0 and pos + 1 < len(tokens):
+            prev_token = tokens[pos - 1]
+            next_token = tokens[pos + 1]
+
+            # Check if we have CALL -> TEXT("commentary") -> CHANNEL pattern
+            if (
+                prev_token.type == "CALL"
+                and next_token.type == "CHANNEL"
+                and current_text.lower() == "commentary"
+            ):
+                return True
+
+        # Check for problematic patterns after CALL tokens (malformed sequences)
+        if pos > 0:
+            prev_token = tokens[pos - 1]
+
+            # Only filter structural tokens that appear immediately after CALL in malformed sequences
+            # These patterns indicate the content is malformed and the structural tokens are noise
+            if prev_token.type == "CALL":
+                # Filter MESSAGE tokens after CALL (should not happen in well-formed content)
+                if current_token.type == "MESSAGE":
+                    return True
+
+                # Filter standalone "commentary" text after CALL
+                if (
+                    current_token.type == "TEXT"
+                    and current_text.lower() == "commentary"
+                ):
+                    return True
+
+        return False
+
+    def _is_standalone_structural_token(self, content: str) -> bool:
+        """Check if content is just a standalone structural token that should be filtered."""
+        content_stripped = content.strip()
+        structural_tokens = [
+            "<|start|>",
+            "<|channel|>",
+            "<|message|>",
+            "<|constrain|>",
+            "<|end|>",
+            "<|call|>",
+            "<|return|>",
+        ]
+        return content_stripped in structural_tokens
+
+
+class TextStrategy:
+    """Parses the text-based Harmony fallback format."""
+
+    def __init__(self):
+        self.buffer_context = ""
+        self.patterns = {
+            "analysis_then_final": re.compile(
+                r"^\s*(?:assistant)?\s*(analysis|commentary)(.*?)\s*assistantfinal\s*(.*)\s*$",
+                re.IGNORECASE | re.DOTALL,
+            ),
+            "final_only": re.compile(
+                r"^\s*assistantfinal\s*(.*)\s*$", re.IGNORECASE | re.DOTALL
+            ),
+            "analysis_only": re.compile(
+                r"^\s*(?:assistant)?\s*(analysis|commentary)(.*)\s*$",
+                re.IGNORECASE | re.DOTALL,
+            ),
+        }
+
+    def set_buffer_context(self, buffer: str):
+        self.buffer_context = buffer
+
+    def parse(self, text: str) -> Tuple[List[Event], str]:
+        events = []
+
+        m = self.patterns["analysis_then_final"].match(text)
+        if m:
+            channel, reasoning, final = m.groups()
+            if channel.lower() == "analysis" and reasoning.strip():
+                events.append(Event("reasoning", reasoning.strip()))
+            elif channel.lower() == "commentary" and reasoning.strip():
+                events.append(Event("normal", reasoning.strip()))
+            if final.strip():
+                events.append(Event("normal", final.strip()))
+            return events, ""
+
+        # If assistantfinal appears to be incomplete (e.g., 'assistantfin'), hold entire buffer
+        if re.search(
+            r"(?:^|\s)(?:assistant)?\s*(analysis|commentary)", text, re.IGNORECASE
+        ):
+            low = text.lower()
+            if "assistantfin" in low and "assistantfinal" not in low:
+                return events, text
+
+        m = self.patterns["final_only"].match(text)
+        if m:
+            final = m.group(1)
+            if final.strip():
+                events.append(Event("normal", final.strip()))
+            return events, ""
+
+        m = self.patterns["analysis_only"].match(text)
+        if m:
+            channel, content = m.groups()
+            emit, hold = prefix_hold(content, ["assistantfinal"])
+            if channel.lower() == "analysis" and emit:
+                # Stream reasoning content as-is based on structural markers only.
+                events.append(Event("reasoning", emit))
+                # Keep the channel header in the remaining buffer to continue parsing
+                # subsequent chunks in the text fallback format. Preserve any held
+                # prefix that may complete into "assistantfinal".
+                if hold:
+                    return events, text[: m.start(2)] + hold
+                else:
+                    return events, channel
+            elif channel.lower() == "commentary" and emit:
+                # For commentary, stream as normal text. Preserve spaces unless holding.
+                content_out = emit if hold else emit.strip()
+                events.append(Event("normal", content_out))
+                if hold:
+                    return events, text[: m.start(2)] + hold
+                else:
+                    return events, ""
+            # If no emit, just return the held content
+            return events, text[: m.start(2)] + hold
+
+        emit, hold = prefix_hold(text, ["analysis", "commentary", "assistantfinal"])
+        if emit:
+            events.append(Event("normal", emit))
+        return events, hold
+
+
+class HarmonyParser:
+    """Facade for parsing Harmony format, switching between strategies."""
+
+    def __init__(self):
+        self.strategy = None
+        self._buffer = ""
+        self._should_filter_commentary = (
+            False  # Track if we should filter commentary in next chunks
+        )
+        self._partial_commentary = (
+            ""  # Track partial commentary being built across chunks
+        )
+
+    def parse(self, chunk: str) -> List[Event]:
+        self._buffer += chunk
+
+        if self.strategy is None:
+            if "<|channel|>" in self._buffer or "<|start|>" in self._buffer:
+                self.strategy = CanonicalStrategy()
+            elif re.search(
+                r"(?:^|\s)(?:assistant)?\s*(analysis|commentary|assistantfinal)",
+                self._buffer,
+                re.IGNORECASE,
+            ):
+                self.strategy = TextStrategy()
+            else:
+                # Not yet determined, hold
+                return []
+
+        if hasattr(self.strategy, "set_buffer_context"):
+            # Provide full buffer context to strategy for smarter whitespace handling
+            self.strategy.set_buffer_context(self._buffer)
+
+        events, remaining = self.strategy.parse(self._buffer)
+
+        # Check if we should start filtering commentary (after <|call|> token or tool_call event)
+        buffer_has_call_token = self._buffer.rstrip().endswith("<|call|>")
+
+        self._buffer = remaining
+
+        # Filter events for streaming case
+        filtered_events = []
+        for event in events:
+            should_filter = False
+
+            if event.event_type == "normal":
+                # Check if we're in a commentary filtering state
+                if self._should_filter_commentary or self._partial_commentary:
+                    # Try to build partial commentary
+                    potential_commentary = (
+                        self._partial_commentary + event.content.strip().lower()
+                    )
+
+                    if potential_commentary == "commentary":
+                        # Complete commentary found - filter it
+                        should_filter = True
+                        self._partial_commentary = ""  # Reset
+                        self._should_filter_commentary = False  # Done filtering
+                    elif "commentary".startswith(potential_commentary):
+                        # Partial match - accumulate and filter this chunk
+                        should_filter = True
+                        self._partial_commentary = potential_commentary
+                    else:
+                        # Not commentary - reset and keep the event
+                        self._partial_commentary = ""
+                        self._should_filter_commentary = False
+                else:
+                    # Not in commentary filtering state - reset partial state
+                    self._partial_commentary = ""
+
+            if should_filter:
+                # Skip this commentary filler
+                continue
+
+            # Update filtering state based on events and buffer state
+            if event.event_type == "tool_call":
+                self._should_filter_commentary = (
+                    True  # Filter commentary after tool calls
+                )
+                self._partial_commentary = ""  # Reset on tool call
+            elif buffer_has_call_token:
+                self._should_filter_commentary = (
+                    True  # Filter commentary after <|call|> token
+                )
+
+            filtered_events.append(event)
+
+        return filtered_events
diff --git a/sglang/python/sglang/srt/parser/jinja_template_utils.py b/sglang/python/sglang/srt/parser/jinja_template_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..37434713de570798404757f39f4cf639926b2c66
--- /dev/null
+++ b/sglang/python/sglang/srt/parser/jinja_template_utils.py
@@ -0,0 +1,230 @@
+"""Template utilities for Jinja template processing.
+
+This module provides utilities for analyzing and processing Jinja chat templates,
+including content format detection and message processing.
+"""
+
+import logging
+
+import jinja2
+import transformers.utils.chat_template_utils as hf_chat_utils
+
+from sglang.srt.utils import ImageData
+
+logger = logging.getLogger(__name__)
+
+# ============================================================================
+# JINJA TEMPLATE CONTENT FORMAT DETECTION
+# ============================================================================
+#
+# This adapts vLLM's approach for detecting chat template content format:
+# https://github.com/vllm-project/vllm/blob/02f0c7b220422792f5e53de2a7d51d2d3ff2df28/vllm/entrypoints/chat_utils.py#L296-L313
+# - Analyzes Jinja template AST to detect content iteration patterns
+# - 'openai' format: templates with {%- for content in message['content'] -%} loops
+# - 'string' format: templates that expect simple string content
+# - Processes content accordingly to match template expectations
+
+
+def _is_var_access(node: jinja2.nodes.Node, varname: str) -> bool:
+    """Check if node is a variable access like {{ varname }}"""
+    if isinstance(node, jinja2.nodes.Name):
+        return node.ctx == "load" and node.name == varname
+    return False
+
+
+def _is_attr_access(node: jinja2.nodes.Node, varname: str, key: str) -> bool:
+    """Check if node is an attribute access like {{ varname['key'] }} or {{ varname.key }}"""
+    if isinstance(node, jinja2.nodes.Getitem):
+        return (
+            _is_var_access(node.node, varname)
+            and isinstance(node.arg, jinja2.nodes.Const)
+            and node.arg.value == key
+        )
+
+    if isinstance(node, jinja2.nodes.Getattr):
+        return _is_var_access(node.node, varname) and node.attr == key
+
+    return False
+
+
+def _is_var_or_elems_access(
+    node: jinja2.nodes.Node,
+    varname: str,
+    key: str = None,
+) -> bool:
+    """Check if node accesses varname or varname[key] with filters/tests"""
+    if isinstance(node, jinja2.nodes.Filter):
+        return node.node is not None and _is_var_or_elems_access(
+            node.node, varname, key
+        )
+    if isinstance(node, jinja2.nodes.Test):
+        return _is_var_or_elems_access(node.node, varname, key)
+
+    if isinstance(node, jinja2.nodes.Getitem) and isinstance(
+        node.arg, jinja2.nodes.Slice
+    ):
+        return _is_var_or_elems_access(node.node, varname, key)
+
+    return _is_attr_access(node, varname, key) if key else _is_var_access(node, varname)
+
+
+def _try_extract_ast(chat_template: str):
+    """Try to parse the Jinja template into an AST"""
+    try:
+        jinja_compiled = hf_chat_utils._compile_jinja_template(chat_template)
+        return jinja_compiled.environment.parse(chat_template)
+    except Exception as e:
+        logger.debug(f"Error when compiling Jinja template: {e}")
+        return None
+
+
+def detect_jinja_template_content_format(chat_template: str) -> str:
+    """
+    Detect whether a chat template expects 'string' or 'openai' content format.
+
+    - 'string': content is a simple string (like DeepSeek templates)
+    - 'openai': content is a list of structured dicts (like Llama4 templates)
+
+    Detection logic:
+    - If template has loops like {%- for content in message['content'] -%} → 'openai'
+    - Otherwise → 'string'
+    """
+    # Shortcut for multimodal templates
+    if any(
+        keyword in chat_template for keyword in ["image", "audio", "video", "vision"]
+    ):
+        return "openai"
+
+    jinja_ast = _try_extract_ast(chat_template)
+    if jinja_ast is None:
+        return "string"
+
+    try:
+        # Look for patterns like: {%- for content in message['content'] -%}
+        for loop_ast in jinja_ast.find_all(jinja2.nodes.For):
+            loop_iter = loop_ast.iter
+
+            # Check if iterating over message['content'] or similar
+            if _is_var_or_elems_access(loop_iter, "message", "content"):
+                return "openai"  # Found content iteration → openai format
+
+            # Also check for patterns like: {%- for item in msg.content -%} or {%- for item in m.content -%}
+            if _is_var_or_elems_access(
+                loop_iter, "msg", "content"
+            ) or _is_var_or_elems_access(loop_iter, "m", "content"):
+                return "openai"  # Found content iteration → openai format (glm4v)
+
+        return "string"  # No content loops found → string format
+    except Exception as e:
+        logger.debug(f"Error when parsing AST of Jinja template: {e}")
+        return "string"
+
+
+def process_content_for_template_format(
+    msg_dict: dict,
+    content_format: str,
+    image_data: list,
+    video_data: list,
+    audio_data: list,
+    modalities: list,
+    use_dpsk_v32_encoding: bool = False,
+) -> dict:
+    """
+    Process message content based on detected template format.
+
+    Args:
+        msg_dict: Message dictionary with content
+        content_format: 'string' or 'openai' (detected via AST analysis)
+        image_data: List to append extracted image URLs
+        video_data: List to append extracted video URLs
+        audio_data: List to append extracted audio URLs
+        modalities: List to append modalities
+        use_dpsk_v32_encoding: If True, extract multimodal data and convert content to string (for DeepSeek-V3.2 encoding)
+
+    Returns:
+        Processed message dictionary
+    """
+    if not isinstance(msg_dict.get("content"), list):
+        # Already a string or None, no processing needed
+        return {k: v for k, v in msg_dict.items() if v is not None}
+
+    if content_format == "openai" or use_dpsk_v32_encoding:
+        # OpenAI format: preserve structured content list, normalize types
+        # V32 encoding: extract multimodal data but convert content to string
+        processed_content_parts = []
+        text_parts = []
+        for chunk in msg_dict["content"]:
+            if isinstance(chunk, dict):
+                chunk_type = chunk.get("type")
+
+                if chunk_type == "image_url":
+                    image_obj = chunk.get("image_url") or {}
+                    mdp = image_obj.get("max_dynamic_patch", None)
+                    # Also allow flat style: chunk["max_dynamic_patch"]
+                    image_data.append(
+                        ImageData(
+                            url=image_obj["url"],
+                            detail=image_obj.get("detail", "auto"),
+                            max_dynamic_patch=mdp,
+                        )
+                    )
+
+                    if chunk.get("modalities"):
+                        modalities.append(chunk.get("modalities"))
+                    # Normalize to simple 'image' type for template compatibility
+                    processed_content_parts.append({"type": "image"})
+                elif chunk_type == "video_url":
+                    video_obj = chunk.get("video_url") or {}
+                    mdp = video_obj.get("max_dynamic_patch", None)
+                    if mdp is None:
+                        video_data.append(chunk["video_url"]["url"])
+                    else:
+                        # Keep structured info for backend, but template only sees {"type":"video"}
+                        video_data.append(
+                            {
+                                "url": video_obj["url"],
+                                "max_dynamic_patch": mdp,
+                            }
+                        )
+                    if chunk.get("modalities"):
+                        modalities.append(chunk.get("modalities"))
+                    # Normalize to simple 'video' type for template compatibility
+                    processed_content_parts.append({"type": "video"})
+                elif chunk_type == "audio_url":
+                    audio_data.append(chunk["audio_url"]["url"])
+                    # Normalize to simple 'audio' type
+                    processed_content_parts.append({"type": "audio"})
+                elif chunk_type == "text":
+                    # For v32 encoding, collect text parts separately
+                    if use_dpsk_v32_encoding:
+                        text_parts.append(chunk["text"])
+                    else:
+                        # Keep text content as-is for openai format
+                        processed_content_parts.append(chunk)
+
+        new_msg = {
+            k: v for k, v in msg_dict.items() if v is not None and k != "content"
+        }
+        if use_dpsk_v32_encoding:
+            new_msg["content"] = " ".join(text_parts) if text_parts else ""
+        else:
+            new_msg["content"] = processed_content_parts
+        return new_msg
+
+    elif content_format == "string":
+        # String format: flatten to text only (for templates like DeepSeek)
+        text_parts = []
+        for chunk in msg_dict["content"]:
+            if isinstance(chunk, dict) and chunk.get("type") == "text":
+                text_parts.append(chunk["text"])
+            # Note: For string format, we ignore images/audio since the template
+            # doesn't expect structured content - multimodal placeholders would
+            # need to be inserted differently
+
+        new_msg = msg_dict.copy()
+        new_msg["content"] = " ".join(text_parts) if text_parts else ""
+        new_msg = {k: v for k, v in new_msg.items() if v is not None}
+        return new_msg
+
+    else:
+        raise ValueError(f"Invalid content format: {content_format}")
diff --git a/sglang/python/sglang/srt/parser/reasoning_parser.py b/sglang/python/sglang/srt/parser/reasoning_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..7eef5c0fbf32fe165507d652ca5364964bf57de3
--- /dev/null
+++ b/sglang/python/sglang/srt/parser/reasoning_parser.py
@@ -0,0 +1,517 @@
+from typing import Dict, Optional, Tuple, Type
+
+from sglang.srt.entrypoints.openai.protocol import ChatCompletionRequest
+from sglang.srt.parser.harmony_parser import HarmonyParser
+
+
+class StreamingParseResult:
+    """Result of streaming incremental parsing."""
+
+    def __init__(
+        self,
+        normal_text: Optional[str] = None,
+        reasoning_text: Optional[str] = None,
+    ):
+        self.normal_text = normal_text or ""
+        self.reasoning_text = reasoning_text or ""
+
+
+class BaseReasoningFormatDetector:
+    """Base class providing two sets of interfaces: one-time and streaming incremental."""
+
+    def __init__(
+        self,
+        think_start_token: str,
+        think_end_token: str,
+        force_reasoning: bool = False,
+        stream_reasoning: bool = True,
+        tool_start_token: Optional[str] = None,
+        continue_final_message: bool = False,
+        previous_content: str = "",
+    ):
+        self.think_start_token = think_start_token
+        self.think_end_token = think_end_token
+        self.tool_start_token = tool_start_token
+        self._in_reasoning = force_reasoning
+        self.stream_reasoning = stream_reasoning
+
+        self._buffer = ""
+        self.stripped_think_start = False
+
+        self.continue_final_message = continue_final_message
+        if self.continue_final_message:
+            self.previous_content = previous_content
+            self.previous_count = len(previous_content)
+        else:
+            self.previous_content = ""
+            self.previous_count = 0
+
+        if self.think_start_token in self.previous_content:
+            self._in_reasoning = True
+        if self.think_end_token in self.previous_content:
+            self._in_reasoning = False
+
+    def detect_and_parse(self, text: str) -> StreamingParseResult:
+        """
+        One-time parsing: Detects and parses reasoning sections in the provided text.
+        Returns both reasoning content and normal text separately.
+        """
+        in_reasoning = self._in_reasoning or self.think_start_token in text
+
+        if not in_reasoning:
+            return StreamingParseResult(normal_text=text)
+
+        # The text is considered to be in a reasoning block.
+        processed_text = text.replace(self.think_start_token, "").strip()
+
+        if (
+            self.think_end_token not in processed_text
+            and self.think_end_token not in self.previous_content
+        ):
+            # Check for tool_start_token interruption
+            if (
+                in_reasoning
+                and self.tool_start_token is not None
+                and self.tool_start_token in processed_text
+            ):
+                # Find the first occurrence of tool_start_token and split there
+                tool_idx = processed_text.find(self.tool_start_token)
+                reasoning_text = processed_text[:tool_idx].strip()
+                # Preserve tool_start_token in normal text
+                normal_text = processed_text[tool_idx:]
+                return StreamingParseResult(
+                    normal_text=normal_text, reasoning_text=reasoning_text
+                )
+            # Assume reasoning was truncated before end token
+            return StreamingParseResult(reasoning_text=processed_text)
+
+        # Extract reasoning content
+        if self.think_end_token in processed_text:
+            splits = processed_text.split(self.think_end_token, maxsplit=1)
+            reasoning_text = splits[0]
+            normal_text = splits[1].strip()
+
+            return StreamingParseResult(
+                normal_text=normal_text, reasoning_text=reasoning_text
+            )
+        else:
+            # think_end_token is in self.previous_content for continue_final_message=True case
+            return StreamingParseResult(normal_text=processed_text)
+
+    def parse_streaming_increment(self, new_text: str) -> StreamingParseResult:
+        """
+        Streaming incremental parsing for reasoning content.
+        Handles partial reasoning tags and content.
+
+        If stream_reasoning is False:
+            Accumulates reasoning content until the end tag is found
+        If stream_reasoning is True:
+            Streams reasoning content as it arrives
+        """
+        self._buffer += new_text
+        current_text = self._buffer
+
+        # If the current text is a prefix of the think token, keep buffering
+        tokens_to_check = [self.think_start_token, self.think_end_token]
+        if self.tool_start_token:
+            tokens_to_check.append(self.tool_start_token)
+        if any(
+            token.startswith(current_text) and token != current_text
+            for token in tokens_to_check
+        ):
+            return StreamingParseResult()
+
+        # Strip `<think>` token if present
+        if not self.stripped_think_start and self.think_start_token in current_text:
+            current_text = current_text.replace(self.think_start_token, "")
+            self.stripped_think_start = True
+            self._in_reasoning = True
+
+        # Handle end of reasoning block
+        if self._in_reasoning and self.think_end_token in current_text:
+            end_idx = current_text.find(self.think_end_token)
+
+            reasoning_text = current_text[:end_idx]
+
+            self._buffer = ""
+            self._in_reasoning = False
+            normal_text = current_text[end_idx + len(self.think_end_token) :]
+
+            return StreamingParseResult(
+                normal_text=normal_text, reasoning_text=reasoning_text.rstrip()
+            )
+
+        # Continue with reasoning content
+        if self._in_reasoning:
+            # Check for tool_start_token interruption
+            if self.tool_start_token and self.tool_start_token in current_text:
+                tool_idx = current_text.find(self.tool_start_token)
+                reasoning_text = current_text[:tool_idx]
+                # Preserve tool_start_token in normal text
+                normal_text = current_text[tool_idx:]
+                self._buffer = ""
+                self._in_reasoning = False
+                return StreamingParseResult(
+                    normal_text=normal_text, reasoning_text=reasoning_text
+                )
+            if self.stream_reasoning:
+                # Stream the content immediately
+                self._buffer = ""
+                return StreamingParseResult(reasoning_text=current_text)
+            else:
+                return StreamingParseResult()
+
+        # If we're not in a reasoning block return as normal text
+        if not self._in_reasoning:
+            self._buffer = ""
+            return StreamingParseResult(normal_text=current_text)
+
+        return StreamingParseResult()
+
+
+class DeepSeekR1Detector(BaseReasoningFormatDetector):
+    """
+    Detector for DeepSeek-R1 model.
+    Assumes reasoning format:
+      (<think>)*(.*)</think>
+    Returns all the text before the </think> tag as `reasoning_text`
+    and the rest of the text as `normal_text`.
+
+    Supported models:
+      - DeepSeek-R1: Always generates thinking content without <think> start tag
+      - DeepSeek-R1-0528: Generates thinking content with <think> start tag
+
+    Format patterns:
+      - DeepSeek-R1: "I need to think about this...</think>The answer is 42."
+      - DeepSeek-R1-0528: "<think>I need to think about this...</think>The answer is 42."
+
+    Args:
+        stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
+            If True, streams reasoning content as it arrives.
+    """
+
+    def __init__(
+        self,
+        stream_reasoning: bool = True,
+        force_reasoning: bool = True,
+        continue_final_message: bool = False,
+        previous_content: str = "",
+    ):
+        # DeepSeek-R1 is assumed to be reasoning until `</think>` token
+        super().__init__(
+            "<think>",
+            "</think>",
+            force_reasoning=True,
+            stream_reasoning=stream_reasoning,
+            continue_final_message=continue_final_message,
+            previous_content=previous_content,
+        )
+        # https://github.com/sgl-project/sglang/pull/3202#discussion_r1950153599
+
+
+class Qwen3Detector(BaseReasoningFormatDetector):
+    """
+    Detector for Qwen3 models (e.g., Qwen/Qwen3-235B-A22B).
+    Assumes reasoning format:
+      (<think>)*(.*)</think>
+
+    Qwen3 models released before 07/2025 supports switching between thinking mode and normal
+    mode using `enable_thinking` parameter in the request parameter.
+      - enable_thinking=True: "<think>reasoning content</think>The answer is 42."
+      - enable_thinking=False: "The answer is 42." (no thinking tokens)
+
+    Args:
+        stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
+            If True, streams reasoning content as it arrives.
+    """
+
+    def __init__(
+        self,
+        stream_reasoning: bool = True,
+        force_reasoning: bool = False,
+        continue_final_message: bool = False,
+        previous_content: str = "",
+    ):
+        super().__init__(
+            "<think>",
+            "</think>",
+            force_reasoning=force_reasoning,
+            stream_reasoning=stream_reasoning,
+            continue_final_message=continue_final_message,
+            previous_content=previous_content,
+        )
+
+
+class KimiDetector(BaseReasoningFormatDetector):
+    """
+    Detector for Kimi Thinking model.
+    Assumes reasoning format:
+      ◁think▷*(.*)◁/think▷
+    Returns all the text before the ◁/think▷ tag as `reasoning_text`
+    and the rest of the text as `normal_text`.
+    """
+
+    def __init__(
+        self,
+        stream_reasoning: bool = True,
+        force_reasoning: bool = False,
+        continue_final_message: bool = False,
+        previous_content: str = "",
+    ):
+        super().__init__(
+            "◁think▷",
+            "◁/think▷",
+            force_reasoning=False,
+            stream_reasoning=stream_reasoning,
+            continue_final_message=continue_final_message,
+            previous_content=previous_content,
+        )
+
+
+class KimiK2Detector(BaseReasoningFormatDetector):
+    """
+    Detector for Kimi K2 models.
+    Assumes reasoning format:
+      (<think>)*(.*)</think>
+
+    Kimi K2 can switch from reasoning to tool-call section with
+    `<|tool_calls_section_begin|>` before emitting `</think>`.
+    """
+
+    def __init__(
+        self,
+        stream_reasoning: bool = True,
+        force_reasoning: bool = False,
+        continue_final_message: bool = False,
+        previous_content: str = "",
+    ):
+        super().__init__(
+            "<think>",
+            "</think>",
+            force_reasoning=force_reasoning,
+            stream_reasoning=stream_reasoning,
+            tool_start_token="<|tool_calls_section_begin|>",
+            continue_final_message=continue_final_message,
+            previous_content=previous_content,
+        )
+
+
+class Glm45Detector(BaseReasoningFormatDetector):
+    """
+    Detector for GLM-4.5 models.
+    Assumes reasoning format:
+      (<think>)*(.*)</think>
+
+    GLM-4.5 uses `<tool_call>` as the tool start token to switch from reasoning mode to normal mode.
+
+    Args:
+        stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
+            If True, streams reasoning content as it arrives.
+    """
+
+    def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = False):
+        super().__init__(
+            "<think>",
+            "</think>",
+            force_reasoning=force_reasoning,
+            stream_reasoning=stream_reasoning,
+            tool_start_token="<tool_call>",
+        )
+
+
+class GptOssDetector(BaseReasoningFormatDetector):
+    """
+    Detector for T4-style reasoning format (GPT-OSS), using the HarmonyParser.
+    """
+
+    def __init__(
+        self,
+        stream_reasoning: bool = True,
+        force_reasoning: bool = True,
+        continue_final_message: bool = False,
+        previous_content: str = "",
+    ):
+        super().__init__(
+            "<|channel|>analysis<|message|>",
+            "<|end|>",
+            force_reasoning=force_reasoning,
+            stream_reasoning=stream_reasoning,
+            continue_final_message=continue_final_message,
+            previous_content=previous_content,
+        )
+        self.parser = HarmonyParser()
+
+    def detect_and_parse(self, text: str) -> StreamingParseResult:
+        events = self.parser.parse(text)
+        # Flush the buffer for one-shot parsing
+        events += self.parser.parse("")
+
+        reasoning_text = "".join(
+            [e.content for e in events if e.event_type == "reasoning"]
+        )
+        normal_parts = []
+        for e in events:
+            if e.event_type == "normal":
+                normal_parts.append(e.content)
+            elif e.event_type == "tool_call":
+                # Use raw_text to preserve structural markers for function call detector
+                normal_parts.append(e.raw_text if e.raw_text else e.content)
+        normal_text = "".join(normal_parts)
+        # Tool call events preserve raw text with structural markers
+
+        return StreamingParseResult(
+            normal_text=normal_text,
+            reasoning_text=reasoning_text,
+        )
+
+    def parse_streaming_increment(self, new_text: str) -> StreamingParseResult:
+        events = self.parser.parse(new_text)
+
+        reasoning_text = "".join(
+            [e.content for e in events if e.event_type == "reasoning"]
+        )
+        normal_parts = []
+        for e in events:
+            if e.event_type == "normal":
+                normal_parts.append(e.content)
+            elif e.event_type == "tool_call":
+                # Use raw_text to preserve structural markers for function call detector
+                normal_parts.append(e.raw_text if e.raw_text else e.content)
+        normal_text = "".join(normal_parts)
+
+        return StreamingParseResult(
+            normal_text=normal_text,
+            reasoning_text=reasoning_text,
+        )
+
+
+class MiniMaxAppendThinkDetector(BaseReasoningFormatDetector):
+    """
+    Append `<think>` token to the beginning of the text.
+    """
+
+    def __init__(
+        self,
+        stream_reasoning: bool = True,
+        force_reasoning: bool = False,
+        continue_final_message: bool = False,
+        previous_content: str = "",
+    ):
+        # scheduler.py need `reasoning_parser.detector.think_end_token`
+        super().__init__(
+            "<think>",
+            "</think>",
+            force_reasoning=force_reasoning,
+            stream_reasoning=stream_reasoning,
+            continue_final_message=continue_final_message,
+            previous_content=previous_content,
+        )
+        self.is_first_chunk = False
+
+    def parse_streaming_increment(self, new_text: str) -> StreamingParseResult:
+        if not self.is_first_chunk:
+            self.is_first_chunk = True
+            new_text = self.think_start_token + new_text
+        return StreamingParseResult(normal_text=new_text)
+
+    def detect_and_parse(self, text: str) -> StreamingParseResult:
+        return StreamingParseResult(normal_text=self.think_start_token + text)
+
+
+class Nemotron3Detector(BaseReasoningFormatDetector):
+    """
+    Detector for Nemotron3 model.
+    Uses the same reasoning format as DeepSeek-R1: (<think>)*(.*)</think>
+
+    """
+
+    def __init__(
+        self,
+        stream_reasoning: bool = True,
+        force_reasoning: bool = False,
+        continue_final_message: bool = False,
+        previous_content: str = "",
+    ):
+        super().__init__(
+            "<think>",
+            "</think>",
+            force_reasoning=force_reasoning,
+            stream_reasoning=stream_reasoning,
+            continue_final_message=continue_final_message,
+            previous_content=previous_content,
+        )
+
+
+class ReasoningParser:
+    """
+    Parser that handles both streaming and non-streaming scenarios for extracting
+    reasoning content from model outputs.
+
+    Args:
+        model_type (str): Type of model to parse reasoning from
+        stream_reasoning (bool): If False, accumulates reasoning content until complete.
+            If True, streams reasoning content as it arrives.
+    """
+
+    DetectorMap: Dict[str, Type[BaseReasoningFormatDetector]] = {
+        "deepseek-r1": DeepSeekR1Detector,
+        "deepseek-v3": Qwen3Detector,
+        "glm45": Glm45Detector,
+        "gpt-oss": GptOssDetector,
+        "kimi": KimiDetector,
+        "kimi_k2": KimiK2Detector,
+        "qwen3": Qwen3Detector,
+        "qwen3-thinking": Qwen3Detector,
+        "minimax": Qwen3Detector,
+        "minimax-append-think": MiniMaxAppendThinkDetector,
+        "step3": DeepSeekR1Detector,
+        "step3p5": DeepSeekR1Detector,
+        "nemotron_3": Nemotron3Detector,
+        "interns1": Qwen3Detector,
+    }
+
+    def __init__(
+        self,
+        model_type: Optional[str] = None,
+        stream_reasoning: bool = True,
+        force_reasoning: Optional[bool] = None,
+        request: ChatCompletionRequest = None,
+    ):
+        if not model_type:
+            raise ValueError("Model type must be specified")
+
+        detector_class = self.DetectorMap.get(model_type.lower())
+        if not detector_class:
+            raise ValueError(f"Unsupported model type: {model_type}")
+
+        # Special cases where we override force_reasoning
+        if model_type.lower() in {"qwen3-thinking", "gpt-oss", "minimax"}:
+            force_reasoning = True
+
+        # Only pass force_reasoning if explicitly set, let detectors use their defaults
+        kwargs = {"stream_reasoning": stream_reasoning}
+        if force_reasoning is not None:
+            kwargs["force_reasoning"] = force_reasoning
+
+        if (
+            request is not None
+            and isinstance(request, ChatCompletionRequest)
+            and request.continue_final_message
+            and request.messages[-1].role == "assistant"
+        ):
+            kwargs["continue_final_message"] = True
+            kwargs["previous_content"] = request.messages[-1].content
+
+        self.detector = detector_class(**kwargs)
+
+    def parse_non_stream(self, full_text: str) -> Tuple[Optional[str], Optional[str]]:
+        """Non-streaming call: one-time parsing"""
+        ret = self.detector.detect_and_parse(full_text)
+        return ret.reasoning_text, ret.normal_text
+
+    def parse_stream_chunk(
+        self, chunk_text: str
+    ) -> Tuple[Optional[str], Optional[str]]:
+        """Streaming call: incremental parsing"""
+        ret = self.detector.parse_streaming_increment(chunk_text)
+        return ret.reasoning_text, ret.normal_text
diff --git a/sglang/python/sglang/srt/ray/__init__.py b/sglang/python/sglang/srt/ray/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5927c789f92672b2b80ac15647c1a10edc0236dd
--- /dev/null
+++ b/sglang/python/sglang/srt/ray/__init__.py
@@ -0,0 +1,3 @@
+from sglang.srt.ray.engine import RayEngine
+
+__all__ = ["RayEngine"]
diff --git a/sglang/python/sglang/srt/ray/engine.py b/sglang/python/sglang/srt/ray/engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c1fe75cb3b53e37b98d60f26ce54ac5d2d07c8a
--- /dev/null
+++ b/sglang/python/sglang/srt/ray/engine.py
@@ -0,0 +1,174 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""RayEngine - Engine subclass that launches schedulers as Ray actors."""
+
+from __future__ import annotations
+
+import dataclasses
+import logging
+from typing import Callable
+
+import ray
+from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
+
+from sglang.srt.entrypoints.engine import (
+    Engine,
+    SchedulerInitResult,
+    _calculate_rank_ranges,
+    _compute_parallelism_ranks,
+)
+from sglang.srt.ray.scheduler_actor import SchedulerActor
+from sglang.srt.server_args import ZMQ_TCP_PORT_DELTA, PortArgs, ServerArgs
+
+logger = logging.getLogger(__name__)
+
+
+@dataclasses.dataclass
+class RaySchedulerInitResult(SchedulerInitResult):
+    """SchedulerInitResult that also holds Ray actor handles for cleanup."""
+
+    scheduler_actors: list = dataclasses.field(default_factory=list)
+
+
+def _get_rank0_node_ip(placement_group) -> str:
+    """Get the IP address of the node where rank 0 will run.
+
+    Uses a probe task to discover the IP of the placement group's first bundle node.
+    This is needed because rank 0 starts the TCPStore server for torch.distributed,
+    so dist_init_addr must be the IP of the node where rank 0 runs, not the driver node.
+    """
+
+    @ray.remote(num_cpus=0, num_gpus=0)
+    def get_node_ip():
+        return ray.util.get_node_ip_address()
+
+    return ray.get(
+        get_node_ip.options(
+            scheduling_strategy=PlacementGroupSchedulingStrategy(
+                placement_group=placement_group,
+                placement_group_bundle_index=0,
+            ),
+        ).remote()
+    )
+
+
+class RayEngine(Engine):
+    """Engine using Ray actors for scheduler processes."""
+
+    def shutdown(self):
+        """Shutdown the engine — kill Ray scheduler actors then local processes."""
+        for actor in self._scheduler_init_result.scheduler_actors:
+            try:
+                ray.kill(actor)
+            except Exception:
+                logger.error(f"Failed to kill Ray scheduler actor: {actor}")
+        super().shutdown()
+
+    @classmethod
+    def _launch_scheduler_processes(
+        cls,
+        server_args: ServerArgs,
+        port_args: PortArgs,
+        run_scheduler_process_func: Callable,
+    ) -> SchedulerInitResult:
+        """Launch schedulers as Ray actors."""
+        if server_args.dp_size > 1:
+            raise NotImplementedError(
+                "Ray support for dp_size > 1 is not yet implemented. "
+                "Set dp_size=1 or use_ray=False."
+            )
+
+        pg = ray.util.get_current_placement_group()
+        if pg is None:
+            raise RuntimeError(
+                "use_ray=True requires a placement group, but none was detected. "
+                "Schedule the Engine actor onto a placement group"
+            )
+
+        world_size = server_args.tp_size * server_args.pp_size
+        nnodes = server_args.nnodes
+        gpus_per_node = world_size // nnodes
+
+        logger.info(
+            f"Ray cluster: {nnodes} nodes, "
+            f"Use {gpus_per_node} GPUs/node, world_size={world_size}"
+        )
+
+        rank0_node_ip = _get_rank0_node_ip(pg)
+        dist_init_addr = f"{rank0_node_ip}:{server_args.port + ZMQ_TCP_PORT_DELTA}"
+        logger.info(f"dist_init_addr: {dist_init_addr}")
+
+        scheduler_actors = []
+
+        for node_idx in range(nnodes):
+            pp_range, tp_range, pp_per_node, tp_per_node = _calculate_rank_ranges(
+                nnodes, server_args.pp_size, server_args.tp_size, node_rank=node_idx
+            )
+            for pp_rank in pp_range:
+                for tp_rank in tp_range:
+                    local_gpu_idx = (pp_rank % pp_per_node) * tp_per_node + (
+                        tp_rank % tp_per_node
+                    )
+
+                    attn_cp_rank, moe_dp_rank, moe_ep_rank = _compute_parallelism_ranks(
+                        server_args, tp_rank
+                    )
+
+                    actor = SchedulerActor.options(
+                        num_cpus=0,
+                        num_gpus=1,
+                        name=f"sglang_scheduler_rank0node={rank0_node_ip}_pp{pp_rank}_tp{tp_rank}",
+                        scheduling_strategy=PlacementGroupSchedulingStrategy(
+                            placement_group=pg,
+                            placement_group_bundle_index=node_idx,
+                        ),
+                    ).remote(
+                        server_args=server_args,
+                        port_args=port_args,
+                        gpu_id=local_gpu_idx,
+                        tp_rank=tp_rank,
+                        attn_cp_rank=attn_cp_rank,
+                        moe_dp_rank=moe_dp_rank,
+                        moe_ep_rank=moe_ep_rank,
+                        pp_rank=pp_rank,
+                        dp_rank=0,
+                        dist_init_addr=dist_init_addr,
+                    )
+                    scheduler_actors.append(actor)
+
+        try:
+            scheduler_infos = ray.get(
+                [actor.get_info.remote() for actor in scheduler_actors]
+            )
+        except ray.exceptions.RayActorError as e:
+            for actor in scheduler_actors:
+                try:
+                    ray.kill(actor)
+                except Exception:
+                    logger.error(f"Failed to kill Ray scheduler actor: {actor}")
+            raise RuntimeError(f"Scheduler actor failed to initialize: {e}")
+
+        event_loop_refs = [actor.run_event_loop.remote() for actor in scheduler_actors]
+
+        def wait_for_completion():
+            try:
+                ray.get(event_loop_refs)
+            except Exception as e:
+                logger.error(f"Ray scheduler actor terminated with error: {e}")
+
+        return RaySchedulerInitResult(
+            scheduler_infos=scheduler_infos,
+            wait_for_completion=wait_for_completion,
+            scheduler_actors=scheduler_actors,
+        )
diff --git a/sglang/python/sglang/srt/ray/http_server.py b/sglang/python/sglang/srt/ray/http_server.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0580838e195acc9ce5844b6a0bcf2db37dd2b2e
--- /dev/null
+++ b/sglang/python/sglang/srt/ray/http_server.py
@@ -0,0 +1,64 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ray-aware HTTP server launcher."""
+
+from typing import Callable, Optional
+
+from sglang.srt.entrypoints.engine import (
+    init_tokenizer_manager,
+    run_detokenizer_process,
+    run_scheduler_process,
+)
+from sglang.srt.server_args import ServerArgs
+
+
+def launch_server(
+    server_args: ServerArgs,
+    init_tokenizer_manager_func: Callable = init_tokenizer_manager,
+    run_scheduler_process_func: Callable = run_scheduler_process,
+    run_detokenizer_process_func: Callable = run_detokenizer_process,
+    execute_warmup_func: Optional[Callable] = None,
+    launch_callback: Optional[Callable[[], None]] = None,
+):
+    """Launch HTTP server with Ray-based scheduler actors.
+
+    Mirrors http_server.launch_server() but uses RayEngine for scheduler launching.
+    """
+    from sglang.srt.entrypoints.http_server import (
+        _execute_server_warmup,
+        _setup_and_run_http_server,
+    )
+    from sglang.srt.ray.engine import RayEngine
+
+    if execute_warmup_func is None:
+        execute_warmup_func = _execute_server_warmup
+
+    tokenizer_manager, template_manager, port_args, scheduler_init_result = (
+        RayEngine._launch_subprocesses(
+            server_args,
+            init_tokenizer_manager_func=init_tokenizer_manager_func,
+            run_scheduler_process_func=run_scheduler_process_func,
+            run_detokenizer_process_func=run_detokenizer_process_func,
+        )
+    )
+
+    _setup_and_run_http_server(
+        server_args,
+        tokenizer_manager,
+        template_manager,
+        port_args,
+        scheduler_init_result.scheduler_infos,
+        execute_warmup_func=execute_warmup_func,
+        launch_callback=launch_callback,
+    )
diff --git a/sglang/python/sglang/srt/ray/scheduler_actor.py b/sglang/python/sglang/srt/ray/scheduler_actor.py
new file mode 100644
index 0000000000000000000000000000000000000000..e68b78e94f21d2f6451bc609c0ddeda78c0762dc
--- /dev/null
+++ b/sglang/python/sglang/srt/ray/scheduler_actor.py
@@ -0,0 +1,111 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ray actor wrapper for SGLang Scheduler."""
+
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Any, Dict, Optional
+
+import ray
+
+if TYPE_CHECKING:
+    from sglang.srt.server_args import PortArgs, ServerArgs
+
+
+logger = logging.getLogger(__name__)
+
+
+@ray.remote
+class SchedulerActor:
+    """Ray actor wrapper for SGLang Scheduler.
+
+    Each actor manages one GPU and runs the Scheduler + TpModelWorker stack.
+    Ray is used for process lifecycle; ZMQ handles request/response communication.
+    """
+
+    def __init__(
+        self,
+        server_args: ServerArgs,
+        port_args: PortArgs,
+        gpu_id: int,
+        tp_rank: int,
+        attn_cp_rank: int,
+        moe_dp_rank: int,
+        moe_ep_rank: int,
+        pp_rank: int,
+        dp_rank: Optional[int],
+        dist_init_addr: Optional[str] = None,
+    ):
+        import dataclasses
+
+        from sglang.srt.managers.scheduler import Scheduler, configure_scheduler
+
+        # Override dist_init_addr if provided (for multi-node)
+        if dist_init_addr:
+            server_args = dataclasses.replace(
+                server_args, dist_init_addr=dist_init_addr
+            )
+
+        # Get actual GPU IDs from Ray runtime context
+        accelerator_ids = ray.get_runtime_context().get_accelerator_ids()
+        assigned_gpus = accelerator_ids.get("GPU", [])
+
+        if assigned_gpus:
+            # Ray assigned specific GPU(s), use the first one
+            actual_gpu_id = int(assigned_gpus[0])
+            logger.info(f"[TP{tp_rank}] Ray assigned GPU: {actual_gpu_id}")
+        else:
+            # Fallback to passed gpu_id
+            actual_gpu_id = gpu_id
+            logger.info(f"[TP{tp_rank}] Using passed gpu_id: {gpu_id}")
+
+        # Configure worker (logging, process title, etc.)
+        dp_rank = configure_scheduler(
+            server_args,
+            tp_rank,
+            attn_cp_rank,
+            moe_dp_rank,
+            moe_ep_rank,
+            pp_rank,
+            dp_rank,
+        )
+
+        # Create scheduler (loads model into GPU, initializes NCCL)
+        self.scheduler = Scheduler(
+            server_args=server_args,
+            port_args=port_args,
+            gpu_id=actual_gpu_id,
+            tp_rank=tp_rank,
+            moe_ep_rank=moe_ep_rank,
+            pp_rank=pp_rank,
+            attn_cp_rank=attn_cp_rank,
+            moe_dp_rank=moe_dp_rank,
+            dp_rank=dp_rank,
+        )
+
+        self._tp_rank = tp_rank
+        self._pp_rank = pp_rank
+
+    def get_info(self) -> Dict[str, Any]:
+        """Return scheduler initialization info for handshake."""
+        return self.scheduler.get_init_info()
+
+    def run_event_loop(self) -> None:
+        """Run the scheduler's event loop. Blocks until shutdown."""
+        try:
+            self.scheduler.run_event_loop()
+        except Exception as e:
+            logger.error(f"Scheduler PP{self._pp_rank} TP{self._tp_rank} crashed: {e}")
+            raise
diff --git a/sglang/python/sglang/srt/sampling/__pycache__/custom_logit_processor.cpython-311.pyc b/sglang/python/sglang/srt/sampling/__pycache__/custom_logit_processor.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fb1232c674f726815b22389f685cc108e6df4dc1
Binary files /dev/null and b/sglang/python/sglang/srt/sampling/__pycache__/custom_logit_processor.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/sampling/__pycache__/sampling_batch_info.cpython-311.pyc b/sglang/python/sglang/srt/sampling/__pycache__/sampling_batch_info.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ea537bccec6e88266459db16bbb636c48d77cdf3
Binary files /dev/null and b/sglang/python/sglang/srt/sampling/__pycache__/sampling_batch_info.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/sampling/__pycache__/sampling_params.cpython-311.pyc b/sglang/python/sglang/srt/sampling/__pycache__/sampling_params.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..611d58c44af37f4895d818e73e4ca9bfb6be9f65
Binary files /dev/null and b/sglang/python/sglang/srt/sampling/__pycache__/sampling_params.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/sampling/custom_logit_processor.py b/sglang/python/sglang/srt/sampling/custom_logit_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..d58a5f6cf1492520b1abbef374f22aed8c9ffd99
--- /dev/null
+++ b/sglang/python/sglang/srt/sampling/custom_logit_processor.py
@@ -0,0 +1,202 @@
+import json
+from abc import ABC, abstractmethod
+from functools import lru_cache
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set
+
+import dill
+import orjson
+import torch
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.schedule_batch import Req
+
+
+@lru_cache(maxsize=None)
+def _cache_from_str(json_str: str):
+    """Deserialize a json string to a Callable object.
+    This function is cached to avoid redundant deserialization.
+    """
+    data = orjson.loads(json_str)
+    return dill.loads(bytes.fromhex(data["callable"]))
+
+
+class CustomLogitProcessor(ABC):
+    """Abstract base class for callable functions."""
+
+    @abstractmethod
+    def __call__(
+        self,
+        logits: torch.Tensor,
+        custom_param_list: Optional[List[Dict[str, Any]]] = None,
+    ) -> torch.Tensor:
+        """Define the callable behavior."""
+        raise NotImplementedError
+
+    @classmethod
+    def to_str(cls) -> str:
+        """Serialize the callable function to a JSON-compatible string."""
+        return json.dumps({"callable": dill.dumps(cls).hex()})
+
+    @classmethod
+    def from_str(cls, json_str: str):
+        """Deserialize a callable function from a JSON string."""
+        return _cache_from_str(json_str)()
+
+
+class DisallowedTokensLogitsProcessor(CustomLogitProcessor):
+    def __call__(
+        self,
+        logits: torch.Tensor,
+        custom_param_list: Optional[List[Dict[str, Any]]] = None,
+    ) -> torch.Tensor:
+        disallowed_token_ids = custom_param_list[0]["token_ids"]
+        assert all(
+            disallowed_token_ids == c["token_ids"] for c in custom_param_list
+        ), f"{custom_param_list=}"
+        logits[..., disallowed_token_ids] = -float("inf")
+        return logits
+
+
+class ThinkingBudgetLogitProcessor(CustomLogitProcessor):
+    """A logit processor that controls the length of thinking."""
+
+    THINKING_START_TOKEN_ID: int
+    THINKING_END_TOKEN_ID: int
+    NEW_LINE_TOKEN_ID: int
+
+    def __call__(self, logits, custom_param_list: list[dict[str, Any]]):
+        if custom_param_list is None or not custom_param_list:
+            return logits
+        for i, param_dict in enumerate(custom_param_list):
+            if param_dict is None:
+                continue
+
+            thinking_budget: int | None = param_dict.get("thinking_budget")
+
+            # Skip if thinking_budget is unset, or not an integer, or negative
+            if (
+                thinking_budget is None
+                or not isinstance(thinking_budget, int)
+                or thinking_budget < 0
+            ):
+                continue
+            req: Req = param_dict.get("__req__")
+            cur_ids: list[int] = [*req.origin_input_ids, *req.output_ids]
+
+            # Check if out of thinking stage
+            if (
+                self.THINKING_START_TOKEN_ID not in cur_ids
+                or self.THINKING_END_TOKEN_ID in cur_ids
+            ):
+                continue
+
+            # Find the index of the thinking start token
+            start_index = cur_ids.index(self.THINKING_START_TOKEN_ID)
+
+            # Count the number of tokens after the thinking start token
+            num_tokens_after_start = len(cur_ids) - start_index - 1
+
+            if num_tokens_after_start < thinking_budget:
+                continue
+
+            # Ensure new line token before thinking end token
+            if not req.output_ids or req.output_ids[-1] != self.NEW_LINE_TOKEN_ID:
+                logits[i, :] = -float("inf")
+                logits[i, self.NEW_LINE_TOKEN_ID] = 0.0
+                continue
+
+            # Assign highest probability to the thinking end token
+            logits[i, :] = -float("inf")
+            logits[i, self.THINKING_END_TOKEN_ID] = 0.0
+
+        return logits
+
+
+class Glm4MoeThinkingBudgetLogitProcessor(ThinkingBudgetLogitProcessor):
+    """A logit processor that controls the length of thinking for GLM-4.5 / GLM-4.6 / GLM-4.5V / GLM-4.6V models."""
+
+    THINKING_START_TOKEN_ID: int = 151350
+    THINKING_END_TOKEN_ID: int = 151351
+    NEW_LINE_TOKEN_ID: int = 198
+
+
+class Qwen3ThinkingBudgetLogitProcessor(ThinkingBudgetLogitProcessor):
+    """A logit processor that controls the length of thinking for Qwen3 models."""
+
+    THINKING_START_TOKEN_ID: int = 151667
+    THINKING_END_TOKEN_ID: int = 151668
+    NEW_LINE_TOKEN_ID: int = 198
+
+
+class DeepSeekR1ThinkingBudgetLogitProcessor(ThinkingBudgetLogitProcessor):
+    """A logit processor that controls the length of thinking for DeepSeek-R1 models."""
+
+    THINKING_START_TOKEN_ID: int = 128798
+    THINKING_END_TOKEN_ID: int = 128799
+    NEW_LINE_TOKEN_ID: int = 201
+
+
+# Adapted from DeepSeek's implementation: https://github.com/deepseek-ai/DeepSeek-OCR/blob/main/DeepSeek-OCR-master/DeepSeek-OCR-vllm/process/ngram_norepeat.py
+class DeepseekOCRNoRepeatNGramLogitProcessor(CustomLogitProcessor):
+    """Block n-gram repetitions within a sliding window for DeepSeek-OCR outputs."""
+
+    def __call__(
+        self,
+        logits: torch.Tensor,
+        custom_param_list: Optional[List[Dict[str, Any]]] = None,
+    ) -> torch.Tensor:
+        if not custom_param_list:
+            return logits
+
+        for batch_idx, params in enumerate(custom_param_list):
+            if not params:
+                continue
+
+            req = params.get("__req__")
+            if req is None:
+                continue
+
+            try:
+                ngram_size = int(params.get("ngram_size") or 0)
+                window_size = int(params.get("window_size") or 0)
+            except (TypeError, ValueError):
+                continue
+
+            if ngram_size <= 0 or window_size <= 0:
+                continue
+
+            sequence: List[int] = req.origin_input_ids + req.output_ids
+            if len(sequence) < ngram_size:
+                continue
+
+            search_start = max(0, len(sequence) - window_size)
+            search_end = len(sequence) - ngram_size + 1
+            if search_end <= search_start:
+                continue
+
+            if ngram_size > 1:
+                current_prefix = tuple(sequence[-(ngram_size - 1) :])
+            else:
+                current_prefix = tuple()
+
+            banned_tokens: Set[int] = set()
+            for idx in range(search_start, search_end):
+                ngram = sequence[idx : idx + ngram_size]
+                if ngram_size == 1 or tuple(ngram[:-1]) == current_prefix:
+                    banned_tokens.add(ngram[-1])
+
+            whitelist_ids = params.get("whitelist_token_ids") or []
+            try:
+                whitelist = {int(token_id) for token_id in whitelist_ids}
+            except (TypeError, ValueError):
+                whitelist = set()
+
+            banned_tokens.difference_update(whitelist)
+
+            if not banned_tokens:
+                continue
+
+            indices = list(banned_tokens)
+            logits[batch_idx, indices] = -float("inf")
+
+        return logits
diff --git a/sglang/python/sglang/srt/sampling/penaltylib/__init__.py b/sglang/python/sglang/srt/sampling/penaltylib/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..26a780517ce7d70849195dd765f3e38c5dd71863
--- /dev/null
+++ b/sglang/python/sglang/srt/sampling/penaltylib/__init__.py
@@ -0,0 +1,11 @@
+from sglang.srt.sampling.penaltylib.frequency_penalty import BatchedFrequencyPenalizer
+from sglang.srt.sampling.penaltylib.min_new_tokens import BatchedMinNewTokensPenalizer
+from sglang.srt.sampling.penaltylib.orchestrator import BatchedPenalizerOrchestrator
+from sglang.srt.sampling.penaltylib.presence_penalty import BatchedPresencePenalizer
+
+__all__ = [
+    "BatchedFrequencyPenalizer",
+    "BatchedMinNewTokensPenalizer",
+    "BatchedPresencePenalizer",
+    "BatchedPenalizerOrchestrator",
+]
diff --git a/sglang/python/sglang/srt/sampling/penaltylib/__pycache__/__init__.cpython-311.pyc b/sglang/python/sglang/srt/sampling/penaltylib/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..49f8b06eb02341eba468d9e4c03194c2390e3bfa
Binary files /dev/null and b/sglang/python/sglang/srt/sampling/penaltylib/__pycache__/__init__.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/sampling/penaltylib/__pycache__/frequency_penalty.cpython-311.pyc b/sglang/python/sglang/srt/sampling/penaltylib/__pycache__/frequency_penalty.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5524071bea2320d32e4d1070605ec04373c997bf
Binary files /dev/null and b/sglang/python/sglang/srt/sampling/penaltylib/__pycache__/frequency_penalty.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/sampling/penaltylib/__pycache__/min_new_tokens.cpython-311.pyc b/sglang/python/sglang/srt/sampling/penaltylib/__pycache__/min_new_tokens.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fd3272a9a44760eaed6bdc497613d0589d80feea
Binary files /dev/null and b/sglang/python/sglang/srt/sampling/penaltylib/__pycache__/min_new_tokens.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/sampling/penaltylib/__pycache__/orchestrator.cpython-311.pyc b/sglang/python/sglang/srt/sampling/penaltylib/__pycache__/orchestrator.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2d71f03bfa0e76f00ed24a6acbe36b1dd517c67f
Binary files /dev/null and b/sglang/python/sglang/srt/sampling/penaltylib/__pycache__/orchestrator.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/sampling/penaltylib/__pycache__/presence_penalty.cpython-311.pyc b/sglang/python/sglang/srt/sampling/penaltylib/__pycache__/presence_penalty.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0c726c6910c5e6a3e6bd7909f670f25379ca6ac9
Binary files /dev/null and b/sglang/python/sglang/srt/sampling/penaltylib/__pycache__/presence_penalty.cpython-311.pyc differ
diff --git a/sglang/python/sglang/srt/sampling/penaltylib/frequency_penalty.py b/sglang/python/sglang/srt/sampling/penaltylib/frequency_penalty.py
new file mode 100644
index 0000000000000000000000000000000000000000..63d838574faf85aa59146fb2aa6d6abbd0527931
--- /dev/null
+++ b/sglang/python/sglang/srt/sampling/penaltylib/frequency_penalty.py
@@ -0,0 +1,63 @@
+import torch
+
+from sglang.srt.sampling.penaltylib.orchestrator import _BatchedPenalizer
+
+
+class BatchedFrequencyPenalizer(_BatchedPenalizer):
+    """
+    Frequency penalizer penalizes tokens based on their frequency in the output.
+    """
+
+    def _is_required(self) -> bool:
+        return any(
+            req.sampling_params.frequency_penalty != 0.0
+            for req in self.orchestrator.reqs()
+        )
+
+    def _prepare(self):
+        self.cumulated_frequency_penalties = torch.zeros(
+            (len(self.orchestrator.reqs()), self.orchestrator.vocab_size),
+            dtype=torch.float32,
+            device=self.orchestrator.device,
+        )
+
+        self.frequency_penalties = (
+            torch.tensor(
+                data=[
+                    req.sampling_params.frequency_penalty
+                    for req in self.orchestrator.reqs()
+                ],
+                dtype=torch.float32,
+                device=self.orchestrator.device,
+            )
+        ).unsqueeze_(1)
+
+    def _cumulate_output_tokens(self, output_ids: torch.Tensor):
+        self.cumulated_frequency_penalties.scatter_add_(
+            dim=1,
+            index=output_ids.unsqueeze(1),
+            src=self.frequency_penalties,
+        )
+
+    def _apply(self, logits: torch.Tensor) -> torch.Tensor:
+        logits.sub_(self.cumulated_frequency_penalties)
+
+    def _filter(self, keep_indices: torch.Tensor):
+        self.frequency_penalties = self.frequency_penalties[keep_indices]
+        self.cumulated_frequency_penalties = self.cumulated_frequency_penalties[
+            keep_indices
+        ]
+
+    def _merge(self, their: "BatchedFrequencyPenalizer"):
+        self.frequency_penalties = torch.cat(
+            [self.frequency_penalties, their.frequency_penalties], dim=0
+        )
+        self.cumulated_frequency_penalties = torch.cat(
+            [self.cumulated_frequency_penalties, their.cumulated_frequency_penalties],
+            dim=0,
+        )
+
+    def _teardown(self) -> None:
+        for name in ("frequency_penalties", "cumulated_frequency_penalties"):
+            if hasattr(self, name):
+                delattr(self, name)
diff --git a/sglang/python/sglang/srt/sampling/penaltylib/min_new_tokens.py b/sglang/python/sglang/srt/sampling/penaltylib/min_new_tokens.py
new file mode 100644
index 0000000000000000000000000000000000000000..08f76e1f145b333f201ad86400f44fa2d88ccc24
--- /dev/null
+++ b/sglang/python/sglang/srt/sampling/penaltylib/min_new_tokens.py
@@ -0,0 +1,93 @@
+import torch
+
+from sglang.srt.sampling.penaltylib.orchestrator import _BatchedPenalizer
+
+
+class BatchedMinNewTokensPenalizer(_BatchedPenalizer):
+    """
+    Min new tokens penalizer penalizes tokens based on the length of the output.
+    """
+
+    def _is_required(self) -> bool:
+        return any(
+            req.sampling_params.min_new_tokens > 0 for req in self.orchestrator.reqs()
+        )
+
+    def _prepare(self):
+        self.min_new_tokens = torch.tensor(
+            data=[
+                req.sampling_params.min_new_tokens for req in self.orchestrator.reqs()
+            ],
+            dtype=torch.int32,
+            device=self.orchestrator.device,
+        ).unsqueeze_(1)
+
+        padded_stop_token_ids = torch.nn.utils.rnn.pad_sequence(
+            sequences=[
+                torch.tensor(
+                    data=(
+                        list(
+                            (req.sampling_params.stop_token_ids or set())
+                            | (req.tokenizer.additional_stop_token_ids or set())
+                            | {req.tokenizer.eos_token_id}
+                        )
+                    ),
+                    dtype=torch.int64,
+                    device=self.orchestrator.device,
+                )
+                for req in self.orchestrator.reqs()
+            ],
+            batch_first=True,
+            padding_value=self.orchestrator.vocab_size,
+        )
+        self.stop_token_penalties = torch.zeros(
+            size=(len(self.orchestrator.reqs()), self.orchestrator.vocab_size + 1),
+            dtype=torch.float32,
+            device=self.orchestrator.device,
+        ).scatter_add_(
+            dim=1,
+            index=padded_stop_token_ids,
+            src=torch.full_like(
+                input=padded_stop_token_ids,
+                dtype=torch.float32,
+                fill_value=float("-inf"),
+                device=self.orchestrator.device,
+            ),
+        )[
+            :, : self.orchestrator.vocab_size
+        ]
+
+        self.len_output_tokens = torch.zeros(
+            size=(len(self.orchestrator.reqs()), 1),
+            dtype=torch.int32,
+            device=self.orchestrator.device,
+        )
+
+    def _cumulate_output_tokens(self, output_ids: torch.Tensor):
+        self.len_output_tokens += 1
+
+    def _apply(self, logits: torch.Tensor):
+        mask = (self.len_output_tokens < self.min_new_tokens).expand_as(logits)
+        logits[mask] += self.stop_token_penalties[mask]
+
+    def _filter(self, keep_indices: torch.Tensor):
+        self.min_new_tokens = self.min_new_tokens[keep_indices]
+        self.stop_token_penalties = self.stop_token_penalties[keep_indices]
+        self.len_output_tokens = self.len_output_tokens[keep_indices]
+
+    def _merge(self, their: "BatchedMinNewTokensPenalizer"):
+        self.min_new_tokens = torch.cat(
+            [self.min_new_tokens, their.min_new_tokens], dim=0
+        )
+        self.stop_token_penalties = torch.cat(
+            [self.stop_token_penalties, their.stop_token_penalties], dim=0
+        )
+        self.len_output_tokens = torch.cat(
+            [self.len_output_tokens, their.len_output_tokens], dim=0
+        )
+
+    # Explicit resource cleanup to aid GC and free CUDA memory promptly
+    def _teardown(self) -> None:
+        for name in ("min_new_tokens", "stop_token_penalties", "len_output_tokens"):
+            if hasattr(self, name):
+                delattr(self, name)
diff --git a/sglang/python/sglang/srt/sampling/penaltylib/orchestrator.py b/sglang/python/sglang/srt/sampling/penaltylib/orchestrator.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ef123f554f94a2b291733913a10078ee9ba3b10
--- /dev/null
+++ b/sglang/python/sglang/srt/sampling/penaltylib/orchestrator.py
@@ -0,0 +1,249 @@
+from __future__ import annotations
+
+import abc
+import weakref
+from typing import TYPE_CHECKING, Optional, Set, Type
+
+import torch
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.schedule_batch import ScheduleBatch
+
+
+class BatchedPenalizerOrchestrator:
+    def __init__(
+        self,
+        vocab_size: int,
+        batch: ScheduleBatch,
+        penalizers: Set[Type["_BatchedPenalizer"]],
+    ):
+        self.vocab_size = vocab_size
+        self._batch_ref = weakref.ref(batch)
+        self.device = batch.device
+        self.penalizers = {Penalizer: Penalizer(self) for Penalizer in penalizers}
+
+        is_required = False
+        for penalizer in self.penalizers.values():
+            pen_is_required = penalizer.prepare_if_required()
+            is_required |= pen_is_required
+        self.is_required = is_required
+
+    @property
+    def batch(self) -> ScheduleBatch | None:
+        return self._batch_ref()
+
+    @batch.setter
+    def batch(self, value: Optional[ScheduleBatch]):
+        if value is None:
+            self._batch_ref = lambda: None
+        else:
+            self._batch_ref = weakref.ref(value)
+
+    def reqs(self):
+        return self.batch.reqs
+
+    def cumulate_output_tokens(self, output_ids: torch.Tensor):
+        """
+        Feed the output tokens to the penalizers.
+
+        Args:
+            output_ids (torch.Tensor): The output tokens.
+        """
+        for penalizer in self.penalizers.values():
+            penalizer.cumulate_output_tokens(output_ids=output_ids)
+
+    def apply(self, logits: torch.Tensor) -> torch.Tensor:
+        """
+        Apply the penalizers to the logits.
+        Note that it may apply the penalizers in-place.
+
+        Args:
+            logits (torch.Tensor): The logits to apply the penalizers to.
+
+        Returns:
+            torch.Tensor: The logits after applying the penalizers.
+        """
+        for penalizer in self.penalizers.values():
+            penalizer.apply(logits)
+
+    def filter(self, keep_indices: torch.Tensor):
+        """
+        Filter the penalizers based on the indices to keep in the batch.
+
+        Args:
+            keep_indices (torch.Tensor): Tensor of indices to keep in the batch.
+        """
+        if not self.is_required:
+            return
+
+        if len(keep_indices) == 0:
+            # No requests left in the batch, fully release orchestrator resources
+            self.release()
+            return
+
+        is_required = False
+        for penalizer in self.penalizers.values():
+            tmp_is_required = penalizer.is_required()
+            is_required |= tmp_is_required
+            if tmp_is_required:
+                penalizer.filter(keep_indices=keep_indices)
+            else:
+                penalizer.teardown()
+        self.is_required = is_required
+
+    # Resource management helpers
+    def release(self) -> None:
+        """Release all penalizers and break references so GC can reclaim promptly."""
+        for penalizer in self.penalizers.values():
+            penalizer.teardown()
+        self.penalizers.clear()
+        # Break reference to ScheduleBatch
+        self._batch_ref = None
+        self.is_required = False
+
+    # Context manager support
+    def __enter__(self) -> "BatchedPenalizerOrchestrator":
+        return self
+
+    def __exit__(self, exc_type, exc, tb) -> None:
+        self.release()
+
+    def merge(self, their: "BatchedPenalizerOrchestrator"):
+        """
+        Merge the penalizers of another orchestrator into this one.
+
+        Note that this function **must** be called _before_ self.batch.reqs is updated (filtered).
+        Each unprepared penalizers would have to be prepared (creating tensors, etc.) first before merging.
+        This step requires the original batch.reqs, before it gets merged with other batch.reqs.
+
+        Args:
+            their (BatchedPenalizerOrchestrator): The orchestrator to merge into this one.
+        """
+        if not self.is_required and not their.is_required:
+            return
+
+        self.is_required = True
+        for penalizer, their_penalizer in their.penalizers.items():
+            self.penalizers[penalizer].merge(their_penalizer)
+
+
+class _BatchedPenalizer(abc.ABC):
+    """
+    An abstract class for a batched penalizer.
+    """
+
+    def __init__(self, orchestrator: BatchedPenalizerOrchestrator):
+        self._orchestrator_ref: weakref.ReferenceType[BatchedPenalizerOrchestrator] = (
+            weakref.ref(orchestrator)
+        )
+        self._is_prepared = False
+
+    @property
+    def orchestrator(self) -> BatchedPenalizerOrchestrator:
+        orch: Optional[BatchedPenalizerOrchestrator] = self._orchestrator_ref()
+        # This should never happen, but we need to handle it gracefully
+        if orch is None:
+            raise RuntimeError(
+                "BatchedPenalizerOrchestrator has been garbage-collected"
+            )
+        return orch
+
+    def is_prepared(self) -> bool:
+        return self._is_prepared
+
+    def is_required(self) -> bool:
+        return self._is_required()
+
+    def prepare(self):
+        if not self._is_prepared:
+            self._prepare()
+            self._is_prepared = True
+
+    def prepare_if_required(self):
+        if self._is_required():
+            self.prepare()
+            return True
+        else:
+            return False
+
+    def teardown(self):
+        self._teardown()
+        self._is_prepared = False
+
+    def cumulate_output_tokens(self, output_ids: torch.Tensor):
+        if not self._is_prepared:
+            return
+
+        self._cumulate_output_tokens(output_ids=output_ids)
+
+    def apply(self, logits: torch.Tensor) -> torch.Tensor:
+        if not self._is_prepared:
+            return
+
+        self._apply(logits=logits)
+
+    def filter(self, keep_indices: torch.Tensor):
+        if not self._is_prepared:
+            return
+
+        self._filter(keep_indices=keep_indices)
+
+    def merge(self, their: "_BatchedPenalizer"):
+        if not self._is_prepared and not their._is_prepared:
+            return
+
+        self.prepare()
+        their.prepare()
+        self._merge(their)
+
+    @abc.abstractmethod
+    def _is_required(self) -> bool:
+        """
+        Check if the penalizer is required to be prepared.
+        """
+        pass
+
+    @abc.abstractmethod
+    def _prepare(self):
+        """
+        Prepare the penalizer.
+        Usually, this is where the penalizer initializes its tensors.
+        """
+        pass
+
+    @abc.abstractmethod
+    def _cumulate_output_tokens(self, output_ids: torch.Tensor):
+        """
+        Cumulate the output tokens.
+        Orchestrator will call this function to feed the output tokens to the penalizer.
+        """
+        pass
+
+    @abc.abstractmethod
+    def _apply(self, logits: torch.Tensor) -> torch.Tensor:
+        """
+        Apply the penalizer to the logits.
+        Penalizers can modify the logits in-place if needed.
+        """
+        pass
+
+    @abc.abstractmethod
+    def _filter(self, keep_indices: torch.Tensor):
+        """
+        Filter the penalizer (tensors or underlying data) based on the indices to keep in the batch.
+        """
+        pass
+
+    @abc.abstractmethod
+    def _merge(self, their: "_BatchedPenalizer"):
+        """
+        Merge the penalizer with another penalizer.
+        """
+        pass
+
+    @abc.abstractmethod
+    def _teardown(self):
+        """
+        Teardown the penalizer.
+        """
+        pass
diff --git a/sglang/python/sglang/srt/sampling/penaltylib/presence_penalty.py b/sglang/python/sglang/srt/sampling/penaltylib/presence_penalty.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c045039e137897daaf414a9b144dde767f6bbe5
--- /dev/null
+++ b/sglang/python/sglang/srt/sampling/penaltylib/presence_penalty.py
@@ -0,0 +1,63 @@
+import torch
+
+from sglang.srt.sampling.penaltylib.orchestrator import _BatchedPenalizer
+
+
+class BatchedPresencePenalizer(_BatchedPenalizer):
+    """
+    Presence penalizer penalizes tokens based on their presence in the output.
+    """
+
+    def _is_required(self) -> bool:
+        return any(
+            req.sampling_params.presence_penalty != 0.0
+            for req in self.orchestrator.reqs()
+        )
+
+    def _prepare(self):
+        self.cumulated_presence_penalties = torch.zeros(
+            (len(self.orchestrator.reqs()), self.orchestrator.vocab_size),
+            dtype=torch.float32,
+            device=self.orchestrator.device,
+        )
+
+        self.presence_penalties = (
+            torch.tensor(
+                data=[
+                    req.sampling_params.presence_penalty
+                    for req in self.orchestrator.reqs()
+                ],
+                dtype=torch.float32,
+                device=self.orchestrator.device,
+            )
+        ).unsqueeze_(1)
+
+    def _cumulate_output_tokens(self, output_ids: torch.Tensor):
+        self.cumulated_presence_penalties.scatter_(
+            dim=1,
+            index=output_ids.unsqueeze(1),
+            src=self.presence_penalties,
+        )
+
+    def _apply(self, logits: torch.Tensor) -> torch.Tensor:
+        logits.sub_(self.cumulated_presence_penalties)
+
+    def _filter(self, keep_indices: torch.Tensor):
+        self.presence_penalties = self.presence_penalties[keep_indices]
+        self.cumulated_presence_penalties = self.cumulated_presence_penalties[
+            keep_indices
+        ]
+
+    def _merge(self, their: "BatchedPresencePenalizer"):
+        self.presence_penalties = torch.cat(
+            [self.presence_penalties, their.presence_penalties], dim=0
+        )
+        self.cumulated_presence_penalties = torch.cat(
+            [self.cumulated_presence_penalties, their.cumulated_presence_penalties],
+            dim=0,
+        )
+
+    def _teardown(self) -> None:
+        for name in ("presence_penalties", "cumulated_presence_penalties"):
+            if hasattr(self, name):
+                delattr(self, name)
diff --git a/sglang/python/sglang/srt/sampling/sampling_batch_info.py b/sglang/python/sglang/srt/sampling/sampling_batch_info.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8f17c754d8ec5b4fbf820973c46058789b3dae3
--- /dev/null
+++ b/sglang/python/sglang/srt/sampling/sampling_batch_info.py
@@ -0,0 +1,423 @@
+from __future__ import annotations
+
+import dataclasses
+import logging
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple
+
+import torch
+
+import sglang.srt.sampling.penaltylib as penaltylib
+from sglang.srt.sampling.custom_logit_processor import CustomLogitProcessor
+from sglang.srt.sampling.sampling_params import TOP_K_ALL
+from sglang.srt.server_args import get_global_server_args
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.schedule_batch import ScheduleBatch
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclasses.dataclass
+class SamplingBatchInfo:
+    # Basic batched sampling params
+    temperatures: torch.Tensor
+    top_ps: torch.Tensor
+    top_ks: torch.Tensor
+    min_ps: torch.Tensor
+
+    # Whether all requests use greedy sampling
+    is_all_greedy: bool
+
+    # Whether any requests use top_p sampling
+    need_top_p_sampling: bool
+
+    # Whether any requests use top_k sampling
+    need_top_k_sampling: bool
+
+    # Whether any request needs min_p sampling
+    need_min_p_sampling: bool
+
+    # Masking tensors for grammar-guided structured outputs
+    vocab_size: int
+    grammars: Optional[List] = None
+    vocab_mask: Optional[torch.Tensor] = None
+    apply_mask_func: Optional[Callable[[torch.Tensor, torch.Tensor], None]] = None
+
+    # Penalizer
+    penalizer_orchestrator: Optional[penaltylib.BatchedPenalizerOrchestrator] = None
+    acc_linear_penalties: torch.Tensor = None  # Used in the overlap mode
+
+    # Whether any request has custom logit processor
+    has_custom_logit_processor: bool = False
+    # Custom parameters
+    custom_params: Optional[List[Optional[Dict[str, Any]]]] = None
+    # Custom logit processor
+    custom_logit_processor: Optional[
+        Dict[int, Tuple[CustomLogitProcessor, torch.Tensor]]
+    ] = None
+
+    # Used for deterministic sampling
+    sampling_seed: Optional[torch.Tensor] = None
+
+    # Device
+    device: str = "cuda"
+
+    # Handle logit bias
+    logit_bias: Optional[torch.Tensor] = None
+
+    @classmethod
+    def from_schedule_batch(cls, batch: ScheduleBatch, vocab_size: int):
+        global_server_args = get_global_server_args()
+        enable_deterministic = global_server_args.enable_deterministic_inference
+
+        reqs = batch.reqs
+        device = batch.device
+        temperatures = torch.tensor(
+            [r.sampling_params.temperature for r in reqs],
+            dtype=torch.float,
+            device=device,
+        ).view(-1, 1)
+        top_ps = torch.tensor(
+            [r.sampling_params.top_p for r in reqs], dtype=torch.float, device=device
+        )
+        top_ks = torch.tensor(
+            [r.sampling_params.top_k for r in reqs], dtype=torch.int32, device=device
+        )
+        min_ps = torch.tensor(
+            [r.sampling_params.min_p for r in reqs], dtype=torch.float, device=device
+        )
+        sampling_seed = (
+            torch.tensor(
+                [
+                    (
+                        r.sampling_params.sampling_seed
+                        if r.sampling_params.sampling_seed is not None
+                        else 42
+                    )
+                    for r in reqs
+                ],
+                dtype=torch.int64,
+                device=device,
+            )
+            if enable_deterministic
+            else None
+        )
+
+        logit_bias = None
+        if any(r.sampling_params.logit_bias is not None for r in reqs):
+            logit_bias = torch.zeros(len(reqs), vocab_size, device=device)
+            for i, r in enumerate(reqs):
+                if r.sampling_params.logit_bias is not None:
+                    for key, value in r.sampling_params.logit_bias.items():
+                        logit_bias[i, int(key)] = value
+
+        # Check if any request has custom logit processor
+        has_custom_logit_processor = (
+            global_server_args.enable_custom_logit_processor
+            and any(r.custom_logit_processor for r in reqs)  # check the flag first.
+        )  # then check the requests.
+
+        if has_custom_logit_processor:
+            # Merge the same type of custom logit processors together
+            processor_dict = {}
+            for i, r in enumerate(reqs):
+                if r.custom_logit_processor is None:
+                    continue
+                processor_str = r.custom_logit_processor
+                if processor_str not in processor_dict:
+                    processor_dict[processor_str] = []
+                processor_dict[processor_str].append(i)
+
+            merged_custom_logit_processor = {
+                hash(processor_str): (
+                    # The deserialized custom logit processor object
+                    CustomLogitProcessor.from_str(processor_str),
+                    # The mask tensor for the requests that use this custom logit processor
+                    torch.zeros(len(reqs), dtype=torch.bool)
+                    .scatter_(0, torch.tensor(true_indices), True)
+                    .to(device, non_blocking=True),
+                )
+                for processor_str, true_indices in processor_dict.items()
+            }
+            custom_params = [r.sampling_params.custom_params for r in reqs]
+        else:
+            merged_custom_logit_processor = None
+            custom_params = None
+
+        # Each penalizers will do nothing if they evaluate themselves as not required by looking at
+        # the sampling_params of the requests (See {_is_required()} of each penalizers). So this
+        # should not add hefty computation overhead other than simple checks.
+        #
+        # While we can choose not to even create the class instances if they are not required, this
+        # could add additional complexity to the {ScheduleBatch} class, especially we need to
+        # handle {filter_batch()} and {merge_batch()} cases as well.
+        penalizer_orchestrator = penaltylib.BatchedPenalizerOrchestrator(
+            vocab_size=vocab_size,
+            batch=batch,
+            penalizers={
+                penaltylib.BatchedFrequencyPenalizer,
+                penaltylib.BatchedMinNewTokensPenalizer,
+                penaltylib.BatchedPresencePenalizer,
+            },
+        )
+
+        ret = cls(
+            temperatures=temperatures,
+            top_ps=top_ps,
+            top_ks=top_ks,
+            min_ps=min_ps,
+            sampling_seed=sampling_seed,
+            is_all_greedy=all(r.sampling_params.top_k <= 1 for r in reqs),
+            need_top_p_sampling=any(r.sampling_params.top_p != 1.0 for r in reqs),
+            need_top_k_sampling=any(r.sampling_params.top_k != TOP_K_ALL for r in reqs),
+            need_min_p_sampling=any(r.sampling_params.min_p > 0 for r in reqs),
+            vocab_size=vocab_size,
+            penalizer_orchestrator=penalizer_orchestrator,
+            has_custom_logit_processor=has_custom_logit_processor,
+            custom_params=custom_params,
+            custom_logit_processor=merged_custom_logit_processor,
+            device=device,
+            logit_bias=logit_bias,
+        )
+        ret.adjusted_from_schedule_batch(batch, vocab_size)
+        return ret
+
+    # placeholder for override
+    def adjusted_from_schedule_batch(self, batch: ScheduleBatch, vocab_size: int):
+        pass
+
+    # placeholder for override
+    def adjusted_merge_batch(self, other: "SamplingBatchInfo"):
+        pass
+
+    def __len__(self):
+        return len(self.temperatures)
+
+    def update_regex_vocab_mask(self):
+        if not self.grammars:
+            self.vocab_mask = None
+            self.apply_mask_func = None
+            return
+
+        # Find a grammar from the list
+        first_grammar = next(grammar for grammar in self.grammars if grammar)
+
+        # TODO(lianmin): Maybe we can reuse the existing mask?
+        self.vocab_mask = first_grammar.allocate_vocab_mask(
+            vocab_size=self.vocab_size,
+            batch_size=len(self.temperatures),
+            device=self.device,
+        )
+        self.apply_mask_func = (
+            first_grammar.apply_vocab_mask
+        )  # force to use static method
+
+        # Apply the mask
+        for i, grammar in enumerate(self.grammars):
+            if grammar and not grammar.finished and not grammar.is_terminated():
+                grammar.fill_vocab_mask(self.vocab_mask, i)
+
+        # Move the mask to the device if needed
+        self.vocab_mask = first_grammar.move_vocab_mask(self.vocab_mask, self.device)
+
+    def update_penalties(self):
+        if self.penalizer_orchestrator.is_required:
+            self.acc_linear_penalties = torch.zeros(
+                (len(self.temperatures), self.vocab_size),
+                dtype=torch.float32,
+                device=self.temperatures.device,
+            )
+            self.penalizer_orchestrator.apply(self.acc_linear_penalties)
+        else:
+            self.acc_linear_penalties = None
+
+    def apply_logits_bias(self, logits: torch.Tensor):
+        if self.acc_linear_penalties is not None:
+            # Used in the overlap mode
+            logits.add_(self.acc_linear_penalties)
+
+        if self.penalizer_orchestrator and self.penalizer_orchestrator.is_required:
+            # Used in the non-overlap mode
+            self.penalizer_orchestrator.apply(logits)
+
+        if self.vocab_mask is not None:
+            self.apply_mask_func(logits=logits, vocab_mask=self.vocab_mask)
+
+        if self.logit_bias is not None:
+            logits.add_(self.logit_bias)
+
+    def filter_batch(self, keep_indices: List[int], keep_indices_device: torch.Tensor):
+        self.penalizer_orchestrator.filter(keep_indices_device)
+
+        if self.has_custom_logit_processor:
+            self._filter_batch_custom_logit_processor(keep_indices, keep_indices_device)
+
+        for item in [
+            "temperatures",
+            "top_ps",
+            "top_ks",
+            "min_ps",
+            "sampling_seed",
+        ]:
+            value = getattr(self, item, None)
+            if value is not None:
+                setattr(self, item, value[keep_indices_device])
+
+        if self.logit_bias is not None:
+            self.logit_bias = self.logit_bias[keep_indices_device]
+
+    def _filter_batch_custom_logit_processor(
+        self, keep_indices: List[int], keep_indices_device: torch.Tensor
+    ):
+        """Filter the custom logit processor and custom params"""
+        self.custom_logit_processor = {
+            k: (p, mask[keep_indices_device])
+            for k, (p, mask) in self.custom_logit_processor.items()
+            if torch.any(
+                mask[keep_indices_device]
+            )  # ignore the custom logit processor whose mask is all False
+        }
+        self.custom_params = [self.custom_params[i] for i in keep_indices]
+
+        # If the custom logit processor is an empty dict, set the flag to False,
+        # and set the custom logit processor and custom params to None.
+        if len(self.custom_logit_processor) == 0:
+            self.custom_logit_processor = None
+            self.custom_params = None
+            self.has_custom_logit_processor = False
+
+    @staticmethod
+    def merge_custom_logit_processor(
+        lhs: Optional[Dict[int, Tuple[CustomLogitProcessor, torch.Tensor]]],
+        rhs: Optional[Dict[int, Tuple[CustomLogitProcessor, torch.Tensor]]],
+        bs1: int,
+        bs2: int,
+        device: str,
+    ):
+        if lhs is None and rhs is None:
+            return None
+        lhs, rhs = lhs or {}, rhs or {}
+
+        keys = set(lhs.keys()).union(set(rhs.keys()))
+        merged_dict = {}
+
+        for k in keys:
+            # Get the logit processor object
+            processor = lhs[k][0] if k in lhs else rhs[k][0]
+            # Get and merge the mask tensors from the two dicts
+            left_mask = (
+                lhs[k][1]
+                if k in lhs
+                else torch.zeros(bs1, dtype=torch.bool, device=device)
+            )
+            right_mask = (
+                rhs[k][1]
+                if k in rhs
+                else torch.zeros(bs2, dtype=torch.bool, device=device)
+            )
+            merged_dict[k] = (processor, torch.cat([left_mask, right_mask]))
+
+            assert merged_dict[k][1].shape[0] == bs1 + bs2, (
+                f"The batch size of merged mask ({merged_dict[k][1].shape[0]}) does not match "
+                f"the sum of the batch sizes of the two masks ({bs1 + bs2})"
+                f"\n{left_mask=}\n{right_mask=}\n{bs1=}\n{bs2=}"
+                f"\n{lhs=}\n{rhs=}"
+            )
+
+        return merged_dict
+
+    def merge_batch(self, other: "SamplingBatchInfo"):
+        self.penalizer_orchestrator.merge(other.penalizer_orchestrator)
+
+        # Merge the custom logit processors and custom params lists
+        if self.has_custom_logit_processor or other.has_custom_logit_processor:
+            # Merge the custom logit processors
+            self.custom_logit_processor = (
+                SamplingBatchInfo.merge_custom_logit_processor(
+                    self.custom_logit_processor,
+                    other.custom_logit_processor,
+                    len(self),
+                    len(other),
+                    self.device,
+                )
+            )
+            # Merge the custom params lists
+            self.custom_params = self.custom_params or [None] * len(self)
+            other.custom_params = other.custom_params or [None] * len(other)
+            self.custom_params.extend(other.custom_params)
+
+            # Set the flag to True if any of the two has custom logit processor
+            self.has_custom_logit_processor = True
+
+        # Merge logit bias - note this has to come before the temperatures tensor update! Otherwise will cause crashes.
+        # See note below on len(self) and len(other).
+        self.logit_bias = merge_bias_tensor(
+            self.logit_bias, other.logit_bias, len(self), len(other), self.device, 0.0
+        )
+
+        # Note: because the __len()__ operator is defined on the temperatures tensor,
+        # please make sure any merge operation with len(self) or len(other) is done before
+        # the merge operation of the temperatures tensor below.
+        for item in [
+            "temperatures",
+            "top_ps",
+            "top_ks",
+            "min_ps",
+            "sampling_seed",
+        ]:
+            self_val = getattr(self, item, None)
+            other_val = getattr(other, item, None)
+            if self_val is not None and other_val is not None:
+                setattr(self, item, torch.cat([self_val, other_val]))
+
+        self.is_all_greedy &= other.is_all_greedy
+        self.need_top_p_sampling |= other.need_top_p_sampling
+        self.need_top_k_sampling |= other.need_top_k_sampling
+        self.need_min_p_sampling |= other.need_min_p_sampling
+
+        self.adjusted_merge_batch(other)
+
+    def copy_for_forward(self):
+        # Accumulate the penalty into a pre-allocated buffer to get rid of the dependency of `penalizer_orchestrator` later
+        self.update_penalties()
+        return dataclasses.replace(self, penalizer_orchestrator=None)
+
+
+def merge_bias_tensor(
+    lhs: Optional[torch.Tensor],
+    rhs: Optional[torch.Tensor],
+    bs1: int,
+    bs2: int,
+    device: str,
+    default: float,
+):
+    """Merge two bias tensors for batch merging.
+
+    Args:
+        lhs: Left-hand side tensor
+        rhs: Right-hand side tensor
+        bs1: Batch size of left-hand side tensor
+        bs2: Batch size of right-hand side tensor
+        device: Device to place the merged tensor on
+        default: Default value for missing tensor elements
+
+    Returns:
+        Merged tensor or None if both inputs are None
+    """
+    if lhs is None and rhs is None:
+        return None
+
+    if lhs is not None and rhs is not None:
+        return torch.cat([lhs, rhs])
+    else:
+        if lhs is not None:
+            shape, dtype = lhs.shape[1:], lhs.dtype
+        else:
+            shape, dtype = rhs.shape[1:], rhs.dtype
+
+        if lhs is None:
+            lhs = torch.empty((bs1, *shape), device=device, dtype=dtype).fill_(default)
+        if rhs is None:
+            rhs = torch.empty((bs2, *shape), device=device, dtype=dtype).fill_(default)
+        return torch.cat([lhs, rhs])
diff --git a/sglang/python/sglang/srt/sampling/sampling_params.py b/sglang/python/sglang/srt/sampling/sampling_params.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7f4c664ff5f25cc6cd5cd5bd547c256cf47dc82
--- /dev/null
+++ b/sglang/python/sglang/srt/sampling/sampling_params.py
@@ -0,0 +1,243 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Sampling parameters for text generation."""
+
+import logging
+from typing import Any, Dict, List, Optional, Union
+
+# sre_parse is deprecated in Python 3.11+, use re._parser instead
+try:
+    import re._parser as sre_parse
+except ImportError:
+    import sre_parse  # Python < 3.11
+
+_SAMPLING_EPS = 1e-6
+TOP_K_ALL = 1 << 30
+
+logger = logging.getLogger(__name__)
+
+
+class SamplingParams:
+    """
+    The sampling parameters.
+
+    See docs/backend/sampling_params.md or
+    https://docs.sglang.io/backend/sampling_params.html
+    for the documentation.
+    """
+
+    def __init__(
+        self,
+        max_new_tokens: int = 128,
+        stop: Optional[Union[str, List[str]]] = None,
+        stop_token_ids: Optional[List[int]] = None,
+        stop_regex: Optional[Union[str, List[str]]] = None,
+        temperature: float = 1.0,
+        top_p: float = 1.0,
+        top_k: int = -1,
+        min_p: float = 0.0,
+        frequency_penalty: float = 0.0,
+        presence_penalty: float = 0.0,
+        repetition_penalty: float = 1.0,
+        min_new_tokens: int = 0,
+        n: int = 1,
+        json_schema: Optional[str] = None,
+        regex: Optional[str] = None,
+        ebnf: Optional[str] = None,
+        structural_tag: Optional[str] = None,
+        ignore_eos: bool = False,
+        skip_special_tokens: bool = True,
+        spaces_between_special_tokens: bool = True,
+        no_stop_trim: bool = False,
+        custom_params: Optional[Dict[str, Any]] = None,
+        stream_interval: Optional[int] = None,
+        logit_bias: Optional[Dict[str, float]] = None,
+        sampling_seed: Optional[int] = None,
+    ) -> None:
+        self.max_new_tokens = max_new_tokens
+        self.stop_strs = stop
+        if stop_token_ids:
+            self.stop_token_ids = set(stop_token_ids)
+        else:
+            self.stop_token_ids = None
+        self.stop_regex_strs = stop_regex
+        self.temperature = temperature
+        self.top_p = top_p
+        self.top_k = top_k
+        self.min_p = min_p
+        self.frequency_penalty = frequency_penalty
+        self.presence_penalty = presence_penalty
+        self.repetition_penalty = repetition_penalty
+        self.min_new_tokens = min_new_tokens
+        self.regex = regex
+        self.n = n
+        self.json_schema = json_schema
+        self.ebnf = ebnf
+        self.structural_tag = structural_tag
+        self.ignore_eos = ignore_eos
+        self.skip_special_tokens = skip_special_tokens
+        self.spaces_between_special_tokens = spaces_between_special_tokens
+        self.no_stop_trim = no_stop_trim
+        self.custom_params = custom_params
+        self.stream_interval = stream_interval
+        self.logit_bias = logit_bias
+        self.sampling_seed = sampling_seed
+
+        # Process some special cases
+        if 0 <= self.temperature < _SAMPLING_EPS:
+            # top_k = 1 means greedy sampling
+            self.temperature = 1.0
+            self.top_k = 1
+        if self.top_k == -1:
+            self.top_k = TOP_K_ALL  # whole vocabulary
+
+    def verify(self, vocab_size):
+        if self.temperature < 0.0:
+            raise ValueError(
+                f"temperature must be non-negative, got {self.temperature}."
+            )
+        if not 0.0 < self.top_p <= 1.0:
+            raise ValueError(f"top_p must be in (0, 1], got {self.top_p}.")
+        if not 0.0 <= self.min_p <= 1.0:
+            raise ValueError(f"min_p must be in [0, 1], got {self.min_p}.")
+        if self.top_k < 1 or self.top_k == -1:
+            raise ValueError(
+                f"top_k must be -1 (disable) or at least 1, got {self.top_k}."
+            )
+        if not -2.0 <= self.frequency_penalty <= 2.0:
+            raise ValueError(
+                "frequency_penalty must be in [-2, 2], got "
+                f"{self.frequency_penalty}."
+            )
+        if not -2.0 <= self.presence_penalty <= 2.0:
+            raise ValueError(
+                "presence_penalty must be in [-2, 2], got " f"{self.presence_penalty}."
+            )
+        if not 0.0 <= self.repetition_penalty <= 2.0:
+            raise ValueError(
+                "repetition_penalty must be in [0, 2], got "
+                f"{self.repetition_penalty}."
+            )
+        if not 0 <= self.min_new_tokens:
+            raise ValueError(
+                f"min_new_tokens must be in [0, max_new_tokens], got "
+                f"{self.min_new_tokens}."
+            )
+        if self.max_new_tokens is not None:
+            if self.max_new_tokens < 0:
+                raise ValueError(
+                    f"max_new_tokens must be at least 0, got {self.max_new_tokens}."
+                )
+            if not self.min_new_tokens <= self.max_new_tokens:
+                raise ValueError(
+                    f"min_new_tokens must be in [0, max_new_tokens({self.max_new_tokens})], got "
+                    f"{self.min_new_tokens}."
+                )
+        if self.logit_bias is not None:
+            for token_id in self.logit_bias:
+                if not 0 <= int(token_id) < vocab_size:
+                    raise ValueError(
+                        f"logit_bias must has keys in [0, {vocab_size - 1}], got "
+                        f"{token_id}."
+                    )
+
+        grammars = [
+            self.json_schema,
+            self.regex,
+            self.ebnf,
+        ]  # since mutually exclusive, only one can be set
+        if sum(x is not None for x in grammars) > 1:
+            raise ValueError("Only one of regex, json_schema, or ebnf can be set.")
+
+    def normalize(self, tokenizer):
+        # Process stop strings
+        if self.stop_strs is None:
+            self.stop_strs = []
+            self.stop_str_max_len = 0
+        else:
+            if isinstance(self.stop_strs, str):
+                self.stop_strs = [self.stop_strs]
+
+            stop_str_max_len = 0
+            for stop_str in self.stop_strs:
+                if tokenizer is not None:
+                    stop_str_ids = tokenizer.encode(stop_str, add_special_tokens=False)
+                    stop_str_max_len = max(stop_str_max_len, len(stop_str_ids))
+                else:
+                    stop_str_max_len = max(stop_str_max_len, len(stop_str))
+            self.stop_str_max_len = stop_str_max_len
+
+        # Process stop regex strings
+        if self.stop_regex_strs is None:
+            self.stop_regex_strs = []
+            self.stop_regex_max_len = 0
+        else:
+            if isinstance(self.stop_regex_strs, str):
+                self.stop_regex_strs = [self.stop_regex_strs]
+
+            stop_regex_max_len = 0
+            for stop_regex in self.stop_regex_strs:
+                stop_regex_max_len = max(
+                    stop_regex_max_len, get_max_seq_length(stop_regex)
+                )
+
+            self.stop_regex_max_len = stop_regex_max_len
+
+
+# This function gets a strict upperbound on the maximum number of tokens that would need
+# to be buffered to match the input regex string
+# NOTE: in the worst case, one character that needs to be buffered corresponds to one
+# token
+def get_max_seq_length(regex_str: str):
+    return _max_length_from_subpattern(sre_parse.parse(regex_str))
+
+
+MAX_LEN = 2**30
+
+
+def _max_length_from_subpattern(subpattern: sre_parse.SubPattern):
+    total = 0
+    for token, value in subpattern:
+        if token in {
+            sre_parse.LITERAL,  # `value` is any one character
+            sre_parse.IN,  # Any character within `value`
+            sre_parse.ANY,  # "."
+        }:
+            total += 1
+        elif token == sre_parse.SUBPATTERN:
+            # EG: (a\d+) ->
+            # [(SUBPATTERN,
+            #   (1, 0, 0, [(LITERAL, 97),
+            #              (MAX_REPEAT, (1, MAXREPEAT, [(IN, [(CATEGORY, CATEGORY_DIGIT)])]))]))]
+            _, _, _, inner_subpattern = value
+            total += _max_length_from_subpattern(inner_subpattern)
+        elif token == sre_parse.BRANCH:
+            _, branches = value
+            total += max(_max_length_from_subpattern(branch) for branch in branches)
+        elif token in {sre_parse.MAX_REPEAT, sre_parse.MIN_REPEAT}:
+            _, max_num_repeat, inner_subpattern = value
+            if max_num_repeat == sre_parse.MAXREPEAT:
+                total += MAX_LEN
+            else:
+                total += max_num_repeat * _max_length_from_subpattern(inner_subpattern)
+        elif token == sre_parse.AT:
+            # These are zero-width assertions like ^, $, and \b that don't add to the max
+            # length
+            total += 0
+        else:
+            logger.warning(f"Got unhandled regex token: {token}")
+
+            total += MAX_LEN
+
+    return total
diff --git a/sglang/python/sglang/srt/tokenizer/tiktoken_tokenizer.py b/sglang/python/sglang/srt/tokenizer/tiktoken_tokenizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..b290155472768d8de83f1c7da8076c3e376a0d43
--- /dev/null
+++ b/sglang/python/sglang/srt/tokenizer/tiktoken_tokenizer.py
@@ -0,0 +1,167 @@
+import functools
+import json
+from typing import AbstractSet, Collection, List, Literal, Union
+
+
+class TiktokenProcessor:
+    def __init__(self, name: str):
+        self.tokenizer = TiktokenTokenizer(name)
+
+    def image_processor(self, image):
+        return {"pixel_values": [image]}
+
+
+RESERVED_TOKEN_TEXTS = [f"<|reserved_{i}|>" for i in range(3, 128)]
+CONTROL_TOKEN_TEXTS = [f"<|control{i}|>" for i in range(1, 705)]
+
+
+PAD = "<|pad|>"
+EOS = "<|eos|>"
+SEP = "<|separator|>"
+
+DEFAULT_SPECIAL_TOKENS = [PAD, SEP, EOS]
+DEFAULT_CONTROL_TOKENS = {"pad": PAD, "sep": EOS, "eos": SEP}
+
+# default + separate each single digit
+PAT_STR_B = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
+
+
+class TiktokenTokenizer:
+    def __init__(self, tokenizer_path):
+        import tiktoken
+        from jinja2 import Template
+
+        # Read the JSON
+        with open(tokenizer_path, "rb") as fin:
+            xtok_dict = json.load(fin)
+
+        # Copy from train/xlm/tokenizers/tiktoken_wrapper.py::Encoding::from_xtok_dict
+        mergeable_ranks = {
+            bytes(item["bytes"]): item["token"] for item in xtok_dict["regular_tokens"]
+        }
+        special_tokens = {
+            bytes(item["bytes"]).decode(): item["token"]
+            for item in xtok_dict["special_tokens"]
+        }
+        if xtok_dict["word_split"] == "V1":
+            pad_str = PAT_STR_B
+        else:
+            assert False, f"Unknown word_split: {xtok_dict['word_split']}"
+        pad_str = xtok_dict.get("pat_str", pad_str)
+
+        kwargs = {
+            "name": tokenizer_path,
+            "pat_str": pad_str,
+            "mergeable_ranks": mergeable_ranks,
+            "special_tokens": special_tokens,
+        }
+        if "default_allowed_special" in xtok_dict:
+            default_allowed_special = set(
+                [
+                    bytes(bytes_list).decode()
+                    for bytes_list in xtok_dict["default_allowed_special"]
+                ]
+            )
+        if "vocab_size" in xtok_dict:
+            kwargs["explicit_n_vocab"] = xtok_dict["vocab_size"]
+
+        # Copy from train/xlm/tokenizers/tiktoken_wrapper.py::Encoding::__init__
+        default_allowed_special = None
+        control_tokens = DEFAULT_CONTROL_TOKENS
+        tokenizer = tiktoken.Encoding(**kwargs)
+        tokenizer._default_allowed_special = default_allowed_special or set()
+        tokenizer._control_tokens = control_tokens
+
+        def encode_patched(
+            self,
+            text: str,
+            *,
+            allowed_special: Union[
+                Literal["all"], AbstractSet[str]
+            ] = set(),  # noqa: B006
+            disallowed_special: Union[Literal["all"], Collection[str]] = "all",
+        ) -> List[int]:
+            if isinstance(allowed_special, set):
+                allowed_special |= self._default_allowed_special
+            return tiktoken.Encoding.encode(
+                self,
+                text,
+                allowed_special=allowed_special,
+                disallowed_special=(),
+            )
+
+        tokenizer.encode = functools.partial(encode_patched, tokenizer)
+
+        # Allow more tokens to prevent crash
+        tokenizer._default_allowed_special |= set(DEFAULT_CONTROL_TOKENS.values())
+        tokenizer._default_allowed_special |= set(
+            CONTROL_TOKEN_TEXTS + RESERVED_TOKEN_TEXTS
+        )
+
+        # Convert to HF interface
+        self.tokenizer = tokenizer
+        self.bos_token_id = None
+        self.eos_token_id = tokenizer._special_tokens[EOS]
+        self.vocab_size = tokenizer.n_vocab
+        self.chat_template = "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'system' %}{{ 'System: ' + message['content'].strip() + '<|separator|>\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: '  + message['content'] + '<|separator|>\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
+        self.chat_template_jinja = Template(self.chat_template)
+        self.additional_stop_token_ids = None
+
+    def encode(self, x, add_special_tokens=False):
+        return self.tokenizer.encode(x)
+
+    def decode(self, x, *args, **kwargs):
+        return self.tokenizer.decode(x)
+
+    def batch_decode(
+        self, batch, skip_special_tokens=True, spaces_between_special_tokens=False
+    ):
+        if len(batch) > 0 and isinstance(batch[0], int):
+            batch = [[x] for x in batch]
+        return self.tokenizer.decode_batch(batch)
+
+    def apply_chat_template(
+        self,
+        messages,
+        tokenize,
+        add_generation_prompt,
+        tools=None,
+        reasoning_effort=None,
+        **kwargs,  # Accept additional parameters (e.g., return_dict) for compatibility
+    ):
+        ret = self.chat_template_jinja.render(
+            messages=messages, add_generation_prompt=add_generation_prompt
+        )
+        return self.encode(ret) if tokenize else ret
+
+    def __call__(self, text: List[str], **kwargs):
+        return {
+            "input_ids": [self.encode(x) for x in text],
+        }
+
+    def init_xgrammar(self):
+        from xgrammar import TokenizerInfo
+
+        XGRAMMAR_SPECIAL_TOKEN_TEMPLATE = "<|xg_special_token_{}|>"
+
+        enc = self.tokenizer
+        encoded_vocab = {**enc._mergeable_ranks, **enc._special_tokens}
+        encoded_vocab = [
+            token for token, _ in sorted(encoded_vocab.items(), key=lambda x: x[1])
+        ]
+        override_stop_tokens = [2]  # eos
+        # These are treated as special tokens in xgrammar; we want to avoid them
+        # For now, xgrammar treats anything starting with b'\x00' as a special token
+        xgrammar_special_token_ids = []
+        for i, token in enumerate(encoded_vocab):
+            if isinstance(token, bytes) and token.startswith(b"\x00"):
+                xgrammar_special_token_ids.append(i)
+
+        for i, id in enumerate(xgrammar_special_token_ids):
+            encoded_vocab[id] = XGRAMMAR_SPECIAL_TOKEN_TEMPLATE.format(i)
+        tokenizer_info = TokenizerInfo(
+            encoded_vocab, stop_token_ids=override_stop_tokens
+        )
+        assert len(tokenizer_info.special_token_ids) == 0
+
+        return tokenizer_info, override_stop_tokens
diff --git a/sglang/python/sglang/srt/utils/profile_utils.py b/sglang/python/sglang/srt/utils/profile_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f44af979e89c7579ef1208c67dc0f8e0f69d543
--- /dev/null
+++ b/sglang/python/sglang/srt/utils/profile_utils.py
@@ -0,0 +1,388 @@
+import logging
+import os
+import time
+from abc import ABC
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Callable, Dict, List, Optional
+
+import torch
+
+from sglang.srt.managers.io_struct import ProfileReqOutput
+from sglang.srt.model_executor.forward_batch_info import ForwardMode
+from sglang.srt.server_args import get_global_server_args
+from sglang.srt.utils import is_npu
+
+_is_npu = is_npu()
+if _is_npu:
+    import torch_npu
+
+    patches = [
+        ["profiler.profile", torch_npu.profiler.profile],
+        ["profiler.ProfilerActivity.CUDA", torch_npu.profiler.ProfilerActivity.NPU],
+        ["profiler.ProfilerActivity.CPU", torch_npu.profiler.ProfilerActivity.CPU],
+    ]
+    torch_npu._apply_patches(patches)
+
+logger = logging.getLogger(__name__)
+
+
+class ProfileManager:
+    def __init__(self, tp_rank: int, cpu_group, gpu_id: int):
+        self.stage_based_trigger = _StageBasedTrigger(
+            on_start=self._do_start,
+            on_stop=self._do_stop,
+        )
+        self.tp_rank = tp_rank
+        self.cpu_group = cpu_group
+        self.first_rank_in_node = gpu_id == get_global_server_args().base_gpu_id
+        self.profiler_kwargs = None
+        self.profiler = None
+
+    def step(self, forward_mode: ForwardMode):
+        stage = _get_stage_from_forward_mode(forward_mode)
+        if stage is None:
+            return
+
+        self.stage_based_trigger.step(stage=stage)
+
+    def configure(
+        self,
+        *,
+        output_dir: Optional[str],
+        start_step: Optional[int],
+        num_steps: Optional[int],
+        activities: Optional[List[str]],
+        with_stack: Optional[bool],
+        record_shapes: Optional[bool],
+        profile_by_stage: bool,
+        profile_id: str,
+        merge_profiles: bool,
+        profile_prefix: str,
+        profile_stages: Optional[List[str]] = None,
+    ):
+        # not supported yet
+        assert start_step is None
+        assert (
+            profile_by_stage
+        ), "only support profile_by_stage=true now"  # `false` can be easily supported
+        assert not merge_profiles
+
+        if output_dir is None:
+            output_dir = os.getenv("SGLANG_TORCH_PROFILER_DIR", "/tmp")
+        if activities is None:
+            activities = ["CPU", "GPU"]
+
+        self.profiler_kwargs = dict(
+            activities=activities,
+            with_stack=with_stack,
+            record_shapes=record_shapes,
+            output_dir=output_dir,
+            output_prefix=profile_prefix,
+            profile_id=profile_id,
+        )
+
+        self.stage_based_trigger.configure(
+            num_steps=num_steps,
+            interesting_stages=profile_stages or ["prefill", "decode"],
+        )
+
+        return ProfileReqOutput(success=True, message="Succeeded")
+
+    def manual_start(self):
+        raise NotImplementedError("manually start is only supported yet")
+
+    def manual_stop(self):
+        raise NotImplementedError("manually stop is only supported yet")
+
+    def _do_start(self, stage: Optional[str] = None):
+        logger.info(
+            f"Profiling starts{f' for {stage}' if stage else ''}. "
+            f"Traces will be saved to: {self.profiler_kwargs['output_dir']} "
+            f"(with profile id: {self.profiler_kwargs['profile_id']})",
+        )
+
+        assert self.profiler is None
+        self.profiler = _ProfilerBase.create(
+            **self.profiler_kwargs,
+            tp_rank=self.tp_rank,
+            cpu_group=self.cpu_group,
+            first_rank_in_node=self.first_rank_in_node,
+            output_suffix=f"-{stage}" if stage else "",
+        )
+        self.profiler.start()
+
+    def _do_stop(self):
+        logger.info("Stop profiling...")
+        self.profiler.stop()
+        logger.info(
+            f"Profiling done. Traces are saved to: {self.profiler_kwargs['output_dir']}"
+        )
+        self.profiler = None
+
+
+def _get_stage_from_forward_mode(forward_mode: ForwardMode):
+    if forward_mode.is_prefill():
+        return "prefill"
+    elif forward_mode.is_decode():
+        return "decode"
+    elif forward_mode.is_idle():
+        return None
+    else:
+        raise RuntimeError(f"unsupported profile stage: {forward_mode=}")
+
+
+# ======================================== Stage related ==========================================
+
+
+class _StageBasedTrigger:
+    @dataclass
+    class _StageConfig:
+        target_count: int
+
+    @dataclass
+    class _RunningState:
+        curr_stage: str
+        curr_count: int
+
+    def __init__(self, on_start: Callable, on_stop: Callable):
+        self.on_start = on_start
+        self.on_stop = on_stop
+
+        self.running_state: Optional[_StageBasedTrigger._RunningState] = None
+        # When a stage is in the dict, it means it is being or should be executed
+        self.stage_configs: Dict[str, _StageBasedTrigger._StageConfig] = {}
+
+    def configure(self, num_steps: int, interesting_stages: List[str]):
+        assert self.running_state is None
+        self.stage_configs = {
+            stage: self._StageConfig(target_count=num_steps)
+            for stage in interesting_stages
+        }
+
+    def step(self, stage: str):
+        # Incr counter
+        if (s := self.running_state) is not None:
+            s.curr_count += 1
+
+        # Maybe stop
+        if ((s := self.running_state) is not None) and (
+            (s.curr_count > self.stage_configs[s.curr_stage].target_count)
+            or (stage != s.curr_stage)
+        ):
+            del self.stage_configs[s.curr_stage]
+            self.running_state = None
+            self.on_stop()
+
+        # Maybe start
+        if (self.running_state is None) and (stage in self.stage_configs):
+            self.running_state = self._RunningState(
+                curr_stage=stage,
+                curr_count=0,
+            )
+            self.on_start(stage=stage)
+
+        # Sanity check
+        assert (self.running_state is not None) == (stage in self.stage_configs)
+        if (s := self.running_state) is not None:
+            assert s.curr_stage == stage
+
+
+# ======================================== Concrete profilers ==========================================
+
+
+class _ProfilerBase(ABC):
+    @staticmethod
+    def create(activities, with_stack, record_shapes, **kwargs):
+        inners = []
+        if ("CPU" in activities) or ("GPU" in activities):
+            inners.append(
+                _ProfilerTorch(
+                    **kwargs,
+                    activities=activities,
+                    with_stack=with_stack,
+                    record_shapes=record_shapes,
+                )
+            )
+        if "MEM" in activities:
+            inners.append(_ProfilerMemory(**kwargs))
+        if "CUDA_PROFILER" in activities:
+            inners.append(_ProfilerCudart(**kwargs))
+        if "RPD" in activities:  # for ROCM
+            inners.append(_ProfilerRPD(**kwargs))
+
+        return _ProfilerList(inners)
+
+    def start(self):
+        raise NotImplementedError
+
+    def stop(self):
+        raise NotImplementedError
+
+
+class _ProfilerList(_ProfilerBase):
+    def __init__(self, inners: List[_ProfilerBase]):
+        self.inners = inners
+
+    def start(self):
+        for inner in self.inners:
+            inner.start()
+
+    def stop(self):
+        for inner in self.inners:
+            inner.stop()
+
+
+class _ProfilerConcreteBase(_ProfilerBase):
+    def __init__(
+        self,
+        output_dir: str,
+        output_prefix: str,
+        output_suffix: str,
+        profile_id: str,
+        tp_rank: int,
+        cpu_group,
+        first_rank_in_node: bool,
+    ):
+        self.output_dir = output_dir
+        self.output_prefix = output_prefix
+        self.output_suffix = output_suffix
+        self.profile_id = profile_id
+        self.tp_rank = tp_rank
+        self.cpu_group = cpu_group
+        self.first_rank_in_node = first_rank_in_node
+
+
+class _ProfilerTorch(_ProfilerConcreteBase):
+    def __init__(self, with_stack: bool, record_shapes: bool, activities, **kwargs):
+        super().__init__(**kwargs)
+        self.with_stack = with_stack
+        self.record_shapes = record_shapes
+        self.activities = activities
+
+    def start(self):
+        activity_map = {
+            "CPU": torch.profiler.ProfilerActivity.CPU,
+            "GPU": torch.profiler.ProfilerActivity.CUDA,
+        }
+        torchprof_activities = [
+            activity_map[a] for a in self.activities if a in activity_map
+        ]
+
+        self.torch_profiler = torch.profiler.profile(
+            activities=torchprof_activities,
+            with_stack=self.with_stack if self.with_stack is not None else True,
+            record_shapes=(
+                self.record_shapes if self.record_shapes is not None else False
+            ),
+            on_trace_ready=(
+                None
+                if not _is_npu
+                else torch_npu.profiler.tensorboard_trace_handler(self.output_dir)
+            ),
+        )
+        self.torch_profiler.start()
+
+    def stop(self):
+        Path(self.output_dir).mkdir(parents=True, exist_ok=True)
+
+        self.torch_profiler.stop()
+        if not _is_npu:
+            # Build filename with only non-zero ranks to maintain backward compatibility
+            filename_parts = [self.profile_id, f"TP-{self.tp_rank}"]
+
+            # Only add other ranks if parallelism is enabled (size > 1)
+            if getattr(self, "dp_size", 1) > 1:
+                filename_parts.append(f"DP-{getattr(self, 'dp_rank', 0)}")
+            if getattr(self, "pp_size", 1) > 1:
+                filename_parts.append(f"PP-{getattr(self, 'pp_rank', 0)}")
+            if getattr(self, "moe_ep_size", 1) > 1:
+                filename_parts.append(f"EP-{getattr(self, 'moe_ep_rank', 0)}")
+
+            filename = (
+                (self.output_prefix + "-" if self.output_prefix else "")
+                + "-".join(filename_parts)
+                + self.output_suffix
+                + ".trace.json.gz"
+            )
+
+            self.torch_profiler.export_chrome_trace(
+                os.path.join(self.output_dir, filename)
+            )
+        torch.distributed.barrier(self.cpu_group)
+
+        # TODO: migrate `_merge_profile_traces`
+
+
+class _ProfilerMemory(_ProfilerConcreteBase):
+    def start(self):
+        torch.cuda.memory._record_memory_history(max_entries=100000)
+
+    def stop(self):
+        Path(self.output_dir).mkdir(parents=True, exist_ok=True)
+
+        memory_profile_path = os.path.join(
+            self.output_dir,
+            str(time.time())
+            + f"-TP-{self.tp_rank}-memory"
+            + self.output_suffix
+            + ".pickle",
+        )
+        torch.cuda.memory._dump_snapshot(memory_profile_path)
+        torch.cuda.memory._record_memory_history(enabled=None)
+
+
+class _ProfilerCudart(_ProfilerConcreteBase):
+    def start(self):
+        if self.first_rank_in_node:
+            logger.info(f"Call cudaProfilerStart")
+            torch.cuda.cudart().cudaProfilerStart()
+
+    def stop(self):
+        if self.first_rank_in_node:
+            logger.info(f"Call cudaProfilerStop")
+            torch.cuda.cudart().cudaProfilerStop()
+
+
+class _ProfilerRPD(_ProfilerConcreteBase):
+    def start(self):
+        Path(self.output_dir).mkdir(parents=True, exist_ok=True)
+
+        from rpdTracerControl import rpdTracerControl
+
+        rpdTracerControl.skipCreate()
+
+        self.rpd_profile_path = os.path.join(
+            self.output_dir,
+            "rpd-" + str(time.time()) + f"-TP-{self.tp_rank}" + ".trace.json.gz",
+        )
+
+        if self.tp_rank == 0:
+            import sqlite3
+
+            from rocpd.schema import RocpdSchema
+
+            if os.path.exists("trace.rpd"):
+                os.unlink("trace.rpd")
+            schema = RocpdSchema()
+            connection = sqlite3.connect("trace.rpd")
+            schema.writeSchema(connection)
+            connection.commit()
+            del connection
+        torch.distributed.barrier(self.cpu_group)
+
+        self.rpd_profiler = rpdTracerControl()
+        self.rpd_profiler.setPythonTrace(True)
+        self.rpd_profiler.start()
+        self.rpd_profiler.rangePush("", "rpd profile range", "")
+
+    def stop(self):
+        self.rpd_profiler.rangePop()
+        self.rpd_profiler.stop()
+        self.rpd_profiler.flush()
+
+        torch.distributed.barrier(self.cpu_group)
+        if self.tp_rank == 0:
+            from sglang.srt.utils.rpd_utils import rpd_to_chrome_trace
+
+            rpd_to_chrome_trace("trace.rpd", self.rpd_profile_path)
diff --git a/sglang/python/sglang/srt/utils/request_logger.py b/sglang/python/sglang/srt/utils/request_logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..aad5664a756d50892676a99309f336628bee3e7f
--- /dev/null
+++ b/sglang/python/sglang/srt/utils/request_logger.py
@@ -0,0 +1,298 @@
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import annotations
+
+import dataclasses
+import logging
+from functools import lru_cache
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Union
+
+from sglang.srt.environ import envs
+from sglang.srt.utils.common import get_bool_env_var
+from sglang.srt.utils.log_utils import create_log_targets, log_json
+
+if TYPE_CHECKING:
+    import fastapi
+
+    from sglang.srt.managers.io_struct import EmbeddingReqInput, GenerateReqInput
+
+logger = logging.getLogger(__name__)
+
+_DEFAULT_WHITELISTED_HEADERS = ["x-smg-routing-key"]
+WHITELISTED_HEADERS = _DEFAULT_WHITELISTED_HEADERS + [
+    h.lower() for h in envs.SGLANG_LOG_REQUEST_HEADERS.get()
+]
+
+
+def _extract_whitelisted_headers(
+    request: Optional["fastapi.Request"],
+) -> Optional[Dict[str, str]]:
+    if request is None:
+        return None
+    return {h: v for h in WHITELISTED_HEADERS if (v := request.headers.get(h))}
+
+
+class RequestLogger:
+    def __init__(
+        self,
+        log_requests: bool,
+        log_requests_level: int,
+        log_requests_format: str,
+        log_requests_target: Optional[List[str]],
+    ):
+        self.log_requests = log_requests
+        self.log_requests_level = log_requests_level
+        self.log_requests_format = log_requests_format
+        self.log_requests_target = log_requests_target
+
+        self.metadata: Tuple[Optional[int], Optional[Set[str]], Optional[Set[str]]] = (
+            self._compute_metadata()
+        )
+        self.targets = self._setup_targets()
+
+        self.log_exceeded_ms = envs.SGLANG_LOG_REQUEST_EXCEEDED_MS.get()
+
+    def _setup_targets(self) -> List[logging.Logger]:
+        return create_log_targets(
+            targets=self.log_requests_target, name_prefix=__name__
+        )
+
+    def configure(
+        self,
+        log_requests: Optional[bool] = None,
+        log_requests_level: Optional[int] = None,
+        log_requests_format: Optional[str] = None,
+        log_requests_target: Optional[List[str]] = None,
+    ) -> None:
+        if log_requests is not None:
+            self.log_requests = log_requests
+        if log_requests_level is not None:
+            self.log_requests_level = log_requests_level
+        if log_requests_format is not None:
+            self.log_requests_format = log_requests_format
+        if log_requests_target is not None:
+            self.log_requests_target = log_requests_target
+
+        self.metadata = self._compute_metadata()
+        self.targets = self._setup_targets()
+
+    def log_received_request(
+        self,
+        obj: Union["GenerateReqInput", "EmbeddingReqInput"],
+        tokenizer: Any = None,
+        request: Optional["fastapi.Request"] = None,
+    ) -> None:
+        if not self.log_requests:
+            return
+
+        max_length, skip_names, _ = self.metadata
+        headers = _extract_whitelisted_headers(request)
+        if self.log_requests_format == "json":
+            log_data = {
+                "rid": obj.rid,
+                "obj": _transform_data_for_logging(obj, max_length, skip_names),
+            }
+            if headers:
+                log_data["headers"] = headers
+            log_json(self.targets, "request.received", log_data)
+        else:
+            headers_str = f", headers={headers}" if headers else ""
+            self._log(
+                f"Receive: obj={_dataclass_to_string_truncated(obj, max_length, skip_names=skip_names)}{headers_str}"
+            )
+
+        # FIXME: This is a temporary fix to get the text from the input ids.
+        # We should remove this once we have a proper way.
+        if (
+            self.log_requests_level >= 2
+            and obj.text is None
+            and obj.input_ids is not None
+            and tokenizer is not None
+        ):
+            if obj.input_ids and isinstance(obj.input_ids[0], list):
+                # Prefill node warmup while PD disaggregated.
+                decoded = [
+                    tokenizer.decode(_input_ids, skip_special_tokens=False)
+                    for _input_ids in obj.input_ids
+                ]
+            else:
+                decoded = tokenizer.decode(obj.input_ids, skip_special_tokens=False)
+            obj.text = decoded
+
+    def log_finished_request(
+        self,
+        obj: Union["GenerateReqInput", "EmbeddingReqInput"],
+        out: Any,
+        is_multimodal_gen: bool = False,
+        request: Optional["fastapi.Request"] = None,
+    ) -> None:
+        if not self.log_requests:
+            return
+
+        e2e_latency_ms = out["meta_info"].get("e2e_latency", 0) * 1000
+        if self.log_exceeded_ms > 0 and e2e_latency_ms < self.log_exceeded_ms:
+            return
+
+        max_length, skip_names, out_skip_names = self.metadata
+        headers = _extract_whitelisted_headers(request)
+        if self.log_requests_format == "json":
+            log_data = {
+                "rid": obj.rid,
+                "obj": _transform_data_for_logging(obj, max_length, skip_names),
+            }
+            if headers:
+                log_data["headers"] = headers
+            if not is_multimodal_gen:
+                log_data["out"] = _transform_data_for_logging(
+                    out, max_length, out_skip_names
+                )
+            log_json(self.targets, "request.finished", log_data)
+        else:
+            obj_str = _dataclass_to_string_truncated(
+                obj, max_length, skip_names=skip_names
+            )
+            out_str = (
+                ""
+                if is_multimodal_gen
+                else f", out={_dataclass_to_string_truncated(out, max_length, skip_names=out_skip_names)}"
+            )
+            headers_str = f", headers={headers}" if headers else ""
+            self._log(f"Finish: obj={obj_str}{headers_str}{out_str}")
+
+    def _compute_metadata(
+        self,
+    ) -> Tuple[Optional[int], Optional[Set[str]], Optional[Set[str]]]:
+        max_length: Optional[int] = None
+        skip_names: Optional[Set[str]] = None
+        out_skip_names: Optional[Set[str]] = None
+        if self.log_requests:
+            if self.log_requests_level == 0:
+                max_length = 1 << 30
+                skip_names = {
+                    "text",
+                    "input_ids",
+                    "input_embeds",
+                    "image_data",
+                    "audio_data",
+                    "lora_path",
+                    "sampling_params",
+                }
+                out_skip_names = {"text", "output_ids", "embedding"}
+            elif self.log_requests_level == 1:
+                max_length = 1 << 30
+                skip_names = {
+                    "text",
+                    "input_ids",
+                    "input_embeds",
+                    "image_data",
+                    "audio_data",
+                    "lora_path",
+                }
+                out_skip_names = {"text", "output_ids", "embedding"}
+            elif self.log_requests_level == 2:
+                max_length = 2048
+            elif self.log_requests_level == 3:
+                max_length = 1 << 30
+            else:
+                raise ValueError(
+                    f"Invalid --log-requests-level: {self.log_requests_level=}"
+                )
+        return max_length, skip_names, out_skip_names
+
+    def _log(self, msg: str) -> None:
+        for target in self.targets:
+            target.info(msg)
+
+
+# TODO remove this?
+@lru_cache(maxsize=2)
+def disable_request_logging() -> bool:
+    return get_bool_env_var("SGLANG_DISABLE_REQUEST_LOGGING")
+
+
+# TODO unify this w/ `_transform_data_for_logging` if we find performance enough
+def _dataclass_to_string_truncated(
+    data: Any, max_length: int = 2048, skip_names: Optional[Set[str]] = None
+) -> str:
+    if skip_names is None:
+        skip_names = set()
+    if isinstance(data, str):
+        if len(data) > max_length:
+            half_length = max_length // 2
+            return f"{repr(data[:half_length])} ... {repr(data[-half_length:])}"
+        else:
+            return f"{repr(data)}"
+    elif isinstance(data, (list, tuple)):
+        if len(data) > max_length:
+            half_length = max_length // 2
+            return str(data[:half_length]) + " ... " + str(data[-half_length:])
+        else:
+            return str(data)
+    elif isinstance(data, dict):
+        return (
+            "{"
+            + ", ".join(
+                f"'{k}': {_dataclass_to_string_truncated(v, max_length)}"
+                for k, v in data.items()
+                if k not in skip_names
+            )
+            + "}"
+        )
+    elif dataclasses.is_dataclass(data):
+        fields = dataclasses.fields(data)
+        return (
+            f"{data.__class__.__name__}("
+            + ", ".join(
+                f"{f.name}={_dataclass_to_string_truncated(getattr(data, f.name), max_length)}"
+                for f in fields
+                if f.name not in skip_names
+            )
+            + ")"
+        )
+    else:
+        return str(data)
+
+
+def _transform_data_for_logging(
+    data: Any, max_length: int = 2048, skip_names: Optional[Set[str]] = None
+) -> Any:
+    if skip_names is None:
+        skip_names = set()
+    if isinstance(data, str):
+        if len(data) > max_length:
+            half_length = max_length // 2
+            return data[:half_length] + "..." + data[-half_length:]
+        return data
+    elif isinstance(data, (list, tuple)):
+        if len(data) > max_length:
+            half_length = max_length // 2
+            return list(data[:half_length]) + ["..."] + list(data[-half_length:])
+        return [_transform_data_for_logging(v, max_length) for v in data]
+    elif isinstance(data, dict):
+        return {
+            k: _transform_data_for_logging(v, max_length)
+            for k, v in data.items()
+            if k not in skip_names
+        }
+    elif dataclasses.is_dataclass(data):
+        fields = dataclasses.fields(data)
+        return {
+            f.name: _transform_data_for_logging(getattr(data, f.name), max_length)
+            for f in fields
+            if f.name not in skip_names
+        }
+    elif isinstance(data, (int, float, bool, type(None))):
+        return data
+    else:
+        return str(data)